STT-Comparison / adjust_srt_timing.py
danielrosehill's picture
Fix SRT timestamp alignment with ground truth
0aa8adc
#!/usr/bin/env python3
"""
Adjust SRT file timestamps to align with ground truth.

Subtracts each file's first cue start time from all of its timestamps so
that every transcript starts at 00:00:00,000.
"""
import re
from pathlib import Path
def parse_timestamp(timestamp_str):
    """Return the number of milliseconds encoded by an SRT timestamp.

    Args:
        timestamp_str: Timestamp text in the form ``HH:MM:SS,mmm``.

    Returns:
        The timestamp as an integer count of milliseconds.

    Raises:
        ValueError: If the text does not split into exactly one clock part
            and one millisecond part around a comma, the clock part does not
            have three colon-separated fields, or a field is not an integer.
    """
    clock, fraction = timestamp_str.split(',')
    hours, minutes, seconds = (int(field) for field in clock.split(':'))
    whole_seconds = hours * 3600 + minutes * 60 + seconds
    return whole_seconds * 1000 + int(fraction)
def format_timestamp(ms):
    """Render a millisecond count as an SRT timestamp (``HH:MM:SS,mmm``).

    Args:
        ms: Integer number of milliseconds.

    Returns:
        The timestamp string. Hours, minutes and seconds are zero-padded to
        at least two digits and milliseconds to at least three; hours are
        not wrapped, so values of 100 hours or more simply use more digits.
    """
    # Peel off the largest unit first; each divmod leaves the remainder
    # for the next smaller unit.
    hours, remainder = divmod(ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    seconds, milliseconds = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
def adjust_srt_timing(input_path, output_path, offset_ms):
    """
    Shift every cue timestamp in an SRT file earlier by offset_ms.

    The whole input file is read, each ``HH:MM:SS,mmm --> HH:MM:SS,mmm``
    pair is rewritten with offset_ms subtracted from both ends, and the
    result is written to output_path. Times that would become negative are
    clamped to 00:00:00,000. A leading byte-order mark is dropped; all other
    text, including the file's own line endings, is written back unchanged.
    input_path and output_path may refer to the same file, because the input
    is fully read before the output is opened.

    Args:
        input_path: Path to input SRT file (str or pathlib.Path)
        output_path: Path to output SRT file (str or pathlib.Path)
        offset_ms: Offset in milliseconds to subtract from all timestamps
    """
    # Accept plain strings as well as Path objects: the status line at the
    # end needs `.name`, which would otherwise fail on a str only after the
    # output file had already been written.
    input_path = Path(input_path)
    output_path = Path(output_path)

    # newline='' turns off newline translation so CRLF files stay CRLF
    # instead of being silently rewritten with the platform's line ending.
    with open(input_path, 'r', encoding='utf-8', newline='') as f:
        content = f.read()

    # Remove BOM if present
    content = content.lstrip('\ufeff')

    # Pattern to match timestamp lines: HH:MM:SS,mmm --> HH:MM:SS,mmm
    timestamp_pattern = re.compile(
        r'(\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2},\d{3})'
    )

    def adjust_match(match):
        """Return the replacement text for one matched timing pair."""
        start_ms, end_ms = (parse_timestamp(ts) for ts in match.groups())
        # Clamp at zero: SRT timestamps cannot be negative.
        new_start = format_timestamp(max(0, start_ms - offset_ms))
        new_end = format_timestamp(max(0, end_ms - offset_ms))
        return f"{new_start} --> {new_end}"

    adjusted_content = timestamp_pattern.sub(adjust_match, content)

    with open(output_path, 'w', encoding='utf-8', newline='') as f:
        f.write(adjusted_content)

    print(f"✓ Adjusted {input_path.name}: offset={offset_ms}ms → {output_path.name}")
def find_first_timestamp(srt_path):
    """Return the start time of the first cue in an SRT file.

    Args:
        srt_path: Path of the SRT file to scan.

    Returns:
        The first ``HH:MM:SS,mmm`` value that is followed by ``-->``,
        converted to milliseconds, or 0 when the file contains no such
        timestamp.
    """
    with open(srt_path, 'r', encoding='utf-8') as srt_file:
        text = srt_file.read()

    first_cue = re.search(r'(\d{2}:\d{2}:\d{2},\d{3})\s*-->', text)
    if first_cue is None:
        return 0
    return parse_timestamp(first_cue.group(1))
def main(srt_dir=None, srt_files=None):
    """Shift each listed SRT file in place so it starts at 00:00:00,000.

    For every file, the start time of its first cue is used as the offset
    and subtracted from all of that file's timestamps. Files that do not
    exist are reported and skipped; files whose first timestamp is already
    zero are left untouched.

    Args:
        srt_dir: Directory holding the SRT files. Defaults to the
            ``srt-out`` directory next to this script.
        srt_files: File names to process. Defaults to the four provider
            transcripts listed below.
    """
    if srt_dir is None:
        srt_dir = Path(__file__).parent / "srt-out"
    else:
        srt_dir = Path(srt_dir)

    # Files to adjust
    if srt_files is None:
        srt_files = [
            "assembly.srt",
            "gladia.srt",
            "nova3.srt",
            "speechmatics.srt",
        ]

    print("Analyzing SRT files for timing offset...\n")

    skipped = 0
    for filename in srt_files:
        input_path = srt_dir / filename

        if not input_path.exists():
            print(f"⚠ Skipping {filename} (not found)")
            skipped += 1
            continue

        # The first cue's start time is the offset to remove.
        # NOTE: find_first_timestamp also returns 0 when a file has no
        # timestamps at all, so such files take this branch too.
        offset_ms = find_first_timestamp(input_path)
        if offset_ms == 0:
            print(f"✓ {filename} already starts at 00:00:00,000 (no adjustment needed)")
            continue

        # Adjust the file in place
        adjust_srt_timing(input_path, input_path, offset_ms)

    # Only claim full success when nothing was skipped.
    if skipped:
        print(f"\n⚠ Finished, but {skipped} file(s) were not found and were skipped")
    else:
        print("\n✅ All SRT files have been adjusted to start at 00:00:00,000")
# Run the adjustment only when this file is executed as a script, so that
# importing it for its helper functions has no side effects.
if __name__ == "__main__":
    main()