#!/usr/bin/env python3
"""
Adjust SRT file timestamps to align with ground truth.

This script removes timing offsets to ensure all transcripts start at
00:00:00,000.
"""

import re
from pathlib import Path

# Full cue timing line: HH:MM:SS,mmm --> HH:MM:SS,mmm
_CUE_TIMING_RE = re.compile(
    r'(\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2},\d{3})'
)

# Start timestamp of a cue timing line (only the part before the arrow).
_CUE_START_RE = re.compile(r'(\d{2}:\d{2}:\d{2},\d{3})\s*-->')


def parse_timestamp(timestamp_str: str) -> int:
    """Convert an SRT timestamp (``HH:MM:SS,mmm``) to milliseconds.

    Args:
        timestamp_str: Timestamp text with a comma before the milliseconds.

    Returns:
        The timestamp expressed as a whole number of milliseconds.

    Raises:
        ValueError: If the text does not split into ``H:M:S`` and ``mmm``
            integer fields.
    """
    time_part, ms_part = timestamp_str.split(',')
    hours, minutes, seconds = map(int, time_part.split(':'))
    return (hours * 3600 + minutes * 60 + seconds) * 1000 + int(ms_part)


def format_timestamp(ms: int) -> str:
    """Convert milliseconds to SRT timestamp format (``HH:MM:SS,mmm``).

    Args:
        ms: Duration in milliseconds.

    Returns:
        The zero-padded timestamp string. Hours are padded to at least two
        digits and grow wider if the value needs more.
    """
    hours, remainder = divmod(ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    seconds, milliseconds = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"


def adjust_srt_timing(input_path, output_path, offset_ms):
    """
    Adjust all timestamps in an SRT file by subtracting offset_ms.

    Every ``start --> end`` pair is shifted; results are clamped at zero so
    a cue can never get a negative timestamp. A leading byte-order mark is
    dropped from the output, and the file's line endings are left as they
    were. Prints a one-line summary when done.

    Args:
        input_path: Path (``str`` or ``Path``) to input SRT file
        output_path: Path (``str`` or ``Path``) to output SRT file; may be
            the same as ``input_path`` to rewrite in place
        offset_ms: Offset in milliseconds to subtract from all timestamps
    """
    # Accept plain strings as well as Path objects; the summary below needs
    # the ``.name`` attribute.
    input_path = Path(input_path)
    output_path = Path(output_path)

    # newline='' turns off newline translation so that CRLF files are not
    # silently rewritten with LF endings (and vice versa).
    with open(input_path, 'r', encoding='utf-8', newline='') as f:
        content = f.read()

    # Remove BOM if present
    content = content.lstrip('\ufeff')

    def adjust_match(match):
        """Return the shifted replacement for one matched timing line."""
        start_ms, end_ms = (parse_timestamp(ts) for ts in match.groups())
        new_start = format_timestamp(max(0, start_ms - offset_ms))
        new_end = format_timestamp(max(0, end_ms - offset_ms))
        return f"{new_start} --> {new_end}"

    adjusted_content = _CUE_TIMING_RE.sub(adjust_match, content)

    with open(output_path, 'w', encoding='utf-8', newline='') as f:
        f.write(adjusted_content)

    print(f"✓ Adjusted {input_path.name}: offset={offset_ms}ms → {output_path.name}")


def find_first_timestamp(srt_path):
    """Find the first timestamp in an SRT file.

    Args:
        srt_path: Path to the SRT file to scan.

    Returns:
        The start time, in milliseconds, of the first timing line that
        appears in the file. Returns 0 when the file contains no timing
        line, which callers cannot tell apart from a file whose first cue
        really starts at zero.
    """
    with open(srt_path, 'r', encoding='utf-8') as f:
        content = f.read()

    match = _CUE_START_RE.search(content)
    if match:
        return parse_timestamp(match.group(1))
    return 0


def main():
    """Shift each known SRT file in ``srt-out`` so it starts at zero.

    Looks for a fixed list of file names in the ``srt-out`` directory next
    to this script. Missing files are skipped, files whose first timestamp
    is already zero are left alone, and the rest are rewritten in place
    with the first timestamp used as the offset.
    """
    srt_dir = Path(__file__).parent / "srt-out"

    # Files to adjust
    srt_files = [
        "assembly.srt",
        "gladia.srt",
        "nova3.srt",
        "speechmatics.srt",
    ]

    print("Analyzing SRT files for timing offset...\n")

    for filename in srt_files:
        input_path = srt_dir / filename

        if not input_path.exists():
            print(f"⚠ Skipping {filename} (not found)")
            continue

        # The first cue's start time is the amount to remove from every cue.
        offset_ms = find_first_timestamp(input_path)

        if offset_ms == 0:
            print(f"✓ {filename} already starts at 00:00:00,000 (no adjustment needed)")
            continue

        # Adjust the file in place
        adjust_srt_timing(input_path, input_path, offset_ms)

    print("\n✅ All SRT files have been adjusted to start at 00:00:00,000")


if __name__ == "__main__":
    main()