Spaces:
Running
Running
| #!/usr/bin/env python3 | |
| """ | |
| Adjust SRT file timestamps to align with ground truth. | |
| This script removes timing offsets to ensure all transcripts start at 00:00:00,000 | |
| """ | |
| import re | |
| from pathlib import Path | |
def parse_timestamp(timestamp_str):
    """Return the total number of milliseconds encoded by an SRT timestamp.

    The input is expected in ``HH:MM:SS,mmm`` form: a colon-separated clock
    part and a millisecond part, joined by a single comma.
    """
    clock, millis = timestamp_str.split(',')
    hours, minutes, seconds = (int(field) for field in clock.split(':'))
    whole_seconds = hours * 3600 + minutes * 60 + seconds
    return whole_seconds * 1000 + int(millis)
def format_timestamp(ms):
    """Render a millisecond count as an SRT timestamp (``HH:MM:SS,mmm``).

    Hours are zero-padded to at least two digits and are not capped, so
    values of 100 hours or more simply produce a wider hours field.
    """
    hours, remainder = divmod(ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    seconds, milliseconds = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"
def adjust_srt_timing(input_path, output_path, offset_ms):
    """
    Adjust all timestamps in an SRT file by subtracting offset_ms.

    Every ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` cue line has both endpoints
    shifted earlier by ``offset_ms``; a shifted value that would go negative
    is clamped to zero. All other text is copied through unchanged, except
    that a leading byte-order mark is dropped. The file's own line endings
    are preserved.

    Args:
        input_path: Path to input SRT file (str or pathlib.Path)
        output_path: Path to output SRT file (str or pathlib.Path); may be
            the same file as input_path, because the input is read in full
            before the output is opened
        offset_ms: Offset in milliseconds to subtract from all timestamps
    """
    # Accept plain strings too: the summary line below needs Path.name, and
    # without this a str argument would crash only after the file was written.
    input_path = Path(input_path)
    output_path = Path(output_path)

    # newline='' disables newline translation so CRLF files stay CRLF.
    with open(input_path, 'r', encoding='utf-8', newline='') as f:
        content = f.read()

    # Remove BOM if present
    content = content.lstrip('\ufeff')

    # Pattern to match timestamp lines: HH:MM:SS,mmm --> HH:MM:SS,mmm
    timestamp_pattern = re.compile(
        r'(\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2},\d{3})'
    )

    def adjust_match(match):
        """Rebuild one cue timing line with both endpoints shifted."""
        start_ms = parse_timestamp(match.group(1))
        end_ms = parse_timestamp(match.group(2))

        # Subtract offset, never letting a timestamp fall below zero
        new_start = format_timestamp(max(0, start_ms - offset_ms))
        new_end = format_timestamp(max(0, end_ms - offset_ms))
        return f"{new_start} --> {new_end}"

    adjusted_content = timestamp_pattern.sub(adjust_match, content)

    with open(output_path, 'w', encoding='utf-8', newline='') as f:
        f.write(adjusted_content)

    print(f"β Adjusted {input_path.name}: offset={offset_ms}ms β {output_path.name}")
def find_first_timestamp(srt_path):
    """Return the start time, in milliseconds, of the first cue in an SRT file.

    The file is read as UTF-8 and scanned for the first ``HH:MM:SS,mmm -->``
    occurrence. Returns 0 when no such timing line exists.
    """
    with open(srt_path, 'r', encoding='utf-8') as srt_file:
        text = srt_file.read()

    first_cue = re.search(r'(\d{2}:\d{2}:\d{2},\d{3})\s*-->', text)
    if first_cue is None:
        return 0
    return parse_timestamp(first_cue.group(1))
def main():
    """Shift each known SRT file in ``srt-out`` so its first cue starts at zero.

    The ``srt-out`` directory is resolved next to this script. For every
    listed file that exists, the start time of its first cue is used as the
    offset and the file is rewritten in place. Missing files, and files whose
    first timestamp is already zero, are reported and left untouched.
    """
    srt_dir = Path(__file__).parent / "srt-out"

    # Files to adjust
    srt_files = (
        "assembly.srt",
        "gladia.srt",
        "nova3.srt",
        "speechmatics.srt",
    )

    print("Analyzing SRT files for timing offset...\n")

    for filename in srt_files:
        target = srt_dir / filename

        if target.exists():
            # The first cue's start time is the amount to shift by
            offset_ms = find_first_timestamp(target)
            if offset_ms == 0:
                print(f"β {filename} already starts at 00:00:00,000 (no adjustment needed)")
            else:
                # Same path for input and output: adjust the file in place
                adjust_srt_timing(target, target, offset_ms)
        else:
            print(f"β Skipping {filename} (not found)")

    print("\nβ All SRT files have been adjusted to start at 00:00:00,000")
# Run the adjustment only when this file is executed as a script, so that
# importing it for its helper functions has no side effects.
if __name__ == "__main__":
    main()