Spaces:

danielrosehill
/

STT-Comparison

Running

App Files Files Community

STT-Comparison / adjust_srt_timing.py

danielrosehill

Fix SRT timestamp alignment with ground truth

0aa8adc 28 days ago

raw

history blame contribute delete

3.45 kB

	#!/usr/bin/env python3
	"""
	Adjust SRT file timestamps to align with ground truth.
	This script removes each file's leading timing offset (the start time of its
	first cue) so that every transcript starts at 00:00:00,000.
	"""

	import re
	from pathlib import Path


	def parse_timestamp(timestamp_str):
	    """Turn an SRT timestamp (``HH:MM:SS,mmm``) into a millisecond count.

	    Args:
	        timestamp_str: Timestamp text with a comma before the milliseconds.

	    Returns:
	        Total number of milliseconds as an int.

	    Raises:
	        ValueError: If the text does not split into ``H:M:S`` plus a
	            millisecond part, or any part is not an integer.
	    """
	    clock, millis = timestamp_str.split(',')
	    hours, minutes, seconds = (int(field) for field in clock.split(':'))
	    whole_seconds = hours * 3600 + minutes * 60 + seconds
	    return whole_seconds * 1000 + int(millis)


	def format_timestamp(ms):
	    """Render a millisecond count as an SRT timestamp (``HH:MM:SS,mmm``).

	    Args:
	        ms: Integer number of milliseconds.

	    Returns:
	        The timestamp string. Hours, minutes and seconds are zero-padded to
	        at least two digits and milliseconds to three; the hours field grows
	        wider when the value reaches 100 hours or more.
	    """
	    # Peel off the largest unit first and carry the remainder down.
	    hours, remainder = divmod(ms, 3600000)
	    minutes, remainder = divmod(remainder, 60000)
	    seconds, milliseconds = divmod(remainder, 1000)
	    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"


	def adjust_srt_timing(input_path, output_path, offset_ms):
	    """
	    Adjust all timestamps in an SRT file by subtracting offset_ms.

	    Every ``HH:MM:SS,mmm --> HH:MM:SS,mmm`` timing line is rewritten with
	    both endpoints moved earlier by ``offset_ms``; each result is clamped at
	    zero so no timestamp becomes negative. All other text, including the
	    file's line endings, is copied unchanged, except that a leading
	    byte-order mark is dropped. The input is read completely before the
	    output is opened, so both paths may name the same file.

	    Args:
	        input_path: Path to input SRT file (str or Path)
	        output_path: Path to output SRT file (str or Path)
	        offset_ms: Offset in milliseconds to subtract from all timestamps
	    """
	    # Accept plain strings too: the summary line below needs ``.name``.
	    input_path = Path(input_path)
	    output_path = Path(output_path)

	    # newline='' disables newline translation so CRLF files stay CRLF.
	    with open(input_path, 'r', encoding='utf-8', newline='') as f:
	        content = f.read()

	    # Remove BOM if present
	    content = content.lstrip('\ufeff')

	    # Pattern to match timestamp lines: HH:MM:SS,mmm --> HH:MM:SS,mmm
	    # Whitespace around the arrow is optional so that every timing line
	    # find_first_timestamp can detect is also shifted here.
	    timestamp_pattern = re.compile(
	        r'(\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2},\d{3})'
	    )

	    def adjust_match(match):
	        """Return the timing line for one match with both ends shifted."""
	        start_ms = parse_timestamp(match.group(1))
	        end_ms = parse_timestamp(match.group(2))

	        # Subtract offset, never going below zero
	        new_start = format_timestamp(max(0, start_ms - offset_ms))
	        new_end = format_timestamp(max(0, end_ms - offset_ms))

	        return f"{new_start} --> {new_end}"

	    adjusted_content = timestamp_pattern.sub(adjust_match, content)

	    with open(output_path, 'w', encoding='utf-8', newline='') as f:
	        f.write(adjusted_content)

	    print(f"✓ Adjusted {input_path.name}: offset={offset_ms}ms → {output_path.name}")


	def find_first_timestamp(srt_path):
	    """Return the start time of the first timing line in an SRT file.

	    Args:
	        srt_path: Path to the SRT file, read as UTF-8.

	    Returns:
	        The first ``HH:MM:SS,mmm`` value that is followed by ``-->``,
	        converted to milliseconds, or 0 when the file contains no such line.
	    """
	    with open(srt_path, 'r', encoding='utf-8') as handle:
	        text = handle.read()

	    first_cue = re.search(r'(\d{2}:\d{2}:\d{2},\d{3})\s*-->', text)
	    if first_cue is None:
	        return 0
	    return parse_timestamp(first_cue.group(1))


	def main():
	    """Shift each known SRT file in ``srt-out`` so its first cue starts at zero.

	    Looks in the ``srt-out`` directory next to this script. For every listed
	    file that exists, the start time of its first cue is used as the offset
	    and the file is rewritten in place. Missing files and files whose first
	    cue already starts at zero are reported and left untouched.
	    """
	    base_dir = Path(__file__).parent / "srt-out"

	    # Files to adjust
	    targets = (
	        "assembly.srt",
	        "gladia.srt",
	        "nova3.srt",
	        "speechmatics.srt",
	    )

	    print("Analyzing SRT files for timing offset...\n")

	    for name in targets:
	        srt_path = base_dir / name
	        if srt_path.exists():
	            # The first cue's start time is the amount to shift by.
	            lead_in_ms = find_first_timestamp(srt_path)
	            if lead_in_ms != 0:
	                # Same path for input and output: adjust the file in place
	                adjust_srt_timing(srt_path, srt_path, lead_in_ms)
	            else:
	                print(f"✓ {name} already starts at 00:00:00,000 (no adjustment needed)")
	        else:
	            print(f"⚠ Skipping {name} (not found)")

	    print("\n✅ All SRT files have been adjusted to start at 00:00:00,000")


	# Run the adjustment only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()