Spaces:
Running
Running
| <html lang="en"> | |
| <head> | |
| <meta charset="utf-8" /> | |
| <meta name="viewport" content="width=device-width" /> | |
| <title>STT Comparison Playground</title> | |
| <link rel="stylesheet" href="style.css" /> | |
| </head> | |
| <body> | |
| <main class="app"> | |
| <section class="hero"> | |
| <div> | |
| <h1>Speech-to-Text Comparison</h1> | |
| <p> | |
| Play the sample podcast and compare how each transcription model handled it side-by-side. | |
| Each row shows all transcripts for the same time segment. | |
| </p> | |
| <p class="hero-meta">The active row follows the audio so you can quickly inspect what every model heard.</p> | |
| </div> | |
| <div class="audio-shell"> | |
| <audio id="audio" controls preload="auto" src="data/audio/podcast.mp3"></audio> | |
| <canvas id="waveform" role="img" aria-label="Audio waveform preview"></canvas> | |
| </div> | |
| </section> | |
| <section class="insights"> | |
| <div> | |
| <h2>Model snapshots</h2> | |
| <p class="section-subtitle">WER and coverage come from the loaded SRT files compared against the ground truth.</p> | |
| </div> | |
| <div class="insight-grid" id="model-insights" aria-live="polite" aria-busy="true"></div> | |
| </section> | |
| <section class="transcripts"> | |
| <header class="transcripts-toolbar"> | |
| <div class="search-group"> | |
| <label for="segment-search">Search segments</label> | |
| <div class="search-input"> | |
| <input id="segment-search" type="search" placeholder="Find a topic, word, or phrase..." aria-describedby="match-count" /> | |
| <button id="clear-search" type="button" aria-label="Clear search">Clear</button> | |
| </div> | |
| </div> | |
| <div class="match-count" id="match-count" role="status" aria-live="polite">Loading segments...</div> | |
| </header> | |
| <div class="model-legend" id="model-legend" aria-live="polite"></div> | |
| <div class="comparison-table" id="comparison-table" aria-live="polite"></div> | |
| </section> | |
| </main> | |
| <script src="transcripts.js"></script> | |
| <script type="module"> | |
// Resolve every DOM node the script works with in one pass. Each binding is
// null when the corresponding id is missing from the markup.
const [
  comparisonTableEl,
  insightsEl,
  legendEl,
  matchCountEl,
  searchInput,
  clearSearchBtn,
  audioElem,
  waveformCanvas,
] = [
  "comparison-table",
  "model-insights",
  "model-legend",
  "match-count",
  "segment-search",
  "clear-search",
  "audio",
  "waveform",
].map((elementId) => document.getElementById(elementId));

// Transcript text keyed by track id, read from a global when one is present
// (presumably set by transcripts.js, which loads before this module — verify).
const transcriptSources = window.TRANSCRIPTS || {};
// One entry per transcript column, in display order. Each tuple is
// [id, label, SRT file path, accent colour] and is expanded into an object.
const tracks = [
  ["truth", "Ground Truth", "data/ground-truth/truth_1.srt", "#00b894"],
  ["assembly", "AssemblyAI", "srt-out/assembly.srt", "#4070f4"],
  ["gladia", "Gladia", "srt-out/gladia.srt", "#9b5de5"],
  ["nova3", "Whisper Nova 3", "srt-out/nova3.srt", "#ff6b6b"],
  ["speechmatics", "Speechmatics", "srt-out/speechmatics.srt", "#ffa600"],
].map(([id, label, file, accent]) => ({ id, label, file, accent }));
// Row elements currently rendered in the comparison table, in display order.
const segmentRows = [];
// Parsed transcripts keyed by track id; replaced once loading completes.
let allTranscripts = {};

renderLegend();

// Re-filter the table on every keystroke in the search box.
if (searchInput) {
  searchInput.addEventListener("input", (event) => filterRows(event.target.value));
}

// The clear button empties the query, shows every row again and hands focus
// back to the search box.
if (clearSearchBtn) {
  clearSearchBtn.addEventListener("click", () => {
    if (searchInput) {
      searchInput.value = "";
      filterRows("");
      searchInput.focus();
    }
  });
}
/**
 * Convert an SRT-style timestamp into seconds.
 *
 * Accepts "HH:MM:SS,mmm" as well as the common variants "HH:MM:SS.mmm",
 * "HH:MM:SS" (no fractional part) and "MM:SS,mmm" (no hours). Anything after
 * the timestamp (for example cue position settings) is ignored. The
 * fractional digits are interpreted as milliseconds.
 *
 * @param {string} value - Timestamp text.
 * @returns {number} Offset in seconds, or NaN when the text is not a timestamp.
 */
function parseTimestamp(value) {
  const match = /^\s*(?:(\d+):)?(\d+):(\d+)(?:[.,](\d+))?/.exec(String(value ?? ""));
  if (!match) {
    return Number.NaN;
  }
  // Unmatched optional groups are undefined, so the defaults fill them in.
  const [, hours = "0", minutes, seconds, milliseconds = "0"] = match;
  return Number(hours) * 3600 + Number(minutes) * 60 + Number(seconds) + Number(milliseconds) / 1000;
}
/**
 * Parse SRT subtitle text into cue objects.
 *
 * Blocks are separated by blank lines. Within a block the timing line is
 * located by its "-->" marker rather than by position, so blocks without a
 * numeric index line still parse. Blocks with no timing line, with timing
 * that does not parse to finite numbers, or with no text line after the
 * timing are skipped.
 *
 * @param {string} text - Raw SRT file contents.
 * @returns {{start: number, end: number, content: string}[]} Cues in file
 *   order; `start`/`end` are in seconds and `content` has its whitespace
 *   collapsed to single spaces.
 */
function parseSrt(text) {
  const blocks = String(text ?? "").replace(/\r/g, "").trim().split(/\n{2,}/);
  const cues = [];
  for (const block of blocks) {
    const lines = block.split("\n");
    const timingIndex = lines.findIndex((line) => line.includes("-->"));
    // No timing line, or nothing after it: not a usable cue.
    if (timingIndex === -1 || timingIndex + 1 >= lines.length) {
      continue;
    }
    const [start, end] = lines[timingIndex].split("-->").map((part) => parseTimestamp(part.trim()));
    // A cue that cannot be placed on the timeline cannot be aligned with others.
    if (!Number.isFinite(start) || !Number.isFinite(end)) {
      continue;
    }
    const content = lines
      .slice(timingIndex + 1)
      .join(" ")
      .replace(/\s+/g, " ")
      .trim();
    cues.push({ start, end, content });
  }
  return cues;
}
/**
 * Format a duration in seconds as "MM:SS".
 *
 * Minutes are not wrapped into hours, so 3661 seconds renders as "61:01".
 *
 * @param {number} seconds - Offset in seconds.
 * @returns {string} Zero-padded minutes and seconds separated by a colon.
 */
function formatTime(seconds) {
  const twoDigits = (amount) => String(amount).padStart(2, "0");
  const wholeMinutes = Math.floor(seconds / 60);
  const leftoverSeconds = Math.floor(seconds % 60);
  return `${twoDigits(wholeMinutes)}:${twoDigits(leftoverSeconds)}`;
}
/**
 * Look up a track's transcript text in the preloaded `transcriptSources` map.
 *
 * @param {{id: string}} track - Track descriptor.
 * @returns {string|null} The text with a leading byte-order mark removed, or
 *   null when nothing (or an empty string) is stored for the track.
 */
function getTranscriptText(track) {
  const preloaded = transcriptSources[track.id];
  return preloaded ? preloaded.replace(/^\ufeff/, "") : null;
}
/**
 * Download a track's SRT file.
 *
 * @param {{file: string, label: string}} track - Track descriptor.
 * @returns {Promise<string>} File contents with a leading byte-order mark removed.
 * @throws {Error} When the response status is not OK.
 */
async function fetchTranscript(track) {
  const response = await fetch(track.file);
  if (response.ok) {
    const body = await response.text();
    return body.replace(/^\ufeff/, "");
  }
  throw new Error(`Unable to load ${track.label}`);
}
/**
 * Load and parse the transcript for every entry in `tracks`, in parallel.
 *
 * Preloaded text is preferred; the SRT file is fetched only when no preloaded
 * text is available. A failure for one track is logged and recorded on that
 * track's entry instead of rejecting the whole load.
 *
 * @returns {Promise<Object>} Map of track id to `{ segments, track }`, with an
 *   extra `error: true` (and empty `segments`) for tracks that failed.
 */
async function loadAllTranscripts() {
  const results = {};

  // Loads a single track and stores the outcome under its id.
  const loadInto = async (track) => {
    try {
      const text = getTranscriptText(track) || (await fetchTranscript(track));
      results[track.id] = { segments: parseSrt(text), track };
    } catch (error) {
      console.error(`Failed to load ${track.label}:`, error);
      results[track.id] = { segments: [], track, error: true };
    }
  };

  await Promise.all(tracks.map((track) => loadInto(track)));
  return results;
}
/**
 * Find the first segment whose half-open interval [start, end) contains `time`.
 *
 * @param {{start: number, end: number}[]} segments - Segments to search, in order.
 * @param {number} time - Position in seconds.
 * @returns {Object|undefined} The matching segment, or undefined when none matches.
 */
function findSegmentForTime(segments, time) {
  for (const candidate of segments) {
    const hasStarted = time >= candidate.start;
    if (hasStarted && time < candidate.end) {
      return candidate;
    }
  }
  return undefined;
}
// Characters that are unsafe inside HTML text or attribute values, mapped to
// their entity replacements.
const HTML_ESCAPE_MAP = {
  "&": "&amp;",
  "<": "&lt;",
  ">": "&gt;",
  '"': "&quot;",
  "'": "&#39;",
};

/**
 * Escape text so it can be inserted into markup via innerHTML.
 *
 * Transcript text is external data, so it must never reach innerHTML
 * unescaped. Non-string values are stringified; null and undefined become "".
 *
 * @param {*} [value=""] - Text to escape.
 * @returns {string} The text with &, <, >, " and ' replaced by HTML entities.
 */
function escapeHtml(value = "") {
  return String(value ?? "").replace(/[&<>"']/g, (char) => HTML_ESCAPE_MAP[char]);
}
/**
 * Split text into whitespace-delimited tokens.
 *
 * @param {string} [text=""] - Text to split.
 * @returns {string[]} Tokens in order; empty when the text has no
 *   non-whitespace characters.
 */
function tokenizeText(text = "") {
  // Each maximal run of non-whitespace characters is one token.
  const words = text.match(/\S+/g);
  return words === null ? [] : words;
}
/**
 * Tokenize the combined text of a list of segments.
 *
 * @param {{content: string}[]} [segments=[]] - Segments whose `content` is
 *   joined with spaces before tokenizing.
 * @returns {string[]} Tokens across all segments, in order.
 */
function tokenizeSegments(segments = []) {
  const contents = segments.map(({ content }) => content);
  const combined = contents.join(" ");
  return tokenizeText(combined);
}
/**
 * Build the longest-common-subsequence length table for two token lists.
 *
 * Entry [i][j] holds the LCS length of the first i reference tokens and the
 * first j candidate tokens; row 0 and column 0 are all zeros. Tokens are
 * compared with strict equality.
 *
 * @param {string[]} referenceTokens - Tokens indexed by row.
 * @param {string[]} candidateTokens - Tokens indexed by column.
 * @returns {number[][]} Table with (reference length + 1) rows and
 *   (candidate length + 1) columns.
 */
function buildLcsMatrix(referenceTokens, candidateTokens) {
  const width = candidateTokens.length + 1;
  const table = [new Array(width).fill(0)];
  for (let r = 0; r < referenceTokens.length; r += 1) {
    const above = table[r];
    const current = [0];
    for (let c = 0; c < candidateTokens.length; c += 1) {
      if (referenceTokens[r] === candidateTokens[c]) {
        // Extend the subsequence that ended just before both tokens.
        current.push(above[c] + 1);
      } else {
        // Carry over the better result from dropping either token.
        current.push(Math.max(above[c + 1], current[c]));
      }
    }
    table.push(current);
  }
  return table;
}
/**
 * Mark each word of `candidate` as matching or not matching `reference`.
 *
 * Words are aligned case-insensitively along a longest common subsequence;
 * candidate words on that subsequence are matches, all others are not.
 * Reference-only words do not appear in the output.
 *
 * @param {string} reference - Text to compare against.
 * @param {string} candidate - Text whose words are classified.
 * @returns {{text: string, match: boolean}[]} Candidate words in their
 *   original order and casing; empty when the candidate has no words.
 */
function diffWords(reference, candidate) {
  const referenceKeys = tokenizeText(reference).map((word) => word.toLowerCase());
  const candidateWords = tokenizeText(candidate);
  const candidateKeys = candidateWords.map((word) => word.toLowerCase());
  if (candidateWords.length === 0) {
    return [];
  }

  const lcs = buildLcsMatrix(referenceKeys, candidateKeys);

  // Walk the table from the bottom-right corner, collecting words back to
  // front; the list is reversed once at the end.
  const collected = [];
  let r = referenceKeys.length;
  let c = candidateKeys.length;
  while (r > 0 && c > 0) {
    if (referenceKeys[r - 1] === candidateKeys[c - 1]) {
      collected.push({ text: candidateWords[c - 1], match: true });
      r -= 1;
      c -= 1;
      continue;
    }
    if (lcs[r - 1][c] >= lcs[r][c - 1]) {
      // Skip a reference word that has no counterpart here.
      r -= 1;
      continue;
    }
    collected.push({ text: candidateWords[c - 1], match: false });
    c -= 1;
  }
  // Candidate words left over once the reference is exhausted are unmatched.
  for (; c > 0; c -= 1) {
    collected.push({ text: candidateWords[c - 1], match: false });
  }
  return collected.reverse();
}
/**
 * Turn classified words into an HTML string.
 *
 * @param {{text: string, match: boolean}[]} tokens - Words to render.
 * @returns {string} Escaped words joined by spaces, with non-matching words
 *   wrapped in a `diff-token` span; a "No transcript" placeholder when the
 *   list is empty.
 */
function renderDiffTokens(tokens) {
  if (tokens.length === 0) {
    return '<span class="muted-text">No transcript</span>';
  }
  const pieces = [];
  for (const token of tokens) {
    const safeText = escapeHtml(token.text);
    pieces.push(token.match ? safeText : `<span class="diff-token">${safeText}</span>`);
  }
  return pieces.join(" ");
}
/**
 * Word-level Levenshtein (edit) distance between two token lists.
 *
 * Tokens are compared case-insensitively; insertion, deletion and
 * substitution each cost 1. Only two rows of the dynamic-programming table
 * are kept, so memory grows with the candidate length rather than with the
 * product of both lengths — this function is called with whole-transcript
 * token lists, where a full table would be very large.
 *
 * @param {string[]} referenceTokens - Reference words.
 * @param {string[]} candidateTokens - Candidate words.
 * @returns {number} Minimum number of word edits turning one list into the other.
 */
function levenshteinDistance(referenceTokens, candidateTokens) {
  // Normalize case once instead of on every cell comparison.
  const reference = referenceTokens.map((token) => token.toLowerCase());
  const candidate = candidateTokens.map((token) => token.toLowerCase());
  const cols = candidate.length + 1;

  // previous = distances for the prior reference prefix; row 0 is 0..m.
  let previous = Array.from({ length: cols }, (_, j) => j);
  let current = new Array(cols).fill(0);

  for (let i = 1; i <= reference.length; i += 1) {
    current[0] = i;
    for (let j = 1; j < cols; j += 1) {
      const cost = reference[i - 1] === candidate[j - 1] ? 0 : 1;
      current[j] = Math.min(
        previous[j] + 1, // deletion
        current[j - 1] + 1, // insertion
        previous[j - 1] + cost // substitution or match
      );
    }
    // Reuse the two buffers instead of allocating a row per iteration.
    [previous, current] = [current, previous];
  }
  return previous[cols - 1];
}
/**
 * Compute summary statistics for every non-truth track.
 *
 * For each track: `wer` is the word edit distance to the ground truth
 * divided by the ground-truth word count, `wordMatch` is 1 - WER floored at
 * zero, and `coverage` is the ratio of the track's segment count to the
 * ground-truth segment count, capped at 1. All three are returned as
 * percentages. Empty ground-truth counts are treated as 1 to avoid division
 * by zero.
 *
 * @returns {{track: Object, wer: number, wordMatch: number, coverage: number,
 *   segments: number, avgWords: number}[]} One entry per model, in `tracks` order.
 */
function computeModelInsights() {
  const truthSegments = allTranscripts.truth?.segments || [];
  const truthTokens = tokenizeSegments(truthSegments);
  const tokenDenominator = truthTokens.length || 1;
  const segmentDenominator = truthSegments.length || 1;

  const insights = [];
  for (const track of tracks) {
    if (track.id === "truth") {
      continue;
    }
    const modelSegments = allTranscripts[track.id]?.segments || [];
    const modelTokens = tokenizeSegments(modelSegments);
    const segmentTotal = modelSegments.length;
    const wer = levenshteinDistance(truthTokens, modelTokens) / tokenDenominator;
    insights.push({
      track,
      wer: wer * 100,
      wordMatch: Math.max(0, 1 - wer) * 100,
      coverage: Math.min(1, segmentTotal / segmentDenominator) * 100,
      segments: segmentTotal,
      avgWords: segmentTotal > 0 ? Math.round(modelTokens.length / segmentTotal) : 0,
    });
  }
  return insights;
}
/**
 * Render one summary card per model into the insights container.
 *
 * Clears the container's `aria-busy` flag first. When the ground-truth
 * transcript has no segments, a notice is shown instead of cards. Does
 * nothing if the container element is missing.
 */
function renderModelInsights() {
  if (!insightsEl) return;
  insightsEl.removeAttribute("aria-busy");

  const hasTruth = Boolean(allTranscripts.truth?.segments?.length);
  if (!hasTruth) {
    insightsEl.innerHTML = '<p class="match-count">Ground truth transcript was not found.</p>';
    return;
  }

  const cards = [];
  for (const stat of computeModelInsights()) {
    cards.push(`
      <article class="insight-card" style="--accent: ${stat.track.accent}">
        <h3>${stat.track.label}</h3>
        <div>
          <div class="metric-value">${stat.wordMatch.toFixed(1)}%</div>
          <div class="metric-label">Word Match</div>
        </div>
        <div class="insight-meta">
          <span>WER ${stat.wer.toFixed(1)}%</span>
          <span>${stat.segments} segments</span>
        </div>
        <div class="insight-meta">
          <span>Coverage ${Math.round(stat.coverage)}%</span>
          <span>${stat.avgWords || 0} words/segment</span>
        </div>
      </article>
    `);
  }
  insightsEl.innerHTML = cards.join("");
}
/**
 * Fill the legend container with one accent-coloured label per track.
 * Does nothing if the legend element is missing.
 */
function renderLegend() {
  if (!legendEl) return;
  let markup = "";
  for (const { accent, label } of tracks) {
    markup += `<span style="--accent: ${accent}"><i></i>${label}</span>`;
  }
  legendEl.innerHTML = markup;
}
/**
 * Rebuild the side-by-side comparison table.
 *
 * Emits a header row followed by one row per ground-truth segment. Each row
 * has a time cell plus one cell per track, showing the track's segment that
 * contains the ground-truth segment's start time. Model cells highlight the
 * words that differ from the ground truth; tracks that failed to load show
 * an error marker. Every row stores its combined lowercase text in
 * `data-row-text` for searching, and the current search query is re-applied
 * at the end.
 */
function renderComparisonTable() {
  // Drop the previous render and forget the rows that belonged to it.
  comparisonTableEl.innerHTML = "";
  segmentRows.length = 0;

  // Small factory for the many class-only <div> elements built below.
  const makeDiv = (className) => {
    const element = document.createElement("div");
    element.className = className;
    return element;
  };

  const headerCells = tracks
    .map(({ accent, label }) => `<div class="model-column" style="--accent: ${accent}">${label}</div>`)
    .join("");
  const header = makeDiv("comparison-header");
  header.innerHTML = `
    <div class="time-column">Time</div>
    ${headerCells}
  `;
  comparisonTableEl.appendChild(header);

  for (const truthSegment of allTranscripts.truth?.segments || []) {
    const row = makeDiv("comparison-row");
    row.dataset.start = truthSegment.start;
    row.dataset.end = truthSegment.end;

    const timeCell = makeDiv("time-cell");
    timeCell.textContent = formatTime(truthSegment.start);
    row.appendChild(timeCell);

    // Text from every column, gathered for the search index.
    const searchableParts = [truthSegment.content || ""];

    for (const track of tracks) {
      const cell = makeDiv("transcript-cell");
      cell.style.setProperty("--accent", track.accent);

      const transcript = allTranscripts[track.id];
      let cellText = "";
      if (transcript?.error) {
        cell.innerHTML = '<em class="error-text">Error loading</em>';
      } else {
        const segment = findSegmentForTime(transcript?.segments || [], truthSegment.start);
        cellText = segment?.content || "";
        if (track.id === "truth") {
          cell.innerHTML = `<p class="transcript-text">${escapeHtml(truthSegment.content)}</p>`;
        } else {
          cell.innerHTML = renderDiffTokens(diffWords(truthSegment.content || "", cellText));
        }
      }
      searchableParts.push(cellText);
      row.appendChild(cell);
    }

    row.dataset.rowText = searchableParts.join(" ").toLowerCase();
    comparisonTableEl.appendChild(row);
    segmentRows.push(row);
  }

  filterRows(searchInput?.value || "");
}
/**
 * Update the status line that reports how many segments are visible.
 *
 * @param {number} matches - Number of rows matching the current query.
 * @param {string} query - Query text to echo back; empty when no filter is active.
 */
function updateMatchCount(matches, query) {
  if (!matchCountEl) return;
  const total = segmentRows.length;

  let message;
  if (total === 0) {
    message = "No segments loaded.";
  } else if (query) {
    message = `Showing ${matches}/${total} segments for "${query}"`;
  } else {
    message = `Showing all ${total} segments`;
  }
  matchCountEl.textContent = message;
}
/**
 * Show only the rows whose text contains the query, then refresh the count.
 *
 * Matching is a case-insensitive substring test against each row's
 * `data-row-text`. A blank query shows every row.
 *
 * @param {string} rawQuery - Query exactly as typed by the user.
 */
function filterRows(rawQuery) {
  const needle = rawQuery.trim().toLowerCase();
  const hasQuery = needle !== "";
  let visibleCount = 0;

  for (const row of segmentRows) {
    const haystack = row.dataset.rowText || "";
    const shouldHide = hasQuery && !haystack.includes(needle);
    row.classList.toggle("is-hidden", shouldHide);
    if (!shouldHide) {
      visibleCount += 1;
    }
  }

  updateMatchCount(visibleCount, hasQuery ? rawQuery : "");
}
/**
 * Highlight the row(s) whose time range contains the playback position.
 *
 * A row is active while `time` lies in its half-open [start, end) range, read
 * from the row's `data-start` / `data-end` attributes. The row is scrolled
 * into view only at the moment it becomes active: this function runs on
 * every `timeupdate` event, and scrolling on each call would keep restarting
 * the smooth scroll and pull the page back whenever the user scrolls away.
 *
 * @param {number} time - Current playback position in seconds.
 */
function updateActiveSegments(time) {
  segmentRows.forEach((row) => {
    const start = Number(row.dataset.start);
    const end = Number(row.dataset.end);
    const isActive = time >= start && time < end;
    const wasActive = row.classList.contains("is-active");
    row.classList.toggle("is-active", isActive);
    if (isActive && !wasActive) {
      row.scrollIntoView({ behavior: "smooth", block: "center" });
    }
  });
}
// Pending or resolved first-channel samples of the audio file. Cached so that
// redraws (for example on window resize) do not download and decode again.
let waveformSamplesPromise = null;

/**
 * Download the audio file and decode its first channel.
 *
 * @returns {Promise<Float32Array|null>} Sample data, or null when the HTTP
 *   response is not OK.
 */
async function loadWaveformSamples() {
  const response = await fetch(audioElem.currentSrc || audioElem.src);
  if (!response.ok) return null;
  const arrayBuffer = await response.arrayBuffer();
  const audioContext = new AudioContext();
  try {
    const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
    return audioBuffer.getChannelData(0);
  } finally {
    // The context is only needed for decoding; release its audio resources.
    audioContext.close().catch((error) => console.warn("Unable to close AudioContext:", error));
  }
}

/**
 * Draw an amplitude overview of the audio file onto the waveform canvas.
 *
 * The canvas backing store is sized to its CSS size times the device pixel
 * ratio, and one vertical bar is drawn per CSS pixel column with a height
 * proportional to the mean absolute sample value of that column's slice.
 * Load or decode failures are logged and leave the canvas untouched; they do
 * not reject the returned promise.
 *
 * @returns {Promise<void>}
 */
async function drawWaveform() {
  if (!waveformCanvas || !audioElem) return;

  let rawData;
  try {
    if (!waveformSamplesPromise) {
      waveformSamplesPromise = loadWaveformSamples();
    }
    rawData = await waveformSamplesPromise;
  } catch (error) {
    // Forget the failed attempt so a later call can retry.
    waveformSamplesPromise = null;
    console.error("Unable to load audio for the waveform:", error);
    return;
  }
  if (!rawData) {
    waveformSamplesPromise = null;
    return;
  }

  const canvas = waveformCanvas;
  const dpr = window.devicePixelRatio || 1;
  // Assigning width/height also resets the context transform, so the scale
  // below is applied exactly once per draw.
  canvas.width = canvas.clientWidth * dpr;
  canvas.height = canvas.clientHeight * dpr;
  const ctx = canvas.getContext("2d");
  if (!ctx) return;
  ctx.scale(dpr, dpr);
  ctx.clearRect(0, 0, canvas.clientWidth, canvas.clientHeight);

  const sliceWidth = Math.max(1, Math.floor(rawData.length / Math.max(1, canvas.clientWidth)));
  const halfHeight = canvas.clientHeight / 2;
  ctx.lineWidth = 1.25;
  ctx.strokeStyle = "#1f2937";
  ctx.beginPath();
  for (let i = 0; i < canvas.clientWidth; i += 1) {
    const sliceStart = i * sliceWidth;
    let sum = 0;
    for (let j = 0; j < sliceWidth; j += 1) {
      // Reads past the end of the data count as silence.
      sum += Math.abs(rawData[sliceStart + j] || 0);
    }
    const amplitude = sum / sliceWidth;
    // Bars are mirrored around the vertical centre of the canvas.
    ctx.moveTo(i, halfHeight - amplitude * halfHeight);
    ctx.lineTo(i, halfHeight + amplitude * halfHeight);
  }
  ctx.stroke();
}
/**
 * Load every transcript, render the table and the model cards, then start
 * drawing the waveform in the background.
 *
 * Failures are logged rather than left as unhandled promise rejections. The
 * waveform is decorative, so it is not awaited and its failure does not
 * affect the transcript views.
 *
 * @returns {Promise<void>}
 */
async function bootstrap() {
  try {
    allTranscripts = await loadAllTranscripts();
    renderComparisonTable();
    renderModelInsights();
  } catch (error) {
    console.error("Failed to initialise the comparison view:", error);
  }
  // Deliberately not awaited; the catch keeps a rejection from going unhandled.
  drawWaveform().catch((error) => console.error("Failed to draw waveform:", error));
}
// Keep the highlighted row in step with playback. Guarded like the other
// element lookups so a missing <audio> element does not abort the script.
audioElem?.addEventListener("timeupdate", () => updateActiveSegments(audioElem.currentTime));

// Resize events arrive in rapid bursts; redraw once the size has settled
// instead of starting a redraw for every event.
let waveformResizeTimer;
window.addEventListener("resize", () => {
  clearTimeout(waveformResizeTimer);
  waveformResizeTimer = setTimeout(() => {
    drawWaveform().catch((error) => console.error("Failed to redraw waveform:", error));
  }, 150);
});

bootstrap().catch((error) => console.error("Failed to start the playground:", error));
| </script> | |
| </body> | |
| </html> | |