<!--
  STT-Comparison / index.html
  danielrosehill's picture
  commit 4a63305
-->
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <!-- Commonly recommended viewport value: layout width follows the device and the initial zoom is pinned to 100%. -->
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>STT Comparison Playground</title>
  <link rel="stylesheet" href="style.css" />
</head>
<body>
<main class="app">
  <!-- Intro copy plus the audio player; the inline script reads the player's currentTime to highlight rows. -->
  <section class="hero">
    <div>
      <h1>Speech-to-Text Comparison</h1>
      <p>
        Play the sample podcast and compare how each transcription model handled it side-by-side.
        Each row shows all transcripts for the same time segment.
      </p>
      <p class="hero-meta">The active row follows the audio so you can quickly inspect what every model heard.</p>
    </div>
    <div class="audio-shell">
      <audio id="audio" controls preload="auto" src="data/audio/podcast.mp3"></audio>
      <canvas id="waveform" role="img" aria-label="Audio waveform preview"></canvas>
    </div>
  </section>

  <!-- Per-model metric cards; the inline script fills the grid and clears aria-busy once it has rendered. -->
  <section class="insights">
    <div>
      <h2>Model snapshots</h2>
      <p class="section-subtitle">WER and coverage come from the loaded SRT files compared against the ground truth.</p>
    </div>
    <div class="insight-grid" id="model-insights" aria-live="polite" aria-busy="true"></div>
  </section>

  <!-- This section has no heading of its own, so it is named with aria-label. -->
  <section class="transcripts" aria-label="Transcript comparison">
    <header class="transcripts-toolbar">
      <div class="search-group">
        <label for="segment-search">Search segments</label>
        <div class="search-input">
          <input id="segment-search" type="search" placeholder="Find a topic, word, or phrase..." aria-describedby="match-count" />
          <button id="clear-search" type="button" aria-label="Clear search">Clear</button>
        </div>
      </div>
      <!-- The status line below is the live region that reports load and filter results. -->
      <div class="match-count" id="match-count" role="status" aria-live="polite">Loading segments...</div>
    </header>
    <!--
      The legend and the table are filled in bulk by the inline script. They are intentionally
      not live regions: announcing every transcript cell on render would flood assistive
      technology, and the status line above already reports changes.
    -->
    <div class="model-legend" id="model-legend"></div>
    <div class="comparison-table" id="comparison-table"></div>
  </section>
</main>
<script src="transcripts.js"></script>
<script type="module">
// ---------------------------------------------------------------------------
// Module state and one-time wiring
// ---------------------------------------------------------------------------

// Shorthand for id lookups. Any handle below is null when the element is
// missing from the markup.
const byId = (id) => document.getElementById(id);

const comparisonTableEl = byId("comparison-table");
const insightsEl = byId("model-insights");
const legendEl = byId("model-legend");
const matchCountEl = byId("match-count");
const searchInput = byId("segment-search");
const clearSearchBtn = byId("clear-search");
const audioElem = byId("audio");
const waveformCanvas = byId("waveform");

// Optional pre-embedded transcripts, looked up by track id.
// NOTE(review): presumably populated by transcripts.js, which is not visible here — confirm.
const transcriptSources = window.TRANSCRIPTS || {};

// One entry per table column: [id, label, SRT path, accent colour].
// The "truth" track is the reference every other track is compared against.
const tracks = [
  ["truth", "Ground Truth", "data/ground-truth/truth_1.srt", "#00b894"],
  ["assembly", "AssemblyAI", "srt-out/assembly.srt", "#4070f4"],
  ["gladia", "Gladia", "srt-out/gladia.srt", "#9b5de5"],
  ["nova3", "Whisper Nova 3", "srt-out/nova3.srt", "#ff6b6b"],
  ["speechmatics", "Speechmatics", "srt-out/speechmatics.srt", "#ffa600"]
].map(([id, label, file, accent]) => ({ id, label, file, accent }));

// Row elements currently in the comparison table (rebuilt on every render).
const segmentRows = [];
// Parsed transcripts keyed by track id; filled in by bootstrap().
let allTranscripts = {};

// The legend only needs the static track list, so it can be drawn immediately.
renderLegend();

if (searchInput) {
  searchInput.addEventListener("input", (event) => filterRows(event.target.value));
}

if (clearSearchBtn) {
  clearSearchBtn.addEventListener("click", () => {
    if (!searchInput) return;
    // Reset the field, show every row again, and hand focus back to the input.
    searchInput.value = "";
    filterRows("");
    searchInput.focus();
  });
}
/**
 * Convert a subtitle timestamp into seconds.
 *
 * Accepts the SRT form "HH:MM:SS,mmm" and, more leniently, a "." fraction
 * separator, an omitted hours field ("MM:SS,mmm"), an omitted fraction, and
 * trailing whitespace-separated content after the timestamp (e.g. cue
 * position settings following the end time).
 *
 * The digits after the separator are read as a millisecond count, so ",500"
 * is half a second.
 *
 * @param {string} value Timestamp text.
 * @returns {number} Seconds, or NaN when `value` does not start with a timestamp.
 */
function parseTimestamp(value) {
  const text = String(value ?? "").trim();
  // hours are optional; the lookahead allows trailing settings but rejects junk glued to the time.
  const match = /^(?:(\d+):)?(\d+):(\d+)(?:[.,](\d+))?(?=\s|$)/.exec(text);
  if (!match) {
    return NaN;
  }
  const [, hours = "0", minutes, seconds, millisecondPart = "0"] = match;
  return Number(hours) * 3600 + Number(minutes) * 60 + Number(seconds) + Number(millisecondPart) / 1000;
}
/**
 * Parse SRT text into a list of cues.
 *
 * Cues are separated by blank lines (lines holding only spaces or tabs count
 * as blank). Inside a cue the timing line is located by its "-->" marker
 * rather than by position, so cues without the leading numeric index line are
 * still read correctly. Everything after the timing line is joined into one
 * whitespace-normalised string.
 *
 * Blocks with no timing line, or with no text line after it, are skipped.
 *
 * @param {string} text Raw SRT file contents.
 * @returns {{start: number, end: number, content: string}[]} Cues in file
 *   order; `start`/`end` are whatever parseTimestamp returns for each side of
 *   the timing line (NaN for a malformed timestamp).
 */
function parseSrt(text) {
  const blocks = text.replace(/\r/g, "").trim().split(/\n(?:[ \t]*\n)+/);
  const segments = [];
  for (const block of blocks) {
    const lines = block.split("\n");
    const timingIndex = lines.findIndex((line) => line.includes("-->"));
    // No timing line, or nothing after it: not a usable cue.
    if (timingIndex === -1 || timingIndex === lines.length - 1) {
      continue;
    }
    const [startText = "", endText = ""] = lines[timingIndex].split("-->");
    const content = lines
      .slice(timingIndex + 1)
      .join(" ")
      .replace(/\s+/g, " ")
      .trim();
    segments.push({
      start: parseTimestamp(startText.trim()),
      end: parseTimestamp(endText.trim()),
      content
    });
  }
  return segments;
}
/**
 * Format a duration in seconds as "MM:SS".
 *
 * Minutes are not wrapped at 60, so an hour shows as "60:00". Fractions of a
 * second are truncated.
 *
 * @param {number} seconds Duration in seconds.
 * @returns {string} Zero-padded minutes and seconds separated by a colon.
 */
function formatTime(seconds) {
  const twoDigits = (amount) => String(amount).padStart(2, "0");
  const wholeMinutes = Math.floor(seconds / 60);
  const leftoverSeconds = Math.floor(seconds % 60);
  return twoDigits(wholeMinutes) + ":" + twoDigits(leftoverSeconds);
}
/**
 * Look up a pre-embedded transcript for a track.
 *
 * @param {{id: string}} track Track descriptor; only `id` is read.
 * @returns {string|null} The embedded text with a leading byte-order mark
 *   removed, or null when nothing (or an empty value) is embedded for the track.
 */
function getTranscriptText(track) {
  const embedded = transcriptSources[track.id];
  return embedded ? embedded.replace(/^\ufeff/, "") : null;
}
/**
 * Download a track's SRT file over HTTP.
 *
 * @param {{file: string, label: string}} track Track descriptor.
 * @returns {Promise<string>} File contents with a leading byte-order mark removed.
 * @throws {Error} When the response status is not OK.
 */
async function fetchTranscript(track) {
  const response = await fetch(track.file);
  if (response.ok) {
    const body = await response.text();
    return body.replace(/^\ufeff/, "");
  }
  throw new Error(`Unable to load ${track.label}`);
}
/**
 * Load and parse every track's transcript concurrently.
 *
 * Each track first tries the embedded text and only fetches its file when
 * nothing is embedded. A failure in one track is logged and recorded on that
 * track's entry; it never rejects the overall load.
 *
 * @returns {Promise<Object<string, {segments: Array, track: Object, error?: boolean}>>}
 *   Entries keyed by track id. Failed tracks have empty `segments` and `error: true`.
 */
async function loadAllTranscripts() {
  const loaded = {};

  const loadInto = async (track) => {
    let entry;
    try {
      const text = getTranscriptText(track) || (await fetchTranscript(track));
      entry = { segments: parseSrt(text), track: track };
    } catch (error) {
      console.error(`Failed to load ${track.label}:`, error);
      entry = { segments: [], track: track, error: true };
    }
    loaded[track.id] = entry;
  };

  await Promise.all(tracks.map((track) => loadInto(track)));
  return loaded;
}
/**
 * Find the first segment whose half-open interval [start, end) contains `time`.
 *
 * @param {{start: number, end: number}[]} segments Segments to search, in order.
 * @param {number} time Position in seconds.
 * @returns {Object|undefined} The matching segment, or undefined when none contains `time`.
 */
function findSegmentForTime(segments, time) {
  for (const candidate of segments) {
    if (candidate.start <= time && time < candidate.end) {
      return candidate;
    }
  }
  return undefined;
}
/**
 * Escape the five HTML-significant characters (& < > " ') so text can be
 * placed inside markup assigned to innerHTML.
 *
 * @param {string} [value=""] Raw text.
 * @returns {string} Text with each special character replaced by its entity.
 */
function escapeHtml(value = "") {
  const entityFor = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#39;"
  };
  // The character class matches exactly the keys of the table above.
  return value.replace(/[&<>"']/g, (char) => entityFor[char]);
}
/**
 * Split text into whitespace-delimited tokens.
 *
 * @param {string} [text=""] Text to split.
 * @returns {string[]} Tokens in order, case and punctuation untouched; an
 *   empty array for empty or whitespace-only input.
 */
function tokenizeText(text = "") {
  // Every maximal run of non-whitespace characters is one token.
  return text.match(/\S+/g) ?? [];
}
/**
 * Tokenise the combined text of a list of segments.
 *
 * @param {{content: string}[]} [segments=[]] Segments whose `content` is joined with spaces.
 * @returns {string[]} Tokens of the joined text, as produced by tokenizeText.
 */
function tokenizeSegments(segments = []) {
  const combinedText = segments.map(({ content }) => content).join(" ");
  return tokenizeText(combinedText);
}
/**
 * Build the longest-common-subsequence length table for two token lists.
 *
 * Cell [i][j] holds the LCS length of the first i reference tokens and the
 * first j candidate tokens; row 0 and column 0 are all zeros. Tokens are
 * compared with strict equality.
 *
 * @param {string[]} referenceTokens Tokens indexing the rows.
 * @param {string[]} candidateTokens Tokens indexing the columns.
 * @returns {number[][]} Table of size (reference + 1) x (candidate + 1).
 */
function buildLcsMatrix(referenceTokens, candidateTokens) {
  const height = referenceTokens.length + 1;
  const width = candidateTokens.length + 1;

  const table = [];
  for (let r = 0; r < height; r += 1) {
    table.push(new Array(width).fill(0));
  }

  referenceTokens.forEach((referenceToken, r) => {
    candidateTokens.forEach((candidateToken, c) => {
      // table[r + 1][c + 1] describes the prefixes that end with these two tokens.
      table[r + 1][c + 1] =
        referenceToken === candidateToken
          ? table[r][c] + 1
          : Math.max(table[r][c + 1], table[r + 1][c]);
    });
  });

  return table;
}
/**
 * Mark each word of `candidate` as matching or not matching `reference`.
 *
 * Words are compared case-insensitively along a longest common subsequence;
 * candidate words on that subsequence are matches, all others are not.
 * Reference-only words produce no output.
 *
 * @param {string} reference Reference text.
 * @param {string} candidate Text to annotate.
 * @returns {{text: string, match: boolean}[]} Candidate words in original
 *   order and casing; empty when the candidate has no words.
 */
function diffWords(reference, candidate) {
  const referenceKeys = tokenizeText(reference).map((word) => word.toLowerCase());
  const candidateWords = tokenizeText(candidate);
  const candidateKeys = candidateWords.map((word) => word.toLowerCase());
  if (!candidateWords.length) {
    return [];
  }

  const lcs = buildLcsMatrix(referenceKeys, candidateKeys);

  // Walk back from the bottom-right corner, collecting words last-to-first.
  const collected = [];
  let r = referenceKeys.length;
  let c = candidateKeys.length;
  while (r > 0 && c > 0) {
    if (referenceKeys[r - 1] === candidateKeys[c - 1]) {
      collected.push({ text: candidateWords[c - 1], match: true });
      r -= 1;
      c -= 1;
    } else if (lcs[r - 1][c] >= lcs[r][c - 1]) {
      // Skip a reference-only word.
      r -= 1;
    } else {
      collected.push({ text: candidateWords[c - 1], match: false });
      c -= 1;
    }
  }
  // Candidate words left before the first aligned position are all mismatches.
  for (; c > 0; c -= 1) {
    collected.push({ text: candidateWords[c - 1], match: false });
  }

  return collected.reverse();
}
/**
 * Turn annotated words into an HTML string.
 *
 * @param {{text: string, match: boolean}[]} tokens Words as produced by diffWords.
 * @returns {string} Escaped words joined by spaces, with non-matching words
 *   wrapped in a "diff-token" span; a muted placeholder when there are no tokens.
 */
function renderDiffTokens(tokens) {
  if (!tokens.length) {
    return '<span class="muted-text">No transcript</span>';
  }
  const pieces = [];
  for (const token of tokens) {
    const safeText = escapeHtml(token.text);
    pieces.push(token.match ? safeText : `<span class="diff-token">${safeText}</span>`);
  }
  return pieces.join(" ");
}
/**
 * Word-level Levenshtein (edit) distance between two token lists, compared
 * case-insensitively.
 *
 * Only two rows of the dynamic-programming table are kept, so memory grows
 * with the candidate length instead of reference x candidate. This matters
 * because callers pass whole-transcript token lists. Tokens are lower-cased
 * once up front rather than on every cell comparison.
 *
 * @param {string[]} referenceTokens Reference words.
 * @param {string[]} candidateTokens Words to compare against the reference.
 * @returns {number} Minimum number of word insertions, deletions and
 *   substitutions that turn the reference into the candidate.
 */
function levenshteinDistance(referenceTokens, candidateTokens) {
  const reference = referenceTokens.map((token) => token.toLowerCase());
  const candidate = candidateTokens.map((token) => token.toLowerCase());
  const cols = candidate.length + 1;

  // Row 0: turning an empty reference into the first j candidate words costs j insertions.
  let previous = Array.from({ length: cols }, (_, j) => j);
  let current = new Array(cols).fill(0);

  for (let i = 1; i <= reference.length; i += 1) {
    // Column 0: turning the first i reference words into nothing costs i deletions.
    current[0] = i;
    for (let j = 1; j < cols; j += 1) {
      const cost = reference[i - 1] === candidate[j - 1] ? 0 : 1;
      current[j] = Math.min(
        previous[j] + 1,
        current[j - 1] + 1,
        previous[j - 1] + cost
      );
    }
    // Reuse the old row's storage for the next iteration.
    [previous, current] = [current, previous];
  }

  // After the final swap the last computed row lives in `previous`.
  return previous[candidate.length];
}
/**
 * Compute headline metrics for every non-truth track against the ground truth.
 *
 * Per track:
 *  - wer:       word edit distance divided by the truth word count, in percent
 *  - wordMatch: 100 minus wer, floored at 0
 *  - coverage:  segment count relative to the truth segment count, capped at 100
 *  - segments:  number of segments in the track
 *  - avgWords:  rounded words per segment (0 when the track has no segments)
 *
 * Both denominators fall back to 1 so an empty truth transcript cannot divide by zero.
 *
 * @returns {{track: Object, wer: number, wordMatch: number, coverage: number,
 *   segments: number, avgWords: number}[]} One entry per model, in `tracks` order.
 */
function computeModelInsights() {
  const truthSegments = allTranscripts.truth?.segments || [];
  const truthWords = tokenizeSegments(truthSegments);
  const wordDenominator = truthWords.length || 1;
  const segmentDenominator = truthSegments.length || 1;

  const insights = [];
  for (const track of tracks) {
    if (track.id === "truth") {
      continue;
    }
    const modelSegments = allTranscripts[track.id]?.segments || [];
    const modelWords = tokenizeSegments(modelSegments);
    const errorRate = levenshteinDistance(truthWords, modelWords) / wordDenominator;
    const matchRate = Math.max(0, 1 - errorRate);
    const coveredShare = Math.min(1, modelSegments.length / segmentDenominator);

    let wordsPerSegment = 0;
    if (modelSegments.length > 0) {
      wordsPerSegment = Math.round(modelWords.length / modelSegments.length);
    }

    insights.push({
      track,
      wer: errorRate * 100,
      wordMatch: matchRate * 100,
      coverage: coveredShare * 100,
      segments: modelSegments.length,
      avgWords: wordsPerSegment
    });
  }
  return insights;
}
/**
 * Render one metric card per model into the insights grid.
 *
 * Clears the grid's aria-busy flag, then either shows a notice when no
 * ground-truth segments are loaded or replaces the grid content with the cards.
 * Does nothing when the grid element is missing.
 */
function renderModelInsights() {
  if (!insightsEl) return;
  insightsEl.removeAttribute("aria-busy");

  const truthSegmentCount = allTranscripts.truth?.segments?.length;
  if (!truthSegmentCount) {
    insightsEl.innerHTML = '<p class="match-count">Ground truth transcript was not found.</p>';
    return;
  }

  // Card markup is assembled line by line; the empty first and last entries
  // produce the leading and trailing newline of each card.
  const toCard = (stat) =>
    [
      "",
      `<article class="insight-card" style="--accent: ${stat.track.accent}">`,
      `<h3>${stat.track.label}</h3>`,
      "<div>",
      `<div class="metric-value">${stat.wordMatch.toFixed(1)}%</div>`,
      '<div class="metric-label">Word Match</div>',
      "</div>",
      '<div class="insight-meta">',
      `<span>WER ${stat.wer.toFixed(1)}%</span>`,
      `<span>${stat.segments} segments</span>`,
      "</div>",
      '<div class="insight-meta">',
      `<span>Coverage ${Math.round(stat.coverage)}%</span>`,
      `<span>${stat.avgWords || 0} words/segment</span>`,
      "</div>",
      "</article>",
      ""
    ].join("\n");

  let cards = "";
  for (const stat of computeModelInsights()) {
    cards += toCard(stat);
  }
  insightsEl.innerHTML = cards;
}
/**
 * Fill the legend with one colour-coded entry per track.
 *
 * Kept as a function declaration because it is invoked earlier in the module
 * than it is defined. Does nothing when the legend element is missing.
 */
function renderLegend() {
  if (!legendEl) return;
  let markup = "";
  for (const { accent, label } of tracks) {
    markup += `<span style="--accent: ${accent}"><i></i>${label}</span>`;
  }
  legendEl.innerHTML = markup;
}
/**
 * Rebuild the comparison table from the loaded transcripts.
 *
 * One header row is followed by one row per ground-truth segment. Every row
 * has a time cell plus one cell per track showing the segment of that track
 * that is active at the truth segment's start time. Non-truth cells highlight
 * words that differ from the truth text. Each row stores its start/end time
 * and lower-cased searchable text in data attributes, and the current search
 * filter is re-applied at the end.
 */
function renderComparisonTable() {
  const makeDiv = (className) => {
    const element = document.createElement("div");
    element.className = className;
    return element;
  };

  // Start from a clean slate: drop old DOM and the old row registry.
  comparisonTableEl.innerHTML = "";
  segmentRows.length = 0;

  const headerEl = makeDiv("comparison-header");
  const modelColumns = tracks
    .map(({ accent, label }) => `<div class="model-column" style="--accent: ${accent}">${label}</div>`)
    .join("");
  headerEl.innerHTML = ["", '<div class="time-column">Time</div>', modelColumns, ""].join("\n");
  comparisonTableEl.appendChild(headerEl);

  const truthSegments = allTranscripts.truth?.segments || [];
  for (const truthSegment of truthSegments) {
    const rowEl = makeDiv("comparison-row");
    rowEl.dataset.start = truthSegment.start;
    rowEl.dataset.end = truthSegment.end;

    const timeEl = makeDiv("time-cell");
    timeEl.textContent = formatTime(truthSegment.start);
    rowEl.appendChild(timeEl);

    // Text used by the search filter: truth text first, then every column's text.
    const searchableParts = [truthSegment.content || ""];

    for (const track of tracks) {
      const cellEl = makeDiv("transcript-cell");
      cellEl.style.setProperty("--accent", track.accent);

      const transcript = allTranscripts[track.id];
      let cellText = "";
      if (transcript?.error) {
        cellEl.innerHTML = '<em class="error-text">Error loading</em>';
      } else {
        const segment = findSegmentForTime(transcript?.segments || [], truthSegment.start);
        cellText = segment?.content || "";
        cellEl.innerHTML =
          track.id === "truth"
            ? `<p class="transcript-text">${escapeHtml(truthSegment.content)}</p>`
            : renderDiffTokens(diffWords(truthSegment.content || "", cellText));
      }

      searchableParts.push(cellText);
      rowEl.appendChild(cellEl);
    }

    rowEl.dataset.rowText = searchableParts.join(" ").toLowerCase();
    comparisonTableEl.appendChild(rowEl);
    segmentRows.push(rowEl);
  }

  filterRows(searchInput?.value || "");
}
/**
 * Update the status line under the search box.
 *
 * @param {number} matches Number of rows currently visible.
 * @param {string} query Search text as typed; an empty string means no filter is active.
 */
function updateMatchCount(matches, query) {
  if (!matchCountEl) return;

  const total = segmentRows.length;
  let message;
  if (!total) {
    message = "No segments loaded.";
  } else if (query) {
    message = `Showing ${matches}/${total} segments for "${query}"`;
  } else {
    message = `Showing all ${total} segments`;
  }
  matchCountEl.textContent = message;
}
/**
 * Show only the rows whose searchable text contains the query, then refresh
 * the status line.
 *
 * Matching is a case-insensitive substring test on the trimmed query; an
 * empty query shows every row.
 *
 * @param {string} rawQuery Search text exactly as typed.
 */
function filterRows(rawQuery) {
  const needle = rawQuery.trim().toLowerCase();
  let visibleCount = 0;

  for (const row of segmentRows) {
    const haystack = row.dataset.rowText || "";
    const isVisible = needle === "" || haystack.includes(needle);
    row.classList.toggle("is-hidden", !isVisible);
    if (isVisible) {
      visibleCount += 1;
    }
  }

  // Echo the query as typed, but report "no filter" for a blank one.
  updateMatchCount(visibleCount, needle ? rawQuery : "");
}
/**
 * Highlight the row whose [start, end) interval contains the playback time.
 *
 * A row is scrolled into view only at the moment it becomes active, and only
 * if the search filter has not hidden it. Scrolling on every call would pull
 * the page back to the active row on each playback tick and override the
 * reader's own scrolling.
 *
 * @param {number} time Current playback position in seconds.
 */
function updateActiveSegments(time) {
  segmentRows.forEach((row) => {
    const start = Number(row.dataset.start);
    const end = Number(row.dataset.end);
    const isActive = time >= start && time < end;
    const wasActive = row.classList.contains("is-active");
    row.classList.toggle("is-active", isActive);
    if (isActive && !wasActive && !row.classList.contains("is-hidden")) {
      row.scrollIntoView({ behavior: "smooth", block: "center" });
    }
  });
}
// Promise for the decoded first-channel samples. Cached so that redraws reuse
// the data instead of downloading and decoding the audio file again.
let waveformSamplesPromise = null;

/**
 * Download the audio element's source and decode its first channel.
 *
 * @returns {Promise<Float32Array|null>} Channel-0 samples, or null when the
 *   HTTP response is not OK.
 */
async function loadWaveformSamples() {
  const response = await fetch(audioElem.currentSrc || audioElem.src);
  if (!response.ok) return null;
  const arrayBuffer = await response.arrayBuffer();
  const audioContext = new AudioContext();
  try {
    const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
    return audioBuffer.getChannelData(0);
  } finally {
    // The context is only needed for decoding; release it so repeated loads
    // cannot pile up open audio contexts.
    audioContext.close().catch(() => {});
  }
}

/**
 * Draw a simple amplitude preview of the audio onto the waveform canvas.
 *
 * The canvas backing store is sized to its CSS size times the device pixel
 * ratio, then one vertical line per CSS pixel is drawn whose half-height is
 * the mean absolute sample value of the corresponding slice of audio.
 *
 * Failures (network, decoding, missing Web Audio support) are logged and leave
 * the canvas untouched; the returned promise does not reject because callers
 * invoke this fire-and-forget. A failed load is not cached, so a later call
 * tries again.
 *
 * @returns {Promise<void>}
 */
async function drawWaveform() {
  if (!audioElem || !waveformCanvas) return;

  let rawData;
  try {
    if (!waveformSamplesPromise) {
      waveformSamplesPromise = loadWaveformSamples();
    }
    rawData = await waveformSamplesPromise;
  } catch (error) {
    waveformSamplesPromise = null;
    console.error("Unable to draw waveform:", error);
    return;
  }
  if (!rawData) {
    waveformSamplesPromise = null;
    return;
  }

  const canvas = waveformCanvas;
  const dpr = window.devicePixelRatio || 1;
  // Assigning width/height also resets the 2D context, so the scale below never accumulates.
  canvas.width = canvas.clientWidth * dpr;
  canvas.height = canvas.clientHeight * dpr;
  const ctx = canvas.getContext("2d");
  ctx.scale(dpr, dpr);
  ctx.clearRect(0, 0, canvas.clientWidth, canvas.clientHeight);

  // Number of samples summarised by each one-pixel-wide column.
  const sliceWidth = Math.max(1, Math.floor(rawData.length / Math.max(1, canvas.clientWidth)));
  const halfHeight = canvas.clientHeight / 2;
  ctx.lineWidth = 1.25;
  ctx.strokeStyle = "#1f2937";
  ctx.beginPath();
  for (let i = 0; i < canvas.clientWidth; i += 1) {
    const sliceStart = i * sliceWidth;
    let sum = 0;
    for (let j = 0; j < sliceWidth; j += 1) {
      // Reads past the end of the data count as silence.
      sum += Math.abs(rawData[sliceStart + j] || 0);
    }
    const amplitude = sum / sliceWidth;
    const y = halfHeight - amplitude * halfHeight;
    const yBottom = halfHeight + amplitude * halfHeight;
    ctx.moveTo(i, y);
    ctx.lineTo(i, yBottom);
  }
  ctx.stroke();
}
/**
 * Load every transcript, render the table and the metric cards, then start
 * drawing the waveform in the background.
 *
 * @returns {Promise<void>} Resolves once the table and cards are rendered; the
 *   waveform is not awaited.
 */
async function bootstrap() {
  allTranscripts = await loadAllTranscripts();
  renderComparisonTable();
  renderModelInsights();
  // The waveform is decorative: run it fire-and-forget, but never leave a rejection unhandled.
  drawWaveform().catch((error) => console.error("Waveform rendering failed:", error));
}

// Keep the highlighted row in step with playback.
audioElem.addEventListener("timeupdate", () => updateActiveSegments(audioElem.currentTime));

// Resize events arrive in rapid bursts while a window is being dragged;
// redraw once after the burst settles instead of once per event.
let waveformResizeTimer = 0;
window.addEventListener("resize", () => {
  window.clearTimeout(waveformResizeTimer);
  waveformResizeTimer = window.setTimeout(() => {
    drawWaveform().catch((error) => console.error("Waveform rendering failed:", error));
  }, 150);
});

bootstrap().catch((error) => console.error("Failed to initialise the comparison view:", error));
</script>
</body>
</html>