/**
 * LFM2.5-Audio WebGPU Demo
 *
 * Demonstrates ASR, TTS, and interleaved audio generation using ONNX Runtime Web.
 */

import { AudioModel, loadAudioFile, clearModelCache, getCacheInfo } from './audio-model.js';

// HuggingFace model URL
const MODEL_URL = 'https://huggingface.co/LiquidAI/LFM2.5-Audio-1.5B-ONNX/resolve/main';

// Model configurations
const MODELS = {
  'LFM2.5-Audio-1.5B-Q4': {
    path: MODEL_URL,
    label: 'LFM2.5-Audio-1.5B Q4 (~1.6 GB)',
    quantization: {
      decoder: 'q4',
      audioEncoder: 'q4',
      audioEmbedding: 'q4',
      audioDetokenizer: 'q4',
      vocoder: 'q4',
    },
  },
};

// DOM elements
const modelSelect = document.getElementById('modelSelect');
const loadBtn = document.getElementById('loadBtn');
const clearBtn = document.getElementById('clearBtn');
const statusEl = document.getElementById('status');
const chatContainer = document.getElementById('chatContainer');
const userInput = document.getElementById('userInput');
const sendBtn = document.getElementById('sendBtn');
const progressBar = document.getElementById('progressBar');
const progressFill = document.getElementById('progressFill');
const progressText = document.getElementById('progressText');
const audioModeSelect = document.getElementById('audioModeSelect');
const recordBtn = document.getElementById('recordBtn');
const audioBtn = document.getElementById('audioBtn');
const audioInput = document.getElementById('audioInput');
const audioPreview = document.getElementById('audioPreview');
const clearCacheBtn = document.getElementById('clearCacheBtn');
const cacheInfoEl = document.getElementById('cacheInfo');
const dropOverlay = document.getElementById('dropOverlay');
const spinner = document.getElementById('spinner');
const spinnerText = document.getElementById('spinnerText');
const spinnerStats = document.getElementById('spinnerStats');

// State
let audioModel = null;
let messages = [];
let isGenerating = false;
let pendingAudio = null;
let audioMode = 'interleaved';
let isRecording = false;
let mediaRecorder = null;
let audioChunks = [];

// ============================================================================
// Audio Helpers
// ============================================================================

function createWavBlob(samples, sampleRate) {
  // Debug: check waveform statistics
  let min = Infinity, max = -Infinity, sum = 0, nonZero = 0;
  for (let i = 0; i < samples.length; i++) {
    const v = samples[i];
    if (v < min) min = v;
    if (v > max) max = v;
    sum += Math.abs(v);
    if (Math.abs(v) > 0.001) nonZero++;
  }
  console.log('WAV input stats:', {
    length: samples.length,
    min: min.toFixed(6),
    max: max.toFixed(6),
    avgAbs: (sum / samples.length).toFixed(6),
    nonZeroSamples: nonZero,
    percentNonZero: ((nonZero / samples.length) * 100).toFixed(1) + '%',
  });

  const numChannels = 1;
  const bitsPerSample = 16;
  const bytesPerSample = bitsPerSample / 8;
  const blockAlign = numChannels * bytesPerSample;
  const byteRate = sampleRate * blockAlign;
  const dataSize = samples.length * bytesPerSample;
  const bufferSize = 44 + dataSize;

  const buffer = new ArrayBuffer(bufferSize);
  const view = new DataView(buffer);

  const writeString = (offset, string) => {
    for (let i = 0; i < string.length; i++) {
      view.setUint8(offset + i, string.charCodeAt(i));
    }
  };

  writeString(0, 'RIFF');
  view.setUint32(4, bufferSize - 8, true);
  writeString(8, 'WAVE');
  writeString(12, 'fmt ');
  view.setUint32(16, 16, true);
  view.setUint16(20, 1, true);
  view.setUint16(22, numChannels, true);
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, byteRate, true);
  view.setUint16(32, blockAlign, true);
  view.setUint16(34, bitsPerSample, true);
  writeString(36, 'data');
  view.setUint32(40, dataSize, true);

  let offset = 44;
  for (let i = 0; i < samples.length; i++) {
    const sample = Math.max(-1, Math.min(1, samples[i]));
    const int16 = sample < 0 ? sample * 0x8000 : sample * 0x7FFF;
    view.setInt16(offset, int16, true);
    offset += 2;
  }

  return new Blob([buffer], { type: 'audio/wav' });
}

// Test function to verify WAV creation works
function createTestToneBlob(durationSec = 1, frequency = 440, sampleRate = 24000) {
  const numSamples = Math.floor(durationSec * sampleRate);
  const samples = new Float32Array(numSamples);
  for (let i = 0; i < numSamples; i++) {
    samples[i] = 0.5 * Math.sin(2 * Math.PI * frequency * i / sampleRate);
  }
  return createWavBlob(samples, sampleRate);
}
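// Quick self-check for the WAV writer (a sketch; run manually from the
// browser console — this is illustrative only and not part of the demo flow):
//
//   const toneUrl = URL.createObjectURL(createTestToneBlob(1, 440, 24000));
//   new Audio(toneUrl).play(); // should produce a clean 1 s, 440 Hz beep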
// ============================================================================
// UI Helpers
// ============================================================================

function setStatus(text, type = '') {
  statusEl.textContent = text;
  statusEl.className = type;
}

function setLoading(loading) {
  loadBtn.disabled = loading;
  modelSelect.disabled = loading;
}

function setReady(ready) {
  userInput.disabled = !ready;
  sendBtn.disabled = !ready;
  audioBtn.disabled = !ready;
  recordBtn.disabled = !ready;
}

function showProgress(show) {
  progressBar.style.display = show ? 'block' : 'none';
}

function updateProgress(percent, text) {
  progressFill.style.width = `${percent}%`;
  progressText.textContent = text || `${percent}%`;
}

function showSpinner(text, stats = '') {
  spinner.classList.add('active');
  spinnerText.textContent = text;
  spinnerStats.textContent = stats;
}

function updateSpinner(text, stats = '') {
  if (text) spinnerText.textContent = text;
  spinnerStats.textContent = stats;
}

function hideSpinner() {
  spinner.classList.remove('active');
  spinnerText.textContent = '';
  spinnerStats.textContent = '';
}

function addMessage(role, content, isStreaming = false, audio = null) {
  const msgEl = document.createElement('div');
  msgEl.className = `message ${role}${isStreaming ? ' generating' : ''}`;
  if (audio) {
    const audioEl = document.createElement('div');
    audioEl.className = 'audio-preview-item';
    audioEl.style.marginBottom = '0.5rem';
    audioEl.innerHTML = `🎤 ${audio.fileName} ${(audio.audioData.length / audio.sampleRate).toFixed(1)}s`;
    msgEl.appendChild(audioEl);
  }
  const textEl = document.createElement('span');
  textEl.textContent = content;
  msgEl.appendChild(textEl);
  chatContainer.appendChild(msgEl);
  chatContainer.scrollTop = chatContainer.scrollHeight;
  return { msgEl, textEl };
}

function updatePlaceholder() {
  const mode = audioModeSelect?.value || 'interleaved';
  if (mode === 'asr') {
    userInput.placeholder = 'Record or upload audio to transcribe...';
  } else if (mode === 'tts') {
    userInput.placeholder = 'Type text to convert to speech...';
  } else {
    userInput.placeholder = 'Type a message or record audio...';
  }
}

function clearPendingAudio() {
  pendingAudio = null;
  if (audioPreview) {
    audioPreview.innerHTML = '';
  }
}

async function updateCacheInfo() {
  if (!cacheInfoEl || !clearCacheBtn) return;
  const info = await getCacheInfo();
  if (info && info.used > 0) {
    const usedMB = info.used / 1024 / 1024;
    if (usedMB >= 1000) {
      cacheInfoEl.textContent = `${(usedMB / 1024).toFixed(1)} GB cached`;
    } else if (usedMB >= 1) {
      cacheInfoEl.textContent = `${usedMB.toFixed(0)} MB cached`;
    } else {
      cacheInfoEl.textContent = 'No models cached';
    }
    clearCacheBtn.disabled = usedMB < 1;
  } else {
    cacheInfoEl.textContent = 'No models cached';
    clearCacheBtn.disabled = true;
  }
}
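// Sizing example for updateCacheInfo(), for reference: a fully cached
// ~1.6 GB model is ~1638 MB, which is >= 1000 and so reports as
// "1.6 GB cached"; a 300 MB cache falls in the [1, 1000) MB branch and
// reads "300 MB cached"; anything under 1 MB is treated as empty.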
// ============================================================================
// Microphone Recording
// ============================================================================

async function startRecording() {
  try {
    if (!window.isSecureContext) {
      throw new Error('Microphone requires HTTPS. Use localhost or enable HTTPS.');
    }
    if (!navigator.mediaDevices?.getUserMedia) {
      throw new Error('MediaDevices API not available in this browser.');
    }
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    audioChunks = [];
    mediaRecorder = new MediaRecorder(stream);
    mediaRecorder.ondataavailable = (e) => {
      if (e.data.size > 0) {
        audioChunks.push(e.data);
      }
    };
    mediaRecorder.onstop = async () => {
      stream.getTracks().forEach(track => track.stop());
      const audioBlob = new Blob(audioChunks, { type: 'audio/webm' });
      await processRecordedAudio(audioBlob);
    };
    mediaRecorder.start();
    isRecording = true;
    recordBtn.classList.add('recording');
    recordBtn.textContent = '⏹️';
    recordBtn.title = 'Stop recording';
    setStatus('Recording... Click to stop', 'success');
  } catch (error) {
    console.error('Failed to start recording:', error);
    setStatus(`Microphone error: ${error.message}`, 'error');
  }
}

function stopRecording() {
  if (mediaRecorder && mediaRecorder.state === 'recording') {
    mediaRecorder.stop();
    isRecording = false;
    recordBtn.classList.remove('recording');
    recordBtn.textContent = '🎤';
    recordBtn.title = 'Record from microphone';
  }
}

async function processRecordedAudio(audioBlob) {
  try {
    setStatus('Processing recording...');
    const arrayBuffer = await audioBlob.arrayBuffer();
    const audioContext = new (window.AudioContext || window.webkitAudioContext)();
    const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
    // Downmix to mono if needed
    let audioData;
    if (audioBuffer.numberOfChannels === 1) {
      audioData = new Float32Array(audioBuffer.getChannelData(0));
    } else {
      const ch0 = audioBuffer.getChannelData(0);
      const ch1 = audioBuffer.getChannelData(1);
      audioData = new Float32Array(ch0.length);
      for (let i = 0; i < ch0.length; i++) {
        audioData[i] = (ch0[i] + ch1[i]) / 2;
      }
    }
    const sampleRate = audioBuffer.sampleRate;
    const duration = (audioData.length / sampleRate).toFixed(1);
    pendingAudio = { audioData, sampleRate, fileName: `Recording (${duration}s)` };
    audioContext.close();
    if (audioPreview) {
      audioPreview.innerHTML = `
        <div class="audio-preview-item">
          🎤 Recording ${duration}s @ ${sampleRate}Hz
          <button onclick="clearPendingAudio()" title="Remove">✕</button>
        </div>
      `;
    }
    setStatus('Recording ready. Click Send to process.', 'success');
  } catch (error) {
    console.error('Failed to process recording:', error);
    setStatus(`Error processing recording: ${error.message}`, 'error');
  }
}
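// Note: recordings keep the AudioContext's native rate (often 44.1/48 kHz);
// audio-model.js is assumed to resample internally before feeding the
// encoder. If it did not, a browser-side resampling sketch (hypothetical
// helper, not used here) would look like:
//
//   async function resampleTo(audioBuffer, targetRate) {
//     const frames = Math.ceil(audioBuffer.duration * targetRate);
//     const ctx = new OfflineAudioContext(1, frames, targetRate);
//     const src = ctx.createBufferSource();
//     src.buffer = audioBuffer;
//     src.connect(ctx.destination);
//     src.start();
//     return (await ctx.startRendering()).getChannelData(0);
//   }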
// ============================================================================
// Model Loading
// ============================================================================

async function loadModel() {
  const modelKey = modelSelect.value;
  const modelConfig = MODELS[modelKey];
  if (!modelConfig) {
    setStatus('Invalid model selection', 'error');
    return;
  }

  setLoading(true);
  setReady(false);
  showProgress(true);
  updateProgress(0, 'Starting...');
  setStatus(`Loading ${modelConfig.label}...`);

  if (audioModel) {
    console.log('Disposing previous model...');
    audioModel.dispose();
    audioModel = null;
  }
  messages = [];
  pendingAudio = null;
  chatContainer.innerHTML = '';

  try {
    const useWebGPU = !!navigator.gpu;
    if (!useWebGPU) {
      console.warn('WebGPU not available, falling back to WASM (CPU)');
    }
    const device = useWebGPU ? 'webgpu' : 'wasm';
    setStatus(`Loading audio model (${device})...`);

    audioModel = new AudioModel();
    await audioModel.load(modelConfig.path, {
      device,
      quantization: modelConfig.quantization || null,
      progressCallback: (progress) => {
        if (progress.status === 'loading') {
          updateProgress(progress.progress, `Loading ${progress.file}...`);
        } else if (progress.status === 'done') {
          updateProgress(100, 'Done');
        }
      },
    });

    showProgress(false);
    setStatus(`Ready! Audio model loaded on ${device === 'webgpu' ? 'WebGPU' : 'CPU'}`, 'success');
    setReady(true);
    updateCacheInfo();
    updatePlaceholder();
  } catch (error) {
    console.error('Load error:', error);
    showProgress(false);
    const msg = error instanceof Error ? error.message : String(error);
    setStatus(`Error: ${msg}`, 'error');
    audioModel = null;
  } finally {
    setLoading(false);
  }
}

// ============================================================================
// Generation
// ============================================================================
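// Mode dispatch below: 'asr' transcribes the pending audio clip, 'tts'
// synthesizes speech from the typed text, 'interleaved' streams mixed
// text + audio (with or without an input clip), and a plain text-only
// generation covers everything else.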
async function generate(userMessage) {
  if (!audioModel || isGenerating) return;
  isGenerating = true;
  setReady(false);

  const audioToSend = pendingAudio;
  if (audioToSend && audioPreview) {
    audioPreview.innerHTML = '';
  }

  messages.push({ role: 'user', content: userMessage });
  addMessage('user', userMessage, false, audioToSend);
  const { msgEl, textEl } = addMessage('assistant', '', true);

  let generatedText = '';
  const startTime = performance.now();
  let tokenCount = 0;
  let audioFrameCount = 0;

  try {
    const currentMode = audioModeSelect?.value || 'interleaved';

    const onTokenCallback = (token, tokenId) => {
      if (token.includes('<|im_end|>') || token.includes('<|endoftext|>')) {
        return true;
      }
      generatedText += token;
      tokenCount++;
      textEl.textContent = generatedText;
      chatContainer.scrollTop = chatContainer.scrollHeight;
      const elapsed = ((performance.now() - startTime) / 1000).toFixed(1);
      updateSpinner(null, `${tokenCount} tokens · ${elapsed}s`);
      return false;
    };

    if (currentMode === 'asr' && audioToSend) {
      showSpinner('Transcribing audio...');
      generatedText = await audioModel.transcribe(
        audioToSend.audioData,
        audioToSend.sampleRate,
        { onToken: onTokenCallback }
      );
      pendingAudio = null;
    } else if (currentMode === 'tts') {
      showSpinner('Generating speech...');
      const result = await audioModel.generateSpeech(userMessage, {
        onToken: onTokenCallback,
        onAudioFrame: (frame, count) => {
          audioFrameCount = count;
          const elapsed = ((performance.now() - startTime) / 1000).toFixed(1);
          updateSpinner('Generating audio...', `${count} frames · ${elapsed}s`);
        },
      });

      if (result.audioCodes && result.audioCodes.length > 0) {
        updateSpinner('Decoding audio...', `${result.audioCodes.length} frames`);
        const waveform = await audioModel.decodeAudioCodes(result.audioCodes);
        console.log('TTS waveform decoded:', waveform.length, 'samples');
        if (waveform.length > 0) {
          generatedText = result.textOutput ||
            `Generated ${result.audioCodes.length} audio frames (${(waveform.length / 24000).toFixed(2)}s)`;

          // Create audio player inline with the message
          const wavBlob = createWavBlob(waveform, 24000);
          console.log('TTS WAV blob created:', wavBlob.size, 'bytes, duration:', (waveform.length / 24000).toFixed(2), 's');
          const audioUrl = URL.createObjectURL(wavBlob);

          // Add audio element to the existing message
          const audioContainer = document.createElement('div');
          audioContainer.style.marginTop = '0.75rem';
          audioContainer.innerHTML = `
            <audio controls src="${audioUrl}"></audio>
            <a href="${audioUrl}" download>Download WAV (${(waveform.length / 24000).toFixed(1)}s)</a>
          `;
          msgEl.appendChild(audioContainer);
          chatContainer.scrollTop = chatContainer.scrollHeight;
        } else {
          generatedText = '[Audio decoding failed - no waveform generated]';
          console.warn('TTS waveform decoding returned empty result');
        }
      } else {
        generatedText = result.textOutput || '[No audio generated]';
      }
    } else if (currentMode === 'interleaved' && audioToSend) {
      showSpinner('Processing audio...');
      const result = await audioModel.generateInterleaved(
        audioToSend.audioData,
        audioToSend.sampleRate,
        userMessage,
        {
          onToken: (text, tokenId) => {
            generatedText = text;
            tokenCount = text.length;
            textEl.textContent = text;
            chatContainer.scrollTop = chatContainer.scrollHeight;
            const elapsed = ((performance.now() - startTime) / 1000).toFixed(1);
            updateSpinner('Generating text...', `${tokenCount} chars · ${elapsed}s`);
          },
          onAudioFrame: (frame, count) => {
            audioFrameCount = count;
            const elapsed = ((performance.now() - startTime) / 1000).toFixed(1);
            updateSpinner('Generating audio...', `${count} frames · ${elapsed}s`);
          },
        }
      );
      pendingAudio = null;
      generatedText = result.text || '';
      textEl.textContent = generatedText;

      if (result.audioCodes && result.audioCodes.length > 0) {
        updateSpinner('Decoding audio...', `${result.audioCodes.length} frames`);
        const waveform = await audioModel.decodeAudioCodes(result.audioCodes);
        console.log('Waveform decoded:', waveform.length, 'samples');
        if (waveform.length > 0) {
          if (!generatedText) {
            generatedText = `Generated ${result.audioCodes.length} audio frames`;
          }
          // Create audio player inline with the message
          const wavBlob = createWavBlob(waveform, 24000);
          console.log('WAV blob created:', wavBlob.size, 'bytes, duration:', (waveform.length / 24000).toFixed(2), 's');
          const audioUrl = URL.createObjectURL(wavBlob);

          // Add audio element to the existing message
          const audioContainer = document.createElement('div');
          audioContainer.style.marginTop = '0.75rem';
          audioContainer.innerHTML = `
            <audio controls src="${audioUrl}"></audio>
            <a href="${audioUrl}" download>Download WAV (${(waveform.length / 24000).toFixed(1)}s)</a>
          `;
          msgEl.appendChild(audioContainer);
          chatContainer.scrollTop = chatContainer.scrollHeight;
        } else {
          console.warn('Waveform decoding returned empty result');
        }
      }
    } else if (currentMode === 'interleaved' && userMessage) {
      // Text-only follow-up in interleaved mode (still produces audio)
      showSpinner('Generating response...');
      const result = await audioModel.generateInterleavedFromText(userMessage, {
        onToken: (text, tokenId) => {
          generatedText = text;
          tokenCount = text.length;
          textEl.textContent = text;
          chatContainer.scrollTop = chatContainer.scrollHeight;
          const elapsed = ((performance.now() - startTime) / 1000).toFixed(1);
          updateSpinner('Generating text...', `${tokenCount} chars · ${elapsed}s`);
        },
        onAudioFrame: (frame, count) => {
          audioFrameCount = count;
          const elapsed = ((performance.now() - startTime) / 1000).toFixed(1);
          updateSpinner('Generating audio...', `${count} frames · ${elapsed}s`);
        },
      });
      generatedText = result.text || '';
      textEl.textContent = generatedText;

      // Decode and display audio if generated
      if (result.audioCodes && result.audioCodes.length > 0) {
        updateSpinner('Decoding audio...', `${result.audioCodes.length} frames`);
        const waveform = await audioModel.decodeAudioCodes(result.audioCodes);
        console.log('Waveform decoded:', waveform.length, 'samples');
        if (waveform.length > 0) {
          if (!generatedText) {
            generatedText = `Generated ${result.audioCodes.length} audio frames`;
          }
          const wavBlob = createWavBlob(waveform, 24000);
          console.log('WAV blob created:', wavBlob.size, 'bytes, duration:', (waveform.length / 24000).toFixed(2), 's');
          const audioUrl = URL.createObjectURL(wavBlob);
          const audioContainer = document.createElement('div');
          audioContainer.className = 'audio-output';
          audioContainer.innerHTML = `
            <audio controls src="${audioUrl}"></audio>
            <a href="${audioUrl}" download>Download WAV (${(waveform.length / 24000).toFixed(1)}s)</a>
          `;
          msgEl.appendChild(audioContainer);
          chatContainer.scrollTop = chatContainer.scrollHeight;
        }
      }
    } else if (userMessage) {
      // Fallback text-only generation
      showSpinner('Generating response...');
      const result = await audioModel.generateTextOnly(userMessage, {
        maxNewTokens: 256,
        onToken: (text, tokenId) => {
          generatedText = text;
          tokenCount = text.length;
          textEl.textContent = text;
          chatContainer.scrollTop = chatContainer.scrollHeight;
        },
      });
      generatedText = result.text || '';
    }

    generatedText = generatedText.replace(/<\|im_end\|>$/g, '').trim();

    const elapsed = (performance.now() - startTime) / 1000;
    const tokensPerSec = tokenCount / elapsed;
    msgEl.classList.remove('generating');
    textEl.textContent = generatedText;

    const statsEl = document.createElement('div');
    statsEl.className = 'stats';
    statsEl.textContent = `${tokenCount} tokens in ${elapsed.toFixed(1)}s (${tokensPerSec.toFixed(1)} tok/s)`;
    msgEl.appendChild(statsEl);

    messages.push({ role: 'assistant', content: generatedText });
    setStatus('Ready', 'success');
  } catch (error) {
    console.error('Generation error:', error);
    textEl.textContent = `Error: ${error.message}`;
    msgEl.classList.remove('generating');
    messages.pop();
    setStatus(`Error: ${error.message}`, 'error');
  } finally {
    hideSpinner();
    isGenerating = false;
    setReady(true);
    userInput.focus();
  }
}
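// The three audio-attachment blocks inside generate() share the same shape.
// A shared helper would look roughly like this (hypothetical sketch, not
// wired into the code above):
//
//   function attachAudioPlayer(msgEl, waveform, sampleRate = 24000) {
//     const url = URL.createObjectURL(createWavBlob(waveform, sampleRate));
//     const container = document.createElement('div');
//     container.className = 'audio-output';
//     container.innerHTML = `
//       <audio controls src="${url}"></audio>
//       <a href="${url}" download>Download WAV (${(waveform.length / sampleRate).toFixed(1)}s)</a>
//     `;
//     msgEl.appendChild(container);
//   }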
// ============================================================================
// Event Handlers
// ============================================================================

loadBtn.addEventListener('click', loadModel);

audioModeSelect.addEventListener('change', () => {
  audioMode = audioModeSelect.value;
  updatePlaceholder();
  console.log(`Audio mode changed to: ${audioMode}`);
});

recordBtn.addEventListener('click', () => {
  if (isRecording) {
    stopRecording();
  } else {
    startRecording();
  }
});

clearBtn.addEventListener('click', () => {
  messages = [];
  chatContainer.innerHTML = '';
  clearPendingAudio();
  // Reset model conversation state (KV cache)
  if (audioModel) {
    audioModel.reset();
    setStatus('Conversation reset', 'success');
  }
});

clearCacheBtn.addEventListener('click', async () => {
  if (clearCacheBtn.disabled) return;
  const info = await getCacheInfo();
  const usedMB = info ? (info.used / 1024 / 1024).toFixed(0) : 0;
  const confirmed = confirm(
    `Delete downloaded model files?\n\n` +
    `This will free up ~${usedMB} MB of storage.\n` +
    `Models will be re-downloaded next time you load them.`
  );
  if (!confirmed) return;
  clearCacheBtn.textContent = 'Deleting...';
  await clearModelCache();
  clearCacheBtn.textContent = 'Delete Models';
  await updateCacheInfo();
  setStatus('Downloaded models deleted', 'success');
});
sendBtn.addEventListener('click', () => {
  const text = userInput.value.trim();
  const mode = audioModeSelect?.value || 'interleaved';
  if (mode === 'tts' && !text) {
    setStatus('Please enter text to convert to speech', 'error');
    return;
  }
  if (mode === 'asr' && !pendingAudio && !text) {
    setStatus('Record or upload audio to transcribe', 'error');
    return;
  }
  if (text || pendingAudio) {
    userInput.value = '';
    generate(text);
  }
});

userInput.addEventListener('keydown', (e) => {
  if (e.key === 'Enter' && !e.shiftKey) {
    e.preventDefault();
    sendBtn.click();
  }
});

audioBtn.addEventListener('click', () => {
  audioInput.click();
});

audioInput.addEventListener('change', async (e) => {
  const file = e.target.files[0];
  if (file) {
    try {
      setStatus('Loading audio file...');
      const { audioData, sampleRate } = await loadAudioFile(file);
      pendingAudio = { audioData, sampleRate, fileName: file.name };
      if (audioPreview) {
        audioPreview.innerHTML = `
          <div class="audio-preview-item">
            🎤 ${file.name} ${(audioData.length / sampleRate).toFixed(1)}s @ ${sampleRate}Hz
            <button onclick="clearPendingAudio()" title="Remove">✕</button>
          </div>
        `;
      }
      setStatus('Audio loaded. Click Send to process.', 'success');
    } catch (error) {
      console.error('Error loading audio:', error);
      setStatus(`Error loading audio: ${error.message}`, 'error');
    }
  }
  audioInput.value = '';
});

// Exposed globally so the inline onclick in the injected preview HTML can reach it
window.clearPendingAudio = clearPendingAudio;
// Drag and drop
document.addEventListener('dragenter', (e) => {
  if (isGenerating) return;
  e.preventDefault();
  dropOverlay.classList.add('active');
});

dropOverlay.addEventListener('dragleave', (e) => {
  e.preventDefault();
  dropOverlay.classList.remove('active');
});

dropOverlay.addEventListener('dragover', (e) => {
  e.preventDefault();
});

dropOverlay.addEventListener('drop', async (e) => {
  e.preventDefault();
  dropOverlay.classList.remove('active');
  if (isGenerating) return;
  const files = e.dataTransfer?.files;
  if (!files) return;
  // Only the first audio/* file from a multi-file drop is used
  for (const file of files) {
    if (file.type.startsWith('audio/')) {
      try {
        setStatus('Loading audio file...');
        const { audioData, sampleRate } = await loadAudioFile(file);
        pendingAudio = { audioData, sampleRate, fileName: file.name };
        if (audioPreview) {
          audioPreview.innerHTML = `
            <div class="audio-preview-item">
              🎤 ${file.name} ${(audioData.length / sampleRate).toFixed(1)}s @ ${sampleRate}Hz
              <button onclick="clearPendingAudio()" title="Remove">✕</button>
            </div>
          `;
        }
        setStatus('Audio loaded. Click Send to process.', 'success');
      } catch (error) {
        console.error('Error loading audio:', error);
        setStatus(`Error loading audio: ${error.message}`, 'error');
      }
      break;
    }
  }
});
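// ============================================================================
// Initialization
// ============================================================================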
// Populate model dropdown
function populateModelDropdown() {
  modelSelect.innerHTML = '';
  let firstOption = null;
  for (const [key, config] of Object.entries(MODELS)) {
    const option = document.createElement('option');
    option.value = key;
    option.textContent = config.label;
    modelSelect.appendChild(option);
    if (!firstOption) firstOption = option;
  }
  if (firstOption) firstOption.selected = true;
}

// Initialize
populateModelDropdown();
updateCacheInfo();
updatePlaceholder();

// Check WebGPU on load
(async () => {
  if (!navigator.gpu) {
    setStatus('WebGPU not available - will use CPU (WASM). For GPU acceleration, enable chrome://flags/#enable-unsafe-webgpu');
    return;
  }
  try {
    const adapter = await navigator.gpu.requestAdapter();
    if (!adapter) {
      setStatus('WebGPU adapter not found - will use CPU. Check chrome://gpu for WebGPU status.');
      return;
    }
    const info = adapter.info || {};
    const desc = info.description || info.vendor || info.architecture || 'Available';
    setStatus(`WebGPU: ${desc}. Select model and click Load.`);
  } catch (e) {
    setStatus(`WebGPU error: ${e.message} - will use CPU.`);
  }
})();
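// ----------------------------------------------------------------------------
// For reference: the audio-model.js surface this demo exercises (signatures
// inferred from the call sites above; the module itself is the source of
// truth):
//
//   new AudioModel()
//   audioModel.load(path, { device, quantization, progressCallback })
//   audioModel.transcribe(audioData, sampleRate, { onToken })
//   audioModel.generateSpeech(text, { onToken, onAudioFrame })
//   audioModel.generateInterleaved(audioData, sampleRate, text, { onToken, onAudioFrame })
//   audioModel.generateInterleavedFromText(text, { onToken, onAudioFrame })
//   audioModel.generateTextOnly(text, { maxNewTokens, onToken })
//   audioModel.decodeAudioCodes(codes) -> waveform samples (24 kHz assumed)
//   audioModel.reset(), audioModel.dispose()
//   loadAudioFile(file) -> { audioData, sampleRate }
//   clearModelCache(), getCacheInfo() -> { used /* bytes */ }
// ----------------------------------------------------------------------------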