// Browser demo for LFM2.5-Audio: loads the ONNX model in-page (WebGPU when
// available, WASM otherwise) and drives a chat UI with text, ASR, TTS, and
// interleaved speech modes.
import { AudioModel, loadAudioFile, clearModelCache, getCacheInfo } from './audio-model.js';

// Base URL of the model weights on Hugging Face.
const MODEL_URL = 'https://huggingface.co/LiquidAI/LFM2.5-Audio-1.5B-ONNX/resolve/main';

// Available model variants: download path, UI label, and per-component
// quantization levels.
const MODELS = {
  'LFM2.5-Audio-1.5B-Q4': {
    path: MODEL_URL,
    label: 'LFM2.5-Audio-1.5B Q4 (~1.6 GB)',
    quantization: {
      decoder: 'q4',
      audioEncoder: 'q4',
      audioEmbedding: 'q4',
      audioDetokenizer: 'q4',
      vocoder: 'q4',
    },
  },
};
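
// Additional variants can be registered the same way. For example, a
// hypothetical q8 entry (a sketch only — it assumes the Hugging Face repo
// publishes matching q8 ONNX files, which is not verified here):
//
// 'LFM2.5-Audio-1.5B-Q8': {
//   path: MODEL_URL,
//   label: 'LFM2.5-Audio-1.5B Q8',
//   quantization: {
//     decoder: 'q8',
//     audioEncoder: 'q8',
//     audioEmbedding: 'q8',
//     audioDetokenizer: 'q8',
//     vocoder: 'q8',
//   },
// },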

const modelSelect = document.getElementById('modelSelect');
const loadBtn = document.getElementById('loadBtn');
const clearBtn = document.getElementById('clearBtn');
const statusEl = document.getElementById('status');
const chatContainer = document.getElementById('chatContainer');
const userInput = document.getElementById('userInput');
const sendBtn = document.getElementById('sendBtn');
const progressBar = document.getElementById('progressBar');
const progressFill = document.getElementById('progressFill');
const progressText = document.getElementById('progressText');
const audioModeSelect = document.getElementById('audioModeSelect');
const recordBtn = document.getElementById('recordBtn');
const audioBtn = document.getElementById('audioBtn');
const audioInput = document.getElementById('audioInput');
const audioPreview = document.getElementById('audioPreview');
const clearCacheBtn = document.getElementById('clearCacheBtn');
const cacheInfoEl = document.getElementById('cacheInfo');
const dropOverlay = document.getElementById('dropOverlay');
const spinner = document.getElementById('spinner');
const spinnerText = document.getElementById('spinnerText');
const spinnerStats = document.getElementById('spinnerStats');

// Mutable app state.
let audioModel = null;         // loaded AudioModel instance, or null
let messages = [];             // chat history as { role, content }
let isGenerating = false;      // guards against concurrent generations
let pendingAudio = null;       // { audioData: Float32Array, sampleRate, fileName } staged for Send
let audioMode = 'interleaved'; // 'interleaved' | 'asr' | 'tts'
let isRecording = false;
let mediaRecorder = null;
let audioChunks = [];          // Blob chunks accumulated while recording

// Encode a mono Float32 waveform as a 16-bit PCM WAV blob.
function createWavBlob(samples, sampleRate) {
  // Log input stats to help debug silent or clipped output.
  let min = Infinity, max = -Infinity, sum = 0, nonZero = 0;
  for (let i = 0; i < samples.length; i++) {
    const v = samples[i];
    if (v < min) min = v;
    if (v > max) max = v;
    sum += Math.abs(v);
    if (Math.abs(v) > 0.001) nonZero++;
  }
  console.log('WAV input stats:', {
    length: samples.length,
    min: min.toFixed(6),
    max: max.toFixed(6),
    avgAbs: (sum / samples.length).toFixed(6),
    nonZeroSamples: nonZero,
    percentNonZero: ((nonZero / samples.length) * 100).toFixed(1) + '%'
  });

  const numChannels = 1;
  const bitsPerSample = 16;
  const bytesPerSample = bitsPerSample / 8;
  const blockAlign = numChannels * bytesPerSample;
  const byteRate = sampleRate * blockAlign;
  const dataSize = samples.length * bytesPerSample;
  const bufferSize = 44 + dataSize; // 44-byte RIFF/fmt/data header + PCM data

  const buffer = new ArrayBuffer(bufferSize);
  const view = new DataView(buffer);

  const writeString = (offset, string) => {
    for (let i = 0; i < string.length; i++) {
      view.setUint8(offset + i, string.charCodeAt(i));
    }
  };

  // RIFF header.
  writeString(0, 'RIFF');
  view.setUint32(4, bufferSize - 8, true);
  writeString(8, 'WAVE');
  // fmt chunk (PCM).
  writeString(12, 'fmt ');
  view.setUint32(16, 16, true); // fmt chunk size
  view.setUint16(20, 1, true);  // audio format: 1 = PCM
  view.setUint16(22, numChannels, true);
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, byteRate, true);
  view.setUint16(32, blockAlign, true);
  view.setUint16(34, bitsPerSample, true);
  // data chunk.
  writeString(36, 'data');
  view.setUint32(40, dataSize, true);

  // Clamp each sample to [-1, 1] and scale to signed 16-bit.
  let offset = 44;
  for (let i = 0; i < samples.length; i++) {
    const sample = Math.max(-1, Math.min(1, samples[i]));
    const int16 = sample < 0 ? sample * 0x8000 : sample * 0x7FFF;
    view.setInt16(offset, int16, true);
    offset += 2;
  }

  return new Blob([buffer], { type: 'audio/wav' });
}

// Generate a sine test tone — handy for verifying the WAV encoder and
// playback path without running the model.
function createTestToneBlob(durationSec = 1, frequency = 440, sampleRate = 24000) {
  const numSamples = Math.floor(durationSec * sampleRate);
  const samples = new Float32Array(numSamples);
  for (let i = 0; i < numSamples; i++) {
    samples[i] = 0.5 * Math.sin(2 * Math.PI * frequency * i / sampleRate);
  }
  return createWavBlob(samples, sampleRate);
}
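
// Example (debugging only): paste into the console to confirm the encoder
// produces audible output — it should play a clean one-second 440 Hz beep.
//
// const toneUrl = URL.createObjectURL(createTestToneBlob(1, 440, 24000));
// new Audio(toneUrl).play();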

function setStatus(text, type = '') {
  statusEl.textContent = text;
  statusEl.className = type;
}

function setLoading(loading) {
  loadBtn.disabled = loading;
  modelSelect.disabled = loading;
}

function setReady(ready) {
  userInput.disabled = !ready;
  sendBtn.disabled = !ready;
  audioBtn.disabled = !ready;
  recordBtn.disabled = !ready;
}

function showProgress(show) {
  progressBar.style.display = show ? 'block' : 'none';
}

function updateProgress(percent, text) {
  progressFill.style.width = `${percent}%`;
  progressText.textContent = text || `${percent}%`;
}

function showSpinner(text, stats = '') {
  spinner.classList.add('active');
  spinnerText.textContent = text;
  spinnerStats.textContent = stats;
}

function updateSpinner(text, stats = '') {
  if (text) spinnerText.textContent = text;
  spinnerStats.textContent = stats;
}

function hideSpinner() {
  spinner.classList.remove('active');
  spinnerText.textContent = '';
  spinnerStats.textContent = '';
}

// Append a chat message. Shows an attached-audio chip when `audio` is given
// and returns element refs so streaming callbacks can update the text.
function addMessage(role, content, isStreaming = false, audio = null) {
  const msgEl = document.createElement('div');
  msgEl.className = `message ${role}${isStreaming ? ' generating' : ''}`;

  if (audio) {
    const audioEl = document.createElement('div');
    audioEl.className = 'audio-preview-item';
    audioEl.style.marginBottom = '0.5rem';
    audioEl.innerHTML = `
      <span class="audio-icon">🎤</span>
      <span class="audio-name">${audio.fileName}</span>
      <span class="audio-info">${(audio.audioData.length / audio.sampleRate).toFixed(1)}s</span>
    `;
    msgEl.appendChild(audioEl);
  }

  const textEl = document.createElement('span');
  textEl.textContent = content;
  msgEl.appendChild(textEl);

  chatContainer.appendChild(msgEl);
  chatContainer.scrollTop = chatContainer.scrollHeight;
  return { msgEl, textEl };
}

function updatePlaceholder() {
  const mode = audioModeSelect?.value || 'interleaved';
  if (mode === 'asr') {
    userInput.placeholder = 'Record or upload audio to transcribe...';
  } else if (mode === 'tts') {
    userInput.placeholder = 'Type text to convert to speech...';
  } else {
    userInput.placeholder = 'Type a message or record audio...';
  }
}

function clearPendingAudio() {
  pendingAudio = null;
  if (audioPreview) {
    audioPreview.innerHTML = '';
  }
}

// Refresh the cache-size label; the delete button is enabled only when at
// least ~1 MB is cached.
async function updateCacheInfo() {
  if (!cacheInfoEl || !clearCacheBtn) return;

  const info = await getCacheInfo();
  if (info && info.used > 0) {
    const usedMB = info.used / 1024 / 1024;
    if (usedMB >= 1000) {
      cacheInfoEl.textContent = `${(usedMB / 1024).toFixed(1)} GB cached`;
    } else if (usedMB >= 1) {
      cacheInfoEl.textContent = `${usedMB.toFixed(0)} MB cached`;
    } else {
      cacheInfoEl.textContent = 'No models cached';
    }
    clearCacheBtn.disabled = usedMB < 1;
  } else {
    cacheInfoEl.textContent = 'No models cached';
    clearCacheBtn.disabled = true;
  }
}
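
// getCacheInfo() is implemented in ./audio-model.js and is not shown here.
// If a rough stand-in is ever needed, the standard Storage API reports a
// similar figure (a sketch — it assumes origin-wide usage is dominated by
// the model cache, which may not hold):
//
// async function getCacheInfoFallback() {
//   const { usage = 0, quota = 0 } = await navigator.storage.estimate();
//   return { used: usage, quota };
// }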

// Start capturing microphone audio via MediaRecorder.
async function startRecording() {
  try {
    if (!window.isSecureContext) {
      throw new Error('Microphone requires HTTPS. Use localhost or enable HTTPS.');
    }
    if (!navigator.mediaDevices?.getUserMedia) {
      throw new Error('MediaDevices API not available in this browser.');
    }

    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    audioChunks = [];
    mediaRecorder = new MediaRecorder(stream);

    mediaRecorder.ondataavailable = (e) => {
      if (e.data.size > 0) {
        audioChunks.push(e.data);
      }
    };

    mediaRecorder.onstop = async () => {
      stream.getTracks().forEach(track => track.stop());
      // Use the recorder's actual container type (e.g. audio/webm on
      // Chromium, audio/mp4 on Safari) rather than assuming WebM.
      const audioBlob = new Blob(audioChunks, { type: mediaRecorder.mimeType || 'audio/webm' });
      await processRecordedAudio(audioBlob);
    };

    mediaRecorder.start();
    isRecording = true;
    recordBtn.classList.add('recording');
    recordBtn.textContent = '⏹️';
    recordBtn.title = 'Stop recording';
    setStatus('Recording... Click to stop', 'success');
  } catch (error) {
    console.error('Failed to start recording:', error);
    setStatus(`Microphone error: ${error.message}`, 'error');
  }
}

function stopRecording() {
  if (mediaRecorder && mediaRecorder.state === 'recording') {
    mediaRecorder.stop();
    isRecording = false;
    recordBtn.classList.remove('recording');
    recordBtn.textContent = '🎤';
    recordBtn.title = 'Record from microphone';
  }
}
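
// If a specific recording format is ever required, feature-detect one before
// constructing the recorder — a sketch using the standard
// MediaRecorder.isTypeSupported API (pickRecorderMimeType is hypothetical):
//
// function pickRecorderMimeType() {
//   const candidates = ['audio/webm;codecs=opus', 'audio/mp4', 'audio/ogg;codecs=opus'];
//   return candidates.find((t) => MediaRecorder.isTypeSupported(t)) || '';
// }
// // ... new MediaRecorder(stream, { mimeType: pickRecorderMimeType() });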

// Render the pending-audio chip above the input box.
function showAudioPreview(name, audioData, sampleRate) {
  if (!audioPreview) return;
  audioPreview.innerHTML = `
    <div class="audio-preview-item">
      <span class="audio-icon">🎤</span>
      <span class="audio-name">${name}</span>
      <span class="audio-info">${(audioData.length / sampleRate).toFixed(1)}s @ ${sampleRate}Hz</span>
      <button class="remove-btn" onclick="window.clearPendingAudio()">×</button>
    </div>
  `;
}

// Decode a recorded blob to mono Float32 samples and stage it as pendingAudio.
async function processRecordedAudio(audioBlob) {
  try {
    setStatus('Processing recording...');
    const arrayBuffer = await audioBlob.arrayBuffer();
    const audioContext = new (window.AudioContext || window.webkitAudioContext)();
    const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);

    // Downmix to mono by averaging all channels.
    let audioData;
    if (audioBuffer.numberOfChannels === 1) {
      audioData = new Float32Array(audioBuffer.getChannelData(0));
    } else {
      const channels = [];
      for (let c = 0; c < audioBuffer.numberOfChannels; c++) {
        channels.push(audioBuffer.getChannelData(c));
      }
      audioData = new Float32Array(audioBuffer.length);
      for (let i = 0; i < audioData.length; i++) {
        let acc = 0;
        for (const ch of channels) acc += ch[i];
        audioData[i] = acc / channels.length;
      }
    }

    const sampleRate = audioBuffer.sampleRate;
    const duration = (audioData.length / sampleRate).toFixed(1);
    pendingAudio = { audioData, sampleRate, fileName: `Recording (${duration}s)` };
    audioContext.close();

    showAudioPreview('Recording', audioData, sampleRate);
    setStatus('Recording ready. Click Send to process.', 'success');
  } catch (error) {
    console.error('Failed to process recording:', error);
    setStatus(`Error processing recording: ${error.message}`, 'error');
  }
}

// Download (or reuse cached) model weights and initialize the AudioModel.
async function loadModel() {
  const modelKey = modelSelect.value;
  const modelConfig = MODELS[modelKey];

  if (!modelConfig) {
    setStatus('Invalid model selection', 'error');
    return;
  }

  setLoading(true);
  setReady(false);
  showProgress(true);
  updateProgress(0, 'Starting...');
  setStatus(`Loading ${modelConfig.label}...`);

  // Release any previously loaded model and reset the conversation.
  if (audioModel) {
    console.log('Disposing previous model...');
    audioModel.dispose();
    audioModel = null;
  }
  messages = [];
  pendingAudio = null;
  chatContainer.innerHTML = '';

  try {
    const useWebGPU = !!navigator.gpu;
    if (!useWebGPU) {
      console.warn('WebGPU not available, falling back to WASM (CPU)');
    }

    const device = useWebGPU ? 'webgpu' : 'wasm';
    setStatus(`Loading audio model (${device})...`);

    audioModel = new AudioModel();
    await audioModel.load(modelConfig.path, {
      device,
      quantization: modelConfig.quantization || null,
      progressCallback: (progress) => {
        if (progress.status === 'loading') {
          updateProgress(progress.progress, `Loading ${progress.file}...`);
        } else if (progress.status === 'done') {
          updateProgress(100, 'Done');
        }
      },
    });

    showProgress(false);
    setStatus(`Ready! Audio model loaded on ${device === 'webgpu' ? 'WebGPU' : 'CPU'}`, 'success');
    setReady(true);
    updateCacheInfo();
    updatePlaceholder();

  } catch (error) {
    console.error('Load error:', error);
    showProgress(false);
    const msg = error instanceof Error ? error.message : String(error);
    setStatus(`Error: ${msg}`, 'error');
    audioModel = null;
  } finally {
    setLoading(false);
  }
}

// Decode generated audio codes with the vocoder and, if a waveform comes
// back, attach an inline player and a download link to the message element.
// Returns the decoded waveform (possibly empty).
async function attachGeneratedAudio(msgEl, audioCodes) {
  updateSpinner('Decoding audio...', `${audioCodes.length} frames`);
  const waveform = await audioModel.decodeAudioCodes(audioCodes);
  console.log('Waveform decoded:', waveform.length, 'samples');
  if (waveform.length === 0) {
    console.warn('Waveform decoding returned empty result');
    return waveform;
  }

  const wavBlob = createWavBlob(waveform, 24000); // vocoder outputs 24 kHz audio
  console.log('WAV blob created:', wavBlob.size, 'bytes, duration:', (waveform.length / 24000).toFixed(2), 's');
  const audioUrl = URL.createObjectURL(wavBlob);

  const audioContainer = document.createElement('div');
  audioContainer.className = 'audio-output';
  audioContainer.style.marginTop = '0.75rem';
  audioContainer.innerHTML = `
    <audio controls preload="auto" src="${audioUrl}" style="width:100%;max-width:360px;display:block;"></audio>
    <a href="${audioUrl}" download="generated_audio.wav" style="display:block;font-size:0.7rem;margin-top:0.25rem;color:#666;">Download WAV (${(waveform.length / 24000).toFixed(1)}s)</a>
  `;
  msgEl.appendChild(audioContainer);
  chatContainer.scrollTop = chatContainer.scrollHeight;
  return waveform;
}

// Run one generation turn in the currently selected mode (ASR, TTS, or
// interleaved), streaming text and audio-frame progress into the UI.
async function generate(userMessage) {
  if (!audioModel || isGenerating) return;

  isGenerating = true;
  setReady(false);

  // Consume any staged audio and clear its preview chip.
  const audioToSend = pendingAudio;
  if (audioToSend && audioPreview) {
    audioPreview.innerHTML = '';
  }

  messages.push({ role: 'user', content: userMessage });
  addMessage('user', userMessage, false, audioToSend);

  const { msgEl, textEl } = addMessage('assistant', '', true);
  let generatedText = '';
  const startTime = performance.now();
  let tokenCount = 0;
  let audioFrameCount = 0;

  try {
    const currentMode = audioModeSelect?.value || 'interleaved';

    // Incremental token callback (ASR/TTS paths). Returning true stops generation.
    const onTokenCallback = (token, tokenId) => {
      if (token.includes('<|im_end|>') || token.includes('<|endoftext|>')) {
        return true;
      }
      generatedText += token;
      tokenCount++;
      textEl.textContent = generatedText;
      chatContainer.scrollTop = chatContainer.scrollHeight;
      const elapsed = ((performance.now() - startTime) / 1000).toFixed(1);
      updateSpinner(null, `${tokenCount} tokens · ${elapsed}s`);
      return false;
    };

    // Full-text callback (interleaved paths report the whole text so far).
    const onInterleavedText = (text, tokenId) => {
      generatedText = text;
      tokenCount = text.length;
      textEl.textContent = text;
      chatContainer.scrollTop = chatContainer.scrollHeight;
      const elapsed = ((performance.now() - startTime) / 1000).toFixed(1);
      updateSpinner('Generating text...', `${tokenCount} chars · ${elapsed}s`);
    };

    const onAudioFrame = (frame, count) => {
      audioFrameCount = count;
      const elapsed = ((performance.now() - startTime) / 1000).toFixed(1);
      updateSpinner('Generating audio...', `${count} frames · ${elapsed}s`);
    };

    if (currentMode === 'asr' && audioToSend) {
      // Speech-to-text.
      showSpinner('Transcribing audio...');
      generatedText = await audioModel.transcribe(
        audioToSend.audioData,
        audioToSend.sampleRate,
        { onToken: onTokenCallback }
      );
      pendingAudio = null;

    } else if (currentMode === 'tts') {
      // Text-to-speech.
      showSpinner('Generating speech...');
      const result = await audioModel.generateSpeech(userMessage, {
        onToken: onTokenCallback,
        onAudioFrame,
      });

      if (result.audioCodes && result.audioCodes.length > 0) {
        const waveform = await attachGeneratedAudio(msgEl, result.audioCodes);
        if (waveform.length > 0) {
          generatedText = result.textOutput || `Generated ${result.audioCodes.length} audio frames (${(waveform.length / 24000).toFixed(2)}s)`;
        } else {
          generatedText = '[Audio decoding failed - no waveform generated]';
        }
      } else {
        generatedText = result.textOutput || '[No audio generated]';
      }

    } else if (currentMode === 'interleaved' && audioToSend) {
      // Speech in, interleaved text + speech out.
      showSpinner('Processing audio...');
      const result = await audioModel.generateInterleaved(
        audioToSend.audioData,
        audioToSend.sampleRate,
        userMessage,
        { onToken: onInterleavedText, onAudioFrame }
      );
      pendingAudio = null;

      generatedText = result.text || '';
      textEl.textContent = generatedText;

      if (result.audioCodes && result.audioCodes.length > 0) {
        const waveform = await attachGeneratedAudio(msgEl, result.audioCodes);
        if (waveform.length > 0 && !generatedText) {
          generatedText = `Generated ${result.audioCodes.length} audio frames`;
        }
      }

    } else if (currentMode === 'interleaved' && userMessage) {
      // Text in, interleaved text + speech out.
      showSpinner('Generating response...');
      const result = await audioModel.generateInterleavedFromText(userMessage, {
        onToken: onInterleavedText,
        onAudioFrame,
      });

      generatedText = result.text || '';
      textEl.textContent = generatedText;

      if (result.audioCodes && result.audioCodes.length > 0) {
        const waveform = await attachGeneratedAudio(msgEl, result.audioCodes);
        if (waveform.length > 0 && !generatedText) {
          generatedText = `Generated ${result.audioCodes.length} audio frames`;
        }
      }

    } else if (userMessage) {
      // Plain text chat, no audio output.
      showSpinner('Generating response...');
      const result = await audioModel.generateTextOnly(userMessage, {
        maxNewTokens: 256,
        onToken: (text, tokenId) => {
          generatedText = text;
          tokenCount = text.length;
          textEl.textContent = text;
          chatContainer.scrollTop = chatContainer.scrollHeight;
        },
      });
      generatedText = result.text || '';
    }

    generatedText = generatedText.replace(/<\|im_end\|>$/g, '').trim();

    const elapsed = (performance.now() - startTime) / 1000;
    const tokensPerSec = elapsed > 0 ? tokenCount / elapsed : 0;

    msgEl.classList.remove('generating');
    textEl.textContent = generatedText;

    const statsEl = document.createElement('div');
    statsEl.className = 'stats';
    statsEl.textContent = `${tokenCount} tokens in ${elapsed.toFixed(1)}s (${tokensPerSec.toFixed(1)} tok/s)`;
    msgEl.appendChild(statsEl);

    messages.push({ role: 'assistant', content: generatedText });
    setStatus('Ready', 'success');

  } catch (error) {
    console.error('Generation error:', error);
    textEl.textContent = `Error: ${error.message}`;
    msgEl.classList.remove('generating');
    messages.pop(); // drop the failed user turn from history
    setStatus(`Error: ${error.message}`, 'error');
  } finally {
    hideSpinner();
    isGenerating = false;
    setReady(true);
    userInput.focus();
  }
}

// Event wiring.
loadBtn.addEventListener('click', loadModel);

audioModeSelect.addEventListener('change', () => {
  audioMode = audioModeSelect.value;
  updatePlaceholder();
  console.log(`Audio mode changed to: ${audioMode}`);
});

recordBtn.addEventListener('click', () => {
  if (isRecording) {
    stopRecording();
  } else {
    startRecording();
  }
});

clearBtn.addEventListener('click', () => {
  messages = [];
  chatContainer.innerHTML = '';
  clearPendingAudio();

  if (audioModel) {
    audioModel.reset();
    setStatus('Conversation reset', 'success');
  }
});

clearCacheBtn.addEventListener('click', async () => {
  if (clearCacheBtn.disabled) return;

  const info = await getCacheInfo();
  const usedMB = info ? (info.used / 1024 / 1024).toFixed(0) : '0';
  const confirmed = confirm(
    `Delete downloaded model files?\n\n` +
    `This will free up ~${usedMB} MB of storage.\n` +
    `Models will be re-downloaded next time you load them.`
  );
  if (!confirmed) return;

  clearCacheBtn.textContent = 'Deleting...';
  await clearModelCache();
  clearCacheBtn.textContent = 'Delete Models';
  await updateCacheInfo();
  setStatus('Downloaded models deleted', 'success');
});

sendBtn.addEventListener('click', () => {
  const text = userInput.value.trim();
  const mode = audioModeSelect?.value || 'interleaved';

  if (mode === 'tts' && !text) {
    setStatus('Please enter text to convert to speech', 'error');
    return;
  }

  if (mode === 'asr' && !pendingAudio && !text) {
    setStatus('Record or upload audio to transcribe', 'error');
    return;
  }

  if (text || pendingAudio) {
    userInput.value = '';
    generate(text);
  }
});

// Enter sends; Shift+Enter inserts a newline.
userInput.addEventListener('keydown', (e) => {
  if (e.key === 'Enter' && !e.shiftKey) {
    e.preventDefault();
    sendBtn.click();
  }
});

audioBtn.addEventListener('click', () => {
  audioInput.click();
});

audioInput.addEventListener('change', async (e) => {
  const file = e.target.files[0];
  if (file) {
    try {
      setStatus('Loading audio file...');
      const { audioData, sampleRate } = await loadAudioFile(file);
      pendingAudio = { audioData, sampleRate, fileName: file.name };
      showAudioPreview(file.name, audioData, sampleRate);
      setStatus('Audio loaded. Click Send to process.', 'success');
    } catch (error) {
      console.error('Error loading audio:', error);
      setStatus(`Error loading audio: ${error.message}`, 'error');
    }
  }
  audioInput.value = ''; // allow re-selecting the same file
});

// Exposed for the inline onclick handler in the preview chip.
window.clearPendingAudio = clearPendingAudio;

// Drag-and-drop: show the overlay on dragenter, load the first audio file dropped.
document.addEventListener('dragenter', (e) => {
  if (isGenerating) return;
  e.preventDefault();
  dropOverlay.classList.add('active');
});

dropOverlay.addEventListener('dragleave', (e) => {
  e.preventDefault();
  dropOverlay.classList.remove('active');
});

dropOverlay.addEventListener('dragover', (e) => {
  e.preventDefault();
});

dropOverlay.addEventListener('drop', async (e) => {
  e.preventDefault();
  dropOverlay.classList.remove('active');

  if (isGenerating) return;

  const files = e.dataTransfer?.files;
  if (!files) return;

  for (const file of files) {
    if (file.type.startsWith('audio/')) {
      try {
        setStatus('Loading audio file...');
        const { audioData, sampleRate } = await loadAudioFile(file);
        pendingAudio = { audioData, sampleRate, fileName: file.name };
        showAudioPreview(file.name, audioData, sampleRate);
        setStatus('Audio loaded. Click Send to process.', 'success');
      } catch (error) {
        console.error('Error loading audio:', error);
        setStatus(`Error loading audio: ${error.message}`, 'error');
      }
      break; // only the first audio file is used
    }
  }
});

function populateModelDropdown() {
  modelSelect.innerHTML = '';
  let firstOption = null;

  for (const [key, config] of Object.entries(MODELS)) {
    const option = document.createElement('option');
    option.value = key;
    option.textContent = config.label;
    modelSelect.appendChild(option);
    if (!firstOption) firstOption = option;
  }

  if (firstOption) firstOption.selected = true;
}

// Initial UI state.
populateModelDropdown();
updateCacheInfo();
updatePlaceholder();

// Probe WebGPU availability up front so the status line can report which
// backend will be used before the model is loaded.
(async () => {
  if (!navigator.gpu) {
    setStatus('WebGPU not available - will use CPU (WASM). For GPU acceleration, enable chrome://flags/#enable-unsafe-webgpu');
    return;
  }

  try {
    const adapter = await navigator.gpu.requestAdapter();
    if (!adapter) {
      setStatus('WebGPU adapter not found - will use CPU. Check chrome://gpu for WebGPU status.');
      return;
    }

    const info = adapter.info || {};
    const desc = info.description || info.vendor || info.architecture || 'Available';
    setStatus(`WebGPU: ${desc}. Select model and click Load.`);
  } catch (e) {
    setStatus(`WebGPU error: ${e.message} - will use CPU.`);
  }
})();