<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Voice Chat Bot with Advanced Echo Cancellation</title>
    <!-- onnxruntime-web must load before the VAD bundle, which expects the global `ort` object -->
    <script src="https://cdn.jsdelivr.net/npm/[email protected]/dist/ort.js"></script>
    <script src="https://cdn.jsdelivr.net/npm/@ricky0123/[email protected]/dist/bundle.min.js"></script>
    <style>
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            margin: 0;
            padding: 20px;
            background-color: #1a1a1a;
            color: #f0f0f0;
        }
        .container {
            max-width: 800px;
            margin: 0 auto;
        }
        h1 {
            color: #ffd700;
            text-align: center;
            margin-bottom: 10px;
        }
        .subtitle {
            text-align: center;
            color: #ffd700;
            margin-bottom: 20px;
        }
        #chat-container {
            display: flex;
            flex-direction: column;
            height: 70vh;
        }
        #conversation {
            flex-grow: 1;
            border: 1px solid #444;
            padding: 10px;
            overflow-y: scroll;
            background-color: #2a2a2a;
            border-radius: 5px;
            margin-bottom: 20px;
        }
        #controls {
            display: flex;
            justify-content: center;
            margin-bottom: 20px;
        }
        button {
            font-size: 18px;
            padding: 10px 20px;
            background-color: #ffd700;
            color: #1a1a1a;
            border: none;
            border-radius: 5px;
            cursor: pointer;
            transition: background-color 0.3s;
        }
        button:hover {
            background-color: #ffec8b;
        }
        button:disabled {
            background-color: #666;
            cursor: not-allowed;
        }
        #visualizer {
            width: 100%;
            height: 100px;
            background-color: #2a2a2a;
            border-radius: 5px;
            overflow: hidden;
            margin-bottom: 20px;
        }
        .bar {
            width: 5px;
            height: 100%;
            background-color: #ffd700;
            display: inline-block;
            margin-right: 1px;
        }
        #loading {
            position: fixed;
            top: 0;
            left: 0;
            width: 100%;
            height: 100%;
            background-color: rgba(0, 0, 0, 0.8);
            display: flex;
            justify-content: center;
            align-items: center;
            z-index: 1000;
        }
        .spinner {
            width: 50px;
            height: 50px;
            border: 5px solid #f3f3f3;
            border-top: 5px solid #ffd700;
            border-radius: 50%;
            animation: spin 1s linear infinite;
        }
        @keyframes spin {
            0% { transform: rotate(0deg); }
            100% { transform: rotate(360deg); }
        }
        #configuration {
            margin-bottom: 20px;
        }
        select {
            width: 100%;
            padding: 10px;
            font-size: 16px;
            background-color: #2a2a2a;
            color: #f0f0f0;
            border: 1px solid #444;
            border-radius: 5px;
        }
        #model-info {
            margin-top: 10px;
            font-size: 14px;
            color: #aaa;
        }
        #logs {
            background-color: #2a2a2a;
            border: 1px solid #444;
            border-radius: 5px;
            padding: 10px;
            height: 200px;
            overflow-y: scroll;
            font-family: monospace;
            font-size: 14px;
        }
        #clear-logs {
            margin-top: 10px;
            font-size: 14px;
            padding: 5px 10px;
        }
        #localVideo, #remoteVideo {
            display: none;
        }
    </style>
</head>
<body>
    <div id="loading">
        <div class="spinner"></div>
    </div>
    <div class="container">
        <h1>Digital Human Voice Chat</h1>
        <p class="subtitle">For best results, use headphones.</p>
        <div id="chat-container">
            <div id="controls">
                <button id="startButton" disabled>Begin Call</button>
            </div>
            <div id="configuration">
                <select id="configSelect">
                    <option value="fastest">Fastest</option>
                    <option value="balanced">Balanced</option>
                    <option value="quality">Highest Quality</option>
                </select>
                <div id="model-info">
                    TTS: Xenova/mms-tts-eng / STT: Xenova/whisper-tiny.en / LLM: TinyLlama-1.1B-Chat-v0.4-q4f16_1-1k
                </div>
            </div>
            <div id="visualizer"></div>
            <div id="conversation"></div>
        </div>
        <h2>Logs</h2>
        <div id="logs"></div>
        <button id="clear-logs">Clear</button>
    </div>
    <!-- Hidden, muted video elements that hold the captured streams -->
    <video id="localVideo" autoplay muted playsinline></video>
    <video id="remoteVideo" autoplay muted playsinline></video>
    <script type="module">
        import { pipeline, env } from 'https://cdn.jsdelivr.net/npm/@xenova/[email protected]';
        // Pinned to an older @mlc-ai/web-llm release that still exports ChatModule;
        // newer releases replaced it with the MLCEngine API.
        import { ChatModule } from 'https://esm.run/@mlc-ai/[email protected]';

        env.localModelPath = './models';
        // Configure the ONNX Runtime WASM backend before any pipeline is created.
        env.backends.onnx.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/@xenova/[email protected]/dist/';
        env.backends.onnx.wasm.simd = true;
        env.backends.onnx.wasm.numThreads = navigator.hardwareConcurrency || 4;
        const conversationDiv = document.getElementById('conversation');
        const startButton = document.getElementById('startButton');
        const visualizer = document.getElementById('visualizer');
        const loadingDiv = document.getElementById('loading');
        const logsDiv = document.getElementById('logs');
        const clearLogsButton = document.getElementById('clear-logs');
        const localVideo = document.getElementById('localVideo');
        const remoteVideo = document.getElementById('remoteVideo');

        let myvad;
        let sttPipeline;
        let ttsPipeline;
        let llmChat;
        let audioContext;
        let analyser;
        let dataArray;
        let bars;
        let animationId;
        let isListening = false;
        let microphoneStream;
        let isSpeaking = false;
        let currentAudioSource = null;
        let rtcConnection = null;
        let rtcLoopbackConnection = null;
        let loopbackStream = new MediaStream();
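        // Frequency-bar visualizer: 64 bars whose heights track the analyser's byte-frequency data.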
        function createVisualizer() {
            const barCount = 64;
            for (let i = 0; i < barCount; i++) {
                const bar = document.createElement('div');
                bar.className = 'bar';
                visualizer.appendChild(bar);
            }
            bars = visualizer.getElementsByClassName('bar');
        }

        function updateVisualizer() {
            analyser.getByteFrequencyData(dataArray);
            for (let i = 0; i < bars.length; i++) {
                const barHeight = dataArray[i] / 2;
                bars[i].style.height = barHeight + 'px';
            }
            // Throttled redraw loop (~20 fps); stopped elsewhere with clearTimeout(animationId).
            animationId = setTimeout(updateVisualizer, 50);
        }
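        // Load the STT and TTS pipelines in parallel, then bring up the in-browser LLM.
        // ChatModule.reload() and its options follow the older web-llm API pinned above.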
        async function initializePipelines() {
            try {
                addLog('System: Initializing pipelines...');
                [sttPipeline, ttsPipeline] = await Promise.all([
                    pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en', { quantized: true }),
                    pipeline('text-to-speech', 'Xenova/mms-tts-eng', { quantized: true })
                ]);
                addLog('System: Initializing WebLLM...');
                llmChat = new ChatModule();
                // Sampling options override the model's mlc-chat-config defaults.
                await llmChat.reload('TinyLlama-1.1B-Chat-v0.4-q4f16_1-1k', {
                    temperature: 0.7,
                    max_gen_len: 256
                });
                addLog('System: WebLLM initialized successfully.');
                addLog('System: Digital Human Voice Chat initialized. Click "Begin Call" to start.');
                startButton.disabled = false;
                loadingDiv.style.display = 'none';
            } catch (error) {
                console.error('Error initializing pipelines:', error);
                addLog(`System: Error initializing pipelines: ${error.message}`);
                loadingDiv.style.display = 'none';
            }
        }
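        // One conversational turn: transcribe the utterance, generate a reply, synthesize it, play it.
        // isSpeaking gates the barge-in logic in the VAD's onSpeechStart callback.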
        async function processSpeech(audio) {
            try {
                if (!sttPipeline || !ttsPipeline || !llmChat) {
                    throw new Error('Pipelines not initialized');
                }
                const transcription = await sttPipeline(audio);
                addLog(`User: ${transcription.text}`);
                const reply = await llmChat.generate(transcription.text);
                const botResponse = reply.trim();
                addLog(`Bot: ${botResponse}`);
                isSpeaking = true;
                const speechOutput = await ttsPipeline(botResponse);
                await playAudio(speechOutput.audio, speechOutput.sampling_rate);
                isSpeaking = false;
            } catch (error) {
                console.error('Error processing speech:', error);
                addLog(`System: Error processing speech: ${error.message}`);
            }
        }
        function addLog(message) {
            const now = new Date();
            const timestamp = now.toLocaleTimeString();
            const logMessage = `[${timestamp}] ${message}`;
            const messageElement = document.createElement('div');
            messageElement.textContent = logMessage;
            logsDiv.appendChild(messageElement);
            logsDiv.scrollTop = logsDiv.scrollHeight;
        }
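        // Play a Float32Array of PCM samples through the analyser (so the visualizer tracks bot
        // speech) to the speakers. mms-tts-eng produces 16 kHz audio, hence the fallback rate.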
        function playAudio(audioArray, sampleRate = 16000) {
            return new Promise((resolve) => {
                const audioBuffer = audioContext.createBuffer(1, audioArray.length, sampleRate);
                const channelData = audioBuffer.getChannelData(0);
                channelData.set(audioArray);
                const source = audioContext.createBufferSource();
                currentAudioSource = source;
                source.buffer = audioBuffer;
                source.connect(analyser);
                analyser.connect(audioContext.destination);
                source.start();
                source.onended = () => {
                    currentAudioSource = null;
                    resolve();
                };
            });
        }
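        // Barge-in: cut off the bot's current playback as soon as the user starts talking.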
        function stopCurrentAudio() {
            if (currentAudioSource) {
                currentAudioSource.stop();
                currentAudioSource = null;
            }
        }
        async function toggleListening() {
            if (isListening) {
                await stopListening();
            } else {
                await startListening();
            }
        }
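        // Start the capture chain. The microphone stream is routed through a pair of local
        // RTCPeerConnections (a loopback), a common workaround that runs the audio through the
        // browser's WebRTC processing, echo cancellation in particular. The loopback output
        // feeds the visualizer's analyser, while the VAD opens its own echo-cancelled capture.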
        async function startListening() {
            try {
                audioContext = new (window.AudioContext || window.webkitAudioContext)();
                analyser = audioContext.createAnalyser();
                analyser.fftSize = 128;
                dataArray = new Uint8Array(analyser.frequencyBinCount);

                // Keep the hidden video elements silent; they exist only to hold streams.
                localVideo.muted = true;
                localVideo.volume = 0;
                remoteVideo.muted = true;
                remoteVideo.volume = 0;

                microphoneStream = await navigator.mediaDevices.getUserMedia({
                    audio: { echoCancellation: true, noiseSuppression: true, autoGainControl: true },
                    video: { width: 1, height: 1 }
                });
                localVideo.srcObject = microphoneStream;
                await localVideo.play();
                console.log('Active constraints:', microphoneStream.getAudioTracks()[0].getConstraints());
                console.log('Microphone stream settings:', microphoneStream.getAudioTracks()[0].getSettings());

                // Wire the two in-page peer connections to each other (the loopback).
                const offerOptions = {
                    offerToReceiveAudio: true,
                    offerToReceiveVideo: false,
                };
                rtcConnection = new RTCPeerConnection();
                rtcLoopbackConnection = new RTCPeerConnection();
                rtcConnection.onicecandidate = e => e.candidate && rtcLoopbackConnection.addIceCandidate(new RTCIceCandidate(e.candidate));
                rtcLoopbackConnection.onicecandidate = e => e.candidate && rtcConnection.addIceCandidate(new RTCIceCandidate(e.candidate));
                rtcLoopbackConnection.ontrack = e => e.streams[0].getTracks().forEach(track => loopbackStream.addTrack(track));
                microphoneStream.getTracks().forEach(track => rtcConnection.addTrack(track, microphoneStream));
                const offer = await rtcConnection.createOffer(offerOptions);
                await rtcConnection.setLocalDescription(offer);
                await rtcLoopbackConnection.setRemoteDescription(offer);
                const answer = await rtcLoopbackConnection.createAnswer();
                await rtcLoopbackConnection.setLocalDescription(answer);
                await rtcConnection.setRemoteDescription(answer);

                // Feed the loopback output to the visualizer's analyser.
                const source = audioContext.createMediaStreamSource(loopbackStream);
                source.connect(analyser);

                myvad = await vad.MicVAD.new({
                    // Ask the VAD's own getUserMedia capture for echo cancellation and noise suppression.
                    additionalAudioConstraints: { echoCancellation: true, noiseSuppression: true },
                    onSpeechStart: () => {
                        addLog('--- Voice activity: speech start');
                        updateVisualizer();
                        if (isSpeaking) {
                            addLog('User interrupted. Stopping bot speech.');
                            stopCurrentAudio();
                            isSpeaking = false;
                        }
                    },
                    onSpeechEnd: (audio) => {
                        addLog('--- Voice activity: speech end');
                        clearTimeout(animationId);
                        processSpeech(audio);
                    }
                });
                await myvad.start();
                startButton.textContent = 'End Call';
                isListening = true;
                addLog('System: Listening...');
            } catch (error) {
                console.error('Error starting voice activity:', error);
                addLog(`System: Error starting voice detection: ${error.message}`);
            }
        }
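        // Tear down the VAD, microphone, audio context, and both peer connections, then reset the UI.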
        async function stopListening() {
            stopCurrentAudio();
            clearTimeout(animationId);
            if (myvad) {
                try {
                    await myvad.destroy();
                } catch (error) {
                    console.error('Error stopping voice activity:', error);
                }
                myvad = null;
            }
            if (microphoneStream) {
                microphoneStream.getTracks().forEach(track => track.stop());
                microphoneStream = null;
            }
            if (audioContext) {
                await audioContext.close();
                audioContext = null;
            }
            if (localVideo) {
                localVideo.srcObject = null;
            }
            if (remoteVideo) {
                remoteVideo.srcObject = null;
            }
            if (rtcConnection) {
                rtcConnection.close();
                rtcConnection = null;
            }
            if (rtcLoopbackConnection) {
                rtcLoopbackConnection.close();
                rtcLoopbackConnection = null;
            }
            loopbackStream = new MediaStream();
            startButton.textContent = 'Begin Call';
            isListening = false;
            addLog('System: Stopped listening. Microphone closed.');
        }
        startButton.addEventListener('click', toggleListening);
        clearLogsButton.addEventListener('click', () => {
            logsDiv.innerHTML = '';
        });
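        // Boot: build the visualizer right away; the Begin Call button is enabled once the models finish loading.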
        createVisualizer();
        initializePipelines();
    </script>
</body>
</html>