// Spaces:
// Running
// Running
import OnnxWrapper from './Silero.ts';
/**
 * Path of the ONNX model file passed to the OnnxWrapper constructor.
 * Make sure this path is correct.
 */
const modelPath = 'silero_vad.onnx';
/**
 * Streaming voice-activity detector built on top of an OnnxWrapper model.
 *
 * Audio is fed chunk by chunk through {@link VadDetector#apply}; the detector
 * keeps a running sample counter and a small state machine (`triggered`,
 * `tempEnd`) and reports the sample (or second) at which speech starts and
 * ends.
 */
export class VadDetector {
  /**
   * @param {number} startThreshold - Speech probability at or above which speech is considered started.
   * @param {number} endThreshold - Speech probability below which a potential end of speech is recorded.
   * @param {number} samplingRate - Audio sampling rate; only 8000 and 16000 are accepted.
   * @param {number} minSilenceDurationMs - Silence duration (ms) that must elapse before an end is confirmed.
   * @param {number} speechPadMs - Padding (ms) subtracted from reported starts and added to reported ends.
   * @throws {Error} If `samplingRate` is neither 8000 nor 16000.
   */
  constructor(startThreshold, endThreshold, samplingRate, minSilenceDurationMs, speechPadMs) {
    if (samplingRate !== 8000 && samplingRate !== 16000) {
      throw new Error("Does not support sampling rates other than [8000, 16000]");
    }
    this.model = new OnnxWrapper(modelPath);
    this.startThreshold = startThreshold;
    this.endThreshold = endThreshold;
    this.samplingRate = samplingRate;
    // Convert the millisecond settings into sample counts at this sampling rate.
    this.minSilenceSamples = samplingRate * minSilenceDurationMs / 1000;
    this.speechPadSamples = samplingRate * speechPadMs / 1000;
    this.reset();
    console.log(`VadDetector initialized with: startThreshold=${startThreshold}, endThreshold=${endThreshold}, samplingRate=${samplingRate}`);
  }

  /**
   * Resets the model states and the detector's own bookkeeping
   * (`triggered`, `tempEnd`, `currentSample`).
   */
  reset() {
    this.model.resetStates();
    this.triggered = false;
    this.tempEnd = 0;
    this.currentSample = 0;
    console.log('VadDetector reset');
  }

  /**
   * Runs the model on one chunk of audio and advances the detection state.
   *
   * The chunk is zero-padded or truncated to the model window (512 samples at
   * 16000 Hz, 256 otherwise); the caller's `data` is never modified.
   *
   * @param {ArrayLike<number>} data - One chunk of audio samples.
   * @param {boolean} returnSeconds - When truthy, positions are reported in seconds
   *   rounded to one decimal; otherwise in samples.
   * @returns {Promise<{start?: number, end?: number}>} `{ start }` when speech begins,
   *   `{ end }` when a speech end is confirmed, `{}` otherwise.
   * @throws {Error} If the model call fails or returns an unexpected shape; the
   *   underlying error is available as `cause`.
   */
  async apply(data, returnSeconds) {
    const inputLength = data.length;
    console.log(`Applying VAD to data of length ${inputLength}`);
    // The running position advances by the caller's chunk length, before any
    // padding or truncation is applied.
    this.currentSample += inputLength;
    const rowLength = this.samplingRate === 16000 ? 512 : 256;

    if (inputLength < rowLength) {
      console.warn(`Input data length (${inputLength}) is less than required (${rowLength}). Padding with zeros.`);
    } else if (inputLength > rowLength) {
      console.warn(`Input data length (${inputLength}) is greater than required (${rowLength}). Truncating.`);
    }
    // Build a fresh plain array of exactly rowLength samples: copies the input,
    // zero-fills a short chunk and drops the excess of a long one.
    const frame = Array.from({ length: rowLength }, (_, i) => (i < inputLength ? data[i] : 0));
    const x = [frame];

    let speechProb;
    try {
      console.log(`Calling model with input shape: [${x.length}, ${x[0].length}], sample rate: ${this.samplingRate}`);
      const result = await this.model.call(x, this.samplingRate);
      if (Array.isArray(result) && result[0] && result[0][0] !== undefined) {
        speechProb = result[0][0];
        console.log(`Speech probability: ${speechProb}`);
      } else {
        throw new Error("Unexpected response from model");
      }
    } catch (e) {
      console.error("Error in VadDetector.apply:", e);
      // Keep the original error (and its stack) reachable through `cause`.
      throw new Error(`Error calling the model: ${e}`, { cause: e });
    }

    const isSpeech = speechProb >= this.startThreshold;

    // Speech resumed before the silence was long enough: drop the pending end.
    if (isSpeech && this.tempEnd !== 0) {
      this.tempEnd = 0;
    }

    if (isSpeech && !this.triggered) {
      this.triggered = true;
      const speechStart = Math.max(this.currentSample - this.speechPadSamples, 0);
      console.log(`Speech start detected at sample ${speechStart}`);
      if (returnSeconds) {
        const speechStartSeconds = speechStart / this.samplingRate;
        return { start: Number(speechStartSeconds.toFixed(1)) };
      }
      return { start: speechStart };
    }

    if (speechProb < this.endThreshold && this.triggered) {
      console.log(`Potential speech end at sample ${this.currentSample}`);
      // Remember where the silence began; it is only confirmed once it has
      // lasted at least minSilenceSamples.
      if (this.tempEnd === 0) {
        this.tempEnd = this.currentSample;
      }
      if (this.currentSample - this.tempEnd < this.minSilenceSamples) {
        console.log('Silence duration too short, continuing');
        return {};
      }
      const speechEnd = this.tempEnd + this.speechPadSamples;
      console.log(`Speech end confirmed at sample ${speechEnd}`);
      this.tempEnd = 0;
      this.triggered = false;
      if (returnSeconds) {
        const speechEndSeconds = speechEnd / this.samplingRate;
        return { end: Number(speechEndSeconds.toFixed(1)) };
      }
      return { end: speechEnd };
    }

    return {};
  }

  /**
   * Resets the detector and then closes the underlying model.
   *
   * @returns {Promise<void>}
   */
  async close() {
    this.reset();
    await this.model.close();
  }
}