import OnnxWrapper from './Silero.ts';

const modelPath = "silero_vad.onnx"; // Make sure this path is correct

/**
 * Build a plain-array frame of exactly `frameLength` samples from `data`.
 * Shorter input is right-padded with zeros, longer input is cut to the first
 * `frameLength` samples; a warning is logged in both cases. The caller's
 * `data` is never modified.
 *
 * @param {ArrayLike<number>} data - Incoming samples (must be iterable and expose `slice`).
 * @param {number} frameLength - Number of samples the frame must contain.
 * @returns {number[]} A new array with `frameLength` entries.
 */
function fitToFrame(data, frameLength) {
  if (data.length < frameLength) {
    console.warn(`Input data length (${data.length}) is less than required (${frameLength}). Padding with zeros.`);
    return [...data, ...new Array(frameLength - data.length).fill(0)];
  }
  if (data.length > frameLength) {
    console.warn(`Input data length (${data.length}) is greater than required (${frameLength}). \nTruncating.`);
    return Array.from(data.slice(0, frameLength));
  }
  return Array.from(data);
}

/**
 * Express a sample index either as-is or as seconds rounded to one decimal.
 *
 * @param {number} sample - Position in samples.
 * @param {number} samplingRate - Samples per second.
 * @param {boolean} returnSeconds - When truthy, convert to seconds.
 * @returns {number} The sample index, or seconds with one decimal place.
 */
function toPosition(sample, samplingRate, returnSeconds) {
  if (!returnSeconds) {
    return sample;
  }
  return Number((sample / samplingRate).toFixed(1));
}

/**
 * Streaming voice-activity detector: feeds fixed-size audio frames to the
 * wrapped model and turns the returned speech probability into speech
 * start / end events using two thresholds and a minimum-silence duration.
 */
export class VadDetector {
  /**
   * @param {number} startThreshold - Probability at or above which speech is considered started.
   * @param {number} endThreshold - Probability below which speech may be ending.
   * @param {number} samplingRate - Must be 8000 or 16000.
   * @param {number} minSilenceDurationMs - Silence that must elapse before an end is confirmed.
   * @param {number} speechPadMs - Padding applied to reported start / end positions.
   * @throws {Error} If `samplingRate` is neither 8000 nor 16000.
   */
  constructor(startThreshold, endThreshold, samplingRate, minSilenceDurationMs, speechPadMs) {
    if (samplingRate !== 8000 && samplingRate !== 16000) {
      throw new Error("Does not support sampling rates other than [8000, 16000]");
    }

    this.model = new OnnxWrapper(modelPath);
    this.startThreshold = startThreshold;
    this.endThreshold = endThreshold;
    this.samplingRate = samplingRate;
    // Durations are converted from milliseconds to sample counts once, up front.
    this.minSilenceSamples = samplingRate * minSilenceDurationMs / 1000;
    this.speechPadSamples = samplingRate * speechPadMs / 1000;
    this.reset();
    console.log(`VadDetector initialized with: startThreshold=${startThreshold}, endThreshold=${endThreshold}, samplingRate=${samplingRate}`);
  }

  /**
   * Reset the model state and the detector's own bookkeeping
   * (trigger flag, pending end marker, running sample counter).
   */
  reset() {
    this.model.resetStates();
    this.triggered = false;
    this.tempEnd = 0;
    this.currentSample = 0;
    console.log('VadDetector reset');
  }

  /**
   * Process one chunk of audio and report a speech boundary if one occurred.
   *
   * The running sample counter advances by the chunk's original length, while
   * the model always receives a frame of 512 samples (16 kHz) or 256 samples
   * (8 kHz), padded or truncated as needed.
   *
   * @param {ArrayLike<number>} data - Audio samples for this chunk.
   * @param {boolean} returnSeconds - Report positions in seconds (one decimal) instead of samples.
   * @returns {Promise<{start?: number, end?: number}>} `{ start }` when speech begins,
   *   `{ end }` when an end is confirmed, otherwise an empty object.
   * @throws {Error} If the model call fails or returns an unexpected shape; the
   *   underlying error is attached as `cause`.
   */
  async apply(data, returnSeconds) {
    console.log(`Applying VAD to data of length ${data.length}`);
    this.currentSample += data.length;

    const rowLength = this.samplingRate === 16000 ? 512 : 256;
    const x = [fitToFrame(data, rowLength)];

    let speechProb;
    try {
      console.log(`Calling model with input shape: [${x.length}, ${x[0].length}], sample rate: ${this.samplingRate}`);
      const result = await this.model.call(x, this.samplingRate);
      if (!Array.isArray(result) || !result[0] || result[0][0] === undefined) {
        throw new Error("Unexpected response from model");
      }
      speechProb = result[0][0];
      console.log(`Speech probability: ${speechProb}`);
    } catch (e) {
      console.error("Error in VadDetector.apply:", e);
      // Keep the original error reachable for callers that inspect the failure.
      throw new Error(`Error calling the model: ${e}`, { cause: e });
    }

    const isSpeech = speechProb >= this.startThreshold;

    // Speech resumed before the silence window elapsed: drop the pending end.
    if (isSpeech && this.tempEnd !== 0) {
      this.tempEnd = 0;
    }

    if (isSpeech && !this.triggered) {
      this.triggered = true;
      const speechStart = Math.max(this.currentSample - this.speechPadSamples, 0);
      console.log(`Speech start detected at sample ${speechStart}`);
      return { start: toPosition(speechStart, this.samplingRate, returnSeconds) };
    }

    if (speechProb < this.endThreshold && this.triggered) {
      console.log(`Potential speech end at sample ${this.currentSample}`);
      // Remember where the silence began; 0 means "no pending end".
      if (this.tempEnd === 0) {
        this.tempEnd = this.currentSample;
      }

      if (this.currentSample - this.tempEnd < this.minSilenceSamples) {
        console.log('Silence duration too short, continuing');
        return {};
      }

      const speechEnd = this.tempEnd + this.speechPadSamples;
      console.log(`Speech end confirmed at sample ${speechEnd}`);
      this.tempEnd = 0;
      this.triggered = false;
      return { end: toPosition(speechEnd, this.samplingRate, returnSeconds) };
    }

    return {};
  }

  /**
   * Reset the detector and release the underlying model.
   *
   * @returns {Promise<void>}
   */
  async close() {
    this.reset();
    await this.model.close();
  }
}