// digitalhuman / VoiceActivityDetector.ts
// atlury's picture
// Upload 6 files
// c48a715 verified
// raw
// history blame
// 4.44 kB
import OnnxWrapper from './Silero.ts';
// Path of the ONNX model file that VadDetector passes to the OnnxWrapper constructor.
// Make sure this path is correct for the environment the detector runs in.
const modelPath = "silero_vad.onnx";
/**
 * Streaming voice-activity detector driven by an ONNX speech-probability model.
 *
 * Audio is fed one frame at a time through {@link VadDetector.apply}. The detector keeps a
 * running sample counter and a two-threshold (hysteresis) state machine:
 *
 * - a frame whose probability reaches `startThreshold` while idle opens a speech segment;
 * - a frame whose probability drops below `endThreshold` while in speech starts a silence
 *   timer, and the segment is closed once that silence has lasted `minSilenceSamples`.
 *
 * Fields are declared explicitly so the class type-checks; they are intentionally left
 * public and mutable, exactly as they were when they were created by plain assignment.
 */
export class VadDetector {
  /** Wrapper around the ONNX model; owns the model's recurrent state. */
  model: OnnxWrapper;
  /** Probability at or above which a frame counts as speech. */
  startThreshold: number;
  /** Probability below which a frame counts as silence while a segment is open. */
  endThreshold: number;
  /** Sampling rate in Hz; only 8000 and 16000 are accepted. */
  samplingRate: number;
  /** Silence length, in samples, required before a segment is closed. */
  minSilenceSamples: number;
  /** Padding, in samples, subtracted from reported starts and added to reported ends. */
  speechPadSamples: number;
  /** True while a speech segment is open. */
  triggered = false;
  /** Sample counter value at the first silent frame of a candidate end; 0 means "none". */
  tempEnd = 0;
  /** Total number of samples received so far (including the frame being processed). */
  currentSample = 0;

  /**
   * @param startThreshold Probability at or above which speech starts.
   * @param endThreshold Probability below which speech may end.
   * @param samplingRate Audio sampling rate in Hz (8000 or 16000).
   * @param minSilenceDurationMs Silence duration, in milliseconds, that confirms a speech end.
   * @param speechPadMs Padding, in milliseconds, applied to reported start/end positions.
   * @throws Error if `samplingRate` is neither 8000 nor 16000.
   */
  constructor(
    startThreshold: number,
    endThreshold: number,
    samplingRate: number,
    minSilenceDurationMs: number,
    speechPadMs: number,
  ) {
    // Validate before constructing the model so a bad rate never allocates one.
    if (samplingRate !== 8000 && samplingRate !== 16000) {
      throw new Error("Does not support sampling rates other than [8000, 16000]");
    }
    this.model = new OnnxWrapper(modelPath);
    this.startThreshold = startThreshold;
    this.endThreshold = endThreshold;
    this.samplingRate = samplingRate;
    // Convert millisecond settings to sample counts once, up front.
    this.minSilenceSamples = (samplingRate * minSilenceDurationMs) / 1000;
    this.speechPadSamples = (samplingRate * speechPadMs) / 1000;
    this.reset();
    console.log(`VadDetector initialized with: startThreshold=${startThreshold}, endThreshold=${endThreshold}, samplingRate=${samplingRate}`);
  }

  /** Clears the model state and returns the detector to idle at sample 0. */
  reset(): void {
    this.model.resetStates();
    this.triggered = false;
    this.tempEnd = 0;
    this.currentSample = 0;
    console.log('VadDetector reset');
  }

  /**
   * Processes one frame of audio and reports a segment boundary if one occurred.
   *
   * The model is always given exactly 512 samples at 16 kHz or 256 samples at 8 kHz:
   * shorter input is zero-padded and longer input is truncated (a warning is logged in
   * both cases). The sample counter, however, advances by the length of `data` as passed.
   *
   * Reported positions are derived from the sample counter *after* the current frame has
   * been added: a start is `currentSample - speechPadSamples` clamped at 0, and an end is
   * the counter value at the first silent frame plus `speechPadSamples`.
   *
   * @param data Audio samples for this frame.
   * @param returnSeconds When true, positions are returned in seconds rounded to one
   *   decimal place instead of sample indices.
   * @returns `{ start }` when speech begins, `{ end }` when a speech end is confirmed,
   *   otherwise an empty object.
   * @throws Error if the model call fails or returns an unexpected shape.
   */
  async apply(
    data: ArrayLike<number>,
    returnSeconds = false,
  ): Promise<{ start?: number; end?: number }> {
    console.log(`Applying VAD to data of length ${data.length}`);
    this.currentSample += data.length;

    const frameLength = this.samplingRate === 16000 ? 512 : 256;
    if (data.length < frameLength) {
      console.warn(`Input data length (${data.length}) is less than required (${frameLength}). Padding with zeros.`);
    } else if (data.length > frameLength) {
      console.warn(`Input data length (${data.length}) is greater than required (${frameLength}). Truncating.`);
    }
    // Copy into a plain array of exactly frameLength samples; indices past the end of
    // `data` read as undefined and become the zero padding.
    const frame = Array.from({ length: frameLength }, (_, i) => data[i] ?? 0);

    const speechProb = await this.inferSpeechProbability(frame);

    // Speech resumed before the silence timer expired: cancel the pending end.
    if (speechProb >= this.startThreshold && this.tempEnd !== 0) {
      this.tempEnd = 0;
    }

    if (speechProb >= this.startThreshold && !this.triggered) {
      this.triggered = true;
      const speechStart = Math.max(this.currentSample - this.speechPadSamples, 0);
      console.log(`Speech start detected at sample ${speechStart}`);
      return { start: returnSeconds ? this.toSeconds(speechStart) : speechStart };
    }

    if (speechProb < this.endThreshold && this.triggered) {
      console.log(`Potential speech end at sample ${this.currentSample}`);
      if (this.tempEnd === 0) {
        // First silent frame of this candidate end: start the silence timer here.
        this.tempEnd = this.currentSample;
      }
      if (this.currentSample - this.tempEnd < this.minSilenceSamples) {
        console.log('Silence duration too short, continuing');
        return {};
      }
      const speechEnd = this.tempEnd + this.speechPadSamples;
      console.log(`Speech end confirmed at sample ${speechEnd}`);
      this.tempEnd = 0;
      this.triggered = false;
      return { end: returnSeconds ? this.toSeconds(speechEnd) : speechEnd };
    }

    return {};
  }

  /** Resets the detector and releases the underlying model. */
  async close(): Promise<void> {
    this.reset();
    await this.model.close();
  }

  /** Runs the model on a single frame and extracts the speech probability at `[0][0]`. */
  private async inferSpeechProbability(frame: number[]): Promise<number> {
    const x = [frame];
    try {
      console.log(`Calling model with input shape: [${x.length}, ${frame.length}], sample rate: ${this.samplingRate}`);
      const result: unknown = await this.model.call(x, this.samplingRate);
      if (Array.isArray(result) && result[0] && result[0][0] !== undefined) {
        const speechProb: number = result[0][0];
        console.log(`Speech probability: ${speechProb}`);
        return speechProb;
      }
      throw new Error("Unexpected response from model");
    } catch (e: unknown) {
      // Both model failures and the shape check above are reported through one wrapper error.
      console.error("Error in VadDetector.apply:", e);
      throw new Error("Error calling the model: " + String(e));
    }
  }

  /** Converts a sample position to seconds, rounded to one decimal place. */
  private toSeconds(samples: number): number {
    return Number((samples / this.samplingRate).toFixed(1));
  }
}