Spaces:

atlury
/

digitalhuman

Running

App Files Files Community

digitalhuman / VoiceActivityDetector.ts

atlury

Upload 6 files

c48a715 verified 5 months ago

raw

history blame

4.44 kB

	import OnnxWrapper from './Silero.ts';

	// Location of the ONNX model file; VadDetector's constructor passes it to OnnxWrapper.
	// NOTE(review): how this relative path is resolved depends on OnnxWrapper — confirm for your deployment.
	const modelPath = "silero_vad.onnx"; // Make sure this path is correct

	/**
	 * Streaming voice-activity detector.
	 *
	 * Audio is fed one window at a time through {@link VadDetector.apply}. Each
	 * window is scored by the wrapped ONNX model and the score drives a small
	 * state machine that reports where speech starts and ends on a running
	 * sample timeline.
	 */
	export class VadDetector {
	  /** Model wrapper that produces a speech probability for one window. */
	  model: OnnxWrapper;
	  /** Probability at or above which speech is considered to have started. */
	  startThreshold: number;
	  /** Probability below which ongoing speech is considered to be ending. */
	  endThreshold: number;
	  /** Sampling rate in Hz; only 8000 and 16000 are accepted. */
	  samplingRate: number;
	  /** Silence length, in samples, required before an end is confirmed. */
	  minSilenceSamples: number;
	  /** Padding, in samples, applied to reported start and end positions. */
	  speechPadSamples: number;
	  /** True while the detector is inside a speech segment. */
	  triggered = false;
	  /** Sample position of a tentative speech end; 0 means "none pending". */
	  tempEnd = 0;
	  /** Total number of input samples received since the last reset. */
	  currentSample = 0;

	  /**
	   * @param startThreshold Probability at or above which speech starts.
	   * @param endThreshold Probability below which speech may be ending.
	   * @param samplingRate Sampling rate in Hz (8000 or 16000).
	   * @param minSilenceDurationMs Silence needed to confirm a speech end, in milliseconds.
	   * @param speechPadMs Padding applied to reported boundaries, in milliseconds.
	   * @throws Error when `samplingRate` is neither 8000 nor 16000.
	   */
	  constructor(
	    startThreshold: number,
	    endThreshold: number,
	    samplingRate: number,
	    minSilenceDurationMs: number,
	    speechPadMs: number,
	  ) {
	    if (samplingRate !== 8000 && samplingRate !== 16000) {
	      throw new Error("Does not support sampling rates other than [8000, 16000]");
	    }

	    this.model = new OnnxWrapper(modelPath);
	    this.startThreshold = startThreshold;
	    this.endThreshold = endThreshold;
	    this.samplingRate = samplingRate;
	    // Convert millisecond durations into sample counts at the chosen rate.
	    this.minSilenceSamples = samplingRate * minSilenceDurationMs / 1000;
	    this.speechPadSamples = samplingRate * speechPadMs / 1000;
	    this.reset();
	    console.log(`VadDetector initialized with: startThreshold=${startThreshold}, endThreshold=${endThreshold}, samplingRate=${samplingRate}`);
	  }

	  /** Clears the model state and rewinds the detector to the start of the timeline. */
	  reset(): void {
	    this.model.resetStates();
	    this.triggered = false;
	    this.tempEnd = 0;
	    this.currentSample = 0;
	    console.log('VadDetector reset');
	  }

	  /**
	   * Scores one window of audio and advances the speech/silence state machine.
	   *
	   * The timeline always advances by `data.length`, even when the window is
	   * padded or truncated to the size passed to the model (512 samples at
	   * 16 kHz, 256 at 8 kHz).
	   *
	   * @param data Audio samples for this window (array or typed array).
	   * @param returnSeconds When true, positions are reported in seconds rounded
	   *   to one decimal place; otherwise as sample indices.
	   * @returns `{ start }` when speech begins, `{ end }` when a speech end is
	   *   confirmed, or `{}` when no boundary was crossed.
	   * @throws Error when the model call fails or returns an unexpected shape;
	   *   the underlying error is available as `cause`.
	   */
	  async apply(
	    data: ArrayLike<number>,
	    returnSeconds = false,
	  ): Promise<{ start?: number; end?: number }> {
	    console.log(`Applying VAD to data of length ${data.length}`);
	    this.currentSample += data.length;

	    const rowLength = this.samplingRate === 16000 ? 512 : 256;
	    const speechProb = await this.inferSpeechProbability(this.toModelWindow(data, rowLength));

	    if (speechProb >= this.startThreshold) {
	      // Speech resumed, so any tentative end is discarded.
	      this.tempEnd = 0;

	      if (!this.triggered) {
	        this.triggered = true;
	        const speechStart = Math.max(this.currentSample - this.speechPadSamples, 0);
	        console.log(`Speech start detected at sample ${speechStart}`);
	        return { start: this.toOutputUnit(speechStart, returnSeconds) };
	      }
	    }

	    if (speechProb < this.endThreshold && this.triggered) {
	      console.log(`Potential speech end at sample ${this.currentSample}`);
	      if (this.tempEnd === 0) {
	        this.tempEnd = this.currentSample;
	      }

	      if (this.currentSample - this.tempEnd < this.minSilenceSamples) {
	        console.log('Silence duration too short, continuing');
	        return {};
	      }

	      const speechEnd = this.tempEnd + this.speechPadSamples;
	      console.log(`Speech end confirmed at sample ${speechEnd}`);
	      this.tempEnd = 0;
	      this.triggered = false;
	      return { end: this.toOutputUnit(speechEnd, returnSeconds) };
	    }

	    return {};
	  }

	  /** Resets the detector and releases the underlying model. */
	  async close(): Promise<void> {
	    this.reset();
	    await this.model.close();
	  }

	  /** Copies `data` into a plain array of exactly `rowLength` samples, zero-padding or truncating as needed. */
	  private toModelWindow(data: ArrayLike<number>, rowLength: number): number[] {
	    if (data.length < rowLength) {
	      console.warn(`Input data length (${data.length}) is less than required (${rowLength}). Padding with zeros.`);
	    } else if (data.length > rowLength) {
	      console.warn(`Input data length (${data.length}) is greater than required (${rowLength}). Truncating.`);
	    }
	    // Indices past the end of `data` read as undefined and become zero padding.
	    return Array.from({ length: rowLength }, (_, i) => data[i] ?? 0);
	  }

	  /** Runs the model on a single window and returns its speech probability. */
	  private async inferSpeechProbability(window: number[]): Promise<number> {
	    const batch = [window];
	    try {
	      console.log(`Calling model with input shape: [${batch.length}, ${window.length}], sample rate: ${this.samplingRate}`);
	      const result: unknown = await this.model.call(batch, this.samplingRate);
	      const speechProb: unknown = Array.isArray(result) && result[0] ? result[0][0] : undefined;
	      if (typeof speechProb !== 'number') {
	        throw new Error("Unexpected response from model");
	      }
	      console.log(`Speech probability: ${speechProb}`);
	      return speechProb;
	    } catch (e: unknown) {
	      console.error("Error in VadDetector.apply:", e);
	      // Keep the original failure reachable instead of flattening it to text only.
	      const wrapped: Error & { cause?: unknown } = new Error(`Error calling the model: ${String(e)}`);
	      wrapped.cause = e;
	      throw wrapped;
	    }
	  }

	  /** Converts a sample position to the unit requested by the caller. */
	  private toOutputUnit(sample: number, returnSeconds: boolean): number {
	    return returnSeconds ? Number((sample / this.samplingRate).toFixed(1)) : sample;
	  }
	}