Upload 6 files
- MicrophoneAudio.js +129 -0
- Silero.js +165 -0
- SpeechChunks.js +144 -0
- VoiceActivityDetector.js +129 -0
- index.html +85 -19
- silero_vad.onnx +3 -0
MicrophoneAudio.js
ADDED
@@ -0,0 +1,129 @@
interface MicrophoneAudioOptions {
  sampleRate?: number;
  channels?: number;
  windowSizeSamples: number;
  onAudioData: (audioData: Float32Array) => void;
}

class MicrophoneAudio {
  private stream: MediaStream | null = null;
  private audioContext: AudioContext | null = null;
  private sourceNode: MediaStreamAudioSourceNode | null = null;
  private workletNode: AudioWorkletNode | null = null;
  private options: MicrophoneAudioOptions;
  private buffer: Float32Array = new Float32Array();

  constructor(options: MicrophoneAudioOptions) {
    console.log('Initializing MicrophoneAudio');
    this.options = {
      sampleRate: 16000,
      channels: 1,
      ...options,
    };
    console.log(`MicrophoneAudio options: ${JSON.stringify(this.options)}`);
  }

  getDeviceId(): Promise<string> {
    console.log('Getting device ID');
    return navigator.mediaDevices.getUserMedia({ audio: true }).then((stream) => {
      const deviceId = stream.getTracks()[0].getSettings().deviceId;
      console.log("The device Id is", deviceId);
      return deviceId;
    });
  }

  async start(): Promise<void> {
    console.log('Starting MicrophoneAudio');
    try {
      this.stream = await navigator.mediaDevices.getUserMedia({
        audio: {
          sampleRate: this.options.sampleRate,
          channelCount: this.options.channels,
        },
      });
      console.log('MediaStream acquired');

      this.getDeviceId().then((deviceId) => {
        console.log("The device Id is", deviceId);
      });
      this.audioContext = new AudioContext({
        sampleRate: this.options.sampleRate,
      });

      await this.audioContext.audioWorklet.addModule(
        URL.createObjectURL(new Blob([`
          class AudioProcessor extends AudioWorkletProcessor {
            constructor() {
              super();
              this.buffer = new Float32Array();
            }

            process(inputs, outputs, parameters) {
              const input = inputs[0];
              const channelData = input[0];

              this.buffer = Float32Array.from([...this.buffer, ...channelData]);

              while (this.buffer.length >= ${this.options.windowSizeSamples}) {
                const chunk = this.buffer.slice(0, ${this.options.windowSizeSamples});
                this.port.postMessage(chunk);
                this.buffer = this.buffer.slice(${this.options.windowSizeSamples});
              }

              return true;
            }
          }

          registerProcessor('audio-processor', AudioProcessor);
        `], { type: 'application/javascript' }))
      );

      this.sourceNode = this.audioContext.createMediaStreamSource(this.stream);
      this.workletNode = new AudioWorkletNode(this.audioContext, 'audio-processor');

      this.workletNode.port.onmessage = (event) => {
        this.options.onAudioData(event.data);
      };

      this.sourceNode.connect(this.workletNode);
      this.workletNode.connect(this.audioContext.destination);
      console.log('AudioWorklet added and connected');
    } catch (error) {
      console.error('Error starting microphone:', error);
      throw error;
    }
  }

  stop(): void {
    console.log('Stopping MicrophoneAudio');
    if (this.workletNode) {
      this.workletNode.port.postMessage('flush');
      this.workletNode.disconnect();
      this.workletNode = null;
    }

    if (this.sourceNode) {
      this.sourceNode.disconnect();
      this.sourceNode = null;
    }

    if (this.audioContext) {
      this.audioContext.close();
      this.audioContext = null;
    }

    if (this.stream) {
      this.stream.getTracks().forEach((track) => track.stop());
      this.stream = null;
    }

    // Send any remaining data in the buffer
    if (this.buffer.length > 0) {
      this.options.onAudioData(this.buffer);
      this.buffer = new Float32Array();
    }
    console.log('MicrophoneAudio stopped');
  }
}

export default MicrophoneAudio;
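
A minimal usage sketch for the class above (not part of the commit; the 512-sample window matches the WINDOW_SIZE_SAMPLES constant used by SpeechChunks later in this commit, and the import path assumes a bundler resolving ./MicrophoneAudio):

import MicrophoneAudio from './MicrophoneAudio';

// Hypothetical example: log each fixed-size frame the worklet delivers.
const mic = new MicrophoneAudio({
  windowSizeSamples: 512, // one Silero VAD window at 16 kHz
  onAudioData: (frame) => console.log('frame of', frame.length, 'samples'),
});

await mic.start(); // prompts for microphone access and wires up the worklet
// ... later
mic.stop();        // disconnects the worklet and stops the media tracks
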
Silero.js
ADDED
@@ -0,0 +1,165 @@
import * as ort from 'onnxruntime-web';

class OnnxWrapper {
  private session: ort.InferenceSession;
  private _state: number[][];
  private _context: number[];
  private _last_sr: number;
  private _last_batch_size: number;
  private sample_rates: number[];
  private sessionReady: Promise<void>;

  constructor(path: string, force_onnx_cpu: boolean = true) {
    console.log(`Initializing OnnxWrapper with path: ${path}`);
    this.sessionReady = this.initSession(path, force_onnx_cpu);
    this.resetStates();
    this.sample_rates = [8000, 16000];
  }

  async ready(): Promise<void> {
    console.log('Waiting for OnnxWrapper session to be ready');
    await this.sessionReady;
    console.log('OnnxWrapper session is ready');
  }

  private async initSession(path: string, force_onnx_cpu: boolean) {
    console.log(`Initializing ONNX session with force_onnx_cpu: ${force_onnx_cpu}`);
    const options: ort.InferenceSession.SessionOptions = {
      executionProviders: force_onnx_cpu ? ['wasm'] : ['webgl', 'wasm'],
      graphOptimizationLevel: 'all',
      executionMode: 'sequential',
      enableCpuMemArena: true,
      enableMemPattern: true,
      extra: {
        session: {
          intra_op_num_threads: 1,
          inter_op_num_threads: 1,
        }
      }
    };

    this.session = await ort.InferenceSession.create(path, options);
    console.log('ONNX session created successfully');
  }

  private _validate_input(x: number[][], sr: number): [number[][], number] {
    if (!Array.isArray(x[0])) {
      x = [x as unknown as number[]];
    }
    if (x.length > 2) {
      throw new Error(`Too many dimensions for input audio chunk ${x.length}`);
    }
    if (sr !== 16000 && (sr % 16000 === 0)) {
      const step = Math.floor(sr / 16000);
      x = x.map(row => row.filter((_, i) => i % step === 0));
      sr = 16000;
    }
    if (!this.sample_rates.includes(sr)) {
      throw new Error(`Supported sampling rates: ${this.sample_rates} (or multiple of 16000)`);
    }
    if (sr / x[0].length > 31.25) {
      throw new Error("Input audio chunk is too short");
    }
    return [x, sr];
  }

  resetStates(batch_size: number = 1): void {
    console.log(`Resetting states with batch_size: ${batch_size}`);
    this._state = Array(2).fill(0).map(() => Array(batch_size * 128).fill(0));
    this._context = [];
    this._last_sr = 0;
    this._last_batch_size = 0;
  }

  async call(x: number[][], sr: number): Promise<number[][]> {
    console.log(`Calling model with input shape: [${x.length}, ${x[0].length}], sample rate: ${sr}`);
    await this.ready();
    [x, sr] = this._validate_input(x, sr);
    const num_samples = sr === 16000 ? 512 : 256;

    if (x[0].length !== num_samples) {
      throw new Error(`Provided number of samples is ${x[0].length} (Supported values: 256 for 8000 sample rate, 512 for 16000)`);
    }

    const batch_size = x.length;
    const context_size = sr === 16000 ? 64 : 32;

    if (!this._last_batch_size) {
      this.resetStates(batch_size);
    }
    if (this._last_sr && this._last_sr !== sr) {
      this.resetStates(batch_size);
    }
    if (this._last_batch_size && this._last_batch_size !== batch_size) {
      this.resetStates(batch_size);
    }
    if (this._context.length === 0) {
      this._context = Array(batch_size * context_size).fill(0);
    }

    x = x.map((row, i) => [...this._context.slice(i * context_size, (i + 1) * context_size), ...row]);

    if (sr === 8000 || sr === 16000) {
      const inputTensor = new ort.Tensor('float32', x.flat(), [batch_size, x[0].length]);
      const stateTensor = new ort.Tensor('float32', this._state.flat(), [2, batch_size, 128]);
      const srTensor = new ort.Tensor('int64', [sr], []);

      const feeds: Record<string, ort.Tensor> = {
        input: inputTensor,
        state: stateTensor,
        sr: srTensor
      };

      const results = await this.session.run(feeds);
      const outputData = results.output.data as Float32Array;
      const stateData = results.stateN.data as Float32Array;

      this._state = Array(2).fill(0).map((_, i) =>
        Array.from(stateData.slice(i * batch_size * 128, (i + 1) * batch_size * 128))
      );

      const outputShape = results.output.dims as number[];
      const out = Array(outputShape[0]).fill(0).map((_, i) =>
        Array.from(outputData.slice(i * outputShape[1], (i + 1) * outputShape[1]))
      );

      this._context = x.map(row => row.slice(-context_size)).flat();
      this._last_sr = sr;
      this._last_batch_size = batch_size;

      console.log(`Model call completed, output shape: [${out.length}, ${out[0].length}]`);
      return out;
    } else {
      throw new Error(`Unsupported sample rate: ${sr}. Supported rates are 8000 and 16000.`);
    }
  }

  async audio_forward(x: number[][], sr: number): Promise<number[][]> {
    console.log(`Running audio_forward with input shape: [${x.length}, ${x[0].length}], sample rate: ${sr}`);
    const outs: number[][][] = [];
    [x, sr] = this._validate_input(x, sr);
    this.resetStates();
    const num_samples = sr === 16000 ? 512 : 256;

    if (x[0].length % num_samples !== 0) {
      const pad_num = num_samples - (x[0].length % num_samples);
      x = x.map(row => [...row, ...Array(pad_num).fill(0)]);
    }

    for (let i = 0; i < x[0].length; i += num_samples) {
      const wavs_batch = x.map(row => row.slice(i, i + num_samples));
      const out_chunk = await this.call(wavs_batch, sr);
      outs.push(out_chunk);
    }

    console.log(`audio_forward completed, output shape: [${outs.length}, ${outs[0].length}]`);
    return outs.reduce((acc, curr) => acc.map((row, i) => [...row, ...curr[i]]));
  }

  close(): void {
    console.log('Closing OnnxWrapper session');
    this.session.release();
  }
}

export default OnnxWrapper;
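
A minimal sketch of driving OnnxWrapper directly, assuming the silero_vad.onnx file added in this commit is served at the site root (the model URL and the zero-filled frame are placeholders, not part of the Space):

import OnnxWrapper from './Silero';

// Hypothetical example: score a single 512-sample frame of 16 kHz audio.
const vad = new OnnxWrapper('/silero_vad.onnx'); // assumed model URL
const frame: number[] = new Array(512).fill(0);  // stand-in for real samples
const probs = await vad.call([frame], 16000);    // [batch][1] speech probabilities
console.log('speech probability:', probs[0][0]);
vad.close();
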
SpeechChunks.js
ADDED
@@ -0,0 +1,144 @@
import MicrophoneAudio from './MicrophoneAudio';
import { VadDetector } from './VoiceActivityDetector';

export class SpeechChunks {
  private static readonly SAMPLE_RATE = 16000;
  private static readonly START_THRESHOLD = 0.6;
  private static readonly END_THRESHOLD = 0.45;
  private static readonly MIN_SILENCE_DURATION_MS = 600;
  private static readonly SPEECH_PAD_MS = 500;
  private static readonly WINDOW_SIZE_SAMPLES = 512;

  private chunks: number[][];
  private microphoneAudio: MicrophoneAudio;
  private vadDetector: VadDetector;
  private isSpeechActive: boolean;
  private onSpeechStart: () => void;
  private onSpeechEnd: (blob: Blob) => void;

  constructor(onSpeechStart: () => void, onSpeechEnd: (blob: Blob) => void) {
    this.chunks = [];
    this.isSpeechActive = false;

    this.microphoneAudio = new MicrophoneAudio({
      sampleRate: SpeechChunks.SAMPLE_RATE,
      windowSizeSamples: SpeechChunks.WINDOW_SIZE_SAMPLES,
      onAudioData: this.processAudioData.bind(this)
    });

    this.onSpeechStart = onSpeechStart;
    this.onSpeechEnd = onSpeechEnd;

    this.vadDetector = new VadDetector(
      SpeechChunks.START_THRESHOLD,
      SpeechChunks.END_THRESHOLD,
      SpeechChunks.SAMPLE_RATE,
      SpeechChunks.MIN_SILENCE_DURATION_MS,
      SpeechChunks.SPEECH_PAD_MS
    );

    console.log('SpeechChunks initialized');
  }

  private async processAudioData(audioData: Float32Array): Promise<void> {
    console.log(`Processing audio data of length ${audioData.length}`);
    try {
      const result = await this.vadDetector.apply(audioData, false);
      if (result.start !== undefined) {
        this.isSpeechActive = true;
        console.log('Speech start detected');
        this.onSpeechStart();
      } else if (result.end !== undefined) {
        this.isSpeechActive = false;
        console.log('Speech end detected');
        this.onSpeechEnd(this.getBlob());
      }
      if (this.isSpeechActive) {
        console.log('Adding chunk to speech');
        this.chunks.push(Array.from(audioData));
      }
    } catch (error) {
      console.error('Error processing audio data', error);
    }
  }

  async start(): Promise<void> {
    console.log('Starting SpeechChunks');
    await this.microphoneAudio.start();
  }

  stop(): void {
    console.log('Stopping SpeechChunks');
    this.microphoneAudio.stop();
    this.vadDetector.reset();
    this.isSpeechActive = false;
  }

  getSpeechChunks(): number[][] {
    console.log(`Returning ${this.chunks.length} speech chunks`);
    const speechChunks = this.chunks;
    this.chunks = [];
    return speechChunks;
  }

  getBlob(): Blob {
    console.log('Creating audio blob from speech chunks');
    // Combine all chunks into a single Float32Array
    const combinedChunks = this.chunks;
    const combinedLength = combinedChunks.reduce((sum, chunk) => sum + chunk.length, 0);
    const combinedAudio = new Float32Array(combinedLength);
    let offset = 0;
    for (const chunk of combinedChunks) {
      combinedAudio.set(chunk, offset);
      offset += chunk.length;
    }

    // Convert Float32Array to Int16Array (common format for WAV files)
    const intData = new Int16Array(combinedAudio.length);
    for (let i = 0; i < combinedAudio.length; i++) {
      const s = Math.max(-1, Math.min(1, combinedAudio[i]));
      intData[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
    }

    // Create WAV file header
    const header = new ArrayBuffer(44);
    const view = new DataView(header);

    // RIFF chunk descriptor
    this.writeString(view, 0, 'RIFF');
    view.setUint32(4, 36 + intData.length * 2, true);
    this.writeString(view, 8, 'WAVE');

    // FMT sub-chunk
    this.writeString(view, 12, 'fmt ');
    view.setUint32(16, 16, true);  // subchunk1size
    view.setUint16(20, 1, true);   // audio format (1 for PCM)
    view.setUint16(22, 1, true);   // num of channels
    view.setUint32(24, SpeechChunks.SAMPLE_RATE, true);     // sample rate
    view.setUint32(28, SpeechChunks.SAMPLE_RATE * 2, true); // byte rate
    view.setUint16(32, 2, true);   // block align
    view.setUint16(34, 16, true);  // bits per sample

    // Data sub-chunk
    this.writeString(view, 36, 'data');
    view.setUint32(40, intData.length * 2, true);

    // Combine header and data
    const blob = new Blob([header, intData], { type: 'audio/wav' });
    console.log(`Created blob of size ${blob.size} bytes`);
    return blob;
  }

  // Helper function to write strings to DataView
  private writeString(view: DataView, offset: number, string: string): void {
    for (let i = 0; i < string.length; i++) {
      view.setUint8(offset + i, string.charCodeAt(i));
    }
  }

  async close(): Promise<void> {
    console.log('Closing SpeechChunks');
    this.stop();
    await this.vadDetector.close();
  }
}
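
The blob returned by getBlob() is a 44-byte PCM WAV header followed by 16-bit mono samples at 16 kHz. A small sketch (an assumption for testing, not part of the Space) to sanity-check that the blob decodes in the browser:

// Hypothetical check: decode the WAV blob and inspect its format.
async function inspectWav(blob: Blob): Promise<void> {
  const ctx = new AudioContext();
  const audioBuffer = await ctx.decodeAudioData(await blob.arrayBuffer());
  console.log('channels:', audioBuffer.numberOfChannels); // expected: 1
  console.log('duration (s):', audioBuffer.duration);
  // Note: decodeAudioData resamples to the AudioContext rate, so
  // audioBuffer.sampleRate reflects the context, not the file's 16 kHz.
  await ctx.close();
}
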
VoiceActivityDetector.js
ADDED
@@ -0,0 +1,129 @@
import OnnxWrapper from './Silero'; // Assuming you have this class implemented
const modelPath = process.env.VAD_MODEL_PATH;

export class VadDetector {
  private model: OnnxWrapper;
  private startThreshold: number;
  private endThreshold: number;
  private samplingRate: number;
  private minSilenceSamples: number;
  private speechPadSamples: number;
  private triggered: boolean;
  private tempEnd: number;
  private currentSample: number;

  constructor(
    startThreshold: number,
    endThreshold: number,
    samplingRate: number,
    minSilenceDurationMs: number,
    speechPadMs: number
  ) {
    if (samplingRate !== 8000 && samplingRate !== 16000) {
      throw new Error("Does not support sampling rates other than [8000, 16000]");
    }

    this.model = new OnnxWrapper(modelPath);
    this.startThreshold = startThreshold;
    this.endThreshold = endThreshold;
    this.samplingRate = samplingRate;
    this.minSilenceSamples = samplingRate * minSilenceDurationMs / 1000;
    this.speechPadSamples = samplingRate * speechPadMs / 1000;
    this.reset();
    console.log(`VadDetector initialized with: startThreshold=${startThreshold}, endThreshold=${endThreshold}, samplingRate=${samplingRate}`);
  }

  reset(): void {
    this.model.resetStates();
    this.triggered = false;
    this.tempEnd = 0;
    this.currentSample = 0;
    console.log('VadDetector reset');
  }

  async apply(data: Float32Array, returnSeconds: boolean): Promise<{ start?: number; end?: number }> {
    console.log(`Applying VAD to data of length ${data.length}`);
    const windowSizeSamples = data.length;
    this.currentSample += windowSizeSamples;

    // Determine the row length based on the sampling rate
    const rowLength = this.samplingRate === 16000 ? 512 : 256;

    // Calculate the number of rows
    const numRows = Math.ceil(data.length / rowLength);

    // Create the 2D array
    const x: number[][] = [];
    for (let i = 0; i < numRows; i++) {
      const start = i * rowLength;
      const end = Math.min(start + rowLength, data.length);
      x.push(Array.from(data.slice(start, end)));

      // If the last row is not full, pad it with zeros
      if (end - start < rowLength) {
        x[i] = x[i].concat(new Array(rowLength - (end - start)).fill(0));
      }
    }

    let speechProb: number;
    try {
      let speechProbPromise = await this.model.call(x, this.samplingRate);
      if (speechProbPromise && Array.isArray(speechProbPromise) && speechProbPromise[0]) {
        speechProb = speechProbPromise[0][0];
        console.log(`Speech probability: ${speechProb}`);
      } else {
        throw new Error("Unexpected response from model");
      }
    } catch (e) {
      console.error("Error in VadDetector.apply:", e);
      throw new Error("Error calling the model: " + e);
    }

    if (speechProb >= this.startThreshold && this.tempEnd !== 0) {
      this.tempEnd = 0;
    }

    if (speechProb >= this.startThreshold && !this.triggered) {
      this.triggered = true;
      let speechStart = Math.max(this.currentSample - this.speechPadSamples, 0);
      console.log(`Speech start detected at sample ${speechStart}`);
      if (returnSeconds) {
        const speechStartSeconds = speechStart / this.samplingRate;
        return { start: Number(speechStartSeconds.toFixed(1)) };
      } else {
        return { start: speechStart };
      }
    }

    if (speechProb < this.endThreshold && this.triggered) {
      console.log(`Potential speech end at sample ${this.currentSample}`);
      if (this.tempEnd === 0) {
        this.tempEnd = this.currentSample;
      }

      if (this.currentSample - this.tempEnd < this.minSilenceSamples) {
        console.log('Silence duration too short, continuing');
        return {};
      } else {
        const speechEnd = this.tempEnd + this.speechPadSamples;
        console.log(`Speech end confirmed at sample ${speechEnd}`);
        this.tempEnd = 0;
        this.triggered = false;

        if (returnSeconds) {
          const speechEndSeconds = speechEnd / this.samplingRate;
          return { end: Number(speechEndSeconds.toFixed(1)) };
        } else {
          return { end: speechEnd };
        }
      }
    }

    return {};
  }

  async close(): Promise<void> {
    this.reset();
    await this.model.close();
  }
}
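
With the constants SpeechChunks passes in (16 kHz, 600 ms minimum silence, 500 ms padding), minSilenceSamples = 16000 * 600 / 1000 = 9600 and speechPadSamples = 16000 * 500 / 1000 = 8000. So the detector opens a segment as soon as a window scores at or above the 0.6 start threshold, reports the start pulled 8000 samples (0.5 s) earlier, only closes the segment after roughly 9600 samples (0.6 s) of windows below the 0.45 end threshold, and reports the end 8000 samples after the first window that fell below that threshold.
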
index.html
CHANGED
@@ -1,19 +1,85 @@
Removed: the previous 19-line stub (line 1: "<!", line 2: "<html>", remaining lines blank).

Added:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Voice Activity Detection Demo</title>
    <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/ort.min.js"></script>
    <style>
        body {{
            font-family: Arial, sans-serif;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
        }}
        #status {{
            font-weight: bold;
            margin-bottom: 10px;
        }}
        #audioList {{
            margin-top: 20px;
        }}
    </style>
</head>
<body>
    <h1>Voice Activity Detection Demo</h1>
    <div id="status">Not listening</div>
    <button id="startButton">Start Listening</button>
    <button id="stopButton" disabled>Stop Listening</button>
    <div id="audioList"></div>

    <script type="module">
        {speech_chunks_js}
        {microphone_audio_js}
        {silero_js}
        {voice_activity_detector_js}

        const status = document.getElementById('status');
        const startButton = document.getElementById('startButton');
        const stopButton = document.getElementById('stopButton');
        const audioList = document.getElementById('audioList');

        let speechChunks;

        startButton.addEventListener('click', async () => {{
            speechChunks = new SpeechChunks(
                () => {{
                    console.log("Speech start");
                    status.textContent = "Listening...";
                }},
                (blob) => {{
                    console.log("Speech end");
                    status.textContent = "Not listening";
                    const audio = new Audio(URL.createObjectURL(blob));
                    const listItem = document.createElement('div');
                    listItem.appendChild(audio);
                    const playButton = document.createElement('button');
                    playButton.textContent = 'Play';
                    playButton.onclick = () => audio.play();
                    listItem.appendChild(playButton);
                    audioList.appendChild(listItem);
                }}
            );

            try {{
                await speechChunks.start();
                startButton.disabled = true;
                stopButton.disabled = false;
                status.textContent = "Listening...";
            }} catch (error) {{
                console.error("Failed to start VAD:", error);
                status.textContent = "Error starting VAD";
            }}
        }});

        stopButton.addEventListener('click', () => {{
            if (speechChunks) {{
                speechChunks.stop();
                startButton.disabled = false;
                stopButton.disabled = true;
                status.textContent = "Not listening";
            }}
        }});
    </script>
</body>
</html>
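
Note: the doubled braces ({{ ... }}) in the CSS and event handlers, together with the {speech_chunks_js}, {microphone_audio_js}, {silero_js}, and {voice_activity_detector_js} placeholders, suggest this page is rendered through a Python-style format string on the server that inlines the four modules above; the braces appear to be escaped so the literal CSS/JS blocks survive the substitution.
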
silero_vad.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2623a2953f6ff3d2c1e61740c6cdb7168133479b267dfef114a4a3cc5bdd788f
size 2327524