atlury committed
Commit 16bb6c5 · verified · 1 Parent(s): 75afa3a

Upload 6 files

Files changed (6)
  1. MicrophoneAudio.js +129 -0
  2. Silero.js +165 -0
  3. SpeechChunks.js +144 -0
  4. VoiceActivityDetector.js +129 -0
  5. index.html +85 -19
  6. silero_vad.onnx +3 -0
MicrophoneAudio.js ADDED
@@ -0,0 +1,129 @@
+ interface MicrophoneAudioOptions {
+   sampleRate?: number;
+   channels?: number;
+   windowSizeSamples: number;
+   onAudioData: (audioData: Float32Array) => void;
+ }
+
+ class MicrophoneAudio {
+   private stream: MediaStream | null = null;
+   private audioContext: AudioContext | null = null;
+   private sourceNode: MediaStreamAudioSourceNode | null = null;
+   private workletNode: AudioWorkletNode | null = null;
+   private options: MicrophoneAudioOptions;
+   // Fallback buffer only; the actual windowing happens inside the worklet.
+   private buffer: Float32Array = new Float32Array();
+
+   constructor(options: MicrophoneAudioOptions) {
+     console.log('Initializing MicrophoneAudio');
+     this.options = {
+       sampleRate: 16000,
+       channels: 1,
+       ...options,
+     };
+     console.log(`MicrophoneAudio options: ${JSON.stringify(this.options)}`);
+   }
+
+   getDeviceId(): Promise<string> {
+     console.log('Getting device ID');
+     return navigator.mediaDevices.getUserMedia({ audio: true }).then((stream) => {
+       const deviceId = stream.getTracks()[0].getSettings().deviceId ?? '';
+       // Release the probe stream so the microphone is not held open twice.
+       stream.getTracks().forEach((track) => track.stop());
+       console.log('The device ID is', deviceId);
+       return deviceId;
+     });
+   }
+
+   async start(): Promise<void> {
+     console.log('Starting MicrophoneAudio');
+     try {
+       this.stream = await navigator.mediaDevices.getUserMedia({
+         audio: {
+           sampleRate: this.options.sampleRate,
+           channelCount: this.options.channels,
+         },
+       });
+       console.log('MediaStream acquired');
+
+       this.getDeviceId().then((deviceId) => {
+         console.log('The device ID is', deviceId);
+       });
+
+       this.audioContext = new AudioContext({
+         sampleRate: this.options.sampleRate,
+       });
+
+       // The worklet slices the incoming stream into fixed windows of
+       // windowSizeSamples and posts each window back to the main thread.
+       await this.audioContext.audioWorklet.addModule(
+         URL.createObjectURL(new Blob([`
+           class AudioProcessor extends AudioWorkletProcessor {
+             constructor() {
+               super();
+               this.buffer = new Float32Array();
+               // Flush whatever is still buffered when the main thread asks.
+               this.port.onmessage = (event) => {
+                 if (event.data === 'flush' && this.buffer.length > 0) {
+                   this.port.postMessage(this.buffer);
+                   this.buffer = new Float32Array();
+                 }
+               };
+             }
+
+             process(inputs, outputs, parameters) {
+               const input = inputs[0];
+               const channelData = input[0];
+               if (!channelData) {
+                 return true;
+               }
+
+               this.buffer = Float32Array.from([...this.buffer, ...channelData]);
+
+               while (this.buffer.length >= ${this.options.windowSizeSamples}) {
+                 const chunk = this.buffer.slice(0, ${this.options.windowSizeSamples});
+                 this.port.postMessage(chunk);
+                 this.buffer = this.buffer.slice(${this.options.windowSizeSamples});
+               }
+
+               return true;
+             }
+           }
+
+           registerProcessor('audio-processor', AudioProcessor);
+         `], { type: 'application/javascript' }))
+       );
+
+       this.sourceNode = this.audioContext.createMediaStreamSource(this.stream);
+       this.workletNode = new AudioWorkletNode(this.audioContext, 'audio-processor');
+
+       this.workletNode.port.onmessage = (event) => {
+         this.options.onAudioData(event.data);
+       };
+
+       this.sourceNode.connect(this.workletNode);
+       this.workletNode.connect(this.audioContext.destination);
+       console.log('AudioWorklet added and connected');
+     } catch (error) {
+       console.error('Error starting microphone:', error);
+       throw error;
+     }
+   }
+
+   stop(): void {
+     console.log('Stopping MicrophoneAudio');
+     if (this.workletNode) {
+       this.workletNode.port.postMessage('flush');
+       this.workletNode.disconnect();
+       this.workletNode = null;
+     }
+
+     if (this.sourceNode) {
+       this.sourceNode.disconnect();
+       this.sourceNode = null;
+     }
+
+     if (this.audioContext) {
+       this.audioContext.close();
+       this.audioContext = null;
+     }
+
+     if (this.stream) {
+       this.stream.getTracks().forEach((track) => track.stop());
+       this.stream = null;
+     }
+
+     // Send any remaining data in the fallback buffer
+     if (this.buffer.length > 0) {
+       this.options.onAudioData(this.buffer);
+       this.buffer = new Float32Array();
+     }
+     console.log('MicrophoneAudio stopped');
+   }
+ }
+
+ export default MicrophoneAudio;
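
For reference, a minimal usage sketch of the class above (not part of this commit); it assumes a 16 kHz secure-context page and simply logs each fixed-size window the worklet emits:

// Hypothetical usage sketch, not part of the commit.
import MicrophoneAudio from './MicrophoneAudio';

const mic = new MicrophoneAudio({
  sampleRate: 16000,
  windowSizeSamples: 512, // 32 ms per window at 16 kHz
  onAudioData: (window: Float32Array) => {
    console.log(`Received window of ${window.length} samples`);
  },
});

await mic.start();                  // getUserMedia requires HTTPS or localhost
setTimeout(() => mic.stop(), 5000); // capture for five seconds, then release the device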
Silero.js ADDED
@@ -0,0 +1,165 @@
+ import * as ort from 'onnxruntime-web';
+
+ class OnnxWrapper {
+   private session: ort.InferenceSession;
+   private _state: number[][];
+   private _context: number[];
+   private _last_sr: number;
+   private _last_batch_size: number;
+   private sample_rates: number[];
+   private sessionReady: Promise<void>;
+
+   constructor(path: string, force_onnx_cpu: boolean = true) {
+     console.log(`Initializing OnnxWrapper with path: ${path}`);
+     this.sessionReady = this.initSession(path, force_onnx_cpu);
+     this.resetStates();
+     this.sample_rates = [8000, 16000];
+   }
+
+   async ready(): Promise<void> {
+     console.log('Waiting for OnnxWrapper session to be ready');
+     await this.sessionReady;
+     console.log('OnnxWrapper session is ready');
+   }
+
+   private async initSession(path: string, force_onnx_cpu: boolean) {
+     console.log(`Initializing ONNX session with force_onnx_cpu: ${force_onnx_cpu}`);
+     const options: ort.InferenceSession.SessionOptions = {
+       executionProviders: force_onnx_cpu ? ['wasm'] : ['webgl', 'wasm'],
+       graphOptimizationLevel: 'all',
+       executionMode: 'sequential',
+       enableCpuMemArena: true,
+       enableMemPattern: true,
+       extra: {
+         session: {
+           intra_op_num_threads: 1,
+           inter_op_num_threads: 1,
+         }
+       }
+     };
+
+     this.session = await ort.InferenceSession.create(path, options);
+     console.log('ONNX session created successfully');
+   }
+
+   private _validate_input(x: number[][], sr: number): [number[][], number] {
+     if (!Array.isArray(x[0])) {
+       x = [x as unknown as number[]];
+     }
+     if (x.length > 2) {
+       throw new Error(`Too many dimensions for input audio chunk: ${x.length}`);
+     }
+     // Downsample any multiple of 16000 to 16000 by simple decimation.
+     if (sr !== 16000 && (sr % 16000 === 0)) {
+       const step = Math.floor(sr / 16000);
+       x = x.map(row => row.filter((_, i) => i % step === 0));
+       sr = 16000;
+     }
+     if (!this.sample_rates.includes(sr)) {
+       throw new Error(`Supported sampling rates: ${this.sample_rates} (or a multiple of 16000)`);
+     }
+     if (sr / x[0].length > 31.25) {
+       throw new Error('Input audio chunk is too short');
+     }
+     return [x, sr];
+   }
+
+   resetStates(batch_size: number = 1): void {
+     console.log(`Resetting states with batch_size: ${batch_size}`);
+     this._state = Array(2).fill(0).map(() => Array(batch_size * 128).fill(0));
+     this._context = [];
+     this._last_sr = 0;
+     this._last_batch_size = 0;
+   }
+
+   async call(x: number[][], sr: number): Promise<number[][]> {
+     console.log(`Calling model with input shape: [${x.length}, ${x[0].length}], sample rate: ${sr}`);
+     await this.ready();
+     [x, sr] = this._validate_input(x, sr);
+     const num_samples = sr === 16000 ? 512 : 256;
+
+     if (x[0].length !== num_samples) {
+       throw new Error(`Provided number of samples is ${x[0].length} (supported values: 256 for 8000 Hz, 512 for 16000 Hz)`);
+     }
+
+     const batch_size = x.length;
+     const context_size = sr === 16000 ? 64 : 32;
+
+     if (!this._last_batch_size) {
+       this.resetStates(batch_size);
+     }
+     if (this._last_sr && this._last_sr !== sr) {
+       this.resetStates(batch_size);
+     }
+     if (this._last_batch_size && this._last_batch_size !== batch_size) {
+       this.resetStates(batch_size);
+     }
+     if (this._context.length === 0) {
+       this._context = Array(batch_size * context_size).fill(0);
+     }
+
+     // Prepend the trailing context of the previous window to each row.
+     x = x.map((row, i) => [...this._context.slice(i * context_size, (i + 1) * context_size), ...row]);
+
+     if (sr === 8000 || sr === 16000) {
+       const inputTensor = new ort.Tensor('float32', x.flat(), [batch_size, x[0].length]);
+       const stateTensor = new ort.Tensor('float32', this._state.flat(), [2, batch_size, 128]);
+       // onnxruntime-web expects int64 tensor data as a BigInt64Array.
+       const srTensor = new ort.Tensor('int64', BigInt64Array.from([BigInt(sr)]), []);
+
+       const feeds: Record<string, ort.Tensor> = {
+         input: inputTensor,
+         state: stateTensor,
+         sr: srTensor
+       };
+
+       const results = await this.session.run(feeds);
+       const outputData = results.output.data as Float32Array;
+       const stateData = results.stateN.data as Float32Array;
+
+       this._state = Array(2).fill(0).map((_, i) =>
+         Array.from(stateData.slice(i * batch_size * 128, (i + 1) * batch_size * 128))
+       );
+
+       const outputShape = results.output.dims as number[];
+       const out = Array(outputShape[0]).fill(0).map((_, i) =>
+         Array.from(outputData.slice(i * outputShape[1], (i + 1) * outputShape[1]))
+       );
+
+       this._context = x.map(row => row.slice(-context_size)).flat();
+       this._last_sr = sr;
+       this._last_batch_size = batch_size;
+
+       console.log(`Model call completed, output shape: [${out.length}, ${out[0].length}]`);
+       return out;
+     } else {
+       throw new Error(`Unsupported sample rate: ${sr}. Supported rates are 8000 and 16000.`);
+     }
+   }
+
+   async audio_forward(x: number[][], sr: number): Promise<number[][]> {
+     console.log(`Running audio_forward with input shape: [${x.length}, ${x[0].length}], sample rate: ${sr}`);
+     const outs: number[][][] = [];
+     [x, sr] = this._validate_input(x, sr);
+     this.resetStates();
+     const num_samples = sr === 16000 ? 512 : 256;
+
+     // Zero-pad so the signal divides evenly into model-sized windows.
+     if (x[0].length % num_samples !== 0) {
+       const pad_num = num_samples - (x[0].length % num_samples);
+       x = x.map(row => [...row, ...Array(pad_num).fill(0)]);
+     }
+
+     for (let i = 0; i < x[0].length; i += num_samples) {
+       const wavs_batch = x.map(row => row.slice(i, i + num_samples));
+       const out_chunk = await this.call(wavs_batch, sr);
+       outs.push(out_chunk);
+     }
+
+     console.log(`audio_forward completed over ${outs.length} windows`);
+     // Concatenate the per-window outputs row-wise.
+     return outs.reduce((acc, curr) => acc.map((row, i) => [...row, ...curr[i]]));
+   }
+
+   async close(): Promise<void> {
+     console.log('Closing OnnxWrapper session');
+     await this.session.release();
+   }
+ }
+
+ export default OnnxWrapper;
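
A sketch of exercising OnnxWrapper directly (not part of the commit); it assumes the silero_vad.onnx file from this upload is served next to the page, and that one 512-sample row at 16 kHz yields one speech probability:

// Hypothetical sketch; the relative model path is an assumption.
import OnnxWrapper from './Silero';

const vad = new OnnxWrapper('./silero_vad.onnx');
await vad.ready();

const silence = [Array(512).fill(0)];         // batch of one 512-sample window
const probs = await vad.call(silence, 16000); // shape [1, 1]
console.log(`Speech probability: ${probs[0][0]}`); // close to 0 for silence

await vad.close();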
SpeechChunks.js ADDED
@@ -0,0 +1,144 @@
+ import MicrophoneAudio from './MicrophoneAudio';
+ import { VadDetector } from './VoiceActivityDetector';
+
+ export class SpeechChunks {
+   private static readonly SAMPLE_RATE = 16000;
+   private static readonly START_THRESHOLD = 0.6;
+   private static readonly END_THRESHOLD = 0.45;
+   private static readonly MIN_SILENCE_DURATION_MS = 600;
+   private static readonly SPEECH_PAD_MS = 500;
+   private static readonly WINDOW_SIZE_SAMPLES = 512;
+
+   private chunks: number[][];
+   private microphoneAudio: MicrophoneAudio;
+   private vadDetector: VadDetector;
+   private isSpeechActive: boolean;
+   private onSpeechStart: () => void;
+   private onSpeechEnd: (blob: Blob) => void;
+
+   constructor(onSpeechStart: () => void, onSpeechEnd: (blob: Blob) => void) {
+     this.chunks = [];
+     this.isSpeechActive = false;
+
+     this.microphoneAudio = new MicrophoneAudio({
+       sampleRate: SpeechChunks.SAMPLE_RATE,
+       windowSizeSamples: SpeechChunks.WINDOW_SIZE_SAMPLES,
+       onAudioData: this.processAudioData.bind(this)
+     });
+
+     this.onSpeechStart = onSpeechStart;
+     this.onSpeechEnd = onSpeechEnd;
+
+     this.vadDetector = new VadDetector(
+       SpeechChunks.START_THRESHOLD,
+       SpeechChunks.END_THRESHOLD,
+       SpeechChunks.SAMPLE_RATE,
+       SpeechChunks.MIN_SILENCE_DURATION_MS,
+       SpeechChunks.SPEECH_PAD_MS
+     );
+
+     console.log('SpeechChunks initialized');
+   }
+
+   private async processAudioData(audioData: Float32Array): Promise<void> {
+     console.log(`Processing audio data of length ${audioData.length}`);
+     try {
+       const result = await this.vadDetector.apply(audioData, false);
+       if (result.start !== undefined) {
+         this.isSpeechActive = true;
+         console.log('Speech start detected');
+         this.onSpeechStart();
+       } else if (result.end !== undefined) {
+         this.isSpeechActive = false;
+         console.log('Speech end detected');
+         this.onSpeechEnd(this.getBlob());
+         // Drop the finished utterance so the next blob starts clean.
+         this.chunks = [];
+       }
+       if (this.isSpeechActive) {
+         console.log('Adding chunk to speech');
+         this.chunks.push(Array.from(audioData));
+       }
+     } catch (error) {
+       console.error('Error processing audio data', error);
+     }
+   }
+
+   async start(): Promise<void> {
+     console.log('Starting SpeechChunks');
+     await this.microphoneAudio.start();
+   }
+
+   stop(): void {
+     console.log('Stopping SpeechChunks');
+     this.microphoneAudio.stop();
+     this.vadDetector.reset();
+     this.isSpeechActive = false;
+   }
+
+   getSpeechChunks(): number[][] {
+     console.log(`Returning ${this.chunks.length} speech chunks`);
+     const speechChunks = this.chunks;
+     this.chunks = [];
+     return speechChunks;
+   }
+
+   getBlob(): Blob {
+     console.log('Creating audio blob from speech chunks');
+     // Combine all chunks into a single Float32Array
+     const combinedLength = this.chunks.reduce((sum, chunk) => sum + chunk.length, 0);
+     const combinedAudio = new Float32Array(combinedLength);
+     let offset = 0;
+     for (const chunk of this.chunks) {
+       combinedAudio.set(chunk, offset);
+       offset += chunk.length;
+     }
+
+     // Convert Float32Array to Int16Array (16-bit PCM, the usual WAV payload)
+     const intData = new Int16Array(combinedAudio.length);
+     for (let i = 0; i < combinedAudio.length; i++) {
+       const s = Math.max(-1, Math.min(1, combinedAudio[i]));
+       intData[i] = s < 0 ? s * 0x8000 : s * 0x7FFF;
+     }
+
+     // Create the 44-byte WAV file header
+     const header = new ArrayBuffer(44);
+     const view = new DataView(header);
+
+     // RIFF chunk descriptor
+     this.writeString(view, 0, 'RIFF');
+     view.setUint32(4, 36 + intData.length * 2, true);
+     this.writeString(view, 8, 'WAVE');
+
+     // fmt sub-chunk
+     this.writeString(view, 12, 'fmt ');
+     view.setUint32(16, 16, true);  // subchunk1 size
+     view.setUint16(20, 1, true);   // audio format (1 = PCM)
+     view.setUint16(22, 1, true);   // number of channels
+     view.setUint32(24, SpeechChunks.SAMPLE_RATE, true);     // sample rate
+     view.setUint32(28, SpeechChunks.SAMPLE_RATE * 2, true); // byte rate
+     view.setUint16(32, 2, true);   // block align
+     view.setUint16(34, 16, true);  // bits per sample
+
+     // data sub-chunk
+     this.writeString(view, 36, 'data');
+     view.setUint32(40, intData.length * 2, true);
+
+     // Combine header and data
+     const blob = new Blob([header, intData], { type: 'audio/wav' });
+     console.log(`Created blob of size ${blob.size} bytes`);
+     return blob;
+   }
+
+   // Helper to write ASCII strings into the DataView
+   private writeString(view: DataView, offset: number, string: string): void {
+     for (let i = 0; i < string.length; i++) {
+       view.setUint8(offset + i, string.charCodeAt(i));
+     }
+   }
+
+   async close(): Promise<void> {
+     console.log('Closing SpeechChunks');
+     this.stop();
+     await this.vadDetector.close();
+   }
+ }
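
The 44-byte header written by getBlob follows the canonical PCM WAV layout (mono, 16-bit, 16 kHz, so byte rate = 16000 × 1 × 2 = 32000). A small verification sketch (not part of the commit), assuming a blob produced by getBlob:

// Hypothetical check of the WAV header fields written above.
async function inspectWavHeader(blob: Blob): Promise<void> {
  const view = new DataView(await blob.arrayBuffer());
  const tag = (off: number): string =>
    String.fromCharCode(view.getUint8(off), view.getUint8(off + 1),
                        view.getUint8(off + 2), view.getUint8(off + 3));
  console.log(tag(0), tag(8), tag(12));                      // "RIFF", "WAVE", "fmt "
  console.log('sample rate:', view.getUint32(24, true));     // 16000
  console.log('byte rate:', view.getUint32(28, true));       // 32000
  console.log('bits per sample:', view.getUint16(34, true)); // 16
}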
VoiceActivityDetector.js ADDED
@@ -0,0 +1,129 @@
+ import OnnxWrapper from './Silero'; // Assuming you have this class implemented
+ // Fall back to the model file shipped with this repo when no env override is set.
+ const modelPath = (typeof process !== 'undefined' && process.env.VAD_MODEL_PATH) || './silero_vad.onnx';
+
+ export class VadDetector {
+   private model: OnnxWrapper;
+   private startThreshold: number;
+   private endThreshold: number;
+   private samplingRate: number;
+   private minSilenceSamples: number;
+   private speechPadSamples: number;
+   private triggered: boolean;
+   private tempEnd: number;
+   private currentSample: number;
+
+   constructor(
+     startThreshold: number,
+     endThreshold: number,
+     samplingRate: number,
+     minSilenceDurationMs: number,
+     speechPadMs: number
+   ) {
+     if (samplingRate !== 8000 && samplingRate !== 16000) {
+       throw new Error('Sampling rates other than 8000 and 16000 are not supported');
+     }
+
+     this.model = new OnnxWrapper(modelPath);
+     this.startThreshold = startThreshold;
+     this.endThreshold = endThreshold;
+     this.samplingRate = samplingRate;
+     this.minSilenceSamples = samplingRate * minSilenceDurationMs / 1000;
+     this.speechPadSamples = samplingRate * speechPadMs / 1000;
+     this.reset();
+     console.log(`VadDetector initialized with: startThreshold=${startThreshold}, endThreshold=${endThreshold}, samplingRate=${samplingRate}`);
+   }
+
+   reset(): void {
+     this.model.resetStates();
+     this.triggered = false;
+     this.tempEnd = 0;
+     this.currentSample = 0;
+     console.log('VadDetector reset');
+   }
+
+   async apply(data: Float32Array, returnSeconds: boolean): Promise<{ start?: number; end?: number }> {
+     console.log(`Applying VAD to data of length ${data.length}`);
+     const windowSizeSamples = data.length;
+     this.currentSample += windowSizeSamples;
+
+     // Determine the row length based on the sampling rate
+     const rowLength = this.samplingRate === 16000 ? 512 : 256;
+
+     // Split the window into model-sized rows, zero-padding the last one
+     const numRows = Math.ceil(data.length / rowLength);
+     const x: number[][] = [];
+     for (let i = 0; i < numRows; i++) {
+       const start = i * rowLength;
+       const end = Math.min(start + rowLength, data.length);
+       x.push(Array.from(data.slice(start, end)));
+       if (end - start < rowLength) {
+         x[i] = x[i].concat(new Array(rowLength - (end - start)).fill(0));
+       }
+     }
+
+     let speechProb: number;
+     try {
+       const result = await this.model.call(x, this.samplingRate);
+       if (result && Array.isArray(result) && result[0]) {
+         speechProb = result[0][0];
+         console.log(`Speech probability: ${speechProb}`);
+       } else {
+         throw new Error('Unexpected response from model');
+       }
+     } catch (e) {
+       console.error('Error in VadDetector.apply:', e);
+       throw new Error('Error calling the model: ' + e);
+     }
+
+     // Any probability above the start threshold cancels a pending end.
+     if (speechProb >= this.startThreshold && this.tempEnd !== 0) {
+       this.tempEnd = 0;
+     }
+
+     if (speechProb >= this.startThreshold && !this.triggered) {
+       this.triggered = true;
+       const speechStart = Math.max(this.currentSample - this.speechPadSamples, 0);
+       console.log(`Speech start detected at sample ${speechStart}`);
+       if (returnSeconds) {
+         const speechStartSeconds = speechStart / this.samplingRate;
+         return { start: Number(speechStartSeconds.toFixed(1)) };
+       } else {
+         return { start: speechStart };
+       }
+     }
+
+     if (speechProb < this.endThreshold && this.triggered) {
+       console.log(`Potential speech end at sample ${this.currentSample}`);
+       if (this.tempEnd === 0) {
+         this.tempEnd = this.currentSample;
+       }
+
+       // Only confirm the end after minSilenceSamples of sustained silence.
+       if (this.currentSample - this.tempEnd < this.minSilenceSamples) {
+         console.log('Silence duration too short, continuing');
+         return {};
+       } else {
+         const speechEnd = this.tempEnd + this.speechPadSamples;
+         console.log(`Speech end confirmed at sample ${speechEnd}`);
+         this.tempEnd = 0;
+         this.triggered = false;
+
+         if (returnSeconds) {
+           const speechEndSeconds = speechEnd / this.samplingRate;
+           return { end: Number(speechEndSeconds.toFixed(1)) };
+         } else {
+           return { end: speechEnd };
+         }
+       }
+     }
+
+     return {};
+   }
+
+   async close(): Promise<void> {
+     this.reset();
+     await this.model.close();
+   }
+ }
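
The detector implements simple hysteresis: a probability at or above startThreshold opens an utterance, and an end is only confirmed once the probability has stayed below endThreshold for minSilenceDurationMs. A usage sketch over pre-chunked windows (not part of the commit):

// Hypothetical driver; `windows` stands in for any source of 512-sample chunks at 16 kHz.
import { VadDetector } from './VoiceActivityDetector';

async function scan(windows: Float32Array[]): Promise<void> {
  const vad = new VadDetector(0.6, 0.45, 16000, 600, 500);
  for (const w of windows) {
    const event = await vad.apply(w, true); // true => timestamps in seconds
    if (event.start !== undefined) console.log(`speech starts at ${event.start}s`);
    if (event.end !== undefined) console.log(`speech ends at ${event.end}s`);
  }
  await vad.close();
}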
index.html CHANGED
@@ -1,19 +1,85 @@
- <!doctype html>
- <html>
-   <head>
-     <meta charset="utf-8" />
-     <meta name="viewport" content="width=device-width" />
-     <title>My static Space</title>
-     <link rel="stylesheet" href="style.css" />
-   </head>
-   <body>
-     <div class="card">
-       <h1>Welcome to your static Space!</h1>
-       <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
-       <p>
-         Also don't forget to check the
-         <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
-       </p>
-     </div>
-   </body>
- </html>
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+     <meta charset="UTF-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0">
+     <title>Voice Activity Detection Demo</title>
+     <script src="https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/ort.min.js"></script>
+     <style>
+         body {
+             font-family: Arial, sans-serif;
+             max-width: 800px;
+             margin: 0 auto;
+             padding: 20px;
+         }
+         #status {
+             font-weight: bold;
+             margin-bottom: 10px;
+         }
+         #audioList {
+             margin-top: 20px;
+         }
+     </style>
+ </head>
+ <body>
+     <h1>Voice Activity Detection Demo</h1>
+     <div id="status">Not listening</div>
+     <button id="startButton">Start Listening</button>
+     <button id="stopButton" disabled>Stop Listening</button>
+     <div id="audioList"></div>
+
+     <script type="module">
+         // Template placeholders below are filled with the bundled module
+         // sources when this page is generated.
+         {speech_chunks_js}
+         {microphone_audio_js}
+         {silero_js}
+         {voice_activity_detector_js}
+
+         const status = document.getElementById('status');
+         const startButton = document.getElementById('startButton');
+         const stopButton = document.getElementById('stopButton');
+         const audioList = document.getElementById('audioList');
+
+         let speechChunks;
+
+         startButton.addEventListener('click', async () => {
+             speechChunks = new SpeechChunks(
+                 () => {
+                     console.log("Speech start");
+                     status.textContent = "Listening...";
+                 },
+                 (blob) => {
+                     console.log("Speech end");
+                     status.textContent = "Not listening";
+                     const audio = new Audio(URL.createObjectURL(blob));
+                     const listItem = document.createElement('div');
+                     listItem.appendChild(audio);
+                     const playButton = document.createElement('button');
+                     playButton.textContent = 'Play';
+                     playButton.onclick = () => audio.play();
+                     listItem.appendChild(playButton);
+                     audioList.appendChild(listItem);
+                 }
+             );
+
+             try {
+                 await speechChunks.start();
+                 startButton.disabled = true;
+                 stopButton.disabled = false;
+                 status.textContent = "Listening...";
+             } catch (error) {
+                 console.error("Failed to start VAD:", error);
+                 status.textContent = "Error starting VAD";
+             }
+         });
+
+         stopButton.addEventListener('click', () => {
+             if (speechChunks) {
+                 speechChunks.stop();
+                 startButton.disabled = false;
+                 stopButton.disabled = true;
+                 status.textContent = "Not listening";
+             }
+         });
+     </script>
+ </body>
+ </html>
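
One deployment note: onnxruntime-web fetches its WebAssembly backend as separate .wasm files at runtime. If the page serves them from a non-default location, they can be pointed at explicitly before the first session is created; the CDN directory below is an assumption and should match the ort.min.js version loaded above:

// Optional, hedged: override where onnxruntime-web looks for its .wasm binaries.
ort.env.wasm.wasmPaths = 'https://cdn.jsdelivr.net/npm/onnxruntime-web/dist/';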
silero_vad.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2623a2953f6ff3d2c1e61740c6cdb7168133479b267dfef114a4a3cc5bdd788f
+ size 2327524