Spaces:
Running
Running
import MicrophoneAudio from './MicrophoneAudio'; | |
import { VadDetector } from './VoiceActivityDetector'; | |
export class SpeechChunks { | |
private static readonly SAMPLE_RATE = 16000; | |
private static readonly START_THRESHOLD = 0.6; | |
private static readonly END_THRESHOLD = 0.45; | |
private static readonly MIN_SILENCE_DURATION_MS = 600; | |
private static readonly SPEECH_PAD_MS = 500; | |
private static readonly WINDOW_SIZE_SAMPLES = 512; | |
private chunks: number[][]; | |
private microphoneAudio: MicrophoneAudio; | |
private vadDetector: VadDetector; | |
private isSpeechActive: boolean; | |
private onSpeechStart: () => void; | |
private onSpeechEnd: (blob: Blob) => void; | |
constructor(onSpeechStart, onSpeechEnd) { | |
this.chunks = []; | |
this.isSpeechActive = false; | |
this.microphoneAudio = new MicrophoneAudio({ | |
sampleRate: SpeechChunks.SAMPLE_RATE, | |
windowSizeSamples: SpeechChunks.WINDOW_SIZE_SAMPLES, | |
onAudioData: this.processAudioData.bind(this) | |
}); | |
this.onSpeechStart = onSpeechStart; | |
this.onSpeechEnd = onSpeechEnd; | |
this.vadDetector = new VadDetector( | |
SpeechChunks.START_THRESHOLD, | |
SpeechChunks.END_THRESHOLD, | |
SpeechChunks.SAMPLE_RATE, | |
SpeechChunks.MIN_SILENCE_DURATION_MS, | |
SpeechChunks.SPEECH_PAD_MS | |
); | |
console.log('SpeechChunks initialized'); | |
} | |
private async processAudioData(audioData: Float32Array): Promise<void> { | |
console.log(`Processing audio data of length ${audioData.length}`); | |
try { | |
const result = await this.vadDetector.apply(audioData, false); | |
if (result.start !== undefined) { | |
this.isSpeechActive = true; | |
console.log('Speech start detected'); | |
this.onSpeechStart(); | |
} else if (result.end !== undefined) { | |
this.isSpeechActive = false; | |
console.log('Speech end detected'); | |
this.onSpeechEnd(this.getBlob()); | |
} | |
if (this.isSpeechActive) { | |
console.log('Adding chunk to speech'); | |
this.chunks.push(Array.from(audioData)); | |
} | |
} catch (error) { | |
console.error('Error processing audio data', error); | |
} | |
} | |
async start(): Promise<void> { | |
console.log('Starting SpeechChunks'); | |
await this.microphoneAudio.start(); | |
} | |
stop(): void { | |
console.log('Stopping SpeechChunks'); | |
this.microphoneAudio.stop(); | |
this.vadDetector.reset(); | |
this.isSpeechActive = false; | |
} | |
getSpeechChunks(): number[][] { | |
console.log(`Returning ${this.chunks.length} speech chunks`); | |
const speechChunks = this.chunks; | |
this.chunks = []; | |
return speechChunks; | |
} | |
getBlob(): Blob { | |
console.log('Creating audio blob from speech chunks'); | |
// Combine all chunks into a single Float32Array | |
const combinedChunks = this.chunks; | |
const combinedLength = combinedChunks.reduce((sum, chunk) => sum + chunk.length, 0); | |
const combinedAudio = new Float32Array(combinedLength); | |
let offset = 0; | |
for (const chunk of combinedChunks) { | |
combinedAudio.set(chunk, offset); | |
offset += chunk.length; | |
} | |
// Convert Float32Array to Int16Array (common format for WAV files) | |
const intData = new Int16Array(combinedAudio.length); | |
for (let i = 0; i < combinedAudio.length; i++) { | |
const s = Math.max(-1, Math.min(1, combinedAudio[i])); | |
intData[i] = s < 0 ? s * 0x8000 : s * 0x7FFF; | |
} | |
// Create WAV file header | |
const header = new ArrayBuffer(44); | |
const view = new DataView(header); | |
// RIFF chunk descriptor | |
this.writeString(view, 0, 'RIFF'); | |
view.setUint32(4, 36 + intData.length * 2, true); | |
this.writeString(view, 8, 'WAVE'); | |
// FMT sub-chunk | |
this.writeString(view, 12, 'fmt '); | |
view.setUint32(16, 16, true); // subchunk1size | |
view.setUint16(20, 1, true); // audio format (1 for PCM) | |
view.setUint16(22, 1, true); // num of channels | |
view.setUint32(24, SpeechChunks.SAMPLE_RATE, true); // sample rate | |
view.setUint32(28, SpeechChunks.SAMPLE_RATE * 2, true); // byte rate | |
view.setUint16(32, 2, true); // block align | |
view.setUint16(34, 16, true); // bits per sample | |
// Data sub-chunk | |
this.writeString(view, 36, 'data'); | |
view.setUint32(40, intData.length * 2, true); | |
// Combine header and data | |
const blob = new Blob([header, intData], { type: 'audio/wav' }); | |
console.log(`Created blob of size ${blob.size} bytes`); | |
return blob; | |
} | |
// Helper function to write strings to DataView | |
private writeString(view: DataView, offset: number, string: string): void { | |
for (let i = 0; i < string.length; i++) { | |
view.setUint8(offset + i, string.charCodeAt(i)); | |
} | |
} | |
async close(): Promise<void> { | |
console.log('Closing SpeechChunks'); | |
this.stop(); | |
await this.vadDetector.close(); | |
} | |
} |