Spaces:

atlury
/

digitalhuman

Running

App Files Files Community

digitalhuman / SpeechChunks.js

atlury's picture

Upload 6 files

16bb6c5 verified 5 months ago

5.41 kB

	import MicrophoneAudio from './MicrophoneAudio';
	import { VadDetector } from './VoiceActivityDetector';

	/**
	 * Collects microphone audio between voice-activity "start" and "end" events
	 * and delivers each finished utterance to the caller as a 16-bit mono PCM
	 * WAV blob.
	 *
	 * Audio frames arrive through the `onAudioData` callback registered with
	 * `MicrophoneAudio`; each frame is passed to `VadDetector.apply`, whose
	 * result decides whether an utterance starts, ends, or continues.
	 */
	export class SpeechChunks {
		// Tuning values handed unchanged to MicrophoneAudio / VadDetector below.
		// NOTE(review): their exact meaning (probability thresholds, padding,
		// silence duration) is defined by those classes — confirm there.
		private static readonly SAMPLE_RATE = 16000;
		private static readonly START_THRESHOLD = 0.6;
		private static readonly END_THRESHOLD = 0.45;
		private static readonly MIN_SILENCE_DURATION_MS = 600;
		private static readonly SPEECH_PAD_MS = 500;
		private static readonly WINDOW_SIZE_SAMPLES = 512;

		/** Size in bytes of the RIFF/WAVE header written by `getBlob`. */
		private static readonly WAV_HEADER_BYTES = 44;
		/** Each sample is stored as a signed 16-bit integer. */
		private static readonly BYTES_PER_SAMPLE = 2;

		/** Frames buffered for the current (or most recently finished) utterance. */
		private chunks: number[][];
		private microphoneAudio: MicrophoneAudio;
		private vadDetector: VadDetector;
		/** True between a detected speech start and the following speech end. */
		private isSpeechActive: boolean;
		private onSpeechStart: () => void;
		private onSpeechEnd: (blob: Blob) => void;

		/**
		 * @param onSpeechStart Invoked when the detector reports a speech start.
		 * @param onSpeechEnd Invoked when the detector reports a speech end, with
		 *   a WAV blob built from the frames buffered for that utterance.
		 */
		constructor(onSpeechStart: () => void, onSpeechEnd: (blob: Blob) => void) {
			this.chunks = [];
			this.isSpeechActive = false;

			this.microphoneAudio = new MicrophoneAudio({
				sampleRate: SpeechChunks.SAMPLE_RATE,
				windowSizeSamples: SpeechChunks.WINDOW_SIZE_SAMPLES,
				onAudioData: this.processAudioData.bind(this)
			});

			this.onSpeechStart = onSpeechStart;
			this.onSpeechEnd = onSpeechEnd;

			this.vadDetector = new VadDetector(
				SpeechChunks.START_THRESHOLD,
				SpeechChunks.END_THRESHOLD,
				SpeechChunks.SAMPLE_RATE,
				SpeechChunks.MIN_SILENCE_DURATION_MS,
				SpeechChunks.SPEECH_PAD_MS
			);

			console.log('SpeechChunks initialized');
		}

		/**
		 * Handles one frame of microphone audio: runs voice-activity detection,
		 * fires the start/end callbacks, and buffers the frame while speech is
		 * active. Errors (including ones thrown by the callbacks) are logged and
		 * not rethrown, so a bad frame does not break the audio pipeline.
		 *
		 * NOTE(review): this method is async; if MicrophoneAudio does not await
		 * it, calls may overlap while `apply` is pending — confirm with caller.
		 *
		 * @param audioData Samples for the frame, as delivered by MicrophoneAudio.
		 */
		private async processAudioData(audioData: Float32Array): Promise<void> {
			console.log(`Processing audio data of length ${audioData.length}`);
			try {
				// NOTE(review): the meaning of the second argument (`false`) is
				// defined by VadDetector.apply — confirm there.
				const result = await this.vadDetector.apply(audioData, false);
				if (result.start !== undefined) {
					// Begin every utterance with an empty buffer. Without this the
					// blob handed to onSpeechEnd would also contain every earlier
					// utterance, and the buffer would grow without bound. The
					// previous utterance stays readable via getBlob() /
					// getSpeechChunks() until this point.
					this.chunks = [];
					this.isSpeechActive = true;
					console.log('Speech start detected');
					this.onSpeechStart();
				} else if (result.end !== undefined) {
					this.isSpeechActive = false;
					console.log('Speech end detected');
					this.onSpeechEnd(this.getBlob());
				}
				// The frame that triggered the start is buffered; the frame that
				// triggered the end is not, because the flag is already cleared.
				if (this.isSpeechActive) {
					console.log('Adding chunk to speech');
					this.chunks.push(Array.from(audioData));
				}
			} catch (error) {
				console.error('Error processing audio data', error);
			}
		}

		/** Starts microphone capture; resolves once `MicrophoneAudio.start` does. */
		async start(): Promise<void> {
			console.log('Starting SpeechChunks');
			await this.microphoneAudio.start();
		}

		/**
		 * Stops microphone capture, resets the detector and marks speech as
		 * inactive. Already buffered frames are kept.
		 */
		stop(): void {
			console.log('Stopping SpeechChunks');
			this.microphoneAudio.stop();
			this.vadDetector.reset();
			this.isSpeechActive = false;
		}

		/**
		 * Returns the buffered frames and empties the internal buffer.
		 *
		 * @returns One `number[]` per buffered frame, in arrival order.
		 */
		getSpeechChunks(): number[][] {
			console.log(`Returning ${this.chunks.length} speech chunks`);
			const speechChunks = this.chunks;
			this.chunks = [];
			return speechChunks;
		}

		/**
		 * Encodes the buffered frames as a mono, 16-bit PCM WAV file at
		 * `SAMPLE_RATE`. The buffer itself is left untouched.
		 *
		 * @returns A blob of type `audio/wav`: a 44-byte header followed by the
		 *   samples (header only when nothing is buffered).
		 */
		getBlob(): Blob {
			console.log('Creating audio blob from speech chunks');
			const bytesPerSample = SpeechChunks.BYTES_PER_SAMPLE;
			const headerBytes = SpeechChunks.WAV_HEADER_BYTES;

			const sampleCount = this.chunks.reduce((sum, chunk) => sum + chunk.length, 0);
			const dataBytes = sampleCount * bytesPerSample;

			// Header and samples share one buffer and are written through a
			// DataView so the output is little-endian (as WAV requires) regardless
			// of the host's native byte order.
			const buffer = new ArrayBuffer(headerBytes + dataBytes);
			const view = new DataView(buffer);

			// RIFF chunk descriptor
			this.writeString(view, 0, 'RIFF');
			view.setUint32(4, 36 + dataBytes, true); // file size minus the first 8 bytes
			this.writeString(view, 8, 'WAVE');

			// FMT sub-chunk
			this.writeString(view, 12, 'fmt ');
			view.setUint32(16, 16, true); // subchunk1size
			view.setUint16(20, 1, true); // audio format (1 for PCM)
			view.setUint16(22, 1, true); // num of channels
			view.setUint32(24, SpeechChunks.SAMPLE_RATE, true); // sample rate
			view.setUint32(28, SpeechChunks.SAMPLE_RATE * bytesPerSample, true); // byte rate
			view.setUint16(32, bytesPerSample, true); // block align
			view.setUint16(34, 8 * bytesPerSample, true); // bits per sample

			// Data sub-chunk
			this.writeString(view, 36, 'data');
			view.setUint32(40, dataBytes, true);

			// Clamp each float sample to [-1, 1] and scale to the signed 16-bit
			// range; the negative side uses 0x8000 so that -1 maps to -32768.
			let offset = headerBytes;
			for (const chunk of this.chunks) {
				for (const sample of chunk) {
					const s = Math.max(-1, Math.min(1, sample));
					view.setInt16(offset, s < 0 ? s * 0x8000 : s * 0x7FFF, true);
					offset += bytesPerSample;
				}
			}

			const blob = new Blob([buffer], { type: 'audio/wav' });
			console.log(`Created blob of size ${blob.size} bytes`);
			return blob;
		}

		/**
		 * Writes `string` into `view` one byte per character, starting at `offset`.
		 * Intended for the ASCII chunk tags of the WAV header.
		 */
		private writeString(view: DataView, offset: number, string: string): void {
			for (let i = 0; i < string.length; i++) {
				view.setUint8(offset + i, string.charCodeAt(i));
			}
		}

		/** Stops capture (see `stop`) and then closes the detector. */
		async close(): Promise<void> {
			console.log('Closing SpeechChunks');
			this.stop();
			await this.vadDetector.close();
		}
	}