zhenxuan committed
Commit 61644eb
1 Parent(s): 0e4d6bc

Split streaming audio into chunks
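This commit partitions the streamed microphone audio into chunks of CHUNK_LENGTH - 2 (i.e. 28) roughly one-second clips, since Whisper's log-Mel frontend only looks at 30 s of audio at a time, and it re-runs recognition every REC_INTERVAL_IN_SECONDS (4) clips within the current chunk. A minimal sketch of that bookkeeping, assuming exactly one clip per streaming callback; the helper names below are illustrative only and not part of app.py:

# Illustrative sketch, not part of app.py.
CHUNK_LENGTH = 30              # whisper.audio.CHUNK_LENGTH: seconds per Whisper window
REC_INTERVAL_IN_SECONDS = 4    # value introduced by this commit

def chunk_and_offset(count):
    # Map the running clip count to (chunk index, offset inside that chunk).
    return count // (CHUNK_LENGTH - 2), count % (CHUNK_LENGTH - 2)

def should_transcribe(chunk_offset):
    # Recognition is re-run every REC_INTERVAL_IN_SECONDS clips within a chunk.
    return (chunk_offset + 1) % REC_INTERVAL_IN_SECONDS == 0 and chunk_offset > 0

for count in range(60):
    chunk, offset = chunk_and_offset(count)
    if should_transcribe(offset):
        print(f'clip {count}: chunk {chunk}, offset {offset} -> run Whisper on the chunk audio')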

Files changed (1):
1. app.py +42 -13
app.py CHANGED
@@ -1,5 +1,8 @@
 import datetime
 import os
+
+from whisper.audio import N_SAMPLES, CHUNK_LENGTH
+
 os.system('pip install git+https://github.com/openai/whisper.git')
 import gradio as gr
 import wave
@@ -11,7 +14,7 @@ import torchaudio.functional as F
 LOGGING_FORMAT = '%(asctime)s %(message)s'
 logging.basicConfig(format=LOGGING_FORMAT,level=logging.INFO)
 
-REC_INTERVAL_IN_SECONDS = 1
+REC_INTERVAL_IN_SECONDS = 4
 
 # tmp dir to store audio files.
 if not os.path.isdir('./tmp/'):
@@ -31,7 +34,7 @@ class WhisperStreaming():
     def transcribe_audio_file(self, wave_file_path):
         waveform, sample_rate = torchaudio.load(wave_file_path)
         resampled_waveform = F.resample(waveform, sample_rate, self.whisper_sample_rate, lowpass_filter_width=6)
-        audio_tmp = whisper.pad_or_trim(resampled_waveform[0])
+        audio_tmp = whisper.pad_or_trim(resampled_waveform[0], length=N_SAMPLES)
         mel = whisper.log_mel_spectrogram(audio_tmp)
         results = self.whisper_model.decode(mel, self.decode_option)
         return results
@@ -64,27 +67,53 @@ whisper_model = WhisperStreaming(model_name='base', language='en', fp16=False)
 
 def transcribe(audio, state={}):
     logging.info(f'Transcribe audio file {audio}')
+    print('=====================')
     logging.info(state)
+    # Whisper only takes a maximum of 30s of audio as input,
+    # and Gradio streaming does not guarantee that each callback delivers exactly 1s, so -2 as a buffer.
+    # After count reaches 28 * n, a new audio file is created.
+    # However, the text should not change.
 
     if not state:
-        state['concated_audio'] = audio
-        state['result_text'] = 'Waitting...'
+        state['all_chunk_texts'] = 'Waiting...'
         state['count'] = 0
+        state['chunks'] = {}
+        return state['all_chunk_texts'], state
+
+    chunk = state['count'] // (CHUNK_LENGTH - 2)
+    chunk_offset = state['count'] % (CHUNK_LENGTH - 2)
+
+    if chunk_offset == 0:
+        state['chunks'][chunk] = {}
+        state['chunks'][chunk]['concated_audio'] = audio
+        state['chunks'][chunk]['result_text'] = ''
     else:
-        state['concated_audio'] = concat_multiple_wav_files([state['concated_audio'], audio])
-        state['count'] += 1
+        state['chunks'][chunk]['concated_audio'] = concat_multiple_wav_files([state['chunks'][chunk]['concated_audio'], audio])
 
-    if state['count'] % REC_INTERVAL_IN_SECONDS == 0 and state['count'] > 0:
-        logging.info('start to transcribe.......')
-        result = whisper_model.transcribe_audio_file(state['concated_audio'])
+    state['count'] += 1
+
+    # Determine whether to recognize the current chunk.
+    if (chunk_offset + 1) % REC_INTERVAL_IN_SECONDS == 0 and chunk_offset > 0:
+        logging.info(f'start to transcribe chunk: {chunk}, offset: {chunk_offset}')
+        result = whisper_model.transcribe_audio_file(state['chunks'][chunk]['concated_audio'])
         logging.info('complete transcribe.......')
-        state['result_text'] = result.text
-        logging.info('The text is:' + state['result_text'])
+        state['chunks'][chunk]['result_text'] = result.text
+        logging.info('The text is: ' + state['chunks'][chunk]['result_text'])
     else:
-        logging.info(f'The count of streaming is {state["count"]}, and skip speech recognition')
+        logging.info(f'The offset of the streaming chunk is {chunk_offset}, skipping speech recognition')
+
+    # Concatenate the result_texts of all chunks.
+    result_texts = ''
+
+    for tmp_chunk_idx, tmp_chunk_values in state['chunks'].items():
+        result_texts += tmp_chunk_values['result_text']
+
+    state['all_chunk_texts'] = result_texts
 
-    return state['result_text'], state
+    return state['all_chunk_texts'], state
 
+# Make sure we are not missing any audio clip.
+assert (CHUNK_LENGTH - 2) % REC_INTERVAL_IN_SECONDS == 0
 
 gr.Interface(fn=transcribe,
              inputs=[gr.Audio(source="microphone", type='filepath', streaming=True), 'state'],
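The new transcribe callback still relies on concat_multiple_wav_files, which is defined elsewhere in app.py and is not touched by this diff. For reference, here is a minimal sketch of what such a helper can look like with the wave module that app.py already imports; the output path and naming scheme below are assumptions for illustration, not taken from the repository:

import datetime
import wave

def concat_multiple_wav_files(wav_file_paths):
    # Sketch only: append WAV clips that share sample rate, sample width and channel count.
    out_path = './tmp/concat_' + datetime.datetime.now().strftime('%Y%m%d%H%M%S%f') + '.wav'
    with wave.open(wav_file_paths[0], 'rb') as first_clip:
        params = first_clip.getparams()
    with wave.open(out_path, 'wb') as out_file:
        out_file.setparams(params)
        for path in wav_file_paths:
            with wave.open(path, 'rb') as clip:
                out_file.writeframes(clip.readframes(clip.getnframes()))
    return out_path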