zhenxuan committed
Commit de22680
1 Parent(s): e55e6c6

Update app.py

Files changed (1)
  1. app.py +10 -9
app.py CHANGED
@@ -1,7 +1,7 @@
 import datetime
 import os
 os.system('pip install git+https://github.com/openai/whisper.git')
-from whisper.audio import N_SAMPLES, CHUNK_LENGTH
+from whisper.audio import N_SAMPLES
 import gradio as gr
 import wave
 import whisper
@@ -12,8 +12,8 @@ import torchaudio.functional as F
 LOGGING_FORMAT = '%(asctime)s %(message)s'
 logging.basicConfig(format=LOGGING_FORMAT,level=logging.INFO)
 
-REC_INTERVAL_IN_SECONDS = 4
-
+RECOGNITION_INTERVAL = 2
+CNT_PER_CHUNK = 6
 # tmp dir to store audio files.
 if not os.path.isdir('./tmp/'):
     os.mkdir('./tmp')
@@ -68,8 +68,9 @@ def transcribe(audio, state={}):
     print('=====================')
     logging.info(state)
     # Whisper only takes a maximum of 30s of audio as input.
-    # And the gradio streaming does not guarantee each callback is 1s, so -2 as buffer
-    # After count reach 28 * n, a new audio file is created.
+    # Gradio streaming does not guarantee each callback carries 1s of audio, so CNT_PER_CHUNK = 6 is a rough guess that 6 callbacks never add up to more than 30s.
+    # The chunking logic could be improved by reading the exact number of samples in the audio files.
+    # After count reaches CNT_PER_CHUNK * n, a new audio file is created.
     # However the text should not change.
 
     if not state:
@@ -78,8 +79,8 @@ def transcribe(audio, state={}):
         state['chunks'] = {}
         return state['all_chunk_texts'], state
 
-    chunk = state['count'] // (CHUNK_LENGTH - 2)
-    chunk_offset = state['count'] % (CHUNK_LENGTH - 2)
+    chunk = state['count'] // CNT_PER_CHUNK
+    chunk_offset = state['count'] % CNT_PER_CHUNK
 
     if chunk_offset == 0:
         state['chunks'][chunk] = {}
@@ -91,7 +92,7 @@ def transcribe(audio, state={}):
     state['count'] += 1
 
     # Determine whether to recognize the current chunk.
-    if (chunk_offset + 1) % REC_INTERVAL_IN_SECONDS == 0 and chunk_offset > 0:
+    if (chunk_offset + 1) % RECOGNITION_INTERVAL == 0 and chunk_offset > 0:
         logging.info(f'start to transcribe chunk: {chunk}, offset: {chunk_offset}')
         result = whisper_model.transcribe_audio_file(state['chunks'][chunk]['concated_audio'])
         logging.info('complete transcribe.......')
@@ -111,7 +112,7 @@ def transcribe(audio, state={}):
     return state['all_chunk_texts'], state
 
 # Make sure we are not missing any audio clip.
-assert (CHUNK_LENGTH - 2) % REC_INTERVAL_IN_SECONDS == 0
+assert CNT_PER_CHUNK % RECOGNITION_INTERVAL == 0
 
 gr.Interface(fn=transcribe,
              inputs=[gr.Audio(source="microphone", type='filepath', streaming=True), 'state'],
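
For reference, the bookkeeping this commit switches to can be checked in isolation. The sketch below is standalone, not part of app.py, and assumes each Gradio streaming callback delivers one audio clip. With CNT_PER_CHUNK = 6 and RECOGNITION_INTERVAL = 2, transcription fires at offsets 1, 3, and 5, and the assert is exactly the condition that the last clip of a chunk (offset CNT_PER_CHUNK - 1) triggers transcription before a new file starts.

# Standalone sketch (not app.py): simulate the per-callback chunk bookkeeping.
CNT_PER_CHUNK = 6          # clips per audio file; rough guess that 6 clips stay under 30s
RECOGNITION_INTERVAL = 2   # transcribe on every 2nd clip within a chunk

# The last offset in a chunk is CNT_PER_CHUNK - 1, so this assert is exactly the
# condition for (chunk_offset + 1) % RECOGNITION_INTERVAL == 0 to hold on the
# final clip, i.e. no audio is dropped when the chunk rolls over.
assert CNT_PER_CHUNK % RECOGNITION_INTERVAL == 0

for count in range(12):                   # simulate 12 streaming callbacks
    chunk = count // CNT_PER_CHUNK        # which audio file this clip joins
    chunk_offset = count % CNT_PER_CHUNK  # position of the clip within that file
    recognize = (chunk_offset + 1) % RECOGNITION_INTERVAL == 0 and chunk_offset > 0
    print(f'count={count:2d}  chunk={chunk}  offset={chunk_offset}  transcribe={recognize}')

Running it shows transcribe=True at offsets 1, 3, and 5 of each chunk, so the chunk boundary never loses audio. The old values satisfied the same invariant, since (CHUNK_LENGTH - 2) = 28 and 28 % 4 == 0; this commit just makes the clip budget explicit instead of deriving it from CHUNK_LENGTH.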
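
The new comment also names the natural refinement: stop guessing that CNT_PER_CHUNK callbacks fit in 30s and measure the audio instead. Below is a hedged sketch of that idea, assuming the streamed clips are WAV files readable by the wave module app.py already imports; total_samples_at_16k and would_exceed_whisper_limit are hypothetical helper names, not functions in app.py, while N_SAMPLES and SAMPLE_RATE (30s at 16 kHz) do come from whisper.audio.

# Hypothetical refinement (not in this commit): roll to a new chunk based on
# measured samples rather than a fixed callback count.
import wave

from whisper.audio import N_SAMPLES, SAMPLE_RATE  # Whisper's 30s budget at 16 kHz

def total_samples_at_16k(path):
    # Length of one clip, rescaled to Whisper's 16 kHz sample rate.
    with wave.open(path, 'rb') as f:
        return f.getnframes() * SAMPLE_RATE // f.getframerate()

def would_exceed_whisper_limit(accumulated_samples, next_clip_path):
    # True if appending the next clip would push the chunk past Whisper's 30s limit.
    return accumulated_samples + total_samples_at_16k(next_clip_path) > N_SAMPLES

Replacing the count-based rollover with would_exceed_whisper_limit would make the chunk boundary a function of measured audio length, and RECOGNITION_INTERVAL could still be applied per clip exactly as it is now.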