zhenxuan committed on
Commit 6e86db4
1 Parent(s): 8c9e711

Add demo video and description

Files changed (1)
  1. app.py +27 -2
app.py CHANGED
@@ -69,7 +69,7 @@ def transcribe(audio, state={}):
     logging.info(state)
     # Whisper only takes a maximum of 30s of audio as input.
     # Gradio streaming does not guarantee that each callback delivers 1s of audio; CNT_PER_CHUNK = 6 is a rough guess that 6 callbacks do not add up to more than 30s.
-    # The logic of determine chunk could be improved by reading exact how many samples in audio files.
+    # The logic of chunk splitting could be improved by reading exactly how many samples are in the audio files.
     # After the count reaches CNT_PER_CHUNK * n, a new audio file is created.
     # However, the text should not change.
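The `+` comment above points at a concrete improvement: instead of guessing via CNT_PER_CHUNK, measure the exact duration from the sample count of each clip. A minimal sketch of that idea, assuming the clips Gradio writes are WAV files; `clip_duration_seconds` and `should_start_new_chunk` are hypothetical helpers, not functions in app.py:

```python
import wave

WHISPER_MAX_SECONDS = 30.0  # Whisper's input limit, per the comments above

def clip_duration_seconds(path: str) -> float:
    # Exact duration = number of samples / sample rate (hypothetical helper).
    with wave.open(path, 'rb') as wav:
        return wav.getnframes() / float(wav.getframerate())

def should_start_new_chunk(accumulated_seconds: float, next_clip_path: str) -> bool:
    # Roll over to a new chunk before the accumulated audio exceeds 30s.
    return accumulated_seconds + clip_duration_seconds(next_clip_path) > WHISPER_MAX_SECONDS
```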
 
@@ -105,7 +105,7 @@ def transcribe(audio, state={}):
     result_texts = ''
 
     for tmp_chunk_idx, tmp_chunk_values in state['chunks'].items():
-        result_texts += tmp_chunk_values['result_text']
+        result_texts += tmp_chunk_values['result_text'] + ' '
 
     state['all_chunk_texts'] = result_texts
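An aside on the `+` line: since dicts preserve insertion order in Python 3.7+, the same concatenation can be written as a single join, which also avoids the trailing space:

```python
# Equivalent to the loop above, minus the trailing space.
state['all_chunk_texts'] = ' '.join(
    chunk['result_text'] for chunk in state['chunks'].values()
)
```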
 
@@ -114,7 +114,32 @@ def transcribe(audio, state={}):
 # Make sure no audio clip is missed.
 assert CNT_PER_CHUNK % RECOGNITION_INTERVAL == 0
 
+STEP_ONE_DESCRIPTION = '''
+<div>
+    <h3>
+        Step 1. Click the <i>"Record from microphone"</i> button and allow this site to use your microphone.
+    </h3>
+    <p>Note: right now the continuous speech-to-text transcription lags and sometimes misses some sentences...</p>
+</div>
+'''
+
+STEP_TWO_DESCRIPTION = '''
+<div align=center>
+    <h3 style="font-weight: 900; margin-bottom: 7px;">
+        Step 2. Play the video and watch how Whisper transcribes it!
+    </h3>
+    <p>
+        Note: make sure to play it through a speaker your computer's microphone can hear, e.g. the computer's default speaker.
+    </p>
+    <video id="video" width=50% controls="" preload="none">
+        <source id="mp4" src="https://nomorewzx.github.io/near-continuous-whispering/demo_video/whisper_demo.mp4" type="video/mp4">
+    </video>
+</div>
+'''
+
 gr.Interface(fn=transcribe,
              inputs=[gr.Audio(source="microphone", type='filepath', streaming=True), 'state'],
              outputs=['text', 'state'],
+             description=STEP_ONE_DESCRIPTION,
+             article=STEP_TWO_DESCRIPTION,
              live=True).launch()
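One more note on the assertion at the top of this hunk: a chunk rolls over every CNT_PER_CHUNK callbacks while recognition appears to run every RECOGNITION_INTERVAL callbacks, so the interval must divide the chunk size evenly or the tail of a chunk would never be transcribed. A worked example with an illustrative divisor (RECOGNITION_INTERVAL's real value is defined elsewhere in app.py):

```python
CNT_PER_CHUNK = 6          # from the comments in the first hunk
RECOGNITION_INTERVAL = 3   # hypothetical value, for illustration only

# Recognition fires on callbacks 3 and 6; callback 6 is also the chunk
# boundary, so every clip in the chunk gets transcribed before rollover.
assert CNT_PER_CHUNK % RECOGNITION_INTERVAL == 0
```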
 