Spaces:
Build error
Build error
Add demo video and description
Browse files
app.py
CHANGED
@@ -69,7 +69,7 @@ def transcribe(audio, state={}):
|
|
69 |
logging.info(state)
|
70 |
# Whisper only takes a maximum of 30s of audio as input.
|
71 |
# And gradio streaming does not guarantee each callback is 1s. I set CNT_PER_CHUNK to 6 as a rough guess that 6 callbacks do not add up to audio longer than 30s.
|
72 |
-
# The logic of
|
73 |
# After count reach CNT_PER_CHUNK * n, a new audio file is created.
|
74 |
# However the text should not change.
|
75 |
|
@@ -105,7 +105,7 @@ def transcribe(audio, state={}):
|
|
105 |
result_texts = ''
|
106 |
|
107 |
for tmp_chunk_idx, tmp_chunk_values in state['chunks'].items():
|
108 |
-
result_texts += tmp_chunk_values['result_text']
|
109 |
|
110 |
state['all_chunk_texts'] = result_texts
|
111 |
|
@@ -114,7 +114,32 @@ def transcribe(audio, state={}):
|
|
114 |
# Make sure not missing any audio clip.
|
115 |
assert CNT_PER_CHUNK % RECOGNITION_INTERVAL == 0
|
116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
gr.Interface(fn=transcribe,
|
118 |
inputs=[gr.Audio(source="microphone", type='filepath', streaming=True), 'state'],
|
119 |
outputs = ['text', 'state'],
|
|
|
|
|
120 |
live=True).launch()
|
|
|
69 |
logging.info(state)
|
70 |
# Whisper only takes a maximum of 30s of audio as input.
|
71 |
# And gradio streaming does not guarantee each callback is 1s. I set CNT_PER_CHUNK to 6 as a rough guess that 6 callbacks do not add up to audio longer than 30s.
|
72 |
+
# The logic of chunk splitting could be improved by reading exact how many samples in audio files.
|
73 |
# After count reach CNT_PER_CHUNK * n, a new audio file is created.
|
74 |
# However the text should not change.
|
75 |
|
|
|
105 |
result_texts = ''
|
106 |
|
107 |
for tmp_chunk_idx, tmp_chunk_values in state['chunks'].items():
|
108 |
+
result_texts += tmp_chunk_values['result_text'] + ' '
|
109 |
|
110 |
state['all_chunk_texts'] = result_texts
|
111 |
|
|
|
114 |
# Make sure not missing any audio clip.
|
115 |
assert CNT_PER_CHUNK % RECOGNITION_INTERVAL == 0
|
116 |
|
117 |
+
STEP_ONE_DESCRIPTION = '''
|
118 |
+
<div>
|
119 |
+
<h3>
|
120 |
+
Step1. Click button <i>"Record from microphone"</i> and allow this site to use your microphone.
|
121 |
+
</h3>
|
122 |
+
<note>Right now the continuous speech-to-text transcription is laggy and sometimes misses some sentences...</note>
|
123 |
+
</div>
|
124 |
+
'''
|
125 |
+
|
126 |
+
STEP_TWO_DESCRIPTION = '''
|
127 |
+
<div align=center>
|
128 |
+
<h3 style="font-weight: 900; margin-bottom: 7px;">
|
129 |
+
Step2. Try to play the video and see how Whisper transcribe!
|
130 |
+
</h3>
|
131 |
+
<p>
|
132 |
+
Note: make sure to use a speaker that your computer's microphone is able to hear, e.g. the computer's default speaker.
|
133 |
+
</p>
|
134 |
+
<video id="video" width=50% controls="" preload="none">
|
135 |
+
<source id="mp4" src="https://nomorewzx.github.io/near-continuous-whispering/demo_video/whisper_demo.mp4" type="video/mp4">
|
136 |
+
</video>
|
137 |
+
</div>
|
138 |
+
'''
|
139 |
+
|
140 |
gr.Interface(fn=transcribe,
|
141 |
inputs=[gr.Audio(source="microphone", type='filepath', streaming=True), 'state'],
|
142 |
outputs = ['text', 'state'],
|
143 |
+
description=STEP_ONE_DESCRIPTION,
|
144 |
+
article=STEP_TWO_DESCRIPTION,
|
145 |
live=True).launch()
|