Spaces:
Build error
Build error
Add demo video and description
Browse files
app.py
CHANGED
@@ -69,7 +69,7 @@ def transcribe(audio, state={}):
|
|
69 |
logging.info(state)
|
70 |
# Whisper only takes a maximum of 30s of audio as input.
|
71 |
# And gradio streaming does not guarantee each callback is 1s. I set CNT_PER_CHUNK to 6 as a rough guess that 6 callbacks do not add up to audio longer than 30s.
|
72 |
-
# The logic of
|
73 |
# After count reach CNT_PER_CHUNK * n, a new audio file is created.
|
74 |
# However the text should not change.
|
75 |
|
@@ -105,7 +105,7 @@ def transcribe(audio, state={}):
|
|
105 |
result_texts = ''
|
106 |
|
107 |
for tmp_chunk_idx, tmp_chunk_values in state['chunks'].items():
|
108 |
-
result_texts += tmp_chunk_values['result_text']
|
109 |
|
110 |
state['all_chunk_texts'] = result_texts
|
111 |
|
@@ -114,7 +114,32 @@ def transcribe(audio, state={}):
|
|
114 |
# Make sure not missing any audio clip.
|
115 |
assert CNT_PER_CHUNK % RECOGNITION_INTERVAL == 0
|
116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
117 |
gr.Interface(fn=transcribe,
|
118 |
inputs=[gr.Audio(source="microphone", type='filepath', streaming=True), 'state'],
|
119 |
outputs = ['text', 'state'],
|
|
|
|
|
120 |
live=True).launch()
|
|
|
69 |
logging.info(state)
|
70 |
# Whisper only takes a maximum of 30s of audio as input.
|
71 |
# And gradio streaming does not guarantee each callback is 1s. I set CNT_PER_CHUNK to 6 as a rough guess that 6 callbacks do not add up to audio longer than 30s.
|
72 |
+
# The logic of chunk splitting could be improved by reading exact how many samples in audio files.
|
73 |
# After count reach CNT_PER_CHUNK * n, a new audio file is created.
|
74 |
# However the text should not change.
|
75 |
|
|
|
105 |
result_texts = ''
|
106 |
|
107 |
for tmp_chunk_idx, tmp_chunk_values in state['chunks'].items():
|
108 |
+
result_texts += tmp_chunk_values['result_text'] + ' '
|
109 |
|
110 |
state['all_chunk_texts'] = result_texts
|
111 |
|
|
|
114 |
# Make sure not missing any audio clip.
|
115 |
assert CNT_PER_CHUNK % RECOGNITION_INTERVAL == 0
|
116 |
|
117 |
+
STEP_ONE_DESCRIPTION = '''
|
118 |
+
<div>
|
119 |
+
<h3>
|
120 |
+
Step1. Click button <i>"Record from microphone"</i> and allow this site to use your microphone.
|
121 |
+
</h3>
|
122 |
+
<note>Right now the continuous speech-to-text transcription is laggy and sometimes misses some sentences...</note>
|
123 |
+
</div>
|
124 |
+
'''
|
125 |
+
|
126 |
+
STEP_TWO_DESCRIPTION = '''
|
127 |
+
<div align=center>
|
128 |
+
<h3 style="font-weight: 900; margin-bottom: 7px;">
|
129 |
+
Step2. Try to play the video and see how Whisper transcribe!
|
130 |
+
</h3>
|
131 |
+
<p>
|
132 |
+
Note: make sure to use a speaker that your computer's microphone is able to hear, e.g. the computer's default speaker.
|
133 |
+
</p>
|
134 |
+
<video id="video" width=50% controls="" preload="none">
|
135 |
+
<source id="mp4" src="https://nomorewzx.github.io/near-continuous-whispering/demo_video/whisper_demo.mp4" type="video/mp4">
|
136 |
+
</video>
|
137 |
+
</div>
|
138 |
+
'''
|
139 |
+
|
140 |
gr.Interface(fn=transcribe,
|
141 |
inputs=[gr.Audio(source="microphone", type='filepath', streaming=True), 'state'],
|
142 |
outputs = ['text', 'state'],
|
143 |
+
description=STEP_ONE_DESCRIPTION,
|
144 |
+
article=STEP_TWO_DESCRIPTION,
|
145 |
live=True).launch()
|