Add word timestamps to Simple and reorder
Browse files
app.py
CHANGED
@@ -84,44 +84,49 @@ class WhisperTranscriber:
|
|
84 |
print("[Auto parallel] Using GPU devices " + str(self.parallel_device_list) + " and " + str(self.vad_cpu_cores) + " CPU cores for VAD/transcription.")
|
85 |
|
86 |
# Entry function for the simple tab
|
87 |
-
def transcribe_webui_simple(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
|
88 |
-
|
|
|
|
|
|
|
|
|
89 |
|
90 |
# Entry function for the simple tab with progress
|
91 |
-
def transcribe_webui_simple_progress(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
|
92 |
-
|
|
|
|
|
93 |
|
94 |
-
vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize,
|
95 |
|
96 |
-
return self.transcribe_webui(modelName, languageName, urlData, multipleFiles, microphoneData, task, vadOptions,
|
|
|
97 |
|
98 |
# Entry function for the full tab
|
99 |
def transcribe_webui_full(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
append_punctuations: str, highlight_words: bool = False):
|
107 |
|
108 |
return self.transcribe_webui_full_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task,
|
109 |
vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
|
|
|
110 |
initial_prompt, temperature, best_of, beam_size, patience, length_penalty, suppress_tokens,
|
111 |
condition_on_previous_text, fp16, temperature_increment_on_fallback,
|
112 |
-
compression_ratio_threshold, logprob_threshold, no_speech_threshold
|
113 |
-
word_timestamps, prepend_punctuations, append_punctuations, highlight_words)
|
114 |
|
115 |
# Entry function for the full tab with progress
|
116 |
def transcribe_webui_full_progress(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
progress=gr.Progress()):
|
125 |
|
126 |
# Handle temperature_increment_on_fallback
|
127 |
if temperature_increment_on_fallback is not None:
|
@@ -469,24 +474,34 @@ def create_ui(app_config: ApplicationConfig):
|
|
469 |
|
470 |
whisper_models = app_config.get_model_names()
|
471 |
|
472 |
-
|
473 |
gr.Dropdown(choices=whisper_models, value=app_config.default_model_name, label="Model"),
|
474 |
gr.Dropdown(choices=sorted(get_language_names()), label="Language", value=app_config.language),
|
475 |
gr.Text(label="URL (YouTube, etc.)"),
|
476 |
gr.File(label="Upload Files", file_count="multiple"),
|
477 |
gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
|
478 |
gr.Dropdown(choices=["transcribe", "translate"], label="Task", value=app_config.task),
|
|
|
|
|
|
|
479 |
gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=app_config.default_vad, label="VAD"),
|
480 |
gr.Number(label="VAD - Merge Window (s)", precision=0, value=app_config.vad_merge_window),
|
481 |
gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=app_config.vad_max_merge_size),
|
482 |
-
|
483 |
-
|
|
|
|
|
|
|
484 |
]
|
485 |
|
486 |
is_queue_mode = app_config.queue_concurrency_count is not None and app_config.queue_concurrency_count > 0
|
487 |
|
488 |
simple_transcribe = gr.Interface(fn=ui.transcribe_webui_simple_progress if is_queue_mode else ui.transcribe_webui_simple,
|
489 |
-
description=ui_description, article=ui_article, inputs=
|
|
|
|
|
|
|
|
|
490 |
gr.File(label="Download"),
|
491 |
gr.Text(label="Transcription"),
|
492 |
gr.Text(label="Segments")
|
@@ -496,8 +511,17 @@ def create_ui(app_config: ApplicationConfig):
|
|
496 |
|
497 |
full_transcribe = gr.Interface(fn=ui.transcribe_webui_full_progress if is_queue_mode else ui.transcribe_webui_full,
|
498 |
description=full_description, article=ui_article, inputs=[
|
499 |
-
*
|
|
|
|
|
|
|
|
|
500 |
gr.Dropdown(choices=["prepend_first_segment", "prepend_all_segments"], value=app_config.vad_initial_prompt_mode, label="VAD - Initial Prompt Mode"),
|
|
|
|
|
|
|
|
|
|
|
501 |
gr.TextArea(label="Initial Prompt"),
|
502 |
gr.Number(label="Temperature", value=app_config.temperature),
|
503 |
gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0),
|
@@ -511,13 +535,6 @@ def create_ui(app_config: ApplicationConfig):
|
|
511 |
gr.Number(label="Compression ratio threshold", value=app_config.compression_ratio_threshold),
|
512 |
gr.Number(label="Logprob threshold", value=app_config.logprob_threshold),
|
513 |
gr.Number(label="No speech threshold", value=app_config.no_speech_threshold),
|
514 |
-
|
515 |
-
# Word timestamps
|
516 |
-
gr.Checkbox(label="Word Timestamps", value=app_config.word_timestamps),
|
517 |
-
gr.Text(label="Word Timestamps - Prepend Punctuations", value=app_config.prepend_punctuations),
|
518 |
-
gr.Text(label="Word Timestamps - Append Punctuations", value=app_config.append_punctuations),
|
519 |
-
gr.Checkbox(label="Word Timestamps - Highlight Words", value=app_config.highlight_words),
|
520 |
-
|
521 |
], outputs=[
|
522 |
gr.File(label="Download"),
|
523 |
gr.Text(label="Transcription"),
|
|
|
84 |
print("[Auto parallel] Using GPU devices " + str(self.parallel_device_list) + " and " + str(self.vad_cpu_cores) + " CPU cores for VAD/transcription.")
|
85 |
|
86 |
# Entry function for the simple tab
|
87 |
+
def transcribe_webui_simple(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
|
88 |
+
vad, vadMergeWindow, vadMaxMergeSize,
|
89 |
+
word_timestamps: bool = False, highlight_words: bool = False):
|
90 |
+
return self.transcribe_webui_simple_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task,
|
91 |
+
vad, vadMergeWindow, vadMaxMergeSize,
|
92 |
+
word_timestamps, highlight_words)
|
93 |
|
94 |
# Entry function for the simple tab with progress
|
95 |
+
def transcribe_webui_simple_progress(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
|
96 |
+
vad, vadMergeWindow, vadMaxMergeSize,
|
97 |
+
word_timestamps: bool = False, highlight_words: bool = False,
|
98 |
+
progress=gr.Progress()):
|
99 |
|
100 |
+
vadOptions = VadOptions(vad, vadMergeWindow, vadMaxMergeSize, self.app_config.vad_padding, self.app_config.vad_prompt_window, self.app_config.vad_initial_prompt_mode)
|
101 |
|
102 |
+
return self.transcribe_webui(modelName, languageName, urlData, multipleFiles, microphoneData, task, vadOptions,
|
103 |
+
word_timestamps=word_timestamps, highlight_words=highlight_words, progress=progress)
|
104 |
|
105 |
# Entry function for the full tab
|
106 |
def transcribe_webui_full(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
|
107 |
+
vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
|
108 |
+
# Word timestamps
|
109 |
+
word_timestamps: bool, highlight_words: bool, prepend_punctuations: str, append_punctuations: str,
|
110 |
+
initial_prompt: str, temperature: float, best_of: int, beam_size: int, patience: float, length_penalty: float, suppress_tokens: str,
|
111 |
+
condition_on_previous_text: bool, fp16: bool, temperature_increment_on_fallback: float,
|
112 |
+
compression_ratio_threshold: float, logprob_threshold: float, no_speech_threshold: float):
|
|
|
113 |
|
114 |
return self.transcribe_webui_full_progress(modelName, languageName, urlData, multipleFiles, microphoneData, task,
|
115 |
vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
|
116 |
+
word_timestamps, highlight_words, prepend_punctuations, append_punctuations,
|
117 |
initial_prompt, temperature, best_of, beam_size, patience, length_penalty, suppress_tokens,
|
118 |
condition_on_previous_text, fp16, temperature_increment_on_fallback,
|
119 |
+
compression_ratio_threshold, logprob_threshold, no_speech_threshold)
|
|
|
120 |
|
121 |
# Entry function for the full tab with progress
|
122 |
def transcribe_webui_full_progress(self, modelName, languageName, urlData, multipleFiles, microphoneData, task,
|
123 |
+
vad, vadMergeWindow, vadMaxMergeSize, vadPadding, vadPromptWindow, vadInitialPromptMode,
|
124 |
+
# Word timestamps
|
125 |
+
word_timestamps: bool, highlight_words: bool, prepend_punctuations: str, append_punctuations: str,
|
126 |
+
initial_prompt: str, temperature: float, best_of: int, beam_size: int, patience: float, length_penalty: float, suppress_tokens: str,
|
127 |
+
condition_on_previous_text: bool, fp16: bool, temperature_increment_on_fallback: float,
|
128 |
+
compression_ratio_threshold: float, logprob_threshold: float, no_speech_threshold: float,
|
129 |
+
progress=gr.Progress()):
|
|
|
130 |
|
131 |
# Handle temperature_increment_on_fallback
|
132 |
if temperature_increment_on_fallback is not None:
|
|
|
474 |
|
475 |
whisper_models = app_config.get_model_names()
|
476 |
|
477 |
+
common_inputs = lambda : [
|
478 |
gr.Dropdown(choices=whisper_models, value=app_config.default_model_name, label="Model"),
|
479 |
gr.Dropdown(choices=sorted(get_language_names()), label="Language", value=app_config.language),
|
480 |
gr.Text(label="URL (YouTube, etc.)"),
|
481 |
gr.File(label="Upload Files", file_count="multiple"),
|
482 |
gr.Audio(source="microphone", type="filepath", label="Microphone Input"),
|
483 |
gr.Dropdown(choices=["transcribe", "translate"], label="Task", value=app_config.task),
|
484 |
+
]
|
485 |
+
|
486 |
+
common_vad_inputs = lambda : [
|
487 |
gr.Dropdown(choices=["none", "silero-vad", "silero-vad-skip-gaps", "silero-vad-expand-into-gaps", "periodic-vad"], value=app_config.default_vad, label="VAD"),
|
488 |
gr.Number(label="VAD - Merge Window (s)", precision=0, value=app_config.vad_merge_window),
|
489 |
gr.Number(label="VAD - Max Merge Size (s)", precision=0, value=app_config.vad_max_merge_size),
|
490 |
+
]
|
491 |
+
|
492 |
+
common_word_timestamps_inputs = lambda : [
|
493 |
+
gr.Checkbox(label="Word Timestamps", value=app_config.word_timestamps),
|
494 |
+
gr.Checkbox(label="Word Timestamps - Highlight Words", value=app_config.highlight_words),
|
495 |
]
|
496 |
|
497 |
is_queue_mode = app_config.queue_concurrency_count is not None and app_config.queue_concurrency_count > 0
|
498 |
|
499 |
simple_transcribe = gr.Interface(fn=ui.transcribe_webui_simple_progress if is_queue_mode else ui.transcribe_webui_simple,
|
500 |
+
description=ui_description, article=ui_article, inputs=[
|
501 |
+
*common_inputs(),
|
502 |
+
*common_vad_inputs(),
|
503 |
+
*common_word_timestamps_inputs(),
|
504 |
+
], outputs=[
|
505 |
gr.File(label="Download"),
|
506 |
gr.Text(label="Transcription"),
|
507 |
gr.Text(label="Segments")
|
|
|
511 |
|
512 |
full_transcribe = gr.Interface(fn=ui.transcribe_webui_full_progress if is_queue_mode else ui.transcribe_webui_full,
|
513 |
description=full_description, article=ui_article, inputs=[
|
514 |
+
*common_inputs(),
|
515 |
+
|
516 |
+
*common_vad_inputs(),
|
517 |
+
gr.Number(label="VAD - Padding (s)", precision=None, value=app_config.vad_padding),
|
518 |
+
gr.Number(label="VAD - Prompt Window (s)", precision=None, value=app_config.vad_prompt_window),
|
519 |
gr.Dropdown(choices=["prepend_first_segment", "prepend_all_segments"], value=app_config.vad_initial_prompt_mode, label="VAD - Initial Prompt Mode"),
|
520 |
+
|
521 |
+
*common_word_timestamps_inputs(),
|
522 |
+
gr.Text(label="Word Timestamps - Prepend Punctuations", value=app_config.prepend_punctuations),
|
523 |
+
gr.Text(label="Word Timestamps - Append Punctuations", value=app_config.append_punctuations),
|
524 |
+
|
525 |
gr.TextArea(label="Initial Prompt"),
|
526 |
gr.Number(label="Temperature", value=app_config.temperature),
|
527 |
gr.Number(label="Best Of - Non-zero temperature", value=app_config.best_of, precision=0),
|
|
|
535 |
gr.Number(label="Compression ratio threshold", value=app_config.compression_ratio_threshold),
|
536 |
gr.Number(label="Logprob threshold", value=app_config.logprob_threshold),
|
537 |
gr.Number(label="No speech threshold", value=app_config.no_speech_threshold),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
538 |
], outputs=[
|
539 |
gr.File(label="Download"),
|
540 |
gr.Text(label="Transcription"),
|