litagin committed on
Commit
ab55ccc
1 Parent(s): d01f68f
Files changed (3) hide show
  1. app.py +15 -5
  2. requirements.txt +1 -0
  3. test.wav +0 -0
app.py CHANGED
@@ -4,6 +4,7 @@ import time
4
  import gradio as gr
5
  import spaces
6
  import torch
 
7
  from pydub import AudioSegment
8
  from transformers import pipeline
9
 
@@ -30,7 +31,7 @@ model_dict = {
30
  ),
31
  }
32
 
33
- print("Initializing pipelines...")
34
  pipe_dict = {
35
  k: pipeline(
36
  "automatic-speech-recognition",
@@ -39,11 +40,12 @@ pipe_dict = {
39
  )
40
  for k, v in model_dict.items()
41
  }
 
42
 
43
 
44
  @spaces.GPU
45
  def transcribe_common(audio: str, model: str) -> tuple[str, float]:
46
- print(f"Transcribing {audio} with {model}")
47
  # Get duration of audio
48
  duration = AudioSegment.from_file(audio).duration_seconds
49
  if duration > 15:
@@ -51,6 +53,8 @@ def transcribe_common(audio: str, model: str) -> tuple[str, float]:
51
  start_time = time.time()
52
  result = pipe_dict[model](audio, generate_kwargs=generate_kwargs)["text"]
53
  end_time = time.time()
 
 
54
  return result, end_time - start_time
55
 
56
 
@@ -78,13 +82,17 @@ def transcribe_galgame_whisper(audio) -> tuple[str, float]:
78
  return transcribe_common(audio, "galgame-whisper-wip")
79
 
80
 
 
 
 
 
81
  initial_md = """
82
  # Galgame-Whisper (WIP) Demo
83
 
 
84
  - 日本語のみ対応
85
- - 他の書き起こしとついでに比較できるようにいろいろ入れた
86
  - 現在0.1エポックくらい
87
- - 速度はCPUです
88
  - 音声は15秒まで
89
 
90
  pipeのハイパラ:
@@ -135,6 +143,9 @@ with gr.Blocks() as app:
135
  time_kotoba_v2 = gr.Textbox(label="Time taken")
136
  output_kotoba_v2 = gr.Textbox(label="Result")
137
 
 
 
 
138
  button_v2.click(transcribe_large_v2, inputs=audio, outputs=[output_v2, time_v2])
139
  button_v3.click(transcribe_large_v3, inputs=audio, outputs=[output_v3, time_v3])
140
  button_v3_turbo.click(
@@ -153,5 +164,4 @@ with gr.Blocks() as app:
153
  inputs=audio,
154
  outputs=[output_galgame, time_galgame],
155
  )
156
-
157
  app.launch(inbrowser=True)
 
4
  import gradio as gr
5
  import spaces
6
  import torch
7
+ from loguru import logger
8
  from pydub import AudioSegment
9
  from transformers import pipeline
10
 
 
31
  ),
32
  }
33
 
34
+ logger.info("Initializing pipelines...")
35
  pipe_dict = {
36
  k: pipeline(
37
  "automatic-speech-recognition",
 
40
  )
41
  for k, v in model_dict.items()
42
  }
43
+ logger.success("Pipelines initialized!")
44
 
45
 
46
  @spaces.GPU
47
  def transcribe_common(audio: str, model: str) -> tuple[str, float]:
48
+ logger.info(f"Transcribing {audio} with {model}")
49
  # Get duration of audio
50
  duration = AudioSegment.from_file(audio).duration_seconds
51
  if duration > 15:
 
53
  start_time = time.time()
54
  result = pipe_dict[model](audio, generate_kwargs=generate_kwargs)["text"]
55
  end_time = time.time()
56
+ logger.success(f"Transcribed {audio} with {model} in {end_time - start_time:.2f}s")
57
+ logger.success(f"Result:\n{result}")
58
  return result, end_time - start_time
59
 
60
 
 
82
  return transcribe_common(audio, "galgame-whisper-wip")
83
 
84
 
85
+ logger.info("Warm-up...")
86
+ transcribe_large_v3_turbo("test.wav")
87
+ logger.success("Warm-up done!")
88
+
89
  initial_md = """
90
  # Galgame-Whisper (WIP) Demo
91
 
92
+ - https://huggingface.co/litagin/galgame-whisper-wip
93
  - 日本語のみ対応
94
+ - 比較できるように他モデルもついでに試せる
95
  - 現在0.1エポックくらい
 
96
  - 音声は15秒まで
97
 
98
  pipeのハイパラ:
 
143
  time_kotoba_v2 = gr.Textbox(label="Time taken")
144
  output_kotoba_v2 = gr.Textbox(label="Result")
145
 
146
+ with gr.Row():
147
+ refresh_button = gr.Button("Refresh Status") # Create a refresh button
148
+
149
  button_v2.click(transcribe_large_v2, inputs=audio, outputs=[output_v2, time_v2])
150
  button_v3.click(transcribe_large_v3, inputs=audio, outputs=[output_v3, time_v3])
151
  button_v3_turbo.click(
 
164
  inputs=audio,
165
  outputs=[output_galgame, time_galgame],
166
  )
 
167
  app.launch(inbrowser=True)
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  gradio
 
2
  numpy<2
3
  spaces
4
  torch
 
1
  gradio
2
+ loguru
3
  numpy<2
4
  spaces
5
  torch
test.wav ADDED
Binary file (414 kB). View file