sarahai committed
Commit: 82077c2
Parent(s): 1a7dd8c

Update app.py

Files changed (1): app.py (+27, -12)
app.py CHANGED
@@ -1,15 +1,18 @@
+
+
 import streamlit as st
 import torchaudio
 import torch
 from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+import numpy as np
 
 # Load the fine-tuned model and processor
 model_name_or_path = "sarahai/uzbek-stt-3"  # Replace with your model's path
 processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
 model = Wav2Vec2ForCTC.from_pretrained(model_name_or_path)
 
-# Function to preprocess and transcribe audio
-def preprocess_audio(file):
+# Function to preprocess and split audio into chunks
+def preprocess_audio(file, chunk_duration=10):
     speech_array, sampling_rate = torchaudio.load(file)
 
     # Resample to 16 kHz if necessary
@@ -18,15 +21,26 @@ def preprocess_audio(file):
         speech_array = resampler(speech_array)
 
     speech_array = speech_array.squeeze().numpy()
-    return speech_array
+
+    # Split audio into chunks (e.g., 10 seconds per chunk)
+    chunk_size = chunk_duration * 16000  # 10 seconds * 16000 samples per second
+    chunks = [speech_array[i:i + chunk_size] for i in range(0, len(speech_array), chunk_size)]
+
+    return chunks
 
-def transcribe_audio(speech_array):
-    input_values = processor(speech_array, return_tensors="pt", sampling_rate=16000).input_values
-    with torch.no_grad():
-        logits = model(input_values).logits
-    predicted_ids = torch.argmax(logits, dim=-1)
-    transcription = processor.decode(predicted_ids[0])
-    return transcription.replace("[UNK]", "'")
+def transcribe_audio(chunks):
+    transcription = ""
+
+    for chunk in chunks:
+        input_values = processor(chunk, return_tensors="pt", sampling_rate=16000).input_values
+        with torch.no_grad():
+            logits = model(input_values).logits
+        predicted_ids = torch.argmax(logits, dim=-1)
+        chunk_transcription = processor.decode(predicted_ids[0])
+        chunk_transcription = chunk_transcription.replace("[UNK]", "'")
+        transcription += chunk_transcription + " "  # Add a space between chunks
+
+    return transcription.strip()
 
 # Streamlit interface
 st.title("Speech-to-Text Transcription App")
@@ -36,8 +50,9 @@ audio_file = st.file_uploader("Upload an audio file", type=["wav", "mp3"])
 
 if audio_file is not None:
     # Preprocess and transcribe
-    speech_array = preprocess_audio(audio_file)
-    transcription = transcribe_audio(speech_array)
+    chunks = preprocess_audio(audio_file)
+    transcription = transcribe_audio(chunks)
 
     st.write("Transcription:")
     st.text(transcription)
+
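
For a quick sanity check outside the app, here is a minimal standalone sketch of the chunking logic this commit introduces. The 16 kHz rate and 10-second default mirror the diff; the split_into_chunks name and the synthetic waveform are illustrative assumptions, not part of the commit.

import numpy as np

def split_into_chunks(speech_array, chunk_duration=10, sampling_rate=16000):
    # Same fixed-size slicing as preprocess_audio above;
    # the last chunk may be shorter than chunk_duration seconds.
    chunk_size = chunk_duration * sampling_rate
    return [speech_array[i:i + chunk_size] for i in range(0, len(speech_array), chunk_size)]

# Quick check on a synthetic 25-second mono signal at 16 kHz.
waveform = np.zeros(25 * 16000, dtype=np.float32)
print([len(c) for c in split_into_chunks(waveform)])  # [160000, 160000, 80000]

One trade-off of this design: a fixed window can cut a word at a chunk boundary, and the per-chunk greedy CTC decode cannot stitch it back together; overlapping windows or silence-based splitting are common mitigations.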
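
If throughput matters, the per-chunk loop could also be batched into a single forward pass. A hedged sketch, assuming the same global processor and model as app.py: Wav2Vec2Processor accepts a list of arrays with padding=True, though zero-padding without an attention mask may affect accuracy depending on the model's config.

def transcribe_audio_batched(chunks):
    # Pad all chunks to the longest one and decode them in one pass.
    inputs = processor(chunks, return_tensors="pt", sampling_rate=16000, padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    texts = processor.batch_decode(predicted_ids)
    return " ".join(t.replace("[UNK]", "'") for t in texts).strip()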