commit e10af0d (parent: c3d86d3)
Author: bachvudinh

    add @spaces.GPU
CHANGED
@@ -1,5 +1,6 @@
 import gradio as gr
 import torch
+import spaces
 import torchaudio
 from encodec import EncodecModel
 from whisperspeech.vq_stoks import RQBottleneckTransformer
@@ -19,7 +20,7 @@ vq_model = RQBottleneckTransformer.load_model(
     "whisper-vq-stoks-medium-en+pl-fixed.model"
 ).to(device)
 vq_model.ensure_whisper(device)
-
+@spaces.GPU
 def audio_to_sound_tokens_whisperspeech(audio_path):
     wav, sr = torchaudio.load(audio_path)
     if sr != 16000:
@@ -30,6 +31,7 @@ def audio_to_sound_tokens_whisperspeech(audio_path):
 
     result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
     return f'<|sound_start|>{result}<|sound_end|>'
+@spaces.GPU
 def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
     wav, sr = torchaudio.load(audio_path)
     if sr != 16000:
@@ -57,7 +59,7 @@ def audio_to_sound_tokens(audio_path, target_bandwidth=1.5, device="cuda"):
     flatten_tokens = torch.stack((audio_code1, audio_code2), dim=1).flatten().tolist()
     result = ''.join(f'<|sound_{num:04d}|>' for num in flatten_tokens)
     return f'<|sound_start|>{result}<|sound_end|>'
-
+@spaces.GPU
 def setup_pipeline(model_path, use_4bit=False, use_8bit=False):
     tokenizer = AutoTokenizer.from_pretrained(model_path)
     model_kwargs = {"device_map": "auto"}
@@ -79,6 +81,7 @@ tokenizer = pipe.tokenizer
 model = pipe.model
 # print(tokenizer.encode("<|sound_0001|>", add_special_tokens=False))# return the audio tensor
 # print(tokenizer.eos_token)
+@spaces.GPU
 def text_to_audio_file(text):
     # gen a random id for the audio file
     id = str(uuid.uuid4())
@@ -93,6 +96,7 @@ def text_to_audio_file(text):
     # torchaudio.save(temp_file, audio.cpu(), sample_rate=24000)
     print(f"Saved audio to {temp_file}")
     return temp_file
+@spaces.GPU
 def process_input(input_type, text_input=None, audio_file=None):
     # if input_type == "text":
     #     audio_file = "temp_audio.wav"
@@ -102,6 +106,7 @@ def process_input(input_type, text_input=None, audio_file=None):
 
     # if input_type == "text":
     #     os.remove(audio_file)
+@spaces.GPU
 def process_transcribe_input(input_type, text_input=None, audio_file=None):
     # if input_type == "text":
     #     audio_file = "temp_audio.wav"
@@ -119,6 +124,7 @@ class StopOnTokens(StoppingCriteria):
         if input_ids[0][-1] == stop_id:
             return True
         return False
+@spaces.GPU
 def process_audio(audio_file, transcript=False):
     if audio_file is None:
         raise ValueError("No audio file provided")
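For context, spaces.GPU comes from the spaces package used on Hugging Face Spaces with ZeroGPU hardware: the decorator requests a GPU when a decorated function is called and releases it when the call returns, so GPU-bound entry points must be wrapped this way. Below is a minimal sketch of the pattern this commit applies; the function and variable names are illustrative, not taken from app.py:

import spaces
import torch

# On ZeroGPU, moving a model to "cuda" at startup is intercepted by the
# spaces runtime; the device is only actually attached while a
# @spaces.GPU-decorated call is running.

@spaces.GPU  # allocate a GPU for the duration of each call
def embed(wav: torch.Tensor) -> torch.Tensor:
    # stand-in for real inference work on the GPU
    return wav.cuda().float().mean(dim=-1)

@spaces.GPU(duration=120)  # optionally request a longer allocation, in seconds
def long_running(wav: torch.Tensor) -> torch.Tensor:
    return embed(wav)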