gpt-omni committed
Commit 7d577d3
1 Parent(s): 8696667
Files changed (3)
  1. inference.py +13 -13
  2. litgpt/generate/base.py +2 -0
  3. utils/snac_utils.py +2 -0
inference.py CHANGED
@@ -80,7 +80,7 @@ def get_input_ids_TT(text, text_tokenizer):
 
 
 def get_input_ids_whisper(
-    mel, leng, whispermodel, device,
+    mel, leng, whispermodel, device,
     special_token_a=_answer_a, special_token_t=_answer_t,
 ):
 
@@ -102,6 +102,7 @@ def get_input_ids_whisper(
     return audio_feature.unsqueeze(0), input_ids
 
 
+@spaces.GPU
 def get_input_ids_whisper_ATBatch(mel, leng, whispermodel, device):
     with torch.no_grad():
         mel = mel.unsqueeze(0).to(device)
@@ -242,7 +243,7 @@ def A1_A2(fabric, audio_feature, input_ids, leng, model, text_tokenizer, step,
     out_dir = out_dir + "/A1-A2"
     if not os.path.exists(out_dir):
         os.makedirs(out_dir)
-
+
     audio = reconstruct_tensors(audiolist)
     with torch.inference_mode():
         audio_hat = snacmodel.decode(audio)
@@ -346,7 +347,7 @@ def T1_T2(fabric, input_ids, model, text_tokenizer, step):
     model.clear_kv_cache()
     return text_tokenizer.decode(torch.tensor(tokenlist)).strip()
 
-
+
 def load_model(ckpt_dir, device):
     snacmodel = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(device)
     whispermodel = whisper.load_model("small").to(device)
@@ -366,12 +367,12 @@ def load_model(ckpt_dir, device):
 
     return fabric, model, text_tokenizer, snacmodel, whispermodel
 
-
+
 def download_model(ckpt_dir):
     repo_id = "gpt-omni/mini-omni"
     snapshot_download(repo_id, local_dir=ckpt_dir, revision="main")
 
-
+
 class OmniInference:
 
     def __init__(self, ckpt_dir='./checkpoint', device='cuda:0'):
@@ -385,14 +386,13 @@ class OmniInference:
         for _ in self.run_AT_batch_stream(sample):
             pass
 
-    # @torch.inference_mode()
-    @spaces.GPU
-    def run_AT_batch_stream(self,
-                            audio_path,
+    @torch.inference_mode()
+    def run_AT_batch_stream(self,
+                            audio_path,
                             stream_stride=4,
-                            max_returned_tokens=2048,
-                            temperature=0.9,
-                            top_k=1,
+                            max_returned_tokens=2048,
+                            temperature=0.9,
+                            top_k=1,
                             top_p=1.0,
                             eos_id_a=_eoa,
                             eos_id_t=_eot,
@@ -630,7 +630,7 @@ def test_infer():
     for path in test_audio_list:
         mel, leng = load_audio(path)
         audio_feature, input_ids = get_input_ids_whisper(
-            mel, leng, whispermodel, device,
+            mel, leng, whispermodel, device,
             special_token_a=_pad_a, special_token_t=_answer_t
         )
         text = A1_T2(
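Taken together, the inference.py changes move GPU scheduling off the long-lived streaming generator: run_AT_batch_stream keeps only @torch.inference_mode(), while the shorter preprocessing helper get_input_ids_whisper_ATBatch now carries @spaces.GPU. The sketch below illustrates that decorator layout; it assumes the `spaces` package used by Hugging Face ZeroGPU Spaces, and the bodies and trimmed signature are stubbed placeholders, not the repository's actual implementations.

import spaces   # Hugging Face Spaces ZeroGPU decorator package (assumed installed)
import torch


@spaces.GPU   # short, bounded call: holds a ZeroGPU slot only while it runs
def get_input_ids_whisper_ATBatch(mel, leng, whispermodel, device):
    with torch.no_grad():
        mel = mel.unsqueeze(0).to(device)
        ...  # encode with Whisper and build the batched input ids (stubbed)


class OmniInference:
    @torch.inference_mode()   # no autograd bookkeeping for the whole stream
    def run_AT_batch_stream(self, audio_path, stream_stride=4,
                            max_returned_tokens=2048, temperature=0.9,
                            top_k=1, top_p=1.0):
        # No @spaces.GPU here after this commit; one plausible reading is that
        # the long-running generator should not hold a ZeroGPU allocation, so
        # only the shorter inner helpers request GPU time.
        yield b""  # placeholder chunk so this stays a generator (stubbed)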
litgpt/generate/base.py CHANGED
@@ -2,6 +2,7 @@
 
 from typing import Any, Literal, Optional
 
+import spaces
 import torch
 # import torch._dynamo.config
 # import torch._inductor.config
@@ -137,6 +138,7 @@ def next_token_A1T1(
     return next_t
 
 
+@spaces.GPU
 def next_token_batch(
     model: GPT,
     audio_features: torch.tensor,
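The same decorator also accepts a per-call time budget. The snippet below is an illustrative sketch, not code from this commit: the duration=60 value and the trimmed next_token_batch signature are example assumptions only.

import spaces
import torch


@spaces.GPU(duration=60)   # example budget in seconds; the commit uses the bare form
def next_token_batch(model, audio_features, input_ids, **kwargs):
    # One batched forward/sampling step; the GPU slot is released on return.
    with torch.inference_mode():
        ...  # model forward + token sampling (stubbed)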
utils/snac_utils.py CHANGED
@@ -1,5 +1,6 @@
 import torch
 import time
+import spaces
 import numpy as np
 
 
@@ -21,6 +22,7 @@ def layershift(input_id, layer, stride=4160, shift=152000):
     return input_id + shift + layer * stride
 
 
+@spaces.GPU
 def generate_audio_data(snac_tokens, snacmodel, device=None):
     audio = reconstruct_tensors(snac_tokens, device)
     with torch.inference_mode():
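A related caveat: once @spaces.GPU appears in utility modules like this one, `import spaces` must succeed anywhere they are imported. A defensive fallback, not part of this commit and shown purely as an assumed pattern for running the code outside a Space, is a no-op stand-in when the package is missing:

try:
    import spaces
except ImportError:
    # Running outside a Hugging Face Space: keep @spaces.GPU valid as a no-op.
    class spaces:
        @staticmethod
        def GPU(func=None, **kwargs):
            if func is not None:
                return func            # bare @spaces.GPU usage
            return lambda f: f         # parameterized @spaces.GPU(duration=...)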