English
music
music-captioning
Inference Endpoints
ivillar committed on
Commit d99dc9a · 1 Parent(s): 3ac37de

Update handler and requirements

Files changed (2)
  1. handler.py +67 -35
  2. requirements.txt +19 -0
handler.py CHANGED
@@ -1,17 +1,15 @@
 import torch
 from model.bart import BartCaptionModel
 from utils.audio_utils import load_audio, STR_CH_FIRST
+from typing import Dict, List, Any
 import numpy as np
+import librosa
 
-
-def get_audio(audio_path, duration=10, target_sr=16000):
+def preprocess_audio(audio_signal, sr, duration=10, target_sr=16000):
     n_samples = int(duration * target_sr)
-    audio, sr = load_audio(
-        path= audio_path,
-        ch_format= STR_CH_FIRST,
-        sample_rate= target_sr,
-        downmix_to_mono= True,
-    )
+    audio = librosa.to_mono(audio_signal)
+    audio = librosa.resample(audio, orig_sr = sr, target_sr = target_sr)
+
     if len(audio.shape) == 2:
         audio = audio.mean(0, False) # to mono
     input_size = int(n_samples)
@@ -23,31 +21,65 @@ def get_audio(audio_path, duration=10, target_sr=16000):
     audio = torch.from_numpy(np.stack(np.split(audio[:ceil * n_samples], ceil)).astype('float32'))
     return audio
 
-def captioning(audio_path):
-    audio_tensor = get_audio(audio_path = audio_path)
-    if device is not None:
-        audio_tensor = audio_tensor.to(device)
-    with torch.no_grad():
-        output = model.generate(
-            samples=audio_tensor,
-            num_beams=5,
-        )
-    inference = ""
-    number_of_chunks = range(audio_tensor.shape[0])
-    for chunk, text in zip(number_of_chunks, output):
-        time = f"[{chunk * 10}:00-{(chunk + 1) * 10}:00]"
-        inference += f"{time}\n{text} \n \n"
-    return inference
-
-device = "cuda:0" if torch.cuda.is_available() else "cpu"
-
-example_list = ['electronic.mp3', 'orchestra.wav']
-model = BartCaptionModel(max_length = 128)
-pretrained_object = torch.load('./transfer.pth', map_location='cpu')
-state_dict = pretrained_object['state_dict']
-model.load_state_dict(state_dict)
-if torch.cuda.is_available():
-    torch.cuda.set_device(device)
-    model = model.cuda(device)
+class EndpointHandler:
+    def __init__(self, path=""):
+        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        self.model = BartCaptionModel(max_length = 128)
+        pretrained_object = torch.load('./transfer.pth', map_location='cpu')
+        state_dict = pretrained_object['state_dict']
+        self.model.load_state_dict(state_dict)
+        if torch.cuda.is_available():
+            torch.cuda.set_device(self.device)
+            self.model = self.model.cuda(self.device)
+
+    def _captioning(self, audio_tensor):
+        if self.device is not None:
+            audio_tensor = audio_tensor.to(self.device)
+
+        with torch.no_grad():
+            output = self.model.generate(
+                samples=audio_tensor,
+                num_beams=5,
+            )
+        inference = ""
+        number_of_chunks = range(audio_tensor.shape[0])
+        for chunk, text in zip(number_of_chunks, output):
+            time = f"[{chunk * 10}:00-{(chunk + 1) * 10}:00]"
+            inference += f"{time}\n{text} \n \n"
+        return inference
+
+    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        audio_bytes = data["audio_bytes"]
+        audio_shape = tuple([int(x) for x in data["audio_shape"].split(', ')])
+        audio_dtype = data["audio_dtype"]
+        sr = data["sampling_rate"]
+
+        input_audio = np.frombuffer(audio_bytes, dtype=audio_dtype).reshape(audio_shape)
+
+        preprocessed_audio = preprocess_audio(input_audio, sr)
+
+        return self._captioning(preprocessed_audio)
+    """
+    if __name__ == "__main__":
+        import numpy as np
+        from scipy.io.wavfile import write as wav_write
+        from huggingface_hub import InferenceApi
+
+        handler = EndpointHandler()
+        audio_path = "folk.wav"
+        np_audio, sr = librosa.load(audio_path, sr=44100)
+
+        np_bytes = np_audio.tobytes()
+        np_shape = np_audio.shape
+        np_dtype = np_audio.dtype.name
 
-print(captioning("electronic.mp3"))
+        request = {
+            "audio_bytes": np_bytes,
+            "audio_shape": ', '.join(map(str, np_shape)),
+            "audio_dtype": np_dtype,
+            "sampling_rate": sr
+        }
+
+        print(f"Loaded {audio_path} with sample rate {sr}")
+        print(handler.__call__(request))
+    """
requirements.txt ADDED
@@ -0,0 +1,19 @@
+datasets==2.18.0
+huggingface-hub==0.21.4
+julius==0.2.7
+librosa==0.10.1
+multidict==6.0.5
+multiprocess==0.70.16
+numpy==1.26.4
+packaging==23.2
+pandas==2.2.1
+pydub==0.25.1
+scikit-learn==1.4.1.post1
+scipy==1.12.0
+tokenizers==0.13.3
+torch==1.13.1
+torchaudio==0.13.1
+torchaudio-augmentations==0.2.1
+tqdm==4.66.2
+transformers==4.26.1
+wavaugment==0.2