English
music
music-captioning
Inference Endpoints
ivillar committed on
Commit
5e4e6f5
·
1 Parent(s): ef418fe

Change request logic

Browse files
Files changed (1) hide show
  1. handler.py +14 -14
handler.py CHANGED
@@ -5,7 +5,7 @@ from typing import Dict, List, Any
5
  import numpy as np
6
  import librosa
7
  import os
8
-
9
  def preprocess_audio(audio_signal, sr, duration=10, target_sr=16000):
10
  n_samples = int(duration * target_sr)
11
  audio = librosa.to_mono(audio_signal)
@@ -51,14 +51,14 @@ class EndpointHandler:
51
  inference += f"{time}\n{text} \n \n"
52
  return inference
53
 
 
54
  def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
55
- audio_bytes = data["audio_bytes"]
56
- audio_shape = tuple([int(x) for x in data["audio_shape"].split(', ')])
57
- audio_dtype = data["audio_dtype"]
 
58
  sr = data["sampling_rate"]
59
-
60
- input_audio = np.frombuffer(audio_bytes, dtype=audio_dtype).reshape(audio_shape)
61
-
62
  preprocessed_audio = preprocess_audio(input_audio, sr)
63
 
64
  return self._captioning(preprocessed_audio)
@@ -72,17 +72,17 @@ if __name__ == "__main__":
72
  audio_path = "folk.wav"
73
  np_audio, sr = librosa.load(audio_path, sr=44100)
74
 
75
- np_bytes = np_audio.tobytes()
76
  np_shape = np_audio.shape
77
  np_dtype = np_audio.dtype.name
78
-
79
- request = {
80
- "audio_bytes": np_bytes,
81
- "audio_shape": ', '.join(map(str, np_shape)),
82
  "audio_dtype": np_dtype,
83
  "sampling_rate": sr
84
- }
85
 
86
  print(f"Loaded {audio_path} with sample rate {sr}")
87
- print(handler.__call__(request))
88
  """
 
5
  import numpy as np
6
  import librosa
7
  import os
8
+ import json
9
  def preprocess_audio(audio_signal, sr, duration=10, target_sr=16000):
10
  n_samples = int(duration * target_sr)
11
  audio = librosa.to_mono(audio_signal)
 
51
  inference += f"{time}\n{text} \n \n"
52
  return inference
53
 
54
+
55
  def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
56
+ data = json.loads(data["payload"])
57
+ array = np.array(data['audio_list'], dtype=data["audio_dtype"])
58
+ array_shape = data['audio_shape']
59
+ input_audio = array.reshape(array_shape)
60
  sr = data["sampling_rate"]
61
+
 
 
62
  preprocessed_audio = preprocess_audio(input_audio, sr)
63
 
64
  return self._captioning(preprocessed_audio)
 
72
  audio_path = "folk.wav"
73
  np_audio, sr = librosa.load(audio_path, sr=44100)
74
 
75
+ np_list = np_audio.tolist()
76
  np_shape = np_audio.shape
77
  np_dtype = np_audio.dtype.name
78
+
79
+ request = json.dumps({
80
+ "audio_list": np_list,
81
+ "audio_shape": np_shape,
82
  "audio_dtype": np_dtype,
83
  "sampling_rate": sr
84
+ })
85
 
86
  print(f"Loaded {audio_path} with sample rate {sr}")
87
+ print(handler.__call__({"payload": request}))
88
  """