Upload app.py
Browse files
app.py
CHANGED
@@ -1,31 +1,181 @@
|
|
1 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
-
|
|
|
4 |
|
5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
- Processing may take a few minutes depending on file size
|
|
|
1 |
+
# app.py
|
2 |
+
import gradio as gr
|
3 |
+
import librosa
|
4 |
+
import numpy as np
|
5 |
+
from openvino import runtime as ov
|
6 |
+
import soundfile as sf
|
7 |
+
import warnings
|
8 |
+
import os
|
9 |
+
from pathlib import Path
|
# Silence noisy third-party warnings so the console output stays readable.
# NOTE(review): librosa and its dependencies commonly emit FutureWarning /
# UserWarning during load and feature extraction — confirm these are the
# intended targets before narrowing or broadening the filters.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
|
def estimate_key(y, sr):
    """Estimate the musical key using chroma features.

    Averages a constant-Q chromagram over time and returns the pitch
    class with the highest mean energy.
    """
    pitch_classes = ('C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B')
    # Mean chroma energy per pitch class across all frames.
    chroma_profile = librosa.feature.chroma_cqt(y=y, sr=sr).mean(axis=1)
    return pitch_classes[int(np.argmax(chroma_profile))]
21 |
|
def classify_instrument(spectral_centroid, rms_energy):
    """Classify instrument type based on spectral characteristics.

    Low spectral centroids map to bass-range sounds, mid centroids to
    percussive sounds, and higher centroids to synth/high-frequency
    content; RMS energy disambiguates within the low and mid bands.
    """
    if spectral_centroid < 500:
        return "bass" if rms_energy > 0.1 else "sub"
    if spectral_centroid < 2000:
        return "drums" if rms_energy > 0.15 else "perc"
    if spectral_centroid < 4000:
        return "synth"
    return "high"
36 |
|
def get_musical_tempo_description(tempo):
    """Convert numerical tempo (BPM) to a short musical description."""
    # Upper-bound (exclusive) -> label, checked in ascending order.
    bands = ((70, "slow"), (100, "chill"), (120, "upbeat"), (140, "energetic"))
    for upper_bound, label in bands:
        if tempo < upper_bound:
            return label
    return "fast"
49 |
|
def generate_prompt(keys, avg_tempo, streams_info, genre="electronic"):
    """Generate a concise, Suno-friendly prompt under 200 characters."""
    # Most frequent detected key; default to C when nothing was analysed.
    dominant_key = max(set(keys), key=keys.count) if keys else "C"

    # Tally how often each instrument type appears across the streams.
    type_tally = {}
    for stream in streams_info:
        type_tally[stream['type']] = type_tally.get(stream['type'], 0) + 1

    # Keep the two most frequent instrument types as headline elements.
    main_elements = sorted(type_tally, key=type_tally.get, reverse=True)[:2]

    prompt = (
        f"{dominant_key} {int(avg_tempo)}bpm "
        f"{get_musical_tempo_description(avg_tempo)} {genre} "
        f"with {' + '.join(main_elements)}, dark atmosphere + reverb"
    )

    # Hard cap for the prompt length limit.
    return prompt if len(prompt) <= 200 else prompt[:197] + "..."
68 |
|
def process_audio(audio_path, genre):
    """Process an uploaded audio file and generate a Suno prompt.

    Loads the audio, runs the OpenVINO source-separation model, analyses
    each separated stream (key, tempo, spectral features) and builds a
    short text prompt from the aggregated results.

    Args:
        audio_path: Filesystem path to the uploaded audio file.
        genre: Genre label to embed in the generated prompt.

    Returns:
        Tuple of (prompt text or error message, status/character-count text).
        Never raises: all failures are reported via the returned strings.
    """
    try:
        # Load audio at its native sample rate.
        y, sr = librosa.load(audio_path, sr=None)
        print(f"Audio loaded: {len(y)} samples, Sample rate: {sr}")

        # Configure OpenVINO model (expected next to this file under models/).
        model_path = os.path.join(os.path.dirname(__file__), "models", "htdemucs_v4.xml")
        core = ov.Core()
        model = core.read_model(model_path)
        compiled_model = core.compile_model(model, "CPU")

        output_node = compiled_model.output(0)
        # NOTE(review): fixed model input shape — assumes the exported
        # htdemucs graph expects (1, 4, 2048, 336); confirm against the model.
        target_shape = (1, 4, 2048, 336)

        # Pad with zeros or truncate so the waveform exactly fills the input.
        total_size = np.prod(target_shape)
        if len(y) < total_size:
            input_data = np.pad(y, (0, total_size - len(y)), mode='constant')
        else:
            input_data = y[:total_size]

        input_data = input_data.reshape(target_shape).astype(np.float32)
        input_tensor = ov.Tensor(input_data)

        outputs = compiled_model([input_tensor])[output_node]
        separated_audios = outputs[0]

        # Per-stream analysis results.
        keys = []
        avg_tempos = []
        streams_info = []

        # Scratch directory for the separated stream WAV files.
        temp_dir = Path("temp_streams")
        temp_dir.mkdir(exist_ok=True)

        try:
            # Process each separated audio stream independently.
            for i in range(separated_audios.shape[0]):
                stream = separated_audios[i].reshape(-1)
                # Initialised up-front so the finally below never touches
                # an unbound name if stream setup fails.
                output_file = None
                try:
                    output_file = temp_dir / f'separated_stream_{i+1}.wav'
                    sf.write(str(output_file), stream, sr)

                    y_s, sr_s = librosa.load(str(output_file), sr=None)

                    # Skip streams shorter than 0.1 s — too little signal
                    # for meaningful tempo/key analysis.
                    if len(y_s) < sr_s * 0.1:
                        continue

                    # Calculate audio features for this stream.
                    tempo_s, _ = librosa.beat.beat_track(y=y_s, sr=sr_s)
                    spectral_centroid_s = np.mean(librosa.feature.spectral_centroid(y=y_s, sr=sr_s))
                    rms_s = np.mean(librosa.feature.rms(y=y_s))
                    key_s = estimate_key(y_s, sr_s)

                    streams_info.append({
                        'type': classify_instrument(spectral_centroid_s, rms_s),
                        'centroid': spectral_centroid_s,
                        'energy': rms_s
                    })

                    keys.append(key_s)
                    avg_tempos.append(tempo_s)

                except Exception as e:
                    # Best-effort: one bad stream should not abort the rest.
                    print(f"Warning: Could not process stream {i+1}: {str(e)}")
                    continue
                finally:
                    # Clean up this stream's temporary file.
                    if output_file is not None and output_file.exists():
                        output_file.unlink()
        finally:
            # Always remove the scratch directory, even when the loop above
            # raised (previously it leaked in that case). Suppress OSError so
            # a leftover file/race cannot mask the real error.
            try:
                temp_dir.rmdir()
            except OSError:
                pass

        if len(avg_tempos) > 0:
            avg_tempo = np.mean(avg_tempos)
            prompt = generate_prompt(keys, avg_tempo, streams_info, genre)
            return prompt, f"Character count: {len(prompt)}"
        else:
            return "Error: No valid audio streams were processed.", "Processing failed"

    except Exception as e:
        # Surface the failure in the UI instead of crashing the Gradio app.
        return f"Error processing the file: {str(e)}", "Processing failed"
156 |
|
# Create Gradio interface
def create_interface():
    """Build and return the Gradio interface for the prompt generator."""
    genre_dropdown = gr.Dropdown(
        choices=["electronic", "ambient", "trap", "synthwave", "house", "techno"],
        label="Select Genre",
        value="electronic",
    )

    return gr.Interface(
        fn=process_audio,
        inputs=[
            gr.Audio(type="filepath", label="Upload Audio File"),
            genre_dropdown,
        ],
        outputs=[
            gr.Textbox(label="Generated Prompt"),
            gr.Textbox(label="Status"),
        ],
        title="Audio Analysis to Suno Prompt Generator",
        description="Upload an audio file to generate a Suno-compatible prompt based on its musical characteristics.",
        examples=[],
        cache_examples=False,
    )
177 |
|
# Launch the interface
if __name__ == "__main__":
    create_interface().launch()