# app.py
import os
import warnings
from collections import Counter
from pathlib import Path

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
from openvino import runtime as ov

warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
def estimate_key(y, sr):
    """Estimate the musical key of a signal from its averaged chroma energy.

    Args:
        y: 1-D audio waveform.
        sr: Sample rate of *y* in Hz.

    Returns:
        The pitch-class name ("C" .. "B") with the highest mean chroma energy.
    """
    pitch_names = ('C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B')
    # Constant-Q chromagram: one row per pitch class, one column per frame.
    chroma = librosa.feature.chroma_cqt(y=y, sr=sr)
    # The pitch class with the strongest average energy over time wins.
    strongest = int(np.argmax(chroma.mean(axis=1)))
    return pitch_names[strongest]
def classify_instrument(spectral_centroid, rms_energy):
    """Label a stream by its spectral-centroid band, refined by RMS energy.

    Args:
        spectral_centroid: Mean spectral centroid in Hz.
        rms_energy: Mean RMS energy of the stream.

    Returns:
        One of "bass", "sub", "drums", "perc", "synth", or "high".
    """
    # Guard clauses from the top band down; each test excludes the bands above.
    if spectral_centroid >= 4000:
        return "high"
    if spectral_centroid >= 2000:
        return "synth"
    if spectral_centroid >= 500:
        # Mid band: louder material reads as drums, quieter as percussion.
        return "drums" if rms_energy > 0.15 else "perc"
    # Low band: louder material reads as bass, quieter as sub-bass.
    return "bass" if rms_energy > 0.1 else "sub"
def get_musical_tempo_description(tempo):
    """Map a BPM value onto a coarse musical feel descriptor.

    Args:
        tempo: Tempo in beats per minute.

    Returns:
        One of "slow", "chill", "upbeat", "energetic", or "fast".
    """
    # Exclusive upper bounds paired with their labels, in ascending order.
    bands = ((70, "slow"), (100, "chill"), (120, "upbeat"), (140, "energetic"))
    for upper, label in bands:
        if tempo < upper:
            return label
    return "fast"
def generate_prompt(keys, avg_tempo, streams_info, genre="electronic"):
    """Build a concise, Suno-friendly prompt capped at 200 characters.

    Args:
        keys: Per-stream key names (e.g. "C", "F#"); may be empty.
        avg_tempo: Mean tempo in BPM across the analysed streams.
        streams_info: List of dicts, each carrying a 'type' instrument label.
        genre: Genre word to embed in the prompt.

    Returns:
        A prompt string, truncated with "..." if it would exceed 200 chars.
    """
    # Counter.most_common is deterministic on ties (first-seen wins), unlike
    # max(set(keys), ...) whose set iteration order varies between runs.
    most_common_key = Counter(keys).most_common(1)[0][0] if keys else "C"
    # Two most frequent instrument labels; ties resolved by first appearance.
    top_instruments = [name for name, _ in
                       Counter(info['type'] for info in streams_info).most_common(2)]
    # Guard against a malformed "with ," clause when no streams were classified.
    elements = ' + '.join(top_instruments) if top_instruments else "mixed elements"
    tempo_desc = get_musical_tempo_description(avg_tempo)
    prompt = f"{most_common_key} {int(avg_tempo)}bpm {tempo_desc} {genre} with {elements}, dark atmosphere + reverb"
    if len(prompt) > 200:
        # Keep the prompt inside Suno's limit; reserve 3 chars for the ellipsis.
        prompt = prompt[:197] + "..."
    return prompt
def process_audio(audio_path, genre):
    """Separate an audio file into stems, analyse each, and build a Suno prompt.

    Args:
        audio_path: Filesystem path to the uploaded audio file.
        genre: Genre word forwarded to generate_prompt().

    Returns:
        A (prompt, status) tuple of strings; on failure the first element is an
        error message and the second is "Processing failed".
    """
    try:
        # Load audio at its native sample rate (sr=None disables resampling).
        y, sr = librosa.load(audio_path, sr=None)
        print(f"Audio loaded: {len(y)} samples, Sample rate: {sr}")
        # Configure OpenVINO model
        # NOTE(review): model file is expected at ./models/htdemucs_v4.xml next
        # to this script — confirm it ships with the app.
        model_path = os.path.join(os.path.dirname(__file__), "models", "htdemucs_v4.xml")
        core = ov.Core()
        model = core.read_model(model_path)
        compiled_model = core.compile_model(model, "CPU")
        input_node = compiled_model.input(0)
        output_node = compiled_model.output(0)
        # NOTE(review): the raw waveform is padded/cropped to exactly
        # prod(target_shape) samples and reshaped to (1, 4, 2048, 336) —
        # presumably the model's static input shape; verify this matches the
        # layout htdemucs_v4.xml actually expects.
        target_shape = (1, 4, 2048, 336)
        total_size = np.prod(target_shape)
        if len(y) < total_size:
            # Zero-pad short clips up to the model's fixed input size.
            input_data = np.pad(y, (0, total_size - len(y)), mode='constant')
        else:
            # Longer clips are cropped; audio past total_size is ignored.
            input_data = y[:total_size]
        input_data = input_data.reshape(target_shape).astype(np.float32)
        input_tensor = ov.Tensor(input_data)
        outputs = compiled_model([input_tensor])[output_node]
        separated_audios = outputs[0]
        # Analysis lists — kept parallel: one entry per successfully
        # processed stream.
        keys = []
        avg_tempos = []
        streams_info = []
        # Create temporary directory for separated streams
        temp_dir = Path("temp_streams")
        temp_dir.mkdir(exist_ok=True)
        # Process each separated audio stream
        for i in range(separated_audios.shape[0]):
            # Flatten the stream back to a 1-D waveform for writing/analysis.
            stream = separated_audios[i].reshape(-1)
            try:
                output_file = temp_dir / f'separated_stream_{i+1}.wav'
                # Round-trip through a WAV file so librosa re-reads the stream.
                sf.write(str(output_file), stream, sr)
                y_s, sr_s = librosa.load(str(output_file), sr=None)
                # Skip streams shorter than ~0.1 s — too little signal to analyse.
                if len(y_s) < sr_s * 0.1:
                    continue
                # Calculate audio features
                tempo_s, _ = librosa.beat.beat_track(y=y_s, sr=sr_s)
                spectral_centroid_s = np.mean(librosa.feature.spectral_centroid(y=y_s, sr=sr_s))
                rms_s = np.mean(librosa.feature.rms(y=y_s))
                key_s = estimate_key(y_s, sr_s)
                # Store all information
                streams_info.append({
                    'type': classify_instrument(spectral_centroid_s, rms_s),
                    'centroid': spectral_centroid_s,
                    'energy': rms_s
                })
                keys.append(key_s)
                avg_tempos.append(tempo_s)
            except Exception as e:
                # Best-effort per stream: a bad stream is logged and skipped,
                # never allowed to abort the whole analysis.
                print(f"Warning: Could not process stream {i+1}: {str(e)}")
                continue
            finally:
                # Clean up temporary file
                if output_file.exists():
                    output_file.unlink()
        # Clean up temporary directory
        # NOTE(review): rmdir() raises if the directory is not empty (e.g.
        # leftovers from a previous crashed run) — confirm this is acceptable.
        temp_dir.rmdir()
        if len(avg_tempos) > 0:
            avg_tempo = np.mean(avg_tempos)
            prompt = generate_prompt(keys, avg_tempo, streams_info, genre)
            return prompt, f"Character count: {len(prompt)}"
        else:
            return "Error: No valid audio streams were processed.", "Processing failed"
    except Exception as e:
        # Top-level Gradio boundary: surface any failure as an error string
        # rather than letting the exception escape the UI callback.
        return f"Error processing the file: {str(e)}", "Processing failed"
# Create Gradio interface
def create_interface():
    """Assemble and return the Gradio UI for the prompt generator."""
    return gr.Interface(
        fn=process_audio,
        inputs=[
            gr.Audio(type="filepath", label="Upload Audio File"),
            gr.Dropdown(
                choices=["electronic", "ambient", "trap", "synthwave", "house", "techno"],
                label="Select Genre",
                value="electronic",
            ),
        ],
        outputs=[
            gr.Textbox(label="Generated Prompt"),
            gr.Textbox(label="Status"),
        ],
        title="Audio Analysis to Suno Prompt Generator",
        description="Upload an audio file to generate a Suno-compatible prompt based on its musical characteristics.",
        examples=[],
        cache_examples=False,
    )
# Launch the interface when run as a script (not on import).
if __name__ == "__main__":
    create_interface().launch()