Final changes
Browse files- app.py +126 -136
- audio_splitting.py +5 -20
- feature_extraction.py +11 -3
app.py
CHANGED
@@ -3,153 +3,143 @@ import joblib
|
|
3 |
import pandas as pd
|
4 |
import numpy as np
|
5 |
import xgboost
|
6 |
-
|
7 |
-
# from tensorflow.keras.models import load_model
|
8 |
-
import tensorflow
|
9 |
-
from keras.losses import binary_crossentropy
|
10 |
-
from keras.optimizers import Adam
|
11 |
-
from tensorflow import keras
|
12 |
-
from keras.models import load_model
|
13 |
# Local Imports
|
14 |
import feature_extraction
|
15 |
import audio_splitting
|
16 |
|
17 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
18 |
st.title("Music Genre Classifier")
|
19 |
st.write("A single-label music genre classifier based and trained on the GTZAN Dataset available for use on "
|
20 |
"Kaggle. All the models have been trained on that dataset.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
# Upload music file
|
22 |
-
|
|
|
23 |
|
|
|
24 |
if uploaded_file is not None:
|
25 |
# User selects a model
|
26 |
-
all_models = ["K-Nearest Neighbors
|
27 |
-
"
|
28 |
-
"
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
multi_class_names = ["Metal", "Jazz", "Blues", "R&B", "Classical", "Reggae", "Rap & Hip-Hop", "Punk", "Rock",
|
33 |
-
"Country", "Bebop", "Pop", "Soul", "Dance & Electronic", "Folk"]
|
34 |
-
|
35 |
-
class_names = ["Blues", "Classical", "Country", "Disco", "HipHop",
|
36 |
-
"Jazz", "Metal", "Pop", "Reggae", "Rock"]
|
37 |
-
|
38 |
-
col1, col2 = st.columns(2)
|
39 |
-
s = ''
|
40 |
-
with col1:
|
41 |
-
for i in class_names[:5]:
|
42 |
-
s += "- " + i + "\n"
|
43 |
-
st.markdown(s)
|
44 |
-
|
45 |
-
s = ''
|
46 |
-
|
47 |
-
with col2:
|
48 |
-
for i in class_names[5:]:
|
49 |
-
s += "- " + i + "\n"
|
50 |
-
st.markdown(s)
|
51 |
-
# st.write(multi_class_names)
|
52 |
-
|
53 |
-
# Load the selected model
|
54 |
-
if model_name == "K-Nearest Neighbors - (Single Label)":
|
55 |
-
model = joblib.load("./models/knn.pkl")
|
56 |
-
elif model_name == "Logistic Regression - (Single Label)":
|
57 |
-
model = joblib.load("./models/logistic.pkl")
|
58 |
-
elif model_name == "Support Vector Machines - (Single Label)":
|
59 |
-
model = joblib.load("./models/svm.pkl")
|
60 |
-
elif model_name == "Neural Network - (Single Label)":
|
61 |
-
model = joblib.load("./models/nn.pkl")
|
62 |
-
elif model_name == "XGB Classifier - (Single Label)":
|
63 |
-
model = joblib.load("./models/xgb.pkl")
|
64 |
-
elif model_name == "XGB - (Multi Label)":
|
65 |
-
model = joblib.load("./models/xgb_mlb.pkl")
|
66 |
-
elif model_name == "Convolutional Recurrent Neural Network - (Multi Label)":
|
67 |
-
model = tensorflow.keras.models.load_model("../models/model_crnn1.h5", compile=False)
|
68 |
-
model.compile(loss=binary_crossentropy,
|
69 |
-
optimizer=Adam(),
|
70 |
-
metrics=['accuracy'])
|
71 |
-
elif model_name == "Neural Network - (Multi Label)":
|
72 |
-
model = tensorflow.keras.models.load_model("../models/model_nn.h5", compile=False)
|
73 |
-
model.compile(loss=binary_crossentropy,
|
74 |
-
optimizer=Adam(),
|
75 |
-
metrics=['accuracy'])
|
76 |
-
elif model_name == "Batch Normalization - (Multi Label)":
|
77 |
-
model = tensorflow.keras.models.load_model("../models/model_bn.h5", compile=False)
|
78 |
-
model.compile(loss=binary_crossentropy,
|
79 |
-
optimizer=Adam(),
|
80 |
-
metrics=['accuracy'])
|
81 |
-
# class_names = ["blues", "classical", "country", "disco", "hiphop", "jazz", "metal", "pop", "reggae", "rock"]
|
82 |
-
|
83 |
-
xgb_multi_class_names = ["Rock", "Rap & Hip-Hop", "Soul", "Classical", "Dance & Electronic", "Blues","Jazz",
|
84 |
-
"Country","Bebop","Folk","Reggae","R&B","Punk","Metal","Pop"]
|
85 |
-
|
86 |
-
xmulti_class_names = ["Metal", "Blues", "Reggae", "Jazz", "Rock", "Folk", "Classical", "Dance & Electronic",
|
87 |
-
"Punk","Bebop", "Pop", "R&B", "Country", "Rap & Hip-Hop", "Soul"]
|
88 |
-
class_indices = {i: class_name for i, class_name in enumerate(class_names)}
|
89 |
-
|
90 |
-
features_list,val_list = audio_splitting.split_audio(uploaded_file)
|
91 |
-
features = feature_extraction.scale(features_list)
|
92 |
|
93 |
-
|
94 |
-
|
95 |
-
df = pd.DataFrame({
|
96 |
-
"fname": ["Chroma_STFT"],
|
97 |
-
"Values": val_list
|
98 |
-
})
|
99 |
-
st.dataframe(
|
100 |
-
df,
|
101 |
-
column_config={
|
102 |
-
"name": "Features",
|
103 |
-
"Values": st.column_config.LineChartColumn(
|
104 |
-
"Graph Values",y_min=0,y_max = 10000
|
105 |
-
)
|
106 |
-
}
|
107 |
-
)
|
108 |
-
|
109 |
-
|
110 |
-
# Reshape the features to match the expected shape for prediction
|
111 |
-
reshaped_features = features.reshape(1, -1)
|
112 |
-
if model_name == "XGB - (Multi Label)":
|
113 |
-
# Predict labels for the input features
|
114 |
-
predicted_indices = model.predict(reshaped_features)
|
115 |
-
print(predicted_indices)
|
116 |
-
predicted_labels = []
|
117 |
-
for i in range(0,len(predicted_indices[0])):
|
118 |
-
if predicted_indices[0][i]==1.0:
|
119 |
-
predicted_labels.append(xgb_multi_class_names[i])
|
120 |
-
if predicted_labels:
|
121 |
-
st.write(f"Predicted Genres: {', '.join(predicted_labels)}")
|
122 |
-
else:
|
123 |
-
st.write("No genres predicted for this input.")
|
124 |
-
if model_name == "XGB Classifier - (Single Label)":
|
125 |
-
predicted_indices = model.predict(reshaped_features)
|
126 |
-
predicted_labels = [class_indices[i] for i in predicted_indices]
|
127 |
-
st.write(f"Predicted Genre: {predicted_labels[0]}")
|
128 |
-
elif model_name == "Convolutional Recurrent Neural Network - (Multi Label)"\
|
129 |
-
or model_name == "Neural Network - (Multi Label)"\
|
130 |
-
or model_name == "Batch Normalization - (Multi Label)":
|
131 |
-
predicted_probabilities = model.predict(reshaped_features)
|
132 |
-
|
133 |
-
# Set the probability threshold for class prediction (0.3 here)
|
134 |
-
threshold = 0.3
|
135 |
-
print(predicted_probabilities)
|
136 |
-
probabilities = []
|
137 |
-
if model_name == "Convolutional Recurrent Neural Network - (Multi Label)":
|
138 |
-
predicted_labels = [class_name for i, class_name in enumerate(multi_class_names) if
|
139 |
-
predicted_probabilities[0][i] >= threshold]
|
140 |
-
probabilities = [(class_name,predicted_probabilities[0][i]*100) for i, class_name in enumerate(multi_class_names)]
|
141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
else:
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
if predicted_labels:
|
148 |
-
st.write(f"All probabilities are:")
|
149 |
-
st.write(probabilities)
|
150 |
-
st.write(f"Predicted Genres: {', '.join(predicted_labels)}")
|
151 |
-
else:
|
152 |
-
st.write("No genre predicted above the threshold.")
|
153 |
-
else:
|
154 |
-
predicted_label = model.predict(features)[0]
|
155 |
-
st.metric("Predicted Genre:",str(predicted_label))
|
|
|
3 |
import pandas as pd
|
4 |
import numpy as np
|
5 |
import xgboost
|
6 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
# Local Imports
|
8 |
import feature_extraction
|
9 |
import audio_splitting
|
10 |
|
11 |
+
st.set_page_config(layout="wide")
|
12 |
+
# Vars
|
13 |
+
fields_df = ['Chromagram Short-Time Fourier Transform (Chroma-STFT)',
|
14 |
+
'Root Mean Square Energy (RMS)',
|
15 |
+
'Spectral Centroid',
|
16 |
+
'Spectral Bandwidth',
|
17 |
+
'Spectral Rolloff',
|
18 |
+
'Zero Crossing Rate',
|
19 |
+
'Harmony',
|
20 |
+
'Percussion',
|
21 |
+
'Tempo',
|
22 |
+
'Mel-Frequency Cepstral Coefficients (MFCC-1)',
|
23 |
+
'MFCC-2',
|
24 |
+
'MFCC-3',
|
25 |
+
'MFCC-4',
|
26 |
+
'MFCC-5',
|
27 |
+
'MFCC-6',
|
28 |
+
'MFCC-7',
|
29 |
+
'MFCC-8',
|
30 |
+
'MFCC-9',
|
31 |
+
'MFCC-10',
|
32 |
+
'MFCC-11',
|
33 |
+
'MFCC-12',
|
34 |
+
'MFCC-13',
|
35 |
+
'MFCC-14',
|
36 |
+
'MFCC-15',
|
37 |
+
'MFCC-16',
|
38 |
+
'MFCC-17',
|
39 |
+
'MFCC-18',
|
40 |
+
'MFCC-19',
|
41 |
+
'MFCC-20', ]
|
42 |
+
|
43 |
st.title("Music Genre Classifier")
|
44 |
st.write("A single-label music genre classifier based and trained on the GTZAN Dataset available for use on "
|
45 |
"Kaggle. All the models have been trained on that dataset.")
|
46 |
+
|
47 |
+
st.write("Prediction of following genres")
|
48 |
+
|
49 |
+
class_names = ["Blues", "Classical", "Country", "Disco", "HipHop",
|
50 |
+
"Jazz", "Metal", "Pop", "Reggae", "Rock"]
|
51 |
+
|
52 |
+
class_indices = {i: class_name for i, class_name in enumerate(class_names)}
|
53 |
+
|
54 |
+
col1, col2 = st.columns(2)
|
55 |
+
s = ''
|
56 |
+
with col1:
|
57 |
+
for i in class_names[:5]:
|
58 |
+
s += "- " + i + "\n"
|
59 |
+
st.markdown(s)
|
60 |
+
|
61 |
+
s = ''
|
62 |
+
|
63 |
+
with col2:
|
64 |
+
for i in class_names[5:]:
|
65 |
+
s += "- " + i + "\n"
|
66 |
+
st.markdown(s)
|
67 |
+
|
68 |
+
st.divider()
|
69 |
# Upload music file
|
70 |
+
st.subheader("Upload a music file")
|
71 |
+
uploaded_file = st.file_uploader("Upload a music file", type=["mp3", "wav", "ogg"], label_visibility="collapsed")
|
72 |
|
73 |
+
st.divider()
|
74 |
if uploaded_file is not None:
|
75 |
# User selects a model
|
76 |
+
all_models = ["K-Nearest Neighbors",
|
77 |
+
"Logistic Regression",
|
78 |
+
"Support Vector Machines",
|
79 |
+
"Neural Network",
|
80 |
+
"XGB Classifier"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
+
features_list, val_list = audio_splitting.split_audio(uploaded_file)
|
83 |
+
features = feature_extraction.scale(features_list)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
|
85 |
+
feature_copy = features_list
|
86 |
+
feature_copy.insert(19, "-")
|
87 |
+
st.header("Feature Extraction")
|
88 |
+
|
89 |
+
st.write("The given audio sample is processed using the librosa library to get the features extracted used by the "
|
90 |
+
"models for genre prediction. Following is the dataframe with each of the feature extracted and "
|
91 |
+
"corresponding mean and variance of the feature")
|
92 |
+
|
93 |
+
col3, col4 = st.columns([0.6,0.4])
|
94 |
+
with col3:
|
95 |
+
|
96 |
+
# Features Dataframe
|
97 |
+
df = pd.DataFrame({
|
98 |
+
"name": fields_df,
|
99 |
+
"Mean": feature_copy[2::2],
|
100 |
+
"Variance": feature_copy[3::2]
|
101 |
+
})
|
102 |
+
|
103 |
+
st.dataframe(
|
104 |
+
df,
|
105 |
+
column_config={
|
106 |
+
"name": "Features",
|
107 |
+
"Mean": "Mean of Feature",
|
108 |
+
"Variance": "Variance of Feature"
|
109 |
+
},
|
110 |
+
use_container_width=True
|
111 |
+
)
|
112 |
+
|
113 |
+
with col4:
|
114 |
+
|
115 |
+
col1, col2 = st.columns([0.55, 0.45])
|
116 |
+
|
117 |
+
col1.subheader("Select a model")
|
118 |
+
with col1:
|
119 |
+
model_name = st.selectbox("Select a model", all_models, label_visibility="collapsed")
|
120 |
+
|
121 |
+
# Load the selected model
|
122 |
+
if model_name == "K-Nearest Neighbors":
|
123 |
+
model = joblib.load("./models/knn.pkl")
|
124 |
+
elif model_name == "Logistic Regression":
|
125 |
+
model = joblib.load("./models/logistic.pkl")
|
126 |
+
elif model_name == "Support Vector Machines":
|
127 |
+
model = joblib.load("./models/svm.pkl")
|
128 |
+
elif model_name == "Neural Network":
|
129 |
+
model = joblib.load("./models/nn.pkl")
|
130 |
+
elif model_name == "XGB Classifier":
|
131 |
+
model = joblib.load("./models/xgb.pkl")
|
132 |
+
col2.subheader("Predicted genre")
|
133 |
+
|
134 |
+
# Reshape the features to match the expected shape for prediction
|
135 |
+
reshaped_features = features.reshape(1, -1)
|
136 |
+
|
137 |
+
if model_name == "XGB Classifier":
|
138 |
+
predicted_indices = model.predict(reshaped_features)
|
139 |
+
predicted_labels = [class_indices[i] for i in predicted_indices]
|
140 |
+
with col2:
|
141 |
+
st.metric("Predicted Genre:", str(predicted_labels[0]), label_visibility="collapsed")
|
142 |
else:
|
143 |
+
predicted_label = model.predict(features)[0]
|
144 |
+
with col2:
|
145 |
+
st.metric("Predicted Genre:", str(predicted_label).capitalize(), label_visibility="collapsed")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
audio_splitting.py
CHANGED
@@ -1,41 +1,26 @@
|
|
1 |
-
import pydub
|
2 |
-
import streamlit
|
3 |
from pydub import AudioSegment
|
4 |
import feature_extraction
|
5 |
import io
|
6 |
def split_audio(uploaded_file):
    """Cut a 3-second clip out of the uploaded audio and extract its features.

    The clip is taken from 1:00 to 1:03 when the recording lasts at least
    1 minute 3 seconds; otherwise the first 3 seconds are used. The clip is
    exported as WAV into an in-memory buffer, which is handed to
    ``feature_extraction.all_feature_extraction``.

    Args:
        uploaded_file: Audio source accepted by ``AudioSegment.from_file``.

    Returns:
        Whatever ``feature_extraction.all_feature_extraction`` returns for
        the clip. The result is also written to the Streamlit page.
    """
    recording = AudioSegment.from_file(uploaded_file)
    print("Works")

    clip_ms = 3 * 1000  # clip length: 3 seconds, in milliseconds
    total_ms = len(recording)
    print("works")

    if total_ms >= 63 * 1000:
        # Long enough: take the window that starts at the one-minute mark.
        offset_ms = 60 * 1000
        clip = recording[offset_ms:offset_ms + clip_ms]
    else:
        # Too short for that window: fall back to the opening 3 seconds.
        clip = recording[:clip_ms]

    wav_buffer = io.BytesIO()
    clip.export(wav_buffer, format="wav")
    # Rewind so the feature extractor reads the buffer from its start.
    wav_buffer.seek(0)

    extracted = feature_extraction.all_feature_extraction(wav_buffer)
    streamlit.write(extracted)
    return extracted
|
38 |
-
# output_file = "D:/miniproject/output_segment.wav"
|
39 |
-
|
40 |
-
# Save the segment to a new file
|
41 |
-
# segment.export(output_file, format="wav")
|
|
|
|
|
|
|
1 |
from pydub import AudioSegment
|
2 |
import feature_extraction
|
3 |
import io
|
4 |
def split_audio(uploaded_file):
    """Cut a 3-second clip out of the uploaded audio and extract its features.

    The clip is taken from 1:00 to 1:03 when the recording lasts at least
    1 minute 3 seconds; otherwise the first 3 seconds are used. The clip is
    exported as WAV into an in-memory buffer, which is handed to
    ``feature_extraction.all_feature_extraction``.

    Args:
        uploaded_file: Audio source accepted by ``AudioSegment.from_file``.

    Returns:
        Whatever ``feature_extraction.all_feature_extraction`` returns for
        the clip.
    """
    clip_ms = 3 * 1000  # clip length: 3 seconds, in milliseconds
    recording = AudioSegment.from_file(uploaded_file)

    if len(recording) >= 63 * 1000:
        # Long enough: take the window that starts at the one-minute mark.
        offset_ms = 60 * 1000
        clip = recording[offset_ms:offset_ms + clip_ms]
    else:
        # Too short for that window: fall back to the opening 3 seconds.
        clip = recording[:clip_ms]

    wav_buffer = io.BytesIO()
    clip.export(wav_buffer, format="wav")
    # Rewind so the feature extractor reads the buffer from its start.
    wav_buffer.seek(0)

    return feature_extraction.all_feature_extraction(wav_buffer)
|
|
|
|
|
|
|
|
|
|
|
|
feature_extraction.py
CHANGED
@@ -32,11 +32,17 @@ short_field = Fields[2:]
|
|
32 |
def all_feature_extraction(audio_path, sample_rate=22050):
|
33 |
data_list = []
|
34 |
val_field = []
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
data_list.append(audio_path)
|
38 |
data_list.append(len(audio_df))
|
39 |
-
|
40 |
|
41 |
# 1. Chroma STFT
|
42 |
chroma_stft = librosa.feature.chroma_stft(y=audio_df, hop_length=512)
|
@@ -47,6 +53,8 @@ def all_feature_extraction(audio_path, sample_rate=22050):
|
|
47 |
data_list.append(chroma_stft_mean)
|
48 |
data_list.append(chroma_stft_var)
|
49 |
|
|
|
|
|
50 |
# 2. RMS
|
51 |
rms = librosa.feature.rms(y=audio_df)
|
52 |
rms_mean = np.mean(rms)
|
|
|
32 |
def all_feature_extraction(audio_path, sample_rate=22050):
|
33 |
data_list = []
|
34 |
val_field = []
|
35 |
+
print(data_list)
|
36 |
+
try:
|
37 |
+
audio_df, sr = sf.read(audio_path)
|
38 |
+
print("Audio loaded successfully.")
|
39 |
+
print("Shape of audio data:", audio_df.shape)
|
40 |
+
print("Sample rate:", sr)
|
41 |
+
except Exception as e:
|
42 |
+
print("Error loading audio file:", e)
|
43 |
data_list.append(audio_path)
|
44 |
data_list.append(len(audio_df))
|
45 |
+
print(data_list)
|
46 |
|
47 |
# 1. Chroma STFT
|
48 |
chroma_stft = librosa.feature.chroma_stft(y=audio_df, hop_length=512)
|
|
|
53 |
data_list.append(chroma_stft_mean)
|
54 |
data_list.append(chroma_stft_var)
|
55 |
|
56 |
+
print(data_list,val_field)
|
57 |
+
|
58 |
# 2. RMS
|
59 |
rms = librosa.feature.rms(y=audio_df)
|
60 |
rms_mean = np.mean(rms)
|