Spaces:

Hetan07
/

Single_Label_Music_Genre_Classifier

Running

App Files Files Community

Hetan07 committed on Jan 11

Commit

9100bbc

•

1 Parent(s): dc45e5e

Final changes

Browse files

Files changed (3) hide show

app.py +126 -136
audio_splitting.py +5 -20
feature_extraction.py +11 -3

app.py CHANGED Viewed

@@ -3,153 +3,143 @@ import joblib
 import pandas as pd
 import numpy as np
 import xgboost
-# from sklearn.ensemble import GradientBoostingClassifier
-# from tensorflow.keras.models import load_model
-import tensorflow
-from keras.losses import binary_crossentropy
-from keras.optimizers import Adam
-from tensorflow import keras
-from keras.models import load_model
 # Local Imports
 import feature_extraction
 import audio_splitting
-# Create a Streamlit web app
 st.title("Music Genre Classifier")
 st.write("A single-label music genre classifier based and trained on the GTZAN Dataset available for use on "
          "Kaggle. All the models have been trained on that dataset.")
 # Upload music file
-uploaded_file = st.file_uploader("Upload a music file", type=["mp3", "wav"])
 if uploaded_file is not None:
     # User selects a model
-    all_models = ["K-Nearest Neighbors - (Single Label)", "Logistic Regression - (Single Label)", "Support Vector Machines - (Single Label)",
-                  "Neural Network - (Single Label)",
-                  "XGB Classifier - (Single Label)"]
-    model_name = st.selectbox("Select a model", all_models)
-    st.write(f"Predicition of following genres")
-    multi_class_names = ["Metal", "Jazz", "Blues", "R&B", "Classical", "Reggae", "Rap & Hip-Hop", "Punk", "Rock",
-                         "Country", "Bebop", "Pop", "Soul", "Dance & Electronic", "Folk"]
-    class_names = ["Blues", "Classical", "Country", "Disco", "HipHop",
-                   "Jazz", "Metal", "Pop", "Reggae", "Rock"]
-    col1, col2 = st.columns(2)
-    s = ''
-    with col1:
-        for i in class_names[:5]:
-            s += "- " + i + "\n"
-        st.markdown(s)
-    s = ''
-    with col2:
-        for i in class_names[5:]:
-            s += "- " + i + "\n"
-        st.markdown(s)
-    # st.write(multi_class_names)
-    # Load the selected model
-    if model_name == "K-Nearest Neighbors - (Single Label)":
-        model = joblib.load("./models/knn.pkl")
-    elif model_name == "Logistic Regression - (Single Label)":
-        model = joblib.load("./models/logistic.pkl")
-    elif model_name == "Support Vector Machines - (Single Label)":
-        model = joblib.load("./models/svm.pkl")
-    elif model_name == "Neural Network - (Single Label)":
-        model = joblib.load("./models/nn.pkl")
-    elif model_name == "XGB Classifier - (Single Label)":
-        model = joblib.load("./models/xgb.pkl")
-    elif model_name == "XGB - (Multi Label)":
-        model = joblib.load("./models/xgb_mlb.pkl")
-    elif model_name == "Convolutional Recurrent Neural Network - (Multi Label)":
-        model = tensorflow.keras.models.load_model("../models/model_crnn1.h5", compile=False)
-        model.compile(loss=binary_crossentropy,
-                      optimizer=Adam(),
-                      metrics=['accuracy'])
-    elif model_name == "Neural Network - (Multi Label)":
-        model = tensorflow.keras.models.load_model("../models/model_nn.h5", compile=False)
-        model.compile(loss=binary_crossentropy,
-                      optimizer=Adam(),
-                      metrics=['accuracy'])
-    elif model_name == "Batch Normalization - (Multi Label)":
-        model = tensorflow.keras.models.load_model("../models/model_bn.h5", compile=False)
-        model.compile(loss=binary_crossentropy,
-                      optimizer=Adam(),
-                      metrics=['accuracy'])
-    # class_names = ["blues", "classical", "country", "disco", "hiphop", "jazz", "metal", "pop", "reggae", "rock"]
-    xgb_multi_class_names = ["Rock", "Rap & Hip-Hop", "Soul", "Classical", "Dance & Electronic", "Blues","Jazz",
-                             "Country","Bebop","Folk","Reggae","R&B","Punk","Metal","Pop"]
-    xmulti_class_names = ["Metal", "Blues", "Reggae", "Jazz", "Rock", "Folk", "Classical", "Dance & Electronic",
-                         "Punk","Bebop", "Pop", "R&B", "Country", "Rap & Hip-Hop", "Soul"]
-    class_indices = {i: class_name for i, class_name in enumerate(class_names)}
-    features_list,val_list = audio_splitting.split_audio(uploaded_file)
-    features = feature_extraction.scale(features_list)
-    # st.write(features)
-    # Features Dataframe
-    df = pd.DataFrame({
-        "fname": ["Chroma_STFT"],
-        "Values": val_list
-    })
-    st.dataframe(
-        df,
-        column_config={
-            "name": "Features",
-            "Values": st.column_config.LineChartColumn(
-                "Graph Values",y_min=0,y_max = 10000
-            )
-        }
-    )
-    # Reshape the features to match the expected shape for prediction
-    reshaped_features = features.reshape(1, -1)
-    if model_name == "XGB - (Multi Label)":
-        # Predict labels for the input features
-        predicted_indices = model.predict(reshaped_features)
-        print(predicted_indices)
-        predicted_labels = []
-        for i in range(0,len(predicted_indices[0])):
-            if predicted_indices[0][i]==1.0:
-                predicted_labels.append(xgb_multi_class_names[i])
-        if predicted_labels:
-            st.write(f"Predicted Genres: {', '.join(predicted_labels)}")
-        else:
-            st.write("No genres predicted for this input.")
-    if model_name == "XGB Classifier - (Single Label)":
-        predicted_indices = model.predict(reshaped_features)
-        predicted_labels = [class_indices[i] for i in predicted_indices]
-        st.write(f"Predicted Genre: {predicted_labels[0]}")
-    elif model_name == "Convolutional Recurrent Neural Network - (Multi Label)"\
-            or model_name == "Neural Network - (Multi Label)"\
-            or model_name == "Batch Normalization - (Multi Label)":
-        predicted_probabilities = model.predict(reshaped_features)
-        # Set a threshold for class prediction (e.g., 0.5)
-        threshold = 0.3
-        print(predicted_probabilities)
-        probabilities = []
-        if model_name == "Convolutional Recurrent Neural Network - (Multi Label)":
-            predicted_labels = [class_name for i, class_name in enumerate(multi_class_names) if
-                                predicted_probabilities[0][i] >= threshold]
-            probabilities = [(class_name,predicted_probabilities[0][i]*100) for i, class_name in enumerate(multi_class_names)]
         else:
-            predicted_labels = [class_name for i,class_name in enumerate(xmulti_class_names) if
-                                predicted_probabilities[0][i] >= threshold]
-            probabilities = [(class_name,predicted_probabilities[0][i]*100) for i, class_name in enumerate(xmulti_class_names)]
-        if predicted_labels:
-            st.write(f"All probabilities are:")
-            st.write(probabilities)
-            st.write(f"Predicted Genres: {', '.join(predicted_labels)}")
-        else:
-            st.write("No genre predicted above the threshold.")
-    else:
-        predicted_label = model.predict(features)[0]
-        st.metric("Predicted Genre:",str(predicted_label))

 import pandas as pd
 import numpy as np
 import xgboost
 # Local Imports
 import feature_extraction
 import audio_splitting
+st.set_page_config(layout="wide")
+# Vars
+fields_df = ['Chromagram Short-Time Fourier Transform (Chroma-STFT)',
+             'Root Mean Square Energy (RMS)',
+             'Spectral Centroid',
+             'Spectral Bandwidth',
+             'Spectral Rolloff',
+             'Zero Crossing Rate',
+             'Harmony',
+             'Percussion',
+             'Tempo',
+             'Mel-Frequency Cepstral Coefficients (MFCC-1)',
+             'MFCC-2',
+             'MFCC-3',
+             'MFCC-4',
+             'MFCC-5',
+             'MFCC-6',
+             'MFCC-7',
+             'MFCC-8',
+             'MFCC-9',
+             'MFCC-10',
+             'MFCC-11',
+             'MFCC-12',
+             'MFCC-13',
+             'MFCC-14',
+             'MFCC-15',
+             'MFCC-16',
+             'MFCC-17',
+             'MFCC-18',
+             'MFCC-19',
+             'MFCC-20', ]
 # Page heading and a short description of what the app does.
 st.title("Music Genre Classifier")
 st.write("A single-label music genre classifier based and trained on the GTZAN Dataset available for use on "
          "Kaggle. All the models have been trained on that dataset.")
 st.write("Prediction of following genres")
 # Genres offered by the classifiers; class_indices maps each list position
 # to its genre name.
 class_names = ["Blues", "Classical", "Country", "Disco", "HipHop",
                "Jazz", "Metal", "Pop", "Reggae", "Rock"]
 class_indices = dict(enumerate(class_names))
 # Show the genres as two bulleted markdown lists, five per column.
 col1, col2 = st.columns(2)
 for column, genres in ((col1, class_names[:5]), (col2, class_names[5:])):
     with column:
         st.markdown("".join(f"- {genre}\n" for genre in genres))
 st.divider()
 # Upload music file
 # The visible heading is the subheader; the uploader's own label is collapsed.
 st.subheader("Upload a music file")
 uploaded_file = st.file_uploader("Upload a music file", type=["mp3", "wav", "ogg"], label_visibility="collapsed")
 st.divider()
 # Everything below runs only once the user has uploaded an audio file:
 # extract features, show them in a table, let the user pick a model and
 # display the genre that model predicts.
 if uploaded_file is not None:
     # Single source of truth for the selectable models: display name -> file
     # holding the serialized estimator. The select box options are derived
     # from it, so the option list and the loader cannot drift apart.
     model_paths = {
         "K-Nearest Neighbors": "./models/knn.pkl",
         "Logistic Regression": "./models/logistic.pkl",
         "Support Vector Machines": "./models/svm.pkl",
         "Neural Network": "./models/nn.pkl",
         "XGB Classifier": "./models/xgb.pkl",
     }
     all_models = list(model_paths)
     # val_list is returned alongside the features but is not used on this page.
     features_list, val_list = audio_splitting.split_audio(uploaded_file)
     features = feature_extraction.scale(features_list)
     # Take a real copy for display so the placeholder inserted below does not
     # mutate the list returned by the feature extractor.
     feature_copy = list(features_list)
     # Entries from index 2 onward alternate between the "Mean" and "Variance"
     # columns. The "-" at index 19 fills the Variance cell of table row 8
     # ('Tempo'), keeping both slices the same length as fields_df.
     feature_copy.insert(19, "-")
     st.header("Feature Extraction")
     st.write("The given audio sample is processed using the librosa library to get the features extracted used by the "
              "models for genre prediction. Following is the dataframe with each of the feature extracted and "
              "corresponding mean and variance of the feature")
     col3, col4 = st.columns([0.6, 0.4])
     with col3:
         # Features Dataframe
         df = pd.DataFrame({
             "name": fields_df,
             "Mean": feature_copy[2::2],
             "Variance": feature_copy[3::2]
         })
         st.dataframe(
             df,
             column_config={
                 "name": "Features",
                 "Mean": "Mean of Feature",
                 "Variance": "Variance of Feature"
             },
             use_container_width=True
         )
     with col4:
         col1, col2 = st.columns([0.55, 0.45])
         col1.subheader("Select a model")
         with col1:
             model_name = st.selectbox("Select a model", all_models, label_visibility="collapsed")
             # Load the selected model
             # NOTE: joblib.load unpickles the file; only ship trusted model files.
             model = joblib.load(model_paths[model_name])
         col2.subheader("Predicted genre")
         # Reshape the features to match the expected shape for prediction
         reshaped_features = features.reshape(1, -1)
         if model_name == "XGB Classifier":
             # This model's prediction is looked up in class_indices to get
             # the genre name.
             predicted_indices = model.predict(reshaped_features)
             genre = str(class_indices[predicted_indices[0]])
         else:
             # NOTE(review): this branch passes `features` rather than
             # `reshaped_features`, unchanged from the original; confirm the
             # shape returned by feature_extraction.scale.
             predicted_label = model.predict(features)[0]
             genre = str(predicted_label).capitalize()
         with col2:
             st.metric("Predicted Genre:", genre, label_visibility="collapsed")

audio_splitting.py CHANGED Viewed

@@ -1,41 +1,26 @@
-import pydub
-import streamlit
 from pydub import AudioSegment
 import feature_extraction
 import io
 def split_audio(uploaded_file):
-    # Load your audio file
-    # audio = AudioSegment.from_file("classical.00000.wav", format="wav")
     audio = AudioSegment.from_file(uploaded_file)
-    print("Works")
-    # Define the duration of each segment in milliseconds (3 seconds)
-    segment_duration = 3 * 1000  # 3 seconds in milliseconds
-    # Check the total duration of the audio
     audio_duration = len(audio)
-    print("works")
     # Check if the audio is shorter than 1 minute and 3 seconds
     if audio_duration < 63 * 1000:
         # If it's shorter, take audio from 0 to 3 seconds
         segment = audio[:segment_duration]
     else:
         # If it's longer, take audio from 1 minute to 1 minute 3 seconds
-        start_time = 60 * 1000  # 1 minute in milliseconds
         end_time = start_time + segment_duration
         segment = audio[start_time:end_time]
     output_stream = io.BytesIO()
     segment.export(output_stream, format="wav")
-    print("Works")
-    # Now you can directly use the output_stream for feature extraction
-    output_stream.seek(0)  # Reset the stream position to the beginning
     # Process and extract features from the segment
     features = feature_extraction.all_feature_extraction(output_stream)
-    print(features)
-    streamlit.write(features)
-    return features
-# output_file = "D:/miniproject/output_segment.wav"
-# Save the segment to a new file
-# segment.export(output_file, format="wav")

 from pydub import AudioSegment
 import feature_extraction
 import io
 def split_audio(uploaded_file):
     """Cut a 3-second excerpt from the uploaded audio and extract its features.

     Audio shorter than 1 minute 3 seconds contributes its first 3 seconds;
     anything longer contributes the span from 1:00 to 1:03. The excerpt is
     exported as WAV into an in-memory buffer, which is handed to
     ``feature_extraction.all_feature_extraction``.

     Args:
         uploaded_file: audio source accepted by ``AudioSegment.from_file``.

     Returns:
         Whatever ``feature_extraction.all_feature_extraction`` returns for
         the excerpt.
     """
     recording = AudioSegment.from_file(uploaded_file)
     clip_ms = 3 * 1000  # 3 seconds in milliseconds
     late_start_ms = 60 * 1000  # 1 minute in milliseconds
     if len(recording) < 63 * 1000:
         # Too short to reach the 1:00-1:03 window: use the opening 3 seconds.
         excerpt = recording[:clip_ms]
     else:
         excerpt = recording[late_start_ms:late_start_ms + clip_ms]
     wav_buffer = io.BytesIO()
     excerpt.export(wav_buffer, format="wav")
     # Rewind so the feature extractor reads the WAV data from the start.
     wav_buffer.seek(0)
     return feature_extraction.all_feature_extraction(wav_buffer)

feature_extraction.py CHANGED Viewed

@@ -32,11 +32,17 @@ short_field = Fields[2:]
 def all_feature_extraction(audio_path, sample_rate=22050):
     data_list = []
     val_field = []
-    audio_df, sr = librosa.load(audio_path, sr=22050)
     data_list.append(audio_path)
     data_list.append(len(audio_df))
     # 1. Chroma STFT
     chroma_stft = librosa.feature.chroma_stft(y=audio_df, hop_length=512)
@@ -47,6 +53,8 @@ def all_feature_extraction(audio_path, sample_rate=22050):
     data_list.append(chroma_stft_mean)
     data_list.append(chroma_stft_var)
     # 2. RMS
     rms = librosa.feature.rms(y=audio_df)
     rms_mean = np.mean(rms)

 def all_feature_extraction(audio_path, sample_rate=22050):
     data_list = []
     val_field = []
+    print(data_list)
+    try:
+        audio_df, sr = sf.read(audio_path)
+        print("Audio loaded successfully.")
+        print("Shape of audio data:", audio_df.shape)
+        print("Sample rate:", sr)
+    except Exception as e:
+        print("Error loading audio file:", e)
     data_list.append(audio_path)
     data_list.append(len(audio_df))
+    print(data_list)
     # 1. Chroma STFT
     chroma_stft = librosa.feature.chroma_stft(y=audio_df, hop_length=512)
     data_list.append(chroma_stft_mean)
     data_list.append(chroma_stft_var)
+    print(data_list,val_field)
     # 2. RMS
     rms = librosa.feature.rms(y=audio_df)
     rms_mean = np.mean(rms)