Spaces: Sleeping
saeedbenadeeb committed
Commit 0874d87 · Parent(s): 9a0a0d8
First commit
Browse files
- .gradio/certificate.pem +31 -0
- app.py +168 -0
- datasets/TESS_Dataset.py +108 -0
- datasets/__init__.py +45 -0
- datasets/__pycache__/TESS_Dataset.cpython-311.pyc +0 -0
- datasets/__pycache__/__init__.cpython-311.pyc +0 -0
- datasets/__pycache__/audio_dataset.cpython-311.pyc +0 -0
- datasets/__pycache__/ctc_audio_dataclass.cpython-311.pyc +0 -0
- datasets/__pycache__/image_dataset.cpython-311.pyc +0 -0
- datasets/audio_dataset.py +120 -0
- datasets/ctc_audio_dataclass.py +126 -0
- datasets/image_dataset.py +62 -0
- emotion-detection +1 -0
- encoders/__init__.py +1 -0
- encoders/__pycache__/__init__.cpython-311.pyc +0 -0
- encoders/__pycache__/encoders.cpython-311.pyc +0 -0
- encoders/__pycache__/transformer.cpython-311.pyc +0 -0
- encoders/encoders.py +263 -0
- encoders/transformer.py +233 -0
- model.pth +3 -0
- models/CTCencoder.py +93 -0
- models/__init__.py +1 -0
- models/__pycache__/CTCencoder.cpython-311.pyc +0 -0
- models/__pycache__/__init__.cpython-311.pyc +0 -0
- requirements.txt +5 -0
- statics/style.css +9 -0
- upload_model.py +14 -0
- utils/__init__.py +2 -0
- utils/__pycache__/__init__.cpython-311.pyc +0 -0
- utils/__pycache__/helper_functions.cpython-311.pyc +0 -0
- utils/__pycache__/random_split.cpython-311.pyc +0 -0
- utils/helper_functions.py +70 -0
- utils/random_split.py +37 -0
.gradio/certificate.pem
ADDED
@@ -0,0 +1,31 @@
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----
app.py
ADDED
@@ -0,0 +1,168 @@
import gradio as gr
import torch
import librosa
import numpy as np
import torch.nn.functional as F
import os

from encoders.transformer import Wav2Vec2EmotionClassifier

# Define the emotions
emotions = ["happy", "sad", "angry", "neutral", "fear", "disgust", "surprise"]
label_mapping = {str(idx): emotion for idx, emotion in enumerate(emotions)}

# Load the trained model
model_path = "model.pth"
cfg = {
    "model": {
        "encoder": "Wav2Vec2Classifier",
        "optimizer": {
            "name": "Adam",
            "lr": 0.0003,
            "weight_decay": 3e-4
        },
        "l1_lambda": 0.0
    }
}
model = Wav2Vec2EmotionClassifier(num_classes=len(emotions), optimizer_cfg=cfg["model"]["optimizer"])
model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu")))
model.eval()

# Optional: we define a minimum number of samples to avoid Wav2Vec2 conv errors
MIN_SAMPLES = 10  # or 16000 if you want at least 1 second

# Preprocessing function
def preprocess_audio(file_path, sample_rate=16000):
    """
    Safely loads the file at file_path and returns a (1, samples) torch tensor.
    Returns None if the file is invalid or too short.
    """
    if not file_path or (not os.path.exists(file_path)):
        # file_path could be None or an empty string if user didn't record properly
        return None

    # Load with librosa (which merges to mono by default if multi-channel)
    waveform, sr = librosa.load(file_path, sr=sample_rate)

    # Check length
    if len(waveform) < MIN_SAMPLES:
        return None

    # Convert to torch tensor, shape (1, samples)
    waveform_tensor = torch.tensor(waveform, dtype=torch.float32).unsqueeze(0)

    return waveform_tensor

# Prediction function
def predict_emotion(audio_file):
    """
    audio_file is a file path from Gradio (type='filepath').
    """
    # Preprocess
    waveform = preprocess_audio(audio_file, sample_rate=16000)

    # If invalid or too short, return an error-like message
    if waveform is None:
        return (
            "Audio is too short or invalid. Please record/upload a longer clip.",
            ""
        )

    # Perform inference
    with torch.no_grad():
        logits = model(waveform)
        probabilities = F.softmax(logits, dim=-1).cpu().numpy()[0]

    # Get the predicted class
    predicted_class = np.argmax(probabilities)
    predicted_emotion = label_mapping[str(predicted_class)]

    # Format probabilities for visualization
    probabilities_output = [
        f"""
        <div style='display: flex; align-items: center; margin: 5px 0;'>
            <div style='width: 20%; text-align: right; margin-right: 10px; font-weight: bold;'>{emotions[i]}</div>
            <div style='flex-grow: 1; background-color: #374151; border-radius: 4px; overflow: hidden;'>
                <div style='width: {probabilities[i]*100:.2f}%; background-color: #FFA500; height: 10px;'></div>
            </div>
            <div style='width: 10%; text-align: right; margin-left: 10px;'>{probabilities[i]*100:.2f}%</div>
        </div>
        """
        for i in range(len(emotions))
    ]

    return predicted_emotion, "\n".join(probabilities_output)

# Create Gradio interface
def gradio_interface(audio):
    detected_emotion, probabilities_html = predict_emotion(audio)
    return detected_emotion, gr.HTML(probabilities_html)

# Define Gradio UI
with gr.Blocks(css="""
    body {
        background-color: #121212;
        color: white;
        font-family: Arial, sans-serif;
    }
    h1 {
        color: #FFA500;
        font-size: 48px;
        text-align: center;
        margin-bottom: 10px;
    }
    p {
        text-align: center;
        font-size: 18px;
    }
    .gradio-row {
        justify-content: center;
        align-items: center;
    }
    #submit_button {
        background-color: #FFA500 !important;
        color: black !important;
        font-size: 18px;
        padding: 10px 20px;
        margin-top: 20px;
    }
    #detected_emotion {
        font-size: 24px;
        font-weight: bold;
        text-align: center;
    }
    .probabilities-container {
        margin-top: 20px;
        padding: 10px;
        background-color: #1F2937;
        border-radius: 8px;
    }
""") as demo:
    gr.Markdown(
        """
        <div>
            <h1>Speech Emotion Recognition</h1>
            <p>🎵 Upload or record an audio file (max 1 minute) to detect emotions.</p>
            <p>Supported Emotions: 😊 Happy | 😭 Sad | 😡 Angry | 😐 Neutral | 😨 Fear | 🤢 Disgust | 😮 Surprise</p>
        </div>
        """
    )

    with gr.Row():
        with gr.Column(scale=1, elem_id="audio-block"):
            # type="filepath" means we get a temporary file path from Gradio
            audio_input = gr.Audio(label="🎤 Record or Upload Audio", type="filepath")
            submit_button = gr.Button("Submit", elem_id="submit_button")
        with gr.Column(scale=1):
            detected_emotion_label = gr.Label(label="Detected Emotion", elem_id="detected_emotion")
            probabilities_html = gr.HTML(label="Probabilities", elem_id="probabilities")

    submit_button.click(
        fn=gradio_interface,
        inputs=audio_input,
        outputs=[detected_emotion_label, probabilities_html]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)
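The prediction path above can be exercised without the Gradio UI. A minimal sketch (not part of the commit; "sample.wav" is a hypothetical local recording, and importing app loads model.pth at import time, so the checkpoint must be present):

# Hypothetical smoke test for the prediction path; sample.wav is an assumed local file.
from app import predict_emotion

emotion, bars_html = predict_emotion("sample.wav")
print("Detected emotion:", emotion)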
datasets/TESS_Dataset.py
ADDED
@@ -0,0 +1,108 @@
import os
import numpy as np
import torch
from torch.utils.data import Dataset
import librosa
from typing import List, Tuple
import shutil
import kagglehub
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import subprocess
import zipfile
import os
# Constants (you may need to define these according to your requirements)
SAMPLE_RATE = 16000  # Define the sample rate for audio processing
DURATION = 3.0  # Duration of the audio in seconds

# Placeholder for waveform normalization
def normalize_waveform(audio: np.ndarray) -> torch.Tensor:
    # Convert to tensor if necessary
    if not isinstance(audio, torch.Tensor):
        audio = torch.tensor(audio, dtype=torch.float32)
    return (audio - torch.mean(audio)) / torch.std(audio)

class TESSRawWaveformDataset(Dataset):
    def __init__(self, root_path: str, transform=None):
        super().__init__()
        self.root_path = root_path
        self.audio_files = []
        self.labels = []
        self.emotions = ["happy", "sad", "angry", "neutral", "fear", "disgust", "surprise"]
        emotion_mapping = {e.lower(): idx for idx, e in enumerate(self.emotions)}
        self.download_dataset_if_not_exists()
        # Load file paths and labels from nested directories
        for root, dirs, files in os.walk(root_path):
            for file_name in files:
                if file_name.endswith(".wav"):
                    emotion_name = next(
                        (e for e in emotion_mapping if e in root.lower()), None
                    )
                    if emotion_name is not None:
                        self.audio_files.append(os.path.join(root, file_name))
                        self.labels.append(emotion_mapping[emotion_name])

        self.labels = np.array(self.labels, dtype=np.int64)
        self.transform = transform

    def __len__(self):
        return len(self.audio_files)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, int]:
        # Load raw waveform and label
        audio_path = self.audio_files[idx]
        label = self.labels[idx]
        waveform = self.load_audio(audio_path)

        if self.transform:
            waveform = self.transform(waveform)

        return waveform, label

    @staticmethod
    def load_audio(audio_path: str) -> torch.Tensor:
        # Load audio and ensure it's at the correct sample rate
        audio, sr = librosa.load(audio_path, sr=SAMPLE_RATE, duration=DURATION)
        assert sr == SAMPLE_RATE, f"Sample rate mismatch: expected {SAMPLE_RATE}, got {sr}"
        return normalize_waveform(audio)

    def get_emotions(self) -> List[str]:
        return self.emotions

    def download_dataset_if_not_exists(self):
        if not os.path.exists(self.root_path):
            print(f"Dataset not found at {self.root_path}. Downloading...")

            # Ensure the destination directory exists
            os.makedirs(self.root_path, exist_ok=True)

            # Download dataset using curl
            dataset_zip_path = os.path.join(self.root_path, "toronto-emotional-speech-set-tess.zip")
            curl_command = [
                "curl",
                "-L",
                "-o",
                dataset_zip_path,
                "https://www.kaggle.com/api/v1/datasets/download/ejlok1/toronto-emotional-speech-set-tess",
            ]

            try:
                subprocess.run(curl_command, check=True)
                print(f"Dataset downloaded to {dataset_zip_path}.")

                # Extract the downloaded zip file
                with zipfile.ZipFile(dataset_zip_path, "r") as zip_ref:
                    zip_ref.extractall(self.root_path)
                print(f"Dataset extracted to {self.root_path}.")

                # Remove the zip file to save space
                os.remove(dataset_zip_path)
                print(f"Removed zip file: {dataset_zip_path}")

            except subprocess.CalledProcessError as e:
                print(f"Error occurred during dataset download: {e}")
                raise


# Example usage
# dataset = TESSRawWaveformDataset(root_path="./TESS", transform=None)
# print("Number of samples:", len(dataset))
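A minimal usage sketch for the class above (an assumption, not part of the commit; the ./TESS path mirrors the commented example and triggers the Kaggle download if the folder is missing):

# Hypothetical usage; indexing returns (normalized waveform tensor, integer label).
from datasets.TESS_Dataset import TESSRawWaveformDataset

dataset = TESSRawWaveformDataset(root_path="./TESS")
waveform, label = dataset[0]
print(waveform.shape, dataset.get_emotions()[label])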
datasets/__init__.py
ADDED
@@ -0,0 +1,45 @@
from typing import List
from torch.utils.data import Dataset

from .image_dataset import CustomDataset
from .audio_dataset import EmodbDataset
from .ctc_audio_dataclass import CTCEmodbDataset
from .TESS_Dataset import TESSRawWaveformDataset

__dataset_mapper__ = {
    "image": CustomDataset,
    "emodb": EmodbDataset,
    'CTCemodb': CTCEmodbDataset,
    'TESSDataset': TESSRawWaveformDataset
}

def list_datasets() -> List[str]:
    """Returns a list of available dataset names.

    Returns:
        List[str]: List of dataset names as strings.

    Example:
        >>> from datasets import list_datasets
        >>> list_datasets()
        ['CTCemodb', 'TESSDataset', 'emodb', 'image']
    """
    return sorted(__dataset_mapper__.keys())

def get_dataset_by_name(dataset: str, *args, **kwargs) -> Dataset:
    """Returns the Dataset class using the given name and arguments.

    Args:
        dataset (str): The name of the dataset.

    Returns:
        Dataset: The requested dataset instance.

    Example:
        >>> from datasets import get_dataset_by_name
        >>> dataset = get_dataset_by_name("emodb", root_path="./data/emodb")
        >>> type(dataset)
        <class 'datasets.audio_dataset.EmodbDataset'>
    """
    assert dataset in __dataset_mapper__, f"Dataset '{dataset}' not found in the mapper."
    return __dataset_mapper__[dataset](*args, **kwargs)
datasets/__pycache__/TESS_Dataset.cpython-311.pyc
ADDED
Binary file (6.95 kB).
datasets/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (1.96 kB).
datasets/__pycache__/audio_dataset.cpython-311.pyc
ADDED
Binary file (7.66 kB).
datasets/__pycache__/ctc_audio_dataclass.cpython-311.pyc
ADDED
Binary file (8.27 kB).
datasets/__pycache__/image_dataset.cpython-311.pyc
ADDED
Binary file (4.99 kB).
datasets/audio_dataset.py
ADDED
@@ -0,0 +1,120 @@
import os
import zipfile
import requests
from tqdm import tqdm
from typing import List, Tuple
import numpy as np
from torch.utils.data import Dataset
import librosa
import torch

SAMPLE_RATE = 22050
DURATION = 1.4  # second

class EmodbDataset(Dataset):
    __url__ = "http://www.emodb.bilderbar.info/download/download.zip"
    __labels__ = ("angry", "happy", "neutral", "sad")
    __suffixes__ = {
        "angry": ["Wa", "Wb", "Wc", "Wd"],
        "happy": ["Fa", "Fb", "Fc", "Fd"],
        "neutral": ["Na", "Nb", "Nc", "Nd"],
        "sad": ["Ta", "Tb", "Tc", "Td"]
    }

    def __init__(self, root_path: str = './data/emodb', transform=None):
        super().__init__()
        self.root_path = root_path
        self.audio_root_path = os.path.join(root_path, "wav")

        # Ensure the dataset is downloaded
        self._ensure_dataset()

        ids = []
        targets = []
        for audio_file in os.listdir(self.audio_root_path):
            f_name, ext = os.path.splitext(audio_file)
            if ext != ".wav":
                continue

            suffix = f_name[-2:]
            for label, suffixes in self.__suffixes__.items():
                if suffix in suffixes:
                    ids.append(os.path.join(self.audio_root_path, audio_file))
                    targets.append(self.label2id(label))
                    break

        self.ids = ids
        self.targets = np.array(targets, dtype=np.int64)
        self.transform = transform

    def _ensure_dataset(self):
        """
        Ensures the dataset is downloaded and extracted.
        """
        if not os.path.isdir(self.audio_root_path):
            print(f"Dataset not found at {self.audio_root_path}. Downloading...")
            self._download_and_extract()

    def _download_and_extract(self):
        """
        Downloads and extracts the dataset zip file.
        """
        # Ensure the root path exists
        os.makedirs(self.root_path, exist_ok=True)

        # Download the dataset
        zip_path = os.path.join(self.root_path, "emodb.zip")
        with requests.get(self.__url__, stream=True) as r:
            r.raise_for_status()
            total_size = int(r.headers.get("content-length", 0))
            with open(zip_path, "wb") as f, tqdm(
                desc="Downloading EMO-DB dataset",
                total=total_size,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
                    bar.update(len(chunk))

        # Extract the dataset
        print("Extracting dataset...")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(self.root_path)

        # Clean up the zip file
        os.remove(zip_path)

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx: int) -> Tuple:
        target = self.targets[idx]
        audio = self.load_audio(self.ids[idx])  # Should return a numpy array

        if self.transform:
            audio = self.transform(audio)  # Apply transform

        return audio, target

    @staticmethod
    def id2label(idx: int) -> str:
        return EmodbDataset.__labels__[idx]

    @staticmethod
    def label2id(label: str) -> int:
        if label not in EmodbDataset.__labels__:
            raise ValueError(f"Unknown label: {label}")
        return EmodbDataset.__labels__.index(label)

    @staticmethod
    def load_audio(audio_file_path: str) -> np.ndarray:
        audio, sr = librosa.load(audio_file_path, sr=SAMPLE_RATE, duration=DURATION)
        assert SAMPLE_RATE == sr, "broken audio file"
        # Convert numpy array to PyTorch tensor
        return torch.tensor(audio, dtype=torch.float32)

    @staticmethod
    def get_labels() -> List[str]:
        return list(EmodbDataset.__labels__)
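Because load_audio truncates every clip to DURATION seconds at SAMPLE_RATE, the waveforms are fixed-length and can be batched with the default collate, provided every EMO-DB file is at least that long. A minimal sketch under that assumption (not part of the commit):

# Hypothetical usage; assumes all clips are >= DURATION seconds so the default collate can stack them.
from torch.utils.data import DataLoader
from datasets.audio_dataset import EmodbDataset

ds = EmodbDataset(root_path="./data/emodb")
loader = DataLoader(ds, batch_size=8, shuffle=True)
waveforms, targets = next(iter(loader))
print(waveforms.shape, [EmodbDataset.id2label(int(t)) for t in targets])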
datasets/ctc_audio_dataclass.py
ADDED
@@ -0,0 +1,126 @@
import os
import zipfile
import requests
from tqdm import tqdm
from typing import List, Tuple
import numpy as np
from torch.utils.data import Dataset
import librosa
import torch
from torch.nn.utils.rnn import pad_sequence

SAMPLE_RATE = 22050
DURATION = 1.4  # seconds

class CTCEmodbDataset(Dataset):
    __url__ = "http://www.emodb.bilderbar.info/download/download.zip"
    __labels__ = ("angry", "happy", "neutral", "sad")
    __suffixes__ = {
        "angry": ["Wa", "Wb", "Wc", "Wd"],
        "happy": ["Fa", "Fb", "Fc", "Fd"],
        "neutral": ["Na", "Nb", "Nc", "Nd"],
        "sad": ["Ta", "Tb", "Tc", "Td"]
    }

    def __init__(self, root_path: str = './data/emodb', transform=None):
        super().__init__()
        self.root_path = root_path
        self.audio_root_path = os.path.join(root_path, "wav")

        # Ensure the dataset is downloaded
        self._ensure_dataset()

        ids = []
        targets = []
        for audio_file in os.listdir(self.audio_root_path):
            f_name, ext = os.path.splitext(audio_file)
            if ext != ".wav":
                continue

            suffix = f_name[-2:]
            for label, suffixes in self.__suffixes__.items():
                if suffix in suffixes:
                    ids.append(os.path.join(self.audio_root_path, audio_file))
                    targets.append(self.label2id(label))  # Store as integers
                    break

        self.ids = ids
        self.targets = targets  # Target sequences as a list of lists
        self.transform = transform

    def _ensure_dataset(self):
        """
        Ensures the dataset is downloaded and extracted.
        """
        if not os.path.isdir(self.audio_root_path):
            print(f"Dataset not found at {self.audio_root_path}. Downloading...")
            self._download_and_extract()

    def _download_and_extract(self):
        """
        Downloads and extracts the dataset zip file.
        """
        os.makedirs(self.root_path, exist_ok=True)
        zip_path = os.path.join(self.root_path, "emodb.zip")
        with requests.get(self.__url__, stream=True) as r:
            r.raise_for_status()
            total_size = int(r.headers.get("content-length", 0))
            with open(zip_path, "wb") as f, tqdm(
                desc="Downloading EMO-DB dataset",
                total=total_size,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
                    bar.update(len(chunk))

        print("Extracting dataset...")
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(self.root_path)

        os.remove(zip_path)

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor, int, int]:
        """
        Returns:
            x (torch.Tensor): Input sequence (audio features or waveform)
            y (torch.Tensor): Target sequence (labels or tokenized transcription)
            input_length (int): Length of input sequence
            target_length (int): Length of target sequence
        """
        target = torch.tensor([self.targets[idx]], dtype=torch.long)
        audio = self.load_audio(self.ids[idx])  # Should return a numpy array

        if self.transform:
            audio = self.transform(audio)

        # Input length (for CTC)
        input_length = audio.shape[-1]  # Last dimension is the time dimension
        target_length = len(target)  # Length of target sequence

        return audio, target, input_length, target_length

    @staticmethod
    def id2label(idx: int) -> str:
        return CTCEmodbDataset.__labels__[idx]

    @staticmethod
    def label2id(label: str) -> int:
        if label not in CTCEmodbDataset.__labels__:
            raise ValueError(f"Unknown label: {label}")
        return CTCEmodbDataset.__labels__.index(label)

    @staticmethod
    def load_audio(audio_file_path: str) -> torch.Tensor:
        audio, sr = librosa.load(audio_file_path, sr=SAMPLE_RATE, duration=DURATION)
        assert SAMPLE_RATE == sr, "broken audio file"
        return torch.tensor(audio, dtype=torch.float32)

    @staticmethod
    def get_labels() -> List[str]:
        return list(CTCEmodbDataset.__labels__)
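The module imports pad_sequence but does not define a collate function for the variable-length (audio, target, input_length, target_length) tuples it returns. A minimal sketch of one, as an assumption rather than part of the commit, shaped the way torch.nn.CTCLoss expects its targets and lengths:

import torch
from torch.nn.utils.rnn import pad_sequence

def ctc_collate(batch):
    # batch: list of (waveform, target, input_length, target_length) tuples
    waveforms, targets, input_lengths, target_lengths = zip(*batch)
    x = pad_sequence(waveforms, batch_first=True)   # (B, T_max) zero-padded waveforms
    y = torch.cat(targets)                          # targets concatenated into one 1-D tensor
    return (x, y,
            torch.tensor(input_lengths, dtype=torch.long),
            torch.tensor(target_lengths, dtype=torch.long))

# e.g. DataLoader(CTCEmodbDataset(), batch_size=8, collate_fn=ctc_collate)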
datasets/image_dataset.py
ADDED
@@ -0,0 +1,62 @@
import os
from torchvision.datasets import VisionDataset
from PIL import Image
from sklearn.model_selection import train_test_split


class CustomDataset(VisionDataset):
    def __init__(self, root_path, subset="train", transform=None, target_transform=None, split_ratios=(0.7, 0.15, 0.15), seed=42):
        super(CustomDataset, self).__init__(root_path, transform=transform, target_transform=target_transform)
        self.root = root_path
        self.subset = subset  # Can be "train", "val", or "test"
        self.split_ratios = split_ratios
        self.seed = seed

        self.classes, self.class_idx = self._find_classes()
        self.samples = self._make_dataset()

    def _find_classes(self):
        classes = [d.name for d in os.scandir(self.root) if d.is_dir()]
        classes.sort()
        class_idx = {cls_name: i for i, cls_name in enumerate(classes)}
        return classes, class_idx

    def _make_dataset(self):
        samples = []
        for target_class in sorted(self.class_idx.keys()):
            class_index = self.class_idx[target_class]
            target_dir = os.path.join(self.root, target_class)
            for root, _, fnames in sorted(os.walk(target_dir)):
                for fname in sorted(fnames):
                    path = os.path.join(root, fname)
                    samples.append((path, class_index))

        # Split into train, val, and test sets
        train_samples, test_samples = train_test_split(
            samples, test_size=1 - self.split_ratios[0], random_state=self.seed, stratify=[s[1] for s in samples]
        )
        val_samples, test_samples = train_test_split(
            test_samples, test_size=self.split_ratios[2] / (self.split_ratios[1] + self.split_ratios[2]),
            random_state=self.seed, stratify=[s[1] for s in test_samples]
        )

        if self.subset == "train":
            return train_samples
        elif self.subset == "val":
            return val_samples
        elif self.subset == "test":
            return test_samples
        else:
            raise ValueError(f"Unknown subset: {self.subset}")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, index):
        path, target = self.samples[index]
        img = Image.open(path).convert("RGB")

        if self.transform is not None:
            img = self.transform(img)

        return img, target
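A minimal usage sketch (the folder layout ./data/images/<class_name>/... and the transform pipeline are assumptions, not part of the commit):

# Hypothetical usage with a standard torchvision transform pipeline.
from torchvision import transforms
from datasets.image_dataset import CustomDataset

tfm = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])
train_ds = CustomDataset(root_path="./data/images", subset="train", transform=tfm)
img, label = train_ds[0]
print(img.shape, train_ds.classes[label])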
emotion-detection
ADDED
@@ -0,0 +1 @@
Subproject commit 4f5c928446aeb2dabd215f85d3f9647ac92e7e67
encoders/__init__.py
ADDED
@@ -0,0 +1 @@
from . import encoders, transformer
encoders/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (235 Bytes).
encoders/__pycache__/encoders.cpython-311.pyc
ADDED
Binary file (16.2 kB).
encoders/__pycache__/transformer.cpython-311.pyc
ADDED
Binary file (12.3 kB).
encoders/encoders.py
ADDED
@@ -0,0 +1,263 @@
import torch.optim as optim
import pytorch_lightning as pl
import timm
from torchmetrics import Accuracy, Precision, Recall, F1Score
import torch


class timm_backbones(pl.LightningModule):
    """
    PyTorch Lightning model for image classification using a timm backbone (ResNet-18 by default).

    This model uses a pre-trained timm model and fine-tunes it for a specific number of classes.

    Args:
        num_classes (int, optional): The number of classes in the dataset. Defaults to 2.
        optimizer_cfg (DictConfig, optional): A Hydra configuration object for the optimizer.

    Methods:
        forward(x): Computes the forward pass of the model.
        configure_optimizers(): Configures the optimizer for the model.
        training_step(batch, batch_idx): Performs a training step on the model.
        validation_step(batch, batch_idx): Performs a validation step on the model.
        on_validation_epoch_end(): Called at the end of each validation epoch.
        test_step(batch, batch_idx): Performs a test step on the model.

    Example:
        model = timm_backbones(encoder='resnet18', num_classes=2, optimizer_cfg=cfg.model.optimizer)
        trainer.fit(model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)
        trainer.test(model, dataloaders=test_dataloader)
    """
    def __init__(self, encoder='resnet18', num_classes=2, optimizer_cfg=None, l1_lambda=0.0):
        super().__init__()

        self.encoder = encoder
        self.model = timm.create_model(encoder, pretrained=True)
        if self.model.default_cfg["input_size"][0] == 3:  # If model expects 3 channels (input_size is (C, H, W))
            self.model.conv1 = torch.nn.Conv2d(
                in_channels=1,  # Change to single channel
                out_channels=self.model.conv1.out_channels,
                kernel_size=self.model.conv1.kernel_size,
                stride=self.model.conv1.stride,
                padding=self.model.conv1.padding,
                bias=False
            )

        self.accuracy = Accuracy(task="multiclass", num_classes=num_classes)
        self.precision = Precision(task="multiclass", num_classes=num_classes)
        self.recall = Recall(task="multiclass", num_classes=num_classes)
        self.f1 = F1Score(task="multiclass", num_classes=num_classes)

        self.l1_lambda = l1_lambda
        if hasattr(self.model, 'fc'):  # For models with 'fc' as the classification layer
            in_features = self.model.fc.in_features
            self.model.fc = torch.nn.Linear(in_features, num_classes)
        elif hasattr(self.model, 'classifier'):  # For models with 'classifier'
            in_features = self.model.classifier.in_features
            self.model.classifier = torch.nn.Linear(in_features, num_classes)
        elif hasattr(self.model, 'head'):  # For models with 'head'
            in_features = self.model.head.in_features
            self.model.head = torch.nn.Linear(in_features, num_classes)
        else:
            raise ValueError(f"Unsupported model architecture for encoder: {encoder}")

        if optimizer_cfg is not None:
            optimizer_name = optimizer_cfg.name
            optimizer_lr = optimizer_cfg.lr
            optimizer_weight_decay = optimizer_cfg.weight_decay

            if optimizer_name == 'Adam':
                self.optimizer = optim.Adam(self.parameters(), lr=optimizer_lr, weight_decay=optimizer_weight_decay)
            elif optimizer_name == 'SGD':
                self.optimizer = optim.SGD(self.parameters(), lr=optimizer_lr, weight_decay=optimizer_weight_decay)
            else:
                raise ValueError(f"Unsupported optimizer: {optimizer_name}")
        else:
            self.optimizer = None

    def forward(self, x):
        return self.model(x)

    def configure_optimizers(self):
        optimizer = self.optimizer
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.2, patience=20, min_lr=5e-5)
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}

    def training_step(self, batch, batch_idx):
        x, y = batch
        y = y.long()

        # Compute predictions and loss
        logits = self(x)
        loss = torch.nn.functional.cross_entropy(logits, y)

        # Add L1 regularization
        l1_norm = sum(param.abs().sum() for param in self.parameters())
        loss += self.l1_lambda * l1_norm

        self.log('train_loss', loss, prog_bar=True, on_epoch=True, on_step=False, logger=True)

        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y = y.long()

        logits = self(x)
        loss = torch.nn.functional.cross_entropy(logits, y)

        preds = torch.argmax(logits, dim=1)
        accuracy = self.accuracy(preds, y)  # torchmetrics metrics take (preds, target)
        precision = self.precision(preds, y)
        recall = self.recall(preds, y)
        f1 = self.f1(preds, y)

        self.log('val_loss', loss, prog_bar=True, on_epoch=True, on_step=True)
        self.log('val_acc', accuracy, prog_bar=True, on_epoch=True, on_step=True)
        self.log('val_precision', precision, prog_bar=True, on_epoch=True, on_step=True)
        self.log('val_recall', recall, prog_bar=True, on_epoch=True, on_step=True)
        self.log('val_f1', f1, prog_bar=True, on_epoch=True, on_step=True)

        return loss

    def on_validation_epoch_end(self):
        avg_loss = self.trainer.logged_metrics['val_loss_epoch']
        accuracy = self.trainer.logged_metrics['val_acc_epoch']

        self.log('val_loss', avg_loss, prog_bar=True, on_epoch=True)
        self.log('val_acc', accuracy, prog_bar=True, on_epoch=True)

        return {'Average Loss:': avg_loss, 'Accuracy:': accuracy}

    def test_step(self, batch, batch_idx):
        x, y = batch
        y = y.long()
        logits = self(x)
        loss = torch.nn.functional.cross_entropy(logits, y)

        preds = torch.argmax(logits, dim=1)
        accuracy = self.accuracy(preds, y)
        precision = self.precision(preds, y)
        recall = self.recall(preds, y)
        f1 = self.f1(preds, y)

        # Log test metrics
        self.log('test_loss', loss, prog_bar=True, logger=True)
        self.log('test_acc', accuracy, prog_bar=True, logger=True)
        self.log('test_precision', precision, prog_bar=True, logger=True)
        self.log('test_recall', recall, prog_bar=True, logger=True)
        self.log('test_f1', f1, prog_bar=True, logger=True)

        return {'test_loss': loss, 'test_accuracy': accuracy, 'test_precision': precision, 'test_recall': recall, 'test_f1': f1}


class CTCEncoderPL(pl.LightningModule):
    def __init__(self, ctc_encoder, num_classes, optimizer_cfg):
        super(CTCEncoderPL, self).__init__()
        self.ctc_encoder = ctc_encoder
        self.ctc_loss = torch.nn.CTCLoss(blank=0, zero_infinity=True)
        self.optimizer_cfg = optimizer_cfg
        self.accuracy = Accuracy(task="multiclass", num_classes=num_classes)
        self.precision = Precision(task="multiclass", num_classes=num_classes)
        self.recall = Recall(task="multiclass", num_classes=num_classes)
        self.f1 = F1Score(task="multiclass", num_classes=num_classes)

        if optimizer_cfg is not None:
            optimizer_name = optimizer_cfg.name
            optimizer_lr = optimizer_cfg.lr
            optimizer_weight_decay = optimizer_cfg.weight_decay

            if optimizer_name == 'Adam':
                self.optimizer = optim.Adam(self.parameters(), lr=optimizer_lr, weight_decay=optimizer_weight_decay)
            elif optimizer_name == 'SGD':
                self.optimizer = optim.SGD(self.parameters(), lr=optimizer_lr, weight_decay=optimizer_weight_decay)
            else:
                raise ValueError(f"Unsupported optimizer: {optimizer_name}")
        else:
            self.optimizer = None

    def forward(self, x):
        return self.ctc_encoder(x)

    def training_step(self, batch, batch_idx):
        x, y, input_lengths, target_lengths = batch

        logits, input_lengths = self.ctc_encoder(x, input_lengths)
        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
        loss = self.ctc_loss(log_probs, y, input_lengths, target_lengths)
        assert input_lengths.size(0) == x.size(0), f"input_lengths size ({input_lengths.size(0)}) must match batch size ({x.size(0)})"
        preds = torch.argmax(log_probs, dim=-1)
        self.log("train_loss", loss, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y, input_lengths, target_lengths = batch

        # Compute logits and adjust input lengths
        logits, input_lengths = self.ctc_encoder(x, input_lengths)
        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)

        # Validate input_lengths size
        assert input_lengths.size(0) == logits.size(0), "Mismatch between input_lengths and batch size"

        # Compute CTC loss
        loss = self.ctc_loss(log_probs, y, input_lengths, target_lengths)

        # Compute metrics
        preds = torch.argmax(log_probs, dim=-1)
        accuracy = self.accuracy(y, preds)
        precision = self.precision(y, preds)
        recall = self.recall(y, preds)
        f1 = self.f1(y, preds)

        # Log metrics
        self.log('val_loss', loss, prog_bar=True, on_epoch=True, on_step=True)
        self.log('val_acc', accuracy, prog_bar=True, on_epoch=True, on_step=True)
        self.log('val_precision', precision, prog_bar=True, on_epoch=True, on_step=True)
        self.log('val_recall', recall, prog_bar=True, on_epoch=True, on_step=True)
        self.log('val_f1', f1, prog_bar=True, on_epoch=True, on_step=True)

        return loss

    def on_validation_epoch_end(self):
        avg_loss = self.trainer.logged_metrics['val_loss_epoch']
        accuracy = self.trainer.logged_metrics['val_acc_epoch']

        self.log('val_loss', avg_loss, prog_bar=True, on_epoch=True)
        self.log('val_acc', accuracy, prog_bar=True, on_epoch=True)

        return {'Average Loss:': avg_loss, 'Accuracy:': accuracy}

    def test_step(self, batch, batch_idx):
        x, y, input_lengths, target_lengths = batch
        logits, input_lengths = self.ctc_encoder(x, input_lengths)  # mirror validation_step; forward(x) alone lacks input_lengths
        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
        loss = self.ctc_loss(log_probs, y, input_lengths, target_lengths)

        preds = torch.argmax(log_probs, dim=-1)
        accuracy = self.accuracy(y, preds)
        precision = self.precision(y, preds)
        recall = self.recall(y, preds)
        f1 = self.f1(y, preds)

        self.log('test_loss', loss, prog_bar=True, logger=True)
        self.log('test_acc', accuracy, prog_bar=True, logger=True)
        self.log('test_precision', precision, prog_bar=True, logger=True)
        self.log('test_recall', recall, prog_bar=True, logger=True)
        self.log('test_f1', f1, prog_bar=True, logger=True)

        return {'test_loss': loss, 'test_accuracy': accuracy, 'test_precision': precision, 'test_recall': recall, 'test_f1': f1}

    def configure_optimizers(self):
        optimizer = self.optimizer
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.2, patience=20, min_lr=5e-5)
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}

    def greedy_decode(self, log_probs):
        """
        Perform greedy decoding to get predictions from log probabilities.
        """
        preds = torch.argmax(log_probs, dim=-1)
        return preds
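timm_backbones reads optimizer_cfg through attribute access (cfg.name, cfg.lr, cfg.weight_decay), so outside Hydra a SimpleNamespace can stand in for the config. A minimal sketch (an assumption, not part of the commit; the input sizes are arbitrary):

from types import SimpleNamespace
import torch
from encoders.encoders import timm_backbones

opt_cfg = SimpleNamespace(name="Adam", lr=3e-4, weight_decay=3e-4)
model = timm_backbones(encoder="resnet18", num_classes=4, optimizer_cfg=opt_cfg)
logits = model(torch.randn(2, 1, 128, 128))   # single-channel input after the conv1 swap
print(logits.shape)                           # torch.Size([2, 4])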
encoders/transformer.py
ADDED
@@ -0,0 +1,233 @@
import pytorch_lightning as pl
import torch
from torchmetrics import Accuracy, Precision, Recall, F1Score
from transformers import Wav2Vec2Model, Wav2Vec2ForSequenceClassification
import torch.nn.functional as F


class Wav2Vec2Classifier(pl.LightningModule):
    def __init__(self, num_classes, optimizer_cfg="Adam", l1_lambda=0.0):
        super(Wav2Vec2Classifier, self).__init__()
        self.save_hyperparameters()

        # Wav2Vec2 backbone
        # self.wav2vec2 = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
        self.wav2vec2 = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-xlsr-53")

        # trying without the need to fine tune it
        for param in self.wav2vec2.parameters():
            param.requires_grad = False
        # Classification head
        self.classifier = torch.nn.Linear(self.wav2vec2.config.hidden_size, num_classes)

        # Metrics
        self.accuracy = Accuracy(task="multiclass", num_classes=num_classes)
        self.precision = Precision(task="multiclass", num_classes=num_classes)
        self.recall = Recall(task="multiclass", num_classes=num_classes)
        self.f1 = F1Score(task="multiclass", num_classes=num_classes)

        self.l1_lambda = l1_lambda
        if optimizer_cfg is not None:
            optimizer_name = optimizer_cfg.name
            optimizer_lr = optimizer_cfg.lr
            optimizer_weight_decay = optimizer_cfg.weight_decay

            if optimizer_name == 'Adam':
                self.optimizer = torch.optim.Adam(self.parameters(), lr=optimizer_lr, weight_decay=optimizer_weight_decay)
            elif optimizer_name == 'SGD':
                self.optimizer = torch.optim.SGD(self.parameters(), lr=optimizer_lr, weight_decay=optimizer_weight_decay)
            else:
                raise ValueError(f"Unsupported optimizer: {optimizer_name}")
        else:
            self.optimizer = None

    def forward(self, x, attention_mask=None):
        # Debug input shape

        # Ensure input shape is [batch_size, sequence_length]
        if x.dim() > 2:
            x = x.squeeze(-1)  # Remove unnecessary dimensions if present

        # Pass through Wav2Vec2 backbone
        output = self.wav2vec2(x, attention_mask=attention_mask)
        x = output.last_hidden_state

        # Classification head
        x = torch.mean(x, dim=1)  # Pooling
        logits = self.classifier(x)
        return logits


    def training_step(self, batch, batch_idx):
        x, attention_mask, y = batch

        # Forward pass
        logits = self(x, attention_mask=attention_mask)

        # Compute loss
        loss = F.cross_entropy(logits, y)

        # Add L1 regularization if specified
        l1_norm = sum(param.abs().sum() for param in self.parameters())
        loss += self.l1_lambda * l1_norm

        # Log metrics
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, attention_mask, y = batch  # Unpack batch

        # Forward pass
        logits = self(x, attention_mask=attention_mask)

        # Compute loss and metrics
        loss = F.cross_entropy(logits, y)
        preds = torch.argmax(logits, dim=1)
        accuracy = self.accuracy(preds, y)
        precision = self.precision(preds, y)
        recall = self.recall(preds, y)
        f1 = self.f1(preds, y)

        # Log metrics
        self.log("val_loss", loss, prog_bar=True, logger=True)
        self.log("val_acc", accuracy, prog_bar=True, logger=True)
        self.log("val_precision", precision, prog_bar=True, logger=True)
        self.log("val_recall", recall, prog_bar=True, logger=True)
        self.log("val_f1", f1, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        x, attention_mask, y = batch  # Unpack batch

        # Forward pass
        logits = self(x, attention_mask=attention_mask)

        # Compute loss and metrics
        loss = F.cross_entropy(logits, y)
        preds = torch.argmax(logits, dim=1)
        accuracy = self.accuracy(preds, y)
        precision = self.precision(preds, y)
        recall = self.recall(preds, y)
        f1 = self.f1(preds, y)

        # Log metrics
        self.log("test_loss", loss, prog_bar=True, logger=True)
        self.log("test_acc", accuracy, prog_bar=True, logger=True)
        self.log("test_precision", precision, prog_bar=True, logger=True)
        self.log("test_recall", recall, prog_bar=True, logger=True)
        self.log("test_f1", f1, prog_bar=True, logger=True)

        return {"test_loss": loss, "test_accuracy": accuracy}

    def configure_optimizers(self):
        optimizer = self.optimizer
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.2, patience=20, min_lr=5e-5)
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}


class Wav2Vec2EmotionClassifier(pl.LightningModule):
    def __init__(self, num_classes, learning_rate=1e-4, freeze_base=False, optimizer_cfg="AdamW"):
        super(Wav2Vec2EmotionClassifier, self).__init__()
        self.save_hyperparameters()

        # Load a pre-trained Wav2Vec2 model optimized for emotion recognition
        self.model = Wav2Vec2ForSequenceClassification.from_pretrained(
            "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim",
            num_labels=num_classes,
        )
        # Optionally freeze the Wav2Vec2 base layers
        if freeze_base:
            for param in self.model.wav2vec2.parameters():
                param.requires_grad = False

        # Metrics
        self.accuracy = Accuracy(task="multiclass", num_classes=num_classes)
        self.precision = Precision(task="multiclass", num_classes=num_classes)
        self.recall = Recall(task="multiclass", num_classes=num_classes)
        self.f1 = F1Score(task="multiclass", num_classes=num_classes)

        self.learning_rate = learning_rate
        if optimizer_cfg is not None:
            optimizer_name = optimizer_cfg['name']
            optimizer_lr = optimizer_cfg['lr']
            optimizer_weight_decay = optimizer_cfg['weight_decay']

            if optimizer_name == 'Adam':
                self.optimizer = torch.optim.Adam(self.parameters(), lr=optimizer_lr, weight_decay=optimizer_weight_decay)
            elif optimizer_name == 'SGD':
                self.optimizer = torch.optim.SGD(self.parameters(), lr=optimizer_lr, weight_decay=optimizer_weight_decay)
            elif optimizer_name == 'AdamW':
                self.optimizer = torch.optim.AdamW(self.parameters(), lr=optimizer_lr, weight_decay=optimizer_weight_decay)
            else:
                raise ValueError(f"Unsupported optimizer: {optimizer_name}")
        else:
            self.optimizer = None

    def forward(self, x, attention_mask=None):
        return self.model(x, attention_mask=attention_mask).logits

    def training_step(self, batch, batch_idx):
        x, attention_mask, y = batch

        # Forward pass
        logits = self(x, attention_mask=attention_mask)

        # Compute loss
        loss = F.cross_entropy(logits, y)

        # Log training loss
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, attention_mask, y = batch

        # Forward pass
        logits = self(x, attention_mask=attention_mask)

        # Compute loss and metrics
        loss = F.cross_entropy(logits, y)
        preds = torch.argmax(logits, dim=1)

        accuracy = self.accuracy(preds, y)
        precision = self.precision(preds, y)
        recall = self.recall(preds, y)
        f1 = self.f1(preds, y)

        # Log metrics
        self.log("val_loss", loss, prog_bar=True, logger=True)
        self.log("val_acc", accuracy, prog_bar=True, logger=True)
        self.log("val_precision", precision, prog_bar=True, logger=True)
        self.log("val_recall", recall, prog_bar=True, logger=True)
        self.log("val_f1", f1, prog_bar=True, logger=True)
        return loss

    def test_step(self, batch, batch_idx):
        x, attention_mask, y = batch

        # Forward pass
        logits = self(x, attention_mask=attention_mask)

        # Compute loss and metrics
        loss = F.cross_entropy(logits, y)
        preds = torch.argmax(logits, dim=1)
        accuracy = self.accuracy(preds, y)
        precision = self.precision(preds, y)
        recall = self.recall(preds, y)
        f1 = self.f1(preds, y)

        # Log metrics
        self.log("test_loss", loss, prog_bar=True, logger=True)
        self.log("test_acc", accuracy, prog_bar=True, logger=True)
        self.log("test_precision", precision, prog_bar=True, logger=True)
        self.log("test_recall", recall, prog_bar=True, logger=True)
        self.log("test_f1", f1, prog_bar=True, logger=True)

        return {"test_loss": loss, "test_accuracy": accuracy}

    def configure_optimizers(self):
        optimizer = self.optimizer
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.2, patience=20, min_lr=5e-5)
        return {"optimizer": optimizer, "lr_scheduler": scheduler, "monitor": "val_loss"}
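A minimal inference sketch for Wav2Vec2EmotionClassifier, mirroring how app.py drives it (an assumption, not part of the commit; the random waveforms stand in for real 16 kHz audio, and the pretrained weights are downloaded on first use):

import torch
from encoders.transformer import Wav2Vec2EmotionClassifier

cfg = {"name": "Adam", "lr": 3e-4, "weight_decay": 3e-4}
clf = Wav2Vec2EmotionClassifier(num_classes=7, optimizer_cfg=cfg).eval()
with torch.no_grad():
    logits = clf(torch.randn(2, 16000))   # two dummy 1-second clips
print(logits.shape)                        # torch.Size([2, 7])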
model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0116227115053abebb4951ef1bd0bd25750797f2bfe98d74df152dc2289295d6
size 658272386
models/CTCencoder.py
ADDED
@@ -0,0 +1,93 @@
import torch
import torch.nn as nn


class CTCEncoder(nn.Module):
    def __init__(self, num_classes, cnn_output_dim=256, rnn_hidden_dim=256, rnn_layers=3):
        """
        CTC encoder with a CNN feature extractor and a bidirectional LSTM for sequence modeling.

        Args:
            num_classes (int): Number of output classes for the model.
            cnn_output_dim (int): Number of output channels from the CNN.
            rnn_hidden_dim (int): Hidden size of the LSTM.
            rnn_layers (int): Number of layers in the LSTM.
        """
        super(CTCEncoder, self).__init__()

        # CNN feature extractor
        self.feature_extractor = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # Down-sample by 2
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),  # Down-sample by another 2
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, None))  # Collapse the frequency axis so the output height is 1
        )

        # Bidirectional LSTM
        self.rnn_hidden_dim = rnn_hidden_dim
        self.rnn_layers = rnn_layers
        self.cnn_output_dim = cnn_output_dim

        self.rnn = nn.LSTM(
            input_size=cnn_output_dim,  # Output channels from the CNN
            hidden_size=rnn_hidden_dim,
            num_layers=rnn_layers,
            batch_first=True,
            bidirectional=True
        )

        # Fully connected layer (2 * hidden size because the LSTM is bidirectional)
        self.fc = nn.Linear(rnn_hidden_dim * 2, num_classes)

    def compute_input_lengths(self, input_lengths):
        """
        Adjusts input lengths based on the CNN's down-sampling operations.

        Args:
            input_lengths (torch.Tensor): Original input lengths.

        Returns:
            torch.Tensor: Adjusted input lengths.
        """
        # Only the two MaxPool2d layers halve the time axis; the convolutions use
        # padding=1 and the adaptive pooling leaves the width unchanged.
        input_lengths = input_lengths // 2  # First MaxPool
        input_lengths = input_lengths // 2  # Second MaxPool
        return input_lengths

    def forward(self, x, input_lengths):
        """
        Forward pass through the encoder.

        Args:
            x (torch.Tensor): Input tensor of shape [B, 1, H, W].
            input_lengths (torch.Tensor): Lengths of the sequences in the batch.

        Returns:
            torch.Tensor: Logits of shape [B, T, num_classes].
            torch.Tensor: Adjusted input lengths.
        """
        # Feature extraction
        x = self.feature_extractor(x)  # [batch, channels, height, width]

        # Reshape for the LSTM: drop the height dimension and move time to axis 1
        x = x.squeeze(2).permute(0, 2, 1)  # [batch, seq_len, features]
        assert x.size(-1) == self.cnn_output_dim, f"Expected last dimension to be {self.cnn_output_dim}, but got {x.size(-1)}"

        # Adjust input lengths to match the down-sampled time axis
        input_lengths = self.compute_input_lengths(input_lengths)
        assert input_lengths.size(0) == x.size(0), f"input_lengths size ({input_lengths.size(0)}) must match batch size ({x.size(0)})"

        # Pass through the LSTM
        x, _ = self.rnn(x)  # [batch, seq_len, 2 * hidden_dim]

        # Fully connected output
        x = self.fc(x)  # [batch, seq_len, num_classes]
        return x, input_lengths
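A minimal usage sketch for CTCEncoder, assuming spectrogram inputs of shape [B, 1, n_mels, frames] and class index 0 reserved for the CTC blank; the concrete sizes below are illustrative, not taken from the training configuration:

import torch
import torch.nn as nn

from models.CTCencoder import CTCEncoder

# Toy batch: 2 spectrograms with 64 mel bins and up to 200 frames
specs = torch.randn(2, 1, 64, 200)
frame_lengths = torch.tensor([200, 180])

model = CTCEncoder(num_classes=30)
logits, out_lengths = model(specs, frame_lengths)  # logits: [B, T, num_classes]

# nn.CTCLoss expects log-probabilities shaped [T, B, num_classes]
log_probs = logits.log_softmax(dim=-1).permute(1, 0, 2)

targets = torch.randint(1, 30, (2, 12))  # dummy label sequences without blanks
target_lengths = torch.tensor([12, 10])

ctc_loss = nn.CTCLoss(blank=0, zero_infinity=True)
loss = ctc_loss(log_probs, targets, out_lengths, target_lengths)
print(loss.item())

Because AdaptiveAvgPool2d((1, None)) collapses the frequency axis, the LSTM sees one 256-dimensional feature vector per down-sampled time frame.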
models/__init__.py
ADDED
@@ -0,0 +1 @@
from . import CTCencoder
models/__pycache__/CTCencoder.cpython-311.pyc
ADDED
Binary file (4.67 kB)
models/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (207 Bytes)
requirements.txt
ADDED
@@ -0,0 +1,5 @@
gradio
librosa
torch
transformers
numpy
statics/style.css
ADDED
@@ -0,0 +1,9 @@
#audio_input {
    border: 2px solid #4CAF50;
    border-radius: 10px;
}
#submit_button {
    background-color: #4CAF50;
    color: white;
    border-radius: 5px;
}
upload_model.py
ADDED
@@ -0,0 +1,14 @@
import shutil

from huggingface_hub import HfApi, Repository

api = HfApi()
repo_url = api.create_repo(repo_id="saeedbenadeeb/emotion-detection", exist_ok=True)

# Clone (or reuse) the local working copy of the model repository
repo = Repository(local_dir="emotion-detection", clone_from=repo_url)
repo.git_pull()

# Copy the model weights into the local repository clone
shutil.copy("model.pth", "emotion-detection")

# Commit and push the new files to the Hub
repo.push_to_hub(commit_message="Initial model upload")
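As an aside, newer versions of huggingface_hub favor direct uploads over the Repository workflow; a sketch of the same upload via HfApi.upload_file, reusing the repo id from above:

from huggingface_hub import HfApi

api = HfApi()
api.create_repo(repo_id="saeedbenadeeb/emotion-detection", exist_ok=True)

# Upload the checkpoint directly, without cloning the repository locally
api.upload_file(
    path_or_fileobj="model.pth",
    path_in_repo="model.pth",
    repo_id="saeedbenadeeb/emotion-detection",
    commit_message="Initial model upload",
)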
utils/__init__.py
ADDED
@@ -0,0 +1,2 @@
from . import random_split
from . import helper_functions
utils/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (263 Bytes)
utils/__pycache__/helper_functions.cpython-311.pyc
ADDED
Binary file (3.61 kB)
utils/__pycache__/random_split.cpython-311.pyc
ADDED
Binary file (2.39 kB)
utils/helper_functions.py
ADDED
@@ -0,0 +1,70 @@
import os
import shutil

import torch
from torch.nn.utils.rnn import pad_sequence


def normalize_ratios(ratios):
    """Scale a list of split ratios so that they sum to 1."""
    total = sum(ratios)
    return [r / total for r in ratios]


def collate_fn_transformer(batch):
    """
    Custom collate function to handle variable-length raw waveform inputs.

    Args:
        batch: List of tuples (tensor, label), where tensor has shape [sequence_length].

    Returns:
        padded_waveforms: Padded tensor of shape [batch_size, max_seq_len].
        attention_mask: Attention mask for the padded sequences.
        labels: Tensor of shape [batch_size].
    """
    # Separate waveforms and labels
    waveforms, labels = zip(*batch)

    # Ensure waveforms are 1D tensors
    waveforms = [torch.as_tensor(waveform).squeeze() for waveform in waveforms]

    # Pad sequences to the same length
    padded_waveforms = pad_sequence(waveforms, batch_first=True)  # [batch_size, max_seq_len]

    # Create an attention mask that is 1 for real samples and 0 for padding
    attention_mask = (padded_waveforms != 0).long()

    # Convert labels to a tensor
    labels = torch.tensor(labels, dtype=torch.long)

    return padded_waveforms, attention_mask, labels


def collate_fn(batch):
    """Collate function for the CTC pipeline: stacks inputs and flattens target sequences."""
    inputs, targets, input_lengths, target_lengths = zip(*batch)
    inputs = torch.stack(inputs)  # Convert list of tensors to a batch tensor
    targets = torch.cat(targets)  # Flatten target sequences
    input_lengths = torch.tensor(input_lengths, dtype=torch.long)
    target_lengths = torch.tensor(target_lengths, dtype=torch.long)
    return inputs, targets, input_lengths, target_lengths


def save_test_data(test_dataset, dataset, save_dir):
    """Copy the audio files of the test split into save_dir, grouped by label."""
    if os.path.exists(save_dir):
        shutil.rmtree(save_dir)  # Delete the existing directory and its contents
        print(f"Existing test data directory '{save_dir}' removed.")

    os.makedirs(save_dir, exist_ok=True)

    for idx in test_dataset.indices:
        audio_file_path = dataset.audio_files[idx]  # Assumes the dataset exposes an `audio_files` attribute
        label = dataset.labels[idx]                 # Assumes the dataset exposes a `labels` attribute

        # Create a directory for the label if it doesn't exist
        label_dir = os.path.join(save_dir, str(label))
        os.makedirs(label_dir, exist_ok=True)

        # Copy the audio file to the label directory
        shutil.copy(audio_file_path, label_dir)

    print(f"Test data saved in {save_dir}")
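A minimal sketch of wiring collate_fn_transformer into a DataLoader; the list-based placeholder dataset below stands in for the audio dataset classes under datasets/ and simply yields (waveform, label) pairs of varying length:

import torch
from torch.utils.data import DataLoader

from utils.helper_functions import collate_fn_transformer

# Placeholder dataset: 32 random waveforms between 0.5 s and 1 s at 16 kHz
train_dataset = [
    (torch.randn(torch.randint(8000, 16000, (1,)).item()), i % 7)
    for i in range(32)
]

train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=True,
    collate_fn=collate_fn_transformer,
)

waveforms, attention_mask, labels = next(iter(train_loader))
print(waveforms.shape, attention_mask.shape, labels.shape)  # [8, max_len] twice, then [8]

Note that the mask treats exact zeros as padding, so genuine zero-valued samples inside a waveform are masked as well.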
utils/random_split.py
ADDED
@@ -0,0 +1,37 @@
from typing import List

import torch
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split

from utils.helper_functions import normalize_ratios


def stratified_random_split(ds: torch.utils.data.Dataset, parts: List[float], targets: List[int]) -> List[torch.utils.data.Dataset]:
    """
    Perform a stratified random split on the dataset.

    Args:
        ds: PyTorch dataset to split.
        parts: List of three proportions (train, val, test); they are normalized to sum to 1.
        targets: List of labels corresponding to the dataset samples.

    Returns:
        List of PyTorch datasets for the train, validation, and test splits.
    """
    total_length = len(ds)

    # Normalize ratios so they sum to 1
    parts = normalize_ratios(parts)

    # First split: train vs. the remaining (val + test) samples, stratified by label
    indices = list(range(total_length))
    train_indices, temp_indices, _, temp_targets = train_test_split(
        indices, targets, test_size=(1 - parts[0]), stratify=targets, random_state=42
    )

    # Second split: divide the remainder into validation and test sets
    val_size = parts[1] / (parts[1] + parts[2])
    val_indices, test_indices, _, _ = train_test_split(
        temp_indices, temp_targets, test_size=(1 - val_size), stratify=temp_targets, random_state=42
    )

    return [Subset(ds, train_indices), Subset(ds, val_indices), Subset(ds, test_indices)]
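A minimal sketch of calling stratified_random_split; the toy tensors stand in for the real audio dataset and its label list:

import torch
from torch.utils.data import TensorDataset

from utils.random_split import stratified_random_split

# Toy dataset: 100 samples, 4 emotion classes
features = torch.randn(100, 16000)
labels = torch.randint(0, 4, (100,))
ds = TensorDataset(features, labels)

train_ds, val_ds, test_ds = stratified_random_split(
    ds, parts=[0.7, 0.15, 0.15], targets=labels.tolist()
)
print(len(train_ds), len(val_ds), len(test_ds))  # roughly 70 / 15 / 15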