Commit b1426fb
Parent(s): 965e524

Changed Folder Structure

Changed the code format and folder structure to make the codebase easier to optimize in the future.

Files changed:
- README.md +16 -13
- app.py +114 -139
- src/__init__.py +13 -0
- src/models/__init__.py +8 -0
- src/models/diarization.py +44 -0
- src/models/summarization.py +44 -0
- src/models/transcription.py +36 -0
- src/utils/__init__.py +7 -0
- src/utils/audio_processor.py +43 -0
- src/utils/formatter.py +59 -0
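For reference, the resulting layout (reconstructed from the changed-file list above; files the commit does not touch, such as requirements.txt, are omitted):

.
├── README.md
├── app.py
└── src/
    ├── __init__.py
    ├── models/
    │   ├── __init__.py
    │   ├── diarization.py
    │   ├── summarization.py
    │   └── transcription.py
    └── utils/
        ├── __init__.py
        ├── audio_processor.py
        └── formatter.py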
README.md
CHANGED
@@ -1,13 +1,16 @@
# Multi-Speaker Audio Analyzer

A Streamlit application that performs speaker diarization, transcription, and summarization on audio files.

## Features
- Speaker Diarization using Pyannote
- Transcription using Whisper
- Summarization using BART

## Setup
1. Install requirements: `pip install -r requirements.txt`
2. Add HuggingFace token to Streamlit secrets
3. Run app: `streamlit run app.py`

## Usage
Upload an audio file (MP3/WAV) and click "Analyze Audio" to process.
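Setup step 2 deserves a concrete illustration. For local runs, Streamlit reads secrets from .streamlit/secrets.toml; the key name hf_token comes from app.py's st.secrets["hf_token"], and the value below is a placeholder (on a hosted Space the same key would instead be set through the platform's secrets settings):

# .streamlit/secrets.toml (local development; value is a placeholder)
hf_token = "hf_your_token_here"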
app.py
CHANGED
@@ -1,129 +1,93 @@

The monolithic implementation was removed: the inline model setup (a pyannote pipeline loaded with use_auth_token=st.secrets["hf_token"], a Whisper transcriber, and a BART summarizer built via tf_pipeline with model="facebook/bart-large-cnn"), the pydub-based audio conversion inside process_audio, the module-level format_speaker_segments and format_timestamp helpers, and the inline tab rendering in main() all moved into the new src/ packages. The new file:

"""
Multi-Speaker Audio Analyzer
A Streamlit application that performs speaker diarization, transcription, and summarization on audio files.

Author: [Your Name]
Date: January 2025
"""

import streamlit as st
from src.models.diarization import SpeakerDiarizer
from src.models.transcription import Transcriber
from src.models.summarization import Summarizer
from src.utils.audio_processor import AudioProcessor
from src.utils.formatter import TimeFormatter
import os

# Cache for model loading
@st.cache_resource
def load_models():
    """
    Load and cache all required models.

    Returns:
        tuple: (diarizer, transcriber, summarizer) or (None, None, None) if loading fails
    """
    try:
        diarizer = SpeakerDiarizer(st.secrets["hf_token"])
        diarizer_model = diarizer.load_model()

        transcriber = Transcriber()
        transcriber_model = transcriber.load_model()

        summarizer = Summarizer()
        summarizer_model = summarizer.load_model()

        if not all([diarizer_model, transcriber_model, summarizer_model]):
            raise ValueError("One or more models failed to load")

        return diarizer, transcriber, summarizer
    except Exception as e:
        st.error(f"Error loading models: {str(e)}")
        st.error("Debug info: Check if HF token is valid and has necessary permissions")
        return None, None, None

def process_audio(audio_file, max_duration=600):
    """
    Process the uploaded audio file through all models.

    Args:
        audio_file: Uploaded audio file
        max_duration (int): Maximum duration in seconds

    Returns:
        dict: Processing results containing diarization, transcription, and summary
    """
    try:
        # Process audio file
        audio_processor = AudioProcessor()
        tmp_path = audio_processor.standardize_audio(audio_file)

        # Load models
        diarizer, transcriber, summarizer = load_models()
        if not all([diarizer, transcriber, summarizer]):
            return "Model loading failed"

        # Process with each model
        with st.spinner("Identifying speakers..."):
            diarization_result = diarizer.process(tmp_path)

        with st.spinner("Transcribing audio..."):
            transcription = transcriber.process(tmp_path)

        with st.spinner("Generating summary..."):
            summary = summarizer.process(transcription["text"])

        # Cleanup
        os.unlink(tmp_path)

        return {
            "diarization": diarization_result,
            "transcription": transcription,
            "summary": summary[0]["summary_text"]
        }

    except Exception as e:
        st.error(f"Error processing audio: {str(e)}")
        return None

def main():
    """Main application function."""
    st.title("Multi-Speaker Audio Analyzer")
    st.write("Upload an audio file (MP3/WAV) up to 5 minutes long for best performance")

(new lines 94-107, the file-upload and processing UI, are unchanged context and not shown in this diff)

@@ -144,51 +108,62 @@

    if results:
        tab1, tab2, tab3 = st.tabs(["Speakers", "Transcription", "Summary"])

        # Display speaker timeline
        with tab1:
            display_speaker_timeline(results)

        # Display transcription
        with tab2:
            display_transcription(results)

        # Display summary
        with tab3:
            display_summary(results)

def display_speaker_timeline(results):
    """Display speaker diarization results in a timeline format."""
    st.write("Speaker Timeline:")
    segments = TimeFormatter.format_speaker_segments(
        results["diarization"],
        results["transcription"]
    )

    if segments:
        for segment in segments:
            col1, col2, col3 = st.columns([2,3,5])

            with col1:
                display_speaker_info(segment)

            with col2:
                display_timestamp(segment)

            with col3:
                display_text(segment)

            st.markdown("---")
    else:
        st.warning("No speaker segments detected")

def display_speaker_info(segment):
    """Display speaker information with color coding."""
    speaker_num = int(segment['speaker'].split('_')[1])
    colors = ['🔵', '🔴']
    speaker_color = colors[speaker_num % len(colors)]
    st.write(f"{speaker_color} {segment['speaker']}")

def display_timestamp(segment):
    """Display formatted timestamps."""
    start_time = TimeFormatter.format_timestamp(segment['start'])
    end_time = TimeFormatter.format_timestamp(segment['end'])
    st.write(f"{start_time} → {end_time}")

def display_text(segment):
    """Display speaker's text."""
    if segment['text']:
        st.write(f"\"{segment['text']}\"")
    else:
        st.write("(no speech detected)")

if __name__ == "__main__":
    main()
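One gap to flag in this refactor: main() calls display_transcription(results) and display_summary(results), but neither function is defined anywhere in the new app.py shown above, so opening those tabs would raise NameError. Minimal sketches of what they might look like, reconstructed from the inline tab handlers this commit removed (the transcription branch mirrors the old tab2 code; the summary branch is an assumption, since the old tab3 body is not recoverable from this view):

def display_transcription(results):
    """Display the full transcription text (mirrors the removed tab2 handler)."""
    if "text" in results["transcription"]:
        st.write(results["transcription"]["text"])
    else:
        st.warning("No transcription available")

def display_summary(results):
    """Display the generated summary (assumed behavior; old tab3 body not shown)."""
    if results.get("summary"):
        st.write(results["summary"])
    else:
        st.warning("No summary available")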
src/__init__.py
ADDED
@@ -0,0 +1,13 @@
"""
Initialize the src package.
"""
from src.models import SpeakerDiarizer, Transcriber, Summarizer
from src.utils import AudioProcessor, TimeFormatter

__all__ = [
    'SpeakerDiarizer',
    'Transcriber',
    'Summarizer',
    'AudioProcessor',
    'TimeFormatter'
]
src/models/__init__.py
ADDED
@@ -0,0 +1,8 @@
"""
Initialize the models package.
"""
from .diarization import SpeakerDiarizer
from .transcription import Transcriber
from .summarization import Summarizer

__all__ = ['SpeakerDiarizer', 'Transcriber', 'Summarizer']
src/models/diarization.py
ADDED
@@ -0,0 +1,44 @@
"""
Speaker Diarization Model Handler
Manages the pyannote-audio model for speaker diarization tasks.
"""

from pyannote.audio import Pipeline
import streamlit as st

class SpeakerDiarizer:
    def __init__(self, token: str):
        """Initialize the diarization model.

        Args:
            token (str): HuggingFace authentication token
        """
        self.token = token
        self.model = None

    def load_model(self):
        """Load the pyannote speaker diarization model."""
        try:
            self.model = Pipeline.from_pretrained(
                "pyannote/speaker-diarization",
                use_auth_token=self.token
            )
            return self.model
        except Exception as e:
            st.error(f"Error loading diarization model: {str(e)}")
            return None

    def process(self, audio_path: str):
        """Process audio file for speaker diarization.

        Args:
            audio_path (str): Path to the audio file

        Returns:
            dict: Diarization results
        """
        try:
            return self.model(audio_path)
        except Exception as e:
            st.error(f"Error in diarization: {str(e)}")
            return None
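For context, a minimal standalone sketch of how SpeakerDiarizer is driven (the token value and meeting.wav are placeholders; the itertracks loop mirrors what TimeFormatter.format_speaker_segments does with the same result):

from src.models.diarization import SpeakerDiarizer

diarizer = SpeakerDiarizer(token="hf_your_token_here")  # placeholder token
if diarizer.load_model():
    result = diarizer.process("meeting.wav")  # hypothetical input path
    # result is a pyannote annotation of speaker turns, not a plain dict
    for turn, _, speaker in result.itertracks(yield_label=True):
        print(f"{speaker}: {turn.start:.1f}s -> {turn.end:.1f}s")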
src/models/summarization.py
ADDED
@@ -0,0 +1,44 @@
"""
Summarization Model Handler
Manages the BART model for text summarization.
"""

from transformers import pipeline
import torch
import streamlit as st

class Summarizer:
    def __init__(self):
        """Initialize the summarization model."""
        self.model = None

    def load_model(self):
        """Load the BART summarization model."""
        try:
            self.model = pipeline(
                "summarization",
                model="facebook/bart-large-cnn",
                device=0 if torch.cuda.is_available() else -1
            )
            return self.model
        except Exception as e:
            st.error(f"Error loading summarization model: {str(e)}")
            return None

    def process(self, text: str, max_length: int = 130, min_length: int = 30):
        """Process text for summarization.

        Args:
            text (str): Text to summarize
            max_length (int): Maximum length of summary
            min_length (int): Minimum length of summary

        Returns:
            str: Summarized text
        """
        try:
            summary = self.model(text, max_length=max_length, min_length=min_length)
            return summary
        except Exception as e:
            st.error(f"Error in summarization: {str(e)}")
            return None
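Note that process actually returns the raw transformers pipeline output (a list of dicts) rather than the str the docstring claims, which is why app.py indexes summary[0]["summary_text"]. A short usage sketch (the input text is a placeholder):

from src.models.summarization import Summarizer

summarizer = Summarizer()
if summarizer.load_model():
    result = summarizer.process("Long meeting transcript text..." * 50)
    if result:
        print(result[0]["summary_text"])  # pipeline output: [{'summary_text': ...}]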
src/models/transcription.py
ADDED
@@ -0,0 +1,36 @@
"""
Transcription Model Handler
Manages the Whisper model for speech-to-text transcription.
"""

import whisper
import streamlit as st

class Transcriber:
    def __init__(self):
        """Initialize the transcription model."""
        self.model = None

    def load_model(self):
        """Load the Whisper transcription model."""
        try:
            self.model = whisper.load_model("base")
            return self.model
        except Exception as e:
            st.error(f"Error loading transcription model: {str(e)}")
            return None

    def process(self, audio_path: str):
        """Process audio file for transcription.

        Args:
            audio_path (str): Path to the audio file

        Returns:
            dict: Transcription results
        """
        try:
            return self.model.transcribe(audio_path)
        except Exception as e:
            st.error(f"Error in transcription: {str(e)}")
            return None
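Whisper's transcribe returns a dict with the full text plus timed segments, which is exactly what TimeFormatter.format_speaker_segments consumes downstream. A minimal sketch (the audio path is a placeholder):

from src.models.transcription import Transcriber

transcriber = Transcriber()
if transcriber.load_model():
    result = transcriber.process("meeting.wav")  # hypothetical input path
    if result:
        print(result["text"])           # full transcript
        for seg in result["segments"]:  # per-segment timing used downstream
            print(seg["start"], seg["end"], seg["text"])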
src/utils/__init__.py
ADDED
@@ -0,0 +1,7 @@
"""
Initialize the utils package.
"""
from .audio_processor import AudioProcessor
from .formatter import TimeFormatter

__all__ = ['AudioProcessor', 'TimeFormatter']
src/utils/audio_processor.py
ADDED
@@ -0,0 +1,43 @@
"""
Audio Processing Utilities
Handles audio file preprocessing and standardization.
"""

from pydub import AudioSegment
import io
import tempfile
import os

class AudioProcessor:
    @staticmethod
    def standardize_audio(audio_file):
        """Standardize audio file to required format.

        Args:
            audio_file: Uploaded audio file

        Returns:
            str: Path to processed audio file
        """
        try:
            audio_bytes = io.BytesIO(audio_file.getvalue())

            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                if audio_file.name.lower().endswith('.mp3'):
                    audio = AudioSegment.from_mp3(audio_bytes)
                else:
                    audio = AudioSegment.from_wav(audio_bytes)

                audio = audio.set_frame_rate(16000)
                audio = audio.set_channels(1)
                audio = audio.set_sample_width(2)

                audio.export(
                    tmp.name,
                    format="wav",
                    parameters=["-ac", "1", "-ar", "16000"]
                )
                return tmp.name

        except Exception as e:
            raise Exception(f"Error processing audio: {str(e)}")
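A note on the contract here: standardize_audio writes a 16 kHz, mono, 16-bit WAV to a NamedTemporaryFile created with delete=False and returns its path, so cleanup is the caller's job (app.py does os.unlink(tmp_path)). A hedged sketch of that flow, with uploaded standing in for a Streamlit UploadedFile:

import os
from src.utils.audio_processor import AudioProcessor

def handle_upload(uploaded):
    """Run processing against a standardized temp WAV, then clean up."""
    tmp_path = AudioProcessor.standardize_audio(uploaded)
    try:
        ...  # hand tmp_path to the diarization/transcription models
    finally:
        os.unlink(tmp_path)  # temp file was created with delete=False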
src/utils/formatter.py
ADDED
@@ -0,0 +1,59 @@
"""
Formatting utilities for timestamps and speaker segments.
"""

class TimeFormatter:
    @staticmethod
    def format_timestamp(seconds: float) -> str:
        """Format seconds into MM:SS.ss format.

        Args:
            seconds (float): Time in seconds

        Returns:
            str: Formatted time string
        """
        minutes = int(seconds // 60)
        seconds = seconds % 60
        return f"{minutes:02d}:{seconds:05.2f}"

    @staticmethod
    def format_speaker_segments(diarization_result, transcription):
        """Format speaker segments with transcribed text.

        Args:
            diarization_result: Diarization model output
            transcription: Whisper transcription output

        Returns:
            list: Formatted speaker segments
        """
        if diarization_result is None:
            return []

        formatted_segments = []
        whisper_segments = transcription.get('segments', [])

        try:
            for turn, _, speaker in diarization_result.itertracks(yield_label=True):
                current_text = ""
                for w_segment in whisper_segments:
                    w_start = float(w_segment['start'])
                    w_end = float(w_segment['end'])

                    if (w_start >= turn.start and w_start < turn.end) or \
                       (w_end > turn.start and w_end <= turn.end):
                        current_text += w_segment['text'].strip() + " "

                formatted_segments.append({
                    'speaker': str(speaker),
                    'start': float(turn.start),
                    'end': float(turn.end),
                    'text': current_text.strip()
                })

        except Exception as e:
            print(f"Error formatting segments: {str(e)}")
            return []

        return formatted_segments
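Worked examples of the timestamp format: minutes are zero-padded to two digits and seconds to five characters with two decimals, so:

from src.utils.formatter import TimeFormatter

print(TimeFormatter.format_timestamp(75.5))  # 01:15.50
print(TimeFormatter.format_timestamp(9.25))  # 00:09.25

The overlap test in format_speaker_segments attaches a Whisper segment to a speaker turn if either segment boundary falls inside the turn, so text spanning a turn boundary can appear under both adjacent speakers.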