Spaces:

awacke1
/

CodeCompetitionClaudeVsGPT

Running

App Files Files Community

awacke1 committed on Dec 20, 2024

Commit

88675e3

verified ·

1 Parent(s): 420b18d

Update app.py

Browse files

Files changed (1) hide show

app.py +141 -201

app.py CHANGED Viewed

@@ -9,23 +9,20 @@ import os
 import glob
 import random
 from pathlib import Path
-from datetime import datetime, timedelta
 import edge_tts
 import asyncio
 import requests
-from collections import defaultdict
 import streamlit.components.v1 as components
-from urllib.parse import quote
-from xml.etree import ElementTree as ET
-from datasets import load_dataset
 import base64
 import re
 # -------------------- Configuration & Constants --------------------
 USER_NAMES = [
-    "Alex", "Jordan", "Taylor", "Morgan", "Rowan", "Avery", "Riley", "Quinn",
-    "Casey", "Jesse", "Reese", "Skyler", "Ellis", "Devon", "Aubrey", "Kendall",
-    "Parker", "Dakota", "Sage", "Finley"
 ]
 ENGLISH_VOICES = [
@@ -34,6 +31,9 @@ ENGLISH_VOICES = [
     "en-CA-LiamNeural", "en-AU-NatashaNeural", "en-AU-WilliamNeural"
 ]
 ROWS_PER_PAGE = 100
 MIN_SEARCH_SCORE = 0.3
 EXACT_MATCH_BOOST = 2.0
@@ -47,7 +47,6 @@ SESSION_VARS = {
     'should_rerun': False,
     'search_columns': [],
     'initial_search_done': False,
-    'tts_voice': "en-US-AriaNeural",
     'arxiv_last_query': "",
     'dataset_loaded': False,
     'current_page': 0,
@@ -59,7 +58,8 @@ SESSION_VARS = {
     'voice_text': None,
     'user_name': random.choice(USER_NAMES),
     'max_items': 100,
-    'global_voice': "en-US-AriaNeural"  # Default global voice
 }
 for var, default in SESSION_VARS.items():
@@ -85,19 +85,17 @@ def clean_for_speech(text: str) -> str:
     text = re.sub(r"\s+", " ", text).strip()
     return text
-async def edge_tts_generate_audio(text, voice="en-US-AriaNeural", rate=0, pitch=0):
     text = clean_for_speech(text)
     if not text.strip():
         return None
-    rate_str = f"{rate:+d}%"
-    pitch_str = f"{pitch:+d}Hz"
-    communicate = edge_tts.Communicate(text, voice, rate=rate_str, pitch=pitch_str)
-    out_fn = f"speech_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
     await communicate.save(out_fn)
     return out_fn
 def speak_with_edge_tts(text, voice="en-US-AriaNeural"):
-    return asyncio.run(edge_tts_generate_audio(text, voice, 0, 0))
 def play_and_download_audio(file_path):
     if file_path and os.path.exists(file_path):
@@ -138,7 +136,6 @@ def list_saved_inputs():
     return files
 def parse_md_file(fpath):
-    # Extract user and text from md
     user_line = ""
     ts_line = ""
     content_lines = []
@@ -154,139 +151,7 @@ def parse_md_file(fpath):
     content = "\n".join(content_lines).strip()
     return user_line, ts_line, content
-def fetch_dataset_info(dataset_id, token):
-    info_url = f"https://huggingface.co/api/datasets/{dataset_id}"
-    try:
-        response = requests.get(info_url, timeout=30)
-        if response.status_code == 200:
-            return response.json()
-    except Exception:
-        pass
-    return None
-@st.cache_data
-def get_dataset_info(dataset_id, token):
-    try:
-        dataset = load_dataset(dataset_id, token=token, streaming=True)
-        return dataset['train'].info
-    except:
-        return None
-@st.cache_data
-def load_dataset_page(dataset_id, token, page, rows_per_page):
-    try:
-        start_idx = page * rows_per_page
-        end_idx = start_idx + rows_per_page
-        dataset = load_dataset(
-            dataset_id,
-            token=token,
-            streaming=False,
-            split=f'train[{start_idx}:{end_idx}]'
-        )
-        return pd.DataFrame(dataset)
-    except:
-        return pd.DataFrame()
-class FastDatasetSearcher:
-    def __init__(self, dataset_id="tomg-group-umd/cinepile"):
-        self.dataset_id = dataset_id
-        self.text_model = get_model()
-        self.token = os.environ.get('DATASET_KEY')
-    def load_page(self, page=0):
-        return load_dataset_page(self.dataset_id, self.token, page, ROWS_PER_PAGE)
-    def quick_search(self, query, df):
-        if df.empty or not query.strip():
-            return df
-        try:
-            searchable_cols = []
-            if len(df) > 0:
-                for col in df.columns:
-                    sample_val = df[col].iloc[0]
-                    if not isinstance(sample_val, (np.ndarray, bytes)):
-                        searchable_cols.append(col)
-            query_lower = query.lower()
-            query_terms = set(query_lower.split())
-            query_embedding = self.text_model.encode([query], show_progress_bar=False)[0]
-            scores = []
-            matched_any = []
-            for _, row in df.iterrows():
-                text_parts = []
-                row_matched = False
-                exact_match = False
-                priority_fields = ['description', 'matched_text']
-                other_fields = [col for col in searchable_cols if col not in priority_fields]
-                for col in priority_fields:
-                    if col in row:
-                        val = row[col]
-                        if val is not None:
-                            val_str = str(val).lower()
-                            if query_lower in val_str.split():
-                                exact_match = True
-                            if any(term in val_str.split() for term in query_terms):
-                                row_matched = True
-                            text_parts.append(str(val))
-                for col in other_fields:
-                    val = row[col]
-                    if val is not None:
-                        val_str = str(val).lower()
-                        if query_lower in val_str.split():
-                            exact_match = True
-                        if any(term in val_str.split() for term in query_terms):
-                            row_matched = True
-                        text_parts.append(str(val))
-                text = ' '.join(text_parts)
-                if text.strip():
-                    text_tokens = set(text.lower().split())
-                    matching_terms = query_terms.intersection(text_tokens)
-                    keyword_score = len(matching_terms) / len(query_terms) if len(query_terms) > 0 else 0.0
-                    text_embedding = self.text_model.encode([text], show_progress_bar=False)[0]
-                    semantic_score = float(cosine_similarity([query_embedding], [text_embedding])[0][0])
-                    combined_score = 0.7 * keyword_score + 0.3 * semantic_score
-                    if exact_match:
-                        combined_score *= EXACT_MATCH_BOOST
-                    elif row_matched:
-                        combined_score *= 1.2
-                else:
-                    combined_score = 0.0
-                    row_matched = False
-                scores.append(combined_score)
-                matched_any.append(row_matched)
-            results_df = df.copy()
-            results_df['score'] = scores
-            results_df['matched'] = matched_any
-            filtered_df = results_df[
-                (results_df['matched']) |
-                (results_df['score'] > MIN_SEARCH_SCORE)
-            ]
-            return filtered_df.sort_values('score', ascending=False)
-        except:
-            return df
-def play_text(text):
-    voice = st.session_state.get('global_voice', "en-US-AriaNeural")
-    audio_file = speak_with_edge_tts(text, voice=voice)
-    if audio_file:
-        play_and_download_audio(audio_file)
 def arxiv_search(query, max_results=3):
-    # Simple arXiv search using RSS (for demonstration)
-    # In production, use official arXiv API or a library.
     base_url = "http://export.arxiv.org/api/query"
     params = {
         'search_query': query.replace(' ', '+'),
@@ -302,29 +167,76 @@ def arxiv_search(query, max_results=3):
         for entry in entries:
             title = entry.find('a:title', ns).text.strip()
             summary = entry.find('a:summary', ns).text.strip()
-            # Just truncating summary for demo
             summary_short = summary[:300] + "..."
             results.append((title, summary_short))
         return results
     return []
 def summarize_arxiv_results(results):
-    # Just combine titles and short summaries
     lines = []
     for i, (title, summary) in enumerate(results, 1):
         lines.append(f"Result {i}: {title}\n{summary}\n")
     return "\n\n".join(lines)
 def main():
     st.title("🎙️ Voice Chat & Search")
     # Sidebar
     with st.sidebar:
         # Editable user name
-        st.session_state['user_name'] = st.text_input("Current User:", value=st.session_state['user_name'])
-        # Global voice selection
-        st.session_state['global_voice'] = st.selectbox("Select Global Voice:", ENGLISH_VOICES, index=0)
         st.session_state['max_items'] = st.number_input("Max Items per search iteration:", min_value=1, max_value=1000, value=st.session_state['max_items'])
@@ -339,82 +251,109 @@ def main():
     voice_component = create_voice_component()
     voice_val = voice_component(my_input_value="Start speaking...")
-    # Tabs: Voice Chat History, Arxiv Search, Dataset Search, Settings
     tab1, tab2, tab3, tab4 = st.tabs(["🗣️ Voice Chat History", "📚 ArXiv Search", "📊 Dataset Search", "⚙️ Settings"])
     # ------------------ Voice Chat History -------------------------
     with tab1:
         st.subheader("Voice Chat History")
-        # List saved inputs and responses and allow playing them
         files = list_saved_inputs()
-        for fpath in reversed(files):
             user, ts, content = parse_md_file(fpath)
             with st.expander(f"{ts} - {user}", expanded=False):
                 st.write(content)
-                if st.button("🔊 Read Aloud", key=f"read_{fpath}"):
-                    play_text(content)
     # ------------------ ArXiv Search -------------------------
     with tab2:
         st.subheader("ArXiv Search")
-        # If we have a voice_val and autorun with ArXiv chosen:
         edited_input = st.text_area("Enter or Edit Search Query:", value=(voice_val.strip() if voice_val else ""), height=100)
         autorun = st.checkbox("⚡ Auto-Run", value=True)
         run_arxiv = st.button("🔍 ArXiv Search")
         input_changed = (edited_input != st.session_state.get('old_val'))
         if autorun and input_changed and edited_input.strip():
-            st.session_state['old_val'] = edited_input
-            # Save user input
-            save_input_as_md(st.session_state['user_name'], edited_input, prefix="input")
-            with st.spinner("Searching ArXiv..."):
-                results = arxiv_search(edited_input)
-                if results:
-                    summary = summarize_arxiv_results(results)
-                    # Save response
-                    save_response_as_md(st.session_state['user_name'], summary, prefix="response")
-                    st.write(summary)
-                    # Autoplay TTS
-                    play_text(summary)
-                else:
-                    st.warning("No results found on ArXiv.")
         if run_arxiv and edited_input.strip():
-            # Manual trigger
-            save_input_as_md(st.session_state['user_name'], edited_input, prefix="input")
-            with st.spinner("Searching ArXiv..."):
-                results = arxiv_search(edited_input)
-                if results:
-                    summary = summarize_arxiv_results(results)
-                    save_response_as_md(st.session_state['user_name'], summary, prefix="response")
-                    st.write(summary)
-                    play_text(summary)
-                else:
-                    st.warning("No results found on ArXiv.")
     # ------------------ Dataset Search -------------------------
     with tab3:
         st.subheader("Dataset Search")
-        search = FastDatasetSearcher()
         query = st.text_input("Enter dataset search query:")
         run_ds_search = st.button("Search Dataset")
         num_results = st.slider("Max results:", 1, 100, 20)
         if run_ds_search and query.strip():
             with st.spinner("Searching dataset..."):
-                df = search.load_page()
-                results = search.quick_search(query, df)
-                if len(results) > 0:
                     st.write(f"Found {len(results)} results:")
                     shown = 0
-                    for i, (_, result) in enumerate(results.iterrows(), 1):
                         if shown >= num_results:
                             break
                         with st.expander(f"Result {i}", expanded=(i==1)):
-                            # Just print result keys/values here
-                            for k, v in result.items():
-                                if k not in ['score', 'matched']:
-                                    st.write(f"**{k}:** {v}")
                         shown += 1
                 else:
                     st.warning("No matching results found.")
@@ -422,13 +361,14 @@ def main():
     # ------------------ Settings Tab -------------------------
     with tab4:
         st.subheader("Settings")
-        st.write("Adjust voice and search parameters in the sidebar.")
         if st.button("🗑️ Clear Search History"):
             st.session_state['search_history'] = []
-            # Optionally delete files:
-            # for fpath in list_saved_inputs():
-            #     os.remove(fpath)
-            st.success("Search history cleared!")
 if __name__ == "__main__":
     main()

 import glob
 import random
 from pathlib import Path
+from datetime import datetime
 import edge_tts
 import asyncio
 import requests
 import streamlit.components.v1 as components
 import base64
 import re
+from xml.etree import ElementTree as ET
+from datasets import load_dataset
 # -------------------- Configuration & Constants --------------------
+# Exactly 11 user names and 11 voices
 USER_NAMES = [
+    "Aria", "Guy", "Sonia", "Tony", "Jenny", "Davis", "Libby", "Clara", "Liam", "Natasha", "William"
 ]
 ENGLISH_VOICES = [
     "en-CA-LiamNeural", "en-AU-NatashaNeural", "en-AU-WilliamNeural"
 ]
+# Map each user to a corresponding voice
+USER_VOICES = dict(zip(USER_NAMES, ENGLISH_VOICES))
 ROWS_PER_PAGE = 100
 MIN_SEARCH_SCORE = 0.3
 EXACT_MATCH_BOOST = 2.0
     'should_rerun': False,
     'search_columns': [],
     'initial_search_done': False,
     'arxiv_last_query': "",
     'dataset_loaded': False,
     'current_page': 0,
     'voice_text': None,
     'user_name': random.choice(USER_NAMES),
     'max_items': 100,
+    'global_voice': "en-US-AriaNeural",
+    'last_arxiv_input': None  # To avoid double-running ArXiv search
 }
 for var, default in SESSION_VARS.items():
     text = re.sub(r"\s+", " ", text).strip()
     return text
async def edge_tts_generate_audio(text, voice="en-US-AriaNeural"):
    """Synthesize ``text`` to an MP3 file with edge-tts and return its path.

    The text is first passed through ``clean_for_speech``. If nothing is left
    afterwards, no file is written and ``None`` is returned.

    Args:
        text: Raw text to speak.
        voice: edge-tts voice name handed to ``edge_tts.Communicate``.

    Returns:
        The name of the saved ``speech_<timestamp>.mp3`` file, or ``None``.
    """
    cleaned = clean_for_speech(text)
    if not cleaned.strip():
        return None
    speaker = edge_tts.Communicate(cleaned, voice)
    # %f adds microseconds so two clips made within the same second get
    # different file names.
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
    mp3_path = f"speech_{stamp}.mp3"
    await speaker.save(mp3_path)
    return mp3_path
 def speak_with_edge_tts(text, voice="en-US-AriaNeural"):
+    return asyncio.run(edge_tts_generate_audio(text, voice))
 def play_and_download_audio(file_path):
     if file_path and os.path.exists(file_path):
     return files
 def parse_md_file(fpath):
     user_line = ""
     ts_line = ""
     content_lines = []
     content = "\n".join(content_lines).strip()
     return user_line, ts_line, content
 def arxiv_search(query, max_results=3):
     base_url = "http://export.arxiv.org/api/query"
     params = {
         'search_query': query.replace(' ', '+'),
         for entry in entries:
             title = entry.find('a:title', ns).text.strip()
             summary = entry.find('a:summary', ns).text.strip()
             summary_short = summary[:300] + "..."
             results.append((title, summary_short))
         return results
     return []
 def summarize_arxiv_results(results):
     lines = []
     for i, (title, summary) in enumerate(results, 1):
         lines.append(f"Result {i}: {title}\n{summary}\n")
     return "\n\n".join(lines)
+# Simple dataset search: text-based substring search
def simple_dataset_search(query, df):
    """Return the rows of ``df`` in which any query term occurs as a substring.

    The query is lower-cased and split on whitespace. For every row, all
    string cells (lower-cased) and numeric cells (stringified) are joined into
    one text, and the row is kept when at least one query term is contained
    in that text.

    Args:
        query: Free-text search string. ``None``, empty or whitespace-only
            queries match nothing.
        df: ``pandas.DataFrame`` to scan.

    Returns:
        A ``pandas.DataFrame`` holding the matching rows under their original
        index labels, or an empty ``DataFrame`` when nothing matches.
    """
    import numbers

    if df.empty or not query or not query.strip():
        return pd.DataFrame()

    query_terms = query.lower().split()
    matches = []
    for _, row in df.iterrows():
        text_parts = []
        for val in row:
            if isinstance(val, str):
                text_parts.append(val.lower())
            # numbers.Real also covers NumPy scalar types, which are not
            # subclasses of the built-in int. ``val == val`` is False only for
            # NaN: a missing value must not be searched as the text "nan",
            # otherwise queries like "an" would match every row with a gap.
            elif isinstance(val, numbers.Real) and val == val:
                text_parts.append(str(val))
        full_text = " ".join(text_parts)
        if any(term in full_text for term in query_terms):
            matches.append(row)

    return pd.DataFrame(matches) if matches else pd.DataFrame()
@st.cache_data
def load_dataset_page(dataset_id, token, page, rows_per_page):
    """Load one page of a dataset's ``train`` split as a DataFrame.

    Args:
        dataset_id: Dataset identifier passed to ``load_dataset``.
        token: Access token passed to ``load_dataset`` (may be ``None``).
        page: Zero-based page number.
        rows_per_page: Number of rows per page.

    Returns:
        A ``pandas.DataFrame`` built from rows
        ``[page * rows_per_page, (page + 1) * rows_per_page)`` of the
        ``train`` split, or an empty ``DataFrame`` if loading fails.
    """
    # NOTE(review): with @st.cache_data the empty fallback is presumably
    # cached as well, so a transient failure may stick until the cache is
    # cleared. Confirm whether that is intended.
    try:
        start_idx = page * rows_per_page
        end_idx = start_idx + rows_per_page
        dataset = load_dataset(
            dataset_id,
            token=token,
            streaming=False,
            split=f'train[{start_idx}:{end_idx}]'
        )
        return pd.DataFrame(dataset)
    except Exception:
        # Best-effort: any loading error degrades to "no rows". Unlike a bare
        # ``except:``, this lets KeyboardInterrupt and SystemExit propagate.
        return pd.DataFrame()
class SimpleDatasetSearcher:
    """Holds a dataset id and access token and loads pages of that dataset."""

    def __init__(self, dataset_id="tomg-group-umd/cinepile"):
        """Remember ``dataset_id`` and read the token from ``DATASET_KEY``."""
        # The token is None when the environment variable is not set.
        self.token = os.getenv('DATASET_KEY')
        self.dataset_id = dataset_id

    def load_page(self, page=0):
        """Return page ``page`` of the dataset via ``load_dataset_page``."""
        page_frame = load_dataset_page(
            self.dataset_id,
            self.token,
            page,
            ROWS_PER_PAGE,
        )
        return page_frame
def concatenate_mp3(files, output_file):
    """Write the bytes of every file in ``files``, in order, to ``output_file``.

    This is a naive binary concatenation: no MP3 frames or tags are parsed or
    rewritten. ``output_file`` is created or truncated even when ``files`` is
    empty.

    Args:
        files: Iterable of paths to the input files.
        output_file: Path of the combined file to write. It must not also
            appear in ``files``, because it is truncated before any input
            is read.
    """
    import shutil

    with open(output_file, 'wb') as outfile:
        for path in files:
            with open(path, 'rb') as infile:
                # Stream in chunks rather than loading each whole clip
                # into memory.
                shutil.copyfileobj(infile, outfile)
 def main():
     st.title("🎙️ Voice Chat & Search")
     # Sidebar
     with st.sidebar:
         # Editable user name
+        st.session_state['user_name'] = st.selectbox("Current User:", USER_NAMES, index=0)
         st.session_state['max_items'] = st.number_input("Max Items per search iteration:", min_value=1, max_value=1000, value=st.session_state['max_items'])
     voice_component = create_voice_component()
     voice_val = voice_component(my_input_value="Start speaking...")
+    # Tabs
     tab1, tab2, tab3, tab4 = st.tabs(["🗣️ Voice Chat History", "📚 ArXiv Search", "📊 Dataset Search", "⚙️ Settings"])
     # ------------------ Voice Chat History -------------------------
     with tab1:
         st.subheader("Voice Chat History")
         files = list_saved_inputs()
+        conversation = []
+        for fpath in files:
             user, ts, content = parse_md_file(fpath)
+            conversation.append((user, ts, content))
+        for user, ts, content in reversed(conversation):
             with st.expander(f"{ts} - {user}", expanded=False):
                 st.write(content)
+                if st.button(f"🔊 Read Aloud {ts}-{user}", key=f"read_{fpath}"):
+                    voice = USER_VOICES.get(user, "en-US-AriaNeural")
+                    audio_file = speak_with_edge_tts(content, voice=voice)
+                    if audio_file:
+                        play_and_download_audio(audio_file)
+        # Read entire conversation
+        if st.button("📜 Read Conversation"):
+            # Sort by timestamp to ensure chronological order
+            # Already in order because files is sorted, but let's rely on chronological order:
+            # They are sorted ascending, so conversation is appended ascending.
+            # It's safe to assume files list is chronological by filename.
+            mp3_files = []
+            for user, ts, content in conversation:
+                voice = USER_VOICES.get(user, "en-US-AriaNeural")
+                audio_file = speak_with_edge_tts(content, voice=voice)
+                if audio_file:
+                    mp3_files.append(audio_file)
+                    # Show each line's MP3
+                    st.write(f"**{user} ({ts}):**")
+                    play_and_download_audio(audio_file)
+            if mp3_files:
+                # Concatenate all mp3 files into one
+                combined_file = f"full_conversation_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
+                concatenate_mp3(mp3_files, combined_file)
+                st.write("**Full Conversation Audio:**")
+                play_and_download_audio(combined_file)
     # ------------------ ArXiv Search -------------------------
     with tab2:
         st.subheader("ArXiv Search")
         edited_input = st.text_area("Enter or Edit Search Query:", value=(voice_val.strip() if voice_val else ""), height=100)
         autorun = st.checkbox("⚡ Auto-Run", value=True)
         run_arxiv = st.button("🔍 ArXiv Search")
         input_changed = (edited_input != st.session_state.get('old_val'))
+        # Only run once:
+        # Conditions to run ArXiv search:
+        # - If autorun and input_changed and edited_input non-empty
+        # - Or if run_arxiv button is pressed and edited_input non-empty
+        should_run_arxiv = False
         if autorun and input_changed and edited_input.strip():
+            should_run_arxiv = True
         if run_arxiv and edited_input.strip():
+            should_run_arxiv = True
+        if should_run_arxiv:
+            st.session_state['old_val'] = edited_input
+            # Avoid double-running by checking if last_arxiv_input is same
+            if st.session_state['last_arxiv_input'] != edited_input:
+                st.session_state['last_arxiv_input'] = edited_input
+                save_input_as_md(st.session_state['user_name'], edited_input, prefix="input")
+                with st.spinner("Searching ArXiv..."):
+                    results = arxiv_search(edited_input)
+                    if results:
+                        summary = summarize_arxiv_results(results)
+                        save_response_as_md(st.session_state['user_name'], summary, prefix="response")
+                        st.write(summary)
+                        # Play summary aloud
+                        voice = USER_VOICES.get(st.session_state['user_name'], "en-US-AriaNeural")
+                        audio_file = speak_with_edge_tts(summary, voice=voice)
+                        if audio_file:
+                            play_and_download_audio(audio_file)
+                    else:
+                        st.warning("No results found on ArXiv.")
     # ------------------ Dataset Search -------------------------
     with tab3:
         st.subheader("Dataset Search")
+        ds_searcher = SimpleDatasetSearcher()
         query = st.text_input("Enter dataset search query:")
         run_ds_search = st.button("Search Dataset")
         num_results = st.slider("Max results:", 1, 100, 20)
         if run_ds_search and query.strip():
             with st.spinner("Searching dataset..."):
+                # For simplicity, just load first page
+                df = ds_searcher.load_page(0)
+                results = simple_dataset_search(query, df)
+                if not results.empty:
                     st.write(f"Found {len(results)} results:")
                     shown = 0
+                    for i, (_, row) in enumerate(results.iterrows(), 1):
                         if shown >= num_results:
                             break
                         with st.expander(f"Result {i}", expanded=(i==1)):
+                            for k, v in row.items():
+                                st.write(f"**{k}:** {v}")
                         shown += 1
                 else:
                     st.warning("No matching results found.")
     # ------------------ Settings Tab -------------------------
     with tab4:
         st.subheader("Settings")
+        # Clear search history: deletes all md files and clears session
         if st.button("🗑️ Clear Search History"):
+            # Delete all files
+            for fpath in list_saved_inputs():
+                os.remove(fpath)
             st.session_state['search_history'] = []
+            st.success("Search history cleared for everyone!")
+            st.experimental_rerun()
 if __name__ == "__main__":
     main()