awacke1 committed on
Commit
420b18d
·
verified ·
1 Parent(s): e9907ed

Create backup10.app.py

Browse files
Files changed (1) hide show
  1. backup10.app.py +434 -0
backup10.app.py ADDED
@@ -0,0 +1,434 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sentence_transformers import SentenceTransformer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ import torch
7
+ import json
8
+ import os
9
+ import glob
10
+ import random
11
+ from pathlib import Path
12
+ from datetime import datetime, timedelta
13
+ import edge_tts
14
+ import asyncio
15
+ import requests
16
+ from collections import defaultdict
17
+ import streamlit.components.v1 as components
18
+ from urllib.parse import quote
19
+ from xml.etree import ElementTree as ET
20
+ from datasets import load_dataset
21
+ import base64
22
+ import re
23
+
24
# -------------------- Configuration & Constants --------------------
# Pool of display names; a random one becomes the default 'user_name'.
USER_NAMES = [
    "Alex",
    "Jordan",
    "Taylor",
    "Morgan",
    "Rowan",
    "Avery",
    "Riley",
    "Quinn",
    "Casey",
    "Jesse",
    "Reese",
    "Skyler",
    "Ellis",
    "Devon",
    "Aubrey",
    "Kendall",
    "Parker",
    "Dakota",
    "Sage",
    "Finley",
]

# Voice identifiers offered in the sidebar's global voice selector.
ENGLISH_VOICES = [
    "en-US-AriaNeural",
    "en-US-GuyNeural",
    "en-GB-SoniaNeural",
    "en-GB-TonyNeural",
    "en-US-JennyNeural",
    "en-US-DavisNeural",
    "en-GB-LibbyNeural",
    "en-CA-ClaraNeural",
    "en-CA-LiamNeural",
    "en-AU-NatashaNeural",
    "en-AU-WilliamNeural",
]

# Dataset paging size and ranking knobs used by the dataset search.
ROWS_PER_PAGE = 100
MIN_SEARCH_SCORE = 0.3
EXACT_MATCH_BOOST = 2.0

# Folder holding saved inputs/responses as markdown; created at import time.
SAVED_INPUTS_DIR = "saved_inputs"
os.makedirs(SAVED_INPUTS_DIR, exist_ok=True)
42
+
43
# Default values for st.session_state keys. A default is applied only when
# the key is not already present, so existing session values are kept.
SESSION_VARS = {
    'search_history': [],
    'last_voice_input': "",
    'transcript_history': [],
    'should_rerun': False,
    'search_columns': [],
    'initial_search_done': False,
    'tts_voice': "en-US-AriaNeural",
    'arxiv_last_query': "",
    'dataset_loaded': False,
    'current_page': 0,
    'data_cache': None,
    'dataset_info': None,
    'nps_submitted': False,
    'nps_last_shown': None,
    'old_val': None,
    'voice_text': None,
    'user_name': random.choice(USER_NAMES),
    'max_items': 100,
    'global_voice': "en-US-AriaNeural",  # Default global voice
}

# Seed any missing session keys with their defaults.
for var in SESSION_VARS:
    if var in st.session_state:
        continue
    st.session_state[var] = SESSION_VARS[var]
68
+
69
@st.cache_resource
def get_model():
    """Build the 'all-MiniLM-L6-v2' SentenceTransformer (cached by Streamlit)."""
    model = SentenceTransformer('all-MiniLM-L6-v2')
    return model
72
+
73
def create_voice_component():
    """Declare the custom "mycomponent" Streamlit component and return it."""
    return components.declare_component("mycomponent", path="mycomponent")
79
+
80
def clean_for_speech(text: str) -> str:
    """Strip markup that should not be read aloud.

    Newlines and "</s>" markers become spaces, "#" characters are removed,
    parenthesised http(s) links are dropped, and runs of whitespace are
    collapsed to a single space with the ends trimmed.
    """
    for needle, substitute in (("\n", " "), ("</s>", " "), ("#", "")):
        text = text.replace(needle, substitute)
    without_links = re.sub(r"\(https?:\/\/[^\)]+\)", "", text)
    return re.sub(r"\s+", " ", without_links).strip()
87
+
88
async def edge_tts_generate_audio(text, voice="en-US-AriaNeural", rate=0, pitch=0):
    """Synthesize *text* to an MP3 file with edge-tts.

    The text is first passed through clean_for_speech. Returns the name of
    the written "speech_<timestamp>.mp3" file, or None when nothing is left
    to speak. *rate* and *pitch* are integers rendered as signed "%" and
    "Hz" strings respectively.
    """
    spoken = clean_for_speech(text)
    if not spoken.strip():
        return None
    communicate = edge_tts.Communicate(
        spoken,
        voice,
        rate=f"{rate:+d}%",
        pitch=f"{pitch:+d}Hz",
    )
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    out_fn = f"speech_{stamp}.mp3"
    await communicate.save(out_fn)
    return out_fn
98
+
99
def speak_with_edge_tts(text, voice="en-US-AriaNeural"):
    """Run edge_tts_generate_audio to completion and return its result.

    Uses rate 0 and pitch 0; the result is the audio file name or None.
    """
    coroutine = edge_tts_generate_audio(text, voice, 0, 0)
    return asyncio.run(coroutine)
101
+
102
def play_and_download_audio(file_path):
    """Render an audio player and a base64 download link for *file_path*.

    Does nothing when *file_path* is falsy or does not exist on disk.
    """
    if not file_path or not os.path.exists(file_path):
        return
    st.audio(file_path)
    # Read through a context manager so the file handle is always closed.
    with open(file_path, "rb") as audio_file:
        encoded = base64.b64encode(audio_file.read()).decode()
    file_name = os.path.basename(file_path)
    dl_link = f'<a href="data:audio/mpeg;base64,{encoded}" download="{file_name}">Download {file_name}</a>'
    st.markdown(dl_link, unsafe_allow_html=True)
107
+
108
def generate_filename(prefix, text):
    """Build "<prefix>_<YYYYmmdd_HHMMSS>_<slug>.md" from the current time.

    The slug comes from the first 50 characters of *text*: characters other
    than word characters, whitespace and "-" are removed, the result is
    trimmed and lower-cased, and runs of whitespace/hyphens become one "-".
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    head = text[:50]
    slug = re.sub(r'[^\w\s-]', '', head).strip().lower()
    slug = re.sub(r'[-\s]+', '-', slug)
    return f"{prefix}_{stamp}_{slug}.md"
113
+
114
def _save_text_as_md(user_name, text, prefix):
    """Write *text* with a user/timestamp header to a new markdown file.

    Shared implementation for save_input_as_md and save_response_as_md.
    Returns the path of the written file, or None when *text* is blank.
    """
    if not text.strip():
        return None
    full_path = os.path.join(SAVED_INPUTS_DIR, generate_filename(prefix, text))
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    with open(full_path, 'w', encoding='utf-8') as f:
        # Header layout is what parse_md_file reads back.
        f.write(f"# User: {user_name}\n")
        f.write(f"**Timestamp:** {timestamp}\n\n")
        f.write(text)
    return full_path


def save_input_as_md(user_name, text, prefix="input"):
    """Save a user input as markdown under SAVED_INPUTS_DIR.

    Returns the file path, or None when *text* is empty or whitespace.
    """
    return _save_text_as_md(user_name, text, prefix)


def save_response_as_md(user_name, text, prefix="response"):
    """Save a response as markdown under SAVED_INPUTS_DIR.

    Returns the file path, or None when *text* is empty or whitespace.
    """
    return _save_text_as_md(user_name, text, prefix)
135
+
136
def list_saved_inputs():
    """Return the sorted paths of all "*.md" files in SAVED_INPUTS_DIR."""
    pattern = os.path.join(SAVED_INPUTS_DIR, "*.md")
    return sorted(glob.glob(pattern))
139
+
140
def parse_md_file(fpath):
    """Split a saved markdown file into (user, timestamp, content).

    The first line starting with "# User:" supplies the user and the first
    line starting with "**Timestamp:**" supplies the timestamp. Every other
    line is stripped and kept as content, including later lines that merely
    look like a header, so body text can no longer overwrite the header
    values or vanish from the content.

    Returns a tuple of three strings; missing headers yield "".
    """
    user_prefix = "# User:"
    ts_prefix = "**Timestamp:**"
    user_line = ""
    ts_line = ""
    content_lines = []
    # Flags rather than truthiness: a header with an empty value still counts.
    seen_user = False
    seen_ts = False
    with open(fpath, 'r', encoding='utf-8') as f:
        for line in f:
            if not seen_user and line.startswith(user_prefix):
                user_line = line[len(user_prefix):].strip()
                seen_user = True
            elif not seen_ts and line.startswith(ts_prefix):
                ts_line = line[len(ts_prefix):].strip()
                seen_ts = True
            else:
                content_lines.append(line.strip())
    content = "\n".join(content_lines).strip()
    return user_line, ts_line, content
156
+
157
def fetch_dataset_info(dataset_id, token):
    """Fetch dataset metadata from the Hugging Face datasets API.

    Args:
        dataset_id: Dataset identifier appended to the API URL.
        token: Optional access token; when truthy it is sent as a Bearer
            Authorization header (previously it was accepted but ignored).

    Returns:
        The decoded JSON body on HTTP 200, otherwise None. Request failures
        and undecodable bodies also yield None (best effort).
    """
    info_url = f"https://huggingface.co/api/datasets/{dataset_id}"
    headers = {"Authorization": f"Bearer {token}"} if token else None
    try:
        response = requests.get(info_url, headers=headers, timeout=30)
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError):
        # Network problems or a non-JSON body: report "no info".
        pass
    return None
166
+
167
@st.cache_data
def get_dataset_info(dataset_id, token):
    """Return the info object of the dataset's 'train' split, or None.

    The dataset is opened in streaming mode. Any ordinary failure (unknown
    dataset, missing 'train' split, auth error, ...) yields None.
    """
    try:
        dataset = load_dataset(dataset_id, token=token, streaming=True)
        return dataset['train'].info
    except Exception:
        # `Exception`, not a bare except, so KeyboardInterrupt/SystemExit
        # still propagate.
        return None
174
+
175
@st.cache_data
def load_dataset_page(dataset_id, token, page, rows_per_page):
    """Load one page of the 'train' split as a DataFrame.

    Rows [page * rows_per_page, (page + 1) * rows_per_page) are requested
    via a split slice. Any ordinary failure yields an empty DataFrame.
    """
    start_idx = page * rows_per_page
    end_idx = start_idx + rows_per_page
    try:
        dataset = load_dataset(
            dataset_id,
            token=token,
            streaming=False,
            split=f'train[{start_idx}:{end_idx}]',
        )
        return pd.DataFrame(dataset)
    except Exception:
        # `Exception`, not a bare except, so KeyboardInterrupt/SystemExit
        # still propagate.
        return pd.DataFrame()
189
+
190
class FastDatasetSearcher:
    """Keyword + embedding search over one page of a Hugging Face dataset."""

    # Columns whose text is placed first when building a row's search text.
    PRIORITY_FIELDS = ('description', 'matched_text')

    def __init__(self, dataset_id="tomg-group-umd/cinepile"):
        """Store the dataset id, the shared text model and the access token.

        The token is read from the DATASET_KEY environment variable.
        """
        self.dataset_id = dataset_id
        self.text_model = get_model()
        self.token = os.environ.get('DATASET_KEY')

    def load_page(self, page=0):
        """Return page *page* (ROWS_PER_PAGE rows) of the dataset as a DataFrame."""
        return load_dataset_page(self.dataset_id, self.token, page, ROWS_PER_PAGE)

    @staticmethod
    def _contains_phrase(tokens, phrase_tokens):
        """Return True if *phrase_tokens* occurs contiguously in *tokens*."""
        width = len(phrase_tokens)
        if width == 0 or width > len(tokens):
            return False
        return any(
            tokens[start:start + width] == phrase_tokens
            for start in range(len(tokens) - width + 1)
        )

    def quick_search(self, query, df):
        """Rank the rows of *df* against *query*.

        Each row's score is 0.7 * keyword overlap + 0.3 * cosine similarity
        of the query and row-text embeddings. It is multiplied by
        EXACT_MATCH_BOOST when the whole query occurs as a token sequence in
        some field, or by 1.2 when any query term matches a field token.
        Rows that matched a term, or score above MIN_SEARCH_SCORE, are
        returned sorted by descending score with added 'score' and 'matched'
        columns.

        Returns *df* unchanged when it is empty, when the query is blank, or
        when scoring fails (best effort).
        """
        if df.empty or not query.strip():
            return df

        try:
            # Skip columns whose first value is raw binary / array data.
            searchable_cols = [
                col for col in df.columns
                if not isinstance(df[col].iloc[0], (np.ndarray, bytes))
            ]
            # Loop-invariant column order: priority fields first, then the rest.
            priority_cols = [col for col in self.PRIORITY_FIELDS if col in df.columns]
            other_cols = [col for col in searchable_cols if col not in self.PRIORITY_FIELDS]
            ordered_cols = priority_cols + other_cols

            query_tokens = query.lower().split()
            query_terms = set(query_tokens)
            query_embedding = self.text_model.encode([query], show_progress_bar=False)[0]

            scores = []
            matched_any = []

            for _, row in df.iterrows():
                text_parts = []
                row_matched = False
                exact_match = False

                for col in ordered_cols:
                    val = row[col]
                    if val is None:
                        continue
                    val_text = str(val)
                    tokens = val_text.lower().split()
                    # Phrase-aware check: the old test compared the whole
                    # query with single tokens, so multi-word queries could
                    # never count as an exact match.
                    if self._contains_phrase(tokens, query_tokens):
                        exact_match = True
                    if not query_terms.isdisjoint(tokens):
                        row_matched = True
                    text_parts.append(val_text)

                text = ' '.join(text_parts)
                if text.strip():
                    text_tokens = set(text.lower().split())
                    matching_terms = query_terms.intersection(text_tokens)
                    # query_terms is non-empty: blank queries returned above.
                    keyword_score = len(matching_terms) / len(query_terms)

                    text_embedding = self.text_model.encode([text], show_progress_bar=False)[0]
                    semantic_score = float(cosine_similarity([query_embedding], [text_embedding])[0][0])

                    combined_score = 0.7 * keyword_score + 0.3 * semantic_score

                    if exact_match:
                        combined_score *= EXACT_MATCH_BOOST
                    elif row_matched:
                        combined_score *= 1.2
                else:
                    combined_score = 0.0
                    row_matched = False

                scores.append(combined_score)
                matched_any.append(row_matched)

            results_df = df.copy()
            results_df['score'] = scores
            results_df['matched'] = matched_any

            filtered_df = results_df[
                (results_df['matched']) |
                (results_df['score'] > MIN_SEARCH_SCORE)
            ]

            return filtered_df.sort_values('score', ascending=False)
        except Exception:
            # Best effort: fall back to the unranked page rather than failing,
            # but let KeyboardInterrupt/SystemExit through.
            return df
280
+
281
def play_text(text):
    """Speak *text* with the session's global voice and show the audio.

    The voice comes from st.session_state['global_voice'], defaulting to
    "en-US-AriaNeural". Nothing is rendered when no audio file is produced.
    """
    selected_voice = st.session_state.get('global_voice', "en-US-AriaNeural")
    audio_file = speak_with_edge_tts(text, voice=selected_voice)
    if not audio_file:
        return
    play_and_download_audio(audio_file)
286
+
287
def arxiv_search(query, max_results=3):
    """Query the arXiv Atom API and return (title, short_summary) tuples.

    Args:
        query: Free-text search string, passed as `search_query`.
        max_results: Maximum number of entries to request.

    Returns:
        A list of (title, summary) tuples, each summary cut to 300
        characters with "..." appended only when it was actually truncated.
        Returns [] on a non-200 response, a network error, or malformed XML.
    """
    # Simple arXiv search for demonstration purposes.
    base_url = "http://export.arxiv.org/api/query"
    params = {
        # Pass the raw query: requests URL-encodes it itself. Replacing
        # spaces with '+' first made the '+' arrive as a literal %2B.
        'search_query': query,
        'start': 0,
        'max_results': max_results,
    }
    try:
        response = requests.get(base_url, params=params, timeout=30)
    except requests.RequestException:
        return []
    if response.status_code != 200:
        return []

    try:
        root = ET.fromstring(response.text)
    except ET.ParseError:
        return []

    ns = {"a": "http://www.w3.org/2005/Atom"}

    def _entry_text(entry, tag):
        # Tolerate entries with a missing or empty element.
        node = entry.find(tag, ns)
        if node is None or node.text is None:
            return ""
        return node.text.strip()

    results = []
    for entry in root.findall('a:entry', ns):
        title = _entry_text(entry, 'a:title')
        summary = _entry_text(entry, 'a:summary')
        summary_short = summary[:300]
        if len(summary) > 300:
            summary_short += "..."
        results.append((title, summary_short))
    return results
310
+
311
def summarize_arxiv_results(results):
    """Format (title, summary) pairs as numbered "Result N:" text blocks.

    Blocks are numbered from 1 and joined with a blank-line separator; an
    empty input gives an empty string.
    """
    blocks = [
        f"Result {position}: {title}\n{summary}\n"
        for position, (title, summary) in enumerate(results, start=1)
    ]
    return "\n\n".join(blocks)
317
+
318
def _run_arxiv_search(query):
    """Save *query*, search arXiv, then show, save and read aloud the summary."""
    user_name = st.session_state['user_name']
    save_input_as_md(user_name, query, prefix="input")
    with st.spinner("Searching ArXiv..."):
        results = arxiv_search(query)
    if results:
        summary = summarize_arxiv_results(results)
        save_response_as_md(user_name, summary, prefix="response")
        st.write(summary)
        # Autoplay TTS
        play_text(summary)
    else:
        st.warning("No results found on ArXiv.")


def main():
    """Render the Streamlit app: sidebar, voice input and the four tabs."""
    st.title("πŸŽ™οΈ Voice Chat & Search")

    # Sidebar
    with st.sidebar:
        # Editable user name
        st.session_state['user_name'] = st.text_input("Current User:", value=st.session_state['user_name'])

        # Global voice selection
        st.session_state['global_voice'] = st.selectbox("Select Global Voice:", ENGLISH_VOICES, index=0)

        st.session_state['max_items'] = st.number_input("Max Items per search iteration:", min_value=1, max_value=1000, value=st.session_state['max_items'])

        st.subheader("πŸ“ Saved Inputs & Responses")
        for fpath in list_saved_inputs():
            user, _ts, _content = parse_md_file(fpath)
            fname = os.path.basename(fpath)
            st.write(f"- {fname} (User: {user})")

    # Create voice component for input
    voice_component = create_voice_component()
    voice_val = voice_component(my_input_value="Start speaking...")

    # Tabs: Voice Chat History, Arxiv Search, Dataset Search, Settings
    tab1, tab2, tab3, tab4 = st.tabs(["πŸ—£οΈ Voice Chat History", "πŸ“š ArXiv Search", "πŸ“Š Dataset Search", "βš™οΈ Settings"])

    # ------------------ Voice Chat History -------------------------
    with tab1:
        st.subheader("Voice Chat History")
        # List saved inputs and responses (newest file name first) with playback
        for fpath in reversed(list_saved_inputs()):
            user, ts, content = parse_md_file(fpath)
            with st.expander(f"{ts} - {user}", expanded=False):
                st.write(content)
                if st.button("πŸ”Š Read Aloud", key=f"read_{fpath}"):
                    play_text(content)

    # ------------------ ArXiv Search -------------------------
    with tab2:
        st.subheader("ArXiv Search")
        edited_input = st.text_area("Enter or Edit Search Query:", value=(voice_val.strip() if voice_val else ""), height=100)
        autorun = st.checkbox("⚑ Auto-Run", value=True)
        run_arxiv = st.button("πŸ” ArXiv Search")

        input_changed = (edited_input != st.session_state.get('old_val'))
        auto_triggered = autorun and input_changed
        # One combined trigger: editing the text and pressing the button in
        # the same rerun used to run (and save) the search twice.
        if edited_input.strip() and (run_arxiv or auto_triggered):
            # Record the query for manual runs too, so auto-run does not
            # repeat it on the next rerun.
            st.session_state['old_val'] = edited_input
            _run_arxiv_search(edited_input)

    # ------------------ Dataset Search -------------------------
    with tab3:
        st.subheader("Dataset Search")
        search = FastDatasetSearcher()
        query = st.text_input("Enter dataset search query:")
        run_ds_search = st.button("Search Dataset")
        num_results = st.slider("Max results:", 1, 100, 20)

        if run_ds_search and query.strip():
            with st.spinner("Searching dataset..."):
                df = search.load_page()
                results = search.quick_search(query, df)
            if len(results) > 0:
                st.write(f"Found {len(results)} results:")
                for i, (_, result) in enumerate(results.iterrows(), 1):
                    if i > num_results:
                        break
                    with st.expander(f"Result {i}", expanded=(i == 1)):
                        # Show every field except the internal ranking columns
                        for k, v in result.items():
                            if k not in ['score', 'matched']:
                                st.write(f"**{k}:** {v}")
            else:
                st.warning("No matching results found.")

    # ------------------ Settings Tab -------------------------
    with tab4:
        st.subheader("Settings")
        st.write("Adjust voice and search parameters in the sidebar.")
        if st.button("πŸ—‘οΈ Clear Search History"):
            # Only the in-session history is cleared; saved files are kept.
            st.session_state['search_history'] = []
            st.success("Search history cleared!")


if __name__ == "__main__":
    main()