awacke1 committed on
Commit
88675e3
·
verified ·
1 Parent(s): 420b18d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +141 -201
app.py CHANGED
@@ -9,23 +9,20 @@ import os
9
  import glob
10
  import random
11
  from pathlib import Path
12
- from datetime import datetime, timedelta
13
  import edge_tts
14
  import asyncio
15
  import requests
16
- from collections import defaultdict
17
  import streamlit.components.v1 as components
18
- from urllib.parse import quote
19
- from xml.etree import ElementTree as ET
20
- from datasets import load_dataset
21
  import base64
22
  import re
 
 
23
 
24
  # -------------------- Configuration & Constants --------------------
 
25
  USER_NAMES = [
26
- "Alex", "Jordan", "Taylor", "Morgan", "Rowan", "Avery", "Riley", "Quinn",
27
- "Casey", "Jesse", "Reese", "Skyler", "Ellis", "Devon", "Aubrey", "Kendall",
28
- "Parker", "Dakota", "Sage", "Finley"
29
  ]
30
 
31
  ENGLISH_VOICES = [
@@ -34,6 +31,9 @@ ENGLISH_VOICES = [
34
  "en-CA-LiamNeural", "en-AU-NatashaNeural", "en-AU-WilliamNeural"
35
  ]
36
 
 
 
 
37
  ROWS_PER_PAGE = 100
38
  MIN_SEARCH_SCORE = 0.3
39
  EXACT_MATCH_BOOST = 2.0
@@ -47,7 +47,6 @@ SESSION_VARS = {
47
  'should_rerun': False,
48
  'search_columns': [],
49
  'initial_search_done': False,
50
- 'tts_voice': "en-US-AriaNeural",
51
  'arxiv_last_query': "",
52
  'dataset_loaded': False,
53
  'current_page': 0,
@@ -59,7 +58,8 @@ SESSION_VARS = {
59
  'voice_text': None,
60
  'user_name': random.choice(USER_NAMES),
61
  'max_items': 100,
62
- 'global_voice': "en-US-AriaNeural" # Default global voice
 
63
  }
64
 
65
  for var, default in SESSION_VARS.items():
@@ -85,19 +85,17 @@ def clean_for_speech(text: str) -> str:
85
  text = re.sub(r"\s+", " ", text).strip()
86
  return text
87
 
88
- async def edge_tts_generate_audio(text, voice="en-US-AriaNeural", rate=0, pitch=0):
89
  text = clean_for_speech(text)
90
  if not text.strip():
91
  return None
92
- rate_str = f"{rate:+d}%"
93
- pitch_str = f"{pitch:+d}Hz"
94
- communicate = edge_tts.Communicate(text, voice, rate=rate_str, pitch=pitch_str)
95
- out_fn = f"speech_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
96
  await communicate.save(out_fn)
97
  return out_fn
98
 
99
  def speak_with_edge_tts(text, voice="en-US-AriaNeural"):
100
- return asyncio.run(edge_tts_generate_audio(text, voice, 0, 0))
101
 
102
  def play_and_download_audio(file_path):
103
  if file_path and os.path.exists(file_path):
@@ -138,7 +136,6 @@ def list_saved_inputs():
138
  return files
139
 
140
  def parse_md_file(fpath):
141
- # Extract user and text from md
142
  user_line = ""
143
  ts_line = ""
144
  content_lines = []
@@ -154,139 +151,7 @@ def parse_md_file(fpath):
154
  content = "\n".join(content_lines).strip()
155
  return user_line, ts_line, content
156
 
157
- def fetch_dataset_info(dataset_id, token):
158
- info_url = f"https://huggingface.co/api/datasets/{dataset_id}"
159
- try:
160
- response = requests.get(info_url, timeout=30)
161
- if response.status_code == 200:
162
- return response.json()
163
- except Exception:
164
- pass
165
- return None
166
-
167
- @st.cache_data
168
- def get_dataset_info(dataset_id, token):
169
- try:
170
- dataset = load_dataset(dataset_id, token=token, streaming=True)
171
- return dataset['train'].info
172
- except:
173
- return None
174
-
175
- @st.cache_data
176
- def load_dataset_page(dataset_id, token, page, rows_per_page):
177
- try:
178
- start_idx = page * rows_per_page
179
- end_idx = start_idx + rows_per_page
180
- dataset = load_dataset(
181
- dataset_id,
182
- token=token,
183
- streaming=False,
184
- split=f'train[{start_idx}:{end_idx}]'
185
- )
186
- return pd.DataFrame(dataset)
187
- except:
188
- return pd.DataFrame()
189
-
190
- class FastDatasetSearcher:
191
- def __init__(self, dataset_id="tomg-group-umd/cinepile"):
192
- self.dataset_id = dataset_id
193
- self.text_model = get_model()
194
- self.token = os.environ.get('DATASET_KEY')
195
-
196
- def load_page(self, page=0):
197
- return load_dataset_page(self.dataset_id, self.token, page, ROWS_PER_PAGE)
198
-
199
- def quick_search(self, query, df):
200
- if df.empty or not query.strip():
201
- return df
202
-
203
- try:
204
- searchable_cols = []
205
- if len(df) > 0:
206
- for col in df.columns:
207
- sample_val = df[col].iloc[0]
208
- if not isinstance(sample_val, (np.ndarray, bytes)):
209
- searchable_cols.append(col)
210
-
211
- query_lower = query.lower()
212
- query_terms = set(query_lower.split())
213
- query_embedding = self.text_model.encode([query], show_progress_bar=False)[0]
214
-
215
- scores = []
216
- matched_any = []
217
-
218
- for _, row in df.iterrows():
219
- text_parts = []
220
- row_matched = False
221
- exact_match = False
222
- priority_fields = ['description', 'matched_text']
223
- other_fields = [col for col in searchable_cols if col not in priority_fields]
224
-
225
- for col in priority_fields:
226
- if col in row:
227
- val = row[col]
228
- if val is not None:
229
- val_str = str(val).lower()
230
- if query_lower in val_str.split():
231
- exact_match = True
232
- if any(term in val_str.split() for term in query_terms):
233
- row_matched = True
234
- text_parts.append(str(val))
235
-
236
- for col in other_fields:
237
- val = row[col]
238
- if val is not None:
239
- val_str = str(val).lower()
240
- if query_lower in val_str.split():
241
- exact_match = True
242
- if any(term in val_str.split() for term in query_terms):
243
- row_matched = True
244
- text_parts.append(str(val))
245
-
246
- text = ' '.join(text_parts)
247
- if text.strip():
248
- text_tokens = set(text.lower().split())
249
- matching_terms = query_terms.intersection(text_tokens)
250
- keyword_score = len(matching_terms) / len(query_terms) if len(query_terms) > 0 else 0.0
251
-
252
- text_embedding = self.text_model.encode([text], show_progress_bar=False)[0]
253
- semantic_score = float(cosine_similarity([query_embedding], [text_embedding])[0][0])
254
-
255
- combined_score = 0.7 * keyword_score + 0.3 * semantic_score
256
-
257
- if exact_match:
258
- combined_score *= EXACT_MATCH_BOOST
259
- elif row_matched:
260
- combined_score *= 1.2
261
- else:
262
- combined_score = 0.0
263
- row_matched = False
264
-
265
- scores.append(combined_score)
266
- matched_any.append(row_matched)
267
-
268
- results_df = df.copy()
269
- results_df['score'] = scores
270
- results_df['matched'] = matched_any
271
-
272
- filtered_df = results_df[
273
- (results_df['matched']) |
274
- (results_df['score'] > MIN_SEARCH_SCORE)
275
- ]
276
-
277
- return filtered_df.sort_values('score', ascending=False)
278
- except:
279
- return df
280
-
281
- def play_text(text):
282
- voice = st.session_state.get('global_voice', "en-US-AriaNeural")
283
- audio_file = speak_with_edge_tts(text, voice=voice)
284
- if audio_file:
285
- play_and_download_audio(audio_file)
286
-
287
  def arxiv_search(query, max_results=3):
288
- # Simple arXiv search using RSS (for demonstration)
289
- # In production, use official arXiv API or a library.
290
  base_url = "http://export.arxiv.org/api/query"
291
  params = {
292
  'search_query': query.replace(' ', '+'),
@@ -302,29 +167,76 @@ def arxiv_search(query, max_results=3):
302
  for entry in entries:
303
  title = entry.find('a:title', ns).text.strip()
304
  summary = entry.find('a:summary', ns).text.strip()
305
- # Just truncating summary for demo
306
  summary_short = summary[:300] + "..."
307
  results.append((title, summary_short))
308
  return results
309
  return []
310
 
311
  def summarize_arxiv_results(results):
312
- # Just combine titles and short summaries
313
  lines = []
314
  for i, (title, summary) in enumerate(results, 1):
315
  lines.append(f"Result {i}: {title}\n{summary}\n")
316
  return "\n\n".join(lines)
317
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
  def main():
319
  st.title("πŸŽ™οΈ Voice Chat & Search")
320
 
321
  # Sidebar
322
  with st.sidebar:
323
  # Editable user name
324
- st.session_state['user_name'] = st.text_input("Current User:", value=st.session_state['user_name'])
325
-
326
- # Global voice selection
327
- st.session_state['global_voice'] = st.selectbox("Select Global Voice:", ENGLISH_VOICES, index=0)
328
 
329
  st.session_state['max_items'] = st.number_input("Max Items per search iteration:", min_value=1, max_value=1000, value=st.session_state['max_items'])
330
 
@@ -339,82 +251,109 @@ def main():
339
  voice_component = create_voice_component()
340
  voice_val = voice_component(my_input_value="Start speaking...")
341
 
342
- # Tabs: Voice Chat History, Arxiv Search, Dataset Search, Settings
343
  tab1, tab2, tab3, tab4 = st.tabs(["πŸ—£οΈ Voice Chat History", "πŸ“š ArXiv Search", "πŸ“Š Dataset Search", "βš™οΈ Settings"])
344
 
345
  # ------------------ Voice Chat History -------------------------
346
  with tab1:
347
  st.subheader("Voice Chat History")
348
- # List saved inputs and responses and allow playing them
349
  files = list_saved_inputs()
350
- for fpath in reversed(files):
 
351
  user, ts, content = parse_md_file(fpath)
 
 
352
  with st.expander(f"{ts} - {user}", expanded=False):
353
  st.write(content)
354
- if st.button("πŸ”Š Read Aloud", key=f"read_{fpath}"):
355
- play_text(content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
356
 
357
  # ------------------ ArXiv Search -------------------------
358
  with tab2:
359
  st.subheader("ArXiv Search")
360
- # If we have a voice_val and autorun with ArXiv chosen:
361
  edited_input = st.text_area("Enter or Edit Search Query:", value=(voice_val.strip() if voice_val else ""), height=100)
362
  autorun = st.checkbox("⚑ Auto-Run", value=True)
363
  run_arxiv = st.button("πŸ” ArXiv Search")
364
 
365
  input_changed = (edited_input != st.session_state.get('old_val'))
 
 
 
 
 
366
  if autorun and input_changed and edited_input.strip():
367
- st.session_state['old_val'] = edited_input
368
- # Save user input
369
- save_input_as_md(st.session_state['user_name'], edited_input, prefix="input")
370
- with st.spinner("Searching ArXiv..."):
371
- results = arxiv_search(edited_input)
372
- if results:
373
- summary = summarize_arxiv_results(results)
374
- # Save response
375
- save_response_as_md(st.session_state['user_name'], summary, prefix="response")
376
- st.write(summary)
377
- # Autoplay TTS
378
- play_text(summary)
379
- else:
380
- st.warning("No results found on ArXiv.")
381
-
382
  if run_arxiv and edited_input.strip():
383
- # Manual trigger
384
- save_input_as_md(st.session_state['user_name'], edited_input, prefix="input")
385
- with st.spinner("Searching ArXiv..."):
386
- results = arxiv_search(edited_input)
387
- if results:
388
- summary = summarize_arxiv_results(results)
389
- save_response_as_md(st.session_state['user_name'], summary, prefix="response")
390
- st.write(summary)
391
- play_text(summary)
392
- else:
393
- st.warning("No results found on ArXiv.")
 
 
 
 
 
 
 
 
 
 
394
 
395
  # ------------------ Dataset Search -------------------------
396
  with tab3:
397
  st.subheader("Dataset Search")
398
- search = FastDatasetSearcher()
399
  query = st.text_input("Enter dataset search query:")
400
  run_ds_search = st.button("Search Dataset")
401
  num_results = st.slider("Max results:", 1, 100, 20)
402
 
403
  if run_ds_search and query.strip():
404
  with st.spinner("Searching dataset..."):
405
- df = search.load_page()
406
- results = search.quick_search(query, df)
407
- if len(results) > 0:
 
408
  st.write(f"Found {len(results)} results:")
409
  shown = 0
410
- for i, (_, result) in enumerate(results.iterrows(), 1):
411
  if shown >= num_results:
412
  break
413
  with st.expander(f"Result {i}", expanded=(i==1)):
414
- # Just print result keys/values here
415
- for k, v in result.items():
416
- if k not in ['score', 'matched']:
417
- st.write(f"**{k}:** {v}")
418
  shown += 1
419
  else:
420
  st.warning("No matching results found.")
@@ -422,13 +361,14 @@ def main():
422
  # ------------------ Settings Tab -------------------------
423
  with tab4:
424
  st.subheader("Settings")
425
- st.write("Adjust voice and search parameters in the sidebar.")
426
  if st.button("πŸ—‘οΈ Clear Search History"):
 
 
 
427
  st.session_state['search_history'] = []
428
- # Optionally delete files:
429
- # for fpath in list_saved_inputs():
430
- # os.remove(fpath)
431
- st.success("Search history cleared!")
432
 
433
  if __name__ == "__main__":
434
  main()
 
9
  import glob
10
  import random
11
  from pathlib import Path
12
+ from datetime import datetime
13
  import edge_tts
14
  import asyncio
15
  import requests
 
16
  import streamlit.components.v1 as components
 
 
 
17
  import base64
18
  import re
19
+ from xml.etree import ElementTree as ET
20
+ from datasets import load_dataset
21
 
22
  # -------------------- Configuration & Constants --------------------
23
+ # Exactly 11 user names and 11 voices
24
  USER_NAMES = [
25
+ "Aria", "Guy", "Sonia", "Tony", "Jenny", "Davis", "Libby", "Clara", "Liam", "Natasha", "William"
 
 
26
  ]
27
 
28
  ENGLISH_VOICES = [
 
31
  "en-CA-LiamNeural", "en-AU-NatashaNeural", "en-AU-WilliamNeural"
32
  ]
33
 
34
+ # Map each user to a corresponding voice
35
+ USER_VOICES = dict(zip(USER_NAMES, ENGLISH_VOICES))
36
+
37
  ROWS_PER_PAGE = 100
38
  MIN_SEARCH_SCORE = 0.3
39
  EXACT_MATCH_BOOST = 2.0
 
47
  'should_rerun': False,
48
  'search_columns': [],
49
  'initial_search_done': False,
 
50
  'arxiv_last_query': "",
51
  'dataset_loaded': False,
52
  'current_page': 0,
 
58
  'voice_text': None,
59
  'user_name': random.choice(USER_NAMES),
60
  'max_items': 100,
61
+ 'global_voice': "en-US-AriaNeural",
62
+ 'last_arxiv_input': None # To avoid double-running ArXiv search
63
  }
64
 
65
  for var, default in SESSION_VARS.items():
 
85
  text = re.sub(r"\s+", " ", text).strip()
86
  return text
87
 
88
async def edge_tts_generate_audio(text, voice="en-US-AriaNeural"):
    """Synthesize ``text`` to an MP3 file with edge-tts and return its name.

    The text is first passed through ``clean_for_speech``. If only
    whitespace remains, nothing is synthesized and ``None`` is returned.
    Otherwise the audio is saved under the relative path
    ``speech_<YYYYmmdd_HHMMSS_microseconds>.mp3`` and that name is returned.
    """
    cleaned = clean_for_speech(text)
    if cleaned.strip() == "":
        return None
    tts = edge_tts.Communicate(cleaned, voice)
    # Microseconds are part of the stamp, so back-to-back calls get distinct names.
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S_%f')
    mp3_name = f"speech_{stamp}.mp3"
    await tts.save(mp3_name)
    return mp3_name
96
 
97
def speak_with_edge_tts(text, voice="en-US-AriaNeural"):
    """Blocking wrapper around ``edge_tts_generate_audio``.

    Runs the coroutine to completion with ``asyncio.run`` and returns
    whatever it returns.
    """
    synthesis = edge_tts_generate_audio(text, voice)
    return asyncio.run(synthesis)
99
 
100
  def play_and_download_audio(file_path):
101
  if file_path and os.path.exists(file_path):
 
136
  return files
137
 
138
  def parse_md_file(fpath):
 
139
  user_line = ""
140
  ts_line = ""
141
  content_lines = []
 
151
  content = "\n".join(content_lines).strip()
152
  return user_line, ts_line, content
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  def arxiv_search(query, max_results=3):
 
 
155
  base_url = "http://export.arxiv.org/api/query"
156
  params = {
157
  'search_query': query.replace(' ', '+'),
 
167
  for entry in entries:
168
  title = entry.find('a:title', ns).text.strip()
169
  summary = entry.find('a:summary', ns).text.strip()
 
170
  summary_short = summary[:300] + "..."
171
  results.append((title, summary_short))
172
  return results
173
  return []
174
 
175
def summarize_arxiv_results(results):
    """Render ``(title, summary)`` pairs as one numbered text block.

    Each pair becomes ``"Result <n>: <title>\\n<summary>\\n"`` with ``n``
    counting from 1; the entries are joined with a blank-line separator.
    An empty ``results`` yields an empty string.
    """
    return "\n\n".join(
        f"Result {rank}: {title}\n{summary}\n"
        for rank, (title, summary) in enumerate(results, start=1)
    )
180
 
181
+ # Simple dataset search: text-based substring search
182
def simple_dataset_search(query, df):
    """Return the rows of ``df`` whose text contains any term of ``query``.

    The query is lower-cased and split on whitespace. For every row, all
    string and numeric cell values are joined into one lower-cased string,
    and the row matches when at least one query term is a substring of it.
    Cells of other types (lists, bytes, None, ...) are ignored.

    Args:
        query: Free-text search string.
        df: ``pandas.DataFrame`` to search.

    Returns:
        The matching rows of ``df`` (original index and dtypes preserved),
        or an empty ``pandas.DataFrame()`` when ``df`` is empty, the query
        is blank, or nothing matches.
    """
    # Function-scope import keeps this block self-contained.
    import numbers

    if df.empty or not query.strip():
        return pd.DataFrame()
    query_terms = query.lower().split()

    def row_text(row):
        parts = []
        for val in row:
            if isinstance(val, str):
                parts.append(val)
            # numbers.Number also covers NumPy scalars, which a plain
            # isinstance(val, (int, float)) check misses for integer columns.
            elif isinstance(val, numbers.Number):
                parts.append(str(val))
        # Lower-case everything so the comparison with the lower-cased
        # query terms is uniformly case-insensitive.
        return " ".join(parts).lower()

    mask = []
    for _, row in df.iterrows():
        text = row_text(row)
        mask.append(any(term in text for term in query_terms))

    if not any(mask):
        return pd.DataFrame()
    # Boolean selection keeps the original columns, index labels and dtypes.
    return df.loc[mask]
203
+
204
@st.cache_data
def load_dataset_page(dataset_id, token, page, rows_per_page):
    """Load one page of a dataset's ``train`` split as a DataFrame.

    Rows ``page * rows_per_page`` up to (but excluding)
    ``(page + 1) * rows_per_page`` are requested through the split slice
    syntax ``train[start:end]`` with streaming disabled.

    Args:
        dataset_id: Dataset identifier passed to ``load_dataset``.
        token: Access token forwarded to ``load_dataset`` (may be ``None``).
        page: Zero-based page number.
        rows_per_page: Number of rows per page.

    Returns:
        A ``pandas.DataFrame`` built from the loaded slice, or an empty
        ``pandas.DataFrame()`` if anything goes wrong while loading.
    """
    # NOTE(review): the function is wrapped in st.cache_data, so the empty
    # fallback is presumably cached like any other result -- confirm.
    try:
        start_idx = page * rows_per_page
        end_idx = start_idx + rows_per_page
        dataset = load_dataset(
            dataset_id,
            token=token,
            streaming=False,
            split=f'train[{start_idx}:{end_idx}]'
        )
        return pd.DataFrame(dataset)
    except Exception:
        # Best-effort load: fall back to an empty frame on any ordinary
        # error, but let KeyboardInterrupt / SystemExit propagate (a bare
        # ``except:`` would swallow those as well).
        return pd.DataFrame()
218
+
219
class SimpleDatasetSearcher:
    """Thin handle on one dataset that fetches it page by page.

    Stores the dataset id and the access token read from the
    ``DATASET_KEY`` environment variable (``None`` when it is unset).
    """

    def __init__(self, dataset_id="tomg-group-umd/cinepile"):
        self.token = os.environ.get('DATASET_KEY')
        self.dataset_id = dataset_id

    def load_page(self, page=0):
        """Return page ``page`` of the dataset via ``load_dataset_page``,
        using ``ROWS_PER_PAGE`` as the page size."""
        page_df = load_dataset_page(self.dataset_id, self.token, page, ROWS_PER_PAGE)
        return page_df
225
+
226
def concatenate_mp3(files, output_file):
    """Write the raw bytes of ``files``, in order, into ``output_file``.

    This is a naive byte-level concatenation: nothing is decoded or
    re-encoded. ``output_file`` is created or truncated even when
    ``files`` is empty.

    Args:
        files: Iterable of paths of the input files.
        output_file: Path of the combined file to write.
    """
    # Function-scope import keeps this block self-contained.
    import shutil

    with open(output_file, 'wb') as outfile:
        for path in files:
            with open(path, 'rb') as infile:
                # Stream in chunks instead of reading each whole file
                # into memory at once.
                shutil.copyfileobj(infile, outfile)
232
+
233
  def main():
234
  st.title("πŸŽ™οΈ Voice Chat & Search")
235
 
236
  # Sidebar
237
  with st.sidebar:
238
  # Editable user name
239
+ st.session_state['user_name'] = st.selectbox("Current User:", USER_NAMES, index=0)
 
 
 
240
 
241
  st.session_state['max_items'] = st.number_input("Max Items per search iteration:", min_value=1, max_value=1000, value=st.session_state['max_items'])
242
 
 
251
  voice_component = create_voice_component()
252
  voice_val = voice_component(my_input_value="Start speaking...")
253
 
254
+ # Tabs
255
  tab1, tab2, tab3, tab4 = st.tabs(["πŸ—£οΈ Voice Chat History", "πŸ“š ArXiv Search", "πŸ“Š Dataset Search", "βš™οΈ Settings"])
256
 
257
  # ------------------ Voice Chat History -------------------------
258
  with tab1:
259
  st.subheader("Voice Chat History")
 
260
  files = list_saved_inputs()
261
+ conversation = []
262
+ for fpath in files:
263
  user, ts, content = parse_md_file(fpath)
264
+ conversation.append((user, ts, content))
265
+ for user, ts, content in reversed(conversation):
266
  with st.expander(f"{ts} - {user}", expanded=False):
267
  st.write(content)
268
+ if st.button(f"πŸ”Š Read Aloud {ts}-{user}", key=f"read_{fpath}"):
269
+ voice = USER_VOICES.get(user, "en-US-AriaNeural")
270
+ audio_file = speak_with_edge_tts(content, voice=voice)
271
+ if audio_file:
272
+ play_and_download_audio(audio_file)
273
+
274
+ # Read entire conversation
275
+ if st.button("πŸ“œ Read Conversation"):
276
+ # Sort by timestamp to ensure chronological order
277
+ # Already in order because files is sorted, but let's rely on chronological order:
278
+ # They are sorted ascending, so conversation is appended ascending.
279
+ # It's safe to assume files list is chronological by filename.
280
+ mp3_files = []
281
+ for user, ts, content in conversation:
282
+ voice = USER_VOICES.get(user, "en-US-AriaNeural")
283
+ audio_file = speak_with_edge_tts(content, voice=voice)
284
+ if audio_file:
285
+ mp3_files.append(audio_file)
286
+ # Show each line's MP3
287
+ st.write(f"**{user} ({ts}):**")
288
+ play_and_download_audio(audio_file)
289
+
290
+ if mp3_files:
291
+ # Concatenate all mp3 files into one
292
+ combined_file = f"full_conversation_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp3"
293
+ concatenate_mp3(mp3_files, combined_file)
294
+ st.write("**Full Conversation Audio:**")
295
+ play_and_download_audio(combined_file)
296
 
297
  # ------------------ ArXiv Search -------------------------
298
  with tab2:
299
  st.subheader("ArXiv Search")
 
300
  edited_input = st.text_area("Enter or Edit Search Query:", value=(voice_val.strip() if voice_val else ""), height=100)
301
  autorun = st.checkbox("⚑ Auto-Run", value=True)
302
  run_arxiv = st.button("πŸ” ArXiv Search")
303
 
304
  input_changed = (edited_input != st.session_state.get('old_val'))
305
+ # Only run once:
306
+ # Conditions to run ArXiv search:
307
+ # - If autorun and input_changed and edited_input non-empty
308
+ # - Or if run_arxiv button is pressed and edited_input non-empty
309
+ should_run_arxiv = False
310
  if autorun and input_changed and edited_input.strip():
311
+ should_run_arxiv = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  if run_arxiv and edited_input.strip():
313
+ should_run_arxiv = True
314
+
315
+ if should_run_arxiv:
316
+ st.session_state['old_val'] = edited_input
317
+ # Avoid double-running by checking if last_arxiv_input is same
318
+ if st.session_state['last_arxiv_input'] != edited_input:
319
+ st.session_state['last_arxiv_input'] = edited_input
320
+ save_input_as_md(st.session_state['user_name'], edited_input, prefix="input")
321
+ with st.spinner("Searching ArXiv..."):
322
+ results = arxiv_search(edited_input)
323
+ if results:
324
+ summary = summarize_arxiv_results(results)
325
+ save_response_as_md(st.session_state['user_name'], summary, prefix="response")
326
+ st.write(summary)
327
+ # Play summary aloud
328
+ voice = USER_VOICES.get(st.session_state['user_name'], "en-US-AriaNeural")
329
+ audio_file = speak_with_edge_tts(summary, voice=voice)
330
+ if audio_file:
331
+ play_and_download_audio(audio_file)
332
+ else:
333
+ st.warning("No results found on ArXiv.")
334
 
335
  # ------------------ Dataset Search -------------------------
336
  with tab3:
337
  st.subheader("Dataset Search")
338
+ ds_searcher = SimpleDatasetSearcher()
339
  query = st.text_input("Enter dataset search query:")
340
  run_ds_search = st.button("Search Dataset")
341
  num_results = st.slider("Max results:", 1, 100, 20)
342
 
343
  if run_ds_search and query.strip():
344
  with st.spinner("Searching dataset..."):
345
+ # For simplicity, just load first page
346
+ df = ds_searcher.load_page(0)
347
+ results = simple_dataset_search(query, df)
348
+ if not results.empty:
349
  st.write(f"Found {len(results)} results:")
350
  shown = 0
351
+ for i, (_, row) in enumerate(results.iterrows(), 1):
352
  if shown >= num_results:
353
  break
354
  with st.expander(f"Result {i}", expanded=(i==1)):
355
+ for k, v in row.items():
356
+ st.write(f"**{k}:** {v}")
 
 
357
  shown += 1
358
  else:
359
  st.warning("No matching results found.")
 
361
  # ------------------ Settings Tab -------------------------
362
  with tab4:
363
  st.subheader("Settings")
364
+ # Clear search history: deletes all md files and clears session
365
  if st.button("πŸ—‘οΈ Clear Search History"):
366
+ # Delete all files
367
+ for fpath in list_saved_inputs():
368
+ os.remove(fpath)
369
  st.session_state['search_history'] = []
370
+ st.success("Search history cleared for everyone!")
371
+ st.experimental_rerun()
 
 
372
 
373
  if __name__ == "__main__":
374
  main()