Update app.py
Browse files
app.py
CHANGED
@@ -18,13 +18,47 @@ if 'current_page' not in st.session_state:
|
|
18 |
st.session_state['current_page'] = 0
|
19 |
if 'data_cache' not in st.session_state:
|
20 |
st.session_state['data_cache'] = None
|
|
|
|
|
21 |
|
22 |
ROWS_PER_PAGE = 100 # Number of rows to load at a time
|
23 |
|
24 |
@st.cache_resource
|
25 |
def get_model():
|
|
|
26 |
return SentenceTransformer('all-MiniLM-L6-v2')
|
27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
28 |
class FastDatasetSearcher:
|
29 |
def __init__(self, dataset_id="tomg-group-umd/cinepile"):
|
30 |
self.dataset_id = dataset_id
|
@@ -33,54 +67,35 @@ class FastDatasetSearcher:
|
|
33 |
if not self.token:
|
34 |
st.error("Please set the DATASET_KEY environment variable with your Hugging Face token.")
|
35 |
st.stop()
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
"""Load dataset metadata only"""
|
41 |
-
try:
|
42 |
-
dataset = load_dataset(
|
43 |
-
self.dataset_id,
|
44 |
-
token=self.token,
|
45 |
-
streaming=True
|
46 |
-
)
|
47 |
-
self.dataset_info = dataset['train'].info
|
48 |
-
return True
|
49 |
-
except Exception as e:
|
50 |
-
st.error(f"Error loading dataset: {str(e)}")
|
51 |
-
return False
|
52 |
|
53 |
def load_page(self, page=0):
|
54 |
-
"""Load a specific page of data"""
|
55 |
-
|
56 |
-
return st.session_state['data_cache']
|
57 |
-
|
58 |
-
try:
|
59 |
-
dataset = load_dataset(
|
60 |
-
self.dataset_id,
|
61 |
-
token=self.token,
|
62 |
-
streaming=False,
|
63 |
-
split=f'train[{page*ROWS_PER_PAGE}:{(page+1)*ROWS_PER_PAGE}]'
|
64 |
-
)
|
65 |
-
df = pd.DataFrame(dataset)
|
66 |
-
st.session_state['data_cache'] = df
|
67 |
-
st.session_state['current_page'] = page
|
68 |
-
return df
|
69 |
-
except Exception as e:
|
70 |
-
st.error(f"Error loading page {page}: {str(e)}")
|
71 |
-
return pd.DataFrame()
|
72 |
|
73 |
def quick_search(self, query, df):
|
74 |
"""Fast search on current page"""
|
|
|
|
|
|
|
75 |
scores = []
|
76 |
query_embedding = self.text_model.encode([query], show_progress_bar=False)[0]
|
77 |
|
78 |
for _, row in df.iterrows():
|
79 |
# Combine all searchable text fields
|
80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
# Quick keyword match
|
83 |
-
keyword_score = text.lower().count(query.lower()) / len(text.split())
|
84 |
|
85 |
# Semantic search on combined text
|
86 |
text_embedding = self.text_model.encode([text], show_progress_bar=False)[0]
|
@@ -91,8 +106,29 @@ class FastDatasetSearcher:
|
|
91 |
scores.append(combined_score)
|
92 |
|
93 |
# Get top results
|
94 |
-
|
95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
def main():
|
98 |
st.title("🎥 Fast Video Dataset Search")
|
@@ -100,19 +136,31 @@ def main():
|
|
100 |
# Initialize search class
|
101 |
searcher = FastDatasetSearcher()
|
102 |
|
103 |
-
#
|
104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
105 |
|
106 |
# Load current page
|
107 |
-
with st.spinner(f"Loading page {
|
108 |
-
df = searcher.load_page(
|
109 |
|
110 |
if df.empty:
|
111 |
st.warning("No data available for this page.")
|
112 |
return
|
113 |
|
114 |
# Search interface
|
115 |
-
|
|
|
|
|
|
|
|
|
|
|
116 |
|
117 |
if query:
|
118 |
with st.spinner("Searching..."):
|
@@ -120,33 +168,23 @@ def main():
|
|
120 |
|
121 |
# Display results
|
122 |
st.write(f"Found {len(results)} results on this page:")
|
123 |
-
for i, (_, result) in enumerate(results.iterrows(), 1):
|
124 |
-
|
125 |
-
|
126 |
-
# Display video if available
|
127 |
-
if 'youtube_id' in result:
|
128 |
-
st.video(
|
129 |
-
f"https://youtube.com/watch?v={result['youtube_id']}&t={result.get('start_time', 0)}"
|
130 |
-
)
|
131 |
-
|
132 |
-
# Display other fields
|
133 |
-
for key, value in result.items():
|
134 |
-
if isinstance(value, (str, int, float)):
|
135 |
-
st.write(f"**{key}:** {value}")
|
136 |
|
137 |
# Show raw data
|
138 |
-
st.
|
139 |
-
|
140 |
|
141 |
# Navigation buttons
|
142 |
cols = st.columns(2)
|
143 |
with cols[0]:
|
144 |
-
if st.button("Previous Page") and
|
145 |
-
st.session_state['current_page']
|
146 |
st.rerun()
|
147 |
with cols[1]:
|
148 |
-
if st.button("Next Page"):
|
149 |
-
st.session_state['current_page']
|
150 |
st.rerun()
|
151 |
|
152 |
if __name__ == "__main__":
|
|
|
18 |
st.session_state['current_page'] = 0
|
19 |
# One-time initialization of per-session state slots. Streamlit re-executes
# the script on every interaction, so each assignment is guarded by a
# membership test to avoid clobbering state across reruns.
for _slot in ('data_cache', 'dataset_info'):
    if _slot not in st.session_state:
        st.session_state[_slot] = None

ROWS_PER_PAGE = 100  # Number of rows to load at a time
|
25 |
|
26 |
@st.cache_resource
def get_model():
    """Return the shared sentence-embedding model.

    Wrapped in ``st.cache_resource`` so the SentenceTransformer is
    constructed once per process and reused across Streamlit reruns
    and sessions.
    """
    model_name = 'all-MiniLM-L6-v2'
    return SentenceTransformer(model_name)
|
30 |
|
31 |
+
@st.cache_data
def load_dataset_page(dataset_id, token, page, rows_per_page):
    """Load one page of the dataset as a DataFrame, memoized by ``st.cache_data``.

    Args:
        dataset_id: Hugging Face dataset identifier.
        token: Hugging Face access token used to authorize the download.
        page: zero-based page index.
        rows_per_page: number of rows in each page.

    Returns:
        A DataFrame holding the requested row slice, or an empty DataFrame
        on failure (the error is also surfaced in the UI via ``st.error``).
    """
    try:
        lo = page * rows_per_page
        hi = lo + rows_per_page
        # Non-streaming load of just the requested slice of the train split.
        rows = load_dataset(
            dataset_id,
            token=token,
            streaming=False,
            split=f'train[{lo}:{hi}]',
        )
        return pd.DataFrame(rows)
    except Exception as e:
        st.error(f"Error loading page {page}: {str(e)}")
        return pd.DataFrame()
|
47 |
+
|
48 |
+
@st.cache_data
def get_dataset_info(dataset_id, token):
    """Fetch dataset metadata, memoized by ``st.cache_data``.

    Streaming mode is used so only metadata is pulled, never the row data.

    Args:
        dataset_id: Hugging Face dataset identifier.
        token: Hugging Face access token.

    Returns:
        The ``train`` split's info object, or None on failure (the error is
        surfaced in the UI via ``st.error``).
    """
    try:
        streamed = load_dataset(
            dataset_id,
            token=token,
            streaming=True,
        )
        return streamed['train'].info
    except Exception as e:
        st.error(f"Error loading dataset info: {str(e)}")
        return None
|
61 |
+
|
62 |
class FastDatasetSearcher:
|
63 |
def __init__(self, dataset_id="tomg-group-umd/cinepile"):
|
64 |
self.dataset_id = dataset_id
|
|
|
67 |
if not self.token:
|
68 |
st.error("Please set the DATASET_KEY environment variable with your Hugging Face token.")
|
69 |
st.stop()
|
70 |
+
|
71 |
+
# Load dataset info if not already loaded
|
72 |
+
if st.session_state['dataset_info'] is None:
|
73 |
+
st.session_state['dataset_info'] = get_dataset_info(self.dataset_id, self.token)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
def load_page(self, page=0):
|
76 |
+
"""Load a specific page of data using cached function"""
|
77 |
+
return load_dataset_page(self.dataset_id, self.token, page, ROWS_PER_PAGE)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
78 |
|
79 |
def quick_search(self, query, df):
|
80 |
"""Fast search on current page"""
|
81 |
+
if df.empty:
|
82 |
+
return df
|
83 |
+
|
84 |
scores = []
|
85 |
query_embedding = self.text_model.encode([query], show_progress_bar=False)[0]
|
86 |
|
87 |
for _, row in df.iterrows():
|
88 |
# Combine all searchable text fields
|
89 |
+
text_values = []
|
90 |
+
for v in row.values():
|
91 |
+
if isinstance(v, (str, int, float)):
|
92 |
+
text_values.append(str(v))
|
93 |
+
elif isinstance(v, (list, dict)):
|
94 |
+
text_values.append(str(v))
|
95 |
+
text = ' '.join(text_values)
|
96 |
|
97 |
# Quick keyword match
|
98 |
+
keyword_score = text.lower().count(query.lower()) / (len(text.split()) + 1) # Add 1 to avoid division by zero
|
99 |
|
100 |
# Semantic search on combined text
|
101 |
text_embedding = self.text_model.encode([text], show_progress_bar=False)[0]
|
|
|
106 |
scores.append(combined_score)
|
107 |
|
108 |
# Get top results
|
109 |
+
results_df = df.copy()
|
110 |
+
results_df['score'] = scores
|
111 |
+
return results_df.sort_values('score', ascending=False)
|
112 |
+
|
113 |
+
def render_result(result):
    """Render one search result row in the Streamlit UI.

    Args:
        result: a row produced by the search — a pandas Series from
            ``DataFrame.iterrows`` (or any mapping). An optional ``'score'``
            entry is removed and shown as a relevance metric; remaining
            scalar fields are listed, and a video player is embedded when a
            ``'youtube_id'`` field is present.
    """
    # BUG FIX: pandas Series.pop() takes a single argument and has no
    # default parameter (unlike dict.pop), so result.pop('score', 0) raised
    # TypeError for every row coming from iterrows(). Test for the key
    # first and fall back to 0 when it is absent.
    score = result.pop('score') if 'score' in result else 0

    # Display video if available; seek to the clip's start time (0 when absent).
    if 'youtube_id' in result:
        st.video(
            f"https://youtube.com/watch?v={result['youtube_id']}&t={result.get('start_time', 0)}"
        )

    # Scalar fields on the left, relevance metric on the right.
    cols = st.columns([2, 1])
    with cols[0]:
        for key, value in result.items():
            if isinstance(value, (str, int, float)):
                st.write(f"**{key}:** {value}")

    with cols[1]:
        st.metric("Relevance Score", f"{score:.2%}")
|
132 |
|
133 |
def main():
|
134 |
st.title("🎥 Fast Video Dataset Search")
|
|
|
136 |
# Initialize search class
|
137 |
searcher = FastDatasetSearcher()
|
138 |
|
139 |
+
# Show dataset info
|
140 |
+
if st.session_state['dataset_info']:
|
141 |
+
st.sidebar.write("### Dataset Info")
|
142 |
+
st.sidebar.write(f"Total examples: {st.session_state['dataset_info'].splits['train'].num_examples:,}")
|
143 |
+
|
144 |
+
total_pages = st.session_state['dataset_info'].splits['train'].num_examples // ROWS_PER_PAGE
|
145 |
+
current_page = st.number_input("Page", min_value=0, max_value=total_pages, value=st.session_state['current_page'])
|
146 |
+
else:
|
147 |
+
current_page = st.number_input("Page", min_value=0, value=st.session_state['current_page'])
|
148 |
|
149 |
# Load current page
|
150 |
+
with st.spinner(f"Loading page {current_page}..."):
|
151 |
+
df = searcher.load_page(current_page)
|
152 |
|
153 |
if df.empty:
|
154 |
st.warning("No data available for this page.")
|
155 |
return
|
156 |
|
157 |
# Search interface
|
158 |
+
col1, col2 = st.columns([3, 1])
|
159 |
+
with col1:
|
160 |
+
query = st.text_input("Search in current page:",
|
161 |
+
help="Searches within currently loaded data")
|
162 |
+
with col2:
|
163 |
+
max_results = st.slider("Max results", 1, ROWS_PER_PAGE, 10)
|
164 |
|
165 |
if query:
|
166 |
with st.spinner("Searching..."):
|
|
|
168 |
|
169 |
# Display results
|
170 |
st.write(f"Found {len(results)} results on this page:")
|
171 |
+
for i, (_, result) in enumerate(results.head(max_results).iterrows(), 1):
|
172 |
+
with st.expander(f"Result {i}", expanded=i==1):
|
173 |
+
render_result(result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
174 |
|
175 |
# Show raw data
|
176 |
+
with st.expander("Show Raw Data"):
|
177 |
+
st.dataframe(df)
|
178 |
|
179 |
# Navigation buttons
|
180 |
cols = st.columns(2)
|
181 |
with cols[0]:
|
182 |
+
if st.button("⬅️ Previous Page") and current_page > 0:
|
183 |
+
st.session_state['current_page'] = current_page - 1
|
184 |
st.rerun()
|
185 |
with cols[1]:
|
186 |
+
if st.button("Next Page ➡️"):
|
187 |
+
st.session_state['current_page'] = current_page + 1
|
188 |
st.rerun()
|
189 |
|
190 |
if __name__ == "__main__":
|