science-smolagent-rag

Sleeping

App Files Files Community

Svngoku committed on Jan 3

Commit

6e1aa35

verified ·

1 Parent(s): ccd3034

Update app.py

Browse files

Files changed (1) hide show

app.py +212 -63

app.py CHANGED Viewed

@@ -1,65 +1,214 @@
 import streamlit as st
-from smolagents.agents import ToolCallingAgent
-from smolagents import tool, LiteLLMModel
-from typing import Optional
-import cv2
-import pytesseract
-from PIL import Image
-import io
-import numpy as np
-import base64
-# Define the LiteLLMModel with OpenAI key
-# NOTE(review): the literal OpenAI secret that was committed on this line is
-# redacted here. Deleting the line does not remove it from the repository
-# history, so the key must be revoked and rotated.
-model = LiteLLMModel(model_id="gpt-4o", api_key="<REDACTED>")
-@tool
-def extract_components(image_data_base64: str) -> str:
-    """
-    Extract components from a web design image.
-    Args:
-        image_data_base64: The image data in base64 string format.
-    Returns:
-        A string describing the components found in the image.
-    """
-    image_data = base64.b64decode(image_data_base64)
-    image = Image.open(io.BytesIO(image_data))
-    gray = cv2.cvtColor(np.array(image), cv2.COLOR_BGR2GRAY)
-    components = pytesseract.image_to_string(gray)
-    return components
-@tool
-def generate_code(components: str) -> str:
-    """
-    Generate code for the given components.
-    Args:
-        components: A string describing the components.
-    Returns:
-        The generated code for the components.
-    """
-    # This is a placeholder implementation. You can replace it with actual code generation logic.
-    return f"Generated code for components: {components}"
-# Define the ToolCallingAgent
-agent = ToolCallingAgent(tools=[extract_components, generate_code], model=model)
-# Streamlit app title
-st.title("Web Design Component Extractor")
-# File uploader for the web design image
-uploaded_file = st.file_uploader("Upload a web design image", type=["png", "jpg", "jpeg"])
-# Button to run the agent
-if st.button("Extract and Generate Code"):
-    if uploaded_file is not None:
-        image_data = uploaded_file.read()
-        image_data_base64 = base64.b64encode(image_data).decode('utf-8')
-        components = agent.run(f"extract_components {image_data_base64}")
-        code = agent.run(f"generate_code {components}")
-        st.write("Extracted Components:", components)
-        st.write("Generated Code:", code)
     else:
-        st.write("Please upload an image.")

 import streamlit as st
+from smolagents import Tool, CodeAgent, HfApiModel
+from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownTextSplitter
+from langchain_community.retrievers import BM25Retriever
+from langchain.docstore.document import Document
+from datasets import load_dataset, concatenate_datasets
+# Page-level Streamlit settings: browser-tab title and icon, wide layout.
+st.set_page_config(page_title="African History Search Engine", page_icon="🌍", layout="wide")
+class RetrieverTool(Tool):
+    """BM25 keyword retriever exposed to the agent as the ``retriever`` tool.
+    The index is built once over the documents given to the constructor; each
+    call to ``forward`` returns the best matches as one formatted text block.
+    """
+    name = "retriever"
+    description = "Uses BM25 search to retrieve relevant African historical documentation"
+    inputs = {
+        "query": {
+            "type": "string",
+            "description": "The historical query in affirmative form rather than a question"
+        }
+    }
+    output_type = "string"
+    def __init__(self, docs, k1=1.5, b=0.75, **kwargs):
+        """Index ``docs`` for BM25 search.
+        Args:
+            docs: Documents to index; each must expose ``page_content``.
+            k1: BM25 term-frequency saturation parameter.
+            b: BM25 document-length normalisation parameter.
+            **kwargs: Forwarded to ``Tool.__init__``.
+        """
+        super().__init__(**kwargs)
+        # k1/b are parameters of the BM25 scorer, not retriever fields, so they
+        # are handed over through ``bm25_params``; passed as plain keyword
+        # arguments they would not reach the scorer.
+        self.retriever = BM25Retriever.from_documents(
+            docs,
+            bm25_params={"k1": k1, "b": b},
+            k=12,
+        )
+        self.docs = docs
+        # Guard the mean so an empty collection cannot divide by zero.
+        word_counts = [len(doc.page_content.split()) for doc in docs]
+        self.avg_doc_length = sum(word_counts) / len(word_counts) if word_counts else 0.0
+    def forward(self, query: str) -> str:  # Matches exactly with inputs
+        """Search the index and return the hits as a single formatted string.
+        Args:
+            query: Free-text query; a leading question word is removed first.
+        Returns:
+            A header line followed by one section per retrieved document:
+            rank, length factor (word count relative to the mean word count
+            of the indexed documents), the text, its metadata when present,
+            and a ``---`` separator.
+        """
+        # Preprocess query
+        query = self._preprocess_query(query)
+        # Retrieve documents (``invoke`` supersedes the deprecated
+        # ``get_relevant_documents``)
+        docs = self.retriever.invoke(query)
+        # Format response; the pieces are collected and joined once
+        parts = ["Retrieved documents (ranked by relevance):\n\n"]
+        for i, doc in enumerate(docs, 1):
+            doc_length = len(doc.page_content.split())
+            # A zero mean (no words indexed) would otherwise divide by zero.
+            length_factor = doc_length / self.avg_doc_length if self.avg_doc_length else 0.0
+            parts.append(f"Document {i} (Length Factor: {length_factor:.2f})\n")
+            parts.append(f"{doc.page_content}\n\n")
+            if doc.metadata:
+                parts.append(f"Metadata: {doc.metadata}\n")
+            parts.append("---\n\n")
+        return "".join(parts)
+    def _preprocess_query(self, query: str) -> str:
+        """Lower-case ``query``, collapse whitespace, drop one leading question word.
+        An empty or whitespace-only query yields ``""`` instead of raising.
+        """
+        question_words = ("what", "when", "where", "who", "why", "how")
+        query_terms = query.lower().split()
+        # ``query_terms`` is empty for blank input, so test it before indexing.
+        if query_terms and query_terms[0] in question_words:
+            query_terms = query_terms[1:]
+        return " ".join(query_terms)
+# Process documents
+def prepare_docs(documents):
+    """Split ``documents`` with a MarkdownTextSplitter (chunk_size=1000, chunk_overlap=200)."""
+    splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=200)
+    chunks = splitter.split_documents(documents)
+    return chunks
+# Initialize agent
+def create_rag_agent(processed_docs):
+    """Return a verbose CodeAgent whose only tool is a RetrieverTool over ``processed_docs``."""
+    tools = [RetrieverTool(processed_docs)]
+    return CodeAgent(tools=tools, model=HfApiModel(), verbose=True)
+def format_search_results(results: str):
+    """Render agent output, separating main findings from sources when marked.
+    If the text contains the ``### 📚 Sources:`` heading, everything before the
+    first occurrence is shown in a wide left column and everything after it in
+    a narrower right column; otherwise the text is rendered as-is.
+    Args:
+        results: The agent's answer. Non-string values are converted with
+            ``str()`` before rendering.
+    """
+    marker = "### 📚 Sources:"
+    # The caller passes the agent's return value straight in, which may not be
+    # a str; normalise it so the marker search and rendering cannot fail.
+    text = results if isinstance(results, str) else str(results)
+    # partition() splits on the first marker only, so a repeated heading cannot
+    # break the two-way unpacking the way split() would.
+    main_content, found, sources = text.partition(marker)
+    if found:
+        # Create two columns with adjusted ratios
+        col1, col2 = st.columns([3, 2])
+        with col1:
+            st.markdown("### 📖 Main Findings")
+            st.markdown(main_content)
+        with col2:
+            st.markdown("### 📚 Sources")
+            # NOTE(review): raw HTML is enabled for model-generated text here;
+            # confirm this is intended before exposing the app to untrusted data.
+            st.markdown(sources, unsafe_allow_html=True)
     else:
+        st.markdown(text)
+@st.cache_resource
+def get_agent():
+    """Load the dataset, chunk it, and return a CodeAgent with a retriever tool.
+    Wrapped in ``st.cache_resource`` so the load/split/index work is not
+    repeated once an agent has been built.
+    """
+    # Load dataset and merge its train and test splits
+    dataset = load_dataset("Svngoku/African-History-Extra-11-30-24")
+    source_docs = concatenate_datasets([dataset["train"], dataset["test"]])
+    # Create documents: one Document per row, carrying its provenance fields
+    documents = []
+    for item in source_docs:
+        documents.append(
+            Document(
+                page_content=item['content'],
+                metadata={
+                    "source": item['url'],
+                    "title": item['title'],
+                    "description": item['description'],
+                    "published_time": item['publishedTime'],
+                },
+            )
+        )
+    # Process documents
+    splitter_options = {
+        "chunk_size": 1000,
+        "chunk_overlap": 500,
+        "add_start_index": True,
+        "strip_whitespace": True,
+    }
+    text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
+    processed_docs = text_splitter.split_documents(documents)
+    # Create and return agent
+    return CodeAgent(
+        tools=[RetrieverTool(processed_docs)],
+        model=HfApiModel("meta-llama/Llama-3.3-70B-Instruct"),
+    )
+# Streamlit UI
+st.title("🌍 African History Search Engine")
+st.markdown("""
+This search engine uses advanced AI to help you explore African history.
+It provides detailed, sourced information from a curated database of historical documents.
+""")
+# Initialize agent (kept in session state so later reruns reuse the same object)
+if 'agent' not in st.session_state:
+    with st.spinner("Loading historical database..."):
+        st.session_state.agent = get_agent()
+# Search interface
+search_query = st.text_input(
+    "🔍 Search African History",
+    placeholder="E.g., Tell me about the Kingdom of Kush",
+    help="Enter any question about African history"
+)
+# Whitespace-only input counts as "no query".
+search_query = search_query.strip()
+# Advanced search options
+with st.expander("Advanced Search Options"):
+    search_type = st.radio(
+        "Search Type",
+        ["General Query", "Specific Time Period", "Geographic Region"],
+        help="Select the type of search you want to perform"
+    )
+    # Only decorate a real query: prefixing an empty one would make it truthy,
+    # sending a content-free prompt to the agent instead of showing the warning.
+    if search_query:
+        if search_type == "Specific Time Period":
+            search_query = f"Focus on the time period: {search_query}"
+        elif search_type == "Geographic Region":
+            search_query = f"Focus on the region of: {search_query}"
+# Search button
+if st.button("Search", type="primary"):
+    if search_query:
+        with st.spinner("Searching historical records..."):
+            try:
+                results = st.session_state.agent.run(search_query)
+                # Use the formatter to display results
+                format_search_results(results)
+                # Add methodology note
+                st.markdown("---")
+                st.info("""
+                💡 **How to read the results:**
+                - Main findings are summarized on the left
+                - Source references are numbered [Source X]
+                - Click on source details on the right to expand
+                - Follow the links to read the original articles
+                """)
+            except Exception as e:
+                # Top-level UI boundary: report the failure instead of crashing the page.
+                st.error(f"An error occurred during the search: {e}")
+    else:
+        st.warning("Please enter a search query to begin.")
+# Sidebar with additional information
+with st.sidebar:
+    st.markdown("### About This Search Engine")
+    st.markdown("""
+    This search engine specializes in African history, providing:
+    - 📚 Detailed historical information
+    - 🔍 Source verification
+    - 🌍 Geographic context
+    - ⏳ Historical timeline context
+    """)
+    st.markdown("### Data Sources")
+    st.markdown("Our database includes information from various historical documents, "
+               "academic papers, and verified historical records.")
+# Footer
+st.markdown("---")
+st.caption("Powered by SmolAgents, RAG, and African History Dataset")