File size: 6,986 Bytes
97f4a42 6e1aa35 3f6753e 6e1aa35 3f6753e 6e1aa35 3f6753e 6e1aa35 97f4a42 6e1aa35 3f6753e 6e1aa35 3f6753e 6e1aa35 3f6753e 6e1aa35 3f6753e 6e1aa35 6690b0b 6e1aa35 3f6753e 6e1aa35 3f6753e 6e1aa35 3f6753e 6e1aa35 3f6753e 6e1aa35 3f6753e 6e1aa35 3f6753e 6e1aa35 3f6753e 6e1aa35 3f6753e 6e1aa35 3f6753e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 |
import streamlit as st
from smolagents import Tool, CodeAgent, HfApiModel
from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownTextSplitter
from langchain_community.retrievers import BM25Retriever
from langchain.docstore.document import Document
from datasets import load_dataset, concatenate_datasets
# Browser-tab title and icon, plus the full-width page layout.
st.set_page_config(layout="wide", page_icon="π", page_title="Science Search Engine")
class RetrieverTool(Tool):
    """Agent tool that returns the top BM25-ranked document chunks for a query.

    The class attributes ``name``, ``description``, ``inputs`` and
    ``output_type`` declare the tool; ``forward`` accepts exactly the
    ``query`` argument listed in ``inputs``.
    """

    name = "retriever"
    description = "Uses BM25 search to retrieve relevant scientific documentation"
    inputs = {
        "query": {
            "type": "string",
            "description": "The scientific query in affirmative form rather than a question"
        }
    }
    output_type = "string"

    def __init__(self, docs, k1=1.5, b=0.75, **kwargs):
        """Index ``docs`` with BM25 and record their average word count.

        Args:
            docs: Non-empty sized collection of documents exposing
                ``page_content`` (and ``metadata``).
            k1: BM25 term-frequency saturation parameter.
            b: BM25 document-length normalisation parameter.
            **kwargs: Forwarded unchanged to the base ``Tool`` constructor.

        Raises:
            ValueError: If ``docs`` is empty (an average length cannot be
                computed and there is nothing to index).
        """
        if not docs:
            raise ValueError("RetrieverTool requires at least one document")
        super().__init__(**kwargs)
        # The scorer's parameters have to travel through ``bm25_params``;
        # passed as loose keyword arguments they go to the retriever object
        # itself and never reach the BM25 scorer.
        self.retriever = BM25Retriever.from_documents(
            docs,
            k=12,
            bm25_params={"k1": k1, "b": b},
        )
        self.docs = docs
        self.avg_doc_length = sum(len(doc.page_content.split()) for doc in docs) / len(docs)

    def forward(self, query: str) -> str:  # Matches exactly with inputs
        """Retrieve documents for ``query`` and render them as one text report.

        Each hit is listed with its rank, its word count relative to the
        corpus average ("Length Factor"), its content and, when present, its
        metadata, followed by a ``---`` separator.
        """
        cleaned_query = self._preprocess_query(query)
        # ``invoke`` is the current retriever entry point; it replaces the
        # deprecated ``get_relevant_documents``.
        hits = self.retriever.invoke(cleaned_query)

        parts = ["Retrieved documents (ranked by relevance):\n\n"]
        for rank, doc in enumerate(hits, 1):
            doc_length = len(doc.page_content.split())
            # Guard against a corpus whose chunks contain no words at all.
            if self.avg_doc_length:
                length_factor = doc_length / self.avg_doc_length
            else:
                length_factor = 0.0
            parts.append(f"Document {rank} (Length Factor: {length_factor:.2f})\n")
            parts.append(f"{doc.page_content}\n\n")
            if doc.metadata:
                parts.append(f"Metadata: {doc.metadata}\n")
            parts.append("---\n\n")
        return "".join(parts)

    def _preprocess_query(self, query: str) -> str:
        """Lower-case ``query`` and drop one leading question word, if any.

        An empty or whitespace-only query yields an empty string instead of
        raising ``IndexError``.
        """
        question_words = ("what", "when", "where", "who", "why", "how")
        query_terms = query.lower().split()
        if query_terms and query_terms[0] in question_words:
            query_terms = query_terms[1:]
        return " ".join(query_terms)
# Process documents
def prepare_docs(documents):
    """Split ``documents`` into overlapping chunks with a Markdown splitter.

    A ``MarkdownTextSplitter`` is configured with a chunk size of 1000 and an
    overlap of 200; the result of its ``split_documents`` call is returned.
    """
    splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}
    markdown_splitter = MarkdownTextSplitter(**splitter_options)
    chunks = markdown_splitter.split_documents(documents)
    return chunks
# Initialize agent
def create_rag_agent(processed_docs):
    """Return a verbose ``CodeAgent`` whose single tool retrieves from ``processed_docs``."""
    agent_tools = [RetrieverTool(processed_docs)]
    language_model = HfApiModel()
    return CodeAgent(tools=agent_tools, model=language_model, verbose=True)
def format_search_results(results: str):
    """Render search results, splitting off a sources section when present.

    If ``results`` contains the sources heading, the text before the first
    heading is shown in a wide left column and the text after it in a
    narrower right column. Otherwise the whole text is rendered as-is.

    Args:
        results: The answer text. Non-string values are converted with
            ``str`` so they are displayed instead of raising.
    """
    sources_marker = "### π Sources:"

    if not isinstance(results, str):
        results = str(results)

    # ``partition`` splits on the first marker only, so an answer that
    # repeats the heading no longer breaks a two-value unpacking.
    main_content, marker, sources = results.partition(sources_marker)
    if not marker:
        st.markdown(results)
        return

    # Create two columns with adjusted ratios
    col1, col2 = st.columns([3, 2])
    with col1:
        st.markdown("### π Main Findings")
        st.markdown(main_content)
    with col2:
        st.markdown("### π Sources")
        # NOTE(review): this renders generated text as raw HTML; confirm the
        # sources section is trusted before keeping unsafe_allow_html=True.
        st.markdown(sources, unsafe_allow_html=True)
@st.cache_resource
def get_agent():
    """Load the corpus, chunk it, and return an agent with a retriever tool.

    Steps: load the ``train`` split of ``camel-ai/biology``, wrap every row
    in a ``Document``, split the documents into overlapping chunks, and hand
    the chunks to a ``RetrieverTool`` that becomes the agent's only tool.
    """
    # Load dataset
    train_split = load_dataset("camel-ai/biology")["train"]
    source_docs = concatenate_datasets([train_split])

    def to_document(row):
        # Only title and description are populated; the "source" (url) and
        # "published_time" metadata entries are disabled in this version.
        return Document(
            page_content=row['message_2'],
            metadata={
                "title": row['message_1'],
                "description": row['sub_topic'],
            },
        )

    # Create documents
    documents = list(map(to_document, source_docs))

    # Process documents
    splitter_options = {
        "chunk_size": 1000,
        "chunk_overlap": 500,
        "add_start_index": True,
        "strip_whitespace": True,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    processed_docs = chunker.split_documents(documents)

    # Create and return agent
    agent_tools = [RetrieverTool(processed_docs)]
    return CodeAgent(tools=agent_tools, model=HfApiModel())
# Streamlit UI
st.title("π Scientific Search Engine")
# Short introduction shown under the page title.
st.markdown(
    "\n"
    "This search engine uses advanced AI to help you explore science.\n"
    "It provides detailed, sourced information from a curated database of scientific knowledge.\n"
)

# Build the agent once per session and keep it in session state.
if "agent" not in st.session_state:
    with st.spinner("Loading database..."):
        st.session_state["agent"] = get_agent()
# Search interface
# The label now names the app's actual subject (science), in line with the
# page title and the help text.
search_query = st.text_input(
    "π Search Science",
    placeholder="E.g., Tell me about cancer in dogs",
    help="Enter any question about science",
)
# Treat whitespace-only input as "no query".
search_query = search_query.strip()

# Advanced search options
with st.expander("Advanced Search Options"):
    search_type = st.radio(
        "Search Type",
        ["General Query", "Scientific branches"],
        help="Select the type of search you want to perform",
    )

# Add the focus prefix only to a real query: prefixing an empty string would
# make it non-empty and bypass the "please enter a query" check downstream.
# (The former "Geographic Region" branch was unreachable, since the radio
# never offers that option, and has been removed.)
if search_query and search_type == "Scientific branches":
    search_query = f"Focus on the specific scientific branch of: {search_query}"
# Search button
if st.button("Search", type="primary"):
    if not search_query:
        st.warning("Please enter a search query to begin.")
    else:
        with st.spinner("Searching records..."):
            try:
                answer = st.session_state.agent.run(search_query)
                # Render the answer, split into findings and sources if possible.
                format_search_results(answer)
                # Reading guide shown below the results.
                st.markdown("---")
                reading_guide = (
                    "\n"
                    "π‘ **How to read the results:**\n"
                    "- Main findings are summarized on the left\n"
                    "- Source references are numbered [Source X]\n"
                    "- Click on source details on the right to expand\n"
                    "- Follow the links to read the original articles\n"
                )
                st.info(reading_guide)
            except Exception as exc:
                # Top-level UI boundary: report the failure on the page.
                st.error(f"An error occurred during the search: {exc}")
# Sidebar with additional information
with st.sidebar:
    st.markdown("### About This Search Engine")
    # The intro names the app's actual subject (science); it previously
    # described an African-history search engine.
    # NOTE(review): the "Geographic context" and "Historical timeline context"
    # bullets also look inherited from that history app; confirm they apply.
    st.markdown(
        "\n"
        "This search engine specializes in science, providing:\n"
        "- π Detailed information\n"
        "- π Source verification\n"
        "- π Geographic context\n"
        "- β³ Historical timeline context\n"
    )
    st.markdown("### Data Sources")
    # States the corpus this app actually loads, not historical records.
    st.markdown(
        "Our database is built from the `camel-ai/biology` dataset "
        "on the Hugging Face Hub."
    )

# Footer
st.markdown("---")
st.caption("Powered by SmolAgents, RAG, and Camel AI Dataset")