Svngoku committed on
Commit
6e1aa35
Β·
verified Β·
1 Parent(s): ccd3034

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +212 -63
app.py CHANGED
@@ -1,65 +1,214 @@
1
  import streamlit as st
2
- from smolagents.agents import ToolCallingAgent
3
- from smolagents import tool, LiteLLMModel
4
- from typing import Optional
5
- import cv2
6
- import pytesseract
7
- from PIL import Image
8
- import io
9
- import numpy as np
10
- import base64
11
-
12
- # Define the LiteLLMModel with OpenAI key
13
- model = LiteLLMModel(model_id="gpt-4o", api_key="[REDACTED — SECURITY: a live OpenAI API key was committed in this line; it is exposed in the repository history and must be revoked and rotated immediately]")
14
-
15
- @tool
16
- def extract_components(image_data_base64: str) -> str:
17
- """
18
- Extract components from a web design image.
19
-
20
- Args:
21
- image_data_base64: The image data in base64 string format.
22
-
23
- Returns:
24
- A string describing the components found in the image.
25
- """
26
- image_data = base64.b64decode(image_data_base64)
27
- image = Image.open(io.BytesIO(image_data))
28
- gray = cv2.cvtColor(np.array(image), cv2.COLOR_BGR2GRAY)
29
- components = pytesseract.image_to_string(gray)
30
- return components
31
-
32
- @tool
33
- def generate_code(components: str) -> str:
34
- """
35
- Generate code for the given components.
36
-
37
- Args:
38
- components: A string describing the components.
39
-
40
- Returns:
41
- The generated code for the components.
42
- """
43
- # This is a placeholder implementation. You can replace it with actual code generation logic.
44
- return f"Generated code for components: {components}"
45
-
46
- # Define the ToolCallingAgent
47
- agent = ToolCallingAgent(tools=[extract_components, generate_code], model=model)
48
-
49
- # Streamlit app title
50
- st.title("Web Design Component Extractor")
51
-
52
- # File uploader for the web design image
53
- uploaded_file = st.file_uploader("Upload a web design image", type=["png", "jpg", "jpeg"])
54
-
55
- # Button to run the agent
56
- if st.button("Extract and Generate Code"):
57
- if uploaded_file is not None:
58
- image_data = uploaded_file.read()
59
- image_data_base64 = base64.b64encode(image_data).decode('utf-8')
60
- components = agent.run(f"extract_components {image_data_base64}")
61
- code = agent.run(f"generate_code {components}")
62
- st.write("Extracted Components:", components)
63
- st.write("Generated Code:", code)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  else:
65
- st.write("Please upload an image.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ from smolagents import Tool, CodeAgent, HfApiModel
3
+ from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownTextSplitter
4
+ from langchain_community.retrievers import BM25Retriever
5
+ from langchain.docstore.document import Document
6
+ from datasets import load_dataset, concatenate_datasets
7
+
8
# Configure the page; Streamlit requires this to be the first st.* call
# executed in the script.
st.set_page_config(
    page_title="African History Search Engine",
    page_icon="🌍",
    layout="wide"
)
13
+
14
class RetrieverTool(Tool):
    """smolagents Tool that answers a query via BM25 keyword retrieval over a
    pre-chunked corpus of African-history documents and returns a formatted,
    relevance-ranked listing of the matches."""

    name = "retriever"
    description = "Uses BM25 search to retrieve relevant African historical documentation"
    inputs = {
        "query": {
            "type": "string",
            "description": "The historical query in affirmative form rather than a question"
        }
    }
    output_type = "string"

    def __init__(self, docs, k1=1.5, b=0.75, **kwargs):
        """Build the BM25 index over *docs*.

        Args:
            docs: langchain ``Document`` objects to index.
            k1: BM25 term-frequency saturation parameter.
            b: BM25 document-length normalization parameter.
            **kwargs: forwarded to ``Tool.__init__``.
        """
        super().__init__(**kwargs)
        # BUGFIX: k1/b must be forwarded through `bm25_params`. Passing them
        # as bare kwargs sends them into the BM25Retriever (pydantic)
        # constructor, which rejects unknown fields at runtime.
        self.retriever = BM25Retriever.from_documents(
            docs,
            k=12,  # number of documents returned per query
            bm25_params={"k1": k1, "b": b},
        )
        self.docs = docs
        # Average length (whitespace tokens) is only used for the cosmetic
        # "Length Factor" in the output. Guard the empty-corpus case, which
        # previously raised ZeroDivisionError.
        if docs:
            self.avg_doc_length = sum(
                len(doc.page_content.split()) for doc in docs
            ) / len(docs)
        else:
            self.avg_doc_length = 1.0

    def forward(self, query: str) -> str:  # signature matches `inputs` exactly
        """Retrieve documents for *query* and format them as one string."""
        # Preprocess query (drop a leading interrogative word).
        query = self._preprocess_query(query)

        # Retrieve documents.
        docs = self.retriever.get_relevant_documents(query)

        # Format response.
        main_response = "Retrieved documents (ranked by relevance):\n\n"

        for i, doc in enumerate(docs, 1):
            doc_length = len(doc.page_content.split())
            length_factor = doc_length / self.avg_doc_length

            main_response += f"Document {i} (Length Factor: {length_factor:.2f})\n"
            main_response += f"{doc.page_content}\n\n"

            if doc.metadata:
                main_response += f"Metadata: {doc.metadata}\n"
            main_response += "---\n\n"

        return main_response

    def _preprocess_query(self, query: str) -> str:
        """Strip a leading interrogative word so the query reads affirmatively."""
        question_words = {"what", "when", "where", "who", "why", "how"}
        query_terms = query.lower().split()
        # BUGFIX: an empty/whitespace-only query previously raised IndexError
        # on query_terms[0].
        if query_terms and query_terms[0] in question_words:
            query_terms = query_terms[1:]
        return " ".join(query_terms)
65
+
66
+ # Process documents
67
def prepare_docs(documents):
    """Split markdown documents into ~1000-character chunks with 200-character overlap."""
    splitter = MarkdownTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = splitter.split_documents(documents)
    return chunks
73
+
74
+ # Initialize agent
75
def create_rag_agent(processed_docs):
    """Wire a BM25 retriever tool over *processed_docs* into a CodeAgent."""
    tool = RetrieverTool(processed_docs)
    agent = CodeAgent(
        tools=[tool],
        model=HfApiModel(),
        verbose=True,
    )
    return agent
82
+
83
def format_search_results(results: str):
    """Render agent output, splitting main findings from sources when a
    sources section marker is present; otherwise render the text as-is."""
    marker = "### πŸ“š Sources:"
    if marker not in results:
        st.markdown(results)
        return

    main_content, sources = results.split(marker)

    # Two columns: findings get slightly more width than sources.
    findings_col, sources_col = st.columns([3, 2])

    with findings_col:
        st.markdown("### πŸ“– Main Findings")
        st.markdown(main_content)

    with sources_col:
        st.markdown("### πŸ“š Sources")
        st.markdown(sources, unsafe_allow_html=True)
100
+
101
# Cached so the dataset download, chunking, and index build run once per
# server process, not on every Streamlit rerun.
@st.cache_resource
def get_agent():
    """Single function to handle data loading, processing, and agent creation"""
    # Load dataset.
    # NOTE(review): assumes the HF dataset exposes "train" and "test" splits
    # and the columns used below (content/url/title/description/publishedTime)
    # — confirm against the dataset card.
    dataset = load_dataset("Svngoku/African-History-Extra-11-30-24")
    train_docs = dataset["train"]
    test_docs = dataset["test"]
    source_docs = concatenate_datasets([train_docs, test_docs])

    # Wrap each dataset row in a langchain Document, keeping citation
    # metadata so the retriever can surface sources.
    documents = [
        Document(
            page_content=item['content'],
            metadata={
                "source": item['url'],
                "title": item['title'],
                "description": item['description'],
                "published_time": item['publishedTime']
            }
        )
        for item in source_docs
    ]

    # Chunk documents. chunk_overlap=500 is half the chunk size — a heavy
    # overlap that trades index size for recall across chunk boundaries.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=500,
        add_start_index=True,
        strip_whitespace=True,
    )
    processed_docs = text_splitter.split_documents(documents)

    # Create and return the agent.
    # NOTE(review): this duplicates create_rag_agent() but pins an explicit
    # model id and omits verbose — intentional? Consider consolidating.
    retriever_tool = RetrieverTool(processed_docs)
    return CodeAgent(
        tools=[retriever_tool],
        model=HfApiModel("meta-llama/Llama-3.3-70B-Instruct"),
    )
139
+
140
# ---------------------------------------------------------------------------
# Streamlit UI (top-level script: runs on every rerun, in order)
# ---------------------------------------------------------------------------
st.title("🌍 African History Search Engine")
st.markdown("""
This search engine uses advanced AI to help you explore African history.
It provides detailed, sourced information from a curated database of historical documents.
""")

# Initialize the agent once per session; get_agent() is also cached at the
# process level via st.cache_resource, so only the first session pays the cost.
if 'agent' not in st.session_state:
    with st.spinner("Loading historical database..."):
        st.session_state.agent = get_agent()

# Search interface.
search_query = st.text_input(
    "πŸ” Search African History",
    placeholder="E.g., Tell me about the Kingdom of Kush",
    help="Enter any question about African history"
)

# Advanced search options: optionally prefix the query with a focus hint.
with st.expander("Advanced Search Options"):
    search_type = st.radio(
        "Search Type",
        ["General Query", "Specific Time Period", "Geographic Region"],
        help="Select the type of search you want to perform"
    )

    # NOTE(review): the prefix is applied even when search_query is still
    # empty, producing e.g. "Focus on the region of: " — confirm intended.
    if search_type == "Specific Time Period":
        search_query = f"Focus on the time period: {search_query}"
    elif search_type == "Geographic Region":
        search_query = f"Focus on the region of: {search_query}"

# Search button: run the agent and render results.
if st.button("Search", type="primary"):
    if search_query:
        with st.spinner("Searching historical records..."):
            try:
                results = st.session_state.agent.run(search_query)

                # Use the formatter to display results (two-column layout
                # when a sources section is present).
                format_search_results(results)

                # Add methodology note.
                st.markdown("---")
                st.info("""
                πŸ’‘ **How to read the results:**
                - Main findings are summarized on the left
                - Source references are numbered [Source X]
                - Click on source details on the right to expand
                - Follow the links to read the original articles
                """)

            # Broad catch is deliberate here: surface any agent/model failure
            # to the user instead of crashing the app.
            except Exception as e:
                st.error(f"An error occurred during the search: {e}")
    else:
        st.warning("Please enter a search query to begin.")

# Sidebar with additional information.
with st.sidebar:
    st.markdown("### About This Search Engine")
    st.markdown("""
    This search engine specializes in African history, providing:
    - πŸ“š Detailed historical information
    - πŸ” Source verification
    - 🌍 Geographic context
    - ⏳ Historical timeline context
    """)

    st.markdown("### Data Sources")
    st.markdown("Our database includes information from various historical documents, "
                "academic papers, and verified historical records.")

# Footer
st.markdown("---")
st.caption("Powered by SmolAgents, RAG, and African History Dataset")