Upload 5 files
- agents.py +122 -0
- feasibility_agent.py +146 -0
- main.py +155 -0
- usecase_agent.py +597 -0
- vectorstore.py +283 -0
agents.py
ADDED
@@ -0,0 +1,122 @@
from setup import *  # provides `llm` and `tavily_search` (configured in setup.py)
import re
import requests
from typing import Annotated, Sequence, List, Optional
from typing_extensions import TypedDict

from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
from langgraph.graph.message import add_messages
from langgraph.graph import START, StateGraph, END
from langgraph.checkpoint.memory import MemorySaver


# Research agent state
class AgentState(TypedDict):
    messages: Annotated[Sequence[BaseMessage], add_messages]
    queries: List[str]
    link_list: Optional[List]
    industry: Optional[str]
    company: Optional[str]


# Node
def assistant(state: AgentState):
    assistant_sys_msg = SystemMessage(content='''You are a highly intelligent and helpful assistant. Your primary task is to analyze user queries and determine whether the query:

Refers to an industry (general context)
Refers to a specific company (e.g., mentions a company's name explicitly).

For every query:
Check for company names, brands, or proper nouns that indicate a specific entity.
While analyzing the company's industry, be as specific as possible.
Return the company and industry name in the query.
If you can't find an industry name, return an empty string.

Example 1:
Query: "GenAI in MRF Tyres"
Company: "MRF Tyres"
Industry: "Tires and rubber products"

Example 2:
Query: "GenAI in the healthcare industry"
Company: ""
Industry: "Healthcare"
''')
    return {'messages': [llm.invoke([assistant_sys_msg] + state["messages"])]}


def company_and_industry_query(state: AgentState):
    print('--extract_company_and_industry--entered--')
    text = state['messages'][-1].content

    # Define patterns for extracting company and industry
    company_pattern = r'Company:\s*"([^"]+)"'
    industry_pattern = r'Industry:\s*"([^"]+)"'

    # Search for matches
    company_match = re.search(company_pattern, text)
    industry_match = re.search(industry_pattern, text)

    # Extract matched groups or return None if not found
    company_name = company_match.group(1) if company_match else None
    industry_name = industry_match.group(1) if industry_match else None

    queries = []
    if company_name:
        queries.extend([f'{company_name} Annual report latest AND {company_name} website AND no PDF results',
                        # f'{company_name} GenAI applications',
                        # f'{company_name} key offerings and strategic focus areas (e.g., operations, supply chain, customer experience)',
                        # f'{company_name} competitors and market share'
                        ])

    if industry_name:
        queries.extend([
            f'{industry_name} report latest mckinsey, deloitte, nexocode',
            # f'{industry_name} GenAI applications',
            # f'{industry_name} trends, challenges and opportunities'
        ])

    print('--extract_company_and_industry--finished--', queries)
    return {'queries': queries, 'company': company_name, 'industry': industry_name}


def web_scraping(state: AgentState):
    print('--web_scraping--entered--')
    queries = state['queries']
    link_list = []
    for query in queries:
        query_results = tavily_search.invoke({"query": query})
        link_list.extend(query_results)

    print('--web_scraping--finished--')
    return {'link_list': link_list}


# Agent graph
def research_agent(user_query: str):
    builder = StateGraph(AgentState)
    builder.add_node('assistant', assistant)
    builder.add_node('names_extract', company_and_industry_query)
    builder.add_node('web_scraping', web_scraping)

    builder.add_edge(START, "assistant")
    builder.add_edge("assistant", "names_extract")
    builder.add_edge("names_extract", 'web_scraping')
    builder.add_edge("web_scraping", END)

    # Memory
    memory = MemorySaver()
    react_graph = builder.compile(checkpointer=memory)

    config = {'configurable': {'thread_id': '1'}}
    messages = [HumanMessage(content=user_query)]
    agentstate_result = react_graph.invoke({'messages': messages}, config)

    return agentstate_result
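As a quick usage sketch (assuming `setup.py` exposes a configured `llm` and a Tavily search tool named `tavily_search`, neither of which is part of this upload), the research agent can be driven directly:

    from agents import research_agent

    result = research_agent("GenAI in the healthcare industry")
    print(result['company'], '|', result['industry'])   # entities parsed from the assistant's reply
    print(len(result['link_list']), 'search results collected')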
feasibility_agent.py
ADDED
@@ -0,0 +1,146 @@
from setup import *  # provides `llm` (configured in setup.py)
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from pydantic import BaseModel, ValidationError
from typing import List
from langchain_community.tools import TavilySearchResults


keyword_search = TavilySearchResults(
    max_results=2,
    search_depth="advanced",
    include_answer=True,
    include_raw_content=True,
    include_images=True,
)


# Define the UseCaseKeywords model to include use_case, description, and keyword
class UseCaseKeywords(BaseModel):
    use_case: str
    description: str
    keyword: str

    # Return the entry as a plain dictionary
    def to_dict(self) -> dict:
        return {
            'use_case': self.use_case,
            'description': self.description,
            'keyword': self.keyword
        }

# Define the KeywordGenerationResponse model to contain a list of UseCaseKeywords
class KeywordGenerationResponse(BaseModel):
    data: List[UseCaseKeywords]

    # Convert the list of UseCaseKeywords to a list of dictionaries
    def to_list_of_dicts(self) -> List[dict]:
        return [entry.to_dict() for entry in self.data]


def keyword_generation(report):

    query_generation_sys_prompt = SystemMessage(content='''You are an expert in creating precise and relevant keyword queries to search for datasets. Your task is to generate a keyword query for each use case provided below. These queries should be optimized for searching datasets on platforms such as GitHub, Kaggle, and Hugging Face.

Your JSON structure must strictly include:
[
  {
    "use_case": "string",
    "description": "string",
    "keyword": "string"
  }
]

**Instructions:**
1. Extract the key concepts from the use case (e.g., objectives, AI application, and domain).
2. Formulate a concise, descriptive query using relevant terms and synonyms.
3. Include terms related to data types (e.g., "customer data," "chat logs," "shopping behavior"), AI techniques (e.g., "machine learning," "recommendation systems"), and target domain (e.g., "e-commerce," "retail").
4. Create an output dictionary with the use case title as the key and the keyword query as the value.

**Use Cases: Examples**
## Use Case 1: Personalized Shopping Experiences with GenAI
**Objective/Use Case:** Create tailored shopping experiences for individual customers based on their browsing history, purchasing behavior, and preferences.
**AI Application:** Implement machine learning algorithms that analyze customer data to generate personalized offers, marketing communications, and product recommendations.
**Cross-Functional Benefit:**
- **Marketing:** Increases customer satisfaction and loyalty through targeted marketing efforts.
- **Sales:** Boosts sales by offering relevant products to customers.
- **Customer Service:** Enhances customer experience through personalized support.

## Use Case 2: AI-Powered Chatbots for Customer Service
**Objective/Use Case:** Improve in-store customer service by providing instant assistance and directing customers to relevant products.
**AI Application:** Develop GenAI-powered chatbots that analyze customer queries and provide accurate responses, suggesting related products and services.
**Cross-Functional Benefit:**
- **Customer Service:** Reduces wait times and improves customer satisfaction.
- **Sales:** Increases sales by suggesting relevant products to customers.
- **Operations:** Enhances employee productivity by automating routine tasks.

You must very strictly follow the format below for the output. Please do not deviate from the format. Always remember to follow the format strictly.
Example output:
[{'use_case': "Personalized Shopping Experiences with GenAI",
  'description': "AI-driven personalization enhances customer satisfaction through tailored offers, recommendations, and marketing based on individual preferences",
  'keyword': "e-commerce personalized shopping data customer behavior recommendation system offers dataset"},
 {'use_case': "AI-Powered Chatbots for Customer Service",
  'description': "AI chatbots provide instant, accurate assistance, improving customer service, increasing sales, and boosting operational efficiency",
  'keyword': "customer service chatbot dataset customer queries retail e-commerce AI automation"}]''')

    # Bind the structured-output schema to the LLM
    Keyword_generation_llm = llm.with_structured_output(KeywordGenerationResponse)

    # The use-case report is passed in as the human message
    report_msg = HumanMessage(content=f'The usecases are as follows {report}')

    try:
        output = Keyword_generation_llm.invoke([query_generation_sys_prompt, report_msg])
        # with_structured_output returns a parsed KeywordGenerationResponse,
        # not a raw dict, so validate the type before using it
        if isinstance(output, KeywordGenerationResponse):
            parsed_response = output
        elif isinstance(output, dict):
            parsed_response = KeywordGenerationResponse(**output)
        else:
            raise ValueError(f"Unexpected LLM output format: {type(output)}")
    except ValidationError as e:
        print(f"Validation error: {e}")
        raise

    # Convert the response to a list of dictionaries
    output_list = parsed_response.to_list_of_dicts()

    return output_list
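
# For illustration, a hedged sketch of how the two models above flatten
# (these values are invented, not produced by the LLM):
#
#   KeywordGenerationResponse(data=[
#       UseCaseKeywords(use_case="AI-Powered Chatbots for Customer Service",
#                       description="Instant, accurate assistance for shoppers",
#                       keyword="customer service chatbot dataset retail e-commerce")
#   ]).to_list_of_dicts()
#   # -> [{'use_case': '...', 'description': '...', 'keyword': '...'}]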


def dataset_search(output_list):
    for usecase_dict in output_list:
        query = usecase_dict['keyword']
        # f-string so the keyword is actually interpolated into the query
        query_format = f'kaggle OR github OR huggingface AND ({query})'
        links = keyword_search.invoke({'query': query_format})
        usecase_dict['links'] = links
    return output_list


def grouping_urls(output_list):
    for dict_item in output_list:
        urls_list = []
        for ele in dict_item['links']:
            urls_list.append(ele['url'])
        dict_item['urls_list'] = urls_list
    return output_list


def delete_columns(output_list):
    # Specify the keys you want to drop
    keys_to_del = ['links', 'keyword']

    for dict_item in output_list:
        for key in keys_to_del:
            dict_item.pop(key, None)
    return output_list


def feasibility_agent_func(report):
    dict_list = keyword_generation(report)
    dict_links = dataset_search(dict_list)
    urls_dict = grouping_urls(dict_links)
    pd_dict = delete_columns(urls_dict)

    return pd_dict
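A minimal end-to-end sketch of the feasibility pipeline (assumes `setup.py` supplies `llm` and a Tavily API key; `report_markdown` stands in for a generated use-case report):

    from feasibility_agent import feasibility_agent_func

    rows = feasibility_agent_func(report_markdown)
    # each row: {'use_case': ..., 'description': ..., 'urls_list': [...]}
    for row in rows:
        print(row['use_case'], '->', len(row['urls_list']), 'dataset links')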
main.py
ADDED
@@ -0,0 +1,155 @@
import gradio as gr
from setup import *
import pandas as pd
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Font
from agents import research_agent
from vectorstore import extract_urls, urls_classify_list, clean_and_extract_html_data
from usecase_agent import usecase_agent_func, vectorstore_writing
# from feasibility_agent import feasibility_agent_func


# # Function to create Excel file
# def create_excel(df):
#     # Create a new Excel workbook and select the active sheet
#     wb = Workbook()
#     ws = wb.active
#     ws.title = "Use Cases"

#     # Define and write headers to the Excel sheet
#     headers = ['Use Case', 'Description', 'URLs']
#     ws.append(headers)

#     # Write data rows
#     for _, row in df.iterrows():
#         try:
#             use_case = row['use_case']
#             description = row['description']
#             urls = row['urls_list']

#             ws.append([use_case, description, None])  # Add use case and description
#             if urls:
#                 for url_index, url in enumerate(urls):
#                     cell = ws.cell(row=ws.max_row, column=3)  # URLs go into the third column
#                     cell.value = url
#                     cell.hyperlink = url
#                     cell.font = Font(color="0000FF", underline="single")

#                     # Add a new row for additional URLs
#                     if url_index < len(urls) - 1:
#                         ws.append([None, None, None])
#         except KeyError as e:
#             print(f"Missing key in DataFrame row: {e}")
#         except Exception as e:
#             print(f"Unexpected error while processing row: {e}")

#     excel_file_path = "GenAI_use_cases_feasibility.xlsx"
#     wb.save(excel_file_path)
#     return excel_file_path


# # Function to handle the report and create the DataFrame
# def pd_creation(report):
#     # Assuming feasibility_agent_func returns a dictionary
#     pd_dict = feasibility_agent_func(report)

#     # Check for expected keys in pd_dict before proceeding
#     required_columns = ['use_case', 'description', 'urls_list']
#     if not all(col in pd_dict for col in required_columns):
#         raise ValueError(f"Missing one or more expected columns: {required_columns}")

#     # Create the DataFrame from the dictionary
#     df = pd.DataFrame(pd_dict)

#     # Convert the dataframe to the format expected by Gradio (list of lists)
#     data = df.values.tolist()  # This creates a list of lists from the dataframe

#     # Create the Excel file and return its path
#     excel_file_path = create_excel(df)  # Create the Excel file and get its path

#     return data, excel_file_path  # Return the formatted data and the Excel file path

# Main function that handles the user query and generates the report
def main(user_input):
    # Research agent
    agentstate_result = research_agent(user_input)

    # Vector store
    urls, content = extract_urls(agentstate_result)
    pdf_urls, html_urls = urls_classify_list(urls)
    html_docs = clean_and_extract_html_data(html_urls)

    # Write the cleaned chunks into the vector store
    vectorstore_writing(html_docs)

    # Use-case agent
    company_name = agentstate_result['company']
    industry_name = agentstate_result['industry']

    if company_name:
        topic = f'GenAI Usecases in {company_name} and {industry_name} industry. Explore {company_name} GenAI applications, key offerings, strategic focus areas, competitors, and market share.'
    else:
        topic = f'GenAI Usecases in {industry_name}. Explore {industry_name} GenAI applications, trends, challenges, and opportunities.'
    max_analysts = 3

    report = usecase_agent_func(topic, max_analysts)
    # pd_dict, excel_file_path = pd_creation(report)

    # Save the report as a markdown file
    report_file_path = "generated_report.md"
    with open(report_file_path, "w") as f:
        f.write(report)
    # pd_dict, excel_file_path
    return report, report_file_path

# Example queries
examples = [
    "How is the retail industry leveraging AI and ML?",
    "AI applications in automotive manufacturing"
]

# Creating the Gradio interface
with gr.Blocks(theme=gr.themes.Soft(font=gr.themes.GoogleFont('Open Sans'))) as demo:
    # Header section
    gr.HTML("<center><h1>UseCaseGenie - Discover GenAI use cases for your company and industry! 🤖🧑‍🍳</h1></center>")
    gr.Markdown("""#### This GenAI Assistant 🤖 helps you discover and explore Generative AI use cases for your company and industry.
    You can download the generated use case report as a <b>Markdown file</b> to gain insights and explore relevant GenAI applications.
    ### <b>Steps:</b>
    1. <b>Enter your query</b> regarding any company or industry.
    2. <b>Click on the 'Submit' button</b> and wait for the GenAI assistant to generate the report.
    3. <b>Download the generated report.</b>
    4. Explore the GenAI use cases and URLs for further analysis.
    """)

    # Input for the user query
    with gr.Row():
        user_input = gr.Textbox(label="Enter your Query", placeholder='Type here...')

    # Examples to help users with inputs
    with gr.Row():
        gr.Examples(examples=examples, inputs=user_input)

    # Buttons for submitting and clearing
    with gr.Row():
        submit_button = gr.Button("Submit")
        clear_btn = gr.ClearButton([user_input], value='Clear')

    # File download buttons
    with gr.Row():
        # Create a downloadable markdown file
        download_report_button = gr.File(label="Usecases Report")

    # # Create a downloadable Excel file
    # download_excel_button = gr.File(label="Feasibility Excel File")

    # Display report in Markdown format
    with gr.Row():
        report_output = gr.Markdown()

    submit_button.click(main, inputs=[user_input], outputs=[report_output, download_report_button])

# Run the interface
demo.launch()
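Because `main()` is plain Python, the pipeline can also be smoke-tested without the UI. One caveat: `demo.launch()` runs at import time, so this sketch assumes the launch call has first been moved under an `if __name__ == "__main__":` guard (an assumption, not part of the upload):

    from main import main

    report_text, report_path = main("AI applications in automotive manufacturing")
    print(report_path)         # generated_report.md
    print(report_text[:300])   # opening of the generated report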
usecase_agent.py
ADDED
@@ -0,0 +1,597 @@
from setup import *  # provides `llm` (configured in setup.py)
from typing import List, Optional
from typing_extensions import TypedDict
from pydantic import BaseModel, Field
from langgraph.graph import START, END, StateGraph
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, get_buffer_string
from langgraph.constants import Send
from operator import add
from langgraph.graph import MessagesState
from typing import Annotated
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings.jina import JinaEmbeddings
# from langchain_huggingface import HuggingFaceEmbeddings

class Analyst(BaseModel):
    affiliation: str = Field(
        description="Primary affiliation of the analyst.",
    )
    name: str = Field(
        description="Name of the analyst."
    )
    role: str = Field(
        description="Role of the analyst in the context of the topic.",
    )
    description: str = Field(
        description="Description of the analyst focus, concerns, and motives.",
    )

    @property
    def persona(self) -> str:
        return f"Name: {self.name}\nRole: {self.role}\nAffiliation: {self.affiliation}\nDescription: {self.description}\n"


class Perspectives(BaseModel):
    analysts: List[Analyst] = Field(
        description="Comprehensive list of analysts with their roles and affiliations.",
    )


class GenerateAnalystsState(TypedDict):
    topic: str                # Research topic
    max_analysts: int         # Number of analysts to generate
    analysts: List[Analyst]   # Analysts asking questions


class InterviewState(MessagesState):
    max_num_turns: int              # Number of turns of conversation
    context: Annotated[list, add]   # Source docs
    analyst: Analyst                # Analyst asking questions
    interview: str                  # Interview transcript
    sections: list                  # Final key we duplicate in outer state for the Send() API


class SearchQuery(BaseModel):
    search_query: str = Field(None, description="Search query for retrieval.")


def create_analysts(state: GenerateAnalystsState):
    """ Create analysts """

    topic = state['topic']
    max_analysts = state['max_analysts']

    structured_llm = llm.with_structured_output(Perspectives)

    analyst_instructions = """You are tasked with creating a set of AI analyst personas. Follow these instructions carefully:
1. First, review the research topic: {topic}
2. Create {max_analysts} analysts with the following roles:
    - Industry expert
    - GenAI expert
    - Business strategist
3. Determine the most interesting themes based upon documents and/or feedback above.
4. Pick the top {max_analysts} themes.
5. For each theme, create one analyst with ALL of the following required fields:
    - name: A fitting name for the analyst
    - role: Their specific role or title
    - affiliation: Their primary organization or institution
    - description: A detailed description of their focus areas, concerns, and motives
6. Ensure every analyst includes all four fields without exception.
Remember: Every analyst **MUST** have all four fields (name, role, affiliation, and description) properly defined. Incomplete personas are not acceptable."""

    # System message
    system_message = analyst_instructions.format(topic=topic, max_analysts=max_analysts)

    analysts = structured_llm.invoke([SystemMessage(content=system_message)] + [HumanMessage(content="Generate the set of analysts.")])

    # Write the list of analysts to state
    return {"analysts": analysts.analysts}


def vectorstore_writing(doc_splits):
    # The retriever is shared as a module-level global so the interview
    # nodes below can query it
    global retriever
    vectorstore = Chroma.from_documents(
        documents=doc_splits,
        collection_name="rag-chroma",
        embedding=JinaEmbeddings(model_name='jina-embeddings-v3'),
        persist_directory='./chroma_db'
    )
    retriever = vectorstore.as_retriever()


def generate_question(state: InterviewState):
    """ Generate questions for the interview """

    # Get the analyst persona
    analyst = state['analyst']
    messages = state['messages']
    context = state["context"]

    question_instructions = """You are an analyst tasked with interviewing an expert to learn about the use of Generative AI (GenAI) applications in a specific industry or company, if mentioned.

Your goal is to uncover interesting and specific insights related to the topic of Generative AI use cases.

Interesting: Insights that are surprising, non-obvious, or reveal unique applications of GenAI in the industry or company.
Specific: Insights that avoid generalities and include specific examples or case studies relevant to the company's offerings, strategic focus areas, or the industry's needs.
Focus Areas:
Explore the company's key offerings and strategic focus areas (e.g., operations, supply chain, customer experience, etc.), if the company is named.
Discuss industry-wide trends, innovations, and opportunities enabled by GenAI, such as improved operational efficiency, enhanced customer experiences, or streamlined supply chain processes.
Gather details on the company or industry's vision and products, focusing on how GenAI can be applied to enhance or transform their workflows.
Task:
Begin by introducing yourself with a name that fits your persona, then ask your question.

Continue asking follow-up questions to drill down into:

Specific GenAI use cases within the company's domain or the industry.
How these applications align with the company's or industry's strategic goals.
Real-world examples or future opportunities for integrating GenAI into their processes.
Complete the interview by saying:
"Thank you so much for your help!"

Remember to stay in character throughout the conversation, reflecting your persona and the provided goals."""

    # Generate the question
    question = llm.invoke([SystemMessage(content=question_instructions)] + [HumanMessage(content="Generate the question.")])

    return {"messages": [question]}


def search_vectorstore(state: InterviewState):
    """ Retrieve docs from the vector store """

    # Search query writing
    search_instructions = SystemMessage(content="""You will be given a conversation between an analyst and an expert.

Your goal is to generate a well-structured query for use in retrieval and/or web-search related to the conversation.

First, analyze the full conversation.

Pay particular attention to the final question posed by the analyst.

Convert this final question into a well-structured web search query.""")

    # Search query
    structured_llm = llm.with_structured_output(SearchQuery)
    search_query = structured_llm.invoke([search_instructions] + state['messages'])

    # Search
    search_docs = retriever.invoke(input=search_query.search_query)

    # Format
    formatted_search_docs = "\n\n---\n\n".join(
        [
            f'<Document source="{doc.metadata["source"]}" page="{doc.metadata.get("page", "")}"/>\n{doc.page_content}\n</Document>'
            for doc in search_docs
        ]
    )

    return {"context": [formatted_search_docs]}


def generate_answer(state: InterviewState):
    """ Node to answer a question """

    # Get state
    analyst = state["analyst"]
    messages = state["messages"]
    context = state["context"]

    answer_instructions = """You are an expert being interviewed by an analyst.

Here is the analyst's area of focus: {goals}.

Your goal is to answer a question posed by the interviewer.

To answer the question, use this context:

{context}

When answering questions, follow these guidelines:

1. Use only the information provided in the context.

2. Do not introduce external information or make assumptions beyond what is explicitly stated in the context.

3. The context contains sources at the top of each individual document.

4. Include these sources in your answer next to any relevant statements. For example, for source # 1 use [1].

5. List your sources in order at the bottom of your answer. [1] Source 1, [2] Source 2, etc.

6. If the source is: <Document source="assistant/docs/llama3_1.pdf" page="7"/> then just list:

[1] assistant/docs/llama3_1.pdf, page 7

And skip the addition of the brackets as well as the Document source preamble in your citation."""

    # Answer the question
    system_message = answer_instructions.format(goals=analyst.persona, context=context)
    answer = llm.invoke([SystemMessage(content=system_message)] + messages)

    # Name the message as coming from the expert
    answer.name = "expert"

    # Append it to state
    return {"messages": [answer]}


def save_interview(state: InterviewState):
    """ Save interviews """

    # Get messages
    messages = state["messages"]

    # Convert interview to a string
    interview = get_buffer_string(messages)

    # Save to interviews key
    return {"interview": interview}


def route_messages(state: InterviewState,
                   name: str = "expert"):
    """ Route between question and answer """

    # Get messages
    messages = state["messages"]
    max_num_turns = state.get('max_num_turns', 2)

    # Check the number of expert answers
    num_responses = len(
        [m for m in messages if isinstance(m, AIMessage) and m.name == name]
    )

    # End if the expert has answered up to the max number of turns
    if num_responses >= max_num_turns:
        return 'save_interview'

    # This router is run after each question-answer pair
    # Get the last question asked to check if it signals the end of discussion
    last_question = messages[-2]

    if "Thank you so much for your help" in last_question.content:
        return 'save_interview'
    return "ask_question"


def write_section(state: InterviewState):
    """ Node to write a report section """

    # Get state
    interview = state["interview"]
    context = state["context"]
    analyst = state["analyst"]

    section_writer_instructions = """You are an expert technical writer.

Your task is to create a short, easily digestible section of a report based on a set of source documents.

1. Analyze the content of the source documents:
- The name of each source document is at the start of the document, with the <Document tag.

2. Create a report structure using markdown formatting:
- Use ## for the section title
- Use ### for sub-section headers

3. Write the report following this structure:
a. Title (## header)
b. Summary (### header)
c. Sources (### header)

4. Make your title engaging based upon the focus area of the analyst:
{focus}

5. For the summary section:
- Set up the summary with general background / context related to the focus area of the analyst
- Emphasize what is novel, interesting, or surprising about insights gathered from the interview
- Create a numbered list of source documents, as you use them
- Do not mention the names of interviewers or experts
- Aim for approximately 400 words maximum
- Use numbered sources in your report (e.g., [1], [2]) based on information from source documents

6. In the Sources section:
- Include all sources used in your report
- Provide full links to relevant websites or specific document paths
- Separate each source by a newline. Use two spaces at the end of each line to create a newline in Markdown.
- It will look like:

### Sources
[1] Link or Document name
[2] Link or Document name

7. Be sure to combine sources. For example this is not correct:

[3] https://ai.meta.com/blog/meta-llama-3-1/
[4] https://ai.meta.com/blog/meta-llama-3-1/

There should be no redundant sources. It should simply be:

[3] https://ai.meta.com/blog/meta-llama-3-1/

8. Final review:
- Ensure the report follows the required structure
- Include no preamble before the title of the report
- Check that all guidelines have been followed"""

    # Write the section using either the gathered source docs from the interview (context) or the interview itself (interview)
    system_message = section_writer_instructions.format(focus=analyst.description)
    section = llm.invoke([SystemMessage(content=system_message)] + [HumanMessage(content=f"Use this source to write your section: {context}")])

    # Append it to state
    return {"sections": [section.content]}


# Add nodes and edges
interview_builder = StateGraph(InterviewState)
interview_builder.add_node("ask_question", generate_question)
interview_builder.add_node("search_rag", search_vectorstore)
interview_builder.add_node("answer_question", generate_answer)
interview_builder.add_node("save_interview", save_interview)
interview_builder.add_node("write_section", write_section)

# Flow
interview_builder.add_edge(START, "ask_question")
interview_builder.add_edge("ask_question", "search_rag")
interview_builder.add_edge("search_rag", "answer_question")
interview_builder.add_conditional_edges("answer_question", route_messages, ['ask_question', 'save_interview'])
interview_builder.add_edge("save_interview", "write_section")
interview_builder.add_edge("write_section", END)

# Interview
memory = MemorySaver()
interview_graph = interview_builder.compile(checkpointer=memory).with_config(run_name="Conduct Interviews")
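
# A hedged sketch of exercising the interview sub-graph on its own; the
# analyst fields are invented for illustration, and `retriever` must already
# be populated via vectorstore_writing() before this would run:
#
#   demo_analyst = Analyst(name="Ava Iyer", role="GenAI expert",
#                          affiliation="Industry research desk",
#                          description="GenAI adoption in retail operations.")
#   state = interview_graph.invoke(
#       {"analyst": demo_analyst,
#        "messages": [HumanMessage(content="So you said you were writing an article on GenAI in retail?")],
#        "max_num_turns": 2},
#       {"configurable": {"thread_id": "interview-demo"}})
#   print(state["sections"][0])  # the markdown section written from this interview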


class ResearchGraphState(TypedDict):
    topic: str                       # Research topic
    max_analysts: int                # Number of analysts
    analysts: List[Analyst]          # Analysts asking questions
    sections: Annotated[list, add]   # Send() API key
    introduction: str                # Introduction for the final report
    content: str                     # Content for the final report
    conclusion: str                  # Conclusion for the final report
    final_report: str                # Final report
    human_analyst_feedback: Optional[str]  # Human feedback


def initiate_all_interviews(state: ResearchGraphState):
    """ This is the "map" step where we run each interview sub-graph using the Send API """

    # Check for human feedback
    human_analyst_feedback = state.get('human_analyst_feedback')
    if human_analyst_feedback:
        # Return to create_analysts
        return "create_analysts"

    # Otherwise kick off interviews in parallel via the Send() API
    else:
        topic = state["topic"]
        return [Send("conduct_interview", {"analyst": analyst,
                                           "messages": [HumanMessage(
                                               content=f"So you said you were writing an article on {topic}?")],
                                           }) for analyst in state["analysts"]]

report_writer_instructions = '''You are a technical writer tasked with creating a report on the overall topic:

**{topic}**

Your team of analysts has conducted interviews and written memos based on their findings. Your task is to consolidate the insights from these memos into a cohesive and structured report, following this format:

Think deeply and generate at least 2 use cases based on the memos.

### Format for Each Use Case
1. **Title Header:** Use a descriptive title for each use case, such as "## Use Case 1: AI-Powered Predictive Maintenance."
2. **Objective/Use Case:** Summarize the primary goal or application of AI for this use case in one or two sentences.
3. **AI Application:** Describe the specific AI technologies or methods used to achieve the objective.
4. **Cross-Functional Benefit:** Outline the key benefits across various functions, formatted as bullet points, specifying which department or area benefits from the AI use case.

### Example Format:

## Use Case 1: AI-Powered Predictive Maintenance
**Objective/Use Case:** Reduce equipment downtime and maintenance costs by predicting equipment failures before they occur.
**AI Application:** Implement machine learning algorithms that analyze real-time sensor data from machinery to predict potential failures and schedule maintenance proactively.
**Cross-Functional Benefit:**
- **Operations & Maintenance:** Minimizes unplanned downtime and extends equipment lifespan.
- **Finance:** Reduces maintenance costs and improves budgeting accuracy.
- **Supply Chain:** Optimizes spare parts inventory based on predictive insights.

## Use Case 2: Real-Time Quality Control with Computer Vision
**Objective/Use Case:** Enhance product quality by detecting defects in products during manufacturing.
**AI Application:** Deploy AI-powered computer vision systems on production lines to identify surface defects and inconsistencies in real time.
**Cross-Functional Benefit:**
- **Quality Assurance:** Improves defect detection accuracy and reduces scrap rates.
- **Production:** Enables immediate corrective actions, enhancing overall efficiency.
- **Customer Satisfaction:** Delivers higher-quality products, strengthening client relationships.

### Report Guidelines
1. Begin with the first use case title in the specified format.
2. Do not include any preamble or introductory text for the report.
3. Consolidate insights into distinct use cases, with a focus on clarity and relevance.
4. Preserve any citations included in the memos, formatted in brackets, e.g., [1], [2].
5. After detailing all use cases, include a **Sources** section with the title: `## Sources`.
6. Be sure to combine sources. For example this is not correct:

[3] https://ai.meta.com/blog/meta-llama-3-1/
[4] https://ai.meta.com/blog/meta-llama-3-1/

There should be no redundant sources. It should simply be:
[3] https://ai.meta.com/blog/meta-llama-3-1/

### Your Inputs
You will be given a collection of memos from your analysts under `{context}`. Extract and distill insights into specific use cases, ensuring each use case adheres to the prescribed format.'''

def write_report(state: ResearchGraphState):
    # Full set of sections
    sections = state["sections"]
    topic = state["topic"]

    # Concat all sections together
    formatted_str_sections = "\n\n".join([f"{section}" for section in sections])

    # Summarize the sections into a final report
    system_message = report_writer_instructions.format(topic=topic, context=formatted_str_sections)
    report = llm.invoke([SystemMessage(content=system_message)] + [HumanMessage(content="Write a report based upon these memos.")])
    return {"content": report.content}


def human_feedback(state: ResearchGraphState):
    """ No-op node that should be interrupted on """
    pass


def write_introduction(state: ResearchGraphState):
    # Full set of sections
    sections = state["sections"]
    topic = state["topic"]

    # Concat all sections together
    formatted_str_sections = "\n\n".join([f"{section}" for section in sections])

    intro_conclusion_instructions = """You are a technical writer finishing a report on {topic}.

You will be given all of the sections of the report.

Your job is to write a crisp and compelling introduction or conclusion section.

The user will instruct you whether to write the introduction or conclusion.

Include no preamble for either section.

Target around 100 words, crisply previewing (for the introduction) or recapping (for the conclusion) all of the sections of the report.

Use markdown formatting.

For your introduction, create a compelling title and use the # header for the title.

For your introduction, use ## Introduction as the section header.

For your conclusion, use ## Conclusion as the section header.

Here are the sections to reflect on for writing: {formatted_str_sections}"""

    # Write the introduction from the combined sections
    instructions = intro_conclusion_instructions.format(topic=topic, formatted_str_sections=formatted_str_sections)
    intro = llm.invoke([instructions] + [HumanMessage(content="Write the report introduction")])
    return {"introduction": intro.content}


def write_conclusion(state: ResearchGraphState):
    # Full set of sections
    sections = state["sections"]
    topic = state["topic"]

    # Concat all sections together
    formatted_str_sections = "\n\n".join([f"{section}" for section in sections])

    intro_conclusion_instructions = """You are a technical writer finishing a report on {topic}.

You will be given all of the sections of the report.

Your job is to write a crisp and compelling introduction or conclusion section.

The user will instruct you whether to write the introduction or conclusion.

Include no preamble for either section.

Target around 100 words, crisply previewing (for the introduction) or recapping (for the conclusion) all of the sections of the report.

Use markdown formatting.

For your introduction, create a compelling title and use the # header for the title.

For your introduction, use ## Introduction as the section header.

For your conclusion, use ## Conclusion as the section header.

Here are the sections to reflect on for writing: {formatted_str_sections}"""

    # Write the conclusion from the combined sections
    instructions = intro_conclusion_instructions.format(topic=topic, formatted_str_sections=formatted_str_sections)
    conclusion = llm.invoke([instructions] + [HumanMessage(content="Write the report conclusion")])
    return {"conclusion": conclusion.content}


def finalize_report(state: ResearchGraphState):
    """ This is the "reduce" step where we gather all the sections, combine them, and reflect on them to write the intro/conclusion """
    # Save the full final report
    content = state["content"]
    if content.startswith("## Insights"):
        # str.strip() removes characters, not a prefix, so slice the heading off instead
        content = content[len("## Insights"):]
    if "## Sources" in content:
        try:
            content, sources = content.split("\n## Sources\n")
        except ValueError:
            sources = None
    else:
        sources = None

    final_report = state["introduction"] + "\n\n---\n\n" + content + "\n\n---\n\n" + state["conclusion"]
    if sources is not None:
        final_report += "\n\n## Sources\n" + sources
    return {"final_report": final_report}


def usecase_agent_func(topic, max_analysts):
    # Add nodes and edges
    builder = StateGraph(ResearchGraphState)
    builder.add_node("create_analysts", create_analysts)
    builder.add_node("human_feedback", human_feedback)
    builder.add_node("conduct_interview", interview_builder.compile())
    builder.add_node("write_report", write_report)
    builder.add_node("write_introduction", write_introduction)
    builder.add_node("write_conclusion", write_conclusion)
    builder.add_node("finalize_report", finalize_report)

    # Logic
    builder.add_edge(START, "create_analysts")
    builder.add_edge("create_analysts", "human_feedback")
    builder.add_conditional_edges("human_feedback", initiate_all_interviews, ["create_analysts", "conduct_interview"])
    builder.add_edge("conduct_interview", "write_report")
    builder.add_edge("conduct_interview", "write_introduction")
    builder.add_edge("conduct_interview", "write_conclusion")
    builder.add_edge(["write_conclusion", "write_report", "write_introduction"], "finalize_report")
    builder.add_edge("finalize_report", END)

    # Compile
    memory = MemorySaver()
    graph = builder.compile(checkpointer=memory)
    config = {"configurable": {"thread_id": "1"}}
    graph.invoke({"topic": topic,
                  "max_analysts": max_analysts,
                  'human_analyst_feedback': None},
                 config)
    final_state = graph.get_state(config)
    report = final_state.values.get('final_report')

    print('-----REPORT-----', report)

    return report
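Called directly, the outer research graph runs end to end (again assuming `setup.py` provides `llm` and the vector store has already been written):

    topic = ('GenAI Usecases in Healthcare. Explore Healthcare GenAI '
             'applications, trends, challenges, and opportunities.')
    report = usecase_agent_func(topic, max_analysts=3)
    with open('healthcare_usecases.md', 'w') as f:   # hypothetical output path
        f.write(report)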
vectorstore.py
ADDED
@@ -0,0 +1,283 @@
from setup import *
import tempfile
import requests

from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
# from langchain_text_splitters import RecursiveCharacterTextSplitter
from urllib.parse import urlparse
from langchain.docstore.document import Document


def extract_urls(agentstate_result):
    urls = []
    content = []
    for item in agentstate_result['link_list']:
        urls.append(item['url'])
        content.append(item['content'])

    return urls, content


# Function to classify a URL based on its file extension
def classify_url_by_extension(url):
    """
    Classifies a URL based on its file extension.
    Focuses only on pdf and html, classifying others as unknown.
    """
    if not isinstance(url, str):
        raise ValueError(f"Expected a string, but got {type(url)}")

    # Extract the file extension from the URL
    try:
        file_extension = urlparse(url).path.split('.')[-1].lower()
        if file_extension == 'pdf':
            return 'pdf'
        elif file_extension in ['html', 'htm']:
            return 'html'
        else:
            return 'unknown'
    except Exception as e:
        print(f"Error while parsing URL: {url} - {e}")
        return 'unknown'


# Function to classify based on the HTTP Content-Type header (optional, for extra accuracy)
def classify_url_by_header(url):
    """
    Classifies a URL based on the HTTP Content-Type header.
    Focuses only on pdf and html, classifying others as unknown.
    """
    try:
        response = requests.head(url, timeout=5)  # Use a HEAD request to fetch headers
        content_type = response.headers.get('Content-Type', '').lower()

        if 'pdf' in content_type:
            return 'pdf'
        elif 'html' in content_type:
            return 'html'
        else:
            return 'unknown'
    except requests.RequestException as e:
        print(f"Error while making HEAD request: {url} - {e}")
        return 'unknown'


# Function to classify a list of URLs
def urls_classify_list(urls: list):
    """
    Classifies a list of URLs into pdf and html.
    Returns two separate lists: one for pdf URLs and one for html URLs.
    URLs that remain unknown are treated as html so they are still attempted.
    """
    if not isinstance(urls, list):
        raise ValueError("Expected a list of URLs")

    pdf_urls = []
    html_urls = []

    # Classify each URL
    for url in urls:
        file_type = classify_url_by_extension(url)  # First, try classifying by extension
        if file_type == 'unknown':
            # If extension-based classification failed, fall back to HTTP header classification
            file_type = classify_url_by_header(url)

        if file_type == 'pdf':
            pdf_urls.append(url)
        elif file_type in ('html', 'unknown'):
            html_urls.append(url)

    return pdf_urls, html_urls


def clean_and_extract_html_data(html_urls, chunk_size=100, chunk_overlap=25):
    """
    Loads HTML content from URLs, cleans the data, and splits it into smaller chunks.

    Args:
        html_urls (list): List of HTML URLs to process.
        chunk_size (int): Maximum size of each chunk.
        chunk_overlap (int): Overlap between chunks.

    Returns:
        list: List of document chunks.
    """

    def clean_content(content):
        """
        Cleans the content by removing unwanted patterns and short lines.
        """
        cleaned_content = content.strip()  # Remove leading/trailing whitespace
        lines = cleaned_content.split('\n')  # Split by newlines
        meaningful_lines = [line.strip() for line in lines if len(line.strip()) > 3]  # Keep meaningful lines
        return '\n'.join(meaningful_lines)

    def split_document(doc_content, chunk_size, chunk_overlap):
        """
        Splits a document into smaller chunks with overlap.
        """
        chunks = []
        start = 0
        while start < len(doc_content):
            end = start + chunk_size
            chunk = doc_content[start:end]
            chunks.append(chunk)
            start = end - chunk_overlap if end < len(doc_content) else len(doc_content)
        return chunks

    # Step 1: Load documents from URLs
    docs = []
    for url in html_urls:
        try:
            loader = WebBaseLoader(url)
            data = loader.load()
            docs.extend(data)
        except Exception as e:
            print(f"Error loading URL {url}: {e}")

    # Step 2: Clean the content to remove unwanted data
    cleaned_docs = []
    for doc in docs:
        cleaned_content = clean_content(doc.page_content)
        if cleaned_content:  # Exclude empty documents
            doc.page_content = cleaned_content
            cleaned_docs.append(doc)

    # Step 3: Split the cleaned documents into chunks
    doc_splits = []
    for doc in cleaned_docs:
        chunks = split_document(doc.page_content, chunk_size, chunk_overlap)
        for chunk in chunks:
            doc_splits.append(Document(page_content=chunk, metadata=doc.metadata))

    return doc_splits
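
# A quick, hedged sketch chaining the helpers above (placeholder URLs):
#
#   urls = ["https://example.com/genai-report.pdf",
#           "https://example.com/genai-trends.html"]
#   pdf_urls, html_urls = urls_classify_list(urls)
#   doc_splits = clean_and_extract_html_data(html_urls, chunk_size=100, chunk_overlap=25)
#   print(len(pdf_urls), 'pdf |', len(html_urls), 'html |', len(doc_splits), 'chunks')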


# def extract_pdf_from_url(url):
#     """
#     Extract text from a PDF available at a URL.

#     Args:
#         url (str): The URL of the PDF file.

#     Returns:
#         str: Extracted text from the PDF.
#     """
#     # Step 1: Download the PDF from the URL
#     response = requests.get(url)
#     if response.status_code == 200:
#         pdf_content = response.content
#     else:
#         raise ValueError(f"Failed to fetch the PDF. HTTP Status Code: {response.status_code}")

#     # Step 2: Save PDF content to a temporary file
#     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
#         temp_pdf.write(pdf_content)
#         temp_pdf_path = temp_pdf.name  # Get the file path

#     # Step 3: Load the PDF using PyPDFLoader
#     loader = PyPDFLoader(temp_pdf_path)
#     documents = loader.load()

#     # Step 4: Extract text from all pages
#     extracted_text = "\n".join(doc.page_content for doc in documents)

#     return extracted_text


# def clean_and_split_pdf_text(pdf_text, chunk_size=100, chunk_overlap=25):
#     """
#     Cleans and splits the extracted PDF text into smaller chunks.

#     Args:
#         pdf_text (str): Extracted text from a PDF.
#         chunk_size (int): Maximum size of each chunk.
#         chunk_overlap (int): Overlap between chunks.

#     Returns:
#         list: List of document chunks.
#     """
#     def clean_content(content):
#         """
#         Cleans the text by removing unwanted patterns and short lines.
#         """
#         content = content.strip()  # Remove leading/trailing whitespace
#         lines = content.split('\n')  # Split into lines
#         meaningful_lines = [line.strip() for line in lines if len(line.strip()) > 3]  # Exclude short lines
#         return '\n'.join(meaningful_lines)

#     def split_text(content, chunk_size, chunk_overlap):
#         """
#         Splits cleaned text into smaller chunks with overlap.
#         """
#         chunks = []
#         start = 0
#         while start < len(content):
#             end = start + chunk_size
#             chunks.append(content[start:end])
#             start = end - chunk_overlap if end < len(content) else len(content)
#         return chunks

#     # Step 1: Clean the text
#     cleaned_text = clean_content(pdf_text)

#     # Step 2: Split the cleaned text
#     return split_text(cleaned_text, chunk_size, chunk_overlap)


# def pdf_extraction(pdf_urls, chunk_size=100, chunk_overlap=25):
#     """
#     Extracts and processes text from a list of PDF URLs.

#     Args:
#         pdf_urls (list): List of PDF URLs.
#         chunk_size (int): Maximum size of each chunk.
#         chunk_overlap (int): Overlap between chunks.

#     Returns:
#         list: List of Document objects containing split text.
#     """
#     all_chunks = []

#     for pdf_url in pdf_urls:
#         try:
#             # Extract text from the PDF
#             extracted_text = extract_pdf_from_url(pdf_url)

#             # Clean and split the text
#             chunks = clean_and_split_pdf_text(extracted_text, chunk_size, chunk_overlap)

#             # Convert chunks into Document objects
#             for chunk in chunks:
#                 all_chunks.append(Document(page_content=chunk, metadata={"source": pdf_url}))
#         except Exception as e:
#             print(f"Error processing PDF URL {pdf_url}: {e}")

#     return all_chunks