Dharma20 committed on
Commit 9cac175 · verified · Parent(s): 1b669d3

Upload 5 files

Files changed (5):
  1. agents.py +122 -0
  2. feasibility_agent.py +146 -0
  3. main.py +155 -0
  4. usecase_agent.py +597 -0
  5. vectorstore.py +283 -0
agents.py ADDED
@@ -0,0 +1,122 @@
from setup import *
import re
from typing import Annotated, Sequence, List, Optional
from typing_extensions import TypedDict

from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
from langgraph.graph.message import add_messages
from langgraph.graph import START, StateGraph, END
from langgraph.checkpoint.memory import MemorySaver
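
Every module in this commit does "from setup import *", but setup.py itself is not included. A minimal sketch of what it presumably provides, inferred from how the names are used below; the model choice and parameters here are assumptions, not the author's actual configuration:

# setup.py -- hypothetical sketch; the real module is not part of this commit.
from langchain_openai import ChatOpenAI
from langchain_community.tools import TavilySearchResults

# `llm` must accept message lists via .invoke() and support .with_structured_output()
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# `tavily_search` is called as tavily_search.invoke({"query": ...}) in web_scraping below
tavily_search = TavilySearchResults(max_results=3)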


# Research agent state
class AgentState(TypedDict):
    messages: Annotated[Sequence[BaseMessage], add_messages]
    queries: List[str]
    link_list: Optional[List]
    industry: Optional[str]
    company: Optional[str]


# Node
def assistant(state: AgentState):
    assistant_sys_msg = SystemMessage(content='''You are a highly intelligent and helpful assistant. Your primary task is to analyze user queries and determine whether the query:

Refers to an industry (general context), or
Refers to a specific company (e.g., mentions a company's name explicitly).

For every query:
Check for company names, brands, or proper nouns that indicate a specific entity.
When analyzing the company's industry, be as specific as possible.
Return the company and industry name found in the query.
If you can't find an industry name, return an empty string.

Example 1:
Query: "GenAI in MRF Tyres"
Company: "MRF Tyres"
Industry: "Tires and rubber products"

Example 2:
Query: "GenAI in the healthcare industry"
Company: ""
Industry: "Healthcare"
''')
    return {'messages': [llm.invoke([assistant_sys_msg] + state["messages"])]}


def company_and_industry_query(state: AgentState):
    print('--extract_company_and_industry--entered--')
    text = state['messages'][-1].content

    # Define patterns for extracting company and industry
    company_pattern = r'Company:\s*"([^"]+)"'
    industry_pattern = r'Industry:\s*"([^"]+)"'

    # Search for matches
    company_match = re.search(company_pattern, text)
    industry_match = re.search(industry_pattern, text)

    # Extract matched groups or fall back to None if not found
    company_name = company_match.group(1) if company_match else None
    industry_name = industry_match.group(1) if industry_match else None

    queries = []
    if company_name:
        queries.extend([f'{company_name} Annual report latest AND {company_name} website AND no PDF results',
                        # f'{company_name} GenAI applications',
                        # f'{company_name} key offerings and strategic focus areas (e.g., operations, supply chain, customer experience)',
                        # f'{company_name} competitors and market share'
                        ])

    if industry_name:
        queries.extend([
            f'{industry_name} report latest mckinsey, deloitte, nexocode',
            # f'{industry_name} GenAI applications',
            # f'{industry_name} trends, challenges and opportunities'
        ])

    print('--extract_company_and_industry--finished--', queries)
    return {'queries': queries, 'company': company_name, 'industry': industry_name}


def web_scraping(state: AgentState):
    print('--web_scraping--entered--')
    queries = state['queries']
    link_list = []
    for query in queries:
        query_results = tavily_search.invoke({"query": query})
        link_list.extend(query_results)

    print('--web_scraping--finished--')
    return {'link_list': link_list}


# Agent Graph
def research_agent(user_query: str):
    builder = StateGraph(AgentState)
    builder.add_node('assistant', assistant)
    builder.add_node('names_extract', company_and_industry_query)
    builder.add_node('web_scraping', web_scraping)

    builder.add_edge(START, "assistant")
    builder.add_edge("assistant", "names_extract")
    builder.add_edge("names_extract", 'web_scraping')
    builder.add_edge("web_scraping", END)

    # Memory
    memory = MemorySaver()
    react_graph = builder.compile(checkpointer=memory)

    config = {'configurable': {'thread_id': '1'}}
    messages = [HumanMessage(content=user_query)]
    agentstate_result = react_graph.invoke({'messages': messages}, config)

    return agentstate_result
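
A quick smoke test of the research agent (a sketch; it assumes setup.py exposes working llm and tavily_search objects):

if __name__ == '__main__':
    result = research_agent("GenAI in MRF Tyres")
    print(result['company'], '|', result['industry'])
    print(len(result['link_list']), 'search results collected')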
feasibility_agent.py ADDED
@@ -0,0 +1,146 @@
from setup import *
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from pydantic import BaseModel, ValidationError
from typing import List
from langchain_community.tools import TavilySearchResults


keyword_search = TavilySearchResults(
    max_results=2,
    search_depth="advanced",
    include_answer=True,
    include_raw_content=True,
    include_images=True,
)


# Define the UseCaseKeywords model to include use_case, description, and keyword
class UseCaseKeywords(BaseModel):
    use_case: str
    description: str
    keyword: str

    # Return the model fields as a plain dictionary
    def to_dict(self) -> dict:
        return {
            'use_case': self.use_case,
            'description': self.description,
            'keyword': self.keyword
        }

# Define the KeywordGenerationResponse model to contain a list of UseCaseKeywords
class KeywordGenerationResponse(BaseModel):
    data: List[UseCaseKeywords]

    # Convert the list of UseCaseKeywords to a list of dictionaries
    def to_list_of_dicts(self) -> List[dict]:
        return [entry.to_dict() for entry in self.data]

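The two helper methods flatten the structured output into plain dicts for the downstream dataframe and Excel steps. For instance (values are illustrative):

resp = KeywordGenerationResponse(data=[
    UseCaseKeywords(use_case='Demand forecasting',
                    description='Predict demand from historical sales data',
                    keyword='retail demand forecasting dataset'),
])
resp.to_list_of_dicts()
# -> [{'use_case': 'Demand forecasting', 'description': '...', 'keyword': 'retail demand forecasting dataset'}]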

def keyword_generation(report):

    query_generation_sys_prompt = SystemMessage(content='''You are an expert in creating precise and relevant keyword queries to search for datasets. Your task is to generate a keyword query for each use case provided below. These queries should be optimized for searching datasets on platforms such as GitHub, Kaggle, and Hugging Face.

Your JSON structure must strictly include:
[
  {
    "use_case": "string",
    "description": "string",
    "keyword": "string"
  }
]

**Instructions:**
1. Extract the key concepts from the use case (e.g., objectives, AI application, and domain).
2. Formulate a concise, descriptive query using relevant terms and synonyms.
3. Include terms related to data types (e.g., "customer data," "chat logs," "shopping behavior"), AI techniques (e.g., "machine learning," "recommendation systems"), and target domain (e.g., "e-commerce," "retail").
4. Create an output dictionary with the use case title as the key and the keyword query as the value.

**Use Cases: Examples**
## Use Case 1: Personalized Shopping Experiences with GenAI
**Objective/Use Case:** Create tailored shopping experiences for individual customers based on their browsing history, purchasing behavior, and preferences.
**AI Application:** Implement machine learning algorithms that analyze customer data to generate personalized offers, marketing communications, and product recommendations.
**Cross-Functional Benefit:**
- **Marketing:** Increases customer satisfaction and loyalty through targeted marketing efforts.
- **Sales:** Boosts sales by offering relevant products to customers.
- **Customer Service:** Enhances customer experience through personalized support.

## Use Case 2: AI-Powered Chatbots for Customer Service
**Objective/Use Case:** Improve in-store customer service by providing instant assistance and directing customers to relevant products.
**AI Application:** Develop GenAI-powered chatbots that analyze customer queries and provide accurate responses, suggesting related products and services.
**Cross-Functional Benefit:**
- **Customer Service:** Reduces wait times and improves customer satisfaction.
- **Sales:** Increases sales by suggesting relevant products to customers.
- **Operations:** Enhances employee productivity by automating routine tasks.

You must strictly follow the format below for the output. Do not deviate from it.
Example output:
[{'use_case': "Personalized Shopping Experiences with GenAI",
'description': "AI-driven personalization enhances customer satisfaction through tailored offers, recommendations, and marketing based on individual preferences",
'keyword': "e-commerce personalized shopping data customer behavior recommendation system offers dataset"},
{'use_case': "AI-Powered Chatbots for Customer Service",
'description': "AI chatbots provide instant, accurate assistance, improving customer service, increasing sales, and boosting operational efficiency",
'keyword': "customer service chatbot dataset customer queries retail e-commerce AI automation"}]''')

    keyword_generation_llm = llm.with_structured_output(KeywordGenerationResponse)

    # The report containing the use cases is passed in as a human message
    report_msg = HumanMessage(content=f'The usecases are as follows {report}')

    try:
        # with_structured_output already returns a validated KeywordGenerationResponse
        # instance, not a dict, so no manual re-parsing is needed
        parsed_response = keyword_generation_llm.invoke([query_generation_sys_prompt, report_msg])
        if not isinstance(parsed_response, KeywordGenerationResponse):
            raise ValueError(f"Unexpected LLM output format: {type(parsed_response)}")
    except ValidationError as e:
        print(f"Validation error: {e}")
        raise

    # Convert the response to a list of dictionaries
    output_list = parsed_response.to_list_of_dicts()

    return output_list



def dataset_search(output_list):
    for usecase_dict in output_list:
        query = usecase_dict['keyword']
        # f-string, so the keyword is actually interpolated into the query
        query_format = f'kaggle OR github OR huggingface AND ({query})'
        links = keyword_search.invoke({'query': query_format})
        usecase_dict['links'] = links
    return output_list



def grouping_urls(output_list):
    for dict_item in output_list:
        urls_list = []
        for ele in dict_item['links']:
            urls_list.append(ele['url'])
        dict_item['urls_list'] = urls_list
    return output_list



def delete_columns(output_list):
    # Specify the keys to drop from each entry
    keys_to_del = ['links', 'keyword']

    for dict_item in output_list:
        for key in keys_to_del:
            dict_item.pop(key, None)
    return output_list


def feasibility_agent_func(report):
    dict_list = keyword_generation(report)
    dict_links = dataset_search(dict_list)
    urls_dict = grouping_urls(dict_links)
    pd_dict = delete_columns(urls_dict)

    return pd_dict
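
End to end, the module turns a use-case report into rows ready for a dataframe. A sketch of the expected shape (report is whatever usecase_agent_func produced):

rows = feasibility_agent_func(report)
# rows -> [{'use_case': '...', 'description': '...', 'urls_list': ['https://...', ...]}, ...]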
main.py ADDED
@@ -0,0 +1,155 @@
import gradio as gr
from setup import *
import pandas as pd
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Font
from agents import research_agent
from vectorstore import extract_urls, urls_classify_list, clean_and_extract_html_data
from usecase_agent import usecase_agent_func, vectorstore_writing
# from feasibility_agent import feasibility_agent_func
# (the pandas/openpyxl imports above are only needed by the commented-out Excel export below)


# # Function to create Excel file
# def create_excel(df):
#     # Create a new Excel workbook and select the active sheet
#     wb = Workbook()
#     ws = wb.active
#     ws.title = "Use Cases"

#     # Define and write headers to the Excel sheet
#     headers = ['Use Case', 'Description', 'URLs']
#     ws.append(headers)

#     # Write data rows
#     for _, row in df.iterrows():
#         try:
#             use_case = row['use_case']
#             description = row['description']
#             urls = row['urls_list']

#             ws.append([use_case, description, None])  # Add use case and description
#             if urls:
#                 for url_index, url in enumerate(urls):
#                     cell = ws.cell(row=ws.max_row, column=3)  # URLs go into the third column
#                     cell.value = url
#                     cell.hyperlink = url
#                     cell.font = Font(color="0000FF", underline="single")

#                     # Add a new row for additional URLs
#                     if url_index < len(urls) - 1:
#                         ws.append([None, None, None])
#         except KeyError as e:
#             print(f"Missing key in DataFrame row: {e}")
#         except Exception as e:
#             print(f"Unexpected error while processing row: {e}")

#     excel_file_path = "GenAI_use_cases_feasibility.xlsx"
#     wb.save(excel_file_path)
#     return excel_file_path


# # Function to handle the report and create the DataFrame
# def pd_creation(report):
#     # feasibility_agent_func returns a list of dicts
#     pd_dict = feasibility_agent_func(report)

#     # Check for expected keys in pd_dict before proceeding
#     required_columns = ['use_case', 'description', 'urls_list']
#     if not all(col in pd_dict for col in required_columns):
#         raise ValueError(f"Missing one or more expected columns: {required_columns}")

#     # Create the DataFrame from the dictionary
#     df = pd.DataFrame(pd_dict)

#     # Convert the dataframe to the format expected by Gradio (list of lists)
#     data = df.values.tolist()

#     # Create the Excel file and return its path
#     excel_file_path = create_excel(df)

#     return data, excel_file_path  # Return the formatted data and the Excel file path

# Main function that handles the user query and generates the report
def main(user_input):
    # Research agent
    agentstate_result = research_agent(user_input)

    # Vector store
    urls, content = extract_urls(agentstate_result)
    pdf_urls, html_urls = urls_classify_list(urls)
    html_docs = clean_and_extract_html_data(html_urls)

    # Write the scraped documents to the vector store
    vectorstore_writing(html_docs)

    # Use-case agent
    company_name = agentstate_result['company']
    industry_name = agentstate_result['industry']

    if company_name:
        topic = f'GenAI Usecases in {company_name} and {industry_name} industry. Explore {company_name} GenAI applications, key offerings, strategic focus areas, competitors, and market share.'
    else:
        topic = f'GenAI Usecases in {industry_name}. Explore {industry_name} GenAI applications, trends, challenges, and opportunities.'
    max_analysts = 3

    report = usecase_agent_func(topic, max_analysts)
    # pd_dict, excel_file_path = pd_creation(report)

    # Save the report as a markdown file
    report_file_path = "generated_report.md"
    with open(report_file_path, "w") as f:
        f.write(report)

    return report, report_file_path

# Example queries
examples = [
    "How is the retail industry leveraging AI and ML?",
    "AI applications in automotive manufacturing"
]

# Creating the Gradio interface
with gr.Blocks(theme=gr.themes.Soft(font=gr.themes.GoogleFont('Open Sans'))) as demo:
    # Header section
    gr.HTML("<center><h1>UseCaseGenie - Discover GenAI use cases for your company and industry! 🤖🧑‍🍳</h1></center>")
    gr.Markdown("""#### This GenAI assistant 🤖 helps you discover and explore Generative AI use cases for your company and industry.
    You can download the generated use case report as a <b>Markdown file</b> to gain insights and explore relevant GenAI applications.
    ### <b>Steps:</b>
    1. <b>Enter your query</b> regarding any company or industry.
    2. <b>Click on the 'Submit' button</b> and wait for the GenAI assistant to generate the report.
    3. <b>Download the generated report.</b>
    4. Explore the GenAI use cases and URLs for further analysis.
    """)


    # Input for the user query
    with gr.Row():
        user_input = gr.Textbox(label="Enter your Query", placeholder='Type here...')

    # Examples to help users with inputs
    with gr.Row():
        gr.Examples(examples=examples, inputs=user_input)

    # Buttons for submitting and clearing
    with gr.Row():
        submit_button = gr.Button("Submit")
        clear_btn = gr.ClearButton([user_input], value='Clear')

    # File download button
    with gr.Row():
        # Downloadable markdown report
        download_report_button = gr.File(label="Use Cases Report")

    # # Create a downloadable Excel file
    # download_excel_button = gr.File(label="Feasibility Excel File")

    # Display report in Markdown format
    with gr.Row():
        report_output = gr.Markdown()

    submit_button.click(main, inputs=[user_input], outputs=[report_output, download_report_button])

# Run the interface
demo.launch()
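
demo.launch() serves locally by default; the standard Gradio options cover the common deployment tweaks (a sketch, not part of the committed code):

# demo.launch(share=True)                                # temporary public URL
# demo.launch(server_name="0.0.0.0", server_port=7860)   # explicit bind, e.g. inside a container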
usecase_agent.py ADDED
@@ -0,0 +1,597 @@
from setup import *
from typing import Annotated, List, Optional
from typing_extensions import TypedDict
from pydantic import BaseModel, Field
from langgraph.graph import START, END, StateGraph, MessagesState
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, get_buffer_string
from langgraph.constants import Send
from operator import add
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings.jina import JinaEmbeddings
# from langchain_huggingface import HuggingFaceEmbeddings

class Analyst(BaseModel):
    affiliation: str = Field(
        description="Primary affiliation of the analyst.",
    )
    name: str = Field(
        description="Name of the analyst."
    )
    role: str = Field(
        description="Role of the analyst in the context of the topic.",
    )
    description: str = Field(
        description="Description of the analyst focus, concerns, and motives.",
    )

    @property
    def persona(self) -> str:
        return f"Name: {self.name}\nRole: {self.role}\nAffiliation: {self.affiliation}\nDescription: {self.description}\n"


class Perspectives(BaseModel):
    analysts: List[Analyst] = Field(
        description="Comprehensive list of analysts with their roles and affiliations.",
    )


class GenerateAnalystsState(TypedDict):
    topic: str  # Research topic
    max_analysts: int  # Number of analysts to generate
    analysts: List[Analyst]  # Analysts asking questions


class InterviewState(MessagesState):
    max_num_turns: int  # Number of turns in the conversation
    context: Annotated[list, add]  # Source docs
    analyst: Analyst  # Analyst asking questions
    interview: str  # Interview transcript
    sections: list  # Final key we duplicate in outer state for the Send() API


class SearchQuery(BaseModel):
    search_query: str = Field(None, description="Search query for retrieval.")


def create_analysts(state: GenerateAnalystsState):

    """ Create analysts """

    topic = state['topic']
    max_analysts = state['max_analysts']

    structured_llm = llm.with_structured_output(Perspectives)

    analyst_instructions = """You are tasked with creating a set of AI analyst personas. Follow these instructions carefully:
1. First, review the research topic: {topic}
2. Create {max_analysts} analysts with the following roles:
    - Industry expert
    - GenAI expert
    - Business strategist
3. Determine the most interesting themes based upon documents and/or feedback above.
4. Pick the top {max_analysts} themes.
5. For each theme, create one analyst with ALL of the following required fields:
    - name: A fitting name for the analyst
    - role: Their specific role or title
    - affiliation: Their primary organization or institution
    - description: A detailed description of their focus areas, concerns, and motives
6. Ensure every analyst includes all four fields without exception.
Remember: Every analyst **MUST** have all four fields (name, role, affiliation, and description) properly defined. Incomplete personas are not acceptable."""

    # System message
    system_message = analyst_instructions.format(topic=topic, max_analysts=max_analysts)

    analysts = structured_llm.invoke([SystemMessage(content=system_message)] + [HumanMessage(content="Generate the set of analysts.")])

    # Write the list of analysts to state
    return {"analysts": analysts.analysts}



def vectorstore_writing(doc_splits):
    # Module-level global: search_vectorstore() below reads `retriever`,
    # so this function must run before any interview is conducted
    global retriever
    vectorstore = Chroma.from_documents(
        documents=doc_splits,
        collection_name="rag-chroma",
        embedding=JinaEmbeddings(model_name='jina-embeddings-v3'),
        persist_directory='./chroma_db'
    )
    retriever = vectorstore.as_retriever()
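
Because the retriever is a module global, call order matters; main.py follows this pattern:

vectorstore_writing(html_docs)                       # populates the module-global retriever
report = usecase_agent_func(topic, max_analysts=3)   # interview nodes then call retriever.invoke(...)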




def generate_question(state: InterviewState):
    """ Generate questions for the interview """

    # Get the analyst persona and the conversation so far
    analyst = state['analyst']
    messages = state['messages']

    question_instructions = """You are an analyst tasked with interviewing an expert to learn about the use of Generative AI (GenAI) applications in a specific industry or company, if mentioned.

Your goal is to uncover interesting and specific insights related to the topic of Generative AI use cases.

Interesting: Insights that are surprising, non-obvious, or reveal unique applications of GenAI in the industry or company.
Specific: Insights that avoid generalities and include specific examples or case studies relevant to the company's offerings, strategic focus areas, or the industry's needs.

Focus Areas:
Explore the company's key offerings and strategic focus areas (e.g., operations, supply chain, customer experience, etc.), if the company is named.
Discuss industry-wide trends, innovations, and opportunities enabled by GenAI, such as improved operational efficiency, enhanced customer experiences, or streamlined supply chain processes.
Gather details on the company or industry's vision and products, focusing on how GenAI can be applied to enhance or transform their workflows.

Task:
Begin by introducing yourself with a name that fits your persona, then ask your question.

Continue asking follow-up questions to drill down into:

Specific GenAI use cases within the company's domain or the industry.
How these applications align with the company's or industry's strategic goals.
Real-world examples or future opportunities for integrating GenAI into their processes.

Complete the interview by saying:
"Thank you so much for your help!"

Remember to stay in character throughout the conversation, reflecting your persona and the provided goals."""

    # Inject the persona and pass the conversation so far, so follow-up
    # questions actually build on the expert's previous answers
    system_message = question_instructions + f"\n\nHere is your persona: {analyst.persona}"
    question = llm.invoke([SystemMessage(content=system_message)] + messages)

    return {"messages": [question]}



def search_vectorstore(state: InterviewState):

    """ Retrieve docs from the vector store """

    # Search query writing
    search_instructions = SystemMessage(content="""You will be given a conversation between an analyst and an expert.

Your goal is to generate a well-structured query for use in retrieval and/or web search related to the conversation.

First, analyze the full conversation.

Pay particular attention to the final question posed by the analyst.

Convert this final question into a well-structured web search query.""")

    # Search query
    structured_llm = llm.with_structured_output(SearchQuery)
    search_query = structured_llm.invoke([search_instructions] + state['messages'])

    # Search
    search_docs = retriever.invoke(input=search_query.search_query)

    # Format
    formatted_search_docs = "\n\n---\n\n".join(
        [
            f'<Document source="{doc.metadata["source"]}" page="{doc.metadata.get("page", "")}"/>\n{doc.page_content}\n</Document>'
            for doc in search_docs
        ]
    )

    return {"context": [formatted_search_docs]}



def generate_answer(state: InterviewState):

    """ Node to answer a question """

    # Get state
    analyst = state["analyst"]
    messages = state["messages"]
    context = state["context"]

    answer_instructions = """You are an expert being interviewed by an analyst.

Here is the analyst's area of focus: {goals}.

Your goal is to answer the question posed by the interviewer.

To answer the question, use this context:

{context}

When answering questions, follow these guidelines:

1. Use only the information provided in the context.

2. Do not introduce external information or make assumptions beyond what is explicitly stated in the context.

3. The context contains sources at the top of each individual document.

4. Include these sources in your answer next to any relevant statements. For example, for source # 1 use [1].

5. List your sources in order at the bottom of your answer. [1] Source 1, [2] Source 2, etc.

6. If the source is: <Document source="assistant/docs/llama3_1.pdf" page="7"/> then just list:

[1] assistant/docs/llama3_1.pdf, page 7

And skip the addition of the brackets as well as the Document source preamble in your citation."""

    # Answer the question
    system_message = answer_instructions.format(goals=analyst.persona, context=context)
    answer = llm.invoke([SystemMessage(content=system_message)] + messages)

    # Name the message as coming from the expert
    answer.name = "expert"

    # Append it to state
    return {"messages": [answer]}


def save_interview(state: InterviewState):

    """ Save the interview transcript """

    # Get messages
    messages = state["messages"]

    # Convert the interview to a string
    interview = get_buffer_string(messages)

    # Save to the interview key
    return {"interview": interview}



def route_messages(state: InterviewState,
                   name: str = "expert"):

    """ Route between question and answer """

    # Get messages
    messages = state["messages"]
    max_num_turns = state.get('max_num_turns', 2)

    # Check the number of expert answers
    num_responses = len(
        [m for m in messages if isinstance(m, AIMessage) and m.name == name]
    )

    # End once the expert has answered max_num_turns times
    if num_responses >= max_num_turns:
        return 'save_interview'

    # This router runs after each question-answer pair
    # Get the last question asked to check if it signals the end of the discussion
    last_question = messages[-2]

    if "Thank you so much for your help" in last_question.content:
        return 'save_interview'
    return "ask_question"



def write_section(state: InterviewState):

    """ Node to write a report section from the interview """

    # Get state
    interview = state["interview"]
    context = state["context"]
    analyst = state["analyst"]

    section_writer_instructions = """You are an expert technical writer.

Your task is to create a short, easily digestible section of a report based on a set of source documents.

1. Analyze the content of the source documents:
- The name of each source document is at the start of the document, with the <Document tag.

2. Create a report structure using markdown formatting:
- Use ## for the section title
- Use ### for sub-section headers

3. Write the report following this structure:
a. Title (## header)
b. Summary (### header)
c. Sources (### header)

4. Make your title engaging based upon the focus area of the analyst:
{focus}

5. For the summary section:
- Set up the summary with general background / context related to the focus area of the analyst
- Emphasize what is novel, interesting, or surprising about insights gathered from the interview
- Create a numbered list of source documents as you use them
- Do not mention the names of interviewers or experts
- Aim for approximately 400 words maximum
- Use numbered sources in your report (e.g., [1], [2]) based on information from source documents

6. In the Sources section:
- Include all sources used in your report
- Provide full links to relevant websites or specific document paths
- Separate each source by a newline. Use two spaces at the end of each line to create a newline in Markdown.
- It will look like:

### Sources
[1] Link or Document name
[2] Link or Document name

7. Be sure to combine sources. For example this is not correct:

[3] https://ai.meta.com/blog/meta-llama-3-1/
[4] https://ai.meta.com/blog/meta-llama-3-1/

There should be no redundant sources. It should simply be:

[3] https://ai.meta.com/blog/meta-llama-3-1/

8. Final review:
- Ensure the report follows the required structure
- Include no preamble before the title of the report
- Check that all guidelines have been followed"""

    # Write the section using the source docs gathered during the interview (context)
    system_message = section_writer_instructions.format(focus=analyst.description)
    section = llm.invoke([SystemMessage(content=system_message)] + [HumanMessage(content=f"Use this source to write your section: {context}")])

    # Append it to state
    return {"sections": [section.content]}



# Add nodes and edges
interview_builder = StateGraph(InterviewState)
interview_builder.add_node("ask_question", generate_question)
interview_builder.add_node("search_rag", search_vectorstore)
interview_builder.add_node("answer_question", generate_answer)
interview_builder.add_node("save_interview", save_interview)
interview_builder.add_node("write_section", write_section)

# Flow
interview_builder.add_edge(START, "ask_question")
interview_builder.add_edge("ask_question", "search_rag")
interview_builder.add_edge("search_rag", "answer_question")
interview_builder.add_conditional_edges("answer_question", route_messages, ['ask_question', 'save_interview'])
interview_builder.add_edge("save_interview", "write_section")
interview_builder.add_edge("write_section", END)

# Interview
memory = MemorySaver()
interview_graph = interview_builder.compile(checkpointer=memory).with_config(run_name="Conduct Interviews")
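
The compiled sub-graph can also be exercised on its own, which is handy for debugging a single interview. A sketch, assuming vectorstore_writing has already populated the retriever and an analysts list exists from create_analysts:

config = {"configurable": {"thread_id": "interview-1"}}
final = interview_graph.invoke(
    {"analyst": analysts[0],
     "messages": [HumanMessage(content="So you said you were writing an article on this topic?")],
     "max_num_turns": 2},
    config,
)
print(final["sections"][0])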



class ResearchGraphState(TypedDict):
    topic: str  # Research topic
    max_analysts: int  # Number of analysts
    analysts: List[Analyst]  # Analysts asking questions
    sections: Annotated[list, add]  # Send() API key
    introduction: str  # Introduction for the final report
    content: str  # Content for the final report
    conclusion: str  # Conclusion for the final report
    final_report: str  # Final report
    human_analyst_feedback: Optional[str]  # Human feedback



def initiate_all_interviews(state: ResearchGraphState):
    """ This is the "map" step where we run each interview sub-graph using the Send API """

    # Check for human feedback
    human_analyst_feedback = state.get('human_analyst_feedback')
    if human_analyst_feedback:
        # Return to create_analysts
        return "create_analysts"

    # Otherwise kick off interviews in parallel via the Send() API
    else:
        topic = state["topic"]
        return [Send("conduct_interview", {"analyst": analyst,
                                           "messages": [HumanMessage(
                                               content=f"So you said you were writing an article on {topic}?")],
                                           }) for analyst in state["analysts"]]

report_writer_instructions = '''You are a technical writer tasked with creating a report on the overall topic:

**{topic}**

Your team of analysts has conducted interviews and written memos based on their findings. Your task is to consolidate the insights from these memos into a cohesive and structured report, following this format:

Think deeply and generate at least 2 use cases based on the memos.

### Format for Each Use Case
1. **Title Header:** Use a descriptive title for each use case, such as "## Use Case 1: AI-Powered Predictive Maintenance."
2. **Objective/Use Case:** Summarize the primary goal or application of AI for this use case in one or two sentences.
3. **AI Application:** Describe the specific AI technologies or methods used to achieve the objective.
4. **Cross-Functional Benefit:** Outline the key benefits across various functions, formatted as bullet points, specifying which department or area benefits from the AI use case.

### Example Format:

## Use Case 1: AI-Powered Predictive Maintenance
**Objective/Use Case:** Reduce equipment downtime and maintenance costs by predicting equipment failures before they occur.
**AI Application:** Implement machine learning algorithms that analyze real-time sensor data from machinery to predict potential failures and schedule maintenance proactively.
**Cross-Functional Benefit:**
- **Operations & Maintenance:** Minimizes unplanned downtime and extends equipment lifespan.
- **Finance:** Reduces maintenance costs and improves budgeting accuracy.
- **Supply Chain:** Optimizes spare parts inventory based on predictive insights.

## Use Case 2: Real-Time Quality Control with Computer Vision
**Objective/Use Case:** Enhance product quality by detecting defects in products during manufacturing.
**AI Application:** Deploy AI-powered computer vision systems on production lines to identify surface defects and inconsistencies in real time.
**Cross-Functional Benefit:**
- **Quality Assurance:** Improves defect detection accuracy and reduces scrap rates.
- **Production:** Enables immediate corrective actions, enhancing overall efficiency.
- **Customer Satisfaction:** Delivers higher-quality products, strengthening client relationships.

### Report Guidelines
1. Begin with the first use case title in the specified format.
2. Do not include any preamble or introductory text for the report.
3. Consolidate insights into distinct use cases, with a focus on clarity and relevance.
4. Preserve any citations included in the memos, formatted in brackets, e.g., [1], [2].
5. After detailing all use cases, include a **Sources** section with the title: `## Sources`.
6. Be sure to combine sources. For example this is not correct:

[3] https://ai.meta.com/blog/meta-llama-3-1/
[4] https://ai.meta.com/blog/meta-llama-3-1/

There should be no redundant sources. It should simply be:
[3] https://ai.meta.com/blog/meta-llama-3-1/

### Your Inputs
You will be given a collection of memos from your analysts under `{context}`. Extract and distill insights into specific use cases, ensuring each use case adheres to the prescribed format.'''

def write_report(state: ResearchGraphState):
    # Full set of sections
    sections = state["sections"]
    topic = state["topic"]

    # Concat all sections together
    formatted_str_sections = "\n\n".join([f"{section}" for section in sections])

    # Summarize the sections into a final report
    system_message = report_writer_instructions.format(topic=topic, context=formatted_str_sections)
    report = llm.invoke([SystemMessage(content=system_message)] + [HumanMessage(content="Write a report based upon these memos.")])
    return {"content": report.content}


def human_feedback(state: ResearchGraphState):
    """ No-op node that should be interrupted on """
    pass



def write_introduction(state: ResearchGraphState):
    # Full set of sections
    sections = state["sections"]
    topic = state["topic"]

    # Concat all sections together
    formatted_str_sections = "\n\n".join([f"{section}" for section in sections])

    intro_conclusion_instructions = """You are a technical writer finishing a report on {topic}

You will be given all of the sections of the report.

Your job is to write a crisp and compelling introduction or conclusion section.

The user will instruct you whether to write the introduction or conclusion.

Include no preamble for either section.

Target around 100 words, crisply previewing (for introduction) or recapping (for conclusion) all of the sections of the report.

Use markdown formatting.

For your introduction, create a compelling title and use the # header for the title.

For your introduction, use ## Introduction as the section header.

For your conclusion, use ## Conclusion as the section header.

Here are the sections to reflect on for writing: {formatted_str_sections}"""

    # Write the introduction from the combined sections
    instructions = intro_conclusion_instructions.format(topic=topic, formatted_str_sections=formatted_str_sections)
    intro = llm.invoke([SystemMessage(content=instructions)] + [HumanMessage(content="Write the report introduction")])
    return {"introduction": intro.content}


def write_conclusion(state: ResearchGraphState):
    # Full set of sections
    sections = state["sections"]
    topic = state["topic"]

    # Concat all sections together
    formatted_str_sections = "\n\n".join([f"{section}" for section in sections])

    intro_conclusion_instructions = """You are a technical writer finishing a report on {topic}

You will be given all of the sections of the report.

Your job is to write a crisp and compelling introduction or conclusion section.

The user will instruct you whether to write the introduction or conclusion.

Include no preamble for either section.

Target around 100 words, crisply previewing (for introduction) or recapping (for conclusion) all of the sections of the report.

Use markdown formatting.

For your introduction, create a compelling title and use the # header for the title.

For your introduction, use ## Introduction as the section header.

For your conclusion, use ## Conclusion as the section header.

Here are the sections to reflect on for writing: {formatted_str_sections}"""

    # Write the conclusion from the combined sections
    instructions = intro_conclusion_instructions.format(topic=topic, formatted_str_sections=formatted_str_sections)
    conclusion = llm.invoke([SystemMessage(content=instructions)] + [HumanMessage(content="Write the report conclusion")])
    return {"conclusion": conclusion.content}


def finalize_report(state: ResearchGraphState):
    """ This is the "reduce" step where we gather all the sections, combine them, and reflect on them to write the intro/conclusion """
    # Save full final report
    content = state["content"]
    if content.startswith("## Insights"):
        # removeprefix, not strip: str.strip() removes a set of characters, not a prefix
        content = content.removeprefix("## Insights")
    if "## Sources" in content:
        try:
            content, sources = content.split("\n## Sources\n")
        except ValueError:
            sources = None
    else:
        sources = None

    final_report = state["introduction"] + "\n\n---\n\n" + content + "\n\n---\n\n" + state["conclusion"]
    if sources is not None:
        final_report += "\n\n## Sources\n" + sources
    return {"final_report": final_report}
557
+
558
+
559
+
560
+ def usecase_agent_func(topic,max_analysts):
561
+ # Add nodes and edges
562
+ builder = StateGraph(ResearchGraphState)
563
+ builder.add_node("create_analysts", create_analysts)
564
+ builder.add_node("human_feedback", human_feedback)
565
+ builder.add_node("conduct_interview", interview_builder.compile())
566
+ builder.add_node("write_report",write_report)
567
+ builder.add_node("write_introduction",write_introduction)
568
+ builder.add_node("write_conclusion",write_conclusion)
569
+ builder.add_node("finalize_report",finalize_report)
570
+
571
+ # Logic
572
+ builder.add_edge(START, "create_analysts")
573
+ builder.add_edge("create_analysts", "human_feedback")
574
+ builder.add_conditional_edges("human_feedback", initiate_all_interviews, ["create_analysts", "conduct_interview"])
575
+ builder.add_edge("conduct_interview", "write_report")
576
+ builder.add_edge("conduct_interview", "write_introduction")
577
+ builder.add_edge("conduct_interview", "write_conclusion")
578
+ builder.add_edge(["write_conclusion", "write_report", "write_introduction"], "finalize_report")
579
+ builder.add_edge("finalize_report", END)
580
+
581
+ # Compile
582
+ memory = MemorySaver()
583
+ graph = builder.compile(checkpointer=memory)
584
+ config = {"configurable": {"thread_id": "1"}}
585
+ graph.invoke({"topic":topic,
586
+ "max_analysts":max_analysts,
587
+ 'human_analyst_feedback': None},
588
+ config)
589
+ final_state = graph.get_state(config)
590
+ report = final_state.values.get('final_report')
591
+
592
+ print('-----REPORT-----', report)
593
+
594
+ return report
595
+
596
+
597
+
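
Called directly, the agent runs the full map-reduce pipeline and returns the markdown report. A sketch mirroring how main.py invokes it:

report = usecase_agent_func(
    topic='GenAI Usecases in Healthcare. Explore Healthcare GenAI applications, trends, challenges, and opportunities.',
    max_analysts=3,
)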
vectorstore.py ADDED
@@ -0,0 +1,283 @@
from setup import *
import tempfile
import requests

from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
# from langchain_text_splitters import RecursiveCharacterTextSplitter
from urllib.parse import urlparse
from langchain.docstore.document import Document


def extract_urls(agentstate_result):
    urls = []
    content = []
    for item in agentstate_result['link_list']:
        urls.append(item['url'])
        content.append(item['content'])

    return urls, content


# Function to classify a URL based on file extension
def classify_url_by_extension(url):
    """
    Classifies a URL based on its file extension.
    Focuses only on pdf and html, classifying others as unknown.
    """

    if not isinstance(url, str):
        raise ValueError(f"Expected a string, but got {type(url)}")

    # Extract the file extension from the URL
    try:
        file_extension = urlparse(url).path.split('.')[-1].lower()
        if file_extension == 'pdf':
            return 'pdf'
        elif file_extension in ['html', 'htm']:
            return 'html'
        else:
            return 'unknown'
    except Exception as e:
        print(f"Error while parsing URL: {url} - {e}")
        return 'unknown'

# Function to classify based on the HTTP Content-Type header (optional, for extra accuracy)
def classify_url_by_header(url):
    """
    Classifies a URL based on the HTTP Content-Type header.
    Focuses only on pdf and html, classifying others as unknown.
    """
    try:
        response = requests.head(url, timeout=5)  # Use a HEAD request to fetch headers only
        content_type = response.headers.get('Content-Type', '').lower()

        if 'pdf' in content_type:
            return 'pdf'
        elif 'html' in content_type:
            return 'html'
        else:
            return 'unknown'
    except requests.RequestException as e:
        print(f"Error while making HEAD request: {url} - {e}")
        return 'unknown'

69
+ def urls_classify_list(urls):
70
+ """
71
+ Classifies a list of URLs into pdf, html, and unknown.
72
+ Returns two separate lists: one for pdf URLs and one for html URLs.
73
+ """
74
+ if not isinstance(urls, list):
75
+ raise ValueError("Expected a list of URLs")
76
+
77
+ pdf_urls = []
78
+ html_urls = []
79
+
80
+ # Classify each URL
81
+ for url in urls:
82
+ file_type = classify_url_by_extension(url) # First, try classifying by extension
83
+ if file_type == 'unknown':
84
+ # If extension-based classification failed, fall back to HTTP header classification
85
+ file_type = classify_url_by_header(url)
86
+
87
+ if file_type == 'pdf':
88
+ pdf_urls.append(url)
89
+ elif file_type == 'html':
90
+ html_urls.append(url)
91
+
92
+ return pdf_urls, html_urls
93
+
94
+
95
+
96
+ def urls_classify_list(urls: list):
97
+ pdf_urls=[]
98
+ html_urls=[]
99
+ # Classify the URLs
100
+ for url in urls:
101
+ file_type = classify_url_by_extension(url) # First, try classifying by extension
102
+ if file_type == 'unknown':
103
+ # If extension-based classification failed, fall back to HTTP header classification
104
+ file_type = classify_url_by_header(url)
105
+
106
+ if file_type == 'pdf':
107
+ pdf_urls.append(url)
108
+
109
+ if file_type == 'html' or file_type == 'unknown':
110
+ html_urls.append(url)
111
+
112
+ return pdf_urls, html_urls
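
A quick check of the classifier's behavior (the URLs are illustrative):

pdf_urls, html_urls = urls_classify_list([
    'https://example.com/annual-report.pdf',   # extension match -> pdf
    'https://example.com/about.html',          # extension match -> html
    'https://example.com/investors',           # no extension -> HEAD request fallback
])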



def clean_and_extract_html_data(html_urls, chunk_size=100, chunk_overlap=25):
    """
    Loads HTML content from URLs, cleans the data, and splits it into smaller chunks.

    Args:
        html_urls (list): List of HTML URLs to process.
        chunk_size (int): Maximum size of each chunk.
        chunk_overlap (int): Overlap between chunks.

    Returns:
        list: List of document chunks.
    """

    def clean_content(content):
        """
        Cleans the content by removing unwanted patterns and short lines.
        """
        cleaned_content = content.strip()  # Remove leading/trailing whitespace
        lines = cleaned_content.split('\n')  # Split by newlines
        meaningful_lines = [line.strip() for line in lines if len(line.strip()) > 3]  # Keep meaningful lines
        return '\n'.join(meaningful_lines)

    def split_document(doc_content, chunk_size, chunk_overlap):
        """
        Splits a document into smaller chunks with overlap.
        """
        chunks = []
        start = 0
        while start < len(doc_content):
            end = start + chunk_size
            chunk = doc_content[start:end]
            chunks.append(chunk)
            start = end - chunk_overlap if end < len(doc_content) else len(doc_content)
        return chunks

    # Step 1: Load documents from URLs
    docs = []
    for url in html_urls:
        try:
            loader = WebBaseLoader(url)
            data = loader.load()
            docs.extend(data)
        except Exception as e:
            print(f"Error loading URL {url}: {e}")

    # Step 2: Clean the content to remove unwanted data
    cleaned_docs = []
    for doc in docs:
        cleaned_content = clean_content(doc.page_content)
        if cleaned_content:  # Exclude empty documents
            doc.page_content = cleaned_content
            cleaned_docs.append(doc)

    # Step 3: Split the cleaned documents into chunks
    doc_splits = []
    for doc in cleaned_docs:
        chunks = split_document(doc.page_content, chunk_size, chunk_overlap)
        for chunk in chunks:
            doc_splits.append(Document(page_content=chunk, metadata=doc.metadata))

    return doc_splits
178
+
179
+
180
+
181
+
182
+
183
+ # def extract_pdf_from_url(url):
184
+ # """
185
+ # Extract text from a PDF available at a URL.
186
+
187
+ # Args:
188
+ # url (str): The URL of the PDF file.
189
+
190
+ # Returns:
191
+ # str: Extracted text from the PDF.
192
+ # """
193
+ # # Step 1: Download the PDF from the URL
194
+ # response = requests.get(url)
195
+ # if response.status_code == 200:
196
+ # pdf_content = response.content
197
+ # else:
198
+ # raise ValueError(f"Failed to fetch the PDF. HTTP Status Code: {response.status_code}")
199
+
200
+ # # Step 2: Save PDF content to a temporary file
201
+ # with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
202
+ # temp_pdf.write(pdf_content)
203
+ # temp_pdf_path = temp_pdf.name # Get the file path
204
+
205
+ # # Step 3: Load the PDF using PyPDFLoader
206
+ # loader = PyPDFLoader(temp_pdf_path)
207
+ # documents = loader.load()
208
+
209
+ # # Step 4: Extract text from all pages
210
+ # extracted_text = "\n".join(doc.page_content for doc in documents)
211
+
212
+ # return extracted_text
213
+
214
+
215
+ # def clean_and_split_pdf_text(pdf_text, chunk_size=100, chunk_overlap=25):
216
+ # """
217
+ # Cleans and splits the extracted PDF text into smaller chunks.
218
+
219
+ # Args:
220
+ # pdf_text (str): Extracted text from a PDF.
221
+ # chunk_size (int): Maximum size of each chunk.
222
+ # chunk_overlap (int): Overlap between chunks.
223
+
224
+ # Returns:
225
+ # list: List of document chunks.
226
+ # """
227
+ # def clean_content(content):
228
+ # """
229
+ # Cleans the text by removing unwanted patterns and short lines.
230
+ # """
231
+ # content = content.strip() # Remove leading/trailing whitespace
232
+ # lines = content.split('\n') # Split into lines
233
+ # meaningful_lines = [line.strip() for line in lines if len(line.strip()) > 3] # Exclude short lines
234
+ # return '\n'.join(meaningful_lines)
235
+
236
+ # def split_text(content, chunk_size, chunk_overlap):
237
+ # """
238
+ # Splits cleaned text into smaller chunks with overlap.
239
+ # """
240
+ # chunks = []
241
+ # start = 0
242
+ # while start < len(content):
243
+ # end = start + chunk_size
244
+ # chunks.append(content[start:end])
245
+ # start = end - chunk_overlap if end < len(content) else len(content)
246
+ # return chunks
247
+
248
+ # # Step 1: Clean the text
249
+ # cleaned_text = clean_content(pdf_text)
250
+
251
+ # # Step 2: Split the cleaned text
252
+ # return split_text(cleaned_text, chunk_size, chunk_overlap)
253
+
254
+
255
+ # def pdf_extraction(pdf_urls, chunk_size=100, chunk_overlap=25):
256
+ # """
257
+ # Extracts and processes text from a list of PDF URLs.
258
+
259
+ # Args:
260
+ # pdf_urls (list): List of PDF URLs.
261
+ # chunk_size (int): Maximum size of each chunk.
262
+ # chunk_overlap (int): Overlap between chunks.
263
+
264
+ # Returns:
265
+ # list: List of Document objects containing split text.
266
+ # """
267
+ # all_chunks = []
268
+
269
+ # for pdf_url in pdf_urls:
270
+ # try:
271
+ # # Extract text from the PDF
272
+ # extracted_text = extract_pdf_from_url(pdf_url)
273
+
274
+ # # Clean and split the text
275
+ # chunks = clean_and_split_pdf_text(extracted_text, chunk_size, chunk_overlap)
276
+
277
+ # # Convert chunks into Document objects
278
+ # for chunk in chunks:
279
+ # all_chunks.append(Document(page_content=chunk, metadata={"source": pdf_url}))
280
+ # except Exception as e:
281
+ # print(f"Error processing PDF URL {pdf_url}: {e}")
282
+
283
+ # return all_chunks