Dharma20 committed on
Commit 9cac175 · verified · Parent(s): 1b669d3

Upload 5 files

Files changed (5):
  1. agents.py +122 -0
  2. feasibility_agent.py +146 -0
  3. main.py +155 -0
  4. usecase_agent.py +597 -0
  5. vectorstore.py +283 -0
agents.py ADDED
@@ -0,0 +1,122 @@
from setup import *
import re
from typing import Annotated, Sequence, List, Optional
from typing_extensions import TypedDict

from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
from langgraph.graph.message import add_messages
from langgraph.graph import START, StateGraph, END
from langgraph.checkpoint.memory import MemorySaver
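
Every module in this commit does "from setup import *", but setup.py itself is not included. A minimal sketch of what it presumably provides, inferred from how the names are used below; the model choice and parameters here are assumptions, not the author's actual configuration:

# setup.py -- hypothetical sketch; the real module is not part of this commit.
from langchain_openai import ChatOpenAI
from langchain_community.tools import TavilySearchResults

# `llm` must accept message lists via .invoke() and support .with_structured_output()
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# `tavily_search` is called as tavily_search.invoke({"query": ...}) in web_scraping below
tavily_search = TavilySearchResults(max_results=3)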


# Research agent state
class AgentState(TypedDict):
    messages: Annotated[Sequence[BaseMessage], add_messages]
    queries: List[str]
    link_list: Optional[List]
    industry: Optional[str]
    company: Optional[str]


# Node
def assistant(state: AgentState):
    assistant_sys_msg = SystemMessage(content='''You are a highly intelligent and helpful assistant. Your primary task is to analyze user queries and determine whether the query:

Refers to an industry (general context), or
Refers to a specific company (e.g., mentions a company's name explicitly).

For every query:
Check for company names, brands, or proper nouns that indicate a specific entity.
When analyzing the company's industry, be as specific as possible.
Return the company and industry name found in the query.
If you can't find an industry name, return an empty string.

Example 1:
Query: "GenAI in MRF Tyres"
Company: "MRF Tyres"
Industry: "Tires and rubber products"

Example 2:
Query: "GenAI in the healthcare industry"
Company: ""
Industry: "Healthcare"
''')
    return {'messages': [llm.invoke([assistant_sys_msg] + state["messages"])]}


def company_and_industry_query(state: AgentState):
    print('--extract_company_and_industry--entered--')
    text = state['messages'][-1].content

    # Define patterns for extracting company and industry
    company_pattern = r'Company:\s*"([^"]+)"'
    industry_pattern = r'Industry:\s*"([^"]+)"'

    # Search for matches
    company_match = re.search(company_pattern, text)
    industry_match = re.search(industry_pattern, text)

    # Extract matched groups or fall back to None if not found
    company_name = company_match.group(1) if company_match else None
    industry_name = industry_match.group(1) if industry_match else None

    queries = []
    if company_name:
        queries.extend([f'{company_name} Annual report latest AND {company_name} website AND no PDF results',
                        # f'{company_name} GenAI applications',
                        # f'{company_name} key offerings and strategic focus areas (e.g., operations, supply chain, customer experience)',
                        # f'{company_name} competitors and market share'
                        ])

    if industry_name:
        queries.extend([
            f'{industry_name} report latest mckinsey, deloitte, nexocode',
            # f'{industry_name} GenAI applications',
            # f'{industry_name} trends, challenges and opportunities'
        ])

    print('--extract_company_and_industry--finished--', queries)
    return {'queries': queries, 'company': company_name, 'industry': industry_name}


def web_scraping(state: AgentState):
    print('--web_scraping--entered--')
    queries = state['queries']
    link_list = []
    for query in queries:
        query_results = tavily_search.invoke({"query": query})
        link_list.extend(query_results)

    print('--web_scraping--finished--')
    return {'link_list': link_list}


# Agent Graph
def research_agent(user_query: str):
    builder = StateGraph(AgentState)
    builder.add_node('assistant', assistant)
    builder.add_node('names_extract', company_and_industry_query)
    builder.add_node('web_scraping', web_scraping)

    builder.add_edge(START, "assistant")
    builder.add_edge("assistant", "names_extract")
    builder.add_edge("names_extract", 'web_scraping')
    builder.add_edge("web_scraping", END)

    # Memory
    memory = MemorySaver()
    react_graph = builder.compile(checkpointer=memory)

    config = {'configurable': {'thread_id': '1'}}
    messages = [HumanMessage(content=user_query)]
    agentstate_result = react_graph.invoke({'messages': messages}, config)

    return agentstate_result
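
A quick smoke test of the research agent (a sketch; it assumes setup.py exposes working llm and tavily_search objects):

if __name__ == '__main__':
    result = research_agent("GenAI in MRF Tyres")
    print(result['company'], '|', result['industry'])
    print(len(result['link_list']), 'search results collected')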
feasibility_agent.py ADDED
@@ -0,0 +1,146 @@
from setup import *
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from pydantic import BaseModel, ValidationError
from typing import List
from langchain_community.tools import TavilySearchResults


keyword_search = TavilySearchResults(
    max_results=2,
    search_depth="advanced",
    include_answer=True,
    include_raw_content=True,
    include_images=True,
)


# Define the UseCaseKeywords model to include use_case, description, and keyword
class UseCaseKeywords(BaseModel):
    use_case: str
    description: str
    keyword: str

    # Return the model fields as a plain dictionary
    def to_dict(self) -> dict:
        return {
            'use_case': self.use_case,
            'description': self.description,
            'keyword': self.keyword
        }

# Define the KeywordGenerationResponse model to contain a list of UseCaseKeywords
class KeywordGenerationResponse(BaseModel):
    data: List[UseCaseKeywords]

    # Convert the list of UseCaseKeywords to a list of dictionaries
    def to_list_of_dicts(self) -> List[dict]:
        return [entry.to_dict() for entry in self.data]

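The two helper methods flatten the structured output into plain dicts for the downstream dataframe and Excel steps. For instance (values are illustrative):

resp = KeywordGenerationResponse(data=[
    UseCaseKeywords(use_case='Demand forecasting',
                    description='Predict demand from historical sales data',
                    keyword='retail demand forecasting dataset'),
])
resp.to_list_of_dicts()
# -> [{'use_case': 'Demand forecasting', 'description': '...', 'keyword': 'retail demand forecasting dataset'}]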

def keyword_generation(report):

    query_generation_sys_prompt = SystemMessage(content='''You are an expert in creating precise and relevant keyword queries to search for datasets. Your task is to generate a keyword query for each use case provided below. These queries should be optimized for searching datasets on platforms such as GitHub, Kaggle, and Hugging Face.

Your JSON structure must strictly include:
[
  {
    "use_case": "string",
    "description": "string",
    "keyword": "string"
  }
]

**Instructions:**
1. Extract the key concepts from the use case (e.g., objectives, AI application, and domain).
2. Formulate a concise, descriptive query using relevant terms and synonyms.
3. Include terms related to data types (e.g., "customer data," "chat logs," "shopping behavior"), AI techniques (e.g., "machine learning," "recommendation systems"), and target domain (e.g., "e-commerce," "retail").
4. Create an output dictionary with the use case title as the key and the keyword query as the value.

**Use Cases: Examples**
## Use Case 1: Personalized Shopping Experiences with GenAI
**Objective/Use Case:** Create tailored shopping experiences for individual customers based on their browsing history, purchasing behavior, and preferences.
**AI Application:** Implement machine learning algorithms that analyze customer data to generate personalized offers, marketing communications, and product recommendations.
**Cross-Functional Benefit:**
- **Marketing:** Increases customer satisfaction and loyalty through targeted marketing efforts.
- **Sales:** Boosts sales by offering relevant products to customers.
- **Customer Service:** Enhances customer experience through personalized support.

## Use Case 2: AI-Powered Chatbots for Customer Service
**Objective/Use Case:** Improve in-store customer service by providing instant assistance and directing customers to relevant products.
**AI Application:** Develop GenAI-powered chatbots that analyze customer queries and provide accurate responses, suggesting related products and services.
**Cross-Functional Benefit:**
- **Customer Service:** Reduces wait times and improves customer satisfaction.
- **Sales:** Increases sales by suggesting relevant products to customers.
- **Operations:** Enhances employee productivity by automating routine tasks.

You must strictly follow the format below for the output. Do not deviate from it.
Example output:
[{'use_case': "Personalized Shopping Experiences with GenAI",
'description': "AI-driven personalization enhances customer satisfaction through tailored offers, recommendations, and marketing based on individual preferences",
'keyword': "e-commerce personalized shopping data customer behavior recommendation system offers dataset"},
{'use_case': "AI-Powered Chatbots for Customer Service",
'description': "AI chatbots provide instant, accurate assistance, improving customer service, increasing sales, and boosting operational efficiency",
'keyword': "customer service chatbot dataset customer queries retail e-commerce AI automation"}]''')

    keyword_generation_llm = llm.with_structured_output(KeywordGenerationResponse)

    # The report containing the use cases is passed in as a human message
    report_msg = HumanMessage(content=f'The usecases are as follows {report}')

    try:
        # with_structured_output already returns a validated KeywordGenerationResponse
        # instance, not a dict, so no manual re-parsing is needed
        parsed_response = keyword_generation_llm.invoke([query_generation_sys_prompt, report_msg])
        if not isinstance(parsed_response, KeywordGenerationResponse):
            raise ValueError(f"Unexpected LLM output format: {type(parsed_response)}")
    except ValidationError as e:
        print(f"Validation error: {e}")
        raise

    # Convert the response to a list of dictionaries
    output_list = parsed_response.to_list_of_dicts()

    return output_list



def dataset_search(output_list):
    for usecase_dict in output_list:
        query = usecase_dict['keyword']
        # f-string, so the keyword is actually interpolated into the query
        query_format = f'kaggle OR github OR huggingface AND ({query})'
        links = keyword_search.invoke({'query': query_format})
        usecase_dict['links'] = links
    return output_list



def grouping_urls(output_list):
    for dict_item in output_list:
        urls_list = []
        for ele in dict_item['links']:
            urls_list.append(ele['url'])
        dict_item['urls_list'] = urls_list
    return output_list



def delete_columns(output_list):
    # Specify the keys to drop from each entry
    keys_to_del = ['links', 'keyword']

    for dict_item in output_list:
        for key in keys_to_del:
            dict_item.pop(key, None)
    return output_list


def feasibility_agent_func(report):
    dict_list = keyword_generation(report)
    dict_links = dataset_search(dict_list)
    urls_dict = grouping_urls(dict_links)
    pd_dict = delete_columns(urls_dict)

    return pd_dict
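
End to end, the module turns a use-case report into rows ready for a dataframe. A sketch of the expected shape (report is whatever usecase_agent_func produced):

rows = feasibility_agent_func(report)
# rows -> [{'use_case': '...', 'description': '...', 'urls_list': ['https://...', ...]}, ...]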
main.py ADDED
@@ -0,0 +1,155 @@
import gradio as gr
from setup import *
import pandas as pd
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Font
from agents import research_agent
from vectorstore import extract_urls, urls_classify_list, clean_and_extract_html_data
from usecase_agent import usecase_agent_func, vectorstore_writing
# from feasibility_agent import feasibility_agent_func
# (the pandas/openpyxl imports above are only needed by the commented-out Excel export below)


# # Function to create Excel file
# def create_excel(df):
#     # Create a new Excel workbook and select the active sheet
#     wb = Workbook()
#     ws = wb.active
#     ws.title = "Use Cases"

#     # Define and write headers to the Excel sheet
#     headers = ['Use Case', 'Description', 'URLs']
#     ws.append(headers)

#     # Write data rows
#     for _, row in df.iterrows():
#         try:
#             use_case = row['use_case']
#             description = row['description']
#             urls = row['urls_list']

#             ws.append([use_case, description, None])  # Add use case and description
#             if urls:
#                 for url_index, url in enumerate(urls):
#                     cell = ws.cell(row=ws.max_row, column=3)  # URLs go into the third column
#                     cell.value = url
#                     cell.hyperlink = url
#                     cell.font = Font(color="0000FF", underline="single")

#                     # Add a new row for additional URLs
#                     if url_index < len(urls) - 1:
#                         ws.append([None, None, None])
#         except KeyError as e:
#             print(f"Missing key in DataFrame row: {e}")
#         except Exception as e:
#             print(f"Unexpected error while processing row: {e}")

#     excel_file_path = "GenAI_use_cases_feasibility.xlsx"
#     wb.save(excel_file_path)
#     return excel_file_path


# # Function to handle the report and create the DataFrame
# def pd_creation(report):
#     # feasibility_agent_func returns a list of dicts
#     pd_dict = feasibility_agent_func(report)

#     # Check for expected keys in pd_dict before proceeding
#     required_columns = ['use_case', 'description', 'urls_list']
#     if not all(col in pd_dict for col in required_columns):
#         raise ValueError(f"Missing one or more expected columns: {required_columns}")

#     # Create the DataFrame from the dictionary
#     df = pd.DataFrame(pd_dict)

#     # Convert the dataframe to the format expected by Gradio (list of lists)
#     data = df.values.tolist()

#     # Create the Excel file and return its path
#     excel_file_path = create_excel(df)

#     return data, excel_file_path  # Return the formatted data and the Excel file path

# Main function that handles the user query and generates the report
def main(user_input):
    # Research agent
    agentstate_result = research_agent(user_input)

    # Vector store
    urls, content = extract_urls(agentstate_result)
    pdf_urls, html_urls = urls_classify_list(urls)
    html_docs = clean_and_extract_html_data(html_urls)

    # Write the scraped documents to the vector store
    vectorstore_writing(html_docs)

    # Use-case agent
    company_name = agentstate_result['company']
    industry_name = agentstate_result['industry']

    if company_name:
        topic = f'GenAI Usecases in {company_name} and {industry_name} industry. Explore {company_name} GenAI applications, key offerings, strategic focus areas, competitors, and market share.'
    else:
        topic = f'GenAI Usecases in {industry_name}. Explore {industry_name} GenAI applications, trends, challenges, and opportunities.'
    max_analysts = 3

    report = usecase_agent_func(topic, max_analysts)
    # pd_dict, excel_file_path = pd_creation(report)

    # Save the report as a markdown file
    report_file_path = "generated_report.md"
    with open(report_file_path, "w") as f:
        f.write(report)

    return report, report_file_path

# Example queries
examples = [
    "How is the retail industry leveraging AI and ML?",
    "AI applications in automotive manufacturing"
]

# Creating the Gradio interface
with gr.Blocks(theme=gr.themes.Soft(font=gr.themes.GoogleFont('Open Sans'))) as demo:
    # Header section
    gr.HTML("<center><h1>UseCaseGenie - Discover GenAI use cases for your company and industry! 🤖🧑‍🍳</h1></center>")
    gr.Markdown("""#### This GenAI assistant 🤖 helps you discover and explore Generative AI use cases for your company and industry.
    You can download the generated use case report as a <b>Markdown file</b> to gain insights and explore relevant GenAI applications.
    ### <b>Steps:</b>
    1. <b>Enter your query</b> regarding any company or industry.
    2. <b>Click on the 'Submit' button</b> and wait for the GenAI assistant to generate the report.
    3. <b>Download the generated report.</b>
    4. Explore the GenAI use cases and URLs for further analysis.
    """)


    # Input for the user query
    with gr.Row():
        user_input = gr.Textbox(label="Enter your Query", placeholder='Type here...')

    # Examples to help users with inputs
    with gr.Row():
        gr.Examples(examples=examples, inputs=user_input)

    # Buttons for submitting and clearing
    with gr.Row():
        submit_button = gr.Button("Submit")
        clear_btn = gr.ClearButton([user_input], value='Clear')

    # File download button
    with gr.Row():
        # Downloadable markdown report
        download_report_button = gr.File(label="Use Cases Report")

    # # Create a downloadable Excel file
    # download_excel_button = gr.File(label="Feasibility Excel File")

    # Display report in Markdown format
    with gr.Row():
        report_output = gr.Markdown()

    submit_button.click(main, inputs=[user_input], outputs=[report_output, download_report_button])

# Run the interface
demo.launch()
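
demo.launch() serves locally by default; the standard Gradio options cover the common deployment tweaks (a sketch, not part of the committed code):

# demo.launch(share=True)                                # temporary public URL
# demo.launch(server_name="0.0.0.0", server_port=7860)   # explicit bind, e.g. inside a container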
usecase_agent.py ADDED
@@ -0,0 +1,597 @@
from setup import *
from typing import Annotated, List, Optional
from typing_extensions import TypedDict
from pydantic import BaseModel, Field
from langgraph.graph import START, END, StateGraph, MessagesState
from langgraph.checkpoint.memory import MemorySaver
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, get_buffer_string
from langgraph.constants import Send
from operator import add
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings.jina import JinaEmbeddings
# from langchain_huggingface import HuggingFaceEmbeddings

class Analyst(BaseModel):
    affiliation: str = Field(
        description="Primary affiliation of the analyst.",
    )
    name: str = Field(
        description="Name of the analyst."
    )
    role: str = Field(
        description="Role of the analyst in the context of the topic.",
    )
    description: str = Field(
        description="Description of the analyst focus, concerns, and motives.",
    )

    @property
    def persona(self) -> str:
        return f"Name: {self.name}\nRole: {self.role}\nAffiliation: {self.affiliation}\nDescription: {self.description}\n"


class Perspectives(BaseModel):
    analysts: List[Analyst] = Field(
        description="Comprehensive list of analysts with their roles and affiliations.",
    )


class GenerateAnalystsState(TypedDict):
    topic: str  # Research topic
    max_analysts: int  # Number of analysts to generate
    analysts: List[Analyst]  # Analysts asking questions


class InterviewState(MessagesState):
    max_num_turns: int  # Number of turns in the conversation
    context: Annotated[list, add]  # Source docs
    analyst: Analyst  # Analyst asking questions
    interview: str  # Interview transcript
    sections: list  # Final key we duplicate in outer state for the Send() API


class SearchQuery(BaseModel):
    search_query: str = Field(None, description="Search query for retrieval.")


def create_analysts(state: GenerateAnalystsState):

    """ Create analysts """

    topic = state['topic']
    max_analysts = state['max_analysts']

    structured_llm = llm.with_structured_output(Perspectives)

    analyst_instructions = """You are tasked with creating a set of AI analyst personas. Follow these instructions carefully:
1. First, review the research topic: {topic}
2. Create {max_analysts} analysts with the following roles:
    - Industry expert
    - GenAI expert
    - Business strategist
3. Determine the most interesting themes based upon documents and/or feedback above.
4. Pick the top {max_analysts} themes.
5. For each theme, create one analyst with ALL of the following required fields:
    - name: A fitting name for the analyst
    - role: Their specific role or title
    - affiliation: Their primary organization or institution
    - description: A detailed description of their focus areas, concerns, and motives
6. Ensure every analyst includes all four fields without exception.
Remember: Every analyst **MUST** have all four fields (name, role, affiliation, and description) properly defined. Incomplete personas are not acceptable."""

    # System message
    system_message = analyst_instructions.format(topic=topic, max_analysts=max_analysts)

    analysts = structured_llm.invoke([SystemMessage(content=system_message)] + [HumanMessage(content="Generate the set of analysts.")])

    # Write the list of analysts to state
    return {"analysts": analysts.analysts}



def vectorstore_writing(doc_splits):
    # Module-level global: search_vectorstore() below reads `retriever`,
    # so this function must run before any interview is conducted
    global retriever
    vectorstore = Chroma.from_documents(
        documents=doc_splits,
        collection_name="rag-chroma",
        embedding=JinaEmbeddings(model_name='jina-embeddings-v3'),
        persist_directory='./chroma_db'
    )
    retriever = vectorstore.as_retriever()
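
Because the retriever is a module global, call order matters; main.py follows this pattern:

vectorstore_writing(html_docs)                       # populates the module-global retriever
report = usecase_agent_func(topic, max_analysts=3)   # interview nodes then call retriever.invoke(...)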




def generate_question(state: InterviewState):
    """ Generate questions for the interview """

    # Get the analyst persona and the conversation so far
    analyst = state['analyst']
    messages = state['messages']

    question_instructions = """You are an analyst tasked with interviewing an expert to learn about the use of Generative AI (GenAI) applications in a specific industry or company, if mentioned.

Your goal is to uncover interesting and specific insights related to the topic of Generative AI use cases.

Interesting: Insights that are surprising, non-obvious, or reveal unique applications of GenAI in the industry or company.
Specific: Insights that avoid generalities and include specific examples or case studies relevant to the company's offerings, strategic focus areas, or the industry's needs.

Focus Areas:
Explore the company's key offerings and strategic focus areas (e.g., operations, supply chain, customer experience, etc.), if the company is named.
Discuss industry-wide trends, innovations, and opportunities enabled by GenAI, such as improved operational efficiency, enhanced customer experiences, or streamlined supply chain processes.
Gather details on the company or industry's vision and products, focusing on how GenAI can be applied to enhance or transform their workflows.

Task:
Begin by introducing yourself with a name that fits your persona, then ask your question.

Continue asking follow-up questions to drill down into:

Specific GenAI use cases within the company's domain or the industry.
How these applications align with the company's or industry's strategic goals.
Real-world examples or future opportunities for integrating GenAI into their processes.

Complete the interview by saying:
"Thank you so much for your help!"

Remember to stay in character throughout the conversation, reflecting your persona and the provided goals."""

    # Inject the persona and pass the conversation so far, so follow-up
    # questions actually build on the expert's previous answers
    system_message = question_instructions + f"\n\nHere is your persona: {analyst.persona}"
    question = llm.invoke([SystemMessage(content=system_message)] + messages)

    return {"messages": [question]}



def search_vectorstore(state: InterviewState):

    """ Retrieve docs from the vector store """

    # Search query writing
    search_instructions = SystemMessage(content="""You will be given a conversation between an analyst and an expert.

Your goal is to generate a well-structured query for use in retrieval and/or web search related to the conversation.

First, analyze the full conversation.

Pay particular attention to the final question posed by the analyst.

Convert this final question into a well-structured web search query.""")

    # Search query
    structured_llm = llm.with_structured_output(SearchQuery)
    search_query = structured_llm.invoke([search_instructions] + state['messages'])

    # Search
    search_docs = retriever.invoke(input=search_query.search_query)

    # Format
    formatted_search_docs = "\n\n---\n\n".join(
        [
            f'<Document source="{doc.metadata["source"]}" page="{doc.metadata.get("page", "")}"/>\n{doc.page_content}\n</Document>'
            for doc in search_docs
        ]
    )

    return {"context": [formatted_search_docs]}



def generate_answer(state: InterviewState):

    """ Node to answer a question """

    # Get state
    analyst = state["analyst"]
    messages = state["messages"]
    context = state["context"]

    answer_instructions = """You are an expert being interviewed by an analyst.

Here is the analyst's area of focus: {goals}.

Your goal is to answer the question posed by the interviewer.

To answer the question, use this context:

{context}

When answering questions, follow these guidelines:

1. Use only the information provided in the context.

2. Do not introduce external information or make assumptions beyond what is explicitly stated in the context.

3. The context contains sources at the top of each individual document.

4. Include these sources in your answer next to any relevant statements. For example, for source # 1 use [1].

5. List your sources in order at the bottom of your answer. [1] Source 1, [2] Source 2, etc.

6. If the source is: <Document source="assistant/docs/llama3_1.pdf" page="7"/> then just list:

[1] assistant/docs/llama3_1.pdf, page 7

And skip the addition of the brackets as well as the Document source preamble in your citation."""

    # Answer the question
    system_message = answer_instructions.format(goals=analyst.persona, context=context)
    answer = llm.invoke([SystemMessage(content=system_message)] + messages)

    # Name the message as coming from the expert
    answer.name = "expert"

    # Append it to state
    return {"messages": [answer]}


def save_interview(state: InterviewState):

    """ Save the interview transcript """

    # Get messages
    messages = state["messages"]

    # Convert the interview to a string
    interview = get_buffer_string(messages)

    # Save to the interview key
    return {"interview": interview}



def route_messages(state: InterviewState,
                   name: str = "expert"):

    """ Route between question and answer """

    # Get messages
    messages = state["messages"]
    max_num_turns = state.get('max_num_turns', 2)

    # Check the number of expert answers
    num_responses = len(
        [m for m in messages if isinstance(m, AIMessage) and m.name == name]
    )

    # End once the expert has answered max_num_turns times
    if num_responses >= max_num_turns:
        return 'save_interview'

    # This router runs after each question-answer pair
    # Get the last question asked to check if it signals the end of the discussion
    last_question = messages[-2]

    if "Thank you so much for your help" in last_question.content:
        return 'save_interview'
    return "ask_question"



def write_section(state: InterviewState):

    """ Node to write a report section from the interview """

    # Get state
    interview = state["interview"]
    context = state["context"]
    analyst = state["analyst"]

    section_writer_instructions = """You are an expert technical writer.

Your task is to create a short, easily digestible section of a report based on a set of source documents.

1. Analyze the content of the source documents:
- The name of each source document is at the start of the document, with the <Document tag.

2. Create a report structure using markdown formatting:
- Use ## for the section title
- Use ### for sub-section headers

3. Write the report following this structure:
a. Title (## header)
b. Summary (### header)
c. Sources (### header)

4. Make your title engaging based upon the focus area of the analyst:
{focus}

5. For the summary section:
- Set up the summary with general background / context related to the focus area of the analyst
- Emphasize what is novel, interesting, or surprising about insights gathered from the interview
- Create a numbered list of source documents as you use them
- Do not mention the names of interviewers or experts
- Aim for approximately 400 words maximum
- Use numbered sources in your report (e.g., [1], [2]) based on information from source documents

6. In the Sources section:
- Include all sources used in your report
- Provide full links to relevant websites or specific document paths
- Separate each source by a newline. Use two spaces at the end of each line to create a newline in Markdown.
- It will look like:

### Sources
[1] Link or Document name
[2] Link or Document name

7. Be sure to combine sources. For example this is not correct:

[3] https://ai.meta.com/blog/meta-llama-3-1/
[4] https://ai.meta.com/blog/meta-llama-3-1/

There should be no redundant sources. It should simply be:

[3] https://ai.meta.com/blog/meta-llama-3-1/

8. Final review:
- Ensure the report follows the required structure
- Include no preamble before the title of the report
- Check that all guidelines have been followed"""

    # Write the section using the source docs gathered during the interview (context)
    system_message = section_writer_instructions.format(focus=analyst.description)
    section = llm.invoke([SystemMessage(content=system_message)] + [HumanMessage(content=f"Use this source to write your section: {context}")])

    # Append it to state
    return {"sections": [section.content]}



# Add nodes and edges
interview_builder = StateGraph(InterviewState)
interview_builder.add_node("ask_question", generate_question)
interview_builder.add_node("search_rag", search_vectorstore)
interview_builder.add_node("answer_question", generate_answer)
interview_builder.add_node("save_interview", save_interview)
interview_builder.add_node("write_section", write_section)

# Flow
interview_builder.add_edge(START, "ask_question")
interview_builder.add_edge("ask_question", "search_rag")
interview_builder.add_edge("search_rag", "answer_question")
interview_builder.add_conditional_edges("answer_question", route_messages, ['ask_question', 'save_interview'])
interview_builder.add_edge("save_interview", "write_section")
interview_builder.add_edge("write_section", END)

# Interview
memory = MemorySaver()
interview_graph = interview_builder.compile(checkpointer=memory).with_config(run_name="Conduct Interviews")
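
The compiled sub-graph can also be exercised on its own, which is handy for debugging a single interview. A sketch, assuming vectorstore_writing has already populated the retriever and an analysts list exists from create_analysts:

config = {"configurable": {"thread_id": "interview-1"}}
final = interview_graph.invoke(
    {"analyst": analysts[0],
     "messages": [HumanMessage(content="So you said you were writing an article on this topic?")],
     "max_num_turns": 2},
    config,
)
print(final["sections"][0])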



class ResearchGraphState(TypedDict):
    topic: str  # Research topic
    max_analysts: int  # Number of analysts
    analysts: List[Analyst]  # Analysts asking questions
    sections: Annotated[list, add]  # Send() API key
    introduction: str  # Introduction for the final report
    content: str  # Content for the final report
    conclusion: str  # Conclusion for the final report
    final_report: str  # Final report
    human_analyst_feedback: Optional[str]  # Human feedback



def initiate_all_interviews(state: ResearchGraphState):
    """ This is the "map" step where we run each interview sub-graph using the Send API """

    # Check for human feedback
    human_analyst_feedback = state.get('human_analyst_feedback')
    if human_analyst_feedback:
        # Return to create_analysts
        return "create_analysts"

    # Otherwise kick off interviews in parallel via the Send() API
    else:
        topic = state["topic"]
        return [Send("conduct_interview", {"analyst": analyst,
                                           "messages": [HumanMessage(
                                               content=f"So you said you were writing an article on {topic}?")],
                                           }) for analyst in state["analysts"]]

report_writer_instructions = '''You are a technical writer tasked with creating a report on the overall topic:

**{topic}**

Your team of analysts has conducted interviews and written memos based on their findings. Your task is to consolidate the insights from these memos into a cohesive and structured report, following this format:

Think deeply and generate at least 2 use cases based on the memos.

### Format for Each Use Case
1. **Title Header:** Use a descriptive title for each use case, such as "## Use Case 1: AI-Powered Predictive Maintenance."
2. **Objective/Use Case:** Summarize the primary goal or application of AI for this use case in one or two sentences.
3. **AI Application:** Describe the specific AI technologies or methods used to achieve the objective.
4. **Cross-Functional Benefit:** Outline the key benefits across various functions, formatted as bullet points, specifying which department or area benefits from the AI use case.

### Example Format:

## Use Case 1: AI-Powered Predictive Maintenance
**Objective/Use Case:** Reduce equipment downtime and maintenance costs by predicting equipment failures before they occur.
**AI Application:** Implement machine learning algorithms that analyze real-time sensor data from machinery to predict potential failures and schedule maintenance proactively.
**Cross-Functional Benefit:**
- **Operations & Maintenance:** Minimizes unplanned downtime and extends equipment lifespan.
- **Finance:** Reduces maintenance costs and improves budgeting accuracy.
- **Supply Chain:** Optimizes spare parts inventory based on predictive insights.

## Use Case 2: Real-Time Quality Control with Computer Vision
**Objective/Use Case:** Enhance product quality by detecting defects in products during manufacturing.
**AI Application:** Deploy AI-powered computer vision systems on production lines to identify surface defects and inconsistencies in real time.
**Cross-Functional Benefit:**
- **Quality Assurance:** Improves defect detection accuracy and reduces scrap rates.
- **Production:** Enables immediate corrective actions, enhancing overall efficiency.
- **Customer Satisfaction:** Delivers higher-quality products, strengthening client relationships.

### Report Guidelines
1. Begin with the first use case title in the specified format.
2. Do not include any preamble or introductory text for the report.
3. Consolidate insights into distinct use cases, with a focus on clarity and relevance.
4. Preserve any citations included in the memos, formatted in brackets, e.g., [1], [2].
5. After detailing all use cases, include a **Sources** section with the title: `## Sources`.
6. Be sure to combine sources. For example this is not correct:

[3] https://ai.meta.com/blog/meta-llama-3-1/
[4] https://ai.meta.com/blog/meta-llama-3-1/

There should be no redundant sources. It should simply be:
[3] https://ai.meta.com/blog/meta-llama-3-1/

### Your Inputs
You will be given a collection of memos from your analysts under `{context}`. Extract and distill insights into specific use cases, ensuring each use case adheres to the prescribed format.'''

def write_report(state: ResearchGraphState):
    # Full set of sections
    sections = state["sections"]
    topic = state["topic"]

    # Concat all sections together
    formatted_str_sections = "\n\n".join([f"{section}" for section in sections])

    # Summarize the sections into a final report
    system_message = report_writer_instructions.format(topic=topic, context=formatted_str_sections)
    report = llm.invoke([SystemMessage(content=system_message)] + [HumanMessage(content="Write a report based upon these memos.")])
    return {"content": report.content}


def human_feedback(state: ResearchGraphState):
    """ No-op node that should be interrupted on """
    pass



def write_introduction(state: ResearchGraphState):
    # Full set of sections
    sections = state["sections"]
    topic = state["topic"]

    # Concat all sections together
    formatted_str_sections = "\n\n".join([f"{section}" for section in sections])

    intro_conclusion_instructions = """You are a technical writer finishing a report on {topic}

You will be given all of the sections of the report.

Your job is to write a crisp and compelling introduction or conclusion section.

The user will instruct you whether to write the introduction or conclusion.

Include no preamble for either section.

Target around 100 words, crisply previewing (for introduction) or recapping (for conclusion) all of the sections of the report.

Use markdown formatting.

For your introduction, create a compelling title and use the # header for the title.

For your introduction, use ## Introduction as the section header.

For your conclusion, use ## Conclusion as the section header.

Here are the sections to reflect on for writing: {formatted_str_sections}"""

    # Write the introduction from the combined sections
    instructions = intro_conclusion_instructions.format(topic=topic, formatted_str_sections=formatted_str_sections)
    intro = llm.invoke([SystemMessage(content=instructions)] + [HumanMessage(content="Write the report introduction")])
    return {"introduction": intro.content}


def write_conclusion(state: ResearchGraphState):
    # Full set of sections
    sections = state["sections"]
    topic = state["topic"]

    # Concat all sections together
    formatted_str_sections = "\n\n".join([f"{section}" for section in sections])

    intro_conclusion_instructions = """You are a technical writer finishing a report on {topic}

You will be given all of the sections of the report.

Your job is to write a crisp and compelling introduction or conclusion section.

The user will instruct you whether to write the introduction or conclusion.

Include no preamble for either section.

Target around 100 words, crisply previewing (for introduction) or recapping (for conclusion) all of the sections of the report.

Use markdown formatting.

For your introduction, create a compelling title and use the # header for the title.

For your introduction, use ## Introduction as the section header.

For your conclusion, use ## Conclusion as the section header.

Here are the sections to reflect on for writing: {formatted_str_sections}"""

    # Write the conclusion from the combined sections
    instructions = intro_conclusion_instructions.format(topic=topic, formatted_str_sections=formatted_str_sections)
    conclusion = llm.invoke([SystemMessage(content=instructions)] + [HumanMessage(content="Write the report conclusion")])
    return {"conclusion": conclusion.content}


def finalize_report(state: ResearchGraphState):
    """ This is the "reduce" step where we gather all the sections, combine them, and reflect on them to write the intro/conclusion """
    # Save full final report
    content = state["content"]
    if content.startswith("## Insights"):
        # removeprefix, not strip: str.strip() removes a set of characters, not a prefix
        content = content.removeprefix("## Insights")
    if "## Sources" in content:
        try:
            content, sources = content.split("\n## Sources\n")
        except ValueError:
            sources = None
    else:
        sources = None

    final_report = state["introduction"] + "\n\n---\n\n" + content + "\n\n---\n\n" + state["conclusion"]
    if sources is not None:
        final_report += "\n\n## Sources\n" + sources
    return {"final_report": final_report}
557
+
558
+
559
+
560
+ def usecase_agent_func(topic,max_analysts):
561
+ # Add nodes and edges
562
+ builder = StateGraph(ResearchGraphState)
563
+ builder.add_node("create_analysts", create_analysts)
564
+ builder.add_node("human_feedback", human_feedback)
565
+ builder.add_node("conduct_interview", interview_builder.compile())
566
+ builder.add_node("write_report",write_report)
567
+ builder.add_node("write_introduction",write_introduction)
568
+ builder.add_node("write_conclusion",write_conclusion)
569
+ builder.add_node("finalize_report",finalize_report)
570
+
571
+ # Logic
572
+ builder.add_edge(START, "create_analysts")
573
+ builder.add_edge("create_analysts", "human_feedback")
574
+ builder.add_conditional_edges("human_feedback", initiate_all_interviews, ["create_analysts", "conduct_interview"])
575
+ builder.add_edge("conduct_interview", "write_report")
576
+ builder.add_edge("conduct_interview", "write_introduction")
577
+ builder.add_edge("conduct_interview", "write_conclusion")
578
+ builder.add_edge(["write_conclusion", "write_report", "write_introduction"], "finalize_report")
579
+ builder.add_edge("finalize_report", END)
580
+
581
+ # Compile
582
+ memory = MemorySaver()
583
+ graph = builder.compile(checkpointer=memory)
584
+ config = {"configurable": {"thread_id": "1"}}
585
+ graph.invoke({"topic":topic,
586
+ "max_analysts":max_analysts,
587
+ 'human_analyst_feedback': None},
588
+ config)
589
+ final_state = graph.get_state(config)
590
+ report = final_state.values.get('final_report')
591
+
592
+ print('-----REPORT-----', report)
593
+
594
+ return report
595
+
596
+
597
+
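
Called directly, the agent runs the full map-reduce pipeline and returns the markdown report. A sketch mirroring how main.py invokes it:

report = usecase_agent_func(
    topic='GenAI Usecases in Healthcare. Explore Healthcare GenAI applications, trends, challenges, and opportunities.',
    max_analysts=3,
)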
vectorstore.py ADDED
@@ -0,0 +1,283 @@
from setup import *
import tempfile
import requests

from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
# from langchain_text_splitters import RecursiveCharacterTextSplitter
from urllib.parse import urlparse
from langchain.docstore.document import Document


def extract_urls(agentstate_result):
    urls = []
    content = []
    for item in agentstate_result['link_list']:
        urls.append(item['url'])
        content.append(item['content'])

    return urls, content


# Function to classify a URL based on file extension
def classify_url_by_extension(url):
    """
    Classifies a URL based on its file extension.
    Focuses only on pdf and html, classifying others as unknown.
    """

    if not isinstance(url, str):
        raise ValueError(f"Expected a string, but got {type(url)}")

    # Extract the file extension from the URL
    try:
        file_extension = urlparse(url).path.split('.')[-1].lower()
        if file_extension == 'pdf':
            return 'pdf'
        elif file_extension in ['html', 'htm']:
            return 'html'
        else:
            return 'unknown'
    except Exception as e:
        print(f"Error while parsing URL: {url} - {e}")
        return 'unknown'

# Function to classify based on the HTTP Content-Type header (optional, for extra accuracy)
def classify_url_by_header(url):
    """
    Classifies a URL based on the HTTP Content-Type header.
    Focuses only on pdf and html, classifying others as unknown.
    """
    try:
        response = requests.head(url, timeout=5)  # Use a HEAD request to fetch headers only
        content_type = response.headers.get('Content-Type', '').lower()

        if 'pdf' in content_type:
            return 'pdf'
        elif 'html' in content_type:
            return 'html'
        else:
            return 'unknown'
    except requests.RequestException as e:
        print(f"Error while making HEAD request: {url} - {e}")
        return 'unknown'

69
+ def urls_classify_list(urls):
70
+ """
71
+ Classifies a list of URLs into pdf, html, and unknown.
72
+ Returns two separate lists: one for pdf URLs and one for html URLs.
73
+ """
74
+ if not isinstance(urls, list):
75
+ raise ValueError("Expected a list of URLs")
76
+
77
+ pdf_urls = []
78
+ html_urls = []
79
+
80
+ # Classify each URL
81
+ for url in urls:
82
+ file_type = classify_url_by_extension(url) # First, try classifying by extension
83
+ if file_type == 'unknown':
84
+ # If extension-based classification failed, fall back to HTTP header classification
85
+ file_type = classify_url_by_header(url)
86
+
87
+ if file_type == 'pdf':
88
+ pdf_urls.append(url)
89
+ elif file_type == 'html':
90
+ html_urls.append(url)
91
+
92
+ return pdf_urls, html_urls
93
+
94
+
95
+
96
+ def urls_classify_list(urls: list):
97
+ pdf_urls=[]
98
+ html_urls=[]
99
+ # Classify the URLs
100
+ for url in urls:
101
+ file_type = classify_url_by_extension(url) # First, try classifying by extension
102
+ if file_type == 'unknown':
103
+ # If extension-based classification failed, fall back to HTTP header classification
104
+ file_type = classify_url_by_header(url)
105
+
106
+ if file_type == 'pdf':
107
+ pdf_urls.append(url)
108
+
109
+ if file_type == 'html' or file_type == 'unknown':
110
+ html_urls.append(url)
111
+
112
+ return pdf_urls, html_urls
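
A quick check of the classifier's behavior (the URLs are illustrative):

pdf_urls, html_urls = urls_classify_list([
    'https://example.com/annual-report.pdf',   # extension match -> pdf
    'https://example.com/about.html',          # extension match -> html
    'https://example.com/investors',           # no extension -> HEAD request fallback
])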



def clean_and_extract_html_data(html_urls, chunk_size=100, chunk_overlap=25):
    """
    Loads HTML content from URLs, cleans the data, and splits it into smaller chunks.

    Args:
        html_urls (list): List of HTML URLs to process.
        chunk_size (int): Maximum size of each chunk.
        chunk_overlap (int): Overlap between chunks.

    Returns:
        list: List of document chunks.
    """

    def clean_content(content):
        """
        Cleans the content by removing unwanted patterns and short lines.
        """
        cleaned_content = content.strip()  # Remove leading/trailing whitespace
        lines = cleaned_content.split('\n')  # Split by newlines
        meaningful_lines = [line.strip() for line in lines if len(line.strip()) > 3]  # Keep meaningful lines
        return '\n'.join(meaningful_lines)

    def split_document(doc_content, chunk_size, chunk_overlap):
        """
        Splits a document into smaller chunks with overlap.
        """
        chunks = []
        start = 0
        while start < len(doc_content):
            end = start + chunk_size
            chunk = doc_content[start:end]
            chunks.append(chunk)
            start = end - chunk_overlap if end < len(doc_content) else len(doc_content)
        return chunks

    # Step 1: Load documents from URLs
    docs = []
    for url in html_urls:
        try:
            loader = WebBaseLoader(url)
            data = loader.load()
            docs.extend(data)
        except Exception as e:
            print(f"Error loading URL {url}: {e}")

    # Step 2: Clean the content to remove unwanted data
    cleaned_docs = []
    for doc in docs:
        cleaned_content = clean_content(doc.page_content)
        if cleaned_content:  # Exclude empty documents
            doc.page_content = cleaned_content
            cleaned_docs.append(doc)

    # Step 3: Split the cleaned documents into chunks
    doc_splits = []
    for doc in cleaned_docs:
        chunks = split_document(doc.page_content, chunk_size, chunk_overlap)
        for chunk in chunks:
            doc_splits.append(Document(page_content=chunk, metadata=doc.metadata))

    return doc_splits
178
+
179
+
180
+
181
+
182
+
183
+ # def extract_pdf_from_url(url):
184
+ # """
185
+ # Extract text from a PDF available at a URL.
186
+
187
+ # Args:
188
+ # url (str): The URL of the PDF file.
189
+
190
+ # Returns:
191
+ # str: Extracted text from the PDF.
192
+ # """
193
+ # # Step 1: Download the PDF from the URL
194
+ # response = requests.get(url)
195
+ # if response.status_code == 200:
196
+ # pdf_content = response.content
197
+ # else:
198
+ # raise ValueError(f"Failed to fetch the PDF. HTTP Status Code: {response.status_code}")
199
+
200
+ # # Step 2: Save PDF content to a temporary file
201
+ # with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
202
+ # temp_pdf.write(pdf_content)
203
+ # temp_pdf_path = temp_pdf.name # Get the file path
204
+
205
+ # # Step 3: Load the PDF using PyPDFLoader
206
+ # loader = PyPDFLoader(temp_pdf_path)
207
+ # documents = loader.load()
208
+
209
+ # # Step 4: Extract text from all pages
210
+ # extracted_text = "\n".join(doc.page_content for doc in documents)
211
+
212
+ # return extracted_text
213
+
214
+
215
+ # def clean_and_split_pdf_text(pdf_text, chunk_size=100, chunk_overlap=25):
216
+ # """
217
+ # Cleans and splits the extracted PDF text into smaller chunks.
218
+
219
+ # Args:
220
+ # pdf_text (str): Extracted text from a PDF.
221
+ # chunk_size (int): Maximum size of each chunk.
222
+ # chunk_overlap (int): Overlap between chunks.
223
+
224
+ # Returns:
225
+ # list: List of document chunks.
226
+ # """
227
+ # def clean_content(content):
228
+ # """
229
+ # Cleans the text by removing unwanted patterns and short lines.
230
+ # """
231
+ # content = content.strip() # Remove leading/trailing whitespace
232
+ # lines = content.split('\n') # Split into lines
233
+ # meaningful_lines = [line.strip() for line in lines if len(line.strip()) > 3] # Exclude short lines
234
+ # return '\n'.join(meaningful_lines)
235
+
236
+ # def split_text(content, chunk_size, chunk_overlap):
237
+ # """
238
+ # Splits cleaned text into smaller chunks with overlap.
239
+ # """
240
+ # chunks = []
241
+ # start = 0
242
+ # while start < len(content):
243
+ # end = start + chunk_size
244
+ # chunks.append(content[start:end])
245
+ # start = end - chunk_overlap if end < len(content) else len(content)
246
+ # return chunks
247
+
248
+ # # Step 1: Clean the text
249
+ # cleaned_text = clean_content(pdf_text)
250
+
251
+ # # Step 2: Split the cleaned text
252
+ # return split_text(cleaned_text, chunk_size, chunk_overlap)
253
+
254
+
255
+ # def pdf_extraction(pdf_urls, chunk_size=100, chunk_overlap=25):
256
+ # """
257
+ # Extracts and processes text from a list of PDF URLs.
258
+
259
+ # Args:
260
+ # pdf_urls (list): List of PDF URLs.
261
+ # chunk_size (int): Maximum size of each chunk.
262
+ # chunk_overlap (int): Overlap between chunks.
263
+
264
+ # Returns:
265
+ # list: List of Document objects containing split text.
266
+ # """
267
+ # all_chunks = []
268
+
269
+ # for pdf_url in pdf_urls:
270
+ # try:
271
+ # # Extract text from the PDF
272
+ # extracted_text = extract_pdf_from_url(pdf_url)
273
+
274
+ # # Clean and split the text
275
+ # chunks = clean_and_split_pdf_text(extracted_text, chunk_size, chunk_overlap)
276
+
277
+ # # Convert chunks into Document objects
278
+ # for chunk in chunks:
279
+ # all_chunks.append(Document(page_content=chunk, metadata={"source": pdf_url}))
280
+ # except Exception as e:
281
+ # print(f"Error processing PDF URL {pdf_url}: {e}")
282
+
283
+ # return all_chunks