# curfox_chatbot / user_guide_sync.py
import os

import requests
from bs4 import BeautifulSoup
from llama_index import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)

# OPENAI_API_KEY must be set in the environment, e.g.:
# os.environ["OPENAI_API_KEY"] = "..."
# Base URL of the documentation pages to scrape
url = 'https://help.storemate.cloud/docs/'
def get_web_data(valid_links):
    # Make sure the output directory exists before writing article files
    os.makedirs("user_guide", exist_ok=True)
    for url in valid_links:
        # Send a GET request to each article URL
        response = requests.get(url)
        # Parse the page content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        # The article title is the first <h1>; skip pages without one
        heading = soup.find('h1')
        if heading is None:
            continue
        title = heading.get_text().strip()
        # The article body is the <div> immediately following the title
        section = heading.find_next('div')
        # Extract the text content and append the source link
        section_text = section.get_text().strip()
        section_text = section_text + f"\nmore detail link : {url}"
        # Strip path separators so the title is a safe file name
        safe_title = title.replace("/", "-")
        with open(f"user_guide/{safe_title}.txt", "w") as file:
            file.write(f"{title}\n{section_text}")
    print("data collected")
def get_base_links():
    # Send a GET request to the documentation index page
    response = requests.get(url)
    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find all <a> tags with href attributes
    links = soup.find_all('a', href=True)
    valid_links = []
    # Keep only links that point back into the docs section
    for link in links:
        if url in link['href']:
            valid_links.append(link['href'])
    print("base links collected")
    get_web_data(valid_links)
def update_user_guide():
    get_base_links()
    # Optionally reuse a previously persisted index instead of rebuilding:
    # try:
    #     storage_context = StorageContext.from_defaults(persist_dir="llama_index")
    #     index = load_index_from_storage(storage_context=storage_context)
    #     print("loaded")
    # except Exception:
    documents = SimpleDirectoryReader("user_guide").load_data()
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist("llama_index")
    print("index created")
    return "done"
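
# A minimal usage sketch (not part of the original module): rebuild the index,
# then load the persisted copy and run a sample query. The `as_query_engine`
# call and the question text are assumptions based on the llama_index API
# implied by the imports above; adjust for the installed version.
if __name__ == "__main__":
    update_user_guide()
    storage_context = StorageContext.from_defaults(persist_dir="llama_index")
    index = load_index_from_storage(storage_context=storage_context)
    query_engine = index.as_query_engine()
    print(query_engine.query("How do I renew my package subscription?"))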