# curfox_chatbot / user_guide_sync.py
import os

import requests
from bs4 import BeautifulSoup
from llama_index import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)

# OPENAI_API_KEY must be set in the environment, e.g.:
# os.environ["OPENAI_API_KEY"] = "..."
# Base URL of the documentation pages to scrape
url = 'https://help.storemate.cloud/docs/'
def get_web_data(valid_links):
    # Make sure the output directory exists before writing article files
    os.makedirs("user_guide", exist_ok=True)
    for url in valid_links:
        # Send a GET request to each article URL
        response = requests.get(url)
        # Parse the page content with BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')
        # The article title is the first <h1>; skip pages without one
        heading = soup.find('h1')
        if heading is None:
            continue
        title = heading.get_text().strip()
        # The article body is the <div> immediately following the title
        section = heading.find_next('div')
        # Extract the text content and append the source link
        section_text = section.get_text().strip()
        section_text = section_text + f"\nmore detail link : {url}"
        # Strip path separators so the title is a safe file name
        safe_title = title.replace("/", "-")
        with open(f"user_guide/{safe_title}.txt", "w") as file:
            file.write(f"{title}\n{section_text}")
    print("data collected")
def get_base_links():
    # Send a GET request to the documentation index page
    response = requests.get(url)
    # Parse the page content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find all <a> tags with href attributes
    links = soup.find_all('a', href=True)
    valid_links = []
    # Keep only links that point back into the docs section
    for link in links:
        if url in link['href']:
            valid_links.append(link['href'])
    print("base links collected")
    get_web_data(valid_links)
def update_user_guide():
    get_base_links()
    # Optionally reuse a previously persisted index instead of rebuilding:
    # try:
    #     storage_context = StorageContext.from_defaults(persist_dir="llama_index")
    #     index = load_index_from_storage(storage_context=storage_context)
    #     print("loaded")
    # except Exception:
    documents = SimpleDirectoryReader("user_guide").load_data()
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist("llama_index")
    print("index created")
    return "done"
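
# A minimal usage sketch (not part of the original module): rebuild the index,
# then load the persisted copy and run a sample query. The `as_query_engine`
# call and the question text are assumptions based on the llama_index API
# implied by the imports above; adjust for the installed version.
if __name__ == "__main__":
    update_user_guide()
    storage_context = StorageContext.from_defaults(persist_dir="llama_index")
    index = load_index_from_storage(storage_context=storage_context)
    query_engine = index.as_query_engine()
    print(query_engine.query("How do I renew my package subscription?"))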