Spaces:
Runtime error
Runtime error
File size: 3,992 Bytes
3505899 1da5fb5 3505899 d6579b5 3505899 d6579b5 de6bfd3 3d86e8b 934ae13 3d86e8b 934ae13 3d86e8b f6006a9 3d86e8b e069396 3d86e8b e069396 3d86e8b c8114a5 3505899 014a1d9 3505899 3d86e8b ae663ee 3d86e8b 23b48d0 3d86e8b de6bfd3 23b48d0 490445b de6bfd3 490445b 3505899 de6bfd3 3505899 de6bfd3 1da5fb5 3505899 490445b 3505899 de6bfd3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 |
from AssistantService import GPTAssistant
from openai.error import AuthenticationError
import streamlit as st
from langsmith.run_helpers import traceable
import configparser
import os
config = configparser.ConfigParser()
config.read('config.ini')
if 'DEFAULT' in config:
assistant_api_key = config['DEFAULT'].get('API-KEY', '')
os.environ["LANGCHAIN_TRACING_V2"]="true"
os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
os.environ["LANGCHAIN_API_KEY"]=st.secrets["LANGCHAIN_API_KEY"]
os.environ["LANGCHAIN_PROJECT"]=st.secrets["LANGCHAIN_PROJECT"]
@traceable(run_type="tool")
def start_session(session_started):
st.session_state['session_started'] = session_started
return session_started
# change session_started to True
if 'session_started' not in st.session_state:
start_session(True)
st.write("This app helps you to extract data from HTML code using web scraping. It uses *GPT-3.5-turbo-16k* to generate the code for you. \n *Contribute to this project on [GitHub](https://github.com/CognitiveLabs/GPT-auto-webscraping)*")
with st.expander(label="Check out the video demo"):
yt_video = st.video("https://www.youtube.com/watch?v=_zeCun4OlCc")
info_text = """
**Quick start** \n
Fill the input with <HTML code>.
- Choose a repeating element on the page, like a product on a list.
- Inspect the HTML code and copy the element.
- After generating the "output format" and the code, paste the complete HTML code of the page in the last input to test it
"""
st.write(info_text)
st.image("https://j.gifs.com/gpqvPl.gif", width=600)
if assistant_api_key == '':
assistant_api_key = st.secrets["API_KEY"]
if assistant_api_key:
gpt_assistant = GPTAssistant(assistant_api_key)
else:
gpt_assistant = GPTAssistant(assistant_api_key)
# get the html content
html_content = st.text_input("Paste the HTML tags of the item you want to extract:", max_chars=10000, help="example: <li>Product 1 </li>, watch the video above")
# check if html_content is an url, and show error if it is
if html_content:
if html_content.startswith("http"):
st.write("Please paste the HTML piece code, not the URL")
html_content = None
extract_button = st.button("Generate output format & code")
if html_content and extract_button:
try:
st.write("1/2: Generating the output format...")
output = gpt_assistant.chain_response_format(html_content)
st.session_state['output_format'] = output
except NameError:
st.write("Complete the API key field")
except AuthenticationError:
st.write("Invalid API key")
if 'output_format' in st.session_state:
output_format = st.code(st.session_state['output_format'], language="json")
try:
st.write("2/2: Generating the code...")
python_code = gpt_assistant.chain_code_generator(st.session_state['output_format'], html_content)
st.session_state['code_generated'] = python_code
st.session_state['code_generated_exec'] = python_code + "\nresult = extract_info(html_data)"
except NameError:
st.write("Complete the API key field")
except AuthenticationError:
st.write("Invalid API key")
@traceable(run_type="tool")
def test_the_code(code, full_content):
exec(code, globals())
if result:
st.write("data extracted successfully")
# show data in table
st.table(result)
else:
st.write("error extracting data")
return result or "error"
if 'code_generated' in st.session_state:
python_function_label = st.write("Here is your python function:")
code_generated = st.code(st.session_state['code_generated'],language="python")
full_content = st.text_input("Paste your complete HTML here:")
test_code = st.button("Test the code")
if full_content and test_code:
html_data = full_content
result = None
test_the_code(st.session_state['code_generated_exec'], full_content=full_content) |