Spaces:

DataPrism
/

GPT-auto-webscraping

Runtime error

App Files Community

GianJSX committed on Aug 15, 2023

Commit

de6bfd3

1 Parent(s): 1da5fb5

trace change

Browse files

Files changed (1) hide show

app.py +43 -29

app.py CHANGED Viewed

@@ -15,18 +15,27 @@ os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
 os.environ["LANGCHAIN_API_KEY"]=st.secrets["LANGCHAIN_API_KEY"]
 os.environ["LANGCHAIN_PROJECT"]=st.secrets["LANGCHAIN_PROJECT"]
-st.write("This app helps you to extract data from HTML code using web scraping. It uses GPT-3.5-turbo to generate the code for you. \n *Contribute to this project on [GitHub](https://github.com/CognitiveLabs/GPT-auto-webscraping)*")
-with st.expander(label="Check out the video demo"):
-    yt_video = st.video("https://www.youtube.com/watch?v=_zeCun4OlCc")
-info_text = """
-**Quick start** \n
-Fill the input with the HTML code you want to extract data from
-"""
-st.write(info_text)
-st.image("https://j.gifs.com/gpqvPl.gif")
 if assistant_api_key == '':
     assistant_api_key = st.secrets["API_KEY"]
@@ -36,17 +45,23 @@ else:
     gpt_assistant = GPTAssistant(assistant_api_key)
-html_content = st.text_input("Paste your piece of HTML here:", max_chars=10000)
 # check if html_content is an url, and show error if it is
-if html_content:
-    if html_content.startswith("http"):
-        st.write("Please paste the HTML piece code, not the URL")
-        html_content = None
-extract_button = st.button("Extract data format")
 if html_content and extract_button:
     try:
         output = gpt_assistant.chain_response_format(html_content)
         st.session_state['output_format'] = output
     except NameError:
@@ -57,16 +72,16 @@ if html_content and extract_button:
 if 'output_format' in st.session_state:
     output_format = st.code(st.session_state['output_format'], language="json")
-    if st.button("Generate the code"):
-        try:
-            python_code = gpt_assistant.chain_code_generator(st.session_state['output_format'], html_content)
-            st.session_state['code_generated'] = python_code
-            st.session_state['code_generated_exec'] = python_code + "\nresult = extract_info(html_data)"
-        except NameError:
-            st.write("Complete the API key field")
-        except AuthenticationError:
-            st.write("Invalid API key")
 @traceable(run_type="tool")
 def test_the_code(code, full_content):
@@ -89,5 +104,4 @@ if 'code_generated' in st.session_state:
     if full_content and test_code:
         html_data = full_content
         result = None
-        test_the_code(st.session_state['code_generated_exec'], full_content=full_content)

 os.environ["LANGCHAIN_API_KEY"]=st.secrets["LANGCHAIN_API_KEY"]
 os.environ["LANGCHAIN_PROJECT"]=st.secrets["LANGCHAIN_PROJECT"]
+@traceable(run_type="tool")
+def start(run=False):
+    st.write("This app helps you to extract data from HTML code using web scraping. It uses GPT-3.5-turbo to generate the code for you. \n *Contribute to this project on [GitHub](https://github.com/CognitiveLabs/GPT-auto-webscraping)*")
+    with st.expander(label="Check out the video demo"):
+        yt_video = st.video("https://www.youtube.com/watch?v=_zeCun4OlCc")
+    info_text = """
+    **Quick start** \n
+    Fill the input with <HTML code>.
+    * Choose a repeating element on the page, like a product on a list.
+    * Inspect the HTML code and copy the element.
+    After generating the "output format" and the code, paste the complete HTML code of the page in the last input to test it
+    """
+    st.write(info_text)
+    st.image("https://j.gifs.com/gpqvPl.gif")
+    if run:
+        return True
+# use time library
+start(run=True)
 if assistant_api_key == '':
     assistant_api_key = st.secrets["API_KEY"]
     gpt_assistant = GPTAssistant(assistant_api_key)
+html_content = None
 # check if html_content is an url, and show error if it is
+def html_content_input():
+    html_content = st.text_input("Paste the HTML tags of the item you want to extract:", max_chars=10000, help="example: <li>Product 1 </li>, watch the video above")
+    if html_content:
+        if html_content.startswith("http"):
+            st.write("Please paste the HTML piece code, not the URL")
+            html_content = None
+    return st.button("Generate output format & code")
+extract_button = html_content_input()
 if html_content and extract_button:
     try:
+        st.write("1/2: Generating the output format...")
         output = gpt_assistant.chain_response_format(html_content)
         st.session_state['output_format'] = output
     except NameError:
 if 'output_format' in st.session_state:
     output_format = st.code(st.session_state['output_format'], language="json")
+    try:
+        st.write("2/2: Generating the code...")
+        python_code = gpt_assistant.chain_code_generator(st.session_state['output_format'], html_content)
+        st.session_state['code_generated'] = python_code
+        st.session_state['code_generated_exec'] = python_code + "\nresult = extract_info(html_data)"
+    except NameError:
+        st.write("Complete the API key field")
+    except AuthenticationError:
+        st.write("Invalid API key")
 @traceable(run_type="tool")
 def test_the_code(code, full_content):
     if full_content and test_code:
         html_data = full_content
         result = None
+        test_the_code(st.session_state['code_generated_exec'], full_content=full_content)