GianJSX committed on
Commit
de6bfd3
·
1 Parent(s): 1da5fb5

trace change

Browse files
Files changed (1) hide show
  1. app.py +43 -29
app.py CHANGED
@@ -15,18 +15,27 @@ os.environ["LANGCHAIN_ENDPOINT"]="https://api.smith.langchain.com"
15
  os.environ["LANGCHAIN_API_KEY"]=st.secrets["LANGCHAIN_API_KEY"]
16
  os.environ["LANGCHAIN_PROJECT"]=st.secrets["LANGCHAIN_PROJECT"]
17
 
18
- st.write("This app helps you to extract data from HTML code using web scraping. It uses GPT-3.5-turbo to generate the code for you. \n *Contribute to this project on [GitHub](https://github.com/CognitiveLabs/GPT-auto-webscraping)*")
19
-
20
- with st.expander(label="Check out the video demo"):
21
- yt_video = st.video("https://www.youtube.com/watch?v=_zeCun4OlCc")
22
-
23
- info_text = """
24
- **Quick start** \n
25
- Fill the input with the HTML code you want to extract data from
26
- """
27
- st.write(info_text)
28
- st.image("https://j.gifs.com/gpqvPl.gif")
29
-
 
 
 
 
 
 
 
 
 
30
 
31
  if assistant_api_key == '':
32
  assistant_api_key = st.secrets["API_KEY"]
@@ -36,17 +45,23 @@ else:
36
  gpt_assistant = GPTAssistant(assistant_api_key)
37
 
38
 
39
- html_content = st.text_input("Paste your piece of HTML here:", max_chars=10000)
40
  # check if html_content is an url, and show error if it is
41
- if html_content:
42
- if html_content.startswith("http"):
43
- st.write("Please paste the HTML piece code, not the URL")
44
- html_content = None
45
 
46
- extract_button = st.button("Extract data format")
 
 
 
 
 
 
 
 
 
47
 
48
  if html_content and extract_button:
49
  try:
 
50
  output = gpt_assistant.chain_response_format(html_content)
51
  st.session_state['output_format'] = output
52
  except NameError:
@@ -57,16 +72,16 @@ if html_content and extract_button:
57
  if 'output_format' in st.session_state:
58
  output_format = st.code(st.session_state['output_format'], language="json")
59
 
60
- if st.button("Generate the code"):
61
- try:
62
- python_code = gpt_assistant.chain_code_generator(st.session_state['output_format'], html_content)
63
- st.session_state['code_generated'] = python_code
64
- st.session_state['code_generated_exec'] = python_code + "\nresult = extract_info(html_data)"
65
 
66
- except NameError:
67
- st.write("Complete the API key field")
68
- except AuthenticationError:
69
- st.write("Invalid API key")
70
 
71
  @traceable(run_type="tool")
72
  def test_the_code(code, full_content):
@@ -89,5 +104,4 @@ if 'code_generated' in st.session_state:
89
  if full_content and test_code:
90
  html_data = full_content
91
  result = None
92
- test_the_code(st.session_state['code_generated_exec'], full_content=full_content)
93
-
 
15
  os.environ["LANGCHAIN_API_KEY"]=st.secrets["LANGCHAIN_API_KEY"]
16
  os.environ["LANGCHAIN_PROJECT"]=st.secrets["LANGCHAIN_PROJECT"]
17
 
18
+ @traceable(run_type="tool")
19
+ def start(run=False):
20
+ st.write("This app helps you to extract data from HTML code using web scraping. It uses GPT-3.5-turbo to generate the code for you. \n *Contribute to this project on [GitHub](https://github.com/CognitiveLabs/GPT-auto-webscraping)*")
21
+
22
+ with st.expander(label="Check out the video demo"):
23
+ yt_video = st.video("https://www.youtube.com/watch?v=_zeCun4OlCc")
24
+
25
+ info_text = """
26
+ **Quick start** \n
27
+ Fill the input with <HTML code>.
28
+ * Choose a repeating element on the page, like a product on a list.
29
+ * Inspect the HTML code and copy the element.
30
+
31
+ After generating the "output format" and the code, paste the complete HTML code of the page in the last input to test it
32
+ """
33
+ st.write(info_text)
34
+ st.image("https://j.gifs.com/gpqvPl.gif")
35
+ if run:
36
+ return True
37
+ # use time library
38
+ start(run=True)
39
 
40
  if assistant_api_key == '':
41
  assistant_api_key = st.secrets["API_KEY"]
 
45
  gpt_assistant = GPTAssistant(assistant_api_key)
46
 
47
 
48
+ html_content = None
49
  # check if html_content is an url, and show error if it is
 
 
 
 
50
 
51
+ def html_content_input():
52
+ html_content = st.text_input("Paste the HTML tags of the item you want to extract:", max_chars=10000, help="example: <li>Product 1 </li>, watch the video above")
53
+ if html_content:
54
+ if html_content.startswith("http"):
55
+ st.write("Please paste the HTML piece code, not the URL")
56
+ html_content = None
57
+
58
+ return st.button("Generate output format & code")
59
+
60
+ extract_button = html_content_input()
61
 
62
  if html_content and extract_button:
63
  try:
64
+ st.write("1/2: Generating the output format...")
65
  output = gpt_assistant.chain_response_format(html_content)
66
  st.session_state['output_format'] = output
67
  except NameError:
 
72
  if 'output_format' in st.session_state:
73
  output_format = st.code(st.session_state['output_format'], language="json")
74
 
75
+ try:
76
+ st.write("2/2: Generating the code...")
77
+ python_code = gpt_assistant.chain_code_generator(st.session_state['output_format'], html_content)
78
+ st.session_state['code_generated'] = python_code
79
+ st.session_state['code_generated_exec'] = python_code + "\nresult = extract_info(html_data)"
80
 
81
+ except NameError:
82
+ st.write("Complete the API key field")
83
+ except AuthenticationError:
84
+ st.write("Invalid API key")
85
 
86
  @traceable(run_type="tool")
87
  def test_the_code(code, full_content):
 
104
  if full_content and test_code:
105
  html_data = full_content
106
  result = None
107
+ test_the_code(st.session_state['code_generated_exec'], full_content=full_content)