dejanseo committed
Commit 40d7e95 (verified)
1 parent: 47c6dd1

Update app.py

Files changed (1): app.py (+32, -15)
app.py CHANGED
@@ -1,6 +1,3 @@
-# Install required libraries (in Streamlit you would install them via requirements.txt or manually in the terminal)
-# !pip install requests trafilatura sentence-transformers numpy torch tqdm scikit-learn pandas advertools streamlit
-
 import streamlit as st
 import requests
 import trafilatura
@@ -11,37 +8,44 @@ import advertools as adv
 from sklearn.cluster import KMeans
 from collections import Counter
 
+# Initialize session state variables
+if 'urls' not in st.session_state:
+    st.session_state.urls = []
+if 'results' not in st.session_state:
+    st.session_state.results = None
+if 'processing_complete' not in st.session_state:
+    st.session_state.processing_complete = False
+
 # Title of the app
 st.title("Site Focus Calculator")
 st.write("A tool for calculating the site focus score of a website or a series of URLs.")
 
-
 # Load the model
 model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
-#model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)
-
-#mxbai-embed-xsmall-v1
-
-
 
 # Input fields for sitemap or list of URLs (separated by newlines)
-sitemap_url = st.text_input("Enter your XML sitemap URL (optional)", "")
-url_list_input = st.text_area("Enter a list of URLs (separated by newlines, optional)", "")
+sitemap_url = st.text_input("Enter your XML sitemap URL (optional)", st.session_state.get('sitemap_url', ""))
+url_list_input = st.text_area("Enter a list of URLs (separated by newlines, optional)", st.session_state.get('url_list_input', ""))
+
+# Store inputs in session state
+if sitemap_url:
+    st.session_state.sitemap_url = sitemap_url
+if url_list_input:
+    st.session_state.url_list_input = url_list_input
 
 # Add a "Run" button to trigger the URL processing
 if st.button("Run Analysis"):
-    # Process either sitemap or URL list
+    st.session_state.processing_complete = False
     urls = []
     if sitemap_url:
         st.write("Fetching URLs from the sitemap...")
-        # Read sitemap and extract URLs using advertools
         sitemap_df = adv.sitemap_to_df(sitemap_url)
         urls = sitemap_df['loc'].tolist()
-        #urls = urls[:50]  # Limit to first 50 URLs for testing purposes
+        st.session_state.urls = urls  # Store URLs in session state
         st.write(f"Processing {len(urls)} URLs from sitemap.")
     elif url_list_input:
-        # Parse URL list from input (newlines separated)
         urls = [url.strip() for url in url_list_input.split('\n') if url.strip()]
+        st.session_state.urls = urls  # Store URLs in session state
         st.write(f"Processing {len(urls)} URLs from the input list.")
     else:
         st.warning("Please provide either a sitemap URL or a list of URLs.")
@@ -140,6 +144,10 @@ if st.button("Run Analysis"):
         'ClusterScore': page_cluster_scores
     })
 
+    # Store results in session state
+    st.session_state.results = df
+    st.session_state.processing_complete = True
+
     # Display the DataFrame
     st.write("URL Analysis Results")
     st.dataframe(df)
@@ -160,4 +168,13 @@ if st.button("Run Analysis"):
         error_df = pd.DataFrame(error_urls, columns=["URL", "Error"])
         st.dataframe(error_df)
 else:
+    # Display results if processing is complete
+    if st.session_state.processing_complete and st.session_state.results is not None:
+        st.write("URL Analysis Results")
+        st.dataframe(st.session_state.results)
+
+        # Option to download the results as CSV
+        csv = st.session_state.results.to_csv(index=False)
+        st.download_button(label="Download data as CSV", data=csv, file_name='url_analysis_results.csv', mime='text/csv')
+
     st.info("Click 'Run Analysis' to start the process.")
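
The added lines rely on Streamlit's st.session_state: the script re-executes from top to bottom on every widget interaction, so values not written to session state are lost between reruns. A minimal, self-contained sketch of the same persist-and-download pattern follows; the widget labels, the length calculation, and the file name are placeholders for illustration, not taken from app.py.

import streamlit as st
import pandas as pd

# Persist results across reruns; Streamlit re-runs the whole script on each interaction.
if 'results' not in st.session_state:
    st.session_state.results = None

text = st.text_area("Input (one item per line)", "")

if st.button("Run"):
    items = [line.strip() for line in text.split('\n') if line.strip()]
    # Store the computed DataFrame so it is still available on the next rerun.
    st.session_state.results = pd.DataFrame({'item': items, 'length': [len(i) for i in items]})

if st.session_state.results is not None:
    st.dataframe(st.session_state.results)
    st.download_button(
        label="Download data as CSV",
        data=st.session_state.results.to_csv(index=False),
        file_name='results.csv',
        mime='text/csv',
    )

Keeping the DataFrame in session state also means clicking the download button, which itself triggers a rerun, does not discard the results or force the analysis to run again.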