Spaces:
Running
Running
gopiashokan
committed on
Commit
•
54bf828
1
Parent(s):
406d206
Upload 2 files
Browse files
- app.py +10 -17
- requirements.txt +1 -2
app.py
CHANGED
@@ -7,12 +7,11 @@ from streamlit_extras.add_vertical_space import add_vertical_space
|
|
7 |
from PyPDF2 import PdfReader
|
8 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
9 |
from langchain.embeddings.openai import OpenAIEmbeddings
|
10 |
-
from
|
11 |
-
from
|
12 |
from langchain.chains.question_answering import load_qa_chain
|
13 |
from selenium import webdriver
|
14 |
from selenium.webdriver.common.by import By
|
15 |
-
from selenium.common.exceptions import NoSuchElementException
|
16 |
import warnings
|
17 |
warnings.filterwarnings('ignore')
|
18 |
|
@@ -244,43 +243,37 @@ class linkedin_scraper:
|
|
244 |
website_url = df['Website URL'].tolist()
|
245 |
|
246 |
# Scrap the Job Description
|
247 |
-
job_description = []
|
248 |
for i in range(0, len(website_url)):
|
249 |
try:
|
250 |
# Open the URL
|
251 |
driver.get(website_url[i])
|
252 |
-
driver.implicitly_wait(
|
253 |
-
time.sleep(1)
|
254 |
-
|
255 |
-
# Click on Show More Button
|
256 |
-
driver.find_element(by=By.CSS_SELECTOR, value='button[data-tracking-control-name="public_jobs_show-more-html-btn"]').click()
|
257 |
-
driver.implicitly_wait(10)
|
258 |
time.sleep(1)
|
259 |
|
260 |
# Click on Show More Button
|
261 |
driver.find_element(by=By.CSS_SELECTOR, value='button[data-tracking-control-name="public_jobs_show-more-html-btn"]').click()
|
262 |
-
driver.implicitly_wait(
|
263 |
time.sleep(1)
|
264 |
|
265 |
# Get Job Description
|
266 |
description = driver.find_elements(by=By.CSS_SELECTOR, value='div[class="show-more-less-html__markup relative overflow-hidden"]')
|
267 |
-
driver.implicitly_wait(10)
|
268 |
data = [i.text for i in description][0]
|
269 |
|
270 |
if len(data.strip()) > 0:
|
271 |
job_description.append(data)
|
|
|
272 |
else:
|
273 |
job_description.append('Description Not Available')
|
274 |
-
|
275 |
-
# Check Description Count Meets User Job Count
|
276 |
-
if len([i for i in job_description if i != 'Description Not Available']) >= job_count:
|
277 |
-
break
|
278 |
|
279 |
# If URL cannot Loading Properly
|
280 |
except:
|
281 |
job_description.append('Description Not Available')
|
|
|
|
|
|
|
|
|
282 |
|
283 |
-
|
284 |
# Filter the Job Description
|
285 |
df = df.iloc[:len(job_description), :]
|
286 |
|
|
|
7 |
from PyPDF2 import PdfReader
|
8 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
9 |
from langchain.embeddings.openai import OpenAIEmbeddings
|
10 |
+
from langchain.vectorstores import FAISS
|
11 |
+
from langchain.chat_models import ChatOpenAI
|
12 |
from langchain.chains.question_answering import load_qa_chain
|
13 |
from selenium import webdriver
|
14 |
from selenium.webdriver.common.by import By
|
|
|
15 |
import warnings
|
16 |
warnings.filterwarnings('ignore')
|
17 |
|
|
|
243 |
website_url = df['Website URL'].tolist()
|
244 |
|
245 |
# Scrap the Job Description
|
246 |
+
job_description, description_count = [], 0
|
247 |
for i in range(0, len(website_url)):
|
248 |
try:
|
249 |
# Open the URL
|
250 |
driver.get(website_url[i])
|
251 |
+
driver.implicitly_wait(5)
|
|
|
|
|
|
|
|
|
|
|
252 |
time.sleep(1)
|
253 |
|
254 |
# Click on Show More Button
|
255 |
driver.find_element(by=By.CSS_SELECTOR, value='button[data-tracking-control-name="public_jobs_show-more-html-btn"]').click()
|
256 |
+
driver.implicitly_wait(5)
|
257 |
time.sleep(1)
|
258 |
|
259 |
# Get Job Description
|
260 |
description = driver.find_elements(by=By.CSS_SELECTOR, value='div[class="show-more-less-html__markup relative overflow-hidden"]')
|
|
|
261 |
data = [i.text for i in description][0]
|
262 |
|
263 |
if len(data.strip()) > 0:
|
264 |
job_description.append(data)
|
265 |
+
description_count += 1
|
266 |
else:
|
267 |
job_description.append('Description Not Available')
|
|
|
|
|
|
|
|
|
268 |
|
269 |
# If URL cannot Loading Properly
|
270 |
except:
|
271 |
job_description.append('Description Not Available')
|
272 |
+
|
273 |
+
# Check Description Count Meets User Job Count
|
274 |
+
if description_count == job_count:
|
275 |
+
break
|
276 |
|
|
|
277 |
# Filter the Job Description
|
278 |
df = df.iloc[:len(job_description), :]
|
279 |
|
requirements.txt
CHANGED
@@ -5,8 +5,7 @@ streamlit_option_menu
|
|
5 |
streamlit_extras
|
6 |
PyPDF2
|
7 |
langchain
|
8 |
-
langchain-community
|
9 |
openai
|
10 |
tiktoken
|
11 |
faiss-cpu
|
12 |
-
selenium
|
|
|
5 |
streamlit_extras
|
6 |
PyPDF2
|
7 |
langchain
|
|
|
8 |
openai
|
9 |
tiktoken
|
10 |
faiss-cpu
|
11 |
+
selenium
|