gopiashokan committed
Commit 406d206
1 Parent(s): d5d7ca0

Upload app.py

Files changed (1): app.py (+26, -29)
app.py CHANGED
@@ -207,7 +207,7 @@ class linkedin_scraper:
         return scrap_job_title if len(matched_words) > 1 else np.nan
 
 
-    def scrap_company_data(driver, job_title_input, job_count):
+    def scrap_company_data(driver, job_title_input):
 
         # scraping the Company Data
         company = driver.find_elements(by=By.CSS_SELECTOR, value='h4[class="base-search-card__subtitle"]')
@@ -235,13 +235,10 @@ class linkedin_scraper:
         df = df.dropna()
         df.reset_index(drop=True, inplace=True)
 
-        # Filter Job Title Based on User Input
-        df = df.iloc[:job_count, :]
-
         return df
 
 
-    def scrap_job_description(driver, df):
+    def scrap_job_description(driver, df, job_count):
 
         # Get URL into List
         website_url = df['Website URL'].tolist()
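Read together with the first hunk, this moves job_count out of scrap_company_data: rather than truncating the company table up front with df.iloc[:job_count, :], the scraper now carries all scraped cards forward and lets the description step decide when enough usable rows have been collected. A minimal sketch of that stopping rule, with a hypothetical fetch_description callable standing in for the Selenium steps:

```python
def collect_descriptions(urls, job_count, fetch_description):
    # `fetch_description` is a hypothetical stand-in for the Selenium
    # open/click/extract sequence; it returns text or raises on failure.
    job_description = []
    for url in urls:
        try:
            data = fetch_description(url)
            job_description.append(data if data.strip() else 'Description Not Available')
            # Stop once enough usable descriptions exist (the commit's rule)
            if len([d for d in job_description if d != 'Description Not Available']) >= job_count:
                break
        except Exception:
            job_description.append('Description Not Available')
    return job_description
```

Because the loop can break early, the caller must trim the DataFrame to len(job_description) rows before attaching the column; the next hunk does exactly that.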
@@ -249,47 +246,47 @@ class linkedin_scraper:
         # Scrap the Job Description
         job_description = []
         for i in range(0, len(website_url)):
-            # Open the URL
-            driver.get(website_url[i])
-            driver.implicitly_wait(10)
-            time.sleep(1)
-
             try:
-                # Click on Show More Button
-                driver.find_element(by=By.CSS_SELECTOR, value='button[data-tracking-control-name="public_jobs_show-more-html-btn"]').click()
+                # Open the URL
+                driver.get(website_url[i])
                 driver.implicitly_wait(10)
                 time.sleep(1)
-
+
                 # Click on Show More Button
                 driver.find_element(by=By.CSS_SELECTOR, value='button[data-tracking-control-name="public_jobs_show-more-html-btn"]').click()
                 driver.implicitly_wait(10)
                 time.sleep(1)
 
-            except NoSuchElementException:
-                # Open the URL
-                driver.get('https://www.google.com/')
-                driver.get(website_url[i])
-                driver.implicitly_wait(10)
-                time.sleep(1)
-
                 # Click on Show More Button
                 driver.find_element(by=By.CSS_SELECTOR, value='button[data-tracking-control-name="public_jobs_show-more-html-btn"]').click()
                 driver.implicitly_wait(10)
                 time.sleep(1)
 
-            # Get Job Description
-            description = driver.find_elements(by=By.CSS_SELECTOR, value='div[class="show-more-less-html__markup relative overflow-hidden"]')
-            driver.implicitly_wait(10)
-            data = [i.text for i in description][0]
+                # Get Job Description
+                description = driver.find_elements(by=By.CSS_SELECTOR, value='div[class="show-more-less-html__markup relative overflow-hidden"]')
+                driver.implicitly_wait(10)
+                data = [i.text for i in description][0]
 
-            if len(data.strip()) > 0:
-                job_description.append(data)
-            else:
+                if len(data.strip()) > 0:
+                    job_description.append(data)
+                else:
+                    job_description.append('Description Not Available')
+
+                # Check Description Count Meets User Job Count
+                if len([i for i in job_description if i != 'Description Not Available']) >= job_count:
+                    break
+
+            # If URL cannot Loading Properly
+            except:
                 job_description.append('Description Not Available')
 
+
+        # Filter the Job Description
+        df = df.iloc[:len(job_description), :]
 
         # Add Job Description in Dataframe
         df['Job Description'] = pd.DataFrame(job_description, columns=['Description'])
+        df['Job Description'] = df['Job Description'].apply(lambda x: np.nan if x=='Description Not Available' else x)
         df = df.dropna()
         df.reset_index(drop=True, inplace=True)
         return df
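Two things are worth noting in the rewritten loop. First, failed rows are marked 'Description Not Available', mapped to np.nan, and removed by the existing dropna(), so the final frame keeps only rows with real text. Second, the old except NoSuchElementException retry (which bounced through google.com) is replaced by a bare except:, which also swallows KeyboardInterrupt and SystemExit. A narrower variant — a sketch assuming only Selenium failures should be tolerated, not the committed code — could catch the library's own exceptions:

```python
from selenium.common.exceptions import (NoSuchElementException,
                                        TimeoutException,
                                        WebDriverException)

def safe_description(driver, url, scrape_one):
    # `scrape_one` is a hypothetical wrapper around the driver.get /
    # show-more click / text-extraction steps shown in the hunk above.
    try:
        return scrape_one(driver, url)
    except (NoSuchElementException, TimeoutException, WebDriverException):
        # Same fallback marker the commit uses; dropna() removes it later.
        return 'Description Not Available'
```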
@@ -335,10 +332,10 @@ class linkedin_scraper:
         linkedin_scraper.link_open_scrolldown(driver, link, job_count)
 
         with st.spinner('scraping Company Data...'):
-            df = linkedin_scraper.scrap_company_data(driver, job_title_input, job_count)
+            df = linkedin_scraper.scrap_company_data(driver, job_title_input)
 
         with st.spinner('Scraping Job Description Data...'):
-            df_final = linkedin_scraper. scrap_job_description(driver, df)
+            df_final = linkedin_scraper. scrap_job_description(driver, df, job_count)
 
         # Display the Data in User Interface
         linkedin_scraper.display_data_userinterface(df_final)
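With the caller updated, job_count now constrains the description step rather than the company step. The marker-to-NaN filtering at the end of scrap_job_description can be tried in isolation; this self-contained sketch reuses the commit's own lines on toy data (column names match the commit, the values are made up):

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'Company Name': ['A', 'B', 'C'],
                   'Website URL': ['url-1', 'url-2', 'url-3']})
job_description = ['We are hiring a data engineer...', 'Description Not Available']

# Trim to the rows actually visited (the loop may break early)
df = df.iloc[:len(job_description), :]

# Attach descriptions, convert markers to NaN, and drop them
df['Job Description'] = pd.DataFrame(job_description, columns=['Description'])
df['Job Description'] = df['Job Description'].apply(
    lambda x: np.nan if x == 'Description Not Available' else x)
df = df.dropna()
df.reset_index(drop=True, inplace=True)
print(df)   # only company 'A' survives
```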
 