import datetime as dt
import http.client
import json
import urllib.parse
import os
from pymongo import MongoClient
from concurrent.futures import ThreadPoolExecutor, as_completed

from dotenv import load_dotenv
load_dotenv()

mongodb_conn = os.getenv('MONGODB_CONNECTION_STRING')

# Module-level sets tracking which (job title, city) pairs have been searched
# (searched_cities is recorded but not currently consulted anywhere)
searched_jobs = set()
searched_cities = set()

def google_job_search(job_title, city_state, start=0):
    '''
    Query the google_jobs SERP engine for one job title/location pair.

    job_title (str): e.g. "Data Scientist", "Data Analyst"
    city_state (str): e.g. "Denver, CO"
    start (int): pagination offset (has no effect while the "start" request
        param below is commented out)

    Returns the 'google_jobs_results' list from the response, or None on error.
    '''
    query = f"{job_title} {city_state}"
    params = {
        "api_key": os.getenv('WEBSCRAPING_API_KEY'),
        "engine": "google_jobs",
        "q": query,
        "hl": "en",
        "google_domain": "google.com",
        # "start": start,
        # "chips": f"date_posted:{post_age}",
    }

    query_string = urllib.parse.urlencode(params, quote_via=urllib.parse.quote)

    conn = http.client.HTTPSConnection("serpapi.webscrapingapi.com")
    try:
        conn.request("GET", f"/v1?{query_string}")
        print(f"GET /v1?{query_string}")
        res = conn.getresponse()
        try:
            data = res.read()
        finally:
            res.close()
    finally:
        conn.close()

    try:
        json_data = json.loads(data.decode("utf-8"))
        jobs_results = json_data['google_jobs_results']
        return jobs_results
    except (KeyError, json.JSONDecodeError) as e:
        print(f"Error occurred for search: {job_title} in {city_state}")
        print(f"Error message: {str(e)}")
        print(f"Data: {data}")
        return None
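
# A hypothetical convenience wrapper, not part of the original flow: fetch
# several result pages by passing increasing start offsets. Note that the
# "start" request param above is commented out, so the offset only takes
# effect once that line is re-enabled; page_size is an illustrative value.
def google_job_search_pages(job_title, city_state, pages=3, page_size=10):
    results = []
    for page in range(pages):
        batch = google_job_search(job_title, city_state, start=page * page_size)
        if not batch:
            break  # stop on an error or an empty page
        results.extend(batch)
    return results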

def mongo_dump(jobs_results, collection_name):
    '''Stamp each job with the retrieval date and insert the batch into MongoDB.'''
    client = MongoClient(mongodb_conn)
    try:
        collection = client.job_search_db[collection_name]
        retrieve_date = dt.datetime.today().strftime('%Y-%m-%d')
        for job in jobs_results:
            job['retrieve_date'] = retrieve_date
        if jobs_results:
            collection.insert_many(jobs_results)
    finally:
        client.close()

    print(f"Dumped {len(jobs_results)} documents to MongoDB collection {collection_name}")

def process_batch(job, city_state, start=0):

    # Check if the job title and city have already been searched
    if (job, city_state) in searched_jobs:
        print(f'Skipping already searched job: {job} in {city_state}')
        return

    jobs_results = google_job_search(job, city_state, start)
    if jobs_results is not None:
        print(f'City: {city_state} Job: {job} Start: {start}')
        mongo_dump(jobs_results, 'sf_bay_test_jobs')

        # Add the job title and city to the searched sets
        searched_jobs.add((job, city_state))
        searched_cities.add(city_state)

def main(job_list, city_state_list):
    for job in job_list:
        for city_state in city_state_list:
            process_batch(job, city_state)
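
# A minimal parallel driver sketch using the ThreadPoolExecutor imported above
# (the import is otherwise unused, so this is an assumption about intended
# usage, not the original flow). Each (job, city) pair becomes one task;
# max_workers is an illustrative value. Under CPython's GIL, set.add() on the
# module-level dedup sets is safe to call from worker threads.
def main_parallel(job_list, city_state_list, max_workers=4):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(process_batch, job, city_state): (job, city_state)
            for job in job_list
            for city_state in city_state_list
        }
        for future in as_completed(futures):
            job, city_state = futures[future]
            try:
                future.result()
            except Exception as e:
                print(f"Search failed for {job} in {city_state}: {e}")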

if __name__ == "__main__":
    job_list = ["Data Scientist", "Machine Learning Engineer", "AI Gen Engineer", "ML Ops"]
    city_state_list = ["Atlanta, GA", "Austin, TX", "Boston, MA", "Chicago, IL",
                       "Denver, CO", "Dallas-Ft. Worth, TX", "Los Angeles, CA",
                       "New York City, NY", "San Francisco, CA", "Seattle, WA",
                       "Palo Alto, CA", "Mountain View, CA", "San Jose, CA"]
    # The full list above is kept for wider runs; the default run uses the short Bay Area list
    simple_city_state_list: list[str] = ["Palo Alto, CA", "San Francisco, CA", "Mountain View, CA"]
    main(job_list, simple_city_state_list)