Spaces:
Sleeping
Sleeping
Able to parse objects out of the job description, but extracting en masse with error handling is a little difficult. The most recent run is in the parse_description_test notebook, which is able to connect to psql and then extract a sample set of 74 descriptions.
Browse files- notebooks/parse_description_test.ipynb +0 -0
- utils/google_jobs.py +6 -8
- utils/job_desc_pydantic.py +25 -79
- utils/parse_description.py +15 -18
notebooks/parse_description_test.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
utils/google_jobs.py
CHANGED
@@ -14,11 +14,10 @@ def google_job_search(job_title, city_state, start=0):
|
|
14 |
'''
|
15 |
job_title(str): "Data Scientist", "Data Analyst"
|
16 |
city_state(str): "Denver, CO"
|
17 |
-
post_age,(str)(optional): "3day", "week", "month"
|
18 |
'''
|
19 |
query = f"{job_title} {city_state}"
|
20 |
params = {
|
21 |
-
"api_key": os.getenv('
|
22 |
"engine": "google_jobs",
|
23 |
"q": query,
|
24 |
"hl": "en",
|
@@ -51,16 +50,16 @@ def google_job_search(job_title, city_state, start=0):
|
|
51 |
return None
|
52 |
|
53 |
def sql_dump(df, table):
|
54 |
-
engine = create_engine(f"postgresql://{os.getenv('
|
55 |
with engine.connect() as conn:
|
56 |
-
df.to_sql(table, conn, if_exists='append', chunksize=20, method='
|
57 |
print(f"Dumped {df.shape} to SQL table {table}")
|
58 |
|
59 |
def process_batch(job, city_state, start):
|
60 |
df_10jobs = google_job_search(job, city_state, start)
|
61 |
if df_10jobs is not None:
|
62 |
print(f'City: {city_state} Job: {job} Start: {start}')
|
63 |
-
|
64 |
date = dt.datetime.today().strftime('%Y-%m-%d')
|
65 |
df_10jobs['retrieve_date'] = date
|
66 |
df_10jobs.drop_duplicates(subset=['job_id', 'company_name'], inplace=True)
|
@@ -80,11 +79,10 @@ def main(job_list, city_state_list):
|
|
80 |
future.result()
|
81 |
|
82 |
if __name__ == "__main__":
|
83 |
-
job_list = ["Data Scientist", "Machine Learning Engineer", "AI Gen Engineer"
|
84 |
-
"Data Analyst", "Data Engineer", "Business Intelligence Analyst"]
|
85 |
city_state_list = ["Atlanta, GA", "Austin, TX", "Boston, MA", "Chicago, IL",
|
86 |
"Denver CO", "Dallas-Ft. Worth, TX", "Los Angeles, CA",
|
87 |
"New York City NY", "San Francisco, CA", "Seattle, WA",
|
88 |
"Palo Alto CA", "Mountain View CA"]
|
89 |
-
simple_city_state_list: list[str] = ["Palo Alto CA", "San Francisco CA", ]
|
90 |
main(job_list, city_state_list)
|
|
|
14 |
'''
|
15 |
job_title(str): "Data Scientist", "Data Analyst"
|
16 |
city_state(str): "Denver, CO"
|
|
|
17 |
'''
|
18 |
query = f"{job_title} {city_state}"
|
19 |
params = {
|
20 |
+
"api_key": os.getenv('WEBSCRAPING_API_KEY'),
|
21 |
"engine": "google_jobs",
|
22 |
"q": query,
|
23 |
"hl": "en",
|
|
|
50 |
return None
|
51 |
|
52 |
def sql_dump(df, table):
|
53 |
+
engine = create_engine(f"postgresql://{os.getenv('PSQL_MASTER_NAME')}:{os.getenv('PSQL_KEY')}@{os.getenv('RDS_ENDPOINT')}:5432/postgres")
|
54 |
with engine.connect() as conn:
|
55 |
+
df.to_sql(table, conn, if_exists='append', chunksize=20, method='None', index=False)
|
56 |
print(f"Dumped {df.shape} to SQL table {table}")
|
57 |
|
58 |
def process_batch(job, city_state, start):
|
59 |
df_10jobs = google_job_search(job, city_state, start)
|
60 |
if df_10jobs is not None:
|
61 |
print(f'City: {city_state} Job: {job} Start: {start}')
|
62 |
+
|
63 |
date = dt.datetime.today().strftime('%Y-%m-%d')
|
64 |
df_10jobs['retrieve_date'] = date
|
65 |
df_10jobs.drop_duplicates(subset=['job_id', 'company_name'], inplace=True)
|
|
|
79 |
future.result()
|
80 |
|
81 |
if __name__ == "__main__":
|
82 |
+
job_list = ["Data Scientist", "Machine Learning Engineer", "AI Gen Engineer"]
|
|
|
83 |
city_state_list = ["Atlanta, GA", "Austin, TX", "Boston, MA", "Chicago, IL",
|
84 |
"Denver CO", "Dallas-Ft. Worth, TX", "Los Angeles, CA",
|
85 |
"New York City NY", "San Francisco, CA", "Seattle, WA",
|
86 |
"Palo Alto CA", "Mountain View CA"]
|
87 |
+
simple_city_state_list: list[str] = ["Palo Alto CA", "San Francisco CA", "Mountain View CA", "San Jose, CA"]
|
88 |
main(job_list, city_state_list)
|
utils/job_desc_pydantic.py
CHANGED
@@ -4,19 +4,11 @@ from langchain_core.pydantic_v1 import BaseModel, Field
|
|
4 |
class CompanyOverview(BaseModel):
|
5 |
"""
|
6 |
A model for capturing key information about the company offering the job.
|
7 |
-
|
8 |
-
Extract relevant details about the company from the job description,
|
9 |
-
including a brief overview of its industry and products, its mission and
|
10 |
-
values, size, and location(s).
|
11 |
-
|
12 |
-
Focus on capturing the most salient points that give a well-rounded picture
|
13 |
-
of the company and its culture.
|
14 |
"""
|
15 |
|
16 |
about: Optional[str] = Field(
|
17 |
None,
|
18 |
-
description="""
|
19 |
-
and any notable achievements or differentiators"""
|
20 |
)
|
21 |
|
22 |
mission_and_values: Optional[str] = Field(
|
@@ -31,121 +23,75 @@ class CompanyOverview(BaseModel):
|
|
31 |
|
32 |
locations: Optional[str] = Field(
|
33 |
None,
|
34 |
-
description="""
|
35 |
-
offices, and any remote work options"""
|
36 |
)
|
37 |
-
|
38 |
-
city: Optional[str] = Field(None, description="City where the company is located")
|
39 |
-
|
40 |
-
state: Optional[str] = Field(None, description="State where the company is located")
|
41 |
-
|
42 |
|
43 |
class RoleSummary(BaseModel):
|
44 |
"""
|
45 |
A model for capturing the key summary points about the job role.
|
46 |
|
47 |
-
Extract the essential high-level details about the role from the job description,
|
48 |
-
such as the job title, the team or department the role belongs to, the role type,
|
49 |
-
and any remote work options.
|
50 |
-
|
51 |
Prioritize information that helps understand the overall scope and positioning
|
52 |
of the role within the company.
|
53 |
"""
|
54 |
|
55 |
title: str = Field(..., description="Title of the job role")
|
56 |
|
57 |
-
team_or_department: Optional[str] = Field(
|
58 |
-
None,
|
59 |
description="""Team, department, or business unit the role belongs to,
|
60 |
including any collaborations with other teams"""
|
61 |
)
|
62 |
|
63 |
-
role_type: Optional[str] = Field(
|
64 |
-
None,
|
65 |
description="Type of role (full-time, part-time, contract, etc.)"
|
66 |
)
|
67 |
|
68 |
-
remote: Optional[str] = Field(
|
69 |
-
None,
|
70 |
description="Remote work options for the role (full, hybrid, none)"
|
71 |
)
|
72 |
|
73 |
class ResponsibilitiesAndQualifications(BaseModel):
|
74 |
"""
|
75 |
-
A model for capturing the key
|
76 |
-
qualifications for the job role.
|
77 |
-
|
78 |
-
Extract the essential duties and expectations of the role, the mandatory
|
79 |
-
educational background and experience required, and any additional skills
|
80 |
-
or characteristics that are desirable but not strictly necessary.
|
81 |
-
|
82 |
-
The goal is to provide a clear and comprehensive picture of what the role
|
83 |
-
entails and what qualifications the ideal candidate should possess.
|
84 |
"""
|
85 |
|
86 |
-
responsibilities: List[str] = Field(
|
87 |
-
description="""
|
88 |
-
areas such as metrics, theories, business understanding, product
|
89 |
-
direction, systems, leadership, decision making, strategy, and
|
90 |
-
collaboration, as described in the job description"""
|
91 |
)
|
92 |
|
93 |
-
required_qualifications: List[str] = Field(
|
94 |
-
description="""
|
95 |
-
|
96 |
-
|
97 |
-
including any alternative acceptable combinations of education
|
98 |
-
and experience, as specified in the job description"""
|
99 |
)
|
100 |
|
101 |
-
preferred_qualifications: List[str] = Field(
|
102 |
-
description="""Any additional
|
103 |
-
|
104 |
-
|
105 |
-
relevant soft skills, problem solving abilities, and industry
|
106 |
-
knowledge, as mentioned in the job description as preferred or
|
107 |
-
nice-to-have qualifications"""
|
108 |
)
|
109 |
|
110 |
class CompensationAndBenefits(BaseModel):
|
111 |
"""
|
112 |
-
A
|
113 |
-
|
114 |
-
Extract details about the salary or pay range, bonus and equity compensation,
|
115 |
-
benefits, and perks from the job description.
|
116 |
|
117 |
Aim to provide a comprehensive view of the total rewards offered for the role,
|
118 |
including both monetary compensation and non-monetary benefits and perks.
|
119 |
"""
|
120 |
|
121 |
-
salary_or_pay_range: Optional[str] = Field(
|
122 |
-
|
123 |
-
description="""
|
124 |
-
any specific numbers or bands mentioned in the job description"""
|
125 |
)
|
126 |
|
127 |
-
bonus_and_equity: Optional[str] = Field(
|
128 |
-
|
129 |
-
description="""Any information about bonus compensation, such as signing bonuses,
|
130 |
-
annual performance bonuses, or other incentives, as well as details
|
131 |
-
about equity compensation like stock options or RSUs"""
|
132 |
)
|
133 |
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
dental and vision coverage, retirement plans (401k, pension), paid
|
138 |
-
time off (vacation, sick days, holidays), parental leave, and any
|
139 |
-
other standard benefits mentioned in the job description"""
|
140 |
)
|
141 |
|
142 |
-
perks: Optional[List[str]] = Field(
|
143 |
-
None,
|
144 |
-
description="""A list of additional perks and amenities offered, such as free food
|
145 |
-
or snacks, commuter benefits, wellness programs, learning and development
|
146 |
-
stipends, employee discounts, or any other unique perks the company
|
147 |
-
provides to its employees, as mentioned in the job description"""
|
148 |
-
)
|
149 |
|
150 |
class JobDescription(BaseModel):
|
151 |
"""Extracted information from a job description."""
|
|
|
4 |
class CompanyOverview(BaseModel):
|
5 |
"""
|
6 |
A model for capturing key information about the company offering the job.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
"""
|
8 |
|
9 |
about: Optional[str] = Field(
|
10 |
None,
|
11 |
+
description="""Overview of the company, industry, products, services, and notable achievements"""
|
|
|
12 |
)
|
13 |
|
14 |
mission_and_values: Optional[str] = Field(
|
|
|
23 |
|
24 |
locations: Optional[str] = Field(
|
25 |
None,
|
26 |
+
description="""City, State where this position is based. """
|
|
|
27 |
)
|
|
|
|
|
|
|
|
|
|
|
28 |
|
29 |
class RoleSummary(BaseModel):
|
30 |
"""
|
31 |
A model for capturing the key summary points about the job role.
|
32 |
|
|
|
|
|
|
|
|
|
33 |
Prioritize information that helps understand the overall scope and positioning
|
34 |
of the role within the company.
|
35 |
"""
|
36 |
|
37 |
title: str = Field(..., description="Title of the job role")
|
38 |
|
39 |
+
team_or_department: Optional[str] = Field(None,
|
|
|
40 |
description="""Team, department, or business unit the role belongs to,
|
41 |
including any collaborations with other teams"""
|
42 |
)
|
43 |
|
44 |
+
role_type: Optional[str] = Field(None,
|
|
|
45 |
description="Type of role (full-time, part-time, contract, etc.)"
|
46 |
)
|
47 |
|
48 |
+
remote: Optional[str] = Field(None,
|
|
|
49 |
description="Remote work options for the role (full, hybrid, none)"
|
50 |
)
|
51 |
|
52 |
class ResponsibilitiesAndQualifications(BaseModel):
|
53 |
"""
|
54 |
+
A model for capturing the key summary points about the job role.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
"""
|
56 |
|
57 |
+
responsibilities: List[str] = Field(None,
|
58 |
+
description="""List of responsibilities, including tasks, duties, and expectations for the role"""
|
|
|
|
|
|
|
59 |
)
|
60 |
|
61 |
+
required_qualifications: List[str] = Field(None,
|
62 |
+
description="""Essential educational qualifications and professional experience required for the role.
|
63 |
+
This may include, but not limited to, degrees, certifications, years of experience, technical skills, and domain knowledge.
|
64 |
+
"""
|
|
|
|
|
65 |
)
|
66 |
|
67 |
+
preferred_qualifications: List[str] = Field(None,
|
68 |
+
description="""Any additional qualifications that a candidate may possess to stand out or excel in the role.
|
69 |
+
This may include preferred skills, experience, certifications, or other attributes that are not essential but beneficial for the role.
|
70 |
+
"""
|
|
|
|
|
|
|
71 |
)
|
72 |
|
73 |
class CompensationAndBenefits(BaseModel):
|
74 |
"""
|
75 |
+
A class for capturing details about the compensation and benefits offered for the role
|
|
|
|
|
|
|
76 |
|
77 |
Aim to provide a comprehensive view of the total rewards offered for the role,
|
78 |
including both monetary compensation and non-monetary benefits and perks.
|
79 |
"""
|
80 |
|
81 |
+
salary_or_pay_range: Optional[str] = Field(None,
|
82 |
+
|
83 |
+
description="""Salary range or hourly pay range for the role"""
|
|
|
84 |
)
|
85 |
|
86 |
+
bonus_and_equity: Optional[str] = Field(None,
|
87 |
+
description="""Information about bonus and equity compensation"""
|
|
|
|
|
|
|
88 |
)
|
89 |
|
90 |
+
benefits_and_perks: Optional[List[str]] = Field(None,
|
91 |
+
description="""List of benefits and perks offered for the role, such as insurance, retirement plans, and paid time off.
|
92 |
+
Can also include additional perks like free meals, wellness programs, learning stipends, etc."""
|
|
|
|
|
|
|
93 |
)
|
94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
|
96 |
class JobDescription(BaseModel):
|
97 |
"""Extracted information from a job description."""
|
utils/parse_description.py
CHANGED
@@ -5,36 +5,30 @@ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
|
|
5 |
from langchain_core.pydantic_v1 import BaseModel, Field
|
6 |
|
7 |
from langchain_groq import ChatGroq
|
|
|
8 |
from dotenv import load_dotenv
|
9 |
import os
|
10 |
|
11 |
from utils.job_desc_pydantic import JobDescription
|
12 |
|
13 |
-
load_dotenv()
|
14 |
|
|
|
|
|
|
|
|
|
|
|
15 |
|
|
|
16 |
|
17 |
-
def extract_desc_fields(raw_job_description):
|
18 |
prompt = ChatPromptTemplate.from_messages(
|
19 |
[
|
20 |
(
|
21 |
"system",
|
22 |
"""You are an expert at identifying key aspects of job descriptions. Your task is to extract important information from a raw job description and organize it into a structured format using the ResponsibilitiesAndQualifications class.
|
23 |
|
24 |
-
When parsing the job description, your goal is to capture as much relevant information as possible in the appropriate fields of the class.
|
25 |
-
|
26 |
-
1. All key responsibilities and duties of the role, covering the full range of tasks and expectations.
|
27 |
-
2. The required educational qualifications and years of experience, including different acceptable combinations.
|
28 |
-
3. Any additional preferred skills, experiences, and characteristics that are desirable for the role.
|
29 |
-
|
30 |
-
Avoid summarizing or paraphrasing the information. Instead, extract the details as closely as possible to how they appear in the original job description. The aim is to organize and structure the raw data, not to condense or interpret it.
|
31 |
-
|
32 |
-
Some specific things to look out for:
|
33 |
-
- Responsibilities related to metrics, theories, business understanding, product direction, systems, leadership, decision making, strategy, and collaboration
|
34 |
-
- Required degrees (Doctorate, Master's, Bachelor's) in relevant fields, along with the corresponding years of experience
|
35 |
-
- Preferred qualifications like years of coding experience, soft skills, problem solving abilities, and domain expertise
|
36 |
-
|
37 |
-
If any of these details are missing from the job description, simply omit them from the output rather than trying to infer or fill in the gaps.
|
38 |
|
39 |
The structured data you extract will be used for further analysis and insights downstream, so err on the side of including more information rather than less. The key is to make the unstructured job description data more organized and manageable while still retaining all the important details.
|
40 |
""",
|
@@ -43,13 +37,16 @@ def extract_desc_fields(raw_job_description):
|
|
43 |
]
|
44 |
)
|
45 |
|
|
|
46 |
llm = ChatGroq(model_name="llama3-70b-8192")
|
|
|
47 |
|
48 |
extractor = prompt | llm.with_structured_output(
|
49 |
schema=JobDescription,
|
50 |
method="function_calling",
|
51 |
include_raw=False,
|
52 |
)
|
53 |
-
|
54 |
-
|
|
|
55 |
|
|
|
5 |
from langchain_core.pydantic_v1 import BaseModel, Field
|
6 |
|
7 |
from langchain_groq import ChatGroq
|
8 |
+
from langchain_anthropic import ChatAnthropic
|
9 |
from dotenv import load_dotenv
|
10 |
import os
|
11 |
|
12 |
from utils.job_desc_pydantic import JobDescription
|
13 |
|
|
|
14 |
|
15 |
+
# Set the LANGCHAIN_TRACING_V2 environment variable to 'true'
|
16 |
+
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
|
17 |
+
|
18 |
+
# Set the LANGCHAIN_PROJECT environment variable to the desired project name
|
19 |
+
os.environ['LANGCHAIN_PROJECT'] = 'JobDescriptionProject'
|
20 |
|
21 |
+
load_dotenv()
|
22 |
|
23 |
+
def extract_desc_fields(raw_job_description, model_name="llama3-70b-8192"):
|
24 |
prompt = ChatPromptTemplate.from_messages(
|
25 |
[
|
26 |
(
|
27 |
"system",
|
28 |
"""You are an expert at identifying key aspects of job descriptions. Your task is to extract important information from a raw job description and organize it into a structured format using the ResponsibilitiesAndQualifications class.
|
29 |
|
30 |
+
When parsing the job description, your goal is to capture as much relevant information as possible in the appropriate fields of the class.
|
31 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
The structured data you extract will be used for further analysis and insights downstream, so err on the side of including more information rather than less. The key is to make the unstructured job description data more organized and manageable while still retaining all the important details.
|
34 |
""",
|
|
|
37 |
]
|
38 |
)
|
39 |
|
40 |
+
# llm = ChatAnthropic(model_name="claude-3-sonnet-20240229")
|
41 |
llm = ChatGroq(model_name="llama3-70b-8192")
|
42 |
+
# llm = ChatGroq(model_name="llama3-8b-8192")
|
43 |
|
44 |
extractor = prompt | llm.with_structured_output(
|
45 |
schema=JobDescription,
|
46 |
method="function_calling",
|
47 |
include_raw=False,
|
48 |
)
|
49 |
+
clean_description = extractor.invoke(raw_job_description)
|
50 |
+
print(clean_description)
|
51 |
+
return clean_description
|
52 |
|