LeoWalker committed on
Commit 0f180d3 (1 parent: 4fad2af)

Section extraction to MongoDB is working through the notebook but needs to be moved into a function (a possible shape for that function is sketched after the diff below).

.DS_Store ADDED
Binary file (6.15 kB).
 
notebooks/gj_error.ipynb CHANGED
The diff for this file is too large to render.
 
notebooks/mongo_section_extraction.ipynb ADDED
@@ -0,0 +1,134 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from multiprocessing import process\n",
+ "import pandas as pd\n",
+ "import datetime as dt\n",
+ "import http.client\n",
+ "import json\n",
+ "import urllib.parse\n",
+ "import os\n",
+ "from pymongo import MongoClient\n",
+ "import sys\n",
+ "sys.path.append('../')\n",
+ "\n",
+ "from utils import parse_description\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "mongodb_conn = os.getenv('MONGODB_CONNECTION_STRING')\n",
+ "client = MongoClient(mongodb_conn)\n",
+ "db = client.job_search_db\n",
+ "collection = db['sf_bay_test_jobs']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "company_overview=CompanyOverview(about='Global Cloud Services is a cloud computing company with a location in Palo Alto, CA', mission_and_values='The company values data-driven decision making and uses machine learning to drive business value', size='Not specified', locations='Palo Alto, CA') role_summary=RoleSummary(title='Data Scientist', team_or_department='Multi-Cloud team', role_type='Full-time', remote='Not specified') responsibilities_and_qualifications=ResponsibilitiesAndQualifications(responsibilities=['Explore data sources, implement data exploration, pre-processing and data cleansing on historical data', 'Develop statistical/machine learning models based on pre-processed data as a proof-of-concept', 'Assess and optimize model quality based on technical level by tuning hyper-parameters, settings and/or changing models', 'Integrate statistical/machine learning models into products & services', 'Work closely with product managers, developers, and engineers to ensure that models will be used in products or services', 'Evaluate, improve/refine models by tuning parameters, adjusting data sources, or model approaches, to provide business value'], required_qualifications=[\"Bachelor's degree in Computer Science, Engineering, Mathematics or a related field of study and 4 years of experience\", \"Master's degree and 2 years of experience\"], preferred_qualifications=['Not specified']) compensation_and_benefits=CompensationAndBenefits(salary_or_pay_range='$80K -- $100K', bonus_and_equity='Not specified', benefits_and_perks=['Not specified'])\n",
+ "company_overview=CompanyOverview(about='Stanford Health Care', mission_and_values='delivering value and an exceptional experience for our patients and families', size='N/A', locations='1830 Embarcadero Road, PALO ALTO, California') role_summary=RoleSummary(title='Senior Biomedical Informatics Data Scientist', team_or_department='IT RESEARCH Technology & Digital Solutions', role_type='Full-time', remote='Hybrid') responsibilities_and_qualifications=ResponsibilitiesAndQualifications(responsibilities=['Work closely with the data science and engineering team on data quality analysis', 'Develop processes to measure and ensure quality, completeness, integrity and compliance of institutional research data assets', 'Identify best practices in the technical community and help to shape and implement policies that enhance data quality, compliance and customer support', 'Develop ETL (extract, transform, load) specifications to go from raw data to research ready datasets', 'Employ new and existing tools to interpret, analyze, and visualize relationships in data', 'Analyze and incorporate external data sets that may augment the power of clinical data', 'Represent Stanford through presentations at technical conferences, consortiums, participation in standard committees, working groups and other venues', 'Engage in other departmental activities to ensure an inclusive and transparent work culture'], required_qualifications=[\"Bachelor's degree in a scientific field\", 'Strong analytical skills', 'Experience with data manipulation and integration, databases, and statistics', 'Fluency with data science programming paradigms such as Jupyter notebooks, SQL, Python or R'], preferred_qualifications=[\"Master's or PhD\", '4+ years of related experience']) compensation_and_benefits=CompensationAndBenefits(salary_or_pay_range='$55.80 - $73.92 per hour', bonus_and_equity='N/A', benefits_and_perks=['N/A'])\n"
+ ]
+ }
+ ],
+ "source": [
+ "# for every document in a given collection get the job_id, title, company_name, location, description, detected_extensions, retrieve_date\n",
+ "# then transform the description into the JobDescription object and save each lower object as a attribute of the Job object\n",
+ "# how can i find only 5 documents\n",
+ "\n",
+ "# get the first 5 documents\n",
+ "\n",
+ "# loop through the collection\n",
+ "for document in collection.find().limit(2):\n",
+ " job_id = document['job_id']\n",
+ " title = document['title']\n",
+ " company_name = document['company_name']\n",
+ " location = document['location']\n",
+ " description = document['description']\n",
+ " detected_extensions = document['detected_extensions']\n",
+ " retrieve_date = document['retrieve_date']\n",
+ " job_object = parse_description.extract_desc_fields(description)\n",
+ " \n",
+ " #insert all of the above as a new document in \"sf_bay_test_jobs_cleaned\" collection\n",
+ " db['sf_bay_test_jobs_cleaned'].insert_one({\n",
+ " 'job_id': job_id,\n",
+ " 'title': title,\n",
+ " 'company_name': company_name,\n",
+ " 'location': location,\n",
+ " 'description': description,\n",
+ " 'detected_extensions': detected_extensions,\n",
+ " 'retrieve_date': retrieve_date,\n",
+ " 'company_overview': job_object.company_overview.dict(),\n",
+ " 'role_summary': job_object.role_summary.dict(),\n",
+ " 'responsibilities_and_qualifications': job_object.responsibilities_and_qualifications.dict(),\n",
+ " 'compensation_and_benefits': job_object.compensation_and_benefits.dict()\n",
+ " })\n",
+ " \n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "DeleteResult({'n': 2, 'electionId': ObjectId('7fffffff00000000000000e2'), 'opTime': {'ts': Timestamp(1715378977, 13), 't': 226}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1715378977, 14), 'signature': {'hash': b'F\\x92\\xb5\\xce\\xc1\\x189\\x9b\\x00\\xc6\\xe43\\xaa\\xc0\\x9a\\xfe\\xd5\\xdf\\x80\\x14', 'keyId': 7314031900077588486}}, 'operationTime': Timestamp(1715378977, 13)}, acknowledged=True)"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "db['sf_bay_test_jobs_cleaned'].delete_many({})\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "datajobs",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
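
As the commit message notes, the extraction loop above still lives in the notebook. A minimal sketch of how it might be lifted into a reusable function, assuming the same utils.parse_description module and the pydantic-v1-style .dict() calls used in the notebook; the name extract_and_store_sections and its limit parameter are illustrative, not part of this commit:

import os
import sys

from pymongo import MongoClient
from pymongo.collection import Collection

sys.path.append('../')  # mirrors the notebook; a packaged module would not need this
from utils import parse_description


def extract_and_store_sections(source: Collection, target: Collection, limit: int = 0) -> None:
    """Parse each job description into structured sections and store the result.

    limit=0 processes every document (PyMongo treats .limit(0) as "no limit").
    """
    for document in source.find().limit(limit):
        # Run section extraction over the raw description text.
        job_object = parse_description.extract_desc_fields(document['description'])
        # Copy the original fields and attach each parsed section as a sub-document.
        target.insert_one({
            'job_id': document['job_id'],
            'title': document['title'],
            'company_name': document['company_name'],
            'location': document['location'],
            'description': document['description'],
            'detected_extensions': document['detected_extensions'],
            'retrieve_date': document['retrieve_date'],
            'company_overview': job_object.company_overview.dict(),
            'role_summary': job_object.role_summary.dict(),
            'responsibilities_and_qualifications': job_object.responsibilities_and_qualifications.dict(),
            'compensation_and_benefits': job_object.compensation_and_benefits.dict(),
        })


if __name__ == '__main__':
    client = MongoClient(os.getenv('MONGODB_CONNECTION_STRING'))
    db = client.job_search_db
    extract_and_store_sections(db['sf_bay_test_jobs'], db['sf_bay_test_jobs_cleaned'], limit=2)

Passing Collection objects rather than collection names keeps the function testable against any database and drops the hard-coded 'sf_bay_test_jobs_cleaned' string from the loop body.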