Spaces:
Sleeping
Sleeping
import streamlit as st | |
import pandas as pd | |
import plotly.express as px | |
from huggingface_hub import HfApi | |
import io | |
from datetime import datetime, timedelta | |
import time | |
import pyarrow as pa | |
import pyarrow.parquet as pq | |
import math | |
import re | |
import pyarrow.csv as csv | |
from concurrent.futures import ThreadPoolExecutor, as_completed | |
# Page configuration: wide layout and the browser-tab title.
# This is issued before any other Streamlit call in the script.
st.set_page_config(layout="wide", page_title="Job Listings Dashboard")

# Dark-theme overrides for the app shell, buttons, select widgets,
# data frames and Plotly containers, plus a large centred text class.
_APP_THEME_CSS = """
<style>
.stApp {
background-color: #000000;
color: #FFFFFF;
}
.stButton>button {
background-color: #4e79a7;
color: white;
}
.stSelectbox, .stMultiSelect {
color: #FFFFFF;
}
.stDataFrame {
background-color: #1E1E1E;
}
.plotly-graph-div {
background-color: #1E1E1E;
}
.big-font {
font-size: 48px;
font-weight: bold;
text-align: center;
}
</style>
"""

# Centres every <h1> heading (the page title).
_HEADING_CSS = """
<style>
h1 {
text-align: center;
}
</style>
"""

st.markdown(_APP_THEME_CSS, unsafe_allow_html=True)
st.markdown(_HEADING_CSS, unsafe_allow_html=True)

# Hugging Face setup: credentials come from Streamlit secrets; the dataset
# repository id is built as "<HF_USERNAME>/<DATASET_NAME>" by the loader.
HF_TOKEN = st.secrets["HF_TOKEN"]
HF_USERNAME = st.secrets["HF_USERNAME"]
DATASET_NAME = "jobeasz"
import pyarrow.feather as feather | |
def _download_feather_frames(repo_id):
    """Download every ``.feather`` file of the dataset repo as a DataFrame.

    Files that cannot be downloaded or parsed are skipped (best effort), but
    each skip is logged instead of being silently discarded.
    """
    # Function-scope import keeps this block self-contained.
    import logging

    log = logging.getLogger(__name__)
    api = HfApi()
    # Pass the token for the listing as well as for the downloads, so the
    # listing call is authenticated the same way the downloads are.
    repo_files = api.list_repo_files(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN)

    frames = []
    for file in repo_files:
        if not file.endswith('.feather'):
            continue
        try:
            local_path = api.hf_hub_download(
                repo_id=repo_id, filename=file, repo_type="dataset", token=HF_TOKEN
            )
            frames.append(feather.read_feather(local_path))
        except Exception as exc:  # best effort: one bad file must not break the app
            log.warning("Skipping dataset file %s: %s", file, exc)
    return frames


def _clean_location(location):
    """Lower-case a location and strip a trailing ', us' / ', usa' suffix.

    Non-string values (NaN, None, ...) are returned unchanged.
    """
    if not isinstance(location, str):
        return location
    return re.sub(r',\s*(us|usa)$', '', location.lower())


def _clean_job_listings(raw_df, year):
    """Select, normalise and de-duplicate the job-listing columns of *raw_df*."""
    columns_to_keep = [
        'site', 'job_url', 'title', 'company', 'location',
        'job_type', 'date_posted', 'is_remote', 'company_url'
    ]
    listings = raw_df[columns_to_keep].reset_index(drop=True)
    listings['date_posted'] = pd.to_datetime(listings['date_posted'], errors='coerce')

    # Drop exact duplicates and rows whose date could not be parsed (NaT).
    listings = listings.drop_duplicates().dropna(subset=['date_posted'])
    if year is not None:
        listings = listings[listings['date_posted'].dt.year == year]

    # Work on an explicit copy: the frame above may be a filtered slice, and
    # assigning columns on a slice triggers pandas' SettingWithCopyWarning.
    listings = listings.copy()
    listings['title'] = listings['title'].str.lower()
    listings['company'] = listings['company'].str.lower()
    listings['location'] = listings['location'].apply(_clean_location)

    # Normalisation can make previously distinct rows identical.
    return listings.drop_duplicates()


def load_and_concat_data(year=2024):
    """Load all feather files of the Hugging Face dataset into one clean frame.

    Args:
        year: Keep only postings whose ``date_posted`` falls in this calendar
            year. Defaults to 2024 (the original behaviour); pass ``None`` to
            keep every year.

    Returns:
        A DataFrame with the columns site, job_url, title, company, location,
        job_type, date_posted, is_remote and company_url, with lower-cased
        title/company/location, parsed dates and no duplicate rows. An empty
        DataFrame is returned when no feather file could be loaded.
    """
    frames = _download_feather_frames(f"{HF_USERNAME}/{DATASET_NAME}")
    if not frames:
        return pd.DataFrame()
    return _clean_job_listings(pd.concat(frames, ignore_index=True), year)
def get_unique_values(df):
    """Collect the distinct values offered by each Data Explorer filter.

    Returns a dict mapping the filter key ('companies', 'locations',
    'job_types', 'Role_Name', 'Date_posted') to the result of ``.unique()``
    on the matching DataFrame column.
    """
    key_to_column = (
        ('companies', 'company'),
        ('locations', 'location'),
        ('job_types', 'job_type'),
        ('Role_Name', 'title'),
        ('Date_posted', 'date_posted'),
    )
    return {key: df[column].unique() for key, column in key_to_column}
def prepare_dashboard_data(df):
    """Compute the aggregates shown on the dashboard.

    Returns a 4-tuple: the 10 most frequent companies, the 10 most frequent
    locations, the 20 most frequent titles (each a value-count Series) and a
    DataFrame with one row per ``date_posted`` and its posting ``count``.
    """
    def most_common(column, limit):
        # Value counts come back most-frequent first; keep the first `limit`.
        return df[column].value_counts().head(limit)

    postings_per_date = df.groupby('date_posted').size().reset_index(name='count')
    return (
        most_common('company', 10),
        most_common('location', 10),
        most_common('title', 20),
        postings_per_date,
    )
def create_chart(data, _x, y, title, color_sequence):
    """Build a bar chart styled for the dark theme.

    The bar figure is created from ``data`` with ``_x``/``y`` as axes, then
    given transparent plot/paper backgrounds and white text.
    """
    dark_layout = {
        'plot_bgcolor': 'rgba(0,0,0,0)',
        'paper_bgcolor': 'rgba(0,0,0,0)',
        'font_color': '#FFFFFF',
    }
    bar_figure = px.bar(
        data,
        x=_x,
        y=y,
        title=title,
        color_discrete_sequence=color_sequence,
    )
    bar_figure.update_layout(**dark_layout)
    return bar_figure
def create_time_series(df, time_unit='day'):
    """Build the "Job Postings Over Time" line chart.

    Args:
        df: Job listings with a datetime ``date_posted`` column.
        time_unit: ``'week'`` aggregates postings per calendar week; any
            other value (default ``'day'``) counts postings per distinct
            ``date_posted`` value.

    Returns:
        A Plotly line figure styled for the dark theme.
    """
    weekly = time_unit == 'week'
    if weekly:
        # Group by weekly period, then convert each period back to the
        # timestamp of its start so Plotly gets a regular date axis.
        df_by_date = df.groupby(df['date_posted'].dt.to_period('W')).size().reset_index(name='count')
        df_by_date['date_posted'] = df_by_date['date_posted'].dt.to_timestamp()
    else:
        df_by_date = df.groupby('date_posted').size().reset_index(name='count')

    fig = px.line(df_by_date, x='date_posted', y='count', title="Job Postings Over Time", color_discrete_sequence=['#4e79a7'])
    fig.update_layout(
        plot_bgcolor='rgba(0,0,0,0)',
        paper_bgcolor='rgba(0,0,0,0)',
        font_color='#FFFFFF',
        xaxis_title="Date",
        yaxis_title="Number of Job Postings"
    )

    if weekly:
        # On date axes Plotly takes dtick as milliseconds or as "M<n>"
        # (months); the previous "W1" is not a recognised value, so the
        # weekly spacing was never applied. Use one week in milliseconds.
        one_week_ms = 7 * 24 * 60 * 60 * 1000
        axis_options = {
            'dtick': one_week_ms,
            'tickformat': "%d %b %Y",
            'ticklabelmode': "period",
        }
        if not df_by_date.empty:
            # Anchor the ticks on the first plotted week so they line up
            # with the aggregated data points.
            axis_options['tick0'] = df_by_date['date_posted'].min().strftime('%Y-%m-%d')
        fig.update_xaxes(**axis_options)
    return fig
def display_dashboard(df):
    """Render the Dashboard page: headline metrics plus four charts.

    Layout: overview metrics next to the top-10 companies chart, then the
    postings-over-time chart, then top-10 locations next to top-20 titles.
    """
    # The per-date frame returned by the helper is not needed here; the
    # time-series chart computes its own aggregation.
    top_companies, top_locations, top_job_titles, _ = prepare_dashboard_data(df)

    posted_on = df['date_posted']
    jobs_today = int((posted_on.dt.date == datetime.now().date()).sum())

    overview_col, companies_col = st.columns(2)
    with overview_col:
        st.subheader("Job Postings Overview")
        st.metric("Total Job Postings", len(df))
        st.metric("Unique Companies", df['company'].nunique())
        st.metric("Job Postings Today", jobs_today)
        min_date = posted_on.min().date()
        max_date = posted_on.max().date()
        st.write(f"Job postings from {min_date} to {max_date}")
    with companies_col:
        st.plotly_chart(
            create_chart(top_companies, top_companies.index, top_companies.values, "Top 10 Companies", ['#4e79a7']),
            use_container_width=True,
        )

    # Job Postings Over Time Chart
    # NOTE(review): rendered full width between the two column rows; the
    # original nesting could not be confirmed from the source formatting.
    st.plotly_chart(create_time_series(df), use_container_width=True)

    locations_col, titles_col = st.columns(2)
    with locations_col:
        st.plotly_chart(
            create_chart(top_locations, top_locations.index, top_locations.values, "Top 10 Locations", ['#f28e2b']),
            use_container_width=True,
        )
    with titles_col:
        st.plotly_chart(
            create_chart(top_job_titles, top_job_titles.index, top_job_titles.values, "Top 20 Job Titles", ['#59a14f']),
            use_container_width=True,
        )
def filter_dataframe(df, companies, locations, job_types,Role_Name,Date_posted):
    """Narrow *df* to rows matching every non-empty selection.

    Each argument after ``df`` is a collection of accepted values for one
    column (company, location, job_type, title, date_posted). An empty
    selection leaves that column unfiltered; with no selections at all the
    input frame itself is returned.
    """
    selections = (
        ('company', companies),
        ('location', locations),
        ('job_type', job_types),
        ('title', Role_Name),
        ('date_posted', Date_posted),
    )
    result = df
    for column, accepted in selections:
        if accepted:
            result = result[result[column].isin(accepted)]
    return result
def display_data_explorer(df):
    """Render the Data Explorer page: optional filters plus a paginated table.

    The user chooses between all rows and a filtered view (companies,
    locations, job types, roles, posting dates). The resulting rows are
    shown 15 per page as an HTML table in which ``job_url`` and
    ``company_url`` are rendered as links.
    """
    # Function-scope import keeps this block self-contained.
    import html

    st.subheader("Data Explorer")
    show_all = st.radio("Display", ("All Data", "Filtered Data"))
    if show_all == "Filtered Data":
        unique_values = get_unique_values(df)
        col1, col2, col3, col4, col5 = st.columns(5)
        with col1:
            companies = st.multiselect("Select Companies", options=unique_values['companies'])
        with col2:
            locations = st.multiselect("Select Locations", options=unique_values['locations'])
        with col3:
            job_types = st.multiselect("Select Job Types", options=unique_values['job_types'])
        with col4:
            Role_Name = st.multiselect("Select Role Types", options=unique_values['Role_Name'])
        with col5:
            Date_posted = st.multiselect("Select Date Posted", options=unique_values['Date_posted'])
        filtered_df = filter_dataframe(df, companies, locations, job_types, Role_Name, Date_posted)
    else:
        filtered_df = df
    st.write(f"Showing {len(filtered_df)} job listings")

    # Pagination. Always keep at least one page: with zero matching rows the
    # page count would be 0, and a number input with max_value below its
    # min_value/value is rejected by Streamlit.
    items_per_page = 15
    num_pages = max(1, math.ceil(len(filtered_df) / items_per_page))
    col1, col2, col3 = st.columns([1, 3, 1])
    with col2:
        page = st.number_input("Page", min_value=1, max_value=num_pages, value=1)
    start_idx = (page - 1) * items_per_page
    end_idx = start_idx + items_per_page
    # Copy the slice so the column rewrites below never touch (or warn
    # about) the caller's frame.
    page_df = filtered_df.iloc[start_idx:end_idx].copy()

    def make_clickable(url):
        # Missing URLs (NaN/None/empty) render as an empty cell rather than
        # a link pointing at "nan".
        if not isinstance(url, str) or not url:
            return ''
        safe_url = html.escape(url, quote=True)
        return f'<a href="{safe_url}" target="_blank" style="color: #4e79a7;">Link</a>'

    def escape_text(value):
        # Only strings can carry markup; other values are left untouched so
        # their normal table formatting is preserved.
        return html.escape(value) if isinstance(value, str) else value

    # The table is emitted with escape=False so the anchors survive; every
    # other text cell is therefore escaped here, since the listing fields are
    # free text loaded from the dataset and must not be rendered as markup.
    link_columns = ('job_url', 'company_url')
    for column in page_df.columns:
        if column in link_columns:
            page_df[column] = page_df[column].apply(make_clickable)
        else:
            page_df[column] = page_df[column].map(escape_text)

    st.write(page_df.to_html(escape=False, index=False), unsafe_allow_html=True)
    col1, col2, col3 = st.columns([1, 3, 1])
    with col2:
        st.write(f"Page {page} of {num_pages}")
def display_about_page():
    """Render the About page: app description, usage guide and contact link.

    The text is static markdown followed by a clickable LinkedIn logo.
    The filter list under "Data Explorer" names all five filters the
    explorer offers.
    """
    st.markdown("""
## What is this application?
The Job Listings Dashboard is a powerful tool designed to provide insights into the job market. It offers a comprehensive view of job postings, allowing users to explore trends, top companies, locations, and job titles.
### Key Features:
- **Interactive Dashboard**: Visualize job market trends with dynamic charts and graphs.
- **Data Explorer**: Dive deep into individual job listings with advanced filtering options.
- **Real-time Data**: Fetch the latest job data from our Hugging Face dataset.
## How to use this application
### Dashboard
1. Navigate to the Dashboard using the sidebar.
2. View overall statistics such as total job postings, unique companies, and today's postings.
3. Explore interactive charts showing:
- Top companies hiring
- Job postings over time
- Top locations for job opportunities
- Most common job titles
### Data Explorer
1. Switch to the Data Explorer using the sidebar.
2. Choose between viewing all data or applying filters.
3. Use the multi-select dropdowns to filter by:
- Companies
- Locations
- Job Types
- Role Types
- Date Posted
4. Browse the filtered job listings table.
5. Click on job or company links to view more details on the original posting site.
## Data Source
This application fetches data from my Private dataset which scrapes data from various job hosting portals and the data gets updated daily.
## Contact
For questions, feedback, or collaboration opportunities, feel free to reach out:
- LinkedIn: [Nihar Palem](https://www.linkedin.com/in/nihar-palem-1b955a183/)
""")
    # Add a clickable LinkedIn button
    linkedin_url = "https://www.linkedin.com/in/nihar-palem-1b955a183/"
    st.markdown(f"""
<a href="{linkedin_url}" target="_blank">
<img src="https://content.linkedin.com/content/dam/me/business/en-us/amp/brand-site/v2/bg/LI-Logo.svg.original.svg" width="100">
</a>
""", unsafe_allow_html=True)
def main():
    """Entry point: load the data, then render the page picked in the sidebar.

    If no data could be loaded an error is shown and nothing else is
    rendered.
    """
    st.title("Job Easz")

    # Load data
    df = load_and_concat_data()
    if df.empty:
        st.error("No data available. Please check your dataset.")
        return

    # Page name -> renderer; dict order is the order shown in the sidebar.
    renderers = {
        "Dashboard": lambda: display_dashboard(df),
        "Data Explorer": lambda: display_data_explorer(df),
        "About": display_about_page,
    }

    # Sidebar for navigation
    st.sidebar.title("Navigation")
    page = st.sidebar.radio("Go to", list(renderers))

    # Navigation logic
    render = renderers.get(page)
    if render is not None:
        render()


if __name__ == "__main__":
    main()