import io
import random
import re
import time

import pandas as pd
import plotly.express as px
import spacy
import streamlit as st
from huggingface_hub import InferenceClient
@st.cache_resource
def _load_nlp():
    """Load the ``en_core_web_sm`` spaCy pipeline, cached by Streamlit."""
    return spacy.load("en_core_web_sm")


# Streamlit re-executes this script on every widget interaction; going through
# the cached loader keeps the model from being reloaded on each rerun.
nlp = _load_nlp()
def get_location(sentence):
    """Return the first place-like entity found in *sentence*, or ``None``.

    The text is run through the module-level spaCy pipeline ``nlp`` and the
    first entity labelled ``LOC`` or ``GPE`` is returned as plain text.

    Args:
        sentence: Free text that may mention a place. Non-string values
            (``None``, the float NaN pandas uses for empty cells, numbers)
            and blank strings are treated as "no location".

    Returns:
        The entity text, or ``None`` when nothing matches or the input is
        not usable text.
    """
    # ``Series.apply`` hands over NaN for empty CSV cells; the pipeline only
    # accepts strings, so bail out before calling it.
    if not isinstance(sentence, str) or not sentence.strip():
        return None

    doc = nlp(sentence)
    return next(
        (ent.text for ent in doc.ents if ent.label_ in ("LOC", "GPE")),
        None,
    )
class JobPosting:
    """A single job posting, identified by its free-text description."""

    def __init__(self, description):
        """Store the posting's raw description text."""
        self.description = description

    def extract(self):
        """Return the responsibilities for this posting.

        Currently a stub: the description is not inspected and a fixed
        placeholder string is returned.
        """
        return "Define responsibilities here"
class CSVFileUploader:
    """Load a job-postings CSV in Streamlit and render views of it.

    The instance keeps the loaded frame together with the column and row the
    user picked. Every rendering method is a no-op until a file has been
    loaded successfully by :meth:`upload_file`.
    """

    # Columns the page reads by name; without them nothing can be rendered.
    REQUIRED_COLUMNS = ("Location", "Description")

    # Hugging Face model ids offered in the "Models" picker.
    MODEL_IDS = (
        "google/gemma-7b",
        "google/gemma-7b-it",
        "google/gemma-2b",
        "google/gemma-2b-it",
    )

    # Upper bound of the "Seed" slider.
    SEED_MAX = 1111111111111111

    def __init__(self):
        self.file = None  # DataFrame of the uploaded CSV, once loaded
        self.selected_column = None  # column picked under "Column Distribution"
        self.selected_row = None  # positional row index picked by the user

    # ------------------------------------------------------------------
    # Pure helpers (no Streamlit calls)
    # ------------------------------------------------------------------
    @staticmethod
    def _clean_location(value):
        """Normalise one raw ``Location`` cell via ``get_location``.

        Non-string cells (e.g. NaN for an empty cell) yield ``None`` instead
        of being passed on to the NLP pipeline.
        """
        return get_location(value) if isinstance(value, str) else None

    @staticmethod
    def _info_text(df):
        """Return the text ``df.info()`` would print.

        ``DataFrame.info`` writes to a buffer and returns ``None``, so its
        output has to be captured to be shown on the page.
        """
        buffer = io.StringIO()
        df.info(buf=buffer)
        return buffer.getvalue()

    @staticmethod
    def _badge(label):
        """Return the HTML for an orange highlighted section label."""
        return (
            "<span style='background-color: #f4a261; padding: 2px 4px; "
            f"border-radius: 4px;'>{label}</span>"
        )

    @staticmethod
    def _compose_message(system_prompt, prompt):
        """Prefix *prompt* with *system_prompt* when one was entered.

        A blank system prompt is skipped so the message does not start with
        a stray ``", "``.
        """
        if system_prompt and system_prompt.strip():
            return f"{system_prompt}, {prompt}"
        return prompt

    @staticmethod
    def _format_prompt(message, history):
        """Build the turn-tagged prompt string sent to the model.

        Args:
            message: The new user message.
            history: Optional iterable of ``(user_prompt, bot_response)``
                pairs that are emitted before *message*.
        """
        parts = []
        for user_prompt, bot_response in history or ():
            parts.append(f"<start_of_turn>user{user_prompt}<end_of_turn>")
            parts.append(f"<start_of_turn>model{bot_response}")
        parts.append(f"<start_of_turn>user{message}<end_of_turn><start_of_turn>model")
        return "".join(parts)

    # ------------------------------------------------------------------
    # Page sections
    # ------------------------------------------------------------------
    def upload_file(self):
        """Render the uploader and, once a CSV is supplied, its overview.

        On success the frame is stored in ``self.file`` with its ``Location``
        column normalised, and the chosen distribution column is stored in
        ``self.selected_column``. If the file cannot be parsed or lacks a
        required column, an error is shown and ``self.file`` stays ``None``.
        """
        uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
        if uploaded_file is None:
            return

        try:
            df = pd.read_csv(uploaded_file)
        except (pd.errors.EmptyDataError, pd.errors.ParserError) as err:
            st.error(f"Could not read the uploaded CSV: {err}")
            return

        missing = [name for name in self.REQUIRED_COLUMNS if name not in df.columns]
        if missing:
            # A readable message beats the KeyError traceback the later
            # sections would otherwise raise.
            st.error(
                "The uploaded CSV is missing required column(s): "
                + ", ".join(missing)
            )
            return

        df["Location"] = df["Location"].apply(self._clean_location)
        self.file = df

        st.header("Data Preview")
        st.write(df.head())

        st.header("Data Information")
        st.text(self._info_text(df))

        st.header("Descriptive Statistics")
        st.write(df.describe())

        st.header("Column Distribution")
        # Remember the choice on the instance so display_data() can use it.
        self.selected_column = st.selectbox("Select a column", df.columns)
        counts = df[self.selected_column].value_counts()
        fig = px.bar(
            x=counts.index.tolist(),
            y=counts.tolist(),
            labels={"x": str(self.selected_column), "y": "Count"},
        )
        st.plotly_chart(fig)

    def plot_locations(self):
        """Render a bar chart of how often each location occurs."""
        if self.file is None:
            return

        # value_counts() skips missing values, so rows without a location
        # are left out of the chart.
        locations = (
            self.file["Location"]
            .value_counts()
            .rename_axis("Location")
            .reset_index(name="Count")
        )
        fig = px.bar(locations, x="Location", y="Count")
        fig.update_layout(xaxis={'categoryorder': 'total descending'})
        st.header("Location Distribution")
        st.plotly_chart(fig)

    def select_row(self):
        """Let the user pick a row (by position) and store it on the instance."""
        if self.file is None:
            return
        self.selected_row = st.selectbox("Select row", range(len(self.file)))

    def display_data(self):
        """Show the selected column and row, then the model chat section."""
        if self.file is None:
            return

        if self.selected_column is not None:
            st.write(self.file[self.selected_column])

        if self.selected_row is None:
            return

        row = self.file.iloc[self.selected_row]
        location = row["Location"]
        description = row["Description"]

        st.markdown("---")
        st.markdown(self._badge("Location:"), unsafe_allow_html=True)
        st.write(
            "Just give me the location for this job description, no other words and remove 'on-site' or 'remote' if mentioned. For example, 'Boston, MA, USA': "
            + str(location)
        )
        st.markdown("---")
        st.markdown(
            self._badge("Description for selected row:"), unsafe_allow_html=True
        )
        st.write("What are the qualifications required: " + str(description))

        self._render_chat(description)

    # ------------------------------------------------------------------
    # Chat section
    # ------------------------------------------------------------------
    def _default_seed(self, randomize):
        """Return the default value for the "Seed" slider.

        With *randomize* set, a fresh random seed is drawn on every call.
        Otherwise one seed is drawn per session and reused: a slider whose
        default changes between reruns is recreated each time and would
        discard whatever value the user dragged it to.
        """
        if randomize:
            return random.randint(1, self.SEED_MAX)
        if "seed_default" not in st.session_state:
            st.session_state["seed_default"] = random.randint(1, self.SEED_MAX)
        return st.session_state["seed_default"]

    def _chat(self, client, system_prompt, prompt, history, seed, temp, tokens,
              top_p, rep_p):
        """Send one prompt to *client*, write the reply to the page, return it.

        The response is requested as a token stream and joined before being
        displayed.
        """
        formatted_prompt = self._format_prompt(
            self._compose_message(system_prompt, prompt), history
        )
        stream = client.text_generation(
            formatted_prompt,
            temperature=temp,
            max_new_tokens=tokens,
            top_p=top_p,
            repetition_penalty=rep_p,
            do_sample=True,
            seed=seed,
            stream=True,
            details=True,
            return_full_text=False,
        )
        reply = "".join(chunk.token.text for chunk in stream)
        st.write(reply)
        return reply

    def _render_chat(self, description):
        """Render the generation controls and run a request on "Chat".

        Args:
            description: Description text of the selected row; it is
                pre-filled into the prompt input.
        """
        st.title("Google Gemma Models")
        client_choice = st.selectbox("Models", list(self.MODEL_IDS))
        rand = st.checkbox("Random Seed", True)
        seed = st.slider("Seed", 1, self.SEED_MAX, self._default_seed(rand))
        tokens = st.slider("Max new tokens", 0, 8000, 6400, 64)
        temp = st.slider("Temperature", 0.01, 1.0, 0.9, step=0.01)
        top_p = st.slider("Top-P", 0.01, 1.0, 0.9, step=0.01)
        rep_p = st.slider("Repetition Penalty", 0.1, 2.0, 1.0, step=0.1)
        sys_inp = st.text_input("System Prompt (optional)")
        inp = st.text_input(
            "Prompt for Description",
            value=f"what is qualifications required: {description}",
        )
        btn = st.button("Chat")
        clear_btn = st.button("Clear")

        if btn:
            # Only the chosen model needs a client; each request is
            # single-turn, so no history is passed.
            self._chat(
                InferenceClient(client_choice),
                sys_inp, inp, None, seed, temp, tokens, top_p, rep_p,
            )
        if clear_btn:
            st.session_state.history = None
if __name__ == "__main__":
    # Page layout: title, upload + overview, row picker, location chart,
    # then the detail/chat section for the selected row.
    st.title("Market Hipocrisy")
    uploader = CSVFileUploader()
    uploader.upload_file()
    uploader.select_row()
    uploader.plot_locations()

    # Streamlit re-executes the whole script on every widget interaction.
    # Showing the placeholder delay once per session keeps it from stalling
    # every later rerun (each slider move, each "Chat" click) by 5 seconds.
    if not st.session_state.get("scrape_intro_shown", False):
        with st.spinner("Scraping data..."):
            time.sleep(5)
        st.session_state["scrape_intro_shown"] = True
    st.success("Scraping complete!")

    uploader.display_data()