|
import io
import random
import re
import time

import pandas as pd
import plotly.express as px
import spacy
import streamlit as st
from huggingface_hub import InferenceClient
|
|
|
|
|
nlp = spacy.load("en_core_web_sm") |
|
|
|
def get_location(sentence):
    """Return the first location-like named entity in *sentence*, or None.

    Runs the module-level spaCy pipeline and picks the first entity tagged
    as a geopolitical entity (GPE) or physical location (LOC).
    """
    for ent in nlp(sentence).ents:
        if ent.label_ in ("LOC", "GPE"):
            return ent.text
    return None
|
|
|
class JobPosting:
    """Thin wrapper around a raw job-description string."""

    def __init__(self, description):
        # Raw text of the posting as supplied by the caller.
        self.description = description

    def extract(self):
        """Return the responsibilities text for this posting.

        Currently a stub: real extraction logic has not been implemented,
        so a fixed placeholder string is returned.
        """
        return "Define responsibilities here"
|
|
|
class CSVFileUploader:
    """Streamlit helper: upload a job-postings CSV, explore it, chart locations.

    Expected columns (from the visible code): "Location" and "Description"
    — TODO(review): confirm against the actual data files.
    """

    def __init__(self):
        # Parsed DataFrame of the uploaded CSV; None until a file is uploaded.
        self.file = None
        # Column picked in the distribution selectbox; None until picked.
        self.selected_column = None
        # Row index picked in select_row(); None until picked.
        self.selected_row = None

    def upload_file(self):
        """Render the upload widget; on upload, normalize locations and show EDA panels."""
        uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
        if uploaded_file is None:
            return
        self.file = pd.read_csv(uploaded_file)
        df = self.file

        # Reduce each free-text location to the first LOC/GPE entity spaCy finds.
        # NOTE(review): a CSV without a "Location" column raises KeyError here.
        df["Location"] = df["Location"].apply(get_location)

        st.header("Data Preview")
        st.write(df.head())

        st.header("Data Information")
        # BUG FIX: df.info() prints to stdout and returns None, so the original
        # st.write(df.info()) rendered "None". Capture the summary instead.
        buf = io.StringIO()
        df.info(buf=buf)
        st.text(buf.getvalue())

        st.header("Descriptive Statistics")
        st.write(df.describe())

        st.header("Column Distribution")
        # BUG FIX: persist the choice on self so display_data() can use it —
        # the original kept it in a local and self.selected_column stayed None.
        self.selected_column = st.selectbox("Select a column", df.columns)
        fig = px.bar(df, x=df[self.selected_column],
                     labels={'index': 'Value', self.selected_column: 'Count'})
        st.plotly_chart(fig)

    def plot_locations(self):
        """Bar-chart how often each extracted location appears, most common first."""
        if self.file is None:
            return
        df = self.file.dropna(subset=["Location"])
        locations = df["Location"].value_counts().reset_index()
        locations.columns = ["Location", "Count"]
        fig = px.bar(locations, x="Location", y="Count",
                     labels={'index': 'Value', 'Location': 'Location', 'Count': 'Count'})
        fig.update_layout(xaxis={'categoryorder': 'total descending'})
        st.header("Location Distribution")
        st.plotly_chart(fig)

    def select_row(self):
        """Let the user pick a row index from the uploaded file."""
        if self.file is not None:
            self.selected_row = st.selectbox("Select row", list(range(len(self.file))))

    def display_data(self):
        """Show the selected column and the location/description of the selected row."""
        if self.file is None:
            return
        if self.selected_column is not None:
            st.write(self.file[self.selected_column])
        if self.selected_row is not None:
            location = self.file.iloc[self.selected_row]["Location"]
            description = self.file.iloc[self.selected_row]["Description"]
            st.markdown("---")
            # Plain string literals (the originals were f-strings with no placeholders).
            st.markdown("<span style='background-color: #f4a261; padding: 2px 4px; border-radius: 4px;'>Location:</span>", unsafe_allow_html=True)
            st.write("Just give me the location for this job description, no other words and remove 'on-site' or 'remote' if mentioned. For example, 'Boston, MA, USA': "
                     + str(location))
            st.markdown("---")
            st.markdown("<span style='background-color: #f4a261; padding: 2px 4px; border-radius: 4px;'>Description for selected row:</span>", unsafe_allow_html=True)
            st.write("What are the qualifications required: " + str(description))
|
|
|
|
|
|
|
# One reusable HF inference client per supported Gemma checkpoint,
# keyed by the model id shown in the UI selectbox.
GEMMA_MODELS = (
    "google/gemma-7b",
    "google/gemma-7b-it",
    "google/gemma-2b",
    "google/gemma-2b-it",
)
clients = {model_id: InferenceClient(model_id) for model_id in GEMMA_MODELS}
|
|
|
def format_prompt(message, history):
    """Build a Gemma chat prompt from prior turns plus the new user message.

    Parameters:
        message: the new user message to append.
        history: iterable of (user_prompt, bot_response) pairs, or None/empty.

    Returns the concatenated prompt, ending with an open model turn for the
    model to complete.
    """
    prompt = ""
    # Empty/None history simply contributes nothing.
    for user_prompt, bot_response in history or []:
        prompt += f"<start_of_turn>user{user_prompt}<end_of_turn>"
        # BUG FIX: close the model turn too — the original left it
        # unterminated, corrupting multi-turn context per Gemma's
        # <start_of_turn>/<end_of_turn> chat format.
        prompt += f"<start_of_turn>model{bot_response}<end_of_turn>"
    prompt += f"<start_of_turn>user{message}<end_of_turn><start_of_turn>model"
    return prompt
|
|
|
def chat_inf(system_prompt, prompt, history, client_choice, seed, temp, tokens, top_p, rep_p):
    """Stream a completion from the chosen Gemma model and render it via Streamlit.

    Parameters:
        system_prompt: optional system instruction, prepended to the prompt.
        prompt: the user prompt.
        history: list of (user, model) turns, or None for a fresh conversation.
        client_choice: key into the module-level `clients` dict.
        seed/temp/tokens/top_p/rep_p: generation hyperparameters.

    Side effects: appends (prompt, response) to *history* and writes the full
    response to the page once streaming finishes.
    """
    client = clients[client_choice]
    if not history:
        history = []
    # NOTE: the original also computed an unused `hist_len`; removed as dead code.

    generate_kwargs = dict(
        temperature=temp,
        max_new_tokens=tokens,
        top_p=top_p,
        repetition_penalty=rep_p,
        do_sample=True,
        seed=seed,
    )
    formatted_prompt = format_prompt(f"{system_prompt}, {prompt}", history)
    stream = client.text_generation(formatted_prompt, **generate_kwargs,
                                    stream=True, details=True,
                                    return_full_text=False)

    # Collect streamed tokens and join once at the end (avoids quadratic +=).
    output = []
    for response in stream:
        output.append(response.token.text)
    history.append((prompt, "".join(output)))
    st.write("".join(output))
|
|
|
def clear_fn():
    """Return the reset value (None) used to clear the chat history state."""
    return None
|
|
|
# Default seed suggested when "Random Seed" is checked (fresh per script run).
rand_val = random.randint(1, 1111111111111111)


def check_rand(inp, val):
    """Render the seed slider.

    Parameters:
        inp: the "Random Seed" checkbox value; when truthy the slider
             defaults to the per-run random seed, otherwise to *val*.
        val: fallback seed (coerced to int).

    Returns the seed chosen on the slider.
    """
    # Idiom fix: truthiness instead of the original `inp is True`
    # identity check (st.checkbox returns a bool, so behavior is unchanged).
    if inp:
        return st.slider("Seed", 1, 1111111111111111, rand_val)
    return st.slider("Seed", 1, 1111111111111111, int(val))
|
|
|
st.title("Google Gemma Models")

# Model picker over the clients defined above.
client_choice = st.selectbox("Models", list(clients.keys()))

# Generation hyperparameter controls.
rand = st.checkbox("Random Seed", True)
seed = check_rand(rand, rand_val)
tokens = st.slider("Max new tokens", 0, 8000, 6400, 64)
temp = st.slider("Temperature", 0.01, 1.0, 0.9, step=0.01)
top_p = st.slider("Top-P", 0.01, 1.0, 0.9, step=0.01)
rep_p = st.slider("Repetition Penalty", 0.1, 2.0, 1.0, step=0.1)

sys_inp = st.text_input("System Prompt (optional)")
# BUG FIX: the original default value interpolated a module-level `description`
# variable that was never defined (it exists only as a local inside
# CSVFileUploader.display_data), so the script crashed with NameError on every
# run. Default to the bare question; the user can paste the description in.
inp = st.text_input("Prompt for Description", value="what is qualifications required: ")
btn = st.button("Chat")
clear_btn = st.button("Clear")

if btn:
    # Fresh conversation per click; chat_inf creates its own history list.
    chat_inf(sys_inp, inp, None, client_choice, seed, temp, tokens, top_p, rep_p)
if clear_btn:
    st.session_state.history = clear_fn()
|
|
|
if __name__ == "__main__":
    st.title("Market Hipocrisy")

    # Drive the CSV exploration UI: upload, row selection, location chart.
    app = CSVFileUploader()
    app.upload_file()
    app.select_row()
    app.plot_locations()

    # Cosmetic progress indicator only — no scraping actually happens here.
    with st.spinner("Scraping data..."):
        time.sleep(5)
    st.success("Scraping complete!")

    app.display_data()
|
|