# app.py — Hugging Face Space "Market Hipocrisy" (commit 3b1ebb0)
import streamlit as st
import pandas as pd
import re
import time
import plotly.express as px
import random
from huggingface_hub import InferenceClient
# spaCy provides the named-entity recognition used to normalise the Location column.
import spacy
# Load the small English pipeline once at import time (heavyweight call;
# requires the "en_core_web_sm" model to be installed in the environment).
nlp = spacy.load("en_core_web_sm")
def get_location(sentence):
    """Return the first location-like named entity in *sentence*.

    Scans the spaCy entities of the text and returns the text of the first
    one labelled LOC or GPE; returns None when no such entity is found.
    """
    for entity in nlp(sentence).ents:
        if entity.label_ in ("LOC", "GPE"):
            return entity.text
    return None
class JobPosting:
    """Holds a raw job-description string and (eventually) extracts data from it."""

    def __init__(self, description):
        # Raw posting text as provided by the caller.
        self.description = description

    def extract(self):
        """Return the extracted responsibilities (placeholder implementation)."""
        # Placeholder: real responsibility-extraction logic goes here.
        return "Define responsibilities here"
class CSVFileUploader:
    """Streamlit helper: upload a CSV of job postings, explore it, and
    inspect a single selected row.

    Attributes:
        file: uploaded data as a pandas DataFrame, or None before upload.
        selected_column: column chosen in the distribution selectbox.
        selected_row: integer index of the row chosen for inspection.
    """

    def __init__(self):
        self.file = None
        self.selected_column = None
        self.selected_row = None

    def upload_file(self):
        """Render the upload widget; on upload, clean Location and show summaries."""
        uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
        if uploaded_file is None:
            return
        self.file = pd.read_csv(uploaded_file)
        df = self.file
        # Normalise free-text locations to the first LOC/GPE entity spaCy finds.
        # Guard against CSVs lacking the column instead of raising KeyError.
        if "Location" in df.columns:
            df["Location"] = df["Location"].apply(get_location)
        st.header("Data Preview")
        st.write(df.head())
        st.header("Data Information")
        # BUG FIX: DataFrame.info() prints to stdout and returns None, so the
        # original st.write(df.info()) rendered nothing. Capture the report
        # into a buffer and display it as text instead.
        import io
        info_buffer = io.StringIO()
        df.info(buf=info_buffer)
        st.text(info_buffer.getvalue())
        st.header("Descriptive Statistics")
        st.write(df.describe())
        st.header("Column Distribution")
        # BUG FIX: store the choice on self so display_data() can use it; the
        # original assigned only a local and self.selected_column stayed None.
        self.selected_column = st.selectbox("Select a column", df.columns)
        fig = px.bar(df, x=df[self.selected_column],
                     labels={'index': 'Value', self.selected_column: 'Count'})
        st.plotly_chart(fig)

    def plot_locations(self):
        """Bar-chart the frequency of each cleaned location, most common first."""
        if self.file is not None and "Location" in self.file.columns:
            df = self.file.dropna(subset=["Location"])
            locations = df["Location"].value_counts().reset_index()
            locations.columns = ["Location", "Count"]
            fig = px.bar(locations, x="Location", y="Count",
                         labels={'index': 'Value', 'Location': 'Location', 'Count': 'Count'})
            fig.update_layout(xaxis={'categoryorder': 'total descending'})
            st.header("Location Distribution")
            st.plotly_chart(fig)

    def select_row(self):
        """Let the user pick a row index from the uploaded data."""
        if self.file is not None:
            self.selected_row = st.selectbox("Select row", list(range(len(self.file))))

    def display_data(self):
        """Show the selected column plus the Location/Description of the chosen row."""
        if self.file is None:
            return
        if self.selected_column is not None:
            st.write(self.file[self.selected_column])
        if self.selected_row is not None:
            location = self.file.iloc[self.selected_row]["Location"]
            description = self.file.iloc[self.selected_row]["Description"]
            st.markdown("---")
            st.markdown("<span style='background-color: #f4a261; padding: 2px 4px; border-radius: 4px;'>Location:</span>", unsafe_allow_html=True)
            st.write("Just give me the location for this job description, no other words and remove 'on-site' or 'remote' if mentioned. For example, 'Boston, MA, USA': "
                     + str(location))
            st.markdown("---")
            st.markdown("<span style='background-color: #f4a261; padding: 2px 4px; border-radius: 4px;'>Description for selected row:</span>", unsafe_allow_html=True)
            st.write("What are the qualifications required: " + str(description))
# Hugging Face inference clients, one per supported Gemma checkpoint.
# The model selectbox in the UI below is populated from these keys.
_GEMMA_MODELS = (
    "google/gemma-7b",
    "google/gemma-7b-it",
    "google/gemma-2b",
    "google/gemma-2b-it",
)
clients = {model: InferenceClient(model) for model in _GEMMA_MODELS}
def format_prompt(message, history):
    """Render a Gemma chat-template prompt.

    Each (user, model) pair in *history* is emitted as a pair of turns,
    followed by the new *message* as an open model turn awaiting completion.
    *history* may be None or empty.
    """
    turns = []
    for user_prompt, bot_response in history or []:
        turns.append(f"<start_of_turn>user{user_prompt}<end_of_turn>")
        turns.append(f"<start_of_turn>model{bot_response}")
    turns.append(f"<start_of_turn>user{message}<end_of_turn><start_of_turn>model")
    return "".join(turns)
def chat_inf(system_prompt, prompt, history, client_choice, seed, temp, tokens, top_p, rep_p):
    """Stream a completion from the chosen Gemma model and render it in Streamlit.

    Args:
        system_prompt: optional system instruction prepended to the prompt.
        prompt: the user's message.
        history: list of (user, model) turn pairs, or None for a fresh chat.
        client_choice: key into the module-level `clients` dict.
        seed, temp, tokens, top_p, rep_p: generation hyperparameters.

    Side effects: appends (prompt, completion) to *history* and writes the
    accumulated completion to the Streamlit page.
    """
    client = clients[client_choice]  # key lookup into the clients mapping
    if not history:
        history = []
    # NOTE: the original computed an unused `hist_len` here — dead code removed.
    generate_kwargs = dict(
        temperature=temp,
        max_new_tokens=tokens,
        top_p=top_p,
        repetition_penalty=rep_p,
        do_sample=True,
        seed=seed,
    )
    formatted_prompt = format_prompt(f"{system_prompt}, {prompt}", history)
    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True,
                                    details=True, return_full_text=False)
    # Accumulate streamed tokens, then display the full completion once.
    output = []
    for response in stream:
        output.append(response.token.text)
    history.append((prompt, "".join(output)))
    st.write("".join(output))
def clear_fn():
    """Reset value for the chat history stored in session state."""
    return None
# One-off random default for the seed slider, fixed at module import.
rand_val = random.randint(1, 1111111111111111)


def check_rand(inp, val):
    """Render the seed slider.

    Uses the module's random default when the "Random Seed" checkbox (*inp*)
    is ticked, otherwise falls back to the caller-supplied value *val*.
    """
    default_seed = rand_val if inp is True else int(val)
    return st.slider("Seed", 1, 1111111111111111, default_seed)
# --- Gemma chat controls (rendered at import time, before the __main__ block) ---
st.title("Google Gemma Models")
client_choice = st.selectbox("Models", list(clients.keys()))  # keys are model ids
rand = st.checkbox("Random Seed", True)
seed = check_rand(rand, rand_val)
tokens = st.slider("Max new tokens", 0, 8000, 6400, 64)
temp = st.slider("Temperature", 0.01, 1.0, 0.9, step=0.01)
top_p = st.slider("Top-P", 0.01, 1.0, 0.9, step=0.01)
rep_p = st.slider("Repetition Penalty", 0.1, 2.0, 1.0, step=0.1)
sys_inp = st.text_input("System Prompt (optional)")
# BUG FIX: the original default value interpolated `description`, a name that
# only exists as a local inside CSVFileUploader.display_data — referencing it
# here raised NameError at import time. Use a static default instead.
inp = st.text_input("Prompt for Description", value="what is qualifications required: ")
btn = st.button("Chat")
clear_btn = st.button("Clear")
if btn:
    # Pass the persisted history (None on first run) so Clear actually resets it.
    chat_inf(sys_inp, inp, st.session_state.get("history"), client_choice,
             seed, temp, tokens, top_p, rep_p)
if clear_btn:
    st.session_state.history = clear_fn()
if __name__ == "__main__":
    st.title("Market Hipocrisy")
    # Build the CSV exploration UI: upload, row selection, location chart.
    uploader = CSVFileUploader()
    uploader.upload_file()
    uploader.select_row()
    uploader.plot_locations()
    # Simulated long-running scrape (placeholder for real scraping logic).
    with st.spinner("Scraping data..."):
        time.sleep(5)
    st.success("Scraping complete!")
    uploader.display_data()