import io
import random
import time

import pandas as pd
import plotly.express as px
import spacy
import streamlit as st
from huggingface_hub import InferenceClient

# Load the spaCy English language model
nlp = spacy.load("en_core_web_sm")


def get_location(sentence):
    """Return the first LOC or GPE entity found in the text, if any."""
    doc = nlp(str(sentence))
    for ent in doc.ents:
        if ent.label_ in ("LOC", "GPE"):
            return ent.text
    return None


class JobPosting:
    def __init__(self, description):
        self.description = description

    def extract(self):
        # Placeholder: replace with actual responsibilities-extraction logic
        responsibilities = "Define responsibilities here"
        return responsibilities


class CSVFileUploader:
    def __init__(self):
        self.file = None
        self.selected_column = None
        self.selected_row = None

    def upload_file(self):
        uploaded_file = st.file_uploader("Upload CSV file", type=["csv"])
        if uploaded_file is not None:
            self.file = pd.read_csv(uploaded_file)
            df = self.file

            # Refine the Location column down to its first LOC/GPE entity
            df["Location"] = df["Location"].apply(get_location)

            st.header("Data Preview")
            st.write(df.head())

            st.header("Data Information")
            # df.info() prints to stdout and returns None, so capture its output
            buffer = io.StringIO()
            df.info(buf=buffer)
            st.text(buffer.getvalue())

            st.header("Descriptive Statistics")
            st.write(df.describe())

            st.header("Column Distribution")
            self.selected_column = st.selectbox("Select a column", df.columns)
            counts = df[self.selected_column].value_counts().reset_index()
            counts.columns = [self.selected_column, "Count"]
            fig = px.bar(counts, x=self.selected_column, y="Count")
            st.plotly_chart(fig)

    def plot_locations(self):
        if self.file is not None:
            df = self.file.dropna(subset=["Location"])
            locations = df["Location"].value_counts().reset_index()
            locations.columns = ["Location", "Count"]
            fig = px.bar(locations, x="Location", y="Count")
            fig.update_layout(xaxis={"categoryorder": "total descending"})
            st.header("Location Distribution")
            st.plotly_chart(fig)

    def select_row(self):
        if self.file is not None:
            self.selected_row = st.selectbox("Select row", list(range(len(self.file))))

    def display_data(self):
        if self.file is not None:
            if self.selected_column is not None:
                st.write(self.file[self.selected_column])
            if self.selected_row is not None:
                location = self.file.iloc[self.selected_row]["Location"]
                description = self.file.iloc[self.selected_row]["Description"]

                st.markdown("---")
                st.markdown("**Location:**")
                st.write(
                    "Just give me the location for this job description, no other "
                    "words, and remove 'on-site' or 'remote' if mentioned. "
                    "For example, 'Boston, MA, USA': " + str(location)
                )

                st.markdown("---")
                st.markdown("**Description for selected row:**")
                st.write("What are the qualifications required: " + str(description))

                # Automatically populate the prompt and show the model output.
                # render_chat is defined below, after the inference helpers; it is
                # called here so that `description` is in scope.
                render_chat(description)
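
# Illustrative sketch, not part of the original app: JobPosting.extract above is
# a placeholder, and one possible (assumed) implementation is to keep sentences
# whose root verb reads like a duty. The verb list here is an assumption.
def extract_responsibilities(description):
    """Return sentences that look like responsibilities, joined into one string."""
    action_verbs = {"manage", "lead", "develop", "maintain", "design", "coordinate"}
    doc = nlp(str(description))
    duties = [sent.text.strip() for sent in doc.sents
              if sent.root.lemma_.lower() in action_verbs]
    return " ".join(duties)
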
# Hugging Face inference clients for the available Gemma models
clients = {
    "google/gemma-7b": InferenceClient("google/gemma-7b"),
    "google/gemma-7b-it": InferenceClient("google/gemma-7b-it"),
    "google/gemma-2b": InferenceClient("google/gemma-2b"),
    "google/gemma-2b-it": InferenceClient("google/gemma-2b-it"),
}


def format_prompt(message, history):
    """Build a Gemma chat prompt from the history and the new message."""
    prompt = ""
    if history:
        for user_prompt, bot_response in history:
            prompt += f"<start_of_turn>user\n{user_prompt}<end_of_turn>\n"
            prompt += f"<start_of_turn>model\n{bot_response}<end_of_turn>\n"
    prompt += f"<start_of_turn>user\n{message}<end_of_turn>\n<start_of_turn>model\n"
    return prompt


def chat_inf(system_prompt, prompt, history, client_choice, seed, temp, tokens, top_p, rep_p):
    """Stream a completion from the selected model and display it."""
    client = clients[client_choice]  # client_choice is the model name, used as a dict key
    if not history:
        history = []

    generate_kwargs = dict(
        temperature=temp,
        max_new_tokens=tokens,
        top_p=top_p,
        repetition_penalty=rep_p,
        do_sample=True,
        seed=seed,
    )

    formatted_prompt = format_prompt(f"{system_prompt}, {prompt}", history)
    stream = client.text_generation(
        formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False
    )

    output = []
    for response in stream:
        output.append(response.token.text)
    history.append((prompt, "".join(output)))
    st.write("".join(output))  # Display the accumulated output
    return history


def clear_fn():
    return None


rand_val = random.randint(1, 1111111111111111)


def check_rand(inp, val):
    """Render the seed slider, defaulting to a random value when requested."""
    if inp:
        return st.slider("Seed", 1, 1111111111111111, rand_val)
    return st.slider("Seed", 1, 1111111111111111, int(val))


def render_chat(description):
    """Render the Gemma chat controls, pre-populated with the selected description."""
    st.title("Google Gemma Models")
    client_choice = st.selectbox("Models", list(clients.keys()))  # Model names are the choices
    rand = st.checkbox("Random Seed", True)
    seed = check_rand(rand, rand_val)
    tokens = st.slider("Max new tokens", 0, 8000, 6400, 64)
    temp = st.slider("Temperature", 0.01, 1.0, 0.9, step=0.01)
    top_p = st.slider("Top-P", 0.01, 1.0, 0.9, step=0.01)
    rep_p = st.slider("Repetition Penalty", 0.1, 2.0, 1.0, step=0.1)

    sys_inp = st.text_input("System Prompt (optional)")
    inp = st.text_input(
        "Prompt for Description",
        value=f"What are the qualifications required: {description}",
    )
    btn = st.button("Chat")
    clear_btn = st.button("Clear")

    if btn:
        st.session_state.history = chat_inf(
            sys_inp, inp, st.session_state.get("history"), client_choice,
            seed, temp, tokens, top_p, rep_p,
        )
    if clear_btn:
        st.session_state.history = clear_fn()


if __name__ == "__main__":
    st.title("Market Hipocrisy")
    uploader = CSVFileUploader()
    uploader.upload_file()
    uploader.select_row()
    uploader.plot_locations()

    # Progress bar example (replace with actual scraping logic)
    with st.spinner("Scraping data..."):
        time.sleep(5)  # Simulate a long-running process
    st.success("Scraping complete!")

    uploader.display_data()
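
# Usage note (assumptions: the file is saved as app.py and the packages are
# installed): the spaCy model must be downloaded once, then the app is run with
#   python -m spacy download en_core_web_sm
#   streamlit run app.py
# Calls to the hosted Gemma endpoints may also need a Hugging Face access token,
# e.g. InferenceClient("google/gemma-7b-it", token="hf_...").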