import streamlit as st
import pandas as pd

from functions import *
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, Distance, Record, Filter
from random import uniform

backgroundPattern = """ """

st.markdown(backgroundPattern, unsafe_allow_html=True)

st.write("""
# Resume Screening & Classification
""")

st.header('Input')

jobs_data = job_desc_pdf()
resume_data = resume_pdf()

st.write('input to df:')
st.write(jobs_data)
st.write(resume_data)

# setup_nltk_resources()

# # Unzip wordnet
# corpora_path = "/kaggle/working/nltk_data/corpora"
# wordnet_zip = os.path.join(corpora_path, "wordnet.zip")
# unzip_nltk_resource(wordnet_zip, corpora_path)

# Apply preprocessing
jobs_data['processed_description'] = jobs_data['description'].apply(preprocess_text)
jobs_data_cleaned = drop_duplicates(jobs_data, column_name='description')

resume_data['processed_resume'] = resume_data['Resume'].apply(preprocess_text)
resume_data_cleaned = drop_duplicates(resume_data, column_name='Resume')

st.write("CLEANED")
st.write(jobs_data_cleaned)
st.write(resume_data_cleaned)

# Add token counts for the processed text
jobs_data_cleaned_with_tokens = add_token_count_column(jobs_data_cleaned, column_name='processed_description')
resume_data_cleaned_with_tokens = add_token_count_column(resume_data_cleaned, column_name='processed_resume')

# Drop unnecessary columns from the jobs data
jobs_data_final = jobs_data_cleaned_with_tokens[['processed_description', 'token_count']]

# Drop unnecessary columns from the resume data
resume_data_final = resume_data_cleaned_with_tokens[['processed_resume', 'token_count']]

st.write("CLEANED WITH TOKENS")
st.write(jobs_data_final)
st.write(resume_data_final)

summarizer = TextSummarizer("geekradius/bart-large-cnn-fintetuned-samsum-repo")
st.write("sum")

# Summarize all 'processed_description' in jobs_data_final
jobs_data_summarized = batch_summarize(jobs_data_final, 'processed_description', summarizer,
                                       batch_size=10, output_col='summarized_description')

# Summarize all 'processed_resume' in resume_data_final
resume_data_summarized = batch_summarize(resume_data_final, 'processed_resume', summarizer,
                                         batch_size=10, output_col='summarized_resume')

st.write("SUMMARISED")
st.write(jobs_data_summarized)
st.write(resume_data_summarized)

# Encode the summaries with a sentence-transformer model
encoder = SentenceTransformerEncoder("all-MiniLM-L6-v2")

# Encode the summarized job descriptions
jobs_data_summarized_and_encoded = encoder.encode_column(jobs_data_summarized, 'summarized_description')

# Encode the summarized resumes
resume_data_summarized_and_encoded = encoder.encode_column(resume_data_summarized, 'summarized_resume')

st.write("SUMMARISED AND ENCODED")
st.write(jobs_data_summarized_and_encoded)
st.write(resume_data_summarized_and_encoded)

# Combine the jobs data
jobs_combined = pd.merge(
    jobs_data_final,
    jobs_data_summarized_and_encoded[['summarized_description', 'summarized_description_encoded']],
    left_index=True, right_index=True)

# Combine the resume data
resume_combined = pd.merge(
    resume_data_final,
    resume_data_summarized_and_encoded[['summarized_resume', 'summarized_resume_encoded']],
    left_index=True, right_index=True)

# Reset the index of each DataFrame
jobs_combined.reset_index(drop=True, inplace=True)
resume_combined.reset_index(drop=True, inplace=True)

st.write("COMBINED")
st.write(jobs_combined)
st.write(resume_combined)

# QDRANT VECTORIZER
vector_dimension = encoder.model.get_sentence_embedding_dimension()
qdrant_interface = QdrantInterface(QUADRANT_ENDPOINT, QUADRANT_API_KEY, vector_dimension)
qdrant_interface.create_collection('jobs', Distance.COSINE)
qdrant_interface.create_collection('resumes', Distance.COSINE)


# Ensure vectors are plain Python lists (Qdrant expects lists, not numpy arrays)
def ensure_list_format(df, vector_col):
    df[vector_col] = df[vector_col].apply(lambda x: x.tolist() if hasattr(x, 'tolist') else x)
    return df


# Convert the encoded columns before uploading
jobs_combined = ensure_list_format(jobs_combined, 'summarized_description_encoded')
resume_combined = ensure_list_format(resume_combined, 'summarized_resume_encoded')

st.write("LIST FORMAT")
st.write(jobs_combined)
st.write(resume_combined)

# Use the first job vector as the query example
given_job_vector = jobs_combined['summarized_description_encoded'].iloc[0]

# Now upload to Qdrant (payload columns limited to those present in the combined frames)
qdrant_interface.save_to_qdrant(jobs_combined, 'jobs', 'summarized_description_encoded',
                                ['processed_description', 'token_count', 'summarized_description'])
qdrant_interface.save_to_qdrant(resume_combined, 'resumes', 'summarized_resume_encoded',
                                ['processed_resume', 'token_count', 'summarized_resume'])

# Retrieve specific records by ID from the 'jobs' collection
specific_jobs_records = qdrant_interface.retrieve_specific_records('jobs', ids=[1])
st.write("SPECIFIC JOB RECS")
st.write(specific_jobs_records)

# Find the top 5 matching resumes for the example job
matched_resumes = qdrant_interface.match_jobs_to_resumes(given_job_vector, top_k=5)
for resume, score in matched_resumes:
    st.write(f"Matched Resume: {resume['summarized_resume']}, Score: {score}")
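
# ---------------------------------------------------------------------------
# Reference sketch (not used by this script): QdrantInterface is imported from
# functions.py and its implementation is not shown here. The class below is a
# minimal, hypothetical sketch of the interface this script assumes
# (constructor, create_collection, save_to_qdrant, retrieve_specific_records,
# match_jobs_to_resumes), written against the standard qdrant-client API. The
# real helper in functions.py may differ; it is named QdrantInterfaceSketch so
# it does not shadow the import.
# ---------------------------------------------------------------------------
from qdrant_client.http.models import PointStruct


class QdrantInterfaceSketch:
    def __init__(self, endpoint, api_key, vector_dimension):
        self.client = QdrantClient(url=endpoint, api_key=api_key)
        self.vector_dimension = vector_dimension

    def create_collection(self, name, distance):
        # (Re)create a collection sized to the encoder's embedding dimension
        self.client.recreate_collection(
            collection_name=name,
            vectors_config=VectorParams(size=self.vector_dimension, distance=distance),
        )

    def save_to_qdrant(self, df, collection_name, vector_col, payload_cols):
        # Upsert one point per row: the encoded vector plus selected payload columns
        points = [
            PointStruct(
                id=int(idx),
                vector=row[vector_col],
                payload={col: row[col] for col in payload_cols},
            )
            for idx, row in df.iterrows()
        ]
        self.client.upsert(collection_name=collection_name, points=points)

    def retrieve_specific_records(self, collection_name, ids):
        # Fetch points by ID, including their payloads
        return self.client.retrieve(collection_name=collection_name, ids=ids, with_payload=True)

    def match_jobs_to_resumes(self, job_vector, top_k=5):
        # Nearest-neighbour search in the 'resumes' collection; returns the
        # (payload, score) pairs consumed by the display loop above
        hits = self.client.search(collection_name='resumes', query_vector=job_vector, limit=top_k)
        return [(hit.payload, hit.score) for hit in hits]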