Jeet Paul committed on
Commit
e249a3c
·
1 Parent(s): 00e257b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -8
app.py CHANGED
@@ -1,5 +1,4 @@
1
  import streamlit as st
2
- from tika import parser
3
  import pandas as pd
4
  from sklearn.preprocessing import LabelEncoder
5
  from sklearn.feature_extraction.text import TfidfVectorizer
@@ -7,6 +6,8 @@ from sklearn.multiclass import OneVsRestClassifier
7
  from sklearn.neighbors import KNeighborsClassifier
8
  import re
9
  import pickle
 
 
10
 
11
  def cleanResume(resumeText):
12
  # Your existing cleanResume function remains unchanged
@@ -35,9 +36,8 @@ model = OneVsRestClassifier(KNeighborsClassifier())
35
  model.fit(WordFeatures, target)
36
 
37
  def pdf_to_text(file):
38
- # Use tika to extract text from the PDF file
39
- file_data = parser.from_buffer(file.read())
40
- text = file_data['content']
41
  return text
42
 
43
  def predict_category(resumes_data, selected_category):
@@ -64,7 +64,7 @@ def main():
64
  st.title("Resume Ranking App")
65
  st.text("Upload resumes and select a category to rank them.")
66
 
67
- tika_server_url = "http://localhost:58830/"
68
 
69
  resumes_data = []
70
  selected_category = ""
@@ -73,10 +73,8 @@ def main():
73
  files = st.file_uploader("Upload resumes", type=["pdf"], accept_multiple_files=True)
74
  if files:
75
  for file in files:
76
- file_data = parser.from_buffer(file.read(), serverEndpoint=tika_server_url)
77
- text = cleanResume(pdf_to_text(file_data))
78
  resumes_data.append({'ResumeText': text, 'FileName': file.name})
79
-
80
  selected_category = st.selectbox("Select a category to rank by", label.classes_)
81
 
82
  if st.button("Rank Resumes"):
 
1
  import streamlit as st
 
2
  import pandas as pd
3
  from sklearn.preprocessing import LabelEncoder
4
  from sklearn.feature_extraction.text import TfidfVectorizer
 
6
  from sklearn.neighbors import KNeighborsClassifier
7
  import re
8
  import pickle
9
+ import pdfminer
10
+ from pdfminer.high_level import extract_text
11
 
12
  def cleanResume(resumeText):
13
  # Your existing cleanResume function remains unchanged
 
36
  model.fit(WordFeatures, target)
37
 
38
  def pdf_to_text(file):
39
+ # Use pdfminer.six to extract text from the PDF file
40
+ text = extract_text(file)
 
41
  return text
42
 
43
  def predict_category(resumes_data, selected_category):
 
64
  st.title("Resume Ranking App")
65
  st.text("Upload resumes and select a category to rank them.")
66
 
67
+
68
 
69
  resumes_data = []
70
  selected_category = ""
 
73
  files = st.file_uploader("Upload resumes", type=["pdf"], accept_multiple_files=True)
74
  if files:
75
  for file in files:
76
+ text = cleanResume(pdf_to_text(file))
 
77
  resumes_data.append({'ResumeText': text, 'FileName': file.name})
 
78
  selected_category = st.selectbox("Select a category to rank by", label.classes_)
79
 
80
  if st.button("Rank Resumes"):