import streamlit as st
import easyocr
import pickle
import spacy
import re
import os
import subprocess
import sys


@st.cache_resource
def load_nlp():
    # Ensure the small English spaCy model is available, then load it.
    # Cached, so the download check runs only once per session.
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
    return spacy.load('en_core_web_sm')


@st.cache_resource
def load_tokenizer_model():
    # The pickle file holds three objects, stored in this order:
    # the stopword list, the punctuation list, and the fitted classifier pipeline.
    with open('./models/autoclassifier.pkl', 'rb') as model_file:
        stopwords = pickle.load(model_file)
        punctuations = pickle.load(model_file)
        model_pipe = pickle.load(model_file)
    return (stopwords, punctuations, model_pipe)
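
# For reference, a minimal sketch of how a pickle with that layout could be
# written by the training script (assumed names, not the project's actual code):
#
#   with open('./models/autoclassifier.pkl', 'wb') as model_file:
#       pickle.dump(stopwords, model_file)      # stopword list used by the tokenizer
#       pickle.dump(punctuations, model_file)   # punctuation characters to strip
#       pickle.dump(model_pipe, model_file)     # fitted classification pipeline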


@st.cache_resource
def load_ocr_engine():
    # Initialise the EasyOCR reader for English once and reuse it across reruns.
    return easyocr.Reader(['en'])


@st.cache_data
def autoclassifier(images):
    # Classify each uploaded image: save it to a temporary file, run OCR on it,
    # clean the recognised text, and predict a document type with the pickled pipeline.
    with st.spinner("Processing Images"):
        for image in images:
            # EasyOCR expects a file path, so persist the upload briefly.
            with open(image.name, 'wb') as f:
                f.write(image.read())

            # readtext() returns a list of (bounding box, text, confidence) tuples.
            raw_ocr = ocr_engine.readtext(image.name)

            # Build a single string from the confident detections, stripping digits.
            words = ''
            for (bbox, text, prob) in raw_ocr:
                if prob > 0.5:
                    text = re.sub('[0-9]+', '', text)
                    if text != '':
                        words += ' ' + text

            # Predict the document type from the cleaned OCR text.
            doc_type = model_pipe.predict([words])

            st.info(f"filename: '{image.name}', doc_type: '{doc_type[0]}'")

            # Remove the temporary copy of the upload.
            os.remove(image.name)


st.header('Document Classifier', divider='green')

st.markdown("#### What is OCR?")
st.markdown("OCR stands for Optical Character Recognition, a technology that has been around for over 30 years.")
st.markdown("In this project, I extract the text from an image and use it to classify the document. I use EasyOCR as the OCR engine, and I pre-process the raw OCR text to improve the quality of the words used to classify the documents.")
st.markdown("After comparing several models, I settled on a Random Forest classifier for this project, since it had the best classification accuracy of the models I evaluated.")
st.markdown("This project makes use of the [Real World Documents Collections](https://www.kaggle.com/datasets/shaz13/real-world-documents-collections) dataset found on Kaggle.")
st.markdown("*This project is based on the tutorial by Animesh Giri, [Intelligent Document Classification](https://www.kaggle.com/code/animeshgiri/intelligent-document-classification).*")
st.markdown("*N.B. I built a similar document classifier in my first ML project, but it relied on IBM's Datacap as the OCR engine and a Support Vector Machine (SVM) library (libsvm) for classification, which was slow to train. Here I re-create that classifier with open-source tools and the modern techniques outlined in the referenced tutorial.*")
st.divider()
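
# For reference, the pickled `model_pipe` was presumably built as a scikit-learn
# pipeline along these lines (a sketch with assumed names such as `spacy_tokenizer`,
# `train_texts`, and `train_labels`, not the project's actual training code):
#
#   from sklearn.pipeline import Pipeline
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   from sklearn.ensemble import RandomForestClassifier
#
#   model_pipe = Pipeline([
#       ('tfidf', TfidfVectorizer(tokenizer=spacy_tokenizer)),  # tokenizer built on nlp, stopwords, punctuations
#       ('clf', RandomForestClassifier()),
#   ])
#   model_pipe.fit(train_texts, train_labels)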


images = st.file_uploader(
    "Choose an image to classify",
    type=['png', 'jpg', 'jpeg'],
    accept_multiple_files=True
)

# Load the cached resources: the spaCy model, the tokenizer assets and fitted
# classification pipeline, and the EasyOCR reader.
nlp = load_nlp()
stopwords, punctuations, model_pipe = load_tokenizer_model()
ocr_engine = load_ocr_engine()

autoclassifier(images)