Spaces:

ccapo
/

portfolio

Sleeping

App Files Files Community

Christopher Capobianco committed on Oct 24, 2024

Commit

385b1f2

1 Parent(s): 938985b

Add Document Classifier project

Browse files

Files changed (5) hide show

Home.py +12 -0
app.py +2 -0
assets/document.jpg +0 -0
models/autoclassifier.pkl +3 -0
projects/01_Document_Classifier.py +102 -0

Home.py CHANGED Viewed

@@ -9,12 +9,24 @@ st.markdown('Please have a look at the descriptions below, and select a project
 st.header('Projects', divider='red')
 mv = Image.open("assets/movie.jpg")
 # wp = Image.open("assets/weather.png")
 sm = Image.open("assets/stock-market.png")
 mu = Image.open("assets/music.jpg")
 llm = Image.open("assets/llm.png")
 with st.container():
     text_column, image_column = st.columns((3,1))
     with text_column:

 st.header('Projects', divider='red')
+do = Image.open("assets/document.jpg")
 mv = Image.open("assets/movie.jpg")
 # wp = Image.open("assets/weather.png")
 sm = Image.open("assets/stock-market.png")
 mu = Image.open("assets/music.jpg")
 llm = Image.open("assets/llm.png")
+with st.container():
+    text_column, image_column = st.columns((3,1))
+    with text_column:
+        st.subheader("Document Classifier", divider="green")
+        st.markdown("""
+            - Used OCR text and a Random Forest classification model to predict a document's classification
+            - Trained on Real World Documents Collection at Kaggle
+        """)
+    with image_column:
+        st.image(do)
 with st.container():
     text_column, image_column = st.columns((3,1))
     with text_column:

app.py CHANGED Viewed

@@ -5,6 +5,7 @@ st.set_page_config(page_title="Chris Capobianco's Profile", page_icon=':rocket:'
 home = st.Page('Home.py', title = 'Home')
 movie_recommendation = st.Page('projects/02_Movie_Recommendation.py', title='Movie Recommendation')
 # weather_classification = st.Page('projects/04_Weather_Classification.py', title='Weather Classification')
 stock_market = st.Page('projects/05_Stock_Market.py', title='Stock Market Forecast')
@@ -17,6 +18,7 @@ pg = st.navigation(
             home
         ],
         'Projects': [
             movie_recommendation,
             # weather_classification,
             stock_market,

 home = st.Page('Home.py', title = 'Home')
+document_classification = st.Page('projects/01_Document_Classifier.py', title='Document Classifier')
 movie_recommendation = st.Page('projects/02_Movie_Recommendation.py', title='Movie Recommendation')
 # weather_classification = st.Page('projects/04_Weather_Classification.py', title='Weather Classification')
 stock_market = st.Page('projects/05_Stock_Market.py', title='Stock Market Forecast')
             home
         ],
         'Projects': [
+            document_classification,
             movie_recommendation,
             # weather_classification,
             stock_market,

assets/document.jpg ADDED Viewed

models/autoclassifier.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:85fbfe655117e18cba957ced3fec41d9c243013461682d0f5c296762cda54d9c
+size 5116548

projects/01_Document_Classifier.py ADDED Viewed

	@@ -0,0 +1,102 @@

+import streamlit as st
+import easyocr
+import pickle
+import spacy
+import en_core_web_sm
+import re
+import os
+# Function to Load the Spacy tokenizer
+@st.cache_data
+def load_nlp():
+    return spacy.load('en_core_web_sm')
+# Function to Initialze the OCR Engine
+@st.cache_resource
+def load_ocr_engine():
+    return easyocr.Reader(['en'])
+# Function to Load the model
+@st.cache_resource
+def load_model():
+    with open('models/autoclassifier.pkl', 'rb') as model_file:
+        stopwords = pickle.load(model_file)
+        punctuations = pickle.load(model_file)
+        model_pipe = pickle.load(model_file)
+    return (stopwords, punctuations, model_pipe)
+# Function to tokenize the text
+def tokenizer(sentence):
+    # Process the text
+    doc = nlp(sentence)
+    # Convert tokens to lemma form for all except '-PRON-'
+    # Recall: Tokens like 'I', 'my', 'me' are represented as '-PRON-' by lemma attribute (See SpaCy Introduction)
+    tokens = [ token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_ for token in doc ]
+    # Remove stop words and punctuations
+    tokens = [ token for token in tokens if token not in stopwords and token not in punctuations ]
+    return tokens
+# Function to process uploaded images
+@st.cache_data
+def autoclassifier(images):
+    # Iterate through all uploaded images
+    with st.spinner(f"Processing Images"):
+        for image in images:
+            # Write bytes to disk
+            with open(image.name, 'wb') as f:
+                f.write(image.read())
+            # Load image into OCR Engine and extract text
+            raw_ocr = ocr_engine.readtext(image.name)
+            # Extract relevant words from raw OCR
+            words = ''
+            for (bbox, text, prob) in raw_ocr:
+                # Only keep OCR text with 50% probability or higher
+                if prob > 0.5:
+                    # Filter out any digits
+                    text = re.sub('[0-9]+', '', text)
+                    # If we have any characters left, append to string
+                    if text != '':
+                        words += ' ' + text
+            # Pass filtered OCR string to the model
+            doc_type = model_pipe.predict([words])
+            # Report filename and document class
+            st.info(f"filename: '{image.name}', doc_type: '{doc_type[0]}'")
+            # Delete image file
+            os.remove(image.name)
+st.header('Document Classifier', divider='green')
+st.markdown("#### What is OCR?")
+st.markdown("OCR stands for Optical Character Recognition, and the technology for it has been around for over 30 years.")
+st.markdown("In this project, we leverage the extraction of the text from an image to classify the document. I am using EasyOCR as the OCR Engine, and I do some pre-processing of the raw OCR text to improve the quality of the words used to classify the documents.")
+st.markdown("After an investigation I settled on a Random Forest classifier for this project, since it had the best classification accuracy of the different models I investigated.")
+st.markdown("This project makes use of the [Real World Documents Collections](https://www.kaggle.com/datasets/shaz13/real-world-documents-collections) found at `Kaggle`")
+st.markdown("*This project is based off the tutorial by Animesh Giri [Intelligent Document Classification](https://www.kaggle.com/code/animeshgiri/intelligent-document-classification)*")
+st.markdown("*N.B. I created a similar document classifier in my first ML project, but that relied on IBM's Datacap for the OCR Engine. I also used a Support Vector Machine (SVM) classifier library (libsvm) at the time, but it was slow to train. I tried to re-create that document classifier again, using open source tools and modern techniques outlined in the referenced tutorial.*")
+st.divider()
+# Load the spaCy tokenizer; tokenizer() reads this module-level name
+nlp = load_nlp()
+# Initialize the OCR engine; autoclassifier() reads this module-level name
+ocr_engine = load_ocr_engine()
+# Load the model; tokenizer() reads stopwords/punctuations and
+# autoclassifier() reads model_pipe, so these names must not be renamed
+stopwords, punctuations, model_pipe = load_model()
+# Fetch uploaded images (png/jpg/jpeg only, several files allowed at once)
+images = st.file_uploader(
+    "Choose an image to classify",
+    type=['png','jpg','jpeg'],
+    accept_multiple_files=True
+)
+# Process and predict document classification
+autoclassifier(images)