Christopher Capobianco committed
Commit fc8e190
Parent: b1eea1f

Get document classifier to load properly

Home.py CHANGED
@@ -20,7 +20,6 @@ with st.container():
     text_column, image_column = st.columns((3,1))
     with text_column:
         st.subheader("Document Classifier", divider="green")
-        st.warning("Work in Progress")
         st.markdown("""
        - Used OCR text and a Random Forest classification model to predict a document's classification
        - Trained on Real World Documents Collection at Kaggle
app.py CHANGED
@@ -1,9 +1,40 @@
 import streamlit as st
+import spacy
+import pickle
+import subprocess
 
 # Page title
 st.set_page_config(page_title="Chris Capobianco's Profile", page_icon=':rocket:', layout='wide')
 
-home = st.Page('Home.py', title = 'Home')
+home = st.Page('Home.py', title = 'Home', default = True)
+
+# Function to Load the Spacy tokenizer
+@st.cache_resource
+def load_nlp():
+    subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
+    return spacy.load('en_core_web_sm')
+
+def tokenizer(sentence):
+    # Process the text
+    doc = nlp(sentence)
+
+    # Convert tokens to lemma form for all except '-PRON-'
+    # Recall: Tokens like 'I', 'my', 'me' are represented as '-PRON-' by lemma attribute (See SpaCy Introduction)
+    tokens = [ token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_ for token in doc ]
+
+    # Remove stop words and punctuations
+    tokens = [ token for token in tokens if token not in stopwords and token not in punctuations ]
+
+    return tokens
+
+# Function to Load the model
+@st.cache_resource
+def load_tokenizer_model():
+    with open('./models/autoclassifier.pkl', 'rb') as model_file:
+        stopwords = pickle.load(model_file)
+        punctuations = pickle.load(model_file)
+        model_pipe = pickle.load(model_file)
+    return (stopwords, punctuations, model_pipe)
 
 document_classification = st.Page('projects/01_Document_Classifier.py', title='Document Classifier')
 movie_recommendation = st.Page('projects/02_Movie_Recommendation.py', title='Movie Recommendation')
@@ -29,3 +60,9 @@ pg = st.navigation(
 )
 
 pg.run()
+
+# Load the Spacy tokenizer
+nlp = load_nlp()
+
+# Load the Model
+stopwords, punctuations, model_pipe = load_tokenizer_model()
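
Moving `tokenizer` into app.py is consistent with the commit title: when a scikit-learn pipeline is pickled with a custom tokenizer, pickle stores the function by module and name only, so unpickling `models/autoclassifier.pkl` fails unless a function of that name is resolvable in the loading process. A minimal sketch of that failure mode, using a hypothetical pipeline (the contents of the real pickle are not shown in this diff):

```python
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

def tokenizer(sentence):
    # Stand-in for the spaCy-based tokenizer defined in app.py above
    return sentence.lower().split()

pipe = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenizer)),  # stored by reference, not by value
    ('clf', RandomForestClassifier()),
])

with open('pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)

# pickle records only `tokenizer`'s module and name; loading re-imports it,
# so a process that unpickles this file without a matching `tokenizer`
# in scope raises AttributeError.
with open('pipe.pkl', 'rb') as f:
    pipe = pickle.load(f)
```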
projects/01_Document_Classifier.py CHANGED
@@ -7,38 +7,24 @@ import os
 import subprocess
 
 # Function to Load the Spacy tokenizer
-@st.cache_data
+@st.cache_resource
 def load_nlp():
     subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
     return spacy.load('en_core_web_sm')
 
-# Function to Initialze the OCR Engine
-@st.cache_resource
-def load_ocr_engine():
-    return easyocr.Reader(['en'])
-
 # Function to Load the model
 @st.cache_resource
-def load_model():
-    with open('models/autoclassifier.pkl', 'rb') as model_file:
+def load_tokenizer_model():
+    with open('./models/autoclassifier.pkl', 'rb') as model_file:
         stopwords = pickle.load(model_file)
         punctuations = pickle.load(model_file)
         model_pipe = pickle.load(model_file)
     return (stopwords, punctuations, model_pipe)
 
-# Function to tokenize the text
-def tokenizer(sentence):
-    # Process the text
-    doc = nlp(sentence)
-
-    # Convert tokens to lemma form for all except '-PRON-'
-    # Recall: Tokens like 'I', 'my', 'me' are represented as '-PRON-' by lemma attribute (See SpaCy Introduction)
-    tokens = [ token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_ for token in doc ]
-
-    # Remove stop words and punctuations
-    tokens = [ token for token in tokens if token not in stopwords and token not in punctuations ]
-
-    return tokens
+# Function to Initialize the OCR Engine
+@st.cache_resource
+def load_ocr_engine():
+    return easyocr.Reader(['en'])
 
 # Function to process uploaded images
 @st.cache_data
@@ -72,35 +58,32 @@ def autoclassifier(images):
         # Delete image file
         os.remove(image.name)
 
-if __name__ == "__main__":
-    st.header('Document Classifier', divider='green')
-
-    st.warning("Work in Progress")
+st.header('Document Classifier', divider='green')
 
-    st.markdown("#### What is OCR?")
-    st.markdown("OCR stands for Optical Character Recognition, and the technology for it has been around for over 30 years.")
-    st.markdown("In this project, we leverage the extraction of the text from an image to classify the document. I am using EasyOCR as the OCR Engine, and I do some pre-processing of the raw OCR text to improve the quality of the words used to classify the documents.")
-    st.markdown("After an investigation I settled on a Random Forest classifier for this project, since it had the best classification accuracy of the different models I investigated.")
-    st.markdown("This project makes use of the [Real World Documents Collections](https://www.kaggle.com/datasets/shaz13/real-world-documents-collections) found at `Kaggle`")
-    st.markdown("*This project is based off the tutorial by Animesh Giri [Intelligent Document Classification](https://www.kaggle.com/code/animeshgiri/intelligent-document-classification)*")
-    st.markdown("*N.B. I created a similar document classifier in my first ML project, but that relied on IBM's Datacap for the OCR Engine. I also used a Support Vector Machine (SVM) classifier library (libsvm) at the time, but it was slow to train. I tried to re-create that document classifier again, using open source tools and modern techniques outlined in the referenced tutorial.*")
-    st.divider()
-
-    # Load the Spacy tokenizer
-    nlp = load_nlp()
-
-    # Initialze the OCR Engine
-    ocr_engine = load_ocr_engine()
-
-    # Load the Model
-    stopwords, punctuations, model_pipe = load_model()
-
-    # Fetch uploaded images
-    images = st.file_uploader(
-        "Choose an image to classify",
-        type=['png','jpg','jpeg'],
-        accept_multiple_files=True
-    )
-
-    # Process and predict document classification
-    autoclassifier(images)
+st.markdown("#### What is OCR?")
+st.markdown("OCR stands for Optical Character Recognition, and the technology for it has been around for over 30 years.")
+st.markdown("In this project, we leverage the extraction of the text from an image to classify the document. I am using EasyOCR as the OCR Engine, and I do some pre-processing of the raw OCR text to improve the quality of the words used to classify the documents.")
+st.markdown("After an investigation I settled on a Random Forest classifier for this project, since it had the best classification accuracy of the different models I investigated.")
+st.markdown("This project makes use of the [Real World Documents Collections](https://www.kaggle.com/datasets/shaz13/real-world-documents-collections) found at `Kaggle`")
+st.markdown("*This project is based off the tutorial by Animesh Giri [Intelligent Document Classification](https://www.kaggle.com/code/animeshgiri/intelligent-document-classification)*")
+st.markdown("*N.B. I created a similar document classifier in my first ML project, but that relied on IBM's Datacap for the OCR Engine. I also used a Support Vector Machine (SVM) classifier library (libsvm) at the time, but it was slow to train. I tried to re-create that document classifier again, using open source tools and modern techniques outlined in the referenced tutorial.*")
+st.divider()
+
+# Fetch uploaded images
+images = st.file_uploader(
+    "Choose an image to classify",
+    type=['png','jpg','jpeg'],
+    accept_multiple_files=True
+)
+
+# Load the Spacy tokenizer
+nlp = load_nlp()
+
+# Load the Model
+stopwords, punctuations, model_pipe = load_tokenizer_model()
+
+# Initialize the OCR Engine
+ocr_engine = load_ocr_engine()
+
+# Process and predict document classification
+autoclassifier(images)
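
The decorator swap on `load_nlp` follows Streamlit's documented split between its two caching primitives: `st.cache_data` serializes the return value with pickle and hands each caller a copy, which suits plain data, while `st.cache_resource` keeps the object itself and shares it across reruns, which is what a loaded spaCy pipeline or `easyocr.Reader` needs. A short sketch of the distinction, with illustrative function names:

```python
import pandas as pd
import spacy
import streamlit as st

@st.cache_data          # return value is pickled; callers get a fresh copy
def load_table(path: str) -> pd.DataFrame:
    return pd.read_csv(path)

@st.cache_resource      # one shared, unserialized object for the whole app
def load_pipeline():
    return spacy.load('en_core_web_sm')
```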
projects/05_Stock_Market.py CHANGED
@@ -5,25 +5,23 @@ from PIL import Image
 
 @st.cache_resource
 def load_model():
-    model_file = open('./models/stock_market_model.pkl', 'rb')
-    amazon_predictions = pickle.load(model_file)
-    amazon_scores = pickle.load(model_file)
-    google_predictions = pickle.load(model_file)
-    google_scores = pickle.load(model_file)
-    ibm_predictions = pickle.load(model_file)
-    ibm_scores = pickle.load(model_file)
-    microsoft_predictions = pickle.load(model_file)
-    microsoft_scores = pickle.load(model_file)
-    model_file.close()
-    return amazon_predictions, amazon_scores, google_predictions, google_scores, ibm_predictions, ibm_scores, microsoft_predictions, microsoft_scores
+    with st.spinner("Fetching Models"):
+        model_file = open('./models/stock_market_model.pkl', 'rb')
+        amazon_predictions = pickle.load(model_file)
+        amazon_scores = pickle.load(model_file)
+        google_predictions = pickle.load(model_file)
+        google_scores = pickle.load(model_file)
+        ibm_predictions = pickle.load(model_file)
+        ibm_scores = pickle.load(model_file)
+        microsoft_predictions = pickle.load(model_file)
+        microsoft_scores = pickle.load(model_file)
+        model_file.close()
+        return amazon_predictions, amazon_scores, google_predictions, google_scores, ibm_predictions, ibm_scores, microsoft_predictions, microsoft_scores
 
 # Load Image
 gru = Image.open("assets/gru.png")
 nn = Image.open("assets/nn.png")
 
-# Load the Model
-amazon_predictions, amazon_scores, google_predictions, google_scores, ibm_predictions, ibm_scores, microsoft_predictions, microsoft_scores = load_model()
-
 st.header('Stock Market Forecast', divider='green')
 
 st.markdown("#### Time Series Forecasting")
@@ -42,6 +40,9 @@ st.divider()
 
 st.markdown("Below each graph is the mean square error (MSE) for the train and test sets, where the test set consists of the last 20 days.")
 
+# Load the Model
+amazon_predictions, amazon_scores, google_predictions, google_scores, ibm_predictions, ibm_scores, microsoft_predictions, microsoft_scores = load_model()
+
 fig1 = go.Figure()
 fig1.add_trace(go.Scatter(go.Scatter(x=amazon_predictions['Date'], y=amazon_predictions['Train Prediction'],
     mode='lines',
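
Both changes here serve perceived latency: the unpickling is wrapped in `st.spinner`, and the `load_model()` call moves below the header and intro markdown, so the static text renders before the first (uncached) load begins. A sketch of the pattern with simplified names (the real loader reads eight objects, as in the diff above):

```python
import pickle
import streamlit as st

@st.cache_resource
def load_model():
    # Simplified to two pickled objects for illustration
    with st.spinner("Fetching Models"):
        with open('./models/stock_market_model.pkl', 'rb') as model_file:
            predictions = pickle.load(model_file)
            scores = pickle.load(model_file)
    return predictions, scores

st.header('Stock Market Forecast', divider='green')
st.markdown('Static intro text renders before the load is triggered.')

# On a cold cache the spinner appears here, below the intro;
# later reruns hit the cache and skip the load entirely.
predictions, scores = load_model()
```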
projects/06_Generative_Music.py CHANGED
@@ -9,18 +9,20 @@ from scipy.io import wavfile
 
 @st.cache_resource
 def load_notes():
-    notes_filepath = 'models/music_notes.pkl'
-    with open(notes_filepath, 'rb') as filepath:
-        notes = pickle.load(filepath)
-        pitchnames = pickle.load(filepath)
-        n_vocab = pickle.load(filepath)
-    return (notes, pitchnames, n_vocab)
+    with st.spinner("Fetching Notes"):
+        notes_filepath = 'models/music_notes.pkl'
+        with open(notes_filepath, 'rb') as filepath:
+            notes = pickle.load(filepath)
+            pitchnames = pickle.load(filepath)
+            n_vocab = pickle.load(filepath)
+        return (notes, pitchnames, n_vocab)
 
 @st.cache_resource
 def model_load():
-    model_filepath = 'models/music_model.keras'
-    model = load_model(model_filepath)
-    return model
+    with st.spinner("Fetching Model"):
+        model_filepath = 'models/music_model.keras'
+        model = load_model(model_filepath)
+        return model
 
 @st.cache_data
 def prepare_sequences(notes, pitchnames, n_vocab, sequence_length=100):
@@ -109,15 +111,6 @@ def generate(model, network_input, pitchnames, n_vocab, nlength=500, istart=-1):
 
 st.header('Generative Music', divider='green')
 
-# Load notes
-notes, pitchnames, n_vocab = load_notes()
-
-# Prepare note sequences
-network_input = prepare_sequences(notes, pitchnames, n_vocab)
-
-# Load model
-model = model_load()
-
 st.markdown("#### What are Recurrent Neural Networks?")
 st.markdown("A recurrent neural network is a class of artificial neural networks that make use of sequential information. They are called recurrent because they perform the same function for every single element of a sequence, with the result being dependent on previous computations. Whereas outputs are independent of previous computations in traditional neural networks.")
 st.markdown("In this project we will use a **Long Short-Term Memory** (LSTM) network. They are a type of Recurrent Neural Network that can efficiently learn via gradient descent. Using a gating mechanism, LSTMs are able to recognise and encode long-term patterns. LSTMs are extremely useful to solve problems where the network has to remember information for a long period of time as is the case in music and text generation.")
@@ -130,6 +123,15 @@ st.markdown("It may be possible to improve this model by playing around with the
 st.markdown("*This is based off the tutorial by Sigurður Skúli [How to Generate Music using a LSTM Neural Network in Keras](https://towardsdatascience.com/how-to-generate-music-using-a-lstm-neural-network-in-keras-68786834d4c5)*")
 st.divider()
 
+# Load notes
+notes, pitchnames, n_vocab = load_notes()
+
+# Prepare note sequences
+network_input = prepare_sequences(notes, pitchnames, n_vocab)
+
+# Load model
+model = model_load()
+
 midi_file = None
 generated_midi = None
 sample_midi = None
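
`load_notes()` relies on the fact that one pickle file can hold several objects back to back: each `pickle.dump` appends a complete, independent pickle to the stream, and each `pickle.load` consumes exactly one, so reads must mirror the write order. A sketch of the round trip (the writer side is hypothetical; only the reader appears in this repo):

```python
import pickle

notes = ['C4', 'E4', 'G4']
pitchnames = sorted(set(notes))
n_vocab = len(pitchnames)

# Hypothetical writer side: three dumps onto one file handle
with open('music_notes.pkl', 'wb') as f:
    pickle.dump(notes, f)
    pickle.dump(pitchnames, f)
    pickle.dump(n_vocab, f)

# Reader side, mirroring load_notes() above: three loads, same order
with open('music_notes.pkl', 'rb') as f:
    notes = pickle.load(f)
    pitchnames = pickle.load(f)
    n_vocab = pickle.load(f)
```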