Upload 13 files
- app/FinalApp.py +223 -0
- app/__init__.py +0 -0
- app/__pycache__/dataloader_iam.cpython-311.pyc +0 -0
- app/__pycache__/model.cpython-311.pyc +0 -0
- app/__pycache__/preprocessor.cpython-311.pyc +0 -0
- app/dataloader_iam.py +133 -0
- app/model.py +334 -0
- app/preprocessor.py +191 -0
- app/runner.py +5 -0
- app/simple.py +64 -0
- app/userInput.png +0 -0
- app/webapp.py +132 -0
- app/word.png +0 -0
app/FinalApp.py
ADDED
@@ -0,0 +1,223 @@
import os
import cv2
import numpy as np
from PIL import Image
from path import Path
import streamlit as st
from typing import Tuple
from dataloader_iam import Batch
from model import Model, DecoderType
from preprocessor import Preprocessor
from streamlit_drawable_canvas import st_canvas
import easyocr  # Import EasyOCR

# Set page config at the very beginning (only executed once)
st.set_page_config(
    page_title="HTR App",
    page_icon=":pencil:",
    layout="centered",
    initial_sidebar_state="auto",
)

ms = st.session_state
if "themes" not in ms:
    ms.themes = {"current_theme": "light",
                 "refreshed": True,

                 "light": {"theme.base": "dark",
                           "theme.backgroundColor": "black",
                           "theme.primaryColor": "#c98bdb",
                           "theme.secondaryBackgroundColor": "#5591f5",
                           "theme.textColor": "white",
                           "button_face": "🌜"},

                 "dark": {"theme.base": "light",
                          "theme.backgroundColor": "white",
                          "theme.primaryColor": "#5591f5",
                          "theme.secondaryBackgroundColor": "#82E1D7",
                          "theme.textColor": "#0a1464",
                          "button_face": "🌞"},
                 }


def ChangeTheme():
    previous_theme = ms.themes["current_theme"]
    tdict = ms.themes["light"] if ms.themes["current_theme"] == "light" else ms.themes["dark"]
    for vkey, vval in tdict.items():
        if vkey.startswith("theme"): st._config.set_option(vkey, vval)

    ms.themes["refreshed"] = False
    if previous_theme == "dark": ms.themes["current_theme"] = "light"
    elif previous_theme == "light": ms.themes["current_theme"] = "dark"


btn_face = ms.themes["light"]["button_face"] if ms.themes["current_theme"] == "light" else ms.themes["dark"]["button_face"]
st.button(btn_face, on_click=ChangeTheme)

if ms.themes["refreshed"] == False:
    ms.themes["refreshed"] = True
    st.rerun()


def get_img_size(line_mode: bool = False) -> Tuple[int, int]:
    """
    Auxiliary method that sets the height and width.
    Height is fixed while width is set according to the Model used.
    """
    if line_mode:
        return 256, get_img_height()
    return 128, get_img_height()


def get_img_height() -> int:
    """
    Auxiliary method that sets the height, which is fixed for the Neural Network.
    """
    return 32


def infer(line_mode: bool, model: Model, fn_img: Path) -> list:
    """
    Auxiliary method that does inference using the pretrained models:
    Recognizes text in an image given its path.
    """
    img = cv2.imread(fn_img, cv2.IMREAD_GRAYSCALE)
    assert img is not None

    preprocessor = Preprocessor(get_img_size(line_mode), dynamic_width=True, padding=16)
    img = preprocessor.process_img(img)

    batch = Batch([img], None, 1)
    recognized, probability = model.infer_batch(batch, True)
    return [recognized, probability]


def infer_super_model(image_path) -> Tuple[list, list]:
    reader = easyocr.Reader(['en'])  # Initialize EasyOCR reader
    result = reader.readtext(image_path)
    recognized_texts = [text[1] for text in result]  # Extract recognized texts
    probabilities = [text[2] for text in result]  # Extract probabilities
    return recognized_texts, probabilities


def main():

    st.title('Extract text from Image Demo')

    st.markdown("""
    Streamlit Web Interface for Handwritten Text Recognition (HTR), Optical Character Recognition (OCR)
    implemented with TensorFlow and trained on the IAM off-line HTR dataset.
    The model takes images of single words or text lines (multiple words) as input and outputs the recognized text.
    """, unsafe_allow_html=True)

    st.markdown("""
    Predictions can be made using one of the following models:
    - Single_Model (Trained on Single Word Images)
    - Line_Model (Trained on Text Line Images)
    - Super_Model (Most Robust Option for English)
    - Burmese (Link)
    """, unsafe_allow_html=True)

    st.subheader('Select a Model, Choose the Arguments and Draw in the box below or Upload an Image to obtain a prediction.')

    # Selectors for the model and decoder
    modelSelect = st.selectbox("Select a Model", ['Single_Model', 'Line_Model', 'Super_Model'])

    if modelSelect != 'Super_Model':
        decoderSelect = st.selectbox("Select a Decoder", ['Bestpath', 'Beamsearch', 'Wordbeamsearch'])

    # Mappings (dictionaries) for the model and decoder. Assigns the directory or the DecoderType of the selected option.
    modelMapping = {
        "Single_Model": '../model/word-model',
        "Line_Model": '../model/line-model'
    }

    decoderMapping = {
        'Bestpath': DecoderType.BestPath,
        'Beamsearch': DecoderType.BeamSearch,
        'Wordbeamsearch': DecoderType.WordBeamSearch
    }

    # Slider for pencil width
    strokeWidth = st.slider("Stroke Width: ", 1, 25, 6)

    # Canvas/Text Box for user input. Background color must be white (#FFFFFF) or else text will not be properly recognised.
    inputDrawn = st_canvas(
        fill_color="rgba(255, 165, 0, 0.3)",
        stroke_width=strokeWidth,
        update_streamlit=True,
        background_image=None,
        height=200,
        width=400,
        drawing_mode='freedraw',
        key="canvas",
        background_color='#FFFFFF'
    )

    # Buffer for user input (images uploaded from the user's device)
    inputBuffer = st.file_uploader("Upload an Image", type=["png"])

    # Inference Button
    inferBool = st.button("Recognize Text")

    # After clicking the "Recognize Text" button, check if the model selected is Super_Model
    if inferBool:
        if modelSelect == 'Super_Model':
            inputArray = None  # Initialize inputArray to None

            # Handling uploaded file
            if inputBuffer is not None:
                with Image.open(inputBuffer).convert('RGB') as img:
                    inputArray = np.array(img)

            # Handling canvas data
            elif inputDrawn.image_data is not None:
                # Convert RGBA to RGB
                inputArray = cv2.cvtColor(np.array(inputDrawn.image_data, dtype=np.uint8), cv2.COLOR_RGBA2RGB)

            # Now check if inputArray has been set
            if inputArray is not None:
                # Initialize EasyOCR Reader
                reader = easyocr.Reader(['en'])  # Assuming English language; adjust as necessary
                # Perform OCR
                results = reader.readtext(inputArray)

                # Display results
                all_text = ''
                for (bbox, text, prob) in results:
                    all_text += f'{text} (confidence: {prob:.2f})\n'

                st.write("**Recognized Texts and their Confidence Scores:**")
                st.text(all_text)
            else:
                st.write("No image data found. Please upload an image or draw on the canvas.")

        else:
            # Handle other model selections as before
            if ((inputDrawn.image_data is not None or inputBuffer is not None) and inferBool == True):
                # We turn the input into a numpy array
                if inputDrawn.image_data is not None:
                    inputArray = np.array(inputDrawn.image_data)

                if inputBuffer is not None:
                    inputBufferImage = Image.open(inputBuffer)
                    inputArray = np.array(inputBufferImage)

                # We turn this array into a .png format and save it.
                inputImage = Image.fromarray(inputArray.astype('uint8'), 'RGBA')
                inputImage.save('userInput.png')

                # We obtain the model directory and the decoder type from their mapping
                modelDir = modelMapping[modelSelect]
                decoderType = decoderMapping[decoderSelect]

                # Finally, we call the model with this image as attribute and display the Best Candidate and its probability on the Interface
                model = Model(list(open(modelDir + "/charList.txt").read()), modelDir, decoderType, must_restore=True)
                inferedText = infer(modelDir == '../model/line-model', model, 'userInput.png')

                st.write("**Best Candidate: **", inferedText[0][0])
                st.write("**Probability: **", str(inferedText[1][0]*100) + "%")


if __name__ == "__main__":
    main()
app/__init__.py
ADDED
File without changes
app/__pycache__/dataloader_iam.cpython-311.pyc
ADDED
Binary file (8.17 kB)
app/__pycache__/model.cpython-311.pyc
ADDED
Binary file (19.5 kB)
app/__pycache__/preprocessor.cpython-311.pyc
ADDED
Binary file (10.7 kB)
app/dataloader_iam.py
ADDED
@@ -0,0 +1,133 @@
import pickle
import random
from collections import namedtuple
from typing import Tuple

import cv2
import lmdb  # LMDB image cache used when fast=True
import numpy as np
from path import Path

Sample = namedtuple('Sample', 'gt_text, file_path')
Batch = namedtuple('Batch', 'imgs, gt_texts, batch_size')


class DataLoaderIAM:
    """
    Loads data which corresponds to IAM format,
    see: http://www.fki.inf.unibe.ch/databases/iam-handwriting-database
    """

    def __init__(self,
                 data_dir: Path,
                 batch_size: int,
                 data_split: float = 0.95,
                 fast: bool = True) -> None:
        """Loader for dataset."""

        assert data_dir.exists()

        self.fast = fast
        if fast:
            self.env = lmdb.open(str(data_dir / 'lmdb'), readonly=True)

        self.data_augmentation = False
        self.curr_idx = 0
        self.batch_size = batch_size
        self.samples = []

        f = open(data_dir / 'gt/words.txt')
        chars = set()
        bad_samples_reference = ['a01-117-05-02', 'r06-022-03-05']  # known broken images in IAM dataset
        for line in f:
            # ignore comment line
            if not line or line[0] == '#':
                continue

            line_split = line.strip().split(' ')
            assert len(line_split) >= 9

            # filename: part1-part2-part3 --> part1/part1-part2/part1-part2-part3.png
            file_name_split = line_split[0].split('-')
            file_name_subdir1 = file_name_split[0]
            file_name_subdir2 = f'{file_name_split[0]}-{file_name_split[1]}'
            file_base_name = line_split[0] + '.png'
            file_name = data_dir / 'img' / file_name_subdir1 / file_name_subdir2 / file_base_name

            if line_split[0] in bad_samples_reference:
                print('Ignoring known broken image:', file_name)
                continue

            # GT text are columns starting at 9
            gt_text = ' '.join(line_split[8:])
            chars = chars.union(set(list(gt_text)))

            # put sample into list
            self.samples.append(Sample(gt_text, file_name))

        # split into training and validation set: 95% - 5%
        split_idx = int(data_split * len(self.samples))
        self.train_samples = self.samples[:split_idx]
        self.validation_samples = self.samples[split_idx:]

        # put words into lists
        self.train_words = [x.gt_text for x in self.train_samples]
        self.validation_words = [x.gt_text for x in self.validation_samples]

        # start with train set
        self.train_set()

        # list of all chars in dataset
        self.char_list = sorted(list(chars))

    def train_set(self) -> None:
        """Switch to randomly chosen subset of training set."""
        self.data_augmentation = True
        self.curr_idx = 0
        random.shuffle(self.train_samples)
        self.samples = self.train_samples
        self.curr_set = 'train'

    def validation_set(self) -> None:
        """Switch to validation set."""
        self.data_augmentation = False
        self.curr_idx = 0
        self.samples = self.validation_samples
        self.curr_set = 'val'

    def get_iterator_info(self) -> Tuple[int, int]:
        """Current batch index and overall number of batches."""
        if self.curr_set == 'train':
            num_batches = int(np.floor(len(self.samples) / self.batch_size))  # train set: only full-sized batches
        else:
            num_batches = int(np.ceil(len(self.samples) / self.batch_size))  # val set: allow last batch to be smaller
        curr_batch = self.curr_idx // self.batch_size + 1
        return curr_batch, num_batches

    def has_next(self) -> bool:
        """Is there a next element?"""
        if self.curr_set == 'train':
            return self.curr_idx + self.batch_size <= len(self.samples)  # train set: only full-sized batches
        else:
            return self.curr_idx < len(self.samples)  # val set: allow last batch to be smaller

    def _get_img(self, i: int) -> np.ndarray:
        if self.fast:
            with self.env.begin() as txn:
                basename = Path(self.samples[i].file_path).basename()
                data = txn.get(basename.encode("ascii"))
                img = pickle.loads(data)
        else:
            img = cv2.imread(self.samples[i].file_path, cv2.IMREAD_GRAYSCALE)

        return img

    def get_next(self) -> Batch:
        """Get next element."""
        batch_range = range(self.curr_idx, min(self.curr_idx + self.batch_size, len(self.samples)))

        imgs = [self._get_img(i) for i in batch_range]
        gt_texts = [self.samples[i].gt_text for i in batch_range]

        self.curr_idx += self.batch_size
        return Batch(imgs, gt_texts, len(imgs))
app/model.py
ADDED
@@ -0,0 +1,334 @@
import os
import sys
from typing import List, Tuple
import tf_keras as keras
import numpy as np
from dataloader_iam import Batch

import tensorflow.compat.v1 as tf
tf.compat.v1.disable_v2_behavior()

# Disable eager mode
tf.compat.v1.disable_eager_execution()

class DecoderType:
    """
    CTC decoder types.
    """
    BestPath = 0
    BeamSearch = 1
    WordBeamSearch = 2


class Model:
    """
    Minimalistic TF model for HTR.
    """

    def __init__(self,
                 char_list: List[str],
                 model_dir: str,
                 decoder_type: str = DecoderType.BestPath,
                 must_restore: bool = False,
                 dump: bool = False) -> None:
        """
        Init model: add CNN, RNN and CTC and initialize TF.
        """
        self.dump = dump
        self.char_list = char_list
        self.decoder_type = decoder_type
        self.must_restore = must_restore
        self.snap_ID = 0
        self.model_dir = model_dir

        tf.compat.v1.disable_eager_execution()
        # Whether to use normalization over a batch or a population
        self.is_train = tf.compat.v1.placeholder(tf.bool, name='is_train')

        # input image batch
        self.input_imgs = tf.compat.v1.placeholder(tf.float32, shape=(None, None, None))

        # setup CNN, RNN and CTC
        self.setup_cnn()
        self.setup_rnn()
        self.setup_ctc()

        # setup optimizer to train NN
        self.batches_trained = 0
        self.update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(self.update_ops):
            self.optimizer = tf.compat.v1.train.AdamOptimizer().minimize(self.loss)

        # initialize TF
        self.sess, self.saver = self.setup_tf()

    def setup_cnn(self) -> None:
        """
        Create CNN layers.
        """
        cnn_in4d = tf.expand_dims(input=self.input_imgs, axis=3)

        # list of parameters for the layers
        kernel_vals = [5, 5, 3, 3, 3]
        feature_vals = [1, 32, 64, 128, 128, 256]
        stride_vals = pool_vals = [(2, 2), (2, 2), (1, 2), (1, 2), (1, 2)]
        num_layers = len(stride_vals)

        # create layers
        pool = cnn_in4d  # input to first CNN layer
        for i in range(num_layers):
            kernel = tf.Variable(
                tf.random.truncated_normal([kernel_vals[i], kernel_vals[i], feature_vals[i], feature_vals[i + 1]],
                                           stddev=0.1))
            conv = tf.nn.conv2d(input=pool, filters=kernel, padding='SAME', strides=(1, 1, 1, 1))
            conv_norm = tf.keras.layers.BatchNormalization()(conv, training=self.is_train)
            relu = tf.nn.relu(conv_norm)
            pool = tf.nn.max_pool2d(input=relu, ksize=(1, pool_vals[i][0], pool_vals[i][1], 1),
                                    strides=(1, stride_vals[i][0], stride_vals[i][1], 1), padding='VALID')

        self.cnn_out_4d = pool

    def setup_rnn(self) -> None:
        """
        Create RNN layers.
        """
        rnn_in3d = tf.squeeze(self.cnn_out_4d, axis=[2])

        # basic cells which are used to build RNN
        num_hidden = 256
        cells = [tf.compat.v1.nn.rnn_cell.LSTMCell(num_units=num_hidden, state_is_tuple=True) for _ in
                 range(2)]  # 2 layers

        # stack basic cells
        stacked = tf.compat.v1.nn.rnn_cell.MultiRNNCell(cells, state_is_tuple=True)

        # bidirectional RNN
        # BxTxF -> BxTx2H
        (fw, bw), _ = tf.compat.v1.nn.bidirectional_dynamic_rnn(cell_fw=stacked, cell_bw=stacked, inputs=rnn_in3d,
                                                                dtype=rnn_in3d.dtype)

        # BxTxH + BxTxH -> BxTx2H -> BxTx1X2H
        concat = tf.expand_dims(tf.concat([fw, bw], 2), 2)

        # project output to chars (including blank): BxTx1x2H -> BxTx1xC -> BxTxC
        kernel = tf.Variable(tf.random.truncated_normal([1, 1, num_hidden * 2, len(self.char_list) + 1], stddev=0.1))
        self.rnn_out_3d = tf.squeeze(tf.nn.atrous_conv2d(value=concat, filters=kernel, rate=1, padding='SAME'),
                                     axis=[2])

    def setup_ctc(self) -> None:
        """
        Create CTC loss and decoder.
        """
        # BxTxC -> TxBxC
        self.ctc_in_3d_tbc = tf.transpose(a=self.rnn_out_3d, perm=[1, 0, 2])
        # ground truth text as sparse tensor
        self.gt_texts = tf.SparseTensor(tf.compat.v1.placeholder(tf.int64, shape=[None, 2]),
                                        tf.compat.v1.placeholder(tf.int32, [None]),
                                        tf.compat.v1.placeholder(tf.int64, [2]))

        # calc loss for batch
        self.seq_len = tf.compat.v1.placeholder(tf.int32, [None])
        self.loss = tf.reduce_mean(
            input_tensor=tf.compat.v1.nn.ctc_loss(labels=self.gt_texts, inputs=self.ctc_in_3d_tbc,
                                                  sequence_length=self.seq_len,
                                                  ctc_merge_repeated=True))

        # calc loss for each element to compute label probability
        self.saved_ctc_input = tf.compat.v1.placeholder(tf.float32,
                                                        shape=[None, None, len(self.char_list) + 1])
        self.loss_per_element = tf.compat.v1.nn.ctc_loss(labels=self.gt_texts, inputs=self.saved_ctc_input,
                                                         sequence_length=self.seq_len, ctc_merge_repeated=True)

        # best path decoding or beam search decoding
        if self.decoder_type == DecoderType.BestPath:
            self.decoder = tf.nn.ctc_greedy_decoder(inputs=self.ctc_in_3d_tbc, sequence_length=self.seq_len)
        elif self.decoder_type == DecoderType.BeamSearch:
            self.decoder = tf.nn.ctc_beam_search_decoder(inputs=self.ctc_in_3d_tbc, sequence_length=self.seq_len,
                                                         beam_width=50)
        # word beam search decoding (see https://github.com/githubharald/CTCWordBeamSearch)
        elif self.decoder_type == DecoderType.WordBeamSearch:
            # prepare information about language (dictionary, characters in dataset, characters forming words)
            chars = ''.join(self.char_list)
            word_chars = open('../model/wordCharList.txt').read().splitlines()[0]
            corpus = open('../data/corpus.txt').read()

            # decode using the "Words" mode of word beam search
            from word_beam_search import WordBeamSearch
            self.decoder = WordBeamSearch(50, 'Words', 0.0, corpus.encode('utf8'), chars.encode('utf8'),
                                          word_chars.encode('utf8'))

            # the input to the decoder must have softmax already applied
            self.wbs_input = tf.nn.softmax(self.ctc_in_3d_tbc, axis=2)

    def setup_tf(self) -> Tuple[tf.compat.v1.Session, tf.compat.v1.train.Saver]:
        """
        Initialize TF.
        """
        print('Python: ' + sys.version)
        print('Tensorflow: ' + tf.__version__)

        sess = tf.compat.v1.Session()  # TF session

        saver = tf.compat.v1.train.Saver(max_to_keep=1)  # saver saves model to file
        latest_snapshot = tf.train.latest_checkpoint(self.model_dir)  # is there a saved model?

        # if model must be restored (for inference), there must be a snapshot
        if self.must_restore and not latest_snapshot:
            raise Exception('No saved model found in: ' + self.model_dir)

        # load saved model if available
        if latest_snapshot:
            print('Init with stored values from ' + latest_snapshot)
            saver.restore(sess, latest_snapshot)
        else:
            print('Init with new values')
            sess.run(tf.compat.v1.global_variables_initializer())

        return sess, saver

    def to_sparse(self, texts: List[str]) -> Tuple[List[List[int]], List[int], List[int]]:
        """
        Put ground truth texts into sparse tensor for ctc_loss.
        """
        indices = []
        values = []
        shape = [len(texts), 0]  # last entry must be max(labelList[i])

        # go over all texts
        for batchElement, text in enumerate(texts):
            # convert to string of label (i.e. class-ids)
            label_str = [self.char_list.index(c) for c in text]
            # sparse tensor must have size of max. label-string
            if len(label_str) > shape[1]:
                shape[1] = len(label_str)
            # put each label into sparse tensor
            for i, label in enumerate(label_str):
                indices.append([batchElement, i])
                values.append(label)

        return indices, values, shape

    def decoder_output_to_text(self, ctc_output: tuple, batch_size: int) -> List[str]:
        """
        Extract texts from output of CTC decoder.
        """

        # word beam search: already contains label strings
        if self.decoder_type == DecoderType.WordBeamSearch:
            label_strs = ctc_output

        # TF decoders: label strings are contained in sparse tensor
        else:
            # ctc returns tuple, first element is SparseTensor
            decoded = ctc_output[0][0]

            # contains string of labels for each batch element
            label_strs = [[] for _ in range(batch_size)]

            # go over all indices and save mapping: batch -> values
            for (idx, idx2d) in enumerate(decoded.indices):
                label = decoded.values[idx]
                batch_element = idx2d[0]  # index according to [b,t]
                label_strs[batch_element].append(label)

        # map labels to chars for all batch elements
        return [''.join([self.char_list[c] for c in labelStr]) for labelStr in label_strs]

    def train_batch(self, batch: Batch) -> float:
        """
        Feed a batch into the NN to train it.
        """
        num_batch_elements = len(batch.imgs)
        max_text_len = batch.imgs[0].shape[0] // 4
        sparse = self.to_sparse(batch.gt_texts)
        eval_list = [self.optimizer, self.loss]
        feed_dict = {self.input_imgs: batch.imgs, self.gt_texts: sparse,
                     self.seq_len: [max_text_len] * num_batch_elements, self.is_train: True}
        _, loss_val = self.sess.run(eval_list, feed_dict)
        self.batches_trained += 1
        return loss_val

    @staticmethod
    def dump_nn_output(rnn_output: np.ndarray) -> None:
        """
        Dump the output of the NN to CSV file(s).
        """
        dump_dir = '../dump/'
        if not os.path.isdir(dump_dir):
            os.mkdir(dump_dir)

        # iterate over all batch elements and create a CSV file for each one
        max_t, max_b, max_c = rnn_output.shape
        for b in range(max_b):
            csv = ''
            for t in range(max_t):
                for c in range(max_c):
                    csv += str(rnn_output[t, b, c]) + ';'
                csv += '\n'
            fn = dump_dir + 'rnnOutput_' + str(b) + '.csv'
            print('Write dump of NN to file: ' + fn)
            with open(fn, 'w') as f:
                f.write(csv)

    def infer_batch(self, batch: Batch, calc_probability: bool = False, probability_of_gt: bool = False):
        """
        Feed a batch into the NN to recognize the texts.
        """

        # decode, optionally save RNN output
        num_batch_elements = len(batch.imgs)

        # put tensors to be evaluated into list
        eval_list = []

        if self.decoder_type == DecoderType.WordBeamSearch:
            eval_list.append(self.wbs_input)
        else:
            eval_list.append(self.decoder)

        if self.dump or calc_probability:
            eval_list.append(self.ctc_in_3d_tbc)

        # sequence length depends on input image size (model downsizes width by 4)
        max_text_len = batch.imgs[0].shape[0] // 4

        # dict containing all tensor fed into the model
        feed_dict = {self.input_imgs: batch.imgs, self.seq_len: [max_text_len] * num_batch_elements,
                     self.is_train: False}

        # evaluate model
        eval_res = self.sess.run(eval_list, feed_dict)

        # TF decoders: decoding already done in TF graph
        if self.decoder_type != DecoderType.WordBeamSearch:
            decoded = eval_res[0]
        # word beam search decoder: decoding is done in C++ function compute()
        else:
            decoded = self.decoder.compute(eval_res[0])

        # map labels (numbers) to character string
        texts = self.decoder_output_to_text(decoded, num_batch_elements)

        # feed RNN output and recognized text into CTC loss to compute labeling probability
        probs = None
        if calc_probability:
            sparse = self.to_sparse(batch.gt_texts) if probability_of_gt else self.to_sparse(texts)
            ctc_input = eval_res[1]
            eval_list = self.loss_per_element
            feed_dict = {self.saved_ctc_input: ctc_input, self.gt_texts: sparse,
                         self.seq_len: [max_text_len] * num_batch_elements, self.is_train: False}
            loss_vals = self.sess.run(eval_list, feed_dict)
            probs = np.exp(-loss_vals)

        # dump the output of the NN to CSV file(s)
        if self.dump:
            self.dump_nn_output(eval_res[1])

        return texts, probs

    def save(self) -> None:
        """
        Save model to file.
        """
        self.snap_ID += 1
        self.saver.save(self.sess, '../model/snapshot', global_step=self.snap_ID)
app/preprocessor.py
ADDED
@@ -0,0 +1,191 @@
import random
from typing import Tuple

import cv2
import numpy as np

from dataloader_iam import Batch


class Preprocessor:
    def __init__(self,
                 img_size: Tuple[int, int],
                 padding: int = 0,
                 dynamic_width: bool = False,
                 data_augmentation: bool = False,
                 line_mode: bool = False) -> None:
        # dynamic width only supported when no data augmentation happens
        assert not (dynamic_width and data_augmentation)
        # when padding is on, we need dynamic width enabled
        assert not (padding > 0 and not dynamic_width)

        self.img_size = img_size
        self.padding = padding
        self.dynamic_width = dynamic_width
        self.data_augmentation = data_augmentation
        self.line_mode = line_mode

    @staticmethod
    def _truncate_label(text: str, max_text_len: int) -> str:
        """
        Function ctc_loss can't compute loss if it cannot find a mapping between text label and input
        labels. Repeat letters cost double because of the blank symbol needing to be inserted.
        If a too-long label is provided, ctc_loss returns an infinite gradient.
        """
        cost = 0
        for i in range(len(text)):
            if i != 0 and text[i] == text[i - 1]:
                cost += 2
            else:
                cost += 1
            if cost > max_text_len:
                return text[:i]
        return text

    def _simulate_text_line(self, batch: Batch) -> Batch:
        """Create image of a text line by pasting multiple word images into an image."""

        default_word_sep = 30
        default_num_words = 5

        # go over all batch elements
        res_imgs = []
        res_gt_texts = []
        for i in range(batch.batch_size):
            # number of words to put into current line
            num_words = random.randint(1, 8) if self.data_augmentation else default_num_words

            # concat ground truth texts
            curr_gt = ' '.join([batch.gt_texts[(i + j) % batch.batch_size] for j in range(num_words)])
            res_gt_texts.append(curr_gt)

            # put selected word images into list, compute target image size
            sel_imgs = []
            word_seps = [0]
            h = 0
            w = 0
            for j in range(num_words):
                curr_sel_img = batch.imgs[(i + j) % batch.batch_size]
                curr_word_sep = random.randint(20, 50) if self.data_augmentation else default_word_sep
                h = max(h, curr_sel_img.shape[0])
                w += curr_sel_img.shape[1]
                sel_imgs.append(curr_sel_img)
                if j + 1 < num_words:
                    w += curr_word_sep
                    word_seps.append(curr_word_sep)

            # put all selected word images into target image
            target = np.ones([h, w], np.uint8) * 255
            x = 0
            for curr_sel_img, curr_word_sep in zip(sel_imgs, word_seps):
                x += curr_word_sep
                y = (h - curr_sel_img.shape[0]) // 2
                target[y:y + curr_sel_img.shape[0], x:x + curr_sel_img.shape[1]] = curr_sel_img
                x += curr_sel_img.shape[1]

            # put image of line into result
            res_imgs.append(target)

        return Batch(res_imgs, res_gt_texts, batch.batch_size)

    def process_img(self, img: np.ndarray) -> np.ndarray:
        """Resize to target size, apply data augmentation."""

        # there are damaged files in IAM dataset - just use black image instead
        if img is None:
            img = np.zeros(self.img_size[::-1])

        # data augmentation
        img = img.astype(float)
        if self.data_augmentation:
            # photometric data augmentation
            if random.random() < 0.25:
                def rand_odd():
                    return random.randint(1, 3) * 2 + 1
                img = cv2.GaussianBlur(img, (rand_odd(), rand_odd()), 0)
            if random.random() < 0.25:
                img = cv2.dilate(img, np.ones((3, 3)))
            if random.random() < 0.25:
                img = cv2.erode(img, np.ones((3, 3)))

            # geometric data augmentation
            wt, ht = self.img_size
            h, w = img.shape
            f = min(wt / w, ht / h)
            fx = f * np.random.uniform(0.75, 1.05)
            fy = f * np.random.uniform(0.75, 1.05)

            # random position around center
            txc = (wt - w * fx) / 2
            tyc = (ht - h * fy) / 2
            freedom_x = max((wt - fx * w) / 2, 0)
            freedom_y = max((ht - fy * h) / 2, 0)
            tx = txc + np.random.uniform(-freedom_x, freedom_x)
            ty = tyc + np.random.uniform(-freedom_y, freedom_y)

            # map image into target image
            M = np.float32([[fx, 0, tx], [0, fy, ty]])
            target = np.ones(self.img_size[::-1]) * 255
            img = cv2.warpAffine(img, M, dsize=self.img_size, dst=target, borderMode=cv2.BORDER_TRANSPARENT)

            # photometric data augmentation
            if random.random() < 0.5:
                img = img * (0.25 + random.random() * 0.75)
            if random.random() < 0.25:
                img = np.clip(img + (np.random.random(img.shape) - 0.5) * random.randint(1, 25), 0, 255)
            if random.random() < 0.1:
                img = 255 - img

        # no data augmentation
        else:
            if self.dynamic_width:
                ht = self.img_size[1]
                h, w = img.shape
                f = ht / h
                wt = int(f * w + self.padding)
                wt = wt + (4 - wt) % 4
                tx = (wt - w * f) / 2
                ty = 0
            else:
                wt, ht = self.img_size
                h, w = img.shape
                f = min(wt / w, ht / h)
                tx = (wt - w * f) / 2
                ty = (ht - h * f) / 2

            # map image into target image
            M = np.float32([[f, 0, tx], [0, f, ty]])
            target = np.ones([ht, wt]) * 255
            img = cv2.warpAffine(img, M, dsize=(wt, ht), dst=target, borderMode=cv2.BORDER_TRANSPARENT)

        # transpose for TF
        img = cv2.transpose(img)

        # convert to range [-1, 1]
        img = img / 255 - 0.5
        return img

    def process_batch(self, batch: Batch) -> Batch:
        if self.line_mode:
            batch = self._simulate_text_line(batch)

        res_imgs = [self.process_img(img) for img in batch.imgs]
        max_text_len = res_imgs[0].shape[0] // 4
        res_gt_texts = [self._truncate_label(gt_text, max_text_len) for gt_text in batch.gt_texts]
        return Batch(res_imgs, res_gt_texts, batch.batch_size)


def main():
    import matplotlib.pyplot as plt

    img = cv2.imread('../data/test.png', cv2.IMREAD_GRAYSCALE)
    img_aug = Preprocessor((256, 32), data_augmentation=True).process_img(img)
    plt.subplot(121)
    plt.imshow(img, cmap='gray')
    plt.subplot(122)
    plt.imshow(cv2.transpose(img_aug) + 0.5, cmap='gray', vmin=0, vmax=1)
    plt.show()


if __name__ == '__main__':
    main()
app/runner.py
ADDED
@@ -0,0 +1,5 @@
import os
import subprocess

fileDir = os.path.dirname(os.path.realpath(__file__))
subprocess.run(["streamlit", "run", "webapp.py"], cwd=fileDir)
app/simple.py
ADDED
@@ -0,0 +1,64 @@
import cv2
import numpy as np
from pathlib import Path
from model import Model, DecoderType
from preprocessor import Preprocessor
from dataloader_iam import Batch

import tensorflow as tf

def get_img_size(line_mode: bool = False) -> tuple[int, int]:
    """
    Auxiliary method that sets the height and width.
    Height is fixed while width is set according to the Model used.
    """
    if line_mode:
        return 256, get_img_height()
    return 128, get_img_height()

def get_img_height() -> int:
    """
    Auxiliary method that sets the fixed height for the Neural Network.
    """
    return 32

def infer(line_mode: bool, model: Model, fn_img: str) -> tuple:
    """
    Auxiliary method that does inference using the pretrained models:
    Recognizes text in an image given its path.
    """
    img = cv2.imread(fn_img, cv2.IMREAD_GRAYSCALE)
    assert img is not None

    preprocessor = Preprocessor(get_img_size(line_mode), dynamic_width=True, padding=16)
    img = preprocessor.process_img(img)

    batch = Batch([img], None, 1)
    recognized, probability = model.infer_batch(batch, True)
    return recognized, probability

def main(image_path: str, model_path: str, decoder_type: DecoderType):
    """
    Main function to load the model, perform inference on the input image,
    and print the result.
    """
    # Load the model
    char_list_path = model_path + "/charList.txt"
    model = Model(list(open(char_list_path).read()), model_path, decoder_type, must_restore=True)

    # Perform inference
    recognized, probability = infer(model_path.endswith('line-model'), model, image_path)

    # Print the results
    print("Recognized Text:", recognized[0])
    print("Probability:", probability[0])

if __name__ == "__main__":
    # Example usage
    # Define the image path, model directory, and decoder type here
    image_path = 'word.png'  # Update this path
    model_path = '../model/word-model'  # or '../model/line-model' depending on your model
    decoder_type = DecoderType.BestPath  # Change as needed: BestPath, BeamSearch, WordBeamSearch

    # Call the main function with the specified parameters
    main(image_path, model_path, decoder_type)
app/userInput.png
ADDED
app/webapp.py
ADDED
@@ -0,0 +1,132 @@
import os
import cv2
import numpy as np
from PIL import Image
from path import Path
import streamlit as st
from typing import Tuple
from dataloader_iam import Batch
from model import Model, DecoderType
from preprocessor import Preprocessor
from streamlit_drawable_canvas import st_canvas


def get_img_size(line_mode: bool = False) -> Tuple[int, int]:
    """
    Auxiliary method that sets the height and width.
    Height is fixed while width is set according to the Model used.
    """
    if line_mode:
        return 256, get_img_height()
    return 128, get_img_height()

def get_img_height() -> int:
    """
    Auxiliary method that sets the height, which is fixed for the Neural Network.
    """
    return 32

def infer(line_mode: bool, model: Model, fn_img: Path) -> list:
    """
    Auxiliary method that does inference using the pretrained models:
    Recognizes text in an image given its path.
    """
    img = cv2.imread(fn_img, cv2.IMREAD_GRAYSCALE)
    assert img is not None

    preprocessor = Preprocessor(get_img_size(line_mode), dynamic_width=True, padding=16)
    img = preprocessor.process_img(img)

    batch = Batch([img], None, 1)
    recognized, probability = model.infer_batch(batch, True)
    return [recognized, probability]

def main():

    # Website properties
    st.set_page_config(
        page_title="HTR App",
        page_icon=":pencil:",
        layout="centered",
        initial_sidebar_state="auto",
    )

    st.title('HTR Simple Application')

    st.markdown("""
    Streamlit Web Interface for Handwritten Text Recognition (HTR), implemented with TensorFlow and trained on the IAM off-line HTR dataset. The model takes images of single words or text lines (multiple words) as input and outputs the recognized text.
    """, unsafe_allow_html=True)

    st.markdown("""
    Predictions can be made using one of two models:
    - [Model 1](https://www.dropbox.com/s/mya8hw6jyzqm0a3/word-model.zip?dl=1) (Trained on Single Word Images)
    - [Model 2](https://www.dropbox.com/s/7xwkcilho10rthn/line-model.zip?dl=1) (Trained on Text Line Images)
    """, unsafe_allow_html=True)

    st.subheader('Select a Model, Choose the Arguments and Draw in the box below or Upload an Image to obtain a prediction.')

    # Selectors for the model and decoder
    modelSelect = st.selectbox("Select a Model", ['Single_Model', 'Line_Model'])

    decoderSelect = st.selectbox("Select a Decoder", ['Bestpath', 'Beamsearch', 'Wordbeamsearch'])

    # Mappings (dictionaries) for the model and decoder. Assigns the directory or the DecoderType of the selected option.
    modelMapping = {
        "Single_Model": '../model/word-model',
        "Line_Model": '../model/line-model'
    }

    decoderMapping = {
        'Bestpath': DecoderType.BestPath,
        'Beamsearch': DecoderType.BeamSearch,
        'Wordbeamsearch': DecoderType.WordBeamSearch
    }

    # Slider for pencil width
    strokeWidth = st.slider("Stroke Width: ", 1, 25, 6)

    # Canvas/Text Box for user input. Background color must be white (#FFFFFF) or else text will not be properly recognised.
    inputDrawn = st_canvas(
        fill_color="rgba(255, 165, 0, 0.3)",
        stroke_width=strokeWidth,
        update_streamlit=True,
        height=200,
        width=400,
        drawing_mode='freedraw',
        key="canvas",
        background_color='#FFFFFF'
    )

    # Buffer for user input (images uploaded from the user's device)
    inputBuffer = st.file_uploader("Upload an Image", type=["png"])

    # Infer Button
    inferBool = st.button("Recognize Word")

    # We start inferring once we have the user input and the Infer button is pressed.
    if ((inputDrawn.image_data is not None or inputBuffer is not None) and inferBool == True):

        # We turn the input into a numpy array
        if inputDrawn.image_data is not None:
            inputArray = np.array(inputDrawn.image_data)

        if inputBuffer is not None:
            inputBufferImage = Image.open(inputBuffer)
            inputArray = np.array(inputBufferImage)

        # We turn this array into a .png format and save it.
        inputImage = Image.fromarray(inputArray.astype('uint8'), 'RGBA')
        inputImage.save('userInput.png')

        # We obtain the model directory and the decoder type from their mapping
        modelDir = modelMapping[modelSelect]
        decoderType = decoderMapping[decoderSelect]

        # Finally, we call the model with this image as attribute and display the Best Candidate and its probability on the Interface
        model = Model(list(open(modelDir + "/charList.txt").read()), modelDir, decoderType, must_restore=True)
        inferedText = infer(modelDir == '../model/line-model', model, 'userInput.png')

        st.write("**Best Candidate: **", inferedText[0][0])
        st.write("**Probability: **", str(inferedText[1][0]*100) + "%")

if __name__ == "__main__":
    main()
app/word.png
ADDED