Specification for a deep convolutional transcriber modelΒΆ

# Model specification for the ATB2 image transcriber

import tensorflow as tf

# This file provides a subclass of tf.keras.Model that serves as a
#  transcriber for the ATB2 images. It learns to make a tensor of
#  extracted digits from a tensor of the document image.

# import this file, instantiate an instance of the transcriberModel
#  class, and then either train it and save the weights, or load
#  pre-trained weights and use it for transcription.

# Model the image with hierachical convolutions and then map to output digits
class transcriberModel(tf.keras.Model):
    def __init__(self):
        # parent constructor
        super(transcriberModel, self).__init__()
        # Initial shape (1024,768,3)
        self.conv1A = tf.keras.layers.Conv2D(
            16, (3, 3), strides=(2, 2), padding="valid"
        )
        self.drop1A = tf.keras.layers.Dropout(0.3)
        self.act1A = tf.keras.layers.ELU()
        # Now (512,384,16)
        self.conv1B = tf.keras.layers.Conv2D(
            32, (3, 3), strides=(2, 2), padding="valid"
        )
        self.drop1B = tf.keras.layers.Dropout(0.3)
        self.act1B = tf.keras.layers.ELU()
        # Now (256,192,32)
        self.conv1C = tf.keras.layers.Conv2D(
            64, (3, 3), strides=(2, 2), padding="valid"
        )
        self.drop1C = tf.keras.layers.Dropout(0.3)
        self.act1C = tf.keras.layers.ELU()
        # Now (128,96,64)
        self.conv1D = tf.keras.layers.Conv2D(
            128, (3, 3), strides=(2, 2), padding="valid"
        )
        self.drop1D = tf.keras.layers.Dropout(0.3)
        self.act1D = tf.keras.layers.ELU()
        # Now (64,48,128)
        self.conv1E = tf.keras.layers.Conv2D(
            256, (3, 3), strides=(2, 2), padding="valid"
        )
        self.drop1E = tf.keras.layers.Dropout(0.3)
        self.act1E = tf.keras.layers.ELU()
        # Now (32,24,256)
        self.conv1F = tf.keras.layers.Conv2D(
            512, (3, 3), strides=(2, 2), padding="valid"
        )
        self.drop1F = tf.keras.layers.Dropout(0.3)
        self.act1F = tf.keras.layers.ELU()
        # Now (16,12,512)
        # reshape to 1d
        self.flatten = tf.keras.layers.Flatten()
        # map directly to output format (436 digits)
        self.map_to_op = tf.keras.layers.Dense(436 * 10,)
        # softmax to get digit probabilities at each location
        self.op_reshape = tf.keras.layers.Reshape(target_shape=(436, 10,))
        self.op_softmax = tf.keras.layers.Softmax(axis=2)

    def call(self, inputs):
        x = self.conv1A(inputs)
        x = self.drop1A(x)
        x = self.act1A(x)
        x = self.conv1B(x)
        x = self.drop1B(x)
        x = self.act1B(x)
        x = self.conv1C(x)
        x = self.drop1C(x)
        x = self.act1C(x)
        x = self.conv1D(x)
        x = self.drop1D(x)
        x = self.act1D(x)
        x = self.conv1E(x)
        x = self.drop1E(x)
        x = self.act1E(x)
        x = self.conv1F(x)
        x = self.drop1F(x)
        x = self.act1F(x)
        x = self.flatten(x)
        x = self.map_to_op(x)
        x = self.op_reshape(x)
        x = self.op_softmax(x)
        return x