diff --git a/examples/vision/captcha_ocr.py b/examples/vision/captcha_ocr.py index fc4430b1f3..740fdebc6b 100644 --- a/examples/vision/captcha_ocr.py +++ b/examples/vision/captcha_ocr.py @@ -22,6 +22,10 @@ ## Setup """ +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + import os import numpy as np import matplotlib.pyplot as plt @@ -30,9 +34,8 @@ from collections import Counter import tensorflow as tf -from tensorflow import keras -from tensorflow.keras import layers - +import keras +from keras import layers """ ## Load the data: [Captcha Images](https://www.kaggle.com/fournierp/captcha-version-2-images) @@ -180,10 +183,64 @@ def encode_single_sample(img_path, label): """ +def ctc_batch_cost(y_true, y_pred, input_length, label_length): + label_length = tf.cast(tf.squeeze(label_length, axis=-1), tf.int32) + input_length = tf.cast(tf.squeeze(input_length, axis=-1), tf.int32) + sparse_labels = tf.cast(ctc_label_dense_to_sparse(y_true, label_length), tf.int32) + + y_pred = tf.math.log(tf.transpose(y_pred, perm=[1, 0, 2]) + keras.backend.epsilon()) + + return tf.expand_dims( + tf.compat.v1.nn.ctc_loss( + inputs=y_pred, labels=sparse_labels, sequence_length=input_length + ), + 1, + ) + + +def ctc_label_dense_to_sparse(labels, label_lengths): + label_shape = tf.shape(labels) + num_batches_tns = tf.stack([label_shape[0]]) + max_num_labels_tns = tf.stack([label_shape[1]]) + + def range_less_than(old_input, current_input): + return tf.expand_dims(tf.range(tf.shape(old_input)[1]), 0) < tf.fill( + max_num_labels_tns, current_input + ) + + init = tf.cast(tf.fill([1, label_shape[1]], 0), tf.bool) + dense_mask = tf.compat.v1.scan( + range_less_than, label_lengths, initializer=init, parallel_iterations=1 + ) + dense_mask = dense_mask[:, 0, :] + + label_array = tf.reshape( + tf.tile(tf.range(0, label_shape[1]), num_batches_tns), label_shape + ) + label_ind = tf.compat.v1.boolean_mask(label_array, dense_mask) + + batch_array = tf.transpose( + tf.reshape( + tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns), + tf.reverse(label_shape, [0]), + ) + ) + batch_ind = tf.compat.v1.boolean_mask(batch_array, dense_mask) + indices = tf.transpose( + tf.reshape(tf.concat([batch_ind, label_ind], axis=0), [2, -1]) + ) + + vals_sparse = tf.compat.v1.gather_nd(labels, indices) + + return tf.SparseTensor( + tf.cast(indices, tf.int64), vals_sparse, tf.cast(label_shape, tf.int64) + ) + + class CTCLayer(layers.Layer): def __init__(self, name=None): super().__init__(name=name) - self.loss_fn = keras.backend.ctc_batch_cost + self.loss_fn = ctc_batch_cost def call(self, y_true, y_pred): # Compute the training-time loss value and add it @@ -272,7 +329,8 @@ def build_model(): """ -epochs = 100 +# TODO restore epoch count. 
+epochs = 2 early_stopping_patience = 10 # Add early stopping early_stopping = keras.callbacks.EarlyStopping( @@ -296,9 +354,33 @@ def build_model(): """ +def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1): + input_shape = tf.shape(y_pred) + num_samples, num_steps = input_shape[0], input_shape[1] + y_pred = tf.math.log(tf.transpose(y_pred, perm=[1, 0, 2]) + keras.backend.epsilon()) + input_length = tf.cast(input_length, tf.int32) + + if greedy: + (decoded, log_prob) = tf.nn.ctc_greedy_decoder( + inputs=y_pred, sequence_length=input_length + ) + else: + (decoded, log_prob) = tf.compat.v1.nn.ctc_beam_search_decoder( + inputs=y_pred, + sequence_length=input_length, + beam_width=beam_width, + top_paths=top_paths, + ) + decoded_dense = [] + for st in decoded: + st = tf.SparseTensor(st.indices, st.values, (num_samples, num_steps)) + decoded_dense.append(tf.sparse.to_dense(sp_input=st, default_value=-1)) + return (decoded_dense, log_prob) + + # Get the prediction model by extracting layers till the output layer prediction_model = keras.models.Model( - model.get_layer(name="image").input, model.get_layer(name="dense2").output + model.input[0], model.get_layer(name="dense2").output ) prediction_model.summary() @@ -307,7 +389,7 @@ def build_model(): def decode_batch_predictions(pred): input_len = np.ones(pred.shape[0]) * pred.shape[1] # Use greedy search. For complex tasks, you can use beam search - results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0][ + results = ctc_decode(pred, input_length=input_len, greedy=True)[0][0][ :, :max_length ] # Iterate over the results and get back the text diff --git a/examples/vision/img/captcha_ocr/captcha_ocr_13_0.png b/examples/vision/img/captcha_ocr/captcha_ocr_13_0.png index d8f4b5e135..946d8b72bd 100644 Binary files a/examples/vision/img/captcha_ocr/captcha_ocr_13_0.png and b/examples/vision/img/captcha_ocr/captcha_ocr_13_0.png differ diff --git a/examples/vision/img/captcha_ocr/captcha_ocr_19_6.png b/examples/vision/img/captcha_ocr/captcha_ocr_19_6.png new file mode 100644 index 0000000000..ea2588e453 Binary files /dev/null and b/examples/vision/img/captcha_ocr/captcha_ocr_19_6.png differ diff --git a/examples/vision/ipynb/captcha_ocr.ipynb b/examples/vision/ipynb/captcha_ocr.ipynb index b34e3d9f37..5cbb2c5a92 100644 --- a/examples/vision/ipynb/captcha_ocr.ipynb +++ b/examples/vision/ipynb/captcha_ocr.ipynb @@ -41,12 +41,16 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ + "import os\n", + "\n", + "os.environ[\"KERAS_BACKEND\"] = \"tensorflow\"\n", + "\n", "import os\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", @@ -55,8 +59,8 @@ "from collections import Counter\n", "\n", "import tensorflow as tf\n", - "from tensorflow import keras\n", - "from tensorflow.keras import layers\n" + "import keras\n", + "from keras import layers" ] }, { @@ -71,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 0, "metadata": { "colab_type": "code" }, @@ -97,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 0, "metadata": { "colab_type": "code" }, @@ -133,7 +137,8 @@ "downsample_factor = 4\n", "\n", "# Maximum length of any captcha in the dataset\n", - "max_length = max([len(label) for label in labels])\n" + "max_length = max([len(label) for label in labels])\n", + "" ] }, { @@ -147,7 +152,7 @@ }, { "cell_type": "code", - "execution_count": 
null, + "execution_count": 0, "metadata": { "colab_type": "code" }, @@ -155,9 +160,7 @@ "source": [ "\n", "# Mapping characters to integers\n", - "char_to_num = layers.StringLookup(\n", - " vocabulary=list(characters), mask_token=None\n", - ")\n", + "char_to_num = layers.StringLookup(vocabulary=list(characters), mask_token=None)\n", "\n", "# Mapping integers back to original characters\n", "num_to_char = layers.StringLookup(\n", @@ -199,7 +202,8 @@ " # 6. Map the characters in label to numbers\n", " label = char_to_num(tf.strings.unicode_split(label, input_encoding=\"UTF-8\"))\n", " # 7. Return a dict as our model is expecting two inputs\n", - " return {\"image\": img, \"label\": label}\n" + " return {\"image\": img, \"label\": label}\n", + "" ] }, { @@ -213,7 +217,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 0, "metadata": { "colab_type": "code" }, @@ -222,18 +226,14 @@ "\n", "train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))\n", "train_dataset = (\n", - " train_dataset.map(\n", - " encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE\n", - " )\n", + " train_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)\n", " .batch(batch_size)\n", " .prefetch(buffer_size=tf.data.AUTOTUNE)\n", ")\n", "\n", "validation_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid))\n", "validation_dataset = (\n", - " validation_dataset.map(\n", - " encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE\n", - " )\n", + " validation_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE)\n", " .batch(batch_size)\n", " .prefetch(buffer_size=tf.data.AUTOTUNE)\n", ")" @@ -250,7 +250,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 0, "metadata": { "colab_type": "code" }, @@ -281,17 +281,71 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ + "\n", + "def ctc_batch_cost(y_true, y_pred, input_length, label_length):\n", + " label_length = tf.cast(tf.squeeze(label_length, axis=-1), tf.int32)\n", + " input_length = tf.cast(tf.squeeze(input_length, axis=-1), tf.int32)\n", + " sparse_labels = tf.cast(ctc_label_dense_to_sparse(y_true, label_length), tf.int32)\n", + "\n", + " y_pred = tf.math.log(tf.transpose(y_pred, perm=[1, 0, 2]) + keras.backend.epsilon())\n", + "\n", + " return tf.expand_dims(\n", + " tf.compat.v1.nn.ctc_loss(\n", + " inputs=y_pred, labels=sparse_labels, sequence_length=input_length\n", + " ),\n", + " 1,\n", + " )\n", + "\n", + "\n", + "def ctc_label_dense_to_sparse(labels, label_lengths):\n", + " label_shape = tf.shape(labels)\n", + " num_batches_tns = tf.stack([label_shape[0]])\n", + " max_num_labels_tns = tf.stack([label_shape[1]])\n", + "\n", + " def range_less_than(old_input, current_input):\n", + " return tf.expand_dims(tf.range(tf.shape(old_input)[1]), 0) < tf.fill(\n", + " max_num_labels_tns, current_input\n", + " )\n", + "\n", + " init = tf.cast(tf.fill([1, label_shape[1]], 0), tf.bool)\n", + " dense_mask = tf.compat.v1.scan(\n", + " range_less_than, label_lengths, initializer=init, parallel_iterations=1\n", + " )\n", + " dense_mask = dense_mask[:, 0, :]\n", + "\n", + " label_array = tf.reshape(\n", + " tf.tile(tf.range(0, label_shape[1]), num_batches_tns), label_shape\n", + " )\n", + " label_ind = tf.compat.v1.boolean_mask(label_array, dense_mask)\n", + "\n", + " batch_array = tf.transpose(\n", + " tf.reshape(\n", + " tf.tile(tf.range(0, label_shape[0]), 
max_num_labels_tns),\n", + " tf.reverse(label_shape, [0]),\n", + " )\n", + " )\n", + " batch_ind = tf.compat.v1.boolean_mask(batch_array, dense_mask)\n", + " indices = tf.transpose(\n", + " tf.reshape(tf.concat([batch_ind, label_ind], axis=0), [2, -1])\n", + " )\n", + "\n", + " vals_sparse = tf.compat.v1.gather_nd(labels, indices)\n", + "\n", + " return tf.SparseTensor(\n", + " tf.cast(indices, tf.int64), vals_sparse, tf.cast(label_shape, tf.int64)\n", + " )\n", + "\n", "\n", "class CTCLayer(layers.Layer):\n", " def __init__(self, name=None):\n", " super().__init__(name=name)\n", - " self.loss_fn = keras.backend.ctc_batch_cost\n", + " self.loss_fn = ctc_batch_cost\n", "\n", " def call(self, y_true, y_pred):\n", " # Compute the training-time loss value and add it\n", @@ -387,14 +441,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ "\n", - "epochs = 100\n", + "# TODO restore epoch count.\n", + "epochs = 2\n", "early_stopping_patience = 10\n", "# Add early stopping\n", "early_stopping = keras.callbacks.EarlyStopping(\n", @@ -407,7 +462,8 @@ " validation_data=validation_dataset,\n", " epochs=epochs,\n", " callbacks=[early_stopping],\n", - ")\n" + ")\n", + "" ] }, { @@ -418,30 +474,55 @@ "source": [ "## Inference\n", "\n", - "You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/ocr-for-captcha) \n", + "You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/ocr-for-captcha)\n", "and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/ocr-for-captcha)." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 0, "metadata": { "colab_type": "code" }, "outputs": [], "source": [ + "\n", + "def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1):\n", + " input_shape = tf.shape(y_pred)\n", + " num_samples, num_steps = input_shape[0], input_shape[1]\n", + " y_pred = tf.math.log(tf.transpose(y_pred, perm=[1, 0, 2]) + keras.backend.epsilon())\n", + " input_length = tf.cast(input_length, tf.int32)\n", + "\n", + " if greedy:\n", + " (decoded, log_prob) = tf.nn.ctc_greedy_decoder(\n", + " inputs=y_pred, sequence_length=input_length\n", + " )\n", + " else:\n", + " (decoded, log_prob) = tf.compat.v1.nn.ctc_beam_search_decoder(\n", + " inputs=y_pred,\n", + " sequence_length=input_length,\n", + " beam_width=beam_width,\n", + " top_paths=top_paths,\n", + " )\n", + " decoded_dense = []\n", + " for st in decoded:\n", + " st = tf.SparseTensor(st.indices, st.values, (num_samples, num_steps))\n", + " decoded_dense.append(tf.sparse.to_dense(sp_input=st, default_value=-1))\n", + " return (decoded_dense, log_prob)\n", + "\n", "\n", "# Get the prediction model by extracting layers till the output layer\n", "prediction_model = keras.models.Model(\n", - " model.get_layer(name=\"image\").input, model.get_layer(name=\"dense2\").output\n", + " model.input[0], model.get_layer(name=\"dense2\").output\n", ")\n", "prediction_model.summary()\n", "\n", + "\n", "# A utility function to decode the output of the network\n", "def decode_batch_predictions(pred):\n", " input_len = np.ones(pred.shape[0]) * pred.shape[1]\n", " # Use greedy search. 
For complex tasks, you can use beam search\n", - " results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0][\n", + " results = ctc_decode(pred, input_length=input_len, greedy=True)[0][0][\n", " :, :max_length\n", " ]\n", " # Iterate over the results and get back the text\n", @@ -506,4 +587,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} +} \ No newline at end of file diff --git a/examples/vision/md/captcha_ocr.md b/examples/vision/md/captcha_ocr.md index 75329fe5f2..b9e41ecf5b 100644 --- a/examples/vision/md/captcha_ocr.md +++ b/examples/vision/md/captcha_ocr.md @@ -25,6 +25,10 @@ in the developer guides. ```python +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + import os import numpy as np import matplotlib.pyplot as plt @@ -33,9 +37,8 @@ from pathlib import Path from collections import Counter import tensorflow as tf -from tensorflow import keras -from tensorflow.keras import layers - +import keras +from keras import layers ``` --- @@ -52,8 +55,8 @@ Let's download the data. ``` % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed -100 159 100 159 0 0 164 0 --:--:-- --:--:-- --:--:-- 164 -100 8863k 100 8863k 0 0 4882k 0 0:00:01 0:00:01 --:--:-- 33.0M + 0 0 0 0 0 0 0 0 --:--:-- --:--:-- --:--:-- 0 +100 8863k 100 8863k 0 0 19.6M 0 --:--:-- --:--:-- --:--:-- 19.6M ``` @@ -105,7 +108,7 @@ max_length = max([len(label) for label in labels]) Number of images found: 1040 Number of labels found: 1040 Number of unique characters: 19 -Characters present: {'d', 'w', 'y', '4', 'f', '6', 'g', 'e', '3', '5', 'p', 'x', '2', 'c', '7', 'n', 'b', '8', 'm'} +Characters present: ['2', '3', '4', '5', '6', '7', '8', 'b', 'c', 'd', 'e', 'f', 'g', 'm', 'n', 'p', 'w', 'x', 'y'] ``` @@ -116,9 +119,7 @@ Characters present: {'d', 'w', 'y', '4', 'f', '6', 'g', 'e', '3', '5', 'p', 'x' ```python # Mapping characters to integers -char_to_num = layers.StringLookup( - vocabulary=list(characters), mask_token=None -) +char_to_num = layers.StringLookup(vocabulary=list(characters), mask_token=None) # Mapping integers back to original characters num_to_char = layers.StringLookup( @@ -172,18 +173,14 @@ def encode_single_sample(img_path, label): train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) train_dataset = ( - train_dataset.map( - encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE - ) + train_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE) .batch(batch_size) .prefetch(buffer_size=tf.data.AUTOTUNE) ) validation_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid)) validation_dataset = ( - validation_dataset.map( - encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE - ) + validation_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE) .batch(batch_size) .prefetch(buffer_size=tf.data.AUTOTUNE) ) @@ -209,7 +206,9 @@ plt.show() ``` + ![png](/img/examples/vision/captcha_ocr/captcha_ocr_13_0.png) + --- @@ -218,10 +217,64 @@ plt.show() ```python +def ctc_batch_cost(y_true, y_pred, input_length, label_length): + label_length = tf.cast(tf.squeeze(label_length, axis=-1), tf.int32) + input_length = tf.cast(tf.squeeze(input_length, axis=-1), tf.int32) + sparse_labels = tf.cast(ctc_label_dense_to_sparse(y_true, label_length), tf.int32) + + y_pred = tf.math.log(tf.transpose(y_pred, perm=[1, 0, 2]) + keras.backend.epsilon()) + + return tf.expand_dims( + tf.compat.v1.nn.ctc_loss( + inputs=y_pred, labels=sparse_labels, sequence_length=input_length + ), + 1, + ) + + +def 
ctc_label_dense_to_sparse(labels, label_lengths): + label_shape = tf.shape(labels) + num_batches_tns = tf.stack([label_shape[0]]) + max_num_labels_tns = tf.stack([label_shape[1]]) + + def range_less_than(old_input, current_input): + return tf.expand_dims(tf.range(tf.shape(old_input)[1]), 0) < tf.fill( + max_num_labels_tns, current_input + ) + + init = tf.cast(tf.fill([1, label_shape[1]], 0), tf.bool) + dense_mask = tf.compat.v1.scan( + range_less_than, label_lengths, initializer=init, parallel_iterations=1 + ) + dense_mask = dense_mask[:, 0, :] + + label_array = tf.reshape( + tf.tile(tf.range(0, label_shape[1]), num_batches_tns), label_shape + ) + label_ind = tf.compat.v1.boolean_mask(label_array, dense_mask) + + batch_array = tf.transpose( + tf.reshape( + tf.tile(tf.range(0, label_shape[0]), max_num_labels_tns), + tf.reverse(label_shape, [0]), + ) + ) + batch_ind = tf.compat.v1.boolean_mask(batch_array, dense_mask) + indices = tf.transpose( + tf.reshape(tf.concat([batch_ind, label_ind], axis=0), [2, -1]) + ) + + vals_sparse = tf.compat.v1.gather_nd(labels, indices) + + return tf.SparseTensor( + tf.cast(indices, tf.int64), vals_sparse, tf.cast(label_shape, tf.int64) + ) + + class CTCLayer(layers.Layer): def __init__(self, name=None): super().__init__(name=name) - self.loss_fn = keras.backend.ctc_batch_cost + self.loss_fn = ctc_batch_cost def call(self, y_true, y_pred): # Compute the training-time loss value and add it @@ -306,53 +359,80 @@ model = build_model() model.summary() ``` -
-``` -Model: "ocr_model_v1" -__________________________________________________________________________________________________ -Layer (type) Output Shape Param # Connected to -================================================================================================== -image (InputLayer) [(None, 200, 50, 1)] 0 -__________________________________________________________________________________________________ -Conv1 (Conv2D) (None, 200, 50, 32) 320 image[0][0] -__________________________________________________________________________________________________ -pool1 (MaxPooling2D) (None, 100, 25, 32) 0 Conv1[0][0] -__________________________________________________________________________________________________ -Conv2 (Conv2D) (None, 100, 25, 64) 18496 pool1[0][0] -__________________________________________________________________________________________________ -pool2 (MaxPooling2D) (None, 50, 12, 64) 0 Conv2[0][0] -__________________________________________________________________________________________________ -reshape (Reshape) (None, 50, 768) 0 pool2[0][0] -__________________________________________________________________________________________________ -dense1 (Dense) (None, 50, 64) 49216 reshape[0][0] -__________________________________________________________________________________________________ -dropout (Dropout) (None, 50, 64) 0 dense1[0][0] -__________________________________________________________________________________________________ -bidirectional (Bidirectional) (None, 50, 256) 197632 dropout[0][0] -__________________________________________________________________________________________________ -bidirectional_1 (Bidirectional) (None, 50, 128) 164352 bidirectional[0][0] -__________________________________________________________________________________________________ -label (InputLayer) [(None, None)] 0 -__________________________________________________________________________________________________ -dense2 (Dense) (None, 50, 20) 2580 bidirectional_1[0][0] -__________________________________________________________________________________________________ -ctc_loss (CTCLayer) (None, 50, 20) 0 label[0][0] - dense2[0][0] -================================================================================================== -Total params: 432,596 -Trainable params: 432,596 -Non-trainable params: 0 -__________________________________________________________________________________________________ -``` -
+
Model: "ocr_model_v1"
+
+ + + + +
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓
+┃ Layer (type)        ┃ Output Shape      ┃ Param # ┃ Connected to         ┃
+┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩
+│ image (InputLayer)  │ (None, 200, 50,   │       0 │ -                    │
+│                     │ 1)                │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ Conv1 (Conv2D)      │ (None, 200, 50,   │     320 │ image[0][0]          │
+│                     │ 32)               │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ pool1               │ (None, 100, 25,   │       0 │ Conv1[0][0]          │
+│ (MaxPooling2D)      │ 32)               │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ Conv2 (Conv2D)      │ (None, 100, 25,   │  18,496 │ pool1[0][0]          │
+│                     │ 64)               │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ pool2               │ (None, 50, 12,    │       0 │ Conv2[0][0]          │
+│ (MaxPooling2D)      │ 64)               │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ reshape (Reshape)   │ (None, 50, 768)   │       0 │ pool2[0][0]          │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ dense1 (Dense)      │ (None, 50, 64)    │  49,216 │ reshape[0][0]        │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ dropout (Dropout)   │ (None, 50, 64)    │       0 │ dense1[0][0]         │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ bidirectional       │ (None, 50, 256)   │ 197,632 │ dropout[0][0]        │
+│ (Bidirectional)     │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ bidirectional_1     │ (None, 50, 128)   │ 164,352 │ bidirectional[0][0]  │
+│ (Bidirectional)     │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ label (InputLayer)  │ (None, None)      │       0 │ -                    │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ dense2 (Dense)      │ (None, 50, 21)    │   2,709 │ bidirectional_1[0][… │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ ctc_loss (CTCLayer) │ (None, 50, 21)    │       0 │ label[0][0],         │
+│                     │                   │         │ dense2[0][0]         │
+└─────────────────────┴───────────────────┴─────────┴──────────────────────┘
+
+ + + + +
 Total params: 432,725 (1.65 MB)
+
+ + + + +
 Trainable params: 432,725 (1.65 MB)
+
+ + + + +
 Non-trainable params: 0 (0.00 B)
+
+ + + --- ## Training ```python -epochs = 100 +# TODO restore epoch count. +epochs = 2 early_stopping_patience = 10 # Add early stopping early_stopping = keras.callbacks.EarlyStopping( @@ -371,227 +451,58 @@ history = model.fit(
``` -Epoch 1/100 -59/59 [==============================] - 3s 53ms/step - loss: 21.5722 - val_loss: 16.3351 -Epoch 2/100 -59/59 [==============================] - 2s 27ms/step - loss: 16.3335 - val_loss: 16.3062 -Epoch 3/100 -59/59 [==============================] - 2s 27ms/step - loss: 16.3360 - val_loss: 16.3116 -Epoch 4/100 -59/59 [==============================] - 2s 27ms/step - loss: 16.3318 - val_loss: 16.3167 -Epoch 5/100 -59/59 [==============================] - 2s 27ms/step - loss: 16.3256 - val_loss: 16.3152 -Epoch 6/100 -59/59 [==============================] - 2s 29ms/step - loss: 16.3229 - val_loss: 16.3123 -Epoch 7/100 -59/59 [==============================] - 2s 30ms/step - loss: 16.3119 - val_loss: 16.3116 -Epoch 8/100 -59/59 [==============================] - 2s 27ms/step - loss: 16.2977 - val_loss: 16.3107 -Epoch 9/100 -59/59 [==============================] - 2s 28ms/step - loss: 16.2801 - val_loss: 16.2552 -Epoch 10/100 -59/59 [==============================] - 2s 28ms/step - loss: 16.2199 - val_loss: 16.1008 -Epoch 11/100 -59/59 [==============================] - 2s 28ms/step - loss: 16.1136 - val_loss: 15.9867 -Epoch 12/100 -59/59 [==============================] - 2s 30ms/step - loss: 16.0138 - val_loss: 15.8825 -Epoch 13/100 -59/59 [==============================] - 2s 29ms/step - loss: 15.9670 - val_loss: 15.8413 -Epoch 14/100 -59/59 [==============================] - 2s 29ms/step - loss: 15.9315 - val_loss: 15.8263 -Epoch 15/100 -59/59 [==============================] - 2s 31ms/step - loss: 15.9162 - val_loss: 15.7971 -Epoch 16/100 -59/59 [==============================] - 2s 31ms/step - loss: 15.8916 - val_loss: 15.7844 -Epoch 17/100 -59/59 [==============================] - 2s 31ms/step - loss: 15.8653 - val_loss: 15.7624 -Epoch 18/100 -59/59 [==============================] - 2s 31ms/step - loss: 15.8543 - val_loss: 15.7620 -Epoch 19/100 -59/59 [==============================] - 2s 28ms/step - loss: 15.8373 - val_loss: 15.7559 -Epoch 20/100 -59/59 [==============================] - 2s 27ms/step - loss: 15.8319 - val_loss: 15.7495 -Epoch 21/100 -59/59 [==============================] - 2s 27ms/step - loss: 15.8104 - val_loss: 15.7430 -Epoch 22/100 -59/59 [==============================] - 2s 29ms/step - loss: 15.8037 - val_loss: 15.7260 -Epoch 23/100 -59/59 [==============================] - 2s 29ms/step - loss: 15.8021 - val_loss: 15.7204 -Epoch 24/100 -59/59 [==============================] - 2s 28ms/step - loss: 15.7901 - val_loss: 15.7174 -Epoch 25/100 -59/59 [==============================] - 2s 29ms/step - loss: 15.7851 - val_loss: 15.7074 -Epoch 26/100 -59/59 [==============================] - 2s 27ms/step - loss: 15.7701 - val_loss: 15.7097 -Epoch 27/100 -59/59 [==============================] - 2s 28ms/step - loss: 15.7694 - val_loss: 15.7040 -Epoch 28/100 -59/59 [==============================] - 2s 28ms/step - loss: 15.7544 - val_loss: 15.7012 -Epoch 29/100 -59/59 [==============================] - 2s 31ms/step - loss: 15.7498 - val_loss: 15.7015 -Epoch 30/100 -59/59 [==============================] - 2s 31ms/step - loss: 15.7521 - val_loss: 15.6880 -Epoch 31/100 -59/59 [==============================] - 2s 29ms/step - loss: 15.7165 - val_loss: 15.6734 -Epoch 32/100 -59/59 [==============================] - 2s 27ms/step - loss: 15.6650 - val_loss: 15.5789 -Epoch 33/100 -59/59 [==============================] - 2s 27ms/step - loss: 15.5300 - val_loss: 15.4026 -Epoch 34/100 -59/59 [==============================] - 2s 27ms/step - loss: 15.3519 - 
val_loss: 15.2115 -Epoch 35/100 -59/59 [==============================] - 2s 27ms/step - loss: 15.1165 - val_loss: 14.7826 -Epoch 36/100 -59/59 [==============================] - 2s 27ms/step - loss: 14.7086 - val_loss: 14.4432 -Epoch 37/100 -59/59 [==============================] - 2s 29ms/step - loss: 14.3317 - val_loss: 13.9445 -Epoch 38/100 -59/59 [==============================] - 2s 29ms/step - loss: 13.9658 - val_loss: 13.6972 -Epoch 39/100 -59/59 [==============================] - 2s 29ms/step - loss: 13.6728 - val_loss: 13.3388 -Epoch 40/100 -59/59 [==============================] - 2s 28ms/step - loss: 13.3454 - val_loss: 13.0102 -Epoch 41/100 -59/59 [==============================] - 2s 27ms/step - loss: 13.0448 - val_loss: 12.8307 -Epoch 42/100 -59/59 [==============================] - 2s 28ms/step - loss: 12.7552 - val_loss: 12.6071 -Epoch 43/100 -59/59 [==============================] - 2s 29ms/step - loss: 12.4573 - val_loss: 12.2800 -Epoch 44/100 -59/59 [==============================] - 2s 31ms/step - loss: 12.1055 - val_loss: 11.9209 -Epoch 45/100 -59/59 [==============================] - 2s 28ms/step - loss: 11.8148 - val_loss: 11.9132 -Epoch 46/100 -59/59 [==============================] - 2s 28ms/step - loss: 11.4530 - val_loss: 11.4357 -Epoch 47/100 -59/59 [==============================] - 2s 29ms/step - loss: 11.0592 - val_loss: 11.1121 -Epoch 48/100 -59/59 [==============================] - 2s 27ms/step - loss: 10.7746 - val_loss: 10.8532 -Epoch 49/100 -59/59 [==============================] - 2s 28ms/step - loss: 10.2616 - val_loss: 10.3643 -Epoch 50/100 -59/59 [==============================] - 2s 28ms/step - loss: 9.8708 - val_loss: 10.0987 -Epoch 51/100 -59/59 [==============================] - 2s 30ms/step - loss: 9.4077 - val_loss: 9.6371 -Epoch 52/100 -59/59 [==============================] - 2s 29ms/step - loss: 9.0663 - val_loss: 9.2463 -Epoch 53/100 -59/59 [==============================] - 2s 28ms/step - loss: 8.4546 - val_loss: 8.7581 -Epoch 54/100 -59/59 [==============================] - 2s 28ms/step - loss: 7.9226 - val_loss: 8.1805 -Epoch 55/100 -59/59 [==============================] - 2s 27ms/step - loss: 7.4927 - val_loss: 7.8858 -Epoch 56/100 -59/59 [==============================] - 2s 28ms/step - loss: 7.0499 - val_loss: 7.3202 -Epoch 57/100 -59/59 [==============================] - 2s 27ms/step - loss: 6.6383 - val_loss: 7.0875 -Epoch 58/100 -59/59 [==============================] - 2s 28ms/step - loss: 6.1446 - val_loss: 6.9619 -Epoch 59/100 -59/59 [==============================] - 2s 28ms/step - loss: 5.8533 - val_loss: 6.3855 -Epoch 60/100 -59/59 [==============================] - 2s 28ms/step - loss: 5.5107 - val_loss: 5.9797 -Epoch 61/100 -59/59 [==============================] - 2s 31ms/step - loss: 5.1181 - val_loss: 5.7549 -Epoch 62/100 -59/59 [==============================] - 2s 31ms/step - loss: 4.6952 - val_loss: 5.5488 -Epoch 63/100 -59/59 [==============================] - 2s 29ms/step - loss: 4.4189 - val_loss: 5.3030 -Epoch 64/100 -59/59 [==============================] - 2s 28ms/step - loss: 4.1358 - val_loss: 5.1772 -Epoch 65/100 -59/59 [==============================] - 2s 28ms/step - loss: 3.8560 - val_loss: 5.1071 -Epoch 66/100 -59/59 [==============================] - 2s 28ms/step - loss: 3.5342 - val_loss: 4.6958 -Epoch 67/100 -59/59 [==============================] - 2s 28ms/step - loss: 3.3336 - val_loss: 4.5865 -Epoch 68/100 -59/59 [==============================] - 2s 27ms/step - loss: 3.0925 - val_loss: 4.3647 
-Epoch 69/100 -59/59 [==============================] - 2s 28ms/step - loss: 2.8751 - val_loss: 4.3005 -Epoch 70/100 -59/59 [==============================] - 2s 27ms/step - loss: 2.7444 - val_loss: 4.0820 -Epoch 71/100 -59/59 [==============================] - 2s 27ms/step - loss: 2.5921 - val_loss: 4.1694 -Epoch 72/100 -59/59 [==============================] - 2s 28ms/step - loss: 2.3246 - val_loss: 3.9142 -Epoch 73/100 -59/59 [==============================] - 2s 28ms/step - loss: 2.0769 - val_loss: 3.9135 -Epoch 74/100 -59/59 [==============================] - 2s 29ms/step - loss: 2.0872 - val_loss: 3.9808 -Epoch 75/100 -59/59 [==============================] - 2s 29ms/step - loss: 1.9498 - val_loss: 3.9935 -Epoch 76/100 -59/59 [==============================] - 2s 28ms/step - loss: 1.8178 - val_loss: 3.7735 -Epoch 77/100 -59/59 [==============================] - 2s 29ms/step - loss: 1.7661 - val_loss: 3.6309 -Epoch 78/100 -59/59 [==============================] - 2s 31ms/step - loss: 1.6236 - val_loss: 3.7410 -Epoch 79/100 -59/59 [==============================] - 2s 29ms/step - loss: 1.4652 - val_loss: 3.6756 -Epoch 80/100 -59/59 [==============================] - 2s 27ms/step - loss: 1.3552 - val_loss: 3.4979 -Epoch 81/100 -59/59 [==============================] - 2s 29ms/step - loss: 1.2655 - val_loss: 3.5306 -Epoch 82/100 -59/59 [==============================] - 2s 29ms/step - loss: 1.2632 - val_loss: 3.2885 -Epoch 83/100 -59/59 [==============================] - 2s 28ms/step - loss: 1.2316 - val_loss: 3.2482 -Epoch 84/100 -59/59 [==============================] - 2s 30ms/step - loss: 1.1260 - val_loss: 3.4285 -Epoch 85/100 -59/59 [==============================] - 2s 28ms/step - loss: 1.0745 - val_loss: 3.2985 -Epoch 86/100 -59/59 [==============================] - 2s 29ms/step - loss: 1.0133 - val_loss: 3.2209 -Epoch 87/100 -59/59 [==============================] - 2s 31ms/step - loss: 0.9417 - val_loss: 3.2203 -Epoch 88/100 -59/59 [==============================] - 2s 28ms/step - loss: 0.9104 - val_loss: 3.1121 -Epoch 89/100 -59/59 [==============================] - 2s 30ms/step - loss: 0.8516 - val_loss: 3.2070 -Epoch 90/100 -59/59 [==============================] - 2s 28ms/step - loss: 0.8275 - val_loss: 3.0335 -Epoch 91/100 -59/59 [==============================] - 2s 28ms/step - loss: 0.8056 - val_loss: 3.2085 -Epoch 92/100 -59/59 [==============================] - 2s 28ms/step - loss: 0.7373 - val_loss: 3.0326 -Epoch 93/100 -59/59 [==============================] - 2s 28ms/step - loss: 0.7753 - val_loss: 2.9935 -Epoch 94/100 -59/59 [==============================] - 2s 28ms/step - loss: 0.7688 - val_loss: 2.9940 -Epoch 95/100 -59/59 [==============================] - 2s 27ms/step - loss: 0.6765 - val_loss: 3.0432 -Epoch 96/100 -59/59 [==============================] - 2s 29ms/step - loss: 0.6674 - val_loss: 3.1233 -Epoch 97/100 -59/59 [==============================] - 2s 29ms/step - loss: 0.6018 - val_loss: 2.8405 -Epoch 98/100 -59/59 [==============================] - 2s 28ms/step - loss: 0.6322 - val_loss: 2.8323 -Epoch 99/100 -59/59 [==============================] - 2s 29ms/step - loss: 0.5889 - val_loss: 2.8786 -Epoch 100/100 -59/59 [==============================] - 2s 28ms/step - loss: 0.5616 - val_loss: 2.9697 +Epoch 1/2 + 59/59 ━━━━━━━━━━━━━━━━━━━━ 8s 78ms/step - loss: 31.6480 - val_loss: 16.4761 +Epoch 2/2 + 59/59 ━━━━━━━━━━━━━━━━━━━━ 4s 64ms/step - loss: 16.4152 - val_loss: 16.4351 ```
--- ## Inference -You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/ocr-for-captcha) and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/ocr-for-captcha). +You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/ocr-for-captcha) +and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/ocr-for-captcha). + ```python +def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1): + input_shape = tf.shape(y_pred) + num_samples, num_steps = input_shape[0], input_shape[1] + y_pred = tf.math.log(tf.transpose(y_pred, perm=[1, 0, 2]) + keras.backend.epsilon()) + input_length = tf.cast(input_length, tf.int32) + + if greedy: + (decoded, log_prob) = tf.nn.ctc_greedy_decoder( + inputs=y_pred, sequence_length=input_length + ) + else: + (decoded, log_prob) = tf.compat.v1.nn.ctc_beam_search_decoder( + inputs=y_pred, + sequence_length=input_length, + beam_width=beam_width, + top_paths=top_paths, + ) + decoded_dense = [] + for st in decoded: + st = tf.SparseTensor(st.indices, st.values, (num_samples, num_steps)) + decoded_dense.append(tf.sparse.to_dense(sp_input=st, default_value=-1)) + return (decoded_dense, log_prob) + + # Get the prediction model by extracting layers till the output layer prediction_model = keras.models.Model( - model.get_layer(name="image").input, model.get_layer(name="dense2").output + model.input[0], model.get_layer(name="dense2").output ) prediction_model.summary() + # A utility function to decode the output of the network def decode_batch_predictions(pred): input_len = np.ones(pred.shape[0]) * pred.shape[1] # Use greedy search. For complex tasks, you can use beam search - results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0][ + results = ctc_decode(pred, input_length=input_len, greedy=True)[0][0][ :, :max_length ] # Iterate over the results and get back the text @@ -626,39 +537,67 @@ for batch in validation_dataset.take(1): plt.show() ``` + +
Model: "functional_1"
+
+ + + + +
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
+┃ Layer (type)                    ┃ Output Shape              ┃    Param # ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
+│ image (InputLayer)              │ (None, 200, 50, 1)        │          0 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ Conv1 (Conv2D)                  │ (None, 200, 50, 32)       │        320 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ pool1 (MaxPooling2D)            │ (None, 100, 25, 32)       │          0 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ Conv2 (Conv2D)                  │ (None, 100, 25, 64)       │     18,496 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ pool2 (MaxPooling2D)            │ (None, 50, 12, 64)        │          0 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ reshape (Reshape)               │ (None, 50, 768)           │          0 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ dense1 (Dense)                  │ (None, 50, 64)            │     49,216 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ dropout (Dropout)               │ (None, 50, 64)            │          0 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ bidirectional (Bidirectional)   │ (None, 50, 256)           │    197,632 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ bidirectional_1 (Bidirectional) │ (None, 50, 128)           │    164,352 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ dense2 (Dense)                  │ (None, 50, 21)            │      2,709 │
+└─────────────────────────────────┴───────────────────────────┴────────────┘
+
+ + + + +
 Total params: 432,725 (1.65 MB)
+
+ + + + +
 Trainable params: 432,725 (1.65 MB)
+
+ + + + +
 Non-trainable params: 0 (0.00 B)
+
+ + +
``` -Model: "functional_1" -_________________________________________________________________ -Layer (type) Output Shape Param # -================================================================= -image (InputLayer) [(None, 200, 50, 1)] 0 -_________________________________________________________________ -Conv1 (Conv2D) (None, 200, 50, 32) 320 -_________________________________________________________________ -pool1 (MaxPooling2D) (None, 100, 25, 32) 0 -_________________________________________________________________ -Conv2 (Conv2D) (None, 100, 25, 64) 18496 -_________________________________________________________________ -pool2 (MaxPooling2D) (None, 50, 12, 64) 0 -_________________________________________________________________ -reshape (Reshape) (None, 50, 768) 0 -_________________________________________________________________ -dense1 (Dense) (None, 50, 64) 49216 -_________________________________________________________________ -dropout (Dropout) (None, 50, 64) 0 -_________________________________________________________________ -bidirectional (Bidirectional (None, 50, 256) 197632 -_________________________________________________________________ -bidirectional_1 (Bidirection (None, 50, 128) 164352 -_________________________________________________________________ -dense2 (Dense) (None, 50, 20) 2580 -================================================================= -Total params: 432,596 -Trainable params: 432,596 -Non-trainable params: 0 -_________________________________________________________________ + 1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 258ms/step ```
-![png](/img/examples/vision/captcha_ocr/captcha_ocr_19_1.png) \ No newline at end of file + +![png](/img/examples/vision/captcha_ocr/captcha_ocr_19_6.png) + +
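
The `ctc_label_dense_to_sparse` helper introduced in this diff exists only because `tf.compat.v1.nn.ctc_loss` requires its labels as a `SparseTensor`. A minimal sketch of the equivalent loss — not part of the patch, toy shapes assumed — using the modern `tf.nn.ctc_loss`, which accepts dense labels directly and makes the blank index explicit:

```python
import tensorflow as tf

batch, time_steps, num_classes = 2, 50, 21
# Dense, zero-padded labels plus their true lengths; no sparse conversion needed.
labels = tf.constant([[1, 2, 3, 0, 0], [4, 5, 0, 0, 0]], dtype=tf.int32)
label_length = tf.constant([3, 2], dtype=tf.int32)
logits = tf.random.normal([batch, time_steps, num_classes])
logit_length = tf.fill([batch], time_steps)

loss = tf.nn.ctc_loss(
    labels=labels,
    logits=logits,
    label_length=label_length,
    logit_length=logit_length,
    logits_time_major=False,      # keep batch-major, no transpose required
    blank_index=num_classes - 1,  # matches the v1 convention used in the diff
)
print(loss.shape)  # (2,) — one loss value per batch element
```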
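The new `ctc_decode` helper assumes batch-major probabilities, converts them to time-major log-probabilities, and relies on `tf.nn.ctc_greedy_decoder` treating the last class index as the CTC blank. A self-contained sketch of that round trip on toy values (not from the patch):

```python
import numpy as np
import tensorflow as tf

num_classes = 5                  # 4 real symbols + blank at index 4
frames = [1, 4, 3, 4, 3, 4, 2]   # per-frame argmax; 4 is the blank token

# Nearly one-hot probabilities, shape (batch=1, time, classes).
y_pred = np.full((1, len(frames), num_classes), 1e-6, dtype=np.float32)
for t, c in enumerate(frames):
    y_pred[0, t, c] = 1.0

# Same transform as the helper: time-major + log.
log_probs = tf.math.log(tf.transpose(y_pred, perm=[1, 0, 2]) + 1e-7)
decoded, _ = tf.nn.ctc_greedy_decoder(
    inputs=log_probs, sequence_length=tf.fill([1], len(frames))
)
# Repeats are collapsed, then blanks dropped; the blank between the two 3s
# is what preserves the repeated character.
print(tf.sparse.to_dense(decoded[0]).numpy())  # [[1 3 3 2]]
```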
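The inference hunk swaps `model.get_layer(name="image").input` for `model.input[0]`, i.e. it takes the first input of the two-input training model. A toy reproduction of that extraction pattern — layer names mirror the example, and the `Concatenate` is only a stand-in for the `CTCLayer`:

```python
import os

os.environ["KERAS_BACKEND"] = "tensorflow"

import keras
from keras import layers

# Toy two-input functional model shaped like the training graph.
image = keras.Input(shape=(8,), name="image")
label = keras.Input(shape=(4,), name="label")
x = layers.Dense(4, name="dense2")(image)
output = layers.Concatenate(name="ctc_stub")([x, label])
model = keras.Model(inputs=[image, label], outputs=output)

# First input in, "dense2" out — the label input is dropped for inference,
# exactly the shape of the `prediction_model` construction in the diff.
prediction_model = keras.Model(model.input[0], model.get_layer("dense2").output)
prediction_model.summary()
```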