HowBadIsMyBatch/src/captcha.ipynb

{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UNKC5YSEIS_d"
      },
      "source": [
        "# Captchas\n",
        "\n",
        "**see:** https://keras.io/examples/vision/captcha_ocr/<br>\n",
        "**original:** https://colab.research.google.com/drive/1Olw2KMHfPlnGaYuzffl2zb6D1etlBGZf?usp=sharing<br>\n",
        "**View Github version in Colab:** <a href=\"https://colab.research.google.com/github/KnollFrank/2captcha-worker-assistant-server/blob/master/captcha_ocr_trainAndSaveModel_colab.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a><br>\n",
        "**paper:** Simple and Easy: Transfer Learning-Based Attacks to Text CAPTCHA<br>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "wRUsVuIiIS_s"
      },
      "source": [
        "## Setup"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ioGwCR3Xl31V"
      },
      "outputs": [],
      "source": [
        "import sys\n",
        "sys.argv = sys.argv[:1]"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "l-coMy_2l31X"
      },
      "outputs": [],
      "source": [
        "def isInColab():\n",
        "    try:\n",
        "        import colab\n",
        "        return True\n",
        "    except:\n",
        "        return False"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "goO0feQwl31Y"
      },
      "outputs": [],
      "source": [
        "inColab = isInColab()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "nsE9VWCel31Z"
      },
      "outputs": [],
      "source": [
        "if inColab:\n",
        "    branch = 'read-captcha'\n",
        "    !git clone https://github.com/KnollFrank/HowBadIsMyBatch.git\n",
        "    !cd HowBadIsMyBatch; git checkout $branch"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "l9qhlDVNl31b"
      },
      "outputs": [],
      "source": [
        "import os\n",
        "srcPath = '/content/HowBadIsMyBatch/src' if inColab else os.getcwd()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "c-2fE6vZsD7a"
      },
      "outputs": [],
      "source": [
        "if inColab:\n",
        "    sys.path.insert(0, srcPath)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "zZSwQragIS_v"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "from pathlib import Path\n",
        "import tensorflow as tf\n",
        "from captcha.CaptchaGenerator import CaptchaGenerator\n",
        "from captcha.CharNumConverter import CharNumConverter\n",
        "from captcha.DataSplitter import DataSplitter\n",
        "from captcha.DatasetFactory import DatasetFactory\n",
        "from captcha.ModelFactory import ModelFactory\n",
        "from captcha.PredictionsDecoder import PredictionsDecoder\n",
        "from captcha.ModelDAO import ModelDAO\n",
        "from captcha.CaptchaShape import CaptchaShape"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "BWqAvnVOl31d"
      },
      "outputs": [],
      "source": [
        "from pathlib import Path\n",
        "\n",
        "class GoogleDriveManager:\n",
        "  \n",
        "  _googleDriveFolder = Path('/content/gdrive')\n",
        "  _baseFolder = _googleDriveFolder / 'MyDrive/CAPTCHA/models/'\n",
        "\n",
        "  @staticmethod\n",
        "  def mount():\n",
        "    from google.colab import drive\n",
        "    drive.mount(str(GoogleDriveManager._googleDriveFolder))\n",
        "\n",
        "  @staticmethod\n",
        "  def uploadFolderToGoogleDrive(folder):\n",
        "    basename = !basename {folder}\n",
        "    basename = basename[0]\n",
        "    !cd {folder}/..; zip -r {basename}.zip {basename}/\n",
        "    !cd {folder}/..; cp {basename}.zip {GoogleDriveManager._baseFolder}\n",
        "    \n",
        "  @staticmethod\n",
        "  def downloadFolderFromGoogleDrive(folder):\n",
        "    !cp {GoogleDriveManager._baseFolder}/{folder}.zip .\n",
        "    !rm -rf {folder}\n",
        "    !unzip {folder}.zip\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "0DZfMrbe3MGN"
      },
      "outputs": [],
      "source": [
        "def getImagesAndLabels(dataDir):\n",
        "    fileSuffix = \".jpeg\"\n",
        "    images = sorted(list(map(str, list(dataDir.glob(\"*\" + fileSuffix)))))\n",
        "    labels = [image.split(os.path.sep)[-1].split(fileSuffix)[0] for image in images]\n",
        "    return images, labels\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "kdL9_t03Mf3t"
      },
      "outputs": [],
      "source": [
        "def getTrainValidationTestDatasets(dataDir, datasetFactory):\n",
        "    images, labels = getImagesAndLabels(dataDir)\n",
        "    print(\"Number of images found:\", len(images))\n",
        "    print(\"Characters:\", CaptchaGenerator.characters)\n",
        "\n",
        "    dataSplitter = DataSplitter(images, labels)\n",
        "    \n",
        "    return (\n",
        "        datasetFactory.createDataset(*dataSplitter.getTrain()),\n",
        "        datasetFactory.createDataset(*dataSplitter.getValid()),\n",
        "        datasetFactory.createDataset(*dataSplitter.getTest())\n",
        "        )"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "FqVSEuZp3MGT"
      },
      "outputs": [],
      "source": [
        "import matplotlib.pyplot as plt\n",
        "import math\n",
        "\n",
        "def displayImagesInGrid(numGridCols, images, titles, titleColors):\n",
        "    assert len(images) == len(titles) == len(titleColors)\n",
        "    images = [image.numpy().astype(np.uint8) for image in images]\n",
        "    numGridRows = math.ceil(len(images) / numGridCols)\n",
        "    _, axs = plt.subplots(numGridRows, numGridCols, figsize=(15, 5))\n",
        "    for row in range(numGridRows):\n",
        "        for col in range(numGridCols):\n",
        "            ax = axs[row, col]\n",
        "            ax.axis(\"off\")\n",
        "            i = row * numGridCols + col\n",
        "            if(i < len(images)):\n",
        "                ax.imshow(images[i])\n",
        "                ax.set_title(titles[i], color=titleColors[i])\n",
        "    plt.show()\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "apkeCHhP3MGU"
      },
      "outputs": [],
      "source": [
        "def display16Predictions(model, dataset, predictionsDecoder):\n",
        "    for batch in dataset.take(1):\n",
        "        numPredictions2Display = 16\n",
        "        batch_images = batch[\"image\"][:numPredictions2Display]\n",
        "        batch_labels = batch[\"label\"][:numPredictions2Display]\n",
        "\n",
        "        preds = model.predict(batch_images)\n",
        "        pred_texts = predictionsDecoder.decode_batch_predictions(preds)\n",
        "        orig_texts = predictionsDecoder.asStrings(batch_labels)\n",
        "\n",
        "        displayImagesInGrid(\n",
        "            4,\n",
        "            batch_images,\n",
        "            [f\"Prediction/Truth: {pred_text}/{orig_text}\" for (pred_text, orig_text) in zip(pred_texts, orig_texts)],\n",
        "            ['green' if pred_text == orig_text else 'red' for (pred_text, orig_text) in zip(pred_texts, orig_texts)])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "HEKh6eval31k"
      },
      "outputs": [],
      "source": [
        "def printLayers(model):\n",
        "    for i, layer in enumerate(model.layers):\n",
        "        print(i, layer.name)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "S3X_SslH3MGY"
      },
      "outputs": [],
      "source": [
        "# FK-TODO: entferne die getAccuracy()-Methode. Implementiere stattdessen https://stackoverflow.com/questions/37657260/how-to-implement-custom-metric-in-keras oder https://keras.io/api/metrics/#custom-metrics\n",
        "def getAccuracy(dataset, prediction_model, ctc_decode):\n",
        "    accuracy = tf.keras.metrics.Accuracy()\n",
        "\n",
        "    for batch in dataset:\n",
        "        accuracy.update_state(batch[\"label\"], ctc_decode(prediction_model.predict(batch[\"image\"], verbose=0)))\n",
        "\n",
        "    return accuracy.result().numpy()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "94755hrNMf3w"
      },
      "source": [
        "## Preparation"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "7EsmTaF03MGZ"
      },
      "outputs": [],
      "source": [
        "if inColab:\n",
        "    GoogleDriveManager.mount()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "WmUghcQaMf3y"
      },
      "outputs": [],
      "source": [
        "modelDAO = ModelDAO()\n",
        "charNumConverter = CharNumConverter(CaptchaGenerator.characters)\n",
        "predictionsDecoder = PredictionsDecoder(CaptchaGenerator.captchaLength, charNumConverter.num_to_char)\n",
        "captchaShape = CaptchaShape()\n",
        "datasetFactory = DatasetFactory(captchaShape, charNumConverter.char_to_num, batch_size = 64)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "def saveModel(model):\n",
        "    modelFilepath = f'{srcPath}/captcha/{model.name}'\n",
        "    modelDAO.saveModel(model, modelFilepath)\n",
        "    if inColab:\n",
        "        GoogleDriveManager.uploadFolderToGoogleDrive(modelFilepath)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "lsLuSi7h3MGZ"
      },
      "source": [
        "## Create And Train Base Model"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "oRcemcbG3MGa"
      },
      "outputs": [],
      "source": [
        "if inColab:\n",
        "    !apt-get update\n",
        "    !sudo apt install ttf-mscorefonts-installer\n",
        "    !sudo fc-cache -f\n",
        "    !fc-match Arial"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "P7myCt7e2h6A"
      },
      "outputs": [],
      "source": [
        "# \"We generate 200,000 images for base model pre-training\"\n",
        "captchaGenerator = CaptchaGenerator(\n",
        "    numCaptchas = 200000, # 50, # 200000,\n",
        "    dataDir = Path(srcPath + '/captchas/generated/VAERS/'))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "j9apYsyI3MGb"
      },
      "outputs": [],
      "source": [
        "captchaGenerator.createAndSaveCaptchas()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "AgN4skCkMf31"
      },
      "outputs": [],
      "source": [
        "train_dataset, validation_dataset, test_dataset = getTrainValidationTestDatasets(captchaGenerator.dataDir, datasetFactory)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "RcgWHXVSNsa7"
      },
      "outputs": [],
      "source": [
        "for batch in train_dataset.take(1):\n",
        "    numImages2Display = 16\n",
        "    images = batch[\"image\"][:numImages2Display]\n",
        "    labels = batch[\"label\"][:numImages2Display]\n",
        "    displayImagesInGrid(4, images, predictionsDecoder.asStrings(labels), ['black'] * len(labels))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "zDoFYKM2hdEW"
      },
      "outputs": [],
      "source": [
        "modelFactory = ModelFactory(captchaShape, charNumConverter.char_to_num)\n",
        "model = modelFactory.createMobileNetV3Small()\n",
        "model.summary()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ltXYrpjIITAb"
      },
      "outputs": [],
      "source": [
        "# \"the success rates became stable after the base-model training epochs exceeded 20\"\n",
        "history = model.fit(\n",
        "    train_dataset,\n",
        "    validation_data=validation_dataset,\n",
        "    epochs=20)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "fPG-Yl1SJfF7"
      },
      "outputs": [],
      "source": [
        "saveModel(model)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "NnNHMtIGITAe"
      },
      "outputs": [],
      "source": [
        "prediction_model = ModelFactory.createPredictionModel(model)\n",
        "prediction_model.summary()\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "YW651ztD8sKI"
      },
      "outputs": [],
      "source": [
        "display16Predictions(prediction_model, test_dataset, predictionsDecoder)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "V5gqMBIwBmZU"
      },
      "outputs": [],
      "source": [
        "getAccuracy(test_dataset, prediction_model, predictionsDecoder.ctc_decode)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "UYxiYTH9BmZU"
      },
      "source": [
        "## Transfer learning"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "WV8IS4KrBmZU"
      },
      "outputs": [],
      "source": [
        "# \"we collected 1,500 real CAPTCHAs from the websites. Note that only 500 of them are used for fine-tuning, and another 1,000 are applied to calculate the test accuracy\"\n",
        "# FK-TODO: lade das pre-trainierte model und trainiere es mit 500 real-world-Daten aus dem Ordner captchas/VAERS/, die restlichen 540 (es sollten nach obigem Zitat aber 1,000 sein) sind dann die Test-Daten.\n",
        "# see https://keras.io/guides/transfer_learning/\n",
        "# see https://www.tensorflow.org/tutorials/images/transfer_learning\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "qZvn1k2Ul31v"
      },
      "outputs": [],
      "source": [
        "modelName, numTrainableLayers = 'MobileNetV3Small', 104\n",
        "# modelName, numTrainableLayers = 'ResNet101', 348"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "D7ogEQmB3MGj"
      },
      "outputs": [],
      "source": [
        "# FK-TODO: DRY with VAERSFileDownloader\n",
        "modelFilepath = f'{srcPath}/captcha/{modelName}'\n",
        "model = modelDAO.loadModel(modelFilepath)\n",
        "model.summary(show_trainable=True)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "gbPigogKNFrD"
      },
      "outputs": [],
      "source": [
        "# printLayers(model)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "59quw8o3Mf34"
      },
      "outputs": [],
      "source": [
        "model.trainable = True\n",
        "for layer in model.layers[:numTrainableLayers]:\n",
        "    layer.trainable = False"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "acGczax3Mf34"
      },
      "outputs": [],
      "source": [
        "model.summary(show_trainable=True)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "q7_MjUO0BmZV"
      },
      "outputs": [],
      "source": [
        "train_dataset, validation_dataset, test_dataset = getTrainValidationTestDatasets(Path(f\"{srcPath}/captcha/captchas/VAERS/\"), datasetFactory)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "dZsCpibkBmZX"
      },
      "outputs": [],
      "source": [
        "# \"The model is optimized by a stochastic gradient descent (SGD) strategy with an initial learning rate of 0.004, weight decay of 0.00004 and momentum of 0.9.\"\n",
        "from tensorflow.keras.optimizers import SGD\n",
        "# model.compile(optimizer=SGD(learning_rate=0.0001, momentum=0.9))\n",
        "model.compile(optimizer='adam')\n",
        "\n",
        "# \"Therefore, in our experiments, we chose 1 epoch for the fine-tuning stage.\"\n",
        "history = model.fit(\n",
        "    train_dataset,\n",
        "    validation_data=validation_dataset,\n",
        "    epochs=20)\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "TRbJigbH3MGl"
      },
      "outputs": [],
      "source": [
        "prediction_model = ModelFactory.createPredictionModel(model)\n",
        "prediction_model.summary()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "rPszfhJ4BmZX"
      },
      "outputs": [],
      "source": [
        "getAccuracy(test_dataset, prediction_model, predictionsDecoder.ctc_decode)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "collapsed": true,
        "id": "hfmRY1qC7aVV"
      },
      "outputs": [],
      "source": [
        "display16Predictions(prediction_model, test_dataset, predictionsDecoder)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "FpJTHU6dxOVy"
      },
      "outputs": [],
      "source": [
        "saveModel(model)"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "private_outputs": true,
      "provenance": []
    },
    "gpuClass": "standard",
    "kernelspec": {
      "display_name": "howbadismybatch-venv-kernel",
      "language": "python",
      "name": "howbadismybatch-venv-kernel"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.9.15"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}