665 lines
19 KiB
Plaintext
665 lines
19 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "UNKC5YSEIS_d"
|
|
},
|
|
"source": [
|
|
"# Captchas\n",
|
|
"\n",
|
|
"**see:** https://keras.io/examples/vision/captcha_ocr/<br>\n",
|
|
"**original:** https://colab.research.google.com/drive/1Olw2KMHfPlnGaYuzffl2zb6D1etlBGZf?usp=sharing<br>\n",
|
|
"**View Github version in Colab:** <a href=\"https://colab.research.google.com/github/KnollFrank/2captcha-worker-assistant-server/blob/master/captcha_ocr_trainAndSaveModel_colab.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a><br>\n",
|
|
"**paper:** Simple and Easy: Transfer Learning-Based Attacks to Text CAPTCHA<br>"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "wRUsVuIiIS_s"
|
|
},
|
|
"source": [
|
|
"## Setup"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "ioGwCR3Xl31V"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import sys\n",
|
|
"sys.argv = sys.argv[:1]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "l-coMy_2l31X"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def isInColab():\n",
|
|
"    \"\"\"Report whether this notebook is running inside Google Colab.\n",
|
|
"\n",
|
|
"    Returns:\n",
|
|
"        bool: True if the ``google.colab`` package can be imported, else False.\n",
|
|
"    \"\"\"\n",
|
|
"    try:\n",
|
|
"        # The Colab runtime exposes its helpers as ``google.colab``; importing a\n",
|
|
"        # top-level ``colab`` module fails there too, so the old check was always False.\n",
|
|
"        import google.colab  # noqa: F401\n",
|
|
"    except ImportError:\n",
|
|
"        # Only a missing package means 'not in Colab'; other errors must surface.\n",
|
|
"        return False\n",
|
|
"    return True"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "goO0feQwl31Y"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"inColab = isInColab()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "nsE9VWCel31Z"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"if inColab:\n",
|
|
" branch = 'read-captcha'\n",
|
|
" !git clone https://github.com/KnollFrank/HowBadIsMyBatch.git\n",
|
|
" !cd HowBadIsMyBatch; git checkout $branch"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "l9qhlDVNl31b"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
|
|
"srcPath = '/content/HowBadIsMyBatch/src' if inColab else os.getcwd()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "c-2fE6vZsD7a"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"if inColab:\n",
|
|
" sys.path.insert(0, srcPath)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "zZSwQragIS_v"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import numpy as np\n",
|
|
"from pathlib import Path\n",
|
|
"import tensorflow as tf\n",
|
|
"from captcha.CaptchaGenerator import CaptchaGenerator\n",
|
|
"from captcha.CharNumConverter import CharNumConverter\n",
|
|
"from captcha.DataSplitter import DataSplitter\n",
|
|
"from captcha.DatasetFactory import DatasetFactory\n",
|
|
"from captcha.ModelFactory import ModelFactory\n",
|
|
"from captcha.PredictionsDecoder import PredictionsDecoder\n",
|
|
"from captcha.ModelDAO import ModelDAO\n",
|
|
"from captcha.CaptchaShape import CaptchaShape"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "BWqAvnVOl31d"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"from pathlib import Path\n",
|
|
"\n",
|
|
"class GoogleDriveManager:\n",
|
|
"    \"\"\"Moves zipped model folders between the local filesystem and Google Drive.\n",
|
|
"\n",
|
|
"    All methods are static. Call ``mount()`` before uploading or downloading.\n",
|
|
"    \"\"\"\n",
|
|
"\n",
|
|
"    # Mount point of the Google Drive.\n",
|
|
"    _googleDriveFolder = Path('/content/gdrive')\n",
|
|
"    # Folder below the mount point that holds the zipped models.\n",
|
|
"    _baseFolder = _googleDriveFolder / 'MyDrive/CAPTCHA/models/'\n",
|
|
"\n",
|
|
"    @staticmethod\n",
|
|
"    def mount():\n",
|
|
"        \"\"\"Mount Google Drive at ``_googleDriveFolder``.\"\"\"\n",
|
|
"        # Imported lazily so the class can be defined where google.colab is absent.\n",
|
|
"        from google.colab import drive\n",
|
|
"        drive.mount(str(GoogleDriveManager._googleDriveFolder))\n",
|
|
"\n",
|
|
"    @staticmethod\n",
|
|
"    def uploadFolderToGoogleDrive(folder):\n",
|
|
"        \"\"\"Zip ``folder`` next to itself and copy the archive into ``_baseFolder``.\n",
|
|
"\n",
|
|
"        Args:\n",
|
|
"            folder: path (str or Path) of the directory to upload. The archive is\n",
|
|
"                named ``<basename>.zip`` and holds ``<basename>/`` as top-level entry.\n",
|
|
"        \"\"\"\n",
|
|
"        import shutil\n",
|
|
"\n",
|
|
"        folder = Path(folder)\n",
|
|
"        # shutil replaces the former `!zip`/`!cp` shell lines, whose unquoted\n",
|
|
"        # interpolation broke on paths with spaces and only parsed under IPython.\n",
|
|
"        archive = shutil.make_archive(\n",
|
|
"            base_name=str(folder.parent / folder.name),\n",
|
|
"            format='zip',\n",
|
|
"            root_dir=str(folder.parent),\n",
|
|
"            base_dir=folder.name)\n",
|
|
"        # Create the target first; `cp` into a missing folder wrote a file named 'models'.\n",
|
|
"        GoogleDriveManager._baseFolder.mkdir(parents=True, exist_ok=True)\n",
|
|
"        shutil.copy(archive, GoogleDriveManager._baseFolder)\n",
|
|
"\n",
|
|
"    @staticmethod\n",
|
|
"    def downloadFolderFromGoogleDrive(folder):\n",
|
|
"        \"\"\"Copy ``<folder>.zip`` from ``_baseFolder`` and unpack it into the cwd.\n",
|
|
"\n",
|
|
"        An existing local ``folder`` is removed first so no stale files survive.\n",
|
|
"\n",
|
|
"        Args:\n",
|
|
"            folder: name of the zipped folder, without the ``.zip`` suffix.\n",
|
|
"        \"\"\"\n",
|
|
"        import shutil\n",
|
|
"\n",
|
|
"        archiveName = f'{folder}.zip'\n",
|
|
"        localArchive = shutil.copy(GoogleDriveManager._baseFolder / archiveName, '.')\n",
|
|
"        shutil.rmtree(folder, ignore_errors=True)\n",
|
|
"        shutil.unpack_archive(localArchive, '.')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "0DZfMrbe3MGN"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def getImagesAndLabels(dataDir):\n",
|
|
"    \"\"\"Collect the '.jpeg' files of ``dataDir`` together with their labels.\n",
|
|
"\n",
|
|
"    Args:\n",
|
|
"        dataDir: directory object offering ``glob`` (e.g. ``pathlib.Path``).\n",
|
|
"\n",
|
|
"    Returns:\n",
|
|
"        tuple: (sorted file paths as str, labels). A label is the file name\n",
|
|
"        up to the first occurrence of '.jpeg'.\n",
|
|
"    \"\"\"\n",
|
|
"    fileSuffix = \".jpeg\"\n",
|
|
"    images = [str(imagePath) for imagePath in dataDir.glob(\"*\" + fileSuffix)]\n",
|
|
"    images.sort()\n",
|
|
"    labels = []\n",
|
|
"    for image in images:\n",
|
|
"        # Last path component, then everything before the suffix.\n",
|
|
"        fileName = image.rpartition(os.path.sep)[2]\n",
|
|
"        labels.append(fileName.partition(fileSuffix)[0])\n",
|
|
"    return images, labels"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "kdL9_t03Mf3t"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def getTrainValidationTestDatasets(dataDir, datasetFactory):\n",
|
|
"    \"\"\"Build the train, validation and test datasets from the images in ``dataDir``.\n",
|
|
"\n",
|
|
"    Args:\n",
|
|
"        dataDir: directory handed to ``getImagesAndLabels``.\n",
|
|
"        datasetFactory: object whose ``createDataset`` receives the unpacked\n",
|
|
"            result of each ``DataSplitter`` getter.\n",
|
|
"\n",
|
|
"    Returns:\n",
|
|
"        tuple: (train dataset, validation dataset, test dataset).\n",
|
|
"    \"\"\"\n",
|
|
"    images, labels = getImagesAndLabels(dataDir)\n",
|
|
"    print(\"Number of images found:\", len(images))\n",
|
|
"    print(\"Characters:\", CaptchaGenerator.characters)\n",
|
|
"\n",
|
|
"    dataSplitter = DataSplitter(images, labels)\n",
|
|
"    splitGetters = (dataSplitter.getTrain, dataSplitter.getValid, dataSplitter.getTest)\n",
|
|
"    # Each getter runs right before its dataset is created, one split at a time.\n",
|
|
"    return tuple(datasetFactory.createDataset(*getSplit()) for getSplit in splitGetters)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "FqVSEuZp3MGT"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import math\n",
|
|
"\n",
|
|
"def displayImagesInGrid(numGridCols, images, titles, titleColors):\n",
|
|
"    \"\"\"Show ``images`` in a grid with ``numGridCols`` columns, one title per image.\n",
|
|
"\n",
|
|
"    Args:\n",
|
|
"        numGridCols: number of grid columns.\n",
|
|
"        images: sequence of objects offering ``.numpy()``; arrays are cast to uint8.\n",
|
|
"        titles: one title per image.\n",
|
|
"        titleColors: one matplotlib color per title.\n",
|
|
"\n",
|
|
"    Raises:\n",
|
|
"        AssertionError: if the three sequences differ in length.\n",
|
|
"    \"\"\"\n",
|
|
"    assert len(images) == len(titles) == len(titleColors)\n",
|
|
"    if len(images) == 0:\n",
|
|
"        # Nothing to draw; plt.subplots() rejects a grid with zero rows.\n",
|
|
"        return\n",
|
|
"    images = [image.numpy().astype(np.uint8) for image in images]\n",
|
|
"    numGridRows = math.ceil(len(images) / numGridCols)\n",
|
|
"    # squeeze=False keeps axs two-dimensional. Without it a grid with a single\n",
|
|
"    # row or column comes back as a 1-D array (or a bare Axes) and 2-D indexing fails.\n",
|
|
"    _, axs = plt.subplots(numGridRows, numGridCols, figsize=(15, 5), squeeze=False)\n",
|
|
"    for ax in axs.flat:\n",
|
|
"        ax.axis(\"off\")\n",
|
|
"    # axs.flat runs row by row, so image i lands in row i // numGridCols, column i % numGridCols.\n",
|
|
"    for ax, image, title, titleColor in zip(axs.flat, images, titles, titleColors):\n",
|
|
"        ax.imshow(image)\n",
|
|
"        ax.set_title(title, color=titleColor)\n",
|
|
"    plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "apkeCHhP3MGU"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def display16Predictions(model, dataset, predictionsDecoder):\n",
|
|
"    \"\"\"Plot up to 16 samples of the first batch with predicted and true text.\n",
|
|
"\n",
|
|
"    Each title shows prediction and truth; it is green when both texts are\n",
|
|
"    equal and red otherwise.\n",
|
|
"\n",
|
|
"    Args:\n",
|
|
"        model: object whose ``predict`` is applied to the image slice.\n",
|
|
"        dataset: object offering ``take``; batches are indexed by 'image' and 'label'.\n",
|
|
"        predictionsDecoder: provides ``decode_batch_predictions`` and ``asStrings``.\n",
|
|
"    \"\"\"\n",
|
|
"    numPredictions2Display = 16\n",
|
|
"    for batch in dataset.take(1):\n",
|
|
"        shownImages = batch[\"image\"][:numPredictions2Display]\n",
|
|
"        shownLabels = batch[\"label\"][:numPredictions2Display]\n",
|
|
"\n",
|
|
"        predictedTexts = predictionsDecoder.decode_batch_predictions(model.predict(shownImages))\n",
|
|
"        trueTexts = predictionsDecoder.asStrings(shownLabels)\n",
|
|
"        textPairs = list(zip(predictedTexts, trueTexts))\n",
|
|
"\n",
|
|
"        titles = [f\"Prediction/Truth: {predicted}/{truth}\" for predicted, truth in textPairs]\n",
|
|
"        titleColors = ['green' if predicted == truth else 'red' for predicted, truth in textPairs]\n",
|
|
"        displayImagesInGrid(4, shownImages, titles, titleColors)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "HEKh6eval31k"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"def printLayers(model):\n",
|
|
"    \"\"\"Print every entry of ``model.layers`` as '<index> <name>', one per line.\"\"\"\n",
|
|
"    position = 0\n",
|
|
"    for layer in model.layers:\n",
|
|
"        print(position, layer.name)\n",
|
|
"        position += 1"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "S3X_SslH3MGY"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# FK-TODO: remove the getAccuracy() method. Implement https://stackoverflow.com/questions/37657260/how-to-implement-custom-metric-in-keras or https://keras.io/api/metrics/#custom-metrics instead\n",
|
|
"def getAccuracy(dataset, prediction_model, ctc_decode):\n",
|
|
"    \"\"\"Accumulate ``tf.keras.metrics.Accuracy`` over all batches of ``dataset``.\n",
|
|
"\n",
|
|
"    Args:\n",
|
|
"        dataset: iterable of batches indexed by 'image' and 'label'.\n",
|
|
"        prediction_model: object whose ``predict`` is applied to the batch images.\n",
|
|
"        ctc_decode: callable turning the raw predictions into the metric input.\n",
|
|
"\n",
|
|
"    Returns:\n",
|
|
"        The metric result converted with ``.numpy()``.\n",
|
|
"    \"\"\"\n",
|
|
"    # NOTE(review): Accuracy compares element-wise; if labels hold one id per\n",
|
|
"    # character this is a per-character rate, not a per-captcha rate - confirm.\n",
|
|
"    metric = tf.keras.metrics.Accuracy()\n",
|
|
"\n",
|
|
"    for batch in dataset:\n",
|
|
"        expected = batch[\"label\"]\n",
|
|
"        rawPredictions = prediction_model.predict(batch[\"image\"], verbose=0)\n",
|
|
"        metric.update_state(expected, ctc_decode(rawPredictions))\n",
|
|
"\n",
|
|
"    return metric.result().numpy()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "94755hrNMf3w"
|
|
},
|
|
"source": [
|
|
"## Preparation"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "7EsmTaF03MGZ"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"if inColab:\n",
|
|
" GoogleDriveManager.mount()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "WmUghcQaMf3y"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"modelDAO = ModelDAO()\n",
|
|
"charNumConverter = CharNumConverter(CaptchaGenerator.characters)\n",
|
|
"predictionsDecoder = PredictionsDecoder(CaptchaGenerator.captchaLength, charNumConverter.num_to_char)\n",
|
|
"captchaShape = CaptchaShape()\n",
|
|
"datasetFactory = DatasetFactory(captchaShape, charNumConverter.char_to_num, batch_size = 64)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def saveModel(model):\n",
|
|
"    \"\"\"Save ``model`` under <srcPath>/captcha/<model.name>; in Colab also upload it to Google Drive.\"\"\"\n",
|
|
"    modelFilepath = f'{srcPath}/captcha/{model.name}'\n",
|
|
"    modelDAO.saveModel(model, modelFilepath)\n",
|
|
"    if not inColab:\n",
|
|
"        # Outside Colab there is no mounted Drive to upload to.\n",
|
|
"        return\n",
|
|
"    GoogleDriveManager.uploadFolderToGoogleDrive(modelFilepath)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "lsLuSi7h3MGZ"
|
|
},
|
|
"source": [
|
|
"## Create And Train Base Model"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "oRcemcbG3MGa"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"if inColab:\n",
|
|
"    # Installs the Microsoft core fonts so that Arial resolves via fontconfig\n",
|
|
"    # (presumably needed by the captcha generator - TODO confirm).\n",
|
|
"    !apt-get update\n",
|
|
"    # Pre-accept the font EULA and pass -y so that 'Run all' does not stop at a prompt.\n",
|
|
"    !echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | sudo debconf-set-selections\n",
|
|
"    !sudo DEBIAN_FRONTEND=noninteractive apt-get install -y ttf-mscorefonts-installer\n",
|
|
"    !sudo fc-cache -f\n",
|
|
"    # Prints the font that fontconfig now resolves for 'Arial'.\n",
|
|
"    !fc-match Arial"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "P7myCt7e2h6A"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# \"We generate 200,000 images for base model pre-training\"\n",
|
|
"captchaGenerator = CaptchaGenerator(\n",
|
|
" numCaptchas = 200000, # 50, # 200000,\n",
|
|
" dataDir = Path(srcPath + '/captchas/generated/VAERS/'))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "j9apYsyI3MGb"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"captchaGenerator.createAndSaveCaptchas()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "AgN4skCkMf31"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"train_dataset, validation_dataset, test_dataset = getTrainValidationTestDatasets(captchaGenerator.dataDir, datasetFactory)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "RcgWHXVSNsa7"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"for batch in train_dataset.take(1):\n",
|
|
" numImages2Display = 16\n",
|
|
" images = batch[\"image\"][:numImages2Display]\n",
|
|
" labels = batch[\"label\"][:numImages2Display]\n",
|
|
" displayImagesInGrid(4, images, predictionsDecoder.asStrings(labels), ['black'] * len(labels))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "zDoFYKM2hdEW"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"modelFactory = ModelFactory(captchaShape, charNumConverter.char_to_num)\n",
|
|
"model = modelFactory.createMobileNetV3Small()\n",
|
|
"model.summary()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "ltXYrpjIITAb"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# \"the success rates became stable after the base-model training epochs exceeded 20\"\n",
|
|
"history = model.fit(\n",
|
|
" train_dataset,\n",
|
|
" validation_data=validation_dataset,\n",
|
|
" epochs=20)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "fPG-Yl1SJfF7"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"saveModel(model)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "NnNHMtIGITAe"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"prediction_model = ModelFactory.createPredictionModel(model)\n",
|
|
"prediction_model.summary()\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "YW651ztD8sKI"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"display16Predictions(prediction_model, test_dataset, predictionsDecoder)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "V5gqMBIwBmZU"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"getAccuracy(test_dataset, prediction_model, predictionsDecoder.ctc_decode)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {
|
|
"id": "UYxiYTH9BmZU"
|
|
},
|
|
"source": [
|
|
"## Transfer learning"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "WV8IS4KrBmZU"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# \"we collected 1,500 real CAPTCHAs from the websites. Note that only 500 of them are used for fine-tuning, and another 1,000 are applied to calculate the test accuracy\"\n",
|
|
"# FK-TODO: load the pre-trained model and train it with 500 real-world samples from the folder captchas/VAERS/; the remaining 540 (according to the quote above there should be 1,000, though) are then the test data.\n",
|
|
"# see https://keras.io/guides/transfer_learning/\n",
|
|
"# see https://www.tensorflow.org/tutorials/images/transfer_learning\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "qZvn1k2Ul31v"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"modelName, numTrainableLayers = 'MobileNetV3Small', 104\n",
|
|
"# modelName, numTrainableLayers = 'ResNet101', 348"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "D7ogEQmB3MGj"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# FK-TODO: DRY with VAERSFileDownloader\n",
|
|
"modelFilepath = f'{srcPath}/captcha/{modelName}'\n",
|
|
"model = modelDAO.loadModel(modelFilepath)\n",
|
|
"model.summary(show_trainable=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "gbPigogKNFrD"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# printLayers(model)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "59quw8o3Mf34"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"model.trainable = True\n",
|
|
"for layer in model.layers[:numTrainableLayers]:\n",
|
|
" layer.trainable = False"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "acGczax3Mf34"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"model.summary(show_trainable=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "q7_MjUO0BmZV"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"train_dataset, validation_dataset, test_dataset = getTrainValidationTestDatasets(Path(f\"{srcPath}/captcha/captchas/VAERS/\"), datasetFactory)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "dZsCpibkBmZX"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# \"The model is optimized by a stochastic gradient descent (SGD) strategy with an initial learning rate of 0.004, weight decay of 0.00004 and momentum of 0.9.\"\n",
|
|
"from tensorflow.keras.optimizers import SGD\n",
|
|
"# model.compile(optimizer=SGD(learning_rate=0.0001, momentum=0.9))\n",
|
|
"model.compile(optimizer='adam')\n",
|
|
"\n",
|
|
"# \"Therefore, in our experiments, we chose 1 epoch for the fine-tuning stage.\"\n",
|
|
"history = model.fit(\n",
|
|
" train_dataset,\n",
|
|
" validation_data=validation_dataset,\n",
|
|
" epochs=20)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "TRbJigbH3MGl"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"prediction_model = ModelFactory.createPredictionModel(model)\n",
|
|
"prediction_model.summary()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "rPszfhJ4BmZX"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"getAccuracy(test_dataset, prediction_model, predictionsDecoder.ctc_decode)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"collapsed": true,
|
|
"id": "hfmRY1qC7aVV"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"display16Predictions(prediction_model, test_dataset, predictionsDecoder)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "FpJTHU6dxOVy"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"saveModel(model)"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"accelerator": "GPU",
|
|
"colab": {
|
|
"private_outputs": true,
|
|
"provenance": []
|
|
},
|
|
"gpuClass": "standard",
|
|
"kernelspec": {
|
|
"display_name": "howbadismybatch-venv-kernel",
|
|
"language": "python",
|
|
"name": "howbadismybatch-venv-kernel"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.15"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 0
|
|
}
|