diff --git a/environment.yml b/environment.yml index 59c3150f39c..d2542baa7cc 100644 --- a/environment.yml +++ b/environment.yml @@ -16,6 +16,7 @@ dependencies: - nb_conda_kernels - pillow - openpyxl + - tqdm # - python-decouple # - selenium # - webdriver-manager diff --git a/src/captcha.ipynb b/src/captcha.ipynb index 3fffbb00435..4fb5052578e 100644 --- a/src/captcha.ipynb +++ b/src/captcha.ipynb @@ -27,15 +27,90 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "zZSwQragIS_v" + "id": "ioGwCR3Xl31V" + }, + "outputs": [], + "source": [ + "import sys\n", + "sys.argv = sys.argv[:1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "l-coMy_2l31X" + }, + "outputs": [], + "source": [ + "def isInColab():\n", + " try:\n", + " import colab\n", + " return True\n", + " except:\n", + " return False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "goO0feQwl31Y" + }, + "outputs": [], + "source": [ + "inColab = isInColab()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nsE9VWCel31Z" + }, + "outputs": [], + "source": [ + "if inColab:\n", + " branch = 'read-captcha'\n", + " !git clone https://github.com/KnollFrank/HowBadIsMyBatch.git\n", + " !cd HowBadIsMyBatch; git checkout $branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "l9qhlDVNl31b" }, "outputs": [], "source": [ "import os\n", + "srcPath = '/content/HowBadIsMyBatch/src' if inColab else os.getcwd()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c-2fE6vZsD7a" + }, + "outputs": [], + "source": [ + "if inColab:\n", + " sys.path.insert(0, srcPath)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zZSwQragIS_v" + }, + "outputs": [], + "source": [ "import numpy as np\n", "from pathlib import Path\n", "import tensorflow as tf\n", - "from captcha.GoogleDriveManager import GoogleDriveManager\n", "from captcha.CaptchaGenerator import CaptchaGenerator\n", "from captcha.CharNumConverter import CharNumConverter\n", "from captcha.DataSplitter import DataSplitter\n", @@ -46,6 +121,40 @@ "from captcha.CaptchaShape import CaptchaShape" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BWqAvnVOl31d" + }, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "class GoogleDriveManager:\n", + " \n", + " _googleDriveFolder = Path('/content/gdrive')\n", + " _baseFolder = _googleDriveFolder / 'MyDrive/CAPTCHA/models/'\n", + "\n", + " @staticmethod\n", + " def mount():\n", + " from google.colab import drive\n", + " drive.mount(str(GoogleDriveManager._googleDriveFolder))\n", + "\n", + " @staticmethod\n", + " def uploadFolderToGoogleDrive(folder):\n", + " basename = !basename {folder}\n", + " basename = basename[0]\n", + " !cd {folder}/..; zip -r {basename}.zip {basename}/\n", + " !cd {folder}/..; cp {basename}.zip {GoogleDriveManager._baseFolder}\n", + " \n", + " @staticmethod\n", + " def downloadFolderFromGoogleDrive(folder):\n", + " !cp {GoogleDriveManager._baseFolder}/{folder}.zip .\n", + " !rm -rf {folder}\n", + " !unzip {folder}.zip\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -138,7 +247,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "HEKh6eval31k" + }, "outputs": [], "source": [ "def printLayers(model):\n", @@ -173,17 +284,6 @@ "## Preparation" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NZrKXF6P3MGY" - }, - "outputs": [], - "source": [ - "inColab = 'google.colab' in str(get_ipython())" - ] - }, { "cell_type": "code", "execution_count": null, @@ -200,28 +300,28 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "S_4hl4S4BmZK" + "id": "WmUghcQaMf3y" }, "outputs": [], "source": [ - "if inColab:\n", - " !cp {GoogleDriveManager._baseFolder}/captchas.zip .\n", - " !unzip captchas.zip" + "modelDAO = ModelDAO()\n", + "charNumConverter = CharNumConverter(CaptchaGenerator.characters)\n", + "predictionsDecoder = PredictionsDecoder(CaptchaGenerator.captchaLength, charNumConverter.num_to_char)\n", + "captchaShape = CaptchaShape()\n", + "datasetFactory = DatasetFactory(captchaShape, charNumConverter.char_to_num, batch_size = 64)" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "id": "WmUghcQaMf3y" - }, + "metadata": {}, "outputs": [], "source": [ - "modelDAO = ModelDAO(inColab)\n", - "charNumConverter = CharNumConverter(CaptchaGenerator.characters)\n", - "predictionsDecoder = PredictionsDecoder(CaptchaGenerator.captchaLength, charNumConverter.num_to_char)\n", - "captchaShape = CaptchaShape()\n", - "datasetFactory = DatasetFactory(captchaShape, charNumConverter.char_to_num, batch_size = 64)" + "def saveModel(model):\n", + " modelFilepath = f'{srcPath}/captcha/{model.name}'\n", + " modelDAO.saveModel(model, modelFilepath)\n", + " if inColab:\n", + " GoogleDriveManager.uploadFolderToGoogleDrive(modelFilepath)" ] }, { @@ -242,6 +342,7 @@ "outputs": [], "source": [ "if inColab:\n", + " !apt-get update\n", " !sudo apt install ttf-mscorefonts-installer\n", " !sudo fc-cache -f\n", " !fc-match Arial" @@ -257,8 +358,8 @@ "source": [ "# \"We generate 200,000 images for base model pre-training\"\n", "captchaGenerator = CaptchaGenerator(\n", - " numCaptchas = 50, # 50, # 200000,\n", - " dataDir = Path(\"captchas/generated/VAERS/\"))" + " numCaptchas = 200000, # 50, # 200000,\n", + " dataDir = Path(srcPath + '/captchas/generated/VAERS/'))" ] }, { @@ -334,7 +435,7 @@ }, "outputs": [], "source": [ - "modelDAO.saveModel(model)" + "saveModel(model)" ] }, { @@ -397,7 +498,9 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "qZvn1k2Ul31v" + }, "outputs": [], "source": [ "modelName, numTrainableLayers = 'MobileNetV3Small', 104\n", @@ -413,7 +516,7 @@ "outputs": [], "source": [ "# FK-TODO: DRY with VAERSFileDownloader\n", - "modelFilepath = f'{os.getcwd()}/captcha/{modelName}'\n", + "modelFilepath = f'{srcPath}/captcha/{modelName}'\n", "model = modelDAO.loadModel(modelFilepath)\n", "model.summary(show_trainable=True)" ] @@ -461,7 +564,7 @@ }, "outputs": [], "source": [ - "train_dataset, validation_dataset, test_dataset = getTrainValidationTestDatasets(Path(\"captcha/captchas/VAERS/\"), datasetFactory)" + "train_dataset, validation_dataset, test_dataset = getTrainValidationTestDatasets(Path(f\"{srcPath}/captcha/captchas/VAERS/\"), datasetFactory)" ] }, { @@ -522,18 +625,18 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "id": "FpJTHU6dxOVy" + }, "outputs": [], "source": [ - "modelDAO.saveModel(model)" + "saveModel(model)" ] } ], "metadata": { "accelerator": "GPU", "colab": { - "collapsed_sections": [], - "name": "captcha.ipynb", "private_outputs": true, "provenance": [] }, diff --git a/src/captcha/CaptchaGenerator.py b/src/captcha/CaptchaGenerator.py index affc3f6ebcf..3dcd06fe209 100644 --- a/src/captcha/CaptchaGenerator.py +++ b/src/captcha/CaptchaGenerator.py @@ -2,6 +2,7 @@ from PIL import Image, ImageDraw, ImageFont import random import string import shutil +from tqdm import tqdm class CaptchaGenerator: @@ -15,7 +16,7 @@ class CaptchaGenerator: def createAndSaveCaptchas(self): self._prepareDataDir() - for _ in range(self.numCaptchas): + for _ in tqdm(range(self.numCaptchas)): self._createAndSaveCaptcha() def _prepareDataDir(self): diff --git a/src/captcha/CaptchaReader.py b/src/captcha/CaptchaReader.py index 460083f7f87..94dbc7f34fa 100644 --- a/src/captcha/CaptchaReader.py +++ b/src/captcha/CaptchaReader.py @@ -28,4 +28,4 @@ class CaptchaReader: return PredictionsDecoder(CaptchaGenerator.captchaLength, CharNumConverter(CaptchaGenerator.characters).num_to_char).decode_batch_predictions(preds) def _createPredictionModel(self): - return ModelFactory.createPredictionModel(ModelDAO(inColab=False).loadModel(self.modelFilepath)) + return ModelFactory.createPredictionModel(ModelDAO().loadModel(self.modelFilepath)) diff --git a/src/captcha/GoogleDriveManager.py b/src/captcha/GoogleDriveManager.py deleted file mode 100644 index 3b622677801..00000000000 --- a/src/captcha/GoogleDriveManager.py +++ /dev/null @@ -1,27 +0,0 @@ -from pathlib import Path - - -class GoogleDriveManager: - - _googleDriveFolder = Path('/content/gdrive') - _baseFolder = _googleDriveFolder / 'MyDrive/CAPTCHA/models/' - - @staticmethod - def mount(): - from google.colab import drive - drive.mount(str(GoogleDriveManager._googleDriveFolder)) - - @staticmethod - def uploadFolderToGoogleDrive(folder): - pass - # FK-FIXME: - # !zip -r {folder}.zip {folder}/ - # !cp {folder}.zip {GoogleDriveManager._baseFolder} - - @staticmethod - def downloadFolderFromGoogleDrive(folder): - pass - # FK-FIXME: - # !cp {GoogleDriveManager._baseFolder}/{folder}.zip . - # !rm -rf {folder} - # !unzip {folder}.zip diff --git a/src/captcha/ModelDAO.py b/src/captcha/ModelDAO.py index 168d9c37ee8..fe57bfd71cc 100644 --- a/src/captcha/ModelDAO.py +++ b/src/captcha/ModelDAO.py @@ -1,20 +1,12 @@ from tensorflow import keras -from captcha.GoogleDriveManager import GoogleDriveManager import shutil class ModelDAO: - def __init__(self, inColab): - self.inColab = inColab - - def saveModel(self, model): - shutil.rmtree(model.name, ignore_errors = True) - model.save(model.name) - if self.inColab: - GoogleDriveManager.uploadFolderToGoogleDrive(model.name) + def saveModel(self, model, modelFilepath): + shutil.rmtree(modelFilepath, ignore_errors = True) + model.save(modelFilepath) def loadModel(self, modelFilepath): - if self.inColab: - GoogleDriveManager.downloadFolderFromGoogleDrive(modelFilepath) return keras.models.load_model(modelFilepath)