From a9e7bf483392199bb83856220c1ce20150230543 Mon Sep 17 00:00:00 2001 From: frankknoll Date: Wed, 15 Mar 2023 17:14:24 +0100 Subject: [PATCH] refactoring --- src/VAERSFileDownloader.py | 4 +- src/{captcha => }/captcha.ipynb | 108 +++++++++++++------------------ src/captcha/CaptchaReader.py | 10 ++- src/captcha/CaptchaReaderTest.py | 5 +- src/captcha/CaptchaShape.py | 5 ++ src/captcha/DatasetFactory.py | 7 +- src/captcha/ModelFactory.py | 11 ++-- 7 files changed, 68 insertions(+), 82 deletions(-) rename src/{captcha => }/captcha.ipynb (85%) create mode 100644 src/captcha/CaptchaShape.py diff --git a/src/VAERSFileDownloader.py b/src/VAERSFileDownloader.py index 5f98de34704..7d59a83996b 100644 --- a/src/VAERSFileDownloader.py +++ b/src/VAERSFileDownloader.py @@ -4,6 +4,7 @@ from WebDriver import getWebDriver, isCaptchaSolved, saveCaptchaImageAs from selenium.webdriver.common.by import By from captcha.CaptchaReader import CaptchaReader from zipUtils import unzipAndRemove +from captcha.CaptchaShape import CaptchaShape #def getTextInCaptchaImage(captchaImageFile): @@ -22,7 +23,8 @@ def solveCaptchaAndStartFileDownload(driver, captchaImageFile): def _createCaptchaReader(): working_directory = os.path.dirname(__file__) - return CaptchaReader(modelFilepath = f'{working_directory}/captcha/MobileNetV3Small') + return CaptchaReader(modelFilepath = f'{working_directory}/captcha/MobileNetV3Small', + captchaShape = CaptchaShape()) def downloadFile(absoluteFile, driver, maxTries): def _downloadFile(): diff --git a/src/captcha/captcha.ipynb b/src/captcha.ipynb similarity index 85% rename from src/captcha/captcha.ipynb rename to src/captcha.ipynb index 48b04c13f33..c6ea0f2921a 100644 --- a/src/captcha/captcha.ipynb +++ b/src/captcha.ipynb @@ -25,22 +25,11 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "id": "zZSwQragIS_v" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-03-15 10:46:02.303787: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE4.1 SSE4.2 AVX AVX2 FMA\n", - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "/home/frankknoll/.local/lib/python3.9/site-packages/scipy/__init__.py:146: UserWarning: A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5\n", - " warnings.warn(f\"A NumPy version >={np_minversion} and <{np_maxversion}\"\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "import numpy as np\n", @@ -54,29 +43,29 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "id": "QB8QZJPg3MGI" }, "outputs": [], "source": [ - "from GoogleDriveManager import GoogleDriveManager" + "from captcha.GoogleDriveManager import GoogleDriveManager" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "id": "C3bxU1US2blM" }, "outputs": [], "source": [ - "from CaptchaGenerator import CaptchaGenerator" + "from captcha.CaptchaGenerator import CaptchaGenerator" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "id": "0DZfMrbe3MGN" }, @@ -91,18 +80,18 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "id": "sNJjugG83MGO" }, "outputs": [], "source": [ - "from CharNumConverter import CharNumConverter" + "from captcha.CharNumConverter import CharNumConverter" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "id": "qxs04OTR3MGP" }, @@ -137,18 +126,18 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "id": "dAAACymS3MGR" }, "outputs": [], "source": [ - "from DatasetFactory import DatasetFactory" + "from captcha.DatasetFactory import DatasetFactory" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "id": "kdL9_t03Mf3t" }, @@ -170,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "id": "FqVSEuZp3MGT" }, @@ -197,7 +186,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "id": "apkeCHhP3MGU" }, @@ -222,18 +211,18 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": { "id": "st13jAjL3MGV" }, "outputs": [], "source": [ - "from ModelFactory import ModelFactory" + "from captcha.ModelFactory import ModelFactory" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -244,29 +233,29 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": { "id": "B7GZlk2_3MGX" }, "outputs": [], "source": [ - "from PredictionsDecoder import PredictionsDecoder" + "from captcha.PredictionsDecoder import PredictionsDecoder" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": { "id": "8Oa7avYt3MGX" }, "outputs": [], "source": [ - "from ModelDAO import ModelDAO" + "from captcha.ModelDAO import ModelDAO" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": { "id": "S3X_SslH3MGY" }, @@ -293,7 +282,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": { "id": "NZrKXF6P3MGY" }, @@ -304,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": { "id": "7EsmTaF03MGZ" }, @@ -316,7 +305,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": { "id": "S_4hl4S4BmZK" }, @@ -329,7 +318,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": { "id": "WmUghcQaMf3y" }, @@ -340,28 +329,18 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": { "id": "cpxO7yGAMf3z" }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-03-15 10:41:54.085280: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE4.1 SSE4.2 AVX AVX2 FMA\n", - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2023-03-15 10:41:54.089954: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.\n" - ] - } - ], + "outputs": [], "source": [ "charNumConverter = CharNumConverter(CaptchaGenerator.characters)" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": { "id": "tVb5nDFTMf3z" }, @@ -372,24 +351,23 @@ }, { "cell_type": "code", - "execution_count": 22, - "metadata": { - "id": "t1wzlHQ-Mf3z" - }, + "execution_count": null, + "metadata": {}, "outputs": [], "source": [ - "(img_width, img_height) = (241, 62)" + "from captcha.CaptchaShape import CaptchaShape\n", + "captchaShape = CaptchaShape()" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": { "id": "s35OUslsMf30" }, "outputs": [], "source": [ - "datasetFactory = DatasetFactory(img_height, img_width, charNumConverter.char_to_num, batch_size = 64)" + "datasetFactory = DatasetFactory(captchaShape, charNumConverter.char_to_num, batch_size = 64)" ] }, { @@ -403,7 +381,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": { "id": "oRcemcbG3MGa" }, @@ -417,7 +395,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": { "id": "P7myCt7e2h6A" }, @@ -431,7 +409,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": { "id": "j9apYsyI3MGb" }, @@ -474,7 +452,7 @@ }, "outputs": [], "source": [ - "modelFactory = ModelFactory(img_height, img_width, charNumConverter.char_to_num)" + "modelFactory = ModelFactory(captchaShape, charNumConverter.char_to_num)" ] }, { @@ -590,7 +568,9 @@ }, "outputs": [], "source": [ - "model = modelDAO.loadModel(modelName)\n", + "# FK-TODO: DRY with VAERSFileDownloader\n", + "modelFilepath = f'{os.getcwd()}/captcha/{modelName}'\n", + "model = modelDAO.loadModel(modelFilepath)\n", "model.summary(show_trainable=True)" ] }, @@ -637,7 +617,7 @@ }, "outputs": [], "source": [ - "train_dataset, validation_dataset, test_dataset = getTrainValidationTestDatasets(Path(\"captchas/VAERS/\"), datasetFactory)" + "train_dataset, validation_dataset, test_dataset = getTrainValidationTestDatasets(Path(\"captcha/captchas/VAERS/\"), datasetFactory)" ] }, { diff --git a/src/captcha/CaptchaReader.py b/src/captcha/CaptchaReader.py index 418621cae56..df2b0c5f6e2 100644 --- a/src/captcha/CaptchaReader.py +++ b/src/captcha/CaptchaReader.py @@ -7,21 +7,19 @@ from captcha.DatasetFactory import DatasetFactory import numpy as np from tensorflow import keras -# FK-TODO: DRY with captcha.ipynb -img_width = 241 -img_height = 62 - class CaptchaReader: - def __init__(self, modelFilepath): + def __init__(self, modelFilepath, captchaShape): self.modelFilepath = modelFilepath + self.captchaShape = captchaShape def getTextInCaptchaImage(self, captchaImageFile): + # FK-TODO: refactor modelDAO = ModelDAO(inColab = False) model = modelDAO.loadModel(self.modelFilepath) prediction_model = ModelFactory.createPredictionModel(model) charNumConverter = CharNumConverter(CaptchaGenerator.characters) - datasetFactory = DatasetFactory(img_height, img_width, charNumConverter.char_to_num, batch_size = 64) + datasetFactory = DatasetFactory(self.captchaShape,charNumConverter.char_to_num, batch_size = 64) batchImages = self._asSingleSampleBatch(datasetFactory._encode_single_sample(captchaImageFile, 'dummy')['image']) preds = prediction_model.predict(batchImages) predictionsDecoder = PredictionsDecoder(CaptchaGenerator.captchaLength, charNumConverter.num_to_char) diff --git a/src/captcha/CaptchaReaderTest.py b/src/captcha/CaptchaReaderTest.py index 8a4050ca4e8..3a9112f5b47 100644 --- a/src/captcha/CaptchaReaderTest.py +++ b/src/captcha/CaptchaReaderTest.py @@ -1,5 +1,6 @@ import unittest from captcha.CaptchaReader import CaptchaReader +from captcha.CaptchaShape import CaptchaShape import os class CaptchaReaderTest(unittest.TestCase): @@ -10,7 +11,9 @@ class CaptchaReaderTest(unittest.TestCase): def test_getTextInCaptchaImage(self): # Given textInCaptchaImage = '1Ad47a' - captchaReader = CaptchaReader(modelFilepath = f'{self.working_directory}/MobileNetV3Small') + captchaReader = CaptchaReader( + modelFilepath = f'{self.working_directory}/MobileNetV3Small', + captchaShape = CaptchaShape()) # When textInCaptchaImageActual = captchaReader.getTextInCaptchaImage(f'{self.working_directory}/captchas/VAERS/{textInCaptchaImage}.jpeg') diff --git a/src/captcha/CaptchaShape.py b/src/captcha/CaptchaShape.py new file mode 100644 index 00000000000..ff46a10f315 --- /dev/null +++ b/src/captcha/CaptchaShape.py @@ -0,0 +1,5 @@ +class CaptchaShape: + + def __init__(self): + self.width = 241 + self.height = 62 diff --git a/src/captcha/DatasetFactory.py b/src/captcha/DatasetFactory.py index c09b4b53bd2..aac88cee4e9 100644 --- a/src/captcha/DatasetFactory.py +++ b/src/captcha/DatasetFactory.py @@ -3,9 +3,8 @@ import tensorflow as tf class DatasetFactory: - def __init__(self, img_height, img_width, char_to_num, batch_size): - self.img_height = img_height - self.img_width = img_width + def __init__(self, captchaShape, char_to_num, batch_size): + self.captchaShape = captchaShape self.char_to_num = char_to_num self.batch_size = batch_size @@ -18,7 +17,7 @@ class DatasetFactory: def _encode_single_sample(self, img_path, label): img = tf.io.read_file(img_path) img = tf.io.decode_jpeg(img, channels=3) - img = tf.image.resize(img, [self.img_height, self.img_width]) + img = tf.image.resize(img, [self.captchaShape.height, self.captchaShape.width]) # Map the characters in label to numbers label = self.char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8")) # Return a dict as our model is expecting two inputs diff --git a/src/captcha/ModelFactory.py b/src/captcha/ModelFactory.py index a829b5b2dff..07deafabb05 100644 --- a/src/captcha/ModelFactory.py +++ b/src/captcha/ModelFactory.py @@ -9,9 +9,8 @@ class ModelFactory: predictionModelInputLayerName = "image" predictionModelOutputLayerName = "dense2" - def __init__(self, img_height, img_width, char_to_num): - self.img_height = img_height - self.img_width = img_width + def __init__(self, captchaShape, char_to_num): + self.captchaShape = captchaShape self.char_to_num = char_to_num # see https://www.tensorflow.org/api_docs/python/tf/keras/applications/resnet/ResNet101 @@ -52,9 +51,9 @@ class ModelFactory: def _createModel(self, baseModelFactory, preprocess_input, name): # Inputs to the model input_image = layers.Input( - shape=(self.img_height, self.img_width, 3), - name=ModelFactory.predictionModelInputLayerName, - dtype="float32") + shape = (self.captchaShape.height, self.captchaShape.width, 3), + name = ModelFactory.predictionModelInputLayerName, + dtype = "float32") labels = layers.Input(name="label", shape=(None,), dtype="float32") image = preprocess_input(input_image)