refactoring
This commit is contained in:
73
src/CaptchaReader.py
Normal file
73
src/CaptchaReader.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from tensorflow import keras
|
||||
from tensorflow.keras import layers
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
import io
|
||||
|
||||
# Character vocabulary of the captcha OCR model, copied from the value of the
# `characters` variable in captcha_ocr.ipynb or captcha_ocr_trainAndSaveModel.ipynb.
characters = list("0123456789ABCDEFabcdef")

# Target size used when resizing a captcha image in encode_single_sample.
img_width, img_height = 241, 62

downsample_factor = 4

# Copied from the value of the `max_length` variable in captcha_ocr.ipynb or
# captcha_ocr_trainAndSaveModel.ipynb; decoded predictions are cut to this length.
max_length = 6
|
||||
|
||||
# Lookup layer built from the `characters` vocabulary (character -> integer index).
char_to_num = layers.StringLookup(vocabulary=list(characters), mask_token=None)

# Inverse lookup (integer index -> character) sharing char_to_num's vocabulary.
num_to_char = layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    mask_token=None,
    invert=True,
)
|
||||
|
||||
def encode_single_sample(img_path):
    """Read a PNG image from disk and prepare it as a one-sample model input.

    The image is decoded as single-channel (grayscale), converted to float32,
    resized to ``img_height`` x ``img_width`` and transposed so that the image
    width becomes the first (time) axis.

    Args:
        img_path: Path of the PNG file to read.

    Returns:
        The array produced by ``asSingleSampleBatch`` for the preprocessed
        image, i.e. the image with an extra leading batch axis of size 1.
    """
    raw_bytes = tf.io.read_file(img_path)
    grayscale = tf.io.decode_png(raw_bytes, channels=1)
    as_float = tf.image.convert_image_dtype(grayscale, tf.float32)
    resized = tf.image.resize(as_float, [img_height, img_width])
    # Swap height and width: the time dimension should correspond to the
    # width of the image.
    width_first = tf.transpose(resized, perm=[1, 0, 2])
    return asSingleSampleBatch(width_first)
|
||||
|
||||
def asSingleSampleBatch(img):
    """Wrap a single image into a batch containing only that image.

    Args:
        img: Image accepted by ``keras.utils.img_to_array``.

    Returns:
        The image as an array with a new leading axis of length 1.
    """
    return np.expand_dims(keras.utils.img_to_array(img), axis=0)
|
||||
|
||||
def decode_batch_predictions(pred):
    """Decode a batch of CTC model outputs into text.

    Args:
        pred: Model output whose first axis is the batch and whose second
            axis is the time dimension (both are read via ``pred.shape``).

    Returns:
        A list with one decoded string per sample, each built from at most
        ``max_length`` decoded labels.
    """
    # Every sample uses the full time dimension of the prediction.
    input_len = np.ones(pred.shape[0]) * pred.shape[1]
    # Use greedy search. For complex tasks, you can use beam search.
    decoded = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0]
    results = decoded[:, :max_length]

    output_text = []
    for res in results:
        # ctc_decode pads sequences shorter than the longest one with -1.
        # Drop that padding so it is not rendered as the lookup layer's
        # out-of-vocabulary token inside the returned text.
        labels = tf.boolean_mask(res, res >= 0)
        text = tf.strings.reduce_join(num_to_char(labels)).numpy().decode("utf-8")
        output_text.append(text)
    return output_text
|
||||
|
||||
def load_model():
    """Load the saved model from the path 'model' and build the prediction model.

    Returns:
        A Keras model going from the input of the layer named "image" to the
        output of the layer named "dense2" of the loaded model.
    """
    trained = keras.models.load_model('model')
    image_input = trained.get_layer(name="image").input
    dense_output = trained.get_layer(name="dense2").output
    return keras.models.Model(image_input, dense_output)
|
||||
|
||||
def getTextInCaptchaImage(captchaImageFile):
    """Return the text the module-level ``model`` predicts for a captcha image.

    Args:
        captchaImageFile: Path of the captcha image file (read as PNG by
            ``encode_single_sample``).

    Returns:
        The decoded text of the single image in the batch.
    """
    predictions = model.predict(encode_single_sample(captchaImageFile))
    decoded_texts = decode_batch_predictions(predictions)
    return decoded_texts[0]
|
||||
|
||||
# Load the prediction model once, as a side effect of importing this module;
# getTextInCaptchaImage reads the resulting module-level `model`.
print("loading model...")
|
||||
model = load_model()
|
||||
# Print an overview of the loaded model.
model.summary()
|
||||
@@ -120,189 +120,6 @@
|
||||
" ! adb emu kill"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "777ff543",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from selenium import webdriver\n",
|
||||
"from webdriver_manager.chrome import ChromeDriverManager\n",
|
||||
"from selenium.webdriver.chrome.service import Service as ChromeService\n",
|
||||
"from selenium.webdriver.chrome.options import Options\n",
|
||||
"from selenium.webdriver.common.by import By\n",
|
||||
"\n",
|
||||
"def _getOptions(downloadDir, isHeadless):\n",
|
||||
" options = Options()\n",
|
||||
" options.headless = isHeadless\n",
|
||||
" options.add_experimental_option(\"prefs\", {\"download.default_directory\" : downloadDir})\n",
|
||||
" return options\n",
|
||||
"\n",
|
||||
"def getWebDriver(downloadDir, isHeadless):\n",
|
||||
" return webdriver.Chrome(\n",
|
||||
" service = ChromeService(executable_path = ChromeDriverManager().install()),\n",
|
||||
" options = _getOptions(downloadDir, isHeadless))\n",
|
||||
"\n",
|
||||
"def saveCaptchaImageAs(driver, captchaImageFile):\n",
|
||||
" captchaImage = driver.find_element(By.CSS_SELECTOR, \"img[src='captchaImage']\")\n",
|
||||
" with open(captchaImageFile, 'wb') as file:\n",
|
||||
" file.write(captchaImage.screenshot_as_png)\n",
|
||||
"\n",
|
||||
"def existsElementWithId(driver, id):\n",
|
||||
" return len(driver.find_elements(By.ID, id)) > 0\n",
|
||||
"\n",
|
||||
"def isCaptchaSolved(driver):\n",
|
||||
" return not existsElementWithId(driver, \"wordverify\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "da7c965a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import time\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"def waitUntilDownloadHasFinished(file):\n",
|
||||
" while not os.path.exists(file):\n",
|
||||
" time.sleep(2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d9b72506",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import tensorflow as tf\n",
|
||||
"from tensorflow import keras\n",
|
||||
"from tensorflow.keras import layers\n",
|
||||
"from PIL import Image\n",
|
||||
"import numpy as np\n",
|
||||
"import io\n",
|
||||
"\n",
|
||||
"# copied from value of characters variable in captcha_ocr.ipynb or captcha_ocr_trainAndSaveModel.ipynb\n",
|
||||
"characters = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'a', 'b', 'c', 'd', 'e', 'f']\n",
|
||||
"\n",
|
||||
"img_width = 241\n",
|
||||
"img_height = 62\n",
|
||||
"\n",
|
||||
"downsample_factor = 4\n",
|
||||
"\n",
|
||||
"# copied from value of max_length variable in captcha_ocr.ipynb or captcha_ocr_trainAndSaveModel.ipynb\n",
|
||||
"max_length = 6\n",
|
||||
"\n",
|
||||
"char_to_num = layers.StringLookup(\n",
|
||||
" vocabulary=list(characters),\n",
|
||||
" mask_token=None)\n",
|
||||
"\n",
|
||||
"num_to_char = layers.StringLookup(\n",
|
||||
" vocabulary=char_to_num.get_vocabulary(),\n",
|
||||
" mask_token=None, invert=True)\n",
|
||||
"\n",
|
||||
"def encode_single_sample(img_path):\n",
|
||||
" # 1. Read image\n",
|
||||
" img = tf.io.read_file(img_path)\n",
|
||||
" # 2. Decode and convert to grayscale\n",
|
||||
" img = tf.io.decode_png(img, channels=1)\n",
|
||||
" # 3. Convert to float32 in [0, 1] range\n",
|
||||
" img = tf.image.convert_image_dtype(img, tf.float32)\n",
|
||||
" # 4. Resize to the desired size\n",
|
||||
" img = tf.image.resize(img, [img_height, img_width])\n",
|
||||
" # 5. Transpose the image because we want the time\n",
|
||||
" # dimension to correspond to the width of the image.\n",
|
||||
" img = tf.transpose(img, perm=[1, 0, 2])\n",
|
||||
" # 7. Return a dict as our model is expecting two inputs\n",
|
||||
" return asSingleSampleBatch(img)\n",
|
||||
"\n",
|
||||
"def asSingleSampleBatch(img):\n",
|
||||
" array = keras.utils.img_to_array(img)\n",
|
||||
" array = np.expand_dims(array, axis=0)\n",
|
||||
" return array\n",
|
||||
"\n",
|
||||
"def decode_batch_predictions(pred):\n",
|
||||
" input_len = np.ones(pred.shape[0]) * pred.shape[1]\n",
|
||||
" # Use greedy search. For complex tasks, you can use beam search\n",
|
||||
" results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0][:, :max_length]\n",
|
||||
" # Iterate over the results and get back the text\n",
|
||||
" output_text = []\n",
|
||||
" for res in results:\n",
|
||||
" res = tf.strings.reduce_join(num_to_char(res)).numpy().decode(\"utf-8\")\n",
|
||||
" output_text.append(res)\n",
|
||||
" return output_text\n",
|
||||
"\n",
|
||||
"def load_model():\n",
|
||||
" _model = keras.models.load_model('model')\n",
|
||||
" model = keras.models.Model(\n",
|
||||
" _model.get_layer(name=\"image\").input,\n",
|
||||
" _model.get_layer(name=\"dense2\").output)\n",
|
||||
" return model\n",
|
||||
"\n",
|
||||
"def getTextInCaptchaImage(captchaImageFile):\n",
|
||||
" batchImages = encode_single_sample(captchaImageFile)\n",
|
||||
" preds = model.predict(batchImages)\n",
|
||||
" return decode_batch_predictions(preds)[0]\n",
|
||||
"\n",
|
||||
"print(\"loading model...\")\n",
|
||||
"model = load_model()\n",
|
||||
"model.summary()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "918d088d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import os\n",
|
||||
"\n",
|
||||
"#def getTextInCaptchaImage(captchaImageFile):\n",
|
||||
"# baseDir = \"~/AndroidStudioProjects/TextRecognizer\"\n",
|
||||
"# ! cp $captchaImageFile $baseDir/app/src/main/assets/captchas/captcha_image.jpeg\n",
|
||||
"# ! cd $baseDir;./gradlew connectedAndroidTest\n",
|
||||
"# textInCaptchaImage = ! adb shell \"run-as org.textrecognizer cat /data/data/org.textrecognizer/files/captcha_image.txt\"\n",
|
||||
"# return textInCaptchaImage[0]\n",
|
||||
" \n",
|
||||
"def solveCaptchaAndStartFileDownload(driver, captchaImageFile):\n",
|
||||
" saveCaptchaImageAs(driver, captchaImageFile)\n",
|
||||
" textInCaptchaImage = getTextInCaptchaImage(captchaImageFile)\n",
|
||||
" display('textInCaptchaImage: ', textInCaptchaImage)\n",
|
||||
" driver.find_element(By.ID, \"verificationCode\").send_keys(textInCaptchaImage)\n",
|
||||
" driver.find_element(By.CSS_SELECTOR, '[name=\"downloadbut\"]').click()\n",
|
||||
"\n",
|
||||
"def downloadFile(absoluteFile, driver, maxTries):\n",
|
||||
" def _downloadFile():\n",
|
||||
" driver.get('https://vaers.hhs.gov/eSubDownload/index.jsp?fn=' + os.path.basename(absoluteFile))\n",
|
||||
" solveCaptchaAndStartFileDownload(driver, 'captchaImage.jpeg')\n",
|
||||
"\n",
|
||||
" numTries = 1\n",
|
||||
" _downloadFile()\n",
|
||||
" while(not isCaptchaSolved(driver) and (maxTries is None or numTries < maxTries)):\n",
|
||||
" _downloadFile()\n",
|
||||
" numTries = numTries + 1\n",
|
||||
"\n",
|
||||
" if isCaptchaSolved(driver):\n",
|
||||
" waitUntilDownloadHasFinished(absoluteFile)\n",
|
||||
" return absoluteFile\n",
|
||||
" else:\n",
|
||||
" return None\n",
|
||||
"\n",
|
||||
"def downloadVAERSFile(file, downloadDir):\n",
|
||||
" driver = getWebDriver(downloadDir, isHeadless = True)\n",
|
||||
" downloadedFile = downloadFile(\n",
|
||||
" absoluteFile = downloadDir + \"/\" + file,\n",
|
||||
" driver = driver,\n",
|
||||
" maxTries = None)\n",
|
||||
" driver.quit()\n",
|
||||
" return downloadedFile"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@@ -310,16 +127,8 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import zipfile\n",
|
||||
"import os\n",
|
||||
"\n",
|
||||
"def unzip(zipFile, dstDir):\n",
|
||||
" with zipfile.ZipFile(zipFile, 'r') as zip_ref:\n",
|
||||
" zip_ref.extractall(dstDir)\n",
|
||||
"\n",
|
||||
"def unzipAndRemove(zipFile, dstDir):\n",
|
||||
" unzip(zipFile, dstDir)\n",
|
||||
" os.remove(zipFile)\n",
|
||||
"from VAERSFileDownloader import downloadVAERSFile\n",
|
||||
"from zipUtils import unzipAndRemove\n",
|
||||
"\n",
|
||||
"def downloadVAERSFileAndUnzip(file):\n",
|
||||
" downloadedFile = downloadVAERSFile(file, getWorkingDirectory() + \"/VAERS/tmp\")\n",
|
||||
@@ -661,7 +470,7 @@
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
|
||||
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
49
src/VAERSFileDownloader.py
Normal file
49
src/VAERSFileDownloader.py
Normal file
@@ -0,0 +1,49 @@
|
||||
import os
|
||||
import time
|
||||
from WebDriver import getWebDriver, isCaptchaSolved, saveCaptchaImageAs
|
||||
from selenium.webdriver.common.by import By
|
||||
from CaptchaReader import getTextInCaptchaImage
|
||||
|
||||
#def getTextInCaptchaImage(captchaImageFile):
|
||||
# baseDir = "~/AndroidStudioProjects/TextRecognizer"
|
||||
# ! cp $captchaImageFile $baseDir/app/src/main/assets/captchas/captcha_image.jpeg
|
||||
# ! cd $baseDir;./gradlew connectedAndroidTest
|
||||
# textInCaptchaImage = ! adb shell "run-as org.textrecognizer cat /data/data/org.textrecognizer/files/captcha_image.txt"
|
||||
# return textInCaptchaImage[0]
|
||||
|
||||
def solveCaptchaAndStartFileDownload(driver, captchaImageFile):
    """Solve the captcha on the current page and submit the download form.

    Saves the captcha image to ``captchaImageFile``, reads its text with
    ``getTextInCaptchaImage``, types that text into the element with id
    "verificationCode" and clicks the element named "downloadbut".

    Args:
        driver: Selenium web driver showing the page with the captcha.
        captchaImageFile: Path the captcha image is written to.
    """
    saveCaptchaImageAs(driver, captchaImageFile)
    textInCaptchaImage = getTextInCaptchaImage(captchaImageFile)
    # `display` only exists when IPython injects it; `print` also works when
    # this module is used from a plain Python process.
    print('textInCaptchaImage:', textInCaptchaImage)
    driver.find_element(By.ID, "verificationCode").send_keys(textInCaptchaImage)
    driver.find_element(By.CSS_SELECTOR, '[name="downloadbut"]').click()
|
||||
|
||||
def downloadFile(absoluteFile, driver, maxTries):
    """Request a file download, retrying until the captcha has been solved.

    Args:
        absoluteFile: Path the downloaded file is expected to appear at; its
            base name is passed to the download page as the ``fn`` parameter.
        driver: Selenium web driver used for the download.
        maxTries: Maximum number of attempts, or None for no limit. At least
            one attempt is always made.

    Returns:
        ``absoluteFile`` once the captcha was solved and the file exists,
        otherwise None.
    """
    url = 'https://vaers.hhs.gov/eSubDownload/index.jsp?fn=' + os.path.basename(absoluteFile)

    def attempt():
        driver.get(url)
        solveCaptchaAndStartFileDownload(driver, 'captchaImage.jpeg')

    attempt()
    attemptsMade = 1
    while not isCaptchaSolved(driver):
        if maxTries is not None and attemptsMade >= maxTries:
            break
        attempt()
        attemptsMade += 1

    if not isCaptchaSolved(driver):
        return None
    _waitUntilDownloadHasFinished(absoluteFile)
    return absoluteFile
|
||||
|
||||
def _waitUntilDownloadHasFinished(file, timeout=None, pollInterval=2):
    """Block until ``file`` exists on disk.

    Args:
        file: Path of the file to wait for.
        timeout: Maximum number of seconds to wait, or None (the default) to
            wait indefinitely.
        pollInterval: Seconds to sleep between two existence checks.

    Raises:
        TimeoutError: If ``timeout`` is given and ``file`` still does not
            exist once it has elapsed.
    """
    deadline = None if timeout is None else time.monotonic() + timeout
    while not os.path.exists(file):
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError('download did not finish in time: ' + str(file))
        time.sleep(pollInterval)
|
||||
|
||||
def downloadVAERSFile(file, downloadDir):
    """Download a VAERS file into ``downloadDir`` using a headless browser.

    Args:
        file: Name of the file to download.
        downloadDir: Directory the browser saves the download to.

    Returns:
        The result of ``downloadFile`` for ``downloadDir + "/" + file``.
    """
    driver = getWebDriver(downloadDir, isHeadless = True)
    try:
        return downloadFile(
            absoluteFile = downloadDir + "/" + file,
            driver = driver,
            maxTries = None)
    finally:
        # Always shut the browser down, even when the download raises.
        driver.quit()
|
||||
27
src/WebDriver.py
Normal file
27
src/WebDriver.py
Normal file
@@ -0,0 +1,27 @@
|
||||
from selenium import webdriver
|
||||
from webdriver_manager.chrome import ChromeDriverManager
|
||||
from selenium.webdriver.chrome.service import Service as ChromeService
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.common.by import By
|
||||
|
||||
def _getOptions(downloadDir, isHeadless):
    """Build the Chrome options for a driver that downloads into ``downloadDir``.

    Args:
        downloadDir: Value for Chrome's "download.default_directory" preference.
        isHeadless: Whether Chrome should be started with ``--headless``.

    Returns:
        The configured ``Options`` instance.
    """
    options = Options()
    if isHeadless:
        # The `Options.headless` property was deprecated and later removed in
        # Selenium 4; passing the command-line flag works across versions.
        options.add_argument("--headless")
    options.add_experimental_option("prefs", {"download.default_directory" : downloadDir})
    return options
|
||||
|
||||
def getWebDriver(downloadDir, isHeadless):
    """Create a Chrome web driver.

    Args:
        downloadDir: Directory passed on to ``_getOptions``.
        isHeadless: Headless flag passed on to ``_getOptions``.

    Returns:
        A ``webdriver.Chrome`` instance whose service uses the executable
        path returned by ``ChromeDriverManager().install()``.
    """
    driverExecutable = ChromeDriverManager().install()
    chromeService = ChromeService(executable_path = driverExecutable)
    chromeOptions = _getOptions(downloadDir, isHeadless)
    return webdriver.Chrome(service = chromeService, options = chromeOptions)
|
||||
|
||||
def saveCaptchaImageAs(driver, captchaImageFile):
    """Write a PNG screenshot of the page's captcha image to a file.

    Args:
        driver: Selenium web driver showing the page; the captcha is the
            element matching ``img[src='captchaImage']``.
        captchaImageFile: Path of the file to write the PNG bytes to.
    """
    captchaElement = driver.find_element(By.CSS_SELECTOR, "img[src='captchaImage']")
    with open(captchaImageFile, 'wb') as pngFile:
        pngBytes = captchaElement.screenshot_as_png
        pngFile.write(pngBytes)
|
||||
|
||||
def existsElementWithId(driver, id):
    """Return True if the current page has at least one element with this id."""
    matches = driver.find_elements(By.ID, id)
    return bool(matches)
|
||||
|
||||
def isCaptchaSolved(driver):
    """Return True when no element with id "wordverify" is on the current page."""
    captchaStillShown = existsElementWithId(driver, "wordverify")
    return not captchaStillShown
|
||||
12
src/zipUtils.py
Normal file
12
src/zipUtils.py
Normal file
@@ -0,0 +1,12 @@
|
||||
import zipfile
|
||||
import os
|
||||
|
||||
|
||||
def unzip(zipFile, dstDir):
    """Extract all members of the archive ``zipFile`` into ``dstDir``."""
    with zipfile.ZipFile(zipFile) as archive:
        archive.extractall(path=dstDir)
|
||||
|
||||
|
||||
def unzipAndRemove(zipFile, dstDir):
    """Extract ``zipFile`` into ``dstDir`` and then delete the archive.

    The archive is only removed after extraction completed without raising.
    """
    unzip(zipFile, dstDir)
    os.remove(zipFile)
|
||||
Reference in New Issue
Block a user