starting to download VAERS data

This commit is contained in:
frankknoll
2022-05-12 17:39:12 +02:00
parent 04073c7a38
commit 9926901f78
2 changed files with 103 additions and 8 deletions

1
.gitignore vendored
View File

@@ -10,3 +10,4 @@ src/results/
docs/data/*.xlsx docs/data/*.xlsx
docs/data/*.html docs/data/*.html
.env .env
src/captchaImage.jpeg

View File

@@ -80,10 +80,37 @@
}, },
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "f3f94f5a", "id": "9514f5be",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Solve CAPTCHA" "## Download VAERS-Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "777ff543",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"from selenium import webdriver\n",
"from selenium.webdriver.chrome.options import Options\n",
"from selenium.webdriver.chrome.service import Service as ChromeService\n",
"from selenium.webdriver.common.by import By\n",
"from webdriver_manager.chrome import ChromeDriverManager\n",
"\n",
"# Chrome is given an absolute download directory. It is resolved against the current\n",
"# working directory instead of hard-coding one user's home directory, so the notebook\n",
"# is not tied to a single machine.\n",
"# NOTE(review): assumes the kernel's working directory is the project's src folder - confirm.\n",
"downloadDir = os.path.abspath(os.path.join(\"VAERS\", \"tmp\"))\n",
"file2Download = '2022VAERSData.zip'\n",
"captchaImageFile = 'captchaImage.jpeg'\n",
"\n",
"# Start a Chrome session whose default download directory is downloadDir.\n",
"options = Options()\n",
"prefs = {\"download.default_directory\" : downloadDir}\n",
"options.add_experimental_option(\"prefs\", prefs)\n",
"service = ChromeService(executable_path = ChromeDriverManager().install())\n",
"driver = webdriver.Chrome(service = service, options = options)\n",
"\n",
"# Open the download page for the requested file and locate its CAPTCHA image.\n",
"driver.get('https://vaers.hhs.gov/eSubDownload/index.jsp?fn=' + file2Download)\n",
"captchaImage = driver.find_element(By.CSS_SELECTOR, \"img[src='captchaImage']\")\n",
"\n",
"# Save a screenshot of the CAPTCHA element so that it can be read from disk later.\n",
"# NOTE: screenshot_as_png yields PNG-encoded bytes although the file name ends in .jpeg.\n",
"with open(captchaImageFile, 'wb') as file:\n",
"    file.write(captchaImage.screenshot_as_png)"
] ]
}, },
{ {
@@ -135,10 +162,7 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"textByImage = {}\n", "textInImage = detectTextInImage(captchaImageFile)"
"for i in range(1, 2):\n",
" captchaImage = \"captchaImage\" + str(i) + \".jpeg\"\n",
" textByImage[captchaImage] = detectTextInImage(\"/home/frankknoll/Dokumente/Corona/CAPTCHA/images/VAERS/\" + captchaImage)"
] ]
}, },
{ {
@@ -154,11 +178,81 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"id": "6ba0ad8e", "id": "2dfc95c3",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"textByImage" "textInImage"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0bf84d1c",
"metadata": {},
"outputs": [],
"source": [
"driver.find_element(By.ID, \"verificationCode\").send_keys(textInImage)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6cf55175",
"metadata": {},
"outputs": [],
"source": [
"driver.find_element(By.CSS_SELECTOR, '[name=\"downloadbut\"]').click()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "86e5056f",
"metadata": {},
"outputs": [],
"source": [
"def isCaptchaSolved():\n",
"    \"\"\"Tell whether the current page contains no element with the id 'wordverify'.\"\"\"\n",
"    wordverifyElements = driver.find_elements(By.ID, \"wordverify\")\n",
"    # find_elements yields an empty list when nothing matches.\n",
"    return not wordverifyElements"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b6b47c06",
"metadata": {},
"outputs": [],
"source": [
"import os.path\n",
"import time\n",
"\n",
"def waitUntilDownloadHasFinished(timeoutInSeconds = None, pollIntervalInSeconds = 2):\n",
"    \"\"\"Block until file2Download exists in downloadDir.\n",
"\n",
"    timeoutInSeconds: maximum time to wait; None (the default) waits indefinitely.\n",
"    pollIntervalInSeconds: pause between two successive checks for the file.\n",
"    Raises TimeoutError if the file has not appeared within timeoutInSeconds.\n",
"    \"\"\"\n",
"    downloadedFile = os.path.join(downloadDir, file2Download)\n",
"    # A monotonic clock keeps the deadline unaffected by system clock adjustments.\n",
"    deadline = None if timeoutInSeconds is None else time.monotonic() + timeoutInSeconds\n",
"    while not os.path.exists(downloadedFile):\n",
"        if deadline is not None and time.monotonic() >= deadline:\n",
"            raise TimeoutError('download of ' + file2Download + ' did not finish within ' + str(timeoutInSeconds) + ' seconds')\n",
"        time.sleep(pollIntervalInSeconds)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b1ebc05c",
"metadata": {},
"outputs": [],
"source": [
"# Report the outcome: either the CAPTCHA check failed, or wait for the file and confirm.\n",
"if not isCaptchaSolved():\n",
"    display('try again')\n",
"else:\n",
"    waitUntilDownloadHasFinished()\n",
"    display('file downloaded')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "380bcf32",
"metadata": {},
"outputs": [],
"source": [
"driver.quit()"
] ]
}, },
{ {