From 9926901f78c95323649eaa1d2997d9e6a0cce136 Mon Sep 17 00:00:00 2001 From: frankknoll Date: Thu, 12 May 2022 17:39:12 +0200 Subject: [PATCH] starting to download VAERS data --- .gitignore | 1 + src/HowBadIsMyBatch.ipynb | 110 +++++++++++++++++++++++++++++++++++--- 2 files changed, 103 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index e8931f7ff21..cdbceccc8b6 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ src/results/ docs/data/*.xlsx docs/data/*.html .env +src/captchaImage.jpeg diff --git a/src/HowBadIsMyBatch.ipynb b/src/HowBadIsMyBatch.ipynb index 7e8403701c4..23bf0e878ae 100644 --- a/src/HowBadIsMyBatch.ipynb +++ b/src/HowBadIsMyBatch.ipynb @@ -80,10 +80,37 @@ }, { "cell_type": "markdown", - "id": "f3f94f5a", + "id": "9514f5be", "metadata": {}, "source": [ - "## Solve CAPTCHA" + "## Download VAERS-Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "777ff543", + "metadata": {}, + "outputs": [], + "source": [ + "from selenium import webdriver\n", + "from webdriver_manager.chrome import ChromeDriverManager\n", + "from selenium.webdriver.chrome.service import Service as ChromeService\n", + "from selenium.webdriver.chrome.options import Options\n", + "from selenium.webdriver.common.by import By\n", + "\n", + "options = Options()\n", + "downloadDir = \"/home/frankknoll/Dokumente/Corona/projects/HowBadIsMyBatch/src/VAERS/tmp\"\n", + "prefs = {\"download.default_directory\" : downloadDir}\n", + "options.add_experimental_option(\"prefs\", prefs)\n", + "service = ChromeService(executable_path = ChromeDriverManager().install())\n", + "driver = webdriver.Chrome(service = service, options = options)\n", + "file2Download = '2022VAERSData.zip'\n", + "driver.get('https://vaers.hhs.gov/eSubDownload/index.jsp?fn=' + file2Download)\n", + "captchaImage = driver.find_element(By.CSS_SELECTOR, \"img[src='captchaImage']\")\n", + "captchaImageFile = 'captchaImage.jpeg'\n", + "with open(captchaImageFile, 'wb') as file:\n", + " file.write(captchaImage.screenshot_as_png)" ] }, { @@ -135,10 +162,7 @@ "metadata": {}, "outputs": [], "source": [ - "textByImage = {}\n", - "for i in range(1, 2):\n", - " captchaImage = \"captchaImage\" + str(i) + \".jpeg\"\n", - " textByImage[captchaImage] = detectTextInImage(\"/home/frankknoll/Dokumente/Corona/CAPTCHA/images/VAERS/\" + captchaImage)" + "textInImage = detectTextInImage(captchaImageFile)" ] }, { @@ -154,11 +178,81 @@ { "cell_type": "code", "execution_count": null, - "id": "6ba0ad8e", + "id": "2dfc95c3", "metadata": {}, "outputs": [], "source": [ - "textByImage" + "textInImage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0bf84d1c", + "metadata": {}, + "outputs": [], + "source": [ + "driver.find_element(By.ID, \"verificationCode\").send_keys(textInImage)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6cf55175", + "metadata": {}, + "outputs": [], + "source": [ + "driver.find_element(By.CSS_SELECTOR, '[name=\"downloadbut\"]').click()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "86e5056f", + "metadata": {}, + "outputs": [], + "source": [ + "def isCaptchaSolved():\n", + " return len(driver.find_elements(By.ID, \"wordverify\")) == 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6b47c06", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import os.path\n", + "\n", + "def waitUntilDownloadHasFinished():\n", + " while not os.path.exists(downloadDir + \"/\" + file2Download):\n", + " time.sleep(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1ebc05c", + "metadata": {}, + "outputs": [], + "source": [ + "if isCaptchaSolved():\n", + " waitUntilDownloadHasFinished()\n", + " display('file downloaded')\n", + "else:\n", + " display('try again')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "380bcf32", + "metadata": {}, + "outputs": [], + "source": [ + "driver.quit()" ] }, {