starting to download VAERS data
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -10,3 +10,4 @@ src/results/
|
|||||||
docs/data/*.xlsx
|
docs/data/*.xlsx
|
||||||
docs/data/*.html
|
docs/data/*.html
|
||||||
.env
|
.env
|
||||||
|
src/captchaImage.jpeg
|
||||||
|
|||||||
@@ -80,10 +80,37 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "f3f94f5a",
|
"id": "9514f5be",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"## Solve CAPTCHA"
|
"## Download VAERS-Data"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "777ff543",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from selenium import webdriver\n",
|
||||||
|
"from webdriver_manager.chrome import ChromeDriverManager\n",
|
||||||
|
"from selenium.webdriver.chrome.service import Service as ChromeService\n",
|
||||||
|
"from selenium.webdriver.chrome.options import Options\n",
|
||||||
|
"from selenium.webdriver.common.by import By\n",
|
||||||
|
"\n",
|
||||||
|
"options = Options()\n",
|
||||||
|
"downloadDir = \"/home/frankknoll/Dokumente/Corona/projects/HowBadIsMyBatch/src/VAERS/tmp\"\n",
|
||||||
|
"prefs = {\"download.default_directory\" : downloadDir}\n",
|
||||||
|
"options.add_experimental_option(\"prefs\", prefs)\n",
|
||||||
|
"service = ChromeService(executable_path = ChromeDriverManager().install())\n",
|
||||||
|
"driver = webdriver.Chrome(service = service, options = options)\n",
|
||||||
|
"file2Download = '2022VAERSData.zip'\n",
|
||||||
|
"driver.get('https://vaers.hhs.gov/eSubDownload/index.jsp?fn=' + file2Download)\n",
|
||||||
|
"captchaImage = driver.find_element(By.CSS_SELECTOR, \"img[src='captchaImage']\")\n",
|
||||||
|
"captchaImageFile = 'captchaImage.jpeg'\n",
|
||||||
|
"with open(captchaImageFile, 'wb') as file:\n",
|
||||||
|
" file.write(captchaImage.screenshot_as_png)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -135,10 +162,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"textByImage = {}\n",
|
"textInImage = detectTextInImage(captchaImageFile)"
|
||||||
"for i in range(1, 2):\n",
|
|
||||||
" captchaImage = \"captchaImage\" + str(i) + \".jpeg\"\n",
|
|
||||||
" textByImage[captchaImage] = detectTextInImage(\"/home/frankknoll/Dokumente/Corona/CAPTCHA/images/VAERS/\" + captchaImage)"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -154,11 +178,81 @@
|
|||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "6ba0ad8e",
|
"id": "2dfc95c3",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"textByImage"
|
"textInImage"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "0bf84d1c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"driver.find_element(By.ID, \"verificationCode\").send_keys(textInImage)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "6cf55175",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"driver.find_element(By.CSS_SELECTOR, '[name=\"downloadbut\"]').click()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "86e5056f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def isCaptchaSolved():\n",
|
||||||
|
" return len(driver.find_elements(By.ID, \"wordverify\")) == 0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "b6b47c06",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import time\n",
|
||||||
|
"import os.path\n",
|
||||||
|
"\n",
|
||||||
|
"def waitUntilDownloadHasFinished():\n",
|
||||||
|
" while not os.path.exists(downloadDir + \"/\" + file2Download):\n",
|
||||||
|
" time.sleep(2)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "b1ebc05c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"if isCaptchaSolved():\n",
|
||||||
|
" waitUntilDownloadHasFinished()\n",
|
||||||
|
" display('file downloaded')\n",
|
||||||
|
"else:\n",
|
||||||
|
" display('try again')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "380bcf32",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"driver.quit()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user