downloading VAERSBeforeDeletion.7z
This commit is contained in:
7
.github/workflows/buildAndDeployWebsite.yml
vendored
7
.github/workflows/buildAndDeployWebsite.yml
vendored
@@ -25,19 +25,20 @@ jobs:
|
||||
- name: Installing Google Chrome
|
||||
shell: bash -el {0}
|
||||
run: |
|
||||
conda activate howbadismybatch-venv
|
||||
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
|
||||
sudo dpkg -i google-chrome-stable_current_amd64.deb
|
||||
sudo apt install google-chrome-stable
|
||||
pip install selenium webdriver-manager
|
||||
pip install pycountry
|
||||
pip install python-decouple
|
||||
pip install --upgrade selenium webdriver-manager
|
||||
- name: Installing ipython kernel
|
||||
shell: bash -el {0}
|
||||
run: |
|
||||
conda activate howbadismybatch-venv
|
||||
ipython kernel install --user --name=howbadismybatch-venv-kernel
|
||||
- name: Executing HowBadIsMyBatch.ipynb
|
||||
shell: bash -el {0}
|
||||
run: |
|
||||
conda activate howbadismybatch-venv
|
||||
cd src
|
||||
jupyter nbconvert --to script HowBadIsMyBatch.ipynb
|
||||
python ./HowBadIsMyBatch.py
|
||||
|
||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -17,3 +17,4 @@ google-chrome-stable_current_amd64*
|
||||
src/captcha/__pycache__
|
||||
src/GoogleAnalytics/__pycache__
|
||||
src/SymptomsCausedByVaccines/__pycache__
|
||||
src/HowBadIsMyBatch.py
|
||||
|
||||
@@ -9,6 +9,8 @@ dependencies:
|
||||
- pandas=1.4.0
|
||||
- urllib3
|
||||
- requests
|
||||
- gdown
|
||||
- py7zr
|
||||
- bs4
|
||||
- lxml
|
||||
- jupyter
|
||||
|
||||
17
src/GoogleDriveDownloader.py
Normal file
17
src/GoogleDriveDownloader.py
Normal file
@@ -0,0 +1,17 @@
|
||||
import gdown
|
||||
import py7zr
|
||||
import os
|
||||
|
||||
|
||||
class GoogleDriveDownloader:
    """Download files from Google Drive, skipping downloads whose target already exists."""

    @staticmethod
    def downloadIfNotYetDownloaded(remoteSrcFile, localDstFile):
        """Download ``remoteSrcFile`` to ``localDstFile`` unless that path already exists.

        Args:
            remoteSrcFile: URL handed to ``gdown.download`` (with ``fuzzy = True``).
            localDstFile: Path the download is written to. A missing parent
                directory is created first.

        Raises:
            FileNotFoundError: If ``localDstFile`` still does not exist after
                the download attempt.
        """
        if os.path.exists(localDstFile):
            return

        # os.makedirs('') raises, so only create a directory when the
        # destination actually has a directory component.
        dstDir = os.path.dirname(localDstFile)
        if dstDir:
            os.makedirs(dstDir, exist_ok = True)

        gdown.download(url = remoteSrcFile, output = localDstFile, fuzzy = True)

        # NOTE(review): gdown may report a failed download without raising
        # (version-dependent - confirm), so verify the result explicitly
        # instead of letting a later step fail with a less helpful error.
        if not os.path.exists(localDstFile):
            raise FileNotFoundError(
                f'Download of {remoteSrcFile} did not create {localDstFile}')

    @staticmethod
    def downloadSevenZipFileAndExtract(remoteSevenZipSrcFile, localSevenZipDstFile):
        """Fetch a 7z archive (if not yet present) and extract it next to itself.

        The archive is extracted into the directory containing
        ``localSevenZipDstFile`` on every call, even when the download was
        skipped because the archive already existed.

        Args:
            remoteSevenZipSrcFile: URL of the 7z archive.
            localSevenZipDstFile: Local path of the 7z archive.
        """
        GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSevenZipSrcFile, localSevenZipDstFile)
        with py7zr.SevenZipFile(localSevenZipDstFile, mode='r') as sevenZipFile:
            sevenZipFile.extractall(path = os.path.dirname(localSevenZipDstFile))
|
||||
52
src/GoogleDriveDownloaderTest.py
Normal file
52
src/GoogleDriveDownloaderTest.py
Normal file
@@ -0,0 +1,52 @@
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
import os
|
||||
from IOUtils import IOUtils
|
||||
from GoogleDriveDownloader import GoogleDriveDownloader
|
||||
|
||||
|
||||
class GoogleDriveDownloaderTest(unittest.TestCase):
    """Tests for GoogleDriveDownloader.

    NOTE(review): the tests pass Google Drive URLs to the downloader and use
    paths relative to the current working directory ('src/tmp/...'), so they
    presumably need network access and a specific working directory - confirm.
    """

    def test_downloadIfNotYetDownloaded_notYetDownloaded(self):
        # Given
        remoteSrcFile = "https://drive.google.com/file/d/1LstnMvxW4LVxgNvfk5h4AnbvPktMeNSd/view?usp=drive_link"
        localDstFile = 'src/tmp/test.txt'
        IOUtils.silentlyRemoveFile(localDstFile)

        # When
        GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSrcFile, localDstFile)

        # Then
        self.assertEqual(Path(localDstFile).read_text(), 'test')

    def test_downloadIfNotYetDownloaded_alreadyDownloaded(self):
        # Given
        remoteSrcFile = "https://drive.google.com/file/d/1LstnMvxW4LVxgNvfk5h4AnbvPktMeNSd/view?usp=drive_link"
        localDstFile = 'src/tmp/test.txt'
        content = 'local file content'
        self._createFileWithContent(localDstFile, content)

        # When
        GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSrcFile, localDstFile)

        # Then: the pre-existing local file must not have been overwritten
        self.assertEqual(Path(localDstFile).read_text(), content)

    def test_downloadSevenZipFileAndExtract(self):
        # Given
        remoteSevenZipSrcFile = "https://drive.google.com/file/d/14hFKlt48dzDnEjHS_7vYVca5elfzX0l1/view?usp=drive_link"
        localSevenZipDstFile = 'src/tmp/test.7z'
        localDstFolder = os.path.dirname(localSevenZipDstFile)
        IOUtils.silentlyRemoveFile(localSevenZipDstFile)
        IOUtils.silentlyRemoveFolder(localDstFolder + '/test')

        # When
        GoogleDriveDownloader.downloadSevenZipFileAndExtract(remoteSevenZipSrcFile, localSevenZipDstFile)

        # Then
        self.assertEqual(Path(localDstFolder + '/test/test.txt').read_text(), 'test')

    def _createFileWithContent(self, file, content):
        """Write ``content`` to ``file``, creating missing parent directories."""
        target = Path(file)
        # Creating the parent keeps the fixture independent of whether the
        # directory already exists.
        target.parent.mkdir(parents = True, exist_ok = True)
        target.write_text(content)
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
import shutil
|
||||
import simplejson as json
|
||||
|
||||
|
||||
class IOUtils:
|
||||
|
||||
@staticmethod
|
||||
@@ -13,19 +16,19 @@ class IOUtils:
|
||||
IOUtils.ensurePath(file)
|
||||
dataFrame.to_html(
|
||||
file,
|
||||
index = False,
|
||||
table_id = 'batchCodeTable',
|
||||
classes = 'display',
|
||||
justify = 'unset',
|
||||
border = 0)
|
||||
index=False,
|
||||
table_id='batchCodeTable',
|
||||
classes='display',
|
||||
justify='unset',
|
||||
border=0)
|
||||
|
||||
@staticmethod
|
||||
def saveDataFrameAsJson(dataFrame, file):
|
||||
IOUtils.ensurePath(file)
|
||||
dataFrame.to_json(
|
||||
file,
|
||||
orient = "split",
|
||||
index = False)
|
||||
orient="split",
|
||||
index=False)
|
||||
|
||||
@staticmethod
|
||||
def saveDictAsJson(dict, file):
|
||||
@@ -38,3 +41,11 @@ class IOUtils:
|
||||
directory = os.path.dirname(file)
|
||||
if not os.path.exists(directory):
|
||||
os.makedirs(directory)
|
||||
|
||||
@staticmethod
def silentlyRemoveFile(file):
    """Delete ``file``; a path that does not exist is ignored."""
    target = Path(file)
    try:
        target.unlink()
    except FileNotFoundError:
        # A missing file is the desired end state, so there is nothing to do.
        pass
|
||||
|
||||
@staticmethod
def silentlyRemoveFolder(folder):
    """Recursively delete ``folder``, ignoring any errors raised while removing it."""
    shutil.rmtree(
        folder,
        ignore_errors = True)
|
||||
|
||||
@@ -3,7 +3,7 @@ import VaersReader
|
||||
import pandas as pd
|
||||
from VaersDescrReader import VaersDescrReader
|
||||
from CountryColumnAdder import CountryColumnAdder
|
||||
|
||||
from GoogleDriveDownloader import GoogleDriveDownloader
|
||||
|
||||
def getInternationalVaersCovid19(dataDir, years):
|
||||
internationalVaers = pd.concat(
|
||||
@@ -16,6 +16,9 @@ def getInternationalVaersCovid19(dataDir, years):
|
||||
|
||||
|
||||
def getInternationalVaersCovid19BeforeDeletion():
    """Download and extract VAERSBeforeDeletion.7z, then load it.

    Returns:
        The result of ``getInternationalVaersCovid19`` for the years 2020,
        2021 and 2022 read from 'VAERS/VAERSBeforeDeletion'.
    """
    archiveUrl = "https://drive.google.com/file/d/1Rb-lfxNxw_WwvRDVLEhvqOyv_a2f8ern/view?usp=drive_link"
    archiveFile = 'VAERS/VAERSBeforeDeletion.7z'

    # The archive is extracted into the directory containing it ('VAERS').
    # NOTE(review): presumably it unpacks to 'VAERS/VAERSBeforeDeletion',
    # which is the folder read below - confirm against the archive layout.
    GoogleDriveDownloader.downloadSevenZipFileAndExtract(
        remoteSevenZipSrcFile = archiveUrl,
        localSevenZipDstFile = archiveFile)

    extractedDataDir = 'VAERS/VAERSBeforeDeletion'
    yearsBeforeDeletion = [2020, 2021, 2022]
    return getInternationalVaersCovid19(dataDir = extractedDataDir, years = yearsBeforeDeletion)
|
||||
|
||||
def get_international_VAERSVAX_VAERSSYMPTOMS_Covid19(years):
|
||||
|
||||
@@ -16,34 +16,34 @@ from captcha.CaptchaShape import CaptchaShape
|
||||
|
||||
def updateVAERSFiles(years, workingDirectory):
|
||||
for year in years:
|
||||
downloadVAERSFileAndUnzip(f'{year}VAERSData.zip', workingDirectory)
|
||||
downloadVAERSFileAndUnzip('NonDomesticVAERSData.zip', workingDirectory)
|
||||
_downloadVAERSFileAndUnzip(f'{year}VAERSData.zip', workingDirectory)
|
||||
_downloadVAERSFileAndUnzip('NonDomesticVAERSData.zip', workingDirectory)
|
||||
|
||||
def downloadVAERSFileAndUnzip(file, workingDirectory):
|
||||
downloadedFile = downloadVAERSFile(file, workingDirectory + "/VAERS/tmp")
|
||||
def _downloadVAERSFileAndUnzip(file, workingDirectory):
|
||||
downloadedFile = _downloadVAERSFile(file, workingDirectory + "/VAERS/tmp")
|
||||
unzipAndRemove(
|
||||
zipFile = downloadedFile,
|
||||
dstDir = workingDirectory + '/VAERS/')
|
||||
|
||||
def downloadVAERSFile(file, downloadDir):
|
||||
driver = getWebDriver(downloadDir, isHeadless = False)
|
||||
downloadedFile = downloadFile(
|
||||
def _downloadVAERSFile(file, downloadDir):
|
||||
driver = getWebDriver(downloadDir, isHeadless = True)
|
||||
downloadedFile = _downloadFile(
|
||||
absoluteFile = downloadDir + "/" + file,
|
||||
driver = driver,
|
||||
maxTries = None)
|
||||
driver.quit()
|
||||
return downloadedFile
|
||||
|
||||
def downloadFile(absoluteFile, driver, maxTries):
|
||||
def _downloadFile(absoluteFile, driver, maxTries):
|
||||
captchaReader = _createCaptchaReader()
|
||||
def _downloadFile():
|
||||
def downloadFile():
|
||||
driver.get('https://vaers.hhs.gov/eSubDownload/index.jsp?fn=' + os.path.basename(absoluteFile))
|
||||
solveCaptchaAndStartFileDownload(driver, captchaReader, 'captchaImage.jpeg')
|
||||
_solveCaptchaAndStartFileDownload(driver, captchaReader, 'captchaImage.jpeg')
|
||||
|
||||
numTries = 1
|
||||
_downloadFile()
|
||||
downloadFile()
|
||||
while(not isCaptchaSolved(driver) and (maxTries is None or numTries < maxTries)):
|
||||
_downloadFile()
|
||||
downloadFile()
|
||||
numTries = numTries + 1
|
||||
|
||||
if isCaptchaSolved(driver):
|
||||
@@ -57,7 +57,7 @@ def _createCaptchaReader():
|
||||
return CaptchaReader(modelFilepath = f'{working_directory}/captcha/MobileNetV3Small',
|
||||
captchaShape = CaptchaShape())
|
||||
|
||||
def solveCaptchaAndStartFileDownload(driver, captchaReader, captchaImageFile):
|
||||
def _solveCaptchaAndStartFileDownload(driver, captchaReader, captchaImageFile):
|
||||
saveCaptchaImageAs(driver, captchaImageFile)
|
||||
textInCaptchaImage = captchaReader.getTextInCaptchaImage(captchaImageFile)
|
||||
print('textInCaptchaImage:', textInCaptchaImage)
|
||||
|
||||
@@ -6,7 +6,8 @@ from selenium.webdriver.common.by import By
|
||||
|
||||
def _getOptions(downloadDir, isHeadless):
|
||||
options = Options()
|
||||
options.headless = isHeadless
|
||||
if isHeadless:
|
||||
options.add_argument('--headless=new')
|
||||
options.add_experimental_option("prefs", {"download.default_directory" : downloadDir})
|
||||
return options
|
||||
|
||||
|
||||
Reference in New Issue
Block a user