diff --git a/.github/workflows/buildAndDeployWebsite.yml b/.github/workflows/buildAndDeployWebsite.yml
index 4d2f9994a9f..a0d9f63cb9b 100644
--- a/.github/workflows/buildAndDeployWebsite.yml
+++ b/.github/workflows/buildAndDeployWebsite.yml
@@ -25,19 +25,20 @@ jobs:
       - name: Installing Google Chrome
         shell: bash -el {0}
         run: |
+          conda activate howbadismybatch-venv
           wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
           sudo dpkg -i google-chrome-stable_current_amd64.deb
           sudo apt install google-chrome-stable
-          pip install selenium webdriver-manager
-          pip install pycountry
-          pip install python-decouple
+          pip install --upgrade selenium webdriver-manager
       - name: Installing ipython kernel
         shell: bash -el {0}
         run: |
+          conda activate howbadismybatch-venv
           ipython kernel install --user --name=howbadismybatch-venv-kernel
       - name: Executing HowBadIsMyBatch.ipynb
         shell: bash -el {0}
         run: |
+          conda activate howbadismybatch-venv
           cd src
           jupyter nbconvert --to script HowBadIsMyBatch.ipynb
           python ./HowBadIsMyBatch.py
diff --git a/.gitignore b/.gitignore
index 0a12bc1c9c5..82558696bbc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,4 @@ google-chrome-stable_current_amd64*
 src/captcha/__pycache__
 src/GoogleAnalytics/__pycache__
 src/SymptomsCausedByVaccines/__pycache__
+src/HowBadIsMyBatch.py
diff --git a/environment.yml b/environment.yml
index a0b6931d0a4..9ce59672ae0 100644
--- a/environment.yml
+++ b/environment.yml
@@ -9,6 +9,8 @@ dependencies:
   - pandas=1.4.0
   - urllib3
   - requests
+  - gdown
+  - py7zr
   - bs4
   - lxml
   - jupyter
diff --git a/src/GoogleDriveDownloader.py b/src/GoogleDriveDownloader.py
new file mode 100644
index 00000000000..ab33e9a1240
--- /dev/null
+++ b/src/GoogleDriveDownloader.py
@@ -0,0 +1,22 @@
+import gdown
+import py7zr
+import os
+
+
+class GoogleDriveDownloader:
+    """Downloads files shared via Google Drive links, skipping files that already exist locally."""
+
+    @staticmethod
+    def downloadIfNotYetDownloaded(remoteSrcFile, localDstFile):
+        """Download remoteSrcFile to localDstFile unless localDstFile already exists."""
+        if not os.path.exists(localDstFile):
+            # create the destination folder up front so that the download does not fail on a missing folder
+            os.makedirs(os.path.dirname(localDstFile) or '.', exist_ok = True)
+            gdown.download(url = remoteSrcFile, output = localDstFile, fuzzy = True)
+
+    @staticmethod
+    def downloadSevenZipFileAndExtract(remoteSevenZipSrcFile, localSevenZipDstFile):
+        """Download the 7z archive if it is missing locally, then extract it into the archive's folder."""
+        GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSevenZipSrcFile, localSevenZipDstFile)
+        with py7zr.SevenZipFile(localSevenZipDstFile, mode='r') as sevenZipFile:
+            sevenZipFile.extractall(path = os.path.dirname(localSevenZipDstFile))
diff --git a/src/GoogleDriveDownloaderTest.py b/src/GoogleDriveDownloaderTest.py
new file mode 100644
index 00000000000..927bfba65b7
--- /dev/null
+++ b/src/GoogleDriveDownloaderTest.py
@@ -0,0 +1,54 @@
+import unittest
+from pathlib import Path
+import os
+from IOUtils import IOUtils
+from GoogleDriveDownloader import GoogleDriveDownloader
+
+
+class GoogleDriveDownloaderTest(unittest.TestCase):
+    """Exercises GoogleDriveDownloader against files shared on Google Drive (needs network access)."""
+
+    def test_downloadIfNotYetDownloaded_notYetDownloaded(self):
+        # Given
+        remoteSrcFile = "https://drive.google.com/file/d/1LstnMvxW4LVxgNvfk5h4AnbvPktMeNSd/view?usp=drive_link"
+        localDstFile = 'src/tmp/test.txt'
+        IOUtils.silentlyRemoveFile(localDstFile)
+
+        # When
+        GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSrcFile, localDstFile)
+
+        # Then
+        self.assertEqual(Path(localDstFile).read_text(), 'test')
+
+    def test_downloadIfNotYetDownloaded_alreadyDownloaded(self):
+        # Given
+        remoteSrcFile = "https://drive.google.com/file/d/1LstnMvxW4LVxgNvfk5h4AnbvPktMeNSd/view?usp=drive_link"
+        localDstFile = 'src/tmp/test.txt'
+        content = 'local file content'
+        self._createFileWithContent(localDstFile, content)
+
+        # When
+        GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSrcFile, localDstFile)
+
+        # Then
+        self.assertEqual(Path(localDstFile).read_text(), content)
+
+    def test_downloadSevenZipFileAndExtract(self):
+        # Given
+        remoteSevenZipSrcFile = "https://drive.google.com/file/d/14hFKlt48dzDnEjHS_7vYVca5elfzX0l1/view?usp=drive_link"
+        localSevenZipDstFile = 'src/tmp/test.7z'
+        localDstFolder = os.path.dirname(localSevenZipDstFile)
+        IOUtils.silentlyRemoveFile(localSevenZipDstFile)
+        IOUtils.silentlyRemoveFolder(localDstFolder + '/test')
+
+        # When
+        GoogleDriveDownloader.downloadSevenZipFileAndExtract(remoteSevenZipSrcFile, localSevenZipDstFile)
+
+        # Then
+        self.assertEqual(Path(localDstFolder + '/test/test.txt').read_text(), 'test')
+
+    def _createFileWithContent(self, file, content):
+        """Write content to file, creating the file's folder first if it is missing."""
+        IOUtils.ensurePath(file)
+        with open(file, 'w') as fileHandle:
+            fileHandle.write(content)
diff --git a/src/IOUtils.py b/src/IOUtils.py
index 4085d1e39e4..9f698dfa5dd 100644
--- a/src/IOUtils.py
+++ b/src/IOUtils.py
@@ -1,6 +1,9 @@
 import os
+from pathlib import Path
+import shutil
 import simplejson as json
 
+
 class IOUtils:
 
     @staticmethod
@@ -13,19 +16,19 @@ class IOUtils:
         IOUtils.ensurePath(file)
         dataFrame.to_html(
             file,
-            index = False,
-            table_id = 'batchCodeTable',
-            classes = 'display',
-            justify = 'unset',
-            border = 0)
+            index=False,
+            table_id='batchCodeTable',
+            classes='display',
+            justify='unset',
+            border=0)
 
     @staticmethod
     def saveDataFrameAsJson(dataFrame, file):
         IOUtils.ensurePath(file)
         dataFrame.to_json(
             file,
-            orient = "split",
-            index = False)
+            orient="split",
+            index=False)
 
     @staticmethod
     def saveDictAsJson(dict, file):
@@ -38,3 +41,13 @@ class IOUtils:
         directory = os.path.dirname(file)
         if not os.path.exists(directory):
             os.makedirs(directory)
+
+    @staticmethod
+    def silentlyRemoveFile(file):
+        """Delete file; a missing file is not treated as an error."""
+        Path(file).unlink(missing_ok=True)
+
+    @staticmethod
+    def silentlyRemoveFolder(folder):
+        """Delete folder recursively; errors during removal are ignored."""
+        shutil.rmtree(folder, ignore_errors=True)
diff --git a/src/InternationalVaersCovid19Provider.py b/src/InternationalVaersCovid19Provider.py
index 93e8323bf59..f9b47e18905 100644
--- a/src/InternationalVaersCovid19Provider.py
+++ b/src/InternationalVaersCovid19Provider.py
@@ -3,7 +3,7 @@ import VaersReader
 import pandas as pd
 from VaersDescrReader import VaersDescrReader
 from CountryColumnAdder import CountryColumnAdder
-
+from GoogleDriveDownloader import GoogleDriveDownloader
 
 def getInternationalVaersCovid19(dataDir, years):
     internationalVaers = pd.concat(
@@ -16,6 +16,9 @@ def getInternationalVaersCovid19(dataDir, years):
 
 
 def getInternationalVaersCovid19BeforeDeletion():
+    GoogleDriveDownloader.downloadSevenZipFileAndExtract(
+        remoteSevenZipSrcFile = "https://drive.google.com/file/d/1Rb-lfxNxw_WwvRDVLEhvqOyv_a2f8ern/view?usp=drive_link",
+        localSevenZipDstFile = 'VAERS/VAERSBeforeDeletion.7z')
     return getInternationalVaersCovid19(dataDir = 'VAERS/VAERSBeforeDeletion', years = [2020, 2021, 2022])
 
 def get_international_VAERSVAX_VAERSSYMPTOMS_Covid19(years):
diff --git a/src/VAERSFileDownloader.py b/src/VAERSFileDownloader.py
index d49527ed617..45127f27fe0 100644
--- a/src/VAERSFileDownloader.py
+++ b/src/VAERSFileDownloader.py
@@ -16,34 +16,34 @@ from captcha.CaptchaShape import CaptchaShape
 
 def updateVAERSFiles(years, workingDirectory):
     for year in years:
-        downloadVAERSFileAndUnzip(f'{year}VAERSData.zip', workingDirectory)
-    downloadVAERSFileAndUnzip('NonDomesticVAERSData.zip', workingDirectory)
+        _downloadVAERSFileAndUnzip(f'{year}VAERSData.zip', workingDirectory)
+    _downloadVAERSFileAndUnzip('NonDomesticVAERSData.zip', workingDirectory)
 
-def downloadVAERSFileAndUnzip(file, workingDirectory):
-    downloadedFile = downloadVAERSFile(file, workingDirectory + "/VAERS/tmp")
+def _downloadVAERSFileAndUnzip(file, workingDirectory):
+    downloadedFile = _downloadVAERSFile(file, workingDirectory + "/VAERS/tmp")
     unzipAndRemove(
         zipFile = downloadedFile,
         dstDir = workingDirectory + '/VAERS/')
 
-def downloadVAERSFile(file, downloadDir):
-    driver = getWebDriver(downloadDir, isHeadless = False)
-    downloadedFile = downloadFile(
+def _downloadVAERSFile(file, downloadDir):
+    driver = getWebDriver(downloadDir, isHeadless = True)
+    downloadedFile = _downloadFile(
         absoluteFile = downloadDir + "/" + file,
         driver = driver,
         maxTries = None)
     driver.quit()
     return downloadedFile
 
-def downloadFile(absoluteFile, driver, maxTries):
+def _downloadFile(absoluteFile, driver, maxTries):
     captchaReader = _createCaptchaReader()
-    def _downloadFile():
+    def downloadFile():
         driver.get('https://vaers.hhs.gov/eSubDownload/index.jsp?fn=' + os.path.basename(absoluteFile))
-        solveCaptchaAndStartFileDownload(driver, captchaReader, 'captchaImage.jpeg')
+        _solveCaptchaAndStartFileDownload(driver, captchaReader, 'captchaImage.jpeg')
 
     numTries = 1
-    _downloadFile()
+    downloadFile()
     while(not isCaptchaSolved(driver) and (maxTries is None or numTries < maxTries)):
-        _downloadFile()
+        downloadFile()
         numTries = numTries + 1
 
     if isCaptchaSolved(driver):
@@ -57,7 +57,7 @@ def _createCaptchaReader():
     return CaptchaReader(modelFilepath = f'{working_directory}/captcha/MobileNetV3Small',
                          captchaShape = CaptchaShape())
 
-def solveCaptchaAndStartFileDownload(driver, captchaReader, captchaImageFile):
+def _solveCaptchaAndStartFileDownload(driver, captchaReader, captchaImageFile):
     saveCaptchaImageAs(driver, captchaImageFile)
     textInCaptchaImage = captchaReader.getTextInCaptchaImage(captchaImageFile)
     print('textInCaptchaImage:', textInCaptchaImage)
diff --git a/src/WebDriver.py b/src/WebDriver.py
index 90107b43bd1..401dfcc5778 100644
--- a/src/WebDriver.py
+++ b/src/WebDriver.py
@@ -6,7 +6,8 @@ from selenium.webdriver.common.by import By
 
 def _getOptions(downloadDir, isHeadless):
     options = Options()
-    options.headless = isHeadless
+    if isHeadless:
+        options.add_argument('--headless=new')
     options.add_experimental_option("prefs", {"download.default_directory" : downloadDir})
     return options
 