downloading VAERSBeforeDeletion.7z

2024-04-22 12:09:43 +02:00
parent 6a488254fa
commit f8cecf8bad
9 changed files with 113 additions and 25 deletions
--- a/.github/workflows/buildAndDeployWebsite.yml
+++ b/.github/workflows/buildAndDeployWebsite.yml
@@ -25,19 +25,20 @@ jobs:
      - name: Installing Google Chrome
        shell: bash -el {0}
        run: |
          conda activate howbadismybatch-venv
          wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
          sudo dpkg -i google-chrome-stable_current_amd64.deb
          sudo apt install google-chrome-stable
-          pip install selenium webdriver-manager
+          pip install --upgrade selenium webdriver-manager
          pip install pycountry
          pip install python-decouple
      - name: Installing ipython kernel
        shell: bash -el {0}
        run: |
          conda activate howbadismybatch-venv
          ipython kernel install --user --name=howbadismybatch-venv-kernel
      - name: Executing HowBadIsMyBatch.ipynb
        shell: bash -el {0}
        run: |
          conda activate howbadismybatch-venv
          cd src
          jupyter nbconvert --to script HowBadIsMyBatch.ipynb
          python ./HowBadIsMyBatch.py          
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,4 @@ google-chrome-stable_current_amd64*
 src/captcha/__pycache__
 src/GoogleAnalytics/__pycache__
 src/SymptomsCausedByVaccines/__pycache__
 src/HowBadIsMyBatch.py
--- a/environment.yml
+++ b/environment.yml
@@ -9,6 +9,8 @@ dependencies:
  - pandas=1.4.0
  - urllib3
  - requests
  - gdown
  - py7zr
  - bs4
  - lxml
  - jupyter
--- a/src/GoogleDriveDownloader.py
+++ b/src/GoogleDriveDownloader.py
@@ -0,0 +1,17 @@
 import gdown
 import py7zr
 import os
 class GoogleDriveDownloader:
    """Helpers for fetching files from Google Drive with gdown and unpacking 7z archives with py7zr."""

    @staticmethod
    def downloadIfNotYetDownloaded(remoteSrcFile, localDstFile):
        """Download remoteSrcFile to localDstFile unless localDstFile already exists.

        remoteSrcFile is passed to gdown.download as the URL (with fuzzy = True),
        localDstFile is the path the download is written to.

        Raises FileNotFoundError if the download did not produce localDstFile.
        """
        if os.path.exists(localDstFile):
            return
        # The destination directory may not exist yet (e.g. on a fresh checkout);
        # create it up front so the download can be written there.
        dstDir = os.path.dirname(localDstFile)
        if dstDir:
            os.makedirs(dstDir, exist_ok = True)
        gdown.download(url = remoteSrcFile, output = localDstFile, fuzzy = True)
        # Fail here with a clear message instead of letting a later consumer
        # stumble over the missing file.
        if not os.path.exists(localDstFile):
            raise FileNotFoundError(f'download of {remoteSrcFile} to {localDstFile} failed')

    @staticmethod
    def downloadSevenZipFileAndExtract(remoteSevenZipSrcFile, localSevenZipDstFile):
        """Download a 7z archive (if not yet downloaded) and extract it next to the archive file.

        The archive is extracted into the directory that contains localSevenZipDstFile.
        """
        GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSevenZipSrcFile, localSevenZipDstFile)
        with py7zr.SevenZipFile(localSevenZipDstFile, mode='r') as sevenZipFile:
            sevenZipFile.extractall(path = os.path.dirname(localSevenZipDstFile))
--- a/src/GoogleDriveDownloaderTest.py
+++ b/src/GoogleDriveDownloaderTest.py
@@ -0,0 +1,52 @@
 import unittest
 from pathlib import Path
 import os
 from IOUtils import IOUtils
 from GoogleDriveDownloader import GoogleDriveDownloader
 class GoogleDriveDownloaderTest(unittest.TestCase):
    """Tests for GoogleDriveDownloader.

    The tests call GoogleDriveDownloader with Google Drive URLs and use
    files below 'src/tmp' relative to the current working directory.
    """

    def test_downloadIfNotYetDownloaded_notYetDownloaded(self):
        # Given: no local copy of the file exists
        srcUrl = "https://drive.google.com/file/d/1LstnMvxW4LVxgNvfk5h4AnbvPktMeNSd/view?usp=drive_link"
        dstFile = 'src/tmp/test.txt'
        IOUtils.silentlyRemoveFile(dstFile)

        # When
        GoogleDriveDownloader.downloadIfNotYetDownloaded(srcUrl, dstFile)

        # Then: the local file has the expected content
        actualContent = Path(dstFile).read_text()
        self.assertEqual(actualContent, 'test')

    def test_downloadIfNotYetDownloaded_alreadyDownloaded(self):
        # Given: a local file with its own content already exists
        srcUrl = "https://drive.google.com/file/d/1LstnMvxW4LVxgNvfk5h4AnbvPktMeNSd/view?usp=drive_link"
        dstFile = 'src/tmp/test.txt'
        existingContent = 'local file content'
        self._createFileWithContent(dstFile, existingContent)

        # When
        GoogleDriveDownloader.downloadIfNotYetDownloaded(srcUrl, dstFile)

        # Then: the existing local file was left untouched
        actualContent = Path(dstFile).read_text()
        self.assertEqual(actualContent, existingContent)

    def test_downloadSevenZipFileAndExtract(self):
        # Given: neither the archive nor the extracted folder exists locally
        archiveUrl = "https://drive.google.com/file/d/14hFKlt48dzDnEjHS_7vYVca5elfzX0l1/view?usp=drive_link"
        archiveFile = 'src/tmp/test.7z'
        extractedFolder = os.path.dirname(archiveFile) + '/test'
        IOUtils.silentlyRemoveFile(archiveFile)
        IOUtils.silentlyRemoveFolder(extractedFolder)

        # When
        GoogleDriveDownloader.downloadSevenZipFileAndExtract(archiveUrl, archiveFile)

        # Then: the extracted file sits next to the archive
        actualContent = Path(extractedFolder + '/test.txt').read_text()
        self.assertEqual(actualContent, 'test')

    def _createFileWithContent(self, file, content):
        # Creates (or overwrites) file so that it contains exactly content.
        Path(file).write_text(content)
--- a/src/IOUtils.py
+++ b/src/IOUtils.py
@@ -1,6 +1,9 @@
 import os
 from pathlib import Path
 import shutil
 import simplejson as json
 class IOUtils:
    @staticmethod
@@ -13,19 +16,19 @@ class IOUtils:
        IOUtils.ensurePath(file)
        dataFrame.to_html(
            file,
-            index = False,
+            index=False,
-            table_id = 'batchCodeTable',
+            table_id='batchCodeTable',
-            classes = 'display',
+            classes='display',
-            justify = 'unset',
+            justify='unset',
-            border = 0)
+            border=0)
    @staticmethod
    def saveDataFrameAsJson(dataFrame, file):
        IOUtils.ensurePath(file)
        dataFrame.to_json(
            file,
-            orient = "split",
+            orient="split",
-            index = False)
+            index=False)
    @staticmethod
    def saveDictAsJson(dict, file):
@@ -38,3 +41,11 @@ class IOUtils:
        directory = os.path.dirname(file)
        if not os.path.exists(directory):
            os.makedirs(directory)
    @staticmethod
    def silentlyRemoveFile(file):
        Path(file).unlink(missing_ok=True)
    @staticmethod
    def silentlyRemoveFolder(folder):
        shutil.rmtree(folder, ignore_errors=True)
--- a/src/InternationalVaersCovid19Provider.py
+++ b/src/InternationalVaersCovid19Provider.py
@@ -3,7 +3,7 @@ import VaersReader
 import pandas as pd
 from VaersDescrReader import VaersDescrReader
 from CountryColumnAdder import CountryColumnAdder
-
+from GoogleDriveDownloader import GoogleDriveDownloader
 def getInternationalVaersCovid19(dataDir, years):
    internationalVaers = pd.concat(
@@ -16,6 +16,9 @@ def getInternationalVaersCovid19(dataDir, years):
 def getInternationalVaersCovid19BeforeDeletion():
    """Return the international COVID-19 VAERS data read from the 'VAERSBeforeDeletion' snapshot.

    The snapshot archive is downloaded from Google Drive and extracted
    via GoogleDriveDownloader, then the years 2020 to 2022 are read from
    'VAERS/VAERSBeforeDeletion'.
    """
    snapshotUrl = "https://drive.google.com/file/d/1Rb-lfxNxw_WwvRDVLEhvqOyv_a2f8ern/view?usp=drive_link"
    snapshotArchive = 'VAERS/VAERSBeforeDeletion.7z'
    GoogleDriveDownloader.downloadSevenZipFileAndExtract(
        remoteSevenZipSrcFile = snapshotUrl,
        localSevenZipDstFile = snapshotArchive)
    # NOTE(review): presumably the archive unpacks into a 'VAERSBeforeDeletion'
    # folder next to the archive file - confirm against the archive layout.
    snapshotYears = [2020, 2021, 2022]
    return getInternationalVaersCovid19(dataDir = 'VAERS/VAERSBeforeDeletion', years = snapshotYears)
 def get_international_VAERSVAX_VAERSSYMPTOMS_Covid19(years):
--- a/src/VAERSFileDownloader.py
+++ b/src/VAERSFileDownloader.py
@@ -16,34 +16,34 @@ from captcha.CaptchaShape import CaptchaShape
 def updateVAERSFiles(years, workingDirectory):
    for year in years:
-        downloadVAERSFileAndUnzip(f'{year}VAERSData.zip', workingDirectory)
+        _downloadVAERSFileAndUnzip(f'{year}VAERSData.zip', workingDirectory)
-    downloadVAERSFileAndUnzip('NonDomesticVAERSData.zip', workingDirectory)
+    _downloadVAERSFileAndUnzip('NonDomesticVAERSData.zip', workingDirectory)
-def downloadVAERSFileAndUnzip(file, workingDirectory):
+def _downloadVAERSFileAndUnzip(file, workingDirectory):
-    downloadedFile = downloadVAERSFile(file, workingDirectory + "/VAERS/tmp")
+    downloadedFile = _downloadVAERSFile(file, workingDirectory + "/VAERS/tmp")
    unzipAndRemove(
        zipFile = downloadedFile,
        dstDir = workingDirectory + '/VAERS/')
-def downloadVAERSFile(file, downloadDir):
+def _downloadVAERSFile(file, downloadDir):
-    driver = getWebDriver(downloadDir, isHeadless = False)
+    driver = getWebDriver(downloadDir, isHeadless = True)
-    downloadedFile = downloadFile(
+    downloadedFile = _downloadFile(
        absoluteFile = downloadDir + "/" + file,
        driver = driver,
        maxTries = None)
    driver.quit()
    return downloadedFile
-def downloadFile(absoluteFile, driver, maxTries):
+def _downloadFile(absoluteFile, driver, maxTries):
    captchaReader = _createCaptchaReader()
-    def _downloadFile():
+    def downloadFile():
        driver.get('https://vaers.hhs.gov/eSubDownload/index.jsp?fn=' + os.path.basename(absoluteFile))
-        solveCaptchaAndStartFileDownload(driver, captchaReader, 'captchaImage.jpeg')
+        _solveCaptchaAndStartFileDownload(driver, captchaReader, 'captchaImage.jpeg')
    numTries = 1
-    _downloadFile()
+    downloadFile()
    while(not isCaptchaSolved(driver) and (maxTries is None or numTries < maxTries)):
-        _downloadFile()
+        downloadFile()
        numTries = numTries + 1
    if isCaptchaSolved(driver):
@@ -57,7 +57,7 @@ def _createCaptchaReader():
    return CaptchaReader(modelFilepath = f'{working_directory}/captcha/MobileNetV3Small',
                         captchaShape = CaptchaShape())
-def solveCaptchaAndStartFileDownload(driver, captchaReader, captchaImageFile):
+def _solveCaptchaAndStartFileDownload(driver, captchaReader, captchaImageFile):
    saveCaptchaImageAs(driver, captchaImageFile)
    textInCaptchaImage = captchaReader.getTextInCaptchaImage(captchaImageFile)
    print('textInCaptchaImage:', textInCaptchaImage)
--- a/src/WebDriver.py
+++ b/src/WebDriver.py
@@ -6,7 +6,8 @@ from selenium.webdriver.common.by import By
 def _getOptions(downloadDir, isHeadless):
    options = Options()
-    options.headless = isHeadless
+    if isHeadless:
        options.add_argument('--headless=new')
    options.add_experimental_option("prefs", {"download.default_directory" : downloadDir})
    return options