downloading VAERSBeforeDeletion.7z

2024-04-22 12:09:43 +02:00
parent 6a488254fa
commit f8cecf8bad
9 changed files with 113 additions and 25 deletions
--- a/.github/workflows/buildAndDeployWebsite.yml
+++ b/.github/workflows/buildAndDeployWebsite.yml
@@ -25,19 +25,20 @@ jobs:
      - name: Installing Google Chrome
        shell: bash -el {0}
        run: |
+          conda activate howbadismybatch-venv
          wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
          sudo dpkg -i google-chrome-stable_current_amd64.deb
          sudo apt install google-chrome-stable
-          pip install selenium webdriver-manager
-          pip install pycountry
-          pip install python-decouple
+          pip install --upgrade selenium webdriver-manager
      - name: Installing ipython kernel
        shell: bash -el {0}
        run: |
+          conda activate howbadismybatch-venv
          ipython kernel install --user --name=howbadismybatch-venv-kernel
      - name: Executing HowBadIsMyBatch.ipynb
        shell: bash -el {0}
        run: |
+          conda activate howbadismybatch-venv
          cd src
          jupyter nbconvert --to script HowBadIsMyBatch.ipynb
          python ./HowBadIsMyBatch.py          
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,4 @@ google-chrome-stable_current_amd64*
 src/captcha/__pycache__
 src/GoogleAnalytics/__pycache__
 src/SymptomsCausedByVaccines/__pycache__
+src/HowBadIsMyBatch.py
--- a/environment.yml
+++ b/environment.yml
@@ -9,6 +9,8 @@ dependencies:
  - pandas=1.4.0
  - urllib3
  - requests
+  - gdown
+  - py7zr
  - bs4
  - lxml
  - jupyter
--- a/src/GoogleDriveDownloader.py
+++ b/src/GoogleDriveDownloader.py
@@ -0,0 +1,41 @@
+import gdown
+import py7zr
+import os
+
+
+class GoogleDriveDownloader:
+    """Downloads files from Google Drive and unpacks downloaded 7z archives."""
+
+    @staticmethod
+    def downloadIfNotYetDownloaded(remoteSrcFile, localDstFile):
+        """Download remoteSrcFile to localDstFile unless localDstFile already exists.
+
+        Args:
+            remoteSrcFile: Google Drive link of the file (passed to gdown with fuzzy = True).
+            localDstFile: local path the downloaded file is written to.
+
+        Raises:
+            FileNotFoundError: if localDstFile does not exist after the download attempt.
+        """
+        if os.path.exists(localDstFile):
+            return
+
+        # Create the destination folder first so the download does not depend on it already existing.
+        dstFolder = os.path.dirname(localDstFile)
+        if dstFolder:
+            os.makedirs(dstFolder, exist_ok = True)
+        gdown.download(url = remoteSrcFile, output = localDstFile, fuzzy = True)
+        # Report a failed download here rather than when the missing file is read later on.
+        if not os.path.exists(localDstFile):
+            raise FileNotFoundError(f'Could not download {remoteSrcFile} to {localDstFile}')
+
+    @staticmethod
+    def downloadSevenZipFileAndExtract(remoteSevenZipSrcFile, localSevenZipDstFile):
+        """Download a 7z archive (if not yet downloaded) and extract it next to the archive.
+
+        The archive is extracted into the folder containing localSevenZipDstFile on
+        every call, whether or not the archive had to be downloaded.
+        """
+        GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSevenZipSrcFile, localSevenZipDstFile)
+        with py7zr.SevenZipFile(localSevenZipDstFile, mode='r') as sevenZipFile:
+            sevenZipFile.extractall(path = os.path.dirname(localSevenZipDstFile))
--- a/src/GoogleDriveDownloaderTest.py
+++ b/src/GoogleDriveDownloaderTest.py
@@ -0,0 +1,60 @@
+import unittest
+from pathlib import Path
+import os
+from IOUtils import IOUtils
+from GoogleDriveDownloader import GoogleDriveDownloader
+
+
+class GoogleDriveDownloaderTest(unittest.TestCase):
+    """Tests GoogleDriveDownloader against small files hosted on Google Drive.
+
+    The tests download from the given links, so they need network access.
+    Local paths are relative to the current working directory.
+    """
+
+    def test_downloadIfNotYetDownloaded_notYetDownloaded(self):
+        # Given
+        remoteSrcFile = "https://drive.google.com/file/d/1LstnMvxW4LVxgNvfk5h4AnbvPktMeNSd/view?usp=drive_link"
+        localDstFile = 'src/tmp/test.txt'
+        IOUtils.ensurePath(localDstFile)
+        IOUtils.silentlyRemoveFile(localDstFile)
+
+        # When
+        GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSrcFile, localDstFile)
+
+        # Then
+        self.assertEqual(Path(localDstFile).read_text(), 'test')
+
+    def test_downloadIfNotYetDownloaded_alreadyDownloaded(self):
+        # Given
+        remoteSrcFile = "https://drive.google.com/file/d/1LstnMvxW4LVxgNvfk5h4AnbvPktMeNSd/view?usp=drive_link"
+        localDstFile = 'src/tmp/test.txt'
+        content = 'local file content'
+        self._createFileWithContent(localDstFile, content)
+
+        # When
+        GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSrcFile, localDstFile)
+
+        # Then
+        self.assertEqual(Path(localDstFile).read_text(), content)
+
+    def test_downloadSevenZipFileAndExtract(self):
+        # Given
+        remoteSevenZipSrcFile = "https://drive.google.com/file/d/14hFKlt48dzDnEjHS_7vYVca5elfzX0l1/view?usp=drive_link"
+        localSevenZipDstFile = 'src/tmp/test.7z'
+        localDstFolder = os.path.dirname(localSevenZipDstFile)
+        IOUtils.ensurePath(localSevenZipDstFile)
+        IOUtils.silentlyRemoveFile(localSevenZipDstFile)
+        IOUtils.silentlyRemoveFolder(localDstFolder + '/test')
+
+        # When
+        GoogleDriveDownloader.downloadSevenZipFileAndExtract(remoteSevenZipSrcFile, localSevenZipDstFile)
+
+        # Then
+        self.assertEqual(Path(localDstFolder + '/test/test.txt').read_text(), 'test')
+
+    def _createFileWithContent(self, file, content):
+        """Write content to file, creating the parent folder if needed."""
+        IOUtils.ensurePath(file)
+        with open(file, 'w') as fileHandle:
+            fileHandle.write(content)
--- a/src/IOUtils.py
+++ b/src/IOUtils.py
@@ -1,6 +1,9 @@
 import os
+from pathlib import Path
+import shutil
 import simplejson as json

+
 class IOUtils:

    @staticmethod
@@ -13,19 +16,19 @@ class IOUtils:
        IOUtils.ensurePath(file)
        dataFrame.to_html(
            file,
-            index = False,
-            table_id = 'batchCodeTable',
-            classes = 'display',
-            justify = 'unset',
-            border = 0)
+            index=False,
+            table_id='batchCodeTable',
+            classes='display',
+            justify='unset',
+            border=0)

    @staticmethod
    def saveDataFrameAsJson(dataFrame, file):
        IOUtils.ensurePath(file)
        dataFrame.to_json(
            file,
-            orient = "split",
-            index = False)
+            orient="split",
+            index=False)

    @staticmethod
    def saveDictAsJson(dict, file):
@@ -38,3 +41,11 @@ class IOUtils:
        directory = os.path.dirname(file)
        if not os.path.exists(directory):
            os.makedirs(directory)
+
+    @staticmethod
+    def silentlyRemoveFile(file):
+        Path(file).unlink(missing_ok=True)
+
+    @staticmethod
+    def silentlyRemoveFolder(folder):
+        shutil.rmtree(folder, ignore_errors=True)
--- a/src/InternationalVaersCovid19Provider.py
+++ b/src/InternationalVaersCovid19Provider.py
@@ -3,7 +3,7 @@ import VaersReader
 import pandas as pd
 from VaersDescrReader import VaersDescrReader
 from CountryColumnAdder import CountryColumnAdder
-
+from GoogleDriveDownloader import GoogleDriveDownloader

 def getInternationalVaersCovid19(dataDir, years):
    internationalVaers = pd.concat(
@@ -16,6 +16,9 @@ def getInternationalVaersCovid19(dataDir, years):


 def getInternationalVaersCovid19BeforeDeletion():
+    GoogleDriveDownloader.downloadSevenZipFileAndExtract(
+        remoteSevenZipSrcFile = "https://drive.google.com/file/d/1Rb-lfxNxw_WwvRDVLEhvqOyv_a2f8ern/view?usp=drive_link",
+        localSevenZipDstFile = 'VAERS/VAERSBeforeDeletion.7z')
    return getInternationalVaersCovid19(dataDir = 'VAERS/VAERSBeforeDeletion', years = [2020, 2021, 2022])

 def get_international_VAERSVAX_VAERSSYMPTOMS_Covid19(years):
--- a/src/VAERSFileDownloader.py
+++ b/src/VAERSFileDownloader.py
@@ -16,34 +16,34 @@ from captcha.CaptchaShape import CaptchaShape
    
 def updateVAERSFiles(years, workingDirectory):
    for year in years:
-        downloadVAERSFileAndUnzip(f'{year}VAERSData.zip', workingDirectory)
-    downloadVAERSFileAndUnzip('NonDomesticVAERSData.zip', workingDirectory)
+        _downloadVAERSFileAndUnzip(f'{year}VAERSData.zip', workingDirectory)
+    _downloadVAERSFileAndUnzip('NonDomesticVAERSData.zip', workingDirectory)
    
-def downloadVAERSFileAndUnzip(file, workingDirectory):
-    downloadedFile = downloadVAERSFile(file, workingDirectory + "/VAERS/tmp")
+def _downloadVAERSFileAndUnzip(file, workingDirectory):
+    downloadedFile = _downloadVAERSFile(file, workingDirectory + "/VAERS/tmp")
    unzipAndRemove(
        zipFile = downloadedFile,
        dstDir = workingDirectory + '/VAERS/')

-def downloadVAERSFile(file, downloadDir):
-    driver = getWebDriver(downloadDir, isHeadless = False)
-    downloadedFile = downloadFile(
+def _downloadVAERSFile(file, downloadDir):
+    driver = getWebDriver(downloadDir, isHeadless = True)
+    downloadedFile = _downloadFile(
        absoluteFile = downloadDir + "/" + file,
        driver = driver,
        maxTries = None)
    driver.quit()
    return downloadedFile

-def downloadFile(absoluteFile, driver, maxTries):
+def _downloadFile(absoluteFile, driver, maxTries):
    captchaReader = _createCaptchaReader()
-    def _downloadFile():
+    def downloadFile():
        driver.get('https://vaers.hhs.gov/eSubDownload/index.jsp?fn=' + os.path.basename(absoluteFile))
-        solveCaptchaAndStartFileDownload(driver, captchaReader, 'captchaImage.jpeg')
+        _solveCaptchaAndStartFileDownload(driver, captchaReader, 'captchaImage.jpeg')

    numTries = 1
-    _downloadFile()
+    downloadFile()
    while(not isCaptchaSolved(driver) and (maxTries is None or numTries < maxTries)):
-        _downloadFile()
+        downloadFile()
        numTries = numTries + 1

    if isCaptchaSolved(driver):
@@ -57,7 +57,7 @@ def _createCaptchaReader():
    return CaptchaReader(modelFilepath = f'{working_directory}/captcha/MobileNetV3Small',
                         captchaShape = CaptchaShape())

-def solveCaptchaAndStartFileDownload(driver, captchaReader, captchaImageFile):
+def _solveCaptchaAndStartFileDownload(driver, captchaReader, captchaImageFile):
    saveCaptchaImageAs(driver, captchaImageFile)
    textInCaptchaImage = captchaReader.getTextInCaptchaImage(captchaImageFile)
    print('textInCaptchaImage:', textInCaptchaImage)
--- a/src/WebDriver.py
+++ b/src/WebDriver.py
@@ -6,7 +6,8 @@ from selenium.webdriver.common.by import By

 def _getOptions(downloadDir, isHeadless):
    options = Options()
-    options.headless = isHeadless
+    if isHeadless:
+        options.add_argument('--headless=new')
    options.add_experimental_option("prefs", {"download.default_directory" : downloadDir})
    return options