downloading VAERSBeforeDeletion.7z

This commit is contained in:
Frank Knoll
2024-04-22 12:09:43 +02:00
parent 6a488254fa
commit f8cecf8bad
9 changed files with 113 additions and 25 deletions

View File

@@ -25,19 +25,20 @@ jobs:
- name: Installing Google Chrome
shell: bash -el {0}
run: |
conda activate howbadismybatch-venv
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
sudo dpkg -i google-chrome-stable_current_amd64.deb
sudo apt install google-chrome-stable
pip install selenium webdriver-manager
pip install pycountry
pip install python-decouple
pip install --upgrade selenium webdriver-manager
- name: Installing ipython kernel
shell: bash -el {0}
run: |
conda activate howbadismybatch-venv
ipython kernel install --user --name=howbadismybatch-venv-kernel
- name: Executing HowBadIsMyBatch.ipynb
shell: bash -el {0}
run: |
conda activate howbadismybatch-venv
cd src
jupyter nbconvert --to script HowBadIsMyBatch.ipynb
python ./HowBadIsMyBatch.py

1
.gitignore vendored
View File

@@ -17,3 +17,4 @@ google-chrome-stable_current_amd64*
src/captcha/__pycache__
src/GoogleAnalytics/__pycache__
src/SymptomsCausedByVaccines/__pycache__
src/HowBadIsMyBatch.py

View File

@@ -9,6 +9,8 @@ dependencies:
- pandas=1.4.0
- urllib3
- requests
- gdown
- py7zr
- bs4
- lxml
- jupyter

View File

@@ -0,0 +1,17 @@
import gdown
import py7zr
import os
class GoogleDriveDownloader:
    """Static helpers that fetch files from Google Drive with ``gdown``."""

    @staticmethod
    def downloadIfNotYetDownloaded(remoteSrcFile, localDstFile):
        """Download remoteSrcFile to localDstFile unless localDstFile already exists.

        An existing local file is never overwritten or re-validated; delete it
        first to force a fresh download.

        Args:
            remoteSrcFile: URL of the remote file (passed to ``gdown.download``
                with ``fuzzy=True``).
            localDstFile: Path the downloaded file is written to.
        """
        if os.path.exists(localDstFile):
            return
        # Create the parent directory up front so that a fresh checkout (where
        # e.g. the destination folder does not exist yet) does not fail.
        os.makedirs(os.path.dirname(localDstFile) or '.', exist_ok=True)
        gdown.download(url=remoteSrcFile, output=localDstFile, fuzzy=True)

    @staticmethod
    def downloadSevenZipFileAndExtract(remoteSevenZipSrcFile, localSevenZipDstFile):
        """Download a 7z archive (if not yet present) and extract it next to itself.

        The archive is extracted on every call, even when the download itself
        was skipped because the archive already existed locally.

        Args:
            remoteSevenZipSrcFile: URL of the remote 7z archive.
            localSevenZipDstFile: Local path of the archive; its directory is
                also the extraction target.
        """
        GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSevenZipSrcFile, localSevenZipDstFile)
        # A bare file name has an empty dirname; extract into the current directory then.
        extractionDir = os.path.dirname(localSevenZipDstFile) or '.'
        with py7zr.SevenZipFile(localSevenZipDstFile, mode='r') as sevenZipFile:
            sevenZipFile.extractall(path=extractionDir)

View File

@@ -0,0 +1,52 @@
import unittest
from pathlib import Path
import os
from IOUtils import IOUtils
from GoogleDriveDownloader import GoogleDriveDownloader
class GoogleDriveDownloaderTest(unittest.TestCase):
    """Tests for GoogleDriveDownloader.

    The tests use real Google Drive URLs and paths relative to the current
    working directory (``src/tmp/...``).
    """

    def test_downloadIfNotYetDownloaded_notYetDownloaded(self):
        # Given
        remoteSrcFile = "https://drive.google.com/file/d/1LstnMvxW4LVxgNvfk5h4AnbvPktMeNSd/view?usp=drive_link"
        localDstFile = 'src/tmp/test.txt'
        IOUtils.silentlyRemoveFile(localDstFile)

        # When
        GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSrcFile, localDstFile)

        # Then
        self.assertEqual(Path(localDstFile).read_text(), 'test')

    def test_downloadIfNotYetDownloaded_alreadyDownloaded(self):
        # Given
        remoteSrcFile = "https://drive.google.com/file/d/1LstnMvxW4LVxgNvfk5h4AnbvPktMeNSd/view?usp=drive_link"
        localDstFile = 'src/tmp/test.txt'
        content = 'local file content'
        self._createFileWithContent(localDstFile, content)

        # When
        GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSrcFile, localDstFile)

        # Then
        # The pre-existing local content must survive, i.e. nothing was downloaded.
        self.assertEqual(Path(localDstFile).read_text(), content)

    def test_downloadSevenZipFileAndExtract(self):
        # Given
        remoteSevenZipSrcFile = "https://drive.google.com/file/d/14hFKlt48dzDnEjHS_7vYVca5elfzX0l1/view?usp=drive_link"
        localSevenZipDstFile = 'src/tmp/test.7z'
        localDstFolder = os.path.dirname(localSevenZipDstFile)
        IOUtils.silentlyRemoveFile(localSevenZipDstFile)
        IOUtils.silentlyRemoveFolder(localDstFolder + '/test')

        # When
        GoogleDriveDownloader.downloadSevenZipFileAndExtract(remoteSevenZipSrcFile, localSevenZipDstFile)

        # Then
        self.assertEqual(Path(localDstFolder + '/test/test.txt').read_text(), 'test')

    def _createFileWithContent(self, file, content):
        """Write content to file, creating the parent directory when it is missing."""
        os.makedirs(os.path.dirname(file) or '.', exist_ok=True)
        # Use a distinct name for the handle so the path parameter is not shadowed.
        with open(file, 'w') as fileHandle:
            fileHandle.write(content)

View File

@@ -1,6 +1,9 @@
import os
from pathlib import Path
import shutil
import simplejson as json
class IOUtils:
@staticmethod
@@ -13,19 +16,19 @@ class IOUtils:
IOUtils.ensurePath(file)
dataFrame.to_html(
file,
index = False,
table_id = 'batchCodeTable',
classes = 'display',
justify = 'unset',
border = 0)
index=False,
table_id='batchCodeTable',
classes='display',
justify='unset',
border=0)
@staticmethod
def saveDataFrameAsJson(dataFrame, file):
    """Write dataFrame to file as JSON using the "split" orientation, without the index.

    IOUtils.ensurePath(file) is called first, before anything is written.

    Args:
        dataFrame: Object whose ``to_json`` method is called (a pandas
            DataFrame, judging by the name).
        file: Destination path of the JSON file.
    """
    IOUtils.ensurePath(file)
    # Each keyword is passed exactly once; repeating a keyword argument in a
    # call is a SyntaxError.
    dataFrame.to_json(
        file,
        orient="split",
        index=False)
@staticmethod
def saveDictAsJson(dict, file):
@@ -38,3 +41,11 @@ class IOUtils:
directory = os.path.dirname(file)
if not os.path.exists(directory):
os.makedirs(directory)
@staticmethod
def silentlyRemoveFile(file):
    """Delete file; a file that does not exist is not an error."""
    try:
        os.unlink(Path(file))
    except FileNotFoundError:
        # Nothing to remove: that is exactly the "silent" part of the contract.
        pass
@staticmethod
def silentlyRemoveFolder(folder):
    """Recursively delete folder, ignoring any errors (including a missing folder)."""
    shutil.rmtree(path=folder, ignore_errors=True)

View File

@@ -3,7 +3,7 @@ import VaersReader
import pandas as pd
from VaersDescrReader import VaersDescrReader
from CountryColumnAdder import CountryColumnAdder
from GoogleDriveDownloader import GoogleDriveDownloader
def getInternationalVaersCovid19(dataDir, years):
internationalVaers = pd.concat(
@@ -16,6 +16,9 @@ def getInternationalVaersCovid19(dataDir, years):
def getInternationalVaersCovid19BeforeDeletion():
    """Return getInternationalVaersCovid19 for the 'VAERSBeforeDeletion' data of 2020-2022.

    The VAERSBeforeDeletion.7z archive is first downloaded from Google Drive
    and extracted via GoogleDriveDownloader, so the data directory read
    afterwards is available locally.
    """
    archiveUrl = "https://drive.google.com/file/d/1Rb-lfxNxw_WwvRDVLEhvqOyv_a2f8ern/view?usp=drive_link"
    localArchive = 'VAERS/VAERSBeforeDeletion.7z'
    GoogleDriveDownloader.downloadSevenZipFileAndExtract(
        remoteSevenZipSrcFile=archiveUrl,
        localSevenZipDstFile=localArchive)

    extractedDataDir = 'VAERS/VAERSBeforeDeletion'
    coveredYears = [2020, 2021, 2022]
    return getInternationalVaersCovid19(dataDir=extractedDataDir, years=coveredYears)
def get_international_VAERSVAX_VAERSSYMPTOMS_Covid19(years):

View File

@@ -16,34 +16,34 @@ from captcha.CaptchaShape import CaptchaShape
def updateVAERSFiles(years, workingDirectory):
for year in years:
downloadVAERSFileAndUnzip(f'{year}VAERSData.zip', workingDirectory)
downloadVAERSFileAndUnzip('NonDomesticVAERSData.zip', workingDirectory)
_downloadVAERSFileAndUnzip(f'{year}VAERSData.zip', workingDirectory)
_downloadVAERSFileAndUnzip('NonDomesticVAERSData.zip', workingDirectory)
def downloadVAERSFileAndUnzip(file, workingDirectory):
downloadedFile = downloadVAERSFile(file, workingDirectory + "/VAERS/tmp")
def _downloadVAERSFileAndUnzip(file, workingDirectory):
downloadedFile = _downloadVAERSFile(file, workingDirectory + "/VAERS/tmp")
unzipAndRemove(
zipFile = downloadedFile,
dstDir = workingDirectory + '/VAERS/')
def downloadVAERSFile(file, downloadDir):
driver = getWebDriver(downloadDir, isHeadless = False)
downloadedFile = downloadFile(
def _downloadVAERSFile(file, downloadDir):
driver = getWebDriver(downloadDir, isHeadless = True)
downloadedFile = _downloadFile(
absoluteFile = downloadDir + "/" + file,
driver = driver,
maxTries = None)
driver.quit()
return downloadedFile
def downloadFile(absoluteFile, driver, maxTries):
def _downloadFile(absoluteFile, driver, maxTries):
captchaReader = _createCaptchaReader()
def _downloadFile():
def downloadFile():
driver.get('https://vaers.hhs.gov/eSubDownload/index.jsp?fn=' + os.path.basename(absoluteFile))
solveCaptchaAndStartFileDownload(driver, captchaReader, 'captchaImage.jpeg')
_solveCaptchaAndStartFileDownload(driver, captchaReader, 'captchaImage.jpeg')
numTries = 1
_downloadFile()
downloadFile()
while(not isCaptchaSolved(driver) and (maxTries is None or numTries < maxTries)):
_downloadFile()
downloadFile()
numTries = numTries + 1
if isCaptchaSolved(driver):
@@ -57,7 +57,7 @@ def _createCaptchaReader():
return CaptchaReader(modelFilepath = f'{working_directory}/captcha/MobileNetV3Small',
captchaShape = CaptchaShape())
def solveCaptchaAndStartFileDownload(driver, captchaReader, captchaImageFile):
def _solveCaptchaAndStartFileDownload(driver, captchaReader, captchaImageFile):
saveCaptchaImageAs(driver, captchaImageFile)
textInCaptchaImage = captchaReader.getTextInCaptchaImage(captchaImageFile)
print('textInCaptchaImage:', textInCaptchaImage)

View File

@@ -6,7 +6,8 @@ from selenium.webdriver.common.by import By
def _getOptions(downloadDir, isHeadless):
    """Build the browser Options used for downloading files.

    Args:
        downloadDir: Directory set as the browser's default download directory.
        isHeadless: When true, the browser is started with ``--headless=new``.

    Returns:
        The configured Options instance.
    """
    options = Options()
    if isHeadless:
        # Headless mode is requested only through the command-line flag; the
        # `options.headless` attribute is deprecated in Selenium and would be
        # redundant next to the flag.
        options.add_argument('--headless=new')
    options.add_experimental_option("prefs", {"download.default_directory": downloadDir})
    return options