downloading VAERSBeforeDeletion.7z

This commit is contained in:
Frank Knoll
2024-04-22 12:09:43 +02:00
parent 6a488254fa
commit f8cecf8bad
9 changed files with 113 additions and 25 deletions

View File

@@ -25,19 +25,20 @@ jobs:
- name: Installing Google Chrome - name: Installing Google Chrome
shell: bash -el {0} shell: bash -el {0}
run: | run: |
conda activate howbadismybatch-venv
wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
sudo dpkg -i google-chrome-stable_current_amd64.deb sudo dpkg -i google-chrome-stable_current_amd64.deb
sudo apt install google-chrome-stable sudo apt install google-chrome-stable
pip install selenium webdriver-manager pip install --upgrade selenium webdriver-manager
pip install pycountry
pip install python-decouple
- name: Installing ipython kernel - name: Installing ipython kernel
shell: bash -el {0} shell: bash -el {0}
run: | run: |
conda activate howbadismybatch-venv
ipython kernel install --user --name=howbadismybatch-venv-kernel ipython kernel install --user --name=howbadismybatch-venv-kernel
- name: Executing HowBadIsMyBatch.ipynb - name: Executing HowBadIsMyBatch.ipynb
shell: bash -el {0} shell: bash -el {0}
run: | run: |
conda activate howbadismybatch-venv
cd src cd src
jupyter nbconvert --to script HowBadIsMyBatch.ipynb jupyter nbconvert --to script HowBadIsMyBatch.ipynb
python ./HowBadIsMyBatch.py python ./HowBadIsMyBatch.py

1
.gitignore vendored
View File

@@ -17,3 +17,4 @@ google-chrome-stable_current_amd64*
src/captcha/__pycache__ src/captcha/__pycache__
src/GoogleAnalytics/__pycache__ src/GoogleAnalytics/__pycache__
src/SymptomsCausedByVaccines/__pycache__ src/SymptomsCausedByVaccines/__pycache__
src/HowBadIsMyBatch.py

View File

@@ -9,6 +9,8 @@ dependencies:
- pandas=1.4.0 - pandas=1.4.0
- urllib3 - urllib3
- requests - requests
- gdown
- py7zr
- bs4 - bs4
- lxml - lxml
- jupyter - jupyter

View File

@@ -0,0 +1,17 @@
import gdown
import py7zr
import os
class GoogleDriveDownloader:
    """Helpers for fetching files, optionally 7z archives, via gdown."""

    @staticmethod
    def downloadIfNotYetDownloaded(remoteSrcFile, localDstFile):
        """Download remoteSrcFile to localDstFile unless localDstFile already exists.

        An existing localDstFile is left untouched and nothing is downloaded.
        Otherwise the destination directory is created when missing, so that
        the download does not fail on a fresh checkout, and the file is
        fetched with gdown.

        Args:
            remoteSrcFile: URL of the remote file, passed to gdown as ``url``.
            localDstFile: path of the local file to create.
        """
        if os.path.exists(localDstFile):
            return
        dstDir = os.path.dirname(localDstFile)
        # dirname is '' for a bare file name, and os.makedirs('') would raise.
        if dstDir:
            os.makedirs(dstDir, exist_ok = True)
        gdown.download(url = remoteSrcFile, output = localDstFile, fuzzy = True)

    @staticmethod
    def downloadSevenZipFileAndExtract(remoteSevenZipSrcFile, localSevenZipDstFile):
        """Download a 7z archive if it is not yet present and extract it next to itself.

        The archive is extracted on every call, even when the download was
        skipped, into the directory that contains localSevenZipDstFile.

        Args:
            remoteSevenZipSrcFile: URL of the remote 7z archive.
            localSevenZipDstFile: path of the local 7z archive.
        """
        GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSevenZipSrcFile, localSevenZipDstFile)
        with py7zr.SevenZipFile(localSevenZipDstFile, mode = 'r') as sevenZipFile:
            sevenZipFile.extractall(path = os.path.dirname(localSevenZipDstFile))

View File

@@ -0,0 +1,52 @@
import unittest
from pathlib import Path
import os
from IOUtils import IOUtils
from GoogleDriveDownloader import GoogleDriveDownloader
class GoogleDriveDownloaderTest(unittest.TestCase):
    """Tests for GoogleDriveDownloader that download fixture files from Google Drive.

    All tests read and write below 'src/tmp'.
    """

    # Fixtures shared by the two downloadIfNotYetDownloaded tests.
    REMOTE_TEXT_FILE = "https://drive.google.com/file/d/1LstnMvxW4LVxgNvfk5h4AnbvPktMeNSd/view?usp=drive_link"
    LOCAL_TEXT_FILE = 'src/tmp/test.txt'

    def test_downloadIfNotYetDownloaded_notYetDownloaded(self):
        # Given
        remoteSrcFile = self.REMOTE_TEXT_FILE
        localDstFile = self.LOCAL_TEXT_FILE
        IOUtils.silentlyRemoveFile(localDstFile)

        # When
        GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSrcFile, localDstFile)

        # Then
        self.assertEqual(Path(localDstFile).read_text(), 'test')

    def test_downloadIfNotYetDownloaded_alreadyDownloaded(self):
        # Given
        remoteSrcFile = self.REMOTE_TEXT_FILE
        localDstFile = self.LOCAL_TEXT_FILE
        content = 'local file content'
        self._createFileWithContent(localDstFile, content)

        # When
        GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSrcFile, localDstFile)

        # Then: the existing local file must not have been overwritten
        self.assertEqual(Path(localDstFile).read_text(), content)

    def test_downloadSevenZipFileAndExtract(self):
        # Given
        remoteSevenZipSrcFile = "https://drive.google.com/file/d/14hFKlt48dzDnEjHS_7vYVca5elfzX0l1/view?usp=drive_link"
        localSevenZipDstFile = 'src/tmp/test.7z'
        localDstFolder = os.path.dirname(localSevenZipDstFile)
        IOUtils.silentlyRemoveFile(localSevenZipDstFile)
        IOUtils.silentlyRemoveFolder(localDstFolder + '/test')

        # When
        GoogleDriveDownloader.downloadSevenZipFileAndExtract(remoteSevenZipSrcFile, localSevenZipDstFile)

        # Then
        self.assertEqual(Path(localDstFolder + '/test/test.txt').read_text(), 'test')

    def _createFileWithContent(self, file, content):
        """Write content to file, replacing whatever the file held before."""
        # Use a separate name for the handle so the 'file' parameter is not shadowed.
        with open(file, 'w') as fileHandle:
            fileHandle.write(content)

View File

@@ -1,6 +1,9 @@
import os import os
from pathlib import Path
import shutil
import simplejson as json import simplejson as json
class IOUtils: class IOUtils:
@staticmethod @staticmethod
@@ -13,19 +16,19 @@ class IOUtils:
IOUtils.ensurePath(file) IOUtils.ensurePath(file)
dataFrame.to_html( dataFrame.to_html(
file, file,
index = False, index=False,
table_id = 'batchCodeTable', table_id='batchCodeTable',
classes = 'display', classes='display',
justify = 'unset', justify='unset',
border = 0) border=0)
@staticmethod @staticmethod
def saveDataFrameAsJson(dataFrame, file): def saveDataFrameAsJson(dataFrame, file):
IOUtils.ensurePath(file) IOUtils.ensurePath(file)
dataFrame.to_json( dataFrame.to_json(
file, file,
orient = "split", orient="split",
index = False) index=False)
@staticmethod @staticmethod
def saveDictAsJson(dict, file): def saveDictAsJson(dict, file):
@@ -38,3 +41,11 @@ class IOUtils:
directory = os.path.dirname(file) directory = os.path.dirname(file)
if not os.path.exists(directory): if not os.path.exists(directory):
os.makedirs(directory) os.makedirs(directory)
@staticmethod
def silentlyRemoveFile(file):
Path(file).unlink(missing_ok=True)
@staticmethod
def silentlyRemoveFolder(folder):
    """Recursively delete folder via shutil.rmtree, with errors ignored.

    Because ignore_errors=True is passed, failed removals do not raise.
    """
    shutil.rmtree(folder, ignore_errors=True)

View File

@@ -3,7 +3,7 @@ import VaersReader
import pandas as pd import pandas as pd
from VaersDescrReader import VaersDescrReader from VaersDescrReader import VaersDescrReader
from CountryColumnAdder import CountryColumnAdder from CountryColumnAdder import CountryColumnAdder
from GoogleDriveDownloader import GoogleDriveDownloader
def getInternationalVaersCovid19(dataDir, years): def getInternationalVaersCovid19(dataDir, years):
internationalVaers = pd.concat( internationalVaers = pd.concat(
@@ -16,6 +16,9 @@ def getInternationalVaersCovid19(dataDir, years):
def getInternationalVaersCovid19BeforeDeletion(): def getInternationalVaersCovid19BeforeDeletion():
GoogleDriveDownloader.downloadSevenZipFileAndExtract(
remoteSevenZipSrcFile = "https://drive.google.com/file/d/1Rb-lfxNxw_WwvRDVLEhvqOyv_a2f8ern/view?usp=drive_link",
localSevenZipDstFile = 'VAERS/VAERSBeforeDeletion.7z')
return getInternationalVaersCovid19(dataDir = 'VAERS/VAERSBeforeDeletion', years = [2020, 2021, 2022]) return getInternationalVaersCovid19(dataDir = 'VAERS/VAERSBeforeDeletion', years = [2020, 2021, 2022])
def get_international_VAERSVAX_VAERSSYMPTOMS_Covid19(years): def get_international_VAERSVAX_VAERSSYMPTOMS_Covid19(years):

View File

@@ -16,34 +16,34 @@ from captcha.CaptchaShape import CaptchaShape
def updateVAERSFiles(years, workingDirectory): def updateVAERSFiles(years, workingDirectory):
for year in years: for year in years:
downloadVAERSFileAndUnzip(f'{year}VAERSData.zip', workingDirectory) _downloadVAERSFileAndUnzip(f'{year}VAERSData.zip', workingDirectory)
downloadVAERSFileAndUnzip('NonDomesticVAERSData.zip', workingDirectory) _downloadVAERSFileAndUnzip('NonDomesticVAERSData.zip', workingDirectory)
def downloadVAERSFileAndUnzip(file, workingDirectory): def _downloadVAERSFileAndUnzip(file, workingDirectory):
downloadedFile = downloadVAERSFile(file, workingDirectory + "/VAERS/tmp") downloadedFile = _downloadVAERSFile(file, workingDirectory + "/VAERS/tmp")
unzipAndRemove( unzipAndRemove(
zipFile = downloadedFile, zipFile = downloadedFile,
dstDir = workingDirectory + '/VAERS/') dstDir = workingDirectory + '/VAERS/')
def downloadVAERSFile(file, downloadDir): def _downloadVAERSFile(file, downloadDir):
driver = getWebDriver(downloadDir, isHeadless = False) driver = getWebDriver(downloadDir, isHeadless = True)
downloadedFile = downloadFile( downloadedFile = _downloadFile(
absoluteFile = downloadDir + "/" + file, absoluteFile = downloadDir + "/" + file,
driver = driver, driver = driver,
maxTries = None) maxTries = None)
driver.quit() driver.quit()
return downloadedFile return downloadedFile
def downloadFile(absoluteFile, driver, maxTries): def _downloadFile(absoluteFile, driver, maxTries):
captchaReader = _createCaptchaReader() captchaReader = _createCaptchaReader()
def _downloadFile(): def downloadFile():
driver.get('https://vaers.hhs.gov/eSubDownload/index.jsp?fn=' + os.path.basename(absoluteFile)) driver.get('https://vaers.hhs.gov/eSubDownload/index.jsp?fn=' + os.path.basename(absoluteFile))
solveCaptchaAndStartFileDownload(driver, captchaReader, 'captchaImage.jpeg') _solveCaptchaAndStartFileDownload(driver, captchaReader, 'captchaImage.jpeg')
numTries = 1 numTries = 1
_downloadFile() downloadFile()
while(not isCaptchaSolved(driver) and (maxTries is None or numTries < maxTries)): while(not isCaptchaSolved(driver) and (maxTries is None or numTries < maxTries)):
_downloadFile() downloadFile()
numTries = numTries + 1 numTries = numTries + 1
if isCaptchaSolved(driver): if isCaptchaSolved(driver):
@@ -57,7 +57,7 @@ def _createCaptchaReader():
return CaptchaReader(modelFilepath = f'{working_directory}/captcha/MobileNetV3Small', return CaptchaReader(modelFilepath = f'{working_directory}/captcha/MobileNetV3Small',
captchaShape = CaptchaShape()) captchaShape = CaptchaShape())
def solveCaptchaAndStartFileDownload(driver, captchaReader, captchaImageFile): def _solveCaptchaAndStartFileDownload(driver, captchaReader, captchaImageFile):
saveCaptchaImageAs(driver, captchaImageFile) saveCaptchaImageAs(driver, captchaImageFile)
textInCaptchaImage = captchaReader.getTextInCaptchaImage(captchaImageFile) textInCaptchaImage = captchaReader.getTextInCaptchaImage(captchaImageFile)
print('textInCaptchaImage:', textInCaptchaImage) print('textInCaptchaImage:', textInCaptchaImage)

View File

@@ -6,7 +6,8 @@ from selenium.webdriver.common.by import By
def _getOptions(downloadDir, isHeadless): def _getOptions(downloadDir, isHeadless):
options = Options() options = Options()
options.headless = isHeadless if isHeadless:
options.add_argument('--headless=new')
options.add_experimental_option("prefs", {"download.default_directory" : downloadDir}) options.add_experimental_option("prefs", {"download.default_directory" : downloadDir})
return options return options