Merge branch 'handlingLargeFiles' into pages

This commit is contained in:
Frank Knoll
2024-04-25 12:09:42 +02:00
14 changed files with 28 additions and 86 deletions

View File

@@ -10,7 +10,6 @@ dependencies:
- urllib3
- requests
- gdown
- py7zr
- bs4
- lxml
- jupyter

View File

@@ -3,14 +3,14 @@ from InternationalVaersCovid19Provider import getInternationalVaersCovid19Before
from CountryCountsByBatchcodeTablesMerger import CountryCountsByBatchcodeTablesMerger
def getCountryCountsByBatchcodeTable():
def getCountryCountsByBatchcodeTable(vaersBeforeDeletionDataDir):
return _combineCountryCountsByBatchcodeTables(
countryCountsByClickedBatchcode = CountryCountsByBatchcodeTablesMerger.getCountryCountsByClickedBatchcodeTable(),
countryCountsByBatchcodeBeforeDeletion = _getCountryCountsByBatchcodeBeforeDeletion())
countryCountsByBatchcodeBeforeDeletion = _getCountryCountsByBatchcodeBeforeDeletion(vaersBeforeDeletionDataDir))
def _getCountryCountsByBatchcodeBeforeDeletion():
return (getInternationalVaersCovid19BeforeDeletion()
def _getCountryCountsByBatchcodeBeforeDeletion(vaersBeforeDeletionDataDir):
return (getInternationalVaersCovid19BeforeDeletion(vaersBeforeDeletionDataDir)
.groupby('VAX_LOT')
['COUNTRY'].value_counts()
.to_frame(name = 'COUNTRY_COUNT_BY_VAX_LOT'))

View File

@@ -1,17 +0,0 @@
import gdown
import py7zr
import os
class GoogleDriveDownloader:
    """Helpers for fetching files via gdown and unpacking 7z archives via py7zr."""

    @staticmethod
    def downloadIfNotYetDownloaded(remoteSrcFile, localDstFile):
        """Download remoteSrcFile to localDstFile unless localDstFile already exists.

        An existing local file is never overwritten or re-validated; only its
        presence on disk is checked.

        Args:
            remoteSrcFile: URL handed to ``gdown.download`` (called with
                ``fuzzy = True``).
            localDstFile: Path of the local file to create.
        """
        if os.path.exists(localDstFile):
            return

        # The download cannot be stored in a folder that does not exist yet,
        # so create the destination folder first (no-op if already present).
        dstFolder = os.path.dirname(localDstFile)
        if dstFolder:
            os.makedirs(dstFolder, exist_ok=True)

        gdown.download(url=remoteSrcFile, output=localDstFile, fuzzy=True)

    @staticmethod
    def downloadSevenZipFileAndExtract(remoteSevenZipSrcFile, localSevenZipDstFile):
        """Download a 7z archive (if not yet present) and extract it next to itself.

        The archive is extracted on every call, even when the download was
        skipped because the archive already existed locally.

        Args:
            remoteSevenZipSrcFile: URL of the 7z archive.
            localSevenZipDstFile: Local path of the archive; its parent folder
                is used as the extraction target.
        """
        GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSevenZipSrcFile, localSevenZipDstFile)

        # A bare file name has an empty dirname; extract into the current
        # working directory explicitly in that case.
        extractionFolder = os.path.dirname(localSevenZipDstFile) or '.'
        with py7zr.SevenZipFile(localSevenZipDstFile, mode='r') as sevenZipFile:
            sevenZipFile.extractall(path=extractionFolder)

View File

@@ -1,52 +0,0 @@
import unittest
from pathlib import Path
import os
from IOUtils import IOUtils
from GoogleDriveDownloader import GoogleDriveDownloader
class GoogleDriveDownloaderTest(unittest.TestCase):
    """Tests for GoogleDriveDownloader.

    The tests pass Google Drive share links to the downloader and work on
    files below 'src/tmp'.
    """

    def test_downloadIfNotYetDownloaded_notYetDownloaded(self):
        """A file that is missing locally gets downloaded."""
        # Given
        remoteSrcFile = "https://drive.google.com/file/d/1LstnMvxW4LVxgNvfk5h4AnbvPktMeNSd/view?usp=drive_link"
        localDstFile = 'src/tmp/test.txt'
        IOUtils.silentlyRemoveFile(localDstFile)

        # When
        GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSrcFile, localDstFile)

        # Then
        self.assertEqual(Path(localDstFile).read_text(), 'test')

    def test_downloadIfNotYetDownloaded_alreadyDownloaded(self):
        """A file that already exists locally keeps its content."""
        # Given
        remoteSrcFile = "https://drive.google.com/file/d/1LstnMvxW4LVxgNvfk5h4AnbvPktMeNSd/view?usp=drive_link"
        localDstFile = 'src/tmp/test.txt'
        content = 'local file content'
        self._createFileWithContent(localDstFile, content)

        # When
        GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSrcFile, localDstFile)

        # Then
        self.assertEqual(Path(localDstFile).read_text(), content)

    def test_downloadSevenZipFileAndExtract(self):
        """A 7z archive is downloaded and extracted into the archive's folder."""
        # Given
        remoteSevenZipSrcFile = "https://drive.google.com/file/d/14hFKlt48dzDnEjHS_7vYVca5elfzX0l1/view?usp=drive_link"
        localSevenZipDstFile = 'src/tmp/test.7z'
        localDstFolder = os.path.dirname(localSevenZipDstFile)
        IOUtils.silentlyRemoveFile(localSevenZipDstFile)
        IOUtils.silentlyRemoveFolder(localDstFolder + '/test')

        # When
        GoogleDriveDownloader.downloadSevenZipFileAndExtract(remoteSevenZipSrcFile, localSevenZipDstFile)

        # Then
        self.assertEqual(Path(localDstFolder + '/test/test.txt').read_text(), 'test')

    def _createFileWithContent(self, file, content):
        """Create (or overwrite) the file at path `file` with the text `content`."""
        # Use a separate name for the handle so the path parameter is not rebound.
        with open(file, 'w') as fileHandle:
            fileHandle.write(content)

View File

@@ -30,7 +30,23 @@
"import pandas as pd\n",
"\n",
"pd.set_option('display.max_rows', 100)\n",
"pd.set_option('display.max_columns', None)\n"
"pd.set_option('display.max_columns', None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b89b2d6",
"metadata": {},
"outputs": [],
"source": [
"# split --bytes=45MiB VAERSBeforeDeletion.7z VAERSBeforeDeletion_\n",
"def restoreVAERSBeforeDeletionFolder():\n",
" !cat data/VAERSBeforeDeletion/VAERSBeforeDeletion_* > VAERS/VAERSBeforeDeletion.7z\n",
" !cd VAERS; 7z x -y VAERSBeforeDeletion.7z\n",
"\n",
"vaersBeforeDeletionDataDir = 'VAERS/VAERSBeforeDeletion'\n",
"restoreVAERSBeforeDeletionFolder()"
]
},
{
@@ -127,7 +143,7 @@
"outputs": [],
"source": [
"internationalVaersCovid19 = CountryColumnsMerger.mergeCountryColumnOfSrcIntoDst(\n",
" src = getInternationalVaersCovid19BeforeDeletion(),\n",
" src = getInternationalVaersCovid19BeforeDeletion(vaersBeforeDeletionDataDir),\n",
" dst = internationalVaersCovid19)\n",
"internationalVaersCovid19"
]
@@ -203,7 +219,7 @@
"metadata": {},
"outputs": [],
"source": [
"countryCountsByBatchcode = filterByBatchcodes(getCountryCountsByBatchcodeTable(), batchCodeTable['Batch'].values)\n",
"countryCountsByBatchcode = filterByBatchcodes(getCountryCountsByBatchcodeTable(vaersBeforeDeletionDataDir), batchCodeTable['Batch'].values)\n",
"countryCountsByBatchcode"
]
},
@@ -670,9 +686,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "howbadismybatch-venv-kernel",
"display_name": "howbadismybatch-venv",
"language": "python",
"name": "howbadismybatch-venv-kernel"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -684,7 +700,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
"version": "3.9.19"
}
},
"nbformat": 4,

View File

@@ -3,7 +3,6 @@ import VaersReader
import pandas as pd
from VaersDescrReader import VaersDescrReader
from CountryColumnAdder import CountryColumnAdder
from GoogleDriveDownloader import GoogleDriveDownloader
def getInternationalVaersCovid19(dataDir, years):
internationalVaers = pd.concat(
@@ -15,11 +14,8 @@ def getInternationalVaersCovid19(dataDir, years):
return internationalVaersCovid19
def getInternationalVaersCovid19BeforeDeletion():
GoogleDriveDownloader.downloadSevenZipFileAndExtract(
remoteSevenZipSrcFile = "https://drive.google.com/file/d/1Rb-lfxNxw_WwvRDVLEhvqOyv_a2f8ern/view?usp=drive_link",
localSevenZipDstFile = 'VAERS/VAERSBeforeDeletion.7z')
return getInternationalVaersCovid19(dataDir = 'VAERS/VAERSBeforeDeletion', years = [2020, 2021, 2022])
def getInternationalVaersCovid19BeforeDeletion(dataDir):
return getInternationalVaersCovid19(dataDir = dataDir, years = [2020, 2021, 2022])
def get_international_VAERSVAX_VAERSSYMPTOMS_Covid19(years):
VAERSDATA, VAERSVAX, VAERSSYMPTOMS = _get_VAERSDATA_VAERSVAX_VAERSSYMPTOMS(years)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.