Merge branch 'handlingLargeFiles' into pages
This commit is contained in:
@@ -10,7 +10,6 @@ dependencies:
|
|||||||
- urllib3
|
- urllib3
|
||||||
- requests
|
- requests
|
||||||
- gdown
|
- gdown
|
||||||
- py7zr
|
|
||||||
- bs4
|
- bs4
|
||||||
- lxml
|
- lxml
|
||||||
- jupyter
|
- jupyter
|
||||||
|
|||||||
@@ -3,14 +3,14 @@ from InternationalVaersCovid19Provider import getInternationalVaersCovid19Before
|
|||||||
from CountryCountsByBatchcodeTablesMerger import CountryCountsByBatchcodeTablesMerger
|
from CountryCountsByBatchcodeTablesMerger import CountryCountsByBatchcodeTablesMerger
|
||||||
|
|
||||||
|
|
||||||
def getCountryCountsByBatchcodeTable():
|
def getCountryCountsByBatchcodeTable(vaersBeforeDeletionDataDir):
|
||||||
return _combineCountryCountsByBatchcodeTables(
|
return _combineCountryCountsByBatchcodeTables(
|
||||||
countryCountsByClickedBatchcode = CountryCountsByBatchcodeTablesMerger.getCountryCountsByClickedBatchcodeTable(),
|
countryCountsByClickedBatchcode = CountryCountsByBatchcodeTablesMerger.getCountryCountsByClickedBatchcodeTable(),
|
||||||
countryCountsByBatchcodeBeforeDeletion = _getCountryCountsByBatchcodeBeforeDeletion())
|
countryCountsByBatchcodeBeforeDeletion = _getCountryCountsByBatchcodeBeforeDeletion(vaersBeforeDeletionDataDir))
|
||||||
|
|
||||||
|
|
||||||
def _getCountryCountsByBatchcodeBeforeDeletion():
|
def _getCountryCountsByBatchcodeBeforeDeletion(vaersBeforeDeletionDataDir):
|
||||||
return (getInternationalVaersCovid19BeforeDeletion()
|
return (getInternationalVaersCovid19BeforeDeletion(vaersBeforeDeletionDataDir)
|
||||||
.groupby('VAX_LOT')
|
.groupby('VAX_LOT')
|
||||||
['COUNTRY'].value_counts()
|
['COUNTRY'].value_counts()
|
||||||
.to_frame(name = 'COUNTRY_COUNT_BY_VAX_LOT'))
|
.to_frame(name = 'COUNTRY_COUNT_BY_VAX_LOT'))
|
||||||
|
|||||||
@@ -1,17 +0,0 @@
|
|||||||
import gdown
|
|
||||||
import py7zr
|
|
||||||
import os
|
|
||||||
|
|
||||||
|
|
||||||
class GoogleDriveDownloader:
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def downloadIfNotYetDownloaded(remoteSrcFile, localDstFile):
|
|
||||||
if not os.path.exists(localDstFile):
|
|
||||||
gdown.download(url = remoteSrcFile, output = localDstFile, fuzzy = True)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def downloadSevenZipFileAndExtract(remoteSevenZipSrcFile, localSevenZipDstFile):
|
|
||||||
GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSevenZipSrcFile, localSevenZipDstFile);
|
|
||||||
with py7zr.SevenZipFile(localSevenZipDstFile, mode='r') as sevenZipFile:
|
|
||||||
sevenZipFile.extractall(path = os.path.dirname(localSevenZipDstFile))
|
|
||||||
@@ -1,52 +0,0 @@
|
|||||||
import unittest
|
|
||||||
from pathlib import Path
|
|
||||||
import os
|
|
||||||
from IOUtils import IOUtils
|
|
||||||
from GoogleDriveDownloader import GoogleDriveDownloader
|
|
||||||
|
|
||||||
|
|
||||||
class GoogleDriveDownloaderTest(unittest.TestCase):
|
|
||||||
|
|
||||||
def test_downloadIfNotYetDownloaded_notYetDownloaded(self):
|
|
||||||
# Given
|
|
||||||
remoteSrcFile = "https://drive.google.com/file/d/1LstnMvxW4LVxgNvfk5h4AnbvPktMeNSd/view?usp=drive_link"
|
|
||||||
localDstFile = 'src/tmp/test.txt'
|
|
||||||
IOUtils.silentlyRemoveFile(localDstFile)
|
|
||||||
|
|
||||||
# When
|
|
||||||
GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSrcFile, localDstFile)
|
|
||||||
|
|
||||||
# Then
|
|
||||||
self.assertEqual(Path(localDstFile).read_text(), 'test')
|
|
||||||
|
|
||||||
def test_downloadIfNotYetDownloaded_alreadyDownloaded(self):
|
|
||||||
# Given
|
|
||||||
remoteSrcFile = "https://drive.google.com/file/d/1LstnMvxW4LVxgNvfk5h4AnbvPktMeNSd/view?usp=drive_link"
|
|
||||||
localDstFile = 'src/tmp/test.txt'
|
|
||||||
content = 'local file content'
|
|
||||||
self._createFileWithContent(localDstFile, content);
|
|
||||||
|
|
||||||
# When
|
|
||||||
GoogleDriveDownloader.downloadIfNotYetDownloaded(remoteSrcFile, localDstFile)
|
|
||||||
|
|
||||||
# Then
|
|
||||||
self.assertEqual(Path(localDstFile).read_text(), content)
|
|
||||||
|
|
||||||
def test_downloadSevenZipFileAndExtract(self):
|
|
||||||
# Given
|
|
||||||
remoteSevenZipSrcFile = "https://drive.google.com/file/d/14hFKlt48dzDnEjHS_7vYVca5elfzX0l1/view?usp=drive_link"
|
|
||||||
localSevenZipDstFile = 'src/tmp/test.7z'
|
|
||||||
localDstFolder = os.path.dirname(localSevenZipDstFile)
|
|
||||||
IOUtils.silentlyRemoveFile(localSevenZipDstFile)
|
|
||||||
IOUtils.silentlyRemoveFolder(localDstFolder + '/test')
|
|
||||||
|
|
||||||
# When
|
|
||||||
GoogleDriveDownloader.downloadSevenZipFileAndExtract(remoteSevenZipSrcFile, localSevenZipDstFile)
|
|
||||||
|
|
||||||
# Then
|
|
||||||
self.assertEqual(Path(localDstFolder + '/test/test.txt').read_text(), 'test')
|
|
||||||
|
|
||||||
def _createFileWithContent(self, file, content):
|
|
||||||
with open(file, 'w') as file:
|
|
||||||
file.write(content)
|
|
||||||
|
|
||||||
@@ -30,7 +30,23 @@
|
|||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"\n",
|
"\n",
|
||||||
"pd.set_option('display.max_rows', 100)\n",
|
"pd.set_option('display.max_rows', 100)\n",
|
||||||
"pd.set_option('display.max_columns', None)\n"
|
"pd.set_option('display.max_columns', None)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "3b89b2d6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# split --bytes=45MiB VAERSBeforeDeletion.7z VAERSBeforeDeletion_\n",
|
||||||
|
"def restoreVAERSBeforeDeletionFolder():\n",
|
||||||
|
" !cat data/VAERSBeforeDeletion/VAERSBeforeDeletion_* > VAERS/VAERSBeforeDeletion.7z\n",
|
||||||
|
" !cd VAERS; 7z x -y VAERSBeforeDeletion.7z\n",
|
||||||
|
"\n",
|
||||||
|
"vaersBeforeDeletionDataDir = 'VAERS/VAERSBeforeDeletion'\n",
|
||||||
|
"restoreVAERSBeforeDeletionFolder()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -127,7 +143,7 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"internationalVaersCovid19 = CountryColumnsMerger.mergeCountryColumnOfSrcIntoDst(\n",
|
"internationalVaersCovid19 = CountryColumnsMerger.mergeCountryColumnOfSrcIntoDst(\n",
|
||||||
" src = getInternationalVaersCovid19BeforeDeletion(),\n",
|
" src = getInternationalVaersCovid19BeforeDeletion(vaersBeforeDeletionDataDir),\n",
|
||||||
" dst = internationalVaersCovid19)\n",
|
" dst = internationalVaersCovid19)\n",
|
||||||
"internationalVaersCovid19"
|
"internationalVaersCovid19"
|
||||||
]
|
]
|
||||||
@@ -203,7 +219,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"countryCountsByBatchcode = filterByBatchcodes(getCountryCountsByBatchcodeTable(), batchCodeTable['Batch'].values)\n",
|
"countryCountsByBatchcode = filterByBatchcodes(getCountryCountsByBatchcodeTable(vaersBeforeDeletionDataDir), batchCodeTable['Batch'].values)\n",
|
||||||
"countryCountsByBatchcode"
|
"countryCountsByBatchcode"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -670,9 +686,9 @@
|
|||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
"kernelspec": {
|
"kernelspec": {
|
||||||
"display_name": "howbadismybatch-venv-kernel",
|
"display_name": "howbadismybatch-venv",
|
||||||
"language": "python",
|
"language": "python",
|
||||||
"name": "howbadismybatch-venv-kernel"
|
"name": "python3"
|
||||||
},
|
},
|
||||||
"language_info": {
|
"language_info": {
|
||||||
"codemirror_mode": {
|
"codemirror_mode": {
|
||||||
@@ -684,7 +700,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.9.15"
|
"version": "3.9.19"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ import VaersReader
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from VaersDescrReader import VaersDescrReader
|
from VaersDescrReader import VaersDescrReader
|
||||||
from CountryColumnAdder import CountryColumnAdder
|
from CountryColumnAdder import CountryColumnAdder
|
||||||
from GoogleDriveDownloader import GoogleDriveDownloader
|
|
||||||
|
|
||||||
def getInternationalVaersCovid19(dataDir, years):
|
def getInternationalVaersCovid19(dataDir, years):
|
||||||
internationalVaers = pd.concat(
|
internationalVaers = pd.concat(
|
||||||
@@ -15,11 +14,8 @@ def getInternationalVaersCovid19(dataDir, years):
|
|||||||
return internationalVaersCovid19
|
return internationalVaersCovid19
|
||||||
|
|
||||||
|
|
||||||
def getInternationalVaersCovid19BeforeDeletion():
|
def getInternationalVaersCovid19BeforeDeletion(dataDir):
|
||||||
GoogleDriveDownloader.downloadSevenZipFileAndExtract(
|
return getInternationalVaersCovid19(dataDir = dataDir, years = [2020, 2021, 2022])
|
||||||
remoteSevenZipSrcFile = "https://drive.google.com/file/d/1Rb-lfxNxw_WwvRDVLEhvqOyv_a2f8ern/view?usp=drive_link",
|
|
||||||
localSevenZipDstFile = 'VAERS/VAERSBeforeDeletion.7z')
|
|
||||||
return getInternationalVaersCovid19(dataDir = 'VAERS/VAERSBeforeDeletion', years = [2020, 2021, 2022])
|
|
||||||
|
|
||||||
def get_international_VAERSVAX_VAERSSYMPTOMS_Covid19(years):
|
def get_international_VAERSVAX_VAERSSYMPTOMS_Covid19(years):
|
||||||
VAERSDATA, VAERSVAX, VAERSSYMPTOMS = _get_VAERSDATA_VAERSVAX_VAERSSYMPTOMS(years)
|
VAERSDATA, VAERSVAX, VAERSSYMPTOMS = _get_VAERSDATA_VAERSVAX_VAERSSYMPTOMS(years)
|
||||||
|
|||||||
BIN
src/data/VAERSBeforeDeletion/VAERSBeforeDeletion_aa
Normal file
BIN
src/data/VAERSBeforeDeletion/VAERSBeforeDeletion_aa
Normal file
Binary file not shown.
BIN
src/data/VAERSBeforeDeletion/VAERSBeforeDeletion_ab
Normal file
BIN
src/data/VAERSBeforeDeletion/VAERSBeforeDeletion_ab
Normal file
Binary file not shown.
BIN
src/data/VAERSBeforeDeletion/VAERSBeforeDeletion_ac
Normal file
BIN
src/data/VAERSBeforeDeletion/VAERSBeforeDeletion_ac
Normal file
Binary file not shown.
BIN
src/data/VAERSBeforeDeletion/VAERSBeforeDeletion_ad
Normal file
BIN
src/data/VAERSBeforeDeletion/VAERSBeforeDeletion_ad
Normal file
Binary file not shown.
BIN
src/data/VAERSBeforeDeletion/VAERSBeforeDeletion_ae
Normal file
BIN
src/data/VAERSBeforeDeletion/VAERSBeforeDeletion_ae
Normal file
Binary file not shown.
BIN
src/data/VAERSBeforeDeletion/VAERSBeforeDeletion_af
Normal file
BIN
src/data/VAERSBeforeDeletion/VAERSBeforeDeletion_af
Normal file
Binary file not shown.
BIN
src/data/VAERSBeforeDeletion/VAERSBeforeDeletion_ag
Normal file
BIN
src/data/VAERSBeforeDeletion/VAERSBeforeDeletion_ag
Normal file
Binary file not shown.
BIN
src/data/VAERSBeforeDeletion/VAERSBeforeDeletion_ah
Normal file
BIN
src/data/VAERSBeforeDeletion/VAERSBeforeDeletion_ah
Normal file
Binary file not shown.
Reference in New Issue
Block a user