diff --git a/.gitignore b/.gitignore index 22339ed665b..20be6070905 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,4 @@ docs/data/*.html src/captchaImage.jpeg src/HowBadIsMyBatch.nbconvert.ipynb src/HowBadIsMyBatch.nbconvert.html +src/__pycache__/ diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000000..1c8bf5dbcaf --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,11 @@ +{ + "python.testing.unittestArgs": [ + "-v", + "-s", + "./src", + "-p", + "*Test.py" + ], + "python.testing.pytestEnabled": false, + "python.testing.unittestEnabled": true +} \ No newline at end of file diff --git a/src/BatchCodeTableFactory.py b/src/BatchCodeTableFactory.py new file mode 100644 index 00000000000..1e3f2c2966b --- /dev/null +++ b/src/BatchCodeTableFactory.py @@ -0,0 +1,45 @@ +import pandas as pd +from CompanyColumnAdder import CompanyColumnAdder +from SummationTableFactory import SummationTableFactory + +class BatchCodeTableFactory: + + def __init__(self, dataFrame: pd.DataFrame): + self.dataFrame = dataFrame + self.companyColumnAdder = CompanyColumnAdder(dataFrame) + self.countryBatchCodeTable = SummationTableFactory.createSummationTable( + dataFrame.groupby( + [ + dataFrame['COUNTRY'], + dataFrame['VAX_LOT'] + ])) + + def createGlobalBatchCodeTable(self): + return self._postProcess(SummationTableFactory.createSummationTable(self.dataFrame.groupby('VAX_LOT'))) + + def createBatchCodeTableByCountry(self, country): + return self._postProcess(self._getBatchCodeTableByCountry(country)) + + def _postProcess(self, batchCodeTable): + batchCodeTable = self.companyColumnAdder.addCompanyColumn(batchCodeTable) + batchCodeTable = batchCodeTable[ + [ + 'Adverse Reaction Reports', + 'Deaths', + 'Disabilities', + 'Life Threatening Illnesses', + 'Company', + 'Countries', + 'Severe reports', + 'Lethality' + ]] + return batchCodeTable.sort_values(by = 'Severe reports', ascending = False) + + def _getBatchCodeTableByCountry(self, country): + if 
country in self.countryBatchCodeTable.index: + return self.countryBatchCodeTable.loc[country] + else: + return self._getEmptyBatchCodeTable() + + def _getEmptyBatchCodeTable(self): + return self.countryBatchCodeTable[0:0].droplevel(0) diff --git a/src/BatchCodeTableFactoryTest.py b/src/BatchCodeTableFactoryTest.py new file mode 100644 index 00000000000..f0630eab893 --- /dev/null +++ b/src/BatchCodeTableFactoryTest.py @@ -0,0 +1,106 @@ +import unittest +import pandas as pd +from pandas.testing import assert_frame_equal +from TestHelper import TestHelper +from SevereColumnAdder import SevereColumnAdder +from BatchCodeTableFactory import BatchCodeTableFactory + +class BatchCodeTableFactoryTest(unittest.TestCase): + + def test_createBatchCodeTableByCountry(self): + # Given + dataFrame = TestHelper.createDataFrame( + columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES', 'SPLTTYPE', 'HOSPITAL', 'ER_VISIT', 'COUNTRY'], + data = [ [1, 0, 0, 'COVID19', 'PFIZER\BIONTECH', '016M20A', '2', 'GBPFIZER INC2020486806', 0, 0, 'United Kingdom'], + [0, 0, 0, 'COVID19', 'MODERNA', '030L20A', '1', 'FRMODERNATX, INC.MOD20224', 0, 0, 'France'], + [1, 1, 1, 'COVID19', 'MODERNA', '030L20B', '1', 'FRMODERNATX, INC.MOD20224', 0, 0, 'France'], + [0, 1, 1, 'COVID19', 'MODERNA', '030L20B', '1', 'FRMODERNATX, INC.MOD20224', 0, 0, 'France']], + index = [ + "1048786", + "1048786", + "4711", + "0815"]) + dataFrame = SevereColumnAdder.addSevereColumn(dataFrame) + batchCodeTableFactory = BatchCodeTableFactory(dataFrame) + + # When + batchCodeTable = batchCodeTableFactory.createBatchCodeTableByCountry('France') + + # Then + assert_frame_equal( + batchCodeTable, + TestHelper.createDataFrame( + columns = ['Adverse Reaction Reports', 'Deaths', 'Disabilities', 'Life Threatening Illnesses', 'Company', 'Countries', 'Severe reports', 'Lethality'], + data = [ [2, 1, 2, 2, 'MODERNA', 'France', 2/2 * 100, 1/2 * 100], + [1, 0, 0, 0, 'MODERNA', 'France', 0/1 * 100, 0/1 * 
100]], + index = pd.Index( + [ + '030L20B', + '030L20A' + ], + name = 'VAX_LOT')), + check_dtype = False) + + def test_createGlobalBatchCodeTable(self): + # Given + dataFrame = TestHelper.createDataFrame( + columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES', 'SPLTTYPE', 'HOSPITAL', 'ER_VISIT', 'COUNTRY'], + data = [ [1, 0, 0, 'COVID19', 'PFIZER\BIONTECH', '016M20A', '2', 'GBPFIZER INC2020486806', 0, 0, 'United Kingdom'], + [0, 0, 0, 'COVID19', 'MODERNA', '030L20A', '1', 'FRMODERNATX, INC.MOD20224', 0, 0, 'France'], + [1, 1, 1, 'COVID19', 'MODERNA', '030L20B', '1', 'FRMODERNATX, INC.MOD20224', 0, 0, 'France'], + [0, 1, 1, 'COVID19', 'MODERNA', '030L20B', '1', 'FRMODERNATX, INC.MOD20224', 0, 0, 'United Kingdom']], + index = [ + "1048786", + "1048786", + "4711", + "0815"]) + dataFrame = SevereColumnAdder.addSevereColumn(dataFrame) + batchCodeTableFactory = BatchCodeTableFactory(dataFrame) + + # When + batchCodeTable = batchCodeTableFactory.createGlobalBatchCodeTable() + + # Then + assert_frame_equal( + batchCodeTable, + TestHelper.createDataFrame( + columns = ['Adverse Reaction Reports', 'Deaths', 'Disabilities', 'Life Threatening Illnesses', 'Company', 'Countries', 'Severe reports', 'Lethality'], + data = [ [1, 1, 0, 0, 'PFIZER\BIONTECH', 'United Kingdom', 1/1 * 100, 1/1 * 100], + [2, 1, 2, 2, 'MODERNA', 'France, United Kingdom', 2/2 * 100, 1/2 * 100], + [1, 0, 0, 0, 'MODERNA', 'France', 0/1 * 100, 0/1 * 100]], + index = pd.Index( + [ + '016M20A', + '030L20B', + '030L20A' + ], + name = 'VAX_LOT')), + check_dtype = False) + + def test_createBatchCodeTableByNonExistingCountry(self): + # Given + dataFrame = TestHelper.createDataFrame( + columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES', 'SPLTTYPE', 'HOSPITAL', 'ER_VISIT', 'COUNTRY'], + data = [ [1, 0, 0, 'COVID19', 'PFIZER\BIONTECH', '016M20A', '2', 'GBPFIZER INC2020486806', 0, 0, 'United Kingdom'], + [0, 0, 0, 'COVID19', 'MODERNA', 
'030L20A', '1', 'FRMODERNATX, INC.MOD20224', 0, 0, 'France'], + [1, 1, 1, 'COVID19', 'MODERNA', '030L20B', '1', 'FRMODERNATX, INC.MOD20224', 0, 0, 'France'], + [0, 1, 1, 'COVID19', 'MODERNA', '030L20B', '1', 'FRMODERNATX, INC.MOD20224', 0, 0, 'France']], + index = [ + "1048786", + "1048786", + "4711", + "0815"]) + dataFrame = SevereColumnAdder.addSevereColumn(dataFrame) + batchCodeTableFactory = BatchCodeTableFactory(dataFrame) + + # When + batchCodeTable = batchCodeTableFactory.createBatchCodeTableByCountry('non existing country') + + # Then + assert_frame_equal( + batchCodeTable, + TestHelper.createDataFrame( + columns = ['Adverse Reaction Reports', 'Deaths', 'Disabilities', 'Life Threatening Illnesses', 'Company', 'Countries', 'Severe reports', 'Lethality'], + data = [ ], + index = pd.Index([], name = 'VAX_LOT')), + check_dtype = False) diff --git a/src/CompanyColumnAdder.py b/src/CompanyColumnAdder.py new file mode 100644 index 00000000000..5b41a13506a --- /dev/null +++ b/src/CompanyColumnAdder.py @@ -0,0 +1,21 @@ +import pandas as pd + +class CompanyColumnAdder: + + def __init__(self, dataFrame_VAX_LOT_VAX_MANU): + self.dataFrame_VAX_LOT_VAX_MANU = dataFrame_VAX_LOT_VAX_MANU + + def addCompanyColumn(self, batchCodeTable): + return pd.merge( + batchCodeTable, + self._createCompanyByBatchCodeTable(), + how = 'left', + left_index = True, + right_index = True, + validate = 'one_to_one') + + def _createCompanyByBatchCodeTable(self): + manufacturerByBatchCodeTable = self.dataFrame_VAX_LOT_VAX_MANU[['VAX_LOT', 'VAX_MANU']] + manufacturerByBatchCodeTable = manufacturerByBatchCodeTable.drop_duplicates(subset = ['VAX_LOT']) + manufacturerByBatchCodeTable = manufacturerByBatchCodeTable.set_index('VAX_LOT') + return manufacturerByBatchCodeTable.rename(columns = {"VAX_MANU": "Company"}) \ No newline at end of file diff --git a/src/CountryColumnAdder.py b/src/CountryColumnAdder.py new file mode 100644 index 00000000000..f7d14ccd4ab --- /dev/null +++ 
b/src/CountryColumnAdder.py @@ -0,0 +1,25 @@ +import pycountry + +class CountryColumnAdder: + + @staticmethod + def addCountryColumn(dataFrame): + dataFrame['COUNTRY'] = CountryColumnAdder.getCountryColumn(dataFrame) + return dataFrame.astype({'COUNTRY': "string"}) + + @staticmethod + def getCountryColumn(dataFrame): + return dataFrame.apply( + lambda row: + CountryColumnAdder._getCountryNameOfSplttypeOrDefault( + splttype = row['SPLTTYPE'], + default = 'Unknown Country'), + axis = 'columns') + + @staticmethod + def _getCountryNameOfSplttypeOrDefault(splttype, default): + if not isinstance(splttype, str): + return default + + country = pycountry.countries.get(alpha_2 = splttype[:2]) + return country.name if country is not None else default \ No newline at end of file diff --git a/src/CountryOptionsSetter.py b/src/CountryOptionsSetter.py new file mode 100644 index 00000000000..33fae3c2ec7 --- /dev/null +++ b/src/CountryOptionsSetter.py @@ -0,0 +1,21 @@ +from bs4 import BeautifulSoup + + +class CountryOptionsSetter: + + def setCountryOptions(self, html, options): + soup = self._setCountryOptions(self._parse(html), self._parseOptions(options)) + return str(soup) + + def _setCountryOptions(self, soup, options): + countrySelect = soup.find(id = "countrySelect") + countrySelect.clear() + for option in options: + countrySelect.append(option) + return soup + + def _parseOptions(self, options): + return [self._parse(option).option for option in options] + + def _parse(self, html): + return BeautifulSoup(html, 'lxml') diff --git a/src/CountryOptionsSetterTest.py b/src/CountryOptionsSetterTest.py new file mode 100644 index 00000000000..84c15f144a1 --- /dev/null +++ b/src/CountryOptionsSetterTest.py @@ -0,0 +1,73 @@ +import unittest +from CountryOptionsSetter import CountryOptionsSetter + +class CountryOptionsSetterTest(unittest.TestCase): + + def test_setCountryOptions(self): + # Given + countryOptionsSetter = CountryOptionsSetter() + + # When + htmlActual = 
countryOptionsSetter.setCountryOptions( + html=''' + + +

Test

+ + + + ''', + options=[ + '', + '', + '']) + + # Then + assertEqualHTML( + htmlActual, + ''' + + +

Test

+ + + + ''') + +# adapted from https://stackoverflow.com/questions/8006909/pretty-print-assertequal-for-html-strings +def assertEqualHTML(string1, string2, file1='', file2=''): + u''' + Compare two unicode strings containing HTML. + A human friendly diff goes to logging.error() if they + are not equal, and an exception gets raised. + ''' + from bs4 import BeautifulSoup as bs + import difflib + + def short(mystr): + max = 20 + if len(mystr) > max: + return mystr[:max] + return mystr + p = [] + for mystr, file in [(string1, file1), (string2, file2)]: + if not isinstance(mystr, str): + raise Exception(u'string ist not unicode: %r %s' % + (short(mystr), file)) + soup = bs(mystr, 'lxml') + pretty = soup.prettify() + p.append(pretty) + if p[0] != p[1]: + for line in difflib.unified_diff(p[0].splitlines(), p[1].splitlines(), fromfile=file1, tofile=file2): + display(line) + display(p[0], ' != ', p[1]) + raise Exception('Not equal %s %s' % (file1, file2)) diff --git a/src/DataFrameFilter.py b/src/DataFrameFilter.py new file mode 100644 index 00000000000..4f5c03a538f --- /dev/null +++ b/src/DataFrameFilter.py @@ -0,0 +1,9 @@ +import pandas as pd + +class DataFrameFilter: + + def filterByCovid19(self, dataFrame): + return dataFrame[self._isCovid19(dataFrame)] + + def _isCovid19(self, dataFrame): + return dataFrame["VAX_TYPE"] == "COVID19" diff --git a/src/DataFrameFilterTest.py b/src/DataFrameFilterTest.py new file mode 100644 index 00000000000..b0907a266f8 --- /dev/null +++ b/src/DataFrameFilterTest.py @@ -0,0 +1,64 @@ +import unittest +from pandas.testing import assert_frame_equal +from VaersDescr2DataFrameConverter import VaersDescr2DataFrameConverter +from TestHelper import TestHelper +from DataFrameFilter import DataFrameFilter + +class DataFrameFilterTest(unittest.TestCase): + + def test_filterByCovid19(self): + # Given + dataFrame = VaersDescr2DataFrameConverter.createDataFrameFromDescrs( + [ + { + 'VAERSDATA': TestHelper.createDataFrame( + columns = ['DIED', 
'L_THREAT', 'DISABLE'], + data = [ [1, 0, 0], + [0, 0, 1]], + index = [ + "0916600", + "0916601"]), + 'VAERSVAX': TestHelper.createDataFrame( + columns = ['VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'], + data = [ ['COVID19', 'MODERNA', '037K20A', '1'], + ['COVID19', 'MODERNA', '025L20A', '1']], + index = [ + "0916600", + "0916601"], + dtypes = {'VAX_DOSE_SERIES': "string"}) + }, + { + 'VAERSDATA': TestHelper.createDataFrame( + columns = ['DIED', 'L_THREAT', 'DISABLE'], + data = [ [0, 0, 0], + [0, 0, 1]], + index = [ + "1996873", + "1996874"]), + 'VAERSVAX': TestHelper.createDataFrame( + columns = ['VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'], + data = [ ['HPV9', 'MERCK & CO. INC.', 'R017624', 'UNK'], + ['COVID19', 'MODERNA', '025L20A', '1']], + index = [ + "1996873", + "1996874"], + dtypes = {'VAX_DOSE_SERIES': "string"}) + } + ]) + dataFrameFilter = DataFrameFilter() + + # When + dataFrame = dataFrameFilter.filterByCovid19(dataFrame) + + # Then + dataFrameExpected = TestHelper.createDataFrame( + columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'], + data = [ [1, 0, 0, 'COVID19', 'MODERNA', '037K20A', '1'], + [0, 0, 1, 'COVID19', 'MODERNA', '025L20A', '1'], + [0, 0, 1, 'COVID19', 'MODERNA', '025L20A', '1']], + index = [ + "0916600", + "0916601", + "1996874"], + dtypes = {'VAX_DOSE_SERIES': "string"}) + assert_frame_equal(dataFrame, dataFrameExpected, check_dtype = False) diff --git a/src/DataFrameNormalizer.py b/src/DataFrameNormalizer.py new file mode 100644 index 00000000000..699d8ba7c82 --- /dev/null +++ b/src/DataFrameNormalizer.py @@ -0,0 +1,40 @@ +import numpy as np + +class DataFrameNormalizer: + + @staticmethod + def normalize(dataFrame): + DataFrameNormalizer.removeUnknownBatchCodes(dataFrame) + DataFrameNormalizer.convertVAX_LOTColumnToUpperCase(dataFrame) + DataFrameNormalizer._convertColumnsOfDataFrame_Y_to_1_else_0( + dataFrame, + ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT']) + + 
@staticmethod + def convertVAX_LOTColumnToUpperCase(dataFrame): + dataFrame['VAX_LOT'] = dataFrame['VAX_LOT'].str.upper() + + @staticmethod + def removeUnknownBatchCodes(dataFrame): + dataFrame.drop(DataFrameNormalizer._isUnknownBatchCode(dataFrame).index, inplace = True) + + @staticmethod + def _isUnknownBatchCode(dataFrame): + return dataFrame[dataFrame['VAX_LOT'].str.contains(pat = 'UNKNOWN', regex = False, case = False, na = False)] + + @staticmethod + def _convertColumnsOfDataFrame_Y_to_1_else_0(dataFrame, columns): + for column in columns: + DataFrameNormalizer._convertColumnOfDataFrame_Y_to_1_else_0(dataFrame, column) + + @staticmethod + def _convertColumnOfDataFrame_Y_to_1_else_0(dataFrame, column): + dataFrame[column] = DataFrameNormalizer._where( + condition = dataFrame[column] == 'Y', + trueValue = 1, + falseValue = 0) + + @staticmethod + def _where(condition, trueValue, falseValue): + return np.where(condition, trueValue, falseValue) + \ No newline at end of file diff --git a/src/DataFrameNormalizerTest.py b/src/DataFrameNormalizerTest.py new file mode 100644 index 00000000000..e38e1dcdb42 --- /dev/null +++ b/src/DataFrameNormalizerTest.py @@ -0,0 +1,63 @@ +import unittest +from DataFrameNormalizer import DataFrameNormalizer +from TestHelper import TestHelper +from pandas.testing import assert_frame_equal +import numpy as np + +class DataFrameNormalizerTest(unittest.TestCase): + + def test_convertVAX_LOTColumnToUpperCase(self): + # Given + dataFrame = TestHelper.createDataFrame( + columns = ['VAX_LOT'], + data = [ ['037K20A'], + ['025l20A'], + ['025L20A']], + index = [ + "0916600", + "0916601", + "1996874"]) + + # When + DataFrameNormalizer.convertVAX_LOTColumnToUpperCase(dataFrame) + + # Then + dataFrameExpected = TestHelper.createDataFrame( + columns = ['VAX_LOT'], + data = [ ['037K20A'], + ['025L20A'], + ['025L20A']], + index = [ + "0916600", + "0916601", + "1996874"]) + assert_frame_equal(dataFrame, dataFrameExpected, check_dtype = False) + + def 
test_removeUnknownBatchCodes(self): + # Given + dataFrame = TestHelper.createDataFrame( + columns = ['VAX_LOT'], + data = [ ['UNKNOWN'], + ['N/A Unknown'], + [np.nan], + ['UNKNOWN TO ME'], + ['030L20B']], + index = [ + "1048786", + "1048786", + "123", + "4711", + "0815"]) + + # When + DataFrameNormalizer.removeUnknownBatchCodes(dataFrame) + + # Then + dataFrameExpected = TestHelper.createDataFrame( + columns = ['VAX_LOT'], + data = [ [np.nan], + ['030L20B']], + index = [ + "123", + "0815"]) + assert_frame_equal(dataFrame, dataFrameExpected, check_dtype = False) \ No newline at end of file diff --git a/src/DateProvider.py b/src/DateProvider.py new file mode 100644 index 00000000000..0ed8cdcbef6 --- /dev/null +++ b/src/DateProvider.py @@ -0,0 +1,42 @@ +from bs4 import BeautifulSoup +import requests +import re +from datetime import datetime + + +class DateProvider: + + DATE_FORMAT = "%B %d, %Y" + + def __init__(self): + self.lastUpdated = None + self.lastUpdatedDataSource = None + + def needsUpdate(self): + return self.getLastUpdated() < self.getLastUpdatedDataSource() + + def getLastUpdated(self): + if self.lastUpdated is None: + self.lastUpdated = self.__getLastUpdated( + url="https://knollfrank.github.io/HowBadIsMyBatch/batchCodeTable.html", + getDateStr=lambda soup: soup.find(id="last_updated").text) + + return self.lastUpdated + + def getLastUpdatedDataSource(self): + if self.lastUpdatedDataSource is None: + def getDateStr(soup): + lastUpdated = soup.find(string=re.compile("Last updated")) + return re.search('Last updated: (.+).', lastUpdated).group(1) + + self.lastUpdatedDataSource = self.__getLastUpdated( + url="https://vaers.hhs.gov/data/datasets.html", + getDateStr=getDateStr) + + return self.lastUpdatedDataSource + + def __getLastUpdated(self, url, getDateStr): + htmlContent = requests.get(url).text + soup = BeautifulSoup(htmlContent, "lxml") + dateStr = getDateStr(soup) + return datetime.strptime(dateStr, DateProvider.DATE_FORMAT) diff --git 
a/src/HowBadIsMyBatch.ipynb b/src/HowBadIsMyBatch.ipynb index 8f50f8c47c6..eff521d52ee 100644 --- a/src/HowBadIsMyBatch.ipynb +++ b/src/HowBadIsMyBatch.ipynb @@ -26,56 +26,6 @@ "print(datetime.now().strftime(\"%d.%m.%Y, %H:%M:%S Uhr\"))" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "1dbf9321", - "metadata": {}, - "outputs": [], - "source": [ - "from bs4 import BeautifulSoup\n", - "import requests\n", - "import re\n", - "from datetime import datetime\n", - "\n", - "class DateProvider:\n", - " \n", - " DATE_FORMAT = \"%B %d, %Y\"\n", - "\n", - " def __init__(self):\n", - " self.lastUpdated = None\n", - " self.lastUpdatedDataSource = None\n", - "\n", - " def needsUpdate(self):\n", - " return self.getLastUpdated() < self.getLastUpdatedDataSource()\n", - " \n", - " def getLastUpdated(self):\n", - " if self.lastUpdated is None:\n", - " self.lastUpdated = self.__getLastUpdated(\n", - " url = \"https://knollfrank.github.io/HowBadIsMyBatch/batchCodeTable.html\",\n", - " getDateStr = lambda soup: soup.find(id = \"last_updated\").text)\n", - " \n", - " return self.lastUpdated\n", - "\n", - " def getLastUpdatedDataSource(self):\n", - " if self.lastUpdatedDataSource is None:\n", - " def getDateStr(soup):\n", - " lastUpdated = soup.find(string = re.compile(\"Last updated\"))\n", - " return re.search('Last updated: (.+).', lastUpdated).group(1)\n", - "\n", - " self.lastUpdatedDataSource = self.__getLastUpdated(\n", - " url = \"https://vaers.hhs.gov/data/datasets.html\",\n", - " getDateStr = getDateStr)\n", - "\n", - " return self.lastUpdatedDataSource\n", - "\n", - " def __getLastUpdated(self, url, getDateStr):\n", - " htmlContent = requests.get(url).text\n", - " soup = BeautifulSoup(htmlContent, \"lxml\")\n", - " dateStr = getDateStr(soup)\n", - " return datetime.strptime(dateStr, DateProvider.DATE_FORMAT)" - ] - }, { "cell_type": "code", "execution_count": null, @@ -83,6 +33,7 @@ "metadata": {}, "outputs": [], "source": [ + "from DateProvider import 
DateProvider\n", "dateProvider = DateProvider()\n", "print(' lastUpdated:', dateProvider.getLastUpdated())\n", "print('lastUpdatedDataSource:', dateProvider.getLastUpdatedDataSource()) \n", @@ -396,48 +347,7 @@ "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", - "\n", - "class VaersDescrReader:\n", - " \n", - " def __init__(self, dataDir):\n", - " self.dataDir = dataDir\n", - "\n", - " def readVaersDescrsForYears(self, years):\n", - " return [self.readVaersDescrForYear(year) for year in years]\n", - "\n", - " def readVaersDescrForYear(self, year):\n", - " return {\n", - " 'VAERSDATA': self._readVAERSDATA('{dataDir}/{year}VAERSDATA.csv'.format(dataDir = self.dataDir, year = year)),\n", - " 'VAERSVAX': self._readVAERSVAX('{dataDir}/{year}VAERSVAX.csv'.format(dataDir = self.dataDir, year = year))\n", - " }\n", - "\n", - " def readNonDomesticVaersDescr(self):\n", - " return {\n", - " 'VAERSDATA': self._readVAERSDATA(self.dataDir + \"/NonDomesticVAERSDATA.csv\"),\n", - " 'VAERSVAX': self._readVAERSVAX(self.dataDir + \"/NonDomesticVAERSVAX.csv\")\n", - " }\n", - "\n", - " def _readVAERSDATA(self, file):\n", - " return self._read_csv(\n", - " file = file,\n", - " usecols = ['VAERS_ID', 'RECVDATE', 'DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT', 'SPLTTYPE'],\n", - " parse_dates = ['RECVDATE'],\n", - " date_parser = lambda dateStr: pd.to_datetime(dateStr, format = \"%m/%d/%Y\"))\n", - "\n", - " def _readVAERSVAX(self, file):\n", - " return self._read_csv(\n", - " file = file,\n", - " usecols = ['VAERS_ID', 'VAX_DOSE_SERIES', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'],\n", - " dtype = {\"VAX_DOSE_SERIES\": \"string\"})\n", - "\n", - " def _read_csv(self, file, **kwargs):\n", - " return pd.read_csv(\n", - " file,\n", - " index_col = 'VAERS_ID',\n", - " encoding = 'latin1',\n", - " low_memory = False,\n", - " **kwargs)\n" + "from VaersDescrReader import VaersDescrReader\n" ] }, { @@ -447,24 +357,7 @@ "metadata": {}, "outputs": [], "source": [ - "import 
pandas as pd\n", - "\n", - "class VaersDescr2DataFrameConverter:\n", - "\n", - " @staticmethod\n", - " def createDataFrameFromDescr(vaersDescr):\n", - " return pd.merge(\n", - " vaersDescr['VAERSDATA'],\n", - " vaersDescr['VAERSVAX'],\n", - " how = 'left',\n", - " left_index = True,\n", - " right_index = True,\n", - " validate = 'one_to_many')\n", - "\n", - " @staticmethod\n", - " def createDataFrameFromDescrs(vaersDescrs):\n", - " dataFrames = [VaersDescr2DataFrameConverter.createDataFrameFromDescr(vaersDescr) for vaersDescr in vaersDescrs]\n", - " return pd.concat(dataFrames)\n" + "from VaersDescr2DataFrameConverter import VaersDescr2DataFrameConverter" ] }, { @@ -474,44 +367,7 @@ "metadata": {}, "outputs": [], "source": [ - "class DataFrameNormalizer:\n", - " \n", - " @staticmethod\n", - " def normalize(dataFrame):\n", - " DataFrameNormalizer.removeUnknownBatchCodes(dataFrame)\n", - " DataFrameNormalizer.convertVAX_LOTColumnToUpperCase(dataFrame)\n", - " DataFrameNormalizer._convertColumnsOfDataFrame_Y_to_1_else_0(\n", - " dataFrame,\n", - " ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'])\n", - "\n", - " @staticmethod\n", - " def convertVAX_LOTColumnToUpperCase(dataFrame):\n", - " dataFrame['VAX_LOT'] = dataFrame['VAX_LOT'].str.upper()\n", - "\n", - " @staticmethod\n", - " def removeUnknownBatchCodes(dataFrame):\n", - " dataFrame.drop(DataFrameNormalizer._isUnknownBatchCode(dataFrame).index, inplace = True)\n", - "\n", - " @staticmethod\n", - " def _isUnknownBatchCode(dataFrame):\n", - " return dataFrame[dataFrame['VAX_LOT'].str.contains(pat = 'UNKNOWN', regex = False, case = False, na = False)]\n", - "\n", - " @staticmethod\n", - " def _convertColumnsOfDataFrame_Y_to_1_else_0(dataFrame, columns):\n", - " for column in columns:\n", - " DataFrameNormalizer._convertColumnOfDataFrame_Y_to_1_else_0(dataFrame, column)\n", - "\n", - " @staticmethod\n", - " def _convertColumnOfDataFrame_Y_to_1_else_0(dataFrame, column):\n", - " dataFrame[column] = 
DataFrameNormalizer._where(\n", - " condition = dataFrame[column] == 'Y',\n", - " trueValue = 1,\n", - " falseValue = 0)\n", - "\n", - " @staticmethod\n", - " def _where(condition, trueValue, falseValue):\n", - " return np.where(condition, trueValue, falseValue) \n", - " " + "from DataFrameNormalizer import DataFrameNormalizer" ] }, { @@ -521,53 +377,7 @@ "metadata": {}, "outputs": [], "source": [ - "import pandas as pd\n", - "\n", - "class DataFrameFilter:\n", - " \n", - " def filterByCovid19(self, dataFrame):\n", - " return dataFrame[self._isCovid19(dataFrame)]\n", - "\n", - " def _isCovid19(self, dataFrame):\n", - " return dataFrame[\"VAX_TYPE\"] == \"COVID19\"\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c62cfaff", - "metadata": {}, - "outputs": [], - "source": [ - "class SummationTableFactory:\n", - "\n", - " @staticmethod\n", - " def createSummationTable(dataFrame):\n", - " summationTable = dataFrame.agg(\n", - " **{\n", - " 'Deaths': pd.NamedAgg(column = 'DIED', aggfunc = 'sum'),\n", - " 'Adverse Reaction Reports': pd.NamedAgg(column = 'DIED', aggfunc = 'size'),\n", - " 'Life Threatening Illnesses': pd.NamedAgg(column = 'L_THREAT', aggfunc = 'sum'), \n", - " 'Disabilities': pd.NamedAgg(column = 'DISABLE', aggfunc = 'sum'),\n", - " 'Severities': pd.NamedAgg(column = 'SEVERE', aggfunc = 'sum'),\n", - " 'Countries': pd.NamedAgg(column = 'COUNTRY', aggfunc = SummationTableFactory.countries2str)\n", - " })\n", - " summationTable['Severe reports'] = summationTable['Severities'] / summationTable['Adverse Reaction Reports'] * 100\n", - " summationTable['Lethality'] = summationTable['Deaths'] / summationTable['Adverse Reaction Reports'] * 100\n", - " return summationTable[\n", - " [\n", - " 'Adverse Reaction Reports',\n", - " 'Deaths',\n", - " 'Disabilities',\n", - " 'Life Threatening Illnesses',\n", - " 'Severe reports',\n", - " 'Lethality',\n", - " 'Countries'\n", - " ]]\n", - "\n", - " @staticmethod\n", - " def 
countries2str(countries):\n", - " return ', '.join(sorted(set(countries)))" + "from DataFrameFilter import DataFrameFilter" ] }, { @@ -577,31 +387,7 @@ "metadata": {}, "outputs": [], "source": [ - "import pycountry\n", - "\n", - "class CountryColumnAdder:\n", - " \n", - " @staticmethod\n", - " def addCountryColumn(dataFrame):\n", - " dataFrame['COUNTRY'] = CountryColumnAdder.getCountryColumn(dataFrame)\n", - " return dataFrame.astype({'COUNTRY': \"string\"})\n", - "\n", - " @staticmethod\n", - " def getCountryColumn(dataFrame):\n", - " return dataFrame.apply(\n", - " lambda row:\n", - " CountryColumnAdder._getCountryNameOfSplttypeOrDefault(\n", - " splttype = row['SPLTTYPE'],\n", - " default = 'Unknown Country'),\n", - " axis = 'columns')\n", - "\n", - " @staticmethod\n", - " def _getCountryNameOfSplttypeOrDefault(splttype, default):\n", - " if not isinstance(splttype, str):\n", - " return default\n", - " \n", - " country = pycountry.countries.get(alpha_2 = splttype[:2])\n", - " return country.name if country is not None else default" + "from CountryColumnAdder import CountryColumnAdder" ] }, { @@ -611,41 +397,7 @@ "metadata": {}, "outputs": [], "source": [ - "class SevereColumnAdder:\n", - " \n", - " @staticmethod\n", - " def addSevereColumn(dataFrame):\n", - " dataFrame['SEVERE'] = (dataFrame['DIED'] + dataFrame['L_THREAT'] + dataFrame['DISABLE']) > 0\n", - " dataFrame['SEVERE'].replace({True: 1, False: 0}, inplace = True)\n", - " return dataFrame\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2dad09e5", - "metadata": {}, - "outputs": [], - "source": [ - "class CompanyColumnAdder:\n", - " \n", - " def __init__(self, dataFrame_VAX_LOT_VAX_MANU):\n", - " self.dataFrame_VAX_LOT_VAX_MANU = dataFrame_VAX_LOT_VAX_MANU\n", - "\n", - " def addCompanyColumn(self, batchCodeTable):\n", - " return pd.merge(\n", - " batchCodeTable,\n", - " self._createCompanyByBatchCodeTable(),\n", - " how = 'left',\n", - " left_index = True,\n", - " right_index = 
True,\n", - " validate = 'one_to_one')\n", - "\n", - " def _createCompanyByBatchCodeTable(self):\n", - " manufacturerByBatchCodeTable = self.dataFrame_VAX_LOT_VAX_MANU[['VAX_LOT', 'VAX_MANU']]\n", - " manufacturerByBatchCodeTable = manufacturerByBatchCodeTable.drop_duplicates(subset = ['VAX_LOT'])\n", - " manufacturerByBatchCodeTable = manufacturerByBatchCodeTable.set_index('VAX_LOT')\n", - " return manufacturerByBatchCodeTable.rename(columns = {\"VAX_MANU\": \"Company\"})" + "from SevereColumnAdder import SevereColumnAdder" ] }, { @@ -655,47 +407,7 @@ "metadata": {}, "outputs": [], "source": [ - "class BatchCodeTableFactory:\n", - "\n", - " def __init__(self, dataFrame: pd.DataFrame):\n", - " self.dataFrame = dataFrame\n", - " self.companyColumnAdder = CompanyColumnAdder(dataFrame)\n", - " self.countryBatchCodeTable = SummationTableFactory.createSummationTable(\n", - " dataFrame.groupby(\n", - " [\n", - " dataFrame['COUNTRY'],\n", - " dataFrame['VAX_LOT']\n", - " ]))\n", - "\n", - " def createGlobalBatchCodeTable(self):\n", - " return self._postProcess(SummationTableFactory.createSummationTable(self.dataFrame.groupby('VAX_LOT')))\n", - "\n", - " def createBatchCodeTableByCountry(self, country):\n", - " return self._postProcess(self._getBatchCodeTableByCountry(country))\n", - "\n", - " def _postProcess(self, batchCodeTable):\n", - " batchCodeTable = self.companyColumnAdder.addCompanyColumn(batchCodeTable)\n", - " batchCodeTable = batchCodeTable[\n", - " [\n", - " 'Adverse Reaction Reports',\n", - " 'Deaths',\n", - " 'Disabilities',\n", - " 'Life Threatening Illnesses',\n", - " 'Company',\n", - " 'Countries',\n", - " 'Severe reports',\n", - " 'Lethality'\n", - " ]]\n", - " return batchCodeTable.sort_values(by = 'Severe reports', ascending = False)\n", - "\n", - " def _getBatchCodeTableByCountry(self, country):\n", - " if country in self.countryBatchCodeTable.index:\n", - " return self.countryBatchCodeTable.loc[country]\n", - " else:\n", - " return 
self._getEmptyBatchCodeTable()\n", - "\n", - " def _getEmptyBatchCodeTable(self):\n", - " return self.countryBatchCodeTable[0:0].droplevel(0)\n" + "from BatchCodeTableFactory import BatchCodeTableFactory" ] }, { @@ -705,21 +417,7 @@ "metadata": {}, "outputs": [], "source": [ - "from bs4 import BeautifulSoup\n", - "\n", - "class HtmlTransformerUtil:\n", - " \n", - " def applySoupTransformerToFile(self, file, soupTransformer):\n", - " self._writeSoup(soupTransformer(self._readSoup(file)), file)\n", - "\n", - " def _readSoup(self, file):\n", - " with open(file) as fp:\n", - " soup = BeautifulSoup(fp, 'lxml')\n", - " return soup\n", - "\n", - " def _writeSoup(self, soup, file):\n", - " with open(file, \"w\") as fp:\n", - " fp.write(str(soup)) \n" + "from HtmlTransformerUtil import HtmlTransformerUtil" ] }, { @@ -729,27 +427,7 @@ "metadata": {}, "outputs": [], "source": [ - "from bs4 import BeautifulSoup\n", - "\n", - "\n", - "class CountryOptionsSetter:\n", - "\n", - " def setCountryOptions(self, html, options):\n", - " soup = self._setCountryOptions(self._parse(html), self._parseOptions(options))\n", - " return str(soup)\n", - "\n", - " def _setCountryOptions(self, soup, options):\n", - " countrySelect = soup.find(id = \"countrySelect\")\n", - " countrySelect.clear()\n", - " for option in options:\n", - " countrySelect.append(option)\n", - " return soup\n", - "\n", - " def _parseOptions(self, options):\n", - " return [self._parse(option).option for option in options]\n", - "\n", - " def _parse(self, html):\n", - " return BeautifulSoup(html, 'lxml')\n" + "from CountryOptionsSetter import CountryOptionsSetter" ] }, { @@ -796,405 +474,7 @@ "metadata": {}, "outputs": [], "source": [ - "import os\n", - "\n", - "class IOUtils:\n", - "\n", - " @staticmethod\n", - " def saveDataFrame(dataFrame, file):\n", - " # IOUtils.saveDataFrameAsExcelFile(dataFrame, file)\n", - " # IOUtils.saveDataFrameAsHtml(dataFrame, file)\n", - " IOUtils.saveDataFrameAsJson(dataFrame, file)\n", - 
"\n", - " @staticmethod\n", - " def saveDataFrameAsExcelFile(dataFrame, file):\n", - " IOUtils.ensurePath(file)\n", - " dataFrame.to_excel(file + '.xlsx')\n", - "\n", - " @staticmethod\n", - " def saveDataFrameAsHtml(dataFrame, file):\n", - " IOUtils.ensurePath(file)\n", - " dataFrame.reset_index().to_html(\n", - " file + '.html',\n", - " index = False,\n", - " table_id = 'batchCodeTable',\n", - " classes = 'display',\n", - " justify = 'unset',\n", - " border = 0)\n", - "\n", - " @staticmethod\n", - " def saveDataFrameAsJson(dataFrame, file):\n", - " IOUtils.ensurePath(file)\n", - " dataFrame.reset_index().to_json(\n", - " file + '.json',\n", - " orient = \"split\",\n", - " index = False)\n", - "\n", - " @staticmethod\n", - " def ensurePath(file):\n", - " directory = os.path.dirname(file)\n", - " if not os.path.exists(directory):\n", - " os.makedirs(directory)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3dacedfd", - "metadata": {}, - "outputs": [], - "source": [ - "import unittest" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fcc855dd", - "metadata": {}, - "outputs": [], - "source": [ - "class TestHelper:\n", - "\n", - " @staticmethod\n", - " def createDataFrame(index, columns, data, dtypes = {}):\n", - " return pd.DataFrame(index = index, columns = columns, data = data).astype(dtypes)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ccb9838d", - "metadata": {}, - "outputs": [], - "source": [ - "from pandas.testing import assert_frame_equal\n", - "\n", - "class DataFrameNormalizerTest(unittest.TestCase):\n", - "\n", - " def test_convertVAX_LOTColumnToUpperCase(self):\n", - " # Given\n", - " dataFrame = TestHelper.createDataFrame(\n", - " columns = ['VAX_LOT'],\n", - " data = [ ['037K20A'],\n", - " ['025l20A'],\n", - " ['025L20A']],\n", - " index = [\n", - " \"0916600\",\n", - " \"0916601\",\n", - " \"1996874\"])\n", - " \n", - " # When\n", - " 
DataFrameNormalizer.convertVAX_LOTColumnToUpperCase(dataFrame)\n", - " \n", - " # Then\n", - " dataFrameExpected = TestHelper.createDataFrame(\n", - " columns = ['VAX_LOT'],\n", - " data = [ ['037K20A'],\n", - " ['025L20A'],\n", - " ['025L20A']],\n", - " index = [\n", - " \"0916600\",\n", - " \"0916601\",\n", - " \"1996874\"])\n", - " assert_frame_equal(dataFrame, dataFrameExpected, check_dtype = False)\n", - "\n", - " def test_removeUnknownBatchCodes(self):\n", - " # Given\n", - " dataFrame = TestHelper.createDataFrame(\n", - " columns = ['VAX_LOT'],\n", - " data = [ ['UNKNOWN'],\n", - " ['N/A Unknown'],\n", - " [np.nan],\n", - " ['UNKNOWN TO ME'],\n", - " ['030L20B']],\n", - " index = [\n", - " \"1048786\",\n", - " \"1048786\",\n", - " \"123\",\n", - " \"4711\",\n", - " \"0815\"])\n", - " \n", - " # When\n", - " DataFrameNormalizer.removeUnknownBatchCodes(dataFrame)\n", - " \n", - " # Then\n", - " dataFrameExpected = TestHelper.createDataFrame(\n", - " columns = ['VAX_LOT'],\n", - " data = [ [np.nan],\n", - " ['030L20B']],\n", - " index = [\n", - " \"123\",\n", - " \"0815\"])\n", - " assert_frame_equal(dataFrame, dataFrameExpected, check_dtype = False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e59a1825", - "metadata": {}, - "outputs": [], - "source": [ - "from pandas.testing import assert_frame_equal\n", - "\n", - "class DataFrameFilterTest(unittest.TestCase):\n", - "\n", - " def test_filterByCovid19(self):\n", - " # Given\n", - " dataFrame = VaersDescr2DataFrameConverter.createDataFrameFromDescrs(\n", - " [\n", - " {\n", - " 'VAERSDATA': TestHelper.createDataFrame(\n", - " columns = ['DIED', 'L_THREAT', 'DISABLE'],\n", - " data = [ [1, 0, 0],\n", - " [0, 0, 1]],\n", - " index = [\n", - " \"0916600\",\n", - " \"0916601\"]),\n", - " 'VAERSVAX': TestHelper.createDataFrame(\n", - " columns = ['VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", - " data = [ ['COVID19', 'MODERNA', '037K20A', '1'],\n", - " ['COVID19', 'MODERNA', 
'025L20A', '1']],\n", - " index = [\n", - " \"0916600\",\n", - " \"0916601\"],\n", - " dtypes = {'VAX_DOSE_SERIES': \"string\"})\n", - " },\n", - " {\n", - " 'VAERSDATA': TestHelper.createDataFrame(\n", - " columns = ['DIED', 'L_THREAT', 'DISABLE'],\n", - " data = [ [0, 0, 0],\n", - " [0, 0, 1]],\n", - " index = [\n", - " \"1996873\",\n", - " \"1996874\"]),\n", - " 'VAERSVAX': TestHelper.createDataFrame(\n", - " columns = ['VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", - " data = [ ['HPV9', 'MERCK & CO. INC.', 'R017624', 'UNK'],\n", - " ['COVID19', 'MODERNA', '025L20A', '1']],\n", - " index = [\n", - " \"1996873\",\n", - " \"1996874\"],\n", - " dtypes = {'VAX_DOSE_SERIES': \"string\"})\n", - " }\n", - " ])\n", - " dataFrameFilter = DataFrameFilter()\n", - " \n", - " # When\n", - " dataFrame = dataFrameFilter.filterByCovid19(dataFrame)\n", - " \n", - " # Then\n", - " dataFrameExpected = TestHelper.createDataFrame(\n", - " columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", - " data = [ [1, 0, 0, 'COVID19', 'MODERNA', '037K20A', '1'],\n", - " [0, 0, 1, 'COVID19', 'MODERNA', '025L20A', '1'],\n", - " [0, 0, 1, 'COVID19', 'MODERNA', '025L20A', '1']],\n", - " index = [\n", - " \"0916600\",\n", - " \"0916601\",\n", - " \"1996874\"],\n", - " dtypes = {'VAX_DOSE_SERIES': \"string\"})\n", - " assert_frame_equal(dataFrame, dataFrameExpected, check_dtype = False)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c784bfef", - "metadata": {}, - "outputs": [], - "source": [ - "from pandas.testing import assert_frame_equal\n", - "\n", - "class BatchCodeTableFactoryTest(unittest.TestCase):\n", - "\n", - " def test_createBatchCodeTableByCountry(self):\n", - " # Given\n", - " dataFrame = TestHelper.createDataFrame(\n", - " columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES', 'SPLTTYPE', 'HOSPITAL', 'ER_VISIT', 'COUNTRY'],\n", - " data = [ [1, 0, 0, 
'COVID19', 'PFIZER\\BIONTECH', '016M20A', '2', 'GBPFIZER INC2020486806', 0, 0, 'United Kingdom'],\n", - " [0, 0, 0, 'COVID19', 'MODERNA', '030L20A', '1', 'FRMODERNATX, INC.MOD20224', 0, 0, 'France'],\n", - " [1, 1, 1, 'COVID19', 'MODERNA', '030L20B', '1', 'FRMODERNATX, INC.MOD20224', 0, 0, 'France'],\n", - " [0, 1, 1, 'COVID19', 'MODERNA', '030L20B', '1', 'FRMODERNATX, INC.MOD20224', 0, 0, 'France']],\n", - " index = [\n", - " \"1048786\",\n", - " \"1048786\",\n", - " \"4711\",\n", - " \"0815\"])\n", - " dataFrame = SevereColumnAdder.addSevereColumn(dataFrame)\n", - " batchCodeTableFactory = BatchCodeTableFactory(dataFrame)\n", - " \n", - " # When\n", - " batchCodeTable = batchCodeTableFactory.createBatchCodeTableByCountry('France')\n", - "\n", - " # Then\n", - " assert_frame_equal(\n", - " batchCodeTable,\n", - " TestHelper.createDataFrame(\n", - " columns = ['Adverse Reaction Reports', 'Deaths', 'Disabilities', 'Life Threatening Illnesses', 'Company', 'Countries', 'Severe reports', 'Lethality'],\n", - " data = [ [2, 1, 2, 2, 'MODERNA', 'France', 2/2 * 100, 1/2 * 100],\n", - " [1, 0, 0, 0, 'MODERNA', 'France', 0/1 * 100, 0/1 * 100]],\n", - " index = pd.Index(\n", - " [\n", - " '030L20B',\n", - " '030L20A'\n", - " ],\n", - " name = 'VAX_LOT')),\n", - " check_dtype = False)\n", - "\n", - " def test_createGlobalBatchCodeTable(self):\n", - " # Given\n", - " dataFrame = TestHelper.createDataFrame(\n", - " columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES', 'SPLTTYPE', 'HOSPITAL', 'ER_VISIT', 'COUNTRY'],\n", - " data = [ [1, 0, 0, 'COVID19', 'PFIZER\\BIONTECH', '016M20A', '2', 'GBPFIZER INC2020486806', 0, 0, 'United Kingdom'],\n", - " [0, 0, 0, 'COVID19', 'MODERNA', '030L20A', '1', 'FRMODERNATX, INC.MOD20224', 0, 0, 'France'],\n", - " [1, 1, 1, 'COVID19', 'MODERNA', '030L20B', '1', 'FRMODERNATX, INC.MOD20224', 0, 0, 'France'],\n", - " [0, 1, 1, 'COVID19', 'MODERNA', '030L20B', '1', 'FRMODERNATX, INC.MOD20224', 0, 0, 'United 
Kingdom']],\n", - " index = [\n", - " \"1048786\",\n", - " \"1048786\",\n", - " \"4711\",\n", - " \"0815\"])\n", - " dataFrame = SevereColumnAdder.addSevereColumn(dataFrame)\n", - " batchCodeTableFactory = BatchCodeTableFactory(dataFrame)\n", - " \n", - " # When\n", - " batchCodeTable = batchCodeTableFactory.createGlobalBatchCodeTable()\n", - "\n", - " # Then\n", - " assert_frame_equal(\n", - " batchCodeTable,\n", - " TestHelper.createDataFrame(\n", - " columns = ['Adverse Reaction Reports', 'Deaths', 'Disabilities', 'Life Threatening Illnesses', 'Company', 'Countries', 'Severe reports', 'Lethality'],\n", - " data = [ [1, 1, 0, 0, 'PFIZER\\BIONTECH', 'United Kingdom', 1/1 * 100, 1/1 * 100],\n", - " [2, 1, 2, 2, 'MODERNA', 'France, United Kingdom', 2/2 * 100, 1/2 * 100],\n", - " [1, 0, 0, 0, 'MODERNA', 'France', 0/1 * 100, 0/1 * 100]],\n", - " index = pd.Index(\n", - " [\n", - " '016M20A',\n", - " '030L20B',\n", - " '030L20A'\n", - " ],\n", - " name = 'VAX_LOT')),\n", - " check_dtype = False)\n", - "\n", - " def test_createBatchCodeTableByNonExistingCountry(self):\n", - " # Given\n", - " dataFrame = TestHelper.createDataFrame(\n", - " columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES', 'SPLTTYPE', 'HOSPITAL', 'ER_VISIT', 'COUNTRY'],\n", - " data = [ [1, 0, 0, 'COVID19', 'PFIZER\\BIONTECH', '016M20A', '2', 'GBPFIZER INC2020486806', 0, 0, 'United Kingdom'],\n", - " [0, 0, 0, 'COVID19', 'MODERNA', '030L20A', '1', 'FRMODERNATX, INC.MOD20224', 0, 0, 'France'],\n", - " [1, 1, 1, 'COVID19', 'MODERNA', '030L20B', '1', 'FRMODERNATX, INC.MOD20224', 0, 0, 'France'],\n", - " [0, 1, 1, 'COVID19', 'MODERNA', '030L20B', '1', 'FRMODERNATX, INC.MOD20224', 0, 0, 'France']],\n", - " index = [\n", - " \"1048786\",\n", - " \"1048786\",\n", - " \"4711\",\n", - " \"0815\"])\n", - " dataFrame = SevereColumnAdder.addSevereColumn(dataFrame)\n", - " batchCodeTableFactory = BatchCodeTableFactory(dataFrame)\n", - " \n", - " # When\n", - " 
batchCodeTable = batchCodeTableFactory.createBatchCodeTableByCountry('non existing country')\n", - "\n", - " # Then\n", - " assert_frame_equal(\n", - " batchCodeTable,\n", - " TestHelper.createDataFrame(\n", - " columns = ['Adverse Reaction Reports', 'Deaths', 'Disabilities', 'Life Threatening Illnesses', 'Company', 'Countries', 'Severe reports', 'Lethality'],\n", - " data = [ ],\n", - " index = pd.Index([], name = 'VAX_LOT')),\n", - " check_dtype = False)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "125351b3", - "metadata": {}, - "outputs": [], - "source": [ - "class CountryOptionsSetterTest(unittest.TestCase):\n", - "\n", - " def test_setCountryOptions(self):\n", - " # Given\n", - " countryOptionsSetter = CountryOptionsSetter()\n", - "\n", - " # When\n", - " htmlActual = countryOptionsSetter.setCountryOptions(\n", - " html='''\n", - " \n", - " \n", - "

Test

\n", - " \n", - " \n", - " \n", - " ''',\n", - " options=[\n", - " '',\n", - " '',\n", - " ''])\n", - "\n", - " # Then\n", - " assertEqualHTML(\n", - " htmlActual,\n", - " '''\n", - " \n", - " \n", - "

Test

\n", - " \n", - " \n", - " \n", - " ''')\n", - "\n", - "# adapted from https://stackoverflow.com/questions/8006909/pretty-print-assertequal-for-html-strings\n", - "def assertEqualHTML(string1, string2, file1='', file2=''):\n", - " u'''\n", - " Compare two unicode strings containing HTML.\n", - " A human friendly diff goes to logging.error() if they\n", - " are not equal, and an exception gets raised.\n", - " '''\n", - " from bs4 import BeautifulSoup as bs\n", - " import difflib\n", - "\n", - " def short(mystr):\n", - " max = 20\n", - " if len(mystr) > max:\n", - " return mystr[:max]\n", - " return mystr\n", - " p = []\n", - " for mystr, file in [(string1, file1), (string2, file2)]:\n", - " if not isinstance(mystr, str):\n", - " raise Exception(u'string ist not unicode: %r %s' %\n", - " (short(mystr), file))\n", - " soup = bs(mystr)\n", - " pretty = soup.prettify()\n", - " p.append(pretty)\n", - " if p[0] != p[1]:\n", - " for line in difflib.unified_diff(p[0].splitlines(), p[1].splitlines(), fromfile=file1, tofile=file2):\n", - " display(line)\n", - " display(p[0], ' != ', p[1])\n", - " raise Exception('Not equal %s %s' % (file1, file2))\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5a8bff1b", - "metadata": {}, - "outputs": [], - "source": [ - "unittest.main(argv = [''], verbosity = 2, exit = False)" + "from IOUtils import IOUtils" ] }, { diff --git a/src/HtmlTransformerUtil.py b/src/HtmlTransformerUtil.py new file mode 100644 index 00000000000..b01cdde3387 --- /dev/null +++ b/src/HtmlTransformerUtil.py @@ -0,0 +1,15 @@ +from bs4 import BeautifulSoup + +class HtmlTransformerUtil: + + def applySoupTransformerToFile(self, file, soupTransformer): + self._writeSoup(soupTransformer(self._readSoup(file)), file) + + def _readSoup(self, file): + with open(file) as fp: + soup = BeautifulSoup(fp, 'lxml') + return soup + + def _writeSoup(self, soup, file): + with open(file, "w") as fp: + fp.write(str(soup)) diff --git a/src/IOUtils.py 
# ---- src/IOUtils.py ----
import os


class IOUtils:
    """Helpers for writing pandas DataFrames to disk in several formats."""

    @staticmethod
    def saveDataFrame(dataFrame, file):
        """Persist ``dataFrame`` under the path stem ``file``.

        Only the JSON export is active; the Excel and HTML exporters below
        are kept available but are not invoked from here.
        """
        IOUtils.saveDataFrameAsJson(dataFrame, file)

    @staticmethod
    def saveDataFrameAsExcelFile(dataFrame, file):
        """Write ``dataFrame`` to ``file + '.xlsx'``, creating parent directories."""
        IOUtils.ensurePath(file)
        dataFrame.to_excel(file + '.xlsx')

    @staticmethod
    def saveDataFrameAsHtml(dataFrame, file):
        """Write ``dataFrame`` to ``file + '.html'`` as a table with id ``batchCodeTable``.

        The index is turned into a regular column first so that it shows up
        as table data rather than as a row header.
        """
        IOUtils.ensurePath(file)
        dataFrame.reset_index().to_html(
            file + '.html',
            index=False,
            table_id='batchCodeTable',
            classes='display',
            justify='unset',
            border=0)

    @staticmethod
    def saveDataFrameAsJson(dataFrame, file):
        """Write ``dataFrame`` to ``file + '.json'`` in pandas' ``split`` orientation.

        The index is turned into a regular column first, so the output only
        contains the ``columns`` and ``data`` keys.
        """
        IOUtils.ensurePath(file)
        dataFrame.reset_index().to_json(
            file + '.json',
            orient="split",
            index=False)

    @staticmethod
    def ensurePath(file):
        """Create the parent directory of ``file`` if it does not exist yet.

        A ``file`` without a directory component refers to the current
        working directory, which needs no creation.
        """
        directory = os.path.dirname(file)
        # os.makedirs('') raises FileNotFoundError, so skip bare file names.
        if directory:
            # exist_ok avoids the race between an existence check and creation.
            os.makedirs(directory, exist_ok=True)


# ---- src/SevereColumnAdder.py ----
class SevereColumnAdder:
    """Adds the derived ``SEVERE`` indicator column to a VAERS data frame."""

    @staticmethod
    def addSevereColumn(dataFrame):
        """Add a 0/1 ``SEVERE`` column to ``dataFrame`` in place and return it.

        A row is severe when the sum of its ``DIED``, ``L_THREAT`` and
        ``DISABLE`` values is greater than zero.
        """
        isSevere = (dataFrame['DIED'] + dataFrame['L_THREAT'] + dataFrame['DISABLE']) > 0
        # Assign the converted column directly: an in-place replace() on
        # dataFrame['SEVERE'] operates on a temporary under pandas
        # Copy-on-Write and would leave the frame unchanged.
        dataFrame['SEVERE'] = isSevere.astype('int64')
        return dataFrame


# ---- src/SummationTableFactory.py ----
import pandas as pd


class SummationTableFactory:
    """Builds per-group summary tables of adverse reaction reports."""

    @staticmethod
    def createSummationTable(dataFrame):
        """Aggregate grouped VAERS rows into a summary table.

        Args:
            dataFrame: despite its name, a grouped frame (callers pass the
                result of ``DataFrame.groupby``) providing the columns
                ``DIED``, ``L_THREAT``, ``DISABLE``, ``SEVERE`` and ``COUNTRY``.

        Returns:
            One row per group with the report count, the summed outcome
            columns, the percentage columns ``Severe reports`` and
            ``Lethality``, and the joined country names.
        """
        summationTable = dataFrame.agg(
            **{
                'Deaths': pd.NamedAgg(column='DIED', aggfunc='sum'),
                # 'size' counts the rows of each group; the chosen column is irrelevant.
                'Adverse Reaction Reports': pd.NamedAgg(column='DIED', aggfunc='size'),
                'Life Threatening Illnesses': pd.NamedAgg(column='L_THREAT', aggfunc='sum'),
                'Disabilities': pd.NamedAgg(column='DISABLE', aggfunc='sum'),
                'Severities': pd.NamedAgg(column='SEVERE', aggfunc='sum'),
                'Countries': pd.NamedAgg(column='COUNTRY', aggfunc=SummationTableFactory.countries2str)
            })
        reports = summationTable['Adverse Reaction Reports']
        summationTable['Severe reports'] = summationTable['Severities'] / reports * 100
        summationTable['Lethality'] = summationTable['Deaths'] / reports * 100
        # 'Severities' is only an intermediate value and is left out of the result.
        return summationTable[
            [
                'Adverse Reaction Reports',
                'Deaths',
                'Disabilities',
                'Life Threatening Illnesses',
                'Severe reports',
                'Lethality',
                'Countries'
            ]]

    @staticmethod
    def countries2str(countries):
        """Return the distinct values of ``countries`` sorted and joined by ``', '``."""
        return ', '.join(sorted(set(countries)))


# ---- src/TestHelper.py ----
import pandas as pd


class TestHelper:
    """Convenience factory for data frames used in unit tests."""

    @staticmethod
    def createDataFrame(index, columns, data, dtypes=None):
        """Build a DataFrame and cast columns according to ``dtypes``.

        Args:
            index: row labels.
            columns: column labels.
            data: row-wise cell values.
            dtypes: optional mapping of column name to dtype; no cast is
                applied when omitted.
        """
        # None instead of a shared mutable {} default.
        columnTypes = {} if dtypes is None else dtypes
        return pd.DataFrame(index=index, columns=columns, data=data).astype(columnTypes)


# ---- src/VaersDescr2DataFrameConverter.py ----
import pandas as pd


class VaersDescr2DataFrameConverter:
    """Joins the VAERSDATA and VAERSVAX tables of VAERS descriptors."""

    @staticmethod
    def createDataFrameFromDescr(vaersDescr):
        """Left-join ``vaersDescr['VAERSVAX']`` onto ``vaersDescr['VAERSDATA']`` by index.

        The merge is validated as one-to-many, so pandas raises a
        ``MergeError`` when the VAERSDATA index contains duplicates.
        """
        return pd.merge(
            vaersDescr['VAERSDATA'],
            vaersDescr['VAERSVAX'],
            how='left',
            left_index=True,
            right_index=True,
            validate='one_to_many')

    @staticmethod
    def createDataFrameFromDescrs(vaersDescrs):
        """Merge every descriptor in ``vaersDescrs`` and concatenate the results."""
        dataFrames = [
            VaersDescr2DataFrameConverter.createDataFrameFromDescr(vaersDescr)
            for vaersDescr in vaersDescrs
        ]
        return pd.concat(dataFrames)


# ---- src/VaersDescrReader.py (new file; its content follows) ----
# ---- src/VaersDescrReader.py ----
import pandas as pd


class VaersDescrReader:
    """Reads VAERS CSV exports from a data directory into "VAERS descriptors".

    A descriptor is a dict with the keys ``'VAERSDATA'`` and ``'VAERSVAX'``,
    each holding a DataFrame indexed by ``VAERS_ID``.
    """

    def __init__(self, dataDir):
        """Remember ``dataDir``, the directory containing the CSV files."""
        self.dataDir = dataDir

    def readVaersDescrsForYears(self, years):
        """Return one descriptor per entry of ``years``, in the same order."""
        return [self.readVaersDescrForYear(year) for year in years]

    def readVaersDescrForYear(self, year):
        """Read ``<year>VAERSDATA.csv`` and ``<year>VAERSVAX.csv`` from the data directory."""
        return {
            'VAERSDATA': self._readVAERSDATA('{dataDir}/{year}VAERSDATA.csv'.format(dataDir=self.dataDir, year=year)),
            'VAERSVAX': self._readVAERSVAX('{dataDir}/{year}VAERSVAX.csv'.format(dataDir=self.dataDir, year=year))
        }

    def readNonDomesticVaersDescr(self):
        """Read ``NonDomesticVAERSDATA.csv`` and ``NonDomesticVAERSVAX.csv`` from the data directory."""
        return {
            'VAERSDATA': self._readVAERSDATA(self.dataDir + "/NonDomesticVAERSDATA.csv"),
            'VAERSVAX': self._readVAERSVAX(self.dataDir + "/NonDomesticVAERSVAX.csv")
        }

    def _readVAERSDATA(self, file):
        """Read the report columns of a VAERSDATA file, parsing ``RECVDATE`` as a date."""
        dataFrame = self._read_csv(
            file=file,
            usecols=['VAERS_ID', 'RECVDATE', 'DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT', 'SPLTTYPE'])
        # Convert after reading instead of passing read_csv's deprecated
        # ``date_parser`` argument; this works on old and new pandas alike.
        dataFrame['RECVDATE'] = pd.to_datetime(dataFrame['RECVDATE'], format="%m/%d/%Y")
        return dataFrame

    def _readVAERSVAX(self, file):
        """Read the vaccine columns of a VAERSVAX file, keeping ``VAX_DOSE_SERIES`` as text."""
        return self._read_csv(
            file=file,
            usecols=['VAERS_ID', 'VAX_DOSE_SERIES', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'],
            dtype={"VAX_DOSE_SERIES": "string"})

    def _read_csv(self, file, **kwargs):
        """Read a latin1-encoded CSV indexed by ``VAERS_ID``; ``kwargs`` go to ``pd.read_csv``."""
        return pd.read_csv(
            file,
            index_col='VAERS_ID',
            encoding='latin1',
            low_memory=False,
            **kwargs)