From a8c20c3656ec226febc88b87b26c21094b955eda Mon Sep 17 00:00:00 2001 From: frankknoll Date: Sat, 5 Feb 2022 17:28:26 +0100 Subject: [PATCH] adding getDoseByMonthTable() --- HowBadIsMyBatch.ipynb | 86 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 84 insertions(+), 2 deletions(-) diff --git a/HowBadIsMyBatch.ipynb b/HowBadIsMyBatch.ipynb index 243905adbec..b0c79621539 100644 --- a/HowBadIsMyBatch.ipynb +++ b/HowBadIsMyBatch.ipynb @@ -64,7 +64,9 @@ " def _readVAERSDATA(self, file):\n", " VAERSDATA = self._read_csv(\n", " file = file,\n", - " usecols = ['VAERS_ID', 'DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'])\n", + " usecols = ['VAERS_ID', 'RECVDATE', 'DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'],\n", + " parse_dates = ['RECVDATE'],\n", + " date_parser = lambda dateStr: pd.to_datetime(dateStr, format = \"%m/%d/%Y\"))\n", " DataFrameConverter.convertColumnsOfDataFrameToNumerics(\n", " VAERSDATA,\n", " ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'])\n", @@ -274,6 +276,33 @@ " })\n", " doseTable = doseTable[['Total reports', 'Deaths', 'Disabilities', 'Life Threatening Illnesses']]\n", " doseTable['Severe reports (%)'] = (doseTable['Deaths'] + doseTable['Disabilities'] + doseTable['Life Threatening Illnesses']) / doseTable['Total reports'] * 100\n", + " return doseTable\n", + "\n", + " @staticmethod\n", + " def getDoseByMonthTable(dataFrame):\n", + " # https://stackoverflow.com/questions/61879166/pandas-groupby-month-and-year-date-as-datetime64ns-and-summarized-by-count\n", + " grouped = dataFrame.groupby(\n", + " [\n", + " dataFrame['RECVDATE'].dt.year.rename('year'),\n", + " dataFrame['RECVDATE'].dt.month.rename('month'),\n", + " dataFrame['VAX_DOSE_SERIES']\n", + " ])\n", + " # FK-TODO: DRY with getDoseTable()\n", + " doseTable = AggregationHelper.aggregateAndFlattenColumnsAndRenameColumns(\n", + " dataFrame = grouped,\n", + " aggFunctionsByColumn = {\n", + " 'DIED': ['sum', 'size'],\n", + " 'L_THREAT': 'sum',\n", + " 'DISABLE': 'sum'\n", + " },\n", + " columnNameMappingsDict = {\n", + " \"DIED_size\": \"Total reports\",\n", + " \"DIED_sum\": \"Deaths\",\n", + " \"L_THREAT_sum\": \"Life Threatening Illnesses\",\n", + " \"DISABLE_sum\": \"Disabilities\"\n", + " })\n", + " doseTable = doseTable[['Total reports', 'Deaths', 'Disabilities', 'Life Threatening Illnesses']]\n", + " doseTable['Severe reports (%)'] = (doseTable['Deaths'] + doseTable['Disabilities'] + doseTable['Life Threatening Illnesses']) / doseTable['Total reports'] * 100\n", " return doseTable\n" ] }, @@ -620,6 +649,42 @@ " },\n", " index = pd.Index(['1', '2'], dtype = \"string\", name = 'VAX_DOSE_SERIES')))\n", " \n", + " def test_getDoseByMonthTable(self):\n", + " # Given\n", + " parseDate = lambda dateStr: pd.to_datetime(dateStr, format = \"%m/%d/%Y\")\n", + " dataFrame = self.createDataFrame(\n", + " columns = ['RECVDATE', 'DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", + " data = [ [parseDate('01/01/2021'), 1, 0, 0, 'COVID19', 'MODERNA', '016M20A', '2'],\n", + " [parseDate('01/01/2021'), 1, 0, 0, 'COVID19', 'MODERNA', '030L20A', '1'],\n", + " [parseDate('01/01/2021'), 1, 1, 1, 'COVID19', 'MODERNA', '030L20B', '1']],\n", + " index = [\n", + " \"1048786\",\n", + " \"1048786\",\n", + " \"4711\"],\n", + " dtypes = {'VAX_DOSE_SERIES': \"string\"})\n", + " \n", + " # When\n", + " doseByMonthTable = DoseAnalysis.getDoseByMonthTable(dataFrame)\n", + "\n", + " # Then\n", + " assert_frame_equal(\n", + " doseByMonthTable,\n", + " pd.DataFrame(\n", + " data = {\n", + " 'Total reports': [2, 1],\n", + " 'Deaths': [2, 1],\n", + " 'Disabilities': [1, 0],\n", + " 'Life Threatening Illnesses': [1, 0],\n", + " 'Severe reports (%)': [(2 + 1 + 1)/2 * 100, (1 + 0 + 0)/1 * 100]\n", + " },\n", + " index = pd.MultiIndex.from_tuples(\n", + " [\n", + " (2021, 1, '1'),\n", + " (2021, 1, '2'),\n", + " ],\n", + " names = ('year', 'month', 'VAX_DOSE_SERIES'))),\n", + " check_index_type = False)\n", + "\n", " def createDataFrame(self, index, columns, data, dtypes = {}):\n", " return pd.DataFrame(index = index, columns = columns, data = data).astype(dtypes)\n" ] @@ -706,7 +771,12 @@ "def getDoseTable():\n", " vaersDescrs = VaersDescrReader(dataDir = \"VAERS\").readAllVaersDescrs()\n", " dataFrame = VaersDescr2DataFrameConverter.createDataFrameFromDescrs(vaersDescrs)\n", - " return DoseAnalysis.getDoseTable(dataFrame)" + " return DoseAnalysis.getDoseTable(dataFrame)\n", + "\n", + "def getDoseByMonthTable():\n", + " vaersDescrs = VaersDescrReader(dataDir = \"VAERS\").readAllVaersDescrs()\n", + " dataFrame = VaersDescr2DataFrameConverter.createDataFrameFromDescrs(vaersDescrs)\n", + " return DoseAnalysis.getDoseByMonthTable(dataFrame)" ] }, { @@ -718,6 +788,18 @@ "source": [ "getDoseTable()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b333e5fb", + "metadata": {}, + "outputs": [], + "source": [ + "doseByMonthTable = getDoseByMonthTable()\n", + "doseByMonthTable.to_excel('results/doseByMonthTable.xlsx')\n", + "doseByMonthTable" + ] } ], "metadata": {