From 5eedead69de0a7ee4bbb956432be0c33aa5b605f Mon Sep 17 00:00:00 2001 From: frankknoll Date: Sat, 5 Feb 2022 13:02:51 +0100 Subject: [PATCH] refactoring --- HowBadIsMyBatch.ipynb | 143 +++++++++++++++++++++--------------------- 1 file changed, 70 insertions(+), 73 deletions(-) diff --git a/HowBadIsMyBatch.ipynb b/HowBadIsMyBatch.ipynb index 70d5951fe28..a1bebcc2ae3 100644 --- a/HowBadIsMyBatch.ipynb +++ b/HowBadIsMyBatch.ipynb @@ -224,23 +224,52 @@ "class DoseAnalysis:\n", " \n", " @staticmethod\n", - " def getNthDoseTable(dataFrame, dose):\n", - " return pd.DataFrame(DoseAnalysis._getNthDoseDict(DataFrameFilter(dataFrame).filterBy(dose = dose)))\n", + " def getDoseTable(dataFrame):\n", + " # FK-TODO: _convertColumnsOfDataFrameToNumerics() sollte schon während des Einlesens aus den CSV-Dateien durchgeführt werden\n", + " # FK-TODO: bitte alle DataFrames als unmutable behandeln und nicht inplace ändern.\n", + " DoseAnalysis._convertColumnsOfDataFrameToNumerics(dataFrame, ['DIED', 'L_THREAT', 'DISABLE'])\n", + " return pd.DataFrame(DoseAnalysis._getNthDoseDict(dataFrame))\n", "\n", + " # FK-TODO: inline method\n", " @staticmethod\n", " def _getNthDoseDict(df):\n", - " nthDoseDict = {\n", - " 'Total reports': [len(df.index)],\n", - " 'Deaths': [DoseAnalysis._count(df, 'DIED')],\n", - " 'Disabilities': [DoseAnalysis._count(df, 'DISABLE')],\n", - " 'Life Threatening Illnesses': [DoseAnalysis._count(df, 'L_THREAT')]\n", - " }\n", - " nthDoseDict['Severe reports (%)'] = [(nthDoseDict['Deaths'][0] + nthDoseDict['Disabilities'][0] + nthDoseDict['Life Threatening Illnesses'][0]) / nthDoseDict['Total reports'][0] * 100]\n", - " return nthDoseDict\n", + " doseTable = df.groupby('VAX_DOSE_SERIES').agg(\n", + " {\n", + " 'DIED': ['sum', 'size'],\n", + " 'L_THREAT': 'sum',\n", + " 'DISABLE': 'sum'\n", + " })\n", + " DoseAnalysis._flattenColumns(doseTable)\n", + " doseTable = doseTable.rename(\n", + " columns =\n", + " {\n", + " \"DIED_size\": \"Total reports\",\n", + " \"DIED_sum\": \"Deaths\",\n", + " \"L_THREAT_sum\": \"Life Threatening Illnesses\",\n", + " \"DISABLE_sum\": \"Disabilities\"\n", + " })[['Total reports', 'Deaths', 'Disabilities', 'Life Threatening Illnesses']]\n", + " doseTable['Severe reports (%)'] = (doseTable['Deaths'] + doseTable['Disabilities'] + doseTable['Life Threatening Illnesses']) / doseTable['Total reports'] * 100\n", + " return doseTable\n", "\n", " @staticmethod\n", " def _count(dataFrame, column):\n", - " return len(dataFrame[dataFrame[column] == 'Y'])\n" + " return len(dataFrame[dataFrame[column] == 'Y'])\n", + "\n", + " # FK-TODO: DRY with BatchCodeTableHelper\n", + " @staticmethod\n", + " def _convertColumnsOfDataFrameToNumerics(dataFrame, columns):\n", + " for column in columns:\n", + " DoseAnalysis._convertColumnOfDataFrameToNumeric(dataFrame, column)\n", + "\n", + " # FK-TODO: DRY with BatchCodeTableHelper\n", + " @staticmethod\n", + " def _convertColumnOfDataFrameToNumeric(dataFrame, column):\n", + " dataFrame[column] = np.where(dataFrame[column] == 'Y', 1, 0)\n", + "\n", + " # FK-TODO: DRY with BatchCodeTableHelper\n", + " @staticmethod\n", + " def _flattenColumns(batchCodeTable):\n", + " batchCodeTable.columns = [\"_\".join(a) for a in batchCodeTable.columns.to_flat_index()]\n" ] }, { @@ -557,57 +586,35 @@ "\n", "class DoseAnalysisTest(unittest.TestCase):\n", "\n", - " def test_getFirstDoseTable(self):\n", - " self._test_getNthDoseTable(\n", - " dataFrame = self.createDataFrame(\n", - " columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", - " data = [ ['Y', np.NaN, np.NaN,\t 'COVID19', 'MODERNA', '016M20A', '2'],\n", - " ['Y', np.NaN, np.NaN, 'COVID19', 'MODERNA', '030L20A', '1'],\n", - " ['Y', 'Y', 'Y', 'COVID19', 'MODERNA', '030L20B', '1']],\n", - " index = [\n", - " \"1048786\",\n", - " \"1048786\",\n", - " \"4711\"],\n", - " dtypes = {'VAX_DOSE_SERIES': \"string\"}),\n", - " dose = '1',\n", - " doseTableExpected = pd.DataFrame(\n", - " {\n", - " 'Total reports': [2],\n", - " 'Deaths': [2],\n", - " 'Disabilities': [1],\n", - " 'Life Threatening Illnesses': [1],\n", - " 'Severe reports (%)': [(2 + 1 + 1)/2 * 100]\n", - " }))\n", - "\n", - " def test_getSecondDoseTable(self):\n", - " self._test_getNthDoseTable(\n", - " dataFrame = self.createDataFrame(\n", - " columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", - " data = [ ['Y', np.NaN, np.NaN,\t 'COVID19', 'MODERNA', '016M20A', '2'],\n", - " ['Y', np.NaN, np.NaN, 'COVID19', 'MODERNA', '030L20A', '1'],\n", - " ['Y', 'Y', 'Y', 'COVID19', 'MODERNA', '030L20B', '1']],\n", - " index = [\n", - " \"1048786\",\n", - " \"1048786\",\n", - " \"4711\"],\n", - " dtypes = {'VAX_DOSE_SERIES': \"string\"}),\n", - " dose = '2',\n", - " doseTableExpected = pd.DataFrame(\n", - " {\n", - " 'Total reports': [1],\n", - " 'Deaths': [1],\n", - " 'Disabilities': [0],\n", - " 'Life Threatening Illnesses': [0],\n", - " 'Severe reports (%)': [(1 + 0 + 0)/1 * 100]\n", - " }))\n", - "\n", - " def _test_getNthDoseTable(self, dataFrame, dose, doseTableExpected):\n", - " # When\n", - " doseTable = DoseAnalysis.getNthDoseTable(dataFrame, dose)\n", + " def test_getDoseTable(self):\n", + " # Given\n", + " dataFrame = self.createDataFrame(\n", + " columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", + " data = [ ['Y', np.NaN, np.NaN,\t 'COVID19', 'MODERNA', '016M20A', '2'],\n", + " ['Y', np.NaN, np.NaN, 'COVID19', 'MODERNA', '030L20A', '1'],\n", + " ['Y', 'Y', 'Y', 'COVID19', 'MODERNA', '030L20B', '1']],\n", + " index = [\n", + " \"1048786\",\n", + " \"1048786\",\n", + " \"4711\"],\n", + " dtypes = {'VAX_DOSE_SERIES': \"string\"})\n", " \n", - " # Then\n", - " assert_frame_equal(doseTable, doseTableExpected)\n", + " # When\n", + " doseTable = DoseAnalysis.getDoseTable(dataFrame)\n", "\n", + " # Then\n", + " assert_frame_equal(\n", + " doseTable,\n", + " pd.DataFrame(\n", + " data = {\n", + " 'Total reports': [2, 1],\n", + " 'Deaths': [2, 1],\n", + " 'Disabilities': [1, 0],\n", + " 'Life Threatening Illnesses': [1, 0],\n", + " 'Severe reports (%)': [(2 + 1 + 1)/2 * 100, (1 + 0 + 0)/1 * 100]\n", + " },\n", + " index = pd.Index(['1', '2'], dtype = \"string\", name = 'VAX_DOSE_SERIES')))\n", + " \n", " def createDataFrame(self, index, columns, data, dtypes = {}):\n", " return pd.DataFrame(index = index, columns = columns, data = data).astype(dtypes)\n" ] @@ -691,10 +698,10 @@ "source": [ "# https://www.howbadismybatch.com/firstsecond.html\n", "\n", - "def getNthDoseTable(dose):\n", + "def getDoseTable():\n", " vaersDescrs = VaersDescrReader(dataDir = \"VAERS\").readAllVaersDescrs()\n", " dataFrame = VaersDescr2DataFrameConverter.createDataFrameFromDescrs(vaersDescrs)\n", - " return DoseAnalysis.getNthDoseTable(dataFrame, dose)" + " return DoseAnalysis.getDoseTable(dataFrame)" ] }, { @@ -704,17 +711,7 @@ "metadata": {}, "outputs": [], "source": [ - "getNthDoseTable(dose = '1')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "686d4ddf", - "metadata": {}, - "outputs": [], - "source": [ - "getNthDoseTable(dose = '2')" + "getDoseTable()" ] } ],