From e7663350c126879e73f1ef68b23e13069d6a838b Mon Sep 17 00:00:00 2001 From: frankknoll Date: Sat, 5 Feb 2022 14:33:37 +0100 Subject: [PATCH] refactoring --- HowBadIsMyBatch.ipynb | 120 +++++++++++++++++++----------------------- 1 file changed, 55 insertions(+), 65 deletions(-) diff --git a/HowBadIsMyBatch.ipynb b/HowBadIsMyBatch.ipynb index b7886289c32..9b09fc4307b 100644 --- a/HowBadIsMyBatch.ipynb +++ b/HowBadIsMyBatch.ipynb @@ -37,25 +37,41 @@ " def readVaersDescr(self, year):\n", " folder = self.dataDir + \"/\" + year + \"VAERSData/\"\n", " return {\n", - " 'VAERSDATA':\n", - " self._read_csv(\n", - " folder + year + \"VAERSDATA.csv\",\n", - " ['VAERS_ID', 'DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT']),\n", - " 'VAERSVAX':\n", - " self._read_csv(\n", - " folder + year + \"VAERSVAX.csv\",\n", - " ['VAERS_ID', 'VAX_DOSE_SERIES', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'],\n", - " dtype = {\"VAX_DOSE_SERIES\": \"string\"})\n", + " 'VAERSDATA': self._readVAERSDATA(folder, year),\n", + " 'VAERSVAX': self._readVAERSVAX(folder, year)\n", " }\n", "\n", - " def _read_csv(self, file, usecols, dtype = {}):\n", + " def _readVAERSDATA(self, folder, year):\n", + " VAERSDATA = self._read_csv(\n", + " file = folder + year + \"VAERSDATA.csv\",\n", + " usecols = ['VAERS_ID', 'DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'])\n", + " VaersDescrReader._convertColumnsOfDataFrameToNumerics(\n", + " VAERSDATA,\n", + " ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'])\n", + " return VAERSDATA\n", + "\n", + " def _readVAERSVAX(self, folder, year):\n", + " return self._read_csv(\n", + " file = folder + year + \"VAERSVAX.csv\",\n", + " usecols = ['VAERS_ID', 'VAX_DOSE_SERIES', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'],\n", + " dtype = {\"VAX_DOSE_SERIES\": \"string\"})\n", + "\n", + " def _read_csv(self, file, **kwargs):\n", " return pd.read_csv(\n", " file,\n", " index_col = 'VAERS_ID',\n", " encoding = 'latin1',\n", " low_memory = False,\n", - " usecols = usecols,\n", - " dtype = dtype)\n" + " **kwargs)\n", + "\n", + " @staticmethod\n", + " def _convertColumnsOfDataFrameToNumerics(dataFrame, columns):\n", + " for column in columns:\n", + " VaersDescrReader._convertColumnOfDataFrameToNumeric(dataFrame, column)\n", + "\n", + " @staticmethod\n", + " def _convertColumnOfDataFrameToNumeric(dataFrame, column):\n", + " dataFrame[column] = np.where(dataFrame[column] == 'Y', 1, 0)\n" ] }, { @@ -128,7 +144,6 @@ " \n", " def __init__(self, dataFrame : pd.DataFrame):\n", " self.dataFrame = dataFrame \n", - " self._convertColumnsOfDataFrameToNumerics(['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'])\n", "\n", " def createBatchCodeTable(self):\n", " batchCodeTable = self.dataFrame.groupby('VAX_LOT').agg(\n", @@ -189,13 +204,6 @@ " manufacturerByBatchCodeTable = manufacturerByBatchCodeTable.drop_duplicates(subset = ['VAX_LOT'])\n", " return manufacturerByBatchCodeTable.set_index('VAX_LOT')\n", "\n", - " def _convertColumnsOfDataFrameToNumerics(self, columns):\n", - " for column in columns:\n", - " self._convertColumnOfDataFrameToNumeric(column)\n", - "\n", - " def _convertColumnOfDataFrameToNumeric(self, column):\n", - " self.dataFrame[column] = np.where(self.dataFrame[column] == 'Y', 1, 0)\n", - "\n", " def _flattenColumns(self, batchCodeTable):\n", " batchCodeTable.columns = [\"_\".join(a) for a in batchCodeTable.columns.to_flat_index()]\n", "\n", @@ -225,9 +233,6 @@ " \n", " @staticmethod\n", " def getDoseTable(dataFrame):\n", - " # FK-TODO: _convertColumnsOfDataFrameToNumerics() sollte schon während des Einlesens aus den CSV-Dateien durchgeführt werden\n", - " # FK-TODO: bitte alle DataFrames als unmutable behandeln und nicht inplace ändern.\n", - " DoseAnalysis._convertColumnsOfDataFrameToNumerics(dataFrame, ['DIED', 'L_THREAT', 'DISABLE'])\n", " doseTable = dataFrame.groupby('VAX_DOSE_SERIES').agg(\n", " {\n", " 'DIED': ['sum', 'size'],\n", @@ -246,21 +251,6 @@ " doseTable['Severe reports (%)'] = (doseTable['Deaths'] + doseTable['Disabilities'] + doseTable['Life Threatening Illnesses']) / doseTable['Total reports'] * 100\n", " return doseTable\n", "\n", - " @staticmethod\n", - " def _count(dataFrame, column):\n", - " return len(dataFrame[dataFrame[column] == 'Y'])\n", - "\n", - " # FK-TODO: DRY with BatchCodeTableHelper\n", - " @staticmethod\n", - " def _convertColumnsOfDataFrameToNumerics(dataFrame, columns):\n", - " for column in columns:\n", - " DoseAnalysis._convertColumnOfDataFrameToNumeric(dataFrame, column)\n", - "\n", - " # FK-TODO: DRY with BatchCodeTableHelper\n", - " @staticmethod\n", - " def _convertColumnOfDataFrameToNumeric(dataFrame, column):\n", - " dataFrame[column] = np.where(dataFrame[column] == 'Y', 1, 0)\n", - "\n", " # FK-TODO: DRY with BatchCodeTableHelper\n", " @staticmethod\n", " def _flattenColumns(batchCodeTable):\n", @@ -296,8 +286,8 @@ " {\n", " 'VAERSDATA': self.createDataFrame(\n", " columns = ['DIED', 'L_THREAT', 'DISABLE'],\n", - " data = [ ['Y', np.NaN, np.NaN],\n", - " [np.NaN, np.NaN, 'Y']],\n", + " data = [ [1, 0, 0],\n", + " [0, 0, 1]],\n", " index = [\n", " \"0916600\",\n", " \"0916601\"]),\n", @@ -313,8 +303,8 @@ " {\n", " 'VAERSDATA': self.createDataFrame(\n", " columns = ['DIED', 'L_THREAT', 'DISABLE'],\n", - " data = [ [np.NaN, np.NaN, np.NaN],\n", - " [np.NaN, np.NaN, 'Y']],\n", + " data = [ [0, 0, 0],\n", + " [0, 0, 1]],\n", " index = [\n", " \"1996873\",\n", " \"1996874\"]),\n", @@ -334,10 +324,10 @@ " \n", " # Then\n", " dataFrameExpected = self.createDataFrame(\n", - " columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", - " data = [ ['Y', np.NaN, np.NaN, 'COVID19', 'MODERNA', '037K20A', '1'],\n", - " [np.NaN, np.NaN, 'Y', 'COVID19', 'MODERNA', '025L20A', '1'],\n", - " [np.NaN, np.NaN, 'Y', 'COVID19', 'MODERNA', '025L20A', '1']],\n", + " columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", + " data = [ [1, 0, 0, 'COVID19', 'MODERNA', '037K20A', '1'],\n", + " [0, 0, 1, 'COVID19', 'MODERNA', '025L20A', '1'],\n", + " [0, 0, 1, 'COVID19', 'MODERNA', '025L20A', '1']],\n", " index = [\n", " \"0916600\",\n", " \"0916601\",\n", @@ -353,8 +343,8 @@ " {\n", " 'VAERSDATA': self.createDataFrame(\n", " columns = ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'],\n", - " data = [ ['Y', 'Y', np.NaN, 'Y', 'Y'],\n", - " [np.NaN, np.NaN, 'Y', np.NaN, 'Y']],\n", + " data = [ [1, 1, 0, 1, 1],\n", + " [0, 0, 1, 0, 1]],\n", " index = [\n", " \"0916600\",\n", " \"0916601\"]),\n", @@ -375,8 +365,8 @@ " # Then\n", " dataFrameExpected = self.createDataFrame(\n", " columns = ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", - " data = [ ['Y', 'Y', np.NaN, 'Y', 'Y', 'COVID19', 'MODERNA', '037K20A', '1'],\n", - " [np.NaN, np.NaN, 'Y', np.NaN, 'Y', 'COVID19', 'PFIZER\\BIONTECH', '025L20A', '1']],\n", + " data = [ [1, 1, 0, 1, 1, 'COVID19', 'MODERNA', '037K20A', '1'],\n", + " [0, 0, 1, 0, 1, 'COVID19', 'PFIZER\\BIONTECH', '025L20A', '1']],\n", " index = [\n", " \"0916600\",\n", " \"0916601\"],\n", @@ -391,7 +381,7 @@ " {\n", " 'VAERSDATA': self.createDataFrame(\n", " columns = ['DIED', 'L_THREAT', 'DISABLE'],\n", - " data = [ ['Y', np.NaN, np.NaN]],\n", + " data = [ [1, 0, 0]],\n", " index = [\n", " \"1048786\"]),\n", " 'VAERSVAX': self.createDataFrame(\n", @@ -410,8 +400,8 @@ " \n", " # Then\n", " dataFrameExpected = self.createDataFrame(\n", - " columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", - " data = [ ['Y', np.NaN, np.NaN, 'COVID19', 'MODERNA', '030L20A', '1']],\n", + " columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", + " data = [ [1, 0, 0, 'COVID19', 'MODERNA', '030L20A', '1']],\n", " index = [\n", " \"1048786\"],\n", " dtypes = {'VAX_DOSE_SERIES': \"string\"})\n", @@ -425,7 +415,7 @@ " {\n", " 'VAERSDATA': self.createDataFrame(\n", " columns = ['DIED', 'L_THREAT', 'DISABLE'],\n", - " data = [ ['Y', np.NaN, np.NaN]],\n", + " data = [ [1, 0, 0]],\n", " index = [\n", " \"1048786\"]),\n", " 'VAERSVAX': self.createDataFrame(\n", @@ -444,8 +434,8 @@ " \n", " # Then\n", " dataFrameExpected = self.createDataFrame(\n", - " columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", - " data = [ ['Y', np.NaN, np.NaN, 'COVID19', 'MODERNA', '016M20A', '2']],\n", + " columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", + " data = [ [1, 0, 0, 'COVID19', 'MODERNA', '016M20A', '2']],\n", " index = [\n", " \"1048786\"],\n", " dtypes = {'VAX_DOSE_SERIES': \"string\"})\n", @@ -473,8 +463,8 @@ " {\n", " 'VAERSDATA': self.createDataFrame(\n", " columns = ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'],\n", - " data = [ ['Y', 'Y', np.NaN, 'Y', 'Y'],\n", - " [np.NaN, np.NaN, 'Y', np.NaN, 'Y']],\n", + " data = [ [1, 1, 0, 1, 1],\n", + " [0, 0, 1, 0, 1]],\n", " index = [\n", " \"0916600\",\n", " \"0916601\"]),\n", @@ -512,8 +502,8 @@ " {\n", " 'VAERSDATA': self.createDataFrame(\n", " columns = ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'],\n", - " data = [ ['Y', np.NaN, np.NaN, np.NaN, np.NaN],\n", - " [np.NaN, np.NaN, 'Y', np.NaN, np.NaN]],\n", + " data = [ [1, 0, 0, 0, 0],\n", + " [0, 0, 1, 0, 0]],\n", " index = [\n", " \"0916600\",\n", " \"0916601\"]),\n", @@ -529,8 +519,8 @@ " {\n", " 'VAERSDATA': self.createDataFrame(\n", " columns = ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'],\n", - " data = [ [np.NaN, np.NaN, np.NaN, np.NaN, np.NaN],\n", - " [np.NaN, np.NaN, 'Y', np.NaN, np.NaN]],\n", + " data = [ [0, 0, 0, 0, 0],\n", + " [0, 0, 1, 0, 0]],\n", " index = [\n", " \"1996873\",\n", " \"1996874\"]),\n", @@ -585,9 +575,9 @@ " # Given\n", " dataFrame = self.createDataFrame(\n", " columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", - " data = [ ['Y', np.NaN, np.NaN,\t 'COVID19', 'MODERNA', '016M20A', '2'],\n", - " ['Y', np.NaN, np.NaN, 'COVID19', 'MODERNA', '030L20A', '1'],\n", - " ['Y', 'Y', 'Y', 'COVID19', 'MODERNA', '030L20B', '1']],\n", + " data = [ [1, 0, 0, 'COVID19', 'MODERNA', '016M20A', '2'],\n", + " [1, 0, 0, 'COVID19', 'MODERNA', '030L20A', '1'],\n", + " [1, 1, 1, 'COVID19', 'MODERNA', '030L20B', '1']],\n", " index = [\n", " \"1048786\",\n", " \"1048786\",\n",