refactoring

This commit is contained in:
frankknoll
2022-02-05 14:33:37 +01:00
parent 0f3bac1b46
commit e7663350c1

View File

@@ -37,25 +37,41 @@
" def readVaersDescr(self, year):\n", " def readVaersDescr(self, year):\n",
" folder = self.dataDir + \"/\" + year + \"VAERSData/\"\n", " folder = self.dataDir + \"/\" + year + \"VAERSData/\"\n",
" return {\n", " return {\n",
" 'VAERSDATA':\n", " 'VAERSDATA': self._readVAERSDATA(folder, year),\n",
" self._read_csv(\n", " 'VAERSVAX': self._readVAERSVAX(folder, year)\n",
" folder + year + \"VAERSDATA.csv\",\n",
" ['VAERS_ID', 'DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT']),\n",
" 'VAERSVAX':\n",
" self._read_csv(\n",
" folder + year + \"VAERSVAX.csv\",\n",
" ['VAERS_ID', 'VAX_DOSE_SERIES', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'],\n",
" dtype = {\"VAX_DOSE_SERIES\": \"string\"})\n",
" }\n", " }\n",
"\n", "\n",
" def _read_csv(self, file, usecols, dtype = {}):\n", " def _readVAERSDATA(self, folder, year):\n",
" VAERSDATA = self._read_csv(\n",
" file = folder + year + \"VAERSDATA.csv\",\n",
" usecols = ['VAERS_ID', 'DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'])\n",
" VaersDescrReader._convertColumnsOfDataFrameToNumerics(\n",
" VAERSDATA,\n",
" ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'])\n",
" return VAERSDATA\n",
"\n",
" def _readVAERSVAX(self, folder, year):\n",
" return self._read_csv(\n",
" file = folder + year + \"VAERSVAX.csv\",\n",
" usecols = ['VAERS_ID', 'VAX_DOSE_SERIES', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'],\n",
" dtype = {\"VAX_DOSE_SERIES\": \"string\"})\n",
"\n",
" def _read_csv(self, file, **kwargs):\n",
" return pd.read_csv(\n", " return pd.read_csv(\n",
" file,\n", " file,\n",
" index_col = 'VAERS_ID',\n", " index_col = 'VAERS_ID',\n",
" encoding = 'latin1',\n", " encoding = 'latin1',\n",
" low_memory = False,\n", " low_memory = False,\n",
" usecols = usecols,\n", " **kwargs)\n",
" dtype = dtype)\n" "\n",
" @staticmethod\n",
" def _convertColumnsOfDataFrameToNumerics(dataFrame, columns):\n",
" for column in columns:\n",
" VaersDescrReader._convertColumnOfDataFrameToNumeric(dataFrame, column)\n",
"\n",
" @staticmethod\n",
" def _convertColumnOfDataFrameToNumeric(dataFrame, column):\n",
" dataFrame[column] = np.where(dataFrame[column] == 'Y', 1, 0)\n"
] ]
}, },
{ {
@@ -128,7 +144,6 @@
" \n", " \n",
" def __init__(self, dataFrame : pd.DataFrame):\n", " def __init__(self, dataFrame : pd.DataFrame):\n",
" self.dataFrame = dataFrame \n", " self.dataFrame = dataFrame \n",
" self._convertColumnsOfDataFrameToNumerics(['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'])\n",
"\n", "\n",
" def createBatchCodeTable(self):\n", " def createBatchCodeTable(self):\n",
" batchCodeTable = self.dataFrame.groupby('VAX_LOT').agg(\n", " batchCodeTable = self.dataFrame.groupby('VAX_LOT').agg(\n",
@@ -189,13 +204,6 @@
" manufacturerByBatchCodeTable = manufacturerByBatchCodeTable.drop_duplicates(subset = ['VAX_LOT'])\n", " manufacturerByBatchCodeTable = manufacturerByBatchCodeTable.drop_duplicates(subset = ['VAX_LOT'])\n",
" return manufacturerByBatchCodeTable.set_index('VAX_LOT')\n", " return manufacturerByBatchCodeTable.set_index('VAX_LOT')\n",
"\n", "\n",
" def _convertColumnsOfDataFrameToNumerics(self, columns):\n",
" for column in columns:\n",
" self._convertColumnOfDataFrameToNumeric(column)\n",
"\n",
" def _convertColumnOfDataFrameToNumeric(self, column):\n",
" self.dataFrame[column] = np.where(self.dataFrame[column] == 'Y', 1, 0)\n",
"\n",
" def _flattenColumns(self, batchCodeTable):\n", " def _flattenColumns(self, batchCodeTable):\n",
" batchCodeTable.columns = [\"_\".join(a) for a in batchCodeTable.columns.to_flat_index()]\n", " batchCodeTable.columns = [\"_\".join(a) for a in batchCodeTable.columns.to_flat_index()]\n",
"\n", "\n",
@@ -225,9 +233,6 @@
" \n", " \n",
" @staticmethod\n", " @staticmethod\n",
" def getDoseTable(dataFrame):\n", " def getDoseTable(dataFrame):\n",
" # FK-TODO: _convertColumnsOfDataFrameToNumerics() sollte schon während des Einlesens aus den CSV-Dateien durchgeführt werden\n",
" # FK-TODO: bitte alle DataFrames als unmutable behandeln und nicht inplace ändern.\n",
" DoseAnalysis._convertColumnsOfDataFrameToNumerics(dataFrame, ['DIED', 'L_THREAT', 'DISABLE'])\n",
" doseTable = dataFrame.groupby('VAX_DOSE_SERIES').agg(\n", " doseTable = dataFrame.groupby('VAX_DOSE_SERIES').agg(\n",
" {\n", " {\n",
" 'DIED': ['sum', 'size'],\n", " 'DIED': ['sum', 'size'],\n",
@@ -246,21 +251,6 @@
" doseTable['Severe reports (%)'] = (doseTable['Deaths'] + doseTable['Disabilities'] + doseTable['Life Threatening Illnesses']) / doseTable['Total reports'] * 100\n", " doseTable['Severe reports (%)'] = (doseTable['Deaths'] + doseTable['Disabilities'] + doseTable['Life Threatening Illnesses']) / doseTable['Total reports'] * 100\n",
" return doseTable\n", " return doseTable\n",
"\n", "\n",
" @staticmethod\n",
" def _count(dataFrame, column):\n",
" return len(dataFrame[dataFrame[column] == 'Y'])\n",
"\n",
" # FK-TODO: DRY with BatchCodeTableHelper\n",
" @staticmethod\n",
" def _convertColumnsOfDataFrameToNumerics(dataFrame, columns):\n",
" for column in columns:\n",
" DoseAnalysis._convertColumnOfDataFrameToNumeric(dataFrame, column)\n",
"\n",
" # FK-TODO: DRY with BatchCodeTableHelper\n",
" @staticmethod\n",
" def _convertColumnOfDataFrameToNumeric(dataFrame, column):\n",
" dataFrame[column] = np.where(dataFrame[column] == 'Y', 1, 0)\n",
"\n",
" # FK-TODO: DRY with BatchCodeTableHelper\n", " # FK-TODO: DRY with BatchCodeTableHelper\n",
" @staticmethod\n", " @staticmethod\n",
" def _flattenColumns(batchCodeTable):\n", " def _flattenColumns(batchCodeTable):\n",
@@ -296,8 +286,8 @@
" {\n", " {\n",
" 'VAERSDATA': self.createDataFrame(\n", " 'VAERSDATA': self.createDataFrame(\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE'],\n", " columns = ['DIED', 'L_THREAT', 'DISABLE'],\n",
" data = [ ['Y', np.NaN, np.NaN],\n", " data = [ [1, 0, 0],\n",
" [np.NaN, np.NaN, 'Y']],\n", " [0, 0, 1]],\n",
" index = [\n", " index = [\n",
" \"0916600\",\n", " \"0916600\",\n",
" \"0916601\"]),\n", " \"0916601\"]),\n",
@@ -313,8 +303,8 @@
" {\n", " {\n",
" 'VAERSDATA': self.createDataFrame(\n", " 'VAERSDATA': self.createDataFrame(\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE'],\n", " columns = ['DIED', 'L_THREAT', 'DISABLE'],\n",
" data = [ [np.NaN, np.NaN, np.NaN],\n", " data = [ [0, 0, 0],\n",
" [np.NaN, np.NaN, 'Y']],\n", " [0, 0, 1]],\n",
" index = [\n", " index = [\n",
" \"1996873\",\n", " \"1996873\",\n",
" \"1996874\"]),\n", " \"1996874\"]),\n",
@@ -334,10 +324,10 @@
" \n", " \n",
" # Then\n", " # Then\n",
" dataFrameExpected = self.createDataFrame(\n", " dataFrameExpected = self.createDataFrame(\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", " columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n",
" data = [ ['Y', np.NaN, np.NaN, 'COVID19', 'MODERNA', '037K20A', '1'],\n", " data = [ [1, 0, 0, 'COVID19', 'MODERNA', '037K20A', '1'],\n",
" [np.NaN, np.NaN, 'Y', 'COVID19', 'MODERNA', '025L20A', '1'],\n", " [0, 0, 1, 'COVID19', 'MODERNA', '025L20A', '1'],\n",
" [np.NaN, np.NaN, 'Y', 'COVID19', 'MODERNA', '025L20A', '1']],\n", " [0, 0, 1, 'COVID19', 'MODERNA', '025L20A', '1']],\n",
" index = [\n", " index = [\n",
" \"0916600\",\n", " \"0916600\",\n",
" \"0916601\",\n", " \"0916601\",\n",
@@ -353,8 +343,8 @@
" {\n", " {\n",
" 'VAERSDATA': self.createDataFrame(\n", " 'VAERSDATA': self.createDataFrame(\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'],\n", " columns = ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'],\n",
" data = [ ['Y', 'Y', np.NaN, 'Y', 'Y'],\n", " data = [ [1, 1, 0, 1, 1],\n",
" [np.NaN, np.NaN, 'Y', np.NaN, 'Y']],\n", " [0, 0, 1, 0, 1]],\n",
" index = [\n", " index = [\n",
" \"0916600\",\n", " \"0916600\",\n",
" \"0916601\"]),\n", " \"0916601\"]),\n",
@@ -375,8 +365,8 @@
" # Then\n", " # Then\n",
" dataFrameExpected = self.createDataFrame(\n", " dataFrameExpected = self.createDataFrame(\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", " columns = ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n",
" data = [ ['Y', 'Y', np.NaN, 'Y', 'Y', 'COVID19', 'MODERNA', '037K20A', '1'],\n", " data = [ [1, 1, 0, 1, 1, 'COVID19', 'MODERNA', '037K20A', '1'],\n",
" [np.NaN, np.NaN, 'Y', np.NaN, 'Y', 'COVID19', 'PFIZER\\BIONTECH', '025L20A', '1']],\n", " [0, 0, 1, 0, 1, 'COVID19', 'PFIZER\\BIONTECH', '025L20A', '1']],\n",
" index = [\n", " index = [\n",
" \"0916600\",\n", " \"0916600\",\n",
" \"0916601\"],\n", " \"0916601\"],\n",
@@ -391,7 +381,7 @@
" {\n", " {\n",
" 'VAERSDATA': self.createDataFrame(\n", " 'VAERSDATA': self.createDataFrame(\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE'],\n", " columns = ['DIED', 'L_THREAT', 'DISABLE'],\n",
" data = [ ['Y', np.NaN, np.NaN]],\n", " data = [ [1, 0, 0]],\n",
" index = [\n", " index = [\n",
" \"1048786\"]),\n", " \"1048786\"]),\n",
" 'VAERSVAX': self.createDataFrame(\n", " 'VAERSVAX': self.createDataFrame(\n",
@@ -410,8 +400,8 @@
" \n", " \n",
" # Then\n", " # Then\n",
" dataFrameExpected = self.createDataFrame(\n", " dataFrameExpected = self.createDataFrame(\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", " columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n",
" data = [ ['Y', np.NaN, np.NaN, 'COVID19', 'MODERNA', '030L20A', '1']],\n", " data = [ [1, 0, 0, 'COVID19', 'MODERNA', '030L20A', '1']],\n",
" index = [\n", " index = [\n",
" \"1048786\"],\n", " \"1048786\"],\n",
" dtypes = {'VAX_DOSE_SERIES': \"string\"})\n", " dtypes = {'VAX_DOSE_SERIES': \"string\"})\n",
@@ -425,7 +415,7 @@
" {\n", " {\n",
" 'VAERSDATA': self.createDataFrame(\n", " 'VAERSDATA': self.createDataFrame(\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE'],\n", " columns = ['DIED', 'L_THREAT', 'DISABLE'],\n",
" data = [ ['Y', np.NaN, np.NaN]],\n", " data = [ [1, 0, 0]],\n",
" index = [\n", " index = [\n",
" \"1048786\"]),\n", " \"1048786\"]),\n",
" 'VAERSVAX': self.createDataFrame(\n", " 'VAERSVAX': self.createDataFrame(\n",
@@ -444,8 +434,8 @@
" \n", " \n",
" # Then\n", " # Then\n",
" dataFrameExpected = self.createDataFrame(\n", " dataFrameExpected = self.createDataFrame(\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", " columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n",
" data = [ ['Y', np.NaN, np.NaN, 'COVID19', 'MODERNA', '016M20A', '2']],\n", " data = [ [1, 0, 0, 'COVID19', 'MODERNA', '016M20A', '2']],\n",
" index = [\n", " index = [\n",
" \"1048786\"],\n", " \"1048786\"],\n",
" dtypes = {'VAX_DOSE_SERIES': \"string\"})\n", " dtypes = {'VAX_DOSE_SERIES': \"string\"})\n",
@@ -473,8 +463,8 @@
" {\n", " {\n",
" 'VAERSDATA': self.createDataFrame(\n", " 'VAERSDATA': self.createDataFrame(\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'],\n", " columns = ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'],\n",
" data = [ ['Y', 'Y', np.NaN, 'Y', 'Y'],\n", " data = [ [1, 1, 0, 1, 1],\n",
" [np.NaN, np.NaN, 'Y', np.NaN, 'Y']],\n", " [0, 0, 1, 0, 1]],\n",
" index = [\n", " index = [\n",
" \"0916600\",\n", " \"0916600\",\n",
" \"0916601\"]),\n", " \"0916601\"]),\n",
@@ -512,8 +502,8 @@
" {\n", " {\n",
" 'VAERSDATA': self.createDataFrame(\n", " 'VAERSDATA': self.createDataFrame(\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'],\n", " columns = ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'],\n",
" data = [ ['Y', np.NaN, np.NaN, np.NaN, np.NaN],\n", " data = [ [1, 0, 0, 0, 0],\n",
" [np.NaN, np.NaN, 'Y', np.NaN, np.NaN]],\n", " [0, 0, 1, 0, 0]],\n",
" index = [\n", " index = [\n",
" \"0916600\",\n", " \"0916600\",\n",
" \"0916601\"]),\n", " \"0916601\"]),\n",
@@ -529,8 +519,8 @@
" {\n", " {\n",
" 'VAERSDATA': self.createDataFrame(\n", " 'VAERSDATA': self.createDataFrame(\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'],\n", " columns = ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'],\n",
" data = [ [np.NaN, np.NaN, np.NaN, np.NaN, np.NaN],\n", " data = [ [0, 0, 0, 0, 0],\n",
" [np.NaN, np.NaN, 'Y', np.NaN, np.NaN]],\n", " [0, 0, 1, 0, 0]],\n",
" index = [\n", " index = [\n",
" \"1996873\",\n", " \"1996873\",\n",
" \"1996874\"]),\n", " \"1996874\"]),\n",
@@ -585,9 +575,9 @@
" # Given\n", " # Given\n",
" dataFrame = self.createDataFrame(\n", " dataFrame = self.createDataFrame(\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", " columns = ['DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n",
" data = [ ['Y', np.NaN, np.NaN,\t 'COVID19', 'MODERNA', '016M20A', '2'],\n", " data = [ [1, 0, 0, 'COVID19', 'MODERNA', '016M20A', '2'],\n",
" ['Y', np.NaN, np.NaN, 'COVID19', 'MODERNA', '030L20A', '1'],\n", " [1, 0, 0, 'COVID19', 'MODERNA', '030L20A', '1'],\n",
" ['Y', 'Y', 'Y', 'COVID19', 'MODERNA', '030L20B', '1']],\n", " [1, 1, 1, 'COVID19', 'MODERNA', '030L20B', '1']],\n",
" index = [\n", " index = [\n",
" \"1048786\",\n", " \"1048786\",\n",
" \"1048786\",\n", " \"1048786\",\n",