refactoring

This commit is contained in:
frankknoll
2022-01-29 15:03:37 +01:00
parent ae7cb41dac
commit d860b2d630

View File

@@ -21,16 +21,16 @@
"metadata": {},
"outputs": [],
"source": [
"def filter(df, manufacturer, dose):\n",
" return df[\n",
" (df[\"VAX_TYPE\"] == \"COVID19\") &\n",
" (df[\"VAX_MANU\"] == manufacturer) &\n",
" (df[\"VAX_DOSE_SERIES\"].str.contains(dose))]\n",
"\n",
"def createDataFrameFromDescr(vaersDescr, manufacturer, dose):\n",
" def filter(df):\n",
" return df[\n",
" (df[\"VAX_TYPE\"] == \"COVID19\") &\n",
" (df[\"VAX_MANU\"] == manufacturer) &\n",
" (df[\"VAX_DOSE_SERIES\"].str.contains(dose))]\n",
" \n",
" return pd.merge(\n",
" vaersDescr['VAERSDATA'],\n",
" filter(vaersDescr['VAERSVAX']),\n",
" filter(vaersDescr['VAERSVAX'], manufacturer, dose),\n",
" left_index = True,\n",
" right_index = True)\n",
"\n",
@@ -47,32 +47,32 @@
"metadata": {},
"outputs": [],
"source": [
"def createDataFrameFromFiles(dataDir, manufacturer, dose):\n",
" def readVaersDescr(year):\n",
" def read_csv(file, usecols, dtype = {}):\n",
" return pd.read_csv(\n",
" file,\n",
" index_col = 'VAERS_ID',\n",
" encoding = 'latin1',\n",
" low_memory = False,\n",
" usecols = usecols,\n",
" dtype = dtype)\n",
"def read_csv(file, usecols, dtype = {}):\n",
" return pd.read_csv(\n",
" file,\n",
" index_col = 'VAERS_ID',\n",
" encoding = 'latin1',\n",
" low_memory = False,\n",
" usecols = usecols,\n",
" dtype = dtype)\n",
"\n",
" folder = dataDir + \"/\" + year + \"VAERSData/\"\n",
" return {\n",
" 'VAERSDATA':\n",
"def readVaersDescr(dataDir, year):\n",
" folder = dataDir + \"/\" + year + \"VAERSData/\"\n",
" return {\n",
" 'VAERSDATA':\n",
" read_csv(\n",
" folder + year + \"VAERSDATA.csv\",\n",
" ['VAERS_ID', 'DIED', 'L_THREAT', 'DISABLE']),\n",
" 'VAERSVAX':\n",
" read_csv(\n",
" folder + year + \"VAERSDATA.csv\",\n",
" ['VAERS_ID', 'DIED', 'L_THREAT', 'DISABLE']),\n",
" 'VAERSVAX':\n",
" read_csv(\n",
" folder + year + \"VAERSVAX.csv\",\n",
" ['VAERS_ID', 'VAX_DOSE_SERIES', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'],\n",
" dtype = {\"VAX_DOSE_SERIES\": \"string\"})\n",
" }\n",
" folder + year + \"VAERSVAX.csv\",\n",
" ['VAERS_ID', 'VAX_DOSE_SERIES', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'],\n",
" dtype = {\"VAX_DOSE_SERIES\": \"string\"})\n",
" }\n",
"\n",
"def createDataFrameFromFiles(dataDir, manufacturer, dose):\n",
" return createDataFrameFromDescrs(\n",
" [readVaersDescr(\"2021\"), readVaersDescr(\"2022\")],\n",
" [readVaersDescr(dataDir, \"2021\"), readVaersDescr(dataDir, \"2022\")],\n",
" manufacturer,\n",
" dose)"
]
@@ -110,6 +110,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "e59a1825",
"metadata": {},
"outputs": [],
"source": [
@@ -123,9 +124,9 @@
" {\n",
" 'VAERSDATA': self.createDataFrame(\n",
" index = [\"0916600\", \"0916601\"],\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE'],\n",
" data = [ ['Y', np.NaN, np.NaN],\n",
" [np.NaN, np.NaN, 'Y']]),\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE'],\n",
" data = [ ['Y', np.NaN, np.NaN],\n",
" [np.NaN, np.NaN, 'Y']]),\n",
" 'VAERSVAX': self.createDataFrame(\n",
" index = [\"0916600\", \"0916601\"],\n",
" columns = ['VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n",
@@ -167,13 +168,13 @@
" {\n",
" 'VAERSDATA': self.createDataFrame(\n",
" index = [\"1048786\"],\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE'],\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE'],\n",
" data = [ ['Y', np.NaN, np.NaN]]),\n",
" 'VAERSVAX': self.createDataFrame(\n",
" index = [\"1048786\", \"1048786\"],\n",
" columns = ['VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n",
" data = [ ['COVID19', 'MODERNA', '016M20A', '2'],\n",
" ['COVID19', 'MODERNA', '030L20A', '1']],\n",
" data = [ ['COVID19', 'MODERNA', '016M20A', '2'],\n",
" ['COVID19', 'MODERNA', '030L20A', '1']],\n",
" dtypes = {'VAX_DOSE_SERIES': \"string\"})\n",
" }\n",
" ]\n",
@@ -195,8 +196,8 @@
" {\n",
" 'VAERSDATA': self.createDataFrame(\n",
" index = [\"1048786\"],\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE'],\n",
" data = [ ['Y', np.NaN, np.NaN]]),\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE'],\n",
" data = [ ['Y', np.NaN, np.NaN]]),\n",
" 'VAERSVAX': self.createDataFrame(\n",
" index = [\"1048786\", \"1048786\"],\n",
" columns = ['VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n",
@@ -238,9 +239,9 @@
" {\n",
" 'VAERSDATA': self.createDataFrame(\n",
" index = [\"0916600\", \"0916601\"],\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE'],\n",
" data = [ ['Y', np.NaN, np.NaN],\n",
" [np.NaN, np.NaN, 'Y']]),\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE'],\n",
" data = [ ['Y', np.NaN, np.NaN],\n",
" [np.NaN, np.NaN, 'Y']]),\n",
" 'VAERSVAX': self.createDataFrame(\n",
" index = [\"0916600\", \"0916601\"],\n",
" columns = ['VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n",
@@ -252,8 +253,8 @@
" 'VAERSDATA': self.createDataFrame(\n",
" index = [\"1996873\", \"1996874\"],\n",
" columns = ['DIED', 'L_THREAT', 'DISABLE'],\n",
" data = [ [np.NaN, np.NaN, np.NaN],\n",
" [np.NaN, np.NaN, 'Y']]),\n",
" data = [ [np.NaN, np.NaN, np.NaN],\n",
" [np.NaN, np.NaN, 'Y']]),\n",
" 'VAERSVAX': self.createDataFrame(\n",
" index = [\"1996873\", \"1996874\"],\n",
" columns = ['VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n",