diff --git a/HowBadIsMyBatch.ipynb b/HowBadIsMyBatch.ipynb index f4b55ffb574..79553e9ac2d 100644 --- a/HowBadIsMyBatch.ipynb +++ b/HowBadIsMyBatch.ipynb @@ -21,16 +21,16 @@ "metadata": {}, "outputs": [], "source": [ + "def filter(df, manufacturer, dose):\n", + " return df[\n", + " (df[\"VAX_TYPE\"] == \"COVID19\") &\n", + " (df[\"VAX_MANU\"] == manufacturer) &\n", + " (df[\"VAX_DOSE_SERIES\"].str.contains(dose))]\n", + "\n", "def createDataFrameFromDescr(vaersDescr, manufacturer, dose):\n", - " def filter(df):\n", - " return df[\n", - " (df[\"VAX_TYPE\"] == \"COVID19\") &\n", - " (df[\"VAX_MANU\"] == manufacturer) &\n", - " (df[\"VAX_DOSE_SERIES\"].str.contains(dose))]\n", - " \n", " return pd.merge(\n", " vaersDescr['VAERSDATA'],\n", - " filter(vaersDescr['VAERSVAX']),\n", + " filter(vaersDescr['VAERSVAX'], manufacturer, dose),\n", " left_index = True,\n", " right_index = True)\n", "\n", @@ -47,32 +47,32 @@ "metadata": {}, "outputs": [], "source": [ - "def createDataFrameFromFiles(dataDir, manufacturer, dose):\n", - " def readVaersDescr(year):\n", - " def read_csv(file, usecols, dtype = {}):\n", - " return pd.read_csv(\n", - " file,\n", - " index_col = 'VAERS_ID',\n", - " encoding = 'latin1',\n", - " low_memory = False,\n", - " usecols = usecols,\n", - " dtype = dtype)\n", + "def read_csv(file, usecols, dtype = {}):\n", + " return pd.read_csv(\n", + " file,\n", + " index_col = 'VAERS_ID',\n", + " encoding = 'latin1',\n", + " low_memory = False,\n", + " usecols = usecols,\n", + " dtype = dtype)\n", "\n", - " folder = dataDir + \"/\" + year + \"VAERSData/\"\n", - " return {\n", - " 'VAERSDATA':\n", + "def readVaersDescr(dataDir, year):\n", + " folder = dataDir + \"/\" + year + \"VAERSData/\"\n", + " return {\n", + " 'VAERSDATA':\n", + " read_csv(\n", + " folder + year + \"VAERSDATA.csv\",\n", + " ['VAERS_ID', 'DIED', 'L_THREAT', 'DISABLE']),\n", + " 'VAERSVAX':\n", " read_csv(\n", - " folder + year + \"VAERSDATA.csv\",\n", - " ['VAERS_ID', 'DIED', 'L_THREAT', 'DISABLE']),\n", - " 'VAERSVAX':\n", - " read_csv(\n", - " folder + year + \"VAERSVAX.csv\",\n", - " ['VAERS_ID', 'VAX_DOSE_SERIES', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'],\n", - " dtype = {\"VAX_DOSE_SERIES\": \"string\"})\n", - " }\n", + " folder + year + \"VAERSVAX.csv\",\n", + " ['VAERS_ID', 'VAX_DOSE_SERIES', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'],\n", + " dtype = {\"VAX_DOSE_SERIES\": \"string\"})\n", + " }\n", "\n", + "def createDataFrameFromFiles(dataDir, manufacturer, dose):\n", " return createDataFrameFromDescrs(\n", - " [readVaersDescr(\"2021\"), readVaersDescr(\"2022\")],\n", + " [readVaersDescr(dataDir, \"2021\"), readVaersDescr(dataDir, \"2022\")],\n", " manufacturer,\n", " dose)" ] @@ -110,6 +110,7 @@ { "cell_type": "code", "execution_count": null, + "id": "e59a1825", "metadata": {}, "outputs": [], "source": [ @@ -123,9 +124,9 @@ " {\n", " 'VAERSDATA': self.createDataFrame(\n", " index = [\"0916600\", \"0916601\"],\n", - " columns = ['DIED', 'L_THREAT', 'DISABLE'],\n", - " data = [ ['Y', np.NaN, np.NaN],\n", - " [np.NaN, np.NaN, 'Y']]),\n", + " columns = ['DIED', 'L_THREAT', 'DISABLE'],\n", + " data = [ ['Y', np.NaN, np.NaN],\n", + " [np.NaN, np.NaN, 'Y']]),\n", " 'VAERSVAX': self.createDataFrame(\n", " index = [\"0916600\", \"0916601\"],\n", " columns = ['VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", @@ -167,13 +168,13 @@ " {\n", " 'VAERSDATA': self.createDataFrame(\n", " index = [\"1048786\"],\n", - " columns = ['DIED', 'L_THREAT', 'DISABLE'],\n", + " columns = ['DIED', 'L_THREAT', 'DISABLE'],\n", " data = [ ['Y', np.NaN, np.NaN]]),\n", " 'VAERSVAX': self.createDataFrame(\n", " index = [\"1048786\", \"1048786\"],\n", " columns = ['VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", - " data = [ ['COVID19', 'MODERNA', '016M20A', '2'],\n", - " ['COVID19', 'MODERNA', '030L20A', '1']],\n", + " data = [ ['COVID19', 'MODERNA', '016M20A', '2'],\n", + " ['COVID19', 'MODERNA', '030L20A', '1']],\n", " dtypes = {'VAX_DOSE_SERIES': \"string\"})\n", " }\n", " ]\n", @@ -195,8 +196,8 @@ " {\n", " 'VAERSDATA': self.createDataFrame(\n", " index = [\"1048786\"],\n", - " columns = ['DIED', 'L_THREAT', 'DISABLE'],\n", - " data = [ ['Y', np.NaN, np.NaN]]),\n", + " columns = ['DIED', 'L_THREAT', 'DISABLE'],\n", + " data = [ ['Y', np.NaN, np.NaN]]),\n", " 'VAERSVAX': self.createDataFrame(\n", " index = [\"1048786\", \"1048786\"],\n", " columns = ['VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", @@ -238,9 +239,9 @@ " {\n", " 'VAERSDATA': self.createDataFrame(\n", " index = [\"0916600\", \"0916601\"],\n", - " columns = ['DIED', 'L_THREAT', 'DISABLE'],\n", - " data = [ ['Y', np.NaN, np.NaN],\n", - " [np.NaN, np.NaN, 'Y']]),\n", + " columns = ['DIED', 'L_THREAT', 'DISABLE'],\n", + " data = [ ['Y', np.NaN, np.NaN],\n", + " [np.NaN, np.NaN, 'Y']]),\n", " 'VAERSVAX': self.createDataFrame(\n", " index = [\"0916600\", \"0916601\"],\n", " columns = ['VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n", @@ -252,8 +253,8 @@ " 'VAERSDATA': self.createDataFrame(\n", " index = [\"1996873\", \"1996874\"],\n", " columns = ['DIED', 'L_THREAT', 'DISABLE'],\n", - " data = [ [np.NaN, np.NaN, np.NaN],\n", - " [np.NaN, np.NaN, 'Y']]),\n", + " data = [ [np.NaN, np.NaN, np.NaN],\n", + " [np.NaN, np.NaN, 'Y']]),\n", " 'VAERSVAX': self.createDataFrame(\n", " index = [\"1996873\", \"1996874\"],\n", " columns = ['VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],\n",