From 8a496bc4d97bd46eb80d3cd07e68174e63e53cf7 Mon Sep 17 00:00:00 2001 From: frankknoll Date: Fri, 28 Jan 2022 23:37:37 +0100 Subject: [PATCH] adding CreateDataFrameTest --- HowBadIsMyBatch.ipynb | 71 +++++++++++++++++++++++++++++++++++++++---- help.txt | 7 +++-- 2 files changed, 70 insertions(+), 8 deletions(-) diff --git a/HowBadIsMyBatch.ipynb b/HowBadIsMyBatch.ipynb index da10cf0a10f..1981eb61594 100644 --- a/HowBadIsMyBatch.ipynb +++ b/HowBadIsMyBatch.ipynb @@ -44,7 +44,7 @@ " folder = dataDir + \"/\" + year + \"VAERSData/\"\n", " return {\n", " 'VAERSDATA': read_csv(folder + year + \"VAERSDATA.csv\", ['VAERS_ID', 'DIED', 'L_THREAT', 'DISABLE']),\n", - " 'VAERSVAX': read_csv(folder + year + \"VAERSVAX.csv\", ['VAERS_ID', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'])\n", + " 'VAERSVAX': read_csv(folder + year + \"VAERSVAX.csv\", ['VAERS_ID', 'VAX_DOSE_SERIES', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'])\n", " }\n", "\n", " return _createDataFrame(\n", @@ -85,12 +85,71 @@ { "cell_type": "code", "execution_count": null, - "id": "e14465d7", "metadata": {}, "outputs": [], "source": [ "from pandas.testing import assert_frame_equal\n", "\n", + "class CreateDataFrameTest(unittest.TestCase):\n", + "\n", + " def test_createDataFrame(self):\n", + " # Given\n", + " vaersDescrs = [\n", + " {\n", + " 'VAERSDATA': self.createDataFrame(\n", + " [ 'DIED', 'L_THREAT', 'DISABLE'],\n", + " {\n", + " '0916600': ['Y', np.NaN, np.NaN],\n", + " '0916601': [np.NaN, np.NaN, 'Y']\n", + " }),\n", + " 'VAERSVAX': self.createDataFrame(\n", + " [ 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'],\n", + " {\n", + " '0916600': ['COVID19', 'MODERNA', '037K20A'],\n", + " '0916601': ['COVID19', 'MODERNA', '025L20A']\n", + " })\n", + " },\n", + " {\n", + " 'VAERSDATA': self.createDataFrame(\n", + " [ 'DIED', 'L_THREAT', 'DISABLE'],\n", + " {\n", + " '1996873': [np.NaN, np.NaN, np.NaN],\n", + " '1996874': [np.NaN, np.NaN, 'Y']\n", + " }),\n", + " 'VAERSVAX': self.createDataFrame(\n", + " [ 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'],\n", + " {\n", + " '1996873': ['HPV9', 'MERCK & CO. INC.', 'R017624'],\n", + " '1996874': ['COVID19', 'MODERNA', '025L20A']\n", + " })\n", + " }\n", + " ]\n", + " \n", + " # When\n", + " dataFrame = _createDataFrame(vaersDescrs, \"MODERNA\")\n", + " \n", + " # Then\n", + " dataFrameExpected = self.createDataFrame(\n", + " [ 'DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'],\n", + " {\n", + " '0916600': ['Y', np.NaN, np.NaN, 'COVID19', 'MODERNA', '037K20A'],\n", + " '0916601': [np.NaN, np.NaN, 'Y', 'COVID19', 'MODERNA', '025L20A'],\n", + " '1996874': [np.NaN, np.NaN, 'Y', 'COVID19', 'MODERNA', '025L20A']\n", + " })\n", + " assert_frame_equal(dataFrame, dataFrameExpected, check_dtype = False)\n", + "\n", + " def createDataFrame(self, columns, data):\n", + " return pd.DataFrame.from_dict(data, columns = columns, orient = 'index')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e14465d7", + "metadata": {}, + "outputs": [], + "source": [ + "from pandas.testing import assert_frame_equal\n", "\n", "class BatchCodeTableTest(unittest.TestCase):\n", "\n", @@ -135,18 +194,18 @@ "\n", " def _test_createBatchCodeTable(self, dataFrame):\n", " # When\n", - " batchCodeTable=createBatchCodeTable(dataFrame)\n", + " batchCodeTable = createBatchCodeTable(dataFrame)\n", "\n", " # Then\n", - " batchCodeTableExpected=pd.DataFrame(\n", + " batchCodeTableExpected = pd.DataFrame(\n", " data={\n", " 'ADRs': [2, 1],\n", " 'DEATHS': [0, 1],\n", " 'DISABILITIES': [2, 0],\n", " 'LIFE THREATENING ILLNESSES': [0, 0]\n", " },\n", - " index=pd.MultiIndex.from_arrays([['025L20A', '037K20A']], names = ('VAX_LOT',)))\n", - " assert_frame_equal(batchCodeTable, batchCodeTableExpected, check_dtype=False)\n", + " index = pd.MultiIndex.from_arrays([['025L20A', '037K20A']], names = ('VAX_LOT',)))\n", + " assert_frame_equal(batchCodeTable, batchCodeTableExpected, check_dtype = False)\n", "\n", " def createDataFrame(self, columns, data):\n", " return pd.DataFrame.from_dict(data, columns = columns, orient = 'index')\n" diff --git a/help.txt b/help.txt index 914857e48ee..201f932f4ba 100644 --- a/help.txt +++ b/help.txt @@ -6,7 +6,7 @@ FK-TODO: Repeat for second dose and third dose separately. The cumulative effect will then appear. It should be analysed separately anyway, because adverse reactions increase with each dose." # 1. filter the vax table first for just C19 vaccines - # 2. and for just n-tn (n \in {1, 2, 3}) dose => VAERSDATA --> VAERSVAX ist 1:1-Beziehung statt 1:n und kann einfacher in eine einzige Tabelle gemergt werden + # 2. and for just n-th (VAERSVAX.VAX_DOSE_SERIES == n \in {1, 2, 3}) dose => VAERSDATA --> VAERSVAX ist 1:1-Beziehung statt 1:n und kann einfacher in eine einzige Tabelle gemergt werden # 3. filter for manufacturer - Prüfe, ob die VAERS_ID wirklich eindeutig ist. Antwort: VAERS_ID ist in der VAERSVAX-Tabelle nicht eindeutig, da es mehrere Impfungen pro Person geben kann. - VAX_LOT-Spalte normalisieren, d.h. mindestens toUpperCase() darauf anwenden @@ -15,7 +15,7 @@ FK-TODO: MOD039K20A #039K20A 039K20A-MODERNA -039K20A-2A +039K20A-2A (vielleicht nicht) 039K20A or 039L Moderna/039K20A MODERNA 039K20A @@ -34,3 +34,6 @@ u039k20a 039K20A & 031M2 039K20A and 032 039K20A, 011L20 + + +df[df.index.duplicated(False)].to_excel('results/pfizer_duplicates.xlsx') \ No newline at end of file