adding CreateDataFrameTest

This commit is contained in:
frankknoll
2022-01-28 23:37:37 +01:00
parent a40f3fe038
commit 8a496bc4d9
2 changed files with 70 additions and 8 deletions

View File

@@ -44,7 +44,7 @@
" folder = dataDir + \"/\" + year + \"VAERSData/\"\n", " folder = dataDir + \"/\" + year + \"VAERSData/\"\n",
" return {\n", " return {\n",
" 'VAERSDATA': read_csv(folder + year + \"VAERSDATA.csv\", ['VAERS_ID', 'DIED', 'L_THREAT', 'DISABLE']),\n", " 'VAERSDATA': read_csv(folder + year + \"VAERSDATA.csv\", ['VAERS_ID', 'DIED', 'L_THREAT', 'DISABLE']),\n",
" 'VAERSVAX': read_csv(folder + year + \"VAERSVAX.csv\", ['VAERS_ID', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'])\n", " 'VAERSVAX': read_csv(folder + year + \"VAERSVAX.csv\", ['VAERS_ID', 'VAX_DOSE_SERIES', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'])\n",
" }\n", " }\n",
"\n", "\n",
" return _createDataFrame(\n", " return _createDataFrame(\n",
@@ -85,12 +85,71 @@
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"id": "e14465d7",
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"from pandas.testing import assert_frame_equal\n", "from pandas.testing import assert_frame_equal\n",
"\n", "\n",
"class CreateDataFrameTest(unittest.TestCase):\n",
"\n",
" def test_createDataFrame(self):\n",
" # Given\n",
" vaersDescrs = [\n",
" {\n",
" 'VAERSDATA': self.createDataFrame(\n",
" [ 'DIED', 'L_THREAT', 'DISABLE'],\n",
" {\n",
" '0916600': ['Y', np.NaN, np.NaN],\n",
" '0916601': [np.NaN, np.NaN, 'Y']\n",
" }),\n",
" 'VAERSVAX': self.createDataFrame(\n",
" [ 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'],\n",
" {\n",
" '0916600': ['COVID19', 'MODERNA', '037K20A'],\n",
" '0916601': ['COVID19', 'MODERNA', '025L20A']\n",
" })\n",
" },\n",
" {\n",
" 'VAERSDATA': self.createDataFrame(\n",
" [ 'DIED', 'L_THREAT', 'DISABLE'],\n",
" {\n",
" '1996873': [np.NaN, np.NaN, np.NaN],\n",
" '1996874': [np.NaN, np.NaN, 'Y']\n",
" }),\n",
" 'VAERSVAX': self.createDataFrame(\n",
" [ 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'],\n",
" {\n",
" '1996873': ['HPV9', 'MERCK & CO. INC.', 'R017624'],\n",
" '1996874': ['COVID19', 'MODERNA', '025L20A']\n",
" })\n",
" }\n",
" ]\n",
" \n",
" # When\n",
" dataFrame = _createDataFrame(vaersDescrs, \"MODERNA\")\n",
" \n",
" # Then\n",
" dataFrameExpected = self.createDataFrame(\n",
" [ 'DIED', 'L_THREAT', 'DISABLE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'],\n",
" {\n",
" '0916600': ['Y', np.NaN, np.NaN, 'COVID19', 'MODERNA', '037K20A'],\n",
" '0916601': [np.NaN, np.NaN, 'Y', 'COVID19', 'MODERNA', '025L20A'],\n",
" '1996874': [np.NaN, np.NaN, 'Y', 'COVID19', 'MODERNA', '025L20A']\n",
" })\n",
" assert_frame_equal(dataFrame, dataFrameExpected, check_dtype = False)\n",
"\n",
" def createDataFrame(self, columns, data):\n",
" return pd.DataFrame.from_dict(data, columns = columns, orient = 'index')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e14465d7",
"metadata": {},
"outputs": [],
"source": [
"from pandas.testing import assert_frame_equal\n",
"\n", "\n",
"class BatchCodeTableTest(unittest.TestCase):\n", "class BatchCodeTableTest(unittest.TestCase):\n",
"\n", "\n",
@@ -135,18 +194,18 @@
"\n", "\n",
" def _test_createBatchCodeTable(self, dataFrame):\n", " def _test_createBatchCodeTable(self, dataFrame):\n",
" # When\n", " # When\n",
" batchCodeTable=createBatchCodeTable(dataFrame)\n", " batchCodeTable = createBatchCodeTable(dataFrame)\n",
"\n", "\n",
" # Then\n", " # Then\n",
" batchCodeTableExpected=pd.DataFrame(\n", " batchCodeTableExpected = pd.DataFrame(\n",
" data={\n", " data={\n",
" 'ADRs': [2, 1],\n", " 'ADRs': [2, 1],\n",
" 'DEATHS': [0, 1],\n", " 'DEATHS': [0, 1],\n",
" 'DISABILITIES': [2, 0],\n", " 'DISABILITIES': [2, 0],\n",
" 'LIFE THREATENING ILLNESSES': [0, 0]\n", " 'LIFE THREATENING ILLNESSES': [0, 0]\n",
" },\n", " },\n",
" index=pd.MultiIndex.from_arrays([['025L20A', '037K20A']], names = ('VAX_LOT',)))\n", " index = pd.MultiIndex.from_arrays([['025L20A', '037K20A']], names = ('VAX_LOT',)))\n",
" assert_frame_equal(batchCodeTable, batchCodeTableExpected, check_dtype=False)\n", " assert_frame_equal(batchCodeTable, batchCodeTableExpected, check_dtype = False)\n",
"\n", "\n",
" def createDataFrame(self, columns, data):\n", " def createDataFrame(self, columns, data):\n",
" return pd.DataFrame.from_dict(data, columns = columns, orient = 'index')\n" " return pd.DataFrame.from_dict(data, columns = columns, orient = 'index')\n"

View File

@@ -6,7 +6,7 @@ FK-TODO:
Repeat for second dose and third dose separately. The cumulative effect will then appear. Repeat for second dose and third dose separately. The cumulative effect will then appear.
It should be analysed separately anyway, because adverse reactions increase with each dose." It should be analysed separately anyway, because adverse reactions increase with each dose."
# 1. filter the vax table first for just C19 vaccines # 1. filter the vax table first for just C19 vaccines
# 2. and for just n-tn (n \in {1, 2, 3}) dose => VAERSDATA --> VAERSVAX ist 1:1-Beziehung statt 1:n und kann einfacher in eine einzige Tabelle gemergt werden # 2. and for just n-th (VAERSVAX.VAX_DOSE_SERIES == n \in {1, 2, 3}) dose => VAERSDATA --> VAERSVAX ist 1:1-Beziehung statt 1:n und kann einfacher in eine einzige Tabelle gemergt werden
# 3. filter for manufacturer # 3. filter for manufacturer
- Prüfe, ob die VAERS_ID wirklich eindeutig ist. Antwort: VAERS_ID ist in der VAERSVAX-Tabelle nicht eindeutig, da es mehrere Impfungen pro Person geben kann. - Prüfe, ob die VAERS_ID wirklich eindeutig ist. Antwort: VAERS_ID ist in der VAERSVAX-Tabelle nicht eindeutig, da es mehrere Impfungen pro Person geben kann.
- VAX_LOT-Spalte normalisieren, d.h. mindestens toUpperCase() darauf anwenden - VAX_LOT-Spalte normalisieren, d.h. mindestens toUpperCase() darauf anwenden
@@ -15,7 +15,7 @@ FK-TODO:
MOD039K20A MOD039K20A
#039K20A #039K20A
039K20A-MODERNA 039K20A-MODERNA
039K20A-2A 039K20A-2A (vielleicht nicht)
039K20A or 039L 039K20A or 039L
Moderna/039K20A Moderna/039K20A
MODERNA 039K20A MODERNA 039K20A
@@ -34,3 +34,6 @@ u039k20a
039K20A & 031M2 039K20A & 031M2
039K20A and 032 039K20A and 032
039K20A, 011L20 039K20A, 011L20
df[df.index.duplicated(False)].to_excel('results/pfizer_duplicates.xlsx')