From 0b71acd06d791a0ae92147966d60fd04ce25fd4e Mon Sep 17 00:00:00 2001
From: frankknoll
Date: Fri, 20 Jan 2023 00:13:48 +0100
Subject: [PATCH] refining SymptomsByBatchcodesTableFactoryTest

---
Review notes (this area is not part of the commit message):
  * The line structure of this patch was restored from a copy whose newlines
    and indentation had been collapsed. Leading whitespace is a best-effort
    reconstruction; if a hunk fails to match, apply it with
    `git apply --ignore-whitespace`.
  * The added test literals 'PFIZER\BIONTECH' are now spelled
    'PFIZER\\BIONTECH'. The string value is unchanged; only the invalid
    escape sequence is gone. The blob ids on the "index" line of the test
    file were not recomputed after this edit.

 src/SymptomsByBatchcodesTableFactory.py     |  11 +-
 src/SymptomsByBatchcodesTableFactoryTest.py | 111 ++++++++++++++++++++
 2 files changed, 118 insertions(+), 4 deletions(-)

diff --git a/src/SymptomsByBatchcodesTableFactory.py b/src/SymptomsByBatchcodesTableFactory.py
index 3560d53644f..d61b8b1b1fe 100644
--- a/src/SymptomsByBatchcodesTableFactory.py
+++ b/src/SymptomsByBatchcodesTableFactory.py
@@ -1,5 +1,5 @@
 import pandas as pd
-
+import numpy as np
 
 class SymptomsByBatchcodesTableFactory:
 
@@ -14,13 +14,13 @@ class SymptomsByBatchcodesTableFactory:
     def _get_VAERSVAX_WITH_VAX_LOTS(VAERSVAX):
         return pd.concat(
             [VAERSVAX, SymptomsByBatchcodesTableFactory._getVaxLotsTable(VAERSVAX)],
-            axis=1).drop_duplicates(subset=['VAX_LOT1', 'VAX_LOT2']).reset_index()
+            axis='columns').reset_index().drop_duplicates(subset=['VAERS_ID', 'VAX_LOT1', 'VAX_LOT2'])
 
     @staticmethod
     def _getVaxLotsTable(VAERSVAX):
         VAX_LOT_LIST_Table = VAERSVAX.groupby("VAERS_ID").agg(VAX_LOT_LIST = pd.NamedAgg(column = 'VAX_LOT', aggfunc = list))
         return pd.DataFrame(
-            VAX_LOT_LIST_Table['VAX_LOT_LIST'].tolist(),
+            [fill(VAX_LOTS, 2, str(np.nan)) for VAX_LOTS in VAX_LOT_LIST_Table['VAX_LOT_LIST'].tolist()],
             columns = ['VAX_LOT1', 'VAX_LOT2'],
             index = VAX_LOT_LIST_Table.index)
 
@@ -33,4 +33,7 @@ class SymptomsByBatchcodesTableFactory:
                 VAERSSYMPTOMS['SYMPTOM3'],
                 VAERSSYMPTOMS['SYMPTOM4'],
                 VAERSSYMPTOMS['SYMPTOM5']
-            ]).dropna().drop_duplicates().to_frame(name = "SYMPTOMS").reset_index()
+            ]).dropna().to_frame(name = "SYMPTOMS").reset_index()
+
+def fill(lst, desiredLen, fillValue):
+    return lst + [fillValue] * (max(desiredLen - len(lst), 0))
\ No newline at end of file
diff --git a/src/SymptomsByBatchcodesTableFactoryTest.py b/src/SymptomsByBatchcodesTableFactoryTest.py
index 4ea1b66bd24..7c8dbe136b2 100644
--- a/src/SymptomsByBatchcodesTableFactoryTest.py
+++ b/src/SymptomsByBatchcodesTableFactoryTest.py
@@ -57,3 +57,114 @@ class SymptomsByBatchcodesTableFactoryTest(unittest.TestCase):
                 index = pd.MultiIndex.from_tuples(
                     names = ['VAX_LOT1', 'VAX_LOT2'],
                     tuples = [['1808982', 'EW0175']] * 13)))
+
+    def test_createSymptomsByBatchcodesTable_two_patients_same_symptoms(self):
+        # Given
+        VAERSVAX = TestHelper.createDataFrame(
+            columns = ['VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],
+            data = [ ['COVID19', 'JANSSEN', 'EW0175', '1'],
+                     ['COVID19', 'JANSSEN', 'EW0175', '1']],
+            index = pd.Index(
+                name = 'VAERS_ID',
+                data=[
+                    2547730,
+                    2547731]),
+            dtypes = {
+                'VAX_DOSE_SERIES': 'string',
+                'VAX_LOT': 'string'})
+        VAERSSYMPTOMS = TestHelper.createDataFrame(
+            columns = ['SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5'],
+            data = [ ['Blood pressure orthostatic abnormal', np.nan, np.nan, np.nan, np.nan],
+                     ['Blood pressure orthostatic abnormal', np.nan, np.nan, np.nan, np.nan]],
+            index = pd.Index(
+                name = 'VAERS_ID',
+                data=[
+                    2547730,
+                    2547731]))
+
+        # When
+        symptomsByBatchcodesTable = SymptomsByBatchcodesTableFactory.createSymptomsByBatchcodesTable(VAERSVAX, VAERSSYMPTOMS)
+
+        # Then
+        assert_frame_equal(
+            symptomsByBatchcodesTable,
+            TestHelper.createDataFrame(
+                columns = ['SYMPTOMS'],
+                data = [ ['Blood pressure orthostatic abnormal'],
+                         ['Blood pressure orthostatic abnormal']],
+                index = pd.MultiIndex.from_tuples(
+                    names = ['VAX_LOT1', 'VAX_LOT2'],
+                    tuples = [['EW0175', str(np.nan)]] * 2)),
+            check_dtype = False)
+
+    def test_createSymptomsByBatchcodesTable_two_patients_distinct_symptoms(self):
+        # Given
+        VAERSVAX = TestHelper.createDataFrame(
+            columns = ['VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],
+            data = [ ['COVID19', 'JANSSEN', '1808982', 'UNK'],
+                     ['COVID19', 'PFIZER\\BIONTECH', 'EW0175', '1'],
+                     ['COVID19', 'PFIZER\\BIONTECH', 'EW0175', '1'],
+                     ['COVID19', 'PFIZER\\BIONTECH', 'EW0167', '2']],
+            index = pd.Index(
+                name = 'VAERS_ID',
+                data=[
+                    2547730,
+                    2547730,
+                    2547744,
+                    2547744]),
+            dtypes = {
+                'VAX_DOSE_SERIES': 'string',
+                'VAX_LOT': 'string'})
+        VAERSSYMPTOMS = TestHelper.createDataFrame(
+            columns = ['SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5'],
+            data = [ ['Blood pressure orthostatic abnormal', 'COVID-19', 'Coma', 'Computerised tomogram', 'Exposure to SARS-CoV-2'],
+                     ['Head injury', 'Headache', 'Laboratory test', 'Magnetic resonance imaging', 'SARS-CoV-2 antibody test negative'],
+                     ['SARS-CoV-2 test positive', 'Unresponsive to stimuli', 'X-ray', np.nan, np.nan],
+                     ['Computerised tomogram head abnormal', 'Ear pain', 'Headache', 'Idiopathic intracranial hypertension', 'Intracranial pressure increased'],
+                     ['Lumbar puncture', 'Magnetic resonance imaging head', 'Pain', 'Swelling', 'Vision blurred']],
+            index = pd.Index(
+                name = 'VAERS_ID',
+                data=[
+                    2547730,
+                    2547730,
+                    2547730,
+                    2547744,
+                    2547744]))
+
+        # When
+        symptomsByBatchcodesTable = SymptomsByBatchcodesTableFactory.createSymptomsByBatchcodesTable(VAERSVAX, VAERSSYMPTOMS)
+
+        # Then
+        assert_frame_equal(
+            symptomsByBatchcodesTable,
+            TestHelper.createDataFrame(
+                columns = ['SYMPTOMS'],
+                data = [ ['Blood pressure orthostatic abnormal'],
+                         ['Head injury'],
+                         ['SARS-CoV-2 test positive'],
+                         ['COVID-19'],
+                         ['Headache'],
+                         ['Unresponsive to stimuli'],
+                         ['Coma'],
+                         ['Laboratory test'],
+                         ['X-ray'],
+                         ['Computerised tomogram'],
+                         ['Magnetic resonance imaging'],
+                         ['Exposure to SARS-CoV-2'],
+                         ['SARS-CoV-2 antibody test negative'],
+
+                         ['Computerised tomogram head abnormal'],
+                         ['Lumbar puncture'],
+                         ['Ear pain'],
+                         ['Magnetic resonance imaging head'],
+                         ['Headache'],
+                         ['Pain'],
+                         ['Idiopathic intracranial hypertension'],
+                         ['Swelling'],
+                         ['Intracranial pressure increased'],
+                         ['Vision blurred']],
+                index = pd.MultiIndex.from_tuples(
+                    names = ['VAX_LOT1', 'VAX_LOT2'],
+                    tuples = [['1808982', 'EW0175']] * 13 + [['EW0175', 'EW0167']] * 10)))
+
+