normalizing data while reading VAERS data

This commit is contained in:
frankknoll
2023-02-11 12:20:41 +01:00
parent 40ff17e6aa
commit bc2227171e
5 changed files with 9 additions and 13 deletions

View File

@@ -2,14 +2,6 @@ import numpy as np
class DataFrameNormalizer:
@staticmethod
def normalize(dataFrame):
DataFrameNormalizer.removeUnknownBatchCodes(dataFrame)
DataFrameNormalizer.convertVAX_LOTColumnToUpperCase(dataFrame)
DataFrameNormalizer._convertColumnsOfDataFrame_Y_to_1_else_0(
dataFrame,
['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'])
@staticmethod
def convertVAX_LOTColumnToUpperCase(dataFrame):
dataFrame['VAX_LOT'] = dataFrame['VAX_LOT'].str.upper()

View File

@@ -25,11 +25,15 @@ class VaersDescrReader:
}
def _readVAERSDATA(self, file):
return self._read_csv(
VAERSDATA = self._read_csv(
file = file,
usecols = ['VAERS_ID', 'RECVDATE', 'DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT', 'SPLTTYPE'],
parse_dates = ['RECVDATE'],
date_parser = lambda dateStr: pd.to_datetime(dateStr, format = "%m/%d/%Y"))
DataFrameNormalizer._convertColumnsOfDataFrame_Y_to_1_else_0(
VAERSDATA,
['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'])
return VAERSDATA
def _readVAERSVAX(self, file):
VAERSVAX = self._read_csv(
@@ -40,6 +44,7 @@ class VaersDescrReader:
"VAX_DOSE_SERIES": "string",
"VAX_LOT": "string"
})
DataFrameNormalizer.removeUnknownBatchCodes(VAERSVAX)
DataFrameNormalizer.convertVAX_LOTColumnToUpperCase(VAERSVAX)
return VAERSVAX

View File

@@ -19,7 +19,7 @@ class VaersDescrReaderTest(unittest.TestCase):
vaersDescr['VAERSDATA'],
TestHelper.createDataFrame(
columns = ['RECVDATE', 'DIED', 'L_THREAT', 'ER_VISIT', 'HOSPITAL', 'DISABLE', 'SPLTTYPE'],
data = [ [pd.to_datetime('01/01/2023', format = "%m/%d/%Y"), np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]],
data = [ [pd.to_datetime('01/01/2023', format = "%m/%d/%Y"), 0, 0, 0, 0, 0, np.nan]],
index = pd.Index(
name = 'VAERS_ID',
data = [2547730])))

View File

@@ -1,7 +1,6 @@
from CountryColumnAdder import CountryColumnAdder
from VaersDescrReader import VaersDescrReader
from VaersDescr2DataFrameConverter import VaersDescr2DataFrameConverter
from DataFrameNormalizer import DataFrameNormalizer
from SevereColumnAdder import SevereColumnAdder
def getVaersForYears(years):
@@ -24,6 +23,5 @@ def _getVaersDescrReader():
def _getVaers(vaersDescrs, addCountryColumn):
dataFrame = VaersDescr2DataFrameConverter.createDataFrameFromDescrs(vaersDescrs)
dataFrame = addCountryColumn(dataFrame)
DataFrameNormalizer.normalize(dataFrame)
dataFrame = SevereColumnAdder.addSevereColumn(dataFrame)
return dataFrame

View File

@@ -1,3 +1,4 @@
VAERS_ID,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
2547730,COVID19,JANSSEN,1808982,UNK,SYR,AR,COVID19 (COVID19 (JANSSEN))
2547730,COVID19,PFIZER\BIONTECH,EW0175,1,IM,,COVID19 (COVID19 (PFIZER-BIONTECH))
2547730,COVID19,PFIZER\BIONTECH,ew0175,1,IM,,COVID19 (COVID19 (PFIZER-BIONTECH))
2547731,COVID19,PFIZER\BIONTECH,UNKNOWN TO ME,1,IM,,COVID19 (COVID19 (PFIZER-BIONTECH))
| 1 | VAERS_ID | VAX_TYPE | VAX_MANU | VAX_LOT | VAX_DOSE_SERIES | VAX_ROUTE | VAX_SITE | VAX_NAME |
|---|---|---|---|---|---|---|---|---|
| 2 | 2547730 | COVID19 | JANSSEN | 1808982 | UNK | SYR | AR | COVID19 (COVID19 (JANSSEN)) |
| 3 | 2547730 | COVID19 | PFIZER\BIONTECH | EW0175 ew0175 | 1 | IM | | COVID19 (COVID19 (PFIZER-BIONTECH)) |
| 4 | 2547731 | COVID19 | PFIZER\BIONTECH | UNKNOWN TO ME | 1 | IM | | COVID19 (COVID19 (PFIZER-BIONTECH)) |