Normalize VAERS data while reading it

This commit is contained in:
frankknoll
2023-02-11 12:20:41 +01:00
parent 40ff17e6aa
commit bc2227171e
5 changed files with 9 additions and 13 deletions

View File: DataFrameNormalizer.py

@@ -2,14 +2,6 @@ import numpy as np
class DataFrameNormalizer: class DataFrameNormalizer:
@staticmethod
def normalize(dataFrame):
DataFrameNormalizer.removeUnknownBatchCodes(dataFrame)
DataFrameNormalizer.convertVAX_LOTColumnToUpperCase(dataFrame)
DataFrameNormalizer._convertColumnsOfDataFrame_Y_to_1_else_0(
dataFrame,
['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'])
@staticmethod @staticmethod
def convertVAX_LOTColumnToUpperCase(dataFrame): def convertVAX_LOTColumnToUpperCase(dataFrame):
dataFrame['VAX_LOT'] = dataFrame['VAX_LOT'].str.upper() dataFrame['VAX_LOT'] = dataFrame['VAX_LOT'].str.upper()

View File: VaersDescrReader.py

@@ -25,11 +25,15 @@ class VaersDescrReader:
} }
def _readVAERSDATA(self, file): def _readVAERSDATA(self, file):
return self._read_csv( VAERSDATA = self._read_csv(
file = file, file = file,
usecols = ['VAERS_ID', 'RECVDATE', 'DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT', 'SPLTTYPE'], usecols = ['VAERS_ID', 'RECVDATE', 'DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT', 'SPLTTYPE'],
parse_dates = ['RECVDATE'], parse_dates = ['RECVDATE'],
date_parser = lambda dateStr: pd.to_datetime(dateStr, format = "%m/%d/%Y")) date_parser = lambda dateStr: pd.to_datetime(dateStr, format = "%m/%d/%Y"))
DataFrameNormalizer._convertColumnsOfDataFrame_Y_to_1_else_0(
VAERSDATA,
['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'])
return VAERSDATA
def _readVAERSVAX(self, file): def _readVAERSVAX(self, file):
VAERSVAX = self._read_csv( VAERSVAX = self._read_csv(
@@ -40,6 +44,7 @@ class VaersDescrReader:
"VAX_DOSE_SERIES": "string", "VAX_DOSE_SERIES": "string",
"VAX_LOT": "string" "VAX_LOT": "string"
}) })
DataFrameNormalizer.removeUnknownBatchCodes(VAERSVAX)
DataFrameNormalizer.convertVAX_LOTColumnToUpperCase(VAERSVAX) DataFrameNormalizer.convertVAX_LOTColumnToUpperCase(VAERSVAX)
return VAERSVAX return VAERSVAX

View File

@@ -19,7 +19,7 @@ class VaersDescrReaderTest(unittest.TestCase):
vaersDescr['VAERSDATA'], vaersDescr['VAERSDATA'],
TestHelper.createDataFrame( TestHelper.createDataFrame(
columns = ['RECVDATE', 'DIED', 'L_THREAT', 'ER_VISIT', 'HOSPITAL', 'DISABLE', 'SPLTTYPE'], columns = ['RECVDATE', 'DIED', 'L_THREAT', 'ER_VISIT', 'HOSPITAL', 'DISABLE', 'SPLTTYPE'],
data = [ [pd.to_datetime('01/01/2023', format = "%m/%d/%Y"), np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]], data = [ [pd.to_datetime('01/01/2023', format = "%m/%d/%Y"), 0, 0, 0, 0, 0, np.nan]],
index = pd.Index( index = pd.Index(
name = 'VAERS_ID', name = 'VAERS_ID',
data = [2547730]))) data = [2547730])))

View File

@@ -1,7 +1,6 @@
from CountryColumnAdder import CountryColumnAdder from CountryColumnAdder import CountryColumnAdder
from VaersDescrReader import VaersDescrReader from VaersDescrReader import VaersDescrReader
from VaersDescr2DataFrameConverter import VaersDescr2DataFrameConverter from VaersDescr2DataFrameConverter import VaersDescr2DataFrameConverter
from DataFrameNormalizer import DataFrameNormalizer
from SevereColumnAdder import SevereColumnAdder from SevereColumnAdder import SevereColumnAdder
def getVaersForYears(years): def getVaersForYears(years):
@@ -24,6 +23,5 @@ def _getVaersDescrReader():
def _getVaers(vaersDescrs, addCountryColumn): def _getVaers(vaersDescrs, addCountryColumn):
dataFrame = VaersDescr2DataFrameConverter.createDataFrameFromDescrs(vaersDescrs) dataFrame = VaersDescr2DataFrameConverter.createDataFrameFromDescrs(vaersDescrs)
dataFrame = addCountryColumn(dataFrame) dataFrame = addCountryColumn(dataFrame)
DataFrameNormalizer.normalize(dataFrame)
dataFrame = SevereColumnAdder.addSevereColumn(dataFrame) dataFrame = SevereColumnAdder.addSevereColumn(dataFrame)
return dataFrame return dataFrame

View File

@@ -1,3 +1,4 @@
VAERS_ID,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME VAERS_ID,VAX_TYPE,VAX_MANU,VAX_LOT,VAX_DOSE_SERIES,VAX_ROUTE,VAX_SITE,VAX_NAME
2547730,COVID19,JANSSEN,1808982,UNK,SYR,AR,COVID19 (COVID19 (JANSSEN)) 2547730,COVID19,JANSSEN,1808982,UNK,SYR,AR,COVID19 (COVID19 (JANSSEN))
2547730,COVID19,PFIZER\BIONTECH,EW0175,1,IM,,COVID19 (COVID19 (PFIZER-BIONTECH)) 2547730,COVID19,PFIZER\BIONTECH,ew0175,1,IM,,COVID19 (COVID19 (PFIZER-BIONTECH))
2547731,COVID19,PFIZER\BIONTECH,UNKNOWN TO ME,1,IM,,COVID19 (COVID19 (PFIZER-BIONTECH))
1 VAERS_ID VAX_TYPE VAX_MANU VAX_LOT VAX_DOSE_SERIES VAX_ROUTE VAX_SITE VAX_NAME
2 2547730 COVID19 JANSSEN 1808982 UNK SYR AR COVID19 (COVID19 (JANSSEN))
3 2547730 COVID19 PFIZER\BIONTECH EW0175 ew0175 1 IM COVID19 (COVID19 (PFIZER-BIONTECH))
4 2547731 COVID19 PFIZER\BIONTECH UNKNOWN TO ME 1 IM COVID19 (COVID19 (PFIZER-BIONTECH))