In [None]:
import numpy as np
import pandas as pd

# Notebook-wide display settings: render up to 100 rows of a frame and do not
# limit the number of columns shown (None disables the column limit).
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

In [None]:
def createDataFrameFromDescr(vaersDescr):
    """Join the two frames of one descriptor into a single frame.

    vaersDescr is a mapping holding a 'VAERSDATA' and a 'VAERSVAX' frame.
    Every 'VAERSDATA' row is kept (left join on the index) and repeated once
    per 'VAERSVAX' row carrying the same index value. The merge is validated
    as one-to-many, so a duplicated index value in 'VAERSDATA' makes pandas
    raise a MergeError.
    """
    reports = vaersDescr['VAERSDATA']
    vaccinations = vaersDescr['VAERSVAX']
    return reports.merge(
        vaccinations,
        how = 'left',
        left_index = True,
        right_index = True,
        validate = 'one_to_many')

def createDataFrameFromDescrs(vaersDescrs):
    """Build one frame per descriptor and stack them in the given order.

    Each element of vaersDescrs is passed to createDataFrameFromDescr; the
    resulting frames are concatenated row-wise with their indexes kept as-is.
    """
    return pd.concat([createDataFrameFromDescr(descr) for descr in vaersDescrs])

def createAndFilterDataFrameFromDescrs(vaersDescrs, manufacturer, dose):
    """Combine the descriptors and keep the COVID19 rows of one manufacturer and dose.

    A row is kept when VAX_TYPE equals "COVID19", VAX_MANU equals
    `manufacturer` and VAX_DOSE_SERIES contains `dose`.
    """
    combined = createDataFrameFromDescrs(vaersDescrs)
    isCovid19 = combined["VAX_TYPE"] == "COVID19"
    isManufacturer = combined["VAX_MANU"] == manufacturer
    # NOTE(review): str.contains interprets `dose` as a regular expression and
    # matches it anywhere inside the value -- confirm that is intended.
    hasDose = combined["VAX_DOSE_SERIES"].str.contains(dose)
    return combined[isCovid19 & isManufacturer & hasDose]

def createDataFrameSevereEffectsFromDescrs(vaersDescrs, dose):
    """Combine the descriptors and keep the COVID19 rows of one dose, any manufacturer.

    A row is kept when VAX_TYPE equals "COVID19" and VAX_DOSE_SERIES contains
    `dose`.
    """
    combined = createDataFrameFromDescrs(vaersDescrs)
    isCovid19 = combined["VAX_TYPE"] == "COVID19"
    # NOTE(review): str.contains interprets `dose` as a regular expression and
    # matches it anywhere inside the value -- confirm that is intended.
    hasDose = combined["VAX_DOSE_SERIES"].str.contains(dose)
    return combined[isCovid19 & hasDose]

In [None]:
def read_csv(file, usecols, dtype = None):
    """Read a VAERS CSV file into a frame indexed by VAERS_ID.

    Parameters:
        file: path or file-like object handed to pandas.read_csv.
        usecols: columns to load; must include 'VAERS_ID' because that column
            becomes the index.
        dtype: optional column -> dtype mapping. Defaults to None (let pandas
            infer the types) instead of a shared mutable dict.

    Returns:
        The frame read with latin1 encoding, indexed by 'VAERS_ID'.
    """
    return pd.read_csv(
        file,
        index_col = 'VAERS_ID',
        encoding = 'latin1',
        # Parse the file in one pass instead of in chunks.
        low_memory = False,
        usecols = usecols,
        dtype = dtype)

def readVaersDescr(dataDir, year):
    """Load the two CSV files of one year into a descriptor dict.

    The files are expected at
    <dataDir>/<year>VAERSData/<year>VAERSDATA.csv and
    <dataDir>/<year>VAERSData/<year>VAERSVAX.csv.

    Returns a dict with the keys 'VAERSDATA' and 'VAERSVAX', each holding the
    frame returned by read_csv for the corresponding file. VAX_DOSE_SERIES is
    read with the "string" dtype.
    """
    folder = "".join([dataDir, "/", year, "VAERSData/"])
    dataFile = "".join([folder, year, "VAERSDATA.csv"])
    vaxFile = "".join([folder, year, "VAERSVAX.csv"])

    vaersData = read_csv(
        dataFile,
        ['VAERS_ID', 'DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'])
    vaersVax = read_csv(
        vaxFile,
        ['VAERS_ID', 'VAX_DOSE_SERIES', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT'],
        dtype = {"VAX_DOSE_SERIES": "string"})
    return {'VAERSDATA': vaersData, 'VAERSVAX': vaersVax}

def createAndFilterDataFrameFromFiles(dataDir, manufacturer, dose, years = ("2021", "2022")):
    """Read the yearly VAERS files below dataDir and filter the combined rows.

    Parameters:
        dataDir: directory containing one "<year>VAERSData" folder per year.
        manufacturer: value VAX_MANU must equal.
        dose: value passed on as the VAX_DOSE_SERIES filter.
        years: years (as strings) whose files are read, in this order.
            Defaults to ("2021", "2022"), the years that used to be hard-coded.

    Returns:
        The frame produced by createAndFilterDataFrameFromDescrs for the
        descriptors of all requested years.
    """
    vaersDescrs = [readVaersDescr(dataDir, year) for year in years]
    return createAndFilterDataFrameFromDescrs(vaersDescrs, manufacturer, dose)

In [None]:
def createBatchCodeTable(df : pd.DataFrame):
    """Count reports per VAX_LOT, in total and per outcome flag.

    The result has one row per VAX_LOT value and the columns 'ADRs' (all rows
    of that lot), 'DEATHS' (rows with DIED == 'Y'), 'DISABILITIES' (rows with
    DISABLE == 'Y') and 'LIFE THREATENING ILLNESSES' (rows with
    L_THREAT == 'Y'). Lots without any row for a column get 0 there.
    """
    flagColumnByTitle = {
        'DEATHS': 'DIED',
        'DISABILITIES': 'DISABLE',
        'LIFE THREATENING ILLNESSES': 'L_THREAT',
    }

    countsByTitle = {'ADRs': df[['VAX_LOT']].value_counts()}
    for title, flagColumn in flagColumnByTitle.items():
        flaggedLots = df.loc[df[flagColumn] == 'Y', ['VAX_LOT']]
        countsByTitle[title] = flaggedLots.value_counts()

    # Lots missing from one of the counts show up as NaN after the concat.
    table = pd.concat(countsByTitle, axis = 'columns')
    return table.replace(to_replace = np.nan, value = 0)


In [None]:
import unittest

In [None]:
from pandas.testing import assert_frame_equal

class CreateAndFilterDataFrameTest(unittest.TestCase):

    def test_createAndFilterDataFrameFromDescrs(self):
        # Given
        vaersDescrs = [
               {
                   'VAERSDATA': self.createDataFrame(
                       columns = ['DIED', 'L_THREAT', 'DISABLE'],
                       data = [  ['Y',    np.NaN,     np.NaN],
                                 [np.NaN, np.NaN,     'Y']],
                       index = [
                           "0916600",
                           "0916601"]),
                   'VAERSVAX': self.createDataFrame(
                       columns = ['VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],
                       data = [  ['COVID19',  'MODERNA',  '037K20A', '1'],
                                 ['COVID19',  'MODERNA',  '025L20A', '1']],
                       index = [
                           "0916600",
                           "0916601"],
                       dtypes = {'VAX_DOSE_SERIES': "string"})
               },
               {
                    'VAERSDATA': self.createDataFrame(
                       columns = ['DIED', 'L_THREAT', 'DISABLE'],
                       data = [  [np.NaN, np.NaN,     np.NaN],
                                 [np.NaN, np.NaN,     'Y']],
                        index = [
                           "1996873",
                           "1996874"]),
                    'VAERSVAX': self.createDataFrame(
                        columns = ['VAX_TYPE', 'VAX_MANU',         'VAX_LOT', 'VAX_DOSE_SERIES'],
                        data = [  ['HPV9',     'MERCK & CO. INC.', 'R017624', 'UNK'],
                                  ['COVID19',  'MODERNA',          '025L20A', '1']],
                        index = [
                            "1996873",
                            "1996874"],
                        dtypes = {'VAX_DOSE_SERIES': "string"})
                }
            ]
            
        # When
        dataFrame = createAndFilterDataFrameFromDescrs(vaersDescrs, "MODERNA", '1')
        
        # Then
        dataFrameExpected = self.createDataFrame(
            columns = ['DIED', 'L_THREAT', 'DISABLE',  'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],
            data = [  ['Y',     np.NaN,    np.NaN,     'COVID19',  'MODERNA',  '037K20A', '1'],
                      [np.NaN,  np.NaN,    'Y',        'COVID19',  'MODERNA',  '025L20A', '1'],
                      [np.NaN, np.NaN,     'Y',        'COVID19',  'MODERNA',  '025L20A', '1']],
            index = [
                "0916600",
                "0916601",
                "1996874"],
            dtypes = {'VAX_DOSE_SERIES': "string"})
        assert_frame_equal(dataFrame, dataFrameExpected, check_dtype = False)

    def test_createDataFrameFromForSevereEffects(self):
        # Given
        vaersDescrs = [
               {
                   'VAERSDATA': self.createDataFrame(
                       columns = ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT'],
                       data = [  ['Y',    'Y',        np.NaN,    'Y',        'Y'],
                                 [np.NaN, np.NaN,     'Y',       np.NaN,     'Y']],
                       index = [
                           "0916600",
                           "0916601"]),
                   'VAERSVAX': self.createDataFrame(
                       columns = ['VAX_TYPE', 'VAX_MANU',        'VAX_LOT', 'VAX_DOSE_SERIES'],
                       data = [  ['COVID19',  'MODERNA',         '037K20A', '1'],
                                 ['COVID19',  'PFIZER\BIONTECH', '025L20A', '1']],
                       index = [
                           "0916600",
                           "0916601"],
                       dtypes = {'VAX_DOSE_SERIES': "string"})
               }
            ]
            
        # When
        dataFrame = createDataFrameSevereEffectsFromDescrs(vaersDescrs, '1')
        
        # Then
        dataFrameExpected = self.createDataFrame(
            columns = ['DIED', 'L_THREAT', 'DISABLE', 'HOSPITAL', 'ER_VISIT', 'VAX_TYPE', 'VAX_MANU',        'VAX_LOT', 'VAX_DOSE_SERIES'],
            data = [  ['Y',    'Y',        np.NaN,    'Y',        'Y',        'COVID19',  'MODERNA',         '037K20A', '1'],
                      [np.NaN,  np.NaN,    'Y',       np.NaN,     'Y',        'COVID19',  'PFIZER\BIONTECH', '025L20A', '1']],
            index = [
                "0916600",
                "0916601"],
            dtypes = {'VAX_DOSE_SERIES': "string"})
        assert_frame_equal(dataFrame, dataFrameExpected, check_dtype = False)

    def test_createAndFilterDataFrameFromDescrsWithFirstDose(self):
        # Given
        vaersDescrs = [
               {
                   'VAERSDATA': self.createDataFrame(
                       columns = ['DIED', 'L_THREAT', 'DISABLE'],
                       data = [  ['Y',    np.NaN,      np.NaN]],
                       index = [
                           "1048786"]),
                   'VAERSVAX': self.createDataFrame(
                       columns = ['VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],
                       data = [  ['COVID19',  'MODERNA',  '016M20A', '2'],
                                 ['COVID19',  'MODERNA',  '030L20A', '1']],
                       index = [
                           "1048786",
                           "1048786"],
                       dtypes = {'VAX_DOSE_SERIES': "string"})
               }
            ]
            
        # When
        dataFrame = createAndFilterDataFrameFromDescrs(vaersDescrs, "MODERNA", '1')
        
        # Then
        dataFrameExpected = self.createDataFrame(
            columns = ['DIED', 'L_THREAT', 'DISABLE',  'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],
            data = [  ['Y',     np.NaN,    np.NaN,     'COVID19',  'MODERNA',  '030L20A',  '1']],
            index = [
                "1048786"],
            dtypes = {'VAX_DOSE_SERIES': "string"})
        assert_frame_equal(dataFrame, dataFrameExpected, check_dtype = False)

    def test_createAndFilterDataFrameFromDescrsWithSecondDose(self):
        # Given
        vaersDescrs = [
               {
                   'VAERSDATA': self.createDataFrame(
                       columns = ['DIED', 'L_THREAT', 'DISABLE'],
                       data = [  ['Y',    np.NaN,     np.NaN]],
                       index = [
                           "1048786"]),
                   'VAERSVAX': self.createDataFrame(
                       columns = ['VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],
                       data = [  ['COVID19',  'MODERNA',  '016M20A',  '2'],
                                 ['COVID19',  'MODERNA',  '030L20A',  '1']],
                       index = [
                           "1048786",
                           "1048786"],
                       dtypes = {'VAX_DOSE_SERIES': "string"})
               }
            ]
            
        # When
        dataFrame = createAndFilterDataFrameFromDescrs(vaersDescrs, "MODERNA", '2')
        
        # Then
        dataFrameExpected = self.createDataFrame(
            columns = ['DIED', 'L_THREAT', 'DISABLE',  'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],
            data = [  ['Y',     np.NaN,    np.NaN,     'COVID19',  'MODERNA',  '016M20A',  '2']],
            index = [
                "1048786"],
            dtypes = {'VAX_DOSE_SERIES': "string"})
        assert_frame_equal(dataFrame, dataFrameExpected, check_dtype = False)

    def createDataFrame(self, index, columns, data, dtypes = {}):
        return pd.DataFrame(index = index, columns = columns, data = data).astype(dtypes)


In [None]:
from pandas.testing import assert_frame_equal

class BatchCodeTableTest(unittest.TestCase):

    def test_createBatchCodeTable2(self):
        dataFrame = createAndFilterDataFrameFromDescrs(
            [
               {
                   'VAERSDATA': self.createDataFrame(
                       columns = ['DIED', 'L_THREAT', 'DISABLE'],
                       data = [  ['Y',    np.NaN,     np.NaN],
                                 [np.NaN, np.NaN,     'Y']],
                       index = [
                           "0916600",
                           "0916601"]),
                   'VAERSVAX': self.createDataFrame(
                       columns = ['VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_DOSE_SERIES'],
                       data = [  ['COVID19',  'MODERNA',  '037K20A', '1'],
                                 ['COVID19',  'MODERNA',  '025L20A', '1']],
                       index = [
                           "0916600",
                           "0916601"],
                       dtypes = {'VAX_DOSE_SERIES': "string"})
               },
               {
                    'VAERSDATA': self.createDataFrame(
                       columns = ['DIED', 'L_THREAT', 'DISABLE'],
                       data = [  [np.NaN,  np.NaN,    np.NaN],
                                 [np.NaN,  np.NaN,    'Y']],
                       index = [
                           "1996873",
                           "1996874"]),
                    'VAERSVAX': self.createDataFrame(
                        columns = ['VAX_TYPE', 'VAX_MANU',         'VAX_LOT', 'VAX_DOSE_SERIES'],
                        data = [  ['HPV9',     'MERCK & CO. INC.', 'R017624', 'UNK'],
                                  ['COVID19',  'MODERNA',          '025L20A', '1']],
                        index = [
                            "1996873",
                            "1996874"],
                        dtypes = {'VAX_DOSE_SERIES': "string"})
                }
            ],
            "MODERNA",
            '1')

        self._test_createBatchCodeTable(dataFrame);

    def test_createBatchCodeTable(self):
        self._test_createBatchCodeTable(createAndFilterDataFrameFromFiles("test/VAERS", "MODERNA", '1'));

    def _test_createBatchCodeTable(self, dataFrame):
        # When
        batchCodeTable = createBatchCodeTable(dataFrame)

        # Then
        batchCodeTableExpected = pd.DataFrame(
            data = {
                'ADRs': [2, 1],
                'DEATHS': [0, 1],
                'DISABILITIES': [2, 0],
                'LIFE THREATENING ILLNESSES': [0, 0]
            },
            index = pd.MultiIndex.from_arrays([['025L20A', '037K20A']], names = ('VAX_LOT',)))
        assert_frame_equal(batchCodeTable, batchCodeTableExpected, check_dtype = False)

    def createDataFrame(self, index, columns, data, dtypes = {}):
        return pd.DataFrame(index = index, columns = columns, data = data).astype(dtypes)


In [None]:
# Run the unittest test cases from within the notebook kernel.
# argv = [''] makes unittest use this argument list instead of the kernel's own
# command line; exit = False keeps it from calling sys.exit() when it is done.
unittest.main(argv = [''], verbosity = 2, exit = False)

In [None]:
def saveBatchCodeTable(manufacturer, excelFile):
    """Build the batch code table of one manufacturer, show it and save it.

    The rows are read from the "VAERS" directory and filtered for the given
    manufacturer and dose '1'. The table is displayed together with the
    manufacturer name and then written to excelFile with DataFrame.to_excel.
    """
    filteredReports = createAndFilterDataFrameFromFiles("VAERS", manufacturer, '1')
    table = createBatchCodeTable(filteredReports)
    display(manufacturer, table)
    table.to_excel(excelFile)

In [None]:
# Write one Excel file per manufacturer.
# The backslash in the Pfizer name is escaped explicitly: "\B" is not a valid
# escape sequence, so the unescaped spelling triggers a warning on current
# Python versions. The resulting string is still PFIZER\BIONTECH.
saveBatchCodeTable("MODERNA", "results/moderna.xlsx")
saveBatchCodeTable("PFIZER\\BIONTECH", "results/pfizer.xlsx")
saveBatchCodeTable("JANSSEN", "results/janssen.xlsx")