From 0ffc5881b6af9418da0a65de44b017f5b0f5fe8b Mon Sep 17 00:00:00 2001 From: frankknoll Date: Thu, 16 Feb 2023 14:59:43 +0100 Subject: [PATCH] generating histograms for countries more efficient --- src/HistogramDescriptionPersister.py | 21 ---- src/HistogramDescriptionTableFactory.py | 29 +++++- src/HistogramDescriptionTableFactoryTest.py | 100 +++++++++++++++++++- src/HistogramFactoryAndPersister.py | 18 +--- src/MultiIndexExploder.py | 2 +- src/MultiIndexExploderTest.py | 22 ++--- src/TableByBatchcodeFilter.py | 18 ---- src/TableByBatchcodeFilterTest.py | 65 ------------- 8 files changed, 139 insertions(+), 136 deletions(-) delete mode 100644 src/HistogramDescriptionPersister.py delete mode 100644 src/TableByBatchcodeFilter.py delete mode 100644 src/TableByBatchcodeFilterTest.py diff --git a/src/HistogramDescriptionPersister.py b/src/HistogramDescriptionPersister.py deleted file mode 100644 index eaafe0e1c45..00000000000 --- a/src/HistogramDescriptionPersister.py +++ /dev/null @@ -1,21 +0,0 @@ -from TableByBatchcodeFilter import TableByBatchcodeFilter -from DictByBatchcodeTable2DictConverter import DictByBatchcodeTable2DictConverter -from IOUtils import IOUtils - - -class HistogramDescriptionPersister: - - def __init__(self, directory): - self.directory = directory - - def saveHistogramDescriptionsForBatchcodes(self, batchcodes, dictByBatchcodeTable, progress): - for count, batchcode in enumerate(batchcodes, start = 1): - histogramDescription = self._getHistogramDescriptionForBatchcode(batchcode, dictByBatchcodeTable) - # FK-TODO: nicht direkt {batchcode}.json speichern, denn im Dateinamen könnte sich dann ein '/' befinden, was ein nicht gewünschtes Unterverzeichnis erzeugt. Deshalb in der Batchcode-Tabelle eine unsichtbare Spalte einfügen, in welcher für den jeweiligen batchcode der bereinigte und eindeutige Dateiname steht (z.B. einfach durchnummeriert: 0.json, 1.json, ...). - IOUtils.saveDictAsJson(histogramDescription, f'{self.directory}/{batchcode}.json') - progress(count, len(batchcodes), batchcode) - - def _getHistogramDescriptionForBatchcode(self, batchcode, dictByBatchcodeTable): - dictByBatchcodeTableForBatchcode = TableByBatchcodeFilter.filterTableByBatchcode(batchcode, dictByBatchcodeTable) - histogramDescription = DictByBatchcodeTable2DictConverter.convertDictByBatchcodeTable2Dict(dictByBatchcodeTableForBatchcode, batchcode) - return histogramDescription diff --git a/src/HistogramDescriptionTableFactory.py b/src/HistogramDescriptionTableFactory.py index 67d3b928920..4d2ba779ada 100644 --- a/src/HistogramDescriptionTableFactory.py +++ b/src/HistogramDescriptionTableFactory.py @@ -4,15 +4,34 @@ class HistogramDescriptionTableFactory: @staticmethod def createHistogramDescriptionTable(dictByBatchcodeTable): - histogramDescriptionTable = ( - dictByBatchcodeTable - .groupby('VAX_LOT_EXPLODED') - .agg(HistogramDescriptionTableFactory._getHistograms) - .drop('nan')) + histogramDescriptionTable = HistogramDescriptionTableFactory._createHistogramDescriptionTable(dictByBatchcodeTable) histogramDescriptionTable = histogramDescriptionTable.rename(columns = { "SYMPTOM_COUNT_BY_VAX_LOT": "HISTOGRAM_DESCRIPTION" }) histogramDescriptionTable.index.rename('VAX_LOT', inplace = True) return histogramDescriptionTable + @staticmethod + def _createHistogramDescriptionTable(dictByBatchcodeTable): + if 'COUNTRY' in dictByBatchcodeTable.columns: + return HistogramDescriptionTableFactory._createHistogramDescriptionTableForCountries(dictByBatchcodeTable) + else: + return HistogramDescriptionTableFactory._createGlobalHistogramDescriptionTable(dictByBatchcodeTable) + + @staticmethod + def _createHistogramDescriptionTableForCountries(dictByBatchcodeTable): + return (dictByBatchcodeTable + .groupby(['VAX_LOT_EXPLODED', 'COUNTRY']) + .agg(HistogramDescriptionTableFactory._getHistograms) + .reset_index(level = 'COUNTRY') + .drop('nan')) + + @staticmethod + def _createGlobalHistogramDescriptionTable(dictByBatchcodeTable): + return (dictByBatchcodeTable + .groupby('VAX_LOT_EXPLODED') + .agg(HistogramDescriptionTableFactory._getHistograms) + .drop('nan')) + + @staticmethod def _getHistograms(dictByBatchcodeTable): dictByBatchcodeTable = dictByBatchcodeTable.to_frame() diff --git a/src/HistogramDescriptionTableFactoryTest.py b/src/HistogramDescriptionTableFactoryTest.py index 75fb43cb901..3dd508042de 100644 --- a/src/HistogramDescriptionTableFactoryTest.py +++ b/src/HistogramDescriptionTableFactoryTest.py @@ -6,7 +6,7 @@ import pandas as pd class HistogramDescriptionTableFactoryTest(unittest.TestCase): - def test_createHistogramDescriptionTable(self): + def test_createGlobalHistogramDescriptionTable(self): # Given dictByBatchcodeTable = TestHelper.createDataFrame( columns = ['SYMPTOM_COUNT_BY_VAX_LOT'], @@ -93,3 +93,101 @@ class HistogramDescriptionTableFactoryTest(unittest.TestCase): 'FD1921', '015M20A'])), check_like = True) + + def test_createHistogramDescriptionTable4Countries(self): + # Given + dictByBatchcodeTable = TestHelper.createDataFrame( + columns = ['SYMPTOM_COUNT_BY_VAX_LOT', 'COUNTRY'], + data = [ [{"Blood pressure orthostatic abnormal": 5, "Chest discomfort": 1}, 'Country A'], + [{"Blood pressure orthostatic abnormal": 5, "Chest discomfort": 1}, 'Country A'], + [{"Blood pressure orthostatic abnormal": 5, "Chest discomfort": 1}, 'Country A'], + + [{"Chest discomfort": 2}, 'Country A'], + [{"Chest discomfort": 2}, 'Country A'], + [{"Chest discomfort": 2}, 'Country A'] + ], + index = pd.MultiIndex.from_tuples( + names = ['VAX_LOT_EXPLODED', 'VAX_LOT1', 'VAX_LOT2', 'VAX_LOT3'], + tuples = [['1808982', '1808982', 'EW0175', 'FD1921'], + ['EW0175', '1808982', 'EW0175', 'FD1921'], + ['FD1921', '1808982', 'EW0175', 'FD1921'], + + ['015M20A', '015M20A', '1808982', 'nan'], + ['1808982', '015M20A', '1808982', 'nan'], + ['nan', '015M20A', '1808982', 'nan']])) + + # When + histogramDescriptionTable = HistogramDescriptionTableFactory.createHistogramDescriptionTable(dictByBatchcodeTable) + + # Then + assert_frame_equal( + histogramDescriptionTable, + TestHelper.createDataFrame( + columns = ['HISTOGRAM_DESCRIPTION', 'COUNTRY'], + data = [ [ + { + "batchcode": "1808982", + "histograms": [ + { + "batchcodes": ["1808982", "EW0175", "FD1921"], + "histogram": { + "Blood pressure orthostatic abnormal": 5, + "Chest discomfort": 1} + }, + { + "batchcodes": ["015M20A", "1808982"], + "histogram": {"Chest discomfort": 2} + } + ] + }, + 'Country A' + ], + [ + { + "batchcode": "EW0175", + "histograms": [ + { + "batchcodes": ["1808982", "EW0175", "FD1921"], + "histogram": { + "Blood pressure orthostatic abnormal": 5, + "Chest discomfort": 1} + } + ] + }, + 'Country A' + ], + [ + { + "batchcode": "FD1921", + "histograms": [ + { + "batchcodes": ["1808982", "EW0175", "FD1921"], + "histogram": { + "Blood pressure orthostatic abnormal": 5, + "Chest discomfort": 1} + } + ] + }, + 'Country A' + ], + [ + { + "batchcode": "015M20A", + "histograms": [ + { + "batchcodes": ["015M20A", "1808982"], + "histogram": {"Chest discomfort": 2} + } + ] + }, + 'Country A' + ] + ], + index = pd.Index( + name = 'VAX_LOT', + data = [ + '1808982', + 'EW0175', + 'FD1921', + '015M20A'])), + check_like = True) diff --git a/src/HistogramFactoryAndPersister.py b/src/HistogramFactoryAndPersister.py index 727dc18ff3d..2140aeba794 100644 --- a/src/HistogramFactoryAndPersister.py +++ b/src/HistogramFactoryAndPersister.py @@ -13,18 +13,8 @@ def createAndSaveGlobalHistograms(symptomByBatchcodeTable): def createAndSaveHistogramsForCountries(symptomByBatchcodeTable, countries): dictByBatchcodeTable = createHistograms(symptomByBatchcodeTable) - for count, country in enumerate(countries, start = 1): - _createAndSaveHistogramsForCountry( - count = count, - numCountries = len(countries), - country = country, - dictByBatchcodeTable = dictByBatchcodeTable) - - -def _createAndSaveHistogramsForCountry(count, numCountries, country, dictByBatchcodeTable): - # FK-TODO: use https://github.com/tqdm/tqdm - print(f'saving histograms for country {count}/{numCountries}: {country}') - dictByBatchcodeTable4Country = dictByBatchcodeTable[dictByBatchcodeTable['COUNTRY'] == country] - explodedTable = MultiIndexExploder.explodeMultiIndexOfTable(dictByBatchcodeTable4Country) + explodedTable = MultiIndexExploder.explodeMultiIndexOfTable(dictByBatchcodeTable) histogramDescriptionTable = HistogramDescriptionTableFactory.createHistogramDescriptionTable(explodedTable) - saveHistograms(histogramDescriptionTable, country) + for country, histogramDescriptionTableForCountry in histogramDescriptionTable.groupby('COUNTRY'): + print(country, ':') + saveHistograms(histogramDescriptionTableForCountry, country) \ No newline at end of file diff --git a/src/MultiIndexExploder.py b/src/MultiIndexExploder.py index 911e4eaa553..5818d5f0eff 100644 --- a/src/MultiIndexExploder.py +++ b/src/MultiIndexExploder.py @@ -7,6 +7,6 @@ class MultiIndexExploder: @staticmethod def explodeMultiIndexOfTable(table): batchcodeColumns = table.index.names - explodedTable = table.loc[np.repeat(table.index, len(batchcodeColumns))].reset_index() + explodedTable = table.iloc[np.repeat(range(len(table.index)), len(batchcodeColumns))].reset_index() explodedTable['VAX_LOT_EXPLODED'] = Utils.flatten(table.index.values) return explodedTable.set_index(['VAX_LOT_EXPLODED'] + batchcodeColumns) diff --git a/src/MultiIndexExploderTest.py b/src/MultiIndexExploderTest.py index a45f2c76444..3b87d6c950d 100644 --- a/src/MultiIndexExploderTest.py +++ b/src/MultiIndexExploderTest.py @@ -9,10 +9,10 @@ class MultiIndexExploderTest(unittest.TestCase): def test_explodeMultiIndexOfTable(self): # Given table = TestHelper.createDataFrame( - columns = ['DATA'], - data = [ ['A, B data'], - ['C, A data'], - ['C, B data']], + columns = ['DATA', 'COUNTRY'], + data = [ ['A, B data', 'Country A'], + ['C, A data', 'Country B'], + ['C, B data', 'Country C']], index = pd.MultiIndex.from_tuples( names = ['VAX_LOT1', 'VAX_LOT2'], tuples = [['A', 'B'], @@ -26,15 +26,15 @@ class MultiIndexExploderTest(unittest.TestCase): assert_frame_equal( explodedTable, TestHelper.createDataFrame( - columns = ['DATA'], - data = [ ['A, B data'], - ['A, B data'], + columns = ['DATA', 'COUNTRY'], + data = [ ['A, B data', 'Country A'], + ['A, B data', 'Country A'], - ['C, A data'], - ['C, A data'], + ['C, A data', 'Country B'], + ['C, A data', 'Country B'], - ['C, B data'], - ['C, B data']], + ['C, B data', 'Country C'], + ['C, B data', 'Country C']], index = pd.MultiIndex.from_tuples( names = ['VAX_LOT_EXPLODED', 'VAX_LOT1', 'VAX_LOT2'], tuples = [['A', 'A', 'B'], diff --git a/src/TableByBatchcodeFilter.py b/src/TableByBatchcodeFilter.py deleted file mode 100644 index a340c2f3953..00000000000 --- a/src/TableByBatchcodeFilter.py +++ /dev/null @@ -1,18 +0,0 @@ -from functools import reduce - - -class TableByBatchcodeFilter: - - @staticmethod - def filterTableByBatchcode(batchcode, table): - batchcodeColumns = table.index.names - table = table.reset_index() - filteredTable = table[TableByBatchcodeFilter._existsBatchcodeInAnyBatchcodeColumn(table, batchcodeColumns, batchcode)] - return filteredTable.set_index(batchcodeColumns) - - @staticmethod - def _existsBatchcodeInAnyBatchcodeColumn(table, batchcodeColumns, batchcode): - return reduce( - lambda accum, batchcodeColumn: accum | (table[batchcodeColumn] == batchcode), - batchcodeColumns, - [False] * len(table.index)) diff --git a/src/TableByBatchcodeFilterTest.py b/src/TableByBatchcodeFilterTest.py deleted file mode 100644 index 7b34bd78e0e..00000000000 --- a/src/TableByBatchcodeFilterTest.py +++ /dev/null @@ -1,65 +0,0 @@ -import unittest -from pandas.testing import assert_frame_equal -from TableByBatchcodeFilter import TableByBatchcodeFilter -from TestHelper import TestHelper -import pandas as pd - -class TableByBatchcodeFilterTest(unittest.TestCase): - - def test_convertHistogramTable2JsonTable_2_VAX_LOT_columns(self): - # Given - batchcode = '1808982' - symptomHistogramByBatchcodeTable = TestHelper.createDataFrame( - columns = ['SYMPTOM_COUNT_BY_VAX_LOT'], - data = [ ['{"Blood pressure orthostatic abnormal":5,"Chest discomfort":1}'], - ['{"Chest discomfort":2}'], - ['{"Chills":5}']], - index = pd.MultiIndex.from_tuples( - names = ['VAX_LOT1', 'VAX_LOT2'], - tuples = [[batchcode, 'EW0175'], - ['015M20A', batchcode], - ['015M20A', 'EW0175']])) - - # When - filteredTable = TableByBatchcodeFilter.filterTableByBatchcode(batchcode, symptomHistogramByBatchcodeTable) - - # Then - assert_frame_equal( - filteredTable, - TestHelper.createDataFrame( - columns = ['SYMPTOM_COUNT_BY_VAX_LOT'], - data = [ ['{"Blood pressure orthostatic abnormal":5,"Chest discomfort":1}'], - ['{"Chest discomfort":2}']], - index = pd.MultiIndex.from_tuples( - names = ['VAX_LOT1', 'VAX_LOT2'], - tuples = [[batchcode, 'EW0175'], - ['015M20A', batchcode]]))) - - def test_convertHistogramTable2JsonTable_3_VAX_LOT_columns(self): - # Given - batchcode = '1808983' - symptomHistogramByBatchcodeTable = TestHelper.createDataFrame( - columns = ['SYMPTOM_COUNT_BY_VAX_LOT'], - data = [ ['{"Blood pressure orthostatic abnormal":5,"Chest discomfort":1}'], - ['{"Chest discomfort":2}'], - ['{"Chills":5}']], - index = pd.MultiIndex.from_tuples( - names = ['VAX_LOT1', 'VAX_LOT2', 'VAX_LOT3'], - tuples = [[batchcode, 'EW0175', None], - ['015M20A', None, batchcode], - ['015M20A', 'EW0175', 'dummy2']])) - - # When - filteredTable = TableByBatchcodeFilter.filterTableByBatchcode(batchcode, symptomHistogramByBatchcodeTable) - - # Then - assert_frame_equal( - filteredTable, - TestHelper.createDataFrame( - columns = ['SYMPTOM_COUNT_BY_VAX_LOT'], - data = [ ['{"Blood pressure orthostatic abnormal":5,"Chest discomfort":1}'], - ['{"Chest discomfort":2}']], - index = pd.MultiIndex.from_tuples( - names = ['VAX_LOT1', 'VAX_LOT2', 'VAX_LOT3'], - tuples = [[batchcode, 'EW0175', None], - ['015M20A', None, batchcode]])))