Generate histograms for countries more efficiently

This commit is contained in:
frankknoll
2023-02-16 14:59:43 +01:00
parent ef7c36d567
commit 0ffc5881b6
8 changed files with 139 additions and 136 deletions

View File

@@ -1,21 +0,0 @@
from TableByBatchcodeFilter import TableByBatchcodeFilter
from DictByBatchcodeTable2DictConverter import DictByBatchcodeTable2DictConverter
from IOUtils import IOUtils
class HistogramDescriptionPersister:
    """Saves one JSON histogram description per batch code into a directory."""

    def __init__(self, directory):
        # Used verbatim as the path prefix of every JSON file that gets written.
        self.directory = directory

    def saveHistogramDescriptionsForBatchcodes(self, batchcodes, dictByBatchcodeTable, progress):
        """Build and save the histogram description of every given batch code.

        After each file has been written, ``progress`` is called with the
        1-based position of the batch code, the total number of batch codes
        and the batch code itself.
        """
        position = 0
        for batchcode in batchcodes:
            position += 1
            description = self._getHistogramDescriptionForBatchcode(batchcode, dictByBatchcodeTable)
            # FK-TODO: do not save directly as {batchcode}.json, because the file
            # name could then contain a '/', which would create an unwanted
            # subdirectory. Instead, add an invisible column to the batch code
            # table that holds a sanitized, unique file name for each batch code
            # (e.g. simply numbered: 0.json, 1.json, ...).
            targetPath = f'{self.directory}/{batchcode}.json'
            IOUtils.saveDictAsJson(description, targetPath)
            progress(position, len(batchcodes), batchcode)

    def _getHistogramDescriptionForBatchcode(self, batchcode, dictByBatchcodeTable):
        """Filter the table down to ``batchcode`` and convert the result to a dict."""
        return DictByBatchcodeTable2DictConverter.convertDictByBatchcodeTable2Dict(
            TableByBatchcodeFilter.filterTableByBatchcode(batchcode, dictByBatchcodeTable),
            batchcode)

View File

@@ -4,15 +4,34 @@ class HistogramDescriptionTableFactory:
@staticmethod @staticmethod
def createHistogramDescriptionTable(dictByBatchcodeTable): def createHistogramDescriptionTable(dictByBatchcodeTable):
histogramDescriptionTable = ( histogramDescriptionTable = HistogramDescriptionTableFactory._createHistogramDescriptionTable(dictByBatchcodeTable)
dictByBatchcodeTable
.groupby('VAX_LOT_EXPLODED')
.agg(HistogramDescriptionTableFactory._getHistograms)
.drop('nan'))
histogramDescriptionTable = histogramDescriptionTable.rename(columns = { "SYMPTOM_COUNT_BY_VAX_LOT": "HISTOGRAM_DESCRIPTION" }) histogramDescriptionTable = histogramDescriptionTable.rename(columns = { "SYMPTOM_COUNT_BY_VAX_LOT": "HISTOGRAM_DESCRIPTION" })
histogramDescriptionTable.index.rename('VAX_LOT', inplace = True) histogramDescriptionTable.index.rename('VAX_LOT', inplace = True)
return histogramDescriptionTable return histogramDescriptionTable
@staticmethod
def _createHistogramDescriptionTable(dictByBatchcodeTable):
    """Pick the per-country or the global builder and apply it to the table.

    The per-country builder is used exactly when the table has a 'COUNTRY'
    column; otherwise the global builder is used.
    """
    hasCountryColumn = 'COUNTRY' in dictByBatchcodeTable.columns
    createTable = (
        HistogramDescriptionTableFactory._createHistogramDescriptionTableForCountries
        if hasCountryColumn
        else HistogramDescriptionTableFactory._createGlobalHistogramDescriptionTable)
    return createTable(dictByBatchcodeTable)
@staticmethod
def _createHistogramDescriptionTableForCountries(dictByBatchcodeTable):
    """Aggregate histograms per (exploded batch code, country) pair.

    Groups by 'VAX_LOT_EXPLODED' and 'COUNTRY', aggregates every group with
    ``_getHistograms`` and then moves 'COUNTRY' out of the index back into a
    regular column, so the result is indexed by 'VAX_LOT_EXPLODED' only.
    Rows labelled with the string 'nan' are removed.
    """
    return (dictByBatchcodeTable
            .groupby(['VAX_LOT_EXPLODED', 'COUNTRY'])
            .agg(HistogramDescriptionTableFactory._getHistograms)
            .reset_index(level = 'COUNTRY')
            # errors = 'ignore': a table without any 'nan' row must not
            # raise a KeyError, there is simply nothing to remove then.
            .drop('nan', errors = 'ignore'))
@staticmethod
def _createGlobalHistogramDescriptionTable(dictByBatchcodeTable):
    """Aggregate histograms per exploded batch code, ignoring countries.

    Groups by 'VAX_LOT_EXPLODED', aggregates every group with
    ``_getHistograms`` and removes the row labelled with the string 'nan'.
    """
    return (dictByBatchcodeTable
            .groupby('VAX_LOT_EXPLODED')
            .agg(HistogramDescriptionTableFactory._getHistograms)
            # errors = 'ignore': a table without a 'nan' group must not raise
            # a KeyError, there is simply nothing to remove then.
            .drop('nan', errors = 'ignore'))
@staticmethod @staticmethod
def _getHistograms(dictByBatchcodeTable): def _getHistograms(dictByBatchcodeTable):
dictByBatchcodeTable = dictByBatchcodeTable.to_frame() dictByBatchcodeTable = dictByBatchcodeTable.to_frame()

View File

@@ -6,7 +6,7 @@ import pandas as pd
class HistogramDescriptionTableFactoryTest(unittest.TestCase): class HistogramDescriptionTableFactoryTest(unittest.TestCase):
def test_createHistogramDescriptionTable(self): def test_createGlobalHistogramDescriptionTable(self):
# Given # Given
dictByBatchcodeTable = TestHelper.createDataFrame( dictByBatchcodeTable = TestHelper.createDataFrame(
columns = ['SYMPTOM_COUNT_BY_VAX_LOT'], columns = ['SYMPTOM_COUNT_BY_VAX_LOT'],
@@ -93,3 +93,101 @@ class HistogramDescriptionTableFactoryTest(unittest.TestCase):
'FD1921', 'FD1921',
'015M20A'])), '015M20A'])),
check_like = True) check_like = True)
# Checks createHistogramDescriptionTable for an input table that has a COUNTRY
# column: the result is expected to have one row per batch code with the columns
# HISTOGRAM_DESCRIPTION and COUNTRY.
def test_createHistogramDescriptionTable4Countries(self):
# Given
# Six already exploded rows, all belonging to 'Country A'. The first index
# level (VAX_LOT_EXPLODED) is the single batch code a row is attributed to,
# the remaining levels (VAX_LOT1..3) hold the batch code combination of the row.
dictByBatchcodeTable = TestHelper.createDataFrame(
columns = ['SYMPTOM_COUNT_BY_VAX_LOT', 'COUNTRY'],
data = [ [{"Blood pressure orthostatic abnormal": 5, "Chest discomfort": 1}, 'Country A'],
[{"Blood pressure orthostatic abnormal": 5, "Chest discomfort": 1}, 'Country A'],
[{"Blood pressure orthostatic abnormal": 5, "Chest discomfort": 1}, 'Country A'],
[{"Chest discomfort": 2}, 'Country A'],
[{"Chest discomfort": 2}, 'Country A'],
[{"Chest discomfort": 2}, 'Country A']
],
index = pd.MultiIndex.from_tuples(
names = ['VAX_LOT_EXPLODED', 'VAX_LOT1', 'VAX_LOT2', 'VAX_LOT3'],
tuples = [['1808982', '1808982', 'EW0175', 'FD1921'],
['EW0175', '1808982', 'EW0175', 'FD1921'],
['FD1921', '1808982', 'EW0175', 'FD1921'],
['015M20A', '015M20A', '1808982', 'nan'],
['1808982', '015M20A', '1808982', 'nan'],
['nan', '015M20A', '1808982', 'nan']]))
# When
histogramDescriptionTable = HistogramDescriptionTableFactory.createHistogramDescriptionTable(dictByBatchcodeTable)
# Then
# Expected: one row per batch code, indexed by VAX_LOT. The input row whose
# VAX_LOT_EXPLODED is 'nan' has no counterpart in the expected table, and the
# 'nan' entries do not show up in the expected "batchcodes" lists either.
assert_frame_equal(
histogramDescriptionTable,
TestHelper.createDataFrame(
columns = ['HISTOGRAM_DESCRIPTION', 'COUNTRY'],
data = [ [
{
"batchcode": "1808982",
"histograms": [
{
"batchcodes": ["1808982", "EW0175", "FD1921"],
"histogram": {
"Blood pressure orthostatic abnormal": 5,
"Chest discomfort": 1}
},
{
"batchcodes": ["015M20A", "1808982"],
"histogram": {"Chest discomfort": 2}
}
]
},
'Country A'
],
[
{
"batchcode": "EW0175",
"histograms": [
{
"batchcodes": ["1808982", "EW0175", "FD1921"],
"histogram": {
"Blood pressure orthostatic abnormal": 5,
"Chest discomfort": 1}
}
]
},
'Country A'
],
[
{
"batchcode": "FD1921",
"histograms": [
{
"batchcodes": ["1808982", "EW0175", "FD1921"],
"histogram": {
"Blood pressure orthostatic abnormal": 5,
"Chest discomfort": 1}
}
]
},
'Country A'
],
[
{
"batchcode": "015M20A",
"histograms": [
{
"batchcodes": ["015M20A", "1808982"],
"histogram": {"Chest discomfort": 2}
}
]
},
'Country A'
]
],
index = pd.Index(
name = 'VAX_LOT',
data = [
'1808982',
'EW0175',
'FD1921',
'015M20A'])),
# check_like = True: the order of index and columns is ignored by the comparison.
check_like = True)

View File

@@ -13,18 +13,8 @@ def createAndSaveGlobalHistograms(symptomByBatchcodeTable):
def createAndSaveHistogramsForCountries(symptomByBatchcodeTable, countries): def createAndSaveHistogramsForCountries(symptomByBatchcodeTable, countries):
dictByBatchcodeTable = createHistograms(symptomByBatchcodeTable) dictByBatchcodeTable = createHistograms(symptomByBatchcodeTable)
for count, country in enumerate(countries, start = 1): explodedTable = MultiIndexExploder.explodeMultiIndexOfTable(dictByBatchcodeTable)
_createAndSaveHistogramsForCountry(
count = count,
numCountries = len(countries),
country = country,
dictByBatchcodeTable = dictByBatchcodeTable)
def _createAndSaveHistogramsForCountry(count, numCountries, country, dictByBatchcodeTable):
# FK-TODO: use https://github.com/tqdm/tqdm
print(f'saving histograms for country {count}/{numCountries}: {country}')
dictByBatchcodeTable4Country = dictByBatchcodeTable[dictByBatchcodeTable['COUNTRY'] == country]
explodedTable = MultiIndexExploder.explodeMultiIndexOfTable(dictByBatchcodeTable4Country)
histogramDescriptionTable = HistogramDescriptionTableFactory.createHistogramDescriptionTable(explodedTable) histogramDescriptionTable = HistogramDescriptionTableFactory.createHistogramDescriptionTable(explodedTable)
saveHistograms(histogramDescriptionTable, country) for country, histogramDescriptionTableForCountry in histogramDescriptionTable.groupby('COUNTRY'):
print(country, ':')
saveHistograms(histogramDescriptionTableForCountry, country)

View File

@@ -7,6 +7,6 @@ class MultiIndexExploder:
@staticmethod @staticmethod
def explodeMultiIndexOfTable(table): def explodeMultiIndexOfTable(table):
batchcodeColumns = table.index.names batchcodeColumns = table.index.names
explodedTable = table.loc[np.repeat(table.index, len(batchcodeColumns))].reset_index() explodedTable = table.iloc[np.repeat(range(len(table.index)), len(batchcodeColumns))].reset_index()
explodedTable['VAX_LOT_EXPLODED'] = Utils.flatten(table.index.values) explodedTable['VAX_LOT_EXPLODED'] = Utils.flatten(table.index.values)
return explodedTable.set_index(['VAX_LOT_EXPLODED'] + batchcodeColumns) return explodedTable.set_index(['VAX_LOT_EXPLODED'] + batchcodeColumns)

View File

@@ -9,10 +9,10 @@ class MultiIndexExploderTest(unittest.TestCase):
def test_explodeMultiIndexOfTable(self): def test_explodeMultiIndexOfTable(self):
# Given # Given
table = TestHelper.createDataFrame( table = TestHelper.createDataFrame(
columns = ['DATA'], columns = ['DATA', 'COUNTRY'],
data = [ ['A, B data'], data = [ ['A, B data', 'Country A'],
['C, A data'], ['C, A data', 'Country B'],
['C, B data']], ['C, B data', 'Country C']],
index = pd.MultiIndex.from_tuples( index = pd.MultiIndex.from_tuples(
names = ['VAX_LOT1', 'VAX_LOT2'], names = ['VAX_LOT1', 'VAX_LOT2'],
tuples = [['A', 'B'], tuples = [['A', 'B'],
@@ -26,15 +26,15 @@ class MultiIndexExploderTest(unittest.TestCase):
assert_frame_equal( assert_frame_equal(
explodedTable, explodedTable,
TestHelper.createDataFrame( TestHelper.createDataFrame(
columns = ['DATA'], columns = ['DATA', 'COUNTRY'],
data = [ ['A, B data'], data = [ ['A, B data', 'Country A'],
['A, B data'], ['A, B data', 'Country A'],
['C, A data'], ['C, A data', 'Country B'],
['C, A data'], ['C, A data', 'Country B'],
['C, B data'], ['C, B data', 'Country C'],
['C, B data']], ['C, B data', 'Country C']],
index = pd.MultiIndex.from_tuples( index = pd.MultiIndex.from_tuples(
names = ['VAX_LOT_EXPLODED', 'VAX_LOT1', 'VAX_LOT2'], names = ['VAX_LOT_EXPLODED', 'VAX_LOT1', 'VAX_LOT2'],
tuples = [['A', 'A', 'B'], tuples = [['A', 'A', 'B'],

View File

@@ -1,18 +0,0 @@
from functools import reduce
class TableByBatchcodeFilter:
    """Filters a table whose index levels hold batch codes."""

    @staticmethod
    def filterTableByBatchcode(batchcode, table):
        """Return the rows of ``table`` in which any index level equals ``batchcode``.

        The returned table keeps the index levels of ``table`` (same names,
        same order). It is empty when no row matches.
        """
        # Copy the level names into a plain list before the index is reset.
        batchcodeColumns = list(table.index.names)
        # Turn the index levels into ordinary columns so they can be compared.
        flatTable = table.reset_index()
        rowMask = TableByBatchcodeFilter._existsBatchcodeInAnyBatchcodeColumn(
            flatTable,
            batchcodeColumns,
            batchcode)
        return flatTable[rowMask].set_index(batchcodeColumns)

    @staticmethod
    def _existsBatchcodeInAnyBatchcodeColumn(table, batchcodeColumns, batchcode):
        """Return a boolean Series that is True for rows containing ``batchcode``.

        Only the columns named in ``batchcodeColumns`` are inspected. With no
        such columns the mask is False for every row.
        """
        # One vectorized comparison instead of OR-ing the columns one by one.
        return table[list(batchcodeColumns)].eq(batchcode).any(axis = 1)

View File

@@ -1,65 +0,0 @@
import unittest
from pandas.testing import assert_frame_equal
from TableByBatchcodeFilter import TableByBatchcodeFilter
from TestHelper import TestHelper
import pandas as pd
# Tests for TableByBatchcodeFilter.filterTableByBatchcode.
# NOTE(review): the test method names mention convertHistogramTable2JsonTable,
# but both bodies exercise filterTableByBatchcode - the names look like leftovers.
class TableByBatchcodeFilterTest(unittest.TestCase):
# Two index levels: rows with the batch code in either level are expected to be kept.
def test_convertHistogramTable2JsonTable_2_VAX_LOT_columns(self):
# Given
batchcode = '1808982'
# The batch code is in VAX_LOT1 of the first row, in VAX_LOT2 of the second
# row and in neither level of the third row.
symptomHistogramByBatchcodeTable = TestHelper.createDataFrame(
columns = ['SYMPTOM_COUNT_BY_VAX_LOT'],
data = [ ['{"Blood pressure orthostatic abnormal":5,"Chest discomfort":1}'],
['{"Chest discomfort":2}'],
['{"Chills":5}']],
index = pd.MultiIndex.from_tuples(
names = ['VAX_LOT1', 'VAX_LOT2'],
tuples = [[batchcode, 'EW0175'],
['015M20A', batchcode],
['015M20A', 'EW0175']]))
# When
filteredTable = TableByBatchcodeFilter.filterTableByBatchcode(batchcode, symptomHistogramByBatchcodeTable)
# Then
# Only the first two rows remain; the index levels are unchanged.
assert_frame_equal(
filteredTable,
TestHelper.createDataFrame(
columns = ['SYMPTOM_COUNT_BY_VAX_LOT'],
data = [ ['{"Blood pressure orthostatic abnormal":5,"Chest discomfort":1}'],
['{"Chest discomfort":2}']],
index = pd.MultiIndex.from_tuples(
names = ['VAX_LOT1', 'VAX_LOT2'],
tuples = [[batchcode, 'EW0175'],
['015M20A', batchcode]])))
# Three index levels, some of them None: rows with the batch code in any level are expected to be kept.
def test_convertHistogramTable2JsonTable_3_VAX_LOT_columns(self):
# Given
batchcode = '1808983'
# The batch code is in VAX_LOT1 of the first row, in VAX_LOT3 of the second
# row and in no level of the third row.
symptomHistogramByBatchcodeTable = TestHelper.createDataFrame(
columns = ['SYMPTOM_COUNT_BY_VAX_LOT'],
data = [ ['{"Blood pressure orthostatic abnormal":5,"Chest discomfort":1}'],
['{"Chest discomfort":2}'],
['{"Chills":5}']],
index = pd.MultiIndex.from_tuples(
names = ['VAX_LOT1', 'VAX_LOT2', 'VAX_LOT3'],
tuples = [[batchcode, 'EW0175', None],
['015M20A', None, batchcode],
['015M20A', 'EW0175', 'dummy2']]))
# When
filteredTable = TableByBatchcodeFilter.filterTableByBatchcode(batchcode, symptomHistogramByBatchcodeTable)
# Then
# Only the first two rows remain, including their None index entries.
assert_frame_equal(
filteredTable,
TestHelper.createDataFrame(
columns = ['SYMPTOM_COUNT_BY_VAX_LOT'],
data = [ ['{"Blood pressure orthostatic abnormal":5,"Chest discomfort":1}'],
['{"Chest discomfort":2}']],
index = pd.MultiIndex.from_tuples(
names = ['VAX_LOT1', 'VAX_LOT2', 'VAX_LOT3'],
tuples = [[batchcode, 'EW0175', None],
['015M20A', None, batchcode]])))