generating histograms for countries more efficient
This commit is contained in:
@@ -1,21 +0,0 @@
|
|||||||
from TableByBatchcodeFilter import TableByBatchcodeFilter
|
|
||||||
from DictByBatchcodeTable2DictConverter import DictByBatchcodeTable2DictConverter
|
|
||||||
from IOUtils import IOUtils
|
|
||||||
|
|
||||||
|
|
||||||
class HistogramDescriptionPersister:
|
|
||||||
|
|
||||||
def __init__(self, directory):
|
|
||||||
self.directory = directory
|
|
||||||
|
|
||||||
def saveHistogramDescriptionsForBatchcodes(self, batchcodes, dictByBatchcodeTable, progress):
|
|
||||||
for count, batchcode in enumerate(batchcodes, start = 1):
|
|
||||||
histogramDescription = self._getHistogramDescriptionForBatchcode(batchcode, dictByBatchcodeTable)
|
|
||||||
# FK-TODO: nicht direkt {batchcode}.json speichern, denn im Dateinamen könnte sich dann ein '/' befinden, was ein nicht gewünschtes Unterverzeichnis erzeugt. Deshalb in der Batchcode-Tabelle eine unsichtbare Spalte einfügen, in welcher für den jeweiligen batchcode der bereinigte und eindeutige Dateiname steht (z.B. einfach durchnummeriert: 0.json, 1.json, ...).
|
|
||||||
IOUtils.saveDictAsJson(histogramDescription, f'{self.directory}/{batchcode}.json')
|
|
||||||
progress(count, len(batchcodes), batchcode)
|
|
||||||
|
|
||||||
def _getHistogramDescriptionForBatchcode(self, batchcode, dictByBatchcodeTable):
|
|
||||||
dictByBatchcodeTableForBatchcode = TableByBatchcodeFilter.filterTableByBatchcode(batchcode, dictByBatchcodeTable)
|
|
||||||
histogramDescription = DictByBatchcodeTable2DictConverter.convertDictByBatchcodeTable2Dict(dictByBatchcodeTableForBatchcode, batchcode)
|
|
||||||
return histogramDescription
|
|
||||||
@@ -4,15 +4,34 @@ class HistogramDescriptionTableFactory:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def createHistogramDescriptionTable(dictByBatchcodeTable):
|
def createHistogramDescriptionTable(dictByBatchcodeTable):
|
||||||
histogramDescriptionTable = (
|
histogramDescriptionTable = HistogramDescriptionTableFactory._createHistogramDescriptionTable(dictByBatchcodeTable)
|
||||||
dictByBatchcodeTable
|
|
||||||
.groupby('VAX_LOT_EXPLODED')
|
|
||||||
.agg(HistogramDescriptionTableFactory._getHistograms)
|
|
||||||
.drop('nan'))
|
|
||||||
histogramDescriptionTable = histogramDescriptionTable.rename(columns = { "SYMPTOM_COUNT_BY_VAX_LOT": "HISTOGRAM_DESCRIPTION" })
|
histogramDescriptionTable = histogramDescriptionTable.rename(columns = { "SYMPTOM_COUNT_BY_VAX_LOT": "HISTOGRAM_DESCRIPTION" })
|
||||||
histogramDescriptionTable.index.rename('VAX_LOT', inplace = True)
|
histogramDescriptionTable.index.rename('VAX_LOT', inplace = True)
|
||||||
return histogramDescriptionTable
|
return histogramDescriptionTable
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _createHistogramDescriptionTable(dictByBatchcodeTable):
|
||||||
|
if 'COUNTRY' in dictByBatchcodeTable.columns:
|
||||||
|
return HistogramDescriptionTableFactory._createHistogramDescriptionTableForCountries(dictByBatchcodeTable)
|
||||||
|
else:
|
||||||
|
return HistogramDescriptionTableFactory._createGlobalHistogramDescriptionTable(dictByBatchcodeTable)
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _createHistogramDescriptionTableForCountries(dictByBatchcodeTable):
|
||||||
|
return (dictByBatchcodeTable
|
||||||
|
.groupby(['VAX_LOT_EXPLODED', 'COUNTRY'])
|
||||||
|
.agg(HistogramDescriptionTableFactory._getHistograms)
|
||||||
|
.reset_index(level = 'COUNTRY')
|
||||||
|
.drop('nan'))
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _createGlobalHistogramDescriptionTable(dictByBatchcodeTable):
|
||||||
|
return (dictByBatchcodeTable
|
||||||
|
.groupby('VAX_LOT_EXPLODED')
|
||||||
|
.agg(HistogramDescriptionTableFactory._getHistograms)
|
||||||
|
.drop('nan'))
|
||||||
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _getHistograms(dictByBatchcodeTable):
|
def _getHistograms(dictByBatchcodeTable):
|
||||||
dictByBatchcodeTable = dictByBatchcodeTable.to_frame()
|
dictByBatchcodeTable = dictByBatchcodeTable.to_frame()
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ import pandas as pd
|
|||||||
|
|
||||||
class HistogramDescriptionTableFactoryTest(unittest.TestCase):
|
class HistogramDescriptionTableFactoryTest(unittest.TestCase):
|
||||||
|
|
||||||
def test_createHistogramDescriptionTable(self):
|
def test_createGlobalHistogramDescriptionTable(self):
|
||||||
# Given
|
# Given
|
||||||
dictByBatchcodeTable = TestHelper.createDataFrame(
|
dictByBatchcodeTable = TestHelper.createDataFrame(
|
||||||
columns = ['SYMPTOM_COUNT_BY_VAX_LOT'],
|
columns = ['SYMPTOM_COUNT_BY_VAX_LOT'],
|
||||||
@@ -93,3 +93,101 @@ class HistogramDescriptionTableFactoryTest(unittest.TestCase):
|
|||||||
'FD1921',
|
'FD1921',
|
||||||
'015M20A'])),
|
'015M20A'])),
|
||||||
check_like = True)
|
check_like = True)
|
||||||
|
|
||||||
|
def test_createHistogramDescriptionTable4Countries(self):
|
||||||
|
# Given
|
||||||
|
dictByBatchcodeTable = TestHelper.createDataFrame(
|
||||||
|
columns = ['SYMPTOM_COUNT_BY_VAX_LOT', 'COUNTRY'],
|
||||||
|
data = [ [{"Blood pressure orthostatic abnormal": 5, "Chest discomfort": 1}, 'Country A'],
|
||||||
|
[{"Blood pressure orthostatic abnormal": 5, "Chest discomfort": 1}, 'Country A'],
|
||||||
|
[{"Blood pressure orthostatic abnormal": 5, "Chest discomfort": 1}, 'Country A'],
|
||||||
|
|
||||||
|
[{"Chest discomfort": 2}, 'Country A'],
|
||||||
|
[{"Chest discomfort": 2}, 'Country A'],
|
||||||
|
[{"Chest discomfort": 2}, 'Country A']
|
||||||
|
],
|
||||||
|
index = pd.MultiIndex.from_tuples(
|
||||||
|
names = ['VAX_LOT_EXPLODED', 'VAX_LOT1', 'VAX_LOT2', 'VAX_LOT3'],
|
||||||
|
tuples = [['1808982', '1808982', 'EW0175', 'FD1921'],
|
||||||
|
['EW0175', '1808982', 'EW0175', 'FD1921'],
|
||||||
|
['FD1921', '1808982', 'EW0175', 'FD1921'],
|
||||||
|
|
||||||
|
['015M20A', '015M20A', '1808982', 'nan'],
|
||||||
|
['1808982', '015M20A', '1808982', 'nan'],
|
||||||
|
['nan', '015M20A', '1808982', 'nan']]))
|
||||||
|
|
||||||
|
# When
|
||||||
|
histogramDescriptionTable = HistogramDescriptionTableFactory.createHistogramDescriptionTable(dictByBatchcodeTable)
|
||||||
|
|
||||||
|
# Then
|
||||||
|
assert_frame_equal(
|
||||||
|
histogramDescriptionTable,
|
||||||
|
TestHelper.createDataFrame(
|
||||||
|
columns = ['HISTOGRAM_DESCRIPTION', 'COUNTRY'],
|
||||||
|
data = [ [
|
||||||
|
{
|
||||||
|
"batchcode": "1808982",
|
||||||
|
"histograms": [
|
||||||
|
{
|
||||||
|
"batchcodes": ["1808982", "EW0175", "FD1921"],
|
||||||
|
"histogram": {
|
||||||
|
"Blood pressure orthostatic abnormal": 5,
|
||||||
|
"Chest discomfort": 1}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"batchcodes": ["015M20A", "1808982"],
|
||||||
|
"histogram": {"Chest discomfort": 2}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
'Country A'
|
||||||
|
],
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"batchcode": "EW0175",
|
||||||
|
"histograms": [
|
||||||
|
{
|
||||||
|
"batchcodes": ["1808982", "EW0175", "FD1921"],
|
||||||
|
"histogram": {
|
||||||
|
"Blood pressure orthostatic abnormal": 5,
|
||||||
|
"Chest discomfort": 1}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
'Country A'
|
||||||
|
],
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"batchcode": "FD1921",
|
||||||
|
"histograms": [
|
||||||
|
{
|
||||||
|
"batchcodes": ["1808982", "EW0175", "FD1921"],
|
||||||
|
"histogram": {
|
||||||
|
"Blood pressure orthostatic abnormal": 5,
|
||||||
|
"Chest discomfort": 1}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
'Country A'
|
||||||
|
],
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"batchcode": "015M20A",
|
||||||
|
"histograms": [
|
||||||
|
{
|
||||||
|
"batchcodes": ["015M20A", "1808982"],
|
||||||
|
"histogram": {"Chest discomfort": 2}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
'Country A'
|
||||||
|
]
|
||||||
|
],
|
||||||
|
index = pd.Index(
|
||||||
|
name = 'VAX_LOT',
|
||||||
|
data = [
|
||||||
|
'1808982',
|
||||||
|
'EW0175',
|
||||||
|
'FD1921',
|
||||||
|
'015M20A'])),
|
||||||
|
check_like = True)
|
||||||
|
|||||||
@@ -13,18 +13,8 @@ def createAndSaveGlobalHistograms(symptomByBatchcodeTable):
|
|||||||
|
|
||||||
def createAndSaveHistogramsForCountries(symptomByBatchcodeTable, countries):
|
def createAndSaveHistogramsForCountries(symptomByBatchcodeTable, countries):
|
||||||
dictByBatchcodeTable = createHistograms(symptomByBatchcodeTable)
|
dictByBatchcodeTable = createHistograms(symptomByBatchcodeTable)
|
||||||
for count, country in enumerate(countries, start = 1):
|
explodedTable = MultiIndexExploder.explodeMultiIndexOfTable(dictByBatchcodeTable)
|
||||||
_createAndSaveHistogramsForCountry(
|
|
||||||
count = count,
|
|
||||||
numCountries = len(countries),
|
|
||||||
country = country,
|
|
||||||
dictByBatchcodeTable = dictByBatchcodeTable)
|
|
||||||
|
|
||||||
|
|
||||||
def _createAndSaveHistogramsForCountry(count, numCountries, country, dictByBatchcodeTable):
|
|
||||||
# FK-TODO: use https://github.com/tqdm/tqdm
|
|
||||||
print(f'saving histograms for country {count}/{numCountries}: {country}')
|
|
||||||
dictByBatchcodeTable4Country = dictByBatchcodeTable[dictByBatchcodeTable['COUNTRY'] == country]
|
|
||||||
explodedTable = MultiIndexExploder.explodeMultiIndexOfTable(dictByBatchcodeTable4Country)
|
|
||||||
histogramDescriptionTable = HistogramDescriptionTableFactory.createHistogramDescriptionTable(explodedTable)
|
histogramDescriptionTable = HistogramDescriptionTableFactory.createHistogramDescriptionTable(explodedTable)
|
||||||
saveHistograms(histogramDescriptionTable, country)
|
for country, histogramDescriptionTableForCountry in histogramDescriptionTable.groupby('COUNTRY'):
|
||||||
|
print(country, ':')
|
||||||
|
saveHistograms(histogramDescriptionTableForCountry, country)
|
||||||
@@ -7,6 +7,6 @@ class MultiIndexExploder:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def explodeMultiIndexOfTable(table):
|
def explodeMultiIndexOfTable(table):
|
||||||
batchcodeColumns = table.index.names
|
batchcodeColumns = table.index.names
|
||||||
explodedTable = table.loc[np.repeat(table.index, len(batchcodeColumns))].reset_index()
|
explodedTable = table.iloc[np.repeat(range(len(table.index)), len(batchcodeColumns))].reset_index()
|
||||||
explodedTable['VAX_LOT_EXPLODED'] = Utils.flatten(table.index.values)
|
explodedTable['VAX_LOT_EXPLODED'] = Utils.flatten(table.index.values)
|
||||||
return explodedTable.set_index(['VAX_LOT_EXPLODED'] + batchcodeColumns)
|
return explodedTable.set_index(['VAX_LOT_EXPLODED'] + batchcodeColumns)
|
||||||
|
|||||||
@@ -9,10 +9,10 @@ class MultiIndexExploderTest(unittest.TestCase):
|
|||||||
def test_explodeMultiIndexOfTable(self):
|
def test_explodeMultiIndexOfTable(self):
|
||||||
# Given
|
# Given
|
||||||
table = TestHelper.createDataFrame(
|
table = TestHelper.createDataFrame(
|
||||||
columns = ['DATA'],
|
columns = ['DATA', 'COUNTRY'],
|
||||||
data = [ ['A, B data'],
|
data = [ ['A, B data', 'Country A'],
|
||||||
['C, A data'],
|
['C, A data', 'Country B'],
|
||||||
['C, B data']],
|
['C, B data', 'Country C']],
|
||||||
index = pd.MultiIndex.from_tuples(
|
index = pd.MultiIndex.from_tuples(
|
||||||
names = ['VAX_LOT1', 'VAX_LOT2'],
|
names = ['VAX_LOT1', 'VAX_LOT2'],
|
||||||
tuples = [['A', 'B'],
|
tuples = [['A', 'B'],
|
||||||
@@ -26,15 +26,15 @@ class MultiIndexExploderTest(unittest.TestCase):
|
|||||||
assert_frame_equal(
|
assert_frame_equal(
|
||||||
explodedTable,
|
explodedTable,
|
||||||
TestHelper.createDataFrame(
|
TestHelper.createDataFrame(
|
||||||
columns = ['DATA'],
|
columns = ['DATA', 'COUNTRY'],
|
||||||
data = [ ['A, B data'],
|
data = [ ['A, B data', 'Country A'],
|
||||||
['A, B data'],
|
['A, B data', 'Country A'],
|
||||||
|
|
||||||
['C, A data'],
|
['C, A data', 'Country B'],
|
||||||
['C, A data'],
|
['C, A data', 'Country B'],
|
||||||
|
|
||||||
['C, B data'],
|
['C, B data', 'Country C'],
|
||||||
['C, B data']],
|
['C, B data', 'Country C']],
|
||||||
index = pd.MultiIndex.from_tuples(
|
index = pd.MultiIndex.from_tuples(
|
||||||
names = ['VAX_LOT_EXPLODED', 'VAX_LOT1', 'VAX_LOT2'],
|
names = ['VAX_LOT_EXPLODED', 'VAX_LOT1', 'VAX_LOT2'],
|
||||||
tuples = [['A', 'A', 'B'],
|
tuples = [['A', 'A', 'B'],
|
||||||
|
|||||||
@@ -1,18 +0,0 @@
|
|||||||
from functools import reduce
|
|
||||||
|
|
||||||
|
|
||||||
class TableByBatchcodeFilter:
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def filterTableByBatchcode(batchcode, table):
|
|
||||||
batchcodeColumns = table.index.names
|
|
||||||
table = table.reset_index()
|
|
||||||
filteredTable = table[TableByBatchcodeFilter._existsBatchcodeInAnyBatchcodeColumn(table, batchcodeColumns, batchcode)]
|
|
||||||
return filteredTable.set_index(batchcodeColumns)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _existsBatchcodeInAnyBatchcodeColumn(table, batchcodeColumns, batchcode):
|
|
||||||
return reduce(
|
|
||||||
lambda accum, batchcodeColumn: accum | (table[batchcodeColumn] == batchcode),
|
|
||||||
batchcodeColumns,
|
|
||||||
[False] * len(table.index))
|
|
||||||
@@ -1,65 +0,0 @@
|
|||||||
import unittest
|
|
||||||
from pandas.testing import assert_frame_equal
|
|
||||||
from TableByBatchcodeFilter import TableByBatchcodeFilter
|
|
||||||
from TestHelper import TestHelper
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
class TableByBatchcodeFilterTest(unittest.TestCase):
|
|
||||||
|
|
||||||
def test_convertHistogramTable2JsonTable_2_VAX_LOT_columns(self):
|
|
||||||
# Given
|
|
||||||
batchcode = '1808982'
|
|
||||||
symptomHistogramByBatchcodeTable = TestHelper.createDataFrame(
|
|
||||||
columns = ['SYMPTOM_COUNT_BY_VAX_LOT'],
|
|
||||||
data = [ ['{"Blood pressure orthostatic abnormal":5,"Chest discomfort":1}'],
|
|
||||||
['{"Chest discomfort":2}'],
|
|
||||||
['{"Chills":5}']],
|
|
||||||
index = pd.MultiIndex.from_tuples(
|
|
||||||
names = ['VAX_LOT1', 'VAX_LOT2'],
|
|
||||||
tuples = [[batchcode, 'EW0175'],
|
|
||||||
['015M20A', batchcode],
|
|
||||||
['015M20A', 'EW0175']]))
|
|
||||||
|
|
||||||
# When
|
|
||||||
filteredTable = TableByBatchcodeFilter.filterTableByBatchcode(batchcode, symptomHistogramByBatchcodeTable)
|
|
||||||
|
|
||||||
# Then
|
|
||||||
assert_frame_equal(
|
|
||||||
filteredTable,
|
|
||||||
TestHelper.createDataFrame(
|
|
||||||
columns = ['SYMPTOM_COUNT_BY_VAX_LOT'],
|
|
||||||
data = [ ['{"Blood pressure orthostatic abnormal":5,"Chest discomfort":1}'],
|
|
||||||
['{"Chest discomfort":2}']],
|
|
||||||
index = pd.MultiIndex.from_tuples(
|
|
||||||
names = ['VAX_LOT1', 'VAX_LOT2'],
|
|
||||||
tuples = [[batchcode, 'EW0175'],
|
|
||||||
['015M20A', batchcode]])))
|
|
||||||
|
|
||||||
def test_convertHistogramTable2JsonTable_3_VAX_LOT_columns(self):
|
|
||||||
# Given
|
|
||||||
batchcode = '1808983'
|
|
||||||
symptomHistogramByBatchcodeTable = TestHelper.createDataFrame(
|
|
||||||
columns = ['SYMPTOM_COUNT_BY_VAX_LOT'],
|
|
||||||
data = [ ['{"Blood pressure orthostatic abnormal":5,"Chest discomfort":1}'],
|
|
||||||
['{"Chest discomfort":2}'],
|
|
||||||
['{"Chills":5}']],
|
|
||||||
index = pd.MultiIndex.from_tuples(
|
|
||||||
names = ['VAX_LOT1', 'VAX_LOT2', 'VAX_LOT3'],
|
|
||||||
tuples = [[batchcode, 'EW0175', None],
|
|
||||||
['015M20A', None, batchcode],
|
|
||||||
['015M20A', 'EW0175', 'dummy2']]))
|
|
||||||
|
|
||||||
# When
|
|
||||||
filteredTable = TableByBatchcodeFilter.filterTableByBatchcode(batchcode, symptomHistogramByBatchcodeTable)
|
|
||||||
|
|
||||||
# Then
|
|
||||||
assert_frame_equal(
|
|
||||||
filteredTable,
|
|
||||||
TestHelper.createDataFrame(
|
|
||||||
columns = ['SYMPTOM_COUNT_BY_VAX_LOT'],
|
|
||||||
data = [ ['{"Blood pressure orthostatic abnormal":5,"Chest discomfort":1}'],
|
|
||||||
['{"Chest discomfort":2}']],
|
|
||||||
index = pd.MultiIndex.from_tuples(
|
|
||||||
names = ['VAX_LOT1', 'VAX_LOT2', 'VAX_LOT3'],
|
|
||||||
tuples = [[batchcode, 'EW0175', None],
|
|
||||||
['015M20A', None, batchcode]])))
|
|
||||||
Reference in New Issue
Block a user