From 5cd36b8e1e39ee00d4f5b54ee2d1eaa8a275a381 Mon Sep 17 00:00:00 2001 From: frankknoll Date: Wed, 4 Oct 2023 11:34:20 +0200 Subject: [PATCH] adding CityCountsByBatchcodeTablesMerger --- .../CityCountsByBatchcodeTablesMerger.py | 19 +++++ .../CityCountsByBatchcodeTablesMergerTest.py | 27 +++++++ .../RegionCountsByBatchcodeTablesMerger.py | 8 +- .../RegionCountsByClickedBatchcodeProvider.py | 17 ++-- src/HowBadIsMyBatch.ipynb | 80 +++++++++++++++++++ 5 files changed, 140 insertions(+), 11 deletions(-) create mode 100644 src/GoogleAnalytics/CityCountsByBatchcodeTablesMerger.py create mode 100644 src/GoogleAnalytics/CityCountsByBatchcodeTablesMergerTest.py diff --git a/src/GoogleAnalytics/CityCountsByBatchcodeTablesMerger.py b/src/GoogleAnalytics/CityCountsByBatchcodeTablesMerger.py new file mode 100644 index 00000000000..793f388fa7c --- /dev/null +++ b/src/GoogleAnalytics/CityCountsByBatchcodeTablesMerger.py @@ -0,0 +1,19 @@ +import pandas as pd +from GoogleAnalytics.RegionCountsByClickedBatchcodeProvider import RegionCountsByClickedBatchcodeProvider +from GoogleAnalytics.FilesProvider import FilesProvider +from GoogleAnalytics.Resolution import Resolution + +class CityCountsByBatchcodeTablesMerger: + + @staticmethod + def getCityCountsByClickedBatchcode(dataDir): + files = FilesProvider(dataDir).getFilesHavingResolution(Resolution.CITY) + cityCountsByClickedBatchcodeTables = [RegionCountsByClickedBatchcodeProvider._getCityCountsByClickedBatchcode(file) for file in files] + table = pd.concat(cityCountsByClickedBatchcodeTables) + return CityCountsByBatchcodeTablesMerger._getCityCountsByClickedBatchcodeFromTable(table) + + @staticmethod + def _getCityCountsByClickedBatchcodeFromTable(cityCountsByClickedBatchcodeTable): + return (cityCountsByClickedBatchcodeTable + .groupby(cityCountsByClickedBatchcodeTable.index.names) + .sum()) diff --git a/src/GoogleAnalytics/CityCountsByBatchcodeTablesMergerTest.py b/src/GoogleAnalytics/CityCountsByBatchcodeTablesMergerTest.py new file mode 100644 index 00000000000..7909250e4eb --- /dev/null +++ b/src/GoogleAnalytics/CityCountsByBatchcodeTablesMergerTest.py @@ -0,0 +1,27 @@ +import unittest +from pandas.testing import assert_frame_equal +from TestHelper import TestHelper +import pandas as pd +from GoogleAnalytics.CityCountsByBatchcodeTablesMerger import CityCountsByBatchcodeTablesMerger + +class CityCountsByBatchcodeTablesMergerTest(unittest.TestCase): + + def test_getCityCountsByClickedBatchcode(self): + # Given + + # When + cityCountsByClickedBatchcodeTable = CityCountsByBatchcodeTablesMerger.getCityCountsByClickedBatchcode('src/testdata/GoogleAnalytics') + + # Then + assert_frame_equal( + cityCountsByClickedBatchcodeTable, + TestHelper.createDataFrame( + columns = ['CITY_COUNT_BY_VAX_LOT'], + data = [ [100 + 200], + [10 + 20], + [20 + 40]], + index = pd.MultiIndex.from_tuples( + names = ['VAX_LOT', 'COUNTRY', 'REGION', 'CITY'], + tuples = [['#003B21A', 'United States', 'California', 'Roseville'], + ['000086A', 'Germany', 'Bavaria', 'Nordlingen'], + ['000086A', 'Germany', 'Bavaria', 'Nuremberg']]))) diff --git a/src/GoogleAnalytics/RegionCountsByBatchcodeTablesMerger.py b/src/GoogleAnalytics/RegionCountsByBatchcodeTablesMerger.py index 91f99bd46a9..c791c296930 100644 --- a/src/GoogleAnalytics/RegionCountsByBatchcodeTablesMerger.py +++ b/src/GoogleAnalytics/RegionCountsByBatchcodeTablesMerger.py @@ -1,13 +1,13 @@ +import pandas as pd from GoogleAnalytics.RegionCountsByClickedBatchcodeProvider import RegionCountsByClickedBatchcodeProvider from GoogleAnalytics.FilesProvider import FilesProvider from GoogleAnalytics.Resolution import Resolution -from TablesHelper import TablesHelper class RegionCountsByBatchcodeTablesMerger: @staticmethod def getRegionCountsByClickedBatchcode(dataDir): files = FilesProvider(dataDir).getFilesHavingResolution(Resolution.CITY) - tables = [RegionCountsByClickedBatchcodeProvider.getRegionCountsByClickedBatchcode(file) for file in files] - table = TablesHelper.concatTables_groupByIndex_sum(tables) - return table + cityCountsByClickedBatchcodeTables = [RegionCountsByClickedBatchcodeProvider._getCityCountsByClickedBatchcode(file) for file in files] + table = pd.concat(cityCountsByClickedBatchcodeTables) + return RegionCountsByClickedBatchcodeProvider._getRegionCountsByClickedBatchcodeFromTable(table) diff --git a/src/GoogleAnalytics/RegionCountsByClickedBatchcodeProvider.py b/src/GoogleAnalytics/RegionCountsByClickedBatchcodeProvider.py index 0ddc2ab2c4b..47082430e82 100644 --- a/src/GoogleAnalytics/RegionCountsByClickedBatchcodeProvider.py +++ b/src/GoogleAnalytics/RegionCountsByClickedBatchcodeProvider.py @@ -5,13 +5,7 @@ class RegionCountsByClickedBatchcodeProvider: @staticmethod def getRegionCountsByClickedBatchcode(file): - cityCountsByClickedBatchcodeTable = RegionCountsByClickedBatchcodeProvider._getCityCountsByClickedBatchcode(file) - return (cityCountsByClickedBatchcodeTable - .groupby(['VAX_LOT', 'COUNTRY', 'REGION']) - .agg(REGION_COUNT_BY_VAX_LOT = - pd.NamedAgg( - column = 'CITY_COUNT_BY_VAX_LOT', - aggfunc = sum))) + return RegionCountsByClickedBatchcodeProvider._getRegionCountsByClickedBatchcodeFromTable(RegionCountsByClickedBatchcodeProvider._getCityCountsByClickedBatchcode(file)) # FK-TODO: delegate same method CountryCountsByClickedBatchcodeProvider._getCityCountsByClickedBatchcode() to here @staticmethod @@ -25,3 +19,12 @@ class RegionCountsByClickedBatchcodeProvider: 'Event count': 'CITY_COUNT_BY_VAX_LOT' }, index_columns = ['COUNTRY', 'REGION', 'CITY']) + + @staticmethod + def _getRegionCountsByClickedBatchcodeFromTable(cityCountsByClickedBatchcodeTable): + return (cityCountsByClickedBatchcodeTable + .groupby(['VAX_LOT', 'COUNTRY', 'REGION']) + .agg(REGION_COUNT_BY_VAX_LOT = + pd.NamedAgg( + column = 'CITY_COUNT_BY_VAX_LOT', + aggfunc = sum))) diff --git a/src/HowBadIsMyBatch.ipynb b/src/HowBadIsMyBatch.ipynb index a0b25e67c6a..fc18dc59f8f 100644 --- a/src/HowBadIsMyBatch.ipynb +++ b/src/HowBadIsMyBatch.ipynb @@ -249,6 +249,14 @@ " '../docs/data/barChartDescriptionTable.json')\n" ] }, + { + "cell_type": "markdown", + "id": "c5a66a3b", + "metadata": {}, + "source": [ + "# Google Analytics" + ] + }, { "cell_type": "code", "execution_count": null, @@ -283,6 +291,78 @@ "regionCountsByClickedBatchcodeTable4Germany.to_excel('tmp/regionCountsByClickedBatchcodeTable4Germany.xlsx')" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c05fcfc", + "metadata": {}, + "outputs": [], + "source": [ + "# VAX_LOT: EX8679\n", + "(regionCountsByClickedBatchcodeTable4Germany\n", + " .groupby('VAX_LOT')\n", + " .sum()\n", + " .sort_values(by = 'REGION_COUNT_BY_VAX_LOT', ascending = False))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "370bf329", + "metadata": {}, + "outputs": [], + "source": [ + "(regionCountsByClickedBatchcodeTable4Germany\n", + " .loc[('EX8679', slice(None), slice(None)), :]\n", + " .sort_values(by = 'REGION_COUNT_BY_VAX_LOT', ascending = False))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9740c40b", + "metadata": {}, + "outputs": [], + "source": [ + "from GoogleAnalytics.CityCountsByBatchcodeTablesMerger import CityCountsByBatchcodeTablesMerger\n", + "\n", + "cityCountsByClickedBatchcodeTable = CityCountsByBatchcodeTablesMerger.getCityCountsByClickedBatchcode('data/GoogleAnalytics')\n", + "cityCountsByClickedBatchcodeTable" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0dac0ea6", + "metadata": {}, + "outputs": [], + "source": [ + "cityCountsByClickedBatchcodeTable_EX8679_Germany = cityCountsByClickedBatchcodeTable.loc[('EX8679', 'Germany', slice(None), slice(None)), :]\n", + "cityCountsByClickedBatchcodeTable_EX8679_Germany" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d7a8bbf", + "metadata": {}, + "outputs": [], + "source": [ + "cityCountsByClickedBatchcodeTable_EX8679_Germany.to_excel('tmp/cityCountsByClickedBatchcodeTable_EX8679_Germany.xlsx')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "494943f3", + "metadata": {}, + "outputs": [], + "source": [ + "(cityCountsByClickedBatchcodeTable_EX8679_Germany\n", + " .sort_values(by = ['CITY_COUNT_BY_VAX_LOT'], ascending = False)\n", + " .to_excel('tmp/cityCountsByClickedBatchcodeTable_EX8679_Germany_sorted.xlsx'))" + ] + }, { "attachments": {}, "cell_type": "markdown",