Files
HowBadIsMyBatch/src/intensivstationen/Intensivstationen.ipynb
2022-03-19 15:45:20 +01:00

702 lines
23 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "9de5907f-18f5-4cb1-903e-26028ff1fa03",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from urllib import request\n",
"\n",
"pd.set_option('display.max_rows', 100)\n",
"pd.set_option('display.max_columns', None)\n",
"pd.set_option('mode.chained_assignment', 'raise')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dfa836ec",
"metadata": {},
"outputs": [],
"source": [
"needsUpdate = False"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "79de4057",
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"import requests\n",
"from datetime import datetime\n",
"from time import sleep\n",
"from selenium import webdriver\n",
"from selenium.webdriver.firefox.options import Options\n",
"\n",
"class DateProvider:\n",
"    '''\n",
"    Provides the date shown on the published intensivstationen page and the date of\n",
"    the 'Landkreis-Daten' entry on the intensivregister.de downloads page.\n",
"\n",
"    Each date is fetched on first access and then cached on the instance.\n",
"    '''\n",
"\n",
"    INTENSIVSTATIONEN_DATE_FORMAT = \"%d.%m.%Y, %H:%M Uhr\"\n",
"\n",
"    def __init__(self):\n",
"        self.lastUpdated = None\n",
"        self.lastUpdatedDataSource = None\n",
"\n",
"    def needsUpdate(self):\n",
"        '''Return True if the data source is newer than the published page.'''\n",
"        return self.getLastUpdated() < self.getLastUpdatedDataSource()\n",
"\n",
"    def getLastUpdated(self):\n",
"        '''Return the date found in the element with id \"Datenstand\" of the published page.'''\n",
"        if self.lastUpdated is None:\n",
"            response = requests.get(\n",
"                \"https://knollfrank.github.io/HowBadIsMyBatch/intensivstationen.html\",\n",
"                timeout = 60)\n",
"            # report the HTTP error itself instead of failing later on a missing element\n",
"            response.raise_for_status()\n",
"            soup = BeautifulSoup(response.text, \"lxml\")\n",
"            dateStr = soup.find(id = \"Datenstand\").text\n",
"            self.lastUpdated = datetime.strptime(dateStr, DateProvider.INTENSIVSTATIONEN_DATE_FORMAT)\n",
"\n",
"        return self.lastUpdated\n",
"\n",
"    def getLastUpdatedDataSource(self):\n",
"        '''Return the 'Letzte Änderung' date of the row 'Landkreis-Daten' of the downloads page.'''\n",
"        if self.lastUpdatedDataSource is None:\n",
"            html = self._getOriginalHtml()\n",
"            lastUpdatedColumn = 'Letzte Änderung'\n",
"            dataFrame = self._asDataFrame(html, lastUpdatedColumn)\n",
"            self.lastUpdatedDataSource = dataFrame.loc['Landkreis-Daten', lastUpdatedColumn].to_pydatetime()\n",
"\n",
"        return self.lastUpdatedDataSource\n",
"\n",
"    def _getOriginalHtml(self):\n",
"        '''Return the innerHTML of the body of the downloads page as rendered by headless Firefox.'''\n",
"        options = Options()\n",
"        # command line switch instead of the Options.headless setter, so it does not depend on that setter existing\n",
"        options.add_argument('-headless')\n",
"        driver = webdriver.Firefox(options = options)\n",
"        try:\n",
"            driver.get('https://www.intensivregister.de/#/aktuelle-lage/downloads')\n",
"            # presumably gives the page's scripts time to render the table - TODO confirm\n",
"            sleep(10)\n",
"            return driver.execute_script(\"return document.body.innerHTML\")\n",
"        finally:\n",
"            # always shut the browser down, even if loading the page or running the script fails\n",
"            driver.quit()\n",
"\n",
"    def _asDataFrame(self, html, lastUpdatedColumn):\n",
"        '''Return the first table found in html, indexed by 'Name', with lastUpdatedColumn as datetimes.'''\n",
"        dataFrame = pd.read_html(html, parse_dates = [lastUpdatedColumn])[0]\n",
"        dataFrame[lastUpdatedColumn] = pd.to_datetime(dataFrame[lastUpdatedColumn], format = \"%d.%m.%Y %H:%M Uhr\")\n",
"        dataFrame.set_index('Name', inplace = True)\n",
"        return dataFrame\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "336f56e6",
"metadata": {},
"outputs": [],
"source": [
"dateProvider = DateProvider()\n",
"print(' lastUpdated:', dateProvider.getLastUpdated())\n",
"print('lastUpdatedDataSource:', dateProvider.getLastUpdatedDataSource()) \n",
"needsUpdate = dateProvider.needsUpdate()\n",
"print('needsUpdate: ', needsUpdate)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "03784154",
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"\n",
"class HtmlTransformerUtil:\n",
"    '''Reads an HTML file into a BeautifulSoup tree, applies a transformer to it and writes the result back.'''\n",
"\n",
"    def applySoupTransformerToFile(self, file, soupTransformer):\n",
"        '''Replace the content of file by str(soupTransformer(soup)), soup being the parsed content of file.'''\n",
"        self._writeSoup(soupTransformer(self._readSoup(file)), file)\n",
"\n",
"    def _readSoup(self, file):\n",
"        # explicit UTF-8: the platform default encoding is not UTF-8 everywhere\n",
"        with open(file, encoding = 'utf-8') as fp:\n",
"            soup = BeautifulSoup(fp, 'lxml')\n",
"        return soup\n",
"\n",
"    def _writeSoup(self, soup, file):\n",
"        # same encoding as in _readSoup, so that umlauts survive the round trip\n",
"        with open(file, \"w\", encoding = 'utf-8') as fp:\n",
"            fp.write(str(soup))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af101279",
"metadata": {},
"outputs": [],
"source": [
"def saveLastUpdatedIntensivstationen(lastUpdated):\n",
"    '''\n",
"    Write lastUpdated, formatted with DateProvider.INTENSIVSTATIONEN_DATE_FORMAT,\n",
"    into the element with id \"Datenstand\" of ../../docs/intensivstationen.html.\n",
"    '''\n",
"    formattedDate = lastUpdated.strftime(DateProvider.INTENSIVSTATIONEN_DATE_FORMAT)\n",
"\n",
"    def replaceDatenstand(soup):\n",
"        datenstand = soup.find(id = \"Datenstand\")\n",
"        datenstand.string.replace_with(formattedDate)\n",
"        return soup\n",
"\n",
"    htmlTransformerUtil = HtmlTransformerUtil()\n",
"    htmlTransformerUtil.applySoupTransformerToFile(\n",
"        file = \"../../docs/intensivstationen.html\",\n",
"        soupTransformer = replaceDatenstand)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "63be303c",
"metadata": {},
"outputs": [],
"source": [
"saveLastUpdatedIntensivstationen(dateProvider.getLastUpdatedDataSource())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d021de84",
"metadata": {},
"outputs": [],
"source": [
"def readTimeseries(download = False):\n",
"    '''\n",
"    Read the file zeitreihe-tagesdaten.csv from the working directory.\n",
"\n",
"    download: if True, the file is downloaded first via _downloadTimeseries.\n",
"    Returns a DataFrame with the columns date, bundesland, gemeindeschluessel,\n",
"    betten_belegt and betten_frei, sorted by date in ascending order.\n",
"    '''\n",
"    timeSeriesFile = 'zeitreihe-tagesdaten.csv'\n",
"    if download:\n",
"        _downloadTimeseries(timeSeriesFile)\n",
"\n",
"    timeseries = pd.read_csv(\n",
"        timeSeriesFile,\n",
"        low_memory = False,\n",
"        usecols = ['date', 'bundesland', 'gemeindeschluessel', 'betten_belegt', 'betten_frei'],\n",
"        dtype = {\n",
"            'gemeindeschluessel': 'string',\n",
"            'bundesland': 'string'\n",
"            })\n",
"    # parsed after reading: the date_parser argument of read_csv is deprecated in newer pandas releases\n",
"    timeseries['date'] = pd.to_datetime(timeseries['date'], format = \"%Y-%m-%d\")\n",
"    return timeseries.sort_values(by = 'date', ascending = True)\n",
"\n",
"# download https://diviexchange.blob.core.windows.net/%24web/zeitreihe-tagesdaten.csv or https://www.intensivregister.de/#/aktuelle-lage/downloads\n",
"def _downloadTimeseries(timeSeriesFile):\n",
"    '''Download the daily time series CSV and store it under the path timeSeriesFile.'''\n",
"    timeSeriesUrl = 'https://diviexchange.blob.core.windows.net/%24web/zeitreihe-tagesdaten.csv'\n",
"    request.urlretrieve(timeSeriesUrl, timeSeriesFile)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3f992231",
"metadata": {},
"outputs": [],
"source": [
"timeSeries = readTimeseries(download = needsUpdate)\n",
"timeSeries"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2d34c6a4",
"metadata": {},
"outputs": [],
"source": [
"def readKreise(download = False):\n",
"    '''\n",
"    Read the sheet 'Kreisfreie Städte u. Landkreise' of 04-kreise.xlsx.\n",
"\n",
"    download: if True, the file is downloaded first via _downloadKreise.\n",
"    Returns a DataFrame with the columns Bundesland, Kreis and Einwohnerzahl\n",
"    whose index is named \"Key\".\n",
"    '''\n",
"    kreiseFile = '04-kreise.xlsx'\n",
"    if download:\n",
"        _downloadKreise(kreiseFile)\n",
"\n",
"    # maps the labels found in the header row of the sheet to the column names used in this notebook\n",
"    columnNameBySheetLabel = {'2': 'Bundesland', 3: 'Kreis', 6: 'Einwohnerzahl'}\n",
"    sheet = pd.read_excel(\n",
"        kreiseFile,\n",
"        sheet_name = 'Kreisfreie Städte u. Landkreise',\n",
"        header = 5,\n",
"        index_col = 0)\n",
"    renamedSheet = sheet.rename(columns = columnNameBySheetLabel)\n",
"    kreise = renamedSheet[['Bundesland', 'Kreis', 'Einwohnerzahl']]\n",
"    kreise.index.set_names(\"Key\", inplace = True)\n",
"    return kreise\n",
"\n",
"# download https://www.destatis.de/DE/Themen/Laender-Regionen/Regionales/Gemeindeverzeichnis/Administrativ/04-kreise.xlsx?__blob=publicationFile or https://www.destatis.de/DE/Themen/Laender-Regionen/Regionales/Gemeindeverzeichnis/Administrativ/04-kreise.html\n",
"def _downloadKreise(kreiseFile):\n",
"    '''Download the Kreise workbook and store it under the path kreiseFile.'''\n",
"    kreiseUrl = 'https://www.destatis.de/DE/Themen/Laender-Regionen/Regionales/Gemeindeverzeichnis/Administrativ/04-kreise.xlsx?__blob=publicationFile'\n",
"    request.urlretrieve(kreiseUrl, kreiseFile)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "74ea4d55",
"metadata": {},
"outputs": [],
"source": [
"kreise = readKreise(download = False)\n",
"kreise"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af96fb11",
"metadata": {},
"outputs": [],
"source": [
"class ColumnsAdder:\n",
"    '''Adds the columns 'Kreis', 'Einwohnerzahl' and 'Bundesland' looked up in the kreise table.'''\n",
"\n",
"    def __init__(self, kreise):\n",
"        self.kreise = kreise\n",
"\n",
"    def addKreisAndBundeslandAndEinwohnerzahlColumns(self, dataFrame):\n",
"        '''\n",
"        Return a new DataFrame consisting of dataFrame plus the columns\n",
"        'Kreis' and 'Einwohnerzahl' (looked up by 'gemeindeschluessel') and\n",
"        'Bundesland' (looked up by 'bundesland').\n",
"\n",
"        dataFrame itself is not modified.\n",
"        '''\n",
"        dataFrameWithKreis = pd.merge(\n",
"            dataFrame,\n",
"            self.kreise[['Kreis', 'Einwohnerzahl']],\n",
"            how = 'left',\n",
"            left_on = 'gemeindeschluessel',\n",
"            right_index = True)\n",
"        return self._addBundeslandColumn(dataFrameWithKreis)\n",
"\n",
"    def _addBundeslandColumn(self, dataFrame):\n",
"        return pd.merge(\n",
"            dataFrame,\n",
"            self._createBundeslandByKeyTable(),\n",
"            how = 'left',\n",
"            left_on = 'bundesland',\n",
"            right_index = True)\n",
"\n",
"    def _createBundeslandByKeyTable(self):\n",
"        # only the rows of kreise whose key has exactly two characters are used for the Bundesland lookup\n",
"        return self.kreise[self.kreise.index.str.len() == 2][['Bundesland']]\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "62a20115",
"metadata": {},
"outputs": [],
"source": [
"timeSeries = ColumnsAdder(kreise).addKreisAndBundeslandAndEinwohnerzahlColumns(timeSeries)\n",
"timeSeries"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "356494d3",
"metadata": {},
"outputs": [],
"source": [
"# 'Kreis' comes from a left merge and is NaN for rows without a match; NaN cannot be ordered against str, so it is dropped before sorting\n",
"kreisValues = sorted(timeSeries['Kreis'].dropna().drop_duplicates().values)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "05aa0117",
"metadata": {},
"outputs": [],
"source": [
"def getKreisOptions(kreisValues):\n",
"    '''Return one HTML option element (as str) per entry of kreisValues, in the same order.'''\n",
"    return [getKreisOption(kreis) for kreis in kreisValues]\n",
"\n",
"def getKreisOption(kreis):\n",
"    '''\n",
"    Return the HTML option element (as str) having kreis as its value and as its text.\n",
"\n",
"    The name is HTML-escaped, so that characters like &, <, > or quotes cannot break the markup.\n",
"    '''\n",
"    import html\n",
"    escapedKreis = html.escape(str(kreis), quote = True)\n",
"    return '<option value=\"{kreis}\">{kreis}</option>'.format(kreis = escapedKreis)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c38ca16",
"metadata": {},
"outputs": [],
"source": [
"kreisOptions = ['<option selected=\"\" value=\"de\">Alle Landkreise</option>'] + getKreisOptions(kreisValues)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9eb453d0",
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"\n",
"\n",
"class KreisOptionsSetter:\n",
"    '''Replaces the children of the element with id \"kreisSelect\" of an HTML document by the given options.'''\n",
"\n",
"    def setKreisOptions(self, html, options):\n",
"        '''\n",
"        html: the HTML document as str.\n",
"        options: list of HTML option elements, each as str.\n",
"        Returns the modified document as str.\n",
"        '''\n",
"        document = self._parse(html)\n",
"        optionTags = self._parseOptions(options)\n",
"        return str(self._setKreisOptions(document, optionTags))\n",
"\n",
"    def _setKreisOptions(self, soup, options):\n",
"        selectTag = soup.find(id = \"kreisSelect\")\n",
"        # drop the current children before the new options get appended one by one\n",
"        selectTag.clear()\n",
"        for optionTag in options:\n",
"            selectTag.append(optionTag)\n",
"        return soup\n",
"\n",
"    def _parseOptions(self, options):\n",
"        optionTags = []\n",
"        for optionHtml in options:\n",
"            # each snippet is parsed as a document of its own; .option picks the option tag out of it\n",
"            optionTags.append(self._parse(optionHtml).option)\n",
"        return optionTags\n",
"\n",
"    def _parse(self, html):\n",
"        return BeautifulSoup(html, 'lxml')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "29b0930a",
"metadata": {},
"outputs": [],
"source": [
"import unittest"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "45072a1d",
"metadata": {},
"outputs": [],
"source": [
"class TestHelper:\n",
"    '''Factory helpers for the unit tests of this notebook.'''\n",
"\n",
"    @staticmethod\n",
"    def createDataFrame(index, columns, data, dtypes = {}):\n",
"        '''Build a DataFrame from index, columns and data and cast it using astype(dtypes).'''\n",
"        dataFrame = pd.DataFrame(data = data, index = index, columns = columns)\n",
"        return dataFrame.astype(dtypes)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e4f8fa80",
"metadata": {},
"outputs": [],
"source": [
"class KreisOptionsSetterTest(unittest.TestCase):\n",
"    '''Checks that KreisOptionsSetter replaces the options of the kreisSelect element.'''\n",
"\n",
"    def test_setKreisOptions(self):\n",
"        # Given\n",
"        htmlGiven = '''\n",
" <html>\n",
" <body>\n",
" <p>Test<p/>\n",
" <select id=\"kreisSelect\" name=\"kreis\">\n",
" <option selected=\"\" value=\"de\">Alle Landkreise</option>\n",
" <option value=\"Ahrweiler\">Ahrweiler</option>\n",
" <option value=\"Wiesbaden, Landeshauptstadt\">Wiesbaden, Landeshauptstadt</option>\n",
" <option value=\"Aichach-Friedberg\">Aichach-Friedberg</option>\n",
" </select>\n",
" </body>\n",
" </html>\n",
" '''\n",
"        optionsGiven = [\n",
"            '<option selected=\"\" value=\"de\">Alle Landkreise</option>',\n",
"            '<option value=\"Ahrweiler\">Ahrweiler</option>',\n",
"            '<option value=\"Aichach-Friedberg\">Aichach-Friedberg</option>']\n",
"        # the document from above without the option that is missing in optionsGiven\n",
"        htmlExpected = '''\n",
" <html>\n",
" <body>\n",
" <p>Test<p/>\n",
" <select id=\"kreisSelect\" name=\"kreis\">\n",
" <option selected=\"\" value=\"de\">Alle Landkreise</option>\n",
" <option value=\"Ahrweiler\">Ahrweiler</option>\n",
" <option value=\"Aichach-Friedberg\">Aichach-Friedberg</option>\n",
" </select>\n",
" </body>\n",
" </html>\n",
" '''\n",
"\n",
"        # When\n",
"        htmlActual = KreisOptionsSetter().setKreisOptions(html = htmlGiven, options = optionsGiven)\n",
"\n",
"        # Then\n",
"        assertEqualHTML(htmlActual, htmlExpected)\n",
"\n",
"# adapted from https://stackoverflow.com/questions/8006909/pretty-print-assertequal-for-html-strings\n",
"\n",
"\n",
"def assertEqualHTML(string1, string2, file1='', file2=''):\n",
"    '''\n",
"    Compare two strings containing HTML after pretty-printing both of them.\n",
"\n",
"    If the pretty-printed documents differ, their unified diff and both documents\n",
"    are shown via display() and an AssertionError is raised.\n",
"    An Exception is raised if one of the arguments is not a str.\n",
"    file1 and file2 are only used as labels in the diff and in the error messages.\n",
"    '''\n",
"    from bs4 import BeautifulSoup as bs\n",
"    import difflib\n",
"\n",
"    def short(mystr):\n",
"        maxLength = 20\n",
"        if len(mystr) > maxLength:\n",
"            return mystr[:maxLength]\n",
"        return mystr\n",
"    p = []\n",
"    for mystr, file in [(string1, file1), (string2, file2)]:\n",
"        if not isinstance(mystr, str):\n",
"            raise Exception(u'string ist not unicode: %r %s' %\n",
"                            (short(mystr), file))\n",
"        # explicit parser: the result must not depend on which parsers happen to be installed\n",
"        soup = bs(mystr, 'lxml')\n",
"        pretty = soup.prettify()\n",
"        p.append(pretty)\n",
"    if p[0] != p[1]:\n",
"        for line in difflib.unified_diff(p[0].splitlines(), p[1].splitlines(), fromfile=file1, tofile=file2):\n",
"            display(line)\n",
"        display(p[0], ' != ', p[1])\n",
"        # AssertionError makes unittest report a failure instead of an error\n",
"        raise AssertionError('Not equal %s %s' % (file1, file2))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "403f8b7b",
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"\n",
"\n",
"def saveKreisOptions(kreisOptions):\n",
"    '''Replace the options of the kreisSelect element in ../../docs/intensivstationen.html by kreisOptions.'''\n",
"\n",
"    def replaceKreisOptions(soup):\n",
"        # KreisOptionsSetter works on str, hence the round trip soup -> str -> soup\n",
"        updatedHtml = KreisOptionsSetter().setKreisOptions(html = str(soup), options = kreisOptions)\n",
"        return BeautifulSoup(updatedHtml, 'lxml')\n",
"\n",
"    HtmlTransformerUtil().applySoupTransformerToFile(\n",
"        file = \"../../docs/intensivstationen.html\",\n",
"        soupTransformer = replaceKreisOptions)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5dd8d864",
"metadata": {},
"outputs": [],
"source": [
"saveKreisOptions(kreisOptions)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "43c2f826",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"\n",
"class IOUtils:\n",
"    '''File helpers used to persist the generated data.'''\n",
"\n",
"    @staticmethod\n",
"    def saveDictAsJson(dict, file):\n",
"        '''Write dict as JSON to file; missing parent directories of file are created first.'''\n",
"        # imported here, so that this method does not depend on another cell having imported json\n",
"        import json\n",
"        IOUtils.ensurePath(file)\n",
"        with open(file, 'w') as outfile:\n",
"            json.dump(dict, outfile)\n",
"\n",
"    @staticmethod\n",
"    def ensurePath(file):\n",
"        '''Create the directory containing file unless it exists already.'''\n",
"        directory = os.path.dirname(file)\n",
"        # a bare file name has no directory part, and os.makedirs('') would raise\n",
"        if directory:\n",
"            os.makedirs(directory, exist_ok = True)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "997a4bdb",
"metadata": {},
"outputs": [],
"source": [
"def getIntensiveCareBeds(timeSeries, kreis = None):\n",
"    '''\n",
"    Return the columns date, betten_belegt, betten_frei and Einwohnerzahl.\n",
"\n",
"    kreis given: the rows of timeSeries belonging to that Kreis.\n",
"    kreis is None: one row per date holding the sums over all rows of that date.\n",
"    '''\n",
"    if kreis is None:\n",
"        summedColumns = ['betten_belegt', 'betten_frei', 'Einwohnerzahl']\n",
"        sumByColumn = {\n",
"            column: pd.NamedAgg(column = column, aggfunc = 'sum')\n",
"            for column in summedColumns\n",
"        }\n",
"        return timeSeries.groupby('date').agg(**sumByColumn).reset_index()\n",
"\n",
"    isRequestedKreis = timeSeries['Kreis'] == kreis\n",
"    return timeSeries[isRequestedKreis][['date', 'betten_belegt', 'betten_frei', 'Einwohnerzahl']]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a97f5b2b",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"\n",
"def getAndPersistIntensiveCareBeds(timeSeries, kreis = None):\n",
"    '''\n",
"    Determine the intensive care beds of kreis (of all rows if kreis is None),\n",
"    save them as JSON under the file name derived from kreis and return them.\n",
"    '''\n",
"    beds = getIntensiveCareBeds(timeSeries, kreis)\n",
"    # shows in the notebook which Kreis is being processed\n",
"    display(kreis)\n",
"    targetFile = _getFilename(kreis)\n",
"    _saveAsJson(beds, targetFile)\n",
"    return beds\n",
"\n",
"\n",
"def _saveAsJson(intensiveCareBeds, file):\n",
"    '''Save the population (Einwohnerzahl of the first row) and the bed counts per date as JSON to file.'''\n",
"    firstRow = intensiveCareBeds.iloc[0]\n",
"    population = int(firstRow['Einwohnerzahl'])\n",
"    payload = {\n",
"        'population': population,\n",
"        'data': _intensiveCareBeds2Dict(intensiveCareBeds),\n",
"    }\n",
"    IOUtils.saveDictAsJson(payload, file)\n",
"\n",
"\n",
"def _intensiveCareBeds2Dict(intensiveCareBeds):\n",
"    '''Return one dict per row with the keys date (formatted as YYYY-MM-DD), betten_belegt and betten_frei.'''\n",
"    bedsByDate = intensiveCareBeds[['date', 'betten_belegt', 'betten_frei']]\n",
"    # assign() returns a new frame, so the dates of the argument stay datetimes\n",
"    withFormattedDate = bedsByDate.assign(date = bedsByDate['date'].dt.strftime('%Y-%m-%d'))\n",
"    return withFormattedDate.to_dict(orient = \"records\")\n",
"\n",
"\n",
"def _getFilename(kreis):\n",
"    '''Return the path of the JSON file for kreis.'''\n",
"    filenameTemplate = '../../docs/data/intensivstationen/intensivstationen-{suffix}.json'\n",
"    return filenameTemplate.format(suffix = _getSuffix(kreis))\n",
"\n",
"\n",
"def _getSuffix(kreis):\n",
"    '''Return kreis, or 'de' if kreis is None.'''\n",
"    if kreis is None:\n",
"        return 'de'\n",
"    return kreis\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "349edd73",
"metadata": {},
"outputs": [],
"source": [
"getAndPersistIntensiveCareBeds(timeSeries)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1b97137f",
"metadata": {},
"outputs": [],
"source": [
"for kreis in kreisValues:\n",
" getAndPersistIntensiveCareBeds(timeSeries, kreis)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d9d4acab",
"metadata": {},
"outputs": [],
"source": [
"class MedianOfFreeBedsByKreisTableFactory:\n",
"    '''Computes, per Kreis, the median of the share of free beds among all beds.'''\n",
"\n",
"    def __init__(self, dataFrame):\n",
"        self.dataFrame = dataFrame\n",
"\n",
"    def createMedianOfFreeBedsByKreisTable(self):\n",
"        '''\n",
"        Return a table indexed by 'Kreis' with the single column 'median_free_beds_in_percent',\n",
"        sorted by that column in descending order.\n",
"\n",
"        The data frame handed to the constructor is left unchanged.\n",
"        '''\n",
"        freeBeds = self.dataFrame['betten_frei']\n",
"        allBeds = freeBeds + self.dataFrame['betten_belegt']\n",
"        # assign() returns a new frame, so the helper column never shows up in the caller's frame\n",
"        withShareOfFreeBeds = self.dataFrame.assign(\n",
"            free_beds_divided_by_all_beds_in_percent = freeBeds / allBeds * 100)\n",
"        aggregated = withShareOfFreeBeds.groupby('Kreis').agg(\n",
"            median_free_beds_in_percent =\n",
"                pd.NamedAgg(\n",
"                    column = 'free_beds_divided_by_all_beds_in_percent',\n",
"                    aggfunc = 'median'))\n",
"        return aggregated.sort_values(by = 'median_free_beds_in_percent', ascending = False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a739d4d1",
"metadata": {},
"outputs": [],
"source": [
"from pandas.testing import assert_frame_equal\n",
"import statistics\n",
"\n",
"class MedianOfFreeBedsByKreisTableFactoryTest(unittest.TestCase):\n",
"    '''Checks the medians and the sort order of the table created by MedianOfFreeBedsByKreisTableFactory.'''\n",
"\n",
"    def test_createMedianOfFreeBedsByKreisTable(self):\n",
"        # Given\n",
"        bedsByKreis = TestHelper.createDataFrame(\n",
"            columns = ['date', 'betten_frei', 'betten_belegt', 'Kreis'],\n",
"            data = [\n",
"                ['2020-04-24', 40, 38, 'Flensburg, Stadt'],\n",
"                ['2020-04-24', 42, 36, 'Flensburg, Stadt'],\n",
"                ['2020-04-24', 44, 34, 'Flensburg, Stadt'],\n",
"                ['2020-04-24', 9, 10, 'Bamberg']],\n",
"            index = [0, 1, 2, 3])\n",
"        factory = MedianOfFreeBedsByKreisTableFactory(bedsByKreis)\n",
"\n",
"        # When\n",
"        tableActual = factory.createMedianOfFreeBedsByKreisTable()\n",
"\n",
"        # Then\n",
"        medianFlensburg = statistics.median([40/(40 + 38) * 100, 42/(42 + 36) * 100, 44/(44 + 34) * 100])\n",
"        medianBamberg = 9/(9 + 10) * 100\n",
"        tableExpected = TestHelper.createDataFrame(\n",
"            columns = ['median_free_beds_in_percent'],\n",
"            data = [[medianFlensburg], [medianBamberg]],\n",
"            index = pd.Index(\n",
"                name = 'Kreis',\n",
"                data = ['Flensburg, Stadt', 'Bamberg']))\n",
"        assert_frame_equal(tableActual, tableExpected, check_dtype = False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "af22cdc5",
"metadata": {},
"outputs": [],
"source": [
"unittest.main(argv = [''], verbosity = 2, exit = False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0218cdb4",
"metadata": {},
"outputs": [],
"source": [
"def publish(repoDir = '/home/frankknoll/Dokumente/Corona/projects/HowBadIsMyBatch-pages'):\n",
"    '''\n",
"    Stage, commit (message \"updating data\") and push all changes of the git repository in repoDir.\n",
"\n",
"    The git commands run with repoDir as their working directory. The working directory\n",
"    of the kernel itself is not changed, so the relative paths used by the other cells\n",
"    still resolve when cells are run again after publishing.\n",
"    '''\n",
"    import subprocess\n",
"\n",
"    for gitArguments in (['add', '-A'], ['commit', '-m', 'updating data'], ['push']):\n",
"        # check = False: a failing step (e.g. nothing to commit) does not abort the remaining steps\n",
"        completed = subprocess.run(\n",
"            ['git'] + gitArguments,\n",
"            cwd = repoDir,\n",
"            capture_output = True,\n",
"            text = True,\n",
"            check = False)\n",
"        # show what git reported in the output of the cell\n",
"        print(completed.stdout, completed.stderr, sep = '')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5f173c2b",
"metadata": {},
"outputs": [],
"source": [
"publish()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}