{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "9de5907f-18f5-4cb1-903e-26028ff1fa03", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from urllib import request\n", "\n", "pd.set_option('display.max_rows', 100)\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('mode.chained_assignment', 'raise')" ] }, { "cell_type": "code", "execution_count": null, "id": "dfa836ec", "metadata": {}, "outputs": [], "source": [ "needsUpdate = False" ] }, { "cell_type": "code", "execution_count": null, "id": "79de4057", "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "import requests\n", "from datetime import datetime\n", "from time import sleep\n", "from selenium import webdriver\n", "from selenium.webdriver.firefox.options import Options\n", "\n", "class DateProvider:\n", " \n", " INTENSIVSTATIONEN_DATE_FORMAT = \"%d.%m.%Y, %H:%M Uhr\"\n", "\n", " def __init__(self):\n", " self.lastUpdated = None\n", " self.lastUpdatedDataSource = None\n", "\n", " def needsUpdate(self):\n", " return self.getLastUpdated() < self.getLastUpdatedDataSource()\n", " \n", " def getLastUpdated(self):\n", " if self.lastUpdated is None:\n", " htmlContent = requests.get(\"https://knollfrank.github.io/HowBadIsMyBatch/intensivstationen.html\").text\n", " soup = BeautifulSoup(htmlContent, \"lxml\")\n", " dateStr = soup.find(id = \"Datenstand\").text\n", " self.lastUpdated = datetime.strptime(dateStr, DateProvider.INTENSIVSTATIONEN_DATE_FORMAT)\n", " \n", " return self.lastUpdated\n", "\n", " def getLastUpdatedDataSource(self):\n", " if self.lastUpdatedDataSource is None:\n", " html = self._getOriginalHtml()\n", " lastUpdatedColumn = 'Letzte Änderung'\n", " dataFrame = self._asDataFrame(html, lastUpdatedColumn)\n", " self.lastUpdatedDataSource = dataFrame.loc['Landkreis-Daten', lastUpdatedColumn].to_pydatetime()\n", "\n", " return self.lastUpdatedDataSource\n", "\n", " def _getOriginalHtml(self):\n", " options = Options()\n", " options.headless = True\n", " driver = webdriver.Firefox(options = options)\n", " driver.get('https://www.intensivregister.de/#/aktuelle-lage/downloads')\n", " sleep(10)\n", " innerHTML = driver.execute_script(\"return document.body.innerHTML\")\n", " driver.quit()\n", " return innerHTML\n", "\n", " def _asDataFrame(self, html, lastUpdatedColumn):\n", " dataFrame = pd.read_html(html, parse_dates = [lastUpdatedColumn])[0]\n", " dataFrame[lastUpdatedColumn] = pd.to_datetime(dataFrame[lastUpdatedColumn], format = \"%d.%m.%Y %H:%M Uhr\")\n", " dataFrame.set_index('Name', inplace = True)\n", " return dataFrame\n" ] }, { "cell_type": "code", "execution_count": null, "id": "336f56e6", "metadata": {}, "outputs": [], "source": [ "dateProvider = DateProvider()\n", "print(' lastUpdated:', dateProvider.getLastUpdated())\n", "print('lastUpdatedDataSource:', dateProvider.getLastUpdatedDataSource()) \n", "needsUpdate = dateProvider.needsUpdate()\n", "print('needsUpdate: ', needsUpdate)" ] }, { "cell_type": "code", "execution_count": null, "id": "03784154", "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "\n", "class HtmlTransformerUtil:\n", " \n", " def applySoupTransformerToFile(self, file, soupTransformer):\n", " self._writeSoup(soupTransformer(self._readSoup(file)), file)\n", "\n", " def _readSoup(self, file):\n", " with open(file) as fp:\n", " soup = BeautifulSoup(fp, 'lxml')\n", " return soup\n", "\n", " def _writeSoup(self, soup, file):\n", " with open(file, \"w\") as fp:\n", " fp.write(str(soup)) \n" ] }, { "cell_type": "code", "execution_count": null, "id": "af101279", "metadata": {}, "outputs": [], "source": [ "def saveLastUpdatedIntensivstationen(lastUpdated):\n", " def setLastUpdated(soup):\n", " soup.find(id = \"Datenstand\").string.replace_with(lastUpdated.strftime(DateProvider.INTENSIVSTATIONEN_DATE_FORMAT))\n", " return soup\n", "\n", " HtmlTransformerUtil().applySoupTransformerToFile(\n", " file = \"../../docs/intensivstationen.html\",\n", " soupTransformer = setLastUpdated)" ] }, { "cell_type": "code", "execution_count": null, "id": "63be303c", "metadata": {}, "outputs": [], "source": [ "saveLastUpdatedIntensivstationen(dateProvider.getLastUpdatedDataSource())" ] }, { "cell_type": "code", "execution_count": null, "id": "d021de84", "metadata": {}, "outputs": [], "source": [ "def readTimeseries(download = False):\n", " timeSeriesFile = 'zeitreihe-tagesdaten.csv'\n", " if download:\n", " _downloadTimeseries(timeSeriesFile)\n", "\n", " timeseries = pd.read_csv(\n", " timeSeriesFile,\n", " low_memory = False,\n", " usecols = ['date', 'bundesland', 'gemeindeschluessel', 'betten_belegt', 'betten_frei'],\n", " parse_dates = ['date'],\n", " date_parser = lambda dateStr: pd.to_datetime(dateStr, format = \"%Y-%m-%d\"),\n", " dtype = {\n", " 'gemeindeschluessel': 'string',\n", " 'bundesland': 'string'\n", " })\n", " return timeseries.sort_values(by = 'date', ascending = True)\n", "\n", "# download https://diviexchange.blob.core.windows.net/%24web/zeitreihe-tagesdaten.csv or https://www.intensivregister.de/#/aktuelle-lage/downloads\n", "def _downloadTimeseries(timeSeriesFile):\n", " request.urlretrieve(\n", " 'https://diviexchange.blob.core.windows.net/%24web/zeitreihe-tagesdaten.csv',\n", " timeSeriesFile)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "3f992231", "metadata": {}, "outputs": [], "source": [ "timeSeries = readTimeseries(download = needsUpdate)\n", "timeSeries" ] }, { "cell_type": "code", "execution_count": null, "id": "2d34c6a4", "metadata": {}, "outputs": [], "source": [ "def readKreise(download = False):\n", " kreiseFile = '04-kreise.xlsx'\n", " if download:\n", " _downloadKreise(kreiseFile)\n", " \n", " kreise = pd.read_excel(\n", " kreiseFile,\n", " sheet_name = 'Kreisfreie Städte u. Landkreise',\n", " header = 5,\n", " index_col = 0)\n", " kreise = kreise.rename(columns = {'2': 'Bundesland', 3: 'Kreis', 6: 'Einwohnerzahl'})[['Bundesland', 'Kreis', 'Einwohnerzahl']]\n", " kreise.index.set_names(\"Key\", inplace = True)\n", " return kreise\n", "\n", "# download https://www.destatis.de/DE/Themen/Laender-Regionen/Regionales/Gemeindeverzeichnis/Administrativ/04-kreise.xlsx?__blob=publicationFile or https://www.destatis.de/DE/Themen/Laender-Regionen/Regionales/Gemeindeverzeichnis/Administrativ/04-kreise.html\n", "def _downloadKreise(kreiseFile):\n", " request.urlretrieve(\n", " 'https://www.destatis.de/DE/Themen/Laender-Regionen/Regionales/Gemeindeverzeichnis/Administrativ/04-kreise.xlsx?__blob=publicationFile',\n", " kreiseFile)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "74ea4d55", "metadata": {}, "outputs": [], "source": [ "kreise = readKreise(download = False)\n", "kreise" ] }, { "cell_type": "code", "execution_count": null, "id": "af96fb11", "metadata": {}, "outputs": [], "source": [ "class ColumnsAdder:\n", "\n", " def __init__(self, kreise):\n", " self.kreise = kreise\n", "\n", " def addKreisAndBundeslandAndEinwohnerzahlColumns(self, dataFrame):\n", " dataFrame_kreise = pd.merge(dataFrame, self.kreise, how = 'left', left_on = 'gemeindeschluessel', right_index = True)\n", " dataFrame['Kreis'] = dataFrame_kreise['Kreis']\n", " dataFrame['Einwohnerzahl'] = dataFrame_kreise['Einwohnerzahl']\n", " return self._addBundeslandColumn(dataFrame)\n", " \n", " def _addBundeslandColumn(self, dataFrame):\n", " return pd.merge(\n", " dataFrame,\n", " self._createBundeslandByKeyTable(),\n", " how = 'left',\n", " left_on = 'bundesland',\n", " right_index = True)\n", "\n", " def _createBundeslandByKeyTable(self):\n", " return self.kreise[self.kreise.index.str.len() == 2][['Bundesland']]\n" ] }, { "cell_type": "code", "execution_count": null, "id": "62a20115", "metadata": {}, "outputs": [], "source": [ "timeSeries = ColumnsAdder(kreise).addKreisAndBundeslandAndEinwohnerzahlColumns(timeSeries)\n", "timeSeries" ] }, { "cell_type": "code", "execution_count": null, "id": "356494d3", "metadata": {}, "outputs": [], "source": [ "kreisValues = sorted(timeSeries['Kreis'].drop_duplicates().values)" ] }, { "cell_type": "code", "execution_count": null, "id": "05aa0117", "metadata": {}, "outputs": [], "source": [ "def getKreisOptions(kreisValues):\n", " return [getKreisOption(kreis) for kreis in kreisValues]\n", "\n", "def getKreisOption(kreis):\n", " return ''.format(kreis = kreis)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "9c38ca16", "metadata": {}, "outputs": [], "source": [ "kreisOptions = [''] + getKreisOptions(kreisValues)" ] }, { "cell_type": "code", "execution_count": null, "id": "9eb453d0", "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "\n", "\n", "class KreisOptionsSetter:\n", "\n", " def setKreisOptions(self, html, options):\n", " soup = self._setKreisOptions(self._parse(html), self._parseOptions(options))\n", " return str(soup)\n", "\n", " def _setKreisOptions(self, soup, options):\n", " kreisSelect = soup.find(id = \"kreisSelect\")\n", " kreisSelect.clear()\n", " for option in options:\n", " kreisSelect.append(option)\n", " return soup\n", "\n", " def _parseOptions(self, options):\n", " return [self._parse(option).option for option in options]\n", "\n", " def _parse(self, html):\n", " return BeautifulSoup(html, 'lxml')\n" ] }, { "cell_type": "code", "execution_count": null, "id": "29b0930a", "metadata": {}, "outputs": [], "source": [ "import unittest" ] }, { "cell_type": "code", "execution_count": null, "id": "45072a1d", "metadata": {}, "outputs": [], "source": [ "class TestHelper:\n", "\n", " @staticmethod\n", " def createDataFrame(index, columns, data, dtypes = {}):\n", " return pd.DataFrame(index = index, columns = columns, data = data).astype(dtypes)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "e4f8fa80", "metadata": {}, "outputs": [], "source": [ "class KreisOptionsSetterTest(unittest.TestCase):\n", "\n", " def test_setKreisOptions(self):\n", " # Given\n", " kreisOptionsSetter = KreisOptionsSetter()\n", "\n", " # When\n", " htmlActual = kreisOptionsSetter.setKreisOptions(\n", " html='''\n", " \n", " \n", "

Test

\n", " \n", " \n", " \n", " ''',\n", " options=[\n", " '',\n", " '',\n", " ''])\n", "\n", " # Then\n", " assertEqualHTML(\n", " htmlActual,\n", " '''\n", " \n", " \n", "

Test

\n", " \n", " \n", " \n", " ''')\n", "\n", "# adapted from https://stackoverflow.com/questions/8006909/pretty-print-assertequal-for-html-strings\n", "\n", "\n", "def assertEqualHTML(string1, string2, file1='', file2=''):\n", " u'''\n", " Compare two unicode strings containing HTML.\n", " A human friendly diff goes to logging.error() if they\n", " are not equal, and an exception gets raised.\n", " '''\n", " from bs4 import BeautifulSoup as bs\n", " import difflib\n", "\n", " def short(mystr):\n", " max = 20\n", " if len(mystr) > max:\n", " return mystr[:max]\n", " return mystr\n", " p = []\n", " for mystr, file in [(string1, file1), (string2, file2)]:\n", " if not isinstance(mystr, str):\n", " raise Exception(u'string ist not unicode: %r %s' %\n", " (short(mystr), file))\n", " soup = bs(mystr)\n", " pretty = soup.prettify()\n", " p.append(pretty)\n", " if p[0] != p[1]:\n", " for line in difflib.unified_diff(p[0].splitlines(), p[1].splitlines(), fromfile=file1, tofile=file2):\n", " display(line)\n", " display(p[0], ' != ', p[1])\n", " raise Exception('Not equal %s %s' % (file1, file2))\n" ] }, { "cell_type": "code", "execution_count": null, "id": "403f8b7b", "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "\n", "\n", "def saveKreisOptions(kreisOptions):\n", " HtmlTransformerUtil().applySoupTransformerToFile(\n", " file = \"../../docs/intensivstationen.html\",\n", " soupTransformer =\n", " lambda soup:\n", " BeautifulSoup(\n", " KreisOptionsSetter().setKreisOptions(html = str(soup), options = kreisOptions),\n", " 'lxml'))\n" ] }, { "cell_type": "code", "execution_count": null, "id": "5dd8d864", "metadata": {}, "outputs": [], "source": [ "saveKreisOptions(kreisOptions)" ] }, { "cell_type": "code", "execution_count": null, "id": "43c2f826", "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "\n", "class IOUtils:\n", "\n", " def saveDictAsJson(dict, file):\n", " IOUtils.ensurePath(file)\n", " with open(file, 'w') as outfile:\n", " json.dump(dict, outfile)\n", "\n", " @staticmethod\n", " def ensurePath(file):\n", " directory = os.path.dirname(file)\n", " if not os.path.exists(directory):\n", " os.makedirs(directory)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "997a4bdb", "metadata": {}, "outputs": [], "source": [ "def getIntensiveCareBeds(timeSeries, kreis = None):\n", " if kreis is not None:\n", " return timeSeries[timeSeries['Kreis'] == kreis][['date', 'betten_belegt', 'betten_frei', 'Einwohnerzahl']]\n", " else:\n", " return timeSeries.groupby('date').agg(**{\n", " 'betten_belegt': pd.NamedAgg(column = 'betten_belegt', aggfunc = 'sum'),\n", " 'betten_frei': pd.NamedAgg(column = 'betten_frei', aggfunc = 'sum'),\n", " 'Einwohnerzahl': pd.NamedAgg(column = 'Einwohnerzahl', aggfunc = 'sum')\n", " }).reset_index()" ] }, { "cell_type": "code", "execution_count": null, "id": "a97f5b2b", "metadata": {}, "outputs": [], "source": [ "import json\n", "\n", "\n", "def getAndPersistIntensiveCareBeds(timeSeries, kreis = None):\n", " intensiveCareBeds = getIntensiveCareBeds(timeSeries, kreis)\n", " display(kreis)\n", " _saveAsJson(intensiveCareBeds, _getFilename(kreis))\n", " return intensiveCareBeds\n", "\n", "\n", "def _saveAsJson(intensiveCareBeds, file):\n", " IOUtils.saveDictAsJson(\n", " {\n", " 'population': int(intensiveCareBeds.iloc[0]['Einwohnerzahl']),\n", " 'data': _intensiveCareBeds2Dict(intensiveCareBeds),\n", " },\n", " file)\n", "\n", "\n", "def _intensiveCareBeds2Dict(intensiveCareBeds):\n", " df = intensiveCareBeds[['date', 'betten_belegt', 'betten_frei']].copy()\n", " df['date'] = df['date'].dt.strftime('%Y-%m-%d')\n", " return df.to_dict(orient = \"records\")\n", "\n", "\n", "def _getFilename(kreis):\n", " return '../../docs/data/intensivstationen/intensivstationen-{suffix}.json'.format(suffix = _getSuffix(kreis))\n", "\n", "\n", "def _getSuffix(kreis):\n", " return kreis if kreis is not None else 'de'\n" ] }, { "cell_type": "code", "execution_count": null, "id": "349edd73", "metadata": {}, "outputs": [], "source": [ "getAndPersistIntensiveCareBeds(timeSeries)" ] }, { "cell_type": "code", "execution_count": null, "id": "1b97137f", "metadata": {}, "outputs": [], "source": [ "for kreis in kreisValues:\n", " getAndPersistIntensiveCareBeds(timeSeries, kreis)" ] }, { "cell_type": "code", "execution_count": null, "id": "d9d4acab", "metadata": {}, "outputs": [], "source": [ "class MedianOfFreeBedsByKreisTableFactory:\n", " \n", " def __init__(self, dataFrame):\n", " self.dataFrame = dataFrame\n", "\n", " def createMedianOfFreeBedsByKreisTable(self):\n", " self.dataFrame['free_beds_divided_by_all_beds_in_percent'] = self.dataFrame['betten_frei'] / (self.dataFrame['betten_frei'] + self.dataFrame['betten_belegt']) * 100\n", " aggregated = self.dataFrame.groupby('Kreis').agg(\n", " median_free_beds_in_percent =\n", " pd.NamedAgg(\n", " column = 'free_beds_divided_by_all_beds_in_percent',\n", " aggfunc = 'median'))\n", " return aggregated.sort_values(by = 'median_free_beds_in_percent', ascending = False)" ] }, { "cell_type": "code", "execution_count": null, "id": "a739d4d1", "metadata": {}, "outputs": [], "source": [ "from pandas.testing import assert_frame_equal\n", "import statistics\n", "\n", "class MedianOfFreeBedsByKreisTableFactoryTest(unittest.TestCase):\n", "\n", " def test_createMedianOfFreeBedsByKreisTable(self):\n", " # Given\n", " dataFrame = TestHelper.createDataFrame(\n", " columns = ['date', 'betten_frei', 'betten_belegt', 'Kreis'],\n", " data = [ ['2020-04-24', 40, 38, 'Flensburg, Stadt'],\n", " ['2020-04-24', 42, 36, 'Flensburg, Stadt'],\n", " ['2020-04-24', 44, 34, 'Flensburg, Stadt'],\n", " ['2020-04-24', 9, 10, 'Bamberg']],\n", " index = [\n", " 0,\n", " 1,\n", " 2,\n", " 3])\n", " medianOfFreeBedsByKreisTableFactory = MedianOfFreeBedsByKreisTableFactory(dataFrame)\n", " \n", " # When\n", " medianOfFreeBedsByKreisTable = medianOfFreeBedsByKreisTableFactory.createMedianOfFreeBedsByKreisTable()\n", "\n", " # Then\n", " assert_frame_equal(\n", " medianOfFreeBedsByKreisTable,\n", " TestHelper.createDataFrame(\n", " columns = ['median_free_beds_in_percent'],\n", " data = [ [statistics.median([40/(40 + 38) * 100, 42/(42 + 36) * 100, 44/(44 + 34) * 100])],\n", " [9/(9 + 10) * 100]],\n", " index = pd.Index(\n", " name = 'Kreis',\n", " data = [\n", " 'Flensburg, Stadt',\n", " 'Bamberg'\n", " ])),\n", " check_dtype = False)" ] }, { "cell_type": "code", "execution_count": null, "id": "af22cdc5", "metadata": {}, "outputs": [], "source": [ "unittest.main(argv = [''], verbosity = 2, exit = False)" ] }, { "cell_type": "code", "execution_count": null, "id": "0218cdb4", "metadata": {}, "outputs": [], "source": [ "def publish():\n", " %cd /home/frankknoll/Dokumente/Corona/projects/HowBadIsMyBatch-pages\n", " ! git add -A\n", " ! git commit -m \"updating data\"\n", " ! git push" ] }, { "cell_type": "code", "execution_count": null, "id": "5f173c2b", "metadata": {}, "outputs": [], "source": [ "publish()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.7" } }, "nbformat": 4, "nbformat_minor": 5 }