{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "9de5907f-18f5-4cb1-903e-26028ff1fa03", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from urllib import request\n", "\n", "pd.set_option('display.max_rows', 100)\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('mode.chained_assignment', 'raise')" ] }, { "cell_type": "code", "execution_count": 2, "id": "e40d4c8d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "31.03.2022, 08:46:18 Uhr\n" ] } ], "source": [ "from datetime import datetime\n", "\n", "print(datetime.now().strftime(\"%d.%m.%Y, %H:%M:%S Uhr\"))" ] }, { "cell_type": "code", "execution_count": 3, "id": "579c0911", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/frankknoll/Dokumente/Corona/projects/HowBadIsMyBatch-pages/src/intensivstationen\n" ] } ], "source": [ "! pwd" ] }, { "cell_type": "code", "execution_count": 4, "id": "dfa836ec", "metadata": {}, "outputs": [], "source": [ "needsUpdate = False" ] }, { "cell_type": "code", "execution_count": 5, "id": "79de4057", "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "import requests\n", "from datetime import datetime\n", "from time import sleep\n", "from selenium import webdriver\n", "from selenium.webdriver.firefox.options import Options\n", "\n", "class DateProvider:\n", " \n", " INTENSIVSTATIONEN_DATE_FORMAT = \"%d.%m.%Y, %H:%M Uhr\"\n", "\n", " def __init__(self):\n", " self.lastUpdated = None\n", " self.lastUpdatedDataSource = None\n", "\n", " def needsUpdate(self):\n", " return self.getLastUpdated() < self.getLastUpdatedDataSource()\n", " \n", " def getLastUpdated(self):\n", " if self.lastUpdated is None:\n", " htmlContent = requests.get(\"https://knollfrank.github.io/HowBadIsMyBatch/intensivstationen.html\").text\n", " soup = BeautifulSoup(htmlContent, \"lxml\")\n", " dateStr = soup.find(id = \"Datenstand\").text\n", " self.lastUpdated = datetime.strptime(dateStr, DateProvider.INTENSIVSTATIONEN_DATE_FORMAT)\n", " \n", " return self.lastUpdated\n", "\n", " def getLastUpdatedDataSource(self):\n", " if self.lastUpdatedDataSource is None:\n", " html = self._getOriginalHtml()\n", " lastUpdatedColumn = 'Letzte Änderung'\n", " dataFrame = self._asDataFrame(html, lastUpdatedColumn)\n", " self.lastUpdatedDataSource = dataFrame.loc['Landkreis-Daten', lastUpdatedColumn].to_pydatetime()\n", "\n", " return self.lastUpdatedDataSource\n", "\n", " def _getOriginalHtml(self):\n", " options = Options()\n", " options.headless = True\n", " driver = webdriver.Firefox(options = options)\n", " driver.get('https://www.intensivregister.de/#/aktuelle-lage/downloads')\n", " sleep(10)\n", " innerHTML = driver.execute_script(\"return document.body.innerHTML\")\n", " driver.quit()\n", " return innerHTML\n", "\n", " def _asDataFrame(self, html, lastUpdatedColumn):\n", " dataFrame = pd.read_html(html, parse_dates = [lastUpdatedColumn])[0]\n", " dataFrame[lastUpdatedColumn] = pd.to_datetime(dataFrame[lastUpdatedColumn], format = \"%d.%m.%Y %H:%M Uhr\")\n", " dataFrame.set_index('Name', inplace = True)\n", " return dataFrame\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "336f56e6", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " lastUpdated: 2022-03-29 13:32:00\n", "lastUpdatedDataSource: 2022-03-30 13:32:00\n", "needsUpdate: True\n" ] } ], "source": [ "dateProvider = DateProvider()\n", "print(' lastUpdated:', dateProvider.getLastUpdated())\n", "print('lastUpdatedDataSource:', dateProvider.getLastUpdatedDataSource()) \n", "needsUpdate = dateProvider.needsUpdate()\n", "print('needsUpdate:', needsUpdate)" ] }, { "cell_type": "code", "execution_count": 7, "id": "03784154", "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "\n", "class HtmlTransformerUtil:\n", " \n", " def applySoupTransformerToFile(self, file, soupTransformer):\n", " self._writeSoup(soupTransformer(self._readSoup(file)), file)\n", "\n", " def _readSoup(self, file):\n", " with open(file) as fp:\n", " soup = BeautifulSoup(fp, 'lxml')\n", " return soup\n", "\n", " def _writeSoup(self, soup, file):\n", " with open(file, \"w\") as fp:\n", " fp.write(str(soup)) \n" ] }, { "cell_type": "code", "execution_count": 8, "id": "af101279", "metadata": {}, "outputs": [], "source": [ "def saveLastUpdatedIntensivstationen(lastUpdated):\n", " def setLastUpdated(soup):\n", " soup.find(id = \"Datenstand\").string.replace_with(lastUpdated.strftime(DateProvider.INTENSIVSTATIONEN_DATE_FORMAT))\n", " return soup\n", "\n", " HtmlTransformerUtil().applySoupTransformerToFile(\n", " file = \"../../docs/intensivstationen.html\",\n", " soupTransformer = setLastUpdated)" ] }, { "cell_type": "code", "execution_count": 9, "id": "63be303c", "metadata": {}, "outputs": [], "source": [ "saveLastUpdatedIntensivstationen(dateProvider.getLastUpdatedDataSource())" ] }, { "cell_type": "code", "execution_count": 10, "id": "d021de84", "metadata": {}, "outputs": [], "source": [ "def readTimeseries(download = False):\n", " timeSeriesFile = 'zeitreihe-tagesdaten.csv'\n", " if download:\n", " _downloadTimeseries(timeSeriesFile)\n", "\n", " timeseries = pd.read_csv(\n", " timeSeriesFile,\n", " low_memory = False,\n", " usecols = ['date', 'bundesland', 'gemeindeschluessel', 'betten_belegt', 'betten_frei'],\n", " parse_dates = ['date'],\n", " date_parser = lambda dateStr: pd.to_datetime(dateStr, format = \"%Y-%m-%d\"),\n", " dtype = {\n", " 'gemeindeschluessel': 'string',\n", " 'bundesland': 'string'\n", " })\n", " return timeseries.sort_values(by = 'date', ascending = True)\n", "\n", "# download https://diviexchange.blob.core.windows.net/%24web/zeitreihe-tagesdaten.csv or https://www.intensivregister.de/#/aktuelle-lage/downloads\n", "def _downloadTimeseries(timeSeriesFile):\n", " request.urlretrieve(\n", " 'https://diviexchange.blob.core.windows.net/%24web/zeitreihe-tagesdaten.csv',\n", " timeSeriesFile)\n" ] }, { "cell_type": "code", "execution_count": 11, "id": "3f992231", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
| \n", " | date | \n", "bundesland | \n", "gemeindeschluessel | \n", "betten_frei | \n", "betten_belegt | \n", "
|---|---|---|---|---|---|
| 0 | \n", "2020-04-24 | \n", "01 | \n", "01001 | \n", "40 | \n", "38 | \n", "
| 267 | \n", "2020-04-24 | \n", "09 | \n", "09471 | \n", "9 | \n", "9 | \n", "
| 266 | \n", "2020-04-24 | \n", "09 | \n", "09464 | \n", "17 | \n", "23 | \n", "
| 265 | \n", "2020-04-24 | \n", "09 | \n", "09463 | \n", "9 | \n", "25 | \n", "
| 264 | \n", "2020-04-24 | \n", "09 | \n", "09462 | \n", "12 | \n", "51 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 279131 | \n", "2022-03-30 | \n", "06 | \n", "06437 | \n", "7 | \n", "119 | \n", "
| 279130 | \n", "2022-03-30 | \n", "06 | \n", "06436 | \n", "1 | \n", "23 | \n", "
| 279129 | \n", "2022-03-30 | \n", "06 | \n", "06435 | \n", "11 | \n", "70 | \n", "
| 279127 | \n", "2022-03-30 | \n", "06 | \n", "06433 | \n", "2 | \n", "19 | \n", "
| 279400 | \n", "2022-03-30 | \n", "16 | \n", "16077 | \n", "5 | \n", "27 | \n", "
279401 rows × 5 columns
\n", "| \n", " | Bundesland | \n", "Kreis | \n", "Einwohnerzahl | \n", "
|---|---|---|---|
| Key | \n", "\n", " | \n", " | \n", " |
| NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| 01 | \n", "Schleswig-Holstein | \n", "NaN | \n", "NaN | \n", "
| 01001 | \n", "Kreisfreie Stadt | \n", "Flensburg, Stadt | \n", "89934.0 | \n", "
| 01002 | \n", "Kreisfreie Stadt | \n", "Kiel, Landeshauptstadt | \n", "246601.0 | \n", "
| 01003 | \n", "Kreisfreie Stadt | \n", "Lübeck, Hansestadt | \n", "215846.0 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "
| 2) Die Ergebnisse ab Berichtsjahr 2016 sind aufgrund methodischer Änderungen und technischer Weiterentwicklung\\n nur bedingt mit den Vorjahreswerten vegleichbar. Erläuterungen dazu finden Sie unter www.destatis.de beim Bevölkerungsstand. | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| © Daten (im Auftrag der Herausgebergemeinschaft Statistische Ämter des Bundes und der Länder) | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| Statistisches Bundesamt (Destatis), 2021 | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
| Vervielfältigung und Verbreitung, auch auszugsweise, mit Quellenangabe gestattet. | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
488 rows × 3 columns
\n", "| \n", " | date | \n", "bundesland | \n", "gemeindeschluessel | \n", "betten_frei | \n", "betten_belegt | \n", "Kreis | \n", "Einwohnerzahl | \n", "Bundesland | \n", "
|---|---|---|---|---|---|---|---|---|
| 0 | \n", "2020-04-24 | \n", "01 | \n", "01001 | \n", "40 | \n", "38 | \n", "Flensburg, Stadt | \n", "89934.0 | \n", "Schleswig-Holstein | \n", "
| 267 | \n", "2020-04-24 | \n", "09 | \n", "09471 | \n", "9 | \n", "9 | \n", "Bamberg | \n", "147497.0 | \n", "Bayern | \n", "
| 266 | \n", "2020-04-24 | \n", "09 | \n", "09464 | \n", "17 | \n", "23 | \n", "Hof | \n", "45173.0 | \n", "Bayern | \n", "
| 265 | \n", "2020-04-24 | \n", "09 | \n", "09463 | \n", "9 | \n", "25 | \n", "Coburg | \n", "40842.0 | \n", "Bayern | \n", "
| 264 | \n", "2020-04-24 | \n", "09 | \n", "09462 | \n", "12 | \n", "51 | \n", "Bayreuth | \n", "74048.0 | \n", "Bayern | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 279131 | \n", "2022-03-30 | \n", "06 | \n", "06437 | \n", "7 | \n", "119 | \n", "Odenwaldkreis | \n", "96754.0 | \n", "Hessen | \n", "
| 279130 | \n", "2022-03-30 | \n", "06 | \n", "06436 | \n", "1 | \n", "23 | \n", "Main-Taunus-Kreis | \n", "239264.0 | \n", "Hessen | \n", "
| 279129 | \n", "2022-03-30 | \n", "06 | \n", "06435 | \n", "11 | \n", "70 | \n", "Main-Kinzig-Kreis | \n", "421689.0 | \n", "Hessen | \n", "
| 279127 | \n", "2022-03-30 | \n", "06 | \n", "06433 | \n", "2 | \n", "19 | \n", "Groß-Gerau | \n", "275807.0 | \n", "Hessen | \n", "
| 279400 | \n", "2022-03-30 | \n", "16 | \n", "16077 | \n", "5 | \n", "27 | \n", "Altenburger Land | \n", "88356.0 | \n", "Thüringen | \n", "
279401 rows × 8 columns
\n", "Test
\n", " \n", " \n", " \n", " ''',\n", " options=[\n", " '',\n", " '',\n", " ''])\n", "\n", " # Then\n", " assertEqualHTML(\n", " htmlActual,\n", " '''\n", " \n", " \n", "Test
\n", " \n", " \n", " \n", " ''')\n", "\n", "# adapted from https://stackoverflow.com/questions/8006909/pretty-print-assertequal-for-html-strings\n", "def assertEqualHTML(string1, string2, file1='', file2=''):\n", " u'''\n", " Compare two unicode strings containing HTML.\n", " A human friendly diff goes to logging.error() if they\n", " are not equal, and an exception gets raised.\n", " '''\n", " from bs4 import BeautifulSoup as bs\n", " import difflib\n", "\n", " def short(mystr):\n", " max = 20\n", " if len(mystr) > max:\n", " return mystr[:max]\n", " return mystr\n", " p = []\n", " for mystr, file in [(string1, file1), (string2, file2)]:\n", " if not isinstance(mystr, str):\n", " raise Exception(u'string ist not unicode: %r %s' %\n", " (short(mystr), file))\n", " soup = bs(mystr)\n", " pretty = soup.prettify()\n", " p.append(pretty)\n", " if p[0] != p[1]:\n", " for line in difflib.unified_diff(p[0].splitlines(), p[1].splitlines(), fromfile=file1, tofile=file2):\n", " display(line)\n", " display(p[0], ' != ', p[1])\n", " raise Exception('Not equal %s %s' % (file1, file2))\n" ] }, { "cell_type": "code", "execution_count": 23, "id": "403f8b7b", "metadata": {}, "outputs": [], "source": [ "from bs4 import BeautifulSoup\n", "\n", "\n", "def saveKreisOptions(kreisOptions):\n", " HtmlTransformerUtil().applySoupTransformerToFile(\n", " file = \"../../docs/intensivstationen.html\",\n", " soupTransformer =\n", " lambda soup:\n", " BeautifulSoup(\n", " KreisOptionsSetter().setKreisOptions(html = str(soup), options = kreisOptions),\n", " 'lxml'))\n" ] }, { "cell_type": "code", "execution_count": 24, "id": "5dd8d864", "metadata": {}, "outputs": [], "source": [ "saveKreisOptions(kreisOptions)" ] }, { "cell_type": "code", "execution_count": 25, "id": "43c2f826", "metadata": {}, "outputs": [], "source": [ "import os\n", "import json\n", "\n", "\n", "class IOUtils:\n", "\n", " def saveDictAsJson(dict, file):\n", " IOUtils.ensurePath(file)\n", " with open(file, 'w') as outfile:\n", " json.dump(dict, outfile)\n", "\n", " @staticmethod\n", " def ensurePath(file):\n", " directory = os.path.dirname(file)\n", " if not os.path.exists(directory):\n", " os.makedirs(directory)\n" ] }, { "cell_type": "code", "execution_count": 26, "id": "997a4bdb", "metadata": {}, "outputs": [], "source": [ "def getIntensiveCareBeds(timeSeries, kreis = None):\n", " if kreis is not None:\n", " return timeSeries[timeSeries['Kreis'] == kreis][['date', 'betten_belegt', 'betten_frei', 'Einwohnerzahl']]\n", " else:\n", " return timeSeries.groupby('date').agg(**{\n", " 'betten_belegt': pd.NamedAgg(column = 'betten_belegt', aggfunc = 'sum'),\n", " 'betten_frei': pd.NamedAgg(column = 'betten_frei', aggfunc = 'sum'),\n", " 'Einwohnerzahl': pd.NamedAgg(column = 'Einwohnerzahl', aggfunc = 'sum')\n", " }).reset_index()" ] }, { "cell_type": "code", "execution_count": 27, "id": "a97f5b2b", "metadata": {}, "outputs": [], "source": [ "def getAndPersistIntensiveCareBeds(timeSeries, kreis = None):\n", " intensiveCareBeds = getIntensiveCareBeds(timeSeries, kreis)\n", " display(kreis)\n", " _saveAsJson(intensiveCareBeds, _getFilename(kreis))\n", " return intensiveCareBeds\n", "\n", "\n", "def _saveAsJson(intensiveCareBeds, file):\n", " IOUtils.saveDictAsJson(\n", " {\n", " 'population': int(intensiveCareBeds.iloc[0]['Einwohnerzahl']),\n", " 'data': _intensiveCareBeds2Dict(intensiveCareBeds),\n", " },\n", " file)\n", "\n", "\n", "def _intensiveCareBeds2Dict(intensiveCareBeds):\n", " df = intensiveCareBeds[['date', 'betten_belegt', 'betten_frei']].copy()\n", " df['date'] = df['date'].dt.strftime('%Y-%m-%d')\n", " return df.to_dict(orient = \"records\")\n", "\n", "\n", "def _getFilename(kreis):\n", " return f'../../docs/data/intensivstationen/intensivstationen-{_getSuffix(kreis)}.json'\n", "\n", "\n", "def _getSuffix(kreis):\n", " return kreis if kreis is not None else 'de'\n" ] }, { "cell_type": "code", "execution_count": 28, "id": "349edd73", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "None" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "| \n", " | date | \n", "betten_belegt | \n", "betten_frei | \n", "Einwohnerzahl | \n", "
|---|---|---|---|---|
| 0 | \n", "2020-04-24 | \n", "19237 | \n", "12270 | \n", "82401553.0 | \n", "
| 1 | \n", "2020-04-25 | \n", "19100 | \n", "12290 | \n", "82401553.0 | \n", "
| 2 | \n", "2020-04-26 | \n", "18617 | \n", "12694 | \n", "82401553.0 | \n", "
| 3 | \n", "2020-04-27 | \n", "18803 | \n", "12537 | \n", "82360711.0 | \n", "
| 4 | \n", "2020-04-28 | \n", "19345 | \n", "12207 | \n", "82504802.0 | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 701 | \n", "2022-03-26 | \n", "20268 | \n", "3924 | \n", "82658396.0 | \n", "
| 702 | \n", "2022-03-27 | \n", "19921 | \n", "4187 | \n", "82658396.0 | \n", "
| 703 | \n", "2022-03-28 | \n", "20123 | \n", "4263 | \n", "82658396.0 | \n", "
| 704 | \n", "2022-03-29 | \n", "20656 | \n", "3904 | \n", "82658396.0 | \n", "
| 705 | \n", "2022-03-30 | \n", "20828 | \n", "3772 | \n", "82658396.0 | \n", "
706 rows × 4 columns
\n", "