{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "md-title",
   "metadata": {},
   "source": [
    "# Intensive care beds: refresh the data behind intensivstationen.html\n",
    "\n",
    "This notebook compares the `Datenstand` shown on the published page with the `Letzte Änderung` date of the `Landkreis-Daten` download on intensivregister.de, downloads the time series again if the published page is older, writes one JSON file per Kreis plus one for all of Germany, and finally commits and pushes the result."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9de5907f-18f5-4cb1-903e-26028ff1fa03",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "import subprocess\n",
    "from datetime import datetime\n",
    "from html import escape\n",
    "from io import StringIO\n",
    "from time import sleep\n",
    "from urllib import request\n",
    "\n",
    "import pandas as pd\n",
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.firefox.options import Options\n",
    "\n",
    "pd.set_option('display.max_rows', 100)\n",
    "pd.set_option('display.max_columns', None)\n",
    "pd.set_option('mode.chained_assignment', 'raise')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dfa836ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Default; overwritten below once both dates have been compared.\n",
    "needsUpdate = False"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-check-update",
   "metadata": {},
   "source": [
    "## Check whether the published data is stale"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79de4057",
   "metadata": {},
   "outputs": [],
   "source": [
    "class DateProvider:\n",
    "    \"\"\"Provides the two dates used to decide whether the published data is stale.\n",
    "\n",
    "    Both dates are fetched lazily and cached on the instance after the first access.\n",
    "    \"\"\"\n",
    "\n",
    "    INTENSIVSTATIONEN_DATE_FORMAT = \"%d.%m.%Y, %H:%M Uhr\"\n",
    "    ORIGINAL_DATE_FORMAT = \"%d.%m.%Y %H:%M Uhr\"\n",
    "    REQUEST_TIMEOUT_SECONDS = 30\n",
    "\n",
    "    def __init__(self):\n",
    "        self.lastUpdatedIntensivstationen = None\n",
    "        self.lastUpdatedOriginal = None\n",
    "\n",
    "    def needsUpdate(self):\n",
    "        \"\"\"Return True if the published page is older than the upstream data.\"\"\"\n",
    "        return self.getLastUpdatedIntensivstationen() < self.getLastUpdatedOriginal()\n",
    "\n",
    "    def getLastUpdatedIntensivstationen(self):\n",
    "        \"\"\"Return the date shown in the 'Datenstand' element of the published page.\n",
    "\n",
    "        Raises requests.HTTPError on a non-success response and ValueError if the\n",
    "        element is missing or its text does not match INTENSIVSTATIONEN_DATE_FORMAT.\n",
    "        \"\"\"\n",
    "        if self.lastUpdatedIntensivstationen is None:\n",
    "            response = requests.get(\n",
    "                \"https://knollfrank.github.io/HowBadIsMyBatch/intensivstationen.html\",\n",
    "                timeout = DateProvider.REQUEST_TIMEOUT_SECONDS)\n",
    "            # Fail here instead of parsing an error page.\n",
    "            response.raise_for_status()\n",
    "            soup = BeautifulSoup(response.text, \"lxml\")\n",
    "            dateElement = soup.find(id = \"Datenstand\")\n",
    "            if dateElement is None:\n",
    "                raise ValueError('no element with id Datenstand found in intensivstationen.html')\n",
    "            self.lastUpdatedIntensivstationen = datetime.strptime(\n",
    "                dateElement.text.strip(),\n",
    "                DateProvider.INTENSIVSTATIONEN_DATE_FORMAT)\n",
    "\n",
    "        return self.lastUpdatedIntensivstationen\n",
    "\n",
    "    def getLastUpdatedOriginal(self):\n",
    "        \"\"\"Return the 'Letzte Änderung' date of the 'Landkreis-Daten' row on the downloads page.\"\"\"\n",
    "        if self.lastUpdatedOriginal is None:\n",
    "            html = self._getOriginalHtml()\n",
    "            lastUpdatedColumn = 'Letzte Änderung'\n",
    "            dataFrame = self._asDataFrame(html, lastUpdatedColumn)\n",
    "            self.lastUpdatedOriginal = dataFrame.loc['Landkreis-Daten', lastUpdatedColumn].to_pydatetime()\n",
    "\n",
    "        return self.lastUpdatedOriginal\n",
    "\n",
    "    def _getOriginalHtml(self):\n",
    "        \"\"\"Load the downloads page in headless Firefox and return the body's inner HTML.\"\"\"\n",
    "        options = Options()\n",
    "        # The Options.headless property is gone from current Selenium 4 releases;\n",
    "        # the command line argument works with old and new versions.\n",
    "        options.add_argument('-headless')\n",
    "        driver = webdriver.Firefox(options = options)\n",
    "        try:\n",
    "            driver.get('https://www.intensivregister.de/#/aktuelle-lage/downloads')\n",
    "            # Give the page time to build its content before the DOM is read.\n",
    "            sleep(10)\n",
    "            return driver.execute_script(\"return document.body.innerHTML\")\n",
    "        finally:\n",
    "            # Always shut the browser down, even if loading or scripting failed.\n",
    "            driver.quit()\n",
    "\n",
    "    def _asDataFrame(self, html, lastUpdatedColumn):\n",
    "        \"\"\"Parse the first table in html, indexed by 'Name', with lastUpdatedColumn as datetimes.\"\"\"\n",
    "        # read_html expects a file-like object; the dates are day-first, so they are\n",
    "        # parsed only once and only with the explicit format.\n",
    "        dataFrame = pd.read_html(StringIO(html))[0]\n",
    "        dataFrame[lastUpdatedColumn] = pd.to_datetime(\n",
    "            dataFrame[lastUpdatedColumn],\n",
    "            format = DateProvider.ORIGINAL_DATE_FORMAT)\n",
    "        return dataFrame.set_index('Name')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "336f56e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "dateProvider = DateProvider()\n",
    "print('lastUpdatedIntensivstationen:', dateProvider.getLastUpdatedIntensivstationen())\n",
    "print('lastUpdatedOriginal:', dateProvider.getLastUpdatedOriginal())\n",
    "needsUpdate = dateProvider.needsUpdate()\n",
    "print('needsUpdate: ', needsUpdate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "03784154",
   "metadata": {},
   "outputs": [],
   "source": [
    "def saveLastUpdatedIntensivstationen(lastUpdated):\n",
    "    \"\"\"Write lastUpdated into the 'Datenstand' element of the local intensivstationen.html.\"\"\"\n",
    "    file = \"../../docs/intensivstationen.html\"\n",
    "    # NOTE(review): the page is assumed to be stored as UTF-8 - confirm.\n",
    "    with open(file, encoding = 'utf-8') as fp:\n",
    "        soup = BeautifulSoup(fp, 'lxml')\n",
    "\n",
    "    soup.find(id = \"Datenstand\").string.replace_with(lastUpdated.strftime(DateProvider.INTENSIVSTATIONEN_DATE_FORMAT))\n",
    "\n",
    "    with open(file, \"w\", encoding = 'utf-8') as fp:\n",
    "        fp.write(str(soup))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "63be303c",
   "metadata": {},
   "outputs": [],
   "source": [
    "saveLastUpdatedIntensivstationen(dateProvider.getLastUpdatedOriginal())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-load-data",
   "metadata": {},
   "source": [
    "## Load the time series and the Kreis table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d021de84",
   "metadata": {},
   "outputs": [],
   "source": [
    "def readTimeseries(download = False):\n",
    "    \"\"\"Read zeitreihe-tagesdaten.csv, sorted by date; download it first if requested.\"\"\"\n",
    "    timeSeriesFile = 'zeitreihe-tagesdaten.csv'\n",
    "    if download:\n",
    "        _downloadTimeseries(timeSeriesFile)\n",
    "\n",
    "    timeseries = pd.read_csv(\n",
    "        timeSeriesFile,\n",
    "        low_memory = False,\n",
    "        usecols = ['date', 'bundesland', 'gemeindeschluessel', 'betten_belegt', 'betten_frei'],\n",
    "        dtype = {\n",
    "            'gemeindeschluessel': 'string',\n",
    "            'bundesland': 'string'\n",
    "        })\n",
    "    # Converting after the read avoids read_csv's deprecated date_parser argument\n",
    "    # and works with old and new pandas versions alike.\n",
    "    timeseries['date'] = pd.to_datetime(timeseries['date'], format = \"%Y-%m-%d\")\n",
    "    return timeseries.sort_values(by = 'date', ascending = True)\n",
    "\n",
    "# download https://diviexchange.blob.core.windows.net/%24web/zeitreihe-tagesdaten.csv or https://www.intensivregister.de/#/aktuelle-lage/downloads\n",
    "def _downloadTimeseries(timeSeriesFile):\n",
    "    \"\"\"Download the daily time series CSV to timeSeriesFile.\"\"\"\n",
    "    request.urlretrieve(\n",
    "        'https://diviexchange.blob.core.windows.net/%24web/zeitreihe-tagesdaten.csv',\n",
    "        timeSeriesFile)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f992231",
   "metadata": {},
   "outputs": [],
   "source": [
    "timeSeries = readTimeseries(download = needsUpdate)\n",
    "timeSeries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d34c6a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def readKreise(download = False):\n",
    "    \"\"\"Read 04-kreise.xlsx into a frame indexed by 'Key' with columns Bundesland, Kreis, Einwohnerzahl.\"\"\"\n",
    "    kreiseFile = '04-kreise.xlsx'\n",
    "    if download:\n",
    "        _downloadKreise(kreiseFile)\n",
    "\n",
    "    kreise = pd.read_excel(\n",
    "        kreiseFile,\n",
    "        sheet_name = 'Kreisfreie Städte u. Landkreise',\n",
    "        header = 5,\n",
    "        index_col = 0)\n",
    "    # NOTE(review): the header labels are matched as given here ('2' as text, 3 and 6 as numbers) - confirm against the workbook.\n",
    "    kreise = kreise.rename(columns = {'2': 'Bundesland', 3: 'Kreis', 6: 'Einwohnerzahl'})[['Bundesland', 'Kreis', 'Einwohnerzahl']]\n",
    "    return kreise.rename_axis(\"Key\")\n",
    "\n",
    "# download https://www.destatis.de/DE/Themen/Laender-Regionen/Regionales/Gemeindeverzeichnis/Administrativ/04-kreise.xlsx?__blob=publicationFile or https://www.destatis.de/DE/Themen/Laender-Regionen/Regionales/Gemeindeverzeichnis/Administrativ/04-kreise.html\n",
    "def _downloadKreise(kreiseFile):\n",
    "    \"\"\"Download the Kreis workbook to kreiseFile.\"\"\"\n",
    "    request.urlretrieve(\n",
    "        'https://www.destatis.de/DE/Themen/Laender-Regionen/Regionales/Gemeindeverzeichnis/Administrativ/04-kreise.xlsx?__blob=publicationFile',\n",
    "        kreiseFile)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "74ea4d55",
   "metadata": {},
   "outputs": [],
   "source": [
    "kreise = readKreise(download = False)\n",
    "kreise"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-enrich",
   "metadata": {},
   "source": [
    "## Add Kreis, Einwohnerzahl and Bundesland to the time series"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af96fb11",
   "metadata": {},
   "outputs": [],
   "source": [
    "class ColumnsAdder:\n",
    "    \"\"\"Adds 'Kreis', 'Einwohnerzahl' and 'Bundesland' columns looked up in the kreise table.\"\"\"\n",
    "\n",
    "    ADDED_COLUMNS = ['Kreis', 'Einwohnerzahl', 'Bundesland']\n",
    "\n",
    "    def __init__(self, kreise):\n",
    "        self.kreise = kreise\n",
    "\n",
    "    def addKreisAndBundeslandAndEinwohnerzahlColumns(self, dataFrame):\n",
    "        \"\"\"Return a new frame with the three columns added; dataFrame itself is not modified.\n",
    "\n",
    "        'Kreis' and 'Einwohnerzahl' are looked up by 'gemeindeschluessel',\n",
    "        'Bundesland' by 'bundesland'. Rows without a match get missing values.\n",
    "        \"\"\"\n",
    "        # Dropping columns left over from an earlier call makes re-running the cell safe.\n",
    "        base = dataFrame.drop(columns = ColumnsAdder.ADDED_COLUMNS, errors = 'ignore')\n",
    "        withKreis = pd.merge(\n",
    "            base,\n",
    "            self.kreise[['Kreis', 'Einwohnerzahl']],\n",
    "            how = 'left',\n",
    "            left_on = 'gemeindeschluessel',\n",
    "            right_index = True)\n",
    "        return self._addBundeslandColumn(withKreis)\n",
    "\n",
    "    def _addBundeslandColumn(self, dataFrame):\n",
    "        \"\"\"Left-join the 'Bundesland' name on the 'bundesland' key column.\"\"\"\n",
    "        return pd.merge(\n",
    "            dataFrame,\n",
    "            self._createBundeslandByKeyTable(),\n",
    "            how = 'left',\n",
    "            left_on = 'bundesland',\n",
    "            right_index = True)\n",
    "\n",
    "    def _createBundeslandByKeyTable(self):\n",
    "        \"\"\"Return the 'Bundesland' column of those kreise rows whose key has exactly two characters.\"\"\"\n",
    "        hasTwoCharacterKey = self.kreise.index.str.len() == 2\n",
    "        return self.kreise.loc[hasTwoCharacterKey, ['Bundesland']]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62a20115",
   "metadata": {},
   "outputs": [],
   "source": [
    "timeSeries = ColumnsAdder(kreise).addKreisAndBundeslandAndEinwohnerzahlColumns(timeSeries)\n",
    "timeSeries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "356494d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows whose gemeindeschluessel has no match carry no Kreis name; they are left out\n",
    "# because a missing value can neither be sorted nor exported under a file name.\n",
    "kreisValues = sorted(timeSeries['Kreis'].dropna().unique())\n",
    "kreisValues"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05aa0117",
   "metadata": {},
   "outputs": [],
   "source": [
    "def printKreisOptions(kreisValues):\n",
    "    \"\"\"Print one HTML option element per Kreis.\"\"\"\n",
    "    for kreis in kreisValues:\n",
    "        printKreisOption(kreis)\n",
    "\n",
    "def printKreisOption(kreis):\n",
    "    \"\"\"Print an HTML option element whose value and label are the escaped Kreis name.\"\"\"\n",
    "    # NOTE(review): the markup was reconstructed from the TODO below - confirm against intensivstationen.html.\n",
    "    print('<option value=\"{kreis}\">{kreis}</option>'.format(kreis = escape(str(kreis))))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33a4b725",
   "metadata": {},
   "outputs": [],
   "source": [
    "# FK-TODO: insert the following options into the select element of intensivstationen.html, after \"Alle Landkreise\"\n",
    "printKreisOptions(kreisValues)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-export",
   "metadata": {},
   "source": [
    "## Export one JSON file for Germany and one per Kreis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43c2f826",
   "metadata": {},
   "outputs": [],
   "source": [
    "class IOUtils:\n",
    "    \"\"\"File helpers.\"\"\"\n",
    "\n",
    "    @staticmethod\n",
    "    def saveDictAsJson(dict, file):\n",
    "        \"\"\"Serialize dict as JSON into file, creating the parent directory if needed.\"\"\"\n",
    "        IOUtils.ensurePath(file)\n",
    "        with open(file, 'w', encoding = 'utf-8') as outfile:\n",
    "            json.dump(dict, outfile)\n",
    "\n",
    "    @staticmethod\n",
    "    def ensurePath(file):\n",
    "        \"\"\"Create the directory that contains file if it does not exist yet.\"\"\"\n",
    "        directory = os.path.dirname(file)\n",
    "        # A bare file name has no directory part; os.makedirs('') would raise.\n",
    "        if directory:\n",
    "            os.makedirs(directory, exist_ok = True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "997a4bdb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def getIntensiveCareBeds(timeSeries, kreis = None):\n",
    "    \"\"\"Return date, betten_belegt, betten_frei and Einwohnerzahl for one Kreis.\n",
    "\n",
    "    Without a kreis the three value columns are summed per date over all rows.\n",
    "    \"\"\"\n",
    "    if kreis is None:\n",
    "        return timeSeries.groupby('date').agg(\n",
    "            betten_belegt = ('betten_belegt', 'sum'),\n",
    "            betten_frei = ('betten_frei', 'sum'),\n",
    "            Einwohnerzahl = ('Einwohnerzahl', 'sum')).reset_index()\n",
    "\n",
    "    isKreis = timeSeries['Kreis'] == kreis\n",
    "    return timeSeries.loc[isKreis, ['date', 'betten_belegt', 'betten_frei', 'Einwohnerzahl']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a97f5b2b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def getAndPersistIntensiveCareBeds(timeSeries, kreis = None):\n",
    "    \"\"\"Compute the beds table for kreis (or for all rows), save it as JSON and return it.\"\"\"\n",
    "    intensiveCareBeds = getIntensiveCareBeds(timeSeries, kreis)\n",
    "    display(kreis)\n",
    "    _saveAsJson(intensiveCareBeds, _getFilename(kreis))\n",
    "    return intensiveCareBeds\n",
    "\n",
    "\n",
    "def _saveAsJson(intensiveCareBeds, file):\n",
    "    \"\"\"Save the population of the first row and the per-date records to file.\n",
    "\n",
    "    Raises ValueError if intensiveCareBeds has no rows.\n",
    "    \"\"\"\n",
    "    if intensiveCareBeds.empty:\n",
    "        raise ValueError('no intensive care bed data to save to ' + file)\n",
    "\n",
    "    IOUtils.saveDictAsJson(\n",
    "        {\n",
    "            'population': int(intensiveCareBeds.iloc[0]['Einwohnerzahl']),\n",
    "            'data': _intensiveCareBeds2Dict(intensiveCareBeds),\n",
    "        },\n",
    "        file)\n",
    "\n",
    "\n",
    "def _intensiveCareBeds2Dict(intensiveCareBeds):\n",
    "    \"\"\"Return one record per row with the date formatted as YYYY-MM-DD.\"\"\"\n",
    "    df = intensiveCareBeds[['date', 'betten_belegt', 'betten_frei']].copy()\n",
    "    df['date'] = df['date'].dt.strftime('%Y-%m-%d')\n",
    "    return df.to_dict(orient = \"records\")\n",
    "\n",
    "\n",
    "def _getFilename(kreis):\n",
    "    \"\"\"Return the JSON path for kreis; the suffix is 'de' when kreis is None.\"\"\"\n",
    "    return '../../docs/data/intensivstationen/intensivstationen-{suffix}.json'.format(suffix = _getSuffix(kreis))\n",
    "\n",
    "\n",
    "def _getSuffix(kreis):\n",
    "    return kreis if kreis is not None else 'de'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "349edd73",
   "metadata": {},
   "outputs": [],
   "source": [
    "getAndPersistIntensiveCareBeds(timeSeries)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b97137f",
   "metadata": {},
   "outputs": [],
   "source": [
    "for kreis in kreisValues:\n",
    "    getAndPersistIntensiveCareBeds(timeSeries, kreis)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "md-publish",
   "metadata": {},
   "source": [
    "## Publish"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0218cdb4",
   "metadata": {},
   "outputs": [],
   "source": [
    "def publish(repositoryDir = '/home/frankknoll/Dokumente/Corona/projects/HowBadIsMyBatch-pages'):\n",
    "    \"\"\"Stage, commit and push all changes in repositoryDir.\n",
    "\n",
    "    The git commands run with repositoryDir as their working directory, so the\n",
    "    kernel's own working directory - and with it the relative paths used by the\n",
    "    cells above - stays untouched. Each command is attempted regardless of the\n",
    "    outcome of the previous one; a commit with nothing to commit is not an error.\n",
    "    \"\"\"\n",
    "    for gitArgs in (['add', '-A'], ['commit', '-m', 'updating data'], ['push']):\n",
    "        _runGit(gitArgs, repositoryDir)\n",
    "\n",
    "\n",
    "def _runGit(gitArgs, repositoryDir):\n",
    "    \"\"\"Run one git command in repositoryDir, print its output and return its exit code.\"\"\"\n",
    "    completed = subprocess.run(\n",
    "        ['git'] + gitArgs,\n",
    "        cwd = repositoryDir,\n",
    "        capture_output = True,\n",
    "        text = True)\n",
    "    print(completed.stdout, completed.stderr, sep = '')\n",
    "    return completed.returncode"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5f173c2b",
   "metadata": {},
   "outputs": [],
   "source": [
    "publish()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}