From d313d33d58e5d8006616caf4bdb2ede74a9991e2 Mon Sep 17 00:00:00 2001 From: frankknoll Date: Wed, 14 Dec 2022 10:33:27 +0100 Subject: [PATCH] refactoring --- .gitignore | 1 + src/HowBadIsMyBatch.ipynb | 2 +- src/intensivstationen/DateProvider.py | 55 ++++++++++++ src/intensivstationen/Intensivstationen.ipynb | 88 ++----------------- 4 files changed, 65 insertions(+), 81 deletions(-) create mode 100644 src/intensivstationen/DateProvider.py diff --git a/.gitignore b/.gitignore index 20be6070905..ae2653d0236 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ src/captchaImage.jpeg src/HowBadIsMyBatch.nbconvert.ipynb src/HowBadIsMyBatch.nbconvert.html src/__pycache__/ +src/intensivstationen/__pycache__/ diff --git a/src/HowBadIsMyBatch.ipynb b/src/HowBadIsMyBatch.ipynb index 50b913e44b6..f9d5cdae9b0 100644 --- a/src/HowBadIsMyBatch.ipynb +++ b/src/HowBadIsMyBatch.ipynb @@ -138,7 +138,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.10.8 (main, Nov 24 2022, 14:13:03) [GCC 11.2.0]" }, "vscode": { "interpreter": { diff --git a/src/intensivstationen/DateProvider.py b/src/intensivstationen/DateProvider.py new file mode 100644 index 00000000000..c43d161dfa3 --- /dev/null +++ b/src/intensivstationen/DateProvider.py @@ -0,0 +1,55 @@ +from bs4 import BeautifulSoup +import requests +from datetime import datetime +from time import sleep +from selenium import webdriver +from selenium.webdriver.firefox.options import Options +import pandas as pd + +class DateProvider: + + INTENSIVSTATIONEN_DATE_FORMAT = "%d.%m.%Y, %H:%M Uhr" + + def __init__(self): + self.lastUpdated = None + self.lastUpdatedDataSource = None + + def needsUpdate(self): + return self.getLastUpdated() < self.getLastUpdatedDataSource() + + def getLastUpdated(self): + if self.lastUpdated is None: + htmlContent = requests.get("https://knollfrank.github.io/HowBadIsMyBatch/intensivstationen.html").text + soup = BeautifulSoup(htmlContent, "lxml") + dateStr = soup.find(id = "Datenstand").text + self.lastUpdated = datetime.strptime(dateStr, DateProvider.INTENSIVSTATIONEN_DATE_FORMAT) + + return self.lastUpdated + + def getLastUpdatedDataSource(self): + if self.lastUpdatedDataSource is None: + html = self._getOriginalHtml() + lastUpdatedColumn = 'Letzte Änderung' + dataFrame = self._asDataFrame(html, lastUpdatedColumn) + self.lastUpdatedDataSource = dataFrame.loc['Landkreis-Daten', lastUpdatedColumn].to_pydatetime() + + return self.lastUpdatedDataSource + + def _getOriginalHtml(self): + options = Options() + options.headless = True + options.add_argument("-profile") + # put the root directory your default profile path here, you can check it by opening Firefox and then pasting 'about:profiles' into the url field + options.add_argument("/home/frankknoll/snap/firefox/common/.mozilla/firefox/1j6r2yp6.default") + driver = webdriver.Firefox(options = options) + driver.get('https://www.intensivregister.de/#/aktuelle-lage/downloads') + sleep(10) + innerHTML = driver.execute_script("return document.body.innerHTML") + driver.quit() + return innerHTML + + def _asDataFrame(self, html, lastUpdatedColumn): + dataFrame = pd.read_html(html, parse_dates = [lastUpdatedColumn])[0] + dataFrame[lastUpdatedColumn] = pd.to_datetime(dataFrame[lastUpdatedColumn], format = "%d.%m.%Y %H:%M Uhr") + dataFrame.set_index('Name', inplace = True) + return dataFrame diff --git a/src/intensivstationen/Intensivstationen.ipynb b/src/intensivstationen/Intensivstationen.ipynb index 2736362ec1f..748660c2dec 100644 --- a/src/intensivstationen/Intensivstationen.ipynb +++ b/src/intensivstationen/Intensivstationen.ipynb @@ -9,10 +9,13 @@ "source": [ "import pandas as pd\n", "from urllib import request\n", + "import os\n", + "from DateProvider import DateProvider\n", + "from datetime import datetime\n", "\n", "pd.set_option('display.max_rows', 100)\n", "pd.set_option('display.max_columns', None)\n", - "pd.set_option('mode.chained_assignment', 'raise')" + "pd.set_option('mode.chained_assignment', 'raise')\n" ] }, { @@ -22,92 +25,17 @@ "metadata": {}, "outputs": [], "source": [ - "from datetime import datetime\n", - "\n", "print(datetime.now().strftime(\"%d.%m.%Y, %H:%M:%S Uhr\"))" ] }, { "cell_type": "code", "execution_count": null, - "id": "579c0911", + "id": "98981ab9", "metadata": {}, "outputs": [], "source": [ - "! pwd" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dfa836ec", - "metadata": {}, - "outputs": [], - "source": [ - "needsUpdate = False" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "79de4057", - "metadata": {}, - "outputs": [], - "source": [ - "from bs4 import BeautifulSoup\n", - "import requests\n", - "from datetime import datetime\n", - "from time import sleep\n", - "from selenium import webdriver\n", - "from selenium.webdriver.firefox.options import Options\n", - "\n", - "class DateProvider:\n", - " \n", - " INTENSIVSTATIONEN_DATE_FORMAT = \"%d.%m.%Y, %H:%M Uhr\"\n", - "\n", - " def __init__(self):\n", - " self.lastUpdated = None\n", - " self.lastUpdatedDataSource = None\n", - "\n", - " def needsUpdate(self):\n", - " return self.getLastUpdated() < self.getLastUpdatedDataSource()\n", - " \n", - " def getLastUpdated(self):\n", - " if self.lastUpdated is None:\n", - " htmlContent = requests.get(\"https://knollfrank.github.io/HowBadIsMyBatch/intensivstationen.html\").text\n", - " soup = BeautifulSoup(htmlContent, \"lxml\")\n", - " dateStr = soup.find(id = \"Datenstand\").text\n", - " self.lastUpdated = datetime.strptime(dateStr, DateProvider.INTENSIVSTATIONEN_DATE_FORMAT)\n", - " \n", - " return self.lastUpdated\n", - "\n", - " def getLastUpdatedDataSource(self):\n", - " if self.lastUpdatedDataSource is None:\n", - " html = self._getOriginalHtml()\n", - " lastUpdatedColumn = 'Letzte Änderung'\n", - " dataFrame = self._asDataFrame(html, lastUpdatedColumn)\n", - " self.lastUpdatedDataSource = dataFrame.loc['Landkreis-Daten', lastUpdatedColumn].to_pydatetime()\n", - "\n", - " return self.lastUpdatedDataSource\n", - "\n", - " def _getOriginalHtml(self):\n", - " options = Options()\n", - " options.headless = True\n", - " options.add_argument(\"-profile\")\n", - " # put the root directory your default profile path here, you can check it by opening Firefox and then pasting 'about:profiles' into the url field \n", - " options.add_argument(\"/home/frankknoll/snap/firefox/common/.mozilla/firefox/1j6r2yp6.default\")\n", - " driver = webdriver.Firefox(options = options)\n", - " driver.get('https://www.intensivregister.de/#/aktuelle-lage/downloads')\n", - " sleep(10)\n", - " innerHTML = driver.execute_script(\"return document.body.innerHTML\")\n", - " driver.quit()\n", - " return innerHTML\n", - "\n", - " def _asDataFrame(self, html, lastUpdatedColumn):\n", - " dataFrame = pd.read_html(html, parse_dates = [lastUpdatedColumn])[0]\n", - " dataFrame[lastUpdatedColumn] = pd.to_datetime(dataFrame[lastUpdatedColumn], format = \"%d.%m.%Y %H:%M Uhr\")\n", - " dataFrame.set_index('Name', inplace = True)\n", - " return dataFrame\n" + "os.getcwd()" ] }, { @@ -831,9 +759,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.10.8 ('howbadismybatch-venv')", + "display_name": "howbadismybatch-venv-kernel", "language": "python", - "name": "python3" + "name": "howbadismybatch-venv-kernel" }, "language_info": { "codemirror_mode": {