From 2d7204a9c05d58027cbbc4b0f2dbb68fbe8b829d Mon Sep 17 00:00:00 2001
From: vlado
Date: Mon, 22 Sep 2025 07:19:26 +0200
Subject: [PATCH] W22

---
 .idea/misc.xml                          |   1 +
 Medevio4-readandsavekartoteka.py        | 209 ++++++++++++++++++++++++
 Medevio4.py                             |   3 +-
 Medevio5_ReadNamesFromKartoteka_html.py |  42 +++++
 medevio_storage.json                    |   1 +
 5 files changed, 255 insertions(+), 1 deletion(-)
 create mode 100644 Medevio4-readandsavekartoteka.py
 create mode 100644 Medevio5_ReadNamesFromKartoteka_html.py
 create mode 100644 medevio_storage.json

diff --git a/.idea/misc.xml b/.idea/misc.xml
index a278da7..7a3c570 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,4 +3,5 @@
+
\ No newline at end of file
diff --git a/Medevio4-readandsavekartoteka.py b/Medevio4-readandsavekartoteka.py
new file mode 100644
index 0000000..6e8b1bf
--- /dev/null
+++ b/Medevio4-readandsavekartoteka.py
@@ -0,0 +1,209 @@
+# This script connects to the Medevio patient registry (kartoteka), switches the
+# listing to 100 patients per page and stores the HTML of every listing page in
+# the MySQL table kartoteka_html.
+
+
+# medevio_dump_patients_html_to_mysql.py
+import time
+import json
+from pathlib import Path
+from datetime import datetime
+from typing import Set
+
+import mysql.connector
+from mysql.connector import errorcode
+from playwright.sync_api import sync_playwright, TimeoutError as PWTimeout
+
+# ---------- CONFIG ----------
+STATE_FILE = r"medevio_storage.json"
+BASE_LIST_URL = "https://my.medevio.cz/mudr-buzalkova/klinika/pacienti"
+SAVE_DELAY_SECONDS = 10  # throttle: 10 sec per patient
+
+# MySQL connection settings (fill in)
+# NOTE(review): the password is committed in plain text; consider reading it
+# from the environment or an untracked config file.
+MYSQL_CFG = dict(
+    host="192.168.1.76",
+    port=3307,
+    user="root",
+    password="Vlado9674+",
+    database="medevio",
+)
+
+
+# ---------- DB helpers ----------
+def db_connect():
+    """Open a MySQL connection using MYSQL_CFG.
+
+    Raises:
+        SystemExit: if the connection cannot be established.
+    """
+    try:
+        return mysql.connector.connect(**MYSQL_CFG)
+    except mysql.connector.Error as e:
+        raise SystemExit(f"MySQL connection failed: {e}") from e
+
+
+# ---------- Playwright helpers ----------
+def wait_for_grid_ready(page):
+    """Wait until the grid and at least one data row are in the DOM."""
+    # be generous on timeout
+    page.wait_for_selector("div[role='rowgroup']", timeout=20000)
+    page.wait_for_selector("div[role='row'][data-id]", timeout=20000)
+
+
+def set_page_size_100(page):
+    """Set the listing to 100 patients per page (called once per run)."""
+    # Click the page-size combobox (CZ/EN + generic)
+    for loc in [
+        page.get_by_role("combobox", name="Řádků na stránce:"),
+        page.get_by_role("combobox", name="Rows per page:"),
+        page.locator("div.MuiTablePagination-root [role='combobox']"),
+    ]:
+        if loc.count():
+            loc.first.click()
+            break
+    # Select 100 (MUI menu often renders in a portal)
+    opt = page.get_by_role("option", name="100")
+    if not opt.count():
+        opt = page.locator("//li[normalize-space(.)='100']")
+    opt.first.wait_for(state="visible", timeout=5000)
+    opt.first.click()
+    # Wait for rows to refresh
+    try:
+        page.wait_for_selector("div[role='row'][data-id]", timeout=10000)
+    except PWTimeout:
+        time.sleep(0.8)
+
+
+def click_next_page(page) -> bool:
+    """Click the pager's next-page button if it exists and is enabled.
+
+    Returns:
+        True if a button was clicked, False otherwise.
+    """
+    # Prefer the ARIA label, then fall back to the Czech one.
+    for label in ("Go to next page", "Další"):
+        button = page.get_by_role("button", name=label)
+        if not button.count():
+            continue
+        try:
+            if button.first.is_enabled():
+                button.first.click()
+                return True
+        except Exception:
+            # Best effort: a failing locator is treated as "not clickable".
+            pass
+    return False
+
+
+def _print_pagination_label(page, prefix):
+    """Print the pager's displayed-rows label; skip silently if unavailable."""
+    try:
+        label = page.locator("p.MuiTablePagination-displayedRows").first.inner_text()
+    except Exception:
+        return
+    print(prefix, label)
+
+
+# ---------- Main workflow ----------
+def save_all_patient_htmls(conn, context, next_round):
+    """Save the HTML of every registry listing page into kartoteka_html.
+
+    Args:
+        conn: open MySQL connection; one row is inserted and committed per page.
+        context: Playwright browser context used to open the listing.
+        next_round: value stored in the `round` column of every inserted row.
+
+    Raises:
+        SystemExit: if the site redirects to the login page.
+    """
+    page = context.new_page()
+    try:
+        page.set_default_timeout(15000)
+        page.set_default_navigation_timeout(30000)
+
+        # Use domcontentloaded (SPAs often keep network busy)
+        page.goto(BASE_LIST_URL, wait_until="domcontentloaded")
+        if "/prihlaseni" in page.url.lower():
+            raise SystemExit("Session expired → refresh medevio_storage.json via the login script.")
+
+        wait_for_grid_ready(page)
+
+        # optional: print label like "1–25 z 1856"
+        _print_pagination_label(page, "Pagination label BEFORE:")
+
+        # Set 100/page
+        try:
+            set_page_size_100(page)
+        except Exception as e:
+            print(f"Warning: could not set page size to 100: {e!r}")
+        else:
+            _print_pagination_label(page, "Pagination label AFTER :")
+
+        page_index = 1
+        while True:
+            wait_for_grid_ready(page)
+
+            # Store the current listing page; commit per page so a later
+            # failure keeps what was already captured.
+            with conn.cursor() as cur:
+                cur.execute(
+                    "INSERT INTO kartoteka_html (html, `round`) VALUES (%s, %s)",
+                    (page.content(), next_round),
+                )
+            conn.commit()
+            print(f"DB saved page index {page_index}")
+
+            # Try to go next; if cannot, break
+            if not click_next_page(page):
+                break
+            # NOTE(review): pagination may not trigger a new load event, in
+            # which case this wait returns at once -- confirm the grid has
+            # really refreshed before the next capture.
+            try:
+                page.wait_for_load_state("domcontentloaded", timeout=10000)
+            except PWTimeout:
+                pass
+            time.sleep(0.5)
+            page_index += 1
+    finally:
+        page.close()
+    print(f"Total pages collected: {page_index}")
+
+
+def main():
+    """Capture one new round of registry listing pages into MySQL."""
+    # Check storage exists
+    if not Path(STATE_FILE).exists():
+        raise SystemExit(f"Storage not found: {STATE_FILE}")
+
+    # DB ready
+    conn = db_connect()
+    try:
+        with conn.cursor() as cur:
+            # Delete kartoteka_html rows that carry no round value (round=0)
+            cur.execute("delete from kartoteka_html where round=0")
+            conn.commit()
+
+            cur.execute("SELECT MAX(`round`) AS max_round FROM kartoteka_html")
+            result = cur.fetchone()
+        # If table empty, use 0 as fallback
+        next_round = (result[0] or 0) + 1
+        print("Next round will be:", next_round)
+
+        with sync_playwright() as p:
+            # headless=False shows the browser window so the run can be watched
+            browser = p.chromium.launch(headless=False)
+            try:
+                context = browser.new_context(storage_state=STATE_FILE)
+                save_all_patient_htmls(conn, context, next_round)
+            finally:
+                browser.close()
+    finally:
+        conn.close()
+    print("Done.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/Medevio4.py b/Medevio4.py
index 38b14aa..f5ec2ee 100644
--- a/Medevio4.py
+++ b/Medevio4.py
@@ -241,7 +241,8 @@ def main():
         context = browser.new_context(storage_state=STATE_FILE)
 
         # 1) Collect all IDs from the listing (all pages)
-        all_ids = collect_all_patient_ids(context)
+        # all_ids = collect_all_patient_ids(context)
+        all_ids=db_existing_ids(conn)
 
         # 2) Iterate and store HTML (skip existing)
         todo = [pid for pid in sorted(all_ids) if pid not in already]
diff --git a/Medevio5_ReadNamesFromKartoteka_html.py b/Medevio5_ReadNamesFromKartoteka_html.py
new file mode 100644
index 0000000..b931254
--- /dev/null
+++ b/Medevio5_ReadNamesFromKartoteka_html.py
@@ -0,0 +1,42 @@
+import mysql.connector
+from bs4 import BeautifulSoup
+import re
+
+
+# ---------- CONFIG ----------
+# MySQL connection settings (fill in)
+MYSQL_CFG = dict(
+    host="192.168.1.76",
+    port=3307,
+    user="root",
+    password="Vlado9674+",
+    database="medevio",
+)
+
+conn=mysql.connector.connect(**MYSQL_CFG)
+cur=conn.cursor()
+cur.execute("select html from kartoteka_html where `fetched-at`=(SELECT MAX(`fetched-at`) FROM kartoteka_html)")
+html=cur.fetchone()
+html=html[0]
+
+
+# html is the string containing the entire web page
+soup = BeautifulSoup(html, "html.parser")
+
+# Find every