r

2025-09-22 10:45:44 +02:00
parent 2d7204a9c0
commit fb9486d0af
6 changed files with 217 additions and 68 deletions
--- a/Medevio5_ReadNamesFromKartoteka_html.py
+++ b/Medevio5_ReadNamesFromKartoteka_html.py
@@ -1,10 +1,8 @@
 import mysql.connector
 from bs4 import BeautifulSoup
 import re
+import time

-
-# ---------- CONFIG ----------
-# MySQL connection settings (fill in)
 MYSQL_CFG = dict(
    host="192.168.1.76",
    port=3307,
@@ -13,30 +11,100 @@ MYSQL_CFG = dict(
    database="medevio",
 )

-conn=mysql.connector.connect(**MYSQL_CFG)
-cur=conn.cursor()
-cur.execute("select html from kartoteka_html where 'fetched-at'=(SELECT MAX('fetched-at') FROM kartoteka_html)")
-html=cur.fetchone()
-html=html[0]
+#Helper functions
+def is_valid_rc(rc: str) -> bool:
+    """
+    Return True when rc, with every "/" removed, consists of exactly
+    9 or 10 digits. This is a format check only: no date or checksum
+    validation is performed on the digits.
+    """
+    rc_clean = rc.replace("/", "")
+    return bool(re.fullmatch(r"\d{9,10}", rc_clean))

+conn = mysql.connector.connect(**MYSQL_CFG)  # NOTE(review): no conn.close() appears in the visible code

-# html is the string containing the entire web page
-soup = BeautifulSoup(html, "html.parser")
+# --- get every stored HTML snapshot with round=3, newest first (no LIMIT: may be several rows) ---
+with conn.cursor() as cur:
+    cur.execute("""
+        SELECT html
+        FROM kartoteka_html
+        where round=3
+        ORDER BY `fetched-at` DESC        
+    """)
+    rows = cur.fetchall()
+    if not rows:
+        raise RuntimeError("No HTML found in kartoteka_html")

-# Find every <button> that has that specific class sequence
-# (space-separated class names → match as a set)
-buttons = soup.find_all(
-    "button",
-    class_="MuiTypography-root MuiTypography-body2 "
-           "MuiLink-root MuiLink-underlineAlways "
-           "MuiLink-button css-xf7pf8"
-)
-names = []
-for btn in buttons:
-    text = btn.get_text(strip=True)
-    print(text)
-    names.append(text)
+for row in rows:

-print(names)
-# names = [btn.get_text(strip=True) for btn in buttons]
-print(names)
+    html = row[0]
+
+    soup = BeautifulSoup(html, "html.parser")
+
+    records = []
+    for row in soup.find_all("div", attrs={"role": "row", "data-id": True}):  # NOTE(review): reuses the outer loop variable name "row"
+        data_id = row["data-id"]
+
+        # Full name: the first whitespace-separated token becomes surname, the remainder becomes name
+        name_btn = row.find("button", class_="MuiTypography-root")
+        fullname = name_btn.get_text(strip=True) if name_btn else ""
+        parts = fullname.split()
+        surname = parts[0] if parts else ""
+        name = " ".join(parts[1:]) if len(parts) > 1 else ""
+
+        # RC: read from the cell's title attribute, with "/" and "\" stripped
+        id_cell = row.find("div", attrs={"data-field": "IdentificationNumber"})
+        rc = (id_cell.get("title", "") if id_cell else "")
+        rc = rc.replace("/", "").replace("\\", "")
+
+        # Phone: read from the cell's title attribute and reduced to digits and "+"
+        ph_cell = row.find("div", attrs={"data-field": "Phone"})
+        raw_phone = ph_cell.get("title", "") if ph_cell else ""
+        raw_phone = raw_phone.replace("\u00A0", " ")  # NBSP -> space
+        phone = re.sub(r"[^\d+]", "", raw_phone)     # keep + and digits
+
+        # Insurance company: the cell's title attribute, kept verbatim
+        ins_cell = row.find("div", attrs={"data-field": "InsuranceCompany"})
+        poj = ins_cell.get("title", "") if ins_cell else ""
+
+        # Skip rows with no name, no RC, or an RC that fails is_valid_rc
+        if not fullname or not rc:
+            continue
+        if not is_valid_rc(rc):
+            continue
+
+        records.append((data_id, fullname, rc, phone, poj))
+
+        # --- per-patient lookup by RC, using a fresh buffered cursor each time ---
+        with conn.cursor(buffered=True) as cur2:
+            cur2.execute(
+                """
+                SELECT *
+                FROM patients_extracted
+                WHERE rc=%s
+                """,
+                (rc,),
+            )
+            rows = cur2.fetchall()  # NOTE(review): rebinds "rows"; the outer for-loop keeps iterating the original list
+
+            # print(surname, name, rc, len(rows))
+
+            if len(rows) > 1:
+                print(f"Pacient {surname} {name} {rc} je v medeviu {len(rows)}x")
+                time.sleep(1)
+            if len(rows)==0:
+                print(f"Pacient {surname} {name} {rc} je v medeviu {len(rows)}x")
+                time.sleep(1)
+            if len(rows)==1 and rows[0][0]!=data_id:  # data_id is the HTML attribute value; assumes column 0 holds the stored id - TODO confirm
+                print(f"Pacient {surname} {name} {rc} má v medeviu jiný id, v db je {rows[0][0]} and nyní je {data_id}")
+                time.sleep(.1)
+
+            if len(rows) == 1:  # exactly one match: write data_id into the rid column and commit
+                cur2.execute("""
+                Update patients_extracted set rid=%s where rc=%s""",(data_id,rc))
+                conn.commit()
+    # preview (disabled debug output)
+    # for r in records[:10]:
+    #     print(f"ID: {r[0]}  Name: {r[1]}  RC: {r[2]}  Phone: {r[3]}  Pojistovna: {r[4]}")
+    #
+    # print("Total patients:", len(records))