# medevio/Medevio5_ReadNamesFromKartoteka_html.py

import mysql.connector
from bs4 import BeautifulSoup
import re
import time

# Keyword arguments passed verbatim to mysql.connector.connect().
# NOTE(review): host and credentials are hard-coded in source; consider
# loading them from the environment or a config file kept out of the repo.
MYSQL_CFG = {
    "host": "192.168.1.76",
    "port": 3307,
    "user": "root",
    "password": "Vlado9674+",
    "database": "medevio",
}

#Helper functions
def is_valid_rc(rc: str) -> bool:
    """Return True when *rc* looks like a birth number (RC).

    This is a deliberately basic format check:
      - every "/" is removed first,
      - what remains must be exactly 9 or 10 ASCII digits.

    No checksum or date validation is performed.
    """
    rc_clean = rc.replace("/", "")
    # re.ASCII restricts \d to 0-9; without it, other Unicode decimal digits
    # (e.g. Arabic-Indic or full-width) would be accepted as a valid RC.
    return re.fullmatch(r"\d{9,10}", rc_clean, flags=re.ASCII) is not None

# SQL text is kept in constants so the processing loop below stays readable.
_SQL_SELECT_SNAPSHOTS = """
        SELECT html
        FROM kartoteka_html
        where round=3
        ORDER BY `fetched-at` DESC
    """
_SQL_SELECT_PATIENT_BY_RC = """
                SELECT *
                FROM patients_extracted
                WHERE rc=%s
                """
_SQL_UPDATE_PATIENT_RID = """
                Update patients_extracted set rid=%s where rc=%s"""

conn = mysql.connector.connect(**MYSQL_CFG)

try:
    # --- load every stored HTML snapshot for round 3, newest first ---
    # (the query has no LIMIT, so all matching snapshots are processed)
    with conn.cursor() as cur:
        cur.execute(_SQL_SELECT_SNAPSHOTS)
        html_rows = cur.fetchall()

    if not html_rows:
        raise RuntimeError("No HTML found in kartoteka_html")

    for html_row in html_rows:
        html = html_row[0]
        soup = BeautifulSoup(html, "html.parser")

        # Parsed patients of the current snapshot; only used by the optional
        # debug preview at the end of this loop body.
        records = []
        for row_el in soup.find_all("div", attrs={"role": "row", "data-id": True}):
            data_id = row_el["data-id"]

            # Full name -> first token is the surname, the rest is the given name(s).
            name_btn = row_el.find("button", class_="MuiTypography-root")
            fullname = name_btn.get_text(strip=True) if name_btn else ""
            parts = fullname.split()
            surname = parts[0] if parts else ""
            name = " ".join(parts[1:]) if len(parts) > 1 else ""

            # RC, taken from the cell's title attribute with slashes stripped.
            id_cell = row_el.find("div", attrs={"data-field": "IdentificationNumber"})
            rc = id_cell.get("title", "") if id_cell else ""
            rc = rc.replace("/", "").replace("\\", "")

            # Phone: normalise NBSP to a space, then keep only "+" and digits.
            ph_cell = row_el.find("div", attrs={"data-field": "Phone"})
            raw_phone = ph_cell.get("title", "") if ph_cell else ""
            raw_phone = raw_phone.replace("\u00A0", " ")
            phone = re.sub(r"[^\d+]", "", raw_phone)

            # Insurance company
            ins_cell = row_el.find("div", attrs={"data-field": "InsuranceCompany"})
            poj = ins_cell.get("title", "") if ins_cell else ""

            # Skip rows with no name, no RC, or an RC that fails the format check.
            if not fullname or not rc:
                continue
            if not is_valid_rc(rc):
                continue

            records.append((data_id, fullname, rc, phone, poj))

            # --- per-patient lookup: buffered cursor, fresh for every patient ---
            with conn.cursor(buffered=True) as cur2:
                cur2.execute(_SQL_SELECT_PATIENT_BY_RC, (rc,))
                matches = cur2.fetchall()

                # Zero or multiple matches: report and leave the table untouched.
                if len(matches) != 1:
                    print(f"Pacient {surname} {name} {rc} je v medeviu {len(matches)}x")
                    time.sleep(1)
                    continue

                # NOTE(review): compares the first column of `SELECT *` with the
                # HTML data-id (a str); confirm which column that is and its type.
                if matches[0][0] != data_id:
                    print(f"Pacient {surname} {name} {rc} má v medeviu jiný id, v db je {matches[0][0]} and nyní je {data_id}")
                    time.sleep(.1)

                # Exactly one match: store the current data-id in the rid column.
                cur2.execute(_SQL_UPDATE_PATIENT_RID, (data_id, rc))
                conn.commit()

        # Optional debug preview of the parsed records:
        # for r in records[:10]:
        #     print(f"ID: {r[0]}  Name: {r[1]}  RC: {r[2]}  Phone: {r[3]}  Pojistovna: {r[4]}")
        #
        # print("Total patients:", len(records))
finally:
    # Always release the connection, including when an error aborts the run.
    conn.close()