This commit is contained in:
2025-09-22 10:45:44 +02:00
parent 2d7204a9c0
commit fb9486d0af
6 changed files with 217 additions and 68 deletions

View File

@@ -1,10 +1,8 @@
import mysql.connector
from bs4 import BeautifulSoup
import re
import time
# ---------- CONFIG ----------
# MySQL connection settings (fill in)
MYSQL_CFG = dict(
host="192.168.1.76",
port=3307,
@@ -13,30 +11,100 @@ MYSQL_CFG = dict(
database="medevio",
)
# Open a MySQL connection from MYSQL_CFG and create a cursor on it.
conn=mysql.connector.connect(**MYSQL_CFG)
cur=conn.cursor()
# NOTE(review): 'fetched-at' is written in single quotes on both sides of the
# comparison. MySQL reads single-quoted text as a string literal rather than
# a column name, so this presumably does not filter on the newest fetched-at
# value -- confirm (identifiers containing '-' need backticks).
cur.execute("select html from kartoteka_html where 'fetched-at'=(SELECT MAX('fetched-at') FROM kartoteka_html)")
# Take the first result row and keep its only selected column (html).
# NOTE(review): the fetchone() result is indexed without a check; presumably
# it is None when the table is empty, which would raise TypeError -- confirm.
html=cur.fetchone()
html=html[0]
#Helper functions
def is_valid_rc(rc: str) -> bool:
    """
    Very basic RC format check.

    Any slash is removed first; what remains must consist of exactly
    9 or 10 ASCII digits (0-9). No checksum or date validation is done.

    Args:
        rc: The RC string, with or without a slash.

    Returns:
        True if the slash-free value is 9 or 10 ASCII digits, else False.
    """
    rc_clean = rc.replace("/", "")
    # Use [0-9] instead of \d: in a str pattern \d also matches non-ASCII
    # Unicode decimal digits, which must not count as a valid RC.
    return bool(re.fullmatch(r"[0-9]{9,10}", rc_clean))
# NOTE(review): this span looks like a commit diff rendered without +/- markers
# and without indentation, so statements from two revisions of the script seem
# to be interleaved. The comments below describe each statement as written;
# confirm the nesting against the real file.
# Open the MySQL connection described by MYSQL_CFG.
conn = mysql.connector.connect(**MYSQL_CFG)
# html is the string containing the entire web page
soup = BeautifulSoup(html, "html.parser")
# --- get every stored HTML page with round=3, newest fetched-at first ---
# (the query has no LIMIT and all rows are read with fetchall)
with conn.cursor() as cur:
cur.execute("""
SELECT html
FROM kartoteka_html
where round=3
ORDER BY `fetched-at` DESC
""")
rows = cur.fetchall()
# Stop early when the query returned nothing.
if not rows:
raise RuntimeError("No HTML found in kartoteka_html")
# Find every <button> that has that specific class sequence.
# NOTE(review): per the Beautiful Soup documentation, a class_ string that
# contains spaces is compared with the exact value of the class attribute, so
# the order of the names matters; it is not matched as a set -- confirm.
buttons = soup.find_all(
"button",
class_="MuiTypography-root MuiTypography-body2 "
"MuiLink-root MuiLink-underlineAlways "
"MuiLink-button css-xf7pf8"
)
# Collect the stripped text of each matched button, printing it as we go.
names = []
for btn in buttons:
text = btn.get_text(strip=True)
print(text)
names.append(text)
# Walk the fetched kartoteka_html rows; the first column of each row is the HTML.
for row in rows:
print(names)
# names = [btn.get_text(strip=True) for btn in buttons]
print(names)
html = row[0]
soup = BeautifulSoup(html, "html.parser")
# records accumulates one (data_id, fullname, rc, phone, poj) tuple per accepted row.
records = []
# Visit each <div role="row"> that carries a data-id attribute.
# NOTE(review): this loop variable reuses the name `row` from the loop over
# the database rows above -- confirm that is intended.
for row in soup.find_all("div", attrs={"role": "row", "data-id": True}):
data_id = row["data-id"]
# full name -> surname + rest: the first whitespace-separated token is taken
# as the surname, everything after it as the name.
name_btn = row.find("button", class_="MuiTypography-root")
fullname = name_btn.get_text(strip=True) if name_btn else ""
parts = fullname.split()
surname = parts[0] if parts else ""
name = " ".join(parts[1:]) if len(parts) > 1 else ""
# RC: read from the title attribute of the IdentificationNumber cell, then
# strip slashes and backslashes.
id_cell = row.find("div", attrs={"data-field": "IdentificationNumber"})
rc = (id_cell.get("title", "") if id_cell else "")
rc = rc.replace("/", "").replace("\\", "")
# Phone: read from the title attribute of the Phone cell.
ph_cell = row.find("div", attrs={"data-field": "Phone"})
raw_phone = ph_cell.get("title", "") if ph_cell else ""
raw_phone = raw_phone.replace("\u00A0", " ") # NBSP -> space
phone = re.sub(r"[^\d+]", "", raw_phone) # keep + and digits
# Insurance: read from the title attribute of the InsuranceCompany cell.
ins_cell = row.find("div", attrs={"data-field": "InsuranceCompany"})
poj = ins_cell.get("title", "") if ins_cell else ""
# Skip rows with no name, no RC, or an RC that fails is_valid_rc
if not fullname or not rc:
continue
if not is_valid_rc(rc):
continue
records.append((data_id, fullname, rc, phone, poj))
# --- per-patient lookup: use a fresh cursor each time (or buffered=True) ---
with conn.cursor(buffered=True) as cur2:
cur2.execute(
"""
SELECT *
FROM patients_extracted
WHERE rc=%s
""",
(rc,),
)
# NOTE(review): this rebinds `rows`, the same name that holds the
# kartoteka_html result fetched earlier -- confirm that is intended.
rows = cur2.fetchall()
# print(surname, name, rc, len(rows))
# More than one match, or no match at all: report the count and pause 1 s.
if len(rows) > 1:
print(f"Pacient {surname} {name} {rc} je v medeviu {len(rows)}x")
time.sleep(1)
if len(rows)==0:
print(f"Pacient {surname} {name} {rc} je v medeviu {len(rows)}x")
time.sleep(1)
# Exactly one match whose first column differs from the scraped data-id:
# report both values.
# NOTE(review): assumes the first column of patients_extracted holds the id
# that data_id should equal; data_id is a string taken from an HTML attribute,
# so check the column type as well -- confirm.
if len(rows)==1 and rows[0][0]!=data_id:
print(f"Pacient {surname} {name} {rc} má v medeviu jiný id, v db je {rows[0][0]} and nyní je {data_id}")
time.sleep(.1)
# Exactly one match: store the scraped data-id in the rid column and commit.
if len(rows) == 1:
cur2.execute("""
Update patients_extracted set rid=%s where rc=%s""",(data_id,rc))
conn.commit()
# preview
# for r in records[:10]:
# print(f"ID: {r[0]} Name: {r[1]} RC: {r[2]} Phone: {r[3]} Pojistovna: {r[4]}")
#
# print("Total patients:", len(records))