This commit is contained in:
2026-04-17 10:02:37 +02:00
parent 906810fe42
commit c7989ca915
+126 -8
View File
@@ -1,8 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import fitz import fitz
import os import os
import re import re
@@ -35,6 +33,102 @@ def ocr_page(page):
return text return text
# ─── Medicus ověření ──────────────────────────────────────────────────────────
def _medicus_connect():
    """Open a connection to the local Medicus database via the ``fdb`` driver.

    Returns:
        The connection object returned by ``fdb.connect``, or ``None`` when
        the driver cannot be imported or the connection attempt fails. The
        failure is reported on stdout instead of being raised.
    """
    # NOTE(review): DSN and credentials are hard-coded in this function.
    connection_params = {
        "dsn": r"localhost:c:\medicus 3\data\medicus.fdb",
        "user": "SYSDBA",
        "password": "masterkey",
        "charset": "win1250",
    }
    try:
        # Imported lazily so a missing driver is handled like any other
        # connection failure.
        import fdb

        connection = fdb.connect(**connection_params)
    except Exception as exc:
        print(f" [Medicus] Nepřipojeno: {exc}")
        return None
    return connection
def _lookup_by_rc(cur, rc_digits: str) -> dict | None:
cur.execute(
"SELECT IDPAC, PRIJMENI, JMENO, RODCIS FROM KAR "
"WHERE REPLACE(RODCIS, '/', '') = ?",
(rc_digits,)
)
row = cur.fetchone()
if row:
return {"idpac": row[0], "prijmeni": row[1].strip(), "jmeno": row[2].strip(), "rodcis": row[3].strip()}
return None
def _rc_candidates(rc: str) -> list[str]:
    """Generate near-miss variants of a birth number for fuzzy lookup.

    Two kinds of variants are produced: every string obtained by deleting a
    single character, and every string obtained by replacing a single digit
    with its look-alike from the confusion table below. The input itself is
    never part of the result.

    Returns:
        The distinct variants in sorted order.
    """
    lookalike = {"0": "8", "8": "0", "1": "7", "7": "1", "5": "6", "6": "5", "3": "8"}

    # One character removed at each position.
    dropped = {rc[:pos] + rc[pos + 1:] for pos in range(len(rc))}

    # One digit swapped for its look-alike at each position that has one.
    swapped = {
        rc[:pos] + lookalike[digit] + rc[pos + 1:]
        for pos, digit in enumerate(rc)
        if digit in lookalike
    }

    return sorted((dropped | swapped) - {rc})
def _rc_checksum_ok(rc: str) -> bool:
    """Check the modulo-11 control digit of a birth number.

    Non-digit characters (such as the slash) are ignored. Only 10-digit
    numbers carry a control digit; any other length is reported as valid
    because there is nothing to check.

    A 10-digit number passes when it is divisible by 11, or when it matches
    the legacy rule: if the first nine digits leave remainder 10, the control
    digit was issued as 0, so the whole number is not divisible by 11.

    Returns:
        ``True`` when the number passes or has no checksum, else ``False``.
    """
    digits = re.sub(r"\D", "", rc)
    if len(digits) != 10:
        return True
    if int(digits) % 11 == 0:
        return True
    # Legacy exception: remainder 10 cannot be a single digit, so 0 was used.
    return digits[9] == "0" and int(digits[:9]) % 11 == 10
def verify_patient(rc_raw: str) -> dict:
    """Verify a patient in Medicus by birth number, with a fuzzy fallback.

    The input is reduced to its digits and looked up exactly. If that fails,
    each variant from ``_rc_candidates`` is looked up, and matches whose
    checksum passes are ranked first.

    Returns:
        A dict with keys ``status``, ``patient`` and ``rc_corrected``.
        ``status`` is one of:

        * ``"ok"`` - exact match; ``patient`` holds the record.
        * ``"fuzzy"`` - a variant matched; ``rc_corrected`` is the best
          variant and ``all_matches`` lists every ``(variant, patient)`` hit.
        * ``"not_found"`` - no digits in the input, or nothing matched.
        * ``"offline"`` - the database connection could not be opened.
    """
    rc = re.sub(r"\D", "", rc_raw or "")
    if rc == "":
        return {"status": "not_found", "patient": None, "rc_corrected": None}

    con = _medicus_connect()
    if con is None:
        return {"status": "offline", "patient": None, "rc_corrected": None}

    try:
        cur = con.cursor()

        exact = _lookup_by_rc(cur, rc)
        if exact:
            return {"status": "ok", "patient": exact, "rc_corrected": None}

        matches = [
            (variant, hit)
            for variant in _rc_candidates(rc)
            if (hit := _lookup_by_rc(cur, variant))
        ]
        if not matches:
            return {"status": "not_found", "patient": None, "rc_corrected": None}

        # Stable sort: variants with a passing checksum come first.
        matches.sort(key=lambda pair: 0 if _rc_checksum_ok(pair[0]) else 1)
        best_rc, best_patient = matches[0]
        return {
            "status": "fuzzy",
            "patient": best_patient,
            "rc_corrected": best_rc,
            "all_matches": matches,
        }
    finally:
        # Runs on every path taken after the connection was opened.
        con.close()
def print_verification(verif: dict, rc_from_ocr: str):
    """Print a human-readable summary of a ``verify_patient`` result.

    Args:
        verif: Result dict; ``status`` is required. ``patient`` is read for
            the ``"ok"`` and ``"fuzzy"`` statuses, ``rc_corrected`` and the
            optional ``all_matches`` for ``"fuzzy"``.
        rc_from_ocr: Birth number as read by OCR, echoed in the messages.

    Nothing is printed for an unrecognised status.
    """
    status = verif["status"]
    patient = verif.get("patient")

    if status == "ok":
        print(f" ✓ Medicus: {patient['prijmeni']} {patient['jmeno']} | RČ {patient['rodcis']}")
    elif status == "fuzzy":
        rc_corr = verif["rc_corrected"]
        print(f" ⚠ Medicus: RČ z OCR '{rc_from_ocr}' nenalezeno")
        print(f" → Nalezen podobný pacient: {patient['prijmeni']} {patient['jmeno']} | RČ {patient['rodcis']}")
        # Separate the OCR value from the corrected one; printed back to back
        # the two numbers are unreadable.
        print(f" → Pravděpodobná oprava RČ: {rc_from_ocr} → {rc_corr} (OCR chyba)")
        other_matches = verif.get("all_matches", [])[1:]
        if other_matches:
            print(f" → Další shody: {[m[0] for m in other_matches]}")
    elif status == "not_found":
        print(f" ✗ Medicus: RČ '{rc_from_ocr}' nenalezeno ani při fuzzy hledání")
    elif status == "offline":
        print(" — Medicus: nedostupný (offline), ověření přeskočeno")
# ─── OCR extrakce ─────────────────────────────────────────────────────────────
def extract_rodne_cislo(text): def extract_rodne_cislo(text):
"""Extracts RC (6 digits + optional slash/spaces + 3-4 digits).""" """Extracts RC (6 digits + optional slash/spaces + 3-4 digits)."""
m = re.search(r"(\d{6})\s*/?\s*(\d{3,4})", text) m = re.search(r"(\d{6})\s*/?\s*(\d{3,4})", text)
@@ -47,11 +141,19 @@ def extract_rodne_cislo(text):
def extract_date(text):
    """Find a date in OCR text and return it as ``(day, month, year)`` strings.

    A separated form is tried first: day and month each followed by a dot or
    comma and optional whitespace, then a four-digit year. The first such
    match is returned without range checks.

    If none is found, compact digit runs are tried, 8 digits (DDMMYYYY)
    before 7 digits (DDMYYYY). A compact run is accepted only when the day is
    1-31, the month 1-12 and the year 1900-2100.

    Returns:
        A tuple of three strings exactly as matched, or ``None``.
    """
    separated = re.search(r"(\d{1,2})[\.,]\s*(\d{1,2})[\.,]\s*(\d{4})", text)
    if separated is not None:
        return separated.groups()

    # Fallback for OCR output that lost the separators entirely.
    compact_patterns = (
        r"\b(\d{2})(\d{2})(\d{4})\b",
        r"\b(\d{2})(\d{1})(\d{4})\b",
    )
    for pattern in compact_patterns:
        for hit in re.finditer(pattern, text):
            day, month, year = hit.groups()
            plausible = (
                1 <= int(day) <= 31
                and 1 <= int(month) <= 12
                and 1900 <= int(year) <= 2100
            )
            if plausible:
                return (day, month, year)
    return None
@@ -105,15 +207,31 @@ for pdf_path in BASE_DIR.glob("*.pdf"):
doc.close() doc.close()
# 3. RENAME LOGIC # 3. MEDICUS VERIFICATION + FUZZY MATCHING
if rc and date_tuple: rc_final = rc
if rc:
print(f" [Medicus] Ověřuji RČ {rc}...")
verif = verify_patient(rc)
print_verification(verif, rc)
if verif["status"] == "fuzzy" and verif.get("rc_corrected"):
rc_final = verif["rc_corrected"]
print(f" [Medicus] RČ opraveno na: {rc_final}")
else:
verif = {"status": "not_found", "patient": None, "rc_corrected": None}
# 4. RENAME LOGIC
if rc_final and date_tuple:
date_iso = convert_date_to_iso(date_tuple) date_iso = convert_date_to_iso(date_tuple)
new_name = f"{rc} {date_iso} [EKG] [bez hodnocení].pdf" patient = verif.get("patient")
if patient:
name_part = f"{patient['prijmeni']}, {patient['jmeno']} "
else:
name_part = ""
new_name = f"{rc_final} {name_part}{date_iso} [EKG] [bez hodnocení].pdf"
new_path = pdf_path.with_name(new_name) new_path = pdf_path.with_name(new_name)
if not new_path.exists(): if not new_path.exists():
try: try:
# Close the handle properly before renaming
pdf_path.rename(new_path) pdf_path.rename(new_path)
print(f"✅ Success: Renamed to {new_name}") print(f"✅ Success: Renamed to {new_name}")
except Exception as e: except Exception as e: