#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Rotate, OCR, verify and rename scanned EKG PDFs.

For every ``*.pdf`` in ``BASE_DIR`` the script:

1. rotates the first page by 90 degrees (once; tracked through a keyword
   flag stored in the PDF metadata) and drops the second page if present,
2. OCRs the first page with Tesseract,
3. extracts a birth number (RC) and a date from the OCR text,
4. looks the RC up in the Medicus Firebird database, trying near-miss
   variants when the exact number is not found,
5. renames the file to ``<RC> <YYYY-MM-DD>[ <surname>, <name>] [EKG] ...``.

The processing loop runs at module level, i.e. also when the file is imported.
"""
import io
import os
import re
import time
from pathlib import Path

import fitz
import pytesseract
from PIL import Image

from FunkceWhereIsDropbox import get_dropbox_path

# --- CONFIGURATION ---
# Resolve the Dropbox root on the working computer.
dropbox = get_dropbox_path()
BASE_DIR = dropbox / "Ordinace" / "Dokumentace_ke_zpracování" / "EKGforProcessing"
# Keyword written into the PDF metadata once a file has been rotated.
FLAG = "rotated-by-script"
# Point to your Tesseract executable if it's not in your PATH:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'


def ocr_page(page):
    """Return the Tesseract OCR text of a PyMuPDF *page*.

    The page is rendered at 300 DPI because small print on EKG strips is not
    legible at lower resolutions.
    """
    pix = page.get_pixmap(dpi=300)
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    # 'ces' selects the Czech language data.
    # PSM 6: assume a single uniform block of text.
    return pytesseract.image_to_string(img, lang='ces', config='--psm 6')


# ─── Medicus verification ─────────────────────────────────────────────────────

def _medicus_connect():
    """Open a connection to the local Medicus Firebird database.

    Returns the connection, or ``None`` when ``fdb`` cannot be imported or the
    connection fails for any reason (the reason is printed).
    """
    # NOTE(review): credentials are hard-coded in the source; consider moving
    # them to configuration.
    try:
        import fdb
        return fdb.connect(
            dsn=r"localhost:c:\medicus 3\data\medicus.fdb",
            user="SYSDBA",
            password="masterkey",
            charset="win1250",
        )
    except Exception as e:  # deliberate best effort: any failure means "offline"
        print(f" [Medicus] Nepřipojeno: {e}")
        return None


def _lookup_by_rc(cur, rc_digits: str) -> dict | None:
    """Fetch one patient from table ``KAR`` by birth number.

    The stored ``RODCIS`` is compared with its slash removed, so *rc_digits*
    must contain digits only.

    Returns a dict with keys ``idpac``, ``prijmeni``, ``jmeno`` and ``rodcis``
    (text fields stripped of surrounding whitespace), or ``None`` when no row
    matches.
    """
    cur.execute(
        "SELECT IDPAC, PRIJMENI, JMENO, RODCIS FROM KAR "
        "WHERE REPLACE(RODCIS, '/', '') = ?",
        (rc_digits,)
    )
    row = cur.fetchone()
    if not row:
        return None
    idpac, prijmeni, jmeno, rodcis = row
    # A NULL column arrives as None; treat it as an empty string instead of
    # failing on None.strip().
    return {
        "idpac": idpac,
        "prijmeni": (prijmeni or "").strip(),
        "jmeno": (jmeno or "").strip(),
        "rodcis": (rodcis or "").strip(),
    }


def _rc_candidates(rc: str) -> list[str]:
    """Return sorted near-miss variants of *rc* for fuzzy lookup.

    Two kinds of variants are produced: *rc* with any single character
    removed, and *rc* with any single digit replaced by its look-alike from
    the ``similar`` table. *rc* itself is never included.
    """
    # NOTE(review): the table maps each digit to exactly one look-alike, so
    # "3" -> "8" exists but "8" -> "3" does not; confirm this is intended.
    similar = {"0": "8", "8": "0", "1": "7", "7": "1", "5": "6", "6": "5", "3": "8"}
    candidates = {rc[:i] + rc[i + 1:] for i in range(len(rc))}
    candidates.update(
        rc[:i] + similar[ch] + rc[i + 1:]
        for i, ch in enumerate(rc)
        if ch in similar
    )
    candidates.discard(rc)
    return sorted(candidates)


def _rc_checksum_ok(rc: str) -> bool:
    """Check the mod-11 control digit of a 10-digit birth number.

    The control digit equals the first nine digits modulo 11; when that
    remainder is 10 the control digit is 0 (historical exception, which a
    plain "whole number divisible by 11" test wrongly rejects).

    Numbers that do not have exactly 10 digits cannot be checked and are
    reported as OK.
    """
    digits = re.sub(r"\D", "", rc)
    if len(digits) != 10:
        return True
    remainder = int(digits[:9]) % 11
    expected = 0 if remainder == 10 else remainder
    return int(digits[9]) == expected


def verify_patient(rc_raw: str) -> dict:
    """Verify a patient in Medicus by birth number.

    Returns a dict with keys ``status``, ``patient`` and ``rc_corrected``;
    ``status`` is one of:

    * ``"ok"``        - exact match found,
    * ``"fuzzy"``     - no exact match, but a near-miss variant matched;
      ``rc_corrected`` holds the matching variant and the extra key
      ``all_matches`` lists every ``(variant, patient)`` hit,
    * ``"not_found"`` - empty input, or neither the exact number nor any
      variant matched,
    * ``"offline"``   - the database could not be reached.
    """
    rc = re.sub(r"\D", "", rc_raw or "")
    if not rc:
        return {"status": "not_found", "patient": None, "rc_corrected": None}

    con = _medicus_connect()
    if con is None:
        return {"status": "offline", "patient": None, "rc_corrected": None}

    try:
        cur = con.cursor()
        patient = _lookup_by_rc(cur, rc)
        if patient:
            return {"status": "ok", "patient": patient, "rc_corrected": None}

        matches = [
            (cand, found)
            for cand in _rc_candidates(rc)
            if (found := _lookup_by_rc(cur, cand))
        ]
        if not matches:
            return {"status": "not_found", "patient": None, "rc_corrected": None}

        # Stable sort: variants passing the checksum come first, otherwise the
        # candidate order is preserved.
        matches.sort(key=lambda match: not _rc_checksum_ok(match[0]))
        best_rc, best_patient = matches[0]
        return {"status": "fuzzy", "patient": best_patient,
                "rc_corrected": best_rc, "all_matches": matches}
    finally:
        con.close()


def print_verification(verif: dict, rc_from_ocr: str):
    """Print a human-readable summary of a ``verify_patient`` result.

    *rc_from_ocr* is the number as read by OCR; it is only used in messages.
    """
    status = verif["status"]
    patient = verif.get("patient")
    if status == "ok":
        print(f" ✓ Medicus: {patient['prijmeni']} {patient['jmeno']} | RČ {patient['rodcis']}")
    elif status == "fuzzy":
        rc_corr = verif["rc_corrected"]
        print(f" ⚠ Medicus: RČ z OCR '{rc_from_ocr}' nenalezeno")
        print(f" → Nalezen podobný pacient: {patient['prijmeni']} {patient['jmeno']} | RČ {patient['rodcis']}")
        print(f" → Pravděpodobná oprava RČ: {rc_from_ocr} → {rc_corr} (OCR chyba)")
        if len(verif.get("all_matches", [])) > 1:
            print(f" → Další shody: {[m[0] for m in verif['all_matches'][1:]]}")
    elif status == "not_found":
        print(f" ✗ Medicus: RČ '{rc_from_ocr}' nenalezeno ani při fuzzy hledání")
    elif status == "offline":
        print(" — Medicus: nedostupný (offline), ověření přeskočeno")


# ─── OCR extraction ───────────────────────────────────────────────────────────

def extract_rodne_cislo(text):
    """Extract a birth number (6 digits + optional slash/spaces + 3-4 digits).

    Returns the digits as one string, or ``None`` when nothing matches.
    """
    m = re.search(r"(\d{6})\s*/?\s*(\d{3,4})", text)
    if not m:
        return None
    left = m.group(1)
    # NOTE(review): a 3-digit suffix is left-padded with "0" to 4 digits;
    # confirm this is the intended normalisation for 9-digit numbers.
    right = m.group(2).zfill(4)
    return left + right


def extract_date(text):
    """Extract a date as a ``(day, month, year)`` tuple of strings.

    Tolerates common OCR spacing/punctuation errors. Returns ``None`` when no
    date is found.
    """
    # Primary: D.M.YYYY / DD.MM.YYYY, accepting a comma in place of a dot.
    m = re.search(r"(\d{1,2})[\.,]\s*(\d{1,2})[\.,]\s*(\d{4})", text)
    if m:
        return m.groups()

    # Fallback: OCR dropped the separators. Try DDMMYYYY (8 digits) first,
    # then DDMYYYY (7 digits); accept only plausible day/month/year values.
    for pat in [r"\b(\d{2})(\d{2})(\d{4})\b", r"\b(\d{2})(\d{1})(\d{4})\b"]:
        for m in re.finditer(pat, text):
            d, mo, y = m.groups()
            if 1 <= int(d) <= 31 and 1 <= int(mo) <= 12 and 1900 <= int(y) <= 2100:
                return (d, mo, y)
    return None


def convert_date_to_iso(date_tuple):
    """Convert a ``(D, M, Y)`` tuple of strings to ``YYYY-MM-DD``."""
    d, m, y = date_tuple
    return f"{y}-{m.zfill(2)}-{d.zfill(2)}"


# --- MAIN PROCESS ---

def _rotate_and_ocr(pdf_path):
    """Rotate *pdf_path* once if not yet flagged, then return its OCR text.

    An unflagged file is rewritten in place through a ``.tmp.pdf`` sibling:
    the first page is rotated by 90 degrees, the second page (if any) is
    deleted, and ``FLAG`` is appended to the metadata keywords.
    """
    doc = fitz.open(pdf_path)
    try:
        meta = doc.metadata
        keywords = meta.get("keywords", "") or ""

        # 1. HANDLE ROTATION
        if FLAG not in keywords:
            print(" [Action] Rotating page...")
            page = doc[0]
            page.set_rotation((page.rotation + 90) % 360)
            if doc.page_count > 1:
                doc.delete_page(1)

            meta["keywords"] = (keywords + " " + FLAG).strip()
            doc.set_metadata(meta)

            tmp = pdf_path.with_suffix(".tmp.pdf")
            doc.save(tmp, deflate=True)
            # The source must be closed before it can be replaced on disk.
            doc.close()
            os.replace(tmp, pdf_path)
            doc = fitz.open(pdf_path)

        # 2. PERFORM OCR
        return ocr_page(doc[0])
    finally:
        # Release the file handle even when rotation or OCR fails; the guard
        # avoids closing a document that was already closed above.
        if not doc.is_closed:
            doc.close()


def _process_pdf(pdf_path):
    """Run the full rotate / OCR / verify / rename pipeline for one PDF."""
    print(f"\n{'=' * 60}")
    print(f"PROCESSING: {pdf_path.name}")
    print(f"{'=' * 60}")

    raw_text = _rotate_and_ocr(pdf_path)

    # --- DEBUG PRINTOUT ---
    print("\n--- START OF EXTRACTED TEXT ---")
    print(raw_text)
    print("--- END OF EXTRACTED TEXT ---\n")
    # ---------------------

    rc = extract_rodne_cislo(raw_text)
    date_tuple = extract_date(raw_text)

    print(f"RESULT -> RC: {rc if rc else 'NOT FOUND'}")
    print(f"RESULT -> Date: {date_tuple if date_tuple else 'NOT FOUND'}")

    # 3. MEDICUS VERIFICATION + FUZZY MATCHING
    rc_final = rc
    if rc:
        print(f" [Medicus] Ověřuji RČ {rc}...")
        verif = verify_patient(rc)
        print_verification(verif, rc)
        if verif["status"] == "fuzzy" and verif.get("rc_corrected"):
            rc_final = verif["rc_corrected"]
            print(f" [Medicus] RČ opraveno na: {rc_final}")
    else:
        verif = {"status": "not_found", "patient": None, "rc_corrected": None}

    # 4. RENAME LOGIC
    if not (rc_final and date_tuple):
        print("❌ Script could not find all data. Check the extracted text above.")
        return

    date_iso = convert_date_to_iso(date_tuple)
    patient = verif.get("patient")
    name_part = f" {patient['prijmeni']}, {patient['jmeno']}" if patient else ""

    new_name = f"{rc_final} {date_iso}{name_part} [EKG] [bez hodnocení].pdf"
    new_path = pdf_path.with_name(new_name)

    if new_path.exists():
        print(f"⚠ Skipping: {new_name} already exists.")
        return

    try:
        pdf_path.rename(new_path)
        print(f"✅ Success: Renamed to {new_name}")
    except OSError as e:
        print(f"❌ Rename error: {e}")


# Snapshot the directory listing first: files are renamed inside this same
# directory while the loop runs, and a lazy glob could otherwise pick up the
# freshly renamed files again.
for pdf_path in list(BASE_DIR.glob("*.pdf")):
    # Skip leftovers of the rotation step's temporary files.
    if ".tmp" in pdf_path.name:
        continue
    _process_pdf(pdf_path)

print("\nAll files processed.")