# projects/ECG/30 ECG test3.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import fitz
import os
import re
import time
import io
import pytesseract
from pathlib import Path
from PIL import Image
from FunkceWhereIsDropbox import get_dropbox_path

# --- CONFIGURATION ---
# Locate the Dropbox folder on the computer the script is running on.
# get_dropbox_path() is a project-local helper; its result is joined with "/"
# below, so it is presumably a pathlib.Path — TODO confirm.
dropbox=get_dropbox_path()
# Folder whose "*.pdf" files the main loop processes.
BASE_DIR=dropbox/"Ordinace"/"Dokumentace_ke_zpracování"/"EKGforProcessing"
# Marker appended to a PDF's "keywords" metadata once this script has rotated
# it, so a later run does not rotate the same file again.
FLAG = "rotated-by-script"


# Point to your Tesseract executable if it's not in your PATH:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def ocr_page(page):
    """Render *page* at 300 DPI and return the text Tesseract reads from it.

    The page is rasterised, wrapped in a PIL RGB image and passed to
    Tesseract with the Czech language data ('ces') and page segmentation
    mode 6 (treat the image as a single uniform block of text).
    """
    # 300 DPI keeps the small print on EKG strips legible for OCR.
    rendered = page.get_pixmap(dpi=300)
    size = [rendered.width, rendered.height]
    bitmap = Image.frombytes("RGB", size, rendered.samples)

    return pytesseract.image_to_string(bitmap, lang='ces', config='--psm 6')


# ─── Medicus verification ─────────────────────────────────────────────────────

def _medicus_connect():
    """Open a connection to the local Medicus database through ``fdb``.

    Returns:
        The connection object, or ``None`` when importing the driver or
        connecting raises; in that case the reason is printed.
    """
    try:
        # Imported lazily so that a missing driver is reported like any other
        # connection failure instead of breaking the script at start-up.
        import fdb

        connection = fdb.connect(
            dsn=r"localhost:c:\medicus 3\data\medicus.fdb",
            user="SYSDBA",
            password="masterkey",
            charset="win1250",
        )
    except Exception as err:
        print(f"  [Medicus] Nepřipojeno: {err}")
        return None
    return connection

def _lookup_by_rc(cur, rc_digits: str) -> dict | None:
    cur.execute(
        "SELECT IDPAC, PRIJMENI, JMENO, RODCIS FROM KAR "
        "WHERE REPLACE(RODCIS, '/', '') = ?",
        (rc_digits,)
    )
    row = cur.fetchone()
    if row:
        return {"idpac": row[0], "prijmeni": row[1].strip(), "jmeno": row[2].strip(), "rodcis": row[3].strip()}
    return None

def _rc_candidates(rc: str) -> list[str]:
    """Return the sorted variants of *rc* that are one OCR slip away.

    Two kinds of variant are produced: *rc* with any single character
    removed, and *rc* with a single digit swapped for its look-alike from
    the table below. *rc* itself is never part of the result.
    """
    lookalike = {"0": "8", "8": "0", "1": "7", "7": "1", "5": "6", "6": "5", "3": "8"}

    # Variants with one character dropped.
    dropped = {rc[:pos] + rc[pos + 1:] for pos in range(len(rc))}
    # Variants with one digit replaced by its look-alike.
    swapped = {
        rc[:pos] + lookalike[digit] + rc[pos + 1:]
        for pos, digit in enumerate(rc)
        if digit in lookalike
    }
    return sorted((dropped | swapped) - {rc})

def _rc_checksum_ok(rc: str) -> bool:
    """Check the control digit of a Czech birth number (rodné číslo).

    Non-digit characters in *rc* are ignored. Only ten-digit numbers carry a
    control digit; any other length is accepted and returns ``True``.

    For ten digits the control digit must equal the first nine digits taken
    modulo 11. When that remainder is 10 the control digit is 0 — such
    numbers are valid even though the whole number is not divisible by 11.
    """
    digits = re.sub(r"\D", "", rc)
    if len(digits) != 10:
        return True

    remainder = int(digits[:9]) % 11
    # Remainder 10 cannot be written as one digit, so 0 is used instead.
    expected = 0 if remainder == 10 else remainder
    return int(digits[9]) == expected

def verify_patient(rc_raw: str) -> dict:
    """Verify a patient in Medicus by birth number.

    Non-digits are stripped from *rc_raw* first. The exact number is looked
    up; if that fails, every variant from ``_rc_candidates`` is tried and
    the hits are ordered so that those passing ``_rc_checksum_ok`` come
    first.

    Returns:
        A dict with the keys ``status``, ``patient`` and ``rc_corrected``.
        ``status`` is one of "ok", "fuzzy", "not_found" or "offline". A
        "fuzzy" result additionally carries ``all_matches``, the ordered
        list of ``(candidate, patient)`` pairs.
    """
    def _result(status, patient=None, rc_corrected=None):
        return {"status": status, "patient": patient, "rc_corrected": rc_corrected}

    rc = re.sub(r"\D", "", rc_raw or "")
    if not rc:
        return _result("not_found")

    con = _medicus_connect()
    if con is None:
        return _result("offline")

    try:
        cur = con.cursor()

        exact = _lookup_by_rc(cur, rc)
        if exact:
            return _result("ok", exact)

        hits = [
            (cand, found)
            for cand in _rc_candidates(rc)
            if (found := _lookup_by_rc(cur, cand))
        ]
        if not hits:
            return _result("not_found")

        # Stable sort: candidates with a valid checksum move to the front.
        hits.sort(key=lambda hit: not _rc_checksum_ok(hit[0]))
        best_rc, best_patient = hits[0]
        outcome = _result("fuzzy", best_patient, best_rc)
        outcome["all_matches"] = hits
        return outcome
    finally:
        con.close()

def print_verification(verif: dict, rc_from_ocr: str):
    """Print a summary of a ``verify_patient`` result.

    Args:
        verif: Result dict; ``status`` is required, ``patient`` is read for
            "ok" and "fuzzy", ``rc_corrected`` and the optional
            ``all_matches`` for "fuzzy".
        rc_from_ocr: Birth number as read by OCR, echoed in the messages.

    Any status other than "ok", "fuzzy", "not_found" or "offline" prints
    nothing.
    """
    outcome = verif["status"]
    person = verif.get("patient")

    if outcome == "offline":
        print("  — Medicus: nedostupný (offline), ověření přeskočeno")
        return

    if outcome == "not_found":
        print(f"  ✗ Medicus: RČ '{rc_from_ocr}' nenalezeno ani při fuzzy hledání")
        return

    if outcome == "ok":
        print(f"  ✓ Medicus: {person['prijmeni']} {person['jmeno']}  |  RČ {person['rodcis']}")
        return

    if outcome != "fuzzy":
        return

    corrected = verif["rc_corrected"]
    print(f"  ⚠ Medicus: RČ z OCR '{rc_from_ocr}' nenalezeno")
    print(f"    → Nalezen podobný pacient: {person['prijmeni']} {person['jmeno']}  |  RČ {person['rodcis']}")
    print(f"    → Pravděpodobná oprava RČ: {rc_from_ocr}  →  {corrected}  (OCR chyba)")

    # List the remaining candidates when more than one patient matched.
    alternatives = verif.get("all_matches", [])
    if len(alternatives) > 1:
        print(f"    → Další shody: {[entry[0] for entry in alternatives[1:]]}")


# ─── OCR extraction ───────────────────────────────────────────────────────────

def extract_rodne_cislo(text):
    """Return the first birth number found in *text* as a digit string.

    A birth number is six digits, an optional slash (with optional
    whitespace around it) and three or four more digits. The trailing part
    is left-padded with zeros to four digits, so the result always has ten
    digits. Returns ``None`` when nothing matches.
    """
    found = re.search(r"(\d{6})\s*/?\s*(\d{3,4})", text)
    if found is None:
        return None

    birth_part, suffix = found.groups()
    return birth_part + suffix.zfill(4)


def extract_date(text):
    """Return the first plausible date in *text* as a ``(day, month, year)`` tuple.

    The tuple holds the matched digit strings unchanged (no zero padding).
    Patterns are tried in this order:

    1. ``D.M.YYYY`` / ``DD.MM.YYYY`` with dots or commas as separators and
       optional whitespace after each separator.
    2. ``DDMMYYYY`` — eight digits, for OCR output that lost the separators.
    3. ``DDMYYYY`` — seven digits with a one-digit month.

    A match is accepted only when day is 1-31, month is 1-12 and year is
    1900-2100. Matches that fail the check are skipped and the search
    continues, so an impossible string such as ``99.99.2024`` is never
    returned. Returns ``None`` when no pattern yields a plausible date.
    """
    patterns = (
        r"(\d{1,2})[\.,]\s*(\d{1,2})[\.,]\s*(\d{4})",
        r"\b(\d{2})(\d{2})(\d{4})\b",
        r"\b(\d{2})(\d{1})(\d{4})\b",
    )
    for pat in patterns:
        for m in re.finditer(pat, text):
            d, mo, y = m.groups()
            if 1 <= int(d) <= 31 and 1 <= int(mo) <= 12 and 1900 <= int(y) <= 2100:
                return (d, mo, y)

    return None


def convert_date_to_iso(date_tuple):
    """Format a ``(day, month, year)`` tuple of strings as ``YYYY-MM-DD``.

    Day and month are zero-padded to two characters; the year is used as
    given.
    """
    day, month, year = date_tuple
    return "{}-{}-{}".format(year, month.zfill(2), day.zfill(2))


# --- MAIN PROCESS ---
# For every "*.pdf" directly inside BASE_DIR: rotate the first page once
# (tracked by FLAG in the PDF keywords), OCR the first page, extract the birth
# number and date, verify the birth number against Medicus, and rename the
# file from the collected data. This runs at module level, i.e. as soon as the
# file is executed.
for pdf_path in BASE_DIR.glob("*.pdf"):
    # Skip the temporary "<name>.tmp.pdf" files written by the rotation step.
    if ".tmp" in pdf_path.name: continue

    print(f"\n{'=' * 60}")
    print(f"PROCESSING: {pdf_path.name}")
    print(f"{'=' * 60}")

    doc = fitz.open(pdf_path)
    meta = doc.metadata
    # Treat a missing or None "keywords" entry as an empty string.
    keywords = meta.get("keywords", "") or ""

    # 1. HANDLE ROTATION
    # Only files whose keywords do not yet contain FLAG are rotated.
    if FLAG not in keywords:
        print("   [Action] Rotating page...")
        page = doc[0]
        # Turn the first page a further 90 degrees, wrapping at 360.
        page.set_rotation((page.rotation + 90) % 360)
        # Only the page at index 1 is removed; any further pages are kept.
        if doc.page_count > 1:
            doc.delete_page(1)
        # Record FLAG in the keywords so the next run skips this branch.
        meta["keywords"] = (keywords + " " + FLAG).strip()
        doc.set_metadata(meta)
        # Write to "<name>.tmp.pdf", close the document, then swap the temp
        # file over the original and reopen the replaced file for OCR.
        tmp = pdf_path.with_suffix(".tmp.pdf")
        doc.save(tmp, deflate=True)
        doc.close()
        os.replace(tmp, pdf_path)
        doc = fitz.open(pdf_path)

    # 2. PERFORM OCR
    # Only the first page is read.
    raw_text = ocr_page(doc[0])

    # --- DEBUG PRINTOUT ---
    print("\n--- START OF EXTRACTED TEXT ---")
    print(raw_text)
    print("--- END OF EXTRACTED TEXT ---\n")
    # ---------------------

    rc = extract_rodne_cislo(raw_text)
    date_tuple = extract_date(raw_text)

    print(f"RESULT -> RC: {rc if rc else 'NOT FOUND'}")
    print(f"RESULT -> Date: {date_tuple if date_tuple else 'NOT FOUND'}")

    # The document is closed here, before the rename in step 4.
    # NOTE(review): presumably so the open handle cannot block the rename — confirm.
    doc.close()

    # 3. MEDICUS VERIFICATION + FUZZY MATCHING
    # rc_final starts as the OCR value and is replaced only when Medicus
    # reports a "fuzzy" match with a corrected number.
    rc_final = rc
    if rc:
        print(f"   [Medicus] Ověřuji RČ {rc}...")
        verif = verify_patient(rc)
        print_verification(verif, rc)
        if verif["status"] == "fuzzy" and verif.get("rc_corrected"):
            rc_final = verif["rc_corrected"]
            print(f"   [Medicus] RČ opraveno na: {rc_final}")
    else:
        # No birth number was extracted: use a placeholder result so that
        # step 4 can still call verif.get("patient").
        verif = {"status": "not_found", "patient": None, "rc_corrected": None}

    # 4. RENAME LOGIC
    # Both a birth number and a date are required; otherwise the file keeps
    # its current name.
    if rc_final and date_tuple:
        date_iso = convert_date_to_iso(date_tuple)
        patient = verif.get("patient")
        # The patient's name is included only when Medicus returned a patient.
        if patient:
            name_part = f"{patient['prijmeni']}, {patient['jmeno']} "
        else:
            name_part = ""
        new_name = f"{rc_final} {name_part}{date_iso} [EKG] [bez hodnocení].pdf"
        new_path = pdf_path.with_name(new_name)

        # An existing file with the target name is never overwritten.
        if not new_path.exists():
            try:
                pdf_path.rename(new_path)
                print(f"✅ Success: Renamed to {new_name}")
            except Exception as e:
                print(f"❌ Rename error: {e}")
        else:
            print(f"⚠ Skipping: {new_name} already exists.")
    else:
        print("❌ Script could not find all data. Check the extracted text above.")

print("\nAll files processed.")