# projects/ECG/20 ECG test2.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import fitz
from pathlib import Path
import os
import easyocr
from PIL import Image
import io
import re,time

# Folder scanned for ECG PDFs; only "*.pdf" files directly inside it are
# picked up by the processing loop.
BASE_DIR = Path(r"u:\Dropbox\Ordinace\Dokumentace_ke_zpracování\EKGforProcessing")
# Marker appended to a PDF's "keywords" metadata after the script has rotated
# it, so that a later run skips the rotation step for that file.
FLAG = "rotated-by-script"

# Shared EasyOCR reader, configured for Czech ('cs') and CPU-only operation
# (gpu=False). It is created once at module level and reused for every page.
reader = easyocr.Reader(['cs'], gpu=False)


def ocr_page(page):
    """Run OCR on a single PDF page and return the recognised text.

    The page is rendered to a pixmap without an alpha channel, re-encoded
    as an in-memory PNG and handed to the module-level EasyOCR ``reader``.

    Args:
        page: A PyMuPDF page object (anything offering ``get_pixmap``).

    Returns:
        The recognised text fragments joined with newlines, in the order
        the reader returned them.
    """
    pixmap = page.get_pixmap(alpha=False)
    # The "RGB" mode assumes the pixmap holds three bytes per pixel
    # (presumably PyMuPDF's default colourspace — confirm if that changes).
    dimensions = [pixmap.width, pixmap.height]
    image = Image.frombytes("RGB", dimensions, pixmap.samples)

    with io.BytesIO() as png_buffer:
        image.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()

    # detail=0 is passed so that the result can be joined as plain strings.
    fragments = reader.readtext(png_bytes, detail=0)
    return "\n".join(fragments)


def extract_rodne_cislo(text):
    """Find the first Czech birth number (rodné číslo) in *text*.

    Recognised spellings:
    - six digits, a slash, four digits   (655527/1910)
    - six digits, a slash, three digits  (655527/910)
    - the same digits with no slash      (6555271910)

    Args:
        text: The text to search, e.g. raw OCR output.

    Returns:
        A 10-character digit string without the slash; a three-digit
        suffix is left-padded with a zero. ``None`` if nothing matches.
    """
    match = re.search(r"\b(\d{6})/?(\d{3,4})\b", text)
    if match is None:
        return None

    prefix, suffix = match.groups()
    # Pad a three-digit suffix so the result is always ten digits long.
    return prefix + suffix.rjust(4, "0")


def extract_date(text):
    """Return the first ``D.M.YYYY`` date found in *text*.

    Day and month may have one or two digits and the year must have four,
    so a timestamp such as ``DD.MM.YYYY HH.MM.SS`` yields only its date.

    Args:
        text: The text to search, e.g. raw OCR output.

    Returns:
        The matched date exactly as written in *text*, or ``None`` when
        no date is present.
    """
    found = re.search(r"\b(\d{1,2}\.\d{1,2}\.\d{4})\b", text)
    if found is None:
        return None
    return found.group(1)


def convert_date_to_iso(dmy):
    """Reformat a ``D.M.YYYY`` date string as ``YYYY-MM-DD``.

    Args:
        dmy: Day, month and year separated by dots.

    Returns:
        The date with the parts reordered, joined by hyphens, and day and
        month zero-padded to two digits.

    Raises:
        ValueError: If *dmy* does not split into exactly three parts.
    """
    day, month, year = dmy.split(".")
    return "-".join((year, month.zfill(2), day.zfill(2)))


def rename_ecg_file(pdf_path, rc, date_dmy, *, retries=15, delay=1.0):
    """Rename an ECG PDF to ``<rc> <YYYY-MM-DD> [EKG] [bez hodnocení].pdf``.

    The rename is retried when it fails with ``PermissionError``, which is
    what happens while another process (e.g. Dropbox sync) holds the file.
    An existing file with the target name is never overwritten on purpose.

    Args:
        pdf_path: ``pathlib.Path`` of the file to rename.
        rc: Birth number used as the first part of the new name.
        date_dmy: Examination date as ``D.M.YYYY``.
        retries: Maximum number of rename attempts.
        delay: Seconds to wait between two attempts.

    Returns:
        None. The outcome is reported on stdout.

    Raises:
        ValueError: If *date_dmy* is not in ``D.M.YYYY`` form.
    """
    date_iso = convert_date_to_iso(date_dmy)
    new_name = f"{rc} {date_iso} [EKG] [bez hodnocení].pdf"
    new_path = pdf_path.with_name(new_name)

    if new_path.exists():
        print(f"   ⚠ File with name already exists: {new_name}")
        return

    # With the defaults this waits at most (retries - 1) * delay = 14 seconds.
    for attempt in range(1, retries + 1):
        try:
            pdf_path.rename(new_path)
        except FileExistsError:
            # The target appeared after the exists() check above.
            print(f"   ⚠ File with name already exists: {new_name}")
            return
        except PermissionError:
            print(f"   ⚠ File locked (Dropbox?), retrying... {attempt}/{retries}")
            # No point in sleeping after the last attempt.
            if attempt < retries:
                time.sleep(delay)
        else:
            print(f"   → File renamed to: {new_name}")
            return
    print("   ❌ Could not rename file after several attempts.")


def _close_if_open(doc):
    """Close a PyMuPDF document unless it has already been closed."""
    if not doc.is_closed:
        doc.close()


def _rotate_and_mark(doc, pdf_path, meta, keywords):
    """Rotate page 1 by 90°, tag the PDF with FLAG and save it in place.

    The document is written to a temporary sibling file which then replaces
    *pdf_path*. *doc* is closed when this function returns or raises.

    Args:
        doc: The open PyMuPDF document for *pdf_path*.
        pdf_path: ``pathlib.Path`` of the PDF being processed.
        meta: The document's metadata dict; its "keywords" entry is updated.
        keywords: The current keywords string (may be empty).
    """
    first = doc[0]
    first.set_rotation((first.rotation + 90) % 360)

    # Only the page at index 1 is removed; any further pages are kept.
    # NOTE(review): presumably ECG exports never have more than two pages.
    if doc.page_count > 1:
        doc.delete_page(1)

    meta["keywords"] = (keywords + " " + FLAG).strip()
    doc.set_metadata(meta)

    tmp = pdf_path.with_suffix(".tmp.pdf")
    try:
        doc.save(tmp, deflate=True, garbage=3)
        # Release the handle on the original before overwriting it.
        doc.close()
        os.replace(tmp, pdf_path)
    except Exception:
        _close_if_open(doc)
        # Best-effort cleanup: a leftover "*.tmp.pdf" would match the
        # "*.pdf" glob on the next run and be treated as a real ECG.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise


def _process_pdf(pdf_path):
    """Rotate (if needed), OCR and rename a single ECG PDF.

    Args:
        pdf_path: ``pathlib.Path`` of the PDF to process.

    Raises:
        Exception: Anything raised by PyMuPDF, OCR or the file system is
            propagated after the document handle has been released.
    """
    doc = fitz.open(pdf_path)
    try:
        meta = doc.metadata or {}
        keywords = meta.get("keywords", "") or meta.get("Keywords", "") or ""

        if FLAG in keywords:
            print("   → Already rotated, skipping rotation.")
        else:
            _rotate_and_mark(doc, pdf_path, meta, keywords)
            print("   → Rotated + saved + marked")
            # Re-open so that OCR sees the rotated page.
            doc = fitz.open(pdf_path)

        print("   Performing OCR...")
        text = ocr_page(doc[0])
    finally:
        # The file must be closed BEFORE renaming; `doc` may already be
        # closed here if the rotation step failed part-way.
        _close_if_open(doc)

    print("----- OCR RESULT -----")
    print(text)
    print("----------------------")

    rc = extract_rodne_cislo(text)
    date = extract_date(text)

    print("\n----- EXTRACTED DATA -----")
    print("Rodné číslo :", rc)
    print("Datum       :", date)
    print("---------------------------")

    if rc and date:
        rename_ecg_file(pdf_path, rc, date)
    else:
        print("   ⚠ Missing RC or date – file NOT renamed.")


# sorted() snapshots the directory listing up front, so files renamed or
# created during the run are not visited a second time.
for pdf_path in sorted(BASE_DIR.glob("*.pdf")):
    print(f"\nProcessing: {pdf_path.name}")
    try:
        _process_pdf(pdf_path)
    except Exception as e:
        # Batch boundary: one unreadable or locked file must not stop the
        # remaining files from being processed.
        print("❌ Error:", e)

print("\nDone.")