#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Rotate scanned EKG PDFs, OCR their first page and rename them.

For every ``*.pdf`` in ``BASE_DIR`` the script:

1. rotates the first page by 90 degrees, drops the second page (if any) and
   tags the file with ``FLAG`` in the PDF keywords -- unless the file already
   carries that tag;
2. runs OCR on the first page;
3. extracts a 9-10 digit number ("rodné číslo") and a ``DD.MM.YYYY`` date from
   the OCR text and renames the file to
   ``"<rc> <YYYY-MM-DD> [EKG] [bez hodnocení].pdf"``.
"""
import io
import os
import re
import time
from pathlib import Path

import easyocr
import fitz
from PIL import Image

BASE_DIR = Path(r"u:\Dropbox\Ordinace\Dokumentace_ke_zpracování\EKGforProcessing")
FLAG = "rotated-by-script"

# Compiled once; both patterns are applied to every OCR result.
_RC_PATTERN = re.compile(r"\b\d{9,10}\b")
_DATE_PATTERN = re.compile(r"\b(\d{1,2}\.\d{1,2}\.\d{4})\b")

# OCR Reader (shared by all pages; created once at start-up)
reader = easyocr.Reader(['cs'], gpu=False)


def ocr_page(page):
    """Render *page* to a PNG image and return the OCR text, one line per hit."""
    pix = page.get_pixmap(alpha=False)
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    lines = reader.readtext(buf.getvalue(), detail=0)
    return "\n".join(lines)


def extract_rodne_cislo(text):
    """Return the first standalone 9- or 10-digit number (rodné číslo, no slash).

    Returns ``None`` when *text* contains no such number.
    """
    m = _RC_PATTERN.search(text)
    return m.group(0) if m else None


def extract_date(text):
    """Return the first ``DD.MM.YYYY`` date found in *text*, or ``None``.

    Any trailing time part (as in ``'DD.MM.YYYY HH.MM.SS'``) is ignored.
    """
    m = _DATE_PATTERN.search(text)
    return m.group(1) if m else None


def convert_date_to_iso(dmy):
    """Convert ``DD.MM.YYYY`` to ``YYYY-MM-DD``, zero-padding day and month."""
    d, m, y = dmy.split(".")
    return f"{y}-{m.zfill(2)}-{d.zfill(2)}"


def rename_ecg_file(pdf_path, rc, date_dmy, attempts=15, delay=1.0):
    """Rename *pdf_path* to ``"<rc> <iso-date> [EKG] [bez hodnocení].pdf"``.

    The rename is retried on ``PermissionError`` because the file may be
    temporarily locked (e.g. by Dropbox syncing it).

    Args:
        pdf_path: ``Path`` of the PDF to rename.
        rc: Identifier used as the first part of the new name.
        date_dmy: Date in ``DD.MM.YYYY`` form; written as ``YYYY-MM-DD``.
        attempts: Maximum number of rename attempts (default 15).
        delay: Seconds to wait between attempts (default 1.0).

    Nothing is renamed when a file with the target name already exists or when
    every attempt fails; both cases are reported on stdout.
    """
    date_iso = convert_date_to_iso(date_dmy)
    new_name = f"{rc} {date_iso} [EKG] [bez hodnocení].pdf"
    new_path = pdf_path.with_name(new_name)

    if new_path.exists():
        print(f" ⚠ File with name already exists: {new_name}")
        return

    # Try renaming with retries in case Dropbox locks the file
    for attempt in range(1, attempts + 1):
        try:
            pdf_path.rename(new_path)
        except PermissionError:
            print(f" ⚠ File locked (Dropbox?), retrying... {attempt}/{attempts}")
            if attempt < attempts:  # no point in waiting after the last try
                time.sleep(delay)
        else:
            print(f" → File renamed to: {new_name}")
            return

    print(" ❌ Could not rename file after several attempts.")


def _discard(path):
    """Delete *path* if it exists; used to clean up a half-written temp file."""
    try:
        path.unlink()
    except FileNotFoundError:
        pass


def _save_rotated_copy(doc, pdf_path, keywords):
    """Rotate page 0 of *doc*, tag it with FLAG and save it next to *pdf_path*.

    Returns the path of the temporary ``.tmp.pdf`` file that was written. The
    temp file is removed again if saving fails.
    """
    first = doc[0]
    first.set_rotation((first.rotation + 90) % 360)

    # Only the second page (index 1) is dropped; any further pages are kept.
    if doc.page_count > 1:
        doc.delete_page(1)

    meta = dict(doc.metadata or {})
    meta["keywords"] = (keywords + " " + FLAG).strip()
    doc.set_metadata(meta)

    tmp = pdf_path.with_suffix(".tmp.pdf")
    try:
        doc.save(tmp, deflate=True, garbage=3)
    except Exception:
        _discard(tmp)
        raise
    return tmp


def _ocr_and_rename(pdf_path):
    """OCR the first page of *pdf_path*, print the findings and rename the file."""
    print(" Performing OCR...")
    # The document must be closed BEFORE renaming, hence the narrow `with`.
    with fitz.open(pdf_path) as doc:
        text = ocr_page(doc[0])

    print("----- OCR RESULT -----")
    print(text)
    print("----------------------")

    rc = extract_rodne_cislo(text)
    date = extract_date(text)

    print("\n----- EXTRACTED DATA -----")
    print("Rodné číslo :", rc)
    print("Datum :", date)
    print("---------------------------")

    if rc and date:
        rename_ecg_file(pdf_path, rc, date)
    else:
        print(" ⚠ Missing RC or date – file NOT renamed.")


def process_pdf(pdf_path):
    """Rotate *pdf_path* if it is not yet tagged with FLAG, then OCR and rename it.

    Every document handle is closed exactly once, also when an error occurs,
    so a failure never leaves the file locked or triggers a second ``close()``
    on an already closed document.
    """
    tmp = None
    with fitz.open(pdf_path) as doc:
        meta = doc.metadata or {}
        keywords = meta.get("keywords", "") or meta.get("Keywords", "") or ""

        if FLAG in keywords:
            # 1) ALREADY ROTATED → go straight to OCR
            print(" → Already rotated, skipping rotation.")
        else:
            # 2) NOT ROTATED → rotate first, OCR afterwards
            tmp = _save_rotated_copy(doc, pdf_path, keywords)

    if tmp is not None:
        # The original is closed by now, so it can be replaced in place.
        try:
            os.replace(tmp, pdf_path)
        except OSError:
            _discard(tmp)
            raise
        print(" → Rotated + saved + marked")

    _ocr_and_rename(pdf_path)


# Snapshot the directory listing first: the loop creates and renames *.pdf
# files in BASE_DIR, which must not feed back into the iteration.
for pdf_path in sorted(BASE_DIR.glob("*.pdf")):
    print(f"\nProcessing: {pdf_path.name}")
    try:
        process_pdf(pdf_path)
    except Exception as e:  # top-level boundary: report and go on with the next file
        print("❌ Error:", e)

print("\nDone.")