#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Rotate, OCR and rename scanned ECG PDFs.

For every ``*.pdf`` directly inside ``BASE_DIR`` the script:

1. rotates the first page by 90 degrees and deletes the page at index 1
   (done once per file; the file is then tagged with ``FLAG`` in its PDF
   keywords so the step is not repeated),
2. OCRs the first page with Tesseract using the Czech language data,
3. extracts a rodne cislo (birth number) and a date from the OCR text,
4. renames the file to ``"<rc> <YYYY-MM-DD> [EKG] [bez hodnocení].pdf"``.
"""

import datetime
import io
import os
import re
import time
from pathlib import Path

# NOTE: the third-party packages (PyMuPDF ``fitz``, ``pytesseract``, Pillow)
# are imported inside the functions that need them, so that the pure parsing
# helpers below can be imported and tested without the OCR stack installed.

# --- CONFIGURATION ---
BASE_DIR = Path(r"u:\Dropbox\Ordinace\Dokumentace_ke_zpracování\EKGforProcessing")
FLAG = "rotated-by-script"

# Point to your Tesseract executable if it's not in your PATH:
TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# 6 digits, optional slash surrounded by optional whitespace, 3-4 digits.
_RC_RE = re.compile(r"(\d{6})\s*/?\s*(\d{3,4})")
# D.M.YYYY with optional whitespace after each separator; a comma is accepted
# in place of a dot because OCR sometimes confuses the two.
_DATE_RE = re.compile(r"(\d{1,2})[\.,]\s*(\d{1,2})[\.,]\s*(\d{4})")


def ocr_page(page):
    """Return the text Tesseract recognises on a PyMuPDF page.

    The page is rendered at 300 DPI (small print on ECG strips needs the
    resolution) and passed to Tesseract with the Czech language model and
    page segmentation mode 6 ("assume a single uniform block of text").

    Args:
        page: a PyMuPDF page object.

    Returns:
        The recognised text as a string.
    """
    import pytesseract
    from PIL import Image

    pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD

    pix = page.get_pixmap(dpi=300)
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    return pytesseract.image_to_string(img, lang='ces', config='--psm 6')


def extract_rodne_cislo(text):
    """Extract the first rodne cislo found in *text*.

    Recognised shape: 6 digits, an optional slash (whitespace allowed around
    it), then 3 or 4 digits.

    Args:
        text: OCR output to search.

    Returns:
        A 10-character digit string (a 3-digit suffix is left-padded with
        ``"0"`` to four digits), or ``None`` when nothing matches or *text*
        is empty.
    """
    if not text:
        return None
    m = _RC_RE.search(text)
    if not m:
        return None
    return m.group(1) + m.group(2).zfill(4)


def extract_date(text):
    """Extract the first calendar-valid date found in *text*.

    Matches ``D.M.YYYY``, ``D. M. YYYY``, ``DD.MM.YYYY`` and the same shapes
    with commas instead of dots. Candidates that are not real calendar dates
    (for example ``45.13.2024``) are skipped so they cannot end up in a
    file name.

    Args:
        text: OCR output to search.

    Returns:
        A ``(day, month, year)`` tuple of strings exactly as they appear in
        the text, or ``None`` when no valid date is present.
    """
    if not text:
        return None
    for m in _DATE_RE.finditer(text):
        day, month, year = m.groups()
        try:
            datetime.date(int(year), int(month), int(day))
        except ValueError:
            # Impossible date (OCR noise); keep looking.
            continue
        return m.groups()
    return None


def convert_date_to_iso(date_tuple):
    """Convert a ``(D, M, Y)`` tuple of strings to ``YYYY-MM-DD``."""
    d, m, y = date_tuple
    return f"{y}-{m.zfill(2)}-{d.zfill(2)}"


def _rotate_if_needed(pdf_path):
    """Rotate page 0 by 90 degrees unless the file is already tagged.

    When ``FLAG`` is absent from the PDF keywords, the first page is rotated,
    the page at index 1 is deleted (if there is more than one page), ``FLAG``
    is appended to the keywords, and the result replaces the original file
    via a ``*.tmp.pdf`` sibling.
    """
    import fitz

    doc = fitz.open(pdf_path)
    try:
        meta = doc.metadata or {}
        keywords = meta.get("keywords", "") or ""
        if FLAG in keywords:
            return

        print(" [Action] Rotating page...")
        page = doc[0]
        page.set_rotation((page.rotation + 90) % 360)
        if doc.page_count > 1:
            doc.delete_page(1)
        meta["keywords"] = (keywords + " " + FLAG).strip()
        doc.set_metadata(meta)
        tmp = pdf_path.with_suffix(".tmp.pdf")
        doc.save(tmp, deflate=True)
    finally:
        # Always release the handle, also when rotation or saving fails.
        doc.close()

    # Only reached after a successful save; the document is closed by now,
    # so the original can be replaced.
    os.replace(tmp, pdf_path)


def _ocr_first_page(pdf_path):
    """Open *pdf_path*, OCR its first page and close the file again."""
    import fitz

    doc = fitz.open(pdf_path)
    try:
        return ocr_page(doc[0])
    finally:
        # The handle must be closed before the file is renamed later on.
        doc.close()


def _rename_pdf(pdf_path, rc, date_tuple):
    """Rename *pdf_path* to the ``<rc> <iso-date> [EKG] ...`` scheme."""
    date_iso = convert_date_to_iso(date_tuple)
    new_name = f"{rc} {date_iso} [EKG] [bez hodnocení].pdf"
    new_path = pdf_path.with_name(new_name)

    if new_path.exists():
        print(f"⚠ Skipping: {new_name} already exists.")
        return

    try:
        pdf_path.rename(new_path)
    except OSError as e:
        print(f"❌ Rename error: {e}")
    else:
        print(f"✅ Success: Renamed to {new_name}")


def _process_pdf(pdf_path):
    """Run rotation, OCR, extraction and renaming for a single PDF."""
    # 1. HANDLE ROTATION
    _rotate_if_needed(pdf_path)

    # 2. PERFORM OCR
    raw_text = _ocr_first_page(pdf_path)

    # --- DEBUG PRINTOUT ---
    print("\n--- START OF EXTRACTED TEXT ---")
    print(raw_text)
    print("--- END OF EXTRACTED TEXT ---\n")
    # ---------------------

    rc = extract_rodne_cislo(raw_text)
    date_tuple = extract_date(raw_text)

    print(f"RESULT -> RC: {rc if rc else 'NOT FOUND'}")
    print(f"RESULT -> Date: {date_tuple if date_tuple else 'NOT FOUND'}")

    # 3. RENAME LOGIC
    if rc and date_tuple:
        _rename_pdf(pdf_path, rc, date_tuple)
    else:
        print("❌ Script could not find all data. Check the extracted text above.")


def main():
    """Process every PDF in ``BASE_DIR``."""
    # Snapshot the listing first: files are renamed inside this directory
    # while we work, and a lazily evaluated glob could otherwise visit a
    # freshly renamed file a second time (and OCR it again).
    pdf_paths = sorted(BASE_DIR.glob("*.pdf"))

    for pdf_path in pdf_paths:
        # Skip leftovers of an interrupted rotation step ("name.tmp.pdf").
        if ".tmp" in pdf_path.name:
            continue

        print(f"\n{'=' * 60}")
        print(f"PROCESSING: {pdf_path.name}")
        print(f"{'=' * 60}")

        try:
            _process_pdf(pdf_path)
        except Exception as e:
            # Top-level boundary: one unreadable file must not abort the
            # whole batch. Report it and continue with the next file.
            print(f"❌ Error while processing {pdf_path.name}: {e}")

    print("\nAll files processed.")


if __name__ == "__main__":
    main()