156 lines
4.1 KiB
Python
156 lines
4.1 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
|
||
import fitz
|
||
from pathlib import Path
|
||
import os
|
||
import easyocr
|
||
from PIL import Image
|
||
import io
|
||
import re,time
|
||
|
||
# Folder whose top-level *.pdf files are processed by the loop at the bottom of this script.
BASE_DIR = Path(r"u:\Dropbox\Ordinace\Dokumentace_ke_zpracování\EKGforProcessing")
|
||
# Marker appended to a PDF's "keywords" metadata after this script has rotated it,
# so that later runs skip the rotation step for that file.
FLAG = "rotated-by-script"
|
||
|
||
# OCR Reader
|
||
# Shared EasyOCR reader: Czech ('cs') language, GPU disabled. Created once at
# module load and reused by ocr_page() for every page.
reader = easyocr.Reader(['cs'], gpu=False)
|
||
|
||
|
||
def ocr_page(page):
    """Run OCR on a single PDF page and return the recognised text.

    The page is rendered to a pixmap without an alpha channel, re-encoded
    as an in-memory PNG, and handed to the module-level ``reader``.

    Args:
        page: Page object providing ``get_pixmap`` (a PyMuPDF page in this
            script).

    Returns:
        The strings returned by ``reader.readtext(..., detail=0)`` joined
        with newline characters.
    """
    pixmap = page.get_pixmap(alpha=False)
    # NOTE(review): the samples are interpreted as RGB; this assumes the
    # pixmap is rendered in an RGB colorspace - confirm for unusual PDFs.
    size = [pixmap.width, pixmap.height]
    png_buffer = io.BytesIO()
    Image.frombytes("RGB", size, pixmap.samples).save(png_buffer, format="PNG")
    fragments = reader.readtext(png_buffer.getvalue(), detail=0)
    return "\n".join(fragments)
|
||
|
||
|
||
def extract_rodne_cislo(text):
    """Return the first standalone 9- or 10-digit number found in *text*.

    The number is expected to be a rodné číslo written without a slash.
    Only digit runs delimited by word boundaries are accepted, so longer
    digit sequences are not partially matched.

    Args:
        text: OCR output to search.

    Returns:
        The matched digit string, or ``None`` when nothing matches.
    """
    found = re.search(r"\b\d{9,10}\b", text)
    if found is None:
        return None
    return found.group(0)
|
||
|
||
|
||
def extract_date(text):
    """Return the first ``D.M.YYYY`` style date found in *text*.

    Day and month may have one or two digits; the year must have exactly
    four. A trailing time such as ``HH.MM.SS`` is not part of the result.

    Args:
        text: OCR output to search.

    Returns:
        The date substring exactly as it appears in *text*, or ``None``
        when no date is present.
    """
    found = re.search(r"\b(\d{1,2}\.\d{1,2}\.\d{4})\b", text)
    if found is None:
        return None
    return found.group(1)
|
||
|
||
|
||
def convert_date_to_iso(dmy):
    """Reformat a ``D.M.YYYY`` date string as ``YYYY-MM-DD``.

    Day and month are left-padded with zeros to two characters. The values
    are not checked for calendar validity.

    Args:
        dmy: Date string with exactly three dot-separated fields.

    Returns:
        The date in ``YYYY-MM-DD`` form.

    Raises:
        ValueError: If *dmy* does not split into exactly three fields.
    """
    day, month, year = dmy.split(".")
    return "-".join((year, month.zfill(2), day.zfill(2)))
|
||
|
||
|
||
|
||
|
||
def rename_ecg_file(pdf_path, rc, date_dmy, retries=15, delay=1.0):
    """Rename an ECG PDF to ``"<rc> <YYYY-MM-DD> [EKG] [bez hodnocení].pdf"``.

    The rename is retried when the file is temporarily locked (for example
    by a sync client), waiting *delay* seconds between attempts. With the
    defaults this is at most 15 attempts and 14 seconds of waiting.

    Args:
        pdf_path: ``pathlib.Path`` of the existing PDF.
        rc: Rodné číslo used as the first part of the new name.
        date_dmy: Recording date in ``D.M.YYYY`` form.
        retries: Maximum number of rename attempts.
        delay: Seconds to sleep between two attempts.

    Returns:
        None. The outcome is reported on stdout; an already existing
        target is never overwritten knowingly.
    """
    date_iso = convert_date_to_iso(date_dmy)
    new_name = f"{rc} {date_iso} [EKG] [bez hodnocení].pdf"
    new_path = pdf_path.with_name(new_name)

    if new_path.exists():
        print(f" ⚠ File with name already exists: {new_name}")
        return

    for attempt in range(1, retries + 1):
        try:
            pdf_path.rename(new_path)
        except FileExistsError:
            # The target appeared between the exists() check and the rename
            # (Windows refuses to rename onto an existing file).
            print(f" ⚠ File with name already exists: {new_name}")
            return
        except PermissionError:
            print(f" ⚠ File locked (Dropbox?), retrying... {attempt}/{retries}")
            # No point in waiting after the final attempt.
            if attempt < retries:
                time.sleep(delay)
        else:
            print(f" → File renamed to: {new_name}")
            return

    print(" ❌ Could not rename file after several attempts.")
|
||
|
||
|
||
def _report_and_rename(pdf_path, text):
    """Print the OCR text and the fields extracted from it, then rename the PDF.

    The file is renamed only when both the rodné číslo and the date were
    found; otherwise a warning is printed and the file keeps its name.
    The PDF must already be closed when this is called.
    """
    print("----- OCR RESULT -----")
    print(text)
    print("----------------------")

    rc = extract_rodne_cislo(text)
    date = extract_date(text)

    print("\n----- EXTRACTED DATA -----")
    print("Rodné číslo :", rc)
    print("Datum :", date)
    print("---------------------------")

    if rc and date:
        rename_ecg_file(pdf_path, rc, date)
    else:
        print(" ⚠ Missing RC or date – file NOT renamed.")


def _rotate_and_mark(doc, pdf_path, meta, keywords):
    """Rotate the first page by 90°, drop page index 1, flag and save in place.

    The document is written to a temporary sibling file, closed, and the
    temporary file then replaces *pdf_path*. On success *doc* is closed when
    this returns. On failure the temporary file is removed (best effort) so
    that a later run does not mistake it for an input PDF.
    """
    first = doc[0]
    first.set_rotation((first.rotation + 90) % 360)

    if doc.page_count > 1:
        doc.delete_page(1)

    meta["keywords"] = (keywords + " " + FLAG).strip()
    doc.set_metadata(meta)

    tmp = pdf_path.with_suffix(".tmp.pdf")
    try:
        doc.save(tmp, deflate=True, garbage=3)
        # The original must be closed before it can be replaced.
        doc.close()
        os.replace(tmp, pdf_path)
    except Exception:
        try:
            if tmp.exists():
                tmp.unlink()
        except OSError:
            pass  # cleanup is best effort; report the original error
        raise


def _process_pdf(pdf_path):
    """Rotate *pdf_path* if not yet flagged, OCR its first page and rename it."""
    doc = fitz.open(pdf_path)
    try:
        meta = doc.metadata or {}
        keywords = meta.get("keywords", "") or meta.get("Keywords", "") or ""

        if FLAG in keywords:
            print(" → Already rotated, skipping rotation.")
        else:
            _rotate_and_mark(doc, pdf_path, meta, keywords)
            print(" → Rotated + saved + marked")
            # Re-open the freshly written file so OCR sees the rotated page.
            doc = fitz.open(pdf_path)

        print(" Performing OCR...")
        text = ocr_page(doc[0])
    finally:
        # Close exactly once: closing an already closed PyMuPDF document
        # raises, and the file must be released before it is renamed.
        if not doc.is_closed:
            doc.close()

    _report_and_rename(pdf_path, text)


# Take a snapshot of the directory listing first: files are renamed while
# the loop runs, so a lazy glob could otherwise revisit them.
for pdf_path in list(BASE_DIR.glob("*.pdf")):
    print(f"\nProcessing: {pdf_path.name}")
    try:
        _process_pdf(pdf_path)
    except Exception as e:
        # One unreadable or locked file must not stop the rest of the batch.
        print("❌ Error:", e)

print("\nDone.")
|