Files
projects/ECG/20 ECG test2.py
2025-11-23 22:05:21 +01:00

169 lines
4.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import fitz
from pathlib import Path
import os
import easyocr
from PIL import Image
import io
import re,time
# Folder whose "*.pdf" files are processed by the main loop of this script.
BASE_DIR = Path(r"u:\Dropbox\Ordinace\Dokumentace_ke_zpracování\EKGforProcessing")
# Marker appended to a PDF's "keywords" metadata once the script has rotated
# it; files already carrying the marker are not rotated again.
FLAG = "rotated-by-script"
# Shared EasyOCR reader, created once at import time with the 'cs' language
# list and GPU use disabled; used by ocr_page().
reader = easyocr.Reader(['cs'], gpu=False)
def ocr_page(page):
    """Run OCR on a single PDF page and return the recognised text.

    The page is rendered to a pixmap without an alpha channel, re-encoded
    as PNG in memory, and handed to the module-level EasyOCR ``reader``.

    Args:
        page: A PyMuPDF page object (anything offering ``get_pixmap``).

    Returns:
        The recognised text fragments joined with newlines.
    """
    pixmap = page.get_pixmap(alpha=False)
    size = [pixmap.width, pixmap.height]

    # EasyOCR accepts encoded image bytes, so go through an in-memory PNG.
    png_buffer = io.BytesIO()
    Image.frombytes("RGB", size, pixmap.samples).save(png_buffer, format="PNG")

    fragments = reader.readtext(png_buffer.getvalue(), detail=0)
    return "\n".join(fragments)
def extract_rodne_cislo(text):
    """Extract a rodné číslo (Czech birth number) from OCR text.

    Recognised forms:
      - 6 digits + slash + 4 digits   -> 655527/1910
      - 6 digits + slash + 3 digits   -> 655527/910
      - 9 or 10 digits without slash  -> 6555271910

    Spaces or tabs around the slash ("655527 / 1910") are tolerated,
    because OCR frequently inserts them. Whitespace is only accepted
    together with a slash, so two unrelated numbers separated by a space
    are never glued together.

    Args:
        text: The OCR text to search.

    Returns:
        The first match as 10 digits without a slash (a 3-digit suffix is
        left-padded with a zero), or ``None`` when nothing matches.
    """
    match = re.search(r"\b(\d{6})(?:[ \t]*/[ \t]*)?(\d{3,4})\b", text)
    if match is None:
        return None
    prefix, suffix = match.groups()
    # Normalise a 3-digit suffix to 4 digits so the result is always 10 long.
    return prefix + suffix.zfill(4)
def extract_date(text):
    """Return the first ``D.M.YYYY`` style date found in *text*.

    The date part of a 'DD.MM.YYYY HH.MM.SS' stamp is what gets picked
    up; day and month may be one or two digits, the year is four digits.

    Returns:
        The matched date string exactly as it appears in *text*, or
        ``None`` when no date is present.
    """
    found = re.search(r"\b(\d{1,2}\.\d{1,2}\.\d{4})\b", text)
    if found is None:
        return None
    return found.group(1)
def convert_date_to_iso(dmy):
    """Turn a ``DD.MM.YYYY`` string into ``YYYY-MM-DD``.

    Day and month are zero-padded to two digits; the year is passed
    through unchanged. The input must consist of exactly three
    dot-separated parts, otherwise unpacking raises ``ValueError``.
    """
    day, month, year = dmy.split(".")
    return "-".join([year, month.zfill(2), day.zfill(2)])
def rename_ecg_file(pdf_path, rc, date_dmy, attempts=15, delay=1.0):
    """Rename an ECG PDF to ``<rc> <YYYY-MM-DD> [EKG] [bez hodnocení].pdf``.

    The rename is retried when the file is temporarily locked (for
    example by the Dropbox client). An existing file with the target
    name is never overwritten.

    Args:
        pdf_path: ``pathlib.Path`` of the PDF to rename. The file must
            not be open in this process.
        rc: Birth number used as the first part of the new name.
        date_dmy: Recording date as ``DD.MM.YYYY``.
        attempts: Maximum number of rename attempts (default 15).
        delay: Seconds to wait between attempts (default 1.0, i.e. up to
            about 14 seconds of waiting with the default attempt count).

    Returns:
        None. The outcome is reported on stdout.
    """
    date_iso = convert_date_to_iso(date_dmy)
    new_name = f"{rc} {date_iso} [EKG] [bez hodnocení].pdf"
    new_path = pdf_path.with_name(new_name)

    if new_path.exists():
        print(f" ⚠ File with name already exists: {new_name}")
        return

    for attempt in range(1, attempts + 1):
        try:
            pdf_path.rename(new_path)
        except FileExistsError:
            # The target appeared between the exists() check and the rename
            # (Windows refuses to overwrite); treat it like the check above.
            print(f" ⚠ File with name already exists: {new_name}")
            return
        except PermissionError:
            print(f" ⚠ File locked (Dropbox?), retrying... {attempt}/{attempts}")
            # No point in waiting once the last attempt has failed.
            if attempt < attempts:
                time.sleep(delay)
        else:
            print(f" → File renamed to: {new_name}")
            return

    print(" ❌ Could not rename file after several attempts.")
def _report_and_rename(pdf_path, text):
    """Print the OCR text, extract RC and date from it, and rename the PDF.

    The PDF must already be closed, otherwise the rename fails on Windows.
    """
    print("----- OCR RESULT -----")
    print(text)
    print("----------------------")

    rc = extract_rodne_cislo(text)
    date = extract_date(text)

    print("\n----- EXTRACTED DATA -----")
    print("Rodné číslo :", rc)
    print("Datum :", date)
    print("---------------------------")

    if rc and date:
        rename_ecg_file(pdf_path, rc, date)
    else:
        print(" ⚠ Missing RC or date file NOT renamed.")


def _rotate_and_mark(pdf_path, doc, meta, keywords):
    """Rotate page 1 by 90°, drop page 2, tag the keywords and save in place.

    The document is written to a temporary sibling file, closed, and the
    temporary file then replaces the original. *doc* is closed on success.
    """
    first = doc[0]
    first.set_rotation((first.rotation + 90) % 360)
    if doc.page_count > 1:
        doc.delete_page(1)

    meta["keywords"] = (keywords + " " + FLAG).strip()
    doc.set_metadata(meta)

    tmp = pdf_path.with_suffix(".tmp.pdf")
    try:
        doc.save(tmp, deflate=True, garbage=3)
        # The original must be closed before it can be replaced.
        doc.close()
        os.replace(tmp, pdf_path)
    except Exception:
        # Never leave a stray "*.tmp.pdf" behind: the next run's "*.pdf"
        # glob would pick it up and process it as if it were a new ECG.
        try:
            tmp.unlink()
        except OSError:
            pass  # best-effort cleanup; the original error is re-raised below
        raise


def _process_pdf(pdf_path):
    """Rotate one PDF if it is not yet marked, then OCR and rename it."""
    doc = fitz.open(pdf_path)
    try:
        meta = doc.metadata
        keywords = meta.get("keywords", "") or meta.get("Keywords", "") or ""

        if FLAG in keywords:
            # 1) ALREADY ROTATED -> OCR only
            print(" → Already rotated, skipping rotation.")
            print(" Performing OCR...")
        else:
            # 2) NOT ROTATED -> rotate + save, then OCR the saved file
            _rotate_and_mark(pdf_path, doc, meta, keywords)
            print(" → Rotated + saved + marked")
            doc = fitz.open(pdf_path)

        text = ocr_page(doc[0])
    finally:
        # The document may already be closed by _rotate_and_mark; closing a
        # closed PyMuPDF document is an error, so check first.
        if not doc.is_closed:
            doc.close()

    # IMPORTANT: the file is closed BEFORE renaming.
    _report_and_rename(pdf_path, text)


# Snapshot the directory listing first: files are replaced and renamed while
# the loop runs, and a live glob could yield the new names a second time.
for pdf_path in sorted(BASE_DIR.glob("*.pdf")):
    print(f"\nProcessing: {pdf_path.name}")
    try:
        _process_pdf(pdf_path)
    except Exception as e:
        # One broken file must not abort the whole batch.
        print("❌ Error:", e)

print("\nDone.")