244 lines
8.3 KiB
Python
244 lines
8.3 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
|
|
import fitz
|
|
import os
|
|
import re
|
|
import time
|
|
import io
|
|
import pytesseract
|
|
from pathlib import Path
|
|
from PIL import Image
|
|
from FunkceWhereIsDropbox import get_dropbox_path
|
|
|
|
# --- CONFIGURATION ---
# Resolve the Dropbox root folder on the computer running this script.
dropbox = get_dropbox_path()
# Folder scanned for EKG PDFs. The "/" joins assume get_dropbox_path()
# returns a pathlib.Path-like object (BASE_DIR.glob is used by the main
# loop) — TODO confirm against FunkceWhereIsDropbox.
BASE_DIR = dropbox / "Ordinace" / "Dokumentace_ke_zpracování" / "EKGforProcessing"
# Marker appended to a PDF's "keywords" metadata once the main loop has
# rotated it, so the rotation is not applied again on a later run.
FLAG = "rotated-by-script"

# Point to your Tesseract executable if it's not in your PATH:
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
|
|
|
|
def ocr_page(page):
    """Rasterise a PDF page and return the text Tesseract recognises on it.

    Args:
        page: A PyMuPDF page object (anything offering ``get_pixmap``).

    Returns:
        The recognised text as returned by ``pytesseract.image_to_string``.
    """
    # 300 DPI keeps the small print on EKG strips legible for OCR.
    pixmap = page.get_pixmap(dpi=300)
    dimensions = [pixmap.width, pixmap.height]
    image = Image.frombytes("RGB", dimensions, pixmap.samples)

    # 'ces' selects the Czech language data; page segmentation mode 6 makes
    # Tesseract assume a single uniform block of text.
    return pytesseract.image_to_string(image, lang='ces', config='--psm 6')
|
|
|
|
|
|
# ─── Medicus verification ─────────────────────────────────────────────────────
|
|
|
|
def _medicus_connect():
|
|
try:
|
|
import fdb
|
|
return fdb.connect(
|
|
dsn=r"localhost:c:\medicus 3\data\medicus.fdb",
|
|
user="SYSDBA", password="masterkey", charset="win1250"
|
|
)
|
|
except Exception as e:
|
|
print(f" [Medicus] Nepřipojeno: {e}")
|
|
return None
|
|
|
|
def _lookup_by_rc(cur, rc_digits: str) -> dict | None:
|
|
cur.execute(
|
|
"SELECT IDPAC, PRIJMENI, JMENO, RODCIS FROM KAR "
|
|
"WHERE REPLACE(RODCIS, '/', '') = ?",
|
|
(rc_digits,)
|
|
)
|
|
row = cur.fetchone()
|
|
if row:
|
|
return {"idpac": row[0], "prijmeni": row[1].strip(), "jmeno": row[2].strip(), "rodcis": row[3].strip()}
|
|
return None
|
|
|
|
def _rc_candidates(rc: str) -> list[str]:
|
|
similar = {"0": "8", "8": "0", "1": "7", "7": "1", "5": "6", "6": "5", "3": "8"}
|
|
candidates = set()
|
|
for i in range(len(rc)):
|
|
candidates.add(rc[:i] + rc[i+1:])
|
|
for i, ch in enumerate(rc):
|
|
if ch in similar:
|
|
candidates.add(rc[:i] + similar[ch] + rc[i+1:])
|
|
candidates.discard(rc)
|
|
return sorted(candidates)
|
|
|
|
def _rc_checksum_ok(rc: str) -> bool:
|
|
digits = re.sub(r"\D", "", rc)
|
|
if len(digits) == 10:
|
|
return int(digits) % 11 == 0
|
|
return True
|
|
|
|
def verify_patient(rc_raw: str) -> dict:
    """Verify a birth number read by OCR against the Medicus database.

    Args:
        rc_raw: Birth number as extracted; non-digits are stripped. ``None``
            or a value without digits is treated as "nothing to look up".

    Returns:
        A dict with keys ``status``, ``patient`` and ``rc_corrected``:

        * ``"ok"``        - exact match; ``patient`` holds the record.
        * ``"fuzzy"``     - only a near-miss variant matched; ``patient`` is
          the preferred match, ``rc_corrected`` its variant, and the extra
          key ``all_matches`` lists every ``(variant, patient)`` pair with
          checksum-valid variants first.
        * ``"not_found"`` - no digits given, or nothing matched.
        * ``"offline"``   - the database connection could not be opened.
    """

    def _outcome(status, patient=None, rc_corrected=None):
        return {"status": status, "patient": patient, "rc_corrected": rc_corrected}

    rc = re.sub(r"\D", "", rc_raw or "")
    if not rc:
        return _outcome("not_found")

    con = _medicus_connect()
    if con is None:
        return _outcome("offline")

    try:
        cur = con.cursor()

        exact = _lookup_by_rc(cur, rc)
        if exact:
            return _outcome("ok", patient=exact)

        # Exact lookup failed: probe every near-miss variant in turn.
        hits = [
            (variant, found)
            for variant in _rc_candidates(rc)
            if (found := _lookup_by_rc(cur, variant))
        ]
        if not hits:
            return _outcome("not_found")

        # Stable sort: variants with a valid checksum (key False) come first,
        # otherwise the candidate order is kept.
        ranked = sorted(hits, key=lambda pair: not _rc_checksum_ok(pair[0]))
        top_rc, top_patient = ranked[0]
        result = _outcome("fuzzy", patient=top_patient, rc_corrected=top_rc)
        result["all_matches"] = ranked
        return result
    finally:
        con.close()
|
|
|
|
def print_verification(verif: dict, rc_from_ocr: str):
    """Print a human-readable summary of a ``verify_patient`` result.

    Args:
        verif: Result dict; ``status`` selects the message. For ``"ok"`` and
            ``"fuzzy"`` the ``patient`` entry must be a dict with
            ``prijmeni``, ``jmeno`` and ``rodcis``.
        rc_from_ocr: The birth number as originally read, shown in messages.

    Any other status prints nothing.
    """
    status = verif["status"]
    patient = verif.get("patient")

    if status == "offline":
        print(" — Medicus: nedostupný (offline), ověření přeskočeno")
        return

    if status == "not_found":
        print(f" ✗ Medicus: RČ '{rc_from_ocr}' nenalezeno ani při fuzzy hledání")
        return

    if status == "ok":
        print(f" ✓ Medicus: {patient['prijmeni']} {patient['jmeno']} | RČ {patient['rodcis']}")
        return

    if status != "fuzzy":
        return

    rc_corr = verif["rc_corrected"]
    print(f" ⚠ Medicus: RČ z OCR '{rc_from_ocr}' nenalezeno")
    print(f" → Nalezen podobný pacient: {patient['prijmeni']} {patient['jmeno']} | RČ {patient['rodcis']}")
    print(f" → Pravděpodobná oprava RČ: {rc_from_ocr} → {rc_corr} (OCR chyba)")

    # Mention the runner-up variants when more than one candidate matched.
    every_match = verif.get("all_matches", [])
    if len(every_match) > 1:
        runner_ups = [pair[0] for pair in every_match[1:]]
        print(f" → Další shody: {runner_ups}")
|
|
|
|
|
|
# ─── OCR extraction ───────────────────────────────────────────────────────────
|
|
|
|
def extract_rodne_cislo(text):
    """Return the first birth number found in *text* as a digit string.

    A birth number is matched as six digits, an optional slash (with
    optional whitespace on either side), then three or four digits. The
    result is the six digits followed by the suffix left-padded with zeros
    to four characters. Returns ``None`` when nothing matches.
    """
    found = re.search(r"(\d{6})\s*/?\s*(\d{3,4})", text)
    if found is None:
        return None

    prefix, suffix = found.groups()
    # NOTE(review): padding a three-digit suffix produces a ten-character
    # string that differs from the nine digits on the page — confirm this
    # is intended.
    return prefix + suffix.zfill(4)
|
|
|
|
|
|
def _is_plausible_date(day, month, year):
    """Return True for digit strings giving day 1-31, month 1-12, year 1900-2100."""
    return 1 <= int(day) <= 31 and 1 <= int(month) <= 12 and 1900 <= int(year) <= 2100


def extract_date(text):
    """Extract the first plausible date from OCR text.

    Patterns are tried in this order, and within each pattern the matches
    are scanned left to right:

    1. ``D.M.YYYY`` / ``DD.MM.YYYY`` with dots or commas as separators and
       optional whitespace after each separator.
    2. Eight digits in a row (``DDMMYYYY``), for OCR output that lost the
       separators.
    3. Seven digits in a row (``DDMYYYY``).

    A match is accepted only when day, month and year fall in plausible
    ranges, so a stray number such as "45.13.2024" no longer shadows a real
    date later in the text.

    Returns:
        A tuple ``(day, month, year)`` of digit strings exactly as they
        appear in the text, or ``None`` when no plausible date is found.
    """
    patterns = (
        # Primary: separated date; OCR sometimes reads a dot as a comma.
        r"(\d{1,2})[\.,]\s*(\d{1,2})[\.,]\s*(\d{4})",
        # Fallbacks: compact dates where OCR dropped the separators.
        r"\b(\d{2})(\d{2})(\d{4})\b",
        r"\b(\d{2})(\d{1})(\d{4})\b",
    )
    for pattern in patterns:
        for match in re.finditer(pattern, text):
            parts = match.groups()
            if _is_plausible_date(*parts):
                return parts

    return None
|
|
|
|
|
|
def convert_date_to_iso(date_tuple):
    """Format a ``(day, month, year)`` tuple of strings as ``YYYY-MM-DD``.

    Day and month are left-padded with zeros to two characters; the year is
    used as given.
    """
    day, month, year = date_tuple
    month_part = month.zfill(2)
    day_part = day.zfill(2)
    return f"{year}-{month_part}-{day_part}"
|
|
|
|
|
|
# --- MAIN PROCESS ---
|
|
# For every PDF in BASE_DIR: rotate it once, OCR the first page, look the
# patient up in Medicus, and rename the file to
# "<birth number> <surname, name> <ISO date> [EKG] [bez hodnocení].pdf".
for pdf_path in BASE_DIR.glob("*.pdf"):
    # Skip the temporary "<name>.tmp.pdf" files written by the rotation step.
    if ".tmp" in pdf_path.name: continue

    print(f"\n{'=' * 60}")
    print(f"PROCESSING: {pdf_path.name}")
    print(f"{'=' * 60}")

    doc = fitz.open(pdf_path)
    meta = doc.metadata
    # The keywords entry may be missing or None; normalise to "".
    keywords = meta.get("keywords", "") or ""

    # 1. HANDLE ROTATION
    # Only files whose keywords do not yet carry FLAG are rotated, so a
    # second run leaves already-processed files as they are.
    if FLAG not in keywords:
        print(" [Action] Rotating page...")
        page = doc[0]
        page.set_rotation((page.rotation + 90) % 360)
        # NOTE(review): this removes only the page at index 1; any pages
        # after it are kept — confirm inputs never have more than two pages.
        if doc.page_count > 1:
            doc.delete_page(1)
        meta["keywords"] = (keywords + " " + FLAG).strip()
        doc.set_metadata(meta)
        # Save to a sibling temp file, close the document, then swap the
        # temp file over the original and reopen it for OCR.
        tmp = pdf_path.with_suffix(".tmp.pdf")
        doc.save(tmp, deflate=True)
        doc.close()
        os.replace(tmp, pdf_path)
        doc = fitz.open(pdf_path)

    # 2. PERFORM OCR
    raw_text = ocr_page(doc[0])

    # --- DEBUG PRINTOUT ---
    print("\n--- START OF EXTRACTED TEXT ---")
    print(raw_text)
    print("--- END OF EXTRACTED TEXT ---\n")
    # ---------------------

    rc = extract_rodne_cislo(raw_text)
    date_tuple = extract_date(raw_text)

    print(f"RESULT -> RC: {rc if rc else 'NOT FOUND'}")
    print(f"RESULT -> Date: {date_tuple if date_tuple else 'NOT FOUND'}")

    # Close the document before the rename in step 4.
    doc.close()

    # 3. MEDICUS VERIFICATION + FUZZY MATCHING
    # rc_final starts as the OCR value and is replaced only when Medicus
    # reports a fuzzy match with a corrected number.
    rc_final = rc
    if rc:
        print(f" [Medicus] Ověřuji RČ {rc}...")
        verif = verify_patient(rc)
        print_verification(verif, rc)
        if verif["status"] == "fuzzy" and verif.get("rc_corrected"):
            rc_final = verif["rc_corrected"]
            print(f" [Medicus] RČ opraveno na: {rc_final}")
    else:
        # No birth number was read; use a placeholder so step 4 can still
        # call verif.get("patient").
        verif = {"status": "not_found", "patient": None, "rc_corrected": None}

    # 4. RENAME LOGIC
    # Both a birth number and a date are required to build the new name.
    if rc_final and date_tuple:
        date_iso = convert_date_to_iso(date_tuple)
        patient = verif.get("patient")
        # The patient's name is included only when Medicus returned a record.
        if patient:
            name_part = f"{patient['prijmeni']}, {patient['jmeno']} "
        else:
            name_part = ""
        new_name = f"{rc_final} {name_part}{date_iso} [EKG] [bez hodnocení].pdf"
        new_path = pdf_path.with_name(new_name)

        # Never overwrite an existing file with the same target name.
        if not new_path.exists():
            try:
                pdf_path.rename(new_path)
                print(f"✅ Success: Renamed to {new_name}")
            except Exception as e:
                print(f"❌ Rename error: {e}")
        else:
            print(f"⚠ Skipping: {new_name} already exists.")
    else:
        print("❌ Script could not find all data. Check the extracted text above.")

print("\nAll files processed.")