Files
ordinaceprojekt/Medevio/70 DěleníSouboruPDF/rozdelit_pdf.py
T
2026-05-05 08:38:57 +02:00

1138 lines
45 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
rozdelit_pdf.py — Dělení vícestránkového PDF na skupiny pacientů.
Spuštění:
python rozdelit_pdf.py soubor.pdf
Numerická klávesnice:
4 / Left kurzor ←
6 / Right kurzor →
7 / PgUp skok ← o 4 stránky
9 / PgDn skok → o 4 stránky
5 / Space přepni hranici pacienta před touto stránkou
8 / Up přesuň stránku doleva (swap)
2 / Down přesuň stránku doprava (swap)
- výběr pacienta ručně z Medicusu
Enter exportuj všechny skupiny do Split/
Esc konec
"""
import sys
import os
import io
import re
import json
import threading
from pathlib import Path
from typing import Optional
import tkinter as tk
from tkinter import messagebox
from PIL import Image, ImageTk
import fitz # PyMuPDF
# ── Paths ─────────────────────────────────────────────────────────────────────
# ROOT is the directory two levels above this file; it is put first on sys.path
# so that the project-local ``Knihovny`` package below can be imported.
ROOT = Path(__file__).resolve().parent.parent # .../Medevio/
sys.path.insert(0, str(ROOT))
from Knihovny.najdi_medicus import get_medicus_config
from Knihovny.najdi_dropbox import get_dropbox_root
_DROPBOX = Path(get_dropbox_root())
# _RICOH is the initial directory offered by the file picker in main();
# exported groups are written to its "Split" subfolder (SPLIT_DIR).
_RICOH = _DROPBOX / r"Ordinace\Dokumentace_ke_zpracování\Ricoh Fi-8040"
SPLIT_DIR = _RICOH / "Split"
# ── Env ───────────────────────────────────────────────────────────────────────
def _load_env():
    """Load ``KEY=VALUE`` pairs from ``ROOT/.env`` into ``os.environ``.

    Blank lines, ``#`` comment lines and lines without ``=`` are skipped.
    Only the first ``=`` separates the key from the value, so values may
    themselves contain ``=``. A value wrapped in one pair of matching single
    or double quotes is unquoted. Variables already present in the
    environment are overwritten. A missing ``.env`` file is not an error.
    """
    env_path = ROOT / ".env"
    if not env_path.exists():
        return
    # "utf-8-sig" also accepts plain UTF-8 and drops a BOM that would
    # otherwise become part of the first key.
    for raw_line in env_path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        key = key.strip()
        value = value.strip()
        if not key:
            # A line such as "=value": os.environ rejects an empty name.
            continue
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        os.environ[key] = value
# Populate os.environ from ROOT/.env at import time.
_load_env()
# ── Test settings and regexes ─────────────────────────────────────────────────
# When TESTOVANI is true and no command-line argument is given, main() opens
# PATH_TO_TESTFILE instead of showing the file picker.
TESTOVANI = False
PATH_TO_TESTFILE = r"U:\Dropbox\Ordinace\Dokumentace_ke_zpracování\Ricoh Fi-8040\2026-05-04-07-50-17 - Copy.pdf"
# Path assigned to pytesseract's tesseract_cmd by OcrWorker._run.
TESSERACT_PATH = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# Birth number (RČ) written with a slash: 710920/3893
RC_RE_SPLIT = re.compile(r"\b(\d{6})\s*/\s*(\d{3,4})\b")
# RČ following a keyword (Tesseract misreads diacritics → ASCII variants are tolerated)
RC_RE_KEYWORD = re.compile(
r"(?:C\.?P\.?|R\.?C\.?|RC|ID|NAR)\s*[:\.]?\s*(\d{9,10})\b",
re.IGNORECASE,
)
# Fallback: any standalone block of 9–10 digits
RC_RE_PLAIN = re.compile(r"\b(\d{9,10})\b")
def _rc_valid(digits: str) -> bool:
if len(digits) not in (9, 10):
return False
month = int(digits[2:4])
day = int(digits[4:6])
return (month in range(1, 13) or month in range(51, 63)) and 1 <= day <= 31
def _extract_rc(text: str) -> Optional[str]:
    """Find a birth number (RČ) in OCR text and return it as bare digits.

    Strategies, in order of preference:
      1. ``dddddd/ddd[d]`` (RC_RE_SPLIT). Every slash-shaped token is tried
         and the first one passing ``_rc_valid`` wins, so an unrelated token
         such as ``123456/2024`` earlier on the page no longer shadows the
         real number. When no token validates, the first one is still
         returned (the previous behaviour): it may be an OCR-damaged RČ
         that the fuzzy Medicus lookup can repair.
      2. A 9–10 digit number following a keyword (RC_RE_KEYWORD).
      3. Any standalone 9–10 digit number (RC_RE_PLAIN).

    Returns None when nothing matches.
    """
    # 1. slash-separated form
    first_slash: Optional[str] = None
    for m in RC_RE_SPLIT.finditer(text):
        candidate = m.group(1) + m.group(2)
        if _rc_valid(candidate):
            return candidate
        if first_slash is None:
            first_slash = candidate
    if first_slash is not None:
        return first_slash
    # 2. keyword + number, then 3. plain fallback
    for pattern in (RC_RE_KEYWORD, RC_RE_PLAIN):
        for m in pattern.finditer(text):
            if _rc_valid(m.group(1)):
                return m.group(1)
    return None
def _rc_candidates(rc: str) -> list[str]:
# Vizuálně podobné číslice při OCR — každá číslice může být zaměněna za více variant
similar: dict[str, list[str]] = {
"0": ["8", "6", "5"],
"1": ["7", "6"],
"2": [],
"3": ["8"],
"4": [],
"5": ["6", "0"],
"6": ["5", "0", "1"],
"7": ["1"],
"8": ["0", "3"],
"9": [],
}
candidates = set()
for i in range(len(rc)):
candidates.add(rc[:i] + rc[i+1:])
for i in range(len(rc) + 1):
candidates.add(rc[:i] + "0" + rc[i:])
for i, ch in enumerate(rc):
for alt in similar.get(ch, []):
candidates.add(rc[:i] + alt + rc[i+1:])
candidates.discard(rc)
return sorted(c for c in candidates if len(c) in (9, 10))
def _rc_checksum_ok(rc: str) -> bool:
digits = re.sub(r"\D", "", rc)
return len(digits) == 10 and int(digits) % 11 == 0
def _rc_candidates_level2(rc: str) -> list[str]:
    """Return two-edit variants of *rc* that pass the checksum, sorted.

    Variants already reachable with a single edit, and *rc* itself, are
    excluded. The checksum filter keeps the candidate count manageable; as
    ``_rc_checksum_ok`` only accepts 10-digit numbers, 9-digit variants
    never survive it.
    """
    single_edit = set(_rc_candidates(rc))
    double_edit = {
        variant
        for first in single_edit
        for variant in _rc_candidates(first)
    }
    double_edit.difference_update(single_edit)
    double_edit.discard(rc)
    return sorted(
        v for v in double_edit if len(v) in (9, 10) and _rc_checksum_ok(v)
    )
# ── Medicus ───────────────────────────────────────────────────────────────────
def _verify_medicus(rc_digits: str) -> dict:
try:
import fdb
cfg = get_medicus_config()
con = fdb.connect(dsn=cfg.dsn, user="SYSDBA", password="masterkey", charset="win1250")
try:
cur = con.cursor()
def _lookup(rc: str) -> Optional[dict]:
cur.execute(
"SELECT IDPAC, PRIJMENI, JMENO, RODCIS FROM KAR "
"WHERE REPLACE(RODCIS, '/', '') = ?", (rc,)
)
row = cur.fetchone()
if row:
return {
"idpac": row[0],
"prijmeni": row[1].strip(),
"jmeno": row[2].strip(),
"rodcis": row[3].strip(),
}
return None
p = _lookup(rc_digits)
if p:
return {"status": "ok", "patient": p}
for c in _rc_candidates(rc_digits):
p = _lookup(c)
if p:
return {"status": "fuzzy", "rc_corrected": c, "patient": p}
for c in _rc_candidates_level2(rc_digits):
p = _lookup(c)
if p:
return {"status": "fuzzy", "rc_corrected": c, "patient": p}
return {"status": "not_found", "patient": None}
finally:
con.close()
except Exception as e:
return {"status": "offline", "patient": None, "error": str(e)}
# ── Načtení všech pacientů z Medicus ─────────────────────────────────────────
def _load_all_patients() -> list[dict]:
try:
import fdb
from datetime import date
dnes = date.today().isoformat() # 'YYYY-MM-DD'
cfg = get_medicus_config()
con = fdb.connect(dsn=cfg.dsn, user="SYSDBA", password="masterkey", charset="win1250")
try:
cur = con.cursor()
cur.execute(
"SELECT KAR.IDPAC, KAR.PRIJMENI, KAR.JMENO, KAR.RODCIS "
"FROM KAR "
"WHERE KAR.VYRAZEN = 'N' "
"AND KAR.RODCIS IS NOT NULL AND KAR.RODCIS <> '' "
"AND EXISTS ("
" SELECT r.ID FROM REGISTR r "
" JOIN ICP i ON r.IDICP = i.IDICP "
" WHERE r.IDPAC = KAR.IDPAC "
" AND r.DATUM <= ? "
" AND (r.DATUM_ZRUSENI IS NULL OR r.DATUM_ZRUSENI >= ?) "
" AND r.PRIZNAK IN ('V', 'D', 'A') "
" AND i.ICP = '09305001' "
" AND i.ODB = '001' "
") "
"ORDER BY KAR.PRIJMENI_UP ASC, KAR.RODCIS ASC",
(dnes, dnes),
)
return [
{
"idpac": r[0],
"prijmeni": (r[1] or "").strip(),
"jmeno": (r[2] or "").strip(),
"rodcis": (r[3] or "").strip(),
}
for r in cur.fetchall()
]
finally:
con.close()
except Exception as e:
print(f"[Medicus] chyba načtení pacientů: {e}")
return []
# ── Jméno výstupního souboru ──────────────────────────────────────────────────
def _format_filename(group_idx: int, medicus: Optional[dict]) -> str:
p = medicus.get("patient") if medicus else None
if p:
rc = re.sub(r"\D", "", p["rodcis"])
return f"{rc} {p['prijmeni']}, {p['jmeno']} split_{group_idx:03d}.pdf"
return f"split_{group_idx:03d}.pdf"
# ── OCR worker (pozadí) ───────────────────────────────────────────────────────
class OcrWorker:
    """
    OCR the pages of a PDF on a background thread:
    Tesseract → Claude Vision (fallback) → Medicus lookup.

    Results are kept in ``self.results`` (page index → result dict) and
    cached in a JSON file so that a later run skips pages already processed.
    ``on_page_done(page_idx)`` is called from the worker thread after each
    newly processed page.

    NOTE(review): the same ``fitz.Document`` is also used by ThumbnailWorker's
    thread and by the UI thread; PyMuPDF's documentation should be checked
    for its multithreading limitations.
    """

    def __init__(self, doc: "fitz.Document", cache_path: Path, on_page_done):
        self.doc = doc
        self.cache_path = cache_path
        self.on_page_done = on_page_done  # callback(page_idx: int)
        self.results: dict[int, dict] = {}
        self._stop = threading.Event()
        self._lock = threading.Lock()
        self._load_cache()

    def _load_cache(self):
        """Load cached results, if any; a damaged cache is reported and ignored."""
        if not self.cache_path.exists():
            return
        try:
            data = json.loads(self.cache_path.read_text(encoding="utf-8"))
            # JSON object keys are strings; page indices are ints.
            self.results = {int(k): v for k, v in data.items()}
            print(f"[OCR cache] načteno {len(self.results)} stránek z {self.cache_path.name}")
        except Exception as e:
            print(f"[OCR cache] chyba čtení: {e}")

    def _save_cache(self):
        """Write all results to the cache file atomically.

        Called from both the worker thread and the UI thread; the lock
        serialises the writers. The JSON is produced from a snapshot of
        ``self.results`` and written to a temporary file that then replaces
        the cache, so an interrupted write (the worker is a daemon thread)
        cannot leave a truncated cache behind.
        """
        with self._lock:
            payload = json.dumps(dict(self.results), ensure_ascii=False, indent=2)
            tmp_path = self.cache_path.with_name(self.cache_path.name + ".tmp")
            tmp_path.write_text(payload, encoding="utf-8")
            os.replace(tmp_path, self.cache_path)

    def start(self):
        """Start processing on a daemon thread."""
        t = threading.Thread(target=self._run, daemon=True)
        t.start()

    def stop(self):
        """Ask the worker to stop before the next page."""
        self._stop.set()

    def _run(self):
        """Process every page that has no cached result yet."""
        import pytesseract
        pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH
        for i in range(len(self.doc)):
            if self._stop.is_set():
                break
            if i in self.results:
                continue  # cache hit
            try:
                page = self.doc[i]
                mat = fitz.Matrix(2.0, 2.0)  # 144 DPI — sufficient for OCR
                pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            except Exception as e:
                # One unrenderable page must not end OCR for the rest of the
                # document; the page stays without a result and is retried
                # on the next run.
                print(f"[OCR str.{i+1}] render: {e}")
                continue
            # 1. Tesseract
            rc = None
            tess_text = None
            try:
                tess_text = pytesseract.image_to_string(img, lang="ces")
                rc = _extract_rc(tess_text)
            except Exception as e:
                print(f"[OCR str.{i+1}] Tesseract: {e}")
            # 2. Medicus — first attempt
            medicus = _verify_medicus(rc) if rc else None
            # 3. Claude Vision — when Tesseract found no RČ, or found one
            #    that Medicus does not know
            claude_raw = None
            claude_usage = None
            if not rc or (medicus and medicus.get("status") == "not_found"):
                try:
                    rc_claude, claude_raw, claude_usage = self._claude_rc(img)
                    if rc_claude:
                        medicus_claude = _verify_medicus(rc_claude)
                        if medicus_claude.get("status") in ("ok", "fuzzy"):
                            print(f"[OCR str.{i+1}] Claude opravil RČ: {rc} → {rc_claude}")
                            rc = rc_claude
                            medicus = medicus_claude
                        elif not rc:
                            # Claude's unconfirmed number is used only when
                            # Tesseract produced none.
                            rc = rc_claude
                            medicus = medicus_claude
                except Exception as e:
                    print(f"[OCR str.{i+1}] Claude: {e}")
            self.results[i] = {
                "rc": rc,
                "medicus": medicus,
                "tesseract_text": tess_text,
                "claude_raw": claude_raw,
                "claude_usage": claude_usage,
            }
            try:
                self._save_cache()
            except OSError as e:
                # The result stays available in memory for this session.
                print(f"[OCR cache] chyba zápisu: {e}")
            self.on_page_done(i)

    def _claude_rc(self, img: "Image.Image") -> tuple[Optional[str], Optional[str], Optional[dict]]:
        """Ask Claude Vision for the birth number on the page image.

        Returns ``(rc_digits_or_None, raw_response_text, token_usage)``.
        Exceptions from the API call propagate to the caller; an unparsable
        response yields ``(None, raw, usage)``.
        """
        import anthropic
        import base64
        buf = io.BytesIO()
        img.save(buf, format="JPEG", quality=80)
        b64 = base64.standard_b64encode(buf.getvalue()).decode()
        client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
        resp = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=100,
            messages=[{"role": "user", "content": [
                {"type": "image", "source": {"type": "base64", "media_type": "image/jpeg", "data": b64}},
                {"type": "text", "text": (
                    "Najdi rodné číslo na tomto naskenovaném dokumentu. "
                    "Vrať JSON: {\"rodne_cislo\": \"XXXXXXXXXX\"} nebo {\"rodne_cislo\": null}. "
                    "Jen JSON, nic jiného."
                )},
            ]}],
        )
        usage = {
            "input_tokens": resp.usage.input_tokens,
            "output_tokens": resp.usage.output_tokens,
        }
        raw = resp.content[0].text.strip()
        # Strip a Markdown code fence if the model wrapped the JSON in one.
        raw = re.sub(r"^```\w*\n?", "", raw).rstrip("`").strip()
        try:
            rc_raw = json.loads(raw).get("rodne_cislo") or ""
            return re.sub(r"\D", "", rc_raw) or None, raw, usage
        except Exception:
            return None, raw, usage
# ── Thumbnail worker (pozadí) ─────────────────────────────────────────────────
class ThumbnailWorker:
    """Render the pages of a PDF into fixed-size PIL images on a background thread.

    The daemon thread is started by the constructor. After each page is
    rendered, ``on_thumb_done(page_idx)`` is called from that thread.
    """

    def __init__(self, doc: fitz.Document, thumb_w: int, thumb_h: int, on_thumb_done):
        self.doc = doc
        self.thumb_w = thumb_w
        self.thumb_h = thumb_h
        self.on_thumb_done = on_thumb_done  # callback(page_idx: int)
        self._cache: dict[int, Image.Image] = {}
        self._lock = threading.Lock()
        threading.Thread(target=self._run, daemon=True).start()

    def get(self, page_idx: int) -> Optional[Image.Image]:
        """Return the rendered thumbnail, or None when it is not ready yet."""
        with self._lock:
            return self._cache.get(page_idx)

    def _render(self, page_idx: int) -> Image.Image:
        """Render one page scaled to fit, centred on a canvas of the fixed thumbnail size."""
        page = self.doc[page_idx]
        bounds = page.rect
        zoom = min(self.thumb_w / bounds.width, self.thumb_h / bounds.height)
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), colorspace=fitz.csRGB)
        rendered = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        # Frame the page on a fixed-size canvas.
        framed = Image.new("RGB", (self.thumb_w, self.thumb_h), (38, 38, 38))
        offset = (
            (self.thumb_w - rendered.width) // 2,
            (self.thumb_h - rendered.height) // 2,
        )
        framed.paste(rendered, offset)
        return framed

    def _run(self):
        """Render all pages in order, publishing each one as soon as it is ready."""
        page_count = len(self.doc)
        for page_idx in range(page_count):
            thumbnail = self._render(page_idx)
            with self._lock:
                self._cache[page_idx] = thumbnail
            self.on_thumb_done(page_idx)
# ── Colours and dimensions ────────────────────────────────────────────────────
# Number of page slots shown side by side; also the step of the 7/9 jump.
COLS = 4
BORDER_W = 16 # width of the separator between slots
PAD = 8 # inset of the thumbnail from the slot edge
INFO_H = 116 # height of the info strip below the thumbnail
TOP_H = 44 # height of the status bar at the top
BOT_H = 44 # height of the help line at the bottom
# Slot and thumbnail sizes are computed dynamically in SplitterUI.__init__ from the screen resolution
BG = "#1e1e1e"
BG_SLOT = "#262626"
BG_INFO = "#181818"
# Cursor outline, and the selection colour of the patient picker list.
C_CURSOR = "#4da6ff"
# Separator colour where the next page starts a new group / stays in the same group.
C_BOUNDARY = "#cc3333"
C_SAME = "#3a3a3a"
# Text colours of the per-page status: exact match, fuzzy match, not found,
# Medicus offline, still loading.
C_OK = "#4caf50"
C_FUZZY = "#ff9800"
C_NONE = "#f44336"
C_OFFLINE = "#888888"
C_LOADING = "#555555"
C_TEXT = "#dddddd"
C_DIM = "#666666"
# Slot background colours, cycled by group index.
GROUP_COLORS = [
"#1b2a3a", "#2a1b3a", "#1b3a2a", "#3a2a1b",
"#2a3a1b", "#1b2a2a", "#3a1b2a", "#2a2a1b",
]
# ── Dialog pro výběr pacienta ─────────────────────────────────────────────────
class PatientPickerDialog(tk.Toplevel):
    """Modal window for picking a patient from Medicus by hand.

    The patient list is loaded on a background thread and can be filtered
    by birth-number prefix or by a substring of the name. ``on_select`` is
    called with the chosen patient dict just before the dialog closes.
    """

    def __init__(self, parent: tk.Tk, on_select):
        super().__init__(parent)
        self.on_select = on_select
        self.all_patients: list[dict] = []
        self.filtered: list[dict] = []
        self.title("Výběr pacienta")
        self.configure(bg=BG)
        self.resizable(True, True)
        self.geometry("760x520")
        self.protocol("WM_DELETE_WINDOW", self.destroy)
        # ── Search row ────────────────────────────────────────────────────────
        tk.Label(
            self, text="Hledat (RČ nebo jméno):",
            bg=BG, fg=C_TEXT, font=("Consolas", 12), anchor="w",
        ).pack(fill="x", padx=10, pady=(10, 0))
        self.search_var = tk.StringVar()
        self.search_var.trace_add("write", lambda *_: self._update_list())
        self.entry = tk.Entry(
            self, textvariable=self.search_var,
            font=("Consolas", 14), bg="#2d2d2d", fg=C_TEXT,
            insertbackground=C_TEXT, relief="flat", bd=4,
        )
        self.entry.pack(fill="x", padx=10, pady=6)
        # ── Listbox ───────────────────────────────────────────────────────────
        frame = tk.Frame(self, bg=BG)
        frame.pack(fill="both", expand=True, padx=10, pady=(0, 4))
        sb = tk.Scrollbar(frame, orient="vertical")
        self.listbox = tk.Listbox(
            frame, yscrollcommand=sb.set,
            bg="#1a1a1a", fg=C_TEXT,
            selectbackground=C_CURSOR, selectforeground="white",
            font=("Consolas", 12), activestyle="none",
            borderwidth=0, highlightthickness=0,
        )
        sb.config(command=self.listbox.yview)
        sb.pack(side="right", fill="y")
        self.listbox.pack(side="left", fill="both", expand=True)
        self.listbox.bind("<Double-Button-1>", lambda _: self._confirm())
        # ── Status row ────────────────────────────────────────────────────────
        self.status_label = tk.Label(
            self, text="Načítám pacienty…",
            bg=BG, fg=C_DIM, font=("Consolas", 10), anchor="w",
        )
        self.status_label.pack(fill="x", padx=10, pady=(0, 6))
        # Keys: the entry handles ordinary characters, navigation keys are
        # intercepted by _on_key (bound on both the entry and the window).
        self.entry.bind("<KeyPress>", self._on_key)
        self.bind("<KeyPress>", self._on_key)
        # Load the patients in the background.
        threading.Thread(target=self._load, daemon=True).start()
        self.grab_set()
        self.entry.focus_set()

    # ── Loading patients ──────────────────────────────────────────────────────
    def _load(self):
        """Worker-thread body: fetch the patients and hand them to the UI thread."""
        patients = _load_all_patients()
        try:
            self.after(0, self._on_loaded, patients)
        except (tk.TclError, RuntimeError):
            # The application was closed while the query was running.
            pass

    def _on_loaded(self, patients: list[dict]):
        """Show the loaded patients, unless the dialog has been closed meanwhile."""
        if not self.winfo_exists():
            return
        self.all_patients = patients
        self._update_list()
        self.status_label.config(text=f"Načteno {len(patients)} pacientů")

    # ── Filtering ─────────────────────────────────────────────────────────────
    def _update_list(self):
        """Re-filter the list by the current search text and refresh the listbox."""
        q = self.search_var.get().strip()
        q_lower = q.lower()
        q_digits = re.sub(r"\D", "", q)
        if not q:
            self.filtered = self.all_patients[:]
        else:
            # A patient matches when the digits of the query are a prefix of
            # the birth number, or the query is a substring of "surname name".
            self.filtered = [
                p for p in self.all_patients
                if (q_digits and re.sub(r"\D", "", p["rodcis"]).startswith(q_digits))
                or q_lower in f"{p['prijmeni']} {p['jmeno']}".lower()
            ]
        self.listbox.delete(0, "end")
        for p in self.filtered:
            rc = p["rodcis"] or ""
            self.listbox.insert("end", f" {rc:<14} {p['prijmeni']} {p['jmeno']}")
        if self.filtered:
            self.listbox.selection_set(0)
            self.listbox.see(0)
        count = len(self.filtered)
        total = len(self.all_patients)
        suffix = f" (z {total})" if count != total else ""
        self.status_label.config(text=f"{count} pacientů{suffix} │ Enter: vybrat 8/2 nebo ↑↓: navigace Esc: zrušit")

    # ── Keyboard ──────────────────────────────────────────────────────────────
    def _on_key(self, event):
        """Handle navigation keys; returns "break" for keys it consumed."""
        ks = event.keysym
        kc = event.keycode
        # numpad 8 (keycode 104) = up, numpad 2 (keycode 98) = down
        # NOTE(review): this means 8 and 2 cannot be typed into the search
        # field from the numeric keypad.
        if kc == 104 or ks in ("Up", "KP_Up"):
            self._move(-1)
            return "break"
        if kc == 98 or ks in ("Down", "KP_Down"):
            self._move(1)
            return "break"
        if ks in ("Return", "KP_Enter"):
            self._confirm()
            return "break"
        if ks == "Escape":
            self.destroy()
            return "break"

    def _move(self, delta: int):
        """Move the listbox selection by *delta* rows, clamped to the list."""
        if not self.filtered:
            return
        sel = self.listbox.curselection()
        idx = sel[0] if sel else 0
        new_idx = max(0, min(len(self.filtered) - 1, idx + delta))
        self.listbox.selection_clear(0, "end")
        self.listbox.selection_set(new_idx)
        self.listbox.see(new_idx)

    # ── Selection ─────────────────────────────────────────────────────────────
    def _confirm(self):
        """Report the selected patient through ``on_select`` and close the dialog."""
        sel = self.listbox.curselection()
        if not sel or not self.filtered:
            return
        self.on_select(self.filtered[sel[0]])
        self.destroy()
# ── Hlavní UI ─────────────────────────────────────────────────────────────────
class SplitterUI:
    """Main window: shows COLS page thumbnails, lets the user mark patient
    boundaries, reorder / rotate / delete pages and export each group as a PDF.

    State:
      * ``page_order`` — original page indices in their current order
      * ``boundaries`` — positions in ``page_order`` that start a new group
        (position 0 always does)
      * ``ocr_results`` — original page index → OCR/Medicus result dict
      * ``rotations`` — original page index → rotation in degrees

    NOTE(review): ``self.doc`` is shared with the OCR and thumbnail worker
    threads; PyMuPDF's documentation should be checked for its
    multithreading limitations.
    """

    def __init__(self, root: "tk.Tk", pdf_path: Path):
        self.root = root
        self.pdf_path = pdf_path
        self.doc = fitz.open(str(pdf_path))
        n = len(self.doc)
        # State
        self.page_order: list[int] = list(range(n))
        self.boundaries: set[int] = {0}  # positions (in page_order) that start a new group
        self.cursor: int = 0
        self.scroll: int = 0  # index of the leftmost visible slot
        # Caches
        self.ocr_results: dict[int, dict] = {}
        self._photo_cache: dict[tuple, ImageTk.PhotoImage] = {}  # (page_idx, rot) → photo
        self.rotations: dict[int, int] = {}  # page_idx → degrees (0/90/180/270)
        # Dimensions
        sw = root.winfo_screenwidth()
        sh = root.winfo_screenheight()
        self.SLOT_W = (sw - (COLS - 1) * BORDER_W) // COLS
        self.THUMB_W = self.SLOT_W - 2 * PAD
        self.THUMB_H = int(self.THUMB_W * 842 / 595)  # A4 aspect ratio
        self.CANVAS_W = COLS * self.SLOT_W + (COLS - 1) * BORDER_W
        self.CANVAS_H = PAD + self.THUMB_H + PAD + INFO_H
        win_h = min(TOP_H + self.CANVAS_H + BOT_H, sh - 60)
        root.title(f"PDF Dělení — {pdf_path.name}")
        root.configure(bg=BG)
        root.geometry(f"{self.CANVAS_W}x{win_h}+0+0")
        self._build_ui()
        self._start_workers()

    # ── Building the UI ───────────────────────────────────────────────────────
    def _build_ui(self):
        """Create the status label, the canvas and the key-hint label."""
        self.top_label = tk.Label(
            self.root, bg=BG, fg=C_TEXT,
            font=("Consolas", 13), anchor="w", padx=12
        )
        self.top_label.pack(fill="x", side="top", ipady=4)
        self.canvas = tk.Canvas(
            self.root, width=self.CANVAS_W, height=self.CANVAS_H,
            bg=BG, highlightthickness=0
        )
        self.canvas.pack(fill="both", expand=True)
        hints = (
            "4/6: navigace ←/→ 7/9: skok ×4 "
            "5/Space: hranice pacienta "
            "1/3: přesuň stránku "
            "/: otočit ↺CCW *: otočit ↻CW "
            "Del/.: smaž stránku "
            "-: vyber pacienta ručně "
            "Enter: exportuj Esc: konec"
        )
        self.bot_label = tk.Label(
            self.root, text=hints, bg=BG, fg=C_DIM,
            font=("Consolas", 11), anchor="center"
        )
        self.bot_label.pack(fill="x", side="bottom", ipady=6)
        self.root.bind("<KeyPress>", self._on_key)
        self.root.focus_set()
        self._redraw()

    # ── Starting the workers ──────────────────────────────────────────────────
    def _start_workers(self):
        """Start OCR and thumbnail rendering; worker callbacks are re-posted to the Tk loop."""
        cache_path = self.pdf_path.parent / (self.pdf_path.stem + "_ocr_cache.json")
        self.ocr_worker = OcrWorker(
            self.doc, cache_path,
            on_page_done=lambda idx: self.root.after(0, self._on_ocr_done, idx),
        )
        # Take over the results loaded from the cache.
        self.ocr_results.update(self.ocr_worker.results)
        self._auto_detect_boundaries()
        self.ocr_worker.start()
        self.thumb_worker = ThumbnailWorker(
            self.doc, self.THUMB_W, self.THUMB_H,
            on_thumb_done=lambda idx: self.root.after(0, self._on_thumb_done, idx),
        )

    def _confirmed_rc_of_page(self, page_idx: int) -> Optional[str]:
        """Return the Medicus-confirmed birth number (digits only) for a page.

        None is returned when the page has no result, the Medicus status is
        neither "ok" nor "fuzzy", or no patient record is attached.
        """
        r = self.ocr_results.get(page_idx)
        if not r:
            return None
        med = r.get("medicus") or {}
        if med.get("status") not in ("ok", "fuzzy"):
            return None
        pat = med.get("patient")
        if not pat:
            return None
        return re.sub(r"\D", "", pat["rodcis"]) or None

    def _auto_detect_boundaries(self):
        """Add a boundary wherever two consecutive confirmed pages belong to different patients.

        Pages without a confirmed patient are skipped. Patients are compared
        by the birth number stored in Medicus, not by the raw OCR text, so a
        page whose OCR was fuzzy-corrected to the same patient does not
        create a false boundary.
        """
        prev_rc = None
        for pos, page_idx in enumerate(self.page_order):
            rc = self._confirmed_rc_of_page(page_idx)
            if rc is None:
                continue
            if prev_rc is not None and rc != prev_rc:
                self.boundaries.add(pos)
            prev_rc = rc

    # ── Worker callbacks ──────────────────────────────────────────────────────
    def _on_ocr_done(self, page_idx: int):
        """Take over one finished OCR result (runs on the UI thread)."""
        self.ocr_results[page_idx] = self.ocr_worker.results[page_idx]
        # Auto-detection runs once every remaining page has a result and the
        # user has not changed the boundaries yet. The check goes by the
        # pages still present, so deleting pages does not prevent it.
        all_done = all(p in self.ocr_results for p in self.page_order)
        if all_done and self.boundaries == {0}:
            self._auto_detect_boundaries()
        self._redraw()

    def _on_thumb_done(self, page_idx: int):
        """Take over one finished thumbnail (runs on the UI thread)."""
        self._rebuild_photo(page_idx)
        self._redraw()

    def _rebuild_photo(self, page_idx: int):
        """Make sure a PhotoImage exists for the page at its current rotation."""
        rot = self.rotations.get(page_idx, 0)
        key = (page_idx, rot)
        if key in self._photo_cache:
            return
        if rot == 0:
            # No rotation — use the pre-rendered thumbnail.
            pil = self.thumb_worker.get(page_idx)
            if pil is None:
                return
            self._photo_cache[key] = ImageTk.PhotoImage(pil)
            return
        # Rotated page — re-render from the PDF with the right dimensions.
        page = self.doc[page_idx]
        rect = page.rect
        # After a 90°/270° rotation width and height swap.
        if rot % 180 == 90:
            eff_w, eff_h = rect.height, rect.width
        else:
            eff_w, eff_h = rect.width, rect.height
        scale = min(self.THUMB_W / eff_w, self.THUMB_H / eff_h)
        mat = fitz.Matrix(scale, scale).prerotate(rot)
        pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
        canvas = Image.new("RGB", (self.THUMB_W, self.THUMB_H), (38, 38, 38))
        canvas.paste(img, ((self.THUMB_W - img.width) // 2, (self.THUMB_H - img.height) // 2))
        self._photo_cache[key] = ImageTk.PhotoImage(canvas)

    # ── Keyboard ──────────────────────────────────────────────────────────────
    def _on_key(self, event):
        """Translate a key press into an action."""
        ks = event.keysym
        kc = event.keycode
        # Numpad keycodes (Windows): 96=KP0 97=KP1 ... 105=KP9 110=KP.
        # NumLock ON  → keysym='1'..'9', keycode=97..105
        # NumLock OFF → keysym=Left/Clear/Right/Home/Up/Prior/Down
        numpad = {
            100: "num4", 101: "num5", 102: "num6",
            103: "num7", 105: "num9",
            97: "num1", 99: "num3", 110: "numdot",
            111: "numslash", 106: "numstar", 109: "numminus",
        }
        action = numpad.get(kc) or {
            "Left": "num4", "Right": "num6",
            "Home": "num7", "Prior": "num9",
            "Clear": "num5", "End": "num1",
            "Next": "num3", "Delete": "numdot",
            "space": "num5",
            "KP_Divide": "numslash", "KP_Multiply": "numstar",
            "slash": "numslash", "asterisk": "numstar",
            "KP_Subtract": "numminus", "minus": "numminus",
        }.get(ks)
        if action == "num4":
            self._move_cursor(-1)
        elif action == "num6":
            self._move_cursor(1)
        elif action == "num7":
            self._move_cursor(-COLS)
        elif action == "num9":
            self._move_cursor(COLS)
        elif action == "num5":
            self._toggle_boundary()
        elif action == "num1":
            self._move_page(-1)
        elif action == "num3":
            self._move_page(1)
        elif action == "numslash":
            self._rotate_page(90)  # CCW
        elif action == "numstar":
            self._rotate_page(-90)  # CW
        elif action == "numdot":
            self._delete_page()
        elif action == "numminus":
            self._open_patient_picker()
        elif ks in ("Return", "KP_Enter"):
            self._export()
        elif ks == "Escape":
            self.root.quit()

    # ── Movement and manipulation ─────────────────────────────────────────────
    def _scroll_to_cursor(self):
        """Adjust ``scroll`` so that the cursor slot is among the COLS visible ones."""
        if self.cursor < self.scroll:
            self.scroll = self.cursor
        elif self.cursor >= self.scroll + COLS:
            self.scroll = self.cursor - COLS + 1

    def _move_cursor(self, delta: int):
        """Move the cursor by *delta* positions, clamped to the page list."""
        n = len(self.page_order)
        self.cursor = max(0, min(n - 1, self.cursor + delta))
        self._scroll_to_cursor()
        self._redraw()

    def _toggle_boundary(self):
        """Toggle the group boundary before the cursor page (never before page 0)."""
        pos = self.cursor
        if pos == 0:
            return
        if pos in self.boundaries:
            self.boundaries.discard(pos)
        else:
            self.boundaries.add(pos)
        self._redraw()

    def _rotate_page(self, delta: int):
        """Rotate the cursor page by *delta* degrees (display now, applied on export)."""
        page_idx = self.page_order[self.cursor]
        rot = (self.rotations.get(page_idx, 0) + delta) % 360
        self.rotations[page_idx] = rot
        self._rebuild_photo(page_idx)
        self._redraw()

    @staticmethod
    def _boundaries_after_delete(boundaries: set[int], pos: int, new_len: int) -> set[int]:
        """Return the boundary set after the page at *pos* has been removed.

        Boundaries behind *pos* move one position left. A boundary at *pos*
        itself stays, so the remaining pages of that group keep their group
        instead of merging into the previous patient. Boundaries that end up
        beyond the shortened list are dropped, and position 0 always starts
        a group.
        """
        shifted = {b - 1 if b > pos else b for b in boundaries}
        result = {b for b in shifted if b < new_len}
        result.add(0)
        return result

    def _delete_page(self):
        """Remove the cursor page from the working order (the last page cannot be removed)."""
        if len(self.page_order) == 1:
            return
        pos = self.cursor
        self.page_order.pop(pos)
        self.boundaries = self._boundaries_after_delete(
            self.boundaries, pos, len(self.page_order)
        )
        self.cursor = min(pos, len(self.page_order) - 1)
        self._scroll_to_cursor()
        self._redraw()

    def _open_patient_picker(self):
        """Let the user assign a patient to the cursor page by hand."""
        page_idx = self.page_order[self.cursor]
        pos = self.cursor

        def on_select(patient: dict):
            rc_digits = re.sub(r"\D", "", patient["rodcis"])
            result = {
                "rc": rc_digits,
                "medicus": {"status": "ok", "patient": patient},
                "tesseract_text": None,
                "claude_raw": None,
            }
            self.ocr_results[page_idx] = result
            # Stored in the worker too, so that the choice is cached.
            self.ocr_worker.results[page_idx] = result
            self.ocr_worker._save_cache()
            self._update_boundaries_around(pos)
            self._redraw()

        PatientPickerDialog(self.root, on_select)

    def _update_boundaries_around(self, pos: int):
        """Add or remove the boundaries next to *pos* according to the confirmed patients.

        A boundary is touched only when both neighbouring pages have a
        confirmed patient.
        """
        def confirmed_rc(p: int) -> Optional[str]:
            return self._confirmed_rc_of_page(self.page_order[p])

        n = len(self.page_order)
        # Each pair is (position of the left page, position of the right page);
        # the boundary in question sits at the right page's position.
        for left, right in ((pos - 1, pos), (pos, pos + 1)):
            if left < 0 or right >= n:
                continue
            rc_left = confirmed_rc(left)
            rc_right = confirmed_rc(right)
            if rc_left and rc_right:
                if rc_left != rc_right:
                    self.boundaries.add(right)
                else:
                    self.boundaries.discard(right)

    def _move_page(self, delta: int):
        """Swap the cursor page with its neighbour at distance *delta*; the cursor follows the page."""
        n = len(self.page_order)
        pos = self.cursor
        new_pos = pos + delta
        if new_pos < 0 or new_pos >= n:
            return
        self.page_order[pos], self.page_order[new_pos] = (
            self.page_order[new_pos], self.page_order[pos]
        )
        self.cursor = new_pos
        self._scroll_to_cursor()
        self._redraw()

    # ── Groups ────────────────────────────────────────────────────────────────
    def _group_of_pos(self) -> list[int]:
        """Return the group index for every position in ``page_order``."""
        result = []
        gi = 0
        for pos in range(len(self.page_order)):
            if pos in self.boundaries and pos > 0:
                gi += 1
            result.append(gi)
        return result

    def _get_groups(self) -> list[list[int]]:
        """Return the groups, each a list of page indices in ``page_order`` order."""
        groups: list[list[int]] = []
        current: list[int] = []
        for pos, page_idx in enumerate(self.page_order):
            if pos in self.boundaries and current:
                groups.append(current)
                current = []
            current.append(page_idx)
        if current:
            groups.append(current)
        return groups

    def _best_medicus(self, pages: list[int]) -> Optional[dict]:
        """Return the first exact Medicus match among *pages*, else the first fuzzy one."""
        for status in ("ok", "fuzzy"):
            for p in pages:
                r = self.ocr_results.get(p)
                if r and r.get("medicus") and r["medicus"].get("status") == status:
                    return r["medicus"]
        return None

    # ── Export ────────────────────────────────────────────────────────────────
    def _export(self):
        """Write every group to SPLIT_DIR as its own PDF and report the outcome."""
        groups = self._get_groups()
        names = []
        try:
            SPLIT_DIR.mkdir(parents=True, exist_ok=True)
            for i, pages in enumerate(groups, 1):
                name = _format_filename(i, self._best_medicus(pages))
                out_doc = fitz.open()
                try:
                    for page_idx in pages:
                        out_doc.insert_pdf(self.doc, from_page=page_idx, to_page=page_idx)
                        rot = self.rotations.get(page_idx, 0)
                        if rot:
                            out_doc[-1].set_rotation((out_doc[-1].rotation - rot) % 360)
                    out_doc.save(str(SPLIT_DIR / name))
                finally:
                    out_doc.close()
                names.append(f"{name} ({len(pages)} str.)")
                print(f" Exportováno: {name}")
        except Exception as e:
            # Without this the failure would be visible on stderr only.
            messagebox.showerror(
                "Export selhal",
                f"Exportováno {len(names)} z {len(groups)} skupin.\n\n{e}",
            )
            return
        messagebox.showinfo(
            "Export hotov",
            f"Exportováno {len(groups)} skupin do:\n{SPLIT_DIR}\n\n" + "\n".join(names),
        )

    # ── Drawing ───────────────────────────────────────────────────────────────
    def _info_lines(self, result: Optional[dict]) -> tuple:
        """Return ``(rc_line, pat_line, claude_line, colour)`` for a page's info strip."""
        if result is None:
            return "⏳ OCR probíhá…", "", "", C_LOADING
        rc = result.get("rc")
        rc_line = f"RČ: {rc}" if rc else "RČ: nenalezeno"
        med = result.get("medicus")
        if med:
            s = med["status"]
            p = med.get("patient")
            if s == "ok" and p:
                pat_line = f"{p['prijmeni']} {p['jmeno']}"
                stat_color = C_OK
            elif s == "fuzzy" and p:
                pat_line = f"~ {p['prijmeni']} {p['jmeno']}"
                stat_color = C_FUZZY
            elif s == "not_found":
                pat_line = "Nenalezen v Medicus"
                stat_color = C_NONE
            else:
                pat_line = "Medicus offline"
                stat_color = C_OFFLINE
        elif rc:
            pat_line = "Ověřuji…"
            stat_color = C_LOADING
        else:
            pat_line = ""
            stat_color = C_NONE
        usage = result.get("claude_usage")
        if usage:
            # claude-sonnet-4-6: $3/MTok input, $15/MTok output
            cost_usd = (usage["input_tokens"] * 3 + usage["output_tokens"] * 15) / 1_000_000
            cost_czk = cost_usd * 23
            claude_line = (
                f"Claude: {usage['input_tokens']}+{usage['output_tokens']} tok "
                f"${cost_usd:.4f} (~{cost_czk:.2f} Kč)"
            )
        elif result.get("claude_raw") is not None:
            claude_line = "Claude: ✓ (cena nezaznamenána)"
        else:
            claude_line = ""
        return rc_line, pat_line, claude_line, stat_color

    def _redraw(self):
        """Repaint the visible slots and refresh the status bar."""
        c = self.canvas
        c.delete("all")
        n = len(self.page_order)
        group_of = self._group_of_pos()
        # Counted per page still present (ocr_results is keyed by original
        # page index, not by position).
        ocr_done = sum(1 for p in self.page_order if p in self.ocr_results)
        # Background
        c.create_rectangle(0, 0, self.CANVAS_W, self.CANVAS_H, fill=BG, outline="")
        slot_h = PAD + self.THUMB_H + PAD
        for col in range(COLS):
            pos = self.scroll + col
            if pos >= n:
                break
            page_idx = self.page_order[pos]
            gi = group_of[pos]
            x0 = col * (self.SLOT_W + BORDER_W)
            # ── Slot background ──────────────────────────────────────────────
            c.create_rectangle(
                x0, 0, x0 + self.SLOT_W, slot_h,
                fill=GROUP_COLORS[gi % len(GROUP_COLORS)], outline=""
            )
            # ── Thumbnail ────────────────────────────────────────────────────
            rot = self.rotations.get(page_idx, 0)
            photo = self._photo_cache.get((page_idx, rot))
            if photo:
                c.create_image(x0 + PAD, PAD, anchor="nw", image=photo)
            else:
                c.create_text(
                    x0 + self.SLOT_W // 2, PAD + self.THUMB_H // 2,
                    text=f"\nstr. {pos + 1}",
                    fill=C_LOADING, font=("Consolas", 18), justify="center"
                )
            # ── Cursor ───────────────────────────────────────────────────────
            if pos == self.cursor:
                c.create_rectangle(
                    x0 + 2, 2, x0 + self.SLOT_W - 2, slot_h - 2,
                    outline=C_CURSOR, width=5
                )
            # ── Info strip ───────────────────────────────────────────────────
            y_info = slot_h
            c.create_rectangle(
                x0, y_info, x0 + self.SLOT_W, y_info + INFO_H,
                fill=BG_INFO, outline=""
            )
            rc_line, pat_line, claude_line, stat_color = self._info_lines(
                self.ocr_results.get(page_idx)
            )
            c.create_text(
                x0 + 8, y_info + 6,
                text=f"str. {pos + 1}/{n} (orig: {page_idx + 1})",
                anchor="nw", fill=C_DIM, font=("Consolas", 10)
            )
            c.create_text(
                x0 + 8, y_info + 26,
                text=rc_line,
                anchor="nw", fill=stat_color, font=("Consolas", 13, "bold")
            )
            c.create_text(
                x0 + 8, y_info + 52,
                text=pat_line,
                anchor="nw", fill=stat_color, font=("Consolas", 14, "bold")
            )
            if claude_line:
                c.create_text(
                    x0 + 8, y_info + 82,
                    text=claude_line,
                    anchor="nw", fill=C_DIM, font=("Consolas", 9)
                )
            # ── Separator to the right of this slot ──────────────────────────
            if col < COLS - 1:
                is_new = (pos + 1) in self.boundaries
                x_sep = x0 + self.SLOT_W
                c.create_rectangle(
                    x_sep, 0, x_sep + BORDER_W, self.CANVAS_H,
                    fill=C_BOUNDARY if is_new else C_SAME, outline=""
                )
                if is_new:
                    c.create_text(
                        x_sep + BORDER_W // 2, self.CANVAS_H // 2,
                        text="\nNOVÝ",
                        fill="white", font=("Consolas", 7, "bold"), justify="center"
                    )
        # ── Status bar at the top ─────────────────────────────────────────────
        groups = self._get_groups()
        fields = [
            f"str. {self.cursor + 1}/{n}",
            f"skupiny: {len(groups)}",
            f"OCR: {ocr_done}/{n}",
            f"{self.pdf_path.name}",
        ]
        # The fields are separated so that they do not run into each other.
        self.top_label.config(text=" " + " │ ".join(fields))
# ── Vstup ─────────────────────────────────────────────────────────────────────
def main():
    """Pick the input PDF and run the splitter window.

    The PDF comes from the first command-line argument, from
    PATH_TO_TESTFILE when TESTOVANI is set, or from a file-open dialog.
    Exits with status 0 when the dialog is cancelled and with status 1
    when the chosen file does not exist.
    """
    root = tk.Tk()
    root.withdraw()  # keep the main window hidden while the file is chosen
    cli_args = sys.argv[1:]
    if cli_args:
        pdf_path = Path(cli_args[0])
    elif TESTOVANI:
        pdf_path = Path(PATH_TO_TESTFILE)
    else:
        from tkinter import filedialog
        chosen = filedialog.askopenfilename(
            title="Vyber vstupní PDF",
            initialdir=str(_RICOH),
            filetypes=[("PDF soubory", "*.pdf")],
        )
        if not chosen:
            root.destroy()
            sys.exit(0)
        pdf_path = Path(chosen)
    if not pdf_path.exists():
        print(f"Soubor nenalezen: {pdf_path}")
        root.destroy()
        sys.exit(1)
    root.deiconify()
    app = SplitterUI(root, pdf_path)
    root.mainloop()


if __name__ == "__main__":
    main()