841 lines
32 KiB
Python
841 lines
32 KiB
Python
"""
|
||
rozdelit_pdf.py — Split a multi-page PDF into per-patient groups.

Usage:
    python rozdelit_pdf.py file.pdf
    (without an argument a file-selection dialog opens)

Keys (numeric keypad / equivalent key):
    4 / Left      cursor ←
    6 / Right     cursor →
    7 / Home      jump ← by 4 pages
    9 / PgUp      jump → by 4 pages
    5 / Space     toggle a patient boundary before the current page
    1 / End       move the page left (swap)
    3 / PgDn      move the page right (swap)
    /             rotate the page counter-clockwise
    *             rotate the page clockwise
    . / Del       delete the page
    Enter         export all groups to Split/
    Esc           quit
|
||
"""
|
||
|
||
import sys
|
||
import os
|
||
import io
|
||
import re
|
||
import json
|
||
import threading
|
||
from pathlib import Path
|
||
from typing import Optional
|
||
|
||
import tkinter as tk
|
||
from tkinter import messagebox
|
||
|
||
from PIL import Image, ImageTk
|
||
import fitz # PyMuPDF
|
||
|
||
# ── Cesty ─────────────────────────────────────────────────────────────────────
|
||
|
||
# Project root: two directory levels above this file (.../Medevio/). It is put
# on sys.path so that the project-local ``Knihovny`` package imports below
# resolve no matter which directory the script is started from.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT))

from Knihovny.najdi_medicus import get_medicus_config
from Knihovny.najdi_dropbox import get_dropbox_root

_DROPBOX = Path(get_dropbox_root())
# Scanner drop folder inside Dropbox. The path is joined segment by segment
# rather than as one backslash-separated literal, which only resolves to
# nested directories on Windows.
_RICOH = _DROPBOX / "Ordinace" / "Dokumentace_ke_zpracování" / "Ricoh Fi-8040"
# Destination directory for the exported per-patient PDFs.
SPLIT_DIR = _RICOH / "Split"
|
||
|
||
# ── Env ───────────────────────────────────────────────────────────────────────
|
||
|
||
def _load_env(env_path=None):
    """Load ``KEY=VALUE`` pairs from a dotenv-style file into ``os.environ``.

    Args:
        env_path: File to read. Defaults to ``ROOT / ".env"``.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped, and so are lines whose key is empty. A leading UTF-8 byte-order
    mark is ignored and one pair of matching quotes around a value is
    removed. Values from the file overwrite variables that are already set.
    A missing file is silently ignored.
    """
    if env_path is None:
        env_path = ROOT / ".env"
    env_path = Path(env_path)
    if not env_path.exists():
        return

    # "utf-8-sig" drops a BOM that would otherwise become part of the first key.
    for raw_line in env_path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        key = key.strip()
        value = value.strip()
        if not key:
            # os.environ rejects an empty variable name.
            continue
        # KEY="value" / KEY='value' -> value
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        os.environ[key] = value
|
||
|
||
# Populate os.environ from the project's .env file at import time, before any
# code below reads environment variables (e.g. ANTHROPIC_API_KEY).
_load_env()
|
||
|
||
# ── Regex ─────────────────────────────────────────────────────────────────────
|
||
|
||
# Developer switch: when True and no command-line argument is given, main()
# opens PATH_TO_TESTFILE instead of showing the file dialog.
TESTOVANI = False
PATH_TO_TESTFILE = r"U:\Dropbox\Ordinace\Dokumentace_ke_zpracování\Ricoh Fi-8040\2026-05-04-07-50-17 - Copy.pdf"

# Tesseract executable handed to pytesseract by the OCR worker.
TESSERACT_PATH = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Birth number (rodné číslo, "RČ") written with a slash, e.g. 710920/3893.
RC_RE_SPLIT = re.compile(r"\b(\d{6})\s*/\s*(\d{3,4})\b")

# Birth number that follows a keyword. Tesseract misreads diacritics, so the
# ASCII spellings of the keywords are accepted.
RC_RE_KEYWORD = re.compile(
    r"(?:C\.?P\.?|R\.?C\.?|RC|ID|NAR)\s*[:\.]?\s*(\d{9,10})\b",
    flags=re.IGNORECASE,
)

# Fallback: any standalone block of 9-10 digits.
RC_RE_PLAIN = re.compile(r"\b(\d{9,10})\b")
|
||
|
||
def _rc_valid(digits: str) -> bool:
    """Return True when *digits* has the shape of a Czech birth number.

    Only plausibility is checked, not the checksum: the value must consist of
    9 or 10 ASCII digits whose month field (characters 2-3) and day field
    (characters 4-5) are in range.

    Accepted month encodings:
      * 01-12 and 51-62 (month + 50) for both lengths,
      * 21-32 and 71-82 (month + 20 / month + 70) for 10-digit numbers only;
        these alternative encodings exist since 2004, when 9-digit numbers
        were no longer being issued.

    Any other input, including non-digit characters, yields False.
    """
    if len(digits) not in (9, 10):
        return False
    if not (digits.isascii() and digits.isdigit()):
        # int() below would raise on stray letters or exotic digits.
        return False

    month = int(digits[2:4])
    day = int(digits[4:6])
    offsets = (0, 20, 50, 70) if len(digits) == 10 else (0, 50)
    month_ok = any(1 <= month - offset <= 12 for offset in offsets)
    return month_ok and 1 <= day <= 31
|
||
|
||
def _extract_rc(text: str) -> Optional[str]:
    """Find a birth number in OCR text and return its digits, or None.

    Strategies, in priority order:
      1. the slash form (``710920/3893``) — the first hit that passes
         ``_rc_valid``; when none passes, the first hit is still returned so
         that a number with an OCR-damaged digit can reach the fuzzy lookup,
      2. a 9-10 digit number following a keyword (``RC_RE_KEYWORD``),
      3. any standalone 9-10 digit number (``RC_RE_PLAIN``).
    Hits of strategies 2 and 3 are accepted only when ``_rc_valid`` agrees.
    """
    # 1. Slash form: prefer a plausible hit over whatever happens to come first.
    slash_hits = [m.group(1) + m.group(2) for m in RC_RE_SPLIT.finditer(text)]
    for candidate in slash_hits:
        if _rc_valid(candidate):
            return candidate
    if slash_hits:
        return slash_hits[0]

    # 2. keyword + number, then 3. plain fallback.
    for pattern in (RC_RE_KEYWORD, RC_RE_PLAIN):
        for match in pattern.finditer(text):
            if _rc_valid(match.group(1)):
                return match.group(1)
    return None
|
||
|
||
def _rc_candidates(rc: str) -> list[str]:
    """Return every string one OCR-style edit away from *rc*.

    An edit is the deletion of one character, the insertion of a ``"0"`` at
    any position, or the replacement of one digit by a visually similar one.
    *rc* itself is excluded; only results of length 9 or 10 are kept. The
    result is sorted.
    """
    # Digits that OCR tends to confuse; a digit without an entry has no
    # substitutes.
    lookalikes: dict[str, tuple[str, ...]] = {
        "0": ("8", "6", "5"),
        "1": ("7", "6"),
        "3": ("8",),
        "5": ("6", "0"),
        "6": ("5", "0", "1"),
        "7": ("1",),
        "8": ("0", "3"),
    }

    variants: set[str] = set()
    for pos, digit in enumerate(rc):
        head, tail = rc[:pos], rc[pos + 1:]
        variants.add(head + tail)  # one character dropped
        variants.update(head + swap + tail for swap in lookalikes.get(digit, ()))
    # A "0" inserted at every gap, including both ends.
    variants.update(rc[:gap] + "0" + rc[gap:] for gap in range(len(rc) + 1))

    variants.discard(rc)
    return sorted(v for v in variants if len(v) in (9, 10))
|
||
|
||
def _rc_checksum_ok(rc: str) -> bool:
    """Return True when *rc* is a 10-digit birth number with a valid check digit.

    Non-digit characters (such as the slash) are ignored. The check digit is
    the first nine digits modulo 11. A remainder of 10 cannot be written as a
    single digit; such numbers carry the check digit 0, which a plain
    "whole number divisible by 11" test would reject. Values that do not have
    exactly 10 digits return False.
    """
    digits = re.sub(r"\D", "", rc)
    if len(digits) != 10:
        return False
    remainder = int(digits[:9]) % 11
    # remainder % 10 maps 0..9 to themselves and the special remainder 10 to 0.
    return int(digits[9]) == remainder % 10
|
||
|
||
def _rc_candidates_level2(rc: str) -> list[str]:
    """Return candidates two OCR edits away from *rc*, filtered by checksum.

    Every single-edit candidate is expanded once more with
    ``_rc_candidates``; the single-edit candidates and *rc* itself are then
    removed. The remaining set would be too large to look up, so only entries
    accepted by ``_rc_checksum_ok`` are returned, sorted.
    """
    one_edit = set(_rc_candidates(rc))
    two_edits = {
        variant
        for first in one_edit
        for variant in _rc_candidates(first)
    }
    two_edits.difference_update(one_edit)
    two_edits.discard(rc)
    return sorted(
        v for v in two_edits
        if len(v) in (9, 10) and _rc_checksum_ok(v)
    )
|
||
|
||
# ── Medicus ───────────────────────────────────────────────────────────────────
|
||
|
||
def _verify_medicus(rc_digits: str) -> dict:
    """Look a birth number up in the ``KAR`` table of the Medicus database.

    The exact number is tried first, then the single-edit candidates from
    ``_rc_candidates`` and finally the two-edit candidates from
    ``_rc_candidates_level2``; the first hit wins.

    Args:
        rc_digits: Birth number as digits only (no slash).

    Returns:
        A dict with a ``"status"`` key:
          * ``"ok"`` — exact match; ``"patient"`` holds idpac/prijmeni/jmeno/rodcis,
          * ``"fuzzy"`` — a candidate matched; also carries ``"rc_corrected"``,
          * ``"not_found"`` — nothing matched; ``"patient"`` is None,
          * ``"offline"`` — any exception (missing driver, connection or query
            failure); ``"patient"`` is None and ``"error"`` has the message.
    """
    try:
        import fdb

        cfg = get_medicus_config()
        # NOTE(review): the fallbacks are the credentials that used to be
        # hard-coded here; set MEDICUS_DB_USER / MEDICUS_DB_PASSWORD (e.g. in
        # .env) to override them.
        con = fdb.connect(
            dsn=cfg.dsn,
            user=os.environ.get("MEDICUS_DB_USER", "SYSDBA"),
            password=os.environ.get("MEDICUS_DB_PASSWORD", "masterkey"),
            charset="win1250",
        )
        try:
            cur = con.cursor()

            def _lookup(rc: str) -> Optional[dict]:
                """Return the patient whose slash-less RODCIS equals *rc*."""
                cur.execute(
                    "SELECT IDPAC, PRIJMENI, JMENO, RODCIS FROM KAR "
                    "WHERE REPLACE(RODCIS, '/', '') = ?", (rc,)
                )
                row = cur.fetchone()
                if not row:
                    return None
                idpac, prijmeni, jmeno, rodcis = row[:4]
                # "or ''" keeps a NULL column from raising on .strip(), which
                # the outer handler would misreport as "offline".
                return {
                    "idpac": idpac,
                    "prijmeni": (prijmeni or "").strip(),
                    "jmeno": (jmeno or "").strip(),
                    "rodcis": (rodcis or "").strip(),
                }

            patient = _lookup(rc_digits)
            if patient:
                return {"status": "ok", "patient": patient}

            # The two-edit candidates are generated only when every
            # single-edit lookup failed.
            for generate in (_rc_candidates, _rc_candidates_level2):
                for candidate in generate(rc_digits):
                    patient = _lookup(candidate)
                    if patient:
                        return {
                            "status": "fuzzy",
                            "rc_corrected": candidate,
                            "patient": patient,
                        }
            return {"status": "not_found", "patient": None}
        finally:
            con.close()
    except Exception as e:
        return {"status": "offline", "patient": None, "error": str(e)}
|
||
|
||
# ── Jméno výstupního souboru ──────────────────────────────────────────────────
|
||
|
||
def _format_filename(group_idx: int, medicus: Optional[dict]) -> str:
    """Build the output file name for one exported group.

    Args:
        group_idx: Number of the group; rendered zero-padded to three digits.
        medicus: Result dict of ``_verify_medicus`` or None.

    Returns:
        ``"<rc> <surname>, <first name> split_NNN.pdf"`` when *medicus*
        carries a patient, otherwise ``"split_NNN.pdf"``. Characters that are
        not allowed in Windows file names are replaced by ``_`` in the
        patient part.
    """
    suffix = f"split_{group_idx:03d}.pdf"
    patient = medicus.get("patient") if medicus else None
    if not patient:
        return suffix

    rc = re.sub(r"\D", "", patient["rodcis"])
    label = f"{rc} {patient['prijmeni']}, {patient['jmeno']}"
    # The names come straight from the database; a stray slash or quote must
    # not break (or redirect) the export path.
    label = re.sub(r'[<>:"/\\|?*\x00-\x1f]', "_", label)
    return f"{label} {suffix}"
|
||
|
||
# ── OCR worker (pozadí) ───────────────────────────────────────────────────────
|
||
|
||
class OcrWorker:
    """Background OCR of PDF pages: Tesseract -> Claude Vision (fallback) -> Medicus.

    Per-page results are kept in ``results`` (page index -> dict with the
    keys ``rc``, ``medicus``, ``tesseract_text`` and ``claude_raw``) and
    cached in a JSON file next to the input PDF.

    NOTE(review): SplitterUI hands the same ``fitz.Document`` to this worker,
    to ThumbnailWorker and to its own export code. PyMuPDF documents that it
    does not support multithreaded use — confirm whether access to the
    document needs to be serialised.
    """

    def __init__(self, doc: "fitz.Document", cache_path: Path, on_page_done):
        """Store the collaborators and load any existing cache.

        Args:
            doc: Open PDF whose pages are to be recognised.
            cache_path: JSON file used to persist ``results``.
            on_page_done: ``callback(page_idx: int)`` invoked from the worker
                thread after a page result was stored.
        """
        self.doc = doc
        self.cache_path = cache_path
        self.on_page_done = on_page_done
        self.results: dict[int, dict] = {}
        self._stop = threading.Event()
        self._lock = threading.Lock()
        self._load_cache()

    def _load_cache(self):
        """Fill ``results`` from the cache file; a broken file is only reported."""
        if not self.cache_path.exists():
            return
        try:
            data = json.loads(self.cache_path.read_text(encoding="utf-8"))
            # JSON object keys are strings; page indices are ints.
            self.results = {int(k): v for k, v in data.items()}
            print(f"[OCR cache] načteno {len(self.results)} stránek z {self.cache_path.name}")
        except Exception as e:
            print(f"[OCR cache] chyba čtení: {e}")

    def _save_cache(self):
        """Write all results to the cache file."""
        with self._lock:
            self.cache_path.write_text(
                json.dumps(self.results, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )

    def start(self):
        """Start processing on a daemon thread."""
        threading.Thread(target=self._run, daemon=True).start()

    def stop(self):
        """Ask the worker to stop before it begins the next page."""
        self._stop.set()

    @staticmethod
    def _is_offline(result: dict) -> bool:
        """Return True for a result whose Medicus lookup failed as "offline"."""
        medicus = result.get("medicus") or {}
        return bool(result.get("rc")) and medicus.get("status") == "offline"

    def _run(self):
        """Process every page that has no usable cached result.

        A cached result whose Medicus status is "offline" is verified again
        (without repeating the OCR), so a database outage during an earlier
        run does not stay in the cache forever. A failure on one page is
        printed and does not stop the remaining pages.
        """
        import pytesseract

        pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH

        medicus_down = False  # stop re-verifying after the first failed retry
        for i in range(len(self.doc)):
            if self._stop.is_set():
                break
            cached = self.results.get(i)
            try:
                if cached is None:
                    result = self._process_page(i, pytesseract)
                else:
                    if medicus_down or not self._is_offline(cached):
                        continue  # cache hit
                    medicus = _verify_medicus(cached["rc"])
                    status = medicus.get("status")
                    if status == "offline":
                        medicus_down = True
                        continue
                    if status in ("ok", "fuzzy"):
                        result = dict(cached, medicus=medicus)
                    else:
                        # Unknown number: run the whole pipeline so that the
                        # Claude fallback gets its chance.
                        result = self._process_page(i, pytesseract)
            except Exception as e:
                print(f"[OCR str.{i+1}] {e}")
                continue

            self.results[i] = result
            self._save_cache()
            self.on_page_done(i)

    def _process_page(self, i: int, pytesseract) -> dict:
        """Render page *i*, run the OCR pipeline and return its result dict."""
        page = self.doc[i]
        mat = fitz.Matrix(2.0, 2.0)  # 144 DPI — sufficient for OCR
        pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

        # 1. Tesseract
        rc = None
        tess_text = None
        try:
            tess_text = pytesseract.image_to_string(img, lang="ces")
            rc = _extract_rc(tess_text)
        except Exception as e:
            print(f"[OCR str.{i+1}] Tesseract: {e}")

        # 2. Medicus — first attempt
        medicus = _verify_medicus(rc) if rc else None

        # 3. Claude Vision — when Tesseract found no birth number, or found
        #    one that Medicus does not know.
        claude_raw = None
        if not rc or (medicus and medicus.get("status") == "not_found"):
            try:
                rc_claude, claude_raw = self._claude_rc(img)
                if rc_claude:
                    medicus_claude = _verify_medicus(rc_claude)
                    if medicus_claude.get("status") in ("ok", "fuzzy"):
                        print(f"[OCR str.{i+1}] Claude opravil RČ: {rc} → {rc_claude}")
                        rc = rc_claude
                        medicus = medicus_claude
                    elif not rc:
                        # Unconfirmed, but better than nothing.
                        rc = rc_claude
                        medicus = medicus_claude
            except Exception as e:
                print(f"[OCR str.{i+1}] Claude: {e}")

        return {
            "rc": rc,
            "medicus": medicus,
            "tesseract_text": tess_text,
            "claude_raw": claude_raw,
        }

    def _claude_rc(self, img: "Image.Image") -> tuple[Optional[str], Optional[str]]:
        """Ask Claude Vision for the birth number on *img*.

        Returns:
            ``(digits_or_None, raw_reply)``. The reply is expected to be JSON
            with a ``rodne_cislo`` key; when it cannot be parsed the first
            item is None. Errors of the API call itself propagate.
        """
        import anthropic
        import base64

        buf = io.BytesIO()
        img.save(buf, format="JPEG", quality=80)
        b64 = base64.standard_b64encode(buf.getvalue()).decode()

        client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
        resp = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=100,
            messages=[{"role": "user", "content": [
                {"type": "image", "source": {"type": "base64", "media_type": "image/jpeg", "data": b64}},
                {"type": "text", "text": (
                    "Najdi rodné číslo na tomto naskenovaném dokumentu. "
                    "Vrať JSON: {\"rodne_cislo\": \"XXXXXXXXXX\"} nebo {\"rodne_cislo\": null}. "
                    "Jen JSON, nic jiného."
                )},
            ]}],
        )
        raw = resp.content[0].text.strip()
        # Strip a Markdown code fence if the reply was wrapped in one.
        raw = re.sub(r"^```\w*\n?", "", raw).rstrip("`").strip()
        try:
            value = json.loads(raw).get("rodne_cislo")
        except Exception:
            return None, raw
        # The number may come back as a JSON number instead of a string.
        if not value or not isinstance(value, (str, int)):
            return None, raw
        return re.sub(r"\D", "", str(value)) or None, raw
|
||
|
||
# ── Thumbnail worker (pozadí) ─────────────────────────────────────────────────
|
||
|
||
class ThumbnailWorker:
    """Render the pages of a PDF into fixed-size PIL thumbnails in the background.

    The daemon thread is started by the constructor. Finished thumbnails are
    available through ``get``.
    """

    def __init__(self, doc: "fitz.Document", thumb_w: int, thumb_h: int, on_thumb_done):
        """Remember the settings and start rendering.

        Args:
            doc: Open PDF to render.
            thumb_w: Thumbnail width in pixels.
            thumb_h: Thumbnail height in pixels.
            on_thumb_done: ``callback(page_idx: int)`` invoked from the worker
                thread after a thumbnail was stored.
        """
        self.doc = doc
        self.thumb_w = thumb_w
        self.thumb_h = thumb_h
        self.on_thumb_done = on_thumb_done
        self._cache: dict[int, "Image.Image"] = {}
        self._lock = threading.Lock()
        threading.Thread(target=self._run, daemon=True).start()

    def get(self, page_idx: int) -> "Optional[Image.Image]":
        """Return the thumbnail of *page_idx*, or None if not rendered yet."""
        with self._lock:
            return self._cache.get(page_idx)

    def _run(self):
        """Render all pages in order; a page that fails is reported and skipped."""
        for i in range(len(self.doc)):
            try:
                thumb = self._render(i)
            except Exception as e:
                # One unrenderable page must not leave all later slots empty.
                print(f"[Thumb str.{i+1}] {e}")
                continue
            with self._lock:
                self._cache[i] = thumb
            self.on_thumb_done(i)

    def _render(self, page_idx: int) -> "Image.Image":
        """Return page *page_idx* scaled to fit and centred on a fixed canvas."""
        canvas = Image.new("RGB", (self.thumb_w, self.thumb_h), (38, 38, 38))
        page = self.doc[page_idx]
        rect = page.rect
        if rect.width <= 0 or rect.height <= 0:
            # Degenerate page box: nothing to scale, show an empty slot.
            return canvas

        scale = min(self.thumb_w / rect.width, self.thumb_h / rect.height)
        pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), colorspace=fitz.csRGB)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

        # Centre the page image on the canvas.
        x = (self.thumb_w - img.width) // 2
        y = (self.thumb_h - img.height) // 2
        canvas.paste(img, (x, y))
        return canvas
|
||
|
||
# ── Barvy a rozměry ───────────────────────────────────────────────────────────
|
||
|
||
# Layout (pixels, except COLS).
COLS = 4        # page slots shown side by side
BORDER_W = 16   # width of the separator between slots
PAD = 8         # gap between a thumbnail and the edge of its slot
INFO_H = 108    # height of the info strip below the thumbnail
TOP_H = 44      # height of the status bar at the top
BOT_H = 44      # height of the key hints at the bottom

# The remaining dimensions are computed in SplitterUI.__init__ from the
# screen resolution.

# Backgrounds.
BG = "#1e1e1e"
BG_SLOT = "#262626"
BG_INFO = "#181818"

# Cursor frame and slot separators (boundary / same group).
C_CURSOR = "#4da6ff"
C_BOUNDARY = "#cc3333"
C_SAME = "#3a3a3a"

# Status colours of the info strip.
C_OK = "#4caf50"
C_FUZZY = "#ff9800"
C_NONE = "#f44336"
C_OFFLINE = "#888888"
C_LOADING = "#555555"

# Text.
C_TEXT = "#dddddd"
C_DIM = "#666666"

# Slot backgrounds, cycled by group index.
GROUP_COLORS = [
    "#1b2a3a", "#2a1b3a", "#1b3a2a", "#3a2a1b",
    "#2a3a1b", "#1b2a2a", "#3a1b2a", "#2a2a1b",
]
|
||
|
||
# ── Hlavní UI ─────────────────────────────────────────────────────────────────
|
||
|
||
class SplitterUI:
    """Keyboard-driven window for splitting a scanned PDF into patient groups.

    State:
        page_order: original page indices in their current display order.
        boundaries: positions in ``page_order`` that start a new group
            (position 0 is always a member).
        cursor: position of the selected page.
        scroll: position shown in the left-most slot.
        rotations: page index -> rotation in degrees (0/90/180/270,
            counter-clockwise) chosen by the user.
        ocr_results: page index -> result dict produced by OcrWorker.
    """

    def __init__(self, root: "tk.Tk", pdf_path: Path):
        """Open *pdf_path*, size the window to the screen and start the workers."""
        self.root = root
        self.pdf_path = pdf_path
        self.doc = fitz.open(str(pdf_path))
        n = len(self.doc)

        # State
        self.page_order: list[int] = list(range(n))
        self.boundaries: set[int] = {0}
        self.cursor: int = 0
        self.scroll: int = 0

        # Caches
        self.ocr_results: dict[int, dict] = {}
        self._photo_cache: dict[tuple, ImageTk.PhotoImage] = {}  # (page_idx, rot) -> photo
        self.rotations: dict[int, int] = {}

        # Geometry: COLS slots across the full screen width.
        sw = root.winfo_screenwidth()
        sh = root.winfo_screenheight()
        self.SLOT_W = (sw - (COLS - 1) * BORDER_W) // COLS
        self.THUMB_W = self.SLOT_W - 2 * PAD
        self.THUMB_H = int(self.THUMB_W * 842 / 595)  # A4 aspect ratio
        self.CANVAS_W = COLS * self.SLOT_W + (COLS - 1) * BORDER_W
        self.CANVAS_H = PAD + self.THUMB_H + PAD + INFO_H
        win_h = min(TOP_H + self.CANVAS_H + BOT_H, sh - 60)

        root.title(f"PDF Dělení — {pdf_path.name}")
        root.configure(bg=BG)
        root.geometry(f"{self.CANVAS_W}x{win_h}+0+0")

        self._build_ui()
        self._start_workers()

    # ── UI construction ──────────────────────────────────────────────────────

    def _build_ui(self):
        """Create the status bar, the canvas and the hint bar; bind the keys."""
        self.top_label = tk.Label(
            self.root, bg=BG, fg=C_TEXT,
            font=("Consolas", 13), anchor="w", padx=12
        )
        self.top_label.pack(fill="x", side="top", ipady=4)

        self.canvas = tk.Canvas(
            self.root, width=self.CANVAS_W, height=self.CANVAS_H,
            bg=BG, highlightthickness=0
        )
        self.canvas.pack(fill="both", expand=True)

        hints = (
            "4/6: navigace ←/→ 7/9: skok ×4 "
            "5/Space: hranice pacienta "
            "1/3: přesuň stránku "
            "/: otočit ↺CCW *: otočit ↻CW "
            "Del/.: smaž stránku "
            "Enter: exportuj Esc: konec"
        )
        self.bot_label = tk.Label(
            self.root, text=hints, bg=BG, fg=C_DIM,
            font=("Consolas", 11), anchor="center"
        )
        self.bot_label.pack(fill="x", side="bottom", ipady=6)

        self.root.bind("<KeyPress>", self._on_key)
        self.root.focus_set()
        self._redraw()

    # ── Workers ──────────────────────────────────────────────────────────────

    def _start_workers(self):
        """Start the OCR and thumbnail workers.

        Worker callbacks run on the worker threads, so they only schedule the
        real handler on the Tk event loop through ``root.after``.
        """
        cache_path = self.pdf_path.parent / (self.pdf_path.stem + "_ocr_cache.json")

        self.ocr_worker = OcrWorker(
            self.doc, cache_path,
            on_page_done=lambda idx: self.root.after(0, self._on_ocr_done, idx),
        )
        # Take over whatever the worker loaded from its cache.
        self.ocr_results.update(self.ocr_worker.results)
        self._auto_detect_boundaries()
        self.ocr_worker.start()

        self.thumb_worker = ThumbnailWorker(
            self.doc, self.THUMB_W, self.THUMB_H,
            on_thumb_done=lambda idx: self.root.after(0, self._on_thumb_done, idx),
        )

    @staticmethod
    def _confirmed_identity(result: Optional[dict]):
        """Return a hashable identity of the Medicus-confirmed patient, or None.

        The identity is the patient's ``idpac`` when present, otherwise the
        corrected (or, failing that, the recognised) birth number. None is
        returned unless the Medicus status is "ok" or "fuzzy".
        """
        if not result:
            return None
        rc = result.get("rc")
        medicus = result.get("medicus") or {}
        if not rc or medicus.get("status") not in ("ok", "fuzzy"):
            return None
        idpac = (medicus.get("patient") or {}).get("idpac")
        if idpac is not None:
            return ("idpac", idpac)
        return ("rc", medicus.get("rc_corrected") or rc)

    def _auto_detect_boundaries(self):
        """Add a boundary wherever two confirmed neighbours are different patients.

        Pages without a confirmed patient are skipped, so a boundary is placed
        at the first confirmed page of the next patient. Patients are compared
        by identity rather than by the raw OCR text: a page whose misread
        number was fuzzy-matched to the same patient must not start a group.
        """
        previous = None
        for pos, page_idx in enumerate(self.page_order):
            identity = self._confirmed_identity(self.ocr_results.get(page_idx))
            if identity is None:
                continue
            if previous is not None and identity != previous:
                self.boundaries.add(pos)
            previous = identity

    # ── Worker callbacks ─────────────────────────────────────────────────────

    def _on_ocr_done(self, page_idx: int):
        """Store the OCR result of *page_idx* and refresh the view.

        Boundaries are auto-detected once every remaining page has a result,
        provided no boundary has been set yet.
        """
        self.ocr_results[page_idx] = self.ocr_worker.results[page_idx]
        all_done = all(p in self.ocr_results for p in self.page_order)
        if all_done and self.boundaries == {0}:
            self._auto_detect_boundaries()
        self._redraw()

    def _on_thumb_done(self, page_idx: int):
        """Build the photo for *page_idx*; redraw only if the page is visible."""
        self._rebuild_photo(page_idx)
        if page_idx in self.page_order[self.scroll:self.scroll + COLS]:
            self._redraw()

    def _rebuild_photo(self, page_idx: int):
        """Create the Tk photo of *page_idx* for its current rotation, if missing.

        A thumbnail turned by 90° or 270° no longer has the slot's shape; it is
        shrunk to fit and centred instead of being stretched to the slot size.
        """
        pil = self.thumb_worker.get(page_idx)
        if pil is None:
            return
        rot = self.rotations.get(page_idx, 0)
        key = (page_idx, rot)
        if key in self._photo_cache:
            return

        size = (self.THUMB_W, self.THUMB_H)
        img = pil.rotate(rot, expand=True)
        if img.size != size:
            img.thumbnail(size, Image.LANCZOS)  # in place, keeps aspect ratio
            framed = Image.new("RGB", size, (38, 38, 38))
            framed.paste(
                img,
                ((size[0] - img.width) // 2, (size[1] - img.height) // 2),
            )
            img = framed
        self._photo_cache[key] = ImageTk.PhotoImage(img)

    # ── Keyboard ─────────────────────────────────────────────────────────────

    def _on_key(self, event):
        """Translate a key press into an action.

        Numpad keycodes (Windows): 96=KP0 97=KP1 ... 105=KP9 110=KP.
        NumLock ON  -> keysym='1'..'9', keycode=97..105
        NumLock OFF -> keysym=Left/Clear/Right/Home/Up/Prior/Down
        The keycode table is consulted first, then the keysym table.
        """
        ks = event.keysym
        kc = event.keycode

        by_keycode = {
            100: "num4", 101: "num5", 102: "num6",
            103: "num7", 105: "num9",
            97: "num1", 99: "num3", 110: "numdot",
            111: "numslash", 106: "numstar",
        }
        by_keysym = {
            "Left": "num4", "Right": "num6",
            "Home": "num7", "Prior": "num9",
            "Clear": "num5", "End": "num1",
            "Next": "num3", "Delete": "numdot",
            "space": "num5",
            "KP_Divide": "numslash", "KP_Multiply": "numstar",
            "slash": "numslash", "asterisk": "numstar",
        }
        action = by_keycode.get(kc) or by_keysym.get(ks)

        handlers = {
            "num4": lambda: self._move_cursor(-1),
            "num6": lambda: self._move_cursor(1),
            "num7": lambda: self._move_cursor(-COLS),
            "num9": lambda: self._move_cursor(COLS),
            "num5": self._toggle_boundary,
            "num1": lambda: self._move_page(-1),
            "num3": lambda: self._move_page(1),
            "numslash": lambda: self._rotate_page(90),   # CCW
            "numstar": lambda: self._rotate_page(-90),   # CW
            "numdot": self._delete_page,
        }
        if action in handlers:
            handlers[action]()
        elif ks in ("Return", "KP_Enter"):
            self._export()
        elif ks == "Escape":
            self.root.quit()

    # ── Movement and page manipulation ───────────────────────────────────────

    def _scroll_to_cursor(self):
        """Shift ``scroll`` just enough to bring the cursor into the visible slots."""
        if self.cursor < self.scroll:
            self.scroll = self.cursor
        elif self.cursor >= self.scroll + COLS:
            self.scroll = self.cursor - COLS + 1

    def _move_cursor(self, delta: int):
        """Move the cursor by *delta* positions, clamped to the page range."""
        n = len(self.page_order)
        self.cursor = max(0, min(n - 1, self.cursor + delta))
        self._scroll_to_cursor()
        self._redraw()

    def _toggle_boundary(self):
        """Toggle the group boundary before the cursor page (never before page 0)."""
        pos = self.cursor
        if pos == 0:
            return
        if pos in self.boundaries:
            self.boundaries.discard(pos)
        else:
            self.boundaries.add(pos)
        self._redraw()

    def _rotate_page(self, delta: int):
        """Rotate the cursor page by *delta* degrees (positive = counter-clockwise)."""
        page_idx = self.page_order[self.cursor]
        self.rotations[page_idx] = (self.rotations.get(page_idx, 0) + delta) % 360
        self._rebuild_photo(page_idx)
        self._redraw()

    def _delete_page(self):
        """Remove the cursor page from the working order; the last page stays.

        Boundaries after the removed position move one position to the left.
        A boundary on the removed position itself stays where it is, so that
        the rest of that group does not merge into the previous group; it is
        dropped only when no page is left at that position.
        """
        if len(self.page_order) <= 1:
            return
        pos = self.cursor
        self.page_order.pop(pos)
        n = len(self.page_order)

        shifted = {b - 1 if b > pos else b for b in self.boundaries}
        self.boundaries = {b for b in shifted if b < n}
        self.boundaries.add(0)  # the first page always starts a group

        self.cursor = min(pos, n - 1)
        # The cursor can only have moved left, never past the right edge.
        self.scroll = min(self.scroll, self.cursor)
        self._redraw()

    def _move_page(self, delta: int):
        """Swap the cursor page with the page *delta* positions away and follow it."""
        pos = self.cursor
        new_pos = pos + delta
        if not 0 <= new_pos < len(self.page_order):
            return
        order = self.page_order
        order[pos], order[new_pos] = order[new_pos], order[pos]
        self.cursor = new_pos
        self._scroll_to_cursor()
        self._redraw()

    # ── Groups ───────────────────────────────────────────────────────────────

    def _group_of_pos(self) -> list[int]:
        """Return the group index of every position in ``page_order``."""
        result = []
        group = 0
        for pos in range(len(self.page_order)):
            if pos > 0 and pos in self.boundaries:
                group += 1
            result.append(group)
        return result

    def _get_groups(self) -> list[list[int]]:
        """Return the groups as lists of page indices, in display order."""
        groups: list[list[int]] = []
        for pos, page_idx in enumerate(self.page_order):
            if not groups or pos in self.boundaries:
                groups.append([])
            groups[-1].append(page_idx)
        return groups

    def _best_medicus(self, pages: list[int]) -> Optional[dict]:
        """Return the first "ok" Medicus result among *pages*, else the first
        "fuzzy" one, else None."""
        for wanted in ("ok", "fuzzy"):
            for page_idx in pages:
                medicus = (self.ocr_results.get(page_idx) or {}).get("medicus")
                if medicus and medicus.get("status") == wanted:
                    return medicus
        return None

    # ── Export ───────────────────────────────────────────────────────────────

    def _export(self):
        """Write every group as its own PDF into SPLIT_DIR and report the result."""
        groups = self._get_groups()
        SPLIT_DIR.mkdir(parents=True, exist_ok=True)
        names = []
        for i, pages in enumerate(groups, 1):
            name = _format_filename(i, self._best_medicus(pages))
            out_doc = fitz.open()
            try:
                for page_idx in pages:
                    out_doc.insert_pdf(self.doc, from_page=page_idx, to_page=page_idx)
                    rot = self.rotations.get(page_idx, 0)
                    if rot:
                        # rot is counter-clockwise, a PDF page rotation is
                        # clockwise, hence the subtraction.
                        out_page = out_doc[-1]
                        out_page.set_rotation((out_page.rotation - rot) % 360)
                out_doc.save(str(SPLIT_DIR / name))
            finally:
                # Release the document even when inserting or saving fails.
                out_doc.close()
            names.append(f"{name} ({len(pages)} str.)")
            print(f" Exportováno: {name}")
        messagebox.showinfo(
            "Export hotov",
            f"Exportováno {len(groups)} skupin do:\n{SPLIT_DIR}\n\n" + "\n".join(names),
        )

    # ── Drawing ──────────────────────────────────────────────────────────────

    def _describe_ocr(self, result: Optional[dict]) -> tuple:
        """Return ``(rc_line, patient_line, colour)`` for the info strip."""
        if result is None:
            return "⏳ OCR probíhá…", "", C_LOADING

        rc = result.get("rc")
        rc_line = f"RČ: {rc}" if rc else "RČ: nenalezeno"
        med = result.get("medicus")
        if med:
            status = med["status"]
            patient = med.get("patient")
            if status == "ok" and patient:
                return rc_line, f"{patient['prijmeni']} {patient['jmeno']}", C_OK
            if status == "fuzzy" and patient:
                return rc_line, f"~ {patient['prijmeni']} {patient['jmeno']}", C_FUZZY
            if status == "not_found":
                return rc_line, "Nenalezen v Medicus", C_NONE
            return rc_line, "Medicus offline", C_OFFLINE
        if rc:
            return rc_line, "Ověřuji…", C_LOADING
        return rc_line, "", C_NONE

    def _redraw(self):
        """Repaint the visible slots and update the status bar."""
        c = self.canvas
        c.delete("all")
        n = len(self.page_order)
        group_of = self._group_of_pos()
        # Count by page index: after a deletion positions and indices differ.
        ocr_done = sum(1 for page_idx in self.page_order if page_idx in self.ocr_results)
        slot_h = PAD + self.THUMB_H + PAD

        # Background
        c.create_rectangle(0, 0, self.CANVAS_W, self.CANVAS_H, fill=BG, outline="")

        for col in range(COLS):
            pos = self.scroll + col
            if pos >= n:
                break

            page_idx = self.page_order[pos]
            x0 = col * (self.SLOT_W + BORDER_W)

            # Slot background, coloured by group
            slot_color = GROUP_COLORS[group_of[pos] % len(GROUP_COLORS)]
            c.create_rectangle(
                x0, 0, x0 + self.SLOT_W, slot_h,
                fill=slot_color, outline=""
            )

            # Thumbnail, or a placeholder while it is being rendered
            rot = self.rotations.get(page_idx, 0)
            photo = self._photo_cache.get((page_idx, rot))
            if photo:
                c.create_image(x0 + PAD, PAD, anchor="nw", image=photo)
            else:
                c.create_text(
                    x0 + self.SLOT_W // 2, PAD + self.THUMB_H // 2,
                    text=f"⏳\nstr. {pos + 1}",
                    fill=C_LOADING, font=("Consolas", 18), justify="center"
                )

            # Cursor frame
            if pos == self.cursor:
                c.create_rectangle(
                    x0 + 2, 2, x0 + self.SLOT_W - 2, slot_h - 2,
                    outline=C_CURSOR, width=5
                )

            # Info strip
            y_info = slot_h
            c.create_rectangle(
                x0, y_info, x0 + self.SLOT_W, y_info + INFO_H,
                fill=BG_INFO, outline=""
            )
            rc_line, pat_line, stat_color = self._describe_ocr(
                self.ocr_results.get(page_idx)
            )
            c.create_text(
                x0 + 8, y_info + 6,
                text=f"str. {pos + 1}/{n} (orig: {page_idx + 1})",
                anchor="nw", fill=C_DIM, font=("Consolas", 10)
            )
            c.create_text(
                x0 + 8, y_info + 26,
                text=rc_line,
                anchor="nw", fill=stat_color, font=("Consolas", 13, "bold")
            )
            c.create_text(
                x0 + 8, y_info + 52,
                text=pat_line,
                anchor="nw", fill=stat_color, font=("Consolas", 14, "bold")
            )

            # Separator to the right of this slot
            if col < COLS - 1:
                is_new = (pos + 1) in self.boundaries
                x_sep = x0 + self.SLOT_W
                c.create_rectangle(
                    x_sep, 0, x_sep + BORDER_W, self.CANVAS_H,
                    fill=C_BOUNDARY if is_new else C_SAME, outline=""
                )
                if is_new:
                    c.create_text(
                        x_sep + BORDER_W // 2, self.CANVAS_H // 2,
                        text="▼\nNOVÝ",
                        fill="white", font=("Consolas", 7, "bold"), justify="center"
                    )

        # Status bar at the top
        groups = self._get_groups()
        self.top_label.config(
            text=(
                f" str. {self.cursor + 1}/{n} │ "
                f"skupiny: {len(groups)} │ "
                f"OCR: {ocr_done}/{n} │ "
                f"{self.pdf_path.name}"
            )
        )
|
||
|
||
|
||
# ── Vstup ─────────────────────────────────────────────────────────────────────
|
||
|
||
def main():
    """Choose the input PDF and run the splitter window.

    The PDF comes from the first command-line argument, from
    PATH_TO_TESTFILE when TESTOVANI is set, or from a file dialog. The
    process exits with status 0 when the dialog is cancelled and with
    status 1 when the chosen file does not exist.
    """
    root = tk.Tk()
    root.withdraw()  # keep the empty root window hidden while a file is chosen

    cli_args = sys.argv[1:]
    if cli_args:
        pdf_path = Path(cli_args[0])
    elif TESTOVANI:
        pdf_path = Path(PATH_TO_TESTFILE)
    else:
        from tkinter import filedialog

        chosen = filedialog.askopenfilename(
            title="Vyber vstupní PDF",
            initialdir=str(_RICOH),
            filetypes=[("PDF soubory", "*.pdf")],
        )
        if not chosen:
            root.destroy()
            sys.exit(0)
        pdf_path = Path(chosen)

    if not pdf_path.exists():
        print(f"Soubor nenalezen: {pdf_path}")
        root.destroy()
        sys.exit(1)

    root.deiconify()
    app = SplitterUI(root, pdf_path)
    root.mainloop()
|
||
|
||
|
||
# Start the application only when the file is executed as a script.
if __name__ == "__main__":
    main()
|