diff --git a/Medevio/60 ScansProcessing/corrections.json b/Medevio/60 ScansProcessing/corrections.json index 14384ff..b9a8199 100644 --- a/Medevio/60 ScansProcessing/corrections.json +++ b/Medevio/60 ScansProcessing/corrections.json @@ -662,5 +662,13 @@ { "original": "Pages from 2026-04-30-08-09-11-9.pdf", "corrected": "0755225097 2026-04-29 Kalousová, Denisa [EKG] [bez hodnocení].pdf" + }, + { + "original": "445318078 2025-10-21 Kusáková, Jaroslava [LZ ortopedie] [gonartróza P kolene IV. st., TEP genus Bulovka 5/26, Durolane i.a.].pdf", + "corrected": "445318078 2025-10-21 Kusáková, Jaroslava [LZ ortopedie] [gonartróza P kolene IV. st., TEP genus Bulovka 526, Durolane i.a.].pdf" + }, + { + "original": "7952090443 2026-04-30 Kalousová, Eva [HolterTK] [ABPM 29h, denní SYS avg 133 DIA avg 92, noční SYS avg 114 DIA avg 72].pdf", + "corrected": "7952090443 2026-04-30 Kalousová, Eva [HolterTK] [ABPM 29h, nedostatečně kompenzovaná hypertenze, denní SYS avg 133 DIA avg 92, noční SYS avg 114 DIA avg 72].pdf" } ] \ No newline at end of file diff --git a/Medevio/70 DěleníSouboruPDF/debug_page3.jpg b/Medevio/70 DěleníSouboruPDF/debug_page3.jpg new file mode 100644 index 0000000..828343f Binary files /dev/null and b/Medevio/70 DěleníSouboruPDF/debug_page3.jpg differ diff --git a/Medevio/70 DěleníSouboruPDF/rozdelit_pdf.py b/Medevio/70 DěleníSouboruPDF/rozdelit_pdf.py new file mode 100644 index 0000000..f4bed95 --- /dev/null +++ b/Medevio/70 DěleníSouboruPDF/rozdelit_pdf.py @@ -0,0 +1,840 @@ +""" +rozdelit_pdf.py — Dělení vícestránkového PDF na skupiny pacientů. 
"""
rozdelit_pdf.py — split a multi-page scanned PDF into per-patient groups.

Usage:
    python rozdelit_pdf.py file.pdf

Numeric keypad (works with NumLock on or off):
    4 / Left        move the cursor left
    6 / Right       move the cursor right
    7 / Home        jump left by 4 pages
    9 / PgUp        jump right by 4 pages
    5 / Space       toggle a patient boundary before the current page
    1 / End         swap the current page with its left neighbour
    3 / PgDn        swap the current page with its right neighbour
    / (divide)      rotate the current page counter-clockwise
    * (multiply)    rotate the current page clockwise
    . / Del         drop the current page from the export
    Enter           export all groups into Split/
    Esc             quit
"""

import sys
import os
import io
import re
import json
import threading
from pathlib import Path
from typing import Optional

import tkinter as tk
from tkinter import messagebox

from PIL import Image, ImageTk
import fitz  # PyMuPDF

# ── Paths ─────────────────────────────────────────────────────────────────────

ROOT = Path(__file__).resolve().parent.parent  # .../Medevio/
sys.path.insert(0, str(ROOT))
from Knihovny.najdi_medicus import get_medicus_config
from Knihovny.najdi_dropbox import get_dropbox_root

_DROPBOX = Path(get_dropbox_root())
_RICOH = _DROPBOX / r"Ordinace\Dokumentace_ke_zpracování\Ricoh Fi-8040"
SPLIT_DIR = _RICOH / "Split"

# ── Env ───────────────────────────────────────────────────────────────────────


def _load_env(env_path: Optional[Path] = None) -> None:
    """Copy KEY=VALUE pairs from a .env file into ``os.environ``.

    Args:
        env_path: File to read; defaults to ``ROOT / ".env"``.

    A missing file is silently ignored. Blank lines, lines starting with
    ``#`` and lines without ``=`` are skipped. Values from the file overwrite
    variables that are already set in the environment.
    """
    path = ROOT / ".env" if env_path is None else Path(env_path)
    if not path.exists():
        return
    # utf-8-sig also accepts plain UTF-8 but drops a BOM, which would
    # otherwise become part of the first key.
    for raw_line in path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        key = key.strip()
        value = value.strip()
        # KEY="value" / KEY='value' — drop one pair of matching quotes.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        if key:  # "=value" has no usable name; os.environ[""] would raise
            os.environ[key] = value


_load_env()

# ── Regex ─────────────────────────────────────────────────────────────────────

TESTOVANI = False
PATH_TO_TESTFILE = r"U:\Dropbox\Ordinace\Dokumentace_ke_zpracování\Ricoh Fi-8040\2026-05-04-07-50-17 - Copy.pdf"

TESSERACT_PATH = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Birth number written with a slash: 710920/3893
RC_RE_SPLIT = re.compile(r"\b(\d{6})\s*/\s*(\d{3,4})\b")
# Birth number after a keyword (Tesseract misreads diacritics, so the ASCII
# spellings of the keywords are accepted)
RC_RE_KEYWORD = re.compile(
    r"(?:C\.?P\.?|R\.?C\.?|RC|ID|NAR)\s*[:\.]?\s*(\d{9,10})\b",
    re.IGNORECASE,
)
# Fallback: any stand-alone block of 9–10 digits.
RC_RE_PLAIN = re.compile(r"\b(\d{9,10})\b")

# Offsets that may be added to the month field of a birth number:
# 0 = men, 50 = women, 20 / 70 = the same two when the day's serial numbers
# ran out (allowed since 2004).
_RC_MONTH_OFFSETS = (0, 20, 50, 70)

# Digits that OCR tends to confuse with each other; one digit can map to
# several look-alikes.
_OCR_SIMILAR_DIGITS: dict[str, list[str]] = {
    "0": ["8", "6", "5"],
    "1": ["7", "6"],
    "2": [],
    "3": ["8"],
    "4": [],
    "5": ["6", "0"],
    "6": ["5", "0", "1"],
    "7": ["1"],
    "8": ["0", "3"],
    "9": [],
}


def _rc_valid(digits: str) -> bool:
    """Return True when *digits* looks like a birth number (YYMMDDxxx[x]).

    Only the shape is checked: 9 or 10 ASCII digits, a month field that is a
    real month after removing one of the allowed offsets, and a day in 1..31.
    Anything else — including strings with non-digit characters — is False.
    """
    if len(digits) not in (9, 10) or not (digits.isascii() and digits.isdigit()):
        return False
    month = int(digits[2:4])
    day = int(digits[4:6])
    month_ok = any(1 <= month - offset <= 12 for offset in _RC_MONTH_OFFSETS)
    return month_ok and 1 <= day <= 31


def _extract_rc(text: str) -> Optional[str]:
    """Pick the most plausible birth number out of OCR text.

    Order of preference:
      1. a ``dddddd/ddd[d]`` match that passes ``_rc_valid``,
      2. a number following a keyword (RC, ID, ...) that passes ``_rc_valid``,
      3. any plausible 9–10 digit block,
      4. the first slash-form match even if implausible — the caller can
         still repair a misread digit through fuzzy lookup.

    Returns the digits without the slash, or None when nothing was found.
    """
    slash_hits = [m.group(1) + m.group(2) for m in RC_RE_SPLIT.finditer(text)]
    for rc in slash_hits:
        if _rc_valid(rc):
            return rc
    for pattern in (RC_RE_KEYWORD, RC_RE_PLAIN):
        for m in pattern.finditer(text):
            if _rc_valid(m.group(1)):
                return m.group(1)
    return slash_hits[0] if slash_hits else None


def _rc_candidates(rc: str) -> list[str]:
    """Return every string one OCR-style edit away from *rc*.

    Edits tried: drop one character, insert a ``0`` at any position, replace
    one digit by a visually similar one. Only results of length 9 or 10 are
    kept, *rc* itself is excluded, and the list is sorted.
    """
    candidates: set[str] = set()
    for i, ch in enumerate(rc):
        candidates.add(rc[:i] + rc[i + 1:])  # one character removed
        for alt in _OCR_SIMILAR_DIGITS.get(ch, ()):
            candidates.add(rc[:i] + alt + rc[i + 1:])  # look-alike digit
    for i in range(len(rc) + 1):
        candidates.add(rc[:i] + "0" + rc[i:])  # a zero inserted
    candidates.discard(rc)
    return sorted(c for c in candidates if len(c) in (9, 10))


def _rc_checksum_ok(rc: str) -> bool:
    """Check the mod-11 control digit of a 10-digit birth number.

    Non-digit characters (such as the slash) are ignored. Nine-digit numbers
    carry no control digit and always yield False. The control digit must
    equal the first nine digits modulo 11; when that remainder is 10, the
    control digit is 0 (historic exception that plain ``% 11 == 0`` rejects).
    """
    digits = re.sub(r"\D", "", rc)
    if len(digits) != 10:
        return False
    remainder = int(digits[:9]) % 11
    expected = 0 if remainder == 10 else remainder
    return int(digits[9]) == expected


def _rc_candidates_level2(rc: str) -> list[str]:
    """Return candidates two edits away from *rc* that pass the checksum.

    Without the checksum filter the list would be too long to look up one by
    one, so only 10-digit candidates with a valid control digit are returned.
    Candidates already produced by ``_rc_candidates`` are excluded.
    """
    level1 = set(_rc_candidates(rc))
    level2: set[str] = set()
    for c in level1:
        level2.update(_rc_candidates(c))
    level2 -= level1
    level2.discard(rc)
    return sorted(c for c in level2 if len(c) in (9, 10) and _rc_checksum_ok(c))
# ── Medicus ───────────────────────────────────────────────────────────────────

# Characters that cannot appear in a Windows file name.
_FILENAME_FORBIDDEN = re.compile(r'[<>:"/\\|?*\x00-\x1f]')


def _verify_medicus(rc_digits: str) -> dict:
    """Look a birth number up in the Medicus database.

    Args:
        rc_digits: Birth number as digits only (no slash).

    Returns:
        A dict with a ``status`` key:
          * ``"ok"``        — exact match; ``patient`` holds the record,
          * ``"fuzzy"``     — a one- or two-edit variant matched;
                              ``rc_corrected`` holds the matching number,
          * ``"not_found"`` — the database answered but knows no such patient,
          * ``"offline"``   — any failure (driver missing, connection refused,
                              query error); ``error`` holds the message.

    Never raises. The login defaults to the previously hard-coded account and
    can be overridden with MEDICUS_DB_USER / MEDICUS_DB_PASSWORD (for example
    from the .env file).
    """
    try:
        import fdb

        cfg = get_medicus_config()
        con = fdb.connect(
            dsn=cfg.dsn,
            user=os.environ.get("MEDICUS_DB_USER", "SYSDBA"),
            password=os.environ.get("MEDICUS_DB_PASSWORD", "masterkey"),
            charset="win1250",
        )
        try:
            cur = con.cursor()

            def _lookup(rc: str) -> Optional[dict]:
                cur.execute(
                    "SELECT IDPAC, PRIJMENI, JMENO, RODCIS FROM KAR "
                    "WHERE REPLACE(RODCIS, '/', '') = ?", (rc,)
                )
                row = cur.fetchone()
                if not row:
                    return None
                # A NULL column must not turn a found patient into "offline".
                return {
                    "idpac": row[0],
                    "prijmeni": (row[1] or "").strip(),
                    "jmeno": (row[2] or "").strip(),
                    "rodcis": (row[3] or "").strip(),
                }

            p = _lookup(rc_digits)
            if p:
                return {"status": "ok", "patient": p}
            # Single-edit variants first; the (larger) two-edit list is only
            # generated when none of them matched.
            for make_candidates in (_rc_candidates, _rc_candidates_level2):
                for c in make_candidates(rc_digits):
                    p = _lookup(c)
                    if p:
                        return {"status": "fuzzy", "rc_corrected": c, "patient": p}
            return {"status": "not_found", "patient": None}
        finally:
            con.close()
    except Exception as e:
        return {"status": "offline", "patient": None, "error": str(e)}


# ── Output file name ──────────────────────────────────────────────────────────


def _format_filename(group_idx: int, medicus: Optional[dict]) -> str:
    """Build the output file name for one exported group.

    Args:
        group_idx: Sequence number of the group, rendered as three digits.
        medicus: Result of ``_verify_medicus`` for the group, or None.

    Returns:
        ``"<rc> <surname>, <name> split_NNN.pdf"`` when a patient record is
        present, otherwise ``"split_NNN.pdf"``. Characters that are illegal in
        Windows file names are replaced with ``_``.
    """
    p = medicus.get("patient") if medicus else None
    if p:
        rc = re.sub(r"\D", "", p["rodcis"])
        stem = _FILENAME_FORBIDDEN.sub("_", f"{rc} {p['prijmeni']}, {p['jmeno']}")
        return f"{stem} split_{group_idx:03d}.pdf"
    return f"split_{group_idx:03d}.pdf"
# ── OCR worker (background) ───────────────────────────────────────────────────

# PyMuPDF objects must not be used from several threads at once. Both workers
# below render pages of the same document, so all their access to it goes
# through this lock.
_PDF_LOCK = threading.Lock()


class OcrWorker:
    """
    OCRs pages in a background thread: Tesseract → Claude Vision (fallback)
    → Medicus lookup. Results are cached in a JSON file so a later run skips
    pages that are already done.
    """

    def __init__(self, doc: fitz.Document, cache_path: Path, on_page_done):
        self.doc = doc
        self.cache_path = cache_path
        self.on_page_done = on_page_done  # callback(page_idx: int)
        self.results: dict[int, dict] = {}
        self._stop = threading.Event()
        self._lock = threading.Lock()
        self._load_cache()

    def _load_cache(self):
        """Load earlier results; an unreadable cache is reported and ignored."""
        if self.cache_path.exists():
            try:
                data = json.loads(self.cache_path.read_text(encoding="utf-8"))
                # JSON object keys are strings; page indexes are ints.
                self.results = {int(k): v for k, v in data.items()}
                print(f"[OCR cache] načteno {len(self.results)} stránek z {self.cache_path.name}")
            except Exception as e:
                print(f"[OCR cache] chyba čtení: {e}")

    def _save_cache(self):
        """Write all results to the cache file atomically.

        The worker is a daemon thread and can be killed at any moment when
        the program exits. Writing to a temporary file and renaming it means
        an interrupted write cannot truncate the existing cache.
        """
        tmp_path = self.cache_path.with_name(self.cache_path.name + ".tmp")
        with self._lock:
            tmp_path.write_text(
                json.dumps(self.results, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
            tmp_path.replace(self.cache_path)

    def start(self):
        """Start processing in a daemon thread."""
        t = threading.Thread(target=self._run, daemon=True)
        t.start()

    def stop(self):
        """Ask the worker to stop before the next page."""
        self._stop.set()

    def _run(self):
        """Thread body: process every page that has no cached result."""
        import pytesseract
        pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH

        with _PDF_LOCK:
            n = len(self.doc)
        for i in range(n):
            if self._stop.is_set():
                break
            cached = self.results.get(i)
            if cached is not None:
                # Cache hit. An "offline" verdict only says the database was
                # unreachable at that time, so the lookup is retried.
                if self._reverify_offline(cached):
                    self._save_cache()
                    self.on_page_done(i)
                continue

            self.results[i] = self._process_page(i, pytesseract)
            self._save_cache()
            self.on_page_done(i)

    def _reverify_offline(self, cached: dict) -> bool:
        """Retry the Medicus lookup for a cached page that was stored offline.

        Returns True when the cached entry was updated with a real answer.
        """
        rc = cached.get("rc")
        if not rc or (cached.get("medicus") or {}).get("status") != "offline":
            return False
        medicus = _verify_medicus(rc)
        if medicus.get("status") == "offline":
            return False
        cached["medicus"] = medicus
        return True

    def _render_page(self, page_idx: int) -> Image.Image:
        """Render one page to an RGB image at 2× scale (144 DPI, enough for OCR)."""
        with _PDF_LOCK:
            page = self.doc[page_idx]
            pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0), colorspace=fitz.csRGB)
            return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    def _process_page(self, i: int, pytesseract) -> dict:
        """OCR one page and verify the birth number; returns the result dict."""
        img = self._render_page(i)

        # 1. Tesseract
        rc = None
        tess_text = None
        try:
            tess_text = pytesseract.image_to_string(img, lang="ces")
            rc = _extract_rc(tess_text)
        except Exception as e:
            print(f"[OCR str.{i+1}] Tesseract: {e}")

        # 2. Medicus — first attempt
        medicus = _verify_medicus(rc) if rc else None

        # 3. Claude Vision — when Tesseract found no number, or found one
        #    that Medicus does not know
        claude_raw = None
        if not rc or (medicus and medicus.get("status") == "not_found"):
            try:
                rc_claude, claude_raw = self._claude_rc(img)
                if rc_claude:
                    medicus_claude = _verify_medicus(rc_claude)
                    if medicus_claude.get("status") in ("ok", "fuzzy"):
                        print(f"[OCR str.{i+1}] Claude opravil RČ: {rc} → {rc_claude}")
                        rc, medicus = rc_claude, medicus_claude
                    elif not rc:
                        # Unconfirmed, but better than having no number.
                        rc, medicus = rc_claude, medicus_claude
            except Exception as e:
                print(f"[OCR str.{i+1}] Claude: {e}")

        return {
            "rc": rc,
            "medicus": medicus,
            "tesseract_text": tess_text,
            "claude_raw": claude_raw,
        }

    def _claude_rc(self, img: Image.Image) -> tuple[Optional[str], Optional[str]]:
        """Ask Claude Vision for the birth number on the page image.

        Returns ``(digits_or_None, raw_model_reply)``. The reply is expected
        to be JSON; if it cannot be parsed, the digits are None.
        """
        import anthropic, base64

        buf = io.BytesIO()
        img.save(buf, format="JPEG", quality=80)
        b64 = base64.standard_b64encode(buf.getvalue()).decode()

        client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
        resp = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=100,
            messages=[{"role": "user", "content": [
                {"type": "image", "source": {"type": "base64", "media_type": "image/jpeg", "data": b64}},
                {"type": "text", "text": (
                    "Najdi rodné číslo na tomto naskenovaném dokumentu. "
                    "Vrať JSON: {\"rodne_cislo\": \"XXXXXXXXXX\"} nebo {\"rodne_cislo\": null}. "
                    "Jen JSON, nic jiného."
                )},
            ]}],
        )
        raw = resp.content[0].text.strip()
        # Strip a Markdown code fence if the model wrapped the JSON in one.
        raw = re.sub(r"^```\w*\n?", "", raw).rstrip("`").strip()
        try:
            rc_raw = json.loads(raw).get("rodne_cislo") or ""
            return re.sub(r"\D", "", rc_raw) or None, raw
        except Exception:
            return None, raw


# ── Thumbnail worker (background) ─────────────────────────────────────────────


class ThumbnailWorker:
    """Renders PDF pages to PIL images in a background thread."""

    def __init__(self, doc: fitz.Document, thumb_w: int, thumb_h: int, on_thumb_done):
        self.doc = doc
        self.thumb_w = thumb_w
        self.thumb_h = thumb_h
        self.on_thumb_done = on_thumb_done  # callback(page_idx: int)
        self._cache: dict[int, Image.Image] = {}
        self._lock = threading.Lock()
        t = threading.Thread(target=self._run, daemon=True)
        t.start()

    def get(self, page_idx: int) -> Optional[Image.Image]:
        """Return the rendered thumbnail, or None if it is not ready yet."""
        with self._lock:
            return self._cache.get(page_idx)

    def _run(self):
        """Thread body: render every page, fitted and centred on a fixed canvas."""
        with _PDF_LOCK:
            n = len(self.doc)
        for i in range(n):
            with _PDF_LOCK:
                page = self.doc[i]
                rect = page.rect
                scale = min(self.thumb_w / rect.width, self.thumb_h / rect.height)
                pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), colorspace=fitz.csRGB)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            # Frame the page on a canvas of fixed size.
            canvas = Image.new("RGB", (self.thumb_w, self.thumb_h), (38, 38, 38))
            x = (self.thumb_w - img.width) // 2
            y = (self.thumb_h - img.height) // 2
            canvas.paste(img, (x, y))
            with self._lock:
                self._cache[i] = canvas
            self.on_thumb_done(i)


# ── Colours and dimensions ────────────────────────────────────────────────────

COLS = 4
BORDER_W = 16  # width of the separator between slots
PAD = 8        # gap between the thumbnail and the slot edge
INFO_H = 108   # height of the info strip under the thumbnail
TOP_H = 44     # height of the status bar at the top
BOT_H = 44     # height of the key hints at the bottom

# Slot and thumbnail sizes are computed in SplitterUI.__init__ from the
# screen resolution.

BG = "#1e1e1e"
BG_SLOT = "#262626"
BG_INFO = "#181818"
C_CURSOR = "#4da6ff"
C_BOUNDARY = "#cc3333"
C_SAME = "#3a3a3a"
C_OK = "#4caf50"
C_FUZZY = "#ff9800"
C_NONE = "#f44336"
C_OFFLINE = "#888888"
C_LOADING = "#555555"
C_TEXT = "#dddddd"
C_DIM = "#666666"

GROUP_COLORS = [
    "#1b2a3a", "#2a1b3a", "#1b3a2a", "#3a2a1b",
    "#2a3a1b", "#1b2a2a", "#3a1b2a", "#2a2a1b",
]

# ── Main UI ───────────────────────────────────────────────────────────────────


class SplitterUI:
    """Keyboard-driven window for splitting a scanned PDF into patient groups.

    Shows COLS page thumbnails side by side with the OCR result under each.
    The user moves a cursor, toggles group boundaries, reorders, rotates or
    drops pages, and finally exports every group as its own PDF.
    """

    def __init__(self, root: "tk.Tk", pdf_path: Path):
        self.root = root
        self.pdf_path = pdf_path
        self.doc = fitz.open(str(pdf_path))
        n = len(self.doc)

        # State
        self.page_order: list[int] = list(range(n))
        self.boundaries: set[int] = {0}  # positions (in page_order) that start a new group
        self.cursor: int = 0
        self.scroll: int = 0  # position shown in the leftmost slot

        # Caches
        self.ocr_results: dict[int, dict] = {}
        self._photo_cache: dict[tuple, ImageTk.PhotoImage] = {}  # (page_idx, rot) → photo
        self.rotations: dict[int, int] = {}  # page_idx → degrees (0/90/180/270)

        # Geometry, derived from the screen size
        sw = root.winfo_screenwidth()
        sh = root.winfo_screenheight()
        self.SLOT_W = (sw - (COLS - 1) * BORDER_W) // COLS
        self.THUMB_W = self.SLOT_W - 2 * PAD
        self.THUMB_H = int(self.THUMB_W * 842 / 595)  # A4 aspect ratio
        self.CANVAS_W = COLS * self.SLOT_W + (COLS - 1) * BORDER_W
        self.CANVAS_H = PAD + self.THUMB_H + PAD + INFO_H
        win_h = min(TOP_H + self.CANVAS_H + BOT_H, sh - 60)

        root.title(f"PDF Dělení — {pdf_path.name}")
        root.configure(bg=BG)
        root.geometry(f"{self.CANVAS_W}x{win_h}+0+0")

        self._build_ui()
        self._start_workers()

    # ── Building the UI ───────────────────────────────────────────────────────

    def _build_ui(self):
        """Create the status bar, the canvas and the hint bar; bind the keyboard."""
        self.top_label = tk.Label(
            self.root, bg=BG, fg=C_TEXT,
            font=("Consolas", 13), anchor="w", padx=12
        )
        self.top_label.pack(fill="x", side="top", ipady=4)

        self.canvas = tk.Canvas(
            self.root, width=self.CANVAS_W, height=self.CANVAS_H,
            bg=BG, highlightthickness=0
        )
        self.canvas.pack(fill="both", expand=True)

        hints = (
            "4/6: navigace ←/→ 7/9: skok ×4 "
            "5/Space: hranice pacienta "
            "1/3: přesuň stránku "
            "/: otočit ↺CCW *: otočit ↻CW "
            "Del/.: smaž stránku "
            "Enter: exportuj Esc: konec"
        )
        self.bot_label = tk.Label(
            self.root, text=hints, bg=BG, fg=C_DIM,
            font=("Consolas", 11), anchor="center"
        )
        self.bot_label.pack(fill="x", side="bottom", ipady=6)

        # An empty event sequence is rejected by Tk; every key press has to
        # reach _on_key, which is what "<Key>" selects.
        self.root.bind("<Key>", self._on_key)
        self.root.focus_set()
        self._redraw()

    # ── Starting the workers ──────────────────────────────────────────────────

    def _start_workers(self):
        """Start background OCR and thumbnail rendering.

        Worker callbacks are handed to ``root.after`` so that the UI is only
        touched from the Tk event loop.
        """
        cache_path = self.pdf_path.parent / (self.pdf_path.stem + "_ocr_cache.json")

        self.ocr_worker = OcrWorker(
            self.doc, cache_path,
            on_page_done=lambda idx: self.root.after(0, self._on_ocr_done, idx),
        )
        # Take over whatever the worker loaded from its cache.
        self.ocr_results.update(self.ocr_worker.results)
        self._auto_detect_boundaries()
        self.ocr_worker.start()

        self.thumb_worker = ThumbnailWorker(
            self.doc, self.THUMB_W, self.THUMB_H,
            on_thumb_done=lambda idx: self.root.after(0, self._on_thumb_done, idx),
        )

    def _auto_detect_boundaries(self):
        """Add a boundary wherever two consecutive confirmed pages belong to
        different patients.

        Only pages whose number Medicus confirmed ("ok" or "fuzzy") take part;
        unconfirmed pages in between are skipped. Patients are compared by
        their Medicus id, so a page whose number was OCR-misread but corrected
        by fuzzy lookup does not start a new group for the same patient.
        """
        prev_key = None
        for pos, page_idx in enumerate(self.page_order):
            r = self.ocr_results.get(page_idx)
            if not r:
                continue
            rc = r.get("rc")
            med = r.get("medicus") or {}
            if not rc or med.get("status") not in ("ok", "fuzzy"):
                continue
            key = (med.get("patient") or {}).get("idpac")
            if key is None:
                key = rc  # no id in the record — fall back to the raw number
            if prev_key is not None and key != prev_key:
                self.boundaries.add(pos)
            prev_key = key

    # ── Worker callbacks ──────────────────────────────────────────────────────

    def _on_ocr_done(self, page_idx: int):
        """Store a finished OCR result and refresh the view."""
        self.ocr_results[page_idx] = self.ocr_worker.results[page_idx]
        # Auto-detection runs once every page still in the document has a
        # result and the user has not placed any boundary yet.
        if (self.boundaries == {0}
                and all(p in self.ocr_results for p in self.page_order)):
            self._auto_detect_boundaries()
        self._redraw()

    def _on_thumb_done(self, page_idx: int):
        """Convert a finished thumbnail to a Tk image and refresh the view."""
        self._rebuild_photo(page_idx)
        self._redraw()

    def _rebuild_photo(self, page_idx: int):
        """Make sure a Tk image exists for the page at its current rotation."""
        pil = self.thumb_worker.get(page_idx)
        if pil is None:
            return
        rot = self.rotations.get(page_idx, 0)
        key = (page_idx, rot)
        if key in self._photo_cache:
            return
        size = (self.THUMB_W, self.THUMB_H)
        img = pil.rotate(rot, expand=True)
        if img.size != size:
            # A quarter turn swaps width and height. Fit the image into the
            # slot and centre it instead of stretching it to the slot shape.
            img.thumbnail(size, Image.LANCZOS)
            framed = Image.new("RGB", size, (38, 38, 38))
            framed.paste(
                img,
                ((size[0] - img.width) // 2, (size[1] - img.height) // 2),
            )
            img = framed
        self._photo_cache[key] = ImageTk.PhotoImage(img)

    # ── Keyboard ──────────────────────────────────────────────────────────────

    def _on_key(self, event):
        """Translate a key press into an action and run it."""
        ks = event.keysym
        kc = event.keycode
        # Numpad keycodes (Windows): 96=KP0 97=KP1 ... 105=KP9 110=KP.
        # NumLock ON  → keysym='1'..'9', keycode=97..105
        # NumLock OFF → keysym=Left/Clear/Right/Home/Up/Prior/Down
        numpad = {
            100: "num4", 101: "num5", 102: "num6",
            103: "num7", 105: "num9",
            97: "num1", 99: "num3", 110: "numdot",
            111: "numslash", 106: "numstar",
        }
        action = numpad.get(kc) or {
            "Left": "num4", "Right": "num6",
            "Home": "num7", "Prior": "num9",
            "Clear": "num5", "End": "num1",
            "Next": "num3", "Delete": "numdot",
            "space": "num5",
            "KP_Divide": "numslash", "KP_Multiply": "numstar",
            "slash": "numslash", "asterisk": "numstar",
        }.get(ks)

        handlers = {
            "num4": lambda: self._move_cursor(-1),
            "num6": lambda: self._move_cursor(1),
            "num7": lambda: self._move_cursor(-COLS),
            "num9": lambda: self._move_cursor(COLS),
            "num5": self._toggle_boundary,
            "num1": lambda: self._move_page(-1),
            "num3": lambda: self._move_page(1),
            "numslash": lambda: self._rotate_page(90),   # CCW
            "numstar": lambda: self._rotate_page(-90),   # CW
            "numdot": self._delete_page,
        }
        handler = handlers.get(action)
        if handler is not None:
            handler()
        elif ks in ("Return", "KP_Enter"):
            self._export()
        elif ks == "Escape":
            self.root.quit()

    # ── Movement and page manipulation ────────────────────────────────────────

    def _scroll_to_cursor(self):
        """Shift the visible window so that the cursor slot is on screen."""
        if self.cursor < self.scroll:
            self.scroll = self.cursor
        elif self.cursor >= self.scroll + COLS:
            self.scroll = self.cursor - COLS + 1

    def _move_cursor(self, delta: int):
        """Move the cursor by *delta* positions, clamped to the page range."""
        n = len(self.page_order)
        self.cursor = max(0, min(n - 1, self.cursor + delta))
        self._scroll_to_cursor()
        self._redraw()

    def _toggle_boundary(self):
        """Add or remove a group boundary before the cursor page.

        Position 0 always starts a group and cannot be toggled.
        """
        pos = self.cursor
        if pos == 0:
            return
        if pos in self.boundaries:
            self.boundaries.discard(pos)
        else:
            self.boundaries.add(pos)
        self._redraw()

    def _rotate_page(self, delta: int):
        """Rotate the cursor page by *delta* degrees (positive = counter-clockwise)."""
        page_idx = self.page_order[self.cursor]
        rot = (self.rotations.get(page_idx, 0) + delta) % 360
        self.rotations[page_idx] = rot
        self._rebuild_photo(page_idx)
        self._redraw()

    @staticmethod
    def _boundaries_after_delete(boundaries, pos: int, page_count: int) -> set:
        """Return the boundary set after removing the page at *pos*.

        Args:
            boundaries: Current boundary positions.
            pos: Position of the page being removed.
            page_count: Number of pages before the removal.

        Boundaries to the right shift down by one. If the removed page was
        the first page of a group that has more pages, the next page inherits
        the boundary; otherwise the rest of that group would silently merge
        into the previous patient's group.
        """
        successor_inherits = (
            pos in boundaries
            and pos + 1 < page_count
            and (pos + 1) not in boundaries
        )
        shifted = {b - 1 if b > pos else b for b in boundaries if b != pos}
        if successor_inherits:
            shifted.add(pos)
        shifted.add(0)  # the first page always starts a group
        return shifted

    def _delete_page(self):
        """Drop the cursor page from the export (the last remaining page is kept)."""
        n = len(self.page_order)
        if n <= 1:
            return
        pos = self.cursor
        self.page_order.pop(pos)
        self.boundaries = self._boundaries_after_delete(self.boundaries, pos, n)
        self.cursor = min(pos, len(self.page_order) - 1)
        self._scroll_to_cursor()
        self._redraw()

    def _move_page(self, delta: int):
        """Swap the cursor page with its neighbour *delta* positions away.

        Boundaries are positions, not pages, so they stay where they are and
        the page may change group. The cursor follows the moved page.
        """
        n = len(self.page_order)
        pos = self.cursor
        new_pos = pos + delta
        if new_pos < 0 or new_pos >= n:
            return
        self.page_order[pos], self.page_order[new_pos] = (
            self.page_order[new_pos], self.page_order[pos]
        )
        self.cursor = new_pos
        self._scroll_to_cursor()
        self._redraw()

    # ── Groups ────────────────────────────────────────────────────────────────

    def _group_of_pos(self) -> list[int]:
        """Return the zero-based group index for every position in page_order."""
        result = []
        gi = 0
        for pos in range(len(self.page_order)):
            if pos in self.boundaries and pos > 0:
                gi += 1
            result.append(gi)
        return result

    def _get_groups(self) -> list[list[int]]:
        """Return the groups, each a list of page indexes in display order."""
        groups: list[list[int]] = []
        current: list[int] = []
        for pos, page_idx in enumerate(self.page_order):
            if pos in self.boundaries and current:
                groups.append(current)
                current = []
            current.append(page_idx)
        if current:
            groups.append(current)
        return groups

    def _best_medicus(self, pages: list[int]) -> Optional[dict]:
        """Return the best Medicus result among *pages*.

        An exact match ("ok") on any page wins over a fuzzy one; None when no
        page of the group was confirmed.
        """
        for status in ("ok", "fuzzy"):
            for p in pages:
                r = self.ocr_results.get(p)
                if r and r.get("medicus") and r["medicus"].get("status") == status:
                    return r["medicus"]
        return None

    # ── Export ────────────────────────────────────────────────────────────────

    def _export(self):
        """Write every group to its own PDF in SPLIT_DIR and report the result."""
        groups = self._get_groups()
        SPLIT_DIR.mkdir(parents=True, exist_ok=True)
        names = []
        for i, pages in enumerate(groups, 1):
            med = self._best_medicus(pages)
            name = _format_filename(i, med)
            out_path = SPLIT_DIR / name
            out_doc = fitz.open()
            try:
                for page_idx in pages:
                    out_doc.insert_pdf(self.doc, from_page=page_idx, to_page=page_idx)
                    rot = self.rotations.get(page_idx, 0)
                    if rot:
                        # Thumbnail rotation is counter-clockwise, the PDF
                        # page rotation is clockwise — hence the minus.
                        out_doc[-1].set_rotation((out_doc[-1].rotation - rot) % 360)
                out_doc.save(str(out_path))
            finally:
                out_doc.close()
            names.append(f"{name} ({len(pages)} str.)")
            print(f" Exportováno: {name}")
        messagebox.showinfo(
            "Export hotov",
            f"Exportováno {len(groups)} skupin do:\n{SPLIT_DIR}\n\n" + "\n".join(names),
        )

    # ── Drawing ───────────────────────────────────────────────────────────────

    def _info_lines(self, page_idx: int):
        """Return (rc_line, patient_line, colour) for the info strip of a page."""
        result = self.ocr_results.get(page_idx)
        if result is None:
            return "⏳ OCR probíhá…", "", C_LOADING

        rc = result.get("rc")
        rc_line = f"RČ: {rc}" if rc else "RČ: nenalezeno"
        med = result.get("medicus")
        if med:
            s = med["status"]
            p = med.get("patient")
            if s == "ok" and p:
                return rc_line, f"{p['prijmeni']} {p['jmeno']}", C_OK
            if s == "fuzzy" and p:
                return rc_line, f"~ {p['prijmeni']} {p['jmeno']}", C_FUZZY
            if s == "not_found":
                return rc_line, "Nenalezen v Medicus", C_NONE
            return rc_line, "Medicus offline", C_OFFLINE
        if rc:
            return rc_line, "Ověřuji…", C_LOADING
        return rc_line, "", C_NONE

    def _redraw(self):
        """Repaint the canvas and the status bar from the current state."""
        c = self.canvas
        c.delete("all")
        n = len(self.page_order)
        group_of = self._group_of_pos()
        # Count pages still in the document; after a deletion the page
        # indexes no longer coincide with range(n).
        ocr_done = sum(1 for p in self.page_order if p in self.ocr_results)
        slot_h = PAD + self.THUMB_H + PAD

        # Background
        c.create_rectangle(0, 0, self.CANVAS_W, self.CANVAS_H, fill=BG, outline="")

        for col in range(COLS):
            pos = self.scroll + col
            if pos >= n:
                break

            page_idx = self.page_order[pos]
            gi = group_of[pos]
            x0 = col * (self.SLOT_W + BORDER_W)

            # Slot background, tinted by group
            slot_color = GROUP_COLORS[gi % len(GROUP_COLORS)]
            c.create_rectangle(
                x0, 0, x0 + self.SLOT_W, slot_h,
                fill=slot_color, outline=""
            )

            # Thumbnail, or a placeholder while it is being rendered
            rot = self.rotations.get(page_idx, 0)
            photo = self._photo_cache.get((page_idx, rot))
            if photo:
                c.create_image(x0 + PAD, PAD, anchor="nw", image=photo)
            else:
                c.create_text(
                    x0 + self.SLOT_W // 2, PAD + self.THUMB_H // 2,
                    text=f"⏳\nstr. {pos + 1}",
                    fill=C_LOADING, font=("Consolas", 18), justify="center"
                )

            # Cursor frame
            if pos == self.cursor:
                c.create_rectangle(
                    x0 + 2, 2, x0 + self.SLOT_W - 2, slot_h - 2,
                    outline=C_CURSOR, width=5
                )

            # Info strip
            y_info = slot_h
            c.create_rectangle(
                x0, y_info, x0 + self.SLOT_W, y_info + INFO_H,
                fill=BG_INFO, outline=""
            )
            rc_line, pat_line, stat_color = self._info_lines(page_idx)

            c.create_text(
                x0 + 8, y_info + 6,
                text=f"str. {pos + 1}/{n} (orig: {page_idx + 1})",
                anchor="nw", fill=C_DIM, font=("Consolas", 10)
            )
            c.create_text(
                x0 + 8, y_info + 26,
                text=rc_line,
                anchor="nw", fill=stat_color, font=("Consolas", 13, "bold")
            )
            c.create_text(
                x0 + 8, y_info + 52,
                text=pat_line,
                anchor="nw", fill=stat_color, font=("Consolas", 14, "bold")
            )

            # Separator to the right of this slot; red when the next page
            # starts a new group
            if col < COLS - 1:
                next_pos = pos + 1
                is_new = next_pos in self.boundaries
                x_sep = x0 + self.SLOT_W
                c.create_rectangle(
                    x_sep, 0, x_sep + BORDER_W, self.CANVAS_H,
                    fill=C_BOUNDARY if is_new else C_SAME, outline=""
                )
                if is_new:
                    c.create_text(
                        x_sep + BORDER_W // 2, self.CANVAS_H // 2,
                        text="▼\nNOVÝ",
                        fill="white", font=("Consolas", 7, "bold"), justify="center"
                    )

        # Status bar at the top
        groups = self._get_groups()
        self.top_label.config(
            text=(
                f" str. {self.cursor + 1}/{n} │ "
                f"skupiny: {len(groups)} │ "
                f"OCR: {ocr_done}/{n} │ "
                f"{self.pdf_path.name}"
            )
        )


# ── Entry point ───────────────────────────────────────────────────────────────


def main():
    """Pick the input PDF (argument, test file or file dialog) and run the UI."""
    root = tk.Tk()
    root.withdraw()  # keep the empty window hidden while a file is chosen

    if len(sys.argv) >= 2:
        pdf_path = Path(sys.argv[1])
    elif TESTOVANI:
        pdf_path = Path(PATH_TO_TESTFILE)
    else:
        from tkinter import filedialog
        chosen = filedialog.askopenfilename(
            title="Vyber vstupní PDF",
            initialdir=str(_RICOH),
            filetypes=[("PDF soubory", "*.pdf")],
        )
        if not chosen:
            root.destroy()
            sys.exit(0)
        pdf_path = Path(chosen)

    if not pdf_path.exists():
        print(f"Soubor nenalezen: {pdf_path}")
        root.destroy()
        sys.exit(1)

    root.deiconify()
    app = SplitterUI(root, pdf_path)
    root.mainloop()


if __name__ == "__main__":
    main()