z230
This commit is contained in:
Binary file not shown.
|
After Width: | Height: | Size: 235 KiB |
@@ -0,0 +1,840 @@
|
||||
"""
rozdelit_pdf.py — Split a multi-page scanned PDF into per-patient groups.

Usage:
    python rozdelit_pdf.py file.pdf

    Without an argument a file-open dialog is shown.

Keys (numeric keypad; the second name is the key sent with NumLock off):
    4 / Left        cursor ←
    6 / Right       cursor →
    7 / Home        jump ← by 4 pages
    9 / PgUp        jump → by 4 pages
    5 / Space       toggle the patient boundary before the current page
    1 / End         move the page to the left (swap)
    3 / PgDn        move the page to the right (swap)
    /               rotate the page counter-clockwise
    *               rotate the page clockwise
    . / Del         delete the page
    Enter           export all groups to Split/
    Esc             quit
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import io
|
||||
import re
|
||||
import json
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import tkinter as tk
|
||||
from tkinter import messagebox
|
||||
|
||||
from PIL import Image, ImageTk
|
||||
import fitz # PyMuPDF
|
||||
|
||||
# ── Paths ─────────────────────────────────────────────────────────────────────

# Two directory levels above this file; put on sys.path so that the
# project-local ``Knihovny`` package imported below can be resolved.
ROOT = Path(__file__).resolve().parent.parent  # .../Medevio/
sys.path.insert(0, str(ROOT))
from Knihovny.najdi_medicus import get_medicus_config
from Knihovny.najdi_dropbox import get_dropbox_root

# Scanner folder inside the Dropbox root; exported groups are written to its
# "Split" sub-folder.
_DROPBOX = Path(get_dropbox_root())
_RICOH = _DROPBOX / r"Ordinace\Dokumentace_ke_zpracování\Ricoh Fi-8040"
SPLIT_DIR = _RICOH / "Split"
|
||||
|
||||
# ── Env ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
def _load_env():
    """Copy ``KEY=VALUE`` pairs from ``ROOT/.env`` into ``os.environ``.

    Lines are stripped first; lines starting with ``#`` or lacking ``=`` are
    skipped. Only the first ``=`` separates key from value, and both sides are
    stripped. Existing environment variables are overwritten. A missing file
    is not an error.
    """
    dotenv_file = ROOT / ".env"
    if not dotenv_file.exists():
        return
    for raw_line in dotenv_file.read_text(encoding="utf-8").splitlines():
        entry = raw_line.strip()
        if entry.startswith("#") or "=" not in entry:
            continue
        name, _, value = entry.partition("=")
        os.environ[name.strip()] = value.strip()
|
||||
|
||||
_load_env()
|
||||
|
||||
# ── Regex and test/OCR settings ───────────────────────────────────────────────

# When True and no file is given on the command line, main() opens
# PATH_TO_TESTFILE instead of showing the file dialog.
TESTOVANI = False
PATH_TO_TESTFILE = r"U:\Dropbox\Ordinace\Dokumentace_ke_zpracování\Ricoh Fi-8040\2026-05-04-07-50-17 - Copy.pdf"

# Location of the Tesseract executable handed to pytesseract.
TESSERACT_PATH = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Birth number (RČ) written with a slash: 710920/3893
RC_RE_SPLIT = re.compile(r"\b(\d{6})\s*/\s*(\d{3,4})\b")
# Birth number following a keyword (Tesseract misreads diacritics, so ASCII
# variants of the keywords are tolerated)
RC_RE_KEYWORD = re.compile(
    r"(?:C\.?P\.?|R\.?C\.?|RC|ID|NAR)\s*[:\.]?\s*(\d{9,10})\b",
    re.IGNORECASE,
)
# Fallback: any 9-10 digit block
RC_RE_PLAIN = re.compile(r"\b(\d{9,10})\b")
|
||||
|
||||
def _rc_valid(digits: str) -> bool:
|
||||
if len(digits) not in (9, 10):
|
||||
return False
|
||||
month = int(digits[2:4])
|
||||
day = int(digits[4:6])
|
||||
return (month in range(1, 13) or month in range(51, 63)) and 1 <= day <= 31
|
||||
|
||||
def _extract_rc(text: str) -> Optional[str]:
    """Pull a birth number (RČ) out of OCR text, digits only.

    Strategies, in order:

    1. ``NNNNNN/NNN[N]`` slash form. The first match that passes
       ``_rc_valid`` wins, so an earlier slash-separated number such as a file
       reference does not hide the real one. If none passes, the first slash
       match is still returned (an OCR-damaged RČ can be repaired later by the
       fuzzy database lookup).
    2. A 9-10 digit block following a keyword, validated with ``_rc_valid``.
    3. Any 9-10 digit block, validated with ``_rc_valid``.

    Returns None when nothing matches.
    """
    # 1. slash form
    first_slash = None
    for m in RC_RE_SPLIT.finditer(text):
        digits = m.group(1) + m.group(2)
        if _rc_valid(digits):
            return digits
        if first_slash is None:
            first_slash = digits
    if first_slash is not None:
        return first_slash
    # 2. keyword + number, 3. plain fallback
    for pattern in (RC_RE_KEYWORD, RC_RE_PLAIN):
        for m in pattern.finditer(text):
            if _rc_valid(m.group(1)):
                return m.group(1)
    return None
|
||||
|
||||
def _rc_candidates(rc: str) -> list[str]:
|
||||
# Vizuálně podobné číslice při OCR — každá číslice může být zaměněna za více variant
|
||||
similar: dict[str, list[str]] = {
|
||||
"0": ["8", "6", "5"],
|
||||
"1": ["7", "6"],
|
||||
"2": [],
|
||||
"3": ["8"],
|
||||
"4": [],
|
||||
"5": ["6", "0"],
|
||||
"6": ["5", "0", "1"],
|
||||
"7": ["1"],
|
||||
"8": ["0", "3"],
|
||||
"9": [],
|
||||
}
|
||||
candidates = set()
|
||||
for i in range(len(rc)):
|
||||
candidates.add(rc[:i] + rc[i+1:])
|
||||
for i in range(len(rc) + 1):
|
||||
candidates.add(rc[:i] + "0" + rc[i:])
|
||||
for i, ch in enumerate(rc):
|
||||
for alt in similar.get(ch, []):
|
||||
candidates.add(rc[:i] + alt + rc[i+1:])
|
||||
candidates.discard(rc)
|
||||
return sorted(c for c in candidates if len(c) in (9, 10))
|
||||
|
||||
def _rc_checksum_ok(rc: str) -> bool:
|
||||
digits = re.sub(r"\D", "", rc)
|
||||
return len(digits) == 10 and int(digits) % 11 == 0
|
||||
|
||||
def _rc_candidates_level2(rc: str) -> list[str]:
    """Return candidates two OCR errors away from *rc*, sorted.

    Built by applying ``_rc_candidates`` to every single-error candidate.
    Single-error candidates and *rc* itself are excluded. Because the
    two-error set is large, only candidates passing ``_rc_checksum_ok`` are
    kept.
    """
    single_error = set(_rc_candidates(rc))
    double_error = {
        second
        for first in single_error
        for second in _rc_candidates(first)
    }
    double_error.difference_update(single_error)
    double_error.discard(rc)
    # Without the checksum filter there would be too many candidates.
    return sorted(
        cand
        for cand in double_error
        if len(cand) in (9, 10) and _rc_checksum_ok(cand)
    )
|
||||
|
||||
# ── Medicus ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def _verify_medicus(rc_digits: str) -> dict:
    """Look up a birth number in the Medicus database (table ``KAR``).

    The exact number is tried first, then the single-error candidates from
    ``_rc_candidates`` and finally the two-error candidates from
    ``_rc_candidates_level2``. The two-error set is generated only when it is
    actually needed.

    Returns a dict whose ``"status"`` is one of:

    * ``"ok"``        — exact match; ``"patient"`` holds idpac / prijmeni /
      jmeno / rodcis.
    * ``"fuzzy"``     — a candidate matched; ``"rc_corrected"`` is that
      candidate.
    * ``"not_found"`` — no match; ``"patient"`` is None.
    * ``"offline"``   — any exception (driver missing, connection or query
      failure); ``"error"`` carries the message.

    Never raises.
    """
    try:
        import fdb

        cfg = get_medicus_config()
        # NOTE(review): the database credentials are hard-coded here; consider
        # sourcing them from the Medicus configuration or the environment.
        con = fdb.connect(dsn=cfg.dsn, user="SYSDBA", password="masterkey", charset="win1250")
        try:
            cur = con.cursor()

            def _lookup(rc: str) -> Optional[dict]:
                cur.execute(
                    "SELECT IDPAC, PRIJMENI, JMENO, RODCIS FROM KAR "
                    "WHERE REPLACE(RODCIS, '/', '') = ?", (rc,)
                )
                row = cur.fetchone()
                if not row:
                    return None
                # Text columns may be NULL; treat that as an empty string so a
                # patient with a missing field is not reported as "offline".
                return {
                    "idpac": row[0],
                    "prijmeni": (row[1] or "").strip(),
                    "jmeno": (row[2] or "").strip(),
                    "rodcis": (row[3] or "").strip(),
                }

            p = _lookup(rc_digits)
            if p:
                return {"status": "ok", "patient": p}
            # Generators are called one after the other so the expensive
            # two-error set is only built when level 1 found nothing.
            for generate in (_rc_candidates, _rc_candidates_level2):
                for c in generate(rc_digits):
                    p = _lookup(c)
                    if p:
                        return {"status": "fuzzy", "rc_corrected": c, "patient": p}
            return {"status": "not_found", "patient": None}
        finally:
            con.close()
    except Exception as e:
        return {"status": "offline", "patient": None, "error": str(e)}
|
||||
|
||||
# ── Jméno výstupního souboru ──────────────────────────────────────────────────
|
||||
|
||||
def _format_filename(group_idx: int, medicus: Optional[dict]) -> str:
|
||||
p = medicus.get("patient") if medicus else None
|
||||
if p:
|
||||
rc = re.sub(r"\D", "", p["rodcis"])
|
||||
return f"{rc} {p['prijmeni']}, {p['jmeno']} split_{group_idx:03d}.pdf"
|
||||
return f"split_{group_idx:03d}.pdf"
|
||||
|
||||
# ── OCR worker (pozadí) ───────────────────────────────────────────────────────
|
||||
|
||||
class OcrWorker:
    """Background OCR of PDF pages: Tesseract → Claude Vision (fallback) → Medicus.

    Results are kept in ``self.results`` (page index → result dict) and
    written as JSON to ``cache_path`` after every processed page. Pages that
    already have a cached result are skipped, except results whose Medicus
    status is ``"offline"`` — those are processed again so a temporary database
    outage does not stay in the cache forever.

    ``on_page_done(page_idx)`` is called from the worker thread after each
    processed page.

    NOTE(review): the same ``fitz.Document`` is also used by other threads in
    this program; confirm against the PyMuPDF documentation that concurrent
    access is acceptable.
    """

    def __init__(self, doc: fitz.Document, cache_path: Path, on_page_done):
        self.doc = doc
        self.cache_path = cache_path
        self.on_page_done = on_page_done  # callback(page_idx: int)
        self.results: dict[int, dict] = {}
        self._stop = threading.Event()
        self._lock = threading.Lock()
        self._load_cache()

    def _load_cache(self):
        """Load previously cached results; a broken cache file is only reported."""
        if self.cache_path.exists():
            try:
                data = json.loads(self.cache_path.read_text(encoding="utf-8"))
                # JSON object keys are strings; page indices are ints.
                self.results = {int(k): v for k, v in data.items()}
                print(f"[OCR cache] načteno {len(self.results)} stránek z {self.cache_path.name}")
            except Exception as e:
                print(f"[OCR cache] chyba čtení: {e}")

    def _save_cache(self):
        """Write all results to the cache file."""
        with self._lock:
            self.cache_path.write_text(
                json.dumps(self.results, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )

    def start(self):
        """Start processing in a daemon thread."""
        t = threading.Thread(target=self._run, daemon=True)
        t.start()

    def stop(self):
        """Ask the worker to stop before it begins the next page."""
        self._stop.set()

    @staticmethod
    def _needs_processing(cached: Optional[dict]) -> bool:
        """Return True for a page with no result or one obtained while Medicus was offline."""
        if cached is None:
            return True
        return (cached.get("medicus") or {}).get("status") == "offline"

    def _run(self):
        """Thread body: process every page that still needs it, in page order."""
        import pytesseract
        pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH

        n = len(self.doc)
        for i in range(n):
            if self._stop.is_set():
                break
            if not self._needs_processing(self.results.get(i)):
                continue  # cache hit

            result = self._process_page(i, pytesseract)
            # Store under the lock that also guards serialisation in
            # _save_cache (the lock is not re-entrant, so save afterwards).
            with self._lock:
                self.results[i] = result
            self._save_cache()
            self.on_page_done(i)

    def _process_page(self, i: int, ocr_engine) -> dict:
        """Render page *i*, extract the birth number and verify it in Medicus.

        *ocr_engine* is the imported ``pytesseract`` module. Returns the result
        dict with keys ``rc``, ``medicus``, ``tesseract_text``, ``claude_raw``.
        """
        page = self.doc[i]
        mat = fitz.Matrix(2.0, 2.0)  # 144 DPI — sufficient for OCR
        pix = page.get_pixmap(matrix=mat, colorspace=fitz.csRGB)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

        # 1. Tesseract
        rc = None
        tess_text = None
        try:
            tess_text = ocr_engine.image_to_string(img, lang="ces")
            rc = _extract_rc(tess_text)
        except Exception as e:
            print(f"[OCR str.{i+1}] Tesseract: {e}")

        # 2. Medicus — first attempt
        medicus = _verify_medicus(rc) if rc else None

        # 3. Claude Vision — when Tesseract found no RČ, or found one that
        #    Medicus does not know
        claude_raw = None
        if not rc or (medicus and medicus.get("status") == "not_found"):
            try:
                rc_claude, claude_raw = self._claude_rc(img)
                if rc_claude:
                    medicus_claude = _verify_medicus(rc_claude)
                    if medicus_claude.get("status") in ("ok", "fuzzy"):
                        print(f"[OCR str.{i+1}] Claude opravil RČ: {rc} → {rc_claude}")
                        rc = rc_claude
                        medicus = medicus_claude
                    elif not rc:
                        # Nothing better available: keep Claude's unverified value.
                        rc = rc_claude
                        medicus = medicus_claude
            except Exception as e:
                print(f"[OCR str.{i+1}] Claude: {e}")

        return {
            "rc": rc,
            "medicus": medicus,
            "tesseract_text": tess_text,
            "claude_raw": claude_raw,
        }

    def _claude_rc(self, img: Image.Image) -> tuple[Optional[str], Optional[str]]:
        """Ask Claude Vision for the birth number on *img*.

        Returns ``(digits or None, raw model reply)``. The reply is expected to
        be JSON ``{"rodne_cislo": ...}``; an optional Markdown code fence
        around it is removed first. A reply that cannot be parsed yields
        ``(None, raw)``. API errors propagate to the caller.
        """
        import anthropic, base64

        buf = io.BytesIO()
        img.save(buf, format="JPEG", quality=80)
        b64 = base64.standard_b64encode(buf.getvalue()).decode()

        client = anthropic.Anthropic(api_key=os.environ.get("ANTHROPIC_API_KEY"))
        resp = client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=100,
            messages=[{"role": "user", "content": [
                {"type": "image", "source": {"type": "base64", "media_type": "image/jpeg", "data": b64}},
                {"type": "text", "text": (
                    "Najdi rodné číslo na tomto naskenovaném dokumentu. "
                    "Vrať JSON: {\"rodne_cislo\": \"XXXXXXXXXX\"} nebo {\"rodne_cislo\": null}. "
                    "Jen JSON, nic jiného."
                )},
            ]}],
        )
        raw = resp.content[0].text.strip()
        raw = re.sub(r"^```\w*\n?", "", raw).rstrip("`").strip()
        try:
            value = json.loads(raw).get("rodne_cislo")
        except Exception:
            return None, raw
        # The model may answer with a JSON number instead of a string; accept
        # both (a number loses leading zeros, which the fuzzy lookup can
        # restore by re-inserting a "0").
        if isinstance(value, bool) or not isinstance(value, (str, int)):
            return None, raw
        return re.sub(r"\D", "", str(value)) or None, raw
|
||||
|
||||
# ── Thumbnail worker (pozadí) ─────────────────────────────────────────────────
|
||||
|
||||
class ThumbnailWorker:
    """Render PDF pages to fixed-size PIL thumbnails in a background thread.

    The daemon thread is started by the constructor. Each page is scaled to
    fit ``thumb_w`` × ``thumb_h`` and centred on a dark canvas of exactly that
    size. ``on_thumb_done(page_idx)`` is called from the worker thread once a
    thumbnail is available through ``get``.
    """

    def __init__(self, doc: fitz.Document, thumb_w: int, thumb_h: int, on_thumb_done):
        self.doc = doc
        self.thumb_w = thumb_w
        self.thumb_h = thumb_h
        self.on_thumb_done = on_thumb_done  # callback(page_idx: int)
        self._cache: dict[int, Image.Image] = {}
        self._lock = threading.Lock()
        threading.Thread(target=self._run, daemon=True).start()

    def get(self, page_idx: int) -> Optional[Image.Image]:
        """Return the finished thumbnail of *page_idx*, or None if not rendered yet."""
        with self._lock:
            return self._cache.get(page_idx)

    def _run(self):
        """Thread body: render all pages in document order."""
        frame_size = (self.thumb_w, self.thumb_h)
        for idx in range(len(self.doc)):
            page = self.doc[idx]
            bounds = page.rect
            # Largest uniform zoom at which the page still fits the frame.
            zoom = min(self.thumb_w / bounds.width, self.thumb_h / bounds.height)
            pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), colorspace=fitz.csRGB)
            rendered = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            # Centre the rendering on a canvas of fixed size.
            framed = Image.new("RGB", frame_size, (38, 38, 38))
            offset = (
                (self.thumb_w - rendered.width) // 2,
                (self.thumb_h - rendered.height) // 2,
            )
            framed.paste(rendered, offset)
            with self._lock:
                self._cache[idx] = framed
            self.on_thumb_done(idx)
|
||||
|
||||
# ── Colours and dimensions ────────────────────────────────────────────────────

COLS = 4
BORDER_W = 16  # width of the separator between slots
PAD = 8  # inset of the thumbnail from the slot edge
INFO_H = 108  # height of the info strip under the thumbnail
TOP_H = 44  # height of the status bar at the top
BOT_H = 44  # height of the key hints at the bottom

# The remaining dimensions are computed in SplitterUI.__init__ from the screen
# resolution.

BG = "#1e1e1e"
BG_SLOT = "#262626"
BG_INFO = "#181818"
C_CURSOR = "#4da6ff"
C_BOUNDARY = "#cc3333"
C_SAME = "#3a3a3a"
C_OK = "#4caf50"
C_FUZZY = "#ff9800"
C_NONE = "#f44336"
C_OFFLINE = "#888888"
C_LOADING = "#555555"
C_TEXT = "#dddddd"
C_DIM = "#666666"

# Slot background colours, cycled by group index.
GROUP_COLORS = [
    "#1b2a3a", "#2a1b3a", "#1b3a2a", "#3a2a1b",
    "#2a3a1b", "#1b2a2a", "#3a1b2a", "#2a2a1b",
]
|
||||
|
||||
# ── Hlavní UI ─────────────────────────────────────────────────────────────────
|
||||
|
||||
class SplitterUI:
    """Tk window for splitting a scanned PDF into per-patient groups.

    The window shows ``COLS`` page slots at a time. State is positional:
    ``page_order`` lists the original page indices in display order and
    ``boundaries`` holds the positions (in ``page_order``) at which a new
    group starts; position 0 is always a boundary.
    """

    # Windows virtual-key codes of the numeric keypad (NumLock on) → action.
    _KEYCODE_ACTIONS = {
        100: "num4", 101: "num5", 102: "num6",
        103: "num7", 105: "num9",
        97: "num1", 99: "num3", 110: "numdot",
        111: "numslash", 106: "numstar",
    }
    # Key symbols (keypad with NumLock off, and main-keyboard keys) → action.
    _KEYSYM_ACTIONS = {
        "Left": "num4", "Right": "num6",
        "Home": "num7", "Prior": "num9",
        "Clear": "num5", "End": "num1",
        "Next": "num3", "Delete": "numdot",
        "space": "num5",
        "KP_Divide": "numslash", "KP_Multiply": "numstar",
        "slash": "numslash", "asterisk": "numstar",
    }

    def __init__(self, root: tk.Tk, pdf_path: Path):
        """Open *pdf_path*, size the window to the screen, build the UI and start the workers."""
        self.root = root
        self.pdf_path = pdf_path
        self.doc = fitz.open(str(pdf_path))
        n = len(self.doc)

        # State
        self.page_order: list[int] = list(range(n))
        self.boundaries: set[int] = {0}  # positions (in page_order) starting a new group
        self.cursor: int = 0
        self.scroll: int = 0  # index of the left-most visible slot

        # Caches
        self.ocr_results: dict[int, dict] = {}
        self._photo_cache: dict[tuple, ImageTk.PhotoImage] = {}  # (page_idx, rot) → photo
        self.rotations: dict[int, int] = {}  # page_idx → degrees (0/90/180/270)

        # Dimensions
        sw = root.winfo_screenwidth()
        sh = root.winfo_screenheight()
        self.SLOT_W = (sw - (COLS - 1) * BORDER_W) // COLS
        self.THUMB_W = self.SLOT_W - 2 * PAD
        self.THUMB_H = int(self.THUMB_W * 842 / 595)  # A4 aspect ratio
        self.CANVAS_W = COLS * self.SLOT_W + (COLS - 1) * BORDER_W
        self.CANVAS_H = PAD + self.THUMB_H + PAD + INFO_H
        win_h = min(TOP_H + self.CANVAS_H + BOT_H, sh - 60)

        root.title(f"PDF Dělení — {pdf_path.name}")
        root.configure(bg=BG)
        root.geometry(f"{self.CANVAS_W}x{win_h}+0+0")

        self._build_ui()
        self._start_workers()

    # ── UI construction ───────────────────────────────────────────────────────

    def _build_ui(self):
        """Create the status label, the canvas and the hint label; bind the keyboard."""
        self.top_label = tk.Label(
            self.root, bg=BG, fg=C_TEXT,
            font=("Consolas", 13), anchor="w", padx=12
        )
        self.top_label.pack(fill="x", side="top", ipady=4)

        self.canvas = tk.Canvas(
            self.root, width=self.CANVAS_W, height=self.CANVAS_H,
            bg=BG, highlightthickness=0
        )
        self.canvas.pack(fill="both", expand=True)

        hints = (
            "4/6: navigace ←/→ 7/9: skok ×4 "
            "5/Space: hranice pacienta "
            "1/3: přesuň stránku "
            "/: otočit ↺CCW *: otočit ↻CW "
            "Del/.: smaž stránku "
            "Enter: exportuj Esc: konec"
        )
        self.bot_label = tk.Label(
            self.root, text=hints, bg=BG, fg=C_DIM,
            font=("Consolas", 11), anchor="center"
        )
        self.bot_label.pack(fill="x", side="bottom", ipady=6)

        self.root.bind("<KeyPress>", self._on_key)
        self.root.focus_set()
        self._redraw()

    # ── Worker start-up ───────────────────────────────────────────────────────

    def _start_workers(self):
        """Create the OCR and thumbnail workers.

        Worker callbacks run in the worker threads and only schedule the real
        handlers through ``root.after``.
        """
        cache_path = self.pdf_path.parent / (self.pdf_path.stem + "_ocr_cache.json")

        self.ocr_worker = OcrWorker(
            self.doc, cache_path,
            on_page_done=lambda idx: self.root.after(0, self._on_ocr_done, idx),
        )
        # Take over the results loaded from the cache
        self.ocr_results.update(self.ocr_worker.results)
        self._auto_detect_boundaries()
        self.ocr_worker.start()

        self.thumb_worker = ThumbnailWorker(
            self.doc, self.THUMB_W, self.THUMB_H,
            on_thumb_done=lambda idx: self.root.after(0, self._on_thumb_done, idx),
        )

    @staticmethod
    def _patient_key(result: Optional[dict]):
        """Return a hashable identity of the confirmed patient on a page, or None.

        A page counts as confirmed when it has a birth number and its Medicus
        status is ``"ok"`` or ``"fuzzy"``. The database id is used as identity
        so that an exact and an OCR-damaged (fuzzy) reading of the same
        patient compare equal; the corrected or raw birth number is the
        fallback when no id is present.
        """
        if not result:
            return None
        rc = result.get("rc")
        med = result.get("medicus") or {}
        if not rc or med.get("status") not in ("ok", "fuzzy"):
            return None
        idpac = (med.get("patient") or {}).get("idpac")
        if idpac is not None:
            return ("idpac", idpac)
        return ("rc", med.get("rc_corrected") or rc)

    def _auto_detect_boundaries(self):
        """Add a boundary wherever two consecutive confirmed pages belong to different patients.

        Pages without a confirmed patient are skipped and never create a
        boundary. Existing boundaries are kept.
        """
        prev_key = None
        for pos, page_idx in enumerate(self.page_order):
            key = self._patient_key(self.ocr_results.get(page_idx))
            if key is None:
                continue
            if prev_key is not None and key != prev_key:
                self.boundaries.add(pos)
            prev_key = key

    # ── Worker callbacks ──────────────────────────────────────────────────────

    def _on_ocr_done(self, page_idx: int):
        """Take over the OCR result of one page; auto-detect boundaries once all pages are done."""
        self.ocr_results[page_idx] = self.ocr_worker.results[page_idx]
        # Auto-detection runs when every page still present has a result and
        # the user has not changed the boundaries yet. Checking page_order
        # (not the number of results) keeps this correct after a page was
        # deleted.
        if self.boundaries == {0} and all(
            p in self.ocr_results for p in self.page_order
        ):
            self._auto_detect_boundaries()
        self._redraw()

    def _on_thumb_done(self, page_idx: int):
        """Build the Tk image for a freshly rendered thumbnail and repaint."""
        self._rebuild_photo(page_idx)
        self._redraw()

    def _rebuild_photo(self, page_idx: int):
        """Ensure a ``PhotoImage`` exists for *page_idx* at its current rotation.

        Does nothing while the thumbnail is not rendered yet. A rotated
        thumbnail whose size no longer equals the thumbnail box (90°/270°) is
        scaled to fit and centred instead of being stretched.
        """
        pil = self.thumb_worker.get(page_idx)
        if pil is None:
            return
        rot = self.rotations.get(page_idx, 0)
        key = (page_idx, rot)
        if key in self._photo_cache:
            return
        box = (self.THUMB_W, self.THUMB_H)
        img = pil.rotate(rot, expand=True)
        if img.size != box:
            scale = min(self.THUMB_W / img.width, self.THUMB_H / img.height)
            fitted = img.resize(
                (max(1, int(img.width * scale)), max(1, int(img.height * scale))),
                Image.LANCZOS,
            )
            img = Image.new("RGB", box, (38, 38, 38))
            img.paste(
                fitted,
                ((self.THUMB_W - fitted.width) // 2, (self.THUMB_H - fitted.height) // 2),
            )
        self._photo_cache[key] = ImageTk.PhotoImage(img)

    # ── Keyboard ──────────────────────────────────────────────────────────────

    def _on_key(self, event):
        """Translate a key press into an action.

        Numpad keycodes (Windows): 96=KP0 97=KP1 ... 105=KP9 110=KP.
        NumLock ON  → keysym='1'..'9', keycode=97..105
        NumLock OFF → keysym=Left/Clear/Right/Home/Up/Prior/Down
        The keycode table is consulted first, then the keysym table.
        """
        ks = event.keysym
        action = self._KEYCODE_ACTIONS.get(event.keycode) or self._KEYSYM_ACTIONS.get(ks)

        handlers = {
            "num4": lambda: self._move_cursor(-1),
            "num6": lambda: self._move_cursor(1),
            "num7": lambda: self._move_cursor(-COLS),
            "num9": lambda: self._move_cursor(COLS),
            "num5": self._toggle_boundary,
            "num1": lambda: self._move_page(-1),
            "num3": lambda: self._move_page(1),
            "numslash": lambda: self._rotate_page(90),   # CCW
            "numstar": lambda: self._rotate_page(-90),   # CW
            "numdot": self._delete_page,
        }
        handler = handlers.get(action)
        if handler is not None:
            handler()
        elif ks in ("Return", "KP_Enter"):
            self._export()
        elif ks == "Escape":
            # Let the OCR thread finish its current page and stop.
            self.ocr_worker.stop()
            self.root.quit()

    # ── Movement and manipulation ─────────────────────────────────────────────

    def _ensure_cursor_visible(self):
        """Scroll just far enough that the cursor slot is among the visible ones."""
        if self.cursor < self.scroll:
            self.scroll = self.cursor
        elif self.cursor >= self.scroll + COLS:
            self.scroll = self.cursor - COLS + 1

    def _move_cursor(self, delta: int):
        """Move the cursor by *delta* positions, clamped to the page range."""
        n = len(self.page_order)
        self.cursor = max(0, min(n - 1, self.cursor + delta))
        self._ensure_cursor_visible()
        self._redraw()

    def _toggle_boundary(self):
        """Toggle the group boundary before the cursor page (position 0 is fixed)."""
        pos = self.cursor
        if pos == 0:
            return
        # Symmetric difference flips membership of pos.
        self.boundaries ^= {pos}
        self._redraw()

    def _rotate_page(self, delta: int):
        """Rotate the cursor page by *delta* degrees (positive = counter-clockwise)."""
        page_idx = self.page_order[self.cursor]
        rot = (self.rotations.get(page_idx, 0) + delta) % 360
        self.rotations[page_idx] = rot
        self._rebuild_photo(page_idx)
        self._redraw()

    @staticmethod
    def _boundaries_after_delete(boundaries: set[int], pos: int, remaining: int) -> set[int]:
        """Return the boundary set after the page at *pos* was removed.

        Boundaries behind *pos* move one position left. A boundary at *pos*
        itself is kept: the page that slides into *pos* becomes the first page
        of that group, so deleting the first page of a group does not merge
        the rest into the previous patient. Boundaries beyond the last
        remaining position are dropped and position 0 is always included.
        """
        shifted = {b - 1 if b > pos else b for b in boundaries}
        kept = {b for b in shifted if b < remaining}
        kept.add(0)  # the first page always starts a group
        return kept

    def _delete_page(self):
        """Remove the cursor page from the order (the last remaining page cannot be removed)."""
        if len(self.page_order) == 1:
            return
        pos = self.cursor
        self.page_order.pop(pos)
        self.boundaries = self._boundaries_after_delete(
            self.boundaries, pos, len(self.page_order)
        )
        self.cursor = min(pos, len(self.page_order) - 1)
        self._ensure_cursor_visible()
        self._redraw()

    def _move_page(self, delta: int):
        """Swap the cursor page with its neighbour *delta* positions away; the cursor follows."""
        n = len(self.page_order)
        pos = self.cursor
        new_pos = pos + delta
        if new_pos < 0 or new_pos >= n:
            return
        self.page_order[pos], self.page_order[new_pos] = (
            self.page_order[new_pos], self.page_order[pos]
        )
        self.cursor = new_pos
        self._ensure_cursor_visible()
        self._redraw()

    # ── Groups ────────────────────────────────────────────────────────────────

    def _group_of_pos(self) -> list[int]:
        """Return the group index of every position in ``page_order``."""
        result = []
        gi = 0
        for pos in range(len(self.page_order)):
            if pos in self.boundaries and pos > 0:
                gi += 1
            result.append(gi)
        return result

    def _get_groups(self) -> list[list[int]]:
        """Return the groups, each a list of page indices in ``page_order`` order."""
        groups: list[list[int]] = []
        current: list[int] = []
        for pos, page_idx in enumerate(self.page_order):
            if pos in self.boundaries and current:
                groups.append(current)
                current = []
            current.append(page_idx)
        if current:
            groups.append(current)
        return groups

    def _best_medicus(self, pages: list[int]) -> Optional[dict]:
        """Return the Medicus result to name a group by: the first "ok" page, else the first "fuzzy"."""
        for status in ("ok", "fuzzy"):
            for p in pages:
                r = self.ocr_results.get(p)
                if r and r.get("medicus") and r["medicus"].get("status") == status:
                    return r["medicus"]
        return None

    # ── Export ────────────────────────────────────────────────────────────────

    def _export(self):
        """Write every group as its own PDF into ``SPLIT_DIR`` and report the outcome.

        User rotations are applied to the exported pages. A failure is shown
        in an error dialog together with the files already written.
        """
        groups = self._get_groups()
        names = []
        try:
            SPLIT_DIR.mkdir(parents=True, exist_ok=True)
            for i, pages in enumerate(groups, 1):
                med = self._best_medicus(pages)
                name = _format_filename(i, med)
                out_path = SPLIT_DIR / name
                out_doc = fitz.open()
                try:
                    for page_idx in pages:
                        out_doc.insert_pdf(self.doc, from_page=page_idx, to_page=page_idx)
                        rot = self.rotations.get(page_idx, 0)
                        if rot:
                            # The stored angle is counter-clockwise, hence the minus.
                            out_doc[-1].set_rotation((out_doc[-1].rotation - rot) % 360)
                    out_doc.save(str(out_path))
                finally:
                    out_doc.close()
                names.append(f"{name} ({len(pages)} str.)")
                print(f" Exportováno: {name}")
        except Exception as e:
            messagebox.showerror(
                "Export selhal",
                f"Export se nezdařil:\n{e}\n\nHotové soubory:\n" + "\n".join(names),
            )
            return
        messagebox.showinfo(
            "Export hotov",
            f"Exportováno {len(groups)} skupin do:\n{SPLIT_DIR}\n\n" + "\n".join(names),
        )

    # ── Drawing ───────────────────────────────────────────────────────────────

    @staticmethod
    def _info_lines(result: Optional[dict]):
        """Return ``(rc_line, patient_line, colour)`` for the info strip of one page."""
        if result is None:
            return "⏳ OCR probíhá…", "", C_LOADING
        rc = result.get("rc")
        rc_line = f"RČ: {rc}" if rc else "RČ: nenalezeno"
        med = result.get("medicus")
        if med:
            s = med["status"]
            p = med.get("patient")
            if s == "ok" and p:
                return rc_line, f"{p['prijmeni']} {p['jmeno']}", C_OK
            if s == "fuzzy" and p:
                return rc_line, f"~ {p['prijmeni']} {p['jmeno']}", C_FUZZY
            if s == "not_found":
                return rc_line, "Nenalezen v Medicus", C_NONE
            return rc_line, "Medicus offline", C_OFFLINE
        if rc:
            return rc_line, "Ověřuji…", C_LOADING
        return rc_line, "", C_NONE

    def _redraw(self):
        """Repaint the visible slots, the separators and the status bar."""
        c = self.canvas
        c.delete("all")
        n = len(self.page_order)
        group_of = self._group_of_pos()
        # Count results of the pages still present (indices in page_order are
        # original page numbers, not 0..n-1, once a page was deleted).
        ocr_done = sum(1 for p in self.page_order if p in self.ocr_results)

        # Background
        c.create_rectangle(0, 0, self.CANVAS_W, self.CANVAS_H, fill=BG, outline="")

        for col in range(COLS):
            pos = self.scroll + col
            if pos >= n:
                break

            page_idx = self.page_order[pos]
            gi = group_of[pos]
            x0 = col * (self.SLOT_W + BORDER_W)

            # ── Slot background ──────────────────────────────────────────────
            slot_color = GROUP_COLORS[gi % len(GROUP_COLORS)]
            c.create_rectangle(
                x0, 0, x0 + self.SLOT_W, PAD + self.THUMB_H + PAD,
                fill=slot_color, outline=""
            )

            # ── Thumbnail ────────────────────────────────────────────────────
            rot = self.rotations.get(page_idx, 0)
            photo = self._photo_cache.get((page_idx, rot))
            if photo:
                c.create_image(x0 + PAD, PAD, anchor="nw", image=photo)
            else:
                c.create_text(
                    x0 + self.SLOT_W // 2, PAD + self.THUMB_H // 2,
                    text=f"⏳\nstr. {pos + 1}",
                    fill=C_LOADING, font=("Consolas", 18), justify="center"
                )

            # ── Cursor ───────────────────────────────────────────────────────
            if pos == self.cursor:
                c.create_rectangle(
                    x0 + 2, 2, x0 + self.SLOT_W - 2, PAD + self.THUMB_H + PAD - 2,
                    outline=C_CURSOR, width=5
                )

            # ── Info strip ───────────────────────────────────────────────────
            y_info = PAD + self.THUMB_H + PAD
            c.create_rectangle(
                x0, y_info, x0 + self.SLOT_W, y_info + INFO_H,
                fill=BG_INFO, outline=""
            )

            rc_line, pat_line, stat_color = self._info_lines(
                self.ocr_results.get(page_idx)
            )

            c.create_text(
                x0 + 8, y_info + 6,
                text=f"str. {pos + 1}/{n} (orig: {page_idx + 1})",
                anchor="nw", fill=C_DIM, font=("Consolas", 10)
            )
            c.create_text(
                x0 + 8, y_info + 26,
                text=rc_line,
                anchor="nw", fill=stat_color, font=("Consolas", 13, "bold")
            )
            c.create_text(
                x0 + 8, y_info + 52,
                text=pat_line,
                anchor="nw", fill=stat_color, font=("Consolas", 14, "bold")
            )

            # ── Separator to the right of this slot ──────────────────────────
            if col < COLS - 1:
                next_pos = pos + 1
                is_new = next_pos in self.boundaries
                x_sep = x0 + self.SLOT_W
                c.create_rectangle(
                    x_sep, 0, x_sep + BORDER_W, self.CANVAS_H,
                    fill=C_BOUNDARY if is_new else C_SAME, outline=""
                )
                if is_new:
                    c.create_text(
                        x_sep + BORDER_W // 2, self.CANVAS_H // 2,
                        text="▼\nNOVÝ",
                        fill="white", font=("Consolas", 7, "bold"), justify="center"
                    )

        # ── Status bar at the top ─────────────────────────────────────────────
        groups = self._get_groups()
        self.top_label.config(
            text=(
                f" str. {self.cursor + 1}/{n} │ "
                f"skupiny: {len(groups)} │ "
                f"OCR: {ocr_done}/{n} │ "
                f"{self.pdf_path.name}"
            )
        )
|
||||
|
||||
|
||||
# ── Vstup ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Pick the input PDF and run the splitter window.

    The input is the first command-line argument if given; otherwise
    ``PATH_TO_TESTFILE`` when ``TESTOVANI`` is set; otherwise a file-open
    dialog. Exits with status 0 when the dialog is cancelled and with status 1
    when the chosen file does not exist.
    """
    root = tk.Tk()
    root.withdraw()  # keep the empty main window hidden while choosing a file

    def _abort(code: int):
        root.destroy()
        sys.exit(code)

    cli_args = sys.argv[1:]
    if cli_args:
        pdf_path = Path(cli_args[0])
    elif TESTOVANI:
        pdf_path = Path(PATH_TO_TESTFILE)
    else:
        from tkinter import filedialog
        chosen = filedialog.askopenfilename(
            title="Vyber vstupní PDF",
            initialdir=str(_RICOH),
            filetypes=[("PDF soubory", "*.pdf")],
        )
        if not chosen:
            _abort(0)
        pdf_path = Path(chosen)

    if not pdf_path.exists():
        print(f"Soubor nenalezen: {pdf_path}")
        _abort(1)

    root.deiconify()
    # Bound to a local name so the UI object stays referenced during mainloop.
    ui = SplitterUI(root, pdf_path)
    root.mainloop()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user