Files
2026-05-06 13:24:43 +02:00

138 lines
4.2 KiB
Python

"""
Batch crop Killer Sudoku PDF souborů — odstraní nadpis nahoře a copyright dole.
Zachovává vektorový obsah (cairo-generované PDF).
Použití:
python 20_CropPuzzles.py <vstup_dir> <vystup_dir> [--workers N]
"""
import argparse
import csv
import sys
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path
import fitz # PyMuPDF
from tqdm import tqdm
def _first_wide_gap_midpoint(pairs, min_gap):
    """Return the midpoint of the first (low, high) pair wider than min_gap, else None."""
    for low, high in pairs:
        if high - low > min_gap:
            return (low + high) / 2
    return None


def detect_cuts(paths, top_gap=10, bottom_gap=5):
    """Find the vertical cut positions that isolate the main drawing area.

    The distinct, rounded top edges (``rect.y0``) of all drawings are scanned
    from the smallest value upward; the first gap between neighbouring values
    that exceeds ``top_gap`` gives the top cut.  The distinct, rounded bottom
    edges (``rect.y1``) are scanned from the largest value downward; the
    first gap that exceeds ``bottom_gap`` gives the bottom cut.  Each cut is
    placed at the midpoint of its gap.

    Args:
        paths: Iterable of drawing dicts, each holding a ``"rect"`` entry
            with ``y0`` and ``y1`` attributes.  May be a one-shot iterator.
        top_gap: Minimum gap (exclusive) between consecutive top edges that
            counts as the boundary below the header.  Defaults to 10.
        bottom_gap: Minimum gap (exclusive) between consecutive bottom edges
            that counts as the boundary above the footer.  Defaults to 5.

    Returns:
        A ``(top_cut, bot_cut)`` tuple.  Either element is ``None`` when no
        sufficiently wide gap exists (including empty input).
    """
    # Materialize once so that generators are not exhausted by the first pass.
    rects = [p["rect"] for p in paths]
    tops = sorted({round(r.y0) for r in rects})
    bottoms = sorted({round(r.y1) for r in rects})

    # Top cut: walk neighbouring pairs in ascending order.
    top_cut = _first_wide_gap_midpoint(zip(tops, tops[1:]), top_gap)

    # Bottom cut: walk the same kind of pairs, but starting from the page bottom.
    bottom_pairs = list(zip(bottoms, bottoms[1:]))
    bot_cut = _first_wide_gap_midpoint(reversed(bottom_pairs), bottom_gap)

    return top_cut, bot_cut
def crop_one(args):
    """Crop the first page of one PDF and write the result to a new file.

    The page's drawings are passed to ``detect_cuts`` to find the top and
    bottom cut positions; the region between them (full page width) is
    embedded into a fresh single-page document via ``show_pdf_page``.

    Args:
        args: ``(src_path, dst_path)`` packed into a single tuple so the
            function can be submitted to an executor with one argument.
            ``dst_path`` must be a ``pathlib.Path`` (its ``parent``,
            ``name``, ``with_name`` and ``replace`` are used).

    Returns:
        ``(str(src_path), status, detail)`` where ``status`` is ``"ok"``,
        ``"anomalie"`` (no drawings, or gap detection failed) or ``"chyba"``
        (an exception was raised); ``detail`` is empty on success.
    """
    src_path, dst_path = args
    try:
        # Context managers guarantee both documents are closed on every
        # path, including exceptions (workers are reused for many files).
        with fitz.open(str(src_path)) as doc_src:
            page = doc_src[0]
            paths = page.get_drawings()
            if not paths:
                return str(src_path), "anomalie", "žádné kresby (get_drawings prázdný)"

            top_cut, bot_cut = detect_cuts(paths)
            if top_cut is None or bot_cut is None:
                return str(src_path), "anomalie", f"gap detekce selhala (top={top_cut}, bot={bot_cut})"

            page_w = page.mediabox.width
            clip = fitz.Rect(0, top_cut, page_w, bot_cut)

            with fitz.open() as doc_new:
                new_page = doc_new.new_page(width=clip.width, height=clip.height)
                new_page.show_pdf_page(
                    fitz.Rect(0, 0, clip.width, clip.height), doc_src, 0, clip=clip
                )

                dst_path.parent.mkdir(parents=True, exist_ok=True)
                # Save under a temporary name and rename afterwards, so a
                # failed or interrupted save never leaves a truncated file at
                # dst_path that a later run would treat as already processed.
                tmp_path = dst_path.with_name(dst_path.name + ".tmp")
                try:
                    doc_new.save(str(tmp_path))
                    tmp_path.replace(dst_path)
                finally:
                    # After a successful rename the temp file no longer
                    # exists; this only removes leftovers from a failure.
                    if tmp_path.exists():
                        tmp_path.unlink()

        return str(src_path), "ok", ""
    except Exception as e:
        # Report the failure to the caller instead of killing the worker.
        return str(src_path), "chyba", str(e)
def main():
    """Command-line entry point: crop every PDF under the input directory.

    Parses the ``vstup`` / ``vystup`` directories and ``--workers``, collects
    all ``*.pdf`` files recursively, skips those whose output already exists,
    and processes the rest with ``crop_one`` in a process pool.  Every result
    whose status is not ``"ok"`` is written to ``errors.csv`` in the output
    directory.

    Exits with status 1 when the input directory does not exist, with status
    0 when it contains no PDF files, and with an argparse usage error when
    ``--workers`` is less than 1.
    """
    parser = argparse.ArgumentParser(description="Batch crop Killer Sudoku PDF")
    parser.add_argument("vstup", help="Vstupní adresář s PDF soubory")
    parser.add_argument("vystup", help="Výstupní adresář pro oříznuté PDF")
    parser.add_argument("--workers", type=int, default=4, help="Počet procesů (default: 4)")
    args = parser.parse_args()

    # ProcessPoolExecutor rejects max_workers <= 0 with a bare ValueError;
    # report it as a normal usage error instead of a traceback.
    if args.workers < 1:
        parser.error("--workers musí být alespoň 1")

    src_dir = Path(args.vstup)
    dst_dir = Path(args.vystup)

    if not src_dir.is_dir():
        print(f"Chyba: vstupní adresář neexistuje: {src_dir}", file=sys.stderr)
        sys.exit(1)

    dst_dir.mkdir(parents=True, exist_ok=True)

    all_pdfs = sorted(src_dir.rglob("*.pdf"))
    if not all_pdfs:
        print("Žádné PDF soubory nenalezeny.")
        sys.exit(0)

    # Skip files that were already processed; the output mirrors the
    # relative layout of the input directory.
    tasks = []
    skipped = 0
    for src in all_pdfs:
        dst = dst_dir / src.relative_to(src_dir)
        if dst.exists():
            skipped += 1
        else:
            tasks.append((src, dst))

    print(f"Celkem PDF: {len(all_pdfs)}, přeskočeno (existují): {skipped}, ke zpracování: {len(tasks)}")

    if not tasks:
        print("Vše již zpracováno.")
        return

    errors_csv = dst_dir / "errors.csv"
    errors = []

    with ProcessPoolExecutor(max_workers=args.workers) as executor:
        futures = {executor.submit(crop_one, task): task for task in tasks}
        with tqdm(total=len(tasks), unit="soubor") as bar:
            for future in as_completed(futures):
                try:
                    src_path, status, detail = future.result()
                except Exception as e:
                    # crop_one catches its own exceptions, so this is a
                    # failure of the pool itself (e.g. a worker process
                    # died).  Record it and keep going so the error report
                    # is still written.
                    src_path = str(futures[future][0])
                    status = "chyba"
                    detail = f"{type(e).__name__}: {e}"
                if status != "ok":
                    errors.append({"soubor": src_path, "typ": status, "detail": detail})
                bar.update(1)
                bar.set_postfix(chyby=len(errors))

    if errors:
        with open(errors_csv, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=["soubor", "typ", "detail"])
            writer.writeheader()
            writer.writerows(errors)
        print(f"\nChyby/anomálie: {len(errors)} — viz {errors_csv}")
    else:
        print("\nVšechny soubory zpracovány bez chyb.")
# Run the batch only when this file is executed as a script, not when it is
# imported. NOTE(review): the guard presumably also matters for the process
# pool used in main(), since worker processes may re-import this module
# depending on the multiprocessing start method — confirm for the target
# platform.
if __name__ == "__main__":
    main()