"""Batch-crop Killer Sudoku PDF files.

Removes the title at the top and the copyright notice at the bottom of the
first page of every PDF found under the input directory, writing the cropped
page to the same relative path under the output directory. The page is
re-embedded rather than rasterized, so vector content is preserved (the
inputs are expected to be cairo-generated PDFs).

Usage:
    python 20_CropPuzzles.py VSTUP VYSTUP [--workers N]
"""

import argparse
import csv
import sys
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path

import fitz  # PyMuPDF
from tqdm import tqdm


def detect_cuts(paths, top_gap=10, bottom_gap=5):
    """Locate the vertical crop positions from a page's drawing rectangles.

    The top edges (``y0``) and bottom edges (``y1``) of all drawings are
    rounded to integers, de-duplicated and sorted. The top cut is the midpoint
    of the first gap, scanning downwards, between consecutive top edges that
    is wider than ``top_gap``. The bottom cut is the midpoint of the first
    gap, scanning upwards, between consecutive bottom edges that is wider
    than ``bottom_gap``.

    Args:
        paths: Iterable of mappings with a ``"rect"`` entry exposing ``y0``
            and ``y1`` (as returned by ``Page.get_drawings()``).
        top_gap: Minimum gap between neighbouring top edges that counts as
            the separation below the title.
        bottom_gap: Minimum gap between neighbouring bottom edges that counts
            as the separation above the footer.

    Returns:
        ``(top_cut, bot_cut)``; either element is ``None`` when no
        sufficiently large gap was found.
    """
    tops = sorted({round(p["rect"].y0) for p in paths})
    bottoms = sorted({round(p["rect"].y1) for p in paths})

    top_cut = None
    for upper, lower in zip(tops, tops[1:]):
        if lower - upper > top_gap:
            top_cut = (upper + lower) / 2
            break

    bot_cut = None
    # Walk neighbouring pairs from the bottom of the page upwards.
    for upper, lower in reversed(list(zip(bottoms, bottoms[1:]))):
        if lower - upper > bottom_gap:
            bot_cut = (upper + lower) / 2
            break

    return top_cut, bot_cut


def crop_one(args):
    """Crop a single PDF and write the result.

    Args:
        args: ``(src_path, dst_path)`` tuple of ``Path`` objects. A single
            tuple argument keeps the function easy to submit to an executor.

    Returns:
        ``(src_path_as_str, status, detail)`` where ``status`` is ``"ok"``,
        ``"anomalie"`` (the page could not be analysed) or ``"chyba"`` (an
        exception occurred; ``detail`` holds its message). Never raises.
    """
    src_path, dst_path = args
    tmp_path = None
    try:
        # Context managers guarantee both documents are closed on every path,
        # including exceptions raised while rendering or saving.
        with fitz.open(str(src_path)) as doc_src:
            page = doc_src[0]
            paths = page.get_drawings()
            if not paths:
                return str(src_path), "anomalie", "žádné kresby (get_drawings prázdný)"

            top_cut, bot_cut = detect_cuts(paths)
            if top_cut is None or bot_cut is None:
                return (
                    str(src_path),
                    "anomalie",
                    f"gap detekce selhala (top={top_cut}, bot={bot_cut})",
                )
            if bot_cut <= top_cut:
                # An inverted or empty range cannot describe a page.
                return (
                    str(src_path),
                    "anomalie",
                    f"neplatný rozsah ořezu (top={top_cut}, bot={bot_cut})",
                )

            clip = fitz.Rect(0, top_cut, page.mediabox.width, bot_cut)

            dst_path.parent.mkdir(parents=True, exist_ok=True)
            # Save to a temporary sibling and rename afterwards so that an
            # interrupted run never leaves a truncated file at dst_path,
            # which main() would otherwise skip as "already processed".
            tmp_path = dst_path.with_name(dst_path.name + ".tmp")

            with fitz.open() as doc_new:
                new_page = doc_new.new_page(width=clip.width, height=clip.height)
                new_page.show_pdf_page(
                    fitz.Rect(0, 0, clip.width, clip.height),
                    doc_src,
                    0,
                    clip=clip,
                )
                doc_new.save(str(tmp_path))

            tmp_path.replace(dst_path)
        return str(src_path), "ok", ""
    except Exception as e:  # worker boundary: report instead of crashing
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                pass  # nothing was written, or cleanup is not possible
        return str(src_path), "chyba", str(e)


def main():
    """Parse arguments, crop all pending PDFs in parallel and report errors.

    PDFs whose output file already exists are skipped. Files that could not
    be processed are listed in ``errors.csv`` inside the output directory.
    """
    parser = argparse.ArgumentParser(description="Batch crop Killer Sudoku PDF")
    parser.add_argument("vstup", help="Vstupní adresář s PDF soubory")
    parser.add_argument("vystup", help="Výstupní adresář pro oříznuté PDF")
    parser.add_argument("--workers", type=int, default=4, help="Počet procesů (default: 4)")
    args = parser.parse_args()

    if args.workers < 1:
        # ProcessPoolExecutor would raise a bare ValueError otherwise.
        parser.error("--workers musí být kladné celé číslo")

    src_dir = Path(args.vstup)
    dst_dir = Path(args.vystup)

    if not src_dir.is_dir():
        print(f"Chyba: vstupní adresář neexistuje: {src_dir}", file=sys.stderr)
        sys.exit(1)

    dst_dir.mkdir(parents=True, exist_ok=True)

    all_pdfs = sorted(src_dir.rglob("*.pdf"))
    if not all_pdfs:
        print("Žádné PDF soubory nenalezeny.")
        sys.exit(0)

    # Skip files that have already been processed.
    tasks = []
    for src in all_pdfs:
        dst = dst_dir / src.relative_to(src_dir)
        if not dst.exists():
            tasks.append((src, dst))
    skipped = len(all_pdfs) - len(tasks)

    print(f"Celkem PDF: {len(all_pdfs)}, přeskočeno (existují): {skipped}, ke zpracování: {len(tasks)}")

    if not tasks:
        print("Vše již zpracováno.")
        return

    errors_csv = dst_dir / "errors.csv"
    errors = []

    with ProcessPoolExecutor(max_workers=args.workers) as executor:
        futures = {executor.submit(crop_one, t): t for t in tasks}
        with tqdm(total=len(tasks), unit="soubor") as bar:
            for future in as_completed(futures):
                try:
                    src_path, status, detail = future.result()
                except Exception as exc:
                    # crop_one never raises, so this means the worker process
                    # itself failed (e.g. it was killed). Record the failure
                    # instead of aborting the whole batch.
                    src_path = str(futures[future][0])
                    status = "chyba"
                    detail = f"{type(exc).__name__}: {exc}"
                if status != "ok":
                    errors.append({"soubor": src_path, "typ": status, "detail": detail})
                bar.update(1)
                bar.set_postfix(chyby=len(errors))

    if errors:
        with open(errors_csv, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=["soubor", "typ", "detail"])
            writer.writeheader()
            writer.writerows(errors)
        print(f"\nChyby/anomálie: {len(errors)} — viz {errors_csv}")
    else:
        print("\nVšechny soubory zpracovány bez chyb.")


if __name__ == "__main__":
    main()