# ordinaceprojekt/SběrDatRůzné/SudokuKiller/Testy/20_CropPuzzles.py

"""
Batch crop Killer Sudoku PDF souborů — odstraní nadpis nahoře a copyright dole.
Zachovává vektorový obsah (cairo-generované PDF).

Použití:
    python 20_CropPuzzles.py <vstup_dir> <vystup_dir> [--workers N]
"""

import argparse
import csv
import sys
from concurrent.futures import ProcessPoolExecutor, as_completed
from pathlib import Path

import fitz  # PyMuPDF
from tqdm import tqdm


def detect_cuts(paths, min_top_gap=10, min_bot_gap=5):
    """Locate the vertical cut positions that isolate the main drawing.

    The top edges (``rect.y0``) and bottom edges (``rect.y1``) of all
    drawings are rounded to integers, de-duplicated and sorted.  The top cut
    is the midpoint of the first gap wider than ``min_top_gap`` found when
    scanning the top edges from the smallest value upwards; the bottom cut is
    the midpoint of the first gap wider than ``min_bot_gap`` found when
    scanning the bottom edges from the largest value downwards.

    Args:
        paths: Iterable of mappings that expose a ``"rect"`` entry with
            ``y0`` and ``y1`` attributes (the shape ``crop_one`` passes in
            from ``page.get_drawings()``).
        min_top_gap: A gap between consecutive top edges must be strictly
            greater than this to count as the top separator.
        min_bot_gap: A gap between consecutive bottom edges must be strictly
            greater than this to count as the bottom separator.

    Returns:
        ``(top_cut, bot_cut)``.  Either element is ``None`` when no
        sufficiently large gap exists on that side.
    """
    # Materialise once: the input is walked twice and may be a generator.
    drawings = list(paths)
    top_edges = sorted({round(d["rect"].y0) for d in drawings})
    bottom_edges = sorted({round(d["rect"].y1) for d in drawings})

    # Consecutive (smaller, larger) neighbours in ascending order.
    top_pairs = zip(top_edges, top_edges[1:])
    top_cut = next(
        ((lo + hi) / 2 for lo, hi in top_pairs if hi - lo > min_top_gap),
        None,
    )

    # Same neighbours for the bottom edges, but examined from the far end.
    bottom_pairs = list(zip(bottom_edges, bottom_edges[1:]))
    bot_cut = next(
        ((lo + hi) / 2 for lo, hi in reversed(bottom_pairs) if hi - lo > min_bot_gap),
        None,
    )

    return top_cut, bot_cut


def crop_one(args):
    """Crop the first page of one PDF and write the result to a new file.

    The cut positions come from ``detect_cuts`` applied to the page's vector
    drawings.  The cropped region is placed on a fresh page with
    ``show_pdf_page``.

    The output is first written next to the destination under a ``.part``
    name and only renamed to ``dst_path`` once the save has finished, so a
    failed or interrupted save never leaves a truncated file at ``dst_path``
    (``main`` treats an existing destination as already processed).

    Args:
        args: ``(src_path, dst_path)`` pair of ``pathlib.Path`` objects,
            packed into one tuple so the function can be submitted to an
            executor with a single argument.

    Returns:
        ``(str(src_path), status, detail)`` where ``status`` is

        * ``"ok"`` - file written, ``detail`` is empty;
        * ``"anomalie"`` - the page has no drawings or no cut could be
          detected, nothing is written;
        * ``"chyba"`` - an exception occurred, ``detail`` is its message.
    """
    src_path, dst_path = args
    tmp_path = None
    try:
        # Context managers close both documents on every exit path,
        # including the early returns and any exception below.
        with fitz.open(str(src_path)) as doc_src:
            page = doc_src[0]
            paths = page.get_drawings()

            if not paths:
                return str(src_path), "anomalie", "žádné kresby (get_drawings prázdný)"

            top_cut, bot_cut = detect_cuts(paths)

            if top_cut is None or bot_cut is None:
                return str(src_path), "anomalie", f"gap detekce selhala (top={top_cut}, bot={bot_cut})"

            # Keep the full page width; only the vertical extent is trimmed.
            page_w = page.mediabox.width
            clip = fitz.Rect(0, top_cut, page_w, bot_cut)

            with fitz.open() as doc_new:
                p = doc_new.new_page(width=clip.width, height=clip.height)
                p.show_pdf_page(fitz.Rect(0, 0, clip.width, clip.height), doc_src, 0, clip=clip)

                dst_path.parent.mkdir(parents=True, exist_ok=True)
                tmp_path = dst_path.with_name(dst_path.name + ".part")
                doc_new.save(str(tmp_path))

        # Both documents are closed here; publish the finished file.
        tmp_path.replace(dst_path)
        tmp_path = None
        return str(src_path), "ok", ""

    except Exception as e:
        return str(src_path), "chyba", str(e)

    finally:
        # Still set only when something failed after the temp name was
        # chosen; remove the leftover (best effort, it may not exist).
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                pass


def main():
    """Command-line entry point: crop every PDF found under the input directory.

    Steps:
        1. Parse ``vstup`` (input dir), ``vystup`` (output dir) and
           ``--workers``.
        2. Recursively collect ``*.pdf`` files below the input directory and
           mirror their relative paths into the output directory.
        3. Skip every file whose output already exists.
        4. Run ``crop_one`` for the remaining files in a process pool,
           showing a progress bar.
        5. Write all non-``"ok"`` results to ``errors.csv`` in the output
           directory.

    Exits with status 1 when the input directory does not exist, with
    status 0 when it contains no PDF files, and with argparse's usage error
    when ``--workers`` is smaller than 1.
    """
    parser = argparse.ArgumentParser(description="Batch crop Killer Sudoku PDF")
    parser.add_argument("vstup", help="Vstupní adresář s PDF soubory")
    parser.add_argument("vystup", help="Výstupní adresář pro oříznuté PDF")
    parser.add_argument("--workers", type=int, default=4, help="Počet procesů (default: 4)")
    args = parser.parse_args()

    # Reject a non-positive pool size here with a usage message instead of
    # letting the executor raise ValueError after the directory scan.
    if args.workers < 1:
        parser.error("--workers musí být alespoň 1")

    src_dir = Path(args.vstup)
    dst_dir = Path(args.vystup)

    if not src_dir.is_dir():
        print(f"Chyba: vstupní adresář neexistuje: {src_dir}", file=sys.stderr)
        sys.exit(1)

    dst_dir.mkdir(parents=True, exist_ok=True)

    all_pdfs = sorted(src_dir.rglob("*.pdf"))
    if not all_pdfs:
        print("Žádné PDF soubory nenalezeny.")
        sys.exit(0)

    # Skip files that already have an output from a previous run.
    tasks = []
    skipped = 0
    for src in all_pdfs:
        rel = src.relative_to(src_dir)
        dst = dst_dir / rel
        if dst.exists():
            skipped += 1
        else:
            tasks.append((src, dst))

    print(f"Celkem PDF: {len(all_pdfs)}, přeskočeno (existují): {skipped}, ke zpracování: {len(tasks)}")

    if not tasks:
        print("Vše již zpracováno.")
        return

    errors_csv = dst_dir / "errors.csv"
    errors = []

    with ProcessPoolExecutor(max_workers=args.workers) as executor:
        futures = {executor.submit(crop_one, t): t for t in tasks}
        with tqdm(total=len(tasks), unit="soubor") as bar:
            for future in as_completed(futures):
                try:
                    src_path, status, detail = future.result()
                except Exception as exc:
                    # crop_one reports its own exceptions as a result tuple;
                    # reaching this branch means the executor itself failed
                    # (for example a worker process died).  Record it against
                    # the submitted task so the run still ends with a report.
                    task_src, _task_dst = futures[future]
                    src_path = str(task_src)
                    status = "chyba"
                    detail = f"{type(exc).__name__}: {exc}"
                if status != "ok":
                    errors.append({"soubor": src_path, "typ": status, "detail": detail})
                bar.update(1)
                bar.set_postfix(chyby=len(errors))

    if errors:
        with open(errors_csv, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=["soubor", "typ", "detail"])
            writer.writeheader()
            writer.writerows(errors)
        print(f"\nChyby/anomálie: {len(errors)} — viz {errors_csv}")
    else:
        print("\nVšechny soubory zpracovány bez chyb.")


if __name__ == "__main__":
    main()