138 lines
4.2 KiB
Python
138 lines
4.2 KiB
Python
"""
|
|
Batch-crop Killer Sudoku PDF files: remove the title at the top and the
copyright notice at the bottom.

Vector content is preserved (the PDFs are cairo-generated).

Usage:
    python 20_CropPuzzles.py <input_dir> <output_dir> [--workers N]
|
|
"""
|
|
|
|
import argparse
|
|
import csv
|
|
import sys
|
|
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
from pathlib import Path
|
|
|
|
import fitz # PyMuPDF
|
|
from tqdm import tqdm
|
|
|
|
|
|
def detect_cuts(paths, *, top_gap=10, bottom_gap=5):
    """Find the vertical cut positions that separate header and footer.

    The distinct (rounded) top edges of all drawings are scanned from the
    top of the page; the first gap wider than ``top_gap`` yields the top
    cut.  The distinct (rounded) bottom edges are scanned from the bottom
    of the page; the first gap wider than ``bottom_gap`` yields the bottom
    cut.  Each cut is placed at the midpoint of its gap.

    Args:
        paths: Iterable of mappings whose ``"rect"`` entry exposes ``y0``
            and ``y1`` attributes (the caller in this file passes the
            result of ``page.get_drawings()``).
        top_gap: Minimum gap between consecutive top edges that counts as
            the header separation.
        bottom_gap: Minimum gap between consecutive bottom edges that
            counts as the footer separation.

    Returns:
        ``(top_cut, bot_cut)``.  Either element is ``None`` when no
        sufficiently large gap exists (including empty or single-element
        input).  The function does not guarantee ``top_cut < bot_cut``;
        callers should validate that themselves.
    """
    paths = list(paths)  # allow one-shot iterables; we traverse twice
    tops = sorted({round(p["rect"].y0) for p in paths})
    bottoms = sorted({round(p["rect"].y1) for p in paths})

    # First wide gap when walking downwards from the top of the page.
    top_cut = next(
        ((lo + hi) / 2 for lo, hi in zip(tops, tops[1:]) if hi - lo > top_gap),
        None,
    )

    # First wide gap when walking upwards from the bottom of the page.
    bottom_pairs = reversed(list(zip(bottoms, bottoms[1:])))
    bot_cut = next(
        ((lo + hi) / 2 for lo, hi in bottom_pairs if hi - lo > bottom_gap),
        None,
    )

    return top_cut, bot_cut
|
|
|
|
|
|
def crop_one(args):
    """Crop a single PDF and write the result to its destination path.

    Args:
        args: ``(src_path, dst_path)`` tuple.  ``dst_path`` must be a
            ``pathlib.Path`` (its ``parent`` is created on demand);
            ``src_path`` only needs to be convertible with ``str()``.

    Returns:
        ``(src_path_str, status, detail)`` where ``status`` is one of

        * ``"ok"``       - the cropped file was saved, ``detail`` is empty;
        * ``"anomalie"`` - the page has no drawings, the gap detection
          found no cut, or the detected cuts do not form a valid window;
        * ``"chyba"``    - an exception occurred, ``detail`` describes it.

    The function never raises for ordinary errors; both documents are
    closed on every exit path.
    """
    src_path, dst_path = args
    doc_src = None
    doc_new = None
    try:
        doc_src = fitz.open(str(src_path))
        page = doc_src[0]
        paths = page.get_drawings()

        if not paths:
            return str(src_path), "anomalie", "žádné kresby (get_drawings prázdný)"

        top_cut, bot_cut = detect_cuts(paths)

        if top_cut is None or bot_cut is None:
            return str(src_path), "anomalie", f"gap detekce selhala (top={top_cut}, bot={bot_cut})"

        # An inverted or empty window cannot be turned into a page; report
        # it as an anomaly instead of letting page creation fail obscurely.
        if top_cut >= bot_cut:
            return str(src_path), "anomalie", f"neplatný ořez (top={top_cut}, bot={bot_cut})"

        # Keep the full page width; only the vertical extent is trimmed.
        page_w = page.mediabox.width
        clip = fitz.Rect(0, top_cut, page_w, bot_cut)

        doc_new = fitz.open()
        new_page = doc_new.new_page(width=clip.width, height=clip.height)
        new_page.show_pdf_page(
            fitz.Rect(0, 0, clip.width, clip.height), doc_src, 0, clip=clip
        )

        dst_path.parent.mkdir(parents=True, exist_ok=True)
        doc_new.save(str(dst_path))
        return str(src_path), "ok", ""

    except Exception as e:
        # Include the exception type: str(e) alone can be empty.
        return str(src_path), "chyba", f"{type(e).__name__}: {e}"

    finally:
        # Release both documents on every path (success, anomaly, error).
        for doc in (doc_src, doc_new):
            if doc is None:
                continue
            try:
                doc.close()
            except Exception:
                # Best-effort cleanup: a failing close must not replace
                # the status tuple that is already being returned.
                pass
|
|
|
|
|
|
def main():
    """Command-line entry point: crop every PDF found under the input dir.

    Steps:
        1. Parse ``vstup`` (input dir), ``vystup`` (output dir) and
           ``--workers``.
        2. Collect all ``*.pdf`` files recursively; files whose output
           already exists are skipped, so an interrupted run can resume.
        3. Process the remaining files in a process pool via ``crop_one``,
           showing a progress bar.
        4. Write every non-``"ok"`` result to ``errors.csv`` in the output
           directory.

    Exits with status 1 when the input directory does not exist, status 0
    when it contains no PDFs, and status 2 (argparse) on invalid arguments.
    """
    parser = argparse.ArgumentParser(description="Batch crop Killer Sudoku PDF")
    parser.add_argument("vstup", help="Vstupní adresář s PDF soubory")
    parser.add_argument("vystup", help="Výstupní adresář pro oříznuté PDF")
    parser.add_argument("--workers", type=int, default=4, help="Počet procesů (default: 4)")
    args = parser.parse_args()

    # Reject a non-positive pool size up front with a usage error instead
    # of a ValueError from the executor after the directory scan.
    if args.workers < 1:
        parser.error("--workers musí být alespoň 1")

    src_dir = Path(args.vstup)
    dst_dir = Path(args.vystup)

    if not src_dir.is_dir():
        print(f"Chyba: vstupní adresář neexistuje: {src_dir}", file=sys.stderr)
        sys.exit(1)

    dst_dir.mkdir(parents=True, exist_ok=True)

    all_pdfs = sorted(src_dir.rglob("*.pdf"))
    if not all_pdfs:
        print("Žádné PDF soubory nenalezeny.")
        sys.exit(0)

    # Mirror the input tree under dst_dir and skip outputs that already exist.
    candidates = [(src, dst_dir / src.relative_to(src_dir)) for src in all_pdfs]
    tasks = [(src, dst) for src, dst in candidates if not dst.exists()]
    skipped = len(candidates) - len(tasks)

    print(f"Celkem PDF: {len(all_pdfs)}, přeskočeno (existují): {skipped}, ke zpracování: {len(tasks)}")

    if not tasks:
        print("Vše již zpracováno.")
        return

    errors_csv = dst_dir / "errors.csv"
    errors = []

    with ProcessPoolExecutor(max_workers=args.workers) as executor:
        futures = {executor.submit(crop_one, t): t for t in tasks}
        with tqdm(total=len(tasks), unit="soubor") as bar:
            for future in as_completed(futures):
                try:
                    src_path, status, detail = future.result()
                except Exception as exc:
                    # The worker itself failed (e.g. the process died).
                    # Record it against the task's source file so the run
                    # continues and the report is still written.
                    src_path = str(futures[future][0])
                    status = "chyba"
                    detail = f"worker selhal: {exc!r}"
                if status != "ok":
                    errors.append({"soubor": src_path, "typ": status, "detail": detail})
                bar.update(1)
                bar.set_postfix(chyby=len(errors))

    if errors:
        # Completion order is nondeterministic; sort for a stable report.
        errors.sort(key=lambda row: row["soubor"])
        with open(errors_csv, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=["soubor", "typ", "detail"])
            writer.writeheader()
            writer.writerows(errors)
        print(f"\nChyby/anomálie: {len(errors)} — viz {errors_csv}")
    else:
        print("\nVšechny soubory zpracovány bez chyb.")
|
|
|
|
|
|
# Run the CLI only when executed as a script, not when imported
# (worker processes may re-import this module).
if __name__ == "__main__":
    main()
|