z230
This commit is contained in:
@@ -0,0 +1,137 @@
|
||||
"""
|
||||
Batch crop Killer Sudoku PDF souborů — odstraní nadpis nahoře a copyright dole.
|
||||
Zachovává vektorový obsah (cairo-generované PDF).
|
||||
|
||||
Použití:
|
||||
python 20_CropPuzzles.py <vstup_dir> <vystup_dir> [--workers N]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import sys
|
||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
from pathlib import Path
|
||||
|
||||
import fitz # PyMuPDF
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def _first_gap_midpoint(edge_pairs, min_gap):
    """Return the midpoint of the first ``(low, high)`` pair wider than *min_gap*.

    Returns ``None`` when no pair in *edge_pairs* satisfies ``high - low > min_gap``.
    """
    return next(
        ((low + high) / 2 for low, high in edge_pairs if high - low > min_gap),
        None,
    )


def detect_cuts(paths, *, min_top_gap=10, min_bot_gap=5):
    """Find the y positions at which to cut above and below the main drawing.

    The top edges (``rect.y0``) and bottom edges (``rect.y1``) of all drawings
    are rounded to whole units and de-duplicated.  The top cut is the midpoint
    of the first gap, scanning downwards, between consecutive top edges that is
    wider than ``min_top_gap``.  The bottom cut is the midpoint of the first
    gap, scanning upwards, between consecutive bottom edges that is wider than
    ``min_bot_gap``.

    Args:
        paths: Re-iterable collection (it is traversed twice) of mappings that
            each have a ``"rect"`` entry exposing ``y0`` and ``y1``.
        min_top_gap: Gap between neighbouring top edges that must be exceeded
            for a top cut to be placed there.
        min_bot_gap: Gap between neighbouring bottom edges that must be
            exceeded for a bottom cut to be placed there.

    Returns:
        ``(top_cut, bot_cut)``.  Either element is ``None`` when no
        sufficiently wide gap exists (including empty or single-edge input).
    """
    top_edges = sorted({round(p["rect"].y0) for p in paths})
    bottom_edges = sorted({round(p["rect"].y1) for p in paths})

    top_cut = _first_gap_midpoint(zip(top_edges, top_edges[1:]), min_top_gap)

    # The bottom cut is searched from the lowest edge upwards, so the
    # neighbouring pairs are visited in reverse order.
    bottom_pairs = list(zip(bottom_edges, bottom_edges[1:]))
    bot_cut = _first_gap_midpoint(reversed(bottom_pairs), min_bot_gap)

    return top_cut, bot_cut
|
||||
|
||||
|
||||
def crop_one(args):
    """Crop the first page of one PDF to the detected band and save the result.

    Args:
        args: A ``(src_path, dst_path)`` pair, packed into a single tuple so
            the function can be submitted to an executor with one argument.
            ``dst_path`` must be a ``pathlib.Path``; its parent directory is
            created if missing.

    Returns:
        ``(src_path, status, detail)`` with ``src_path`` as a string and
        ``status`` one of:

        * ``"ok"`` - the cropped file was written; ``detail`` is empty.
        * ``"anomalie"`` - the page has no drawings, or no usable cut band
          was detected; nothing is written.
        * ``"chyba"`` - any exception was raised; ``detail`` is its message.

        The function never raises; failures are reported via the status.
    """
    src_path, dst_path = args
    try:
        # The context managers close both documents on every exit path,
        # including the early returns and any exception below.
        with fitz.open(str(src_path)) as doc_src:
            page = doc_src[0]
            paths = page.get_drawings()

            if not paths:
                return str(src_path), "anomalie", "žádné kresby (get_drawings prázdný)"

            top_cut, bot_cut = detect_cuts(paths)

            # An inverted or empty band would yield a page of non-positive
            # height, so it is reported like a failed detection.
            if top_cut is None or bot_cut is None or top_cut >= bot_cut:
                return str(src_path), "anomalie", f"gap detekce selhala (top={top_cut}, bot={bot_cut})"

            # Full page width, restricted vertically to the detected band.
            clip = fitz.Rect(0, top_cut, page.mediabox.width, bot_cut)

            with fitz.open() as doc_new:
                new_page = doc_new.new_page(width=clip.width, height=clip.height)
                # Place only the clipped region of the source page onto the
                # new, band-sized page.
                new_page.show_pdf_page(
                    fitz.Rect(0, 0, clip.width, clip.height), doc_src, 0, clip=clip
                )

                dst_path.parent.mkdir(parents=True, exist_ok=True)
                doc_new.save(str(dst_path))

        return str(src_path), "ok", ""

    except Exception as e:
        # Top-level boundary of a worker task: report instead of raising so
        # one bad file does not abort the whole batch.
        return str(src_path), "chyba", str(e)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: crop every PDF under the input directory.

    Recursively collects ``*.pdf`` files below the input directory, skips
    those whose counterpart already exists under the output directory (same
    relative path), and processes the rest in a process pool via
    ``crop_one``.  Every result whose status is not ``"ok"`` is collected and
    written to ``errors.csv`` in the output directory.

    Exits with status 1 when the input directory does not exist, and with
    status 0 when it contains no PDF files.
    """
    parser = argparse.ArgumentParser(description="Batch crop Killer Sudoku PDF")
    parser.add_argument("vstup", help="Vstupní adresář s PDF soubory")
    parser.add_argument("vystup", help="Výstupní adresář pro oříznuté PDF")
    parser.add_argument("--workers", type=int, default=4, help="Počet procesů (default: 4)")
    args = parser.parse_args()

    # ProcessPoolExecutor rejects a non-positive worker count with a bare
    # ValueError; fail early with a normal usage error instead.
    if args.workers < 1:
        parser.error("--workers musí být >= 1")

    src_dir = Path(args.vstup)
    dst_dir = Path(args.vystup)

    if not src_dir.is_dir():
        print(f"Chyba: vstupní adresář neexistuje: {src_dir}", file=sys.stderr)
        sys.exit(1)

    dst_dir.mkdir(parents=True, exist_ok=True)

    all_pdfs = sorted(src_dir.rglob("*.pdf"))
    if not all_pdfs:
        print("Žádné PDF soubory nenalezeny.")
        sys.exit(0)

    # Skip files that were already processed (output exists).
    tasks = []
    skipped = 0
    for src in all_pdfs:
        dst = dst_dir / src.relative_to(src_dir)
        if dst.exists():
            skipped += 1
        else:
            tasks.append((src, dst))

    print(f"Celkem PDF: {len(all_pdfs)}, přeskočeno (existují): {skipped}, ke zpracování: {len(tasks)}")

    if not tasks:
        print("Vše již zpracováno.")
        return

    errors_csv = dst_dir / "errors.csv"
    errors = []

    with ProcessPoolExecutor(max_workers=args.workers) as executor:
        futures = {executor.submit(crop_one, task): task for task in tasks}
        with tqdm(total=len(tasks), unit="soubor") as bar:
            for future in as_completed(futures):
                try:
                    src_path, status, detail = future.result()
                except Exception as e:
                    # crop_one reports its own failures, so an exception here
                    # comes from the pool itself (e.g. a worker process that
                    # died).  Record it against the task's source file and
                    # keep going so the error report is still written.
                    failed_src, _ = futures[future]
                    src_path = str(failed_src)
                    status = "chyba"
                    detail = f"{type(e).__name__}: {e}"

                if status != "ok":
                    errors.append({"soubor": src_path, "typ": status, "detail": detail})
                bar.update(1)
                bar.set_postfix(chyby=len(errors))

    if errors:
        with open(errors_csv, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=["soubor", "typ", "detail"])
            writer.writeheader()
            writer.writerows(errors)
        print(f"\nChyby/anomálie: {len(errors)} — viz {errors_csv}")
    else:
        print("\nVšechny soubory zpracovány bez chyb.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # The guard keeps worker processes that re-import this module (spawn
    # start method) from launching the batch again.
    main()
|
||||
Reference in New Issue
Block a user