""" Batch ořez puzzle z MySQL. Pro každý řádek v sudoku_killer kde file_puzzle_cropped IS NULL: - načte file_puzzle + crop_method - ořízne podle metody - uloží zpět do file_puzzle_cropped """ # --------------------------------------------------------------------------- # Nastavení — upravuj zde před spuštěním v PyCharm # --------------------------------------------------------------------------- WORKERS = 4 # počet paralelních procesů LIMIT = None # None = vše; číslo (např. 20) = jen prvních N puzzle (pro testování) BATCH = 200 # kolik oříznutých PDF uložit najednou do DB DRY_RUN = False # True = jen ořez, nic se neuloží do DB LOG_EVERY = 500 # vypiš stav do konzole každých N zpracovaných puzzle # --------------------------------------------------------------------------- import sys import json import csv from pathlib import Path from concurrent.futures import ProcessPoolExecutor, as_completed import fitz from tqdm import tqdm sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Knihovny")) from mysql_db import connect_mysql sys.stdout.reconfigure(encoding="utf-8") sys.stderr.reconfigure(encoding="utf-8") ERRORS_CSV = Path(__file__).parent / "crop_errors.csv" # --------------------------------------------------------------------------- # Crop metody — přidat sem nové funkce pro nové metody # --------------------------------------------------------------------------- def crop_raycast_auto(pdf_bytes: bytes, params: dict) -> bytes: crop_margin = params.get("crop_margin_pt", 2) doc = fitz.open(stream=pdf_bytes, filetype="pdf") page = doc[0] paths = page.get_drawings() y_mid = page.mediabox.height / 2 hit_h = [(p["rect"], p.get("width") or 0) for p in paths if p["rect"].y0 <= y_mid <= p["rect"].y1] if not hit_h: raise ValueError("ray-cast: zadne kresby na y_mid") rects = [r for r, _ in hit_h] x_left = min(r.x0 for r in rects) x_right = max(r.x1 for r in rects) top_cut = min(r.y0 for r in rects) bot_cut = max(r.y1 for r in rects) lw_l = next((lw for r, lw in hit_h if r.x0 == x_left), 0) lw_r = next((lw for r, lw in hit_h if r.x1 == x_right), 0) clip = fitz.Rect( x_left - lw_l / 2 - crop_margin, top_cut - crop_margin, x_right + lw_r / 2 + crop_margin, bot_cut + crop_margin, ) doc_new = fitz.open() p = doc_new.new_page(width=clip.width, height=clip.height) p.show_pdf_page(fitz.Rect(0, 0, clip.width, clip.height), doc, 0, clip=clip) out = doc_new.tobytes() doc.close() doc_new.close() return out CROP_METHODS = { "raycast_auto": crop_raycast_auto, } # --------------------------------------------------------------------------- # Worker — spouští se v samostatném procesu # --------------------------------------------------------------------------- def process_one(args): puzzle_id, puzzle_number, pdf_bytes, method_name, params_json = args try: params = json.loads(params_json) if isinstance(params_json, str) else params_json fn = CROP_METHODS.get(method_name) if fn is None: return puzzle_id, puzzle_number, None, f"neznama metoda: {method_name}" cropped = fn(bytes(pdf_bytes), params) return puzzle_id, puzzle_number, cropped, None except Exception as e: return puzzle_id, puzzle_number, None, str(e) # --------------------------------------------------------------------------- # Hlavní logika # --------------------------------------------------------------------------- def fetch_todo(limit): import pymysql.cursors conn = connect_mysql(database="puzzle", cursorclass=pymysql.cursors.DictCursor) cur = conn.cursor() sql = """ SELECT sk.id, sk.puzzle_number, sk.file_puzzle, cm.name AS method_name, cm.params_json FROM sudoku_killer sk JOIN puzzle_crop_method cm ON sk.crop_method_id = cm.id WHERE sk.file_puzzle_cropped IS NULL ORDER BY sk.puzzle_number """ if limit: sql += f" LIMIT {int(limit)}" cur.execute(sql) rows = cur.fetchall() cur.close() conn.close() return rows def save_cropped(updates: list[tuple]): """updates = [(cropped_bytes, puzzle_id), ...]""" import pymysql.cursors conn = connect_mysql(database="puzzle", cursorclass=pymysql.cursors.DictCursor) cur = conn.cursor() cur.executemany( "UPDATE sudoku_killer SET file_puzzle_cropped = %s WHERE id = %s", updates, ) cur.close() conn.close() def main(): print("Nacitam seznam puzzle k orizeni...") rows = fetch_todo(LIMIT) total = len(rows) if total == 0: print("Vsechny puzzle jsou jiz orizeny.") return print(f"Ke zpracovani: {total} puzzle | workers: {WORKERS} | batch: {BATCH} | dry-run: {DRY_RUN}") errors = [] pending_saves = [] # [(cropped_bytes, puzzle_id)] done = 0 tasks = [ (r["id"], r["puzzle_number"], r["file_puzzle"], r["method_name"], r["params_json"]) for r in rows ] with ProcessPoolExecutor(max_workers=WORKERS) as executor: futures = {executor.submit(process_one, t): t for t in tasks} with tqdm(total=total, unit="puzzle") as bar: for future in as_completed(futures): puzzle_id, puzzle_number, cropped, err = future.result() if err: errors.append({"puzzle_id": puzzle_id, "puzzle_number": puzzle_number, "chyba": err}) tqdm.write(f" [CHYBA] puzzle #{puzzle_number}: {err}") elif not DRY_RUN: pending_saves.append((cropped, puzzle_id)) if len(pending_saves) >= BATCH: save_cropped(pending_saves) pending_saves.clear() done += 1 bar.update(1) bar.set_postfix(chyby=len(errors), ulozeno=done - len(errors) - len(pending_saves)) if done % LOG_EVERY == 0: zbyvá = total - done pct = done / total * 100 tqdm.write(f" >> {done}/{total} ({pct:.1f}%) | puzzle #{puzzle_number} | zbyvá: {zbyvá} | chyby: {len(errors)}") # Uložit zbývající if pending_saves and not DRY_RUN: save_cropped(pending_saves) if errors: with open(ERRORS_CSV, "w", newline="", encoding="utf-8") as f: w = csv.DictWriter(f, fieldnames=["puzzle_id", "puzzle_number", "chyba"]) w.writeheader() w.writerows(errors) print(f"\nChyby: {len(errors)} — viz {ERRORS_CSV}") else: print("\nVse bez chyb.") ok = done - len(errors) print(f"Hotovo: {ok} orizeno, {len(errors)} chyb, {total - done} preskoceno.") if __name__ == "__main__": main()