Files
ordinaceprojekt/SběrDatRůzné/SudokuKiller/30_BatchCrop.py
T
2026-05-06 13:24:43 +02:00

200 lines
6.8 KiB
Python

"""
Batch-crop puzzle PDFs stored in MySQL.

For every row in sudoku_killer where file_puzzle_cropped IS NULL:
- load file_puzzle + crop_method
- crop according to the method
- store the result back into file_puzzle_cropped
"""
# ---------------------------------------------------------------------------
# Settings — edit here before running in PyCharm
# ---------------------------------------------------------------------------
WORKERS = 4  # number of parallel worker processes
LIMIT = None  # None = everything; a number (e.g. 20) = only the first N puzzles (for testing)
BATCH = 200  # how many cropped PDFs are written to the DB at once
DRY_RUN = False  # True = crop only, nothing is saved to the DB
LOG_EVERY = 500  # print a status line to the console every N processed puzzles
# ---------------------------------------------------------------------------
import sys
import json
import csv
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
import fitz
from tqdm import tqdm
# Make the "Knihovny" directory (two levels above this script's directory)
# importable; mysql_db is presumably located there — verify against the repo layout.
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Knihovny"))
from mysql_db import connect_mysql
# Switch both console streams to UTF-8 encoding.
sys.stdout.reconfigure(encoding="utf-8")
sys.stderr.reconfigure(encoding="utf-8")
# CSV report of failed puzzles, written next to this script.
ERRORS_CSV = Path(__file__).parent / "crop_errors.csv"
# ---------------------------------------------------------------------------
# Crop methods — add new functions here to support new methods
# ---------------------------------------------------------------------------
def crop_raycast_auto(pdf_bytes: bytes, params: dict) -> bytes:
    """Crop page 0 of a PDF to the drawings crossed by its horizontal midline.

    A horizontal "ray" is cast at half the media-box height. Every vector
    drawing whose bounding rectangle spans that y-coordinate counts as a hit.
    The crop box is the union of the hit rectangles, widened on the left and
    right by half the stroke width of the outermost drawing and padded on all
    four sides by the margin.

    Args:
        pdf_bytes: Raw bytes of the source PDF; only the first page is used.
        params: Method parameters. ``crop_margin_pt`` (default 2) is the
            padding added around the detected area. ``None`` is treated as
            an empty dict so the default applies.

    Returns:
        Bytes of a new single-page PDF showing only the clipped region.

    Raises:
        ValueError: If no drawing crosses the horizontal midline.
    """
    crop_margin = (params or {}).get("crop_margin_pt", 2)

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        page = doc[0]
        y_mid = page.mediabox.height / 2

        # (bounding rect, stroke width) of every drawing the midline passes through
        hits = [
            (d["rect"], d.get("width") or 0)
            for d in page.get_drawings()
            if d["rect"].y0 <= y_mid <= d["rect"].y1
        ]
        if not hits:
            raise ValueError("ray-cast: zadne kresby na y_mid")

        rects = [r for r, _ in hits]
        x_left = min(r.x0 for r in rects)
        x_right = max(r.x1 for r in rects)
        top_cut = min(r.y0 for r in rects)
        bot_cut = max(r.y1 for r in rects)

        # Stroke width of the first drawing that defines each horizontal extreme;
        # half of it is added so the outer line is not clipped.
        lw_l = next((lw for r, lw in hits if r.x0 == x_left), 0)
        lw_r = next((lw for r, lw in hits if r.x1 == x_right), 0)

        clip = fitz.Rect(
            x_left - lw_l / 2 - crop_margin,
            top_cut - crop_margin,
            x_right + lw_r / 2 + crop_margin,
            bot_cut + crop_margin,
        )

        doc_new = fitz.open()
        try:
            new_page = doc_new.new_page(width=clip.width, height=clip.height)
            new_page.show_pdf_page(
                fitz.Rect(0, 0, clip.width, clip.height), doc, 0, clip=clip
            )
            return doc_new.tobytes()
        finally:
            doc_new.close()
    finally:
        # Always release the source document, including on the ValueError path
        doc.close()
# Registry of available crop methods: method name -> function taking
# (pdf_bytes, params) and returning the cropped PDF bytes.
CROP_METHODS = {"raycast_auto": crop_raycast_auto}
# ---------------------------------------------------------------------------
# Worker — runs in a separate process
# ---------------------------------------------------------------------------
def process_one(args):
    """Crop a single puzzle PDF and report the outcome as a tuple.

    Args:
        args: Tuple ``(puzzle_id, puzzle_number, pdf_bytes, method_name,
            params_json)``. ``params_json`` may be JSON text (``str`` or
            bytes), an already-decoded object, or ``None``; a missing or
            JSON ``null`` value is treated as an empty parameter dict.

    Returns:
        Tuple ``(puzzle_id, puzzle_number, cropped_bytes, error)``. On
        success ``error`` is ``None``. On failure ``cropped_bytes`` is
        ``None`` and ``error`` is a non-empty message. Exceptions raised
        while decoding parameters or cropping are reported this way
        instead of being propagated.
    """
    puzzle_id, puzzle_number, pdf_bytes, method_name, params_json = args
    try:
        if isinstance(params_json, (str, bytes, bytearray)):
            params = json.loads(params_json)
        else:
            params = params_json
        if params is None:
            params = {}

        fn = CROP_METHODS.get(method_name)
        if fn is None:
            return puzzle_id, puzzle_number, None, f"neznama metoda: {method_name}"

        cropped = fn(bytes(pdf_bytes), params)
        return puzzle_id, puzzle_number, cropped, None
    except Exception as e:
        # Some exceptions stringify to "". Fall back to repr() so a failure
        # can never be mistaken for success by a truthiness check on the message.
        return puzzle_id, puzzle_number, None, str(e) or repr(e)
# ---------------------------------------------------------------------------
# Main logic
# ---------------------------------------------------------------------------
def fetch_todo(limit):
    """Load all puzzles that have not been cropped yet.

    Selects rows of ``sudoku_killer`` whose ``file_puzzle_cropped`` is NULL,
    joined with their crop method, ordered by ``puzzle_number``.

    Args:
        limit: Maximum number of rows to return. Any falsy value
            (``None``, ``0``) means no limit.

    Returns:
        The rows returned by a ``DictCursor``: one dict per puzzle with the
        keys ``id``, ``puzzle_number``, ``file_puzzle``, ``method_name`` and
        ``params_json``.
    """
    import pymysql.cursors

    sql = """
        SELECT sk.id, sk.puzzle_number, sk.file_puzzle,
               cm.name AS method_name, cm.params_json
        FROM sudoku_killer sk
        JOIN puzzle_crop_method cm ON sk.crop_method_id = cm.id
        WHERE sk.file_puzzle_cropped IS NULL
        ORDER BY sk.puzzle_number
    """
    if limit:
        # int() guarantees that only a number is ever interpolated into the SQL
        sql += f" LIMIT {int(limit)}"

    conn = connect_mysql(database="puzzle", cursorclass=pymysql.cursors.DictCursor)
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql)
            return cur.fetchall()
        finally:
            cur.close()
    finally:
        # Release the connection even when the query fails
        conn.close()
def save_cropped(updates: list[tuple]):
    """Write a batch of cropped PDFs into ``sudoku_killer.file_puzzle_cropped``.

    Args:
        updates: ``[(cropped_bytes, puzzle_id), ...]`` — one tuple per row,
            in the parameter order of the UPDATE statement. An empty list
            is a no-op and opens no connection.
    """
    if not updates:
        return

    import pymysql.cursors

    conn = connect_mysql(database="puzzle", cursorclass=pymysql.cursors.DictCursor)
    try:
        cur = conn.cursor()
        try:
            cur.executemany(
                "UPDATE sudoku_killer SET file_puzzle_cropped = %s WHERE id = %s",
                updates,
            )
        finally:
            cur.close()
        # Explicit commit: redundant if connect_mysql enables autocommit,
        # required otherwise (DB-API connections do not autocommit by default).
        conn.commit()
    finally:
        conn.close()
def main():
    """Crop every pending puzzle in parallel and store the results.

    Steps:
      1. Load the pending rows with ``fetch_todo(LIMIT)``.
      2. Run ``process_one`` for each row in a ``ProcessPoolExecutor`` with
         ``WORKERS`` processes.
      3. Collect successful crops and write them to the DB in batches of
         ``BATCH`` (skipped entirely when ``DRY_RUN`` is true).
      4. Write all failures to ``ERRORS_CSV`` and print a summary.
    """
    print("Nacitam seznam puzzle k orizeni...")
    rows = fetch_todo(LIMIT)
    total = len(rows)
    if total == 0:
        print("Vsechny puzzle jsou jiz orizeny.")
        return
    print(f"Ke zpracovani: {total} puzzle | workers: {WORKERS} | batch: {BATCH} | dry-run: {DRY_RUN}")

    errors = []
    pending_saves = []  # [(cropped_bytes, puzzle_id)]
    done = 0
    tasks = [
        (r["id"], r["puzzle_number"], r["file_puzzle"], r["method_name"], r["params_json"])
        for r in rows
    ]

    with ProcessPoolExecutor(max_workers=WORKERS) as executor:
        # The task tuple is not needed once submitted, so a plain list suffices
        futures = [executor.submit(process_one, t) for t in tasks]
        with tqdm(total=total, unit="puzzle") as bar:
            for future in as_completed(futures):
                puzzle_id, puzzle_number, cropped, err = future.result()
                # Compare against None: an empty error string is still an error
                # and must not be queued for saving.
                if err is not None:
                    errors.append({"puzzle_id": puzzle_id, "puzzle_number": puzzle_number, "chyba": err})
                    tqdm.write(f" [CHYBA] puzzle #{puzzle_number}: {err}")
                elif not DRY_RUN:
                    pending_saves.append((cropped, puzzle_id))
                    if len(pending_saves) >= BATCH:
                        save_cropped(pending_saves)
                        pending_saves.clear()

                done += 1
                bar.update(1)
                bar.set_postfix(chyby=len(errors), ulozeno=done - len(errors) - len(pending_saves))

                # LOG_EVERY = 0 (or None) disables the periodic line instead of
                # raising ZeroDivisionError.
                if LOG_EVERY and done % LOG_EVERY == 0:
                    remaining = total - done
                    pct = done / total * 100
                    tqdm.write(f" >> {done}/{total} ({pct:.1f}%) | puzzle #{puzzle_number} | zbyvá: {remaining} | chyby: {len(errors)}")

    # Flush the last, partially filled batch
    if pending_saves and not DRY_RUN:
        save_cropped(pending_saves)

    if errors:
        with open(ERRORS_CSV, "w", newline="", encoding="utf-8") as f:
            w = csv.DictWriter(f, fieldnames=["puzzle_id", "puzzle_number", "chyba"])
            w.writeheader()
            w.writerows(errors)
        print(f"\nChyby: {len(errors)} — viz {ERRORS_CSV}")
    else:
        print("\nVse bez chyb.")

    ok = done - len(errors)
    print(f"Hotovo: {ok} orizeno, {len(errors)} chyb, {total - done} preskoceno.")
# Run the batch only when executed as a script. Worker processes started by
# ProcessPoolExecutor may import this module, and the guard keeps them from
# re-running main().
if __name__ == "__main__":
    main()