"""
Batch-crop puzzle PDFs stored in MySQL.

For every row in sudoku_killer where file_puzzle_cropped IS NULL:
  - load file_puzzle + crop_method
  - crop according to the method
  - store the result back into file_puzzle_cropped
"""

# ---------------------------------------------------------------------------
# Settings — edit here before running (e.g. from PyCharm)
# ---------------------------------------------------------------------------
WORKERS = 4        # number of parallel worker processes
LIMIT = None       # None = everything; a number (e.g. 20) = only the first N puzzles (for testing)
BATCH = 200        # how many cropped PDFs to write to the DB in one save
DRY_RUN = False    # True = crop only, nothing is written to the DB
LOG_EVERY = 500    # print a status line to the console every N processed puzzles
# ---------------------------------------------------------------------------
import sys
import json
import csv
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed

import fitz
from tqdm import tqdm

# The project-local helper lives in the "Knihovny" directory two levels above
# this script's directory; it must be on sys.path before the import below.
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Knihovny"))
from mysql_db import connect_mysql

# Force UTF-8 console output. The streams may have been replaced by objects
# without ``reconfigure`` (e.g. by a test runner or an IDE console), so only
# reconfigure when the method exists instead of crashing at import time.
for _stream in (sys.stdout, sys.stderr):
    if hasattr(_stream, "reconfigure"):
        _stream.reconfigure(encoding="utf-8")

# Per-puzzle failures are written to this CSV, next to the script.
ERRORS_CSV = Path(__file__).parent / "crop_errors.csv"
# ---------------------------------------------------------------------------
# Crop methods — add new functions here for new methods
# ---------------------------------------------------------------------------

def crop_raycast_auto(pdf_bytes: bytes, params: dict) -> bytes:
    """Crop the first page of a PDF to the drawings crossed by its horizontal mid-line.

    Every vector drawing on page 0 whose bounding rectangle spans the vertical
    midpoint of the page is collected. The crop rectangle is the union of those
    rectangles, widened on the left/right by half the stroke width of the
    outermost drawing and grown by ``crop_margin_pt`` on all four sides.

    Args:
        pdf_bytes: The source PDF document.
        params: Method parameters; only ``crop_margin_pt`` (default 2) is read.
            ``None`` is treated as an empty dict.

    Returns:
        A new single-page PDF containing only the clipped region.

    Raises:
        ValueError: If no drawing crosses the mid-line.
    """
    crop_margin = (params or {}).get("crop_margin_pt", 2)

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        page = doc[0]
        # NOTE(review): the mid-line is derived from the mediabox height;
        # confirm this matches the coordinate space of get_drawings() for
        # pages that carry a CropBox or a rotation.
        y_mid = page.mediabox.height / 2

        # (bounding rect, stroke width) of every drawing hit by the mid-line.
        # A missing or None width counts as 0.
        hit_h = [
            (path["rect"], path.get("width") or 0)
            for path in page.get_drawings()
            if path["rect"].y0 <= y_mid <= path["rect"].y1
        ]
        if not hit_h:
            raise ValueError("ray-cast: zadne kresby na y_mid")

        rects = [rect for rect, _ in hit_h]
        x_left = min(rect.x0 for rect in rects)
        x_right = max(rect.x1 for rect in rects)
        top_cut = min(rect.y0 for rect in rects)
        bot_cut = max(rect.y1 for rect in rects)

        # Stroke width of the first drawing that defines each outer edge, so
        # that half of the line thickness is kept inside the crop.
        lw_l = next((lw for rect, lw in hit_h if rect.x0 == x_left), 0)
        lw_r = next((lw for rect, lw in hit_h if rect.x1 == x_right), 0)

        clip = fitz.Rect(
            x_left - lw_l / 2 - crop_margin,
            top_cut - crop_margin,
            x_right + lw_r / 2 + crop_margin,
            bot_cut + crop_margin,
        )

        doc_new = fitz.open()
        try:
            new_page = doc_new.new_page(width=clip.width, height=clip.height)
            new_page.show_pdf_page(
                fitz.Rect(0, 0, clip.width, clip.height), doc, 0, clip=clip
            )
            return doc_new.tobytes()
        finally:
            doc_new.close()
    finally:
        # Close the source document on every path, including the ValueError
        # above and any failure inside PyMuPDF.
        doc.close()
# Maps the crop method name (puzzle_crop_method.name in the DB query) to the
# function implementing it. Each function takes (pdf_bytes, params) and
# returns the cropped PDF as bytes.
CROP_METHODS = {
    "raycast_auto": crop_raycast_auto,
}
# ---------------------------------------------------------------------------
# Worker — runs in a separate process
# ---------------------------------------------------------------------------

def process_one(args):
    """Crop a single puzzle PDF; intended to be run in a worker process.

    Args:
        args: Tuple ``(puzzle_id, puzzle_number, pdf_bytes, method_name,
            params_json)``. ``params_json`` may be JSON text (``str`` or
            bytes-like), an already decoded object, or ``None`` (treated as
            no parameters).

    Returns:
        ``(puzzle_id, puzzle_number, cropped_bytes, None)`` on success, or
        ``(puzzle_id, puzzle_number, None, error_message)`` on failure. Any
        ``Exception`` raised while decoding or cropping is converted into the
        error tuple; the message is never empty.
    """
    puzzle_id, puzzle_number, pdf_bytes, method_name, params_json = args
    try:
        if isinstance(params_json, (str, bytes, bytearray)):
            params = json.loads(params_json)
        else:
            params = params_json
        if params is None:
            params = {}

        fn = CROP_METHODS.get(method_name)
        if fn is None:
            return puzzle_id, puzzle_number, None, f"neznama metoda: {method_name}"

        cropped = fn(bytes(pdf_bytes), params)
        return puzzle_id, puzzle_number, cropped, None
    except Exception as e:
        # Prefix the exception type: str(e) alone can be empty, and an empty
        # message would be indistinguishable from success for a caller that
        # tests the error slot for truthiness.
        return puzzle_id, puzzle_number, None, f"{type(e).__name__}: {e}"
# ---------------------------------------------------------------------------
# Main logic
# ---------------------------------------------------------------------------

def fetch_todo(limit):
    """Load all puzzles that still have no cropped PDF.

    Args:
        limit: Maximum number of rows to return. Any falsy value (``None``,
            ``0``) means "no limit".

    Returns:
        The rows returned by the cursor (dict rows, via ``DictCursor``) with
        the keys ``id``, ``puzzle_number``, ``file_puzzle``, ``method_name``
        and ``params_json``, ordered by ``puzzle_number``.
    """
    import pymysql.cursors

    sql = """
        SELECT sk.id, sk.puzzle_number, sk.file_puzzle,
               cm.name AS method_name, cm.params_json
        FROM sudoku_killer sk
        JOIN puzzle_crop_method cm ON sk.crop_method_id = cm.id
        WHERE sk.file_puzzle_cropped IS NULL
        ORDER BY sk.puzzle_number
    """
    if limit:
        # int() guarantees that only a number is interpolated into the SQL.
        sql += f" LIMIT {int(limit)}"

    conn = connect_mysql(database="puzzle", cursorclass=pymysql.cursors.DictCursor)
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql)
            return cur.fetchall()
        finally:
            cur.close()
    finally:
        # Release the connection even when the query fails.
        conn.close()
def save_cropped(updates: list[tuple]):
    """Write a batch of cropped PDFs back to ``sudoku_killer``.

    Args:
        updates: ``[(cropped_bytes, puzzle_id), ...]`` — one tuple per row,
            in the parameter order of the UPDATE statement. An empty list is
            a no-op and does not open a connection.
    """
    if not updates:
        return

    import pymysql.cursors

    conn = connect_mysql(database="puzzle", cursorclass=pymysql.cursors.DictCursor)
    try:
        cur = conn.cursor()
        try:
            cur.executemany(
                "UPDATE sudoku_killer SET file_puzzle_cropped = %s WHERE id = %s",
                updates,
            )
        finally:
            cur.close()
        # PyMySQL connections do not autocommit by default, so without an
        # explicit commit the updates are discarded on close. Whether
        # connect_mysql enables autocommit is not visible here; the commit is
        # harmless if it does.
        conn.commit()
    finally:
        conn.close()
def main():
    """Crop all pending puzzles in parallel and store the results.

    Steps:
      1. Fetch the rows to process (respecting ``LIMIT``).
      2. Crop them in a pool of ``WORKERS`` processes.
      3. Unless ``DRY_RUN`` is set, write the results to the DB in batches of
         ``BATCH`` rows.
      4. Write all failures to ``ERRORS_CSV`` and print a summary.
    """
    print("Nacitam seznam puzzle k orizeni...")
    rows = fetch_todo(LIMIT)
    total = len(rows)
    if total == 0:
        print("Vsechny puzzle jsou jiz orizeny.")
        return
    print(f"Ke zpracovani: {total} puzzle | workers: {WORKERS} | batch: {BATCH} | dry-run: {DRY_RUN}")

    errors = []
    pending_saves = []  # [(cropped_bytes, puzzle_id)]
    done = 0
    saved = 0  # rows actually written to the DB (stays 0 in dry-run mode)

    tasks = [
        (r["id"], r["puzzle_number"], r["file_puzzle"], r["method_name"], r["params_json"])
        for r in rows
    ]

    with ProcessPoolExecutor(max_workers=WORKERS) as executor:
        futures = {executor.submit(process_one, t): t for t in tasks}
        with tqdm(total=total, unit="puzzle") as bar:
            for future in as_completed(futures):
                # Drop the mapping entry so the task's PDF bytes are not kept
                # alive by this dict once the task has been handled.
                task = futures.pop(future)
                try:
                    puzzle_id, puzzle_number, cropped, err = future.result()
                except Exception as exc:
                    # The worker itself failed (e.g. the process died). Record
                    # it against the puzzle instead of aborting the whole run.
                    puzzle_id, puzzle_number = task[0], task[1]
                    cropped, err = None, f"worker selhal: {exc!r}"

                # A missing result is a failure even if the error text is
                # empty; never queue None for saving.
                if err or cropped is None:
                    err = err or "neznama chyba (prazdny vysledek)"
                    errors.append({"puzzle_id": puzzle_id, "puzzle_number": puzzle_number, "chyba": err})
                    tqdm.write(f" [CHYBA] puzzle #{puzzle_number}: {err}")
                elif not DRY_RUN:
                    pending_saves.append((cropped, puzzle_id))
                    if len(pending_saves) >= BATCH:
                        save_cropped(pending_saves)
                        saved += len(pending_saves)
                        pending_saves.clear()

                done += 1
                bar.update(1)
                bar.set_postfix(chyby=len(errors), ulozeno=saved)

                # LOG_EVERY = 0 (or None) disables the periodic status line.
                if LOG_EVERY and done % LOG_EVERY == 0:
                    remaining = total - done
                    pct = done / total * 100
                    tqdm.write(f" >> {done}/{total} ({pct:.1f}%) | puzzle #{puzzle_number} | zbyvá: {remaining} | chyby: {len(errors)}")

    # Save the last, incomplete batch.
    if pending_saves and not DRY_RUN:
        save_cropped(pending_saves)
        saved += len(pending_saves)

    if errors:
        with open(ERRORS_CSV, "w", newline="", encoding="utf-8") as f:
            w = csv.DictWriter(f, fieldnames=["puzzle_id", "puzzle_number", "chyba"])
            w.writeheader()
            w.writerows(errors)
        print(f"\nChyby: {len(errors)} — viz {ERRORS_CSV}")
    else:
        print("\nVse bez chyb.")

    ok = done - len(errors)
    print(f"Hotovo: {ok} orizeno, {len(errors)} chyb, {total - done} preskoceno.")
# The guard is required with ProcessPoolExecutor: worker processes may import
# this module, and must not start the batch again when they do.
if __name__ == "__main__":
    main()