z230
This commit is contained in:
@@ -0,0 +1,199 @@
|
||||
"""
|
||||
Batch ořez puzzle z MySQL.
|
||||
|
||||
Pro každý řádek v sudoku_killer kde file_puzzle_cropped IS NULL:
|
||||
- načte file_puzzle + crop_method
|
||||
- ořízne podle metody
|
||||
- uloží zpět do file_puzzle_cropped
|
||||
"""
|
||||
|
||||
# ---------------------------------------------------------------------------
# Settings — edit here before running in PyCharm
# ---------------------------------------------------------------------------
WORKERS = 4  # number of parallel worker processes
LIMIT = None  # None = everything; a number (e.g. 20) = only the first N puzzles (for testing)
BATCH = 200  # how many cropped PDFs to save to the DB in one go
DRY_RUN = False  # True = crop only, nothing is saved to the DB
LOG_EVERY = 500  # print a status line to the console every N processed puzzles
# ---------------------------------------------------------------------------
|
||||
|
||||
import sys
|
||||
import json
|
||||
import csv
|
||||
from pathlib import Path
|
||||
from concurrent.futures import ProcessPoolExecutor, as_completed
|
||||
|
||||
import fitz
|
||||
from tqdm import tqdm
|
||||
|
||||
# Put the "Knihovny" directory (two levels above this script's folder) first
# on the import path. The mysql_db import below presumably resolves from
# there, so this line must stay ahead of it.
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Knihovny"))
from mysql_db import connect_mysql

# Force UTF-8 on both console streams; the script prints non-ASCII text.
sys.stdout.reconfigure(encoding="utf-8")
sys.stderr.reconfigure(encoding="utf-8")

# CSV report of failed crops, written next to this script by main().
ERRORS_CSV = Path(__file__).parent / "crop_errors.csv"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Crop metody — přidat sem nové funkce pro nové metody
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def crop_raycast_auto(pdf_bytes: bytes, params: dict) -> bytes:
    """Crop page 0 of a PDF to the drawings crossed by its horizontal midline.

    A horizontal "ray" is cast at half the page's mediabox height. Every
    vector drawing whose bounding rectangle spans that y coordinate is
    collected, and the crop box is the union of those rectangles. The box
    is widened left/right by half the stroke width of the outermost
    drawing and on all four sides by a fixed margin.

    Args:
        pdf_bytes: Raw bytes of the source PDF; only the first page is used.
        params: Method parameters. ``crop_margin_pt`` (default 2) is the
            extra margin added on every side, in page units. ``None`` is
            accepted and treated as an empty dict.

    Returns:
        Bytes of a new single-page PDF showing only the clipped region.

    Raises:
        ValueError: If no drawing intersects the horizontal midline.
    """
    # A NULL params column reaches this function as None.
    crop_margin = (params or {}).get("crop_margin_pt", 2)

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        page = doc[0]
        y_mid = page.mediabox.height / 2

        # (bounding rect, stroke width) of every drawing the ray passes through;
        # a missing or None width counts as 0.
        hit_h = [
            (p["rect"], p.get("width") or 0)
            for p in page.get_drawings()
            if p["rect"].y0 <= y_mid <= p["rect"].y1
        ]
        if not hit_h:
            raise ValueError("ray-cast: zadne kresby na y_mid")

        rects = [r for r, _ in hit_h]
        x_left = min(r.x0 for r in rects)
        x_right = max(r.x1 for r in rects)
        top_cut = min(r.y0 for r in rects)
        bot_cut = max(r.y1 for r in rects)

        # Stroke width of the first drawing touching each outer vertical edge;
        # half of it extends beyond the rect, so it is added to the clip.
        lw_l = next((lw for r, lw in hit_h if r.x0 == x_left), 0)
        lw_r = next((lw for r, lw in hit_h if r.x1 == x_right), 0)

        clip = fitz.Rect(
            x_left - lw_l / 2 - crop_margin,
            top_cut - crop_margin,
            x_right + lw_r / 2 + crop_margin,
            bot_cut + crop_margin,
        )

        doc_new = fitz.open()
        try:
            new_page = doc_new.new_page(width=clip.width, height=clip.height)
            new_page.show_pdf_page(
                fitz.Rect(0, 0, clip.width, clip.height), doc, 0, clip=clip
            )
            return doc_new.tobytes()
        finally:
            doc_new.close()
    finally:
        # Release the source document on error paths too (empty PDF,
        # no drawings on the midline, rendering failure).
        doc.close()
|
||||
|
||||
|
||||
# Dispatch table: crop method name (as passed to process_one) -> function
# with the signature (pdf_bytes, params) -> cropped PDF bytes.
CROP_METHODS = {
    "raycast_auto": crop_raycast_auto,
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Worker — spouští se v samostatném procesu
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def process_one(args):
    """Crop a single puzzle PDF; intended to run inside a worker process.

    Args:
        args: Tuple ``(puzzle_id, puzzle_number, pdf_bytes, method_name,
            params_json)``. ``params_json`` may be JSON text (``str``,
            ``bytes`` or ``bytearray``), which is decoded, or any other
            object, which is handed to the crop function unchanged.

    Returns:
        ``(puzzle_id, puzzle_number, cropped_bytes, None)`` on success, or
        ``(puzzle_id, puzzle_number, None, error_message)`` on failure.
        ``error_message`` is never empty, so callers can rely on it to
        tell failures from successes.
    """
    puzzle_id, puzzle_number, pdf_bytes, method_name, params_json = args
    try:
        if isinstance(params_json, (str, bytes, bytearray)):
            params = json.loads(params_json)
        else:
            params = params_json

        fn = CROP_METHODS.get(method_name)
        if fn is None:
            return puzzle_id, puzzle_number, None, f"neznama metoda: {method_name}"

        cropped = fn(bytes(pdf_bytes), params)
        return puzzle_id, puzzle_number, cropped, None
    except Exception as e:
        # str(e) alone can be empty (e.g. ``ValueError()``); prefixing the
        # exception type keeps the message non-empty and more useful.
        return puzzle_id, puzzle_number, None, f"{type(e).__name__}: {e}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Hlavní logika
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def fetch_todo(limit):
    """Load every puzzle row that still lacks a cropped PDF.

    Args:
        limit: Maximum number of rows to fetch. Any falsy value
            (``None``, ``0``) means no limit.

    Returns:
        The result of ``cursor.fetchall()`` on a DictCursor: rows with the
        keys ``id``, ``puzzle_number``, ``file_puzzle``, ``method_name``
        and ``params_json``, ordered by ``puzzle_number``.
    """
    import pymysql.cursors

    sql = """
        SELECT sk.id, sk.puzzle_number, sk.file_puzzle,
               cm.name AS method_name, cm.params_json
        FROM sudoku_killer sk
        JOIN puzzle_crop_method cm ON sk.crop_method_id = cm.id
        WHERE sk.file_puzzle_cropped IS NULL
        ORDER BY sk.puzzle_number
    """
    if limit:
        # int() guarantees only a number is interpolated; doing it before
        # connecting means a bad LIMIT setting fails without opening a connection.
        sql += f" LIMIT {int(limit)}"

    conn = connect_mysql(database="puzzle", cursorclass=pymysql.cursors.DictCursor)
    try:
        cur = conn.cursor()
        try:
            cur.execute(sql)
            return cur.fetchall()
        finally:
            cur.close()
    finally:
        # Close the connection even when the query fails.
        conn.close()
|
||||
|
||||
|
||||
def save_cropped(updates: list[tuple]):
    """Write a batch of cropped PDFs into ``sudoku_killer.file_puzzle_cropped``.

    Args:
        updates: ``[(cropped_bytes, puzzle_id), ...]``; one UPDATE is run
            per tuple. An empty list is a no-op.
    """
    if not updates:
        return

    import pymysql.cursors

    conn = connect_mysql(database="puzzle", cursorclass=pymysql.cursors.DictCursor)
    try:
        cur = conn.cursor()
        try:
            cur.executemany(
                "UPDATE sudoku_killer SET file_puzzle_cropped = %s WHERE id = %s",
                updates,
            )
        finally:
            cur.close()
        # PyMySQL connections default to autocommit off, so without this the
        # UPDATEs would be discarded on close. NOTE(review): connect_mysql is
        # not visible here; if it already enables autocommit, this is harmless.
        conn.commit()
    finally:
        # Closing without a commit (error path) discards the partial batch.
        conn.close()
|
||||
|
||||
|
||||
def main():
    """Crop all pending puzzles in parallel and store the results.

    Steps:
      1. Fetch the rows to process via ``fetch_todo(LIMIT)``.
      2. Run ``process_one`` for each row in a ``ProcessPoolExecutor``
         with ``WORKERS`` processes.
      3. Collect successful crops and flush them to the DB through
         ``save_cropped`` every ``BATCH`` items (skipped when ``DRY_RUN``).
      4. Write failures to ``ERRORS_CSV`` and print a summary.

    A failure of a single task, including a worker process that died, is
    recorded as an error for that puzzle instead of aborting the run.
    """
    print("Nacitam seznam puzzle k orizeni...")
    rows = fetch_todo(LIMIT)
    total = len(rows)
    if total == 0:
        print("Vsechny puzzle jsou jiz orizeny.")
        return
    print(f"Ke zpracovani: {total} puzzle | workers: {WORKERS} | batch: {BATCH} | dry-run: {DRY_RUN}")

    errors = []
    pending_saves = []  # [(cropped_bytes, puzzle_id)]
    done = 0

    tasks = [
        (r["id"], r["puzzle_number"], r["file_puzzle"], r["method_name"], r["params_json"])
        for r in rows
    ]

    with ProcessPoolExecutor(max_workers=WORKERS) as executor:
        futures = {executor.submit(process_one, t): t for t in tasks}
        with tqdm(total=total, unit="puzzle") as bar:
            for future in as_completed(futures):
                # Drop our reference so the finished future (and the cropped
                # PDF it holds) is not kept alive for the rest of the run.
                task = futures.pop(future)
                try:
                    puzzle_id, puzzle_number, cropped, err = future.result()
                except Exception as exc:
                    # The worker itself failed (e.g. BrokenProcessPool);
                    # identify the puzzle from the submitted task tuple.
                    puzzle_id, puzzle_number = task[0], task[1]
                    cropped, err = None, f"{type(exc).__name__}: {exc}"

                # Test err against None, not truthiness: an empty error
                # string must not be mistaken for a successful crop.
                if err is not None or cropped is None:
                    err = err or "bez chybove zpravy"
                    errors.append({"puzzle_id": puzzle_id, "puzzle_number": puzzle_number, "chyba": err})
                    tqdm.write(f" [CHYBA] puzzle #{puzzle_number}: {err}")
                elif not DRY_RUN:
                    pending_saves.append((cropped, puzzle_id))
                    if len(pending_saves) >= BATCH:
                        save_cropped(pending_saves)
                        pending_saves.clear()

                done += 1
                bar.update(1)
                bar.set_postfix(chyby=len(errors), ulozeno=done - len(errors) - len(pending_saves))

                # LOG_EVERY of 0/None disables the periodic status line
                # instead of raising ZeroDivisionError.
                if LOG_EVERY and done % LOG_EVERY == 0:
                    remaining = total - done
                    pct = done / total * 100
                    tqdm.write(f" >> {done}/{total} ({pct:.1f}%) | puzzle #{puzzle_number} | zbyvá: {remaining} | chyby: {len(errors)}")

    # Flush whatever is left below the batch size.
    if pending_saves and not DRY_RUN:
        save_cropped(pending_saves)

    if errors:
        with open(ERRORS_CSV, "w", newline="", encoding="utf-8") as f:
            w = csv.DictWriter(f, fieldnames=["puzzle_id", "puzzle_number", "chyba"])
            w.writeheader()
            w.writerows(errors)
        print(f"\nChyby: {len(errors)} — viz {ERRORS_CSV}")
    else:
        print("\nVse bez chyb.")

    ok = done - len(errors)
    print(f"Hotovo: {ok} orizeno, {len(errors)} chyb, {total - done} preskoceno.")
|
||||
|
||||
|
||||
# Entry-point guard. It matters here beyond convention: ProcessPoolExecutor
# workers may re-import this module (spawn start method), and without the
# guard each worker would start the whole batch again.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user