notebookvb

This commit is contained in:
Vladimir Buzalka
2026-05-08 22:38:36 +02:00
parent c4c0d1d435
commit c083e8e79a
14 changed files with 140 additions and 98 deletions
@@ -0,0 +1,199 @@
"""
Batch ořez puzzle z MySQL.
Pro každý řádek v sudoku_killer kde file_puzzle_cropped IS NULL:
- načte file_puzzle + crop_method
- ořízne podle metody
- uloží zpět do file_puzzle_cropped
"""
# ---------------------------------------------------------------------------
# Settings — adjust here before running in PyCharm
# ---------------------------------------------------------------------------
WORKERS = 4  # number of parallel worker processes
LIMIT = None  # None = everything; a number (e.g. 20) = only the first N puzzles (for testing)
BATCH = 200  # how many cropped PDFs are written to the DB in one go
DRY_RUN = False  # True = crop only, nothing is saved to the DB
LOG_EVERY = 500  # print a status line to the console every N processed puzzles
# ---------------------------------------------------------------------------
import sys
import json
import csv
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
import fitz
from tqdm import tqdm
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Knihovny"))
from mysql_db import connect_mysql
# Switch both standard streams to UTF-8 so the non-ASCII status messages can be printed.
sys.stdout.reconfigure(encoding="utf-8")
sys.stderr.reconfigure(encoding="utf-8")
# CSV report of failed crops; lives next to this script.
ERRORS_CSV = Path(__file__).parent / "crop_errors.csv"
# ---------------------------------------------------------------------------
# Crop methods — add new functions here for new methods
# ---------------------------------------------------------------------------
def crop_raycast_auto(pdf_bytes: bytes, params: dict) -> bytes:
    """Crop the first page of a PDF to the drawings crossed by its horizontal mid-line.

    Every drawing whose bounding rectangle spans the line ``y = mediabox.height / 2``
    is collected. The crop box is the union of those rectangles, widened on the
    left/right by half the stroke width of the outermost drawing and on all four
    sides by ``crop_margin_pt``.

    Args:
        pdf_bytes: Source PDF document as raw bytes.
        params: Method parameters; only ``crop_margin_pt`` (default 2) is read.
            ``None`` is accepted and treated as an empty dict.

    Returns:
        A new single-page PDF, as bytes, containing only the clipped area.

    Raises:
        ValueError: If no drawing crosses the horizontal mid-line of the page.
    """
    # A NULL params_json in the database reaches this function as None.
    params = params or {}
    crop_margin = params.get("crop_margin_pt", 2)

    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        page = doc[0]
        paths = page.get_drawings()
        y_mid = page.mediabox.height / 2

        # (bounding rect, stroke width) of every drawing the mid-line passes through.
        hit_h = [
            (path["rect"], path.get("width") or 0)
            for path in paths
            if path["rect"].y0 <= y_mid <= path["rect"].y1
        ]
        if not hit_h:
            raise ValueError("ray-cast: zadne kresby na y_mid")

        rects = [rect for rect, _ in hit_h]
        x_left = min(rect.x0 for rect in rects)
        x_right = max(rect.x1 for rect in rects)
        top_cut = min(rect.y0 for rect in rects)
        bot_cut = max(rect.y1 for rect in rects)

        # Stroke width of the first drawing that touches the left / right extreme.
        lw_l = next((lw for rect, lw in hit_h if rect.x0 == x_left), 0)
        lw_r = next((lw for rect, lw in hit_h if rect.x1 == x_right), 0)

        clip = fitz.Rect(
            x_left - lw_l / 2 - crop_margin,
            top_cut - crop_margin,
            x_right + lw_r / 2 + crop_margin,
            bot_cut + crop_margin,
        )

        doc_new = fitz.open()
        try:
            new_page = doc_new.new_page(width=clip.width, height=clip.height)
            new_page.show_pdf_page(
                fitz.Rect(0, 0, clip.width, clip.height), doc, 0, clip=clip
            )
            return doc_new.tobytes()
        finally:
            doc_new.close()
    finally:
        # Close the source document on the error paths too (e.g. the ValueError above).
        doc.close()
# Registry of crop functions keyed by method name; process_one() looks the
# name up here and calls the function as fn(pdf_bytes, params).
CROP_METHODS = {
    "raycast_auto": crop_raycast_auto,
}
# ---------------------------------------------------------------------------
# Worker — runs in a separate process
# ---------------------------------------------------------------------------
def process_one(args):
    """Crop a single puzzle PDF and report the outcome without raising.

    Args:
        args: Tuple ``(puzzle_id, puzzle_number, pdf_bytes, method_name, params_json)``.
            ``params_json`` is decoded with ``json.loads`` when it is a string and
            passed through unchanged otherwise.

    Returns:
        Tuple ``(puzzle_id, puzzle_number, cropped_bytes, error)``. On success
        ``error`` is ``None``; on failure ``cropped_bytes`` is ``None`` and
        ``error`` holds the message (unknown method name or the text of the
        exception that was caught).
    """
    puzzle_id, puzzle_number, pdf_bytes, method_name, params_json = args
    try:
        if isinstance(params_json, str):
            params = json.loads(params_json)
        else:
            params = params_json

        crop_fn = CROP_METHODS.get(method_name)
        if crop_fn is None:
            message = f"neznama metoda: {method_name}"
            return puzzle_id, puzzle_number, None, message

        result = crop_fn(bytes(pdf_bytes), params)
        return puzzle_id, puzzle_number, result, None
    except Exception as exc:
        return puzzle_id, puzzle_number, None, str(exc)
# ---------------------------------------------------------------------------
# Main logic
# ---------------------------------------------------------------------------
def fetch_todo(limit):
    """Load every puzzle that still has no cropped PDF.

    Args:
        limit: Maximum number of rows to fetch; a falsy value (``None``, ``0``)
            means no limit.

    Returns:
        The rows returned by ``fetchall()`` on a ``DictCursor``, with the columns
        ``id``, ``puzzle_number``, ``file_puzzle``, ``method_name`` and
        ``params_json``, ordered by ``puzzle_number``.
    """
    import pymysql.cursors

    connection = connect_mysql(database="puzzle", cursorclass=pymysql.cursors.DictCursor)
    cursor = connection.cursor()
    query = """
        SELECT sk.id, sk.puzzle_number, sk.file_puzzle,
        cm.name AS method_name, cm.params_json
        FROM sudoku_killer sk
        JOIN puzzle_crop_method cm ON sk.crop_method_id = cm.id
        WHERE sk.file_puzzle_cropped IS NULL
        ORDER BY sk.puzzle_number
    """
    if limit:
        # int() keeps the interpolated value strictly numeric.
        query = query + f" LIMIT {int(limit)}"
    cursor.execute(query)
    todo = cursor.fetchall()
    cursor.close()
    connection.close()
    return todo
def save_cropped(updates: list[tuple]):
    """Write a batch of cropped PDFs back to ``sudoku_killer.file_puzzle_cropped``.

    Args:
        updates: ``[(cropped_bytes, puzzle_id), ...]`` — one tuple per row to update.
            An empty list is a no-op.
    """
    import pymysql.cursors

    if not updates:
        return

    conn = connect_mysql(database="puzzle", cursorclass=pymysql.cursors.DictCursor)
    try:
        cur = conn.cursor()
        try:
            cur.executemany(
                "UPDATE sudoku_killer SET file_puzzle_cropped = %s WHERE id = %s",
                updates,
            )
        finally:
            cur.close()
        # NOTE(review): whether connect_mysql enables autocommit is not visible
        # here; an explicit commit persists the batch either way.
        conn.commit()
    finally:
        # Release the connection even when the UPDATE fails.
        conn.close()
def main():
    """Crop all pending puzzles in parallel and store the results in batches.

    Loads the to-do list with ``fetch_todo(LIMIT)``, submits one ``process_one``
    task per row to a process pool and, as results arrive, queues successful
    crops for ``save_cropped`` (flushed every ``BATCH`` items unless ``DRY_RUN``
    is set). Failures are collected and written to ``ERRORS_CSV`` at the end.
    """
    print("Nacitam seznam puzzle k orizeni...")
    rows = fetch_todo(LIMIT)
    total = len(rows)
    if not total:
        print("Vsechny puzzle jsou jiz orizeny.")
        return
    print(f"Ke zpracovani: {total} puzzle | workers: {WORKERS} | batch: {BATCH} | dry-run: {DRY_RUN}")

    errors = []
    pending_saves = []  # [(cropped_bytes, puzzle_id)]
    done = 0
    columns = ("id", "puzzle_number", "file_puzzle", "method_name", "params_json")
    tasks = [tuple(row[column] for column in columns) for row in rows]

    with ProcessPoolExecutor(max_workers=WORKERS) as executor:
        futures = [executor.submit(process_one, task) for task in tasks]
        with tqdm(total=total, unit="puzzle") as bar:
            for finished in as_completed(futures):
                puzzle_id, puzzle_number, cropped, err = finished.result()
                if err:
                    errors.append(
                        {"puzzle_id": puzzle_id, "puzzle_number": puzzle_number, "chyba": err}
                    )
                    tqdm.write(f" [CHYBA] puzzle #{puzzle_number}: {err}")
                elif not DRY_RUN:
                    pending_saves.append((cropped, puzzle_id))
                    if len(pending_saves) >= BATCH:
                        save_cropped(pending_saves)
                        pending_saves.clear()

                done += 1
                bar.update(1)
                saved_so_far = done - len(errors) - len(pending_saves)
                bar.set_postfix(chyby=len(errors), ulozeno=saved_so_far)
                if done % LOG_EVERY == 0:
                    remaining = total - done
                    percent = done / total * 100
                    tqdm.write(f" >> {done}/{total} ({percent:.1f}%) | puzzle #{puzzle_number} | zbyvá: {remaining} | chyby: {len(errors)}")

    # Flush whatever is left below the batch threshold.
    if pending_saves and not DRY_RUN:
        save_cropped(pending_saves)

    if errors:
        with open(ERRORS_CSV, "w", newline="", encoding="utf-8") as handle:
            writer = csv.DictWriter(handle, fieldnames=["puzzle_id", "puzzle_number", "chyba"])
            writer.writeheader()
            writer.writerows(errors)
        print(f"\nChyby: {len(errors)} — viz {ERRORS_CSV}")
    else:
        print("\nVse bez chyb.")

    ok = done - len(errors)
    print(f"Hotovo: {ok} orizeno, {len(errors)} chyb, {total - done} preskoceno.")


if __name__ == "__main__":
    main()
@@ -0,0 +1,36 @@
"""
Exportuje originální PDF puzzle z tabulky sudoku_killer pro porovnání.
"""
import sys
from pathlib import Path
sys.stdout.reconfigure(encoding="utf-8")
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Knihovny"))
from mysql_db import connect_mysql
# Exported files are written next to this script.
OUTPUT_DIR = Path(__file__).parent
# Puzzle to export; used for both the query and the "not found" message.
PUZZLE_NUMBER = 31414

conn = connect_mysql(database="puzzle")
try:
    cur = conn.cursor()
    try:
        cur.execute(
            "SELECT puzzle_number, file_puzzle, file_solution "
            "FROM sudoku_killer WHERE puzzle_number = %s",
            (PUZZLE_NUMBER,),
        )
        row = cur.fetchone()
    finally:
        cur.close()
finally:
    # Close the connection even if the query fails.
    conn.close()

if not row:
    print(f"Puzzle {PUZZLE_NUMBER} nenalezen v sudoku_killer.")
else:
    num, pdf_puzzle, pdf_solution = row
    # Write each stored PDF that is present (non-empty) to its own file.
    for kind, blob in (("puzzle", pdf_puzzle), ("solution", pdf_solution)):
        if blob:
            path = OUTPUT_DIR / f"original_{num}_{kind}.pdf"
            path.write_bytes(blob)
            print(f"Uloženo: {path}")
@@ -0,0 +1,151 @@
"""
Naimportuje stažené PDF puzzle z DownloadedPuzzles/ do MySQL tabulky sudoku_killer.
Spuštění:
python import_do_mysql.py # přeskočí již existující (podle puzzle_number)
python import_do_mysql.py --all # reimportuje vše (přepíše existující)
"""
import re
import sys
import argparse
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Knihovny"))
from mysql_db import connect_mysql
sys.stdout.reconfigure(encoding="utf-8")
sys.stderr.reconfigure(encoding="utf-8")
# Directory with the downloaded PDFs, located next to this script.
SAVE_DIR = Path(__file__).parent / "DownloadedPuzzles"
# Example of a matching file name:
# 2009-01-01 Puzzle SudokuKiller 1 [difficulty 5 of 10] [average solving time 47 min].pdf
# Named groups: date, type, num, diff, maxdiff, time and the optional
# " [solution]" marker that distinguishes a solution PDF from a puzzle PDF.
FILENAME_RE = re.compile(
    r"^(?P<date>\d{4}-\d{2}-\d{2}) Puzzle (?P<type>SudokuKillerGreaterThan|SudokuKiller) (?P<num>\d+) "
    r"\[difficulty (?P<diff>\d+) of (?P<maxdiff>\d+)\] "
    r"\[average solving time (?P<time>[^\]]+)\]"
    r"(?P<solution> \[solution\])?\.pdf$"
)
def parse_time_to_minutes(time_str):
    """Convert a solving-time label to a total number of minutes.

    Recognised forms are ``'<H>h <M>m'`` (e.g. ``'1h 7m'``, ``'17h 44m'``) and
    ``'<M> min'`` (e.g. ``'47 min'``); surrounding whitespace is ignored.

    Returns:
        The total minutes as an int, or ``None`` when the text matches neither form.
    """
    text = time_str.strip()

    hours_and_minutes = re.fullmatch(r"(\d+)h\s+(\d+)m", text)
    if hours_and_minutes is not None:
        hours, minutes = (int(part) for part in hours_and_minutes.groups())
        return hours * 60 + minutes

    minutes_only = re.fullmatch(r"(\d+)\s+min", text)
    if minutes_only is None:
        return None
    return int(minutes_only.group(1))
def load_puzzle_types(cursor):
    """Return a ``{name: id}`` mapping of all rows in the ``puzzle_type`` table.

    The cursor must yield rows addressable by column name (``row["name"]``).
    """
    cursor.execute("SELECT id, name FROM puzzle_type")
    ids_by_name = {}
    for record in cursor.fetchall():
        ids_by_name[record["name"]] = record["id"]
    return ids_by_name
def load_existing_numbers(cursor):
    """Return the set of ``puzzle_number`` values already stored in ``sudoku_killer``.

    The cursor must yield rows addressable by column name (``row["puzzle_number"]``).
    """
    cursor.execute("SELECT puzzle_number FROM sudoku_killer")
    numbers = set()
    for record in cursor.fetchall():
        numbers.add(record["puzzle_number"])
    return numbers
def parse_files():
    """Scan ``SAVE_DIR`` and group the recognised PDFs by puzzle number.

    Entries whose name does not match ``FILENAME_RE`` are reported on stderr and
    skipped. The metadata of a puzzle comes from whichever of its files is
    seen first; the puzzle and solution paths are then filled in as found.

    Returns:
        dict: ``puzzle_number -> {"puzzle_number", "puzzle_date", "puzzle_type",
        "difficulty", "max_difficulty", "avg_minutes", "file_puzzle",
        "file_solution"}`` where the two file entries are ``Path`` or ``None``.
    """
    puzzles = {}
    for path in SAVE_DIR.iterdir():
        match = FILENAME_RE.match(path.name)
        if match is None:
            print(f"[SKIP] Nerozpoznaný název: {path.name}", file=sys.stderr)
            continue

        number = int(match["num"])
        entry = puzzles.get(number)
        if entry is None:
            entry = {
                "puzzle_number": number,
                "puzzle_date": match["date"],
                "puzzle_type": match["type"],
                "difficulty": int(match["diff"]),
                "max_difficulty": int(match["maxdiff"]),
                "avg_minutes": parse_time_to_minutes(match["time"]),
                "file_puzzle": None,
                "file_solution": None,
            }
            puzzles[number] = entry

        # The optional " [solution]" suffix decides which slot the file fills.
        slot = "file_solution" if match["solution"] else "file_puzzle"
        entry[slot] = path
    return puzzles
def import_puzzle(cursor, puzzle, type_ids):
    """Insert one puzzle into ``sudoku_killer``, or update it if it already exists.

    Args:
        cursor: DB cursor used to execute the upsert.
        puzzle: One entry as produced by ``parse_files()``.
        type_ids: ``{puzzle type name: puzzle_type.id}`` mapping.

    Returns:
        ``True`` when the statement was executed; ``False`` when the puzzle was
        skipped because its puzzle PDF is missing or its type is unknown
        (a ``[SKIP]`` line is printed in both cases).
    """
    puzzle_path = puzzle["file_puzzle"]
    if puzzle_path is None:
        print(f"[SKIP] puzzle_number={puzzle['puzzle_number']}: chybí PDF puzzlu")
        return False

    type_id = type_ids.get(puzzle["puzzle_type"])
    if type_id is None:
        print(f"[SKIP] Neznámý typ: {puzzle['puzzle_type']}")
        return False

    pdf_puzzle = puzzle_path.read_bytes()
    solution_path = puzzle["file_solution"]
    if solution_path:
        pdf_solution = solution_path.read_bytes()
    else:
        pdf_solution = None

    values = (
        puzzle["puzzle_number"],
        type_id,
        puzzle["puzzle_date"],
        puzzle["difficulty"],
        puzzle["max_difficulty"],
        puzzle["avg_minutes"],
        pdf_puzzle,
        pdf_solution,
    )
    cursor.execute(
        """
        INSERT INTO sudoku_killer
        (puzzle_number, puzzle_type_id, puzzle_date, difficulty, max_difficulty,
        avg_solving_time_minutes, file_puzzle, file_solution)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE
        puzzle_type_id = VALUES(puzzle_type_id),
        puzzle_date = VALUES(puzzle_date),
        difficulty = VALUES(difficulty),
        max_difficulty = VALUES(max_difficulty),
        avg_solving_time_minutes = VALUES(avg_solving_time_minutes),
        file_puzzle = VALUES(file_puzzle),
        file_solution = VALUES(file_solution)
        """,
        values,
    )
    return True
def main():
    """Import the PDFs found by ``parse_files()`` into the ``sudoku_killer`` table.

    Without ``--all``, puzzles whose number is already in the table are skipped;
    with ``--all`` every puzzle is (re)imported through the upsert in
    ``import_puzzle``. A progress line is printed whenever the loop index of an
    imported or failed puzzle is a multiple of 500, and a summary at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--all", action="store_true", help="Reimportuje i existující záznamy")
    args = parser.parse_args()

    import pymysql.cursors

    conn = connect_mysql(database="puzzle", cursorclass=pymysql.cursors.DictCursor)
    try:
        cursor = conn.cursor()
        try:
            type_ids = load_puzzle_types(cursor)
            existing = load_existing_numbers(cursor) if not args.all else set()
            puzzles = parse_files()
            total = len(puzzles)
            print(f"Nalezeno {total} puzzle v adresáři.")

            imported = skipped = errors = 0
            for i, (num, puzzle) in enumerate(sorted(puzzles.items()), 1):
                if num in existing:
                    skipped += 1
                    continue
                try:
                    if import_puzzle(cursor, puzzle, type_ids):
                        imported += 1
                    else:
                        errors += 1
                except Exception as e:
                    print(f"[CHYBA] puzzle_number={num}: {e}", file=sys.stderr)
                    errors += 1
                if i % 500 == 0:
                    # Persist progress so an interrupted run keeps what was imported so far.
                    conn.commit()
                    print(f" {i}/{total} zpracováno ({imported} importováno, {skipped} přeskočeno, {errors} chyb)")
        finally:
            cursor.close()
        # NOTE(review): whether connect_mysql enables autocommit is not visible
        # here; an explicit commit persists the inserts either way.
        conn.commit()
    finally:
        # Release the connection even if an unexpected error escapes the loop.
        conn.close()

    print(f"\nHotovo: {imported} importováno, {skipped} přeskočeno, {errors} chyb.")


if __name__ == "__main__":
    main()
@@ -0,0 +1,23 @@
{
"2PuzzleOnA4": {
"description": "2 puzzle pod sebou, horizontalne vycentrovane, misto po stranach na vypocty",
"page": {
"format": "A4",
"width_pt": 595.276,
"height_pt": 841.89
},
"count": 2,
"arrangement": "vertical",
"horizontal_align": "center",
"vertical_distribution": "equal_gaps",
"target_puzzle_width_mm": 117.83,
"target_puzzle_height_mm": 117.83,
"crop_margin_pt": 2,
"info": {
"sample_raw_puzzle_mm": "107.12 x 107.12",
"scale_used_for_sample": 1.1,
"side_margin_mm": 46.09,
"gap_between_puzzles_mm": 20.45
}
}
}
@@ -0,0 +1,184 @@
"""
Průzkumný skript: zkouší najít strukturovaná data puzzle
na dailykillersudoku.com (cage definice, řešení).
"""
import asyncio
import json
import sys
sys.stdout.reconfigure(encoding="utf-8")
from playwright.async_api import async_playwright
URL = "https://www.dailykillersudoku.com/puzzle/70000"
async def main():
    """Load ``URL`` in headless Chromium and print clues about where puzzle data lives.

    While the page loads, responses whose URL contains one of a few keywords are
    recorded (body truncated to 2000 characters). Afterwards the script prints:
    the recorded responses, well-known global JS variables, SVG/canvas counts,
    elements with puzzle-like ``data-`` attributes, table/grid-like elements,
    script tags mentioning puzzle keywords, Vue/Nuxt/React markers and custom
    ``window`` properties.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(viewport={"width": 1280, "height": 900})
        page = await context.new_page()

        # Capture network responses
        api_responses = []

        async def on_response(response):
            url = response.url
            if any(k in url for k in ["api", "puzzle", "data", "json", "cage", "grid"]):
                try:
                    body = await response.text()
                    api_responses.append({"url": url, "status": response.status, "body": body[:2000]})
                except Exception:
                    # Best effort: the body may be unavailable. `Exception` (not a
                    # bare except) lets task cancellation and Ctrl+C propagate.
                    api_responses.append({"url": url, "status": response.status, "body": "(could not read)"})

        page.on("response", on_response)

        print(f"Načítám {URL} ...")
        await page.goto(URL, wait_until="networkidle", timeout=60_000)

        # 1) Network requests
        print("\n=== Zachycené API/data requesty ===")
        for r in api_responses:
            print(f"\n URL: {r['url']}")
            print(f" Status: {r['status']}")
            # Stored bodies are capped at 2000 chars, so only shorter ones are printed.
            if r['body'] and len(r['body']) < 2000:
                print(f" Body: {r['body'][:500]}")

        # 2) Global JS variables
        print("\n=== Globální proměnné ===")
        globals_check = await page.evaluate("""() => {
            const names = ['puzzle', 'puzzleData', 'gameData', 'game', 'board',
            'grid', 'cages', 'cells', 'solution', 'killerData',
            'sudoku', 'level', 'data', 'config', 'state',
            'app', 'store', 'vuex', '__NUXT__', '__NEXT_DATA__',
            'initialData', 'pageData', 'props', 'serverData'];
            const found = {};
            for (const name of names) {
            if (typeof window[name] !== 'undefined') {
            const val = window[name];
            found[name] = {
            type: typeof val,
            keys: typeof val === 'object' && val !== null ? Object.keys(val).slice(0, 20) : null
            };
            }
            }
            return found;
            }""")
        print(json.dumps(globals_check, indent=2))

        # 3) SVG/Canvas analysis
        print("\n=== SVG/Canvas elementy ===")
        svg_info = await page.evaluate("""() => {
            const svgs = document.querySelectorAll('svg');
            const canvases = document.querySelectorAll('canvas');
            return {
            svg_count: svgs.length,
            canvas_count: canvases.length,
            svg_ids: Array.from(svgs).map(s => s.id || s.className || '(no id)').slice(0, 5),
            canvas_ids: Array.from(canvases).map(c => c.id || c.className || '(no id)').slice(0, 5)
            };
            }""")
        print(json.dumps(svg_info, indent=2))

        # 4) Data attributes
        print("\n=== Elementy s data- atributy ===")
        data_attrs = await page.evaluate("""() => {
            const all = document.querySelectorAll('[data-cage], [data-cell], [data-sum], [data-group], [data-value], [data-row], [data-col]');
            return {
            count: all.length,
            samples: Array.from(all).slice(0, 5).map(el => ({
            tag: el.tagName,
            attrs: Object.fromEntries(Array.from(el.attributes).filter(a => a.name.startsWith('data-')).map(a => [a.name, a.value]))
            }))
            };
            }""")
        print(json.dumps(data_attrs, indent=2))

        # 5) Tables and grids
        print("\n=== Tabulky / grid struktury ===")
        tables = await page.evaluate("""() => {
            const tables = document.querySelectorAll('table');
            const grids = document.querySelectorAll('[class*=grid], [class*=puzzle], [class*=board], [class*=cage], [class*=cell], [id*=grid], [id*=puzzle], [id*=board]');
            return {
            table_count: tables.length,
            grid_elements: Array.from(grids).slice(0, 10).map(el => ({
            tag: el.tagName,
            id: el.id,
            class: el.className.toString().substring(0, 100),
            children: el.children.length
            }))
            };
            }""")
        print(json.dumps(tables, indent=2))

        # 6) Script tags containing data
        print("\n=== Script tagy s daty ===")
        scripts = await page.evaluate("""() => {
            const scripts = document.querySelectorAll('script');
            const results = [];
            for (const s of scripts) {
            const text = s.textContent || '';
            if (text.length > 10 && text.length < 50000) {
            const keywords = ['puzzle', 'cage', 'cell', 'grid', 'solution', 'board', 'sum'];
            const found = keywords.filter(k => text.toLowerCase().includes(k));
            if (found.length > 0) {
            results.push({
            keywords: found,
            length: text.length,
            snippet: text.substring(0, 500)
            });
            }
            }
            }
            return results;
            }""")
        print(json.dumps(scripts, indent=2, ensure_ascii=False)[:5000])

        # 7) Vue/React/Angular state
        print("\n=== Framework state ===")
        framework = await page.evaluate("""() => {
            // Vue
            const vueEl = document.querySelector('[data-v-app]') || document.querySelector('#app') || document.querySelector('#__nuxt');
            let vueData = null;
            if (vueEl && vueEl.__vue_app__) {
            vueData = 'Vue 3 app found';
            } else if (vueEl && vueEl.__vue__) {
            vueData = 'Vue 2 app found';
            try {
            const d = vueEl.__vue__.$data;
            vueData = {type: 'Vue 2', keys: Object.keys(d)};
            } catch(e) {}
            }
            // __NUXT__
            if (typeof __NUXT__ !== 'undefined') {
            try { vueData = {type: 'Nuxt', keys: Object.keys(__NUXT__)}; } catch(e) {}
            }
            // React
            let reactData = null;
            const reactRoot = document.querySelector('#__next') || document.querySelector('#root');
            if (reactRoot) {
            const fiberKey = Object.keys(reactRoot).find(k => k.startsWith('__reactFiber') || k.startsWith('__reactInternalInstance'));
            if (fiberKey) reactData = 'React app found';
            }
            return {vue: vueData, react: reactData};
            }""")
        print(json.dumps(framework, indent=2))

        # 8) All (custom) window properties
        print("\n=== Custom window properties ===")
        custom_props = await page.evaluate("""() => {
            const iframe = document.createElement('iframe');
            document.body.appendChild(iframe);
            const defaultKeys = new Set(Object.keys(iframe.contentWindow));
            document.body.removeChild(iframe);
            const custom = Object.keys(window).filter(k => !defaultKeys.has(k) && !k.startsWith('__'));
            return custom.slice(0, 50);
            }""")
        print(json.dumps(custom_props, indent=2))

        await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,174 @@
"""
Průzkumný skript v2: zkouší najít strukturovaná data puzzle
na dailykillersudoku.com — prozkoumá DKS objekt a platný puzzle.
"""
import asyncio
import json
import sys
sys.stdout.reconfigure(encoding="utf-8")
from playwright.async_api import async_playwright
URL = "https://www.dailykillersudoku.com/puzzle/376"
async def main():
    """Load ``URL`` in headless Chromium and dump what the page's ``DKS`` object exposes.

    While the page loads, text-like responses from URLs containing
    "dailykillersudoku" (other than ``URL`` itself) are recorded. Afterwards the
    script prints: the keys of the global ``DKS`` object, its puzzle-related
    members, script tags mentioning puzzle keywords, the recorded responses,
    a summary of large SVG elements and inline-JS patterns that look like
    puzzle data.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(viewport={"width": 1280, "height": 900})
        page = await context.new_page()

        api_responses = []

        async def on_response(response):
            url = response.url
            if "dailykillersudoku" in url and url != URL:
                try:
                    ct = response.headers.get("content-type", "")
                    if "json" in ct or "javascript" in ct or "text" in ct:
                        body = await response.text()
                        if len(body) < 5000:
                            api_responses.append({"url": url, "status": response.status, "body": body[:2000]})
                        else:
                            api_responses.append({"url": url, "status": response.status, "body": f"({len(body)} chars)"})
                except Exception:
                    # Deliberately best effort: unreadable responses are skipped.
                    # `Exception` (not a bare except) lets task cancellation and
                    # Ctrl+C propagate.
                    pass

        page.on("response", on_response)

        print(f"Načítám {URL} ...")
        await page.goto(URL, wait_until="networkidle", timeout=60_000)

        # 1) DKS object — keys
        print("\n=== DKS objekt — klíče ===")
        dks = await page.evaluate("""() => {
            if (typeof DKS === 'undefined') return null;
            const result = {};
            for (const key of Object.keys(DKS)) {
            const val = DKS[key];
            const t = typeof val;
            if (t === 'function') {
            result[key] = 'function';
            } else if (t === 'object' && val !== null) {
            result[key] = {type: 'object', keys: Object.keys(val).slice(0, 15)};
            } else {
            result[key] = val;
            }
            }
            return result;
            }""")
        if dks:
            print(json.dumps(dks, indent=2, ensure_ascii=False)[:5000])

        # 2) DKS.board or similar puzzle objects
        print("\n=== DKS puzzle-related data ===")
        puzzle_data = await page.evaluate("""() => {
            if (typeof DKS === 'undefined') return null;
            const result = {};
            const interesting = ['board', 'puzzle', 'game', 'grid', 'cages', 'cells',
            'solution', 'currentPuzzle', 'puzzleData', 'data',
            'sudoku', 'killer', 'state'];
            for (const key of Object.keys(DKS)) {
            if (interesting.some(i => key.toLowerCase().includes(i))) {
            try {
            result[key] = JSON.parse(JSON.stringify(DKS[key]));
            } catch(e) {
            result[key] = String(DKS[key]).substring(0, 200);
            }
            }
            }
            return result;
            }""")
        if puzzle_data:
            print(json.dumps(puzzle_data, indent=2, ensure_ascii=False)[:8000])
        else:
            print(" žádné puzzle data")

        # 3) Script tags with puzzle data
        print("\n=== Script tagy s puzzle daty ===")
        scripts = await page.evaluate("""() => {
            const scripts = document.querySelectorAll('script');
            const results = [];
            for (const s of scripts) {
            const text = s.textContent || '';
            if (text.includes('cage') || text.includes('cell') || text.includes('solution')
            || text.includes('group') || text.includes('sum') || text.includes('Board')
            || text.includes('Puzzle')) {
            results.push({
            length: text.length,
            snippet: text.substring(0, 1000)
            });
            }
            }
            return results;
            }""")
        print(json.dumps(scripts, indent=2, ensure_ascii=False)[:8000])

        # 4) Captured requests
        print("\n=== Zachycené requesty (dailykillersudoku) ===")
        for r in api_responses:
            print(f"\n URL: {r['url']}")
            print(f" Status: {r['status']}")
            print(f" Body: {r['body'][:500]}")

        # 5) SVG content — puzzle grid
        print("\n=== SVG puzzle mřížka ===")
        svg_data = await page.evaluate("""() => {
            const svgs = document.querySelectorAll('svg');
            const results = [];
            for (const svg of svgs) {
            const html = svg.outerHTML;
            if (html.length > 1000) {
            // Pravděpodobně puzzle mřížka
            const texts = svg.querySelectorAll('text');
            const textContent = Array.from(texts).map(t => ({
            text: t.textContent,
            x: t.getAttribute('x'),
            y: t.getAttribute('y'),
            class: t.getAttribute('class')
            }));
            const paths = svg.querySelectorAll('path');
            results.push({
            size: html.length,
            width: svg.getAttribute('width'),
            height: svg.getAttribute('height'),
            viewBox: svg.getAttribute('viewBox'),
            text_count: texts.length,
            path_count: paths.length,
            texts: textContent.slice(0, 30)
            });
            }
            }
            return results;
            }""")
        print(json.dumps(svg_data, indent=2, ensure_ascii=False)[:5000])

        # 6) Look for inline JS with puzzle data
        print("\n=== Inline JS s puzzle daty ===")
        # Raw string: the JS regex literals contain backslashes (\., \w, \s, ...)
        # that are not valid Python escapes; the resulting text is unchanged.
        inline_data = await page.evaluate(r"""() => {
            const html = document.documentElement.innerHTML;
            // Hledej vzory jako JSON pole, cage definice apod.
            const patterns = [
            /DKS\.\w+\s*=\s*(\{[^}]{20,}\})/g,
            /DKS\.\w+\s*=\s*(\[[^\]]{20,}\])/g,
            /var\s+\w+\s*=\s*(\{[^}]{50,}\})/g,
            /puzzl\w*\s*[:=]\s*["'{[]/gi
            ];
            const found = [];
            for (const p of patterns) {
            let m;
            while ((m = p.exec(html)) !== null) {
            found.push(m[0].substring(0, 300));
            }
            }
            return found;
            }""")
        print(json.dumps(inline_data, indent=2, ensure_ascii=False)[:3000])

        await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,156 @@
"""
Průzkumný skript v3: dekóduje board_base64 a solution_base64
z dailykillersudoku.com — zjistí formát dat.
"""
import asyncio
import json
import sys
sys.stdout.reconfigure(encoding="utf-8")
from playwright.async_api import async_playwright
URL = "https://www.dailykillersudoku.com/puzzle/376"
async def main():
    """Load ``URL`` in headless Chromium and inspect ``DKS.puzzle`` in detail.

    Prints the puzzle's ``_json`` object, the byte arrays obtained by passing
    ``board_base64`` / ``solution_base64`` through the page's own
    ``DKS.base64ToByteArray``, and then several views of ``DKS.puzzle.board``
    and ``DKS.puzzle.solution`` (cells, property names, sample values).
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(viewport={"width": 1280, "height": 900})
        page = await context.new_page()

        print(f"Načítám {URL} ...")
        await page.goto(URL, wait_until="networkidle", timeout=60_000)

        # 1) Pull out the puzzle's JSON data
        print("\n=== Puzzle JSON ===")
        puzzle_json = await page.evaluate("""() => {
            return DKS.puzzle._json;
            }""")
        print(json.dumps(puzzle_json, indent=2))

        # 2) Decode base64 → raw bytes
        print("\n=== board_base64 dekódováno ===")
        board_bytes = await page.evaluate("""() => {
            const b64 = DKS.puzzle._json.board_base64;
            const bytes = DKS.base64ToByteArray(b64);
            return Array.from(bytes);
            }""")
        print(f" Délka: {len(board_bytes)} bytes")
        print(f" Raw: {board_bytes}")

        print("\n=== solution_base64 dekódováno ===")
        sol_bytes = await page.evaluate("""() => {
            const b64 = DKS.puzzle._json.solution_base64;
            const bytes = DKS.base64ToByteArray(b64);
            return Array.from(bytes);
            }""")
        print(f" Délka: {len(sol_bytes)} bytes")
        print(f" Raw: {sol_bytes}")

        # 3) How Board parses the data
        print("\n=== Board po rozbalení ===")
        board_data = await page.evaluate("""() => {
            const board = DKS.puzzle.board;
            return {
            size: board.size,
            cell_count: board._canvas ? 'has canvas' : 'no canvas',
            };
            }""")
        print(json.dumps(board_data, indent=2))

        # 4) Cells and cages from the board
        print("\n=== Board cells ===")
        cells_data = await page.evaluate("""() => {
            const board = DKS.puzzle.board;
            if (!board._cells) return 'no _cells';
            const result = [];
            for (let r = 0; r < board.size; r++) {
            for (let c = 0; c < board.size; c++) {
            const cell = board._cells[r][c];
            result.push({
            row: r, col: c,
            value: cell._value || cell.value,
            cage: cell._cage ? {
            sum: cell._cage._sum || cell._cage.sum,
            id: cell._cage._id || cell._cage.id
            } : null
            });
            }
            }
            return result;
            }""")
        # The JS returns either a list of cells or the string 'no _cells'.
        if isinstance(cells_data, list):
            print(f" Celkem buněk: {len(cells_data)}")
            for c in cells_data[:20]:
                print(f" [{c['row']},{c['col']}] value={c.get('value')} cage={c.get('cage')}")
        else:
            print(f" {cells_data}")

        # 5) Try access via cages
        print("\n=== Cages ===")
        cages_data = await page.evaluate("""() => {
            const board = DKS.puzzle.board;
            // Zkus najít cages
            const props = Object.keys(board).filter(k => !k.startsWith('_') || k.includes('cage') || k.includes('Cage'));
            const allProps = Object.keys(board);
            return {all_props: allProps, filtered: props};
            }""")
        print(json.dumps(cages_data, indent=2))

        # 6) All board properties
        print("\n=== Board — všechny vlastnosti ===")
        board_full = await page.evaluate("""() => {
            const board = DKS.puzzle.board;
            const result = {};
            for (const key of Object.keys(board)) {
            const val = board[key];
            const t = typeof val;
            if (t === 'function') continue;
            if (t === 'object' && val !== null) {
            if (Array.isArray(val)) {
            result[key] = `Array(${val.length})`;
            if (val.length > 0 && val.length < 100) {
            try {
            const sample = val[0];
            result[key + '_sample'] = typeof sample === 'object' ? Object.keys(sample || {}).slice(0,10) : sample;
            } catch(e) {}
            }
            } else {
            result[key] = Object.keys(val).slice(0, 10);
            }
            } else {
            result[key] = val;
            }
            }
            return result;
            }""")
        print(json.dumps(board_full, indent=2, ensure_ascii=False)[:5000])

        # 7) Solution data
        print("\n=== Solution ===")
        solution_data = await page.evaluate("""() => {
            const sol = DKS.puzzle.solution;
            if (!sol) return 'no solution';
            const props = Object.keys(sol);
            const result = {props: props};
            for (const p of props) {
            const v = sol[p];
            if (typeof v !== 'function') {
            if (Array.isArray(v)) {
            result[p] = v.slice(0, 20);
            } else {
            result[p] = v;
            }
            }
            }
            return result;
            }""")
        print(json.dumps(solution_data, indent=2, ensure_ascii=False)[:3000])

        await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,77 @@
"""
Průzkumný skript v4: vytáhne klece (cages) z DKS.puzzle.board.
"""
import asyncio
import json
import sys
sys.stdout.reconfigure(encoding="utf-8")
from playwright.async_api import async_playwright
URL = "https://www.dailykillersudoku.com/puzzle/376"
async def main():
    """Load ``URL`` in headless Chromium, print the cages and solution, and check the sums.

    Cages are read from ``DKS.puzzle.board._cages`` (cell positions via the
    ``_row`` / ``_col`` properties), the solution from
    ``DKS.puzzle.solution._values``. A cage-index map of the grid is printed
    and each cage's declared sum is compared with the sum of its solution cells.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(viewport={"width": 1280, "height": 900})
        page = await context.new_page()

        print(f"Načítám {URL} ...")
        await page.goto(URL, wait_until="networkidle", timeout=60_000)

        # Cages
        print("\n=== Cages ===")
        cages = await page.evaluate("""() => {
            const board = DKS.puzzle.board;
            return board._cages.map((cage, i) => ({
            id: i,
            sum: cage.sum,
            cells: cage.cells.map(c => ({row: c._row, col: c._col}))
            }));
            }""")
        for cage in cages:
            cells_str = ", ".join(f"({c['row']},{c['col']})" for c in cage['cells'])
            print(f" Klec {cage['id']:2d}: sum={cage['sum']:2d}, buňky=[{cells_str}]")

        # Solution
        print("\n=== Řešení ===")
        solution = await page.evaluate("""() => {
            return DKS.puzzle.solution._values;
            }""")
        for r, row in enumerate(solution):
            print(f" Řádek {r}: {row}")

        # Cage map — verification
        print("\n=== Cage map (ověření) ===")
        cage_map = await page.evaluate("""() => {
            const board = DKS.puzzle.board;
            const map = [];
            for (let r = 0; r < board.size; r++) {
            const row = [];
            for (let c = 0; c < board.size; c++) {
            const cell = board._cells[r][c];
            const cageIdx = board._cages.indexOf(cell._cage);
            row.push(cageIdx);
            }
            map.push(row);
            }
            return map;
            }""")
        for row in cage_map:
            print(f" {row}")

        # Verify the sums
        print("\n=== Ověření součtů ===")
        for cage in cages:
            total = sum(solution[c['row']][c['col']] for c in cage['cells'])
            # The marker was an empty string in both branches, which made a
            # mismatch indistinguishable from a match.
            ok = "OK" if total == cage['sum'] else "CHYBA"
            print(f" Klec {cage['id']:2d}: sum={cage['sum']:2d}, actual={total:2d} {ok}")

        await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,113 @@
"""
Průzkumný skript v5: najde správné property names pro cell row/col.
"""
import asyncio
import json
import sys
sys.stdout.reconfigure(encoding="utf-8")
from playwright.async_api import async_playwright
URL = "https://www.dailykillersudoku.com/puzzle/376"
async def main():
    """Load ``URL`` in headless Chromium and work out how cage cells expose row/column.

    Prints the property names and scalar values of the first cell of the first
    cage, then the first five cages using whichever numeric properties contain
    "row" / "col" in their name. As a fallback, every cage cell is located by
    identity in ``board._cells`` to build the cage map, whose sums are finally
    checked against ``DKS.puzzle.solution._values``.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context(viewport={"width": 1280, "height": 900})
        page = await context.new_page()

        print(f"Načítám {URL} ...")
        await page.goto(URL, wait_until="networkidle", timeout=60_000)

        # Find the property names of the cells inside cages
        print("\n=== Cell properties ===")
        cell_props = await page.evaluate("""() => {
            const cage = DKS.puzzle.board._cages[0];
            const cell = cage.cells[0];
            return Object.keys(cell);
            }""")
        print(json.dumps(cell_props, indent=2))

        # Try all row/col variants
        print("\n=== Cell row/col lookup ===")
        cell_data = await page.evaluate("""() => {
            const cage = DKS.puzzle.board._cages[0];
            const cell = cage.cells[0];
            const result = {};
            for (const key of Object.keys(cell)) {
            const val = cell[key];
            if (typeof val !== 'function' && typeof val !== 'object') {
            result[key] = val;
            }
            }
            return result;
            }""")
        print(json.dumps(cell_data, indent=2))

        # Cages with cells — using the detected properties
        print("\n=== Cages s buňkami ===")
        cages = await page.evaluate("""() => {
            const board = DKS.puzzle.board;
            return board._cages.map((cage, i) => {
            const cells = cage.cells.map(c => {
            // Najdi row/col property
            const keys = Object.keys(c);
            const rowKey = keys.find(k => k.toLowerCase().includes('row') && typeof c[k] === 'number');
            const colKey = keys.find(k => (k.toLowerCase().includes('col') || k.toLowerCase().includes('column')) && typeof c[k] === 'number');
            return {
            row: rowKey ? c[rowKey] : null,
            col: colKey ? c[colKey] : null,
            rowKey: rowKey,
            colKey: colKey
            };
            });
            return {id: i, sum: cage.sum, cells: cells};
            });
            }""")
        for cage in cages[:5]:
            cells_str = ", ".join(f"({c['row']},{c['col']})" for c in cage['cells'])
            print(f" Klec {cage['id']:2d}: sum={cage['sum']:2d}, buňky=[{cells_str}]")
            if cage['id'] == 0:
                print(f" rowKey={cage['cells'][0]['rowKey']}, colKey={cage['cells'][0]['colKey']}")

        # If row/col are still None, fall back to an index-based approach
        print("\n=== Fallback: cage map z _cells ===")
        cage_map = await page.evaluate("""() => {
            const board = DKS.puzzle.board;
            const result = [];
            for (const cage of board._cages) {
            const cellPositions = [];
            for (const cageCell of cage.cells) {
            // Najdi pozici buňky v _cells mřížce
            for (let r = 0; r < board.size; r++) {
            for (let c = 0; c < board.size; c++) {
            if (board._cells[r][c] === cageCell) {
            cellPositions.push({row: r, col: c});
            }
            }
            }
            }
            result.push({sum: cage.sum, cells: cellPositions});
            }
            return result;
            }""")
        for i, cage in enumerate(cage_map):
            cells_str = ", ".join(f"({c['row']},{c['col']})" for c in cage['cells'])
            print(f" Klec {i:2d}: sum={cage['sum']:2d}, buňky=[{cells_str}]")

        # Verify the sums
        print("\n=== Ověření součtů ===")
        solution = await page.evaluate("() => DKS.puzzle.solution._values")
        for i, cage in enumerate(cage_map):
            total = sum(solution[c['row']][c['col']] for c in cage['cells'])
            # The marker was an empty string in both branches, which made a
            # mismatch indistinguishable from a match.
            ok = "OK" if total == cage['sum'] else "CHYBA"
            print(f" Klec {i:2d}: sum={cage['sum']:2d}, actual={total:2d} {ok}")

        await browser.close()


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,35 @@
"""
Zjistí rozsah puzzle v sudoku_killer tabulce a počet.
"""
import sys
from pathlib import Path
sys.stdout.reconfigure(encoding="utf-8")
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Knihovny"))
from mysql_db import connect_mysql
# Print an overview of the `puzzle` database: per-type counts and number/date
# ranges in `sudoku_killer`, the rows of `puzzle_type`, and how many
# killer-sudoku rows already exist in `puzzles`.
TYPE_SUMMARY_SQL = """
SELECT puzzle_type_id, COUNT(*), MIN(puzzle_number), MAX(puzzle_number),
MIN(puzzle_date), MAX(puzzle_date)
FROM sudoku_killer
GROUP BY puzzle_type_id
"""
PUZZLE_TYPES_SQL = "SELECT id, name FROM puzzle_type"
MIGRATED_COUNT_SQL = """
SELECT COUNT(*) FROM puzzles WHERE game_type = 'killer_sudoku'
"""

conn = connect_mysql(database="puzzle")
try:
    cur = conn.cursor()
    try:
        cur.execute(TYPE_SUMMARY_SQL)
        for row in cur.fetchall():
            print(f" type_id={row[0]}, count={row[1]}, nums={row[2]}-{row[3]}, dates={row[4]}-{row[5]}")

        cur.execute(PUZZLE_TYPES_SQL)
        for row in cur.fetchall():
            print(f" puzzle_type: id={row[0]}, name={row[1]}")

        cur.execute(MIGRATED_COUNT_SQL)
        print(f" Už v puzzles tabulce: {cur.fetchone()[0]}")
    finally:
        # Release the cursor even when one of the queries fails
        # (e.g. a table is missing).
        cur.close()
finally:
    # Always give the connection back, whatever happened above.
    conn.close()
@@ -0,0 +1,279 @@
"""
Stáhne / přejmenuje Greater-Than Killer Sudoku puzzle + solutions z dailykillersudoku.com.
Název souboru: YYYY-MM-DD Puzzle SudokuKillerGreaterThan {n} [difficulty {d} of 10] [average solving time {t}].pdf
Logika:
1. Načte všechna čísla GT puzzlů ze search (t=4, d=2..10, všechny stránky)
2. Pro každé číslo:
- existuje SudokuKillerGreaterThan {n} → přeskočit
- existuje SudokuKiller {n} → přejmenovat na SudokuKillerGreaterThan
- jinak → stáhnout z /pdfs/{n}.pdf
Spuštění:
python stahni_greater_than.py
"""
import re
import sys
import time
import threading
from datetime import datetime
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
from bs4 import BeautifulSoup
# Switch both console streams to UTF-8; the messages printed below contain
# non-ASCII (Czech) characters.
sys.stdout.reconfigure(encoding="utf-8")
sys.stderr.reconfigure(encoding="utf-8")
# Root URL of the puzzle site; every search and PDF URL is built from it.
BASE_URL = "https://www.dailykillersudoku.com"
# Target directory next to this script; created at import time if missing.
SAVE_DIR = Path(__file__).parent / "DownloadedPuzzles"
SAVE_DIR.mkdir(exist_ok=True)
DELAY = 0.1 # seconds between requests within a single thread
# Number of worker threads used by main().
NUM_THREADS = 6
# One shared HTTP session (with a custom User-Agent) used for all requests.
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": "Mozilla/5.0 (compatible; puzzle-downloader/1.0)"})
# Held by tprint() while printing so output from worker threads is serialized.
_print_lock = threading.Lock()
def tname() -> str:
    """Return a short bracketed label identifying the calling thread.

    The main thread is labelled ``[Hlavní]``. For any other thread the text
    after the last underscore of its name is parsed as an integer and shown
    one-based as ``[T<n>]``; if that text is not an integer, the first eight
    characters of the thread name are used instead.
    """
    current = threading.current_thread().name
    if current == "MainThread":
        return "[Hlavní]"
    worker_index = current.rsplit("_", 1)[-1]
    try:
        return f"[T{int(worker_index) + 1}]"
    except (ValueError, IndexError):
        return f"[{current[:8]}]"
def tprint(*args, **kwargs):
    """Print like ``print`` but prefixed with the calling thread's label.

    The module-level print lock is held for the whole call, so only one
    thread prints at a time.
    """
    _print_lock.acquire()
    try:
        print(tname(), *args, **kwargs)
    finally:
        _print_lock.release()
# ---------------------------------------------------------------------------
# Získání čísel GT puzzlů ze search
# ---------------------------------------------------------------------------
def get_page_puzzle_ids(d: int, page: int) -> list[int]:
    """Return the puzzle numbers listed on one page of the search results.

    Requests ``/search?d=<d>&t=4&p=<page>`` and collects every number found in
    an ``id="board<number>"`` attribute of the response text. A failed request
    is reported through ``tprint`` and yields an empty list.
    """
    url = f"{BASE_URL}/search?d={d}&t=4&p={page}"
    try:
        resp = SESSION.get(url, timeout=15)
    except requests.RequestException as e:
        tprint(f" Chyba načítání search d={d} p={page}: {e}")
        return []
    board_matches = re.finditer(r'id="board(\d+)"', resp.text)
    return [int(match.group(1)) for match in board_matches]
def get_max_page(d: int) -> int:
    """Return the highest search-result page number for difficulty ``d``.

    Loads ``/search?d=<d>&t=4&s=0`` and takes the largest ``p=<number>`` found
    in links pointing to ``/search?...``. Returns 1 when no such link is
    present and 0 when the request itself fails.
    """
    url = f"{BASE_URL}/search?d={d}&t=4&s=0"
    try:
        resp = SESSION.get(url, timeout=15)
    except requests.RequestException:
        return 0
    page_numbers = [
        int(p) for p in re.findall(r'href="/search\?[^"]*p=(\d+)"', resp.text)
    ]
    if not page_numbers:
        return 1
    return max(page_numbers)
def collect_all_gt_numbers() -> list[int]:
    """Walk the search results (d=2..10, t=4) and return all puzzle numbers, sorted.

    Difficulties for which ``get_max_page`` returns 0 are skipped. The function
    sleeps ``DELAY`` seconds after every page request.
    """
    all_ids = set()
    for d in range(2, 11):
        max_p = get_max_page(d)
        if max_p != 0:
            tprint(f" Difficulty {d}: {max_p} stránek")
            for page in range(1, max_p + 1):
                all_ids.update(get_page_puzzle_ids(d, page))
                time.sleep(DELAY)
    return sorted(all_ids)
# ---------------------------------------------------------------------------
# Čtení existujících souborů
# ---------------------------------------------------------------------------
def find_downloaded_killer() -> dict[int, Path]:
    """Map puzzle number to path for plain SudokuKiller puzzle PDFs in SAVE_DIR.

    Solution files (name contains ``[solution]``) and GreaterThan files are
    left out; files whose name carries no ``SudokuKiller <number>`` are ignored.
    """
    number_pattern = re.compile(r"SudokuKiller (\d+)")
    found = {}
    for pdf in SAVE_DIR.glob("*Puzzle SudokuKiller *.pdf"):
        file_name = pdf.name
        if "[solution]" not in file_name and "GreaterThan" not in file_name:
            hit = number_pattern.search(file_name)
            if hit:
                found[int(hit.group(1))] = pdf
    return found
def find_downloaded_gt() -> set[int]:
    """Return the numbers of SudokuKillerGreaterThan puzzle PDFs present in SAVE_DIR.

    Solution files (name contains ``[solution]``) are not considered.
    """
    puzzle_names = (
        pdf.name
        for pdf in SAVE_DIR.glob("*Puzzle SudokuKillerGreaterThan *.pdf")
        if "[solution]" not in pdf.name
    )
    hits = (re.search(r"SudokuKillerGreaterThan (\d+)", name) for name in puzzle_names)
    return {int(hit.group(1)) for hit in hits if hit}
# ---------------------------------------------------------------------------
# Přejmenování / stažení
# ---------------------------------------------------------------------------
def killer_to_gt_filename(path: Path) -> str:
    """Return the file name of ``path`` with every ``SudokuKiller `` turned into ``SudokuKillerGreaterThan ``."""
    pieces = path.name.split("SudokuKiller ")
    return "SudokuKillerGreaterThan ".join(pieces)
def rename_pair(n: int, killer_path: Path) -> bool:
    """Rename a SudokuKiller puzzle PDF and its solution to the GreaterThan name.

    The solution file is looked up next to ``killer_path`` as
    ``<stem> [solution].pdf``; it is optional and silently skipped when absent.

    Args:
        n: Puzzle number (not needed for the rename itself).
        killer_path: Path of the existing SudokuKiller puzzle PDF.

    Returns:
        True when every file that had to be renamed was renamed; False when
        the puzzle file is missing or a rename failed.
    """
    solution_path = killer_path.with_name(killer_path.stem + " [solution].pdf")
    ok = True
    for f in (killer_path, solution_path):
        if not f.exists():
            if "[solution]" in f.name:
                continue  # the solution file does not have to exist
            tprint(f" Soubor nenalezen pro přejmenování: {f.name}")
            ok = False
            continue
        new_name = killer_to_gt_filename(f)
        try:
            f.rename(SAVE_DIR / new_name)
        except OSError as e:
            # E.g. target already exists (Windows) or the file is locked.
            # Report and carry on instead of letting the exception escape the
            # worker thread and abort the whole run.
            tprint(f" Chyba přejmenování {f.name}: {e}")
            ok = False
            continue
        tprint(f" Přejmenováno: {f.name}{new_name}")
    return ok
def get_puzzle_info(n: int) -> dict | None:
    """Fetch date, difficulty and average solving time for puzzle ``n``.

    Loads ``/search?n=<n>`` and reads the values from the first
    ``section.puzzle`` element of the page.

    Returns:
        A dict with keys ``date`` (``YYYY-MM-DD``), ``number``, ``difficulty``
        and ``avg_time`` (the last two are ``"?"`` when their element is
        missing), or ``None`` when the request fails, the section or a date
        part is missing, or the date cannot be parsed.
    """
    url = f"{BASE_URL}/search?n={n}"
    try:
        resp = SESSION.get(url, timeout=15)
    except requests.RequestException as e:
        tprint(f" Chyba info puzzle {n}: {e}")
        return None

    section = BeautifulSoup(resp.text, "html.parser").select_one("section.puzzle")
    if not section:
        return None

    date_parts = [
        section.select_one(selector)
        for selector in ("span.short-month", "span.day", "span.year")
    ]
    if not all(date_parts):
        return None

    date_text = " ".join(part.text.strip() for part in date_parts)
    try:
        date_iso = datetime.strptime(date_text, "%b %d %Y").strftime("%Y-%m-%d")
    except ValueError:
        return None

    def text_or_unknown(selector):
        # Stripped text of the first matching element, or "?" when absent.
        element = section.select_one(selector)
        return element.text.strip() if element else "?"

    return {
        "date": date_iso,
        "number": n,
        "difficulty": text_or_unknown("span.puzzle-difficulty-value"),
        "avg_time": text_or_unknown("span.puzzle-timing-value"),
    }
def make_filename(info: dict, solution: bool = False) -> str:
    """Build the PDF file name for a Greater-Than Killer Sudoku puzzle.

    Args:
        info: Dict with keys ``date``, ``number``, ``difficulty`` and
            ``avg_time``.
        solution: When True, `` [solution]`` is appended before ``.pdf``.

    Returns:
        ``<date> Puzzle SudokuKillerGreaterThan <number> [difficulty <d> of 10]
        [average solving time <t>].pdf`` (on one line).

    Characters that are not allowed in Windows file names are replaced by
    ``-`` in both the difficulty and the average time. Either value can be the
    placeholder ``"?"``, which would otherwise produce an unusable file name.
    """
    forbidden = r'[\\/:*?"<>|]'
    difficulty = re.sub(forbidden, "-", str(info["difficulty"]))
    avg_time = re.sub(forbidden, "-", str(info["avg_time"]))
    suffix = " [solution]" if solution else ""
    return (
        f"{info['date']} Puzzle SudokuKillerGreaterThan {info['number']} "
        f"[difficulty {difficulty} of 10] "
        f"[average solving time {avg_time}]{suffix}.pdf"
    )
def download_pdf(n: int, info: dict, solution: bool = False) -> bool:
    """Download the puzzle (or solution) PDF for puzzle ``n`` into SAVE_DIR.

    The target name comes from ``make_filename``; the source is
    ``/pdfs/<n>.pdf`` or ``/pdfs/<n>.solution.pdf``.

    Returns:
        True when the file already exists or was written; False when the
        request fails, the status is not 200, or the response content type
        starts with ``text/html``.
    """
    filename = make_filename(info, solution)
    filepath = SAVE_DIR / filename
    if filepath.exists():
        return True

    pdf_url = f"{BASE_URL}/pdfs/{n}.solution.pdf" if solution else f"{BASE_URL}/pdfs/{n}.pdf"
    try:
        resp = SESSION.get(pdf_url, timeout=30)
    except requests.RequestException as e:
        tprint(f" Chyba stahování {pdf_url}: {e}")
        return False

    if resp.status_code != 200:
        tprint(f" PDF nedostupné (HTTP {resp.status_code}): {pdf_url}")
        return False

    content_type = resp.headers.get("content-type", "")
    if content_type.startswith("text/html"):
        tprint(f" PDF vrátilo HTML: {pdf_url}")
        return False

    filepath.write_bytes(resp.content)
    tprint(f" Staženo: {filename}")
    return True
def process_puzzle(n: int, idx: int, total: int,
                   killer_map: dict[int, Path]) -> bool:
    """Handle one Greater-Than puzzle: rename an existing download or fetch it.

    If ``n`` is a key of ``killer_map`` the existing SudokuKiller files are
    renamed; otherwise the puzzle info is looked up and the puzzle and solution
    PDFs are downloaded, sleeping ``DELAY`` seconds after the info request and
    between the two downloads.

    Returns:
        The result of ``rename_pair`` in the rename case; otherwise True only
        when both downloads succeeded.
    """
    tprint(f"[{idx}/{total}] Puzzle #{n}")
    if n not in killer_map:
        # Not available as a SudokuKiller file -> download it.
        info = get_puzzle_info(n)
        time.sleep(DELAY)
        if not info:
            tprint(f" Puzzle {n}: info stránka nenalezena")
            return False
        puzzle_ok = download_pdf(n, info, solution=False)
        time.sleep(DELAY)
        solution_ok = download_pdf(n, info, solution=True)
        return puzzle_ok and solution_ok
    return rename_pair(n, killer_map[n])
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Collect all Greater-Than puzzle numbers and rename or download each one.

    Numbers that already have a SudokuKillerGreaterThan file are skipped. The
    rest are processed by ``NUM_THREADS`` worker threads, and a summary of
    successes and failures is printed at the end.
    """
    tprint("Sbírám čísla GT puzzlů ze search (d=2..10, t=4)...")
    gt_numbers = collect_all_gt_numbers()
    tprint(f"Celkem GT puzzlů nalezeno: {len(gt_numbers)}")

    already_gt = find_downloaded_gt()
    killer_map = find_downloaded_killer()
    to_process = [n for n in gt_numbers if n not in already_gt]
    tprint(f"Již hotovo (GreaterThan): {len(already_gt)}")
    tprint(f"Ke zpracování: {len(to_process)}")
    if not to_process:
        tprint("Vše již zpracováno.")
        return

    rename_count = sum(1 for n in to_process if n in killer_map)
    download_count = len(to_process) - rename_count
    tprint(f" → přejmenovat: {rename_count}, stáhnout: {download_count}")

    ok_count = 0
    err_count = 0
    total = len(to_process)
    tprint(f"Spouštím {NUM_THREADS} vláken...")
    # The thread name prefix ends in "-0" so worker names become
    # "ThreadPoolExecutor-0_<i>", which is the form tname() parses.
    with ThreadPoolExecutor(max_workers=NUM_THREADS,
                            thread_name_prefix="ThreadPoolExecutor-0") as executor:
        futures = {
            executor.submit(process_puzzle, n, idx, total, killer_map): n
            for idx, n in enumerate(to_process, 1)
        }
        for future in as_completed(futures):
            n = futures[future]
            try:
                succeeded = future.result()
            except Exception as e:
                # future.result() re-raises whatever the worker raised (for
                # example an OSError while writing a file). Count it as a
                # failed puzzle so the other results and the final summary
                # are not lost.
                tprint(f" Puzzle {n}: neočekávaná chyba: {e}")
                succeeded = False
            if succeeded:
                ok_count += 1
            else:
                err_count += 1
    tprint(f"\nHotovo. Úspěšně: {ok_count}, chyby: {err_count}")


if __name__ == "__main__":
    main()
@@ -0,0 +1,230 @@
"""
Stáhne všechna Killer Sudoku puzzle + solutions z dailykillersudoku.com jako PDF.
Název souboru: YYYY-MM-DD Puzzle SudokuKiller {n} [difficulty {d} of 10] [average solving time {t}].pdf
Spuštění:
python stahni_killer_sudoku.py # stáhne vše nové od posledního stažení
python stahni_killer_sudoku.py --all # projde všechna čísla znovu (přeskočí existující)
"""
import re
import sys
import time
import threading
import argparse
from datetime import datetime
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
from bs4 import BeautifulSoup
# Switch both console streams to UTF-8; the messages printed below contain
# non-ASCII (Czech) characters.
sys.stdout.reconfigure(encoding="utf-8")
sys.stderr.reconfigure(encoding="utf-8")
# Root URL of the puzzle site; every search and PDF URL is built from it.
BASE_URL = "https://www.dailykillersudoku.com"
# Target directory next to this script; created at import time if missing.
SAVE_DIR = Path(__file__).parent / "DownloadedPuzzles"
SAVE_DIR.mkdir(exist_ok=True)
DELAY = 0.1 # seconds between requests within a single thread
NUM_THREADS = 6 # number of concurrent threads
# How many puzzles to download (starting from the lowest missing one).
# 0 = download all missing puzzles up to the current one.
AMOUNT_TO_DOWNLOAD = 0
# One shared HTTP session (with a custom User-Agent) used for all requests.
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": "Mozilla/5.0 (compatible; puzzle-downloader/1.0)"})
# Held by tprint() while printing so output from worker threads is serialized.
_print_lock = threading.Lock()
def tname() -> str:
    """Return a short bracketed label identifying the calling thread.

    The main thread is labelled ``[Hlavní]``. For any other thread the text
    after the last underscore of its name is parsed as an integer and shown
    one-based as ``[T<n>]``; if that text is not an integer, the first eight
    characters of the thread name are used instead.
    """
    current = threading.current_thread().name
    if current == "MainThread":
        return "[Hlavní]"
    worker_index = current.rsplit("_", 1)[-1]
    try:
        return f"[T{int(worker_index) + 1}]"
    except (ValueError, IndexError):
        return f"[{current[:8]}]"
def tprint(*args, **kwargs):
    """Print like ``print`` but prefixed with the calling thread's label.

    The module-level print lock is held for the whole call, so only one
    thread prints at a time.
    """
    _print_lock.acquire()
    try:
        print(tname(), *args, **kwargs)
    finally:
        _print_lock.release()
def puzzle_exists(n: int) -> bool:
    """Tell whether the search page for puzzle ``n`` contains a puzzle section.

    Returns True when the text of ``/search?n=<n>`` contains
    ``section class="puzzle``. A failed request is reported as False.
    """
    try:
        page_html = SESSION.get(f"{BASE_URL}/search?n={n}", timeout=15).text
    except requests.RequestException:
        return False
    return 'section class="puzzle' in page_html
def get_max_puzzle_number() -> int:
    """Find the number of the newest puzzle by binary search over 1..99999.

    Each probe calls ``puzzle_exists`` and is followed by a 0.5 s pause. The
    lower bound is returned once the search interval has collapsed.
    """
    low, high = 1, 99999
    while high > low:
        # Upper midpoint, so that `low = probe` always shrinks the interval.
        probe = low + (high - low + 1) // 2
        if not puzzle_exists(probe):
            high = probe - 1
        else:
            low = probe
        time.sleep(0.5)
    return low
def get_puzzle_info(n: int) -> dict | None:
    """Fetch date, difficulty and average solving time for puzzle ``n``.

    Loads ``/search?n=<n>`` and reads the values from the first
    ``section.puzzle`` element of the page. Every failure is reported through
    ``tprint``.

    Returns:
        A dict with keys ``date`` (``YYYY-MM-DD``), ``number``, ``difficulty``
        and ``avg_time`` (the last two are ``"?"`` when their element is
        missing), or ``None`` when the request fails, the status is not 200,
        the section or a date part is missing, or the date cannot be parsed.
    """
    url = f"{BASE_URL}/search?n={n}"
    try:
        resp = SESSION.get(url, timeout=15)
    except requests.RequestException as e:
        tprint(f" Chyba při načítání info puzzle {n}: {e}")
        return None
    if resp.status_code != 200:
        tprint(f" Puzzle {n}: info stránka nedostupná (HTTP {resp.status_code})")
        return None

    section = BeautifulSoup(resp.text, "html.parser").select_one("section.puzzle")
    if not section:
        tprint(f" Puzzle {n}: nenalezena sekce section.puzzle")
        return None

    date_parts = [
        section.select_one(selector)
        for selector in ("span.short-month", "span.day", "span.year")
    ]
    if not all(date_parts):
        tprint(f" Puzzle {n}: datum nenalezeno (chybí span.short-month / .day / .year)")
        return None

    date_text = " ".join(part.text.strip() for part in date_parts)
    try:
        date_iso = datetime.strptime(date_text, "%b %d %Y").strftime("%Y-%m-%d")
    except ValueError as e:
        tprint(f" Puzzle {n}: chyba parsování data ({e})")
        return None

    def text_or_unknown(selector):
        # Stripped text of the first matching element, or "?" when absent.
        element = section.select_one(selector)
        return element.text.strip() if element else "?"

    difficulty = text_or_unknown("span.puzzle-difficulty-value")
    avg_time = text_or_unknown("span.puzzle-timing-value")
    return {"date": date_iso, "number": n, "difficulty": difficulty, "avg_time": avg_time}
def make_filename(info: dict, solution: bool = False) -> str:
    """Build the PDF file name for a Killer Sudoku puzzle.

    Args:
        info: Dict with keys ``date``, ``number``, ``difficulty`` and
            ``avg_time``.
        solution: When True, `` [solution]`` is appended before ``.pdf``.

    Returns:
        ``<date> Puzzle SudokuKiller <number> [difficulty <d> of 10]
        [average solving time <t>].pdf`` (on one line).

    Characters that are not allowed in Windows file names are replaced by
    ``-`` in both the difficulty and the average time. Either value can be the
    placeholder ``"?"``, which would otherwise produce an unusable file name.
    """
    forbidden = r'[\\/:*?"<>|]'
    difficulty = re.sub(forbidden, "-", str(info["difficulty"]))
    avg_time = re.sub(forbidden, "-", str(info["avg_time"]))
    suffix = " [solution]" if solution else ""
    return (
        f"{info['date']} Puzzle SudokuKiller {info['number']} "
        f"[difficulty {difficulty} of 10] "
        f"[average solving time {avg_time}]{suffix}.pdf"
    )
def download_pdf(n: int, info: dict, solution: bool = False) -> bool:
    """Download the puzzle (or solution) PDF for puzzle ``n`` into SAVE_DIR.

    The target name comes from ``make_filename``; the source is
    ``/pdfs/<n>.pdf`` or ``/pdfs/<n>.solution.pdf``. Every outcome is reported
    through ``tprint``.

    Returns:
        True when the file already exists or was written; False when the
        request fails, the status is not 200, or the response content type
        starts with ``text/html``.
    """
    filename = make_filename(info, solution)
    filepath = SAVE_DIR / filename
    if filepath.exists():
        tprint(f" Přeskočeno (existuje): {filename}")
        return True

    pdf_url = f"{BASE_URL}/pdfs/{n}.solution.pdf" if solution else f"{BASE_URL}/pdfs/{n}.pdf"
    try:
        resp = SESSION.get(pdf_url, timeout=30)
    except requests.RequestException as e:
        tprint(f" Chyba stahování {pdf_url}: {e}")
        return False

    if resp.status_code != 200:
        tprint(f" PDF nedostupné (HTTP {resp.status_code}): {pdf_url}")
        return False

    content_type = resp.headers.get("content-type", "")
    if content_type.startswith("text/html"):
        tprint(f" PDF vrátilo HTML místo binárního obsahu: {pdf_url}")
        return False

    filepath.write_bytes(resp.content)
    tprint(f" Uloženo: {filename}")
    return True
def process_puzzle(n: int, idx: int, total: int) -> bool:
    """Look up puzzle ``n`` and download its puzzle and solution PDFs.

    Sleeps ``DELAY`` seconds after the info request and between the two
    downloads.

    Returns:
        False when no puzzle info could be obtained; otherwise True only when
        both downloads succeeded.
    """
    tprint(f"[{idx}/{total}] Puzzle #{n}...")
    info = get_puzzle_info(n)
    time.sleep(DELAY)
    if info:
        puzzle_ok = download_pdf(n, info, solution=False)
        time.sleep(DELAY)
        solution_ok = download_pdf(n, info, solution=True)
        return puzzle_ok and solution_ok
    return False
def find_already_downloaded() -> set[int]:
    """Return the puzzle numbers that already have a PDF in SAVE_DIR.

    A number counts as downloaded when any matching file (puzzle or solution)
    carries it, whether the file is named ``SudokuKiller <n>`` or
    ``SudokuKillerGreaterThan <n>``. Both variants use the same numbering, so
    a file renamed to the GreaterThan form must not be treated as missing and
    downloaded a second time under the plain name.
    """
    number_pattern = re.compile(r"SudokuKiller(?:GreaterThan)? (\d+)")
    downloaded = set()
    # The glob has no space after "SudokuKiller", so it yields both variants.
    for f in SAVE_DIR.glob("*Puzzle SudokuKiller*.pdf"):
        m = number_pattern.search(f.name)
        if m:
            downloaded.add(int(m.group(1)))
    return downloaded
def main():
    """Command-line entry point: download Killer Sudoku puzzle and solution PDFs.

    Options:
        --all    process every number in the range (existing files are skipped
                 by the download step) instead of only the missing numbers
        --start  first puzzle number (default 1)
        --end    last puzzle number (default: the newest puzzle found)

    When ``AMOUNT_TO_DOWNLOAD`` is greater than 0, only that many puzzles from
    the front of the work list are processed. The work is spread over
    ``NUM_THREADS`` worker threads and a summary is printed at the end.
    """
    parser = argparse.ArgumentParser(description="Stáhne Killer Sudoku PDF")
    parser.add_argument("--all", action="store_true", help="Projde všechna čísla od 1 (přeskočí existující)")
    parser.add_argument("--start", type=int, default=1, help="Začáteční číslo puzzle (výchozí: 1)")
    parser.add_argument("--end", type=int, default=None, help="Koncové číslo puzzle (výchozí: aktuální)")
    args = parser.parse_args()

    tprint("Zjišťuji aktuální číslo puzzle...")
    max_n = get_max_puzzle_number()
    tprint(f"Aktuální nejvyšší puzzle: #{max_n}")
    end_n = args.end if args.end else max_n
    start_n = args.start

    downloaded = find_already_downloaded()
    if args.all:
        to_download = list(range(start_n, end_n + 1))
        tprint(f"Projdu všechna puzzle #{start_n}#{end_n} (přeskočím existující soubory)")
    else:
        to_download = [n for n in range(start_n, end_n + 1) if n not in downloaded]
        tprint(f"Již staženo: {len(downloaded)} puzzle, zbývá stáhnout: {len(to_download)}")

    if AMOUNT_TO_DOWNLOAD > 0:
        to_download = to_download[:AMOUNT_TO_DOWNLOAD]
        tprint(f"AMOUNT_TO_DOWNLOAD={AMOUNT_TO_DOWNLOAD} → stáhnu prvních {len(to_download)} chybějících")
    if not to_download:
        tprint("Vše je již staženo.")
        return

    total = len(to_download)
    ok_count = 0
    err_count = 0
    tprint(f"Spouštím {NUM_THREADS} vláken...")
    # The thread name prefix ends in "-0" so worker names become
    # "ThreadPoolExecutor-0_<i>", which is the form tname() parses.
    with ThreadPoolExecutor(max_workers=NUM_THREADS, thread_name_prefix="ThreadPoolExecutor-0") as executor:
        futures = {
            executor.submit(process_puzzle, n, idx, total): n
            for idx, n in enumerate(to_download, 1)
        }
        for future in as_completed(futures):
            n = futures[future]
            try:
                succeeded = future.result()
            except Exception as e:
                # future.result() re-raises whatever the worker raised (for
                # example an OSError while writing a file). Count it as a
                # failed puzzle so the other results and the final summary
                # are not lost.
                tprint(f" Puzzle {n}: neočekávaná chyba: {e}")
                succeeded = False
            if succeeded:
                ok_count += 1
            else:
                err_count += 1
    tprint(f"\nHotovo. Úspěšně: {ok_count}, chyby: {err_count}")


if __name__ == "__main__":
    main()