notebookvb
This commit is contained in:
@@ -0,0 +1,151 @@
|
||||
"""
|
||||
Naimportuje stažené PDF puzzle z DownloadedPuzzles/ do MySQL tabulky sudoku_killer.
|
||||
|
||||
Spuštění:
|
||||
python import_do_mysql.py # přeskočí již existující (podle puzzle_number)
|
||||
python import_do_mysql.py --all # reimportuje vše (přepíše existující)
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent / "Knihovny"))
|
||||
from mysql_db import connect_mysql
|
||||
|
||||
sys.stdout.reconfigure(encoding="utf-8")
|
||||
sys.stderr.reconfigure(encoding="utf-8")
|
||||
|
||||
SAVE_DIR = Path(__file__).parent / "DownloadedPuzzles"
|
||||
|
||||
# Example of a file name this pattern accepts:
# 2009-01-01 Puzzle SudokuKiller 1 [difficulty 5 of 10] [average solving time 47 min].pdf
# The space-separated fields are listed one per line; the optional
# " [solution]" marker and the ".pdf" extension follow the last field directly.
FILENAME_RE = re.compile(
    "^"
    + " ".join(
        (
            r"(?P<date>\d{4}-\d{2}-\d{2})",
            r"Puzzle",
            r"(?P<type>SudokuKillerGreaterThan|SudokuKiller)",
            r"(?P<num>\d+)",
            r"\[difficulty (?P<diff>\d+) of (?P<maxdiff>\d+)\]",
            r"\[average solving time (?P<time>[^\]]+)\]",
        )
    )
    + r"(?P<solution> \[solution\])?\.pdf$"
)
|
||||
|
||||
def parse_time_to_minutes(time_str):
    """Convert an average-solving-time label to a whole number of minutes.

    Recognised forms (case-insensitive, surrounding whitespace ignored):

    * minutes only: ``'47 min'``, ``'47 mins'``, ``'47m'``
    * hours and minutes: ``'1h 7m'``, ``'17h 44m'``, ``'1h7m'``
    * hours only: ``'2h'``

    Returns:
        The total number of minutes as an int, or None when the label is
        empty, None, or in an unrecognised format.
    """
    if not time_str:
        return None
    label = time_str.strip()

    # Hours, optionally followed by minutes.
    hours_match = re.fullmatch(
        r"(\d+)\s*h(?:\s*(\d+)\s*m(?:in)?)?", label, re.IGNORECASE
    )
    if hours_match:
        hours = int(hours_match.group(1))
        minutes = int(hours_match.group(2) or 0)
        return hours * 60 + minutes

    # Minutes only.
    minutes_match = re.fullmatch(r"(\d+)\s*m(?:ins?)?", label, re.IGNORECASE)
    if minutes_match:
        return int(minutes_match.group(1))

    return None
|
||||
|
||||
def load_puzzle_types(cursor):
    """Return a ``{type name: type id}`` mapping read from ``puzzle_type``.

    Rows are indexed by column name, so ``cursor`` must yield mapping-style
    rows.
    """
    cursor.execute("SELECT id, name FROM puzzle_type")
    name_to_id = {}
    for record in cursor.fetchall():
        name_to_id[record["name"]] = record["id"]
    return name_to_id
|
||||
|
||||
def load_existing_numbers(cursor):
    """Return the set of ``puzzle_number`` values stored in ``sudoku_killer``.

    Rows are indexed by column name, so ``cursor`` must yield mapping-style
    rows.
    """
    cursor.execute("SELECT puzzle_number FROM sudoku_killer")
    known_numbers = set()
    for record in cursor.fetchall():
        known_numbers.add(record["puzzle_number"])
    return known_numbers
|
||||
|
||||
def parse_files():
    """Scan SAVE_DIR and group puzzle/solution PDFs by puzzle number.

    Returns:
        dict mapping puzzle number to a dict with the keys
        ``puzzle_number``, ``puzzle_date``, ``puzzle_type``, ``difficulty``,
        ``max_difficulty``, ``avg_minutes``, ``file_puzzle`` and
        ``file_solution`` (the last two are Paths or None). Metadata is taken
        from the first file seen for a number. An empty dict is returned when
        the directory does not exist.

    Entries whose names do not match FILENAME_RE are reported on stderr and
    skipped.
    """
    if not SAVE_DIR.is_dir():
        # A missing download directory is a user-level problem; report it
        # instead of failing with a traceback from iterdir().
        print(f"[CHYBA] Adresář nenalezen: {SAVE_DIR}", file=sys.stderr)
        return {}

    puzzles = {}
    # Sorted so that "first file wins" metadata and the order of the [SKIP]
    # messages are reproducible between runs.
    for f in sorted(SAVE_DIR.iterdir()):
        m = FILENAME_RE.match(f.name)
        if not m:
            print(f"[SKIP] Nerozpoznaný název: {f.name}", file=sys.stderr)
            continue

        num = int(m.group("num"))
        if num not in puzzles:
            puzzles[num] = {
                "puzzle_number": num,
                "puzzle_date": m.group("date"),
                "puzzle_type": m.group("type"),
                "difficulty": int(m.group("diff")),
                "max_difficulty": int(m.group("maxdiff")),
                "avg_minutes": parse_time_to_minutes(m.group("time")),
                "file_puzzle": None,
                "file_solution": None,
            }

        # The optional " [solution]" marker decides which slot the file fills.
        slot = "file_solution" if m.group("solution") else "file_puzzle"
        puzzles[num][slot] = f
    return puzzles
|
||||
|
||||
def import_puzzle(cursor, puzzle, type_ids):
    """Insert one puzzle into ``sudoku_killer``, updating it on a key clash.

    Args:
        cursor: DB cursor whose ``execute(sql, params)`` accepts ``%s``
            placeholders.
        puzzle: one entry as produced by ``parse_files``.
        type_ids: mapping of puzzle type name to its ``puzzle_type`` id.

    Returns:
        True when the statement was executed; False (after printing a [SKIP]
        line) when the puzzle PDF is missing or the puzzle type is unknown.
    """
    puzzle_path = puzzle["file_puzzle"]
    if puzzle_path is None:
        print(f"[SKIP] puzzle_number={puzzle['puzzle_number']}: chybí PDF puzzlu")
        return False

    type_id = type_ids.get(puzzle["puzzle_type"])
    if type_id is None:
        print(f"[SKIP] Neznámý typ: {puzzle['puzzle_type']}")
        return False

    # Both PDFs are stored as raw bytes; the solution is optional.
    pdf_puzzle = puzzle_path.read_bytes()
    solution_path = puzzle["file_solution"]
    if solution_path:
        pdf_solution = solution_path.read_bytes()
    else:
        pdf_solution = None

    # NOTE(review): the update branch presumably relies on a unique key
    # covering puzzle_number - confirm against the table schema.
    upsert_sql = """
        INSERT INTO sudoku_killer
            (puzzle_number, puzzle_type_id, puzzle_date, difficulty, max_difficulty,
             avg_solving_time_minutes, file_puzzle, file_solution)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE
            puzzle_type_id = VALUES(puzzle_type_id),
            puzzle_date = VALUES(puzzle_date),
            difficulty = VALUES(difficulty),
            max_difficulty = VALUES(max_difficulty),
            avg_solving_time_minutes = VALUES(avg_solving_time_minutes),
            file_puzzle = VALUES(file_puzzle),
            file_solution = VALUES(file_solution)
    """
    row_values = (
        puzzle["puzzle_number"],
        type_id,
        puzzle["puzzle_date"],
        puzzle["difficulty"],
        puzzle["max_difficulty"],
        puzzle["avg_minutes"],
        pdf_puzzle,
        pdf_solution,
    )
    cursor.execute(upsert_sql, row_values)
    return True
|
||||
|
||||
def main():
    """Import the downloaded puzzle PDFs into the ``puzzle`` MySQL database.

    Without ``--all`` puzzles whose number is already stored are skipped;
    with ``--all`` every puzzle found on disk is written again. Per-puzzle
    failures are reported on stderr and counted, they do not stop the run.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--all", action="store_true", help="Reimportuje i existující záznamy")
    args = parser.parse_args()

    import pymysql.cursors

    conn = connect_mysql(database="puzzle", cursorclass=pymysql.cursors.DictCursor)
    try:
        cursor = conn.cursor()
        try:
            type_ids = load_puzzle_types(cursor)
            existing = set() if args.all else load_existing_numbers(cursor)

            puzzles = parse_files()
            total = len(puzzles)
            print(f"Nalezeno {total} puzzle v adresáři.")

            imported = skipped = errors = 0
            for i, (num, puzzle) in enumerate(sorted(puzzles.items()), 1):
                if num in existing:
                    skipped += 1
                else:
                    try:
                        if import_puzzle(cursor, puzzle, type_ids):
                            imported += 1
                        else:
                            errors += 1
                    except Exception as e:
                        # Top-level boundary: one bad puzzle must not abort
                        # the whole import.
                        print(f"[CHYBA] puzzle_number={num}: {e}", file=sys.stderr)
                        errors += 1

                # Reported for skipped puzzles too, so progress stays visible
                # on a mostly up-to-date database.
                if i % 500 == 0:
                    # Commit in batches to keep the open transaction small.
                    conn.commit()
                    print(f" {i}/{total} zpracováno ({imported} importováno, {skipped} přeskočeno, {errors} chyb)")

            # PyMySQL connections do not autocommit by default; this is a
            # no-op if connect_mysql already enables autocommit
            # (NOTE(review): confirm against the helper).
            conn.commit()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the import fails early.
        conn.close()

    print(f"\nHotovo: {imported} importováno, {skipped} přeskočeno, {errors} chyb.")
|
||||
|
||||
# Run the importer only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,279 @@
|
||||
"""
|
||||
Stáhne / přejmenuje Greater-Than Killer Sudoku puzzle + solutions z dailykillersudoku.com.
|
||||
Název souboru: YYYY-MM-DD Puzzle SudokuKillerGreaterThan {n} [difficulty {d} of 10] [average solving time {t}].pdf
|
||||
|
||||
Logika:
|
||||
1. Načte všechna čísla GT puzzlů ze search (t=4, d=2..10, všechny stránky)
|
||||
2. Pro každé číslo:
|
||||
- existuje SudokuKillerGreaterThan {n} → přeskočit
|
||||
- existuje SudokuKiller {n} → přejmenovat na SudokuKillerGreaterThan
|
||||
- jinak → stáhnout z /pdfs/{n}.pdf
|
||||
|
||||
Spuštění:
|
||||
python stahni_greater_than.py
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import threading
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
sys.stdout.reconfigure(encoding="utf-8")
|
||||
sys.stderr.reconfigure(encoding="utf-8")
|
||||
|
||||
BASE_URL = "https://www.dailykillersudoku.com"
|
||||
SAVE_DIR = Path(__file__).parent / "DownloadedPuzzles"
|
||||
SAVE_DIR.mkdir(exist_ok=True)
|
||||
DELAY = 0.1 # sekundy mezi requesty v rámci jednoho vlákna
|
||||
NUM_THREADS = 6
|
||||
|
||||
SESSION = requests.Session()
|
||||
SESSION.headers.update({"User-Agent": "Mozilla/5.0 (compatible; puzzle-downloader/1.0)"})
|
||||
|
||||
_print_lock = threading.Lock()
|
||||
|
||||
|
||||
def tname() -> str:
    """Return a short bracketed tag identifying the calling thread.

    The main thread yields ``[Hlavní]``. A thread whose name ends in
    ``_<number>`` yields ``[T<number + 1>]``. Any other thread yields the
    first eight characters of its name in brackets.
    """
    current = threading.current_thread().name
    if current == "MainThread":
        return "[Hlavní]"

    # Text after the last underscore (the whole name if there is none).
    worker_suffix = current.rpartition("_")[2]
    try:
        ordinal = int(worker_suffix) + 1
    except ValueError:
        return f"[{current[:8]}]"
    return f"[T{ordinal}]"
|
||||
|
||||
|
||||
def tprint(*args, **kwargs):
    """Print ``args`` prefixed with the calling thread's tag.

    The print happens while holding the module-level lock, so lines from
    different threads are not interleaved. ``kwargs`` are passed to ``print``.
    """
    _print_lock.acquire()
    try:
        print(tname(), *args, **kwargs)
    finally:
        _print_lock.release()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Získání čísel GT puzzlů ze search
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def get_page_puzzle_ids(d: int, page: int) -> list[int]:
    """Return the puzzle numbers listed on one search result page.

    Args:
        d: value for the ``d`` query parameter of the search URL.
        page: value for the ``p`` query parameter of the search URL.

    Returns:
        The numbers from every ``id="board<number>"`` attribute in the
        response text, in document order; an empty list when the request
        fails (the error is reported through ``tprint``).
    """
    url = f"{BASE_URL}/search?d={d}&t=4&p={page}"
    try:
        resp = SESSION.get(url, timeout=15)
    except requests.RequestException as e:
        tprint(f" Chyba načítání search d={d} p={page}: {e}")
        return []
    else:
        board_numbers = re.findall(r'id="board(\d+)"', resp.text)
        return list(map(int, board_numbers))
|
||||
|
||||
|
||||
def get_max_page(d: int) -> int:
    """Return the highest result page number for search parameter ``d``.

    The first search page is fetched and every link of the form
    ``href="/search?...p=<number>"`` is inspected.

    Returns:
        The largest page number linked, 1 when the page has no such links,
        or 0 when the request itself failed (the error is reported through
        ``tprint`` so a skipped difficulty is visible in the output).
    """
    url = f"{BASE_URL}/search?d={d}&t=4&s=0"
    try:
        resp = SESSION.get(url, timeout=15)
    except requests.RequestException as e:
        tprint(f" Chyba načítání search d={d}: {e}")
        return 0
    pages = re.findall(r'href="/search\?[^"]*p=(\d+)"', resp.text)
    # default=1 already covers the "no pagination links" case.
    return max(map(int, pages), default=1)
|
||||
|
||||
|
||||
def collect_all_gt_numbers() -> list[int]:
    """Walk the search pages for d=2..10 (t=4) and return all puzzle numbers.

    Returns:
        A sorted list of the distinct numbers found. A ``d`` value for which
        ``get_max_page`` returns 0 is skipped.
    """
    found: set[int] = set()
    for difficulty in range(2, 11):
        page_count = get_max_page(difficulty)
        if page_count != 0:
            tprint(f" Difficulty {difficulty}: {page_count} stránek")
            page = 1
            while page <= page_count:
                found.update(get_page_puzzle_ids(difficulty, page))
                # Pause between consecutive search requests.
                time.sleep(DELAY)
                page += 1
    return sorted(found)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Čtení existujících souborů
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def find_downloaded_killer() -> dict[int, Path]:
    """Map puzzle number to path for plain SudokuKiller puzzle PDFs in SAVE_DIR.

    Solution files and GreaterThan files are left out.
    """
    number_re = re.compile(r"SudokuKiller (\d+)")
    plain_puzzles: dict[int, Path] = {}
    for pdf in SAVE_DIR.glob("*Puzzle SudokuKiller *.pdf"):
        filename = pdf.name
        is_solution = "[solution]" in filename
        is_greater_than = "GreaterThan" in filename
        if not (is_solution or is_greater_than):
            hit = number_re.search(filename)
            if hit:
                plain_puzzles[int(hit.group(1))] = pdf
    return plain_puzzles
|
||||
|
||||
|
||||
def find_downloaded_gt() -> set[int]:
    """Return the numbers of SudokuKillerGreaterThan puzzle PDFs in SAVE_DIR.

    Solution files are ignored, so a number is reported as soon as its puzzle
    file exists.
    """
    number_re = re.compile(r"SudokuKillerGreaterThan (\d+)")
    return {
        int(hit.group(1))
        for pdf in SAVE_DIR.glob("*Puzzle SudokuKillerGreaterThan *.pdf")
        if "[solution]" not in pdf.name and (hit := number_re.search(pdf.name))
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Přejmenování / stažení
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def killer_to_gt_filename(path: Path) -> str:
    """Return the file name of ``path`` with the type tag switched to GreaterThan.

    Every ``"SudokuKiller "`` (with trailing space) in the name becomes
    ``"SudokuKillerGreaterThan "``; the directory part is dropped.
    """
    pieces = path.name.split("SudokuKiller ")
    return "SudokuKillerGreaterThan ".join(pieces)
|
||||
|
||||
|
||||
def rename_pair(n: int, killer_path: Path) -> bool:
    """Rename a SudokuKiller puzzle PDF and its solution to GreaterThan names.

    Args:
        n: puzzle number, used only in diagnostic messages.
        killer_path: path of the puzzle PDF; the solution is expected next to
            it under the same stem plus ``" [solution].pdf"``.

    Returns:
        True when every file that exists was renamed. False when the puzzle
        file is missing, a target name is already taken, or the rename
        failed. A missing solution file is not an error.
    """
    solution_path = killer_path.with_name(killer_path.stem + " [solution].pdf")
    ok = True
    for f, required in ((killer_path, True), (solution_path, False)):
        if not f.exists():
            if required:
                tprint(f" Soubor nenalezen pro přejmenování: {f.name}")
                ok = False
            continue

        new_name = killer_to_gt_filename(f)
        new_path = SAVE_DIR / new_name
        # Path.rename silently replaces an existing target on POSIX and raises
        # on Windows; refuse explicitly so both platforms behave the same and
        # nothing is overwritten.
        if new_path.exists():
            tprint(f" Cíl již existuje, nepřejmenováno (#{n}): {new_name}")
            ok = False
            continue
        try:
            f.rename(new_path)
        except OSError as e:
            # Report and carry on; an exception escaping a worker would abort
            # the whole run.
            tprint(f" Chyba přejmenování (#{n}) {f.name}: {e}")
            ok = False
            continue
        tprint(f" Přejmenováno: {f.name} → {new_name}")
    return ok
|
||||
|
||||
|
||||
def get_puzzle_info(n: int) -> dict | None:
    """Fetch the search page for puzzle ``n`` and extract its metadata.

    Returns:
        A dict with ``date`` (ISO ``YYYY-MM-DD``), ``number``, ``difficulty``
        and ``avg_time``; the last two are ``"?"`` when their element is
        absent. None when the request fails, the ``section.puzzle`` element
        or any date part is missing, or the date cannot be parsed.
    """
    url = f"{BASE_URL}/search?n={n}"
    try:
        resp = SESSION.get(url, timeout=15)
    except requests.RequestException as e:
        tprint(f" Chyba info puzzle {n}: {e}")
        return None

    section = BeautifulSoup(resp.text, "html.parser").select_one("section.puzzle")
    if not section:
        return None

    # Month abbreviation, day and year live in three separate spans.
    date_parts = [
        section.select_one(selector)
        for selector in ("span.short-month", "span.day", "span.year")
    ]
    if not all(date_parts):
        return None

    date_text = " ".join(part.text.strip() for part in date_parts)
    try:
        parsed_date = datetime.strptime(date_text, "%b %d %Y")
    except ValueError:
        return None

    def _text_or_unknown(element):
        # "?" stands in for a value the page does not provide.
        return element.text.strip() if element else "?"

    diff_el = section.select_one("span.puzzle-difficulty-value")
    time_el = section.select_one("span.puzzle-timing-value")
    return {
        "date": parsed_date.strftime("%Y-%m-%d"),
        "number": n,
        "difficulty": _text_or_unknown(diff_el),
        "avg_time": _text_or_unknown(time_el),
    }
|
||||
|
||||
|
||||
def make_filename(info: dict, solution: bool = False) -> str:
    """Build the local PDF file name for a GreaterThan puzzle.

    Args:
        info: dict with the keys ``date``, ``number``, ``difficulty`` and
            ``avg_time``.
        solution: when True, ``" [solution]"`` is inserted before ``.pdf``.

    Returns:
        ``"<date> Puzzle SudokuKillerGreaterThan <number> [difficulty <d> of 10]
        [average solving time <t>].pdf"`` on one line. Characters that are
        not allowed in file names (``\\ / : * ? " < > |``) are replaced by
        ``-`` in both the difficulty and the time, so a ``"?"`` placeholder
        for either value still gives a usable name.
    """
    def _safe(value) -> str:
        return re.sub(r'[\\/:*?"<>|]', "-", str(value))

    suffix = " [solution]" if solution else ""
    difficulty = _safe(info["difficulty"])
    avg_time = _safe(info["avg_time"])
    return (
        f"{info['date']} Puzzle SudokuKillerGreaterThan {info['number']} "
        f"[difficulty {difficulty} of 10] "
        f"[average solving time {avg_time}]{suffix}.pdf"
    )
|
||||
|
||||
|
||||
def download_pdf(n: int, info: dict, solution: bool = False) -> bool:
    """Download the puzzle (or solution) PDF for puzzle ``n`` into SAVE_DIR.

    Args:
        n: puzzle number used in the ``/pdfs/<n>[.solution].pdf`` URL.
        info: metadata dict passed to ``make_filename`` for the local name.
        solution: when True, the solution PDF is fetched instead.

    Returns:
        True when the file already exists or was saved. False when the
        request fails, the status is not 200, the response is HTML, or the
        file cannot be written; each failure is reported through ``tprint``.
    """
    filename = make_filename(info, solution)
    filepath = SAVE_DIR / filename
    if filepath.exists():
        return True

    suffix = ".solution" if solution else ""
    pdf_url = f"{BASE_URL}/pdfs/{n}{suffix}.pdf"
    try:
        resp = SESSION.get(pdf_url, timeout=30)
    except requests.RequestException as e:
        tprint(f" Chyba stahování {pdf_url}: {e}")
        return False
    if resp.status_code != 200:
        tprint(f" PDF nedostupné (HTTP {resp.status_code}): {pdf_url}")
        return False
    if resp.headers.get("content-type", "").startswith("text/html"):
        tprint(f" PDF vrátilo HTML: {pdf_url}")
        return False

    # Write to a temporary name and move it into place afterwards, so an
    # interrupted write never leaves a truncated file under the final name
    # (which the exists() check above would treat as complete).
    tmp_path = filepath.with_name(filepath.name + ".part")
    try:
        tmp_path.write_bytes(resp.content)
        tmp_path.replace(filepath)
    except OSError as e:
        tprint(f" Chyba zápisu {filename}: {e}")
        try:
            tmp_path.unlink(missing_ok=True)
        except OSError:
            pass  # best-effort cleanup; the failure was already reported
        return False

    tprint(f" Staženo: {filename}")
    return True
|
||||
|
||||
|
||||
def process_puzzle(n: int, idx: int, total: int,
                   killer_map: dict[int, Path]) -> bool:
    """Rename or download the files for GreaterThan puzzle ``n``.

    Args:
        n: puzzle number.
        idx: 1-based position of this puzzle, for the progress line.
        total: number of puzzles being processed, for the progress line.
        killer_map: puzzle number -> path of an existing plain SudokuKiller
            PDF; such a puzzle is renamed instead of downloaded.

    Returns:
        The result of ``rename_pair`` for a renamed puzzle; otherwise True
        only when both the puzzle and the solution download succeeded.
    """
    tprint(f"[{idx}/{total}] Puzzle #{n}")
    if n in killer_map:
        return rename_pair(n, killer_map[n])

    # Not present as SudokuKiller, so it has to be downloaded.
    info = get_puzzle_info(n)
    time.sleep(DELAY)
    if not info:
        tprint(f" Puzzle {n}: info stránka nenalezena")
        return False

    outcomes = []
    for is_solution in (False, True):
        if is_solution:
            # Pause between the puzzle and the solution request.
            time.sleep(DELAY)
        outcomes.append(download_pdf(n, info, solution=is_solution))
    return all(outcomes)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main():
    """Collect all GreaterThan puzzle numbers and rename or download each one.

    Numbers that already have a SudokuKillerGreaterThan puzzle file are
    skipped. The rest are handled by a pool of NUM_THREADS worker threads,
    and a summary of successes and failures is printed at the end.
    """
    tprint("Sbírám čísla GT puzzlů ze search (d=2..10, t=4)...")
    gt_numbers = collect_all_gt_numbers()
    tprint(f"Celkem GT puzzlů nalezeno: {len(gt_numbers)}")

    already_gt = find_downloaded_gt()
    killer_map = find_downloaded_killer()

    to_process = [n for n in gt_numbers if n not in already_gt]
    tprint(f"Již hotovo (GreaterThan): {len(already_gt)}")
    tprint(f"Ke zpracování: {len(to_process)}")

    if not to_process:
        tprint("Vše již zpracováno.")
        return

    rename_count = sum(1 for n in to_process if n in killer_map)
    download_count = len(to_process) - rename_count
    tprint(f" → přejmenovat: {rename_count}, stáhnout: {download_count}")

    ok_count = 0
    err_count = 0
    total = len(to_process)

    tprint(f"Spouštím {NUM_THREADS} vláken...")
    # The prefix yields thread names ending in "_<index>", which tname()
    # turns into the "[T<n>]" tag.
    with ThreadPoolExecutor(max_workers=NUM_THREADS,
                            thread_name_prefix="ThreadPoolExecutor-0") as executor:
        futures = {
            executor.submit(process_puzzle, n, idx, total, killer_map): n
            for idx, n in enumerate(to_process, 1)
        }
        for future in as_completed(futures):
            n = futures[future]
            try:
                succeeded = future.result()
            except Exception as e:
                # result() re-raises whatever the worker raised; count it as
                # a failure so one bad puzzle cannot abort the run or lose
                # the summary.
                tprint(f" Puzzle {n}: neočekávaná chyba: {e}")
                succeeded = False
            if succeeded:
                ok_count += 1
            else:
                err_count += 1

    tprint(f"\nHotovo. Úspěšně: {ok_count}, chyby: {err_count}")
|
||||
|
||||
|
||||
# Run the downloader only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user