notebookvb
This commit is contained in:
@@ -0,0 +1,230 @@
|
|||||||
|
"""
|
||||||
|
Stáhne všechna Killer Sudoku puzzle + solutions z dailykillersudoku.com jako PDF.
|
||||||
|
Název souboru: YYYY-MM-DD Puzzle SudokuKiller {n} [difficulty {d} of 10] [average solving time {t}].pdf
|
||||||
|
|
||||||
|
Spuštění:
|
||||||
|
python stahni_killer_sudoku.py # stáhne vše nové od posledního stažení
|
||||||
|
python stahni_killer_sudoku.py --all # projde všechna čísla znovu (přeskočí existující)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import threading
|
||||||
|
import argparse
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# Force UTF-8 on both standard streams so the non-ASCII status messages this
# script prints can always be encoded. A stream that was replaced by an object
# without reconfigure() (or that is None) is skipped instead of raising
# AttributeError at import time.
for _stream in (sys.stdout, sys.stderr):
    if hasattr(_stream, "reconfigure"):
        _stream.reconfigure(encoding="utf-8")

BASE_URL = "https://www.dailykillersudoku.com"
SAVE_DIR = Path(__file__).parent / "DownloadedPuzzles"
SAVE_DIR.mkdir(exist_ok=True)
DELAY = 1.0  # seconds between requests within a single thread
NUM_THREADS = 5  # number of concurrent worker threads

# How many puzzles to download (starting from the lowest missing one).
# 0 = download every missing puzzle up to the current one.
AMOUNT_TO_DOWNLOAD = 100

# One HTTP session shared by every request, carrying a custom User-Agent.
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": "Mozilla/5.0 (compatible; puzzle-downloader/1.0)"})

# Held by tprint() around each print() call.
_print_lock = threading.Lock()
|
||||||
|
|
||||||
|
|
||||||
|
def tname() -> str:
    """Return a short bracketed label for the calling thread.

    Returns:
        ``"[Hlavní]"`` when the thread is named ``MainThread``;
        ``"[T<k>]"`` when the text after the last underscore of the thread
        name is an integer (``k`` is that integer plus one); otherwise the
        first eight characters of the thread name in brackets.
    """
    name = threading.current_thread().name
    if name == "MainThread":
        return "[Hlavní]"

    # rpartition() always yields a tail (the whole name when there is no
    # underscore), so the only thing that can fail below is int().
    _, _, tail = name.rpartition("_")
    try:
        # Shift the numeric suffix by one so labels start at T1.
        return f"[T{int(tail) + 1}]"
    except ValueError:
        return f"[{name[:8]}]"
|
||||||
|
|
||||||
|
|
||||||
|
def tprint(*args, **kwargs):
    """Print like ``print()``, prefixed with the calling thread's label.

    The module-level ``_print_lock`` is held for the duration of the call,
    so only one thread at a time executes the underlying ``print()``.

    Args:
        *args: Positional arguments forwarded to ``print()`` after the label.
        **kwargs: Keyword arguments forwarded to ``print()`` unchanged.
    """
    with _print_lock:
        print(tname(), *args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def puzzle_exists(n: int) -> bool:
    """Report whether the search page for puzzle ``n`` contains a puzzle.

    Args:
        n: Puzzle number to probe.

    Returns:
        True when the response body contains the ``section class="puzzle``
        marker; False when the marker is absent or the request failed.
    """
    try:
        resp = SESSION.get(f"{BASE_URL}/search?n={n}", timeout=15)
    except requests.RequestException:
        # NOTE: a failed request is reported the same way as a missing
        # puzzle, so callers cannot tell the two apart.
        return False
    return 'section class="puzzle' in resp.text
|
||||||
|
|
||||||
|
|
||||||
|
def get_max_puzzle_number() -> int:
    """Find the number of the newest puzzle by binary search.

    Probes ``puzzle_exists()`` over the range 1..99999. The search is only
    meaningful if every number up to the newest puzzle exists and none above
    it does.

    Returns:
        The lower bound the search converges on (1 when no probe succeeds).
    """
    lo, hi = 1, 99999
    while lo < hi:
        # Round the midpoint up so that "lo = mid" always makes progress.
        mid = (lo + hi + 1) // 2
        if puzzle_exists(mid):
            lo = mid
        else:
            hi = mid - 1
        # Pause between consecutive probes.
        time.sleep(0.5)
    return lo
|
||||||
|
|
||||||
|
|
||||||
|
def get_puzzle_info(n: int) -> dict | None:
    """Fetch the search page of puzzle ``n`` and extract its metadata.

    Args:
        n: Puzzle number.

    Returns:
        A dict with the keys ``"date"`` (ISO ``YYYY-MM-DD`` string),
        ``"number"`` (``n``), ``"difficulty"`` and ``"avg_time"`` (page text,
        or ``"?"`` when the element is missing). ``None`` when the request
        fails, the status is not 200, the ``section.puzzle`` element or any
        date part is missing, or the date cannot be parsed. Every failure is
        reported through ``tprint``.
    """
    url = f"{BASE_URL}/search?n={n}"
    try:
        resp = SESSION.get(url, timeout=15)
    except requests.RequestException as e:
        tprint(f" Chyba při načítání info puzzle {n}: {e}")
        return None

    if resp.status_code != 200:
        tprint(f" Puzzle {n}: info stránka nedostupná (HTTP {resp.status_code})")
        return None

    soup = BeautifulSoup(resp.text, "html.parser")
    section = soup.select_one("section.puzzle")
    if not section:
        tprint(f" Puzzle {n}: nenalezena sekce section.puzzle")
        return None

    # The date is spread over three separate spans inside the section.
    short_month = section.select_one("span.short-month")
    day = section.select_one("span.day")
    year = section.select_one("span.year")
    if not (short_month and day and year):
        tprint(f" Puzzle {n}: datum nenalezeno (chybí span.short-month / .day / .year)")
        return None

    try:
        # Parse "<abbreviated month> <day> <year>" and re-emit it as ISO.
        date_iso = datetime.strptime(
            f"{short_month.text.strip()} {day.text.strip()} {year.text.strip()}",
            "%b %d %Y",
        ).strftime("%Y-%m-%d")
    except ValueError as e:
        tprint(f" Puzzle {n}: chyba parsování data ({e})")
        return None

    # Difficulty and average time are optional; "?" marks a missing value.
    diff_el = section.select_one("span.puzzle-difficulty-value")
    difficulty = diff_el.text.strip() if diff_el else "?"

    time_el = section.select_one("span.puzzle-timing-value")
    avg_time = time_el.text.strip() if time_el else "?"

    return {"date": date_iso, "number": n, "difficulty": difficulty, "avg_time": avg_time}
|
||||||
|
|
||||||
|
|
||||||
|
def make_filename(info: dict, solution: bool = False) -> str:
    """Build the PDF file name for a puzzle or its solution.

    Args:
        info: Mapping with the keys ``"date"``, ``"number"``,
            ``"difficulty"`` and ``"avg_time"``.
        solution: When True, ``" [solution]"`` is appended before ``.pdf``.

    Returns:
        ``"<date> Puzzle SudokuKiller <number> [difficulty <d> of 10]
        [average solving time <t>].pdf"`` (on one line), where every
        ``\\ / : * ? " < > |`` character in ``<d>`` and ``<t>`` is replaced
        by ``-``.
    """
    suffix = " [solution]" if solution else ""
    unsafe_chars = r'[\\/:*?"<>|]'
    # Both scraped values go through the same filter: either may be the "?"
    # placeholder, and the time typically contains ":".
    difficulty = re.sub(unsafe_chars, "-", str(info["difficulty"]))
    avg_time = re.sub(unsafe_chars, "-", str(info["avg_time"]))
    return (
        f"{info['date']} Puzzle SudokuKiller {info['number']} "
        f"[difficulty {difficulty} of 10] "
        f"[average solving time {avg_time}]{suffix}.pdf"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def download_pdf(n: int, info: dict, solution: bool = False) -> bool:
    """Download the puzzle (or solution) PDF of puzzle ``n`` into SAVE_DIR.

    Args:
        n: Puzzle number, used to build the PDF URL.
        info: Metadata dict passed to ``make_filename``.
        solution: When True, fetch ``<n>.solution.pdf`` instead of ``<n>.pdf``.

    Returns:
        True when the target file already exists or was saved. False when
        the request fails, the status is not 200, the response is declared
        as ``text/html``, or the file cannot be written.
    """
    filename = make_filename(info, solution)
    filepath = SAVE_DIR / filename

    if filepath.exists():
        tprint(f" Přeskočeno (existuje): {filename}")
        return True

    suffix = ".solution" if solution else ""
    pdf_url = f"{BASE_URL}/pdfs/{n}{suffix}.pdf"

    try:
        resp = SESSION.get(pdf_url, timeout=30)
    except requests.RequestException as e:
        tprint(f" Chyba stahování {pdf_url}: {e}")
        return False

    if resp.status_code != 200:
        tprint(f" PDF nedostupné (HTTP {resp.status_code}): {pdf_url}")
        return False

    if resp.headers.get("content-type", "").startswith("text/html"):
        tprint(f" PDF vrátilo HTML místo binárního obsahu: {pdf_url}")
        return False

    # Write to a sibling ".part" file and rename it afterwards: an
    # interrupted write must not leave a truncated PDF under the final name,
    # because the exists() check above would skip it on every later run.
    part_path = filepath.with_name(filepath.name + ".part")
    try:
        part_path.write_bytes(resp.content)
        part_path.replace(filepath)
    except OSError as e:
        tprint(f" Chyba zápisu {filename}: {e}")
        try:
            part_path.unlink(missing_ok=True)
        except OSError:
            pass  # best-effort cleanup; the failure is already reported
        return False

    tprint(f" Uloženo: {filename}")
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def process_puzzle(n: int, idx: int, total: int) -> bool:
    """Fetch the metadata of puzzle ``n`` and download both of its PDFs.

    Args:
        n: Puzzle number.
        idx: Position of this puzzle in the current run (progress output only).
        total: Number of puzzles in the current run (progress output only).

    Returns:
        True only when both the puzzle PDF and the solution PDF are present
        afterwards; False when the metadata could not be obtained or either
        download failed.
    """
    tprint(f"[{idx}/{total}] Puzzle #{n}...")
    info = get_puzzle_info(n)
    # Pause after every request, including a failed metadata lookup.
    time.sleep(DELAY)
    if not info:
        return False
    puzzle_ok = download_pdf(n, info, solution=False)
    time.sleep(DELAY)
    # The solution is attempted even when the puzzle download failed.
    solution_ok = download_pdf(n, info, solution=True)
    return puzzle_ok and solution_ok
|
||||||
|
|
||||||
|
|
||||||
|
def find_already_downloaded() -> set[int]:
|
||||||
|
downloaded = set()
|
||||||
|
for f in SAVE_DIR.glob("*Puzzle SudokuKiller*.pdf"):
|
||||||
|
m = re.search(r'SudokuKiller (\d+)', f.name)
|
||||||
|
if m:
|
||||||
|
downloaded.add(int(m.group(1)))
|
||||||
|
return downloaded
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Parse the command line, choose the puzzles to fetch and download them.

    Steps: determine the newest puzzle number, build the list of puzzle
    numbers (all of them with ``--all``, otherwise only those not reported by
    ``find_already_downloaded()``), cap the list at AMOUNT_TO_DOWNLOAD when
    that is positive, then process the list on a pool of NUM_THREADS threads
    and print a summary.
    """
    parser = argparse.ArgumentParser(description="Stáhne Killer Sudoku PDF")
    parser.add_argument("--all", action="store_true", help="Projde všechna čísla od 1 (přeskočí existující)")
    parser.add_argument("--start", type=int, default=1, help="Začáteční číslo puzzle (výchozí: 1)")
    parser.add_argument("--end", type=int, default=None, help="Koncové číslo puzzle (výchozí: aktuální)")
    args = parser.parse_args()

    tprint("Zjišťuji aktuální číslo puzzle...")
    max_n = get_max_puzzle_number()
    tprint(f"Aktuální nejvyšší puzzle: #{max_n}")

    # A missing (or zero) --end falls back to the newest puzzle.
    end_n = args.end if args.end else max_n
    start_n = args.start

    downloaded = find_already_downloaded()
    if args.all:
        to_download = list(range(start_n, end_n + 1))
        tprint(f"Projdu všechna puzzle #{start_n}–#{end_n} (přeskočím existující soubory)")
    else:
        to_download = [n for n in range(start_n, end_n + 1) if n not in downloaded]
        tprint(f"Již staženo: {len(downloaded)} puzzle, zbývá stáhnout: {len(to_download)}")

    # NOTE(review): as reconstructed here the cap also applies in --all mode;
    # the original indentation of this statement was lost - confirm intent.
    if AMOUNT_TO_DOWNLOAD > 0:
        to_download = to_download[:AMOUNT_TO_DOWNLOAD]
        tprint(f"AMOUNT_TO_DOWNLOAD={AMOUNT_TO_DOWNLOAD} → stáhnu prvních {len(to_download)} chybějících")

    if not to_download:
        tprint("Vše je již staženo.")
        return

    total = len(to_download)
    ok_count = 0
    err_count = 0

    tprint(f"Spouštím {NUM_THREADS} vláken...")
    # The prefix gives worker names a numeric suffix after the last
    # underscore, which tname() turns into the "[T<k>]" label.
    with ThreadPoolExecutor(max_workers=NUM_THREADS, thread_name_prefix="ThreadPoolExecutor-0") as executor:
        futures = {
            executor.submit(process_puzzle, n, idx, total): n
            for idx, n in enumerate(to_download, 1)
        }
        for future in as_completed(futures):
            n = futures[future]
            try:
                succeeded = future.result()
            except Exception as e:
                # Top-level boundary: one crashed worker must not abort the
                # whole run or lose the final summary.
                tprint(f" Puzzle #{n}: neočekávaná chyba: {e!r}")
                succeeded = False
            if succeeded:
                ok_count += 1
            else:
                err_count += 1

    tprint(f"\nHotovo. Úspěšně: {ok_count}, chyby: {err_count}")
|
||||||
|
|
||||||
|
|
||||||
|
# Run the downloader only when the file is executed as a script.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user