notebookvb
This commit is contained in:
@@ -0,0 +1,230 @@
|
||||
"""
|
||||
Stáhne všechna Killer Sudoku puzzle + solutions z dailykillersudoku.com jako PDF.
|
||||
Název souboru: YYYY-MM-DD Puzzle SudokuKiller {n} [difficulty {d} of 10] [average solving time {t}].pdf
|
||||
|
||||
Spuštění:
|
||||
python stahni_killer_sudoku.py # stáhne vše nové od posledního stažení
|
||||
python stahni_killer_sudoku.py --all # projde všechna čísla znovu (přeskočí existující)
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import threading
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
def ensure_utf8(stream) -> None:
    """Switch a text stream to UTF-8 when it supports reconfiguration.

    ``reconfigure`` is only provided by ``io.TextIOWrapper``. Any other object
    (including ``None``) is left untouched instead of aborting the script with
    ``AttributeError`` while the module is being loaded.

    Args:
        stream: The object currently bound to ``sys.stdout`` / ``sys.stderr``.
    """
    reconfigure = getattr(stream, "reconfigure", None)
    if callable(reconfigure):
        reconfigure(encoding="utf-8")


# The script prints non-ASCII (Czech) messages, so both standard streams are
# switched to UTF-8 where possible.
ensure_utf8(sys.stdout)
ensure_utf8(sys.stderr)
|
||||
|
||||
# Root URL of the site the puzzles are fetched from.
BASE_URL = "https://www.dailykillersudoku.com"

# Downloaded PDFs go into a "DownloadedPuzzles" folder next to this script;
# the folder is created at import time when it does not exist yet.
SAVE_DIR = Path(__file__).with_name("DownloadedPuzzles")
SAVE_DIR.mkdir(exist_ok=True)

# Seconds between requests within a single thread.
DELAY = 1.0
# Number of concurrent worker threads.
NUM_THREADS = 5

# How many puzzles to download (starting from the lowest missing number).
# 0 = download every missing puzzle up to the current one.
AMOUNT_TO_DOWNLOAD = 100

# HTTP session used for every request in this file, with a custom User-Agent.
SESSION = requests.Session()
SESSION.headers["User-Agent"] = "Mozilla/5.0 (compatible; puzzle-downloader/1.0)"

# Lock taken around console output (see tprint).
_print_lock = threading.Lock()
|
||||
|
||||
|
||||
def tname() -> str:
    """Return a short bracketed label identifying the calling thread.

    Returns:
        ``"[Hlavní]"`` when the current thread is named ``MainThread``;
        ``"[T<k+1>]"`` when the text after the last underscore of the thread
        name is an integer ``k`` (e.g. a worker named ``<prefix>_<k>``);
        otherwise the first eight characters of the thread name in brackets.
    """
    name = threading.current_thread().name
    if name == "MainThread":
        return "[Hlavní]"

    # Text after the last underscore, or the whole name when there is none.
    tail = name.rpartition("_")[2]
    try:
        # The numeric suffix is shown one-based.
        return f"[T{int(tail) + 1}]"
    except ValueError:
        # Only ValueError can occur here: taking the last piece of a split
        # string never raises IndexError, so that handler was dead code.
        return f"[{name[:8]}]"
|
||||
|
||||
|
||||
def tprint(*args, **kwargs):
    """Print like ``print``, prefixed with the calling thread's label.

    The module-level print lock is held for the duration of the call.

    Args:
        *args: Positional arguments forwarded to ``print`` after the label.
        **kwargs: Keyword arguments forwarded to ``print`` unchanged.
    """
    _print_lock.acquire()
    try:
        print(tname(), *args, **kwargs)
    finally:
        _print_lock.release()
|
||||
|
||||
|
||||
def puzzle_exists(n: int) -> bool:
    """Report whether the search page for puzzle *n* contains a puzzle section.

    Args:
        n: Puzzle number to look up.

    Returns:
        ``True`` when the response body contains the marker
        ``section class="puzzle``; ``False`` when it does not or when the
        request fails.
    """
    try:
        resp = SESSION.get(f"{BASE_URL}/search?n={n}", timeout=15)
        return 'section class="puzzle' in resp.text
    except requests.RequestException as e:
        # A failed request is still treated as "not found" (the return value
        # is unchanged), but it is now reported like the other network errors
        # in this file instead of being swallowed silently.
        tprint(f" Chyba při ověřování puzzle {n}: {e}")
        return False
|
||||
|
||||
|
||||
def get_max_puzzle_number(upper_bound: int = 99999, probe=None, pause: float = 0.5) -> int:
    """Find the number of the newest puzzle by binary search.

    The search is only meaningful when existence is monotonic: every number up
    to the newest puzzle exists and no number above it does.

    Args:
        upper_bound: Highest puzzle number considered (default 99999, the
            value that was previously hard-coded).
        probe: Callable taking a puzzle number and returning ``True`` when
            that puzzle exists. Defaults to ``puzzle_exists``.
        pause: Seconds to sleep after each probe; values <= 0 disable the
            sleep.

    Returns:
        The largest number in ``1..upper_bound`` for which the probe succeeded
        along the search path. When every probe fails (or ``upper_bound`` is
        below 2) the result is 1, even though puzzle 1 itself is never probed.
    """
    if probe is None:
        probe = puzzle_exists

    lo, hi = 1, upper_bound
    while lo < hi:
        # Upper midpoint, so that `lo = mid` always makes progress.
        mid = (lo + hi + 1) // 2
        if probe(mid):
            lo = mid
        else:
            hi = mid - 1
        if pause > 0:
            time.sleep(pause)
    return lo
|
||||
|
||||
|
||||
def get_puzzle_info(n: int) -> dict | None:
    """Fetch the search page of puzzle *n* and extract its metadata.

    Args:
        n: Puzzle number to look up.

    Returns:
        A dict with the keys ``"date"`` (``YYYY-MM-DD``), ``"number"``,
        ``"difficulty"`` and ``"avg_time"``; the last two are ``"?"`` when the
        corresponding element is missing. ``None`` is returned, after printing
        a message, when the request fails, the status is not 200, the puzzle
        section or any date element is missing, or the date cannot be parsed.
    """
    try:
        resp = SESSION.get(f"{BASE_URL}/search?n={n}", timeout=15)
    except requests.RequestException as e:
        tprint(f" Chyba při načítání info puzzle {n}: {e}")
        return None

    if resp.status_code != 200:
        tprint(f" Puzzle {n}: info stránka nedostupná (HTTP {resp.status_code})")
        return None

    section = BeautifulSoup(resp.text, "html.parser").select_one("section.puzzle")
    if not section:
        tprint(f" Puzzle {n}: nenalezena sekce section.puzzle")
        return None

    # The date is spread over three spans: month abbreviation, day and year.
    date_parts = [
        section.select_one(f"span.{css_class}")
        for css_class in ("short-month", "day", "year")
    ]
    if not all(date_parts):
        tprint(f" Puzzle {n}: datum nenalezeno (chybí span.short-month / .day / .year)")
        return None

    date_text = " ".join(part.text.strip() for part in date_parts)
    try:
        date_iso = datetime.strptime(date_text, "%b %d %Y").strftime("%Y-%m-%d")
    except ValueError as e:
        tprint(f" Puzzle {n}: chyba parsování data ({e})")
        return None

    def text_or_unknown(selector: str) -> str:
        # Stripped text of the first match, or "?" when nothing matches.
        node = section.select_one(selector)
        return node.text.strip() if node else "?"

    return {
        "date": date_iso,
        "number": n,
        "difficulty": text_or_unknown("span.puzzle-difficulty-value"),
        "avg_time": text_or_unknown("span.puzzle-timing-value"),
    }
|
||||
|
||||
|
||||
def make_filename(info: dict, solution: bool = False) -> str:
    """Build the PDF file name for a puzzle or for its solution.

    Both scraped text fields are sanitized. Previously only ``avg_time`` was,
    so a difficulty of ``"?"`` produced a name that cannot be created on
    Windows.

    Args:
        info: Mapping with the keys ``"date"``, ``"number"``, ``"difficulty"``
            and ``"avg_time"``.
        solution: When true, `` [solution]`` is appended before the extension.

    Returns:
        ``"<date> Puzzle SudokuKiller <number> [difficulty <d> of 10]
        [average solving time <t>].pdf"`` (on one line), with `` [solution]``
        inserted before ``.pdf`` for solutions.
    """

    def clean(value) -> str:
        # Replace characters that Windows forbids in file names (the reserved
        # punctuation plus control characters) with a dash.
        return re.sub(r'[\\/:*?"<>|\x00-\x1f]', "-", str(value))

    suffix = " [solution]" if solution else ""
    return (
        f"{info['date']} Puzzle SudokuKiller {info['number']} "
        f"[difficulty {clean(info['difficulty'])} of 10] "
        f"[average solving time {clean(info['avg_time'])}]{suffix}.pdf"
    )
|
||||
|
||||
|
||||
def download_pdf(n: int, info: dict, solution: bool = False) -> bool:
    """Download the puzzle PDF (or its solution PDF) into ``SAVE_DIR``.

    Args:
        n: Puzzle number; used to build the PDF URL.
        info: Puzzle metadata passed to ``make_filename``.
        solution: Download ``<n>.solution.pdf`` instead of ``<n>.pdf``.

    Returns:
        ``True`` when the target file already exists or was saved; ``False``
        when the request fails, the status is not 200, the response is
        declared as HTML, or the file cannot be written.
    """
    filename = make_filename(info, solution)
    filepath = SAVE_DIR / filename

    if filepath.exists():
        tprint(f" Přeskočeno (existuje): {filename}")
        return True

    suffix = ".solution" if solution else ""
    pdf_url = f"{BASE_URL}/pdfs/{n}{suffix}.pdf"

    try:
        resp = SESSION.get(pdf_url, timeout=30)
    except requests.RequestException as e:
        tprint(f" Chyba stahování {pdf_url}: {e}")
        return False

    if resp.status_code != 200:
        tprint(f" PDF nedostupné (HTTP {resp.status_code}): {pdf_url}")
        return False

    if resp.headers.get("content-type", "").startswith("text/html"):
        tprint(f" PDF vrátilo HTML místo binárního obsahu: {pdf_url}")
        return False

    # Write to a temporary sibling and rename it afterwards. Writing straight
    # to the final name could leave a truncated PDF behind after an
    # interruption, and the exists() check above would then skip it forever.
    tmp_path = filepath.with_name(filename + ".part")
    try:
        tmp_path.write_bytes(resp.content)
        tmp_path.replace(filepath)
    except OSError as e:
        tprint(f" Chyba zápisu {filename}: {e}")
        try:
            tmp_path.unlink(missing_ok=True)
        except OSError:
            # Best-effort cleanup only; the failure was already reported.
            pass
        return False

    tprint(f" Uloženo: {filename}")
    return True
|
||||
|
||||
|
||||
def process_puzzle(n: int, idx: int, total: int) -> bool:
    """Fetch the metadata of puzzle *n*, then download its puzzle and solution PDFs.

    The function sleeps for ``DELAY`` seconds after the metadata request and
    again between the two downloads.

    Args:
        n: Puzzle number.
        idx: Position of this puzzle in the batch; used only in the progress line.
        total: Size of the batch; used only in the progress line.

    Returns:
        ``True`` only when both downloads report success; ``False`` when the
        metadata is unavailable or either download fails.
    """
    tprint(f"[{idx}/{total}] Puzzle #{n}...")
    info = get_puzzle_info(n)
    time.sleep(DELAY)
    if not info:
        return False

    outcomes = []
    for is_solution in (False, True):
        if is_solution:
            # Pause between the puzzle download and the solution download.
            time.sleep(DELAY)
        outcomes.append(download_pdf(n, info, solution=is_solution))
    return all(outcomes)
|
||||
|
||||
|
||||
def find_already_downloaded() -> set[int]:
|
||||
downloaded = set()
|
||||
for f in SAVE_DIR.glob("*Puzzle SudokuKiller*.pdf"):
|
||||
m = re.search(r'SudokuKiller (\d+)', f.name)
|
||||
if m:
|
||||
downloaded.add(int(m.group(1)))
|
||||
return downloaded
|
||||
|
||||
|
||||
def main():
    """Parse the command line and download the selected puzzles in parallel.

    Options:
        --all    process every number in the range (existing files are
                 skipped by the download step).
        --start  first puzzle number (default 1).
        --end    last puzzle number; when omitted, the newest puzzle number is
                 determined with ``get_max_puzzle_number``.

    When ``AMOUNT_TO_DOWNLOAD`` is positive, only that many puzzles from the
    start of the selection are processed. A summary of successes and failures
    is printed at the end.
    """
    parser = argparse.ArgumentParser(description="Stáhne Killer Sudoku PDF")
    parser.add_argument("--all", action="store_true", help="Projde všechna čísla od 1 (přeskočí existující)")
    parser.add_argument("--start", type=int, default=1, help="Začáteční číslo puzzle (výchozí: 1)")
    parser.add_argument("--end", type=int, default=None, help="Koncové číslo puzzle (výchozí: aktuální)")
    args = parser.parse_args()

    if args.end:
        # An explicit end makes the probe for the newest puzzle unnecessary;
        # its result was never used in this case, so skip the extra requests.
        end_n = args.end
    else:
        tprint("Zjišťuji aktuální číslo puzzle...")
        end_n = get_max_puzzle_number()
        tprint(f"Aktuální nejvyšší puzzle: #{end_n}")
    start_n = args.start

    if args.all:
        to_download = list(range(start_n, end_n + 1))
        tprint(f"Projdu všechna puzzle #{start_n}–#{end_n} (přeskočím existující soubory)")
    else:
        downloaded = find_already_downloaded()
        to_download = [n for n in range(start_n, end_n + 1) if n not in downloaded]
        tprint(f"Již staženo: {len(downloaded)} puzzle, zbývá stáhnout: {len(to_download)}")

    if AMOUNT_TO_DOWNLOAD > 0:
        to_download = to_download[:AMOUNT_TO_DOWNLOAD]
        tprint(f"AMOUNT_TO_DOWNLOAD={AMOUNT_TO_DOWNLOAD} → stáhnu prvních {len(to_download)} chybějících")

    if not to_download:
        tprint("Vše je již staženo.")
        return

    total = len(to_download)
    ok_count = 0
    err_count = 0

    tprint(f"Spouštím {NUM_THREADS} vláken...")
    # tname() derives the "[T<k>]" label from the "_<k>" suffix of worker
    # names, so the thread name prefix is kept as it was.
    with ThreadPoolExecutor(max_workers=NUM_THREADS, thread_name_prefix="ThreadPoolExecutor-0") as executor:
        futures = {
            executor.submit(process_puzzle, n, idx, total): n
            for idx, n in enumerate(to_download, 1)
        }
        for future in as_completed(futures):
            try:
                succeeded = future.result()
            except Exception as e:
                # Top-level boundary: one failing worker must not abort the
                # whole batch or lose the final summary.
                tprint(f" Puzzle {futures[future]}: neočekávaná chyba: {e}")
                succeeded = False
            if succeeded:
                ok_count += 1
            else:
                err_count += 1

    tprint(f"\nHotovo. Úspěšně: {ok_count}, chyby: {err_count}")


# Run the downloader only when the file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user