# ordinaceprojekt/SběrDatRůzné/SudokuKiller/stahni_greater_than.py

"""
Download / rename Greater-Than Killer Sudoku puzzles + solutions from dailykillersudoku.com.
File name: YYYY-MM-DD Puzzle SudokuKillerGreaterThan {n} [difficulty {d} of 10] [average solving time {t}].pdf
Solution files use the same name with " [solution]" inserted before ".pdf".

Logic:
  1. Collect all GT puzzle numbers from the search (t=4, d=2..10, all pages)
  2. For each number:
     - SudokuKillerGreaterThan {n} exists → skip
     - SudokuKiller {n} exists             → rename to SudokuKillerGreaterThan
     - otherwise                           → download from /pdfs/{n}.pdf
                                             (solution from /pdfs/{n}.solution.pdf)

Usage:
  python stahni_greater_than.py
"""

import re
import sys
import time
import threading
from datetime import datetime
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests
from bs4 import BeautifulSoup

# Switch stdout/stderr to UTF-8: the log messages printed by this script
# contain non-ASCII characters.
sys.stdout.reconfigure(encoding="utf-8")
sys.stderr.reconfigure(encoding="utf-8")

# Root URL of the site; every request URL in this module is built from it.
BASE_URL   = "https://www.dailykillersudoku.com"
# Target directory for PDFs, located next to this script. It is created at
# import time if it does not exist yet.
SAVE_DIR   = Path(__file__).parent / "DownloadedPuzzles"
SAVE_DIR.mkdir(exist_ok=True)
DELAY      = 0.1   # seconds to sleep between requests within a single thread
# Number of worker threads started by main().
NUM_THREADS = 6

# One requests.Session shared by all functions in this module (and therefore
# by all worker threads); it sends a fixed User-Agent header.
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": "Mozilla/5.0 (compatible; puzzle-downloader/1.0)"})

# Held by tprint() around print() so that lines from different threads are
# not interleaved.
_print_lock = threading.Lock()


def tname() -> str:
    """Return a short bracketed label for the calling thread (log prefix).

    The main thread is labelled ``[Hlavní]``. For any other thread, the text
    after the last underscore of the thread name is read as a zero-based
    index and shown one-based (``..._0`` -> ``[T1]``). If that text is not
    an integer, the first eight characters of the thread name are used.
    """
    thread_name = threading.current_thread().name
    if thread_name == "MainThread":
        return "[Hlavní]"

    # rpartition yields the whole name as the tail when there is no "_".
    _, _, tail = thread_name.rpartition("_")
    try:
        worker_no = int(tail) + 1
    except ValueError:
        return f"[{thread_name[:8]}]"
    return f"[T{worker_no}]"


def tprint(*args, **kwargs):
    """Print like ``print()``, prefixed with the calling thread's label.

    The print itself happens while holding the module-wide print lock, so
    output lines from concurrent threads do not interleave.
    """
    prefix = tname()
    with _print_lock:
        print(prefix, *args, **kwargs)


# ---------------------------------------------------------------------------
# Collecting GT puzzle numbers from the search pages
# ---------------------------------------------------------------------------

def get_page_puzzle_ids(d: int, page: int) -> list[int]:
    """Fetch one search-results page and return the puzzle numbers on it.

    Args:
        d: Difficulty value sent as the ``d`` query parameter.
        page: Page number sent as the ``p`` query parameter.

    Returns:
        The numbers captured from every ``id="board<digits>"`` occurrence in
        the response body, in document order. An empty list if the request
        raises (the error is logged).
    """
    search_url = f"{BASE_URL}/search?d={d}&t=4&p={page}"
    try:
        response = SESSION.get(search_url, timeout=15)
    except requests.RequestException as e:
        tprint(f"  Chyba načítání search d={d} p={page}: {e}")
        return []
    return [
        int(match.group(1))
        for match in re.finditer(r'id="board(\d+)"', response.text)
    ]


def get_max_page(d: int) -> int:
    """Return the highest page number linked from the search results for ``d``.

    Args:
        d: Difficulty value sent as the ``d`` query parameter.

    Returns:
        The largest ``p=<digits>`` value found in ``/search?...`` links of the
        response body; 1 if no such link is present; 0 if the request raises
        (the caller skips the difficulty in that case).
    """
    url = f"{BASE_URL}/search?d={d}&t=4&s=0"
    try:
        resp = SESSION.get(url, timeout=15)
    except requests.RequestException as e:
        # Report the failure: a return value of 0 makes the caller skip this
        # difficulty entirely, which would otherwise go unnoticed.
        tprint(f"  Chyba načítání search d={d}: {e}")
        return 0
    pages = re.findall(r'href="/search\?[^"]*p=(\d+)"', resp.text)
    # default=1 covers the "no pagination links" case on its own.
    return max((int(p) for p in pages), default=1)


def collect_all_gt_numbers() -> list[int]:
    """Walk the search results (d=2..10, t=4) and return all GT puzzle numbers.

    For each difficulty the page count is looked up first; a count of 0
    skips that difficulty. Every page from 1 to the count is fetched, with a
    ``DELAY`` pause after each page. The result is de-duplicated and sorted
    ascending.
    """
    found: set[int] = set()
    for difficulty in range(2, 11):
        page_count = get_max_page(difficulty)
        if page_count == 0:
            continue
        tprint(f"  Difficulty {difficulty}: {page_count} stránek")
        for page_no in range(1, page_count + 1):
            found.update(get_page_puzzle_ids(difficulty, page_no))
            time.sleep(DELAY)
    return sorted(found)


# ---------------------------------------------------------------------------
# Reading existing files
# ---------------------------------------------------------------------------

def find_downloaded_killer() -> dict[int, Path]:
    """Map puzzle number -> path for plain SudokuKiller puzzle PDFs in SAVE_DIR.

    Files whose name contains ``[solution]`` or ``GreaterThan`` are ignored,
    as are files whose name has no ``SudokuKiller <digits>`` part.
    """
    number_re = re.compile(r"SudokuKiller (\d+)")
    found: dict[int, Path] = {}
    for pdf in SAVE_DIR.glob("*Puzzle SudokuKiller *.pdf"):
        file_name = pdf.name
        if "[solution]" in file_name or "GreaterThan" in file_name:
            continue
        match = number_re.search(file_name)
        if match is not None:
            found[int(match.group(1))] = pdf
    return found


def find_downloaded_gt() -> set[int]:
    """Return the numbers of SudokuKillerGreaterThan puzzle PDFs in SAVE_DIR.

    Solution files (name contains ``[solution]``) are not counted, so a
    number is reported only when its puzzle file is present.
    """
    number_re = re.compile(r"SudokuKillerGreaterThan (\d+)")
    matches = (
        number_re.search(pdf.name)
        for pdf in SAVE_DIR.glob("*Puzzle SudokuKillerGreaterThan *.pdf")
        if "[solution]" not in pdf.name
    )
    return {int(match.group(1)) for match in matches if match}


# ---------------------------------------------------------------------------
# Renaming / downloading
# ---------------------------------------------------------------------------

def killer_to_gt_filename(path: Path) -> str:
    """Return the file name of ``path`` with the type tag switched to GreaterThan.

    Every ``"SudokuKiller "`` (with trailing space) in the name becomes
    ``"SudokuKillerGreaterThan "``; only the final path component is returned.
    """
    old_tag = "SudokuKiller "
    new_tag = "SudokuKillerGreaterThan "
    # split + join on a non-empty separator swaps every occurrence.
    return new_tag.join(path.name.split(old_tag))


def rename_pair(n: int, killer_path: Path) -> bool:
    """Rename a SudokuKiller puzzle PDF and its solution PDF to the GreaterThan name.

    The solution file is expected next to the puzzle file, named like the
    puzzle with `` [solution]`` appended before ``.pdf``. A missing solution
    file is tolerated; a missing puzzle file is an error.

    Args:
        n: Puzzle number; used in the error message of a failed rename.
        killer_path: Path of the existing plain SudokuKiller puzzle PDF.

    Returns:
        True if every file that was present got renamed and the puzzle file
        existed; False if the puzzle file is missing or a rename failed.
    """
    ok = True
    solution_path = killer_path.with_name(killer_path.stem + " [solution].pdf")
    for f in (killer_path, solution_path):
        if not f.exists():
            if "[solution]" in f.name:
                continue  # the solution file does not have to exist
            tprint(f"  Soubor nenalezen pro přejmenování: {f.name}")
            ok = False
            continue
        new_name = killer_to_gt_filename(f)
        new_path = SAVE_DIR / new_name
        try:
            f.rename(new_path)
        except OSError as e:
            # e.g. target already exists (Windows) or the file is locked.
            # Report it and keep going instead of killing the worker.
            tprint(f"  Chyba přejmenování {f.name} (puzzle {n}): {e}")
            ok = False
            continue
        tprint(f"  Přejmenováno: {f.name} → {new_name}")
    return ok


def get_puzzle_info(n: int) -> dict | None:
    """Fetch the search page for puzzle ``n`` and extract its metadata.

    Returns:
        A dict with keys ``date`` (``YYYY-MM-DD``), ``number`` (``n``),
        ``difficulty`` and ``avg_time`` (stripped element text, or ``"?"``
        when the element is missing). ``None`` if the request raises (the
        error is logged), if the page has no ``section.puzzle``, if any of
        the month/day/year spans is missing, or if the date does not parse
        as ``"%b %d %Y"``.
    """
    try:
        response = SESSION.get(f"{BASE_URL}/search?n={n}", timeout=15)
    except requests.RequestException as e:
        tprint(f"  Chyba info puzzle {n}: {e}")
        return None

    section = BeautifulSoup(response.text, "html.parser").select_one("section.puzzle")
    if not section:
        return None

    # Month abbreviation, day and year live in three separate spans.
    date_parts = [
        section.select_one(selector)
        for selector in ("span.short-month", "span.day", "span.year")
    ]
    if not all(date_parts):
        return None

    raw_date = " ".join(part.text.strip() for part in date_parts)
    try:
        iso_date = datetime.strptime(raw_date, "%b %d %Y").strftime("%Y-%m-%d")
    except ValueError:
        return None

    def text_or_unknown(selector: str) -> str:
        """Stripped text of the first match inside the section, or "?"."""
        element = section.select_one(selector)
        return element.text.strip() if element else "?"

    difficulty = text_or_unknown("span.puzzle-difficulty-value")
    average_time = text_or_unknown("span.puzzle-timing-value")
    return {
        "date": iso_date,
        "number": n,
        "difficulty": difficulty,
        "avg_time": average_time,
    }


def make_filename(info: dict, solution: bool = False) -> str:
    """Build the target PDF file name for a Greater-Than puzzle.

    Args:
        info: Dict with keys ``date``, ``number``, ``difficulty`` and
            ``avg_time``.
        solution: If true, `` [solution]`` is appended before ``.pdf``.

    Returns:
        ``"{date} Puzzle SudokuKillerGreaterThan {number} [difficulty {d} of
        10] [average solving time {t}]{suffix}.pdf"``, where characters that
        are not allowed in Windows file names (``\\ / : * ? " < > |``) are
        replaced by ``-`` in both the difficulty and the average time.
    """
    forbidden = r'[\\/:*?"<>|]'
    suffix = " [solution]" if solution else ""
    # Both values come from scraped page text and may be the "?" placeholder,
    # so both must be made file-name safe, not just the time (e.g. "12:34").
    difficulty = re.sub(forbidden, "-", str(info["difficulty"]))
    avg_time = re.sub(forbidden, "-", str(info["avg_time"]))
    return (
        f"{info['date']} Puzzle SudokuKillerGreaterThan {info['number']} "
        f"[difficulty {difficulty} of 10] "
        f"[average solving time {avg_time}]{suffix}.pdf"
    )


def download_pdf(n: int, info: dict, solution: bool = False) -> bool:
    """Download the puzzle (or solution) PDF of puzzle ``n`` into SAVE_DIR.

    Args:
        n: Puzzle number; the PDF is fetched from ``/pdfs/{n}.pdf`` or, for
            the solution, ``/pdfs/{n}.solution.pdf``.
        info: Metadata dict passed to make_filename() for the target name.
        solution: Download the solution instead of the puzzle.

    Returns:
        True if the target file already exists or was written successfully.
        False (after logging) on a request error, a non-200 status, a
        response whose content type starts with ``text/html``, or a failure
        to write the file.
    """
    filename = make_filename(info, solution)
    filepath = SAVE_DIR / filename
    if filepath.exists():
        return True
    suffix  = ".solution" if solution else ""
    pdf_url = f"{BASE_URL}/pdfs/{n}{suffix}.pdf"
    try:
        resp = SESSION.get(pdf_url, timeout=30)
    except requests.RequestException as e:
        tprint(f"  Chyba stahování {pdf_url}: {e}")
        return False
    if resp.status_code != 200:
        tprint(f"  PDF nedostupné (HTTP {resp.status_code}): {pdf_url}")
        return False
    if resp.headers.get("content-type", "").startswith("text/html"):
        tprint(f"  PDF vrátilo HTML: {pdf_url}")
        return False
    # Write to a temporary sibling and move it into place afterwards. An
    # interrupted write then never leaves a truncated file under the final
    # name, which the exists() check above would treat as "already done".
    tmp_path = filepath.with_name(filename + ".part")
    try:
        tmp_path.write_bytes(resp.content)
        tmp_path.replace(filepath)
    except OSError as e:
        tprint(f"  Chyba zápisu {filename}: {e}")
        try:
            tmp_path.unlink(missing_ok=True)
        except OSError:
            pass  # best-effort cleanup of the partial file
        return False
    tprint(f"  Staženo: {filename}")
    return True


def process_puzzle(n: int, idx: int, total: int,
                   killer_map: dict[int, Path]) -> bool:
    """Handle one puzzle: rename an existing plain file or download it.

    Args:
        n: Puzzle number.
        idx: 1-based position of this puzzle in the work list (log only).
        total: Length of the work list (log only).
        killer_map: Puzzle number -> path of an existing plain SudokuKiller
            puzzle PDF.

    Returns:
        The result of rename_pair() when ``n`` is in ``killer_map``.
        Otherwise True only if both the puzzle and the solution PDF were
        obtained; False if the info page could not be read or either
        download failed.
    """
    tprint(f"[{idx}/{total}] Puzzle #{n}")

    if n in killer_map:
        # Already on disk under the plain SudokuKiller name: rename only.
        return rename_pair(n, killer_map[n])

    # Not present as SudokuKiller -> download it.
    info = get_puzzle_info(n)
    time.sleep(DELAY)
    if not info:
        tprint(f"  Puzzle {n}: info stránka nenalezena")
        return False

    # Both downloads are always attempted, puzzle first, with a pause between.
    outcomes = []
    for want_solution in (False, True):
        if want_solution:
            time.sleep(DELAY)
        outcomes.append(download_pdf(n, info, solution=want_solution))
    return outcomes[0] and outcomes[1]


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    """Collect all GT puzzle numbers, then rename or download the missing ones.

    Numbers that already have a SudokuKillerGreaterThan puzzle file are
    skipped. The rest are processed by a pool of ``NUM_THREADS`` workers; a
    summary of successes and failures is printed at the end.
    """
    tprint("Sbírám čísla GT puzzlů ze search (d=2..10, t=4)...")
    gt_numbers = collect_all_gt_numbers()
    tprint(f"Celkem GT puzzlů nalezeno: {len(gt_numbers)}")

    already_gt   = find_downloaded_gt()
    killer_map   = find_downloaded_killer()

    to_process = [n for n in gt_numbers if n not in already_gt]
    tprint(f"Již hotovo (GreaterThan): {len(already_gt)}")
    tprint(f"Ke zpracování: {len(to_process)}")

    if not to_process:
        tprint("Vše již zpracováno.")
        return

    rename_count   = sum(1 for n in to_process if n in killer_map)
    download_count = len(to_process) - rename_count
    tprint(f"  → přejmenovat: {rename_count}, stáhnout: {download_count}")

    ok_count  = 0
    err_count = 0
    total     = len(to_process)

    tprint(f"Spouštím {NUM_THREADS} vláken...")
    # The prefix makes worker threads "ThreadPoolExecutor-0_<i>"; tname()
    # derives its "[T<i+1>]" label from the part after the last underscore.
    with ThreadPoolExecutor(max_workers=NUM_THREADS,
                            thread_name_prefix="ThreadPoolExecutor-0") as executor:
        futures = {
            executor.submit(process_puzzle, n, idx, total, killer_map): n
            for idx, n in enumerate(to_process, 1)
        }
        for future in as_completed(futures):
            n = futures[future]
            try:
                succeeded = future.result()
            except Exception as e:
                # result() re-raises whatever the worker raised. Count it as
                # a failure of that one puzzle instead of aborting the run
                # and losing the summary.
                tprint(f"  Puzzle {n}: neočekávaná chyba: {e!r}")
                succeeded = False
            if succeeded:
                ok_count += 1
            else:
                err_count += 1

    tprint(f"\nHotovo. Úspěšně: {ok_count}, chyby: {err_count}")


# Run the downloader only when this file is executed as a script.
if __name__ == "__main__":
    main()