notebookVB

2026-06-01 07:24:46 +02:00
parent e3522f4017
commit 5073c01692
8 changed files with 1056 additions and 261 deletions
@@ -1,254 +1,407 @@
 """
-Rohlik.cz Price Scraper - Main Scraper
-Version: 1.0.0
-Date: 2026-05-31
-
-Playwright-based scraper that iterates all leaf categories on Rohlik.cz,
-scrolls to lazy-load every product card, and extracts pricing data from the DOM.
-Supports authenticated scraping (prices differ for logged-in users).
+Rohlik.cz Price Scraper — API-based
+Iterates leaf categories, fetches product IDs via listing API,
+pulls details from 4 batch endpoints, upserts into MongoDB.

 Usage:
-    python scraper.py --no-db --visible          # scrape to JSON, visible browser
-    python scraper.py --no-db --filter "Brambory" # scrape single category to JSON
-    python scraper.py                             # scrape to MongoDB
-    python scraper.py --visible                   # scrape to MongoDB, visible browser
+    python scraper.py                             # all categories -> MongoDB
+    python scraper.py --category "Ovoce a zelenina"  # one main category only
+    python scraper.py --no-db                     # dry run, no DB writes
+    python scraper.py --visible                   # show browser window
 """

-import re
-import json
+import sys
+import io
+import argparse
 import logging
 from datetime import datetime, timezone
-from pathlib import Path

-from playwright.sync_api import sync_playwright, Page
+from playwright.sync_api import sync_playwright

-from config import (
-    BASE_URL, AUTH_STATE_PATH,
-    ROHLIK_EMAIL, ROHLIK_PASSWORD,
-    SCROLL_PAUSE, MAX_SCROLLS,
+from config import BASE_URL
+from test_login import ensure_logged_in
+from db import get_db, ensure_indexes, upsert_products, upsert_categories, log_scrape_run
+
+# Switch console output to UTF-8 with errors="replace" so text that the
+# console encoding cannot represent is substituted rather than raising
+# UnicodeEncodeError. reconfigure() changes the existing stream in place;
+# a stream that is None or lacks reconfigure() is left untouched instead of
+# failing at import time.
+for _stream in (sys.stdout, sys.stderr):
+    if hasattr(_stream, "reconfigure"):
+        _stream.reconfigure(encoding="utf-8", errors="replace")
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s  %(message)s",
+    datefmt="%H:%M:%S",
 )
-from categories import get_leaf_categories, get_all_categories_flat
-from db import get_db, ensure_indexes, upsert_product, upsert_category, log_scrape_run
-
-logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 log = logging.getLogger(__name__)

+PAGE_SIZE = 50
+CHUNK = 30

-def parse_price(raw: str | None) -> float | None:
-    if not raw:
-        return None
-    digits = re.sub(r"[^\d]", "", raw)
-    if not digits:
-        return None
-    return int(digits) / 100
+MAIN_CATS_URL = f"{BASE_URL}/api/v5/navigation/components/navigation-tabs/categories"
+SUBCATS_URL = f"{BASE_URL}/api/v4/navigation/components/navigation-tabs/subcategories"
+
+BATCH_ENDPOINTS = {
+    "base":       "/api/v1/products",
+    "prices":     "/api/v1/products/prices",
+    "stock":      "/api/v1/products/stock",
+    "categories": "/api/v1/products/categories",
+}


-def parse_original_price(raw: str | None) -> float | None:
-    if not raw:
-        return None
-    match = re.search(r"([\d\s]+[,.][\d]+)", raw.replace("\xa0", " "))
-    if match:
-        return float(match.group(1).replace(" ", "").replace(",", "."))
-    digits = re.sub(r"[^\d]", "", raw)
-    if digits:
-        return float(digits) / 100
+# ---------------------------------------------------------------------------
+# helpers
+# ---------------------------------------------------------------------------
+
+def get_json(context, url, **params):
+    """GET ``url`` through the context's request client and return parsed JSON.
+
+    Args:
+        context: Object exposing ``request.get(url, params=...)``
+            (presumably the Playwright browser context from
+            ``ensure_logged_in`` -- confirm against that helper).
+        url: URL to fetch; it may already carry a query string.
+        **params: Optional query parameters. They are forwarded as ``params``
+            only when at least one is given, otherwise ``None`` is passed.
+
+    Returns:
+        The decoded JSON body of the response.
+
+    Raises:
+        RuntimeError: If the response status is not 200. The message holds
+            the status and the first 120 characters of ``url``.
+    """
+    resp = context.request.get(url, params=params or None)
+    try:
+        if resp.status != 200:
+            raise RuntimeError(f"HTTP {resp.status}: {url[:120]}")
+        return resp.json()
+    finally:
+        # A Playwright API response keeps its body in memory until the
+        # context closes unless it is disposed. The scraper issues one
+        # request per listing page and four per product chunk, so release
+        # each body as soon as it has been parsed.
+        resp.dispose()
+
+
+def as_list(payload):
+    """Coerce a batch-endpoint payload into a list of records.
+
+    A list is returned unchanged. For a dict, the first of the keys ``data``,
+    ``products``, ``items`` whose value is a list is returned. Every other
+    payload yields an empty list.
+    """
+    if isinstance(payload, list):
+        return payload
+    if not isinstance(payload, dict):
+        return []
+    candidates = (payload.get(key) for key in ("data", "products", "items"))
+    return next((value for value in candidates if isinstance(value, list)), [])
+
+
+def pick(d, *keys):
+    """Look up ``keys`` in order and return the first value that is not None.
+
+    Falls through to ``None`` when ``d`` is not a dict or none of the keys
+    carries a value.
+    """
+    if isinstance(d, dict):
+        for name in keys:
+            value = d.get(name)
+            if value is not None:
+                return value
    return None


-def login(page: Page):
-    log.info("Logging in to Rohlik.cz...")
-    page.goto(BASE_URL, wait_until="domcontentloaded", timeout=60000)
-    page.wait_for_timeout(3000)
+# ---------------------------------------------------------------------------
+# category tree — live from API
+# ---------------------------------------------------------------------------

-    page.locator('text="Přihlásit se"').first.click()
-    page.wait_for_timeout(2000)
-
-    page.locator('input[type="email"], input[name="email"]').first.fill(ROHLIK_EMAIL)
-    page.locator('input[type="password"], input[name="password"]').first.fill(ROHLIK_PASSWORD)
-    page.locator('button[type="submit"]').first.click()
-    page.wait_for_timeout(5000)
-
-    page.context.storage_state(path=AUTH_STATE_PATH)
-    log.info("Login successful, auth state saved.")
+def normalize_main(payload):
+    """Extract the list of main categories from the navigation payload.
+
+    Accepted shapes, checked in this order:
+      * a bare list, returned unchanged;
+      * a dict holding the list under ``data``, ``categories``, ``items``,
+        ``navigationTabs`` or ``tabs``;
+      * a dict where one of those keys holds another dict that carries the
+        list under ``categories``, ``items`` or ``tabs``.
+
+    Returns:
+        The category list, or ``[]`` when no known shape matches. A payload
+        that is neither a list nor a dict (for example ``None``) also gives
+        ``[]`` rather than raising ``AttributeError``.
+    """
+    if isinstance(payload, list):
+        return payload
+    if not isinstance(payload, dict):
+        return []
+    for key in ("data", "categories", "items", "navigationTabs", "tabs"):
+        value = payload.get(key)
+        if isinstance(value, list):
+            return value
+        if isinstance(value, dict):
+            for inner_key in ("categories", "items", "tabs"):
+                inner = value.get(inner_key)
+                if isinstance(inner, list):
+                    return inner
+    return []


-def scroll_to_load_all(page: Page) -> int:
-    prev_count = 0
-    for i in range(MAX_SCROLLS):
-        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
-        page.wait_for_timeout(int(SCROLL_PAUSE * 1000))
-        current_count = page.locator('[data-test^="productCard-AVAILABLE-"]').count()
-        if current_count == prev_count and i > 2:
-            break
-        prev_count = current_count
-    return prev_count
+def subs_from_payload(payload):
+    """Pull the subcategory list out of a subcategories payload.
+
+    A list is returned unchanged. For a dict, the first of the keys ``data``,
+    ``subcategories``, ``items``, ``categories`` that holds a list wins.
+    Every other payload yields an empty list.
+    """
+    if isinstance(payload, list):
+        return payload
+    source = payload if isinstance(payload, dict) else {}
+    matches = [
+        source[key]
+        for key in ("data", "subcategories", "items", "categories")
+        if isinstance(source.get(key), list)
+    ]
+    return matches[0] if matches else []


-def extract_products(page: Page, category: dict) -> list[dict]:
-    products_data = page.evaluate("""
-        () => {
-            const products = [];
-            document.querySelectorAll('[data-test^="productCard-AVAILABLE-"]').forEach(card => {
-                const id = card.getAttribute('data-test').replace('productCard-AVAILABLE-', '');
-                const nameEl = card.querySelector('[data-test="productCard-body-name"]');
-                const priceNoEl = card.querySelector('[data-test="productCard-body-price-priceNo"]');
-                const saleEl = card.querySelector('[data-test="productCard-body-price-sale"]');
-                const amountEl = card.querySelector('[data-test="productCard-footer-amount"]');
-                const unitPriceEl = card.querySelector('[data-test="productCard-footer-unitPrice"]');
-                const badgeEl = card.querySelector('[data-test="productCard-body-badge"]');
-                const imgEl = card.querySelector('img');
-                const linkEl = card.querySelector('a[href*="/"]');
-
-                products.push({
-                    product_id: id,
-                    name: nameEl?.textContent?.trim() || '',
-                    price_raw: priceNoEl?.textContent?.trim() || '',
-                    original_price_raw: saleEl?.textContent?.trim() || '',
-                    amount: amountEl?.textContent?.trim() || '',
-                    unit_price_raw: unitPriceEl?.textContent?.trim() || '',
-                    discount_badge: badgeEl?.textContent?.trim() || '',
-                    image_url: imgEl?.src || '',
-                    product_url: linkEl?.getAttribute('href') || '',
-                });
-            });
-            return products;
-        }
-    """)
-
-    results = []
-    for p in products_data:
-        results.append({
-            "product_id": p["product_id"],
-            "name": p["name"],
-            "price": parse_price(p["price_raw"]),
-            "original_price": parse_original_price(p["original_price_raw"]),
-            "discount_badge": p["discount_badge"] or None,
-            "amount": p["amount"] or None,
-            "unit_price": p["unit_price_raw"].strip() or None,
-            "image_url": p["image_url"] or None,
-            "product_url": f"{BASE_URL}{p['product_url']}" if p["product_url"] else None,
-            "category_id": category["id"],
-            "category_name": category["name"],
-            "category_path": " > ".join(category.get("path", [category["name"]])),
-        })
-    return results
-
-
-def scrape_leaf(page: Page, category: dict) -> list[dict]:
-    url = f"{BASE_URL}{category['url']}"
-    log.info("Scraping: %s (%s)", " > ".join(category.get("path", [category["name"]])), url)
-
-    page.goto(url, wait_until="domcontentloaded", timeout=60000)
-    page.wait_for_timeout(3000)
-
-    try:
-        page.wait_for_selector('[data-test^="productCard-AVAILABLE-"]', timeout=15000)
-    except Exception:
-        log.warning("  No products found in %s, skipping.", category["name"])
+def fetch_children_recursive(context, parent_id, visited, depth=1, max_depth=6):
+    """Fetch the subcategory subtree below ``parent_id``.
+
+    Args:
+        context: Request-capable context handed on to ``get_json``.
+        parent_id: Category whose subcategories are requested.
+        visited: Set of stringified category IDs that were already expanded;
+            updated in place so a category is requested at most once.
+        depth: Current recursion depth, starting at 1.
+        max_depth: Expansion stops once ``depth`` exceeds this value.
+
+    Returns:
+        A list of ``{"id", "name", "url", "children"}`` dicts. The list is
+        empty when the category was already visited or the depth limit was
+        exceeded. A child is expanded further only when it has an ID and a
+        non-empty ``subcategoryIds`` field.
+    """
+    if str(parent_id) in visited or depth > max_depth:
        return []
+    visited.add(str(parent_id))

-    total = scroll_to_load_all(page)
-    products = extract_products(page, category)
-    log.info("  %d products extracted (loaded %d)", len(products), total)
-    return products
+    sub_payload = get_json(context, SUBCATS_URL, categoryIds=str(parent_id))
+    subs = subs_from_payload(sub_payload)
+
+    out = []
+    for s in subs:
+        if not isinstance(s, dict):
+            continue
+        sid = pick(s, "id", "categoryId")
+        node = {
+            "id": sid,
+            "name": pick(s, "name", "title", "label"),
+            "url": pick(s, "url", "link", "slug"),
+            "children": [],
+        }
+        if sid and s.get("subcategoryIds"):
+            node["children"] = fetch_children_recursive(context, sid, visited, depth + 1, max_depth)
+        out.append(node)
+    return out


-def run_scraper(
-    category_filter: str | None = None,
-    headless: bool = True,
-    save_to_db: bool = True,
-):
-    leaves = get_leaf_categories()
-    if category_filter:
-        category_filter_lower = category_filter.lower()
-        leaves = [c for c in leaves if category_filter_lower in " > ".join(c["path"]).lower()]
+def fetch_category_tree(context):
+    """Fetch the full category tree live from the Rohlik API.
+
+    Loads the main categories from ``MAIN_CATS_URL`` and expands each one
+    with ``fetch_children_recursive``, sharing one ``visited`` set across all
+    of them. Main categories without an ID are skipped. Progress and node
+    counts are written to the log.
+
+    Args:
+        context: Request-capable context handed on to ``get_json``.
+
+    Returns:
+        A list of ``{"id", "name", "url", "children"}`` dicts, one per main
+        category.
+    """
+    log.info("Fetching main categories ...")
+    main_payload = get_json(context, MAIN_CATS_URL)
+    main_cats = normalize_main(main_payload)
+    log.info("  %d main categories", len(main_cats))

-    log.info("Will scrape %d leaf categories", len(leaves))
+    tree = []
+    visited = set()
+
+    log.info("Fetching subcategories recursively ...")
+    for cat in main_cats:
+        cid = pick(cat, "id", "categoryId")
+        cname = pick(cat, "name", "title", "label")
+        curl = pick(cat, "url", "link", "slug")
+        if not cid:
+            continue
+
+        children = fetch_children_recursive(context, cid, visited)
+        node = {"id": cid, "name": cname, "url": curl, "children": children}
+        tree.append(node)
+
+        n_desc = count_nodes(children)
+        log.info("  - %s -> %d subcategories", cname, n_desc)
+
+    total = count_nodes(tree)
+    log.info("  Total: %d categories (incl. main)", total)
+    return tree
+
+
+def count_nodes(nodes):
+    """Count every node of a category forest, descendants included."""
+    nested = sum(count_nodes(node.get("children", [])) for node in nodes)
+    return len(nodes) + nested
+
+
+def collect_leaves(nodes, path=None):
+    """Flatten a category forest into its leaves, depth first.
+
+    Each returned leaf is a copy of its node with an extra ``path`` key: the
+    list of category names from the root down to the leaf, prefixed by the
+    ``path`` argument when one is given. A node counts as a leaf when its
+    ``children`` entry is missing or empty.
+    """
+    found = []
+    # Explicit stack instead of recursion; entries are pushed in reverse so
+    # they are popped in the original left-to-right order.
+    pending = [(node, path or []) for node in reversed(nodes)]
+    while pending:
+        node, prefix = pending.pop()
+        trail = prefix + [node["name"]]
+        kids = node.get("children") or []
+        if kids:
+            pending.extend((kid, trail) for kid in reversed(kids))
+        else:
+            found.append({**node, "path": trail})
+    return found
+
+
+def tree_to_db_docs(nodes, parent_id=None, path=None, path_names=None):
+    """Flatten category tree nodes into one document per category.
+
+    Documents come out parent first, followed by that parent's descendants.
+    Each document records the node's ID as ``_id``, its name, the URL without
+    leading slashes as ``slug``, the ID and name trails from the root
+    (``path`` / ``pathNames``), the direct ``parentId`` and an ``isLeaf``
+    flag. The ``parent_id``, ``path`` and ``path_names`` arguments seed
+    those values for the given ``nodes``.
+    """
+    id_trail = [] if path is None else path
+    name_trail = [] if path_names is None else path_names
+    flat = []
+    for node in nodes:
+        node_id = node["id"]
+        node_name = node["name"]
+        ids_here = id_trail + [node_id]
+        names_here = name_trail + [node_name]
+        kids = node.get("children") or []
+        flat.append({
+            "_id": node_id,
+            "name": node_name,
+            "slug": (node.get("url") or "").lstrip("/"),
+            "path": ids_here,
+            "pathNames": names_here,
+            "parentId": parent_id,
+            "isLeaf": not kids,
+        })
+        # Recursing into an empty child list simply contributes nothing.
+        flat += tree_to_db_docs(kids, node_id, ids_here, names_here)
+    return flat
+
+
+# ---------------------------------------------------------------------------
+# product fetching
+# ---------------------------------------------------------------------------
+
+def fetch_product_ids(context, category_id):
+    """Paginate through the listing API and return a leaf's product IDs.
+
+    Pages of ``PAGE_SIZE`` IDs are requested, starting at page 0, until a
+    page comes back shorter than ``PAGE_SIZE``. IDs keep their listing order
+    and each ID is returned once.
+
+    Pagination also stops when a full page adds no ID that was not already
+    collected, so an endpoint that keeps repeating the same page cannot
+    stall the run in an endless loop.
+
+    Args:
+        context: Request-capable context handed on to ``get_json``.
+        category_id: ID of the category whose products are listed.
+
+    Returns:
+        List of product IDs as delivered under ``productIds``.
+
+    Raises:
+        RuntimeError: Propagated from ``get_json`` on a non-200 response.
+    """
+    all_ids = []
+    collected = set()
+    page = 0
+    while True:
+        url = (f"{BASE_URL}/api/v1/categories/normal/{category_id}/products"
+               f"?page={page}&size={PAGE_SIZE}&sort=recommended&filter=&excludeProductIds=")
+        data = get_json(context, url)
+        # A payload that is not a dict carries no "productIds" to read.
+        ids = (data.get("productIds") if isinstance(data, dict) else None) or []
+        fresh = 0
+        for pid in ids:
+            if pid not in collected:
+                collected.add(pid)
+                all_ids.append(pid)
+                fresh += 1
+        if len(ids) < PAGE_SIZE or not fresh:
+            break
+        page += 1
+    return all_ids
+
+
+def fetch_batch(context, endpoint, product_ids):
+    """Call one batch endpoint for ``product_ids`` and return its records.
+
+    Every ID is sent as a repeated ``products=<id>`` query parameter; the
+    response is normalised to a list with ``as_list``.
+    """
+    query = "&".join(map("products={}".format, product_ids))
+    payload = get_json(context, "{}{}?{}".format(BASE_URL, endpoint, query))
+    return as_list(payload)
+
+
+def fetch_product_details(context, product_ids):
+    """Query the four batch endpoints for one chunk of product IDs.
+
+    Returns:
+        A ``(bases, prices, stocks, categories)`` tuple of raw record lists,
+        fetched in that order.
+    """
+    order = ("base", "prices", "stock", "categories")
+    return tuple(
+        fetch_batch(context, BATCH_ENDPOINTS[kind], product_ids)
+        for kind in order
+    )
+
+
+# ---------------------------------------------------------------------------
+# console output
+# ---------------------------------------------------------------------------
+
+def print_header():
+    """Log the run banner: the title framed by two 100-character rules."""
+    rule = "=" * 100
+    for line in (rule, "  ROHLIK.CZ PRICE SCRAPER", rule):
+        log.info(line)
+
+
+def print_category_header(leaf, leaf_idx, total_leaves):
+    """Log a framed heading for the leaf category about to be scraped.
+
+    The heading shows the leaf's position in the run, its category path
+    joined with " > " and its ID.
+    """
+    rule = "-" * 100
+    breadcrumb = " > ".join(leaf["path"])
+    log.info("")
+    log.info(rule)
+    log.info("  [%d/%d]  %s  (id=%s)", leaf_idx, total_leaves, breadcrumb, leaf["id"])
+    log.info(rule)
+
+
+def _amount(record, key):
+    """Return ``record[key]["amount"]`` or None when that shape is absent."""
+    value = record.get(key)
+    return value.get("amount") if isinstance(value, dict) else None
+
+
+def print_products_table(bases, prices_list, stocks):
+    """Log one compact row per product in this chunk.
+
+    Each row shows a stock marker (``+`` in stock, ``-`` when ``inStock`` is
+    exactly ``False``, ``?`` otherwise), the product ID, the price, the price
+    per unit, the first sale price with its first badge title, and the name
+    cut to 50 characters.
+
+    The table is informational only, so incomplete records must not abort
+    the chunk: entries that are not dicts are skipped, and missing or
+    ``None`` fields fall back to placeholders instead of raising.
+
+    Args:
+        bases: Base records; ``id``, ``name`` and ``unit`` are read.
+        prices_list: Price records, matched to a product by ``productId``.
+        stocks: Stock records, matched to a product by ``productId``.
+    """
+    prices_map = {p.get("productId"): p for p in prices_list if isinstance(p, dict)}
+    stock_map = {s.get("productId"): s for s in stocks if isinstance(s, dict)}
+
+    for b in bases:
+        if not isinstance(b, dict):
+            continue
+        pid = b.get("id")
+        p = prices_map.get(pid) or {}
+        s = stock_map.get(pid) or {}
+
+        # "or" fallbacks also cover keys that are present but set to None
+        name = str(b.get("name") or "?")[:50]
+        price = _amount(p, "price")
+        ppu = _amount(p, "pricePerUnit")
+        unit = b.get("unit") or ""
+        in_stock = s.get("inStock")
+        stock_str = "+" if in_stock else "-" if in_stock is False else "?"
+
+        sale_str = ""
+        sales = p.get("sales")
+        if isinstance(sales, list) and sales and isinstance(sales[0], dict):
+            first_sale = sales[0]
+            sp = _amount(first_sale, "price")
+            badges = first_sale.get("badges")
+            first_badge = badges[0] if isinstance(badges, list) and badges else None
+            badge = first_badge.get("title") or "" if isinstance(first_badge, dict) else ""
+            # only format numeric amounts; ".2f" raises on strings
+            if sp and isinstance(sp, (int, float)):
+                sale_str = f"{sp:.2f} {badge}"
+
+        price_str = f"{price:.2f}" if isinstance(price, (int, float)) else "?"
+        ppu_str = f"{ppu:.2f}/{unit}" if isinstance(ppu, (int, float)) else ""
+
+        # %9s instead of %9d: same output for integer IDs, but a string or
+        # missing ID no longer makes the logging call fail to format
+        log.info("  %s %9s  %8s  %12s  %14s  %s",
+                 stock_str, pid, price_str, ppu_str, sale_str, name)
+
+
+def print_summary(stats):
+    """Log the closing summary block for a finished run.
+
+    Reads ``categories_scraped``, ``products_total`` and
+    ``duration_seconds`` from ``stats``; the error count is shown only when
+    ``errors`` is present and non-zero.
+    """
+    rule = "=" * 100
+    rows = (
+        ("  Categories: %d", "categories_scraped"),
+        ("  Products:   %d unique", "products_total"),
+        ("  Duration:   %.1f s", "duration_seconds"),
+    )
+    log.info("")
+    log.info(rule)
+    log.info("  DONE")
+    for template, key in rows:
+        log.info(template, stats[key])
+    error_count = stats.get("errors")
+    if error_count:
+        log.info("  Errors:     %d", error_count)
+    log.info(rule)
+
+
+# ---------------------------------------------------------------------------
+# main
+# ---------------------------------------------------------------------------
+
+def run_scraper(category_filter=None, headless=True, save_to_db=True):
+    db = None
+    if save_to_db:
+        db = get_db()
+        ensure_indexes(db)

    with sync_playwright() as pw:
-        ctx_args = {}
-        if Path(AUTH_STATE_PATH).exists():
-            ctx_args["storage_state"] = AUTH_STATE_PATH
+        context, page = ensure_logged_in(pw, headless=headless)

-        browser = pw.chromium.launch(headless=headless)
-        context = browser.new_context(**ctx_args)
-        page = context.new_page()
+        # fetch live category tree from API
+        tree = fetch_category_tree(context)

-        page.goto(BASE_URL, wait_until="domcontentloaded", timeout=60000)
-        page.wait_for_timeout(5000)
+        # filter to one main category if requested
+        if category_filter:
+            cf = category_filter.lower()
+            tree = [t for t in tree if cf in t["name"].lower()]
+            if not tree:
+                raise SystemExit(f"No main category matching '{category_filter}'")

-        is_logged_in = page.locator('text="Přihlásit se"').count() == 0
-        if not is_logged_in:
-            if ROHLIK_EMAIL and ROHLIK_PASSWORD:
-                login(page)
-                context = browser.new_context(storage_state=AUTH_STATE_PATH)
-                page = context.new_page()
-            else:
-                log.warning("Not logged in! Prices may differ from member prices.")
+        leaves = collect_leaves(tree)
+        log.info("Scraping %d leaf categories", len(leaves))
+
+        # save categories to MongoDB
+        if db is not None:
+            cat_docs = tree_to_db_docs(tree)
+            upsert_categories(db, cat_docs)
+            log.info("Upserted %d category docs", len(cat_docs))
+
+        print_header()

        run_start = datetime.now(timezone.utc)
-        all_products = []
        seen_ids = set()
+        total_products = 0
+        errors = 0

-        db = None
-        if save_to_db:
-            db = get_db()
-            ensure_indexes(db)
-            for cat_data in get_all_categories_flat():
-                upsert_category(db, cat_data)
+        for i, leaf in enumerate(leaves, 1):
+            print_category_header(leaf, i, len(leaves))

-        for leaf in leaves:
            try:
-                products = scrape_leaf(page, leaf)
-                for p in products:
-                    if p["product_id"] not in seen_ids:
-                        seen_ids.add(p["product_id"])
-                        all_products.append(p)
-                        if db:
-                            upsert_product(db, p)
+                product_ids = fetch_product_ids(context, leaf["id"])
+                log.info("  %d product IDs", len(product_ids))
+
+                if not product_ids:
+                    continue
+
+                # deduplicate within run; dict.fromkeys also drops IDs that
+                # repeat inside this listing while keeping the listing order
+                new_ids = [pid for pid in dict.fromkeys(product_ids)
+                           if pid not in seen_ids]
+
+                # process in chunks
+                for j in range(0, len(new_ids), CHUNK):
+                    chunk = new_ids[j:j + CHUNK]
+                    bases, prices, stocks, cats = fetch_product_details(context, chunk)
+
+                    # persist before printing so a row that fails to render
+                    # cannot cost the chunk its database write
+                    if db is not None:
+                        upsert_products(db, bases, prices, stocks, cats)
+
+                    # mark IDs as seen only once their chunk went through: a
+                    # chunk that failed stays eligible when another category
+                    # lists the same products later in this run
+                    seen_ids.update(chunk)
+                    total_products += len(bases)
+
+                    print_products_table(bases, prices, stocks)
+
            except Exception:
-                log.exception("Error scraping %s", leaf["name"])
+                log.exception("  ERROR in %s", leaf["name"])
+                errors += 1
+
+        context.browser.close()

        run_end = datetime.now(timezone.utc)
-        run_data = {
-            "started_at": run_start,
-            "finished_at": run_end,
+        stats = {
+            "startedAt": run_start,
+            "finishedAt": run_end,
            "duration_seconds": (run_end - run_start).total_seconds(),
            "categories_scraped": len(leaves),
-            "products_scraped": len(all_products),
+            "products_total": total_products,
+            "errors": errors,
+            "filter": category_filter,
        }

-        if db:
-            log_scrape_run(db, run_data)
+        if db is not None:
+            log_scrape_run(db, stats)

-        log.info(
-            "Done: %d unique products from %d categories in %.1fs",
-            len(all_products), len(leaves), run_data["duration_seconds"],
-        )
-
-        browser.close()
-
-    return all_products
-
-
-def scrape_to_json(output_path: str = "products.json", **kwargs):
-    products = run_scraper(save_to_db=False, **kwargs)
-    with open(output_path, "w", encoding="utf-8") as f:
-        json.dump(products, f, ensure_ascii=False, indent=2, default=str)
-    log.info("Saved %d products to %s", len(products), output_path)
-    return products
+        print_summary(stats)
+        return stats


 if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser(description="Rohlik.cz price scraper")
-    parser.add_argument("--no-db", action="store_true", help="Save to JSON instead of MongoDB")
-    parser.add_argument("--visible", action="store_true", help="Run browser in visible mode")
-    parser.add_argument("--filter", type=str, help="Filter categories by name (e.g. 'Ovoce', 'Zelenina > Rajčata')")
+    parser = argparse.ArgumentParser(description="Rohlik.cz price scraper (API)")
+    parser.add_argument("--category", type=str, help="Scrape only this main category (e.g. 'Ovoce a zelenina')")
+    parser.add_argument("--no-db", action="store_true", help="Dry run — no MongoDB writes")
+    parser.add_argument("--visible", action="store_true", help="Show browser window")
    args = parser.parse_args()

-    if args.no_db:
-        scrape_to_json(category_filter=args.filter, headless=not args.visible)
-    else:
-        run_scraper(category_filter=args.filter, headless=not args.visible)
+    run_scraper(
+        category_filter=args.category,
+        headless=not args.visible,
+        save_to_db=not args.no_db,
+    )