# rohlik/10PriceScraping/Rohlik/scrape_first_leaf.py

"""
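Open the first leaf subcategory (the first node without children reached by a
depth-first walk — not necessarily the deepest one) from categories_live.json,
list all products in it via the Rohlik JSON API, and save the raw product list
to products_<categoryId>.json next to this script.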

Endpoint:
  GET /api/v1/categories/normal/{categoryId}/products?page=N&size=50&sort=recommended
  (the request also sends empty `filter` and `excludeProductIds` parameters)
"""

import json
from pathlib import Path
from playwright.sync_api import sync_playwright
from config import BASE_URL
from test_login import ensure_logged_in

# Category tree file expected next to this script; main() exits with a hint to
# run scrape_categories.py when it is missing.
TREE_PATH = Path(__file__).parent / "categories_live.json"
# Page size sent as the `size` query parameter; main() also treats a page
# shorter than this as the last one.
PAGE_SIZE = 50


def find_first_leaf(nodes, path=None):
    """Return ``(names, node)`` for the first childless node in pre-order.

    ``nodes`` is an iterable of dicts carrying a ``"name"`` key and an
    optional ``"children"`` list.  ``names`` is the list of node names from
    the top level down to the leaf, prefixed with ``path`` when one is
    given.  Returns ``None`` when no leaf is found (e.g. ``nodes`` is empty).
    """
    prefix = [] if path is None else path
    # Each stack frame holds an iterator over one sibling group plus the
    # name trail leading to that group.
    pending = [(iter(nodes), prefix)]
    while pending:
        siblings, trail = pending[-1]
        try:
            node = next(siblings)
        except StopIteration:
            # Sibling group exhausted without finding a leaf: backtrack.
            pending.pop()
            continue
        here = trail + [node["name"]]
        kids = node.get("children") or []
        if not kids:
            return here, node
        pending.append((iter(kids), here))
    return None


def fetch_products_page(context, category_id, page):
    """Fetch one page of a category's products and return the decoded JSON.

    ``context`` must expose ``request.get(url, params=...)`` (presumably a
    Playwright BrowserContext — confirm against ensure_logged_in).
    ``page`` is passed through as the ``page`` query parameter.

    Raises:
        RuntimeError: if the response status is not 200; the message carries
            the first 200 characters of the response body.
    """
    endpoint = f"{BASE_URL}/api/v1/categories/normal/{category_id}/products"
    query = {
        "page": page,
        "size": PAGE_SIZE,
        "sort": "recommended",
        "filter": "",
        "excludeProductIds": "",
    }
    response = context.request.get(endpoint, params=query)
    if response.status == 200:
        return response.json()
    raise RuntimeError(f"GET {endpoint} -> {response.status}: {response.text()[:200]}")


def extract_products(payload):
    """Pull the product list out of an API payload of unknown shape.

    Accepted shapes, checked in this order:
      * the payload itself is a list;
      * a dict whose "products", "data" or "items" value is a list;
      * a dict whose value under one of those keys is itself a dict holding
        a list under "products" or "items".
    Anything else yields an empty list.
    """
    if isinstance(payload, list):
        return payload
    if not isinstance(payload, dict):
        return []

    for outer_key in ("products", "data", "items"):
        candidate = payload.get(outer_key)
        if isinstance(candidate, list):
            return candidate
        if not isinstance(candidate, dict):
            continue
        # One level of nesting, e.g. {"data": {"items": [...]}}.
        nested = next(
            (
                candidate[inner_key]
                for inner_key in ("products", "items")
                if isinstance(candidate.get(inner_key), list)
            ),
            None,
        )
        if nested is not None:
            return nested
    return []


def _is_number(value):
    """Return True for int/float values, excluding bool (a subclass of int)."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def format_price(p):
    """Return a product's price formatted with two decimals, or "" if unknown.

    Looks at the "price", "amount" and "value" keys of ``p`` in that order.
    A numeric value is formatted directly; a dict value is searched one
    level deep under "amount", "value" and "full".  Non-dict products and
    products without a usable numeric field yield an empty string.

    Booleans are deliberately not treated as numbers: ``bool`` is a subclass
    of ``int``, so a JSON ``true`` would otherwise be rendered as "1.00".
    """
    if not isinstance(p, dict):
        return ""
    for key in ("price", "amount", "value"):
        candidate = p.get(key)
        if _is_number(candidate):
            return f"{candidate:.2f}"
        if isinstance(candidate, dict):
            for nested_key in ("amount", "value", "full"):
                nested = candidate.get(nested_key)
                if _is_number(nested):
                    return f"{nested:.2f}"
    return ""


def main():
    """Scrape every product of the first leaf category and save them as JSON.

    Loads the category tree from TREE_PATH, picks the first leaf with
    find_first_leaf(), then requests product pages starting at page 0 until
    a page comes back empty or shorter than PAGE_SIZE.  Prints a summary and
    writes the raw product dicts to ``products_<id>.json`` next to this
    script.

    Raises:
        SystemExit: if TREE_PATH is missing or the tree contains no leaf.
        RuntimeError: propagated from fetch_products_page() on a non-200
            response.
    """
    if not TREE_PATH.exists():
        raise SystemExit(f"Missing {TREE_PATH} — run scrape_categories.py first.")

    data = json.loads(TREE_PATH.read_text(encoding="utf-8"))
    tree = data["tree"]
    # find_first_leaf() returns None for an empty tree; fail with a clear
    # message instead of a TypeError from tuple unpacking.
    found = find_first_leaf(tree)
    if found is None:
        raise SystemExit(f"No leaf category found in {TREE_PATH}.")
    path, leaf = found
    print(f"First leaf: {' > '.join(path)} (id={leaf['id']})")
    print(f"URL: {BASE_URL}{leaf['url']}\n")

    with sync_playwright() as pw:
        context, _page = ensure_logged_in(pw)
        try:
            all_products = []
            page_num = 0
            while True:
                print(f"Fetching page {page_num} ...")
                payload = fetch_products_page(context, leaf["id"], page_num)
                products = extract_products(payload)
                print(f"  got {len(products)} products")
                if not products:
                    break
                all_products.extend(products)
                # A short page means there is nothing left to fetch.
                if len(products) < PAGE_SIZE:
                    break
                page_num += 1

            print(f"\nTotal products: {len(all_products)}\n")

            # Show first product raw structure so we can confirm field names
            if all_products:
                print("--- Sample raw product (first item, truncated) ---")
                print(json.dumps(all_products[0], ensure_ascii=False, indent=2)[:1500])
                print("--- end sample ---\n")

            print("Products in category:")
            for p in all_products:
                name = p.get("productName") or p.get("name") or p.get("title") or "?"
                pid = p.get("productId") or p.get("id") or "?"
                price = format_price(p)
                print(f"  [{pid}] {name}  {price}")

            out_path = Path(__file__).parent / f"products_{leaf['id']}.json"
            out_path.write_text(json.dumps(all_products, ensure_ascii=False, indent=2), encoding="utf-8")
            print(f"\nSaved raw products -> {out_path} ({out_path.stat().st_size} bytes)")
        finally:
            # Close the browser even when a request or the file write fails.
            context.browser.close()


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()