notebookVB

2026-06-01 06:02:34 +02:00
parent 32e877ac81
commit e3522f4017
2 changed files with 2311 additions and 47 deletions
@@ -1,9 +1,18 @@
 """
 Open the first leaf (deepest) subcategory from categories_live.json
-and list all products in it via the Rohlik JSON API.
+and list all products in it via the same API calls the website itself makes.

-Endpoint:
-  GET /api/v1/categories/normal/{categoryId}/products?page=N&size=50&sort=recommended
+Flow that mimics the real frontend:
+  1. GET /api/v1/categories/normal/{categoryId}/products?page=N
+       -> { productIds: [...] }
+  2. For each chunk of IDs, call 5 batch endpoints in the same way the site does:
+       /api/v1/products
+       /api/v1/products/prices
+       /api/v1/products/stock
+       /api/v1/products/categories
+       /api/v1/products/user-data
+     All use repeated query params: ?products=ID1&products=ID2&...
+  3. Merge results per productId into one record.
 """

 import json
@@ -14,10 +23,19 @@ from test_login import ensure_logged_in

 TREE_PATH = Path(__file__).parent / "categories_live.json"
 PAGE_SIZE = 50
+CHUNK = 30  # how many IDs per batch request (slice size main uses when splitting the ID list)
+
+# Endpoints that the frontend calls in parallel for each set of product IDs.
+# Each key becomes a field name in the merged per-product record built by
+# fetch_merged_products; each value is a path that fetch_batch appends to
+# BASE_URL. NOTE: this script iterates the endpoints one after another.
+PRODUCT_BATCH_ENDPOINTS = {
+    "base":       "/api/v1/products",
+    "prices":     "/api/v1/products/prices",
+    "stock":      "/api/v1/products/stock",
+    "categories": "/api/v1/products/categories",
+    "user_data":  "/api/v1/products/user-data",
+}


 def find_first_leaf(nodes, path=None):
-    """Walk the tree depth-first and return (path, leaf_node) of the first leaf."""
    if path is None:
        path = []
    for n in nodes:
@@ -31,44 +49,63 @@ def find_first_leaf(nodes, path=None):
    return None


-def fetch_products_page(context, category_id, page):
-    url = f"{BASE_URL}/api/v1/categories/normal/{category_id}/products"
-    params = {"page": page, "size": PAGE_SIZE, "sort": "recommended", "filter": "", "excludeProductIds": ""}
-    resp = context.request.get(url, params=params)
+def get_json(context, url):
+    resp = context.request.get(url)
    if resp.status != 200:
-        raise RuntimeError(f"GET {url} -> {resp.status}: {resp.text()[:200]}")
+        raise RuntimeError(f"GET {url[:120]}... -> {resp.status}: {resp.text()[:200]}")
    return resp.json()


-def extract_products(payload):
-    """Find the products list in the payload — try common shapes."""
+def fetch_products_page(context, category_id, page):
+    """Fetch one listing page of a category and return the decoded JSON.
+
+    Args:
+        context: object handed through to get_json to perform the request.
+        category_id: category identifier interpolated into the endpoint path.
+        page: page index sent as the ``page`` query parameter.
+
+    Returns:
+        The value returned by get_json for the listing URL.
+    """
+    endpoint = f"{BASE_URL}/api/v1/categories/normal/{category_id}/products"
+    # filter and excludeProductIds are sent explicitly with empty values.
+    query = f"page={page}&size={PAGE_SIZE}&sort=recommended&filter=&excludeProductIds="
+    return get_json(context, f"{endpoint}?{query}")
+
+
+def fetch_batch(context, path, product_ids):
+    """Call a batch endpoint with ?products=ID&products=ID&... — like the frontend does.
+
+    Args:
+        context: object handed through to get_json to perform the request.
+        path: endpoint path appended to BASE_URL, e.g. "/api/v1/products/prices".
+        product_ids: iterable of product IDs; each one becomes a repeated
+            ``products`` query parameter, in iteration order.
+
+    Returns:
+        The value returned by get_json for the built URL.
+
+    Raises:
+        RuntimeError: propagated from get_json when the response status is not 200.
+    """
+    # Imported locally so the function carries its own dependency. urlencode
+    # percent-escapes every value, so an ID containing "&", "=" or whitespace
+    # cannot corrupt the query string; plain integer IDs encode unchanged.
+    from urllib.parse import urlencode
+
+    qs = urlencode([("products", pid) for pid in product_ids])
+    url = f"{BASE_URL}{path}?{qs}"
+    return get_json(context, url)
+
+
+def as_list(payload):
+    """Each batch endpoint returns either a list or a wrapper around one."""
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict):
-        for k in ("products", "data", "items"):
+        for k in ("data", "products", "items"):
            v = payload.get(k)
            if isinstance(v, list):
                return v
-            if isinstance(v, dict):
-                for k2 in ("products", "items"):
-                    if isinstance(v.get(k2), list):
-                        return v[k2]
    return []


-def format_price(p):
-    """Try common price fields."""
-    if not isinstance(p, dict):
-        return ""
-    for k in ("price", "amount", "value"):
-        v = p.get(k)
-        if isinstance(v, (int, float)):
-            return f"{v:.2f}"
-        if isinstance(v, dict):
-            for k2 in ("amount", "value", "full"):
-                if isinstance(v.get(k2), (int, float)):
-                    return f"{v[k2]:.2f}"
-    return ""
+def index_by_id(items):
+    """Index a list of product dicts by their integer product ID.
+
+    The ID is read from ``productId`` and, failing that, from ``id``. A key is
+    used only if its value converts with ``int()``, so a ``productId`` of 0 is
+    honoured and a missing, ``None`` or non-numeric value falls through to the
+    next key. Items that are not dicts, or that carry no usable ID, are skipped.
+    When several items share an ID the last one wins.
+
+    Args:
+        items: iterable of arbitrary values, typically dicts from a batch endpoint.
+
+    Returns:
+        dict mapping ``int`` product ID to the original item dict.
+    """
+    out = {}
+    for it in items:
+        if not isinstance(it, dict):
+            continue
+        for key in ("productId", "id"):
+            try:
+                pid = int(it[key])
+            except (KeyError, TypeError, ValueError):
+                # Key absent, None, or not numeric: try the next candidate key
+                # instead of aborting the whole batch on one bad record.
+                continue
+            out[pid] = it
+            break
+    return out
+
+
+def fetch_merged_products(context, product_ids):
+    """Query every batch endpoint for the given IDs and merge the answers.
+
+    Each endpoint in PRODUCT_BATCH_ENDPOINTS is fetched once for the whole
+    list of IDs and its response is indexed by integer product ID.
+
+    Args:
+        context: object handed through to fetch_batch to perform the requests.
+        product_ids: list of product IDs for this chunk.
+
+    Returns:
+        One dict per input ID, in input order. Each starts with ``productId``
+        and gains one entry per endpoint key that returned data for that ID.
+    """
+    lookups = {}
+    for name, endpoint in PRODUCT_BATCH_ENDPOINTS.items():
+        response = fetch_batch(context, endpoint, product_ids)
+        lookups[name] = index_by_id(as_list(response))
+
+    records = []
+    for pid in product_ids:
+        record = {"productId": pid}
+        for name, table in lookups.items():
+            entry = table.get(int(pid))
+            if entry is None:
+                # Endpoint returned nothing for this ID; leave the key out.
+                continue
+            record[name] = entry
+        records.append(record)
+    return records


 def main():
@@ -76,46 +113,87 @@ def main():
        raise SystemExit(f"Missing {TREE_PATH} — run scrape_categories.py first.")

    data = json.loads(TREE_PATH.read_text(encoding="utf-8"))
-    tree = data["tree"]
-    path, leaf = find_first_leaf(tree)
+    path, leaf = find_first_leaf(data["tree"])
    print(f"First leaf: {' > '.join(path)} (id={leaf['id']})")
    print(f"URL: {BASE_URL}{leaf['url']}\n")

    with sync_playwright() as pw:
        context, page = ensure_logged_in(pw)

-        all_products = []
+        # Step 1: collect all product IDs across pages
+        all_ids = []
        page_num = 0
        while True:
-            print(f"Fetching page {page_num} ...")
+            print(f"Listing page {page_num} ...")
            payload = fetch_products_page(context, leaf["id"], page_num)
-            products = extract_products(payload)
-            print(f"  got {len(products)} products")
-            if not products:
+            ids = payload.get("productIds") or []
+            print(f"  got {len(ids)} product IDs")
+            if not ids:
                break
-            all_products.extend(products)
-            if len(products) < PAGE_SIZE:
+            all_ids.extend(ids)
+            if len(ids) < PAGE_SIZE:
                break
            page_num += 1

+        print(f"\nTotal IDs: {len(all_ids)}")
+        if not all_ids:
+            context.browser.close()
+            return
+
+        # Step 2: per chunk, hit the 5 batch endpoints the frontend uses and merge
+        all_products = []
+        for i in range(0, len(all_ids), CHUNK):
+            chunk = all_ids[i:i + CHUNK]
+            print(f"Batch fetch for IDs {i}..{i + len(chunk) - 1} ({len(chunk)} items) ...")
+            merged = fetch_merged_products(context, chunk)
+            all_products.extend(merged)
+
        print(f"\nTotal products: {len(all_products)}\n")

-        # Show first product raw structure so we can confirm field names
+        # Show one merged record so we see real field shapes
        if all_products:
-            print("--- Sample raw product (first item, truncated) ---")
-            print(json.dumps(all_products[0], ensure_ascii=False, indent=2)[:1500])
+            print("--- Sample merged product (first item, truncated) ---")
+            print(json.dumps(all_products[0], ensure_ascii=False, indent=2)[:2500])
            print("--- end sample ---\n")

-        print("Products in category:")
+        # Simple human-readable listing
+        print(f"{'ID':>9}  {'Skladem':<8}  {'Cena':>10}  {'Za jedn.':>11}  {'Akce':>10}  Název  (balení)")
+        print("-" * 100)
        for p in all_products:
-            name = p.get("productName") or p.get("name") or p.get("title") or "?"
-            pid = p.get("productId") or p.get("id") or "?"
-            price = format_price(p)
-            print(f"  [{pid}] {name}  {price}")
+            base = p.get("base") or {}
+            prices = p.get("prices") or {}
+            stock = p.get("stock") or {}
+
+            name = base.get("name") or "?"
+            unit = base.get("unit") or ""
+            textual = base.get("textualAmount") or ""
+
+            price = (prices.get("price") or {}).get("amount")
+            ppu = (prices.get("pricePerUnit") or {}).get("amount")
+            sale_price = None
+            sale_badge = ""
+            sales = prices.get("sales") or []
+            if sales:
+                first = sales[0]
+                sale_price = (first.get("price") or {}).get("amount")
+                badges = first.get("badges") or []
+                if badges:
+                    sale_badge = badges[0].get("title") or first.get("type") or ""
+                else:
+                    sale_badge = first.get("type") or ""
+
+            in_stock = stock.get("inStock")
+            stock_str = "ano" if in_stock else ("ne" if in_stock is False else "?")
+
+            price_str = f"{price:.2f}" if isinstance(price, (int, float)) else ""
+            ppu_str = f"{ppu:.2f}/{unit}" if isinstance(ppu, (int, float)) else ""
+            sale_str = f"{sale_price:.2f} {sale_badge}".strip() if isinstance(sale_price, (int, float)) else ""
+
+            print(f"{p['productId']:>9}  {stock_str:<8}  {price_str:>10}  {ppu_str:>11}  {sale_str:>10}  {name}  ({textual})")

        out_path = Path(__file__).parent / f"products_{leaf['id']}.json"
        out_path.write_text(json.dumps(all_products, ensure_ascii=False, indent=2), encoding="utf-8")
-        print(f"\nSaved raw products -> {out_path} ({out_path.stat().st_size} bytes)")
+        print(f"\nSaved -> {out_path} ({out_path.stat().st_size} bytes)")

        context.browser.close()