notebookVB
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -1,9 +1,18 @@
|
||||
"""
|
||||
Open the first leaf (deepest) subcategory from categories_live.json
and list all products in it via the same Rohlik JSON API calls the website
itself makes.

Flow that mimics the real frontend:
  1. GET /api/v1/categories/normal/{categoryId}/products?page=N&size=50&sort=recommended
       -> { productIds: [...] }
  2. For each chunk of IDs, call 5 batch endpoints in the same way the site does:
       /api/v1/products
       /api/v1/products/prices
       /api/v1/products/stock
       /api/v1/products/categories
       /api/v1/products/user-data
     All use repeated query params: ?products=ID1&products=ID2&...
  3. Merge results per productId into one record.
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -14,10 +23,19 @@ from test_login import ensure_logged_in
|
||||
|
||||
# Category tree file that sits next to this script; main() exits early when it
# is missing.
TREE_PATH = Path(__file__).with_name("categories_live.json")

# Page size requested from the category listing; a page shorter than this is
# treated as the last one.
PAGE_SIZE = 50

# How many product IDs go into one batch request.
CHUNK = 30

# Batch endpoints queried for every chunk of product IDs. The key is the name
# under which that endpoint's data is stored in the merged product record.
PRODUCT_BATCH_ENDPOINTS = dict(
    base="/api/v1/products",
    prices="/api/v1/products/prices",
    stock="/api/v1/products/stock",
    categories="/api/v1/products/categories",
    user_data="/api/v1/products/user-data",
)
|
||||
|
||||
|
||||
def find_first_leaf(nodes, path=None):
|
||||
"""Walk the tree depth-first and return (path, leaf_node) of the first leaf."""
|
||||
if path is None:
|
||||
path = []
|
||||
for n in nodes:
|
||||
@@ -31,44 +49,63 @@ def find_first_leaf(nodes, path=None):
|
||||
return None
|
||||
|
||||
|
||||
def fetch_products_page(context, category_id, page):
|
||||
url = f"{BASE_URL}/api/v1/categories/normal/{category_id}/products"
|
||||
params = {"page": page, "size": PAGE_SIZE, "sort": "recommended", "filter": "", "excludeProductIds": ""}
|
||||
resp = context.request.get(url, params=params)
|
||||
def get_json(context, url):
    """GET *url* through ``context.request`` and return the decoded JSON body.

    Args:
        context: Object exposing ``request.get(url)``; presumably the
            logged-in Playwright browser context (confirm against the caller).
        url: Full request URL, query string included.

    Returns:
        Whatever ``resp.json()`` yields for the response.

    Raises:
        RuntimeError: If the response status is anything other than 200. The
            message carries the URL (shortened to 120 characters) and the
            first 200 characters of the response text.
    """
    resp = context.request.get(url)
    if resp.status != 200:
        # Batch URLs can carry dozens of IDs; keep the message readable and
        # only mark the URL as shortened when it actually was.
        shown = url if len(url) <= 120 else f"{url[:120]}..."
        raise RuntimeError(f"GET {shown} -> {resp.status}: {resp.text()[:200]}")
    return resp.json()
|
||||
|
||||
|
||||
def extract_products(payload):
|
||||
"""Find the products list in the payload — try common shapes."""
|
||||
def fetch_products_page(context, category_id, page):
    """Fetch one page of the product listing for a category.

    Builds the category listing URL with the page number, ``PAGE_SIZE`` and
    the fixed ``sort``/``filter``/``excludeProductIds`` parameters, then
    delegates the request to ``get_json``.

    Args:
        context: Request-capable context passed straight to ``get_json``.
        category_id: Category identifier interpolated into the URL path.
        page: Page number sent as the ``page`` query parameter.

    Returns:
        The decoded JSON payload returned by ``get_json``.
    """
    endpoint = f"{BASE_URL}/api/v1/categories/normal/{category_id}/products"
    query = f"?page={page}&size={PAGE_SIZE}&sort=recommended&filter=&excludeProductIds="
    return get_json(context, endpoint + query)
|
||||
|
||||
|
||||
def fetch_batch(context, path, product_ids):
    """Call a batch endpoint with repeated ``products=ID`` query parameters.

    Args:
        context: Request-capable context passed straight to ``get_json``.
        path: Endpoint path appended to ``BASE_URL`` (e.g. an entry of
            ``PRODUCT_BATCH_ENDPOINTS``).
        product_ids: Iterable of product IDs; each becomes one
            ``products=<id>`` pair in the query string.

    Returns:
        The decoded JSON payload returned by ``get_json``.
    """
    # Local import keeps this block self-contained; the file's import header
    # is not touched.
    from urllib.parse import urlencode

    # A list of pairs yields the repeated-key form (?products=1&products=2)
    # and percent-escapes any value that is not URL-safe.
    qs = urlencode([("products", pid) for pid in product_ids])
    return get_json(context, f"{BASE_URL}{path}?{qs}")
|
||||
|
||||
|
||||
def as_list(payload):
    """Return the list of items carried by *payload*, or ``[]`` if none is found.

    Accepted shapes:
      * a list, returned as-is;
      * a dict whose ``"data"``, ``"products"`` or ``"items"`` value is a list;
      * a dict whose value under one of those keys is itself a dict holding a
        list under ``"products"`` or ``"items"``.

    Keys are tried in the order ``data``, ``products``, ``items``; the first
    match wins. Any other payload (``None``, scalars, unrecognised dicts)
    yields an empty list.
    """
    if isinstance(payload, list):
        return payload
    if not isinstance(payload, dict):
        return []

    for key in ("data", "products", "items"):
        value = payload.get(key)
        if isinstance(value, list):
            return value
        if isinstance(value, dict):
            # One level of nesting, e.g. {"data": {"items": [...]}}.
            for inner_key in ("products", "items"):
                inner = value.get(inner_key)
                if isinstance(inner, list):
                    return inner
    return []
|
||||
|
||||
|
||||
def format_price(p):
    """Format the first numeric price found in product dict *p* as ``"N.NN"``.

    The keys ``"price"``, ``"amount"`` and ``"value"`` are tried in that
    order. A value may be a number, or a dict holding a number under
    ``"amount"``, ``"value"`` or ``"full"``.

    Args:
        p: Product record; anything that is not a dict yields ``""``.

    Returns:
        The price with two decimals, or ``""`` when no numeric price is found.
    """
    if not isinstance(p, dict):
        return ""

    for key in ("price", "amount", "value"):
        value = p.get(key)
        if isinstance(value, dict):
            candidates = [value.get(inner) for inner in ("amount", "value", "full")]
        else:
            candidates = [value]
        for candidate in candidates:
            # bool is a subclass of int; a True/False flag is not a price.
            if isinstance(candidate, (int, float)) and not isinstance(candidate, bool):
                return f"{candidate:.2f}"
    return ""
|
||||
def index_by_id(items):
    """Index product dicts by their integer product ID.

    For each dict in *items* the ID is read from ``"productId"``, falling
    back to ``"id"``. The first of the two that is present and convertible
    with ``int()`` becomes the key. Later items with the same ID overwrite
    earlier ones.

    Args:
        items: Iterable of arbitrary values; non-dict entries are ignored.

    Returns:
        ``{int_id: item_dict}``. Dicts without a usable ID are left out.
    """
    out = {}
    for item in items:
        if not isinstance(item, dict):
            continue
        for key in ("productId", "id"):
            raw = item.get(key)
            # Compare against None explicitly: 0 is a valid ID and must not
            # fall through to the next key the way a plain `or` would.
            if raw is None:
                continue
            try:
                out[int(raw)] = item
            except (TypeError, ValueError):
                # An ID that is not int-convertible can never match the
                # integer lookups done on this index, so try the next key.
                continue
            break
    return out
|
||||
|
||||
|
||||
def fetch_merged_products(context, product_ids):
    """Fetch every batch endpoint for a chunk of IDs and merge per product.

    Each endpoint in ``PRODUCT_BATCH_ENDPOINTS`` is requested once for the
    whole chunk. Its response is normalised with ``as_list`` and indexed
    with ``index_by_id``.

    Args:
        context: Request-capable context passed on to ``fetch_batch``.
        product_ids: Iterable of product IDs (values accepted by ``int()``).

    Returns:
        One dict per requested ID, in request order. Each holds
        ``"productId"`` plus one entry per endpoint key that returned data
        for that ID. An empty input yields ``[]`` without any request.
    """
    # Materialise once: the IDs are needed for every endpoint request and
    # again for the merge, which a one-shot iterator would not survive.
    ids = list(product_ids)
    if not ids:
        # Nothing to look up; avoid batch calls with an empty query.
        return []

    results = {
        key: index_by_id(as_list(fetch_batch(context, path, ids)))
        for key, path in PRODUCT_BATCH_ENDPOINTS.items()
    }

    merged = []
    for pid in ids:
        lookup = int(pid)  # the per-endpoint indexes are keyed by int
        record = {"productId": pid}
        for key, by_id in results.items():
            data = by_id.get(lookup)
            if data is not None:
                record[key] = data
        merged.append(record)
    return merged
|
||||
|
||||
|
||||
def main():
|
||||
@@ -76,46 +113,87 @@ def main():
|
||||
raise SystemExit(f"Missing {TREE_PATH} — run scrape_categories.py first.")
|
||||
|
||||
data = json.loads(TREE_PATH.read_text(encoding="utf-8"))
|
||||
tree = data["tree"]
|
||||
path, leaf = find_first_leaf(tree)
|
||||
path, leaf = find_first_leaf(data["tree"])
|
||||
print(f"First leaf: {' > '.join(path)} (id={leaf['id']})")
|
||||
print(f"URL: {BASE_URL}{leaf['url']}\n")
|
||||
|
||||
with sync_playwright() as pw:
|
||||
context, page = ensure_logged_in(pw)
|
||||
|
||||
all_products = []
|
||||
# Step 1: collect all product IDs across pages
|
||||
all_ids = []
|
||||
page_num = 0
|
||||
while True:
|
||||
print(f"Fetching page {page_num} ...")
|
||||
print(f"Listing page {page_num} ...")
|
||||
payload = fetch_products_page(context, leaf["id"], page_num)
|
||||
products = extract_products(payload)
|
||||
print(f" got {len(products)} products")
|
||||
if not products:
|
||||
ids = payload.get("productIds") or []
|
||||
print(f" got {len(ids)} product IDs")
|
||||
if not ids:
|
||||
break
|
||||
all_products.extend(products)
|
||||
if len(products) < PAGE_SIZE:
|
||||
all_ids.extend(ids)
|
||||
if len(ids) < PAGE_SIZE:
|
||||
break
|
||||
page_num += 1
|
||||
|
||||
print(f"\nTotal IDs: {len(all_ids)}")
|
||||
if not all_ids:
|
||||
context.browser.close()
|
||||
return
|
||||
|
||||
# Step 2: per chunk, hit the 5 batch endpoints the frontend uses and merge
|
||||
all_products = []
|
||||
for i in range(0, len(all_ids), CHUNK):
|
||||
chunk = all_ids[i:i + CHUNK]
|
||||
print(f"Batch fetch for IDs {i}..{i + len(chunk) - 1} ({len(chunk)} items) ...")
|
||||
merged = fetch_merged_products(context, chunk)
|
||||
all_products.extend(merged)
|
||||
|
||||
print(f"\nTotal products: {len(all_products)}\n")
|
||||
|
||||
# Show first product raw structure so we can confirm field names
|
||||
# Show one merged record so we see real field shapes
|
||||
if all_products:
|
||||
print("--- Sample raw product (first item, truncated) ---")
|
||||
print(json.dumps(all_products[0], ensure_ascii=False, indent=2)[:1500])
|
||||
print("--- Sample merged product (first item, truncated) ---")
|
||||
print(json.dumps(all_products[0], ensure_ascii=False, indent=2)[:2500])
|
||||
print("--- end sample ---\n")
|
||||
|
||||
print("Products in category:")
|
||||
# Simple human-readable listing
|
||||
print(f"{'ID':>9} {'Skladem':<8} {'Cena':>10} {'Za jedn.':>11} {'Akce':>10} Název (balení)")
|
||||
print("-" * 100)
|
||||
for p in all_products:
|
||||
name = p.get("productName") or p.get("name") or p.get("title") or "?"
|
||||
pid = p.get("productId") or p.get("id") or "?"
|
||||
price = format_price(p)
|
||||
print(f" [{pid}] {name} {price}")
|
||||
base = p.get("base") or {}
|
||||
prices = p.get("prices") or {}
|
||||
stock = p.get("stock") or {}
|
||||
|
||||
name = base.get("name") or "?"
|
||||
unit = base.get("unit") or ""
|
||||
textual = base.get("textualAmount") or ""
|
||||
|
||||
price = (prices.get("price") or {}).get("amount")
|
||||
ppu = (prices.get("pricePerUnit") or {}).get("amount")
|
||||
sale_price = None
|
||||
sale_badge = ""
|
||||
sales = prices.get("sales") or []
|
||||
if sales:
|
||||
first = sales[0]
|
||||
sale_price = (first.get("price") or {}).get("amount")
|
||||
badges = first.get("badges") or []
|
||||
if badges:
|
||||
sale_badge = badges[0].get("title") or first.get("type") or ""
|
||||
else:
|
||||
sale_badge = first.get("type") or ""
|
||||
|
||||
in_stock = stock.get("inStock")
|
||||
stock_str = "ano" if in_stock else ("ne" if in_stock is False else "?")
|
||||
|
||||
price_str = f"{price:.2f}" if isinstance(price, (int, float)) else ""
|
||||
ppu_str = f"{ppu:.2f}/{unit}" if isinstance(ppu, (int, float)) else ""
|
||||
sale_str = f"{sale_price:.2f} {sale_badge}".strip() if isinstance(sale_price, (int, float)) else ""
|
||||
|
||||
print(f"{p['productId']:>9} {stock_str:<8} {price_str:>10} {ppu_str:>11} {sale_str:>10} {name} ({textual})")
|
||||
|
||||
out_path = Path(__file__).parent / f"products_{leaf['id']}.json"
|
||||
out_path.write_text(json.dumps(all_products, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
print(f"\nSaved raw products -> {out_path} ({out_path.stat().st_size} bytes)")
|
||||
print(f"\nSaved -> {out_path} ({out_path.stat().st_size} bytes)")
|
||||
|
||||
context.browser.close()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user