125 lines
4.1 KiB
Python
125 lines
4.1 KiB
Python
"""
|
|
Open the first leaf subcategory (a category with no children, found depth-first)
from categories_live.json and list all products in it via the Rohlik JSON API.

Endpoint:
    GET /api/v1/categories/normal/{categoryId}/products?page=N&size=50&sort=recommended
|
|
"""
|
|
|
|
import json
|
|
from pathlib import Path
|
|
from playwright.sync_api import sync_playwright
|
|
from config import BASE_URL
|
|
from test_login import ensure_logged_in
|
|
|
|
# Category tree file, expected to sit in the same directory as this script.
TREE_PATH = Path(__file__).with_name("categories_live.json")

# Number of products requested per API page ("size" query parameter).
PAGE_SIZE = 50
|
|
|
|
|
|
def find_first_leaf(nodes, path=None):
    """Return ``(path, node)`` for the first childless node, searching depth-first.

    Args:
        nodes: Iterable of category dicts. Each must have a ``"name"`` key and
            may have a ``"children"`` list; a missing, ``None`` or empty
            ``"children"`` marks the node as a leaf.
        path: Names of the ancestors already visited (used by the recursion).
            Defaults to an empty list.

    Returns:
        A tuple ``(names, leaf)`` where ``names`` is the list of ``"name"``
        values from the top level down to the leaf and ``leaf`` is the leaf
        dict itself, or ``None`` when ``nodes`` is empty.
    """
    prefix = [] if path is None else path
    for node in nodes:
        trail = prefix + [node["name"]]
        kids = node.get("children") or []
        if kids:
            # Descend first; only move on to the next sibling when the
            # subtree produced nothing.
            found = find_first_leaf(kids, trail)
            if found:
                return found
            continue
        return trail, node
    return None
|
|
|
|
|
|
def fetch_products_page(context, category_id, page):
    """Fetch one page of products for a category and return the decoded JSON.

    Args:
        context: Object exposing ``request.get(url, params=...)``; ``main``
            passes the context returned by ``ensure_logged_in`` (presumably a
            Playwright browser context -- confirm against ``test_login``).
        category_id: Category identifier interpolated into the endpoint path.
        page: Page number sent as the ``page`` query parameter.

    Returns:
        Whatever ``response.json()`` yields for a 200 response.

    Raises:
        RuntimeError: If the response status is anything other than 200; the
            message carries the URL, the status and the first 200 characters
            of the response body.
    """
    endpoint = f"{BASE_URL}/api/v1/categories/normal/{category_id}/products"
    response = context.request.get(
        endpoint,
        params={
            "page": page,
            "size": PAGE_SIZE,
            "sort": "recommended",
            "filter": "",
            "excludeProductIds": "",
        },
    )
    if response.status == 200:
        return response.json()
    raise RuntimeError(f"GET {endpoint} -> {response.status}: {response.text()[:200]}")
|
|
|
|
|
|
def extract_products(payload):
    """Locate the list of products inside an API payload.

    The payload shape is not fixed, so several layouts are tried in order:

    * the payload itself is a list;
    * a top-level ``"products"``, ``"data"`` or ``"items"`` key holds a list;
    * one of those keys holds a dict whose ``"products"`` or ``"items"`` key
      holds a list.

    Args:
        payload: Decoded JSON value (any type).

    Returns:
        The first matching list, or an empty list when nothing matches.
    """
    if isinstance(payload, list):
        return payload
    if not isinstance(payload, dict):
        return []

    for outer_key in ("products", "data", "items"):
        candidate = payload.get(outer_key)
        if isinstance(candidate, list):
            return candidate
        if not isinstance(candidate, dict):
            continue
        # One level of nesting, e.g. {"data": {"items": [...]}}.
        for inner_key in ("products", "items"):
            nested = candidate.get(inner_key)
            if isinstance(nested, list):
                return nested
    return []
|
|
|
|
|
|
def _is_price_number(value):
    """Return True for real ints/floats; bool is rejected although it subclasses int."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)


def format_price(p):
    """Format a product's price with two decimals, trying common field layouts.

    The top-level keys ``"price"``, ``"amount"`` and ``"value"`` are checked in
    that order. A numeric value is used directly; a dict value is searched for
    a numeric ``"amount"``, ``"value"`` or ``"full"`` entry.

    Booleans are never treated as prices: ``isinstance(True, int)`` is true in
    Python, so without the explicit exclusion a flag such as
    ``{"price": True}`` would be rendered as ``"1.00"``.

    Args:
        p: Product record; anything that is not a dict yields ``""``.

    Returns:
        The price formatted as ``"{:.2f}"``, or ``""`` when no numeric price
        field is found.
    """
    if not isinstance(p, dict):
        return ""
    for key in ("price", "amount", "value"):
        candidate = p.get(key)
        if _is_price_number(candidate):
            return f"{candidate:.2f}"
        if isinstance(candidate, dict):
            for inner_key in ("amount", "value", "full"):
                nested = candidate.get(inner_key)
                if _is_price_number(nested):
                    return f"{nested:.2f}"
    return ""
|
|
|
|
|
|
def _fetch_all_products(context, category_id):
    """Request successive product pages for *category_id* and return all items.

    Paging starts at 0 and stops at the first page that is empty or holds
    fewer than ``PAGE_SIZE`` products.
    """
    collected = []
    page_num = 0
    while True:
        print(f"Fetching page {page_num} ...")
        payload = fetch_products_page(context, category_id, page_num)
        products = extract_products(payload)
        print(f" got {len(products)} products")
        if not products:
            break
        collected.extend(products)
        if len(products) < PAGE_SIZE:
            # A short page means there is nothing further to request.
            break
        page_num += 1
    return collected


def main():
    """List every product of the first leaf category and save the raw records.

    Steps:
        1. Load the category tree from ``TREE_PATH`` and pick the first leaf
           via ``find_first_leaf``.
        2. Obtain a context from ``ensure_logged_in`` and download all product
           pages for that leaf.
        3. Print a truncated raw sample, a one-line summary per product, and
           write the full list to ``products_<leaf id>.json`` next to this
           script.

    Raises:
        SystemExit: If ``TREE_PATH`` does not exist, or if the tree contains no
            leaf category.
        RuntimeError: Propagated from ``fetch_products_page`` on a non-200
            response.
    """
    if not TREE_PATH.exists():
        raise SystemExit(f"Missing {TREE_PATH} — run scrape_categories.py first.")

    data = json.loads(TREE_PATH.read_text(encoding="utf-8"))
    tree = data["tree"]

    # find_first_leaf returns None for an empty tree; unpacking that directly
    # would fail with an unhelpful TypeError.
    found = find_first_leaf(tree)
    if found is None:
        raise SystemExit(f"No leaf category found in {TREE_PATH}.")
    path, leaf = found
    print(f"First leaf: {' > '.join(path)} (id={leaf['id']})")
    print(f"URL: {BASE_URL}{leaf['url']}\n")

    with sync_playwright() as pw:
        # The page object returned alongside the context is not needed here.
        context, _page = ensure_logged_in(pw)
        try:
            all_products = _fetch_all_products(context, leaf["id"])

            print(f"\nTotal products: {len(all_products)}\n")

            # Show first product raw structure so we can confirm field names
            if all_products:
                print("--- Sample raw product (first item, truncated) ---")
                print(json.dumps(all_products[0], ensure_ascii=False, indent=2)[:1500])
                print("--- end sample ---\n")

            print("Products in category:")
            for p in all_products:
                name = p.get("productName") or p.get("name") or p.get("title") or "?"
                pid = p.get("productId") or p.get("id") or "?"
                price = format_price(p)
                print(f" [{pid}] {name} {price}")

            out_path = Path(__file__).parent / f"products_{leaf['id']}.json"
            out_path.write_text(json.dumps(all_products, ensure_ascii=False, indent=2), encoding="utf-8")
            print(f"\nSaved raw products -> {out_path} ({out_path.stat().st_size} bytes)")
        finally:
            # Close the browser even when a request or the file write fails.
            context.browser.close()
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the listing only when executed as a script, not when imported.
    main()
|