# Source file: rohlik/10PriceScraping/Rohlik/scrape_first_leaf.py
# Snapshot: 2026-06-01 05:34:31 +02:00 (125 lines, 4.1 KiB, Python)

"""
Open the first leaf (deepest) subcategory from categories_live.json
and list all products in it via the Rohlik JSON API.
Endpoint:
GET /api/v1/categories/normal/{categoryId}/products?page=N&size=50&sort=recommended
"""
import json
from pathlib import Path
from playwright.sync_api import sync_playwright
from config import BASE_URL
from test_login import ensure_logged_in
# Category tree file, expected to sit next to this script.
TREE_PATH = Path(__file__).parent / "categories_live.json"
# Number of products requested per API page; a shorter page ends pagination.
PAGE_SIZE = 50
def find_first_leaf(nodes, path=None):
    """Locate the first childless node in a depth-first, in-order walk.

    Args:
        nodes: Iterable of node dicts. Each must have a ``"name"`` key and may
            have a ``"children"`` key holding a list of nested nodes.
        path: Names of the ancestors already visited (used by the recursion).

    Returns:
        A ``(names, node)`` tuple, where ``names`` is the list of names from
        the root down to the leaf, or ``None`` if ``nodes`` is empty.
    """
    ancestors = [] if path is None else path
    for node in nodes:
        trail = ancestors + [node["name"]]
        kids = node.get("children") or []
        if kids:
            # Descend before looking at later siblings (depth-first order).
            found = find_first_leaf(kids, trail)
            if found:
                return found
        else:
            # A missing, None or empty "children" entry marks a leaf.
            return trail, node
    return None
def fetch_products_page(context, category_id, page):
    """Request one page of a category's products and return the decoded JSON.

    Args:
        context: Object exposing ``request.get(url, params=...)``; in this
            script it is the context returned by ``ensure_logged_in``.
        category_id: Category identifier interpolated into the endpoint path.
        page: Page index sent as the ``page`` query parameter.

    Returns:
        Whatever ``response.json()`` yields for the request.

    Raises:
        RuntimeError: If the response status is anything other than 200. The
            message carries the URL, the status and the first 200 characters
            of the response body.
    """
    url = f"{BASE_URL}/api/v1/categories/normal/{category_id}/products"
    query = {
        "page": page,
        "size": PAGE_SIZE,
        "sort": "recommended",
        "filter": "",
        "excludeProductIds": "",
    }
    response = context.request.get(url, params=query)
    if response.status == 200:
        return response.json()
    raise RuntimeError(f"GET {url} -> {response.status}: {response.text()[:200]}")
def extract_products(payload):
    """Pull the list of products out of a decoded API response.

    Accepted shapes, checked in this order:
      * the payload itself is a list;
      * a dict whose ``"products"``, ``"data"`` or ``"items"`` value is a list;
      * a dict whose value under one of those keys is itself a dict holding a
        list under ``"products"`` or ``"items"``.

    Returns:
        The first list found, or an empty list if nothing matches.
    """
    if isinstance(payload, list):
        return payload
    if not isinstance(payload, dict):
        return []

    for outer_key in ("products", "data", "items"):
        candidate = payload.get(outer_key)
        if isinstance(candidate, list):
            return candidate
        if not isinstance(candidate, dict):
            continue
        # One level of nesting, e.g. {"data": {"products": [...]}}.
        for inner_key in ("products", "items"):
            nested = candidate.get(inner_key)
            if isinstance(nested, list):
                return nested
    return []
def format_price(p):
    """Render a product's price with two decimals, or ``""`` if none is found.

    Looks at the ``"price"``, ``"amount"`` and ``"value"`` keys of ``p`` in
    that order. A numeric value is formatted directly; a dict value is
    searched for a numeric ``"amount"``, ``"value"`` or ``"full"`` entry.
    Non-dict input yields an empty string.
    """
    if not isinstance(p, dict):
        return ""

    numeric = (int, float)
    for field in ("price", "amount", "value"):
        raw = p.get(field)
        if isinstance(raw, numeric):
            return format(raw, ".2f")
        if isinstance(raw, dict):
            # Price given as an object, e.g. {"price": {"full": 12.9}}.
            for sub_field in ("amount", "value", "full"):
                inner = raw.get(sub_field)
                if isinstance(inner, numeric):
                    return format(inner, ".2f")
    return ""
def _collect_category_products(context, category_id):
    """Fetch every page of products for ``category_id`` and return one list."""
    collected = []
    page_num = 0
    while True:
        print(f"Fetching page {page_num} ...")
        payload = fetch_products_page(context, category_id, page_num)
        products = extract_products(payload)
        print(f" got {len(products)} products")
        if not products:
            break
        collected.extend(products)
        # A page shorter than PAGE_SIZE is treated as the last one.
        if len(products) < PAGE_SIZE:
            break
        page_num += 1
    return collected


def main():
    """List and save all products of the first leaf category.

    Reads the category tree from ``TREE_PATH``, picks the first leaf via
    ``find_first_leaf``, downloads all of its product pages through a
    logged-in Playwright context, prints a summary and writes the raw product
    list to ``products_<category id>.json`` next to this script.

    Raises:
        SystemExit: If ``TREE_PATH`` does not exist or the tree has no leaf.
        RuntimeError: Propagated from ``fetch_products_page`` on a non-200
            response.
    """
    if not TREE_PATH.exists():
        raise SystemExit(f"Missing {TREE_PATH} — run scrape_categories.py first.")

    data = json.loads(TREE_PATH.read_text(encoding="utf-8"))
    tree = data["tree"]

    # find_first_leaf returns None for an empty tree; fail with a clear message
    # instead of a TypeError from tuple unpacking.
    found = find_first_leaf(tree)
    if found is None:
        raise SystemExit(f"No leaf category found in {TREE_PATH}.")
    path, leaf = found

    print(f"First leaf: {' > '.join(path)} (id={leaf['id']})")
    print(f"URL: {BASE_URL}{leaf['url']}\n")

    with sync_playwright() as pw:
        context, _page = ensure_logged_in(pw)
        try:
            all_products = _collect_category_products(context, leaf["id"])
        finally:
            # Always release the browser, even when a request fails.
            # BrowserContext.browser is None for persistent contexts, in which
            # case closing the context itself is the way to shut down.
            browser = context.browser
            if browser is not None:
                browser.close()
            else:
                context.close()

    print(f"\nTotal products: {len(all_products)}\n")

    # Show first product raw structure so we can confirm field names
    if all_products:
        print("--- Sample raw product (first item, truncated) ---")
        print(json.dumps(all_products[0], ensure_ascii=False, indent=2)[:1500])
        print("--- end sample ---\n")

    print("Products in category:")
    for p in all_products:
        name = p.get("productName") or p.get("name") or p.get("title") or "?"
        pid = p.get("productId") or p.get("id") or "?"
        price = format_price(p)
        print(f" [{pid}] {name} {price}")

    out_path = Path(__file__).parent / f"products_{leaf['id']}.json"
    out_path.write_text(
        json.dumps(all_products, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    print(f"\nSaved raw products -> {out_path} ({out_path.stat().st_size} bytes)")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()