notebookVB
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,208 @@
|
|||||||
|
"""
|
||||||
|
Scrape the live Rohlik.cz category tree (main categories + subcategories)
|
||||||
|
via the navigation API and save it as JSON.
|
||||||
|
|
||||||
|
Endpoints:
|
||||||
|
GET /api/v5/navigation/components/navigation-tabs/categories
|
||||||
|
GET /api/v4/navigation/components/navigation-tabs/subcategories?categoryIds=ID1,ID2,...
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from playwright.sync_api import sync_playwright
|
||||||
|
from config import BASE_URL
|
||||||
|
from test_login import ensure_logged_in
|
||||||
|
|
||||||
|
# Output file for the scraped tree; it is written into the same directory as this script.
OUT_PATH = Path(__file__).parent / "categories_live.json"
|
||||||
|
|
||||||
|
# Navigation API endpoint (v5) that returns the main categories.
MAIN_URL = f"{BASE_URL}/api/v5/navigation/components/navigation-tabs/categories"
|
||||||
|
# Navigation API endpoint (v4) for subcategories; queried with a `categoryIds` parameter.
SUB_URL = f"{BASE_URL}/api/v4/navigation/components/navigation-tabs/subcategories"
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_json(context, url, **params):
    """GET *url* through the context's request client and return the decoded JSON.

    Keyword arguments are forwarded as query parameters; when none are given,
    ``None`` is passed instead of an empty dict.

    Raises:
        RuntimeError: if the response status is not 200. The message contains
            the status and the first 200 characters of the response body.
    """
    query = params if params else None
    response = context.request.get(url, params=query)
    if response.status == 200:
        return response.json()
    raise RuntimeError(f"GET {url} -> {response.status}: {response.text()[:200]}")
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_main(payload):
    """Extract the list of main categories from a navigation API response.

    Accepted shapes, checked in this order:

    * the payload itself is a list -> returned unchanged;
    * a dict with a list under one of ``data``, ``categories``, ``items``,
      ``navigationTabs`` or ``tabs``;
    * a dict whose value under one of those keys is itself a dict holding a
      list under ``categories``, ``items`` or ``tabs``.

    Returns:
        The first list found, or ``[]`` when nothing matches -- including when
        *payload* is neither a list nor a dict (e.g. ``None`` or a string).
    """
    if isinstance(payload, list):
        return payload
    if not isinstance(payload, dict):
        # Only dicts have .get(); anything else simply contains no categories.
        return []
    for key in ("data", "categories", "items", "navigationTabs", "tabs"):
        value = payload.get(key)
        if isinstance(value, list):
            return value
        if isinstance(value, dict):
            for inner_key in ("categories", "items", "tabs"):
                inner = value.get(inner_key)
                if isinstance(inner, list):
                    return inner
    return []
|
||||||
|
|
||||||
|
|
||||||
|
def pick(d, *keys):
    """Look up *keys* in order and return the first value that is not ``None``.

    Returns ``None`` when *d* is not a dict, when no keys are given, or when
    every key is missing or maps to ``None``.
    """
    if not isinstance(d, dict):
        return None
    return next((d[key] for key in keys if d.get(key) is not None), None)
|
||||||
|
|
||||||
|
|
||||||
|
def find_subcats_for(payload, parent_id):
    """Locate the subcategory list belonging to *parent_id* inside *payload*.

    The parent id is compared as a string. Shapes tried, in order:

    1. ``payload[pid]`` is a list, or a dict holding a list under one of
       ``subcategories`` / ``children`` / ``items`` / ``categories``.
    2. ``payload[wrapper][pid]`` is a list, for wrapper in ``data``,
       ``subcategories``, ``categories``, ``items``.
    3. ``payload[wrapper]`` is a list of dict entries; the entry whose
       ``parentId`` / ``categoryId`` / ``id`` equals the parent id supplies the
       list under one of the child keys from shape 1.

    Returns ``[]`` when nothing matches or *payload* is not a dict.
    """
    child_keys = ("subcategories", "children", "items", "categories")
    wanted = str(parent_id)

    def first_child_list(container):
        # First list-valued entry among the known child keys, else None.
        for key in child_keys:
            if isinstance(container.get(key), list):
                return container[key]
        return None

    if not isinstance(payload, dict):
        return []

    # Shape 1: payload keyed directly by the parent id.
    direct = payload.get(wanted)
    if isinstance(direct, list):
        return direct
    if isinstance(direct, dict):
        found = first_child_list(direct)
        if found is not None:
            return found

    for wrapper_key in ("data", "subcategories", "categories", "items"):
        wrapped = payload.get(wrapper_key)
        if isinstance(wrapped, dict):
            # Shape 2: wrapper dict keyed by the parent id.
            candidate = wrapped.get(wanted)
            if isinstance(candidate, list):
                return candidate
        elif isinstance(wrapped, list):
            # Shape 3: list of entries that each name their own parent.
            for entry in wrapped:
                if not isinstance(entry, dict):
                    continue
                if str(pick(entry, "parentId", "categoryId", "id")) != wanted:
                    continue
                found = first_child_list(entry)
                if found is not None:
                    return found
    return []
|
||||||
|
|
||||||
|
|
||||||
|
def build_clean_tree(main_payload, sub_payload):
    """Combine both API payloads into a two-level category tree.

    Each main category (from ``normalize_main``) becomes
    ``{"id", "name", "url", "children"}``; its children are the dict entries
    that ``find_subcats_for`` returns for that category id, each reduced to
    ``{"id", "name", "url"}``. Non-dict child entries are ignored.
    """
    tree = []
    for category in normalize_main(main_payload):
        category_id = pick(category, "id", "categoryId")
        category_name = pick(category, "name", "title", "label")
        category_url = pick(category, "url", "slug", "link")
        children = [
            {
                "id": pick(child, "id", "categoryId"),
                "name": pick(child, "name", "title", "label"),
                "url": pick(child, "url", "slug", "link"),
            }
            for child in find_subcats_for(sub_payload, category_id)
            if isinstance(child, dict)
        ]
        tree.append({
            "id": category_id,
            "name": category_name,
            "url": category_url,
            "children": children,
        })
    return tree
|
||||||
|
|
||||||
|
|
||||||
|
def subs_from_payload(payload):
    """Return the subcategory list carried by a subcategories API response.

    A list payload is returned as is. For a dict, the first list found under
    ``data``, ``subcategories``, ``items`` or ``categories`` (in that order)
    is returned. Anything else yields ``[]``.
    """
    if isinstance(payload, list):
        return payload
    if not isinstance(payload, dict):
        return []
    candidates = (
        payload.get(key)
        for key in ("data", "subcategories", "items", "categories")
    )
    return next((value for value in candidates if isinstance(value, list)), [])
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_children_recursive(context, parent_id, visited, depth=1, max_depth=6):
    """Fetch the subtree below *parent_id* and return it as a list of nodes.

    Each node is ``{"id", "name", "url", "children"}``. *visited* is a set of
    stringified ids that is updated in place; a parent already present in it,
    or a *depth* beyond *max_depth*, yields ``[]`` without any request.
    Children are fetched only for items that have an id and a truthy
    ``subcategoryIds`` field.
    """
    parent_key = str(parent_id)
    if parent_key in visited or depth > max_depth:
        return []
    visited.add(parent_key)

    listing = subs_from_payload(fetch_json(context, SUB_URL, categoryIds=parent_key))

    nodes = []
    for item in listing:
        if not isinstance(item, dict):
            continue
        item_id = pick(item, "id", "categoryId")
        item_name = pick(item, "name", "title", "label")
        item_url = pick(item, "url", "link", "slug")
        # Descend only when the item itself advertises children.
        if item_id and item.get("subcategoryIds"):
            descendants = fetch_children_recursive(
                context, item_id, visited, depth + 1, max_depth
            )
        else:
            descendants = []
        nodes.append({
            "id": item_id,
            "name": item_name,
            "url": item_url,
            "children": descendants,
        })
    return nodes
|
||||||
|
|
||||||
|
|
||||||
|
def print_tree(nodes, indent=0):
    """Print one ``- name (id=...)`` line per node, descending into children.

    Every nesting level adds one repetition of the indent string in front of
    the line. Nodes must provide ``name`` and ``id``; ``children`` is optional.
    """
    prefix = ' ' * indent
    for node in nodes:
        print(f"{prefix}- {node['name']} (id={node['id']})")
        kids = node.get("children")
        if kids:
            print_tree(kids, indent + 1)
|
||||||
|
|
||||||
|
|
||||||
|
def count_nodes(nodes):
    """Return the number of nodes in *nodes* plus all of their descendants.

    Descendants are found through each node's optional ``children`` list.
    """
    return len(nodes) + sum(
        count_nodes(node.get("children", [])) for node in nodes
    )
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Scrape the whole category tree and save it to ``OUT_PATH`` as JSON.

    Obtains a browser context from ``ensure_logged_in``, fetches the main
    categories, expands each one with ``fetch_children_recursive``, prints the
    resulting tree with node counts, and writes
    ``{"tree": ..., "raw_main": ...}`` to ``OUT_PATH``. Main categories
    without an id are skipped. The browser is closed even when a step raises.
    """
    with sync_playwright() as pw:
        # The page returned alongside the context is not needed by this script.
        context, _page = ensure_logged_in(pw)
        try:
            print("\nFetching main categories ...")
            main_payload = fetch_json(context, MAIN_URL)
            main_cats = normalize_main(main_payload)
            print(f" Got {len(main_cats)} main categories")

            clean_tree = []
            # Shared across all main categories so every id is expanded once.
            visited = set()

            print("\nFetching subcategories recursively ...")
            for cat in main_cats:
                cid = pick(cat, "id", "categoryId")
                cname = pick(cat, "name", "title", "label")
                curl = pick(cat, "url", "link", "slug")
                if not cid:
                    continue

                children = fetch_children_recursive(context, cid, visited)
                clean_tree.append({
                    "id": cid,
                    "name": cname,
                    "url": curl,
                    "children": children,
                })
                total = count_nodes(children)
                print(f" - {cname} (id={cid}) -> {len(children)} direct, {total} total descendants")

            print("\nFull category tree:")
            print_tree(clean_tree)

            grand_total = count_nodes(clean_tree)
            print(f"\nTotal nodes (incl. main): {grand_total}")

            # The unprocessed main payload is stored next to the cleaned tree.
            tree = {
                "tree": clean_tree,
                "raw_main": main_payload,
            }

            OUT_PATH.write_text(json.dumps(tree, ensure_ascii=False, indent=2), encoding="utf-8")
            print(f"Saved -> {OUT_PATH} ({OUT_PATH.stat().st_size} bytes)")
        finally:
            # Close the browser on failures as well, not only on the happy path.
            context.browser.close()


if __name__ == "__main__":
    main()
|
||||||
@@ -0,0 +1,124 @@
|
|||||||
|
"""
|
||||||
|
Open the first leaf (deepest) subcategory from categories_live.json
|
||||||
|
and list all products in it via the Rohlik JSON API.
|
||||||
|
|
||||||
|
Endpoint:
|
||||||
|
GET /api/v1/categories/normal/{categoryId}/products?page=N&size=50&sort=recommended
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from playwright.sync_api import sync_playwright
|
||||||
|
from config import BASE_URL
|
||||||
|
from test_login import ensure_logged_in
|
||||||
|
|
||||||
|
# Category tree JSON expected next to this script; main() exits when it is missing.
TREE_PATH = Path(__file__).parent / "categories_live.json"
|
||||||
|
# Sent as the `size` query parameter; a page shorter than this ends the pagination loop.
PAGE_SIZE = 50
|
||||||
|
|
||||||
|
|
||||||
|
def find_first_leaf(nodes, path=None):
    """Return ``(path, leaf)`` for the first leaf met in a depth-first walk.

    *path* is the list of ``name`` values from the top-level node down to the
    leaf. A leaf is a node whose ``children`` is missing or empty. Returns
    ``None`` when *nodes* is empty.
    """
    trail = [] if path is None else path
    for node in nodes:
        here = [*trail, node["name"]]
        kids = node.get("children") or []
        found = find_first_leaf(kids, here) if kids else (here, node)
        if found:
            return found
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_products_page(context, category_id, page):
    """Fetch one page of products for *category_id* and return the decoded JSON.

    The request asks for ``PAGE_SIZE`` items sorted by ``recommended``, with
    empty ``filter`` and ``excludeProductIds`` parameters.

    Raises:
        RuntimeError: if the response status is not 200. The message contains
            the status and the first 200 characters of the response body.
    """
    url = f"{BASE_URL}/api/v1/categories/normal/{category_id}/products"
    query = {
        "page": page,
        "size": PAGE_SIZE,
        "sort": "recommended",
        "filter": "",
        "excludeProductIds": "",
    }
    response = context.request.get(url, params=query)
    if response.status == 200:
        return response.json()
    raise RuntimeError(f"GET {url} -> {response.status}: {response.text()[:200]}")
|
||||||
|
|
||||||
|
|
||||||
|
def extract_products(payload):
    """Return the product list contained in a products API response.

    A list payload is returned as is. For a dict, the keys ``products``,
    ``data`` and ``items`` are tried in order: a list value is returned
    directly, and a dict value is searched for a list under ``products`` or
    ``items``. Anything else yields ``[]``.
    """
    if isinstance(payload, list):
        return payload
    if not isinstance(payload, dict):
        return []
    for outer_key in ("products", "data", "items"):
        value = payload.get(outer_key)
        if isinstance(value, list):
            return value
        if not isinstance(value, dict):
            continue
        for inner_key in ("products", "items"):
            nested = value.get(inner_key)
            if isinstance(nested, list):
                return nested
    return []
|
||||||
|
|
||||||
|
|
||||||
|
def format_price(p):
    """Format a product's price with two decimals, or return ``""``.

    The keys ``price``, ``amount`` and ``value`` are tried in order. A numeric
    value is formatted directly; a dict value is searched for a number under
    ``amount``, ``value`` or ``full``. Non-dict input yields ``""``.
    """
    if not isinstance(p, dict):
        return ""
    numeric = (int, float)
    for field in ("price", "amount", "value"):
        raw = p.get(field)
        if isinstance(raw, numeric):
            return f"{raw:.2f}"
        if not isinstance(raw, dict):
            continue
        for inner_field in ("amount", "value", "full"):
            inner = raw.get(inner_field)
            if isinstance(inner, numeric):
                return f"{inner:.2f}"
    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """List and save every product of the first leaf category in ``TREE_PATH``.

    Loads the category tree, picks the first leaf via ``find_first_leaf``,
    pages through the products API until an empty or short page is returned,
    prints a raw sample and a summary line per product, and writes the raw
    products to ``products_<leaf id>.json`` next to this script.

    Raises:
        SystemExit: if ``TREE_PATH`` does not exist or the tree has no nodes.
    """
    if not TREE_PATH.exists():
        raise SystemExit(f"Missing {TREE_PATH} — run scrape_categories.py first.")

    data = json.loads(TREE_PATH.read_text(encoding="utf-8"))
    tree = data["tree"]
    first = find_first_leaf(tree)
    if first is None:
        # An empty tree has no leaf; fail with a clear message rather than a
        # TypeError from unpacking None.
        raise SystemExit(f"No categories found in {TREE_PATH} — run scrape_categories.py again.")
    path, leaf = first
    print(f"First leaf: {' > '.join(path)} (id={leaf['id']})")
    print(f"URL: {BASE_URL}{leaf['url']}\n")

    with sync_playwright() as pw:
        # The page returned alongside the context is not needed by this script.
        context, _page = ensure_logged_in(pw)
        try:
            all_products = []
            page_num = 0
            while True:
                print(f"Fetching page {page_num} ...")
                payload = fetch_products_page(context, leaf["id"], page_num)
                products = extract_products(payload)
                print(f" got {len(products)} products")
                if not products:
                    break
                all_products.extend(products)
                # A short page means there is nothing left to fetch.
                if len(products) < PAGE_SIZE:
                    break
                page_num += 1

            print(f"\nTotal products: {len(all_products)}\n")

            # Show the first product's raw structure so field names can be confirmed.
            if all_products:
                print("--- Sample raw product (first item, truncated) ---")
                print(json.dumps(all_products[0], ensure_ascii=False, indent=2)[:1500])
                print("--- end sample ---\n")

            print("Products in category:")
            for p in all_products:
                name = p.get("productName") or p.get("name") or p.get("title") or "?"
                pid = p.get("productId") or p.get("id") or "?"
                price = format_price(p)
                print(f" [{pid}] {name} {price}")

            out_path = Path(__file__).parent / f"products_{leaf['id']}.json"
            out_path.write_text(json.dumps(all_products, ensure_ascii=False, indent=2), encoding="utf-8")
            print(f"\nSaved raw products -> {out_path} ({out_path.stat().st_size} bytes)")
        finally:
            # Close the browser on failures as well, not only on the happy path.
            context.browser.close()


if __name__ == "__main__":
    main()
|
||||||
@@ -1,49 +1,99 @@
|
|||||||
"""
|
"""
|
||||||
Reuse saved browser state (cookies + localStorage) so the Usercentrics cookie
|
Reusable login flow for Rohlik.cz:
|
||||||
banner never appears and we stay logged in — same situation as a returning user.
|
|
||||||
|
|
||||||
- If auth_state.json is MISSING: opens a browser, you accept cookies + log in
|
1. Load saved session (auth_state.json) if it exists.
|
||||||
manually, then press Enter to save the state.
|
2. Open the site and check whether we're already logged in.
|
||||||
- If auth_state.json EXISTS: loads it and just verifies (no banner, logged in).
|
3. If yes -> continue.
|
||||||
|
4. If no -> log in via the JSON API, accept cookies, save the session, continue.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from playwright.sync_api import sync_playwright
|
from playwright.sync_api import sync_playwright, BrowserContext, Page
|
||||||
from config import BASE_URL, AUTH_STATE_PATH
|
from config import BASE_URL, AUTH_STATE_PATH, ROHLIK_EMAIL, ROHLIK_PASSWORD
|
||||||
|
|
||||||
auth_path = Path(AUTH_STATE_PATH)
|
# Endpoint that api_login() POSTs the credentials to.
LOGIN_URL = f"{BASE_URL}/services/frontend-service/login"
|
||||||
have_state = auth_path.exists()
|
|
||||||
|
|
||||||
|
def is_logged_in(page: Page) -> bool:
    """Report whether *page* currently looks logged in.

    The page counts as logged in when it contains no element with the exact
    text "Přihlásit se" (Czech for "Log in").
    """
    login_prompts = page.locator('text="Přihlásit se"')
    return login_prompts.count() == 0
|
||||||
|
|
||||||
|
|
||||||
|
def accept_cookies(page: Page):
    """Accept the Usercentrics consent banner through its ``UC_UI`` JS API.

    The in-page script polls up to 20 times, 250 ms apart, for
    ``UC_UI.isInitialized()``. If ``UC_UI.acceptAllConsents`` is available it
    accepts all consents and closes the banner. Afterwards the function waits
    up to 5 s for ``#usercentrics-cmp-ui`` to leave the DOM; a failure of
    that wait is ignored.

    Returns:
        ``"accepted"`` or ``"UC_UI not available"``, as reported by the script.
    """
    consent_script = '''async () => {
        for (let i = 0; i < 20; i++) {
            if (window.UC_UI && window.UC_UI.isInitialized && window.UC_UI.isInitialized()) break;
            await new Promise(r => setTimeout(r, 250));
        }
        if (window.UC_UI && typeof window.UC_UI.acceptAllConsents === 'function') {
            await window.UC_UI.acceptAllConsents();
            await window.UC_UI.closeCMP();
            return "accepted";
        }
        return "UC_UI not available";
    }'''
    outcome = page.evaluate(consent_script)

    # Best effort: give the banner time to detach (its close animation takes
    # about a second), but do not fail if it never does.
    try:
        page.wait_for_selector('#usercentrics-cmp-ui', state='detached', timeout=5000)
    except Exception:
        pass
    return outcome
|
||||||
|
|
||||||
|
|
||||||
|
def api_login(context: BrowserContext) -> int:
    """POST the configured credentials to ``LOGIN_URL`` and return the HTTP status.

    The email and password come from ``ROHLIK_EMAIL`` / ``ROHLIK_PASSWORD`` and
    are sent as a JSON body through the context's request client. The
    response body is not inspected.
    """
    credentials = {"email": ROHLIK_EMAIL, "password": ROHLIK_PASSWORD}
    json_headers = {"Content-Type": "application/json", "Accept": "application/json"}
    response = context.request.post(
        LOGIN_URL,
        data=json.dumps(credentials),
        headers=json_headers,
    )
    return response.status
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_logged_in(pw) -> tuple[BrowserContext, Page]:
    """Launch Chromium and return a ``(context, page)`` pair opened on the site.

    When the file at ``AUTH_STATE_PATH`` exists it is loaded as the context's
    storage state. If the opened site does not look logged in, the function
    logs in through ``api_login``, reloads the site, accepts the cookie banner
    and, if the login worked, saves the session back to ``AUTH_STATE_PATH``.

    The pair is returned even when the login attempt failed (a message is
    printed). The caller owns the browser and closes it through
    ``context.browser.close()``.

    Args:
        pw: the Playwright object whose ``chromium.launch`` is used.

    Raises:
        Exception: any error raised while opening the site or logging in is
            re-raised after the browser launched here has been closed.
    """
    have_state = Path(AUTH_STATE_PATH).exists()

    browser = pw.chromium.launch(headless=False, args=["--start-maximized"])
    try:
        ctx_args = {"no_viewport": True}
        if have_state:
            ctx_args["storage_state"] = AUTH_STATE_PATH
        context = browser.new_context(**ctx_args)
        page = context.new_page()

        print(f"1) Opening site (saved session: {have_state}) ...")
        page.goto(BASE_URL, wait_until="domcontentloaded", timeout=60000)
        page.wait_for_timeout(3000)

        if is_logged_in(page):
            print("2) Already logged in from saved session — continuing.")
            return context, page

        print("2) Not logged in — logging in via API ...")
        status = api_login(context)
        print(f" Login API status: {status}")

        # Reload so the page reflects the outcome of the API login.
        page.goto(BASE_URL, wait_until="domcontentloaded", timeout=60000)
        page.wait_for_timeout(3000)

        print(f"3) Accepting cookies: {accept_cookies(page)}")

        if is_logged_in(page):
            context.storage_state(path=AUTH_STATE_PATH)
            print("4) Logged in and session saved.")
        else:
            print("4) Login FAILED — check API status above.")

        return context, page
    except Exception:
        # On failure the caller never receives the context, so the browser
        # must be closed here.
        browser.close()
        raise
|
||||||
input()
|
|
||||||
browser.close()
|
|
||||||
|
if __name__ == "__main__":
    # Manual check: run the login flow, report the resulting state, and keep
    # the browser open until Enter is pressed.
    with sync_playwright() as pw:
        context, page = ensure_logged_in(pw)

        logged_in = is_logged_in(page)
        print(f"\n -> logged in: {logged_in}")

        banner_present = page.locator('#usercentrics-cmp-ui').count() > 0
        print(f" -> cookie banner present: {banner_present}")

        print("\nReady to scrape. Press Enter to close browser...")
        input()
        context.browser.close()
|
||||||
|
|||||||
Reference in New Issue
Block a user