notebookVB

This commit is contained in:
2026-06-01 07:24:46 +02:00
parent e3522f4017
commit 5073c01692
8 changed files with 1056 additions and 261 deletions
+355 -202
View File
@@ -1,254 +1,407 @@
"""
Rohlik.cz Price Scraper - Main Scraper
Version: 1.0.0
Date: 2026-05-31
Playwright-based scraper that iterates all leaf categories on Rohlik.cz,
scrolls to lazy-load every product card, and extracts pricing data from the DOM.
Supports authenticated scraping (prices differ for logged-in users).
Rohlik.cz Price Scraper — API-based
Iterates leaf categories, fetches product IDs via listing API,
pulls details from 4 batch endpoints, upserts into MongoDB.
Usage:
python scraper.py --no-db --visible # scrape to JSON, visible browser
python scraper.py --no-db --filter "Brambory" # scrape single category to JSON
python scraper.py # scrape to MongoDB
python scraper.py --visible # scrape to MongoDB, visible browser
python scraper.py # all categories -> MongoDB
python scraper.py --category "Ovoce a zelenina" # one main category only
python scraper.py --no-db # dry run, no DB writes
python scraper.py --visible # show browser window
"""
import re
import json
import sys
import io
import argparse
import logging
from datetime import datetime, timezone
from pathlib import Path
from playwright.sync_api import sync_playwright, Page
from playwright.sync_api import sync_playwright
from config import (
BASE_URL, AUTH_STATE_PATH,
ROHLIK_EMAIL, ROHLIK_PASSWORD,
SCROLL_PAUSE, MAX_SCROLLS,
from config import BASE_URL
from test_login import ensure_logged_in
from db import get_db, ensure_indexes, upsert_products, upsert_categories, log_scrape_run
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace")
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(message)s",
datefmt="%H:%M:%S",
)
from categories import get_leaf_categories, get_all_categories_flat
from db import get_db, ensure_indexes, upsert_product, upsert_category, log_scrape_run
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger(__name__)
# Page size requested from the category listing API (sent as the ``size`` query
# parameter); a page shorter than this is treated as the last page.
PAGE_SIZE = 50
# Number of product IDs passed to the batch detail endpoints per request round.
CHUNK = 30
def parse_price(raw: str | None) -> float | None:
if not raw:
return None
digits = re.sub(r"[^\d]", "", raw)
if not digits:
return None
return int(digits) / 100
# Navigation endpoints: the list of main categories, and the subcategories of a
# given category (queried with a ``categoryIds`` parameter).
# NOTE(review): the two URLs use different API versions (v5 / v4) — confirm intentional.
MAIN_CATS_URL = f"{BASE_URL}/api/v5/navigation/components/navigation-tabs/categories"
SUBCATS_URL = f"{BASE_URL}/api/v4/navigation/components/navigation-tabs/subcategories"
# Paths (relative to BASE_URL) of the four batch product endpoints, keyed by the
# kind of data requested; each is called with repeated ``products=<id>`` parameters.
BATCH_ENDPOINTS = {
    "base": "/api/v1/products",
    "prices": "/api/v1/products/prices",
    "stock": "/api/v1/products/stock",
    "categories": "/api/v1/products/categories",
}
def parse_original_price(raw: str | None) -> float | None:
if not raw:
return None
match = re.search(r"([\d\s]+[,.][\d]+)", raw.replace("\xa0", " "))
if match:
return float(match.group(1).replace(" ", "").replace(",", "."))
digits = re.sub(r"[^\d]", "", raw)
if digits:
return float(digits) / 100
# ---------------------------------------------------------------------------
# helpers
# ---------------------------------------------------------------------------
def get_json(context, url, **params):
    """GET *url* through the context's request client and return the decoded JSON.

    Keyword arguments are forwarded as query parameters (``None`` is passed
    when there are none).

    Raises:
        RuntimeError: if the response status is not 200; the message contains
            the status and the first 120 characters of the URL.
    """
    query = params if params else None
    response = context.request.get(url, params=query)
    if response.status == 200:
        return response.json()
    raise RuntimeError(f"HTTP {response.status}: {url[:120]}")
def as_list(payload):
    """Return the list carried by a batch response.

    A list payload is returned as is.  For a dict payload the first of the
    keys ``data``, ``products``, ``items`` whose value is a list wins.
    Anything else yields an empty list.
    """
    if isinstance(payload, list):
        return payload
    if not isinstance(payload, dict):
        return []
    candidates = (payload.get(key) for key in ("data", "products", "items"))
    return next((value for value in candidates if isinstance(value, list)), [])
def pick(d, *keys):
    """Return the value of the first key in *keys* whose value in *d* is not None.

    Returns ``None`` when *d* is not a dict or no key has a non-None value.
    Falsy values such as ``0`` or ``""`` count as present.
    """
    if not isinstance(d, dict):
        return None
    for name in keys:
        value = d.get(name)
        if value is not None:
            return value
    return None
def login(page: Page):
    """Log in through the site's login form and save the session state.

    Opens ``BASE_URL``, clicks the first "Přihlásit se" element, fills the
    first e-mail and password inputs with ``ROHLIK_EMAIL`` and
    ``ROHLIK_PASSWORD``, submits the form and writes the browser context's
    storage state to ``AUTH_STATE_PATH``.

    NOTE(review): each step is followed by a fixed pause and the outcome is
    never checked, so the success message is logged and the state file is
    written even if authentication failed — confirm this is acceptable.
    """
    log.info("Logging in to Rohlik.cz...")
    page.goto(BASE_URL, wait_until="domcontentloaded", timeout=60000)
    page.wait_for_timeout(3000)  # fixed pause after navigation

    page.locator('text="Přihlásit se"').first.click()
    page.wait_for_timeout(2000)  # fixed pause, presumably for the login form to appear

    page.locator('input[type="email"], input[name="email"]').first.fill(ROHLIK_EMAIL)
    page.locator('input[type="password"], input[name="password"]').first.fill(ROHLIK_PASSWORD)
    page.locator('button[type="submit"]').first.click()
    page.wait_for_timeout(5000)  # fixed pause after submitting the form

    # Persist the context's storage state to AUTH_STATE_PATH.
    page.context.storage_state(path=AUTH_STATE_PATH)
    log.info("Login successful, auth state saved.")


# ---------------------------------------------------------------------------
# category tree — live from API
# ---------------------------------------------------------------------------
def normalize_main(payload):
    """Return the list of main categories carried by a navigation response.

    Accepted shapes, checked in this order:

    * a list — returned as is;
    * a dict whose ``data``, ``categories``, ``items``, ``navigationTabs`` or
      ``tabs`` value is a list — that list is returned;
    * a dict where one of those values is itself a dict holding a list under
      ``categories``, ``items`` or ``tabs`` — the nested list is returned.

    Any other payload (including ``None`` or a scalar) yields an empty list.
    """
    if isinstance(payload, list):
        return payload
    if not isinstance(payload, dict):
        # Unexpected shape: report "no categories" instead of failing on .get().
        return []

    for key in ("data", "categories", "items", "navigationTabs", "tabs"):
        value = payload.get(key)
        if isinstance(value, list):
            return value
        if isinstance(value, dict):
            for inner_key in ("categories", "items", "tabs"):
                inner = value.get(inner_key)
                if isinstance(inner, list):
                    return inner
    return []
def scroll_to_load_all(page: Page) -> int:
    """Scroll to the page bottom repeatedly and return the final product-card count.

    Performs at most ``MAX_SCROLLS`` scrolls, pausing ``SCROLL_PAUSE`` seconds
    after each one.  Stops early once the number of available product cards
    is unchanged from the previous scroll, but never before the fourth scroll.
    """
    card_selector = '[data-test^="productCard-AVAILABLE-"]'
    loaded = 0
    for attempt in range(MAX_SCROLLS):
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        page.wait_for_timeout(int(SCROLL_PAUSE * 1000))
        visible_now = page.locator(card_selector).count()
        stalled = visible_now == loaded
        loaded = visible_now
        if stalled and attempt > 2:
            break
    return loaded
def subs_from_payload(payload):
    """Return the subcategory list carried by a subcategories response.

    A list payload is returned unchanged.  For a dict, the value of the first
    of ``data``, ``subcategories``, ``items``, ``categories`` that is a list
    is returned.  Every other payload gives an empty list.
    """
    if isinstance(payload, list):
        return payload
    fields = ("data", "subcategories", "items", "categories")
    values = [payload.get(field) for field in fields] if isinstance(payload, dict) else []
    for value in values:
        if isinstance(value, list):
            return value
    return []
def extract_products(page: Page, category: dict) -> list[dict]:
    """Read every available product card from the current page.

    A script evaluated in the page collects the raw text/attributes of each
    ``productCard-AVAILABLE-*`` element; the raw values are then converted
    into product records tagged with the given category's id, name and path.
    Empty strings coming from the page are stored as ``None`` (except the
    name, which is kept as is).
    """
    raw_cards = page.evaluate("""
() => {
const products = [];
document.querySelectorAll('[data-test^="productCard-AVAILABLE-"]').forEach(card => {
const id = card.getAttribute('data-test').replace('productCard-AVAILABLE-', '');
const nameEl = card.querySelector('[data-test="productCard-body-name"]');
const priceNoEl = card.querySelector('[data-test="productCard-body-price-priceNo"]');
const saleEl = card.querySelector('[data-test="productCard-body-price-sale"]');
const amountEl = card.querySelector('[data-test="productCard-footer-amount"]');
const unitPriceEl = card.querySelector('[data-test="productCard-footer-unitPrice"]');
const badgeEl = card.querySelector('[data-test="productCard-body-badge"]');
const imgEl = card.querySelector('img');
const linkEl = card.querySelector('a[href*="/"]');
products.push({
product_id: id,
name: nameEl?.textContent?.trim() || '',
price_raw: priceNoEl?.textContent?.trim() || '',
original_price_raw: saleEl?.textContent?.trim() || '',
amount: amountEl?.textContent?.trim() || '',
unit_price_raw: unitPriceEl?.textContent?.trim() || '',
discount_badge: badgeEl?.textContent?.trim() || '',
image_url: imgEl?.src || '',
product_url: linkEl?.getAttribute('href') || '',
});
});
return products;
}
""")

    return [
        {
            "product_id": card["product_id"],
            "name": card["name"],
            "price": parse_price(card["price_raw"]),
            "original_price": parse_original_price(card["original_price_raw"]),
            "discount_badge": card["discount_badge"] or None,
            "amount": card["amount"] or None,
            "unit_price": card["unit_price_raw"].strip() or None,
            "image_url": card["image_url"] or None,
            # The scraped href is appended to BASE_URL.
            "product_url": f"{BASE_URL}{card['product_url']}" if card["product_url"] else None,
            "category_id": category["id"],
            "category_name": category["name"],
            "category_path": " > ".join(category.get("path", [category["name"]])),
        }
        for card in raw_cards
    ]
def scrape_leaf(page: Page, category: dict) -> list[dict]:
    """Open one leaf category page and return its extracted product records.

    Navigates to ``BASE_URL + category["url"]``, waits up to 15 s for the
    first available product card, scrolls until all cards are loaded and
    extracts them.  Returns an empty list when no product card appears.
    """
    url = f"{BASE_URL}{category['url']}"
    log.info("Scraping: %s (%s)", " > ".join(category.get("path", [category["name"]])), url)
    page.goto(url, wait_until="domcontentloaded", timeout=60000)
    page.wait_for_timeout(3000)

    try:
        page.wait_for_selector('[data-test^="productCard-AVAILABLE-"]', timeout=15000)
    except Exception:
        # Any failure while waiting is treated as "category has no products".
        log.warning(" No products found in %s, skipping.", category["name"])
        return []

    total = scroll_to_load_all(page)
    products = extract_products(page, category)
    log.info(" %d products extracted (loaded %d)", len(products), total)
    return products


def fetch_children_recursive(context, parent_id, visited, depth=1, max_depth=6):
    """Fetch the subcategory subtree below *parent_id* from the navigation API.

    Args:
        context: object passed through to ``get_json`` for the HTTP request.
        parent_id: id of the category whose children are requested.
        visited: set of already-expanded ids (as strings); it is updated in
            place so a category is never expanded twice.
        depth: current recursion depth, starting at 1.
        max_depth: recursion stops once *depth* exceeds this value.

    Returns:
        A list of nodes ``{"id", "name", "url", "children"}``.  ``children``
        is filled recursively only for entries that have an id and a
        non-empty ``subcategoryIds`` field; otherwise it is an empty list.
    """
    if str(parent_id) in visited or depth > max_depth:
        return []
    visited.add(str(parent_id))

    sub_payload = get_json(context, SUBCATS_URL, categoryIds=str(parent_id))
    subs = subs_from_payload(sub_payload)

    out = []
    for s in subs:
        if not isinstance(s, dict):
            continue
        sid = pick(s, "id", "categoryId")
        node = {
            "id": sid,
            "name": pick(s, "name", "title", "label"),
            "url": pick(s, "url", "link", "slug"),
            "children": [],
        }
        if sid and s.get("subcategoryIds"):
            node["children"] = fetch_children_recursive(context, sid, visited, depth + 1, max_depth)
        out.append(node)
    return out
def run_scraper(
category_filter: str | None = None,
headless: bool = True,
save_to_db: bool = True,
):
leaves = get_leaf_categories()
if category_filter:
category_filter_lower = category_filter.lower()
leaves = [c for c in leaves if category_filter_lower in " > ".join(c["path"]).lower()]
def fetch_category_tree(context):
"""Fetch full category tree live from Rohlik API."""
log.info("Fetching main categories ...")
main_payload = get_json(context, MAIN_CATS_URL)
main_cats = normalize_main(main_payload)
log.info(" %d main categories", len(main_cats))
log.info("Will scrape %d leaf categories", len(leaves))
tree = []
visited = set()
log.info("Fetching subcategories recursively ...")
for cat in main_cats:
cid = pick(cat, "id", "categoryId")
cname = pick(cat, "name", "title", "label")
curl = pick(cat, "url", "link", "slug")
if not cid:
continue
children = fetch_children_recursive(context, cid, visited)
node = {"id": cid, "name": cname, "url": curl, "children": children}
tree.append(node)
n_desc = count_nodes(children)
log.info(" - %s -> %d subcategories", cname, n_desc)
total = count_nodes(tree)
log.info(" Total: %d categories (incl. main)", total)
return tree
def count_nodes(nodes):
    """Return the number of nodes in *nodes* plus all of their descendants.

    Descendants are read from each node's ``children`` list (missing key
    counts as no children).
    """
    return len(nodes) + sum(count_nodes(node.get("children", [])) for node in nodes)
def collect_leaves(nodes, path=None):
    """Flatten a category tree into its leaf nodes, depth-first.

    Each returned leaf is a copy of the node with an extra ``path`` key: the
    list of names from the root (prefixed by *path*, if given) down to the
    leaf.  A node is a leaf when its ``children`` is missing or empty.
    """
    trail = [] if path is None else path
    found = []
    for node in nodes:
        here = trail + [node["name"]]
        kids = node.get("children") or []
        if not kids:
            found.append({**node, "path": here})
            continue
        found.extend(collect_leaves(kids, here))
    return found
def tree_to_db_docs(nodes, parent_id=None, path=None, path_names=None):
    """Flatten a category tree into one document per category, parents first.

    Each document holds ``_id``, ``name``, ``slug`` (the node url without
    leading slashes), ``path`` (ids from the root), ``pathNames`` (names from
    the root), ``parentId`` and ``isLeaf``.  *parent_id*, *path* and
    *path_names* seed those values for the top-level *nodes*.
    """
    docs = []

    def visit(siblings, parent, id_trail, name_trail):
        for node in siblings:
            ids = id_trail + [node["id"]]
            names = name_trail + [node["name"]]
            kids = node.get("children") or []
            docs.append({
                "_id": node["id"],
                "name": node["name"],
                "slug": (node.get("url") or "").lstrip("/"),
                "path": ids,
                "pathNames": names,
                "parentId": parent,
                "isLeaf": not kids,
            })
            if kids:
                visit(kids, node["id"], ids, names)

    visit(
        nodes,
        parent_id,
        [] if path is None else path,
        [] if path_names is None else path_names,
    )
    return docs
# ---------------------------------------------------------------------------
# product fetching
# ---------------------------------------------------------------------------
def fetch_product_ids(context, category_id):
    """Return all product IDs listed for a leaf category, without duplicates.

    Pages through the listing API ``PAGE_SIZE`` IDs at a time.  IDs are
    returned in first-seen order.  Pagination stops when a page is shorter
    than ``PAGE_SIZE`` or when a page contributes no ID that was not already
    collected, which prevents an endless loop if the API keeps returning the
    same full page.

    Raises:
        RuntimeError: propagated from ``get_json`` on a non-200 response.
    """
    collected = {}  # dict used as an insertion-ordered set
    page_no = 0
    while True:
        url = (f"{BASE_URL}/api/v1/categories/normal/{category_id}/products"
               f"?page={page_no}&size={PAGE_SIZE}&sort=recommended&filter=&excludeProductIds=")
        data = get_json(context, url)
        # A payload that is not an object carries no "productIds" field.
        ids = (data.get("productIds") if isinstance(data, dict) else None) or []

        known_before = len(collected)
        collected.update(dict.fromkeys(ids))
        made_progress = len(collected) > known_before

        if len(ids) < PAGE_SIZE or not made_progress:
            break
        page_no += 1
    return list(collected)
def fetch_batch(context, endpoint, product_ids):
    """Query one batch endpoint for the given product IDs and return its list.

    The IDs are sent as repeated ``products=<id>`` query parameters appended
    to ``BASE_URL + endpoint``; the response is reduced to a list by
    ``as_list``.
    """
    query = "&".join([f"products={pid}" for pid in product_ids])
    payload = get_json(context, f"{BASE_URL}{endpoint}?{query}")
    return as_list(payload)
def fetch_product_details(context, product_ids):
    """Fetch base, price, stock and category data for a chunk of product IDs.

    Calls the four batch endpoints one after another, in that order, and
    returns their raw lists as the tuple ``(bases, prices, stocks, cats)``.
    """
    bases, prices, stocks, cats = (
        fetch_batch(context, BATCH_ENDPOINTS[kind], product_ids)
        for kind in ("base", "prices", "stock", "categories")
    )
    return bases, prices, stocks, cats
# ---------------------------------------------------------------------------
# console output
# ---------------------------------------------------------------------------
def print_header():
    """Log the scraper banner framed by two 100-character rules."""
    rule = "=" * 100
    for line in (rule, " ROHLIK.CZ PRICE SCRAPER", rule):
        log.info(line)
def print_category_header(leaf, leaf_idx, total_leaves):
    """Log a framed heading for the leaf category about to be scraped.

    The heading shows the position (``leaf_idx``/``total_leaves``), the
    leaf's path joined with " > " and its id.
    """
    divider = "-" * 100
    breadcrumb = " > ".join(leaf["path"])
    log.info("")
    log.info(divider)
    log.info(" [%d/%d] %s (id=%s)", leaf_idx, total_leaves, breadcrumb, leaf["id"])
    log.info(divider)
def print_products_table(bases, prices_list, stocks):
    """Log one compact row per product in this chunk.

    Each row shows: stock marker (``+`` in stock, ``-`` out of stock, ``?``
    unknown), product id, price, price per unit, sale price with its first
    badge title, and the name cut to 50 characters.

    This is display-only code, so it tolerates incomplete records: missing or
    null fields are shown as ``?`` or left blank instead of raising, and
    entries that are not dicts are skipped.

    Args:
        bases: base product records (``id``, ``name``, ``unit``).
        prices_list: price records keyed by ``productId``.
        stocks: stock records keyed by ``productId``.
    """

    def as_dict(value):
        """Return *value* when it is a dict, otherwise an empty dict."""
        return value if isinstance(value, dict) else {}

    def amount_of(container, key):
        """Read ``container[key]["amount"]``, tolerating missing/null levels."""
        return as_dict(as_dict(container).get(key)).get("amount")

    def first_of(value):
        """Return the first element of a non-empty list as a dict, else {}."""
        return as_dict(value[0]) if isinstance(value, list) and value else {}

    prices_map = {p.get("productId"): p for p in prices_list if isinstance(p, dict)}
    stock_map = {s.get("productId"): s for s in stocks if isinstance(s, dict)}

    for b in bases:
        if not isinstance(b, dict):
            continue
        pid = b.get("id")
        p = prices_map.get(pid, {})
        s = stock_map.get(pid, {})

        name = str(b.get("name") or "?")[:50]
        price = amount_of(p, "price")
        ppu = amount_of(p, "pricePerUnit")
        unit = b.get("unit") or ""
        in_stock = s.get("inStock")
        stock_str = "+" if in_stock else "-" if in_stock is False else "?"

        sale_str = ""
        first_sale = first_of(p.get("sales"))
        sp = amount_of(first_sale, "price")
        if isinstance(sp, (int, float)) and sp:
            badge = first_of(first_sale.get("badges")).get("title") or ""
            sale_str = f"{sp:.2f} {badge}"

        price_str = f"{price:.2f}" if isinstance(price, (int, float)) else "?"
        ppu_str = f"{ppu:.2f}/{unit}" if isinstance(ppu, (int, float)) else ""

        # %9s renders integer ids exactly like %9d and also accepts
        # string or missing ids.
        log.info(" %s %9s %8s %12s %14s %s",
                 stock_str, pid, price_str, ppu_str, sale_str, name)
def print_summary(stats):
    """Log the end-of-run summary framed by two 100-character rules.

    Reads ``categories_scraped``, ``products_total`` and ``duration_seconds``
    from *stats*; the error count is logged only when ``errors`` is present
    and non-zero.
    """
    rule = "=" * 100
    log.info("")
    log.info(rule)
    log.info(" DONE")
    rows = (
        (" Categories: %d", "categories_scraped"),
        (" Products: %d unique", "products_total"),
        (" Duration: %.1f s", "duration_seconds"),
    )
    for template, key in rows:
        log.info(template, stats[key])
    if stats.get("errors"):
        log.info(" Errors: %d", stats["errors"])
    log.info(rule)
# ---------------------------------------------------------------------------
# main
# ---------------------------------------------------------------------------
def run_scraper(category_filter=None, headless=True, save_to_db=True):
db = None
if save_to_db:
db = get_db()
ensure_indexes(db)
with sync_playwright() as pw:
ctx_args = {}
if Path(AUTH_STATE_PATH).exists():
ctx_args["storage_state"] = AUTH_STATE_PATH
context, page = ensure_logged_in(pw, headless=headless)
browser = pw.chromium.launch(headless=headless)
context = browser.new_context(**ctx_args)
page = context.new_page()
# fetch live category tree from API
tree = fetch_category_tree(context)
page.goto(BASE_URL, wait_until="domcontentloaded", timeout=60000)
page.wait_for_timeout(5000)
# filter to one main category if requested
if category_filter:
cf = category_filter.lower()
tree = [t for t in tree if cf in t["name"].lower()]
if not tree:
raise SystemExit(f"No main category matching '{category_filter}'")
is_logged_in = page.locator('text="Přihlásit se"').count() == 0
if not is_logged_in:
if ROHLIK_EMAIL and ROHLIK_PASSWORD:
login(page)
context = browser.new_context(storage_state=AUTH_STATE_PATH)
page = context.new_page()
else:
log.warning("Not logged in! Prices may differ from member prices.")
leaves = collect_leaves(tree)
log.info("Scraping %d leaf categories", len(leaves))
# save categories to MongoDB
if db is not None:
cat_docs = tree_to_db_docs(tree)
upsert_categories(db, cat_docs)
log.info("Upserted %d category docs", len(cat_docs))
print_header()
run_start = datetime.now(timezone.utc)
all_products = []
seen_ids = set()
total_products = 0
errors = 0
db = None
if save_to_db:
db = get_db()
ensure_indexes(db)
for cat_data in get_all_categories_flat():
upsert_category(db, cat_data)
for i, leaf in enumerate(leaves, 1):
print_category_header(leaf, i, len(leaves))
for leaf in leaves:
try:
products = scrape_leaf(page, leaf)
for p in products:
if p["product_id"] not in seen_ids:
seen_ids.add(p["product_id"])
all_products.append(p)
if db:
upsert_product(db, p)
product_ids = fetch_product_ids(context, leaf["id"])
log.info(" %d product IDs", len(product_ids))
if not product_ids:
continue
# deduplicate within run
new_ids = [pid for pid in product_ids if pid not in seen_ids]
seen_ids.update(product_ids)
# process in chunks
for j in range(0, len(new_ids), CHUNK):
chunk = new_ids[j:j + CHUNK]
bases, prices, stocks, cats = fetch_product_details(context, chunk)
print_products_table(bases, prices, stocks)
if db is not None:
upsert_products(db, bases, prices, stocks, cats)
total_products += len(bases)
except Exception:
log.exception("Error scraping %s", leaf["name"])
log.exception(" ERROR in %s", leaf["name"])
errors += 1
context.browser.close()
run_end = datetime.now(timezone.utc)
run_data = {
"started_at": run_start,
"finished_at": run_end,
stats = {
"startedAt": run_start,
"finishedAt": run_end,
"duration_seconds": (run_end - run_start).total_seconds(),
"categories_scraped": len(leaves),
"products_scraped": len(all_products),
"products_total": total_products,
"errors": errors,
"filter": category_filter,
}
if db:
log_scrape_run(db, run_data)
if db is not None:
log_scrape_run(db, stats)
log.info(
"Done: %d unique products from %d categories in %.1fs",
len(all_products), len(leaves), run_data["duration_seconds"],
)
browser.close()
return all_products
def scrape_to_json(output_path: str = "products.json", **kwargs):
    """Run the scraper without database writes and dump the result as JSON.

    Extra keyword arguments are forwarded to ``run_scraper``.  The result is
    written to *output_path* as UTF-8, indented JSON with non-ASCII characters
    kept as is; values JSON cannot encode are written via ``str``.  Returns
    what ``run_scraper`` returned.
    """
    scraped = run_scraper(save_to_db=False, **kwargs)
    with open(output_path, "w", encoding="utf-8") as out_file:
        json.dump(scraped, out_file, ensure_ascii=False, indent=2, default=str)
    log.info("Saved %d products to %s", len(scraped), output_path)
    return scraped
print_summary(stats)
return stats
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Rohlik.cz price scraper")
parser.add_argument("--no-db", action="store_true", help="Save to JSON instead of MongoDB")
parser.add_argument("--visible", action="store_true", help="Run browser in visible mode")
parser.add_argument("--filter", type=str, help="Filter categories by name (e.g. 'Ovoce', 'Zelenina > Rajčata')")
parser = argparse.ArgumentParser(description="Rohlik.cz price scraper (API)")
parser.add_argument("--category", type=str, help="Scrape only this main category (e.g. 'Ovoce a zelenina')")
parser.add_argument("--no-db", action="store_true", help="Dry run — no MongoDB writes")
parser.add_argument("--visible", action="store_true", help="Show browser window")
args = parser.parse_args()
if args.no_db:
scrape_to_json(category_filter=args.filter, headless=not args.visible)
else:
run_scraper(category_filter=args.filter, headless=not args.visible)
run_scraper(
category_filter=args.category,
headless=not args.visible,
save_to_db=not args.no_db,
)