From 5073c016929126d64b35ce55a36bb4817f7adb06 Mon Sep 17 00:00:00 2001 From: Vladimir Buzalka Date: Mon, 1 Jun 2026 07:24:46 +0200 Subject: [PATCH] notebookVB --- 10PriceScraping/Rohlik/API_NOTES.md | 512 ++++++++++++++++ .../Rohlik/{ => Trash}/categories.py | 0 .../Rohlik/{ => Trash}/scrape_categories.py | 0 .../Rohlik/{ => Trash}/scrape_first_leaf.py | 0 10PriceScraping/Rohlik/Trash/test_db.py | 93 +++ 10PriceScraping/Rohlik/db.py | 151 +++-- 10PriceScraping/Rohlik/scraper.py | 557 +++++++++++------- 10PriceScraping/Rohlik/test_login.py | 4 +- 8 files changed, 1056 insertions(+), 261 deletions(-) create mode 100644 10PriceScraping/Rohlik/API_NOTES.md rename 10PriceScraping/Rohlik/{ => Trash}/categories.py (100%) rename 10PriceScraping/Rohlik/{ => Trash}/scrape_categories.py (100%) rename 10PriceScraping/Rohlik/{ => Trash}/scrape_first_leaf.py (100%) create mode 100644 10PriceScraping/Rohlik/Trash/test_db.py diff --git a/10PriceScraping/Rohlik/API_NOTES.md b/10PriceScraping/Rohlik/API_NOTES.md new file mode 100644 index 0000000..bb7f99d --- /dev/null +++ b/10PriceScraping/Rohlik/API_NOTES.md @@ -0,0 +1,512 @@ +# Rohlik.cz Scraper — API & Data Notes + +Co víme o rohlik.cz scrapingu k 2026-06-01. Tento dokument shrnuje endpointy, +tvary odpovědí, login flow a poznámky pro návrh databáze. + +--- + +## 1. Login / session + +### 1.1 API login (bez UI) + +Stránka má klasický JSON endpoint, který se chová stejně jako přihlášení přes formulář: + +``` +POST https://www.rohlik.cz/services/frontend-service/login +Content-Type: application/json +Accept: application/json + +{ "email": "...", "password": "..." } +``` + +Odpověď (status 200): +```json +{ + "status": 200, + "messages": [], + "data": { + "status": ..., + "fbLoginUrl": "...", + "uniqsid": "...", + "user": { ... }, + "address": { ... }, + "zoneId": ..., + "availableStores": [...], + "features": [...], + "deliveryPoint": { ... 
}, + "segment": "...", + "personalizationConsent": ..., + "newUserCreated": false, + "session": { ... }, + "admin": false, + "isAuthenticated": true, + "store": ..., + "isAdmin": false + } +} +``` + +Po úspěšném loginu sedí v contextu cookie `PHPSESSION` na `.rohlik.cz`, +která drží přihlášení pro všechny další API calls. + +### 1.2 Cloudflare cookies + +První GET na `https://www.rohlik.cz` vyrobí cookie `cf_clearance` (Cloudflare +challenge JS běží automaticky v headful Playwrightu). Bez ní login API +nereaguje. Proto skript **nejdřív** otevře homepage, **pak** posílá login POST. + +### 1.3 Cookie consent banner (Usercentrics) + +- Banner se renderuje přes web-component `#usercentrics-cmp-ui` se shadow DOM. +- Pokus o klikání přes DOM selektory zvenku **nefunguje** — shadow root blokuje pointer events i pro elementy pod ním. +- Funkční cesta: oficiální JS API + ```js + await window.UC_UI.acceptAllConsents(); + await window.UC_UI.closeCMP(); + ``` +- Banner mizí s ~1 s animací, takže po close je potřeba `wait_for_selector('#usercentrics-cmp-ui', state='detached')`. +- Souhlas se ukládá do `localStorage` (klíče `uc_user_interaction`, `uc_settings`, …) + cookie `consentTracked=true`. + +### 1.4 Reuse: `auth_state.json` + +`context.storage_state(path=...)` uloží cookies + localStorage. Při příštím +běhu se to nahraje přes `browser.new_context(storage_state=...)` a uživatel je: + +- už přihlášený (login API se neopakuje), +- už má souhlas s cookies (banner se vůbec nezobrazí). + +Implementace flow viz `test_login.py::ensure_logged_in()`: +1. načti `auth_state.json` pokud existuje, +2. otevři `BASE_URL`, zkontroluj `text="Přihlásit se"` (přítomné → nepřihlášen), +3. když nepřihlášen → `POST /services/frontend-service/login`, accept cookies, ulož state, +4. když přihlášen → rovnou jeď dál. + +--- + +## 2. Kategorie + +### 2.1 Hlavní kategorie + +``` +GET /api/v5/navigation/components/navigation-tabs/categories +``` + +Vrací list 17 hlavních kategorií. 
Každá obsahuje: + +```json +{ + "id": 300102000, + "name": "Ovoce a zelenina", + "link": "/c300102000-ovoce-a-zelenina", + "image": "/images/.../fruits-and-veggies.png", + "imageType": "rich", + ... +} +``` + +Aktuální seznam (k dnešku): + +| ID | Název | +|------------|----------------------| +| 300102000 | Ovoce a zelenina | +| 300105000 | Mléčné a chlazené | +| 300103000 | Maso a ryby | +| 300117503 | Grilování | +| 300101000 | Pekárna a cukrárna | +| 300104000 | Uzeniny a lahůdky | +| 300107000 | Mražené | +| 300121429 | Plant Based | +| 300106000 | Trvanlivé | +| 300108000 | Nápoje | +| 300112393 | Speciální výživa | +| 300124206 | Kosmetika | +| 300109000 | Drogerie | +| 300111000 | Domácnost a zahrada | +| 300110000 | Dítě | +| 300112000 | Zvíře | +| 300112985 | Lékárna | + +> Hardcoded strom v `categories.py` je zastaralý (chybí Dítě, Zvíře, Lékárna). +> Doporučeno přejít na živé tahání z API. + +### 2.2 Subkategorie (rekurzivně) + +``` +GET /api/v4/navigation/components/navigation-tabs/subcategories?categoryIds= +``` + +Vrací **flat list** dětí dané kategorie. Příklad jednoho prvku: + +```json +{ + "id": 300112001, + "name": "Pes", + "image": "/images/.../1342001-1531397856.jpg", + "imageColor": "var(--green-60)", + "link": "/c300112001-pes", + "imageLink": null, + "imageType": "rich", + "subcategoryIds": [300112002, 300112003, 300112004, 300112008, 300112009, 300118461, 300124184, 300124185] +} +``` + +Klíčový moment: pole `subcategoryIds` říká, že tento uzel má další děti. +Pro získání těch dětí musíme **opět zavolat stejný endpoint** s tímto ID jako parentem. 
+ +#### Rekurzivní algoritmus + +```python +def fetch_children(parent_id, visited, depth=1, max_depth=6): + if str(parent_id) in visited or depth > max_depth: return [] + visited.add(str(parent_id)) + subs = GET /api/v4/.../subcategories?categoryIds={parent_id} + out = [] + for s in subs: + node = {id, name, url, children: []} + if s.subcategoryIds: + node.children = fetch_children(s.id, visited, depth+1) + out.append(node) + return out +``` + +Implementace v `scrape_categories.py`. Výstup uložen v `categories_live.json` +jako `{tree: [{id, name, url, children: [...]}], raw_main: ...}`. + +--- + +## 3. Listing produktů v kategorii + +### 3.1 Endpoint + +``` +GET /api/v1/categories/normal//products + ?page= # 0-based + &size=50 # max items per page + &sort=recommended + &filter= + &excludeProductIds= +``` + +### 3.2 Odpověď — jen IDs + +```json +{ + "categoryId": 300102013, + "categoryType": "normal", + "productIds": [1407650, 1354613, 1350461, ...], + "productsWithType": [{"id": 1407650, "type": "PRODUCT"}, ...], + "impressions": [], + "interactiveProductCardAds": [], + "pageable": { + "pageNumber": 0, + "pageSize": 50, + "sort": {...}, + "offset": 0, + "unpaged": false, + "paged": true + } +} +``` + +Listing **nevrací detaily** — jen ID. Detail produktů se musí dotáhnout přes 5 batch endpointů (níže). + +### 3.3 Stránkování + +- `size=50` se chová jako horní limit; pokud kategorie má méně, vrátí všechno najednou. +- Konec stránek = první stránka, která vrátí prázdný `productIds`, **nebo** stránka s méně než `size` items. + +--- + +## 4. Detail produktů — 5 paralelních batch endpointů + +Stránka pro každou sadu ID volá **5 batch endpointů paralelně**, vždy s opakovaným query parametrem `?products=ID1&products=ID2&...`: + +``` +GET /api/v1/products?products=... +GET /api/v1/products/prices?products=... +GET /api/v1/products/stock?products=... +GET /api/v1/products/categories?products=... +GET /api/v1/products/user-data?products=... 
+``` + +> ⚠ `categoryType=normal` parametr stránka taky posílá — bezpečnější ho přidat. +> ⚠ Syntaxe je **opakovaný klíč**, ne čárka. `?products=1&products=2`, ne `?products=1,2`. +> ⚠ Existuje i `/api/v1/products/card?products=...` — listing ho **nepoužívá**. Vyhnout se. + +### 4.1 `/api/v1/products` — základní info + +```json +[ + { + "id": 1407650, + "name": "Čerstvě utrženo – Okurka hadovka, bez folie", + "slug": "cerstve-utrzeno-okurka-hadovka-bez-folie", + "mainCategoryId": 300102013, + "unit": "kg", + "textualAmount": "cca 380 g", + "weightedItem": true, + "packageRatio": null, + "brand": null, + "sellerId": 1, + "flag": "cz", + "archived": false, + "premiumOnly": false, + "type": "PRODUCT", + "images": [ + "https://cdn.rohlik.cz/images/grocery/products/1407650/1407650-...jpg", + ... + ], + "countries": [ + { "name": "Česká republika", "nameId": "ceska-republika", "code": "CZ" } + ], + "countryOfOriginFlagIcon": "https://cdn.rohlik.cz/images/countryFlags/cz.svg", + "badges": [ + { "type": "freshly-harvested", "title": "Čerstvě sklizeno", "subtitle": null, "tooltip": "" } + ], + "filters": [], + "information": [], + "attachments": [], + "image3dData": null, + "adviceForSafeUse": null, + "productStory": null, + "canBeFavorite": true, + "canBeRated": true + } +] +``` + +| Pole | Typ | Popis | +|--------------------|----------|-------| +| `id` | int | Product ID | +| `name` | string | Plný název | +| `slug` | string | URL slug (`/{slug}-c{id}` nebo přes `/products/{id}-{slug}`) | +| `mainCategoryId` | int | ID kategorie kam patří | +| `unit` | string | "kg" / "ks" / "l" / ... — jednotka ceny | +| `textualAmount` | string | "cca 380 g" / "1 ks" / "500 ml" — pro zobrazení | +| `weightedItem` | bool | true = vážené (variabilní hmotnost), false = kusové | +| `brand` | string? | Značka nebo null | +| `flag` | string? | Země původu kód ("cz", "it", ...) 
| +| `images` | string[] | URL obrázků (první je hlavní) | +| `countries` | object[] | Strukturovaná země původu | +| `badges` | object[] | Štítky (bio, čerstvě sklizeno, …) | +| `archived` | bool | True = produkt už nabídku opustil | +| `premiumOnly` | bool | Jen pro Xtra členy | + +### 4.2 `/api/v1/products/prices` — ceny + +Bez slevy: +```json +[ + { + "productId": 1407650, + "price": { "amount": 34.16, "currency": "CZK" }, + "pricePerUnit": { "amount": 89.9, "currency": "CZK" }, + "sales": [], + "lastMinuteTitle": null + } +] +``` + +Se slevou: +```json +{ + "productId": 1437841, + "price": { "amount": 65.69, "currency": "CZK" }, + "pricePerUnit": { "amount": 429.9, "currency": "CZK" }, + "sales": [ + { + "id": 12988802, + "type": "premium", // "premium" / "sale" / ... + "triggerAmount": 1, + "price": { "amount": 55.83, "currency": "CZK" }, + "pricePerUnit": { "amount": 365.38, "currency": "CZK" }, + "originalPrice": { "amount": 65.69, "currency": "CZK" }, + "originalPricePerUnit": null, + "badges": [{ "type": "premium-discount", "title": "-15 %", "subtitle": null }], + "validTill": "2029-01-02T23:59:00+01:00", + "active": true, + "silent": false, + "bundleId": null + } + ], + "lastMinuteTitle": null +} +``` + +| Pole | Cesta | Popis | +|------|-------|-------| +| Cena | `price.amount` | Aktuální cena za balení (Kč) | +| Cena/jednotku | `pricePerUnit.amount` | Cena za `unit` z `/products` | +| Akce | `sales[0].price.amount` | Pokud `sales` neprázdné | +| Typ akce | `sales[0].type` | `premium` (Xtra), `sale`, … | +| Štítek | `sales[0].badges[0].title` | "-10 %", "-15 %", ... 
| +| Platnost | `sales[0].validTill` | ISO datetime | + +### 4.3 `/api/v1/products/stock` — skladovost + +```json +[ + { + "productId": 1407650, + "warehouseId": 8799, + "packageInfo": { "amount": 0.38, "unit": "kg" }, + "inStock": false, + "maxBasketAmount": 0, + "maxBasketAmountReason": "AVAILABLE", // "ALLOWED" když lze koupit + "preorderEnabled": false, + "unavailabilityReason": null, + "deliveryRestriction": null, + "expectedReplenishment": null, + "availabilityDimension": 0, + "shelfLife": null, // { value, unit } + "billablePackaging": null, // záloha (lahve) + "freshness": null, + "premiumOnly": false, + "tooltips": [], + "sales": [] + } +] +``` + +| Pole | Popis | +|------|-------| +| `inStock` | bool — skladem ano/ne | +| `maxBasketAmount` | int — max kusů do košíku | +| `packageInfo.amount` + `.unit` | Reálná hmotnost/objem balení (oproti `textualAmount` z base) | +| `warehouseId` | ID skladu (může se lišit podle adresy) | +| `shelfLife` | Trvanlivost (pokud uvedena) | +| `billablePackaging` | Zálohovaný obal (lahev atd.) | + +### 4.4 `/api/v1/products/categories` + +```json +[ + { + "productId": 1407650, + "categories": [ + { "id": 300102000, "type": "normal", "name": "Ovoce a zelenina", "slug": "ovoce-a-zelenina", "level": 0 }, + { "id": 300102008, "type": "normal", "name": "Zelenina", "slug": "zelenina", "level": 1 }, + { "id": 300102013, "type": "normal", "name": "Okurky, cukety a lilky", "slug": "okurky-cukety-a-lilky", "level": 2 } + ] + } +] +``` + +Plný strom kategorií od kořene k listu, `level=0` = hlavní. Užitečné, protože produkt může patřit do více kategorií (např. „Grilování" duplikuje listy z masa). + +### 4.5 `/api/v1/products/user-data` + +Per-user data (oblíbené, naposled koupeno…). Pro scraping cen **nepotřebujeme**, ale stránka to volá, takže když to vynecháme, vypadáme méně jako frontend. + +--- + +## 5. 
Sample merged record + +Po zavolání všech 5 endpointů a merge podle `productId`: + +```json +{ + "productId": 1407650, + "base": { ... pole z /products ... }, + "prices": { ... pole z /products/prices ... }, + "stock": { ... pole z /products/stock ... }, + "categories": { ... pole z /products/categories ... }, + "user_data": { ... pole z /products/user-data ... } +} +``` + +Reálná tabulka prvního leafu (Okurky, cukety a lilky → 17 produktů): + +``` +ID Skladem Cena Za jedn. Akce Název (balení) +1407650 ne 34.16 89.90/kg Čerstvě utrženo – Okurka hadovka (cca 380 g) +1354613 ano 31.87 109.90/kg Okurka polní 1 ks (cca 290 g) +1294911 ano 49.90 49.90/ks 44.91 -10 % BIO Okurka hadovka 1 ks (1 ks) +... +``` + +--- + +## 6. Číselníky / enumy které jsme viděli + +### Typ slevy (`sales[].type`) +- `"premium"` — Xtra members discount +- (`"sale"` — klasická akce, ne vlastní pozorování ale dle označení) + +### `badges[].type` (base) +- `"freshly-harvested"`, `"bio"`, `"low-price"`, ... + +### `maxBasketAmountReason` +- `"ALLOWED"` — normálně lze koupit +- `"AVAILABLE"` — vidíme když `inStock=false` (out of stock) + +### `flag` (base) — kód země původu +- `"cz"`, `"it"`, `"de"`, ... + +### `unit` (base) +- `"kg"`, `"l"`, `"ks"`, `"g"`, `"ml"`, ... + +### `categoryType` (listing) +- `"normal"` — běžné kategorie +- (existují i `"premium"`, `"recipes"` aj., nepoužíváme) + +--- + +## 7. Postup scrapingu (high level) + +``` +ensure_logged_in() + └─ načte auth_state.json NEBO se přihlásí přes API a uloží state + +get_category_tree() + └─ rekurzivně přes /navigation-tabs/categories + /subcategories + └─ vrátí strom uzlů {id, name, url, children} + +for each leaf in tree (without children): + page = 0 + while True: + ids = GET /api/v1/categories/normal/{leaf.id}/products?page={page}&size=50 + if not ids: break + all_ids += ids + if len(ids) < 50: break + page += 1 + + for chunk in chunks(all_ids, 30): + base = GET /api/v1/products?products=... 
+ prices = GET /api/v1/products/prices?products=... + stock = GET /api/v1/products/stock?products=... + categories = GET /api/v1/products/categories?products=... + merged = merge by productId + upsert to MongoDB +``` + +--- + +## 8. Důležité poznámky / gotchas + +- **Cloudflare**: vždy nejdřív otevřít homepage v Playwright contextu, pak teprve API. +- **Cookie consent**: pro pokud možno nenápadné chování přijmout cookies přes `UC_UI.acceptAllConsents()`. Uložený state ho už neukazuje. +- **Headers**: zatím nepotřebujeme posílat speciální `User-Agent` ani `X-...` — Playwright context cookies stačí. +- **Rate**: zatím netestováno. Stránka sama posílá 5 paralelních requestů per chunk + listing. Ne víc. +- **Velikost chunků**: 30 ID per batch nám prošlo bez problémů. URL délka by zvládla i víc, ale držme se toho, co reálně Chrome dělá. +- **Identita produktu**: `id` v base / `productId` v ostatních endpointech — totéž. Není garantována stálost ID napříč warehouses (ale `warehouseId=8799` je naše stabilní zóna). +- **Sklad-specifická data**: cena, dostupnost i `warehouseId` se odvíjejí od `zoneId` v session. Pokud měníme adresu, měníme i ceny → držet jednu doručovací adresu pro reprodukovatelnost. +- **Kategorie ne-listy**: hlavní kategorie zobrazují jen "Doporučujeme" (cca 5 produktů). Pro úplný katalog scrapovat **jen listy** stromu (uzly bez `children`). +- **Archived products**: `archived: true` znamená, že produkt už není v nabídce — uložit historicky, ale nemarkovat jako aktivní. + +--- + +## 9. 
Soubory v projektu + +| Soubor | Co dělá | +|--------|---------| +| `config.py` | Cesty + creds z `.env` | +| `test_login.py` | `ensure_logged_in()` — session reuse + API login + accept cookies | +| `scrape_categories.py` | Stáhne živý strom kategorií → `categories_live.json` | +| `scrape_first_leaf.py` | Demo: stáhne první leaf a vypíše produkty | +| `auth_state.json` | Cookies + localStorage (gitignored) | +| `categories_live.json` | Aktuální strom kategorií | +| `products_.json` | Demo dump produktů z jedné kategorie | +| `scraper.py` | (zastaralý) původní DOM scraping přes Playwright | +| `categories.py` | (zastaralý) hardcoded strom kategorií | +| `db.py` | MongoDB ops — bude potřeba upravit pro nový tvar dat | diff --git a/10PriceScraping/Rohlik/categories.py b/10PriceScraping/Rohlik/Trash/categories.py similarity index 100% rename from 10PriceScraping/Rohlik/categories.py rename to 10PriceScraping/Rohlik/Trash/categories.py diff --git a/10PriceScraping/Rohlik/scrape_categories.py b/10PriceScraping/Rohlik/Trash/scrape_categories.py similarity index 100% rename from 10PriceScraping/Rohlik/scrape_categories.py rename to 10PriceScraping/Rohlik/Trash/scrape_categories.py diff --git a/10PriceScraping/Rohlik/scrape_first_leaf.py b/10PriceScraping/Rohlik/Trash/scrape_first_leaf.py similarity index 100% rename from 10PriceScraping/Rohlik/scrape_first_leaf.py rename to 10PriceScraping/Rohlik/Trash/scrape_first_leaf.py diff --git a/10PriceScraping/Rohlik/Trash/test_db.py b/10PriceScraping/Rohlik/Trash/test_db.py new file mode 100644 index 0000000..40da2e3 --- /dev/null +++ b/10PriceScraping/Rohlik/Trash/test_db.py @@ -0,0 +1,93 @@ +""" +Test DB layer: load products_300102013.json (already scraped data) +and upsert into MongoDB 'rohlik' database. + +No scraping needed — just validates the db.py functions work +with real API response shapes. 
def main():
    """Smoke-test the db.py layer against previously scraped data.

    Loads the merged product records from ``DATA_FILE``, splits each record
    back into the four per-endpoint lists that ``upsert_products`` takes,
    writes them to MongoDB and prints collection counts plus one sample.

    Records without a usable ``base`` part are skipped, and missing
    ``prices`` / ``stock`` / ``categories`` parts are simply left out of
    their list. Appending empty ``{}`` placeholders instead would make
    ``upsert_products`` fail with ``KeyError`` when it indexes the lists by
    ``productId`` / ``id``.
    """
    db = get_db()
    print(f"Connected to: {db.client.address} / {db.name}")

    ensure_indexes(db)
    print("Indexes created.\n")

    # --- test category upsert ---
    upsert_category(db, {
        "_id": 300102013,
        "name": "Okurky, cukety a lilky",
        "slug": "okurky-cukety-a-lilky",
        "path": [300102000, 300102008, 300102013],
        "pathNames": ["Ovoce a zelenina", "Zelenina", "Okurky, cukety a lilky"],
        "parentId": 300102008,
        "isLeaf": True,
    })
    print("Category 300102013 upserted.")

    # --- load scraped products ---
    products = json.loads(DATA_FILE.read_text(encoding="utf-8"))
    print(f"Loaded {len(products)} products from {DATA_FILE.name}\n")

    # split merged records back into the 4 lists that upsert_products expects
    bases = []
    prices_list = []
    stocks = []
    categories_list = []
    skipped = 0

    for p in products:
        base = p.get("base") or {}
        if base.get("id") is None:
            # Without a product id there is nothing to key the upsert on.
            skipped += 1
            continue
        bases.append(base)

        # Only forward parts that can actually be matched by productId.
        for part, target in (
            (p.get("prices"), prices_list),
            (p.get("stock"), stocks),
            (p.get("categories"), categories_list),
        ):
            if isinstance(part, dict) and part.get("productId") is not None:
                target.append(part)

    upsert_products(db, bases, prices_list, stocks, categories_list)
    print(f"Upserted {len(bases)} products.\n")
    if skipped:
        print(f"Skipped {skipped} records without base data.\n")

    # --- verify ---
    n_products = db.products.count_documents({})
    n_history = db.price_history.count_documents({})
    n_cats = db.categories.count_documents({})

    print("DB counts:")
    print(f" products: {n_products}")
    print(f" price_history: {n_history}")
    print(f" categories: {n_cats}")

    # show one sample
    sample = db.products.find_one({"_id": 1407650})
    if sample:
        print(f"\nSample product: {sample['name']}")
        print(f" price: {sample['currentPrice']} {sample['currency']}")
        print(f" per unit: {sample['currentPricePerUnit']}/{sample.get('unit', '?')}")
        print(f" inStock: {sample['inStock']}")
        print(f" sale: {sample['sale']}")
        print(f" badges: {[b['title'] for b in sample.get('badges', [])]}")

    # show price_history entry
    hist = db.price_history.find_one({"productId": 1407650})
    if hist:
        print(f"\n price_history record: price={hist['price']}, "
              f"inStock={hist['inStock']}, scrapedAt={hist['scrapedAt']}")

    print("\nDone.")
def _money_amount(container: dict, key: str):
    """Return ``container[key]["amount"]``, or None if that money object is absent or null.

    ``dict.get(key, {})`` only covers a *missing* key; an explicit JSON null
    (the sample payloads show null for sibling fields such as
    ``originalPricePerUnit``) would otherwise raise ``AttributeError``.
    """
    money = container.get(key)
    return money.get("amount") if isinstance(money, dict) else None


def upsert_category(db, cat: dict):
    """Insert or update one category document, keyed by ``cat["_id"]``."""
    db.categories.update_one(
        {"_id": cat["_id"]},
        {"$set": cat},
        upsert=True,
    )


def upsert_categories(db, cats: list[dict]):
    """Upsert every category in *cats*, one ``update_one`` call per category."""
    for cat in cats:
        upsert_category(db, cat)


def upsert_product(db, base: dict, prices: dict, stock: dict, categories: list[dict]):
    """Upsert one product and append a matching ``price_history`` record.

    Args:
        db: database handle exposing ``products`` and ``price_history`` collections.
        base: record from ``/api/v1/products`` (must contain ``id`` and ``name``).
        prices: record from ``/api/v1/products/prices`` (must contain ``price.amount``).
        stock: record from ``/api/v1/products/stock``; may be empty.
        categories: category chain from ``/api/v1/products/categories``; may be empty.

    Raises:
        KeyError: if ``base`` lacks ``id``/``name`` or ``prices`` lacks ``price.amount``.
    """
    now = datetime.now(timezone.utc)
    product_id = base["id"]
    categories = categories or []

    # Only the first entry of ``sales`` is stored, as in the original code.
    sale = None
    sale_raw = prices.get("sales") or []
    if sale_raw:
        s = sale_raw[0]
        sale = {
            "type": s.get("type"),
            "price": _money_amount(s, "price"),
            "pricePerUnit": _money_amount(s, "pricePerUnit"),
            "badge": (s.get("badges") or [{}])[0].get("title"),
            "validTill": s.get("validTill"),
        }

    price_amount = prices["price"]["amount"]
    price_per_unit = _money_amount(prices, "pricePerUnit")
    in_stock = stock.get("inStock", False)
    package = stock.get("packageInfo") or {}  # null-safe: packageInfo may be null

    doc = {
        "name": base["name"],
        "slug": base.get("slug"),
        "brand": base.get("brand"),
        "unit": base.get("unit"),
        "textualAmount": base.get("textualAmount"),
        "weightedItem": base.get("weightedItem", False),
        "mainCategoryId": base.get("mainCategoryId"),
        "categoryPath": [c["id"] for c in categories],
        "allCategories": [
            {"id": c["id"], "name": c["name"], "level": c.get("level", 0)}
            for c in categories
        ],
        "countryCode": base.get("flag"),
        "images": base.get("images", []),
        "badges": base.get("badges", []),
        "archived": base.get("archived", False),
        "premiumOnly": base.get("premiumOnly", False),
        "currentPrice": price_amount,
        "currentPricePerUnit": price_per_unit,
        "currency": prices["price"].get("currency", "CZK"),
        "sale": sale,
        "inStock": in_stock,
        "maxBasketAmount": stock.get("maxBasketAmount", 0),
        "packageAmount": package.get("amount"),
        "packageUnit": package.get("unit"),
        "warehouseId": stock.get("warehouseId"),
        "lastSeen": now,
        "lastScrapedAt": now,
    }

    db.products.update_one(
        {"_id": product_id},
        {
            "$set": doc,
            # firstSeen is written only when the document is created.
            "$setOnInsert": {"firstSeen": now},
        },
        upsert=True,
    )

    db.price_history.insert_one({
        "productId": product_id,
        "scrapedAt": now,
        "price": price_amount,
        "pricePerUnit": price_per_unit,
        "inStock": in_stock,
        "sale": sale,
    })


def upsert_products(db, bases: list, prices_list: list, stocks: list, categories_list: list):
    """Join the per-endpoint batch responses by product id and upsert each product.

    Entries without a ``productId`` (or bases without an ``id``) are ignored
    instead of aborting the whole batch with ``KeyError``.
    """
    prices_map = {p["productId"]: p for p in prices_list if p.get("productId") is not None}
    stock_map = {s["productId"]: s for s in stocks if s.get("productId") is not None}
    cats_map = {
        c["productId"]: c.get("categories") or []
        for c in categories_list
        if c.get("productId") is not None
    }

    for base in bases:
        pid = base.get("id")
        if pid is None:
            continue
        upsert_product(
            db,
            base,
            # NOTE(review): a product with no price row is recorded with
            # price 0 (original behaviour) - confirm this sentinel is wanted
            # in price_history.
            prices_map.get(pid, {"price": {"amount": 0}}),
            stock_map.get(pid, {}),
            cats_map.get(pid, []),
        )


def log_scrape_run(db, run_data: dict):
    """Insert a scrape-run record, stamping ``startedAt`` when the caller did not.

    Note that *run_data* itself is modified: ``setdefault`` adds the
    timestamp to the caller's dict before it is inserted.
    """
    run_data.setdefault("startedAt", datetime.now(timezone.utc))
    db.scrape_runs.insert_one(run_data)
def get_json(context, url, **params):
    """GET *url* via ``context.request`` and return the decoded JSON body.

    Keyword arguments are sent as query parameters (``None`` is passed when
    there are none).

    Raises:
        RuntimeError: if the response status is anything other than 200.
    """
    resp = context.request.get(url, params=params or None)
    if resp.status != 200:
        raise RuntimeError(f"HTTP {resp.status}: {url[:120]}")
    return resp.json()


def _first_list(payload, keys):
    """Return *payload* if it is a list, else the first list found under *keys*, else []."""
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict):
        for key in keys:
            value = payload.get(key)
            if isinstance(value, list):
                return value
    return []


def as_list(payload):
    """Unwrap a batch-endpoint response into a list of records.

    Accepts a bare list or a dict wrapping the list under ``data``,
    ``products`` or ``items``; anything else yields an empty list.
    """
    return _first_list(payload, ("data", "products", "items"))


def pick(d, *keys):
    """Return the first non-None value among the given keys."""
    if not isinstance(d, dict):
        return None
    for k in keys:
        value = d.get(k)
        if value is not None:
            return value
    return None


# ---------------------------------------------------------------------------
# category tree - live from API
# ---------------------------------------------------------------------------

def normalize_main(payload):
    """Extract the list of main categories from the navigation response.

    Accepts a bare list, a dict holding the list under one of several known
    keys, or a dict nesting it one level deeper. Unexpected payloads (for
    example ``None`` or a string) yield an empty list instead of raising
    ``AttributeError``, matching the other unwrapping helpers.
    """
    if isinstance(payload, list):
        return payload
    if not isinstance(payload, dict):
        return []
    for key in ("data", "categories", "items", "navigationTabs", "tabs"):
        v = payload.get(key)
        if isinstance(v, list):
            return v
        if isinstance(v, dict):
            for k2 in ("categories", "items", "tabs"):
                if isinstance(v.get(k2), list):
                    return v[k2]
    return []


def subs_from_payload(payload):
    """Unwrap a subcategories response into a list of child categories.

    Accepts a bare list or a dict wrapping the list under ``data``,
    ``subcategories``, ``items`` or ``categories``; anything else yields [].
    """
    return _first_list(payload, ("data", "subcategories", "items", "categories"))
document.querySelectorAll('[data-test^="productCard-AVAILABLE-"]').forEach(card => { - const id = card.getAttribute('data-test').replace('productCard-AVAILABLE-', ''); - const nameEl = card.querySelector('[data-test="productCard-body-name"]'); - const priceNoEl = card.querySelector('[data-test="productCard-body-price-priceNo"]'); - const saleEl = card.querySelector('[data-test="productCard-body-price-sale"]'); - const amountEl = card.querySelector('[data-test="productCard-footer-amount"]'); - const unitPriceEl = card.querySelector('[data-test="productCard-footer-unitPrice"]'); - const badgeEl = card.querySelector('[data-test="productCard-body-badge"]'); - const imgEl = card.querySelector('img'); - const linkEl = card.querySelector('a[href*="/"]'); - - products.push({ - product_id: id, - name: nameEl?.textContent?.trim() || '', - price_raw: priceNoEl?.textContent?.trim() || '', - original_price_raw: saleEl?.textContent?.trim() || '', - amount: amountEl?.textContent?.trim() || '', - unit_price_raw: unitPriceEl?.textContent?.trim() || '', - discount_badge: badgeEl?.textContent?.trim() || '', - image_url: imgEl?.src || '', - product_url: linkEl?.getAttribute('href') || '', - }); - }); - return products; - } - """) - - results = [] - for p in products_data: - results.append({ - "product_id": p["product_id"], - "name": p["name"], - "price": parse_price(p["price_raw"]), - "original_price": parse_original_price(p["original_price_raw"]), - "discount_badge": p["discount_badge"] or None, - "amount": p["amount"] or None, - "unit_price": p["unit_price_raw"].strip() or None, - "image_url": p["image_url"] or None, - "product_url": f"{BASE_URL}{p['product_url']}" if p["product_url"] else None, - "category_id": category["id"], - "category_name": category["name"], - "category_path": " > ".join(category.get("path", [category["name"]])), - }) - return results - - -def scrape_leaf(page: Page, category: dict) -> list[dict]: - url = f"{BASE_URL}{category['url']}" - log.info("Scraping: %s 
(%s)", " > ".join(category.get("path", [category["name"]])), url) - - page.goto(url, wait_until="domcontentloaded", timeout=60000) - page.wait_for_timeout(3000) - - try: - page.wait_for_selector('[data-test^="productCard-AVAILABLE-"]', timeout=15000) - except Exception: - log.warning(" No products found in %s, skipping.", category["name"]) +def fetch_children_recursive(context, parent_id, visited, depth=1, max_depth=6): + if str(parent_id) in visited or depth > max_depth: return [] + visited.add(str(parent_id)) - total = scroll_to_load_all(page) - products = extract_products(page, category) - log.info(" %d products extracted (loaded %d)", len(products), total) - return products + sub_payload = get_json(context, SUBCATS_URL, categoryIds=str(parent_id)) + subs = subs_from_payload(sub_payload) + + out = [] + for s in subs: + if not isinstance(s, dict): + continue + sid = pick(s, "id", "categoryId") + node = { + "id": sid, + "name": pick(s, "name", "title", "label"), + "url": pick(s, "url", "link", "slug"), + "children": [], + } + if sid and s.get("subcategoryIds"): + node["children"] = fetch_children_recursive(context, sid, visited, depth + 1, max_depth) + out.append(node) + return out -def run_scraper( - category_filter: str | None = None, - headless: bool = True, - save_to_db: bool = True, -): - leaves = get_leaf_categories() - if category_filter: - category_filter_lower = category_filter.lower() - leaves = [c for c in leaves if category_filter_lower in " > ".join(c["path"]).lower()] +def fetch_category_tree(context): + """Fetch full category tree live from Rohlik API.""" + log.info("Fetching main categories ...") + main_payload = get_json(context, MAIN_CATS_URL) + main_cats = normalize_main(main_payload) + log.info(" %d main categories", len(main_cats)) - log.info("Will scrape %d leaf categories", len(leaves)) + tree = [] + visited = set() + + log.info("Fetching subcategories recursively ...") + for cat in main_cats: + cid = pick(cat, "id", "categoryId") + cname = 
pick(cat, "name", "title", "label") + curl = pick(cat, "url", "link", "slug") + if not cid: + continue + + children = fetch_children_recursive(context, cid, visited) + node = {"id": cid, "name": cname, "url": curl, "children": children} + tree.append(node) + + n_desc = count_nodes(children) + log.info(" - %s -> %d subcategories", cname, n_desc) + + total = count_nodes(tree) + log.info(" Total: %d categories (incl. main)", total) + return tree + + +def count_nodes(nodes): + total = len(nodes) + for n in nodes: + total += count_nodes(n.get("children", [])) + return total + + +def collect_leaves(nodes, path=None): + """Return flat list of leaf nodes with their full path.""" + if path is None: + path = [] + leaves = [] + for n in nodes: + current = path + [n["name"]] + children = n.get("children") or [] + if children: + leaves.extend(collect_leaves(children, current)) + else: + leaves.append({**n, "path": current}) + return leaves + + +def tree_to_db_docs(nodes, parent_id=None, path=None, path_names=None): + """Convert tree nodes to flat category docs for MongoDB.""" + if path is None: + path = [] + if path_names is None: + path_names = [] + docs = [] + for n in nodes: + cur_path = path + [n["id"]] + cur_names = path_names + [n["name"]] + children = n.get("children") or [] + docs.append({ + "_id": n["id"], + "name": n["name"], + "slug": (n.get("url") or "").lstrip("/"), + "path": cur_path, + "pathNames": cur_names, + "parentId": parent_id, + "isLeaf": len(children) == 0, + }) + if children: + docs.extend(tree_to_db_docs(children, n["id"], cur_path, cur_names)) + return docs + + +# --------------------------------------------------------------------------- +# product fetching +# --------------------------------------------------------------------------- + +def fetch_product_ids(context, category_id): + """Paginate through listing API, return all product IDs for a leaf.""" + all_ids = [] + page = 0 + while True: + url = 
(f"{BASE_URL}/api/v1/categories/normal/{category_id}/products" + f"?page={page}&size={PAGE_SIZE}&sort=recommended&filter=&excludeProductIds=") + data = get_json(context, url) + ids = data.get("productIds") or [] + all_ids.extend(ids) + if len(ids) < PAGE_SIZE: + break + page += 1 + return all_ids + + +def fetch_batch(context, endpoint, product_ids): + qs = "&".join(f"products={pid}" for pid in product_ids) + url = f"{BASE_URL}{endpoint}?{qs}" + return as_list(get_json(context, url)) + + +def fetch_product_details(context, product_ids): + """For a chunk of IDs, call 4 batch endpoints and return raw lists.""" + bases = fetch_batch(context, BATCH_ENDPOINTS["base"], product_ids) + prices = fetch_batch(context, BATCH_ENDPOINTS["prices"], product_ids) + stocks = fetch_batch(context, BATCH_ENDPOINTS["stock"], product_ids) + cats = fetch_batch(context, BATCH_ENDPOINTS["categories"], product_ids) + return bases, prices, stocks, cats + + +# --------------------------------------------------------------------------- +# console output +# --------------------------------------------------------------------------- + +def print_header(): + log.info("=" * 100) + log.info(" ROHLIK.CZ PRICE SCRAPER") + log.info("=" * 100) + + +def print_category_header(leaf, leaf_idx, total_leaves): + path_str = " > ".join(leaf["path"]) + log.info("") + log.info("-" * 100) + log.info(" [%d/%d] %s (id=%s)", leaf_idx, total_leaves, path_str, leaf["id"]) + log.info("-" * 100) + + +def print_products_table(bases, prices_list, stocks): + """Print a compact table of products in this chunk.""" + prices_map = {p["productId"]: p for p in prices_list} + stock_map = {s["productId"]: s for s in stocks} + + for b in bases: + pid = b["id"] + p = prices_map.get(pid, {}) + s = stock_map.get(pid, {}) + + name = b.get("name", "?")[:50] + price = p.get("price", {}).get("amount") + ppu = p.get("pricePerUnit", {}).get("amount") + unit = b.get("unit", "") + in_stock = s.get("inStock") + stock_str = "+" if in_stock else 
"-" if in_stock is False else "?" + + sale_str = "" + sales = p.get("sales") or [] + if sales: + sp = sales[0].get("price", {}).get("amount") + badge = (sales[0].get("badges") or [{}])[0].get("title", "") + if sp: + sale_str = f"{sp:.2f} {badge}" + + price_str = f"{price:.2f}" if isinstance(price, (int, float)) else "?" + ppu_str = f"{ppu:.2f}/{unit}" if isinstance(ppu, (int, float)) else "" + + log.info(" %s %9d %8s %12s %14s %s", + stock_str, pid, price_str, ppu_str, sale_str, name) + + +def print_summary(stats): + log.info("") + log.info("=" * 100) + log.info(" DONE") + log.info(" Categories: %d", stats["categories_scraped"]) + log.info(" Products: %d unique", stats["products_total"]) + log.info(" Duration: %.1f s", stats["duration_seconds"]) + if stats.get("errors"): + log.info(" Errors: %d", stats["errors"]) + log.info("=" * 100) + + +# --------------------------------------------------------------------------- +# main +# --------------------------------------------------------------------------- + +def run_scraper(category_filter=None, headless=True, save_to_db=True): + db = None + if save_to_db: + db = get_db() + ensure_indexes(db) with sync_playwright() as pw: - ctx_args = {} - if Path(AUTH_STATE_PATH).exists(): - ctx_args["storage_state"] = AUTH_STATE_PATH + context, page = ensure_logged_in(pw, headless=headless) - browser = pw.chromium.launch(headless=headless) - context = browser.new_context(**ctx_args) - page = context.new_page() + # fetch live category tree from API + tree = fetch_category_tree(context) - page.goto(BASE_URL, wait_until="domcontentloaded", timeout=60000) - page.wait_for_timeout(5000) + # filter to one main category if requested + if category_filter: + cf = category_filter.lower() + tree = [t for t in tree if cf in t["name"].lower()] + if not tree: + raise SystemExit(f"No main category matching '{category_filter}'") - is_logged_in = page.locator('text="Přihlásit se"').count() == 0 - if not is_logged_in: - if ROHLIK_EMAIL and 
ROHLIK_PASSWORD: - login(page) - context = browser.new_context(storage_state=AUTH_STATE_PATH) - page = context.new_page() - else: - log.warning("Not logged in! Prices may differ from member prices.") + leaves = collect_leaves(tree) + log.info("Scraping %d leaf categories", len(leaves)) + + # save categories to MongoDB + if db is not None: + cat_docs = tree_to_db_docs(tree) + upsert_categories(db, cat_docs) + log.info("Upserted %d category docs", len(cat_docs)) + + print_header() run_start = datetime.now(timezone.utc) - all_products = [] seen_ids = set() + total_products = 0 + errors = 0 - db = None - if save_to_db: - db = get_db() - ensure_indexes(db) - for cat_data in get_all_categories_flat(): - upsert_category(db, cat_data) + for i, leaf in enumerate(leaves, 1): + print_category_header(leaf, i, len(leaves)) - for leaf in leaves: try: - products = scrape_leaf(page, leaf) - for p in products: - if p["product_id"] not in seen_ids: - seen_ids.add(p["product_id"]) - all_products.append(p) - if db: - upsert_product(db, p) + product_ids = fetch_product_ids(context, leaf["id"]) + log.info(" %d product IDs", len(product_ids)) + + if not product_ids: + continue + + # deduplicate within run + new_ids = [pid for pid in product_ids if pid not in seen_ids] + seen_ids.update(product_ids) + + # process in chunks + for j in range(0, len(new_ids), CHUNK): + chunk = new_ids[j:j + CHUNK] + bases, prices, stocks, cats = fetch_product_details(context, chunk) + + print_products_table(bases, prices, stocks) + + if db is not None: + upsert_products(db, bases, prices, stocks, cats) + + total_products += len(bases) + except Exception: - log.exception("Error scraping %s", leaf["name"]) + log.exception(" ERROR in %s", leaf["name"]) + errors += 1 + + context.browser.close() run_end = datetime.now(timezone.utc) - run_data = { - "started_at": run_start, - "finished_at": run_end, + stats = { + "startedAt": run_start, + "finishedAt": run_end, "duration_seconds": (run_end - 
run_start).total_seconds(), "categories_scraped": len(leaves), - "products_scraped": len(all_products), + "products_total": total_products, + "errors": errors, + "filter": category_filter, } - if db: - log_scrape_run(db, run_data) + if db is not None: + log_scrape_run(db, stats) - log.info( - "Done: %d unique products from %d categories in %.1fs", - len(all_products), len(leaves), run_data["duration_seconds"], - ) - - browser.close() - - return all_products - - -def scrape_to_json(output_path: str = "products.json", **kwargs): - products = run_scraper(save_to_db=False, **kwargs) - with open(output_path, "w", encoding="utf-8") as f: - json.dump(products, f, ensure_ascii=False, indent=2, default=str) - log.info("Saved %d products to %s", len(products), output_path) - return products + print_summary(stats) + return stats if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser(description="Rohlik.cz price scraper") - parser.add_argument("--no-db", action="store_true", help="Save to JSON instead of MongoDB") - parser.add_argument("--visible", action="store_true", help="Run browser in visible mode") - parser.add_argument("--filter", type=str, help="Filter categories by name (e.g. 'Ovoce', 'Zelenina > Rajčata')") + parser = argparse.ArgumentParser(description="Rohlik.cz price scraper (API)") + parser.add_argument("--category", type=str, help="Scrape only this main category (e.g. 
'Ovoce a zelenina')") + parser.add_argument("--no-db", action="store_true", help="Dry run — no MongoDB writes") + parser.add_argument("--visible", action="store_true", help="Show browser window") args = parser.parse_args() - if args.no_db: - scrape_to_json(category_filter=args.filter, headless=not args.visible) - else: - run_scraper(category_filter=args.filter, headless=not args.visible) + run_scraper( + category_filter=args.category, + headless=not args.visible, + save_to_db=not args.no_db, + ) diff --git a/10PriceScraping/Rohlik/test_login.py b/10PriceScraping/Rohlik/test_login.py index c3adc37..006eb82 100644 --- a/10PriceScraping/Rohlik/test_login.py +++ b/10PriceScraping/Rohlik/test_login.py @@ -50,11 +50,11 @@ def api_login(context: BrowserContext) -> int: return resp.status -def ensure_logged_in(pw) -> tuple[BrowserContext, Page]: +def ensure_logged_in(pw, headless=False) -> tuple[BrowserContext, Page]: auth_path = Path(AUTH_STATE_PATH) have_state = auth_path.exists() - browser = pw.chromium.launch(headless=False, args=["--start-maximized"]) + browser = pw.chromium.launch(headless=headless, args=["--start-maximized"]) ctx_args = {"no_viewport": True} if have_state: ctx_args["storage_state"] = AUTH_STATE_PATH