Initial commit: Rohlik.cz price scraper

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-31 12:24:25 +02:00
commit 210311a7b3
14 changed files with 697 additions and 0 deletions
+4
View File
@@ -0,0 +1,4 @@
MONGO_URI=mongodb://192.168.1.76:27017
MONGO_DB=rohlik
ROHLIK_EMAIL=your@email.com
ROHLIK_PASSWORD=yourpassword
+6
View File
@@ -0,0 +1,6 @@
.venv/
.env
__pycache__/
*.pyc
.idea/
auth_state.json
+265
View File
@@ -0,0 +1,265 @@
"""
Rohlik.cz Price Scraper - Category Tree
Version: 1.0.0
Date: 2026-05-31
Complete hardcoded category tree for Rohlik.cz (14 main categories, about 140 leaf categories).
Only leaf categories (without children) contain the full product listing.
"""
# Hardcoded category hierarchy. Every node is a dict with "name" (display
# label), "id" (numeric category id kept as a string) and "url" (site-relative
# path of the form "/c<id>-<slug>"). A node that has sub-categories also
# carries a "children" list holding nodes of the same shape; nodes without
# "children" are the leaves returned by get_leaf_categories().
CATEGORY_TREE = [
{
"name": "Ovoce a zelenina", "id": "300102000", "url": "/c300102000-ovoce-a-zelenina",
"children": [
{"name": "Zelenina", "id": "300102008", "url": "/c300102008-zelenina", "children": [
{"name": "Okurky, cukety a lilky", "id": "300102013", "url": "/c300102013-okurky-cukety-a-lilky"},
{"name": "Mrkev a kořenová zelenina", "id": "300102012", "url": "/c300102012-mrkev-a-korenova-zelenina"},
{"name": "Cibule, česnek a pórek", "id": "300102010", "url": "/c300102010-cibule-cesnek-a-porek"},
{"name": "Papriky a chilli", "id": "300102015", "url": "/c300102015-papriky-a-chilli"},
{"name": "Saláty", "id": "300102021", "url": "/c300102021-salaty"},
{"name": "Rajčata", "id": "300102014", "url": "/c300102014-rajcata"},
{"name": "Brambory", "id": "300102009", "url": "/c300102009-brambory"},
{"name": "Ostatní zelenina", "id": "300102020", "url": "/c300102020-ostatni-zelenina"},
{"name": "Květák a košťálová zelenina", "id": "300102016", "url": "/c300102016-kvetak-a-kostalova-zelenina"},
{"name": "Dýně a tykve", "id": "300102011", "url": "/c300102011-dyne-a-tykve"},
{"name": "Kukuřice a lusky", "id": "300102017", "url": "/c300102017-kukurice-a-lusky"},
{"name": "Bio zelenina", "id": "300102040", "url": "/c300102040-bio-zelenina"},
{"name": "Baby zelenina", "id": "300102018", "url": "/c300102018-baby-zelenina"},
{"name": "Rychlá příprava a předvařená zelenina", "id": "300112157", "url": "/c300112157-rychla-priprava-a-predvarena-zelenina"},
]},
{"name": "Ovoce", "id": "300102001", "url": "/c300102001-ovoce"},
{"name": "Bylinky a microgreens", "id": "300102026", "url": "/c300102026-bylinky-a-microgreens"},
{"name": "Kuchařské boxy, hotové saláty a ready to cook", "id": "300124625", "url": "/c300124625-kucharske-boxy-hotove-salaty-a-ready-to-cook"},
{"name": "Houby", "id": "300102022", "url": "/c300102022-houby"},
{"name": "Květiny", "id": "300112201", "url": "/c300112201-kvetiny"},
{"name": "Čerstvě sklizeno", "id": "300120435", "url": "/c300120435-cerstve-sklizeno"},
{"name": "Český chřest", "id": "300114291", "url": "/c300114291-cesky-chrest"},
{"name": "BIO ovoce a zelenina", "id": "300114343", "url": "/c300114343-bio-ovoce-a-zelenina"},
{"name": "Kunratická stodola", "id": "300124164", "url": "/c300124164-kunraticka-stodola"},
{"name": "Speciality ze světa", "id": "300102038", "url": "/c300102038-speciality-ze-sveta"},
]
},
{
"name": "Mléčné a chlazené", "id": "300105000", "url": "/c300105000-mlecne-a-chlazene",
"children": [
{"name": "Sýry", "id": "300105026", "url": "/c300105026-syry"},
{"name": "Jogurty a mléčné dezerty", "id": "300105008", "url": "/c300105008-jogurty-a-mlecne-dezerty"},
{"name": "Vejce a droždí", "id": "300105053", "url": "/c300105053-vejce-a-drozdi"},
{"name": "Smetany, šlehačky a tvarohy", "id": "300105021", "url": "/c300105021-smetany-slehacky-a-tvarohy"},
{"name": "Máslo, tuky a margaríny", "id": "300105048", "url": "/c300105048-maslo-tuky-a-margariny"},
{"name": "Majonézy, tatarské omáčky a dresingy", "id": "300105058", "url": "/c300105058-majonezy-tatarske-omacky-a-dresingy"},
{"name": "Mléko a mléčné nápoje", "id": "300105001", "url": "/c300105001-mleko-a-mlecne-napoje"},
{"name": "Bez laktózy, A2 a High protein", "id": "300121231", "url": "/c300121231-bez-laktozy-a2-a-high-protein"},
]
},
{
"name": "Maso a ryby", "id": "300103000", "url": "/c300103000-maso-a-ryby",
"children": [
{"name": "Drůbež", "id": "300115247", "url": "/c300115247-drubez"},
{"name": "Hovězí a telecí", "id": "300117217", "url": "/c300117217-hovezi-a-teleci"},
{"name": "Vepřové", "id": "300103009", "url": "/c300103009-veprove"},
{"name": "Ryby a mořské plody", "id": "300117385", "url": "/c300117385-ryby-a-morske-plody"},
{"name": "BIO maso a ryby", "id": "300121424", "url": "/c300121424-bio-maso-a-ryby"},
{"name": "Maso na gril, steaky a burgery", "id": "300117355", "url": "/c300117355-maso-na-gril-steaky-a-burgery"},
{"name": "Zvěřina, jehněčí, králičí a speciality", "id": "300122988", "url": "/c300122988-zverina-jehneci-kralici-a-speciality"},
]
},
{
"name": "Pekárna a cukrárna", "id": "300101000", "url": "/c300101000-pekarna-a-cukrarna",
"children": [
{"name": "Slané pečivo", "id": "300101012", "url": "/c300101012-slane-pecivo"},
{"name": "Chléb", "id": "300101007", "url": "/c300101007-chleb"},
{"name": "Sladké pečivo", "id": "300101024", "url": "/c300101024-sladke-pecivo"},
{"name": "Dorty a zákusky", "id": "300101033", "url": "/c300101033-dorty-a-zakusky"},
{"name": "Racio a Knäckebrot", "id": "300101043", "url": "/c300101043-racio-a-knaeckebrot"},
{"name": "Tortilly a pita chleby", "id": "300101019", "url": "/c300101019-tortilly-a-pita-chleby"},
{"name": "Pečicí směsi a strouhanka", "id": "300101049", "url": "/c300101049-pecici-smesi-a-strouhanka"},
{"name": "Bezlepková pekárna a cukrárna", "id": "300118595", "url": "/c300118595-bezlepkova-pekarna-a-cukrarna"},
]
},
{
"name": "Uzeniny a lahůdky", "id": "300104000", "url": "/c300104000-uzeniny-a-lahudky",
"children": [
{"name": "Šunky a slaniny", "id": "300104001", "url": "/c300104001-sunky-a-slaniny"},
{"name": "Párky, klobásy a špekáčky", "id": "300104012", "url": "/c300104012-parky-klobasy-a-spekacky"},
{"name": "Salámy", "id": "300104007", "url": "/c300104007-salamy"},
{"name": "Hotová jídla a přílohy", "id": "300104049", "url": "/c300104049-hotova-jidla-a-prilohy"},
{"name": "Lahůdky", "id": "300104039", "url": "/c300104039-lahudky"},
{"name": "Paštiky a masné výrobky", "id": "300104016", "url": "/c300104016-pastiky-a-masne-vyrobky"},
{"name": "Saláty, pomazánky a pesta", "id": "300104032", "url": "/c300104032-salaty-pomazanky-a-pesta"},
{"name": "Zabijačkové speciality", "id": "300104020", "url": "/c300104020-zabijackove-speciality"},
{"name": "Dárkové koše a kazety", "id": "300121878", "url": "/c300121878-darkove-kose-a-kazety"},
]
},
{
"name": "Mražené", "id": "300107000", "url": "/c300107000-mrazene",
"children": [
{"name": "Pizza", "id": "300107020", "url": "/c300107020-pizza"},
{"name": "Zmrzlina a led", "id": "300107035", "url": "/c300107035-zmrzlina-a-led"},
{"name": "Ovoce a zelenina", "id": "300122993", "url": "/c300122993-ovoce-a-zelenina"},
{"name": "Bramborové výrobky", "id": "300107024", "url": "/c300107024-bramborove-vyrobky"},
{"name": "Hotová jídla", "id": "300107028", "url": "/c300107028-hotova-jidla"},
{"name": "Pečivo, dezerty a dorty", "id": "300107041", "url": "/c300107041-pecivo-dezerty-a-dorty"},
{"name": "Ryby a mořské plody", "id": "300107014", "url": "/c300107014-ryby-a-morske-plody"},
{"name": "Maso a drůbež", "id": "300107010", "url": "/c300107010-maso-a-drubez"},
]
},
{
"name": "Grilování", "id": "300117503", "url": "/c300117503-grilovani",
"children": [
{"name": "Rohlík doporučuje", "id": "300124865", "url": "/c300124865-rohlik-doporucuje"},
{"name": "Maso a ryby", "id": "300117505", "url": "/c300117505-maso-a-ryby"},
{"name": "Klobásy a slanina", "id": "300124497", "url": "/c300124497-klobasy-a-slanina"},
{"name": "Sýry", "id": "300121109", "url": "/c300121109-syry"},
{"name": "Pekárna", "id": "300117581", "url": "/c300117581-pekarna"},
{"name": "Zelenina, houby a bylinky", "id": "300124458", "url": "/c300124458-zelenina-houby-a-bylinky"},
{"name": "Sterilovaná zelenina a olivy", "id": "300124864", "url": "/c300124864-sterilovana-zelenina-a-olivy"},
{"name": "Plant based", "id": "300122070", "url": "/c300122070-plant-based"},
{"name": "Omáčky a koření", "id": "300117565", "url": "/c300117565-omacky-a-koreni"},
{"name": "Nápoje", "id": "300124465", "url": "/c300124465-napoje"},
{"name": "Příslušenství na grilování", "id": "300124489", "url": "/c300124489-prislusenstvi-na-grilovani"},
]
},
{
"name": "Plant Based", "id": "300121429", "url": "/c300121429-plant-based",
"children": [
{"name": "Trvanlivé", "id": "300121448", "url": "/c300121448-trvanlive"},
{"name": "Nápoje", "id": "300121430", "url": "/c300121430-napoje"},
{"name": "Alternativy mléčného a chlazeného", "id": "300121486", "url": "/c300121486-alternativy-mlecneho-a-chlazeneho"},
{"name": "Alternativy uzenin a lahůdek", "id": "300121493", "url": "/c300121493-alternativy-uzenin-a-lahudek"},
{"name": "Pekárna a cukrárna", "id": "300121491", "url": "/c300121491-pekarna-a-cukrarna"},
{"name": "Mražené", "id": "300121492", "url": "/c300121492-mrazene"},
{"name": "Bezmasé a nerybí výrobky", "id": "300121488", "url": "/c300121488-bezmase-a-nerybi-vyrobky"},
{"name": "Ovoce a zelenina", "id": "300121436", "url": "/c300121436-ovoce-a-zelenina"},
{"name": "Dítě", "id": "300121470", "url": "/c300121470-dite"},
{"name": "Drogerie a kosmetika", "id": "300121621", "url": "/c300121621-drogerie-a-kosmetika"},
]
},
{
"name": "Trvanlivé", "id": "300106000", "url": "/c300106000-trvanlive",
"children": [
{"name": "Sladkosti", "id": "300106128", "url": "/c300106128-sladkosti"},
{"name": "Přísady na pečení", "id": "300106001", "url": "/c300106001-prisady-na-peceni"},
{"name": "Konzervy, hotovky a instantní pokrmy", "id": "300123466", "url": "/c300123466-konzervy-hotovky-a-instantni-pokrmy"},
{"name": "Slané, snacky a ořechy", "id": "300123392", "url": "/c300123392-slane-snacky-a-orechy"},
{"name": "Koření a ochucovadla", "id": "300106031", "url": "/c300106031-koreni-a-ochucovadla"},
{"name": "Rýže, těstoviny, luštěniny, soja", "id": "300106096", "url": "/c300106096-ryze-testoviny-lusteniny-soja"},
{"name": "Džemy, medy, oříškové krémy, sladké pomazánky a sirupy", "id": "300106124", "url": "/c300106124-dzemy-medy-oriskove-kremy-sladke-pomazanky-a-sirupy"},
{"name": "Kečupy, hořčice a omáčky", "id": "300123436", "url": "/c300123436-kecupy-horcice-a-omacky"},
{"name": "Oleje a octy", "id": "300123421", "url": "/c300123421-oleje-a-octy"},
{"name": "Müsli, cereálie, kaše a tyčinky", "id": "300106117", "url": "/c300106117-muesli-cerealie-kase-a-tycinky"},
]
},
{
"name": "Nápoje", "id": "300108000", "url": "/c300108000-napoje",
"children": [
{"name": "Vody a minerálky", "id": "300122110", "url": "/c300122110-vody-a-mineralky"},
{"name": "Limonády a energy", "id": "300122167", "url": "/c300122167-limonady-a-energy"},
{"name": "Piva a cidery", "id": "300108052", "url": "/c300108052-piva-a-cidery"},
{"name": "Džusy, ovocné nápoje a sirupy", "id": "300122136", "url": "/c300122136-dzusy-ovocne-napoje-a-sirupy"},
{"name": "Víno", "id": "300108064", "url": "/c300108064-vino"},
{"name": "Horké nápoje", "id": "300123000", "url": "/c300123000-horke-napoje"},
{"name": "Lihoviny", "id": "300108072", "url": "/c300108072-lihoviny"},
{"name": "Vozíme chlazené", "id": "300124505", "url": "/c300124505-vozime-chlazene"},
{"name": "Dětské nápoje", "id": "300122626", "url": "/c300122626-detske-napoje"},
]
},
{
"name": "Speciální výživa", "id": "300112393", "url": "/c300112393-specialni-vyziva",
"children": [
{"name": "Bezlaktózové výrobky", "id": "300112395", "url": "/c300112395-bezlaktozove-vyrobky"},
{"name": "Fitness výživa", "id": "300115595", "url": "/c300115595-fitness-vyziva"},
{"name": "Bezlepkové výrobky", "id": "300112399", "url": "/c300112399-bezlepkove-vyrobky"},
{"name": "Potraviny se sníženým obsahem cukru", "id": "300112403", "url": "/c300112403-potraviny-se-snizenym-obsahem-cukru"},
{"name": "Veggie", "id": "300112843", "url": "/c300112843-veggie"},
{"name": "Alternativní strava", "id": "300124517", "url": "/c300124517-alternativni-strava"},
{"name": "Doplňky stravy", "id": "300115585", "url": "/c300115585-doplnky-stravy"},
{"name": "Nízkobílkovinové výrobky", "id": "300122547", "url": "/c300122547-nizkobilkovinove-vyrobky"},
]
},
{
"name": "Kosmetika", "id": "300124206", "url": "/c300124206-kosmetika",
"children": [
{"name": "Vlasová péče", "id": "300124300", "url": "/c300124300-vlasova-pece"},
{"name": "Ústní hygiena", "id": "300124215", "url": "/c300124215-ustni-hygiena"},
{"name": "Pleťová péče a dekorativní kosmetika", "id": "300124327", "url": "/c300124327-pletova-pece-a-dekorativni-kosmetika"},
{"name": "Dámská hygiena", "id": "300124245", "url": "/c300124245-damska-hygiena"},
{"name": "Sprchové gely a mýdla a přísady do koupele", "id": "300124259", "url": "/c300124259-sprchove-gely-a-mydla-a-prisady-do-koupele"},
{"name": "Tělová péče", "id": "300124320", "url": "/c300124320-telova-pece"},
{"name": "Holení a depilace", "id": "300124337", "url": "/c300124337-holeni-a-depilace"},
{"name": "Deodoranty a parfémy", "id": "300124281", "url": "/c300124281-deodoranty-a-parfemy"},
{"name": "Pánský svět", "id": "300124350", "url": "/c300124350-pansky-svet"},
{"name": "Pro intimní chvíle", "id": "300124361", "url": "/c300124361-pro-intimni-chvile"},
{"name": "Kosmetické dárkové balíčky", "id": "300124366", "url": "/c300124366-kosmeticke-darkove-balicky"},
{"name": "Cestovní balení a produkty na léto", "id": "300124367", "url": "/c300124367-cestovni-baleni-a-produkty-na-leto"},
]
},
{
"name": "Drogerie", "id": "300109000", "url": "/c300109000-drogerie",
"children": [
{"name": "Papírová a vatová hygiena", "id": "300109010", "url": "/c300109010-papirova-a-vatova-hygiena"},
{"name": "Prací prostředky", "id": "300109001", "url": "/c300109001-praci-prostredky"},
{"name": "Mytí nádobí", "id": "300109042", "url": "/c300109042-myti-nadobi"},
{"name": "Čisticí prostředky", "id": "300109028", "url": "/c300109028-cistici-prostredky"},
{"name": "Ekologická a šetrná drogerie", "id": "300124564", "url": "/c300124564-ekologicka-a-setrna-drogerie"},
{"name": "Dětské praní", "id": "300124266", "url": "/c300124266-detske-prani"},
{"name": "Velká balení", "id": "300124287", "url": "/c300124287-velka-baleni"},
]
},
{
"name": "Domácnost a zahrada", "id": "300111000", "url": "/c300111000-domacnost-a-zahrada",
"children": [
{"name": "Úklidové potřeby", "id": "300124615", "url": "/c300124615-uklidove-potreby"},
{"name": "Dům, byt a garáž", "id": "300123083", "url": "/c300123083-dum-byt-a-garaz"},
{"name": "Kuchyňské potřeby", "id": "300124614", "url": "/c300124614-kuchynske-potreby"},
{"name": "Zahrada a květiny", "id": "300111043", "url": "/c300111043-zahrada-a-kvetiny"},
{"name": "Kancelář a papírnictví", "id": "300111034", "url": "/c300111034-kancelar-a-papirnictvi"},
{"name": "Dekorace, svíčky a vůně", "id": "300124685", "url": "/c300124685-dekorace-svicky-a-vune"},
{"name": "Trafika", "id": "300111091", "url": "/c300111091-trafika"},
{"name": "Oslavy a párty", "id": "300124678", "url": "/c300124678-oslavy-a-party"},
{"name": "Punčocháče a ponožky", "id": "300124722", "url": "/c300124722-puncochace-a-ponozky"},
{"name": "Knihy", "id": "300124157", "url": "/c300124157-knihy"},
]
},
]
def get_leaf_categories(tree: list[dict] | None = None, parent_path: list[str] | None = None) -> list[dict]:
if tree is None:
tree = CATEGORY_TREE
if parent_path is None:
parent_path = []
leaves = []
for cat in tree:
current_path = parent_path + [cat["name"]]
children = cat.get("children")
if children:
leaves.extend(get_leaf_categories(children, current_path))
else:
leaves.append({
"id": cat["id"],
"name": cat["name"],
"url": cat["url"],
"path": current_path,
"parent_id": tree[0].get("id") if parent_path else None,
})
return leaves
def get_all_categories_flat(tree: list[dict] | None = None, parent_id: str | None = None) -> list[dict]:
if tree is None:
tree = CATEGORY_TREE
result = []
for cat in tree:
result.append({
"category_id": cat["id"],
"name": cat["name"],
"url": cat["url"],
"parent_id": parent_id,
"has_children": bool(cat.get("children")),
})
if cat.get("children"):
result.extend(get_all_categories_flat(cat["children"], parent_id=cat["id"]))
return result
+27
View File
@@ -0,0 +1,27 @@
"""
Rohlik.cz Price Scraper - Configuration
Version: 1.0.0
Date: 2026-05-31
Central configuration for the Rohlik.cz price scraper.
Loads environment variables from .env file for credentials and MongoDB connection.
Defines scraping parameters (scroll behavior, timeouts) and URL constants.
"""
import os
from dotenv import load_dotenv
# Load variables from a .env file into the environment before they are read below.
load_dotenv()
# MongoDB connection string and database name; each falls back to the default shown.
MONGO_URI = os.getenv("MONGO_URI", "mongodb://192.168.1.76:27017")
MONGO_DB = os.getenv("MONGO_DB", "rohlik")
# Rohlik.cz account credentials; an empty string when the variable is not set.
ROHLIK_EMAIL = os.getenv("ROHLIK_EMAIL", "")
ROHLIK_PASSWORD = os.getenv("ROHLIK_PASSWORD", "")
# Site root; the scraper appends category URLs to it.
BASE_URL = "https://www.rohlik.cz"
# File the scraper uses to save and reload the browser's session state.
AUTH_STATE_PATH = "auth_state.json"
# Pause after each scroll step, in seconds (the scraper converts it to milliseconds).
SCROLL_PAUSE = 1.5
# Upper bound on scroll steps per category page.
MAX_SCROLLS = 50
+88
View File
@@ -0,0 +1,88 @@
"""
Rohlik.cz Price Scraper - Database Operations
Version: 1.0.0
Date: 2026-05-31
MongoDB operations for the Rohlik.cz price scraper.
Collections: products, price_history, categories, scrape_runs.
MongoDB server: 192.168.1.76 (no authentication).
"""
from datetime import datetime, timezone
from pymongo import MongoClient, ASCENDING
from config import MONGO_URI, MONGO_DB
def get_db():
    """Return a handle to the configured MongoDB database.

    A new ``MongoClient`` is created from ``MONGO_URI`` on every call and the
    database named by ``MONGO_DB`` is looked up on it.
    """
    return MongoClient(MONGO_URI)[MONGO_DB]
def ensure_indexes(db):
    """Create the indexes used by the scraper's four collections.

    ``products.product_id`` and ``categories.category_id`` are unique; all
    other indexes are plain ascending indexes.
    """
    products = db.products
    products.create_index([("product_id", ASCENDING)], unique=True)
    for field in ("category_id", "name"):
        products.create_index([(field, ASCENDING)])

    history = db.price_history
    # Compound index for per-product time series, plus one on the timestamp alone.
    history.create_index([("product_id", ASCENDING), ("scraped_at", ASCENDING)])
    history.create_index([("scraped_at", ASCENDING)])

    db.categories.create_index([("category_id", ASCENDING)], unique=True)
    db.scrape_runs.create_index([("started_at", ASCENDING)])
def upsert_product(db, product: dict):
    """Store the latest product details and append one price-history entry.

    The ``products`` document keyed by ``product_id`` is created or updated
    (``created_at`` is written only on insert), then a new document is
    inserted into ``price_history``.

    Args:
        db: Database handle exposing ``products`` and ``price_history``.
        product: Scraped product; ``product_id``, ``name`` and ``price`` are
            required keys, every other field is optional and stored as
            ``None`` when absent.
    """
    timestamp = datetime.now(timezone.utc)
    pid = product["product_id"]

    optional_fields = (
        "category_id",
        "category_name",
        "amount",
        "unit_price",
        "image_url",
        "product_url",
        "category_path",
    )
    details = {"name": product["name"]}
    details.update((field, product.get(field)) for field in optional_fields)
    details["updated_at"] = timestamp

    db.products.update_one(
        {"product_id": pid},
        {"$set": details, "$setOnInsert": {"created_at": timestamp}},
        upsert=True,
    )

    # Every call adds a history row, so prices can be compared across runs.
    history_entry = {
        "product_id": pid,
        "price": product["price"],
        "original_price": product.get("original_price"),
        "discount_badge": product.get("discount_badge"),
        "unit_price": product.get("unit_price"),
        "scraped_at": timestamp,
    }
    db.price_history.insert_one(history_entry)
def upsert_category(db, category: dict):
    """Create or update one document in the ``categories`` collection.

    Args:
        db: Database handle exposing ``categories``.
        category: Record with ``category_id``, ``name`` and ``url``;
            ``parent_id`` defaults to ``None`` and ``has_children`` to
            ``False`` when missing.
    """
    timestamp = datetime.now(timezone.utc)
    selector = {"category_id": category["category_id"]}
    changes = {
        "$set": {
            "name": category["name"],
            "url": category["url"],
            "parent_id": category.get("parent_id"),
            "has_children": category.get("has_children", False),
            "updated_at": timestamp,
        },
        # created_at is only written when the document is first inserted.
        "$setOnInsert": {"created_at": timestamp},
    }
    db.categories.update_one(selector, changes, upsert=True)
def log_scrape_run(db, run_data: dict):
    """Insert one run-summary document into the ``scrape_runs`` collection."""
    runs = db.scrape_runs
    runs.insert_one(run_data)
+3
View File
@@ -0,0 +1,3 @@
playwright==1.52.0
pymongo==4.12.1
python-dotenv==1.1.0
+254
View File
@@ -0,0 +1,254 @@
"""
Rohlik.cz Price Scraper - Main Scraper
Version: 1.0.0
Date: 2026-05-31
Playwright-based scraper that iterates all leaf categories on Rohlik.cz,
scrolls to lazy-load every product card, and extracts pricing data from the DOM.
Supports authenticated scraping (prices differ for logged-in users).
Usage:
python scraper.py --no-db --visible # scrape to JSON, visible browser
python scraper.py --no-db --filter "Brambory" # scrape single category to JSON
python scraper.py # scrape to MongoDB
python scraper.py --visible # scrape to MongoDB, visible browser
"""
import re
import json
import logging
from datetime import datetime, timezone
from pathlib import Path
from playwright.sync_api import sync_playwright, Page
from config import (
BASE_URL, AUTH_STATE_PATH,
ROHLIK_EMAIL, ROHLIK_PASSWORD,
SCROLL_PAUSE, MAX_SCROLLS,
)
from categories import get_leaf_categories, get_all_categories_flat
from db import get_db, ensure_indexes, upsert_product, upsert_category, log_scrape_run
# Configure logging at import time: INFO level, lines show timestamp, level and message.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
# Module-level logger used by the functions in this file.
log = logging.getLogger(__name__)
def parse_price(raw: str | None) -> float | None:
if not raw:
return None
digits = re.sub(r"[^\d]", "", raw)
if not digits:
return None
return int(digits) / 100
def parse_original_price(raw: str | None) -> float | None:
if not raw:
return None
match = re.search(r"([\d\s]+[,.][\d]+)", raw.replace("\xa0", " "))
if match:
return float(match.group(1).replace(" ", "").replace(",", "."))
digits = re.sub(r"[^\d]", "", raw)
if digits:
return float(digits) / 100
return None
def login(page: Page):
    """Sign in through the site's login form and save the session state.

    Opens the home page, clicks the "Přihlásit se" link, submits the
    configured e-mail and password, then writes the browser context's storage
    state to ``AUTH_STATE_PATH``. Each step is followed by a fixed wait. The
    outcome is not checked: the success message is logged unconditionally.
    """
    log.info("Logging in to Rohlik.cz...")
    page.goto(BASE_URL, wait_until="domcontentloaded", timeout=60000)
    page.wait_for_timeout(3000)

    login_link = page.locator('text="Přihlásit se"').first
    login_link.click()
    page.wait_for_timeout(2000)

    email_box = page.locator('input[type="email"], input[name="email"]').first
    email_box.fill(ROHLIK_EMAIL)
    password_box = page.locator('input[type="password"], input[name="password"]').first
    password_box.fill(ROHLIK_PASSWORD)
    submit_button = page.locator('button[type="submit"]').first
    submit_button.click()
    page.wait_for_timeout(5000)

    page.context.storage_state(path=AUTH_STATE_PATH)
    log.info("Login successful, auth state saved.")
def scroll_to_load_all(page: Page) -> int:
    """Scroll to the bottom repeatedly until no new product cards appear.

    At most ``MAX_SCROLLS`` scroll steps are made, each followed by a pause of
    ``SCROLL_PAUSE`` seconds. From the fourth step on, scrolling stops as soon
    as the number of available-product cards no longer changes.

    Returns:
        The number of available-product cards counted on the page.
    """
    card_selector = '[data-test^="productCard-AVAILABLE-"]'
    loaded = 0
    attempt = 0
    while attempt < MAX_SCROLLS:
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        page.wait_for_timeout(int(SCROLL_PAUSE * 1000))
        visible_now = page.locator(card_selector).count()
        # The first three steps always continue, even if the count is unchanged.
        if attempt > 2 and visible_now == loaded:
            break
        loaded = visible_now
        attempt += 1
    return loaded
def extract_products(page: Page, category: dict) -> list[dict]:
    """Read every available product card from the page and normalise it.

    A JavaScript snippet run in the page collects the raw text of each card;
    prices are then parsed in Python and empty strings become ``None``.

    Args:
        page: Page currently showing a category listing.
        category: Leaf category being scraped; supplies ``id``, ``name`` and
            optionally ``path`` for the returned records.

    Returns:
        One dict per product card found in the DOM.
    """
    raw_cards = page.evaluate("""
() => {
const products = [];
document.querySelectorAll('[data-test^="productCard-AVAILABLE-"]').forEach(card => {
const id = card.getAttribute('data-test').replace('productCard-AVAILABLE-', '');
const nameEl = card.querySelector('[data-test="productCard-body-name"]');
const priceNoEl = card.querySelector('[data-test="productCard-body-price-priceNo"]');
const saleEl = card.querySelector('[data-test="productCard-body-price-sale"]');
const amountEl = card.querySelector('[data-test="productCard-footer-amount"]');
const unitPriceEl = card.querySelector('[data-test="productCard-footer-unitPrice"]');
const badgeEl = card.querySelector('[data-test="productCard-body-badge"]');
const imgEl = card.querySelector('img');
const linkEl = card.querySelector('a[href*="/"]');
products.push({
product_id: id,
name: nameEl?.textContent?.trim() || '',
price_raw: priceNoEl?.textContent?.trim() || '',
original_price_raw: saleEl?.textContent?.trim() || '',
amount: amountEl?.textContent?.trim() || '',
unit_price_raw: unitPriceEl?.textContent?.trim() || '',
discount_badge: badgeEl?.textContent?.trim() || '',
image_url: imgEl?.src || '',
product_url: linkEl?.getAttribute('href') || '',
});
});
return products;
}
""")

    def to_record(card: dict) -> dict:
        # Empty strings coming from the DOM are stored as None.
        href = card["product_url"]
        return {
            "product_id": card["product_id"],
            "name": card["name"],
            "price": parse_price(card["price_raw"]),
            "original_price": parse_original_price(card["original_price_raw"]),
            "discount_badge": card["discount_badge"] or None,
            "amount": card["amount"] or None,
            "unit_price": card["unit_price_raw"].strip() or None,
            "image_url": card["image_url"] or None,
            "product_url": f"{BASE_URL}{href}" if href else None,
            "category_id": category["id"],
            "category_name": category["name"],
            "category_path": " > ".join(category.get("path", [category["name"]])),
        }

    return [to_record(card) for card in raw_cards]
def scrape_leaf(page: Page, category: dict) -> list[dict]:
    """Open one leaf category page and return all products listed on it.

    Returns an empty list (after logging a warning) when no available-product
    card shows up within 15 seconds of loading the page.
    """
    target = f"{BASE_URL}{category['url']}"
    breadcrumb = " > ".join(category.get("path", [category["name"]]))
    log.info("Scraping: %s (%s)", breadcrumb, target)

    page.goto(target, wait_until="domcontentloaded", timeout=60000)
    page.wait_for_timeout(3000)
    try:
        page.wait_for_selector('[data-test^="productCard-AVAILABLE-"]', timeout=15000)
    except Exception:
        log.warning(" No products found in %s, skipping.", category["name"])
        return []

    loaded = scroll_to_load_all(page)
    extracted = extract_products(page, category)
    log.info(" %d products extracted (loaded %d)", len(extracted), loaded)
    return extracted
def run_scraper(
    category_filter: str | None = None,
    headless: bool = True,
    save_to_db: bool = True,
):
    """Scrape all (or the filtered) leaf categories and optionally store them.

    Args:
        category_filter: Case-insensitive substring matched against each
            leaf's full path joined with " > "; ``None`` scrapes every leaf.
        headless: Run the browser without a visible window.
        save_to_db: Write categories, products, price history and a run
            summary to MongoDB.

    Returns:
        The list of unique products (first occurrence per ``product_id``)
        collected across the scraped categories.
    """
    leaves = get_leaf_categories()
    if category_filter:
        category_filter_lower = category_filter.lower()
        leaves = [c for c in leaves if category_filter_lower in " > ".join(c["path"]).lower()]
    log.info("Will scrape %d leaf categories", len(leaves))

    with sync_playwright() as pw:
        ctx_args = {}
        if Path(AUTH_STATE_PATH).exists():
            # Reuse the session saved by a previous login.
            ctx_args["storage_state"] = AUTH_STATE_PATH
        browser = pw.chromium.launch(headless=headless)
        try:
            context = browser.new_context(**ctx_args)
            page = context.new_page()
            page.goto(BASE_URL, wait_until="domcontentloaded", timeout=60000)
            page.wait_for_timeout(5000)

            # The login link is only present for anonymous visitors.
            is_logged_in = page.locator('text="Přihlásit se"').count() == 0
            if not is_logged_in:
                if ROHLIK_EMAIL and ROHLIK_PASSWORD:
                    login(page)
                    # login() saved the session to disk; release the old
                    # context instead of keeping it open for the whole run.
                    context.close()
                    context = browser.new_context(storage_state=AUTH_STATE_PATH)
                    page = context.new_page()
                else:
                    log.warning("Not logged in! Prices may differ from member prices.")

            run_start = datetime.now(timezone.utc)
            all_products = []
            seen_ids = set()

            db = None
            if save_to_db:
                db = get_db()
                ensure_indexes(db)
                for cat_data in get_all_categories_flat():
                    upsert_category(db, cat_data)

            for leaf in leaves:
                try:
                    products = scrape_leaf(page, leaf)
                    for p in products:
                        # A product listed in several categories is kept once.
                        if p["product_id"] not in seen_ids:
                            seen_ids.add(p["product_id"])
                            all_products.append(p)
                            # PyMongo Database objects raise NotImplementedError
                            # on truth testing, so compare with None explicitly.
                            if db is not None:
                                upsert_product(db, p)
                except Exception:
                    # One failing category must not abort the whole run.
                    log.exception("Error scraping %s", leaf["name"])

            run_end = datetime.now(timezone.utc)
            run_data = {
                "started_at": run_start,
                "finished_at": run_end,
                "duration_seconds": (run_end - run_start).total_seconds(),
                "categories_scraped": len(leaves),
                "products_scraped": len(all_products),
            }
            if db is not None:
                log_scrape_run(db, run_data)
            log.info(
                "Done: %d unique products from %d categories in %.1fs",
                len(all_products), len(leaves), run_data["duration_seconds"],
            )
        finally:
            # Close the browser even when the run aborts with an error.
            browser.close()
    return all_products
def scrape_to_json(output_path: str = "products.json", **kwargs):
    """Run the scraper without MongoDB and dump the products to a JSON file.

    Args:
        output_path: Destination file, written as UTF-8 with 2-space indent.
        **kwargs: Passed through to ``run_scraper`` (``save_to_db`` is always
            forced to ``False``).

    Returns:
        The list of scraped products.
    """
    scraped = run_scraper(save_to_db=False, **kwargs)
    with Path(output_path).open("w", encoding="utf-8") as handle:
        # default=str stringifies any value json cannot serialise natively.
        json.dump(scraped, handle, ensure_ascii=False, indent=2, default=str)
    log.info("Saved %d products to %s", len(scraped), output_path)
    return scraped
if __name__ == "__main__":
    # Command-line entry point: parse the flags and run the matching mode.
    import argparse

    cli = argparse.ArgumentParser(description="Rohlik.cz price scraper")
    cli.add_argument("--no-db", action="store_true", help="Save to JSON instead of MongoDB")
    cli.add_argument("--visible", action="store_true", help="Run browser in visible mode")
    cli.add_argument("--filter", type=str, help="Filter categories by name (e.g. 'Ovoce', 'Zelenina > Rajčata')")
    options = cli.parse_args()

    # --no-db writes products.json; otherwise results go to MongoDB.
    entry_point = scrape_to_json if options.no_db else run_scraper
    entry_point(category_filter=options.filter, headless=not options.visible)