commit 210311a7b31fee3f2787d5ac44dfe423b7e6a69e Author: Vladimir Buzalka Date: Sun May 31 12:24:25 2026 +0200 Initial commit: Rohlik.cz price scraper Co-Authored-By: Claude Opus 4.6 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..77ac754 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.venv/ +__pycache__/ +*.pyc diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..26d3352 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/.idea/Rohlik.iml b/.idea/Rohlik.iml new file mode 100644 index 0000000..23f8f9c --- /dev/null +++ b/.idea/Rohlik.iml @@ -0,0 +1,10 @@ + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..c53b08f --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,16 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..05e0969 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..d843f34 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/10PriceScraping/Rohlik/.env.example b/10PriceScraping/Rohlik/.env.example new file mode 100644 index 0000000..2bfec88 --- /dev/null +++ b/10PriceScraping/Rohlik/.env.example @@ -0,0 +1,4 @@ +MONGO_URI=mongodb://192.168.1.76:27017 +MONGO_DB=rohlik +ROHLIK_EMAIL=your@email.com +ROHLIK_PASSWORD=yourpassword diff --git 
a/10PriceScraping/Rohlik/.gitignore b/10PriceScraping/Rohlik/.gitignore new file mode 100644 index 0000000..7858911 --- /dev/null +++ b/10PriceScraping/Rohlik/.gitignore @@ -0,0 +1,6 @@ +.venv/ +.env +__pycache__/ +*.pyc +.idea/ +auth_state.json diff --git a/10PriceScraping/Rohlik/categories.py b/10PriceScraping/Rohlik/categories.py new file mode 100644 index 0000000..fd8726d --- /dev/null +++ b/10PriceScraping/Rohlik/categories.py @@ -0,0 +1,265 @@ +""" +Rohlik.cz Price Scraper - Category Tree +Version: 1.0.0 +Date: 2026-05-31 + +Complete hardcoded category tree for Rohlik.cz (14 main categories, ~170+ leaves). +Only leaf categories (without children) contain the full product listing. +""" + +CATEGORY_TREE = [ + { + "name": "Ovoce a zelenina", "id": "300102000", "url": "/c300102000-ovoce-a-zelenina", + "children": [ + {"name": "Zelenina", "id": "300102008", "url": "/c300102008-zelenina", "children": [ + {"name": "Okurky, cukety a lilky", "id": "300102013", "url": "/c300102013-okurky-cukety-a-lilky"}, + {"name": "Mrkev a kořenová zelenina", "id": "300102012", "url": "/c300102012-mrkev-a-korenova-zelenina"}, + {"name": "Cibule, česnek a pórek", "id": "300102010", "url": "/c300102010-cibule-cesnek-a-porek"}, + {"name": "Papriky a chilli", "id": "300102015", "url": "/c300102015-papriky-a-chilli"}, + {"name": "Saláty", "id": "300102021", "url": "/c300102021-salaty"}, + {"name": "Rajčata", "id": "300102014", "url": "/c300102014-rajcata"}, + {"name": "Brambory", "id": "300102009", "url": "/c300102009-brambory"}, + {"name": "Ostatní zelenina", "id": "300102020", "url": "/c300102020-ostatni-zelenina"}, + {"name": "Květák a košťálová zelenina", "id": "300102016", "url": "/c300102016-kvetak-a-kostalova-zelenina"}, + {"name": "Dýně a tykve", "id": "300102011", "url": "/c300102011-dyne-a-tykve"}, + {"name": "Kukuřice a lusky", "id": "300102017", "url": "/c300102017-kukurice-a-lusky"}, + {"name": "Bio zelenina", "id": "300102040", "url": "/c300102040-bio-zelenina"}, + {"name": 
"Baby zelenina", "id": "300102018", "url": "/c300102018-baby-zelenina"}, + {"name": "Rychlá příprava a předvařená zelenina", "id": "300112157", "url": "/c300112157-rychla-priprava-a-predvarena-zelenina"}, + ]}, + {"name": "Ovoce", "id": "300102001", "url": "/c300102001-ovoce"}, + {"name": "Bylinky a microgreens", "id": "300102026", "url": "/c300102026-bylinky-a-microgreens"}, + {"name": "Kuchařské boxy, hotové saláty a ready to cook", "id": "300124625", "url": "/c300124625-kucharske-boxy-hotove-salaty-a-ready-to-cook"}, + {"name": "Houby", "id": "300102022", "url": "/c300102022-houby"}, + {"name": "Květiny", "id": "300112201", "url": "/c300112201-kvetiny"}, + {"name": "Čerstvě sklizeno", "id": "300120435", "url": "/c300120435-cerstve-sklizeno"}, + {"name": "Český chřest", "id": "300114291", "url": "/c300114291-cesky-chrest"}, + {"name": "BIO ovoce a zelenina", "id": "300114343", "url": "/c300114343-bio-ovoce-a-zelenina"}, + {"name": "Kunratická stodola", "id": "300124164", "url": "/c300124164-kunraticka-stodola"}, + {"name": "Speciality ze světa", "id": "300102038", "url": "/c300102038-speciality-ze-sveta"}, + ] + }, + { + "name": "Mléčné a chlazené", "id": "300105000", "url": "/c300105000-mlecne-a-chlazene", + "children": [ + {"name": "Sýry", "id": "300105026", "url": "/c300105026-syry"}, + {"name": "Jogurty a mléčné dezerty", "id": "300105008", "url": "/c300105008-jogurty-a-mlecne-dezerty"}, + {"name": "Vejce a droždí", "id": "300105053", "url": "/c300105053-vejce-a-drozdi"}, + {"name": "Smetany, šlehačky a tvarohy", "id": "300105021", "url": "/c300105021-smetany-slehacky-a-tvarohy"}, + {"name": "Máslo, tuky a margaríny", "id": "300105048", "url": "/c300105048-maslo-tuky-a-margariny"}, + {"name": "Majonézy, tatarské omáčky a dresingy", "id": "300105058", "url": "/c300105058-majonezy-tatarske-omacky-a-dresingy"}, + {"name": "Mléko a mléčné nápoje", "id": "300105001", "url": "/c300105001-mleko-a-mlecne-napoje"}, + {"name": "Bez laktózy, A2 a High protein", "id": 
"300121231", "url": "/c300121231-bez-laktozy-a2-a-high-protein"}, + ] + }, + { + "name": "Maso a ryby", "id": "300103000", "url": "/c300103000-maso-a-ryby", + "children": [ + {"name": "Drůbež", "id": "300115247", "url": "/c300115247-drubez"}, + {"name": "Hovězí a telecí", "id": "300117217", "url": "/c300117217-hovezi-a-teleci"}, + {"name": "Vepřové", "id": "300103009", "url": "/c300103009-veprove"}, + {"name": "Ryby a mořské plody", "id": "300117385", "url": "/c300117385-ryby-a-morske-plody"}, + {"name": "BIO maso a ryby", "id": "300121424", "url": "/c300121424-bio-maso-a-ryby"}, + {"name": "Maso na gril, steaky a burgery", "id": "300117355", "url": "/c300117355-maso-na-gril-steaky-a-burgery"}, + {"name": "Zvěřina, jehněčí, králičí a speciality", "id": "300122988", "url": "/c300122988-zverina-jehneci-kralici-a-speciality"}, + ] + }, + { + "name": "Pekárna a cukrárna", "id": "300101000", "url": "/c300101000-pekarna-a-cukrarna", + "children": [ + {"name": "Slané pečivo", "id": "300101012", "url": "/c300101012-slane-pecivo"}, + {"name": "Chléb", "id": "300101007", "url": "/c300101007-chleb"}, + {"name": "Sladké pečivo", "id": "300101024", "url": "/c300101024-sladke-pecivo"}, + {"name": "Dorty a zákusky", "id": "300101033", "url": "/c300101033-dorty-a-zakusky"}, + {"name": "Racio a Knäckebrot", "id": "300101043", "url": "/c300101043-racio-a-knaeckebrot"}, + {"name": "Tortilly a pita chleby", "id": "300101019", "url": "/c300101019-tortilly-a-pita-chleby"}, + {"name": "Pečicí směsi a strouhanka", "id": "300101049", "url": "/c300101049-pecici-smesi-a-strouhanka"}, + {"name": "Bezlepková pekárna a cukrárna", "id": "300118595", "url": "/c300118595-bezlepkova-pekarna-a-cukrarna"}, + ] + }, + { + "name": "Uzeniny a lahůdky", "id": "300104000", "url": "/c300104000-uzeniny-a-lahudky", + "children": [ + {"name": "Šunky a slaniny", "id": "300104001", "url": "/c300104001-sunky-a-slaniny"}, + {"name": "Párky, klobásy a špekáčky", "id": "300104012", "url": 
"/c300104012-parky-klobasy-a-spekacky"}, + {"name": "Salámy", "id": "300104007", "url": "/c300104007-salamy"}, + {"name": "Hotová jídla a přílohy", "id": "300104049", "url": "/c300104049-hotova-jidla-a-prilohy"}, + {"name": "Lahůdky", "id": "300104039", "url": "/c300104039-lahudky"}, + {"name": "Paštiky a masné výrobky", "id": "300104016", "url": "/c300104016-pastiky-a-masne-vyrobky"}, + {"name": "Saláty, pomazánky a pesta", "id": "300104032", "url": "/c300104032-salaty-pomazanky-a-pesta"}, + {"name": "Zabijačkové speciality", "id": "300104020", "url": "/c300104020-zabijackove-speciality"}, + {"name": "Dárkové koše a kazety", "id": "300121878", "url": "/c300121878-darkove-kose-a-kazety"}, + ] + }, + { + "name": "Mražené", "id": "300107000", "url": "/c300107000-mrazene", + "children": [ + {"name": "Pizza", "id": "300107020", "url": "/c300107020-pizza"}, + {"name": "Zmrzlina a led", "id": "300107035", "url": "/c300107035-zmrzlina-a-led"}, + {"name": "Ovoce a zelenina", "id": "300122993", "url": "/c300122993-ovoce-a-zelenina"}, + {"name": "Bramborové výrobky", "id": "300107024", "url": "/c300107024-bramborove-vyrobky"}, + {"name": "Hotová jídla", "id": "300107028", "url": "/c300107028-hotova-jidla"}, + {"name": "Pečivo, dezerty a dorty", "id": "300107041", "url": "/c300107041-pecivo-dezerty-a-dorty"}, + {"name": "Ryby a mořské plody", "id": "300107014", "url": "/c300107014-ryby-a-morske-plody"}, + {"name": "Maso a drůbež", "id": "300107010", "url": "/c300107010-maso-a-drubez"}, + ] + }, + { + "name": "Grilování", "id": "300117503", "url": "/c300117503-grilovani", + "children": [ + {"name": "Rohlík doporučuje", "id": "300124865", "url": "/c300124865-rohlik-doporucuje"}, + {"name": "Maso a ryby", "id": "300117505", "url": "/c300117505-maso-a-ryby"}, + {"name": "Klobásy a slanina", "id": "300124497", "url": "/c300124497-klobasy-a-slanina"}, + {"name": "Sýry", "id": "300121109", "url": "/c300121109-syry"}, + {"name": "Pekárna", "id": "300117581", "url": 
"/c300117581-pekarna"}, + {"name": "Zelenina, houby a bylinky", "id": "300124458", "url": "/c300124458-zelenina-houby-a-bylinky"}, + {"name": "Sterilovaná zelenina a olivy", "id": "300124864", "url": "/c300124864-sterilovana-zelenina-a-olivy"}, + {"name": "Plant based", "id": "300122070", "url": "/c300122070-plant-based"}, + {"name": "Omáčky a koření", "id": "300117565", "url": "/c300117565-omacky-a-koreni"}, + {"name": "Nápoje", "id": "300124465", "url": "/c300124465-napoje"}, + {"name": "Příslušenství na grilování", "id": "300124489", "url": "/c300124489-prislusenstvi-na-grilovani"}, + ] + }, + { + "name": "Plant Based", "id": "300121429", "url": "/c300121429-plant-based", + "children": [ + {"name": "Trvanlivé", "id": "300121448", "url": "/c300121448-trvanlive"}, + {"name": "Nápoje", "id": "300121430", "url": "/c300121430-napoje"}, + {"name": "Alternativy mléčného a chlazeného", "id": "300121486", "url": "/c300121486-alternativy-mlecneho-a-chlazeneho"}, + {"name": "Alternativy uzenin a lahůdek", "id": "300121493", "url": "/c300121493-alternativy-uzenin-a-lahudek"}, + {"name": "Pekárna a cukrárna", "id": "300121491", "url": "/c300121491-pekarna-a-cukrarna"}, + {"name": "Mražené", "id": "300121492", "url": "/c300121492-mrazene"}, + {"name": "Bezmasé a nerybí výrobky", "id": "300121488", "url": "/c300121488-bezmase-a-nerybi-vyrobky"}, + {"name": "Ovoce a zelenina", "id": "300121436", "url": "/c300121436-ovoce-a-zelenina"}, + {"name": "Dítě", "id": "300121470", "url": "/c300121470-dite"}, + {"name": "Drogerie a kosmetika", "id": "300121621", "url": "/c300121621-drogerie-a-kosmetika"}, + ] + }, + { + "name": "Trvanlivé", "id": "300106000", "url": "/c300106000-trvanlive", + "children": [ + {"name": "Sladkosti", "id": "300106128", "url": "/c300106128-sladkosti"}, + {"name": "Přísady na pečení", "id": "300106001", "url": "/c300106001-prisady-na-peceni"}, + {"name": "Konzervy, hotovky a instantní pokrmy", "id": "300123466", "url": 
"/c300123466-konzervy-hotovky-a-instantni-pokrmy"}, + {"name": "Slané, snacky a ořechy", "id": "300123392", "url": "/c300123392-slane-snacky-a-orechy"}, + {"name": "Koření a ochucovadla", "id": "300106031", "url": "/c300106031-koreni-a-ochucovadla"}, + {"name": "Rýže, těstoviny, luštěniny, soja", "id": "300106096", "url": "/c300106096-ryze-testoviny-lusteniny-soja"}, + {"name": "Džemy, medy, oříškové krémy, sladké pomazánky a sirupy", "id": "300106124", "url": "/c300106124-dzemy-medy-oriskove-kremy-sladke-pomazanky-a-sirupy"}, + {"name": "Kečupy, hořčice a omáčky", "id": "300123436", "url": "/c300123436-kecupy-horcice-a-omacky"}, + {"name": "Oleje a octy", "id": "300123421", "url": "/c300123421-oleje-a-octy"}, + {"name": "Müsli, cereálie, kaše a tyčinky", "id": "300106117", "url": "/c300106117-muesli-cerealie-kase-a-tycinky"}, + ] + }, + { + "name": "Nápoje", "id": "300108000", "url": "/c300108000-napoje", + "children": [ + {"name": "Vody a minerálky", "id": "300122110", "url": "/c300122110-vody-a-mineralky"}, + {"name": "Limonády a energy", "id": "300122167", "url": "/c300122167-limonady-a-energy"}, + {"name": "Piva a cidery", "id": "300108052", "url": "/c300108052-piva-a-cidery"}, + {"name": "Džusy, ovocné nápoje a sirupy", "id": "300122136", "url": "/c300122136-dzusy-ovocne-napoje-a-sirupy"}, + {"name": "Víno", "id": "300108064", "url": "/c300108064-vino"}, + {"name": "Horké nápoje", "id": "300123000", "url": "/c300123000-horke-napoje"}, + {"name": "Lihoviny", "id": "300108072", "url": "/c300108072-lihoviny"}, + {"name": "Vozíme chlazené", "id": "300124505", "url": "/c300124505-vozime-chlazene"}, + {"name": "Dětské nápoje", "id": "300122626", "url": "/c300122626-detske-napoje"}, + ] + }, + { + "name": "Speciální výživa", "id": "300112393", "url": "/c300112393-specialni-vyziva", + "children": [ + {"name": "Bezlaktózové výrobky", "id": "300112395", "url": "/c300112395-bezlaktozove-vyrobky"}, + {"name": "Fitness výživa", "id": "300115595", "url": 
"/c300115595-fitness-vyziva"}, + {"name": "Bezlepkové výrobky", "id": "300112399", "url": "/c300112399-bezlepkove-vyrobky"}, + {"name": "Potraviny se sníženým obsahem cukru", "id": "300112403", "url": "/c300112403-potraviny-se-snizenym-obsahem-cukru"}, + {"name": "Veggie", "id": "300112843", "url": "/c300112843-veggie"}, + {"name": "Alternativní strava", "id": "300124517", "url": "/c300124517-alternativni-strava"}, + {"name": "Doplňky stravy", "id": "300115585", "url": "/c300115585-doplnky-stravy"}, + {"name": "Nízkobílkovinové výrobky", "id": "300122547", "url": "/c300122547-nizkobilkovinove-vyrobky"}, + ] + }, + { + "name": "Kosmetika", "id": "300124206", "url": "/c300124206-kosmetika", + "children": [ + {"name": "Vlasová péče", "id": "300124300", "url": "/c300124300-vlasova-pece"}, + {"name": "Ústní hygiena", "id": "300124215", "url": "/c300124215-ustni-hygiena"}, + {"name": "Pleťová péče a dekorativní kosmetika", "id": "300124327", "url": "/c300124327-pletova-pece-a-dekorativni-kosmetika"}, + {"name": "Dámská hygiena", "id": "300124245", "url": "/c300124245-damska-hygiena"}, + {"name": "Sprchové gely a mýdla a přísady do koupele", "id": "300124259", "url": "/c300124259-sprchove-gely-a-mydla-a-prisady-do-koupele"}, + {"name": "Tělová péče", "id": "300124320", "url": "/c300124320-telova-pece"}, + {"name": "Holení a depilace", "id": "300124337", "url": "/c300124337-holeni-a-depilace"}, + {"name": "Deodoranty a parfémy", "id": "300124281", "url": "/c300124281-deodoranty-a-parfemy"}, + {"name": "Pánský svět", "id": "300124350", "url": "/c300124350-pansky-svet"}, + {"name": "Pro intimní chvíle", "id": "300124361", "url": "/c300124361-pro-intimni-chvile"}, + {"name": "Kosmetické dárkové balíčky", "id": "300124366", "url": "/c300124366-kosmeticke-darkove-balicky"}, + {"name": "Cestovní balení a produkty na léto", "id": "300124367", "url": "/c300124367-cestovni-baleni-a-produkty-na-leto"}, + ] + }, + { + "name": "Drogerie", "id": "300109000", "url": 
"/c300109000-drogerie", + "children": [ + {"name": "Papírová a vatová hygiena", "id": "300109010", "url": "/c300109010-papirova-a-vatova-hygiena"}, + {"name": "Prací prostředky", "id": "300109001", "url": "/c300109001-praci-prostredky"}, + {"name": "Mytí nádobí", "id": "300109042", "url": "/c300109042-myti-nadobi"}, + {"name": "Čisticí prostředky", "id": "300109028", "url": "/c300109028-cistici-prostredky"}, + {"name": "Ekologická a šetrná drogerie", "id": "300124564", "url": "/c300124564-ekologicka-a-setrna-drogerie"}, + {"name": "Dětské praní", "id": "300124266", "url": "/c300124266-detske-prani"}, + {"name": "Velká balení", "id": "300124287", "url": "/c300124287-velka-baleni"}, + ] + }, + { + "name": "Domácnost a zahrada", "id": "300111000", "url": "/c300111000-domacnost-a-zahrada", + "children": [ + {"name": "Úklidové potřeby", "id": "300124615", "url": "/c300124615-uklidove-potreby"}, + {"name": "Dům, byt a garáž", "id": "300123083", "url": "/c300123083-dum-byt-a-garaz"}, + {"name": "Kuchyňské potřeby", "id": "300124614", "url": "/c300124614-kuchynske-potreby"}, + {"name": "Zahrada a květiny", "id": "300111043", "url": "/c300111043-zahrada-a-kvetiny"}, + {"name": "Kancelář a papírnictví", "id": "300111034", "url": "/c300111034-kancelar-a-papirnictvi"}, + {"name": "Dekorace, svíčky a vůně", "id": "300124685", "url": "/c300124685-dekorace-svicky-a-vune"}, + {"name": "Trafika", "id": "300111091", "url": "/c300111091-trafika"}, + {"name": "Oslavy a párty", "id": "300124678", "url": "/c300124678-oslavy-a-party"}, + {"name": "Punčocháče a ponožky", "id": "300124722", "url": "/c300124722-puncochace-a-ponozky"}, + {"name": "Knihy", "id": "300124157", "url": "/c300124157-knihy"}, + ] + }, +] + + +def get_leaf_categories(tree: list[dict] | None = None, parent_path: list[str] | None = None) -> list[dict]: + if tree is None: + tree = CATEGORY_TREE + if parent_path is None: + parent_path = [] + + leaves = [] + for cat in tree: + current_path = parent_path + [cat["name"]] + 
children = cat.get("children") + if children: + leaves.extend(get_leaf_categories(children, current_path)) + else: + leaves.append({ + "id": cat["id"], + "name": cat["name"], + "url": cat["url"], + "path": current_path, + "parent_id": tree[0].get("id") if parent_path else None, + }) + return leaves + + +def get_all_categories_flat(tree: list[dict] | None = None, parent_id: str | None = None) -> list[dict]: + if tree is None: + tree = CATEGORY_TREE + + result = [] + for cat in tree: + result.append({ + "category_id": cat["id"], + "name": cat["name"], + "url": cat["url"], + "parent_id": parent_id, + "has_children": bool(cat.get("children")), + }) + if cat.get("children"): + result.extend(get_all_categories_flat(cat["children"], parent_id=cat["id"])) + return result diff --git a/10PriceScraping/Rohlik/config.py b/10PriceScraping/Rohlik/config.py new file mode 100644 index 0000000..a9e09e9 --- /dev/null +++ b/10PriceScraping/Rohlik/config.py @@ -0,0 +1,27 @@ +""" +Rohlik.cz Price Scraper - Configuration +Version: 1.0.0 +Date: 2026-05-31 + +Central configuration for the Rohlik.cz price scraper. +Loads environment variables from .env file for credentials and MongoDB connection. +Defines scraping parameters (scroll behavior, timeouts) and URL constants. +""" + +import os +from dotenv import load_dotenv + +load_dotenv() + +MONGO_URI = os.getenv("MONGO_URI", "mongodb://192.168.1.76:27017") +MONGO_DB = os.getenv("MONGO_DB", "rohlik") + +ROHLIK_EMAIL = os.getenv("ROHLIK_EMAIL", "") +ROHLIK_PASSWORD = os.getenv("ROHLIK_PASSWORD", "") + +BASE_URL = "https://www.rohlik.cz" + +AUTH_STATE_PATH = "auth_state.json" + +SCROLL_PAUSE = 1.5 +MAX_SCROLLS = 50 diff --git a/10PriceScraping/Rohlik/db.py b/10PriceScraping/Rohlik/db.py new file mode 100644 index 0000000..ab86094 --- /dev/null +++ b/10PriceScraping/Rohlik/db.py @@ -0,0 +1,88 @@ +""" +Rohlik.cz Price Scraper - Database Operations +Version: 1.0.0 +Date: 2026-05-31 + +MongoDB operations for the Rohlik.cz price scraper. 
def get_db():
    """Return the scraper's MongoDB database handle.

    A new ``MongoClient`` is created from ``MONGO_URI`` on every call and the
    database named ``MONGO_DB`` is looked up on it.
    """
    return MongoClient(MONGO_URI)[MONGO_DB]


def ensure_indexes(db):
    """Create the indexes on products, price_history, categories and scrape_runs.

    ``product_id`` and ``category_id`` get unique indexes in their own
    collections; all other indexes are plain ascending indexes.
    """
    products = db.products
    products.create_index([("product_id", ASCENDING)], unique=True)
    for field in ("category_id", "name"):
        products.create_index([(field, ASCENDING)])

    history = db.price_history
    history.create_index([("product_id", ASCENDING), ("scraped_at", ASCENDING)])
    history.create_index([("scraped_at", ASCENDING)])

    db.categories.create_index([("category_id", ASCENDING)], unique=True)

    db.scrape_runs.create_index([("started_at", ASCENDING)])


def upsert_product(db, product: dict):
    """Store the current state of one product and append a price-history row.

    The ``products`` document keyed by ``product_id`` is created or updated,
    then a new ``price_history`` document is inserted. Both writes share one
    UTC timestamp.

    Args:
        db: Database handle exposing the ``products`` and ``price_history``
            collections.
        product: Scraped product. ``product_id``, ``name`` and ``price`` are
            read with ``[]`` and must be present; every other field is
            optional and stored as ``None`` when missing.
    """
    stamp = datetime.now(timezone.utc)
    pid = product["product_id"]

    # "name" is mandatory; the remaining descriptive fields fall back to None.
    descriptive = {"name": product["name"]}
    for key in (
        "category_id",
        "category_name",
        "amount",
        "unit_price",
        "image_url",
        "product_url",
        "category_path",
    ):
        descriptive[key] = product.get(key)
    descriptive["updated_at"] = stamp

    db.products.update_one(
        {"product_id": pid},
        {"$set": descriptive, "$setOnInsert": {"created_at": stamp}},
        upsert=True,
    )

    # Built only after the upsert, so a missing "price" surfaces at the same
    # point as before (after the product document has been written).
    history_row = {
        "product_id": pid,
        "price": product["price"],
        "original_price": product.get("original_price"),
        "discount_badge": product.get("discount_badge"),
        "unit_price": product.get("unit_price"),
        "scraped_at": stamp,
    }
    db.price_history.insert_one(history_row)


def upsert_category(db, category: dict):
    """Create or update one document in the ``categories`` collection.

    Args:
        db: Database handle exposing the ``categories`` collection.
        category: Dict with ``category_id``, ``name`` and ``url`` (required)
            plus optional ``parent_id`` and ``has_children`` (stored as
            ``False`` when missing).
    """
    stamp = datetime.now(timezone.utc)
    selector = {"category_id": category["category_id"]}
    changes = {
        "$set": {
            "name": category["name"],
            "url": category["url"],
            "parent_id": category.get("parent_id"),
            "has_children": category.get("has_children", False),
            "updated_at": stamp,
        },
        "$setOnInsert": {"created_at": stamp},
    }
    db.categories.update_one(selector, changes, upsert=True)


def log_scrape_run(db, run_data: dict):
    """Insert ``run_data`` as a new document into the ``scrape_runs`` collection."""
    db.scrape_runs.insert_one(run_data)
int(digits) / 100 + + +def parse_original_price(raw: str | None) -> float | None: + if not raw: + return None + match = re.search(r"([\d\s]+[,.][\d]+)", raw.replace("\xa0", " ")) + if match: + return float(match.group(1).replace(" ", "").replace(",", ".")) + digits = re.sub(r"[^\d]", "", raw) + if digits: + return float(digits) / 100 + return None + + +def login(page: Page): + log.info("Logging in to Rohlik.cz...") + page.goto(BASE_URL, wait_until="domcontentloaded", timeout=60000) + page.wait_for_timeout(3000) + + page.locator('text="Přihlásit se"').first.click() + page.wait_for_timeout(2000) + + page.locator('input[type="email"], input[name="email"]').first.fill(ROHLIK_EMAIL) + page.locator('input[type="password"], input[name="password"]').first.fill(ROHLIK_PASSWORD) + page.locator('button[type="submit"]').first.click() + page.wait_for_timeout(5000) + + page.context.storage_state(path=AUTH_STATE_PATH) + log.info("Login successful, auth state saved.") + + +def scroll_to_load_all(page: Page) -> int: + prev_count = 0 + for i in range(MAX_SCROLLS): + page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + page.wait_for_timeout(int(SCROLL_PAUSE * 1000)) + current_count = page.locator('[data-test^="productCard-AVAILABLE-"]').count() + if current_count == prev_count and i > 2: + break + prev_count = current_count + return prev_count + + +def extract_products(page: Page, category: dict) -> list[dict]: + products_data = page.evaluate(""" + () => { + const products = []; + document.querySelectorAll('[data-test^="productCard-AVAILABLE-"]').forEach(card => { + const id = card.getAttribute('data-test').replace('productCard-AVAILABLE-', ''); + const nameEl = card.querySelector('[data-test="productCard-body-name"]'); + const priceNoEl = card.querySelector('[data-test="productCard-body-price-priceNo"]'); + const saleEl = card.querySelector('[data-test="productCard-body-price-sale"]'); + const amountEl = card.querySelector('[data-test="productCard-footer-amount"]'); + 
const unitPriceEl = card.querySelector('[data-test="productCard-footer-unitPrice"]'); + const badgeEl = card.querySelector('[data-test="productCard-body-badge"]'); + const imgEl = card.querySelector('img'); + const linkEl = card.querySelector('a[href*="/"]'); + + products.push({ + product_id: id, + name: nameEl?.textContent?.trim() || '', + price_raw: priceNoEl?.textContent?.trim() || '', + original_price_raw: saleEl?.textContent?.trim() || '', + amount: amountEl?.textContent?.trim() || '', + unit_price_raw: unitPriceEl?.textContent?.trim() || '', + discount_badge: badgeEl?.textContent?.trim() || '', + image_url: imgEl?.src || '', + product_url: linkEl?.getAttribute('href') || '', + }); + }); + return products; + } + """) + + results = [] + for p in products_data: + results.append({ + "product_id": p["product_id"], + "name": p["name"], + "price": parse_price(p["price_raw"]), + "original_price": parse_original_price(p["original_price_raw"]), + "discount_badge": p["discount_badge"] or None, + "amount": p["amount"] or None, + "unit_price": p["unit_price_raw"].strip() or None, + "image_url": p["image_url"] or None, + "product_url": f"{BASE_URL}{p['product_url']}" if p["product_url"] else None, + "category_id": category["id"], + "category_name": category["name"], + "category_path": " > ".join(category.get("path", [category["name"]])), + }) + return results + + +def scrape_leaf(page: Page, category: dict) -> list[dict]: + url = f"{BASE_URL}{category['url']}" + log.info("Scraping: %s (%s)", " > ".join(category.get("path", [category["name"]])), url) + + page.goto(url, wait_until="domcontentloaded", timeout=60000) + page.wait_for_timeout(3000) + + try: + page.wait_for_selector('[data-test^="productCard-AVAILABLE-"]', timeout=15000) + except Exception: + log.warning(" No products found in %s, skipping.", category["name"]) + return [] + + total = scroll_to_load_all(page) + products = extract_products(page, category) + log.info(" %d products extracted (loaded %d)", 
def scrape_leaf(page: Page, category: dict) -> list[dict]:
    """Open one leaf category page and return all products listed on it.

    Args:
        page: Page used for navigation.
        category: Leaf category dict with ``url`` and ``name`` and an optional
            ``path`` list (used for logging).

    Returns:
        The extracted products, or an empty list when no product card shows
        up within 15 seconds.
    """
    # Local import keeps this block self-contained.
    from playwright.sync_api import TimeoutError as PlaywrightTimeoutError

    url = f"{BASE_URL}{category['url']}"
    log.info("Scraping: %s (%s)", " > ".join(category.get("path", [category["name"]])), url)

    page.goto(url, wait_until="domcontentloaded", timeout=60000)
    page.wait_for_timeout(3000)

    try:
        page.wait_for_selector('[data-test^="productCard-AVAILABLE-"]', timeout=15000)
    except PlaywrightTimeoutError:
        # Only a timeout means "no products"; any other failure propagates to
        # the caller instead of being reported as an empty category.
        log.warning(" No products found in %s, skipping.", category["name"])
        return []

    total = scroll_to_load_all(page)
    products = extract_products(page, category)
    log.info(" %d products extracted (loaded %d)", len(products), total)
    return products


def run_scraper(
    category_filter: str | None = None,
    headless: bool = True,
    save_to_db: bool = True,
):
    """Scrape all (or the filtered) leaf categories and optionally store them.

    Args:
        category_filter: Case-insensitive substring matched against each
            leaf's ``" > "``-joined path; ``None`` or empty scrapes every leaf.
        headless: Launch Chromium without a visible window.
        save_to_db: When true, upsert categories and products into MongoDB
            and record a ``scrape_runs`` entry.

    Returns:
        The list of unique products (first occurrence per ``product_id``).
    """
    leaves = get_leaf_categories()
    if category_filter:
        category_filter_lower = category_filter.lower()
        leaves = [c for c in leaves if category_filter_lower in " > ".join(c["path"]).lower()]

    log.info("Will scrape %d leaf categories", len(leaves))

    with sync_playwright() as pw:
        browser = pw.chromium.launch(headless=headless)
        try:
            # Reuse a previously saved session when one exists.
            ctx_args = {}
            if Path(AUTH_STATE_PATH).exists():
                ctx_args["storage_state"] = AUTH_STATE_PATH

            context = browser.new_context(**ctx_args)
            page = context.new_page()

            page.goto(BASE_URL, wait_until="domcontentloaded", timeout=60000)
            page.wait_for_timeout(5000)

            is_logged_in = page.locator('text="Přihlásit se"').count() == 0
            if not is_logged_in:
                if ROHLIK_EMAIL and ROHLIK_PASSWORD:
                    login(page)
                    # The pre-login context is no longer needed; close it
                    # before switching to one built from the saved state.
                    context.close()
                    context = browser.new_context(storage_state=AUTH_STATE_PATH)
                    page = context.new_page()
                else:
                    log.warning("Not logged in! Prices may differ from member prices.")

            run_start = datetime.now(timezone.utc)
            all_products = []
            seen_ids = set()

            db = None
            if save_to_db:
                db = get_db()
                ensure_indexes(db)
                for cat_data in get_all_categories_flat():
                    upsert_category(db, cat_data)

            for leaf in leaves:
                try:
                    for p in scrape_leaf(page, leaf):
                        if p["product_id"] in seen_ids:
                            continue
                        seen_ids.add(p["product_id"])
                        all_products.append(p)
                        # PyMongo Database objects raise NotImplementedError
                        # on truth-value testing, so compare with None.
                        if db is not None:
                            upsert_product(db, p)
                except Exception:
                    # One failing category must not abort the whole run.
                    log.exception("Error scraping %s", leaf["name"])

            run_end = datetime.now(timezone.utc)
            run_data = {
                "started_at": run_start,
                "finished_at": run_end,
                "duration_seconds": (run_end - run_start).total_seconds(),
                "categories_scraped": len(leaves),
                "products_scraped": len(all_products),
            }

            if db is not None:
                log_scrape_run(db, run_data)

            log.info(
                "Done: %d unique products from %d categories in %.1fs",
                len(all_products), len(leaves), run_data["duration_seconds"],
            )
        finally:
            # Close the browser even when setup or login fails.
            browser.close()

    return all_products


def scrape_to_json(output_path: str = "products.json", **kwargs):
    """Run the scraper without MongoDB and dump the products to a JSON file.

    Args:
        output_path: Destination file, written as UTF-8.
        **kwargs: Forwarded to ``run_scraper`` (``save_to_db`` is forced to
            ``False`` and must not be passed).

    Returns:
        The list of scraped products.
    """
    products = run_scraper(save_to_db=False, **kwargs)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(products, f, ensure_ascii=False, indent=2, default=str)
    log.info("Saved %d products to %s", len(products), output_path)
    return products


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Rohlik.cz price scraper")
    parser.add_argument("--no-db", action="store_true", help="Save to JSON instead of MongoDB")
    parser.add_argument("--visible", action="store_true", help="Run browser in visible mode")
    parser.add_argument("--filter", type=str, help="Filter categories by name (e.g. 'Ovoce', 'Zelenina > Rajčata')")
    args = parser.parse_args()

    if args.no_db:
        scrape_to_json(category_filter=args.filter, headless=not args.visible)
    else:
        run_scraper(category_filter=args.filter, headless=not args.visible)