notebookVB
This commit is contained in:
@@ -0,0 +1,265 @@
|
||||
"""
|
||||
Rohlik.cz Price Scraper - Category Tree
|
||||
Version: 1.0.0
|
||||
Date: 2026-05-31
|
||||
|
||||
Complete hardcoded category tree for Rohlik.cz (14 main categories, roughly 140 leaf entries).
|
||||
Only leaf categories (without children) contain the full product listing.
|
||||
"""
|
||||
|
||||
CATEGORY_TREE = [
|
||||
{
|
||||
"name": "Ovoce a zelenina", "id": "300102000", "url": "/c300102000-ovoce-a-zelenina",
|
||||
"children": [
|
||||
{"name": "Zelenina", "id": "300102008", "url": "/c300102008-zelenina", "children": [
|
||||
{"name": "Okurky, cukety a lilky", "id": "300102013", "url": "/c300102013-okurky-cukety-a-lilky"},
|
||||
{"name": "Mrkev a kořenová zelenina", "id": "300102012", "url": "/c300102012-mrkev-a-korenova-zelenina"},
|
||||
{"name": "Cibule, česnek a pórek", "id": "300102010", "url": "/c300102010-cibule-cesnek-a-porek"},
|
||||
{"name": "Papriky a chilli", "id": "300102015", "url": "/c300102015-papriky-a-chilli"},
|
||||
{"name": "Saláty", "id": "300102021", "url": "/c300102021-salaty"},
|
||||
{"name": "Rajčata", "id": "300102014", "url": "/c300102014-rajcata"},
|
||||
{"name": "Brambory", "id": "300102009", "url": "/c300102009-brambory"},
|
||||
{"name": "Ostatní zelenina", "id": "300102020", "url": "/c300102020-ostatni-zelenina"},
|
||||
{"name": "Květák a košťálová zelenina", "id": "300102016", "url": "/c300102016-kvetak-a-kostalova-zelenina"},
|
||||
{"name": "Dýně a tykve", "id": "300102011", "url": "/c300102011-dyne-a-tykve"},
|
||||
{"name": "Kukuřice a lusky", "id": "300102017", "url": "/c300102017-kukurice-a-lusky"},
|
||||
{"name": "Bio zelenina", "id": "300102040", "url": "/c300102040-bio-zelenina"},
|
||||
{"name": "Baby zelenina", "id": "300102018", "url": "/c300102018-baby-zelenina"},
|
||||
{"name": "Rychlá příprava a předvařená zelenina", "id": "300112157", "url": "/c300112157-rychla-priprava-a-predvarena-zelenina"},
|
||||
]},
|
||||
{"name": "Ovoce", "id": "300102001", "url": "/c300102001-ovoce"},
|
||||
{"name": "Bylinky a microgreens", "id": "300102026", "url": "/c300102026-bylinky-a-microgreens"},
|
||||
{"name": "Kuchařské boxy, hotové saláty a ready to cook", "id": "300124625", "url": "/c300124625-kucharske-boxy-hotove-salaty-a-ready-to-cook"},
|
||||
{"name": "Houby", "id": "300102022", "url": "/c300102022-houby"},
|
||||
{"name": "Květiny", "id": "300112201", "url": "/c300112201-kvetiny"},
|
||||
{"name": "Čerstvě sklizeno", "id": "300120435", "url": "/c300120435-cerstve-sklizeno"},
|
||||
{"name": "Český chřest", "id": "300114291", "url": "/c300114291-cesky-chrest"},
|
||||
{"name": "BIO ovoce a zelenina", "id": "300114343", "url": "/c300114343-bio-ovoce-a-zelenina"},
|
||||
{"name": "Kunratická stodola", "id": "300124164", "url": "/c300124164-kunraticka-stodola"},
|
||||
{"name": "Speciality ze světa", "id": "300102038", "url": "/c300102038-speciality-ze-sveta"},
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Mléčné a chlazené", "id": "300105000", "url": "/c300105000-mlecne-a-chlazene",
|
||||
"children": [
|
||||
{"name": "Sýry", "id": "300105026", "url": "/c300105026-syry"},
|
||||
{"name": "Jogurty a mléčné dezerty", "id": "300105008", "url": "/c300105008-jogurty-a-mlecne-dezerty"},
|
||||
{"name": "Vejce a droždí", "id": "300105053", "url": "/c300105053-vejce-a-drozdi"},
|
||||
{"name": "Smetany, šlehačky a tvarohy", "id": "300105021", "url": "/c300105021-smetany-slehacky-a-tvarohy"},
|
||||
{"name": "Máslo, tuky a margaríny", "id": "300105048", "url": "/c300105048-maslo-tuky-a-margariny"},
|
||||
{"name": "Majonézy, tatarské omáčky a dresingy", "id": "300105058", "url": "/c300105058-majonezy-tatarske-omacky-a-dresingy"},
|
||||
{"name": "Mléko a mléčné nápoje", "id": "300105001", "url": "/c300105001-mleko-a-mlecne-napoje"},
|
||||
{"name": "Bez laktózy, A2 a High protein", "id": "300121231", "url": "/c300121231-bez-laktozy-a2-a-high-protein"},
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Maso a ryby", "id": "300103000", "url": "/c300103000-maso-a-ryby",
|
||||
"children": [
|
||||
{"name": "Drůbež", "id": "300115247", "url": "/c300115247-drubez"},
|
||||
{"name": "Hovězí a telecí", "id": "300117217", "url": "/c300117217-hovezi-a-teleci"},
|
||||
{"name": "Vepřové", "id": "300103009", "url": "/c300103009-veprove"},
|
||||
{"name": "Ryby a mořské plody", "id": "300117385", "url": "/c300117385-ryby-a-morske-plody"},
|
||||
{"name": "BIO maso a ryby", "id": "300121424", "url": "/c300121424-bio-maso-a-ryby"},
|
||||
{"name": "Maso na gril, steaky a burgery", "id": "300117355", "url": "/c300117355-maso-na-gril-steaky-a-burgery"},
|
||||
{"name": "Zvěřina, jehněčí, králičí a speciality", "id": "300122988", "url": "/c300122988-zverina-jehneci-kralici-a-speciality"},
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Pekárna a cukrárna", "id": "300101000", "url": "/c300101000-pekarna-a-cukrarna",
|
||||
"children": [
|
||||
{"name": "Slané pečivo", "id": "300101012", "url": "/c300101012-slane-pecivo"},
|
||||
{"name": "Chléb", "id": "300101007", "url": "/c300101007-chleb"},
|
||||
{"name": "Sladké pečivo", "id": "300101024", "url": "/c300101024-sladke-pecivo"},
|
||||
{"name": "Dorty a zákusky", "id": "300101033", "url": "/c300101033-dorty-a-zakusky"},
|
||||
{"name": "Racio a Knäckebrot", "id": "300101043", "url": "/c300101043-racio-a-knaeckebrot"},
|
||||
{"name": "Tortilly a pita chleby", "id": "300101019", "url": "/c300101019-tortilly-a-pita-chleby"},
|
||||
{"name": "Pečicí směsi a strouhanka", "id": "300101049", "url": "/c300101049-pecici-smesi-a-strouhanka"},
|
||||
{"name": "Bezlepková pekárna a cukrárna", "id": "300118595", "url": "/c300118595-bezlepkova-pekarna-a-cukrarna"},
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Uzeniny a lahůdky", "id": "300104000", "url": "/c300104000-uzeniny-a-lahudky",
|
||||
"children": [
|
||||
{"name": "Šunky a slaniny", "id": "300104001", "url": "/c300104001-sunky-a-slaniny"},
|
||||
{"name": "Párky, klobásy a špekáčky", "id": "300104012", "url": "/c300104012-parky-klobasy-a-spekacky"},
|
||||
{"name": "Salámy", "id": "300104007", "url": "/c300104007-salamy"},
|
||||
{"name": "Hotová jídla a přílohy", "id": "300104049", "url": "/c300104049-hotova-jidla-a-prilohy"},
|
||||
{"name": "Lahůdky", "id": "300104039", "url": "/c300104039-lahudky"},
|
||||
{"name": "Paštiky a masné výrobky", "id": "300104016", "url": "/c300104016-pastiky-a-masne-vyrobky"},
|
||||
{"name": "Saláty, pomazánky a pesta", "id": "300104032", "url": "/c300104032-salaty-pomazanky-a-pesta"},
|
||||
{"name": "Zabijačkové speciality", "id": "300104020", "url": "/c300104020-zabijackove-speciality"},
|
||||
{"name": "Dárkové koše a kazety", "id": "300121878", "url": "/c300121878-darkove-kose-a-kazety"},
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Mražené", "id": "300107000", "url": "/c300107000-mrazene",
|
||||
"children": [
|
||||
{"name": "Pizza", "id": "300107020", "url": "/c300107020-pizza"},
|
||||
{"name": "Zmrzlina a led", "id": "300107035", "url": "/c300107035-zmrzlina-a-led"},
|
||||
{"name": "Ovoce a zelenina", "id": "300122993", "url": "/c300122993-ovoce-a-zelenina"},
|
||||
{"name": "Bramborové výrobky", "id": "300107024", "url": "/c300107024-bramborove-vyrobky"},
|
||||
{"name": "Hotová jídla", "id": "300107028", "url": "/c300107028-hotova-jidla"},
|
||||
{"name": "Pečivo, dezerty a dorty", "id": "300107041", "url": "/c300107041-pecivo-dezerty-a-dorty"},
|
||||
{"name": "Ryby a mořské plody", "id": "300107014", "url": "/c300107014-ryby-a-morske-plody"},
|
||||
{"name": "Maso a drůbež", "id": "300107010", "url": "/c300107010-maso-a-drubez"},
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Grilování", "id": "300117503", "url": "/c300117503-grilovani",
|
||||
"children": [
|
||||
{"name": "Rohlík doporučuje", "id": "300124865", "url": "/c300124865-rohlik-doporucuje"},
|
||||
{"name": "Maso a ryby", "id": "300117505", "url": "/c300117505-maso-a-ryby"},
|
||||
{"name": "Klobásy a slanina", "id": "300124497", "url": "/c300124497-klobasy-a-slanina"},
|
||||
{"name": "Sýry", "id": "300121109", "url": "/c300121109-syry"},
|
||||
{"name": "Pekárna", "id": "300117581", "url": "/c300117581-pekarna"},
|
||||
{"name": "Zelenina, houby a bylinky", "id": "300124458", "url": "/c300124458-zelenina-houby-a-bylinky"},
|
||||
{"name": "Sterilovaná zelenina a olivy", "id": "300124864", "url": "/c300124864-sterilovana-zelenina-a-olivy"},
|
||||
{"name": "Plant based", "id": "300122070", "url": "/c300122070-plant-based"},
|
||||
{"name": "Omáčky a koření", "id": "300117565", "url": "/c300117565-omacky-a-koreni"},
|
||||
{"name": "Nápoje", "id": "300124465", "url": "/c300124465-napoje"},
|
||||
{"name": "Příslušenství na grilování", "id": "300124489", "url": "/c300124489-prislusenstvi-na-grilovani"},
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Plant Based", "id": "300121429", "url": "/c300121429-plant-based",
|
||||
"children": [
|
||||
{"name": "Trvanlivé", "id": "300121448", "url": "/c300121448-trvanlive"},
|
||||
{"name": "Nápoje", "id": "300121430", "url": "/c300121430-napoje"},
|
||||
{"name": "Alternativy mléčného a chlazeného", "id": "300121486", "url": "/c300121486-alternativy-mlecneho-a-chlazeneho"},
|
||||
{"name": "Alternativy uzenin a lahůdek", "id": "300121493", "url": "/c300121493-alternativy-uzenin-a-lahudek"},
|
||||
{"name": "Pekárna a cukrárna", "id": "300121491", "url": "/c300121491-pekarna-a-cukrarna"},
|
||||
{"name": "Mražené", "id": "300121492", "url": "/c300121492-mrazene"},
|
||||
{"name": "Bezmasé a nerybí výrobky", "id": "300121488", "url": "/c300121488-bezmase-a-nerybi-vyrobky"},
|
||||
{"name": "Ovoce a zelenina", "id": "300121436", "url": "/c300121436-ovoce-a-zelenina"},
|
||||
{"name": "Dítě", "id": "300121470", "url": "/c300121470-dite"},
|
||||
{"name": "Drogerie a kosmetika", "id": "300121621", "url": "/c300121621-drogerie-a-kosmetika"},
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Trvanlivé", "id": "300106000", "url": "/c300106000-trvanlive",
|
||||
"children": [
|
||||
{"name": "Sladkosti", "id": "300106128", "url": "/c300106128-sladkosti"},
|
||||
{"name": "Přísady na pečení", "id": "300106001", "url": "/c300106001-prisady-na-peceni"},
|
||||
{"name": "Konzervy, hotovky a instantní pokrmy", "id": "300123466", "url": "/c300123466-konzervy-hotovky-a-instantni-pokrmy"},
|
||||
{"name": "Slané, snacky a ořechy", "id": "300123392", "url": "/c300123392-slane-snacky-a-orechy"},
|
||||
{"name": "Koření a ochucovadla", "id": "300106031", "url": "/c300106031-koreni-a-ochucovadla"},
|
||||
{"name": "Rýže, těstoviny, luštěniny, soja", "id": "300106096", "url": "/c300106096-ryze-testoviny-lusteniny-soja"},
|
||||
{"name": "Džemy, medy, oříškové krémy, sladké pomazánky a sirupy", "id": "300106124", "url": "/c300106124-dzemy-medy-oriskove-kremy-sladke-pomazanky-a-sirupy"},
|
||||
{"name": "Kečupy, hořčice a omáčky", "id": "300123436", "url": "/c300123436-kecupy-horcice-a-omacky"},
|
||||
{"name": "Oleje a octy", "id": "300123421", "url": "/c300123421-oleje-a-octy"},
|
||||
{"name": "Müsli, cereálie, kaše a tyčinky", "id": "300106117", "url": "/c300106117-muesli-cerealie-kase-a-tycinky"},
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Nápoje", "id": "300108000", "url": "/c300108000-napoje",
|
||||
"children": [
|
||||
{"name": "Vody a minerálky", "id": "300122110", "url": "/c300122110-vody-a-mineralky"},
|
||||
{"name": "Limonády a energy", "id": "300122167", "url": "/c300122167-limonady-a-energy"},
|
||||
{"name": "Piva a cidery", "id": "300108052", "url": "/c300108052-piva-a-cidery"},
|
||||
{"name": "Džusy, ovocné nápoje a sirupy", "id": "300122136", "url": "/c300122136-dzusy-ovocne-napoje-a-sirupy"},
|
||||
{"name": "Víno", "id": "300108064", "url": "/c300108064-vino"},
|
||||
{"name": "Horké nápoje", "id": "300123000", "url": "/c300123000-horke-napoje"},
|
||||
{"name": "Lihoviny", "id": "300108072", "url": "/c300108072-lihoviny"},
|
||||
{"name": "Vozíme chlazené", "id": "300124505", "url": "/c300124505-vozime-chlazene"},
|
||||
{"name": "Dětské nápoje", "id": "300122626", "url": "/c300122626-detske-napoje"},
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Speciální výživa", "id": "300112393", "url": "/c300112393-specialni-vyziva",
|
||||
"children": [
|
||||
{"name": "Bezlaktózové výrobky", "id": "300112395", "url": "/c300112395-bezlaktozove-vyrobky"},
|
||||
{"name": "Fitness výživa", "id": "300115595", "url": "/c300115595-fitness-vyziva"},
|
||||
{"name": "Bezlepkové výrobky", "id": "300112399", "url": "/c300112399-bezlepkove-vyrobky"},
|
||||
{"name": "Potraviny se sníženým obsahem cukru", "id": "300112403", "url": "/c300112403-potraviny-se-snizenym-obsahem-cukru"},
|
||||
{"name": "Veggie", "id": "300112843", "url": "/c300112843-veggie"},
|
||||
{"name": "Alternativní strava", "id": "300124517", "url": "/c300124517-alternativni-strava"},
|
||||
{"name": "Doplňky stravy", "id": "300115585", "url": "/c300115585-doplnky-stravy"},
|
||||
{"name": "Nízkobílkovinové výrobky", "id": "300122547", "url": "/c300122547-nizkobilkovinove-vyrobky"},
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Kosmetika", "id": "300124206", "url": "/c300124206-kosmetika",
|
||||
"children": [
|
||||
{"name": "Vlasová péče", "id": "300124300", "url": "/c300124300-vlasova-pece"},
|
||||
{"name": "Ústní hygiena", "id": "300124215", "url": "/c300124215-ustni-hygiena"},
|
||||
{"name": "Pleťová péče a dekorativní kosmetika", "id": "300124327", "url": "/c300124327-pletova-pece-a-dekorativni-kosmetika"},
|
||||
{"name": "Dámská hygiena", "id": "300124245", "url": "/c300124245-damska-hygiena"},
|
||||
{"name": "Sprchové gely a mýdla a přísady do koupele", "id": "300124259", "url": "/c300124259-sprchove-gely-a-mydla-a-prisady-do-koupele"},
|
||||
{"name": "Tělová péče", "id": "300124320", "url": "/c300124320-telova-pece"},
|
||||
{"name": "Holení a depilace", "id": "300124337", "url": "/c300124337-holeni-a-depilace"},
|
||||
{"name": "Deodoranty a parfémy", "id": "300124281", "url": "/c300124281-deodoranty-a-parfemy"},
|
||||
{"name": "Pánský svět", "id": "300124350", "url": "/c300124350-pansky-svet"},
|
||||
{"name": "Pro intimní chvíle", "id": "300124361", "url": "/c300124361-pro-intimni-chvile"},
|
||||
{"name": "Kosmetické dárkové balíčky", "id": "300124366", "url": "/c300124366-kosmeticke-darkove-balicky"},
|
||||
{"name": "Cestovní balení a produkty na léto", "id": "300124367", "url": "/c300124367-cestovni-baleni-a-produkty-na-leto"},
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Drogerie", "id": "300109000", "url": "/c300109000-drogerie",
|
||||
"children": [
|
||||
{"name": "Papírová a vatová hygiena", "id": "300109010", "url": "/c300109010-papirova-a-vatova-hygiena"},
|
||||
{"name": "Prací prostředky", "id": "300109001", "url": "/c300109001-praci-prostredky"},
|
||||
{"name": "Mytí nádobí", "id": "300109042", "url": "/c300109042-myti-nadobi"},
|
||||
{"name": "Čisticí prostředky", "id": "300109028", "url": "/c300109028-cistici-prostredky"},
|
||||
{"name": "Ekologická a šetrná drogerie", "id": "300124564", "url": "/c300124564-ekologicka-a-setrna-drogerie"},
|
||||
{"name": "Dětské praní", "id": "300124266", "url": "/c300124266-detske-prani"},
|
||||
{"name": "Velká balení", "id": "300124287", "url": "/c300124287-velka-baleni"},
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Domácnost a zahrada", "id": "300111000", "url": "/c300111000-domacnost-a-zahrada",
|
||||
"children": [
|
||||
{"name": "Úklidové potřeby", "id": "300124615", "url": "/c300124615-uklidove-potreby"},
|
||||
{"name": "Dům, byt a garáž", "id": "300123083", "url": "/c300123083-dum-byt-a-garaz"},
|
||||
{"name": "Kuchyňské potřeby", "id": "300124614", "url": "/c300124614-kuchynske-potreby"},
|
||||
{"name": "Zahrada a květiny", "id": "300111043", "url": "/c300111043-zahrada-a-kvetiny"},
|
||||
{"name": "Kancelář a papírnictví", "id": "300111034", "url": "/c300111034-kancelar-a-papirnictvi"},
|
||||
{"name": "Dekorace, svíčky a vůně", "id": "300124685", "url": "/c300124685-dekorace-svicky-a-vune"},
|
||||
{"name": "Trafika", "id": "300111091", "url": "/c300111091-trafika"},
|
||||
{"name": "Oslavy a párty", "id": "300124678", "url": "/c300124678-oslavy-a-party"},
|
||||
{"name": "Punčocháče a ponožky", "id": "300124722", "url": "/c300124722-puncochace-a-ponozky"},
|
||||
{"name": "Knihy", "id": "300124157", "url": "/c300124157-knihy"},
|
||||
]
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
def get_leaf_categories(tree: list[dict] | None = None, parent_path: list[str] | None = None) -> list[dict]:
|
||||
if tree is None:
|
||||
tree = CATEGORY_TREE
|
||||
if parent_path is None:
|
||||
parent_path = []
|
||||
|
||||
leaves = []
|
||||
for cat in tree:
|
||||
current_path = parent_path + [cat["name"]]
|
||||
children = cat.get("children")
|
||||
if children:
|
||||
leaves.extend(get_leaf_categories(children, current_path))
|
||||
else:
|
||||
leaves.append({
|
||||
"id": cat["id"],
|
||||
"name": cat["name"],
|
||||
"url": cat["url"],
|
||||
"path": current_path,
|
||||
"parent_id": tree[0].get("id") if parent_path else None,
|
||||
})
|
||||
return leaves
|
||||
|
||||
|
||||
def get_all_categories_flat(tree: list[dict] | None = None, parent_id: str | None = None) -> list[dict]:
|
||||
if tree is None:
|
||||
tree = CATEGORY_TREE
|
||||
|
||||
result = []
|
||||
for cat in tree:
|
||||
result.append({
|
||||
"category_id": cat["id"],
|
||||
"name": cat["name"],
|
||||
"url": cat["url"],
|
||||
"parent_id": parent_id,
|
||||
"has_children": bool(cat.get("children")),
|
||||
})
|
||||
if cat.get("children"):
|
||||
result.extend(get_all_categories_flat(cat["children"], parent_id=cat["id"]))
|
||||
return result
|
||||
@@ -0,0 +1,208 @@
|
||||
"""
|
||||
Scrape the live Rohlik.cz category tree (main categories + subcategories)
|
||||
via the navigation API and save it as JSON.
|
||||
|
||||
Endpoints:
|
||||
GET /api/v5/navigation/components/navigation-tabs/categories
|
||||
GET /api/v4/navigation/components/navigation-tabs/subcategories?categoryIds=ID1,ID2,...
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from playwright.sync_api import sync_playwright
|
||||
from config import BASE_URL
|
||||
from test_login import ensure_logged_in
|
||||
|
||||
# The scraped tree is written next to this script.
OUT_PATH = Path(__file__).parent.joinpath("categories_live.json")

# Both navigation endpoints share this path segment; they differ only in the
# API version and the final component.
_NAV_TABS_PATH = "navigation/components/navigation-tabs"
MAIN_URL = f"{BASE_URL}/api/v5/{_NAV_TABS_PATH}/categories"
SUB_URL = f"{BASE_URL}/api/v4/{_NAV_TABS_PATH}/subcategories"
|
||||
|
||||
|
||||
def fetch_json(context, url, **params):
    """GET ``url`` through ``context.request`` and return the decoded JSON body.

    Keyword arguments are sent as query parameters; when there are none,
    ``params=None`` is passed to the request.

    Raises:
        RuntimeError: if the response status is anything other than 200. The
            message carries the status and the first 200 characters of the body.
    """
    query = params if params else None
    response = context.request.get(url, params=query)
    status = response.status
    if status == 200:
        return response.json()
    raise RuntimeError(f"GET {url} -> {status}: {response.text()[:200]}")
|
||||
|
||||
|
||||
def normalize_main(payload):
    """Find the list of main categories inside a navigation API payload.

    The API wraps the list in different shapes, so several are tried:

    * the payload itself is the list;
    * the list sits under one of ``data``, ``categories``, ``items``,
      ``navigationTabs`` or ``tabs``;
    * one of those keys holds a dict that carries the list under
      ``categories``, ``items`` or ``tabs``.

    Returns:
        The first list found, or ``[]`` when nothing matches, including when
        ``payload`` is neither a list nor a dict.
    """
    if isinstance(payload, list):
        return payload
    if not isinstance(payload, dict):
        # None, strings, numbers: nothing to unwrap, and calling .get() on
        # them would raise AttributeError.
        return []
    for key in ("data", "categories", "items", "navigationTabs", "tabs"):
        value = payload.get(key)
        if isinstance(value, list):
            return value
        if isinstance(value, dict):
            for inner_key in ("categories", "items", "tabs"):
                if isinstance(value.get(inner_key), list):
                    return value[inner_key]
    return []
|
||||
|
||||
|
||||
def pick(d, *keys):
    """Return the value of the first key in ``keys`` whose value is not None.

    Yields ``None`` when ``d`` is not a dict or when no key qualifies.
    """
    if not isinstance(d, dict):
        return None
    return next((d[key] for key in keys if d.get(key) is not None), None)
|
||||
|
||||
|
||||
def _subcat_list(value):
    """Return the subcategory list carried by ``value``, or None if it has none.

    ``value`` may be the list itself or a dict holding it under one of
    ``subcategories``, ``children``, ``items`` or ``categories``.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, dict):
        for key in ("subcategories", "children", "items", "categories"):
            if isinstance(value.get(key), list):
                return value[key]
    return None


def _subcats_from_entries(entries, pid):
    """Search a list of ``{parentId/categoryId/id: ..., children: [...]}`` dicts.

    Returns the child list of the first entry that is owned by ``pid`` and
    actually carries a list, or None when there is no such entry.
    """
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        owner = next(
            (entry[k] for k in ("parentId", "categoryId", "id") if entry.get(k) is not None),
            None,
        )
        if str(owner) == pid:
            found = _subcat_list(entry)
            if found is not None:
                return found
    return None


def find_subcats_for(payload, parent_id):
    """Try to locate the subcategories list for a given parent id in the response.

    Shapes that are recognised, in this order:

    1. ``payload`` is a list of entries, each naming its owner via
       ``parentId`` / ``categoryId`` / ``id``;
    2. ``payload`` is a dict keyed by the parent id;
    3. either of the above wrapped under ``data``, ``subcategories``,
       ``categories`` or ``items``.

    In every case the value for the parent may be the list itself or a dict
    carrying it under ``subcategories``, ``children``, ``items`` or
    ``categories``.

    Returns:
        The list that was found, or ``[]`` when nothing matches.
    """
    pid = str(parent_id)

    if isinstance(payload, list):
        found = _subcats_from_entries(payload, pid)
        return [] if found is None else found
    if not isinstance(payload, dict):
        return []

    # Dict keyed directly by the parent id.
    found = _subcat_list(payload.get(pid))
    if found is not None:
        return found

    # Same shapes, one level down under a wrapper key.
    for wrap in ("data", "subcategories", "categories", "items"):
        sub = payload.get(wrap)
        if isinstance(sub, dict):
            found = _subcat_list(sub.get(pid))
        elif isinstance(sub, list):
            found = _subcats_from_entries(sub, pid)
        else:
            found = None
        if found is not None:
            return found
    return []
|
||||
|
||||
|
||||
def build_clean_tree(main_payload, sub_payload):
    """Combine both API payloads into a two-level category tree.

    Returns a list of ``{id, name, url, children}`` dicts, one per main
    category found by ``normalize_main``. Each child is an ``{id, name, url}``
    dict built from the entries ``find_subcats_for`` returns for that main
    category; entries that are not dicts are skipped.
    """
    tree = []
    for main_cat in normalize_main(main_payload):
        main_id = pick(main_cat, "id", "categoryId")
        tree.append({
            "id": main_id,
            "name": pick(main_cat, "name", "title", "label"),
            "url": pick(main_cat, "url", "slug", "link"),
            "children": [
                {
                    "id": pick(child, "id", "categoryId"),
                    "name": pick(child, "name", "title", "label"),
                    "url": pick(child, "url", "slug", "link"),
                }
                for child in find_subcats_for(sub_payload, main_id)
                if isinstance(child, dict)
            ],
        })
    return tree
|
||||
|
||||
|
||||
def subs_from_payload(payload):
    """Unwrap the subcategory list from a subcategories API payload.

    A list payload is returned as is. For a dict, the first of ``data``,
    ``subcategories``, ``items``, ``categories`` that holds a list wins.
    Anything else gives ``[]``.
    """
    if isinstance(payload, list):
        return payload
    if not isinstance(payload, dict):
        return []
    candidates = (payload.get(key) for key in ("data", "subcategories", "items", "categories"))
    return next((value for value in candidates if isinstance(value, list)), [])
|
||||
|
||||
|
||||
def fetch_children_recursive(context, parent_id, visited, depth=1, max_depth=6):
    """Fetch the subcategory subtree below ``parent_id``.

    Args:
        context: Object passed through to ``fetch_json`` for the request.
        parent_id: Category whose subcategories are requested.
        visited: Set of stringified ids already fetched. It is updated in
            place, and an id that is already in it is not fetched again.
        depth: Current recursion level, starting at 1.
        max_depth: Levels beyond this are not fetched.

    Returns:
        A list of ``{id, name, url, children}`` nodes. It is empty when
        ``parent_id`` was already visited or ``depth`` exceeds ``max_depth``.
    """
    key = str(parent_id)
    if depth > max_depth or key in visited:
        return []
    visited.add(key)

    payload = fetch_json(context, SUB_URL, categoryIds=key)

    nodes = []
    for item in subs_from_payload(payload):
        if not isinstance(item, dict):
            continue
        item_id = pick(item, "id", "categoryId")
        # Only recurse if the item itself advertises children.
        if item_id and item.get("subcategoryIds"):
            kids = fetch_children_recursive(context, item_id, visited, depth + 1, max_depth)
        else:
            kids = []
        nodes.append({
            "id": item_id,
            "name": pick(item, "name", "title", "label"),
            "url": pick(item, "url", "link", "slug"),
            "children": kids,
        })
    return nodes
|
||||
|
||||
|
||||
def print_tree(nodes, indent=0):
    """Print ``nodes`` as a bulleted outline, one extra pad per nesting level."""
    pad = ' ' * indent
    for node in nodes:
        print(f"{pad}- {node['name']} (id={node['id']})")
        kids = node.get("children")
        if kids:
            print_tree(kids, indent + 1)
|
||||
|
||||
|
||||
def count_nodes(nodes):
    """Count the nodes in ``nodes`` together with all of their descendants."""
    return len(nodes) + sum(count_nodes(node.get("children", [])) for node in nodes)
|
||||
|
||||
|
||||
def main():
    """Scrape the live category tree and save it to ``OUT_PATH`` as JSON.

    Steps:
      1. obtain a logged-in context from ``ensure_logged_in``;
      2. fetch the main categories from ``MAIN_URL``;
      3. fetch each main category's subtree via ``fetch_children_recursive``;
      4. print the tree and write ``{"tree": ..., "raw_main": ...}`` to disk.

    The browser is closed even when one of the steps raises.
    """
    with sync_playwright() as pw:
        # The page returned alongside the context is not needed here.
        context, _page = ensure_logged_in(pw)
        try:
            print("\nFetching main categories ...")
            main_payload = fetch_json(context, MAIN_URL)
            main_cats = normalize_main(main_payload)
            print(f" Got {len(main_cats)} main categories")

            clean_tree = []
            # Shared across all main categories, so an id is fetched only once.
            visited = set()

            print("\nFetching subcategories recursively ...")
            for cat in main_cats:
                cid = pick(cat, "id", "categoryId")
                cname = pick(cat, "name", "title", "label")
                curl = pick(cat, "url", "link", "slug")
                if not cid:
                    continue

                children = fetch_children_recursive(context, cid, visited)
                clean_tree.append({
                    "id": cid,
                    "name": cname,
                    "url": curl,
                    "children": children,
                })
                total = count_nodes(children)
                print(f" - {cname} (id={cid}) -> {len(children)} direct, {total} total descendants")

            print("\nFull category tree:")
            print_tree(clean_tree)

            grand_total = count_nodes(clean_tree)
            print(f"\nTotal nodes (incl. main): {grand_total}")

            # The raw main payload is kept next to the cleaned tree.
            tree = {
                "tree": clean_tree,
                "raw_main": main_payload,
            }

            OUT_PATH.write_text(json.dumps(tree, ensure_ascii=False, indent=2), encoding="utf-8")
            print(f"Saved -> {OUT_PATH} ({OUT_PATH.stat().st_size} bytes)")
        finally:
            # NOTE(review): assumes context.browser is not None, as the
            # original call did — confirm against ensure_logged_in.
            context.browser.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,202 @@
|
||||
"""
|
||||
Open the first leaf subcategory (the first childless node found depth-first) from categories_live.json
|
||||
and list all products in it via the same API calls the website itself makes.
|
||||
|
||||
Flow that mimics the real frontend:
|
||||
1. GET /api/v1/categories/normal/{categoryId}/products?page=N
|
||||
-> { productIds: [...] }
|
||||
2. For each chunk of IDs, call 5 batch endpoints in the same way the site does:
|
||||
/api/v1/products
|
||||
/api/v1/products/prices
|
||||
/api/v1/products/stock
|
||||
/api/v1/products/categories
|
||||
/api/v1/products/user-data
|
||||
All use repeated query params: ?products=ID1&products=ID2&...
|
||||
3. Merge results per productId into one record.
|
||||
"""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from playwright.sync_api import sync_playwright
|
||||
from config import BASE_URL
|
||||
from test_login import ensure_logged_in
|
||||
|
||||
# Category tree file, expected next to this script.
TREE_PATH = Path(__file__).parent.joinpath("categories_live.json")

PAGE_SIZE = 50  # value sent as `size` in the listing request
CHUNK = 30  # how many IDs per batch request

# Endpoints that the frontend calls in parallel for each set of product IDs.
# Each key becomes the field name of that endpoint's data in a merged record.
_PRODUCTS_API = "/api/v1/products"
PRODUCT_BATCH_ENDPOINTS = {
    "base": _PRODUCTS_API,
    "prices": f"{_PRODUCTS_API}/prices",
    "stock": f"{_PRODUCTS_API}/stock",
    "categories": f"{_PRODUCTS_API}/categories",
    "user_data": f"{_PRODUCTS_API}/user-data",
}
|
||||
|
||||
|
||||
def find_first_leaf(nodes, path=None):
    """Return the first childless node met in a depth-first walk of ``nodes``.

    Returns:
        ``(names, node)``, where ``names`` lists the node names from the top
        level down to the leaf, or ``None`` when ``nodes`` is empty.
    """
    trail = [] if path is None else path
    for node in nodes:
        here = trail + [node["name"]]
        kids = node.get("children")
        if not kids:
            # A missing, None or empty "children" all mark a leaf.
            return here, node
        found = find_first_leaf(kids, here)
        if found is not None:
            return found
    return None
|
||||
|
||||
|
||||
def get_json(context, url):
    """GET ``url`` through ``context.request`` and return the decoded JSON body.

    Raises:
        RuntimeError: if the status is not 200. The message shows the first
            120 characters of the URL, the status and the first 200
            characters of the response text.
    """
    response = context.request.get(url)
    status = response.status
    if status == 200:
        return response.json()
    raise RuntimeError(f"GET {url[:120]}... -> {status}: {response.text()[:200]}")
|
||||
|
||||
|
||||
def fetch_products_page(context, category_id, page):
    """Request one page of the product listing for ``category_id``.

    The page size is ``PAGE_SIZE`` and the sort order is ``recommended``;
    ``filter`` and ``excludeProductIds`` are sent empty. Returns the decoded
    JSON response.
    """
    endpoint = f"{BASE_URL}/api/v1/categories/normal/{category_id}/products"
    query = f"page={page}&size={PAGE_SIZE}&sort=recommended&filter=&excludeProductIds="
    return get_json(context, f"{endpoint}?{query}")
|
||||
|
||||
|
||||
def fetch_batch(context, path, product_ids):
    """Call a batch endpoint with one repeated ``products=ID`` parameter per id.

    ``path`` is appended to ``BASE_URL``. Returns the decoded JSON response.
    """
    pairs = [f"products={pid}" for pid in product_ids]
    return get_json(context, f"{BASE_URL}{path}?" + "&".join(pairs))
|
||||
|
||||
|
||||
def as_list(payload):
    """Return the list held by a batch-endpoint payload.

    A list is returned unchanged. For a dict, the first of ``data``,
    ``products``, ``items`` holding a list is returned. Everything else
    gives ``[]``.
    """
    if isinstance(payload, list):
        return payload
    wrapper = payload if isinstance(payload, dict) else {}
    for key in ("data", "products", "items"):
        candidate = wrapper.get(key)
        if isinstance(candidate, list):
            return candidate
    return []
|
||||
|
||||
|
||||
def index_by_id(items):
    """Map ``int(productId or id)`` to the item it came from.

    Entries that are not dicts, or that have neither id, are skipped. When
    two items share an id, the later one wins.
    """
    indexed = {}
    for item in items:
        if isinstance(item, dict):
            raw_id = item.get("productId") or item.get("id")
            if raw_id is not None:
                indexed[int(raw_id)] = item
    return indexed
|
||||
|
||||
|
||||
def fetch_merged_products(context, product_ids):
    """Query every endpoint in ``PRODUCT_BATCH_ENDPOINTS`` and merge per product.

    Returns one record per entry of ``product_ids``, in the same order. Each
    record holds ``productId`` plus one field per endpoint key for which that
    endpoint returned data about the product.
    """
    by_source = {}
    for key, path in PRODUCT_BATCH_ENDPOINTS.items():
        by_source[key] = index_by_id(as_list(fetch_batch(context, path, product_ids)))

    merged = []
    for pid in product_ids:
        lookup = int(pid)  # index_by_id keys are ints
        record = {"productId": pid}
        record.update(
            (key, table[lookup])
            for key, table in by_source.items()
            if table.get(lookup) is not None
        )
        merged.append(record)
    return merged
|
||||
|
||||
|
||||
def _collect_product_ids(context, category_id):
    """Page through the category listing and return all product IDs in order."""
    all_ids = []
    page_num = 0
    while True:
        print(f"Listing page {page_num} ...")
        payload = fetch_products_page(context, category_id, page_num)
        ids = payload.get("productIds") or []
        print(f" got {len(ids)} product IDs")
        if not ids:
            break
        all_ids.extend(ids)
        # A page shorter than PAGE_SIZE is taken to be the last one.
        if len(ids) < PAGE_SIZE:
            break
        page_num += 1
    return all_ids


def _format_product_row(p):
    """Render one merged product record as a line of the readable listing."""
    base = p.get("base") or {}
    prices = p.get("prices") or {}
    stock = p.get("stock") or {}

    name = base.get("name") or "?"
    unit = base.get("unit") or ""
    textual = base.get("textualAmount") or ""

    price = (prices.get("price") or {}).get("amount")
    ppu = (prices.get("pricePerUnit") or {}).get("amount")

    # Only the first entry of "sales" is shown.
    sale_price = None
    sale_badge = ""
    sales = prices.get("sales") or []
    if sales:
        first = sales[0]
        sale_price = (first.get("price") or {}).get("amount")
        badges = first.get("badges") or []
        if badges:
            sale_badge = badges[0].get("title") or first.get("type") or ""
        else:
            sale_badge = first.get("type") or ""

    # Three states: in stock, explicitly out of stock, unknown.
    in_stock = stock.get("inStock")
    stock_str = "ano" if in_stock else ("ne" if in_stock is False else "?")

    price_str = f"{price:.2f}" if isinstance(price, (int, float)) else ""
    ppu_str = f"{ppu:.2f}/{unit}" if isinstance(ppu, (int, float)) else ""
    sale_str = f"{sale_price:.2f} {sale_badge}".strip() if isinstance(sale_price, (int, float)) else ""

    return f"{p['productId']:>9} {stock_str:<8} {price_str:>10} {ppu_str:>11} {sale_str:>10} {name} ({textual})"


def main():
    """List every product of the first leaf category and save the result.

    Reads the tree from ``TREE_PATH``, picks the leaf returned by
    ``find_first_leaf``, collects its product IDs page by page, fetches the
    merged records in chunks of ``CHUNK`` IDs, prints a sample and a table,
    and writes all records to ``products_<leaf id>.json`` next to this script.

    Raises:
        SystemExit: if ``TREE_PATH`` is missing or the tree has no leaf.
    """
    if not TREE_PATH.exists():
        raise SystemExit(f"Missing {TREE_PATH} — run scrape_categories.py first.")

    data = json.loads(TREE_PATH.read_text(encoding="utf-8"))
    found = find_first_leaf(data["tree"])
    if found is None:
        # An empty tree yields None; unpacking it would raise a TypeError.
        raise SystemExit(f"No leaf category found in {TREE_PATH}.")
    path, leaf = found
    print(f"First leaf: {' > '.join(path)} (id={leaf['id']})")
    print(f"URL: {BASE_URL}{leaf['url']}\n")

    with sync_playwright() as pw:
        # The page returned alongside the context is not needed here.
        context, _page = ensure_logged_in(pw)
        try:
            # Step 1: collect all product IDs across pages
            all_ids = _collect_product_ids(context, leaf["id"])
            print(f"\nTotal IDs: {len(all_ids)}")
            if not all_ids:
                return

            # Step 2: per chunk, hit the batch endpoints the frontend uses and merge
            all_products = []
            for i in range(0, len(all_ids), CHUNK):
                chunk = all_ids[i:i + CHUNK]
                print(f"Batch fetch for IDs {i}..{i + len(chunk) - 1} ({len(chunk)} items) ...")
                all_products.extend(fetch_merged_products(context, chunk))

            print(f"\nTotal products: {len(all_products)}\n")

            # Show one merged record so we see real field shapes
            if all_products:
                print("--- Sample merged product (first item, truncated) ---")
                print(json.dumps(all_products[0], ensure_ascii=False, indent=2)[:2500])
                print("--- end sample ---\n")

            # Simple human-readable listing
            print(f"{'ID':>9} {'Skladem':<8} {'Cena':>10} {'Za jedn.':>11} {'Akce':>10} Název (balení)")
            print("-" * 100)
            for p in all_products:
                print(_format_product_row(p))

            out_path = Path(__file__).parent / f"products_{leaf['id']}.json"
            out_path.write_text(json.dumps(all_products, ensure_ascii=False, indent=2), encoding="utf-8")
            print(f"\nSaved -> {out_path} ({out_path.stat().st_size} bytes)")
        finally:
            # Runs on the early return and on errors as well.
            # NOTE(review): assumes context.browser is not None, as the
            # original calls did — confirm against ensure_logged_in.
            context.browser.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,93 @@
|
||||
"""
|
||||
Test DB layer: load products_300102013.json (already scraped data)
|
||||
and upsert into MongoDB 'rohlik' database.
|
||||
|
||||
No scraping needed — just validates the db.py functions work
|
||||
with real API response shapes.
|
||||
"""
|
||||
|
||||
import json
|
||||
import sys
|
||||
import io
|
||||
from pathlib import Path
|
||||
from db import get_db, ensure_indexes, upsert_products, upsert_category
|
||||
|
||||
# Force UTF-8 output so names with diacritics print on consoles whose default
# encoding cannot represent them; unencodable characters are replaced.
# reconfigure() changes the existing stream in place; wrapping
# sys.stdout.buffer in a second TextIOWrapper is kept only as a fallback for
# stream objects that lack reconfigure().
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
else:
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")

# Previously scraped products for category 300102013, next to this script.
DATA_FILE = Path(__file__).parent / "products_300102013.json"
|
||||
|
||||
|
||||
def main():
    """Exercise the DB layer with already-scraped data.

    Connects via ``get_db``, ensures the indexes, upserts one category,
    loads the merged product records from ``DATA_FILE``, upserts them, and
    prints collection counts plus one sample product and its price-history
    entry.
    """
    db = get_db()
    print(f"Connected to: {db.client.address} / {db.name}")

    ensure_indexes(db)
    print("Indexes created.\n")

    # --- test category upsert ---
    category_doc = {
        "_id": 300102013,
        "name": "Okurky, cukety a lilky",
        "slug": "okurky-cukety-a-lilky",
        "path": [300102000, 300102008, 300102013],
        "pathNames": ["Ovoce a zelenina", "Zelenina", "Okurky, cukety a lilky"],
        "parentId": 300102008,
        "isLeaf": True,
    }
    upsert_category(db, category_doc)
    print("Category 300102013 upserted.")

    # --- load scraped products ---
    products = json.loads(DATA_FILE.read_text(encoding="utf-8"))
    print(f"Loaded {len(products)} products from {DATA_FILE.name}\n")

    # Split merged records back into the 4 parallel lists passed to
    # upsert_products; a part missing from a record becomes an empty dict.
    bases, prices_list, stocks, categories_list = (
        [record.get(part, {}) for record in products]
        for part in ("base", "prices", "stock", "categories")
    )

    upsert_products(db, bases, prices_list, stocks, categories_list)
    print(f"Upserted {len(bases)} products.\n")

    # --- verify ---
    product_count = db.products.count_documents({})
    history_count = db.price_history.count_documents({})
    category_count = db.categories.count_documents({})

    print("DB counts:")
    print(f" products: {product_count}")
    print(f" price_history: {history_count}")
    print(f" categories: {category_count}")

    # show one sample
    sample_id = 1407650
    sample = db.products.find_one({"_id": sample_id})
    if sample:
        print(f"\nSample product: {sample['name']}")
        print(f" price: {sample['currentPrice']} {sample['currency']}")
        print(f" per unit: {sample['currentPricePerUnit']}/{sample.get('unit', '?')}")
        print(f" inStock: {sample['inStock']}")
        print(f" sale: {sample['sale']}")
        print(f" badges: {[b['title'] for b in sample.get('badges', [])]}")

    # show price_history entry
    hist = db.price_history.find_one({"productId": sample_id})
    if hist:
        print(f"\n price_history record: price={hist['price']}, "
              f"inStock={hist['inStock']}, scrapedAt={hist['scrapedAt']}")

    print("\nDone.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user