notebookVB
This commit is contained in:
@@ -1,15 +1,6 @@
|
||||
"""
|
||||
Rohlik.cz Price Scraper - Database Operations
|
||||
Version: 1.0.0
|
||||
Date: 2026-05-31
|
||||
|
||||
MongoDB operations for the Rohlik.cz price scraper.
|
||||
Collections: products, price_history, categories, scrape_runs.
|
||||
MongoDB server: 192.168.1.76 (no authentication).
|
||||
"""
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from pymongo import MongoClient, ASCENDING
|
||||
|
||||
from pymongo import MongoClient, ASCENDING, DESCENDING, TEXT
|
||||
from config import MONGO_URI, MONGO_DB
|
||||
|
||||
|
||||
@@ -19,70 +10,116 @@ def get_db():
|
||||
|
||||
|
||||
def ensure_indexes(db):
    """Create the secondary indexes for the scraper's collections.

    Indexed field names follow the camelCase keys that this module writes
    (``mainCategoryId``, ``archived``, ``lastSeen``, ``name``, ``productId``,
    ``scrapedAt``, ``startedAt``).

    No separate unique id index is created: products and categories are
    upserted by ``_id``, which MongoDB always indexes uniquely. A unique
    index on a field the documents do not carry would treat every document
    as having a null value there and reject the second insert.

    Args:
        db: Database handle exposing the ``categories``, ``products``,
            ``price_history`` and ``scrape_runs`` collections.
    """
    # NOTE(review): parentId / isLeaf are assumed to be keys of the category
    # documents passed to upsert_category -- confirm against the caller.
    db.categories.create_index("parentId")
    db.categories.create_index("isLeaf")

    db.products.create_index("mainCategoryId")
    db.products.create_index([("archived", ASCENDING), ("lastSeen", DESCENDING)])
    db.products.create_index([("name", TEXT)])

    # Per-product history, newest first, plus a global newest-first index.
    db.price_history.create_index([("productId", ASCENDING), ("scrapedAt", DESCENDING)])
    db.price_history.create_index([("scrapedAt", DESCENDING)])

    db.scrape_runs.create_index([("startedAt", DESCENDING)])
|
||||
|
||||
|
||||
def upsert_category(db, cat: dict):
    """Insert or update one category document, keyed by its ``_id``.

    Every key of ``cat`` is written with ``$set``, so keys already stored on
    the document but absent from ``cat`` are left untouched.

    Args:
        db: Database handle exposing a ``categories`` collection.
        cat: Category document; must contain ``_id``.

    Raises:
        KeyError: If ``cat`` has no ``_id`` key.
    """
    db.categories.update_one(
        {"_id": cat["_id"]},
        {"$set": cat},
        upsert=True,
    )
|
||||
|
||||
|
||||
def upsert_categories(db, cats: list[dict]):
    """Upsert each category in ``cats``, one ``upsert_category`` call per item.

    Args:
        db: Database handle, passed through to ``upsert_category``.
        cats: Category documents to store.
    """
    for cat in cats:
        upsert_category(db, cat)
|
||||
|
||||
|
||||
def _nested_field(mapping: dict, key: str, field: str):
    """Return ``mapping[key][field]``, or None if either level is missing or null."""
    return (mapping.get(key) or {}).get(field)


def upsert_product(db, base: dict, prices: dict, stock: dict, categories: list[dict]):
    """Store the current state of one product and append a price-history row.

    The product document is upserted into ``db.products`` keyed by
    ``_id = base["id"]``; ``firstSeen`` is written only on insert. A snapshot
    of price, stock flag and sale is then inserted into ``db.price_history``.
    Both writes share the same UTC timestamp.

    Args:
        db: Database handle exposing ``products`` and ``price_history``.
        base: Product payload; ``id`` and ``name`` are required, all other
            keys are optional.
        prices: Price payload; ``prices["price"]["amount"]`` is required.
            Only the first entry of the optional ``sales`` list is used.
        stock: Stock payload; every key is optional.
        categories: Category dicts with ``id`` and ``name`` (``level``
            defaults to 0). May be empty or None.

    Raises:
        KeyError: If a required key listed above is missing.
    """
    now = datetime.now(timezone.utc)
    product_id = base["id"]

    sale = None
    sales = prices.get("sales", [])
    if sales:
        first_sale = sales[0]
        sale = {
            "type": first_sale.get("type"),
            "price": first_sale["price"]["amount"],
            "pricePerUnit": _nested_field(first_sale, "pricePerUnit", "amount"),
            # Tolerate a missing, null or empty badge list.
            "badge": (first_sale.get("badges") or [{}])[0].get("title"),
            "validTill": first_sale.get("validTill"),
        }

    categories = categories or []
    price_per_unit = _nested_field(prices, "pricePerUnit", "amount")

    doc = {
        "name": base["name"],
        "slug": base.get("slug"),
        "brand": base.get("brand"),
        "unit": base.get("unit"),
        "textualAmount": base.get("textualAmount"),
        "weightedItem": base.get("weightedItem", False),
        "mainCategoryId": base.get("mainCategoryId"),
        "categoryPath": [c["id"] for c in categories],
        "allCategories": [
            {"id": c["id"], "name": c["name"], "level": c.get("level", 0)}
            for c in categories
        ],
        "countryCode": base.get("flag"),
        "images": base.get("images", []),
        "badges": base.get("badges", []),
        "archived": base.get("archived", False),
        "premiumOnly": base.get("premiumOnly", False),
        "currentPrice": prices["price"]["amount"],
        "currentPricePerUnit": price_per_unit,
        "currency": prices["price"].get("currency", "CZK"),
        "sale": sale,
        "inStock": stock.get("inStock", False),
        "maxBasketAmount": stock.get("maxBasketAmount", 0),
        "packageAmount": _nested_field(stock, "packageInfo", "amount"),
        "packageUnit": _nested_field(stock, "packageInfo", "unit"),
        "warehouseId": stock.get("warehouseId"),
        "lastSeen": now,
        "lastScrapedAt": now,
    }

    db.products.update_one(
        {"_id": product_id},
        {
            "$set": doc,
            # firstSeen must survive later updates, so it is set on insert only.
            "$setOnInsert": {"firstSeen": now},
        },
        upsert=True,
    )

    db.price_history.insert_one({
        "productId": product_id,
        "scrapedAt": now,
        "price": prices["price"]["amount"],
        "pricePerUnit": price_per_unit,
        "inStock": stock.get("inStock", False),
        "sale": sale,
    })
|
||||
|
||||
|
||||
def upsert_products(db, bases: list, prices_list: list, stocks: list, categories_list: list):
    """Upsert a batch of products, joining the per-product payloads by id.

    Entries of ``prices_list``, ``stocks`` and ``categories_list`` are matched
    to each item of ``bases`` through their ``productId`` key; if a product id
    occurs more than once in one of those lists, the last entry wins.

    Args:
        db: Database handle, passed through to ``upsert_product``.
        bases: Product payloads, each with an ``id`` key.
        prices_list: Price payloads, each with a ``productId`` key.
        stocks: Stock payloads, each with a ``productId`` key.
        categories_list: Dicts with ``productId`` and an optional
            ``categories`` list.

    Raises:
        KeyError: If an entry lacks ``productId`` (or a base lacks ``id``).
    """
    prices_map = {p["productId"]: p for p in prices_list}
    stock_map = {s["productId"]: s for s in stocks}
    cats_map = {c["productId"]: c.get("categories", []) for c in categories_list}

    for base in bases:
        pid = base["id"]
        upsert_product(
            db,
            base,
            # No price entry for this product: fall back to a zero amount so
            # the required prices["price"]["amount"] lookup still succeeds.
            prices_map.get(pid, {"price": {"amount": 0}}),
            stock_map.get(pid, {}),
            cats_map.get(pid, []),
        )
|
||||
|
||||
|
||||
def log_scrape_run(db, run_data: dict):
    """Record one scrape run in the ``scrape_runs`` collection.

    ``run_data`` is modified in place: if it has no ``startedAt`` key, the
    current UTC time is stored under that key before the insert. An existing
    ``startedAt`` value is kept as is.

    Args:
        db: Database handle exposing a ``scrape_runs`` collection.
        run_data: Run document to insert.
    """
    run_data.setdefault("startedAt", datetime.now(timezone.utc))
    db.scrape_runs.insert_one(run_data)
|
||||
|
||||
Reference in New Issue
Block a user