Files
2026-06-01 12:17:41 +02:00

260 lines
8.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Download medical procedures (zdravotní výkony) from szv.mzd.gov.cz and store
them in MongoDB.

Collections:
    vykony          -- current state of each procedure (_aktivni, _platny_od)
    vykony_historie -- archive of every change (_platny_do, _zmenena_pole)

Requirements:
    pip install requests beautifulsoup4 pymongo lxml
"""
import logging
import math
import re
from datetime import datetime, timezone

import requests
from bs4 import BeautifulSoup
from pymongo import InsertOne, MongoClient, UpdateOne
# Root logging configuration and the module-level logger used by every function.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# MongoDB connection string, database name and the two collection names.
MONGO_URI = "mongodb://192.168.1.76:27017/"
MONGO_DB = "zdravotni_vykony"
COL_VYKONY = "vykony"
COL_HISTORIE = "vykony_historie"

# Source site and the listing columns requested through the ``cols`` query
# parameter (sent as one comma-separated string).
BASE_URL = "https://szv.mzd.gov.cz"
_COLUMN_NAMES = (
    "Odbornost",
    "CisloVykonu",
    "NazevVykonu",
    "Kategorie",
    "TypVykonu",
    "DobaTrvani",
    "OmezeniMistem",
    "OmezeniFrekvenci",
    "PrimeNaklady",
    "Osobni",
    "BodyRezijni",
    "BodyCelkem",
    "Revize",
    "Detail",
)
COLS = ",".join(_COLUMN_NAMES)

# Listing URL template; ``{page}`` is filled in later with ``str.format``.
PAGE_URL = BASE_URL + "/Vykon/?cols=" + COLS + "&page={page}"

# HTTP headers sent with every listing request.
HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/120.0.0.0 Safari/537.36",
        )
    ),
    "Accept-Language": "cs,en;q=0.9",
}

# Fields compared to detect changes (metadata fields are left out).
COMPARE_FIELDS = [
    "odbornost",
    "nazev_vykonu",
    "kategorie",
    "typ_vykonu",
    "doba_trvani",
    "omezeni_mistem",
    "omezeni_frekvenci",
    "prime_naklady",
    "osobni",
    "body_rezijni",
    "body_celkem",
    "revize",
    "detail_url",
]
def parse_decimal(value: str) -> float | None:
if not value or value.strip() in ("", "-", ""):
return None
cleaned = value.strip().replace("\xa0", "").replace(" ", "").replace(",", ".")
try:
return float(cleaned)
except ValueError:
return None
def parse_date(value: str) -> datetime | None:
if not value or value.strip() in ("", "-"):
return None
try:
return datetime.strptime(value.strip(), "%d.%m.%Y")
except ValueError:
return None
def parse_page(html: str) -> tuple[list[dict], int, int]:
    """Parse one listing page into procedure records and paging counters.

    Args:
        html: HTML source of a listing page.

    Returns:
        A ``(records, displayed_to, total)`` tuple. Both counters come from
        the "Zobrazeno ..." text on the page and are 0 when that text is not
        found. ``records`` is empty when the page contains no ``<table>``.
    """
    soup = BeautifulSoup(html, "lxml")

    counter = re.search(
        r"Zobrazeno\s+\d+\s+\S+\s+(\d+)\s+z\S+\s+z\s+(\d+)",
        soup.get_text(" "),
    )
    if counter:
        displayed_to, total = int(counter.group(1)), int(counter.group(2))
    else:
        displayed_to, total = 0, 0

    table = soup.find("table")
    if not table:
        return [], displayed_to, total

    def text_of(cell):
        # Prefer the first non-empty ``title`` attribute found inside the
        # cell; otherwise fall back to the cell's visible text.
        for tagged in cell.find_all(attrs={"title": True}):
            title = tagged.get("title", "").strip()
            if title:
                return title
        return cell.get_text(separator=" ", strip=True)

    def detail_url_of(cells):
        # The detail link lives in the optional 14th cell, either as an
        # <a href> or as a <button data-url>.
        if len(cells) < 14:
            return None
        target = cells[13].find("a") or cells[13].find("button")
        if not target:
            return None
        href = target.get("href", "") or target.get("data-url", "")
        if not href:
            return None
        return BASE_URL + href if href.startswith("/") else href

    records = []
    # The first <tr> is skipped (presumably the header row).
    for tr in table.find_all("tr")[1:]:
        cells = tr.find_all(["td", "th"])
        if len(cells) < 13:
            continue
        (
            odbornost,
            cislo,
            nazev,
            kategorie,
            typ,
            doba,
            misto,
            frekvence,
            naklady,
            osobni,
            rezijni,
            celkem,
            revize,
        ) = (text_of(cell) for cell in cells[:13])
        records.append({
            "cislo_vykonu": cislo.strip(),
            "odbornost": odbornost.strip(),
            "nazev_vykonu": nazev.strip(),
            "kategorie": kategorie.strip(),
            "typ_vykonu": typ.strip(),
            "doba_trvani": parse_decimal(doba),
            "omezeni_mistem": misto.strip() or None,
            "omezeni_frekvenci": frekvence.strip() or None,
            "prime_naklady": parse_decimal(naklady),
            "osobni": parse_decimal(osobni),
            "body_rezijni": parse_decimal(rezijni),
            "body_celkem": parse_decimal(celkem),
            "revize": parse_date(revize),
            "detail_url": detail_url_of(cells),
        })
    return records, displayed_to, total
def fetch_all(session: requests.Session, page_size: int = 50) -> list[dict]:
    """Download every listing page and return all parsed procedure records.

    Pages are requested one by one starting at page 1. Paging stops when the
    page counter reports that the last record has been shown, when a page is
    empty, when a page holds fewer than ``page_size`` records, or when a page
    contains no procedure code that has not been seen already.

    Args:
        session: HTTP session used for the requests.
        page_size: Number of records on a full page; a shorter page is
            treated as the last one.

    Returns:
        The records of all downloaded pages, in download order.

    Raises:
        requests.HTTPError: When a page responds with an error status.
    """
    all_vykony = []
    seen_codes = set()
    page = 1
    while True:
        url = PAGE_URL.format(page=page)
        logger.info(f"Stahuji stránku {page} ...")
        resp = session.get(url, headers=HEADERS, timeout=60)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding or "utf-8"
        batch, displayed_to, total = parse_page(resp.text)

        # Guard against endless paging: if the counter text is not found and
        # the server keeps answering out-of-range pages with full pages of
        # already downloaded rows, none of the checks below would ever fire.
        batch_codes = {v["cislo_vykonu"] for v in batch}
        if batch and batch_codes <= seen_codes:
            logger.warning(
                f"Stránka {page} neobsahuje žádné nové výkony (opakovaná data) — konec."
            )
            break
        seen_codes |= batch_codes

        all_vykony.extend(batch)
        logger.info(f" {len(batch)} výkonů (zobrazeno {displayed_to} z {total})")
        if displayed_to >= total > 0:
            logger.info("Poslední stránka — konec.")
            break
        if not batch:
            logger.warning("Prázdná stránka — konec.")
            break
        if len(batch) < page_size:
            logger.info(f"Poslední stránka (méně než {page_size} záznamů) — konec.")
            break
        page += 1
    return all_vykony
def process_changes(new_vykony: list[dict], col_vykony, col_historie, run_at: datetime) -> dict:
    """Synchronise freshly scraped records with the current state in the DB.

    * Unknown procedure codes are inserted (upsert) with ``_platny_od``.
    * Records whose compared fields differ, or which were inactive and have
      reappeared, are archived into the history collection (with
      ``_platny_do`` and ``_zmenena_pole``) and then overwritten.
    * Unchanged records only get ``_scraped_at`` / ``_aktivni`` refreshed.
    * Active records missing from the scrape are archived and marked
      ``_aktivni=False`` with ``_deaktivovano``.

    The input dictionaries are not modified. If the same ``cislo_vykonu``
    occurs more than once in ``new_vykony`` only the last occurrence is used,
    so the statistics count distinct procedure codes.

    Args:
        new_vykony: Scraped records; each must contain ``cislo_vykonu``.
        col_vykony: Collection holding the current state.
        col_historie: Collection receiving archived versions.
        run_at: Timestamp written into all metadata fields of this run.

    Returns:
        Counters under the keys ``novy``, ``zmenen``, ``nezmenen`` and
        ``deaktivovan``.
    """
    existing = {d["cislo_vykonu"]: d for d in col_vykony.find({}, {"_id": 0})}
    # De-duplicate by code: duplicates would otherwise archive the same old
    # version twice and issue competing updates in an unordered bulk write.
    incoming = {v["cislo_vykonu"]: v for v in new_vykony}

    ops_vykony = []
    ops_historie = []
    stats = {"novy": 0, "zmenen": 0, "nezmenen": 0, "deaktivovan": 0}

    for cid, scraped in incoming.items():
        # Work on a copy so the caller's dictionaries stay untouched.
        vykon = {**scraped, "_aktivni": True, "_scraped_at": run_at}
        old = existing.get(cid)

        if old is None:
            vykon["_platny_od"] = run_at
            ops_vykony.append(UpdateOne({"cislo_vykonu": cid}, {"$set": vykon}, upsert=True))
            stats["novy"] += 1
            continue

        changed = [f for f in COMPARE_FIELDS if old.get(f) != vykon.get(f)]
        # A procedure that was deactivated and shows up again is a change of
        # its own, mirroring the history entry written on deactivation.
        if not old.get("_aktivni", True):
            changed.append("_aktivni")

        if not changed:
            ops_vykony.append(UpdateOne(
                {"cislo_vykonu": cid},
                {"$set": {"_scraped_at": run_at, "_aktivni": True}},
            ))
            stats["nezmenen"] += 1
            continue

        ops_historie.append(InsertOne({**old, "_platny_do": run_at, "_zmenena_pole": changed}))
        vykon["_platny_od"] = run_at
        update = {"$set": vykon}
        if "_deaktivovano" in old:
            # Remove the stale deactivation timestamp of a reactivated record.
            update["$unset"] = {"_deaktivovano": ""}
        ops_vykony.append(UpdateOne({"cislo_vykonu": cid}, update))
        stats["zmenen"] += 1

    # Procedures missing from the new scrape -> deactivate.
    for cid, old in existing.items():
        if cid in incoming or not old.get("_aktivni", True):
            continue
        ops_historie.append(InsertOne({**old, "_platny_do": run_at, "_zmenena_pole": ["_aktivni"]}))
        ops_vykony.append(UpdateOne(
            {"cislo_vykonu": cid},
            {"$set": {"_aktivni": False, "_deaktivovano": run_at}},
        ))
        stats["deaktivovan"] += 1

    # Archive first: if the history write fails, the current state has not
    # been overwritten yet and no old version is lost.
    if ops_historie:
        col_historie.bulk_write(ops_historie, ordered=False)
    if ops_vykony:
        col_vykony.bulk_write(ops_vykony, ordered=False)
    return stats
def create_indexes(col_vykony, col_historie):
    """Create the indexes used on both collections.

    Args:
        col_vykony: Current-state collection; gets a unique index on
            ``cislo_vykonu`` plus indexes on ``odbornost``, ``_aktivni`` and
            ``_platny_od``.
        col_historie: History collection; gets indexes on ``cislo_vykonu``,
            ``_platny_do`` and ``_zmenena_pole``.
    """
    index_plan = (
        (col_vykony, "cislo_vykonu", {"unique": True}),
        (col_vykony, "odbornost", {}),
        (col_vykony, "_aktivni", {}),
        (col_vykony, "_platny_od", {}),
        (col_historie, "cislo_vykonu", {}),
        (col_historie, "_platny_do", {}),
        (col_historie, "_zmenena_pole", {}),
    )
    for collection, field, options in index_plan:
        collection.create_index(field, **options)
    logger.info("Indexy vytvořeny")
def main():
    """Run one full scrape and synchronise the result into MongoDB.

    Downloads all listing pages, and if anything was parsed, ensures the
    indexes exist, applies the changes and logs the resulting counters.
    Nothing is written to the database when no record was parsed.
    """
    logger.info("=== Spouštím stahování zdravotních výkonů ===")
    run_at = datetime.now(timezone.utc)

    # The session is closed as soon as the download is finished, also when
    # a request raises.
    with requests.Session() as session:
        vykony = fetch_all(session)

    if not vykony:
        logger.error("Žádná data nebyla naparsována!")
        return
    logger.info(f"Celkem naparsováno: {len(vykony)} výkonů")

    # The context manager closes the client even if index creation or the
    # bulk writes raise.
    with MongoClient(MONGO_URI) as client:
        db = client[MONGO_DB]
        col_vykony = db[COL_VYKONY]
        col_historie = db[COL_HISTORIE]
        create_indexes(col_vykony, col_historie)
        stats = process_changes(vykony, col_vykony, col_historie, run_at)

    logger.info(
        f"Výsledek: nové={stats['novy']}, změněné={stats['zmenen']}, "
        f"nezměněné={stats['nezmenen']}, deaktivované={stats['deaktivovan']}"
    )
    logger.info("=== Hotovo ===")


if __name__ == "__main__":
    main()