This commit is contained in:
2026-06-01 12:17:41 +02:00
parent a9ef60212d
commit 5b0f8aa08b
8 changed files with 2018 additions and 0 deletions
+259
View File
@@ -0,0 +1,259 @@
"""
Download medical procedures ("výkony") from szv.mzd.gov.cz and store them in MongoDB.

Collections:
    vykony          -- current state of each procedure (_aktivni, _platny_od)
    vykony_historie -- archive of every change (_platny_do, _zmenena_pole)

Requirements:
    pip install requests beautifulsoup4 pymongo lxml
"""
import re
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient, UpdateOne, InsertOne
import logging
from datetime import datetime, timezone
# Root logging setup (INFO level, timestamped lines) and the module logger.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# MongoDB connection string, database name and collection names.
MONGO_URI = "mongodb://192.168.1.76:27017/"
MONGO_DB = "zdravotni_vykony"
COL_VYKONY = "vykony"
COL_HISTORIE = "vykony_historie"

# Source site; COLS is the comma-separated value sent in the `cols` query parameter.
BASE_URL = "https://szv.mzd.gov.cz"
COLS = ",".join([
    "Odbornost",
    "CisloVykonu",
    "NazevVykonu",
    "Kategorie",
    "TypVykonu",
    "DobaTrvani",
    "OmezeniMistem",
    "OmezeniFrekvenci",
    "PrimeNaklady",
    "Osobni",
    "BodyRezijni",
    "BodyCelkem",
    "Revize",
    "Detail",
])

# Listing URL template; the literal "{page}" is filled in later with str.format(page=...).
PAGE_URL = BASE_URL + "/Vykon/?cols=" + COLS + "&page={page}"

# HTTP headers sent with every listing request.
HEADERS = {
    "User-Agent": " ".join([
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/120.0.0.0 Safari/537.36",
    ]),
    "Accept-Language": "cs,en;q=0.9",
}

# Fields compared to detect changes (metadata fields are left out).
COMPARE_FIELDS = [
    "odbornost",
    "nazev_vykonu",
    "kategorie",
    "typ_vykonu",
    "doba_trvani",
    "omezeni_mistem",
    "omezeni_frekvenci",
    "prime_naklady",
    "osobni",
    "body_rezijni",
    "body_celkem",
    "revize",
    "detail_url",
]
def parse_decimal(value: str) -> float | None:
if not value or value.strip() in ("", "-", ""):
return None
cleaned = value.strip().replace("\xa0", "").replace(" ", "").replace(",", ".")
try:
return float(cleaned)
except ValueError:
return None
def parse_date(value: str) -> datetime | None:
if not value or value.strip() in ("", "-"):
return None
try:
return datetime.strptime(value.strip(), "%d.%m.%Y")
except ValueError:
return None
def parse_page(html: str) -> tuple[list[dict], int, int]:
    """Parse one listing page and return ``(procedures, displayed_to, total)``.

    ``displayed_to`` and ``total`` are taken from the pager sentence that
    starts with "Zobrazeno"; both are 0 when that sentence is not found.
    ``procedures`` holds one dict per row of the first ``<table>`` on the page;
    the first ``<tr>`` is skipped (presumably the header row) and so are rows
    with fewer than 13 cells.
    """

    def text_of(cell):
        # Prefer the first non-empty `title` attribute of an element inside
        # the cell; otherwise fall back to the cell's visible text.
        for node in cell.find_all(attrs={"title": True}):
            title = node.get("title", "").strip()
            if title:
                return title
        return cell.get_text(separator=" ", strip=True)

    def detail_url_of(cells):
        # The detail link lives in the optional 14th cell, as an <a> or <button>.
        if len(cells) < 14:
            return None
        holder = cells[13]
        link = holder.find("a") or holder.find("button")
        if not link:
            return None
        href = link.get("href", "") or link.get("data-url", "")
        if not href:
            return None
        if href.startswith("/"):
            return BASE_URL + href
        return href

    soup = BeautifulSoup(html, "lxml")

    pager = re.search(
        r"Zobrazeno\s+\d+\s+\S+\s+(\d+)\s+z\S+\s+z\s+(\d+)",
        soup.get_text(" "),
    )
    if pager:
        displayed_to, total = int(pager.group(1)), int(pager.group(2))
    else:
        displayed_to, total = 0, 0

    table = soup.find("table")
    if not table:
        return [], displayed_to, total

    vykony = []
    for row in table.find_all("tr")[1:]:
        cells = row.find_all(["td", "th"])
        if len(cells) < 13:
            continue

        # Cell order follows the COLS query parameter: indexes 0-12 are data columns.
        (
            odbornost, cislo, nazev, kategorie, typ,
            doba, misto, frekvence, naklady, osobni,
            rezijni, celkem, revize,
        ) = (text_of(cell) for cell in cells[:13])

        record = {
            "cislo_vykonu": cislo.strip(),
            "odbornost": odbornost.strip(),
            "nazev_vykonu": nazev.strip(),
            "kategorie": kategorie.strip(),
            "typ_vykonu": typ.strip(),
            "doba_trvani": parse_decimal(doba),
            "omezeni_mistem": misto.strip() or None,
            "omezeni_frekvenci": frekvence.strip() or None,
            "prime_naklady": parse_decimal(naklady),
            "osobni": parse_decimal(osobni),
            "body_rezijni": parse_decimal(rezijni),
            "body_celkem": parse_decimal(celkem),
            "revize": parse_date(revize),
            "detail_url": detail_url_of(cells),
        }
        vykony.append(record)

    return vykony, displayed_to, total
def fetch_all(session: requests.Session) -> list[dict]:
    """Download every listing page and return all parsed procedures.

    Pages are requested one after another, starting at page 1, until one of
    these holds:

    * the pager reports that the last displayed record is the final one,
    * a page yields no rows,
    * the pager could not be read and the page has fewer than 50 rows.

    The row-count heuristic is a fallback only. When the pager says more
    records remain, a short page (for example one where a malformed row was
    skipped by the parser) must not end the crawl, because procedures missing
    from the result are later treated as removed.

    Args:
        session: Session used for all HTTP requests.

    Returns:
        Parsed procedures from all pages, in page order.

    Raises:
        requests.HTTPError: If a page responds with an error status.
    """
    fallback_page_size = 50
    all_vykony = []
    page = 1
    while True:
        url = PAGE_URL.format(page=page)
        logger.info(f"Stahuji stránku {page} ...")
        resp = session.get(url, headers=HEADERS, timeout=60)
        resp.raise_for_status()
        # Use the encoding detected from the body, falling back to UTF-8.
        resp.encoding = resp.apparent_encoding or "utf-8"

        batch, displayed_to, total = parse_page(resp.text)
        all_vykony.extend(batch)
        logger.info(f" {len(batch)} výkonů (zobrazeno {displayed_to} z {total})")

        if displayed_to >= total > 0:
            logger.info("Poslední stránka — konec.")
            break
        if not batch:
            logger.warning("Prázdná stránka — konec.")
            break
        # Only guess from the row count when the pager gave no usable total.
        if total <= 0 and len(batch) < fallback_page_size:
            logger.info("Poslední stránka (méně než 50 záznamů) — konec.")
            break
        page += 1
    return all_vykony
def process_changes(new_vykony: list[dict], col_vykony, col_historie, run_at: datetime) -> dict:
    """
    Compare freshly scraped procedures with the current state stored in the DB.

    * A procedure whose number is not yet in ``col_vykony`` is upserted with
      ``_platny_od = run_at``.
    * A procedure that differs in any of COMPARE_FIELDS has its stored version
      copied to ``col_historie`` (with ``_platny_do`` and ``_zmenena_pole``
      added) and is then overwritten with the new values.
    * An unchanged procedure only gets ``_scraped_at`` / ``_aktivni`` refreshed.
    * A stored procedure missing from the scrape and not already inactive is
      archived and flagged ``_aktivni = False`` with ``_deaktivovano = run_at``.

    The input dicts are not modified.

    Args:
        new_vykony: Scraped procedures; each must contain "cislo_vykonu".
        col_vykony: Collection holding the current state.
        col_historie: Collection receiving archived versions.
        run_at: Timestamp of this run, written into the metadata fields.

    Returns:
        Counters under the keys "novy", "zmenen", "nezmenen" and "deaktivovan".
    """
    # "_id" is excluded so archived copies get a fresh _id in the history collection.
    existing = {d["cislo_vykonu"]: d for d in col_vykony.find({}, {"_id": 0})}
    new_ids = {v["cislo_vykonu"] for v in new_vykony}

    ops_vykony = []
    ops_historie = []
    stats = {"novy": 0, "zmenen": 0, "nezmenen": 0, "deaktivovan": 0}

    for vykon in new_vykony:
        cid = vykon["cislo_vykonu"]
        # Work on a copy so the caller's dicts stay untouched.
        doc = {**vykon, "_aktivni": True, "_scraped_at": run_at}
        old = existing.get(cid)

        if old is None:
            doc["_platny_od"] = run_at
            ops_vykony.append(UpdateOne({"cislo_vykonu": cid}, {"$set": doc}, upsert=True))
            stats["novy"] += 1
            continue

        changed = [f for f in COMPARE_FIELDS if old.get(f) != doc.get(f)]
        if not changed:
            ops_vykony.append(UpdateOne(
                {"cislo_vykonu": cid},
                {"$set": {"_scraped_at": run_at, "_aktivni": True}},
            ))
            stats["nezmenen"] += 1
            continue

        archive = dict(old)
        archive["_platny_do"] = run_at
        archive["_zmenena_pole"] = changed
        ops_historie.append(InsertOne(archive))

        doc["_platny_od"] = run_at
        ops_vykony.append(UpdateOne({"cislo_vykonu": cid}, {"$set": doc}))
        stats["zmenen"] += 1

    # Procedures missing from the new scrape -> deactivate.
    for cid, old in existing.items():
        if cid in new_ids or not old.get("_aktivni", True):
            continue
        archive = dict(old)
        archive["_platny_do"] = run_at
        archive["_zmenena_pole"] = ["_aktivni"]
        ops_historie.append(InsertOne(archive))
        ops_vykony.append(UpdateOne(
            {"cislo_vykonu": cid},
            {"$set": {"_aktivni": False, "_deaktivovano": run_at}},
        ))
        stats["deaktivovan"] += 1

    # Persist the archive first: if this write fails, the current documents are
    # still untouched, so no previous version is lost. A retry may archive the
    # same version twice, but never drops it.
    if ops_historie:
        col_historie.bulk_write(ops_historie, ordered=False)
    if ops_vykony:
        col_vykony.bulk_write(ops_vykony, ordered=False)

    return stats
def create_indexes(col_vykony, col_historie):
    """Create the indexes used by both collections.

    ``col_vykony`` gets a unique index on "cislo_vykonu" plus plain indexes on
    "odbornost", "_aktivni" and "_platny_od"; ``col_historie`` gets plain
    indexes on "cislo_vykonu", "_platny_do" and "_zmenena_pole".
    """
    col_vykony.create_index("cislo_vykonu", unique=True)
    for field in ("odbornost", "_aktivni", "_platny_od"):
        col_vykony.create_index(field)

    for field in ("cislo_vykonu", "_platny_do", "_zmenena_pole"):
        col_historie.create_index(field)

    logger.info("Indexy vytvořeny")
def main():
    """Run one full scrape-and-store cycle.

    Downloads all listing pages, then connects to MongoDB, ensures the indexes
    exist and applies the changes. Nothing is written to the database when no
    procedures were parsed. The HTTP session and the MongoDB client are closed
    even when a step raises.
    """
    logger.info("=== Spouštím stahování zdravotních výkonů ===")
    run_at = datetime.now(timezone.utc)

    with requests.Session() as session:
        vykony = fetch_all(session)

    if not vykony:
        logger.error("Žádná data nebyla naparsována!")
        return
    logger.info(f"Celkem naparsováno: {len(vykony)} výkonů")

    with MongoClient(MONGO_URI) as client:
        col_vykony = client[MONGO_DB][COL_VYKONY]
        col_historie = client[MONGO_DB][COL_HISTORIE]
        create_indexes(col_vykony, col_historie)
        stats = process_changes(vykony, col_vykony, col_historie, run_at)
        logger.info(
            f"Výsledek: nové={stats['novy']}, změněné={stats['zmenen']}, "
            f"nezměněné={stats['nezmenen']}, deaktivované={stats['deaktivovan']}"
        )

    logger.info("=== Hotovo ===")


if __name__ == "__main__":
    main()