"""
Download health-care procedures from szv.mzd.gov.cz and store them in MongoDB.

Collections:
    vykony          -- current state of each procedure (_aktivni, _platny_od)
    vykony_historie -- archive of every change (_platny_do, _zmenena_pole)

Requirements:
    pip install requests beautifulsoup4 pymongo lxml
"""
import logging
import math
import os
import re
from datetime import datetime, timezone

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient, UpdateOne, InsertOne
# Logging is configured at import time so that every message of a run shares
# one timestamped format.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)

# MongoDB connection settings. The URI can be overridden through the
# SZV_MONGO_URI environment variable; the default is the original address.
MONGO_URI = os.environ.get("SZV_MONGO_URI", "mongodb://192.168.1.76:27017/")
MONGO_DB = "zdravotni_vykony"
COL_VYKONY = "vykony"
COL_HISTORIE = "vykony_historie"

# Listing endpoint. COLS is the comma-separated column list sent in the
# ``cols`` query parameter; parse_page() reads table cells by position in
# this same order.
BASE_URL = "https://szv.mzd.gov.cz"
COLS = (
    "Odbornost,CisloVykonu,NazevVykonu,Kategorie,TypVykonu,"
    "DobaTrvani,OmezeniMistem,OmezeniFrekvenci,PrimeNaklady,Osobni,"
    "BodyRezijni,BodyCelkem,Revize,Detail"
)
# ``{page}`` is left as a placeholder to be filled with str.format().
PAGE_URL = f"{BASE_URL}/Vykon/?cols={COLS}&page={{page}}"

# HTTP headers sent with every listing request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "cs,en;q=0.9",
}

# Fields compared to detect a change between two scrapes. Bookkeeping
# fields (those starting with an underscore) are intentionally left out.
COMPARE_FIELDS = [
    "odbornost", "nazev_vykonu", "kategorie", "typ_vykonu",
    "doba_trvani", "omezeni_mistem", "omezeni_frekvenci",
    "prime_naklady", "osobni", "body_rezijni", "body_celkem",
    "revize", "detail_url",
]
def parse_decimal(value: str) -> float | None:
|
||
if not value or value.strip() in ("", "-", "–"):
|
||
return None
|
||
cleaned = value.strip().replace("\xa0", "").replace(" ", "").replace(",", ".")
|
||
try:
|
||
return float(cleaned)
|
||
except ValueError:
|
||
return None
|
||
|
||
|
||
def parse_date(value: str) -> datetime | None:
|
||
if not value or value.strip() in ("", "-"):
|
||
return None
|
||
try:
|
||
return datetime.strptime(value.strip(), "%d.%m.%Y")
|
||
except ValueError:
|
||
return None
|
||
|
||
|
||
def parse_page(html: str) -> tuple[list[dict], int, int]:
    """Extract the procedure rows and the paging counter from one listing page.

    Args:
        html: HTML source of a listing page.

    Returns:
        A tuple ``(procedures, displayed_to, total)``. ``procedures`` is a
        list of dicts, one per table row. ``displayed_to`` and ``total`` are
        the two numbers captured from the "Zobrazeno ... z N" counter text;
        both are ``0`` when the counter is not found. When the page contains
        no ``<table>``, ``procedures`` is empty.
    """
    soup = BeautifulSoup(html, "lxml")

    displayed_to = total = 0
    counter = re.search(
        r"Zobrazeno\s+\d+\s+\S+\s+(\d+)\s+z\S+\s+z\s+(\d+)", soup.get_text(" ")
    )
    if counter:
        displayed_to = int(counter.group(1))
        total = int(counter.group(2))

    table = soup.find("table")
    if not table:
        return [], displayed_to, total

    def text_of(cell):
        # Prefer the first non-empty ``title`` attribute found inside the
        # cell; fall back to the cell's visible text.
        for elem in cell.find_all(attrs={"title": True}):
            title = elem.get("title", "").strip()
            if title:
                return title
        return cell.get_text(separator=" ", strip=True)

    def detail_url_of(cell):
        # The link target comes from an <a> or <button> through ``href`` or
        # ``data-url``; root-relative paths are prefixed with BASE_URL.
        link = cell.find("a") or cell.find("button")
        if not link:
            return None
        href = link.get("href", "") or link.get("data-url", "")
        if not href:
            return None
        return BASE_URL + href if href.startswith("/") else href

    vykony = []
    # The first <tr> is skipped (treated as the header row).
    for row in table.find_all("tr")[1:]:
        cells = row.find_all(["td", "th"])
        # Rows without all 13 data columns are ignored.
        if len(cells) < 13:
            continue

        texts = [text_of(cell) for cell in cells[:13]]
        # The 14th column (detail link) is optional.
        detail_link = detail_url_of(cells[13]) if len(cells) >= 14 else None

        vykony.append({
            "cislo_vykonu": texts[1].strip(),
            "odbornost": texts[0].strip(),
            "nazev_vykonu": texts[2].strip(),
            "kategorie": texts[3].strip(),
            "typ_vykonu": texts[4].strip(),
            "doba_trvani": parse_decimal(texts[5]),
            "omezeni_mistem": texts[6].strip() or None,
            "omezeni_frekvenci": texts[7].strip() or None,
            "prime_naklady": parse_decimal(texts[8]),
            "osobni": parse_decimal(texts[9]),
            "body_rezijni": parse_decimal(texts[10]),
            "body_celkem": parse_decimal(texts[11]),
            "revize": parse_date(texts[12]),
            "detail_url": detail_link,
        })

    return vykony, displayed_to, total
def fetch_all(session: requests.Session) -> list[dict]:
    """Download every listing page and return all parsed procedures.

    Pages are requested one after another starting at page 1. Paging stops
    when the page counter reports the last record has been shown, when a
    page yields no rows, when a page yields no procedure number that was not
    already seen, or, only if the counter could not be parsed, when a page
    has fewer than 50 rows.

    Args:
        session: HTTP session used for the requests.

    Returns:
        The procedures from all downloaded pages, in page order.

    Raises:
        requests.HTTPError: When a page responds with an error status.
    """
    all_vykony = []
    seen_ids = set()
    page = 1
    while True:
        url = PAGE_URL.format(page=page)
        logger.info(f"Stahuji stránku {page} ...")
        resp = session.get(url, headers=HEADERS, timeout=60)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding or "utf-8"

        batch, displayed_to, total = parse_page(resp.text)
        logger.info(f" {len(batch)} výkonů (zobrazeno {displayed_to} z {total})")

        # Guard against an endless loop: a page that only repeats procedures
        # already collected means there is nothing further to fetch.
        batch_ids = {v["cislo_vykonu"] for v in batch}
        if batch and batch_ids <= seen_ids:
            logger.warning("Stránka neobsahuje žádné nové výkony — konec.")
            break
        seen_ids |= batch_ids
        all_vykony.extend(batch)

        if displayed_to >= total > 0:
            logger.info("Poslední stránka — konec.")
            break
        if not batch:
            logger.warning("Prázdná stránka — konec.")
            break
        # The row-count heuristic is used only when the counter is unknown.
        # With a known total, a short page (e.g. some rows skipped by the
        # parser) must not end the scrape early, because missing procedures
        # would later be marked inactive.
        if total == 0 and len(batch) < 50:
            logger.info("Poslední stránka (méně než 50 záznamů) — konec.")
            break
        page += 1

    return all_vykony
def process_changes(new_vykony: list[dict], col_vykony, col_historie, run_at: datetime) -> dict:
    """Synchronise freshly scraped procedures with the database.

    Each scraped procedure is compared with the stored document that has the
    same ``cislo_vykonu``:

    * not stored yet: upserted with ``_platny_od = run_at``;
    * any field in ``COMPARE_FIELDS`` differs: the stored version is copied
      to the history collection (with ``_platny_do`` and ``_zmenena_pole``)
      and the current document is overwritten;
    * otherwise only ``_scraped_at`` and ``_aktivni`` are refreshed.

    Stored procedures that are active but missing from ``new_vykony`` are
    archived and flagged ``_aktivni = False``.

    Args:
        new_vykony: Scraped procedures; the dicts are not modified.
        col_vykony: Collection holding the current state.
        col_historie: Collection holding archived versions.
        run_at: Timestamp recorded on every document touched by this run.

    Returns:
        Counters keyed ``novy``, ``zmenen``, ``nezmenen``, ``deaktivovan``.
    """
    existing = {d["cislo_vykonu"]: d for d in col_vykony.find({}, {"_id": 0})}
    new_ids = {v["cislo_vykonu"] for v in new_vykony}

    ops_vykony = []
    ops_historie = []
    stats = {"novy": 0, "zmenen": 0, "nezmenen": 0, "deaktivovan": 0}

    for vykon in new_vykony:
        cid = vykon["cislo_vykonu"]
        selector = {"cislo_vykonu": cid}
        # Build a new document instead of mutating the caller's dict.
        doc = {**vykon, "_aktivni": True, "_scraped_at": run_at}
        old = existing.get(cid)

        if old is None:
            doc["_platny_od"] = run_at
            ops_vykony.append(UpdateOne(selector, {"$set": doc}, upsert=True))
            stats["novy"] += 1
            continue

        changed = [f for f in COMPARE_FIELDS if old.get(f) != doc.get(f)]
        if not changed:
            # NOTE: a procedure that reappears unchanged is only re-flagged
            # as active; an earlier ``_deaktivovano`` value stays in place.
            ops_vykony.append(UpdateOne(
                selector,
                {"$set": {"_scraped_at": run_at, "_aktivni": True}},
            ))
            stats["nezmenen"] += 1
            continue

        ops_historie.append(InsertOne(
            {**old, "_platny_do": run_at, "_zmenena_pole": changed}
        ))
        doc["_platny_od"] = run_at
        ops_vykony.append(UpdateOne(selector, {"$set": doc}))
        stats["zmenen"] += 1

    # Procedures missing from the new scrape are deactivated.
    for cid, old in existing.items():
        if cid in new_ids or not old.get("_aktivni", True):
            continue
        ops_historie.append(InsertOne(
            {**old, "_platny_do": run_at, "_zmenena_pole": ["_aktivni"]}
        ))
        ops_vykony.append(UpdateOne(
            {"cislo_vykonu": cid},
            {"$set": {"_aktivni": False, "_deaktivovano": run_at}},
        ))
        stats["deaktivovan"] += 1

    # Archive first: if the history write fails, the current documents are
    # still untouched and no previous version is lost.
    if ops_historie:
        col_historie.bulk_write(ops_historie, ordered=False)
    if ops_vykony:
        col_vykony.bulk_write(ops_vykony, ordered=False)

    return stats
def create_indexes(col_vykony, col_historie):
    """Create the indexes used on both collections.

    Args:
        col_vykony: Collection with the current state; gets a unique index
            on ``cislo_vykonu`` plus indexes on ``odbornost``, ``_aktivni``
            and ``_platny_od``.
        col_historie: History collection; gets indexes on ``cislo_vykonu``,
            ``_platny_do`` and ``_zmenena_pole``.
    """
    col_vykony.create_index("cislo_vykonu", unique=True)
    for field in ("odbornost", "_aktivni", "_platny_od"):
        col_vykony.create_index(field)

    # The history collection holds several versions per procedure, so its
    # ``cislo_vykonu`` index is not unique.
    for field in ("cislo_vykonu", "_platny_do", "_zmenena_pole"):
        col_historie.create_index(field)

    logger.info("Indexy vytvořeny")
def main():
    """Run one full scrape and synchronise the result with MongoDB.

    Downloads all listing pages, and if anything was parsed, creates the
    indexes, applies the changes and logs the resulting counters. Nothing is
    written to the database when the scrape returns no procedures.
    """
    logger.info("=== Spouštím stahování zdravotních výkonů ===")
    run_at = datetime.now(timezone.utc)

    # The session is closed even if a download fails.
    with requests.Session() as session:
        vykony = fetch_all(session)

    if not vykony:
        logger.error("Žádná data nebyla naparsována!")
        return

    logger.info(f"Celkem naparsováno: {len(vykony)} výkonů")

    # The client is closed even if index creation or the sync raises.
    with MongoClient(MONGO_URI) as client:
        db = client[MONGO_DB]
        col_vykony = db[COL_VYKONY]
        col_historie = db[COL_HISTORIE]
        create_indexes(col_vykony, col_historie)

        stats = process_changes(vykony, col_vykony, col_historie, run_at)
        logger.info(
            f"Výsledek: nové={stats['novy']}, změněné={stats['zmenen']}, "
            f"nezměněné={stats['nezmenen']}, deaktivované={stats['deaktivovan']}"
        )

    logger.info("=== Hotovo ===")
# Run the scraper only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()