z230
This commit is contained in:
@@ -0,0 +1,259 @@
|
||||
"""
|
||||
Stahování zdravotních výkonů ze szv.mzd.gov.cz a uložení do MongoDB.
|
||||
|
||||
Kolekce:
|
||||
vykony -- aktuální stav každého výkonu (_aktivni, _platny_od)
|
||||
vykony_historie -- archiv každé změny (_platny_do, _zmenena_pole)
|
||||
|
||||
Požadavky:
|
||||
pip install requests beautifulsoup4 pymongo lxml
|
||||
"""
|
||||
|
||||
import re
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from pymongo import MongoClient, UpdateOne, InsertOne
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
|
||||
# Root logging setup: INFO level, timestamped "time level message" lines.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# MongoDB target: server URI, database name and the two collections used
# for the current state and for the change archive.
MONGO_URI = "mongodb://192.168.1.76:27017/"
MONGO_DB = "zdravotni_vykony"
COL_VYKONY = "vykony"
COL_HISTORIE = "vykony_historie"

# Listing endpoint. ``COLS`` is the comma-separated value sent in the
# ``cols`` query parameter.
BASE_URL = "https://szv.mzd.gov.cz"
COLS = ",".join(
    (
        "Odbornost",
        "CisloVykonu",
        "NazevVykonu",
        "Kategorie",
        "TypVykonu",
        "DobaTrvani",
        "OmezeniMistem",
        "OmezeniFrekvenci",
        "PrimeNaklady",
        "Osobni",
        "BodyRezijni",
        "BodyCelkem",
        "Revize",
        "Detail",
    )
)
# ``{page}`` stays a literal placeholder here; it is filled in later with
# ``PAGE_URL.format(page=...)``.
PAGE_URL = BASE_URL + "/Vykon/?cols=" + COLS + "&page={page}"

# Request headers sent with every page download.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "cs,en;q=0.9",
}

# Fields compared to detect a change (metadata fields are left out).
COMPARE_FIELDS = [
    "odbornost",
    "nazev_vykonu",
    "kategorie",
    "typ_vykonu",
    "doba_trvani",
    "omezeni_mistem",
    "omezeni_frekvenci",
    "prime_naklady",
    "osobni",
    "body_rezijni",
    "body_celkem",
    "revize",
    "detail_url",
]
|
||||
|
||||
|
||||
def parse_decimal(value: str) -> float | None:
|
||||
if not value or value.strip() in ("", "-", "–"):
|
||||
return None
|
||||
cleaned = value.strip().replace("\xa0", "").replace(" ", "").replace(",", ".")
|
||||
try:
|
||||
return float(cleaned)
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def parse_date(value: str) -> datetime | None:
|
||||
if not value or value.strip() in ("", "-"):
|
||||
return None
|
||||
try:
|
||||
return datetime.strptime(value.strip(), "%d.%m.%Y")
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
def _cell_text(cell) -> str:
    """Return the first non-empty ``title`` attribute inside ``cell``, else its text."""
    # NOTE(review): the title is preferred over the visible text, presumably
    # because the visible text is shortened — confirm against the site.
    for elem in cell.find_all(attrs={"title": True}):
        title = elem.get("title", "").strip()
        if title:
            return title
    return cell.get_text(separator=" ", strip=True)


def _detail_url(cell) -> str | None:
    """Return the absolute URL of the first ``<a>`` or ``<button>`` in ``cell``."""
    from urllib.parse import urljoin

    link = cell.find("a") or cell.find("button")
    if not link:
        return None
    href = link.get("href", "") or link.get("data-url", "")
    if not href:
        return None
    # Resolve against the listing URL so that root-relative ("/x"), relative
    # ("x"), protocol-relative ("//host/x") and absolute targets all end up
    # as absolute URLs. Only the scheme, host and path of PAGE_URL matter.
    return urljoin(PAGE_URL, href)


def parse_page(html: str) -> tuple[list[dict], int, int]:
    """Parse one listing page into procedure records and pager counters.

    Args:
        html: HTML source of a listing page.

    Returns:
        A tuple ``(records, displayed_to, total)``. ``displayed_to`` and
        ``total`` are the second and third numbers of the "Zobrazeno ..."
        pager text and are both ``0`` when that text is not found.
        ``records`` is empty when the page contains no ``<table>``.
    """
    soup = BeautifulSoup(html, "lxml")

    displayed_to = total = 0
    m = re.search(r"Zobrazeno\s+\d+\s+\S+\s+(\d+)\s+z\S+\s+z\s+(\d+)", soup.get_text(" "))
    if m:
        displayed_to, total = int(m.group(1)), int(m.group(2))

    table = soup.find("table")
    if not table:
        return [], displayed_to, total

    vykony = []
    # The first row is skipped — presumably the header row.
    for row in table.find_all("tr")[1:]:
        cells = row.find_all(["td", "th"])
        # Rows with fewer than the 13 data columns cannot be mapped; skip them.
        if len(cells) < 13:
            continue

        text = [_cell_text(cell) for cell in cells[:13]]
        # The 14th column, when present, carries the detail link.
        detail_link = _detail_url(cells[13]) if len(cells) >= 14 else None

        vykony.append({
            "cislo_vykonu": text[1],
            "odbornost": text[0],
            "nazev_vykonu": text[2],
            "kategorie": text[3],
            "typ_vykonu": text[4],
            "doba_trvani": parse_decimal(text[5]),
            "omezeni_mistem": text[6] or None,
            "omezeni_frekvenci": text[7] or None,
            "prime_naklady": parse_decimal(text[8]),
            "osobni": parse_decimal(text[9]),
            "body_rezijni": parse_decimal(text[10]),
            "body_celkem": parse_decimal(text[11]),
            "revize": parse_date(text[12]),
            "detail_url": detail_link,
        })

    return vykony, displayed_to, total
|
||||
|
||||
|
||||
def fetch_all(session: requests.Session) -> list[dict]:
    """Download every listing page and return all parsed records.

    Pages are requested one after another starting at page 1. The loop ends
    when the pager reports that the last record has been shown, when a page
    yields no records, when a page holds fewer than 50 records, or when a
    page contributes no procedure number that was not already seen.

    Args:
        session: HTTP session used for every request.

    Returns:
        The records of all downloaded pages, in download order.

    Raises:
        Whatever ``session.get`` or ``Response.raise_for_status`` raise;
        nothing is caught here.
    """
    all_vykony: list[dict] = []
    seen: set[str] = set()
    page = 1
    while True:
        url = PAGE_URL.format(page=page)
        logger.info(f"Stahuji stránku {page} ...")
        resp = session.get(url, headers=HEADERS, timeout=60)
        resp.raise_for_status()
        # Decode with the encoding detected from the body, not the header.
        resp.encoding = resp.apparent_encoding or "utf-8"

        batch, displayed_to, total = parse_page(resp.text)
        logger.info(f" {len(batch)} výkonů (zobrazeno {displayed_to} z {total})")

        # Guard against looping forever: if the pager text is not recognised
        # and the server keeps answering out-of-range page numbers with an
        # already delivered page, none of the checks below would ever fire.
        codes = {v["cislo_vykonu"] for v in batch}
        if batch and codes <= seen:
            logger.warning("Stránka neobsahuje žádné nové výkony — konec.")
            break
        seen |= codes
        all_vykony.extend(batch)

        if displayed_to >= total > 0:
            logger.info("Poslední stránka — konec.")
            break
        if not batch:
            logger.warning("Prázdná stránka — konec.")
            break
        if len(batch) < 50:
            logger.info("Poslední stránka (méně než 50 záznamů) — konec.")
            break
        page += 1

    return all_vykony
|
||||
|
||||
|
||||
def process_changes(new_vykony: list[dict], col_vykony, col_historie, run_at: datetime) -> dict:
    """Synchronise freshly scraped records with the current-state collection.

    Each scraped record is matched to the stored document with the same
    ``cislo_vykonu``:

    * no stored document -> upserted with ``_platny_od = run_at``;
    * any field of ``COMPARE_FIELDS`` differs -> the stored version is copied
      to the history collection (with ``_platny_do`` and ``_zmenena_pole``)
      and the current document is overwritten;
    * nothing differs -> only ``_scraped_at`` and ``_aktivni`` are refreshed.

    Stored documents that are missing from ``new_vykony`` and are not already
    inactive are archived and flagged ``_aktivni = False``.

    Args:
        new_vykony: Scraped records; they are not modified.
        col_vykony: Collection holding the current state.
        col_historie: Collection receiving archived versions.
        run_at: Timestamp written into all metadata fields of this run.

    Returns:
        Counters under the keys ``novy``, ``zmenen``, ``nezmenen`` and
        ``deaktivovan``.
    """
    existing = {d["cislo_vykonu"]: d for d in col_vykony.find({}, {"_id": 0})}
    new_ids = {v["cislo_vykonu"] for v in new_vykony}

    ops_vykony = []
    ops_historie = []
    stats = {"novy": 0, "zmenen": 0, "nezmenen": 0, "deaktivovan": 0}

    for vykon in new_vykony:
        cid = vykon["cislo_vykonu"]
        key = {"cislo_vykonu": cid}
        # Work on a copy so the caller's dicts are left untouched.
        doc = {**vykon, "_aktivni": True, "_scraped_at": run_at}
        old = existing.get(cid)

        if old is None:
            doc["_platny_od"] = run_at
            ops_vykony.append(UpdateOne(key, {"$set": doc}, upsert=True))
            stats["novy"] += 1
            continue

        changed = [f for f in COMPARE_FIELDS if old.get(f) != doc.get(f)]
        if not changed:
            ops_vykony.append(
                UpdateOne(key, {"$set": {"_scraped_at": run_at, "_aktivni": True}})
            )
            stats["nezmenen"] += 1
            continue

        ops_historie.append(
            InsertOne({**old, "_platny_do": run_at, "_zmenena_pole": changed})
        )
        doc["_platny_od"] = run_at
        ops_vykony.append(UpdateOne(key, {"$set": doc}))
        stats["zmenen"] += 1

    # Procedures missing from the new scrape -> deactivate. Documents without
    # an ``_aktivni`` flag count as active.
    for cid, old in existing.items():
        if cid in new_ids or not old.get("_aktivni", True):
            continue
        ops_historie.append(
            InsertOne({**old, "_platny_do": run_at, "_zmenena_pole": ["_aktivni"]})
        )
        ops_vykony.append(UpdateOne(
            {"cislo_vykonu": cid},
            {"$set": {"_aktivni": False, "_deaktivovano": run_at}},
        ))
        stats["deaktivovan"] += 1

    # Archive first: if this write fails, the current-state collection is
    # still untouched and no previous version has been lost. A failure of the
    # second write merely causes the same versions to be archived again on
    # the next run.
    if ops_historie:
        col_historie.bulk_write(ops_historie, ordered=False)
    if ops_vykony:
        col_vykony.bulk_write(ops_vykony, ordered=False)

    return stats
|
||||
|
||||
|
||||
def create_indexes(col_vykony, col_historie):
    """Create the indexes on the current-state and history collections.

    Args:
        col_vykony: Current-state collection; gets a unique index on
            ``cislo_vykonu`` plus plain indexes on three other fields.
        col_historie: History collection; gets plain indexes only.
    """
    # The procedure number is the only unique key.
    col_vykony.create_index("cislo_vykonu", unique=True)
    for field in ("odbornost", "_aktivni", "_platny_od"):
        col_vykony.create_index(field)

    # The history index on ``cislo_vykonu`` is deliberately not unique.
    for field in ("cislo_vykonu", "_platny_do", "_zmenena_pole"):
        col_historie.create_index(field)

    logger.info("Indexy vytvořeny")
|
||||
|
||||
|
||||
def main():
    """Run one scrape-and-sync cycle.

    Downloads all listing pages, and — unless nothing was parsed — connects
    to MongoDB, creates the indexes and applies the changes. The HTTP session
    and the MongoDB client are both closed on every exit path, including
    when an exception is raised.
    """
    logger.info("=== Spouštím stahování zdravotních výkonů ===")
    # One timestamp for the whole run, used in all metadata fields.
    run_at = datetime.now(timezone.utc)

    with requests.Session() as session:
        vykony = fetch_all(session)

    if not vykony:
        # Stop before touching the database when nothing was parsed.
        logger.error("Žádná data nebyla naparsována!")
        return

    logger.info(f"Celkem naparsováno: {len(vykony)} výkonů")

    # Used as a context manager so the client is closed even when index
    # creation or the sync raises.
    with MongoClient(MONGO_URI) as client:
        db = client[MONGO_DB]
        col_vykony = db[COL_VYKONY]
        col_historie = db[COL_HISTORIE]
        create_indexes(col_vykony, col_historie)

        stats = process_changes(vykony, col_vykony, col_historie, run_at)
        logger.info(
            f"Výsledek: nové={stats['novy']}, změněné={stats['zmenen']}, "
            f"nezměněné={stats['nezmenen']}, deaktivované={stats['deaktivovan']}"
        )

    logger.info("=== Hotovo ===")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user