Files
2026-06-01 12:17:41 +02:00

81 lines
2.7 KiB
Python

"""
Debugging aid for the detail-page parser. Downloads the HTML of a single
procedure (výkon), saves it to disk and prints what the parser sees
(tables, rows, labels).

Usage:
    python debug_detail.py
    python debug_detail.py 09581
"""
import sys
import requests
from bs4 import BeautifulSoup
from pathlib import Path
# Procedure number to inspect: the first command-line argument when one is
# given, otherwise the built-in sample number.
_cli_args = sys.argv[1:]
CISLO = _cli_args[0] if _cli_args else "01021"

BASE_URL = "https://szv.mzd.gov.cz"

# Browser-like headers sent with the download request.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36"
)
HEADERS = {
    "User-Agent": _USER_AGENT,
    "Accept-Language": "cs,en;q=0.9",
}
# Download the detail page for the requested procedure number.
url = f"{BASE_URL}/Vykon/Detail/{CISLO}/"
print(f"Stahuji: {url}")
resp = requests.get(url, headers=HEADERS, timeout=30)
# Decode the body with the encoding detected from its content; fall back to
# UTF-8 when detection yields nothing.
resp.encoding = resp.apparent_encoding or "utf-8"

# Save the raw HTML next to this script so it can be inspected by hand.
# This happens before the status check on purpose: the body of an error
# response is useful for debugging too.
html_path = Path(__file__).parent / f"debug_{CISLO}.html"
html_path.write_text(resp.text, encoding="utf-8")
print(f"HTML uloženo: {html_path}")

# An HTTP error page is not a detail page; analysing its tables would give
# misleading output, so report the status and stop here.
if not resp.ok:
    print(f"HTTP chyba {resp.status_code}: {resp.reason}")
    sys.exit(1)

soup = BeautifulSoup(resp.text, "lxml")
# --- How many tables are on the page? ---
all_tables = soup.find_all("table")
print(f"\nPočet <table> na stránce: {len(all_tables)}")

# --- What does the current selector find? ---
# Evaluate each lookup once: the preferred selector and the plain fallback.
container_table = soup.select_one("div.container table")
first_table = soup.find("table")
main_table = container_table or first_table
print(f"Selektor 'div.container table': {bool(container_table)}")
print(f"Fallback soup.find('table'): {bool(first_table)}")

# Nothing to analyse without a table; exit with a non-zero status.
if not main_table:
    print("ŽÁDNÁ TABULKA NENALEZENA!")
    sys.exit(1)
# --- Direct <tr> children of the main table ---
direct_trs = main_table.find_all("tr", recursive=False)
print(f"\nPřímých <tr> v hlavní tabulce: {len(direct_trs)}")
print("\n--- Labely v přímých řádcích ---")
for i, row in enumerate(direct_trs):
    tds = row.find_all("td", recursive=False)

    if not tds:
        # No <td> cells: report the header cells, if the row has any.
        ths = row.find_all("th", recursive=False)
        if ths:
            print(f" [{i:2d}] <th>: {[th.get_text(strip=True) for th in ths]}")
        continue

    if len(tds) == 1:
        # A single cell: show the start of its text.
        print(f" [{i:2d}] 1 buňka: '{tds[0].get_text(strip=True)[:80]}'")
        continue

    # Two or more cells: the first is printed as the label, the second as a
    # truncated value preview.
    label = tds[0].get_text(strip=True)
    preview = tds[1].get_text(strip=True)[:60]
    print(f" [{i:2d}] label='{label}' | hodnota='{preview}'")
# --- Also try <tbody> ---
tbody = main_table.find("tbody")
if tbody:
    tbody_trs = tbody.find_all("tr", recursive=False)
    print(f"\nTabulka má <tbody> s {len(tbody_trs)} přímými <tr>")
    print("--- Labely v tbody řádcích ---")
    # Only the first five rows are previewed.
    for i, row in enumerate(tbody_trs[:5]):
        tds = row.find_all("td", recursive=False)
        if len(tds) < 2:
            # Rows without a label/value pair are not printed here.
            continue
        label = tds[0].get_text(strip=True)
        preview = tds[1].get_text(strip=True)[:60]
        print(f" [{i:2d}] label='{label}' | hodnota='{preview}'")