"""
Debug helper for the detail-page parser.

Downloads the HTML of a single procedure ("výkon") detail page, saves it to
disk next to this script and prints what the parser sees (tables, rows,
labels).

Usage:
    python debug_detail.py
    python debug_detail.py 09581
"""

import sys
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# Procedure number to inspect: first CLI argument, otherwise a fixed default.
CISLO = sys.argv[1] if len(sys.argv) > 1 else "01021"
BASE_URL = "https://szv.mzd.gov.cz"
# Browser-like headers sent with the download request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "cs,en;q=0.9",
}

# How many <tbody> rows are previewed in the final section.
TBODY_PREVIEW_ROWS = 5


def _describe_row(index, tr, *, pairs_only=False):
    """Return the one-line description printed for table row *tr*.

    Args:
        index: Position of the row within the list being printed.
        tr: A BeautifulSoup ``<tr>`` tag; only its direct children are
            inspected.
        pairs_only: When true, only rows with at least two direct ``<td>``
            cells are described (label/value pairs); everything else is
            skipped.

    Returns:
        The formatted line, or ``None`` when the row produces no output.
    """
    tds = tr.find_all("td", recursive=False)
    if len(tds) >= 2:
        label = tds[0].get_text(strip=True)
        preview = tds[1].get_text(strip=True)[:60]
        return f" [{index:2d}] label='{label}' | hodnota='{preview}'"
    if pairs_only:
        return None
    if len(tds) == 1:
        return f" [{index:2d}] 1 buňka: '{tds[0].get_text(strip=True)[:80]}'"
    # No <td> at all: fall back to header cells, if the row has any.
    ths = tr.find_all("th", recursive=False)
    if ths:
        return f" [{index:2d}] <th>: {[th.get_text(strip=True) for th in ths]}"
    return None


def _print_rows(rows, *, pairs_only=False):
    """Print the description of every row in *rows* that has one."""
    for i, tr in enumerate(rows):
        line = _describe_row(i, tr, pairs_only=pairs_only)
        if line is not None:
            print(line)


def main(cislo=CISLO):
    """Download one detail page, save it and print a structural summary.

    Args:
        cislo: Procedure number used in the URL and in the output file name.

    Returns:
        Process exit code: 0 on success, 1 when the download fails, the
        server answers with an HTTP error status, or no table is found.
    """
    url = f"{BASE_URL}/Vykon/Detail/{cislo}/"
    print(f"Stahuji: {url}")
    try:
        resp = requests.get(url, headers=HEADERS, timeout=30)
    except requests.RequestException as exc:
        # Connection errors, timeouts etc.: report briefly instead of a traceback.
        print(f"Stažení selhalo: {exc}", file=sys.stderr)
        return 1
    resp.encoding = resp.apparent_encoding or "utf-8"

    # Save the HTML to disk (even for error responses, so it can be inspected).
    html_path = Path(__file__).parent / f"debug_{cislo}.html"
    html_path.write_text(resp.text, encoding="utf-8")
    print(f"HTML uloženo: {html_path}")

    if not resp.ok:
        # Do not analyse an error page as if it were the detail page.
        print(
            f"Server vrátil HTTP {resp.status_code} – uložené HTML zřejmě není detail výkonu.",
            file=sys.stderr,
        )
        return 1

    soup = BeautifulSoup(resp.text, "lxml")

    # --- How many tables are on the page? ---
    all_tables = soup.find_all("table")
    print(f"\nPočet <table> na stránce: {len(all_tables)}")

    # --- What does the current selector find? ---
    container_table = soup.select_one("div.container table")
    first_table = soup.find("table")
    main_table = container_table or first_table
    print(f"Selektor 'div.container table': {bool(container_table)}")
    print(f"Fallback soup.find('table'): {bool(first_table)}")

    if not main_table:
        print("ŽÁDNÁ TABULKA NENALEZENA!")
        return 1

    # --- Direct <tr> children of the main table ---
    direct_trs = main_table.find_all("tr", recursive=False)
    print(f"\nPřímých <tr> v hlavní tabulce: {len(direct_trs)}")

    print("\n--- Labely v přímých řádcích ---")
    _print_rows(direct_trs)

    # --- Also try <tbody> ---
    tbody = main_table.find("tbody")
    if tbody:
        tbody_trs = tbody.find_all("tr", recursive=False)
        print(f"\nTabulka má <tbody> s {len(tbody_trs)} přímými <tr>")
        print("--- Labely v tbody řádcích ---")
        # Only a short preview of label/value rows is printed here.
        _print_rows(tbody_trs[:TBODY_PREVIEW_ROWS], pairs_only=True)

    return 0


if __name__ == "__main__":
    sys.exit(main())