z230
This commit is contained in:
@@ -0,0 +1,80 @@
|
||||
"""
|
||||
Ladění parseru detailu. Stáhne HTML jednoho výkonu, uloží na disk
|
||||
a vypíše co parser vidí (tabulky, řádky, labely).
|
||||
|
||||
Spuštění:
|
||||
python debug_detail.py
|
||||
python debug_detail.py 09581
|
||||
"""
|
||||
|
||||
import sys
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from pathlib import Path
|
||||
|
||||
# Procedure number to inspect: the first command-line argument when one is
# given, otherwise a built-in default.
CISLO = sys.argv[1] if len(sys.argv) > 1 else "01021"

# Root of the site the detail page is fetched from.
BASE_URL = "https://szv.mzd.gov.cz"

# Headers sent with the download request: a desktop Chrome User-Agent string
# and an Accept-Language that prefers Czech over English.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "cs,en;q=0.9",
}
|
||||
|
||||
# Fetch the detail page for the selected procedure number.
url = f"{BASE_URL}/Vykon/Detail/{CISLO}/"
print(f"Stahuji: {url}")
resp = requests.get(url, headers=HEADERS, timeout=30)

# Surface HTTP errors instead of silently treating an error page as a detail
# page. This is only a warning: the body is still saved and analysed below so
# that whatever the server returned can be inspected.
if not resp.ok:
    print(f"VAROVÁNÍ: server vrátil HTTP {resp.status_code}")

# Decode the body with the encoding detected from its content, falling back
# to UTF-8 when no encoding could be detected.
resp.encoding = resp.apparent_encoding or "utf-8"

# Save the HTML to disk, next to this script, so it can be opened by hand.
html_path = Path(__file__).parent / f"debug_{CISLO}.html"
html_path.write_text(resp.text, encoding="utf-8")
print(f"HTML uloženo: {html_path}")
|
||||
|
||||
soup = BeautifulSoup(resp.text, "lxml")

# --- How many tables are on the page? ---
all_tables = soup.find_all("table")
print(f"\nPočet <table> na stránce: {len(all_tables)}")

# --- What does the current selector find? ---
# Each lookup runs exactly once; the same results drive both the report lines
# and the choice of the main table (selector match first, then the fallback).
container_table = soup.select_one("div.container table")
first_table = soup.find("table")
main_table = container_table or first_table
print(f"Selektor 'div.container table': {bool(container_table)}")
print(f"Fallback soup.find('table'): {bool(first_table)}")

# Nothing to inspect without a table: report it and stop with a non-zero
# exit status.
if not main_table:
    print("ŽÁDNÁ TABULKA NENALEZENA!")
    sys.exit(1)
|
||||
|
||||
# --- Direct <tr> children of the main table ---
# recursive=False restricts the search to immediate children, so rows of
# nested tables (or rows wrapped in another element) are not counted here.
direct_trs = main_table.find_all("tr", recursive=False)
print(f"\nPřímých <tr> v hlavní tabulce: {len(direct_trs)}")

print("\n--- Labely v přímých řádcích ---")
for i, tr in enumerate(direct_trs):
    tds = tr.find_all("td", recursive=False)
    if len(tds) >= 2:
        # Two or more cells: the first is reported as the label, the second
        # as the value (truncated to 60 characters).
        label = tds[0].get_text(strip=True)
        preview = tds[1].get_text(strip=True)[:60]
        print(f"  [{i:2d}] label='{label}' | hodnota='{preview}'")
    elif len(tds) == 1:
        # A single cell: show its text, truncated to 80 characters.
        print(f"  [{i:2d}] 1 buňka: '{tds[0].get_text(strip=True)[:80]}'")
    else:
        # No <td> cells: report header cells if there are any; rows with
        # neither <td> nor <th> children print nothing.
        ths = tr.find_all("th", recursive=False)
        if ths:
            print(f"  [{i:2d}] <th>: {[th.get_text(strip=True) for th in ths]}")
|
||||
|
||||
# --- Also try <tbody> ---
# Looks for a <tbody> inside the main table and, if there is one, reports the
# rows that are its immediate children.
tbody = main_table.find("tbody")
if tbody:
    tbody_trs = tbody.find_all("tr", recursive=False)
    print(f"\nTabulka má <tbody> s {len(tbody_trs)} přímými <tr>")
    print("--- Labely v tbody řádcích ---")
    # Only the first five rows are listed, and only those with two or more
    # direct <td> cells (first cell = label, second = value preview).
    for i, tr in enumerate(tbody_trs[:5]):
        tds = tr.find_all("td", recursive=False)
        if len(tds) >= 2:
            label = tds[0].get_text(strip=True)
            preview = tds[1].get_text(strip=True)[:60]
            print(f"  [{i:2d}] label='{label}' | hodnota='{preview}'")
|
||||
Reference in New Issue
Block a user