"""
Update seeders/leechers for torrents stored in the local `torrents`
database by scraping the sktorrent.eu listing pages, oldest first.
"""
import pymysql
import requests
import json
import time
import re
import sys
from bs4 import BeautifulSoup
from datetime import datetime

# ============================================================
# CONFIG
# ============================================================
COOKIE_FILE = "sktorrent_cookies.json"
BASE_URL = "https://sktorrent.eu/torrent/torrents.php?active=0&category=24&order=data&by=DESC"
SLEEP_BETWEEN_PAGES = 2.0   # seconds between pages (so the site does not block us)
MAX_PAGES = 300             # safety cap: the script stops here at the latest

# How many consecutive pages without a single DB match = stop (we reached the new torrents)
STOP_AFTER_EMPTY_PAGES = 5

# How many consecutive 403 errors = abort (the site is blocking us)
STOP_AFTER_403 = 3

DB_CONFIG = {
    "host": "192.168.1.76",
    "port": 3306,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": True,
}

# ============================================================
# CONNECT
# ============================================================
def connect_db():
    return pymysql.connect(**DB_CONFIG)


def build_session():
    with open(COOKIE_FILE, "r", encoding="utf-8") as f:
        cookies = json.load(f)
    session = requests.Session()
    session.headers["User-Agent"] = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    )
    for c in cookies:
        session.cookies.set(c["name"], c["value"], domain=c.get("domain", ""))
    return session
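# ------------------------------------------------------------
# Note on COOKIE_FILE: build_session() expects the JSON format
# produced by Selenium's driver.get_cookies(), i.e. a list of
# {"name": ..., "value": ..., "domain": ...} dicts. A minimal
# sketch of the export step, assuming a manual login (the login
# URL is inferred from BASE_URL and the login.php check in
# main(); adjust if needed):
#
#   from selenium import webdriver
#   import json
#
#   driver = webdriver.Chrome()
#   driver.get("https://sktorrent.eu/torrent/login.php")
#   input("Log in in the browser window, then press Enter...")
#   with open("sktorrent_cookies.json", "w", encoding="utf-8") as f:
#       json.dump(driver.get_cookies(), f)
#   driver.quit()
# ------------------------------------------------------------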
# ============================================================
# PARSE ONE PAGE
# ============================================================
def parse_page(html):
    """
    Return a list of dicts: {hash, seeders, leechers}
    """
    soup = BeautifulSoup(html, "html.parser")
    results = []
    for row in soup.select("table tr"):
        cells = row.find_all("td")
        if len(cells) != 7:
            continue

        # td[1] must contain a download.php?id= link
        dl_link = cells[1].find("a", href=re.compile(r"download\.php\?id="))
        if not dl_link:
            continue

        match = re.search(r"id=([a-f0-9]+)", dl_link["href"])
        if not match:
            continue
        torrent_hash = match.group(1).lower()

        # seeders = td[4], leechers = td[5]
        seeders_text = cells[4].get_text(strip=True)
        leechers_text = cells[5].get_text(strip=True)
        try:
            seeders = int(seeders_text)
        except ValueError:
            seeders = 0
        try:
            leechers = int(leechers_text)
        except ValueError:
            leechers = 0

        results.append({
            "hash": torrent_hash,
            "seeders": seeders,
            "leechers": leechers,
        })
    return results


# ============================================================
# MAIN
# ============================================================
def main():
    sys.stdout.reconfigure(encoding="utf-8")
    print("=" * 60)
    print("SEEDERS / LEECHERS UPDATE: sktorrent.eu")
    print(f"Started: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print("=" * 60)

    session = build_session()
    db = connect_db()
    cursor = db.cursor()

    # Find the highest page number
    r0 = session.get(f"{BASE_URL}&page=0", timeout=15)
    all_page_nums = [int(m.group(1)) for m in re.finditer(r"page=(\d+)", r0.text)]
    max_page = max(all_page_nums) if all_page_nums else MAX_PAGES
    max_page = min(max_page, MAX_PAGES)   # enforce the safety cap
    print(f"Highest page on the site: {max_page}")
    print(f"Walking from page {max_page} downwards...\n")

    total_pages = 0
    total_parsed = 0
    total_updated = 0
    total_skipped = 0
    consecutive_empty = 0   # consecutive pages without a single DB match
    consecutive_403 = 0     # consecutive 403 errors

    # Walk from the oldest page (the end) towards the newest (the front)
    for page in range(max_page, -1, -1):
        url = f"{BASE_URL}&page={page}"
        try:
            r = session.get(url, timeout=15)
            r.raise_for_status()
            consecutive_403 = 0   # reset after a success
        except requests.exceptions.HTTPError as e:
            if e.response is not None and e.response.status_code == 403:
                consecutive_403 += 1
                print(f"⚠️ Page {page}: 403 Forbidden ({consecutive_403}/{STOP_AFTER_403})")
                if consecutive_403 >= STOP_AFTER_403:
                    print(f"\n🛑 {STOP_AFTER_403}× 403 in a row, the site is blocking us, aborting.")
                    break
                time.sleep(5)   # pause after a 403
            else:
                print(f"⚠️ Page {page}: error: {e}")
            continue
        except Exception as e:
            print(f"⚠️ Page {page}: error: {e}")
            continue

        # "Prihlas sa" is the site's login prompt; keep the literal as-is
        if "login.php" in r.url or "Prihlas sa" in r.text:
            print("❌ Cookies have expired, you need to log in again (run the Selenium script)")
            break

        rows = parse_page(r.text)
        if not rows:
            print(f"  Page {page:3d} → empty, end of pagination.")
            break

        total_pages += 1
        total_parsed += len(rows)
        page_updated = 0

        for item in rows:
            cursor.execute("""
                UPDATE torrents
                SET seeders = %s, leechers = %s, qb_last_update = NOW()
                WHERE torrent_hash = %s
            """, (item["seeders"], item["leechers"], item["hash"]))
            if cursor.rowcount > 0:
                total_updated += 1
                page_updated += 1
            else:
                total_skipped += 1

        print(f"  Page {page:3d} → {len(rows):2d} torrents, "
              f"updated: {page_updated:2d} (total: {total_updated})")

        # Stop once we have reached the region of newer torrents (not in the DB)
        if page_updated == 0:
            consecutive_empty += 1
            if consecutive_empty >= STOP_AFTER_EMPTY_PAGES:
                print(f"\n⏹ {STOP_AFTER_EMPTY_PAGES} pages in a row without a match → "
                      f"reached newer torrents that are not in the DB. Stopping.")
                break
        else:
            consecutive_empty = 0

        time.sleep(SLEEP_BETWEEN_PAGES)

    # ============================================================
    # SUMMARY
    # ============================================================
    print()
    print("=" * 60)
    print(f"Finished: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print(f"Pages processed    : {total_pages}")
    print(f"Rows parsed        : {total_parsed}")
    print(f"DB rows updated    : {total_updated}")
    print(f"Not in the DB      : {total_skipped}")
    print("=" * 60)

    cursor.close()
    db.close()


if __name__ == "__main__":
    main()
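# ------------------------------------------------------------
# Assumed schema of the `torrents` table targeted by the UPDATE
# in main(). Only the column names come from this script; the
# types and key below are guesses:
#
#   CREATE TABLE torrents (
#       torrent_hash   VARCHAR(64) PRIMARY KEY,
#       seeders        INT NOT NULL DEFAULT 0,
#       leechers       INT NOT NULL DEFAULT 0,
#       qb_last_update DATETIME NULL
#   );
# ------------------------------------------------------------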