Add Seedbox/60 AktualizaceSeeders.py — scrape seeders/leechers from sktorrent.eu

2026-03-01 11:45:43 +01:00
parent 0710af0f82
commit d57f7d75ce
1 changed files with 220 additions and 0 deletions
@@ -0,0 +1,220 @@
+import json
+import os
+import re
+import sys
+import time
+from datetime import datetime
+
+import pymysql
+import requests
+from bs4 import BeautifulSoup
+
+# ============================================================
+# CONFIG
+# ============================================================
+
+# JSON file holding the site cookies loaded by build_session()
+COOKIE_FILE = "sktorrent_cookies.json"
+
+# Listing URL; the scraper appends "&page=N" to it
+BASE_URL = "https://sktorrent.eu/torrent/torrents.php?active=0&category=24&order=data&by=DESC"
+
+SLEEP_BETWEEN_PAGES = 2.0   # seconds between pages (the site does not block us)
+MAX_PAGES = 300              # fallback start page when no "page=N" links are found
+
+# Number of consecutive pages without a single DB match = stop
+# (we have reached torrents that are not in the DB)
+STOP_AFTER_EMPTY_PAGES = 5
+# Number of consecutive 403 errors = abort (the site is blocking us)
+STOP_AFTER_403 = 3
+
+# Every connection setting can be overridden through an environment variable;
+# the defaults are the previous hard-coded values, so existing setups keep working.
+# NOTE(review): a root password is committed here as the default — rotate it and
+# supply it only through SKTORRENT_DB_PASSWORD, then drop the default.
+DB_CONFIG = {
+    "host": os.environ.get("SKTORRENT_DB_HOST", "192.168.1.76"),
+    "port": int(os.environ.get("SKTORRENT_DB_PORT", "3306")),
+    "user": os.environ.get("SKTORRENT_DB_USER", "root"),
+    "password": os.environ.get("SKTORRENT_DB_PASSWORD", "Vlado9674+"),
+    "database": os.environ.get("SKTORRENT_DB_NAME", "torrents"),
+    "charset": "utf8mb4",
+    "autocommit": True,
+}
+
+# ============================================================
+# CONNECT
+# ============================================================
+
+def connect_db():
+    """Open and return a new pymysql connection built from ``DB_CONFIG``."""
+    connection = pymysql.connect(**DB_CONFIG)
+    return connection
+
+
+def build_session():
+    """Create a ``requests`` session carrying the cookies stored in ``COOKIE_FILE``.
+
+    The file is read as JSON and iterated as a sequence of objects; each one
+    must have ``name`` and ``value`` and may have ``domain`` (empty string is
+    used when it is missing).
+
+    Returns:
+        A ``requests.Session`` with a browser-like User-Agent and the cookies set.
+    """
+    http = requests.Session()
+    http.headers.update({
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+    })
+
+    with open(COOKIE_FILE, "r", encoding="utf-8") as cookie_fh:
+        stored_cookies = json.load(cookie_fh)
+
+    for entry in stored_cookies:
+        cookie_domain = entry.get("domain", "")
+        http.cookies.set(entry["name"], entry["value"], domain=cookie_domain)
+
+    return http
+
+
+# ============================================================
+# PARSE ONE PAGE
+# ============================================================
+
+def _parse_count(text):
+    """Convert a table-cell string to an int; return 0 if it is not an integer."""
+    try:
+        return int(text)
+    except ValueError:
+        return 0
+
+
+def parse_page(html):
+    """Extract the torrent id and peer counts from one listing page.
+
+    Args:
+        html: HTML text of a listing page.
+
+    Returns:
+        A list of dicts with the keys ``hash`` (hex id taken from the
+        ``download.php?id=...`` link, lower-cased), ``seeders`` and
+        ``leechers`` (ints; 0 when the cell text is not an integer).
+    """
+    # Compiled once per call instead of once per row. IGNORECASE so that an
+    # upper- or mixed-case hex id is captured whole rather than skipped or
+    # truncated at the first upper-case character; it is lower-cased below.
+    download_re = re.compile(r"download\.php\?id=([0-9a-f]+)", re.IGNORECASE)
+
+    soup = BeautifulSoup(html, "html.parser")
+    results = []
+
+    for row in soup.select("table tr"):
+        cells = row.find_all("td")
+        # Only rows with exactly 7 cells are treated as torrent rows
+        if len(cells) != 7:
+            continue
+
+        # td[1] must contain a download.php?id=<hash> link
+        dl_link = cells[1].find("a", href=download_re)
+        if not dl_link:
+            continue
+
+        match = download_re.search(dl_link["href"])
+        if not match:
+            continue
+
+        # seeders = td[4], leechers = td[5]
+        results.append({
+            "hash":     match.group(1).lower(),
+            "seeders":  _parse_count(cells[4].get_text(strip=True)),
+            "leechers": _parse_count(cells[5].get_text(strip=True)),
+        })
+
+    return results
+
+
+# ============================================================
+# MAIN
+# ============================================================
+
+def _scrape_and_update(session, cursor):
+    """Walk the listing from the last page to page 0 and update peer counts.
+
+    For every parsed row an UPDATE on ``torrents`` (matched by
+    ``torrent_hash``) sets ``seeders``, ``leechers`` and ``qb_last_update``.
+    The walk ends early when: STOP_AFTER_403 consecutive 403 responses are
+    seen, the response looks like the login page, a page yields no rows, or
+    STOP_AFTER_EMPTY_PAGES consecutive pages update nothing.
+
+    Args:
+        session: ``requests`` session used for all page fetches.
+        cursor: open DB cursor used for the UPDATE statements.
+
+    Returns:
+        Tuple ``(total_pages, total_parsed, total_updated, total_skipped)``.
+    """
+    # Find the highest page number linked from the first page
+    r0 = session.get(f"{BASE_URL}&page=0", timeout=15)
+    all_page_nums = [int(m.group(1)) for m in re.finditer(r"page=(\d+)", r0.text)]
+    max_page = max(all_page_nums, default=MAX_PAGES)
+    print(f"Max stránka na webu: {max_page}")
+    print(f"Prochází od stránky {max_page} směrem dolů...\n")
+
+    total_pages        = 0
+    total_parsed       = 0
+    total_updated      = 0
+    total_skipped      = 0
+    consecutive_empty  = 0   # consecutive pages without a single DB match
+    consecutive_403    = 0   # consecutive 403 errors
+
+    # Iterate from the oldest page (the end) towards the newest (the start)
+    for page in range(max_page, -1, -1):
+
+        url = f"{BASE_URL}&page={page}"
+        try:
+            r = session.get(url, timeout=15)
+            r.raise_for_status()
+        except requests.exceptions.HTTPError as e:
+            if e.response is not None and e.response.status_code == 403:
+                consecutive_403 += 1
+                print(f"⚠️ Stránka {page} — 403 Forbidden ({consecutive_403}/{STOP_AFTER_403})")
+                if consecutive_403 >= STOP_AFTER_403:
+                    print(f"\n🛑 {STOP_AFTER_403}× 403 za sebou — web nás blokuje, přerušuji.")
+                    break
+                time.sleep(5)   # pause after a 403
+            else:
+                print(f"⚠️ Stránka {page} — chyba: {e}")
+                # Keep the normal pacing so a failing server is not hit back-to-back
+                time.sleep(SLEEP_BETWEEN_PAGES)
+            continue
+        except requests.exceptions.RequestException as e:
+            print(f"⚠️ Stránka {page} — chyba: {e}")
+            time.sleep(SLEEP_BETWEEN_PAGES)
+            continue
+
+        consecutive_403 = 0   # reset after a successful fetch
+
+        # Being sent to the login page means the stored cookies no longer work
+        if "login.php" in r.url or "Prihlas sa" in r.text:
+            print("❌ Cookies expiraly — je potřeba se znovu přihlásit (spusť Selenium skript)")
+            break
+
+        rows = parse_page(r.text)
+
+        if not rows:
+            print(f"  Stránka {page:3d} → prázdná, konec paginace.")
+            break
+
+        total_pages  += 1
+        total_parsed += len(rows)
+        page_updated  = 0
+
+        for item in rows:
+            cursor.execute("""
+                UPDATE torrents
+                SET
+                    seeders        = %s,
+                    leechers       = %s,
+                    qb_last_update = NOW()
+                WHERE torrent_hash = %s
+            """, (item["seeders"], item["leechers"], item["hash"]))
+
+            if cursor.rowcount > 0:
+                total_updated += 1
+                page_updated  += 1
+            else:
+                total_skipped += 1
+
+        print(f"  Stránka {page:3d} → {len(rows):2d} torrentů, "
+              f"updatováno: {page_updated:2d}  (celkem: {total_updated})")
+
+        # Stop once we have reached the newer torrents that are not in the DB
+        if page_updated == 0:
+            consecutive_empty += 1
+            if consecutive_empty >= STOP_AFTER_EMPTY_PAGES:
+                print(f"\n⏹ {STOP_AFTER_EMPTY_PAGES} stránek po sobě bez shody → "
+                      f"dorazili jsme k novějším torrentům, které nejsou v DB. Konec.")
+                break
+        else:
+            consecutive_empty = 0
+
+        time.sleep(SLEEP_BETWEEN_PAGES)
+
+    return total_pages, total_parsed, total_updated, total_skipped
+
+
+def main():
+    """Scrape seeders/leechers from the listing pages and store them in the DB.
+
+    Prints a banner, runs the page walk, then prints a summary of the
+    counters. The DB connection, its cursor and the HTTP session are
+    released even when the run fails with an exception.
+    """
+    sys.stdout.reconfigure(encoding="utf-8")
+
+    print("=" * 60)
+    print("AKTUALIZACE SEEDERS / LEECHERS — sktorrent.eu")
+    print(f"Spuštěno: {datetime.now():%Y-%m-%d %H:%M:%S}")
+    print("=" * 60)
+
+    session = build_session()
+    try:
+        db = connect_db()
+        try:
+            with db.cursor() as cursor:
+                (total_pages, total_parsed,
+                 total_updated, total_skipped) = _scrape_and_update(session, cursor)
+        finally:
+            db.close()
+    finally:
+        session.close()
+
+    # ============================================================
+    # SUMMARY
+    # ============================================================
+    print()
+    print("=" * 60)
+    print(f"Hotovo: {datetime.now():%Y-%m-%d %H:%M:%S}")
+    print(f"Stránek zpracováno : {total_pages}")
+    print(f"Záznamů parsováno  : {total_parsed}")
+    print(f"DB řádků updatováno: {total_updated}")
+    print(f"Nebylo v DB        : {total_skipped}")
+    print("=" * 60)
+
+
+# Run the scraper only when the file is executed as a script, not on import
+if __name__ == "__main__":
+    main()