Add 95 IncrementalImport.py — incremental torrent scraper without Selenium

This commit is contained in:
2026-03-01 11:58:22 +01:00
parent d57f7d75ce
commit 6b8728360c
2 changed files with 308 additions and 0 deletions

286
95 IncrementalImport.py Normal file
View File

@@ -0,0 +1,286 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Incremental import — sktorrent.eu
- Walks torrents from newest to oldest
- Downloads and stores .torrent files for new records
- Stops as soon as it hits a torrent that is already in the DB
- No Selenium required — requests + BeautifulSoup + cookies suffice
"""
import pymysql
import requests
import json
import time
import re
import sys
from bs4 import BeautifulSoup
from pathlib import Path
from datetime import datetime
import urllib.parse as urlparse
# ============================================================
# CONFIG
# ============================================================
# Login cookies exported by the (separate) Selenium sign-in script.
COOKIE_FILE = Path("sktorrent_cookies.json")
# Listing URL ordered newest-first (order=data, by=DESC), category 24.
BASE_URL = (
    "https://sktorrent.eu/torrent/torrents.php"
    "?active=0&category=24&order=data&by=DESC"
)
SLEEP_BETWEEN_PAGES = 2.0  # pause between listing pages (politeness delay)
SLEEP_BEFORE_DOWNLOAD = 1.5  # pause before downloading each .torrent file
# NOTE(review): credentials are hard-coded and committed to the repository —
# move them to environment variables or a secrets file.
DB_CONFIG = {
    "host": "192.168.1.76",
    "port": 3306,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": True,  # every execute() commits immediately
}
# ============================================================
# CONNECT
# ============================================================
def connect_db():
    """Open and return a fresh MySQL connection from module-level DB_CONFIG."""
    connection = pymysql.connect(**DB_CONFIG)
    return connection
def build_session():
    """Create a requests session preloaded with the saved login cookies.

    Raises:
        FileNotFoundError: when the exported cookie file is missing.
    """
    if not COOKIE_FILE.exists():
        raise FileNotFoundError(f"Cookie soubor nenalezen: {COOKIE_FILE}")
    cookie_list = json.loads(COOKIE_FILE.read_text(encoding="utf-8"))
    session = requests.Session()
    # A browser-like UA; some trackers reject the default requests UA.
    session.headers["User-Agent"] = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    )
    for cookie in cookie_list:
        session.cookies.set(
            cookie["name"], cookie["value"], domain=cookie.get("domain", "")
        )
    return session
# ============================================================
# PARSE ONE LISTING PAGE
# ============================================================
def parse_page(html):
    """Parse one listing page into a list of per-torrent dicts.

    An empty list signals the end of pagination or an unexpected layout.
    """
    def _absolute(link):
        # The listing uses relative links; prefix the site root when needed.
        if link.startswith("http"):
            return link
        return "https://sktorrent.eu/torrent/" + link

    def _to_int(cell):
        # Seeder/leecher cells may hold non-numeric text; treat that as 0.
        try:
            return int(cell.get_text(strip=True))
        except ValueError:
            return 0

    soup = BeautifulSoup(html, "html.parser")
    parsed_rows = []
    for tr in soup.select("table tr"):
        tds = tr.find_all("td")
        # Torrent rows have exactly 7 cells; skip headers/spacers.
        if len(tds) != 7:
            continue
        # td[1] — download link: download.php?id=<hash>&f=<filename>
        dl_anchor = tds[1].find("a", href=re.compile(r"download\.php\?id="))
        if dl_anchor is None:
            continue
        download_url = _absolute(dl_anchor["href"])
        hash_match = re.search(r"id=([a-f0-9A-F]+)", download_url)
        if hash_match is None:
            continue
        query = urlparse.parse_qs(urlparse.urlparse(download_url).query)
        filename = query.get("f", ["unknown.torrent"])[0]
        # td[2] — title, details link, plus "Velkost"/"Pridany" text
        title_anchor = tds[2].find("a", href=re.compile(r"details\.php\?id="))
        if title_anchor is None:
            continue
        visible_title = title_anchor.get_text(strip=True)
        details_url = _absolute(title_anchor["href"])
        cell_text = tds[2].get_text(" ", strip=True)
        size_m = re.search(
            r"Velkost\s+([\d\.,]+\s*[KMG]B)", cell_text, re.IGNORECASE
        )
        added_m = re.search(
            r"Pridany\s+(\d+/\d+/\d+)\s+(?:o\s+)?(\d+:\d+)",
            cell_text,
            re.IGNORECASE,
        )
        added_mysql = None
        if added_m:
            try:
                # Site shows D/M/Y; convert to MySQL DATETIME order.
                day, month, year = added_m.group(1).split("/")
                added_mysql = f"{year}-{month}-{day} {added_m.group(2)}:00"
            except Exception:
                pass  # malformed date text → leave NULL
        parsed_rows.append({
            "torrent_hash": hash_match.group(1).lower(),
            "download_url": download_url,
            "details_link": details_url,
            "torrent_filename": filename,
            "category": tds[0].get_text(strip=True),
            "title_visible": visible_title,
            # Full title lives in the anchor's title attr when present.
            "title_full": title_anchor.get("title", visible_title),
            "size_pretty": size_m.group(1).strip() if size_m else None,
            "added_datetime": added_mysql,
            "seeders": _to_int(tds[4]),
            "leechers": _to_int(tds[5]),
        })
    return parsed_rows
# ============================================================
# DOWNLOAD .TORRENT FILE
# ============================================================
def download_torrent(session, url):
    """Fetch one .torrent file; return its raw bytes, or None on failure.

    Payloads shorter than 20 bytes are rejected as well — they are
    error pages / empty responses rather than real torrent files.
    """
    try:
        response = session.get(url, timeout=15)
        response.raise_for_status()
        body = response.content
        if len(body) < 20:
            return None
        return body
    except Exception as exc:
        print(f" ⚠️ Stažení selhalo: {exc}")
        return None
# ============================================================
# DB INSERT
# ============================================================
INSERT_SQL = """
INSERT INTO torrents (
torrent_hash, details_link, download_url, category,
title_visible, title_full, size_pretty, added_datetime,
seeders, leechers, torrent_filename, torrent_content
) VALUES (
%(torrent_hash)s, %(details_link)s, %(download_url)s, %(category)s,
%(title_visible)s, %(title_full)s, %(size_pretty)s, %(added_datetime)s,
%(seeders)s, %(leechers)s, %(torrent_filename)s, %(torrent_content)s
)
ON DUPLICATE KEY UPDATE
seeders = VALUES(seeders),
leechers = VALUES(leechers),
download_url = VALUES(download_url),
torrent_content = COALESCE(VALUES(torrent_content), torrent_content)
"""
# ============================================================
# MAIN
# ============================================================
def main():
    """Incrementally import new torrents from sktorrent.eu.

    Walks listing pages newest-first, downloads the .torrent payload for
    each row not yet in the DB, and stops at the first hash that already
    exists (everything older was imported by a previous run).

    Fix: the DB connection/cursor were never closed when an exception
    escaped the scrape loop — cleanup now runs in a try/finally.
    """
    sys.stdout.reconfigure(encoding="utf-8")
    print("=" * 60)
    print("INCREMENTAL IMPORT — sktorrent.eu")
    print(f"Spuštěno: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print("Pořadí: nejnovější → nejstarší | stop při první shodě")
    print("=" * 60)
    session = build_session()
    db = connect_db()
    cursor = db.cursor()
    new_count = 0
    page = 0
    stop = False
    try:
        while not stop:
            url = f"{BASE_URL}&page={page}"
            try:
                r = session.get(url, timeout=15)
                r.raise_for_status()
            except Exception as e:
                print(f"⚠️ Stránka {page} — chyba: {e}")
                break
            # A redirect to login.php or a visible login prompt means the
            # saved cookies are no longer valid.
            if "login.php" in r.url or "Prihlas sa" in r.text:
                print("❌ Cookies expiraly — spusť přihlašovací Selenium skript a obnov cookies.")
                break
            rows = parse_page(r.text)
            if not rows:
                print(f" Stránka {page} — žádné záznamy, konec.")
                break
            print(f"\n📄 Stránka {page} ({len(rows)} torrentů)")
            for item in rows:
                # Stop at the first hash already present in the DB.
                cursor.execute(
                    "SELECT 1 FROM torrents WHERE torrent_hash = %s",
                    (item["torrent_hash"],)
                )
                if cursor.fetchone():
                    print(f" ⏹ Již v DB: {item['title_visible']} → zastavuji import.")
                    stop = True
                    break
                # New torrent — fetch the .torrent payload.
                print(f" ⬇️ Nový: {item['title_visible']}")
                time.sleep(SLEEP_BEFORE_DOWNLOAD)
                content = download_torrent(session, item["download_url"])
                if content:
                    print(f" ✔ Staženo ({len(content):,} B)")
                else:
                    print(f" ✖ Nepodařilo se stáhnout, ukládám bez obsahu")
                # Metadata is stored even when the download failed; the
                # upsert's COALESCE can fill the payload on a later run.
                item["torrent_content"] = content
                cursor.execute(INSERT_SQL, item)
                new_count += 1
            if not stop:
                page += 1
                time.sleep(SLEEP_BETWEEN_PAGES)
    finally:
        # Always release DB resources, even on an unexpected error
        # (previously leaked the connection in that case).
        cursor.close()
        db.close()
    # ============================================================
    # SUMMARY
    # ============================================================
    print()
    print("=" * 60)
    print(f"Hotovo: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print(f"Nových torrentů uloženo : {new_count}")
    print(f"Stránek prošlo : {page}")
    print("=" * 60)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,22 @@
[
{
"name": "uid",
"value": "646071",
"domain": "sktorrent.eu",
"path": "/",
"expires": 1798003565.462807,
"httpOnly": false,
"secure": false,
"sameSite": "Lax"
},
{
"name": "pass",
"value": "91df6b497860582e09a7b333569d0187",
"domain": "sktorrent.eu",
"path": "/",
"expires": 1798003565.463191,
"httpOnly": false,
"secure": false,
"sameSite": "Lax"
}
]