From 6b8728360cab3022c753530f1c525fc31ea6bfd4 Mon Sep 17 00:00:00 2001 From: Vladimir Buzalka Date: Sun, 1 Mar 2026 11:58:22 +0100 Subject: [PATCH] =?UTF-8?q?Add=2095=20IncrementalImport.py=20=E2=80=94=20i?= =?UTF-8?q?ncremental=20torrent=20scraper=20without=20Selenium?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 95 IncrementalImport.py | 286 +++++++++++++++++++++++++++++++++ Seedbox/sktorrent_cookies.json | 22 +++ 2 files changed, 308 insertions(+) create mode 100644 95 IncrementalImport.py create mode 100644 Seedbox/sktorrent_cookies.json diff --git a/95 IncrementalImport.py b/95 IncrementalImport.py new file mode 100644 index 0000000..f370502 --- /dev/null +++ b/95 IncrementalImport.py @@ -0,0 +1,286 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Incremental import — sktorrent.eu +- Prochází od nejnovějších torrentů +- Stahuje a ukládá .torrent soubory pro nové záznamy +- Zastaví se, jakmile narazí na torrent, který už v DB máme +- Nevyžaduje Selenium — stačí requests + BeautifulSoup + cookies +""" + +import pymysql +import requests +import json +import time +import re +import sys +from bs4 import BeautifulSoup +from pathlib import Path +from datetime import datetime +import urllib.parse as urlparse + +# ============================================================ +# CONFIG +# ============================================================ + +COOKIE_FILE = Path("sktorrent_cookies.json") + +BASE_URL = ( + "https://sktorrent.eu/torrent/torrents.php" + "?active=0&category=24&order=data&by=DESC" +) + +SLEEP_BETWEEN_PAGES = 2.0 # pauza mezi stránkami +SLEEP_BEFORE_DOWNLOAD = 1.5 # pauza před stažením každého .torrent souboru + +DB_CONFIG = { + "host": "192.168.1.76", + "port": 3306, + "user": "root", + "password": "Vlado9674+", + "database": "torrents", + "charset": "utf8mb4", + "autocommit": True, +} + +# ============================================================ +# CONNECT +# ============================================================ + +def connect_db(): + return pymysql.connect(**DB_CONFIG) + + +def build_session(): + if not COOKIE_FILE.exists(): + raise FileNotFoundError(f"Cookie soubor nenalezen: {COOKIE_FILE}") + with open(COOKIE_FILE, "r", encoding="utf-8") as f: + cookies = json.load(f) + session = requests.Session() + session.headers["User-Agent"] = ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + ) + for c in cookies: + session.cookies.set(c["name"], c["value"], domain=c.get("domain", "")) + return session + + +# ============================================================ +# PARSE ONE LISTING PAGE +# ============================================================ + +def parse_page(html): + """ + Vrátí seznam dict pro každý torrent řádek na stránce. + Prázdný seznam = konec paginace nebo chyba. + """ + soup = BeautifulSoup(html, "html.parser") + results = [] + + for row in soup.select("table tr"): + cells = row.find_all("td") + if len(cells) != 7: + continue + + # td[1] — odkaz na stažení: download.php?id=&f= + dl_a = cells[1].find("a", href=re.compile(r"download\.php\?id=")) + if not dl_a: + continue + + download_url = dl_a["href"] + if not download_url.startswith("http"): + download_url = "https://sktorrent.eu/torrent/" + download_url + + m_hash = re.search(r"id=([a-f0-9A-F]+)", download_url) + if not m_hash: + continue + torrent_hash = m_hash.group(1).lower() + + parsed_dl = urlparse.urlparse(download_url) + dl_query = urlparse.parse_qs(parsed_dl.query) + torrent_filename = dl_query.get("f", ["unknown.torrent"])[0] + + # td[2] — název, details link, velikost, datum + title_a = cells[2].find("a", href=re.compile(r"details\.php\?id=")) + if not title_a: + continue + + title_visible = title_a.get_text(strip=True) + title_full = title_a.get("title", title_visible) + details_link = title_a["href"] + if not details_link.startswith("http"): + details_link = "https://sktorrent.eu/torrent/" + details_link + + cell2_text = cells[2].get_text(" ", strip=True) + + size_match = re.search(r"Velkost\s+([\d\.,]+\s*[KMG]B)", cell2_text, re.IGNORECASE) + added_match = re.search(r"Pridany\s+(\d+/\d+/\d+)\s+(?:o\s+)?(\d+:\d+)", cell2_text, re.IGNORECASE) + + size_pretty = size_match.group(1).strip() if size_match else None + added_mysql = None + if added_match: + try: + d, mo, y = added_match.group(1).split("/") + t = added_match.group(2) + ":00" + added_mysql = f"{y}-{mo}-{d} {t}" + except Exception: + pass + + # td[0] — kategorie + category = cells[0].get_text(strip=True) + + # td[4] seeders, td[5] leechers + try: + seeders = int(cells[4].get_text(strip=True)) + except ValueError: + seeders = 0 + try: + leechers = int(cells[5].get_text(strip=True)) + except ValueError: + leechers = 0 + + results.append({ + "torrent_hash": torrent_hash, + "download_url": download_url, + "details_link": details_link, + "torrent_filename": torrent_filename, + "category": category, + "title_visible": title_visible, + "title_full": title_full, + "size_pretty": size_pretty, + "added_datetime": added_mysql, + "seeders": seeders, + "leechers": leechers, + }) + + return results + + +# ============================================================ +# DOWNLOAD .TORRENT FILE +# ============================================================ + +def download_torrent(session, url): + try: + r = session.get(url, timeout=15) + r.raise_for_status() + if len(r.content) < 20: + return None + return r.content + except Exception as e: + print(f" ⚠️ Stažení selhalo: {e}") + return None + + +# ============================================================ +# DB INSERT +# ============================================================ + +INSERT_SQL = """ +INSERT INTO torrents ( + torrent_hash, details_link, download_url, category, + title_visible, title_full, size_pretty, added_datetime, + seeders, leechers, torrent_filename, torrent_content +) VALUES ( + %(torrent_hash)s, %(details_link)s, %(download_url)s, %(category)s, + %(title_visible)s, %(title_full)s, %(size_pretty)s, %(added_datetime)s, + %(seeders)s, %(leechers)s, %(torrent_filename)s, %(torrent_content)s +) +ON DUPLICATE KEY UPDATE + seeders = VALUES(seeders), + leechers = VALUES(leechers), + download_url = VALUES(download_url), + torrent_content = COALESCE(VALUES(torrent_content), torrent_content) +""" + + +# ============================================================ +# MAIN +# ============================================================ + +def main(): + sys.stdout.reconfigure(encoding="utf-8") + + print("=" * 60) + print("INCREMENTAL IMPORT — sktorrent.eu") + print(f"Spuštěno: {datetime.now():%Y-%m-%d %H:%M:%S}") + print("Pořadí: nejnovější → nejstarší | stop při první shodě") + print("=" * 60) + + session = build_session() + db = connect_db() + cursor = db.cursor() + + new_count = 0 + page = 0 + stop = False + + while not stop: + + url = f"{BASE_URL}&page={page}" + try: + r = session.get(url, timeout=15) + r.raise_for_status() + except Exception as e: + print(f"⚠️ Stránka {page} — chyba: {e}") + break + + if "login.php" in r.url or "Prihlas sa" in r.text: + print("❌ Cookies expiraly — spusť přihlašovací Selenium skript a obnov cookies.") + break + + rows = parse_page(r.text) + + if not rows: + print(f" Stránka {page} — žádné záznamy, konec.") + break + + print(f"\n📄 Stránka {page} ({len(rows)} torrentů)") + + for item in rows: + + # Zkontroluj DB + cursor.execute( + "SELECT 1 FROM torrents WHERE torrent_hash = %s", + (item["torrent_hash"],) + ) + exists = cursor.fetchone() + + if exists: + print(f" ⏹ Již v DB: {item['title_visible']} → zastavuji import.") + stop = True + break + + # Nový torrent — stáhni .torrent soubor + print(f" ⬇️ Nový: {item['title_visible']}") + time.sleep(SLEEP_BEFORE_DOWNLOAD) + + content = download_torrent(session, item["download_url"]) + if content: + print(f" ✔ Staženo ({len(content):,} B)") + else: + print(f" ✖ Nepodařilo se stáhnout, ukládám bez obsahu") + + item["torrent_content"] = content + cursor.execute(INSERT_SQL, item) + new_count += 1 + + if not stop: + page += 1 + time.sleep(SLEEP_BETWEEN_PAGES) + + # ============================================================ + # SUMMARY + # ============================================================ + print() + print("=" * 60) + print(f"Hotovo: {datetime.now():%Y-%m-%d %H:%M:%S}") + print(f"Nových torrentů uloženo : {new_count}") + print(f"Stránek prošlo : {page}") + print("=" * 60) + + db.close() + + +if __name__ == "__main__": + main() diff --git a/Seedbox/sktorrent_cookies.json b/Seedbox/sktorrent_cookies.json new file mode 100644 index 0000000..8fd4a97 --- /dev/null +++ b/Seedbox/sktorrent_cookies.json @@ -0,0 +1,22 @@ +[ + { + "name": "uid", + "value": "646071", + "domain": "sktorrent.eu", + "path": "/", + "expires": 1798003565.462807, + "httpOnly": false, + "secure": false, + "sameSite": "Lax" + }, + { + "name": "pass", + "value": "91df6b497860582e09a7b333569d0187", + "domain": "sktorrent.eu", + "path": "/", + "expires": 1798003565.463191, + "httpOnly": false, + "secure": false, + "sameSite": "Lax" + } +] \ No newline at end of file