#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Incremental import — sktorrent.eu

- Walks the listing starting from the newest torrents
- Downloads and stores the .torrent file for every new record
- Stops as soon as it reaches a torrent that is already in the DB
- Needs no Selenium — requests + BeautifulSoup + cookies are enough
"""

import json
import os
import re
import sys
import time
import urllib.parse as urlparse
from datetime import datetime
from pathlib import Path

import pymysql
import requests
from bs4 import BeautifulSoup

# ============================================================
# CONFIG
# ============================================================

COOKIE_FILE = Path("sktorrent_cookies.json")

# Base against which relative links found in the listing are resolved.
SITE_ROOT = "https://sktorrent.eu/torrent/"

BASE_URL = (
    "https://sktorrent.eu/torrent/torrents.php"
    "?active=0&category=24&order=data&by=DESC"
)

SLEEP_BETWEEN_PAGES = 2.0    # pause between listing pages
SLEEP_BEFORE_DOWNLOAD = 1.5  # pause before each .torrent download
REQUEST_TIMEOUT = 15         # seconds, applied to every HTTP request

# NOTE(security): credentials should not live in source control. Each value
# can be overridden through the environment; the literals are kept only as
# defaults so existing deployments keep working unchanged.
DB_CONFIG = {
    "host": os.environ.get("SKTORRENT_DB_HOST", "192.168.1.76"),
    "port": 3306,
    "user": os.environ.get("SKTORRENT_DB_USER", "root"),
    "password": os.environ.get("SKTORRENT_DB_PASSWORD", "Vlado9674+"),
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": True,
}

# Patterns used for every listing row; compiled once instead of per row.
_DOWNLOAD_HREF_RE = re.compile(r"download\.php\?id=")
_DETAILS_HREF_RE = re.compile(r"details\.php\?id=")
_HASH_RE = re.compile(r"id=([a-f0-9A-F]+)")
_SIZE_RE = re.compile(r"Velkost\s+([\d\.,]+\s*[KMG]B)", re.IGNORECASE)
_ADDED_RE = re.compile(
    r"Pridany\s+(\d+/\d+/\d+)\s+(?:o\s+)?(\d+:\d+)", re.IGNORECASE
)


# ============================================================
# CONNECT
# ============================================================

def connect_db():
    """Open a new MySQL connection using ``DB_CONFIG``."""
    return pymysql.connect(**DB_CONFIG)


def build_session():
    """Return a ``requests.Session`` loaded with cookies from ``COOKIE_FILE``.

    The cookie file is expected to be a JSON list of objects with at least
    ``name`` and ``value`` keys and an optional ``domain`` key.

    Raises:
        FileNotFoundError: if ``COOKIE_FILE`` does not exist.
    """
    if not COOKIE_FILE.exists():
        raise FileNotFoundError(f"Cookie soubor nenalezen: {COOKIE_FILE}")

    with open(COOKIE_FILE, "r", encoding="utf-8") as f:
        cookies = json.load(f)

    session = requests.Session()
    session.headers["User-Agent"] = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    )
    for c in cookies:
        session.cookies.set(c["name"], c["value"], domain=c.get("domain", ""))
    return session


# ============================================================
# PARSE ONE LISTING PAGE
# ============================================================

def _absolute_url(href):
    """Resolve a listing href against ``SITE_ROOT`` (absolute URLs pass through)."""
    return urlparse.urljoin(SITE_ROOT, href)


def _parse_added(date_part, time_part):
    """Convert ``D/M/Y`` + ``H:M`` into ``YYYY-MM-DD HH:MM:SS``, or None if invalid."""
    stamp = f"{date_part} {time_part}"
    # Four-digit years are the normal case; two-digit years are accepted too.
    for fmt in ("%d/%m/%Y %H:%M", "%d/%m/%y %H:%M"):
        try:
            return datetime.strptime(stamp, fmt).strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            continue
    return None


def _cell_int(cell):
    """Return the integer shown in a table cell, or 0 when it is not a number."""
    try:
        return int(cell.get_text(strip=True))
    except ValueError:
        return 0


def parse_page(html):
    """Parse one listing page into a list of torrent dicts.

    Only table rows with exactly seven cells that contain both a
    ``download.php?id=`` link and a ``details.php?id=`` link are used.

    Args:
        html: HTML source of a listing page.

    Returns:
        A list with one dict per torrent row (keys: ``torrent_hash``,
        ``download_url``, ``details_link``, ``torrent_filename``,
        ``category``, ``title_visible``, ``title_full``, ``size_pretty``,
        ``added_datetime``, ``seeders``, ``leechers``). An empty list means
        the end of pagination or a page that could not be parsed.
    """
    soup = BeautifulSoup(html, "html.parser")
    results = []

    for row in soup.select("table tr"):
        cells = row.find_all("td")
        if len(cells) != 7:
            continue

        # td[1] — download link: download.php?id=<hash>&f=<filename>
        dl_a = cells[1].find("a", href=_DOWNLOAD_HREF_RE)
        if not dl_a:
            continue

        download_url = _absolute_url(dl_a["href"])

        m_hash = _HASH_RE.search(download_url)
        if not m_hash:
            continue
        torrent_hash = m_hash.group(1).lower()

        dl_query = urlparse.parse_qs(urlparse.urlparse(download_url).query)
        torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]

        # td[2] — title, details link, size, date added
        title_a = cells[2].find("a", href=_DETAILS_HREF_RE)
        if not title_a:
            continue

        title_visible = title_a.get_text(strip=True)
        title_full = title_a.get("title", title_visible)
        details_link = _absolute_url(title_a["href"])

        cell2_text = cells[2].get_text(" ", strip=True)
        size_match = _SIZE_RE.search(cell2_text)
        added_match = _ADDED_RE.search(cell2_text)

        size_pretty = size_match.group(1).strip() if size_match else None
        added_mysql = (
            _parse_added(added_match.group(1), added_match.group(2))
            if added_match
            else None
        )

        results.append({
            "torrent_hash": torrent_hash,
            "download_url": download_url,
            "details_link": details_link,
            "torrent_filename": torrent_filename,
            "category": cells[0].get_text(strip=True),  # td[0] — category
            "title_visible": title_visible,
            "title_full": title_full,
            "size_pretty": size_pretty,
            "added_datetime": added_mysql,
            "seeders": _cell_int(cells[4]),   # td[4] — seeders
            "leechers": _cell_int(cells[5]),  # td[5] — leechers
        })

    return results


# ============================================================
# DOWNLOAD .TORRENT FILE
# ============================================================

def download_torrent(session, url):
    """Download a .torrent file and return its raw bytes.

    Args:
        session: ``requests.Session`` carrying the site cookies.
        url: absolute download URL.

    Returns:
        The file content as ``bytes``, or ``None`` when the request fails or
        the response does not look like a torrent file.
    """
    try:
        r = session.get(url, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
        payload = r.content
    except requests.RequestException as e:
        print(f" ⚠️ Stažení selhalo: {e}")
        return None

    # A .torrent file is a bencoded dictionary, so it starts with b"d".
    # This rejects HTML error/login pages that are served with HTTP 200.
    if len(payload) < 20 or not payload.startswith(b"d"):
        return None
    return payload


# ============================================================
# DB INSERT
# ============================================================

INSERT_SQL = """
INSERT INTO torrents (
    torrent_hash, details_link, download_url, category,
    title_visible, title_full, size_pretty, added_datetime,
    seeders, leechers, torrent_filename, torrent_content
) VALUES (
    %(torrent_hash)s, %(details_link)s, %(download_url)s, %(category)s,
    %(title_visible)s, %(title_full)s, %(size_pretty)s, %(added_datetime)s,
    %(seeders)s, %(leechers)s, %(torrent_filename)s, %(torrent_content)s
)
ON DUPLICATE KEY UPDATE
    seeders = VALUES(seeders),
    leechers = VALUES(leechers),
    download_url = VALUES(download_url),
    torrent_content = COALESCE(VALUES(torrent_content), torrent_content)
"""


# ============================================================
# MAIN
# ============================================================

def _import_new(session, cursor):
    """Walk listing pages newest-first and store every torrent not yet in the DB.

    Stops at the first torrent whose hash already exists, at the first page
    without rows, on an HTTP error, or when the site asks for a login.

    Returns:
        ``(new_count, pages_done)`` — rows inserted and listing pages that
        were actually processed.
    """
    new_count = 0
    pages_done = 0
    page = 0

    while True:
        url = f"{BASE_URL}&page={page}"
        try:
            r = session.get(url, timeout=REQUEST_TIMEOUT)
            r.raise_for_status()
        except requests.RequestException as e:
            print(f"⚠️ Stránka {page} — chyba: {e}")
            break

        # Being redirected to, or served, the login form means the saved
        # cookies are no longer valid.
        if "login.php" in r.url or "Prihlas sa" in r.text:
            print("❌ Cookies expiraly — spusť přihlašovací Selenium skript a obnov cookies.")
            break

        rows = parse_page(r.text)
        if not rows:
            print(f" Stránka {page} — žádné záznamy, konec.")
            break

        pages_done += 1
        print(f"\n📄 Stránka {page} ({len(rows)} torrentů)")

        for item in rows:
            # Already in the DB? Then everything older is known as well.
            cursor.execute(
                "SELECT 1 FROM torrents WHERE torrent_hash = %s",
                (item["torrent_hash"],),
            )
            if cursor.fetchone():
                print(f" ⏹ Již v DB: {item['title_visible']} → zastavuji import.")
                return new_count, pages_done

            # New torrent — fetch the .torrent file.
            print(f" ⬇️ Nový: {item['title_visible']}")
            time.sleep(SLEEP_BEFORE_DOWNLOAD)

            content = download_torrent(session, item["download_url"])
            if content:
                print(f" ✔ Staženo ({len(content):,} B)")
            else:
                print(f" ✖ Nepodařilo se stáhnout, ukládám bez obsahu")

            item["torrent_content"] = content
            cursor.execute(INSERT_SQL, item)
            new_count += 1

        page += 1
        time.sleep(SLEEP_BETWEEN_PAGES)

    return new_count, pages_done


def main():
    """Run the incremental import and print a summary."""
    # Not every stdout replacement (IDE consoles, test harnesses) offers
    # reconfigure(); skip it there instead of crashing.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")

    print("=" * 60)
    print("INCREMENTAL IMPORT — sktorrent.eu")
    print(f"Spuštěno: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print("Pořadí: nejnovější → nejstarší | stop při první shodě")
    print("=" * 60)

    # The session and the connection are released even when the import
    # aborts with an exception.
    with build_session() as session:
        db = connect_db()
        try:
            with db.cursor() as cursor:
                new_count, pages_done = _import_new(session, cursor)
        finally:
            db.close()

    # ============================================================
    # SUMMARY
    # ============================================================
    print()
    print("=" * 60)
    print(f"Hotovo: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print(f"Nových torrentů uloženo : {new_count}")
    print(f"Stránek prošlo : {pages_done}")
    print("=" * 60)


if __name__ == "__main__":
    main()