Add 95 IncrementalImport.py — incremental torrent scraper without Selenium

This commit is contained in:
2026-03-01 11:58:22 +01:00
parent d57f7d75ce
commit 6b8728360c
2 changed files with 308 additions and 0 deletions

286
95 IncrementalImport.py Normal file
View File

@@ -0,0 +1,286 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Incremental import — sktorrent.eu
- Walks torrents from newest to oldest
- Downloads and stores .torrent files for new records
- Stops as soon as it hits a torrent that is already in the DB
- No Selenium required — requests + BeautifulSoup + cookies suffice
"""
import pymysql
import requests
import json
import time
import re
import sys
from bs4 import BeautifulSoup
from pathlib import Path
from datetime import datetime
import urllib.parse as urlparse
# ============================================================
# CONFIG
# ============================================================
# Login cookies exported by the (separate) Selenium sign-in script.
COOKIE_FILE = Path("sktorrent_cookies.json")
# Listing URL ordered newest-first (order=data, by=DESC), category 24.
BASE_URL = (
    "https://sktorrent.eu/torrent/torrents.php"
    "?active=0&category=24&order=data&by=DESC"
)
SLEEP_BETWEEN_PAGES = 2.0  # pause between listing pages (politeness delay)
SLEEP_BEFORE_DOWNLOAD = 1.5  # pause before downloading each .torrent file
# NOTE(review): credentials are hard-coded and committed to the repository —
# move them to environment variables or a secrets file.
DB_CONFIG = {
    "host": "192.168.1.76",
    "port": 3306,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": True,  # every execute() commits immediately
}
# ============================================================
# CONNECT
# ============================================================
def connect_db():
    """Open and return a fresh MySQL connection from module-level DB_CONFIG."""
    connection = pymysql.connect(**DB_CONFIG)
    return connection
def build_session():
    """Create a requests session preloaded with the saved login cookies.

    Raises:
        FileNotFoundError: when the exported cookie file is missing.
    """
    if not COOKIE_FILE.exists():
        raise FileNotFoundError(f"Cookie soubor nenalezen: {COOKIE_FILE}")
    cookie_list = json.loads(COOKIE_FILE.read_text(encoding="utf-8"))
    session = requests.Session()
    # A browser-like UA; some trackers reject the default requests UA.
    session.headers["User-Agent"] = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    )
    for cookie in cookie_list:
        session.cookies.set(
            cookie["name"], cookie["value"], domain=cookie.get("domain", "")
        )
    return session
# ============================================================
# PARSE ONE LISTING PAGE
# ============================================================
def parse_page(html):
    """Parse one listing page into a list of per-torrent dicts.

    An empty list signals the end of pagination or an unexpected layout.
    """
    def _absolute(link):
        # The listing uses relative links; prefix the site root when needed.
        if link.startswith("http"):
            return link
        return "https://sktorrent.eu/torrent/" + link

    def _to_int(cell):
        # Seeder/leecher cells may hold non-numeric text; treat that as 0.
        try:
            return int(cell.get_text(strip=True))
        except ValueError:
            return 0

    soup = BeautifulSoup(html, "html.parser")
    parsed_rows = []
    for tr in soup.select("table tr"):
        tds = tr.find_all("td")
        # Torrent rows have exactly 7 cells; skip headers/spacers.
        if len(tds) != 7:
            continue
        # td[1] — download link: download.php?id=<hash>&f=<filename>
        dl_anchor = tds[1].find("a", href=re.compile(r"download\.php\?id="))
        if dl_anchor is None:
            continue
        download_url = _absolute(dl_anchor["href"])
        hash_match = re.search(r"id=([a-f0-9A-F]+)", download_url)
        if hash_match is None:
            continue
        query = urlparse.parse_qs(urlparse.urlparse(download_url).query)
        filename = query.get("f", ["unknown.torrent"])[0]
        # td[2] — title, details link, plus "Velkost"/"Pridany" text
        title_anchor = tds[2].find("a", href=re.compile(r"details\.php\?id="))
        if title_anchor is None:
            continue
        visible_title = title_anchor.get_text(strip=True)
        details_url = _absolute(title_anchor["href"])
        cell_text = tds[2].get_text(" ", strip=True)
        size_m = re.search(
            r"Velkost\s+([\d\.,]+\s*[KMG]B)", cell_text, re.IGNORECASE
        )
        added_m = re.search(
            r"Pridany\s+(\d+/\d+/\d+)\s+(?:o\s+)?(\d+:\d+)",
            cell_text,
            re.IGNORECASE,
        )
        added_mysql = None
        if added_m:
            try:
                # Site shows D/M/Y; convert to MySQL DATETIME order.
                day, month, year = added_m.group(1).split("/")
                added_mysql = f"{year}-{month}-{day} {added_m.group(2)}:00"
            except Exception:
                pass  # malformed date text → leave NULL
        parsed_rows.append({
            "torrent_hash": hash_match.group(1).lower(),
            "download_url": download_url,
            "details_link": details_url,
            "torrent_filename": filename,
            "category": tds[0].get_text(strip=True),
            "title_visible": visible_title,
            # Full title lives in the anchor's title attr when present.
            "title_full": title_anchor.get("title", visible_title),
            "size_pretty": size_m.group(1).strip() if size_m else None,
            "added_datetime": added_mysql,
            "seeders": _to_int(tds[4]),
            "leechers": _to_int(tds[5]),
        })
    return parsed_rows
# ============================================================
# DOWNLOAD .TORRENT FILE
# ============================================================
def download_torrent(session, url):
    """Fetch one .torrent file; return its raw bytes, or None on failure.

    Payloads shorter than 20 bytes are rejected as well — they are
    error pages / empty responses rather than real torrent files.
    """
    try:
        response = session.get(url, timeout=15)
        response.raise_for_status()
        body = response.content
        if len(body) < 20:
            return None
        return body
    except Exception as exc:
        print(f" ⚠️ Stažení selhalo: {exc}")
        return None
# ============================================================
# DB INSERT
# ============================================================
INSERT_SQL = """
INSERT INTO torrents (
torrent_hash, details_link, download_url, category,
title_visible, title_full, size_pretty, added_datetime,
seeders, leechers, torrent_filename, torrent_content
) VALUES (
%(torrent_hash)s, %(details_link)s, %(download_url)s, %(category)s,
%(title_visible)s, %(title_full)s, %(size_pretty)s, %(added_datetime)s,
%(seeders)s, %(leechers)s, %(torrent_filename)s, %(torrent_content)s
)
ON DUPLICATE KEY UPDATE
seeders = VALUES(seeders),
leechers = VALUES(leechers),
download_url = VALUES(download_url),
torrent_content = COALESCE(VALUES(torrent_content), torrent_content)
"""
# ============================================================
# MAIN
# ============================================================
def main():
    """Incrementally import new torrents from sktorrent.eu.

    Walks listing pages newest-first, downloads the .torrent payload for
    each row not yet in the DB, and stops at the first hash that already
    exists (everything older was imported by a previous run).

    Fix: the DB connection/cursor were never closed when an exception
    escaped the scrape loop — cleanup now runs in a try/finally.
    """
    sys.stdout.reconfigure(encoding="utf-8")
    print("=" * 60)
    print("INCREMENTAL IMPORT — sktorrent.eu")
    print(f"Spuštěno: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print("Pořadí: nejnovější → nejstarší | stop při první shodě")
    print("=" * 60)
    session = build_session()
    db = connect_db()
    cursor = db.cursor()
    new_count = 0
    page = 0
    stop = False
    try:
        while not stop:
            url = f"{BASE_URL}&page={page}"
            try:
                r = session.get(url, timeout=15)
                r.raise_for_status()
            except Exception as e:
                print(f"⚠️ Stránka {page} — chyba: {e}")
                break
            # A redirect to login.php or a visible login prompt means the
            # saved cookies are no longer valid.
            if "login.php" in r.url or "Prihlas sa" in r.text:
                print("❌ Cookies expiraly — spusť přihlašovací Selenium skript a obnov cookies.")
                break
            rows = parse_page(r.text)
            if not rows:
                print(f" Stránka {page} — žádné záznamy, konec.")
                break
            print(f"\n📄 Stránka {page} ({len(rows)} torrentů)")
            for item in rows:
                # Stop at the first hash already present in the DB.
                cursor.execute(
                    "SELECT 1 FROM torrents WHERE torrent_hash = %s",
                    (item["torrent_hash"],)
                )
                if cursor.fetchone():
                    print(f" ⏹ Již v DB: {item['title_visible']} → zastavuji import.")
                    stop = True
                    break
                # New torrent — fetch the .torrent payload.
                print(f" ⬇️ Nový: {item['title_visible']}")
                time.sleep(SLEEP_BEFORE_DOWNLOAD)
                content = download_torrent(session, item["download_url"])
                if content:
                    print(f" ✔ Staženo ({len(content):,} B)")
                else:
                    print(f" ✖ Nepodařilo se stáhnout, ukládám bez obsahu")
                # Metadata is stored even when the download failed; the
                # upsert's COALESCE can fill the payload on a later run.
                item["torrent_content"] = content
                cursor.execute(INSERT_SQL, item)
                new_count += 1
            if not stop:
                page += 1
                time.sleep(SLEEP_BETWEEN_PAGES)
    finally:
        # Always release DB resources, even on an unexpected error
        # (previously leaked the connection in that case).
        cursor.close()
        db.close()
    # ============================================================
    # SUMMARY
    # ============================================================
    print()
    print("=" * 60)
    print(f"Hotovo: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print(f"Nových torrentů uloženo : {new_count}")
    print(f"Stránek prošlo : {page}")
    print("=" * 60)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,22 @@
[
{
"name": "uid",
"value": "646071",
"domain": "sktorrent.eu",
"path": "/",
"expires": 1798003565.462807,
"httpOnly": false,
"secure": false,
"sameSite": "Lax"
},
{
"name": "pass",
"value": "91df6b497860582e09a7b333569d0187",
"domain": "sktorrent.eu",
"path": "/",
"expires": 1798003565.463191,
"httpOnly": false,
"secure": false,
"sameSite": "Lax"
}
]