Add 95 IncrementalImport.py — incremental torrent scraper without Selenium
This commit is contained in:
286
95 IncrementalImport.py
Normal file
286
95 IncrementalImport.py
Normal file
@@ -0,0 +1,286 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
Incremental import — sktorrent.eu
- Walks the listing starting from the newest torrents
- Downloads and stores the .torrent file for every new record
- Stops as soon as it reaches a torrent that is already in the DB
- Does not need Selenium — requests + BeautifulSoup + cookies are enough
"""
|
||||||
|
|
||||||
|
import json
import os
import re
import sys
import time
import urllib.parse as urlparse
from datetime import datetime
from pathlib import Path

import pymysql
import requests
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# CONFIG
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
# Cookie list (JSON array of {"name", "value", "domain", ...} objects) that
# build_session() loads into the HTTP session.
COOKIE_FILE = Path("sktorrent_cookies.json")

# Listing URL for category 24, sorted descending; main() appends "&page=N".
BASE_URL = (
    "https://sktorrent.eu/torrent/torrents.php"
    "?active=0&category=24&order=data&by=DESC"
)

SLEEP_BETWEEN_PAGES = 2.0    # pause between listing pages (seconds)
SLEEP_BEFORE_DOWNLOAD = 1.5  # pause before each .torrent download (seconds)

# Connection parameters passed verbatim to pymysql.connect().
# Every credential can be overridden through the environment so that
# deployments do not have to edit this file.
# NOTE(review): the default password below is committed to version control;
# it is kept only for backward compatibility and should be rotated and then
# removed from the source.
DB_CONFIG = {
    "host": os.environ.get("SKTORRENT_DB_HOST", "192.168.1.76"),
    "port": int(os.environ.get("SKTORRENT_DB_PORT", "3306")),
    "user": os.environ.get("SKTORRENT_DB_USER", "root"),
    "password": os.environ.get("SKTORRENT_DB_PASSWORD", "Vlado9674+"),
    "database": os.environ.get("SKTORRENT_DB_NAME", "torrents"),
    "charset": "utf8mb4",
    "autocommit": True,
}
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# CONNECT
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
def connect_db():
    """Open and return a new pymysql connection built from DB_CONFIG."""
    connection = pymysql.connect(**DB_CONFIG)
    return connection
|
||||||
|
|
||||||
|
|
||||||
|
def build_session():
    """
    Create a requests.Session carrying the cookies stored in COOKIE_FILE.

    The cookie file is read as JSON and iterated as a sequence of objects;
    each object must provide "name" and "value", and may provide "domain".

    Raises:
        FileNotFoundError: COOKIE_FILE does not exist.
    """
    if not COOKIE_FILE.exists():
        raise FileNotFoundError(f"Cookie soubor nenalezen: {COOKIE_FILE}")

    with COOKIE_FILE.open("r", encoding="utf-8") as handle:
        stored_cookies = json.load(handle)

    http = requests.Session()
    # Present a browser-like User-Agent instead of the requests default.
    http.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    })

    for entry in stored_cookies:
        cookie_domain = entry.get("domain", "")
        http.cookies.set(entry["name"], entry["value"], domain=cookie_domain)

    return http
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# PARSE ONE LISTING PAGE
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
# Base used to resolve relative links found in the listing.
_SITE_ROOT = "https://sktorrent.eu/torrent/"

# Patterns are compiled once instead of once per table row.
_DOWNLOAD_HREF_RE = re.compile(r"download\.php\?id=")
_DETAILS_HREF_RE = re.compile(r"details\.php\?id=")
_HASH_RE = re.compile(r"id=([a-f0-9A-F]+)")
_SIZE_RE = re.compile(r"Velkost\s+([\d\.,]+\s*[KMG]B)", re.IGNORECASE)
_ADDED_RE = re.compile(
    r"Pridany\s+(\d+/\d+/\d+)\s+(?:o\s+)?(\d+:\d+)", re.IGNORECASE
)


def _absolute_url(href):
    """Resolve a listing href against the site root (absolute URLs pass through)."""
    return urlparse.urljoin(_SITE_ROOT, href)


def _to_mysql_datetime(date_part, time_part):
    """Turn 'D/M/YYYY' + 'H:MM' into 'YYYY-MM-DD HH:MM:SS'; None if not a real date."""
    try:
        parsed = datetime.strptime(f"{date_part} {time_part}", "%d/%m/%Y %H:%M")
    except ValueError:
        # Impossible or unexpected values (e.g. 31/02, non 4-digit year) are
        # dropped rather than handed to the database as a malformed literal.
        return None
    return parsed.strftime("%Y-%m-%d %H:%M:%S")


def _cell_int(cell):
    """Return the integer shown in a table cell, or 0 when it is not a number."""
    try:
        return int(cell.get_text(strip=True))
    except ValueError:
        return 0


def parse_page(html):
    """
    Extract one dict per torrent row from a listing page.

    Only table rows with exactly seven <td> cells that contain both a
    download.php link (cell 1) and a details.php link (cell 2) are used.

    Args:
        html: HTML text of one listing page.

    Returns:
        A list of dicts with the keys torrent_hash, download_url,
        details_link, torrent_filename, category, title_visible, title_full,
        size_pretty, added_datetime, seeders and leechers. size_pretty and
        added_datetime are None when they cannot be found or parsed.
        An empty list means no torrent rows were recognised (end of
        pagination or an unexpected page).
    """
    soup = BeautifulSoup(html, "html.parser")
    results = []

    for row in soup.select("table tr"):
        cells = row.find_all("td")
        if len(cells) != 7:
            continue

        # td[1] — download link: download.php?id=<hash>&f=<filename>
        dl_a = cells[1].find("a", href=_DOWNLOAD_HREF_RE)
        if not dl_a:
            continue
        download_url = _absolute_url(dl_a["href"])

        m_hash = _HASH_RE.search(download_url)
        if not m_hash:
            continue
        torrent_hash = m_hash.group(1).lower()

        dl_query = urlparse.parse_qs(urlparse.urlparse(download_url).query)
        torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]

        # td[2] — title, details link, size, date added
        title_a = cells[2].find("a", href=_DETAILS_HREF_RE)
        if not title_a:
            continue

        title_visible = title_a.get_text(strip=True)
        # An empty title attribute must not override the visible text.
        title_full = title_a.get("title") or title_visible
        details_link = _absolute_url(title_a["href"])

        cell2_text = cells[2].get_text(" ", strip=True)

        size_match = _SIZE_RE.search(cell2_text)
        size_pretty = size_match.group(1).strip() if size_match else None

        added_match = _ADDED_RE.search(cell2_text)
        added_mysql = (
            _to_mysql_datetime(added_match.group(1), added_match.group(2))
            if added_match
            else None
        )

        results.append({
            "torrent_hash": torrent_hash,
            "download_url": download_url,
            "details_link": details_link,
            "torrent_filename": torrent_filename,
            # td[0] — category
            "category": cells[0].get_text(strip=True),
            "title_visible": title_visible,
            "title_full": title_full,
            "size_pretty": size_pretty,
            "added_datetime": added_mysql,
            # td[4] seeders, td[5] leechers
            "seeders": _cell_int(cells[4]),
            "leechers": _cell_int(cells[5]),
        })

    return results
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# DOWNLOAD .TORRENT FILE
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
def download_torrent(session, url):
    """
    Download a .torrent file and return its raw bytes.

    Args:
        session: object with a requests-style ``get(url, timeout=...)`` method.
        url: address of the .torrent file.

    Returns:
        The response body as bytes, or None when the request fails, the body
        is shorter than 20 bytes, or the body is not a bencoded dictionary.
    """
    try:
        r = session.get(url, timeout=15)
        r.raise_for_status()
        payload = r.content
    except Exception as e:
        # Best effort: the caller stores the record without content on failure.
        print(f" ⚠️ Stažení selhalo: {e}")
        return None

    if len(payload) < 20:
        return None
    # A torrent metainfo file is a bencoded dictionary and therefore starts
    # with "d". This rejects HTML login/error pages served with status 200,
    # which the length check alone would let through.
    if not payload.startswith(b"d"):
        return None
    return payload
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# DB INSERT
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
# Upsert keyed by the table's unique key: a new row is inserted with all
# columns; for an existing row only seeders, leechers and download_url are
# refreshed, and torrent_content is replaced only when a non-NULL value is
# supplied (COALESCE keeps the stored content otherwise).
# Parameters are pyformat named placeholders matching the keys of the dicts
# produced by parse_page() plus "torrent_content".
INSERT_SQL = """
INSERT INTO torrents (
    torrent_hash, details_link, download_url, category,
    title_visible, title_full, size_pretty, added_datetime,
    seeders, leechers, torrent_filename, torrent_content
) VALUES (
    %(torrent_hash)s, %(details_link)s, %(download_url)s, %(category)s,
    %(title_visible)s, %(title_full)s, %(size_pretty)s, %(added_datetime)s,
    %(seeders)s, %(leechers)s, %(torrent_filename)s, %(torrent_content)s
)
ON DUPLICATE KEY UPDATE
    seeders = VALUES(seeders),
    leechers = VALUES(leechers),
    download_url = VALUES(download_url),
    torrent_content = COALESCE(VALUES(torrent_content), torrent_content)
"""
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# MAIN
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
def _import_new_torrents(session, cursor):
    """
    Walk listing pages newest-first and store every torrent not yet in the DB.

    Stops at the first torrent whose hash is already stored, at the first
    page without torrent rows, on an HTTP error, or when the site answers
    with the login page.

    NOTE(review): rows are written newest-first and DB_CONFIG enables
    autocommit, so a run interrupted part-way leaves older torrents missing
    while the next run stops at the first known hash — confirm this is
    acceptable.

    Returns:
        (new_count, pages_visited): number of rows written and number of
        listing pages whose rows were examined.
    """
    new_count = 0
    pages_visited = 0
    page = 0

    while True:
        url = f"{BASE_URL}&page={page}"
        try:
            r = session.get(url, timeout=15)
            r.raise_for_status()
        except Exception as e:
            print(f"⚠️ Stránka {page} — chyba: {e}")
            break

        # Being bounced to the login page means the stored cookies are stale.
        if "login.php" in r.url or "Prihlas sa" in r.text:
            print("❌ Cookies expiraly — spusť přihlašovací Selenium skript a obnov cookies.")
            break

        rows = parse_page(r.text)
        if not rows:
            print(f" Stránka {page} — žádné záznamy, konec.")
            break

        pages_visited += 1
        print(f"\n📄 Stránka {page} ({len(rows)} torrentů)")

        for item in rows:
            # Check the DB
            cursor.execute(
                "SELECT 1 FROM torrents WHERE torrent_hash = %s",
                (item["torrent_hash"],),
            )
            if cursor.fetchone():
                print(f" ⏹ Již v DB: {item['title_visible']} → zastavuji import.")
                return new_count, pages_visited

            # New torrent — fetch the .torrent file
            print(f" ⬇️ Nový: {item['title_visible']}")
            time.sleep(SLEEP_BEFORE_DOWNLOAD)

            content = download_torrent(session, item["download_url"])
            if content:
                print(f" ✔ Staženo ({len(content):,} B)")
            else:
                print(f" ✖ Nepodařilo se stáhnout, ukládám bez obsahu")

            # The record is stored even without content (NULL torrent_content).
            item["torrent_content"] = content
            cursor.execute(INSERT_SQL, item)
            new_count += 1

        page += 1
        time.sleep(SLEEP_BETWEEN_PAGES)

    return new_count, pages_visited


def main():
    """
    Run one incremental import and print a summary.

    Builds the HTTP session and DB connection, delegates the crawl to
    _import_new_torrents(), and always releases the cursor, the connection
    and the session, even when the crawl raises.
    """
    # Output contains non-ASCII text; force UTF-8 where the stream allows it.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8")

    print("=" * 60)
    print("INCREMENTAL IMPORT — sktorrent.eu")
    print(f"Spuštěno: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print("Pořadí: nejnovější → nejstarší | stop při první shodě")
    print("=" * 60)

    session = build_session()
    try:
        db = connect_db()
        try:
            with db.cursor() as cursor:
                new_count, pages_visited = _import_new_torrents(session, cursor)
        finally:
            db.close()
    finally:
        session.close()

    # ============================================================
    # SUMMARY
    # ============================================================
    print()
    print("=" * 60)
    print(f"Hotovo: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print(f"Nových torrentů uloženo : {new_count}")
    # Counts every page that was examined, including the one the run stopped on.
    print(f"Stránek prošlo : {pages_visited}")
    print("=" * 60)


if __name__ == "__main__":
    main()
|
||||||
22
Seedbox/sktorrent_cookies.json
Normal file
22
Seedbox/sktorrent_cookies.json
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"name": "uid",
|
||||||
|
"value": "646071",
|
||||||
|
"domain": "sktorrent.eu",
|
||||||
|
"path": "/",
|
||||||
|
"expires": 1798003565.462807,
|
||||||
|
"httpOnly": false,
|
||||||
|
"secure": false,
|
||||||
|
"sameSite": "Lax"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "pass",
|
||||||
|
"value": "91df6b497860582e09a7b333569d0187",
|
||||||
|
"domain": "sktorrent.eu",
|
||||||
|
"path": "/",
|
||||||
|
"expires": 1798003565.463191,
|
||||||
|
"httpOnly": false,
|
||||||
|
"secure": false,
|
||||||
|
"sameSite": "Lax"
|
||||||
|
}
|
||||||
|
]
|
||||||
Reference in New Issue
Block a user