Add 95 IncrementalImport.py — incremental torrent scraper without Selenium
This commit is contained in:
286
95 IncrementalImport.py
Normal file
286
95 IncrementalImport.py
Normal file
@@ -0,0 +1,286 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
Incremental import — sktorrent.eu

- Walks the listing starting from the newest torrents
- Downloads and stores the .torrent file for every new record
- Stops as soon as it reaches a torrent that is already in the DB
- Does not need Selenium — requests + BeautifulSoup + cookies are enough
"""
|
||||
|
||||
import json
import os
import re
import sys
import time
import urllib.parse as urlparse
from datetime import datetime
from pathlib import Path

import pymysql
import requests
from bs4 import BeautifulSoup
|
||||
|
||||
# ============================================================
|
||||
# CONFIG
|
||||
# ============================================================
|
||||
|
||||
# JSON file holding the site cookies; build_session() loads it as a list of
# objects with "name", "value" and optionally "domain".
COOKIE_FILE = Path("sktorrent_cookies.json")

# Listing URL; main() appends "&page=N" to walk the pagination.
BASE_URL = (
    "https://sktorrent.eu/torrent/torrents.php"
    "?active=0&category=24&order=data&by=DESC"
)

SLEEP_BETWEEN_PAGES = 2.0    # pause (seconds) between listing pages
SLEEP_BEFORE_DOWNLOAD = 1.5  # pause (seconds) before each .torrent download

# Keyword arguments passed verbatim to pymysql.connect().
# Host, port, user, password and database can be overridden through the
# SKTORRENT_DB_* environment variables; the defaults keep the previous
# hard-coded behaviour.
# NOTE(review): a root password is committed here as the default — rotate it
# and supply the real credential via SKTORRENT_DB_PASSWORD instead.
DB_CONFIG = {
    "host": os.environ.get("SKTORRENT_DB_HOST", "192.168.1.76"),
    "port": int(os.environ.get("SKTORRENT_DB_PORT", "3306")),
    "user": os.environ.get("SKTORRENT_DB_USER", "root"),
    "password": os.environ.get("SKTORRENT_DB_PASSWORD", "Vlado9674+"),
    "database": os.environ.get("SKTORRENT_DB_NAME", "torrents"),
    "charset": "utf8mb4",
    "autocommit": True,
}
|
||||
|
||||
# ============================================================
|
||||
# CONNECT
|
||||
# ============================================================
|
||||
|
||||
def connect_db():
    """Open a new database connection from the module-level ``DB_CONFIG``.

    Returns:
        The connection object returned by ``pymysql.connect``.
    """
    connection = pymysql.connect(**DB_CONFIG)
    return connection
|
||||
|
||||
|
||||
def build_session():
    """Create a ``requests.Session`` pre-loaded with the saved site cookies.

    The cookies are read from ``COOKIE_FILE``, a JSON list of objects that
    each provide ``name`` and ``value`` and optionally ``domain``.

    Returns:
        A session with a browser-like User-Agent header and the cookies set.

    Raises:
        FileNotFoundError: if ``COOKIE_FILE`` does not exist.
    """
    if not COOKIE_FILE.exists():
        raise FileNotFoundError(f"Cookie soubor nenalezen: {COOKIE_FILE}")

    stored_cookies = json.loads(COOKIE_FILE.read_text(encoding="utf-8"))

    http = requests.Session()
    http.headers.update(
        {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            ),
        }
    )

    for entry in stored_cookies:
        # A missing "domain" falls back to an empty string.
        cookie_domain = entry.get("domain", "")
        http.cookies.set(entry["name"], entry["value"], domain=cookie_domain)

    return http
|
||||
|
||||
|
||||
# ============================================================
|
||||
# PARSE ONE LISTING PAGE
|
||||
# ============================================================
|
||||
|
||||
def parse_page(html):
    """Extract one dict per torrent row from a listing page.

    Only ``<tr>`` elements with exactly seven ``<td>`` cells that contain
    both a ``download.php?id=`` link (second cell) and a ``details.php?id=``
    link (third cell) are treated as torrent rows; everything else is skipped.

    Args:
        html: HTML text of a listing page.

    Returns:
        A list of dicts with the keys ``torrent_hash``, ``download_url``,
        ``details_link``, ``torrent_filename``, ``category``,
        ``title_visible``, ``title_full``, ``size_pretty``,
        ``added_datetime``, ``seeders`` and ``leechers``. An empty list means
        no row matched (end of pagination or an unexpected page).
    """
    site_root = "https://sktorrent.eu/torrent/"

    # Compiled once per call instead of once per table row.
    download_href = re.compile(r"download\.php\?id=")
    details_href = re.compile(r"details\.php\?id=")
    hash_pattern = re.compile(r"id=([a-f0-9A-F]+)")
    size_pattern = re.compile(r"Velkost\s+([\d\.,]+\s*[KMG]B)", re.IGNORECASE)
    added_pattern = re.compile(
        r"Pridany\s+(\d+/\d+/\d+)\s+(?:o\s+)?(\d+:\d+)", re.IGNORECASE
    )

    soup = BeautifulSoup(html, "html.parser")
    results = []

    for row in soup.select("table tr"):
        cells = row.find_all("td")
        if len(cells) != 7:
            continue

        # td[1] — download link: download.php?id=<hash>&f=<filename>
        dl_a = cells[1].find("a", href=download_href)
        if not dl_a:
            continue

        # urljoin leaves absolute URLs alone and resolves both
        # "download.php?..." and root-relative "/torrent/download.php?..."
        # correctly, which plain string concatenation did not.
        download_url = urlparse.urljoin(site_root, dl_a["href"])

        m_hash = hash_pattern.search(download_url)
        if not m_hash:
            continue
        torrent_hash = m_hash.group(1).lower()

        dl_query = urlparse.parse_qs(urlparse.urlparse(download_url).query)
        torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]

        # td[2] — title, details link, size, date added
        title_a = cells[2].find("a", href=details_href)
        if not title_a:
            continue

        title_visible = title_a.get_text(strip=True)
        # The "title" attribute is used as the full title when present.
        title_full = title_a.get("title", title_visible)
        details_link = urlparse.urljoin(site_root, title_a["href"])

        cell2_text = cells[2].get_text(" ", strip=True)
        size_match = size_pattern.search(cell2_text)
        added_match = added_pattern.search(cell2_text)

        size_pretty = size_match.group(1).strip() if size_match else None
        added_mysql = (
            _to_mysql_datetime(added_match.group(1), added_match.group(2))
            if added_match
            else None
        )

        results.append({
            "torrent_hash": torrent_hash,
            "download_url": download_url,
            "details_link": details_link,
            "torrent_filename": torrent_filename,
            # td[0] — category
            "category": cells[0].get_text(strip=True),
            "title_visible": title_visible,
            "title_full": title_full,
            "size_pretty": size_pretty,
            "added_datetime": added_mysql,
            # td[4] seeders, td[5] leechers
            "seeders": _cell_int(cells[4]),
            "leechers": _cell_int(cells[5]),
        })

    return results


def _to_mysql_datetime(date_text, time_text):
    """Convert ``D/M/Y`` plus ``H:M`` into ``YYYY-MM-DD HH:MM:SS``, or None if invalid."""
    stamp = f"{date_text} {time_text}"
    # Four-digit years are tried first; two-digit years are a fallback.
    for fmt in ("%d/%m/%Y %H:%M", "%d/%m/%y %H:%M"):
        try:
            return datetime.strptime(stamp, fmt).strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            continue
    # Not a real calendar date/time: store NULL rather than a string the
    # database may reject.
    return None


def _cell_int(cell):
    """Return the integer text of a table cell, or 0 when it is not a number."""
    try:
        return int(cell.get_text(strip=True))
    except ValueError:
        return 0
|
||||
|
||||
|
||||
# ============================================================
|
||||
# DOWNLOAD .TORRENT FILE
|
||||
# ============================================================
|
||||
|
||||
def download_torrent(session, url):
    """Download a .torrent file and return its raw bytes.

    Args:
        session: Object with a ``get(url, timeout=...)`` method that returns
            a response exposing ``raise_for_status()`` and ``content``
            (the script passes a ``requests.Session``).
        url: Absolute download URL.

    Returns:
        The response body as ``bytes``, or ``None`` when the request fails,
        the body is shorter than 20 bytes, or the body does not look like a
        torrent file.
    """
    try:
        response = session.get(url, timeout=15)
        response.raise_for_status()
        payload = response.content
    except requests.RequestException as e:
        print(f" ⚠️ Stažení selhalo: {e}")
        return None

    # A .torrent (metainfo) file is a bencoded dictionary, so it always starts
    # with b"d". Without this check an HTML login/error page served with
    # status 200 would be stored as torrent content.
    if len(payload) < 20 or not payload.startswith(b"d"):
        return None
    return payload
|
||||
|
||||
|
||||
# ============================================================
|
||||
# DB INSERT
|
||||
# ============================================================
|
||||
|
||||
INSERT_SQL = """
|
||||
INSERT INTO torrents (
|
||||
torrent_hash, details_link, download_url, category,
|
||||
title_visible, title_full, size_pretty, added_datetime,
|
||||
seeders, leechers, torrent_filename, torrent_content
|
||||
) VALUES (
|
||||
%(torrent_hash)s, %(details_link)s, %(download_url)s, %(category)s,
|
||||
%(title_visible)s, %(title_full)s, %(size_pretty)s, %(added_datetime)s,
|
||||
%(seeders)s, %(leechers)s, %(torrent_filename)s, %(torrent_content)s
|
||||
)
|
||||
ON DUPLICATE KEY UPDATE
|
||||
seeders = VALUES(seeders),
|
||||
leechers = VALUES(leechers),
|
||||
download_url = VALUES(download_url),
|
||||
torrent_content = COALESCE(VALUES(torrent_content), torrent_content)
|
||||
"""
|
||||
|
||||
|
||||
# ============================================================
|
||||
# MAIN
|
||||
# ============================================================
|
||||
|
||||
def main():
    """Run the incremental import and print a summary.

    Listing pages are fetched starting at page 0. Every torrent whose hash is
    not yet in the ``torrents`` table is downloaded and inserted. The run
    ends when a listing page cannot be fetched, the login page is served
    (expired cookies), a page yields no rows, or the first torrent already
    present in the database is reached.

    The database connection is closed even when an error aborts the run.
    """
    sys.stdout.reconfigure(encoding="utf-8")

    print("=" * 60)
    print("INCREMENTAL IMPORT — sktorrent.eu")
    print(f"Spuštěno: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print("Pořadí: nejnovější → nejstarší | stop při první shodě")
    print("=" * 60)

    session = build_session()
    db = connect_db()

    new_count = 0
    page = 0
    pages_done = 0  # pages whose rows were actually processed
    stop = False

    try:
        cursor = db.cursor()

        while not stop:
            url = f"{BASE_URL}&page={page}"
            try:
                r = session.get(url, timeout=15)
                r.raise_for_status()
            except requests.RequestException as e:
                print(f"⚠️ Stránka {page} — chyba: {e}")
                break

            # A redirect to login.php or the login prompt in the body means
            # the saved cookies are no longer accepted.
            if "login.php" in r.url or "Prihlas sa" in r.text:
                print("❌ Cookies expiraly — spusť přihlašovací Selenium skript a obnov cookies.")
                break

            rows = parse_page(r.text)
            if not rows:
                print(f" Stránka {page} — žádné záznamy, konec.")
                break

            print(f"\n📄 Stránka {page} ({len(rows)} torrentů)")
            pages_done += 1

            stored, stop = _import_rows(cursor, session, rows)
            new_count += stored

            if not stop:
                page += 1
                time.sleep(SLEEP_BETWEEN_PAGES)
    finally:
        # Release the connection on every exit path, including exceptions
        # raised by the queries above.
        db.close()

    # ============================================================
    # SUMMARY
    # ============================================================
    print()
    print("=" * 60)
    print(f"Hotovo: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print(f"Nových torrentů uloženo : {new_count}")
    # Counts the pages actually processed; the raw page index was one short
    # whenever the run stopped on an already-known torrent.
    print(f"Stránek prošlo : {pages_done}")
    print("=" * 60)


def _import_rows(cursor, session, rows):
    """Insert rows until one is already in the DB; return (stored_count, hit_existing)."""
    stored = 0

    for item in rows:
        # Check the database first.
        cursor.execute(
            "SELECT 1 FROM torrents WHERE torrent_hash = %s",
            (item["torrent_hash"],)
        )
        if cursor.fetchone():
            print(f" ⏹ Již v DB: {item['title_visible']} → zastavuji import.")
            return stored, True

        # New torrent — fetch the .torrent file.
        print(f" ⬇️ Nový: {item['title_visible']}")
        time.sleep(SLEEP_BEFORE_DOWNLOAD)

        content = download_torrent(session, item["download_url"])
        if content:
            print(f" ✔ Staženo ({len(content):,} B)")
        else:
            print(f" ✖ Nepodařilo se stáhnout, ukládám bez obsahu")

        # The row is stored even without content (torrent_content = NULL).
        item["torrent_content"] = content
        cursor.execute(INSERT_SQL, item)
        stored += 1

    return stored, False
|
||||
|
||||
|
||||
# Run the import only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user