# Files
# torrents/Seedbox/60 AktualizaceSeeders.py
#
# 221 lines
# 6.8 KiB
# Python
# Raw Blame History
#
# This file contains ambiguous Unicode characters
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
import os
import re
import sys
import time
from datetime import datetime

import pymysql
import requests
from bs4 import BeautifulSoup
# ============================================================
# CONFIG
# ============================================================
# JSON file with the browser cookies used to authenticate against the site.
COOKIE_FILE = "sktorrent_cookies.json"
# Listing URL; main() appends "&page=<n>" to it.
BASE_URL = "https://sktorrent.eu/torrent/torrents.php?active=0&category=24&order=data&by=DESC"
SLEEP_BETWEEN_PAGES = 2.0  # seconds to wait between page requests
# Fallback starting page used by main() when no "page=<n>" link can be found
# in the first listing page.
MAX_PAGES = 300
# Number of consecutive pages without a single DB match after which the run
# stops (we have reached torrents that are not in the DB).
STOP_AFTER_EMPTY_PAGES = 5
# Number of consecutive HTTP 403 responses after which the run is aborted
# (the site is blocking us).
STOP_AFTER_403 = 3

# MySQL connection settings passed to pymysql.connect().
# Each credential can be overridden through an environment variable so the
# secret does not have to live in the source file; the literals below are the
# backward-compatible defaults.
# NOTE(review): the default password is still committed here — rotate it and
# drop the default once TORRENTS_DB_PASSWORD is set in the deployment.
DB_CONFIG = {
    "host": os.environ.get("TORRENTS_DB_HOST", "192.168.1.76"),
    "port": int(os.environ.get("TORRENTS_DB_PORT", "3306")),
    "user": os.environ.get("TORRENTS_DB_USER", "root"),
    "password": os.environ.get("TORRENTS_DB_PASSWORD", "Vlado9674+"),
    "database": os.environ.get("TORRENTS_DB_NAME", "torrents"),
    "charset": "utf8mb4",
    "autocommit": True,
}
# ============================================================
# CONNECT
# ============================================================
def connect_db():
    """Open and return a new MySQL connection built from ``DB_CONFIG``."""
    connection = pymysql.connect(**DB_CONFIG)
    return connection
def build_session():
    """Create a ``requests.Session`` pre-loaded with the saved site cookies.

    The cookies are read from ``COOKIE_FILE``, which must contain a JSON list
    of objects with ``name`` and ``value`` keys and an optional ``domain``.
    A desktop-browser User-Agent header is set on the session.
    """
    with open(COOKIE_FILE, "r", encoding="utf-8") as cookie_fh:
        stored_cookies = json.load(cookie_fh)

    http = requests.Session()
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    http.headers["User-Agent"] = user_agent

    jar = http.cookies
    for entry in stored_cookies:
        # A missing domain falls back to the empty string.
        cookie_domain = entry.get("domain", "")
        jar.set(entry["name"], entry["value"], domain=cookie_domain)
    return http
# ============================================================
# PARSE ONE PAGE
# ============================================================
# Identifies the download link that marks a table row as a torrent row.
_DOWNLOAD_HREF_RE = re.compile(r"download\.php\?id=")
# Hex hash in the link's query string. Case-insensitive so that an upper- or
# mixed-case hash is captured whole instead of being rejected or truncated at
# the first upper-case digit; the result is lower-cased afterwards.
_HASH_RE = re.compile(r"id=([a-f0-9]+)", re.IGNORECASE)


def _parse_count(cell):
    """Return the integer shown in a table cell, or 0 if it is not a number."""
    try:
        return int(cell.get_text(strip=True))
    except ValueError:
        return 0


def parse_page(html):
    """Extract torrent hashes with their seeder/leecher counts from one page.

    Only table rows with exactly seven ``<td>`` cells whose second cell holds
    a ``download.php?id=<hash>`` link are considered; all other rows are
    skipped.

    Args:
        html: HTML source of one listing page.

    Returns:
        A list of dicts with the keys ``hash`` (lower-case hex string),
        ``seeders`` and ``leechers`` (ints; 0 when the cell text is not a
        valid integer).
    """
    soup = BeautifulSoup(html, "html.parser")
    results = []
    for row in soup.select("table tr"):
        cells = row.find_all("td")
        if len(cells) != 7:
            continue

        # td[1] must contain a link of the form download.php?id=<hash>
        dl_link = cells[1].find("a", href=_DOWNLOAD_HREF_RE)
        if not dl_link:
            continue
        match = _HASH_RE.search(dl_link["href"])
        if not match:
            continue

        # seeders = td[4], leechers = td[5]
        results.append({
            "hash": match.group(1).lower(),
            "seeders": _parse_count(cells[4]),
            "leechers": _parse_count(cells[5]),
        })
    return results
# ============================================================
# MAIN
# ============================================================
def _apply_page_updates(cursor, rows):
    """Write one page of parsed counts to the DB.

    Returns the number of UPDATE statements that affected at least one row.
    """
    updated = 0
    for item in rows:
        cursor.execute(
            """
            UPDATE torrents
            SET
                seeders = %s,
                leechers = %s,
                qb_last_update = NOW()
            WHERE torrent_hash = %s
            """,
            (item["seeders"], item["leechers"], item["hash"]),
        )
        # rowcount == 0 is treated as "hash not present in the DB".
        if cursor.rowcount > 0:
            updated += 1
    return updated


def main():
    """Refresh seeder/leecher counts in the ``torrents`` table from the site.

    The listing is walked from the highest page number found on page 0 down
    to page 0. For every torrent row parsed from a page, the matching DB row
    (by ``torrent_hash``) gets its ``seeders``, ``leechers`` and
    ``qb_last_update`` columns updated.

    The walk stops early when:
      * ``STOP_AFTER_403`` consecutive requests return HTTP 403,
      * the response looks like the login page (cookies expired),
      * a page yields no torrent rows,
      * ``STOP_AFTER_EMPTY_PAGES`` consecutive pages update nothing in the DB.

    A summary is printed at the end. The DB connection is always closed,
    even when an unexpected error aborts the run.
    """
    sys.stdout.reconfigure(encoding="utf-8")
    print("=" * 60)
    print("AKTUALIZACE SEEDERS / LEECHERS — sktorrent.eu")
    print(f"Spuštěno: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print("=" * 60)

    session = build_session()
    db = connect_db()

    total_pages = 0
    total_parsed = 0
    total_updated = 0
    total_skipped = 0
    consecutive_empty = 0  # consecutive pages without a single DB match
    consecutive_403 = 0  # consecutive 403 responses

    try:
        with db.cursor() as cursor:
            # Find the highest page number linked from the first page.
            r0 = session.get(f"{BASE_URL}&page=0", timeout=15)
            all_page_nums = [
                int(m.group(1)) for m in re.finditer(r"page=(\d+)", r0.text)
            ]
            max_page = max(all_page_nums) if all_page_nums else MAX_PAGES
            print(f"Max stránka na webu: {max_page}")
            print(f"Prochází od stránky {max_page} směrem dolů...\n")

            # Walk from the oldest page (the end) towards the newest (page 0).
            for page in range(max_page, -1, -1):
                url = f"{BASE_URL}&page={page}"
                try:
                    r = session.get(url, timeout=15)
                    r.raise_for_status()
                    consecutive_403 = 0  # reset after a successful request
                except requests.exceptions.HTTPError as e:
                    if e.response is not None and e.response.status_code == 403:
                        consecutive_403 += 1
                        print(f"⚠️ Stránka {page} — 403 Forbidden ({consecutive_403}/{STOP_AFTER_403})")
                        if consecutive_403 >= STOP_AFTER_403:
                            print(f"\n🛑 {STOP_AFTER_403}× 403 za sebou — web nás blokuje, přerušuji.")
                            break
                        time.sleep(5)  # back off after a 403
                    else:
                        print(f"⚠️ Stránka {page} — chyba: {e}")
                    continue
                except Exception as e:
                    # Deliberately broad: a single failing page must not
                    # abort the whole run; report it and move on.
                    print(f"⚠️ Stránka {page} — chyba: {e}")
                    continue

                if "login.php" in r.url or "Prihlas sa" in r.text:
                    print("❌ Cookies expiraly — je potřeba se znovu přihlásit (spusť Selenium skript)")
                    break

                rows = parse_page(r.text)
                if not rows:
                    print(f" Stránka {page:3d} → prázdná, konec paginace.")
                    break

                total_pages += 1
                total_parsed += len(rows)

                page_updated = _apply_page_updates(cursor, rows)
                total_updated += page_updated
                total_skipped += len(rows) - page_updated

                print(f" Stránka {page:3d} → {len(rows):2d} torrentů, "
                      f"updatováno: {page_updated:2d} (celkem: {total_updated})")

                # Stop once we are in the range of newer torrents that are
                # not in the DB.
                if page_updated == 0:
                    consecutive_empty += 1
                    if consecutive_empty >= STOP_AFTER_EMPTY_PAGES:
                        print(f"\n{STOP_AFTER_EMPTY_PAGES} stránek po sobě bez shody → "
                              f"dorazili jsme k novějším torrentům, které nejsou v DB. Konec.")
                        break
                else:
                    consecutive_empty = 0

                time.sleep(SLEEP_BETWEEN_PAGES)

        # ============================================================
        # SUMMARY
        # ============================================================
        print()
        print("=" * 60)
        print(f"Hotovo: {datetime.now():%Y-%m-%d %H:%M:%S}")
        print(f"Stránek zpracováno : {total_pages}")
        print(f"Záznamů parsováno : {total_parsed}")
        print(f"DB řádků updatováno: {total_updated}")
        print(f"Nebylo v DB : {total_skipped}")
        print("=" * 60)
    finally:
        db.close()
# Run the update only when executed as a script, not when imported.
if __name__ == "__main__":
    main()