Add Seedbox/60 AktualizaceSeeders.py — scrape seeders/leechers from sktorrent.eu
This commit is contained in:
220
Seedbox/60 AktualizaceSeeders.py
Normal file
220
Seedbox/60 AktualizaceSeeders.py
Normal file
@@ -0,0 +1,220 @@
|
|||||||
|
import pymysql
|
||||||
|
import requests
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# CONFIG
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
# JSON file holding the exported login cookies (read by build_session()).
COOKIE_FILE = "sktorrent_cookies.json"

# Listing URL; the caller appends "&page=<n>" to address a single page.
BASE_URL = "https://sktorrent.eu/torrent/torrents.php?active=0&category=24&order=data&by=DESC"

# How many consecutive 403 responses abort the run (the site is blocking us).
STOP_AFTER_403 = 3

# How many consecutive pages without a single DB match end the run
# (we have reached torrents that are newer than anything stored in the DB).
STOP_AFTER_EMPTY_PAGES = 5

# Safety value for pagination.
# NOTE(review): main() only uses this as the fallback starting page when no
# "page=<n>" link is found on the first page; it is not enforced as a hard cap.
MAX_PAGES = 300

# Seconds to wait between page requests (the site does not block us at this pace).
SLEEP_BETWEEN_PAGES = 2.0

# Keyword arguments passed verbatim to pymysql.connect().
# NOTE(review): root credentials are hard-coded in source control — consider
# moving them to environment variables or an untracked config file.
DB_CONFIG = dict(
    host="192.168.1.76",
    port=3306,
    user="root",
    password="Vlado9674+",
    database="torrents",
    charset="utf8mb4",
    autocommit=True,
)
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# CONNECT
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
def connect_db():
    """Open and return a new pymysql connection configured from DB_CONFIG."""
    connection = pymysql.connect(**DB_CONFIG)
    return connection
|
||||||
|
|
||||||
|
|
||||||
|
def build_session():
    """Build a requests session that carries the saved login cookies.

    Loads the cookie list from COOKIE_FILE (JSON). Every entry must provide
    "name" and "value"; "domain" is optional and defaults to an empty string.

    Returns:
        A requests.Session with a browser-like User-Agent header and all
        cookies from the file installed.

    Raises:
        OSError: if COOKIE_FILE cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if a cookie entry lacks "name" or "value".
    """
    with open(COOKIE_FILE, "r", encoding="utf-8") as cookie_fh:
        stored_cookies = json.load(cookie_fh)

    http = requests.Session()
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    http.headers["User-Agent"] = user_agent

    for entry in stored_cookies:
        cookie_domain = entry.get("domain", "")
        http.cookies.set(entry["name"], entry["value"], domain=cookie_domain)

    return http
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# PARSE ONE PAGE
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
# Identifies the download anchor of a torrent row.
_DOWNLOAD_LINK_RE = re.compile(r"download\.php\?id=")

# Captures the hex info-hash that follows "id=". Both letter cases are accepted
# so that an upper- or mixed-case hash is captured whole instead of being
# truncated at the first uppercase digit; the caller lower-cases the result.
_HASH_RE = re.compile(r"id=([0-9a-fA-F]+)")


def _parse_count(text):
    """Convert a seeders/leechers cell text to int; non-numeric text yields 0."""
    try:
        return int(text)
    except ValueError:
        return 0


def parse_page(html):
    """Extract torrent statistics from one listing page.

    Only table rows with exactly seven <td> cells whose second cell contains
    a "download.php?id=<hash>" link are considered; every other row is skipped.

    Args:
        html: HTML source of a listing page.

    Returns:
        A list of dicts with the keys "hash" (lower-cased hex string),
        "seeders" (int) and "leechers" (int). Counts that cannot be parsed
        as integers are reported as 0.
    """
    soup = BeautifulSoup(html, "html.parser")
    results = []

    for row in soup.select("table tr"):
        cells = row.find_all("td")
        if len(cells) != 7:
            continue

        # td[1] must hold the download.php?id=<hash> link
        dl_link = cells[1].find("a", href=_DOWNLOAD_LINK_RE)
        if not dl_link:
            continue

        match = _HASH_RE.search(dl_link["href"])
        if not match:
            continue

        # seeders = td[4], leechers = td[5]
        results.append({
            "hash": match.group(1).lower(),
            "seeders": _parse_count(cells[4].get_text(strip=True)),
            "leechers": _parse_count(cells[5].get_text(strip=True)),
        })

    return results
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# MAIN
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
# One UPDATE per scraped torrent; values are bound by the DB driver.
_UPDATE_SQL = """
    UPDATE torrents
    SET
        seeders = %s,
        leechers = %s,
        qb_last_update = NOW()
    WHERE torrent_hash = %s
"""


def _update_all_pages(session, cursor):
    """Crawl the listing from the last page down to page 0 and update the DB.

    Stops early when the login page is served, when a page yields no rows,
    after STOP_AFTER_403 consecutive 403 responses, or after
    STOP_AFTER_EMPTY_PAGES consecutive pages that updated no DB row.

    Returns:
        Tuple (pages processed, rows parsed, rows updated, rows not in DB).
    """
    # Find the highest page number referenced anywhere on the first page.
    r0 = session.get(f"{BASE_URL}&page=0", timeout=15)
    all_page_nums = [int(m.group(1)) for m in re.finditer(r"page=(\d+)", r0.text)]
    # NOTE(review): MAX_PAGES is only a fallback here, not an upper bound.
    max_page = max(all_page_nums) if all_page_nums else MAX_PAGES
    print(f"Max stránka na webu: {max_page}")
    print(f"Prochází od stránky {max_page} směrem dolů...\n")

    total_pages = 0
    total_parsed = 0
    total_updated = 0
    total_skipped = 0
    consecutive_empty = 0  # pages in a row without a single DB match
    consecutive_403 = 0    # 403 responses in a row

    # Walk from the oldest page (the end) towards the newest (the start).
    for page in range(max_page, -1, -1):
        url = f"{BASE_URL}&page={page}"
        try:
            r = session.get(url, timeout=15)
            r.raise_for_status()
            consecutive_403 = 0  # reset after a successful request
        except requests.exceptions.HTTPError as e:
            if e.response is not None and e.response.status_code == 403:
                consecutive_403 += 1
                print(f"⚠️ Stránka {page} — 403 Forbidden ({consecutive_403}/{STOP_AFTER_403})")
                if consecutive_403 >= STOP_AFTER_403:
                    print(f"\n🛑 {STOP_AFTER_403}× 403 za sebou — web nás blokuje, přerušuji.")
                    break
                time.sleep(5)  # back off after a 403
            else:
                print(f"⚠️ Stránka {page} — chyba: {e}")
            continue
        except Exception as e:
            # Best effort: any other request failure skips just this page.
            print(f"⚠️ Stránka {page} — chyba: {e}")
            continue

        # Being served the login page means the saved cookies are no longer valid.
        if "login.php" in r.url or "Prihlas sa" in r.text:
            print("❌ Cookies expiraly — je potřeba se znovu přihlásit (spusť Selenium skript)")
            break

        rows = parse_page(r.text)
        if not rows:
            print(f" Stránka {page:3d} → prázdná, konec paginace.")
            break

        total_pages += 1
        total_parsed += len(rows)
        page_updated = 0

        for item in rows:
            cursor.execute(_UPDATE_SQL, (item["seeders"], item["leechers"], item["hash"]))
            # rowcount == 0 is treated as "hash not present in the DB".
            if cursor.rowcount > 0:
                total_updated += 1
                page_updated += 1
            else:
                total_skipped += 1

        print(f" Stránka {page:3d} → {len(rows):2d} torrentů, "
              f"updatováno: {page_updated:2d} (celkem: {total_updated})")

        # Stop once we have reached newer torrents that are not in the DB.
        if page_updated == 0:
            consecutive_empty += 1
            if consecutive_empty >= STOP_AFTER_EMPTY_PAGES:
                print(f"\n⏹ {STOP_AFTER_EMPTY_PAGES} stránek po sobě bez shody → "
                      f"dorazili jsme k novějším torrentům, které nejsou v DB. Konec.")
                break
        else:
            consecutive_empty = 0

        time.sleep(SLEEP_BETWEEN_PAGES)

    return total_pages, total_parsed, total_updated, total_skipped


def _print_summary(total_pages, total_parsed, total_updated, total_skipped):
    """Print the final run statistics."""
    print()
    print("=" * 60)
    print(f"Hotovo: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print(f"Stránek zpracováno : {total_pages}")
    print(f"Záznamů parsováno : {total_parsed}")
    print(f"DB řádků updatováno: {total_updated}")
    print(f"Nebylo v DB : {total_skipped}")
    print("=" * 60)


def main():
    """Refresh seeders/leechers in the `torrents` table from the site listing.

    Builds the HTTP session and the DB connection, crawls the listing pages
    and prints a summary. The session, connection and cursor are closed even
    when the run aborts with an exception, so nothing is leaked on failure.
    """
    # Local import keeps this block self-contained (stdlib only).
    from contextlib import closing

    sys.stdout.reconfigure(encoding="utf-8")

    print("=" * 60)
    print("AKTUALIZACE SEEDERS / LEECHERS — sktorrent.eu")
    print(f"Spuštěno: {datetime.now():%Y-%m-%d %H:%M:%S}")
    print("=" * 60)

    with closing(build_session()) as session:
        with closing(connect_db()) as db:
            with closing(db.cursor()) as cursor:
                totals = _update_all_pages(session, cursor)
            _print_summary(*totals)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the update only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||||
Reference in New Issue
Block a user