2026-02-01 07:18:20 +01:00
parent 7b0404bfe3
commit 3d11661997
7 changed files with 1074 additions and 116 deletions

5threaddownloader.py (new file, 292 lines)

@@ -0,0 +1,292 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
import re
import urllib.parse as urlparse
from pathlib import Path
import json
import requests
import datetime
import sys
import threading
from concurrent.futures import ThreadPoolExecutor
# Ensure this file exists in your directory
from EmailMessagingGraph import send_mail
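# Assumed interface, inferred from the call at the bottom of this script:
#   send_mail(to=..., subject=..., body=..., html=False)
# Adjust if your EmailMessagingGraph module exposes a different signature.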
# ============================================================
# CONFIGURATION
# ============================================================
TOTAL_PAGES = 226
THREADS = 5
COOKIE_FILE = Path("sktorrent_cookies.json")
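# Expected to hold a JSON list of Selenium cookie dicts (e.g. exported via
# driver.get_cookies()); if it is missing, the workers browse sktorrent.eu unauthenticated.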
# Database settings
DB_CONFIG = {
"host": "192.168.1.50",
"port": 3306,
"user": "root",
"password": "Vlado9674+",
"database": "torrents",
"charset": "utf8mb4",
"autocommit": True,
}
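# The INSERT further down assumes a `torrents` table roughly like the sketch below.
# This is a hedged guess, not the authoritative schema: the column names come from the
# INSERT statement, but the types and the UNIQUE key on torrent_hash are assumptions.
#   CREATE TABLE torrents (
#       torrent_hash     VARCHAR(64) NOT NULL UNIQUE,
#       details_link     TEXT, download_url TEXT, category VARCHAR(255),
#       title_visible    TEXT, title_full TEXT,
#       size_pretty      VARCHAR(32), added_datetime DATETIME, preview_image TEXT,
#       seeders          INT, seeders_link TEXT, leechers INT, leechers_link TEXT,
#       torrent_filename VARCHAR(255), torrent_content LONGBLOB
#   ) CHARACTER SET utf8mb4;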
BASE_URL = (
"https://sktorrent.eu/torrent/torrents.php"
"?active=0&category=24&order=data&by=DESC&zaner=&jazyk="
)
# Global counters for the run report
stats_lock = threading.Lock()
stats = {
"processed": 0,
"new": 0,
"existing": 0,
"new_titles": []
}
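# Workers only touch these counters while holding stats_lock; the main thread
# reads them once every worker has finished.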
# ============================================================
# 1) WORKER FUNCTION (Runs inside each thread)
# ============================================================
def process_page_chunk(page_indices, thread_id):
"""
This function creates its OWN browser and OWN database connection.
It processes the specific list of page numbers assigned to it.
"""
print(f"🧵 [Thread-{thread_id}] Starting. Assigned {len(page_indices)} pages.")
# --- A. Setup Independent DB Connection ---
try:
db = pymysql.connect(**DB_CONFIG)
cursor = db.cursor()
except Exception as e:
print(f"❌ [Thread-{thread_id}] DB Connection failed: {e}")
return
# --- B. Setup Independent Selenium Driver ---
chrome_options = Options()
# Headless mode keeps the run from opening a visible browser window for every thread
chrome_options.add_argument("--headless=new")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--log-level=3") # Reduce noise
driver = webdriver.Chrome(options=chrome_options)
driver.set_window_size(1350, 1000)
# --- C. Login / Cookies ---
driver.get("https://sktorrent.eu")
if COOKIE_FILE.exists():
with open(COOKIE_FILE, "r", encoding="utf-8") as f:
cookies = json.load(f)
for c in cookies:
driver.add_cookie(c)
# --- D. Requests Session ---
requests_session = requests.Session()
for ck in driver.get_cookies():
requests_session.cookies.set(ck["name"], ck["value"])
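# Mirror the browser's cookies into a plain requests session so the .torrent
# payloads can be fetched over HTTP without going through Selenium.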
# --- E. Helper: Parse Row (Local scope) ---
def parse_row(cells):
try:
category = cells[0].text.strip()
# Download URL
download_a = cells[1].find_element(By.TAG_NAME, "a")
download_url = download_a.get_attribute("href")
parsed_dl = urlparse.urlparse(download_url)
dl_query = urlparse.parse_qs(parsed_dl.query)
torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]
# Details & Hash
title_links = cells[2].find_elements(By.TAG_NAME, "a")
if not title_links: return None
a_tag = title_links[0]
visible_name = a_tag.text.strip()
full_title = a_tag.get_attribute("title")
details_link = a_tag.get_attribute("href")
parsed = urlparse.urlparse(details_link)
query = urlparse.parse_qs(parsed.query)
if "id" not in query: return None
torrent_hash = query["id"][0]
# Size & Date
text_block = cells[2].get_attribute("innerText")
clean_text = " ".join(text_block.split())
size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", clean_text, re.IGNORECASE)
added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", clean_text, re.IGNORECASE)
size_pretty = size_match.group(1) if size_match else None
added_mysql = None
if added_match:
# Site shows "DD/MM/YYYY o HH:MM"; convert it to a MySQL DATETIME string
clean = added_match.group(1).replace(" o ", " ").strip()
parts = clean.split(" ")
if len(parts) >= 2:
try:
d, m, y = parts[0].split("/")
t = parts[1] + ":00" if len(parts[1].split(":")) == 2 else parts[1]
added_mysql = f"{y}-{m}-{d} {t}"
except ValueError:
pass  # Malformed date: keep added_mysql as None rather than dropping the row
# Image
img_link = None
try:
img_a = cells[2].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
img_src = re.search(r"src=([^ ]+)", img_a.get_attribute("onmouseover"))
if img_src:
img_link = img_src.group(1).replace("'", "").strip()
if img_link.startswith("//"): img_link = "https:" + img_link
except Exception:
pass  # No preview image for this row
# Stats
seeders = int(cells[4].find_element(By.TAG_NAME, "a").text.strip())
seeders_link = cells[4].find_element(By.TAG_NAME, "a").get_attribute("href")
leechers = int(cells[5].find_element(By.TAG_NAME, "a").text.strip())
leechers_link = cells[5].find_element(By.TAG_NAME, "a").get_attribute("href")
# Check DB
cursor.execute("SELECT torrent_content FROM torrents WHERE torrent_hash=%s", (torrent_hash,))
row = cursor.fetchone()
already_have_file = row is not None and row[0] is not None
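# Only fetch the .torrent payload if it has never been stored; rows that already
# have content just get their seeders/leechers refreshed by the upsert below.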
content = None
if not already_have_file:
# Politeness sleep only if downloading
time.sleep(1)
try:
r = requests_session.get(download_url, timeout=10)
r.raise_for_status()
content = r.content
except Exception:
pass  # Download failed; a later run can still fill torrent_content via COALESCE
return {
"torrent_hash": torrent_hash, "details_link": details_link, "download_url": download_url,
"category": category, "title_visible": visible_name, "title_full": full_title,
"size_pretty": size_pretty, "added_datetime": added_mysql, "preview_image": img_link,
"seeders": seeders, "seeders_link": seeders_link, "leechers": leechers, "leechers_link": leechers_link,
"torrent_filename": torrent_filename, "torrent_content": content,
"is_new_torrent": not already_have_file
}
except Exception:
return None
# --- F. Loop through Assigned Pages ---
for page_num in page_indices:
url = f"{BASE_URL}&page={page_num}"
print(f" 🔄 [Thread-{thread_id}] Scraping Page {page_num}")
try:
driver.get(url)
# Close popup (simplified JS)
driver.execute_script("try { interstitialBox.closeit(); } catch(e) {}")
# Row Filtering
rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
real_rows = []
for r in rows:
cs = r.find_elements(By.TAG_NAME, "td")
if len(cs) == 7 and cs[1].find_elements(By.TAG_NAME, "a"):
real_rows.append(cs)
if not real_rows:
print(f" ⚠️ [Thread-{thread_id}] Page {page_num} empty.")
continue
# Process Rows
for cells in real_rows:
data = parse_row(cells)
if not data: continue
# Update Global Stats safely
with stats_lock:
stats["processed"] += 1
if data["is_new_torrent"]:
stats["new"] += 1
stats["new_titles"].append(data["title_visible"])
else:
stats["existing"] += 1
# Insert SQL
sql = """
INSERT INTO torrents (
torrent_hash, details_link, download_url, category, title_visible, title_full,
size_pretty, added_datetime, preview_image,
seeders, seeders_link, leechers, leechers_link,
torrent_filename, torrent_content
) VALUES (
%(torrent_hash)s, %(details_link)s, %(download_url)s, %(category)s, %(title_visible)s, %(title_full)s,
%(size_pretty)s, %(added_datetime)s, %(preview_image)s,
%(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
%(torrent_filename)s, %(torrent_content)s
)
ON DUPLICATE KEY UPDATE
seeders = VALUES(seeders),
leechers = VALUES(leechers),
download_url = VALUES(download_url),
torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
"""
cursor.execute(sql, data)
except Exception as e:
print(f" 💥 [Thread-{thread_id}] Error on page {page_num}: {e}")
# Cleanup
driver.quit()
db.close()
print(f"🏁 [Thread-{thread_id}] Finished assigned pages.")
# ============================================================
# 2) MAIN EXECUTION
# ============================================================
if __name__ == "__main__":
RUN_START = datetime.datetime.now()
print(f"🚀 Starting Multithreaded Scraper with {THREADS} threads...")
# 1. Distribute pages among threads
# Example: 226 pages over 5 threads -> chunk_size 46 -> four chunks of 46 pages plus one of 42
all_pages = list(range(TOTAL_PAGES))
chunk_size = len(all_pages) // THREADS + 1
chunks = [all_pages[i:i + chunk_size] for i in range(0, len(all_pages), chunk_size)]
# 2. Start Threads
with ThreadPoolExecutor(max_workers=THREADS) as executor:
futures = []
for i, page_chunk in enumerate(chunks):
if page_chunk: # Only start if chunk is not empty
futures.append(executor.submit(process_page_chunk, page_chunk, i + 1))
# Wait for all to finish
for f in futures:
f.result()
# 3. Final Report
RUN_END = datetime.datetime.now()
print("\n✅ All threads completed.")
body = (
f"Run started: {RUN_START:%Y-%m-%d %H:%M:%S}\n"
f"Run finished: {RUN_END:%Y-%m-%d %H:%M:%S}\n\n"
f"Processed torrents: {stats['processed']}\n"
f"New torrents saved: {stats['new']}\n"
f"Existing torrents updated: {stats['existing']}\n"
)
if stats["new_titles"]:
body += "\nNew torrents list:\n- " + "\n- ".join(stats["new_titles"])
send_mail(to="vladimir.buzalka@buzalka.cz", subject="SKTorrent Multi-Thread Run", body=body, html=False)
print("📧 Email report sent.")