#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
import re
import urllib.parse as urlparse
from pathlib import Path
import json
import requests
import datetime
import threading
from concurrent.futures import ThreadPoolExecutor

# Ensure this file exists in your directory
from EmailMessagingGraph import send_mail

# ============================================================
# CONFIGURATION
# ============================================================
TOTAL_PAGES = 226
THREADS = 5
COOKIE_FILE = Path("sktorrent_cookies.json")

# Database settings
DB_CONFIG = {
    "host": "192.168.1.50",
    "port": 3306,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": True,
}

BASE_URL = (
    "https://sktorrent.eu/torrent/torrents.php"
    "?active=0&category=24&order=data&by=DESC&zaner=&jazyk="
)

# Global counters for reporting (guarded by a thread-safe lock)
stats_lock = threading.Lock()
stats = {
    "processed": 0,
    "new": 0,
    "existing": 0,
    "new_titles": []
}

# ============================================================
# 1) WORKER FUNCTION (runs inside each thread)
# ============================================================
def process_page_chunk(page_indices, thread_id):
    """
    Creates its OWN browser and OWN database connection,
    then processes the specific list of page numbers assigned to it.
    """
    print(f"🧵 [Thread-{thread_id}] Starting. Assigned {len(page_indices)} pages.")

    # --- A. Set up an independent DB connection ---
    try:
        db = pymysql.connect(**DB_CONFIG)
        cursor = db.cursor()
    except Exception as e:
        print(f"❌ [Thread-{thread_id}] DB connection failed: {e}")
        return

    # --- B. Set up an independent Selenium driver ---
    chrome_options = Options()
    # HEADLESS MODE is safer for 5 threads: avoids popping up 5 windows
    chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_argument("--disable-popup-blocking")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--log-level=3")  # Reduce console noise
    driver = webdriver.Chrome(options=chrome_options)
    driver.set_window_size(1350, 1000)

    # --- C. Login / cookies ---
    driver.get("https://sktorrent.eu")
    if COOKIE_FILE.exists():
        with open(COOKIE_FILE, "r", encoding="utf-8") as f:
            cookies = json.load(f)
        for c in cookies:
            driver.add_cookie(c)

    # --- D. Requests session (shares the browser's cookies) ---
    requests_session = requests.Session()
    for ck in driver.get_cookies():
        requests_session.cookies.set(ck["name"], ck["value"])
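    # NOTE: the block below is our addition, not part of the original flow.
    # It reloads the page so the injected cookies take effect, then does a
    # quick sanity check that the cookie login actually took. The "logout"
    # marker is an assumption about sktorrent.eu's logged-in markup.
    driver.get("https://sktorrent.eu")
    if "logout" not in driver.page_source.lower():
        print(f"⚠️ [Thread-{thread_id}] Cookies may be missing or expired; "
              "scraping will proceed unauthenticated.")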
    # --- E. Helper: parse one table row (local scope) ---
    def parse_row(cells):
        try:
            category = cells[0].text.strip()

            # Download URL
            download_a = cells[1].find_element(By.TAG_NAME, "a")
            download_url = download_a.get_attribute("href")
            parsed_dl = urlparse.urlparse(download_url)
            dl_query = urlparse.parse_qs(parsed_dl.query)
            torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]

            # Details & hash
            title_links = cells[2].find_elements(By.TAG_NAME, "a")
            if not title_links:
                return None
            a_tag = title_links[0]
            visible_name = a_tag.text.strip()
            full_title = a_tag.get_attribute("title")
            details_link = a_tag.get_attribute("href")
            parsed = urlparse.urlparse(details_link)
            query = urlparse.parse_qs(parsed.query)
            if "id" not in query:
                return None
            torrent_hash = query["id"][0]

            # Size & date
            text_block = cells[2].get_attribute("innerText")
            clean_text = " ".join(text_block.split())
            size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", clean_text, re.IGNORECASE)
            added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", clean_text, re.IGNORECASE)
            size_pretty = size_match.group(1) if size_match else None

            added_mysql = None
            if added_match:
                clean = added_match.group(1).replace(" o ", " ").strip()
                parts = clean.split(" ")
                if len(parts) >= 2:
                    try:
                        # d/m/Y -> MySQL Y-m-d; unpack inside the try so an
                        # unexpected date format is skipped, not fatal
                        d, m, y = parts[0].split("/")
                        t = parts[1] + ":00" if len(parts[1].split(":")) == 2 else parts[1]
                        added_mysql = f"{y}-{m}-{d} {t}"
                    except ValueError:
                        pass

            # Preview image (pulled from the onmouseover attribute)
            img_link = None
            try:
                img_a = cells[2].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
                img_src = re.search(r"src=([^ ]+)", img_a.get_attribute("onmouseover"))
                if img_src:
                    img_link = img_src.group(1).replace("'", "").strip()
                    if img_link.startswith("//"):
                        img_link = "https:" + img_link
            except Exception:
                pass

            # Seeder / leecher stats
            seeders = int(cells[4].find_element(By.TAG_NAME, "a").text.strip())
            seeders_link = cells[4].find_element(By.TAG_NAME, "a").get_attribute("href")
            leechers = int(cells[5].find_element(By.TAG_NAME, "a").text.strip())
            leechers_link = cells[5].find_element(By.TAG_NAME, "a").get_attribute("href")

            # Check whether we already stored the .torrent file
            cursor.execute(
                "SELECT torrent_content FROM torrents WHERE torrent_hash=%s",
                (torrent_hash,),
            )
            row = cursor.fetchone()
            already_have_file = row is not None and row[0] is not None

            content = None
            if not already_have_file:
                # Politeness sleep only when actually downloading
                time.sleep(1)
                try:
                    r = requests_session.get(download_url, timeout=10)
                    r.raise_for_status()
                    content = r.content
                except requests.RequestException:
                    pass

            return {
                "torrent_hash": torrent_hash,
                "details_link": details_link,
                "download_url": download_url,
                "category": category,
                "title_visible": visible_name,
                "title_full": full_title,
                "size_pretty": size_pretty,
                "added_datetime": added_mysql,
                "preview_image": img_link,
                "seeders": seeders,
                "seeders_link": seeders_link,
                "leechers": leechers,
                "leechers_link": leechers_link,
                "torrent_filename": torrent_filename,
                "torrent_content": content,
                "is_new_torrent": not already_have_file,
            }
        except Exception:
            return None
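    # Illustrative only (an assumption about sktorrent.eu's markup, not taken
    # from the site): the innerText block the Velkost/Pridany regexes above
    # expect looks roughly like
    #   "Some.Title.2024 ... Velkost 1.4 GB Pridany 12/03/2024 o 14:32 Obrázok"
    # i.e. a size in KB/MB/GB and a d/m/Y date with an optional "o HH:MM" time.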
    # --- F. Loop through the assigned pages ---
    for page_num in page_indices:
        url = f"{BASE_URL}&page={page_num}"
        print(f" 🔄 [Thread-{thread_id}] Scraping page {page_num}")
        try:
            driver.get(url)
            # Close the interstitial popup (simplified JS; errors swallowed)
            driver.execute_script("try { interstitialBox.closeit(); } catch(e) {}")

            # Keep only real torrent rows: 7 cells with a download link
            rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
            real_rows = []
            for r in rows:
                cs = r.find_elements(By.TAG_NAME, "td")
                if len(cs) == 7 and cs[1].find_elements(By.TAG_NAME, "a"):
                    real_rows.append(cs)

            if not real_rows:
                print(f" ⚠️ [Thread-{thread_id}] Page {page_num} empty.")
                continue

            # Process rows
            for cells in real_rows:
                data = parse_row(cells)
                if not data:
                    continue

                # Update global stats safely
                with stats_lock:
                    stats["processed"] += 1
                    if data["is_new_torrent"]:
                        stats["new"] += 1
                        stats["new_titles"].append(data["title_visible"])
                    else:
                        stats["existing"] += 1

                # Insert / update; COALESCE keeps an existing torrent_content
                # when this run did not download one
                sql = """
                    INSERT INTO torrents (
                        torrent_hash, details_link, download_url, category,
                        title_visible, title_full, size_pretty, added_datetime,
                        preview_image, seeders, seeders_link, leechers,
                        leechers_link, torrent_filename, torrent_content
                    ) VALUES (
                        %(torrent_hash)s, %(details_link)s, %(download_url)s, %(category)s,
                        %(title_visible)s, %(title_full)s, %(size_pretty)s, %(added_datetime)s,
                        %(preview_image)s, %(seeders)s, %(seeders_link)s, %(leechers)s,
                        %(leechers_link)s, %(torrent_filename)s, %(torrent_content)s
                    )
                    ON DUPLICATE KEY UPDATE
                        seeders = VALUES(seeders),
                        leechers = VALUES(leechers),
                        download_url = VALUES(download_url),
                        torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
                """
                cursor.execute(sql, data)
        except Exception as e:
            print(f" 💥 [Thread-{thread_id}] Error on page {page_num}: {e}")

    # Cleanup
    driver.quit()
    db.close()
    print(f"🏁 [Thread-{thread_id}] Finished assigned pages.")


# ============================================================
# 2) MAIN EXECUTION
# ============================================================
if __name__ == "__main__":
    RUN_START = datetime.datetime.now()
    print(f"🚀 Starting multithreaded scraper with {THREADS} threads...")

    # 1. Distribute pages among threads
    #    (e.g. 226 pages / 5 threads -> chunks of 46, the last one smaller)
    all_pages = list(range(TOTAL_PAGES))
    chunk_size = len(all_pages) // THREADS + 1
    chunks = [all_pages[i:i + chunk_size] for i in range(0, len(all_pages), chunk_size)]

    # 2. Start threads
    with ThreadPoolExecutor(max_workers=THREADS) as executor:
        futures = []
        for i, page_chunk in enumerate(chunks):
            if page_chunk:  # Only start if the chunk is not empty
                futures.append(executor.submit(process_page_chunk, page_chunk, i + 1))

        # Wait for all of them to finish
        for f in futures:
            f.result()

    # 3. Final report
    RUN_END = datetime.datetime.now()
    print("\n✅ All threads completed.")

    body = (
        f"Run started: {RUN_START:%Y-%m-%d %H:%M:%S}\n"
        f"Run finished: {RUN_END:%Y-%m-%d %H:%M:%S}\n\n"
        f"Processed torrents: {stats['processed']}\n"
        f"New torrents saved: {stats['new']}\n"
        f"Existing torrents updated: {stats['existing']}\n"
    )
    if stats["new_titles"]:
        body += "\nNew torrents list:\n- " + "\n- ".join(stats["new_titles"])

    send_mail(
        to="vladimir.buzalka@buzalka.cz",
        subject="SKTorrent Multi-Thread Run",
        body=body,
        html=False,
    )
    print("📧 Email report sent.")
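# ------------------------------------------------------------
# Usage notes (a sketch; the file name below is an assumption,
# not part of the original script):
#   python3 sktorrent_scraper.py
# Dependencies: pymysql, selenium (Chrome plus a matching chromedriver on
# PATH), requests, and the local EmailMessagingGraph module providing
# send_mail(to=..., subject=..., body=..., html=...).
# sktorrent_cookies.json is expected to hold a JSON list of Selenium cookie
# dicts captured from a previously logged-in browser session.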