#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
import re
import urllib.parse as urlparse
from pathlib import Path
import json
import requests
import datetime
import threading
from concurrent.futures import ThreadPoolExecutor

# Ensure this file exists in your directory
from EmailMessagingGraph import send_mail

# ============================================================
# CONFIGURATION
# ============================================================
TOTAL_PAGES = 226
THREADS = 5
COOKIE_FILE = Path("sktorrent_cookies.json")

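# The scraper expects COOKIE_FILE to hold cookies from an already logged-in
# session. A minimal sketch for producing it (a hypothetical one-off helper,
# not part of the original script): open a visible browser, log in by hand,
# then dump the cookies:
#
#     driver = webdriver.Chrome()
#     driver.get("https://sktorrent.eu")
#     input("Log in in the browser window, then press Enter...")
#     with open(COOKIE_FILE, "w", encoding="utf-8") as f:
#         json.dump(driver.get_cookies(), f)
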
# Database settings
DB_CONFIG = {
    "host": "192.168.1.50",
    "port": 3306,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": True,
}

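# The upsert in process_page_chunk() relies on torrent_hash having a UNIQUE
# (or PRIMARY KEY) index so that ON DUPLICATE KEY UPDATE fires. An assumed
# schema sketch; column types are guesses inferred from the parsed values,
# not the original DDL:
#
#     CREATE TABLE IF NOT EXISTS torrents (
#         torrent_hash     VARCHAR(64)  NOT NULL PRIMARY KEY,
#         details_link     TEXT,
#         download_url     TEXT,
#         category         VARCHAR(255),
#         title_visible    VARCHAR(512),
#         title_full       VARCHAR(512),
#         size_pretty      VARCHAR(32),
#         added_datetime   DATETIME NULL,
#         preview_image    TEXT,
#         seeders          INT,
#         seeders_link     TEXT,
#         leechers         INT,
#         leechers_link    TEXT,
#         torrent_filename VARCHAR(255),
#         torrent_content  LONGBLOB NULL
#     );
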
BASE_URL = (
    "https://sktorrent.eu/torrent/torrents.php"
    "?active=0&category=24&order=data&by=DESC&zaner=&jazyk="
)

# Global counters for reporting (updated only while holding stats_lock,
# since concurrent threads would otherwise race on these values)
stats_lock = threading.Lock()
stats = {
    "processed": 0,
    "new": 0,
    "existing": 0,
    "new_titles": [],
}


# ============================================================
# 1) WORKER FUNCTION (Runs inside each thread)
# ============================================================
def process_page_chunk(page_indices, thread_id):
    """
    This function creates its OWN browser and OWN database connection.
    It processes the specific list of page numbers assigned to it.
    """
    print(f"🧵 [Thread-{thread_id}] Starting. Assigned {len(page_indices)} pages.")

    # --- A. Setup Independent DB Connection ---
    try:
        db = pymysql.connect(**DB_CONFIG)
        cursor = db.cursor()
    except Exception as e:
        print(f"❌ [Thread-{thread_id}] DB Connection failed: {e}")
        return

    # --- B. Setup Independent Selenium Driver ---
    chrome_options = Options()
    # HEADLESS MODE is safer for 5 threads: it avoids popping up 5 windows
    chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_argument("--disable-popup-blocking")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--log-level=3")  # Reduce noise

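    # Note: Selenium 4.6+ locates a matching chromedriver automatically via
    # Selenium Manager; on older versions chromedriver must be on PATH.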
    driver = webdriver.Chrome(options=chrome_options)
    driver.set_window_size(1350, 1000)

    # --- C. Login / Cookies ---
    driver.get("https://sktorrent.eu")
    if COOKIE_FILE.exists():
        with open(COOKIE_FILE, "r", encoding="utf-8") as f:
            cookies = json.load(f)
        for c in cookies:
            driver.add_cookie(c)

    # --- D. Requests Session ---
    requests_session = requests.Session()
    for ck in driver.get_cookies():
        requests_session.cookies.set(ck["name"], ck["value"])
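    # Only name/value pairs are copied over, so the session attaches these
    # cookies to every request it makes; that is acceptable here because it
    # only fetches download URLs on sktorrent.eu.
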
    # --- E. Helper: Parse Row (Local scope) ---
    def parse_row(cells):
        try:
            category = cells[0].text.strip()

            # Download URL
            download_a = cells[1].find_element(By.TAG_NAME, "a")
            download_url = download_a.get_attribute("href")

            parsed_dl = urlparse.urlparse(download_url)
            dl_query = urlparse.parse_qs(parsed_dl.query)
            torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]

            # Details & Hash
            title_links = cells[2].find_elements(By.TAG_NAME, "a")
            if not title_links:
                return None
            a_tag = title_links[0]
            visible_name = a_tag.text.strip()
            full_title = a_tag.get_attribute("title")
            details_link = a_tag.get_attribute("href")

            parsed = urlparse.urlparse(details_link)
            query = urlparse.parse_qs(parsed.query)
            if "id" not in query:
                return None
            torrent_hash = query["id"][0]

            # Size & Date
            text_block = cells[2].get_attribute("innerText")
            clean_text = " ".join(text_block.split())
            size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", clean_text, re.IGNORECASE)
            added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", clean_text, re.IGNORECASE)
            size_pretty = size_match.group(1) if size_match else None

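            # The cell text looks like "... Velkost 1.4 GB Pridany 12/03/2024
            # o 14:32 ..." (format assumed from the regexes above). The block
            # below turns the DD/MM/YYYY date into a MySQL DATETIME string,
            # e.g. "2024-03-12 14:32:00".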
            added_mysql = None
            if added_match:
                clean = added_match.group(1).replace(" o ", " ").strip()
                parts = clean.split(" ")
                if len(parts) >= 2:
                    # The try must cover the split: a malformed date raises
                    # ValueError there, not in the f-string below
                    try:
                        d, m, y = parts[0].split("/")
                        t = parts[1] + ":00" if len(parts[1].split(":")) == 2 else parts[1]
                        added_mysql = f"{y}-{m}-{d} {t}"
                    except ValueError:
                        pass

            # Image
            img_link = None
            try:
                img_a = cells[2].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
                img_src = re.search(r"src=([^ ]+)", img_a.get_attribute("onmouseover"))
                if img_src:
                    img_link = img_src.group(1).replace("'", "").strip()
                    if img_link.startswith("//"):
                        img_link = "https:" + img_link
            except Exception:
                pass

            # Stats
            seeders = int(cells[4].find_element(By.TAG_NAME, "a").text.strip())
            seeders_link = cells[4].find_element(By.TAG_NAME, "a").get_attribute("href")
            leechers = int(cells[5].find_element(By.TAG_NAME, "a").text.strip())
            leechers_link = cells[5].find_element(By.TAG_NAME, "a").get_attribute("href")

            # Check DB
            cursor.execute("SELECT torrent_content FROM torrents WHERE torrent_hash=%s", (torrent_hash,))
            row = cursor.fetchone()
            already_have_file = row is not None and row[0] is not None

            content = None
            if not already_have_file:
                # Politeness sleep only if downloading
                time.sleep(1)
                try:
                    r = requests_session.get(download_url, timeout=10)
                    r.raise_for_status()
                    content = r.content
                except requests.RequestException:
                    pass

            return {
                "torrent_hash": torrent_hash, "details_link": details_link, "download_url": download_url,
                "category": category, "title_visible": visible_name, "title_full": full_title,
                "size_pretty": size_pretty, "added_datetime": added_mysql, "preview_image": img_link,
                "seeders": seeders, "seeders_link": seeders_link, "leechers": leechers, "leechers_link": leechers_link,
                "torrent_filename": torrent_filename, "torrent_content": content,
                "is_new_torrent": not already_have_file,
            }
        except Exception:
            return None

    # --- F. Loop through Assigned Pages ---
    for page_num in page_indices:
        url = f"{BASE_URL}&page={page_num}"
        print(f" 🔄 [Thread-{thread_id}] Scraping Page {page_num}")

        try:
            driver.get(url)
            # Close popup (simplified JS; a silent no-op when the box is absent)
            driver.execute_script("try { interstitialBox.closeit(); } catch(e) {}")

            # Row Filtering: keep only rows that look like torrent entries
            # (exactly 7 cells, with a download link in the second cell)
            rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
            real_rows = []
            for r in rows:
                cs = r.find_elements(By.TAG_NAME, "td")
                if len(cs) == 7 and cs[1].find_elements(By.TAG_NAME, "a"):
                    real_rows.append(cs)

            if not real_rows:
                print(f" ⚠️ [Thread-{thread_id}] Page {page_num} empty.")
                continue

            # Process Rows
            for cells in real_rows:
                data = parse_row(cells)
                if not data:
                    continue

                # Update Global Stats safely
                with stats_lock:
                    stats["processed"] += 1
                    if data["is_new_torrent"]:
                        stats["new"] += 1
                        stats["new_titles"].append(data["title_visible"])
                    else:
                        stats["existing"] += 1

                # Insert SQL
                sql = """
                    INSERT INTO torrents (
                        torrent_hash, details_link, download_url, category, title_visible, title_full,
                        size_pretty, added_datetime, preview_image,
                        seeders, seeders_link, leechers, leechers_link,
                        torrent_filename, torrent_content
                    ) VALUES (
                        %(torrent_hash)s, %(details_link)s, %(download_url)s, %(category)s, %(title_visible)s, %(title_full)s,
                        %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
                        %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
                        %(torrent_filename)s, %(torrent_content)s
                    )
                    ON DUPLICATE KEY UPDATE
                        seeders = VALUES(seeders),
                        leechers = VALUES(leechers),
                        download_url = VALUES(download_url),
                        torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
                """
                cursor.execute(sql, data)

        except Exception as e:
            print(f" 💥 [Thread-{thread_id}] Error on page {page_num}: {e}")

    # Cleanup
    driver.quit()
    db.close()
    print(f"🏁 [Thread-{thread_id}] Finished assigned pages.")


# ============================================================
# 2) MAIN EXECUTION
# ============================================================
if __name__ == "__main__":
    RUN_START = datetime.datetime.now()
    print(f"🚀 Starting Multithreaded Scraper with {THREADS} threads...")

    # 1. Distribute pages among threads
    # Example: with 226 pages and 5 threads, each thread gets ~46 pages
    all_pages = list(range(TOTAL_PAGES))
    chunk_size = len(all_pages) // THREADS + 1
    chunks = [all_pages[i:i + chunk_size] for i in range(0, len(all_pages), chunk_size)]
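    # Worked example of the chunking above (derived from the config values):
    # 226 // 5 + 1 = 46, so range(0, 226, 46) starts chunks at pages
    # 0, 46, 92, 138 and 184, giving sizes 46, 46, 46, 46 and 42.
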
    # 2. Start Threads
    with ThreadPoolExecutor(max_workers=THREADS) as executor:
        futures = []
        for i, page_chunk in enumerate(chunks):
            if page_chunk:  # Only start if chunk is not empty
                futures.append(executor.submit(process_page_chunk, page_chunk, i + 1))

        # Wait for all to finish (f.result() also re-raises any exception
        # that escaped a worker)
        for f in futures:
            f.result()

    # 3. Final Report
    RUN_END = datetime.datetime.now()
    print("\n✅ All threads completed.")

    body = (
        f"Run started: {RUN_START:%Y-%m-%d %H:%M:%S}\n"
        f"Run finished: {RUN_END:%Y-%m-%d %H:%M:%S}\n\n"
        f"Processed torrents: {stats['processed']}\n"
        f"New torrents saved: {stats['new']}\n"
        f"Existing torrents updated: {stats['existing']}\n"
    )
    if stats["new_titles"]:
        body += "\nNew torrents list:\n- " + "\n- ".join(stats["new_titles"])

    send_mail(to="vladimir.buzalka@buzalka.cz", subject="SKTorrent Multi-Thread Run", body=body, html=False)
    print("📧 Email report sent.")