2026-02-01 07:18:20 +01:00
parent 7b0404bfe3
commit 3d11661997
7 changed files with 1074 additions and 116 deletions

5threaddownloader.py (new file, 292 lines)

@@ -0,0 +1,292 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
import re
import urllib.parse as urlparse
from pathlib import Path
import json
import requests
import datetime
import sys
import threading
from concurrent.futures import ThreadPoolExecutor
# Ensure this file exists in your directory
from EmailMessagingGraph import send_mail
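# Assumed interface, inferred from the call at the bottom of this script:
#   send_mail(to=..., subject=..., body=..., html=False)
# Adjust if your EmailMessagingGraph module exposes a different signature.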
# ============================================================
# CONFIGURATION
# ============================================================
TOTAL_PAGES = 226
THREADS = 5
COOKIE_FILE = Path("sktorrent_cookies.json")
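# Expected to hold a JSON list of Selenium cookie dicts (e.g. exported via
# driver.get_cookies()); if it is missing, the workers browse sktorrent.eu unauthenticated.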
# Database settings
DB_CONFIG = {
"host": "192.168.1.50",
"port": 3306,
"user": "root",
"password": "Vlado9674+",
"database": "torrents",
"charset": "utf8mb4",
"autocommit": True,
}
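# The INSERT further down assumes a `torrents` table roughly like the sketch below.
# This is a hedged guess, not the authoritative schema: the column names come from the
# INSERT statement, but the types and the UNIQUE key on torrent_hash are assumptions.
#   CREATE TABLE torrents (
#       torrent_hash     VARCHAR(64) NOT NULL UNIQUE,
#       details_link     TEXT, download_url TEXT, category VARCHAR(255),
#       title_visible    TEXT, title_full TEXT,
#       size_pretty      VARCHAR(32), added_datetime DATETIME, preview_image TEXT,
#       seeders          INT, seeders_link TEXT, leechers INT, leechers_link TEXT,
#       torrent_filename VARCHAR(255), torrent_content LONGBLOB
#   ) CHARACTER SET utf8mb4;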
BASE_URL = (
"https://sktorrent.eu/torrent/torrents.php"
"?active=0&category=24&order=data&by=DESC&zaner=&jazyk="
)
# Global counters for the run report
stats_lock = threading.Lock()
stats = {
"processed": 0,
"new": 0,
"existing": 0,
"new_titles": []
}
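# Workers only touch these counters while holding stats_lock; the main thread
# reads them once every worker has finished.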
# ============================================================
# 1) WORKER FUNCTION (Runs inside each thread)
# ============================================================
def process_page_chunk(page_indices, thread_id):
"""
This function creates its OWN browser and OWN database connection.
It processes the specific list of page numbers assigned to it.
"""
print(f"🧵 [Thread-{thread_id}] Starting. Assigned {len(page_indices)} pages.")
# --- A. Setup Independent DB Connection ---
try:
db = pymysql.connect(**DB_CONFIG)
cursor = db.cursor()
except Exception as e:
print(f"❌ [Thread-{thread_id}] DB Connection failed: {e}")
return
# --- B. Setup Independent Selenium Driver ---
chrome_options = Options()
# Headless mode keeps the run from opening a visible browser window for every thread
chrome_options.add_argument("--headless=new")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--log-level=3") # Reduce noise
driver = webdriver.Chrome(options=chrome_options)
driver.set_window_size(1350, 1000)
# --- C. Login / Cookies ---
driver.get("https://sktorrent.eu")
if COOKIE_FILE.exists():
with open(COOKIE_FILE, "r", encoding="utf-8") as f:
cookies = json.load(f)
for c in cookies:
driver.add_cookie(c)
# --- D. Requests Session ---
requests_session = requests.Session()
for ck in driver.get_cookies():
requests_session.cookies.set(ck["name"], ck["value"])
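# Mirror the browser's cookies into a plain requests session so the .torrent
# payloads can be fetched over HTTP without going through Selenium.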
# --- E. Helper: Parse Row (Local scope) ---
def parse_row(cells):
try:
category = cells[0].text.strip()
# Download URL
download_a = cells[1].find_element(By.TAG_NAME, "a")
download_url = download_a.get_attribute("href")
parsed_dl = urlparse.urlparse(download_url)
dl_query = urlparse.parse_qs(parsed_dl.query)
torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]
# Details & Hash
title_links = cells[2].find_elements(By.TAG_NAME, "a")
if not title_links: return None
a_tag = title_links[0]
visible_name = a_tag.text.strip()
full_title = a_tag.get_attribute("title")
details_link = a_tag.get_attribute("href")
parsed = urlparse.urlparse(details_link)
query = urlparse.parse_qs(parsed.query)
if "id" not in query: return None
torrent_hash = query["id"][0]
# Size & Date
text_block = cells[2].get_attribute("innerText")
clean_text = " ".join(text_block.split())
size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", clean_text, re.IGNORECASE)
added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", clean_text, re.IGNORECASE)
size_pretty = size_match.group(1) if size_match else None
added_mysql = None
if added_match:
# Site shows "DD/MM/YYYY o HH:MM"; convert it to a MySQL DATETIME string
clean = added_match.group(1).replace(" o ", " ").strip()
parts = clean.split(" ")
if len(parts) >= 2:
try:
d, m, y = parts[0].split("/")
t = parts[1] + ":00" if len(parts[1].split(":")) == 2 else parts[1]
added_mysql = f"{y}-{m}-{d} {t}"
except ValueError:
pass  # Malformed date: keep added_mysql as None rather than dropping the row
# Image
img_link = None
try:
img_a = cells[2].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
img_src = re.search(r"src=([^ ]+)", img_a.get_attribute("onmouseover"))
if img_src:
img_link = img_src.group(1).replace("'", "").strip()
if img_link.startswith("//"): img_link = "https:" + img_link
except Exception:
pass  # No preview image for this row
# Stats
seeders = int(cells[4].find_element(By.TAG_NAME, "a").text.strip())
seeders_link = cells[4].find_element(By.TAG_NAME, "a").get_attribute("href")
leechers = int(cells[5].find_element(By.TAG_NAME, "a").text.strip())
leechers_link = cells[5].find_element(By.TAG_NAME, "a").get_attribute("href")
# Check DB
cursor.execute("SELECT torrent_content FROM torrents WHERE torrent_hash=%s", (torrent_hash,))
row = cursor.fetchone()
already_have_file = row is not None and row[0] is not None
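# Only fetch the .torrent payload if it has never been stored; rows that already
# have content just get their seeders/leechers refreshed by the upsert below.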
content = None
if not already_have_file:
# Politeness sleep only if downloading
time.sleep(1)
try:
r = requests_session.get(download_url, timeout=10)
r.raise_for_status()
content = r.content
except Exception:
pass  # Download failed; a later run can still fill torrent_content via COALESCE
return {
"torrent_hash": torrent_hash, "details_link": details_link, "download_url": download_url,
"category": category, "title_visible": visible_name, "title_full": full_title,
"size_pretty": size_pretty, "added_datetime": added_mysql, "preview_image": img_link,
"seeders": seeders, "seeders_link": seeders_link, "leechers": leechers, "leechers_link": leechers_link,
"torrent_filename": torrent_filename, "torrent_content": content,
"is_new_torrent": not already_have_file
}
except Exception:
return None
# --- F. Loop through Assigned Pages ---
for page_num in page_indices:
url = f"{BASE_URL}&page={page_num}"
print(f" 🔄 [Thread-{thread_id}] Scraping Page {page_num}")
try:
driver.get(url)
# Close popup (simplified JS)
driver.execute_script("try { interstitialBox.closeit(); } catch(e) {}")
# Row Filtering
rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
real_rows = []
for r in rows:
cs = r.find_elements(By.TAG_NAME, "td")
if len(cs) == 7 and cs[1].find_elements(By.TAG_NAME, "a"):
real_rows.append(cs)
if not real_rows:
print(f" ⚠️ [Thread-{thread_id}] Page {page_num} empty.")
continue
# Process Rows
for cells in real_rows:
data = parse_row(cells)
if not data: continue
# Update Global Stats safely
with stats_lock:
stats["processed"] += 1
if data["is_new_torrent"]:
stats["new"] += 1
stats["new_titles"].append(data["title_visible"])
else:
stats["existing"] += 1
# Insert SQL
sql = """
INSERT INTO torrents (
torrent_hash, details_link, download_url, category, title_visible, title_full,
size_pretty, added_datetime, preview_image,
seeders, seeders_link, leechers, leechers_link,
torrent_filename, torrent_content
) VALUES (
%(torrent_hash)s, %(details_link)s, %(download_url)s, %(category)s, %(title_visible)s, %(title_full)s,
%(size_pretty)s, %(added_datetime)s, %(preview_image)s,
%(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
%(torrent_filename)s, %(torrent_content)s
)
ON DUPLICATE KEY UPDATE
seeders = VALUES(seeders),
leechers = VALUES(leechers),
download_url = VALUES(download_url),
torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
"""
cursor.execute(sql, data)
except Exception as e:
print(f" 💥 [Thread-{thread_id}] Error on page {page_num}: {e}")
# Cleanup
driver.quit()
db.close()
print(f"🏁 [Thread-{thread_id}] Finished assigned pages.")
# ============================================================
# 2) MAIN EXECUTION
# ============================================================
if __name__ == "__main__":
RUN_START = datetime.datetime.now()
print(f"🚀 Starting Multithreaded Scraper with {THREADS} threads...")
# 1. Distribute pages among threads
# Example: 226 pages over 5 threads -> chunk_size 46 -> four chunks of 46 pages plus one of 42
all_pages = list(range(TOTAL_PAGES))
chunk_size = len(all_pages) // THREADS + 1
chunks = [all_pages[i:i + chunk_size] for i in range(0, len(all_pages), chunk_size)]
# 2. Start Threads
with ThreadPoolExecutor(max_workers=THREADS) as executor:
futures = []
for i, page_chunk in enumerate(chunks):
if page_chunk: # Only start if chunk is not empty
futures.append(executor.submit(process_page_chunk, page_chunk, i + 1))
# Wait for all to finish
for f in futures:
f.result()
# 3. Final Report
RUN_END = datetime.datetime.now()
print("\n✅ All threads completed.")
body = (
f"Run started: {RUN_START:%Y-%m-%d %H:%M:%S}\n"
f"Run finished: {RUN_END:%Y-%m-%d %H:%M:%S}\n\n"
f"Processed torrents: {stats['processed']}\n"
f"New torrents saved: {stats['new']}\n"
f"Existing torrents updated: {stats['existing']}\n"
)
if stats["new_titles"]:
body += "\nNew torrents list:\n- " + "\n- ".join(stats["new_titles"])
send_mail(to="vladimir.buzalka@buzalka.cz", subject="SKTorrent Multi-Thread Run", body=body, html=False)
print("📧 Email report sent.")