#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
import re
import urllib.parse as urlparse
from pathlib import Path
import json
import requests
import datetime
import threading
from concurrent.futures import ThreadPoolExecutor

# Ensure this file exists in your directory
from EmailMessagingGraph import send_mail

# ============================================================
# CONFIGURATION
# ============================================================
TOTAL_PAGES = 226
THREADS = 5
COOKIE_FILE = Path("sktorrent_cookies.json")

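# The scraper expects COOKIE_FILE to hold cookies from an already logged-in
# session. A minimal sketch for producing it (a hypothetical one-off helper,
# not part of the original script): open a visible browser, log in by hand,
# then dump the cookies:
#
#     driver = webdriver.Chrome()
#     driver.get("https://sktorrent.eu")
#     input("Log in in the browser window, then press Enter...")
#     with open(COOKIE_FILE, "w", encoding="utf-8") as f:
#         json.dump(driver.get_cookies(), f)
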
# Database settings
DB_CONFIG = {
    "host": "192.168.1.50",
    "port": 3306,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": True,
}

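# The upsert in process_page_chunk() relies on torrent_hash having a UNIQUE
# (or PRIMARY KEY) index so that ON DUPLICATE KEY UPDATE fires. An assumed
# schema sketch; column types are guesses inferred from the parsed values,
# not the original DDL:
#
#     CREATE TABLE IF NOT EXISTS torrents (
#         torrent_hash     VARCHAR(64)  NOT NULL PRIMARY KEY,
#         details_link     TEXT,
#         download_url     TEXT,
#         category         VARCHAR(255),
#         title_visible    VARCHAR(512),
#         title_full       VARCHAR(512),
#         size_pretty      VARCHAR(32),
#         added_datetime   DATETIME NULL,
#         preview_image    TEXT,
#         seeders          INT,
#         seeders_link     TEXT,
#         leechers         INT,
#         leechers_link    TEXT,
#         torrent_filename VARCHAR(255),
#         torrent_content  LONGBLOB NULL
#     );
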
BASE_URL = (
    "https://sktorrent.eu/torrent/torrents.php"
    "?active=0&category=24&order=data&by=DESC&zaner=&jazyk="
)

# Global counters for reporting (updated only while holding stats_lock,
# since concurrent threads would otherwise race on these values)
stats_lock = threading.Lock()
stats = {
    "processed": 0,
    "new": 0,
    "existing": 0,
    "new_titles": [],
}


# ============================================================
# 1) WORKER FUNCTION (Runs inside each thread)
# ============================================================
def process_page_chunk(page_indices, thread_id):
    """
    This function creates its OWN browser and OWN database connection.
    It processes the specific list of page numbers assigned to it.
    """
    print(f"🧵 [Thread-{thread_id}] Starting. Assigned {len(page_indices)} pages.")

    # --- A. Setup Independent DB Connection ---
    try:
        db = pymysql.connect(**DB_CONFIG)
        cursor = db.cursor()
    except Exception as e:
        print(f"❌ [Thread-{thread_id}] DB Connection failed: {e}")
        return

    # --- B. Setup Independent Selenium Driver ---
    chrome_options = Options()
    # HEADLESS MODE is safer for 5 threads: it avoids popping up 5 windows
    chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_argument("--disable-popup-blocking")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--log-level=3")  # Reduce noise

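    # Note: Selenium 4.6+ locates a matching chromedriver automatically via
    # Selenium Manager; on older versions chromedriver must be on PATH.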
    driver = webdriver.Chrome(options=chrome_options)
    driver.set_window_size(1350, 1000)

    # --- C. Login / Cookies ---
    driver.get("https://sktorrent.eu")
    if COOKIE_FILE.exists():
        with open(COOKIE_FILE, "r", encoding="utf-8") as f:
            cookies = json.load(f)
        for c in cookies:
            driver.add_cookie(c)

    # --- D. Requests Session ---
    requests_session = requests.Session()
    for ck in driver.get_cookies():
        requests_session.cookies.set(ck["name"], ck["value"])
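    # Only name/value pairs are copied over, so the session attaches these
    # cookies to every request it makes; that is acceptable here because it
    # only fetches download URLs on sktorrent.eu.
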
    # --- E. Helper: Parse Row (Local scope) ---
    def parse_row(cells):
        try:
            category = cells[0].text.strip()

            # Download URL
            download_a = cells[1].find_element(By.TAG_NAME, "a")
            download_url = download_a.get_attribute("href")

            parsed_dl = urlparse.urlparse(download_url)
            dl_query = urlparse.parse_qs(parsed_dl.query)
            torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]

            # Details & Hash
            title_links = cells[2].find_elements(By.TAG_NAME, "a")
            if not title_links:
                return None
            a_tag = title_links[0]
            visible_name = a_tag.text.strip()
            full_title = a_tag.get_attribute("title")
            details_link = a_tag.get_attribute("href")

            parsed = urlparse.urlparse(details_link)
            query = urlparse.parse_qs(parsed.query)
            if "id" not in query:
                return None
            torrent_hash = query["id"][0]

            # Size & Date
            text_block = cells[2].get_attribute("innerText")
            clean_text = " ".join(text_block.split())
            size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", clean_text, re.IGNORECASE)
            added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", clean_text, re.IGNORECASE)
            size_pretty = size_match.group(1) if size_match else None

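            # The cell text looks like "... Velkost 1.4 GB Pridany 12/03/2024
            # o 14:32 ..." (format assumed from the regexes above). The block
            # below turns the DD/MM/YYYY date into a MySQL DATETIME string,
            # e.g. "2024-03-12 14:32:00".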
            added_mysql = None
            if added_match:
                clean = added_match.group(1).replace(" o ", " ").strip()
                parts = clean.split(" ")
                if len(parts) >= 2:
                    # The try must cover the split: a malformed date raises
                    # ValueError there, not in the f-string below
                    try:
                        d, m, y = parts[0].split("/")
                        t = parts[1] + ":00" if len(parts[1].split(":")) == 2 else parts[1]
                        added_mysql = f"{y}-{m}-{d} {t}"
                    except ValueError:
                        pass

            # Image
            img_link = None
            try:
                img_a = cells[2].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
                img_src = re.search(r"src=([^ ]+)", img_a.get_attribute("onmouseover"))
                if img_src:
                    img_link = img_src.group(1).replace("'", "").strip()
                    if img_link.startswith("//"):
                        img_link = "https:" + img_link
            except Exception:
                pass

            # Stats
            seeders = int(cells[4].find_element(By.TAG_NAME, "a").text.strip())
            seeders_link = cells[4].find_element(By.TAG_NAME, "a").get_attribute("href")
            leechers = int(cells[5].find_element(By.TAG_NAME, "a").text.strip())
            leechers_link = cells[5].find_element(By.TAG_NAME, "a").get_attribute("href")

            # Check DB
            cursor.execute("SELECT torrent_content FROM torrents WHERE torrent_hash=%s", (torrent_hash,))
            row = cursor.fetchone()
            already_have_file = row is not None and row[0] is not None

            content = None
            if not already_have_file:
                # Politeness sleep only if downloading
                time.sleep(1)
                try:
                    r = requests_session.get(download_url, timeout=10)
                    r.raise_for_status()
                    content = r.content
                except requests.RequestException:
                    pass

            return {
                "torrent_hash": torrent_hash, "details_link": details_link, "download_url": download_url,
                "category": category, "title_visible": visible_name, "title_full": full_title,
                "size_pretty": size_pretty, "added_datetime": added_mysql, "preview_image": img_link,
                "seeders": seeders, "seeders_link": seeders_link, "leechers": leechers, "leechers_link": leechers_link,
                "torrent_filename": torrent_filename, "torrent_content": content,
                "is_new_torrent": not already_have_file,
            }
        except Exception:
            return None

    # --- F. Loop through Assigned Pages ---
    for page_num in page_indices:
        url = f"{BASE_URL}&page={page_num}"
        print(f" 🔄 [Thread-{thread_id}] Scraping Page {page_num}")

        try:
            driver.get(url)
            # Close popup (simplified JS; a silent no-op when the box is absent)
            driver.execute_script("try { interstitialBox.closeit(); } catch(e) {}")

            # Row Filtering: keep only rows that look like torrent entries
            # (exactly 7 cells, with a download link in the second cell)
            rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
            real_rows = []
            for r in rows:
                cs = r.find_elements(By.TAG_NAME, "td")
                if len(cs) == 7 and cs[1].find_elements(By.TAG_NAME, "a"):
                    real_rows.append(cs)

            if not real_rows:
                print(f" ⚠️ [Thread-{thread_id}] Page {page_num} empty.")
                continue

            # Process Rows
            for cells in real_rows:
                data = parse_row(cells)
                if not data:
                    continue

                # Update Global Stats safely
                with stats_lock:
                    stats["processed"] += 1
                    if data["is_new_torrent"]:
                        stats["new"] += 1
                        stats["new_titles"].append(data["title_visible"])
                    else:
                        stats["existing"] += 1

                # Insert SQL
                sql = """
                    INSERT INTO torrents (
                        torrent_hash, details_link, download_url, category, title_visible, title_full,
                        size_pretty, added_datetime, preview_image,
                        seeders, seeders_link, leechers, leechers_link,
                        torrent_filename, torrent_content
                    ) VALUES (
                        %(torrent_hash)s, %(details_link)s, %(download_url)s, %(category)s, %(title_visible)s, %(title_full)s,
                        %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
                        %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
                        %(torrent_filename)s, %(torrent_content)s
                    )
                    ON DUPLICATE KEY UPDATE
                        seeders = VALUES(seeders),
                        leechers = VALUES(leechers),
                        download_url = VALUES(download_url),
                        torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
                """
                cursor.execute(sql, data)

        except Exception as e:
            print(f" 💥 [Thread-{thread_id}] Error on page {page_num}: {e}")

    # Cleanup
    driver.quit()
    db.close()
    print(f"🏁 [Thread-{thread_id}] Finished assigned pages.")


# ============================================================
# 2) MAIN EXECUTION
# ============================================================
if __name__ == "__main__":
    RUN_START = datetime.datetime.now()
    print(f"🚀 Starting Multithreaded Scraper with {THREADS} threads...")

    # 1. Distribute pages among threads
    # Example: with 226 pages and 5 threads, each thread gets ~46 pages
    all_pages = list(range(TOTAL_PAGES))
    chunk_size = len(all_pages) // THREADS + 1
    chunks = [all_pages[i:i + chunk_size] for i in range(0, len(all_pages), chunk_size)]
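    # Worked example of the chunking above (derived from the config values):
    # 226 // 5 + 1 = 46, so range(0, 226, 46) starts chunks at pages
    # 0, 46, 92, 138 and 184, giving sizes 46, 46, 46, 46 and 42.
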
    # 2. Start Threads
    with ThreadPoolExecutor(max_workers=THREADS) as executor:
        futures = []
        for i, page_chunk in enumerate(chunks):
            if page_chunk:  # Only start if chunk is not empty
                futures.append(executor.submit(process_page_chunk, page_chunk, i + 1))

        # Wait for all to finish (f.result() also re-raises any exception
        # that escaped a worker)
        for f in futures:
            f.result()

    # 3. Final Report
    RUN_END = datetime.datetime.now()
    print("\n✅ All threads completed.")

    body = (
        f"Run started: {RUN_START:%Y-%m-%d %H:%M:%S}\n"
        f"Run finished: {RUN_END:%Y-%m-%d %H:%M:%S}\n\n"
        f"Processed torrents: {stats['processed']}\n"
        f"New torrents saved: {stats['new']}\n"
        f"Existing torrents updated: {stats['existing']}\n"
    )
    if stats["new_titles"]:
        body += "\nNew torrents list:\n- " + "\n- ".join(stats["new_titles"])

    send_mail(to="vladimir.buzalka@buzalka.cz", subject="SKTorrent Multi-Thread Run", body=body, html=False)
    print("📧 Email report sent.")