# --- NOTE(review): the lines below are repository-viewer (web UI) chrome
# --- accidentally captured in a copy/paste; commented out so the file is
# --- valid Python. Original text preserved:
# Files
# torrents/90 Import all torrents from all pages.py
# 2026-01-30 10:28:42 +01:00
#
# 308 lines
# 9.9 KiB
# Python
# Raw Blame History
#
# This file contains ambiguous Unicode characters
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import datetime
import json
import os
import re
import sys
import time
import urllib.parse as urlparse
from pathlib import Path

import pymysql
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Ensure this file exists in your directory
from EmailMessagingGraph import send_mail
# ============================================================
# RUNTIME INFO
# ============================================================
# Global run statistics, accumulated during the crawl and reported
# by email at the end of the script.
RUN_START = datetime.datetime.now()  # wall-clock start time (local, naive)
processed_count = 0  # rows successfully parsed (new + already known)
new_torrent_count = 0  # rows whose .torrent content was downloaded this run
existing_torrent_count = 0  # rows that already had torrent_content stored
new_titles = []  # visible titles of the new torrents, for the email report
print(f"🕒 Run started at {RUN_START:%Y-%m-%d %H:%M:%S}")
sys.stdout.flush()  # make progress visible immediately when output is piped
# ============================================================
# 1) MySQL CONNECTION
# ============================================================
# NOTE(review): the DB password was hard-coded in source. It can now be
# supplied via the TORRENTS_DB_PASSWORD environment variable; the old
# literal remains the fallback so existing deployments keep working.
# Consider rotating this credential and removing the fallback.
db = pymysql.connect(
    host="192.168.1.50",
    port=3306,
    user="root",
    password=os.environ.get("TORRENTS_DB_PASSWORD", "Vlado9674+"),
    database="torrents",
    charset="utf8mb4",
    autocommit=True,  # each execute() commits immediately; no explicit commit below
)
cursor = db.cursor()
# ============================================================
# 2) Selenium setup
# ============================================================
# JSON export of an authenticated session's cookies (produced by a
# separate login helper — presumably; confirm against that script).
COOKIE_FILE = Path("sktorrent_cookies.json")
# Updated to standard torrents.php as requested
# Listing URL: category 24, ordered by date descending; "&page=N" is
# appended per page in the crawl loop below.
BASE_URL = (
    "https://sktorrent.eu/torrent/torrents.php"
    "?active=0&category=24&order=data&by=DESC&zaner=&jazyk="
)
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--disable-extensions")
driver = webdriver.Chrome(options=chrome_options)
# Fixed window geometry so the browser does not cover the whole screen.
driver.set_window_position(380, 50)
driver.set_window_size(1350, 1000)
# Load saved login cookies into the browser. Selenium only accepts
# cookies for the domain currently loaded, so the site is opened first.
driver.get("https://sktorrent.eu")
if COOKIE_FILE.exists():
    with open(COOKIE_FILE, "r", encoding="utf-8") as f:
        cookies = json.load(f)
    for c in cookies:
        try:
            driver.add_cookie(c)
        except Exception as e:
            # One malformed cookie (e.g. bad "expiry"/"sameSite" field)
            # previously aborted the whole run; skip it and continue.
            print(f"⚠️ Skipping cookie {c.get('name')!r}: {e}")
    print("🍪 Cookies loaded.")
else:
    print("⚠️ Cookie file not found - login may be required.")
# ============================================================
# 3) requests.Session from Selenium cookies
# ============================================================
# Mirror the browser's authenticated cookies into a plain requests
# session so .torrent files can be fetched without going through Selenium.
requests_session = requests.Session()
for browser_cookie in driver.get_cookies():
    name, value = browser_cookie["name"], browser_cookie["value"]
    requests_session.cookies.set(name, value)
print("🔗 Requests session initialized.")
# ============================================================
# 4) Popup handler
# ============================================================
def close_popup_if_any():
    """Dismiss the site's interstitial popup if one is present.

    Runs a small JS snippet that invokes the popup's own close handler;
    any failure (popup absent, JS error, driver error) is ignored.
    """
    js = "try { interstitialBox.closeit(); } catch(e) {}"
    try:
        driver.execute_script(js)
        time.sleep(0.5)  # brief pause to let the DOM settle
    except Exception:
        pass
# ============================================================
# 5) Parse one torrent row
# ============================================================
def parse_row(cells):
    """Parse one 7-cell torrent table row into a dict matching insert_sql.

    ``cells`` is the list of <td> WebElements of a single data row.

    Returns None when the row is not a real torrent row (no download
    link, no title link, or no ``id`` parameter in the details URL).

    Side effects: queries the ``torrents`` table for existing binary
    content, and — for torrents we do not yet hold — downloads the
    .torrent file through the authenticated requests session (with a
    politeness delay).
    """
    # Column 0: category icon/text
    category = cells[0].text.strip()

    # Column 1: download icon link; rows without one are not data rows.
    try:
        download_a = cells[1].find_element(By.TAG_NAME, "a")
        download_link = download_a.get_attribute("href")
    except Exception:  # was a bare except: would also swallow KeyboardInterrupt
        return None

    # Torrent filename comes from the "f" query parameter of the link.
    parsed_dl = urlparse.urlparse(download_link)
    dl_query = urlparse.parse_qs(parsed_dl.query)
    torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]

    # Column 2: name and info
    title_links = cells[2].find_elements(By.TAG_NAME, "a")
    if not title_links:
        return None
    a_tag = title_links[0]
    visible_name = a_tag.text.strip()
    full_title = a_tag.get_attribute("title")
    details_link = a_tag.get_attribute("href")
    parsed = urlparse.urlparse(details_link)
    query = urlparse.parse_qs(parsed.query)
    if "id" not in query:
        return None
    # The "id" query parameter serves as the unique torrent key.
    torrent_hash = query["id"][0]

    # Use innerText for robust text extraction; collapse all whitespace.
    text_block = cells[2].get_attribute("innerText")
    text_block_clean = " ".join(text_block.split())

    # Regexes for size ("Velkost 1.4 GB") and added date ("Pridany ...").
    size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE)
    added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE)
    size_pretty = size_match.group(1) if size_match else None
    added_pretty = added_match.group(1) if added_match else None

    # Date conversion: "29/11/2025 o 02:29" -> "2025-11-29 02:29:00" (MySQL)
    added_mysql = None
    if added_pretty:
        clean = added_pretty.replace(" o ", " ").strip()
        parts = clean.split(" ")
        if len(parts) >= 2:
            date_part = parts[0]
            time_part = parts[1]
            if len(time_part.split(":")) == 2:
                time_part += ":00"  # the site omits seconds
            try:
                day, month, year = date_part.split("/")
                added_mysql = f"{year}-{month}-{day} {time_part}"
            except ValueError:  # unexpected date format -> store NULL
                added_mysql = None

    # Column 2: preview image URL hidden in the link's onmouseover JS.
    img_link = None
    try:
        image_a = cells[2].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
        mouseover = image_a.get_attribute("onmouseover")
        img_match = re.search(r"src=([^ ]+)", mouseover)
        if img_match:
            img_link = img_match.group(1).replace("'", "").strip()
            if img_link.startswith("//"):
                img_link = "https:" + img_link
    except Exception:  # no preview image on this row — fine
        pass

    # Column 4: seeders count + link
    seeders_a = cells[4].find_element(By.TAG_NAME, "a")
    seeders_number = int(seeders_a.text.strip())
    seeders_link = seeders_a.get_attribute("href")
    # Column 5: leechers count + link
    leechers_a = cells[5].find_element(By.TAG_NAME, "a")
    leechers_number = int(leechers_a.text.strip())
    leechers_link = leechers_a.get_attribute("href")

    # Only download the .torrent file if we do not already hold its bytes.
    cursor.execute("SELECT torrent_content FROM torrents WHERE torrent_hash=%s", (torrent_hash,))
    row = cursor.fetchone()
    already_have_torrent = row is not None and row[0] is not None
    torrent_content = None
    if not already_have_torrent:
        time.sleep(3)  # Politeness delay
        try:
            resp = requests_session.get(download_link)
            resp.raise_for_status()
            torrent_content = resp.content
        except Exception:  # download failure -> insert the row without content
            torrent_content = None

    return {
        "torrent_hash": torrent_hash,
        "details_link": details_link,
        "category": category,
        "title_visible": visible_name,
        "title_full": full_title,
        "size_pretty": size_pretty,
        "added_datetime": added_mysql,
        "preview_image": img_link,
        "seeders": seeders_number,
        "seeders_link": seeders_link,
        "leechers": leechers_number,
        "leechers_link": leechers_link,
        "torrent_filename": torrent_filename,
        "torrent_content": torrent_content if not already_have_torrent else None,
        "is_new_torrent": not already_have_torrent,
    }
# ============================================================
# 6) INSERT SQL
# ============================================================
# Upsert keyed on torrent_hash (presumably UNIQUE/PRIMARY KEY in the
# table — confirm against the schema). On conflict only the volatile
# fields (seeders/leechers) are refreshed, and torrent_content is kept
# if the new row carries NULL (COALESCE), so stored binaries are never
# overwritten with nothing.
# NOTE(review): VALUES() in ON DUPLICATE KEY UPDATE is deprecated in
# MySQL 8.0.20+ in favor of row aliases; it still works but may warn.
insert_sql = """
INSERT INTO torrents (
torrent_hash, details_link, category, title_visible, title_full,
size_pretty, added_datetime, preview_image,
seeders, seeders_link, leechers, leechers_link,
torrent_filename, torrent_content
) VALUES (
%(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
%(size_pretty)s, %(added_datetime)s, %(preview_image)s,
%(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
%(torrent_filename)s, %(torrent_content)s
)
ON DUPLICATE KEY UPDATE
seeders = VALUES(seeders),
leechers = VALUES(leechers),
torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
"""
# ============================================================
# 7) PROCESS ALL PAGES
# ============================================================
TOTAL_PAGES = 226  # current page count of the category listing

for page_num in range(TOTAL_PAGES):
    current_url = f"{BASE_URL}&page={page_num}"
    print(f"\n🌐 Loading Page Index {page_num} (Page {page_num + 1}/{TOTAL_PAGES})")
    driver.get(current_url)
    time.sleep(2)
    close_popup_if_any()

    # Keep only rows that look like data rows: 7 <td> cells in the v1 table.
    data_rows = []
    for tr in driver.find_elements(By.CSS_SELECTOR, "table tr"):
        tds = tr.find_elements(By.TAG_NAME, "td")
        if len(tds) == 7:
            data_rows.append(tds)

    if not data_rows:
        print("⚠️ No data rows found on this page. Ending loop.")
        break

    page_new_items = 0
    for cells in data_rows:
        try:
            data = parse_row(cells)
        except Exception as e:
            print(f"⚠️ parse_row failed: {e}")
            continue
        if not data:
            continue
        processed_count += 1
        if data["is_new_torrent"]:
            new_torrent_count += 1
            page_new_items += 1
            new_titles.append(data["title_visible"])
            print(f"💾 NEW: {data['title_visible']}")
        else:
            existing_torrent_count += 1
            print(f"♻️ UPDATING: {data['title_visible']}")
        cursor.execute(insert_sql, data)

    # A fully-known page (beyond page 0) means we have caught up with the feed.
    if page_new_items == 0 and page_num > 0:
        print("🛑 Page contained only known items. Sync complete.")
        break
    time.sleep(1)
# ============================================================
# 8) SEND EMAIL REPORT
# ============================================================
# Summarize the run's counters and email them, then shut the browser down.
RUN_END = datetime.datetime.now()
subject = f"SKTorrent run {RUN_START:%Y-%m-%d %H:%M}"
report_lines = [
    f"Run started: {RUN_START:%Y-%m-%d %H:%M:%S}",
    f"Run finished: {RUN_END:%Y-%m-%d %H:%M:%S}",
    "",
    f"Processed torrents: {processed_count}",
    f"New torrents saved: {new_torrent_count}",
    f"Existing torrents updated: {existing_torrent_count}",
]
body = "\n".join(report_lines) + "\n"
if new_titles:
    body += "\nNew torrents list:\n- " + "\n- ".join(new_titles)
send_mail(to="vladimir.buzalka@buzalka.cz", subject=subject, body=body, html=False)
print("📧 Email report sent.")
driver.quit()
print("🎉 DONE")