#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
import re
import urllib.parse as urlparse
from pathlib import Path
import json
import requests
import datetime
import sys
# Ensure this file exists in your directory
from EmailMessagingGraph import send_mail
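# send_mail is a local helper module, not a PyPI package; judging by the
# name it presumably wraps the Microsoft Graph sendMail endpoint. Swap in
# your own mailer if you do not have this file.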
# ============================================================
# RUNTIME INFO
# ============================================================
RUN_START = datetime.datetime.now()
processed_count = 0
new_torrent_count = 0
existing_torrent_count = 0
new_titles = []
print(f"🕒 Run started at {RUN_START:%Y-%m-%d %H:%M:%S}")
sys.stdout.flush()
# ============================================================
# 1) MySQL CONNECTION
# ============================================================
db = pymysql.connect(
    host="192.168.1.50",
    port=3306,
    user="root",
    password="Vlado9674+",
    database="torrents",
    charset="utf8mb4",
    autocommit=True,
)
cursor = db.cursor()
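# autocommit=True means every execute() is committed immediately, so no
# explicit db.commit() calls are needed anywhere below.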
# ============================================================
# 2) Selenium setup
# ============================================================
COOKIE_FILE = Path("sktorrent_cookies.json")
# Listing endpoint (standard torrents.php).
BASE_URL = (
    "https://sktorrent.eu/torrent/torrents.php"
    "?active=0&category=24&order=data&by=DESC&zaner=&jazyk="
)
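# The query parameters mirror the site's own listing filters: category=24 is
# the scraped section and order=data&by=DESC sorts by date, newest first.
# (Exact semantics of active/zaner/jazyk follow the site's filter form; the
# names are taken from the site as-is.)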
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--disable-extensions")
driver = webdriver.Chrome(options=chrome_options)
driver.set_window_position(380, 50)
driver.set_window_size(1350, 1000)
driver.get("https://sktorrent.eu")
if COOKIE_FILE.exists():
    with open(COOKIE_FILE, "r", encoding="utf-8") as f:
        cookies = json.load(f)
    for c in cookies:
        driver.add_cookie(c)
    print("🍪 Cookies loaded.")
else:
    print("⚠️ Cookie file not found; login may be required.")
# ============================================================
# 3) requests.Session from Selenium cookies
# ============================================================
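# Mirror the (possibly authenticated) browser cookies into a plain requests
# session so the .torrent files can be downloaded without driving Selenium.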
requests_session = requests.Session()
for ck in driver.get_cookies():
    requests_session.cookies.set(ck["name"], ck["value"])
print("🔗 Requests session initialized.")
# ============================================================
# 4) Popup handler
# ============================================================
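# interstitialBox.closeit() is the site's own JS helper for its interstitial
# ad overlay (an assumption based on the call); the inline try/catch makes
# it a silent no-op on pages where no popup is present.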
def close_popup_if_any():
    try:
        driver.execute_script("try { interstitialBox.closeit(); } catch(e) {}")
        time.sleep(0.5)
    except Exception:
        pass
# ============================================================
# 5) Parse one torrent row (MODIFIED)
# ============================================================
def parse_row(cells):
    # --- 1. INITIALIZE ---
    torrent_hash = None
    download_url = None
    category = cells[0].text.strip()
    try:
        # --- 2. EXTRACT DOWNLOAD URL (Column 1) ---
        download_a = cells[1].find_element(By.TAG_NAME, "a")
        download_url = download_a.get_attribute("href")
        parsed_dl = urlparse.urlparse(download_url)
        dl_query = urlparse.parse_qs(parsed_dl.query)
        torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]
        # --- 3. EXTRACT DETAILS & HASH (Column 2) ---
        title_links = cells[2].find_elements(By.TAG_NAME, "a")
        if not title_links:
            return None
        a_tag = title_links[0]
        visible_name = a_tag.text.strip()
        full_title = a_tag.get_attribute("title")
        details_link = a_tag.get_attribute("href")
        parsed = urlparse.urlparse(details_link)
        query = urlparse.parse_qs(parsed.query)
        if "id" not in query:
            return None
        torrent_hash = query["id"][0]
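        # The row text reads e.g. "Velkost 1.4 GB ... Pridany 12/03/2024 o 20:15"
        # (Slovak for "Size" / "Added"; "o" means "at"). Dates are taken as
        # DD/MM/YYYY and converted below into a MySQL DATETIME string.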
        # --- 4. EXTRACT SIZE & DATE ---
        text_block = cells[2].get_attribute("innerText")
        text_block_clean = " ".join(text_block.split())
        size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE)
        added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE)
        size_pretty = size_match.group(1) if size_match else None
        added_pretty = added_match.group(1) if added_match else None
        added_mysql = None
        if added_pretty:
            clean = added_pretty.replace(" o ", " ").strip()
            parts = clean.split(" ")
            if len(parts) >= 2:
                date_part, time_part = parts[0], parts[1]
                if len(time_part.split(":")) == 2:
                    time_part += ":00"
                try:
                    d, m, y = date_part.split("/")
                    added_mysql = f"{y}-{m}-{d} {time_part}"
                except ValueError:
                    pass
        # --- 5. IMAGE & STATS ---
        img_link = None
        try:
            image_a = cells[2].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
            mouseover = image_a.get_attribute("onmouseover")
            img_match = re.search(r"src=([^ ]+)", mouseover)
            if img_match:
                img_link = img_match.group(1).replace("'", "").strip()
                if img_link.startswith("//"):
                    img_link = "https:" + img_link
        except Exception:
            pass
        seeders_number = int(cells[4].find_element(By.TAG_NAME, "a").text.strip())
        seeders_link = cells[4].find_element(By.TAG_NAME, "a").get_attribute("href")
        leechers_number = int(cells[5].find_element(By.TAG_NAME, "a").text.strip())
        leechers_link = cells[5].find_element(By.TAG_NAME, "a").get_attribute("href")
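        # Fetch the .torrent binary only for hashes whose content we do not
        # already hold; the sleep below is a politeness delay for the tracker.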
        # --- 6. DATABASE CHECK & DOWNLOAD ---
        cursor.execute("SELECT torrent_content FROM torrents WHERE torrent_hash=%s", (torrent_hash,))
        db_row = cursor.fetchone()
        already_have_torrent = db_row is not None and db_row[0] is not None
        torrent_content = None
        if not already_have_torrent:
            time.sleep(2)
            try:
                resp = requests_session.get(download_url, timeout=10)
                resp.raise_for_status()
                torrent_content = resp.content
            except Exception as e:
                print(f" ⚠️ Download failed for {visible_name}: {e}")
        return {
            "torrent_hash": torrent_hash,
            "details_link": details_link,
            "download_url": download_url,
            "category": category,
            "title_visible": visible_name,
            "title_full": full_title,
            "size_pretty": size_pretty,
            "added_datetime": added_mysql,
            "preview_image": img_link,
            "seeders": seeders_number,
            "seeders_link": seeders_link,
            "leechers": leechers_number,
            "leechers_link": leechers_link,
            "torrent_filename": torrent_filename,
            "torrent_content": torrent_content if not already_have_torrent else None,
            "is_new_torrent": not already_have_torrent,
        }
    except Exception as e:
        print(f"⚠️ parse_row logic failed: {e}")
        return None
# ============================================================
# 6) INSERT SQL (MODIFIED)
# ============================================================
insert_sql = """
INSERT INTO torrents (
torrent_hash, details_link, download_url, category, title_visible, title_full,
size_pretty, added_datetime, preview_image,
seeders, seeders_link, leechers, leechers_link,
torrent_filename, torrent_content
) VALUES (
%(torrent_hash)s, %(details_link)s, %(download_url)s, %(category)s, %(title_visible)s, %(title_full)s,
%(size_pretty)s, %(added_datetime)s, %(preview_image)s,
%(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
%(torrent_filename)s, %(torrent_content)s
)
ON DUPLICATE KEY UPDATE
seeders = VALUES(seeders),
leechers = VALUES(leechers),
download_url = VALUES(download_url),
torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
"""
# Note: COALESCE(VALUES(torrent_content), torrent_content)
# keeps the old value if the new one is NULL,
# and fills it in when the old one was NULL and the new one is binary.
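# Illustration, for a row that already exists (duplicate torrent_hash):
#   stored = NULL,   incoming = b'...'  ->  column becomes b'...' (backfill)
#   stored = b'...', incoming = NULL    ->  column stays  b'...' (preserved)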
# ============================================================
# 7) PROCESS ALL PAGES
# ============================================================
TOTAL_PAGES = 226
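# NOTE: 226 reflects the site's pager at the time of writing and will drift
# as torrents are added; parsing the pager's last-page link would be more
# robust (left as a suggestion, not implemented here).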
for page_num in range(0, TOTAL_PAGES):
    current_url = f"{BASE_URL}&page={page_num}"
    print(f"\n🌐 Loading Page Index {page_num} (Page {page_num + 1}/{TOTAL_PAGES})")
    driver.get(current_url)
    time.sleep(2)
    close_popup_if_any()
    # Find table rows
    rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
    # FILTER: Only keep rows that have 7 columns AND a link in the 2nd column (index 1).
    # This automatically discards headers and empty spacer rows.
    real_rows = []
    for r in rows:
        cells = r.find_elements(By.TAG_NAME, "td")
        if len(cells) == 7 and cells[1].find_elements(By.TAG_NAME, "a"):
            real_rows.append(cells)
    if not real_rows:
        print("⚠️ No data rows found on this page. Ending loop.")
        break
    page_new_items = 0
    for cells in real_rows:
        try:
            data = parse_row(cells)
        except Exception as e:
            print(f"⚠️ parse_row failed: {e}")
            continue
        if not data:
            continue
        processed_count += 1
        if data["is_new_torrent"]:
            new_torrent_count += 1
            page_new_items += 1
            new_titles.append(data["title_visible"])
            print(f"💾 NEW: {data['title_visible']}")
        else:
            existing_torrent_count += 1
            print(f"♻️ UPDATING: {data['title_visible']}")
        cursor.execute(insert_sql, data)
    # If an entire page is old news, we can stop the deep crawl:
    # if page_new_items == 0 and page_num > 0:
    #     print("🛑 Page contained only known items. Sync complete.")
    #     break
    time.sleep(1)
# ============================================================
# 8) SEND EMAIL REPORT
# ============================================================
RUN_END = datetime.datetime.now()
subject = f"SKTorrent run {RUN_START:%Y-%m-%d %H:%M}"
body = (
    f"Run started: {RUN_START:%Y-%m-%d %H:%M:%S}\n"
    f"Run finished: {RUN_END:%Y-%m-%d %H:%M:%S}\n\n"
    f"Processed torrents: {processed_count}\n"
    f"New torrents saved: {new_torrent_count}\n"
    f"Existing torrents updated: {existing_torrent_count}\n"
)
if new_titles:
    body += "\nNew torrents list:\n- " + "\n- ".join(new_titles)
send_mail(to="vladimir.buzalka@buzalka.cz", subject=subject, body=body, html=False)
print("📧 Email report sent.")
driver.quit()
print("🎉 DONE")