# torrents/30 OpenTextListing v4.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
import os
import re
import time
import urllib.parse as urlparse
from datetime import datetime
from pathlib import Path

import pymysql
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


# ============================================================
# 1) MySQL CONNECTION
# ============================================================

# Connection settings can be overridden through TORRENTS_DB_* environment
# variables so that credentials do not have to live in the source file.
# The literal fallbacks keep the script working unchanged when nothing is set.
# NOTE(review): the fallback password is still a secret committed to source —
# rotate it and drop the default once the environment is configured.
db = pymysql.connect(
    host=os.environ.get("TORRENTS_DB_HOST", "192.168.1.76"),
    port=int(os.environ.get("TORRENTS_DB_PORT", "3307")),
    user=os.environ.get("TORRENTS_DB_USER", "root"),
    password=os.environ.get("TORRENTS_DB_PASSWORD", "Vlado9674+"),
    database=os.environ.get("TORRENTS_DB_NAME", "torrents"),
    charset="utf8mb4",
    # Each statement is committed immediately; the script never calls commit().
    autocommit=True,
)

# Single shared cursor used for every insert/update in this script.
cursor = db.cursor()


# ============================================================
# 2) Selenium setup
# ============================================================

# JSON file holding the previously saved browser cookies.
COOKIE_FILE = Path("sktorrent_cookies.json")

# First listing page for category 24, ordered by date, descending.
START_URL = (
    "https://sktorrent.eu/torrent/torrents.php"
    "?active=0&category=24&order=data&by=DESC&zaner=&jazyk=&page=0"
)

chrome_options = Options()
for chrome_flag in (
    "--start-maximized",
    "--disable-notifications",
    "--disable-popup-blocking",
    "--disable-extensions",
):
    chrome_options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=chrome_options)

# Window position and size, chosen so the browser does not cover PyCharm;
# adjust both to suit the monitor.
driver.set_window_position(380, 50)
driver.set_window_size(1350, 1000)

# Open the site's front page first so the cookies are added under its domain.
driver.get("https://sktorrent.eu")

# Restore the saved cookies, if there are any.
if not COOKIE_FILE.exists():
    print("⚠️ Cookie file not found, you may not be logged in!")
else:
    with open(COOKIE_FILE, "r") as cookie_fh:
        cookies = json.load(cookie_fh)
    for saved_cookie in cookies:
        driver.add_cookie(saved_cookie)
    print("🍪 Cookies loaded.")


# ============================================================
# 3) Copy cookies → requests.Session (for downloading .torrent files)
# ============================================================

# Mirror the browser's cookies (name and value only) into a requests session;
# parse_row uses this session to fetch the .torrent files.
requests_session = requests.Session()
for browser_cookie in driver.get_cookies():
    cookie_name = browser_cookie["name"]
    cookie_value = browser_cookie["value"]
    requests_session.cookies.set(cookie_name, cookie_value)

print("🔗 Requests session initialized with Selenium cookies.")


# ============================================================
# 4) Popup-closing helper
# ============================================================

def close_popup_if_any():
    """Ask the page to dismiss its interstitial ad via interstitialBox.closeit().

    The JavaScript snippet wraps the call in its own try/catch, so a page
    without the interstitial does not raise. Any exception raised on the
    Python side is printed and otherwise ignored.
    """
    dismiss_script = "try { interstitialBox.closeit(); } catch(e) {}"
    try:
        driver.execute_script(dismiss_script)
        # Short pause so the DOM can settle.
        time.sleep(0.5)
        print("🧹 Popup closed via JS fallback (if present).")
    except Exception as err:
        print("ℹ️ Popup JS handler not found:", err)


# ============================================================
# 5) Parsing a single row (one torrent)
# ============================================================

# Patterns applied to the whitespace-normalised text of the title cell.
_SIZE_RE = re.compile(r"Velkost ([0-9\.]+ ?[KMG]B)", re.IGNORECASE)
_ADDED_RE = re.compile(r"Pridany (.+?)(?:\sObrázok|$)", re.IGNORECASE)
# Pulls the image URL out of the preview link's onmouseover handler.
_PREVIEW_SRC_RE = re.compile(r"src=([^ ]+)")

# Upper bound, in seconds, for a single .torrent download, so that one
# stalled connection cannot hang the whole crawl.
_DOWNLOAD_TIMEOUT_S = 30


def _added_to_mysql(added_pretty):
    """Convert a listing date such as "29/11/2025 o 02:29" to MySQL DATETIME text.

    Returns "YYYY-MM-DD HH:MM:SS", or None when the input is empty or is not
    a day/month/year date optionally followed by HH:MM or HH:MM:SS.
    """
    if not added_pretty:
        return None

    # "29/11/2025 o 02:29" -> ["29/11/2025", "02:29"]; anything after the
    # time token is ignored.
    parts = added_pretty.replace(" o ", " ").split()
    if not parts:
        return None
    date_part = parts[0]
    time_part = parts[1] if len(parts) > 1 else "00:00:00"

    stamp = f"{date_part} {time_part}"
    for fmt in ("%d/%m/%Y %H:%M:%S", "%d/%m/%Y %H:%M"):
        try:
            parsed = datetime.strptime(stamp, fmt)
        except ValueError:
            continue
        return parsed.strftime("%Y-%m-%d %H:%M:%S")
    return None


def _preview_image(title_cell):
    """Return the preview image URL advertised by the 'Obrázok' link, or None."""
    try:
        anchors = title_cell.find_elements(
            By.XPATH,
            ".//a[contains(text(),'Obrázok')]"
        )
        mouseover = anchors[0].get_attribute("onmouseover") if anchors else None
    except Exception as e:
        # The preview is optional: report the problem and carry on without it.
        print(f"⚠️ Could not read preview image link: {e}")
        return None

    if not mouseover:
        return None
    src_match = _PREVIEW_SRC_RE.search(mouseover)
    if not src_match:
        return None

    img_link = src_match.group(1).replace("'", "").strip()
    # Protocol-relative URL -> absolute https URL.
    if img_link.startswith("//"):
        img_link = "https:" + img_link
    return img_link


def _peer_count(cell):
    """Read the integer shown in the first link of *cell*.

    Returns a (count, href) tuple, or None when the cell contains no link or
    the link text is not an integer.
    """
    anchors = cell.find_elements(By.TAG_NAME, "a")
    if not anchors:
        return None
    try:
        count = int(anchors[0].text.strip())
    except ValueError:
        return None
    return count, anchors[0].get_attribute("href")


def parse_row(cells):
    """Extract one torrent's data from a listing row and download its .torrent.

    cells: list of 7 WebElements (the row's <td> cells), laid out as:
      0: category
      1: download link (.torrent)
      2: title + size + date + 'Obrázok' preview link + genre
      3: -- (ignored)
      4: seeders
      5: leechers
      6: completed (not read)

    Returns a dict whose keys match the named parameters of insert_sql, or
    None when the row must be skipped (no download link, no title/details
    link, no ?id= in the details URL, or unreadable seeders/leechers).
    "torrent_content" is None when the download fails; "added_datetime" is
    None when the date is missing or unrecognised.

    Side effects: sleeps 3 seconds and performs one HTTP GET per row.
    """
    # --------------------------
    # 1) Category
    # --------------------------
    category = cells[0].text.strip()

    # --------------------------
    # 2) Download link for the .torrent file (cells[1])
    # --------------------------
    try:
        download_anchors = cells[1].find_elements(By.TAG_NAME, "a")
        download_link = (
            download_anchors[0].get_attribute("href") if download_anchors else None
        )
    except Exception:
        download_link = None
    if not download_link:
        print("⚠️ No download link in row, skipping.")
        return None

    dl_query = urlparse.parse_qs(urlparse.urlparse(download_link).query)
    torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]

    # --------------------------
    # 3) Title + details link (cells[2])
    # --------------------------
    title_links = cells[2].find_elements(By.TAG_NAME, "a")
    if not title_links:
        print("⚠️ No title link — skipping row")
        return None

    a_tag = title_links[0]
    visible_name = a_tag.text.strip()
    full_title = a_tag.get_attribute("title")
    details_link = a_tag.get_attribute("href")

    if not details_link:
        print("⚠️ Row has no details link — skipping")
        return None

    # The value of ?id= in the details URL is stored as torrent_hash.
    query = urlparse.parse_qs(urlparse.urlparse(details_link).query)
    if "id" not in query:
        print("⚠️ Skipping row with no torrent ID →", details_link)
        return None
    torrent_hash = query["id"][0]

    # --------------------------
    # 4) Size + date
    # --------------------------
    text_block = cells[2].get_attribute("innerText") or ""
    text_block_clean = " ".join(text_block.split())

    size_match = _SIZE_RE.search(text_block_clean)
    added_match = _ADDED_RE.search(text_block_clean)

    size_pretty = size_match.group(1) if size_match else None
    added_pretty = added_match.group(1) if added_match else None

    added_mysql = _added_to_mysql(added_pretty)
    if added_pretty and added_mysql is None:
        print(f"⚠️ Unrecognised date {added_pretty!r} for {torrent_hash}, leaving it empty.")

    # --------------------------
    # 5) Image preview (optional)
    # --------------------------
    img_link = _preview_image(cells[2])

    # --------------------------
    # 6) Seeders / leechers
    # --------------------------
    seeders = _peer_count(cells[4])
    leechers = _peer_count(cells[5])
    if seeders is None or leechers is None:
        # One malformed row should not abort the whole crawl.
        print(f"⚠️ Unreadable seeders/leechers for {torrent_hash} — skipping row")
        return None
    seeders_number, seeders_link = seeders
    leechers_number, leechers_link = leechers

    # --------------------------
    # 7) Download the .torrent content
    # --------------------------
    torrent_content = None
    time.sleep(3)  # pause between torrents
    try:
        resp = requests_session.get(download_link, timeout=_DOWNLOAD_TIMEOUT_S)
        resp.raise_for_status()
        torrent_content = resp.content
    except Exception as e:
        # Best effort: the row is still returned, just without file content.
        print(f"⚠️ Could not download torrent file for {torrent_hash}: {e}")
        torrent_content = None

    # --------------------------
    # Final dictionary
    # --------------------------
    return {
        "torrent_hash": torrent_hash,
        "details_link": details_link,
        "category": category,
        "title_visible": visible_name,
        "title_full": full_title,
        "size_pretty": size_pretty,
        "added_datetime": added_mysql,
        "preview_image": img_link,
        "seeders": seeders_number,
        "seeders_link": seeders_link,
        "leechers": leechers_number,
        "leechers_link": leechers_link,
        "torrent_filename": torrent_filename,
        "torrent_content": torrent_content,
    }


# ============================================================
# 6) MySQL INSERT
# ============================================================

# Upsert statement executed once per parsed row, with the dict returned by
# parse_row supplying the named %(name)s parameters.
# When the insert collides with an existing unique key, ON DUPLICATE KEY UPDATE
# overwrites every other listed column with the freshly scraped values.
# NOTE(review): presumably torrent_hash is that unique/primary key — confirm
# against the table schema, which is not visible in this file.
insert_sql = """
INSERT INTO torrents (
    torrent_hash, details_link, category, title_visible, title_full,
    size_pretty, added_datetime, preview_image,
    seeders, seeders_link, leechers, leechers_link,
    torrent_filename, torrent_content
) VALUES (
    %(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
    %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
    %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
    %(torrent_filename)s, %(torrent_content)s
)
ON DUPLICATE KEY UPDATE
    details_link = VALUES(details_link),
    category = VALUES(category),
    title_visible = VALUES(title_visible),
    title_full = VALUES(title_full),
    size_pretty = VALUES(size_pretty),
    added_datetime = VALUES(added_datetime),
    preview_image = VALUES(preview_image),
    seeders = VALUES(seeders),
    seeders_link = VALUES(seeders_link),
    leechers = VALUES(leechers),
    leechers_link = VALUES(leechers_link),
    torrent_filename = VALUES(torrent_filename),
    torrent_content = VALUES(torrent_content);
"""


# ============================================================
# 7) Processing a single page
# ============================================================

def process_current_page(page_index: int):
    """Scrape the page currently loaded in the browser and store its torrents.

    Every table row with exactly seven <td> cells is treated as a torrent
    row. All such rows are collected first; each is then passed to parse_row.
    Rows for which parse_row returns nothing are skipped, the rest are
    written to the database with insert_sql.

    page_index is used only in the progress messages.
    """
    cell_lists = (
        tr.find_elements(By.TAG_NAME, "td")
        for tr in driver.find_elements(By.CSS_SELECTOR, "table tr")
    )
    # Only rows with exactly 7 cells are real torrent rows.
    torrent_rows = [tds for tds in cell_lists if len(tds) == 7]

    print(f"📄 Page {page_index}: {len(torrent_rows)} torrent rows")

    for tds in torrent_rows:
        record = parse_row(tds)
        if record:
            print(f"  💾 [{page_index}] Saving:", record["title_visible"])
            cursor.execute(insert_sql, record)


# ============================================================
# 8) Main pagination loop
# ============================================================

current_url = START_URL
page_index = 0

# The browser is shut down in `finally`, so a failure on any page (network,
# parsing, database) no longer leaves a Chrome process running.
try:
    while True:
        print(f"\n🌐 Loading page {page_index}: {current_url}")
        driver.get(current_url)
        time.sleep(2)

        # Dismiss the interstitial ad, if there is one.
        close_popup_if_any()

        # Parse and store every torrent on the current page.
        process_current_page(page_index)

        # Look for the "Dalsi >>" (next) link. find_elements returns an empty
        # list when it is absent, so only a genuinely missing link ends the
        # crawl; other WebDriver errors propagate instead of being reported
        # as "last page".
        next_links = driver.find_elements(
            By.XPATH,
            "//a[b[contains(text(),'Dalsi')]]"
        )
        if not next_links:
            print("✅ No 'Dalsi >>' link found, reached last page. Done.")
            break

        next_href = next_links[0].get_attribute("href")
        if not next_href:
            print("⛔ Next link has no href, stopping.")
            break

        # Resolve relative links against the current page; absolute URLs pass
        # through unchanged.
        next_url = urlparse.urljoin(current_url, next_href)

        # Guard against an endless loop if the link points back at this page.
        if next_url == current_url:
            print("⛔ Next URL equals current URL, stopping.")
            break

        print("➡️ Next page:", next_url)
        current_url = next_url
        page_index += 1

        # Short pause between pages.
        time.sleep(1)

    print("\n🎉 DONE — All pages processed, torrents saved & torrent files downloaded.")
finally:
    driver.quit()