torrents/30 OpenTextListing v3.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
import re
import urllib.parse as urlparse
from pathlib import Path
import json
import requests
# ============================================================
# 1) MySQL CONNECTION
# ============================================================
db = pymysql.connect(
    host="192.168.1.76",
    port=3307,
    user="root",
    password="Vlado9674+",
    database="torrents",
    charset="utf8mb4",
    autocommit=True
)
cursor = db.cursor()
# ============================================================
# 2) Selenium setup
# ============================================================
COOKIE_FILE = Path("sktorrent_cookies.json")
URL = "https://sktorrent.eu/torrent/torrents.php?active=0"
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--disable-extensions")
driver = webdriver.Chrome(options=chrome_options)
driver.get("https://sktorrent.eu")
# Load cookies
session_cookies = []
if COOKIE_FILE.exists():
    with open(COOKIE_FILE, "r") as f:
        cookies = json.load(f)
    for c in cookies:
        driver.add_cookie(c)
        session_cookies.append({c['name']: c['value']})
    print("🍪 Cookies loaded.")
driver.get(URL)
time.sleep(2)
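# ------------------------------------------------------------
# NOTE: sktorrent_cookies.json is assumed to already exist. It is read as a list of
# Selenium cookie dicts (each with at least "name" and "value"), so one way it could
# have been produced is by logging in manually once and dumping the live cookies:
#
#   input("Log in in the browser window, then press Enter...")
#   with open("sktorrent_cookies.json", "w") as f:
#       json.dump(driver.get_cookies(), f, indent=2)
#
# (a sketch only; the actual file may have been created differently)
# ------------------------------------------------------------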
# ============================================================
# 3) Close interstitial popup robustly
# ============================================================
try:
    driver.execute_script("try { interstitialBox.closeit(); } catch(e) {}")
    print("🧹 Popup closed via JS fallback.")
    time.sleep(1)
except Exception:
    print("No popup found.")
# ============================================================
# Convert Selenium cookies → Python requests cookies
# ============================================================
requests_session = requests.Session()
for ck in driver.get_cookies():
    requests_session.cookies.set(ck["name"], ck["value"])
# ============================================================
# 4) Extract table rows
# ============================================================
rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
print("Total rows found:", len(rows))
real_rows = []
for row in rows:
    cells = row.find_elements(By.TAG_NAME, "td")
    # Real torrent rows always have exactly 7 <td> cells
    if len(cells) == 7:
        real_rows.append(cells)
print("Real torrent rows:", len(real_rows))
print("")
# ============================================================
# 5) Function to extract fields from one row
# ============================================================
def parse_row(cells):
    # --------------------------
    # 1) CATEGORY
    # --------------------------
    category = cells[0].text.strip()

    # --------------------------
    # 2) DOWNLOAD LINK FOR TORRENT FILE
    # --------------------------
    try:
        download_a = cells[1].find_element(By.TAG_NAME, "a")
        download_link = download_a.get_attribute("href")
    except Exception:
        print("⚠️ No download link in row, skipping.")
        return None
    parsed_dl = urlparse.urlparse(download_link)
    dl_query = urlparse.parse_qs(parsed_dl.query)
    torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]

    # --------------------------
    # 3) Title + details link (in cells[2])
    # --------------------------
    title_links = cells[2].find_elements(By.TAG_NAME, "a")
    if not title_links:
        print("⚠️ No title link — skipping row")
        return None
    a_tag = title_links[0]
    visible_name = a_tag.text.strip()
    full_title = a_tag.get_attribute("title")
    details_link = a_tag.get_attribute("href")
    if not details_link:
        print("⚠️ Row has no details link — skipping")
        return None

    # --------------------------
    # Extract torrent hash from ?id=
    # --------------------------
    parsed = urlparse.urlparse(details_link)
    query = urlparse.parse_qs(parsed.query)
    if "id" not in query:
        print("⚠️ Skipping row with no torrent ID →", details_link)
        return None
    torrent_hash = query["id"][0]

    # --------------------------
    # 4) Size + date parsing
    # --------------------------
    text_block = cells[2].get_attribute("innerText")
    text_block_clean = " ".join(text_block.split())
    size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE)
    added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE)
    size_pretty = size_match.group(1) if size_match else None
    added_pretty = added_match.group(1) if added_match else None

    # Robust time normalization: "DD/MM/YYYY o HH:MM" -> MySQL DATETIME string
    added_mysql = None
    if added_pretty:
        clean = added_pretty.replace(" o ", " ").strip()
        parts = clean.split(" ")
        date_part = parts[0]
        time_part = parts[1] if len(parts) > 1 else "00:00:00"
        # Add seconds if missing
        if len(time_part.split(":")) == 2:
            time_part += ":00"
        day, month, year = date_part.split("/")
        added_mysql = f"{year}-{month}-{day} {time_part}"
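    # A sketch of an equivalent normalization with the standard library (assumes the
    # listing never shows seconds, and would need "from datetime import datetime" at
    # the top of the file):
    #   added_mysql = datetime.strptime(clean, "%d/%m/%Y %H:%M").strftime("%Y-%m-%d %H:%M:%S")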
    # --------------------------
    # 5) Image preview
    # --------------------------
    img_link = None
    try:
        image_a = cells[2].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
        mouseover = image_a.get_attribute("onmouseover")
        img_match = re.search(r"src=([^ ]+)", mouseover)
        if img_match:
            img_link = img_match.group(1).replace("'", "").strip()
            if img_link.startswith("//"):
                img_link = "https:" + img_link
    except Exception:
        pass

    # --------------------------
    # 6) SEEDERS / LEECHERS
    # --------------------------
    seeders_a = cells[4].find_element(By.TAG_NAME, "a")
    seeders_number = int(seeders_a.text.strip())
    seeders_link = seeders_a.get_attribute("href")
    leechers_a = cells[5].find_element(By.TAG_NAME, "a")
    leechers_number = int(leechers_a.text.strip())
    leechers_link = leechers_a.get_attribute("href")

    # --------------------------
    # 7) DOWNLOAD TORRENT CONTENT (.torrent)
    # --------------------------
    try:
        torrent_content = requests_session.get(download_link).content
    except Exception as e:
        print(f"⚠️ Could not download torrent file for {torrent_hash}: {e}")
        torrent_content = None

    # --------------------------
    # FINAL DICTIONARY
    # --------------------------
    return {
        "torrent_hash": torrent_hash,
        "details_link": details_link,
        "category": category,
        "title_visible": visible_name,
        "title_full": full_title,
        "size_pretty": size_pretty,
        "added_datetime": added_mysql,
        "preview_image": img_link,
        "seeders": seeders_number,
        "seeders_link": seeders_link,
        "leechers": leechers_number,
        "leechers_link": leechers_link,
        "torrent_filename": torrent_filename,
        "torrent_content": torrent_content,
    }
# ============================================================
# 6) MySQL INSERT
# ============================================================
insert_sql = """
INSERT INTO torrents (
torrent_hash, details_link, category, title_visible, title_full,
size_pretty, added_datetime, preview_image,
seeders, seeders_link, leechers, leechers_link,
torrent_filename, torrent_content
) VALUES (
%(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
%(size_pretty)s, %(added_datetime)s, %(preview_image)s,
%(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
%(torrent_filename)s, %(torrent_content)s
)
ON DUPLICATE KEY UPDATE
details_link = VALUES(details_link),
category = VALUES(category),
title_visible = VALUES(title_visible),
title_full = VALUES(title_full),
size_pretty = VALUES(size_pretty),
added_datetime = VALUES(added_datetime),
preview_image = VALUES(preview_image),
seeders = VALUES(seeders),
seeders_link = VALUES(seeders_link),
leechers = VALUES(leechers),
leechers_link = VALUES(leechers_link),
torrent_filename = VALUES(torrent_filename),
torrent_content = VALUES(torrent_content);
"""
# ============================================================
# 7) PROCESS ALL ROWS
# ============================================================
for cells in real_rows:
    data = parse_row(cells)
    if not data:
        continue
    print("💾 Saving:", data["title_visible"])
    cursor.execute(insert_sql, data)
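# (Rows are written one by one so each title can be printed as it is saved; if the
# listing grows large, the same dicts could instead be collected into a list and
# written in one batch with cursor.executemany(insert_sql, rows_to_save), where
# rows_to_save is a hypothetical list of the parsed dicts. An untested sketch.)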
print("\n✅ DONE — All torrents saved to MySQL & torrent files downloaded.")
driver.quit()