#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import re
from datetime import datetime

import pymysql
import requests
from bs4 import BeautifulSoup


# ==============================
# CONFIG
# ==============================

BASE_URL = "https://sktorrent.eu/torrent/torrents_v2.php?active=0"

# Exported cookies in Netscape cookies.txt format (tab-separated);
# load_cookies() below parses that format regardless of the file extension.
COOKIES_FILE = "sktorrent_cookies.json"

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

HEADERS = {"User-Agent": USER_AGENT}

DB_CFG = {
    "host": "192.168.1.76",
    "port": 3307,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "cursorclass": pymysql.cursors.DictCursor,
}


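# ------------------------------
# The `torrents` table is assumed to already exist; this script never creates
# it. The CREATE TABLE below is only a sketch inferred from the INSERT in
# insert_torrent() further down -- column types and sizes are guesses:
#
#   CREATE TABLE IF NOT EXISTS torrents (
#       id             INT AUTO_INCREMENT PRIMARY KEY,
#       category       VARCHAR(64),
#       title_visible  VARCHAR(255),
#       title_full     VARCHAR(512),
#       size_pretty    VARCHAR(32),
#       added_datetime DATETIME,
#       seeders        INT,
#       seeders_link   VARCHAR(512),
#       leechers       INT,
#       leechers_link  VARCHAR(512),
#       preview_image  VARCHAR(512),
#       details_link   VARCHAR(512),
#       torrent_hash   CHAR(40),
#       UNIQUE KEY uq_torrent_hash (torrent_hash)
#   ) DEFAULT CHARSET = utf8mb4;
#
# A unique key on torrent_hash (or similar) is what lets INSERT IGNORE skip
# duplicates; without one, every run would re-insert every row.
# ------------------------------

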
# ==============================
# COOKIE LOADER
# ==============================

def load_cookies(path):
    """Load cookies from a Netscape-format cookies.txt export."""
    cookies = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Skip comment lines and anything that is not tab-separated.
            if line.startswith("#") or "\t" not in line:
                continue
            parts = line.strip().split("\t")
            if len(parts) >= 7:
                # Field 6 is the cookie name, field 7 its value.
                cookies[parts[5]] = parts[6]
    print(f"🍪 Loaded {len(cookies)} cookies.")
    return cookies


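# ------------------------------
# For reference, a Netscape cookies.txt line has seven tab-separated fields
# (the sample values below are made up):
#
#   .sktorrent.eu   TRUE    /   FALSE   1767225600  uid 123456
#
# Fields: domain, include-subdomains flag, path, secure flag, expiry epoch,
# cookie name (parts[5]) and cookie value (parts[6]) -- the last two are all
# that load_cookies() keeps.
# ------------------------------

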
# ==============================
# MYSQL INSERT
# ==============================

def insert_torrent(db, t):
    """Insert one parsed torrent row, skipping duplicates via INSERT IGNORE."""
    sql = """
        INSERT IGNORE INTO torrents (
            category,
            title_visible,
            title_full,
            size_pretty,
            added_datetime,
            seeders,
            seeders_link,
            leechers,
            leechers_link,
            preview_image,
            details_link,
            torrent_hash
        ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    with db.cursor() as cur:
        cur.execute(sql, (
            t["category"],
            t["title_visible"],
            t["title_full"],
            t["size_pretty"],
            t["added_datetime"],
            t["seeders"],
            t["seeders_link"],
            t["leechers"],
            t["leechers_link"],
            t["preview_image"],
            t["details_link"],
            t["torrent_hash"],
        ))
    db.commit()


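# ------------------------------
# Because insert_torrent() uses INSERT IGNORE, re-running the script skips
# rows whose torrent_hash already exists and never refreshes their counters.
# If up-to-date seeder/leecher numbers matter more, one option (a sketch, not
# what this script does) is a MySQL upsert:
#
#   INSERT INTO torrents (...) VALUES (...)
#   ON DUPLICATE KEY UPDATE
#       seeders  = VALUES(seeders),
#       leechers = VALUES(leechers);
# ------------------------------

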
# ==============================
# PARSER
# ==============================

def parse_torrent_row(cols):
    """Parse the <td> cells of a single torrent <tr>; return a dict or None."""

    # --- category ---
    category = cols[0].get_text(strip=True)

    # --- download link (ignored) ---
    # The second <td> only holds the download.gif icon.

    # --- main column ---
    main_td = cols[2]

    a_title = main_td.find("a", href=re.compile(r"details\.php"))
    if not a_title:
        return None

    title_visible = a_title.get_text(strip=True)
    title_full = a_title.get("title", "").strip()
    details_link = "https://sktorrent.eu/torrent/" + a_title.get("href")

    # Extract the 40-hex-digit torrent hash from ?id=...
    m = re.search(r"id=([A-Fa-f0-9]{40})", a_title.get("href"))
    if not m:
        return None
    torrent_hash = m.group(1)

    # Extract size + added date from the text below <br>.
    # Example: "GR ... Velkost 1.7 GB | Pridany 18/11/2025 o 07:00"
    # (Slovak: "Size 1.7 GB | Added 18/11/2025 at 07:00")
    text = main_td.get_text(" ", strip=True)
    size_match = re.search(r"Velkost ([\d\.]+ ?[GMK]B)", text)
    date_match = re.search(r"Pridany (\d{2}/\d{2}/\d{4}) o (\d{2}:\d{2})", text)

    size_pretty = size_match.group(1) if size_match else None

    added_datetime = None
    if date_match:
        d, t = date_match.groups()
        added_datetime = datetime.strptime(d + " " + t, "%d/%m/%Y %H:%M")

    # Extract the preview image URL from the onmouseover tooltip markup.
    img = None
    img_a = main_td.find("a", onmouseover=True)
    if img_a:
        html = img_a.get("onmouseover", "")
        m2 = re.search(r"img src=//([^ ]+)", html)
        if m2:
            img = "https://" + m2.group(1)

    # --- seeders ---
    seed_a = cols[4].find("a")
    seeders = int(seed_a.get_text(strip=True)) if seed_a else 0
    seeders_link = ("https://sktorrent.eu/torrent/" + seed_a.get("href")) if seed_a else None

    # --- leechers ---
    leech_a = cols[5].find("a")
    leechers = int(leech_a.get_text(strip=True)) if leech_a else 0
    leechers_link = ("https://sktorrent.eu/torrent/" + leech_a.get("href")) if leech_a else None

    return {
        "category": category,
        "title_visible": title_visible,
        "title_full": title_full,
        "size_pretty": size_pretty,
        "added_datetime": added_datetime,
        "seeders": seeders,
        "seeders_link": seeders_link,
        "leechers": leechers,
        "leechers_link": leechers_link,
        "preview_image": img,
        "details_link": details_link,
        "torrent_hash": torrent_hash,
    }


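# ------------------------------
# A reconstructed sketch of the row markup parse_torrent_row() expects. It is
# inferred from the selectors above, not copied from the live site, so the
# attributes and sample values are guesses:
#
#   <tr>
#     <td>Filmy CZ/SK</td>                              <!-- cols[0] category -->
#     <td><img src="download.gif"></td>                 <!-- cols[1] ignored -->
#     <td>                                              <!-- cols[2] main -->
#       <a href="details.php?id=<40 hex chars>&..." title="Full.Release.Name"
#          onmouseover="...img src=//host/preview.jpg...">Visible title</a>
#       <br>Velkost 1.7 GB | Pridany 18/11/2025 o 07:00
#     </td>
#     <td>...</td>
#     <td><a href="...">12</a></td>                     <!-- cols[4] seeders -->
#     <td><a href="...">3</a></td>                      <!-- cols[5] leechers -->
#     <td>...</td>
#   </tr>
# ------------------------------

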
# ==============================
# MAIN
# ==============================

def main():
    cookies = load_cookies(COOKIES_FILE)

    session = requests.Session()
    session.headers.update(HEADERS)
    session.cookies.update(cookies)

    print("🌍 Downloading HTML...")
    r = session.get(BASE_URL, timeout=30)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "html.parser")
    tbody = soup.find("tbody")
    if not tbody:
        print("❌ Could not find <tbody>")
        return

    rows = tbody.find_all("tr")
    print(f"Found {len(rows)} <tr> rows.")

    db = pymysql.connect(**DB_CFG)

    inserted = 0
    skipped = 0

    try:
        for tr in rows:
            cols = tr.find_all("td")
            if len(cols) != 7:
                continue  # ignore header & separator rows

            data = parse_torrent_row(cols)
            if not data:
                skipped += 1
                continue

            insert_torrent(db, data)
            inserted += 1
            print(f"✔ Inserted {data['torrent_hash']}")
    finally:
        db.close()

    print("\n===== DONE =====")
    print(f"Inserted: {inserted}")
    print(f"Skipped: {skipped}")


if __name__ == "__main__":
    main()
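
# ------------------------------
# Usage sketch (the script name is whatever you saved this file as):
#
#   pip install requests beautifulsoup4 pymysql
#   python3 sktorrent_scraper.py
#
# The script expects COOKIES_FILE in the working directory and a MySQL server
# reachable with the credentials in DB_CFG.
# ------------------------------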