vbnotebook

2025-11-18 07:22:17 +01:00
parent a764c9723e
commit 7bc330beba
2 changed files with 412 additions and 134 deletions


@@ -1,171 +1,230 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import re
import time
import urllib.parse as urlparse
from pathlib import Path

import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# ============================================================
# 1) MySQL CONNECTION
# ============================================================
db = pymysql.connect(
    host="192.168.1.76",
    port=3307,
    user="root",
    password="Vlado9674+",
    database="torrents",
    charset="utf8mb4",
    autocommit=True
)
cursor = db.cursor()

# ============================================================
# 2) Selenium setup
# ============================================================
COOKIE_FILE = Path("sktorrent_cookies.json")
URL = "https://sktorrent.eu/torrent/torrents.php?active=0"

chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--disable-extensions")

driver = webdriver.Chrome(options=chrome_options)

# Cookies can only be added for the domain that is currently open, so load the site first
driver.get("https://sktorrent.eu")

# Load saved login cookies
if COOKIE_FILE.exists():
    with open(COOKIE_FILE, "r") as f:
        cookies = json.load(f)
    for c in cookies:
        driver.add_cookie(c)
    print("🍪 Cookies loaded.")

driver.get(URL)
time.sleep(2)

# Try to close the inline popup if present
try:
    close_btn = driver.find_element(By.XPATH, "//a[text()='CLOSE X']")
    close_btn.click()
    print("🧹 Popup closed.")
except Exception:
    pass

# ============================================================
# 3) Extract table rows
# ============================================================
rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
print("Total rows found:", len(rows))

real_rows = []
for row in rows:
    cells = row.find_elements(By.TAG_NAME, "td")
    if len(cells) >= 5:  # real torrent rows have at least 5 cells
        real_rows.append(cells)

print("Real data rows:", len(real_rows))
print("")
# ============================================================
# 4) Function to extract all fields from one row
# ============================================================
def parse_row(cells):
    # --------------------------
    # 1⃣ CATEGORY
    # --------------------------
    category = cells[0].text.strip()

    # --------------------------
    # 2⃣ TITLES + DETAILS LINK
    # --------------------------
    a_tag = cells[1].find_element(By.TAG_NAME, "a")
    visible_name = a_tag.text.strip()
    full_title = a_tag.get_attribute("title")
    details_link = a_tag.get_attribute("href")

    # --------------------------
    # 3⃣ TORRENT HASH
    # --------------------------
    parsed = urlparse.urlparse(details_link)
    query = urlparse.parse_qs(parsed.query)

    # skip rows without ?id=
    if "id" not in query:
        print("⚠️ Skipping row with no torrent ID →", details_link)
        return None
    torrent_hash = query["id"][0]

    # --------------------------
    # 4⃣ TEXT BLOCK (size + date)
    # --------------------------
    text_block = cells[1].get_attribute("innerText")
    text_block_clean = " ".join(text_block.split())
    size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE)
    added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE)
    size_pretty = size_match.group(1) if size_match else None
    added_pretty = added_match.group(1) if added_match else None

    # Convert “18/11/2025 o 07:00” → “2025-11-18 07:00:00”
    added_mysql = None
    if added_pretty:
        added_mysql = re.sub(r" o ", " ", added_pretty)
        day, month, year_time = added_mysql.split("/")
        year, time_part = year_time.split(" ")
        added_mysql = f"{year}-{month}-{day} {time_part}:00"

    # --------------------------
    # 5⃣ IMAGE PREVIEW
    # --------------------------
    img_link = None
    try:
        image_a = cells[1].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
        mouseover = image_a.get_attribute("onmouseover")
        img_match = re.search(r"src=([^ ]+)", mouseover)
        if img_match:
            img_link = img_match.group(1).replace("'", "").strip()
            if img_link.startswith("//"):
                img_link = "https:" + img_link
    except Exception:
        pass

    # --------------------------
    # 6⃣ SEEDERS
    # --------------------------
    seeders_a = cells[3].find_element(By.TAG_NAME, "a")
    seeders_number = int(seeders_a.text.strip())
    seeders_link = seeders_a.get_attribute("href")

    # --------------------------
    # 7⃣ LEECHERS
    # --------------------------
    leechers_a = cells[4].find_element(By.TAG_NAME, "a")
    leechers_number = int(leechers_a.text.strip())
    leechers_link = leechers_a.get_attribute("href")

    # --------------------------
    # Return dictionary for MySQL
    # --------------------------
    return {
        "torrent_hash": torrent_hash,
        "details_link": details_link,
        "category": category,
        "title_visible": visible_name,
        "title_full": full_title,
        "size_pretty": size_pretty,
        "added_datetime": added_mysql,
        "preview_image": img_link,
        "seeders": seeders_number,
        "seeders_link": seeders_link,
        "leechers": leechers_number,
        "leechers_link": leechers_link,
    }

# ============================================================
# 5) MySQL INSERT
# ============================================================
insert_sql = """
INSERT INTO torrents (
    torrent_hash, details_link, category, title_visible, title_full,
    size_pretty, added_datetime, preview_image,
    seeders, seeders_link, leechers, leechers_link
) VALUES (
    %(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
    %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
    %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s
)
ON DUPLICATE KEY UPDATE
    details_link = VALUES(details_link),
    category = VALUES(category),
    title_visible = VALUES(title_visible),
    title_full = VALUES(title_full),
    size_pretty = VALUES(size_pretty),
    added_datetime = VALUES(added_datetime),
    preview_image = VALUES(preview_image),
    seeders = VALUES(seeders),
    seeders_link = VALUES(seeders_link),
    leechers = VALUES(leechers),
    leechers_link = VALUES(leechers_link);
"""

# ============================================================
# 6) PROCESS ALL REAL ROWS
# ============================================================
for cells in real_rows:
    data = parse_row(cells)
    if not data:
        continue
    print("💾 Saving:", data["title_visible"])
    cursor.execute(insert_sql, data)

print("\n✅ DONE — All torrents saved to MySQL.")
driver.quit()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
from pathlib import Path
from playwright.sync_api import sync_playwright

# =============================================================
# CONFIGURATION
# =============================================================
COOKIE_FILE = Path("sktorrent_cookies.json")
URL = "https://sktorrent.eu/torrent/torrents.php?active=0"

def load_cookies(context):
    """Load saved cookies if available."""
    if COOKIE_FILE.exists():
        with open(COOKIE_FILE, "r") as f:
            cookies = json.load(f)
        context.add_cookies(cookies)
        print("🔄 Loaded login cookies.")
        return True
    print("❌ Cookie file not found. Run manual login first.")
    return False

# =============================================================
# MAIN CODE
# =============================================================
with sync_playwright() as p:
    # 1⃣ Launch browser
    browser = p.chromium.launch(
        headless=False,
        args=[
            "--disable-popup-blocking",
            "--disable-background-networking",
            "--disable-notifications",
            "--no-default-browser-check",
            "--no-first-run",
            "--noerrdialogs",
            "--disable-dev-shm-usage",
            "--disable-features=IsolateOrigins,site-per-process",
            "--no-sandbox",
        ]
    )

    # 2⃣ Create context before any pages exist
    context = browser.new_context()

    # 3⃣ Block ALL third-party requests (ads, JS, popups, tracking)
    def block_third_party(route, request):
        url = request.url.lower()
        if "sktorrent.eu" in url:
            route.continue_()
        else:
            print(f"🚫 Blocked third-party request: {url}")
            route.abort()

    context.route("**/*", block_third_party)

    # 4⃣ Block ANY popup windows except the first page
    pages = []

    def on_new_page(new_page):
        pages.append(new_page)
        if len(pages) == 1:
            print("➡️ Main page created.")
        else:
            print("⚠️ Popup blocked (auto-closed).")
            new_page.close()

    context.on("page", on_new_page)

    # 5⃣ Disable all popup JS functions (window.open, window.close, opener.close)
    context.add_init_script("""
        window.open = () => { console.log("Blocked window.open"); return null; };
        window.close = () => { console.log("Blocked window.close"); };
        try {
            if (window.opener) {
                window.opener.close = () => { console.log("Blocked opener.close"); };
            }
        } catch (e) {}

        // Block <a target="_blank">
        document.addEventListener('click', function(e) {
            const el = e.target.closest('a[target="_blank"]');
            if (el) {
                e.preventDefault();
                console.log("Blocked target=_blank");
            }
        }, true);

        // Block middle-click opening a new tab
        document.addEventListener('auxclick', function(e) {
            e.preventDefault();
        }, true);
    """)

    # 6⃣ Create the FIRST page (main page)
    page = context.new_page()
    pages.append(page)

    # 7⃣ Load cookies (login)
    load_cookies(context)

    # 8⃣ Navigate
    print("🌍 Opening page...")
    page.goto(URL)

    # Do NOT use networkidle on ad-heavy sites
    page.wait_for_load_state("domcontentloaded")
    page.wait_for_selector("table tr", timeout=15000)

    # Remove popup/overlay elements created by SKTorrent
    page.evaluate("""
        const selectors = [
            '#lightbox', '.lightbox', '#popup', '.popup',
            '.overlay', '#overlay', '.modal', '#modal',
            'div[style*="fixed"]', 'div[style*="position: fixed"]',
            'table[style*="position: fixed"]',
            'iframe', 'frame'
        ];
        selectors.forEach(sel => {
            document.querySelectorAll(sel).forEach(el => {
                console.log("Removing popup element:", sel);
                el.remove();
            });
        });

        // Remove onclick handlers that trigger popups
        document.querySelectorAll('*').forEach(el => {
            el.onclick = null;
            el.onauxclick = null;
            el.oncontextmenu = null;
        });

        // Remove timers that trigger delayed popups
        window.setTimeout = () => {};
        window.setInterval = () => {};
    """)

    print("✔ Page loaded, extracting table rows...")

    # 9⃣ Extract all rows
    rows = page.locator("table tr").all()
    print(f"📄 Total rows found (including header): {len(rows)}")

    # 🔟 Extract SECOND ROW only (your request)
    if len(rows) > 1:
        row = rows[1]  # 0 = header, 1 = first data row
        tds = row.locator("td")
        name = tds.nth(1).inner_text().strip()
        size = tds.nth(2).inner_text().strip()
        seeders = tds.nth(3).inner_text().strip()
        leechers = tds.nth(4).inner_text().strip()
        print("\n========= SECOND ROW =========")
        print(f"Name: {name}")
        print(f"Size: {size}")
        print(f"Seeders: {seeders}")
        print(f"Leechers: {leechers}")
        print("==============================\n")
    else:
        print("❌ No data rows found!")

    page.wait_for_timeout(5000)
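Both versions assume sktorrent_cookies.json already exists ("Run manual login first"). The Selenium version reads a JSON dump of browser cookies; one way to produce such a file after a one-off manual login is sketched below. This helper is hypothetical and not part of the commit.

# Hypothetical one-off helper to create sktorrent_cookies.json after a manual login.
import json
from pathlib import Path
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://sktorrent.eu")
input("Log in to sktorrent.eu in the browser window, then press Enter...")
Path("sktorrent_cookies.json").write_text(json.dumps(driver.get_cookies()))
driver.quit()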

ParseviaRequests.py (Normal file, 219 additions)

@@ -0,0 +1,219 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import requests
from bs4 import BeautifulSoup
import pymysql
from datetime import datetime

# ==============================
# CONFIG
# ==============================
BASE_URL = "https://sktorrent.eu/torrent/torrents_v2.php?active=0"
COOKIES_FILE = "sktorrent_cookies.json"  # Your exported cookies.txt (Netscape format)
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
HEADERS = {"User-Agent": USER_AGENT}

DB_CFG = {
    "host": "192.168.1.76",
    "port": 3307,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "cursorclass": pymysql.cursors.DictCursor,
}

# ==============================
# COOKIE LOADER
# ==============================
def load_cookies(path):
    """Parse a Netscape-format cookie export into a name/value dict."""
    cookies = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("#") or "\t" not in line:
                continue
            parts = line.strip().split("\t")
            if len(parts) >= 7:
                cookies[parts[5]] = parts[6]
    print(f"🍪 Loaded {len(cookies)} cookies.")
    return cookies
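load_cookies() relies on the tab-separated Netscape cookies.txt layout (domain, include-subdomains flag, path, secure flag, expiry, name, value), which is why it reads fields 5 and 6. A small sketch with a made-up cookie line; the cookie name and value are hypothetical, not real sktorrent.eu cookies.

# Hypothetical Netscape-format line; fields are tab-separated.
sample = "sktorrent.eu\tFALSE\t/\tTRUE\t1767225600\tuid\t12345"
parts = sample.strip().split("\t")
print(parts[5], "=", parts[6])  # -> uid = 12345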
# ==============================
# MYSQL INSERT
# ==============================
def insert_torrent(db, t):
    sql = """
        INSERT IGNORE INTO torrents (
            category,
            title_visible,
            title_full,
            size_pretty,
            added_datetime,
            seeders,
            seeders_link,
            leechers,
            leechers_link,
            preview_image,
            details_link,
            torrent_hash
        ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    with db.cursor() as cur:
        cur.execute(sql, (
            t["category"],
            t["title_visible"],
            t["title_full"],
            t["size_pretty"],
            t["added_datetime"],
            t["seeders"],
            t["seeders_link"],
            t["leechers"],
            t["leechers_link"],
            t["preview_image"],
            t["details_link"],
            t["torrent_hash"],
        ))
    db.commit()
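Because INSERT IGNORE silently skips rows that hit the unique key, main() below counts a row as inserted even when MySQL ignored it. If that distinction matters, pymysql's cursor.execute() returns the affected-row count, which is 0 for a skipped duplicate. A hedged variant, not part of the original script:

def insert_torrent_counted(db, t):
    # Hypothetical variant of insert_torrent(); returns True only when a row was really inserted.
    sql = """
        INSERT IGNORE INTO torrents (
            category, title_visible, title_full, size_pretty, added_datetime,
            seeders, seeders_link, leechers, leechers_link,
            preview_image, details_link, torrent_hash
        ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    with db.cursor() as cur:
        affected = cur.execute(sql, (
            t["category"], t["title_visible"], t["title_full"], t["size_pretty"],
            t["added_datetime"], t["seeders"], t["seeders_link"],
            t["leechers"], t["leechers_link"], t["preview_image"],
            t["details_link"], t["torrent_hash"],
        ))
    db.commit()
    return affected == 1  # False when the torrent_hash already existed

main() could then increment its inserted or skipped counter based on the return value.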
# ==============================
# PARSER
# ==============================
def parse_torrent_row(cols):
    """Parse a <tr> with exactly the structure of a torrent row."""
    # --- category ---
    category = cols[0].get_text(strip=True)

    # --- download link (ignore) ---
    # second <td> is download.gif

    # --- main column ---
    main_td = cols[2]
    a_title = main_td.find("a", href=re.compile("details.php"))
    if not a_title:
        return None

    title_visible = a_title.get_text(strip=True)
    title_full = a_title.get("title", "").strip()
    details_link = "https://sktorrent.eu/torrent/" + a_title.get("href")

    # Extract torrent hash from ?id=.....
    m = re.search(r"id=([A-Fa-f0-9]{40})", a_title.get("href"))
    if not m:
        return None
    torrent_hash = m.group(1)

    # Extract size + added date from the text below <br>
    text = main_td.get_text(" ", strip=True)
    # example: "GR ... Velkost 1.7 GB | Pridany 18/11/2025 o 07:00"
    size_match = re.search(r"Velkost ([\d\.]+ ?[GMK]B)", text)
    date_match = re.search(r"Pridany (\d{2}/\d{2}/\d{4}) o (\d{2}:\d{2})", text)
    size_pretty = size_match.group(1) if size_match else None

    added_datetime = None
    if date_match:
        d, t = date_match.groups()
        added_datetime = datetime.strptime(d + " " + t, "%d/%m/%Y %H:%M")

    # Extract preview img from onmouseover
    img = None
    img_a = main_td.find("a", onmouseover=True)
    if img_a:
        html = img_a.get("onmouseover", "")
        m2 = re.search(r"img src=//([^ ]+)", html)
        if m2:
            img = "https://" + m2.group(1)

    # --- seeders ---
    seed_a = cols[4].find("a")
    seeders = int(seed_a.get_text(strip=True)) if seed_a else 0
    seeders_link = "https://sktorrent.eu/torrent/" + seed_a.get("href") if seed_a else None

    # --- leechers ---
    leech_a = cols[5].find("a")
    leechers = int(leech_a.get_text(strip=True)) if leech_a else 0
    leechers_link = "https://sktorrent.eu/torrent/" + leech_a.get("href") if leech_a else None

    return {
        "category": category,
        "title_visible": title_visible,
        "title_full": title_full,
        "size_pretty": size_pretty,
        "added_datetime": added_datetime,
        "seeders": seeders,
        "seeders_link": seeders_link,
        "leechers": leechers,
        "leechers_link": leechers_link,
        "preview_image": img,
        "details_link": details_link,
        "torrent_hash": torrent_hash,
    }
# ==============================
# MAIN
# ==============================
def main():
    cookies = load_cookies(COOKIES_FILE)

    session = requests.Session()
    session.headers.update(HEADERS)
    session.cookies.update(cookies)

    print("🌍 Downloading HTML...")
    r = session.get(BASE_URL, timeout=30)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "html.parser")
    tbody = soup.find("tbody")
    if not tbody:
        print("❌ Could not find <tbody>")
        return

    rows = tbody.find_all("tr")
    print(f"Found {len(rows)} <tr> rows.")

    db = pymysql.connect(**DB_CFG)
    inserted = 0
    skipped = 0

    for tr in rows:
        cols = tr.find_all("td")
        if len(cols) != 7:
            continue  # ignore header & separator rows

        data = parse_torrent_row(cols)
        if not data:
            skipped += 1
            continue

        insert_torrent(db, data)
        inserted += 1
        print(f"✔ Inserted {data['torrent_hash']}")

    print("\n===== DONE =====")
    print(f"Inserted: {inserted}")
    print(f"Skipped: {skipped}")

if __name__ == "__main__":
    main()