90 Import all torrents from all pages.py (new file, 308 lines added)
@@ -0,0 +1,308 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import datetime
import json
import re
import sys
import time
import urllib.parse as urlparse
from pathlib import Path

import pymysql
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Local helper module – must live next to this script.
from EmailMessagingGraph import send_mail
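
# Third-party dependencies (assumption, not stated in the commit): pymysql,
# selenium and requests installed via pip, plus a Chrome/chromedriver pair
# that Selenium can launch.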

# ============================================================
# RUNTIME INFO
# ============================================================
RUN_START = datetime.datetime.now()
processed_count = 0
new_torrent_count = 0
existing_torrent_count = 0
new_titles = []

print(f"🕒 Run started at {RUN_START:%Y-%m-%d %H:%M:%S}")
sys.stdout.flush()

# ============================================================
# 1) MySQL CONNECTION
# ============================================================
# NOTE: credentials are hardcoded; moving them to environment variables
# keeps them out of version control (sketch below).
db = pymysql.connect(
    host="192.168.1.50",
    port=3306,
    user="root",
    password="Vlado9674+",
    database="torrents",
    charset="utf8mb4",
    autocommit=True,
)
cursor = db.cursor()
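
# Hedged sketch – loading the credentials from the environment instead
# (the TORRENTS_DB_* names are illustrative, not part of this repo):
#
#   import os
#   db = pymysql.connect(
#       host=os.environ.get("TORRENTS_DB_HOST", "192.168.1.50"),
#       user=os.environ.get("TORRENTS_DB_USER", "root"),
#       password=os.environ["TORRENTS_DB_PASSWORD"],
#       database="torrents",
#       charset="utf8mb4",
#       autocommit=True,
#   )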

# ============================================================
# 2) Selenium setup
# ============================================================
COOKIE_FILE = Path("sktorrent_cookies.json")
# Category listing on torrents.php, newest first (order=data&by=DESC).
BASE_URL = (
    "https://sktorrent.eu/torrent/torrents.php"
    "?active=0&category=24&order=data&by=DESC&zaner=&jazyk="
)

chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--disable-extensions")

driver = webdriver.Chrome(options=chrome_options)
driver.set_window_position(380, 50)
driver.set_window_size(1350, 1000)

# Cookies can only be attached for the currently loaded domain, so open
# the site first.
driver.get("https://sktorrent.eu")

if COOKIE_FILE.exists():
    with open(COOKIE_FILE, "r", encoding="utf-8") as f:
        cookies = json.load(f)
    for c in cookies:
        driver.add_cookie(c)
    print("🍪 Cookies loaded.")
else:
    print("⚠️ Cookie file not found – login may be required.")

# ============================================================
# 3) requests.Session from Selenium cookies
# ============================================================
# Re-use the browser's authenticated cookies for direct .torrent downloads.
requests_session = requests.Session()
for ck in driver.get_cookies():
    requests_session.cookies.set(ck["name"], ck["value"])

print("🔗 Requests session initialized.")


# ============================================================
# 4) Popup handler
# ============================================================
def close_popup_if_any():
    """Dismiss the site's interstitial overlay via its own JS, if present."""
    try:
        driver.execute_script("try { interstitialBox.closeit(); } catch(e) {}")
        time.sleep(0.5)
    except Exception:
        pass


# ============================================================
# 5) Parse one torrent row
# ============================================================
def parse_row(cells):
    # Column 0: Category icon/text
    category = cells[0].text.strip()

    try:
        # Column 1: Download icon link
        download_a = cells[1].find_element(By.TAG_NAME, "a")
        download_link = download_a.get_attribute("href")
    except Exception:
        return None

    # The download URL carries the filename in its "f" query parameter.
    parsed_dl = urlparse.urlparse(download_link)
    dl_query = urlparse.parse_qs(parsed_dl.query)
    torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]

    # Column 2: Name and info
    title_links = cells[2].find_elements(By.TAG_NAME, "a")
    if not title_links:
        return None

    a_tag = title_links[0]
    visible_name = a_tag.text.strip()
    full_title = a_tag.get_attribute("title")
    details_link = a_tag.get_attribute("href")

    # The details URL carries the torrent hash in its "id" query parameter.
    parsed = urlparse.urlparse(details_link)
    query = urlparse.parse_qs(parsed.query)
    if "id" not in query:
        return None

    torrent_hash = query["id"][0]

    # Use innerText for robust text extraction
    text_block = cells[2].get_attribute("innerText")
    text_block_clean = " ".join(text_block.split())

    # Regex for Size and Date ("Velkost" = size, "Pridany" = added)
    size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE)
    added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE)

    size_pretty = size_match.group(1) if size_match else None
    added_pretty = added_match.group(1) if added_match else None

    # Date conversion: "29/11/2025 o 02:29" -> MySQL DATETIME format
    added_mysql = None
    if added_pretty:
        clean = added_pretty.replace(" o ", " ").strip()
        parts = clean.split(" ")
        if len(parts) >= 2:
            date_part = parts[0]
            time_part = parts[1]
            if len(time_part.split(":")) == 2:
                time_part += ":00"  # pad missing seconds
            try:
                day, month, year = date_part.split("/")
                added_mysql = f"{year}-{month}-{day} {time_part}"
            except ValueError:
                added_mysql = None

    # Column 2: Image preview, if present – the URL hides in the link's
    # onmouseover attribute ("Obrázok" = image).
    img_link = None
    try:
        image_a = cells[2].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
        mouseover = image_a.get_attribute("onmouseover")
        img_match = re.search(r"src=([^ ]+)", mouseover)
        if img_match:
            img_link = img_match.group(1).replace("'", "").strip()
            if img_link.startswith("//"):
                img_link = "https:" + img_link
    except Exception:
        pass

    # Column 4: Seeders
    seeders_a = cells[4].find_element(By.TAG_NAME, "a")
    seeders_number = int(seeders_a.text.strip())
    seeders_link = seeders_a.get_attribute("href")

    # Column 5: Leechers
    leechers_a = cells[5].find_element(By.TAG_NAME, "a")
    leechers_number = int(leechers_a.text.strip())
    leechers_link = leechers_a.get_attribute("href")

    # Check database for existing binary content
    cursor.execute("SELECT torrent_content FROM torrents WHERE torrent_hash=%s", (torrent_hash,))
    row = cursor.fetchone()
    already_have_torrent = row is not None and row[0] is not None

    # Download the .torrent payload only if we don't already have it.
    torrent_content = None
    if not already_have_torrent:
        time.sleep(3)  # Politeness delay
        try:
            resp = requests_session.get(download_link, timeout=30)
            resp.raise_for_status()
            torrent_content = resp.content
        except requests.RequestException:
            torrent_content = None

    return {
        "torrent_hash": torrent_hash,
        "details_link": details_link,
        "category": category,
        "title_visible": visible_name,
        "title_full": full_title,
        "size_pretty": size_pretty,
        "added_datetime": added_mysql,
        "preview_image": img_link,
        "seeders": seeders_number,
        "seeders_link": seeders_link,
        "leechers": leechers_number,
        "leechers_link": leechers_link,
        "torrent_filename": torrent_filename,
        # None when the payload already exists in the database.
        "torrent_content": torrent_content,
        "is_new_torrent": not already_have_torrent,
    }
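

# Hedged sketch – not used above: the manual date split in parse_row is
# equivalent to a strptime round-trip. `parse_added_datetime` is a
# hypothetical helper shown for clarity.
def parse_added_datetime(added_pretty):
    """Convert '29/11/2025 o 02:29' to 'YYYY-MM-DD HH:MM:SS', or None."""
    clean = added_pretty.replace(" o ", " ").strip()
    for fmt in ("%d/%m/%Y %H:%M:%S", "%d/%m/%Y %H:%M"):
        try:
            return datetime.datetime.strptime(clean, fmt).strftime("%Y-%m-%d %H:%M:%S")
        except ValueError:
            continue
    return None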


# ============================================================
# 6) INSERT SQL
# ============================================================
# Upsert keyed on torrent_hash: refresh seed/leech counts, and only
# overwrite torrent_content when a new payload was actually downloaded.
insert_sql = """
INSERT INTO torrents (
    torrent_hash, details_link, category, title_visible, title_full,
    size_pretty, added_datetime, preview_image,
    seeders, seeders_link, leechers, leechers_link,
    torrent_filename, torrent_content
) VALUES (
    %(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
    %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
    %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
    %(torrent_filename)s, %(torrent_content)s
)
ON DUPLICATE KEY UPDATE
    seeders = VALUES(seeders),
    leechers = VALUES(leechers),
    torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
"""

# ============================================================
# 7) PROCESS ALL PAGES
# ============================================================
TOTAL_PAGES = 226

for page_num in range(TOTAL_PAGES):
    current_url = f"{BASE_URL}&page={page_num}"
    print(f"\n🌐 Loading Page Index {page_num} (Page {page_num + 1}/{TOTAL_PAGES})")

    driver.get(current_url)
    time.sleep(2)
    close_popup_if_any()

    # Find table rows; a data row in this table layout has exactly 7 cells.
    real_rows = []
    for r in driver.find_elements(By.CSS_SELECTOR, "table tr"):
        cells = r.find_elements(By.TAG_NAME, "td")
        if len(cells) == 7:
            real_rows.append(cells)

    if not real_rows:
        print("⚠️ No data rows found on this page. Ending loop.")
        break

    page_new_items = 0
    for cells in real_rows:
        try:
            data = parse_row(cells)
        except Exception as e:
            print(f"⚠️ parse_row failed: {e}")
            continue

        if not data:
            continue
        processed_count += 1

        if data["is_new_torrent"]:
            new_torrent_count += 1
            page_new_items += 1
            new_titles.append(data["title_visible"])
            print(f"💾 NEW: {data['title_visible']}")
        else:
            existing_torrent_count += 1
            print(f"♻️ UPDATING: {data['title_visible']}")

        cursor.execute(insert_sql, data)

    # If an entire page contained only already-known torrents, the sync has
    # caught up and the deep crawl can stop.
    if page_new_items == 0 and page_num > 0:
        print("🛑 Page contained only known items. Sync complete.")
        break

    time.sleep(1)

# ============================================================
# 8) SEND EMAIL REPORT
# ============================================================
RUN_END = datetime.datetime.now()
subject = f"SKTorrent run – {RUN_START:%Y-%m-%d %H:%M}"
body = (
    f"Run started: {RUN_START:%Y-%m-%d %H:%M:%S}\n"
    f"Run finished: {RUN_END:%Y-%m-%d %H:%M:%S}\n\n"
    f"Processed torrents: {processed_count}\n"
    f"New torrents saved: {new_torrent_count}\n"
    f"Existing torrents updated: {existing_torrent_count}\n"
)
if new_titles:
    body += "\nNew torrents list:\n- " + "\n- ".join(new_titles)

send_mail(to="vladimir.buzalka@buzalka.cz", subject=subject, body=body, html=False)
print("📧 Email report sent.")

driver.quit()
print("🎉 DONE")