#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Synchronise the sktorrent.eu category-24 listing into the MySQL `torrents` table.

The script walks the listing pages with Selenium (using cookies stored in
``sktorrent_cookies.json`` when present), parses every 7-cell table row,
downloads the .torrent file for rows whose binary content is not stored
yet, upserts each row into MySQL and finally e-mails a run summary.
"""

import datetime
import json
import os
import re
import sys
import time
import urllib.parse as urlparse
from pathlib import Path

import pymysql
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Ensure this file exists in your directory
from EmailMessagingGraph import send_mail

# ============================================================
# RUNTIME INFO
# ============================================================
RUN_START = datetime.datetime.now()
processed_count = 0
new_torrent_count = 0
existing_torrent_count = 0
new_titles = []

print(f"🕒 Run started at {RUN_START:%Y-%m-%d %H:%M:%S}")
sys.stdout.flush()

# ============================================================
# CONSTANTS
# ============================================================
COOKIE_FILE = Path("sktorrent_cookies.json")

# Updated to standard torrents.php as requested
BASE_URL = (
    "https://sktorrent.eu/torrent/torrents.php"
    "?active=0&category=24&order=data&by=DESC&zaner=&jazyk="
)

TOTAL_PAGES = 226

# Seconds to wait for a .torrent download before giving up; without a
# timeout a single stalled connection would block the whole run.
DOWNLOAD_TIMEOUT = 30

# Patterns applied to the whitespace-normalised text of the name cell.
SIZE_RE = re.compile(r"Velkost ([0-9\.]+ ?[KMG]B)", re.IGNORECASE)
ADDED_RE = re.compile(r"Pridany (.+?)(?:\sObrázok|$)", re.IGNORECASE)
IMG_SRC_RE = re.compile(r"src=([^ ]+)")

# Accepted layouts of the "added" stamp once the " o " separator is removed.
ADDED_FORMATS = ("%d/%m/%Y %H:%M:%S", "%d/%m/%Y %H:%M")

# ============================================================
# INSERT SQL
# ============================================================
# Upsert: on a duplicate key only the peer counts are refreshed, and the
# stored torrent_content is kept unless a new non-NULL value is supplied.
insert_sql = """
INSERT INTO torrents (
    torrent_hash, details_link, category, title_visible, title_full,
    size_pretty, added_datetime, preview_image,
    seeders, seeders_link, leechers, leechers_link,
    torrent_filename, torrent_content
) VALUES (
    %(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
    %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
    %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
    %(torrent_filename)s, %(torrent_content)s
)
ON DUPLICATE KEY UPDATE
    seeders = VALUES(seeders),
    leechers = VALUES(leechers),
    torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
"""


# ============================================================
# Popup handler
# ============================================================
def close_popup_if_any():
    """Ask the page to close its interstitial box, ignoring any failure."""
    try:
        driver.execute_script("try { interstitialBox.closeit(); } catch(e) {}")
        time.sleep(0.5)
    except Exception:
        # Deliberate best-effort: a missing popup must not stop the crawl.
        pass


# ============================================================
# Date conversion helper
# ============================================================
def _to_mysql_datetime(added_pretty):
    """Convert a stamp such as "29/11/2025 o 02:29" to "2025-11-29 02:29:00".

    Only the first two whitespace-separated tokens (date and time) are
    used. Returns None when the input is empty or is not a valid
    day/month/year date with an HH:MM or HH:MM:SS time.
    """
    if not added_pretty:
        return None

    parts = added_pretty.replace(" o ", " ").split()
    if len(parts) < 2:
        return None

    stamp = f"{parts[0]} {parts[1]}"
    for fmt in ADDED_FORMATS:
        try:
            parsed = datetime.datetime.strptime(stamp, fmt)
        except ValueError:
            continue
        return parsed.strftime("%Y-%m-%d %H:%M:%S")
    return None


# ============================================================
# Parse one torrent row
# ============================================================
def parse_row(cells):
    """Parse the <td> cells of one listing row into a dict for `insert_sql`.

    Args:
        cells: the row's cell elements; indexes 0, 1, 2, 4 and 5 are read
            (category, download link, name/info, seeders, leechers).

    Returns:
        A dict with the columns used by `insert_sql` plus the flag
        ``is_new_torrent``, or None when the row has no download link,
        no title link, or no ``id`` parameter in the details URL.

    Side effects:
        Queries the database for already stored torrent content and, when
        none is stored, sleeps 3 seconds and downloads the .torrent file
        through `requests_session`.

    Raises:
        Exceptions from locating or converting the seeders/leechers cells
        (e.g. ValueError for non-numeric text) propagate to the caller.
    """
    # Column 0: Category icon/text
    category = cells[0].text.strip()

    # Column 1: Download icon link
    try:
        download_a = cells[1].find_element(By.TAG_NAME, "a")
        download_link = download_a.get_attribute("href")
    except Exception:
        return None

    dl_query = urlparse.parse_qs(urlparse.urlparse(download_link).query)
    torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]

    # Column 2: Name and info
    title_links = cells[2].find_elements(By.TAG_NAME, "a")
    if not title_links:
        return None

    a_tag = title_links[0]
    visible_name = a_tag.text.strip()
    full_title = a_tag.get_attribute("title")
    details_link = a_tag.get_attribute("href")

    query = urlparse.parse_qs(urlparse.urlparse(details_link).query)
    if "id" not in query:
        return None
    torrent_hash = query["id"][0]

    # Use innerText for robust text extraction
    text_block = cells[2].get_attribute("innerText")
    text_block_clean = " ".join(text_block.split())

    # Regex for Size and Date
    size_match = SIZE_RE.search(text_block_clean)
    added_match = ADDED_RE.search(text_block_clean)
    size_pretty = size_match.group(1) if size_match else None
    added_pretty = added_match.group(1) if added_match else None

    # Date conversion: "29/11/2025 o 02:29" -> MySQL format
    added_mysql = _to_mysql_datetime(added_pretty)

    # Column 2: Image preview (if exists)
    img_link = None
    try:
        image_a = cells[2].find_element(
            By.XPATH, ".//a[contains(text(),'Obrázok')]"
        )
        mouseover = image_a.get_attribute("onmouseover") or ""
        img_match = IMG_SRC_RE.search(mouseover)
        if img_match:
            img_link = img_match.group(1).replace("'", "").strip()
            if img_link.startswith("//"):
                img_link = "https:" + img_link
    except Exception:
        # The preview is optional; a row without one is still stored.
        pass

    # Column 4: Seeders
    seeders_a = cells[4].find_element(By.TAG_NAME, "a")
    seeders_number = int(seeders_a.text.strip())
    seeders_link = seeders_a.get_attribute("href")

    # Column 5: Leechers
    leechers_a = cells[5].find_element(By.TAG_NAME, "a")
    leechers_number = int(leechers_a.text.strip())
    leechers_link = leechers_a.get_attribute("href")

    # Check database for existing binary content
    cursor.execute(
        "SELECT torrent_content FROM torrents WHERE torrent_hash=%s",
        (torrent_hash,),
    )
    row = cursor.fetchone()
    already_have_torrent = row is not None and row[0] is not None

    torrent_content = None
    if not already_have_torrent:
        time.sleep(3)  # Politeness delay
        try:
            resp = requests_session.get(download_link, timeout=DOWNLOAD_TIMEOUT)
            resp.raise_for_status()
            torrent_content = resp.content
        except requests.RequestException as exc:
            # Row is still stored; the NULL content is retried on a later run.
            print(f"⚠️ Torrent download failed for {torrent_hash}: {exc}")

    return {
        "torrent_hash": torrent_hash,
        "details_link": details_link,
        "category": category,
        "title_visible": visible_name,
        "title_full": full_title,
        "size_pretty": size_pretty,
        "added_datetime": added_mysql,
        "preview_image": img_link,
        "seeders": seeders_number,
        "seeders_link": seeders_link,
        "leechers": leechers_number,
        "leechers_link": leechers_link,
        "torrent_filename": torrent_filename,
        "torrent_content": torrent_content,
        "is_new_torrent": not already_have_torrent,
    }


# ============================================================
# 1) MySQL CONNECTION
# ============================================================
# NOTE(review): credentials should not live in source control. The
# SKTORRENT_DB_PASSWORD environment variable overrides the legacy default.
db = pymysql.connect(
    host="192.168.1.50",
    port=3306,
    user="root",
    password=os.environ.get("SKTORRENT_DB_PASSWORD", "Vlado9674+"),
    database="torrents",
    charset="utf8mb4",
    autocommit=True,
)
cursor = db.cursor()

driver = None
try:
    # ============================================================
    # 2) Selenium setup
    # ============================================================
    chrome_options = Options()
    chrome_options.add_argument("--start-maximized")
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_argument("--disable-popup-blocking")
    chrome_options.add_argument("--disable-extensions")

    driver = webdriver.Chrome(options=chrome_options)
    driver.set_window_position(380, 50)
    driver.set_window_size(1350, 1000)

    # The site must be open before its cookies can be added to the browser.
    driver.get("https://sktorrent.eu")

    if COOKIE_FILE.exists():
        with open(COOKIE_FILE, "r", encoding="utf-8") as f:
            cookies = json.load(f)
        for c in cookies:
            driver.add_cookie(c)
        print("🍪 Cookies loaded.")
    else:
        print("⚠️ Cookie file not found – login may be required.")

    # ============================================================
    # 3) requests.Session from Selenium cookies
    # ============================================================
    requests_session = requests.Session()
    for ck in driver.get_cookies():
        requests_session.cookies.set(ck["name"], ck["value"])
    print("🔗 Requests session initialized.")

    # ============================================================
    # 4) PROCESS ALL PAGES
    # ============================================================
    for page_num in range(0, TOTAL_PAGES):
        current_url = f"{BASE_URL}&page={page_num}"
        print(f"\n🌐 Loading Page Index {page_num} (Page {page_num + 1}/{TOTAL_PAGES})")

        driver.get(current_url)
        time.sleep(2)
        close_popup_if_any()

        # Find table rows; v1 table usually has 7 cells for a data row
        real_rows = []
        for table_row in driver.find_elements(By.CSS_SELECTOR, "table tr"):
            row_cells = table_row.find_elements(By.TAG_NAME, "td")
            if len(row_cells) == 7:
                real_rows.append(row_cells)

        if not real_rows:
            print("⚠️ No data rows found on this page. Ending loop.")
            break

        page_new_items = 0
        for cells in real_rows:
            try:
                data = parse_row(cells)
            except Exception as e:
                print(f"⚠️ parse_row failed: {e}")
                continue

            if not data:
                continue

            processed_count += 1

            if data["is_new_torrent"]:
                new_torrent_count += 1
                page_new_items += 1
                new_titles.append(data["title_visible"])
                print(f"💾 NEW: {data['title_visible']}")
            else:
                existing_torrent_count += 1
                print(f"♻️ UPDATING: {data['title_visible']}")

            cursor.execute(insert_sql, data)

        # If an entire page is old news, we can stop the deep crawl
        if page_new_items == 0 and page_num > 0:
            print("🛑 Page contained only known items. Sync complete.")
            break

        time.sleep(1)

    # ============================================================
    # 5) SEND EMAIL REPORT
    # ============================================================
    RUN_END = datetime.datetime.now()
    subject = f"SKTorrent run – {RUN_START:%Y-%m-%d %H:%M}"
    body = (
        f"Run started: {RUN_START:%Y-%m-%d %H:%M:%S}\n"
        f"Run finished: {RUN_END:%Y-%m-%d %H:%M:%S}\n\n"
        f"Processed torrents: {processed_count}\n"
        f"New torrents saved: {new_torrent_count}\n"
        f"Existing torrents updated: {existing_torrent_count}\n"
    )
    if new_titles:
        body += "\nNew torrents list:\n- " + "\n- ".join(new_titles)

    send_mail(to="vladimir.buzalka@buzalka.cz", subject=subject, body=body, html=False)
    print("📧 Email report sent.")
finally:
    # Release the browser and the DB connection even when the crawl fails,
    # so an aborted run does not leave Chrome or a MySQL session behind.
    if driver is not None:
        driver.quit()
    cursor.close()
    db.close()

print("🎉 DONE")