#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scrape sktorrent.eu category listings into MySQL and email a run summary.

The script drives Chrome through Selenium, reuses the browser cookies for
direct ``requests`` downloads of .torrent files, upserts each listing row into
the ``torrents`` table and finally sends a plain-text report via ``send_mail``.
"""

import datetime
import json
import os
import re
import sys
import time
import urllib.parse as urlparse
from pathlib import Path

import pymysql
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Ensure this file exists in your directory
from EmailMessagingGraph import send_mail

# ============================================================
# RUNTIME INFO
# ============================================================
RUN_START = datetime.datetime.now()
processed_count = 0          # rows successfully parsed during this run
new_torrent_count = 0        # rows whose .torrent payload was not yet stored
existing_torrent_count = 0   # rows whose payload was already in the database
new_titles = []              # visible titles of the new rows, for the report

print(f"🕒 Run started at {RUN_START:%Y-%m-%d %H:%M:%S}", flush=True)

# ============================================================
# 1) MySQL CONNECTION
# ============================================================
# Connection settings can be overridden through the environment so that
# credentials do not have to live in the source file. The literals below are
# only fallbacks that keep existing deployments working.
# NOTE(review): the fallback password is committed in clear text; consider
# rotating it and relying on SKTORRENT_DB_PASSWORD exclusively.
db = pymysql.connect(
    host=os.environ.get("SKTORRENT_DB_HOST", "192.168.1.76"),
    port=int(os.environ.get("SKTORRENT_DB_PORT", "3306")),
    user=os.environ.get("SKTORRENT_DB_USER", "root"),
    password=os.environ.get("SKTORRENT_DB_PASSWORD", "Vlado9674+"),
    database=os.environ.get("SKTORRENT_DB_NAME", "torrents"),
    charset="utf8mb4",
    autocommit=True,
)
cursor = db.cursor()

# ============================================================
# 2) Selenium setup
# ============================================================
COOKIE_FILE = Path("sktorrent_cookies.json")

# Site category id -> human-readable name used in log output.
CATEGORIES = {
    24: "Knihy a časopisy",
    32: "Mluvené slovo",
}
MAX_PAGES = 10

chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--disable-extensions")

driver = webdriver.Chrome(options=chrome_options)
driver.set_window_position(380, 50)
driver.set_window_size(1350, 1000)

# The site must be open before cookies for its domain can be added.
driver.get("https://sktorrent.eu")

if COOKIE_FILE.exists():
    with COOKIE_FILE.open("r", encoding="utf-8") as f:
        cookies = json.load(f)
    for c in cookies:
        driver.add_cookie(c)
    print("🍪 Cookies loaded.")
else:
    print("⚠️ Cookie file not found – login may be required.")
# ============================================================
# 3) requests.Session from Selenium cookies
# ============================================================
# Copy the browser cookies so plain HTTP downloads share the browser session.
requests_session = requests.Session()
for ck in driver.get_cookies():
    requests_session.cookies.set(ck["name"], ck["value"])
print("🔗 Requests session initialized.")


# ============================================================
# 4) Build URL for category and page
# ============================================================
def get_torrent_url(category, page):
    """Return the listing URL for ``category`` at page index ``page``.

    Both values are interpolated verbatim into the ``category`` and ``page``
    query parameters; the ordering parameters are fixed to ``order=data`` and
    ``by=DESC``.
    """
    base = "https://sktorrent.eu/torrent/torrents.php"
    return f"{base}?active=0&category={category}&order=data&by=DESC&zaner=&jazyk=&page={page}"


# ============================================================
# 5) Popup handler
# ============================================================
def close_popup_if_any():
    """Ask the page to close its interstitial box, ignoring any failure.

    The JavaScript itself swallows errors (e.g. ``interstitialBox`` missing);
    WebDriver-level errors are deliberately ignored too because the popup is
    optional. A 0.5 s pause follows a successful script call.
    """
    try:
        driver.execute_script("try { interstitialBox.closeit(); } catch(e) {}")
        time.sleep(0.5)
    except Exception:
        # Best effort only: a missing popup must never stop the run.
        pass


# ============================================================
# 6) Parse one torrent row (MODIFIED)
# ============================================================
# Compiled once; parse_row runs for every listing row.
_SIZE_RE = re.compile(r"Velkost ([0-9\.]+ ?[KMG]B)", re.IGNORECASE)
_ADDED_RE = re.compile(r"Pridany (.+?)(?:\sObrázok|$)", re.IGNORECASE)
_IMG_SRC_RE = re.compile(r"src=([^ ]+)")


def _to_mysql_datetime(added_pretty):
    """Turn a ``D/M/Y o H:M[:S]`` string into ``Y-M-D H:M:S``, else ``None``.

    The pieces are rearranged textually, not validated as a real date.
    """
    if not added_pretty:
        return None
    parts = added_pretty.replace(" o ", " ").strip().split(" ")
    if len(parts) < 2:
        return None
    date_part, time_part = parts[0], parts[1]
    if len(time_part.split(":")) == 2:
        time_part += ":00"  # pad missing seconds
    try:
        day, month, year = date_part.split("/")
    except ValueError:
        # Date did not consist of exactly three "/"-separated pieces.
        return None
    return f"{year}-{month}-{day} {time_part}"


def _extract_preview_image(cell):
    """Return the preview image URL found in the 'Obrázok' link, or ``None``."""
    image_links = cell.find_elements(By.XPATH, ".//a[contains(text(),'Obrázok')]")
    if not image_links:
        return None
    mouseover = image_links[0].get_attribute("onmouseover")
    if not mouseover:
        return None
    img_match = _IMG_SRC_RE.search(mouseover)
    if not img_match:
        return None
    img_link = img_match.group(1).replace("'", "").strip()
    if img_link.startswith("//"):
        img_link = "https:" + img_link  # protocol-relative URL
    return img_link


def _link_count_and_href(cell):
    """Return ``(int(link text), link href)`` for the first anchor in ``cell``."""
    link = cell.find_element(By.TAG_NAME, "a")
    return int(link.text.strip()), link.get_attribute("href")


def parse_row(cells):
    """Parse one listing row and fetch its .torrent file when not yet stored.

    Args:
        cells: the ``<td>`` WebElements of one table row; indexes 0, 1, 2, 4
            and 5 are read (category, download link, title/details block,
            seeders, leechers).

    Returns:
        A dict whose keys match the named placeholders of ``insert_sql`` plus
        ``is_new_torrent``, or ``None`` when the row has no title link, its
        details link has no ``id`` parameter, or any step inside the guarded
        section raises.

    Side effects:
        Runs a SELECT through the module-level ``cursor`` and, when no stored
        ``torrent_content`` exists, sleeps 2 s and downloads the file through
        ``requests_session``. A failed download is logged and leaves
        ``torrent_content`` as ``None`` while ``is_new_torrent`` stays ``True``.
    """
    # Read outside the guarded section, so errors here reach the caller.
    category = cells[0].text.strip()

    try:
        # --- 1. DOWNLOAD URL (column 1) ---
        download_url = cells[1].find_element(By.TAG_NAME, "a").get_attribute("href")
        dl_query = urlparse.parse_qs(urlparse.urlparse(download_url).query)
        torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]

        # --- 2. DETAILS & HASH (column 2) ---
        title_links = cells[2].find_elements(By.TAG_NAME, "a")
        if not title_links:
            return None

        a_tag = title_links[0]
        visible_name = a_tag.text.strip()
        full_title = a_tag.get_attribute("title")
        details_link = a_tag.get_attribute("href")

        query = urlparse.parse_qs(urlparse.urlparse(details_link).query)
        if "id" not in query:
            return None
        # The "id" query parameter of the details link is stored as the hash.
        torrent_hash = query["id"][0]

        # --- 3. SIZE & DATE ---
        # Collapse all whitespace so the regexes see a single-line text.
        text_block_clean = " ".join(cells[2].get_attribute("innerText").split())
        size_match = _SIZE_RE.search(text_block_clean)
        added_match = _ADDED_RE.search(text_block_clean)
        size_pretty = size_match.group(1) if size_match else None
        added_mysql = _to_mysql_datetime(added_match.group(1) if added_match else None)

        # --- 4. IMAGE & STATS ---
        try:
            img_link = _extract_preview_image(cells[2])
        except Exception:
            # The preview is optional; a lookup failure must not drop the row.
            img_link = None

        seeders_number, seeders_link = _link_count_and_href(cells[4])
        leechers_number, leechers_link = _link_count_and_href(cells[5])

        # --- 5. DATABASE CHECK & DOWNLOAD ---
        cursor.execute(
            "SELECT torrent_content FROM torrents WHERE torrent_hash=%s",
            (torrent_hash,),
        )
        db_row = cursor.fetchone()
        already_have_torrent = db_row is not None and db_row[0] is not None

        torrent_content = None
        if not already_have_torrent:
            time.sleep(2)  # pause before each download request
            try:
                resp = requests_session.get(download_url, timeout=10)
                resp.raise_for_status()
                torrent_content = resp.content
            except Exception as e:
                print(f" ⚠️ Download failed for {visible_name}: {e}")

        return {
            "torrent_hash": torrent_hash,
            "details_link": details_link,
            "download_url": download_url,
            "category": category,
            "title_visible": visible_name,
            "title_full": full_title,
            "size_pretty": size_pretty,
            "added_datetime": added_mysql,
            "preview_image": img_link,
            "seeders": seeders_number,
            "seeders_link": seeders_link,
            "leechers": leechers_number,
            "leechers_link": leechers_link,
            "torrent_filename": torrent_filename,
            # Stays None when the payload is already stored or the download failed.
            "torrent_content": torrent_content,
            "is_new_torrent": not already_have_torrent,
        }

    except Exception as e:
        print(f"⚠️ parse_row logic failed: {e}")
        return None


# ============================================================
# 7) INSERT SQL (MODIFIED)
# ============================================================
insert_sql = """
INSERT INTO torrents (
    torrent_hash, details_link, download_url, category, title_visible, title_full,
    size_pretty, added_datetime, preview_image,
    seeders, seeders_link, leechers, leechers_link,
    torrent_filename, torrent_content
) VALUES (
    %(torrent_hash)s, %(details_link)s, %(download_url)s, %(category)s,
    %(title_visible)s, %(title_full)s,
    %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
    %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
    %(torrent_filename)s, %(torrent_content)s
)
ON DUPLICATE KEY UPDATE
    seeders = VALUES(seeders),
    leechers = VALUES(leechers),
    download_url = VALUES(download_url),
    torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
"""
# Note: COALESCE(VALUES(torrent_content), torrent_content) takes the newly
# supplied value when it is not NULL and otherwise keeps the stored one.
# ============================================================
# 8) PROCESS ALL PAGES AND CATEGORIES
# ============================================================
# Everything runs inside try/finally so the browser and the database
# connection are released even when a page load or an insert raises.
try:
    for category_id, category_name in CATEGORIES.items():
        print(f"\n📚 Starting category: {category_name} (ID: {category_id})")

        for page_num in range(MAX_PAGES):
            current_url = get_torrent_url(category_id, page_num)
            print(f"\n🌐 Loading page {page_num}/{MAX_PAGES - 1} for {category_name}")

            driver.get(current_url)
            time.sleep(2)
            close_popup_if_any()

            # Find table rows
            rows = driver.find_elements(By.CSS_SELECTOR, "table tr")

            # FILTER: only keep rows that have 7 columns AND a link in the
            # 2nd column (index 1). This discards headers and spacer rows.
            real_rows = []
            for r in rows:
                cells = r.find_elements(By.TAG_NAME, "td")
                if len(cells) == 7 and cells[1].find_elements(By.TAG_NAME, "a"):
                    real_rows.append(cells)

            if not real_rows:
                print(f"⚠️ No data rows found. Stopping pagination for {category_name}")
                break

            for cells in real_rows:
                try:
                    data = parse_row(cells)
                except Exception as e:
                    print(f"⚠️ parse_row failed: {e}")
                    continue

                if not data:
                    continue

                processed_count += 1

                if data["is_new_torrent"]:
                    new_torrent_count += 1
                    new_titles.append(data["title_visible"])
                    print(f"💾 NEW: {data['title_visible']}")
                else:
                    existing_torrent_count += 1
                    print(f"♻️ UPDATING: {data['title_visible']}")

                # Upsert: the ON DUPLICATE KEY clause of insert_sql refreshes
                # seeders, leechers and download_url, and keeps the stored
                # torrent_content when the new value is NULL.
                cursor.execute(insert_sql, data)

            # NOTE(review): the original nesting of this pause was lost when
            # the file's newlines were collapsed; it is assumed to run once
            # per page rather than once per row - confirm.
            time.sleep(1)

    # ============================================================
    # 9) SEND EMAIL REPORT
    # ============================================================
    RUN_END = datetime.datetime.now()
    subject = f"SKTorrent run – {RUN_START:%Y-%m-%d %H:%M}"
    body = (
        f"Run started: {RUN_START:%Y-%m-%d %H:%M:%S}\n"
        f"Run finished: {RUN_END:%Y-%m-%d %H:%M:%S}\n\n"
        f"Processed torrents: {processed_count}\n"
        f"New torrents saved: {new_torrent_count}\n"
        f"Existing torrents updated: {existing_torrent_count}\n"
    )
    if new_titles:
        body += "\nNew torrents list:\n- " + "\n- ".join(new_titles)

    send_mail(to="vladimir.buzalka@buzalka.cz", subject=subject, body=body, html=False)
    print("📧 Email report sent.")
finally:
    try:
        driver.quit()
    finally:
        # Close the DB handles even if shutting down the browser fails.
        cursor.close()
        db.close()

print("🎉 DONE")