From 7b0404bfe34ab75b094e7782c0c92c4517281f67 Mon Sep 17 00:00:00 2001
From: "vladimir.buzalka"
Date: Fri, 30 Jan 2026 10:28:42 +0100
Subject: [PATCH] z230

---
 90 Import all torrents from all pages.py | 308 +++++++++++++++++++++++
 Reporter_ReadNewTorrents.py              |   4 +-
 2 files changed, 310 insertions(+), 2 deletions(-)
 create mode 100644 90 Import all torrents from all pages.py

diff --git a/90 Import all torrents from all pages.py b/90 Import all torrents from all pages.py
new file mode 100644
index 0000000..2c9cced
--- /dev/null
+++ b/90 Import all torrents from all pages.py
@@ -0,0 +1,308 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import pymysql
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.options import Options
+import time
+import re
+import urllib.parse as urlparse
+from pathlib import Path
+import json
+import requests
+import datetime
+import sys
+
+# Ensure this file exists in your directory
+from EmailMessagingGraph import send_mail
+
+# ============================================================
+# RUNTIME INFO
+# ============================================================
+RUN_START = datetime.datetime.now()
+processed_count = 0
+new_torrent_count = 0
+existing_torrent_count = 0
+new_titles = []
+
+print(f"🕒 Run started at {RUN_START:%Y-%m-%d %H:%M:%S}")
+sys.stdout.flush()
+
+# ============================================================
+# 1) MySQL CONNECTION
+# ============================================================
+db = pymysql.connect(
+    host="192.168.1.50",
+    port=3306,
+    user="root",
+    password="Vlado9674+",
+    database="torrents",
+    charset="utf8mb4",
+    autocommit=True,
+)
+cursor = db.cursor()
+
+# ============================================================
+# 2) Selenium setup
+# ============================================================
+COOKIE_FILE = Path("sktorrent_cookies.json")
+# Updated to standard torrents.php as requested
+BASE_URL = (
+    "https://sktorrent.eu/torrent/torrents.php"
+    "?active=0&category=24&order=data&by=DESC&zaner=&jazyk="
+)
+
+chrome_options = Options()
+chrome_options.add_argument("--start-maximized")
+chrome_options.add_argument("--disable-notifications")
+chrome_options.add_argument("--disable-popup-blocking")
+chrome_options.add_argument("--disable-extensions")
+
+driver = webdriver.Chrome(options=chrome_options)
+driver.set_window_position(380, 50)
+driver.set_window_size(1350, 1000)
+
+driver.get("https://sktorrent.eu")
+
+if COOKIE_FILE.exists():
+    with open(COOKIE_FILE, "r", encoding="utf-8") as f:
+        cookies = json.load(f)
+    for c in cookies:
+        driver.add_cookie(c)
+    print("🍪 Cookies loaded.")
+else:
+    print("⚠️ Cookie file not found – login may be required.")
+
+# ============================================================
+# 3) requests.Session from Selenium cookies
+# ============================================================
+requests_session = requests.Session()
+for ck in driver.get_cookies():
+    requests_session.cookies.set(ck["name"], ck["value"])
+
+print("🔗 Requests session initialized.")
+
+
+# ============================================================
+# 4) Popup handler
+# ============================================================
+def close_popup_if_any():
+    try:
+        driver.execute_script("try { interstitialBox.closeit(); } catch(e) {}")
+        time.sleep(0.5)
+    except Exception:
+        pass
+
+
+# ============================================================
+# 5) Parse one torrent row
+# ============================================================
+def parse_row(cells):
+    # Column 0: Category icon/text
+    category = cells[0].text.strip()
+
+    try:
+        # Column 1: Download icon link
+        download_a = cells[1].find_element(By.TAG_NAME, "a")
+        download_link = download_a.get_attribute("href")
+    except:
+        return None
+
+    parsed_dl = urlparse.urlparse(download_link)
+    dl_query = urlparse.parse_qs(parsed_dl.query)
+    torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]
+
+    # Column 2: Name and info
+    title_links = cells[2].find_elements(By.TAG_NAME, "a")
+    if not title_links:
+        return None
+
+    a_tag = title_links[0]
+    visible_name = a_tag.text.strip()
+    full_title = a_tag.get_attribute("title")
+    details_link = a_tag.get_attribute("href")
+
+    parsed = urlparse.urlparse(details_link)
+    query = urlparse.parse_qs(parsed.query)
+    if "id" not in query:
+        return None
+
+    torrent_hash = query["id"][0]
+
+    # Use innerText for robust text extraction
+    text_block = cells[2].get_attribute("innerText")
+    text_block_clean = " ".join(text_block.split())
+
+    # Regex for Size and Date
+    size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE)
+    added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE)
+
+    size_pretty = size_match.group(1) if size_match else None
+    added_pretty = added_match.group(1) if added_match else None
+
+    # Date conversion: "29/11/2025 o 02:29" -> MySQL format
+    added_mysql = None
+    if added_pretty:
+        clean = added_pretty.replace(" o ", " ").strip()
+        parts = clean.split(" ")
+        if len(parts) >= 2:
+            date_part = parts[0]
+            time_part = parts[1]
+            if len(time_part.split(":")) == 2:
+                time_part += ":00"
+            try:
+                day, month, year = date_part.split("/")
+                added_mysql = f"{year}-{month}-{day} {time_part}"
+            except:
+                added_mysql = None
+
+    # Column 2: Image preview (if exists)
+    img_link = None
+    try:
+        image_a = cells[2].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
+        mouseover = image_a.get_attribute("onmouseover")
+        img_match = re.search(r"src=([^ ]+)", mouseover)
+        if img_match:
+            img_link = img_match.group(1).replace("'", "").strip()
+            if img_link.startswith("//"):
+                img_link = "https:" + img_link
+    except:
+        pass
+
+    # Column 4: Seeders
+    seeders_a = cells[4].find_element(By.TAG_NAME, "a")
+    seeders_number = int(seeders_a.text.strip())
+    seeders_link = seeders_a.get_attribute("href")
+
+    # Column 5: Leechers
+    leechers_a = cells[5].find_element(By.TAG_NAME, "a")
+    leechers_number = int(leechers_a.text.strip())
+    leechers_link = leechers_a.get_attribute("href")
+
+    # Check database for existing binary content
+    cursor.execute("SELECT torrent_content FROM torrents WHERE torrent_hash=%s", (torrent_hash,))
+    row = cursor.fetchone()
+    already_have_torrent = row is not None and row[0] is not None
+
+    torrent_content = None
+    if not already_have_torrent:
+        time.sleep(3)  # Politeness delay
+        try:
+            resp = requests_session.get(download_link)
+            resp.raise_for_status()
+            torrent_content = resp.content
+        except:
+            torrent_content = None
+
+    return {
+        "torrent_hash": torrent_hash,
+        "details_link": details_link,
+        "category": category,
+        "title_visible": visible_name,
+        "title_full": full_title,
+        "size_pretty": size_pretty,
+        "added_datetime": added_mysql,
+        "preview_image": img_link,
+        "seeders": seeders_number,
+        "seeders_link": seeders_link,
+        "leechers": leechers_number,
+        "leechers_link": leechers_link,
+        "torrent_filename": torrent_filename,
+        "torrent_content": torrent_content if not already_have_torrent else None,
+        "is_new_torrent": not already_have_torrent,
+    }
+
+
+# ============================================================
+# 6) INSERT SQL
+# ============================================================
+insert_sql = """
+INSERT INTO torrents (
+    torrent_hash, details_link, category, title_visible, title_full,
+    size_pretty, added_datetime, preview_image,
+    seeders, seeders_link, leechers, leechers_link,
+    torrent_filename, torrent_content
+) VALUES (
+    %(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
+    %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
+    %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
+    %(torrent_filename)s, %(torrent_content)s
+)
+ON DUPLICATE KEY UPDATE
+    seeders = VALUES(seeders),
+    leechers = VALUES(leechers),
+    torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
+"""
+
+# ============================================================
+# 7) PROCESS ALL PAGES
+# ============================================================
+TOTAL_PAGES = 226
+
+for page_num in range(0, TOTAL_PAGES):
+    current_url = f"{BASE_URL}&page={page_num}"
+    print(f"\n🌐 Loading Page Index {page_num} (Page {page_num + 1}/{TOTAL_PAGES})")
+
+    driver.get(current_url)
+    time.sleep(2)
+    close_popup_if_any()
+
+    # Find table rows
+    rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
+    # v1 table usually has 7 cells for a data row
+    real_rows = [r.find_elements(By.TAG_NAME, "td") for r in rows if len(r.find_elements(By.TAG_NAME, "td")) == 7]
+
+    if not real_rows:
+        print("⚠️ No data rows found on this page. Ending loop.")
+        break
+
+    page_new_items = 0
+    for cells in real_rows:
+        try:
+            data = parse_row(cells)
+        except Exception as e:
+            print(f"⚠️ parse_row failed: {e}")
+            continue
+
+        if not data: continue
+        processed_count += 1
+
+        if data["is_new_torrent"]:
+            new_torrent_count += 1
+            page_new_items += 1
+            new_titles.append(data["title_visible"])
+            print(f"💾 NEW: {data['title_visible']}")
+        else:
+            existing_torrent_count += 1
+            print(f"♻️ UPDATING: {data['title_visible']}")
+
+        cursor.execute(insert_sql, data)
+
+    # If an entire page is old news, we can stop the deep crawl
+    if page_new_items == 0 and page_num > 0:
+        print("🛑 Page contained only known items. Sync complete.")
+        break
+
+    time.sleep(1)
+
+# ============================================================
+# 8) SEND EMAIL REPORT
+# ============================================================
+RUN_END = datetime.datetime.now()
+subject = f"SKTorrent run – {RUN_START:%Y-%m-%d %H:%M}"
+body = (
+    f"Run started: {RUN_START:%Y-%m-%d %H:%M:%S}\n"
+    f"Run finished: {RUN_END:%Y-%m-%d %H:%M:%S}\n\n"
+    f"Processed torrents: {processed_count}\n"
+    f"New torrents saved: {new_torrent_count}\n"
+    f"Existing torrents updated: {existing_torrent_count}\n"
+)
+if new_titles:
+    body += "\nNew torrents list:\n- " + "\n- ".join(new_titles)
+
+send_mail(to="vladimir.buzalka@buzalka.cz", subject=subject, body=body, html=False)
+print("📧 Email report sent.")
+
+driver.quit()
+print("🎉 DONE")
\ No newline at end of file
diff --git a/Reporter_ReadNewTorrents.py b/Reporter_ReadNewTorrents.py
index eef20f9..c40c955 100644
--- a/Reporter_ReadNewTorrents.py
+++ b/Reporter_ReadNewTorrents.py
@@ -37,8 +37,8 @@ sys.stdout.flush()
 # ============================================================
 db = pymysql.connect(
-    host="192.168.1.76",
-    port=3307,
+    host="192.168.1.50",
+    port=3306,
     user="root",
     password="Vlado9674+",
     database="torrents",