#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scrape the sktorrent.eu category-24 listing into the MySQL ``torrents`` table.

Flow:
  1. Connect to MySQL and start a Chrome WebDriver.
  2. Load saved cookies from ``COOKIE_FILE`` into the browser and mirror the
     browser cookies into a ``requests.Session`` (used to fetch .torrent files).
  3. Walk the listing page by page, following the "Dalsi >>" link, and
     insert/update one DB row per torrent, including the .torrent payload.

The browser, HTTP session and DB connection are always released on exit,
including when the crawl is aborted by an exception or Ctrl+C.
"""

import json
import os
import re
import time
import urllib.parse as urlparse
from datetime import datetime
from pathlib import Path

import pymysql
import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# ============================================================
# 0) Configuration
# ============================================================

COOKIE_FILE = Path("sktorrent_cookies.json")

# Start URL for category 24, sorted by date DESC
START_URL = (
    "https://sktorrent.eu/torrent/torrents.php"
    "?active=0&category=24&order=data&by=DESC&zaner=&jazyk=&page=0"
)

DOWNLOAD_DELAY_SECONDS = 3     # pause before each .torrent download
DOWNLOAD_TIMEOUT_SECONDS = 30  # without a timeout requests may block forever
PAGE_SETTLE_SECONDS = 2        # wait after loading a listing page
PAGE_DELAY_SECONDS = 1         # small pause between listing pages

# Patterns applied to the whitespace-normalised text of the title cell.
SIZE_RE = re.compile(r"Velkost ([0-9\.]+ ?[KMGT]B)", re.IGNORECASE)
ADDED_RE = re.compile(r"Pridany (.+?)(?:\sObrázok|$)", re.IGNORECASE)
PREVIEW_SRC_RE = re.compile(r"src=([^ ]+)")

# Accepted layouts of the "added" timestamp once the " o " separator is removed.
ADDED_FORMATS = ("%d/%m/%Y %H:%M:%S", "%d/%m/%Y %H:%M", "%d/%m/%Y")

# ============================================================
# 1) MySQL INSERT statement
# ============================================================

# torrent_content uses COALESCE so that a failed download in this run (NULL)
# does not overwrite a .torrent payload stored by an earlier run.
insert_sql = """
INSERT INTO torrents (
    torrent_hash, details_link, category, title_visible, title_full,
    size_pretty, added_datetime, preview_image,
    seeders, seeders_link, leechers, leechers_link,
    torrent_filename, torrent_content
) VALUES (
    %(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
    %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
    %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
    %(torrent_filename)s, %(torrent_content)s
)
ON DUPLICATE KEY UPDATE
    details_link = VALUES(details_link),
    category = VALUES(category),
    title_visible = VALUES(title_visible),
    title_full = VALUES(title_full),
    size_pretty = VALUES(size_pretty),
    added_datetime = VALUES(added_datetime),
    preview_image = VALUES(preview_image),
    seeders = VALUES(seeders),
    seeders_link = VALUES(seeders_link),
    leechers = VALUES(leechers),
    leechers_link = VALUES(leechers_link),
    torrent_filename = VALUES(torrent_filename),
    torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
"""


# ============================================================
# 2) Popup handling
# ============================================================

def close_popup_if_any():
    """Try to close the interstitial ad via the page's ``interstitialBox.closeit()``.

    Best effort: the JavaScript swallows its own errors, and a WebDriver
    failure is only reported, never raised.
    """
    try:
        driver.execute_script("try { interstitialBox.closeit(); } catch(e) {}")
        # Short pause so the DOM can settle
        time.sleep(0.5)
        print("🧹 Popup closed via JS fallback (if present).")
    except WebDriverException as e:
        print("ℹ️ Popup JS handler not found:", e)


# ============================================================
# 3) Row parsing helpers
# ============================================================

def _to_mysql_datetime(added_pretty):
    """Convert e.g. ``"29/11/2025 o 02:29"`` to ``"2025-11-29 02:29:00"``.

    Only the first two whitespace-separated tokens (date, time) are used, so
    trailing text captured by the regex is ignored. Returns ``None`` when the
    text matches none of ``ADDED_FORMATS``.
    """
    tokens = added_pretty.replace(" o ", " ").split()
    if not tokens:
        return None
    candidate = " ".join(tokens[:2])
    for fmt in ADDED_FORMATS:
        try:
            parsed = datetime.strptime(candidate, fmt)
        except ValueError:
            continue
        return parsed.strftime("%Y-%m-%d %H:%M:%S")
    return None


def _extract_preview_image(cell):
    """Return the preview image URL from the 'Obrázok' link of *cell*, or ``None``.

    The URL is read from the ``src=...`` fragment of the link's ``onmouseover``
    attribute; protocol-relative URLs (``//host/...``) are prefixed with
    ``https:``.
    """
    links = cell.find_elements(By.XPATH, ".//a[contains(text(),'Obrázok')]")
    if not links:
        return None
    mouseover = links[0].get_attribute("onmouseover") or ""
    src_match = PREVIEW_SRC_RE.search(mouseover)
    if not src_match:
        return None
    img_link = src_match.group(1).replace("'", "").strip()
    if img_link.startswith("//"):
        img_link = "https:" + img_link
    return img_link


def _read_peer_count(cell):
    """Return ``(count, href)`` read from the first ``<a>`` inside *cell*.

    Raises:
        NoSuchElementException: the cell contains no link.
        ValueError: the link text is not an integer.
    """
    link = cell.find_element(By.TAG_NAME, "a")
    return int(link.text.strip()), link.get_attribute("href")


def _download_torrent(download_link, torrent_hash):
    """Fetch the .torrent payload; return its bytes, or ``None`` on failure."""
    time.sleep(DOWNLOAD_DELAY_SECONDS)  # spacing between torrent downloads
    try:
        resp = requests_session.get(download_link, timeout=DOWNLOAD_TIMEOUT_SECONDS)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f"⚠️ Could not download torrent file for {torrent_hash}: {e}")
        return None
    return resp.content


# ============================================================
# 4) Parsing of a single row (one torrent)
# ============================================================

def parse_row(cells):
    """Parse one listing row into the parameter dict expected by ``insert_sql``.

    Args:
        cells: list of 7 ``<td>`` WebElements with this layout:
            0: category
            1: download link (.torrent)
            2: title + size + date + 'Obrázok' + genre
            3: -- (ignored)
            4: seeders
            5: leechers
            6: completed (not read here)

    Returns:
        A dict keyed like the ``%(name)s`` placeholders of ``insert_sql``, or
        ``None`` when the row lacks a download link, a title/details link, an
        ``id`` query parameter, or readable seeder/leecher counts.
    """
    # 1) Category
    category = cells[0].text.strip()

    # 2) Download link of the .torrent file (cells[1])
    download_anchors = cells[1].find_elements(By.TAG_NAME, "a")
    download_link = download_anchors[0].get_attribute("href") if download_anchors else None
    if not download_link:
        print("⚠️ No download link in row, skipping.")
        return None

    dl_query = urlparse.parse_qs(urlparse.urlparse(download_link).query)
    torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]

    # 3) Title + details link (cells[2])
    title_links = cells[2].find_elements(By.TAG_NAME, "a")
    if not title_links:
        print("⚠️ No title link — skipping row")
        return None

    a_tag = title_links[0]
    visible_name = a_tag.text.strip()
    full_title = a_tag.get_attribute("title")
    details_link = a_tag.get_attribute("href")
    if not details_link:
        print("⚠️ Row has no details link — skipping")
        return None

    # The value of ?id= in the details link is stored as torrent_hash
    query = urlparse.parse_qs(urlparse.urlparse(details_link).query)
    if "id" not in query:
        print("⚠️ Skipping row with no torrent ID →", details_link)
        return None
    torrent_hash = query["id"][0]

    # 4) Size + date, parsed from the whitespace-normalised cell text
    text_block = cells[2].get_attribute("innerText") or ""
    text_block_clean = " ".join(text_block.split())

    size_match = SIZE_RE.search(text_block_clean)
    added_match = ADDED_RE.search(text_block_clean)
    size_pretty = size_match.group(1) if size_match else None
    added_pretty = added_match.group(1) if added_match else None
    added_mysql = _to_mysql_datetime(added_pretty) if added_pretty else None

    # 5) Image preview (optional)
    img_link = _extract_preview_image(cells[2])

    # 6) Seeders / leechers; a malformed row is skipped instead of aborting
    #    the whole crawl, and skipped before any download is attempted.
    try:
        seeders_number, seeders_link = _read_peer_count(cells[4])
        leechers_number, leechers_link = _read_peer_count(cells[5])
    except (NoSuchElementException, ValueError) as e:
        print(f"⚠️ Could not read seeders/leechers for {torrent_hash}: {e}")
        return None

    # 7) Download the .torrent content (None when the download fails)
    torrent_content = _download_torrent(download_link, torrent_hash)

    return {
        "torrent_hash": torrent_hash,
        "details_link": details_link,
        "category": category,
        "title_visible": visible_name,
        "title_full": full_title,
        "size_pretty": size_pretty,
        "added_datetime": added_mysql,
        "preview_image": img_link,
        "seeders": seeders_number,
        "seeders_link": seeders_link,
        "leechers": leechers_number,
        "leechers_link": leechers_link,
        "torrent_filename": torrent_filename,
        "torrent_content": torrent_content,
    }


# ============================================================
# 5) Processing of one listing page
# ============================================================

def process_current_page(page_index: int):
    """Parse and store every torrent row of the page currently open in the browser.

    A "real" torrent row is a ``<tr>`` with exactly 7 ``<td>`` cells; each one
    is parsed with ``parse_row`` and written with ``insert_sql``
    (insert or update).

    Args:
        page_index: zero-based page counter, used only in log output.
    """
    candidate_cells = (
        row.find_elements(By.TAG_NAME, "td")
        for row in driver.find_elements(By.CSS_SELECTOR, "table tr")
    )
    # Real torrent rows always have exactly 7 TD cells
    real_rows = [cells for cells in candidate_cells if len(cells) == 7]

    print(f"📄 Page {page_index}: {len(real_rows)} torrent rows")

    for cells in real_rows:
        data = parse_row(cells)
        if not data:
            continue
        print(f" 💾 [{page_index}] Saving:", data["title_visible"])
        cursor.execute(insert_sql, data)


# ============================================================
# 6) MySQL connection
# ============================================================

# NOTE(review): credentials used to be hard-coded only. They can now be
# overridden through environment variables; the defaults keep the previous
# behaviour. Consider removing the password default from the source.
db = pymysql.connect(
    host=os.environ.get("SKTORRENT_DB_HOST", "192.168.1.76"),
    port=int(os.environ.get("SKTORRENT_DB_PORT", "3307")),
    user=os.environ.get("SKTORRENT_DB_USER", "root"),
    password=os.environ.get("SKTORRENT_DB_PASSWORD", "Vlado9674+"),
    database=os.environ.get("SKTORRENT_DB_NAME", "torrents"),
    charset="utf8mb4",
    autocommit=True,
)
cursor = db.cursor()

# ============================================================
# 7) Selenium setup
# ============================================================

chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--disable-extensions")

requests_session = requests.Session()
driver = webdriver.Chrome(options=chrome_options)

try:
    # Window position and size (so it does not cover the IDE window)
    driver.set_window_position(380, 50)   # roughly 10 cm from the left edge
    driver.set_window_size(1350, 1000)    # adjust to your monitor

    # Open the main page first: cookies can only be added for the current domain
    driver.get("https://sktorrent.eu")

    # Load cookies from JSON
    if COOKIE_FILE.exists():
        with open(COOKIE_FILE, "r", encoding="utf-8") as f:
            cookies = json.load(f)
        for c in cookies:
            driver.add_cookie(c)
        print("🍪 Cookies loaded.")
    else:
        print("⚠️ Cookie file not found, you may not be logged in!")

    # ========================================================
    # 8) Copy browser cookies into requests.Session (for .torrent downloads)
    # ========================================================
    for ck in driver.get_cookies():
        requests_session.cookies.set(ck["name"], ck["value"])
    print("🔗 Requests session initialized with Selenium cookies.")

    # ========================================================
    # 9) Main paging loop
    # ========================================================
    current_url = START_URL
    page_index = 0

    while True:
        print(f"\n🌐 Loading page {page_index}: {current_url}")
        driver.get(current_url)
        time.sleep(PAGE_SETTLE_SECONDS)

        # close the popup, if there is one
        close_popup_if_any()

        # process the current page
        process_current_page(page_index)

        # Look for the "Dalsi >>" (next) link. find_elements returns an empty
        # list instead of raising, so only a truly missing link ends the
        # crawl; other WebDriver errors propagate instead of being reported
        # as "last page".
        next_links = driver.find_elements(By.XPATH, "//a[b[contains(text(),'Dalsi')]]")
        if not next_links:
            print("✅ No 'Dalsi >>' link found, reached last page. Done.")
            break

        next_url = next_links[0].get_attribute("href")
        if not next_url:
            print("⛔ Next link has no href, stopping.")
            break

        # resolve relative links against the current page
        next_url = urlparse.urljoin(current_url, next_url)

        # identical URL would mean an endless loop, so stop
        if next_url == current_url:
            print("⛔ Next URL equals current URL, stopping.")
            break

        print("➡️ Next page:", next_url)
        current_url = next_url
        page_index += 1

        # small pause between pages
        time.sleep(PAGE_DELAY_SECONDS)

    print("\n🎉 DONE — All pages processed, torrents saved & torrent files downloaded.")
finally:
    # Always release the browser, HTTP session and DB connection, even when
    # the crawl fails part-way through.
    try:
        driver.quit()
    finally:
        requests_session.close()
        cursor.close()
        db.close()