From 3631f6cdf5a8a271b1d1dfddb3c96d8465156654 Mon Sep 17 00:00:00 2001 From: vlado Date: Mon, 15 Dec 2025 06:11:53 +0100 Subject: [PATCH] reporter --- 30 OpenTextLIsting v5.py | 390 +++++++++++++++++++++++++++++++++++++++ 30 OpenTextListing v2.py | 256 +++++++++++++++++++++++++ 30 OpenTextListing v3.py | 291 +++++++++++++++++++++++++++++ 30 OpenTextListing v4.py | 375 +++++++++++++++++++++++++++++++++++++ 4 files changed, 1312 insertions(+) diff --git a/30 OpenTextLIsting v5.py b/30 OpenTextLIsting v5.py index e69de29..ac2d9e5 100644 --- a/30 OpenTextLIsting v5.py +++ b/30 OpenTextLIsting v5.py @@ -0,0 +1,390 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import pymysql +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.chrome.options import Options +import time +import re +import urllib.parse as urlparse +from pathlib import Path +import json +import requests + + +# ============================================================ +# 1) MySQL CONNECTION +# ============================================================ + +db = pymysql.connect( + host="192.168.1.76", + port=3307, + user="root", + password="Vlado9674+", + database="torrents", + charset="utf8mb4", + autocommit=True +) + +cursor = db.cursor() + + +# ============================================================ +# 2) Selenium setup +# ============================================================ + +COOKIE_FILE = Path("sktorrent_cookies.json") + +# Start URL pro kategorii 24, seřazeno podle data DESC +START_URL = ( + "https://sktorrent.eu/torrent/torrents.php" + "?active=0&category=24&order=data&by=DESC&zaner=&jazyk=&page=90" +) + +chrome_options = Options() +chrome_options.add_argument("--start-maximized") +chrome_options.add_argument("--disable-notifications") +chrome_options.add_argument("--disable-popup-blocking") +chrome_options.add_argument("--disable-extensions") + +driver = webdriver.Chrome(options=chrome_options) + +# Pozice a velikost okna (aby 
# (window position and size chosen so the browser does not cover PyCharm)
driver.set_window_position(380, 50)   # about 10 cm from the left screen edge
driver.set_window_size(1350, 1000)    # adjust to the monitor in use


# Open the main page first so the cookies are added for the right domain
driver.get("https://sktorrent.eu")

# Load cookies from the JSON file
if COOKIE_FILE.exists():
    with open(COOKIE_FILE, "r", encoding="utf-8") as f:
        cookies = json.load(f)
    for c in cookies:
        driver.add_cookie(c)
    print("🍪 Cookies loaded.")
else:
    print("⚠️ Cookie file not found, you may not be logged in!")


# ============================================================
# 3) Convert cookies → requests.Session (used to download .torrent files)
# ============================================================

requests_session = requests.Session()
for ck in driver.get_cookies():
    requests_session.cookies.set(ck["name"], ck["value"])

print("🔗 Requests session initialized with Selenium cookies.")


# ============================================================
# 4) Popup-closing helper
# ============================================================

def close_popup_if_any():
    """Try to close the interstitial ad by calling the page's interstitialBox.closeit()."""
    try:
        # The JS-side try/catch makes this a no-op when the page has no popup.
        driver.execute_script("try { interstitialBox.closeit(); } catch(e) {}")
        # Short pause so the DOM can settle
        time.sleep(0.5)
        print("🧹 Popup closed via JS fallback (if present).")
    except Exception as e:
        print("ℹ️ Popup JS handler not found:", e)


# ============================================================
# 5) Parsing of one table row (one torrent)
# ============================================================

# Pause (seconds) between consecutive .torrent downloads.
TORRENT_DOWNLOAD_DELAY = 3
# Give up on a .torrent download after this many seconds instead of hanging forever.
TORRENT_DOWNLOAD_TIMEOUT = 30

# Compiled once; parse_row runs for every row of every page.
_SIZE_RE = re.compile(r"Velkost ([0-9\.]+ ?[KMG]B)", re.IGNORECASE)
_ADDED_RE = re.compile(r"Pridany (.+?)(?:\sObrázok|$)", re.IGNORECASE)
_IMG_SRC_RE = re.compile(r"src=([^ ]+)")


def _parse_added_datetime(added_pretty):
    """Convert the listing's 'added' text into a MySQL DATETIME string.

    Accepts e.g. "29/11/2025 o 02:29", "29/11/2025 02:29:18" or "29/11/2025".
    Returns "YYYY-MM-DD HH:MM:SS", or None when the text is empty or is not
    a valid date in one of those shapes.
    """
    from datetime import datetime  # local import: only this helper needs it

    if not added_pretty:
        return None

    # Drop the filler word "o" that separates date and time on the site.
    tokens = [tok for tok in added_pretty.split() if tok != "o"]
    if not tokens:
        return None

    attempts = []
    if len(tokens) > 1:
        attempts.append(
            (" ".join(tokens[:2]), ("%d/%m/%Y %H:%M:%S", "%d/%m/%Y %H:%M"))
        )
    # Fall back to the date alone (time becomes 00:00:00).
    attempts.append((tokens[0], ("%d/%m/%Y",)))

    for text, formats in attempts:
        for fmt in formats:
            try:
                return datetime.strptime(text, fmt).strftime("%Y-%m-%d %H:%M:%S")
            except ValueError:
                continue
    return None


def _extract_preview_image(cell):
    """Return the preview image URL taken from the 'Obrázok' link, or None."""
    image_links = cell.find_elements(By.XPATH, ".//a[contains(text(),'Obrázok')]")
    if not image_links:
        return None

    mouseover = image_links[0].get_attribute("onmouseover")
    if not mouseover:
        return None

    img_match = _IMG_SRC_RE.search(mouseover)
    if not img_match:
        return None

    img_link = img_match.group(1).replace("'", "").strip()
    if img_link.startswith("//"):
        # Protocol-relative URL → make it absolute
        img_link = "https:" + img_link
    return img_link


def _parse_peer_cell(cell):
    """Return (count, link) from a seeders/leechers cell, or None if unreadable."""
    anchors = cell.find_elements(By.TAG_NAME, "a")
    if not anchors:
        return None
    try:
        count = int(anchors[0].text.strip())
    except ValueError:
        return None
    return count, anchors[0].get_attribute("href")


def parse_row(cells):
    """Parse one torrent row and, if needed, download its .torrent file.

    cells: the 7 <td> elements of the row:
        0: category
        1: download link (.torrent)
        2: title + size + date + 'Obrázok' + genre
        3: -- (ignored)
        4: seeders
        5: leechers
        6: completed

    Returns a dict whose keys match the named parameters of insert_sql, or
    None when the row lacks a download link, title link, torrent id or
    readable seeder/leecher counts (a warning is printed and the row is
    skipped). "torrent_content" is None when the file is already stored in
    the database or when the download failed.
    """
    # --- category ---
    category = cells[0].text.strip()

    # --- download link for the .torrent file (cells[1]) ---
    download_anchors = cells[1].find_elements(By.TAG_NAME, "a")
    download_link = (
        download_anchors[0].get_attribute("href") if download_anchors else None
    )
    if not download_link:
        print("⚠️ No download link in row, skipping.")
        return None

    dl_query = urlparse.parse_qs(urlparse.urlparse(download_link).query)
    torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]

    # --- title + details link (cells[2]) ---
    title_links = cells[2].find_elements(By.TAG_NAME, "a")
    if not title_links:
        print("⚠️ No title link — skipping row")
        return None

    a_tag = title_links[0]
    visible_name = a_tag.text.strip()
    full_title = a_tag.get_attribute("title")
    details_link = a_tag.get_attribute("href")

    if not details_link:
        print("⚠️ Row has no details link — skipping")
        return None

    # --- torrent hash = the ?id= parameter of the details link ---
    query = urlparse.parse_qs(urlparse.urlparse(details_link).query)
    if "id" not in query:
        print("⚠️ Skipping row with no torrent ID →", details_link)
        return None
    torrent_hash = query["id"][0]

    # --- size + date, read from the cell's whitespace-normalized text ---
    text_block = cells[2].get_attribute("innerText") or ""
    text_block_clean = " ".join(text_block.split())

    size_match = _SIZE_RE.search(text_block_clean)
    added_match = _ADDED_RE.search(text_block_clean)

    size_pretty = size_match.group(1) if size_match else None
    added_pretty = added_match.group(1) if added_match else None

    added_mysql = _parse_added_datetime(added_pretty)
    if added_pretty and added_mysql is None:
        print(f"⚠️ Unrecognized date '{added_pretty}' for {torrent_hash}, storing NULL")

    # --- image preview ---
    img_link = _extract_preview_image(cells[2])

    # --- seeders / leechers ---
    seeders = _parse_peer_cell(cells[4])
    leechers = _parse_peer_cell(cells[5])
    if seeders is None or leechers is None:
        print(f"⚠️ Could not read seeders/leechers for {torrent_hash}, skipping row")
        return None
    seeders_number, seeders_link = seeders
    leechers_number, leechers_link = leechers

    # --- do we already have torrent_content in the DB? ---
    cursor.execute(
        "SELECT torrent_content FROM torrents WHERE torrent_hash=%s",
        (torrent_hash,)
    )
    row = cursor.fetchone()
    already_have_torrent = row is not None and row[0] is not None

    # --- download the .torrent file, only if needed ---
    torrent_content = None

    if already_have_torrent:
        print(f"   ↪️ Torrent file already stored, skipping download ({torrent_filename})")
    else:
        time.sleep(TORRENT_DOWNLOAD_DELAY)  # pause between torrents
        try:
            resp = requests_session.get(download_link, timeout=TORRENT_DOWNLOAD_TIMEOUT)
            resp.raise_for_status()
            torrent_content = resp.content
        except requests.RequestException as e:
            print(f"⚠️ Could not download torrent file for {torrent_hash}: {e}")
            torrent_content = None

    # --- final dictionary ---
    return {
        "torrent_hash": torrent_hash,
        "details_link": details_link,
        "category": category,
        "title_visible": visible_name,
        "title_full": full_title,
        "size_pretty": size_pretty,
        "added_datetime": added_mysql,
        "preview_image": img_link,
        "seeders": seeders_number,
        "seeders_link": seeders_link,
        "leechers": leechers_number,
        "leechers_link": leechers_link,
        "torrent_filename": torrent_filename,
        # None when the file is already stored → the UPDATE keeps the
        # existing value (COALESCE in insert_sql)
        "torrent_content": torrent_content,
    }


# ============================================================
# 6) MySQL INSERT
# ============================================================

# Upsert keyed on the table's unique key; COALESCE keeps an already stored
# torrent_content when the new value is NULL.
insert_sql = """
INSERT INTO torrents (
    torrent_hash, details_link, category, title_visible, title_full,
    size_pretty, added_datetime, preview_image,
    seeders, seeders_link, leechers, leechers_link,
    torrent_filename, torrent_content
) VALUES (
    %(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
    %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
    %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
    %(torrent_filename)s, %(torrent_content)s
)
ON DUPLICATE KEY UPDATE
    details_link = VALUES(details_link),
    category = VALUES(category),
    title_visible = VALUES(title_visible),
    title_full = VALUES(title_full),
    size_pretty = VALUES(size_pretty),
    added_datetime = VALUES(added_datetime),
    preview_image = VALUES(preview_image),
    seeders = VALUES(seeders),
    seeders_link = VALUES(seeders_link),
    leechers = VALUES(leechers),
    leechers_link = VALUES(leechers_link),
    torrent_filename = VALUES(torrent_filename),
    torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
"""


# ============================================================
# 7) Processing of one listing page
#
============================================================ + +def process_current_page(page_index: int): + """ + Zpracuje aktuálně otevřenou stránku: + - najde všechny "REAL TORRENT ROWS" (7 td) + - pro každý torrent: + * parse_row + * insert/update do DB + """ + rows = driver.find_elements(By.CSS_SELECTOR, "table tr") + + real_rows = [] + for row in rows: + cells = row.find_elements(By.TAG_NAME, "td") + + # REAL TORRENT ROWS ALWAYS HAVE EXACTLY 7 TD CELLS + if len(cells) == 7: + real_rows.append(cells) + + print(f"📄 Page {page_index}: {len(real_rows)} torrent rows") + + for cells in real_rows: + data = parse_row(cells) + if not data: + continue + + print(f" 💾 [{page_index}] Saving:", data["title_visible"]) + cursor.execute(insert_sql, data) + + +# ============================================================ +# 8) Hlavní stránkovací cyklus +# ============================================================ + +current_url = START_URL +page_index = 0 + +while True: + print(f"\n🌐 Loading page {page_index}: {current_url}") + driver.get(current_url) + time.sleep(2) + + # zavři popup, pokud je + close_popup_if_any() + + # zpracuj aktuální stránku + process_current_page(page_index) + + # pokus se najít tlačítko "Dalsi >>" + try: + next_btn = driver.find_element( + By.XPATH, + "//a[b[contains(text(),'Dalsi')]]" + ) + next_url = next_btn.get_attribute("href") + + if not next_url: + print("⛔ Next link has no href, stopping.") + break + + # pokud je relativní, doplň doménu + if next_url.startswith("/"): + next_url = "https://sktorrent.eu" + next_url + + # když by náhodou bylo stejné URL → přeruš nekonečnou smyčku + if next_url == current_url: + print("⛔ Next URL equals current URL, stopping.") + break + + print("➡️ Next page:", next_url) + current_url = next_url + page_index += 1 + + # malá pauza mezi stránkami + time.sleep(1) + + except Exception: + print("✅ No 'Dalsi >>' link found, reached last page. 
Done.") + break + + +print("\n🎉 DONE — All pages processed, torrents saved & torrent files downloaded (without re-downloading existing ones).") +driver.quit() diff --git a/30 OpenTextListing v2.py b/30 OpenTextListing v2.py index e69de29..ab396fc 100644 --- a/30 OpenTextListing v2.py +++ b/30 OpenTextListing v2.py @@ -0,0 +1,256 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import pymysql +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.chrome.options import Options +import time +import re +import urllib.parse as urlparse +from pathlib import Path +import json + + +# ============================================================ +# 1) MySQL CONNECTION +# ============================================================ + +db = pymysql.connect( + host="192.168.1.76", + port=3307, + user="root", + password="Vlado9674+", + database="torrents", + charset="utf8mb4", + autocommit=True +) + +cursor = db.cursor() + + +# ============================================================ +# 2) Selenium setup +# ============================================================ + +COOKIE_FILE = Path("sktorrent_cookies.json") +URL = "https://sktorrent.eu/torrent/torrents.php?active=0&category=24&order=data&by=DESC&zaner=&jazyk=&page=0" + +chrome_options = Options() +chrome_options.add_argument("--start-maximized") +chrome_options.add_argument("--disable-notifications") +chrome_options.add_argument("--disable-popup-blocking") +chrome_options.add_argument("--disable-extensions") + +driver = webdriver.Chrome(options=chrome_options) + +driver.get("https://sktorrent.eu") + +# Load cookies +if COOKIE_FILE.exists(): + with open(COOKIE_FILE, "r") as f: + cookies = json.load(f) + for c in cookies: + driver.add_cookie(c) + print("🍪 Cookies loaded.") + +driver.get(URL) +time.sleep(2) + +# ============================================================ +# Close interstitial popup reliably +# 
============================================================ + +time.sleep(1) + +try: + # JS close always exists even when HTML structure varies + driver.execute_script("try { interstitialBox.closeit(); } catch(e) {}") + print("🧹 Popup closed via JS fallback.") + time.sleep(1) +except: + print("ℹ️ Popup JS handler not found (probably no popup).") + + + +# ============================================================ +# 3) Extract table rows +# ============================================================ + +rows = driver.find_elements(By.CSS_SELECTOR, "table tr") +print("Total rows found:", len(rows)) + +real_rows = [] +for row in rows: + cells = row.find_elements(By.TAG_NAME, "td") + + # REAL TORRENT ROWS ALWAYS HAVE EXACTLY 7 TD CELLS + if len(cells) == 7: + real_rows.append(cells) + +print("Real torrent rows:", len(real_rows)) +print("") + + +# ============================================================ +# 4) Function to extract fields from one row +# ============================================================ + +def parse_row(cells): + + # -------------------------- + # 1️⃣ CATEGORY (cells[0]) + # -------------------------- + category = cells[0].text.strip() + + # -------------------------- + # 2️⃣ TITLE + DETAILS LINK (always inside cells[2]) + # -------------------------- + title_links = cells[2].find_elements(By.TAG_NAME, "a") + if not title_links: + print("⚠️ Missing title link — skipping row") + return None + + a_tag = title_links[0] + + visible_name = a_tag.text.strip() + full_title = a_tag.get_attribute("title") + details_link = a_tag.get_attribute("href") + + if not details_link: + print("⚠️ Row has no details link — skipping") + return None + + # -------------------------- + # 3️⃣ TORRENT HASH + # -------------------------- + parsed = urlparse.urlparse(details_link) + query = urlparse.parse_qs(parsed.query) + + if "id" not in query: + print("⚠️ Skipping row with no torrent ID →", details_link) + return None + + torrent_hash = query["id"][0] + + # 
-------------------------- + # 4️⃣ TEXT BLOCK (size + date) + # -------------------------- + text_block = cells[2].get_attribute("innerText") + text_block_clean = " ".join(text_block.split()) + + size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE) + added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE) + + size_pretty = size_match.group(1) if size_match else None + added_pretty = added_match.group(1) if added_match else None + + # Convert “18/11/2025 o 07:00” → “2025-11-18 07:00:00” + added_mysql = None + if added_pretty: + # Normalize formats like "29/11/2025 o 02:29", "29/11/2025 02:29:18" + clean = added_pretty.replace(" o ", " ").strip() + + # Split date and time + date_part, *time_parts = clean.split(" ") + + # If seconds are missing, add :00 + time_part = time_parts[0] if time_parts else "00:00" + if len(time_part.split(":")) == 2: + time_part += ":00" + + day, month, year = date_part.split("/") + + added_mysql = f"{year}-{month}-{day} {time_part}" + + # -------------------------- + # 5️⃣ IMAGE PREVIEW + # -------------------------- + img_link = None + try: + image_a = cells[2].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]") + mouseover = image_a.get_attribute("onmouseover") + img_match = re.search(r"src=([^ ]+)", mouseover) + if img_match: + img_link = img_match.group(1).replace("'", "").strip() + if img_link.startswith("//"): + img_link = "https:" + img_link + except: + pass + + # -------------------------- + # 6️⃣ SEEDERS (cells[4]) + # -------------------------- + seeders_a = cells[4].find_element(By.TAG_NAME, "a") + seeders_number = int(seeders_a.text.strip()) + seeders_link = seeders_a.get_attribute("href") + + # -------------------------- + # 7️⃣ LEECHERS (cells[5]) + # -------------------------- + leechers_a = cells[5].find_element(By.TAG_NAME, "a") + leechers_number = int(leechers_a.text.strip()) + leechers_link = leechers_a.get_attribute("href") + + # 
-------------------------- + # Return result + # -------------------------- + return { + "torrent_hash": torrent_hash, + "details_link": details_link, + "category": category, + "title_visible": visible_name, + "title_full": full_title, + "size_pretty": size_pretty, + "added_datetime": added_mysql, + "preview_image": img_link, + "seeders": seeders_number, + "seeders_link": seeders_link, + "leechers": leechers_number, + "leechers_link": leechers_link, + } + + +# ============================================================ +# 5) MySQL INSERT +# ============================================================ + +insert_sql = """ +INSERT INTO torrents ( + torrent_hash, details_link, category, title_visible, title_full, + size_pretty, added_datetime, preview_image, + seeders, seeders_link, leechers, leechers_link +) VALUES ( + %(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s, + %(size_pretty)s, %(added_datetime)s, %(preview_image)s, + %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s +) +ON DUPLICATE KEY UPDATE + details_link = VALUES(details_link), + category = VALUES(category), + title_visible = VALUES(title_visible), + title_full = VALUES(title_full), + size_pretty = VALUES(size_pretty), + added_datetime = VALUES(added_datetime), + preview_image = VALUES(preview_image), + seeders = VALUES(seeders), + seeders_link = VALUES(seeders_link), + leechers = VALUES(leechers), + leechers_link = VALUES(leechers_link); +""" + + +# ============================================================ +# 6) PROCESS ALL ROWS +# ============================================================ + +for cells in real_rows: + data = parse_row(cells) + if not data: + continue + + print("💾 Saving:", data["title_visible"]) + cursor.execute(insert_sql, data) + +print("\n✅ DONE — All torrents saved to MySQL.") +driver.quit() diff --git a/30 OpenTextListing v3.py b/30 OpenTextListing v3.py index e69de29..85f1e4c 100644 --- a/30 OpenTextListing v3.py +++ b/30 
OpenTextListing v3.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import pymysql +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.chrome.options import Options +import time +import re +import urllib.parse as urlparse +from pathlib import Path +import json +import requests + + +# ============================================================ +# 1) MySQL CONNECTION +# ============================================================ + +db = pymysql.connect( + host="192.168.1.76", + port=3307, + user="root", + password="Vlado9674+", + database="torrents", + charset="utf8mb4", + autocommit=True +) + +cursor = db.cursor() + + +# ============================================================ +# 2) Selenium setup +# ============================================================ + +COOKIE_FILE = Path("sktorrent_cookies.json") +URL = "https://sktorrent.eu/torrent/torrents.php?active=0" + +chrome_options = Options() +chrome_options.add_argument("--start-maximized") +chrome_options.add_argument("--disable-notifications") +chrome_options.add_argument("--disable-popup-blocking") +chrome_options.add_argument("--disable-extensions") + +driver = webdriver.Chrome(options=chrome_options) + +driver.get("https://sktorrent.eu") + +# Load cookies +session_cookies = [] +if COOKIE_FILE.exists(): + with open(COOKIE_FILE, "r") as f: + cookies = json.load(f) + for c in cookies: + driver.add_cookie(c) + session_cookies.append({c['name']: c['value']}) + print("🍪 Cookies loaded.") + +driver.get(URL) +time.sleep(2) + + +# ============================================================ +# 3) Close interstitial popup robustly +# ============================================================ + +try: + driver.execute_script("try { interstitialBox.closeit(); } catch(e) {}") + print("🧹 Popup closed via JS fallback.") + time.sleep(1) +except: + print("ℹ️ No popup found.") + + +# ============================================================ 
+# Convert Selenium cookies → Python requests cookies +# ============================================================ + +requests_session = requests.Session() +for ck in driver.get_cookies(): + requests_session.cookies.set(ck["name"], ck["value"]) + + +# ============================================================ +# 4) Extract table rows +# ============================================================ + +rows = driver.find_elements(By.CSS_SELECTOR, "table tr") +print("Total rows found:", len(rows)) + +real_rows = [] +for row in rows: + cells = row.find_elements(By.TAG_NAME, "td") + + # REAL TORRENT ROWS ALWAYS HAVE EXACTLY 7 TD CELLS + if len(cells) == 7: + real_rows.append(cells) + +print("Real torrent rows:", len(real_rows)) +print("") + + +# ============================================================ +# 5) Function to extract fields from one row +# ============================================================ + +def parse_row(cells): + + # -------------------------- + # 1️⃣ CATEGORY + # -------------------------- + category = cells[0].text.strip() + + # -------------------------- + # 2️⃣ DOWNLOAD LINK FOR TORRENT FILE + # -------------------------- + try: + download_a = cells[1].find_element(By.TAG_NAME, "a") + download_link = download_a.get_attribute("href") + except: + print("⚠️ No download link in row, skipping.") + return None + + parsed_dl = urlparse.urlparse(download_link) + dl_query = urlparse.parse_qs(parsed_dl.query) + + torrent_filename = dl_query.get("f", ["unknown.torrent"])[0] + + # -------------------------- + # 3️⃣ Title + details link (in cell[2]) + # -------------------------- + title_links = cells[2].find_elements(By.TAG_NAME, "a") + if not title_links: + print("⚠️ No title link — skipping row") + return None + + a_tag = title_links[0] + + visible_name = a_tag.text.strip() + full_title = a_tag.get_attribute("title") + details_link = a_tag.get_attribute("href") + + if not details_link: + print("⚠️ Row has no details link — skipping") + return 
None + + # -------------------------- + # Extract torrent hash from ?id= + # -------------------------- + parsed = urlparse.urlparse(details_link) + query = urlparse.parse_qs(parsed.query) + + if "id" not in query: + print("⚠️ Skipping row with no torrent ID →", details_link) + return None + + torrent_hash = query["id"][0] + + # -------------------------- + # 4️⃣ Size + date parsing + # -------------------------- + text_block = cells[2].get_attribute("innerText") + text_block_clean = " ".join(text_block.split()) + + size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE) + added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE) + + size_pretty = size_match.group(1) if size_match else None + added_pretty = added_match.group(1) if added_match else None + + # Robust time normalization + added_mysql = None + if added_pretty: + + clean = added_pretty.replace(" o ", " ").strip() + parts = clean.split(" ") + + date_part = parts[0] + time_part = parts[1] if len(parts) > 1 else "00:00:00" + + # add seconds if missing + if len(time_part.split(":")) == 2: + time_part += ":00" + + day, month, year = date_part.split("/") + added_mysql = f"{year}-{month}-{day} {time_part}" + + # -------------------------- + # 5️⃣ Image preview + # -------------------------- + img_link = None + try: + image_a = cells[2].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]") + mouseover = image_a.get_attribute("onmouseover") + img_match = re.search(r"src=([^ ]+)", mouseover) + if img_match: + img_link = img_match.group(1).replace("'", "").strip() + if img_link.startswith("//"): + img_link = "https:" + img_link + except: + pass + + # -------------------------- + # 6️⃣ SEEDERS / LEECHERS + # -------------------------- + seeders_a = cells[4].find_element(By.TAG_NAME, "a") + seeders_number = int(seeders_a.text.strip()) + seeders_link = seeders_a.get_attribute("href") + + leechers_a = cells[5].find_element(By.TAG_NAME, "a") + 
leechers_number = int(leechers_a.text.strip()) + leechers_link = leechers_a.get_attribute("href") + + # -------------------------- + # 7️⃣ DOWNLOAD TORRENT CONTENT (.torrent) + # -------------------------- + try: + torrent_content = requests_session.get(download_link).content + except Exception as e: + print(f"⚠️ Could not download torrent file for {torrent_hash}: {e}") + torrent_content = None + + # -------------------------- + # FINAL DICTIONARY + # -------------------------- + return { + "torrent_hash": torrent_hash, + "details_link": details_link, + "category": category, + "title_visible": visible_name, + "title_full": full_title, + "size_pretty": size_pretty, + "added_datetime": added_mysql, + "preview_image": img_link, + "seeders": seeders_number, + "seeders_link": seeders_link, + "leechers": leechers_number, + "leechers_link": leechers_link, + "torrent_filename": torrent_filename, + "torrent_content": torrent_content, + } + + +# ============================================================ +# 6) MySQL INSERT +# ============================================================ + +insert_sql = """ +INSERT INTO torrents ( + torrent_hash, details_link, category, title_visible, title_full, + size_pretty, added_datetime, preview_image, + seeders, seeders_link, leechers, leechers_link, + torrent_filename, torrent_content +) VALUES ( + %(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s, + %(size_pretty)s, %(added_datetime)s, %(preview_image)s, + %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s, + %(torrent_filename)s, %(torrent_content)s +) +ON DUPLICATE KEY UPDATE + details_link = VALUES(details_link), + category = VALUES(category), + title_visible = VALUES(title_visible), + title_full = VALUES(title_full), + size_pretty = VALUES(size_pretty), + added_datetime = VALUES(added_datetime), + preview_image = VALUES(preview_image), + seeders = VALUES(seeders), + seeders_link = VALUES(seeders_link), + leechers = VALUES(leechers), + 
leechers_link = VALUES(leechers_link), + torrent_filename = VALUES(torrent_filename), + torrent_content = VALUES(torrent_content); +""" + + +# ============================================================ +# 7) PROCESS ALL ROWS +# ============================================================ + +for cells in real_rows: + data = parse_row(cells) + if not data: + continue + + print("💾 Saving:", data["title_visible"]) + cursor.execute(insert_sql, data) + +print("\n✅ DONE — All torrents saved to MySQL & torrent files downloaded.") +driver.quit() diff --git a/30 OpenTextListing v4.py b/30 OpenTextListing v4.py index e69de29..cc59520 100644 --- a/30 OpenTextListing v4.py +++ b/30 OpenTextListing v4.py @@ -0,0 +1,375 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import pymysql +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.chrome.options import Options +import time +import re +import urllib.parse as urlparse +from pathlib import Path +import json +import requests + + +# ============================================================ +# 1) MySQL CONNECTION +# ============================================================ + +db = pymysql.connect( + host="192.168.1.76", + port=3307, + user="root", + password="Vlado9674+", + database="torrents", + charset="utf8mb4", + autocommit=True +) + +cursor = db.cursor() + + +# ============================================================ +# 2) Selenium setup +# ============================================================ + +COOKIE_FILE = Path("sktorrent_cookies.json") + +# Start URL pro kategorii 24, seřazeno podle data DESC +START_URL = ( + "https://sktorrent.eu/torrent/torrents.php" + "?active=0&category=24&order=data&by=DESC&zaner=&jazyk=&page=0" +) + +chrome_options = Options() +chrome_options.add_argument("--start-maximized") +chrome_options.add_argument("--disable-notifications") +chrome_options.add_argument("--disable-popup-blocking") 
chrome_options.add_argument("--disable-extensions")

driver = webdriver.Chrome(options=chrome_options)

# Window position and size (so the browser does not cover PyCharm).
driver.set_window_position(380, 50)   # about 10 cm from the left edge
driver.set_window_size(1350, 1000)    # adjust to your monitor


# Open the main page first so the browser is on the right domain for cookies.
driver.get("https://sktorrent.eu")

# Load cookies from the JSON file.
if COOKIE_FILE.exists():
    with open(COOKIE_FILE, "r", encoding="utf-8") as f:
        cookies = json.load(f)
    for c in cookies:
        driver.add_cookie(c)
    print("🍪 Cookies loaded.")
else:
    print("⚠️ Cookie file not found, you may not be logged in!")


# ============================================================
# 3) Copy cookies into a requests.Session (used to download .torrent files)
# ============================================================

# Seconds to wait for the server before a .torrent download is abandoned.
# Without a timeout a single stalled connection would block the crawl forever.
REQUEST_TIMEOUT_S = 30

# Pause (seconds) before each .torrent download, to space out requests.
DOWNLOAD_DELAY_S = 3

requests_session = requests.Session()
for ck in driver.get_cookies():
    requests_session.cookies.set(ck["name"], ck["value"])

print("🔗 Requests session initialized with Selenium cookies.")


# ============================================================
# 4) Popup handling
# ============================================================

def close_popup_if_any():
    """Try to close the interstitial ad by calling the page's
    ``interstitialBox.closeit()`` JavaScript function.

    The JavaScript itself ignores a missing ``interstitialBox``; any error
    raised by the driver is reported and otherwise ignored.
    """
    try:
        driver.execute_script("try { interstitialBox.closeit(); } catch(e) {}")
        # Short pause so the DOM can settle.
        time.sleep(0.5)
        print("🧹 Popup closed via JS fallback (if present).")
    except Exception as e:
        print("ℹ️ Popup JS handler not found:", e)


# ============================================================
# 5) Parsing of a single table row (one torrent)
# ============================================================

# Patterns applied to the whitespace-normalised text of the title cell.
_SIZE_RE = re.compile(r"Velkost ([0-9\.]+ ?[KMG]B)", re.IGNORECASE)
_ADDED_RE = re.compile(r"Pridany (.+?)(?:\sObrázok|$)", re.IGNORECASE)
# Pattern applied to the "onmouseover" attribute of the preview link.
_IMG_SRC_RE = re.compile(r"src=([^ ]+)")


def _to_mysql_datetime(added_pretty):
    """Convert text such as "29/11/2025 o 02:29" to "2025-11-29 02:29:00".

    Returns None when the text is empty or is not a valid
    ``DD/MM/YYYY [HH:MM[:SS]]`` stamp, so an unparsable value is stored as
    NULL instead of raising or being sent to MySQL verbatim.
    """
    from datetime import datetime

    if not added_pretty:
        return None

    # "29/11/2025 o 02:29" -> ["29/11/2025", "02:29"]
    parts = added_pretty.replace(" o ", " ").split()
    if not parts:
        return None

    date_part = parts[0]
    time_part = parts[1] if len(parts) > 1 else "00:00:00"

    # Add the seconds when only HH:MM is present.
    if len(time_part.split(":")) == 2:
        time_part += ":00"

    try:
        stamp = datetime.strptime(f"{date_part} {time_part}", "%d/%m/%Y %H:%M:%S")
    except ValueError:
        return None
    return stamp.strftime("%Y-%m-%d %H:%M:%S")


def _extract_preview_image(cell):
    """Return the preview image URL referenced by the 'Obrázok' link, or None.

    The URL is read from the link's ``onmouseover`` attribute. The preview is
    optional, so a failure here is reported and the row is still processed.
    """
    try:
        image_links = cell.find_elements(
            By.XPATH,
            ".//a[contains(text(),'Obrázok')]"
        )
        if not image_links:
            return None

        mouseover = image_links[0].get_attribute("onmouseover")
        if not mouseover:
            return None
    except Exception as e:
        print("⚠️ Could not read preview image link:", e)
        return None

    img_match = _IMG_SRC_RE.search(mouseover)
    if not img_match:
        return None

    img_link = img_match.group(1).replace("'", "").strip()
    # Protocol-relative URL -> absolute https URL.
    if img_link.startswith("//"):
        img_link = "https:" + img_link
    return img_link


def _read_peer_cell(cell):
    """Return ``(count, href)`` from the first link inside a peers cell.

    Raises whatever the driver raises when the cell has no link, and
    ValueError when the link text is not an integer.
    """
    anchor = cell.find_element(By.TAG_NAME, "a")
    return int(anchor.text.strip()), anchor.get_attribute("href")


def _download_torrent(download_link, torrent_hash):
    """Download the .torrent file and return its bytes, or None on failure.

    Sleeps DOWNLOAD_DELAY_S seconds first (pause between torrents). The
    request uses the shared ``requests_session`` and is bounded by
    REQUEST_TIMEOUT_S.
    """
    time.sleep(DOWNLOAD_DELAY_S)  # pause between torrents
    try:
        resp = requests_session.get(download_link, timeout=REQUEST_TIMEOUT_S)
        resp.raise_for_status()
        return resp.content
    except requests.RequestException as e:
        print(f"⚠️ Could not download torrent file for {torrent_hash}: {e}")
        return None


def parse_row(cells):
    """Parse one torrent row and download its .torrent file.

    cells: list of the row's 7 <td> elements
    Structure:
        0: category
        1: download link (.torrent)
        2: title + size + date + 'Obrázok' + genre
        3: -- (ignored)
        4: seeders
        5: leechers
        6: completed

    Returns a dict whose keys match the named placeholders of ``insert_sql``,
    or None when the row lacks a download link, a title link, a details link
    or an ``id`` query parameter. Errors while reading the seeders/leechers
    cells propagate to the caller.
    """

    # --------------------------
    # 1) CATEGORY
    # --------------------------
    category = cells[0].text.strip()

    # --------------------------
    # 2) DOWNLOAD LINK FOR TORRENT FILE (cells[1])
    # --------------------------
    # find_elements returns an empty list instead of raising, so a missing
    # link needs no exception handling.
    download_anchors = cells[1].find_elements(By.TAG_NAME, "a")
    download_link = (
        download_anchors[0].get_attribute("href") if download_anchors else None
    )
    if not download_link:
        print("⚠️ No download link in row, skipping.")
        return None

    parsed_dl = urlparse.urlparse(download_link)
    dl_query = urlparse.parse_qs(parsed_dl.query)

    # The file name is carried in the "f" query parameter of the link.
    torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]

    # --------------------------
    # 3) TITLE + DETAILS LINK (in cells[2])
    # --------------------------
    title_links = cells[2].find_elements(By.TAG_NAME, "a")
    if not title_links:
        print("⚠️ No title link — skipping row")
        return None

    a_tag = title_links[0]

    visible_name = a_tag.text.strip()
    full_title = a_tag.get_attribute("title")
    details_link = a_tag.get_attribute("href")

    if not details_link:
        print("⚠️ Row has no details link — skipping")
        return None

    # --------------------------
    # Extract torrent hash from ?id=
    # --------------------------
    parsed = urlparse.urlparse(details_link)
    query = urlparse.parse_qs(parsed.query)

    if "id" not in query:
        print("⚠️ Skipping row with no torrent ID →", details_link)
        return None

    torrent_hash = query["id"][0]

    # --------------------------
    # 4) Size + date parsing
    # --------------------------
    text_block = cells[2].get_attribute("innerText") or ""
    # Collapse all whitespace runs so the patterns can rely on single spaces.
    text_block_clean = " ".join(text_block.split())

    size_match = _SIZE_RE.search(text_block_clean)
    added_match = _ADDED_RE.search(text_block_clean)

    size_pretty = size_match.group(1) if size_match else None
    added_pretty = added_match.group(1) if added_match else None
    added_mysql = _to_mysql_datetime(added_pretty)

    # --------------------------
    # 5) Image preview
    # --------------------------
    img_link = _extract_preview_image(cells[2])

    # --------------------------
    # 6) SEEDERS / LEECHERS
    # --------------------------
    seeders_number, seeders_link = _read_peer_cell(cells[4])
    leechers_number, leechers_link = _read_peer_cell(cells[5])

    # --------------------------
    # 7) DOWNLOAD TORRENT CONTENT (.torrent)
    # --------------------------
    torrent_content = _download_torrent(download_link, torrent_hash)

    # --------------------------
    # FINAL DICTIONARY
    # --------------------------
    return {
        "torrent_hash": torrent_hash,
        "details_link": details_link,
        "category": category,
        "title_visible": visible_name,
        "title_full": full_title,
        "size_pretty": size_pretty,
        "added_datetime": added_mysql,
        "preview_image": img_link,
        "seeders": seeders_number,
        "seeders_link": seeders_link,
        "leechers": leechers_number,
        "leechers_link": leechers_link,
        "torrent_filename": torrent_filename,
        "torrent_content": torrent_content,
    }


# ============================================================
# 6) MySQL INSERT
# ============================================================

# Upsert: a row whose key already exists has all its other columns refreshed.
insert_sql = """
INSERT INTO torrents (
    torrent_hash, details_link, category, title_visible, title_full,
    size_pretty, added_datetime, preview_image,
    seeders, seeders_link, leechers, leechers_link,
    torrent_filename, torrent_content
) VALUES (
    %(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
    %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
    %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
    %(torrent_filename)s, %(torrent_content)s
)
ON DUPLICATE KEY UPDATE
    details_link = VALUES(details_link),
    category = VALUES(category),
    title_visible = VALUES(title_visible),
    title_full = VALUES(title_full),
    size_pretty = VALUES(size_pretty),
    added_datetime = VALUES(added_datetime),
    preview_image = VALUES(preview_image),
    seeders = VALUES(seeders),
    seeders_link = VALUES(seeders_link),
    leechers = VALUES(leechers),
    leechers_link = VALUES(leechers_link),
    torrent_filename = VALUES(torrent_filename),
    torrent_content = VALUES(torrent_content);
"""


# ============================================================
# 7) Processing of one listing page
# ============================================================

def process_current_page(page_index: int):
    """Process the page currently open in the browser.

    - collects every table row that has exactly 7 <td> cells
    - for each such row:
        * parse_row
        * insert/update in the DB

    page_index is used only in the printed progress messages.

    A row that raises while being parsed is reported and skipped so one
    malformed row does not abort the crawl. Database errors are not caught.
    """
    rows = driver.find_elements(By.CSS_SELECTOR, "table tr")

    # Rows with exactly 7 <td> cells are treated as torrent rows.
    cell_lists = (row.find_elements(By.TAG_NAME, "td") for row in rows)
    real_rows = [cells for cells in cell_lists if len(cells) == 7]

    print(f"📄 Page {page_index}: {len(real_rows)} torrent rows")

    for cells in real_rows:
        try:
            data = parse_row(cells)
        except Exception as e:
            print(f"⚠️ [{page_index}] Could not parse row, skipping: {e}")
            continue

        if not data:
            continue

        print(f" 💾 [{page_index}] Saving:", data["title_visible"])
        cursor.execute(insert_sql, data)


# ============================================================
# 8) Main pagination loop
# ============================================================

current_url = START_URL
page_index = 0

try:
    while True:
        print(f"\n🌐 Loading page {page_index}: {current_url}")
        driver.get(current_url)
        time.sleep(2)

        # Close the popup, if there is one.
        close_popup_if_any()

        # Process the current page.
        process_current_page(page_index)

        # Look for the "Dalsi >>" (next) link. find_elements returns an empty
        # list when it is absent, so only a genuinely missing link ends the
        # crawl with the "last page" message.
        next_links = driver.find_elements(
            By.XPATH,
            "//a[b[contains(text(),'Dalsi')]]"
        )
        if not next_links:
            print("✅ No 'Dalsi >>' link found, reached last page. Done.")
            break

        next_url = next_links[0].get_attribute("href")

        if not next_url:
            print("⛔ Next link has no href, stopping.")
            break

        # Resolve a relative link against the current page; an absolute URL
        # is returned unchanged.
        next_url = urlparse.urljoin(current_url, next_url)

        # Identical URL would mean an endless loop, so stop.
        if next_url == current_url:
            print("⛔ Next URL equals current URL, stopping.")
            break

        print("➡️ Next page:", next_url)
        current_url = next_url
        page_index += 1

        # Short pause between pages.
        time.sleep(1)

    print("\n🎉 DONE — All pages processed, torrents saved & torrent files downloaded.")
finally:
    # Always release the browser and the database connection, including when
    # the crawl is interrupted or fails part-way through.
    try:
        driver.quit()
    finally:
        cursor.close()
        db.close()