Files
torrents/30 OpenTextListing v4.py
2025-12-15 06:11:53 +01:00

376 lines
11 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import os
import re
import time
import urllib.parse as urlparse
from datetime import datetime
from pathlib import Path

import pymysql
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# ============================================================
# 1) MySQL CONNECTION
# ============================================================
# Connection settings can be overridden through environment variables; the
# fallbacks are the values this script has always used, so existing setups
# keep working unchanged.
# SECURITY(review): the fallback still embeds the root password in source.
# Set TORRENTS_DB_PASSWORD in the environment and remove the default once
# every deployment provides it.
db = pymysql.connect(
    host=os.environ.get("TORRENTS_DB_HOST", "192.168.1.76"),
    port=int(os.environ.get("TORRENTS_DB_PORT", "3307")),
    user=os.environ.get("TORRENTS_DB_USER", "root"),
    password=os.environ.get("TORRENTS_DB_PASSWORD", "Vlado9674+"),
    database=os.environ.get("TORRENTS_DB_NAME", "torrents"),
    charset="utf8mb4",
    # Every statement is committed immediately; no explicit commit() calls
    # are made anywhere in this script.
    autocommit=True,
)
# Single shared cursor used for all inserts.
cursor = db.cursor()
# ============================================================
# 2) Selenium setup
# ============================================================
# JSON file holding the cookies of a logged-in browser session.
COOKIE_FILE = Path("sktorrent_cookies.json")

# Start URL for category 24, ordered by date, descending.
START_URL = (
    "https://sktorrent.eu/torrent/torrents.php"
    "?active=0&category=24&order=data&by=DESC&zaner=&jazyk=&page=0"
)

chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--disable-extensions")

driver = webdriver.Chrome(options=chrome_options)

# Window position and size (so the browser does not cover PyCharm).
driver.set_window_position(380, 50)  # about 10 cm from the left edge
driver.set_window_size(1350, 1000)  # adjust to your monitor

# Open the main page first so the browser is on the cookie domain before
# any cookie is added.
driver.get("https://sktorrent.eu")

# Load cookies from JSON. The encoding is explicit so the result does not
# depend on the platform's locale.
if COOKIE_FILE.exists():
    with open(COOKIE_FILE, "r", encoding="utf-8") as f:
        cookies = json.load(f)
    for c in cookies:
        driver.add_cookie(c)
    print("🍪 Cookies loaded.")
else:
    print("⚠️ Cookie file not found, you may not be logged in!")
# ============================================================
# 3) Převod cookies → requests.Session (pro stahování .torrent)
# ============================================================
# Mirror the browser's cookies into a plain HTTP session; that session is
# what later fetches the .torrent files.
requests_session = requests.Session()
for selenium_cookie in driver.get_cookies():
    cookie_name = selenium_cookie["name"]
    cookie_value = selenium_cookie["value"]
    requests_session.cookies.set(cookie_name, cookie_value)
print("🔗 Requests session initialized with Selenium cookies.")
# ============================================================
# 4) Funkce pro zavření popupu
# ============================================================
def close_popup_if_any():
    """Try to dismiss the interstitial ad by calling interstitialBox.closeit() in the page.

    The JavaScript swallows its own errors, so a page without the popup is
    not treated as a failure. Any Python-side exception is reported on
    stdout and otherwise ignored.
    """
    close_script = "try { interstitialBox.closeit(); } catch(e) {}"
    try:
        driver.execute_script(close_script)
        # Short pause to let the DOM settle after the popup is removed.
        time.sleep(0.5)
        print("🧹 Popup closed via JS fallback (if present).")
    except Exception as exc:
        print(" Popup JS handler not found:", exc)
# ============================================================
# 5) Funkce pro parsování jednoho řádku (jednoho torrentu)
# ============================================================
# Patterns applied to the whitespace-normalised text of the title cell.
_SIZE_RE = re.compile(r"Velkost ([0-9\.]+ ?[KMGT]B)", re.IGNORECASE)
_ADDED_RE = re.compile(r"Pridany (.+?)(?:\sObrázok|$)", re.IGNORECASE)
# Pulls the image URL out of the 'onmouseover' attribute of the preview link.
_IMG_SRC_RE = re.compile(r"src=([^ ]+)")

# Pause before each .torrent download, and the longest time one download may
# take before it is abandoned (seconds).
_DOWNLOAD_DELAY_S = 3
_DOWNLOAD_TIMEOUT_S = 30


def _to_mysql_datetime(added_pretty):
    """Convert text like '29/11/2025 o 02:29' to '2025-11-29 02:29:00'.

    Returns None when the text is missing or not in the expected
    DD/MM/YYYY [HH:MM[:SS]] shape, so a single odd row cannot abort the run.
    """
    if not added_pretty:
        return None
    # "29/11/2025 o 02:29" -> ["29/11/2025", "02:29"]
    parts = added_pretty.replace(" o ", " ").split()
    if not parts:
        return None
    date_part = parts[0]
    time_part = parts[1] if len(parts) > 1 else "00:00:00"
    # Add the seconds when only HH:MM is present.
    if time_part.count(":") == 1:
        time_part += ":00"
    try:
        parsed = datetime.strptime(f"{date_part} {time_part}", "%d/%m/%Y %H:%M:%S")
    except ValueError:
        print("⚠️ Unrecognised date format →", added_pretty)
        return None
    return parsed.strftime("%Y-%m-%d %H:%M:%S")


def _extract_preview_image(title_cell):
    """Return the preview image URL from the 'Obrázok' link, or None if absent."""
    try:
        image_links = title_cell.find_elements(
            By.XPATH,
            ".//a[contains(text(),'Obrázok')]"
        )
        if not image_links:
            return None
        mouseover = image_links[0].get_attribute("onmouseover")
    except Exception:
        # The preview is optional: a WebDriver error here must not lose the row.
        return None
    img_match = _IMG_SRC_RE.search(mouseover or "")
    if not img_match:
        return None
    img_link = img_match.group(1).replace("'", "").strip()
    # Protocol-relative URL -> absolute.
    if img_link.startswith("//"):
        img_link = "https:" + img_link
    return img_link


def _read_peer_cell(cell):
    """Return (count, link) read from the first <a> of a seeders/leechers cell.

    Raises ValueError when the cell has no link or its text is not an integer.
    """
    links = cell.find_elements(By.TAG_NAME, "a")
    if not links:
        raise ValueError("cell contains no link")
    return int(links[0].text.strip()), links[0].get_attribute("href")


def parse_row(cells):
    """Parse one torrent listing row and download its .torrent file.

    Args:
        cells: list of 7 WebElements (the row's <td> cells), laid out as:
            0: category
            1: download link (.torrent)
            2: title + size + date + 'Obrázok' link + genre
            3: -- (ignored)
            4: seeders
            5: leechers
            6: completed (not read)

    Returns:
        A dict whose keys match the placeholders of ``insert_sql``, or None
        when the row is skipped (no download link, no title/details link,
        no ``id`` in the details URL, or unreadable seeders/leechers).
        ``torrent_content`` is None when the download fails.

    Side effects:
        Sleeps before the download, performs one HTTP GET through
        ``requests_session`` and prints progress/warnings to stdout.
    """
    # --------------------------
    # 1) Category
    # --------------------------
    category = cells[0].text.strip()

    # --------------------------
    # 2) Download link for the .torrent file (cells[1])
    # --------------------------
    try:
        download_anchors = cells[1].find_elements(By.TAG_NAME, "a")
        download_link = (
            download_anchors[0].get_attribute("href") if download_anchors else None
        )
    except Exception:
        download_link = None
    # An anchor without href is as useless as no anchor at all.
    if not download_link:
        print("⚠️ No download link in row, skipping.")
        return None

    dl_query = urlparse.parse_qs(urlparse.urlparse(download_link).query)
    torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]

    # --------------------------
    # 3) Title + details link (cells[2])
    # --------------------------
    title_links = cells[2].find_elements(By.TAG_NAME, "a")
    if not title_links:
        print("⚠️ No title link — skipping row")
        return None
    a_tag = title_links[0]
    visible_name = a_tag.text.strip()
    full_title = a_tag.get_attribute("title")
    details_link = a_tag.get_attribute("href")
    if not details_link:
        print("⚠️ Row has no details link — skipping")
        return None

    # The value stored as torrent_hash is the ?id= parameter of the details URL.
    query = urlparse.parse_qs(urlparse.urlparse(details_link).query)
    if "id" not in query:
        print("⚠️ Skipping row with no torrent ID →", details_link)
        return None
    torrent_hash = query["id"][0]

    # --------------------------
    # 4) Size + date
    # --------------------------
    text_block = cells[2].get_attribute("innerText") or ""
    # Collapse all whitespace so the patterns can rely on single spaces.
    text_block_clean = " ".join(text_block.split())
    size_match = _SIZE_RE.search(text_block_clean)
    added_match = _ADDED_RE.search(text_block_clean)
    size_pretty = size_match.group(1) if size_match else None
    added_pretty = added_match.group(1) if added_match else None
    added_mysql = _to_mysql_datetime(added_pretty)

    # --------------------------
    # 5) Image preview
    # --------------------------
    img_link = _extract_preview_image(cells[2])

    # --------------------------
    # 6) Seeders / leechers
    # --------------------------
    try:
        seeders_number, seeders_link = _read_peer_cell(cells[4])
        leechers_number, leechers_link = _read_peer_cell(cells[5])
    except ValueError as exc:
        # Treated like the other malformed-row cases: skip rather than abort.
        print(f"⚠️ Skipping row {torrent_hash}: cannot read seeders/leechers ({exc})")
        return None

    # --------------------------
    # 7) Download the .torrent content
    # --------------------------
    torrent_content = None
    time.sleep(_DOWNLOAD_DELAY_S)  # gap between torrents
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        resp = requests_session.get(download_link, timeout=_DOWNLOAD_TIMEOUT_S)
        resp.raise_for_status()
        torrent_content = resp.content
    except Exception as e:
        print(f"⚠️ Could not download torrent file for {torrent_hash}: {e}")

    # --------------------------
    # Final dictionary
    # --------------------------
    return {
        "torrent_hash": torrent_hash,
        "details_link": details_link,
        "category": category,
        "title_visible": visible_name,
        "title_full": full_title,
        "size_pretty": size_pretty,
        "added_datetime": added_mysql,
        "preview_image": img_link,
        "seeders": seeders_number,
        "seeders_link": seeders_link,
        "leechers": leechers_number,
        "leechers_link": leechers_link,
        "torrent_filename": torrent_filename,
        "torrent_content": torrent_content,
    }
# ============================================================
# 6) MySQL INSERT
# ============================================================
# Upsert for one parsed torrent row. The named placeholders match the keys of
# the dict returned by parse_row().
# On a duplicate key every column is refreshed, except that torrent_content
# keeps its stored value when the new value is NULL: a failed download on a
# later run must not erase a .torrent file that was fetched successfully before.
insert_sql = """
INSERT INTO torrents (
    torrent_hash, details_link, category, title_visible, title_full,
    size_pretty, added_datetime, preview_image,
    seeders, seeders_link, leechers, leechers_link,
    torrent_filename, torrent_content
) VALUES (
    %(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
    %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
    %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
    %(torrent_filename)s, %(torrent_content)s
)
ON DUPLICATE KEY UPDATE
    details_link = VALUES(details_link),
    category = VALUES(category),
    title_visible = VALUES(title_visible),
    title_full = VALUES(title_full),
    size_pretty = VALUES(size_pretty),
    added_datetime = VALUES(added_datetime),
    preview_image = VALUES(preview_image),
    seeders = VALUES(seeders),
    seeders_link = VALUES(seeders_link),
    leechers = VALUES(leechers),
    leechers_link = VALUES(leechers_link),
    torrent_filename = VALUES(torrent_filename),
    torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
"""
# ============================================================
# 7) Funkce pro zpracování jedné stránky
# ============================================================
def process_current_page(page_index: int):
    """Store every torrent listed on the page currently open in the browser.

    A table row counts as a torrent row only when it has exactly 7 <td>
    cells. Each such row is handed to parse_row(); rows it rejects are
    skipped, the rest are inserted/updated in the database.

    Args:
        page_index: page number, used only in the progress messages.
    """
    all_rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
    # Collect the cell lists up front; only rows with exactly 7 cells qualify.
    cell_lists = (tr.find_elements(By.TAG_NAME, "td") for tr in all_rows)
    torrent_rows = [tds for tds in cell_lists if len(tds) == 7]

    print(f"📄 Page {page_index}: {len(torrent_rows)} torrent rows")

    for tds in torrent_rows:
        record = parse_row(tds)
        if record:
            print(f" 💾 [{page_index}] Saving:", record["title_visible"])
            cursor.execute(insert_sql, record)
# ============================================================
# 8) Hlavní stránkovací cyklus
# ============================================================
# Walk the listing page by page, following the "Dalsi >>" (next) link until
# it disappears. The browser and the database connection are released even
# when a page fails half-way through.
current_url = START_URL
page_index = 0
try:
    while True:
        print(f"\n🌐 Loading page {page_index}: {current_url}")
        driver.get(current_url)
        time.sleep(2)

        # Close the popup, if there is one.
        close_popup_if_any()

        # Process the page that is now open.
        process_current_page(page_index)

        # Look for the "Dalsi >>" button. find_elements returns an empty list
        # when it is missing, so only a truly absent link means "last page";
        # real WebDriver failures are no longer reported as a normal finish.
        next_links = driver.find_elements(
            By.XPATH,
            "//a[b[contains(text(),'Dalsi')]]"
        )
        if not next_links:
            print("✅ No 'Dalsi >>' link found, reached last page. Done.")
            break

        next_url = next_links[0].get_attribute("href")
        if not next_url:
            print("⛔ Next link has no href, stopping.")
            break

        # If the link is relative, prepend the domain.
        if next_url.startswith("/"):
            next_url = "https://sktorrent.eu" + next_url

        # Guard against an endless loop when the link points at the same page.
        if next_url == current_url:
            print("⛔ Next URL equals current URL, stopping.")
            break

        print("➡️ Next page:", next_url)
        current_url = next_url
        page_index += 1

        # Small pause between pages.
        time.sleep(1)

    print("\n🎉 DONE — All pages processed, torrents saved & torrent files downloaded.")
finally:
    # Nested so the database is closed even if quitting the browser fails.
    try:
        driver.quit()
    finally:
        cursor.close()
        db.close()