reporter
This commit is contained in:
@@ -0,0 +1,390 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import pymysql
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
import urllib.parse as urlparse
|
||||||
|
from pathlib import Path
|
||||||
|
import json
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 1) MySQL CONNECTION
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
import os

# Connection settings may be overridden through environment variables so the
# credentials do not have to live in the source tree; the literals below are
# only the backward-compatible fallbacks.
# NOTE(review): the fallback still embeds the root password - rotate it and
# drop the default once the environment variables are in place.
db = pymysql.connect(
    host=os.environ.get("TORRENTS_DB_HOST", "192.168.1.76"),
    port=int(os.environ.get("TORRENTS_DB_PORT", "3307")),
    user=os.environ.get("TORRENTS_DB_USER", "root"),
    password=os.environ.get("TORRENTS_DB_PASSWORD", "Vlado9674+"),
    database=os.environ.get("TORRENTS_DB_NAME", "torrents"),
    charset="utf8mb4",
    autocommit=True,  # every statement is committed immediately
)

# Single shared cursor used by the rest of the script.
cursor = db.cursor()
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 2) Selenium setup
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
# JSON file holding the saved site cookies.
COOKIE_FILE = Path("sktorrent_cookies.json")

# Listing URL for category 24, ordered by date descending.
START_URL = "https://sktorrent.eu/torrent/torrents.php?active=0&category=24&order=data&by=DESC&zaner=&jazyk=&page=90"

chrome_options = Options()
for chrome_flag in (
    "--start-maximized",
    "--disable-notifications",
    "--disable-popup-blocking",
    "--disable-extensions",
):
    chrome_options.add_argument(chrome_flag)

driver = webdriver.Chrome(options=chrome_options)

# Place and size the window (so that it does not cover PyCharm); adjust the
# numbers to your monitor.
driver.set_window_position(380, 50)
driver.set_window_size(1350, 1000)

# Visit the site root first so the cookies can be attached to its domain.
driver.get("https://sktorrent.eu")

# Load the cookies from the JSON file into the browser.
if not COOKIE_FILE.exists():
    print("⚠️ Cookie file not found, you may not be logged in!")
else:
    with open(COOKIE_FILE, "r") as cookie_fh:
        cookies = json.load(cookie_fh)
    for saved_cookie in cookies:
        driver.add_cookie(saved_cookie)
    print("🍪 Cookies loaded.")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 3) Převod cookies → requests.Session (pro stahování .torrent)
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
# Mirror the browser's cookies into a requests.Session, which is what
# downloads the .torrent files.
requests_session = requests.Session()
session_jar = requests_session.cookies
for browser_cookie in driver.get_cookies():
    session_jar.set(browser_cookie["name"], browser_cookie["value"])

print("🔗 Requests session initialized with Selenium cookies.")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 4) Funkce pro zavření popupu
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
def close_popup_if_any():
    """Try to dismiss the interstitial ad by calling interstitialBox.closeit() via JS.

    The JavaScript snippet swallows its own errors, so a page without the
    popup is not a failure. Any exception raised on the Python side is
    printed and otherwise ignored.
    """
    closing_script = "try { interstitialBox.closeit(); } catch(e) {}"
    try:
        driver.execute_script(closing_script)
        # Short pause so the DOM can settle.
        time.sleep(0.5)
        print("🧹 Popup closed via JS fallback (if present).")
    except Exception as err:
        print("ℹ️ Popup JS handler not found:", err)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 5) Funkce pro parsování jednoho řádku (jednoho torrentu)
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
# Seconds to wait for the server before giving up on a .torrent download.
_DOWNLOAD_TIMEOUT = 30


def _added_to_mysql_datetime(added_pretty):
    """Convert a listing date such as "29/11/2025 o 02:29" to "2025-11-29 02:29:00".

    Returns None (after printing a warning) when the date part is not in
    day/month/year form, so a single odd row cannot abort the whole crawl.
    """
    # "29/11/2025 o 02:29" -> "29/11/2025 02:29"
    clean = added_pretty.replace(" o ", " ").strip()
    date_part, *time_parts = clean.split(" ")
    time_part = time_parts[0] if time_parts else "00:00:00"

    # Append seconds when only hours and minutes are shown.
    if len(time_part.split(":")) == 2:
        time_part += ":00"

    try:
        day, month, year = date_part.split("/")
    except ValueError:
        print("⚠️ Unrecognized date format, storing NULL →", added_pretty)
        return None

    return f"{year}-{month}-{day} {time_part}"


def _extract_preview_image(cell):
    """Return the preview image URL from the 'Obrázok' link's onmouseover handler, or None."""
    try:
        image_a = cell.find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
        mouseover = image_a.get_attribute("onmouseover")
    except Exception:
        # Best effort: a row without a preview link simply has no image.
        return None

    if not mouseover:
        return None

    img_match = re.search(r"src=([^ ]+)", mouseover)
    if not img_match:
        return None

    img_link = img_match.group(1).replace("'", "").strip()
    # Protocol-relative URL -> absolute https URL.
    if img_link.startswith("//"):
        img_link = "https:" + img_link
    return img_link


def _parse_peer_cell(cell):
    """Return (count, link) from a seeders/leechers cell that holds one anchor."""
    anchor = cell.find_element(By.TAG_NAME, "a")
    return int(anchor.text.strip()), anchor.get_attribute("href")


def parse_row(cells):
    """Extract one torrent's data from a listing row and fetch its .torrent file if needed.

    Args:
        cells: the seven <td> WebElements of one row:
            0 category, 1 download link (.torrent), 2 title + size + date +
            'Obrázok' preview link + genre, 3 ignored, 4 seeders, 5 leechers,
            6 completed (not read here).

    Returns:
        A dict keyed like the named placeholders of ``insert_sql``, or None
        when the row lacks a download link, a title link, a details link or
        the ``id`` query parameter.

    Raises:
        A Selenium lookup error or ValueError when the seeders/leechers cells
        do not hold a link whose text is an integer.
    """
    # 1) Category
    category = cells[0].text.strip()

    # 2) Download link of the .torrent file (cells[1])
    try:
        download_link = cells[1].find_element(By.TAG_NAME, "a").get_attribute("href")
    except Exception:
        # Selenium raises when the cell holds no anchor at all.
        download_link = None

    if not download_link:
        print("⚠️ No download link in row, skipping.")
        return None

    dl_query = urlparse.parse_qs(urlparse.urlparse(download_link).query)
    torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]

    # 3) Title + details link (first anchor in cells[2])
    title_links = cells[2].find_elements(By.TAG_NAME, "a")
    if not title_links:
        print("⚠️ No title link — skipping row")
        return None

    a_tag = title_links[0]
    visible_name = a_tag.text.strip()
    full_title = a_tag.get_attribute("title")
    details_link = a_tag.get_attribute("href")

    if not details_link:
        print("⚠️ Row has no details link — skipping")
        return None

    # The torrent hash is the ?id= parameter of the details link.
    query = urlparse.parse_qs(urlparse.urlparse(details_link).query)
    if "id" not in query:
        print("⚠️ Skipping row with no torrent ID →", details_link)
        return None

    torrent_hash = query["id"][0]

    # 4) Size + date, taken from the whitespace-normalized cell text
    text_block = cells[2].get_attribute("innerText")
    text_block_clean = " ".join(text_block.split())

    size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE)
    added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE)

    size_pretty = size_match.group(1) if size_match else None
    added_pretty = added_match.group(1) if added_match else None
    added_mysql = _added_to_mysql_datetime(added_pretty) if added_pretty else None

    # 5) Preview image
    img_link = _extract_preview_image(cells[2])

    # 6) Seeders / leechers
    seeders_number, seeders_link = _parse_peer_cell(cells[4])
    leechers_number, leechers_link = _parse_peer_cell(cells[5])

    # 7) Do we already have the .torrent content stored for this hash?
    cursor.execute(
        "SELECT torrent_content FROM torrents WHERE torrent_hash=%s",
        (torrent_hash,)
    )
    row = cursor.fetchone()
    already_have_torrent = row is not None and row[0] is not None

    # 8) Download the .torrent content only when it is not stored yet
    torrent_content = None
    if already_have_torrent:
        print(f" ↪️ Torrent file already stored, skipping download ({torrent_filename})")
    else:
        time.sleep(3)  # pause between torrent downloads
        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            resp = requests_session.get(download_link, timeout=_DOWNLOAD_TIMEOUT)
            resp.raise_for_status()
            torrent_content = resp.content
        except requests.RequestException as e:
            print(f"⚠️ Could not download torrent file for {torrent_hash}: {e}")

    return {
        "torrent_hash": torrent_hash,
        "details_link": details_link,
        "category": category,
        "title_visible": visible_name,
        "title_full": full_title,
        "size_pretty": size_pretty,
        "added_datetime": added_mysql,
        "preview_image": img_link,
        "seeders": seeders_number,
        "seeders_link": seeders_link,
        "leechers": leechers_number,
        "leechers_link": leechers_link,
        "torrent_filename": torrent_filename,
        # None when the file is already stored (or the download failed); the
        # upsert's COALESCE then keeps whatever the table already holds.
        "torrent_content": torrent_content,
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 6) MySQL INSERT
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
# Upsert keyed on the table's unique key: a new torrent is inserted, a known
# one gets its listing fields refreshed. COALESCE keeps the stored
# torrent_content whenever the new value is NULL.
insert_sql = """
    INSERT INTO torrents (
        torrent_hash, details_link, category, title_visible, title_full,
        size_pretty, added_datetime, preview_image,
        seeders, seeders_link, leechers, leechers_link,
        torrent_filename, torrent_content
    ) VALUES (
        %(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
        %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
        %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
        %(torrent_filename)s, %(torrent_content)s
    )
    ON DUPLICATE KEY UPDATE
        details_link = VALUES(details_link),
        category = VALUES(category),
        title_visible = VALUES(title_visible),
        title_full = VALUES(title_full),
        size_pretty = VALUES(size_pretty),
        added_datetime = VALUES(added_datetime),
        preview_image = VALUES(preview_image),
        seeders = VALUES(seeders),
        seeders_link = VALUES(seeders_link),
        leechers = VALUES(leechers),
        leechers_link = VALUES(leechers_link),
        torrent_filename = VALUES(torrent_filename),
        torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
"""
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 7) Funkce pro zpracování jedné stránky
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
def process_current_page(page_index: int):
    """Scrape the page currently open in the browser and upsert each torrent on it.

    Table rows with exactly seven <td> cells are treated as torrent rows.
    Each one is handed to parse_row, and rows it returns data for are
    written with insert_sql.

    Args:
        page_index: counter used only in the progress messages.
    """
    all_trs = driver.find_elements(By.CSS_SELECTOR, "table tr")
    cell_groups = (tr.find_elements(By.TAG_NAME, "td") for tr in all_trs)
    # Keep only the rows with exactly 7 cells.
    torrent_rows = [tds for tds in cell_groups if len(tds) == 7]

    print(f"📄 Page {page_index}: {len(torrent_rows)} torrent rows")

    for tds in torrent_rows:
        record = parse_row(tds)
        if record:
            print(f" 💾 [{page_index}] Saving:", record["title_visible"])
            cursor.execute(insert_sql, record)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 8) Hlavní stránkovací cyklus
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
# Main pagination loop: process a page, then follow the "Dalsi >>" link until
# there is none. The browser and the DB connection are released even when a
# page fails part-way through.
current_url = START_URL
page_index = 0

try:
    while True:
        print(f"\n🌐 Loading page {page_index}: {current_url}")
        driver.get(current_url)
        time.sleep(2)

        # Close the popup, if any, then process the page.
        close_popup_if_any()
        process_current_page(page_index)

        # find_elements returns an empty list instead of raising, so only a
        # genuinely missing link is reported as the last page; real errors
        # propagate instead of being mistaken for the end of the listing.
        next_links = driver.find_elements(
            By.XPATH,
            "//a[b[contains(text(),'Dalsi')]]"
        )
        if not next_links:
            print("✅ No 'Dalsi >>' link found, reached last page. Done.")
            break

        next_url = next_links[0].get_attribute("href")
        if not next_url:
            print("⛔ Next link has no href, stopping.")
            break

        # Prefix the domain when the link is relative.
        if next_url.startswith("/"):
            next_url = "https://sktorrent.eu" + next_url

        # Guard against an endless loop on a self-referencing link.
        if next_url == current_url:
            print("⛔ Next URL equals current URL, stopping.")
            break

        print("➡️ Next page:", next_url)
        current_url = next_url
        page_index += 1

        # Small pause between pages.
        time.sleep(1)

    print("\n🎉 DONE — All pages processed, torrents saved & torrent files downloaded (without re-downloading existing ones).")
finally:
    try:
        driver.quit()
    finally:
        db.close()
|
||||||
|
|||||||
@@ -0,0 +1,256 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import pymysql
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
import urllib.parse as urlparse
|
||||||
|
from pathlib import Path
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 1) MySQL CONNECTION
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
import os

# Connection settings may be overridden through environment variables so the
# credentials do not have to live in the source tree; the literals below are
# only the backward-compatible fallbacks.
# NOTE(review): the fallback still embeds the root password - rotate it and
# drop the default once the environment variables are in place.
db = pymysql.connect(
    host=os.environ.get("TORRENTS_DB_HOST", "192.168.1.76"),
    port=int(os.environ.get("TORRENTS_DB_PORT", "3307")),
    user=os.environ.get("TORRENTS_DB_USER", "root"),
    password=os.environ.get("TORRENTS_DB_PASSWORD", "Vlado9674+"),
    database=os.environ.get("TORRENTS_DB_NAME", "torrents"),
    charset="utf8mb4",
    autocommit=True,  # every statement is committed immediately
)

# Single shared cursor used by the rest of the script.
cursor = db.cursor()
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 2) Selenium setup
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
# JSON file holding the saved site cookies.
COOKIE_FILE = Path("sktorrent_cookies.json")
# First listing page of category 24, ordered by date descending.
URL = "https://sktorrent.eu/torrent/torrents.php?active=0&category=24&order=data&by=DESC&zaner=&jazyk=&page=0"

chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--disable-extensions")

driver = webdriver.Chrome(options=chrome_options)

# Visit the site root first so the cookies can be attached to its domain.
driver.get("https://sktorrent.eu")

# Load cookies
if COOKIE_FILE.exists():
    with open(COOKIE_FILE, "r") as f:
        cookies = json.load(f)
    for c in cookies:
        driver.add_cookie(c)
    print("🍪 Cookies loaded.")
else:
    # Make a missing cookie file visible instead of continuing silently.
    print("⚠️ Cookie file not found, you may not be logged in!")

driver.get(URL)
time.sleep(2)

# ------------------------------------------------------------
# Close the interstitial popup
# ------------------------------------------------------------
time.sleep(1)

try:
    # The JS snippet swallows its own errors, so a page without the popup
    # is not a failure.
    driver.execute_script("try { interstitialBox.closeit(); } catch(e) {}")
    print("🧹 Popup closed via JS fallback.")
    time.sleep(1)
except Exception:
    # `except Exception` rather than a bare `except`, so Ctrl+C and
    # SystemExit are no longer swallowed here.
    print("ℹ️ Popup JS handler not found (probably no popup).")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 3) Extract table rows
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
print("Total rows found:", len(rows))

# Rows with exactly seven <td> cells are treated as torrent rows.
real_rows = [
    tds
    for tds in (tr.find_elements(By.TAG_NAME, "td") for tr in rows)
    if len(tds) == 7
]

print("Real torrent rows:", len(real_rows))
print("")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 4) Function to extract fields from one row
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
def _added_to_mysql_datetime(added_pretty):
    """Convert a listing date such as "18/11/2025 o 07:00" to "2025-11-18 07:00:00".

    Returns None (after printing a warning) when the date part is not in
    day/month/year form, so a single odd row cannot abort the whole run.
    """
    # Normalize "29/11/2025 o 02:29" and "29/11/2025 02:29:18" alike.
    clean = added_pretty.replace(" o ", " ").strip()
    date_part, *time_parts = clean.split(" ")
    time_part = time_parts[0] if time_parts else "00:00"

    # Append seconds when only hours and minutes are shown.
    if len(time_part.split(":")) == 2:
        time_part += ":00"

    try:
        day, month, year = date_part.split("/")
    except ValueError:
        print("⚠️ Unrecognized date format, storing NULL →", added_pretty)
        return None

    return f"{year}-{month}-{day} {time_part}"


def _extract_preview_image(cell):
    """Return the preview image URL from the 'Obrázok' link's onmouseover handler, or None."""
    try:
        image_a = cell.find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
        mouseover = image_a.get_attribute("onmouseover")
    except Exception:
        # Best effort: a row without a preview link simply has no image.
        return None

    if not mouseover:
        return None

    img_match = re.search(r"src=([^ ]+)", mouseover)
    if not img_match:
        return None

    img_link = img_match.group(1).replace("'", "").strip()
    # Protocol-relative URL -> absolute https URL.
    if img_link.startswith("//"):
        img_link = "https:" + img_link
    return img_link


def _parse_peer_cell(cell):
    """Return (count, link) from a seeders/leechers cell that holds one anchor."""
    anchor = cell.find_element(By.TAG_NAME, "a")
    return int(anchor.text.strip()), anchor.get_attribute("href")


def parse_row(cells):
    """Extract one torrent's listing data from the <td> cells of a table row.

    Args:
        cells: the <td> WebElements of one row. Index 0 is the category,
            2 holds title/size/date/preview, 4 the seeders and 5 the
            leechers; the other cells are not read.

    Returns:
        A dict keyed like the named placeholders of ``insert_sql``, or None
        when the row lacks a title link, a details link or the ``id`` query
        parameter.

    Raises:
        A Selenium lookup error or ValueError when the seeders/leechers cells
        do not hold a link whose text is an integer.
    """
    # 1) Category (cells[0])
    category = cells[0].text.strip()

    # 2) Title + details link (first anchor in cells[2])
    title_links = cells[2].find_elements(By.TAG_NAME, "a")
    if not title_links:
        print("⚠️ Missing title link — skipping row")
        return None

    a_tag = title_links[0]
    visible_name = a_tag.text.strip()
    full_title = a_tag.get_attribute("title")
    details_link = a_tag.get_attribute("href")

    if not details_link:
        print("⚠️ Row has no details link — skipping")
        return None

    # 3) Torrent hash = ?id= parameter of the details link
    query = urlparse.parse_qs(urlparse.urlparse(details_link).query)
    if "id" not in query:
        print("⚠️ Skipping row with no torrent ID →", details_link)
        return None

    torrent_hash = query["id"][0]

    # 4) Size + date, taken from the whitespace-normalized cell text
    text_block = cells[2].get_attribute("innerText")
    text_block_clean = " ".join(text_block.split())

    size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE)
    added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE)

    size_pretty = size_match.group(1) if size_match else None
    added_pretty = added_match.group(1) if added_match else None
    added_mysql = _added_to_mysql_datetime(added_pretty) if added_pretty else None

    # 5) Preview image
    img_link = _extract_preview_image(cells[2])

    # 6) Seeders (cells[4]) and 7) leechers (cells[5])
    seeders_number, seeders_link = _parse_peer_cell(cells[4])
    leechers_number, leechers_link = _parse_peer_cell(cells[5])

    return {
        "torrent_hash": torrent_hash,
        "details_link": details_link,
        "category": category,
        "title_visible": visible_name,
        "title_full": full_title,
        "size_pretty": size_pretty,
        "added_datetime": added_mysql,
        "preview_image": img_link,
        "seeders": seeders_number,
        "seeders_link": seeders_link,
        "leechers": leechers_number,
        "leechers_link": leechers_link,
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 5) MySQL INSERT
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
# Upsert keyed on the table's unique key: a new torrent is inserted, a known
# one gets all of its listing fields refreshed.
insert_sql = """
    INSERT INTO torrents (
        torrent_hash, details_link, category, title_visible, title_full,
        size_pretty, added_datetime, preview_image,
        seeders, seeders_link, leechers, leechers_link
    ) VALUES (
        %(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
        %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
        %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s
    )
    ON DUPLICATE KEY UPDATE
        details_link = VALUES(details_link),
        category = VALUES(category),
        title_visible = VALUES(title_visible),
        title_full = VALUES(title_full),
        size_pretty = VALUES(size_pretty),
        added_datetime = VALUES(added_datetime),
        preview_image = VALUES(preview_image),
        seeders = VALUES(seeders),
        seeders_link = VALUES(seeders_link),
        leechers = VALUES(leechers),
        leechers_link = VALUES(leechers_link);
"""
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 6) PROCESS ALL ROWS
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
# Parse and store every collected row. The browser and the DB connection are
# released even when a row raises part-way through.
try:
    for cells in real_rows:
        data = parse_row(cells)
        if not data:
            continue

        print("💾 Saving:", data["title_visible"])
        cursor.execute(insert_sql, data)

    print("\n✅ DONE — All torrents saved to MySQL.")
finally:
    try:
        driver.quit()
    finally:
        db.close()
|
||||||
|
|||||||
@@ -0,0 +1,291 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import pymysql
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
import urllib.parse as urlparse
|
||||||
|
from pathlib import Path
|
||||||
|
import json
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 1) MySQL CONNECTION
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
import os

# Connection settings may be overridden through environment variables so the
# credentials do not have to live in the source tree; the literals below are
# only the backward-compatible fallbacks.
# NOTE(review): the fallback still embeds the root password - rotate it and
# drop the default once the environment variables are in place.
db = pymysql.connect(
    host=os.environ.get("TORRENTS_DB_HOST", "192.168.1.76"),
    port=int(os.environ.get("TORRENTS_DB_PORT", "3307")),
    user=os.environ.get("TORRENTS_DB_USER", "root"),
    password=os.environ.get("TORRENTS_DB_PASSWORD", "Vlado9674+"),
    database=os.environ.get("TORRENTS_DB_NAME", "torrents"),
    charset="utf8mb4",
    autocommit=True,  # every statement is committed immediately
)

# Single shared cursor used by the rest of the script.
cursor = db.cursor()
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 2) Selenium setup
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
# JSON file holding the saved site cookies.
COOKIE_FILE = Path("sktorrent_cookies.json")
URL = "https://sktorrent.eu/torrent/torrents.php?active=0"

chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--disable-extensions")

driver = webdriver.Chrome(options=chrome_options)

# Visit the site root first so the cookies can be attached to its domain.
driver.get("https://sktorrent.eu")

# Load cookies into the browser and keep name -> value pairs on the side.
session_cookies = []
if COOKIE_FILE.exists():
    with open(COOKIE_FILE, "r") as f:
        cookies = json.load(f)
    for c in cookies:
        driver.add_cookie(c)
        session_cookies.append({c['name']: c['value']})
    print("🍪 Cookies loaded.")
else:
    # Make a missing cookie file visible instead of continuing silently.
    print("⚠️ Cookie file not found, you may not be logged in!")

driver.get(URL)
time.sleep(2)

# ------------------------------------------------------------
# 3) Close the interstitial popup
# ------------------------------------------------------------
try:
    # The JS snippet swallows its own errors, so a page without the popup
    # is not a failure.
    driver.execute_script("try { interstitialBox.closeit(); } catch(e) {}")
    print("🧹 Popup closed via JS fallback.")
    time.sleep(1)
except Exception:
    # `except Exception` rather than a bare `except`, so Ctrl+C and
    # SystemExit are no longer swallowed here.
    print("ℹ️ No popup found.")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# Convert Selenium cookies → Python requests cookies
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
# Mirror the browser's cookies into a requests.Session.
requests_session = requests.Session()
session_jar = requests_session.cookies
for browser_cookie in driver.get_cookies():
    session_jar.set(browser_cookie["name"], browser_cookie["value"])
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 4) Extract table rows
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
print("Total rows found:", len(rows))

# Rows with exactly seven <td> cells are treated as torrent rows.
real_rows = [
    tds
    for tds in (tr.find_elements(By.TAG_NAME, "td") for tr in rows)
    if len(tds) == 7
]

print("Real torrent rows:", len(real_rows))
print("")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 5) Function to extract fields from one row
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
def _normalize_added_datetime(added_pretty):
    """Turn a 'DD/MM/YYYY o HH:MM[:SS]' string into 'YYYY-MM-DD HH:MM:SS', or None if it does not fit."""
    if not added_pretty:
        return None

    parts = added_pretty.replace(" o ", " ").split()
    if not parts:
        return None

    date_part = parts[0]
    time_part = parts[1] if len(parts) > 1 else "00:00:00"

    # Add seconds when only HH:MM is present.
    if len(time_part.split(":")) == 2:
        time_part += ":00"

    try:
        day, month, year = date_part.split("/")
    except ValueError:
        # Not three '/'-separated fields: store NULL instead of aborting the run.
        return None
    return f"{year}-{month}-{day} {time_part}"


def _read_peer_cell(cell):
    """Return (count, href) from the first link of a seeders/leechers cell, or None if unreadable."""
    links = cell.find_elements(By.TAG_NAME, "a")
    if not links:
        return None
    try:
        count = int(links[0].text.strip())
    except ValueError:
        return None
    return count, links[0].get_attribute("href")


def parse_row(cells):
    """Extract one torrent's fields from a listing row and download its .torrent file.

    Args:
        cells: the <td> elements of one table row. Indexes 0 (category),
            1 (download link), 2 (title, size, date, preview image),
            4 (seeders) and 5 (leechers) are read.

    Returns:
        A dict with the keys torrent_hash, details_link, category,
        title_visible, title_full, size_pretty, added_datetime, preview_image,
        seeders, seeders_link, leechers, leechers_link, torrent_filename and
        torrent_content, or None when the row lacks a required piece and must
        be skipped. torrent_content is None when the download fails.
    """
    # --------------------------
    # 1) Category
    # --------------------------
    category = cells[0].text.strip()

    # --------------------------
    # 2) Download link for the .torrent file
    # --------------------------
    download_anchors = cells[1].find_elements(By.TAG_NAME, "a")
    download_link = download_anchors[0].get_attribute("href") if download_anchors else None
    if not download_link:
        print("⚠️  No download link in row, skipping.")
        return None

    # The file name travels in the link's "f" query parameter.
    dl_query = urlparse.parse_qs(urlparse.urlparse(download_link).query)
    torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]

    # --------------------------
    # 3) Title + details link (first <a> in cell 2)
    # --------------------------
    title_links = cells[2].find_elements(By.TAG_NAME, "a")
    if not title_links:
        print("⚠️ No title link — skipping row")
        return None

    a_tag = title_links[0]
    visible_name = a_tag.text.strip()
    full_title = a_tag.get_attribute("title")
    details_link = a_tag.get_attribute("href")

    if not details_link:
        print("⚠️ Row has no details link — skipping")
        return None

    # The value of the details link's ?id= parameter is used as the torrent hash.
    query = urlparse.parse_qs(urlparse.urlparse(details_link).query)
    if "id" not in query:
        print("⚠️ Skipping row with no torrent ID →", details_link)
        return None
    torrent_hash = query["id"][0]

    # --------------------------
    # 4) Size + date parsing
    # --------------------------
    # Collapse all whitespace so the regexes can work on a single line.
    text_block = cells[2].get_attribute("innerText") or ""
    text_block_clean = " ".join(text_block.split())

    size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE)
    added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE)

    size_pretty = size_match.group(1) if size_match else None
    added_pretty = added_match.group(1) if added_match else None
    added_mysql = _normalize_added_datetime(added_pretty)

    # --------------------------
    # 5) Image preview (optional)
    # --------------------------
    # The preview URL is embedded in the onmouseover handler of the 'Obrázok' link.
    img_link = None
    image_links = cells[2].find_elements(By.XPATH, ".//a[contains(text(),'Obrázok')]")
    if image_links:
        mouseover = image_links[0].get_attribute("onmouseover") or ""
        img_match = re.search(r"src=([^ ]+)", mouseover)
        if img_match:
            img_link = img_match.group(1).replace("'", "").strip()
            if img_link.startswith("//"):
                img_link = "https:" + img_link

    # --------------------------
    # 6) Seeders / leechers
    # --------------------------
    seeders = _read_peer_cell(cells[4])
    leechers = _read_peer_cell(cells[5])
    if seeders is None or leechers is None:
        # A row without readable peer counts is skipped like other malformed rows.
        print("⚠️ Unreadable seeders/leechers cell — skipping row")
        return None
    seeders_number, seeders_link = seeders
    leechers_number, leechers_link = leechers

    # --------------------------
    # 7) Download the .torrent content
    # --------------------------
    torrent_content = None
    try:
        # The timeout keeps one stalled download from hanging the whole run;
        # raise_for_status stops an HTTP error page being stored as a torrent.
        resp = requests_session.get(download_link, timeout=30)
        resp.raise_for_status()
        torrent_content = resp.content
    except requests.RequestException as exc:
        print(f"⚠️ Could not download torrent file for {torrent_hash}: {exc}")

    # --------------------------
    # Final dictionary
    # --------------------------
    return {
        "torrent_hash": torrent_hash,
        "details_link": details_link,
        "category": category,
        "title_visible": visible_name,
        "title_full": full_title,
        "size_pretty": size_pretty,
        "added_datetime": added_mysql,
        "preview_image": img_link,
        "seeders": seeders_number,
        "seeders_link": seeders_link,
        "leechers": leechers_number,
        "leechers_link": leechers_link,
        "torrent_filename": torrent_filename,
        "torrent_content": torrent_content,
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 6) MySQL INSERT
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
# Upsert for one torrent row; parameters are bound by name from the dict that
# parse_row() returns. On a duplicate key every column is refreshed, except
# that torrent_content keeps its stored value when the new value is NULL
# (parse_row yields None on a failed download), so a failed re-download does
# not wipe a previously saved .torrent file.
insert_sql = """
INSERT INTO torrents (
    torrent_hash, details_link, category, title_visible, title_full,
    size_pretty, added_datetime, preview_image,
    seeders, seeders_link, leechers, leechers_link,
    torrent_filename, torrent_content
) VALUES (
    %(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
    %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
    %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
    %(torrent_filename)s, %(torrent_content)s
)
ON DUPLICATE KEY UPDATE
    details_link = VALUES(details_link),
    category = VALUES(category),
    title_visible = VALUES(title_visible),
    title_full = VALUES(title_full),
    size_pretty = VALUES(size_pretty),
    added_datetime = VALUES(added_datetime),
    preview_image = VALUES(preview_image),
    seeders = VALUES(seeders),
    seeders_link = VALUES(seeders_link),
    leechers = VALUES(leechers),
    leechers_link = VALUES(leechers_link),
    torrent_filename = VALUES(torrent_filename),
    torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
"""
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 7) PROCESS ALL ROWS
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
# Parse and store every collected row. The browser is shut down in `finally`
# so a failure in the middle of the run does not leave a Chrome instance behind.
try:
    for cells in real_rows:
        data = parse_row(cells)
        if not data:
            # parse_row returns None for rows it decided to skip.
            continue

        print("💾 Saving:", data["title_visible"])
        cursor.execute(insert_sql, data)

    print("\n✅ DONE — All torrents saved to MySQL & torrent files downloaded.")
finally:
    driver.quit()
|
||||||
|
|||||||
@@ -0,0 +1,375 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import json
import os
import re
import time
import urllib.parse as urlparse
from pathlib import Path

import pymysql
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 1) MySQL CONNECTION
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
# Connection settings can be overridden through TORRENTS_DB_* environment
# variables; the fallbacks reproduce the previously hard-coded values so the
# script behaves the same when nothing is set.
# NOTE(review): the literal password fallback should be removed from source
# control once the environment is configured.
db = pymysql.connect(
    host=os.environ.get("TORRENTS_DB_HOST", "192.168.1.76"),
    port=int(os.environ.get("TORRENTS_DB_PORT", "3307")),
    user=os.environ.get("TORRENTS_DB_USER", "root"),
    password=os.environ.get("TORRENTS_DB_PASSWORD", "Vlado9674+"),
    database=os.environ.get("TORRENTS_DB_NAME", "torrents"),
    charset="utf8mb4",
    # Every statement is committed immediately; no explicit commit() is issued.
    autocommit=True,
)

cursor = db.cursor()
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 2) Selenium setup
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
COOKIE_FILE = Path("sktorrent_cookies.json")

# Start URL for category 24, ordered by date, descending.
START_URL = (
    "https://sktorrent.eu/torrent/torrents.php"
    "?active=0&category=24&order=data&by=DESC&zaner=&jazyk=&page=0"
)

chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--disable-extensions")

driver = webdriver.Chrome(options=chrome_options)

# Window position and size (so that it does not cover PyCharm).
driver.set_window_position(380, 50)   # about 10 cm from the left edge
driver.set_window_size(1350, 1000)    # adjust to your monitor


# Open the main page first so the browser is on the domain the cookies belong to.
driver.get("https://sktorrent.eu")

# Load cookies from JSON. The encoding is explicit so the file is read the
# same way regardless of the platform's locale default.
if COOKIE_FILE.exists():
    with open(COOKIE_FILE, "r", encoding="utf-8") as f:
        cookies = json.load(f)

    for c in cookies:
        driver.add_cookie(c)

    print("🍪 Cookies loaded.")
else:
    print("⚠️ Cookie file not found, you may not be logged in!")


# ============================================================
# 3) Copy cookies → requests.Session (used to download .torrent files)
# ============================================================

requests_session = requests.Session()
for ck in driver.get_cookies():
    # Only name and value are carried over.
    requests_session.cookies.set(ck["name"], ck["value"])

print("🔗 Requests session initialized with Selenium cookies.")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 4) Funkce pro zavření popupu
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
def close_popup_if_any():
    """Try to dismiss the interstitial ad by calling the page's interstitialBox.closeit() via JS."""
    dismiss_script = "try { interstitialBox.closeit(); } catch(e) {}"
    try:
        driver.execute_script(dismiss_script)
        # Brief pause so the DOM can settle.
        time.sleep(0.5)
        print("🧹 Popup closed via JS fallback (if present).")
    except Exception as err:
        print("ℹ️ Popup JS handler not found:", err)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 5) Funkce pro parsování jednoho řádku (jednoho torrentu)
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
def _normalize_added_datetime(added_pretty):
    """Turn e.g. '29/11/2025 o 02:29' into '2025-11-29 02:29:00'; return None if it does not fit."""
    if not added_pretty:
        return None

    # "29/11/2025 o 02:29" → ["29/11/2025", "02:29"]
    parts = added_pretty.replace(" o ", " ").split()
    if not parts:
        return None

    date_part = parts[0]
    time_part = parts[1] if len(parts) > 1 else "00:00:00"

    # Add seconds when only HH:MM is present.
    if len(time_part.split(":")) == 2:
        time_part += ":00"

    try:
        day, month, year = date_part.split("/")
    except ValueError:
        # Not three '/'-separated fields: store NULL instead of aborting the crawl.
        return None
    return f"{year}-{month}-{day} {time_part}"


def _read_peer_cell(cell):
    """Return (count, href) from the first link of a seeders/leechers cell, or None if unreadable."""
    links = cell.find_elements(By.TAG_NAME, "a")
    if not links:
        return None
    try:
        count = int(links[0].text.strip())
    except ValueError:
        return None
    return count, links[0].get_attribute("href")


def parse_row(cells):
    """Extract one torrent's fields from a listing row and download its .torrent file.

    Args:
        cells: list of 7 WebElements (the row's <td> cells), laid out as:
            0: category
            1: download link (.torrent)
            2: title + size + date + 'Obrázok' + genre
            3: -- (ignored)
            4: seeders
            5: leechers
            6: completed

    Returns:
        A dict with the keys torrent_hash, details_link, category,
        title_visible, title_full, size_pretty, added_datetime, preview_image,
        seeders, seeders_link, leechers, leechers_link, torrent_filename and
        torrent_content, or None when the row lacks a required piece and must
        be skipped. torrent_content is None when the download fails.
    """
    # --------------------------
    # 1) Category
    # --------------------------
    category = cells[0].text.strip()

    # --------------------------
    # 2) Download link for the .torrent file (cells[1])
    # --------------------------
    download_anchors = cells[1].find_elements(By.TAG_NAME, "a")
    download_link = download_anchors[0].get_attribute("href") if download_anchors else None
    if not download_link:
        print("⚠️  No download link in row, skipping.")
        return None

    # The file name travels in the link's "f" query parameter.
    dl_query = urlparse.parse_qs(urlparse.urlparse(download_link).query)
    torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]

    # --------------------------
    # 3) Title + details link (first <a> in cells[2])
    # --------------------------
    title_links = cells[2].find_elements(By.TAG_NAME, "a")
    if not title_links:
        print("⚠️ No title link — skipping row")
        return None

    a_tag = title_links[0]
    visible_name = a_tag.text.strip()
    full_title = a_tag.get_attribute("title")
    details_link = a_tag.get_attribute("href")

    if not details_link:
        print("⚠️ Row has no details link — skipping")
        return None

    # The value of the details link's ?id= parameter is used as the torrent hash.
    query = urlparse.parse_qs(urlparse.urlparse(details_link).query)
    if "id" not in query:
        print("⚠️ Skipping row with no torrent ID →", details_link)
        return None
    torrent_hash = query["id"][0]

    # --------------------------
    # 4) Size + date parsing
    # --------------------------
    # Collapse all whitespace so the regexes can work on a single line.
    text_block = cells[2].get_attribute("innerText") or ""
    text_block_clean = " ".join(text_block.split())

    size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE)
    added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE)

    size_pretty = size_match.group(1) if size_match else None
    added_pretty = added_match.group(1) if added_match else None

    # Convert the listing's date/time text to a MySQL datetime string.
    added_mysql = _normalize_added_datetime(added_pretty)

    # --------------------------
    # 5) Image preview (optional)
    # --------------------------
    # The preview URL is embedded in the onmouseover handler of the 'Obrázok' link.
    img_link = None
    image_links = cells[2].find_elements(By.XPATH, ".//a[contains(text(),'Obrázok')]")
    if image_links:
        mouseover = image_links[0].get_attribute("onmouseover") or ""
        img_match = re.search(r"src=([^ ]+)", mouseover)
        if img_match:
            img_link = img_match.group(1).replace("'", "").strip()
            if img_link.startswith("//"):
                img_link = "https:" + img_link

    # --------------------------
    # 6) Seeders / leechers
    # --------------------------
    seeders = _read_peer_cell(cells[4])
    leechers = _read_peer_cell(cells[5])
    if seeders is None or leechers is None:
        # A row without readable peer counts is skipped like other malformed rows.
        print("⚠️ Unreadable seeders/leechers cell — skipping row")
        return None
    seeders_number, seeders_link = seeders
    leechers_number, leechers_link = leechers

    # --------------------------
    # 7) Download the .torrent content
    # --------------------------
    torrent_content = None
    time.sleep(3)  # pause between torrent downloads
    try:
        # The timeout keeps one stalled download from hanging the whole crawl.
        resp = requests_session.get(download_link, timeout=30)
        resp.raise_for_status()
        torrent_content = resp.content
    except requests.RequestException as exc:
        print(f"⚠️ Could not download torrent file for {torrent_hash}: {exc}")

    # --------------------------
    # Final dictionary
    # --------------------------
    return {
        "torrent_hash": torrent_hash,
        "details_link": details_link,
        "category": category,
        "title_visible": visible_name,
        "title_full": full_title,
        "size_pretty": size_pretty,
        "added_datetime": added_mysql,
        "preview_image": img_link,
        "seeders": seeders_number,
        "seeders_link": seeders_link,
        "leechers": leechers_number,
        "leechers_link": leechers_link,
        "torrent_filename": torrent_filename,
        "torrent_content": torrent_content,
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 6) MySQL INSERT
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
# Upsert for one torrent row; parameters are bound by name from the dict that
# parse_row() returns. On a duplicate key every column is refreshed, except
# that torrent_content keeps its stored value when the new value is NULL
# (parse_row yields None on a failed download), so a failed re-download does
# not wipe a previously saved .torrent file.
insert_sql = """
INSERT INTO torrents (
    torrent_hash, details_link, category, title_visible, title_full,
    size_pretty, added_datetime, preview_image,
    seeders, seeders_link, leechers, leechers_link,
    torrent_filename, torrent_content
) VALUES (
    %(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
    %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
    %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
    %(torrent_filename)s, %(torrent_content)s
)
ON DUPLICATE KEY UPDATE
    details_link = VALUES(details_link),
    category = VALUES(category),
    title_visible = VALUES(title_visible),
    title_full = VALUES(title_full),
    size_pretty = VALUES(size_pretty),
    added_datetime = VALUES(added_datetime),
    preview_image = VALUES(preview_image),
    seeders = VALUES(seeders),
    seeders_link = VALUES(seeders_link),
    leechers = VALUES(leechers),
    leechers_link = VALUES(leechers_link),
    torrent_filename = VALUES(torrent_filename),
    torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
"""
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 7) Funkce pro zpracování jedné stránky
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
def process_current_page(page_index: int):
    """
    Handle the page currently open in the browser.

    Collects every table row that has exactly 7 <td> cells (the real torrent
    rows), runs parse_row on each and inserts/updates the result in the DB.
    Rows for which parse_row returns a falsy value are skipped.

    page_index is only used in the progress messages.
    """
    cell_lists = (
        tr.find_elements(By.TAG_NAME, "td")
        for tr in driver.find_elements(By.CSS_SELECTOR, "table tr")
    )
    # Real torrent rows always have exactly 7 <td> cells.
    torrent_rows = [tds for tds in cell_lists if len(tds) == 7]

    print(f"📄 Page {page_index}: {len(torrent_rows)} torrent rows")

    for tds in torrent_rows:
        record = parse_row(tds)
        if record:
            print(f" 💾 [{page_index}] Saving:", record["title_visible"])
            cursor.execute(insert_sql, record)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 8) Hlavní stránkovací cyklus
|
||||||
|
# ============================================================
|
||||||
|
|
||||||
|
current_url = START_URL
page_index = 0

# Walk the listing page by page until no usable "Dalsi >>" (next) link remains.
# Cleanup sits in `finally` so the browser and the DB connection are released
# even when a page fails part-way through.
try:
    while True:
        print(f"\n🌐 Loading page {page_index}: {current_url}")
        driver.get(current_url)
        time.sleep(2)

        # Close the popup, if there is one.
        close_popup_if_any()

        # Process the page that is now open.
        process_current_page(page_index)

        # Look for the "Dalsi >>" button. find_elements returns an empty list
        # when it is absent, so a genuine last page is told apart from an
        # unexpected error (which now propagates instead of being reported as
        # "reached last page").
        next_links = driver.find_elements(
            By.XPATH,
            "//a[b[contains(text(),'Dalsi')]]"
        )
        if not next_links:
            print("✅ No 'Dalsi >>' link found, reached last page. Done.")
            break

        next_url = next_links[0].get_attribute("href")
        if not next_url:
            print("⛔ Next link has no href, stopping.")
            break

        # Resolve a relative href against the page it came from.
        next_url = urlparse.urljoin(current_url, next_url)

        # Guard against an endless loop if the link points back at this page.
        if next_url == current_url:
            print("⛔ Next URL equals current URL, stopping.")
            break

        print("➡️ Next page:", next_url)
        current_url = next_url
        page_index += 1

        # Small pause between pages.
        time.sleep(1)

    print("\n🎉 DONE — All pages processed, torrents saved & torrent files downloaded.")
finally:
    try:
        driver.quit()
    finally:
        cursor.close()
        db.close()
|
||||||
|
|||||||
Reference in New Issue
Block a user