vbnotebook

This commit is contained in:
2025-11-18 07:22:17 +01:00
parent a764c9723e
commit 7bc330beba
2 changed files with 412 additions and 134 deletions

View File

@@ -1,171 +1,230 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
import re
import urllib.parse as urlparse
from pathlib import Path
from playwright.sync_api import sync_playwright
import json
# =============================================================
# CONFIGURATION
# =============================================================
# ============================================================
# 1) MySQL CONNECTION
# ============================================================
# Open the MySQL connection used for every torrent upsert below.
# NOTE(review): credentials are hardcoded in source — move host/user/password
# to environment variables or a config file before sharing this script.
db = pymysql.connect(
    host="192.168.1.76",
    port=3307,
    user="root",
    password="Vlado9674+",
    database="torrents",
    charset="utf8mb4",   # full Unicode (emoji-safe titles)
    autocommit=True      # each INSERT/UPDATE is committed immediately
)
# Shared cursor reused for all inserts in this run.
cursor = db.cursor()
# ============================================================
# 2) Selenium setup
# ============================================================
# Session cookies captured by a previous manual login run.
COOKIE_FILE = Path("sktorrent_cookies.json")
# Torrent listing to scrape (active=0 — presumably the inactive list; confirm).
URL = "https://sktorrent.eu/torrent/torrents.php?active=0"

# Chrome flags: maximize the window and disable the UI features
# (notifications, popups, extensions) that interfere with scraping.
chrome_options = Options()
for flag in (
    "--start-maximized",
    "--disable-notifications",
    "--disable-popup-blocking",
    "--disable-extensions",
):
    chrome_options.add_argument(flag)
def load_cookies(context):
    """Load previously saved login cookies into a Playwright *context*.

    Returns True when the cookie file existed and its cookies were added
    to the context, False otherwise (caller must then log in manually).
    """
    if not COOKIE_FILE.exists():
        print("❌ Cookie file not found. Run manual login first.")
        return False
    # Cookie file is a JSON list produced by a prior manual-login run;
    # read with an explicit encoding so the result is platform-independent.
    cookies = json.loads(COOKIE_FILE.read_text(encoding="utf-8"))
    context.add_cookies(cookies)
    print("🔄 Loaded login cookies.")
    return True
# Launch Chrome and visit the site root first: Selenium only accepts
# cookies for the domain of the currently loaded page.
driver = webdriver.Chrome(options=chrome_options)
driver.get("https://sktorrent.eu")

# Replay saved login cookies, when a previous session stored them.
if COOKIE_FILE.exists():
    with open(COOKIE_FILE, "r") as cookie_fh:
        saved_cookies = json.load(cookie_fh)
    for cookie in saved_cookies:
        driver.add_cookie(cookie)
    print("🍪 Cookies loaded.")

# Now open the actual listing page and give it a moment to render.
driver.get(URL)
time.sleep(2)
# =============================================================
# MAIN CODE
# =============================================================
# Try to close inline popup if present
# Best-effort: dismiss the site's inline popup when it is present.
try:
    close_btn = driver.find_element(By.XPATH, "//a[text()='CLOSE X']")
    close_btn.click()
    print("🧹 Popup closed.")
except Exception:
    # Popup not shown (or already gone) — ignore. A bare `except:` here
    # would also have swallowed SystemExit/KeyboardInterrupt.
    pass
with sync_playwright() as p:
# 1⃣ Launch browser
browser = p.chromium.launch(
headless=False,
args=[
"--disable-popup-blocking",
"--disable-background-networking",
"--disable-notifications",
"--no-default-browser-check",
"--no-first-run",
"--noerrdialogs",
"--disable-dev-shm-usage",
"--disable-features=IsolateOrigins,site-per-process",
"--no-sandbox",
]
)
# ============================================================
# 3) Extract table rows
# ============================================================
# 2⃣ Create context before any pages exist
context = browser.new_context()
rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
print("Total rows found:", len(rows))
# 3⃣ Block ALL third-party requests (ads, JS, popups, tracking)
def block_third_party(route, request):
    """Playwright route handler: allow only first-party sktorrent.eu requests.

    Requests to sktorrent.eu (or any of its subdomains) continue normally;
    everything else (ads, trackers, popup scripts) is aborted.
    """
    # Compare the parsed hostname, not the raw URL: a plain substring test
    # ("sktorrent.eu" in url) would also pass look-alike or query-string
    # matches such as "https://evil.com/?ref=sktorrent.eu".
    host = (urlparse.urlparse(request.url).hostname or "").lower()
    if host == "sktorrent.eu" or host.endswith(".sktorrent.eu"):
        route.continue_()
    else:
        print(f"🚫 Blocked third-party request: {request.url.lower()}")
        route.abort()
real_rows = []
for row in rows:
cells = row.find_elements(By.TAG_NAME, "td")
if len(cells) >= 5: # real torrent rows
real_rows.append(cells)
context.route("**/*", block_third_party)
print("Real data rows:", len(real_rows))
print("")
# 4⃣ Block ANY popup windows except the first page
# Every page the context ever creates, in creation order; index 0 is the
# real main page, anything after it is an unwanted popup.
pages = []


def on_new_page(new_page):
    """Keep the first page of the context; auto-close any later (popup) page."""
    pages.append(new_page)
    if len(pages) > 1:
        print("⚠️ Popup blocked (auto-closed).")
        new_page.close()
        return
    print("➡️ Main page created.")
# ============================================================
# 4) Function to extract all fields from one row
# ============================================================
context.on("page", on_new_page)
def parse_row(cells):
# 5⃣ Disable all popup JS functions (window.open, window.close, opener.close)
context.add_init_script("""
window.open = () => { console.log("Blocked window.open"); return null; };
window.close = () => { console.log("Blocked window.close"); };
# --------------------------
# 1⃣ CATEGORY
# --------------------------
category = cells[0].text.strip()
try {
if (window.opener) {
window.opener.close = () => { console.log("Blocked opener.close"); };
}
} catch (e) {}
# --------------------------
# 2⃣ TITLES + DETAILS LINK
# --------------------------
a_tag = cells[1].find_element(By.TAG_NAME, "a")
// Block <a target="_blank">
document.addEventListener('click', function(e) {
const el = e.target.closest('a[target="_blank"]');
if (el) {
e.preventDefault();
console.log("Blocked target=_blank");
}
}, true);
visible_name = a_tag.text.strip()
full_title = a_tag.get_attribute("title")
details_link = a_tag.get_attribute("href")
// Block middle-click opening a new tab
document.addEventListener('auxclick', function(e) {
e.preventDefault();
}, true);
""")
# --------------------------
# 3⃣ TORRENT HASH
# --------------------------
parsed = urlparse.urlparse(details_link)
query = urlparse.parse_qs(parsed.query)
# 6⃣ Create the FIRST page (main page)
page = context.new_page()
pages.append(page)
# skip rows without ?id=
if "id" not in query:
print("⚠️ Skipping row with no torrent ID →", details_link)
return None
# 7⃣ Load cookies (login)
load_cookies(context)
torrent_hash = query["id"][0]
# 8⃣ Navigate
print("🌍 Opening page...")
page.goto(URL)
# --------------------------
# 4⃣ TEXT BLOCK (size + date)
# --------------------------
text_block = cells[1].get_attribute("innerText")
text_block_clean = " ".join(text_block.split())
# Do NOT use networkidle on ad-heavy sites
page.wait_for_load_state("domcontentloaded")
page.wait_for_selector("table tr", timeout=15000)
# Remove popup/overlay elements created by SKTorrent
page.evaluate("""
const selectors = [
'#lightbox', '.lightbox', '#popup', '.popup',
'.overlay', '#overlay', '.modal', '#modal',
'div[style*="fixed"]', 'div[style*="position: fixed"]',
'table[style*="position: fixed"]',
'iframe', 'frame'
];
size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE)
added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE)
selectors.forEach(sel => {
document.querySelectorAll(sel).forEach(el => {
console.log("Removing popup element:", sel);
el.remove();
});
});
size_pretty = size_match.group(1) if size_match else None
added_pretty = added_match.group(1) if added_match else None
// Remove onclick handlers that trigger popups
document.querySelectorAll('*').forEach(el => {
el.onclick = null;
el.onauxclick = null;
el.oncontextmenu = null;
});
# Convert “18/11/2025 o 07:00” → “2025-11-18 07:00:00”
added_mysql = None
if added_pretty:
added_mysql = re.sub(r" o ", " ", added_pretty)
day, month, year_time = added_mysql.split("/")
year, time_part = year_time.split(" ")
added_mysql = f"{year}-{month}-{day} {time_part}:00"
// Remove timers that trigger delayed popups
window.setTimeout = () => {};
window.setInterval = () => {};
""")
# --------------------------
# 5⃣ IMAGE PREVIEW
# --------------------------
img_link = None
try:
image_a = cells[1].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
mouseover = image_a.get_attribute("onmouseover")
img_match = re.search(r"src=([^ ]+)", mouseover)
if img_match:
img_link = img_match.group(1).replace("'", "").strip()
if img_link.startswith("//"):
img_link = "https:" + img_link
except:
pass
print("✔ Page loaded, extracting table rows...")
# --------------------------
# 6⃣ SEEDERS
# --------------------------
seeders_a = cells[3].find_element(By.TAG_NAME, "a")
seeders_number = int(seeders_a.text.strip())
seeders_link = seeders_a.get_attribute("href")
# 9⃣ Extract all rows
rows = page.locator("table tr").all()
print(f"📄 Total rows found (including header): {len(rows)}")
# --------------------------
# 7⃣ LEECHERS
# --------------------------
leechers_a = cells[4].find_element(By.TAG_NAME, "a")
leechers_number = int(leechers_a.text.strip())
leechers_link = leechers_a.get_attribute("href")
# 🔟 Extract SECOND ROW only (your request)
if len(rows) > 1:
row = rows[1] # 0 = header, 1 = first data row
tds = row.locator("td")
# --------------------------
# Return dictionary for MySQL
# --------------------------
return {
"torrent_hash": torrent_hash,
"details_link": details_link,
"category": category,
"title_visible": visible_name,
"title_full": full_title,
"size_pretty": size_pretty,
"added_datetime": added_mysql,
"preview_image": img_link,
"seeders": seeders_number,
"seeders_link": seeders_link,
"leechers": leechers_number,
"leechers_link": leechers_link,
}
name = tds.nth(1).inner_text().strip()
size = tds.nth(2).inner_text().strip()
seeders = tds.nth(3).inner_text().strip()
leechers = tds.nth(4).inner_text().strip()
print("\n========= SECOND ROW =========")
print(f"Name: {name}")
print(f"Size: {size}")
print(f"Seeders: {seeders}")
print(f"Leechers: {leechers}")
print("==============================\n")
else:
print("❌ No data rows found!")
# ============================================================
# 5) MySQL INSERT
# ============================================================
page.wait_for_timeout(5000)
# ------------------------------------------------------------
# Upsert one row per torrent, keyed on torrent_hash: new rows are
# inserted, existing rows get their mutable fields refreshed.
# ------------------------------------------------------------
insert_sql = """
INSERT INTO torrents (
torrent_hash, details_link, category, title_visible, title_full,
size_pretty, added_datetime, preview_image,
seeders, seeders_link, leechers, leechers_link
) VALUES (
%(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
%(size_pretty)s, %(added_datetime)s, %(preview_image)s,
%(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s
)
ON DUPLICATE KEY UPDATE
details_link = VALUES(details_link),
category = VALUES(category),
title_visible = VALUES(title_visible),
title_full = VALUES(title_full),
size_pretty = VALUES(size_pretty),
added_datetime = VALUES(added_datetime),
preview_image = VALUES(preview_image),
seeders = VALUES(seeders),
seeders_link = VALUES(seeders_link),
leechers = VALUES(leechers),
leechers_link = VALUES(leechers_link);
"""

# Persist every parsed row; rows parse_row rejected (returned None) are
# skipped. try/finally guarantees the browser AND the DB handles are
# released even if an insert fails — the original leaked the cursor and
# connection, and skipped driver.quit() on any exception.
try:
    for cells in real_rows:
        data = parse_row(cells)
        if not data:
            continue
        print("💾 Saving:", data["title_visible"])
        cursor.execute(insert_sql, data)
    print("\n✅ DONE — All torrents saved to MySQL.")
finally:
    driver.quit()
    cursor.close()
    db.close()