vbnotebook

commit 7bc330beba (parent a764c9723e)
2025-11-18 07:22:17 +01:00
2 changed files with 412 additions and 134 deletions


@@ -1,171 +1,230 @@
Removed: the previous Playwright-based version of the script.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
from pathlib import Path
from playwright.sync_api import sync_playwright

# =============================================================
# CONFIGURATION
# =============================================================
COOKIE_FILE = Path("sktorrent_cookies.json")
URL = "https://sktorrent.eu/torrent/torrents.php?active=0"

def load_cookies(context):
    """Load saved cookies if available."""
    if COOKIE_FILE.exists():
        with open(COOKIE_FILE, "r") as f:
            cookies = json.load(f)
        context.add_cookies(cookies)
        print("🔄 Loaded login cookies.")
        return True
    print("❌ Cookie file not found. Run manual login first.")
    return False

# =============================================================
# MAIN CODE
# =============================================================
with sync_playwright() as p:
    # 1⃣ Launch browser
    browser = p.chromium.launch(
        headless=False,
        args=[
            "--disable-popup-blocking",
            "--disable-background-networking",
            "--disable-notifications",
            "--no-default-browser-check",
            "--no-first-run",
            "--noerrdialogs",
            "--disable-dev-shm-usage",
            "--disable-features=IsolateOrigins,site-per-process",
            "--no-sandbox",
        ]
    )

    # 2⃣ Create context before any pages exist
    context = browser.new_context()

    # 3⃣ Block ALL third-party requests (ads, JS, popups, tracking)
    def block_third_party(route, request):
        url = request.url.lower()
        if "sktorrent.eu" in url:
            route.continue_()
        else:
            print(f"🚫 Blocked third-party request: {url}")
            route.abort()

    context.route("**/*", block_third_party)

    # 4⃣ Block ANY popup windows except the first page
    pages = []

    def on_new_page(new_page):
        pages.append(new_page)
        if len(pages) == 1:
            print("➡️ Main page created.")
        else:
            print("⚠️ Popup blocked (auto-closed).")
            new_page.close()

    context.on("page", on_new_page)

    # 5⃣ Disable all popup JS functions (window.open, window.close, opener.close)
    context.add_init_script("""
        window.open = () => { console.log("Blocked window.open"); return null; };
        window.close = () => { console.log("Blocked window.close"); };
        try {
            if (window.opener) {
                window.opener.close = () => { console.log("Blocked opener.close"); };
            }
        } catch (e) {}

        // Block <a target="_blank">
        document.addEventListener('click', function(e) {
            const el = e.target.closest('a[target="_blank"]');
            if (el) {
                e.preventDefault();
                console.log("Blocked target=_blank");
            }
        }, true);

        // Block middle-click opening a new tab
        document.addEventListener('auxclick', function(e) {
            e.preventDefault();
        }, true);
    """)

    # 6⃣ Create the FIRST page (main page)
    page = context.new_page()
    pages.append(page)

    # 7⃣ Load cookies (login)
    load_cookies(context)

    # 8⃣ Navigate
    print("🌍 Opening page...")
    page.goto(URL)

    # Do NOT use networkidle on ad-heavy sites
    page.wait_for_load_state("domcontentloaded")
    page.wait_for_selector("table tr", timeout=15000)

    # Remove popup/overlay elements created by SKTorrent
    page.evaluate("""
        const selectors = [
            '#lightbox', '.lightbox', '#popup', '.popup',
            '.overlay', '#overlay', '.modal', '#modal',
            'div[style*="fixed"]', 'div[style*="position: fixed"]',
            'table[style*="position: fixed"]',
            'iframe', 'frame'
        ];
        selectors.forEach(sel => {
            document.querySelectorAll(sel).forEach(el => {
                console.log("Removing popup element:", sel);
                el.remove();
            });
        });

        // Remove onclick handlers that trigger popups
        document.querySelectorAll('*').forEach(el => {
            el.onclick = null;
            el.onauxclick = null;
            el.oncontextmenu = null;
        });

        // Remove timers that trigger delayed popups
        window.setTimeout = () => {};
        window.setInterval = () => {};
    """)

    print("✔ Page loaded, extracting table rows...")

    # 9⃣ Extract all rows
    rows = page.locator("table tr").all()
    print(f"📄 Total rows found (including header): {len(rows)}")

    # 🔟 Extract SECOND ROW only (your request)
    if len(rows) > 1:
        row = rows[1]  # 0 = header, 1 = first data row
        tds = row.locator("td")

        name = tds.nth(1).inner_text().strip()
        size = tds.nth(2).inner_text().strip()
        seeders = tds.nth(3).inner_text().strip()
        leechers = tds.nth(4).inner_text().strip()

        print("\n========= SECOND ROW =========")
        print(f"Name: {name}")
        print(f"Size: {size}")
        print(f"Seeders: {seeders}")
        print(f"Leechers: {leechers}")
        print("==============================\n")
    else:
        print("❌ No data rows found!")

    page.wait_for_timeout(5000)

Added: the new Selenium version, which parses every torrent row and saves it into MySQL.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
import re
import urllib.parse as urlparse
from pathlib import Path
import json

# ============================================================
# 1) MySQL CONNECTION
# ============================================================
db = pymysql.connect(
    host="192.168.1.76",
    port=3307,
    user="root",
    password="Vlado9674+",
    database="torrents",
    charset="utf8mb4",
    autocommit=True
)
cursor = db.cursor()

# ============================================================
# 2) Selenium setup
# ============================================================
COOKIE_FILE = Path("sktorrent_cookies.json")
URL = "https://sktorrent.eu/torrent/torrents.php?active=0"

chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--disable-extensions")

driver = webdriver.Chrome(options=chrome_options)
driver.get("https://sktorrent.eu")

# Load cookies
if COOKIE_FILE.exists():
    with open(COOKIE_FILE, "r") as f:
        cookies = json.load(f)
    for c in cookies:
        driver.add_cookie(c)
    print("🍪 Cookies loaded.")

driver.get(URL)
time.sleep(2)

# Try to close inline popup if present
try:
    close_btn = driver.find_element(By.XPATH, "//a[text()='CLOSE X']")
    close_btn.click()
    print("🧹 Popup closed.")
except:
    pass

# ============================================================
# 3) Extract table rows
# ============================================================
rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
print("Total rows found:", len(rows))

real_rows = []
for row in rows:
    cells = row.find_elements(By.TAG_NAME, "td")
    if len(cells) >= 5:  # real torrent rows
        real_rows.append(cells)

print("Real data rows:", len(real_rows))
print("")

# ============================================================
# 4) Function to extract all fields from one row
# ============================================================
def parse_row(cells):
    # --------------------------
    # 1⃣ CATEGORY
    # --------------------------
    category = cells[0].text.strip()

    # --------------------------
    # 2⃣ TITLES + DETAILS LINK
    # --------------------------
    a_tag = cells[1].find_element(By.TAG_NAME, "a")
    visible_name = a_tag.text.strip()
    full_title = a_tag.get_attribute("title")
    details_link = a_tag.get_attribute("href")

    # --------------------------
    # 3⃣ TORRENT HASH
    # --------------------------
    parsed = urlparse.urlparse(details_link)
    query = urlparse.parse_qs(parsed.query)
    # skip rows without ?id=
    if "id" not in query:
        print("⚠️ Skipping row with no torrent ID →", details_link)
        return None
    torrent_hash = query["id"][0]

    # --------------------------
    # 4⃣ TEXT BLOCK (size + date)
    # --------------------------
    text_block = cells[1].get_attribute("innerText")
    text_block_clean = " ".join(text_block.split())
    size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE)
    added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE)
    size_pretty = size_match.group(1) if size_match else None
    added_pretty = added_match.group(1) if added_match else None

    # Convert “18/11/2025 o 07:00” → “2025-11-18 07:00:00”
    added_mysql = None
    if added_pretty:
        added_mysql = re.sub(r" o ", " ", added_pretty)
        day, month, year_time = added_mysql.split("/")
        year, time_part = year_time.split(" ")
        added_mysql = f"{year}-{month}-{day} {time_part}:00"

    # --------------------------
    # 5⃣ IMAGE PREVIEW
    # --------------------------
    img_link = None
    try:
        image_a = cells[1].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
        mouseover = image_a.get_attribute("onmouseover")
        img_match = re.search(r"src=([^ ]+)", mouseover)
        if img_match:
            img_link = img_match.group(1).replace("'", "").strip()
            if img_link.startswith("//"):
                img_link = "https:" + img_link
    except:
        pass

    # --------------------------
    # 6⃣ SEEDERS
    # --------------------------
    seeders_a = cells[3].find_element(By.TAG_NAME, "a")
    seeders_number = int(seeders_a.text.strip())
    seeders_link = seeders_a.get_attribute("href")

    # --------------------------
    # 7⃣ LEECHERS
    # --------------------------
    leechers_a = cells[4].find_element(By.TAG_NAME, "a")
    leechers_number = int(leechers_a.text.strip())
    leechers_link = leechers_a.get_attribute("href")

    # --------------------------
    # Return dictionary for MySQL
    # --------------------------
    return {
        "torrent_hash": torrent_hash,
        "details_link": details_link,
        "category": category,
        "title_visible": visible_name,
        "title_full": full_title,
        "size_pretty": size_pretty,
        "added_datetime": added_mysql,
        "preview_image": img_link,
        "seeders": seeders_number,
        "seeders_link": seeders_link,
        "leechers": leechers_number,
        "leechers_link": leechers_link,
    }

# ============================================================
# 5) MySQL INSERT
# ============================================================
insert_sql = """
INSERT INTO torrents (
    torrent_hash, details_link, category, title_visible, title_full,
    size_pretty, added_datetime, preview_image,
    seeders, seeders_link, leechers, leechers_link
) VALUES (
    %(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
    %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
    %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s
)
ON DUPLICATE KEY UPDATE
    details_link = VALUES(details_link),
    category = VALUES(category),
    title_visible = VALUES(title_visible),
    title_full = VALUES(title_full),
    size_pretty = VALUES(size_pretty),
    added_datetime = VALUES(added_datetime),
    preview_image = VALUES(preview_image),
    seeders = VALUES(seeders),
    seeders_link = VALUES(seeders_link),
    leechers = VALUES(leechers),
    leechers_link = VALUES(leechers_link);
"""

# ============================================================
# 6) PROCESS ALL REAL ROWS
# ============================================================
for cells in real_rows:
    data = parse_row(cells)
    if not data:
        continue
    print("💾 Saving:", data["title_visible"])
    cursor.execute(insert_sql, data)

print("\n✅ DONE — All torrents saved to MySQL.")
driver.quit()
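Neither script creates the torrents table it writes into, and both depend on torrent_hash carrying a unique index: without one, the ON DUPLICATE KEY UPDATE above never triggers and the INSERT IGNORE in ParseviaRequests.py below simply keeps inserting duplicates. A minimal sketch of a compatible schema as a one-off pymysql helper; the column types, lengths, and the choice of torrent_hash as primary key are assumptions, not taken from the actual database:

import pymysql

# Hypothetical one-off helper, not part of this commit.
# Column types are guesses; only the unique index on torrent_hash is implied
# by ON DUPLICATE KEY UPDATE / INSERT IGNORE in the two scripts, and CHAR(40)
# mirrors the 40-hex id regex used in ParseviaRequests.py.
DDL = """
CREATE TABLE IF NOT EXISTS torrents (
    torrent_hash   CHAR(40) NOT NULL PRIMARY KEY,
    details_link   VARCHAR(512),
    category       VARCHAR(128),
    title_visible  VARCHAR(512),
    title_full     VARCHAR(1024),
    size_pretty    VARCHAR(32),
    added_datetime DATETIME,
    preview_image  VARCHAR(512),
    seeders        INT,
    seeders_link   VARCHAR(512),
    leechers       INT,
    leechers_link  VARCHAR(512)
) CHARACTER SET utf8mb4
"""

db = pymysql.connect(host="192.168.1.76", port=3307, user="root",
                     password="Vlado9674+", database="torrents",
                     charset="utf8mb4", autocommit=True)
with db.cursor() as cur:
    cur.execute(DDL)
db.close()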

ParseviaRequests.py (new file, 219 lines added)

@@ -0,0 +1,219 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import requests
from bs4 import BeautifulSoup
import pymysql
from datetime import datetime
# ==============================
# CONFIG
# ==============================
BASE_URL = "https://sktorrent.eu/torrent/torrents_v2.php?active=0"
COOKIES_FILE = "sktorrent_cookies.json"  # exported cookies in Netscape cookies.txt format (despite the .json name; see the note after the script for a JSON alternative)
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
HEADERS = {"User-Agent": USER_AGENT}

DB_CFG = {
    "host": "192.168.1.76",
    "port": 3307,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "cursorclass": pymysql.cursors.DictCursor,
}
# ==============================
# COOKIE LOADER
# ==============================
def load_cookies(path):
    cookies = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("#") or "\t" not in line:
                continue
            parts = line.strip().split("\t")
            if len(parts) >= 7:
                cookies[parts[5]] = parts[6]
    print(f"🍪 Loaded {len(cookies)} cookies.")
    return cookies
# ==============================
# MYSQL INSERT
# ==============================
def insert_torrent(db, t):
    sql = """
        INSERT IGNORE INTO torrents (
            category,
            title_visible,
            title_full,
            size_pretty,
            added_datetime,
            seeders,
            seeders_link,
            leechers,
            leechers_link,
            preview_image,
            details_link,
            torrent_hash
        ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    with db.cursor() as cur:
        cur.execute(sql, (
            t["category"],
            t["title_visible"],
            t["title_full"],
            t["size_pretty"],
            t["added_datetime"],
            t["seeders"],
            t["seeders_link"],
            t["leechers"],
            t["leechers_link"],
            t["preview_image"],
            t["details_link"],
            t["torrent_hash"],
        ))
    db.commit()
# ==============================
# PARSER
# ==============================
def parse_torrent_row(cols):
    """Parse a <tr> with exactly the structure of a torrent row."""
    # --- category ---
    category = cols[0].get_text(strip=True)

    # --- download link (ignore) ---
    # second <td> is download.gif

    # --- main column ---
    main_td = cols[2]
    a_title = main_td.find("a", href=re.compile("details.php"))
    if not a_title:
        return None
    title_visible = a_title.get_text(strip=True)
    title_full = a_title.get("title", "").strip()
    details_link = "https://sktorrent.eu/torrent/" + a_title.get("href")

    # Extract torrent hash from ?id=.....
    m = re.search(r"id=([A-Fa-f0-9]{40})", a_title.get("href"))
    if not m:
        return None
    torrent_hash = m.group(1)

    # Extract size + added date from the text below <br>
    text = main_td.get_text(" ", strip=True)
    # example: "GR ... Velkost 1.7 GB | Pridany 18/11/2025 o 07:00"
    size_match = re.search(r"Velkost ([\d\.]+ ?[GMK]B)", text)
    date_match = re.search(r"Pridany (\d{2}/\d{2}/\d{4}) o (\d{2}:\d{2})", text)
    size_pretty = size_match.group(1) if size_match else None
    added_datetime = None
    if date_match:
        d, t = date_match.groups()
        added_datetime = datetime.strptime(d + " " + t, "%d/%m/%Y %H:%M")

    # Extract preview img from onmouseover
    img = None
    img_a = main_td.find("a", onmouseover=True)
    if img_a:
        html = img_a.get("onmouseover", "")
        m2 = re.search(r"img src=//([^ ]+)", html)
        if m2:
            img = "https://" + m2.group(1)

    # --- seeders ---
    seed_a = cols[4].find("a")
    seeders = int(seed_a.get_text(strip=True)) if seed_a else 0
    seeders_link = "https://sktorrent.eu/torrent/" + seed_a.get("href") if seed_a else None

    # --- leechers ---
    leech_a = cols[5].find("a")
    leechers = int(leech_a.get_text(strip=True)) if leech_a else 0
    leechers_link = "https://sktorrent.eu/torrent/" + leech_a.get("href") if leech_a else None

    return {
        "category": category,
        "title_visible": title_visible,
        "title_full": title_full,
        "size_pretty": size_pretty,
        "added_datetime": added_datetime,
        "seeders": seeders,
        "seeders_link": seeders_link,
        "leechers": leechers,
        "leechers_link": leechers_link,
        "preview_image": img,
        "details_link": details_link,
        "torrent_hash": torrent_hash,
    }
# ==============================
# MAIN
# ==============================
def main():
    cookies = load_cookies(COOKIES_FILE)
    session = requests.Session()
    session.headers.update(HEADERS)
    session.cookies.update(cookies)

    print("🌍 Downloading HTML...")
    r = session.get(BASE_URL, timeout=30)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "html.parser")
    tbody = soup.find("tbody")
    if not tbody:
        print("❌ Could not find <tbody>")
        return
    rows = tbody.find_all("tr")
    print(f"Found {len(rows)} <tr> rows.")

    db = pymysql.connect(**DB_CFG)
    inserted = 0
    skipped = 0
    for tr in rows:
        cols = tr.find_all("td")
        if len(cols) != 7:
            continue  # ignore header & separator rows
        data = parse_torrent_row(cols)
        if not data:
            skipped += 1
            continue
        insert_torrent(db, data)
        inserted += 1
        print(f"✔ Inserted {data['torrent_hash']}")

    print("\n===== DONE =====")
    print(f"Inserted: {inserted}")
    print(f"Skipped: {skipped}")


if __name__ == "__main__":
    main()
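One detail worth flagging: COOKIES_FILE points at sktorrent_cookies.json, but load_cookies() parses a tab-separated Netscape cookies.txt export. If the file is actually the JSON dump used by the Selenium script above (assumed here to be a list of dicts with "name" and "value" keys, the shape Selenium's get_cookies() returns), a drop-in replacement loader could look like this sketch:

import json

def load_cookies_json(path):
    """Hypothetical alternative to load_cookies(): read a Selenium-style JSON
    cookie dump (a list of {"name": ..., "value": ...} dicts) and return the
    same {name: value} mapping that session.cookies.update() expects."""
    with open(path, "r", encoding="utf-8") as f:
        raw = json.load(f)
    cookies = {c["name"]: c["value"] for c in raw}
    print(f"🍪 Loaded {len(cookies)} cookies from JSON.")
    return cookies

Since requests accepts a plain name-to-value dict, main() would not need any other change.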