vbnotebook

2025-11-18 07:22:17 +01:00
parent a764c9723e
commit 7bc330beba
2 changed files with 412 additions and 134 deletions


@@ -1,171 +1,230 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-import json
+import pymysql
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.options import Options
+import time
+import re
+import urllib.parse as urlparse
 from pathlib import Path
-from playwright.sync_api import sync_playwright
-
-# =============================================================
-# CONFIGURATION
-# =============================================================
+import json
+
+# ============================================================
+# 1) MySQL CONNECTION
+# ============================================================
+db = pymysql.connect(
+    host="192.168.1.76",
+    port=3307,
+    user="root",
+    password="Vlado9674+",
+    database="torrents",
+    charset="utf8mb4",
+    autocommit=True
+)
+cursor = db.cursor()
+
+# ============================================================
+# 2) Selenium setup
+# ============================================================
 COOKIE_FILE = Path("sktorrent_cookies.json")
 URL = "https://sktorrent.eu/torrent/torrents.php?active=0"
-
-def load_cookies(context):
-    """Load saved cookies if available."""
-    if COOKIE_FILE.exists():
-        with open(COOKIE_FILE, "r") as f:
-            cookies = json.load(f)
-        context.add_cookies(cookies)
-        print("🔄 Loaded login cookies.")
-        return True
-    print("❌ Cookie file not found. Run manual login first.")
-    return False
-
-# =============================================================
-# MAIN CODE
-# =============================================================
-with sync_playwright() as p:
-    # 1⃣ Launch browser
-    browser = p.chromium.launch(
-        headless=False,
-        args=[
-            "--disable-popup-blocking",
-            "--disable-background-networking",
-            "--disable-notifications",
-            "--no-default-browser-check",
-            "--no-first-run",
-            "--noerrdialogs",
-            "--disable-dev-shm-usage",
-            "--disable-features=IsolateOrigins,site-per-process",
-            "--no-sandbox",
-        ]
-    )
-
-    # 2⃣ Create context before any pages exist
-    context = browser.new_context()
-
-    # 3⃣ Block ALL third-party requests (ads, JS, popups, tracking)
-    def block_third_party(route, request):
-        url = request.url.lower()
-        if "sktorrent.eu" in url:
-            route.continue_()
-        else:
-            print(f"🚫 Blocked third-party request: {url}")
-            route.abort()
-
-    context.route("**/*", block_third_party)
-
-    # 4⃣ Block ANY popup windows except the first page
-    pages = []
-
-    def on_new_page(new_page):
-        pages.append(new_page)
-        if len(pages) == 1:
-            print("➡️ Main page created.")
-        else:
-            print("⚠️ Popup blocked (auto-closed).")
-            new_page.close()
-
-    context.on("page", on_new_page)
-
-    # 5⃣ Disable all popup JS functions (window.open, window.close, opener.close)
-    context.add_init_script("""
-        window.open = () => { console.log("Blocked window.open"); return null; };
-        window.close = () => { console.log("Blocked window.close"); };
-        try {
-            if (window.opener) {
-                window.opener.close = () => { console.log("Blocked opener.close"); };
-            }
-        } catch (e) {}
-
-        // Block <a target="_blank">
-        document.addEventListener('click', function(e) {
-            const el = e.target.closest('a[target="_blank"]');
-            if (el) {
-                e.preventDefault();
-                console.log("Blocked target=_blank");
-            }
-        }, true);
-
-        // Block middle-click opening a new tab
-        document.addEventListener('auxclick', function(e) {
-            e.preventDefault();
-        }, true);
-    """)
-
-    # 6⃣ Create the FIRST page (main page)
-    page = context.new_page()
-    pages.append(page)
-
-    # 7⃣ Load cookies (login)
-    load_cookies(context)
-
-    # 8⃣ Navigate
-    print("🌍 Opening page...")
-    page.goto(URL)
-
-    # Do NOT use networkidle on ad-heavy sites
-    page.wait_for_load_state("domcontentloaded")
-    page.wait_for_selector("table tr", timeout=15000)
-
-    # Remove popup/overlay elements created by SKTorrent
-    page.evaluate("""
-        const selectors = [
-            '#lightbox', '.lightbox', '#popup', '.popup',
-            '.overlay', '#overlay', '.modal', '#modal',
-            'div[style*="fixed"]', 'div[style*="position: fixed"]',
-            'table[style*="position: fixed"]',
-            'iframe', 'frame'
-        ];
-        selectors.forEach(sel => {
-            document.querySelectorAll(sel).forEach(el => {
-                console.log("Removing popup element:", sel);
-                el.remove();
-            });
-        });
-
-        // Remove onclick handlers that trigger popups
-        document.querySelectorAll('*').forEach(el => {
-            el.onclick = null;
-            el.onauxclick = null;
-            el.oncontextmenu = null;
-        });
-
-        // Remove timers that trigger delayed popups
-        window.setTimeout = () => {};
-        window.setInterval = () => {};
-    """)
-
-    print("✔ Page loaded, extracting table rows...")
-
-    # 9⃣ Extract all rows
-    rows = page.locator("table tr").all()
-    print(f"📄 Total rows found (including header): {len(rows)}")
-
-    # 🔟 Extract SECOND ROW only (your request)
-    if len(rows) > 1:
-        row = rows[1]  # 0 = header, 1 = first data row
-        tds = row.locator("td")
-        name = tds.nth(1).inner_text().strip()
-        size = tds.nth(2).inner_text().strip()
-        seeders = tds.nth(3).inner_text().strip()
-        leechers = tds.nth(4).inner_text().strip()
-        print("\n========= SECOND ROW =========")
-        print(f"Name: {name}")
-        print(f"Size: {size}")
-        print(f"Seeders: {seeders}")
-        print(f"Leechers: {leechers}")
-        print("==============================\n")
-    else:
-        print("❌ No data rows found!")
-
-    page.wait_for_timeout(5000)
+
+chrome_options = Options()
+chrome_options.add_argument("--start-maximized")
+chrome_options.add_argument("--disable-notifications")
+chrome_options.add_argument("--disable-popup-blocking")
+chrome_options.add_argument("--disable-extensions")
+
+driver = webdriver.Chrome(options=chrome_options)
+driver.get("https://sktorrent.eu")
+
+# Load cookies
+if COOKIE_FILE.exists():
+    with open(COOKIE_FILE, "r") as f:
+        cookies = json.load(f)
+    for c in cookies:
+        driver.add_cookie(c)
+    print("🍪 Cookies loaded.")
+
+driver.get(URL)
+time.sleep(2)
+
+# Try to close inline popup if present
+try:
+    close_btn = driver.find_element(By.XPATH, "//a[text()='CLOSE X']")
+    close_btn.click()
+    print("🧹 Popup closed.")
+except:
+    pass
+
+# ============================================================
+# 3) Extract table rows
+# ============================================================
+rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
+print("Total rows found:", len(rows))
+
+real_rows = []
+for row in rows:
+    cells = row.find_elements(By.TAG_NAME, "td")
+    if len(cells) >= 5:  # real torrent rows
+        real_rows.append(cells)
+
+print("Real data rows:", len(real_rows))
+print("")
+
+# ============================================================
+# 4) Function to extract all fields from one row
+# ============================================================
+def parse_row(cells):
+    # --------------------------
+    # 1⃣ CATEGORY
+    # --------------------------
+    category = cells[0].text.strip()
+
+    # --------------------------
+    # 2⃣ TITLES + DETAILS LINK
+    # --------------------------
+    a_tag = cells[1].find_element(By.TAG_NAME, "a")
+    visible_name = a_tag.text.strip()
+    full_title = a_tag.get_attribute("title")
+    details_link = a_tag.get_attribute("href")
+
+    # --------------------------
+    # 3⃣ TORRENT HASH
+    # --------------------------
+    parsed = urlparse.urlparse(details_link)
+    query = urlparse.parse_qs(parsed.query)
+
+    # skip rows without ?id=
+    if "id" not in query:
+        print("⚠️ Skipping row with no torrent ID →", details_link)
+        return None
+    torrent_hash = query["id"][0]
+
+    # --------------------------
+    # 4⃣ TEXT BLOCK (size + date)
+    # --------------------------
+    text_block = cells[1].get_attribute("innerText")
+    text_block_clean = " ".join(text_block.split())
+    size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE)
+    added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE)
+    size_pretty = size_match.group(1) if size_match else None
+    added_pretty = added_match.group(1) if added_match else None
+
+    # Convert “18/11/2025 o 07:00” → “2025-11-18 07:00:00”
+    added_mysql = None
+    if added_pretty:
+        added_mysql = re.sub(r" o ", " ", added_pretty)
+        day, month, year_time = added_mysql.split("/")
+        year, time_part = year_time.split(" ")
+        added_mysql = f"{year}-{month}-{day} {time_part}:00"
+
+    # --------------------------
+    # 5⃣ IMAGE PREVIEW
+    # --------------------------
+    img_link = None
+    try:
+        image_a = cells[1].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
+        mouseover = image_a.get_attribute("onmouseover")
+        img_match = re.search(r"src=([^ ]+)", mouseover)
+        if img_match:
+            img_link = img_match.group(1).replace("'", "").strip()
+            if img_link.startswith("//"):
+                img_link = "https:" + img_link
+    except:
+        pass
+
+    # --------------------------
+    # 6⃣ SEEDERS
+    # --------------------------
+    seeders_a = cells[3].find_element(By.TAG_NAME, "a")
+    seeders_number = int(seeders_a.text.strip())
+    seeders_link = seeders_a.get_attribute("href")
+
+    # --------------------------
+    # 7⃣ LEECHERS
+    # --------------------------
+    leechers_a = cells[4].find_element(By.TAG_NAME, "a")
+    leechers_number = int(leechers_a.text.strip())
+    leechers_link = leechers_a.get_attribute("href")
+
+    # --------------------------
+    # Return dictionary for MySQL
+    # --------------------------
+    return {
+        "torrent_hash": torrent_hash,
+        "details_link": details_link,
+        "category": category,
+        "title_visible": visible_name,
+        "title_full": full_title,
+        "size_pretty": size_pretty,
+        "added_datetime": added_mysql,
+        "preview_image": img_link,
+        "seeders": seeders_number,
+        "seeders_link": seeders_link,
+        "leechers": leechers_number,
+        "leechers_link": leechers_link,
+    }
+
+# ============================================================
+# 5) MySQL INSERT
+# ============================================================
+insert_sql = """
+INSERT INTO torrents (
+    torrent_hash, details_link, category, title_visible, title_full,
+    size_pretty, added_datetime, preview_image,
+    seeders, seeders_link, leechers, leechers_link
+) VALUES (
+    %(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
+    %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
+    %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s
+)
+ON DUPLICATE KEY UPDATE
+    details_link = VALUES(details_link),
+    category = VALUES(category),
+    title_visible = VALUES(title_visible),
+    title_full = VALUES(title_full),
+    size_pretty = VALUES(size_pretty),
+    added_datetime = VALUES(added_datetime),
+    preview_image = VALUES(preview_image),
+    seeders = VALUES(seeders),
+    seeders_link = VALUES(seeders_link),
+    leechers = VALUES(leechers),
+    leechers_link = VALUES(leechers_link);
+"""
+
+# ============================================================
+# 6) PROCESS ALL REAL ROWS
+# ============================================================
+for cells in real_rows:
+    data = parse_row(cells)
+    if not data:
+        continue
+    print("💾 Saving:", data["title_visible"])
+    cursor.execute(insert_sql, data)
+
+print("\n✅ DONE — All torrents saved to MySQL.")
+driver.quit()
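
Neither version of the script creates the torrents table it writes to; the commit assumes it already exists. The column list in the INSERT above, together with the ON DUPLICATE KEY UPDATE clause (and the INSERT IGNORE in ParseviaRequests.py below), implies a unique key on torrent_hash. A minimal sketch of a compatible schema, assuming column types and lengths that fit the parsed values (types, lengths, and the index name are guesses, not part of this commit):

#!/usr/bin/env python3
# Hypothetical one-off setup script: creates a `torrents` table compatible
# with both scripts in this commit. All types/lengths are assumptions.
import pymysql

SCHEMA = """
CREATE TABLE IF NOT EXISTS torrents (
    torrent_hash   CHAR(40) NOT NULL,   -- 40-char id taken from ?id=...
    details_link   VARCHAR(512),
    category       VARCHAR(128),
    title_visible  VARCHAR(512),
    title_full     VARCHAR(1024),
    size_pretty    VARCHAR(32),         -- e.g. "1.7 GB"
    added_datetime DATETIME,
    preview_image  VARCHAR(512),
    seeders        INT,
    seeders_link   VARCHAR(512),
    leechers       INT,
    leechers_link  VARCHAR(512),
    UNIQUE KEY uq_torrent_hash (torrent_hash)  -- required for ON DUPLICATE KEY UPDATE / INSERT IGNORE to deduplicate
) CHARACTER SET utf8mb4;
"""

db = pymysql.connect(host="192.168.1.76", port=3307, user="root",
                     password="Vlado9674+", database="torrents",
                     charset="utf8mb4", autocommit=True)
with db.cursor() as cur:
    cur.execute(SCHEMA)
db.close()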

ParseviaRequests.py Normal file

@@ -0,0 +1,219 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import requests
from bs4 import BeautifulSoup
import pymysql
from datetime import datetime

# ==============================
# CONFIG
# ==============================
BASE_URL = "https://sktorrent.eu/torrent/torrents_v2.php?active=0"
COOKIES_FILE = "sktorrent_cookies.json"  # Your exported cookies.txt (Netscape format)
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
HEADERS = {"User-Agent": USER_AGENT}

DB_CFG = {
    "host": "192.168.1.76",
    "port": 3307,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "cursorclass": pymysql.cursors.DictCursor,
}

# ==============================
# COOKIE LOADER
# ==============================
def load_cookies(path):
    cookies = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("#") or "\t" not in line:
                continue
            parts = line.strip().split("\t")
            if len(parts) >= 7:
                cookies[parts[5]] = parts[6]
    print(f"🍪 Loaded {len(cookies)} cookies.")
    return cookies

# ==============================
# MYSQL INSERT
# ==============================
def insert_torrent(db, t):
    sql = """
        INSERT IGNORE INTO torrents (
            category,
            title_visible,
            title_full,
            size_pretty,
            added_datetime,
            seeders,
            seeders_link,
            leechers,
            leechers_link,
            preview_image,
            details_link,
            torrent_hash
        ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    with db.cursor() as cur:
        cur.execute(sql, (
            t["category"],
            t["title_visible"],
            t["title_full"],
            t["size_pretty"],
            t["added_datetime"],
            t["seeders"],
            t["seeders_link"],
            t["leechers"],
            t["leechers_link"],
            t["preview_image"],
            t["details_link"],
            t["torrent_hash"],
        ))
    db.commit()

# ==============================
# PARSER
# ==============================
def parse_torrent_row(cols):
    """Parse a <tr> with exactly the structure of a torrent row."""
    # --- category ---
    category = cols[0].get_text(strip=True)

    # --- download link (ignore) ---
    # second <td> is download.gif

    # --- main column ---
    main_td = cols[2]
    a_title = main_td.find("a", href=re.compile("details.php"))
    if not a_title:
        return None
    title_visible = a_title.get_text(strip=True)
    title_full = a_title.get("title", "").strip()
    details_link = "https://sktorrent.eu/torrent/" + a_title.get("href")

    # Extract torrent hash from ?id=.....
    m = re.search(r"id=([A-Fa-f0-9]{40})", a_title.get("href"))
    if not m:
        return None
    torrent_hash = m.group(1)

    # Extract size + added date from the text below <br>
    text = main_td.get_text(" ", strip=True)
    # example: "GR ... Velkost 1.7 GB | Pridany 18/11/2025 o 07:00"
    size_match = re.search(r"Velkost ([\d\.]+ ?[GMK]B)", text)
    date_match = re.search(r"Pridany (\d{2}/\d{2}/\d{4}) o (\d{2}:\d{2})", text)
    size_pretty = size_match.group(1) if size_match else None
    added_datetime = None
    if date_match:
        d, t = date_match.groups()
        added_datetime = datetime.strptime(d + " " + t, "%d/%m/%Y %H:%M")

    # Extract preview img from onmouseover
    img = None
    img_a = main_td.find("a", onmouseover=True)
    if img_a:
        html = img_a.get("onmouseover", "")
        m2 = re.search(r"img src=//([^ ]+)", html)
        if m2:
            img = "https://" + m2.group(1)

    # --- seeders ---
    seed_a = cols[4].find("a")
    seeders = int(seed_a.get_text(strip=True)) if seed_a else 0
    seeders_link = "https://sktorrent.eu/torrent/" + seed_a.get("href") if seed_a else None

    # --- leechers ---
    leech_a = cols[5].find("a")
    leechers = int(leech_a.get_text(strip=True)) if leech_a else 0
    leechers_link = "https://sktorrent.eu/torrent/" + leech_a.get("href") if leech_a else None

    return {
        "category": category,
        "title_visible": title_visible,
        "title_full": title_full,
        "size_pretty": size_pretty,
        "added_datetime": added_datetime,
        "seeders": seeders,
        "seeders_link": seeders_link,
        "leechers": leechers,
        "leechers_link": leechers_link,
        "preview_image": img,
        "details_link": details_link,
        "torrent_hash": torrent_hash,
    }

# ==============================
# MAIN
# ==============================
def main():
    cookies = load_cookies(COOKIES_FILE)
    session = requests.Session()
    session.headers.update(HEADERS)
    session.cookies.update(cookies)

    print("🌍 Downloading HTML...")
    r = session.get(BASE_URL, timeout=30)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "html.parser")
    tbody = soup.find("tbody")
    if not tbody:
        print("❌ Could not find <tbody>")
        return

    rows = tbody.find_all("tr")
    print(f"Found {len(rows)} <tr> rows.")

    db = pymysql.connect(**DB_CFG)
    inserted = 0
    skipped = 0

    for tr in rows:
        cols = tr.find_all("td")
        if len(cols) != 7:
            continue  # ignore header & separator rows
        data = parse_torrent_row(cols)
        if not data:
            skipped += 1
            continue
        insert_torrent(db, data)
        inserted += 1
        print(f"✔ Inserted {data['torrent_hash']}")

    print("\n===== DONE =====")
    print(f"Inserted: {inserted}")
    print(f"Skipped: {skipped}")

if __name__ == "__main__":
    main()
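
One inconsistency worth noting: COOKIES_FILE points at sktorrent_cookies.json, the same file the Selenium script reads with json.load() as a list of cookie dicts, but load_cookies() here parses Netscape cookies.txt lines (tab-separated, name in field 6, value in field 7). If the file on disk actually holds the Selenium-style JSON export, a loader along these lines would be needed instead (a sketch, assuming the usual "name"/"value" keys produced by Selenium's get_cookies(); not part of this commit):

import json

def load_cookies_json(path):
    # Hypothetical replacement for load_cookies(): reads a Selenium-exported
    # JSON cookie file (a list of {"name": ..., "value": ...} dicts) into the
    # plain name->value dict that session.cookies.update() expects.
    with open(path, "r", encoding="utf-8") as f:
        return {c["name"]: c["value"] for c in json.load(f)}

# usage: session.cookies.update(load_cookies_json(COOKIES_FILE))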