vbnotebook

2025-11-18 07:22:17 +01:00
parent a764c9723e
commit 7bc330beba
2 changed files with 412 additions and 134 deletions


@@ -1,171 +1,230 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import re
import time
import urllib.parse as urlparse
from pathlib import Path

import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

# ============================================================
# 1) MySQL CONNECTION
# ============================================================
db = pymysql.connect(
    host="192.168.1.76",
    port=3307,
    user="root",
    password="Vlado9674+",
    database="torrents",
    charset="utf8mb4",
    autocommit=True
)
cursor = db.cursor()

# ============================================================
# 2) Selenium setup
# ============================================================
COOKIE_FILE = Path("sktorrent_cookies.json")
URL = "https://sktorrent.eu/torrent/torrents.php?active=0"

chrome_options = Options()
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-notifications")
chrome_options.add_argument("--disable-popup-blocking")
chrome_options.add_argument("--disable-extensions")

driver = webdriver.Chrome(options=chrome_options)

# Cookies can only be added for the domain that is currently open, so load the site first
driver.get("https://sktorrent.eu")

# Load saved login cookies
if COOKIE_FILE.exists():
    with open(COOKIE_FILE, "r") as f:
        cookies = json.load(f)
    for c in cookies:
        driver.add_cookie(c)
    print("🍪 Cookies loaded.")

driver.get(URL)
time.sleep(2)

# Try to close the inline popup if present
try:
    close_btn = driver.find_element(By.XPATH, "//a[text()='CLOSE X']")
    close_btn.click()
    print("🧹 Popup closed.")
except Exception:
    pass

# ============================================================
# 3) Extract table rows
# ============================================================
rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
print("Total rows found:", len(rows))

real_rows = []
for row in rows:
    cells = row.find_elements(By.TAG_NAME, "td")
    if len(cells) >= 5:  # real torrent rows have at least 5 cells
        real_rows.append(cells)

print("Real data rows:", len(real_rows))
print("")
# ============================================================
# 4) Function to extract all fields from one row
# ============================================================
def parse_row(cells):
    # --------------------------
    # 1⃣ CATEGORY
    # --------------------------
    category = cells[0].text.strip()

    # --------------------------
    # 2⃣ TITLES + DETAILS LINK
    # --------------------------
    a_tag = cells[1].find_element(By.TAG_NAME, "a")
    visible_name = a_tag.text.strip()
    full_title = a_tag.get_attribute("title")
    details_link = a_tag.get_attribute("href")

    # --------------------------
    # 3⃣ TORRENT HASH
    # --------------------------
    parsed = urlparse.urlparse(details_link)
    query = urlparse.parse_qs(parsed.query)

    # skip rows without ?id=
    if "id" not in query:
        print("⚠️ Skipping row with no torrent ID →", details_link)
        return None
    torrent_hash = query["id"][0]

    # --------------------------
    # 4⃣ TEXT BLOCK (size + date)
    # --------------------------
    text_block = cells[1].get_attribute("innerText")
    text_block_clean = " ".join(text_block.split())
    size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE)
    added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE)
    size_pretty = size_match.group(1) if size_match else None
    added_pretty = added_match.group(1) if added_match else None

    # Convert “18/11/2025 o 07:00” → “2025-11-18 07:00:00”
    added_mysql = None
    if added_pretty:
        added_mysql = re.sub(r" o ", " ", added_pretty)
        day, month, year_time = added_mysql.split("/")
        year, time_part = year_time.split(" ")
        added_mysql = f"{year}-{month}-{day} {time_part}:00"

    # --------------------------
    # 5⃣ IMAGE PREVIEW
    # --------------------------
    img_link = None
    try:
        image_a = cells[1].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
        mouseover = image_a.get_attribute("onmouseover")
        img_match = re.search(r"src=([^ ]+)", mouseover)
        if img_match:
            img_link = img_match.group(1).replace("'", "").strip()
            if img_link.startswith("//"):
                img_link = "https:" + img_link
    except Exception:
        pass

    # --------------------------
    # 6⃣ SEEDERS
    # --------------------------
    seeders_a = cells[3].find_element(By.TAG_NAME, "a")
    seeders_number = int(seeders_a.text.strip())
    seeders_link = seeders_a.get_attribute("href")

    # --------------------------
    # 7⃣ LEECHERS
    # --------------------------
    leechers_a = cells[4].find_element(By.TAG_NAME, "a")
    leechers_number = int(leechers_a.text.strip())
    leechers_link = leechers_a.get_attribute("href")

    # --------------------------
    # Return dictionary for MySQL
    # --------------------------
    return {
        "torrent_hash": torrent_hash,
        "details_link": details_link,
        "category": category,
        "title_visible": visible_name,
        "title_full": full_title,
        "size_pretty": size_pretty,
        "added_datetime": added_mysql,
        "preview_image": img_link,
        "seeders": seeders_number,
        "seeders_link": seeders_link,
        "leechers": leechers_number,
        "leechers_link": leechers_link,
    }

# ============================================================
# 5) MySQL INSERT
# ============================================================
insert_sql = """
INSERT INTO torrents (
    torrent_hash, details_link, category, title_visible, title_full,
    size_pretty, added_datetime, preview_image,
    seeders, seeders_link, leechers, leechers_link
) VALUES (
    %(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
    %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
    %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s
)
ON DUPLICATE KEY UPDATE
    details_link = VALUES(details_link),
    category = VALUES(category),
    title_visible = VALUES(title_visible),
    title_full = VALUES(title_full),
    size_pretty = VALUES(size_pretty),
    added_datetime = VALUES(added_datetime),
    preview_image = VALUES(preview_image),
    seeders = VALUES(seeders),
    seeders_link = VALUES(seeders_link),
    leechers = VALUES(leechers),
    leechers_link = VALUES(leechers_link);
"""

# ============================================================
# 6) PROCESS ALL REAL ROWS
# ============================================================
for cells in real_rows:
    data = parse_row(cells)
    if not data:
        continue
    print("💾 Saving:", data["title_visible"])
    cursor.execute(insert_sql, data)

print("\n✅ DONE — All torrents saved to MySQL.")
driver.quit()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
from pathlib import Path
from playwright.sync_api import sync_playwright

# =============================================================
# CONFIGURATION
# =============================================================
COOKIE_FILE = Path("sktorrent_cookies.json")
URL = "https://sktorrent.eu/torrent/torrents.php?active=0"

def load_cookies(context):
    """Load saved cookies if available."""
    if COOKIE_FILE.exists():
        with open(COOKIE_FILE, "r") as f:
            cookies = json.load(f)
        context.add_cookies(cookies)
        print("🔄 Loaded login cookies.")
        return True
    print("❌ Cookie file not found. Run manual login first.")
    return False

# =============================================================
# MAIN CODE
# =============================================================
with sync_playwright() as p:
    # 1⃣ Launch browser
    browser = p.chromium.launch(
        headless=False,
        args=[
            "--disable-popup-blocking",
            "--disable-background-networking",
            "--disable-notifications",
            "--no-default-browser-check",
            "--no-first-run",
            "--noerrdialogs",
            "--disable-dev-shm-usage",
            "--disable-features=IsolateOrigins,site-per-process",
            "--no-sandbox",
        ]
    )

    # 2⃣ Create context before any pages exist
    context = browser.new_context()

    # 3⃣ Block ALL third-party requests (ads, JS, popups, tracking)
    def block_third_party(route, request):
        url = request.url.lower()
        if "sktorrent.eu" in url:
            route.continue_()
        else:
            print(f"🚫 Blocked third-party request: {url}")
            route.abort()

    context.route("**/*", block_third_party)

    # 4⃣ Block ANY popup windows except the first page
    pages = []

    def on_new_page(new_page):
        pages.append(new_page)
        if len(pages) == 1:
            print("➡️ Main page created.")
        else:
            print("⚠️ Popup blocked (auto-closed).")
            new_page.close()

    context.on("page", on_new_page)

    # 5⃣ Disable all popup JS functions (window.open, window.close, opener.close)
    context.add_init_script("""
        window.open = () => { console.log("Blocked window.open"); return null; };
        window.close = () => { console.log("Blocked window.close"); };
        try {
            if (window.opener) {
                window.opener.close = () => { console.log("Blocked opener.close"); };
            }
        } catch (e) {}

        // Block <a target="_blank">
        document.addEventListener('click', function(e) {
            const el = e.target.closest('a[target="_blank"]');
            if (el) {
                e.preventDefault();
                console.log("Blocked target=_blank");
            }
        }, true);

        // Block middle-click opening a new tab
        document.addEventListener('auxclick', function(e) {
            e.preventDefault();
        }, true);
    """)

    # 6⃣ Create the FIRST page (main page)
    page = context.new_page()
    pages.append(page)

    # 7⃣ Load cookies (login)
    load_cookies(context)

    # 8⃣ Navigate
    print("🌍 Opening page...")
    page.goto(URL)

    # Do NOT use networkidle on ad-heavy sites
    page.wait_for_load_state("domcontentloaded")
    page.wait_for_selector("table tr", timeout=15000)

    # Remove popup/overlay elements created by SKTorrent
    page.evaluate("""
        const selectors = [
            '#lightbox', '.lightbox', '#popup', '.popup',
            '.overlay', '#overlay', '.modal', '#modal',
            'div[style*="fixed"]', 'div[style*="position: fixed"]',
            'table[style*="position: fixed"]',
            'iframe', 'frame'
        ];
        selectors.forEach(sel => {
            document.querySelectorAll(sel).forEach(el => {
                console.log("Removing popup element:", sel);
                el.remove();
            });
        });

        // Remove onclick handlers that trigger popups
        document.querySelectorAll('*').forEach(el => {
            el.onclick = null;
            el.onauxclick = null;
            el.oncontextmenu = null;
        });

        // Remove timers that trigger delayed popups
        window.setTimeout = () => {};
        window.setInterval = () => {};
    """)

    print("✔ Page loaded, extracting table rows...")

    # 9⃣ Extract all rows
    rows = page.locator("table tr").all()
    print(f"📄 Total rows found (including header): {len(rows)}")

    # 🔟 Extract SECOND ROW only (your request)
    if len(rows) > 1:
        row = rows[1]  # 0 = header, 1 = first data row
        tds = row.locator("td")
        name = tds.nth(1).inner_text().strip()
        size = tds.nth(2).inner_text().strip()
        seeders = tds.nth(3).inner_text().strip()
        leechers = tds.nth(4).inner_text().strip()
        print("\n========= SECOND ROW =========")
        print(f"Name: {name}")
        print(f"Size: {size}")
        print(f"Seeders: {seeders}")
        print(f"Leechers: {leechers}")
        print("==============================\n")
    else:
        print("❌ No data rows found!")

    page.wait_for_timeout(5000)
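Both versions assume sktorrent_cookies.json already exists ("Run manual login first"). The Selenium version reads a JSON dump of browser cookies; one way to produce such a file after a one-off manual login is sketched below. This helper is hypothetical and not part of the commit.

# Hypothetical one-off helper to create sktorrent_cookies.json after a manual login.
import json
from pathlib import Path
from selenium import webdriver

driver = webdriver.Chrome()
driver.get("https://sktorrent.eu")
input("Log in to sktorrent.eu in the browser window, then press Enter...")
Path("sktorrent_cookies.json").write_text(json.dumps(driver.get_cookies()))
driver.quit()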

ParseviaRequests.py (Normal file, 219 additions)

@@ -0,0 +1,219 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
import requests
from bs4 import BeautifulSoup
import pymysql
from datetime import datetime

# ==============================
# CONFIG
# ==============================
BASE_URL = "https://sktorrent.eu/torrent/torrents_v2.php?active=0"
COOKIES_FILE = "sktorrent_cookies.json"  # Your exported cookies.txt (Netscape format)
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)
HEADERS = {"User-Agent": USER_AGENT}

DB_CFG = {
    "host": "192.168.1.76",
    "port": 3307,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "cursorclass": pymysql.cursors.DictCursor,
}

# ==============================
# COOKIE LOADER
# ==============================
def load_cookies(path):
    """Parse a Netscape-format cookie export into a name/value dict."""
    cookies = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if line.startswith("#") or "\t" not in line:
                continue
            parts = line.strip().split("\t")
            if len(parts) >= 7:
                cookies[parts[5]] = parts[6]
    print(f"🍪 Loaded {len(cookies)} cookies.")
    return cookies
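load_cookies() relies on the tab-separated Netscape cookies.txt layout (domain, include-subdomains flag, path, secure flag, expiry, name, value), which is why it reads fields 5 and 6. A small sketch with a made-up cookie line; the cookie name and value are hypothetical, not real sktorrent.eu cookies.

# Hypothetical Netscape-format line; fields are tab-separated.
sample = "sktorrent.eu\tFALSE\t/\tTRUE\t1767225600\tuid\t12345"
parts = sample.strip().split("\t")
print(parts[5], "=", parts[6])  # -> uid = 12345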
# ==============================
# MYSQL INSERT
# ==============================
def insert_torrent(db, t):
    sql = """
        INSERT IGNORE INTO torrents (
            category,
            title_visible,
            title_full,
            size_pretty,
            added_datetime,
            seeders,
            seeders_link,
            leechers,
            leechers_link,
            preview_image,
            details_link,
            torrent_hash
        ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    with db.cursor() as cur:
        cur.execute(sql, (
            t["category"],
            t["title_visible"],
            t["title_full"],
            t["size_pretty"],
            t["added_datetime"],
            t["seeders"],
            t["seeders_link"],
            t["leechers"],
            t["leechers_link"],
            t["preview_image"],
            t["details_link"],
            t["torrent_hash"],
        ))
    db.commit()
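Because INSERT IGNORE silently skips rows that hit the unique key, main() below counts a row as inserted even when MySQL ignored it. If that distinction matters, pymysql's cursor.execute() returns the affected-row count, which is 0 for a skipped duplicate. A hedged variant, not part of the original script:

def insert_torrent_counted(db, t):
    # Hypothetical variant of insert_torrent(); returns True only when a row was really inserted.
    sql = """
        INSERT IGNORE INTO torrents (
            category, title_visible, title_full, size_pretty, added_datetime,
            seeders, seeders_link, leechers, leechers_link,
            preview_image, details_link, torrent_hash
        ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    with db.cursor() as cur:
        affected = cur.execute(sql, (
            t["category"], t["title_visible"], t["title_full"], t["size_pretty"],
            t["added_datetime"], t["seeders"], t["seeders_link"],
            t["leechers"], t["leechers_link"], t["preview_image"],
            t["details_link"], t["torrent_hash"],
        ))
    db.commit()
    return affected == 1  # False when the torrent_hash already existed

main() could then increment its inserted or skipped counter based on the return value.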
# ==============================
# PARSER
# ==============================
def parse_torrent_row(cols):
    """Parse a <tr> with exactly the structure of a torrent row."""
    # --- category ---
    category = cols[0].get_text(strip=True)

    # --- download link (ignore) ---
    # second <td> is download.gif

    # --- main column ---
    main_td = cols[2]
    a_title = main_td.find("a", href=re.compile("details.php"))
    if not a_title:
        return None

    title_visible = a_title.get_text(strip=True)
    title_full = a_title.get("title", "").strip()
    details_link = "https://sktorrent.eu/torrent/" + a_title.get("href")

    # Extract torrent hash from ?id=.....
    m = re.search(r"id=([A-Fa-f0-9]{40})", a_title.get("href"))
    if not m:
        return None
    torrent_hash = m.group(1)

    # Extract size + added date from the text below <br>
    text = main_td.get_text(" ", strip=True)
    # example: "GR ... Velkost 1.7 GB | Pridany 18/11/2025 o 07:00"
    size_match = re.search(r"Velkost ([\d\.]+ ?[GMK]B)", text)
    date_match = re.search(r"Pridany (\d{2}/\d{2}/\d{4}) o (\d{2}:\d{2})", text)
    size_pretty = size_match.group(1) if size_match else None

    added_datetime = None
    if date_match:
        d, t = date_match.groups()
        added_datetime = datetime.strptime(d + " " + t, "%d/%m/%Y %H:%M")

    # Extract preview img from onmouseover
    img = None
    img_a = main_td.find("a", onmouseover=True)
    if img_a:
        html = img_a.get("onmouseover", "")
        m2 = re.search(r"img src=//([^ ]+)", html)
        if m2:
            img = "https://" + m2.group(1)

    # --- seeders ---
    seed_a = cols[4].find("a")
    seeders = int(seed_a.get_text(strip=True)) if seed_a else 0
    seeders_link = "https://sktorrent.eu/torrent/" + seed_a.get("href") if seed_a else None

    # --- leechers ---
    leech_a = cols[5].find("a")
    leechers = int(leech_a.get_text(strip=True)) if leech_a else 0
    leechers_link = "https://sktorrent.eu/torrent/" + leech_a.get("href") if leech_a else None

    return {
        "category": category,
        "title_visible": title_visible,
        "title_full": title_full,
        "size_pretty": size_pretty,
        "added_datetime": added_datetime,
        "seeders": seeders,
        "seeders_link": seeders_link,
        "leechers": leechers,
        "leechers_link": leechers_link,
        "preview_image": img,
        "details_link": details_link,
        "torrent_hash": torrent_hash,
    }
# ==============================
# MAIN
# ==============================
def main():
    cookies = load_cookies(COOKIES_FILE)

    session = requests.Session()
    session.headers.update(HEADERS)
    session.cookies.update(cookies)

    print("🌍 Downloading HTML...")
    r = session.get(BASE_URL, timeout=30)
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "html.parser")
    tbody = soup.find("tbody")
    if not tbody:
        print("❌ Could not find <tbody>")
        return

    rows = tbody.find_all("tr")
    print(f"Found {len(rows)} <tr> rows.")

    db = pymysql.connect(**DB_CFG)
    inserted = 0
    skipped = 0

    for tr in rows:
        cols = tr.find_all("td")
        if len(cols) != 7:
            continue  # ignore header & separator rows

        data = parse_torrent_row(cols)
        if not data:
            skipped += 1
            continue

        insert_torrent(db, data)
        inserted += 1
        print(f"✔ Inserted {data['torrent_hash']}")

    print("\n===== DONE =====")
    print(f"Inserted: {inserted}")
    print(f"Skipped: {skipped}")

if __name__ == "__main__":
    main()