This commit is contained in:
2026-05-20 22:26:48 +02:00
parent 8121b70e87
commit 8ddec5184d
2 changed files with 133 additions and 120 deletions
+58 -55
View File
@@ -47,11 +47,12 @@ cursor = db.cursor()
# 2) Selenium setup
# ============================================================
COOKIE_FILE = Path("sktorrent_cookies.json")
# Updated to standard torrents.php as requested
BASE_URL = (
"https://sktorrent.eu/torrent/torrents.php"
"?active=0&category=24&order=data&by=DESC&zaner=&jazyk="
)
# Categories to crawl, keyed by the numeric ID that is sent as the
# `category` query parameter of the listing URL. The value is the
# human-readable (Czech) category name used to label progress messages.
CATEGORIES = {
24: "Knihy a časopisy",  # "Books and magazines"
32: "Mluvené slovo"  # "Spoken word"
}
# Upper bound on listing pages fetched per category (page indexes
# 0 .. MAX_PAGES - 1). Pagination for a category stops earlier if a page
# yields no data rows.
MAX_PAGES = 10
chrome_options = Options()
chrome_options.add_argument("--start-maximized")
@@ -85,7 +86,15 @@ print("🔗 Requests session initialized.")
# ============================================================
# 4) Popup handler
# 4) Build URL for category and page
# ============================================================
def get_torrent_url(category, page):
    """Return the torrent listing URL for one category and page.

    Args:
        category: Category identifier sent as the ``category`` query
            parameter (the script passes the integer keys of CATEGORIES).
        page: Zero-based page index sent as the ``page`` query parameter.

    Returns:
        The absolute listing URL, sorted by date descending, with the
        ``zaner`` and ``jazyk`` filters left empty.
    """
    # Imported locally so the function is self-contained regardless of
    # what the module-level import block provides.
    from urllib.parse import urlencode

    base = "https://sktorrent.eu/torrent/torrents.php"
    # A dict keeps insertion order, so the parameter order matches the
    # URL the site itself uses. urlencode() escapes any reserved
    # characters instead of interpolating raw values into the query.
    query = urlencode(
        {
            "active": 0,
            "category": category,
            "order": "data",
            "by": "DESC",
            "zaner": "",
            "jazyk": "",
            "page": page,
        }
    )
    return f"{base}?{query}"
# ============================================================
# 5) Popup handler
# ============================================================
def close_popup_if_any():
try:
@@ -96,7 +105,7 @@ def close_popup_if_any():
# ============================================================
# 5) Parse one torrent row (MODIFIED)
# 6) Parse one torrent row (MODIFIED)
# ============================================================
def parse_row(cells):
@@ -204,7 +213,7 @@ def parse_row(cells):
print(f"⚠️ parse_row logic failed: {e}")
return None
# ============================================================
# 6) INSERT SQL (MODIFIED)
# 7) INSERT SQL (MODIFIED)
# ============================================================
insert_sql = """
INSERT INTO torrents (
@@ -228,68 +237,62 @@ ON DUPLICATE KEY UPDATE
# keeps the old value if the new one is NULL,
# but updates it if the old one was NULL and the new one is binary.
# ============================================================
# 7) PROCESS ALL PAGES
# 8) PROCESS ALL PAGES AND CATEGORIES
# ============================================================
TOTAL_PAGES = 226
for page_num in range(0, TOTAL_PAGES):
current_url = f"{BASE_URL}&page={page_num}"
print(f"\n🌐 Loading Page Index {page_num} (Page {page_num + 1}/{TOTAL_PAGES})")
for category_id, category_name in CATEGORIES.items():
print(f"\n📚 Starting category: {category_name} (ID: {category_id})")
driver.get(current_url)
time.sleep(2)
close_popup_if_any()
for page_num in range(0, MAX_PAGES):
current_url = get_torrent_url(category_id, page_num)
print(f"\n🌐 Loading page {page_num}/{MAX_PAGES - 1} for {category_name}")
# Find table rows
rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
driver.get(current_url)
time.sleep(2)
close_popup_if_any()
# FILTER: Only keep rows that have 7 columns AND a link in the 2nd column (index 1)
# This automatically discards headers and empty space rows.
real_rows = []
for r in rows:
cells = r.find_elements(By.TAG_NAME, "td")
if len(cells) == 7 and cells[1].find_elements(By.TAG_NAME, "a"):
real_rows.append(cells)
# Find table rows
rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
if not real_rows:
print("⚠️ No data rows found on this page. Ending loop.")
break
# FILTER: Only keep rows that have 7 columns AND a link in the 2nd column (index 1)
# This automatically discards headers and empty space rows.
real_rows = []
for r in rows:
cells = r.find_elements(By.TAG_NAME, "td")
if len(cells) == 7 and cells[1].find_elements(By.TAG_NAME, "a"):
real_rows.append(cells)
# === INSERT THIS LINE HERE ===
page_new_items = 0
# =============================
if not real_rows:
print(f"⚠️ No data rows found. Stopping pagination for {category_name}")
break
for cells in real_rows:
try:
data = parse_row(cells)
# ... rest of your logic ...
except Exception as e:
print(f"⚠️ parse_row failed: {e}")
continue
page_new_items = 0
if not data: continue
processed_count += 1
for cells in real_rows:
try:
data = parse_row(cells)
except Exception as e:
print(f"⚠️ parse_row failed: {e}")
continue
if data["is_new_torrent"]:
new_torrent_count += 1
page_new_items += 1
new_titles.append(data["title_visible"])
print(f"💾 NEW: {data['title_visible']}")
else:
existing_torrent_count += 1
print(f"♻️ UPDATING: {data['title_visible']}")
if not data: continue
processed_count += 1
cursor.execute(insert_sql, data)
if data["is_new_torrent"]:
new_torrent_count += 1
page_new_items += 1
new_titles.append(data["title_visible"])
print(f"💾 NEW: {data['title_visible']}")
else:
existing_torrent_count += 1
print(f"♻️ UPDATING: {data['title_visible']}")
# # If an entire page is old news, we can stop the deep crawl
# if page_new_items == 0 and page_num > 0:
# print("🛑 Page contained only known items. Sync complete.")
# break
cursor.execute(insert_sql, data)
time.sleep(1)
time.sleep(1)
# ============================================================
# 8) SEND EMAIL REPORT
# 9) SEND EMAIL REPORT
# ============================================================
RUN_END = datetime.datetime.now()
subject = f"SKTorrent run {RUN_START:%Y-%m-%d %H:%M}"