git

2026-05-20 22:26:48 +02:00
parent 8121b70e87
commit 8ddec5184d
2 changed files with 133 additions and 120 deletions
@@ -47,11 +47,12 @@ cursor = db.cursor()
 # 2) Selenium setup
 # ============================================================
 COOKIE_FILE = Path("sktorrent_cookies.json")
-# Updated to standard torrents.php as requested
-BASE_URL = (
-    "https://sktorrent.eu/torrent/torrents.php"
-    "?active=0&category=24&order=data&by=DESC&zaner=&jazyk="
-)
+
+# Categories to crawl: numeric category id -> display name.
+# The id is passed to get_torrent_url() as the `category` query parameter;
+# the name is only used in progress messages.
+CATEGORIES = {
+    24: "Knihy a časopisy",
+    32: "Mluvené slovo"
+}
+# Upper bound on listing pages fetched per category (page indices 0..MAX_PAGES-1).
+MAX_PAGES = 10

 chrome_options = Options()
 chrome_options.add_argument("--start-maximized")
@@ -85,7 +86,15 @@ print("🔗 Requests session initialized.")


 # ============================================================
-# 4) Popup handler
+# 4) Build URL for category and page
+# ============================================================
+def get_torrent_url(category, page):
+    """Return the torrents.php listing URL for one category and page.
+
+    The query string is fixed apart from the two arguments: ``active=0``,
+    ``order=data``, ``by=DESC`` and empty ``zaner`` / ``jazyk`` values.
+
+    Args:
+        category: Value placed in the ``category`` query parameter
+            (the crawl loop passes the integer keys of CATEGORIES).
+        page: Value placed in the ``page`` query parameter
+            (the crawl loop passes indices starting at 0).
+
+    Returns:
+        The complete URL as a string. Both arguments are interpolated
+        as-is, without URL-encoding.
+    """
+    base = "https://sktorrent.eu/torrent/torrents.php"
+    return f"{base}?active=0&category={category}&order=data&by=DESC&zaner=&jazyk=&page={page}"
+
+
+# ============================================================
+# 5) Popup handler
 # ============================================================
 def close_popup_if_any():
    try:
@@ -96,7 +105,7 @@ def close_popup_if_any():


 # ============================================================
-# 5) Parse one torrent row (MODIFIED)
+# 6) Parse one torrent row (MODIFIED)
 # ============================================================

 def parse_row(cells):
@@ -204,7 +213,7 @@ def parse_row(cells):
        print(f"⚠️ parse_row logic failed: {e}")
        return None
 # ============================================================
-# 6) INSERT SQL (MODIFIED)
+# 7) INSERT SQL (MODIFIED)
 # ============================================================
 insert_sql = """
 INSERT INTO torrents (
@@ -228,68 +237,62 @@ ON DUPLICATE KEY UPDATE
 # keeps the old value if the new one is NULL,
 # but updates it if the old one was NULL and the new one is binary.
 # ============================================================
-# 7) PROCESS ALL PAGES
+# 8) PROCESS ALL PAGES AND CATEGORIES
 # ============================================================
-TOTAL_PAGES = 226

-for page_num in range(0, TOTAL_PAGES):
-    current_url = f"{BASE_URL}&page={page_num}"
-    print(f"\n🌐 Loading Page Index {page_num} (Page {page_num + 1}/{TOTAL_PAGES})")
+# Crawl every category in CATEGORIES; each category is paginated on its own,
+# fetching at most MAX_PAGES listing pages.
+for category_id, category_name in CATEGORIES.items():
+    print(f"\n📚 Starting category: {category_name} (ID: {category_id})")

-    driver.get(current_url)
-    time.sleep(2)
-    close_popup_if_any()
+    # Page indices start at 0 and stop at MAX_PAGES - 1.
+    for page_num in range(0, MAX_PAGES):
+        current_url = get_torrent_url(category_id, page_num)
+        print(f"\n🌐 Loading page {page_num}/{MAX_PAGES - 1} for {category_name}")

-    # Find table rows
-    rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
+        driver.get(current_url)
+        # Fixed pause after navigation, before the popup check and the row
+        # lookup (presumably to let the page finish rendering).
+        time.sleep(2)
+        close_popup_if_any()

-    # FILTER: Only keep rows that have 7 columns AND a link in the 2nd column (index 1)
-    # This automatically discards headers and empty space rows.
-    real_rows = []
-    for r in rows:
-        cells = r.find_elements(By.TAG_NAME, "td")
-        if len(cells) == 7 and cells[1].find_elements(By.TAG_NAME, "a"):
-            real_rows.append(cells)
+        # Find table rows
+        rows = driver.find_elements(By.CSS_SELECTOR, "table tr")

-    if not real_rows:
-        print("⚠️ No data rows found on this page. Ending loop.")
-        break
+        # FILTER: Only keep rows that have 7 columns AND a link in the 2nd column (index 1)
+        # This automatically discards headers and empty space rows.
+        real_rows = []
+        for r in rows:
+            cells = r.find_elements(By.TAG_NAME, "td")
+            if len(cells) == 7 and cells[1].find_elements(By.TAG_NAME, "a"):
+                real_rows.append(cells)

-    # === INSERT THIS LINE HERE ===
-    page_new_items = 0
-    # =============================
+        # An empty page ends pagination for this category only; the outer
+        # loop then continues with the next category.
+        if not real_rows:
+            print(f"⚠️ No data rows found. Stopping pagination for {category_name}")
+            break

-    for cells in real_rows:
-        try:
-            data = parse_row(cells)
-            # ... rest of your logic ...
-        except Exception as e:
-            print(f"⚠️ parse_row failed: {e}")
-            continue
+        # Number of new torrents seen on this page.
+        # NOTE(review): it is only incremented below; nothing in the visible
+        # code reads it since the early-stop check was removed.
+        page_new_items = 0

-        if not data: continue
-        processed_count += 1
+        for cells in real_rows:
+            # The visible tail of parse_row returns None after logging its
+            # own failures; this handler covers anything it lets escape.
+            try:
+                data = parse_row(cells)
+            except Exception as e:
+                print(f"⚠️ parse_row failed: {e}")
+                continue

-        if data["is_new_torrent"]:
-            new_torrent_count += 1
-            page_new_items += 1
-            new_titles.append(data["title_visible"])
-            print(f"💾 NEW: {data['title_visible']}")
-        else:
-            existing_torrent_count += 1
-            print(f"♻️ UPDATING: {data['title_visible']}")
+            # processed_count, new_torrent_count, existing_torrent_count and
+            # new_titles are accumulators defined outside this hunk
+            # (presumably consumed by the e-mail report; verify).
+            if not data: continue
+            processed_count += 1

-        cursor.execute(insert_sql, data)
+            if data["is_new_torrent"]:
+                new_torrent_count += 1
+                page_new_items += 1
+                new_titles.append(data["title_visible"])
+                print(f"💾 NEW: {data['title_visible']}")
+            else:
+                existing_torrent_count += 1
+                print(f"♻️ UPDATING: {data['title_visible']}")

-    # # If an entire page is old news, we can stop the deep crawl
-    # if page_new_items == 0 and page_num > 0:
-    #     print("🛑 Page contained only known items. Sync complete.")
-    #     break
+            # The same statement runs for new and known torrents; insert_sql
+            # carries an ON DUPLICATE KEY UPDATE clause.
+            # NOTE(review): no commit is issued in this loop; confirm that
+            # autocommit is enabled on the connection.
+            cursor.execute(insert_sql, data)

-    time.sleep(1)
+        # Short delay between consecutive page loads.
+        time.sleep(1)

 # ============================================================
-# 8) SEND EMAIL REPORT
+# 9) SEND EMAIL REPORT
 # ============================================================
 RUN_END = datetime.datetime.now()
 subject = f"SKTorrent run – {RUN_START:%Y-%m-%d %H:%M}"