git

2026-05-20 22:26:48 +02:00
parent 8121b70e87
commit 8ddec5184d
2 changed files with 133 additions and 120 deletions
@@ -47,11 +47,12 @@ cursor = db.cursor()
 # 2) Selenium setup
 # ============================================================
 COOKIE_FILE = Path("sktorrent_cookies.json")
-# Updated to standard torrents.php as requested
+
-BASE_URL = (
+CATEGORIES = {
-    "https://sktorrent.eu/torrent/torrents.php"
+    24: "Knihy a časopisy",
-    "?active=0&category=24&order=data&by=DESC&zaner=&jazyk="
+    32: "Mluvené slovo"
-)
+}
 MAX_PAGES = 10
 chrome_options = Options()
 chrome_options.add_argument("--start-maximized")
@@ -85,7 +86,15 @@ print("🔗 Requests session initialized.")
 # ============================================================
-# 4) Popup handler
+# 4) Build URL for category and page
 # ============================================================
 def get_torrent_url(category, page):
    """Return the torrents.php listing URL for one page of one category.

    ``category`` and ``page`` are interpolated into the query string as-is
    (no URL-encoding is applied). The remaining parameters are fixed:
    ``active=0``, ordering by ``data`` descending, and empty ``zaner`` /
    ``jazyk`` filters.
    """
    # Query parameters in the exact order the site URL is built with.
    query_parts = (
        "active=0",
        f"category={category}",
        "order=data",
        "by=DESC",
        "zaner=",
        "jazyk=",
        f"page={page}",
    )
    return "https://sktorrent.eu/torrent/torrents.php" + "?" + "&".join(query_parts)
 # ============================================================
 # 5) Popup handler
 # ============================================================
 def close_popup_if_any():
    try:
@@ -96,7 +105,7 @@ def close_popup_if_any():
 # ============================================================
-# 5) Parse one torrent row (MODIFIED)
+# 6) Parse one torrent row (MODIFIED)
 # ============================================================
 def parse_row(cells):
@@ -204,7 +213,7 @@ def parse_row(cells):
        print(f"⚠️ parse_row logic failed: {e}")
        return None
 # ============================================================
-# 6) INSERT SQL (MODIFIED)
+# 7) INSERT SQL (MODIFIED)
 # ============================================================
 insert_sql = """
 INSERT INTO torrents (
@@ -228,68 +237,62 @@ ON DUPLICATE KEY UPDATE
 # keeps the old value if the new one is NULL,
 # but updates it if the old one was NULL and the new one is binary.
 # ============================================================
-# 7) PROCESS ALL PAGES
+# 8) PROCESS ALL PAGES AND CATEGORIES
 # ============================================================
 TOTAL_PAGES = 226
-for page_num in range(0, TOTAL_PAGES):
+for category_id, category_name in CATEGORIES.items():
-    current_url = f"{BASE_URL}&page={page_num}"
+    print(f"\n📚 Starting category: {category_name} (ID: {category_id})")
    print(f"\n🌐 Loading Page Index {page_num} (Page {page_num + 1}/{TOTAL_PAGES})")
-    driver.get(current_url)
+    for page_num in range(0, MAX_PAGES):
-    time.sleep(2)
+        current_url = get_torrent_url(category_id, page_num)
-    close_popup_if_any()
+        print(f"\n🌐 Loading page {page_num}/{MAX_PAGES - 1} for {category_name}")
-    # Find table rows
+        driver.get(current_url)
-    rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
+        time.sleep(2)
        close_popup_if_any()
-    # FILTER: Only keep rows that have 7 columns AND a link in the 2nd column (index 1)
+        # Find table rows
-    # This automatically discards headers and empty space rows.
+        rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
    real_rows = []
    for r in rows:
        cells = r.find_elements(By.TAG_NAME, "td")
        if len(cells) == 7 and cells[1].find_elements(By.TAG_NAME, "a"):
            real_rows.append(cells)
-    if not real_rows:
+        # FILTER: Only keep rows that have 7 columns AND a link in the 2nd column (index 1)
-        print("⚠️ No data rows found on this page. Ending loop.")
+        # This automatically discards headers and empty space rows.
-        break
+        real_rows = []
        for r in rows:
            cells = r.find_elements(By.TAG_NAME, "td")
            if len(cells) == 7 and cells[1].find_elements(By.TAG_NAME, "a"):
                real_rows.append(cells)
-    # === INSERT THIS LINE HERE ===
+        if not real_rows:
-    page_new_items = 0
+            print(f"⚠️ No data rows found. Stopping pagination for {category_name}")
-    # =============================
+            break
-    for cells in real_rows:
+        page_new_items = 0
        try:
            data = parse_row(cells)
            # ... rest of your logic ...
        except Exception as e:
            print(f"⚠️ parse_row failed: {e}")
            continue
-        if not data: continue
+        for cells in real_rows:
-        processed_count += 1
+            try:
                data = parse_row(cells)
            except Exception as e:
                print(f"⚠️ parse_row failed: {e}")
                continue
-        if data["is_new_torrent"]:
+            if not data: continue
-            new_torrent_count += 1
+            processed_count += 1
            page_new_items += 1
            new_titles.append(data["title_visible"])
            print(f"💾 NEW: {data['title_visible']}")
        else:
            existing_torrent_count += 1
            print(f"♻️ UPDATING: {data['title_visible']}")
-        cursor.execute(insert_sql, data)
+            if data["is_new_torrent"]:
                new_torrent_count += 1
                page_new_items += 1
                new_titles.append(data["title_visible"])
                print(f"💾 NEW: {data['title_visible']}")
            else:
                existing_torrent_count += 1
                print(f"♻️ UPDATING: {data['title_visible']}")
-    # # If an entire page is old news, we can stop the deep crawl
+            cursor.execute(insert_sql, data)
    # if page_new_items == 0 and page_num > 0:
    #     print("🛑 Page contained only known items. Sync complete.")
    #     break
-    time.sleep(1)
+        time.sleep(1)
 # ============================================================
-# 8) SEND EMAIL REPORT
+# 9) SEND EMAIL REPORT
 # ============================================================
 RUN_END = datetime.datetime.now()
 subject = f"SKTorrent run – {RUN_START:%Y-%m-%d %H:%M}"
@@ -30,10 +30,11 @@ HOW_MANY_TO_CHECK = 0
 COOKIE_FILE = Path("sktorrent_cookies.json")
-BASE_URL = (
+CATEGORIES = {
-    "https://sktorrent.eu/torrent/torrents.php"
+    24: "Knihy a časopisy",
-    "?active=0&category=23&order=data&by=DESC"
+    32: "Mluvené slovo"
-)
+}
 # Upper bound on the number of listing pages fetched per category.
 MAX_PAGES = 10
 SLEEP_BETWEEN_PAGES   = 2.0   # pause between pages (seconds, passed to time.sleep)
 SLEEP_BEFORE_DOWNLOAD = 1.5   # pause before downloading each .torrent file (seconds)
@@ -221,6 +222,8 @@ def main():
    else:
        mode_desc = f"kontrola posledních {HOW_MANY_TO_CHECK} torrentů"
    print(f"Pořadí: nejnovější → nejstarší  |  {mode_desc}")
    print(f"Kategorie: {', '.join(CATEGORIES.values())}")
    print(f"Stránek na kategorii: {MAX_PAGES}")
    print("=" * 60)
    session = build_session()
@@ -230,79 +233,87 @@ def main():
    new_count      = 0
    checked_count  = 0
    skipped_count  = 0
    page           = 0
    stop           = False
-    while not stop:
+    for category_id, category_name in CATEGORIES.items():
-
+        if stop:
        url = f"{BASE_URL}&page={page}"
        r = None
        for attempt in range(1, 6):
            try:
                r = session.get(url, timeout=15)
                r.raise_for_status()
                break
            except Exception as e:
                if attempt < 5:
                    print(f"⚠️ Stránka {page} — pokus {attempt}/5 selhal: {e} — čekám 10s")
                    time.sleep(10)
                else:
                    print(f"⚠️ Stránka {page} — všech 5 pokusů selhalo: {e}")
        if r is None or not r.ok:
            break
-        if "login.php" in r.url or "Prihlas sa" in r.text:
+        print(f"\n📚 Kategorie: {category_name} (ID: {category_id})")
            print("❌ Cookies expiraly — spusť přihlašovací Selenium skript a obnov cookies.")
            break
-        rows = parse_page(r.text)
+        for page in range(MAX_PAGES):
-        if not rows:
+            url = (
-            print(f"  Stránka {page} — žádné záznamy, konec.")
+                "https://sktorrent.eu/torrent/torrents.php"
-            break
+                f"?active=0&category={category_id}&order=data&by=DESC&page={page}"
        print(f"\n📄 Stránka {page}  ({len(rows)} torrentů)")
        for item in rows:
            if HOW_MANY_TO_CHECK > 0 and checked_count >= HOW_MANY_TO_CHECK:
                print(f"  ⏹ Zkontrolováno {checked_count} torrentů — limit dosažen.")
                stop = True
                break
            checked_count += 1
            cursor.execute(
                "SELECT 1 FROM torrents WHERE torrent_hash = %s",
                (item["torrent_hash"],)
            )
-            exists = cursor.fetchone()
+            r = None
            for attempt in range(1, 6):
                try:
                    r = session.get(url, timeout=15)
                    r.raise_for_status()
                    break
                except Exception as e:
                    if attempt < 5:
                        print(f"⚠️ Stránka {page} — pokus {attempt}/5 selhal: {e} — čekám 10s")
                        time.sleep(10)
                    else:
                        print(f"⚠️ Stránka {page} — všech 5 pokusů selhalo: {e}")
            if r is None or not r.ok:
                break
-            if exists:
+            if "login.php" in r.url or "Prihlas sa" in r.text:
-                if HOW_MANY_TO_CHECK == -1:
+                print("❌ Cookies expiraly — spusť přihlašovací Selenium skript a obnov cookies.")
-                    print(f"  ⏹ Již v DB: {item['title_visible']} → zastavuji import.")
+                break
            rows = parse_page(r.text)
            if not rows:
                print(f"  Stránka {page} — žádné záznamy, konec.")
                break
            print(f"\n📄 Stránka {page}  ({len(rows)} torrentů)")
            for item in rows:
                if HOW_MANY_TO_CHECK > 0 and checked_count >= HOW_MANY_TO_CHECK:
                    print(f"  ⏹ Zkontrolováno {checked_count} torrentů — limit dosažen.")
                    stop = True
                    break
                checked_count += 1
                cursor.execute(
                    "SELECT 1 FROM torrents WHERE torrent_hash = %s",
                    (item["torrent_hash"],)
                )
                exists = cursor.fetchone()
                if exists:
                    if HOW_MANY_TO_CHECK == -1:
                        print(f"  ⏹ Již v DB: {item['title_visible']} → zastavuji import.")
                        stop = True
                        break
                    else:
                        skipped_count += 1
                        print(f"  ⏭ Již v DB: {item['title_visible']} — přeskakuji")
                        continue
                print(f"  ⬇️  Nový: {item['title_visible']}")
                time.sleep(SLEEP_BEFORE_DOWNLOAD)
                content = download_torrent(session, item["download_url"])
                if content:
                    print(f"      ✔ Staženo ({len(content):,} B)")
                else:
-                    skipped_count += 1
+                    print(f"      ✖ Nepodařilo se stáhnout, ukládám bez obsahu")
                    print(f"  ⏭ Již v DB: {item['title_visible']} — přeskakuji")
                    continue
-            print(f"  ⬇️  Nový: {item['title_visible']}")
+                item["torrent_content"] = content
-            time.sleep(SLEEP_BEFORE_DOWNLOAD)
+                cursor.execute(INSERT_SQL, item)
                new_count += 1
-            content = download_torrent(session, item["download_url"])
+            if stop:
-            if content:
+                break
                print(f"      ✔ Staženo ({len(content):,} B)")
            else:
                print(f"      ✖ Nepodařilo se stáhnout, ukládám bez obsahu")
            item["torrent_content"] = content
            cursor.execute(INSERT_SQL, item)
            new_count += 1
        if not stop:
            page += 1
            time.sleep(SLEEP_BETWEEN_PAGES)
    # ============================================================
@@ -314,7 +325,6 @@ def main():
    print(f"Nových torrentů uloženo : {new_count}")
    print(f"Zkontrolováno celkem   : {checked_count}")
    print(f"Přeskočeno (v DB)      : {skipped_count}")
    print(f"Stránek prošlo          : {page}")
    print("=" * 60)
    db.close()