diff --git a/90 Import all torrents from all pages.py b/90 Import all torrents from all pages.py index 92e1b2e..f0c94bb 100644 --- a/90 Import all torrents from all pages.py +++ b/90 Import all torrents from all pages.py @@ -47,11 +47,12 @@ cursor = db.cursor() # 2) Selenium setup # ============================================================ COOKIE_FILE = Path("sktorrent_cookies.json") -# Updated to standard torrents.php as requested -BASE_URL = ( - "https://sktorrent.eu/torrent/torrents.php" - "?active=0&category=24&order=data&by=DESC&zaner=&jazyk=" -) + +CATEGORIES = { + 24: "Knihy a časopisy", + 32: "Mluvené slovo" +} +MAX_PAGES = 10 chrome_options = Options() chrome_options.add_argument("--start-maximized") @@ -85,7 +86,15 @@ print("🔗 Requests session initialized.") # ============================================================ -# 4) Popup handler +# 4) Build URL for category and page +# ============================================================ +def get_torrent_url(category, page): + base = "https://sktorrent.eu/torrent/torrents.php" + return f"{base}?active=0&category={category}&order=data&by=DESC&zaner=&jazyk=&page={page}" + + +# ============================================================ +# 5) Popup handler # ============================================================ def close_popup_if_any(): try: @@ -96,7 +105,7 @@ def close_popup_if_any(): # ============================================================ -# 5) Parse one torrent row (MODIFIED) +# 6) Parse one torrent row (MODIFIED) # ============================================================ def parse_row(cells): @@ -204,7 +213,7 @@ def parse_row(cells): print(f"⚠️ parse_row logic failed: {e}") return None # ============================================================ -# 6) INSERT SQL (MODIFIED) +# 7) INSERT SQL (MODIFIED) # ============================================================ insert_sql = """ INSERT INTO torrents ( @@ -228,68 +237,62 @@ ON DUPLICATE KEY UPDATE # keeps the old value if the new one is NULL, # but updates it if the old one was NULL and the new one is binary. # ============================================================ -# 7) PROCESS ALL PAGES +# 8) PROCESS ALL PAGES AND CATEGORIES # ============================================================ -TOTAL_PAGES = 226 -for page_num in range(0, TOTAL_PAGES): - current_url = f"{BASE_URL}&page={page_num}" - print(f"\n🌐 Loading Page Index {page_num} (Page {page_num + 1}/{TOTAL_PAGES})") +for category_id, category_name in CATEGORIES.items(): + print(f"\n📚 Starting category: {category_name} (ID: {category_id})") - driver.get(current_url) - time.sleep(2) - close_popup_if_any() + for page_num in range(0, MAX_PAGES): + current_url = get_torrent_url(category_id, page_num) + print(f"\n🌐 Loading page {page_num}/{MAX_PAGES - 1} for {category_name}") - # Find table rows - rows = driver.find_elements(By.CSS_SELECTOR, "table tr") + driver.get(current_url) + time.sleep(2) + close_popup_if_any() - # FILTER: Only keep rows that have 7 columns AND a link in the 2nd column (index 1) - # This automatically discards headers and empty space rows. - real_rows = [] - for r in rows: - cells = r.find_elements(By.TAG_NAME, "td") - if len(cells) == 7 and cells[1].find_elements(By.TAG_NAME, "a"): - real_rows.append(cells) + # Find table rows + rows = driver.find_elements(By.CSS_SELECTOR, "table tr") - if not real_rows: - print("⚠️ No data rows found on this page. Ending loop.") - break + # FILTER: Only keep rows that have 7 columns AND a link in the 2nd column (index 1) + # This automatically discards headers and empty space rows. + real_rows = [] + for r in rows: + cells = r.find_elements(By.TAG_NAME, "td") + if len(cells) == 7 and cells[1].find_elements(By.TAG_NAME, "a"): + real_rows.append(cells) - # === INSERT THIS LINE HERE === - page_new_items = 0 - # ============================= + if not real_rows: + print(f"⚠️ No data rows found. Stopping pagination for {category_name}") + break - for cells in real_rows: - try: - data = parse_row(cells) - # ... rest of your logic ... - except Exception as e: - print(f"⚠️ parse_row failed: {e}") - continue + page_new_items = 0 - if not data: continue - processed_count += 1 + for cells in real_rows: + try: + data = parse_row(cells) + except Exception as e: + print(f"⚠️ parse_row failed: {e}") + continue - if data["is_new_torrent"]: - new_torrent_count += 1 - page_new_items += 1 - new_titles.append(data["title_visible"]) - print(f"💾 NEW: {data['title_visible']}") - else: - existing_torrent_count += 1 - print(f"♻️ UPDATING: {data['title_visible']}") + if not data: continue + processed_count += 1 - cursor.execute(insert_sql, data) + if data["is_new_torrent"]: + new_torrent_count += 1 + page_new_items += 1 + new_titles.append(data["title_visible"]) + print(f"💾 NEW: {data['title_visible']}") + else: + existing_torrent_count += 1 + print(f"♻️ UPDATING: {data['title_visible']}") - # # If an entire page is old news, we can stop the deep crawl - # if page_new_items == 0 and page_num > 0: - # print("🛑 Page contained only known items. Sync complete.") - # break + cursor.execute(insert_sql, data) - time.sleep(1) + time.sleep(1) # ============================================================ -# 8) SEND EMAIL REPORT +# 9) SEND EMAIL REPORT # ============================================================ RUN_END = datetime.datetime.now() subject = f"SKTorrent run – {RUN_START:%Y-%m-%d %H:%M}" diff --git a/Seedbox/95 IncrementalImport.py b/Seedbox/95 IncrementalImport.py index 6d9f734..c36bd23 100644 --- a/Seedbox/95 IncrementalImport.py +++ b/Seedbox/95 IncrementalImport.py @@ -30,10 +30,11 @@ HOW_MANY_TO_CHECK = 0 COOKIE_FILE = Path("sktorrent_cookies.json") -BASE_URL = ( - "https://sktorrent.eu/torrent/torrents.php" - "?active=0&category=23&order=data&by=DESC" -) +CATEGORIES = { + 24: "Knihy a časopisy", + 32: "Mluvené slovo" +} +MAX_PAGES = 10 SLEEP_BETWEEN_PAGES = 2.0 # pauza mezi stránkami SLEEP_BEFORE_DOWNLOAD = 1.5 # pauza před stažením každého .torrent souboru @@ -221,6 +222,8 @@ def main(): else: mode_desc = f"kontrola posledních {HOW_MANY_TO_CHECK} torrentů" print(f"Pořadí: nejnovější → nejstarší | {mode_desc}") + print(f"Kategorie: {', '.join(CATEGORIES.values())}") + print(f"Stránek na kategorii: {MAX_PAGES}") print("=" * 60) session = build_session() @@ -230,79 +233,87 @@ def main(): new_count = 0 checked_count = 0 skipped_count = 0 - page = 0 stop = False - while not stop: - - url = f"{BASE_URL}&page={page}" - r = None - for attempt in range(1, 6): - try: - r = session.get(url, timeout=15) - r.raise_for_status() - break - except Exception as e: - if attempt < 5: - print(f"⚠️ Stránka {page} — pokus {attempt}/5 selhal: {e} — čekám 10s") - time.sleep(10) - else: - print(f"⚠️ Stránka {page} — všech 5 pokusů selhalo: {e}") - if r is None or not r.ok: + for category_id, category_name in CATEGORIES.items(): + if stop: break - if "login.php" in r.url or "Prihlas sa" in r.text: - print("❌ Cookies expiraly — spusť přihlašovací Selenium skript a obnov cookies.") - break + print(f"\n📚 Kategorie: {category_name} (ID: {category_id})") - rows = parse_page(r.text) + for page in range(MAX_PAGES): - if not rows: - print(f" Stránka {page} — žádné záznamy, konec.") - break - - print(f"\n📄 Stránka {page} ({len(rows)} torrentů)") - - for item in rows: - - if HOW_MANY_TO_CHECK > 0 and checked_count >= HOW_MANY_TO_CHECK: - print(f" ⏹ Zkontrolováno {checked_count} torrentů — limit dosažen.") - stop = True - break - - checked_count += 1 - - cursor.execute( - "SELECT 1 FROM torrents WHERE torrent_hash = %s", - (item["torrent_hash"],) + url = ( + "https://sktorrent.eu/torrent/torrents.php" + f"?active=0&category={category_id}&order=data&by=DESC&page={page}" ) - exists = cursor.fetchone() + r = None + for attempt in range(1, 6): + try: + r = session.get(url, timeout=15) + r.raise_for_status() + break + except Exception as e: + if attempt < 5: + print(f"⚠️ Stránka {page} — pokus {attempt}/5 selhal: {e} — čekám 10s") + time.sleep(10) + else: + print(f"⚠️ Stránka {page} — všech 5 pokusů selhalo: {e}") + if r is None or not r.ok: + break - if exists: - if HOW_MANY_TO_CHECK == -1: - print(f" ⏹ Již v DB: {item['title_visible']} → zastavuji import.") + if "login.php" in r.url or "Prihlas sa" in r.text: + print("❌ Cookies expiraly — spusť přihlašovací Selenium skript a obnov cookies.") + break + + rows = parse_page(r.text) + + if not rows: + print(f" Stránka {page} — žádné záznamy, konec.") + break + + print(f"\n📄 Stránka {page} ({len(rows)} torrentů)") + + for item in rows: + + if HOW_MANY_TO_CHECK > 0 and checked_count >= HOW_MANY_TO_CHECK: + print(f" ⏹ Zkontrolováno {checked_count} torrentů — limit dosažen.") stop = True break + + checked_count += 1 + + cursor.execute( + "SELECT 1 FROM torrents WHERE torrent_hash = %s", + (item["torrent_hash"],) + ) + exists = cursor.fetchone() + + if exists: + if HOW_MANY_TO_CHECK == -1: + print(f" ⏹ Již v DB: {item['title_visible']} → zastavuji import.") + stop = True + break + else: + skipped_count += 1 + print(f" ⏭ Již v DB: {item['title_visible']} — přeskakuji") + continue + + print(f" ⬇️ Nový: {item['title_visible']}") + time.sleep(SLEEP_BEFORE_DOWNLOAD) + + content = download_torrent(session, item["download_url"]) + if content: + print(f" ✔ Staženo ({len(content):,} B)") else: - skipped_count += 1 - print(f" ⏭ Již v DB: {item['title_visible']} — přeskakuji") - continue + print(f" ✖ Nepodařilo se stáhnout, ukládám bez obsahu") - print(f" ⬇️ Nový: {item['title_visible']}") - time.sleep(SLEEP_BEFORE_DOWNLOAD) + item["torrent_content"] = content + cursor.execute(INSERT_SQL, item) + new_count += 1 - content = download_torrent(session, item["download_url"]) - if content: - print(f" ✔ Staženo ({len(content):,} B)") - else: - print(f" ✖ Nepodařilo se stáhnout, ukládám bez obsahu") - - item["torrent_content"] = content - cursor.execute(INSERT_SQL, item) - new_count += 1 - - if not stop: - page += 1 + if stop: + break time.sleep(SLEEP_BETWEEN_PAGES) # ============================================================ @@ -314,7 +325,6 @@ def main(): print(f"Nových torrentů uloženo : {new_count}") print(f"Zkontrolováno celkem : {checked_count}") print(f"Přeskočeno (v DB) : {skipped_count}") - print(f"Stránek prošlo : {page}") print("=" * 60) db.close()