This commit is contained in:
2026-05-20 22:26:48 +02:00
parent 8121b70e87
commit 8ddec5184d
2 changed files with 133 additions and 120 deletions
+58 -55
View File
@@ -47,11 +47,12 @@ cursor = db.cursor()
# 2) Selenium setup # 2) Selenium setup
# ============================================================ # ============================================================
COOKIE_FILE = Path("sktorrent_cookies.json") COOKIE_FILE = Path("sktorrent_cookies.json")
# Updated to standard torrents.php as requested
BASE_URL = ( CATEGORIES = {
"https://sktorrent.eu/torrent/torrents.php" 24: "Knihy a časopisy",
"?active=0&category=24&order=data&by=DESC&zaner=&jazyk=" 32: "Mluvené slovo"
) }
MAX_PAGES = 10
chrome_options = Options() chrome_options = Options()
chrome_options.add_argument("--start-maximized") chrome_options.add_argument("--start-maximized")
@@ -85,7 +86,15 @@ print("🔗 Requests session initialized.")
# ============================================================ # ============================================================
# 4) Popup handler # 4) Build URL for category and page
# ============================================================
def get_torrent_url(category, page):
    """Return the torrent listing URL for one category and one result page.

    Args:
        category: Category ID placed in the ``category`` query parameter
            (the visible configuration uses 24 and 32).
        page: Page index placed in the ``page`` query parameter; the
            visible caller passes indices starting at 0.

    Returns:
        The full ``torrents.php`` URL as a string, ordered by ``data``
        descending, with the ``zaner`` and ``jazyk`` filters left empty.
    """
    base = "https://sktorrent.eu/torrent/torrents.php"
    # The query string keeps the exact parameter order and the empty
    # zaner/jazyk values used by the previous hard-coded BASE_URL.
    query = (
        f"active=0&category={category}&order=data&by=DESC"
        f"&zaner=&jazyk=&page={page}"
    )
    return f"{base}?{query}"
# ============================================================
# 5) Popup handler
# ============================================================ # ============================================================
def close_popup_if_any(): def close_popup_if_any():
try: try:
@@ -96,7 +105,7 @@ def close_popup_if_any():
# ============================================================ # ============================================================
# 5) Parse one torrent row (MODIFIED) # 6) Parse one torrent row (MODIFIED)
# ============================================================ # ============================================================
def parse_row(cells): def parse_row(cells):
@@ -204,7 +213,7 @@ def parse_row(cells):
print(f"⚠️ parse_row logic failed: {e}") print(f"⚠️ parse_row logic failed: {e}")
return None return None
# ============================================================ # ============================================================
# 6) INSERT SQL (MODIFIED) # 7) INSERT SQL (MODIFIED)
# ============================================================ # ============================================================
insert_sql = """ insert_sql = """
INSERT INTO torrents ( INSERT INTO torrents (
@@ -228,68 +237,62 @@ ON DUPLICATE KEY UPDATE
# keeps the old value if the new one is NULL, # keeps the old value if the new one is NULL,
# but updates it if the old one was NULL and the new one is binary. # but updates it if the old one was NULL and the new one is binary.
# ============================================================ # ============================================================
# 7) PROCESS ALL PAGES # 8) PROCESS ALL PAGES AND CATEGORIES
# ============================================================ # ============================================================
TOTAL_PAGES = 226
for page_num in range(0, TOTAL_PAGES): for category_id, category_name in CATEGORIES.items():
current_url = f"{BASE_URL}&page={page_num}" print(f"\n📚 Starting category: {category_name} (ID: {category_id})")
print(f"\n🌐 Loading Page Index {page_num} (Page {page_num + 1}/{TOTAL_PAGES})")
driver.get(current_url) for page_num in range(0, MAX_PAGES):
time.sleep(2) current_url = get_torrent_url(category_id, page_num)
close_popup_if_any() print(f"\n🌐 Loading page {page_num}/{MAX_PAGES - 1} for {category_name}")
# Find table rows driver.get(current_url)
rows = driver.find_elements(By.CSS_SELECTOR, "table tr") time.sleep(2)
close_popup_if_any()
# FILTER: Only keep rows that have 7 columns AND a link in the 2nd column (index 1) # Find table rows
# This automatically discards headers and empty space rows. rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
real_rows = []
for r in rows:
cells = r.find_elements(By.TAG_NAME, "td")
if len(cells) == 7 and cells[1].find_elements(By.TAG_NAME, "a"):
real_rows.append(cells)
if not real_rows: # FILTER: Only keep rows that have 7 columns AND a link in the 2nd column (index 1)
print("⚠️ No data rows found on this page. Ending loop.") # This automatically discards headers and empty space rows.
break real_rows = []
for r in rows:
cells = r.find_elements(By.TAG_NAME, "td")
if len(cells) == 7 and cells[1].find_elements(By.TAG_NAME, "a"):
real_rows.append(cells)
# === INSERT THIS LINE HERE === if not real_rows:
page_new_items = 0 print(f"⚠️ No data rows found. Stopping pagination for {category_name}")
# ============================= break
for cells in real_rows: page_new_items = 0
try:
data = parse_row(cells)
# ... rest of your logic ...
except Exception as e:
print(f"⚠️ parse_row failed: {e}")
continue
if not data: continue for cells in real_rows:
processed_count += 1 try:
data = parse_row(cells)
except Exception as e:
print(f"⚠️ parse_row failed: {e}")
continue
if data["is_new_torrent"]: if not data: continue
new_torrent_count += 1 processed_count += 1
page_new_items += 1
new_titles.append(data["title_visible"])
print(f"💾 NEW: {data['title_visible']}")
else:
existing_torrent_count += 1
print(f"♻️ UPDATING: {data['title_visible']}")
cursor.execute(insert_sql, data) if data["is_new_torrent"]:
new_torrent_count += 1
page_new_items += 1
new_titles.append(data["title_visible"])
print(f"💾 NEW: {data['title_visible']}")
else:
existing_torrent_count += 1
print(f"♻️ UPDATING: {data['title_visible']}")
# # If an entire page is old news, we can stop the deep crawl cursor.execute(insert_sql, data)
# if page_new_items == 0 and page_num > 0:
# print("🛑 Page contained only known items. Sync complete.")
# break
time.sleep(1) time.sleep(1)
# ============================================================ # ============================================================
# 8) SEND EMAIL REPORT # 9) SEND EMAIL REPORT
# ============================================================ # ============================================================
RUN_END = datetime.datetime.now() RUN_END = datetime.datetime.now()
subject = f"SKTorrent run {RUN_START:%Y-%m-%d %H:%M}" subject = f"SKTorrent run {RUN_START:%Y-%m-%d %H:%M}"
+75 -65
View File
@@ -30,10 +30,11 @@ HOW_MANY_TO_CHECK = 0
COOKIE_FILE = Path("sktorrent_cookies.json") COOKIE_FILE = Path("sktorrent_cookies.json")
BASE_URL = ( CATEGORIES = {
"https://sktorrent.eu/torrent/torrents.php" 24: "Knihy a časopisy",
"?active=0&category=23&order=data&by=DESC" 32: "Mluvené slovo"
) }
MAX_PAGES = 10
SLEEP_BETWEEN_PAGES = 2.0 # pauza mezi stránkami SLEEP_BETWEEN_PAGES = 2.0 # pauza mezi stránkami
SLEEP_BEFORE_DOWNLOAD = 1.5 # pauza před stažením každého .torrent souboru SLEEP_BEFORE_DOWNLOAD = 1.5 # pauza před stažením každého .torrent souboru
@@ -221,6 +222,8 @@ def main():
else: else:
mode_desc = f"kontrola posledních {HOW_MANY_TO_CHECK} torrentů" mode_desc = f"kontrola posledních {HOW_MANY_TO_CHECK} torrentů"
print(f"Pořadí: nejnovější → nejstarší | {mode_desc}") print(f"Pořadí: nejnovější → nejstarší | {mode_desc}")
print(f"Kategorie: {', '.join(CATEGORIES.values())}")
print(f"Stránek na kategorii: {MAX_PAGES}")
print("=" * 60) print("=" * 60)
session = build_session() session = build_session()
@@ -230,79 +233,87 @@ def main():
new_count = 0 new_count = 0
checked_count = 0 checked_count = 0
skipped_count = 0 skipped_count = 0
page = 0
stop = False stop = False
while not stop: for category_id, category_name in CATEGORIES.items():
if stop:
url = f"{BASE_URL}&page={page}"
r = None
for attempt in range(1, 6):
try:
r = session.get(url, timeout=15)
r.raise_for_status()
break
except Exception as e:
if attempt < 5:
print(f"⚠️ Stránka {page} — pokus {attempt}/5 selhal: {e} — čekám 10s")
time.sleep(10)
else:
print(f"⚠️ Stránka {page} — všech 5 pokusů selhalo: {e}")
if r is None or not r.ok:
break break
if "login.php" in r.url or "Prihlas sa" in r.text: print(f"\n📚 Kategorie: {category_name} (ID: {category_id})")
print("❌ Cookies expiraly — spusť přihlašovací Selenium skript a obnov cookies.")
break
rows = parse_page(r.text) for page in range(MAX_PAGES):
if not rows: url = (
print(f" Stránka {page} — žádné záznamy, konec.") "https://sktorrent.eu/torrent/torrents.php"
break f"?active=0&category={category_id}&order=data&by=DESC&page={page}"
print(f"\n📄 Stránka {page} ({len(rows)} torrentů)")
for item in rows:
if HOW_MANY_TO_CHECK > 0 and checked_count >= HOW_MANY_TO_CHECK:
print(f" ⏹ Zkontrolováno {checked_count} torrentů — limit dosažen.")
stop = True
break
checked_count += 1
cursor.execute(
"SELECT 1 FROM torrents WHERE torrent_hash = %s",
(item["torrent_hash"],)
) )
exists = cursor.fetchone() r = None
for attempt in range(1, 6):
try:
r = session.get(url, timeout=15)
r.raise_for_status()
break
except Exception as e:
if attempt < 5:
print(f"⚠️ Stránka {page} — pokus {attempt}/5 selhal: {e} — čekám 10s")
time.sleep(10)
else:
print(f"⚠️ Stránka {page} — všech 5 pokusů selhalo: {e}")
if r is None or not r.ok:
break
if exists: if "login.php" in r.url or "Prihlas sa" in r.text:
if HOW_MANY_TO_CHECK == -1: print("❌ Cookies expiraly — spusť přihlašovací Selenium skript a obnov cookies.")
print(f" ⏹ Již v DB: {item['title_visible']} → zastavuji import.") break
rows = parse_page(r.text)
if not rows:
print(f" Stránka {page} — žádné záznamy, konec.")
break
print(f"\n📄 Stránka {page} ({len(rows)} torrentů)")
for item in rows:
if HOW_MANY_TO_CHECK > 0 and checked_count >= HOW_MANY_TO_CHECK:
print(f" ⏹ Zkontrolováno {checked_count} torrentů — limit dosažen.")
stop = True stop = True
break break
checked_count += 1
cursor.execute(
"SELECT 1 FROM torrents WHERE torrent_hash = %s",
(item["torrent_hash"],)
)
exists = cursor.fetchone()
if exists:
if HOW_MANY_TO_CHECK == -1:
print(f" ⏹ Již v DB: {item['title_visible']} → zastavuji import.")
stop = True
break
else:
skipped_count += 1
print(f" ⏭ Již v DB: {item['title_visible']} — přeskakuji")
continue
print(f" ⬇️ Nový: {item['title_visible']}")
time.sleep(SLEEP_BEFORE_DOWNLOAD)
content = download_torrent(session, item["download_url"])
if content:
print(f" ✔ Staženo ({len(content):,} B)")
else: else:
skipped_count += 1 print(f" ✖ Nepodařilo se stáhnout, ukládám bez obsahu")
print(f" ⏭ Již v DB: {item['title_visible']} — přeskakuji")
continue
print(f" ⬇️ Nový: {item['title_visible']}") item["torrent_content"] = content
time.sleep(SLEEP_BEFORE_DOWNLOAD) cursor.execute(INSERT_SQL, item)
new_count += 1
content = download_torrent(session, item["download_url"]) if stop:
if content: break
print(f" ✔ Staženo ({len(content):,} B)")
else:
print(f" ✖ Nepodařilo se stáhnout, ukládám bez obsahu")
item["torrent_content"] = content
cursor.execute(INSERT_SQL, item)
new_count += 1
if not stop:
page += 1
time.sleep(SLEEP_BETWEEN_PAGES) time.sleep(SLEEP_BETWEEN_PAGES)
# ============================================================ # ============================================================
@@ -314,7 +325,6 @@ def main():
print(f"Nových torrentů uloženo : {new_count}") print(f"Nových torrentů uloženo : {new_count}")
print(f"Zkontrolováno celkem : {checked_count}") print(f"Zkontrolováno celkem : {checked_count}")
print(f"Přeskočeno (v DB) : {skipped_count}") print(f"Přeskočeno (v DB) : {skipped_count}")
print(f"Stránek prošlo : {page}")
print("=" * 60) print("=" * 60)
db.close() db.close()