2026-02-01 07:18:20 +01:00
parent 7b0404bfe3
commit 3d11661997
7 changed files with 1074 additions and 116 deletions


@@ -96,135 +96,124 @@ def close_popup_if_any():
# ============================================================
# 5) Parse one torrent row (MODIFIED)
# ============================================================
def parse_row(cells):
    # --- 1. INITIALIZE ---
    torrent_hash = None
    download_url = None
    category = cells[0].text.strip()  # Column 0: Category icon/text

    try:
        # --- 2. EXTRACT DOWNLOAD URL (Column 1) ---
        download_a = cells[1].find_element(By.TAG_NAME, "a")
        download_url = download_a.get_attribute("href")

        parsed_dl = urlparse.urlparse(download_url)
        dl_query = urlparse.parse_qs(parsed_dl.query)
        torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]

        # --- 3. EXTRACT DETAILS & HASH (Column 2) ---
        title_links = cells[2].find_elements(By.TAG_NAME, "a")
        if not title_links:
            return None

        a_tag = title_links[0]
        visible_name = a_tag.text.strip()
        full_title = a_tag.get_attribute("title")
        details_link = a_tag.get_attribute("href")

        parsed = urlparse.urlparse(details_link)
        query = urlparse.parse_qs(parsed.query)
        if "id" not in query:
            return None
        torrent_hash = query["id"][0]

        # --- 4. EXTRACT SIZE & DATE ---
        # Use innerText for robust text extraction
        text_block = cells[2].get_attribute("innerText")
        text_block_clean = " ".join(text_block.split())

        size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE)
        added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE)
        size_pretty = size_match.group(1) if size_match else None
        added_pretty = added_match.group(1) if added_match else None

        # Date conversion: "29/11/2025 o 02:29" -> MySQL format
        added_mysql = None
        if added_pretty:
            clean = added_pretty.replace(" o ", " ").strip()
            parts = clean.split(" ")
            if len(parts) >= 2:
                date_part, time_part = parts[0], parts[1]
                if len(time_part.split(":")) == 2:
                    time_part += ":00"
                try:
                    day, month, year = date_part.split("/")
                    added_mysql = f"{year}-{month}-{day} {time_part}"
                except ValueError:
                    pass

        # --- 5. IMAGE & STATS ---
        # Column 2: Image preview (if it exists)
        img_link = None
        try:
            image_a = cells[2].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
            mouseover = image_a.get_attribute("onmouseover")
            img_match = re.search(r"src=([^ ]+)", mouseover)
            if img_match:
                img_link = img_match.group(1).replace("'", "").strip()
                if img_link.startswith("//"):
                    img_link = "https:" + img_link
        except Exception:
            pass

        # Columns 4 and 5: Seeders and Leechers
        seeders_number = int(cells[4].find_element(By.TAG_NAME, "a").text.strip())
        seeders_link = cells[4].find_element(By.TAG_NAME, "a").get_attribute("href")
        leechers_number = int(cells[5].find_element(By.TAG_NAME, "a").text.strip())
        leechers_link = cells[5].find_element(By.TAG_NAME, "a").get_attribute("href")

        # --- 6. DATABASE CHECK & DOWNLOAD ---
        # Only fetch the .torrent file if we don't already hold its binary content
        cursor.execute("SELECT torrent_content FROM torrents WHERE torrent_hash=%s", (torrent_hash,))
        db_row = cursor.fetchone()
        already_have_torrent = db_row is not None and db_row[0] is not None

        torrent_content = None
        if not already_have_torrent:
            time.sleep(2)  # Politeness delay
            try:
                resp = requests_session.get(download_url, timeout=10)
                resp.raise_for_status()
                torrent_content = resp.content
            except Exception as e:
                print(f" ⚠️ Download failed for {visible_name}: {e}")

        return {
            "torrent_hash": torrent_hash,
            "details_link": details_link,
            "download_url": download_url,
            "category": category,
            "title_visible": visible_name,
            "title_full": full_title,
            "size_pretty": size_pretty,
            "added_datetime": added_mysql,
            "preview_image": img_link,
            "seeders": seeders_number,
            "seeders_link": seeders_link,
            "leechers": leechers_number,
            "leechers_link": leechers_link,
            "torrent_filename": torrent_filename,
            "torrent_content": torrent_content if not already_have_torrent else None,
            "is_new_torrent": not already_have_torrent,
        }

    except Exception as e:
        print(f"⚠️ parse_row logic failed: {e}")
        return None
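The hand-rolled split in step 4 works, but the same conversion can be expressed more strictly with datetime.strptime, which rejects malformed dates instead of silently reassembling them. A minimal sketch, not part of this commit, assuming the site always renders dates as "DD/MM/YYYY o HH:MM" (the helper name is hypothetical):

from datetime import datetime

def to_mysql_datetime(added_pretty):
    # Hypothetical helper: parse "29/11/2025 o 02:29" and emit
    # "2025-11-29 02:29:00"; returns None on any format mismatch.
    try:
        dt = datetime.strptime(added_pretty.strip(), "%d/%m/%Y o %H:%M")
        return dt.strftime("%Y-%m-%d %H:%M:%S")
    except ValueError:
        return None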
# ============================================================
# 6) INSERT SQL (MODIFIED)
# ============================================================
insert_sql = """
INSERT INTO torrents (
    torrent_hash, details_link, download_url, category, title_visible, title_full,
    size_pretty, added_datetime, preview_image,
    seeders, seeders_link, leechers, leechers_link,
    torrent_filename, torrent_content
) VALUES (
    %(torrent_hash)s, %(details_link)s, %(download_url)s, %(category)s, %(title_visible)s, %(title_full)s,
    %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
    %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
    %(torrent_filename)s, %(torrent_content)s
@@ -232,9 +221,12 @@ INSERT INTO torrents (
ON DUPLICATE KEY UPDATE
    seeders = VALUES(seeders),
    leechers = VALUES(leechers),
    download_url = VALUES(download_url),
    torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
"""
# Note: COALESCE(VALUES(torrent_content), torrent_content)
# keeps the old value if the new one is NULL,
# but updates it if the old one was NULL and the new one is binary.
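For clarity, here is the same keep-or-overwrite rule expressed in plain Python (a sketch, not part of the script):

def coalesce_update(new_value, old_value):
    # Mirrors COALESCE(VALUES(torrent_content), torrent_content):
    # a None download leaves the stored blob untouched,
    # while a real payload replaces whatever was there before.
    return new_value if new_value is not None else old_value

assert coalesce_update(None, b"old") == b"old"    # failed re-download keeps the old blob
assert coalesce_update(b"new", None) == b"new"    # first successful download fills the gap
assert coalesce_update(b"new", b"old") == b"new"  # a fresh payload wins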
# ============================================================
# 7) PROCESS ALL PAGES
# ============================================================
@@ -250,17 +242,27 @@ for page_num in range(0, TOTAL_PAGES):
    # Find table rows
    rows = driver.find_elements(By.CSS_SELECTOR, "table tr")

    # FILTER: a v1 data row has 7 cells, so only keep rows with 7 columns
    # AND a link in the 2nd column (index 1). This automatically discards
    # header rows and empty spacer rows.
    real_rows = []
    for r in rows:
        cells = r.find_elements(By.TAG_NAME, "td")
        if len(cells) == 7 and cells[1].find_elements(By.TAG_NAME, "a"):
            real_rows.append(cells)

    if not real_rows:
        print("⚠️ No data rows found on this page. Ending loop.")
        break
    # Track how many previously-unseen torrents this page yields
    page_new_items = 0
    for cells in real_rows:
        try:
            data = parse_row(cells)
            # ... rest of your logic ...
        except Exception as e:
            print(f"⚠️ parse_row failed: {e}")
            continue
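The elided body above is where the per-page counter would be bumped. A minimal sketch of that step, using the is_new_torrent flag parse_row already returns (indented as it would sit inside the loop):

            if data is None:
                continue                  # row was unparseable, skip it
            if data["is_new_torrent"]:
                page_new_items += 1       # feeds the (currently disabled) early-stop check below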
@@ -279,10 +281,10 @@ for page_num in range(0, TOTAL_PAGES):
            cursor.execute(insert_sql, data)

    # # If an entire page is old news, we can stop the deep crawl
    # if page_new_items == 0 and page_num > 0:
    #     print("🛑 Page contained only known items. Sync complete.")
    #     break

    time.sleep(1)