git
This commit is contained in:
@@ -96,135 +96,124 @@ def close_popup_if_any():
|
||||
|
||||
|
||||
# ============================================================
|
||||
# 5) Parse one torrent row
|
||||
# 5) Parse one torrent row (MODIFIED)
|
||||
# ============================================================
|
||||
|
||||
def parse_row(cells):
|
||||
# Column 0: Category icon/text
|
||||
# --- 1. INITIALIZE ---
|
||||
torrent_hash = None
|
||||
download_url = None
|
||||
category = cells[0].text.strip()
|
||||
|
||||
try:
|
||||
# Column 1: Download icon link
|
||||
# --- 2. EXTRACT DOWNLOAD URL (Column 1) ---
|
||||
download_a = cells[1].find_element(By.TAG_NAME, "a")
|
||||
download_link = download_a.get_attribute("href")
|
||||
except:
|
||||
return None
|
||||
download_url = download_a.get_attribute("href")
|
||||
|
||||
parsed_dl = urlparse.urlparse(download_link)
|
||||
dl_query = urlparse.parse_qs(parsed_dl.query)
|
||||
torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]
|
||||
parsed_dl = urlparse.urlparse(download_url)
|
||||
dl_query = urlparse.parse_qs(parsed_dl.query)
|
||||
torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]
|
||||
|
||||
# Column 2: Name and info
|
||||
title_links = cells[2].find_elements(By.TAG_NAME, "a")
|
||||
if not title_links:
|
||||
return None
|
||||
# --- 3. EXTRACT DETAILS & HASH (Column 2) ---
|
||||
title_links = cells[2].find_elements(By.TAG_NAME, "a")
|
||||
if not title_links:
|
||||
return None
|
||||
|
||||
a_tag = title_links[0]
|
||||
visible_name = a_tag.text.strip()
|
||||
full_title = a_tag.get_attribute("title")
|
||||
details_link = a_tag.get_attribute("href")
|
||||
a_tag = title_links[0]
|
||||
visible_name = a_tag.text.strip()
|
||||
full_title = a_tag.get_attribute("title")
|
||||
details_link = a_tag.get_attribute("href")
|
||||
|
||||
parsed = urlparse.urlparse(details_link)
|
||||
query = urlparse.parse_qs(parsed.query)
|
||||
if "id" not in query:
|
||||
return None
|
||||
parsed = urlparse.urlparse(details_link)
|
||||
query = urlparse.parse_qs(parsed.query)
|
||||
if "id" not in query:
|
||||
return None
|
||||
|
||||
torrent_hash = query["id"][0]
|
||||
torrent_hash = query["id"][0]
|
||||
|
||||
# Use innerText for robust text extraction
|
||||
text_block = cells[2].get_attribute("innerText")
|
||||
text_block_clean = " ".join(text_block.split())
|
||||
# --- 4. EXTRACT SIZE & DATE ---
|
||||
text_block = cells[2].get_attribute("innerText")
|
||||
text_block_clean = " ".join(text_block.split())
|
||||
size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE)
|
||||
added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE)
|
||||
size_pretty = size_match.group(1) if size_match else None
|
||||
added_pretty = added_match.group(1) if added_match else None
|
||||
|
||||
# Regex for Size and Date
|
||||
size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE)
|
||||
added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE)
|
||||
added_mysql = None
|
||||
if added_pretty:
|
||||
clean = added_pretty.replace(" o ", " ").strip()
|
||||
parts = clean.split(" ")
|
||||
if len(parts) >= 2:
|
||||
date_part, time_part = parts[0], parts[1]
|
||||
if len(time_part.split(":")) == 2: time_part += ":00"
|
||||
try:
|
||||
d, m, y = date_part.split("/")
|
||||
added_mysql = f"{y}-{m}-{d} {time_part}"
|
||||
except: pass
|
||||
|
||||
size_pretty = size_match.group(1) if size_match else None
|
||||
added_pretty = added_match.group(1) if added_match else None
|
||||
|
||||
# Date conversion: "29/11/2025 o 02:29" -> MySQL format
|
||||
added_mysql = None
|
||||
if added_pretty:
|
||||
clean = added_pretty.replace(" o ", " ").strip()
|
||||
parts = clean.split(" ")
|
||||
if len(parts) >= 2:
|
||||
date_part = parts[0]
|
||||
time_part = parts[1]
|
||||
if len(time_part.split(":")) == 2:
|
||||
time_part += ":00"
|
||||
try:
|
||||
day, month, year = date_part.split("/")
|
||||
added_mysql = f"{year}-{month}-{day} {time_part}"
|
||||
except:
|
||||
added_mysql = None
|
||||
|
||||
# Column 2: Image preview (if exists)
|
||||
img_link = None
|
||||
try:
|
||||
image_a = cells[2].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
|
||||
mouseover = image_a.get_attribute("onmouseover")
|
||||
img_match = re.search(r"src=([^ ]+)", mouseover)
|
||||
if img_match:
|
||||
img_link = img_match.group(1).replace("'", "").strip()
|
||||
if img_link.startswith("//"):
|
||||
img_link = "https:" + img_link
|
||||
except:
|
||||
pass
|
||||
|
||||
# Column 4: Seeders
|
||||
seeders_a = cells[4].find_element(By.TAG_NAME, "a")
|
||||
seeders_number = int(seeders_a.text.strip())
|
||||
seeders_link = seeders_a.get_attribute("href")
|
||||
|
||||
# Column 5: Leechers
|
||||
leechers_a = cells[5].find_element(By.TAG_NAME, "a")
|
||||
leechers_number = int(leechers_a.text.strip())
|
||||
leechers_link = leechers_a.get_attribute("href")
|
||||
|
||||
# Check database for existing binary content
|
||||
cursor.execute("SELECT torrent_content FROM torrents WHERE torrent_hash=%s", (torrent_hash,))
|
||||
row = cursor.fetchone()
|
||||
already_have_torrent = row is not None and row[0] is not None
|
||||
|
||||
torrent_content = None
|
||||
if not already_have_torrent:
|
||||
time.sleep(3) # Politeness delay
|
||||
# --- 5. IMAGE & STATS ---
|
||||
img_link = None
|
||||
try:
|
||||
resp = requests_session.get(download_link)
|
||||
resp.raise_for_status()
|
||||
torrent_content = resp.content
|
||||
except:
|
||||
torrent_content = None
|
||||
image_a = cells[2].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
|
||||
mouseover = image_a.get_attribute("onmouseover")
|
||||
img_match = re.search(r"src=([^ ]+)", mouseover)
|
||||
if img_match:
|
||||
img_link = img_match.group(1).replace("'", "").strip()
|
||||
if img_link.startswith("//"): img_link = "https:" + img_link
|
||||
except: pass
|
||||
|
||||
return {
|
||||
"torrent_hash": torrent_hash,
|
||||
"details_link": details_link,
|
||||
"category": category,
|
||||
"title_visible": visible_name,
|
||||
"title_full": full_title,
|
||||
"size_pretty": size_pretty,
|
||||
"added_datetime": added_mysql,
|
||||
"preview_image": img_link,
|
||||
"seeders": seeders_number,
|
||||
"seeders_link": seeders_link,
|
||||
"leechers": leechers_number,
|
||||
"leechers_link": leechers_link,
|
||||
"torrent_filename": torrent_filename,
|
||||
"torrent_content": torrent_content if not already_have_torrent else None,
|
||||
"is_new_torrent": not already_have_torrent,
|
||||
}
|
||||
seeders_number = int(cells[4].find_element(By.TAG_NAME, "a").text.strip())
|
||||
seeders_link = cells[4].find_element(By.TAG_NAME, "a").get_attribute("href")
|
||||
leechers_number = int(cells[5].find_element(By.TAG_NAME, "a").text.strip())
|
||||
leechers_link = cells[5].find_element(By.TAG_NAME, "a").get_attribute("href")
|
||||
|
||||
# --- 6. DATABASE CHECK & DOWNLOAD ---
|
||||
cursor.execute("SELECT torrent_content FROM torrents WHERE torrent_hash=%s", (torrent_hash,))
|
||||
db_row = cursor.fetchone()
|
||||
already_have_torrent = db_row is not None and db_row[0] is not None
|
||||
|
||||
torrent_content = None
|
||||
if not already_have_torrent:
|
||||
time.sleep(2)
|
||||
try:
|
||||
resp = requests_session.get(download_url, timeout=10)
|
||||
resp.raise_for_status()
|
||||
torrent_content = resp.content
|
||||
except Exception as e:
|
||||
print(f" ⚠️ Download failed for {visible_name}: {e}")
|
||||
|
||||
return {
|
||||
"torrent_hash": torrent_hash,
|
||||
"details_link": details_link,
|
||||
"download_url": download_url,
|
||||
"category": category,
|
||||
"title_visible": visible_name,
|
||||
"title_full": full_title,
|
||||
"size_pretty": size_pretty,
|
||||
"added_datetime": added_mysql,
|
||||
"preview_image": img_link,
|
||||
"seeders": seeders_number,
|
||||
"seeders_link": seeders_link,
|
||||
"leechers": leechers_number,
|
||||
"leechers_link": leechers_link,
|
||||
"torrent_filename": torrent_filename,
|
||||
"torrent_content": torrent_content if not already_have_torrent else None,
|
||||
"is_new_torrent": not already_have_torrent,
|
||||
}
|
||||
except Exception as e:
|
||||
print(f"⚠️ parse_row logic failed: {e}")
|
||||
return None
|
||||
# ============================================================
|
||||
# 6) INSERT SQL
|
||||
# 6) INSERT SQL (MODIFIED)
|
||||
# ============================================================
|
||||
insert_sql = """
|
||||
INSERT INTO torrents (
|
||||
torrent_hash, details_link, category, title_visible, title_full,
|
||||
torrent_hash, details_link, download_url, category, title_visible, title_full,
|
||||
size_pretty, added_datetime, preview_image,
|
||||
seeders, seeders_link, leechers, leechers_link,
|
||||
torrent_filename, torrent_content
|
||||
) VALUES (
|
||||
%(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
|
||||
%(torrent_hash)s, %(details_link)s, %(download_url)s, %(category)s, %(title_visible)s, %(title_full)s,
|
||||
%(size_pretty)s, %(added_datetime)s, %(preview_image)s,
|
||||
%(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
|
||||
%(torrent_filename)s, %(torrent_content)s
|
||||
@@ -232,9 +221,12 @@ INSERT INTO torrents (
|
||||
ON DUPLICATE KEY UPDATE
|
||||
seeders = VALUES(seeders),
|
||||
leechers = VALUES(leechers),
|
||||
download_url = VALUES(download_url),
|
||||
torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
|
||||
"""
|
||||
|
||||
# Note: COALESCE(VALUES(torrent_content), torrent_content)
|
||||
# keeps the old value if the new one is NULL,
|
||||
# but updates it if the old one was NULL and the new one is binary.
|
||||
# ============================================================
|
||||
# 7) PROCESS ALL PAGES
|
||||
# ============================================================
|
||||
@@ -250,17 +242,27 @@ for page_num in range(0, TOTAL_PAGES):
|
||||
|
||||
# Find table rows
|
||||
rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
|
||||
# v1 table usually has 7 cells for a data row
|
||||
real_rows = [r.find_elements(By.TAG_NAME, "td") for r in rows if len(r.find_elements(By.TAG_NAME, "td")) == 7]
|
||||
|
||||
# FILTER: Only keep rows that have 7 columns AND a link in the 2nd column (index 1)
|
||||
# This automatically discards headers and empty space rows.
|
||||
real_rows = []
|
||||
for r in rows:
|
||||
cells = r.find_elements(By.TAG_NAME, "td")
|
||||
if len(cells) == 7 and cells[1].find_elements(By.TAG_NAME, "a"):
|
||||
real_rows.append(cells)
|
||||
|
||||
if not real_rows:
|
||||
print("⚠️ No data rows found on this page. Ending loop.")
|
||||
break
|
||||
|
||||
# === Per-page counter of new (previously unknown) items, reset for each page ===
|
||||
page_new_items = 0
|
||||
# =============================
|
||||
|
||||
for cells in real_rows:
|
||||
try:
|
||||
data = parse_row(cells)
|
||||
# ... rest of your logic ...
|
||||
except Exception as e:
|
||||
print(f"⚠️ parse_row failed: {e}")
|
||||
continue
|
||||
@@ -279,10 +281,10 @@ for page_num in range(0, TOTAL_PAGES):
|
||||
|
||||
cursor.execute(insert_sql, data)
|
||||
|
||||
# If an entire page is old news, we can stop the deep crawl
|
||||
if page_new_items == 0 and page_num > 0:
|
||||
print("🛑 Page contained only known items. Sync complete.")
|
||||
break
|
||||
# # If an entire page is old news, we can stop the deep crawl
|
||||
# if page_new_items == 0 and page_num > 0:
|
||||
# print("🛑 Page contained only known items. Sync complete.")
|
||||
# break
|
||||
|
||||
time.sleep(1)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user