git
This commit is contained in:
@@ -47,11 +47,12 @@ cursor = db.cursor()
|
|||||||
# 2) Selenium setup
|
# 2) Selenium setup
|
||||||
# ============================================================
|
# ============================================================
|
||||||
COOKIE_FILE = Path("sktorrent_cookies.json")
|
COOKIE_FILE = Path("sktorrent_cookies.json")
|
||||||
# Updated to standard torrents.php as requested
|
|
||||||
BASE_URL = (
|
CATEGORIES = {
|
||||||
"https://sktorrent.eu/torrent/torrents.php"
|
24: "Knihy a časopisy",
|
||||||
"?active=0&category=24&order=data&by=DESC&zaner=&jazyk="
|
32: "Mluvené slovo"
|
||||||
)
|
}
|
||||||
|
MAX_PAGES = 10
|
||||||
|
|
||||||
chrome_options = Options()
|
chrome_options = Options()
|
||||||
chrome_options.add_argument("--start-maximized")
|
chrome_options.add_argument("--start-maximized")
|
||||||
@@ -85,7 +86,15 @@ print("🔗 Requests session initialized.")
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# 4) Popup handler
|
# 4) Build URL for category and page
|
||||||
|
# ============================================================
|
||||||
|
def get_torrent_url(category, page):
|
||||||
|
base = "https://sktorrent.eu/torrent/torrents.php"
|
||||||
|
return f"{base}?active=0&category={category}&order=data&by=DESC&zaner=&jazyk=&page={page}"
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================
|
||||||
|
# 5) Popup handler
|
||||||
# ============================================================
|
# ============================================================
|
||||||
def close_popup_if_any():
|
def close_popup_if_any():
|
||||||
try:
|
try:
|
||||||
@@ -96,7 +105,7 @@ def close_popup_if_any():
|
|||||||
|
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# 5) Parse one torrent row (MODIFIED)
|
# 6) Parse one torrent row (MODIFIED)
|
||||||
# ============================================================
|
# ============================================================
|
||||||
|
|
||||||
def parse_row(cells):
|
def parse_row(cells):
|
||||||
@@ -204,7 +213,7 @@ def parse_row(cells):
|
|||||||
print(f"⚠️ parse_row logic failed: {e}")
|
print(f"⚠️ parse_row logic failed: {e}")
|
||||||
return None
|
return None
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# 6) INSERT SQL (MODIFIED)
|
# 7) INSERT SQL (MODIFIED)
|
||||||
# ============================================================
|
# ============================================================
|
||||||
insert_sql = """
|
insert_sql = """
|
||||||
INSERT INTO torrents (
|
INSERT INTO torrents (
|
||||||
@@ -228,68 +237,62 @@ ON DUPLICATE KEY UPDATE
|
|||||||
# keeps the old value if the new one is NULL,
|
# keeps the old value if the new one is NULL,
|
||||||
# but updates it if the old one was NULL and the new one is binary.
|
# but updates it if the old one was NULL and the new one is binary.
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# 7) PROCESS ALL PAGES
|
# 8) PROCESS ALL PAGES AND CATEGORIES
|
||||||
# ============================================================
|
# ============================================================
|
||||||
TOTAL_PAGES = 226
|
|
||||||
|
|
||||||
for page_num in range(0, TOTAL_PAGES):
|
for category_id, category_name in CATEGORIES.items():
|
||||||
current_url = f"{BASE_URL}&page={page_num}"
|
print(f"\n📚 Starting category: {category_name} (ID: {category_id})")
|
||||||
print(f"\n🌐 Loading Page Index {page_num} (Page {page_num + 1}/{TOTAL_PAGES})")
|
|
||||||
|
|
||||||
driver.get(current_url)
|
for page_num in range(0, MAX_PAGES):
|
||||||
time.sleep(2)
|
current_url = get_torrent_url(category_id, page_num)
|
||||||
close_popup_if_any()
|
print(f"\n🌐 Loading page {page_num}/{MAX_PAGES - 1} for {category_name}")
|
||||||
|
|
||||||
# Find table rows
|
driver.get(current_url)
|
||||||
rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
|
time.sleep(2)
|
||||||
|
close_popup_if_any()
|
||||||
|
|
||||||
# FILTER: Only keep rows that have 7 columns AND a link in the 2nd column (index 1)
|
# Find table rows
|
||||||
# This automatically discards headers and empty space rows.
|
rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
|
||||||
real_rows = []
|
|
||||||
for r in rows:
|
|
||||||
cells = r.find_elements(By.TAG_NAME, "td")
|
|
||||||
if len(cells) == 7 and cells[1].find_elements(By.TAG_NAME, "a"):
|
|
||||||
real_rows.append(cells)
|
|
||||||
|
|
||||||
if not real_rows:
|
# FILTER: Only keep rows that have 7 columns AND a link in the 2nd column (index 1)
|
||||||
print("⚠️ No data rows found on this page. Ending loop.")
|
# This automatically discards headers and empty space rows.
|
||||||
break
|
real_rows = []
|
||||||
|
for r in rows:
|
||||||
|
cells = r.find_elements(By.TAG_NAME, "td")
|
||||||
|
if len(cells) == 7 and cells[1].find_elements(By.TAG_NAME, "a"):
|
||||||
|
real_rows.append(cells)
|
||||||
|
|
||||||
# === INSERT THIS LINE HERE ===
|
if not real_rows:
|
||||||
page_new_items = 0
|
print(f"⚠️ No data rows found. Stopping pagination for {category_name}")
|
||||||
# =============================
|
break
|
||||||
|
|
||||||
for cells in real_rows:
|
page_new_items = 0
|
||||||
try:
|
|
||||||
data = parse_row(cells)
|
|
||||||
# ... rest of your logic ...
|
|
||||||
except Exception as e:
|
|
||||||
print(f"⚠️ parse_row failed: {e}")
|
|
||||||
continue
|
|
||||||
|
|
||||||
if not data: continue
|
for cells in real_rows:
|
||||||
processed_count += 1
|
try:
|
||||||
|
data = parse_row(cells)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"⚠️ parse_row failed: {e}")
|
||||||
|
continue
|
||||||
|
|
||||||
if data["is_new_torrent"]:
|
if not data: continue
|
||||||
new_torrent_count += 1
|
processed_count += 1
|
||||||
page_new_items += 1
|
|
||||||
new_titles.append(data["title_visible"])
|
|
||||||
print(f"💾 NEW: {data['title_visible']}")
|
|
||||||
else:
|
|
||||||
existing_torrent_count += 1
|
|
||||||
print(f"♻️ UPDATING: {data['title_visible']}")
|
|
||||||
|
|
||||||
cursor.execute(insert_sql, data)
|
if data["is_new_torrent"]:
|
||||||
|
new_torrent_count += 1
|
||||||
|
page_new_items += 1
|
||||||
|
new_titles.append(data["title_visible"])
|
||||||
|
print(f"💾 NEW: {data['title_visible']}")
|
||||||
|
else:
|
||||||
|
existing_torrent_count += 1
|
||||||
|
print(f"♻️ UPDATING: {data['title_visible']}")
|
||||||
|
|
||||||
# # If an entire page is old news, we can stop the deep crawl
|
cursor.execute(insert_sql, data)
|
||||||
# if page_new_items == 0 and page_num > 0:
|
|
||||||
# print("🛑 Page contained only known items. Sync complete.")
|
|
||||||
# break
|
|
||||||
|
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
# 8) SEND EMAIL REPORT
|
# 9) SEND EMAIL REPORT
|
||||||
# ============================================================
|
# ============================================================
|
||||||
RUN_END = datetime.datetime.now()
|
RUN_END = datetime.datetime.now()
|
||||||
subject = f"SKTorrent run – {RUN_START:%Y-%m-%d %H:%M}"
|
subject = f"SKTorrent run – {RUN_START:%Y-%m-%d %H:%M}"
|
||||||
|
|||||||
@@ -30,10 +30,11 @@ HOW_MANY_TO_CHECK = 0
|
|||||||
|
|
||||||
COOKIE_FILE = Path("sktorrent_cookies.json")
|
COOKIE_FILE = Path("sktorrent_cookies.json")
|
||||||
|
|
||||||
BASE_URL = (
|
CATEGORIES = {
|
||||||
"https://sktorrent.eu/torrent/torrents.php"
|
24: "Knihy a časopisy",
|
||||||
"?active=0&category=23&order=data&by=DESC"
|
32: "Mluvené slovo"
|
||||||
)
|
}
|
||||||
|
MAX_PAGES = 10
|
||||||
|
|
||||||
SLEEP_BETWEEN_PAGES = 2.0 # pauza mezi stránkami
|
SLEEP_BETWEEN_PAGES = 2.0 # pauza mezi stránkami
|
||||||
SLEEP_BEFORE_DOWNLOAD = 1.5 # pauza před stažením každého .torrent souboru
|
SLEEP_BEFORE_DOWNLOAD = 1.5 # pauza před stažením každého .torrent souboru
|
||||||
@@ -221,6 +222,8 @@ def main():
|
|||||||
else:
|
else:
|
||||||
mode_desc = f"kontrola posledních {HOW_MANY_TO_CHECK} torrentů"
|
mode_desc = f"kontrola posledních {HOW_MANY_TO_CHECK} torrentů"
|
||||||
print(f"Pořadí: nejnovější → nejstarší | {mode_desc}")
|
print(f"Pořadí: nejnovější → nejstarší | {mode_desc}")
|
||||||
|
print(f"Kategorie: {', '.join(CATEGORIES.values())}")
|
||||||
|
print(f"Stránek na kategorii: {MAX_PAGES}")
|
||||||
print("=" * 60)
|
print("=" * 60)
|
||||||
|
|
||||||
session = build_session()
|
session = build_session()
|
||||||
@@ -230,79 +233,87 @@ def main():
|
|||||||
new_count = 0
|
new_count = 0
|
||||||
checked_count = 0
|
checked_count = 0
|
||||||
skipped_count = 0
|
skipped_count = 0
|
||||||
page = 0
|
|
||||||
stop = False
|
stop = False
|
||||||
|
|
||||||
while not stop:
|
for category_id, category_name in CATEGORIES.items():
|
||||||
|
if stop:
|
||||||
url = f"{BASE_URL}&page={page}"
|
|
||||||
r = None
|
|
||||||
for attempt in range(1, 6):
|
|
||||||
try:
|
|
||||||
r = session.get(url, timeout=15)
|
|
||||||
r.raise_for_status()
|
|
||||||
break
|
|
||||||
except Exception as e:
|
|
||||||
if attempt < 5:
|
|
||||||
print(f"⚠️ Stránka {page} — pokus {attempt}/5 selhal: {e} — čekám 10s")
|
|
||||||
time.sleep(10)
|
|
||||||
else:
|
|
||||||
print(f"⚠️ Stránka {page} — všech 5 pokusů selhalo: {e}")
|
|
||||||
if r is None or not r.ok:
|
|
||||||
break
|
break
|
||||||
|
|
||||||
if "login.php" in r.url or "Prihlas sa" in r.text:
|
print(f"\n📚 Kategorie: {category_name} (ID: {category_id})")
|
||||||
print("❌ Cookies expiraly — spusť přihlašovací Selenium skript a obnov cookies.")
|
|
||||||
break
|
|
||||||
|
|
||||||
rows = parse_page(r.text)
|
for page in range(MAX_PAGES):
|
||||||
|
|
||||||
if not rows:
|
url = (
|
||||||
print(f" Stránka {page} — žádné záznamy, konec.")
|
"https://sktorrent.eu/torrent/torrents.php"
|
||||||
break
|
f"?active=0&category={category_id}&order=data&by=DESC&page={page}"
|
||||||
|
|
||||||
print(f"\n📄 Stránka {page} ({len(rows)} torrentů)")
|
|
||||||
|
|
||||||
for item in rows:
|
|
||||||
|
|
||||||
if HOW_MANY_TO_CHECK > 0 and checked_count >= HOW_MANY_TO_CHECK:
|
|
||||||
print(f" ⏹ Zkontrolováno {checked_count} torrentů — limit dosažen.")
|
|
||||||
stop = True
|
|
||||||
break
|
|
||||||
|
|
||||||
checked_count += 1
|
|
||||||
|
|
||||||
cursor.execute(
|
|
||||||
"SELECT 1 FROM torrents WHERE torrent_hash = %s",
|
|
||||||
(item["torrent_hash"],)
|
|
||||||
)
|
)
|
||||||
exists = cursor.fetchone()
|
r = None
|
||||||
|
for attempt in range(1, 6):
|
||||||
|
try:
|
||||||
|
r = session.get(url, timeout=15)
|
||||||
|
r.raise_for_status()
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
if attempt < 5:
|
||||||
|
print(f"⚠️ Stránka {page} — pokus {attempt}/5 selhal: {e} — čekám 10s")
|
||||||
|
time.sleep(10)
|
||||||
|
else:
|
||||||
|
print(f"⚠️ Stránka {page} — všech 5 pokusů selhalo: {e}")
|
||||||
|
if r is None or not r.ok:
|
||||||
|
break
|
||||||
|
|
||||||
if exists:
|
if "login.php" in r.url or "Prihlas sa" in r.text:
|
||||||
if HOW_MANY_TO_CHECK == -1:
|
print("❌ Cookies expiraly — spusť přihlašovací Selenium skript a obnov cookies.")
|
||||||
print(f" ⏹ Již v DB: {item['title_visible']} → zastavuji import.")
|
break
|
||||||
|
|
||||||
|
rows = parse_page(r.text)
|
||||||
|
|
||||||
|
if not rows:
|
||||||
|
print(f" Stránka {page} — žádné záznamy, konec.")
|
||||||
|
break
|
||||||
|
|
||||||
|
print(f"\n📄 Stránka {page} ({len(rows)} torrentů)")
|
||||||
|
|
||||||
|
for item in rows:
|
||||||
|
|
||||||
|
if HOW_MANY_TO_CHECK > 0 and checked_count >= HOW_MANY_TO_CHECK:
|
||||||
|
print(f" ⏹ Zkontrolováno {checked_count} torrentů — limit dosažen.")
|
||||||
stop = True
|
stop = True
|
||||||
break
|
break
|
||||||
|
|
||||||
|
checked_count += 1
|
||||||
|
|
||||||
|
cursor.execute(
|
||||||
|
"SELECT 1 FROM torrents WHERE torrent_hash = %s",
|
||||||
|
(item["torrent_hash"],)
|
||||||
|
)
|
||||||
|
exists = cursor.fetchone()
|
||||||
|
|
||||||
|
if exists:
|
||||||
|
if HOW_MANY_TO_CHECK == -1:
|
||||||
|
print(f" ⏹ Již v DB: {item['title_visible']} → zastavuji import.")
|
||||||
|
stop = True
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
skipped_count += 1
|
||||||
|
print(f" ⏭ Již v DB: {item['title_visible']} — přeskakuji")
|
||||||
|
continue
|
||||||
|
|
||||||
|
print(f" ⬇️ Nový: {item['title_visible']}")
|
||||||
|
time.sleep(SLEEP_BEFORE_DOWNLOAD)
|
||||||
|
|
||||||
|
content = download_torrent(session, item["download_url"])
|
||||||
|
if content:
|
||||||
|
print(f" ✔ Staženo ({len(content):,} B)")
|
||||||
else:
|
else:
|
||||||
skipped_count += 1
|
print(f" ✖ Nepodařilo se stáhnout, ukládám bez obsahu")
|
||||||
print(f" ⏭ Již v DB: {item['title_visible']} — přeskakuji")
|
|
||||||
continue
|
|
||||||
|
|
||||||
print(f" ⬇️ Nový: {item['title_visible']}")
|
item["torrent_content"] = content
|
||||||
time.sleep(SLEEP_BEFORE_DOWNLOAD)
|
cursor.execute(INSERT_SQL, item)
|
||||||
|
new_count += 1
|
||||||
|
|
||||||
content = download_torrent(session, item["download_url"])
|
if stop:
|
||||||
if content:
|
break
|
||||||
print(f" ✔ Staženo ({len(content):,} B)")
|
|
||||||
else:
|
|
||||||
print(f" ✖ Nepodařilo se stáhnout, ukládám bez obsahu")
|
|
||||||
|
|
||||||
item["torrent_content"] = content
|
|
||||||
cursor.execute(INSERT_SQL, item)
|
|
||||||
new_count += 1
|
|
||||||
|
|
||||||
if not stop:
|
|
||||||
page += 1
|
|
||||||
time.sleep(SLEEP_BETWEEN_PAGES)
|
time.sleep(SLEEP_BETWEEN_PAGES)
|
||||||
|
|
||||||
# ============================================================
|
# ============================================================
|
||||||
@@ -314,7 +325,6 @@ def main():
|
|||||||
print(f"Nových torrentů uloženo : {new_count}")
|
print(f"Nových torrentů uloženo : {new_count}")
|
||||||
print(f"Zkontrolováno celkem : {checked_count}")
|
print(f"Zkontrolováno celkem : {checked_count}")
|
||||||
print(f"Přeskočeno (v DB) : {skipped_count}")
|
print(f"Přeskočeno (v DB) : {skipped_count}")
|
||||||
print(f"Stránek prošlo : {page}")
|
|
||||||
print("=" * 60)
|
print("=" * 60)
|
||||||
|
|
||||||
db.close()
|
db.close()
|
||||||
|
|||||||
Reference in New Issue
Block a user