diff --git a/82 Reporting.py b/82 Reporting.py
index 5207664..abaec7b 100644
--- a/82 Reporting.py
+++ b/82 Reporting.py
@@ -61,17 +61,28 @@ def get_data():
 # ==============================
 def auto_adjust_columns(writer, df, sheet_name):
-    """Helper function to auto-widen columns in Excel"""
+    """Safe automatic column-width adjustment"""
     worksheet = writer.sheets[sheet_name]
+
     for idx, col in enumerate(df.columns):
-        max_len = max(
-            df[col].astype(str).map(len).max(),
-            len(str(col))
-        ) + 2
-        if max_len > 60: max_len = 60
+        series = df[col]
+
+        max_len = len(str(col))  # at least the header length
+
+        for val in series:
+            if val is None or (isinstance(val, float) and pd.isna(val)):
+                length = 0
+            else:
+                length = len(str(val))
+
+            if length > max_len:
+                max_len = length
+
+        max_len = min(max_len + 2, 60)
         worksheet.set_column(idx, idx, max_len)
+
 # ==============================
 # 🚀 MAIN LOGIC
 # ==============================
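For context, a minimal usage sketch of the rewritten helper, assuming pandas with the xlsxwriter engine; the file name, sheet name, and data are illustrative, and auto_adjust_columns is the function from 82 Reporting.py above:

    import pandas as pd

    df = pd.DataFrame({"title": ["short", "a much longer value"], "seeders": [10, None]})

    with pd.ExcelWriter("report.xlsx", engine="xlsxwriter") as writer:
        df.to_excel(writer, sheet_name="Report", index=False)
        # None/NaN cells count as zero width; final widths are capped at 60
        auto_adjust_columns(writer, df, "Report")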
diff --git a/90 Import all torrents from all pages.py b/90 Import all torrents from all pages.py
index 2c9cced..449383d 100644
--- a/90 Import all torrents from all pages.py
+++ b/90 Import all torrents from all pages.py
@@ -96,135 +96,124 @@ def close_popup_if_any():
 # ============================================================
-# 5) Parse one torrent row
+# 5) Parse one torrent row (MODIFIED)
 # ============================================================
+
 def parse_row(cells):
-    # Column 0: Category icon/text
+    # --- 1. INITIALIZE ---
+    torrent_hash = None
+    download_url = None
     category = cells[0].text.strip()
 
     try:
-        # Column 1: Download icon link
+        # --- 2. EXTRACT DOWNLOAD URL (Column 1) ---
         download_a = cells[1].find_element(By.TAG_NAME, "a")
-        download_link = download_a.get_attribute("href")
-    except:
-        return None
+        download_url = download_a.get_attribute("href")
 
-    parsed_dl = urlparse.urlparse(download_link)
-    dl_query = urlparse.parse_qs(parsed_dl.query)
-    torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]
+        parsed_dl = urlparse.urlparse(download_url)
+        dl_query = urlparse.parse_qs(parsed_dl.query)
+        torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]
 
-    # Column 2: Name and info
-    title_links = cells[2].find_elements(By.TAG_NAME, "a")
-    if not title_links:
-        return None
+        # --- 3. EXTRACT DETAILS & HASH (Column 2) ---
+        title_links = cells[2].find_elements(By.TAG_NAME, "a")
+        if not title_links:
+            return None
 
-    a_tag = title_links[0]
-    visible_name = a_tag.text.strip()
-    full_title = a_tag.get_attribute("title")
-    details_link = a_tag.get_attribute("href")
+        a_tag = title_links[0]
+        visible_name = a_tag.text.strip()
+        full_title = a_tag.get_attribute("title")
+        details_link = a_tag.get_attribute("href")
 
-    parsed = urlparse.urlparse(details_link)
-    query = urlparse.parse_qs(parsed.query)
-    if "id" not in query:
-        return None
+        parsed = urlparse.urlparse(details_link)
+        query = urlparse.parse_qs(parsed.query)
+        if "id" not in query:
+            return None
 
-    torrent_hash = query["id"][0]
+        torrent_hash = query["id"][0]
 
-    # Use innerText for robust text extraction
-    text_block = cells[2].get_attribute("innerText")
-    text_block_clean = " ".join(text_block.split())
+        # --- 4. EXTRACT SIZE & DATE ---
+        text_block = cells[2].get_attribute("innerText")
+        text_block_clean = " ".join(text_block.split())
+        size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE)
+        added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE)
+        size_pretty = size_match.group(1) if size_match else None
+        added_pretty = added_match.group(1) if added_match else None
 
-    # Regex for Size and Date
-    size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", text_block_clean, re.IGNORECASE)
-    added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", text_block_clean, re.IGNORECASE)
+        added_mysql = None
+        if added_pretty:
+            clean = added_pretty.replace(" o ", " ").strip()
+            parts = clean.split(" ")
+            if len(parts) >= 2:
+                date_part, time_part = parts[0], parts[1]
+                if len(time_part.split(":")) == 2: time_part += ":00"
+                try:
+                    d, m, y = date_part.split("/")
+                    added_mysql = f"{y}-{m}-{d} {time_part}"
+                except: pass
 
-    size_pretty = size_match.group(1) if size_match else None
-    added_pretty = added_match.group(1) if added_match else None
-
-    # Date conversion: "29/11/2025 o 02:29" -> MySQL format
-    added_mysql = None
-    if added_pretty:
-        clean = added_pretty.replace(" o ", " ").strip()
-        parts = clean.split(" ")
-        if len(parts) >= 2:
-            date_part = parts[0]
-            time_part = parts[1]
-            if len(time_part.split(":")) == 2:
-                time_part += ":00"
-            try:
-                day, month, year = date_part.split("/")
-                added_mysql = f"{year}-{month}-{day} {time_part}"
-            except:
-                added_mysql = None
-
-    # Column 2: Image preview (if exists)
-    img_link = None
-    try:
-        image_a = cells[2].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
-        mouseover = image_a.get_attribute("onmouseover")
-        img_match = re.search(r"src=([^ ]+)", mouseover)
-        if img_match:
-            img_link = img_match.group(1).replace("'", "").strip()
-            if img_link.startswith("//"):
-                img_link = "https:" + img_link
-    except:
-        pass
-
-    # Column 4: Seeders
-    seeders_a = cells[4].find_element(By.TAG_NAME, "a")
-    seeders_number = int(seeders_a.text.strip())
-    seeders_link = seeders_a.get_attribute("href")
-
-    # Column 5: Leechers
-    leechers_a = cells[5].find_element(By.TAG_NAME, "a")
-    leechers_number = int(leechers_a.text.strip())
-    leechers_link = leechers_a.get_attribute("href")
-
-    # Check database for existing binary content
-    cursor.execute("SELECT torrent_content FROM torrents WHERE torrent_hash=%s", (torrent_hash,))
-    row = cursor.fetchone()
-    already_have_torrent = row is not None and row[0] is not None
-
-    torrent_content = None
-    if not already_have_torrent:
-        time.sleep(3)  # Politeness delay
+        # --- 5. IMAGE & STATS ---
+        img_link = None
         try:
-            resp = requests_session.get(download_link)
-            resp.raise_for_status()
-            torrent_content = resp.content
-        except:
-            torrent_content = None
+            image_a = cells[2].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
+            mouseover = image_a.get_attribute("onmouseover")
+            img_match = re.search(r"src=([^ ]+)", mouseover)
+            if img_match:
+                img_link = img_match.group(1).replace("'", "").strip()
+                if img_link.startswith("//"): img_link = "https:" + img_link
+        except: pass
 
-    return {
-        "torrent_hash": torrent_hash,
-        "details_link": details_link,
-        "category": category,
-        "title_visible": visible_name,
-        "title_full": full_title,
-        "size_pretty": size_pretty,
-        "added_datetime": added_mysql,
-        "preview_image": img_link,
-        "seeders": seeders_number,
-        "seeders_link": seeders_link,
-        "leechers": leechers_number,
-        "leechers_link": leechers_link,
-        "torrent_filename": torrent_filename,
-        "torrent_content": torrent_content if not already_have_torrent else None,
-        "is_new_torrent": not already_have_torrent,
-    }
+        seeders_number = int(cells[4].find_element(By.TAG_NAME, "a").text.strip())
+        seeders_link = cells[4].find_element(By.TAG_NAME, "a").get_attribute("href")
+        leechers_number = int(cells[5].find_element(By.TAG_NAME, "a").text.strip())
+        leechers_link = cells[5].find_element(By.TAG_NAME, "a").get_attribute("href")
+
+        # --- 6. DATABASE CHECK & DOWNLOAD ---
+        cursor.execute("SELECT torrent_content FROM torrents WHERE torrent_hash=%s", (torrent_hash,))
+        db_row = cursor.fetchone()
+        already_have_torrent = db_row is not None and db_row[0] is not None
+        torrent_content = None
+        if not already_have_torrent:
+            time.sleep(2)
+            try:
+                resp = requests_session.get(download_url, timeout=10)
+                resp.raise_for_status()
+                torrent_content = resp.content
+            except Exception as e:
+                print(f"   ⚠️ Download failed for {visible_name}: {e}")
+
+        return {
+            "torrent_hash": torrent_hash,
+            "details_link": details_link,
+            "download_url": download_url,
+            "category": category,
+            "title_visible": visible_name,
+            "title_full": full_title,
+            "size_pretty": size_pretty,
+            "added_datetime": added_mysql,
+            "preview_image": img_link,
+            "seeders": seeders_number,
+            "seeders_link": seeders_link,
+            "leechers": leechers_number,
+            "leechers_link": leechers_link,
+            "torrent_filename": torrent_filename,
+            "torrent_content": torrent_content if not already_have_torrent else None,
+            "is_new_torrent": not already_have_torrent,
+        }
+    except Exception as e:
+        print(f"⚠️ parse_row logic failed: {e}")
+        return None
 
 # ============================================================
-# 6) INSERT SQL
+# 6) INSERT SQL (MODIFIED)
 # ============================================================
 insert_sql = """
 INSERT INTO torrents (
-    torrent_hash, details_link, category, title_visible, title_full,
+    torrent_hash, details_link, download_url, category, title_visible, title_full,
     size_pretty, added_datetime, preview_image,
     seeders, seeders_link, leechers, leechers_link,
     torrent_filename, torrent_content
 ) VALUES (
-    %(torrent_hash)s, %(details_link)s, %(category)s, %(title_visible)s, %(title_full)s,
+    %(torrent_hash)s, %(details_link)s, %(download_url)s, %(category)s, %(title_visible)s, %(title_full)s,
     %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
     %(seeders)s, %(seeders_link)s,
     %(leechers)s, %(leechers_link)s,
     %(torrent_filename)s, %(torrent_content)s
@@ -232,9 +221,12 @@ INSERT INTO torrents (
 ON DUPLICATE KEY UPDATE
     seeders = VALUES(seeders),
     leechers = VALUES(leechers),
+    download_url = VALUES(download_url),
     torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
 """
-
+# Note: COALESCE(VALUES(torrent_content), torrent_content)
+# keeps the old value if the new one is NULL,
+# but updates it if the old one was NULL and the new one is binary.
 
 # ============================================================
 # 7) PROCESS ALL PAGES
 # ============================================================
@@ -250,17 +242,27 @@ for page_num in range(0, TOTAL_PAGES):
 
     # Find table rows
     rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
-    # v1 table usually has 7 cells for a data row
-    real_rows = [r.find_elements(By.TAG_NAME, "td") for r in rows if len(r.find_elements(By.TAG_NAME, "td")) == 7]
+
+    # FILTER: only keep rows that have 7 columns AND a link in the 2nd column (index 1).
+    # This automatically discards header rows and empty spacer rows.
+    real_rows = []
+    for r in rows:
+        cells = r.find_elements(By.TAG_NAME, "td")
+        if len(cells) == 7 and cells[1].find_elements(By.TAG_NAME, "a"):
+            real_rows.append(cells)
 
     if not real_rows:
         print("⚠️ No data rows found on this page. Ending loop.")
         break
 
     page_new_items = 0
+
     for cells in real_rows:
         try:
             data = parse_row(cells)
         except Exception as e:
             print(f"⚠️ parse_row failed: {e}")
             continue
@@ -279,10 +281,10 @@ for page_num in range(0, TOTAL_PAGES):
 
         cursor.execute(insert_sql, data)
 
-    # If an entire page is old news, we can stop the deep crawl
-    if page_new_items == 0 and page_num > 0:
-        print("🛑 Page contained only known items. Sync complete.")
-        break
+    # # If an entire page is old news, we can stop the deep crawl
+    # if page_new_items == 0 and page_num > 0:
+    #     print("🛑 Page contained only known items. Sync complete.")
+    #     break
 
     time.sleep(1)
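To pin down the note above, a minimal Python model of the same upsert rule; the function and values are illustrative only:

    def upsert_content(old_content, new_content):
        # mirrors COALESCE(VALUES(torrent_content), torrent_content)
        return new_content if new_content is not None else old_content

    assert upsert_content(b"old", None) == b"old"    # re-scrape without a download keeps the blob
    assert upsert_content(None, b"new") == b"new"    # a previously missing blob gets filled in
    assert upsert_content(b"old", b"new") == b"new"  # a fresh download wins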
diff --git a/91 5threaddownloader.py b/91 5threaddownloader.py
new file mode 100644
index 0000000..c7bcf68
--- /dev/null
+++ b/91 5threaddownloader.py
@@ -0,0 +1,292 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import pymysql
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.options import Options
+import time
+import re
+import urllib.parse as urlparse
+from pathlib import Path
+import json
+import requests
+import datetime
+import sys
+import threading
+from concurrent.futures import ThreadPoolExecutor
+
+# Ensure this file exists in your directory
+from EmailMessagingGraph import send_mail
+
+# ============================================================
+# CONFIGURATION
+# ============================================================
+TOTAL_PAGES = 226
+THREADS = 5
+COOKIE_FILE = Path("sktorrent_cookies.json")
+
+# Database settings
+DB_CONFIG = {
+    "host": "192.168.1.50",
+    "port": 3306,
+    "user": "root",
+    "password": "Vlado9674+",
+    "database": "torrents",
+    "charset": "utf8mb4",
+    "autocommit": True,
+}
+
+BASE_URL = (
+    "https://sktorrent.eu/torrent/torrents.php"
+    "?active=0&category=24&order=data&by=DESC&zaner=&jazyk="
+)
+
+# Global counters for reporting (thread-safe lock needed)
+stats_lock = threading.Lock()
+stats = {
+    "processed": 0,
+    "new": 0,
+    "existing": 0,
+    "new_titles": []
+}
+
+
+# ============================================================
+# 1) WORKER FUNCTION (runs inside each thread)
+# ============================================================
+def process_page_chunk(page_indices, thread_id):
+    """
+    This function creates its OWN browser and OWN database connection.
+    It processes the specific list of page numbers assigned to it.
+    """
+    print(f"🧵 [Thread-{thread_id}] Starting. Assigned {len(page_indices)} pages.")
+
+    # --- A. Set up an independent DB connection ---
+    try:
+        db = pymysql.connect(**DB_CONFIG)
+        cursor = db.cursor()
+    except Exception as e:
+        print(f"❌ [Thread-{thread_id}] DB connection failed: {e}")
+        return
+
+    # --- B. Set up an independent Selenium driver ---
+    chrome_options = Options()
+    # HEADLESS MODE is safer for 5 threads, to avoid popping up 5 windows
+    chrome_options.add_argument("--headless=new")
+    chrome_options.add_argument("--disable-notifications")
+    chrome_options.add_argument("--disable-popup-blocking")
+    chrome_options.add_argument("--disable-extensions")
+    chrome_options.add_argument("--log-level=3")  # Reduce noise
+
+    driver = webdriver.Chrome(options=chrome_options)
+    driver.set_window_size(1350, 1000)
+
+    # --- C. Login / cookies ---
+    driver.get("https://sktorrent.eu")
+    if COOKIE_FILE.exists():
+        with open(COOKIE_FILE, "r", encoding="utf-8") as f:
+            cookies = json.load(f)
+        for c in cookies:
+            driver.add_cookie(c)
+
+    # --- D. Requests session ---
+    requests_session = requests.Session()
+    for ck in driver.get_cookies():
+        requests_session.cookies.set(ck["name"], ck["value"])
+
+    # --- E. Helper: parse one row (local scope) ---
+    def parse_row(cells):
+        try:
+            category = cells[0].text.strip()
+
+            # Download URL
+            download_a = cells[1].find_element(By.TAG_NAME, "a")
+            download_url = download_a.get_attribute("href")
+
+            parsed_dl = urlparse.urlparse(download_url)
+            dl_query = urlparse.parse_qs(parsed_dl.query)
+            torrent_filename = dl_query.get("f", ["unknown.torrent"])[0]
+
+            # Details & hash
+            title_links = cells[2].find_elements(By.TAG_NAME, "a")
+            if not title_links: return None
+            a_tag = title_links[0]
+            visible_name = a_tag.text.strip()
+            full_title = a_tag.get_attribute("title")
+            details_link = a_tag.get_attribute("href")
+
+            parsed = urlparse.urlparse(details_link)
+            query = urlparse.parse_qs(parsed.query)
+            if "id" not in query: return None
+            torrent_hash = query["id"][0]
+
+            # Size & date
+            text_block = cells[2].get_attribute("innerText")
+            clean_text = " ".join(text_block.split())
+            size_match = re.search(r"Velkost ([0-9\.]+ ?[KMG]B)", clean_text, re.IGNORECASE)
+            added_match = re.search(r"Pridany (.+?)(?:\sObrázok|$)", clean_text, re.IGNORECASE)
+            size_pretty = size_match.group(1) if size_match else None
+
+            added_mysql = None
+            if added_match:
+                clean = added_match.group(1).replace(" o ", " ").strip()
+                parts = clean.split(" ")
+                if len(parts) >= 2:
+                    try:
+                        d, m, y = parts[0].split("/")
+                        t = parts[1] + ":00" if len(parts[1].split(":")) == 2 else parts[1]
+                        added_mysql = f"{y}-{m}-{d} {t}"
+                    except ValueError:
+                        pass
+
+            # Image
+            img_link = None
+            try:
+                img_a = cells[2].find_element(By.XPATH, ".//a[contains(text(),'Obrázok')]")
+                img_src = re.search(r"src=([^ ]+)", img_a.get_attribute("onmouseover"))
+                if img_src:
+                    img_link = img_src.group(1).replace("'", "").strip()
+                    if img_link.startswith("//"): img_link = "https:" + img_link
+            except:
+                pass
+
+            # Stats
+            seeders = int(cells[4].find_element(By.TAG_NAME, "a").text.strip())
+            seeders_link = cells[4].find_element(By.TAG_NAME, "a").get_attribute("href")
+            leechers = int(cells[5].find_element(By.TAG_NAME, "a").text.strip())
+            leechers_link = cells[5].find_element(By.TAG_NAME, "a").get_attribute("href")
+
+            # Check the DB
+            cursor.execute("SELECT torrent_content FROM torrents WHERE torrent_hash=%s", (torrent_hash,))
+            row = cursor.fetchone()
+            already_have_file = row is not None and row[0] is not None
+
+            content = None
+            if not already_have_file:
+                # Politeness sleep only if downloading
+                time.sleep(1)
+                try:
+                    r = requests_session.get(download_url, timeout=10)
+                    r.raise_for_status()
+                    content = r.content
+                except:
+                    pass
+
+            return {
+                "torrent_hash": torrent_hash, "details_link": details_link, "download_url": download_url,
+                "category": category, "title_visible": visible_name, "title_full": full_title,
+                "size_pretty": size_pretty, "added_datetime": added_mysql, "preview_image": img_link,
+                "seeders": seeders, "seeders_link": seeders_link, "leechers": leechers, "leechers_link": leechers_link,
+                "torrent_filename": torrent_filename, "torrent_content": content,
+                "is_new_torrent": not already_have_file
+            }
+        except Exception:
+            return None
+
+    # --- F. Loop through the assigned pages ---
+    for page_num in page_indices:
+        url = f"{BASE_URL}&page={page_num}"
+        print(f"   🔄 [Thread-{thread_id}] Scraping page {page_num}")
+
+        try:
+            driver.get(url)
+            # Close the popup (simplified JS)
+            driver.execute_script("try { interstitialBox.closeit(); } catch(e) {}")
+
+            # Row filtering
+            rows = driver.find_elements(By.CSS_SELECTOR, "table tr")
+            real_rows = []
+            for r in rows:
+                cs = r.find_elements(By.TAG_NAME, "td")
+                if len(cs) == 7 and cs[1].find_elements(By.TAG_NAME, "a"):
+                    real_rows.append(cs)
+
+            if not real_rows:
+                print(f"   ⚠️ [Thread-{thread_id}] Page {page_num} empty.")
+                continue
+
+            # Process the rows
+            for cells in real_rows:
+                data = parse_row(cells)
+                if not data: continue
+
+                # Update the global stats safely
+                with stats_lock:
+                    stats["processed"] += 1
+                    if data["is_new_torrent"]:
+                        stats["new"] += 1
+                        stats["new_titles"].append(data["title_visible"])
+                    else:
+                        stats["existing"] += 1
+
+                # Insert SQL
+                sql = """
+                INSERT INTO torrents (
+                    torrent_hash, details_link, download_url, category, title_visible, title_full,
+                    size_pretty, added_datetime, preview_image,
+                    seeders, seeders_link, leechers, leechers_link,
+                    torrent_filename, torrent_content
+                ) VALUES (
+                    %(torrent_hash)s, %(details_link)s, %(download_url)s, %(category)s, %(title_visible)s, %(title_full)s,
+                    %(size_pretty)s, %(added_datetime)s, %(preview_image)s,
+                    %(seeders)s, %(seeders_link)s, %(leechers)s, %(leechers_link)s,
+                    %(torrent_filename)s, %(torrent_content)s
+                )
+                ON DUPLICATE KEY UPDATE
+                    seeders = VALUES(seeders),
+                    leechers = VALUES(leechers),
+                    download_url = VALUES(download_url),
+                    torrent_content = COALESCE(VALUES(torrent_content), torrent_content);
+                """
+                cursor.execute(sql, data)
+
+        except Exception as e:
+            print(f"   💥 [Thread-{thread_id}] Error on page {page_num}: {e}")
+
+    # Cleanup
+    driver.quit()
+    db.close()
+    print(f"🏁 [Thread-{thread_id}] Finished assigned pages.")
+
+
+# ============================================================
+# 2) MAIN EXECUTION
+# ============================================================
+if __name__ == "__main__":
+    RUN_START = datetime.datetime.now()
+    print(f"🚀 Starting multithreaded scraper with {THREADS} threads...")
+
+    # 1. Distribute pages among the threads
+    # Example: with 226 pages and 5 threads, each gets ~46 pages
+    all_pages = list(range(TOTAL_PAGES))
+    chunk_size = len(all_pages) // THREADS + 1
+    chunks = [all_pages[i:i + chunk_size] for i in range(0, len(all_pages), chunk_size)]
+
+    # 2. Start the threads
+    with ThreadPoolExecutor(max_workers=THREADS) as executor:
+        futures = []
+        for i, page_chunk in enumerate(chunks):
+            if page_chunk:  # Only start if the chunk is not empty
+                futures.append(executor.submit(process_page_chunk, page_chunk, i + 1))
+
+        # Wait for all of them to finish
+        for f in futures:
+            f.result()
+
+    # 3. Final report
+    RUN_END = datetime.datetime.now()
+    print("\n✅ All threads completed.")
+
+    body = (
+        f"Run started: {RUN_START:%Y-%m-%d %H:%M:%S}\n"
+        f"Run finished: {RUN_END:%Y-%m-%d %H:%M:%S}\n\n"
+        f"Processed torrents: {stats['processed']}\n"
+        f"New torrents saved: {stats['new']}\n"
+        f"Existing torrents updated: {stats['existing']}\n"
+    )
+    if stats["new_titles"]:
+        body += "\nNew torrents list:\n- " + "\n- ".join(stats["new_titles"])
+
+    send_mail(to="vladimir.buzalka@buzalka.cz", subject="SKTorrent Multi-Thread Run", body=body, html=False)
+    print("📧 Email report sent.")
\ No newline at end of file
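A quick worked example of the page distribution above; this is plain arithmetic using this file's constants:

    all_pages = list(range(226))          # TOTAL_PAGES = 226
    chunk_size = len(all_pages) // 5 + 1  # THREADS = 5 -> chunk_size = 46
    chunks = [all_pages[i:i + chunk_size] for i in range(0, len(all_pages), chunk_size)]
    print([len(c) for c in chunks])       # [46, 46, 46, 46, 42] -> one chunk per thread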
diff --git a/92 5threaddownloadtorrentfiles.py b/92 5threaddownloadtorrentfiles.py
new file mode 100644
index 0000000..16270dc
--- /dev/null
+++ b/92 5threaddownloadtorrentfiles.py
@@ -0,0 +1,212 @@
+import pymysql
+import requests
+import json
+import time
+import random
+import os
+import re
+from pathlib import Path
+from concurrent.futures import ThreadPoolExecutor
+from threading import Lock
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+
+# ============================================================
+# CONFIGURATION
+# ============================================================
+DB_CONFIG = {
+    "host": "192.168.1.50",
+    "port": 3306,
+    "user": "root",
+    "password": "Vlado9674+",
+    "database": "torrents",
+    "charset": "utf8mb4",
+    "autocommit": True,
+}
+
+COOKIE_FILE = Path("sktorrent_cookies.json")
+BACKUP_DIR = "saved_torrents"  # Directory for the local backup
+THREADS = 5  # Number of threads
+
+# Global lock for console output so the lines don't get interleaved
+print_lock = Lock()
+stats = {"fixed": 0, "failed": 0, "saved_to_disk": 0}
+
+
+# ============================================================
+# HELPER FUNCTIONS
+# ============================================================
+
+def sanitize_filename(name):
+    """Strips disallowed characters from a filename"""
+    # Allow only letters, digits, dots, hyphens, and spaces
+    clean = re.sub(r'[^\w\s\.-]', '', name)
+    return clean.strip()[:100]  # Truncate to 100 characters to be safe
+
+
+def ensure_backup_dir():
+    """Creates the torrent directory if it does not exist"""
+    if not os.path.exists(BACKUP_DIR):
+        os.makedirs(BACKUP_DIR)
+        print(f"📁 Created backup directory: {os.path.abspath(BACKUP_DIR)}")
+
+
+def get_browser_identity():
+    """
+    Launches Selenium (Chrome) JUST ONCE to obtain a valid
+    User-Agent and fresh cookies for the worker threads.
+    """
+    print("🤖 Starting Selenium to capture the browser identity...")
+
+    opts = Options()
+    opts.add_argument("--headless=new")
+    opts.add_argument("--disable-gpu")
+
+    driver = webdriver.Chrome(options=opts)
+
+    # Visit the site to set the cookie domain
+    driver.get("https://sktorrent.eu")
+
+    # Load cookies from the file
+    if COOKIE_FILE.exists():
+        with open(COOKIE_FILE, "r", encoding="utf-8") as f:
+            cookies_list = json.load(f)
+        for c in cookies_list:
+            driver.add_cookie(c)
+        driver.refresh()
+        time.sleep(2)
+
+    # Export the identity
+    user_agent = driver.execute_script("return navigator.userAgent;")
+    browser_cookies = driver.get_cookies()
+
+    driver.quit()
+    print("✅ Identity captured.")
+    return user_agent, browser_cookies
+
+
+# ============================================================
+# WORKER (worker thread)
+# ============================================================
+def worker_task(rows_chunk, thread_id, user_agent, cookies_list):
+    """
+    This function runs separately in each thread.
+    """
+    # 1. Create a dedicated Session for this thread
+    session = requests.Session()
+    session.headers.update({"User-Agent": user_agent})
+    for c in cookies_list:
+        session.cookies.set(c['name'], c['value'])
+
+    # 2. Dedicated DB connection (required for thread safety)
+    try:
+        db = pymysql.connect(**DB_CONFIG)
+        cursor = db.cursor()
+    except Exception as e:
+        with print_lock:
+            print(f"❌ [Thread-{thread_id}] DB connection error: {e}")
+        return
+
+    for row in rows_chunk:
+        t_hash, url, title = row
+
+        # Protection: a short random pause so 5 threads don't kill the server
+        time.sleep(random.uniform(0.5, 2.0))
+
+        try:
+            # Download
+            resp = session.get(url, timeout=15)
+
+            if resp.status_code == 403:
+                with print_lock:
+                    print(f"⛔ [Thread-{thread_id}] 403 Forbidden! {title[:20]}...")
+                    stats["failed"] += 1
+                continue
+
+            resp.raise_for_status()
+            content = resp.content
+
+            if len(content) > 100:
+                # A) Save to the DB (BLOB)
+                sql = "UPDATE torrents SET torrent_content = %s WHERE torrent_hash = %s"
+                cursor.execute(sql, (content, t_hash))
+
+                # B) Save to DISK (file)
+                clean_name = sanitize_filename(title)
+                # Append a piece of the hash so files with identical names don't overwrite each other
+                filename = f"{clean_name}_{t_hash[:6]}.torrent"
+                file_path = os.path.join(BACKUP_DIR, filename)
+
+                with open(file_path, "wb") as f:
+                    f.write(content)
+
+                with print_lock:
+                    print(f"✅ [Thread-{thread_id}] OK: {clean_name}")
+                    stats["fixed"] += 1
+                    stats["saved_to_disk"] += 1
+            else:
+                with print_lock:
+                    print(f"⚠️ [Thread-{thread_id}] Empty file: {title}")
+                    stats["failed"] += 1
+
+        except Exception as e:
+            with print_lock:
+                print(f"❌ [Thread-{thread_id}] Error: {title[:20]}... -> {e}")
+                stats["failed"] += 1
+
+    db.close()
+    with print_lock:
+        print(f"🏁 [Thread-{thread_id}] Finished.")
+
+
+# ============================================================
+# MAIN LOOP
+# ============================================================
+if __name__ == "__main__":
+    ensure_backup_dir()
+
+    # 1. Fetch the work list from the DB
+    print("🔍 Loading the list of missing files from the DB...")
+    main_db = pymysql.connect(**DB_CONFIG)
+    with main_db.cursor() as c:
+        # Select rows that have a URL but no content
+        c.execute(
+            "SELECT torrent_hash, download_url, title_visible FROM torrents WHERE torrent_content IS NULL AND download_url IS NOT NULL")
+        all_rows = c.fetchall()
+    main_db.close()
+
+    total = len(all_rows)
+    print(f"📋 To fix: {total} items.")
+
+    if total == 0:
+        print("🎉 Nothing to fix.")
+        exit()
+
+    # 2. Get the browser identity via Selenium (only once)
+    u_agent, browser_cookies = get_browser_identity()
+
+    # 3. Split the work among the 5 threads
+    chunk_size = total // THREADS + 1
+    chunks = [all_rows[i:i + chunk_size] for i in range(0, total, chunk_size)]
+
+    print(f"🚀 Starting {THREADS} threads (saving to the DB + to the '{BACKUP_DIR}' folder)...")
+
+    # 4. Run the thread pool
+    with ThreadPoolExecutor(max_workers=THREADS) as executor:
+        futures = []
+        for i, chunk in enumerate(chunks):
+            if chunk:
+                # Hand each thread its chunk of work plus the browser identity
+                futures.append(executor.submit(worker_task, chunk, i + 1, u_agent, browser_cookies))
+
+        # Wait for completion
+        for f in futures:
+            f.result()
+
+    print("\n" + "=" * 40)
+    print(f"🏁 DONE")
+    print(f"✅ Fixed in the DB: {stats['fixed']}")
+    print(f"💾 Saved to disk: {stats['saved_to_disk']}")
+    print(f"❌ Errors: {stats['failed']}")
+    print(f"📁 Files are in: {os.path.abspath(BACKUP_DIR)}")
+    print("=" * 40)
\ No newline at end of file
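For illustration, what sanitize_filename keeps and drops; the input titles are made up:

    # ':', '/', and '*' are stripped; letters, digits, spaces, dots, and hyphens survive
    assert sanitize_filename("Movie: Title/2024 *HD*") == "Movie Title2024 HD"
    assert sanitize_filename("  plain-name.v2  ") == "plain-name.v2"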
diff --git a/93 Final SingleThreaded Cleanup.py b/93 Final SingleThreaded Cleanup.py
new file mode 100644
index 0000000..6bdf972
--- /dev/null
+++ b/93 Final SingleThreaded Cleanup.py
@@ -0,0 +1,133 @@
+import pymysql
+import requests
+import json
+import time
+import random
+import os
+import re
+from pathlib import Path
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+
+# ============================================================
+# CONFIGURATION
+# ============================================================
+DB_CONFIG = {
+    "host": "192.168.1.50",
+    "port": 3306,
+    "user": "root",
+    "password": "Vlado9674+",
+    "database": "torrents",
+    "charset": "utf8mb4",
+    "autocommit": True,
+}
+
+COOKIE_FILE = Path("sktorrent_cookies.json")
+BACKUP_DIR = "saved_torrents"
+
+
+# ============================================================
+# HELPER FUNCTIONS
+# ============================================================
+def sanitize_filename(name):
+    clean = re.sub(r'[^\w\s\.-]', '', name)
+    return clean.strip()[:100]
+
+
+def get_browser_identity():
+    print("🤖 Starting Selenium (single-thread mode)...")
+    opts = Options()
+    opts.add_argument("--headless=new")
+    opts.add_argument("--disable-gpu")
+    driver = webdriver.Chrome(options=opts)
+    driver.get("https://sktorrent.eu")
+
+    if COOKIE_FILE.exists():
+        with open(COOKIE_FILE, "r", encoding="utf-8") as f:
+            cookies_list = json.load(f)
+        for c in cookies_list:
+            driver.add_cookie(c)
+        driver.refresh()
+        time.sleep(2)
+
+    user_agent = driver.execute_script("return navigator.userAgent;")
+    browser_cookies = driver.get_cookies()
+    driver.quit()
+    return user_agent, browser_cookies
+
+
+# ============================================================
+# MAIN
+# ============================================================
+if __name__ == "__main__":
+    if not os.path.exists(BACKUP_DIR):
+        os.makedirs(BACKUP_DIR)
+
+    # 1. Load the remaining failures
+    db = pymysql.connect(**DB_CONFIG)
+    cursor = db.cursor()
+    cursor.execute(
+        "SELECT torrent_hash, download_url, title_visible FROM torrents WHERE torrent_content IS NULL AND download_url IS NOT NULL")
+    rows = cursor.fetchall()
+
+    print(f"📋 Remaining to fix: {len(rows)} items.")
+    if not rows:
+        print("🎉 Done! Everything is downloaded.")
+        exit()
+
+    # 2. Get the browser identity
+    ua, cookies = get_browser_identity()
+
+    session = requests.Session()
+    session.headers.update({"User-Agent": ua})
+    for c in cookies:
+        session.cookies.set(c['name'], c['value'])
+
+    # 3. Slow loop (single thread)
+    success = 0
+    dead_links = 0
+
+    print("🚀 Starting the gentle cleanup pass...")
+
+    for i, row in enumerate(rows):
+        t_hash, url, title = row
+        print(f"[{i + 1}/{len(rows)}] {title[:50]}...", end=" ")
+
+        try:
+            # A longer pause for stability
+            time.sleep(random.uniform(1.5, 3.0))
+
+            resp = session.get(url, timeout=20)  # longer timeout
+
+            if resp.status_code == 404:
+                print("❌ 404 Not Found (the file no longer exists on the server)")
+                dead_links += 1
+                continue
+
+            if resp.status_code != 200:
+                print(f"❌ Error {resp.status_code}")
+                continue
+
+            content = resp.content
+            if len(content) > 100:
+                # DB
+                cursor.execute("UPDATE torrents SET torrent_content = %s WHERE torrent_hash = %s", (content, t_hash))
+
+                # Disk
+                fname = f"{sanitize_filename(title)}_{t_hash[:6]}.torrent"
+                with open(os.path.join(BACKUP_DIR, fname), "wb") as f:
+                    f.write(content)
+
+                print("✅ OK")
+                success += 1
+            else:
+                print("⚠️ Empty file")
+
+        except Exception as e:
+            print(f"❌ Failed: {e}")
+
+    db.close()
+    print("\n" + "=" * 30)
+    print(f"🏁 FINAL: Fixed {success} of {len(rows)}")
+    if dead_links > 0:
+        print(f"💀 Dead links (404): {dead_links} (these can no longer be fixed)")
\ No newline at end of file
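The `len(content) > 100` test above treats any short response as garbage. A slightly stronger check, offered here as a suggestion rather than part of the script, exploits the fact that every valid .torrent file is a bencoded dictionary and therefore starts with the byte `d`, while HTML error pages start with `<`:

    def looks_like_torrent(content: bytes) -> bool:
        # bencoded dicts start with b'd'; this also rejects long HTML error pages
        return len(content) > 100 and content.startswith(b"d")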
diff --git a/94 WhatWehaveAlreadyDownloaded.py b/94 WhatWehaveAlreadyDownloaded.py
new file mode 100644
index 0000000..17b7234
--- /dev/null
+++ b/94 WhatWehaveAlreadyDownloaded.py
@@ -0,0 +1,158 @@
+import pymysql
+import bencodepy
+import os
+from pathlib import Path
+
+# ============================================================
+# CONFIGURATION
+# ============================================================
+# Your network path (use a raw string r"..." for backslashes)
+# PHYSICAL_DIR = Path(r"\\tower\torrents\downloads")
+PHYSICAL_DIR = Path(r"\\tower1\#Colddata\Porno")
+
+DB_CONFIG = {
+    "host": "192.168.1.50",
+    "port": 3306,
+    "user": "root",
+    "password": "Vlado9674+",
+    "database": "torrents",
+    "charset": "utf8mb4",
+    "autocommit": True,
+}
+
+
+# ============================================================
+# HELPER FUNCTIONS
+# ============================================================
+def decode_bytes(b):
+    """
+    Decodes bytes from Bencode into a string.
+    Tries UTF-8 first, then common fallbacks.
+    """
+    if isinstance(b, str): return b
+    encodings = ['utf-8', 'windows-1250', 'latin-1', 'cp1252']
+    for enc in encodings:
+        try:
+            return b.decode(enc)
+        except:
+            continue
+    return b.decode('utf-8', errors='ignore')
+
+
+def check_torrent_in_filesystem(torrent_blob, root_path):
+    """
+    Parses the binary BLOB, calculates the expected paths,
+    and checks whether they exist under root_path.
+    """
+    try:
+        # Decode the binary BLOB
+        data = bencodepy.decode(torrent_blob)
+        info = data.get(b'info')
+        if not info: return False
+
+        # Get the name of the root file/folder defined in the torrent
+        name = decode_bytes(info.get(b'name'))
+
+        # Calculate the expected location
+        target_path = root_path / name
+
+        # 1. Check that the main path exists
+        if not target_path.exists():
+            return False
+
+        # 2. Size verification (basic)
+        # If it's a single file
+        if b'files' not in info:
+            expected_size = info[b'length']
+            real_size = target_path.stat().st_size
+            # Allow a small variance (4 KB); filesystems sometimes differ slightly
+            if abs(real_size - expected_size) < 4096:
+                return True
+            return False
+
+        # If it's a multi-file torrent (folder)
+        else:
+            # If the folder exists, we assume it's mostly good,
+            # but check at least one file inside to be sure it's not empty.
+            files = info[b'files']
+            if not files: return True  # An empty folder torrent? Rare, but possible.
+
+            # Check the first file in the list
+            first_file_path = target_path.joinpath(*[decode_bytes(p) for p in files[0][b'path']])
+            return first_file_path.exists()
+
+    except Exception:
+        # If Bencode parsing fails or the path is malformed
+        return False
+
+
+# ============================================================
+# MAIN EXECUTION
+# ============================================================
+if __name__ == "__main__":
+    if not PHYSICAL_DIR.exists():
+        print(f"❌ ERROR: Cannot access path: {PHYSICAL_DIR}")
+        print("Make sure the drive is mapped or the network path is accessible.")
+        exit()
+
+    print(f"📂 Scanning storage: {PHYSICAL_DIR}")
+    print("🚀 Connecting to the database...")
+
+    db = pymysql.connect(**DB_CONFIG)
+    cursor = db.cursor()
+
+    # 1. Get all torrents that have content (BLOB).
+    # We select only the hash, title, and content to keep memory usage reasonable.
+    cursor.execute(
+        "SELECT torrent_hash, title_visible, torrent_content FROM torrents WHERE torrent_content IS NOT NULL")
+
+    rows = cursor.fetchall()
+    total = len(rows)
+    print(f"📋 Analysing {total} torrents from the database against the files on disk...")
+
+    found_count = 0
+    missing_count = 0
+
+    # 2. Iterate and check
+    updates = []  # Store matching hashes for a batched update later
+
+    for index, row in enumerate(rows):
+        t_hash, title, blob = row
+
+        is_downloaded = check_torrent_in_filesystem(blob, PHYSICAL_DIR)
+
+        if is_downloaded:
+            found_count += 1
+            updates.append(t_hash)
+            # Optionally print each found title:
+            # print(f"✅ Found: {title[:50]}")
+        else:
+            missing_count += 1
+
+        if index % 100 == 0:
+            print(f"   Processed {index}/{total} ... (Found: {found_count})")
+
+    # 3. Batch-update the database
+    print(f"\n💾 Updating database: marking {len(updates)} torrents as 'physical_exists = 1'...")
+
+    # Reset everything to 0 first (in case you deleted files since the last run)
+    cursor.execute("UPDATE torrents SET physical_exists = 0")
+
+    if updates:
+        # Update in chunks of 1000 to be safe
+        chunk_size = 1000
+        for i in range(0, len(updates), chunk_size):
+            chunk = updates[i:i + chunk_size]
+            format_strings = ','.join(['%s'] * len(chunk))
+            cursor.execute(f"UPDATE torrents SET physical_exists = 1 WHERE torrent_hash IN ({format_strings})",
+                           tuple(chunk))
+        db.commit()
+
+    db.close()
+
+    print("\n" + "=" * 40)
+    print(f"🏁 SCAN COMPLETE")
+    print(f"✅ Physically available: {found_count}")
+    print(f"❌ Missing / not downloaded: {missing_count}")
+    print(f"📊 Completion rate: {int((found_count / total) * 100) if total else 0}%")
+    print("=" * 40)
\ No newline at end of file
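To make the torrent layout that check_torrent_in_filesystem relies on concrete, here is a tiny self-contained sketch with hand-built info dictionaries; the names, sizes, and the /downloads root are invented:

    import bencodepy
    from pathlib import Path

    root = Path("/downloads")  # hypothetical library root

    # Single-file torrent: 'info' holds 'name' and 'length'
    single = bencodepy.encode({b"info": {b"name": b"movie.mkv", b"length": 123}})
    info = bencodepy.decode(single)[b"info"]
    print(root / info[b"name"].decode())  # /downloads/movie.mkv

    # Multi-file torrent: 'name' is the folder, and each file carries a 'path' list
    multi = bencodepy.encode(
        {b"info": {b"name": b"Pack", b"files": [{b"path": [b"cd1", b"a.mkv"], b"length": 1}]}}
    )
    info = bencodepy.decode(multi)[b"info"]
    first = (root / info[b"name"].decode()).joinpath(
        *(p.decode() for p in info[b"files"][0][b"path"])
    )
    print(first)  # /downloads/Pack/cd1/a.mkv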
diff --git a/Seedbox/10 Nahraniexistujicich.py b/Seedbox/10 Nahraniexistujicich.py
new file mode 100644
index 0000000..5a3a088
--- /dev/null
+++ b/Seedbox/10 Nahraniexistujicich.py
@@ -0,0 +1,150 @@
+import pymysql
+import re
+import time
+import qbittorrentapi
+
+# ============================================================
+# CONFIGURATION
+# ============================================================
+MAX_SIZE_GB = 950
+QBT_URL = "https://vladob.zen.usbx.me/qbittorrent"
+QBT_USER = "vladob"
+QBT_PASS = "jCni3U6d#y4bfcm"
+
+DB_CONFIG = {
+    "host": "192.168.1.50",
+    "port": 3306,
+    "user": "root",
+    "password": "Vlado9674+",
+    "database": "torrents",
+    "charset": "utf8mb4",
+    "autocommit": True,
+}
+
+
+# ============================================================
+# HELPER FUNCTIONS
+# ============================================================
+def parse_size_to_gb(size_str):
+    """Converts text like '1.5 GB' or '500 MB' to a float in GB"""
+    if not size_str: return 0.0
+    s = str(size_str).upper().replace(",", ".").strip()
+    match = re.search(r"([\d\.]+)", s)
+    if not match: return 0.0
+    val = float(match.group(1))
+
+    if "TB" in s: return val * 1024
+    if "GB" in s: return val
+    if "MB" in s: return val / 1024
+    if "KB" in s: return val / 1024 / 1024
+    return 0.0
+
+
+# ============================================================
+# MAIN LOGIC
+# ============================================================
+def main():
+    print(f"🚀 Planning a direct upload from the DB (limit: {MAX_SIZE_GB} GB, sorted by seeders)...")
+
+    # 1. Load data from the DB.
+    # We also fetch the BLOB (torrent_content), so this can take a moment.
+    db = pymysql.connect(**DB_CONFIG)
+    cursor = db.cursor()
+
+    print("⏳ Loading data from MySQL...")
+    sql = """
+        SELECT torrent_hash, title_visible, size_pretty, seeders, torrent_content
+        FROM torrents
+        WHERE physical_exists = 0 AND torrent_content IS NOT NULL
+        ORDER BY seeders DESC
+    """
+    cursor.execute(sql)
+    rows = cursor.fetchall()
+    db.close()
+
+    print(f"🔍 Found {len(rows)} candidates. Selecting the best ones...")
+
+    # 2. Select up to the MAX_SIZE_GB capacity
+    selected_torrents = []
+    total_size_gb = 0.0
+
+    for row in rows:
+        t_hash, title, size_str, seeders, content = row
+        size_gb = parse_size_to_gb(size_str)
+
+        # Safeguard against parsing errors: skip rows whose size
+        # could not be parsed at all, so they don't bypass the limit
+        if size_gb == 0.0:
+            continue
+
+        # Capacity check
+        if total_size_gb + size_gb > MAX_SIZE_GB:
+            # As soon as something doesn't fit, stop selecting (rows are already sorted by priority)
+            print(f"🛑 Limit reached! '{title}' ({size_gb:.2f} GB) would exceed {MAX_SIZE_GB} GB.")
+            break
+
+        selected_torrents.append({
+            "filename": f"{t_hash}.torrent",  # virtual file name
+            "content": content,  # binary data
+            "title": title,
+            "size": size_gb,
+            "seeders": seeders
+        })
+        total_size_gb += size_gb
+
+    # 3. Report
+    print("-" * 40)
+    print(f"📦 Selected: {len(selected_torrents)} torrents")
+    print(f"💾 Total size: {total_size_gb:.2f} GB / {MAX_SIZE_GB} GB")
+    if selected_torrents:
+        avg_seeders = sum(t['seeders'] for t in selected_torrents) / len(selected_torrents)
+        print(f"⚡ Average seeders: {avg_seeders:.1f}")
+    print("-" * 40)
+
+    if not selected_torrents:
+        print("Nothing to upload.")
+        return
+
+    confirm = input("❓ Upload this selection to the seedbox? (yes/no): ")
+    if confirm.lower() not in ['ano', 'y', 'yes']:
+        print("❌ Cancelled.")
+        return
+
+    # 4. Connect to qBittorrent
+    try:
+        qbt = qbittorrentapi.Client(
+            host=QBT_URL,
+            username=QBT_USER,
+            password=QBT_PASS,
+            VERIFY_WEBUI_CERTIFICATE=False
+        )
+        qbt.auth_log_in()
+        print("✅ Connected to the seedbox.")
+    except Exception as e:
+        print(f"❌ Connection error: {e}")
+        return
+
+    # 5. Send the data
+    print("🚀 Sending...")
+    success_count = 0
+
+    for i, item in enumerate(selected_torrents):
+        try:
+            # Send the binary data directly (as if we were uploading a file)
+            # format: {'file_name.torrent': b'binary_data...'}
+            file_dict = {item['filename']: item['content']}
+
+            qbt.torrents_add(torrent_files=file_dict, is_paused=False)
+
+            print(f"[{i + 1}/{len(selected_torrents)}] 📤 {item['title']} ({item['size']:.1f} GB)")
+            success_count += 1
+            time.sleep(0.2)  # A small pause for API stability
+
+        except Exception as e:
+            print(f"❌ Error for {item['title']}: {e}")
+
+    print("\n✅ DONE.")
+    print("The torrents are on the seedbox. Once they finish downloading, fetch them home and run the 99_Scan script...")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
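Finally, a few sanity checks for parse_size_to_gb; the expected values follow directly from the conversion rules above:

    assert parse_size_to_gb("1.5 GB") == 1.5
    assert parse_size_to_gb("500 MB") == 500 / 1024  # ~0.49 GB
    assert parse_size_to_gb("2 TB") == 2 * 1024      # 2048.0 GB
    assert parse_size_to_gb("1,2 GB") == 1.2         # comma decimals are normalized
    assert parse_size_to_gb(None) == 0.0             # missing size counts as zero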