#!/usr/bin/python3
# -*- coding: utf-8 -*-

"""
FAST FILE HASH INDEXER – UNRAID (BLAKE3 ONLY, ALL SHARES)
- HARDCODED SINGLE SHARE MODE
- SQL OPTIMIZATION
- STRICT MODE (NO TOLERANCE)
- Updates DB on any mismatch

Walks the physical array disks (/mnt/disk<N>/<share>), maps every file to
its logical /mnt/user/... path, hashes new or changed files with BLAKE3 and
upserts them into the `file_md5_index` table.  Rows whose files were not
found during the scan are removed from the table afterwards.
"""

import os
import platform
import socket

import pymysql
from blake3 import blake3

# ==============================
# ENV / HOST
# ==============================

HOSTNAME = socket.gethostname()
OS_NAME = platform.system()

# Hard-coded for testing: name of the single share to scan (e.g. "#Fotky"),
# or None to scan every share that is not listed in EXCLUDED_SHARES.
SCAN_ONLY_THIS = '#Library'

# ==============================
# CONFIG
# ==============================

EXCLUDED_SHARES = {"domains", "appdata", "system", "isos"}

# --- File size limits (bytes) ---
FILE_MIN_SIZE = 0
FILE_MAX_SIZE = 1024 * 1024 * 1024 * 1024  # 1 TiB

DB_CONFIG = {
    "host": "192.168.1.76",
    "port": 3307,
    "user": "root",
    # NOTE(review): credentials should not live in source control.  The
    # environment variable takes precedence; the literal is kept only as a
    # backward-compatible default and should be removed once deployments
    # set FILE_INDEXER_DB_PASSWORD.
    "password": os.environ.get("FILE_INDEXER_DB_PASSWORD", "Vlado9674+"),
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": True,
}

CHUNK_SIZE = 4 * 1024 * 1024  # 4 MB
PRINT_SKIPPED = False

# Maximum number of paths per DELETE ... IN (...) statement during cleanup.
DELETE_BATCH_SIZE = 1000

USER_ROOT = "/mnt/user"
MNT_ROOT = "/mnt"


# ==============================
# HASH
# ==============================

def compute_blake3(path: str) -> bytes:
    """Return the raw BLAKE3 digest of the file at *path*.

    The file is read in CHUNK_SIZE pieces so memory use stays bounded.
    Raises OSError if the file cannot be opened or read.
    """
    h = blake3()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(CHUNK_SIZE), b""):
            h.update(chunk)
    return h.digest()


# ==============================
# SHARE / PATH HELPERS
# ==============================

def _is_array_disk(name: str) -> bool:
    """Return True for array disk directory names of the form ``disk<N>``.

    A plain ``startswith("disk")`` would also accept ``disks``, which is the
    mount directory used for unassigned devices and is not part of the array.
    """
    return name.startswith("disk") and name[4:].isdigit()


def get_user_shares():
    """Return the sorted list of user share names to index.

    In single-share mode (SCAN_ONLY_THIS set) the result is that one share,
    or an empty list if it does not exist under /mnt/user.  Otherwise every
    non-hidden, non-excluded directory under /mnt/user is returned.
    """
    if SCAN_ONLY_THIS:
        path = f"{USER_ROOT}/{SCAN_ONLY_THIS}"
        if os.path.isdir(path):
            print(f"🎯 SINGLE SHARE MODE ACTIVE: Scanning only '{SCAN_ONLY_THIS}'")
            return [SCAN_ONLY_THIS]
        print(f"⚠️ ERROR: Requested share '{SCAN_ONLY_THIS}' not found in /mnt/user!")
        return []

    if not os.path.exists(USER_ROOT):
        return []

    return sorted(
        name
        for name in os.listdir(USER_ROOT)
        if not name.startswith(".")
        and name not in EXCLUDED_SHARES
        and os.path.isdir(f"{USER_ROOT}/{name}")
    )


def find_physical_roots(shares):
    """Return sorted ``(share, path)`` pairs for each share on each array disk.

    Only /mnt/disk<N> directories are considered; a pair is included when
    /mnt/disk<N>/<share> exists as a directory.
    """
    if not os.path.exists(MNT_ROOT):
        return []

    roots = []
    for disk in os.listdir(MNT_ROOT):
        if not _is_array_disk(disk):
            continue
        for share in shares:
            path = f"{MNT_ROOT}/{disk}/{share}"
            if os.path.isdir(path):
                roots.append((share, path))
    return sorted(roots)


def logical_path_from_disk_path(disk_path: str) -> str:
    """Map ``/mnt/disk<N>/<rest>`` to the logical path ``/mnt/user/<rest>``.

    Raises ValueError if *disk_path* is not below an array disk directory or
    has nothing after the disk component.
    """
    # "/mnt/disk1/share/file" -> ["", "mnt", "disk1", "share/file"]
    parts = disk_path.split("/", 3)
    if (
        len(parts) != 4
        or parts[0] != ""
        or parts[1] != "mnt"
        or not _is_array_disk(parts[2])
        or not parts[3]
    ):
        raise ValueError(f"Unexpected disk path: {disk_path}")
    return f"{USER_ROOT}/{parts[3]}"


def size_allowed(size: int) -> bool:
    """Return True if *size* lies within [FILE_MIN_SIZE, FILE_MAX_SIZE].

    A limit set to None is treated as "no limit" on that side.
    """
    if FILE_MIN_SIZE is not None and size < FILE_MIN_SIZE:
        return False
    if FILE_MAX_SIZE is not None and size > FILE_MAX_SIZE:
        return False
    return True


# ==============================
# DB / SCAN HELPERS
# ==============================

def _load_indexed_map(cur):
    """Load this host's indexed files as ``{full_path: (file_size, mtime)}``.

    mtime is returned as a Unix timestamp (UNIX_TIMESTAMP(mtime)).  In
    single-share mode only rows below that share are fetched.
    """
    if SCAN_ONLY_THIS:
        prefix = f"{USER_ROOT}/{SCAN_ONLY_THIS}/"
        # The trailing "/" keeps sibling shares such as "<share>2" out.
        search_pattern = f"{prefix}%"
        print(f"⚡ OPTIMIZATION: Fetching only DB records for '{search_pattern}'", flush=True)
        cur.execute("""
            SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
            FROM file_md5_index
            WHERE host_name = %s AND full_path LIKE %s
        """, (HOSTNAME, search_pattern))
        # LIKE treats "_" and "%" inside the share name as wildcards and may
        # compare case-insensitively, so it can return a superset; the exact
        # prefix test here makes the result precise.
        return {
            row[0]: (row[1], row[2])
            for row in cur.fetchall()
            if row[0].startswith(prefix)
        }

    cur.execute("""
        SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
        FROM file_md5_index
        WHERE host_name = %s
    """, (HOSTNAME,))
    return {row[0]: (row[1], row[2]) for row in cur.fetchall()}


def _scan_and_index(cur, scan_roots, indexed_map):
    """Walk all scan roots, hashing and upserting new or changed files.

    Returns ``(seen_paths, unreadable_paths, new_files, skipped, filtered)``
    where *seen_paths* holds the logical paths that passed the size filter
    and *unreadable_paths* holds logical paths whose ``os.stat`` failed.
    """
    seen_paths = set()
    unreadable_paths = set()
    new_files = 0
    skipped = 0
    filtered = 0

    for _share, scan_root in scan_roots:
        for root, _dirs, files in os.walk(scan_root):
            for fname in files:
                disk_path = os.path.join(root, fname)

                try:
                    stat = os.stat(disk_path)
                except OSError:
                    # The entry was listed but cannot be inspected right now.
                    # Remember it so cleanup does not drop its DB row as if
                    # the file had been deleted.
                    unreadable_paths.add(logical_path_from_disk_path(disk_path))
                    continue

                size = stat.st_size
                if not size_allowed(size):
                    filtered += 1
                    continue

                logical_path = logical_path_from_disk_path(disk_path)

                # The same logical path can exist on more than one disk;
                # only the first occurrence is indexed.
                if logical_path in seen_paths:
                    continue
                seen_paths.add(logical_path)

                mtime = int(stat.st_mtime)

                # === STRICT CHECK (NO TOLERANCE) ===
                # A file is skipped only if it is in the DB with exactly the
                # same size and mtime.  Anything else (even a 1 s time shift)
                # counts as a change and is re-hashed and updated.
                # A missing entry yields None, which never equals the tuple.
                if indexed_map.get(logical_path) == (size, mtime):
                    skipped += 1
                    if PRINT_SKIPPED:
                        print(f"⏭ SKIP {logical_path}", flush=True)
                    continue

                print("➕ NEW / UPDATED", flush=True)
                print(f" File: {logical_path}", flush=True)
                print(f" Size: {size:,} B", flush=True)

                try:
                    b3 = compute_blake3(disk_path)
                except OSError as e:
                    print(f"❌ BLAKE3 failed: {e}", flush=True)
                    continue

                # Upsert: an existing row gets the size, mtime and hash
                # currently observed on disk.
                cur.execute("""
                    INSERT INTO file_md5_index
                        (os_name, host_name, full_path, file_name, directory,
                         file_size, mtime, blake3)
                    VALUES (%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
                    ON DUPLICATE KEY UPDATE
                        file_size = VALUES(file_size),
                        mtime = VALUES(mtime),
                        blake3 = VALUES(blake3),
                        updated_at = CURRENT_TIMESTAMP
                """, (
                    OS_NAME,
                    HOSTNAME,
                    logical_path,
                    fname,
                    os.path.dirname(logical_path),
                    size,
                    mtime,
                    b3,
                ))

                new_files += 1
                print(f" B3 : {b3.hex()}", flush=True)
                print("--------------------------------------", flush=True)

    return seen_paths, unreadable_paths, new_files, skipped, filtered


def _delete_paths(cur, paths):
    """Delete this host's rows for *paths* in batches of DELETE_BATCH_SIZE."""
    paths = list(paths)
    for start in range(0, len(paths), DELETE_BATCH_SIZE):
        batch = paths[start:start + DELETE_BATCH_SIZE]
        # Only the "%s" placeholders are interpolated into the SQL text; the
        # path values themselves are passed as bound parameters.
        placeholders = ",".join(["%s"] * len(batch))
        sql = f"""
            DELETE FROM file_md5_index
            WHERE host_name = %s AND full_path IN ({placeholders})
        """
        cur.execute(sql, (HOSTNAME, *batch))


def _index_and_cleanup(cur, scan_roots):
    """Run the scan phase followed by the deleted-files cleanup phase."""
    print("📥 Loading already indexed files into memory...", flush=True)
    indexed_map = _load_indexed_map(cur)
    print(f"✅ Loaded {len(indexed_map):,} indexed entries", flush=True)
    print("======================================", flush=True)

    seen_paths, unreadable_paths, new_files, skipped, filtered = _scan_and_index(
        cur, scan_roots, indexed_map
    )

    print("======================================", flush=True)
    print(f"✅ New / updated : {new_files}", flush=True)
    print(f"⏭ Skipped : {skipped}", flush=True)
    print(f"🚫 Size filtered: {filtered}", flush=True)

    # ==============================
    # DB CLEANUP – REMOVE DELETED FILES
    # ==============================
    print("🧹 Checking for deleted files in DB...", flush=True)

    # Rows that were neither seen nor merely unreadable during this scan.
    # Note that files excluded by the size filter are not in seen_paths, so
    # any rows they still have are removed here as well.
    deleted_paths = indexed_map.keys() - seen_paths - unreadable_paths

    # Restrict to the active share (if single-share mode is on).
    if SCAN_ONLY_THIS:
        prefix = f"{USER_ROOT}/{SCAN_ONLY_THIS}/"
        deleted_paths = {p for p in deleted_paths if p.startswith(prefix)}

    if deleted_paths:
        print(f"🗑 Removing {len(deleted_paths):,} deleted files from DB", flush=True)
        _delete_paths(cur, deleted_paths)
        print("✅ DB cleanup completed", flush=True)
    else:
        print("✅ No deleted files found in DB", flush=True)


# ==============================
# MAIN
# ==============================

def main():
    """Index the configured shares into the DB and prune rows of deleted files."""
    print("🚀 BLAKE3 indexer starting", flush=True)
    print(f"🖥 Host: {HOSTNAME} | OS: {OS_NAME}", flush=True)

    if FILE_MIN_SIZE or FILE_MAX_SIZE:
        print(f"📏 File size limits: min={FILE_MIN_SIZE} max={FILE_MAX_SIZE}", flush=True)

    shares = get_user_shares()
    if not shares:
        print("❌ No user shares to index!", flush=True)
        return

    print("📦 User shares to index:", flush=True)
    for s in shares:
        print(f" - {s}", flush=True)

    scan_roots = find_physical_roots(shares)
    if not scan_roots:
        print("❌ No physical disk roots found!", flush=True)
        return

    print("📂 Physical scan roots:", flush=True)
    for _, path in scan_roots:
        print(f" - {path}", flush=True)

    try:
        db = pymysql.connect(**DB_CONFIG)
        cur = db.cursor()
        # NOTE(review): the session time zone is left at the server default.
        # FROM_UNIXTIME()/UNIX_TIMESTAMP() convert using the session time
        # zone, so pinning it with "SET time_zone = '+00:00'" was considered
        # to stop one-hour shifts, but it is intentionally not executed here.
        # Confirm the mtime column type before enabling it, since it may
        # change how already stored values are interpreted.
    except Exception as e:
        print(f"❌ Database connection failed: {e}")
        return

    try:
        _index_and_cleanup(cur, scan_roots)
    finally:
        # Release DB resources even if the scan or cleanup raised.
        cur.close()
        db.close()

    print("🏁 Script finished", flush=True)


if __name__ == "__main__":
    main()