#!/usr/bin/python3
# -*- coding: utf-8 -*-
r"""
FAST FILE HASH INDEXER – WINDOWS CLIENT (HARDCODED CONFIG)
- Mode: PHYSICAL BACKUP
- Hostname in DB = Disk Label (e.g., #HD015)
- Path in DB = Relative path (e.g., /Movies/Film.mkv)
"""

import os
import time
import socket
import platform
import sys
import hashlib

# NOTE: third-party dependencies (pymysql, blake3) are imported lazily inside
# the functions that need them, so the pure helpers in this module stay
# importable (and testable) on machines without those packages installed.

# ==============================
# ⚙️ USER CONFIGURATION
# ==============================
DISK_DRIVE_LETTER = "z"  # (e.g., "E", "F", "P")
DISK_HOSTNAME = "TW22"   # (e.g., "#HD015")

# 🔒 SAFETY SWITCH
DRY_RUN = False

# ==============================
# TECHNICAL CONFIG
# ==============================
CHUNK_SIZE = 5 * 1024 * 1024           # 5 MB read size while hashing
PROGRESS_MIN_SIZE = 500 * 1024 * 1024  # 500 MB: show progress for files at least this big
PROGRESS_INTERVAL = 1.0                # seconds between progress reports

EXCLUDED_DIRS = {"$RECYCLE.BIN", "System Volume Information", "RECYCLER", "msdownld.tmp"}

# --- File Size Limits ---
FILE_MIN_SIZE = 0
FILE_MAX_SIZE = 1024 * 1024 * 1024 * 1024  # 1TB

# --- DB Config ---
# SECURITY NOTE(review): credentials are hardcoded in source. Consider loading
# them from environment variables or a config file kept out of version control.
DB_CONFIG = {
    "host": "192.168.1.50",
    "port": 3306,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": True,
}

PRINT_SKIPPED = False  # Set True to see files that were already in DB

# ==============================
# SYSTEM INFO
# ==============================
REAL_PC_HOSTNAME = socket.gethostname()
OS_NAME = platform.system()


# ==============================
# FUNCTIONS
# ==============================

def get_path_hash(path_str: str) -> bytes:
    """Calculates MD5 hash of the path and returns raw 16 bytes for BINARY(16)."""
    return hashlib.md5(path_str.encode('utf-8')).digest()


def compute_blake3(path: str) -> bytes:
    """Stream-hash the file at *path* with BLAKE3 and return the raw digest.

    Reads the file in CHUNK_SIZE pieces; for files of at least
    PROGRESS_MIN_SIZE bytes a throttled progress line (percent, speed, ETA)
    is printed every PROGRESS_INTERVAL seconds. Any I/O error is logged and
    re-raised so the caller can count it.
    """
    from blake3 import blake3  # lazy: third-party, only needed when hashing

    h = blake3()
    total_size = os.path.getsize(path)
    show_progress = total_size >= PROGRESS_MIN_SIZE
    processed = 0
    start_time = time.time()
    last_report = start_time

    try:
        with open(path, "rb") as f:
            while True:
                chunk = f.read(CHUNK_SIZE)
                if not chunk:
                    break
                h.update(chunk)
                processed += len(chunk)

                if show_progress:
                    now = time.time()
                    if now - last_report >= PROGRESS_INTERVAL:
                        elapsed = now - start_time
                        speed = processed / elapsed if elapsed > 0 else 0
                        percent = processed / total_size * 100
                        remaining = total_size - processed
                        eta = remaining / speed if speed > 0 else 0
                        print(
                            f" ⏳ {percent:6.2f}% | "
                            f"{processed / 1024 / 1024:8.1f} / {total_size / 1024 / 1024:.1f} MB | "
                            f"{speed / 1024 / 1024:6.1f} MB/s | "
                            f"ETA {time.strftime('%H:%M:%S', time.gmtime(eta))}",
                            flush=True
                        )
                        last_report = now

        if show_progress:
            total_time = time.time() - start_time
            avg_speed = total_size / total_time if total_time > 0 else 0
            print(
                f" ✅ DONE | "
                f"{total_size / 1024 / 1024:.1f} MB | "
                f"avg {avg_speed / 1024 / 1024:.1f} MB/s | "
                f"time {time.strftime('%H:%M:%S', time.gmtime(total_time))}",
                flush=True
            )

        return h.digest()
    except Exception as e:
        print(f"⚠️ HASH ERROR: {path} - {e}")
        raise


def size_allowed(size: int) -> bool:
    """Return True if *size* (bytes) falls within FILE_MIN_SIZE..FILE_MAX_SIZE."""
    if FILE_MIN_SIZE is not None and size < FILE_MIN_SIZE:
        return False
    if FILE_MAX_SIZE is not None and size > FILE_MAX_SIZE:
        return False
    return True


def normalize_db_path(scan_root, disk_path):
    r"""
    Converts a physical Windows path to the standardized DB format.
    E:\Movies\File.mkv -> /Movies/File.mkv

    Returns None if *disk_path* cannot be expressed relative to *scan_root*
    (e.g. paths on different drives).
    """
    try:
        rel_path = os.path.relpath(disk_path, scan_root)
    except ValueError:
        return None
    clean_path = rel_path.replace("\\", "/")
    if not clean_path.startswith("/"):
        clean_path = "/" + clean_path
    return clean_path


# ==============================
# MAIN (internal helpers)
# ==============================

def _connect_db():
    """Open the MySQL connection; return (db, cursor) or (None, None) on failure."""
    import pymysql  # lazy: third-party
    try:
        db = pymysql.connect(**DB_CONFIG)
        return db, db.cursor()
    except Exception as e:
        print(f"❌ DB Connection failed: {e}")
        return None, None


def _load_indexed_map(cur):
    """Load {full_path: (file_size, mtime_unix)} for this disk from the DB."""
    cur.execute("""
        SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
        FROM file_md5_index
        WHERE host_name = %s
    """, (DISK_HOSTNAME,))
    return {row[0]: (row[1], row[2]) for row in cur.fetchall()}


def _walk_files(scan_root):
    """Yield the absolute path of every file under *scan_root*, pruning EXCLUDED_DIRS."""
    for root, dirs, files in os.walk(scan_root):
        dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]
        for fname in files:
            yield os.path.join(root, fname)


def _phase1_cleanup(cur, indexed_map, scan_root):
    """Remove DB rows whose files no longer exist on disk.

    Mutates *indexed_map* (deleted entries are dropped) and returns the set
    of DB paths that were (or, in DRY_RUN, would be) deleted.
    """
    print("======================================", flush=True)
    print("🧹 PHASE 1: Checking for deleted files...", flush=True)

    current_disk_paths = set()
    for disk_path in _walk_files(scan_root):
        clean_path = normalize_db_path(scan_root, disk_path)
        if clean_path:
            current_disk_paths.add(clean_path)

    paths_to_delete = set(indexed_map.keys()) - current_disk_paths

    if not paths_to_delete:
        print("✅ No deleted files detected.")
        return paths_to_delete

    print(f"🗑️ Found {len(paths_to_delete):,} files to delete from DB.")
    if DRY_RUN:
        # Preview only the first 20 doomed paths.
        for p in sorted(list(paths_to_delete))[:20]:
            print(f" - {p}")
        return paths_to_delete

    # Delete by path_hash (computed locally via MD5) for index efficiency.
    batch_size = 500
    to_delete_list = list(paths_to_delete)
    for i in range(0, len(to_delete_list), batch_size):
        batch_paths = to_delete_list[i: i + batch_size]
        batch_hashes = [get_path_hash(p) for p in batch_paths]
        format_strings = ','.join(['%s'] * len(batch_hashes))
        query = f"DELETE FROM file_md5_index WHERE host_name = %s AND path_hash IN ({format_strings})"
        try:
            cur.execute(query, [DISK_HOSTNAME] + batch_hashes)
        except Exception as e:
            print(f"❌ Error deleting batch: {e}")

    for p in paths_to_delete:
        del indexed_map[p]
    print("✅ Cleanup complete.")
    return paths_to_delete


def _phase2_scan(cur, indexed_map, scan_root):
    """Hash new/changed files and upsert them into the DB.

    A file is skipped when its size and mtime match the DB record.
    Returns (new_files, skipped, errors).
    """
    print("======================================", flush=True)
    print("🚀 PHASE 2: Scanning for changes & new files...", flush=True)

    new_files = 0
    skipped = 0
    errors = 0
    seen_paths = set()

    for disk_path in _walk_files(scan_root):
        try:
            stat = os.stat(disk_path)
        except OSError:
            errors += 1
            continue

        size = stat.st_size
        if not size_allowed(size):
            continue

        clean_path = normalize_db_path(scan_root, disk_path)
        if not clean_path or clean_path in seen_paths:
            continue
        seen_paths.add(clean_path)
        mtime = int(stat.st_mtime)

        # Match Check: unchanged size + mtime means no re-hash needed.
        if clean_path in indexed_map:
            db_size, db_mtime = indexed_map[clean_path]
            if size == db_size and mtime == db_mtime:
                skipped += 1
                continue

        # Compute Hashes
        try:
            b3_hash = compute_blake3(disk_path)
            p_hash = get_path_hash(clean_path)  # Essential for MySQL 9 Unique Index
        except Exception:
            errors += 1
            continue

        if DRY_RUN:
            print(f"🛡️ [DRY RUN] NEW/UPDATE: {clean_path}")
            new_files += 1
        else:
            cur.execute("""
                INSERT INTO file_md5_index
                (os_name, host_name, full_path, path_hash, file_name, directory, file_size, mtime, blake3)
                VALUES (%s, %s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
                ON DUPLICATE KEY UPDATE
                    file_size = VALUES(file_size),
                    mtime = VALUES(mtime),
                    blake3 = VALUES(blake3),
                    updated_at = CURRENT_TIMESTAMP
            """, (
                OS_NAME, DISK_HOSTNAME, clean_path, p_hash,
                os.path.basename(disk_path), os.path.dirname(clean_path),
                size, mtime, b3_hash
            ))
            new_files += 1
            print(f"➕ ADDED: {clean_path} | {b3_hash.hex()[:8]}...")

    return new_files, skipped, errors


# ==============================
# MAIN
# ==============================

def main():
    """Entry point: connect to the DB, purge deleted files, index new/changed ones."""
    print("🚀 BLAKE3 External Disk Indexer (MySQL 9 Compatible)", flush=True)
    print(f"🖥 Running on PC: {REAL_PC_HOSTNAME}", flush=True)
    if DRY_RUN:
        print("🛡️ DRY RUN MODE ACTIVE: No changes will be made to DB.", flush=True)
    else:
        print("⚠️ LIVE MODE: Changes WILL be committed to DB.", flush=True)

    scan_root = f"{DISK_DRIVE_LETTER}:\\"
    if not os.path.isdir(scan_root):
        print(f"❌ ERROR: Drive '{scan_root}' not found!")
        return

    db, cur = _connect_db()
    if db is None:
        return

    print(f"📥 Loading DB index for: '{DISK_HOSTNAME}'...", flush=True)
    indexed_map = _load_indexed_map(cur)
    print(f"✅ Found {len(indexed_map):,} files in DB for this disk.", flush=True)

    # PHASE 1: CLEANUP (DELETE MISSING FILES)
    paths_to_delete = _phase1_cleanup(cur, indexed_map, scan_root)

    # PHASE 2: SCAN & UPDATE (HASHING)
    new_files, skipped, errors = _phase2_scan(cur, indexed_map, scan_root)

    print("======================================", flush=True)
    print(f"✅ Processed : {new_files}")
    print(f"⏭ Skipped : {skipped}")
    print(f"🗑 Deleted : {len(paths_to_delete)}")
    print(f"⚠️ Errors : {errors}")
    print("🏁 Done.")

    cur.close()
    db.close()


if __name__ == "__main__":
    main()