#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
FAST MD5 indexer with in-memory cache
- prints every processed file
- skips unchanged files instantly
"""

import os
import hashlib
from datetime import datetime

import pymysql

# ==============================
# CONFIG
# ==============================
ROOT_DIR = r"\\tower1\#ColdData\porno"

# SECURITY NOTE(review): database credentials are hardcoded in source.
# Prefer loading them from environment variables or a config file that is
# excluded from version control.
DB_CONFIG = {
    "host": "192.168.1.76",
    "port": 3307,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": True,
}

CHUNK_SIZE = 1024 * 1024  # 1 MB read size per hashing iteration
PRINT_SKIPPED = True  # set False if too noisy

# ==============================
# HELPERS
# ==============================
def compute_md5(path: str) -> str:
    """Return the hex MD5 digest of the file at *path*.

    Reads the file in CHUNK_SIZE pieces so arbitrarily large files are
    hashed with constant memory. MD5 is used here only as a fast content
    fingerprint for change detection, not for security.
    """
    h = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(CHUNK_SIZE), b""):
            h.update(chunk)
    return h.hexdigest()


def format_size(size: float) -> str:
    """Return *size* (bytes) as a human-readable string, e.g. '1.5 MB'."""
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        if size < 1024:
            return f"{size:.1f} {unit}"
        size /= 1024
    # Anything >= 1024 TB falls through as petabytes.
    return f"{size:.1f} PB"


# ==============================
# MAIN
# ==============================
def main():
    """Walk ROOT_DIR and upsert an MD5 row for every new or changed file.

    Loads all (path, size, mtime) triples already in the index into a set
    so unchanged files are skipped without re-reading them. The connection
    and cursor are closed even if the walk raises.
    """
    db = pymysql.connect(**DB_CONFIG)
    try:
        cur = db.cursor()
        try:
            print("📥 Loading already indexed files into memory...")
            cur.execute("""
                SELECT full_path, file_size, mtime
                FROM file_md5_index
            """)
            indexed = {(row[0], row[1], row[2]) for row in cur.fetchall()}

            print(f"✅ Loaded {len(indexed):,} indexed entries")
            print("======================================")

            new_files = 0
            skipped = 0

            for root, _, files in os.walk(ROOT_DIR):
                for fname in files:
                    full_path = os.path.join(root, fname)
                    try:
                        stat = os.stat(full_path)
                    except OSError:
                        # File vanished or is unreadable — skip it.
                        # (FileNotFoundError is a subclass of OSError, so a
                        # single except clause covers both.)
                        continue

                    # Compute the mtime datetime once; it is used for both
                    # the cache key and the INSERT below.
                    mtime = datetime.fromtimestamp(stat.st_mtime)
                    key = (full_path, stat.st_size, mtime)

                    # FAST PATH: unchanged file already in the index.
                    # NOTE(review): if the `mtime` column is DATETIME without
                    # fractional seconds, sub-second precision is lost on
                    # insert and this key will never match what SELECT
                    # returns — confirm the schema uses DATETIME(6) or
                    # truncate microseconds before comparing.
                    if key in indexed:
                        skipped += 1
                        if PRINT_SKIPPED:
                            print("⏭ SKIP")
                            print(f" File: {full_path}")
                        continue

                    print("➕ NEW / UPDATED")
                    print(f" Size: {format_size(stat.st_size)}")
                    print(f" File: {full_path}")

                    try:
                        md5 = compute_md5(full_path)
                    except Exception as e:
                        # Best-effort: report the failure and keep walking.
                        print(f"❌ MD5 failed: {e}")
                        continue

                    cur.execute("""
                        INSERT INTO file_md5_index
                            (full_path, file_name, directory, file_size, mtime, md5)
                        VALUES (%s, %s, %s, %s, %s, %s)
                        ON DUPLICATE KEY UPDATE
                            file_size=VALUES(file_size),
                            mtime=VALUES(mtime),
                            md5=VALUES(md5),
                            updated_at=CURRENT_TIMESTAMP
                    """, (
                        full_path,
                        fname,
                        root,
                        stat.st_size,
                        mtime,
                        md5,
                    ))
                    new_files += 1
                    print(f" MD5 : {md5}")
                    print("--------------------------------------")

            print("======================================")
            print(f"✅ New / updated : {new_files}")
            print(f"⏭ Skipped : {skipped}")
            print("======================================")
        finally:
            cur.close()
    finally:
        db.close()


if __name__ == "__main__":
    main()