#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
FAST MD5 indexer with in-memory cache
- prints every processed file
- skips unchanged files instantly
"""

import hashlib
import os
from datetime import datetime

# ==============================
# CONFIG
# ==============================

ROOT_DIR = r"\\tower1\#ColdData\porno"

# SECURITY NOTE(review): the DB password was previously hard-coded here.
# It is now read from the DB_PASSWORD environment variable, with the old
# literal kept as a fallback so existing deployments keep working.
# Set DB_PASSWORD and remove the fallback when possible.
DB_CONFIG = {
    "host": "192.168.1.76",
    "port": 3307,
    "user": "root",
    "password": os.environ.get("DB_PASSWORD", "Vlado9674+"),
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": True,
}

CHUNK_SIZE = 1024 * 1024  # 1 MB read buffer for hashing
PRINT_SKIPPED = True      # set False if too noisy

# ==============================
# HELPERS
# ==============================

def compute_md5(path: str) -> str:
    """Return the hex MD5 digest of the file at *path*.

    Reads the file in CHUNK_SIZE blocks so arbitrarily large files never
    have to fit in memory.  Raises OSError if the file cannot be read.
    MD5 is used here as a content fingerprint only, not for security.
    """
    h = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(CHUNK_SIZE), b""):
            h.update(chunk)
    return h.hexdigest()

def format_size(size: float) -> str:
    """Return *size* (in bytes) as a human-readable string, e.g. '1.5 MB'."""
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        if size < 1024:
            return f"{size:.1f} {unit}"
        size /= 1024
    return f"{size:.1f} PB"

# ==============================
# MAIN
# ==============================

def main() -> None:
    """Walk ROOT_DIR and upsert (path, size, mtime, md5) rows into MySQL.

    Loads every already-indexed (full_path, file_size, mtime) triple into an
    in-memory set first; files whose triple matches are skipped without
    re-hashing (the "fast path").
    """
    # Imported here, not at module level, so this module stays importable
    # (and its pure helpers testable) on machines without the MySQL driver.
    import pymysql

    db = pymysql.connect(**DB_CONFIG)
    try:
        cur = db.cursor()

        print("📥 Loading already indexed files into memory...")

        cur.execute("""
            SELECT full_path, file_size, mtime
            FROM file_md5_index
        """)

        indexed = {
            (row[0], row[1], row[2])
            for row in cur.fetchall()
        }

        print(f"✅ Loaded {len(indexed):,} indexed entries")
        print("======================================")

        new_files = 0
        skipped = 0

        for root, _, files in os.walk(ROOT_DIR):
            for fname in files:
                full_path = os.path.join(root, fname)

                try:
                    stat = os.stat(full_path)
                except OSError:
                    # File vanished or is unreadable.  FileNotFoundError is
                    # an OSError subclass, so one clause covers both.
                    continue

                # Hoisted: used for both the cache key and the INSERT row.
                mtime = datetime.fromtimestamp(stat.st_mtime)

                key = (full_path, stat.st_size, mtime)

                # FAST PATH: unchanged file — skip without hashing.
                if key in indexed:
                    skipped += 1
                    if PRINT_SKIPPED:
                        print("⏭ SKIP")
                        print(f"   File: {full_path}")
                    continue

                print("➕ NEW / UPDATED")
                print(f"   Size: {format_size(stat.st_size)}")
                print(f"   File: {full_path}")

                try:
                    md5 = compute_md5(full_path)
                except OSError as e:
                    # Narrowed from bare Exception: only I/O failures are
                    # expected while hashing; anything else is a real bug.
                    print(f"❌ MD5 failed: {e}")
                    continue

                cur.execute("""
                    INSERT INTO file_md5_index
                        (full_path, file_name, directory, file_size, mtime, md5)
                    VALUES (%s, %s, %s, %s, %s, %s)
                    ON DUPLICATE KEY UPDATE
                        file_size=VALUES(file_size),
                        mtime=VALUES(mtime),
                        md5=VALUES(md5),
                        updated_at=CURRENT_TIMESTAMP
                """, (
                    full_path,
                    fname,
                    root,
                    stat.st_size,
                    mtime,
                    md5,
                ))

                new_files += 1
                print(f"   MD5 : {md5}")
                print("--------------------------------------")

        print("======================================")
        print(f"✅ New / updated : {new_files}")
        print(f"⏭ Skipped       : {skipped}")
        print("======================================")

        cur.close()
    finally:
        # Guarantee the connection is released even if indexing fails midway.
        db.close()


if __name__ == "__main__":
    main()