#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
FAST MD5 indexer with an in-memory cache
- prints every processed file
- skips unchanged files instantly
- restart-safe (already-indexed, unchanged files are never reprocessed)
"""

import os
import hashlib

import pymysql

# ==============================
# CONFIG
# ==============================

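# Directory tree to index; os.walk() below visits it recursively.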
ROOT_DIR = r"\\tower1\#ColdData\porno"

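# autocommit=True makes each upsert durable as soon as it runs,
# which is what keeps an interrupted scan restart-safe.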
DB_CONFIG = {
    "host": "192.168.1.76",
    "port": 3307,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": True,
}

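# Files are hashed in CHUNK_SIZE reads, so even very large files
# never have to fit in memory at once.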
CHUNK_SIZE = 1024 * 1024  # 1 MB
PRINT_SKIPPED = True  # set False if too noisy

# ==============================
# HELPERS
# ==============================

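# Stream the file through hashlib.md5 one chunk at a time; memory
# use stays flat regardless of file size.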
def compute_md5(path: str) -> str:
    h = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(CHUNK_SIZE), b""):
            h.update(chunk)
    return h.hexdigest()

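# Render a byte count as a human-readable string (B through PB).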
def format_size(size):
    for unit in ["B", "KB", "MB", "GB", "TB"]:
        if size < 1024:
            return f"{size:.1f} {unit}"
        size /= 1024
    return f"{size:.1f} PB"

# ==============================
# MAIN
# ==============================

def main():
    db = pymysql.connect(**DB_CONFIG)
    cur = db.cursor()

print("📥 Loading already indexed files into memory...")
|
||
|
||
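    # Preload every indexed (path, size, mtime) triple in one query;
    # checking this in-memory set replaces a per-file SELECT round trip.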
    cur.execute("""
        SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
        FROM file_md5_index
    """)

    # Depending on the column definition, UNIX_TIMESTAMP() may come back
    # as a Decimal; cast to int so the tuples match the int mtimes below.
    indexed = {
        (row[0], row[1], int(row[2]))
        for row in cur.fetchall()
    }

print(f"✅ Loaded {len(indexed):,} indexed entries")
|
||
print("======================================")
|
||
|
||
    new_files = 0
    skipped = 0

    for root, _, files in os.walk(ROOT_DIR):
        for fname in files:
            full_path = os.path.join(root, fname)

            # FileNotFoundError is a subclass of OSError, so one except
            # clause covers vanished and unreadable files alike.
            try:
                stat = os.stat(full_path)
            except OSError:
                continue

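            # st_mtime is a float; truncate to whole seconds so the value
            # round-trips cleanly through FROM_UNIXTIME/UNIX_TIMESTAMP.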
            mtime = int(stat.st_mtime)
            size = stat.st_size

            key = (full_path, size, mtime)

            # FAST PATH: an unchanged (path, size, mtime) triple means the
            # file is already indexed, so skip it without reading a byte.
            if key in indexed:
                skipped += 1
                if PRINT_SKIPPED:
                    print("⏭ SKIP")
                    print(f" File: {full_path}")
                continue

print("➕ NEW / UPDATED")
|
||
print(f" Size: {format_size(size)}")
|
||
print(f" File: {full_path}")
|
||
|
||
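            # Hashing can fail mid-scan (file deleted, permission denied);
            # log the error and move on instead of aborting the whole run.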
            try:
                md5 = compute_md5(full_path)
            except Exception as e:
                print(f"❌ MD5 failed: {e}")
                continue

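            # Upsert: this assumes the table has a unique key on full_path,
            # so a changed file updates its existing row instead of
            # inserting a duplicate.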
            cur.execute("""
                INSERT INTO file_md5_index
                    (full_path, file_name, directory, file_size, mtime, md5)
                VALUES (%s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
                ON DUPLICATE KEY UPDATE
                    file_size=VALUES(file_size),
                    mtime=VALUES(mtime),
                    md5=VALUES(md5),
                    updated_at=CURRENT_TIMESTAMP
            """, (
                full_path,
                fname,
                root,
                size,
                mtime,
                md5,
            ))

            new_files += 1
            print(f" MD5 : {md5}")
            print("--------------------------------------")

print("======================================")
|
||
print(f"✅ New / updated : {new_files}")
|
||
print(f"⏭ Skipped : {skipped}")
|
||
print("======================================")
|
||
|
||
    cur.close()
    db.close()


if __name__ == "__main__":
    main()