355 lines
11 KiB
Python
355 lines
11 KiB
Python
#!/usr/bin/python3
|
||
# -*- coding: utf-8 -*-
|
||
|
||
r"""
|
||
FAST FILE HASH INDEXER – WINDOWS CLIENT (HARDCODED CONFIG)
|
||
- Mode: PHYSICAL BACKUP
|
||
- Hostname in DB = Disk Label (e.g., #HD015)
|
||
- Path in DB = Relative path (e.g., /Movies/Film.mkv)
|
||
"""
|
||
|
||
import hashlib
import os
import platform
import socket
import sys
import time

import pymysql
from blake3 import blake3
||
|
||
def get_path_hash(path_str: str) -> bytes:
    """Return the raw 16-byte MD5 digest of *path_str* (UTF-8) for a BINARY(16) column.

    MD5 is used here only as a fixed-width lookup key for paths, not for security.
    """
    # Fix: `hashlib` was referenced without ever being imported, so the first
    # call raised NameError. The import is now at the top of the file.
    return hashlib.md5(path_str.encode('utf-8')).digest()
||
# ==============================
# ⚙️ USER CONFIGURATION
# ==============================
DISK_DRIVE_LETTER = "p"  # Windows drive letter of the external disk (e.g., "E", "F", "P")
DISK_HOSTNAME = "#HD05"  # Disk label stored as host_name in the DB (e.g., "#HD015")

# 🔒 SAFETY SWITCH
# True = LIST ONLY (No DB changes). "Simulates" the run.
# False = EXECUTE (Deletes and Inserts into DB).
DRY_RUN = False

# ==============================
# TECHNICAL CONFIG
# ==============================
CHUNK_SIZE = 5 * 1024 * 1024  # 5 MB read per hashing iteration
PROGRESS_MIN_SIZE = 500 * 1024 * 1024  # 500 MB — files at/above this size print hash progress
PROGRESS_INTERVAL = 1.0  # seconds between progress report lines

# Directory names pruned from both os.walk passes (Windows system/recycle dirs).
EXCLUDED_DIRS = {"$RECYCLE.BIN", "System Volume Information", "RECYCLER", "msdownld.tmp"}

# --- File Size Limits ---
# A bound set to None disables that limit (see size_allowed()).
FILE_MIN_SIZE = 0
FILE_MAX_SIZE = 1024 * 1024*1024*1024  # 1TB

# --- DB Config ---
# NOTE(review): credentials are hardcoded in source — consider moving them to
# environment variables or a config file kept out of version control.
DB_CONFIG = {
    "host": "192.168.1.76",
    "port": 3307,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": True,  # each statement commits immediately; no explicit transactions
}

PRINT_SKIPPED = False  # Set True to see files that were already in DB

# ==============================
# SYSTEM INFO
# ==============================
REAL_PC_HOSTNAME = socket.gethostname()  # PC actually running the scan (logging only)
OS_NAME = platform.system()  # stored in the os_name DB column


# ==============================
# FUNCTIONS
# ==============================
||
|
||
def compute_blake3(path: str) -> bytes:
    """Hash the file at *path* with BLAKE3, reading it in CHUNK_SIZE pieces.

    Files of at least PROGRESS_MIN_SIZE bytes print a progress line roughly
    every PROGRESS_INTERVAL seconds, plus a final summary line.
    Returns the raw digest bytes; logs and re-raises any error.
    """
    hasher = blake3()
    total_size = os.path.getsize(path)
    report_progress = total_size >= PROGRESS_MIN_SIZE

    done = 0
    started = time.time()
    last_tick = started

    try:
        with open(path, "rb") as fh:
            while chunk := fh.read(CHUNK_SIZE):
                hasher.update(chunk)
                done += len(chunk)

                if not report_progress:
                    continue
                now = time.time()
                if now - last_tick < PROGRESS_INTERVAL:
                    continue

                elapsed = now - started
                speed = done / elapsed if elapsed > 0 else 0
                percent = done / total_size * 100
                remaining = total_size - done
                eta = remaining / speed if speed > 0 else 0

                print(
                    f"   ⏳ {percent:6.2f}% | "
                    f"{done / 1024 / 1024:8.1f} / {total_size / 1024 / 1024:.1f} MB | "
                    f"{speed / 1024 / 1024:6.1f} MB/s | "
                    f"ETA {time.strftime('%H:%M:%S', time.gmtime(eta))}",
                    flush=True
                )
                last_tick = now

        if report_progress:
            total_time = time.time() - started
            avg_speed = total_size / total_time if total_time > 0 else 0
            print(
                f"   ✅ DONE | "
                f"{total_size / 1024 / 1024:.1f} MB | "
                f"avg {avg_speed / 1024 / 1024:.1f} MB/s | "
                f"time {time.strftime('%H:%M:%S', time.gmtime(total_time))}",
                flush=True
            )

        return hasher.digest()

    except Exception as e:
        print(f"⚠️ HASH ERROR: {path} - {e}")
        raise
|
||
|
||
|
||
def size_allowed(size: int) -> bool:
    """Return True when *size* is within the configured FILE_MIN_SIZE/FILE_MAX_SIZE bounds.

    A bound set to None disables that side of the check.
    """
    above_min = FILE_MIN_SIZE is None or size >= FILE_MIN_SIZE
    below_max = FILE_MAX_SIZE is None or size <= FILE_MAX_SIZE
    return above_min and below_max
|
||
|
||
|
||
def normalize_db_path(scan_root, disk_path):
    r"""Convert a physical path under *scan_root* to the standardized DB form.

    E:\Movies\File.mkv -> /Movies/File.mkv
    Returns None when no relative path can be computed (e.g. different drives).
    """
    try:
        relative = os.path.relpath(disk_path, scan_root)
    except ValueError:
        # On Windows, relpath raises ValueError for paths on different drives.
        return None

    # Backslashes to forward slashes, then guarantee a leading slash.
    normalized = relative.replace("\\", "/")
    return normalized if normalized.startswith("/") else "/" + normalized
|
||
|
||
|
||
# ==============================
|
||
# MAIN
|
||
# ==============================
|
||
|
||
def main():
    """Synchronize one external disk's inventory with the file_md5_index table.

    Phase 1 removes DB rows whose files no longer exist on the disk.
    Phase 2 BLAKE3-hashes and upserts files that are new or whose
    size/mtime changed. DRY_RUN=True only reports what would happen.
    """
    print("🚀 BLAKE3 External Disk Indexer", flush=True)
    print(f"🖥 Running on PC: {REAL_PC_HOSTNAME}", flush=True)

    if DRY_RUN:
        print("🛡️ DRY RUN MODE ACTIVE: No changes will be made to DB.", flush=True)
    else:
        print("⚠️ LIVE MODE: Changes WILL be committed to DB.", flush=True)

    # Build root path (e.g. "p" -> "p:\")
    scan_root = f"{DISK_DRIVE_LETTER}:\\"

    if not os.path.isdir(scan_root):
        print(f"❌ ERROR: Drive '{scan_root}' not found!")
        print(f"   Please check DISK_DRIVE_LETTER in config.")
        return

    print(f"✅ Config:")
    print(f"   Source (Win) : {scan_root}")
    print(f"   DB Hostname : {DISK_HOSTNAME}")

    try:
        db = pymysql.connect(**DB_CONFIG)
        cur = db.cursor()
    except Exception as e:
        print(f"❌ DB Connection failed: {e}")
        return

    # Fix: wrap the whole scan in try/finally so the cursor and connection are
    # released even when an unexpected exception escapes mid-scan (previously
    # they were only closed on the happy path).
    try:
        print(f"📥 Loading DB index for: '{DISK_HOSTNAME}'...", flush=True)

        # === LOAD EXISTING DB RECORDS ===
        cur.execute("""
            SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
            FROM file_md5_index
            WHERE host_name = %s
        """, (DISK_HOSTNAME,))

        # path -> (size, mtime); drives both cleanup and change detection.
        indexed_map = {row[0]: (row[1], row[2]) for row in cur.fetchall()}

        print(f"✅ Found {len(indexed_map):,} files in DB for this disk.", flush=True)

        # =========================================================
        # PHASE 1: CLEANUP (DELETE MISSING FILES)
        # =========================================================
        print("======================================", flush=True)
        print("🧹 PHASE 1: Checking for deleted files...", flush=True)

        current_disk_paths = set()

        # Fast walk just to collect paths — no stat or hashing yet.
        for root, dirs, files in os.walk(scan_root):
            dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]

            for fname in files:
                disk_path = os.path.join(root, fname)
                clean_path = normalize_db_path(scan_root, disk_path)
                if clean_path:
                    current_disk_paths.add(clean_path)

        paths_to_delete = set(indexed_map.keys()) - current_disk_paths

        if paths_to_delete:
            print(f"🗑️ Found {len(paths_to_delete):,} files to delete from DB.")

            if DRY_RUN:
                print("🛡️ [DRY RUN] Listing files to be deleted (No action taken):")
                for p in sorted(list(paths_to_delete))[:20]:  # Print first 20
                    print(f"   - {p}")
                if len(paths_to_delete) > 20:
                    print(f"   ... and {len(paths_to_delete) - 20} more.")
            else:
                # Delete in batches so each statement's IN() list stays bounded.
                batch_size = 1000
                to_delete_list = list(paths_to_delete)

                for i in range(0, len(to_delete_list), batch_size):
                    batch = to_delete_list[i: i + batch_size]
                    format_strings = ','.join(['%s'] * len(batch))

                    query = f"DELETE FROM file_md5_index WHERE host_name = %s AND full_path IN ({format_strings})"

                    try:
                        cur.execute(query, [DISK_HOSTNAME] + batch)
                        print(f"   ... deleted batch {i}-{i + len(batch)}")
                    except Exception as e:
                        print(f"❌ Error deleting batch: {e}")

                # Update local map so Phase 2 treats these paths as absent.
                for p in paths_to_delete:
                    del indexed_map[p]
                print("✅ Cleanup complete.")
        else:
            print("✅ No deleted files detected.")

        # =========================================================
        # PHASE 2: SCAN & UPDATE (HASHING)
        # =========================================================
        print("======================================", flush=True)
        print("🚀 PHASE 2: Scanning for changes & new files...", flush=True)

        new_files = 0
        skipped = 0
        filtered = 0
        errors = 0
        seen_paths = set()

        for root, dirs, files in os.walk(scan_root):
            dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]

            for fname in files:
                disk_path = os.path.join(root, fname)

                try:
                    stat = os.stat(disk_path)
                except OSError:
                    errors += 1
                    continue

                size = stat.st_size
                if not size_allowed(size):
                    filtered += 1
                    continue

                clean_path = normalize_db_path(scan_root, disk_path)
                if not clean_path:
                    errors += 1
                    continue

                # Guard against the same normalized path appearing twice in one walk.
                if clean_path in seen_paths:
                    continue
                seen_paths.add(clean_path)

                mtime = int(stat.st_mtime)

                # === MATCH CHECK ===
                # Unchanged size AND mtime -> assume content unchanged, skip hashing.
                is_match = False
                if clean_path in indexed_map:
                    db_size, db_mtime = indexed_map[clean_path]
                    if size == db_size and mtime == db_mtime:
                        is_match = True

                if is_match:
                    skipped += 1
                    if PRINT_SKIPPED:
                        print(f"⏭ SKIP {clean_path}", flush=True)
                    continue

                # === INSERT / UPDATE ===
                print("➕ NEW / UPDATED", flush=True)
                print(f"   File: {clean_path}", flush=True)
                print(f"   Size: {size:,} B", flush=True)

                try:
                    b3 = compute_blake3(disk_path)
                except Exception:
                    # compute_blake3 already printed the error; count and move on.
                    errors += 1
                    continue

                if DRY_RUN:
                    print(f"🛡️ [DRY RUN] Would INSERT/UPDATE: {clean_path}")
                    print(f"   Hash: {b3.hex()}")
                    new_files += 1
                else:
                    cur.execute("""
                        INSERT INTO file_md5_index
                            (os_name, host_name, full_path, file_name, directory,
                             file_size, mtime, blake3)
                        VALUES (%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
                        ON DUPLICATE KEY UPDATE
                            file_size = VALUES(file_size),
                            mtime = VALUES(mtime),
                            blake3 = VALUES(blake3),
                            updated_at = CURRENT_TIMESTAMP
                    """, (
                        OS_NAME,
                        DISK_HOSTNAME,
                        clean_path,
                        fname,
                        os.path.dirname(clean_path),
                        size,
                        mtime,
                        b3,
                    ))
                    new_files += 1
                    print(f"   Hash: {b3.hex()}", flush=True)

                print("--------------------------------------", flush=True)

        print("======================================", flush=True)
        print(f"✅ Processed : {new_files}")
        print(f"⏭ Skipped : {skipped}")
        print(f"🗑 Deleted : {len(paths_to_delete)} " + ("(DRY RUN)" if DRY_RUN else ""))
        print(f"⚠️ Errors : {errors}")
        print("🏁 Done.")
    finally:
        cur.close()
        db.close()
|
||
|
||
|
||
# Script entry point: run the indexer only when executed directly, not on import.
if __name__ == "__main__":
    main()