#!/usr/bin/python3
# -*- coding: utf-8 -*-
r"""
FAST FILE HASH INDEXER WINDOWS CLIENT (HARDCODED CONFIG)
- Mode: PHYSICAL BACKUP
- Hostname in DB = Disk Label (e.g., #HD015)
- Path in DB = Relative path (e.g., /Movies/Film.mkv)
"""
import os
import time
import pymysql
import socket
import platform
import hashlib
from blake3 import blake3
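# NOTE: pymysql and blake3 are third-party packages:
#   pip install pymysql blake3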
# ==============================
# ⚙️ USER CONFIGURATION
# ==============================
DISK_DRIVE_LETTER = "z" # (e.g., "E", "F", "P")
DISK_HOSTNAME = "TW22" # (e.g., "#HD015")
# 🔒 SAFETY SWITCH
DRY_RUN = False
# ==============================
# TECHNICAL CONFIG
# ==============================
CHUNK_SIZE = 5 * 1024 * 1024  # 5 MiB
PROGRESS_MIN_SIZE = 500 * 1024 * 1024  # 500 MiB
PROGRESS_INTERVAL = 1.0 # seconds
EXCLUDED_DIRS = {"$RECYCLE.BIN", "System Volume Information", "RECYCLER", "msdownld.tmp"}
# --- File Size Limits ---
FILE_MIN_SIZE = 0
FILE_MAX_SIZE = 1024 * 1024 * 1024 * 1024  # 1 TiB
# --- DB Config ---
DB_CONFIG = {
    "host": "192.168.1.50",
    "port": 3306,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": True,
}
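# The SQL below assumes a file_md5_index table roughly shaped like this
# sketch. It is inferred from the INSERT/DELETE statements in main(), not
# taken from real DDL; the column types and the unique key are assumptions:
#
#   CREATE TABLE file_md5_index (
#       os_name    VARCHAR(32),
#       host_name  VARCHAR(64),
#       full_path  TEXT,
#       path_hash  BINARY(16),    -- MD5 of full_path (see get_path_hash)
#       file_name  VARCHAR(255),
#       directory  TEXT,
#       file_size  BIGINT UNSIGNED,
#       mtime      DATETIME,
#       blake3     BINARY(32),    -- raw BLAKE3 digest (32 bytes by default)
#       updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
#       UNIQUE KEY uq_host_path (host_name, path_hash)
#   );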
PRINT_SKIPPED = False # Set True to see files that were already in DB
# ==============================
# SYSTEM INFO
# ==============================
REAL_PC_HOSTNAME = socket.gethostname()
OS_NAME = platform.system()
# ==============================
# FUNCTIONS
# ==============================
def get_path_hash(path_str: str) -> bytes:
    """Calculates MD5 hash of the path and returns raw 16 bytes for BINARY(16)."""
    return hashlib.md5(path_str.encode('utf-8')).digest()
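
# Usage sketch (path taken from the module docstring): an MD5 digest is
# always 16 raw bytes, matching a BINARY(16) column.
#   assert len(get_path_hash("/Movies/Film.mkv")) == 16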

def compute_blake3(path: str) -> bytes:
    """Streams the file through BLAKE3 in CHUNK_SIZE reads and returns the
    raw 32-byte digest, printing progress for files above PROGRESS_MIN_SIZE."""
    h = blake3()
    total_size = os.path.getsize(path)
    show_progress = total_size >= PROGRESS_MIN_SIZE
    processed = 0
    start_time = time.time()
    last_report = start_time
    try:
        with open(path, "rb") as f:
            while True:
                chunk = f.read(CHUNK_SIZE)
                if not chunk:
                    break
                h.update(chunk)
                processed += len(chunk)
                if show_progress:
                    now = time.time()
                    if now - last_report >= PROGRESS_INTERVAL:
                        elapsed = now - start_time
                        speed = processed / elapsed if elapsed > 0 else 0
                        percent = processed / total_size * 100
                        remaining = total_size - processed
                        eta = remaining / speed if speed > 0 else 0
                        print(
                            f"{percent:6.2f}% | "
                            f"{processed / 1024 / 1024:8.1f} / {total_size / 1024 / 1024:.1f} MB | "
                            f"{speed / 1024 / 1024:6.1f} MB/s | "
                            f"ETA {time.strftime('%H:%M:%S', time.gmtime(eta))}",
                            flush=True
                        )
                        last_report = now
        if show_progress:
            total_time = time.time() - start_time
            avg_speed = total_size / total_time if total_time > 0 else 0
            print(
                f" ✅ DONE | "
                f"{total_size / 1024 / 1024:.1f} MB | "
                f"avg {avg_speed / 1024 / 1024:.1f} MB/s | "
                f"time {time.strftime('%H:%M:%S', time.gmtime(total_time))}",
                flush=True
            )
        return h.digest()
    except Exception as e:
        print(f"⚠️ HASH ERROR: {path} - {e}")
        raise

def size_allowed(size: int) -> bool:
    if FILE_MIN_SIZE is not None and size < FILE_MIN_SIZE:
        return False
    if FILE_MAX_SIZE is not None and size > FILE_MAX_SIZE:
        return False
    return True

def normalize_db_path(scan_root, disk_path):
    r"""
    Converts a physical Windows path to the standardized DB format.
    E:\Movies\File.mkv -> /Movies/File.mkv
    """
    try:
        rel_path = os.path.relpath(disk_path, scan_root)
    except ValueError:
        # relpath raises ValueError on Windows when the paths are on different drives
        return None
    clean_path = rel_path.replace("\\", "/")
    if not clean_path.startswith("/"):
        clean_path = "/" + clean_path
    return clean_path
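
# Worked example of the mapping above (same paths as the docstring, on Windows):
#   normalize_db_path("E:\\", r"E:\Movies\File.mkv") -> "/Movies/File.mkv"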
# ==============================
# MAIN
# ==============================
def main():
    print("🚀 BLAKE3 External Disk Indexer (MySQL 9 Compatible)", flush=True)
    print(f"🖥 Running on PC: {REAL_PC_HOSTNAME}", flush=True)
    if DRY_RUN:
        print("🛡️ DRY RUN MODE ACTIVE: No changes will be made to DB.", flush=True)
    else:
        print("⚠️ LIVE MODE: Changes WILL be committed to DB.", flush=True)
    scan_root = f"{DISK_DRIVE_LETTER}:\\"
    if not os.path.isdir(scan_root):
        print(f"❌ ERROR: Drive '{scan_root}' not found!")
        return
    try:
        db = pymysql.connect(**DB_CONFIG)
        cur = db.cursor()
    except Exception as e:
        print(f"❌ DB Connection failed: {e}")
        return
print(f"📥 Loading DB index for: '{DISK_HOSTNAME}'...", flush=True)
# === LOAD EXISTING DB RECORDS ===
# We load path_hash as well for precise deletion
cur.execute("""
SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
FROM file_md5_index
WHERE host_name = %s
""", (DISK_HOSTNAME,))
indexed_map = {row[0]: (row[1], row[2]) for row in cur.fetchall()}
print(f"✅ Found {len(indexed_map):,} files in DB for this disk.", flush=True)
    # =========================================================
    # PHASE 1: CLEANUP (DELETE MISSING FILES)
    # =========================================================
    print("======================================", flush=True)
    print("🧹 PHASE 1: Checking for deleted files...", flush=True)
    current_disk_paths = set()
    for root, dirs, files in os.walk(scan_root):
        dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]
        for fname in files:
            disk_path = os.path.join(root, fname)
            clean_path = normalize_db_path(scan_root, disk_path)
            if clean_path:
                current_disk_paths.add(clean_path)
    paths_to_delete = set(indexed_map.keys()) - current_disk_paths
    if paths_to_delete:
        print(f"🗑️ Found {len(paths_to_delete):,} files to delete from DB.")
        if DRY_RUN:
            for p in sorted(list(paths_to_delete))[:20]:  # preview only the first 20
                print(f" - {p}")
        else:
            # Delete using path_hash for index efficiency
            batch_size = 500
            to_delete_list = list(paths_to_delete)
            for i in range(0, len(to_delete_list), batch_size):
                batch_paths = to_delete_list[i: i + batch_size]
                # Map paths to their MD5 hashes
                batch_hashes = [get_path_hash(p) for p in batch_paths]
                format_strings = ','.join(['%s'] * len(batch_hashes))
                query = f"DELETE FROM file_md5_index WHERE host_name = %s AND path_hash IN ({format_strings})"
                try:
                    cur.execute(query, [DISK_HOSTNAME] + batch_hashes)
                except Exception as e:
                    print(f"❌ Error deleting batch: {e}")
            for p in paths_to_delete:
                del indexed_map[p]
            print("✅ Cleanup complete.")
    else:
        print("✅ No deleted files detected.")
    # =========================================================
    # PHASE 2: SCAN & UPDATE (HASHING)
    # =========================================================
    print("======================================", flush=True)
    print("🚀 PHASE 2: Scanning for changes & new files...", flush=True)
    new_files = 0
    skipped = 0
    errors = 0
    seen_paths = set()
    for root, dirs, files in os.walk(scan_root):
        dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]
        for fname in files:
            disk_path = os.path.join(root, fname)
            try:
                stat = os.stat(disk_path)
            except OSError:
                errors += 1
                continue
            size = stat.st_size
            if not size_allowed(size):
                continue
            clean_path = normalize_db_path(scan_root, disk_path)
            if not clean_path or clean_path in seen_paths:
                continue
            seen_paths.add(clean_path)
            mtime = int(stat.st_mtime)
            # Match Check
            if clean_path in indexed_map:
                db_size, db_mtime = indexed_map[clean_path]
                if size == db_size and mtime == db_mtime:
                    if PRINT_SKIPPED:
                        print(f"⏭ SKIP: {clean_path}")
                    skipped += 1
                    continue
            # Compute Hashes
            try:
                b3_hash = compute_blake3(disk_path)
                p_hash = get_path_hash(clean_path)  # Essential for MySQL 9 Unique Index
            except Exception:
                errors += 1
                continue
            if DRY_RUN:
                print(f"🛡️ [DRY RUN] NEW/UPDATE: {clean_path}")
                new_files += 1
            else:
                cur.execute("""
                    INSERT INTO file_md5_index
                        (os_name, host_name, full_path, path_hash, file_name, directory,
                         file_size, mtime, blake3)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
                    ON DUPLICATE KEY UPDATE
                        file_size = VALUES(file_size),
                        mtime = VALUES(mtime),
                        blake3 = VALUES(blake3),
                        updated_at = CURRENT_TIMESTAMP
                """, (
                    OS_NAME, DISK_HOSTNAME, clean_path, p_hash, fname,
                    os.path.dirname(clean_path), size, mtime, b3_hash
                ))
                new_files += 1
                print(f" ADDED: {clean_path} | {b3_hash.hex()[:8]}...")
print("======================================", flush=True)
print(f"✅ Processed : {new_files}")
print(f"⏭ Skipped : {skipped}")
print(f"🗑 Deleted : {len(paths_to_delete)}")
print(f"⚠️ Errors : {errors}")
print("🏁 Done.")
cur.close()
db.close()

if __name__ == "__main__":
    main()