Files
walkfiles/WalkFilesOnBackupHDD/20 WalkBackupHDD.py
2026-01-17 20:23:58 +01:00

351 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/python3
# -*- coding: utf-8 -*-
r"""
FAST FILE HASH INDEXER WINDOWS CLIENT (HARDCODED CONFIG)
- Mode: PHYSICAL BACKUP
- Hostname in DB = Disk Label (e.g., #HD015)
- Path in DB = Relative path (e.g., /Movies/Film.mkv)
"""
import os, time
import pymysql
import socket
import platform
import sys
from blake3 import blake3
# ==============================
# ⚙️ USER CONFIGURATION
# ==============================
# Drive letter of the external disk to index (no colon).
DISK_DRIVE_LETTER = "f" # (e.g., "E", "F", "P")
# Logical disk label stored as host_name in the DB (not the PC's hostname).
DISK_HOSTNAME = "#HD16" # (e.g., "#HD015")
# 🔒 SAFETY SWITCH
# True = LIST ONLY (No DB changes). "Simulates" the run.
# False = EXECUTE (Deletes and Inserts into DB).
DRY_RUN = False
# ==============================
# TECHNICAL CONFIG
# ==============================
# Read granularity for hashing.
CHUNK_SIZE = 5 * 1024 * 1024 # 5 MB
# Files at least this large get live progress output while hashing.
PROGRESS_MIN_SIZE = 500 * 1024 * 1024 # 500 MB
# Minimum delay between progress lines.
PROGRESS_INTERVAL = 1.0 # seconds
# Directory names skipped during both walk phases (Windows system dirs).
EXCLUDED_DIRS = {"$RECYCLE.BIN", "System Volume Information", "RECYCLER", "msdownld.tmp"}
# --- File Size Limits ---
# Files outside [FILE_MIN_SIZE, FILE_MAX_SIZE] bytes are ignored; either
# bound may be set to None to disable it (see size_allowed()).
FILE_MIN_SIZE = 0
FILE_MAX_SIZE = 1024 * 1024*1024*1024 # 1TB
# --- DB Config ---
# NOTE(review): credentials are hardcoded in source — consider moving them
# to an environment variable or a config file outside version control.
DB_CONFIG = {
"host": "192.168.1.76",
"port": 3307,
"user": "root",
"password": "Vlado9674+",
"database": "torrents",
"charset": "utf8mb4",
"autocommit": True,
}
PRINT_SKIPPED = False # Set True to see files that were already in DB
# ==============================
# SYSTEM INFO
# ==============================
# Name of the machine actually running the scan (informational only;
# the DB host_name column gets DISK_HOSTNAME instead).
REAL_PC_HOSTNAME = socket.gethostname()
OS_NAME = platform.system()
# ==============================
# FUNCTIONS
# ==============================
def compute_blake3(path: str) -> bytes:
    """Hash the file at *path* with BLAKE3 and return the raw digest bytes.

    Files of at least PROGRESS_MIN_SIZE bytes produce a throttled progress
    line on stdout (at most once per PROGRESS_INTERVAL seconds) plus a
    final summary line. Any error is reported and re-raised to the caller.
    """
    hasher = blake3()
    file_size = os.path.getsize(path)
    report_progress = file_size >= PROGRESS_MIN_SIZE
    bytes_done = 0
    started = time.time()
    last_print = started
    try:
        with open(path, "rb") as fh:
            # Sentinel-iterator form: read fixed-size chunks until EOF.
            for chunk in iter(lambda: fh.read(CHUNK_SIZE), b""):
                hasher.update(chunk)
                bytes_done += len(chunk)
                if not report_progress:
                    continue
                now = time.time()
                # Throttle: only print once per PROGRESS_INTERVAL seconds.
                if now - last_print < PROGRESS_INTERVAL:
                    continue
                elapsed = now - started
                speed = bytes_done / elapsed if elapsed > 0 else 0
                percent = bytes_done / file_size * 100
                remaining = file_size - bytes_done
                eta = remaining / speed if speed > 0 else 0
                print(
                    f"{percent:6.2f}% | "
                    f"{bytes_done / 1024 / 1024:8.1f} / {file_size / 1024 / 1024:.1f} MB | "
                    f"{speed / 1024 / 1024:6.1f} MB/s | "
                    f"ETA {time.strftime('%H:%M:%S', time.gmtime(eta))}",
                    flush=True
                )
                last_print = now
        if report_progress:
            total_time = time.time() - started
            avg_speed = file_size / total_time if total_time > 0 else 0
            print(
                f" ✅ DONE | "
                f"{file_size / 1024 / 1024:.1f} MB | "
                f"avg {avg_speed / 1024 / 1024:.1f} MB/s | "
                f"time {time.strftime('%H:%M:%S', time.gmtime(total_time))}",
                flush=True
            )
        return hasher.digest()
    except Exception as e:
        print(f"⚠️ HASH ERROR: {path} - {e}")
        raise
def size_allowed(size: int) -> bool:
    """Return True when *size* (bytes) is within the configured limits.

    A limit set to None is treated as "no limit" on that side.
    """
    lower_ok = FILE_MIN_SIZE is None or size >= FILE_MIN_SIZE
    upper_ok = FILE_MAX_SIZE is None or size <= FILE_MAX_SIZE
    return lower_ok and upper_ok
def normalize_db_path(scan_root, disk_path):
    """Convert a physical path under *scan_root* to the standardized DB form.

    Example (Windows): E:/Movies/File.mkv under E:/ becomes /Movies/File.mkv.
    Backslashes are normalized to forward slashes and a leading slash is
    guaranteed. Returns None when a relative path cannot be computed
    (os.path.relpath raises ValueError, e.g. different drives on Windows).
    """
    try:
        relative = os.path.relpath(disk_path, scan_root)
    except ValueError:
        return None
    normalized = relative.replace("\\", "/")
    return normalized if normalized.startswith("/") else "/" + normalized
# ==============================
# MAIN
# ==============================
def main():
    """Index an external disk into the file_md5_index table.

    Phase 1 deletes DB rows for files no longer present on the disk;
    Phase 2 walks the disk again, hashing new/changed files (size or mtime
    mismatch vs the DB) and upserting them. DRY_RUN only prints what would
    happen. NOTE(review): Windows-only — the drive-letter root path assumes
    a Windows filesystem layout.
    """
    print("🚀 BLAKE3 External Disk Indexer", flush=True)
    print(f"🖥 Running on PC: {REAL_PC_HOSTNAME}", flush=True)
    if DRY_RUN:
        print("🛡️ DRY RUN MODE ACTIVE: No changes will be made to DB.", flush=True)
    else:
        print("⚠️ LIVE MODE: Changes WILL be committed to DB.", flush=True)
    # Build root path, e.g. "f:\" — abort early if the drive is absent.
    scan_root = f"{DISK_DRIVE_LETTER}:\\"
    if not os.path.isdir(scan_root):
        print(f"❌ ERROR: Drive '{scan_root}' not found!")
        print(f" Please check DISK_DRIVE_LETTER in config.")
        return
    print(f"✅ Config:")
    print(f" Source (Win) : {scan_root}")
    print(f" DB Hostname : {DISK_HOSTNAME}")
    try:
        db = pymysql.connect(**DB_CONFIG)
        cur = db.cursor()
    except Exception as e:
        print(f"❌ DB Connection failed: {e}")
        return
    print(f"📥 Loading DB index for: '{DISK_HOSTNAME}'...", flush=True)
    # === LOAD EXISTING DB RECORDS ===
    # Map of DB rows for this disk: path -> (size, unix mtime), used both
    # for the deletion diff (Phase 1) and the change check (Phase 2).
    cur.execute("""
        SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
        FROM file_md5_index
        WHERE host_name = %s
    """, (DISK_HOSTNAME,))
    indexed_map = {row[0]: (row[1], row[2]) for row in cur.fetchall()}
    print(f"✅ Found {len(indexed_map):,} files in DB for this disk.", flush=True)
    # =========================================================
    # PHASE 1: CLEANUP (DELETE MISSING FILES)
    # =========================================================
    print("======================================", flush=True)
    print("🧹 PHASE 1: Checking for deleted files...", flush=True)
    current_disk_paths = set()
    # Fast walk just to get paths (no stat/hash work here).
    for root, dirs, files in os.walk(scan_root):
        # Prune excluded system directories in place so os.walk skips them.
        dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]
        for fname in files:
            disk_path = os.path.join(root, fname)
            clean_path = normalize_db_path(scan_root, disk_path)
            if clean_path:
                current_disk_paths.add(clean_path)
    # Rows in the DB whose file no longer exists on disk.
    paths_to_delete = set(indexed_map.keys()) - current_disk_paths
    if paths_to_delete:
        print(f"🗑️ Found {len(paths_to_delete):,} files to delete from DB.")
        if DRY_RUN:
            print("🛡️ [DRY RUN] Listing files to be deleted (No action taken):")
            for p in sorted(list(paths_to_delete))[:20]: # Print first 20
                print(f" - {p}")
            if len(paths_to_delete) > 20:
                print(f" ... and {len(paths_to_delete) - 20} more.")
        else:
            # Delete in batches to keep the IN (...) list bounded.
            batch_size = 1000
            to_delete_list = list(paths_to_delete)
            for i in range(0, len(to_delete_list), batch_size):
                batch = to_delete_list[i: i + batch_size]
                # Placeholders only — values are bound by the driver, so the
                # dynamically built IN clause stays parameterized.
                format_strings = ','.join(['%s'] * len(batch))
                query = f"DELETE FROM file_md5_index WHERE host_name = %s AND full_path IN ({format_strings})"
                try:
                    cur.execute(query, [DISK_HOSTNAME] + batch)
                    print(f" ... deleted batch {i}-{i + len(batch)}")
                except Exception as e:
                    print(f"❌ Error deleting batch: {e}")
            # Update local map so Phase 2 treats these as new files.
            for p in paths_to_delete:
                del indexed_map[p]
            print("✅ Cleanup complete.")
    else:
        print("✅ No deleted files detected.")
    # =========================================================
    # PHASE 2: SCAN & UPDATE (HASHING)
    # =========================================================
    print("======================================", flush=True)
    print("🚀 PHASE 2: Scanning for changes & new files...", flush=True)
    new_files = 0
    skipped = 0
    filtered = 0  # NOTE(review): counted but never shown in the summary.
    errors = 0
    seen_paths = set()
    for root, dirs, files in os.walk(scan_root):
        dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]
        for fname in files:
            disk_path = os.path.join(root, fname)
            try:
                stat = os.stat(disk_path)
            except OSError:
                errors += 1
                continue
            size = stat.st_size
            if not size_allowed(size):
                filtered += 1
                continue
            clean_path = normalize_db_path(scan_root, disk_path)
            if not clean_path:
                errors += 1
                continue
            # Guard against normalization collisions producing duplicates.
            if clean_path in seen_paths:
                continue
            seen_paths.add(clean_path)
            mtime = int(stat.st_mtime)
            # === MATCH CHECK ===
            # A file is unchanged when both size and whole-second mtime
            # match the DB row; only then is re-hashing skipped.
            is_match = False
            if clean_path in indexed_map:
                db_size, db_mtime = indexed_map[clean_path]
                if size == db_size and mtime == db_mtime:
                    is_match = True
            if is_match:
                skipped += 1
                if PRINT_SKIPPED:
                    print(f"⏭ SKIP {clean_path}", flush=True)
                continue
            # === INSERT / UPDATE ===
            print(" NEW / UPDATED", flush=True)
            print(f" File: {clean_path}", flush=True)
            print(f" Size: {size:,} B", flush=True)
            try:
                b3 = compute_blake3(disk_path)
            except Exception:
                # compute_blake3 already printed the error; count and move on.
                errors += 1
                continue
            if DRY_RUN:
                print(f"🛡️ [DRY RUN] Would INSERT/UPDATE: {clean_path}")
                print(f" Hash: {b3.hex()}")
                new_files += 1
            else:
                # Upsert keyed (presumably) on (host_name, full_path) — the
                # ON DUPLICATE KEY branch refreshes size/mtime/hash in place.
                cur.execute("""
                    INSERT INTO file_md5_index
                    (os_name, host_name, full_path, file_name, directory,
                     file_size, mtime, blake3)
                    VALUES (%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
                    ON DUPLICATE KEY UPDATE
                        file_size = VALUES(file_size),
                        mtime = VALUES(mtime),
                        blake3 = VALUES(blake3),
                        updated_at = CURRENT_TIMESTAMP
                """, (
                    OS_NAME,
                    DISK_HOSTNAME,
                    clean_path,
                    fname,
                    os.path.dirname(clean_path),
                    size,
                    mtime,
                    b3,
                ))
                new_files += 1
                print(f" Hash: {b3.hex()}", flush=True)
            print("--------------------------------------", flush=True)
    # === SUMMARY ===
    print("======================================", flush=True)
    print(f"✅ Processed : {new_files}")
    print(f"⏭ Skipped : {skipped}")
    print(f"🗑 Deleted : {len(paths_to_delete)} " + ("(DRY RUN)" if DRY_RUN else ""))
    print(f"⚠️ Errors : {errors}")
    print("🏁 Done.")
    cur.close()
    db.close()
# Script entry point.
if __name__ == "__main__":
    main()