diff --git a/WalkFilesOnBackupHDD/10 WalkBackupHDD.py b/WalkFilesOnBackupHDD/10 WalkBackupHDD.py
new file mode 100644
index 0000000..f008cf7
--- /dev/null
+++ b/WalkFilesOnBackupHDD/10 WalkBackupHDD.py
@@ -0,0 +1,312 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+"""
+FAST FILE HASH INDEXER – WINDOWS CLIENT (EXTERNAL DISKS)
+- Mode: PHYSICAL BACKUP
+- Hostname in DB = Disk Label (e.g., #HD015)
+- Path in DB = Relative path (e.g., /Movies/Film.mkv)
+"""
+
+import os, time
+import pymysql
+import socket
+import platform
+import sys
+from blake3 import blake3
+
+# ==============================
+# CONFIG
+# ==============================
+CHUNK_SIZE = 4 * 1024 * 1024  # 4 MB read buffer
+PROGRESS_MIN_SIZE = 500 * 1024 * 1024  # 500 MB
+PROGRESS_INTERVAL = 1.0  # seconds
+
+EXCLUDED_DIRS = {"$RECYCLE.BIN", "System Volume Information", "RECYCLER", "msdownld.tmp"}
+# Windows directory names are case-insensitive ($Recycle.Bin vs $RECYCLE.BIN).
+_EXCLUDED_DIRS_FOLDED = {name.casefold() for name in EXCLUDED_DIRS}
+
+# --- Size limits ---
+FILE_MIN_SIZE = 0
+FILE_MAX_SIZE = 1024 * 1024 * 1024 * 1024  # 1 TB
+
+# --- Database settings ---
+DB_CONFIG = {
+    "host": "192.168.1.76",
+    "port": 3307,
+    "user": "root",
+    # NOTE(review): credentials should not live in source control; the
+    # INDEXER_DB_PASSWORD environment variable overrides the legacy default.
+    "password": os.environ.get("INDEXER_DB_PASSWORD", "Vlado9674+"),
+    "database": "torrents",
+    "charset": "utf8mb4",
+    "autocommit": True,
+}
+
+PRINT_SKIPPED = False  # True = also print skipped files
+
+# ==============================
+# SYSTEM INFO
+# ==============================
+
+# Physical PC name (console output only; the DB stores the disk label)
+REAL_PC_HOSTNAME = socket.gethostname()
+OS_NAME = platform.system()
+
+
+# ==============================
+# FUNCTIONS
+# ==============================
+
+def compute_blake3(path: str) -> bytes:
+    """
+    Return the BLAKE3 digest of the file at *path*, read in CHUNK_SIZE pieces.
+
+    Files of at least PROGRESS_MIN_SIZE bytes print a progress line about
+    every PROGRESS_INTERVAL seconds plus a final summary. Errors raised while
+    reading are printed and re-raised so the caller can count them.
+    """
+    h = blake3()
+    total_size = os.path.getsize(path)
+    show_progress = total_size >= PROGRESS_MIN_SIZE
+
+    processed = 0
+    start_time = time.time()
+    last_report = start_time
+
+    try:
+        with open(path, "rb") as f:
+            while True:
+                chunk = f.read(CHUNK_SIZE)
+                if not chunk:
+                    break
+
+                h.update(chunk)
+                processed += len(chunk)
+
+                if show_progress:
+                    now = time.time()
+                    if now - last_report >= PROGRESS_INTERVAL:
+                        elapsed = now - start_time
+                        speed = processed / elapsed if elapsed > 0 else 0
+                        percent = processed / total_size * 100
+                        # Clamp: a file that grows while being read would give a
+                        # negative ETA, which time.gmtime() rejects on Windows.
+                        remaining = max(total_size - processed, 0)
+                        eta = remaining / speed if speed > 0 else 0
+
+                        print(
+                            f" ⏳ {percent:6.2f}% | "
+                            f"{processed/1024/1024:8.1f} / {total_size/1024/1024:.1f} MB | "
+                            f"{speed/1024/1024:6.1f} MB/s | "
+                            f"ETA {time.strftime('%H:%M:%S', time.gmtime(eta))}",
+                            flush=True
+                        )
+                        last_report = now
+
+        if show_progress:
+            total_time = time.time() - start_time
+            avg_speed = total_size / total_time if total_time > 0 else 0
+            print(
+                f" ✅ DONE | "
+                f"{total_size/1024/1024:.1f} MB | "
+                f"avg {avg_speed/1024/1024:.1f} MB/s | "
+                f"time {time.strftime('%H:%M:%S', time.gmtime(total_time))}",
+                flush=True
+            )
+
+        return h.digest()
+
+    except Exception as e:
+        print(f"⚠️ HASH ERROR: {path} - {e}")
+        raise
+
+
+def get_drive_info():
+    """Prompt for a drive letter and a disk ID; return (drive_root, disk_label)."""
+    print("\n💿 --- NASTAVENÍ SKENOVÁNÍ (EXTERNÍ DISK) ---")
+
+    # 1. Drive letter
+    while True:
+        drive_input = input("📂 Zadejte písmeno disku ve Windows (např. 'E'): ").strip().upper()
+        drive_letter = drive_input.replace(":", "").replace("\\", "").replace("/", "")
+
+        if len(drive_letter) == 1 and drive_letter.isalpha():
+            drive_root = f"{drive_letter}:\\"
+            if os.path.isdir(drive_root):
+                break
+            else:
+                print(f"❌ Disk {drive_root} není dostupný.")
+        else:
+            print("❌ Neplatný formát.")
+
+    # 2. Disk name -> stored as host_name
+    while True:
+        disk_label = input("🏷️ Zadejte ID disku (bude uloženo jako 'host_name', např. '#HD015'): ").strip()
+        if len(disk_label) >= 2:
+            break
+        print("❌ Název je příliš krátký.")
+
+    return drive_root, disk_label
+
+
+def size_allowed(size: int) -> bool:
+    """Return True when *size* is within FILE_MIN_SIZE..FILE_MAX_SIZE (a None bound is ignored)."""
+    if FILE_MIN_SIZE is not None and size < FILE_MIN_SIZE:
+        return False
+    if FILE_MAX_SIZE is not None and size > FILE_MAX_SIZE:
+        return False
+    return True
+
+
+# ==============================
+# MAIN
+# ==============================
+
+def main():
+    """Ask for a drive and disk ID, then hash new or changed files into file_md5_index."""
+    print("🚀 BLAKE3 External Disk Indexer", flush=True)
+    print(f"🖥 Running on PC: {REAL_PC_HOSTNAME}", flush=True)
+
+    # Collect user input
+    scan_root, disk_hostname = get_drive_info()
+
+    print(f"✅ Konfigurace:")
+    print(f" Zdroj (Windows) : {scan_root}")
+    print(f" DB Hostname : {disk_hostname}")
+    print(f" DB Cesty : /Složka/Soubor...")
+
+    try:
+        db = pymysql.connect(**DB_CONFIG)
+        cur = db.cursor()
+    except Exception as e:
+        print(f"❌ DB Connection failed: {e}")
+        input("Enter pro konec...")
+        return
+
+    # Release the cursor and connection even if the scan aborts midway.
+    try:
+        print(f"📥 Načítám index pro disk: '{disk_hostname}'...", flush=True)
+
+        # === OPTIMIZATION: look rows up by exact host_name ===
+        cur.execute("""
+            SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
+            FROM file_md5_index
+            WHERE host_name = %s
+        """, (disk_hostname,))
+
+        # Map: { "/Folder/File.ext": (size, mtime) }
+        indexed_map = {row[0]: (row[1], row[2]) for row in cur.fetchall()}
+
+        print(f"✅ Nalezeno {len(indexed_map):,} souborů v DB pro tento disk.", flush=True)
+        print("======================================", flush=True)
+
+        new_files = 0
+        skipped = 0
+        filtered = 0
+        errors = 0
+        seen_paths = set()
+
+        # --- SCAN ---
+        for root, dirs, files in os.walk(scan_root):
+            # Prune system folders in place so os.walk does not descend into them
+            dirs[:] = [d for d in dirs if d.casefold() not in _EXCLUDED_DIRS_FOLDED]
+
+            for fname in files:
+                disk_path = os.path.join(root, fname)
+
+                # 1. Stat (size, time)
+                try:
+                    stat = os.stat(disk_path)
+                except OSError:
+                    errors += 1
+                    continue
+
+                size = stat.st_size
+                if not size_allowed(size):
+                    filtered += 1
+                    continue
+
+                # 2. Build the clean DB path
+                # E:\Filmy\Avatar.mkv -> Filmy\Avatar.mkv
+                try:
+                    rel_path = os.path.relpath(disk_path, scan_root)
+                except ValueError:
+                    errors += 1
+                    continue
+
+                # Normalize to Linux style: Filmy/Avatar.mkv
+                clean_path = rel_path.replace("\\", "/")
+
+                # Prepend a slash: /Filmy/Avatar.mkv
+                if not clean_path.startswith("/"):
+                    clean_path = "/" + clean_path
+
+                if clean_path in seen_paths:
+                    continue
+                seen_paths.add(clean_path)
+
+                mtime = int(stat.st_mtime)
+
+                # === STRICT CHECK ===
+                is_match = False
+                if clean_path in indexed_map:
+                    db_size, db_mtime = indexed_map[clean_path]
+                    if size == db_size and mtime == db_mtime:
+                        is_match = True
+
+                if is_match:
+                    skipped += 1
+                    if PRINT_SKIPPED:
+                        print(f"⏭ SKIP {clean_path}", flush=True)
+                    continue
+
+                # === INSERT / UPDATE ===
+                print("➕ NEW / UPDATED", flush=True)
+                print(f" File: {clean_path}", flush=True)
+                print(f" Size: {size:,} B", flush=True)
+
+                try:
+                    b3 = compute_blake3(disk_path)
+                except Exception:
+                    errors += 1
+                    continue
+
+                cur.execute("""
+                    INSERT INTO file_md5_index
+                    (os_name, host_name, full_path, file_name, directory,
+                    file_size, mtime, blake3)
+                    VALUES (%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
+                    ON DUPLICATE KEY UPDATE
+                    file_size = VALUES(file_size),
+                    mtime = VALUES(mtime),
+                    blake3 = VALUES(blake3),
+                    updated_at = CURRENT_TIMESTAMP
+                """, (
+                    OS_NAME,  # e.g. 'Windows' (the OS the scan ran on)
+                    disk_hostname,  # stores the disk ID, e.g. '#HD015'
+                    clean_path,  # stores e.g. '/Filmy/Avatar.mkv'
+                    fname,
+                    os.path.dirname(clean_path),
+                    size,
+                    mtime,
+                    b3,
+                ))
+
+                new_files += 1
+                print(f" Hash: {b3.hex()}", flush=True)
+                print("--------------------------------------", flush=True)
+
+        print("======================================", flush=True)
+        print(f"✅ Hotovo : {new_files}")
+        print(f"⏭ Shoda : {skipped}")
+        print(f"🚫 Filtr : {filtered}")
+        print(f"⚠️ Chyby : {errors}")
+        print("🏁 Konec.")
+    finally:
+        cur.close()
+        db.close()
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/WalkFilesOnBackupHDD/20 WalkBackupHDD.py b/WalkFilesOnBackupHDD/20 WalkBackupHDD.py
new file mode 100644
index 0000000..add08f0
--- /dev/null
+++ b/WalkFilesOnBackupHDD/20 WalkBackupHDD.py
@@ -0,0 +1,389 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+r"""
+FAST FILE HASH INDEXER – WINDOWS CLIENT (HARDCODED CONFIG)
+- Mode: PHYSICAL BACKUP
+- Hostname in DB = Disk Label (e.g., #HD015)
+- Path in DB = Relative path (e.g., /Movies/Film.mkv)
+"""
+
+import os, time
+import pymysql
+import socket
+import platform
+import sys
+from blake3 import blake3
+
+# ==============================
+# ⚙️ USER CONFIGURATION
+# ==============================
+DISK_DRIVE_LETTER = "P"  # (e.g., "E", "F", "P")
+DISK_HOSTNAME = "#HD08"  # (e.g., "#HD015")
+
+# 🔒 SAFETY SWITCH
+# True = LIST ONLY (No DB changes). "Simulates" the run.
+# False = EXECUTE (Deletes and Inserts into DB).
+DRY_RUN = True
+
+# ==============================
+# TECHNICAL CONFIG
+# ==============================
+CHUNK_SIZE = 5 * 1024 * 1024  # 5 MB
+PROGRESS_MIN_SIZE = 500 * 1024 * 1024  # 500 MB
+PROGRESS_INTERVAL = 1.0  # seconds
+
+EXCLUDED_DIRS = {"$RECYCLE.BIN", "System Volume Information", "RECYCLER", "msdownld.tmp"}
+# Windows directory names are case-insensitive ($Recycle.Bin vs $RECYCLE.BIN).
+_EXCLUDED_DIRS_FOLDED = {name.casefold() for name in EXCLUDED_DIRS}
+
+# --- File Size Limits ---
+FILE_MIN_SIZE = 0
+FILE_MAX_SIZE = 1024 * 1024 * 1024 * 1024  # 1TB
+
+# --- DB Config ---
+DB_CONFIG = {
+    "host": "192.168.1.76",
+    "port": 3307,
+    "user": "root",
+    # NOTE(review): credentials should not live in source control; the
+    # INDEXER_DB_PASSWORD environment variable overrides the legacy default.
+    "password": os.environ.get("INDEXER_DB_PASSWORD", "Vlado9674+"),
+    "database": "torrents",
+    "charset": "utf8mb4",
+    "autocommit": True,
+}
+
+PRINT_SKIPPED = False  # Set True to see files that were already in DB
+
+# ==============================
+# SYSTEM INFO
+# ==============================
+REAL_PC_HOSTNAME = socket.gethostname()
+OS_NAME = platform.system()
+
+
+# ==============================
+# FUNCTIONS
+# ==============================
+
+def compute_blake3(path: str) -> bytes:
+    """
+    Return the BLAKE3 digest of the file at *path*, read in CHUNK_SIZE pieces.
+
+    Files of at least PROGRESS_MIN_SIZE bytes print a progress line about
+    every PROGRESS_INTERVAL seconds plus a final summary. Errors raised while
+    reading are printed and re-raised so the caller can count them.
+    """
+    h = blake3()
+    total_size = os.path.getsize(path)
+    show_progress = total_size >= PROGRESS_MIN_SIZE
+
+    processed = 0
+    start_time = time.time()
+    last_report = start_time
+
+    try:
+        with open(path, "rb") as f:
+            while True:
+                chunk = f.read(CHUNK_SIZE)
+                if not chunk:
+                    break
+
+                h.update(chunk)
+                processed += len(chunk)
+
+                if show_progress:
+                    now = time.time()
+                    if now - last_report >= PROGRESS_INTERVAL:
+                        elapsed = now - start_time
+                        speed = processed / elapsed if elapsed > 0 else 0
+                        percent = processed / total_size * 100
+                        # Clamp: a file that grows while being read would give a
+                        # negative ETA, which time.gmtime() rejects on Windows.
+                        remaining = max(total_size - processed, 0)
+                        eta = remaining / speed if speed > 0 else 0
+
+                        print(
+                            f" ⏳ {percent:6.2f}% | "
+                            f"{processed / 1024 / 1024:8.1f} / {total_size / 1024 / 1024:.1f} MB | "
+                            f"{speed / 1024 / 1024:6.1f} MB/s | "
+                            f"ETA {time.strftime('%H:%M:%S', time.gmtime(eta))}",
+                            flush=True
+                        )
+                        last_report = now
+
+        if show_progress:
+            total_time = time.time() - start_time
+            avg_speed = total_size / total_time if total_time > 0 else 0
+            print(
+                f" ✅ DONE | "
+                f"{total_size / 1024 / 1024:.1f} MB | "
+                f"avg {avg_speed / 1024 / 1024:.1f} MB/s | "
+                f"time {time.strftime('%H:%M:%S', time.gmtime(total_time))}",
+                flush=True
+            )
+
+        return h.digest()
+
+    except Exception as e:
+        print(f"⚠️ HASH ERROR: {path} - {e}")
+        raise
+
+
+def size_allowed(size: int) -> bool:
+    """Return True when *size* is within FILE_MIN_SIZE..FILE_MAX_SIZE (a None bound is ignored)."""
+    if FILE_MIN_SIZE is not None and size < FILE_MIN_SIZE:
+        return False
+    if FILE_MAX_SIZE is not None and size > FILE_MAX_SIZE:
+        return False
+    return True
+
+
+def normalize_db_path(scan_root, disk_path):
+    r"""
+    Convert a physical Windows path to the standardized DB format.
+
+    E:\Movies\File.mkv -> /Movies/File.mkv
+
+    Returns None when os.path.relpath raises ValueError (e.g. the path is
+    on a different drive than *scan_root*).
+    """
+    try:
+        rel_path = os.path.relpath(disk_path, scan_root)
+    except ValueError:
+        return None
+
+    # Windows backslash to slash
+    clean_path = rel_path.replace("\\", "/")
+
+    # Ensure leading slash
+    if not clean_path.startswith("/"):
+        clean_path = "/" + clean_path
+
+    return clean_path
+
+
+# ==============================
+# MAIN
+# ==============================
+
+def main():
+    """Remove DB rows for files gone from the disk, then hash new or changed files (honours DRY_RUN)."""
+    print("🚀 BLAKE3 External Disk Indexer", flush=True)
+    print(f"🖥 Running on PC: {REAL_PC_HOSTNAME}", flush=True)
+
+    if DRY_RUN:
+        print("🛡️ DRY RUN MODE ACTIVE: No changes will be made to DB.", flush=True)
+    else:
+        print("⚠️ LIVE MODE: Changes WILL be committed to DB.", flush=True)
+
+    # Build root path
+    scan_root = f"{DISK_DRIVE_LETTER}:\\"
+
+    if not os.path.isdir(scan_root):
+        print(f"❌ ERROR: Drive '{scan_root}' not found!")
+        print(f" Please check DISK_DRIVE_LETTER in config.")
+        return
+
+    print(f"✅ Config:")
+    print(f" Source (Win) : {scan_root}")
+    print(f" DB Hostname : {DISK_HOSTNAME}")
+
+    try:
+        db = pymysql.connect(**DB_CONFIG)
+        cur = db.cursor()
+    except Exception as e:
+        print(f"❌ DB Connection failed: {e}")
+        return
+
+    # Release the cursor and connection even if a phase aborts midway.
+    try:
+        print(f"📥 Loading DB index for: '{DISK_HOSTNAME}'...", flush=True)
+
+        # === LOAD EXISTING DB RECORDS ===
+        cur.execute("""
+            SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
+            FROM file_md5_index
+            WHERE host_name = %s
+        """, (DISK_HOSTNAME,))
+
+        indexed_map = {row[0]: (row[1], row[2]) for row in cur.fetchall()}
+
+        print(f"✅ Found {len(indexed_map):,} files in DB for this disk.", flush=True)
+
+        # =========================================================
+        # PHASE 1: CLEANUP (DELETE MISSING FILES)
+        # =========================================================
+        print("======================================", flush=True)
+        print("🧹 PHASE 1: Checking for deleted files...", flush=True)
+
+        current_disk_paths = set()
+        # os.walk silently skips directories it cannot list unless onerror is
+        # given; record them so unreadable folders are not treated as deleted.
+        walk_errors = []
+
+        # Fast walk just to get paths
+        for root, dirs, files in os.walk(scan_root, onerror=walk_errors.append):
+            dirs[:] = [d for d in dirs if d.casefold() not in _EXCLUDED_DIRS_FOLDED]
+
+            for fname in files:
+                disk_path = os.path.join(root, fname)
+                clean_path = normalize_db_path(scan_root, disk_path)
+                if clean_path:
+                    current_disk_paths.add(clean_path)
+
+        if walk_errors:
+            print(f"⚠️ {len(walk_errors):,} directories could not be read; skipping cleanup:")
+            for err in walk_errors[:20]:
+                print(f" - {err.filename}")
+            paths_to_delete = set()
+        else:
+            paths_to_delete = set(indexed_map) - current_disk_paths
+        deleted_count = 0
+
+        if paths_to_delete:
+            print(f"🗑️ Found {len(paths_to_delete):,} files to delete from DB.")
+
+            if DRY_RUN:
+                print("🛡️ [DRY RUN] Listing files to be deleted (No action taken):")
+                for p in sorted(paths_to_delete)[:20]:  # Print first 20
+                    print(f" - {p}")
+                if len(paths_to_delete) > 20:
+                    print(f" ... and {len(paths_to_delete) - 20} more.")
+                deleted_count = len(paths_to_delete)
+            else:
+                # Delete in batches
+                batch_size = 1000
+                to_delete_list = list(paths_to_delete)
+
+                for i in range(0, len(to_delete_list), batch_size):
+                    batch = to_delete_list[i: i + batch_size]
+                    format_strings = ','.join(['%s'] * len(batch))
+
+                    query = f"DELETE FROM file_md5_index WHERE host_name = %s AND full_path IN ({format_strings})"
+
+                    try:
+                        cur.execute(query, [DISK_HOSTNAME] + batch)
+                    except Exception as e:
+                        print(f"❌ Error deleting batch: {e}")
+                        continue
+
+                    # Only a batch that actually went through is dropped from
+                    # the local map and counted as deleted.
+                    for p in batch:
+                        del indexed_map[p]
+                    deleted_count += len(batch)
+                    print(f" ... deleted batch {i}-{i + len(batch)}")
+                print("✅ Cleanup complete.")
+        elif not walk_errors:
+            print("✅ No deleted files detected.")
+
+        # =========================================================
+        # PHASE 2: SCAN & UPDATE (HASHING)
+        # =========================================================
+        print("======================================", flush=True)
+        print("🚀 PHASE 2: Scanning for changes & new files...", flush=True)
+
+        new_files = 0
+        skipped = 0
+        filtered = 0
+        errors = 0
+        seen_paths = set()
+
+        for root, dirs, files in os.walk(scan_root):
+            dirs[:] = [d for d in dirs if d.casefold() not in _EXCLUDED_DIRS_FOLDED]
+
+            for fname in files:
+                disk_path = os.path.join(root, fname)
+
+                try:
+                    stat = os.stat(disk_path)
+                except OSError:
+                    errors += 1
+                    continue
+
+                size = stat.st_size
+                if not size_allowed(size):
+                    filtered += 1
+                    continue
+
+                clean_path = normalize_db_path(scan_root, disk_path)
+                if not clean_path:
+                    errors += 1
+                    continue
+
+                if clean_path in seen_paths:
+                    continue
+                seen_paths.add(clean_path)
+
+                mtime = int(stat.st_mtime)
+
+                # === MATCH CHECK ===
+                is_match = False
+                if clean_path in indexed_map:
+                    db_size, db_mtime = indexed_map[clean_path]
+                    if size == db_size and mtime == db_mtime:
+                        is_match = True
+
+                if is_match:
+                    skipped += 1
+                    if PRINT_SKIPPED:
+                        print(f"⏭ SKIP {clean_path}", flush=True)
+                    continue
+
+                # === INSERT / UPDATE ===
+                print("➕ NEW / UPDATED", flush=True)
+                print(f" File: {clean_path}", flush=True)
+                print(f" Size: {size:,} B", flush=True)
+
+                try:
+                    b3 = compute_blake3(disk_path)
+                except Exception:
+                    errors += 1
+                    continue
+
+                if DRY_RUN:
+                    print(f"🛡️ [DRY RUN] Would INSERT/UPDATE: {clean_path}")
+                    print(f" Hash: {b3.hex()}")
+                    new_files += 1
+                else:
+                    cur.execute("""
+                        INSERT INTO file_md5_index
+                        (os_name, host_name, full_path, file_name, directory,
+                        file_size, mtime, blake3)
+                        VALUES (%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
+                        ON DUPLICATE KEY UPDATE
+                        file_size = VALUES(file_size),
+                        mtime = VALUES(mtime),
+                        blake3 = VALUES(blake3),
+                        updated_at = CURRENT_TIMESTAMP
+                    """, (
+                        OS_NAME,
+                        DISK_HOSTNAME,
+                        clean_path,
+                        fname,
+                        os.path.dirname(clean_path),
+                        size,
+                        mtime,
+                        b3,
+                    ))
+                    new_files += 1
+                    print(f" Hash: {b3.hex()}", flush=True)
+
+                print("--------------------------------------", flush=True)
+
+        print("======================================", flush=True)
+        print(f"✅ Processed : {new_files}")
+        print(f"⏭ Skipped : {skipped}")
+        print(f"🚫 Filtered : {filtered}")
+        print(f"🗑 Deleted : {deleted_count} " + ("(DRY RUN)" if DRY_RUN else ""))
+        print(f"⚠️ Errors : {errors}")
+        print("🏁 Done.")
+    finally:
+        cur.close()
+        db.close()
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file