tw22

2026-01-08 10:15:45 +01:00
parent 2aee823e87
commit 6cdabc64b4
2 changed files with 646 additions and 0 deletions
--- a/WalkFilesOnBackupHDD/10
+++ b/WalkFilesOnBackupHDD/10
@@ -0,0 +1,295 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+"""
+FAST FILE HASH INDEXER – WINDOWS CLIENT (EXTERNAL DISKS)
+- Mode: PHYSICAL BACKUP
+- Hostname in DB = Disk Label (e.g., #HD015)
+- Path in DB     = Relative path (e.g., /Movies/Film.mkv)
+"""
+
+import os, time
+import pymysql
+import socket
+import platform
+import sys
+from blake3 import blake3
+
+# ==============================
+# CONFIG
+# ==============================
+CHUNK_SIZE = 5 * 1024 * 1024   # 5 MB
+PROGRESS_MIN_SIZE = 500 * 1024 * 1024  # 500 MB
+PROGRESS_INTERVAL = 1.0  # seconds
+
+EXCLUDED_DIRS = {"$RECYCLE.BIN", "System Volume Information", "RECYCLER", "msdownld.tmp"}
+
+# --- Limity velikosti ---
+FILE_MIN_SIZE = 0
+FILE_MAX_SIZE = 1024 * 1024 * 1024* 1024  # 1TB
+
+# --- Nastavení Databáze ---
+DB_CONFIG = {
+    "host": "192.168.1.76",
+    "port": 3307,
+    "user": "root",
+    "password": "Vlado9674+",
+    "database": "torrents",
+    "charset": "utf8mb4",
+    "autocommit": True,
+}
+
+CHUNK_SIZE = 4 * 1024 * 1024  # 4 MB
+PRINT_SKIPPED = False  # True = vypisovat i přeskočené
+
+# ==============================
+# SYSTEM INFO
+# ==============================
+
+# Fyzický název PC (jen pro výpis do konzole, do DB půjde název disku)
+REAL_PC_HOSTNAME = socket.gethostname()
+OS_NAME = platform.system()
+
+
+# ==============================
+# FUNCTIONS
+# ==============================
+
+def compute_blake3(path: str) -> bytes:
+    h = blake3()
+    total_size = os.path.getsize(path)
+    show_progress = total_size >= PROGRESS_MIN_SIZE
+
+    processed = 0
+    start_time = time.time()
+    last_report = start_time
+
+    try:
+        with open(path, "rb") as f:
+            while True:
+                chunk = f.read(CHUNK_SIZE)
+                if not chunk:
+                    break
+
+                h.update(chunk)
+                processed += len(chunk)
+
+                if show_progress:
+                    now = time.time()
+                    if now - last_report >= PROGRESS_INTERVAL:
+                        elapsed = now - start_time
+                        speed = processed / elapsed if elapsed > 0 else 0
+                        percent = processed / total_size * 100
+                        remaining = total_size - processed
+                        eta = remaining / speed if speed > 0 else 0
+
+                        print(
+                            f"   ⏳ {percent:6.2f}% | "
+                            f"{processed/1024/1024:8.1f} / {total_size/1024/1024:.1f} MB | "
+                            f"{speed/1024/1024:6.1f} MB/s | "
+                            f"ETA {time.strftime('%H:%M:%S', time.gmtime(eta))}",
+                            flush=True
+                        )
+                        last_report = now
+
+        if show_progress:
+            total_time = time.time() - start_time
+            avg_speed = total_size / total_time if total_time > 0 else 0
+            print(
+                f"   ✅ DONE | "
+                f"{total_size/1024/1024:.1f} MB | "
+                f"avg {avg_speed/1024/1024:.1f} MB/s | "
+                f"time {time.strftime('%H:%M:%S', time.gmtime(total_time))}",
+                flush=True
+            )
+
+        return h.digest()
+
+    except Exception as e:
+        print(f"⚠️ HASH ERROR: {path} - {e}")
+        raise
+
+
+
+def get_drive_info():
+    """Získá písmeno disku a jeho ID (které se použije jako host_name)."""
+    print("\n💿 --- NASTAVENÍ SKENOVÁNÍ (EXTERNÍ DISK) ---")
+
+    # 1. Písmeno disku
+    while True:
+        drive_input = input("📂 Zadejte písmeno disku ve Windows (např. 'E'): ").strip().upper()
+        drive_letter = drive_input.replace(":", "").replace("\\", "").replace("/", "")
+
+        if len(drive_letter) == 1 and drive_letter.isalpha():
+            drive_root = f"{drive_letter}:\\"
+            if os.path.isdir(drive_root):
+                break
+            else:
+                print(f"❌ Disk {drive_root} není dostupný.")
+        else:
+            print("❌ Neplatný formát.")
+
+    # 2. Název disku -> HOST_NAME
+    while True:
+        disk_label = input("🏷️  Zadejte ID disku (bude uloženo jako 'host_name', např. '#HD015'): ").strip()
+        if len(disk_label) >= 2:
+            break
+        print("❌ Název je příliš krátký.")
+
+    return drive_root, disk_label
+
+
+def size_allowed(size: int) -> bool:
+    if FILE_MIN_SIZE is not None and size < FILE_MIN_SIZE: return False
+    if FILE_MAX_SIZE is not None and size > FILE_MAX_SIZE: return False
+    return True
+
+
+# ==============================
+# MAIN
+# ==============================
+
+def main():
+    print("🚀 BLAKE3 External Disk Indexer", flush=True)
+    print(f"🖥 Running on PC: {REAL_PC_HOSTNAME}", flush=True)
+
+    # Získání vstupů
+    scan_root, disk_hostname = get_drive_info()
+
+    print(f"✅ Konfigurace:")
+    print(f"   Zdroj (Windows) : {scan_root}")
+    print(f"   DB Hostname     : {disk_hostname}")
+    print(f"   DB Cesty        : /Složka/Soubor...")
+
+    try:
+        db = pymysql.connect(**DB_CONFIG)
+        cur = db.cursor()
+    except Exception as e:
+        print(f"❌ DB Connection failed: {e}")
+        input("Enter pro konec...")
+        return
+
+    print(f"📥 Načítám index pro disk: '{disk_hostname}'...", flush=True)
+
+    # === OPTIMALIZACE: Hledáme přesně podle host_name ===
+    cur.execute("""
+        SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
+        FROM file_md5_index
+        WHERE host_name = %s
+    """, (disk_hostname,))
+
+    # Mapa: { "/Slozka/Soubor.ext": (size, mtime) }
+    indexed_map = {row[0]: (row[1], row[2]) for row in cur.fetchall()}
+
+    print(f"✅ Nalezeno {len(indexed_map):,} souborů v DB pro tento disk.", flush=True)
+    print("======================================", flush=True)
+
+    new_files = 0
+    skipped = 0
+    filtered = 0
+    errors = 0
+    seen_paths = set()
+
+    # --- SCAN ---
+    for root, dirs, files in os.walk(scan_root):
+        # Ignorace systémových složek
+        dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]
+
+        for fname in files:
+            disk_path = os.path.join(root, fname)
+
+            # 1. Stat (velikost, čas)
+            try:
+                stat = os.stat(disk_path)
+            except OSError:
+                errors += 1
+                continue
+
+            size = stat.st_size
+            if not size_allowed(size):
+                filtered += 1
+                continue
+
+            # 2. Vytvoření čisté cesty pro DB
+            # E:\Filmy\Avatar.mkv -> Filmy\Avatar.mkv
+            try:
+                rel_path = os.path.relpath(disk_path, scan_root)
+            except ValueError:
+                errors += 1
+                continue
+
+            # Normalizace na Linux style: Filmy/Avatar.mkv
+            clean_path = rel_path.replace("\\", "/")
+
+            # Přidání lomítka na začátek: /Filmy/Avatar.mkv
+            if not clean_path.startswith("/"):
+                clean_path = "/" + clean_path
+
+            if clean_path in seen_paths:
+                continue
+            seen_paths.add(clean_path)
+
+            mtime = int(stat.st_mtime)
+
+            # === STRICT CHECK ===
+            is_match = False
+            if clean_path in indexed_map:
+                db_size, db_mtime = indexed_map[clean_path]
+                if size == db_size and mtime == db_mtime:
+                    is_match = True
+
+            if is_match:
+                skipped += 1
+                if PRINT_SKIPPED:
+                    print(f"⏭ SKIP {clean_path}", flush=True)
+                continue
+
+            # === INSERT / UPDATE ===
+            print("➕ NEW / UPDATED", flush=True)
+            print(f"   File: {clean_path}", flush=True)
+            print(f"   Size: {size:,} B", flush=True)
+
+            try:
+                b3 = compute_blake3(disk_path)
+            except Exception:
+                errors += 1
+                continue
+
+            cur.execute("""
+                INSERT INTO file_md5_index
+                    (os_name, host_name, full_path, file_name, directory,
+                     file_size, mtime, blake3)
+                VALUES (%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
+                ON DUPLICATE KEY UPDATE
+                    file_size  = VALUES(file_size),
+                    mtime      = VALUES(mtime),
+                    blake3     = VALUES(blake3),
+                    updated_at = CURRENT_TIMESTAMP
+            """, (
+                OS_NAME,  # Např. 'Windows' (kde se to skenovalo)
+                disk_hostname,  # ZDE SE UKLÁDÁ '#HD015'
+                clean_path,  # ZDE SE UKLÁDÁ '/Filmy/Avatar.mkv'
+                fname,
+                os.path.dirname(clean_path),
+                size,
+                mtime,
+                b3,
+            ))
+
+            new_files += 1
+            print(f"   Hash: {b3.hex()}", flush=True)
+            print("--------------------------------------", flush=True)
+
+    print("======================================", flush=True)
+    print(f"✅ Hotovo : {new_files}")
+    print(f"⏭ Shoda  : {skipped}")
+    print(f"⚠️ Chyby  : {errors}")
+    print("🏁 Konec.")
+
+    cur.close()
+    db.close()
+    # input("\nStiskněte Enter pro ukončení...")
+
+
+# Run the indexer only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/WalkFilesOnBackupHDD/20
+++ b/WalkFilesOnBackupHDD/20
@@ -0,0 +1,351 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+
+r"""
+FAST FILE HASH INDEXER – WINDOWS CLIENT (HARDCODED CONFIG)
+- Mode: PHYSICAL BACKUP
+- Hostname in DB = Disk Label (e.g., #HD015)
+- Path in DB     = Relative path (e.g., /Movies/Film.mkv)
+"""
+
+import os, time
+import pymysql
+import socket
+import platform
+import sys
+from blake3 import blake3
+
+# ==============================
+# ⚙️ USER CONFIGURATION
+# ==============================
+# Drive letter of the disk to scan (without colon) and the label under which
+# its files are stored in the DB (host_name column).
+DISK_DRIVE_LETTER = "P"  # (e.g., "E", "F", "P")
+DISK_HOSTNAME = "#HD08"  # (e.g., "#HD015")
+
+# 🔒 SAFETY SWITCH
+# True  = LIST ONLY (No DB changes). "Simulates" the run.
+# False = EXECUTE (Deletes and Inserts into DB).
+# Note: a dry run still reads and hashes every new or changed file; only the
+# DELETE / INSERT statements are skipped.
+DRY_RUN = True
+
+# ==============================
+# TECHNICAL CONFIG
+# ==============================
+# Read size per chunk while hashing, the minimum file size that triggers
+# progress output, and the minimum delay between two progress lines.
+CHUNK_SIZE = 5 * 1024 * 1024  # 5 MB
+PROGRESS_MIN_SIZE = 500 * 1024 * 1024  # 500 MB
+PROGRESS_INTERVAL = 1.0  # seconds
+
+# Directory names pruned from every walk of the disk.
+EXCLUDED_DIRS = {"$RECYCLE.BIN", "System Volume Information", "RECYCLER", "msdownld.tmp"}
+
+# --- File Size Limits ---
+# Bytes; files outside [FILE_MIN_SIZE, FILE_MAX_SIZE] are not indexed.
+FILE_MIN_SIZE = 0
+FILE_MAX_SIZE = 1024 * 1024 * 1024 * 1024  # 1TB
+
+# --- DB Config ---
+# Keyword arguments passed unchanged to pymysql.connect().
+# NOTE(review): credentials of the "root" account are stored in plain text in
+# the source; consider moving them out of the repository.
+DB_CONFIG = {
+    "host": "192.168.1.76",
+    "port": 3307,
+    "user": "root",
+    "password": "Vlado9674+",
+    "database": "torrents",
+    "charset": "utf8mb4",
+    "autocommit": True,
+}
+
+PRINT_SKIPPED = False  # Set True to see files that were already in DB
+
+# ==============================
+# SYSTEM INFO
+# ==============================
+# Name of the PC running the scan (console output only) and the OS name that
+# is written to the os_name column.
+REAL_PC_HOSTNAME = socket.gethostname()
+OS_NAME = platform.system()
+
+
+# ==============================
+# FUNCTIONS
+# ==============================
+
+def compute_blake3(path: str) -> bytes:
+    h = blake3()
+    total_size = os.path.getsize(path)
+    show_progress = total_size >= PROGRESS_MIN_SIZE
+
+    processed = 0
+    start_time = time.time()
+    last_report = start_time
+
+    try:
+        with open(path, "rb") as f:
+            while True:
+                chunk = f.read(CHUNK_SIZE)
+                if not chunk:
+                    break
+
+                h.update(chunk)
+                processed += len(chunk)
+
+                if show_progress:
+                    now = time.time()
+                    if now - last_report >= PROGRESS_INTERVAL:
+                        elapsed = now - start_time
+                        speed = processed / elapsed if elapsed > 0 else 0
+                        percent = processed / total_size * 100
+                        remaining = total_size - processed
+                        eta = remaining / speed if speed > 0 else 0
+
+                        print(
+                            f"   ⏳ {percent:6.2f}% | "
+                            f"{processed / 1024 / 1024:8.1f} / {total_size / 1024 / 1024:.1f} MB | "
+                            f"{speed / 1024 / 1024:6.1f} MB/s | "
+                            f"ETA {time.strftime('%H:%M:%S', time.gmtime(eta))}",
+                            flush=True
+                        )
+                        last_report = now
+
+        if show_progress:
+            total_time = time.time() - start_time
+            avg_speed = total_size / total_time if total_time > 0 else 0
+            print(
+                f"   ✅ DONE | "
+                f"{total_size / 1024 / 1024:.1f} MB | "
+                f"avg {avg_speed / 1024 / 1024:.1f} MB/s | "
+                f"time {time.strftime('%H:%M:%S', time.gmtime(total_time))}",
+                flush=True
+            )
+
+        return h.digest()
+
+    except Exception as e:
+        print(f"⚠️ HASH ERROR: {path} - {e}")
+        raise
+
+
+def size_allowed(size: int) -> bool:
+    if FILE_MIN_SIZE is not None and size < FILE_MIN_SIZE: return False
+    if FILE_MAX_SIZE is not None and size > FILE_MAX_SIZE: return False
+    return True
+
+
+def normalize_db_path(scan_root, disk_path):
+    """
+    Converts a physical Windows path to the standardized DB format.
+    E:\Movies\File.mkv -> /Movies/File.mkv
+    """
+    try:
+        rel_path = os.path.relpath(disk_path, scan_root)
+    except ValueError:
+        return None
+
+    # Windows backslash to slash
+    clean_path = rel_path.replace("\\", "/")
+
+    # Ensure leading slash
+    if not clean_path.startswith("/"):
+        clean_path = "/" + clean_path
+
+    return clean_path
+
+
+# ==============================
+# MAIN
+# ==============================
+
+def main():
+    print("🚀 BLAKE3 External Disk Indexer", flush=True)
+    print(f"🖥 Running on PC: {REAL_PC_HOSTNAME}", flush=True)
+
+    if DRY_RUN:
+        print("🛡️  DRY RUN MODE ACTIVE: No changes will be made to DB.", flush=True)
+    else:
+        print("⚠️  LIVE MODE: Changes WILL be committed to DB.", flush=True)
+
+    # Build root path
+    scan_root = f"{DISK_DRIVE_LETTER}:\\"
+
+    if not os.path.isdir(scan_root):
+        print(f"❌ ERROR: Drive '{scan_root}' not found!")
+        print(f"   Please check DISK_DRIVE_LETTER in config.")
+        return
+
+    print(f"✅ Config:")
+    print(f"   Source (Win) : {scan_root}")
+    print(f"   DB Hostname  : {DISK_HOSTNAME}")
+
+    try:
+        db = pymysql.connect(**DB_CONFIG)
+        cur = db.cursor()
+    except Exception as e:
+        print(f"❌ DB Connection failed: {e}")
+        return
+
+    print(f"📥 Loading DB index for: '{DISK_HOSTNAME}'...", flush=True)
+
+    # === LOAD EXISTING DB RECORDS ===
+    cur.execute("""
+        SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
+        FROM file_md5_index
+        WHERE host_name = %s
+    """, (DISK_HOSTNAME,))
+
+    indexed_map = {row[0]: (row[1], row[2]) for row in cur.fetchall()}
+
+    print(f"✅ Found {len(indexed_map):,} files in DB for this disk.", flush=True)
+
+    # =========================================================
+    # PHASE 1: CLEANUP (DELETE MISSING FILES)
+    # =========================================================
+    print("======================================", flush=True)
+    print("🧹 PHASE 1: Checking for deleted files...", flush=True)
+
+    current_disk_paths = set()
+
+    # Fast walk just to get paths
+    for root, dirs, files in os.walk(scan_root):
+        dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]
+
+        for fname in files:
+            disk_path = os.path.join(root, fname)
+            clean_path = normalize_db_path(scan_root, disk_path)
+            if clean_path:
+                current_disk_paths.add(clean_path)
+
+    paths_to_delete = set(indexed_map.keys()) - current_disk_paths
+
+    if paths_to_delete:
+        print(f"🗑️  Found {len(paths_to_delete):,} files to delete from DB.")
+
+        if DRY_RUN:
+            print("🛡️  [DRY RUN] Listing files to be deleted (No action taken):")
+            for p in sorted(list(paths_to_delete))[:20]:  # Print first 20
+                print(f"   - {p}")
+            if len(paths_to_delete) > 20:
+                print(f"   ... and {len(paths_to_delete) - 20} more.")
+        else:
+            # Delete in batches
+            batch_size = 1000
+            to_delete_list = list(paths_to_delete)
+
+            for i in range(0, len(to_delete_list), batch_size):
+                batch = to_delete_list[i: i + batch_size]
+                format_strings = ','.join(['%s'] * len(batch))
+
+                query = f"DELETE FROM file_md5_index WHERE host_name = %s AND full_path IN ({format_strings})"
+
+                try:
+                    cur.execute(query, [DISK_HOSTNAME] + batch)
+                    print(f"   ... deleted batch {i}-{i + len(batch)}")
+                except Exception as e:
+                    print(f"❌ Error deleting batch: {e}")
+
+            # Update local map
+            for p in paths_to_delete:
+                del indexed_map[p]
+            print("✅ Cleanup complete.")
+    else:
+        print("✅ No deleted files detected.")
+
+    # =========================================================
+    # PHASE 2: SCAN & UPDATE (HASHING)
+    # =========================================================
+    print("======================================", flush=True)
+    print("🚀 PHASE 2: Scanning for changes & new files...", flush=True)
+
+    new_files = 0
+    skipped = 0
+    filtered = 0
+    errors = 0
+    seen_paths = set()
+
+    for root, dirs, files in os.walk(scan_root):
+        dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]
+
+        for fname in files:
+            disk_path = os.path.join(root, fname)
+
+            try:
+                stat = os.stat(disk_path)
+            except OSError:
+                errors += 1
+                continue
+
+            size = stat.st_size
+            if not size_allowed(size):
+                filtered += 1
+                continue
+
+            clean_path = normalize_db_path(scan_root, disk_path)
+            if not clean_path:
+                errors += 1
+                continue
+
+            if clean_path in seen_paths:
+                continue
+            seen_paths.add(clean_path)
+
+            mtime = int(stat.st_mtime)
+
+            # === MATCH CHECK ===
+            is_match = False
+            if clean_path in indexed_map:
+                db_size, db_mtime = indexed_map[clean_path]
+                if size == db_size and mtime == db_mtime:
+                    is_match = True
+
+            if is_match:
+                skipped += 1
+                if PRINT_SKIPPED:
+                    print(f"⏭ SKIP {clean_path}", flush=True)
+                continue
+
+            # === INSERT / UPDATE ===
+            print("➕ NEW / UPDATED", flush=True)
+            print(f"   File: {clean_path}", flush=True)
+            print(f"   Size: {size:,} B", flush=True)
+
+            try:
+                b3 = compute_blake3(disk_path)
+            except Exception:
+                errors += 1
+                continue
+
+            if DRY_RUN:
+                print(f"🛡️  [DRY RUN] Would INSERT/UPDATE: {clean_path}")
+                print(f"   Hash: {b3.hex()}")
+                new_files += 1
+            else:
+                cur.execute("""
+                    INSERT INTO file_md5_index
+                        (os_name, host_name, full_path, file_name, directory,
+                        file_size, mtime, blake3)
+                    VALUES (%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
+                    ON DUPLICATE KEY UPDATE
+                        file_size  = VALUES(file_size),
+                        mtime      = VALUES(mtime),
+                        blake3     = VALUES(blake3),
+                        updated_at = CURRENT_TIMESTAMP
+                """, (
+                    OS_NAME,
+                    DISK_HOSTNAME,
+                    clean_path,
+                    fname,
+                    os.path.dirname(clean_path),
+                    size,
+                    mtime,
+                    b3,
+                ))
+                new_files += 1
+                print(f"   Hash: {b3.hex()}", flush=True)
+
+            print("--------------------------------------", flush=True)
+
+    print("======================================", flush=True)
+    print(f"✅ Processed  : {new_files}")
+    print(f"⏭ Skipped    : {skipped}")
+    print(f"🗑 Deleted    : {len(paths_to_delete)} " + ("(DRY RUN)" if DRY_RUN else ""))
+    print(f"⚠️ Errors     : {errors}")
+    print("🏁 Done.")
+
+    cur.close()
+    db.close()
+
+
+# Run the indexer only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()