2026-01-16 15:34:12 +01:00
parent 186c98fd0d
commit 2d2a60a845
6 changed files with 850 additions and 1 deletions

dddddd.py Normal file

@@ -0,0 +1,315 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
FAST FILE HASH INDEXER UNRAID (BLAKE3 ONLY, ALL SHARES)
- HARDCODED SINGLE SHARE MODE
- SQL OPTIMIZATION
- STRICT MODE (NO TOLERANCE) - Updates DB on any mismatch
"""
import os
import pymysql
import socket
import platform
from blake3 import blake3
# ==============================
# ENV / HOST
# ==============================
HOSTNAME = socket.gethostname()
OS_NAME = platform.system()
# Hardcoded here for testing; set to None to scan all user shares:
# SCAN_ONLY_THIS = None  # "#Fotky"
SCAN_ONLY_THIS = '#Library'  # "#Fotky"
# ==============================
# CONFIG
# ==============================
EXCLUDED_SHARES = {"domains", "appdata", "system", "isos"}
# --- File size limits (bytes) ---
FILE_MIN_SIZE = 0
FILE_MAX_SIZE = 1024 * 1024 * 1024 * 1024 # 1 TiB
DB_CONFIG = {
"host": "192.168.1.76",
"port": 3307,
"user": "root",
"password": "Vlado9674+",
"database": "torrents",
"charset": "utf8mb4",
"autocommit": True,
}
CHUNK_SIZE = 4 * 1024 * 1024 # 4 MB
PRINT_SKIPPED = False
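# ------------------------------------------------------------------
# Assumed schema of file_md5_index, inferred from the INSERT in main()
# below. Column types, lengths and the unique key are guesses; the real
# DDL may differ.
#
#   CREATE TABLE file_md5_index (
#       id         BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
#       os_name    VARCHAR(32),
#       host_name  VARCHAR(128),
#       full_path  VARCHAR(1024),
#       file_name  VARCHAR(255),
#       directory  VARCHAR(1024),
#       file_size  BIGINT UNSIGNED,
#       mtime      DATETIME,
#       blake3     BINARY(32),
#       updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
#                            ON UPDATE CURRENT_TIMESTAMP,
#       UNIQUE KEY uq_host_path (host_name, full_path(512))
#   );
#
# ON DUPLICATE KEY UPDATE only behaves as an upsert here if a unique key
# covers (host_name, full_path); without one, every run inserts new rows.
# ------------------------------------------------------------------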
# ==============================
# HASH
# ==============================
def compute_blake3(path: str) -> bytes:
h = blake3()
with open(path, "rb") as f:
for chunk in iter(lambda: f.read(CHUNK_SIZE), b""):
h.update(chunk)
return h.digest()
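# Usage sketch (hypothetical path):
#   digest = compute_blake3("/mnt/user/#Library/example.bin")
#   print(digest.hex())  # 64 hex chars = 32-byte BLAKE3 digest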
# ==============================
# SHARE / PATH HELPERS
# ==============================
def get_user_shares():
if SCAN_ONLY_THIS:
path = f"/mnt/user/{SCAN_ONLY_THIS}"
if os.path.isdir(path):
print(f"🎯 SINGLE SHARE MODE ACTIVE: Scanning only '{SCAN_ONLY_THIS}'")
return [SCAN_ONLY_THIS]
else:
print(f"⚠️ ERROR: Requested share '{SCAN_ONLY_THIS}' not found in /mnt/user!")
return []
shares = []
if not os.path.exists("/mnt/user"):
return []
for name in os.listdir("/mnt/user"):
if name.startswith("."):
continue
if name in EXCLUDED_SHARES:
continue
path = f"/mnt/user/{name}"
if os.path.isdir(path):
shares.append(name)
return sorted(shares)
def find_physical_roots(shares):
roots = []
if not os.path.exists("/mnt"):
return []
for disk in os.listdir("/mnt"):
if not disk.startswith("disk"):
continue
for share in shares:
path = f"/mnt/{disk}/{share}"
if os.path.isdir(path):
roots.append((share, path))
return sorted(roots)
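# Example return value (disk numbers and file layout are hypothetical):
#   [("#Library", "/mnt/disk1/#Library"), ("#Library", "/mnt/disk2/#Library")]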
def logical_path_from_disk_path(disk_path: str) -> str:
if not disk_path.startswith("/mnt/disk"):
raise ValueError(f"Unexpected disk path: {disk_path}")
parts = disk_path.split("/", 3)
return f"/mnt/user/{parts[3]}"
def size_allowed(size: int) -> bool:
if FILE_MIN_SIZE is not None and size < FILE_MIN_SIZE:
return False
if FILE_MAX_SIZE is not None and size > FILE_MAX_SIZE:
return False
return True
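# With the limits above (min=0, max=1 TiB) only files larger than 1 TiB are
# rejected, e.g. size_allowed(0) -> True, size_allowed(2 * 1024**4) -> False.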
# ==============================
# MAIN
# ==============================
def main():
print("🚀 BLAKE3 indexer starting", flush=True)
print(f"🖥 Host: {HOSTNAME} | OS: {OS_NAME}", flush=True)
if FILE_MIN_SIZE or FILE_MAX_SIZE:
print(f"📏 File size limits: min={FILE_MIN_SIZE} max={FILE_MAX_SIZE}", flush=True)
shares = get_user_shares()
if not shares:
print("❌ No user shares to index!", flush=True)
return
print("📦 User shares to index:", flush=True)
for s in shares:
print(f" - {s}", flush=True)
scan_roots = find_physical_roots(shares)
if not scan_roots:
print("❌ No physical disk roots found!", flush=True)
return
print("📂 Physical scan roots:", flush=True)
for _, path in scan_roots:
print(f" - {path}", flush=True)
try:
db = pymysql.connect(**DB_CONFIG)
cur = db.cursor()
# === Optional "don't overthink it" fix ===
# Pins the session to UTC so MySQL stops shifting times back and forth by an hour.
# cur.execute("SET time_zone = '+00:00'")
# =========================================
except Exception as e:
print(f"❌ Database connection failed: {e}")
return
print("📥 Loading already indexed files into memory...", flush=True)
# === SQL OPTIMIZATION ===
if SCAN_ONLY_THIS:
search_pattern = f"/mnt/user/{SCAN_ONLY_THIS}%"
print(f"⚡ OPTIMIZATION: Fetching only DB records for '{search_pattern}'", flush=True)
cur.execute("""
SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
FROM file_md5_index
WHERE host_name = %s AND full_path LIKE %s
""", (HOSTNAME, search_pattern))
else:
cur.execute("""
SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
FROM file_md5_index
WHERE host_name = %s
""", (HOSTNAME,))
# Load everything into a dict for fast lookups.
# Format: { "path": (size, mtime) }
indexed_map = {row[0]: (row[1], row[2]) for row in cur.fetchall()}
print(f"✅ Loaded {len(indexed_map):,} indexed entries", flush=True)
print("======================================", flush=True)
new_files = 0
skipped = 0
filtered = 0
seen_paths = set()
# --- SCAN ---
for share, scan_root in scan_roots:
for root, _, files in os.walk(scan_root):
for fname in files:
disk_path = os.path.join(root, fname)
try:
stat = os.stat(disk_path)
except OSError:
continue
size = stat.st_size
if not size_allowed(size):
filtered += 1
continue
logical_path = logical_path_from_disk_path(disk_path)
if logical_path in seen_paths:
continue
seen_paths.add(logical_path)
mtime = int(stat.st_mtime)
# === STRICT CHECK (NO TOLERANCE) ===
# If the file exists in the DB and both size and mtime match exactly, skip it.
# Anything else (even a 1-second mtime shift) counts as a change and gets updated.
is_match = False
if logical_path in indexed_map:
db_size, db_mtime = indexed_map[logical_path]
if size == db_size and mtime == db_mtime:
is_match = True
if is_match:
skipped += 1
if PRINT_SKIPPED:
print(f"⏭ SKIP {logical_path}", flush=True)
continue
# ============================================
print(" NEW / UPDATED", flush=True)
print(f" File: {logical_path}", flush=True)
print(f" Size: {size:,} B", flush=True)
try:
b3 = compute_blake3(disk_path)
except Exception as e:
print(f"❌ BLAKE3 failed: {e}", flush=True)
continue
# The upsert below overwrites mtime in the DB with the value read from disk.
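# FROM_UNIXTIME() here and UNIX_TIMESTAMP() in the SELECT above both convert
# via the MySQL session time zone, which is what the optional
# SET time_zone = '+00:00' near the connection setup is meant to pin down.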
cur.execute("""
INSERT INTO file_md5_index
(os_name, host_name, full_path, file_name, directory,
file_size, mtime, blake3)
VALUES (%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
ON DUPLICATE KEY UPDATE
file_size = VALUES(file_size),
mtime = VALUES(mtime),
blake3 = VALUES(blake3),
updated_at = CURRENT_TIMESTAMP
""", (
OS_NAME,
HOSTNAME,
logical_path,
fname,
os.path.dirname(logical_path),
size,
mtime,
b3,
))
new_files += 1
print(f" B3 : {b3.hex()}", flush=True)
print("--------------------------------------", flush=True)
print("======================================", flush=True)
print(f"✅ New / updated : {new_files}", flush=True)
print(f"⏭ Skipped : {skipped}", flush=True)
print(f"🚫 Size filtered: {filtered}", flush=True)
print("🏁 Script finished", flush=True)
# ==============================
# DB CLEANUP: REMOVE DELETED FILES
# ==============================
print("🧹 Checking for deleted files in DB...", flush=True)
db_paths = set(indexed_map.keys())
deleted_paths = db_paths - seen_paths
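# Note: seen_paths only contains files that passed size_allowed(), so DB rows
# for files outside the size limits also end up in deleted_paths.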
# Restrict the cleanup to the current share (when single-share mode is active)
if SCAN_ONLY_THIS:
prefix = f"/mnt/user/{SCAN_ONLY_THIS}/"
deleted_paths = {p for p in deleted_paths if p.startswith(prefix)}
if deleted_paths:
print(f"🗑 Removing {len(deleted_paths):,} deleted files from DB", flush=True)
BATCH_SIZE = 1000
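# Delete in batches so the IN (...) placeholder list and packet size stay bounded.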
deleted_paths = list(deleted_paths)
for i in range(0, len(deleted_paths), BATCH_SIZE):
batch = deleted_paths[i:i + BATCH_SIZE]
placeholders = ",".join(["%s"] * len(batch))
sql = f"""
DELETE FROM file_md5_index
WHERE host_name = %s
AND full_path IN ({placeholders})
"""
cur.execute(sql, (HOSTNAME, *batch))
print("✅ DB cleanup completed", flush=True)
else:
print("✅ No deleted files found in DB", flush=True)
cur.close()
db.close()
if __name__ == "__main__":
main()