# medevio/dddddd.py

#!/usr/bin/python3
# -*- coding: utf-8 -*-

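"""
FAST FILE HASH INDEXER – UNRAID (BLAKE3 ONLY)
- SINGLE SHARE MODE: scans only the share hard-coded in SCAN_ONLY_THIS
  (set it to None to scan all non-excluded user shares)
- SQL OPTIMIZATION: in single share mode only that share's DB rows are loaded
- STRICT MODE (NO TOLERANCE): any size/mtime mismatch re-hashes the file
  and updates the DB
"""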

import os
import pymysql
import socket
import platform
from blake3 import blake3

# ==============================
# ENV / HOST
# ==============================

# Identity of the machine running the scan; stored with every indexed row.
HOSTNAME = socket.gethostname()
OS_NAME = platform.system()

# Hard-coded here for testing: name of the single share to scan.
# Set to None to scan every non-excluded share found under /mnt/user.
SCAN_ONLY_THIS = '#Library'  # another value used before: "#Fotky"

# ==============================
# CONFIG
# ==============================

# Share names skipped by get_user_shares() when all shares are scanned.
EXCLUDED_SHARES = {"domains", "appdata", "system", "isos"}

# --- File size limits (bytes) ---
# size_allowed() treats None as "no limit" for either bound.
FILE_MIN_SIZE = 0
FILE_MAX_SIZE = 1024 * 1024 * 1024 * 1024  # 1 TiB

# Keyword arguments passed verbatim to pymysql.connect().
# NOTE(review): credentials are hard-coded in source — consider loading them
# from the environment or a config file kept out of version control.
DB_CONFIG = {
    "host": "192.168.1.76",
    "port": 3307,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": True,
}

# Read size used by compute_blake3() when streaming a file.
CHUNK_SIZE = 4 * 1024 * 1024  # 4 MB
# When True, main() prints a line for every file skipped as unchanged.
PRINT_SKIPPED = False


# ==============================
# HASH
# ==============================

def compute_blake3(path: str) -> bytes:
    """Return the raw BLAKE3 digest of the file at *path*.

    The file is opened in binary mode and fed to the hasher in pieces of
    CHUNK_SIZE bytes, so the whole file is never read in a single call.
    Errors from opening or reading the file propagate to the caller.
    """
    hasher = blake3()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(CHUNK_SIZE)
            if not block:
                # An empty read marks end of file.
                break
            hasher.update(block)
    return hasher.digest()


# ==============================
# SHARE / PATH HELPERS
# ==============================

def get_user_shares():
    """Return the names of the user shares that should be indexed.

    In single share mode (SCAN_ONLY_THIS is set) the result is a one-element
    list with that share, or an empty list when the share directory does not
    exist under /mnt/user. Otherwise every directory in /mnt/user is
    returned in sorted order, except hidden entries (leading dot) and names
    listed in EXCLUDED_SHARES.
    """
    user_root = "/mnt/user"

    if SCAN_ONLY_THIS:
        if not os.path.isdir(f"{user_root}/{SCAN_ONLY_THIS}"):
            print(f"⚠️ ERROR: Requested share '{SCAN_ONLY_THIS}' not found in /mnt/user!")
            return []
        print(f"🎯 SINGLE SHARE MODE ACTIVE: Scanning only '{SCAN_ONLY_THIS}'")
        return [SCAN_ONLY_THIS]

    if not os.path.exists(user_root):
        return []

    return sorted(
        entry
        for entry in os.listdir(user_root)
        if not entry.startswith(".")
        and entry not in EXCLUDED_SHARES
        and os.path.isdir(f"{user_root}/{entry}")
    )


def find_physical_roots(shares):
    """Locate the per-disk directories that back the given shares.

    Every entry of /mnt whose name starts with "disk" is combined with every
    share name; combinations that exist as directories are kept.

    Args:
        shares: Iterable of share names.

    Returns:
        Sorted list of (share, "/mnt/<disk>/<share>") tuples; empty when
        /mnt does not exist or nothing matches.
    """
    if not os.path.exists("/mnt"):
        return []

    # NOTE(review): the prefix test also accepts names such as "disks";
    # confirm that is intended.
    candidates = (
        (share, f"/mnt/{disk}/{share}")
        for disk in os.listdir("/mnt")
        if disk.startswith("disk")
        for share in shares
    )
    return sorted(pair for pair in candidates if os.path.isdir(pair[1]))


def logical_path_from_disk_path(disk_path: str) -> str:
    """Translate a physical disk path into its /mnt/user equivalent.

    Example: "/mnt/disk3/Media/a.mkv" -> "/mnt/user/Media/a.mkv".

    Args:
        disk_path: Absolute path starting with "/mnt/disk".

    Returns:
        The same share-relative path rooted at /mnt/user.

    Raises:
        ValueError: If the path does not start with "/mnt/disk" or has no
            component after the disk directory (e.g. "/mnt/disk1"), which
            previously surfaced as an IndexError.
    """
    # Split into ['', 'mnt', '<disk>', '<share and the rest>'].
    parts = disk_path.split("/", 3)
    if not disk_path.startswith("/mnt/disk") or len(parts) < 4:
        raise ValueError(f"Unexpected disk path: {disk_path}")
    return f"/mnt/user/{parts[3]}"


def size_allowed(size: int) -> bool:
    """Tell whether *size* (in bytes) lies within the configured limits.

    FILE_MIN_SIZE and FILE_MAX_SIZE are inclusive bounds; a bound set to
    None is not enforced.
    """
    too_small = FILE_MIN_SIZE is not None and size < FILE_MIN_SIZE
    # The upper bound is only consulted when the lower bound did not reject.
    too_large = (
        not too_small
        and FILE_MAX_SIZE is not None
        and size > FILE_MAX_SIZE
    )
    return not (too_small or too_large)


# ==============================
# MAIN
# ==============================

def _load_indexed_map(cur):
    """Load the rows already indexed for this host into memory.

    In single share mode only rows whose path lies under that share are
    fetched; otherwise all rows for HOSTNAME are fetched.

    Args:
        cur: Open DB cursor.

    Returns:
        dict mapping full_path -> (file_size, mtime as a Unix timestamp).
    """
    if SCAN_ONLY_THIS:
        # Trailing slash: match only paths inside the share, not sibling
        # shares whose name merely starts with the same text. This agrees
        # with the prefix used when removing deleted files.
        search_pattern = f"/mnt/user/{SCAN_ONLY_THIS}/%"
        print(f"⚡ OPTIMIZATION: Fetching only DB records for '{search_pattern}'", flush=True)
        cur.execute("""
            SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
            FROM file_md5_index
            WHERE host_name = %s AND full_path LIKE %s
        """, (HOSTNAME, search_pattern))
    else:
        cur.execute("""
            SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
            FROM file_md5_index
            WHERE host_name = %s
        """, (HOSTNAME,))

    # Dictionary for fast lookups: {path: (size, mtime)}.
    return {path: (size, mtime) for path, size, mtime in cur.fetchall()}


def _index_scan_roots(cur, scan_roots, indexed_map):
    """Walk the physical roots and insert/update every new or changed file.

    Args:
        cur: Open DB cursor.
        scan_roots: List of (share, physical_path) tuples to walk.
        indexed_map: Mapping produced by _load_indexed_map().

    Returns:
        Tuple (seen_paths, new_files, skipped, filtered) where seen_paths is
        the set of logical paths that passed the size filter, and the rest
        are counters.
    """
    new_files = 0
    skipped = 0
    filtered = 0
    seen_paths = set()

    for _share, scan_root in scan_roots:
        for root, _dirs, files in os.walk(scan_root):
            for fname in files:
                disk_path = os.path.join(root, fname)

                try:
                    stat = os.stat(disk_path)
                except OSError:
                    # File vanished or is unreadable; ignore it.
                    continue

                size = stat.st_size
                if not size_allowed(size):
                    filtered += 1
                    continue

                logical_path = logical_path_from_disk_path(disk_path)

                # The same logical path can show up under several disk
                # roots; only the first occurrence is processed.
                if logical_path in seen_paths:
                    continue
                seen_paths.add(logical_path)

                mtime = int(stat.st_mtime)

                # Strict check (no tolerance): the file is skipped only when
                # it is in the DB with exactly the same size and mtime.
                # Anything else, including a 1 s time shift, counts as a
                # change and is re-hashed and updated.
                if indexed_map.get(logical_path) == (size, mtime):
                    skipped += 1
                    if PRINT_SKIPPED:
                        print(f"⏭ SKIP {logical_path}", flush=True)
                    continue

                print("➕ NEW / UPDATED", flush=True)
                print(f"   File: {logical_path}", flush=True)
                print(f"   Size: {size:,} B", flush=True)

                try:
                    b3 = compute_blake3(disk_path)
                except Exception as e:
                    print(f"❌ BLAKE3 failed: {e}", flush=True)
                    continue

                # Upsert; on an existing row this refreshes size, mtime and
                # hash with the values read from disk.
                cur.execute("""
                    INSERT INTO file_md5_index
                        (os_name, host_name, full_path, file_name, directory,
                         file_size, mtime, blake3)
                    VALUES (%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
                    ON DUPLICATE KEY UPDATE
                        file_size  = VALUES(file_size),
                        mtime      = VALUES(mtime),
                        blake3     = VALUES(blake3),
                        updated_at = CURRENT_TIMESTAMP
                """, (
                    OS_NAME,
                    HOSTNAME,
                    logical_path,
                    fname,
                    os.path.dirname(logical_path),
                    size,
                    mtime,
                    b3,
                ))

                new_files += 1
                print(f"   B3  : {b3.hex()}", flush=True)
                print("--------------------------------------", flush=True)

    return seen_paths, new_files, skipped, filtered


def _remove_deleted_entries(cur, indexed_map, seen_paths, batch_size=1000):
    """Delete DB rows whose path was loaded from the DB but not seen on disk.

    Args:
        cur: Open DB cursor.
        indexed_map: Mapping produced by _load_indexed_map().
        seen_paths: Logical paths encountered during the scan.
        batch_size: Maximum number of paths per DELETE statement.
    """
    print("🧹 Checking for deleted files in DB...", flush=True)

    deleted_paths = set(indexed_map) - seen_paths

    # In single share mode, restrict the removal to that share.
    if SCAN_ONLY_THIS:
        prefix = f"/mnt/user/{SCAN_ONLY_THIS}/"
        deleted_paths = {p for p in deleted_paths if p.startswith(prefix)}

    if not deleted_paths:
        print("✅ No deleted files found in DB", flush=True)
        return

    print(f"🗑 Removing {len(deleted_paths):,} deleted files from DB", flush=True)

    pending = list(deleted_paths)
    for start in range(0, len(pending), batch_size):
        batch = pending[start:start + batch_size]
        # Only the placeholder list is interpolated; values stay parameters.
        placeholders = ",".join(["%s"] * len(batch))

        sql = f"""
            DELETE FROM file_md5_index
            WHERE host_name = %s
              AND full_path IN ({placeholders})
        """

        cur.execute(sql, (HOSTNAME, *batch))

    print("✅ DB cleanup completed", flush=True)


def main():
    """Index the configured shares into the file_md5_index table.

    Steps: resolve the shares and their physical disk roots, load the rows
    already indexed for this host, hash and upsert every new or changed
    file, then delete rows for files that are no longer present. The DB
    cursor and connection are closed even when a step raises.
    """
    print("🚀 BLAKE3 indexer starting", flush=True)
    print(f"🖥 Host: {HOSTNAME} | OS: {OS_NAME}", flush=True)

    if FILE_MIN_SIZE or FILE_MAX_SIZE:
        print(f"📏 File size limits: min={FILE_MIN_SIZE} max={FILE_MAX_SIZE}", flush=True)

    shares = get_user_shares()
    if not shares:
        print("❌ No user shares to index!", flush=True)
        return

    print("📦 User shares to index:", flush=True)
    for s in shares:
        print(f"   - {s}", flush=True)

    scan_roots = find_physical_roots(shares)
    if not scan_roots:
        print("❌ No physical disk roots found!", flush=True)
        return

    print("📂 Physical scan roots:", flush=True)
    for _, path in scan_roots:
        print(f"   - {path}", flush=True)

    try:
        db = pymysql.connect(**DB_CONFIG)
        cur = db.cursor()
        # NOTE(review): forcing the session to UTC with
        # "SET time_zone = '+00:00'" is currently disabled. FROM_UNIXTIME /
        # UNIX_TIMESTAMP convert using the session time zone, so mtimes may
        # shift by an hour around DST changes — confirm before enabling,
        # since it changes how already stored values are read back.
    except Exception as e:
        print(f"❌ Database connection failed: {e}")
        return

    try:
        print("📥 Loading already indexed files into memory...", flush=True)
        indexed_map = _load_indexed_map(cur)
        print(f"✅ Loaded {len(indexed_map):,} indexed entries", flush=True)
        print("======================================", flush=True)

        seen_paths, new_files, skipped, filtered = _index_scan_roots(
            cur, scan_roots, indexed_map
        )

        print("======================================", flush=True)
        print(f"✅ New / updated : {new_files}", flush=True)
        print(f"⏭ Skipped        : {skipped}", flush=True)
        print(f"🚫 Size filtered: {filtered}", flush=True)

        _remove_deleted_entries(cur, indexed_map, seen_paths)

        # Reported only after the cleanup phase has really completed.
        print("🏁 Script finished", flush=True)
    finally:
        # Release DB resources even if the scan or cleanup raised.
        try:
            cur.close()
        finally:
            db.close()

# Run the indexer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()