2026-01-16 15:34:12 +01:00
parent 186c98fd0d
commit 2d2a60a845
6 changed files with 850 additions and 1 deletions

dddddd.py Normal file

@@ -0,0 +1,315 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
FAST FILE HASH INDEXER UNRAID (BLAKE3 ONLY, ALL SHARES)
- HARDCODED SINGLE SHARE MODE
- SQL OPTIMIZATION
- STRICT MODE (NO TOLERANCE) - Updates DB on any mismatch
"""
import os
import pymysql
import socket
import platform
from blake3 import blake3
# ==============================
# ENV / HOST
# ==============================
HOSTNAME = socket.gethostname()
OS_NAME = platform.system()
# Hardcoded here for testing; set to None to scan all user shares:
# SCAN_ONLY_THIS = None  # "#Fotky"
SCAN_ONLY_THIS = '#Library'  # "#Fotky"
# ==============================
# CONFIG
# ==============================
EXCLUDED_SHARES = {"domains", "appdata", "system", "isos"}
# --- File size limits (bytes) ---
FILE_MIN_SIZE = 0
FILE_MAX_SIZE = 1024 * 1024 * 1024 * 1024 # 1 TiB
DB_CONFIG = {
"host": "192.168.1.76",
"port": 3307,
"user": "root",
"password": "Vlado9674+",
"database": "torrents",
"charset": "utf8mb4",
"autocommit": True,
}
CHUNK_SIZE = 4 * 1024 * 1024 # 4 MB
PRINT_SKIPPED = False
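# ------------------------------------------------------------------
# Assumed schema of file_md5_index, inferred from the INSERT in main()
# below. Column types, lengths and the unique key are guesses; the real
# DDL may differ.
#
#   CREATE TABLE file_md5_index (
#       id         BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
#       os_name    VARCHAR(32),
#       host_name  VARCHAR(128),
#       full_path  VARCHAR(1024),
#       file_name  VARCHAR(255),
#       directory  VARCHAR(1024),
#       file_size  BIGINT UNSIGNED,
#       mtime      DATETIME,
#       blake3     BINARY(32),
#       updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
#                            ON UPDATE CURRENT_TIMESTAMP,
#       UNIQUE KEY uq_host_path (host_name, full_path(512))
#   );
#
# ON DUPLICATE KEY UPDATE only behaves as an upsert here if a unique key
# covers (host_name, full_path); without one, every run inserts new rows.
# ------------------------------------------------------------------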
# ==============================
# HASH
# ==============================
def compute_blake3(path: str) -> bytes:
h = blake3()
with open(path, "rb") as f:
for chunk in iter(lambda: f.read(CHUNK_SIZE), b""):
h.update(chunk)
return h.digest()
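# Usage sketch (hypothetical path):
#   digest = compute_blake3("/mnt/user/#Library/example.bin")
#   print(digest.hex())  # 64 hex chars = 32-byte BLAKE3 digest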
# ==============================
# SHARE / PATH HELPERS
# ==============================
def get_user_shares():
if SCAN_ONLY_THIS:
path = f"/mnt/user/{SCAN_ONLY_THIS}"
if os.path.isdir(path):
print(f"🎯 SINGLE SHARE MODE ACTIVE: Scanning only '{SCAN_ONLY_THIS}'")
return [SCAN_ONLY_THIS]
else:
print(f"⚠️ ERROR: Requested share '{SCAN_ONLY_THIS}' not found in /mnt/user!")
return []
shares = []
if not os.path.exists("/mnt/user"):
return []
for name in os.listdir("/mnt/user"):
if name.startswith("."):
continue
if name in EXCLUDED_SHARES:
continue
path = f"/mnt/user/{name}"
if os.path.isdir(path):
shares.append(name)
return sorted(shares)
def find_physical_roots(shares):
roots = []
if not os.path.exists("/mnt"):
return []
for disk in os.listdir("/mnt"):
if not disk.startswith("disk"):
continue
for share in shares:
path = f"/mnt/{disk}/{share}"
if os.path.isdir(path):
roots.append((share, path))
return sorted(roots)
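# Example return value (disk numbers and file layout are hypothetical):
#   [("#Library", "/mnt/disk1/#Library"), ("#Library", "/mnt/disk2/#Library")]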
def logical_path_from_disk_path(disk_path: str) -> str:
if not disk_path.startswith("/mnt/disk"):
raise ValueError(f"Unexpected disk path: {disk_path}")
parts = disk_path.split("/", 3)
return f"/mnt/user/{parts[3]}"
def size_allowed(size: int) -> bool:
if FILE_MIN_SIZE is not None and size < FILE_MIN_SIZE:
return False
if FILE_MAX_SIZE is not None and size > FILE_MAX_SIZE:
return False
return True
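# With the limits above (min=0, max=1 TiB) only files larger than 1 TiB are
# rejected, e.g. size_allowed(0) -> True, size_allowed(2 * 1024**4) -> False.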
# ==============================
# MAIN
# ==============================
def main():
print("🚀 BLAKE3 indexer starting", flush=True)
print(f"🖥 Host: {HOSTNAME} | OS: {OS_NAME}", flush=True)
if FILE_MIN_SIZE or FILE_MAX_SIZE:
print(f"📏 File size limits: min={FILE_MIN_SIZE} max={FILE_MAX_SIZE}", flush=True)
shares = get_user_shares()
if not shares:
print("❌ No user shares to index!", flush=True)
return
print("📦 User shares to index:", flush=True)
for s in shares:
print(f" - {s}", flush=True)
scan_roots = find_physical_roots(shares)
if not scan_roots:
print("❌ No physical disk roots found!", flush=True)
return
print("📂 Physical scan roots:", flush=True)
for _, path in scan_roots:
print(f" - {path}", flush=True)
try:
db = pymysql.connect(**DB_CONFIG)
cur = db.cursor()
# === Optional "don't overthink it" fix ===
# Pins the session to UTC so MySQL stops shifting times back and forth by an hour.
# cur.execute("SET time_zone = '+00:00'")
# =========================================
except Exception as e:
print(f"❌ Database connection failed: {e}")
return
print("📥 Loading already indexed files into memory...", flush=True)
# === SQL OPTIMIZATION ===
if SCAN_ONLY_THIS:
search_pattern = f"/mnt/user/{SCAN_ONLY_THIS}%"
print(f"⚡ OPTIMIZATION: Fetching only DB records for '{search_pattern}'", flush=True)
cur.execute("""
SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
FROM file_md5_index
WHERE host_name = %s AND full_path LIKE %s
""", (HOSTNAME, search_pattern))
else:
cur.execute("""
SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
FROM file_md5_index
WHERE host_name = %s
""", (HOSTNAME,))
# Load everything into a dict for fast lookups.
# Format: { "path": (size, mtime) }
indexed_map = {row[0]: (row[1], row[2]) for row in cur.fetchall()}
print(f"✅ Loaded {len(indexed_map):,} indexed entries", flush=True)
print("======================================", flush=True)
new_files = 0
skipped = 0
filtered = 0
seen_paths = set()
# --- SCAN ---
for share, scan_root in scan_roots:
for root, _, files in os.walk(scan_root):
for fname in files:
disk_path = os.path.join(root, fname)
try:
stat = os.stat(disk_path)
except OSError:
continue
size = stat.st_size
if not size_allowed(size):
filtered += 1
continue
logical_path = logical_path_from_disk_path(disk_path)
if logical_path in seen_paths:
continue
seen_paths.add(logical_path)
mtime = int(stat.st_mtime)
# === STRICT CHECK (NO TOLERANCE) ===
# If the file exists in the DB and both size and mtime match exactly, skip it.
# Anything else (even a 1-second mtime shift) counts as a change and gets updated.
is_match = False
if logical_path in indexed_map:
db_size, db_mtime = indexed_map[logical_path]
if size == db_size and mtime == db_mtime:
is_match = True
if is_match:
skipped += 1
if PRINT_SKIPPED:
print(f"⏭ SKIP {logical_path}", flush=True)
continue
# ============================================
print(" NEW / UPDATED", flush=True)
print(f" File: {logical_path}", flush=True)
print(f" Size: {size:,} B", flush=True)
try:
b3 = compute_blake3(disk_path)
except Exception as e:
print(f"❌ BLAKE3 failed: {e}", flush=True)
continue
# The upsert below overwrites mtime in the DB with the value read from disk.
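# FROM_UNIXTIME() here and UNIX_TIMESTAMP() in the SELECT above both convert
# via the MySQL session time zone, which is what the optional
# SET time_zone = '+00:00' near the connection setup is meant to pin down.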
cur.execute("""
INSERT INTO file_md5_index
(os_name, host_name, full_path, file_name, directory,
file_size, mtime, blake3)
VALUES (%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
ON DUPLICATE KEY UPDATE
file_size = VALUES(file_size),
mtime = VALUES(mtime),
blake3 = VALUES(blake3),
updated_at = CURRENT_TIMESTAMP
""", (
OS_NAME,
HOSTNAME,
logical_path,
fname,
os.path.dirname(logical_path),
size,
mtime,
b3,
))
new_files += 1
print(f" B3 : {b3.hex()}", flush=True)
print("--------------------------------------", flush=True)
print("======================================", flush=True)
print(f"✅ New / updated : {new_files}", flush=True)
print(f"⏭ Skipped : {skipped}", flush=True)
print(f"🚫 Size filtered: {filtered}", flush=True)
print("🏁 Script finished", flush=True)
# ==============================
# DB CLEANUP: REMOVE DELETED FILES
# ==============================
print("🧹 Checking for deleted files in DB...", flush=True)
db_paths = set(indexed_map.keys())
deleted_paths = db_paths - seen_paths
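# Note: seen_paths only contains files that passed size_allowed(), so DB rows
# for files outside the size limits also end up in deleted_paths.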
# Restrict the cleanup to the current share (when single-share mode is active)
if SCAN_ONLY_THIS:
prefix = f"/mnt/user/{SCAN_ONLY_THIS}/"
deleted_paths = {p for p in deleted_paths if p.startswith(prefix)}
if deleted_paths:
print(f"🗑 Removing {len(deleted_paths):,} deleted files from DB", flush=True)
BATCH_SIZE = 1000
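# Delete in batches so the IN (...) placeholder list and packet size stay bounded.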
deleted_paths = list(deleted_paths)
for i in range(0, len(deleted_paths), BATCH_SIZE):
batch = deleted_paths[i:i + BATCH_SIZE]
placeholders = ",".join(["%s"] * len(batch))
sql = f"""
DELETE FROM file_md5_index
WHERE host_name = %s
AND full_path IN ({placeholders})
"""
cur.execute(sql, (HOSTNAME, *batch))
print("✅ DB cleanup completed", flush=True)
else:
print("✅ No deleted files found in DB", flush=True)
cur.close()
db.close()
if __name__ == "__main__":
main()