315 lines
9.5 KiB
Python
315 lines
9.5 KiB
Python
#!/usr/bin/python3
|
||
# -*- coding: utf-8 -*-
|
||
|
||
"""
|
||
FAST FILE HASH INDEXER – UNRAID (BLAKE3 ONLY, ALL SHARES)
|
||
- HARDCODED SINGLE SHARE MODE
|
||
- SQL OPTIMIZATION
|
||
- STRICT MODE (NO TOLERANCE) - Updates DB on any mismatch
|
||
"""
|
||
|
||
import os
|
||
import pymysql
|
||
import socket
|
||
import platform
|
||
from blake3 import blake3
|
||
|
||
# ==============================
|
||
# ENV / HOST
|
||
# ==============================
|
||
|
||
# Identity of the machine running the scan. HOSTNAME is stored with every DB
# row and used to restrict all queries to this host's records; OS_NAME is
# stored alongside it on insert.
HOSTNAME = socket.gethostname()
OS_NAME = platform.system()

# Hard-coded here for testing: when set to a share name, only that share is
# scanned. A falsy value (None) scans every non-excluded share.
# SCAN_ONLY_THIS = None  # "#Fotky"
SCAN_ONLY_THIS = '#Library'  # "#Fotky"
|
||
|
||
# ==============================
|
||
# CONFIG
|
||
# ==============================
|
||
|
||
# Share names that are skipped when all shares are enumerated (not consulted
# in single-share mode).
EXCLUDED_SHARES = {"domains", "appdata", "system", "isos"}

# --- File size limits (bytes); a value of None disables that bound ---
FILE_MIN_SIZE = 0
FILE_MAX_SIZE = 1024 * 1024 * 1024 * 1024  # 1 TiB
|
||
|
||
# Connection settings, passed unchanged to pymysql.connect(**DB_CONFIG).
#
# SECURITY: credentials should not live in source control. Host, port, user
# and password can each be overridden through the environment
# (INDEXER_DB_HOST / INDEXER_DB_PORT / INDEXER_DB_USER / INDEXER_DB_PASSWORD);
# the literals below are only backward-compatible fallbacks.
DB_CONFIG = {
    "host": os.environ.get("INDEXER_DB_HOST", "192.168.1.76"),
    "port": int(os.environ.get("INDEXER_DB_PORT", "3307")),
    "user": os.environ.get("INDEXER_DB_USER", "root"),
    "password": os.environ.get("INDEXER_DB_PASSWORD", "Vlado9674+"),
    "database": "torrents",
    "charset": "utf8mb4",
    # The script never calls commit(); it relies on autocommit for every
    # INSERT/DELETE it issues.
    "autocommit": True,
}
|
||
|
||
CHUNK_SIZE = 4 * 1024 * 1024  # 4 MiB read size used when hashing a file
PRINT_SKIPPED = False  # when True, log every unchanged file that is skipped
|
||
|
||
|
||
# ==============================
|
||
# HASH
|
||
# ==============================
|
||
|
||
def compute_blake3(path: str) -> bytes:
    """Return the raw BLAKE3 digest of the file at *path*.

    The file is opened in binary mode and fed to the hasher in reads of
    ``CHUNK_SIZE`` bytes, so the whole file is never held in memory at once.
    Errors raised by ``open``/``read`` are not caught here.
    """
    hasher = blake3()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(CHUNK_SIZE)
            if not block:  # an empty read marks end of file
                break
            hasher.update(block)
    return hasher.digest()
|
||
|
||
|
||
# ==============================
|
||
# SHARE / PATH HELPERS
|
||
# ==============================
|
||
|
||
def get_user_shares():
    """Return the names of the user shares to index.

    In single-share mode (``SCAN_ONLY_THIS`` is truthy) the result is a
    one-element list with that share, or an empty list when the share
    directory does not exist under ``/mnt/user``.

    Otherwise every directory directly under ``/mnt/user`` is returned in
    sorted order, skipping dot-entries and names in ``EXCLUDED_SHARES``. An
    empty list is returned when ``/mnt/user`` itself is missing.
    """
    user_root = "/mnt/user"

    if SCAN_ONLY_THIS:
        # Guard clause: report the missing share and give the caller nothing.
        if not os.path.isdir(f"/mnt/user/{SCAN_ONLY_THIS}"):
            print(f"⚠️ ERROR: Requested share '{SCAN_ONLY_THIS}' not found in /mnt/user!")
            return []
        print(f"🎯 SINGLE SHARE MODE ACTIVE: Scanning only '{SCAN_ONLY_THIS}'")
        return [SCAN_ONLY_THIS]

    if not os.path.exists(user_root):
        return []

    return sorted(
        entry
        for entry in os.listdir(user_root)
        if not entry.startswith(".")
        and entry not in EXCLUDED_SHARES
        and os.path.isdir(f"/mnt/user/{entry}")
    )
|
||
|
||
|
||
def find_physical_roots(shares):
    """Return sorted ``(share, path)`` pairs for each share on each disk.

    Every entry of ``/mnt`` whose name starts with ``"disk"`` is combined
    with every name in *shares*; a pair is kept when ``/mnt/<disk>/<share>``
    is an existing directory. Returns an empty list when ``/mnt`` is missing.

    NOTE(review): the prefix test accepts any entry beginning with "disk"
    (for example a directory literally named "disks"), not only numbered
    array disks. Confirm that this is intended.
    """
    if not os.path.exists("/mnt"):
        return []

    found = [
        (share_name, f"/mnt/{disk_name}/{share_name}")
        for disk_name in os.listdir("/mnt")
        if disk_name.startswith("disk")
        for share_name in shares
        if os.path.isdir(f"/mnt/{disk_name}/{share_name}")
    ]
    found.sort()
    return found
|
||
|
||
|
||
def logical_path_from_disk_path(disk_path: str) -> str:
    """Map a per-disk path to the matching ``/mnt/user`` path.

    ``/mnt/disk1/Share/dir/file`` becomes ``/mnt/user/Share/dir/file``.

    Args:
        disk_path: Absolute path beginning with ``/mnt/disk``.

    Returns:
        The same path with the ``/mnt/<disk>`` prefix replaced by
        ``/mnt/user``.

    Raises:
        ValueError: If *disk_path* does not start with ``/mnt/disk`` or has
            nothing after the disk component (e.g. ``/mnt/disk1``).
    """
    if not disk_path.startswith("/mnt/disk"):
        raise ValueError(f"Unexpected disk path: {disk_path}")

    # maxsplit=3 yields ['', 'mnt', '<disk>', '<share/...>']; the last element
    # is everything below the disk mount point.
    parts = disk_path.split("/", 3)
    if len(parts) < 4 or not parts[3]:
        # Previously this surfaced as an IndexError (or as the bogus result
        # "/mnt/user/"); report it like any other malformed path instead.
        raise ValueError(f"Unexpected disk path: {disk_path}")

    return f"/mnt/user/{parts[3]}"
|
||
|
||
|
||
def size_allowed(size: int) -> bool:
    """Return True when *size* (bytes) lies within the configured limits.

    ``FILE_MIN_SIZE`` and ``FILE_MAX_SIZE`` are inclusive bounds; a bound set
    to ``None`` is not enforced.
    """
    below_min = FILE_MIN_SIZE is not None and size < FILE_MIN_SIZE
    if below_min:
        return False
    above_max = FILE_MAX_SIZE is not None and size > FILE_MAX_SIZE
    return not above_max
|
||
|
||
|
||
# ==============================
|
||
# MAIN
|
||
# ==============================
|
||
|
||
def _load_indexed_map(cur):
    """Return ``{full_path: (file_size, mtime_epoch)}`` for this host's rows.

    In single-share mode only rows below ``/mnt/user/<share>/`` are fetched.
    """
    if SCAN_ONLY_THIS:
        # The trailing slash keeps sibling shares with a common name prefix
        # out of the result set and matches the prefix used by the cleanup.
        # NOTE: "%" or "_" inside the share name would still act as LIKE
        # wildcards; the cleanup step re-filters by exact string prefix.
        search_pattern = f"/mnt/user/{SCAN_ONLY_THIS}/%"
        print(f"⚡ OPTIMIZATION: Fetching only DB records for '{search_pattern}'", flush=True)
        cur.execute(
            """
            SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
            FROM file_md5_index
            WHERE host_name = %s AND full_path LIKE %s
            """,
            (HOSTNAME, search_pattern),
        )
    else:
        cur.execute(
            """
            SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
            FROM file_md5_index
            WHERE host_name = %s
            """,
            (HOSTNAME,),
        )

    # Dictionary for fast lookups: {path: (size, mtime)}
    return {row[0]: (row[1], row[2]) for row in cur.fetchall()}


def _index_changed_files(cur, scan_roots, indexed_map):
    """Walk *scan_roots*, hash new or changed files and upsert them.

    Returns ``(seen_paths, new_files, skipped, filtered)`` where *seen_paths*
    is the set of logical paths encountered that passed the size filter.
    """
    new_files = 0
    skipped = 0
    filtered = 0
    seen_paths = set()

    for _share, scan_root in scan_roots:
        for root, _dirs, files in os.walk(scan_root):
            for fname in files:
                disk_path = os.path.join(root, fname)

                try:
                    stat = os.stat(disk_path)
                except OSError as e:
                    # NOTE(review): the path is not recorded as seen, so an
                    # existing DB row for it is removed by the cleanup step.
                    print(f"⚠️ stat failed, skipping: {disk_path} ({e})", flush=True)
                    continue

                size = stat.st_size
                if not size_allowed(size):
                    # NOTE(review): filtered files are not recorded as seen
                    # either, so previously indexed rows for them are purged.
                    filtered += 1
                    continue

                logical_path = logical_path_from_disk_path(disk_path)

                # The same logical path can appear under several disk roots;
                # only the first occurrence is processed.
                if logical_path in seen_paths:
                    continue
                seen_paths.add(logical_path)

                mtime = int(stat.st_mtime)

                # Strict check (no tolerance): skip only when the DB row has
                # exactly the same size and mtime. Anything else, including a
                # one-second mtime shift, counts as a change and is re-hashed.
                if indexed_map.get(logical_path) == (size, mtime):
                    skipped += 1
                    if PRINT_SKIPPED:
                        print(f"⏭ SKIP {logical_path}", flush=True)
                    continue

                print("➕ NEW / UPDATED", flush=True)
                print(f" File: {logical_path}", flush=True)
                print(f" Size: {size:,} B", flush=True)

                try:
                    b3 = compute_blake3(disk_path)
                except Exception as e:
                    # Best effort: one unreadable file must not stop the scan.
                    print(f"❌ BLAKE3 failed: {e}", flush=True)
                    continue

                # Upsert; on an existing row this refreshes size, mtime (from
                # disk) and hash.
                cur.execute(
                    """
                    INSERT INTO file_md5_index
                        (os_name, host_name, full_path, file_name, directory,
                         file_size, mtime, blake3)
                    VALUES (%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
                    ON DUPLICATE KEY UPDATE
                        file_size = VALUES(file_size),
                        mtime = VALUES(mtime),
                        blake3 = VALUES(blake3),
                        updated_at = CURRENT_TIMESTAMP
                    """,
                    (
                        OS_NAME,
                        HOSTNAME,
                        logical_path,
                        fname,
                        os.path.dirname(logical_path),
                        size,
                        mtime,
                        b3,
                    ),
                )

                new_files += 1
                print(f" B3 : {b3.hex()}", flush=True)
                print("--------------------------------------", flush=True)

    return seen_paths, new_files, skipped, filtered


def _purge_deleted_rows(cur, indexed_map, seen_paths):
    """Delete DB rows whose path was loaded but not seen during the scan."""
    print("🧹 Checking for deleted files in DB...", flush=True)

    deleted_paths = set(indexed_map) - seen_paths

    # Restrict to the active share (if single-share mode is on).
    if SCAN_ONLY_THIS:
        prefix = f"/mnt/user/{SCAN_ONLY_THIS}/"
        deleted_paths = {p for p in deleted_paths if p.startswith(prefix)}

    if not deleted_paths:
        print("✅ No deleted files found in DB", flush=True)
        return

    print(f"🗑 Removing {len(deleted_paths):,} deleted files from DB", flush=True)

    batch_size = 1000  # bounds the number of placeholders per DELETE
    pending = list(deleted_paths)

    for start in range(0, len(pending), batch_size):
        batch = pending[start:start + batch_size]
        placeholders = ",".join(["%s"] * len(batch))

        # Only the placeholder list is interpolated; values stay parameters.
        sql = f"""
            DELETE FROM file_md5_index
            WHERE host_name = %s
              AND full_path IN ({placeholders})
        """
        cur.execute(sql, (HOSTNAME, *batch))

    print("✅ DB cleanup completed", flush=True)


def main():
    """Index the configured shares into ``file_md5_index``.

    Steps: resolve shares and their per-disk roots, load this host's existing
    rows, hash and upsert every new or changed file, then delete rows for
    paths that were not seen. Returns early (after printing a message) when
    there are no shares, no disk roots, or the DB connection fails. The
    cursor and connection are closed even if the scan raises.
    """
    print("🚀 BLAKE3 indexer starting", flush=True)
    print(f"🖥 Host: {HOSTNAME} | OS: {OS_NAME}", flush=True)

    if FILE_MIN_SIZE or FILE_MAX_SIZE:
        print(f"📏 File size limits: min={FILE_MIN_SIZE} max={FILE_MAX_SIZE}", flush=True)

    shares = get_user_shares()
    if not shares:
        print("❌ No user shares to index!", flush=True)
        return

    print("📦 User shares to index:", flush=True)
    for s in shares:
        print(f" - {s}", flush=True)

    scan_roots = find_physical_roots(shares)
    if not scan_roots:
        print("❌ No physical disk roots found!", flush=True)
        return

    print("📂 Physical scan roots:", flush=True)
    for _, path in scan_roots:
        print(f" - {path}", flush=True)

    try:
        db = pymysql.connect(**DB_CONFIG)
    except pymysql.MySQLError as e:
        print(f"❌ Database connection failed: {e}")
        return

    try:
        cur = db.cursor()
        try:
            # NOTE: FROM_UNIXTIME()/UNIX_TIMESTAMP() convert using the session
            # time zone. The original author left
            #     cur.execute("SET time_zone = '+00:00'")
            # disabled here (it would pin the session to UTC so MySQL stops
            # shifting times by an hour). It stays off because enabling it
            # changes how already-stored rows compare.

            print("📥 Loading already indexed files into memory...", flush=True)
            indexed_map = _load_indexed_map(cur)
            print(f"✅ Loaded {len(indexed_map):,} indexed entries", flush=True)
            print("======================================", flush=True)

            seen_paths, new_files, skipped, filtered = _index_changed_files(
                cur, scan_roots, indexed_map
            )

            print("======================================", flush=True)
            print(f"✅ New / updated : {new_files}", flush=True)
            print(f"⏭ Skipped : {skipped}", flush=True)
            print(f"🚫 Size filtered: {filtered}", flush=True)

            _purge_deleted_rows(cur, indexed_map, seen_paths)

            # Printed last so the log reflects that cleanup has run too.
            print("🏁 Script finished", flush=True)
        finally:
            cur.close()
    finally:
        db.close()
|
||
|
||
# Run the indexer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()