#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
FAST FILE HASH INDEXER FOR UNRAID (BLAKE3 ONLY, ALL SHARES)
- HARDCODED SINGLE-SHARE MODE
- SQL OPTIMIZATION
- STRICT MODE (NO TOLERANCE) - updates the DB on any size/mtime mismatch
"""
import os
import pymysql
import socket
import platform
from blake3 import blake3
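# NOTE: pymysql and blake3 are third-party packages (e.g. `pip install pymysql blake3`);
# everything else used here is from the standard library.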
# ==============================
# ENV / HOST
# ==============================
HOSTNAME = socket.gethostname()
OS_NAME = platform.system()
# Hardcoded here for testing:
# SCAN_ONLY_THIS = None  # "#Fotky"
SCAN_ONLY_THIS = '#Library'  # "#Fotky"
# ==============================
# CONFIG
# ==============================
EXCLUDED_SHARES = {"domains", "appdata", "system", "isos"}
# --- File size limits (bytes) ---
FILE_MIN_SIZE = 0
FILE_MAX_SIZE = 1024 * 1024 * 1024 * 1024  # 1 TiB
DB_CONFIG = {
    "host": "192.168.1.76",
    "port": 3307,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": True,
}
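# DB_CONFIG is passed straight to pymysql.connect(); with autocommit=True every
# INSERT/DELETE below is committed immediately, so no explicit db.commit() is needed.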
CHUNK_SIZE = 4 * 1024 * 1024 # 4 MB
PRINT_SKIPPED = False
# ==============================
# HASH
# ==============================
def compute_blake3(path: str) -> bytes:
    h = blake3()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(CHUNK_SIZE), b""):
            h.update(chunk)
    return h.digest()
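# Example (hypothetical path): compute_blake3("/mnt/user/#Library/example.mkv").hex()
# yields the 64-character hex form of the 32-byte BLAKE3 digest.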
# ==============================
# SHARE / PATH HELPERS
# ==============================
def get_user_shares():
    if SCAN_ONLY_THIS:
        path = f"/mnt/user/{SCAN_ONLY_THIS}"
        if os.path.isdir(path):
            print(f"🎯 SINGLE SHARE MODE ACTIVE: Scanning only '{SCAN_ONLY_THIS}'")
            return [SCAN_ONLY_THIS]
        else:
            print(f"⚠️ ERROR: Requested share '{SCAN_ONLY_THIS}' not found in /mnt/user!")
            return []
    shares = []
    if not os.path.exists("/mnt/user"):
        return []
    for name in os.listdir("/mnt/user"):
        if name.startswith("."):
            continue
        if name in EXCLUDED_SHARES:
            continue
        path = f"/mnt/user/{name}"
        if os.path.isdir(path):
            shares.append(name)
    return sorted(shares)
def find_physical_roots(shares):
    roots = []
    if not os.path.exists("/mnt"):
        return []
    for disk in os.listdir("/mnt"):
        if not disk.startswith("disk"):
            continue
        for share in shares:
            path = f"/mnt/{disk}/{share}"
            if os.path.isdir(path):
                roots.append((share, path))
    return sorted(roots)
def logical_path_from_disk_path(disk_path: str) -> str:
    if not disk_path.startswith("/mnt/disk"):
        raise ValueError(f"Unexpected disk path: {disk_path}")
    parts = disk_path.split("/", 3)
    return f"/mnt/user/{parts[3]}"
def size_allowed(size: int) -> bool:
    if FILE_MIN_SIZE is not None and size < FILE_MIN_SIZE:
        return False
    if FILE_MAX_SIZE is not None and size > FILE_MAX_SIZE:
        return False
    return True
# ==============================
# MAIN
# ==============================
def main():
    print("🚀 BLAKE3 indexer starting", flush=True)
    print(f"🖥 Host: {HOSTNAME} | OS: {OS_NAME}", flush=True)
    if FILE_MIN_SIZE or FILE_MAX_SIZE:
        print(f"📏 File size limits: min={FILE_MIN_SIZE} max={FILE_MAX_SIZE}", flush=True)
    shares = get_user_shares()
    if not shares:
        print("❌ No user shares to index!", flush=True)
        return
    print("📦 User shares to index:", flush=True)
    for s in shares:
        print(f" - {s}", flush=True)
    scan_roots = find_physical_roots(shares)
    if not scan_roots:
        print("❌ No physical disk roots found!", flush=True)
        return
    print("📂 Physical scan roots:", flush=True)
    for _, path in scan_roots:
        print(f" - {path}", flush=True)
    try:
        db = pymysql.connect(**DB_CONFIG)
        cur = db.cursor()
        # === THIS IS THE "DON'T OVERTHINK IT" STATEMENT ===
        # Pins the session to UTC so MySQL stops shifting times back and forth by an hour.
        # cur.execute("SET time_zone = '+00:00'")
        # =========================================
    except Exception as e:
        print(f"❌ Database connection failed: {e}")
        return
print("📥 Loading already indexed files into memory...", flush=True)
# === OPTIMALIZACE SQL ===
if SCAN_ONLY_THIS:
search_pattern = f"/mnt/user/{SCAN_ONLY_THIS}%"
print(f"⚡ OPTIMIZATION: Fetching only DB records for '{search_pattern}'", flush=True)
cur.execute("""
SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
FROM file_md5_index
WHERE host_name = %s AND full_path LIKE %s
""", (HOSTNAME, search_pattern))
else:
cur.execute("""
SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
FROM file_md5_index
WHERE host_name = %s
""", (HOSTNAME,))
# Načteme do slovníku pro rychlé vyhledávání
# Formát: { "cesta": (velikost, mtime) }
indexed_map = {row[0]: (row[1], row[2]) for row in cur.fetchall()}
print(f"✅ Loaded {len(indexed_map):,} indexed entries", flush=True)
print("======================================", flush=True)
new_files = 0
skipped = 0
filtered = 0
seen_paths = set()
# --- SCAN ---
for share, scan_root in scan_roots:
for root, _, files in os.walk(scan_root):
for fname in files:
disk_path = os.path.join(root, fname)
try:
stat = os.stat(disk_path)
except OSError:
continue
size = stat.st_size
if not size_allowed(size):
filtered += 1
continue
logical_path = logical_path_from_disk_path(disk_path)
if logical_path in seen_paths:
continue
seen_paths.add(logical_path)
mtime = int(stat.st_mtime)
                # === STRICT CHECK (NO TOLERANCE) ===
                # If the file exists in the DB and both size and mtime match exactly, skip it.
                # Anything else (even a 1-second time shift) counts as a change and gets updated.
                is_match = False
                if logical_path in indexed_map:
                    db_size, db_mtime = indexed_map[logical_path]
                    if size == db_size and mtime == db_mtime:
                        is_match = True
                if is_match:
                    skipped += 1
                    if PRINT_SKIPPED:
                        print(f"⏭ SKIP {logical_path}", flush=True)
                    continue
                # ============================================
                print(" NEW / UPDATED", flush=True)
                print(f" File: {logical_path}", flush=True)
                print(f" Size: {size:,} B", flush=True)
                try:
                    b3 = compute_blake3(disk_path)
                except Exception as e:
                    print(f"❌ BLAKE3 failed: {e}", flush=True)
                    continue
                # On a duplicate key, mtime is updated to the value read from disk.
                cur.execute("""
                    INSERT INTO file_md5_index
                        (os_name, host_name, full_path, file_name, directory,
                         file_size, mtime, blake3)
                    VALUES (%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
                    ON DUPLICATE KEY UPDATE
                        file_size = VALUES(file_size),
                        mtime = VALUES(mtime),
                        blake3 = VALUES(blake3),
                        updated_at = CURRENT_TIMESTAMP
                """, (
                    OS_NAME,
                    HOSTNAME,
                    logical_path,
                    fname,
                    os.path.dirname(logical_path),
                    size,
                    mtime,
                    b3,
                ))
                new_files += 1
                print(f" B3 : {b3.hex()}", flush=True)
                print("--------------------------------------", flush=True)
print("======================================", flush=True)
print(f"✅ New / updated : {new_files}", flush=True)
print(f"⏭ Skipped : {skipped}", flush=True)
print(f"🚫 Size filtered: {filtered}", flush=True)
print("🏁 Script finished", flush=True)
    # ==============================
    # DB CLEANUP - REMOVE DELETED FILES
    # ==============================
    print("🧹 Checking for deleted files in DB...", flush=True)
    db_paths = set(indexed_map.keys())
    deleted_paths = db_paths - seen_paths
    # Restrict to the current share only (if single-share mode is active)
    if SCAN_ONLY_THIS:
        prefix = f"/mnt/user/{SCAN_ONLY_THIS}/"
        deleted_paths = {p for p in deleted_paths if p.startswith(prefix)}
    if deleted_paths:
        print(f"🗑 Removing {len(deleted_paths):,} deleted files from DB", flush=True)
        BATCH_SIZE = 1000
        deleted_paths = list(deleted_paths)
        for i in range(0, len(deleted_paths), BATCH_SIZE):
            batch = deleted_paths[i:i + BATCH_SIZE]
            placeholders = ",".join(["%s"] * len(batch))
            sql = f"""
                DELETE FROM file_md5_index
                WHERE host_name = %s
                  AND full_path IN ({placeholders})
            """
            cur.execute(sql, (HOSTNAME, *batch))
        print("✅ DB cleanup completed", flush=True)
    else:
        print("✅ No deleted files found in DB", flush=True)
    cur.close()
    db.close()
if __name__ == "__main__":
    main()