Z230
This commit is contained in:
315
dddddd.py
Normal file
315
dddddd.py
Normal file
@@ -0,0 +1,315 @@
|
||||
#!/usr/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
FAST FILE HASH INDEXER – UNRAID (BLAKE3 ONLY, ALL SHARES)
|
||||
- HARDCODED SINGLE SHARE MODE
|
||||
- SQL OPTIMIZATION
|
||||
- STRICT MODE (NO TOLERANCE) - Updates DB on any mismatch
|
||||
"""
|
||||
|
||||
import os
|
||||
import pymysql
|
||||
import socket
|
||||
import platform
|
||||
from blake3 import blake3
|
||||
|
||||
# ==============================
# ENV / HOST
# ==============================

# Identity of this machine, stored with every DB row so multiple hosts can
# share one index table.
HOSTNAME = socket.gethostname()
OS_NAME = platform.system()

# Hardcoded here for testing:
# When set to a share name, only that single share is scanned; set to None
# to scan all user shares (minus EXCLUDED_SHARES).
# SCAN_ONLY_THIS = None #"#Fotky"
SCAN_ONLY_THIS = '#Library' # "#Fotky"
|
||||
# ==============================
# CONFIG
# ==============================

# Unraid system shares that must never be indexed.
EXCLUDED_SHARES = {"domains", "appdata", "system", "isos"}

# --- File size limits (bytes) ---
# Files outside [FILE_MIN_SIZE, FILE_MAX_SIZE] are counted as "filtered"
# and skipped. A limit of None disables that bound (see size_allowed()).
FILE_MIN_SIZE = 0
FILE_MAX_SIZE = 1024 * 1024 * 1024 * 1024  # 1 TiB (NOTE: the previous "50MB" comment was wrong)

# SECURITY NOTE(review): DB credentials are hardcoded in source. Consider
# moving them to environment variables or a config file outside VCS.
DB_CONFIG = {
    "host": "192.168.1.76",
    "port": 3307,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    # autocommit means every INSERT/DELETE is committed immediately — no
    # explicit transaction handling anywhere in this script.
    "autocommit": True,
}

CHUNK_SIZE = 4 * 1024 * 1024  # 4 MB read size used when hashing files
PRINT_SKIPPED = False  # set True to log every unchanged (skipped) file
|
||||
|
||||
# ==============================
|
||||
# HASH
|
||||
# ==============================
|
||||
|
||||
def compute_blake3(path: str) -> bytes:
    """Return the raw BLAKE3 digest of the file at *path*.

    The file is streamed in CHUNK_SIZE pieces so arbitrarily large files
    can be hashed with constant memory.
    """
    hasher = blake3()
    with open(path, "rb") as fh:
        while chunk := fh.read(CHUNK_SIZE):
            hasher.update(chunk)
    return hasher.digest()
|
||||
|
||||
# ==============================
|
||||
# SHARE / PATH HELPERS
|
||||
# ==============================
|
||||
|
||||
def get_user_shares():
    """Return the list of Unraid user-share names to index.

    Single-share mode (SCAN_ONLY_THIS set): return just that share if it
    exists under /mnt/user, otherwise print an error and return [].
    Normal mode: return every non-hidden, non-excluded directory under
    /mnt/user, sorted alphabetically.
    """
    if SCAN_ONLY_THIS:
        target = f"/mnt/user/{SCAN_ONLY_THIS}"
        if os.path.isdir(target):
            print(f"🎯 SINGLE SHARE MODE ACTIVE: Scanning only '{SCAN_ONLY_THIS}'")
            return [SCAN_ONLY_THIS]
        print(f"⚠️ ERROR: Requested share '{SCAN_ONLY_THIS}' not found in /mnt/user!")
        return []

    if not os.path.exists("/mnt/user"):
        return []

    found = [
        entry
        for entry in os.listdir("/mnt/user")
        if not entry.startswith(".")
        and entry not in EXCLUDED_SHARES
        and os.path.isdir(f"/mnt/user/{entry}")
    ]
    return sorted(found)
|
||||
|
||||
def find_physical_roots(shares):
    """Map logical shares to their physical per-disk directories.

    For every /mnt/diskN device, each share in *shares* that physically
    exists on that disk yields a (share, "/mnt/diskN/<share>") pair.
    Returns the pairs sorted; [] when /mnt is absent.
    """
    if not os.path.exists("/mnt"):
        return []

    pairs = [
        (share, f"/mnt/{disk}/{share}")
        for disk in os.listdir("/mnt")
        if disk.startswith("disk")
        for share in shares
        if os.path.isdir(f"/mnt/{disk}/{share}")
    ]
    return sorted(pairs)
|
||||
|
||||
def logical_path_from_disk_path(disk_path: str) -> str:
    """Translate a physical path (/mnt/diskN/<share>/...) to its logical
    user-share path (/mnt/user/<share>/...).

    Raises:
        ValueError: if *disk_path* does not start with "/mnt/disk", or is a
            bare "/mnt/diskN" with no share component (previously this case
            escaped as an unhelpful IndexError).
    """
    if not disk_path.startswith("/mnt/disk"):
        raise ValueError(f"Unexpected disk path: {disk_path}")
    # "/mnt/diskN/rest..." -> ["", "mnt", "diskN", "rest..."]
    parts = disk_path.split("/", 3)
    if len(parts) < 4:
        raise ValueError(f"Unexpected disk path: {disk_path}")
    return f"/mnt/user/{parts[3]}"
|
||||
|
||||
def size_allowed(size: int) -> bool:
    """Return True when *size* (bytes) falls within the configured
    FILE_MIN_SIZE / FILE_MAX_SIZE bounds; a bound of None disables it."""
    below_min = FILE_MIN_SIZE is not None and size < FILE_MIN_SIZE
    above_max = FILE_MAX_SIZE is not None and size > FILE_MAX_SIZE
    return not (below_min or above_max)
|
||||
|
||||
# ==============================
|
||||
# MAIN
|
||||
# ==============================
|
||||
|
||||
def main():
    """Index files from all physical disk roots into the `file_md5_index`
    MySQL table using BLAKE3, then purge DB rows for files deleted on disk.

    Skip rule is strict: a file is skipped only when its size AND integer
    mtime match the DB row exactly; any mismatch re-hashes and upserts.
    """
    print("🚀 BLAKE3 indexer starting", flush=True)
    print(f"🖥 Host: {HOSTNAME} | OS: {OS_NAME}", flush=True)

    if FILE_MIN_SIZE or FILE_MAX_SIZE:
        print(f"📏 File size limits: min={FILE_MIN_SIZE} max={FILE_MAX_SIZE}", flush=True)

    shares = get_user_shares()
    if not shares:
        print("❌ No user shares to index!", flush=True)
        return

    print("📦 User shares to index:", flush=True)
    for s in shares:
        print(f" - {s}", flush=True)

    # Scan /mnt/diskN directly (not the /mnt/user fuse mount) for speed.
    scan_roots = find_physical_roots(shares)
    if not scan_roots:
        print("❌ No physical disk roots found!", flush=True)
        return

    print("📂 Physical scan roots:", flush=True)
    for _, path in scan_roots:
        print(f" - {path}", flush=True)

    try:
        db = pymysql.connect(**DB_CONFIG)
        cur = db.cursor()
        # === THIS IS THE "DON'T THINK" COMMAND ===
        # Sets the session to UTC so MySQL stops shifting times by an hour
        # back and forth. Currently disabled.
        # cur.execute("SET time_zone = '+00:00'")
        # =========================================
    except Exception as e:
        print(f"❌ Database connection failed: {e}")
        return

    print("📥 Loading already indexed files into memory...", flush=True)

    # === SQL OPTIMIZATION ===
    # In single-share mode, fetch only DB rows under that share's path prefix
    # instead of every row for this host.
    if SCAN_ONLY_THIS:
        search_pattern = f"/mnt/user/{SCAN_ONLY_THIS}%"
        print(f"⚡ OPTIMIZATION: Fetching only DB records for '{search_pattern}'", flush=True)
        cur.execute("""
            SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
            FROM file_md5_index
            WHERE host_name = %s AND full_path LIKE %s
        """, (HOSTNAME, search_pattern))
    else:
        cur.execute("""
            SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
            FROM file_md5_index
            WHERE host_name = %s
        """, (HOSTNAME,))

    # Load everything into a dict for O(1) lookups during the scan.
    # Format: { "path": (size, mtime) }
    indexed_map = {row[0]: (row[1], row[2]) for row in cur.fetchall()}
    print(f"✅ Loaded {len(indexed_map):,} indexed entries", flush=True)
    print("======================================", flush=True)

    new_files = 0
    skipped = 0
    filtered = 0
    # Logical paths seen this run; also dedupes files that appear on more
    # than one physical disk under the same logical path.
    seen_paths = set()

    # --- SCAN ---
    for share, scan_root in scan_roots:
        for root, _, files in os.walk(scan_root):
            for fname in files:
                disk_path = os.path.join(root, fname)

                try:
                    stat = os.stat(disk_path)
                except OSError:
                    # File vanished or is unreadable — skip silently.
                    continue

                size = stat.st_size
                if not size_allowed(size):
                    filtered += 1
                    continue

                logical_path = logical_path_from_disk_path(disk_path)

                if logical_path in seen_paths:
                    continue
                seen_paths.add(logical_path)

                # Truncate to whole seconds to match UNIX_TIMESTAMP(mtime)
                # loaded from the DB.
                mtime = int(stat.st_mtime)

                # === STRICT CHECK (NO TOLERANCE) ===
                # If the file exists in the DB and both size and time match
                # exactly, skip it. Anything else (even a 1 s time shift)
                # counts as a change and gets re-hashed and updated.

                is_match = False
                if logical_path in indexed_map:
                    db_size, db_mtime = indexed_map[logical_path]
                    if size == db_size and mtime == db_mtime:
                        is_match = True

                if is_match:
                    skipped += 1
                    if PRINT_SKIPPED:
                        print(f"⏭ SKIP {logical_path}", flush=True)
                    continue
                # ============================================

                print("➕ NEW / UPDATED", flush=True)
                print(f" File: {logical_path}", flush=True)
                print(f" Size: {size:,} B", flush=True)

                try:
                    b3 = compute_blake3(disk_path)
                except Exception as e:
                    print(f"❌ BLAKE3 failed: {e}", flush=True)
                    continue

                # Upsert: on duplicate key the row's mtime is UPDATEd to the
                # on-disk value along with size and hash.
                cur.execute("""
                    INSERT INTO file_md5_index
                        (os_name, host_name, full_path, file_name, directory,
                         file_size, mtime, blake3)
                    VALUES (%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
                    ON DUPLICATE KEY UPDATE
                        file_size = VALUES(file_size),
                        mtime = VALUES(mtime),
                        blake3 = VALUES(blake3),
                        updated_at = CURRENT_TIMESTAMP
                """, (
                    OS_NAME,
                    HOSTNAME,
                    logical_path,
                    fname,
                    os.path.dirname(logical_path),
                    size,
                    mtime,
                    b3,
                ))

                new_files += 1
                print(f" B3 : {b3.hex()}", flush=True)
                print("--------------------------------------", flush=True)

    print("======================================", flush=True)
    print(f"✅ New / updated : {new_files}", flush=True)
    print(f"⏭ Skipped : {skipped}", flush=True)
    print(f"🚫 Size filtered: {filtered}", flush=True)
    print("🏁 Script finished", flush=True)

    # ==============================
    # DB CLEANUP – REMOVE DELETED FILES
    # ==============================

    print("🧹 Checking for deleted files in DB...", flush=True)

    db_paths = set(indexed_map.keys())
    deleted_paths = db_paths - seen_paths

    # Restrict cleanup to the current share (when single-share mode is
    # active) so rows belonging to other shares are never deleted.
    if SCAN_ONLY_THIS:
        prefix = f"/mnt/user/{SCAN_ONLY_THIS}/"
        deleted_paths = {p for p in deleted_paths if p.startswith(prefix)}

    if deleted_paths:
        print(f"🗑 Removing {len(deleted_paths):,} deleted files from DB", flush=True)

        BATCH_SIZE = 1000
        deleted_paths = list(deleted_paths)

        # Delete in batches to keep each IN (...) list bounded.
        for i in range(0, len(deleted_paths), BATCH_SIZE):
            batch = deleted_paths[i:i + BATCH_SIZE]
            placeholders = ",".join(["%s"] * len(batch))

            # Only the %s placeholder list is interpolated into the SQL
            # string; the actual values are still bound as parameters, so
            # this remains injection-safe.
            sql = f"""
                DELETE FROM file_md5_index
                WHERE host_name = %s
                  AND full_path IN ({placeholders})
            """

            cur.execute(sql, (HOSTNAME, *batch))

        print("✅ DB cleanup completed", flush=True)
    else:
        print("✅ No deleted files found in DB", flush=True)

    cur.close()
    db.close()
|
||||
# Script entry point: run the indexer only when executed directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user