This commit is contained in:
2026-01-08 10:15:45 +01:00
parent 2aee823e87
commit 6cdabc64b4
2 changed files with 646 additions and 0 deletions

View File

@@ -0,0 +1,295 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
FAST FILE HASH INDEXER WINDOWS CLIENT (EXTERNAL DISKS)
- Mode: PHYSICAL BACKUP
- Hostname in DB = Disk Label (e.g., #HD015)
- Path in DB = Relative path (e.g., /Movies/Film.mkv)
"""
import os, time
import pymysql
import socket
import platform
import sys
from blake3 import blake3
# ==============================
# CONFIG
# ==============================
# Read size used while hashing. This is the single authoritative definition:
# 4 MB is the value that was actually in effect before (a stale 5 MB
# assignment earlier in the file was being silently overridden).
CHUNK_SIZE = 4 * 1024 * 1024  # 4 MB
PROGRESS_MIN_SIZE = 500 * 1024 * 1024  # 500 MB
PROGRESS_INTERVAL = 1.0  # seconds
EXCLUDED_DIRS = {"$RECYCLE.BIN", "System Volume Information", "RECYCLER", "msdownld.tmp"}
# --- File size limits ---
FILE_MIN_SIZE = 0
FILE_MAX_SIZE = 1024 * 1024 * 1024 * 1024  # 1 TB
# --- Database settings ---
# NOTE(security): a root password committed to source control is visible to
# everyone with repository access. Set INDEXER_DB_PASSWORD in the environment
# to override it; the literal below is kept only as a backward-compatible
# fallback and should be rotated and removed.
DB_CONFIG = {
    "host": "192.168.1.76",
    "port": 3307,
    "user": "root",
    "password": os.environ.get("INDEXER_DB_PASSWORD", "Vlado9674+"),
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": True,
}
PRINT_SKIPPED = False  # True = also print files that were skipped
# ==============================
# SYSTEM INFO
# ==============================
# Physical PC name (console output only; the DB stores the disk label instead)
REAL_PC_HOSTNAME = socket.gethostname()
OS_NAME = platform.system()
# ==============================
# FUNCTIONS
# ==============================
def compute_blake3(path: str) -> bytes:
    """Return the raw BLAKE3 digest of the file at *path*.

    The file is read in CHUNK_SIZE pieces. For files of at least
    PROGRESS_MIN_SIZE bytes a progress line is printed at most once per
    PROGRESS_INTERVAL seconds, followed by a final summary line.

    Raises:
        Exception: any failure while sizing, opening or reading the file is
            printed as a "HASH ERROR" line and then re-raised to the caller.
    """
    mib = 1024 * 1024
    try:
        # Sizing happens inside the try so that a file which vanished or
        # became unreadable is reported like every other hashing failure.
        total_size = os.path.getsize(path)
        show_progress = total_size >= PROGRESS_MIN_SIZE
        h = blake3()
        processed = 0
        # Monotonic clock: elapsed time cannot jump when the wall clock is
        # adjusted in the middle of a long hash.
        start_time = time.monotonic()
        last_report = start_time
        with open(path, "rb") as f:
            while True:
                chunk = f.read(CHUNK_SIZE)
                if not chunk:
                    break
                h.update(chunk)
                processed += len(chunk)
                if not show_progress:
                    continue
                now = time.monotonic()
                if now - last_report < PROGRESS_INTERVAL:
                    continue
                elapsed = now - start_time
                speed = processed / elapsed if elapsed > 0 else 0
                percent = processed / total_size * 100
                # Clamp at zero: if the file grew while being read, a
                # negative ETA would make time.gmtime() fail on Windows.
                remaining = max(total_size - processed, 0)
                eta = remaining / speed if speed > 0 else 0
                print(
                    f"{percent:6.2f}% | "
                    f"{processed / mib:8.1f} / {total_size / mib:.1f} MB | "
                    f"{speed / mib:6.1f} MB/s | "
                    f"ETA {time.strftime('%H:%M:%S', time.gmtime(eta))}",
                    flush=True
                )
                last_report = now
        if show_progress:
            total_time = time.monotonic() - start_time
            avg_speed = total_size / total_time if total_time > 0 else 0
            print(
                f" ✅ DONE | "
                f"{total_size / mib:.1f} MB | "
                f"avg {avg_speed / mib:.1f} MB/s | "
                f"time {time.strftime('%H:%M:%S', time.gmtime(total_time))}",
                flush=True
            )
        return h.digest()
    except Exception as e:
        print(f"⚠️ HASH ERROR: {path} - {e}")
        raise
def get_drive_info():
    """Interactively ask for the drive to scan and the disk ID.

    Returns:
        A ``(drive_root, disk_label)`` tuple: the drive root such as
        ``"E:\\"`` (only accepted once it exists as a directory) and the
        disk ID typed by the user, which the caller stores as ``host_name``.
    """
    print("\n💿 --- NASTAVENÍ SKENOVÁNÍ (EXTERNÍ DISK) ---")

    # Step 1: drive letter. Keep asking until it is a single letter that
    # maps to an existing drive root.
    drive_root = None
    while drive_root is None:
        answer = input("📂 Zadejte písmeno disku ve Windows (např. 'E'): ").strip().upper()
        letter = answer
        # Tolerate inputs such as "E:", "E:\" or "E:/".
        for separator in (":", "\\", "/"):
            letter = letter.replace(separator, "")
        if len(letter) != 1 or not letter.isalpha():
            print("❌ Neplatný formát.")
            continue
        candidate = f"{letter}:\\"
        if os.path.isdir(candidate):
            drive_root = candidate
        else:
            print(f"❌ Disk {candidate} není dostupný.")

    # Step 2: disk ID, at least two characters long.
    while True:
        disk_label = input("🏷️ Zadejte ID disku (bude uloženo jako 'host_name', např. '#HD015'): ").strip()
        if len(disk_label) >= 2:
            return drive_root, disk_label
        print("❌ Název je příliš krátký.")
def size_allowed(size: int) -> bool:
    """Return True when *size* lies within the configured limits.

    FILE_MIN_SIZE and FILE_MAX_SIZE are inclusive bounds; a bound set to
    None is not enforced.
    """
    below_minimum = FILE_MIN_SIZE is not None and size < FILE_MIN_SIZE
    if below_minimum:
        return False
    above_maximum = FILE_MAX_SIZE is not None and size > FILE_MAX_SIZE
    return not above_maximum
# ==============================
# MAIN
# ==============================
def _scan_and_index(cur, scan_root, disk_hostname, indexed_map):
    """Walk *scan_root*, hash new or changed files and upsert them via *cur*.

    Args:
        cur: open DB cursor used for the upserts.
        scan_root: drive root being scanned (e.g. ``"E:\\"``).
        disk_hostname: disk ID stored in the ``host_name`` column.
        indexed_map: ``{db_path: (size, mtime)}`` of rows already in the DB.

    Returns:
        A ``(new_files, skipped, filtered, errors)`` tuple of counters.
    """
    new_files = 0
    skipped = 0
    filtered = 0
    errors = 0
    seen_paths = set()
    for root, dirs, files in os.walk(scan_root):
        # Prune in place so os.walk does not descend into system folders.
        dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]
        for fname in files:
            disk_path = os.path.join(root, fname)
            # 1. Stat (size, modification time)
            try:
                stat = os.stat(disk_path)
            except OSError:
                errors += 1
                continue
            size = stat.st_size
            if not size_allowed(size):
                filtered += 1
                continue
            # 2. Build the DB path: E:\Filmy\Avatar.mkv -> /Filmy/Avatar.mkv
            try:
                rel_path = os.path.relpath(disk_path, scan_root)
            except ValueError:
                errors += 1
                continue
            clean_path = rel_path.replace("\\", "/")
            if not clean_path.startswith("/"):
                clean_path = "/" + clean_path
            if clean_path in seen_paths:
                continue
            seen_paths.add(clean_path)
            mtime = int(stat.st_mtime)
            # Strict check: unchanged only when both size and mtime match.
            if indexed_map.get(clean_path) == (size, mtime):
                skipped += 1
                if PRINT_SKIPPED:
                    print(f"⏭ SKIP {clean_path}", flush=True)
                continue
            # New or changed file: hash it and upsert the row.
            print(" NEW / UPDATED", flush=True)
            print(f" File: {clean_path}", flush=True)
            print(f" Size: {size:,} B", flush=True)
            try:
                b3 = compute_blake3(disk_path)
            except Exception:
                # compute_blake3 has already printed the reason.
                errors += 1
                continue
            cur.execute("""
                INSERT INTO file_md5_index
                    (os_name, host_name, full_path, file_name, directory,
                     file_size, mtime, blake3)
                VALUES (%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
                ON DUPLICATE KEY UPDATE
                    file_size = VALUES(file_size),
                    mtime = VALUES(mtime),
                    blake3 = VALUES(blake3),
                    updated_at = CURRENT_TIMESTAMP
            """, (
                OS_NAME,  # OS the scan ran on, e.g. 'Windows'
                disk_hostname,  # disk ID, e.g. '#HD015'
                clean_path,  # e.g. '/Filmy/Avatar.mkv'
                fname,
                os.path.dirname(clean_path),
                size,
                mtime,
                b3,
            ))
            new_files += 1
            print(f" Hash: {b3.hex()}", flush=True)
            print("--------------------------------------", flush=True)
    return new_files, skipped, filtered, errors


def main():
    """Index an external disk into the ``file_md5_index`` table.

    Asks for the drive and disk ID, loads the rows already stored for that
    disk, hashes every new or changed file and upserts it, then prints a
    summary. The cursor and connection are closed even when the scan fails.
    """
    print("🚀 BLAKE3 External Disk Indexer", flush=True)
    print(f"🖥 Running on PC: {REAL_PC_HOSTNAME}", flush=True)
    # Collect user input
    scan_root, disk_hostname = get_drive_info()
    print("✅ Konfigurace:")
    print(f" Zdroj (Windows) : {scan_root}")
    print(f" DB Hostname : {disk_hostname}")
    print(" DB Cesty : /Složka/Soubor...")
    try:
        db = pymysql.connect(**DB_CONFIG)
        cur = db.cursor()
    except Exception as e:
        print(f"❌ DB Connection failed: {e}")
        input("Enter pro konec...")
        return
    try:
        print(f"📥 Načítám index pro disk: '{disk_hostname}'...", flush=True)
        # Only rows for this disk are loaded (exact host_name match).
        cur.execute("""
            SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
            FROM file_md5_index
            WHERE host_name = %s
        """, (disk_hostname,))
        # Map: { "/Folder/File.ext": (size, mtime) }
        indexed_map = {row[0]: (row[1], row[2]) for row in cur.fetchall()}
        print(f"✅ Nalezeno {len(indexed_map):,} souborů v DB pro tento disk.", flush=True)
        print("======================================", flush=True)
        new_files, skipped, filtered, errors = _scan_and_index(
            cur, scan_root, disk_hostname, indexed_map
        )
        print("======================================", flush=True)
        print(f"✅ Hotovo : {new_files}")
        print(f"⏭ Shoda : {skipped}")
        # Files outside the size limits were counted but never reported.
        print(f"🚫 Filtrováno : {filtered}")
        print(f"⚠️ Chyby : {errors}")
        print("🏁 Konec.")
    finally:
        # Release DB resources on the error path as well.
        cur.close()
        db.close()
# input("\nStiskněte Enter pro ukončení...")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,351 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
r"""
FAST FILE HASH INDEXER WINDOWS CLIENT (HARDCODED CONFIG)
- Mode: PHYSICAL BACKUP
- Hostname in DB = Disk Label (e.g., #HD015)
- Path in DB = Relative path (e.g., /Movies/Film.mkv)
"""
import os, time
import pymysql
import socket
import platform
import sys
from blake3 import blake3
# ==============================
# ⚙️ USER CONFIGURATION
# ==============================
DISK_DRIVE_LETTER = "P"  # (e.g., "E", "F", "P")
DISK_HOSTNAME = "#HD08"  # (e.g., "#HD015")
# 🔒 SAFETY SWITCH
# True = LIST ONLY (No DB changes). "Simulates" the run.
# False = EXECUTE (Deletes and Inserts into DB).
DRY_RUN = True
# ==============================
# TECHNICAL CONFIG
# ==============================
CHUNK_SIZE = 5 * 1024 * 1024  # 5 MB
PROGRESS_MIN_SIZE = 500 * 1024 * 1024  # 500 MB
PROGRESS_INTERVAL = 1.0  # seconds
EXCLUDED_DIRS = {"$RECYCLE.BIN", "System Volume Information", "RECYCLER", "msdownld.tmp"}
# --- File Size Limits ---
FILE_MIN_SIZE = 0
FILE_MAX_SIZE = 1024 * 1024 * 1024 * 1024  # 1TB
# --- DB Config ---
# NOTE(security): a root password committed to source control is visible to
# everyone with repository access. Set INDEXER_DB_PASSWORD in the environment
# to override it; the literal below is kept only as a backward-compatible
# fallback and should be rotated and removed.
DB_CONFIG = {
    "host": "192.168.1.76",
    "port": 3307,
    "user": "root",
    "password": os.environ.get("INDEXER_DB_PASSWORD", "Vlado9674+"),
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": True,
}
PRINT_SKIPPED = False  # Set True to see files that were already in DB
# ==============================
# SYSTEM INFO
# ==============================
REAL_PC_HOSTNAME = socket.gethostname()
OS_NAME = platform.system()
# ==============================
# FUNCTIONS
# ==============================
def compute_blake3(path: str) -> bytes:
    """Return the raw BLAKE3 digest of the file at *path*.

    The file is read in CHUNK_SIZE pieces. For files of at least
    PROGRESS_MIN_SIZE bytes a progress line is printed at most once per
    PROGRESS_INTERVAL seconds, followed by a final summary line.

    Raises:
        Exception: any failure while sizing, opening or reading the file is
            printed as a "HASH ERROR" line and then re-raised to the caller.
    """
    mib = 1024 * 1024
    try:
        # Sizing happens inside the try so that a file which vanished or
        # became unreadable is reported like every other hashing failure.
        total_size = os.path.getsize(path)
        show_progress = total_size >= PROGRESS_MIN_SIZE
        h = blake3()
        processed = 0
        # Monotonic clock: elapsed time cannot jump when the wall clock is
        # adjusted in the middle of a long hash.
        start_time = time.monotonic()
        last_report = start_time
        with open(path, "rb") as f:
            while True:
                chunk = f.read(CHUNK_SIZE)
                if not chunk:
                    break
                h.update(chunk)
                processed += len(chunk)
                if not show_progress:
                    continue
                now = time.monotonic()
                if now - last_report < PROGRESS_INTERVAL:
                    continue
                elapsed = now - start_time
                speed = processed / elapsed if elapsed > 0 else 0
                percent = processed / total_size * 100
                # Clamp at zero: if the file grew while being read, a
                # negative ETA would make time.gmtime() fail on Windows.
                remaining = max(total_size - processed, 0)
                eta = remaining / speed if speed > 0 else 0
                print(
                    f"{percent:6.2f}% | "
                    f"{processed / mib:8.1f} / {total_size / mib:.1f} MB | "
                    f"{speed / mib:6.1f} MB/s | "
                    f"ETA {time.strftime('%H:%M:%S', time.gmtime(eta))}",
                    flush=True
                )
                last_report = now
        if show_progress:
            total_time = time.monotonic() - start_time
            avg_speed = total_size / total_time if total_time > 0 else 0
            print(
                f" ✅ DONE | "
                f"{total_size / mib:.1f} MB | "
                f"avg {avg_speed / mib:.1f} MB/s | "
                f"time {time.strftime('%H:%M:%S', time.gmtime(total_time))}",
                flush=True
            )
        return h.digest()
    except Exception as e:
        print(f"⚠️ HASH ERROR: {path} - {e}")
        raise
def size_allowed(size: int) -> bool:
    """Tell whether *size* falls inside the configured size window.

    FILE_MIN_SIZE and FILE_MAX_SIZE are inclusive bounds; a bound set to
    None disables that side of the check.
    """
    if FILE_MIN_SIZE is not None and size < FILE_MIN_SIZE:
        return False
    exceeds_maximum = FILE_MAX_SIZE is not None and size > FILE_MAX_SIZE
    if exceeds_maximum:
        return False
    return True
def normalize_db_path(scan_root: str, disk_path: str):
    r"""
    Convert a physical path under *scan_root* to the standardized DB format.

    Example (scan_root ``E:\``): ``E:\Movies\File.mkv`` -> ``/Movies/File.mkv``

    Args:
        scan_root: root of the scan; the result is relative to it.
        disk_path: full path of a file found below *scan_root*.

    Returns:
        The relative path with forward slashes and a leading ``/``, or
        ``None`` when ``os.path.relpath`` raises ``ValueError`` (for example
        when the two paths are on different Windows drives).
    """
    # The docstring is a raw string: "\M" and "\F" in the example are invalid
    # escape sequences in a normal string literal and trigger a warning.
    try:
        rel_path = os.path.relpath(disk_path, scan_root)
    except ValueError:
        return None
    # Windows backslash to slash
    clean_path = rel_path.replace("\\", "/")
    # Ensure leading slash
    if not clean_path.startswith("/"):
        clean_path = "/" + clean_path
    return clean_path
# ==============================
# MAIN
# ==============================
def _collect_disk_paths(scan_root):
    """Return the set of normalized DB paths for every file under *scan_root*."""
    found = set()
    # NOTE(review): os.walk ignores directory-listing errors by default, so a
    # folder that cannot be read makes its indexed files look deleted.
    # Consider passing onerror= and skipping the cleanup for such folders.
    for root, dirs, files in os.walk(scan_root):
        # Prune in place so os.walk does not descend into system folders.
        dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]
        for fname in files:
            clean_path = normalize_db_path(scan_root, os.path.join(root, fname))
            if clean_path:
                found.add(clean_path)
    return found


def _delete_missing(cur, indexed_map, paths_to_delete, batch_size=1000):
    """Delete stale rows in batches and return how many paths were removed.

    Only paths from batches whose DELETE succeeded are dropped from
    *indexed_map* and counted, so a failed batch is not reported as deleted.
    """
    to_delete = sorted(paths_to_delete)
    deleted = 0
    for start in range(0, len(to_delete), batch_size):
        batch = to_delete[start:start + batch_size]
        # Only the placeholders are interpolated; values stay parameterized.
        placeholders = ",".join(["%s"] * len(batch))
        query = f"DELETE FROM file_md5_index WHERE host_name = %s AND full_path IN ({placeholders})"
        try:
            cur.execute(query, [DISK_HOSTNAME] + batch)
        except Exception as e:
            print(f"❌ Error deleting batch: {e}")
            continue
        print(f" ... deleted batch {start}-{start + len(batch)}")
        for stale_path in batch:
            del indexed_map[stale_path]
        deleted += len(batch)
    return deleted


def _scan_and_index(cur, scan_root, indexed_map):
    """Walk *scan_root*, hash new or changed files and upsert them.

    In DRY_RUN mode files are still hashed but nothing is written to the DB.

    Returns:
        A ``(new_files, skipped, filtered, errors)`` tuple of counters.
    """
    new_files = 0
    skipped = 0
    filtered = 0
    errors = 0
    seen_paths = set()
    for root, dirs, files in os.walk(scan_root):
        dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]
        for fname in files:
            disk_path = os.path.join(root, fname)
            try:
                stat = os.stat(disk_path)
            except OSError:
                errors += 1
                continue
            size = stat.st_size
            if not size_allowed(size):
                filtered += 1
                continue
            clean_path = normalize_db_path(scan_root, disk_path)
            if not clean_path:
                errors += 1
                continue
            if clean_path in seen_paths:
                continue
            seen_paths.add(clean_path)
            mtime = int(stat.st_mtime)
            # Unchanged only when both size and mtime match the DB row.
            if indexed_map.get(clean_path) == (size, mtime):
                skipped += 1
                if PRINT_SKIPPED:
                    print(f"⏭ SKIP {clean_path}", flush=True)
                continue
            # New or changed file: hash it, then insert/update the row.
            print(" NEW / UPDATED", flush=True)
            print(f" File: {clean_path}", flush=True)
            print(f" Size: {size:,} B", flush=True)
            try:
                b3 = compute_blake3(disk_path)
            except Exception:
                # compute_blake3 has already printed the reason.
                errors += 1
                continue
            if DRY_RUN:
                print(f"🛡️ [DRY RUN] Would INSERT/UPDATE: {clean_path}")
            else:
                cur.execute("""
                    INSERT INTO file_md5_index
                        (os_name, host_name, full_path, file_name, directory,
                         file_size, mtime, blake3)
                    VALUES (%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
                    ON DUPLICATE KEY UPDATE
                        file_size = VALUES(file_size),
                        mtime = VALUES(mtime),
                        blake3 = VALUES(blake3),
                        updated_at = CURRENT_TIMESTAMP
                """, (
                    OS_NAME,
                    DISK_HOSTNAME,
                    clean_path,
                    fname,
                    os.path.dirname(clean_path),
                    size,
                    mtime,
                    b3,
                ))
            new_files += 1
            print(f" Hash: {b3.hex()}", flush=True)
            print("--------------------------------------", flush=True)
    return new_files, skipped, filtered, errors


def main():
    """Synchronize the DB index for the configured disk.

    Phase 1 removes DB rows whose files no longer exist on the disk; phase 2
    hashes new or changed files and upserts them. With DRY_RUN enabled both
    phases only report what they would do. The cursor and connection are
    closed even when a phase fails.
    """
    print("🚀 BLAKE3 External Disk Indexer", flush=True)
    print(f"🖥 Running on PC: {REAL_PC_HOSTNAME}", flush=True)
    if DRY_RUN:
        print("🛡️ DRY RUN MODE ACTIVE: No changes will be made to DB.", flush=True)
    else:
        print("⚠️ LIVE MODE: Changes WILL be committed to DB.", flush=True)
    # Build root path
    scan_root = f"{DISK_DRIVE_LETTER}:\\"
    if not os.path.isdir(scan_root):
        print(f"❌ ERROR: Drive '{scan_root}' not found!")
        print(" Please check DISK_DRIVE_LETTER in config.")
        return
    print("✅ Config:")
    print(f" Source (Win) : {scan_root}")
    print(f" DB Hostname : {DISK_HOSTNAME}")
    try:
        db = pymysql.connect(**DB_CONFIG)
        cur = db.cursor()
    except Exception as e:
        print(f"❌ DB Connection failed: {e}")
        return
    try:
        print(f"📥 Loading DB index for: '{DISK_HOSTNAME}'...", flush=True)
        # === LOAD EXISTING DB RECORDS ===
        cur.execute("""
            SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
            FROM file_md5_index
            WHERE host_name = %s
        """, (DISK_HOSTNAME,))
        indexed_map = {row[0]: (row[1], row[2]) for row in cur.fetchall()}
        print(f"✅ Found {len(indexed_map):,} files in DB for this disk.", flush=True)

        # PHASE 1: CLEANUP (DELETE MISSING FILES)
        print("======================================", flush=True)
        print("🧹 PHASE 1: Checking for deleted files...", flush=True)
        paths_to_delete = set(indexed_map) - _collect_disk_paths(scan_root)
        deleted = 0
        if not paths_to_delete:
            print("✅ No deleted files detected.")
        else:
            print(f"🗑️ Found {len(paths_to_delete):,} files to delete from DB.")
            if DRY_RUN:
                print("🛡️ [DRY RUN] Listing files to be deleted (No action taken):")
                for p in sorted(paths_to_delete)[:20]:  # Print first 20
                    print(f" - {p}")
                if len(paths_to_delete) > 20:
                    print(f" ... and {len(paths_to_delete) - 20} more.")
                # The dry-run summary reports what would have been deleted.
                deleted = len(paths_to_delete)
            else:
                deleted = _delete_missing(cur, indexed_map, paths_to_delete)
                if deleted == len(paths_to_delete):
                    print("✅ Cleanup complete.")
                else:
                    remaining = len(paths_to_delete) - deleted
                    print(f"⚠️ Cleanup incomplete: {remaining} stale rows remain in DB.")

        # PHASE 2: SCAN & UPDATE (HASHING)
        print("======================================", flush=True)
        print("🚀 PHASE 2: Scanning for changes & new files...", flush=True)
        new_files, skipped, filtered, errors = _scan_and_index(
            cur, scan_root, indexed_map
        )
        print("======================================", flush=True)
        print(f"✅ Processed : {new_files}")
        print(f"⏭ Skipped : {skipped}")
        # Files outside the size limits were counted but never reported.
        print(f"🚫 Filtered : {filtered}")
        print(f"🗑 Deleted : {deleted} " + ("(DRY RUN)" if DRY_RUN else ""))
        print(f"⚠️ Errors : {errors}")
        print("🏁 Done.")
    finally:
        # Release DB resources on the error path as well.
        cur.close()
        db.close()
if __name__ == "__main__":
main()