Compare commits


8 Commits

SHA1 Message Date
c30a582323 Merge remote-tracking branch 'origin/master' 2026-01-13 16:43:13 +01:00
01aa1249b9 z230 2026-01-13 16:42:40 +01:00
b74e180022 tw22 2026-01-11 21:13:38 +01:00
2037d1b887 tw22 2026-01-11 20:14:52 +01:00
6cdabc64b4 tw22 2026-01-08 10:15:45 +01:00
2aee823e87 z230 2026-01-06 10:09:51 +01:00
b61a8a5473 z230 2026-01-06 10:09:25 +01:00
83f2d0dafc z230 2026-01-06 10:05:35 +01:00
10 changed files with 1197 additions and 580 deletions

.gitignore vendored Normal file

@@ -0,0 +1,2 @@
.venv/
.idea/


@@ -258,8 +258,7 @@ def walk_and_store_bulk():
     BATCH_SIZE = 10000
     # target_dir = r"\\tower1\#colddata"
     # target_dir = r"z:"
-    # target_dir = r"\\tower\ebooks"
-    target_dir = r"\\tower\dedup"
+    target_dir = r"\\tower\ebooks"
     # device_name = "TW22"
     device_name = "TOWER"


@@ -1,485 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import hashlib
from datetime import datetime
import mysql.connector
from mysql.connector import Error
from dotenv import load_dotenv
from pathlib import Path
import unicodedata
# ======================================================
# Load .env from the script directory
# ======================================================
env_path = Path(__file__).resolve().parent / ".env"
load_dotenv(env_path)
# ======================================================
# Helper: MD5 of full file path string
# ======================================================
def md5_path(path: str) -> str:
return hashlib.md5(path.encode("utf8")).hexdigest()
# ======================================================
# MySQL CONNECTIONS
# ======================================================
def get_server_connection():
return mysql.connector.connect(
host=os.getenv("DB_MYSQL_HOST"),
user=os.getenv("DB_MYSQL_ROOT"),
password=os.getenv("DB_MYSQL_ROOT_PASS"),
port=int(os.getenv("DB_MYSQL_PORT")),
auth_plugin="mysql_native_password",
)
def get_db_connection():
conn = mysql.connector.connect(
host=os.getenv("DB_MYSQL_HOST"),
user=os.getenv("DB_MYSQL_ROOT"),
password=os.getenv("DB_MYSQL_ROOT_PASS"),
port=int(os.getenv("DB_MYSQL_PORT")),
database="walkfiles",
auth_plugin="mysql_native_password",
)
c = conn.cursor()
c.execute("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
c.close()
return conn
# ======================================================
# DATABASE INITIALIZATION
# ======================================================
def init_db():
# Ensure DB exists
server = get_server_connection()
cur = server.cursor()
cur.execute("""
CREATE DATABASE IF NOT EXISTS walkfiles
DEFAULT CHARACTER SET utf8mb4
COLLATE utf8mb4_general_ci
""")
server.commit()
cur.close()
server.close()
# Connect
conn = get_db_connection()
cursor = conn.cursor()
# DEVICES
cursor.execute("""
CREATE TABLE IF NOT EXISTS devices (
id INT AUTO_INCREMENT PRIMARY KEY,
name VARCHAR(255) UNIQUE,
scanned_at DATETIME NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
""")
# FOLDERS
cursor.execute("""
CREATE TABLE IF NOT EXISTS folders (
id INT AUTO_INCREMENT PRIMARY KEY,
path VARCHAR(2048) NOT NULL,
parent_id INT NULL,
device_id INT NOT NULL,
first_seen DATETIME NOT NULL,
last_seen DATETIME NOT NULL,
deleted TINYINT(1) NOT NULL DEFAULT 0,
CONSTRAINT fk_folder_device
FOREIGN KEY (device_id) REFERENCES devices(id)
ON DELETE CASCADE,
UNIQUE KEY uniq_folder_path (device_id, path(255)),
INDEX idx_folder_dev (device_id)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
""")
# FILES
cursor.execute("""
CREATE TABLE IF NOT EXISTS files (
id INT AUTO_INCREMENT PRIMARY KEY,
name VARCHAR(255) NOT NULL,
path VARCHAR(2048) NOT NULL,
path_md5 CHAR(32) NOT NULL,
size BIGINT NULL,
modified DATETIME NULL,
type VARCHAR(255) NULL,
folder_id INT NULL,
device_id INT NOT NULL,
deleted TINYINT(1) NOT NULL DEFAULT 0,
first_seen DATETIME NOT NULL,
last_seen DATETIME NOT NULL,
CONSTRAINT fk_file_folder
FOREIGN KEY (folder_id) REFERENCES folders(id)
ON DELETE SET NULL,
CONSTRAINT fk_file_device
FOREIGN KEY (device_id) REFERENCES devices(id)
ON DELETE CASCADE,
UNIQUE KEY uniq_file_path_md5 (device_id, path_md5),
INDEX idx_file_folder (folder_id),
INDEX idx_file_deleted (device_id, deleted)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
""")
conn.commit()
return conn, cursor
# ======================================================
# HELPERS — DEVICES & FOLDERS
# ======================================================
def get_or_create_device(cursor, conn, name: str) -> int:
now = datetime.now()
cursor.execute("INSERT IGNORE INTO devices (name, scanned_at) VALUES (%s,%s)", (name, now))
conn.commit()
cursor.execute("SELECT id FROM devices WHERE name=%s", (name,))
return cursor.fetchone()[0]
def load_folder_state(cursor, device_id: int):
"""
Načte všechny složky pro zařízení a uloží jako:
folder_state[normalized_path] = {"id": id, "deleted": 0/1}
"""
cursor.execute("""
SELECT id, path, deleted
FROM folders
WHERE device_id=%s
""", (device_id,))
out = {}
for folder_id, path, deleted in cursor.fetchall():
norm_path = os.path.normpath(path)
out[norm_path] = {"id": folder_id, "deleted": int(deleted)}
return out
def get_or_create_folder(cursor, conn, folder_state, device_id, folder_path, parent_id):
"""
Vytvoří nebo najde složku. Ošetřuje:
- Unicode normalizaci (Černý vs Černý)
- cache v paměti (folder_state)
- idempotentní INSERT (ON DUPLICATE KEY UPDATE)
"""
# Normalize Unicode + path form
folder_path = unicodedata.normalize("NFC", folder_path)
folder_path = os.path.normpath(folder_path)
key = folder_path
# 1) Cache hit
if key in folder_state:
return folder_state[key]["id"]
now = datetime.now()
    # 2) Try a SELECT first
cursor.execute("""
SELECT id
FROM folders
WHERE device_id = %s AND path = %s
LIMIT 1
""", (device_id, folder_path))
row = cursor.fetchone()
if row:
folder_id = row[0]
folder_state[key] = {"id": folder_id, "deleted": 0}
return folder_id
# 3) INSERT (idempotent)
cursor.execute("""
INSERT INTO folders (path, parent_id, device_id, first_seen, last_seen)
VALUES (%s, %s, %s, %s, %s)
ON DUPLICATE KEY UPDATE
id = LAST_INSERT_ID(id),
last_seen = VALUES(last_seen)
""", (folder_path, parent_id, device_id, now, now))
conn.commit()
folder_id = cursor.lastrowid
folder_state[key] = {"id": folder_id, "deleted": 0}
return folder_id
# ======================================================
# LOAD LAST FILE STATE
# ======================================================
def load_last_file_state(cursor, device_id: int):
"""
Načte poslední známý stav souborů pro zařízení, indexovaný podle path_md5.
(Z historických důvodů přes MAX(id), i když máš UNIQUE na (device_id, path_md5))
"""
cursor.execute("""
SELECT f.id, f.path_md5, f.deleted, f.size, f.modified
FROM files f
JOIN (
SELECT MAX(id) AS mx
FROM files
WHERE device_id=%s
GROUP BY path_md5
) t ON f.id = t.mx
""", (device_id,))
out = {}
for fid, md5, deleted, size, modified in cursor.fetchall():
out[md5] = {
"id": fid,
"deleted": int(deleted),
"size": size,
"modified": modified,
}
return out
# ======================================================
# MAIN SCANNER WITH BATCHING
# ======================================================
def walk_and_store_bulk():
BATCH_SIZE = 10000
# target_dir = r"\\tower1\#colddata"
# target_dir = r"z:"
target_dir = r"\\tower\ebooks"
# target_dir = r"\\tower\dedup"
device_name = "TOWER"
    # Normalized root for comparisons and LIKE
target_dir_norm = os.path.normpath(target_dir)
if not os.path.isdir(target_dir):
print("Invalid directory:", target_dir)
return
conn, cursor = init_db()
now = datetime.now()
device_id = get_or_create_device(cursor, conn, device_name)
folder_state = load_folder_state(cursor, device_id)
file_state = load_last_file_state(cursor, device_id)
seen_folders = set()
seen_files = set() # MD5 of path
files_to_insert = []
files_to_update = []
total_files = 0
print(f"🔍 Scanning: {target_dir} (device {device_id})")
# -------------------------------------------------
# WALK FILESYSTEM
# -------------------------------------------------
for root, dirs, files in os.walk(target_dir):
folder_path = os.path.normpath(root)
        # 1) determine parent_id correctly
if folder_path == target_dir_norm:
parent_id = None
else:
parent_folder_path = os.path.normpath(os.path.dirname(folder_path))
parent_id = get_or_create_folder(cursor, conn, folder_state,
device_id, parent_folder_path,
None)
        # 2) now insert current folder with correct parent_id
seen_folders.add(folder_path)
folder_id = get_or_create_folder(cursor, conn, folder_state,
device_id, folder_path,
parent_id)
# -------------------------------------------------
# FILE LOOP
# -------------------------------------------------
for name in files:
total_files += 1
filepath = os.path.normpath(os.path.join(folder_path, name))
md5 = md5_path(filepath)
seen_files.add(md5)
try:
st = os.stat(filepath)
except FileNotFoundError:
continue
modified = datetime.fromtimestamp(st.st_mtime).replace(microsecond=0)
size = st.st_size
ext = os.path.splitext(name)[1][:250]
prev = file_state.get(md5)
if prev is None:
                # new file
files_to_insert.append(
(name, filepath, md5, size, modified, ext,
folder_id, device_id, 0, now, now)
)
else:
if prev["deleted"] == 1:
# "vzkříšený" soubor
files_to_insert.append(
(name, filepath, md5, size, modified, ext,
folder_id, device_id, 0, now, now)
)
else:
                    # exists and not deleted → check for size / mtime change
if prev["size"] != size or prev["modified"] != modified:
files_to_update.append(
(size, modified, now, prev["id"])
)
# -------------------------------------------------
# BATCH FLUSHING
# -------------------------------------------------
if len(files_to_insert) >= BATCH_SIZE:
print(f"💾 Flushing {len(files_to_insert)} inserts...")
cursor.executemany("""
INSERT INTO files (
name, path, path_md5, size, modified, type,
folder_id, device_id, deleted,
first_seen, last_seen
)
VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
""", files_to_insert)
conn.commit()
files_to_insert.clear()
if len(files_to_update) >= BATCH_SIZE:
print(f"💾 Flushing {len(files_to_update)} updates...")
cursor.executemany("""
UPDATE files
SET size=%s, modified=%s, last_seen=%s, deleted=0
WHERE id=%s
""", files_to_update)
conn.commit()
files_to_update.clear()
# PROGRESS
if total_files % 1000 == 0:
print(f" ... processed {total_files} files")
# -------------------------------------------------
# FINAL FLUSH (REMAINING INSERTS/UPDATES)
# -------------------------------------------------
if files_to_insert:
print(f"💾 Final flush: {len(files_to_insert)} inserts")
cursor.executemany("""
INSERT INTO files (
name, path, path_md5, size, modified, type,
folder_id, device_id, deleted,
first_seen, last_seen
)
VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
""", files_to_insert)
conn.commit()
if files_to_update:
print(f"💾 Final flush: {len(files_to_update)} updates")
cursor.executemany("""
UPDATE files
SET size=%s, modified=%s, last_seen=%s, deleted=0
WHERE id=%s
""", files_to_update)
conn.commit()
# -------------------------------------------------
# MARK DELETED FILES — ONLY IN THIS SUBTREE
# -------------------------------------------------
files_deleted_count = 0
like_prefix = target_dir_norm.rstrip("\\/") + "%"
cursor.execute("""
SELECT id, path_md5
FROM files
WHERE device_id = %s
AND deleted = 0
AND path LIKE %s
""", (device_id, like_prefix))
candidates = cursor.fetchall()
ids_to_delete = [fid for (fid, md5) in candidates if md5 not in seen_files]
if ids_to_delete:
print(f"💾 Marking {len(ids_to_delete)} files as deleted in subtree")
cursor.executemany("""
UPDATE files
SET deleted=1, last_seen=%s
WHERE id=%s
""", [(now, fid) for fid in ids_to_delete])
conn.commit()
files_deleted_count = len(ids_to_delete)
# -------------------------------------------------
# MARK DELETED FOLDERS — ONLY IN THIS SUBTREE
# -------------------------------------------------
folders_to_mark_deleted = []
for path, info in folder_state.items():
        # restrict to this subtree (including the root folder)
norm_path = os.path.normpath(path)
if not norm_path.startswith(target_dir_norm):
continue
if info["deleted"] == 0 and norm_path not in seen_folders:
folders_to_mark_deleted.append((now, info["id"]))
folders_deleted_count = 0
if folders_to_mark_deleted:
cursor.executemany("""
UPDATE folders
SET deleted=1, last_seen=%s
WHERE id=%s
""", folders_to_mark_deleted)
conn.commit()
folders_deleted_count = len(folders_to_mark_deleted)
# -------------------------------------------------
# Update device timestamp
# -------------------------------------------------
cursor.execute("UPDATE devices SET scanned_at=%s WHERE id=%s", (now, device_id))
conn.commit()
cursor.close()
conn.close()
print("")
print("✅ Scan completed.")
print(" Total files scanned:", total_files)
print(" Files inserted:", len(files_to_insert)) # po flushi je 0, ale nechávám pro konzistenci
print(" Files updated:", len(files_to_update)) # dtto
print(" Files deleted in subtree:", files_deleted_count)
print(" Folders deleted in subtree:", folders_deleted_count)
# ======================================================
# MAIN ENTRY
# ======================================================
if __name__ == '__main__':
walk_and_store_bulk()
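
The folder upsert above leans on a MySQL idiom worth isolating: `INSERT ... ON DUPLICATE KEY UPDATE id = LAST_INSERT_ID(id)` makes `cursor.lastrowid` return the existing row's id on a duplicate instead of 0, so insert and lookup collapse into one statement. A minimal standalone sketch; the `tags` table, database name, and credentials here are illustrative, not from this repo:

import mysql.connector

# Illustrative connection; swap in real credentials.
conn = mysql.connector.connect(host="localhost", user="root",
                               password="secret", database="walkfiles_demo")
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS tags (
        id INT AUTO_INCREMENT PRIMARY KEY,
        name VARCHAR(255) NOT NULL,
        UNIQUE KEY uniq_name (name)
    ) ENGINE=InnoDB
""")

def get_or_create_tag(name: str) -> int:
    # On a duplicate key, LAST_INSERT_ID(id) makes cursor.lastrowid return
    # the existing row's id instead of 0, so both branches yield a usable id.
    cur.execute("""
        INSERT INTO tags (name) VALUES (%s)
        ON DUPLICATE KEY UPDATE id = LAST_INSERT_ID(id)
    """, (name,))
    conn.commit()
    return cur.lastrowid

print(get_or_create_tag("ebooks"))  # inserts, returns the new id
print(get_or_create_tag("ebooks"))  # duplicate, returns the same id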


@@ -8,7 +8,7 @@ import mysql.connector
 from dotenv import load_dotenv
 from pathlib import Path
 from concurrent.futures import ThreadPoolExecutor, as_completed
-import threading

 # ======================================================
 # Load environment
@@ -16,47 +16,6 @@ import threading
 env_path = Path(__file__).resolve().parent / ".env"
 load_dotenv(env_path)

-# ======================================================
-# LOGGING TOGGLE (ON/OFF)
-# ======================================================
-LOGGING_ENABLED = False  # ← NEW: Set to False to silence all thread debug logs
-
-# ======================================================
-# Colors & logging helpers
-# ======================================================
-RESET = "\033[0m"
-COLORS = [
-    "\033[92m",  # green
-    "\033[94m",  # blue
-    "\033[93m",  # yellow
-    "\033[91m",  # red
-    "\033[95m",  # magenta
-    "\033[96m",  # cyan
-    "\033[90m",  # gray
-]
-
-print_lock = threading.Lock()
-
-def thread_color():
-    name = threading.current_thread().name
-    idx = 0
-    if "_" in name:
-        suffix = name.split("_")[-1]
-        if suffix.isdigit():
-            idx = int(suffix)
-    return COLORS[idx % len(COLORS)]
-
-def log_thread(msg: str):
-    """Thread-safe, colored log with thread name prefix."""
-    if not LOGGING_ENABLED:  # ← NEW
-        return
-    with print_lock:
-        name = threading.current_thread().name
-        color = thread_color()
-        print(f"{color}[{name}] {msg}{RESET}")
-
 # ======================================================
 # MySQL connection (each thread gets its own)
@@ -82,10 +41,7 @@ def get_db_connection():
 def file_md5(path, chunk_size=1024 * 1024):
     md5 = hashlib.md5()
     with open(path, "rb") as f:
-        while True:
-            chunk = f.read(chunk_size)
-            if not chunk:
-                break
+        while chunk := f.read(chunk_size):
             md5.update(chunk)
     return md5.hexdigest()
@@ -108,28 +64,24 @@ def process_one_file(row):
     file_id = row["id"]
     path = row["path"]
     modified = row["modified"]
-    prev_md5 = row.get("content_md5")
-    prev_calc = row.get("md5_calculated")
-
-    log_thread(f"START ID={file_id} → {path}")
+    prev_md5 = row["content_md5"]
+    prev_calc = row["md5_calculated"]

     # --- Skip if file does not exist ---
     if not os.path.isfile(path):
-        log_thread(f"MISS ID={file_id} (file not found)")
-        return file_id, "missing", None
+        return (file_id, "missing", None)

-    # --- Decide if MD5 calculation is needed ---
-    if prev_md5 and prev_calc and prev_calc >= modified:
-        log_thread(f"SKIP ID={file_id} (md5 up-to-date)")
-        return file_id, "skip", None
+    # --- Decide if MD5 needed ---
+    need_md5 = (
+        prev_md5 is None or
+        prev_calc is None or
+        prev_calc < modified
+    )
+    if not need_md5:
+        return (file_id, "skip", None)

     # --- Calculate MD5 ---
-    try:
-        new_md5 = file_md5(path)
-    except Exception as e:
-        log_thread(f"ERROR ID={file_id} while reading file: {e}")
-        return file_id, "error", str(e)
+    new_md5 = file_md5(path)

     now = datetime.now().replace(microsecond=0)

     # --- Update DB inside thread ---
@@ -145,17 +97,14 @@ def process_one_file(row):
         conn.commit()
         c.close()
         conn.close()
-
-        log_thread(f"UPDATE ID={file_id} (MD5={new_md5})")
-        return file_id, "updated", new_md5
+        return (file_id, "updated", new_md5)
     except Exception as e:
-        log_thread(f"ERROR ID={file_id} DB update failed: {e}")
-        return file_id, "error", str(e)
+        return (file_id, "error", str(e))

 # ======================================================
-# MAIN LOGIC
+# MAIN LOGIC (single-threaded DB query + multi-threaded MD5)
 # ======================================================
 def run_md5_calculator(device_name=None,
                        device_id=None,
@@ -164,7 +113,9 @@ def run_md5_calculator(device_name=None,
                        path_prefix=None,
                        threads=8):
-    # DEVICE resolution
+    # ----------------------------
+    # DEVICE filter resolution
+    # ----------------------------
     filter_by_device = True
     if device_name == "ANY" or device_id == "ANY":
         filter_by_device = False
@@ -177,17 +128,21 @@ def run_md5_calculator(device_name=None,
         cur = conn.cursor(dictionary=True)
         cur.execute("SELECT id FROM devices WHERE name=%s", (device_name,))
         row = cur.fetchone()
-        cur.close()
-        conn.close()
+        cur.close(); conn.close()
         if not row:
             raise RuntimeError(f"Device '{device_name}' not found")
         device_id = row["id"]

+    # EXTENSION filter
     filter_by_extension = (extension != "ANY")
+
+    # SIZE filter
     filter_by_size = (max_size != "ANY")
     max_bytes = parse_size(max_size) if filter_by_size else None

+    # PATH filter
     filter_by_path = (path_prefix not in [None, "", "ANY"])
     cleaned_prefix = path_prefix.rstrip("\\/") if filter_by_path else None
@@ -197,7 +152,9 @@ def run_md5_calculator(device_name=None,
           f" max_size={max_size},"
           f" prefix={path_prefix}\n")

-    # Fetch rows
+    # ---------------------------------------
+    # Fetch all rows in a single DB query
+    # ---------------------------------------
     conn = get_db_connection()
     cursor = conn.cursor(dictionary=True)
@@ -224,38 +181,34 @@ def run_md5_calculator(device_name=None,
         SELECT id, path, size, modified, content_md5, md5_calculated
         FROM files
         WHERE {" AND ".join(where)}
+          AND NOT (
+              content_md5 IS NOT NULL
+              AND md5_calculated IS NOT NULL
+              AND md5_calculated >= modified
+          )
     """
     cursor.execute(sql, params)
     rows = cursor.fetchall()
-    cursor.close()
-    conn.close()
+    cursor.close(); conn.close()

     total = len(rows)
     print(f"📁 Files matching criteria: {total}\n")

-    if total == 0:
-        print("Nothing to do, exiting.")
-        return
+    # ======================================================
+    # === MULTITHREADED MD5 CALCULATION BELOW ============
+    # ======================================================
+    updated = 0
+    skipped = 0
+    missing = 0
+    errors = 0

-    # MULTITHREADED MD5
-    updated = skipped = missing = errors = 0
-    with ThreadPoolExecutor(max_workers=threads, thread_name_prefix="Worker") as exe:
+    with ThreadPoolExecutor(max_workers=threads) as exe:
         futures = {exe.submit(process_one_file, r): r["id"] for r in rows}
-        for i, future in enumerate(as_completed(futures), start=1):
+        for future in as_completed(futures):
             file_id = futures[future]
-            status, result = None, None
             try:
-                _id, status, result = future.result()
+                file_id, status, result = future.result()
             except Exception as e:
-                log_thread(f"FUTURE ERROR for ID={file_id}: {e}")
+                print(f"❌ Thread error for ID {file_id}: {e}")
                 errors += 1
                 continue
@@ -267,11 +220,11 @@ def run_md5_calculator(device_name=None,
                 missing += 1
             elif status == "error":
                 errors += 1
+                print(f"⚠️ DB update error: {result}")

-            if i % 100 == 0:
-                print(f"… processed {i}/{total} files")
-
+    # ======================================================
     # SUMMARY
+    # ======================================================
     print("\n============================")
     print("✅ Multithreaded MD5 finished")
     print("============================")
@@ -282,6 +235,7 @@ def run_md5_calculator(device_name=None,
     print(f"Threads: {threads}\n")

+
 # ======================================================
 # RUN EXAMPLE
 # ======================================================
@@ -291,5 +245,5 @@ if __name__ == "__main__":
         extension="ANY",
         max_size="ANY",
         path_prefix="ANY",
-        threads=6
+        threads=12  # ← ADJUST THREAD COUNT HERE
     )
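
The `file_md5()` rewrite in this diff is the Python 3.8+ walrus pattern for chunked hashing. Pulled out of the diff, it runs standalone; the self-hash at the bottom is just a demo:

import hashlib

def file_md5(path: str, chunk_size: int = 1024 * 1024) -> str:
    md5 = hashlib.md5()
    with open(path, "rb") as f:
        # The walrus operator folds read-test-break into one loop header;
        # f.read() returns b"" at EOF, which is falsy and ends the loop.
        while chunk := f.read(chunk_size):
            md5.update(chunk)
    return md5.hexdigest()

if __name__ == "__main__":
    print(file_md5(__file__))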


@@ -0,0 +1,200 @@
import os
import sys
import pymysql
import pymysql.cursors
# ================= KONFIGURACE =================
# --- BEZPEČNOSTNÍ POJISTKA ---
# True = POUZE VÝPIS (nic se nesmaže, databáze se nezmění)
# False = OSTRÝ REŽIM (maže soubory i záznamy v DB!)
DRY_MODE = True
# 1. Přístup k MySQL
DB_CONFIG = {
'host': '192.168.1.76',
'port': 3307,
'user': 'root',
'password': 'Vlado9674+',
'db': 'torrents',
'charset': 'utf8mb4',
'autocommit': True
}
TABULKA = "file_md5_index"
# 2. Mapování cest
SERVER_PREFIX = "/mnt/user/Library"
# Používáme 'r' pro raw string, aby se zpětná lomítka chápala správně
LOCAL_PREFIX = r"\\tower1\#library"
# ===============================================
def get_connection():
return pymysql.connect(
cursorclass=pymysql.cursors.DictCursor,
**DB_CONFIG
)
def convert_path(db_path):
"""Převede cestu z Linux serveru na lokální cestu Windows."""
if db_path.startswith(SERVER_PREFIX):
relative_path = db_path[len(SERVER_PREFIX):]
# Ořízneme počáteční lomítka z relativní cesty, aby fungoval join
relative_path = relative_path.lstrip("/").lstrip("\\")
# Spojí cesty a opraví lomítka
local_path = os.path.join(LOCAL_PREFIX, relative_path)
return os.path.normpath(local_path)
return None
def step_1_mark_duplicates():
print(f"\n--- KROK 1: Hledání duplicit v DB (DRY_MODE={DRY_MODE}) ---")
try:
conn = get_connection()
with conn.cursor() as cursor:
if DRY_MODE:
# V DRY_MODE jen počítáme, co bychom označili (neprovádíme UPDATE)
sql = f"""
SELECT COUNT(*) as pocet
FROM {TABULKA} t1
JOIN {TABULKA} t2 ON t1.blake3 = t2.blake3
WHERE t1.host_name = 'TOWER1'
AND t2.host_name = 'SYNOLOGY'
AND (t1.to_delete IS NULL OR t1.to_delete = 0);
"""
cursor.execute(sql)
result = cursor.fetchone()
affected = result['pocet']
print(f"[DRY-RUN] Našel jsem {affected} shodných záznamů (DB nebude změněna).")
else:
# V OSTRÉM režimu provádíme UPDATE
sql = f"""
UPDATE {TABULKA} t1
JOIN {TABULKA} t2 ON t1.blake3 = t2.blake3
SET t1.to_delete = 1
WHERE t1.host_name = 'TOWER'
AND t2.host_name = 'SYNOLOGY'
AND (t1.to_delete IS NULL OR t1.to_delete = 0);
"""
print("Provádím UPDATE záznamů v databázi...")
cursor.execute(sql)
affected = cursor.rowcount
conn.commit()
print(f"Hotovo. Označeno {affected} záznamů ke smazání.")
conn.close()
return affected
except pymysql.MySQLError as e:
print(f"Chyba MySQL při označování: {e}")
sys.exit(1)
def step_2_delete_files():
print(f"\n--- KROK 2: Mazání souborů (DRY_MODE={DRY_MODE}) ---")
try:
conn = get_connection()
files_to_process = []
with conn.cursor() as cursor:
print("Stahuji seznam souborů...")
if DRY_MODE:
# V DRY_MODE nemůžeme hledat podle 'to_delete=1' (protože jsme nic neoznačili),
# takže musíme použít JOIN dotaz přímo pro simulaci výpisu.
sql = f"""
SELECT t1.id, t1.full_path
FROM {TABULKA} t1
JOIN {TABULKA} t2 ON t1.blake3 = t2.blake3
WHERE t1.host_name = 'TOWER'
AND t2.host_name = 'SYNOLOGY'
AND (t1.to_delete IS NULL OR t1.to_delete = 0)
"""
else:
# V OSTRÉM režimu bereme to, co jsme v kroku 1 označili
sql = f"SELECT id, full_path FROM {TABULKA} WHERE host_name = 'TOWER' AND to_delete = 1"
cursor.execute(sql)
files_to_process = cursor.fetchall()
count = len(files_to_process)
print(f"Nalezeno {count} souborů.")
if count == 0:
print("Žádné soubory k zpracování. Konec.")
return
# V ostrém režimu se zeptáme na potvrzení
if not DRY_MODE:
confirm = input(f"-> [POZOR] Opravdu chcete SMAZAT {count} souborů? (napište 'ano'): ")
if confirm.lower() != 'ano':
print("Operace zrušena.")
return
else:
print("-" * 40)
print("VÝPIS SOUBORŮ, KTERÉ BY BYLY SMAZÁNY:")
print("-" * 40)
deleted_counter = 0
errors = 0
for row in files_to_process:
db_id = row['id']
server_path = row['full_path']
local_path = convert_path(server_path)
if not local_path:
print(f"[SKIP PATH] Nesedí prefix: {server_path}")
continue
# --- LOGIKA DRY RUN vs REAL ---
if DRY_MODE:
# Pouze výpis
print(f"[DRY-RUN] Bylo by smazáno: {local_path}")
deleted_counter += 1
else:
# Ostré mazání
try:
if os.path.exists(local_path):
os.remove(local_path)
print(f"[OK SMAZÁNO] {local_path}")
# Smazání z DB
with conn.cursor() as del_cursor:
del_sql = f"DELETE FROM {TABULKA} WHERE id = %s"
del_cursor.execute(del_sql, (db_id,))
conn.commit()
deleted_counter += 1
else:
print(f"[NENÍ NA DISKU] Mažu jen z DB: {local_path}")
with conn.cursor() as del_cursor:
del_sql = f"DELETE FROM {TABULKA} WHERE id = %s"
del_cursor.execute(del_sql, (db_id,))
conn.commit()
deleted_counter += 1
except OSError as e:
print(f"[CHYBA OS] {local_path}: {e}")
errors += 1
except pymysql.MySQLError as e:
print(f"[CHYBA DB] ID {db_id}: {e}")
conn.close()
print("-" * 30)
if DRY_MODE:
print(f"DRY RUN DOKONČEN. Zobrazena simulace pro {deleted_counter} souborů.")
else:
print(f"HOTOVO. Úspěšně smazáno: {deleted_counter}, Chyby: {errors}")
except pymysql.MySQLError as e:
print(f"Kritická chyba DB: {e}")
if __name__ == "__main__":
step_1_mark_duplicates()
step_2_delete_files()
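
convert_path() is the load-bearing piece here, so a quick standalone check of the Linux-to-UNC mapping may help; the sample paths below are made up:

import os

SERVER_PREFIX = "/mnt/user/Library"
LOCAL_PREFIX = r"\\tower1\#library"

def convert_path(db_path):
    # Same logic as above: strip the server prefix, then re-root under the UNC share.
    if db_path.startswith(SERVER_PREFIX):
        relative_path = db_path[len(SERVER_PREFIX):].lstrip("/").lstrip("\\")
        return os.path.normpath(os.path.join(LOCAL_PREFIX, relative_path))
    return None

# On Windows this prints: \\tower1\#library\Books\novel.epub
print(convert_path("/mnt/user/Library/Books/novel.epub"))
print(convert_path("/mnt/cache/other.bin"))  # None (prefix does not match)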

PST/10 ReadKulhavaPST.py Normal file

@@ -0,0 +1,92 @@
import win32com.client
import os
# Your specific file path
pst_path = r'd:\Dropbox\!!!Days\Downloads Z230\PST\tkulhava.pst'
def main():
if not os.path.exists(pst_path):
print(f"Error: File not found at {pst_path}")
return
try:
# Connect to Outlook
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
# 1. Add the PST to Outlook (This makes it visible in the sidebar)
print(f"Mounting PST: {pst_path}...")
outlook.AddStore(pst_path)
# 2. Find the folder object for this PST
# We search specifically for the folder that matches the filename 'tkulhava'
# or grab the last added store if the name doesn't match exactly.
pst_name = "tkulhava" # derived from filename usually
root_folder = None
# Loop through all stores to find the new one
for folder in outlook.Folders:
if pst_name.lower() in folder.Name.lower():
root_folder = folder
break
# Fallback: Just grab the last folder in the list if name didn't match
if not root_folder:
root_folder = outlook.Folders.GetLast()
print(f"Successfully opened root folder: {root_folder.Name}")
print("=" * 50)
# 3. Start the recursive walk
print_subjects_recursively(root_folder)
# 4. Cleanup: Remove the PST from Outlook
# (Comment this out if you want to keep it open in Outlook to inspect manually)
outlook.RemoveStore(root_folder)
print("\nDone. PST detached.")
except Exception as e:
print(f"An error occurred: {e}")
def print_subjects_recursively(folder):
"""
Recursively prints subjects of emails in a folder and its subfolders.
"""
try:
# Print current folder name for context
# Check if folder has items
if folder.Items.Count > 0:
print(f"\n--- Folder: {folder.Name} ---")
# Iterate through items
for item in folder.Items:
try:
# Class 43 is a standard MailItem.
# Other items (meeting requests, reports) might not have a Subject or behave differently.
if item.Class == 43:
print(f"Subject: {item.Subject}")
else:
# Attempt to print subject anyway (e.g., for Meeting Items)
print(f"[{type_name(item.Class)}] Subject: {item.Subject}")
except Exception:
# Skip items that are corrupted or unreadable
pass
# Recursion: Go deeper into subfolders
for subfolder in folder.Folders:
print_subjects_recursively(subfolder)
except Exception as e:
print(f"Skipping restricted folder '{folder.Name}': {e}")
def type_name(class_id):
# Helper to identify non-email items
if class_id == 53: return "Meeting"
if class_id == 46: return "Report"
return f"Type {class_id}"
if __name__ == "__main__":
main()
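
The recursive walk can also be phrased as a generator over the same COM attributes (Folders, Items, Name), which makes the traversal reusable for other per-item jobs. A sketch, assuming `root_folder` was mounted as in main():

def iter_items(folder):
    """Yield (folder_name, item) pairs for a folder and all its subfolders."""
    try:
        for item in folder.Items:
            yield folder.Name, item
        for subfolder in folder.Folders:
            yield from iter_items(subfolder)
    except Exception:
        # Restricted or corrupted folders are skipped, mirroring the original.
        return

# Usage (after AddStore and locating root_folder as in main()):
# for folder_name, item in iter_items(root_folder):
#     if item.Class == 43:  # 43 = standard MailItem, as noted above
#         print(folder_name, "->", item.Subject)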


@@ -0,0 +1,142 @@
import win32com.client
import os
import pathlib
# --- CONFIGURATION ---
pst_path = r'd:\Dropbox\!!!Days\Downloads Z230\PST\tkulhava.pst'
output_dir = r'd:\Dropbox\!!!Days\Downloads Z230\PST\pictures'
# Image extensions to look for (case insensitive)
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tif', '.tiff'}
def fix_encoding(text):
"""Repairs text wrongly decoded as cp1252 instead of cp1250."""
if not text: return ""
try:
return text.encode('cp1252').decode('cp1250')
except Exception:
return text
def get_unique_filepath(directory, filename):
"""
Checks if a file exists. If so, adds a counter (_1, _2) to the filename
until a unique name is found.
"""
# Clean filename of illegal characters just in case
filename = "".join(x for x in filename if x.isalnum() or x in "._- ")
path = pathlib.Path(directory) / filename
if not path.exists():
return path
# Split name and extension
stem = path.stem
suffix = path.suffix
counter = 1
while True:
new_filename = f"{stem}_{counter}{suffix}"
new_path = pathlib.Path(directory) / new_filename
if not new_path.exists():
return new_path
counter += 1
def process_item_attachments(item, save_folder):
"""Checks an item for attachments and saves pictures."""
try:
# Check if item has attachments
if item.Attachments.Count > 0:
for attachment in item.Attachments:
try:
# Get filename and extension
fname = getattr(attachment, 'FileName', '')
if not fname: continue
# Fix encoding on filename if needed (sometimes attachments inherit bad encoding)
fname = fix_encoding(fname)
ext = os.path.splitext(fname)[1].lower()
if ext in IMAGE_EXTENSIONS:
# Determine unique path
save_path = get_unique_filepath(save_folder, fname)
# Save the file
attachment.SaveAsFile(str(save_path))
print(f" [SAVED] {save_path.name}")
except Exception as e:
print(f" [ERROR saving attachment]: {e}")
except Exception:
# Some items (like corrupted notes) fail when accessing .Attachments
pass
def scan_folder_recursively(folder, save_folder):
"""Recursively walks folders and processes items."""
try:
folder_name = fix_encoding(folder.Name)
# Optional: Print folder progress
if folder.Items.Count > 0:
print(f"Scanning Folder: {folder_name}...")
# Process items in this folder
for item in folder.Items:
process_item_attachments(item, save_folder)
# Recursion
for subfolder in folder.Folders:
scan_folder_recursively(subfolder, save_folder)
except Exception as e:
print(f"Skipping folder '{fix_encoding(folder.Name)}': {e}")
def main():
# 1. Ensure output directory exists
if not os.path.exists(output_dir):
os.makedirs(output_dir)
print(f"Created directory: {output_dir}")
if not os.path.exists(pst_path):
print(f"Error: PST file not found at {pst_path}")
return
try:
# 2. Connect to Outlook
outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")
print(f"Mounting PST: {pst_path}...")
outlook.AddStore(pst_path)
# 3. Find the PST folder
pst_name = "tkulhava" # Usually derived from filename
root_folder = None
for folder in outlook.Folders:
if pst_name.lower() in folder.Name.lower():
root_folder = folder
break
if not root_folder:
root_folder = outlook.Folders.GetLast()
print(f"Opened: {fix_encoding(root_folder.Name)}")
print(f"Saving pictures to: {output_dir}")
print("=" * 50)
# 4. Start processing
scan_folder_recursively(root_folder, output_dir)
# 5. Cleanup
outlook.RemoveStore(root_folder)
print("\nDone. PST detached.")
except Exception as e:
print(f"Critical Error: {e}")
if __name__ == "__main__":
main()
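
fix_encoding() undoes a classic mojibake: Czech text whose cp1250 bytes were decoded as cp1252. A round-trip demonstration with a made-up string:

original = "Příloha černobílá"                        # what the sender wrote
garbled = original.encode("cp1250").decode("cp1252")  # what Outlook may hand over

def fix_encoding(text):
    # Re-encode as cp1252 to recover the original bytes, then decode as cp1250.
    if not text:
        return ""
    try:
        return text.encode('cp1252').decode('cp1250')
    except Exception:
        return text

print(garbled)                # mojibake
print(fix_encoding(garbled))  # "Příloha černobílá"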


@@ -0,0 +1,67 @@
import pandas as pd
from sqlalchemy import create_engine
import time
# --- CONFIGURATION ---
db_user = 'root'
db_pass = 'Vlado9674+'
db_host = '192.168.1.76'
db_port = '3307'
db_name = 'torrents'  # <--- FILL IN THE DATABASE NAME HERE

# --- CONNECTION ---
connection_string = f'mysql+mysqlconnector://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}'
engine = create_engine(connection_string)

# SQL query - we also select full_path so Pandas can show a sample path
# WARNING: loading 5.8M text strings (full_path) takes a lot of RAM (roughly 2-4 GB).
query = """
SELECT id, blake3, file_size, full_path
FROM file_md5_index FORCE INDEX (idx_full_path_prefix)
WHERE host_name='Tower1' AND full_path LIKE '/mnt/user/#Library%'
"""

print("1. Downloading data from MySQL into RAM...")
start_load = time.time()

try:
    # Download the data
    df = pd.read_sql(query, engine)
    end_load = time.time()
    print(f"-> Data downloaded in: {end_load - start_load:.2f} seconds")
    print(f"-> Rows in memory: {len(df)}")

    print("\n2. Searching for duplicates (Pandas GroupBy)...")
    start_process = time.time()

    # Duplicate detection logic
    # Keep only the rows whose hash occurs more than once
    duplicity = df[df.duplicated(subset=['blake3'], keep=False)]

    if not duplicity.empty:
        # Grouping
        vysledek = duplicity.groupby('blake3').agg({
            'file_size': 'first',              # file size (assumed equal for equal hashes)
            'id': 'count',                     # number of occurrences
            'full_path': lambda x: x.iloc[0]   # sample first path (faster than 'first')
        }).rename(columns={'id': 'pocet_kopii'})

        # Keep only real duplicates and sort by size * copy count
        # (we want to see where the most space is wasted)
        vysledek['celkove_plytvani'] = vysledek['file_size'] * (vysledek['pocet_kopii'] - 1)
        vysledek = vysledek.sort_values('celkove_plytvani', ascending=False)

        end_process = time.time()
        print(f"-> Processed in: {end_process - start_process:.4f} seconds")

        print("\n--- TOP 20 LARGEST DUPLICATES ---")
        # Show the hash, copy count, single-file size and a sample path
        print(vysledek[['pocet_kopii', 'file_size', 'full_path']].head(20))
        print(f"\nFound a total of {len(vysledek)} unique files that have duplicates.")
    else:
        print("No duplicates were found.")

except Exception as e:
    print(f"\nERROR: {e}")
    print("Please check the database name and that you have enough RAM.")
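
The GroupBy step is easier to sanity-check on a tiny in-memory frame than on the 5.8M-row download; same logic, toy data:

import pandas as pd

df = pd.DataFrame({
    "id":        [1, 2, 3, 4],
    "blake3":    ["aa", "bb", "aa", "aa"],
    "file_size": [100, 50, 100, 100],
    "full_path": ["/a", "/b", "/c", "/d"],
})

# Rows whose hash occurs more than once (all three "aa" rows)
dupes = df[df.duplicated(subset=["blake3"], keep=False)]
result = dupes.groupby("blake3").agg(
    file_size=("file_size", "first"),
    copy_count=("id", "count"),
    sample_path=("full_path", "first"),
)
# Wasted space = size of the file times the number of redundant copies
result["wasted_bytes"] = result["file_size"] * (result["copy_count"] - 1)
print(result.sort_values("wasted_bytes", ascending=False))
# blake3 "aa": 3 copies, 200 wasted bytes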


@@ -0,0 +1,295 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
FAST FILE HASH INDEXER – WINDOWS CLIENT (EXTERNAL DISKS)
- Mode: PHYSICAL BACKUP
- Hostname in DB = Disk Label (e.g., #HD015)
- Path in DB = Relative path (e.g., /Movies/Film.mkv)
"""
import os, time
import pymysql
import socket
import platform
import sys
from blake3 import blake3
# ==============================
# CONFIG
# ==============================
CHUNK_SIZE = 5 * 1024 * 1024 # 5 MB
PROGRESS_MIN_SIZE = 500 * 1024 * 1024 # 500 MB
PROGRESS_INTERVAL = 1.0 # seconds
EXCLUDED_DIRS = {"$RECYCLE.BIN", "System Volume Information", "RECYCLER", "msdownld.tmp"}
# --- Size limits ---
FILE_MIN_SIZE = 0
FILE_MAX_SIZE = 1024 * 1024 * 1024 * 1024  # 1 TB
# --- Database settings ---
DB_CONFIG = {
"host": "192.168.1.76",
"port": 3307,
"user": "root",
"password": "Vlado9674+",
"database": "torrents",
"charset": "utf8mb4",
"autocommit": True,
}
PRINT_SKIPPED = False  # True = also print skipped files
# ==============================
# SYSTEM INFO
# ==============================
# Physical PC name (console output only; the disk label goes into the DB)
REAL_PC_HOSTNAME = socket.gethostname()
OS_NAME = platform.system()
# ==============================
# FUNCTIONS
# ==============================
def compute_blake3(path: str) -> bytes:
h = blake3()
total_size = os.path.getsize(path)
show_progress = total_size >= PROGRESS_MIN_SIZE
processed = 0
start_time = time.time()
last_report = start_time
try:
with open(path, "rb") as f:
while True:
chunk = f.read(CHUNK_SIZE)
if not chunk:
break
h.update(chunk)
processed += len(chunk)
if show_progress:
now = time.time()
if now - last_report >= PROGRESS_INTERVAL:
elapsed = now - start_time
speed = processed / elapsed if elapsed > 0 else 0
percent = processed / total_size * 100
remaining = total_size - processed
eta = remaining / speed if speed > 0 else 0
print(
f"{percent:6.2f}% | "
f"{processed/1024/1024:8.1f} / {total_size/1024/1024:.1f} MB | "
f"{speed/1024/1024:6.1f} MB/s | "
f"ETA {time.strftime('%H:%M:%S', time.gmtime(eta))}",
flush=True
)
last_report = now
if show_progress:
total_time = time.time() - start_time
avg_speed = total_size / total_time if total_time > 0 else 0
print(
f" ✅ DONE | "
f"{total_size/1024/1024:.1f} MB | "
f"avg {avg_speed/1024/1024:.1f} MB/s | "
f"time {time.strftime('%H:%M:%S', time.gmtime(total_time))}",
flush=True
)
return h.digest()
except Exception as e:
print(f"⚠️ HASH ERROR: {path} - {e}")
raise
def get_drive_info():
"""Získá písmeno disku a jeho ID (které se použije jako host_name)."""
print("\n💿 --- NASTAVENÍ SKENOVÁNÍ (EXTERNÍ DISK) ---")
# 1. Písmeno disku
while True:
drive_input = input("📂 Zadejte písmeno disku ve Windows (např. 'E'): ").strip().upper()
drive_letter = drive_input.replace(":", "").replace("\\", "").replace("/", "")
if len(drive_letter) == 1 and drive_letter.isalpha():
drive_root = f"{drive_letter}:\\"
if os.path.isdir(drive_root):
break
else:
print(f"❌ Disk {drive_root} není dostupný.")
else:
print("❌ Neplatný formát.")
# 2. Název disku -> HOST_NAME
while True:
disk_label = input("🏷️ Zadejte ID disku (bude uloženo jako 'host_name', např. '#HD015'): ").strip()
if len(disk_label) >= 2:
break
print("❌ Název je příliš krátký.")
return drive_root, disk_label
def size_allowed(size: int) -> bool:
if FILE_MIN_SIZE is not None and size < FILE_MIN_SIZE: return False
if FILE_MAX_SIZE is not None and size > FILE_MAX_SIZE: return False
return True
# ==============================
# MAIN
# ==============================
def main():
print("🚀 BLAKE3 External Disk Indexer", flush=True)
print(f"🖥 Running on PC: {REAL_PC_HOSTNAME}", flush=True)
    # Gather inputs
    scan_root, disk_hostname = get_drive_info()
    print("✅ Configuration:")
    print(f"   Source (Windows) : {scan_root}")
    print(f"   DB Hostname      : {disk_hostname}")
    print(f"   DB Paths         : /Folder/File...")
try:
db = pymysql.connect(**DB_CONFIG)
cur = db.cursor()
except Exception as e:
print(f"❌ DB Connection failed: {e}")
input("Enter pro konec...")
return
print(f"📥 Načítám index pro disk: '{disk_hostname}'...", flush=True)
# === OPTIMALIZACE: Hledáme přesně podle host_name ===
cur.execute("""
SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
FROM file_md5_index
WHERE host_name = %s
""", (disk_hostname,))
    # Map: { "/Folder/File.ext": (size, mtime) }
    indexed_map = {row[0]: (row[1], row[2]) for row in cur.fetchall()}
    print(f"✅ Found {len(indexed_map):,} files in the DB for this disk.", flush=True)
print("======================================", flush=True)
new_files = 0
skipped = 0
filtered = 0
errors = 0
seen_paths = set()
# --- SCAN ---
for root, dirs, files in os.walk(scan_root):
        # Skip system directories
dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]
for fname in files:
disk_path = os.path.join(root, fname)
            # 1. Stat (size, mtime)
try:
stat = os.stat(disk_path)
except OSError:
errors += 1
continue
size = stat.st_size
if not size_allowed(size):
filtered += 1
continue
            # 2. Build a clean path for the DB
            # E:\Filmy\Avatar.mkv -> Filmy\Avatar.mkv
try:
rel_path = os.path.relpath(disk_path, scan_root)
except ValueError:
errors += 1
continue
            # Normalize to Linux style: Filmy/Avatar.mkv
            clean_path = rel_path.replace("\\", "/")
            # Add a leading slash: /Filmy/Avatar.mkv
            if not clean_path.startswith("/"):
                clean_path = "/" + clean_path
if clean_path in seen_paths:
continue
seen_paths.add(clean_path)
mtime = int(stat.st_mtime)
# === STRICT CHECK ===
is_match = False
if clean_path in indexed_map:
db_size, db_mtime = indexed_map[clean_path]
if size == db_size and mtime == db_mtime:
is_match = True
if is_match:
skipped += 1
if PRINT_SKIPPED:
print(f"⏭ SKIP {clean_path}", flush=True)
continue
# === INSERT / UPDATE ===
print(" NEW / UPDATED", flush=True)
print(f" File: {clean_path}", flush=True)
print(f" Size: {size:,} B", flush=True)
try:
b3 = compute_blake3(disk_path)
except Exception:
errors += 1
continue
cur.execute("""
INSERT INTO file_md5_index
(os_name, host_name, full_path, file_name, directory,
file_size, mtime, blake3)
VALUES (%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
ON DUPLICATE KEY UPDATE
file_size = VALUES(file_size),
mtime = VALUES(mtime),
blake3 = VALUES(blake3),
updated_at = CURRENT_TIMESTAMP
""", (
                OS_NAME,         # e.g. 'Windows' (where the scan ran)
                disk_hostname,   # '#HD015' is stored here
                clean_path,      # '/Filmy/Avatar.mkv' is stored here
fname,
os.path.dirname(clean_path),
size,
mtime,
b3,
))
new_files += 1
print(f" Hash: {b3.hex()}", flush=True)
print("--------------------------------------", flush=True)
print("======================================", flush=True)
print(f"✅ Hotovo : {new_files}")
print(f"⏭ Shoda : {skipped}")
print(f"⚠️ Chyby : {errors}")
print("🏁 Konec.")
cur.close()
db.close()
    # input("\nPress Enter to exit...")
if __name__ == "__main__":
main()
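
Stripped of the progress reporting, compute_blake3() reduces to the standard chunked-update loop of the `blake3` package (the script stores the raw `digest()` bytes; `hexdigest()` below is just for display):

from blake3 import blake3  # pip install blake3

def hash_file(path: str, chunk_size: int = 5 * 1024 * 1024) -> str:
    # Feed the file to the hasher in fixed-size chunks so memory stays flat.
    h = blake3()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            h.update(chunk)
    return h.hexdigest()

if __name__ == "__main__":
    print(hash_file(__file__))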


@@ -0,0 +1,351 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
r"""
FAST FILE HASH INDEXER – WINDOWS CLIENT (HARDCODED CONFIG)
- Mode: PHYSICAL BACKUP
- Hostname in DB = Disk Label (e.g., #HD015)
- Path in DB = Relative path (e.g., /Movies/Film.mkv)
"""
import os, time
import pymysql
import socket
import platform
import sys
from blake3 import blake3
# ==============================
# ⚙️ USER CONFIGURATION
# ==============================
DISK_DRIVE_LETTER = "P" # (e.g., "E", "F", "P")
DISK_HOSTNAME = "#HD08" # (e.g., "#HD015")
# 🔒 SAFETY SWITCH
# True = LIST ONLY (No DB changes). "Simulates" the run.
# False = EXECUTE (Deletes and Inserts into DB).
DRY_RUN = False
# ==============================
# TECHNICAL CONFIG
# ==============================
CHUNK_SIZE = 5 * 1024 * 1024 # 5 MB
PROGRESS_MIN_SIZE = 500 * 1024 * 1024 # 500 MB
PROGRESS_INTERVAL = 1.0 # seconds
EXCLUDED_DIRS = {"$RECYCLE.BIN", "System Volume Information", "RECYCLER", "msdownld.tmp"}
# --- File Size Limits ---
FILE_MIN_SIZE = 0
FILE_MAX_SIZE = 1024 * 1024 * 1024 * 1024  # 1 TB
# --- DB Config ---
DB_CONFIG = {
"host": "192.168.1.76",
"port": 3307,
"user": "root",
"password": "Vlado9674+",
"database": "torrents",
"charset": "utf8mb4",
"autocommit": True,
}
PRINT_SKIPPED = False # Set True to see files that were already in DB
# ==============================
# SYSTEM INFO
# ==============================
REAL_PC_HOSTNAME = socket.gethostname()
OS_NAME = platform.system()
# ==============================
# FUNCTIONS
# ==============================
def compute_blake3(path: str) -> bytes:
h = blake3()
total_size = os.path.getsize(path)
show_progress = total_size >= PROGRESS_MIN_SIZE
processed = 0
start_time = time.time()
last_report = start_time
try:
with open(path, "rb") as f:
while True:
chunk = f.read(CHUNK_SIZE)
if not chunk:
break
h.update(chunk)
processed += len(chunk)
if show_progress:
now = time.time()
if now - last_report >= PROGRESS_INTERVAL:
elapsed = now - start_time
speed = processed / elapsed if elapsed > 0 else 0
percent = processed / total_size * 100
remaining = total_size - processed
eta = remaining / speed if speed > 0 else 0
print(
f"{percent:6.2f}% | "
f"{processed / 1024 / 1024:8.1f} / {total_size / 1024 / 1024:.1f} MB | "
f"{speed / 1024 / 1024:6.1f} MB/s | "
f"ETA {time.strftime('%H:%M:%S', time.gmtime(eta))}",
flush=True
)
last_report = now
if show_progress:
total_time = time.time() - start_time
avg_speed = total_size / total_time if total_time > 0 else 0
print(
f" ✅ DONE | "
f"{total_size / 1024 / 1024:.1f} MB | "
f"avg {avg_speed / 1024 / 1024:.1f} MB/s | "
f"time {time.strftime('%H:%M:%S', time.gmtime(total_time))}",
flush=True
)
return h.digest()
except Exception as e:
print(f"⚠️ HASH ERROR: {path} - {e}")
raise
def size_allowed(size: int) -> bool:
if FILE_MIN_SIZE is not None and size < FILE_MIN_SIZE: return False
if FILE_MAX_SIZE is not None and size > FILE_MAX_SIZE: return False
return True
def normalize_db_path(scan_root, disk_path):
"""
Converts a physical Windows path to the standardized DB format.
E:\Movies\File.mkv -> /Movies/File.mkv
"""
try:
rel_path = os.path.relpath(disk_path, scan_root)
except ValueError:
return None
# Windows backslash to slash
clean_path = rel_path.replace("\\", "/")
# Ensure leading slash
if not clean_path.startswith("/"):
clean_path = "/" + clean_path
return clean_path
# ==============================
# MAIN
# ==============================
def main():
print("🚀 BLAKE3 External Disk Indexer", flush=True)
print(f"🖥 Running on PC: {REAL_PC_HOSTNAME}", flush=True)
if DRY_RUN:
print("🛡️ DRY RUN MODE ACTIVE: No changes will be made to DB.", flush=True)
else:
print("⚠️ LIVE MODE: Changes WILL be committed to DB.", flush=True)
# Build root path
scan_root = f"{DISK_DRIVE_LETTER}:\\"
if not os.path.isdir(scan_root):
print(f"❌ ERROR: Drive '{scan_root}' not found!")
print(f" Please check DISK_DRIVE_LETTER in config.")
return
print(f"✅ Config:")
print(f" Source (Win) : {scan_root}")
print(f" DB Hostname : {DISK_HOSTNAME}")
try:
db = pymysql.connect(**DB_CONFIG)
cur = db.cursor()
except Exception as e:
print(f"❌ DB Connection failed: {e}")
return
print(f"📥 Loading DB index for: '{DISK_HOSTNAME}'...", flush=True)
# === LOAD EXISTING DB RECORDS ===
cur.execute("""
SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
FROM file_md5_index
WHERE host_name = %s
""", (DISK_HOSTNAME,))
indexed_map = {row[0]: (row[1], row[2]) for row in cur.fetchall()}
print(f"✅ Found {len(indexed_map):,} files in DB for this disk.", flush=True)
# =========================================================
# PHASE 1: CLEANUP (DELETE MISSING FILES)
# =========================================================
print("======================================", flush=True)
print("🧹 PHASE 1: Checking for deleted files...", flush=True)
current_disk_paths = set()
# Fast walk just to get paths
for root, dirs, files in os.walk(scan_root):
dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]
for fname in files:
disk_path = os.path.join(root, fname)
clean_path = normalize_db_path(scan_root, disk_path)
if clean_path:
current_disk_paths.add(clean_path)
paths_to_delete = set(indexed_map.keys()) - current_disk_paths
if paths_to_delete:
print(f"🗑️ Found {len(paths_to_delete):,} files to delete from DB.")
if DRY_RUN:
print("🛡️ [DRY RUN] Listing files to be deleted (No action taken):")
for p in sorted(list(paths_to_delete))[:20]: # Print first 20
print(f" - {p}")
if len(paths_to_delete) > 20:
print(f" ... and {len(paths_to_delete) - 20} more.")
else:
# Delete in batches
batch_size = 1000
to_delete_list = list(paths_to_delete)
for i in range(0, len(to_delete_list), batch_size):
batch = to_delete_list[i: i + batch_size]
format_strings = ','.join(['%s'] * len(batch))
query = f"DELETE FROM file_md5_index WHERE host_name = %s AND full_path IN ({format_strings})"
try:
cur.execute(query, [DISK_HOSTNAME] + batch)
print(f" ... deleted batch {i}-{i + len(batch)}")
except Exception as e:
print(f"❌ Error deleting batch: {e}")
# Update local map
for p in paths_to_delete:
del indexed_map[p]
print("✅ Cleanup complete.")
else:
print("✅ No deleted files detected.")
# =========================================================
# PHASE 2: SCAN & UPDATE (HASHING)
# =========================================================
print("======================================", flush=True)
print("🚀 PHASE 2: Scanning for changes & new files...", flush=True)
new_files = 0
skipped = 0
filtered = 0
errors = 0
seen_paths = set()
for root, dirs, files in os.walk(scan_root):
dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]
for fname in files:
disk_path = os.path.join(root, fname)
try:
stat = os.stat(disk_path)
except OSError:
errors += 1
continue
size = stat.st_size
if not size_allowed(size):
filtered += 1
continue
clean_path = normalize_db_path(scan_root, disk_path)
if not clean_path:
errors += 1
continue
if clean_path in seen_paths:
continue
seen_paths.add(clean_path)
mtime = int(stat.st_mtime)
# === MATCH CHECK ===
is_match = False
if clean_path in indexed_map:
db_size, db_mtime = indexed_map[clean_path]
if size == db_size and mtime == db_mtime:
is_match = True
if is_match:
skipped += 1
if PRINT_SKIPPED:
print(f"⏭ SKIP {clean_path}", flush=True)
continue
# === INSERT / UPDATE ===
print(" NEW / UPDATED", flush=True)
print(f" File: {clean_path}", flush=True)
print(f" Size: {size:,} B", flush=True)
try:
b3 = compute_blake3(disk_path)
except Exception:
errors += 1
continue
if DRY_RUN:
print(f"🛡️ [DRY RUN] Would INSERT/UPDATE: {clean_path}")
print(f" Hash: {b3.hex()}")
new_files += 1
else:
cur.execute("""
INSERT INTO file_md5_index
(os_name, host_name, full_path, file_name, directory,
file_size, mtime, blake3)
VALUES (%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
ON DUPLICATE KEY UPDATE
file_size = VALUES(file_size),
mtime = VALUES(mtime),
blake3 = VALUES(blake3),
updated_at = CURRENT_TIMESTAMP
""", (
OS_NAME,
DISK_HOSTNAME,
clean_path,
fname,
os.path.dirname(clean_path),
size,
mtime,
b3,
))
new_files += 1
print(f" Hash: {b3.hex()}", flush=True)
print("--------------------------------------", flush=True)
print("======================================", flush=True)
print(f"✅ Processed : {new_files}")
print(f"⏭ Skipped : {skipped}")
print(f"🗑 Deleted : {len(paths_to_delete)} " + ("(DRY RUN)" if DRY_RUN else ""))
print(f"⚠️ Errors : {errors}")
print("🏁 Done.")
cur.close()
db.close()
if __name__ == "__main__":
main()
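
The Phase 1 cleanup is a set difference followed by batched deletes; the batching arithmetic can be checked with plain strings (all names below are made up):

indexed_paths = {f"/file{i}.bin" for i in range(10)}   # what the DB knows
disk_paths    = {f"/file{i}.bin" for i in range(7)}    # what the disk has

paths_to_delete = indexed_paths - disk_paths           # 3 stale records

batch_size = 2
to_delete_list = sorted(paths_to_delete)
for i in range(0, len(to_delete_list), batch_size):
    batch = to_delete_list[i:i + batch_size]
    placeholders = ",".join(["%s"] * len(batch))
    # In the real script this becomes:
    # DELETE FROM file_md5_index WHERE host_name = %s AND full_path IN (<placeholders>)
    print(f"batch {i}-{i + len(batch)}: {placeholders} <- {batch}")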