#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scan a directory tree and mirror its folders and files into MySQL.

The script walks one directory tree, records every folder and file in the
``walkfiles`` database and flags rows whose path was not seen during the
walk as deleted.  Connection settings are read from a ``.env`` file located
next to this script.
"""

import os
import hashlib
from datetime import datetime
import mysql.connector
from mysql.connector import Error
from dotenv import load_dotenv
from pathlib import Path
import unicodedata

# ======================================================
# Load .env from the script directory
# ======================================================
env_path = Path(__file__).resolve().parent / ".env"
load_dotenv(env_path)

# ======================================================
# Defaults
# ======================================================
DEFAULT_MYSQL_PORT = 3306            # used when DB_MYSQL_PORT is not set
DEFAULT_TARGET_DIR = r"\\tower\ebooks"
DEFAULT_DEVICE_NAME = "TOWER"
DEFAULT_BATCH_SIZE = 10000

# Escape character used in LIKE patterns.  An explicit ESCAPE clause keeps
# backslashes in Windows/UNC paths literal.
_LIKE_ESCAPE = "!"

_INSERT_FILE_SQL = """
    INSERT INTO files (
        name, path, path_md5, size, modified, type,
        folder_id, device_id, deleted,
        first_seen, last_seen
    )
    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
"""

_UPDATE_FILE_SQL = """
    UPDATE files
    SET size=%s, modified=%s, last_seen=%s, deleted=0
    WHERE id=%s
"""


# ======================================================
# Helper: MD5 of full file path string
# ======================================================
def md5_path(path: str) -> str:
    """Return the hex MD5 digest of *path* encoded as UTF-8."""
    return hashlib.md5(path.encode("utf8")).hexdigest()


# ======================================================
# Helpers: path handling
# ======================================================
def _normalize_folder_path(path: str) -> str:
    """Return *path* in NFC Unicode form, then normalized by os.path.normpath."""
    return os.path.normpath(unicodedata.normalize("NFC", path))


def _subtree_prefix(root: str) -> str:
    """Return the string every path strictly below *root* starts with.

    A separator is appended unless *root* already ends with one (filesystem
    root) or with ':' (bare drive such as ``z:``), so that a sibling like
    ``ebooks2`` is not mistaken for a child of ``ebooks``.
    """
    if root.endswith((os.sep, ":")):
        return root
    return root + os.sep


def _escape_like(text: str, escape: str = _LIKE_ESCAPE) -> str:
    """Escape *text* for literal use inside a ``LIKE ... ESCAPE`` pattern."""
    # The escape character itself must be handled first.
    for ch in (escape, "%", "_"):
        text = text.replace(ch, escape + ch)
    return text


# ======================================================
# MySQL CONNECTIONS
# ======================================================
def _connection_params() -> dict:
    """Build the connection arguments shared by both connection helpers."""
    port = os.getenv("DB_MYSQL_PORT")
    return {
        "host": os.getenv("DB_MYSQL_HOST"),
        "user": os.getenv("DB_MYSQL_ROOT"),
        "password": os.getenv("DB_MYSQL_ROOT_PASS"),
        # A missing/empty variable used to crash with TypeError in int(None).
        "port": int(port) if port else DEFAULT_MYSQL_PORT,
        "auth_plugin": "mysql_native_password",
    }


def get_server_connection():
    """Open a connection to the MySQL server without selecting a database."""
    return mysql.connector.connect(**_connection_params())


def get_db_connection():
    """Open a connection to the ``walkfiles`` database using utf8mb4.

    The connection is closed again if the ``SET NAMES`` statement fails.
    """
    conn = mysql.connector.connect(database="walkfiles", **_connection_params())
    try:
        c = conn.cursor()
        try:
            c.execute("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
        finally:
            c.close()
    except Exception:
        conn.close()
        raise
    return conn


# ======================================================
# DATABASE INITIALIZATION
# ======================================================
def init_db():
    """Create the database and tables if missing.

    Returns:
        A ``(connection, cursor)`` pair for the ``walkfiles`` database.  The
        caller is responsible for closing both.
    """
    # Ensure DB exists
    server = get_server_connection()
    try:
        cur = server.cursor()
        try:
            cur.execute("""
                CREATE DATABASE IF NOT EXISTS walkfiles
                DEFAULT CHARACTER SET utf8mb4
                COLLATE utf8mb4_general_ci
            """)
            server.commit()
        finally:
            cur.close()
    finally:
        server.close()

    # Connect
    conn = get_db_connection()
    try:
        cursor = conn.cursor()

        # DEVICES
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS devices (
                id INT AUTO_INCREMENT PRIMARY KEY,
                name VARCHAR(255) UNIQUE,
                scanned_at DATETIME NULL
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """)

        # FOLDERS
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS folders (
                id INT AUTO_INCREMENT PRIMARY KEY,
                path VARCHAR(2048) NOT NULL,
                parent_id INT NULL,
                device_id INT NOT NULL,
                first_seen DATETIME NOT NULL,
                last_seen DATETIME NOT NULL,
                deleted TINYINT(1) NOT NULL DEFAULT 0,
                CONSTRAINT fk_folder_device
                    FOREIGN KEY (device_id) REFERENCES devices(id)
                    ON DELETE CASCADE,
                UNIQUE KEY uniq_folder_path (device_id, path(255)),
                INDEX idx_folder_dev (device_id)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """)

        # FILES
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS files (
                id INT AUTO_INCREMENT PRIMARY KEY,
                name VARCHAR(255) NOT NULL,
                path VARCHAR(2048) NOT NULL,
                path_md5 CHAR(32) NOT NULL,
                size BIGINT NULL,
                modified DATETIME NULL,
                type VARCHAR(255) NULL,
                folder_id INT NULL,
                device_id INT NOT NULL,
                deleted TINYINT(1) NOT NULL DEFAULT 0,
                first_seen DATETIME NOT NULL,
                last_seen DATETIME NOT NULL,
                CONSTRAINT fk_file_folder
                    FOREIGN KEY (folder_id) REFERENCES folders(id)
                    ON DELETE SET NULL,
                CONSTRAINT fk_file_device
                    FOREIGN KEY (device_id) REFERENCES devices(id)
                    ON DELETE CASCADE,
                UNIQUE KEY uniq_file_path_md5 (device_id, path_md5),
                INDEX idx_file_folder (folder_id),
                INDEX idx_file_deleted (device_id, deleted)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """)

        conn.commit()
    except Exception:
        conn.close()
        raise
    return conn, cursor


# ======================================================
# HELPERS — DEVICES & FOLDERS
# ======================================================
def get_or_create_device(cursor, conn, name: str) -> int:
    """Return the id of the device called *name*, inserting it if needed.

    Raises:
        RuntimeError: if the row cannot be read back after the insert.
    """
    now = datetime.now()
    cursor.execute(
        "INSERT IGNORE INTO devices (name, scanned_at) VALUES (%s,%s)",
        (name, now),
    )
    conn.commit()

    cursor.execute("SELECT id FROM devices WHERE name=%s", (name,))
    row = cursor.fetchone()
    if row is None:
        raise RuntimeError(f"Device row not found after insert: {name!r}")
    return row[0]


def load_folder_state(cursor, device_id: int):
    """Load all folders of a device.

    Returns:
        ``{normalized_path: {"id": id, "deleted": 0/1}}``.  Keys use the same
        normalization as ``get_or_create_folder`` so that both agree.
    """
    cursor.execute("""
        SELECT id, path, deleted
        FROM folders
        WHERE device_id=%s
    """, (device_id,))

    out = {}
    for folder_id, path, deleted in cursor.fetchall():
        out[_normalize_folder_path(path)] = {
            "id": folder_id,
            "deleted": int(deleted),
        }
    return out


def get_or_create_folder(cursor, conn, folder_state, device_id, folder_path, parent_id):
    """Find or create a folder row and return its id.

    Handles:
      - Unicode normalization (composed vs. decomposed characters)
      - the in-memory cache (``folder_state``)
      - idempotent INSERT (ON DUPLICATE KEY UPDATE)
      - reviving a folder that was flagged deleted by an earlier scan
    """
    # Normalize Unicode + path form
    folder_path = _normalize_folder_path(folder_path)

    # 1) Cache lookup, 2) fall back to SELECT
    entry = folder_state.get(folder_path)
    if entry is None:
        cursor.execute("""
            SELECT id, deleted
            FROM folders
            WHERE device_id = %s AND path = %s
            LIMIT 1
        """, (device_id, folder_path))
        row = cursor.fetchone()
        if row:
            entry = {"id": row[0], "deleted": int(row[1])}
            folder_state[folder_path] = entry

    if entry is not None:
        if entry["deleted"]:
            # The folder exists on disk again, so clear the deleted flag.
            cursor.execute(
                "UPDATE folders SET deleted=0, last_seen=%s WHERE id=%s",
                (datetime.now(), entry["id"]),
            )
            conn.commit()
            entry["deleted"] = 0
        return entry["id"]

    # 3) INSERT (idempotent)
    now = datetime.now()
    cursor.execute("""
        INSERT INTO folders (path, parent_id, device_id, first_seen, last_seen)
        VALUES (%s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE
            id = LAST_INSERT_ID(id),
            last_seen = VALUES(last_seen)
    """, (folder_path, parent_id, device_id, now, now))
    conn.commit()

    folder_id = cursor.lastrowid
    folder_state[folder_path] = {"id": folder_id, "deleted": 0}
    return folder_id


# ======================================================
# LOAD LAST FILE STATE
# ======================================================
def load_last_file_state(cursor, device_id: int):
    """Load the last known state of a device's files, indexed by path_md5.

    (For historical reasons the newest row is picked via MAX(id), even
    though there is a UNIQUE key on (device_id, path_md5).)
    """
    cursor.execute("""
        SELECT f.id, f.path_md5, f.deleted, f.size, f.modified
        FROM files f
        JOIN (
            SELECT MAX(id) AS mx
            FROM files
            WHERE device_id=%s
            GROUP BY path_md5
        ) t ON f.id = t.mx
    """, (device_id,))

    out = {}
    for fid, md5, deleted, size, modified in cursor.fetchall():
        out[md5] = {
            "id": fid,
            "deleted": int(deleted),
            "size": size,
            "modified": modified,
        }
    return out


# ======================================================
# MAIN SCANNER WITH BATCHING
# ======================================================
def _flush_rows(cursor, conn, sql: str, rows: list) -> int:
    """Run *sql* for every tuple in *rows*, commit, empty *rows*; return the count."""
    if not rows:
        return 0
    cursor.executemany(sql, rows)
    conn.commit()
    flushed = len(rows)
    rows.clear()
    return flushed


def _mark_deleted_files(cursor, conn, device_id, root, seen_files, now) -> int:
    """Flag files stored below *root* whose path MD5 is not in *seen_files*."""
    like_pattern = _escape_like(_subtree_prefix(root)) + "%"
    cursor.execute("""
        SELECT id, path_md5
        FROM files
        WHERE device_id = %s
          AND deleted = 0
          AND path LIKE %s ESCAPE '!'
    """, (device_id, like_pattern))

    ids_to_delete = [
        fid for fid, md5 in cursor.fetchall() if md5 not in seen_files
    ]
    if ids_to_delete:
        print(f"💾 Marking {len(ids_to_delete)} files as deleted in subtree")
        cursor.executemany("""
            UPDATE files
            SET deleted=1, last_seen=%s
            WHERE id=%s
        """, [(now, fid) for fid in ids_to_delete])
        conn.commit()
    return len(ids_to_delete)


def _mark_deleted_folders(cursor, conn, folder_state, root, seen_folders, now) -> int:
    """Flag cached folders at or below *root* that were not seen in this walk."""
    root_key = _normalize_folder_path(root)
    prefix = _subtree_prefix(root_key)

    rows = [
        (now, info["id"])
        for path, info in folder_state.items()
        if info["deleted"] == 0
        # restrict to the subtree (including the root folder itself)
        and (path == root_key or path.startswith(prefix))
        and path not in seen_folders
    ]
    if rows:
        cursor.executemany("""
            UPDATE folders
            SET deleted=1, last_seen=%s
            WHERE id=%s
        """, rows)
        conn.commit()
    return len(rows)


def walk_and_store_bulk(target_dir=DEFAULT_TARGET_DIR,
                        device_name=DEFAULT_DEVICE_NAME,
                        batch_size=DEFAULT_BATCH_SIZE):
    """Walk *target_dir* and synchronize its folders and files into MySQL.

    Args:
        target_dir: Directory tree to scan.  Stored paths are derived from
            this value, so the same spelling should be used on every run.
        device_name: Name of the ``devices`` row the scan belongs to.
        batch_size: Number of pending inserts/updates that triggers a flush.

    New files are inserted.  Files whose size or modification time changed,
    and files previously flagged deleted, are updated in place.  Files and
    folders stored below *target_dir* that were not seen are flagged deleted.
    """
    # Normalized root for comparisons and LIKE
    target_dir_norm = os.path.normpath(target_dir)

    if not os.path.isdir(target_dir):
        print("Invalid directory:", target_dir)
        return

    conn, cursor = init_db()
    try:
        now = datetime.now()

        device_id = get_or_create_device(cursor, conn, device_name)
        folder_state = load_folder_state(cursor, device_id)
        file_state = load_last_file_state(cursor, device_id)

        seen_folders = set()   # normalized folder paths (same form as folder_state keys)
        seen_files = set()     # MD5 of path

        files_to_insert = []
        files_to_update = []

        total_files = 0
        inserted_count = 0
        updated_count = 0

        print(f"🔍 Scanning: {target_dir} (device {device_id})")

        # -------------------------------------------------
        # WALK FILESYSTEM
        # -------------------------------------------------
        for root, dirs, files in os.walk(target_dir):
            folder_path = os.path.normpath(root)

            # 1️⃣ determine parent_id correctly
            if folder_path == target_dir_norm:
                parent_id = None
            else:
                parent_folder_path = os.path.normpath(os.path.dirname(folder_path))
                parent_id = get_or_create_folder(
                    cursor, conn, folder_state, device_id, parent_folder_path, None
                )

            # 2️⃣ now insert current folder with correct parent_id
            # Use the same normalization as the folder_state keys, otherwise
            # a folder with a decomposed name would be flagged deleted below.
            seen_folders.add(_normalize_folder_path(root))
            folder_id = get_or_create_folder(
                cursor, conn, folder_state, device_id, folder_path, parent_id
            )

            # -------------------------------------------------
            # FILE LOOP
            # -------------------------------------------------
            for name in files:
                total_files += 1

                filepath = os.path.normpath(os.path.join(folder_path, name))
                md5 = md5_path(filepath)
                seen_files.add(md5)

                try:
                    st = os.stat(filepath)
                except FileNotFoundError:
                    continue
                except OSError as exc:
                    # e.g. permission denied: skip this file, keep scanning
                    print(f"⚠️ Cannot stat {filepath}: {exc}")
                    continue

                modified = datetime.fromtimestamp(st.st_mtime).replace(microsecond=0)
                size = st.st_size
                ext = os.path.splitext(name)[1][:250]

                prev = file_state.get(md5)

                if prev is None:
                    # new file
                    files_to_insert.append(
                        (name, filepath, md5, size, modified, ext,
                         folder_id, device_id, 0, now, now)
                    )
                elif (prev["deleted"] == 1
                      or prev["size"] != size
                      or prev["modified"] != modified):
                    # "Resurrected" or changed file.  A resurrected file must
                    # reuse its row: inserting again would violate the
                    # UNIQUE (device_id, path_md5) key.  The UPDATE also
                    # resets deleted to 0.
                    files_to_update.append((size, modified, now, prev["id"]))

                # -------------------------------------------------
                # BATCH FLUSHING
                # -------------------------------------------------
                if len(files_to_insert) >= batch_size:
                    print(f"💾 Flushing {len(files_to_insert)} inserts...")
                    inserted_count += _flush_rows(
                        cursor, conn, _INSERT_FILE_SQL, files_to_insert
                    )

                if len(files_to_update) >= batch_size:
                    print(f"💾 Flushing {len(files_to_update)} updates...")
                    updated_count += _flush_rows(
                        cursor, conn, _UPDATE_FILE_SQL, files_to_update
                    )

                # PROGRESS
                if total_files % 1000 == 0:
                    print(f" ... processed {total_files} files")

        # -------------------------------------------------
        # FINAL FLUSH (REMAINING INSERTS/UPDATES)
        # -------------------------------------------------
        if files_to_insert:
            print(f"💾 Final flush: {len(files_to_insert)} inserts")
            inserted_count += _flush_rows(
                cursor, conn, _INSERT_FILE_SQL, files_to_insert
            )

        if files_to_update:
            print(f"💾 Final flush: {len(files_to_update)} updates")
            updated_count += _flush_rows(
                cursor, conn, _UPDATE_FILE_SQL, files_to_update
            )

        # -------------------------------------------------
        # MARK DELETED FILES — ONLY IN THIS SUBTREE
        # -------------------------------------------------
        files_deleted_count = _mark_deleted_files(
            cursor, conn, device_id, target_dir_norm, seen_files, now
        )

        # -------------------------------------------------
        # MARK DELETED FOLDERS — ONLY IN THIS SUBTREE
        # -------------------------------------------------
        folders_deleted_count = _mark_deleted_folders(
            cursor, conn, folder_state, target_dir_norm, seen_folders, now
        )

        # -------------------------------------------------
        # Update device timestamp
        # -------------------------------------------------
        cursor.execute(
            "UPDATE devices SET scanned_at=%s WHERE id=%s", (now, device_id)
        )
        conn.commit()
    finally:
        # Release the connection even when the scan fails midway.
        cursor.close()
        conn.close()

    print("")
    print("✅ Scan completed.")
    print(" Total files scanned:", total_files)
    # Running totals across all flushes (the pending lists are empty by now).
    print(" Files inserted:", inserted_count)
    print(" Files updated:", updated_count)
    print(" Files deleted in subtree:", files_deleted_count)
    print(" Folders deleted in subtree:", folders_deleted_count)


# ======================================================
# MAIN ENTRY
# ======================================================
if __name__ == '__main__':
    walk_and_store_bulk()