#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import hashlib
from datetime import datetime
from pathlib import Path

import mysql.connector
from dotenv import load_dotenv

# Always load .env from the folder where THIS script is stored
env_path = Path(__file__).resolve().parent / ".env"
load_dotenv(env_path)
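
# The connection settings are read from that .env via os.getenv() below. A minimal
# example file might look like this (placeholder values, not from any real setup):
#
#   DB_MYSQL_HOST=127.0.0.1
#   DB_MYSQL_PORT=3306
#   DB_MYSQL_ROOT=root
#   DB_MYSQL_ROOT_PASS=change-me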

# ======================================================
# πŸ”§ Helper: MD5 of full path
# ======================================================
def md5_path(path: str) -> str:
    """Return the hex MD5 of a full path (used as a fixed-length lookup key)."""
    return hashlib.md5(path.encode("utf8")).hexdigest()


# ======================================================
# πŸ”§ DB CONNECTION HELPERS
# ======================================================
def get_server_connection():
    """Connect to the MySQL server WITHOUT selecting a database."""
    conn = mysql.connector.connect(
        host=os.getenv("DB_MYSQL_HOST", "127.0.0.1"),
        user=os.getenv("DB_MYSQL_ROOT", "root"),
        password=os.getenv("DB_MYSQL_ROOT_PASS", ""),
        port=int(os.getenv("DB_MYSQL_PORT", "3306")),
        auth_plugin="mysql_native_password",
    )
    return conn


def get_db_connection():
    """Connect to the 'walkfiles' database."""
    conn = mysql.connector.connect(
        host=os.getenv("DB_MYSQL_HOST", "127.0.0.1"),
        user=os.getenv("DB_MYSQL_ROOT", "root"),
        password=os.getenv("DB_MYSQL_ROOT_PASS", ""),
        port=int(os.getenv("DB_MYSQL_PORT", "3306")),
        database="walkfiles",
        auth_plugin="mysql_native_password",
    )
    cursor = conn.cursor()
    cursor.execute("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
    cursor.close()
    return conn


# ======================================================
# πŸ—„ DB INITIALIZATION
# ======================================================
def init_db():
    """Create the walkfiles database and its tables if needed; return (conn, cursor)."""
    # 1) Ensure the database exists
    server_conn = get_server_connection()
    cur = server_conn.cursor()
    cur.execute(
        "CREATE DATABASE IF NOT EXISTS walkfiles "
        "DEFAULT CHARACTER SET utf8mb4 "
        "COLLATE utf8mb4_general_ci"
    )
    server_conn.commit()
    cur.close()
    server_conn.close()

    # 2) Connect to it
    conn = get_db_connection()
    cursor = conn.cursor()

    # Devices
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS devices (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci UNIQUE,
            scanned_at DATETIME NULL
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
    """)

    # Folders
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS folders (
            id INT AUTO_INCREMENT PRIMARY KEY,
            path VARCHAR(2048) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
            parent_id INT NULL,
            device_id INT NOT NULL,
            first_seen DATETIME NOT NULL,
            last_seen DATETIME NOT NULL,
            CONSTRAINT fk_folders_device FOREIGN KEY (device_id)
                REFERENCES devices(id) ON DELETE CASCADE,
            UNIQUE KEY uniq_folders_device_path (device_id, path(255)),
            INDEX idx_folders_device (device_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
    """)

    # Files. Note: (device_id, path_md5) is deliberately NOT unique: a file that was
    # deleted and later reappears gets a fresh row, so one path can accumulate several
    # historical rows (load_last_file_state picks the latest one per path).
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS files (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
            path VARCHAR(2048) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
            path_md5 CHAR(32) NOT NULL,
            size BIGINT NULL,
            modified DATETIME NULL,
            type VARCHAR(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL,
            folder_id INT NULL,
            device_id INT NOT NULL,
            deleted TINYINT(1) NOT NULL DEFAULT 0,
            first_seen DATETIME NOT NULL,
            last_seen DATETIME NOT NULL,
            CONSTRAINT fk_files_folder FOREIGN KEY (folder_id)
                REFERENCES folders(id) ON DELETE SET NULL,
            CONSTRAINT fk_files_device FOREIGN KEY (device_id)
                REFERENCES devices(id) ON DELETE CASCADE,
            INDEX idx_files_device_path_md5 (device_id, path_md5),
            INDEX idx_files_folder (folder_id),
            INDEX idx_files_deleted (device_id, deleted)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
    """)

    conn.commit()
    return conn, cursor


# ======================================================
# πŸ‘€ DEVICE + FOLDER HELPERS
# ======================================================
def get_or_create_device(cursor, conn, device_name: str) -> int:
    """Insert the device if it is new and return its id."""
    now = datetime.now()
    cursor.execute(
        "INSERT IGNORE INTO devices (name, scanned_at) VALUES (%s, %s)",
        (device_name, now)
    )
    conn.commit()
    cursor.execute("SELECT id FROM devices WHERE name=%s", (device_name,))
    return cursor.fetchone()[0]


def load_folder_cache(cursor, device_id: int):
    """Return {folder_path: folder_id} for every folder already known for this device."""
    cursor.execute(
        "SELECT id, path FROM folders WHERE device_id=%s",
        (device_id,)
    )
    return {path: folder_id for folder_id, path in cursor.fetchall()}


def get_or_create_folder(cursor, conn, folder_cache, device_id, folder_path, parent_path, now):
    """Return the folder id, inserting a new row (and caching it) when unseen."""
    if folder_path in folder_cache:
        folder_id = folder_cache[folder_path]
        cursor.execute("UPDATE folders SET last_seen=%s WHERE id=%s", (now, folder_id))
        return folder_id

    parent_id = folder_cache.get(parent_path)
    cursor.execute("""
        INSERT INTO folders (path, parent_id, device_id, first_seen, last_seen)
        VALUES (%s, %s, %s, %s, %s)
    """, (folder_path, parent_id, device_id, now, now))
    folder_id = cursor.lastrowid
    folder_cache[folder_path] = folder_id
    return folder_id


# ======================================================
# πŸ“‚ FILES – LOAD LAST STATE
# ======================================================
def load_last_file_state(cursor, device_id: int):
    """Return {path_md5: {...}} describing the most recent row per path for this device."""
    cursor.execute("""
        SELECT f.id, f.path_md5, f.deleted, f.size, f.modified
        FROM files f
        JOIN (
            SELECT MAX(id) AS max_id
            FROM files
            WHERE device_id = %s
            GROUP BY path_md5
        ) latest ON f.id = latest.max_id
        WHERE f.device_id = %s
    """, (device_id, device_id))

    state = {}
    for file_id, path_md5, deleted, size, modified in cursor.fetchall():
        state[path_md5] = {
            "id": file_id,
            "deleted": int(deleted),
            "size": size,
            "modified": modified,
        }
    return state


# ======================================================
# 🚢 MAIN WALK LOGIC
# ======================================================
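# Per-scan change detection implemented in walk_and_store_bulk() below:
#   * path not in the last known state          β†’ insert a new row
#   * path known but currently marked deleted   β†’ insert a fresh row (reappeared)
#   * path known, size or mtime changed         β†’ update the latest row
#   * path known but not seen during this walk  β†’ mark its latest row deleted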
def walk_and_store_bulk():
    """Walk target_dir and synchronise the files/folders tables with what is on disk."""
    updated_debug = []

    target_dir = r"u:\Dropbox\Ordinace\Dokumentace_ke_zpracovΓ‘nΓ­"
    device_name = "Z230"

    if not os.path.isdir(target_dir):
        print("Invalid directory:", target_dir)
        return

    conn, cursor = init_db()
    now = datetime.now()

    device_id = get_or_create_device(cursor, conn, device_name)
    folder_cache = load_folder_cache(cursor, device_id)
    last_state = load_last_file_state(cursor, device_id)

    seen_md5 = set()
    files_to_insert = []
    files_to_update_existing = []
    files_to_mark_deleted = []
    total_files = 0

    print(f"πŸ” Scanning: {target_dir} (device {device_id})")

    for root, dirs, files in os.walk(target_dir):
        folder_path = os.path.normpath(root)
        parent_path = os.path.normpath(os.path.dirname(root)) if root != target_dir else None
        folder_id = get_or_create_folder(
            cursor, conn, folder_cache, device_id, folder_path, parent_path, now
        )

        for name in files:
            total_files += 1
            file_path = os.path.normpath(os.path.join(root, name))
            file_md5 = md5_path(file_path)
            seen_md5.add(file_md5)

            try:
                stats = os.stat(file_path)
            except FileNotFoundError:
                # File vanished between listing and stat, skip it
                continue

            modified = datetime.fromtimestamp(stats.st_mtime).replace(microsecond=0)
            size = stats.st_size
            ext = os.path.splitext(name)[1]

            prev = last_state.get(file_md5)

            if prev is None:
                # New file
                files_to_insert.append(
                    (name, file_path, file_md5, size, modified, ext,
                     folder_id, device_id, 0, now, now)
                )
            elif prev["deleted"] == 1:
                # Reappeared file β†’ new row
                files_to_insert.append(
                    (name, file_path, file_md5, size, modified, ext,
                     folder_id, device_id, 0, now, now)
                )
            else:
                # Existing & not deleted
                # Only update if size or modified timestamp CHANGED
                if prev["size"] != size or prev["modified"] != modified:
                    files_to_update_existing.append(
                        (size, modified, now, prev["id"])
                    )
                    updated_debug.append({
                        "path": file_path,
                        "old_size": prev["size"],
                        "new_size": size,
                        "old_modified": prev["modified"],
                        "new_modified": modified,
                    })

            if total_files % 1000 == 0:
                print(f"   ... processed {total_files} files")

    # Mark files that were not seen during this walk as deleted
    for md5_hash, info in last_state.items():
        if info["deleted"] == 0 and md5_hash not in seen_md5:
            files_to_mark_deleted.append((now, info["id"]))

    # ==================================================
    # πŸ’Ύ APPLY CHANGES
    # ==================================================
    if files_to_insert:
        cursor.executemany("""
            INSERT INTO files (
                name, path, path_md5, size, modified, type,
                folder_id, device_id, deleted, first_seen, last_seen
            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """, files_to_insert)

    if files_to_update_existing:
        cursor.executemany("""
            UPDATE files
            SET size=%s, modified=%s, last_seen=%s, deleted=0
            WHERE id=%s
        """, files_to_update_existing)

    if files_to_mark_deleted:
        cursor.executemany("""
            UPDATE files
            SET deleted=1, last_seen=%s
            WHERE id=%s
        """, files_to_mark_deleted)

    cursor.execute("UPDATE devices SET scanned_at=%s WHERE id=%s", (now, device_id))

    conn.commit()
    cursor.close()
    conn.close()

    if updated_debug:
        print("\nπŸ“Œ Updated files:")
        for info in updated_debug:
            print(f"- {info['path']}")
            print(f"    size: {info['old_size']} β†’ {info['new_size']}")
            print(f"    modified: {info['old_modified']} β†’ {info['new_modified']}")

    print("βœ… Scan completed.")
    print("   Total files:", total_files)
    print("   Inserted:", len(files_to_insert))
    print("   Updated:", len(files_to_update_existing))
    print("   Marked deleted:", len(files_to_mark_deleted))


# ======================================================
# πŸ”š MAIN
# ======================================================
if __name__ == "__main__":
    walk_and_store_bulk()
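
# Example follow-up query (illustrative sketch only; run it in any MySQL client
# against the schema created by init_db above) listing files currently present
# on the "Z230" device:
#
#   SELECT f.path, f.size, f.modified
#   FROM files f
#   JOIN devices d ON d.id = f.device_id
#   WHERE d.name = 'Z230' AND f.deleted = 0
#   ORDER BY f.path;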