From b8038593abcb8675cdb2469cb019a5e7a0cf350a Mon Sep 17 00:00:00 2001 From: "vladimir.buzalka" Date: Mon, 24 Nov 2025 15:15:49 +0100 Subject: [PATCH] z230 --- 20 Walkandsave.py | 387 ++++++++++++++++++++++++++++++++++-------- 21 WalkandSave.py | 391 ++++++++++++++++++++++++++++++++++++++++++ 22 WalkandSave.py | 413 +++++++++++++++++++++++++++++++++++++++++++++ 30 Test.py | 28 +++ 40 Test3.py | 11 ++ 50 MD5calculate.py | 192 +++++++++++++++++++++ credentials.env | 6 - 7 files changed, 1354 insertions(+), 74 deletions(-) create mode 100644 21 WalkandSave.py create mode 100644 22 WalkandSave.py create mode 100644 30 Test.py create mode 100644 40 Test3.py create mode 100644 50 MD5calculate.py delete mode 100644 credentials.env diff --git a/20 Walkandsave.py b/20 Walkandsave.py index 791e0d0..f72e6bd 100644 --- a/20 Walkandsave.py +++ b/20 Walkandsave.py @@ -1,106 +1,357 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + import os -import mysql.connector +import hashlib from datetime import datetime +import mysql.connector from dotenv import load_dotenv +from pathlib import Path -load_dotenv() # Reads .env file and adds to environment +# Always load .env from the folder where THIS script is stored +env_path = Path(__file__).resolve().parent / ".env" +load_dotenv(env_path) -# Database setup with explicit UTF8MB4 collation -def init_db(): +# ====================================================== +# 🔧 Helper: MD5 of full path +# ====================================================== + +def md5_path(path: str) -> str: + return hashlib.md5(path.encode("utf8")).hexdigest() + +# ====================================================== +# 🔧 DB CONNECTION HELPERS +# ====================================================== + +def get_server_connection(): + """Connect to MySQL server WITHOUT selecting a database.""" conn = mysql.connector.connect( - host=os.getenv("DB_MYSQL_HOST"), - user=os.getenv("DB_MYSQL_ROOT"), - password=os.getenv("DB_MYSQL_ROOT_PASS"), - 
database=os.getenv("walkfiles"), - port=int(os.getenv("DB_MYSQL_PORT", 3306)), - charset="utf8mb4", - collation="utf8mb4_general_ci" + host=os.getenv("DB_MYSQL_HOST", "127.0.0.1"), + user=os.getenv("DB_MYSQL_ROOT", "root"), + password=os.getenv("DB_MYSQL_ROOT_PASS", ""), + port=int(os.getenv("DB_MYSQL_PORT", "3306")), + auth_plugin="mysql_native_password", + ) + return conn + + +def get_db_connection(): + """Connect to the 'walkfiles' database.""" + conn = mysql.connector.connect( + host=os.getenv("DB_MYSQL_HOST", "127.0.0.1"), + user=os.getenv("DB_MYSQL_ROOT", "root"), + password=os.getenv("DB_MYSQL_ROOT_PASS", ""), + port=int(os.getenv("DB_MYSQL_PORT", "3306")), + database="walkfiles", + auth_plugin="mysql_native_password", ) cursor = conn.cursor() - cursor.execute("CREATE DATABASE IF NOT EXISTS walkfiles CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci") - cursor.execute("USE walkfiles") + cursor.execute("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci") + cursor.close() + return conn - cursor.execute('''CREATE TABLE IF NOT EXISTS devices ( - id INT AUTO_INCREMENT PRIMARY KEY, - name VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci UNIQUE, - scanned_at DATETIME - ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci''') +# ====================================================== +# 🗄 DB INITIALIZATION +# ====================================================== - cursor.execute('''CREATE TABLE IF NOT EXISTS folders ( - id INT AUTO_INCREMENT PRIMARY KEY, - path TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci UNIQUE, - parent_id INT, - device_id INT, - FOREIGN KEY(device_id) REFERENCES devices(id) - ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci''') +def init_db(): + # 1) Ensure DB exists + server_conn = get_server_connection() + cur = server_conn.cursor() + cur.execute( + "CREATE DATABASE IF NOT EXISTS walkfiles " + "DEFAULT CHARACTER SET utf8mb4 " + "COLLATE utf8mb4_general_ci" + ) + server_conn.commit() + cur.close() 
+ server_conn.close() - cursor.execute('''CREATE TABLE IF NOT EXISTS files ( - id INT AUTO_INCREMENT PRIMARY KEY, - name VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci, - path TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci UNIQUE, - size BIGINT, - modified DATETIME, - type VARCHAR(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci, - folder_id INT, - device_id INT, - FOREIGN KEY(folder_id) REFERENCES folders(id), - FOREIGN KEY(device_id) REFERENCES devices(id) - ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci''') + # 2) Connect + conn = get_db_connection() + cursor = conn.cursor() + + # Devices + cursor.execute(""" + CREATE TABLE IF NOT EXISTS devices ( + id INT AUTO_INCREMENT PRIMARY KEY, + name VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci UNIQUE, + scanned_at DATETIME NULL + ) ENGINE=InnoDB + DEFAULT CHARSET=utf8mb4 + """) + + # Folders + cursor.execute(""" + CREATE TABLE IF NOT EXISTS folders ( + id INT AUTO_INCREMENT PRIMARY KEY, + path VARCHAR(2048) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL, + parent_id INT NULL, + device_id INT NOT NULL, + first_seen DATETIME NOT NULL, + last_seen DATETIME NOT NULL, + + CONSTRAINT fk_folders_device + FOREIGN KEY (device_id) REFERENCES devices(id) + ON DELETE CASCADE, + + UNIQUE KEY uniq_folders_device_path (device_id, path(255)), + INDEX idx_folders_device (device_id) + ) ENGINE=InnoDB + DEFAULT CHARSET=utf8mb4 + """) + + # Files + cursor.execute(""" + CREATE TABLE IF NOT EXISTS files ( + id INT AUTO_INCREMENT PRIMARY KEY, + + name VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL, + path VARCHAR(2048) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL, + path_md5 CHAR(32) NOT NULL, + + size BIGINT NULL, + modified DATETIME NULL, + type VARCHAR(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL, + + folder_id INT NULL, + device_id INT NOT NULL, + + deleted TINYINT(1) NOT NULL DEFAULT 0, + + first_seen DATETIME NOT 
NULL, + last_seen DATETIME NOT NULL, + + CONSTRAINT fk_files_folder + FOREIGN KEY (folder_id) REFERENCES folders(id) + ON DELETE SET NULL, + + CONSTRAINT fk_files_device + FOREIGN KEY (device_id) REFERENCES devices(id) + ON DELETE CASCADE, + + UNIQUE KEY uniq_files_device_path_md5 (device_id, path_md5), + INDEX idx_files_folder (folder_id), + INDEX idx_files_deleted (device_id, deleted) + ) ENGINE=InnoDB + DEFAULT CHARSET=utf8mb4 + """) conn.commit() return conn, cursor +# ====================================================== +# 👤 DEVICE + FOLDERS HELPERS +# ====================================================== -def insert_bulk_files(cursor, conn, files_data): - if not files_data: - return - query = '''INSERT IGNORE INTO files (name, path, size, modified, type, folder_id, device_id) - VALUES (%s,%s,%s,%s,%s,%s,%s)''' - cursor.executemany(query, files_data) +def get_or_create_device(cursor, conn, device_name: str) -> int: + now = datetime.now() + cursor.execute( + "INSERT IGNORE INTO devices (name, scanned_at) VALUES (%s, %s)", + (device_name, now) + ) conn.commit() + cursor.execute("SELECT id FROM devices WHERE name=%s", (device_name,)) + return cursor.fetchone()[0] + + +def load_folder_cache(cursor, device_id: int): + cursor.execute( + "SELECT id, path FROM folders WHERE device_id=%s", + (device_id,) + ) + return {path: folder_id for folder_id, path in cursor.fetchall()} + + +def get_or_create_folder(cursor, conn, folder_cache, device_id, folder_path, parent_path, now): + if folder_path in folder_cache: + folder_id = folder_cache[folder_path] + cursor.execute("UPDATE folders SET last_seen=%s WHERE id=%s", (now, folder_id)) + return folder_id + + parent_id = folder_cache.get(parent_path) + + cursor.execute(""" + INSERT INTO folders (path, parent_id, device_id, first_seen, last_seen) + VALUES (%s, %s, %s, %s, %s) + """, (folder_path, parent_id, device_id, now, now)) + + folder_id = cursor.lastrowid + folder_cache[folder_path] = folder_id + return folder_id + +# 
====================================================== +# 📂 FILES – LOAD LAST STATE +# ====================================================== + +def load_last_file_state(cursor, device_id: int): + cursor.execute(""" + SELECT f.id, f.path_md5, f.deleted, f.size, f.modified + FROM files f + JOIN ( + SELECT MAX(id) AS max_id + FROM files + WHERE device_id = %s + GROUP BY path_md5 + ) latest ON f.id = latest.max_id + WHERE f.device_id = %s + """, (device_id, device_id)) + + state = {} + for file_id, path_md5, deleted, size, modified in cursor.fetchall(): + state[path_md5] = { + "id": file_id, + "deleted": int(deleted), + "size": size, + "modified": modified + } + return state + + +# ====================================================== +# 🚶 MAIN WALK LOGIC +# ====================================================== def walk_and_store_bulk(): - target_dir = r"u:\\Dropbox\\Ordinace\\Dokumentace_ke_zpracování" - device_name = "NTB" + updated_debug = [] + target_dir = r"u:\Dropbox\Ordinace\Dokumentace_ke_zpracování" + device_name = "Z230" + + if not os.path.isdir(target_dir): + print("Invalid directory:", target_dir) + return + conn, cursor = init_db() now = datetime.now() - cursor.execute("INSERT IGNORE INTO devices (name, scanned_at) VALUES (%s, %s)", (device_name, now)) - conn.commit() - cursor.execute("SELECT id FROM devices WHERE name=%s", (device_name,)) - device_id = cursor.fetchone()[0] + device_id = get_or_create_device(cursor, conn, device_name) + folder_cache = load_folder_cache(cursor, device_id) + last_state = load_last_file_state(cursor, device_id) + + seen_md5 = set() - folder_cache = {} files_to_insert = [] + files_to_update_existing = [] + files_to_mark_deleted = [] + + total_files = 0 + print(f"🔍 Scanning: {target_dir} (device {device_id})") for root, dirs, files in os.walk(target_dir): - parent_path = os.path.dirname(root) - parent_id = folder_cache.get(parent_path) - cursor.execute("INSERT IGNORE INTO folders (path, parent_id, device_id) VALUES (%s, %s, 
%s)", (root, parent_id, device_id)) - conn.commit() - cursor.execute("SELECT id FROM folders WHERE path=%s", (root,)) - folder_id = cursor.fetchone()[0] - folder_cache[root] = folder_id + folder_path = os.path.normpath(root) + parent_path = os.path.normpath(os.path.dirname(root)) if root != target_dir else None + folder_id = get_or_create_folder(cursor, conn, folder_cache, device_id, folder_path, parent_path, now) + + for name in files: + total_files += 1 + + file_path = os.path.normpath(os.path.join(root, name)) + file_md5 = md5_path(file_path) + seen_md5.add(file_md5) - for file in files: - file_path = os.path.join(root, file) try: stats = os.stat(file_path) - modified = datetime.fromtimestamp(stats.st_mtime) - ftype = os.path.splitext(file)[1] - files_to_insert.append((file, file_path, stats.st_size, modified, ftype, folder_id, device_id)) except FileNotFoundError: continue - insert_bulk_files(cursor, conn, files_to_insert) + modified = datetime.fromtimestamp(stats.st_mtime).replace(microsecond=0) + size = stats.st_size + ext = os.path.splitext(name)[1] + + prev = last_state.get(file_md5) + + if prev is None: + # New file + files_to_insert.append( + (name, file_path, file_md5, size, modified, ext, + folder_id, device_id, 0, now, now) + ) + else: + if prev["deleted"] == 1: + # Reappeared file → new row + files_to_insert.append( + (name, file_path, file_md5, size, modified, ext, + folder_id, device_id, 0, now, now) + ) + else: + # Existing & not deleted + # Only update if size or modified timestamp CHANGED + if prev["size"] != size or prev["modified"] != modified: + files_to_update_existing.append( + (size, modified, now, prev["id"]) + ) + + updated_debug.append({ + "path": file_path, + "old_size": prev["size"], + "new_size": size, + "old_modified": prev["modified"], + "new_modified": modified + }) + + if total_files % 1000 == 0: + print(f" ... 
processed {total_files} files") + + # Mark missing files as deleted + for md5_hash, info in last_state.items(): + if info["deleted"] == 0 and md5_hash not in seen_md5: + files_to_mark_deleted.append((now, info["id"])) + + # ================================================== + # 💾 APPLY CHANGES + # ================================================== + + if files_to_insert: + cursor.executemany(""" + INSERT INTO files ( + name, path, path_md5, size, modified, type, + folder_id, device_id, deleted, + first_seen, last_seen + ) + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + """, files_to_insert) + + if files_to_update_existing: + cursor.executemany(""" + UPDATE files + SET size=%s, + modified=%s, + last_seen=%s, + deleted=0 + WHERE id=%s + """, files_to_update_existing) + + if files_to_mark_deleted: + cursor.executemany(""" + UPDATE files + SET deleted=1, + last_seen=%s + WHERE id=%s + """, files_to_mark_deleted) + + cursor.execute("UPDATE devices SET scanned_at=%s WHERE id=%s", (now, device_id)) + conn.commit() + cursor.close() conn.close() + if updated_debug: + print("\n📌 Updated files:") + for info in updated_debug: + print(f"- {info['path']}") + print(f" size: {info['old_size']} → {info['new_size']}") + print(f" modified: {info['old_modified']} → {info['new_modified']}") + print("✅ Scan completed.") + print(" Total files:", total_files) + print(" Inserted:", len(files_to_insert)) + print(" Updated:", len(files_to_update_existing)) + print(" Marked deleted:", len(files_to_mark_deleted)) + + +# ====================================================== +# 🔚 MAIN +# ====================================================== if __name__ == '__main__': - if not os.path.isdir(r"u:\\Dropbox\\Ordinace\\Dokumentace_ke_zpracování"): - print("Invalid directory path.") - else: - walk_and_store_bulk() - print("Scan completed for directory 'u:\\Dropbox\\Ordinace\\Dokumentace_ke_zpracování' on device 'NTB'. 
Bulk data stored efficiently in MySQL database 'walkfiles'.") \ No newline at end of file + walk_and_store_bulk() diff --git a/21 WalkandSave.py b/21 WalkandSave.py new file mode 100644 index 0000000..e59bf51 --- /dev/null +++ b/21 WalkandSave.py @@ -0,0 +1,391 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import hashlib +from datetime import datetime +import mysql.connector +from mysql.connector import Error +from dotenv import load_dotenv +from pathlib import Path + +# ====================================================== +# Load .env from the script directory +# ====================================================== + +env_path = Path(__file__).resolve().parent / ".env" +load_dotenv(env_path) + + +# ====================================================== +# Helper: MD5 of full file path string +# ====================================================== + +def md5_path(path: str) -> str: + return hashlib.md5(path.encode("utf8")).hexdigest() + + +# ====================================================== +# MySQL CONNECTIONS +# ====================================================== + +def get_server_connection(): + return mysql.connector.connect( + host=os.getenv("DB_MYSQL_HOST"), + user=os.getenv("DB_MYSQL_ROOT"), + password=os.getenv("DB_MYSQL_ROOT_PASS"), + port=int(os.getenv("DB_MYSQL_PORT")), + auth_plugin="mysql_native_password", + ) + + +def get_db_connection(): + conn = mysql.connector.connect( + host=os.getenv("DB_MYSQL_HOST"), + user=os.getenv("DB_MYSQL_ROOT"), + password=os.getenv("DB_MYSQL_ROOT_PASS"), + port=int(os.getenv("DB_MYSQL_PORT")), + database="walkfiles", + auth_plugin="mysql_native_password", + ) + + c = conn.cursor() + c.execute("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci") + c.close() + return conn + + +# ====================================================== +# DATABASE INITIALIZATION +# ====================================================== + +def init_db(): + # -- Ensure DB exists -- + server = get_server_connection() 
+ cur = server.cursor() + cur.execute(""" + CREATE DATABASE IF NOT EXISTS walkfiles + DEFAULT CHARACTER SET utf8mb4 + COLLATE utf8mb4_general_ci + """) + server.commit() + cur.close() + server.close() + + # -- Connect to DB -- + conn = get_db_connection() + cursor = conn.cursor() + + # ============================ + # DEVICES + # ============================ + cursor.execute(""" + CREATE TABLE IF NOT EXISTS devices ( + id INT AUTO_INCREMENT PRIMARY KEY, + name VARCHAR(255) UNIQUE, + scanned_at DATETIME NULL + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; + """) + + # ============================ + # FOLDERS + # ============================ + cursor.execute(""" + CREATE TABLE IF NOT EXISTS folders ( + id INT AUTO_INCREMENT PRIMARY KEY, + path VARCHAR(2048) NOT NULL, + parent_id INT NULL, + device_id INT NOT NULL, + first_seen DATETIME NOT NULL, + last_seen DATETIME NOT NULL, + deleted TINYINT(1) NOT NULL DEFAULT 0, + + CONSTRAINT fk_folder_device + FOREIGN KEY (device_id) REFERENCES devices(id) + ON DELETE CASCADE, + + UNIQUE KEY uniq_folder_path (device_id, path(255)), + INDEX idx_folder_dev (device_id) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; + """) + + # ============================ + # FILES + # ============================ + cursor.execute(""" + CREATE TABLE IF NOT EXISTS files ( + id INT AUTO_INCREMENT PRIMARY KEY, + + name VARCHAR(255) NOT NULL, + path VARCHAR(2048) NOT NULL, + path_md5 CHAR(32) NOT NULL, + + size BIGINT NULL, + modified DATETIME NULL, + type VARCHAR(50) NULL, + + folder_id INT NULL, + device_id INT NOT NULL, + deleted TINYINT(1) NOT NULL DEFAULT 0, + + first_seen DATETIME NOT NULL, + last_seen DATETIME NOT NULL, + + CONSTRAINT fk_file_folder + FOREIGN KEY (folder_id) REFERENCES folders(id) + ON DELETE SET NULL, + + CONSTRAINT fk_file_device + FOREIGN KEY (device_id) REFERENCES devices(id) + ON DELETE CASCADE, + + UNIQUE KEY uniq_file_path_md5 (device_id, path_md5), + INDEX idx_file_folder (folder_id), + INDEX idx_file_deleted (device_id, 
deleted) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; + """) + + conn.commit() + return conn, cursor + + +# ====================================================== +# HELPERS FOR DEVICES AND FOLDERS +# ====================================================== + +def get_or_create_device(cursor, conn, name: str) -> int: + now = datetime.now() + cursor.execute("INSERT IGNORE INTO devices (name, scanned_at) VALUES (%s,%s)", (name, now)) + conn.commit() + + cursor.execute("SELECT id FROM devices WHERE name=%s", (name,)) + return cursor.fetchone()[0] + + +def load_folder_state(cursor, device_id: int): + """ + Return {path: {"id", "deleted"}} + """ + cursor.execute(""" + SELECT id, path, deleted + FROM folders + WHERE device_id=%s + """, (device_id,)) + + out = {} + for folder_id, path, deleted in cursor.fetchall(): + out[path] = {"id": folder_id, "deleted": int(deleted)} + return out + + +def get_or_create_folder(cursor, conn, folder_state, device_id, folder_path, parent_path, now): + """ + Ensure folder exists in DB (even empty). 
+ """ + if folder_path in folder_state: + folder_id = folder_state[folder_path]["id"] + cursor.execute( + "UPDATE folders SET last_seen=%s, deleted=0 WHERE id=%s", + (now, folder_id) + ) + folder_state[folder_path]["deleted"] = 0 + return folder_id + + parent_id = folder_state.get(parent_path, {}).get("id") if parent_path else None + + cursor.execute(""" + INSERT INTO folders (path, parent_id, device_id, first_seen, last_seen, deleted) + VALUES (%s, %s, %s, %s, %s, 0) + """, (folder_path, parent_id, device_id, now, now)) + + folder_id = cursor.lastrowid + folder_state[folder_path] = {"id": folder_id, "deleted": 0} + return folder_id + + +# ====================================================== +# LOAD LAST FILE STATE +# ====================================================== + +def load_last_file_state(cursor, device_id: int): + """ + {md5: {"id", "deleted", "size", "modified"}} + """ + cursor.execute(""" + SELECT f.id, f.path_md5, f.deleted, f.size, f.modified + FROM files f + JOIN ( + SELECT MAX(id) AS mx + FROM files + WHERE device_id=%s + GROUP BY path_md5 + ) t ON f.id = t.mx + """, (device_id,)) + + out = {} + for fid, md5, deleted, size, modified in cursor.fetchall(): + out[md5] = { + "id": fid, + "deleted": int(deleted), + "size": size, + "modified": modified, + } + return out + + +# ====================================================== +# MAIN SCANNER +# ====================================================== + +def walk_and_store_bulk(): + target_dir = r"u:\Dropbox" + device_name = "Z230" + + if not os.path.isdir(target_dir): + print("Invalid directory:", target_dir) + return + + conn, cursor = init_db() + now = datetime.now() + + device_id = get_or_create_device(cursor, conn, device_name) + folder_state = load_folder_state(cursor, device_id) + file_state = load_last_file_state(cursor, device_id) + + seen_folders = set() + seen_files = set() + + files_to_insert = [] + files_to_update = [] + files_to_mark_deleted = [] + folders_to_mark_deleted = [] + + 
total_files = 0 + + print(f"🔍 Scanning: {target_dir} (device {device_id})") + + # ------------------------------------------------- + # WALK THROUGH FILESYSTEM + # ------------------------------------------------- + for root, dirs, files in os.walk(target_dir): + folder_path = os.path.normpath(root) + parent_path = os.path.normpath(os.path.dirname(root)) if root != target_dir else None + + # ALWAYS insert/update folder + seen_folders.add(folder_path) + folder_id = get_or_create_folder(cursor, conn, folder_state, + device_id, folder_path, + parent_path, now) + + # Process files inside this folder + for name in files: + total_files += 1 + + filepath = os.path.normpath(os.path.join(root, name)) + md5 = md5_path(filepath) + seen_files.add(md5) + + try: + st = os.stat(filepath) + except FileNotFoundError: + continue + + modified = datetime.fromtimestamp(st.st_mtime).replace(microsecond=0) + size = st.st_size + ext = os.path.splitext(name)[1] + + prev = file_state.get(md5) + + if prev is None: + # New file + files_to_insert.append( + (name, filepath, md5, size, modified, ext, + folder_id, device_id, 0, now, now) + ) + else: + if prev["deleted"] == 1: + # Reappeared + files_to_insert.append( + (name, filepath, md5, size, modified, ext, + folder_id, device_id, 0, now, now) + ) + else: + # Existing file → update only if changed + if prev["size"] != size or prev["modified"] != modified: + files_to_update.append( + (size, modified, now, prev["id"]) + ) + + if total_files % 1000 == 0: + print(f" ... 
processed {total_files} files") + + # ------------------------------------------------- + # MARK DELETED FILES + # ------------------------------------------------- + for md5, info in file_state.items(): + if info["deleted"] == 0 and md5 not in seen_files: + files_to_mark_deleted.append((now, info["id"])) + + # ------------------------------------------------- + # MARK DELETED FOLDERS + # ------------------------------------------------- + for path, info in folder_state.items(): + if info["deleted"] == 0 and path not in seen_folders: + folders_to_mark_deleted.append((now, info["id"])) + + # ------------------------------------------------- + # APPLY CHANGES + # ------------------------------------------------- + if files_to_insert: + cursor.executemany(""" + INSERT INTO files ( + name, path, path_md5, size, modified, type, + folder_id, device_id, deleted, + first_seen, last_seen + ) + VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) + """, files_to_insert) + + if files_to_update: + cursor.executemany(""" + UPDATE files + SET size=%s, modified=%s, last_seen=%s, deleted=0 + WHERE id=%s + """, files_to_update) + + if files_to_mark_deleted: + cursor.executemany(""" + UPDATE files + SET deleted=1, last_seen=%s + WHERE id=%s + """, files_to_mark_deleted) + + if folders_to_mark_deleted: + cursor.executemany(""" + UPDATE folders + SET deleted=1, last_seen=%s + WHERE id=%s + """, folders_to_mark_deleted) + + # Update device timestamp + cursor.execute("UPDATE devices SET scanned_at=%s WHERE id=%s", (now, device_id)) + + conn.commit() + cursor.close() + conn.close() + + print("") + print("✅ Scan completed.") + print(" Total files:", total_files) + print(" Inserted:", len(files_to_insert)) + print(" Updated:", len(files_to_update)) + print(" Files deleted:", len(files_to_mark_deleted)) + print(" Folders deleted:", len(folders_to_mark_deleted)) + + +# ====================================================== +# MAIN ENTRY +# ====================================================== + +if 
__name__ == '__main__': + walk_and_store_bulk() diff --git a/22 WalkandSave.py b/22 WalkandSave.py new file mode 100644 index 0000000..9fdac86 --- /dev/null +++ b/22 WalkandSave.py @@ -0,0 +1,413 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import hashlib +from datetime import datetime +import mysql.connector +from mysql.connector import Error +from dotenv import load_dotenv +from pathlib import Path + +# ====================================================== +# Load .env from the script directory +# ====================================================== + +env_path = Path(__file__).resolve().parent / ".env" +load_dotenv(env_path) + + +# ====================================================== +# Helper: MD5 of full file path string +# ====================================================== + +def md5_path(path: str) -> str: + return hashlib.md5(path.encode("utf8")).hexdigest() + + +# ====================================================== +# MySQL CONNECTIONS +# ====================================================== + +def get_server_connection(): + return mysql.connector.connect( + host=os.getenv("DB_MYSQL_HOST"), + user=os.getenv("DB_MYSQL_ROOT"), + password=os.getenv("DB_MYSQL_ROOT_PASS"), + port=int(os.getenv("DB_MYSQL_PORT")), + auth_plugin="mysql_native_password", + ) + + +def get_db_connection(): + conn = mysql.connector.connect( + host=os.getenv("DB_MYSQL_HOST"), + user=os.getenv("DB_MYSQL_ROOT"), + password=os.getenv("DB_MYSQL_ROOT_PASS"), + port=int(os.getenv("DB_MYSQL_PORT")), + database="walkfiles", + auth_plugin="mysql_native_password", + ) + + c = conn.cursor() + c.execute("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci") + c.close() + return conn + + +# ====================================================== +# DATABASE INITIALIZATION +# ====================================================== + +def init_db(): + # Ensure DB exists + server = get_server_connection() + cur = server.cursor() + cur.execute(""" + CREATE DATABASE IF NOT 
EXISTS walkfiles + DEFAULT CHARACTER SET utf8mb4 + COLLATE utf8mb4_general_ci + """) + server.commit() + cur.close() + server.close() + + # Connect + conn = get_db_connection() + cursor = conn.cursor() + + # DEVICES + cursor.execute(""" + CREATE TABLE IF NOT EXISTS devices ( + id INT AUTO_INCREMENT PRIMARY KEY, + name VARCHAR(255) UNIQUE, + scanned_at DATETIME NULL + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; + """) + + # FOLDERS + cursor.execute(""" + CREATE TABLE IF NOT EXISTS folders ( + id INT AUTO_INCREMENT PRIMARY KEY, + path VARCHAR(2048) NOT NULL, + parent_id INT NULL, + device_id INT NOT NULL, + first_seen DATETIME NOT NULL, + last_seen DATETIME NOT NULL, + deleted TINYINT(1) NOT NULL DEFAULT 0, + + CONSTRAINT fk_folder_device + FOREIGN KEY (device_id) REFERENCES devices(id) + ON DELETE CASCADE, + + UNIQUE KEY uniq_folder_path (device_id, path(255)), + INDEX idx_folder_dev (device_id) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; + """) + + # FILES + cursor.execute(""" + CREATE TABLE IF NOT EXISTS files ( + id INT AUTO_INCREMENT PRIMARY KEY, + + name VARCHAR(255) NOT NULL, + path VARCHAR(2048) NOT NULL, + path_md5 CHAR(32) NOT NULL, + + size BIGINT NULL, + modified DATETIME NULL, + type VARCHAR(255) NULL, + + folder_id INT NULL, + device_id INT NOT NULL, + deleted TINYINT(1) NOT NULL DEFAULT 0, + + first_seen DATETIME NOT NULL, + last_seen DATETIME NOT NULL, + + CONSTRAINT fk_file_folder + FOREIGN KEY (folder_id) REFERENCES folders(id) + ON DELETE SET NULL, + + CONSTRAINT fk_file_device + FOREIGN KEY (device_id) REFERENCES devices(id) + ON DELETE CASCADE, + + UNIQUE KEY uniq_file_path_md5 (device_id, path_md5), + INDEX idx_file_folder (folder_id), + INDEX idx_file_deleted (device_id, deleted) + ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; + """) + + conn.commit() + return conn, cursor + + +# ====================================================== +# HELPERS — DEVICES & FOLDERS +# ====================================================== + +def 
get_or_create_device(cursor, conn, name: str) -> int: + now = datetime.now() + cursor.execute("INSERT IGNORE INTO devices (name, scanned_at) VALUES (%s,%s)", (name, now)) + conn.commit() + + cursor.execute("SELECT id FROM devices WHERE name=%s", (name,)) + return cursor.fetchone()[0] + + +def load_folder_state(cursor, device_id: int): + cursor.execute(""" + SELECT id, path, deleted + FROM folders + WHERE device_id=%s + """, (device_id,)) + + out = {} + for folder_id, path, deleted in cursor.fetchall(): + out[path] = {"id": folder_id, "deleted": int(deleted)} + return out + + +def get_or_create_folder(cursor, conn, folder_state, device_id, folder_path, parent_path, now): + if folder_path in folder_state: + folder_id = folder_state[folder_path]["id"] + cursor.execute( + "UPDATE folders SET last_seen=%s, deleted=0 WHERE id=%s", + (now, folder_id) + ) + folder_state[folder_path]["deleted"] = 0 + return folder_id + + parent_id = folder_state.get(parent_path, {}).get("id") if parent_path else None + + cursor.execute(""" + INSERT INTO folders (path, parent_id, device_id, first_seen, last_seen, deleted) + VALUES (%s, %s, %s, %s, %s, 0) + """, (folder_path, parent_id, device_id, now, now)) + + folder_id = cursor.lastrowid + folder_state[folder_path] = {"id": folder_id, "deleted": 0} + return folder_id + + +# ====================================================== +# LOAD LAST FILE STATE +# ====================================================== + +def load_last_file_state(cursor, device_id: int): + cursor.execute(""" + SELECT f.id, f.path_md5, f.deleted, f.size, f.modified + FROM files f + JOIN ( + SELECT MAX(id) AS mx + FROM files + WHERE device_id=%s + GROUP BY path_md5 + ) t ON f.id = t.mx + """, (device_id,)) + + out = {} + for fid, md5, deleted, size, modified in cursor.fetchall(): + out[md5] = { + "id": fid, + "deleted": int(deleted), + "size": size, + "modified": modified, + } + return out + + +# ====================================================== +# MAIN SCANNER 
WITH BATCHING
+# ======================================================
+
+def walk_and_store_bulk():
+    """Walk the target directory and sync its files and folders into MySQL.
+
+    Each file found under ``target_dir`` is looked up (by the MD5 of its
+    normalised path) in the state returned by ``load_last_file_state``:
+
+    * no previous row, or a previous row flagged deleted -> new row inserted;
+    * size or modification time differs -> existing row updated;
+    * otherwise nothing is written for that file.
+
+    Inserts and updates are sent with ``executemany`` and committed whenever
+    ``BATCH_SIZE`` rows have accumulated, and once more after the walk.
+    Files and folders present in the loaded state but not seen on disk are
+    then flagged ``deleted=1``, the device's ``scanned_at`` is set to the
+    scan start time, and a summary is printed.
+
+    The cursor and connection are closed even if the scan raises.
+    """
+
+    BATCH_SIZE = 10000
+    target_dir = r"u:\Dropbox"
+    device_name = "Z230"
+
+    # One definition per statement, shared by the batch and final flushes.
+    insert_sql = """
+        INSERT INTO files (
+            name, path, path_md5, size, modified, type,
+            folder_id, device_id, deleted,
+            first_seen, last_seen
+        )
+        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
+    """
+    update_sql = """
+        UPDATE files
+        SET size=%s, modified=%s, last_seen=%s, deleted=0
+        WHERE id=%s
+    """
+
+    if not os.path.isdir(target_dir):
+        print("Invalid directory:", target_dir)
+        return
+
+    conn, cursor = init_db()
+    now = datetime.now()
+
+    try:
+        device_id = get_or_create_device(cursor, conn, device_name)
+        folder_state = load_folder_state(cursor, device_id)
+        file_state = load_last_file_state(cursor, device_id)
+
+        seen_folders = set()
+        seen_files = set()
+
+        files_to_insert = []
+        files_to_update = []
+        files_to_mark_deleted = []
+        folders_to_mark_deleted = []
+
+        total_files = 0
+        # Running totals: the pending lists are emptied on every batch
+        # flush, so their length alone under-reports the work done.
+        inserted_total = 0
+        updated_total = 0
+
+        print(f"🔍 Scanning: {target_dir} (device {device_id})")
+
+        # -------------------------------------------------
+        # WALK FILESYSTEM
+        # -------------------------------------------------
+        for root, _dirs, files in os.walk(target_dir):
+            folder_path = os.path.normpath(root)
+            parent_path = os.path.normpath(os.path.dirname(root)) if root != target_dir else None
+
+            seen_folders.add(folder_path)
+            folder_id = get_or_create_folder(cursor, conn, folder_state,
+                                             device_id, folder_path,
+                                             parent_path, now)
+
+            # -------------------------------------------------
+            # FILE LOOP
+            # -------------------------------------------------
+            for name in files:
+                total_files += 1
+
+                filepath = os.path.normpath(os.path.join(root, name))
+                md5 = md5_path(filepath)
+                # Recorded before stat(): a file that cannot be stat'ed is
+                # skipped below but is not treated as deleted.
+                seen_files.add(md5)
+
+                try:
+                    st = os.stat(filepath)
+                except OSError:
+                    # Vanished, locked or unreadable entry: skip it rather
+                    # than abort the whole scan (FileNotFoundError included).
+                    continue
+
+                # Whole seconds only -- presumably to match the precision of
+                # the stored DATETIME value; verify against the schema.
+                modified = datetime.fromtimestamp(st.st_mtime).replace(microsecond=0)
+                size = st.st_size
+                ext = os.path.splitext(name)[1][:250]
+
+                prev = file_state.get(md5)
+
+                if prev is None or prev["deleted"] == 1:
+                    files_to_insert.append(
+                        (name, filepath, md5, size, modified, ext,
+                         folder_id, device_id, 0, now, now)
+                    )
+                elif prev["size"] != size or prev["modified"] != modified:
+                    files_to_update.append(
+                        (size, modified, now, prev["id"])
+                    )
+
+                # -------------------------------------------------
+                # BATCH FLUSHING
+                # -------------------------------------------------
+                if len(files_to_insert) >= BATCH_SIZE:
+                    print(f"💾 Flushing {len(files_to_insert)} inserts...")
+                    cursor.executemany(insert_sql, files_to_insert)
+                    conn.commit()
+                    inserted_total += len(files_to_insert)
+                    files_to_insert.clear()
+
+                if len(files_to_update) >= BATCH_SIZE:
+                    print(f"💾 Flushing {len(files_to_update)} updates...")
+                    cursor.executemany(update_sql, files_to_update)
+                    conn.commit()
+                    updated_total += len(files_to_update)
+                    files_to_update.clear()
+
+                # PROGRESS
+                if total_files % 1000 == 0:
+                    print(f" ... processed {total_files} files")
+
+        # -------------------------------------------------
+        # MARK DELETED FILES
+        # -------------------------------------------------
+        for md5, info in file_state.items():
+            if info["deleted"] == 0 and md5 not in seen_files:
+                files_to_mark_deleted.append((now, info["id"]))
+
+        # -------------------------------------------------
+        # MARK DELETED FOLDERS
+        # -------------------------------------------------
+        for path, info in folder_state.items():
+            if info["deleted"] == 0 and path not in seen_folders:
+                folders_to_mark_deleted.append((now, info["id"]))
+
+        # -------------------------------------------------
+        # FINAL FLUSH (REMAINING BATCHES)
+        # -------------------------------------------------
+        if files_to_insert:
+            print(f"💾 Final flush: {len(files_to_insert)} inserts")
+            cursor.executemany(insert_sql, files_to_insert)
+            conn.commit()
+            inserted_total += len(files_to_insert)
+
+        if files_to_update:
+            print(f"💾 Final flush: {len(files_to_update)} updates")
+            cursor.executemany(update_sql, files_to_update)
+            conn.commit()
+            updated_total += len(files_to_update)
+
+        if files_to_mark_deleted:
+            print(f"💾 Final flush: {len(files_to_mark_deleted)} deletions")
+            cursor.executemany("""
+                UPDATE files
+                SET deleted=1, last_seen=%s
+                WHERE id=%s
+            """, files_to_mark_deleted)
+            conn.commit()
+
+        if folders_to_mark_deleted:
+            cursor.executemany("""
+                UPDATE folders
+                SET deleted=1, last_seen=%s
+                WHERE id=%s
+            """, folders_to_mark_deleted)
+            conn.commit()
+
+        # -------------------------------------------------
+        # Update device timestamp
+        # -------------------------------------------------
+        cursor.execute("UPDATE devices SET scanned_at=%s WHERE id=%s", (now, device_id))
+        conn.commit()
+    finally:
+        # Release the DB resources on success and on failure alike.
+        cursor.close()
+        conn.close()
+
+    print("")
+    print("✅ Scan completed.")
+    print(" Total files:", total_files)
+    print(" Inserted:", inserted_total)
+    print(" Updated:", updated_total)
+    print(" Files deleted:", len(files_to_mark_deleted))
+    print(" Folders deleted:", len(folders_to_mark_deleted))
+
+
+# ======================================================
+# MAIN ENTRY
+# ======================================================
+
+if __name__ == '__main__':
+    walk_and_store_bulk()
diff --git a/30 Test.py b/30 Test.py
new file mode 100644
index 0000000..be20e72
--- /dev/null
+++ b/30 Test.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import os
+from datetime import datetime
+import mysql.connector
+from dotenv import load_dotenv
+from pathlib import Path
+
+# Always load .env from the folder where THIS script is stored
+env_path = Path(__file__).resolve().parent / ".env"
+load_dotenv(env_path)
+
+# =======================
+# ENV DEBUG OUTPUT
+# =======================
+print("======================================")
+print("ENV DEBUG")
+print("Script file:", __file__)
+print("Script
folder:", Path(__file__).resolve().parent)
+print("Expected .env path:", env_path)
+print(".env exists? ->", env_path.exists())
+print("Current working directory (cwd):", os.getcwd())
+print("Loaded DB_MYSQL_HOST:", os.getenv("DB_MYSQL_HOST"))
+print("Loaded DB_MYSQL_PORT:", os.getenv("DB_MYSQL_PORT"))
+print("Loaded DB_MYSQL_ROOT:", os.getenv("DB_MYSQL_ROOT"))
+# Never echo the secret itself; only report whether it was loaded.
+print("Loaded DB_MYSQL_ROOT_PASS:", "<set>" if os.getenv("DB_MYSQL_ROOT_PASS") else "<not set>")
+print("======================================")
diff --git a/40 Test3.py b/40 Test3.py
new file mode 100644
index 0000000..e6bca76
--- /dev/null
+++ b/40 Test3.py
@@ -0,0 +1,11 @@
+import os
+
+base = r"u:\Dropbox\Ordinace\Dokumentace_ke_zpracování"
+
+print("Listing folders in:", base)
+print("--------------------------------------")
+
+for name in os.listdir(base):
+    full = os.path.join(base, name)
+    if os.path.isdir(full):
+        print("FOLDER:", repr(name))
diff --git a/50 MD5calculate.py b/50 MD5calculate.py
new file mode 100644
index 0000000..a940d31
--- /dev/null
+++ b/50 MD5calculate.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import os
+import hashlib
+from datetime import datetime
+import mysql.connector
+from dotenv import load_dotenv
+from pathlib import Path
+
+
+# ======================================================
+# Load environment
+# ======================================================
+env_path = Path(__file__).resolve().parent / ".env"
+load_dotenv(env_path)
+
+
+# ======================================================
+# MySQL connection
+# ======================================================
+def get_db_connection():
+    """Open a connection to the 'walkfiles' database using utf8mb4 names.
+
+    Settings are read from the DB_MYSQL_* environment variables. Unset
+    variables fall back to the same defaults the walk scripts use, so a
+    missing DB_MYSQL_PORT no longer fails inside ``int()``.
+    """
+    conn = mysql.connector.connect(
+        host=os.getenv("DB_MYSQL_HOST", "127.0.0.1"),
+        user=os.getenv("DB_MYSQL_ROOT", "root"),
+        password=os.getenv("DB_MYSQL_ROOT_PASS", ""),
+        port=int(os.getenv("DB_MYSQL_PORT", "3306")),
+        database="walkfiles",
+        auth_plugin="mysql_native_password"
+    )
+    # Set the connection character set and collation for this session.
+    c = conn.cursor()
+    c.execute("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
+    c.close()
+    return conn
+
+
+# 
======================================================
+# Helpers
+# ======================================================
+def file_md5(path, chunk_size=1024 * 1024):
+    """Return the hex MD5 digest of the file at *path*, read in chunks.
+
+    The file is read *chunk_size* bytes at a time, so it is never held in
+    memory as a whole.
+    """
+    md5 = hashlib.md5()
+    with open(path, "rb") as f:
+        while chunk := f.read(chunk_size):
+            md5.update(chunk)
+    return md5.hexdigest()
+
+
+def parse_size(size_str: "str | int | float") -> int:
+    """Convert a human-readable size into a number of bytes.
+
+    Accepts strings such as "10MB", "500kB", "2GB" or "1.5 TB"
+    (case-insensitive, multiples of 1024), a bare number string such as
+    "2048", or an int/float that already is a byte count.
+
+    Raises:
+        ValueError: if the numeric part cannot be parsed.
+    """
+    # The caller documents "or number of bytes": accept real numbers too.
+    if isinstance(size_str, (int, float)):
+        return int(size_str)
+
+    s = size_str.strip().upper()
+    # Two-letter units first so "KB" is matched before the bare "B".
+    units = (
+        ("KB", 1024),
+        ("MB", 1024 ** 2),
+        ("GB", 1024 ** 3),
+        ("TB", 1024 ** 4),
+        ("B", 1),
+    )
+    for suffix, factor in units:
+        if s.endswith(suffix):
+            return int(float(s[:-len(suffix)]) * factor)
+
+    # No unit: assume raw bytes. int() first keeps large integers exact.
+    try:
+        return int(s)
+    except ValueError:
+        return int(float(s))
+
+
+# ======================================================
+# MAIN LOGIC
+# ======================================================
+def run_md5_calculator(device_name=None,
+                       device_id=None,
+                       extension=".pdf",
+                       max_size="50MB"):
+    """Compute and store content MD5 hashes for matching files of one device.
+
+    Selects the non-deleted rows of ``files`` for the device whose path ends
+    with *extension* and whose size is at most *max_size*. A file is hashed
+    when it has no stored ``content_md5``, or when its ``md5_calculated``
+    timestamp is missing or older than its ``modified`` timestamp. All
+    updates are committed once, after the loop.
+
+    Args:
+        device_name: Name looked up in ``devices`` when *device_id* is None.
+        device_id: Device id; when given, *device_name* is not consulted.
+        extension: Path suffix to match: ".pdf", ".jpg", etc.
+        max_size: "10MB", "500KB", "1GB" or number of bytes.
+
+    Raises:
+        RuntimeError: if neither identifier is given, or the named device
+            is not found. MySQL errors are printed, not raised.
+    """
+
+    max_bytes = parse_size(max_size)
+
+    conn, cursor = None, None
+
+    try:
+        conn = get_db_connection()
+        cursor = conn.cursor(dictionary=True)
+
+        # ------------------------------------------
+        # Resolve device_id if only device_name given
+        # ------------------------------------------
+        if device_id is None:
+            if device_name is None:
+                raise RuntimeError("You must provide device_name or device_id")
+            cursor.execute("SELECT id FROM devices WHERE name=%s", (device_name,))
+            row = cursor.fetchone()
+            if not row:
+                raise RuntimeError(f"Device '{device_name}' not found")
+            device_id = row["id"]
+
+        print(f"\n🔍 Filtering: device={device_id}, ext={extension}, max_size={max_bytes} bytes\n")
+
+        # ------------------------------------------
+        # SELECT only files that need MD5 calculation
+        # ------------------------------------------
+        # NOTE(review): "%" and "_" inside *extension* act as LIKE wildcards.
+        cursor.execute("""
+            SELECT id, path, size, modified, content_md5, md5_calculated
+            FROM files
+            WHERE device_id=%s
+              AND deleted = 0
+              AND path LIKE %s
+              AND size <= %s
+        """, (device_id, "%" + extension, max_bytes))
+
+        rows = cursor.fetchall()
+        total = len(rows)
+        print(f"📁 Files matching criteria: {total}")
+
+        updates = 0
+
+        for row in rows:
+
+            file_id = row["id"]
+            path = row["path"]
+            modified = row["modified"]
+            prev_md5 = row["content_md5"]
+            prev_calc = row["md5_calculated"]
+
+            # -------------------------------
+            # Skip missing files on disk
+            # -------------------------------
+            if not os.path.isfile(path):
+                print(f"⚠️ Missing on disk, skipping: {path}")
+                continue
+
+            # -------------------------------
+            # Check conditions for recalculation
+            # -------------------------------
+            need_md5 = (
+                prev_md5 is None
+                or prev_calc is None
+                or prev_calc < modified
+            )
+            if not need_md5:
+                continue
+
+            # -------------------------------
+            # Compute MD5
+            # -------------------------------
+            print(f"🔄 Calculating MD5: {path}")
+            try:
+                new_md5 = file_md5(path)
+            except OSError as e:
+                # One locked or unreadable file must not abort the run and
+                # discard every update that is still uncommitted.
+                print(f"⚠️ Cannot read, skipping: {path} ({e})")
+                continue
+            now = datetime.now().replace(microsecond=0)
+
+            cursor.execute("""
+                UPDATE files
+                SET content_md5=%s,
+                    md5_calculated=%s
+                WHERE id=%s
+            """, (new_md5, now, file_id))
+
+            updates += 1
+
+            # optional commit per-file:
+            # conn.commit()
+
+        conn.commit()
+
+        print("\n✅ MD5 calculation finished.")
+        print(f" Updated files: {updates}")
+        print(f" Skipped files: {total - updates}\n")
+
+    # Fully qualified: this script never imports a bare ``Error`` name, so
+    # ``except Error`` would itself fail with NameError on any DB error.
+    except mysql.connector.Error as e:
+        print("MySQL Error:", e)
+
+    finally:
+        if cursor:
+            cursor.close()
+        if conn:
+            conn.close()
+
+
+# ======================================================
+# RUN EXAMPLE
+# ======================================================
+if __name__ == "__main__":
+    # Example usage:
+    run_md5_calculator(
+        device_name="Z230",
+        extension=".pdf",
+        max_size="100MB"
+    )
diff --git a/credentials.env b/credentials.env
deleted file mode 100644
index 4e6ec74..0000000
--- a/credentials.env
+++ /dev/null
@@ -1,6 +0,0 @@
-DB_MYSQL_HOST=192.168.1.76
-DB_MYSQL_PORT=3307
-DB_MYSQL_ROOT=root
-DB_MYSQL_ROOT_PASS=Vlado9674+
-DB_MYSQL_PORT=3307
-