This commit is contained in:
2025-11-24 15:15:49 +01:00
parent f41b2c43bc
commit b8038593ab
7 changed files with 1354 additions and 74 deletions

View File

@@ -1,106 +1,357 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import mysql.connector
import hashlib
from datetime import datetime
import mysql.connector
from dotenv import load_dotenv
from pathlib import Path
# Always load .env from the folder where THIS script is stored, so the
# settings do not depend on the current working directory. This is the only
# load_dotenv() call: values that are already set are not overridden by
# default, so an earlier argument-less call would only be redundant.
env_path = Path(__file__).resolve().parent / ".env"
load_dotenv(env_path)
# ======================================================
# 🔧 Helper: MD5 of full path
# ======================================================
def md5_path(path: str) -> str:
    """Return the hexadecimal MD5 digest of a full path string.

    The digest is used as a fixed-width lookup key for a path, not for any
    security purpose.

    Args:
        path: Path text to hash.

    Returns:
        The 32-character lowercase hex digest of the UTF-8 encoded path.
    """
    # File names coming from the OS may contain lone surrogates (invalid
    # UTF-16 on Windows, undecodable bytes on POSIX). A strict encode would
    # raise UnicodeEncodeError for them; "surrogatepass" still hashes them and
    # yields byte-identical output for every well-formed string.
    return hashlib.md5(path.encode("utf8", "surrogatepass")).hexdigest()
# ======================================================
# 🔧 DB CONNECTION HELPERS
# ======================================================
def get_server_connection():
    """Connect to the MySQL server WITHOUT selecting a database.

    Connection settings are read from the environment, each with a fallback:
    ``DB_MYSQL_HOST`` ("127.0.0.1"), ``DB_MYSQL_ROOT`` ("root"),
    ``DB_MYSQL_ROOT_PASS`` ("") and ``DB_MYSQL_PORT`` ("3306").

    Returns:
        An open ``mysql.connector`` connection with no default schema.

    Raises:
        ValueError: If ``DB_MYSQL_PORT`` is set but is not an integer.
    """
    # Deliberately no ``database=`` argument: this connection is what
    # init_db() uses to create the schema, so the schema may not exist yet.
    # Each keyword is passed exactly once (duplicates are a SyntaxError).
    return mysql.connector.connect(
        host=os.getenv("DB_MYSQL_HOST", "127.0.0.1"),
        user=os.getenv("DB_MYSQL_ROOT", "root"),
        password=os.getenv("DB_MYSQL_ROOT_PASS", ""),
        port=int(os.getenv("DB_MYSQL_PORT", "3306")),
        auth_plugin="mysql_native_password",
    )
def get_db_connection():
    """Connect to the 'walkfiles' database.

    Uses the same environment settings as the server-level connection
    (``DB_MYSQL_HOST``, ``DB_MYSQL_ROOT``, ``DB_MYSQL_ROOT_PASS``,
    ``DB_MYSQL_PORT``) and selects the ``walkfiles`` schema, which must
    already exist.

    Returns:
        An open ``mysql.connector`` connection whose session character set
        and collation are utf8mb4 / utf8mb4_general_ci.

    Raises:
        ValueError: If ``DB_MYSQL_PORT`` is set but is not an integer.
    """
    conn = mysql.connector.connect(
        host=os.getenv("DB_MYSQL_HOST", "127.0.0.1"),
        user=os.getenv("DB_MYSQL_ROOT", "root"),
        password=os.getenv("DB_MYSQL_ROOT_PASS", ""),
        port=int(os.getenv("DB_MYSQL_PORT", "3306")),
        database="walkfiles",
        auth_plugin="mysql_native_password",
    )

    # The schema is already selected by ``database=`` above, so no
    # CREATE DATABASE / USE is needed here; only the session charset is set.
    cursor = conn.cursor()
    try:
        cursor.execute("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
    finally:
        cursor.close()
    return conn
# ======================================================
# 🗄 DB INITIALIZATION
# ======================================================
def init_db():
    """Create the 'walkfiles' database and its tables if they are missing.

    First creates the schema over a server-level connection, then connects to
    it and issues ``CREATE TABLE IF NOT EXISTS`` for ``devices``, ``folders``
    and ``files``. Existing tables are not altered.

    Returns:
        tuple: ``(conn, cursor)`` for the 'walkfiles' database. The caller
        owns both and is responsible for closing them.
    """
    # 1) Ensure DB exists
    server_conn = get_server_connection()
    try:
        cur = server_conn.cursor()
        cur.execute(
            "CREATE DATABASE IF NOT EXISTS walkfiles "
            "DEFAULT CHARACTER SET utf8mb4 "
            "COLLATE utf8mb4_general_ci"
        )
        server_conn.commit()
        cur.close()
    finally:
        # Release the bootstrap connection even if the statement failed.
        server_conn.close()

    # 2) Connect
    conn = get_db_connection()
    cursor = conn.cursor()
    try:
        # Devices
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS devices (
                id INT AUTO_INCREMENT PRIMARY KEY,
                name VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci UNIQUE,
                scanned_at DATETIME NULL
            ) ENGINE=InnoDB
              DEFAULT CHARSET=utf8mb4
        """)

        # Folders
        # NOTE(review): the unique key only covers the first 255 characters of
        # ``path``, so two longer paths sharing that prefix on one device
        # would collide - confirm this is acceptable for the scanned trees.
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS folders (
                id INT AUTO_INCREMENT PRIMARY KEY,
                path VARCHAR(2048) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
                parent_id INT NULL,
                device_id INT NOT NULL,
                first_seen DATETIME NOT NULL,
                last_seen DATETIME NOT NULL,
                CONSTRAINT fk_folders_device
                    FOREIGN KEY (device_id) REFERENCES devices(id)
                    ON DELETE CASCADE,
                UNIQUE KEY uniq_folders_device_path (device_id, path(255)),
                INDEX idx_folders_device (device_id)
            ) ENGINE=InnoDB
              DEFAULT CHARSET=utf8mb4
        """)

        # Files: uniqueness is enforced on (device_id, path_md5) because the
        # path column itself is too wide to index in full.
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS files (
                id INT AUTO_INCREMENT PRIMARY KEY,
                name VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
                path VARCHAR(2048) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
                path_md5 CHAR(32) NOT NULL,
                size BIGINT NULL,
                modified DATETIME NULL,
                type VARCHAR(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL,
                folder_id INT NULL,
                device_id INT NOT NULL,
                deleted TINYINT(1) NOT NULL DEFAULT 0,
                first_seen DATETIME NOT NULL,
                last_seen DATETIME NOT NULL,
                CONSTRAINT fk_files_folder
                    FOREIGN KEY (folder_id) REFERENCES folders(id)
                    ON DELETE SET NULL,
                CONSTRAINT fk_files_device
                    FOREIGN KEY (device_id) REFERENCES devices(id)
                    ON DELETE CASCADE,
                UNIQUE KEY uniq_files_device_path_md5 (device_id, path_md5),
                INDEX idx_files_folder (folder_id),
                INDEX idx_files_deleted (device_id, deleted)
            ) ENGINE=InnoDB
              DEFAULT CHARSET=utf8mb4
        """)

        conn.commit()
    except Exception:
        # Do not leak the connection when table creation fails; the original
        # error is re-raised unchanged.
        cursor.close()
        conn.close()
        raise
    return conn, cursor
# ======================================================
# 👤 DEVICE + FOLDERS HELPERS
# ======================================================
def insert_bulk_files(cursor, conn, files_data):
    """Bulk-insert file rows, skipping rows that violate a unique key.

    The ``files`` table requires ``path_md5``, ``first_seen`` and
    ``last_seen``; they are derived here so callers can keep passing the
    original 7-field rows.

    Args:
        cursor: DB-API cursor used for the insert.
        conn: Connection the cursor belongs to. Not used here: no commit is
            issued, so the caller decides when to commit.
        files_data: Iterable of
            ``(name, path, size, modified, type, folder_id, device_id)``
            tuples. Nothing is executed when it is empty.
    """
    if not files_data:
        return

    now = datetime.now()
    rows = []
    for name, path, size, modified, ftype, folder_id, device_id in files_data:
        # Hash the normalized path so the key does not depend on how the
        # caller happened to join the path components.
        norm_path = os.path.normpath(path)
        digest = hashlib.md5(norm_path.encode("utf8", "surrogatepass")).hexdigest()
        rows.append(
            (name, norm_path, digest, size, modified, ftype,
             folder_id, device_id, now, now)
        )

    query = '''INSERT IGNORE INTO files
        (name, path, path_md5, size, modified, type,
         folder_id, device_id, first_seen, last_seen)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
    cursor.executemany(query, rows)
def get_or_create_device(cursor, conn, device_name: str) -> int:
    """Return the id of the device called *device_name*, creating it if needed.

    Args:
        cursor: DB-API cursor on the database holding the ``devices`` table.
        conn: Connection owning *cursor*; committed right after the insert.
        device_name: Device name to look up or register.

    Returns:
        The first column of the row selected from ``devices`` by name.
    """
    insert_sql = "INSERT IGNORE INTO devices (name, scanned_at) VALUES (%s, %s)"
    lookup_sql = "SELECT id FROM devices WHERE name=%s"

    # INSERT IGNORE leaves an already-registered device untouched.
    created_at = datetime.now()
    cursor.execute(insert_sql, (device_name, created_at))
    conn.commit()

    cursor.execute(lookup_sql, (device_name,))
    row = cursor.fetchone()
    return row[0]
def load_folder_cache(cursor, device_id: int):
    """Build a ``path -> folder id`` mapping for one device.

    Args:
        cursor: DB-API cursor on the database holding the ``folders`` table.
        device_id: Device whose folder rows are loaded.

    Returns:
        dict: Folder path mapped to its row id, for every selected row.
    """
    cursor.execute(
        "SELECT id, path FROM folders WHERE device_id=%s",
        (device_id,)
    )

    cache = {}
    for row_id, row_path in cursor.fetchall():
        cache[row_path] = row_id
    return cache
def get_or_create_folder(cursor, conn, folder_cache, device_id, folder_path, parent_path, now):
    """Return the id of *folder_path*, inserting a folder row when unknown.

    Args:
        cursor: DB-API cursor used for the UPDATE / INSERT.
        conn: Connection owning *cursor*; not used here (no commit is issued).
        folder_cache: Mutable ``path -> id`` mapping; a newly inserted folder
            is added to it.
        device_id: Device the folder belongs to.
        folder_path: Path of the folder to resolve.
        parent_path: Path of the parent folder; its id is taken from
            *folder_cache* and is ``None`` when it is not cached.
        now: Timestamp written to ``first_seen`` / ``last_seen``.

    Returns:
        The cached id for a known folder, otherwise ``cursor.lastrowid`` of
        the new row.
    """
    if folder_path not in folder_cache:
        # Unknown folder: create it, linked to its parent when that is cached.
        insert_params = (
            folder_path,
            folder_cache.get(parent_path),
            device_id,
            now,
            now,
        )
        cursor.execute("""
            INSERT INTO folders (path, parent_id, device_id, first_seen, last_seen)
            VALUES (%s, %s, %s, %s, %s)
        """, insert_params)
        new_id = cursor.lastrowid
        folder_cache[folder_path] = new_id
        return new_id

    # Known folder: only refresh its last_seen timestamp.
    existing_id = folder_cache[folder_path]
    cursor.execute("UPDATE folders SET last_seen=%s WHERE id=%s", (now, existing_id))
    return existing_id
# ======================================================
# 📂 FILES LOAD LAST STATE
# ======================================================
def load_last_file_state(cursor, device_id: int):
    """Load the most recent ``files`` row per path hash for one device.

    For each ``path_md5`` of the device, the row with the highest id is
    selected.

    Args:
        cursor: DB-API cursor on the database holding the ``files`` table.
        device_id: Device whose file rows are loaded.

    Returns:
        dict: ``path_md5`` mapped to a dict with the keys ``"id"``,
        ``"deleted"`` (converted to int), ``"size"`` and ``"modified"``.
    """
    cursor.execute("""
        SELECT f.id, f.path_md5, f.deleted, f.size, f.modified
        FROM files f
        JOIN (
            SELECT MAX(id) AS max_id
            FROM files
            WHERE device_id = %s
            GROUP BY path_md5
        ) latest ON f.id = latest.max_id
        WHERE f.device_id = %s
    """, (device_id, device_id))

    rows = cursor.fetchall()
    return {
        digest: {
            "id": row_id,
            "deleted": int(is_deleted),
            "size": byte_size,
            "modified": mtime,
        }
        for row_id, digest, is_deleted, byte_size, mtime in rows
    }
# ======================================================
# 🚶 MAIN WALK LOGIC
# ======================================================
def walk_and_store_bulk(
    target_dir=r"u:\Dropbox\Ordinace\Dokumentace_ke_zpracování",
    device_name="Z230",
):
    """Scan a directory tree and synchronise it into the 'walkfiles' database.

    Each file found is compared with the latest stored row for its path hash:

    * unknown path, or latest row flagged deleted -> a new row is inserted;
    * size or modification time changed -> the existing row is updated;
    * unchanged -> the row is left as is;
    * stored, not deleted, but not seen in this scan -> flagged deleted.

    All changes are committed once at the end. If anything fails, the
    connection is closed without committing.

    Args:
        target_dir: Root directory to scan. Defaults to the original
            hard-coded location.
        device_name: Name under which the scan is recorded in ``devices``.

    Returns:
        None. A summary is printed; an invalid *target_dir* is reported and
        nothing is written.
    """
    if not os.path.isdir(target_dir):
        print("Invalid directory:", target_dir)
        return

    updated_debug = []
    files_to_insert = []
    files_to_update_existing = []
    files_to_mark_deleted = []
    seen_md5 = set()
    total_files = 0

    conn, cursor = init_db()
    try:
        now = datetime.now()

        device_id = get_or_create_device(cursor, conn, device_name)
        # The cache must come from the database: starting from an empty dict
        # would re-insert every folder already stored by an earlier scan.
        folder_cache = load_folder_cache(cursor, device_id)
        last_state = load_last_file_state(cursor, device_id)

        print(f"🔍 Scanning: {target_dir} (device {device_id})")

        for root, _dirs, files in os.walk(target_dir):
            folder_path = os.path.normpath(root)
            # The scan root has no parent inside the scanned tree.
            parent_path = (
                os.path.normpath(os.path.dirname(root))
                if root != target_dir else None
            )
            folder_id = get_or_create_folder(
                cursor, conn, folder_cache, device_id,
                folder_path, parent_path, now,
            )

            for name in files:
                total_files += 1
                file_path = os.path.normpath(os.path.join(root, name))
                file_md5 = md5_path(file_path)
                seen_md5.add(file_md5)

                try:
                    stats = os.stat(file_path)
                except FileNotFoundError:
                    # Vanished between listing and stat; skip it.
                    continue

                # Drop microseconds so the value compares equal to what was
                # previously stored for an unchanged file.
                modified = datetime.fromtimestamp(stats.st_mtime).replace(microsecond=0)
                size = stats.st_size
                ext = os.path.splitext(name)[1]

                prev = last_state.get(file_md5)

                if prev is None or prev["deleted"] == 1:
                    # New file, or reappeared file → new row
                    files_to_insert.append(
                        (name, file_path, file_md5, size, modified, ext,
                         folder_id, device_id, 0, now, now)
                    )
                elif prev["size"] != size or prev["modified"] != modified:
                    # Existing & not deleted: update only on a real change
                    files_to_update_existing.append(
                        (size, modified, now, prev["id"])
                    )
                    updated_debug.append({
                        "path": file_path,
                        "old_size": prev["size"],
                        "new_size": size,
                        "old_modified": prev["modified"],
                        "new_modified": modified
                    })

                if total_files % 1000 == 0:
                    print(f"  ... processed {total_files} files")

        # Mark missing files as deleted
        for md5_hash, info in last_state.items():
            if info["deleted"] == 0 and md5_hash not in seen_md5:
                files_to_mark_deleted.append((now, info["id"]))

        # ==================================================
        # 💾 APPLY CHANGES
        # ==================================================
        if files_to_insert:
            cursor.executemany("""
                INSERT INTO files (
                    name, path, path_md5, size, modified, type,
                    folder_id, device_id, deleted,
                    first_seen, last_seen
                )
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """, files_to_insert)

        if files_to_update_existing:
            cursor.executemany("""
                UPDATE files
                SET size=%s,
                    modified=%s,
                    last_seen=%s,
                    deleted=0
                WHERE id=%s
            """, files_to_update_existing)

        if files_to_mark_deleted:
            cursor.executemany("""
                UPDATE files
                SET deleted=1,
                    last_seen=%s
                WHERE id=%s
            """, files_to_mark_deleted)

        cursor.execute("UPDATE devices SET scanned_at=%s WHERE id=%s", (now, device_id))
        conn.commit()
    finally:
        # Always release the connection, including when the scan fails.
        cursor.close()
        conn.close()

    if updated_debug:
        print("\n📌 Updated files:")
        for info in updated_debug:
            print(f"- {info['path']}")
            # Old and new values are separated so they cannot run together.
            print(f"    size:     {info['old_size']} → {info['new_size']}")
            print(f"    modified: {info['old_modified']} → {info['new_modified']}")

    print("✅ Scan completed.")
    print("   Total files:", total_files)
    print("   Inserted:", len(files_to_insert))
    print("   Updated:", len(files_to_update_existing))
    print("   Marked deleted:", len(files_to_mark_deleted))
# ======================================================
# 🔚 MAIN
# ======================================================
# Script entry point. The directory validation and the completion summary
# both live in walk_and_store_bulk(), so they are not repeated here.
if __name__ == '__main__':
    walk_and_store_bulk()