358 lines
11 KiB
Python
358 lines
11 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
|
||
import os
|
||
import hashlib
|
||
from datetime import datetime
|
||
import mysql.connector
|
||
from dotenv import load_dotenv
|
||
from pathlib import Path
|
||
|
||
# Always load .env from the folder where THIS script is stored
|
||
env_path = Path(__file__).resolve().parent / ".env"
|
||
load_dotenv(env_path)
|
||
|
||
# ======================================================
|
||
# 🔧 Helper: MD5 of full path
|
||
# ======================================================
|
||
|
||
def md5_path(path: str) -> str:
|
||
return hashlib.md5(path.encode("utf8")).hexdigest()
|
||
|
||
# ======================================================
|
||
# 🔧 DB CONNECTION HELPERS
|
||
# ======================================================
|
||
|
||
def get_server_connection():
|
||
"""Connect to MySQL server WITHOUT selecting a database."""
|
||
conn = mysql.connector.connect(
|
||
host=os.getenv("DB_MYSQL_HOST", "127.0.0.1"),
|
||
user=os.getenv("DB_MYSQL_ROOT", "root"),
|
||
password=os.getenv("DB_MYSQL_ROOT_PASS", ""),
|
||
port=int(os.getenv("DB_MYSQL_PORT", "3306")),
|
||
auth_plugin="mysql_native_password",
|
||
)
|
||
return conn
|
||
|
||
|
||
def get_db_connection():
|
||
"""Connect to the 'walkfiles' database."""
|
||
conn = mysql.connector.connect(
|
||
host=os.getenv("DB_MYSQL_HOST", "127.0.0.1"),
|
||
user=os.getenv("DB_MYSQL_ROOT", "root"),
|
||
password=os.getenv("DB_MYSQL_ROOT_PASS", ""),
|
||
port=int(os.getenv("DB_MYSQL_PORT", "3306")),
|
||
database="walkfiles",
|
||
auth_plugin="mysql_native_password",
|
||
)
|
||
cursor = conn.cursor()
|
||
cursor.execute("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
|
||
cursor.close()
|
||
return conn
|
||
|
||
# ======================================================
|
||
# 🗄 DB INITIALIZATION
|
||
# ======================================================
|
||
|
||
def init_db():
|
||
# 1) Ensure DB exists
|
||
server_conn = get_server_connection()
|
||
cur = server_conn.cursor()
|
||
cur.execute(
|
||
"CREATE DATABASE IF NOT EXISTS walkfiles "
|
||
"DEFAULT CHARACTER SET utf8mb4 "
|
||
"COLLATE utf8mb4_general_ci"
|
||
)
|
||
server_conn.commit()
|
||
cur.close()
|
||
server_conn.close()
|
||
|
||
# 2) Connect
|
||
conn = get_db_connection()
|
||
cursor = conn.cursor()
|
||
|
||
# Devices
|
||
cursor.execute("""
|
||
CREATE TABLE IF NOT EXISTS devices (
|
||
id INT AUTO_INCREMENT PRIMARY KEY,
|
||
name VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci UNIQUE,
|
||
scanned_at DATETIME NULL
|
||
) ENGINE=InnoDB
|
||
DEFAULT CHARSET=utf8mb4
|
||
""")
|
||
|
||
# Folders
|
||
cursor.execute("""
|
||
CREATE TABLE IF NOT EXISTS folders (
|
||
id INT AUTO_INCREMENT PRIMARY KEY,
|
||
path VARCHAR(2048) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
|
||
parent_id INT NULL,
|
||
device_id INT NOT NULL,
|
||
first_seen DATETIME NOT NULL,
|
||
last_seen DATETIME NOT NULL,
|
||
|
||
CONSTRAINT fk_folders_device
|
||
FOREIGN KEY (device_id) REFERENCES devices(id)
|
||
ON DELETE CASCADE,
|
||
|
||
UNIQUE KEY uniq_folders_device_path (device_id, path(255)),
|
||
INDEX idx_folders_device (device_id)
|
||
) ENGINE=InnoDB
|
||
DEFAULT CHARSET=utf8mb4
|
||
""")
|
||
|
||
# Files
|
||
cursor.execute("""
|
||
CREATE TABLE IF NOT EXISTS files (
|
||
id INT AUTO_INCREMENT PRIMARY KEY,
|
||
|
||
name VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
|
||
path VARCHAR(2048) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
|
||
path_md5 CHAR(32) NOT NULL,
|
||
|
||
size BIGINT NULL,
|
||
modified DATETIME NULL,
|
||
type VARCHAR(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL,
|
||
|
||
folder_id INT NULL,
|
||
device_id INT NOT NULL,
|
||
|
||
deleted TINYINT(1) NOT NULL DEFAULT 0,
|
||
|
||
first_seen DATETIME NOT NULL,
|
||
last_seen DATETIME NOT NULL,
|
||
|
||
CONSTRAINT fk_files_folder
|
||
FOREIGN KEY (folder_id) REFERENCES folders(id)
|
||
ON DELETE SET NULL,
|
||
|
||
CONSTRAINT fk_files_device
|
||
FOREIGN KEY (device_id) REFERENCES devices(id)
|
||
ON DELETE CASCADE,
|
||
|
||
UNIQUE KEY uniq_files_device_path_md5 (device_id, path_md5),
|
||
INDEX idx_files_folder (folder_id),
|
||
INDEX idx_files_deleted (device_id, deleted)
|
||
) ENGINE=InnoDB
|
||
DEFAULT CHARSET=utf8mb4
|
||
""")
|
||
|
||
conn.commit()
|
||
return conn, cursor
|
||
|
||
# ======================================================
|
||
# 👤 DEVICE + FOLDERS HELPERS
|
||
# ======================================================
|
||
|
||
def get_or_create_device(cursor, conn, device_name: str) -> int:
|
||
now = datetime.now()
|
||
cursor.execute(
|
||
"INSERT IGNORE INTO devices (name, scanned_at) VALUES (%s, %s)",
|
||
(device_name, now)
|
||
)
|
||
conn.commit()
|
||
|
||
cursor.execute("SELECT id FROM devices WHERE name=%s", (device_name,))
|
||
return cursor.fetchone()[0]
|
||
|
||
|
||
def load_folder_cache(cursor, device_id: int):
|
||
cursor.execute(
|
||
"SELECT id, path FROM folders WHERE device_id=%s",
|
||
(device_id,)
|
||
)
|
||
return {path: folder_id for folder_id, path in cursor.fetchall()}
|
||
|
||
|
||
def get_or_create_folder(cursor, conn, folder_cache, device_id, folder_path, parent_path, now):
|
||
if folder_path in folder_cache:
|
||
folder_id = folder_cache[folder_path]
|
||
cursor.execute("UPDATE folders SET last_seen=%s WHERE id=%s", (now, folder_id))
|
||
return folder_id
|
||
|
||
parent_id = folder_cache.get(parent_path)
|
||
|
||
cursor.execute("""
|
||
INSERT INTO folders (path, parent_id, device_id, first_seen, last_seen)
|
||
VALUES (%s, %s, %s, %s, %s)
|
||
""", (folder_path, parent_id, device_id, now, now))
|
||
|
||
folder_id = cursor.lastrowid
|
||
folder_cache[folder_path] = folder_id
|
||
return folder_id
|
||
|
||
# ======================================================
|
||
# 📂 FILES – LOAD LAST STATE
|
||
# ======================================================
|
||
|
||
def load_last_file_state(cursor, device_id: int):
|
||
cursor.execute("""
|
||
SELECT f.id, f.path_md5, f.deleted, f.size, f.modified
|
||
FROM files f
|
||
JOIN (
|
||
SELECT MAX(id) AS max_id
|
||
FROM files
|
||
WHERE device_id = %s
|
||
GROUP BY path_md5
|
||
) latest ON f.id = latest.max_id
|
||
WHERE f.device_id = %s
|
||
""", (device_id, device_id))
|
||
|
||
state = {}
|
||
for file_id, path_md5, deleted, size, modified in cursor.fetchall():
|
||
state[path_md5] = {
|
||
"id": file_id,
|
||
"deleted": int(deleted),
|
||
"size": size,
|
||
"modified": modified
|
||
}
|
||
return state
|
||
|
||
|
||
# ======================================================
|
||
# 🚶 MAIN WALK LOGIC
|
||
# ======================================================
|
||
|
||
def walk_and_store_bulk():
|
||
updated_debug = []
|
||
target_dir = r"u:\Dropbox\Ordinace\Dokumentace_ke_zpracování"
|
||
device_name = "Z230"
|
||
|
||
if not os.path.isdir(target_dir):
|
||
print("Invalid directory:", target_dir)
|
||
return
|
||
|
||
conn, cursor = init_db()
|
||
now = datetime.now()
|
||
|
||
device_id = get_or_create_device(cursor, conn, device_name)
|
||
folder_cache = load_folder_cache(cursor, device_id)
|
||
last_state = load_last_file_state(cursor, device_id)
|
||
|
||
seen_md5 = set()
|
||
|
||
files_to_insert = []
|
||
files_to_update_existing = []
|
||
files_to_mark_deleted = []
|
||
|
||
total_files = 0
|
||
print(f"🔍 Scanning: {target_dir} (device {device_id})")
|
||
|
||
for root, dirs, files in os.walk(target_dir):
|
||
|
||
folder_path = os.path.normpath(root)
|
||
parent_path = os.path.normpath(os.path.dirname(root)) if root != target_dir else None
|
||
folder_id = get_or_create_folder(cursor, conn, folder_cache, device_id, folder_path, parent_path, now)
|
||
|
||
for name in files:
|
||
total_files += 1
|
||
|
||
file_path = os.path.normpath(os.path.join(root, name))
|
||
file_md5 = md5_path(file_path)
|
||
seen_md5.add(file_md5)
|
||
|
||
try:
|
||
stats = os.stat(file_path)
|
||
except FileNotFoundError:
|
||
continue
|
||
|
||
modified = datetime.fromtimestamp(stats.st_mtime).replace(microsecond=0)
|
||
size = stats.st_size
|
||
ext = os.path.splitext(name)[1]
|
||
|
||
prev = last_state.get(file_md5)
|
||
|
||
if prev is None:
|
||
# New file
|
||
files_to_insert.append(
|
||
(name, file_path, file_md5, size, modified, ext,
|
||
folder_id, device_id, 0, now, now)
|
||
)
|
||
else:
|
||
if prev["deleted"] == 1:
|
||
# Reappeared file → new row
|
||
files_to_insert.append(
|
||
(name, file_path, file_md5, size, modified, ext,
|
||
folder_id, device_id, 0, now, now)
|
||
)
|
||
else:
|
||
# Existing & not deleted
|
||
# Only update if size or modified timestamp CHANGED
|
||
if prev["size"] != size or prev["modified"] != modified:
|
||
files_to_update_existing.append(
|
||
(size, modified, now, prev["id"])
|
||
)
|
||
|
||
updated_debug.append({
|
||
"path": file_path,
|
||
"old_size": prev["size"],
|
||
"new_size": size,
|
||
"old_modified": prev["modified"],
|
||
"new_modified": modified
|
||
})
|
||
|
||
if total_files % 1000 == 0:
|
||
print(f" ... processed {total_files} files")
|
||
|
||
# Mark missing files as deleted
|
||
for md5_hash, info in last_state.items():
|
||
if info["deleted"] == 0 and md5_hash not in seen_md5:
|
||
files_to_mark_deleted.append((now, info["id"]))
|
||
|
||
# ==================================================
|
||
# 💾 APPLY CHANGES
|
||
# ==================================================
|
||
|
||
if files_to_insert:
|
||
cursor.executemany("""
|
||
INSERT INTO files (
|
||
name, path, path_md5, size, modified, type,
|
||
folder_id, device_id, deleted,
|
||
first_seen, last_seen
|
||
)
|
||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
|
||
""", files_to_insert)
|
||
|
||
if files_to_update_existing:
|
||
cursor.executemany("""
|
||
UPDATE files
|
||
SET size=%s,
|
||
modified=%s,
|
||
last_seen=%s,
|
||
deleted=0
|
||
WHERE id=%s
|
||
""", files_to_update_existing)
|
||
|
||
if files_to_mark_deleted:
|
||
cursor.executemany("""
|
||
UPDATE files
|
||
SET deleted=1,
|
||
last_seen=%s
|
||
WHERE id=%s
|
||
""", files_to_mark_deleted)
|
||
|
||
cursor.execute("UPDATE devices SET scanned_at=%s WHERE id=%s", (now, device_id))
|
||
conn.commit()
|
||
cursor.close()
|
||
conn.close()
|
||
|
||
if updated_debug:
|
||
print("\n📌 Updated files:")
|
||
for info in updated_debug:
|
||
print(f"- {info['path']}")
|
||
print(f" size: {info['old_size']} → {info['new_size']}")
|
||
print(f" modified: {info['old_modified']} → {info['new_modified']}")
|
||
print("✅ Scan completed.")
|
||
print(" Total files:", total_files)
|
||
print(" Inserted:", len(files_to_insert))
|
||
print(" Updated:", len(files_to_update_existing))
|
||
print(" Marked deleted:", len(files_to_mark_deleted))
|
||
|
||
|
||
# ======================================================
|
||
# 🔚 MAIN
|
||
# ======================================================
|
||
|
||
if __name__ == '__main__':
|
||
walk_and_store_bulk()
|