#!/usr/bin/env python3
# -*- coding: utf-8 -*-
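"""
Walk a target directory tree and mirror its state into a local MySQL
database named 'walkfiles': one row per device, folder and file, with
first_seen/last_seen timestamps, change detection by size/mtime, and
soft-deletion of files that have disappeared since the previous scan.
"""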
import os
import hashlib
from datetime import datetime
from pathlib import Path

import mysql.connector
from dotenv import load_dotenv
# Always load .env from the folder where THIS script is stored
env_path = Path(__file__).resolve().parent / ".env"
load_dotenv(env_path)
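# The .env file is expected to define the variables read below, e.g.
# (placeholder values, adjust to your MySQL setup):
#   DB_MYSQL_HOST=127.0.0.1
#   DB_MYSQL_PORT=3306
#   DB_MYSQL_ROOT=root
#   DB_MYSQL_ROOT_PASS=<your password>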
# ======================================================
# 🔧 Helper: MD5 of full path
# ======================================================
def md5_path(path: str) -> str:
    return hashlib.md5(path.encode("utf8")).hexdigest()
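# The 32-char hex digest gives a fixed-length key for (device_id, path_md5)
# lookups and indexing, instead of comparing the full VARCHAR(2048) path.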
# ======================================================
# 🔧 DB CONNECTION HELPERS
# ======================================================
def get_server_connection():
    """Connect to the MySQL server WITHOUT selecting a database."""
    conn = mysql.connector.connect(
        host=os.getenv("DB_MYSQL_HOST", "127.0.0.1"),
        user=os.getenv("DB_MYSQL_ROOT", "root"),
        password=os.getenv("DB_MYSQL_ROOT_PASS", ""),
        port=int(os.getenv("DB_MYSQL_PORT", "3306")),
        auth_plugin="mysql_native_password",
    )
    return conn


def get_db_connection():
    """Connect to the 'walkfiles' database."""
    conn = mysql.connector.connect(
        host=os.getenv("DB_MYSQL_HOST", "127.0.0.1"),
        user=os.getenv("DB_MYSQL_ROOT", "root"),
        password=os.getenv("DB_MYSQL_ROOT_PASS", ""),
        port=int(os.getenv("DB_MYSQL_PORT", "3306")),
        database="walkfiles",
        auth_plugin="mysql_native_password",
    )
    cursor = conn.cursor()
    cursor.execute("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
    cursor.close()
    return conn
# ======================================================
# 🗄 DB INITIALIZATION
# ======================================================
def init_db():
    # 1) Ensure DB exists
    server_conn = get_server_connection()
    cur = server_conn.cursor()
    cur.execute(
        "CREATE DATABASE IF NOT EXISTS walkfiles "
        "DEFAULT CHARACTER SET utf8mb4 "
        "COLLATE utf8mb4_general_ci"
    )
    server_conn.commit()
    cur.close()
    server_conn.close()

    # 2) Connect
    conn = get_db_connection()
    cursor = conn.cursor()

    # Devices
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS devices (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci UNIQUE,
            scanned_at DATETIME NULL
        ) ENGINE=InnoDB
          DEFAULT CHARSET=utf8mb4
    """)

    # Folders
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS folders (
            id INT AUTO_INCREMENT PRIMARY KEY,
            path VARCHAR(2048) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
            parent_id INT NULL,
            device_id INT NOT NULL,
            first_seen DATETIME NOT NULL,
            last_seen DATETIME NOT NULL,
            CONSTRAINT fk_folders_device
                FOREIGN KEY (device_id) REFERENCES devices(id)
                ON DELETE CASCADE,
            UNIQUE KEY uniq_folders_device_path (device_id, path(255)),
            INDEX idx_folders_device (device_id)
        ) ENGINE=InnoDB
          DEFAULT CHARSET=utf8mb4
    """)

    # Files
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS files (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
            path VARCHAR(2048) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
            path_md5 CHAR(32) NOT NULL,
            size BIGINT NULL,
            modified DATETIME NULL,
            type VARCHAR(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL,
            folder_id INT NULL,
            device_id INT NOT NULL,
            deleted TINYINT(1) NOT NULL DEFAULT 0,
            first_seen DATETIME NOT NULL,
            last_seen DATETIME NOT NULL,
            CONSTRAINT fk_files_folder
                FOREIGN KEY (folder_id) REFERENCES folders(id)
                ON DELETE SET NULL,
            CONSTRAINT fk_files_device
                FOREIGN KEY (device_id) REFERENCES devices(id)
                ON DELETE CASCADE,
            -- NOT unique: a file that reappears after being marked deleted
            -- gets a new row (see walk_and_store_bulk), so (device_id, path_md5)
            -- can repeat; the newest id wins in load_last_file_state()
            INDEX idx_files_device_path_md5 (device_id, path_md5),
            INDEX idx_files_folder (folder_id),
            INDEX idx_files_deleted (device_id, deleted)
        ) ENGINE=InnoDB
          DEFAULT CHARSET=utf8mb4
    """)

    conn.commit()
    return conn, cursor
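# Schema notes (as created above): devices 1→n folders 1→n files; files also
# carry device_id directly. File history is append-style per (device_id,
# path_md5): the row with the highest id is treated as the current state.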
# ======================================================
# 👤 DEVICE + FOLDERS HELPERS
# ======================================================
def get_or_create_device(cursor, conn, device_name: str) -> int:
    now = datetime.now()
    cursor.execute(
        "INSERT IGNORE INTO devices (name, scanned_at) VALUES (%s, %s)",
        (device_name, now)
    )
    conn.commit()
    cursor.execute("SELECT id FROM devices WHERE name=%s", (device_name,))
    return cursor.fetchone()[0]


def load_folder_cache(cursor, device_id: int):
    cursor.execute(
        "SELECT id, path FROM folders WHERE device_id=%s",
        (device_id,)
    )
    return {path: folder_id for folder_id, path in cursor.fetchall()}


def get_or_create_folder(cursor, conn, folder_cache, device_id, folder_path, parent_path, now):
    if folder_path in folder_cache:
        folder_id = folder_cache[folder_path]
        cursor.execute("UPDATE folders SET last_seen=%s WHERE id=%s", (now, folder_id))
        return folder_id
    parent_id = folder_cache.get(parent_path)
    cursor.execute("""
        INSERT INTO folders (path, parent_id, device_id, first_seen, last_seen)
        VALUES (%s, %s, %s, %s, %s)
    """, (folder_path, parent_id, device_id, now, now))
    folder_id = cursor.lastrowid
    folder_cache[folder_path] = folder_id
    return folder_id
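# Note: os.walk() is top-down by default, so a folder's parent has already
# been visited (and cached) before its children, which is why
# folder_cache.get(parent_path) can resolve parent_id without extra queries.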
# ======================================================
# 📂 FILES LOAD LAST STATE
# ======================================================
def load_last_file_state(cursor, device_id: int):
    cursor.execute("""
        SELECT f.id, f.path_md5, f.deleted, f.size, f.modified
        FROM files f
        JOIN (
            SELECT MAX(id) AS max_id
            FROM files
            WHERE device_id = %s
            GROUP BY path_md5
        ) latest ON f.id = latest.max_id
        WHERE f.device_id = %s
    """, (device_id, device_id))
    state = {}
    for file_id, path_md5, deleted, size, modified in cursor.fetchall():
        state[path_md5] = {
            "id": file_id,
            "deleted": int(deleted),
            "size": size,
            "modified": modified
        }
    return state
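# Returned shape (illustrative values only):
#   {"9e107d9d…": {"id": 42, "deleted": 0, "size": 1234,
#                  "modified": datetime(2025, 1, 1, 12, 0)}, ...}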
# ======================================================
# 🚶 MAIN WALK LOGIC
# ======================================================
def walk_and_store_bulk():
    updated_debug = []
    target_dir = r"u:\Dropbox\Ordinace\Dokumentace_ke_zpracování"
    device_name = "Z230"

    if not os.path.isdir(target_dir):
        print("Invalid directory:", target_dir)
        return

    conn, cursor = init_db()
    now = datetime.now()
    device_id = get_or_create_device(cursor, conn, device_name)
    folder_cache = load_folder_cache(cursor, device_id)
    last_state = load_last_file_state(cursor, device_id)

    seen_md5 = set()
    files_to_insert = []
    files_to_update_existing = []
    files_to_mark_deleted = []
    total_files = 0

    print(f"🔍 Scanning: {target_dir} (device {device_id})")
    for root, dirs, files in os.walk(target_dir):
        folder_path = os.path.normpath(root)
        parent_path = os.path.normpath(os.path.dirname(root)) if root != target_dir else None
        folder_id = get_or_create_folder(cursor, conn, folder_cache, device_id, folder_path, parent_path, now)

        for name in files:
            total_files += 1
            file_path = os.path.normpath(os.path.join(root, name))
            file_md5 = md5_path(file_path)
            seen_md5.add(file_md5)

            try:
                stats = os.stat(file_path)
            except FileNotFoundError:
                continue

            modified = datetime.fromtimestamp(stats.st_mtime).replace(microsecond=0)
            size = stats.st_size
            ext = os.path.splitext(name)[1]

            prev = last_state.get(file_md5)
            if prev is None:
                # New file
                files_to_insert.append(
                    (name, file_path, file_md5, size, modified, ext,
                     folder_id, device_id, 0, now, now)
                )
            else:
                if prev["deleted"] == 1:
                    # Reappeared file → new row
                    files_to_insert.append(
                        (name, file_path, file_md5, size, modified, ext,
                         folder_id, device_id, 0, now, now)
                    )
                else:
                    # Existing & not deleted
                    # Only update if size or modified timestamp CHANGED
                    if prev["size"] != size or prev["modified"] != modified:
                        files_to_update_existing.append(
                            (size, modified, now, prev["id"])
                        )
                        updated_debug.append({
                            "path": file_path,
                            "old_size": prev["size"],
                            "new_size": size,
                            "old_modified": prev["modified"],
                            "new_modified": modified
                        })

            if total_files % 1000 == 0:
                print(f" ... processed {total_files} files")

    # Mark missing files as deleted
    for md5_hash, info in last_state.items():
        if info["deleted"] == 0 and md5_hash not in seen_md5:
            files_to_mark_deleted.append((now, info["id"]))

    # ==================================================
    # 💾 APPLY CHANGES
    # ==================================================
    if files_to_insert:
        cursor.executemany("""
            INSERT INTO files (
                name, path, path_md5, size, modified, type,
                folder_id, device_id, deleted,
                first_seen, last_seen
            )
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """, files_to_insert)

    if files_to_update_existing:
        cursor.executemany("""
            UPDATE files
            SET size=%s,
                modified=%s,
                last_seen=%s,
                deleted=0
            WHERE id=%s
        """, files_to_update_existing)

    if files_to_mark_deleted:
        cursor.executemany("""
            UPDATE files
            SET deleted=1,
                last_seen=%s
            WHERE id=%s
        """, files_to_mark_deleted)

    cursor.execute("UPDATE devices SET scanned_at=%s WHERE id=%s", (now, device_id))
    conn.commit()
    cursor.close()
    conn.close()

    if updated_debug:
        print("\n📌 Updated files:")
        for info in updated_debug:
            print(f"- {info['path']}")
            print(f"  size: {info['old_size']} → {info['new_size']}")
            print(f"  modified: {info['old_modified']} → {info['new_modified']}")

    print("✅ Scan completed.")
    print(" Total files:", total_files)
    print(" Inserted:", len(files_to_insert))
    print(" Updated:", len(files_to_update_existing))
    print(" Marked deleted:", len(files_to_mark_deleted))
# ======================================================
# 🔚 MAIN
# ======================================================
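# Typical use: run this script repeatedly (e.g. from a scheduler); each run is
# incremental, writing rows only for new, changed, reappeared or deleted files
# and refreshing devices.scanned_at for the scanned device.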
if __name__ == '__main__':
    walk_and_store_bulk()