This commit is contained in:
2025-12-10 14:05:31 +01:00
parent d1b89d8e34
commit 8b7f99ca82
3 changed files with 580 additions and 48 deletions

485
23 WalkadnSave.py Normal file
View File

@@ -0,0 +1,485 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import hashlib
from datetime import datetime
import mysql.connector
from mysql.connector import Error
from dotenv import load_dotenv
from pathlib import Path
import unicodedata
# ======================================================
# Load .env from the script directory
# ======================================================
# Resolve the .env file sitting next to this script (not the CWD) and load it,
# so the DB_MYSQL_* variables below are available via os.getenv().
env_path = Path(__file__).resolve().parent / ".env"
load_dotenv(env_path)
# ======================================================
# Helper: MD5 of full file path string
# ======================================================
def md5_path(path: str) -> str:
    """Return the hex MD5 digest of a full path string (UTF-8 encoded).

    Used as a compact, fixed-width key for file paths in the DB.
    """
    encoded = path.encode("utf8")
    return hashlib.md5(encoded).hexdigest()
# ======================================================
# MySQL CONNECTIONS
# ======================================================
def get_server_connection():
    """Open a server-level MySQL connection with no database selected.

    Credentials and host/port come from the .env-provided environment
    variables; used by init_db() to create the database itself.
    """
    return mysql.connector.connect(
        host=os.getenv("DB_MYSQL_HOST"),
        user=os.getenv("DB_MYSQL_ROOT"),
        password=os.getenv("DB_MYSQL_ROOT_PASS"),
        # BUGFIX: int(None) raised TypeError when DB_MYSQL_PORT was unset;
        # fall back to MySQL's standard port instead.
        port=int(os.getenv("DB_MYSQL_PORT", "3306")),
        auth_plugin="mysql_native_password",
    )
def get_db_connection():
    """Open a connection to the `walkfiles` database.

    Forces the session charset/collation to utf8mb4 so paths containing
    non-ASCII characters round-trip correctly.
    """
    conn = mysql.connector.connect(
        host=os.getenv("DB_MYSQL_HOST"),
        user=os.getenv("DB_MYSQL_ROOT"),
        password=os.getenv("DB_MYSQL_ROOT_PASS"),
        # BUGFIX: int(None) raised TypeError when DB_MYSQL_PORT was unset;
        # fall back to MySQL's standard port instead.
        port=int(os.getenv("DB_MYSQL_PORT", "3306")),
        database="walkfiles",
        auth_plugin="mysql_native_password",
    )
    c = conn.cursor()
    try:
        c.execute("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
    finally:
        # Always release the session-setup cursor, even if SET NAMES fails.
        c.close()
    return conn
# ======================================================
# DATABASE INITIALIZATION
# ======================================================
def init_db():
    """Create the `walkfiles` database and its tables if they don't exist.

    Returns an open (connection, cursor) pair on the `walkfiles` database;
    the caller is responsible for closing both.
    """
    # Ensure the database itself exists (server-level connection, no schema).
    # BUGFIX: wrap in try/finally so the server connection is not leaked
    # when CREATE DATABASE raises (e.g. insufficient privileges).
    server = get_server_connection()
    try:
        cur = server.cursor()
        try:
            cur.execute("""
                CREATE DATABASE IF NOT EXISTS walkfiles
                DEFAULT CHARACTER SET utf8mb4
                COLLATE utf8mb4_general_ci
            """)
            server.commit()
        finally:
            cur.close()
    finally:
        server.close()

    # Connect to the (now guaranteed) walkfiles database.
    conn = get_db_connection()
    cursor = conn.cursor()

    # DEVICES — one row per scanned machine/share.
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS devices (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255) UNIQUE,
            scanned_at DATETIME NULL
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
    """)

    # FOLDERS — path is prefixed-unique per device (path(255) index limit).
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS folders (
            id INT AUTO_INCREMENT PRIMARY KEY,
            path VARCHAR(2048) NOT NULL,
            parent_id INT NULL,
            device_id INT NOT NULL,
            first_seen DATETIME NOT NULL,
            last_seen DATETIME NOT NULL,
            deleted TINYINT(1) NOT NULL DEFAULT 0,
            CONSTRAINT fk_folder_device
                FOREIGN KEY (device_id) REFERENCES devices(id)
                ON DELETE CASCADE,
            UNIQUE KEY uniq_folder_path (device_id, path(255)),
            INDEX idx_folder_dev (device_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
    """)

    # FILES — uniqueness enforced on (device_id, path_md5), not raw path.
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS files (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255) NOT NULL,
            path VARCHAR(2048) NOT NULL,
            path_md5 CHAR(32) NOT NULL,
            size BIGINT NULL,
            modified DATETIME NULL,
            type VARCHAR(255) NULL,
            folder_id INT NULL,
            device_id INT NOT NULL,
            deleted TINYINT(1) NOT NULL DEFAULT 0,
            first_seen DATETIME NOT NULL,
            last_seen DATETIME NOT NULL,
            CONSTRAINT fk_file_folder
                FOREIGN KEY (folder_id) REFERENCES folders(id)
                ON DELETE SET NULL,
            CONSTRAINT fk_file_device
                FOREIGN KEY (device_id) REFERENCES devices(id)
                ON DELETE CASCADE,
            UNIQUE KEY uniq_file_path_md5 (device_id, path_md5),
            INDEX idx_file_folder (folder_id),
            INDEX idx_file_deleted (device_id, deleted)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
    """)

    conn.commit()
    return conn, cursor
# ======================================================
# HELPERS — DEVICES & FOLDERS
# ======================================================
def get_or_create_device(cursor, conn, name: str) -> int:
    """Insert the device row if it is new, then return its id.

    INSERT IGNORE makes the call idempotent against the UNIQUE name key;
    scanned_at is only set at first creation here.
    """
    cursor.execute(
        "INSERT IGNORE INTO devices (name, scanned_at) VALUES (%s,%s)",
        (name, datetime.now()),
    )
    conn.commit()
    cursor.execute("SELECT id FROM devices WHERE name=%s", (name,))
    (device_id,) = cursor.fetchone()
    return device_id
def load_folder_state(cursor, device_id: int):
    """Load every folder row for a device into an in-memory cache.

    Returns a dict mapping the normalized folder path to
    {"id": <row id>, "deleted": 0/1}.
    """
    cursor.execute("""
        SELECT id, path, deleted
        FROM folders
        WHERE device_id=%s
    """, (device_id,))
    return {
        os.path.normpath(path): {"id": fid, "deleted": int(flag)}
        for fid, path, flag in cursor.fetchall()
    }
def get_or_create_folder(cursor, conn, folder_state, device_id, folder_path, parent_id):
    """Find or create a folder row and return its id.

    Handles:
      - Unicode NFC normalization (so visually identical paths compare equal)
      - the in-memory cache (folder_state) to avoid repeated queries
      - idempotent INSERT (ON DUPLICATE KEY UPDATE)
      - BUGFIX: a folder previously marked deleted=1 that reappears on disk
        is now resurrected (deleted reset to 0). The old code returned its
        id from the cache/SELECT without ever clearing the flag, so a
        recreated folder stayed "deleted" in the DB forever.
    """
    # Normalize Unicode + path form before using it as a key anywhere.
    folder_path = unicodedata.normalize("NFC", folder_path)
    folder_path = os.path.normpath(folder_path)
    key = folder_path
    now = datetime.now()

    # 1) Cache hit
    if key in folder_state:
        entry = folder_state[key]
        if entry["deleted"]:
            # Folder reappeared after being marked deleted -> undelete in DB.
            cursor.execute(
                "UPDATE folders SET deleted=0, last_seen=%s WHERE id=%s",
                (now, entry["id"]),
            )
            conn.commit()
            entry["deleted"] = 0
        return entry["id"]

    # 2) Try a SELECT first (fetch deleted too, so we can resurrect).
    cursor.execute("""
        SELECT id, deleted
        FROM folders
        WHERE device_id = %s AND path = %s
        LIMIT 1
    """, (device_id, folder_path))
    row = cursor.fetchone()
    if row:
        folder_id = row[0]
        if int(row[1]):
            cursor.execute(
                "UPDATE folders SET deleted=0, last_seen=%s WHERE id=%s",
                (now, folder_id),
            )
            conn.commit()
        folder_state[key] = {"id": folder_id, "deleted": 0}
        return folder_id

    # 3) INSERT (idempotent). LAST_INSERT_ID(id) makes lastrowid return the
    # existing row's id on a duplicate-key hit; deleted=0 covers a row that
    # slipped in (or was flagged deleted) between the SELECT and the INSERT.
    cursor.execute("""
        INSERT INTO folders (path, parent_id, device_id, first_seen, last_seen)
        VALUES (%s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE
            id = LAST_INSERT_ID(id),
            last_seen = VALUES(last_seen),
            deleted = 0
    """, (folder_path, parent_id, device_id, now, now))
    conn.commit()
    folder_id = cursor.lastrowid
    folder_state[key] = {"id": folder_id, "deleted": 0}
    return folder_id
# ======================================================
# LOAD LAST FILE STATE
# ======================================================
def load_last_file_state(cursor, device_id: int):
    """Load the last known state of every file on a device, keyed by path_md5.

    For historical reasons this picks the MAX(id) row per path_md5, even
    though the UNIQUE key on (device_id, path_md5) means at most one row
    exists per hash anyway.
    """
    cursor.execute("""
        SELECT f.id, f.path_md5, f.deleted, f.size, f.modified
        FROM files f
        JOIN (
            SELECT MAX(id) AS mx
            FROM files
            WHERE device_id=%s
            GROUP BY path_md5
        ) t ON f.id = t.mx
    """, (device_id,))
    return {
        path_md5: {
            "id": row_id,
            "deleted": int(deleted_flag),
            "size": size,
            "modified": modified,
        }
        for row_id, path_md5, deleted_flag, size, modified in cursor.fetchall()
    }
# ======================================================
# MAIN SCANNER WITH BATCHING
# ======================================================
def _escape_like(literal: str) -> str:
    """Escape MySQL LIKE metacharacters in a literal prefix.

    Backslash is both the LIKE escape character and the separator in UNC
    paths, so it must be doubled; % and _ are wildcards.
    """
    return literal.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")


def walk_and_store_bulk():
    """Scan target_dir and synchronize its files/folders into the DB.

    - new files are INSERTed in batches of BATCH_SIZE
    - files whose size or mtime changed are UPDATEd in batches
    - files/folders no longer present in this subtree are marked deleted=1
    """
    BATCH_SIZE = 10000
    # target_dir = r"\\tower1\#colddata"
    # target_dir = r"z:"
    target_dir = r"\\tower\ebooks"
    # target_dir = r"\\tower\dedup"
    device_name = "TOWER"

    # Normalized root used for comparisons and the SQL LIKE prefix.
    target_dir_norm = os.path.normpath(target_dir)
    if not os.path.isdir(target_dir):
        print("Invalid directory:", target_dir)
        return

    conn, cursor = init_db()
    now = datetime.now()
    device_id = get_or_create_device(cursor, conn, device_name)
    folder_state = load_folder_state(cursor, device_id)
    file_state = load_last_file_state(cursor, device_id)

    seen_folders = set()
    seen_files = set()  # MD5 of full path
    files_to_insert = []
    files_to_update = []
    total_files = 0
    # BUGFIX: the final summary used len(files_to_insert)/len(files_to_update),
    # which are 0 after the batch flushes clear() them — keep real counters.
    inserted_count = 0
    updated_count = 0

    # Shared SQL for batch and final flushes (keeps the two paths identical).
    insert_sql = """
        INSERT INTO files (
            name, path, path_md5, size, modified, type,
            folder_id, device_id, deleted,
            first_seen, last_seen
        )
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    update_sql = """
        UPDATE files
        SET size=%s, modified=%s, last_seen=%s, deleted=0
        WHERE id=%s
    """

    print(f"🔍 Scanning: {target_dir} (device {device_id})")

    # -------------------------------------------------
    # WALK FILESYSTEM
    # -------------------------------------------------
    for root, dirs, files in os.walk(target_dir):
        folder_path = os.path.normpath(root)

        # 1) determine parent_id correctly (the scan root has no parent)
        if folder_path == target_dir_norm:
            parent_id = None
        else:
            parent_folder_path = os.path.normpath(os.path.dirname(folder_path))
            parent_id = get_or_create_folder(cursor, conn, folder_state,
                                             device_id, parent_folder_path,
                                             None)

        # 2) now insert the current folder with the correct parent_id
        seen_folders.add(folder_path)
        folder_id = get_or_create_folder(cursor, conn, folder_state,
                                         device_id, folder_path,
                                         parent_id)

        # -------------------------------------------------
        # FILE LOOP
        # -------------------------------------------------
        for name in files:
            total_files += 1
            filepath = os.path.normpath(os.path.join(folder_path, name))
            md5 = md5_path(filepath)
            seen_files.add(md5)
            try:
                st = os.stat(filepath)
            except OSError:
                # BUGFIX: was FileNotFoundError only — a permission error or
                # too-long path aborted the whole scan. Skip the file instead.
                continue

            modified = datetime.fromtimestamp(st.st_mtime).replace(microsecond=0)
            size = st.st_size
            ext = os.path.splitext(name)[1][:250]

            prev = file_state.get(md5)
            if prev is None:
                # brand-new file
                files_to_insert.append(
                    (name, filepath, md5, size, modified, ext,
                     folder_id, device_id, 0, now, now)
                )
            elif prev["deleted"] == 1:
                # BUGFIX: a "resurrected" file already has a row with this
                # (device_id, path_md5) — re-INSERTing it violated the
                # uniq_file_path_md5 UNIQUE key. UPDATE the existing row
                # instead; update_sql also resets deleted=0.
                files_to_update.append((size, modified, now, prev["id"]))
            elif prev["size"] != size or prev["modified"] != modified:
                # existing live file whose size or mtime changed
                files_to_update.append((size, modified, now, prev["id"]))

            # -------------------------------------------------
            # BATCH FLUSHING
            # -------------------------------------------------
            if len(files_to_insert) >= BATCH_SIZE:
                print(f"💾 Flushing {len(files_to_insert)} inserts...")
                cursor.executemany(insert_sql, files_to_insert)
                conn.commit()
                inserted_count += len(files_to_insert)
                files_to_insert.clear()

            if len(files_to_update) >= BATCH_SIZE:
                print(f"💾 Flushing {len(files_to_update)} updates...")
                cursor.executemany(update_sql, files_to_update)
                conn.commit()
                updated_count += len(files_to_update)
                files_to_update.clear()

            # PROGRESS
            if total_files % 1000 == 0:
                print(f"  ... processed {total_files} files")

    # -------------------------------------------------
    # FINAL FLUSH (REMAINING INSERTS/UPDATES)
    # -------------------------------------------------
    if files_to_insert:
        print(f"💾 Final flush: {len(files_to_insert)} inserts")
        cursor.executemany(insert_sql, files_to_insert)
        conn.commit()
        inserted_count += len(files_to_insert)

    if files_to_update:
        print(f"💾 Final flush: {len(files_to_update)} updates")
        cursor.executemany(update_sql, files_to_update)
        conn.commit()
        updated_count += len(files_to_update)

    # -------------------------------------------------
    # MARK DELETED FILES — ONLY IN THIS SUBTREE
    # -------------------------------------------------
    files_deleted_count = 0
    # BUGFIX: the raw prefix was fed to LIKE unescaped — in MySQL the
    # backslashes of a UNC path act as LIKE escape characters and %/_ are
    # wildcards, so the pattern matched the wrong rows. Also anchor the
    # prefix on a path separator so \\tower\ebooks doesn't match a sibling
    # share like \\tower\ebooks2.
    subtree_prefix = target_dir_norm.rstrip("\\/") + os.sep
    like_prefix = _escape_like(subtree_prefix) + "%"
    cursor.execute("""
        SELECT id, path_md5
        FROM files
        WHERE device_id = %s
          AND deleted = 0
          AND path LIKE %s
    """, (device_id, like_prefix))
    candidates = cursor.fetchall()
    ids_to_delete = [fid for (fid, md5) in candidates if md5 not in seen_files]
    if ids_to_delete:
        print(f"💾 Marking {len(ids_to_delete)} files as deleted in subtree")
        cursor.executemany("""
            UPDATE files
            SET deleted=1, last_seen=%s
            WHERE id=%s
        """, [(now, fid) for fid in ids_to_delete])
        conn.commit()
        files_deleted_count = len(ids_to_delete)

    # -------------------------------------------------
    # MARK DELETED FOLDERS — ONLY IN THIS SUBTREE
    # -------------------------------------------------
    folders_to_mark_deleted = []
    for path, info in folder_state.items():
        norm_path = os.path.normpath(path)
        # BUGFIX: plain startswith(target_dir_norm) also matched sibling
        # folders whose name merely starts with the root's name; require
        # either the root itself or a separator boundary.
        in_subtree = (norm_path == target_dir_norm
                      or norm_path.startswith(subtree_prefix))
        if not in_subtree:
            continue
        if info["deleted"] == 0 and norm_path not in seen_folders:
            folders_to_mark_deleted.append((now, info["id"]))

    folders_deleted_count = 0
    if folders_to_mark_deleted:
        cursor.executemany("""
            UPDATE folders
            SET deleted=1, last_seen=%s
            WHERE id=%s
        """, folders_to_mark_deleted)
        conn.commit()
        folders_deleted_count = len(folders_to_mark_deleted)

    # -------------------------------------------------
    # Update device timestamp
    # -------------------------------------------------
    cursor.execute("UPDATE devices SET scanned_at=%s WHERE id=%s", (now, device_id))
    conn.commit()
    cursor.close()
    conn.close()

    print("")
    print("✅ Scan completed.")
    print("   Total files scanned:", total_files)
    print("   Files inserted:", inserted_count)
    print("   Files updated:", updated_count)
    print("   Files deleted in subtree:", files_deleted_count)
    print("   Folders deleted in subtree:", folders_deleted_count)
# ======================================================
# MAIN ENTRY
# ======================================================
if __name__ == '__main__':
    # Entry point: run a full scan when executed directly as a script.
    walk_and_store_bulk()