Files
walkfiles/23 WalkadnSave.py
2025-12-10 14:05:31 +01:00

486 lines
15 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
import hashlib
from datetime import datetime
import mysql.connector
from mysql.connector import Error
from dotenv import load_dotenv
from pathlib import Path
import unicodedata
# ------------------------------------------------------
# Load environment variables from the .env file that sits
# next to this script (not the current working directory).
# ------------------------------------------------------
env_path = Path(__file__).resolve().parent / ".env"
load_dotenv(env_path)
# ======================================================
# Helper: MD5 of full file path string
# ======================================================
def md5_path(path: str) -> str:
    """Return the hex MD5 digest of the UTF-8 encoded path string."""
    digest = hashlib.md5(path.encode("utf8"))
    return digest.hexdigest()
# ======================================================
# MySQL CONNECTIONS
# ======================================================
def get_server_connection():
    """Connect to the MySQL server itself (no default database selected).

    Credentials come from environment variables loaded from .env.
    """
    conn_kwargs = {
        "host": os.getenv("DB_MYSQL_HOST"),
        "user": os.getenv("DB_MYSQL_ROOT"),
        "password": os.getenv("DB_MYSQL_ROOT_PASS"),
        "port": int(os.getenv("DB_MYSQL_PORT")),
        "auth_plugin": "mysql_native_password",
    }
    return mysql.connector.connect(**conn_kwargs)
def get_db_connection():
    """Connect to the `walkfiles` database and force utf8mb4 for the session."""
    conn_kwargs = {
        "host": os.getenv("DB_MYSQL_HOST"),
        "user": os.getenv("DB_MYSQL_ROOT"),
        "password": os.getenv("DB_MYSQL_ROOT_PASS"),
        "port": int(os.getenv("DB_MYSQL_PORT")),
        "database": "walkfiles",
        "auth_plugin": "mysql_native_password",
    }
    conn = mysql.connector.connect(**conn_kwargs)
    # Ensure the session speaks utf8mb4 regardless of server defaults.
    session_cur = conn.cursor()
    session_cur.execute("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
    session_cur.close()
    return conn
# ======================================================
# DATABASE INITIALIZATION
# ======================================================
def init_db():
    """Ensure the `walkfiles` database and its tables exist.

    Returns an open (connection, cursor) pair on the database.
    """
    # Create the database first, via a server-level connection.
    server = get_server_connection()
    cur = server.cursor()
    cur.execute("""
        CREATE DATABASE IF NOT EXISTS walkfiles
        DEFAULT CHARACTER SET utf8mb4
        COLLATE utf8mb4_general_ci
    """)
    server.commit()
    cur.close()
    server.close()

    conn = get_db_connection()
    cursor = conn.cursor()

    # Table DDL in dependency order: devices <- folders <- files.
    ddl_statements = (
        # DEVICES
        """
        CREATE TABLE IF NOT EXISTS devices (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255) UNIQUE,
            scanned_at DATETIME NULL
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """,
        # FOLDERS
        """
        CREATE TABLE IF NOT EXISTS folders (
            id INT AUTO_INCREMENT PRIMARY KEY,
            path VARCHAR(2048) NOT NULL,
            parent_id INT NULL,
            device_id INT NOT NULL,
            first_seen DATETIME NOT NULL,
            last_seen DATETIME NOT NULL,
            deleted TINYINT(1) NOT NULL DEFAULT 0,
            CONSTRAINT fk_folder_device
                FOREIGN KEY (device_id) REFERENCES devices(id)
                ON DELETE CASCADE,
            UNIQUE KEY uniq_folder_path (device_id, path(255)),
            INDEX idx_folder_dev (device_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """,
        # FILES
        """
        CREATE TABLE IF NOT EXISTS files (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255) NOT NULL,
            path VARCHAR(2048) NOT NULL,
            path_md5 CHAR(32) NOT NULL,
            size BIGINT NULL,
            modified DATETIME NULL,
            type VARCHAR(255) NULL,
            folder_id INT NULL,
            device_id INT NOT NULL,
            deleted TINYINT(1) NOT NULL DEFAULT 0,
            first_seen DATETIME NOT NULL,
            last_seen DATETIME NOT NULL,
            CONSTRAINT fk_file_folder
                FOREIGN KEY (folder_id) REFERENCES folders(id)
                ON DELETE SET NULL,
            CONSTRAINT fk_file_device
                FOREIGN KEY (device_id) REFERENCES devices(id)
                ON DELETE CASCADE,
            UNIQUE KEY uniq_file_path_md5 (device_id, path_md5),
            INDEX idx_file_folder (folder_id),
            INDEX idx_file_deleted (device_id, deleted)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """,
    )
    for ddl in ddl_statements:
        cursor.execute(ddl)
    conn.commit()
    return conn, cursor
# ======================================================
# HELPERS — DEVICES & FOLDERS
# ======================================================
def get_or_create_device(cursor, conn, name: str) -> int:
    """Insert the device row if it does not exist yet and return its id.

    INSERT IGNORE makes the call idempotent against the UNIQUE name key.
    """
    cursor.execute(
        "INSERT IGNORE INTO devices (name, scanned_at) VALUES (%s,%s)",
        (name, datetime.now()),
    )
    conn.commit()
    cursor.execute("SELECT id FROM devices WHERE name=%s", (name,))
    (device_id,) = cursor.fetchone()
    return device_id
def load_folder_state(cursor, device_id: int):
    """Load every folder row for the given device.

    Returns a dict keyed by the os.path.normpath-normalized folder path:
        folder_state[normalized_path] = {"id": <row id>, "deleted": 0/1}
    """
    cursor.execute(
        """
        SELECT id, path, deleted
        FROM folders
        WHERE device_id=%s
        """,
        (device_id,),
    )
    return {
        os.path.normpath(folder_path): {"id": folder_id, "deleted": int(flag)}
        for folder_id, folder_path, flag in cursor.fetchall()
    }
def get_or_create_folder(cursor, conn, folder_state, device_id, folder_path, parent_id):
    """Return the id of `folder_path` for this device, creating it if needed.

    Handles:
      - Unicode NFC normalization, so visually identical paths (e.g. composed
        vs decomposed accents) map to the same row
      - an in-memory cache (`folder_state`) to avoid repeated queries
      - an idempotent INSERT (ON DUPLICATE KEY UPDATE + LAST_INSERT_ID)
    """
    # Canonical key: NFC-normalized Unicode, then normalized path form.
    key = os.path.normpath(unicodedata.normalize("NFC", folder_path))

    # 1) Cache hit — nothing to do.
    cached = folder_state.get(key)
    if cached is not None:
        return cached["id"]

    now = datetime.now()

    # 2) Row may already exist in the DB even if not cached yet.
    cursor.execute(
        """
        SELECT id
        FROM folders
        WHERE device_id = %s AND path = %s
        LIMIT 1
        """,
        (device_id, key),
    )
    row = cursor.fetchone()
    if row is not None:
        folder_state[key] = {"id": row[0], "deleted": 0}
        return row[0]

    # 3) Idempotent INSERT; LAST_INSERT_ID(id) makes lastrowid valid
    #    even when the row was created concurrently.
    cursor.execute(
        """
        INSERT INTO folders (path, parent_id, device_id, first_seen, last_seen)
        VALUES (%s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE
            id = LAST_INSERT_ID(id),
            last_seen = VALUES(last_seen)
        """,
        (key, parent_id, device_id, now, now),
    )
    conn.commit()
    new_id = cursor.lastrowid
    folder_state[key] = {"id": new_id, "deleted": 0}
    return new_id
# ======================================================
# LOAD LAST FILE STATE
# ======================================================
def load_last_file_state(cursor, device_id: int):
    """Load the last known file state for the device, keyed by path_md5.

    For historical reasons this picks MAX(id) per path_md5, even though
    there is a UNIQUE key on (device_id, path_md5).
    """
    cursor.execute(
        """
        SELECT f.id, f.path_md5, f.deleted, f.size, f.modified
        FROM files f
        JOIN (
            SELECT MAX(id) AS mx
            FROM files
            WHERE device_id=%s
            GROUP BY path_md5
        ) t ON f.id = t.mx
        """,
        (device_id,),
    )
    return {
        md5: {"id": fid, "deleted": int(flag), "size": size, "modified": modified}
        for fid, md5, flag, size, modified in cursor.fetchall()
    }
# ======================================================
# MAIN SCANNER WITH BATCHING
# ======================================================

# SQL shared by the batched flushes and the final flush.
_INSERT_FILES_SQL = """
    INSERT INTO files (
        name, path, path_md5, size, modified, type,
        folder_id, device_id, deleted,
        first_seen, last_seen
    )
    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
"""

_UPDATE_FILES_SQL = """
    UPDATE files
    SET size=%s, modified=%s, last_seen=%s, deleted=0
    WHERE id=%s
"""


def _flush(cursor, conn, sql, rows):
    """Run executemany + commit for `rows`, clear the list, return row count."""
    if not rows:
        return 0
    cursor.executemany(sql, rows)
    conn.commit()
    count = len(rows)
    rows.clear()
    return count


def walk_and_store_bulk():
    """Scan `target_dir` and sync its contents into `files`/`folders`.

    Inserts new/resurrected files, updates files whose size or mtime changed,
    and marks files/folders that disappeared from this subtree as deleted.
    All writes are batched (BATCH_SIZE rows per executemany).
    """
    BATCH_SIZE = 10000
    # Alternative scan roots the author toggles between:
    # target_dir = r"\\tower1\#colddata"
    # target_dir = r"z:"
    target_dir = r"\\tower\ebooks"
    # target_dir = r"\\tower\dedup"
    device_name = "TOWER"

    # Normalized root used for all subtree comparisons.
    target_dir_norm = os.path.normpath(target_dir)
    if not os.path.isdir(target_dir):
        print("Invalid directory:", target_dir)
        return

    conn, cursor = init_db()
    now = datetime.now()
    device_id = get_or_create_device(cursor, conn, device_name)
    folder_state = load_folder_state(cursor, device_id)
    file_state = load_last_file_state(cursor, device_id)

    seen_folders = set()
    seen_files = set()  # MD5 of full file path
    files_to_insert = []
    files_to_update = []
    total_files = 0
    # BUGFIX: the summary used to print len(files_to_insert)/len(files_to_update)
    # after the lists had been flushed and cleared, so the numbers were wrong.
    # Keep running totals instead.
    inserted_count = 0
    updated_count = 0

    print(f"🔍 Scanning: {target_dir} (device {device_id})")

    # -------------------------------------------------
    # WALK FILESYSTEM
    # -------------------------------------------------
    for root, dirs, files in os.walk(target_dir):
        folder_path = os.path.normpath(root)

        # 1) Determine parent_id (None for the scan root itself).
        if folder_path == target_dir_norm:
            parent_id = None
        else:
            parent_folder_path = os.path.normpath(os.path.dirname(folder_path))
            parent_id = get_or_create_folder(cursor, conn, folder_state,
                                             device_id, parent_folder_path,
                                             None)

        # 2) Insert/find the current folder with the correct parent_id.
        seen_folders.add(folder_path)
        folder_id = get_or_create_folder(cursor, conn, folder_state,
                                         device_id, folder_path,
                                         parent_id)

        # -------------------------------------------------
        # FILE LOOP
        # -------------------------------------------------
        for name in files:
            total_files += 1
            filepath = os.path.normpath(os.path.join(folder_path, name))
            md5 = md5_path(filepath)
            seen_files.add(md5)
            try:
                st = os.stat(filepath)
            except OSError:
                # File vanished between walk and stat, or is unreadable
                # (was FileNotFoundError only; PermissionError etc. crashed).
                continue
            modified = datetime.fromtimestamp(st.st_mtime).replace(microsecond=0)
            size = st.st_size
            ext = os.path.splitext(name)[1][:250]

            prev = file_state.get(md5)
            if prev is None or prev["deleted"] == 1:
                # New file, or one that re-appeared after being marked deleted.
                files_to_insert.append(
                    (name, filepath, md5, size, modified, ext,
                     folder_id, device_id, 0, now, now)
                )
            elif prev["size"] != size or prev["modified"] != modified:
                # Exists and not deleted — size or mtime changed.
                files_to_update.append((size, modified, now, prev["id"]))

            # -------------------------------------------------
            # BATCH FLUSHING
            # -------------------------------------------------
            if len(files_to_insert) >= BATCH_SIZE:
                print(f"💾 Flushing {len(files_to_insert)} inserts...")
                inserted_count += _flush(cursor, conn, _INSERT_FILES_SQL, files_to_insert)
            if len(files_to_update) >= BATCH_SIZE:
                print(f"💾 Flushing {len(files_to_update)} updates...")
                updated_count += _flush(cursor, conn, _UPDATE_FILES_SQL, files_to_update)

            # PROGRESS
            if total_files % 1000 == 0:
                print(f" ... processed {total_files} files")

    # -------------------------------------------------
    # FINAL FLUSH (REMAINING INSERTS/UPDATES)
    # -------------------------------------------------
    if files_to_insert:
        print(f"💾 Final flush: {len(files_to_insert)} inserts")
    inserted_count += _flush(cursor, conn, _INSERT_FILES_SQL, files_to_insert)
    if files_to_update:
        print(f"💾 Final flush: {len(files_to_update)} updates")
    updated_count += _flush(cursor, conn, _UPDATE_FILES_SQL, files_to_update)

    # -------------------------------------------------
    # MARK DELETED FILES — ONLY IN THIS SUBTREE
    # -------------------------------------------------
    # BUGFIX: the old `path LIKE '<prefix>%'` filter was broken for
    # Windows/UNC roots: backslash is the escape character in MySQL LIKE
    # patterns and "_" is a single-char wildcard, so r"\\tower\ebooks%"
    # did not match the stored paths literally, and a plain prefix also
    # swept up sibling roots like r"\\tower\ebooks2". Filter in Python on
    # a path-component boundary instead.
    subtree_prefix = target_dir_norm.rstrip("\\/") + os.sep
    cursor.execute("""
        SELECT id, path, path_md5
        FROM files
        WHERE device_id = %s
          AND deleted = 0
    """, (device_id,))
    ids_to_delete = [
        fid
        for fid, path, md5 in cursor.fetchall()
        if md5 not in seen_files
        and os.path.normpath(path).startswith(subtree_prefix)
    ]
    files_deleted_count = 0
    if ids_to_delete:
        print(f"💾 Marking {len(ids_to_delete)} files as deleted in subtree")
        cursor.executemany("""
            UPDATE files
            SET deleted=1, last_seen=%s
            WHERE id=%s
        """, [(now, fid) for fid in ids_to_delete])
        conn.commit()
        files_deleted_count = len(ids_to_delete)

    # -------------------------------------------------
    # MARK DELETED FOLDERS — ONLY IN THIS SUBTREE
    # -------------------------------------------------
    folders_to_mark_deleted = []
    for path, info in folder_state.items():
        norm_path = os.path.normpath(path)
        # Restrict to this subtree (root itself included). Require the
        # root path exactly, or the root plus a separator, so siblings
        # like r"...\ebooks2" are not matched by a bare startswith.
        in_subtree = (norm_path == target_dir_norm
                      or norm_path.startswith(target_dir_norm + os.sep))
        if not in_subtree:
            continue
        if info["deleted"] == 0 and norm_path not in seen_folders:
            folders_to_mark_deleted.append((now, info["id"]))
    folders_deleted_count = 0
    if folders_to_mark_deleted:
        cursor.executemany("""
            UPDATE folders
            SET deleted=1, last_seen=%s
            WHERE id=%s
        """, folders_to_mark_deleted)
        conn.commit()
        folders_deleted_count = len(folders_to_mark_deleted)

    # -------------------------------------------------
    # Update device timestamp
    # -------------------------------------------------
    cursor.execute("UPDATE devices SET scanned_at=%s WHERE id=%s", (now, device_id))
    conn.commit()
    cursor.close()
    conn.close()

    print("")
    print("✅ Scan completed.")
    print(" Total files scanned:", total_files)
    print(" Files inserted:", inserted_count)
    print(" Files updated:", updated_count)
    print(" Files deleted in subtree:", files_deleted_count)
    print(" Folders deleted in subtree:", folders_deleted_count)
# ======================================================
# MAIN ENTRY
# ======================================================
if __name__ == "__main__":
    # Run the full scan when executed as a script.
    walk_and_store_bulk()