z230
This commit is contained in:
@@ -1,106 +1,357 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import mysql.connector
|
import hashlib
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
import mysql.connector
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
load_dotenv() # Reads .env file and adds to environment
|
# Always load .env from the folder where THIS script is stored
|
||||||
|
# Resolve .env relative to this script file, not the current working
# directory, so the settings are found wherever the script is launched from.
env_path = Path(__file__).resolve().parent / ".env"
load_dotenv(env_path)
|
||||||
|
|
||||||
# Database setup with explicit UTF8MB4 collation
|
# ======================================================
|
||||||
def init_db():
|
# 🔧 Helper: MD5 of full path
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
|
def md5_path(path: str) -> str:
    """Return the hexadecimal MD5 digest of a path string.

    The digest is computed over the UTF-8 bytes of the path text itself,
    not over the contents of the file the path points to.

    Args:
        path: Path text to hash.

    Returns:
        A 32-character lowercase hex digest.
    """
    # File names that are not valid Unicode reach Python as lone surrogate
    # code points; strict UTF-8 encoding raises UnicodeEncodeError on them.
    # "surrogatepass" encodes them instead and leaves the bytes (and so the
    # digest) of every well-formed path unchanged.
    encoded = path.encode("utf-8", errors="surrogatepass")
    return hashlib.md5(encoded).hexdigest()
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# 🔧 DB CONNECTION HELPERS
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
|
def get_server_connection():
|
||||||
|
"""Connect to MySQL server WITHOUT selecting a database."""
|
||||||
conn = mysql.connector.connect(
|
conn = mysql.connector.connect(
|
||||||
host=os.getenv("DB_MYSQL_HOST"),
|
host=os.getenv("DB_MYSQL_HOST", "127.0.0.1"),
|
||||||
user=os.getenv("DB_MYSQL_ROOT"),
|
user=os.getenv("DB_MYSQL_ROOT", "root"),
|
||||||
password=os.getenv("DB_MYSQL_ROOT_PASS"),
|
password=os.getenv("DB_MYSQL_ROOT_PASS", ""),
|
||||||
database=os.getenv("walkfiles"),
|
port=int(os.getenv("DB_MYSQL_PORT", "3306")),
|
||||||
port=int(os.getenv("DB_MYSQL_PORT", 3306)),
|
auth_plugin="mysql_native_password",
|
||||||
charset="utf8mb4",
|
)
|
||||||
collation="utf8mb4_general_ci"
|
return conn
|
||||||
|
|
||||||
|
|
||||||
|
def get_db_connection():
|
||||||
|
"""Connect to the 'walkfiles' database."""
|
||||||
|
conn = mysql.connector.connect(
|
||||||
|
host=os.getenv("DB_MYSQL_HOST", "127.0.0.1"),
|
||||||
|
user=os.getenv("DB_MYSQL_ROOT", "root"),
|
||||||
|
password=os.getenv("DB_MYSQL_ROOT_PASS", ""),
|
||||||
|
port=int(os.getenv("DB_MYSQL_PORT", "3306")),
|
||||||
|
database="walkfiles",
|
||||||
|
auth_plugin="mysql_native_password",
|
||||||
)
|
)
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
cursor.execute("CREATE DATABASE IF NOT EXISTS walkfiles CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci")
|
cursor.execute("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
|
||||||
cursor.execute("USE walkfiles")
|
cursor.close()
|
||||||
|
return conn
|
||||||
|
|
||||||
cursor.execute('''CREATE TABLE IF NOT EXISTS devices (
|
# ======================================================
|
||||||
|
# 🗄 DB INITIALIZATION
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
|
def init_db():
|
||||||
|
# 1) Ensure DB exists
|
||||||
|
server_conn = get_server_connection()
|
||||||
|
cur = server_conn.cursor()
|
||||||
|
cur.execute(
|
||||||
|
"CREATE DATABASE IF NOT EXISTS walkfiles "
|
||||||
|
"DEFAULT CHARACTER SET utf8mb4 "
|
||||||
|
"COLLATE utf8mb4_general_ci"
|
||||||
|
)
|
||||||
|
server_conn.commit()
|
||||||
|
cur.close()
|
||||||
|
server_conn.close()
|
||||||
|
|
||||||
|
# 2) Connect
|
||||||
|
conn = get_db_connection()
|
||||||
|
cursor = conn.cursor()
|
||||||
|
|
||||||
|
# Devices
|
||||||
|
cursor.execute("""
|
||||||
|
CREATE TABLE IF NOT EXISTS devices (
|
||||||
id INT AUTO_INCREMENT PRIMARY KEY,
|
id INT AUTO_INCREMENT PRIMARY KEY,
|
||||||
name VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci UNIQUE,
|
name VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci UNIQUE,
|
||||||
scanned_at DATETIME
|
scanned_at DATETIME NULL
|
||||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci''')
|
) ENGINE=InnoDB
|
||||||
|
DEFAULT CHARSET=utf8mb4
|
||||||
|
""")
|
||||||
|
|
||||||
cursor.execute('''CREATE TABLE IF NOT EXISTS folders (
|
# Folders
|
||||||
|
cursor.execute("""
|
||||||
|
CREATE TABLE IF NOT EXISTS folders (
|
||||||
id INT AUTO_INCREMENT PRIMARY KEY,
|
id INT AUTO_INCREMENT PRIMARY KEY,
|
||||||
path TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci UNIQUE,
|
path VARCHAR(2048) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
|
||||||
parent_id INT,
|
parent_id INT NULL,
|
||||||
device_id INT,
|
device_id INT NOT NULL,
|
||||||
FOREIGN KEY(device_id) REFERENCES devices(id)
|
first_seen DATETIME NOT NULL,
|
||||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci''')
|
last_seen DATETIME NOT NULL,
|
||||||
|
|
||||||
cursor.execute('''CREATE TABLE IF NOT EXISTS files (
|
CONSTRAINT fk_folders_device
|
||||||
|
FOREIGN KEY (device_id) REFERENCES devices(id)
|
||||||
|
ON DELETE CASCADE,
|
||||||
|
|
||||||
|
UNIQUE KEY uniq_folders_device_path (device_id, path(255)),
|
||||||
|
INDEX idx_folders_device (device_id)
|
||||||
|
) ENGINE=InnoDB
|
||||||
|
DEFAULT CHARSET=utf8mb4
|
||||||
|
""")
|
||||||
|
|
||||||
|
# Files
|
||||||
|
cursor.execute("""
|
||||||
|
CREATE TABLE IF NOT EXISTS files (
|
||||||
id INT AUTO_INCREMENT PRIMARY KEY,
|
id INT AUTO_INCREMENT PRIMARY KEY,
|
||||||
name VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
|
|
||||||
path TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci UNIQUE,
|
name VARCHAR(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
|
||||||
size BIGINT,
|
path VARCHAR(2048) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
|
||||||
modified DATETIME,
|
path_md5 CHAR(32) NOT NULL,
|
||||||
type VARCHAR(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
|
|
||||||
folder_id INT,
|
size BIGINT NULL,
|
||||||
device_id INT,
|
modified DATETIME NULL,
|
||||||
FOREIGN KEY(folder_id) REFERENCES folders(id),
|
type VARCHAR(50) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL,
|
||||||
FOREIGN KEY(device_id) REFERENCES devices(id)
|
|
||||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci''')
|
folder_id INT NULL,
|
||||||
|
device_id INT NOT NULL,
|
||||||
|
|
||||||
|
deleted TINYINT(1) NOT NULL DEFAULT 0,
|
||||||
|
|
||||||
|
first_seen DATETIME NOT NULL,
|
||||||
|
last_seen DATETIME NOT NULL,
|
||||||
|
|
||||||
|
CONSTRAINT fk_files_folder
|
||||||
|
FOREIGN KEY (folder_id) REFERENCES folders(id)
|
||||||
|
ON DELETE SET NULL,
|
||||||
|
|
||||||
|
CONSTRAINT fk_files_device
|
||||||
|
FOREIGN KEY (device_id) REFERENCES devices(id)
|
||||||
|
ON DELETE CASCADE,
|
||||||
|
|
||||||
|
UNIQUE KEY uniq_files_device_path_md5 (device_id, path_md5),
|
||||||
|
INDEX idx_files_folder (folder_id),
|
||||||
|
INDEX idx_files_deleted (device_id, deleted)
|
||||||
|
) ENGINE=InnoDB
|
||||||
|
DEFAULT CHARSET=utf8mb4
|
||||||
|
""")
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
return conn, cursor
|
return conn, cursor
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# 👤 DEVICE + FOLDERS HELPERS
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
def insert_bulk_files(cursor, conn, files_data):
|
def get_or_create_device(cursor, conn, device_name: str) -> int:
|
||||||
if not files_data:
|
now = datetime.now()
|
||||||
return
|
cursor.execute(
|
||||||
query = '''INSERT IGNORE INTO files (name, path, size, modified, type, folder_id, device_id)
|
"INSERT IGNORE INTO devices (name, scanned_at) VALUES (%s, %s)",
|
||||||
VALUES (%s,%s,%s,%s,%s,%s,%s)'''
|
(device_name, now)
|
||||||
cursor.executemany(query, files_data)
|
)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
|
cursor.execute("SELECT id FROM devices WHERE name=%s", (device_name,))
|
||||||
|
return cursor.fetchone()[0]
|
||||||
|
|
||||||
|
|
||||||
|
def load_folder_cache(cursor, device_id: int):
    """Load the known folders of one device into a lookup dict.

    Args:
        cursor: DB-API cursor on the database holding the ``folders`` table.
        device_id: Value matched against ``folders.device_id``.

    Returns:
        dict mapping each folder ``path`` to its ``id``.
    """
    cursor.execute(
        "SELECT id, path FROM folders WHERE device_id=%s",
        (device_id,),
    )
    return {path: folder_id for folder_id, path in cursor.fetchall()}
|
||||||
|
|
||||||
|
|
||||||
|
def get_or_create_folder(cursor, conn, folder_cache, device_id, folder_path, parent_path, now):
    """Return the id of ``folder_path``, inserting a row when it is not cached.

    Args:
        cursor: DB-API cursor used for the UPDATE / INSERT.
        conn: Accepted for signature compatibility; not used here, so nothing
            is committed by this function.
        folder_cache: dict ``{path: folder_id}``; updated in place when a new
            folder row is inserted.
        device_id: Owning device id stored on a new row.
        folder_path: Path of the folder to look up or create.
        parent_path: Path of the parent folder, or ``None``; resolved to
            ``parent_id`` through ``folder_cache`` (``None`` when not cached).
        now: Timestamp written to ``last_seen`` (and ``first_seen`` on insert).

    Returns:
        The folder id.
    """
    if folder_path in folder_cache:
        # Known folder: only refresh its last_seen stamp.
        folder_id = folder_cache[folder_path]
        cursor.execute("UPDATE folders SET last_seen=%s WHERE id=%s", (now, folder_id))
        return folder_id

    parent_id = folder_cache.get(parent_path)

    cursor.execute("""
        INSERT INTO folders (path, parent_id, device_id, first_seen, last_seen)
        VALUES (%s, %s, %s, %s, %s)
    """, (folder_path, parent_id, device_id, now, now))

    folder_id = cursor.lastrowid
    folder_cache[folder_path] = folder_id
    return folder_id
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# 📂 FILES – LOAD LAST STATE
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
|
def load_last_file_state(cursor, device_id: int):
    """Load the most recent ``files`` row per path hash for one device.

    For each ``path_md5`` of the device the row with the highest ``id`` is
    selected.

    Args:
        cursor: DB-API cursor on the database holding the ``files`` table.
        device_id: Device whose rows are loaded.

    Returns:
        dict ``{path_md5: {"id", "deleted", "size", "modified"}}`` where
        ``deleted`` is normalised to ``int``.
    """
    cursor.execute("""
        SELECT f.id, f.path_md5, f.deleted, f.size, f.modified
        FROM files f
        JOIN (
            SELECT MAX(id) AS max_id
            FROM files
            WHERE device_id = %s
            GROUP BY path_md5
        ) latest ON f.id = latest.max_id
        WHERE f.device_id = %s
    """, (device_id, device_id))

    return {
        path_md5: {
            "id": file_id,
            "deleted": int(deleted),
            "size": size,
            "modified": modified,
        }
        for file_id, path_md5, deleted, size, modified in cursor.fetchall()
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# 🚶 MAIN WALK LOGIC
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
def walk_and_store_bulk():
|
def walk_and_store_bulk():
|
||||||
target_dir = r"u:\\Dropbox\\Ordinace\\Dokumentace_ke_zpracování"
|
updated_debug = []
|
||||||
device_name = "NTB"
|
target_dir = r"u:\Dropbox\Ordinace\Dokumentace_ke_zpracování"
|
||||||
|
device_name = "Z230"
|
||||||
|
|
||||||
|
if not os.path.isdir(target_dir):
|
||||||
|
print("Invalid directory:", target_dir)
|
||||||
|
return
|
||||||
|
|
||||||
conn, cursor = init_db()
|
conn, cursor = init_db()
|
||||||
now = datetime.now()
|
now = datetime.now()
|
||||||
|
|
||||||
cursor.execute("INSERT IGNORE INTO devices (name, scanned_at) VALUES (%s, %s)", (device_name, now))
|
device_id = get_or_create_device(cursor, conn, device_name)
|
||||||
conn.commit()
|
folder_cache = load_folder_cache(cursor, device_id)
|
||||||
cursor.execute("SELECT id FROM devices WHERE name=%s", (device_name,))
|
last_state = load_last_file_state(cursor, device_id)
|
||||||
device_id = cursor.fetchone()[0]
|
|
||||||
|
seen_md5 = set()
|
||||||
|
|
||||||
folder_cache = {}
|
|
||||||
files_to_insert = []
|
files_to_insert = []
|
||||||
|
files_to_update_existing = []
|
||||||
|
files_to_mark_deleted = []
|
||||||
|
|
||||||
|
total_files = 0
|
||||||
|
print(f"🔍 Scanning: {target_dir} (device {device_id})")
|
||||||
|
|
||||||
for root, dirs, files in os.walk(target_dir):
|
for root, dirs, files in os.walk(target_dir):
|
||||||
parent_path = os.path.dirname(root)
|
|
||||||
parent_id = folder_cache.get(parent_path)
|
|
||||||
|
|
||||||
cursor.execute("INSERT IGNORE INTO folders (path, parent_id, device_id) VALUES (%s, %s, %s)", (root, parent_id, device_id))
|
folder_path = os.path.normpath(root)
|
||||||
conn.commit()
|
parent_path = os.path.normpath(os.path.dirname(root)) if root != target_dir else None
|
||||||
cursor.execute("SELECT id FROM folders WHERE path=%s", (root,))
|
folder_id = get_or_create_folder(cursor, conn, folder_cache, device_id, folder_path, parent_path, now)
|
||||||
folder_id = cursor.fetchone()[0]
|
|
||||||
folder_cache[root] = folder_id
|
for name in files:
|
||||||
|
total_files += 1
|
||||||
|
|
||||||
|
file_path = os.path.normpath(os.path.join(root, name))
|
||||||
|
file_md5 = md5_path(file_path)
|
||||||
|
seen_md5.add(file_md5)
|
||||||
|
|
||||||
for file in files:
|
|
||||||
file_path = os.path.join(root, file)
|
|
||||||
try:
|
try:
|
||||||
stats = os.stat(file_path)
|
stats = os.stat(file_path)
|
||||||
modified = datetime.fromtimestamp(stats.st_mtime)
|
|
||||||
ftype = os.path.splitext(file)[1]
|
|
||||||
files_to_insert.append((file, file_path, stats.st_size, modified, ftype, folder_id, device_id))
|
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
insert_bulk_files(cursor, conn, files_to_insert)
|
modified = datetime.fromtimestamp(stats.st_mtime).replace(microsecond=0)
|
||||||
|
size = stats.st_size
|
||||||
|
ext = os.path.splitext(name)[1]
|
||||||
|
|
||||||
|
prev = last_state.get(file_md5)
|
||||||
|
|
||||||
|
if prev is None:
|
||||||
|
# New file
|
||||||
|
files_to_insert.append(
|
||||||
|
(name, file_path, file_md5, size, modified, ext,
|
||||||
|
folder_id, device_id, 0, now, now)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
if prev["deleted"] == 1:
|
||||||
|
# Reappeared file → new row
|
||||||
|
files_to_insert.append(
|
||||||
|
(name, file_path, file_md5, size, modified, ext,
|
||||||
|
folder_id, device_id, 0, now, now)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Existing & not deleted
|
||||||
|
# Only update if size or modified timestamp CHANGED
|
||||||
|
if prev["size"] != size or prev["modified"] != modified:
|
||||||
|
files_to_update_existing.append(
|
||||||
|
(size, modified, now, prev["id"])
|
||||||
|
)
|
||||||
|
|
||||||
|
updated_debug.append({
|
||||||
|
"path": file_path,
|
||||||
|
"old_size": prev["size"],
|
||||||
|
"new_size": size,
|
||||||
|
"old_modified": prev["modified"],
|
||||||
|
"new_modified": modified
|
||||||
|
})
|
||||||
|
|
||||||
|
if total_files % 1000 == 0:
|
||||||
|
print(f" ... processed {total_files} files")
|
||||||
|
|
||||||
|
# Mark missing files as deleted
|
||||||
|
for md5_hash, info in last_state.items():
|
||||||
|
if info["deleted"] == 0 and md5_hash not in seen_md5:
|
||||||
|
files_to_mark_deleted.append((now, info["id"]))
|
||||||
|
|
||||||
|
# ==================================================
|
||||||
|
# 💾 APPLY CHANGES
|
||||||
|
# ==================================================
|
||||||
|
|
||||||
|
if files_to_insert:
|
||||||
|
cursor.executemany("""
|
||||||
|
INSERT INTO files (
|
||||||
|
name, path, path_md5, size, modified, type,
|
||||||
|
folder_id, device_id, deleted,
|
||||||
|
first_seen, last_seen
|
||||||
|
)
|
||||||
|
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
|
||||||
|
""", files_to_insert)
|
||||||
|
|
||||||
|
if files_to_update_existing:
|
||||||
|
cursor.executemany("""
|
||||||
|
UPDATE files
|
||||||
|
SET size=%s,
|
||||||
|
modified=%s,
|
||||||
|
last_seen=%s,
|
||||||
|
deleted=0
|
||||||
|
WHERE id=%s
|
||||||
|
""", files_to_update_existing)
|
||||||
|
|
||||||
|
if files_to_mark_deleted:
|
||||||
|
cursor.executemany("""
|
||||||
|
UPDATE files
|
||||||
|
SET deleted=1,
|
||||||
|
last_seen=%s
|
||||||
|
WHERE id=%s
|
||||||
|
""", files_to_mark_deleted)
|
||||||
|
|
||||||
|
cursor.execute("UPDATE devices SET scanned_at=%s WHERE id=%s", (now, device_id))
|
||||||
|
conn.commit()
|
||||||
|
cursor.close()
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
|
if updated_debug:
|
||||||
|
print("\n📌 Updated files:")
|
||||||
|
for info in updated_debug:
|
||||||
|
print(f"- {info['path']}")
|
||||||
|
print(f" size: {info['old_size']} → {info['new_size']}")
|
||||||
|
print(f" modified: {info['old_modified']} → {info['new_modified']}")
|
||||||
|
print("✅ Scan completed.")
|
||||||
|
print(" Total files:", total_files)
|
||||||
|
print(" Inserted:", len(files_to_insert))
|
||||||
|
print(" Updated:", len(files_to_update_existing))
|
||||||
|
print(" Marked deleted:", len(files_to_mark_deleted))
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# 🔚 MAIN
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
if not os.path.isdir(r"u:\\Dropbox\\Ordinace\\Dokumentace_ke_zpracování"):
|
|
||||||
print("Invalid directory path.")
|
|
||||||
else:
|
|
||||||
walk_and_store_bulk()
|
walk_and_store_bulk()
|
||||||
print("Scan completed for directory 'u:\\Dropbox\\Ordinace\\Dokumentace_ke_zpracování' on device 'NTB'. Bulk data stored efficiently in MySQL database 'walkfiles'.")
|
|
||||||
391
21 WalkandSave.py
Normal file
391
21 WalkandSave.py
Normal file
@@ -0,0 +1,391 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import os
|
||||||
|
import hashlib
|
||||||
|
from datetime import datetime
|
||||||
|
import mysql.connector
|
||||||
|
from mysql.connector import Error
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# Load .env from the script directory
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
|
# Resolve .env relative to this script file, not the current working
# directory, so the settings are found wherever the script is launched from.
env_path = Path(__file__).resolve().parent / ".env"
load_dotenv(env_path)
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# Helper: MD5 of full file path string
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
|
def md5_path(path: str) -> str:
    """Return the hexadecimal MD5 digest of a path string.

    The digest is computed over the UTF-8 bytes of the path text itself,
    not over the contents of the file the path points to.

    Args:
        path: Path text to hash.

    Returns:
        A 32-character lowercase hex digest.
    """
    # File names that are not valid Unicode reach Python as lone surrogate
    # code points; strict UTF-8 encoding raises UnicodeEncodeError on them.
    # "surrogatepass" encodes them instead and leaves the bytes (and so the
    # digest) of every well-formed path unchanged.
    encoded = path.encode("utf-8", errors="surrogatepass")
    return hashlib.md5(encoded).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# MySQL CONNECTIONS
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
|
def get_server_connection():
    """Open a MySQL connection without selecting a default database.

    Connection settings are read from the environment variables
    ``DB_MYSQL_HOST``, ``DB_MYSQL_ROOT``, ``DB_MYSQL_ROOT_PASS`` and
    ``DB_MYSQL_PORT``.

    Returns:
        The connection object returned by ``mysql.connector.connect``.
    """
    # int(None) raises TypeError when DB_MYSQL_PORT is missing (and int("")
    # raises ValueError when it is empty); fall back to MySQL's standard port.
    port = int(os.getenv("DB_MYSQL_PORT") or "3306")
    return mysql.connector.connect(
        host=os.getenv("DB_MYSQL_HOST"),
        user=os.getenv("DB_MYSQL_ROOT"),
        password=os.getenv("DB_MYSQL_ROOT_PASS"),
        port=port,
        auth_plugin="mysql_native_password",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def get_db_connection():
    """Open a MySQL connection to the ``walkfiles`` database.

    Connection settings are read from the environment variables
    ``DB_MYSQL_HOST``, ``DB_MYSQL_ROOT``, ``DB_MYSQL_ROOT_PASS`` and
    ``DB_MYSQL_PORT``. After connecting, the session character set is
    switched to utf8mb4 with the utf8mb4_general_ci collation.

    Returns:
        The open connection; the caller is responsible for closing it.
    """
    # int(None) raises TypeError when DB_MYSQL_PORT is missing (and int("")
    # raises ValueError when it is empty); fall back to MySQL's standard port.
    port = int(os.getenv("DB_MYSQL_PORT") or "3306")
    conn = mysql.connector.connect(
        host=os.getenv("DB_MYSQL_HOST"),
        user=os.getenv("DB_MYSQL_ROOT"),
        password=os.getenv("DB_MYSQL_ROOT_PASS"),
        port=port,
        database="walkfiles",
        auth_plugin="mysql_native_password",
    )

    try:
        c = conn.cursor()
        try:
            c.execute("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
        finally:
            c.close()
    except Exception:
        # Do not hand back (or leak) a half-configured connection.
        conn.close()
        raise
    return conn
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# DATABASE INITIALIZATION
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
|
def init_db():
    """Create the ``walkfiles`` database and its tables when missing.

    First connects without a default database to issue
    ``CREATE DATABASE IF NOT EXISTS``, then connects to ``walkfiles`` and
    creates the ``devices``, ``folders`` and ``files`` tables with
    ``CREATE TABLE IF NOT EXISTS``.

    Returns:
        tuple: ``(conn, cursor)`` - the open ``walkfiles`` connection and a
        cursor on it. The caller closes both.
    """
    # -- Ensure DB exists --
    server = get_server_connection()
    try:
        cur = server.cursor()
        try:
            cur.execute("""
                CREATE DATABASE IF NOT EXISTS walkfiles
                DEFAULT CHARACTER SET utf8mb4
                COLLATE utf8mb4_general_ci
            """)
            server.commit()
        finally:
            cur.close()
    finally:
        # Release the bootstrap connection even when the statement fails.
        server.close()

    # -- Connect to DB --
    conn = get_db_connection()
    try:
        cursor = conn.cursor()

        # ============================
        # DEVICES
        # ============================
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS devices (
                id INT AUTO_INCREMENT PRIMARY KEY,
                name VARCHAR(255) UNIQUE,
                scanned_at DATETIME NULL
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """)

        # ============================
        # FOLDERS
        # ============================
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS folders (
                id INT AUTO_INCREMENT PRIMARY KEY,
                path VARCHAR(2048) NOT NULL,
                parent_id INT NULL,
                device_id INT NOT NULL,
                first_seen DATETIME NOT NULL,
                last_seen DATETIME NOT NULL,
                deleted TINYINT(1) NOT NULL DEFAULT 0,

                CONSTRAINT fk_folder_device
                    FOREIGN KEY (device_id) REFERENCES devices(id)
                    ON DELETE CASCADE,

                UNIQUE KEY uniq_folder_path (device_id, path(255)),
                INDEX idx_folder_dev (device_id)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """)

        # ============================
        # FILES
        # ============================
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS files (
                id INT AUTO_INCREMENT PRIMARY KEY,

                name VARCHAR(255) NOT NULL,
                path VARCHAR(2048) NOT NULL,
                path_md5 CHAR(32) NOT NULL,

                size BIGINT NULL,
                modified DATETIME NULL,
                type VARCHAR(50) NULL,

                folder_id INT NULL,
                device_id INT NOT NULL,
                deleted TINYINT(1) NOT NULL DEFAULT 0,

                first_seen DATETIME NOT NULL,
                last_seen DATETIME NOT NULL,

                CONSTRAINT fk_file_folder
                    FOREIGN KEY (folder_id) REFERENCES folders(id)
                    ON DELETE SET NULL,

                CONSTRAINT fk_file_device
                    FOREIGN KEY (device_id) REFERENCES devices(id)
                    ON DELETE CASCADE,

                UNIQUE KEY uniq_file_path_md5 (device_id, path_md5),
                INDEX idx_file_folder (folder_id),
                INDEX idx_file_deleted (device_id, deleted)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """)

        conn.commit()
    except Exception:
        # The caller never receives conn on failure, so close it here.
        conn.close()
        raise
    return conn, cursor
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# HELPERS FOR DEVICES AND FOLDERS
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
|
def get_or_create_device(cursor, conn, name: str) -> int:
    """Return the id of the device called ``name``, creating it if needed.

    Issues ``INSERT IGNORE`` (so an existing row is left untouched), commits,
    then reads the id back by name.

    Args:
        cursor: DB-API cursor on the database holding the ``devices`` table.
        conn: Connection used to commit the insert.
        name: Device name.

    Returns:
        The device id.

    Raises:
        LookupError: If no row with that name can be read back after the
            insert.
    """
    now = datetime.now()
    cursor.execute("INSERT IGNORE INTO devices (name, scanned_at) VALUES (%s,%s)", (name, now))
    conn.commit()

    cursor.execute("SELECT id FROM devices WHERE name=%s", (name,))
    row = cursor.fetchone()
    if row is None:
        # INSERT IGNORE turns insert errors into warnings, so the row may be
        # absent; fail with a clear message instead of "NoneType is not
        # subscriptable".
        raise LookupError(f"device {name!r} could not be created or found")
    return row[0]
|
||||||
|
|
||||||
|
|
||||||
|
def load_folder_state(cursor, device_id: int):
    """Load the known folders of one device.

    Args:
        cursor: DB-API cursor on the database holding the ``folders`` table.
        device_id: Value matched against ``folders.device_id``.

    Returns:
        dict ``{path: {"id": folder_id, "deleted": 0 or 1}}`` with
        ``deleted`` normalised to ``int``.
    """
    cursor.execute("""
        SELECT id, path, deleted
        FROM folders
        WHERE device_id=%s
    """, (device_id,))

    return {
        path: {"id": folder_id, "deleted": int(deleted)}
        for folder_id, path, deleted in cursor.fetchall()
    }
|
||||||
|
|
||||||
|
|
||||||
|
def get_or_create_folder(cursor, conn, folder_state, device_id, folder_path, parent_path, now):
    """Return the id of ``folder_path``, inserting a row when it is unknown.

    A folder already present in ``folder_state`` gets its ``last_seen``
    refreshed and its ``deleted`` flag cleared; an unknown folder is inserted
    (empty folders included).

    Args:
        cursor: DB-API cursor used for the UPDATE / INSERT.
        conn: Accepted for signature compatibility; not used here, so nothing
            is committed by this function.
        folder_state: dict ``{path: {"id", "deleted"}}``; updated in place.
        device_id: Owning device id stored on a new row.
        folder_path: Path of the folder to look up or create.
        parent_path: Path of the parent folder, or ``None``/empty for no
            parent; resolved to ``parent_id`` through ``folder_state``
            (``None`` when the parent is not known).
        now: Timestamp written to ``last_seen`` (and ``first_seen`` on insert).

    Returns:
        The folder id.
    """
    known = folder_state.get(folder_path)
    if known is not None:
        folder_id = known["id"]
        cursor.execute(
            "UPDATE folders SET last_seen=%s, deleted=0 WHERE id=%s",
            (now, folder_id),
        )
        # Keep the in-memory state in step with the row just updated.
        known["deleted"] = 0
        return folder_id

    parent_id = folder_state.get(parent_path, {}).get("id") if parent_path else None

    cursor.execute("""
        INSERT INTO folders (path, parent_id, device_id, first_seen, last_seen, deleted)
        VALUES (%s, %s, %s, %s, %s, 0)
    """, (folder_path, parent_id, device_id, now, now))

    folder_id = cursor.lastrowid
    folder_state[folder_path] = {"id": folder_id, "deleted": 0}
    return folder_id
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# LOAD LAST FILE STATE
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
|
def load_last_file_state(cursor, device_id: int):
    """Load the most recent ``files`` row per path hash for one device.

    For each ``path_md5`` of the device the row with the highest ``id`` is
    selected.

    Args:
        cursor: DB-API cursor on the database holding the ``files`` table.
        device_id: Device whose rows are loaded.

    Returns:
        dict ``{path_md5: {"id", "deleted", "size", "modified"}}`` where
        ``deleted`` is normalised to ``int``.
    """
    cursor.execute("""
        SELECT f.id, f.path_md5, f.deleted, f.size, f.modified
        FROM files f
        JOIN (
            SELECT MAX(id) AS mx
            FROM files
            WHERE device_id=%s
            GROUP BY path_md5
        ) t ON f.id = t.mx
    """, (device_id,))

    return {
        md5: {
            "id": fid,
            "deleted": int(deleted),
            "size": size,
            "modified": modified,
        }
        for fid, md5, deleted, size, modified in cursor.fetchall()
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# MAIN SCANNER
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
|
def walk_and_store_bulk(target_dir=r"u:\Dropbox", device_name="Z230"):
    """Scan a directory tree and synchronise it into the ``walkfiles`` DB.

    Every folder found is inserted or refreshed. Every file is compared with
    its last stored state (keyed by the MD5 of its normalised path): unknown
    files are inserted, changed or reappeared files are updated, and stored
    files/folders that were not seen are flagged ``deleted=1``. All changes
    are committed once at the end, then a summary is printed.

    Args:
        target_dir: Root directory to scan. Defaults to the previously
            hard-coded root.
        device_name: Name of the device row the scan is recorded under.
            Defaults to the previously hard-coded name.

    Returns:
        None. Prints a message and returns early when ``target_dir`` is not
        a directory.
    """
    if not os.path.isdir(target_dir):
        print("Invalid directory:", target_dir)
        return

    conn, cursor = init_db()
    try:
        now = datetime.now()

        device_id = get_or_create_device(cursor, conn, device_name)
        folder_state = load_folder_state(cursor, device_id)
        file_state = load_last_file_state(cursor, device_id)

        seen_folders = set()
        seen_files = set()

        files_to_insert = []
        files_to_update = []

        total_files = 0

        print(f"🔍 Scanning: {target_dir} (device {device_id})")

        # -------------------------------------------------
        # WALK THROUGH FILESYSTEM
        # -------------------------------------------------
        for root, _dirs, files in os.walk(target_dir):
            folder_path = os.path.normpath(root)
            parent_path = os.path.normpath(os.path.dirname(root)) if root != target_dir else None

            # ALWAYS insert/update folder (empty folders are recorded too)
            seen_folders.add(folder_path)
            folder_id = get_or_create_folder(cursor, conn, folder_state,
                                             device_id, folder_path,
                                             parent_path, now)

            # Process files inside this folder
            for name in files:
                total_files += 1

                filepath = os.path.normpath(os.path.join(root, name))

                try:
                    st = os.stat(filepath)
                except FileNotFoundError:
                    # Gone between listing and stat: do not count it as seen,
                    # so a stored row for it is flagged deleted below.
                    continue

                md5 = md5_path(filepath)
                seen_files.add(md5)

                # Microseconds are dropped before comparing with the stored
                # DATETIME value.
                modified = datetime.fromtimestamp(st.st_mtime).replace(microsecond=0)
                size = st.st_size
                ext = os.path.splitext(name)[1]

                prev = file_state.get(md5)

                if prev is None:
                    # New file
                    files_to_insert.append(
                        (name, filepath, md5, size, modified, ext,
                         folder_id, device_id, 0, now, now)
                    )
                elif prev["deleted"] == 1:
                    # Reappeared: files has UNIQUE (device_id, path_md5)
                    # (see init_db), so a second INSERT for the same path
                    # would fail with a duplicate-key error. Revive the
                    # existing row instead; the UPDATE below resets deleted=0.
                    files_to_update.append((size, modified, now, prev["id"]))
                elif prev["size"] != size or prev["modified"] != modified:
                    # Existing file → update only if changed
                    files_to_update.append((size, modified, now, prev["id"]))

                if total_files % 1000 == 0:
                    print(f" ... processed {total_files} files")

        # -------------------------------------------------
        # MARK DELETED FILES
        # -------------------------------------------------
        files_to_mark_deleted = [
            (now, info["id"])
            for md5, info in file_state.items()
            if info["deleted"] == 0 and md5 not in seen_files
        ]

        # -------------------------------------------------
        # MARK DELETED FOLDERS
        # -------------------------------------------------
        folders_to_mark_deleted = [
            (now, info["id"])
            for path, info in folder_state.items()
            if info["deleted"] == 0 and path not in seen_folders
        ]

        # -------------------------------------------------
        # APPLY CHANGES
        # -------------------------------------------------
        if files_to_insert:
            cursor.executemany("""
                INSERT INTO files (
                    name, path, path_md5, size, modified, type,
                    folder_id, device_id, deleted,
                    first_seen, last_seen
                )
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            """, files_to_insert)

        if files_to_update:
            cursor.executemany("""
                UPDATE files
                SET size=%s, modified=%s, last_seen=%s, deleted=0
                WHERE id=%s
            """, files_to_update)

        if files_to_mark_deleted:
            cursor.executemany("""
                UPDATE files
                SET deleted=1, last_seen=%s
                WHERE id=%s
            """, files_to_mark_deleted)

        if folders_to_mark_deleted:
            cursor.executemany("""
                UPDATE folders
                SET deleted=1, last_seen=%s
                WHERE id=%s
            """, folders_to_mark_deleted)

        # Update device timestamp
        cursor.execute("UPDATE devices SET scanned_at=%s WHERE id=%s", (now, device_id))

        conn.commit()
    finally:
        # Release the connection even when the scan or a statement fails.
        cursor.close()
        conn.close()

    print("")
    print("✅ Scan completed.")
    print(" Total files:", total_files)
    print(" Inserted:", len(files_to_insert))
    print(" Updated:", len(files_to_update))
    print(" Files deleted:", len(files_to_mark_deleted))
    print(" Folders deleted:", len(folders_to_mark_deleted))
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# MAIN ENTRY
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
|
# Run the scan only when executed as a script, not when imported.
if __name__ == '__main__':
    walk_and_store_bulk()
|
||||||
413
22 WalkandSave.py
Normal file
413
22 WalkandSave.py
Normal file
@@ -0,0 +1,413 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import os
|
||||||
|
import hashlib
|
||||||
|
from datetime import datetime
|
||||||
|
import mysql.connector
|
||||||
|
from mysql.connector import Error
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# Load .env from the script directory
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
|
# Resolve .env relative to this script file, not the current working
# directory, so the settings are found wherever the script is launched from.
env_path = Path(__file__).resolve().parent / ".env"
load_dotenv(env_path)
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# Helper: MD5 of full file path string
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
|
def md5_path(path: str) -> str:
    """Return the hexadecimal MD5 digest of a path string.

    The digest is computed over the UTF-8 bytes of the path text itself,
    not over the contents of the file the path points to.

    Args:
        path: Path text to hash.

    Returns:
        A 32-character lowercase hex digest.
    """
    # File names that are not valid Unicode reach Python as lone surrogate
    # code points; strict UTF-8 encoding raises UnicodeEncodeError on them.
    # "surrogatepass" encodes them instead and leaves the bytes (and so the
    # digest) of every well-formed path unchanged.
    encoded = path.encode("utf-8", errors="surrogatepass")
    return hashlib.md5(encoded).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# MySQL CONNECTIONS
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
|
def get_server_connection():
    """Open a MySQL connection without selecting a database.

    Connection settings are read from the environment variables
    DB_MYSQL_HOST, DB_MYSQL_ROOT, DB_MYSQL_ROOT_PASS and DB_MYSQL_PORT.

    Returns:
        The connection object returned by ``mysql.connector.connect``.
    """
    # An unset or empty DB_MYSQL_PORT used to crash in int(); fall back to
    # the standard MySQL port instead.
    port = int(os.getenv("DB_MYSQL_PORT") or "3306")
    return mysql.connector.connect(
        host=os.getenv("DB_MYSQL_HOST"),
        user=os.getenv("DB_MYSQL_ROOT"),
        password=os.getenv("DB_MYSQL_ROOT_PASS"),
        port=port,
        auth_plugin="mysql_native_password",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def get_db_connection():
    """Open a MySQL connection to the ``walkfiles`` database.

    Connection settings are read from the environment variables
    DB_MYSQL_HOST, DB_MYSQL_ROOT, DB_MYSQL_ROOT_PASS and DB_MYSQL_PORT.
    After connecting, the session character set is switched to utf8mb4
    with the utf8mb4_general_ci collation.

    Returns:
        The open connection; the caller is responsible for closing it.
    """
    # An unset or empty DB_MYSQL_PORT used to crash in int(); fall back to
    # the standard MySQL port instead.
    port = int(os.getenv("DB_MYSQL_PORT") or "3306")
    conn = mysql.connector.connect(
        host=os.getenv("DB_MYSQL_HOST"),
        user=os.getenv("DB_MYSQL_ROOT"),
        password=os.getenv("DB_MYSQL_ROOT_PASS"),
        port=port,
        database="walkfiles",
        auth_plugin="mysql_native_password",
    )

    try:
        c = conn.cursor()
        try:
            c.execute("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
        finally:
            c.close()
    except Exception:
        # Do not leak the connection when session setup fails.
        conn.close()
        raise
    return conn
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# DATABASE INITIALIZATION
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
|
def init_db():
    """Create the ``walkfiles`` database and its tables if they are missing.

    First connects without a database to issue CREATE DATABASE IF NOT EXISTS,
    then connects to ``walkfiles`` and creates the ``devices``, ``folders``
    and ``files`` tables (each with IF NOT EXISTS, so existing tables are
    left untouched).

    Returns:
        A ``(conn, cursor)`` tuple for the ``walkfiles`` database. The caller
        owns both objects and must close them.
    """
    # Ensure DB exists; always release the server-level connection.
    server = get_server_connection()
    try:
        cur = server.cursor()
        try:
            cur.execute("""
                CREATE DATABASE IF NOT EXISTS walkfiles
                DEFAULT CHARACTER SET utf8mb4
                COLLATE utf8mb4_general_ci
            """)
            server.commit()
        finally:
            cur.close()
    finally:
        server.close()

    # Connect
    conn = get_db_connection()
    try:
        cursor = conn.cursor()

        # DEVICES
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS devices (
                id INT AUTO_INCREMENT PRIMARY KEY,
                name VARCHAR(255) UNIQUE,
                scanned_at DATETIME NULL
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """)

        # FOLDERS
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS folders (
                id INT AUTO_INCREMENT PRIMARY KEY,
                path VARCHAR(2048) NOT NULL,
                parent_id INT NULL,
                device_id INT NOT NULL,
                first_seen DATETIME NOT NULL,
                last_seen DATETIME NOT NULL,
                deleted TINYINT(1) NOT NULL DEFAULT 0,

                CONSTRAINT fk_folder_device
                    FOREIGN KEY (device_id) REFERENCES devices(id)
                    ON DELETE CASCADE,

                UNIQUE KEY uniq_folder_path (device_id, path(255)),
                INDEX idx_folder_dev (device_id)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """)

        # FILES
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS files (
                id INT AUTO_INCREMENT PRIMARY KEY,

                name VARCHAR(255) NOT NULL,
                path VARCHAR(2048) NOT NULL,
                path_md5 CHAR(32) NOT NULL,

                size BIGINT NULL,
                modified DATETIME NULL,
                type VARCHAR(255) NULL,

                folder_id INT NULL,
                device_id INT NOT NULL,
                deleted TINYINT(1) NOT NULL DEFAULT 0,

                first_seen DATETIME NOT NULL,
                last_seen DATETIME NOT NULL,

                CONSTRAINT fk_file_folder
                    FOREIGN KEY (folder_id) REFERENCES folders(id)
                    ON DELETE SET NULL,

                CONSTRAINT fk_file_device
                    FOREIGN KEY (device_id) REFERENCES devices(id)
                    ON DELETE CASCADE,

                UNIQUE KEY uniq_file_path_md5 (device_id, path_md5),
                INDEX idx_file_folder (folder_id),
                INDEX idx_file_deleted (device_id, deleted)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
        """)

        conn.commit()
    except Exception:
        # A failed DDL statement must not leave the connection open.
        conn.close()
        raise
    return conn, cursor
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# HELPERS — DEVICES & FOLDERS
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
|
def get_or_create_device(cursor, conn, name: str) -> int:
    """Return the id of the device called *name*, inserting it if needed.

    An INSERT IGNORE is issued and committed first, then the id is read back
    with a SELECT on the device name.
    """
    insert_sql = "INSERT IGNORE INTO devices (name, scanned_at) VALUES (%s,%s)"
    cursor.execute(insert_sql, (name, datetime.now()))
    conn.commit()

    cursor.execute("SELECT id FROM devices WHERE name=%s", (name,))
    row = cursor.fetchone()
    return row[0]
|
||||||
|
|
||||||
|
|
||||||
|
def load_folder_state(cursor, device_id: int):
    """Load all folder rows of one device, keyed by folder path.

    Returns:
        A dict mapping each folder path to ``{"id": ..., "deleted": ...}``,
        where ``deleted`` is converted to int.
    """
    query = """
        SELECT id, path, deleted
        FROM folders
        WHERE device_id=%s
    """
    cursor.execute(query, (device_id,))

    return {
        row_path: {"id": row_id, "deleted": int(flag)}
        for row_id, row_path, flag in cursor.fetchall()
    }
|
||||||
|
|
||||||
|
|
||||||
|
def get_or_create_folder(cursor, conn, folder_state, device_id, folder_path, parent_path, now):
    """Return the id of *folder_path*, refreshing or inserting its row.

    A folder already present in *folder_state* gets its ``last_seen`` set to
    *now* and its ``deleted`` flag cleared. An unknown folder is inserted,
    with ``parent_id`` taken from *folder_state* when *parent_path* is known,
    and recorded in *folder_state*. Nothing is committed here and *conn* is
    not used.
    """
    known = folder_state.get(folder_path)
    if known is not None:
        existing_id = known["id"]
        cursor.execute(
            "UPDATE folders SET last_seen=%s, deleted=0 WHERE id=%s",
            (now, existing_id)
        )
        known["deleted"] = 0
        return existing_id

    # Resolve the parent only when a parent path was supplied at all.
    parent_id = None
    if parent_path:
        parent_id = folder_state.get(parent_path, {}).get("id")

    row = (folder_path, parent_id, device_id, now, now)
    cursor.execute("""
        INSERT INTO folders (path, parent_id, device_id, first_seen, last_seen, deleted)
        VALUES (%s, %s, %s, %s, %s, 0)
    """, row)

    new_id = cursor.lastrowid
    folder_state[folder_path] = {"id": new_id, "deleted": 0}
    return new_id
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# LOAD LAST FILE STATE
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
|
def load_last_file_state(cursor, device_id: int):
    """Load the newest file row per path hash for one device.

    For every ``path_md5`` of the device, the row with the highest id is
    selected.

    Returns:
        A dict mapping ``path_md5`` to a dict with the keys ``id``,
        ``deleted`` (as int), ``size`` and ``modified``.
    """
    query = """
        SELECT f.id, f.path_md5, f.deleted, f.size, f.modified
        FROM files f
        JOIN (
            SELECT MAX(id) AS mx
            FROM files
            WHERE device_id=%s
            GROUP BY path_md5
        ) t ON f.id = t.mx
    """
    cursor.execute(query, (device_id,))

    state = {}
    for row in cursor.fetchall():
        file_id, path_hash, flag, byte_size, mtime = row
        state[path_hash] = dict(
            id=file_id,
            deleted=int(flag),
            size=byte_size,
            modified=mtime,
        )
    return state
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# MAIN SCANNER WITH BATCHING
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
|
def _flush_file_inserts(cursor, conn, rows, final=False):
    """Insert the pending rows into ``files``, commit, empty the list and return the row count."""
    if not rows:
        return 0
    count = len(rows)
    if final:
        print(f"💾 Final flush: {count} inserts")
    else:
        print(f"💾 Flushing {count} inserts...")
    cursor.executemany("""
        INSERT INTO files (
            name, path, path_md5, size, modified, type,
            folder_id, device_id, deleted,
            first_seen, last_seen
        )
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """, rows)
    conn.commit()
    rows.clear()
    return count


def _flush_file_updates(cursor, conn, rows, final=False):
    """Apply the pending size/modified updates to ``files``, commit, empty the list and return the row count."""
    if not rows:
        return 0
    count = len(rows)
    if final:
        print(f"💾 Final flush: {count} updates")
    else:
        print(f"💾 Flushing {count} updates...")
    cursor.executemany("""
        UPDATE files
        SET size=%s, modified=%s, last_seen=%s, deleted=0
        WHERE id=%s
    """, rows)
    conn.commit()
    rows.clear()
    return count


def walk_and_store_bulk(target_dir=r"u:\Dropbox", device_name="Z230", batch_size=10000):
    """Scan a directory tree and synchronise it into the ``walkfiles`` database.

    New files are inserted, changed files (size or modification time differs)
    are updated, and files or folders known to the database but not seen in
    this scan are flagged ``deleted=1``. A file that was flagged deleted and
    has reappeared is revived with an UPDATE: the ``files`` table created by
    ``init_db`` has a unique key on ``(device_id, path_md5)``, so inserting a
    second row for the same path would fail.

    Args:
        target_dir: Root directory to scan.
        device_name: Name of the device row the scan is recorded under.
        batch_size: Number of pending inserts/updates that triggers a flush.

    Note:
        Everything recorded for the device that is not found under
        *target_dir* is flagged deleted, so one device should always be
        scanned with the same root.
    """
    if not os.path.isdir(target_dir):
        print("Invalid directory:", target_dir)
        return

    conn, cursor = init_db()
    try:
        now = datetime.now()

        device_id = get_or_create_device(cursor, conn, device_name)
        folder_state = load_folder_state(cursor, device_id)
        file_state = load_last_file_state(cursor, device_id)

        seen_folders = set()
        seen_files = set()

        files_to_insert = []
        files_to_update = []

        # Running totals; the pending lists are emptied on every flush and
        # therefore cannot be used for the final summary.
        inserted = 0
        updated = 0
        total_files = 0

        print(f"🔍 Scanning: {target_dir} (device {device_id})")

        # -------------------------------------------------
        # WALK FILESYSTEM
        # -------------------------------------------------
        for root, _dirs, files in os.walk(target_dir):
            folder_path = os.path.normpath(root)
            parent_path = os.path.normpath(os.path.dirname(root)) if root != target_dir else None

            seen_folders.add(folder_path)
            folder_id = get_or_create_folder(cursor, conn, folder_state,
                                             device_id, folder_path,
                                             parent_path, now)

            # -------------------------------------------------
            # FILE LOOP
            # -------------------------------------------------
            for name in files:
                total_files += 1

                filepath = os.path.normpath(os.path.join(root, name))
                md5 = md5_path(filepath)
                # Registered before stat() so that a file that cannot be
                # inspected is not flagged as deleted by this scan.
                seen_files.add(md5)

                try:
                    st = os.stat(filepath)
                    modified = datetime.fromtimestamp(st.st_mtime).replace(microsecond=0)
                except (OSError, ValueError, OverflowError) as exc:
                    # Vanished, unreadable or carrying an unrepresentable
                    # timestamp: skip this file instead of aborting the scan.
                    print(f"⚠️ Cannot stat, skipping: {filepath} ({exc})")
                    continue

                size = st.st_size
                ext = os.path.splitext(name)[1][:250]

                prev = file_state.get(md5)

                if prev is None:
                    files_to_insert.append(
                        (name, filepath, md5, size, modified, ext,
                         folder_id, device_id, 0, now, now)
                    )
                elif (prev["deleted"] == 1
                      or prev["size"] != size
                      or prev["modified"] != modified):
                    # The UPDATE also resets deleted=0, which revives rows
                    # of files that have reappeared.
                    files_to_update.append(
                        (size, modified, now, prev["id"])
                    )

                # -------------------------------------------------
                # BATCH FLUSHING
                # NOTE(review): the source view lost its indentation; the
                # flush and progress checks are assumed to run per file.
                # -------------------------------------------------
                if len(files_to_insert) >= batch_size:
                    inserted += _flush_file_inserts(cursor, conn, files_to_insert)

                if len(files_to_update) >= batch_size:
                    updated += _flush_file_updates(cursor, conn, files_to_update)

                # PROGRESS
                if total_files % 1000 == 0:
                    print(f" ... processed {total_files} files")

        # -------------------------------------------------
        # MARK DELETED FILES
        # -------------------------------------------------
        files_to_mark_deleted = [
            (now, info["id"])
            for md5, info in file_state.items()
            if info["deleted"] == 0 and md5 not in seen_files
        ]

        # -------------------------------------------------
        # MARK DELETED FOLDERS
        # -------------------------------------------------
        folders_to_mark_deleted = [
            (now, info["id"])
            for path, info in folder_state.items()
            if info["deleted"] == 0 and path not in seen_folders
        ]

        # -------------------------------------------------
        # FINAL FLUSH (REMAINING BATCHES)
        # -------------------------------------------------
        inserted += _flush_file_inserts(cursor, conn, files_to_insert, final=True)
        updated += _flush_file_updates(cursor, conn, files_to_update, final=True)

        if files_to_mark_deleted:
            print(f"💾 Final flush: {len(files_to_mark_deleted)} deletions")
            cursor.executemany("""
                UPDATE files
                SET deleted=1, last_seen=%s
                WHERE id=%s
            """, files_to_mark_deleted)
            conn.commit()

        if folders_to_mark_deleted:
            cursor.executemany("""
                UPDATE folders
                SET deleted=1, last_seen=%s
                WHERE id=%s
            """, folders_to_mark_deleted)
            conn.commit()

        # -------------------------------------------------
        # Update device timestamp
        # -------------------------------------------------
        cursor.execute("UPDATE devices SET scanned_at=%s WHERE id=%s", (now, device_id))
        conn.commit()
    finally:
        # Release the database resources even when the scan fails midway.
        cursor.close()
        conn.close()

    print("")
    print("✅ Scan completed.")
    print(" Total files:", total_files)
    print(" Inserted:", inserted)
    print(" Updated:", updated)
    print(" Files deleted:", len(files_to_mark_deleted))
    print(" Folders deleted:", len(folders_to_mark_deleted))
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# MAIN ENTRY
|
||||||
|
# ======================================================
|
||||||
|
|
||||||
|
# Run the scan only when this file is executed directly, not when imported.
if __name__ == "__main__":
    walk_and_store_bulk()
|
||||||
28
30 Test.py
Normal file
28
30 Test.py
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
import mysql.connector
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Always load .env from the folder where THIS script is stored
|
||||||
|
env_path = Path(__file__).resolve().parent / ".env"
load_dotenv(env_path)

# =======================
# ENV DEBUG OUTPUT
# =======================
print("======================================")
print("ENV DEBUG")
print("Script file:", __file__)
print("Script folder:", Path(__file__).resolve().parent)
print("Expected .env path:", env_path)
print(".env exists? ->", env_path.exists())
print("Current working directory (cwd):", os.getcwd())
print("Loaded DB_MYSQL_HOST:", os.getenv("DB_MYSQL_HOST"))
print("Loaded DB_MYSQL_PORT:", os.getenv("DB_MYSQL_PORT"))
print("Loaded DB_MYSQL_ROOT:", os.getenv("DB_MYSQL_ROOT"))
# Never echo the password itself: debug output ends up in terminals and
# logs. Reporting whether it is set is enough to verify .env loading.
print("Loaded DB_MYSQL_ROOT_PASS:",
      "<set>" if os.getenv("DB_MYSQL_ROOT_PASS") else "<not set>")
print("======================================")
|
||||||
11
40 Test3.py
Normal file
11
40 Test3.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
import os
|
||||||
|
|
||||||
|
base = r"u:\Dropbox\Ordinace\Dokumentace_ke_zpracování"

print("Listing folders in:", base)
print("--------------------------------------")

# Report directory entries only; repr() makes unusual characters in the
# names visible.
folder_names = (
    entry for entry in os.listdir(base)
    if os.path.isdir(os.path.join(base, entry))
)
for folder in folder_names:
    print("FOLDER:", repr(folder))
|
||||||
192
50 MD5calculate.py
Normal file
192
50 MD5calculate.py
Normal file
@@ -0,0 +1,192 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import os
|
||||||
|
import hashlib
|
||||||
|
from datetime import datetime
|
||||||
|
import mysql.connector
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# Load environment
|
||||||
|
# ======================================================
|
||||||
|
env_path = Path(__file__).resolve().parent / ".env"
|
||||||
|
load_dotenv(env_path)
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# MySQL connection
|
||||||
|
# ======================================================
|
||||||
|
def get_db_connection():
    """Open a MySQL connection to the ``walkfiles`` database.

    Connection settings are read from the environment variables
    DB_MYSQL_HOST, DB_MYSQL_ROOT, DB_MYSQL_ROOT_PASS and DB_MYSQL_PORT.
    After connecting, the session character set is switched to utf8mb4
    with the utf8mb4_general_ci collation.

    Returns:
        The open connection; the caller is responsible for closing it.
    """
    # An unset or empty DB_MYSQL_PORT used to crash in int(); fall back to
    # the standard MySQL port instead.
    port = int(os.getenv("DB_MYSQL_PORT") or "3306")
    conn = mysql.connector.connect(
        host=os.getenv("DB_MYSQL_HOST"),
        user=os.getenv("DB_MYSQL_ROOT"),
        password=os.getenv("DB_MYSQL_ROOT_PASS"),
        port=port,
        database="walkfiles",
        auth_plugin="mysql_native_password"
    )
    try:
        c = conn.cursor()
        try:
            c.execute("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
        finally:
            c.close()
    except Exception:
        # Do not leak the connection when session setup fails.
        conn.close()
        raise
    return conn
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# Helpers
|
||||||
|
# ======================================================
|
||||||
|
def file_md5(path, chunk_size=1024 * 1024):
    """Return the hex MD5 digest of a file's contents.

    The file is read in binary mode, *chunk_size* bytes at a time, so the
    whole file is never held in memory at once.
    """
    digest = hashlib.md5()
    with open(path, "rb") as handle:
        # iter() with a sentinel stops at the first empty read (end of file).
        for block in iter(lambda: handle.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_size(size_str: "str | int | float") -> int:
    """Convert a human-readable size into a number of bytes.

    Accepts strings such as ``"10MB"``, ``"500kB"``, ``"2GB"``, ``"1TB"``
    or ``"100B"`` (case-insensitive, 1 KB = 1024 bytes), a plain numeric
    string, or a number that is already a byte count.

    Args:
        size_str: Size as text, or a number of bytes.

    Returns:
        The size in bytes, truncated to an integer.

    Raises:
        ValueError: If the text is not a number with an optional known unit.
    """
    # Numeric input is documented as accepted but used to crash on .strip().
    if isinstance(size_str, (int, float)) and not isinstance(size_str, bool):
        return int(size_str)

    s = str(size_str).strip().upper()
    # Two-letter units are tested before the bare "B" they all end with.
    units = (
        ("KB", 1024),
        ("MB", 1024 ** 2),
        ("GB", 1024 ** 3),
        ("TB", 1024 ** 4),
        ("B", 1),
    )
    for suffix, factor in units:
        if s.endswith(suffix):
            return int(float(s[:-len(suffix)]) * factor)
    return int(s)  # assume raw bytes
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# MAIN LOGIC
|
||||||
|
# ======================================================
|
||||||
|
def run_md5_calculator(device_name=None,
                       device_id=None,
                       extension=".pdf",
                       max_size="50MB"):
    """Compute and store content MD5 hashes for matching files of a device.

    Selects the non-deleted files of the device whose path ends with
    *extension* and whose size does not exceed *max_size*. A file is hashed
    when it has no stored ``content_md5``, or when the stored hash was
    calculated before the file's recorded modification time.

    Args:
        device_name: Device name; used to look up the id when *device_id*
            is not given.
        device_id: Device id; takes precedence over *device_name*.
        extension: Path suffix to filter on, e.g. ".pdf" or ".jpg".
        max_size: "10MB", "500KB", "1GB" or a number of bytes.

    Raises:
        RuntimeError: If neither identifier is given or the named device
            does not exist.

    MySQL errors are printed rather than raised. Files that are missing or
    unreadable on disk are reported and skipped.
    """
    max_bytes = parse_size(max_size)

    conn, cursor = None, None

    try:
        conn = get_db_connection()
        cursor = conn.cursor(dictionary=True)

        # ------------------------------------------
        # Resolve device_id if only device_name given
        # ------------------------------------------
        if device_id is None:
            if device_name is None:
                raise RuntimeError("You must provide device_name or device_id")
            cursor.execute("SELECT id FROM devices WHERE name=%s", (device_name,))
            row = cursor.fetchone()
            if not row:
                raise RuntimeError(f"Device '{device_name}' not found")
            device_id = row["id"]

        print(f"\n🔍 Filtering: device={device_id}, ext={extension}, max_size={max_bytes} bytes\n")

        # ------------------------------------------
        # SELECT only files that need MD5 calculation
        # ------------------------------------------
        cursor.execute("""
            SELECT id, path, size, modified, content_md5, md5_calculated
            FROM files
            WHERE device_id=%s
              AND deleted = 0
              AND path LIKE %s
              AND size <= %s
        """, (device_id, "%" + extension, max_bytes))

        rows = cursor.fetchall()
        total = len(rows)
        print(f"📁 Files matching criteria: {total}")

        updates = 0

        for row in rows:
            file_id = row["id"]
            path = row["path"]
            modified = row["modified"]
            prev_md5 = row["content_md5"]
            prev_calc = row["md5_calculated"]

            # -------------------------------
            # Skip missing files on disk
            # -------------------------------
            if not os.path.isfile(path):
                print(f"⚠️ Missing on disk, skipping: {path}")
                continue

            # -------------------------------
            # Check conditions for recalculation
            # -------------------------------
            # A NULL timestamp on either side cannot be compared, so it is
            # treated as "hash is stale" instead of raising TypeError.
            need_md5 = (
                prev_md5 is None
                or prev_calc is None
                or modified is None
                or prev_calc < modified
            )
            if not need_md5:
                continue

            # -------------------------------
            # Compute MD5
            # -------------------------------
            print(f"🔄 Calculating MD5: {path}")
            try:
                new_md5 = file_md5(path)
            except OSError as exc:
                # One locked or unreadable file must not abort the run and
                # discard the hashes already computed.
                print(f"⚠️ Cannot read, skipping: {path} ({exc})")
                continue
            now = datetime.now().replace(microsecond=0)

            cursor.execute("""
                UPDATE files
                SET content_md5=%s,
                    md5_calculated=%s
                WHERE id=%s
            """, (new_md5, now, file_id))

            updates += 1

        conn.commit()

        print("\n✅ MD5 calculation finished.")
        print(f" Updated files: {updates}")
        print(f" Skipped files: {total - updates}\n")

    # The bare name `Error` is never imported in this file, so reference the
    # exception class through the package that is imported.
    except mysql.connector.Error as e:
        print("MySQL Error:", e)

    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================
|
||||||
|
# RUN EXAMPLE
|
||||||
|
# ======================================================
|
||||||
|
# Example run: hash every PDF of device "Z230" that is at most 100 MB.
if __name__ == "__main__":
    example_args = {
        "device_name": "Z230",
        "extension": ".pdf",
        "max_size": "100MB",
    }
    run_md5_calculator(**example_args)
|
||||||
@@ -1,6 +0,0 @@
|
|||||||
DB_MYSQL_HOST=192.168.1.76
|
|
||||||
DB_MYSQL_PORT=3307
|
|
||||||
DB_MYSQL_ROOT=root
|
|
||||||
DB_MYSQL_ROOT_PASS=Vlado9674+
|
|
||||||
DB_MYSQL_PORT=3307
|
|
||||||
|
|
||||||
Reference in New Issue
Block a user