Compare commits

17 Commits

| Author | SHA1 | Date |
|---|---|---|
|  | a19281c3a4 |  |
|  | 0c94333abb |  |
|  | 51c77a8793 |  |
|  | f052362b31 |  |
|  | 773e67c9b6 |  |
|  | b0275928d2 |  |
|  | 4c420060ec |  |
|  | c6334c2244 |  |
|  | 50ee068af9 |  |
|  | c30a582323 |  |
|  | 01aa1249b9 |  |
|  | b74e180022 |  |
|  | 2037d1b887 |  |
|  | 6cdabc64b4 |  |
|  | 2aee823e87 |  |
|  | b61a8a5473 |  |
|  | 83f2d0dafc |  |
.gitignore (vendored) · Normal file · +2 lines

```diff
@@ -0,0 +1,2 @@
+.venv/
+.idea/
```
```diff
@@ -258,8 +258,7 @@ def walk_and_store_bulk():
     BATCH_SIZE = 10000
     # target_dir = r"\\tower1\#colddata"
     # target_dir = r"z:"
-    # target_dir = r"\\tower\ebooks"
-    target_dir = r"\\tower\dedup"
+    target_dir = r"\\tower\ebooks"
     # device_name = "TW22"
     device_name = "TOWER"
```
Deleted file · 485 lines removed

@@ -1,485 +0,0 @@

```python
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import hashlib
from datetime import datetime
import mysql.connector
from mysql.connector import Error
from dotenv import load_dotenv
from pathlib import Path
import unicodedata

# ======================================================
# Load .env from the script directory
# ======================================================

env_path = Path(__file__).resolve().parent / ".env"
load_dotenv(env_path)


# ======================================================
# Helper: MD5 of full file path string
# ======================================================

def md5_path(path: str) -> str:
    return hashlib.md5(path.encode("utf8")).hexdigest()


# ======================================================
# MySQL CONNECTIONS
# ======================================================

def get_server_connection():
    return mysql.connector.connect(
        host=os.getenv("DB_MYSQL_HOST"),
        user=os.getenv("DB_MYSQL_ROOT"),
        password=os.getenv("DB_MYSQL_ROOT_PASS"),
        port=int(os.getenv("DB_MYSQL_PORT")),
        auth_plugin="mysql_native_password",
    )


def get_db_connection():
    conn = mysql.connector.connect(
        host=os.getenv("DB_MYSQL_HOST"),
        user=os.getenv("DB_MYSQL_ROOT"),
        password=os.getenv("DB_MYSQL_ROOT_PASS"),
        port=int(os.getenv("DB_MYSQL_PORT")),
        database="walkfiles",
        auth_plugin="mysql_native_password",
    )

    c = conn.cursor()
    c.execute("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
    c.close()
    return conn


# ======================================================
# DATABASE INITIALIZATION
# ======================================================

def init_db():
    # Ensure DB exists
    server = get_server_connection()
    cur = server.cursor()
    cur.execute("""
        CREATE DATABASE IF NOT EXISTS walkfiles
        DEFAULT CHARACTER SET utf8mb4
        COLLATE utf8mb4_general_ci
    """)
    server.commit()
    cur.close()
    server.close()

    # Connect
    conn = get_db_connection()
    cursor = conn.cursor()

    # DEVICES
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS devices (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255) UNIQUE,
            scanned_at DATETIME NULL
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
    """)

    # FOLDERS
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS folders (
            id INT AUTO_INCREMENT PRIMARY KEY,
            path VARCHAR(2048) NOT NULL,
            parent_id INT NULL,
            device_id INT NOT NULL,
            first_seen DATETIME NOT NULL,
            last_seen DATETIME NOT NULL,
            deleted TINYINT(1) NOT NULL DEFAULT 0,

            CONSTRAINT fk_folder_device
                FOREIGN KEY (device_id) REFERENCES devices(id)
                ON DELETE CASCADE,

            UNIQUE KEY uniq_folder_path (device_id, path(255)),
            INDEX idx_folder_dev (device_id)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
    """)

    # FILES
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS files (
            id INT AUTO_INCREMENT PRIMARY KEY,

            name VARCHAR(255) NOT NULL,
            path VARCHAR(2048) NOT NULL,
            path_md5 CHAR(32) NOT NULL,

            size BIGINT NULL,
            modified DATETIME NULL,
            type VARCHAR(255) NULL,

            folder_id INT NULL,
            device_id INT NOT NULL,
            deleted TINYINT(1) NOT NULL DEFAULT 0,

            first_seen DATETIME NOT NULL,
            last_seen DATETIME NOT NULL,

            CONSTRAINT fk_file_folder
                FOREIGN KEY (folder_id) REFERENCES folders(id)
                ON DELETE SET NULL,

            CONSTRAINT fk_file_device
                FOREIGN KEY (device_id) REFERENCES devices(id)
                ON DELETE CASCADE,

            UNIQUE KEY uniq_file_path_md5 (device_id, path_md5),
            INDEX idx_file_folder (folder_id),
            INDEX idx_file_deleted (device_id, deleted)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
    """)

    conn.commit()
    return conn, cursor


# ======================================================
# HELPERS — DEVICES & FOLDERS
# ======================================================

def get_or_create_device(cursor, conn, name: str) -> int:
    now = datetime.now()
    cursor.execute("INSERT IGNORE INTO devices (name, scanned_at) VALUES (%s,%s)", (name, now))
    conn.commit()

    cursor.execute("SELECT id FROM devices WHERE name=%s", (name,))
    return cursor.fetchone()[0]


def load_folder_state(cursor, device_id: int):
    """
    Loads all folders for the device and stores them as:
    folder_state[normalized_path] = {"id": id, "deleted": 0/1}
    """
    cursor.execute("""
        SELECT id, path, deleted
        FROM folders
        WHERE device_id=%s
    """, (device_id,))

    out = {}
    for folder_id, path, deleted in cursor.fetchall():
        norm_path = os.path.normpath(path)
        out[norm_path] = {"id": folder_id, "deleted": int(deleted)}
    return out


def get_or_create_folder(cursor, conn, folder_state, device_id, folder_path, parent_id):
    """
    Creates or finds a folder. Handles:
    - Unicode normalization (Černý vs Černý, i.e. NFC vs NFD forms)
    - an in-memory cache (folder_state)
    - an idempotent INSERT (ON DUPLICATE KEY UPDATE)
    """
    # Normalize Unicode + path form
    folder_path = unicodedata.normalize("NFC", folder_path)
    folder_path = os.path.normpath(folder_path)

    key = folder_path

    # 1) Cache hit
    if key in folder_state:
        return folder_state[key]["id"]

    now = datetime.now()

    # 2) Try a SELECT first
    cursor.execute("""
        SELECT id
        FROM folders
        WHERE device_id = %s AND path = %s
        LIMIT 1
    """, (device_id, folder_path))
    row = cursor.fetchone()

    if row:
        folder_id = row[0]
        folder_state[key] = {"id": folder_id, "deleted": 0}
        return folder_id

    # 3) INSERT (idempotent)
    cursor.execute("""
        INSERT INTO folders (path, parent_id, device_id, first_seen, last_seen)
        VALUES (%s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE
            id = LAST_INSERT_ID(id),
            last_seen = VALUES(last_seen)
    """, (folder_path, parent_id, device_id, now, now))

    conn.commit()

    folder_id = cursor.lastrowid
    folder_state[key] = {"id": folder_id, "deleted": 0}
    return folder_id


# ======================================================
# LOAD LAST FILE STATE
# ======================================================

def load_last_file_state(cursor, device_id: int):
    """
    Loads the last known file state for the device, indexed by path_md5.
    (For historical reasons via MAX(id), even though there is a UNIQUE key
    on (device_id, path_md5).)
    """
    cursor.execute("""
        SELECT f.id, f.path_md5, f.deleted, f.size, f.modified
        FROM files f
        JOIN (
            SELECT MAX(id) AS mx
            FROM files
            WHERE device_id=%s
            GROUP BY path_md5
        ) t ON f.id = t.mx
    """, (device_id,))

    out = {}
    for fid, md5, deleted, size, modified in cursor.fetchall():
        out[md5] = {
            "id": fid,
            "deleted": int(deleted),
            "size": size,
            "modified": modified,
        }
    return out


# ======================================================
# MAIN SCANNER WITH BATCHING
# ======================================================

def walk_and_store_bulk():

    BATCH_SIZE = 10000
    # target_dir = r"\\tower1\#colddata"
    # target_dir = r"z:"
    target_dir = r"\\tower\ebooks"
    # target_dir = r"\\tower\dedup"
    device_name = "TOWER"

    # Normalized root for comparisons and LIKE
    target_dir_norm = os.path.normpath(target_dir)

    if not os.path.isdir(target_dir):
        print("Invalid directory:", target_dir)
        return

    conn, cursor = init_db()
    now = datetime.now()

    device_id = get_or_create_device(cursor, conn, device_name)
    folder_state = load_folder_state(cursor, device_id)
    file_state = load_last_file_state(cursor, device_id)

    seen_folders = set()
    seen_files = set()  # MD5 of path

    files_to_insert = []
    files_to_update = []

    total_files = 0

    print(f"🔍 Scanning: {target_dir} (device {device_id})")

    # -------------------------------------------------
    # WALK FILESYSTEM
    # -------------------------------------------------
    for root, dirs, files in os.walk(target_dir):
        folder_path = os.path.normpath(root)

        # 1️⃣ determine parent_id correctly
        if folder_path == target_dir_norm:
            parent_id = None
        else:
            parent_folder_path = os.path.normpath(os.path.dirname(folder_path))
            parent_id = get_or_create_folder(cursor, conn, folder_state,
                                             device_id, parent_folder_path,
                                             None)

        # 2️⃣ now insert current folder with correct parent_id
        seen_folders.add(folder_path)
        folder_id = get_or_create_folder(cursor, conn, folder_state,
                                         device_id, folder_path,
                                         parent_id)

        # -------------------------------------------------
        # FILE LOOP
        # -------------------------------------------------
        for name in files:
            total_files += 1

            filepath = os.path.normpath(os.path.join(folder_path, name))
            md5 = md5_path(filepath)
            seen_files.add(md5)

            try:
                st = os.stat(filepath)
            except FileNotFoundError:
                continue

            modified = datetime.fromtimestamp(st.st_mtime).replace(microsecond=0)
            size = st.st_size
            ext = os.path.splitext(name)[1][:250]

            prev = file_state.get(md5)

            if prev is None:
                # new file
                files_to_insert.append(
                    (name, filepath, md5, size, modified, ext,
                     folder_id, device_id, 0, now, now)
                )
            else:
                if prev["deleted"] == 1:
                    # "resurrected" file
                    files_to_insert.append(
                        (name, filepath, md5, size, modified, ext,
                         folder_id, device_id, 0, now, now)
                    )
                else:
                    # exists and is not deleted → check for a size/time change
                    if prev["size"] != size or prev["modified"] != modified:
                        files_to_update.append(
                            (size, modified, now, prev["id"])
                        )

            # -------------------------------------------------
            # BATCH FLUSHING
            # -------------------------------------------------
            if len(files_to_insert) >= BATCH_SIZE:
                print(f"💾 Flushing {len(files_to_insert)} inserts...")
                cursor.executemany("""
                    INSERT INTO files (
                        name, path, path_md5, size, modified, type,
                        folder_id, device_id, deleted,
                        first_seen, last_seen
                    )
                    VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                """, files_to_insert)
                conn.commit()
                files_to_insert.clear()

            if len(files_to_update) >= BATCH_SIZE:
                print(f"💾 Flushing {len(files_to_update)} updates...")
                cursor.executemany("""
                    UPDATE files
                    SET size=%s, modified=%s, last_seen=%s, deleted=0
                    WHERE id=%s
                """, files_to_update)
                conn.commit()
                files_to_update.clear()

            # PROGRESS
            if total_files % 1000 == 0:
                print(f"   ... processed {total_files} files")

    # -------------------------------------------------
    # FINAL FLUSH (REMAINING INSERTS/UPDATES)
    # -------------------------------------------------

    if files_to_insert:
        print(f"💾 Final flush: {len(files_to_insert)} inserts")
        cursor.executemany("""
            INSERT INTO files (
                name, path, path_md5, size, modified, type,
                folder_id, device_id, deleted,
                first_seen, last_seen
            )
            VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
        """, files_to_insert)
        conn.commit()

    if files_to_update:
        print(f"💾 Final flush: {len(files_to_update)} updates")
        cursor.executemany("""
            UPDATE files
            SET size=%s, modified=%s, last_seen=%s, deleted=0
            WHERE id=%s
        """, files_to_update)
        conn.commit()

    # -------------------------------------------------
    # MARK DELETED FILES — ONLY IN THIS SUBTREE
    # -------------------------------------------------
    files_deleted_count = 0

    like_prefix = target_dir_norm.rstrip("\\/") + "%"

    cursor.execute("""
        SELECT id, path_md5
        FROM files
        WHERE device_id = %s
          AND deleted = 0
          AND path LIKE %s
    """, (device_id, like_prefix))

    candidates = cursor.fetchall()
    ids_to_delete = [fid for (fid, md5) in candidates if md5 not in seen_files]

    if ids_to_delete:
        print(f"💾 Marking {len(ids_to_delete)} files as deleted in subtree")
        cursor.executemany("""
            UPDATE files
            SET deleted=1, last_seen=%s
            WHERE id=%s
        """, [(now, fid) for fid in ids_to_delete])
        conn.commit()
        files_deleted_count = len(ids_to_delete)

    # -------------------------------------------------
    # MARK DELETED FOLDERS — ONLY IN THIS SUBTREE
    # -------------------------------------------------
    folders_to_mark_deleted = []
    for path, info in folder_state.items():
        # restrict to the subtree (including the root folder)
        norm_path = os.path.normpath(path)
        if not norm_path.startswith(target_dir_norm):
            continue
        if info["deleted"] == 0 and norm_path not in seen_folders:
            folders_to_mark_deleted.append((now, info["id"]))

    folders_deleted_count = 0
    if folders_to_mark_deleted:
        cursor.executemany("""
            UPDATE folders
            SET deleted=1, last_seen=%s
            WHERE id=%s
        """, folders_to_mark_deleted)
        conn.commit()
        folders_deleted_count = len(folders_to_mark_deleted)

    # -------------------------------------------------
    # Update device timestamp
    # -------------------------------------------------
    cursor.execute("UPDATE devices SET scanned_at=%s WHERE id=%s", (now, device_id))
    conn.commit()

    cursor.close()
    conn.close()

    print("")
    print("✅ Scan completed.")
    print("   Total files scanned:", total_files)
    print("   Files inserted:", len(files_to_insert))  # 0 after the flush, kept for consistency
    print("   Files updated:", len(files_to_update))   # ditto
    print("   Files deleted in subtree:", files_deleted_count)
    print("   Folders deleted in subtree:", folders_deleted_count)


# ======================================================
# MAIN ENTRY
# ======================================================

if __name__ == '__main__':
    walk_and_store_bulk()
```
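A note on the folder upsert above: the `ON DUPLICATE KEY UPDATE id = LAST_INSERT_ID(id)` clause is what lets `cursor.lastrowid` return the existing row's id when the INSERT collides with the unique key. A minimal sketch of the idiom, assuming an open mysql.connector cursor and a hypothetical table `t` with a UNIQUE `name` column:

```python
# Assumes: CREATE TABLE t (id INT AUTO_INCREMENT PRIMARY KEY, name VARCHAR(64) UNIQUE)
cursor.execute("""
    INSERT INTO t (name) VALUES (%s)
    ON DUPLICATE KEY UPDATE id = LAST_INSERT_ID(id)
""", ("example",))
row_id = cursor.lastrowid  # id of the new row, or of the pre-existing duplicate
```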
```diff
@@ -8,7 +8,7 @@ import mysql.connector
 from dotenv import load_dotenv
 from pathlib import Path
 from concurrent.futures import ThreadPoolExecutor, as_completed
-import threading
 
 # ======================================================
 # Load environment
```
```diff
@@ -16,47 +16,6 @@ import threading
 env_path = Path(__file__).resolve().parent / ".env"
 load_dotenv(env_path)
 
-# ======================================================
-# LOGGING TOGGLE (ON/OFF)
-# ======================================================
-LOGGING_ENABLED = False  # ← NEW: Set to False to silence all thread debug logs
-
-# ======================================================
-# Colors & logging helpers
-# ======================================================
-RESET = "\033[0m"
-COLORS = [
-    "\033[92m",  # green
-    "\033[94m",  # blue
-    "\033[93m",  # yellow
-    "\033[91m",  # red
-    "\033[95m",  # magenta
-    "\033[96m",  # cyan
-    "\033[90m",  # gray
-]
-
-print_lock = threading.Lock()
-
-
-def thread_color():
-    name = threading.current_thread().name
-    idx = 0
-    if "_" in name:
-        suffix = name.split("_")[-1]
-        if suffix.isdigit():
-            idx = int(suffix)
-    return COLORS[idx % len(COLORS)]
-
-
-def log_thread(msg: str):
-    """Thread-safe, colored log with thread name prefix."""
-    if not LOGGING_ENABLED:  # ← NEW
-        return
-    with print_lock:
-        name = threading.current_thread().name
-        color = thread_color()
-        print(f"{color}[{name}] {msg}{RESET}")
-
 # ======================================================
 # MySQL connection (each thread gets its own)
```
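With the colored logging helpers gone, any remaining worker output goes through a bare `print()`; without the old `print_lock`, lines from concurrent threads can interleave on stdout. That is presumably acceptable here, since the per-file debug logging was removed along with the helpers.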
```diff
@@ -82,10 +41,7 @@ def get_db_connection():
 def file_md5(path, chunk_size=1024 * 1024):
     md5 = hashlib.md5()
     with open(path, "rb") as f:
-        while True:
-            chunk = f.read(chunk_size)
-            if not chunk:
-                break
-            md5.update(chunk)
+        while chunk := f.read(chunk_size):
+            md5.update(chunk)
     return md5.hexdigest()
```
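The rewritten loop is a behavior-preserving refactor: the walrus operator (Python 3.8+) folds the read-test-break pattern into the loop condition. On Python 3.11+ the standard library offers an even shorter equivalent; a hedged alternative sketch, not what the script uses:

```python
import hashlib

# Python 3.11+ stdlib alternative to the hand-rolled chunk loop above
# (same output; hashlib.file_digest handles the chunked reading itself).
def md5_via_stdlib(path):
    with open(path, "rb") as f:
        return hashlib.file_digest(f, "md5").hexdigest()
```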
```diff
@@ -108,28 +64,24 @@ def process_one_file(row):
     file_id = row["id"]
     path = row["path"]
     modified = row["modified"]
-    prev_md5 = row.get("content_md5")
-    prev_calc = row.get("md5_calculated")
-
-    log_thread(f"START ID={file_id} → {path}")
+    prev_md5 = row["content_md5"]
+    prev_calc = row["md5_calculated"]
 
     # --- Skip if file does not exist ---
     if not os.path.isfile(path):
-        log_thread(f"MISS ID={file_id} (file not found)")
-        return file_id, "missing", None
+        return (file_id, "missing", None)
 
-    # --- Decide if MD5 calculation is needed ---
-    if prev_md5 and prev_calc and prev_calc >= modified:
-        log_thread(f"SKIP ID={file_id} (md5 up-to-date)")
-        return file_id, "skip", None
+    # --- Decide if MD5 needed ---
+    need_md5 = (
+        prev_md5 is None or
+        prev_calc is None or
+        prev_calc < modified
+    )
+    if not need_md5:
+        return (file_id, "skip", None)
 
     # --- Calculate MD5 ---
-    try:
-        new_md5 = file_md5(path)
-    except Exception as e:
-        log_thread(f"ERROR ID={file_id} while reading file: {e}")
-        return file_id, "error", str(e)
+    new_md5 = file_md5(path)
 
     now = datetime.now().replace(microsecond=0)
 
     # --- Update DB inside thread ---
```
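The new guard recomputes whenever the stored hash or its timestamp is missing, or the file changed after the last calculation; only a present, up-to-date hash is skipped. A quick check with hypothetical values:

```python
from datetime import datetime, timedelta

modified = datetime(2024, 1, 2)
for prev_md5, prev_calc in [(None, None), ("abc", None),
                            ("abc", modified - timedelta(days=1)),
                            ("abc", modified)]:
    need_md5 = prev_md5 is None or prev_calc is None or prev_calc < modified
    print(prev_md5, prev_calc, "->", "recompute" if need_md5 else "skip")
# Only the last case (hash present, calculated at/after the mtime) is skipped.
```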
```diff
@@ -145,17 +97,14 @@ def process_one_file(row):
         conn.commit()
         c.close()
         conn.close()
-
-        log_thread(f"UPDATE ID={file_id} (MD5={new_md5})")
-        return file_id, "updated", new_md5
+        return (file_id, "updated", new_md5)
 
     except Exception as e:
-        log_thread(f"ERROR ID={file_id} DB update failed: {e}")
-        return file_id, "error", str(e)
+        return (file_id, "error", str(e))
 
 
 # ======================================================
-# MAIN LOGIC
+# MAIN LOGIC (single-threaded DB query + multi-threaded MD5)
 # ======================================================
 def run_md5_calculator(device_name=None,
                        device_id=None,
```
```diff
@@ -164,7 +113,9 @@ def run_md5_calculator(device_name=None,
                        path_prefix=None,
                        threads=8):
 
-    # DEVICE resolution
+    # ----------------------------
+    # DEVICE filter resolution
+    # ----------------------------
     filter_by_device = True
     if device_name == "ANY" or device_id == "ANY":
         filter_by_device = False
```
```diff
@@ -177,17 +128,21 @@ def run_md5_calculator(device_name=None,
         cur = conn.cursor(dictionary=True)
         cur.execute("SELECT id FROM devices WHERE name=%s", (device_name,))
         row = cur.fetchone()
-        cur.close()
-        conn.close()
+        cur.close(); conn.close()
 
         if not row:
             raise RuntimeError(f"Device '{device_name}' not found")
 
         device_id = row["id"]
 
+    # EXTENSION filter
     filter_by_extension = (extension != "ANY")
 
+    # SIZE filter
     filter_by_size = (max_size != "ANY")
     max_bytes = parse_size(max_size) if filter_by_size else None
 
+    # PATH filter
     filter_by_path = (path_prefix not in [None, "", "ANY"])
     cleaned_prefix = path_prefix.rstrip("\\/") if filter_by_path else None
```
```diff
@@ -197,7 +152,9 @@ def run_md5_calculator(device_name=None,
           f" max_size={max_size},"
           f" prefix={path_prefix}\n")
 
-    # Fetch rows
+    # ---------------------------------------
+    # Fetch all rows in a single DB query
+    # ---------------------------------------
     conn = get_db_connection()
     cursor = conn.cursor(dictionary=True)
```
```diff
@@ -224,38 +181,34 @@ def run_md5_calculator(device_name=None,
         SELECT id, path, size, modified, content_md5, md5_calculated
         FROM files
         WHERE {" AND ".join(where)}
-          AND NOT (
-                content_md5 IS NOT NULL
-            AND md5_calculated IS NOT NULL
-            AND md5_calculated >= modified
-          )
     """
 
     cursor.execute(sql, params)
     rows = cursor.fetchall()
-    cursor.close()
-    conn.close()
+    cursor.close(); conn.close()
 
     total = len(rows)
     print(f"📁 Files matching criteria: {total}\n")
 
-    if total == 0:
-        print("Nothing to do, exiting.")
-        return
+    # ======================================================
+    # === MULTITHREADED MD5 CALCULATION BELOW ============
+    # ======================================================
+    updated = 0
+    skipped = 0
+    missing = 0
+    errors = 0
 
-    # MULTITHREADED MD5
-    updated = skipped = missing = errors = 0
-
-    with ThreadPoolExecutor(max_workers=threads, thread_name_prefix="Worker") as exe:
+    with ThreadPoolExecutor(max_workers=threads) as exe:
         futures = {exe.submit(process_one_file, r): r["id"] for r in rows}
 
-        for i, future in enumerate(as_completed(futures), start=1):
+        for future in as_completed(futures):
             file_id = futures[future]
+            status, result = None, None
 
             try:
-                _id, status, result = future.result()
+                file_id, status, result = future.result()
             except Exception as e:
-                log_thread(f"FUTURE ERROR for ID={file_id}: {e}")
+                print(f"❌ Thread error for ID {file_id}: {e}")
                 errors += 1
                 continue
```
```diff
@@ -267,11 +220,11 @@ def run_md5_calculator(device_name=None,
                 missing += 1
             elif status == "error":
                 errors += 1
+                print(f"⚠️ DB update error: {result}")
 
-            if i % 100 == 0:
-                print(f"… processed {i}/{total} files")
-
+    # ======================================================
     # SUMMARY
+    # ======================================================
     print("\n============================")
     print("✅ Multithreaded MD5 finished")
     print("============================")
```
```diff
@@ -282,6 +235,7 @@ def run_md5_calculator(device_name=None,
     print(f"Threads: {threads}\n")
 
 
 # ======================================================
 # RUN EXAMPLE
 # ======================================================
```
```diff
@@ -291,5 +245,5 @@ if __name__ == "__main__":
         extension="ANY",
         max_size="ANY",
         path_prefix="ANY",
-        threads=6
+        threads=12   # ← ADJUST THREAD COUNT HERE
     )
```
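For reference, a hedged usage sketch of the entry point with the filters this diff shows; the device name and prefix values here are illustrative, and `max_size` goes through the script's `parse_size()` when set:

```python
run_md5_calculator(
    device_name="TOWER",            # or "ANY" to skip the device filter
    extension=".pdf",               # or "ANY"
    max_size="ANY",                 # e.g. "500MB", parsed by parse_size()
    path_prefix=r"\\tower\ebooks",  # or "ANY"
    threads=12,
)
```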
Mazání Library/10 První skript.py · Normal file · +200 lines

@@ -0,0 +1,200 @@

```python
import os
import sys
import pymysql
import pymysql.cursors

# ================= CONFIGURATION =================

# --- SAFETY SWITCH ---
# True  = LIST ONLY (nothing is deleted, the database is not changed)
# False = LIVE MODE (deletes files and DB records!)
DRY_MODE = True

# 1. MySQL access
DB_CONFIG = {
    'host': '192.168.1.76',
    'port': 3307,
    'user': 'root',
    'password': 'Vlado9674+',
    'db': 'torrents',
    'charset': 'utf8mb4',
    'autocommit': True
}

TABULKA = "file_md5_index"

# 2. Path mapping
SERVER_PREFIX = "/mnt/user/Library"
# Use a raw string so the backslashes are interpreted literally
LOCAL_PREFIX = r"\\tower1\#library"


# ===============================================

def get_connection():
    return pymysql.connect(
        cursorclass=pymysql.cursors.DictCursor,
        **DB_CONFIG
    )


def convert_path(db_path):
    """Converts a path from the Linux server to the local Windows path."""
    if db_path.startswith(SERVER_PREFIX):
        relative_path = db_path[len(SERVER_PREFIX):]
        # Strip the leading slashes from the relative path so the join works
        relative_path = relative_path.lstrip("/").lstrip("\\")
        # Join the paths and fix the slashes
        local_path = os.path.join(LOCAL_PREFIX, relative_path)
        return os.path.normpath(local_path)
    return None


def step_1_mark_duplicates():
    print(f"\n--- STEP 1: Finding duplicates in the DB (DRY_MODE={DRY_MODE}) ---")

    try:
        conn = get_connection()
        with conn.cursor() as cursor:
            if DRY_MODE:
                # In DRY_MODE we only count what would be marked (no UPDATE is run)
                # NOTE: this dry-run counts host_name 'TOWER1', while the live
                # branch below updates 'TOWER'; the two branches disagree.
                sql = f"""
                    SELECT COUNT(*) as pocet
                    FROM {TABULKA} t1
                    JOIN {TABULKA} t2 ON t1.blake3 = t2.blake3
                    WHERE t1.host_name = 'TOWER1'
                      AND t2.host_name = 'SYNOLOGY'
                      AND (t1.to_delete IS NULL OR t1.to_delete = 0);
                """
                cursor.execute(sql)
                result = cursor.fetchone()
                affected = result['pocet']
                print(f"[DRY-RUN] Found {affected} matching records (the DB will not be changed).")
            else:
                # In LIVE mode we run the UPDATE
                sql = f"""
                    UPDATE {TABULKA} t1
                    JOIN {TABULKA} t2 ON t1.blake3 = t2.blake3
                    SET t1.to_delete = 1
                    WHERE t1.host_name = 'TOWER'
                      AND t2.host_name = 'SYNOLOGY'
                      AND (t1.to_delete IS NULL OR t1.to_delete = 0);
                """
                print("Running the UPDATE on database records...")
                cursor.execute(sql)
                affected = cursor.rowcount
                conn.commit()
                print(f"Done. Marked {affected} records for deletion.")

        conn.close()
        return affected

    except pymysql.MySQLError as e:
        print(f"MySQL error while marking: {e}")
        sys.exit(1)


def step_2_delete_files():
    print(f"\n--- STEP 2: Deleting files (DRY_MODE={DRY_MODE}) ---")

    try:
        conn = get_connection()
        files_to_process = []

        with conn.cursor() as cursor:
            print("Fetching the file list...")

            if DRY_MODE:
                # In DRY_MODE we cannot search by 'to_delete=1' (nothing was marked),
                # so we use the JOIN query directly to simulate the listing.
                sql = f"""
                    SELECT t1.id, t1.full_path
                    FROM {TABULKA} t1
                    JOIN {TABULKA} t2 ON t1.blake3 = t2.blake3
                    WHERE t1.host_name = 'TOWER'
                      AND t2.host_name = 'SYNOLOGY'
                      AND (t1.to_delete IS NULL OR t1.to_delete = 0)
                """
            else:
                # In LIVE mode we take what step 1 marked
                sql = f"SELECT id, full_path FROM {TABULKA} WHERE host_name = 'TOWER' AND to_delete = 1"

            cursor.execute(sql)
            files_to_process = cursor.fetchall()

        count = len(files_to_process)
        print(f"Found {count} files.")

        if count == 0:
            print("No files to process. Exiting.")
            return

        # In live mode, ask for confirmation
        if not DRY_MODE:
            confirm = input(f"-> [WARNING] Do you really want to DELETE {count} files? (type 'ano'): ")
            if confirm.lower() != 'ano':
                print("Operation cancelled.")
                return
        else:
            print("-" * 40)
            print("LIST OF FILES THAT WOULD BE DELETED:")
            print("-" * 40)

        deleted_counter = 0
        errors = 0

        for row in files_to_process:
            db_id = row['id']
            server_path = row['full_path']
            local_path = convert_path(server_path)

            if not local_path:
                print(f"[SKIP PATH] Prefix does not match: {server_path}")
                continue

            # --- DRY RUN vs REAL logic ---
            if DRY_MODE:
                # Listing only
                print(f"[DRY-RUN] Would be deleted: {local_path}")
                deleted_counter += 1
            else:
                # Live deletion
                try:
                    if os.path.exists(local_path):
                        os.remove(local_path)
                        print(f"[OK DELETED] {local_path}")

                        # Delete from the DB
                        with conn.cursor() as del_cursor:
                            del_sql = f"DELETE FROM {TABULKA} WHERE id = %s"
                            del_cursor.execute(del_sql, (db_id,))
                        conn.commit()
                        deleted_counter += 1
                    else:
                        print(f"[NOT ON DISK] Deleting from DB only: {local_path}")
                        with conn.cursor() as del_cursor:
                            del_sql = f"DELETE FROM {TABULKA} WHERE id = %s"
                            del_cursor.execute(del_sql, (db_id,))
                        conn.commit()
                        deleted_counter += 1

                except OSError as e:
                    print(f"[OS ERROR] {local_path}: {e}")
                    errors += 1
                except pymysql.MySQLError as e:
                    print(f"[DB ERROR] ID {db_id}: {e}")

        conn.close()
        print("-" * 30)
        if DRY_MODE:
            print(f"DRY RUN FINISHED. Simulation shown for {deleted_counter} files.")
        else:
            print(f"DONE. Successfully deleted: {deleted_counter}, errors: {errors}")

    except pymysql.MySQLError as e:
        print(f"Critical DB error: {e}")


if __name__ == "__main__":
    step_1_mark_duplicates()
    step_2_delete_files()
```
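A quick sanity check of the path mapping, using a hypothetical file path:

```python
# On Windows, convert_path maps the server path to the UNC path, e.g.:
#   "/mnt/user/Library/Photos/2021/img.jpg"
#   -> r"\\tower1\#library\Photos\2021\img.jpg"
print(convert_path("/mnt/user/Library/Photos/2021/img.jpg"))
```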
Mazání Library/20 MazáníPrádnýchAdresářů.py · Normal file · +38 lines

@@ -0,0 +1,38 @@

```python
import os


def cleanup_empty_folders(target_path, dry_run=True):
    # Check that the path exists
    if not os.path.exists(target_path):
        print(f"Error: path {target_path} does not exist.")
        return

    print(f"Scanning: {target_path}")
    if dry_run:
        print("WARNING: running in DRY RUN mode (nothing is deleted)\n")

    # topdown=False is crucial: we start from the deepest folders
    for root, dirs, files in os.walk(target_path, topdown=False):
        for name in dirs:
            folder_path = os.path.join(root, name)

            # Check whether the folder is empty
            # listdir returns everything in the directory (including hidden files)
            if not os.listdir(folder_path):
                if dry_run:
                    print(f"[DRY RUN] Folder would be deleted: {folder_path}")
                else:
                    try:
                        os.rmdir(folder_path)
                        print(f"Deleted: {folder_path}")
                    except OSError as e:
                        print(f"Error deleting {folder_path}: {e}")


if __name__ == "__main__":
    # Path to your network share
    # On Windows, use r"" (raw string) in Python because of the backslashes
    path_to_clean = r"\\tower1\#library"

    # Run with dry_run=True first to see what would happen
    cleanup_empty_folders(path_to_clean, dry_run=True)
```
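One subtlety: in live mode the bottom-up walk (`topdown=False`) also catches folders that become empty once their empty children are removed, but a dry run undercounts these nested cases, since nothing is actually deleted. A hedged sketch of a dry run that simulates the removals instead (not part of the script above):

```python
import os

def simulate_cleanup(target_path):
    """Dry run that accounts for nested empty folders by simulating removals."""
    removed = set()
    for root, dirs, files in os.walk(target_path, topdown=False):
        for name in dirs:
            folder = os.path.join(root, name)
            # Ignore children we have already "removed" in this simulation
            entries = [e for e in os.listdir(folder)
                       if os.path.join(folder, e) not in removed]
            if not entries:
                print(f"[SIM] would delete: {folder}")
                removed.add(folder)
    return removed
```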
PST/10 ReadKulhavaPST.py · Normal file · +92 lines

@@ -0,0 +1,92 @@

```python
import win32com.client
import os

# Your specific file path
pst_path = r'd:\Dropbox\!!!Days\Downloads Z230\PST\tkulhava.pst'


def main():
    if not os.path.exists(pst_path):
        print(f"Error: File not found at {pst_path}")
        return

    try:
        # Connect to Outlook
        outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")

        # 1. Add the PST to Outlook (This makes it visible in the sidebar)
        print(f"Mounting PST: {pst_path}...")
        outlook.AddStore(pst_path)

        # 2. Find the folder object for this PST
        # We search specifically for the folder that matches the filename 'tkulhava'
        # or grab the last added store if the name doesn't match exactly.
        pst_name = "tkulhava"  # derived from filename usually
        root_folder = None

        # Loop through all stores to find the new one
        for folder in outlook.Folders:
            if pst_name.lower() in folder.Name.lower():
                root_folder = folder
                break

        # Fallback: Just grab the last folder in the list if name didn't match
        if not root_folder:
            root_folder = outlook.Folders.GetLast()

        print(f"Successfully opened root folder: {root_folder.Name}")
        print("=" * 50)

        # 3. Start the recursive walk
        print_subjects_recursively(root_folder)

        # 4. Cleanup: Remove the PST from Outlook
        # (Comment this out if you want to keep it open in Outlook to inspect manually)
        outlook.RemoveStore(root_folder)
        print("\nDone. PST detached.")

    except Exception as e:
        print(f"An error occurred: {e}")


def print_subjects_recursively(folder):
    """
    Recursively prints subjects of emails in a folder and its subfolders.
    """
    try:
        # Print current folder name for context
        # Check if folder has items
        if folder.Items.Count > 0:
            print(f"\n--- Folder: {folder.Name} ---")

            # Iterate through items
            for item in folder.Items:
                try:
                    # Class 43 is a standard MailItem.
                    # Other items (meeting requests, reports) might not have a Subject or behave differently.
                    if item.Class == 43:
                        print(f"Subject: {item.Subject}")
                    else:
                        # Attempt to print subject anyway (e.g., for Meeting Items)
                        print(f"[{type_name(item.Class)}] Subject: {item.Subject}")
                except Exception:
                    # Skip items that are corrupted or unreadable
                    pass

        # Recursion: Go deeper into subfolders
        for subfolder in folder.Folders:
            print_subjects_recursively(subfolder)

    except Exception as e:
        print(f"Skipping restricted folder '{folder.Name}': {e}")


def type_name(class_id):
    # Helper to identify non-email items
    if class_id == 53: return "Meeting"
    if class_id == 46: return "Report"
    return f"Type {class_id}"


if __name__ == "__main__":
    main()
```
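One detail worth knowing about the fallback above: Outlook's COM collections are 1-based, and `GetLast()` returns the final entry, which is usually (though not guaranteed to be) the store that was just added. An equivalent explicit form:

```python
# Equivalent to outlook.Folders.GetLast(); Outlook COM collections are 1-based.
last_index = outlook.Folders.Count
root_folder = outlook.Folders.Item(last_index)
```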
PST/20 ReadKulhavaSavePhotos.py · Normal file · +142 lines

@@ -0,0 +1,142 @@

```python
import win32com.client
import os
import pathlib

# --- CONFIGURATION ---
pst_path = r'd:\Dropbox\!!!Days\Downloads Z230\PST\tkulhava.pst'
output_dir = r'd:\Dropbox\!!!Days\Downloads Z230\PST\pictures'

# Image extensions to look for (case insensitive)
IMAGE_EXTENSIONS = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tif', '.tiff'}


def fix_encoding(text):
    """Repairs text wrongly decoded as cp1252 instead of cp1250."""
    if not text: return ""
    try:
        return text.encode('cp1252').decode('cp1250')
    except Exception:
        return text


def get_unique_filepath(directory, filename):
    """
    Checks if a file exists. If so, adds a counter (_1, _2) to the filename
    until a unique name is found.
    """
    # Clean filename of illegal characters just in case
    filename = "".join(x for x in filename if x.isalnum() or x in "._- ")

    path = pathlib.Path(directory) / filename
    if not path.exists():
        return path

    # Split name and extension
    stem = path.stem
    suffix = path.suffix
    counter = 1

    while True:
        new_filename = f"{stem}_{counter}{suffix}"
        new_path = pathlib.Path(directory) / new_filename
        if not new_path.exists():
            return new_path
        counter += 1


def process_item_attachments(item, save_folder):
    """Checks an item for attachments and saves pictures."""
    try:
        # Check if item has attachments
        if item.Attachments.Count > 0:
            for attachment in item.Attachments:
                try:
                    # Get filename and extension
                    fname = getattr(attachment, 'FileName', '')
                    if not fname: continue

                    # Fix encoding on filename if needed (sometimes attachments inherit bad encoding)
                    fname = fix_encoding(fname)

                    ext = os.path.splitext(fname)[1].lower()

                    if ext in IMAGE_EXTENSIONS:
                        # Determine unique path
                        save_path = get_unique_filepath(save_folder, fname)

                        # Save the file
                        attachment.SaveAsFile(str(save_path))
                        print(f"  [SAVED] {save_path.name}")
                except Exception as e:
                    print(f"  [ERROR saving attachment]: {e}")
    except Exception:
        # Some items (like corrupted notes) fail when accessing .Attachments
        pass


def scan_folder_recursively(folder, save_folder):
    """Recursively walks folders and processes items."""
    try:
        folder_name = fix_encoding(folder.Name)

        # Optional: Print folder progress
        if folder.Items.Count > 0:
            print(f"Scanning Folder: {folder_name}...")

        # Process items in this folder
        for item in folder.Items:
            process_item_attachments(item, save_folder)

        # Recursion
        for subfolder in folder.Folders:
            scan_folder_recursively(subfolder, save_folder)

    except Exception as e:
        print(f"Skipping folder '{fix_encoding(folder.Name)}': {e}")


def main():
    # 1. Ensure output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created directory: {output_dir}")

    if not os.path.exists(pst_path):
        print(f"Error: PST file not found at {pst_path}")
        return

    try:
        # 2. Connect to Outlook
        outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")

        print(f"Mounting PST: {pst_path}...")
        outlook.AddStore(pst_path)

        # 3. Find the PST folder
        pst_name = "tkulhava"  # Usually derived from filename
        root_folder = None
        for folder in outlook.Folders:
            if pst_name.lower() in folder.Name.lower():
                root_folder = folder
                break

        if not root_folder:
            root_folder = outlook.Folders.GetLast()

        print(f"Opened: {fix_encoding(root_folder.Name)}")
        print(f"Saving pictures to: {output_dir}")
        print("=" * 50)

        # 4. Start processing
        scan_folder_recursively(root_folder, output_dir)

        # 5. Cleanup
        outlook.RemoveStore(root_folder)
        print("\nDone. PST detached.")

    except Exception as e:
        print(f"Critical Error: {e}")


if __name__ == "__main__":
    main()
```
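The `fix_encoding` helper repairs the classic Central European mojibake where cp1250 bytes were mis-decoded as cp1252. For example:

```python
# "Černý" stored as cp1250 bytes but decoded as cp1252 renders as "Èerný";
# re-encoding to cp1252 and decoding as cp1250 restores it.
broken = "Èerný"
print(broken.encode("cp1252").decode("cp1250"))  # -> Černý
```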
Pandas/10 DuplicateTest.py · Normal file · +67 lines

@@ -0,0 +1,67 @@

```python
import pandas as pd
from sqlalchemy import create_engine
import time

# --- CONFIGURATION ---
db_user = 'root'
db_pass = 'Vlado9674+'
db_host = '192.168.1.76'
db_port = '3307'
db_name = 'torrents'  # <--- FILL IN THE DATABASE NAME HERE

# --- CONNECTION ---
connection_string = f'mysql+mysqlconnector://{db_user}:{db_pass}@{db_host}:{db_port}/{db_name}'
engine = create_engine(connection_string)

# SQL query: we also select full_path so Pandas can show a sample path
# WARNING: loading 5.8M text strings (full_path) takes a lot of RAM (roughly 2-4 GB).
query = """
    SELECT id, blake3, file_size, full_path
    FROM file_md5_index FORCE INDEX (idx_full_path_prefix)
    WHERE host_name='Tower1' AND full_path LIKE '/mnt/user/#Library%'
"""

print("1. Starting to download data from MySQL into RAM...")
start_load = time.time()

try:
    # Download the data
    df = pd.read_sql(query, engine)
    end_load = time.time()
    print(f"-> Data downloaded in: {end_load - start_load:.2f} seconds")
    print(f"-> Rows in memory: {len(df)}")

    print("\n2. Starting the duplicate search (Pandas GroupBy)...")
    start_process = time.time()

    # Duplicate-finding logic
    # Keep only the rows whose hash occurs more than once
    duplicity = df[df.duplicated(subset=['blake3'], keep=False)]

    if not duplicity.empty:
        # Grouping
        vysledek = duplicity.groupby('blake3').agg({
            'file_size': 'first',  # file size (assumed equal for the same hash)
            'id': 'count',  # number of occurrences
            'full_path': lambda x: x.iloc[0]  # sample of the first path (faster than 'first')
        }).rename(columns={'id': 'pocet_kopii'})

        # Keep only entries that really have multiple copies and sort by size * copy count
        # (we want to see where the most space is wasted)
        vysledek['celkove_plytvani'] = vysledek['file_size'] * (vysledek['pocet_kopii'] - 1)
        vysledek = vysledek.sort_values('celkove_plytvani', ascending=False)

        end_process = time.time()
        print(f"-> Processed in: {end_process - start_process:.4f} seconds")

        print("\n--- TOP 20 LARGEST DUPLICATES ---")
        # Show the hash, copy count, single-file size and a sample path
        print(vysledek[['pocet_kopii', 'file_size', 'full_path']].head(20))

        print(f"\nFound {len(vysledek)} unique files that have duplicates in total.")
    else:
        print("No duplicates were found.")

except Exception as e:
    print(f"\nERROR: {e}")
    print("Please check the database name and that you have enough RAM.")
```
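Note on `duplicated(..., keep=False)`: it flags every member of a duplicated group, which is what the grouping step needs; the default `keep='first'` would drop one copy per hash:

```python
import pandas as pd

s = pd.Series(["a", "b", "a", "a"])
print(s.duplicated(keep=False).tolist())  # [True, False, True, True]
print(s.duplicated().tolist())            # [False, False, True, True]
```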
295
WalkFilesOnBackupHDD/10 WalkBackupHDD.py
Normal file
295
WalkFilesOnBackupHDD/10 WalkBackupHDD.py
Normal file
@@ -0,0 +1,295 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
"""
|
||||||
|
FAST FILE HASH INDEXER – WINDOWS CLIENT (EXTERNAL DISKS)
|
||||||
|
- Mode: PHYSICAL BACKUP
|
||||||
|
- Hostname in DB = Disk Label (e.g., #HD015)
|
||||||
|
- Path in DB = Relative path (e.g., /Movies/Film.mkv)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os, time
|
||||||
|
import pymysql
|
||||||
|
import socket
|
||||||
|
import platform
|
||||||
|
import sys
|
||||||
|
from blake3 import blake3
|
||||||
|
|
||||||
|
# ==============================
|
||||||
|
# CONFIG
|
||||||
|
# ==============================
|
||||||
|
CHUNK_SIZE = 5 * 1024 * 1024 # 5 MB
|
||||||
|
PROGRESS_MIN_SIZE = 500 * 1024 * 1024 # 500 MB
|
||||||
|
PROGRESS_INTERVAL = 1.0 # seconds
|
||||||
|
|
||||||
|
EXCLUDED_DIRS = {"$RECYCLE.BIN", "System Volume Information", "RECYCLER", "msdownld.tmp"}
|
||||||
|
|
||||||
|
# --- Limity velikosti ---
|
||||||
|
FILE_MIN_SIZE = 0
|
||||||
|
FILE_MAX_SIZE = 1024 * 1024 * 1024* 1024 # 1TB
|
||||||
|
|
||||||
|
# --- Nastavení Databáze ---
|
||||||
|
DB_CONFIG = {
|
||||||
|
"host": "192.168.1.76",
|
||||||
|
"port": 3307,
|
||||||
|
"user": "root",
|
||||||
|
"password": "Vlado9674+",
|
||||||
|
"database": "torrents",
|
||||||
|
"charset": "utf8mb4",
|
||||||
|
"autocommit": True,
|
||||||
|
}
|
||||||
|
|
||||||
|
CHUNK_SIZE = 4 * 1024 * 1024 # 4 MB
|
||||||
|
PRINT_SKIPPED = False # True = vypisovat i přeskočené
|
||||||
|
|
||||||
|
# ==============================
|
||||||
|
# SYSTEM INFO
|
||||||
|
# ==============================
|
||||||
|
|
||||||
|
# Fyzický název PC (jen pro výpis do konzole, do DB půjde název disku)
|
||||||
|
REAL_PC_HOSTNAME = socket.gethostname()
|
||||||
|
OS_NAME = platform.system()
|
||||||
|
|
||||||
|
|
||||||
|
# ==============================
|
||||||
|
# FUNCTIONS
|
||||||
|
# ==============================
|
||||||
|
|
||||||
|
def compute_blake3(path: str) -> bytes:
|
||||||
|
h = blake3()
|
||||||
|
total_size = os.path.getsize(path)
|
||||||
|
show_progress = total_size >= PROGRESS_MIN_SIZE
|
||||||
|
|
||||||
|
processed = 0
|
||||||
|
start_time = time.time()
|
||||||
|
last_report = start_time
|
||||||
|
|
||||||
|
try:
|
||||||
|
with open(path, "rb") as f:
|
||||||
|
while True:
|
||||||
|
chunk = f.read(CHUNK_SIZE)
|
||||||
|
if not chunk:
|
||||||
|
break
|
||||||
|
|
||||||
|
h.update(chunk)
|
||||||
|
processed += len(chunk)
|
||||||
|
|
||||||
|
if show_progress:
|
||||||
|
now = time.time()
|
||||||
|
if now - last_report >= PROGRESS_INTERVAL:
|
||||||
|
elapsed = now - start_time
|
||||||
|
speed = processed / elapsed if elapsed > 0 else 0
|
||||||
|
percent = processed / total_size * 100
|
||||||
|
remaining = total_size - processed
|
||||||
|
eta = remaining / speed if speed > 0 else 0
|
||||||
|
|
||||||
|
print(
|
||||||
|
f" ⏳ {percent:6.2f}% | "
|
||||||
|
f"{processed/1024/1024:8.1f} / {total_size/1024/1024:.1f} MB | "
|
||||||
|
f"{speed/1024/1024:6.1f} MB/s | "
|
||||||
|
f"ETA {time.strftime('%H:%M:%S', time.gmtime(eta))}",
|
||||||
|
flush=True
|
||||||
|
)
|
||||||
|
last_report = now
|
||||||
|
|
||||||
|
if show_progress:
|
||||||
|
total_time = time.time() - start_time
|
||||||
|
avg_speed = total_size / total_time if total_time > 0 else 0
|
||||||
|
print(
|
||||||
|
f" ✅ DONE | "
|
||||||
|
f"{total_size/1024/1024:.1f} MB | "
|
||||||
|
f"avg {avg_speed/1024/1024:.1f} MB/s | "
|
||||||
|
f"time {time.strftime('%H:%M:%S', time.gmtime(total_time))}",
|
||||||
|
flush=True
|
||||||
|
)
|
||||||
|
|
||||||
|
return h.digest()
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"⚠️ HASH ERROR: {path} - {e}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def get_drive_info():
|
||||||
|
"""Získá písmeno disku a jeho ID (které se použije jako host_name)."""
|
||||||
|
print("\n💿 --- NASTAVENÍ SKENOVÁNÍ (EXTERNÍ DISK) ---")
|
||||||
|
|
||||||
|
# 1. Písmeno disku
|
||||||
|
while True:
|
||||||
|
drive_input = input("📂 Zadejte písmeno disku ve Windows (např. 'E'): ").strip().upper()
|
||||||
|
drive_letter = drive_input.replace(":", "").replace("\\", "").replace("/", "")
|
||||||
|
|
||||||
|
if len(drive_letter) == 1 and drive_letter.isalpha():
|
||||||
|
drive_root = f"{drive_letter}:\\"
|
||||||
|
if os.path.isdir(drive_root):
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
print(f"❌ Disk {drive_root} není dostupný.")
|
||||||
|
else:
|
||||||
|
print("❌ Neplatný formát.")
|
||||||
|
|
||||||
|
# 2. Název disku -> HOST_NAME
|
||||||
|
while True:
|
||||||
|
disk_label = input("🏷️ Zadejte ID disku (bude uloženo jako 'host_name', např. '#HD015'): ").strip()
|
||||||
|
if len(disk_label) >= 2:
|
||||||
|
break
|
||||||
|
print("❌ Název je příliš krátký.")
|
||||||
|
|
||||||
|
return drive_root, disk_label
|
||||||
|
|
||||||
|
|
||||||
|
def size_allowed(size: int) -> bool:
|
||||||
|
if FILE_MIN_SIZE is not None and size < FILE_MIN_SIZE: return False
|
||||||
|
if FILE_MAX_SIZE is not None and size > FILE_MAX_SIZE: return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
# ==============================
|
||||||
|
# MAIN
|
||||||
|
# ==============================
|
||||||
|
|
||||||
|
def main():
    print("🚀 BLAKE3 External Disk Indexer", flush=True)
    print(f"🖥 Running on PC: {REAL_PC_HOSTNAME}", flush=True)

    # Gather inputs
    scan_root, disk_hostname = get_drive_info()

    print("✅ Configuration:")
    print(f"   Source (Windows) : {scan_root}")
    print(f"   DB Hostname      : {disk_hostname}")
    print("   DB Paths         : /Folder/File...")

    try:
        db = pymysql.connect(**DB_CONFIG)
        cur = db.cursor()
    except Exception as e:
        print(f"❌ DB Connection failed: {e}")
        input("Press Enter to exit...")
        return

    print(f"📥 Loading index for disk: '{disk_hostname}'...", flush=True)

    # === OPTIMIZATION: query exactly by host_name ===
    cur.execute("""
        SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
        FROM file_md5_index
        WHERE host_name = %s
    """, (disk_hostname,))

    # Map: { "/Folder/File.ext": (size, mtime) }
    indexed_map = {row[0]: (row[1], row[2]) for row in cur.fetchall()}

    print(f"✅ Found {len(indexed_map):,} files in DB for this disk.", flush=True)
    print("======================================", flush=True)

    new_files = 0
    skipped = 0
    filtered = 0
    errors = 0
    seen_paths = set()

    # --- SCAN ---
    for root, dirs, files in os.walk(scan_root):
        # Skip system folders (in-place, so os.walk does not descend into them)
        dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]

        for fname in files:
            disk_path = os.path.join(root, fname)

            # 1. Stat (size, mtime)
            try:
                stat = os.stat(disk_path)
            except OSError:
                errors += 1
                continue

            size = stat.st_size
            if not size_allowed(size):
                filtered += 1
                continue

            # 2. Build the clean DB path
            #    E:\Movies\Avatar.mkv -> Movies\Avatar.mkv
            try:
                rel_path = os.path.relpath(disk_path, scan_root)
            except ValueError:
                errors += 1
                continue

            # Normalize to Linux style: Movies/Avatar.mkv
            clean_path = rel_path.replace("\\", "/")

            # Prepend a leading slash: /Movies/Avatar.mkv
            if not clean_path.startswith("/"):
                clean_path = "/" + clean_path

            if clean_path in seen_paths:
                continue
            seen_paths.add(clean_path)

            mtime = int(stat.st_mtime)

            # === STRICT CHECK ===
            is_match = False
            if clean_path in indexed_map:
                db_size, db_mtime = indexed_map[clean_path]
                if size == db_size and mtime == db_mtime:
                    is_match = True

            if is_match:
                skipped += 1
                if PRINT_SKIPPED:
                    print(f"⏭ SKIP {clean_path}", flush=True)
                continue

            # === INSERT / UPDATE ===
            print("➕ NEW / UPDATED", flush=True)
            print(f"   File: {clean_path}", flush=True)
            print(f"   Size: {size:,} B", flush=True)

            try:
                b3 = compute_blake3(disk_path)
            except Exception:
                errors += 1
                continue

            cur.execute("""
                INSERT INTO file_md5_index
                    (os_name, host_name, full_path, file_name, directory,
                     file_size, mtime, blake3)
                VALUES (%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
                ON DUPLICATE KEY UPDATE
                    file_size  = VALUES(file_size),
                    mtime      = VALUES(mtime),
                    blake3     = VALUES(blake3),
                    updated_at = CURRENT_TIMESTAMP
            """, (
                OS_NAME,          # e.g. 'Windows' (where the scan ran)
                disk_hostname,    # stores e.g. '#HD015'
                clean_path,       # stores e.g. '/Movies/Avatar.mkv'
                fname,
                os.path.dirname(clean_path),
                size,
                mtime,
                b3,
            ))

            new_files += 1
            print(f"   Hash: {b3.hex()}", flush=True)
            print("--------------------------------------", flush=True)

    print("======================================", flush=True)
    print(f"✅ Done    : {new_files}")
    print(f"⏭ Matched : {skipped}")
    print(f"⚠️ Errors  : {errors}")
    print("🏁 End.")

    cur.close()
    db.close()
    # input("\nPress Enter to exit...")


if __name__ == "__main__":
    main()

355
WalkFilesOnBackupHDD/20 WalkBackupHDD.py
Normal file
@@ -0,0 +1,355 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

r"""
FAST FILE HASH INDEXER – WINDOWS CLIENT (HARDCODED CONFIG)
- Mode: PHYSICAL BACKUP
- Hostname in DB = Disk Label (e.g., #HD015)
- Path in DB = Relative path (e.g., /Movies/Film.mkv)
"""

import os, time
import hashlib  # used by get_path_hash below
import pymysql
import socket
import platform
import sys
from blake3 import blake3


def get_path_hash(path_str: str) -> bytes:
    """Calculates MD5 hash of the path and returns raw 16 bytes for BINARY(16)."""
    return hashlib.md5(path_str.encode('utf-8')).digest()

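# A quick sanity sketch (illustrative only):
#
#     h = get_path_hash("/Movies/Film.mkv")
#     assert len(h) == 16                            # raw MD5 digest fits BINARY(16)
#     assert h == get_path_hash("/Movies/Film.mkv")  # deterministic per path

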
# ==============================
# ⚙️ USER CONFIGURATION
# ==============================
DISK_DRIVE_LETTER = "z"   # (e.g., "E", "F", "P")
DISK_HOSTNAME = "TW22"    # (e.g., "#HD015")

# 🔒 SAFETY SWITCH
# True  = LIST ONLY (No DB changes). "Simulates" the run.
# False = EXECUTE (Deletes and Inserts into DB).
DRY_RUN = False

# ==============================
# TECHNICAL CONFIG
# ==============================
CHUNK_SIZE = 5 * 1024 * 1024             # 5 MB
PROGRESS_MIN_SIZE = 500 * 1024 * 1024    # 500 MB
PROGRESS_INTERVAL = 1.0                  # seconds

EXCLUDED_DIRS = {"$RECYCLE.BIN", "System Volume Information", "RECYCLER", "msdownld.tmp"}

# --- File Size Limits ---
FILE_MIN_SIZE = 0
FILE_MAX_SIZE = 1024 * 1024 * 1024 * 1024  # 1 TB

# --- DB Config ---
DB_CONFIG = {
    "host": "192.168.1.76",
    "port": 3307,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": True,
}

PRINT_SKIPPED = False  # Set True to see files that were already in DB

# ==============================
# SYSTEM INFO
# ==============================
REAL_PC_HOSTNAME = socket.gethostname()
OS_NAME = platform.system()


# ==============================
# FUNCTIONS
# ==============================

def compute_blake3(path: str) -> bytes:
    h = blake3()
    total_size = os.path.getsize(path)
    show_progress = total_size >= PROGRESS_MIN_SIZE

    processed = 0
    start_time = time.time()
    last_report = start_time

    try:
        with open(path, "rb") as f:
            while True:
                chunk = f.read(CHUNK_SIZE)
                if not chunk:
                    break

                h.update(chunk)
                processed += len(chunk)

                if show_progress:
                    now = time.time()
                    if now - last_report >= PROGRESS_INTERVAL:
                        elapsed = now - start_time
                        speed = processed / elapsed if elapsed > 0 else 0
                        percent = processed / total_size * 100
                        remaining = total_size - processed
                        eta = remaining / speed if speed > 0 else 0

                        print(
                            f" ⏳ {percent:6.2f}% | "
                            f"{processed / 1024 / 1024:8.1f} / {total_size / 1024 / 1024:.1f} MB | "
                            f"{speed / 1024 / 1024:6.1f} MB/s | "
                            f"ETA {time.strftime('%H:%M:%S', time.gmtime(eta))}",
                            flush=True
                        )
                        last_report = now

        if show_progress:
            total_time = time.time() - start_time
            avg_speed = total_size / total_time if total_time > 0 else 0
            print(
                f" ✅ DONE | "
                f"{total_size / 1024 / 1024:.1f} MB | "
                f"avg {avg_speed / 1024 / 1024:.1f} MB/s | "
                f"time {time.strftime('%H:%M:%S', time.gmtime(total_time))}",
                flush=True
            )

        return h.digest()

    except Exception as e:
        print(f"⚠️ HASH ERROR: {path} - {e}")
        raise

def size_allowed(size: int) -> bool:
    if FILE_MIN_SIZE is not None and size < FILE_MIN_SIZE:
        return False
    if FILE_MAX_SIZE is not None and size > FILE_MAX_SIZE:
        return False
    return True

def normalize_db_path(scan_root, disk_path):
    r"""
    Converts a physical Windows path to the standardized DB format.
    E:\Movies\File.mkv -> /Movies/File.mkv
    """
    try:
        rel_path = os.path.relpath(disk_path, scan_root)
    except ValueError:
        return None

    # Windows backslash to slash
    clean_path = rel_path.replace("\\", "/")

    # Ensure leading slash
    if not clean_path.startswith("/"):
        clean_path = "/" + clean_path

    return clean_path

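# Illustrative examples of the mapping (paths hypothetical):
#
#     normalize_db_path("E:\\", "E:\\Movies\\Avatar.mkv")  -> "/Movies/Avatar.mkv"
#     normalize_db_path("E:\\", "E:\\file.iso")            -> "/file.iso"
#
# os.path.relpath raises ValueError on Windows when the two paths live on
# different drives, which is why the function returns None in that case.

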
# ==============================
# MAIN
# ==============================

def main():
    print("🚀 BLAKE3 External Disk Indexer", flush=True)
    print(f"🖥 Running on PC: {REAL_PC_HOSTNAME}", flush=True)

    if DRY_RUN:
        print("🛡️ DRY RUN MODE ACTIVE: No changes will be made to DB.", flush=True)
    else:
        print("⚠️ LIVE MODE: Changes WILL be committed to DB.", flush=True)

    # Build root path
    scan_root = f"{DISK_DRIVE_LETTER}:\\"

    if not os.path.isdir(scan_root):
        print(f"❌ ERROR: Drive '{scan_root}' not found!")
        print("   Please check DISK_DRIVE_LETTER in config.")
        return

    print("✅ Config:")
    print(f"   Source (Win) : {scan_root}")
    print(f"   DB Hostname  : {DISK_HOSTNAME}")

    try:
        db = pymysql.connect(**DB_CONFIG)
        cur = db.cursor()
    except Exception as e:
        print(f"❌ DB Connection failed: {e}")
        return

    print(f"📥 Loading DB index for: '{DISK_HOSTNAME}'...", flush=True)

    # === LOAD EXISTING DB RECORDS ===
    cur.execute("""
        SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
        FROM file_md5_index
        WHERE host_name = %s
    """, (DISK_HOSTNAME,))

    indexed_map = {row[0]: (row[1], row[2]) for row in cur.fetchall()}

    print(f"✅ Found {len(indexed_map):,} files in DB for this disk.", flush=True)

    # =========================================================
    # PHASE 1: CLEANUP (DELETE MISSING FILES)
    # =========================================================
    print("======================================", flush=True)
    print("🧹 PHASE 1: Checking for deleted files...", flush=True)

    current_disk_paths = set()

    # Fast walk just to get paths
    for root, dirs, files in os.walk(scan_root):
        dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]

        for fname in files:
            disk_path = os.path.join(root, fname)
            clean_path = normalize_db_path(scan_root, disk_path)
            if clean_path:
                current_disk_paths.add(clean_path)

    paths_to_delete = set(indexed_map.keys()) - current_disk_paths

    if paths_to_delete:
        print(f"🗑️ Found {len(paths_to_delete):,} files to delete from DB.")

        if DRY_RUN:
            print("🛡️ [DRY RUN] Listing files to be deleted (No action taken):")
            for p in sorted(list(paths_to_delete))[:20]:  # Print first 20
                print(f"   - {p}")
            if len(paths_to_delete) > 20:
                print(f"   ... and {len(paths_to_delete) - 20} more.")
        else:
            # Delete in batches
            batch_size = 1000
            to_delete_list = list(paths_to_delete)

            for i in range(0, len(to_delete_list), batch_size):
                batch = to_delete_list[i: i + batch_size]
                format_strings = ','.join(['%s'] * len(batch))

                query = f"DELETE FROM file_md5_index WHERE host_name = %s AND full_path IN ({format_strings})"

                try:
                    cur.execute(query, [DISK_HOSTNAME] + batch)
                    print(f"   ... deleted batch {i}-{i + len(batch)}")
                except Exception as e:
                    print(f"❌ Error deleting batch: {e}")

            # Update local map
            for p in paths_to_delete:
                del indexed_map[p]
            print("✅ Cleanup complete.")
    else:
        print("✅ No deleted files detected.")

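    # Note on the batched DELETE above (a sketch of what actually gets sent):
    # for a batch of three paths the query expands to
    #     DELETE FROM file_md5_index
    #     WHERE host_name = %s AND full_path IN (%s,%s,%s)
    # with parameters [DISK_HOSTNAME, p1, p2, p3], so every value still goes
    # through the driver's parameter escaping.
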
    # =========================================================
    # PHASE 2: SCAN & UPDATE (HASHING)
    # =========================================================
    print("======================================", flush=True)
    print("🚀 PHASE 2: Scanning for changes & new files...", flush=True)

    new_files = 0
    skipped = 0
    filtered = 0
    errors = 0
    seen_paths = set()

    for root, dirs, files in os.walk(scan_root):
        dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]

        for fname in files:
            disk_path = os.path.join(root, fname)

            try:
                stat = os.stat(disk_path)
            except OSError:
                errors += 1
                continue

            size = stat.st_size
            if not size_allowed(size):
                filtered += 1
                continue

            clean_path = normalize_db_path(scan_root, disk_path)
            if not clean_path:
                errors += 1
                continue

            if clean_path in seen_paths:
                continue
            seen_paths.add(clean_path)

            mtime = int(stat.st_mtime)

            # === MATCH CHECK ===
            is_match = False
            if clean_path in indexed_map:
                db_size, db_mtime = indexed_map[clean_path]
                if size == db_size and mtime == db_mtime:
                    is_match = True

            if is_match:
                skipped += 1
                if PRINT_SKIPPED:
                    print(f"⏭ SKIP {clean_path}", flush=True)
                continue

            # === INSERT / UPDATE ===
            print("➕ NEW / UPDATED", flush=True)
            print(f"   File: {clean_path}", flush=True)
            print(f"   Size: {size:,} B", flush=True)

            try:
                b3 = compute_blake3(disk_path)
            except Exception:
                errors += 1
                continue

            if DRY_RUN:
                print(f"🛡️ [DRY RUN] Would INSERT/UPDATE: {clean_path}")
                print(f"   Hash: {b3.hex()}")
                new_files += 1
            else:
                cur.execute("""
                    INSERT INTO file_md5_index
                        (os_name, host_name, full_path, file_name, directory,
                         file_size, mtime, blake3)
                    VALUES (%s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
                    ON DUPLICATE KEY UPDATE
                        file_size  = VALUES(file_size),
                        mtime      = VALUES(mtime),
                        blake3     = VALUES(blake3),
                        updated_at = CURRENT_TIMESTAMP
                """, (
                    OS_NAME,
                    DISK_HOSTNAME,
                    clean_path,
                    fname,
                    os.path.dirname(clean_path),
                    size,
                    mtime,
                    b3,
                ))
                new_files += 1
                print(f"   Hash: {b3.hex()}", flush=True)

            print("--------------------------------------", flush=True)

    print("======================================", flush=True)
    print(f"✅ Processed : {new_files}")
    print(f"⏭ Skipped   : {skipped}")
    print(f"🗑 Deleted   : {len(paths_to_delete)} " + ("(DRY RUN)" if DRY_RUN else ""))
    print(f"⚠️ Errors    : {errors}")
    print("🏁 Done.")

    cur.close()
    db.close()


if __name__ == "__main__":
    main()

313
WalkFilesOnBackupHDD/30 WalkBackupHDD.py
Normal file
@@ -0,0 +1,313 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

r"""
FAST FILE HASH INDEXER – WINDOWS CLIENT (HARDCODED CONFIG)
- Mode: PHYSICAL BACKUP
- Hostname in DB = Disk Label (e.g., #HD015)
- Path in DB = Relative path (e.g., /Movies/Film.mkv)
"""

import os
import time
import pymysql
import socket
import platform
import sys
import hashlib
from blake3 import blake3

# ==============================
# ⚙️ USER CONFIGURATION
# ==============================
DISK_DRIVE_LETTER = "z"   # (e.g., "E", "F", "P")
DISK_HOSTNAME = "TW22"    # (e.g., "#HD015")

# 🔒 SAFETY SWITCH
DRY_RUN = False

# ==============================
# TECHNICAL CONFIG
# ==============================
CHUNK_SIZE = 5 * 1024 * 1024             # 5 MB
PROGRESS_MIN_SIZE = 500 * 1024 * 1024    # 500 MB
PROGRESS_INTERVAL = 1.0                  # seconds

EXCLUDED_DIRS = {"$RECYCLE.BIN", "System Volume Information", "RECYCLER", "msdownld.tmp"}

# --- File Size Limits ---
FILE_MIN_SIZE = 0
FILE_MAX_SIZE = 1024 * 1024 * 1024 * 1024  # 1 TB

# --- DB Config ---
DB_CONFIG = {
    "host": "192.168.1.50",
    "port": 3306,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": True,
}

PRINT_SKIPPED = False  # Set True to see files that were already in DB

# ==============================
# SYSTEM INFO
# ==============================
REAL_PC_HOSTNAME = socket.gethostname()
OS_NAME = platform.system()


# ==============================
# FUNCTIONS
# ==============================

def get_path_hash(path_str: str) -> bytes:
    """Calculates MD5 hash of the path and returns raw 16 bytes for BINARY(16)."""
    return hashlib.md5(path_str.encode('utf-8')).digest()

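# The scripts in this folder assume a MySQL table shaped roughly like the
# sketch below. This DDL is inferred from the columns referenced across the
# scripts and is NOT the authoritative schema; the column types are
# assumptions (only BINARY(16) for path_hash and UNIQUE(host_name, path_hash)
# are stated elsewhere in this folder).
#
#     CREATE TABLE file_md5_index (
#         id         BIGINT AUTO_INCREMENT PRIMARY KEY,
#         os_name    VARCHAR(32),
#         host_name  VARCHAR(64),   -- PC name or disk label (e.g. #HD015)
#         full_path  TEXT,          -- canonical /Folder/File path
#         path_hash  BINARY(16),    -- raw MD5 of full_path
#         file_name  VARCHAR(255),
#         directory  TEXT,
#         file_size  BIGINT,
#         mtime      DATETIME,
#         blake3     BINARY(32),    -- raw BLAKE3 digest
#         updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
#         UNIQUE KEY uq_host_path (host_name, path_hash)
#     );

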
def compute_blake3(path: str) -> bytes:
    h = blake3()
    total_size = os.path.getsize(path)
    show_progress = total_size >= PROGRESS_MIN_SIZE

    processed = 0
    start_time = time.time()
    last_report = start_time

    try:
        with open(path, "rb") as f:
            while True:
                chunk = f.read(CHUNK_SIZE)
                if not chunk:
                    break

                h.update(chunk)
                processed += len(chunk)

                if show_progress:
                    now = time.time()
                    if now - last_report >= PROGRESS_INTERVAL:
                        elapsed = now - start_time
                        speed = processed / elapsed if elapsed > 0 else 0
                        percent = processed / total_size * 100
                        remaining = total_size - processed
                        eta = remaining / speed if speed > 0 else 0

                        print(
                            f" ⏳ {percent:6.2f}% | "
                            f"{processed / 1024 / 1024:8.1f} / {total_size / 1024 / 1024:.1f} MB | "
                            f"{speed / 1024 / 1024:6.1f} MB/s | "
                            f"ETA {time.strftime('%H:%M:%S', time.gmtime(eta))}",
                            flush=True
                        )
                        last_report = now

        if show_progress:
            total_time = time.time() - start_time
            avg_speed = total_size / total_time if total_time > 0 else 0
            print(
                f" ✅ DONE | "
                f"{total_size / 1024 / 1024:.1f} MB | "
                f"avg {avg_speed / 1024 / 1024:.1f} MB/s | "
                f"time {time.strftime('%H:%M:%S', time.gmtime(total_time))}",
                flush=True
            )

        return h.digest()

    except Exception as e:
        print(f"⚠️ HASH ERROR: {path} - {e}")
        raise


def size_allowed(size: int) -> bool:
    if FILE_MIN_SIZE is not None and size < FILE_MIN_SIZE:
        return False
    if FILE_MAX_SIZE is not None and size > FILE_MAX_SIZE:
        return False
    return True


def normalize_db_path(scan_root, disk_path):
    r"""
    Converts a physical Windows path to the standardized DB format.
    E:\Movies\File.mkv -> /Movies/File.mkv
    """
    try:
        rel_path = os.path.relpath(disk_path, scan_root)
    except ValueError:
        return None

    clean_path = rel_path.replace("\\", "/")
    if not clean_path.startswith("/"):
        clean_path = "/" + clean_path

    return clean_path

# ==============================
# MAIN
# ==============================

def main():
    print("🚀 BLAKE3 External Disk Indexer (MySQL 9 Compatible)", flush=True)
    print(f"🖥 Running on PC: {REAL_PC_HOSTNAME}", flush=True)

    if DRY_RUN:
        print("🛡️ DRY RUN MODE ACTIVE: No changes will be made to DB.", flush=True)
    else:
        print("⚠️ LIVE MODE: Changes WILL be committed to DB.", flush=True)

    scan_root = f"{DISK_DRIVE_LETTER}:\\"

    if not os.path.isdir(scan_root):
        print(f"❌ ERROR: Drive '{scan_root}' not found!")
        return

    try:
        db = pymysql.connect(**DB_CONFIG)
        cur = db.cursor()
    except Exception as e:
        print(f"❌ DB Connection failed: {e}")
        return

    print(f"📥 Loading DB index for: '{DISK_HOSTNAME}'...", flush=True)

    # === LOAD EXISTING DB RECORDS ===
    # Rows are keyed by full_path; path_hash is recomputed locally for deletion
    cur.execute("""
        SELECT full_path, file_size, UNIX_TIMESTAMP(mtime)
        FROM file_md5_index
        WHERE host_name = %s
    """, (DISK_HOSTNAME,))

    indexed_map = {row[0]: (row[1], row[2]) for row in cur.fetchall()}
    print(f"✅ Found {len(indexed_map):,} files in DB for this disk.", flush=True)

    # =========================================================
    # PHASE 1: CLEANUP (DELETE MISSING FILES)
    # =========================================================
    print("======================================", flush=True)
    print("🧹 PHASE 1: Checking for deleted files...", flush=True)

    current_disk_paths = set()
    for root, dirs, files in os.walk(scan_root):
        dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]
        for fname in files:
            disk_path = os.path.join(root, fname)
            clean_path = normalize_db_path(scan_root, disk_path)
            if clean_path:
                current_disk_paths.add(clean_path)

    paths_to_delete = set(indexed_map.keys()) - current_disk_paths

    if paths_to_delete:
        print(f"🗑️ Found {len(paths_to_delete):,} files to delete from DB.")
        if DRY_RUN:
            for p in sorted(list(paths_to_delete))[:20]:
                print(f"   - {p}")
        else:
            # Delete using path_hash for index efficiency
            batch_size = 500
            to_delete_list = list(paths_to_delete)
            for i in range(0, len(to_delete_list), batch_size):
                batch_paths = to_delete_list[i: i + batch_size]
                # Map paths to their MD5 hashes
                batch_hashes = [get_path_hash(p) for p in batch_paths]

                format_strings = ','.join(['%s'] * len(batch_hashes))
                query = f"DELETE FROM file_md5_index WHERE host_name = %s AND path_hash IN ({format_strings})"

                try:
                    cur.execute(query, [DISK_HOSTNAME] + batch_hashes)
                except Exception as e:
                    print(f"❌ Error deleting batch: {e}")

            for p in paths_to_delete:
                del indexed_map[p]
            print("✅ Cleanup complete.")
    else:
        print("✅ No deleted files detected.")

    # =========================================================
    # PHASE 2: SCAN & UPDATE (HASHING)
    # =========================================================
    print("======================================", flush=True)
    print("🚀 PHASE 2: Scanning for changes & new files...", flush=True)

    new_files = 0
    skipped = 0
    errors = 0
    seen_paths = set()

    for root, dirs, files in os.walk(scan_root):
        dirs[:] = [d for d in dirs if d not in EXCLUDED_DIRS]
        for fname in files:
            disk_path = os.path.join(root, fname)
            try:
                stat = os.stat(disk_path)
            except OSError:
                errors += 1
                continue

            size = stat.st_size
            if not size_allowed(size):
                continue

            clean_path = normalize_db_path(scan_root, disk_path)
            if not clean_path or clean_path in seen_paths:
                continue
            seen_paths.add(clean_path)

            mtime = int(stat.st_mtime)

            # Match Check
            if clean_path in indexed_map:
                db_size, db_mtime = indexed_map[clean_path]
                if size == db_size and mtime == db_mtime:
                    skipped += 1
                    continue

            # Compute Hashes
            try:
                b3_hash = compute_blake3(disk_path)
                p_hash = get_path_hash(clean_path)  # Essential for MySQL 9 Unique Index
            except Exception:
                errors += 1
                continue

            if DRY_RUN:
                print(f"🛡️ [DRY RUN] NEW/UPDATE: {clean_path}")
                new_files += 1
            else:
                cur.execute("""
                    INSERT INTO file_md5_index
                        (os_name, host_name, full_path, path_hash, file_name, directory,
                         file_size, mtime, blake3)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, FROM_UNIXTIME(%s), %s)
                    ON DUPLICATE KEY UPDATE
                        file_size  = VALUES(file_size),
                        mtime      = VALUES(mtime),
                        blake3     = VALUES(blake3),
                        updated_at = CURRENT_TIMESTAMP
                """, (
                    OS_NAME, DISK_HOSTNAME, clean_path, p_hash, fname,
                    os.path.dirname(clean_path), size, mtime, b3_hash
                ))
                new_files += 1
                print(f"➕ ADDED: {clean_path} | {b3_hash.hex()[:8]}...")

    print("======================================", flush=True)
    print(f"✅ Processed : {new_files}")
    print(f"⏭ Skipped   : {skipped}")
    print(f"🗑 Deleted   : {len(paths_to_delete)}")
    print(f"⚠️ Errors    : {errors}")
    print("🏁 Done.")

    cur.close()
    db.close()


if __name__ == "__main__":
    main()

158
WalkFilesOnBackupHDD/40 TestPathNormalizedinTable.py
Normal file
@@ -0,0 +1,158 @@
#!/opt/bin/python3
# -*- coding: utf-8 -*-

import pymysql
import hashlib
import posixpath
import unicodedata
from binascii import hexlify

# ============================================================
# CONFIG
# ============================================================

DB_CONFIG = {
    "host": "192.168.1.50",
    "port": 3306,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
}

HOST_FILTER = "tower1"   # None = all hosts
LIMIT = None             # e.g. 50000 for testing
SHOW_EXAMPLES = 20

# ============================================================
# CANONICAL PATH
# ============================================================

def canonical_path(path_str: str) -> str:
    if not path_str:
        return path_str

    path_str = path_str.replace("\\", "/")
    path_str = posixpath.normpath(path_str)
    path_str = unicodedata.normalize("NFC", path_str)

    return path_str


def md5_bytes(path_str: str) -> bytes:
    return hashlib.md5(path_str.encode("utf-8")).digest()

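# Illustrative behavior of canonical_path (examples, not exhaustive):
#
#     canonical_path("\\Movies\\Film.mkv")  -> "/Movies/Film.mkv"  # slashes unified
#     canonical_path("/Movies//a/../Film")  -> "/Movies/Film"      # normpath collapses
#     canonical_path("Cafe\u0301")          -> "Café"              # NFC composes accents
#
# Because normpath can merge distinct stored strings into one canonical form,
# two DB rows may map to the same canonical path; the audit below only counts
# such changes and never rewrites anything.

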
# ============================================================
# MAIN
# ============================================================

def main():

    db = pymysql.connect(**DB_CONFIG)
    cur = db.cursor(pymysql.cursors.SSCursor)

    sql = """
        SELECT id, full_path, path_hash
        FROM file_md5_index
    """

    params = []

    if HOST_FILTER:
        sql += " WHERE host_name = %s"
        params.append(HOST_FILTER)

    if LIMIT:
        sql += " LIMIT %s"
        params.append(LIMIT)

    cur.execute(sql, params)

    total = 0
    ok = 0
    path_change = 0
    hash_change = 0

    examples_path = []
    examples_hash = []

    for rec_id, full_path, stored_hash in cur:

        total += 1

        canonical = canonical_path(full_path)

        raw_hash = md5_bytes(full_path)
        canonical_hash = md5_bytes(canonical)

        # ---------------------------------------------------
        # CASE 1: fully OK
        # ---------------------------------------------------
        if full_path == canonical and stored_hash == canonical_hash:
            ok += 1

        # ---------------------------------------------------
        # CASE 2: path string would change
        # ---------------------------------------------------
        if full_path != canonical:
            path_change += 1

            if len(examples_path) < SHOW_EXAMPLES:
                examples_path.append((rec_id, full_path, canonical))

        # ---------------------------------------------------
        # CASE 3: hash would change
        # ---------------------------------------------------
        if stored_hash != canonical_hash:
            hash_change += 1

            if len(examples_hash) < SHOW_EXAMPLES:
                examples_hash.append(
                    (rec_id, full_path,
                     hexlify(stored_hash).decode(),
                     hexlify(canonical_hash).decode())
                )

        if total % 100000 == 0:
            print(f"Checked {total:,} rows...")

    # ============================================================
    # REPORT
    # ============================================================

    print("\n" + "=" * 70)
    print("AUDIT SUMMARY")
    print("=" * 70)

    print(f"Total rows checked               : {total:,}")
    print(f"OK (already canonical + hash OK) : {ok:,}")
    print(f"Paths that would change          : {path_change:,}")
    print(f"Hashes that would change         : {hash_change:,}")

    print("=" * 70)

    # ------------------------------------------------------------
    # SHOW EXAMPLES
    # ------------------------------------------------------------

    if examples_path:
        print("\n⚠ PATH CHANGE EXAMPLES:")
        for rec_id, old, new in examples_path:
            print(f"[id={rec_id}]")
            print("  DB :", old)
            print("  NEW:", new)
            print()

    if examples_hash:
        print("\n❌ HASH CHANGE EXAMPLES:")
        for rec_id, path, old_hash, new_hash in examples_hash:
            print(f"[id={rec_id}] {path}")
            print("  Stored :", old_hash)
            print("  New    :", new_hash)
            print()

    cur.close()
    db.close()


if __name__ == "__main__":
    main()

188
WalkFilesOnBackupHDD/50 Onetimepathnormalization.py
Normal file
@@ -0,0 +1,188 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
ONE-TIME MIGRATION: Normalize full_path (NFC, forward slashes) + recompute path_hash
- Targets ONLY one host_name (HOST_TO_FIX, "Tower" by default)
- Safe with UNIQUE(host_name, path_hash)
- Handles collisions by skipping conflicting rows and logging them
- DRY_RUN supported
"""

import sys
import time
import hashlib
import posixpath
import unicodedata
import pymysql
from pymysql.err import IntegrityError

# =========================
# CONFIG
# =========================
HOST_TO_FIX = "Tower"        # <-- set your Unraid host_name exactly as stored in DB
DRY_RUN = True               # <-- first run True; then switch to False to apply
BATCH_SELECT_FETCH = 5000    # server-side cursor fetch size (streaming)
COMMIT_EVERY = 2000          # commit after N successful updates (when DRY_RUN=False)
LOG_EVERY = 50000            # progress print

DB_CONFIG = {
    "host": "192.168.1.50",
    "port": 3306,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": False,
}

# =========================
# CANONICALIZATION
# =========================
def canonical_path(path_str: str) -> str:
    if not path_str:
        return path_str
    path_str = path_str.replace("\\", "/")
    path_str = posixpath.normpath(path_str)
    path_str = unicodedata.normalize("NFC", path_str)
    return path_str


def md5_bytes(path_str: str) -> bytes:
    return hashlib.md5(path_str.encode("utf-8")).digest()  # 16 raw bytes for BINARY(16)

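# Why collisions can happen (an illustrative case): two stored rows such as
#
#     "/mnt/user/Media//Film.mkv"   and   "/mnt/user/Media/Film.mkv"
#
# both canonicalize to "/mnt/user/Media/Film.mkv", so after recomputing
# path_hash they would violate UNIQUE(host_name, path_hash). The migration
# below catches that as an IntegrityError and skips the conflicting row.

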
# =========================
# MAIN
# =========================
def main():
    print(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] 🚀 Tower path_hash migration")
    print(f"Host: {HOST_TO_FIX}")
    print(f"DRY_RUN: {DRY_RUN}")
    sys.stdout.flush()

    db = pymysql.connect(**DB_CONFIG)

    # streaming cursor for reading
    read_cur = db.cursor(pymysql.cursors.SSCursor)
    read_cur.execute(
        """
        SELECT id, full_path, path_hash
        FROM file_md5_index
        WHERE host_name = %s
        """,
        (HOST_TO_FIX,),
    )

    # normal cursor for updates
    upd_cur = db.cursor()

    total = 0
    needs_change = 0
    updated_ok = 0
    collisions = 0
    other_errors = 0

    start = time.time()
    pending_commits = 0

    # Optional: make the server-side cursor fetch a bit larger
    # (PyMySQL streams regardless; this just makes the loop smoother)
    # Not strictly necessary.

    while True:
        rows = read_cur.fetchmany(BATCH_SELECT_FETCH)
        if not rows:
            break

        for rec_id, full_path, stored_hash in rows:
            total += 1

            new_path = canonical_path(full_path)
            new_hash = md5_bytes(new_path)

            # already canonical & correct
            if new_path == full_path and new_hash == stored_hash:
                if total % LOG_EVERY == 0:
                    elapsed = time.time() - start
                    print(f"Checked {total:,} | needs_change {needs_change:,} | updated {updated_ok:,} | collisions {collisions:,} | {elapsed:.1f}s")
                    sys.stdout.flush()
                continue

            needs_change += 1

            if DRY_RUN:
                # in dry-run we just count; no DB writes
                continue

            # Update with collision handling via UNIQUE(host_name, path_hash)
            try:
                # Use a savepoint so a duplicate-key error doesn't kill the whole transaction
                upd_cur.execute("SAVEPOINT sp_one;")

                upd_cur.execute(
                    """
                    UPDATE file_md5_index
                    SET full_path = %s,
                        path_hash = %s
                    WHERE id = %s
                    """,
                    (new_path, new_hash, rec_id),
                )

                upd_cur.execute("RELEASE SAVEPOINT sp_one;")

                updated_ok += 1
                pending_commits += 1

                if pending_commits >= COMMIT_EVERY:
                    db.commit()
                    pending_commits = 0

            except IntegrityError as e:
                # Duplicate key = collision on (host_name, path_hash).
                # This means some OTHER row in the same host already has this new_hash.
                upd_cur.execute("ROLLBACK TO SAVEPOINT sp_one;")
                upd_cur.execute("RELEASE SAVEPOINT sp_one;")
                collisions += 1

                # Print a short line occasionally (avoid huge spam)
                if collisions <= 50 or collisions % 1000 == 0:
                    print(f"⚠ COLLISION id={rec_id} | {e}")
                    sys.stdout.flush()

            except Exception as e:
                upd_cur.execute("ROLLBACK TO SAVEPOINT sp_one;")
                upd_cur.execute("RELEASE SAVEPOINT sp_one;")
                other_errors += 1
                if other_errors <= 50 or other_errors % 1000 == 0:
                    print(f"❌ ERROR id={rec_id} | {e}")
                    sys.stdout.flush()

            if total % LOG_EVERY == 0:
                elapsed = time.time() - start
                print(f"Checked {total:,} | needs_change {needs_change:,} | updated {updated_ok:,} | collisions {collisions:,} | {elapsed:.1f}s")
                sys.stdout.flush()

    # finalize
    if not DRY_RUN:
        if pending_commits:
            db.commit()
        print("✅ Migration finished (committed).")
    else:
        print("⚠ DRY_RUN finished (no changes written).")

    elapsed = time.time() - start
    print("=" * 70)
    print(f"Total rows checked   : {total:,}")
    print(f"Rows needing change  : {needs_change:,}")
    print(f"Rows updated         : {updated_ok:,}")
    print(f"Collisions (skipped) : {collisions:,}")
    print(f"Other errors         : {other_errors:,}")
    print(f"Elapsed              : {elapsed:.1f}s")
    print("=" * 70)

    read_cur.close()
    upd_cur.close()
    db.close()


if __name__ == "__main__":
    main()

146
WalkFilesOnBackupHDD/51 testthoseneedchangewhetherok.py
Normal file
@@ -0,0 +1,146 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import hashlib
import posixpath
import unicodedata
import pymysql
import time

# =========================
# CONFIG
# =========================

HOST_TO_CHECK = "Tower"
WINDOWS_UNC_BASE = r"\\tower"

DB_CONFIG = {
    "host": "192.168.1.50",
    "port": 3306,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
}

PRINT_FIRST_CHANGES = 20
LOG_EVERY = 5000

# =========================
# CANONICAL
# =========================

def canonical_path(path_str):
    path_str = path_str.replace("\\", "/")
    path_str = posixpath.normpath(path_str)
    path_str = unicodedata.normalize("NFC", path_str)
    return path_str


def md5_bytes(path_str):
    return hashlib.md5(path_str.encode("utf-8")).digest()


# =========================
# PATH MAP
# =========================

def linux_to_windows_unc(linux_path):
    rel = linux_path[len("/mnt/user/"):]
    return os.path.join(WINDOWS_UNC_BASE, *rel.split("/"))

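# Illustrative mapping (this assumes every DB path starts with "/mnt/user/";
# paths that do not will be sliced incorrectly):
#
#     linux_to_windows_unc("/mnt/user/Media/Film.mkv")
#         -> os.path.join(r"\\tower", "Media", "Film.mkv")
#         -> "\\\\tower\\Media\\Film.mkv" when run on Windows

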
# =========================
# MAIN
# =========================

def main():

    print("=" * 70)
    print("🔍 Tower Canonical Path SMB Verification")
    print(f"Host: {HOST_TO_CHECK}")
    print(f"UNC Base: {WINDOWS_UNC_BASE}")
    print("=" * 70)

    db = pymysql.connect(**DB_CONFIG)
    cur = db.cursor(pymysql.cursors.SSCursor)

    cur.execute("""
        SELECT id, full_path, path_hash
        FROM file_md5_index
        WHERE host_name = %s
    """, (HOST_TO_CHECK,))

    total = 0
    needs_change = 0
    exists_ok = 0
    missing = 0

    printed_changes = 0

    start = time.time()

    for rec_id, full_path, stored_hash in cur:

        total += 1

        new_path = canonical_path(full_path)
        new_hash = md5_bytes(new_path)

        # Already canonical
        if new_path == full_path and new_hash == stored_hash:
            continue

        needs_change += 1

        win_path = linux_to_windows_unc(new_path)
        exists = os.path.exists(win_path)

        if exists:
            exists_ok += 1
        else:
            missing += 1

        # ---- Print first examples ----
        if printed_changes < PRINT_FIRST_CHANGES:
            print("\n🔧 CHANGE DETECTED")
            print(f"ID       : {rec_id}")
            print(f"DB PATH  : {full_path}")
            print(f"NEW PATH : {new_path}")
            print(f"WIN PATH : {win_path}")
            print(f"Exists   : {exists}")
            printed_changes += 1

        # ---- Progress ----
        if total % LOG_EVERY == 0:
            elapsed = time.time() - start
            rate = total / elapsed if elapsed else 0

            print(
                f"📊 Checked {total:,} rows | "
                f"Needs change {needs_change:,} | "
                f"Exists {exists_ok:,} | "
                f"Missing {missing:,} | "
                f"{rate:,.0f} rows/sec"
            )

    # =========================
    # SUMMARY
    # =========================

    elapsed = time.time() - start

    print("\n" + "=" * 70)
    print("✅ FINAL SUMMARY")
    print("=" * 70)
    print(f"Total scanned    : {total:,}")
    print(f"Needs change     : {needs_change:,}")
    print(f"Exists on Tower  : {exists_ok:,}")
    print(f"Missing on Tower : {missing:,}")
    print(f"Runtime          : {elapsed:.1f}s")
    print("=" * 70)

    cur.close()
    db.close()


if __name__ == "__main__":
    main()

139
WalkFilesOnBackupHDD/53 towerpathcorrection.py
Normal file
@@ -0,0 +1,139 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
TOWER PATH NORMALIZATION MIGRATION
----------------------------------
✔ Normalizes full_path → NFC canonical
✔ Recalculates path_hash
✔ Uses two DB connections (streaming safe)
✔ Idempotent (safe to rerun)
✔ Production safe
"""

import pymysql
import hashlib
import posixpath
import unicodedata
import time

# =========================
# CONFIG
# =========================

HOST_TO_FIX = "tower1"
BATCH_FETCH = 5000
COMMIT_EVERY = 2000

DB_CONFIG = {
    "host": "192.168.1.50",
    "port": 3306,
    "user": "root",
    "password": "Vlado9674+",
    "database": "torrents",
    "charset": "utf8mb4",
    "autocommit": False,
}

# =========================
# CANONICALIZATION
# =========================

def canonical_path(path_str: str) -> str:
    path_str = path_str.replace("\\", "/")
    path_str = posixpath.normpath(path_str)
    path_str = unicodedata.normalize("NFC", path_str)
    return path_str


def md5_bytes(path_str: str) -> bytes:
    return hashlib.md5(path_str.encode("utf-8")).digest()

# =========================
# MAIN
# =========================

def main():

    print("=" * 70)
    print("🚀 TOWER PATH NORMALIZATION MIGRATION")
    print(f"Host: {HOST_TO_FIX}")
    print("=" * 70)

    start = time.time()

    # --- TWO CONNECTIONS ---
    db_read = pymysql.connect(**DB_CONFIG)
    db_write = pymysql.connect(**DB_CONFIG)

    read_cur = db_read.cursor(pymysql.cursors.SSCursor)
    write_cur = db_write.cursor()

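    # Why two connections (a design note, inferred from the docstring above):
    # the SSCursor streams rows without buffering them client-side, and a
    # single PyMySQL connection cannot safely run UPDATEs while a streaming
    # result set is still open. Reading on db_read and writing on db_write
    # keeps the long SELECT and the batched commits independent.
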
    read_cur.execute("""
        SELECT id, full_path, path_hash
        FROM file_md5_index
        WHERE host_name = %s
    """, (HOST_TO_FIX,))

    total = 0
    updated = 0
    skipped = 0
    pending_commit = 0

    while True:

        rows = read_cur.fetchmany(BATCH_FETCH)

        if not rows:
            break

        for rec_id, full_path, stored_hash in rows:

            total += 1

            new_path = canonical_path(full_path)
            new_hash = md5_bytes(new_path)

            if new_path == full_path and new_hash == stored_hash:
                skipped += 1
                continue

            write_cur.execute("""
                UPDATE file_md5_index
                SET full_path = %s,
                    path_hash = %s
                WHERE id = %s
            """, (new_path, new_hash, rec_id))

            updated += 1
            pending_commit += 1

            if pending_commit >= COMMIT_EVERY:
                db_write.commit()
                pending_commit = 0

        print(
            f"Checked {total:,} | Updated {updated:,} | Skipped {skipped:,}"
        )

    if pending_commit:
        db_write.commit()

    elapsed = time.time() - start

    print("\n" + "=" * 70)
    print("✅ MIGRATION FINISHED")
    print("=" * 70)
    print(f"Total checked : {total:,}")
    print(f"Rows updated  : {updated:,}")
    print(f"Rows skipped  : {skipped:,}")
    print(f"Runtime       : {elapsed:.1f}s")
    print("=" * 70)

    read_cur.close()
    write_cur.close()
    db_read.close()
    db_write.close()


if __name__ == "__main__":
    main()