348 lines
12 KiB
Python
348 lines
12 KiB
Python
"""
|
|
collect_pictures_windows.py
|
|
---------------------------
|
|
Windows verze zálohovacího skriptu.
|
|
Prochází všechny lokální disky (ne síťové, ne CD), hledá JPG/JPEG,
|
|
kopíruje (deduplikace BLAKE3) na Tower1 přes UNC cestu a zapisuje do PostgreSQL.
|
|
|
|
Cesta zálohy: \\Tower1\ZalohaVsechObrazku\{hostname}\{písmeno_disku}\{cesta}
|
|
Příklad: \\Tower1\ZalohaVsechObrazku\JMENO-PC\D\Foto\2023\img.jpg
|
|
|
|
Bezpečné pro opakované spuštění (pokračuje tam kde skončilo).
|
|
Bezpečné pro souběžný běh na více strojích (ON CONFLICT v SQL).
|
|
|
|
Předpoklad pro ukládání cest do DB:
|
|
Ať už je hostname Windows stroje jakýkoliv, Tower1 (druhý Unraid server)
|
|
je vždy dostupný jako \\Tower1\ZalohaVsechObrazku\...
|
|
Fyzické kopírování probíhá přes tuto UNC cestu, ale do DB se ukládá
|
|
vždy nativní Linux cesta Tower1, tj. /mnt/user/ZalohaVsechObrazku/...
|
|
(\\Tower1 → /mnt/user, zpětná lomítka → dopředná lomítka).
|
|
Tím jsou cesty v DB jednotné bez ohledu na to, který stroj zálohu pořídil.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import time
|
|
import shutil
|
|
import socket
|
|
import string
|
|
import ctypes
|
|
import logging
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
|
|
import blake3
|
|
import psycopg2
|
|
from psycopg2.extras import execute_values
|
|
|
|
# ── Configuration ─────────────────────────────────────────────────────────────
|
|
|
|
# Keyword arguments passed to psycopg2.connect().
#
# Every value can be overridden through an environment variable so that
# credentials do not have to live in the source file. The literals are kept
# only as backward-compatible fallbacks: with no variables set, the resulting
# dictionary is exactly what it was before.
#
# NOTE(security): the fallback password is stored here in plain text. It should
# be rotated and the fallback removed once COLLECT_PICTURES_DB_PASSWORD is
# provisioned on every machine that runs this script.
DB_CONFIG = {
    "host": os.environ.get("COLLECT_PICTURES_DB_HOST", "192.168.1.76"),
    # Environment values are strings; the port is kept as an int as before.
    "port": int(os.environ.get("COLLECT_PICTURES_DB_PORT", "5432")),
    "user": os.environ.get("COLLECT_PICTURES_DB_USER", "vladimir.buzalka"),
    "password": os.environ.get("COLLECT_PICTURES_DB_PASSWORD", "Vlado7309208104++"),
    "database": os.environ.get("COLLECT_PICTURES_DB_NAME", "fotky_buzalkovi"),
}
|
|
|
|
# Root of the backup tree, addressed through the UNC share.
ZALOHA_DIR = Path(r"\\Tower1\ZalohaVsechObrazku")

# Path normalisation for the DB — the Windows UNC path is converted to the
# native Linux path on Tower1.
# \\Tower1\ZalohaVsechObrazku\... → /mnt/user/ZalohaVsechObrazku/...
WINDOWS_UNC_PREFIX = r"\\Tower1"
LINUX_NATIVE_PREFIX = "/mnt/user"
|
|
|
|
|
|
def normalize_path_for_db(path: Path) -> str:
    r"""Convert a Tower1 UNC path to the native Linux path stored in the DB.

    ``\\Tower1\ZalohaVsechObrazku\...`` becomes
    ``/mnt/user/ZalohaVsechObrazku/...``: WINDOWS_UNC_PREFIX is replaced by
    LINUX_NATIVE_PREFIX and the remaining backslashes by forward slashes.

    The prefix is matched case-insensitively and only on a path-component
    boundary, so ``\\tower1\...`` is converted while ``\\Tower10\...`` is not.

    Args:
        path: Path to convert.

    Returns:
        The converted path, or ``str(path)`` unchanged when it does not start
        with the UNC prefix.
    """
    text = str(path)
    prefix_len = len(WINDOWS_UNC_PREFIX)
    head, tail = text[:prefix_len], text[prefix_len:]
    # tail[:1] is "" when the path is exactly the prefix, "\\" when a share or
    # directory follows; anything else means a different, longer host name.
    if head.lower() == WINDOWS_UNC_PREFIX.lower() and tail[:1] in ("", "\\"):
        return LINUX_NATIVE_PREFIX + tail.replace("\\", "/")
    return text
|
|
|
|
# File suffixes (lower-cased before comparison) that are backed up.
JPEG_EXTENSIONS = {".jpg", ".jpeg"}

# Directory names, compared in lower case, that are never descended into.
# The first three are spelling variants of the backup folder's own name.
EXCLUDED_DIR_NAMES_LOWER = {
    "zalohavsechobrazku",
    "zálohavsechobrázku",
    "zálohavsechobrazku",
    "$recycle.bin",
    "system volume information",
    "recovery",
}

# Whole directory trees that are skipped; falls back to C:\Windows when the
# WINDIR environment variable is not set.
EXCLUDED_FULL_PATHS = {
    Path(os.environ.get("WINDIR", r"C:\Windows")),
}

# Number of pending source rows that triggers a bulk insert.
BATCH_SIZE = 500

# The log file is written next to this script.
LOG_FILE = Path(__file__).parent / "collect_pictures_windows.log"
|
|
|
|
# ── Logging ───────────────────────────────────────────────────────────────────
|
|
|
|
# Every record goes both to stdout and to the UTF-8 log file LOG_FILE.
logging.basicConfig(
    handlers=[
        logging.StreamHandler(sys.stdout),
        logging.FileHandler(LOG_FILE, encoding="utf-8"),
    ],
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s %(levelname)-7s %(message)s",
    level=logging.INFO,
)

# Module-level logger used by all functions in this file.
log = logging.getLogger(__name__)
|
|
|
|
# ── SQL ───────────────────────────────────────────────────────────────────────
|
|
|
|
# Inserts one backup row keyed by BLAKE3 hash. When the hash already exists
# the statement does nothing and RETURNING yields no row.
SQL_INSERT_ZALOHA = """
INSERT INTO zaloha_obrazku (blake3_hash, cesta_zalohy, nazev_souboru, velikost)
VALUES (%s, %s, %s, %s)
ON CONFLICT (blake3_hash) DO NOTHING
RETURNING id
"""

# Single-row insert of a source file record.
# NOTE(review): not referenced anywhere in the visible code — process() uses
# an inline bulk variant of this statement instead; confirm before removing.
SQL_INSERT_ZDROJ = """
INSERT INTO zdrojove_soubory (hostname, cesta_zdroje, nazev_souboru, velikost, blake3_hash, zaloha_id)
VALUES (%s, %s, %s, %s, %s, %s)
ON CONFLICT (hostname, cesta_zdroje) DO NOTHING
"""

# Looks up the id of an already stored backup row by its hash.
SQL_GET_ZALOHA_ID = "SELECT id FROM zaloha_obrazku WHERE blake3_hash = %s"
|
|
|
|
# ── Helper functions ──────────────────────────────────────────────────────────
|
|
|
|
def get_local_drives() -> list[Path]:
    """Return the root paths of the drives reported as type 3 (DRIVE_FIXED).

    Bit *i* of the mask returned by ``GetLogicalDrives`` is tested for the
    *i*-th letter of the alphabet; each present drive is then queried with
    ``GetDriveTypeW`` and kept only when the reported type equals 3.

    Returns:
        Drive roots such as ``Path("C:\\\\")`` in alphabetical order.
    """
    kernel32 = ctypes.windll.kernel32
    fixed_drive_type = 3
    present_mask = kernel32.GetLogicalDrives()

    found = []
    for bit, letter in enumerate(string.ascii_uppercase):
        if not present_mask & (1 << bit):
            continue
        root = Path(f"{letter}:\\")
        if kernel32.GetDriveTypeW(str(root)) == fixed_drive_type:
            found.append(root)
    return found
|
|
|
|
|
|
def is_excluded_path(path: Path) -> bool:
    """Tell whether *path* equals or lies below an entry of EXCLUDED_FULL_PATHS."""
    return any(path.is_relative_to(banned) for banned in EXCLUDED_FULL_PATHS)
|
|
|
|
|
|
def compute_blake3(path: Path, chunk: int = 1 << 20) -> str:
    """Return the hexadecimal BLAKE3 digest of the file at *path*.

    The file is read in pieces of *chunk* bytes (1 MiB by default), so it is
    never loaded into memory as a whole.
    """
    hasher = blake3.blake3()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for piece in iter(lambda: stream.read(chunk), b""):
            hasher.update(piece)
    return hasher.hexdigest()
|
|
|
|
|
|
def dest_path_for(source: Path, hostname: str) -> Path:
    """Map a source file to its place in the backup tree.

    The result is ZALOHA_DIR / hostname / <drive without the colon> /
    <path of *source* relative to its drive root>.
    """
    drive = source.drive
    below_root = source.relative_to(drive + "\\")
    return ZALOHA_DIR.joinpath(hostname, drive.rstrip(":"), below_root)
|
|
|
|
|
|
def copy_to_backup(source: Path, dest: Path) -> None:
    """Copy *source* to *dest* without ever leaving a truncated *dest* behind.

    The data is first written to a ``<name>.part`` sibling of *dest* and then
    renamed into place with ``os.replace``. An interrupted run therefore
    leaves at most a ``.part`` file, never a partial file that a later run
    would mistake for a finished backup.

    An existing *dest* is kept only when its size equals the size of
    *source*. A size mismatch is treated as the remnant of an interrupted
    copy and the file is copied again. Contents of same-sized files are not
    compared.

    Args:
        source: File to back up.
        dest: Target path; missing parent directories are created.

    Raises:
        OSError: When reading, writing or renaming fails.
        shutil.Error: As raised by ``shutil.copy2``.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)

    if dest.exists() and dest.stat().st_size == source.stat().st_size:
        return

    partial = dest.with_name(dest.name + ".part")
    try:
        shutil.copy2(source, partial)
        os.replace(partial, dest)
    finally:
        # After a successful replace the temporary file no longer exists and
        # this is a no-op; after a failure it removes the leftover. Cleanup is
        # best effort and must not mask the original exception.
        try:
            partial.unlink(missing_ok=True)
        except OSError:
            pass
|
|
|
|
|
|
def iter_jpeg_files(drives: list[Path]):
    """Yield the path of every JPG/JPEG file found on *drives*.

    Each drive is walked top-down without following links. Directories whose
    lower-cased name is in EXCLUDED_DIR_NAMES_LOWER, or that fall under
    EXCLUDED_FULL_PATHS, are pruned and never entered. Directories that cannot
    be listed are reported as warnings instead of being skipped silently.

    Args:
        drives: Root directories to scan.

    Yields:
        ``Path`` objects of files whose suffix, compared in lower case, is in
        JPEG_EXTENSIONS.
    """

    def report_walk_error(err: OSError) -> None:
        # Without an onerror callback os.walk drops unreadable directories
        # without any trace, so missing pictures could not be explained.
        log.warning(f"CHYBA procházení: {err.filename} → {err}")

    for drive in drives:
        log.info(f"Skenuji disk: {drive}")
        for root, dirs, files in os.walk(drive, onerror=report_walk_error, followlinks=False):
            root_path = Path(root)

            if is_excluded_path(root_path):
                # Emptying dirs in place stops os.walk from descending further.
                dirs.clear()
                continue

            # In-place slice assignment prunes the subdirectories os.walk
            # will visit next.
            dirs[:] = [
                d for d in dirs
                if d.lower() not in EXCLUDED_DIR_NAMES_LOWER
                and not is_excluded_path(root_path / d)
            ]

            for fname in files:
                if Path(fname).suffix.lower() in JPEG_EXTENSIONS:
                    yield root_path / fname
|
|
|
|
|
|
def load_known_sources(conn, hostname: str) -> set[str]:
    """Return all source paths already recorded in the DB for *hostname*.

    The rows are read through the named cursor "src_cursor" with an
    ``itersize`` of 10000.
    """
    query = "SELECT cesta_zdroje FROM zdrojove_soubory WHERE hostname = %s"
    known = set()
    with conn.cursor("src_cursor") as cursor:
        cursor.itersize = 10000
        cursor.execute(query, (hostname,))
        for record in cursor:
            known.add(record[0])
    return known
|
|
|
|
|
|
def load_known_hashes(conn) -> dict[str, int]:
    """Return a mapping of every stored BLAKE3 hash to its backup row id.

    The rows are read through the named cursor "hash_cursor" with an
    ``itersize`` of 10000.
    """
    hash_to_id = {}
    with conn.cursor("hash_cursor") as cursor:
        cursor.itersize = 10000
        cursor.execute("SELECT blake3_hash, id FROM zaloha_obrazku")
        for record in cursor:
            hash_to_id[record[0]] = record[1]
    return hash_to_id
|
|
|
|
# ── Main logic ────────────────────────────────────────────────────────────────
|
|
|
|
def process(conn, hostname, drives):
    """Scan *drives*, back up new JPEG files and record everything in the DB.

    For every file yielded by ``iter_jpeg_files``:

    * a path already recorded for *hostname* is skipped;
    * otherwise the file is hashed; a hash already known counts as a
      duplicate and nothing is copied;
    * a new hash causes the file to be copied to the backup tree and a backup
      row to be inserted and committed immediately;
    * in both non-skipped cases a source row is queued and written in
      batches of BATCH_SIZE.

    Read and copy errors (``OSError``, ``shutil.Error``) are logged, counted
    and the file is skipped. Database errors are not caught here and
    propagate to the caller.

    Args:
        conn: Open database connection (created with ``psycopg2.connect`` in
            ``main``).
        hostname: Name under which source rows and backup paths are stored.
        drives: Root directories to scan.

    Returns:
        Dict with the counters ``nalezeno``, ``kopirovano``, ``duplicit``,
        ``chyb`` and ``preskoceno``.
    """
    log.info(f"Hostname zdroje: {hostname}")

    log.info("Načítám známé zdroje z DB...")
    known_sources = load_known_sources(conn, hostname)
    log.info(f"Známých zdrojů v DB: {len(known_sources)}")

    log.info("Načítám známé hashe z DB...")
    known_hashes = load_known_hashes(conn)
    log.info(f"Známých hashů v DB: {len(known_hashes)}")

    stats = {"nalezeno": 0, "kopirovano": 0, "duplicit": 0, "chyb": 0, "preskoceno": 0}
    pending_zdroje = []

    def flush_zdroje():
        """Bulk-insert the queued source rows, commit, and empty the queue."""
        if not pending_zdroje:
            return
        # The context manager closes the cursor even when the insert raises.
        with conn.cursor() as cur:
            execute_values(
                cur,
                """INSERT INTO zdrojove_soubory
                (hostname, cesta_zdroje, nazev_souboru, velikost, blake3_hash, zaloha_id)
                VALUES %s
                ON CONFLICT (hostname, cesta_zdroje) DO NOTHING""",
                pending_zdroje,
            )
        conn.commit()
        pending_zdroje.clear()

    for source in iter_jpeg_files(drives):
        stats["nalezeno"] += 1

        # Reported before any `continue` below, so that skipped or failed
        # files cannot swallow the periodic progress message.
        if stats["nalezeno"] % 5000 == 0:
            log.info(
                f"Průběh: nalezeno={stats['nalezeno']} "
                f"nových={stats['kopirovano']} duplikátů={stats['duplicit']} "
                f"chyb={stats['chyb']} přeskočeno={stats['preskoceno']}"
            )

        src_str = str(source)

        if src_str in known_sources:
            stats["preskoceno"] += 1
            if stats["preskoceno"] % 5000 == 0:
                log.info(f"Přeskočeno (již v DB): {stats['preskoceno']}")
            continue

        t_start = time.perf_counter()

        try:
            velikost = source.stat().st_size
            hash_val = compute_blake3(source)
        except OSError as e:  # PermissionError is a subclass of OSError
            log.warning(f"CHYBA čtení: {source} → {e}")
            stats["chyb"] += 1
            continue

        t_hash = time.perf_counter()
        vel_mb = velikost / (1024 * 1024)

        zaloha_id = known_hashes.get(hash_val)

        if zaloha_id is not None:
            stats["duplicit"] += 1
            log.info(
                f"DUPLIKÁT {source.name} "
                f"({vel_mb:.1f} MB, hash={t_hash - t_start:.2f}s)"
            )
        else:
            dest = dest_path_for(source, hostname)
            try:
                copy_to_backup(source, dest)
            except (OSError, shutil.Error) as e:
                log.warning(f"CHYBA kopírování: {source} → {e}")
                stats["chyb"] += 1
                continue

            t_copy = time.perf_counter()

            zaloha_id = _store_backup_row(
                conn, hash_val, normalize_path_for_db(dest), source.name, velikost
            )

            t_db = time.perf_counter()

            # Remember the hash so later copies of the same content in this
            # run are treated as duplicates.
            known_hashes[hash_val] = zaloha_id
            stats["kopirovano"] += 1
            log.info(
                f"ZKOPÍROVÁNO [{stats['kopirovano']:>6}] {source.name} "
                f"({vel_mb:.1f} MB, hash={t_hash - t_start:.2f}s "
                f"copy={t_copy - t_hash:.2f}s db={t_db - t_copy:.2f}s "
                f"celkem={t_db - t_start:.2f}s)"
            )

        pending_zdroje.append((hostname, src_str, source.name, velikost, hash_val, zaloha_id))
        known_sources.add(src_str)

        if len(pending_zdroje) >= BATCH_SIZE:
            flush_zdroje()

    flush_zdroje()
    return stats


def _store_backup_row(conn, hash_val, db_path, file_name, size):
    """Insert the backup row for *hash_val*, commit, and return the row id.

    When the insert yields no row (ON CONFLICT DO NOTHING: the hash is
    already stored), the id of the existing row is looked up instead.
    """
    # The context manager closes the cursor even when a statement raises.
    with conn.cursor() as cur:
        cur.execute(SQL_INSERT_ZALOHA, (hash_val, db_path, file_name, size))
        row = cur.fetchone()
        if row is None:
            cur.execute(SQL_GET_ZALOHA_ID, (hash_val,))
            row = cur.fetchone()
    conn.commit()
    return row[0]
|
|
|
|
|
|
def main():
    """Run one backup pass and log a summary.

    Logs a start banner, connects to PostgreSQL with DB_CONFIG (exiting with
    status 1 when that fails), runs ``process`` and always closes the
    connection. A keyboard interrupt or any other exception raised by
    ``process`` is logged and followed by a rollback; the summary then shows
    the zero-initialised counters.
    """
    hostname = socket.gethostname()
    drives = get_local_drives()

    log.info("=" * 60)
    log.info(f"Spuštění: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    log.info(f"Hostname: {hostname}")
    log.info(f"Disky: {list(map(str, drives))}")
    log.info(f"Záloha: {ZALOHA_DIR}\\{hostname}\\")

    try:
        conn = psycopg2.connect(**DB_CONFIG)
        conn.autocommit = False
        log.info("PostgreSQL: připojeno")
    except Exception as exc:
        log.error(f"Nelze se připojit k DB: {exc}")
        sys.exit(1)

    # Fallback counters, used for the summary when process() does not return.
    stats = dict.fromkeys(
        ("nalezeno", "kopirovano", "duplicit", "chyb", "preskoceno"), 0
    )
    try:
        stats = process(conn, hostname, drives)
    except KeyboardInterrupt:
        log.warning("Přerušeno uživatelem — dosavadní záznamy jsou uloženy.")
        conn.rollback()
    except Exception as exc:
        # log.exception logs at ERROR level and attaches the traceback.
        log.exception(f"Neočekávaná chyba: {exc}")
        conn.rollback()
    finally:
        conn.close()

    log.info("-" * 60)
    summary_rows = (
        ("Nalezeno JPG/JPEG: ", "nalezeno"),
        ("Zkopírováno nových: ", "kopirovano"),
        ("Duplikátů (hash): ", "duplicit"),
        ("Přeskočeno (v DB): ", "preskoceno"),
        ("Chyb: ", "chyb"),
    )
    for label, key in summary_rows:
        log.info(f"{label}{stats[key]}")
    log.info("Hotovo.")
|
|
|
|
|
|
# Run the backup only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|