""" collect_pictures_windows.py --------------------------- Windows verze zálohovacího skriptu. Prochází všechny lokální disky (ne síťové, ne CD), hledá JPG/JPEG, kopíruje (deduplikace BLAKE3) na Tower1 přes UNC cestu a zapisuje do PostgreSQL. Cesta zálohy: \\Tower1\ZalohaVsechObrazku\{hostname}\{písmeno_disku}\{cesta} Příklad: \\Tower1\ZalohaVsechObrazku\JMENO-PC\D\Foto\2023\img.jpg Bezpečné pro opakované spuštění (pokračuje tam kde skončilo). Bezpečné pro souběžný běh na více strojích (ON CONFLICT v SQL). """ import os import sys import shutil import socket import string import ctypes import logging from pathlib import Path from datetime import datetime import blake3 import psycopg2 from psycopg2.extras import execute_values # ── Konfigurace ─────────────────────────────────────────────────────────────── DB_CONFIG = { "host": "192.168.1.76", "port": 5432, "user": "vladimir.buzalka", "password": "Vlado7309208104++", "database": "fotky_buzalkovi", } ZALOHA_DIR = Path(r"\\Tower1\ZalohaVsechObrazku") JPEG_EXTENSIONS = {".jpg", ".jpeg"} EXCLUDED_DIR_NAMES_LOWER = { "zalohavsechobrazku", "zálohavsechobrázku", "zálohavsechobrazku", "$recycle.bin", "system volume information", "recovery", } EXCLUDED_FULL_PATHS = { Path(os.environ.get("WINDIR", r"C:\Windows")), } BATCH_SIZE = 500 LOG_FILE = Path(__file__).parent / "collect_pictures_windows.log" # ── Logging ─────────────────────────────────────────────────────────────────── logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)-7s %(message)s", datefmt="%Y-%m-%d %H:%M:%S", handlers=[ logging.StreamHandler(sys.stdout), logging.FileHandler(LOG_FILE, encoding="utf-8"), ], ) log = logging.getLogger(__name__) # ── SQL ─────────────────────────────────────────────────────────────────────── SQL_INSERT_ZALOHA = """ INSERT INTO zaloha_obrazku (blake3_hash, cesta_zalohy, nazev_souboru, velikost) VALUES (%s, %s, %s, %s) ON CONFLICT (blake3_hash) DO NOTHING RETURNING id """ SQL_INSERT_ZDROJ = """ INSERT INTO zdrojove_soubory (hostname, cesta_zdroje, nazev_souboru, velikost, blake3_hash, zaloha_id) VALUES (%s, %s, %s, %s, %s, %s) ON CONFLICT (hostname, cesta_zdroje) DO NOTHING """ SQL_GET_ZALOHA_ID = "SELECT id FROM zaloha_obrazku WHERE blake3_hash = %s" # ── Pomocné funkce ──────────────────────────────────────────────────────────── def get_local_drives() -> list[Path]: DRIVE_FIXED = 3 drives = [] bitmask = ctypes.windll.kernel32.GetLogicalDrives() for letter in string.ascii_uppercase: if bitmask & 1: drive = Path(f"{letter}:\\") drive_type = ctypes.windll.kernel32.GetDriveTypeW(str(drive)) if drive_type == DRIVE_FIXED: drives.append(drive) bitmask >>= 1 return drives def is_excluded_path(path: Path) -> bool: for excl in EXCLUDED_FULL_PATHS: try: path.relative_to(excl) return True except ValueError: pass return False def compute_blake3(path: Path, chunk: int = 1 << 20) -> str: h = blake3.blake3() with open(path, "rb") as f: while data := f.read(chunk): h.update(data) return h.hexdigest() def dest_path_for(source: Path, hostname: str) -> Path: drive_letter = source.drive.rstrip(":") relative = source.relative_to(source.drive + "\\") return ZALOHA_DIR / hostname / drive_letter / relative def copy_to_backup(source: Path, dest: Path) -> None: dest.parent.mkdir(parents=True, exist_ok=True) if dest.exists(): return shutil.copy2(source, dest) def iter_jpeg_files(drives: list[Path]): for drive in drives: log.info(f"Skenuji disk: {drive}") for root, dirs, files in os.walk(drive, followlinks=False): root_path = Path(root) if is_excluded_path(root_path): dirs.clear() continue dirs[:] = [ d for d in dirs if d.lower() not in EXCLUDED_DIR_NAMES_LOWER and not is_excluded_path(root_path / d) ] for fname in files: if Path(fname).suffix.lower() in JPEG_EXTENSIONS: yield root_path / fname def load_known_sources(conn, hostname: str) -> set[str]: with conn.cursor("src_cursor") as cur: cur.itersize = 10000 cur.execute("SELECT cesta_zdroje FROM zdrojove_soubory WHERE hostname = %s", (hostname,)) return {row[0] for row in cur} def load_known_hashes(conn) -> dict[str, int]: with conn.cursor("hash_cursor") as cur: cur.itersize = 10000 cur.execute("SELECT blake3_hash, id FROM zaloha_obrazku") return {row[0]: row[1] for row in cur} # ── Hlavní logika ───────────────────────────────────────────────────────────── def process(conn, hostname, drives): log.info(f"Hostname zdroje: {hostname}") log.info("Načítám známé zdroje z DB...") known_sources = load_known_sources(conn, hostname) log.info(f"Známých zdrojů v DB: {len(known_sources)}") log.info("Načítám známé hashe z DB...") known_hashes = load_known_hashes(conn) log.info(f"Známých hashů v DB: {len(known_hashes)}") stats = {"nalezeno": 0, "kopirovano": 0, "duplicit": 0, "chyb": 0, "preskoceno": 0} pending_zdroje = [] def flush_zdroje(): if not pending_zdroje: return cur = conn.cursor() execute_values( cur, """INSERT INTO zdrojove_soubory (hostname, cesta_zdroje, nazev_souboru, velikost, blake3_hash, zaloha_id) VALUES %s ON CONFLICT (hostname, cesta_zdroje) DO NOTHING""", pending_zdroje, ) conn.commit() cur.close() pending_zdroje.clear() for source in iter_jpeg_files(drives): stats["nalezeno"] += 1 src_str = str(source) if src_str in known_sources: stats["preskoceno"] += 1 if stats["preskoceno"] % 5000 == 0: log.info(f"Přeskočeno (již v DB): {stats['preskoceno']}") continue try: velikost = source.stat().st_size hash_val = compute_blake3(source) except (OSError, PermissionError) as e: log.warning(f"CHYBA čtení: {source} → {e}") stats["chyb"] += 1 continue zaloha_id = known_hashes.get(hash_val) if zaloha_id is not None: stats["duplicit"] += 1 else: dest = dest_path_for(source, hostname) try: copy_to_backup(source, dest) except (OSError, shutil.Error) as e: log.warning(f"CHYBA kopírování: {source} → {e}") stats["chyb"] += 1 continue cur = conn.cursor() cur.execute(SQL_INSERT_ZALOHA, (hash_val, str(dest), source.name, velikost)) row = cur.fetchone() if row: zaloha_id = row[0] else: cur.execute(SQL_GET_ZALOHA_ID, (hash_val,)) zaloha_id = cur.fetchone()[0] cur.close() conn.commit() known_hashes[hash_val] = zaloha_id stats["kopirovano"] += 1 log.info(f"ZKOPÍROVÁNO [{stats['kopirovano']:>6}] {source}") pending_zdroje.append((hostname, src_str, source.name, velikost, hash_val, zaloha_id)) known_sources.add(src_str) if len(pending_zdroje) >= BATCH_SIZE: flush_zdroje() if stats["nalezeno"] % 5000 == 0: log.info( f"Průběh: nalezeno={stats['nalezeno']} " f"nových={stats['kopirovano']} duplikátů={stats['duplicit']} " f"chyb={stats['chyb']} přeskočeno={stats['preskoceno']}" ) flush_zdroje() return stats def main(): hostname = socket.gethostname() drives = get_local_drives() log.info("=" * 60) log.info(f"Spuštění: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") log.info(f"Hostname: {hostname}") log.info(f"Disky: {[str(d) for d in drives]}") log.info(f"Záloha: {ZALOHA_DIR}\\{hostname}\\") try: conn = psycopg2.connect(**DB_CONFIG) conn.autocommit = False log.info("PostgreSQL: připojeno") except Exception as e: log.error(f"Nelze se připojit k DB: {e}") sys.exit(1) stats = {"nalezeno": 0, "kopirovano": 0, "duplicit": 0, "chyb": 0, "preskoceno": 0} try: stats = process(conn, hostname, drives) except KeyboardInterrupt: log.warning("Přerušeno uživatelem — dosavadní záznamy jsou uloženy.") conn.rollback() except Exception as e: log.error(f"Neočekávaná chyba: {e}", exc_info=True) conn.rollback() finally: conn.close() log.info("-" * 60) log.info(f"Nalezeno JPG/JPEG: {stats['nalezeno']}") log.info(f"Zkopírováno nových: {stats['kopirovano']}") log.info(f"Duplikátů (hash): {stats['duplicit']}") log.info(f"Přeskočeno (v DB): {stats['preskoceno']}") log.info(f"Chyb: {stats['chyb']}") log.info("Hotovo.") if __name__ == "__main__": main()