"""MySQL persistence layer for the file indexer.

Provides run bookkeeping (create/finalize/fail) and chunked batch
operations over the `files` table. All functions take an open pymysql
cursor; transaction commit/rollback is the caller's responsibility.
"""

import pymysql
from datetime import datetime

from indexer.config import DB_CONFIG, BATCH_SIZE


def get_connection():
    """Return a new pymysql connection built from the project DB settings."""
    return pymysql.connect(**DB_CONFIG)


# ── Run management ──────────────────────────────────────────

def create_run(cur) -> int:
    """Insert a new run row in RUNNING state and return its id."""
    cur.execute(
        "INSERT INTO runs (started_at, status) VALUES (%s, 'RUNNING')",
        (datetime.now(),),
    )
    return cur.lastrowid


def finalize_run(cur, run_id: int, stats: dict):
    """Mark the run COMPLETED and record per-category file counters.

    stats must contain the keys: total, new, modified, deleted, unchanged.
    """
    cur.execute(
        """UPDATE runs SET finished_at = %s, status = 'COMPLETED',
               files_total = %s, files_new = %s, files_modified = %s,
               files_deleted = %s, files_unchanged = %s
           WHERE id = %s""",
        (datetime.now(), stats["total"], stats["new"], stats["modified"],
         stats["deleted"], stats["unchanged"], run_id),
    )


def fail_run(cur, run_id: int):
    """Mark the run FAILED with the current timestamp."""
    cur.execute(
        "UPDATE runs SET finished_at = %s, status = 'FAILED' WHERE id = %s",
        (datetime.now(), run_id),
    )


# ── Load DB state ────────────────────────────────────────────

def load_all_files(cur) -> dict:
    """Load all currently-existing files from the DB into RAM.

    Returns:
        {relative_path: {id, size, mtime, content_hash}}
    """
    cur.execute(
        """SELECT id, relative_path, file_size, mtime, content_hash
           FROM files WHERE exists_now = 1"""
    )
    return {
        rel_path: {
            "id": file_id,
            "size": size,
            "mtime": mtime,
            "content_hash": content_hash,
        }
        for file_id, rel_path, size, mtime, content_hash in cur.fetchall()
    }


# ── Batch operations ─────────────────────────────────────────

def batch_insert_files(cur, files_list: list, run_id: int) -> dict:
    """Batch INSERT (or re-activate) files.

    Handles re-appearing files that were previously deleted
    (exists_now=0) via ON DUPLICATE KEY UPDATE; first_seen_run is
    deliberately NOT updated on conflict, so it keeps the run in which
    the file was first observed.

    files_list: [{relative_path, file_name, directory, size, mtime, content_hash}]
    Returns: {relative_path: file_id}
    """
    # NOTE: VALUES() in the update clause is deprecated in MySQL 8.0.20+
    # (alias syntax replaces it) but still works; kept for compatibility.
    sql = """INSERT INTO files
                 (relative_path, file_name, directory, file_size, mtime,
                  content_hash, first_seen_run, last_seen_run, exists_now)
             VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 1)
             ON DUPLICATE KEY UPDATE
                 file_name = VALUES(file_name),
                 directory = VALUES(directory),
                 file_size = VALUES(file_size),
                 mtime = VALUES(mtime),
                 content_hash = VALUES(content_hash),
                 last_seen_run = VALUES(last_seen_run),
                 exists_now = 1"""
    path_to_id = {}
    for i in range(0, len(files_list), BATCH_SIZE):
        chunk = files_list[i:i + BATCH_SIZE]
        # FIX: the original issued one execute() per row, defeating the
        # chunking. executemany lets pymysql rewrite this into a single
        # multi-row INSERT per chunk (the ON DUPLICATE clause contains no
        # %s placeholders, so the bulk rewrite applies).
        cur.executemany(
            sql,
            [(f["relative_path"], f["file_name"], f["directory"], f["size"],
              f["mtime"], f["content_hash"], run_id, run_id)
             for f in chunk],
        )
        # lastrowid is unreliable for multi-row upserts — fetch real IDs.
        paths = [f["relative_path"] for f in chunk]
        placeholders = ",".join(["%s"] * len(paths))
        cur.execute(
            f"SELECT id, relative_path FROM files WHERE relative_path IN ({placeholders})",
            paths,
        )
        for file_id, rel_path in cur.fetchall():
            path_to_id[rel_path] = file_id
    return path_to_id


def batch_update_modified(cur, files_list: list, run_id: int):
    """Batch UPDATE of modified files.

    files_list: [{id, size, mtime, content_hash}]
    """
    for i in range(0, len(files_list), BATCH_SIZE):
        chunk = files_list[i:i + BATCH_SIZE]
        cur.executemany(
            """UPDATE files SET file_size = %s, mtime = %s, content_hash = %s,
                   last_seen_run = %s, exists_now = 1
               WHERE id = %s""",
            [(f["size"], f["mtime"], f["content_hash"], run_id, f["id"])
             for f in chunk],
        )


def batch_mark_deleted(cur, file_ids: list, run_id: int):
    """Batch UPDATE of deleted files — sets exists_now = 0."""
    for i in range(0, len(file_ids), BATCH_SIZE):
        chunk = file_ids[i:i + BATCH_SIZE]
        cur.executemany(
            "UPDATE files SET exists_now = 0, last_seen_run = %s WHERE id = %s",
            [(run_id, fid) for fid in chunk],
        )


def batch_update_unchanged(cur, file_ids: list, run_id: int):
    """Batch UPDATE of unchanged files — refreshes last_seen_run only."""
    for i in range(0, len(file_ids), BATCH_SIZE):
        chunk = file_ids[i:i + BATCH_SIZE]
        cur.executemany(
            "UPDATE files SET last_seen_run = %s, exists_now = 1 WHERE id = %s",
            [(run_id, fid) for fid in chunk],
        )