129 lines
4.4 KiB
Python
129 lines
4.4 KiB
Python
import pymysql
|
|
from datetime import datetime
|
|
from indexer.config import DB_CONFIG, BATCH_SIZE
|
|
|
|
|
|
def get_connection():
    """Open and return a new PyMySQL connection built from ``DB_CONFIG``.

    Every key of ``DB_CONFIG`` is forwarded to ``pymysql.connect`` as a
    keyword argument.
    """
    connection = pymysql.connect(**DB_CONFIG)
    return connection
|
|
|
|
|
|
# ── Run management ──────────────────────────────────────────
|
|
|
|
def create_run(cur) -> int:
    """Insert a new row into ``runs`` with status ``RUNNING``.

    ``started_at`` is set to ``datetime.now()`` (naive local time).

    Returns:
        The cursor's ``lastrowid`` after the INSERT, i.e. the id of the new run.
    """
    started_at = datetime.now()
    sql = "INSERT INTO runs (started_at, status) VALUES (%s, 'RUNNING')"
    cur.execute(sql, (started_at,))
    return cur.lastrowid
|
|
|
|
|
|
def finalize_run(cur, run_id: int, stats: dict):
    """Mark a run as ``COMPLETED`` and store its file counters.

    Args:
        cur: Database cursor used to execute the UPDATE.
        run_id: Id of the ``runs`` row to update.
        stats: Mapping that must contain the keys ``total``, ``new``,
            ``modified``, ``deleted`` and ``unchanged``; a missing key
            raises ``KeyError``.
    """
    # Same order as the files_* columns in the SET clause below.
    counters = tuple(
        stats[key] for key in ("total", "new", "modified", "deleted", "unchanged")
    )
    params = (datetime.now(), *counters, run_id)
    cur.execute(
        """UPDATE runs
           SET finished_at = %s, status = 'COMPLETED',
               files_total = %s, files_new = %s, files_modified = %s,
               files_deleted = %s, files_unchanged = %s
           WHERE id = %s""",
        params,
    )
|
|
|
|
|
|
def fail_run(cur, run_id: int):
    """Mark a run as ``FAILED`` and stamp ``finished_at`` with the current time.

    Args:
        cur: Database cursor used to execute the UPDATE.
        run_id: Id of the ``runs`` row to update.
    """
    finished_at = datetime.now()
    sql = "UPDATE runs SET finished_at = %s, status = 'FAILED' WHERE id = %s"
    cur.execute(sql, (finished_at, run_id))
|
|
|
|
|
|
# ── Load DB state ──────────────────────────────────────────
|
|
|
|
def load_all_files(cur) -> dict:
    """Load every file row flagged ``exists_now = 1`` from the DB into memory.

    Rows are unpacked positionally, so the cursor must yield 5-item
    sequences in the order of the SELECT list.

    Returns:
        ``{relative_path: {"id", "size", "mtime", "content_hash"}}``
    """
    cur.execute(
        """SELECT id, relative_path, file_size, mtime, content_hash
           FROM files WHERE exists_now = 1"""
    )
    return {
        rel_path: {
            "id": file_id,
            "size": size,
            "mtime": mtime,
            "content_hash": content_hash,
        }
        for file_id, rel_path, size, mtime, content_hash in cur.fetchall()
    }
|
|
|
|
|
|
# ── Batch operations ────────────────────────────────────────
|
|
|
|
def batch_insert_files(cur, files_list: list, run_id: int) -> dict:
    """Insert new files in chunks of ``BATCH_SIZE`` and return their ids.

    Args:
        cur: Database cursor used for the INSERTs and the id look-ups.
        files_list: ``[{relative_path, file_name, directory, size, mtime,
            content_hash}]`` — one dict per new file.
        run_id: Stored as both ``first_seen_run`` and ``last_seen_run``.

    Returns:
        ``{relative_path: file_id}`` for the paths found by the look-up
        query issued after each chunk.
    """
    # exists_now is bound as a placeholder instead of a literal ``1``:
    # PyMySQL only rewrites executemany() into a single multi-row INSERT
    # when the VALUES tuple consists solely of placeholders; with a literal
    # in the tuple it falls back to one statement per row.
    insert_sql = """INSERT INTO files
           (relative_path, file_name, directory, file_size, mtime,
            content_hash, first_seen_run, last_seen_run, exists_now)
           VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)"""
    path_to_id = {}
    for start in range(0, len(files_list), BATCH_SIZE):
        chunk = files_list[start:start + BATCH_SIZE]
        cur.executemany(
            insert_sql,
            [
                (
                    f["relative_path"], f["file_name"], f["directory"],
                    f["size"], f["mtime"], f["content_hash"],
                    run_id, run_id, 1,
                )
                for f in chunk
            ],
        )
        # Fetch real IDs — lastrowid+j is unreliable with executemany
        paths = [f["relative_path"] for f in chunk]
        placeholders = ",".join(["%s"] * len(paths))
        cur.execute(
            f"SELECT id, relative_path FROM files WHERE relative_path IN ({placeholders})",
            paths,
        )
        path_to_id.update((row[1], row[0]) for row in cur.fetchall())
    return path_to_id
|
|
|
|
|
|
def batch_update_modified(cur, files_list: list, run_id: int):
    """Update modified files in chunks of ``BATCH_SIZE``.

    Each row gets its new size, mtime and content hash, has
    ``last_seen_run`` set to ``run_id`` and ``exists_now`` set to 1.

    Args:
        cur: Database cursor used to execute the UPDATEs.
        files_list: ``[{id, size, mtime, content_hash}]``.
        run_id: Id of the current run.
    """
    update_sql = """UPDATE files
           SET file_size = %s, mtime = %s, content_hash = %s,
               last_seen_run = %s, exists_now = 1
           WHERE id = %s"""
    total = len(files_list)
    for start in range(0, total, BATCH_SIZE):
        batch = files_list[start:start + BATCH_SIZE]
        params = [
            (entry["size"], entry["mtime"], entry["content_hash"], run_id, entry["id"])
            for entry in batch
        ]
        cur.executemany(update_sql, params)
|
|
|
|
|
|
def batch_mark_deleted(cur, file_ids: list, run_id: int):
    """Mark deleted files in chunks of ``BATCH_SIZE``.

    Sets ``exists_now = 0`` and ``last_seen_run = run_id`` for every id
    in ``file_ids``.
    """
    update_sql = "UPDATE files SET exists_now = 0, last_seen_run = %s WHERE id = %s"
    total = len(file_ids)
    for start in range(0, total, BATCH_SIZE):
        batch = file_ids[start:start + BATCH_SIZE]
        params = [(run_id, file_id) for file_id in batch]
        cur.executemany(update_sql, params)
|
|
|
|
|
|
def batch_update_unchanged(cur, file_ids: list, run_id: int):
    """Touch unchanged files in chunks of ``BATCH_SIZE``.

    Sets ``last_seen_run = run_id`` and ``exists_now = 1`` for every id in
    ``file_ids``; no other column is modified.
    """
    update_sql = "UPDATE files SET last_seen_run = %s, exists_now = 1 WHERE id = %s"
    total = len(file_ids)
    for start in range(0, total, BATCH_SIZE):
        batch = file_ids[start:start + BATCH_SIZE]
        params = [(run_id, file_id) for file_id in batch]
        cur.executemany(update_sql, params)
|