This commit is contained in:
2026-05-18 07:04:08 +02:00
parent 52f04c2839
commit 07e6a9c374
8 changed files with 254 additions and 99 deletions
+22 -27
View File
@@ -1,8 +1,13 @@
import unicodedata
import pymysql
from datetime import datetime
from indexer.config import DB_CONFIG, BATCH_SIZE
def _nfc(s: str) -> str:
return unicodedata.normalize("NFC", s) if s else s
def get_connection():
    """Open and return a new PyMySQL connection.

    The connection parameters are taken from ``DB_CONFIG`` and passed to
    ``pymysql.connect`` as keyword arguments. Each call creates a fresh
    connection.
    """
    return pymysql.connect(**DB_CONFIG)
@@ -50,7 +55,7 @@ def load_all_files(cur) -> dict:
result = {}
for row in cur.fetchall():
file_id, rel_path, size, mtime, content_hash = row
result[rel_path] = {
result[_nfc(rel_path)] = {
"id": file_id,
"size": size,
"mtime": mtime,
@@ -70,34 +75,24 @@ def batch_insert_files(cur, files_list: list, run_id: int) -> dict:
Returns: {relative_path: file_id}
"""
path_to_id = {}
for i in range(0, len(files_list), BATCH_SIZE):
chunk = files_list[i:i + BATCH_SIZE]
for f in chunk:
cur.execute(
"""INSERT INTO files
(relative_path, file_name, directory, file_size, mtime,
content_hash, first_seen_run, last_seen_run, exists_now)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 1)
ON DUPLICATE KEY UPDATE
file_name = VALUES(file_name),
directory = VALUES(directory),
file_size = VALUES(file_size),
mtime = VALUES(mtime),
content_hash = VALUES(content_hash),
last_seen_run = VALUES(last_seen_run),
exists_now = 1""",
(f["relative_path"], f["file_name"], f["directory"],
f["size"], f["mtime"], f["content_hash"], run_id, run_id)
)
# Fetch real IDs
paths = [f["relative_path"] for f in chunk]
placeholders = ",".join(["%s"] * len(paths))
for f in files_list:
cur.execute(
f"SELECT id, relative_path FROM files WHERE relative_path IN ({placeholders})",
paths,
"""INSERT INTO files
(relative_path, file_name, directory, file_size, mtime,
content_hash, first_seen_run, last_seen_run, exists_now)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 1)
ON DUPLICATE KEY UPDATE
file_name = VALUES(file_name),
directory = VALUES(directory),
file_size = VALUES(file_size),
mtime = VALUES(mtime),
content_hash = VALUES(content_hash),
last_seen_run = VALUES(last_seen_run),
exists_now = 1""",
(f["relative_path"], f["file_name"], f["directory"],
f["size"], f["mtime"], f["content_hash"], run_id, run_id)
)
for row in cur.fetchall():
path_to_id[row[1]] = row[0]
path_to_id[f["relative_path"]] = cur.lastrowid
return path_to_id