reporter
This commit is contained in:
+22
-27
@@ -1,8 +1,13 @@
|
||||
import unicodedata
|
||||
import pymysql
|
||||
from datetime import datetime
|
||||
from indexer.config import DB_CONFIG, BATCH_SIZE
|
||||
|
||||
|
||||
def _nfc(s: str) -> str:
|
||||
return unicodedata.normalize("NFC", s) if s else s
|
||||
|
||||
|
||||
def get_connection():
    """Open and return a pymysql connection.

    The keyword arguments passed to ``pymysql.connect`` are taken from the
    ``DB_CONFIG`` mapping imported from ``indexer.config``.
    """
    return pymysql.connect(**DB_CONFIG)
|
||||
|
||||
@@ -50,7 +55,7 @@ def load_all_files(cur) -> dict:
|
||||
result = {}
|
||||
for row in cur.fetchall():
|
||||
file_id, rel_path, size, mtime, content_hash = row
|
||||
result[rel_path] = {
|
||||
result[_nfc(rel_path)] = {
|
||||
"id": file_id,
|
||||
"size": size,
|
||||
"mtime": mtime,
|
||||
@@ -70,34 +75,24 @@ def batch_insert_files(cur, files_list: list, run_id: int) -> dict:
|
||||
Returns: {relative_path: file_id}
|
||||
"""
|
||||
path_to_id = {}
|
||||
for i in range(0, len(files_list), BATCH_SIZE):
|
||||
chunk = files_list[i:i + BATCH_SIZE]
|
||||
for f in chunk:
|
||||
cur.execute(
|
||||
"""INSERT INTO files
|
||||
(relative_path, file_name, directory, file_size, mtime,
|
||||
content_hash, first_seen_run, last_seen_run, exists_now)
|
||||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 1)
|
||||
ON DUPLICATE KEY UPDATE
|
||||
file_name = VALUES(file_name),
|
||||
directory = VALUES(directory),
|
||||
file_size = VALUES(file_size),
|
||||
mtime = VALUES(mtime),
|
||||
content_hash = VALUES(content_hash),
|
||||
last_seen_run = VALUES(last_seen_run),
|
||||
exists_now = 1""",
|
||||
(f["relative_path"], f["file_name"], f["directory"],
|
||||
f["size"], f["mtime"], f["content_hash"], run_id, run_id)
|
||||
)
|
||||
# Fetch real IDs
|
||||
paths = [f["relative_path"] for f in chunk]
|
||||
placeholders = ",".join(["%s"] * len(paths))
|
||||
for f in files_list:
|
||||
cur.execute(
|
||||
f"SELECT id, relative_path FROM files WHERE relative_path IN ({placeholders})",
|
||||
paths,
|
||||
"""INSERT INTO files
|
||||
(relative_path, file_name, directory, file_size, mtime,
|
||||
content_hash, first_seen_run, last_seen_run, exists_now)
|
||||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 1)
|
||||
ON DUPLICATE KEY UPDATE
|
||||
file_name = VALUES(file_name),
|
||||
directory = VALUES(directory),
|
||||
file_size = VALUES(file_size),
|
||||
mtime = VALUES(mtime),
|
||||
content_hash = VALUES(content_hash),
|
||||
last_seen_run = VALUES(last_seen_run),
|
||||
exists_now = 1""",
|
||||
(f["relative_path"], f["file_name"], f["directory"],
|
||||
f["size"], f["mtime"], f["content_hash"], run_id, run_id)
|
||||
)
|
||||
for row in cur.fetchall():
|
||||
path_to_id[row[1]] = row[0]
|
||||
path_to_id[f["relative_path"]] = cur.lastrowid
|
||||
return path_to_id
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user