z230

2025-11-25 11:26:03 +01:00
parent df78ec1909
commit d1b89d8e34
2 changed files with 249 additions and 192 deletions
--- a/MD5calculate.py
+++ b/MD5calculate.py
@@ -1,192 +0,0 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import os
 import hashlib
 from datetime import datetime
 import mysql.connector
 from dotenv import load_dotenv
 from pathlib import Path
 # ======================================================
 # Load environment
 # ======================================================
 # Locate the .env file next to this script (independent of the current
 # working directory) and load it, so the DB_MYSQL_* values read through
 # os.getenv() further down can be supplied from that file.
 _script_dir = Path(__file__).resolve().parent
 env_path = _script_dir / ".env"
 load_dotenv(env_path)
 # ======================================================
 # MySQL connection
 # ======================================================
 def get_db_connection():
    """
    Open a new MySQL connection to the "walkfiles" database.

    Host, user, password and port are read from the DB_MYSQL_HOST,
    DB_MYSQL_ROOT, DB_MYSQL_ROOT_PASS and DB_MYSQL_PORT environment
    variables. DB_MYSQL_PORT falls back to 3306 (the standard MySQL port)
    when it is unset or empty, instead of failing with a TypeError/ValueError
    from int().

    The session is switched to utf8mb4 / utf8mb4_general_ci before the
    connection is returned. The caller is responsible for closing it.
    """
    conn = mysql.connector.connect(
        host=os.getenv("DB_MYSQL_HOST"),
        user=os.getenv("DB_MYSQL_ROOT"),
        password=os.getenv("DB_MYSQL_ROOT_PASS"),
        port=int(os.getenv("DB_MYSQL_PORT") or 3306),
        database="walkfiles",
        auth_plugin="mysql_native_password"
    )
    try:
        c = conn.cursor()
        try:
            c.execute("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
        finally:
            c.close()
    except Exception:
        # Do not leak a half-initialised connection if session setup fails.
        conn.close()
        raise
    return conn
 # ======================================================
 # Helpers
 # ======================================================
 def file_md5(path, chunk_size=1024 * 1024):
    """Return the hex MD5 digest of the file at *path*, read chunk by chunk."""
    digest = hashlib.md5()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:
                # An empty read means end of file.
                break
            digest.update(block)
    return digest.hexdigest()
 def parse_size(size_str: "str | int | float") -> int:
    """
    Convert a human-readable size into a number of bytes.

    Accepted inputs:
        * strings with a binary unit suffix, case-insensitive:
          "100B", "500kB", "10MB", "2GB", "1TB" (1 KB = 1024 bytes);
          fractional values such as "1.5MB" are allowed and truncated
          to whole bytes
        * plain digit strings such as "1048576" (raw bytes)
        * numbers (int/float), which are returned as an int unchanged
          in magnitude

    Raises ValueError when a string cannot be parsed.
    """
    # Numeric input is already a byte count; previously this crashed on .strip().
    if isinstance(size_str, (int, float)):
        return int(size_str)
    s = size_str.strip().upper()
    # Two-letter units must be tested before the bare "B" suffix,
    # because every one of them also ends with "B".
    units = (
        ("KB", 1024),
        ("MB", 1024 ** 2),
        ("GB", 1024 ** 3),
        ("TB", 1024 ** 4),
        ("B", 1),
    )
    for suffix, factor in units:
        if s.endswith(suffix):
            return int(float(s[:-len(suffix)]) * factor)
    return int(s)  # assume raw bytes
 # ======================================================
 # MAIN LOGIC
 # ======================================================
 def run_md5_calculator(device_name=None,
                       device_id=None,
                       extension=".pdf",
                       max_size="50MB"):
    """
    Compute and store content MD5 hashes for files of one device.

    device_name OR device_id must be provided; when only device_name is
    given, the id is looked up in the `devices` table.
    extension: path suffix to match, e.g. ".pdf", ".jpg".
    max_size: "10MB", "500KB", "1GB" or a number of bytes (see parse_size).

    A file is (re)hashed when it has no stored MD5, no md5_calculated
    timestamp, or when md5_calculated is older than its `modified` value.
    Files missing on disk or unreadable are reported and skipped.
    All updates are committed once at the end.

    Raises RuntimeError when neither device_name nor device_id is given, or
    when device_name is not found. MySQL errors are printed, not re-raised.
    """
    # Validate arguments up front so a bad call fails before any DB work.
    if device_id is None and device_name is None:
        raise RuntimeError("You must provide device_name or device_id")
    max_bytes = parse_size(max_size)
    conn, cursor = None, None
    try:
        conn = get_db_connection()
        cursor = conn.cursor(dictionary=True)
        # ------------------------------------------
        # Resolve device_id if only device_name given
        # ------------------------------------------
        if device_id is None:
            cursor.execute("SELECT id FROM devices WHERE name=%s", (device_name,))
            row = cursor.fetchone()
            if not row:
                raise RuntimeError(f"Device '{device_name}' not found")
            device_id = row["id"]
        print(f"\n🔍 Filtering: device={device_id}, ext={extension}, max_size={max_bytes} bytes\n")
        # ------------------------------------------
        # SELECT candidate files (filtered by device, suffix and size)
        # ------------------------------------------
        cursor.execute("""
            SELECT id, path, size, modified, content_md5, md5_calculated
            FROM files
            WHERE device_id=%s
              AND deleted = 0
              AND path LIKE %s
              AND size <= %s
        """, (device_id, "%" + extension, max_bytes))
        rows = cursor.fetchall()
        total = len(rows)
        print(f"📁 Files matching criteria: {total}")
        updates = 0
        for row in rows:
            file_id = row["id"]
            path = row["path"]
            modified = row["modified"]
            prev_md5 = row["content_md5"]
            prev_calc = row["md5_calculated"]
            # -------------------------------
            # Skip missing files on disk
            # -------------------------------
            if not os.path.isfile(path):
                print(f"⚠️ Missing on disk, skipping: {path}")
                continue
            # -------------------------------
            # Recalculate only when there is no hash yet or it is stale
            # -------------------------------
            need_md5 = (
                prev_md5 is None or
                prev_calc is None or
                prev_calc < modified
            )
            if not need_md5:
                continue
            # -------------------------------
            # Compute MD5
            # -------------------------------
            print(f"🔄 Calculating MD5: {path}")
            try:
                new_md5 = file_md5(path)
            except OSError as e:
                # One unreadable file (permissions, removed meanwhile) must not
                # abort the batch and discard every uncommitted update.
                print(f"⚠️ Cannot read file, skipping: {path} ({e})")
                continue
            now = datetime.now().replace(microsecond=0)
            cursor.execute("""
                UPDATE files
                SET content_md5=%s,
                    md5_calculated=%s
                WHERE id=%s
            """, (new_md5, now, file_id))
            updates += 1
        conn.commit()
        print("\n✅ MD5 calculation finished.")
        print(f"   Updated files: {updates}")
        print(f"   Skipped files: {total - updates}\n")
    except mysql.connector.Error as e:
        # The bare name `Error` was never imported; referencing it turned every
        # exception raised in the try block into a NameError.
        print("MySQL Error:", e)
    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()
 # ======================================================
 # RUN EXAMPLE
 # ======================================================
 if __name__ == "__main__":
    # Example run: hash PDFs of at most 100 MB recorded for device "Z230".
    example_filters = {
        "device_name": "Z230",
        "extension": ".pdf",
        "max_size": "100MB",
    }
    run_md5_calculator(**example_filters)
--- a/MD5CalculateMultiThread.py
+++ b/MD5CalculateMultiThread.py
@@ -0,0 +1,249 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import os
 import hashlib
 from datetime import datetime
 import mysql.connector
 from dotenv import load_dotenv
 from pathlib import Path
 from concurrent.futures import ThreadPoolExecutor, as_completed
 # ======================================================
 # Load environment
 # ======================================================
 # Locate the .env file next to this script (independent of the current
 # working directory) and load it, so the DB_MYSQL_* values read through
 # os.getenv() further down can be supplied from that file.
 _script_dir = Path(__file__).resolve().parent
 env_path = _script_dir / ".env"
 load_dotenv(env_path)
 # ======================================================
 # MySQL connection (each thread gets its own)
 # ======================================================
 def get_db_connection():
    """
    Open a new MySQL connection to the "walkfiles" database.

    Every call creates a fresh connection, so each caller (including each
    worker thread) works on a connection of its own.

    Host, user, password and port are read from the DB_MYSQL_HOST,
    DB_MYSQL_ROOT, DB_MYSQL_ROOT_PASS and DB_MYSQL_PORT environment
    variables. DB_MYSQL_PORT falls back to 3306 (the standard MySQL port)
    when it is unset or empty, instead of failing with a TypeError/ValueError
    from int().

    The session is switched to utf8mb4 / utf8mb4_general_ci before the
    connection is returned. The caller is responsible for closing it.
    """
    conn = mysql.connector.connect(
        host=os.getenv("DB_MYSQL_HOST"),
        user=os.getenv("DB_MYSQL_ROOT"),
        password=os.getenv("DB_MYSQL_ROOT_PASS"),
        port=int(os.getenv("DB_MYSQL_PORT") or 3306),
        database="walkfiles",
        auth_plugin="mysql_native_password"
    )
    try:
        c = conn.cursor()
        try:
            c.execute("SET NAMES utf8mb4 COLLATE utf8mb4_general_ci")
        finally:
            c.close()
    except Exception:
        # Do not leak a half-initialised connection if session setup fails.
        conn.close()
        raise
    return conn
 # ======================================================
 # Helpers
 # ======================================================
 def file_md5(path, chunk_size=1024 * 1024):
    """Hash the file at *path* with MD5 in chunk_size pieces; return the hex digest."""
    hasher = hashlib.md5()
    with open(path, "rb") as fh:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for piece in iter(lambda: fh.read(chunk_size), b""):
            hasher.update(piece)
    return hasher.hexdigest()
 def parse_size(size_str: "str | int | float") -> int:
    """
    Convert a human-readable size into a number of bytes.

    Strings may carry a case-insensitive binary unit suffix ("100B",
    "500kB", "10MB", "2GB", "1TB"; 1 KB = 1024 bytes) or be plain digits
    (raw bytes). Fractional values such as "1.5MB" are truncated to whole
    bytes. Numbers (int/float) are accepted as well and returned as an int.

    Raises ValueError when a string cannot be parsed.
    """
    # Numeric input is already a byte count; .strip() would fail on it.
    if isinstance(size_str, (int, float)):
        return int(size_str)
    s = size_str.strip().upper()
    # Order matters: "KB"/"MB"/"GB"/"TB" all end with "B", so the bare
    # byte suffix has to be checked last.
    for suffix, factor in (
        ("KB", 1024),
        ("MB", 1024 ** 2),
        ("GB", 1024 ** 3),
        ("TB", 1024 ** 4),
        ("B", 1),
    ):
        if s.endswith(suffix):
            return int(float(s[:-len(suffix)]) * factor)
    return int(s)
 # ======================================================
 # WORKER: Runs in thread
 # ======================================================
 def process_one_file(row):
    """
    Hash one file and store the result; intended to run in a worker thread.

    row: mapping with the keys "id", "path", "modified", "content_md5"
         and "md5_calculated" (one row of the `files` table).

    Returns a tuple (file_id, status, payload):
        (id, "missing", None)     - path is not a regular file on disk
        (id, "skip", None)        - stored MD5 exists and is not stale
        (id, "updated", md5_hex)  - MD5 computed and written to the DB
        (id, "error", message)    - the DB update failed

    Errors while reading/hashing the file are not caught here and
    propagate to the caller.
    """
    file_id = row["id"]
    path = row["path"]
    modified = row["modified"]
    prev_md5 = row["content_md5"]
    prev_calc = row["md5_calculated"]
    # --- Skip if file does not exist ---
    if not os.path.isfile(path):
        return (file_id, "missing", None)
    # --- Decide if MD5 needed: no hash yet, never timestamped, or stale ---
    need_md5 = (
        prev_md5 is None or
        prev_calc is None or
        prev_calc < modified
    )
    if not need_md5:
        return (file_id, "skip", None)
    # --- Calculate MD5 ---
    new_md5 = file_md5(path)
    now = datetime.now().replace(microsecond=0)
    # --- Update DB inside thread (own connection per call) ---
    try:
        conn = get_db_connection()
        try:
            c = conn.cursor()
            try:
                c.execute("""
                    UPDATE files
                    SET content_md5=%s,
                        md5_calculated=%s
                    WHERE id=%s
                """, (new_md5, now, file_id))
                conn.commit()
            finally:
                # Release cursor and connection even when execute/commit
                # fails; previously both leaked on any DB error.
                c.close()
        finally:
            conn.close()
        return (file_id, "updated", new_md5)
    except Exception as e:
        return (file_id, "error", str(e))
 # ======================================================
 # MAIN LOGIC (single-threaded DB query + multi-threaded MD5)
 # ======================================================
 def run_md5_calculator(device_name=None,
                       device_id=None,
                       extension=".pdf",
                       max_size="50MB",
                       path_prefix=None,
                       threads=8):
    """
    Select files from the `files` table and hash them in a thread pool.

    device_name / device_id: one of them is required; pass "ANY" for either
        to disable the device filter. A device_name is resolved to its id
        through the `devices` table.
    extension: path suffix to match (e.g. ".pdf"), or "ANY" for no filter.
    max_size: upper size limit understood by parse_size, or "ANY".
    path_prefix: only paths starting with this prefix (trailing slashes and
        backslashes are stripped); None, "" or "ANY" disables the filter.
    threads: max_workers of the ThreadPoolExecutor.

    The candidate rows are fetched with one query; each row is then handed
    to process_one_file, and a summary of the outcomes is printed.

    Raises RuntimeError when neither device_name nor device_id is given, or
    when device_name is not found.
    """
    # ----------------------------
    # DEVICE filter resolution
    # ----------------------------
    filter_by_device = True
    if device_name == "ANY" or device_id == "ANY":
        filter_by_device = False
    elif device_id is None:
        if device_name is None:
            raise RuntimeError("You must provide device_name or device_id")
        conn = get_db_connection()
        try:
            cur = conn.cursor(dictionary=True)
            try:
                cur.execute("SELECT id FROM devices WHERE name=%s", (device_name,))
                row = cur.fetchone()
            finally:
                cur.close()
        finally:
            # Close the lookup connection even when the query fails.
            conn.close()
        if not row:
            raise RuntimeError(f"Device '{device_name}' not found")
        device_id = row["id"]
    # EXTENSION filter
    filter_by_extension = (extension != "ANY")
    # SIZE filter
    filter_by_size = (max_size != "ANY")
    max_bytes = parse_size(max_size) if filter_by_size else None
    # PATH filter
    filter_by_path = (path_prefix not in [None, "", "ANY"])
    cleaned_prefix = path_prefix.rstrip("\\/") if filter_by_path else None
    print(f"\n🔍 Filtering:"
          f" device={'ANY' if not filter_by_device else device_id},"
          f" ext={extension},"
          f" max_size={max_size},"
          f" prefix={path_prefix}\n")
    # ---------------------------------------
    # Build the WHERE clause from the active filters
    # ---------------------------------------
    # NOTE(review): extension and path_prefix go into LIKE patterns unescaped,
    # so "%" and "_" inside them act as wildcards, and a backslash is the
    # default LIKE escape character in MySQL - confirm this is acceptable for
    # Windows-style prefixes.
    where = ["deleted = 0"]
    params = []
    if filter_by_device:
        where.append("device_id=%s")
        params.append(device_id)
    if filter_by_extension:
        where.append("path LIKE %s")
        params.append("%" + extension)
    if filter_by_size:
        where.append("size <= %s")
        params.append(max_bytes)
    if filter_by_path:
        where.append("path LIKE %s")
        params.append(cleaned_prefix + "%")
    # Only fixed fragments are joined into the SQL text; all values are
    # passed as bound parameters.
    sql = f"""
        SELECT id, path, size, modified, content_md5, md5_calculated
        FROM files
        WHERE {" AND ".join(where)}
    """
    # ---------------------------------------
    # Fetch all rows in a single DB query
    # ---------------------------------------
    conn = get_db_connection()
    try:
        cursor = conn.cursor(dictionary=True)
        try:
            cursor.execute(sql, params)
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn.close()
    total = len(rows)
    print(f"📁 Files matching criteria: {total}\n")
    # ======================================================
    # === MULTITHREADED MD5 CALCULATION BELOW ============
    # ======================================================
    updated = 0
    skipped = 0
    missing = 0
    errors = 0
    with ThreadPoolExecutor(max_workers=threads) as exe:
        # Map each future back to its file id so failures can be reported.
        futures = {exe.submit(process_one_file, r): r["id"] for r in rows}
        for future in as_completed(futures):
            file_id = futures[future]
            try:
                file_id, status, result = future.result()
            except Exception as e:
                # Anything the worker did not turn into a status tuple
                # (e.g. an unreadable file) ends up here.
                print(f"❌ Thread error for ID {file_id}: {e}")
                errors += 1
                continue
            if status == "updated":
                updated += 1
            elif status == "skip":
                skipped += 1
            elif status == "missing":
                missing += 1
            elif status == "error":
                errors += 1
                print(f"⚠️ DB update error: {result}")
    # ======================================================
    # SUMMARY
    # ======================================================
    print("\n============================")
    print("✅ Multithreaded MD5 finished")
    print("============================")
    print(f"Updated: {updated}")
    print(f"Skipped: {skipped}")
    print(f"Missing: {missing}")
    print(f"Errors:  {errors}")
    print(f"Threads: {threads}\n")
 # ======================================================
 # RUN EXAMPLE
 # ======================================================
 if __name__ == "__main__":
    # Example run: every non-deleted file of device "TOWER", with the
    # extension, size and path filters all disabled via "ANY".
    example_options = {
        "device_name": "TOWER",
        "extension": "ANY",
        "max_size": "ANY",
        "path_prefix": "ANY",
        "threads": 12,      # ← ADJUST THREAD COUNT HERE
    }
    run_md5_calculator(**example_options)