Merge remote-tracking branch 'origin/master'

2025-12-13 07:47:48 +01:00
parent dafae21aa3 dcc127dc7a
commit f3d8d685c2
4 changed files with 330 additions and 3 deletions
--- a/FlattenAdobe.py
+++ b/FlattenAdobe.py
@@ -0,0 +1,45 @@
 import fitz
 from pathlib import Path
 # Folder that main() scans for "*.pdf" inputs; flattened copies are written
 # next to the originals in the same folder.
 BASE_DIR = Path(r"z:\Dropbox\Ordinace\Dokumentace_ke_zpracování\AdobeFlattenStamp")
 def flatten_pdf_rasterize(input_pdf: Path, dpi: int = 400):
    """Write a rasterized ("flattened") copy of a PDF next to the original.

    Each page is rendered to a bitmap and placed on a fresh page of the same
    size, so the output contains one image per page and nothing else.
    The result is saved as ``<stem>_flatten.pdf`` in the folder of the input.

    Args:
        input_pdf: Path of the PDF to flatten.
        dpi: Resolution used to render each page (default 400).
    """
    print(f"Processing: {input_pdf.name}")
    output_pdf = input_pdf.with_name(input_pdf.stem + "_flatten.pdf")
    doc = fitz.open(input_pdf)
    try:
        # New empty PDF that receives the rendered pages
        new_doc = fitz.open()
        try:
            for page in doc:
                # Render the page to an image at the requested resolution
                pix = page.get_pixmap(dpi=dpi)
                # Target page keeps the size of the source page
                new_page = new_doc.new_page(width=page.rect.width, height=page.rect.height)
                new_page.insert_image(new_page.rect, pixmap=pix)
            new_doc.save(output_pdf, deflate=True)
        finally:
            # Close even when rendering or saving fails, so no handle stays open
            new_doc.close()
    finally:
        doc.close()
    print(f"   ✔ Saved: {output_pdf.name}")
 def main():
    """Flatten every PDF in BASE_DIR that is not itself a flattened output."""
    # Skip files produced by an earlier run; otherwise a rerun would rasterize
    # the outputs again and create "<name>_flatten_flatten.pdf" files.
    pdfs = sorted(
        pdf for pdf in BASE_DIR.glob("*.pdf")
        if not pdf.stem.endswith("_flatten")
    )
    if not pdfs:
        print("No PDF files found.")
        return
    for pdf in pdfs:
        flatten_pdf_rasterize(pdf)
    print("\nAll files processed.")
 # Run the batch only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
--- a/test2.py
+++ b/test2.py
@@ -26,9 +26,22 @@ def ocr_page(page):
 def extract_rodne_cislo(text):
    """
    Extract a rodné číslo (Czech birth number) from text.

    Accepted formats:
    - 6 digits + slash + 4 digits  → 655527/1910
    - 6 digits + slash + 3 digits  → 655527/910
    - 9 or 10 digits without slash → 6555271910

    Returns the first match as 10 digits without the slash (a 3-digit suffix
    is left-padded with a zero), or None when nothing matches.
    """
    m = re.search(r"\b(\d{6})/?(\d{3,4})\b", text)
    if not m:
        return None
    left = m.group(1)
    # NOTE(review): a 3-digit suffix is padded on the left, so "655527/910"
    # becomes "6555270910" — confirm that downstream matching expects this.
    right = m.group(2).zfill(4)  # ensure 4 digits
    return left + right
 def extract_date(text):
--- a/SpočítejMD5ZpracoveneALAB.py
+++ b/SpočítejMD5ZpracoveneALAB.py
@@ -0,0 +1,111 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import hashlib
 import json
 from pathlib import Path
 import time
 # === JUST ADD DIRECTORIES HERE ===
 # Each listed directory is scanned recursively and gets its own
 # "processed_files.json" database stored directly inside it.
 DIRECTORIES = [
    Path(r"U:\Dropbox\Ordinace\Dokumentace_zpracovaná"),
    Path(r"U:\Dropbox\Ordinace\LAB-PDF"),
 ]
 # Number of bytes read per step when hashing a file.
 CHUNK = 65536
 def md5_file(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is read in CHUNK-sized pieces, so it is never loaded into memory
    as a whole.
    """
    digest = hashlib.md5()
    with path.open("rb") as stream:
        while True:
            block = stream.read(CHUNK)
            if not block:
                # Empty read means end of file
                break
            digest.update(block)
    return digest.hexdigest()
 def load_db(db_path: Path) -> dict:
    """Load the JSON database of already-processed files.

    Args:
        db_path: Location of the JSON file.

    Returns:
        The parsed content, or an empty dict when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain valid JSON.
    """
    # Open directly instead of testing exists() first: the file could
    # disappear between the check and the open.
    try:
        with db_path.open("r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {}
 def save_db(db: dict, db_path: Path):
    """Write the database to *db_path* as indented UTF-8 JSON.

    The data is first written to a temporary sibling file and then moved over
    the target, so an interrupted or failing write cannot leave a truncated
    database behind.

    Args:
        db: Data to store; must be JSON-serializable.
        db_path: Destination file.
    """
    # Keep the original suffix at the end of the temporary name
    # ("processed_files.tmp.json"): the directory scan in this file ignores
    # "*.json", so a leftover temp file is never hashed as data.
    tmp_path = db_path.with_name(db_path.stem + ".tmp" + db_path.suffix)
    try:
        with tmp_path.open("w", encoding="utf-8") as f:
            json.dump(db, f, ensure_ascii=False, indent=2)
        tmp_path.replace(db_path)
    finally:
        # After a successful replace the temp file is gone; remove it only
        # when something failed before the move.
        if tmp_path.exists():
            tmp_path.unlink()
 def process_directory(root: Path):
    """Bring the MD5 database of one directory tree up to date.

    Scans *root* recursively (ignoring ``*.json`` files), hashes every file
    that is new or whose size/mtime differs from the stored record, and writes
    the result to ``<root>/processed_files.json``. Records of files that are
    no longer present are kept.

    Records are keyed by the bare file name, so files with the same name in
    different subfolders share one record (the last one scanned wins); the
    number of such collisions is reported.

    Args:
        root: Directory whose files should be indexed.
    """
    print("\n===========================================")
    print(f"📁 ZPRACOVÁVÁM ADRESÁŘ: {root}")
    print("===========================================\n")
    db_path = root / "processed_files.json"
    # Load the database
    db = load_db(db_path)
    print(f"Načteno z DB: {len(db)} záznamů")
    # Walk the file system
    files_in_fs = {}
    name_collisions = 0
    start_scan = time.time()
    for f in root.rglob("*"):
        try:
            if not f.is_file() or f.suffix.lower() == ".json":
                continue
            stat = f.stat()
        except OSError as e:
            # The file vanished or is inaccessible; keep scanning the rest
            print(f"⚠ Nelze načíst informace o souboru, přeskakuji: {f} ({e})")
            continue
        if f.name in files_in_fs:
            name_collisions += 1
        files_in_fs[f.name] = {
            "size": stat.st_size,
            "mtime": int(stat.st_mtime),
            "path": str(f),
        }
    print(f"Nalezeno v FS: {len(files_in_fs)} souborů")
    if name_collisions:
        print(f"⚠ Stejný název ve více podadresářích (přepsáno): {name_collisions}")
    print(f"Čas skenu: {time.time() - start_scan:.2f} s\n")
    new_files = 0
    changed_files = 0
    failed_files = 0
    try:
        for fname, info in files_in_fs.items():
            size = info["size"]
            mtime = info["mtime"]
            record = db.get(fname)
            if not isinstance(record, dict):
                is_new = True
            elif record.get("size") != size or record.get("mtime") != mtime:
                is_new = False
            else:
                # Unchanged file: the stored hash is still valid
                continue
            label = "Nový soubor" if is_new else "Změněný soubor"
            print(f"{label} → MD5: {fname}")
            try:
                md5 = md5_file(Path(info["path"]))
            except OSError as e:
                # A locked or vanished file must not abort the whole run
                failed_files += 1
                print(f"⚠ Nelze číst soubor, přeskakuji: {info['path']} ({e})")
                continue
            if is_new:
                new_files += 1
            else:
                changed_files += 1
            db[fname] = {"size": size, "mtime": mtime, "md5": md5}
    finally:
        # Persist what was hashed so far, even when the run is interrupted
        save_db(db, db_path)
    print("\n=== Výsledky ===")
    print(f" Nové soubory:     {new_files}")
    print(f" Změněné soubory:  {changed_files}")
    if failed_files:
        print(f" Chyby čtení:      {failed_files}")
    print(f" Celkem v DB:      {len(db)}")
    print(f" Databáze:         {db_path}")
    print("=============================\n")
 def main():
    """Update the MD5 database of every directory listed in DIRECTORIES."""
    for directory in DIRECTORIES:
        # A missing root (e.g. the drive is not mounted) would fail when the
        # database is written and stop the remaining directories as well.
        if not directory.is_dir():
            print(f"⚠ Adresář nenalezen, přeskakuji: {directory}")
            continue
        process_directory(directory)
 # Run the update only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
--- a/spočítejMD5AoznačsouboryAadresáře.py
+++ b/spočítejMD5AoznačsouboryAadresáře.py
@@ -0,0 +1,158 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 import hashlib
 import json
 from pathlib import Path
 import time
 import traceback
 # ======= CONFIG =======
 # Tree whose files are hashed and whose files/folders get the "▲" marker.
 MP_DIR = Path(r"U:\Dropbox\Ordinace\Dokumentace_ke_zpracování\MP")
 # JSON databases whose "md5" values are collected as the set of known hashes.
 JSON_PATHS = [
    Path(r"U:\Dropbox\Ordinace\LAB-PDF\processed_files.json"),
    Path(r"U:\Dropbox\Ordinace\Dokumentace_zpracovaná\processed_files.json"),
 ]
 # Number of bytes read per step when hashing a file.
 CHUNK = 65536
 # A progress line is printed after every PRINT_EVERY checked files.
 PRINT_EVERY = 50
 # ======================
 def try_rename(old_path: Path, new_path: Path, retries: int = 5, delay: int = 5):
    """Rename *old_path* to *new_path*, retrying on transient failures.

    An existing target is never overwritten, and errors that waiting cannot
    fix (missing source, existing target) are not retried.

    Args:
        old_path: Existing file or directory.
        new_path: Desired new path.
        retries: Maximum number of attempts.
        delay: Seconds to wait between attempts.

    Returns:
        True if the rename succeeded, False otherwise (also when *retries*
        is not positive).
    """
    for attempt in range(1, retries + 1):
        try:
            # Path.rename() silently replaces an existing file on POSIX;
            # refuse instead so nothing is clobbered.
            if new_path.exists():
                raise FileExistsError(f"Target already exists: {new_path}")
            old_path.rename(new_path)
            return True
        except (FileNotFoundError, FileExistsError) as e:
            print(f"⚠ Rename failed ({attempt}/{retries}): {e}")
            print("   ❌ Not retryable. Skipping.")
            return False
        except OSError as e:
            # Typically a temporary lock held by another process
            print(f"⚠ Rename failed ({attempt}/{retries}): {e}")
            if attempt < retries:
                print(f"   Waiting {delay}s before retry...")
                time.sleep(delay)
            else:
                print("   ❌ Maximum retries reached. Skipping.")
                traceback.print_exc()
                return False
    # Reached only when retries <= 0 and no attempt was made
    return False
 def md5_file(path: Path) -> str:
    """Compute the MD5 hex digest of a file, reading it CHUNK bytes at a time."""
    hasher = hashlib.md5()
    with path.open("rb") as fh:
        data = fh.read(CHUNK)
        # An empty read signals end of file
        while data != b"":
            hasher.update(data)
            data = fh.read(CHUNK)
    return hasher.hexdigest()
 def load_all_md5(json_paths):
    """Collect the MD5 hashes stored in all given JSON databases.

    Args:
        json_paths: Iterable of paths to JSON files, each mapping a key to a
            record that carries an "md5" entry.

    Returns:
        Set of all MD5 strings found. Missing or unreadable files and records
        without a usable "md5" value are reported and skipped.
    """
    md5_set = set()
    for jp in json_paths:
        if not jp.exists():
            print(f"⚠ JSON nenalezen: {jp}")
            continue
        try:
            with jp.open("r", encoding="utf-8") as f:
                db = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers invalid JSON as well as undecodable bytes
            print(f"❌ Chyba při čtení {jp}: {e}")
            continue
        if not isinstance(db, dict):
            print(f"❌ Chyba při čtení {jp}: očekáván objekt JSON")
            continue
        skipped = 0
        for info in db.values():
            md5 = info.get("md5") if isinstance(info, dict) else None
            if isinstance(md5, str) and md5:
                md5_set.add(md5)
            else:
                # One malformed record must not discard the rest of the file
                skipped += 1
        print(f"Načteno {len(db)} záznamů z {jp}")
        if skipped:
            print(f"⚠ Přeskočeno záznamů bez MD5: {skipped}")
    print(f"➡ Celkem MD5 hashů: {len(md5_set)}\n")
    return md5_set
 def mark_folders_if_all_marked(root: Path, dryrun: bool):
    """Mark every folder below *root* whose files are all marked with "▲".

    A folder qualifies when it directly contains at least one file and every
    such file name starts with "▲". It is marked by inserting "▲" at index 10
    of its name (appended when the name is shorter). Folders that already
    have "▲" at that index are left alone; *root* itself is never renamed.

    Args:
        root: Tree to inspect.
        dryrun: When true, only report what would be renamed.
    """
    print("\n=== KONTROLA ADRESÁŘŮ — OZNAČENÍ PLNĚ HOTOVÝCH ===")
    # Index at which the marker is inserted into the folder name
    insert_pos = 10
    folders = [p for p in root.rglob("*") if p.is_dir()]
    # Deepest folders first: renaming a parent before its children would
    # invalidate the child paths collected above and they would be skipped.
    folders.sort(key=lambda p: (-len(p.parts), str(p)))
    for folder in folders:
        files = [f for f in folder.iterdir() if f.is_file()]
        if not files:
            continue
        # Are all files marked?
        if not all(f.name.startswith("▲") for f in files):
            continue
        name = folder.name
        # Folder already marked?
        if name[insert_pos:insert_pos + 1] == "▲":
            continue
        # Slicing appends the marker when the name is shorter than insert_pos
        new_name = name[:insert_pos] + "▲" + name[insert_pos:]
        new_path = folder.parent / new_name
        if dryrun:
            print(f"[DRYRUN] Označil bych adresář: {name} → {new_name}")
        elif try_rename(folder, new_path):
            # Report success only after the rename really happened
            print(f"✔ Adresář označen: {name} → {new_name}")
 def run_matcher(dryrun: bool = True):
    """Mark files in MP_DIR whose MD5 is already known, then mark finished folders.

    Every file below MP_DIR that does not yet start with "▲" is hashed; when
    the hash is contained in the databases listed in JSON_PATHS, the file is
    renamed to "▲<name>". Afterwards folders whose files are all marked are
    marked as well.

    Args:
        dryrun: When true (default), only print what would be renamed.
    """
    print("\n=== MATCHER V3 — SOUBORY + ADRESÁŘE ===")
    print(f"Režim: {'DRYRUN (simulace)' if dryrun else 'OSTRÝ'}\n")
    all_md5 = load_all_md5(JSON_PATHS)
    counter = 0
    renamed = 0
    failed = 0
    start = time.time()
    # Take a snapshot of the tree first, because files are renamed inside the
    # loop and the listing should not change underneath the iteration.
    files = [p for p in MP_DIR.rglob("*") if p.is_file()]
    for file in files:
        counter += 1
        if counter % PRINT_EVERY == 0:
            elapsed = time.time() - start
            # The clock may not have advanced yet on a coarse timer
            speed = counter / elapsed if elapsed > 0 else 0
            print(f"  {counter} soub. ({speed:.1f}/s)")
        # Already marked: nothing to do, so skip the expensive hashing
        if file.name.startswith("▲"):
            continue
        try:
            md5 = md5_file(file)
        except OSError as e:
            # A locked or vanished file must not abort the whole run
            failed += 1
            print(f"⚠ Nelze číst soubor, přeskakuji: {file} ({e})")
            continue
        if md5 not in all_md5:
            continue
        new_name = "▲" + file.name
        new_path = file.parent / new_name
        if dryrun:
            print(f"[DRYRUN] Označil bych: {file.name} → {new_name}")
        elif try_rename(file, new_path):
            renamed += 1
            print(f"✔ {file.name} → {new_name}")
    # Mark the folders
    mark_folders_if_all_marked(MP_DIR, dryrun)
    total_time = time.time() - start
    speed = 0 if counter == 0 or total_time == 0 else counter / total_time
    print("\n=== MATCHER HOTOVO ===")
    print(f" Zkontrolováno: {counter} souborů")
    print(f" Označeno:     {renamed}")
    if failed:
        print(f" Chyby čtení:  {failed}")
    print(f" Rychlost:     {speed:.1f} soub./s")
    print(f" Režim:        {'DRYRUN' if dryrun else 'OSTRÝ'}\n")
 # NOTE: executing this file runs the matcher in live mode (dryrun=False), so
 # files and folders are really renamed; run_matcher() itself defaults to a dry run.
 if __name__ == "__main__":
    run_matcher(dryrun=False)