Merge remote-tracking branch 'origin/master'

This commit is contained in:
2025-12-13 07:47:48 +01:00
4 changed files with 330 additions and 3 deletions

View File

@@ -0,0 +1,45 @@
import fitz
from pathlib import Path
# Folder whose top-level *.pdf files main() flattens; each output is written
# next to its input inside this same folder.
BASE_DIR = Path(r"z:\Dropbox\Ordinace\Dokumentace_ke_zpracování\AdobeFlattenStamp")
def flatten_pdf_rasterize(input_pdf: Path, dpi: int = 400) -> None:
    """Flatten a PDF by replacing every page with a rendered image of itself.

    Each page of *input_pdf* is rendered to a pixmap and placed on a new page
    of the same size in a fresh document, which is then saved next to the
    input as ``<stem>_flatten.pdf``.

    Args:
        input_pdf: Path of the PDF to flatten.
        dpi: Resolution used when rendering each page. Defaults to 400, the
            value that was previously hard-coded.
    """
    print(f"Processing: {input_pdf.name}")
    output_pdf = input_pdf.with_name(input_pdf.stem + "_flatten.pdf")

    doc = fitz.open(input_pdf)
    try:
        # Create a new empty PDF that receives the rasterized pages.
        new_doc = fitz.open()
        try:
            for page in doc:
                # Render the page to an image at the requested resolution.
                pix = page.get_pixmap(dpi=dpi)
                # Create a new PDF page with the same size as the source page.
                new_page = new_doc.new_page(
                    width=page.rect.width, height=page.rect.height
                )
                # Insert the rasterized image so that it fills the whole page.
                new_page.insert_image(new_page.rect, pixmap=pix)
            new_doc.save(output_pdf, deflate=True)
        finally:
            # Close both documents even when rendering or saving fails, so the
            # file handles are released before the next file is processed.
            new_doc.close()
    finally:
        doc.close()

    print(f" ✔ Saved: {output_pdf.name}")
def main():
    """Flatten every source PDF found directly inside BASE_DIR.

    Files whose stem already ends with ``_flatten`` are outputs of an earlier
    run (flatten_pdf_rasterize writes ``<stem>_flatten.pdf`` into the same
    folder), so they are skipped instead of being rasterized again into
    ``<stem>_flatten_flatten.pdf``.
    """
    pdfs = sorted(
        pdf for pdf in BASE_DIR.glob("*.pdf")
        if not pdf.stem.endswith("_flatten")
    )
    if not pdfs:
        print("No PDF files found.")
        return
    for pdf in pdfs:
        flatten_pdf_rasterize(pdf)
    print("\nAll files processed.")


if __name__ == "__main__":
    main()

View File

@@ -26,9 +26,22 @@ def ocr_page(page):
def extract_rodne_cislo(text):
    """Find the first rodné číslo in *text* and return it as 10 digits.

    Recognized forms are six digits, an optional slash, and three or four
    more digits:

    - ``655527/1910`` -> ``6555271910``
    - ``655527/910``  -> ``6555270910``
    - ``6555271910``  -> ``6555271910``

    A three-digit suffix is left-padded with a zero so that the result always
    has ten digits and no slash. Returns ``None`` when nothing matches.
    """
    match = re.search(r"\b(\d{6})/?(\d{3,4})\b", text)
    if match is None:
        return None
    prefix, suffix = match.groups()
    # Right-align the suffix in a 4-character field filled with zeros.
    return f"{prefix}{suffix:0>4}"
def extract_date(text): def extract_date(text):

View File

@@ -0,0 +1,111 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import hashlib
import json
from pathlib import Path
import time
# === ONLY ADD DIRECTORIES HERE ===
# Root folders handled by main(); process_directory() scans each one
# recursively and keeps a processed_files.json database inside it.
DIRECTORIES = [
Path(r"U:\Dropbox\Ordinace\Dokumentace_zpracovaná"),
Path(r"U:\Dropbox\Ordinace\LAB-PDF"),
]
# Number of bytes md5_file() reads from a file per read call.
CHUNK = 65536
def md5_file(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is opened in binary mode and fed to the hash CHUNK bytes at a
    time, so the whole file is never held in memory at once.
    """
    digest = hashlib.md5()
    with path.open("rb") as stream:
        while True:
            block = stream.read(CHUNK)
            if not block:
                # An empty read means end of file.
                break
            digest.update(block)
    return digest.hexdigest()
def load_db(db_path: Path) -> dict:
    """Read the JSON database stored at *db_path*.

    Returns the decoded JSON content, or an empty dict when the file does not
    exist yet.
    """
    if not db_path.exists():
        return {}
    with db_path.open("r", encoding="utf-8") as handle:
        content = json.load(handle)
    return content
def save_db(db: dict, db_path: Path) -> None:
    """Write *db* to *db_path* as pretty-printed UTF-8 JSON.

    The database is serialized to a string before the target file is opened.
    Opening with mode "w" truncates the file, so serializing first means a
    value that cannot be encoded raises without touching the database that is
    already on disk.

    Raises:
        TypeError: If *db* contains a value that json cannot serialize.
        OSError: If the file cannot be written.
    """
    payload = json.dumps(db, ensure_ascii=False, indent=2)
    with db_path.open("w", encoding="utf-8") as f:
        f.write(payload)
def process_directory(root: Path):
    """Bring the MD5 database of *root* up to date with the file system.

    Every file below *root* (recursively, ``.json`` files excluded) is compared
    with its record in ``root / "processed_files.json"``. Files without a
    record, or whose size or whole-second modification time differ from the
    record, are hashed again with md5_file(); the database is then written
    back with save_db().

    Records are keyed by the file's path relative to *root* in POSIX form.
    Keying by bare file name, as before, made same-named files in different
    subfolders overwrite each other during the scan, so only one of them was
    ever hashed. Files directly inside *root* keep the same key as before;
    a record stored under a bare name for a file in a subfolder is moved to
    the new key without re-hashing when its size and mtime still match.
    """
    print("\n===========================================")
    print(f"📁 ZPRACOVÁVÁM ADRESÁŘ: {root}")
    print("===========================================\n")

    db_path = root / "processed_files.json"

    # Load the database.
    db = load_db(db_path)
    print(f"Načteno z DB: {len(db)} záznamů")

    # Walk the file system.
    files_in_fs = {}
    start_scan = time.time()
    for f in root.rglob("*"):
        if f.is_file() and f.suffix.lower() != ".json":
            stat = f.stat()
            files_in_fs[f.relative_to(root).as_posix()] = {
                "size": stat.st_size,
                "mtime": int(stat.st_mtime),
                "path": str(f),
                "name": f.name,
            }
    print(f"Nalezeno v FS: {len(files_in_fs)} souborů")
    print(f"Čas skenu: {time.time() - start_scan:.2f} s\n")

    new_files = 0
    changed_files = 0

    for key, info in files_in_fs.items():
        size = info["size"]
        mtime = info["mtime"]
        entry = db.get(key)

        if entry is None:
            # Record written under the bare file name by the earlier key
            # scheme? Reuse it only when no file directly inside root owns
            # that key and the stored size/mtime still describe this file.
            legacy_key = info["name"]
            legacy = db.get(legacy_key)
            if (
                legacy is not None
                and legacy_key not in files_in_fs
                and legacy.get("size") == size
                and legacy.get("mtime") == mtime
            ):
                db[key] = db.pop(legacy_key)
                continue
            # New file.
            print(f"Nový soubor → MD5: {key}")
            new_files += 1
        elif entry.get("size") == size and entry.get("mtime") == mtime:
            # Unchanged since the last run; keep the stored hash.
            continue
        else:
            # Changed file.
            print(f"Změněný soubor → MD5: {key}")
            changed_files += 1

        db[key] = {
            "size": size,
            "mtime": mtime,
            "md5": md5_file(Path(info["path"])),
        }

    # Save the database.
    save_db(db, db_path)

    print("\n=== Výsledky ===")
    print(f" Nové soubory: {new_files}")
    print(f" Změněné soubory: {changed_files}")
    print(f" Celkem v DB: {len(db)}")
    print(f" Databáze: {db_path}")
    print("=============================\n")
def main():
    """Run process_directory() for every configured directory.

    A directory that is not currently available (for example an unmounted
    drive) is reported and skipped, so it cannot abort the run before the
    remaining directories have been processed.
    """
    for directory in DIRECTORIES:
        if not directory.is_dir():
            print(f"⚠ Adresář nenalezen, přeskakuji: {directory}")
            continue
        process_directory(directory)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,158 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import hashlib
import json
from pathlib import Path
import time
import traceback
# ======= CONFIG =======
# Tree whose files run_matcher() hashes and compares with the known MD5s.
MP_DIR = Path(r"U:\Dropbox\Ordinace\Dokumentace_ke_zpracování\MP")
# JSON databases read by load_all_md5(); every record is expected to carry
# an "md5" field.
JSON_PATHS = [
Path(r"U:\Dropbox\Ordinace\LAB-PDF\processed_files.json"),
Path(r"U:\Dropbox\Ordinace\Dokumentace_zpracovaná\processed_files.json"),
]
# Number of bytes md5_file() reads from a file per read call.
CHUNK = 65536
# run_matcher() prints a progress line every PRINT_EVERY checked files.
PRINT_EVERY = 50
# ======================
def try_rename(old_path: Path, new_path: Path, retries: int = 5, delay: float = 5) -> bool:
    """Rename *old_path* to *new_path*, retrying when the rename fails.

    Args:
        old_path: Existing file or directory.
        new_path: Desired new path.
        retries: Maximum number of rename attempts.
        delay: Seconds to sleep between two attempts.

    Returns:
        True when the rename succeeded, False when every attempt failed, when
        the target already exists, or when *retries* is smaller than 1.
    """
    for attempt in range(1, retries + 1):
        try:
            old_path.rename(new_path)
            return True
        except FileExistsError as e:
            # Waiting does not make an existing target go away, so give up
            # immediately instead of sleeping through the remaining attempts.
            print(f"⚠ Rename failed ({attempt}/{retries}): {e}")
            print(" ❌ Target already exists. Skipping.")
            return False
        except OSError as e:
            # Only operating-system errors are retried; anything else is a
            # programming error and is allowed to propagate.
            print(f"⚠ Rename failed ({attempt}/{retries}): {e}")
            if attempt < retries:
                print(f" Waiting {delay}s before retry...")
                time.sleep(delay)
            else:
                print(" ❌ Maximum retries reached. Skipping.")
                traceback.print_exc()
    return False
def md5_file(path: Path) -> str:
    """Return the hexadecimal MD5 digest of the file at *path*.

    The file is opened in binary mode and fed to the hash CHUNK bytes at a
    time, so the whole file is never held in memory at once.
    """
    digest = hashlib.md5()
    with path.open("rb") as stream:
        while True:
            block = stream.read(CHUNK)
            if not block:
                # An empty read means end of file.
                break
            digest.update(block)
    return digest.hexdigest()
def load_all_md5(json_paths):
    """Collect the MD5 hashes stored in all given JSON databases.

    Each database is expected to be a JSON object whose values are records
    with an "md5" field. A missing or unreadable file is reported and
    skipped. A record without an "md5" value is skipped on its own, so it no
    longer discards the remaining records of the same file.

    Args:
        json_paths: Iterable of paths to the JSON databases.

    Returns:
        Set with every MD5 hash found.
    """
    md5_set = set()
    for jp in json_paths:
        if not jp.exists():
            print(f"⚠ JSON nenalezen: {jp}")
            continue
        try:
            with jp.open("r", encoding="utf-8") as f:
                db = json.load(f)
            if not isinstance(db, dict):
                raise ValueError("top-level JSON value is not an object")
        except (OSError, ValueError) as e:
            # ValueError also covers json.JSONDecodeError and decoding errors.
            print(f"❌ Chyba při čtení {jp}: {e}")
            continue

        skipped = 0
        for info in db.values():
            md5 = info.get("md5") if isinstance(info, dict) else None
            if md5 is None:
                skipped += 1
                continue
            md5_set.add(md5)
        print(f"Načteno {len(db)} záznamů z {jp}")
        if skipped:
            print(f"⚠ Přeskočeno {skipped} záznamů bez MD5 v {jp}")
    print(f"➡ Celkem MD5 hashů: {len(md5_set)}\n")
    return md5_set
def mark_folders_if_all_marked(root: Path, dryrun: bool):
    """Mark every folder below *root* whose direct files are all marked.

    A file counts as marked when its name starts with the marker character.
    A folder is marked by inserting the marker at index 10 of its name (the
    11th character), or by appending it when the name is not longer than 10
    characters. Folders without direct files and folders that already carry
    the marker are left alone.

    Args:
        root: Top of the tree to inspect.
        dryrun: When true, only print what would be renamed.
    """
    # NOTE(review): the marker literals were empty strings in the reviewed
    # source, which made startswith("") always true, name[10] == "" never true
    # and the rename a no-op. "▲" is taken from the original inline comment
    # ("insert ▲ at the 11th position") - confirm it matches the prefix that
    # run_matcher puts on files that are already marked.
    marker = "▲"
    insert_pos = 10

    print("\n=== KONTROLA ADRESÁŘŮ — OZNAČENÍ PLNĚ HOTOVÝCH ===")

    # Reverse path order visits children before their parents, so renaming a
    # parent can no longer invalidate child paths collected by this scan.
    for folder in sorted(root.rglob("*"), reverse=True):
        if not folder.is_dir():
            continue

        files = [f for f in folder.iterdir() if f.is_file()]
        if not files:
            continue

        # Are all files marked?
        if not all(f.name.startswith(marker) for f in files):
            continue

        # Folder already marked? Either the marker sits at the insert
        # position, or it was appended to a name shorter than that position.
        name = folder.name
        appended = len(name) - len(marker) < insert_pos and name.endswith(marker)
        if name.startswith(marker, insert_pos) or appended:
            continue

        # Insert the marker at the 11th position.
        if len(name) <= insert_pos:
            new_name = name + marker
        else:
            new_name = name[:insert_pos] + marker + name[insert_pos:]

        new_path = folder.parent / new_name
        print(f"✔ Adresář označen: {folder.name} → {new_name}")
        if not dryrun:
            try_rename(folder, new_path)
def run_matcher(dryrun: bool = True):
    """Mark files in MP_DIR whose content is already in the hash databases.

    Every file below MP_DIR is hashed with md5_file(). When the hash is in the
    set returned by load_all_md5(JSON_PATHS), the file is renamed so that its
    name starts with the marker character. Afterwards
    mark_folders_if_all_marked() is run over the same tree.

    Args:
        dryrun: When true, only print what would be renamed.
    """
    # NOTE(review): the marker literals were empty strings in the reviewed
    # source, so startswith("") skipped every file and nothing was ever
    # renamed. "▲" is the character named in the folder-marking comment of
    # this file - confirm it matches files that are already marked.
    marker = "▲"

    print("\n=== MATCHER V3 — SOUBORY + ADRESÁŘE ===")
    print(f"Režim: {'DRYRUN (simulace)' if dryrun else 'OSTRÝ'}\n")

    all_md5 = load_all_md5(JSON_PATHS)

    counter = 0
    renamed = 0
    start = time.time()

    for file in MP_DIR.rglob("*"):
        if not file.is_file():
            continue

        counter += 1
        if counter % PRINT_EVERY == 0:
            elapsed = time.time() - start
            # A coarse clock can report zero elapsed time.
            speed = counter / elapsed if elapsed > 0 else 0
            print(f" {counter} soub. ({speed:.1f}/s)")

        # Check the cheap name prefix first: already marked files do not need
        # to be read and hashed again.
        if file.name.startswith(marker):
            continue
        if md5_file(file) not in all_md5:
            continue

        new_name = marker + file.name
        new_path = file.parent / new_name

        if dryrun:
            print(f"[DRYRUN] Označil bych: {file.name} → {new_name}")
        elif try_rename(file, new_path):
            # Rename with retries (files).
            renamed += 1
            print(f"{file.name} → {new_name}")

    # Mark the folders.
    mark_folders_if_all_marked(MP_DIR, dryrun)

    total_time = time.time() - start
    speed = 0 if counter == 0 or total_time == 0 else counter / total_time

    print("\n=== MATCHER HOTOVO ===")
    print(f" Zkontrolováno: {counter} souborů")
    print(f" Označeno: {renamed}")
    print(f" Rychlost: {speed:.1f} soub./s")
    print(f" Režim: {'DRYRUN' if dryrun else 'OSTRÝ'}\n")


if __name__ == "__main__":
    run_matcher(dryrun=False)