z230
This commit is contained in:
121
ECG/30 ECG test3.py
Normal file
121
ECG/30 ECG test3.py
Normal file
@@ -0,0 +1,121 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import fitz
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import io
|
||||
import pytesseract
|
||||
from pathlib import Path
|
||||
from PIL import Image
|
||||
|
||||
# --- CONFIGURATION ---
# Folder that is scanned (non-recursively) for the PDF files to process.
BASE_DIR = Path(r"u:\Dropbox\Ordinace\Dokumentace_ke_zpracování\EKGforProcessing")
# Marker stored in a PDF's "keywords" metadata once its first page has been
# rotated, so that a later run does not rotate the same file again.
FLAG = "rotated-by-script"

# Point to your Tesseract executable if it's not in your PATH.
# The override is applied only when the file actually exists; otherwise
# pytesseract keeps its own default command, so an installation that is
# reachable through PATH continues to work.
_TESSERACT_EXE = Path(r'C:\Program Files\Tesseract-OCR\tesseract.exe')
if _TESSERACT_EXE.is_file():
    pytesseract.pytesseract.tesseract_cmd = str(_TESSERACT_EXE)
|
||||
|
||||
def ocr_page(page, *, dpi=300, lang='ces', psm=6):
    """Render a PDF page to a bitmap and return the text Tesseract reads from it.

    Args:
        page: Page object to rasterise; it must provide ``get_pixmap``.
        dpi: Rendering resolution. The default of 300 is deliberately high:
            it is crucial for the small text on EKG strips.
        lang: Tesseract language code; ``'ces'`` selects Czech.
        psm: Tesseract page segmentation mode; 6 assumes a single uniform
            block of text.

    Returns:
        The recognised text as returned by ``pytesseract.image_to_string``.
    """
    pix = page.get_pixmap(dpi=dpi)
    # NOTE(review): mode "RGB" assumes the pixmap holds three 8-bit channels
    # without alpha (the render settings used here) — revisit if colorspace
    # or alpha arguments are ever passed to get_pixmap.
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    return pytesseract.image_to_string(img, lang=lang, config=f'--psm {psm}')
|
||||
|
||||
|
||||
def extract_rodne_cislo(text):
    """Find a birth number (rodné číslo, RC) in OCR text.

    An RC is recognised as 6 digits, an optional slash (optionally surrounded
    by whitespace) and a 3- or 4-digit suffix. The match must not be part of
    a longer run of digits, so fragments of phone numbers, record IDs and the
    like are not mistaken for an RC.

    Args:
        text: OCR output to search. ``None`` or an empty string yields
            ``None``.

    Returns:
        The RC as a 10-character digit string without the slash (a 3-digit
        suffix is left-padded with a zero), or ``None`` when nothing matches.
    """
    if not text:
        return None
    # The lookarounds reject candidates glued to further digits on either side.
    found = re.search(r"(?<!\d)(\d{6})\s*/?\s*(\d{3,4})(?!\d)", text)
    if found is None:
        return None
    prefix, suffix = found.groups()
    return prefix + suffix.zfill(4)
|
||||
|
||||
|
||||
def extract_date(text):
    """Find the first valid calendar date in OCR text.

    Accepts D.M.YYYY, D. M. YYYY, DD.MM.YYYY and similar: one- or two-digit
    day and month, optional whitespace after each separator, and a comma
    where OCR misread the dot. Candidates that are glued to further digits,
    or that do not form a real calendar date (e.g. 45.13.2024), are skipped
    and the search continues with the next candidate.

    Args:
        text: OCR output to search. ``None`` or an empty string yields
            ``None``.

    Returns:
        A ``(day, month, year)`` tuple of digit strings exactly as they
        appear in ``text``, or ``None`` when no valid date is found.
    """
    # Local import so the function does not depend on a file-level import.
    from datetime import date

    if not text:
        return None
    pattern = r"(?<!\d)(\d{1,2})[.,]\s*(\d{1,2})[.,]\s*(\d{4})(?!\d)"
    for found in re.finditer(pattern, text):
        day, month, year = found.groups()
        try:
            date(int(year), int(month), int(day))
        except ValueError:
            # Not a real date — most likely an OCR artefact; keep looking.
            continue
        return found.groups()
    return None
|
||||
|
||||
|
||||
def convert_date_to_iso(date_tuple):
    """Format a (day, month, year) triple of digit strings as 'YYYY-MM-DD'.

    Day and month are left-padded with zeros to two characters; the year is
    inserted as given.
    """
    day, month, year = date_tuple
    padded_month = month.zfill(2)
    padded_day = day.zfill(2)
    return "{}-{}-{}".format(year, padded_month, padded_day)
|
||||
|
||||
|
||||
# --- MAIN PROCESS ---


def _rotate_once(pdf_path):
    """Rotate the first page by 90 degrees unless the file is already flagged.

    A file counts as processed when FLAG occurs in its "keywords" metadata.
    Otherwise the first page is rotated, the second page (index 1) is removed
    if the document has more than one page, FLAG is appended to the keywords,
    and the result is written to a sibling ``.tmp.pdf`` file that then
    replaces the original.
    """
    with fitz.open(pdf_path) as doc:
        meta = doc.metadata
        keywords = meta.get("keywords", "") or ""
        if FLAG in keywords:
            return

        print(" [Action] Rotating page...")
        page = doc[0]
        page.set_rotation((page.rotation + 90) % 360)
        if doc.page_count > 1:
            doc.delete_page(1)
        meta["keywords"] = (keywords + " " + FLAG).strip()
        doc.set_metadata(meta)
        tmp = pdf_path.with_suffix(".tmp.pdf")
        doc.save(tmp, deflate=True)
    # The document is closed at this point, so the original can be replaced.
    os.replace(tmp, pdf_path)


def _ocr_first_page(pdf_path):
    """Return the OCR text of the first page; the file is closed afterwards."""
    with fitz.open(pdf_path) as doc:
        return ocr_page(doc[0])


def _rename_with_extracted_data(pdf_path, rc, date_tuple):
    """Rename the PDF to '<rc> <ISO date> [EKG] [bez hodnocení].pdf'.

    An existing target is never overwritten; the file is skipped instead.
    Failures of the rename itself are reported and do not stop the batch.
    """
    date_iso = convert_date_to_iso(date_tuple)
    new_name = f"{rc} {date_iso} [EKG] [bez hodnocení].pdf"
    new_path = pdf_path.with_name(new_name)

    if new_path.exists():
        print(f"⚠ Skipping: {new_name} already exists.")
        return

    try:
        # No document handle is open here, so the rename is not blocked by us.
        pdf_path.rename(new_path)
    except OSError as e:
        print(f"❌ Rename error: {e}")
    else:
        print(f"✅ Success: Renamed to {new_name}")


# Take a snapshot of the directory listing before doing any work: the loop
# creates temporary files and renames PDFs inside BASE_DIR, and those changes
# must not feed back into the iteration.
for pdf_path in sorted(BASE_DIR.glob("*.pdf")):
    # Skip temporary files, e.g. leftovers of an interrupted earlier run.
    if ".tmp" in pdf_path.name:
        continue

    print(f"\n{'=' * 60}")
    print(f"PROCESSING: {pdf_path.name}")
    print(f"{'=' * 60}")

    # 1. HANDLE ROTATION
    _rotate_once(pdf_path)

    # 2. PERFORM OCR
    raw_text = _ocr_first_page(pdf_path)

    # --- DEBUG PRINTOUT ---
    print("\n--- START OF EXTRACTED TEXT ---")
    print(raw_text)
    print("--- END OF EXTRACTED TEXT ---\n")
    # ---------------------

    rc = extract_rodne_cislo(raw_text)
    date_tuple = extract_date(raw_text)

    print(f"RESULT -> RC: {rc if rc else 'NOT FOUND'}")
    print(f"RESULT -> Date: {date_tuple if date_tuple else 'NOT FOUND'}")

    # 3. RENAME LOGIC
    if rc and date_tuple:
        _rename_with_extracted_data(pdf_path, rc, date_tuple)
    else:
        print("❌ Script could not find all data. Check the extracted text above.")

print("\nAll files processed.")
|
||||
Reference in New Issue
Block a user