205 lines
6.9 KiB
Python
205 lines
6.9 KiB
Python
import fitz # PyMuPDF
|
|
import pytesseract
|
|
from PIL import Image
|
|
import io
|
|
import os
|
|
import re
|
|
import fdb # Firebird database library
|
|
|
|
# ==========================================
# CONFIGURATION
# ==========================================

# 1. File paths
INPUT_FOLDER = r'u:\Dropbox\Ordinace\Dokumentace_ke_zpracování\AdobeMakeSmaller'
OUTPUT_FOLDER = os.path.join(INPUT_FOLDER, '_HOTOVO_MEDICUS')

# 2. Tesseract OCR
PATH_TO_TESSERACT = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# 3. Output quality ("Ultimate" method)
VISUAL_DPI = 150  # resolution of the visible page image (small)
VISUAL_THRESHOLD = 150  # gray level above which a pixel becomes white
OCR_DPI = 300  # resolution of the image handed to OCR (large)

# 4. MEDICUS database (Firebird)
# Host, database path, user and password can be overridden through the
# MEDICUS_DB_* environment variables so that credentials do not have to be
# edited in (or committed with) the source; the defaults are the values the
# script has always used.
# NOTE(review): the default user/password look like Firebird's stock SYSDBA
# credentials - confirm and change them on the server.
DB_CONFIG = {
    'host': os.environ.get('MEDICUS_DB_HOST', "192.168.1.4"),
    'port': 3050,
    'database': os.environ.get('MEDICUS_DB_PATH', r"z:\Medicus 3\data\MEDICUS.FDB"),
    'user': os.environ.get('MEDICUS_DB_USER', "SYSDBA"),
    'password': os.environ.get('MEDICUS_DB_PASSWORD', "masterkey"),
    'charset': "WIN1250",  # Czech on Windows
}
|
|
|
|
|
|
# ==========================================
|
|
# FUNKCE
|
|
# ==========================================
|
|
|
|
def setup_tesseract():
    """Point pytesseract at the configured Tesseract executable.

    Returns:
        bool: ``True`` after ``pytesseract.pytesseract.tesseract_cmd`` has
        been set to ``PATH_TO_TESSERACT``; ``False`` (after printing an
        error message) when that path does not exist.
    """
    tesseract_found = os.path.exists(PATH_TO_TESSERACT)
    if tesseract_found:
        pytesseract.pytesseract.tesseract_cmd = PATH_TO_TESSERACT
    else:
        print(f"CHYBA: Tesseract nenalezen na: {PATH_TO_TESSERACT}")
    return tesseract_found
|
|
|
|
|
|
def get_rc_candidates(text):
    """Return the birth-number (RČ) candidates found in *text*.

    A candidate is six digits followed by an optional slash and three or
    four more digits, with optional whitespace between the digit groups
    (e.g. ``75 05 12 / 1234``).  Each match is reduced to its digits only,
    which is the form used for the database lookup.

    Args:
        text: Text to scan, typically OCR output.  ``None`` or an empty
            string yields an empty list.

    Returns:
        list[str]: Unique 9- or 10-digit strings in order of first
        appearance in *text*.
    """
    if not text:
        return []

    # The pattern tolerates spaces inside the number: 75 05 12 / 1234
    pattern = r'\b\d{2}\s*\d{2}\s*\d{2}\s*[\/]?\s*\d{3,4}\b'

    # A dict keeps insertion order, so duplicates are dropped while document
    # order is preserved.  Callers stop at the first candidate confirmed in
    # the database, so a stable order makes the result reproducible.
    ordered = {}
    for match in re.findall(pattern, text):
        # Strip everything but digits -> clean birth number for the DB.
        digits = re.sub(r'[^\d]', '', match)
        if len(digits) in (9, 10):
            ordered[digits] = None
    return list(ordered)
|
|
|
|
|
|
def get_patient_from_db(rc_clean):
    """Look up a patient in the ``kar`` table by birth number.

    A new connection is opened from ``DB_CONFIG`` for every call and is
    always closed again, whether the lookup succeeds or fails.

    Args:
        rc_clean: Birth number as digits only; it is compared to the
            ``rodcis`` column as-is (no slash).

    Returns:
        tuple: ``(True, "Prijmeni, Jmeno")`` when a matching row exists,
        otherwise ``(False, None)``.  If one of the two name columns is
        NULL or blank, only the other one is returned; if both are, the
        name is an empty string.  Any database error is printed and
        reported as ``(False, None)``.
    """
    con = None
    try:
        con = fdb.connect(**DB_CONFIG)
        cur = con.cursor()

        # The birth number is queried directly, without a slash.
        cur.execute("SELECT prijmeni, jmeno FROM kar WHERE rodcis = ?", (rc_clean,))
        row = cur.fetchone()
        if not row:
            return False, None

        # row[0] = surname, row[1] = first name.
        # .strip() removes the trailing padding of CHAR columns; NULL is
        # mapped to "" so the text "None" never ends up in a file name.
        name_parts = [
            str(value).strip() if value is not None else ""
            for value in (row[0], row[1])
        ]
        formatted_name = ", ".join(part for part in name_parts if part)
        return True, formatted_name

    except Exception as e:
        # Best effort: a failed lookup is treated as "patient not found".
        print(f" [DB ERROR] {e}")
        return False, None
    finally:
        if con is not None:
            con.close()
|
|
|
|
|
|
def _render_gray(page, dpi):
    """Render *page* as a grayscale ("L") PIL image scaled by ``dpi / 72``."""
    zoom = dpi / 72
    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), colorspace=fitz.csGRAY)
    return Image.frombytes("L", [pix.width, pix.height], pix.samples)


def _find_patient(text):
    """Return ``(rc, name)`` of the first candidate in *text* found in the DB.

    Returns ``(None, None)`` when no candidate is confirmed.
    """
    for cand in get_rc_candidates(text):
        print(f" -> Ověřuji v DB: {cand} ...", end=" ")
        exists, name_str = get_patient_from_db(cand)
        if exists:
            print(f"NALEZEN: {name_str}")
            return cand, name_str
        print("Nenalezen.")
    return None, None


def _swap_page_image(ocr_page, page):
    """Replace the image on *ocr_page* with a small black-and-white render of *page*."""
    img_vis = _render_gray(page, VISUAL_DPI)
    # Threshold to 1-bit: brighter than VISUAL_THRESHOLD -> white, else black.
    img_vis_bw = img_vis.point(lambda x: 255 if x > VISUAL_THRESHOLD else 0, mode='1')

    buffer_vis = io.BytesIO()
    img_vis_bw.save(buffer_vis, format="PNG", optimize=True)

    images = ocr_page.get_images()
    if images:
        # Drop the large image that came with the OCR page.
        ocr_page.delete_image(images[0][0])

    # Insert the small image.  This happens even when no embedded image was
    # found, so the page always carries a visible layer.
    ocr_page.insert_image(ocr_page.rect, stream=buffer_vis.getvalue(), keep_proportion=True)


def _output_filename(filename, found_rc, found_name):
    """Return a file name that does not exist yet in ``OUTPUT_FOLDER``."""
    if found_rc and found_name:
        # Format: 1234567890 NOVAK, JAN.pdf
        # Characters that are not allowed in file names are removed.
        safe_name = re.sub(r'[\\/*?:"<>|]', "", found_name)
        new_filename = f"{found_rc} {safe_name}.pdf"
    else:
        new_filename = f"{os.path.splitext(filename)[0]}_neovereno.pdf"

    # Conflict handling: append _1, _2, ... while the name is taken.
    base_name = os.path.splitext(new_filename)[0]
    counter = 1
    while os.path.exists(os.path.join(OUTPUT_FOLDER, new_filename)):
        new_filename = f"{base_name}_{counter}.pdf"
        counter += 1
    return new_filename


def _convert_file(input_path, filename):
    """OCR one PDF, save the result and return ``(new_filename, output_path)``.

    All PyMuPDF documents opened here are closed again, also on errors.
    """
    doc_src = fitz.open(input_path)
    try:
        final_doc = fitz.open()
        try:
            found_rc = None
            found_name = None

            for page in doc_src:
                # A) OCR layer (high resolution): Tesseract returns a PDF page.
                img_ocr = _render_gray(page, OCR_DPI)
                pdf_bytes = pytesseract.image_to_pdf_or_hocr(img_ocr, extension='pdf', lang='ces')
                ocr_page_doc = fitz.open("pdf", pdf_bytes)
                try:
                    ocr_page = ocr_page_doc[0]

                    # B) Look for the patient until one has been found.
                    if not found_rc:
                        found_rc, found_name = _find_patient(ocr_page.get_text())

                    # C + D) Visible layer (low resolution) replaces the big image.
                    _swap_page_image(ocr_page, page)

                    # Append the page to the final PDF.
                    final_doc.insert_pdf(ocr_page_doc)
                finally:
                    ocr_page_doc.close()

            new_filename = _output_filename(filename, found_rc, found_name)
            output_path = os.path.join(OUTPUT_FOLDER, new_filename)
            final_doc.save(output_path, garbage=4, deflate=True)
        finally:
            final_doc.close()
    finally:
        doc_src.close()
    return new_filename, output_path


def process_medicus_final():
    """Convert every PDF in ``INPUT_FOLDER`` and store it in ``OUTPUT_FOLDER``.

    For each input file every page is rendered and passed through Tesseract
    (language ``ces``); the page image in the OCR result is replaced by a
    smaller black-and-white render.  The OCR text is searched for birth
    numbers, which are checked against the database; the first confirmed
    one determines the output name ``"<RC> <name>.pdf"``, otherwise
    ``"<original>_neovereno.pdf"`` is used.  Existing output files are never
    overwritten.

    Returns ``None``.  Returns early when Tesseract is not available.  An
    error while handling one file is printed and the batch continues with
    the next file.
    """
    if not setup_tesseract():
        return

    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    files = [f for f in os.listdir(INPUT_FOLDER) if f.lower().endswith('.pdf')]
    print(f"Startuji zpracování {len(files)} souborů.")
    print("Výstupní formát: 'RC Prijmeni, Jmeno.pdf'")
    print("-" * 70)

    for filename in files:
        input_path = os.path.join(INPUT_FOLDER, filename)
        try:
            print(f"Zpracovávám: {filename}")
            new_filename, output_path = _convert_file(input_path, filename)

            size_kb = os.path.getsize(output_path) / 1024
            print(f" -> Uloženo: {new_filename} ({size_kb:.1f} kB)")
            print("-" * 70)
        except Exception as e:
            # Per-file boundary: report the failure and keep going.
            print(f"CHYBA: {e}")

    print("\nHOTOVO.")
|
|
|
|
|
|
# Script entry point: start the batch only when the file is run directly.
if __name__ == "__main__":
    process_medicus_final()