z230
This commit is contained in:
205
MakeSmallerPDF/07 OCR tesseract lepší.py
Normal file
205
MakeSmallerPDF/07 OCR tesseract lepší.py
Normal file
@@ -0,0 +1,205 @@
|
||||
import fitz # PyMuPDF
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
import io
|
||||
import os
|
||||
import re
|
||||
import fdb # Firebird database library
|
||||
|
||||
# ==========================================
# CONFIGURATION
# ==========================================

# 1. File paths
INPUT_FOLDER = r'u:\Dropbox\Ordinace\Dokumentace_ke_zpracování\AdobeMakeSmaller'
OUTPUT_FOLDER = os.path.join(INPUT_FOLDER, '_HOTOVO_MEDICUS')

# 2. Tesseract OCR
PATH_TO_TESSERACT = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# 3. Output quality
VISUAL_DPI = 150  # resolution of the page image that is displayed (small)
VISUAL_THRESHOLD = 150  # gray levels above this become white, the rest black
OCR_DPI = 300  # resolution of the page image that is fed to OCR (large)

# 4. MEDICUS database (Firebird)
# Credentials can be supplied through the environment so they do not have to
# live in version control; the previous hard-coded values remain the defaults.
DB_CONFIG = {
    'host': "192.168.1.4",
    'port': 3050,
    'database': r"z:\Medicus 3\data\MEDICUS.FDB",
    'user': os.environ.get("MEDICUS_DB_USER", "SYSDBA"),
    'password': os.environ.get("MEDICUS_DB_PASSWORD", "masterkey"),
    'charset': "WIN1250",  # Czech text on Windows
}
|
||||
|
||||
|
||||
# ==========================================
|
||||
# FUNKCE
|
||||
# ==========================================
|
||||
|
||||
def setup_tesseract():
    """Register the configured Tesseract executable with pytesseract.

    Returns:
        True when PATH_TO_TESSERACT exists and has been assigned to
        pytesseract; False (after printing an error message) otherwise.
    """
    tesseract_found = os.path.exists(PATH_TO_TESSERACT)
    if tesseract_found:
        pytesseract.pytesseract.tesseract_cmd = PATH_TO_TESSERACT
    else:
        print(f"CHYBA: Tesseract nenalezen na: {PATH_TO_TESSERACT}")
    return tesseract_found
|
||||
|
||||
|
||||
# Six digits, an optional slash, then a 3-4 digit suffix; whitespace is
# tolerated between the groups, e.g. "75 05 12 / 1234".
_RC_PATTERN = re.compile(r'\b\d{2}\s*\d{2}\s*\d{2}\s*[\/]?\s*\d{3,4}\b')


def get_rc_candidates(text):
    """Return the birth numbers (RČ) found in *text*, reduced to digits only.

    Args:
        text: Text to scan; ``None`` or an empty string yields no candidates.

    Returns:
        A list of unique 9- or 10-digit strings, in order of first occurrence
        in *text*. The order is deterministic, so callers that stop at the
        first confirmed candidate behave the same on every run.
    """
    if not text:
        return []

    # A dict keeps insertion order while dropping duplicates.
    unique = {}
    for match in _RC_PATTERN.finditer(text):
        # Strip spaces and the slash so only the digits remain.
        digits = re.sub(r'\D', '', match.group())
        if len(digits) in (9, 10):
            unique.setdefault(digits, None)
    return list(unique)
|
||||
|
||||
|
||||
def get_patient_from_db(rc_clean):
    """Look up a patient by birth number in the ``kar`` table.

    Args:
        rc_clean: Birth number as passed to the ``rodcis = ?`` query
            parameter (the caller supplies it as digits without a slash).

    Returns:
        ``(True, "Surname, Firstname")`` when a row is found, otherwise
        ``(False, None)``. Any exception is printed with a ``[DB ERROR]``
        prefix and also reported as ``(False, None)``.
    """
    connection = None
    try:
        connection = fdb.connect(**DB_CONFIG)
        cursor = connection.cursor()

        # The lookup uses the birth number as-is, without a slash.
        cursor.execute(
            "SELECT prijmeni, jmeno FROM kar WHERE rodcis = ?",
            (rc_clean,),
        )
        record = cursor.fetchone()

        if not record:
            return False, None

        # Values are stripped because the database returns them with
        # trailing spaces (CHAR fields).
        surname = str(record[0]).strip()
        first_name = str(record[1]).strip()
        return True, f"{surname}, {first_name}"

    except Exception as e:
        print(f" [DB ERROR] {e}")
        return False, None
    finally:
        if connection:
            connection.close()
|
||||
|
||||
|
||||
def _render_gray(page, dpi):
    """Rasterize *page* to an 8-bit grayscale PIL image at *dpi*."""
    scale = dpi / 72
    pix = page.get_pixmap(matrix=fitz.Matrix(scale, scale), colorspace=fitz.csGRAY)
    return Image.frombytes("L", [pix.width, pix.height], pix.samples)


def _ocr_page_to_pdf(page):
    """Run Tesseract on *page* at OCR_DPI and open the resulting one-page PDF."""
    pdf_bytes = pytesseract.image_to_pdf_or_hocr(
        _render_gray(page, OCR_DPI), extension='pdf', lang='ces'
    )
    return fitz.open("pdf", pdf_bytes)


def _find_patient(text):
    """Return (rc, name) for the first candidate in *text* confirmed by the DB, else (None, None)."""
    for cand in get_rc_candidates(text):
        print(f" -> Ověřuji v DB: {cand} ...", end=" ")
        exists, name_str = get_patient_from_db(cand)
        if exists:
            print(f"NALEZEN: {name_str}")
            return cand, name_str
        print("Nenalezen.")
    return None, None


def _small_bw_png(page):
    """Render *page* at VISUAL_DPI, threshold it to 1-bit and return PNG bytes."""
    gray = _render_gray(page, VISUAL_DPI)
    black_white = gray.point(lambda x: 255 if x > VISUAL_THRESHOLD else 0, mode='1')
    buffer = io.BytesIO()
    black_white.save(buffer, format="PNG", optimize=True)
    return buffer.getvalue()


def _unique_output_name(folder, filename):
    """Return *filename*, or a ``_<n>``-suffixed variant, that does not yet exist in *folder*."""
    base_name = os.path.splitext(filename)[0]
    candidate = filename
    counter = 1
    while os.path.exists(os.path.join(folder, candidate)):
        candidate = f"{base_name}_{counter}.pdf"
        counter += 1
    return candidate


def process_medicus_final():
    """OCR every PDF in INPUT_FOLDER and save a smaller copy to OUTPUT_FOLDER.

    For each input PDF, every page is OCR-ed with Tesseract at OCR_DPI, the
    large page image in the Tesseract PDF is replaced by a thresholded
    VISUAL_DPI rendering, and the pages are collected into a new document.
    The OCR text is searched for a birth number that exists in the database;
    once one is confirmed, later pages are no longer searched.

    The output is named ``"<rc> <Surname, Firstname>.pdf"`` when a patient
    was confirmed, otherwise ``"<original name>_neovereno.pdf"``. Existing
    files are never overwritten; a numeric suffix is appended instead.

    Returns:
        None. Progress and errors are printed. An exception while processing
        one file is reported and the loop continues with the next file.
    """
    if not setup_tesseract():
        return

    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    files = [f for f in os.listdir(INPUT_FOLDER) if f.lower().endswith('.pdf')]
    print(f"Startuji zpracování {len(files)} souborů.")
    print("Výstupní formát: 'RC Prijmeni, Jmeno.pdf'")
    print("-" * 70)

    for filename in files:
        input_path = os.path.join(INPUT_FOLDER, filename)
        doc_src = None
        final_doc = None

        try:
            print(f"Zpracovávám: {filename}")

            doc_src = fitz.open(input_path)
            final_doc = fitz.open()

            found_rc = None
            found_name = None

            # --- 1. Walk through the pages ---
            for page in doc_src:
                # A) High-resolution OCR layer.
                ocr_page_doc = _ocr_page_to_pdf(page)
                try:
                    ocr_page = ocr_page_doc[0]

                    # B) Look for the patient (only until one is confirmed).
                    if not found_rc:
                        found_rc, found_name = _find_patient(ocr_page.get_text())

                    # C) Low-resolution visual layer.
                    small_png = _small_bw_png(page)

                    # D) Swap the images: drop the large one, insert the small one.
                    images = ocr_page.get_images()
                    if images:
                        ocr_page.delete_image(images[0][0])
                    # NOTE(review): the small image is inserted even when no
                    # existing image was found on the OCR page - confirm this
                    # matches the intended behaviour.
                    ocr_page.insert_image(ocr_page.rect, stream=small_png, keep_proportion=True)

                    # Append the page to the final PDF.
                    final_doc.insert_pdf(ocr_page_doc)
                finally:
                    # The per-page OCR document is no longer needed.
                    ocr_page_doc.close()

            # --- 2. Save the file ---
            if found_rc and found_name:
                # Format: 1234567890 NOVAK, JAN.pdf
                # Drop characters that are not allowed in file names.
                safe_name = re.sub(r'[\\/*?:"<>|]', "", found_name)
                new_filename = f"{found_rc} {safe_name}.pdf"
            else:
                new_filename = f"{os.path.splitext(filename)[0]}_neovereno.pdf"

            # Avoid overwriting an existing output file.
            new_filename = _unique_output_name(OUTPUT_FOLDER, new_filename)
            output_path = os.path.join(OUTPUT_FOLDER, new_filename)

            final_doc.save(output_path, garbage=4, deflate=True)

            size_kb = os.path.getsize(output_path) / 1024
            print(f" -> Uloženo: {new_filename} ({size_kb:.1f} kB)")
            print("-" * 70)

        except Exception as e:
            print(f"CHYBA: {e}")
        finally:
            # Close both documents even when processing failed part-way.
            for doc in (final_doc, doc_src):
                if doc is not None:
                    doc.close()

    print("\nHOTOVO.")
|
||||
|
||||
|
||||
# Run the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    process_medicus_final()
|
||||
Reference in New Issue
Block a user