z230
This commit is contained in:
145
MakeSmallerPDF/06 OCR Tesseract.py
Normal file
145
MakeSmallerPDF/06 OCR Tesseract.py
Normal file
@@ -0,0 +1,145 @@
|
||||
import fitz # PyMuPDF
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
import io
|
||||
import os
|
||||
import re
|
||||
|
||||
# --- SETTINGS ---
# Folder scanned for input PDFs; results are written to a sub-folder of it.
INPUT_FOLDER = r'u:\Dropbox\Ordinace\Dokumentace_ke_zpracování\AdobeMakeSmaller'
OUTPUT_FOLDER = os.path.join(INPUT_FOLDER, '_HOTOVO_ULTIMATE')

# Path to the Tesseract executable.
PATH_TO_TESSERACT = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# 1. How "ugly" (small) may the visible page image be?
#    VISUAL_DPI is the render resolution of the visible image;
#    VISUAL_THRESHOLD is the gray level above which a pixel becomes white
#    when the visible image is binarized (everything else becomes black).
VISUAL_DPI = 150
VISUAL_THRESHOLD = 150

# 2. How good should the rendering handed to the OCR engine be?
OCR_DPI = 300
|
||||
|
||||
|
||||
def setup_tesseract():
    """Register the configured Tesseract binary with pytesseract.

    Returns:
        True when something exists at ``PATH_TO_TESSERACT`` and that path has
        been assigned to ``pytesseract.pytesseract.tesseract_cmd``; False
        (after printing an error message) when the path does not exist.
    """
    if os.path.exists(PATH_TO_TESSERACT):
        pytesseract.pytesseract.tesseract_cmd = PATH_TO_TESSERACT
        return True

    print(f"CHYBA: Tesseract nenalezen na: {PATH_TO_TESSERACT}")
    return False
|
||||
|
||||
|
||||
def get_rc_from_text(text):
    """Find an "RČ" number in *text* and return it without the slash.

    ("RČ" is presumably the Czech birth number, rodné číslo - confirm.)

    The pattern looks for six digits, then optional whitespace / a single
    optional slash / optional whitespace, then three or four digits, with a
    word boundary on both ends.

    Args:
        text: The string to scan.

    Returns:
        The digits of the first occurrence joined together (9 or 10
        characters), or None when nothing matches.
    """
    found = re.search(r'\b(\d{6})\s*[\/]?\s*(\d{3,4})\b', text)
    if found is None:
        return None

    # The pattern itself guarantees 9-10 digit characters, so the first
    # occurrence is always the one to return.
    return found.group(1) + found.group(2)
|
||||
|
||||
|
||||
def _unique_output_name(folder, filename):
    """Return *filename*, or a ``_N``-suffixed variant, not yet present in *folder*."""
    stem = os.path.splitext(filename)[0]
    candidate = filename
    counter = 1
    while os.path.exists(os.path.join(folder, candidate)):
        candidate = f"{stem}_{counter}.pdf"
        counter += 1
    return candidate


def _append_ocr_page(page, final_doc, mat_ocr, mat_vis, want_text):
    """OCR one source page, swap in a small B/W image and append it to *final_doc*.

    Args:
        page: Source page to render.
        final_doc: Document that receives the finished page.
        mat_ocr: Render matrix for the OCR input image.
        mat_vis: Render matrix for the visible image.
        want_text: When true, the OCR page's text is extracted and returned.

    Returns:
        The text of the OCR page when *want_text* is true, otherwise None.
    """
    # --- STEP A: OCR (high quality, for reading) ---
    pix_ocr = page.get_pixmap(matrix=mat_ocr, colorspace=fitz.csGRAY)
    img_ocr = Image.frombytes("L", (pix_ocr.width, pix_ocr.height), pix_ocr.samples)

    # Tesseract returns a one-page PDF (per the original author's notes:
    # a text layer plus the large input image).
    pdf_bytes = pytesseract.image_to_pdf_or_hocr(img_ocr, extension='pdf', lang='ces')

    ocr_page_doc = fitz.open("pdf", pdf_bytes)
    try:
        ocr_page = ocr_page_doc[0]
        text = ocr_page.get_text() if want_text else None

        # --- STEP B: visual layer (low quality, to save space) ---
        pix_vis = page.get_pixmap(matrix=mat_vis, colorspace=fitz.csGRAY)
        img_vis = Image.frombytes("L", (pix_vis.width, pix_vis.height), pix_vis.samples)

        # Binarize: pixels brighter than the threshold become white, the rest black.
        img_vis_bw = img_vis.point(lambda x: 255 if x > VISUAL_THRESHOLD else 0, mode='1')

        buffer_vis = io.BytesIO()
        img_vis_bw.save(buffer_vis, format="PNG", optimize=True)

        # --- STEP C: the switch ---
        # Drop the first image listed on the Tesseract page (the text is kept) ...
        images = ocr_page.get_images()
        if images:
            xref = images[0][0]
            ocr_page.delete_image(xref)

        # ... and place the small image over the full page rectangle.
        ocr_page.insert_image(ocr_page.rect, stream=buffer_vis.getvalue(), keep_proportion=True)

        final_doc.insert_pdf(ocr_page_doc)
    finally:
        # The in-memory per-page document was previously never closed.
        ocr_page_doc.close()

    return text


def _convert_pdf(input_path, filename, mat_ocr, mat_vis):
    """Convert one PDF and save it to OUTPUT_FOLDER; return ``(new_filename, output_path)``."""
    doc_src = fitz.open(input_path)
    try:
        final_doc = fitz.open()
        try:
            detected_rc = None
            for page in doc_src:
                # Text is only needed until the first RČ has been found.
                text = _append_ocr_page(
                    page, final_doc, mat_ocr, mat_vis, want_text=not detected_rc
                )
                if text is not None:
                    detected_rc = get_rc_from_text(text)

            # --- STEP D: saving ---
            if detected_rc:
                new_filename = f"{detected_rc}.pdf"
                print(f" -> Nalezeno RČ: {detected_rc}")
            else:
                new_filename = f"{os.path.splitext(filename)[0]}_ocr.pdf"
                print(" -> RČ nenalezeno.")

            # Avoid overwriting an existing output file.
            new_filename = _unique_output_name(OUTPUT_FOLDER, new_filename)
            output_path = os.path.join(OUTPUT_FOLDER, new_filename)

            # Save with compression.
            final_doc.save(output_path, garbage=4, deflate=True)
        finally:
            final_doc.close()
    finally:
        # Closed on the error path too, so the source file is not left open.
        doc_src.close()

    return new_filename, output_path


def process_ultimate_method():
    """OCR and shrink every PDF found in INPUT_FOLDER into OUTPUT_FOLDER.

    For each ``.pdf`` file (case-insensitive extension) every page is rendered
    at OCR_DPI for Tesseract, the image on the resulting OCR page is replaced
    by a thresholded black-and-white rendering at VISUAL_DPI, and the pages are
    collected into a new document. The output is named after the first RČ
    found in the OCR text, or ``<original stem>_ocr.pdf`` when none is found;
    a ``_N`` suffix is added if the name is already taken.

    Returns None. Does nothing when ``setup_tesseract()`` fails. An error
    while handling one file is printed and processing continues with the next
    file; all opened documents are closed in either case.
    """
    if not setup_tesseract():
        return

    if not os.path.exists(OUTPUT_FOLDER):
        os.makedirs(OUTPUT_FOLDER)

    files = [f for f in os.listdir(INPUT_FOLDER) if f.lower().endswith('.pdf')]
    print(f"Startuji 'ULTIMATE' zpracování {len(files)} souborů.")
    print("-" * 70)

    # Render matrices do not depend on the file or page, so build them once.
    mat_ocr = fitz.Matrix(OCR_DPI / 72, OCR_DPI / 72)
    mat_vis = fitz.Matrix(VISUAL_DPI / 72, VISUAL_DPI / 72)

    for filename in files:
        input_path = os.path.join(INPUT_FOLDER, filename)

        try:
            print(f"Zpracovávám: {filename} ...")
            new_filename, output_path = _convert_pdf(input_path, filename, mat_ocr, mat_vis)

            # Report the resulting size.
            size_kb = os.path.getsize(output_path) / 1024
            print(f" -> Uloženo: {new_filename} ({size_kb:.1f} kB)")
            print("-" * 40)

        except Exception as e:
            # Best-effort batch: report the failure and carry on with the next file.
            print(f"CHYBA u {filename}: {e}")

    print("\nHOTOVO.")
|
||||
|
||||
|
||||
# Run the batch conversion only when this file is executed as a script.
if __name__ == "__main__":
    process_ultimate_method()
|
||||
Reference in New Issue
Block a user