diff --git a/50 Dekurz pro reporter.py b/50 Dekurz pro reporter.py
index 528d093..59ef4fc 100644
--- a/50 Dekurz pro reporter.py
+++ b/50 Dekurz pro reporter.py
@@ -85,11 +85,14 @@ def safe_rtf_to_text(x):
 
 
 df["DEKURS"] = df["DEKURS"].apply(safe_rtf_to_text)
-df.replace({r'(\r\n|\r|\n)': r'\r\n'}, regex=True, inplace=True)
-df.replace({r'[\ud800-\udfff\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]+': ''},
-           regex=True, inplace=True)
-df.replace({r'(\r\n){2,}': r'\r\n', r'(\r\n)+$': ''},
-           regex=True, inplace=True)
+# --- Normalize and clean line breaks more robustly ---
+df["DEKURS"] = (
+    df["DEKURS"]
+    .str.replace(r'\r\n|\r', '\n', regex=True)  # normalize to \n
+    .str.replace(r'[ \t\xa0]*\n[ \t\xa0]*', '\n', regex=True)  # strip spaces around newlines
+    .str.replace(r'\n{2,}', '\n', regex=True)  # collapse 2+ newlines
+    .str.strip()  # remove leading/trailing newlines/spaces
+)
 
 df["PACIENT"] = df["PRIJMENI"].fillna("") + ", " + df["JMENO"].fillna("")
 df.drop(columns=["PRIJMENI", "JMENO"], inplace=True)