notebook

2025-10-26 18:05:45 +01:00
parent f8e8d2c0eb
commit bdac177353
1 changed file with 8 additions and 5 deletions
@@ -85,11 +85,14 @@ def safe_rtf_to_text(x):
 df["DEKURS"] = df["DEKURS"].apply(safe_rtf_to_text)
-df.replace({r'(\r\n|\r|\n)': r'\r\n'}, regex=True, inplace=True)
+# --- Normalize and clean line breaks more robustly ---
-df.replace({r'[\ud800-\udfff\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]+': ''},
+df["DEKURS"] = (
-           regex=True, inplace=True)
+    df["DEKURS"]
-df.replace({r'(\r\n){2,}': r'\r\n', r'(\r\n)+$': ''},
+    .str.replace(r'\r\n|\r', '\n', regex=True)          # normalize to \n
-           regex=True, inplace=True)
+    .str.replace(r'[ \t\xa0]*\n[ \t\xa0]*', '\n', regex=True)  # strip spaces around newlines
+    .str.replace(r'\n{2,}', '\n', regex=True)           # collapse 2+ newlines
+    .str.strip()                                        # remove leading/trailing newlines/spaces
+)
 df["PACIENT"] = df["PRIJMENI"].fillna("") + ", " + df["JMENO"].fillna("")
 df.drop(columns=["PRIJMENI", "JMENO"], inplace=True)