This commit is contained in:
2025-10-26 18:05:45 +01:00
parent f8e8d2c0eb
commit bdac177353

View File

@@ -85,11 +85,14 @@ def safe_rtf_to_text(x):
df["DEKURS"] = df["DEKURS"].apply(safe_rtf_to_text) df["DEKURS"] = df["DEKURS"].apply(safe_rtf_to_text)
df.replace({r'(\r\n|\r|\n)': r'\r\n'}, regex=True, inplace=True) # --- Normalize and clean line breaks more robustly ---
df.replace({r'[\ud800-\udfff\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]+': ''}, df["DEKURS"] = (
regex=True, inplace=True) df["DEKURS"]
df.replace({r'(\r\n){2,}': r'\r\n', r'(\r\n)+$': ''}, .str.replace(r'\r\n|\r', '\n', regex=True) # normalize to \n
regex=True, inplace=True) .str.replace(r'[ \t\xa0]*\n[ \t\xa0]*', '\n', regex=True) # strip spaces around newlines
.str.replace(r'\n{2,}', '\n', regex=True) # collapse 2+ newlines
.str.strip() # remove leading/trailing newlines/spaces
)
df["PACIENT"] = df["PRIJMENI"].fillna("") + ", " + df["JMENO"].fillna("") df["PACIENT"] = df["PRIJMENI"].fillna("") + ", " + df["JMENO"].fillna("")
df.drop(columns=["PRIJMENI", "JMENO"], inplace=True) df.drop(columns=["PRIJMENI", "JMENO"], inplace=True)