From 8e986e3b80750cd6c20ba05559071398714a8d03 Mon Sep 17 00:00:00 2001 From: "vladimir.buzalka" Date: Thu, 7 May 2026 12:01:44 +0200 Subject: [PATCH] z230 --- CTMS/PanoramaContacts/import_CZ_contacts.py | 218 +++++++++++--------- 1 file changed, 126 insertions(+), 92 deletions(-) diff --git a/CTMS/PanoramaContacts/import_CZ_contacts.py b/CTMS/PanoramaContacts/import_CZ_contacts.py index 5bd2f6e..a5ca9f5 100644 --- a/CTMS/PanoramaContacts/import_CZ_contacts.py +++ b/CTMS/PanoramaContacts/import_CZ_contacts.py @@ -1,9 +1,10 @@ """ import_CZ_contacts.py Importuje kontakty středisek Czechia z PANORAMA Dashboard xlsx do MySQL tabulky CTMS_contacts. +- Zpracuje všechny *.xlsx soubory ve SOURCE_DIR - Filtruje pouze řádky Country Name == 'Czechia' - file_date bere z document properties xlsx (dcterms:created) -- Před importem smaže stávající záznamy se stejným file_date + country_name == 'Czechia' +- Každý soubor vždy přepíše (delete + insert podle file_date + protocol_id + country) """ import zipfile @@ -15,7 +16,7 @@ import pandas as pd import mysql.connector # ── Konfigurace ──────────────────────────────────────────────────────────────── -SOURCE_FILE = Path(r"U:\PythonProject\Janssen\CTMS\PanoramaContacts\SourceData\PANORAMA Dashboard (33).xlsx") +SOURCE_DIR = Path(r"U:\PythonProject\Janssen\CTMS\PanoramaContacts\SourceData") DB_CONFIG = { "host": "192.168.1.76", @@ -31,127 +32,160 @@ COUNTRY = "Czechia" SHEET = "Site Contacts" HEADER_ROW = 5 # 0-based → řádek č. 6 v Excelu +COL_MAP = { + "Sector": "sector", + "TA": "ta", + "Protocol ID": "protocol_id", + "GTL-GTM/CTM": "gtl_ctm", + "Country Name": "country_name", + "LTM Name": "ltm_name", + "Site ID": "site_id", + "SM Name": "sm_name", + "PI Full Name": "pi_full_name", + "Institution Name": "institution_name", + "Contact Identifier": "contact_identifier", + "Title": "contact_title", + "Last Name": "last_name", + "First Name": "first_name", + "Contact Role": "contact_role", + "Contact Type": "contact_type", + "Pr St Cont Primary Indicator": "primary_indicator", + "SUA Reporting Indicator": "sua_reporting_indicator", + "Financial Disclosure Indicator": "financial_disclosure_indicator", + "Contact Phone Number": "phone", + "Alternative Phone Number": "phone_alt", + "Mobile Phone Number": "phone_mobile", + "Contact Fax Number": "fax", + "Contact Email Address": "email", + "SUA Reporting Email Address": "email_sua", + "Contact Start Date": "contact_start_date", + "Contact End Date": "contact_end_date", + "Degree/qualification": "degree_qualification", + "Job Title": "job_title", + "Contact Address Line 1": "address_line1", + "Contact Address Line 2": "address_line2", + "Contact Address Line 3": "address_line3", + "Contact City": "city", + "Contact Addr State/Province": "state_province", + "Contact Zip/Postal Code": "zip_postal_code", +} + # ── Pomocné funkce ───────────────────────────────────────────────────────────── -def get_file_created_date(xlsx_path: Path) -> datetime.date: - """Vrátí datum vytvoření souboru z docProps/core.xml (dcterms:created).""" - ns = {"dcterms": "http://purl.org/dc/terms/"} +def get_file_created_date(xlsx_path: Path): + """Vrátí date z dcterms:created v docProps/core.xml.""" with zipfile.ZipFile(xlsx_path) as z: with z.open("docProps/core.xml") as f: root = ET.parse(f).getroot() - created_el = root.find("{http://purl.org/dc/terms/}created") - dt = datetime.fromisoformat(created_el.text.replace("Z", "+00:00")) + el = root.find("{http://purl.org/dc/terms/}created") + dt = datetime.fromisoformat(el.text.replace("Z", "+00:00")) return dt.astimezone(timezone.utc).date() def clean_value(val): - """Převede NaN / NaT / float na None, jinak vrátí string nebo date.""" + """Převede NaN / NaT / Timestamp na typy přijatelné MySQL driverem.""" + import math if val is None: return None if isinstance(val, float): - import math return None if math.isnan(val) else val - if hasattr(val, "_value"): # pd.NaT - return None + if isinstance(val, pd.Timestamp): + return None if pd.isnull(val) else val.date() try: - import pandas as pd if pd.isna(val): return None except Exception: pass - if isinstance(val, pd.Timestamp): - return val.date() if not pd.isnull(val) else None return val +def get_protocol_id(xlsx_path: Path) -> str: + """Přečte protocol_id z prvního datového řádku (rychle, bez načtení celého souboru).""" + df = pd.read_excel(xlsx_path, sheet_name=SHEET, header=HEADER_ROW, + usecols=["Protocol ID"], nrows=1) + return str(df["Protocol ID"].iloc[0]) + + +def import_file(xlsx_path: Path, cursor, conn): + """Zpracuje jeden xlsx soubor — vždy přepíše (delete + insert).""" + file_date = get_file_created_date(xlsx_path) + protocol_id = get_protocol_id(xlsx_path) + print(f" file_date : {file_date}") + print(f" protocol_id : {protocol_id}") + + df = pd.read_excel(xlsx_path, sheet_name=SHEET, header=HEADER_ROW) + df_cz = df[df["Country Name"] == COUNTRY].copy() + print(f" radku CZ : {len(df_cz)}") + + if df_cz.empty: + print(" -> zadne CZ radky, preskoceno") + return 0 + + df_cz = df_cz.rename(columns=COL_MAP) + db_cols = list(COL_MAP.values()) + + # Smazání stávajících záznamů pro tento soubor (přepis) + cursor.execute( + f"DELETE FROM {TABLE} " + f"WHERE file_date = %s AND protocol_id = %s AND country_name = %s", + (file_date, protocol_id, COUNTRY) + ) + deleted = cursor.rowcount + if deleted: + print(f" prepis : smazano {deleted} starych radku") + + placeholders = ", ".join(["%s"] * (len(db_cols) + 1)) + insert_cols = "file_date, " + ", ".join(db_cols) + sql_insert = f"INSERT INTO {TABLE} ({insert_cols}) VALUES ({placeholders})" + + for _, row in df_cz.iterrows(): + values = [file_date] + [clean_value(row.get(col)) for col in db_cols] + cursor.execute(sql_insert, values) + + conn.commit() + return len(df_cz) + + # ── Hlavní logika ────────────────────────────────────────────────────────────── def main(): - print(f"Soubor : {SOURCE_FILE}") + files = sorted(SOURCE_DIR.glob("*.xlsx")) + if not files: + print(f"Zadne xlsx soubory v {SOURCE_DIR}") + return - # 1) Datum vytvoření z properties - file_date = get_file_created_date(SOURCE_FILE) - print(f"file_date (z docProps): {file_date}") - - # 2) Načtení dat - print("Načítám Excel…") - df = pd.read_excel(SOURCE_FILE, sheet_name=SHEET, header=HEADER_ROW) - - # 3) Filtr CZ - df_cz = df[df["Country Name"] == COUNTRY].copy() - print(f"Řádků CZ: {len(df_cz)}") - - # 4) Mapování Excel sloupců → DB sloupce - col_map = { - "Sector": "sector", - "TA": "ta", - "Protocol ID": "protocol_id", - "GTL-GTM/CTM": "gtl_ctm", - "Country Name": "country_name", - "LTM Name": "ltm_name", - "Site ID": "site_id", - "SM Name": "sm_name", - "PI Full Name": "pi_full_name", - "Institution Name": "institution_name", - "Contact Identifier": "contact_identifier", - "Title": "contact_title", - "Last Name": "last_name", - "First Name": "first_name", - "Contact Role": "contact_role", - "Contact Type": "contact_type", - "Pr St Cont Primary Indicator": "primary_indicator", - "SUA Reporting Indicator": "sua_reporting_indicator", - "Financial Disclosure Indicator": "financial_disclosure_indicator", - "Contact Phone Number": "phone", - "Alternative Phone Number": "phone_alt", - "Mobile Phone Number": "phone_mobile", - "Contact Fax Number": "fax", - "Contact Email Address": "email", - "SUA Reporting Email Address": "email_sua", - "Contact Start Date": "contact_start_date", - "Contact End Date": "contact_end_date", - "Degree/qualification": "degree_qualification", - "Job Title": "job_title", - "Contact Address Line 1": "address_line1", - "Contact Address Line 2": "address_line2", - "Contact Address Line 3": "address_line3", - "Contact City": "city", - "Contact Addr State/Province": "state_province", - "Contact Zip/Postal Code": "zip_postal_code", - } - - df_cz = df_cz.rename(columns=col_map) - db_cols = list(col_map.values()) - - # 5) Připojení k DB - print("Připojuji se k MySQL…") + today = datetime.now(timezone.utc).date() + print(f"Nalezeno souboru: {len(files)} | dnesni datum: {today}") + print(f"Pripojuji se k MySQL...") conn = mysql.connector.connect(**DB_CONFIG) cursor = conn.cursor() - # 6) Smazání stávajících záznamů pro stejný file_date + CZ (idempotentní import) - cursor.execute( - f"DELETE FROM {TABLE} WHERE file_date = %s AND country_name = %s", - (file_date, COUNTRY) - ) - deleted = cursor.rowcount - print(f"Smazáno starých záznamů: {deleted}") + summary = [] + for xlsx in files: + print(f"\n[{xlsx.name}]") + try: + file_date = get_file_created_date(xlsx) + if file_date != today: + print(f" file_date : {file_date} -> PRESKOCENO (neni dnesni datum)") + summary.append((xlsx.name, f"preskoceno (file_date={file_date})")) + continue + n = import_file(xlsx, cursor, conn) + if n is None: + summary.append((xlsx.name, "preskoceno")) + else: + summary.append((xlsx.name, f"importovano {n} radku")) + except Exception as e: + conn.rollback() + summary.append((xlsx.name, f"CHYBA: {e}")) + print(f" CHYBA: {e}") - # 7) Insert - placeholders = ", ".join(["%s"] * (len(db_cols) + 1)) # +1 pro file_date - insert_cols = "file_date, " + ", ".join(db_cols) - sql_insert = f"INSERT INTO {TABLE} ({insert_cols}) VALUES ({placeholders})" - - inserted = 0 - for _, row in df_cz.iterrows(): - values = [file_date] + [clean_value(row.get(col)) for col in db_cols] - cursor.execute(sql_insert, values) - inserted += 1 - - conn.commit() cursor.close() conn.close() - print(f"Importováno záznamů : {inserted}") - print("Hotovo OK") + print("\n" + "=" * 60) + print("SOUHRN:") + for name, status in summary: + print(f" {name:<45} {status}") + print("=" * 60) if __name__ == "__main__":