""" Import Covance eQuery CSV do MySQL tabulky covance_equeries. Strategie: versioning — každý import = nový import_id. """ import os import glob import datetime import numpy as np import pandas as pd import mysql.connector import db_config STUDY = "42847922MDD3003" SOURCE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "SourceData") # ── type converters ────────────────────────────────────────────────────────── def _py(val): if isinstance(val, np.generic): return val.item() return val def to_str(val): val = _py(val) if val is None: return None if isinstance(val, float) and (val != val): return None s = str(val).strip() return None if s.lower() in ("nan", "nat", "none", "") else s def to_int(val): val = _py(val) try: v = float(val) return None if (v != v) else int(v) except (TypeError, ValueError): return None def to_datetime(val): val = _py(val) if val is None: return None if isinstance(val, float) and (val != val): return None try: if pd.isna(val): return None except (TypeError, ValueError): pass if isinstance(val, pd.Timestamp): return None if pd.isna(val) else val.to_pydatetime() if isinstance(val, datetime.datetime): return val if isinstance(val, datetime.date): return datetime.datetime(val.year, val.month, val.day) s = str(val).strip() if not s or s.lower() in ("nat", "nan", "none", ""): return None for fmt in ("%b %d, %Y %I:%M %p", "%b %d, %Y", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d"): try: return datetime.datetime.strptime(s, fmt) except ValueError: pass return None # ── DB helpers ─────────────────────────────────────────────────────────────── def get_conn(): return mysql.connector.connect( host=db_config.DB_HOST, port=db_config.DB_PORT, user=db_config.DB_USER, password=db_config.DB_PASSWORD, database=db_config.DB_NAME, ) def insert_import(cursor, study, source_file): cursor.execute( "INSERT INTO iwrs_import (study, imported_at, source_file, report_type) VALUES (%s, %s, %s, %s)", (study, datetime.datetime.now(), source_file, "covance_equeries"), ) return cursor.lastrowid # ── parser ─────────────────────────────────────────────────────────────────── def find_csv(study): pattern = os.path.join(SOURCE_DIR, f"Protocol {study} *equery*.csv") files = glob.glob(pattern) if not files: raise FileNotFoundError(f"Nenalezen CSV soubor: {pattern}") return sorted(files)[-1] def parse_site(site_str): """'CZ10004 - Dr. Erik Herman' → ('CZ10004', 'Dr. Erik Herman')""" if not site_str: return None, None parts = site_str.split(" - ", 1) code = parts[0].strip() name = parts[1].strip() if len(parts) > 1 else None return code, name def parse_csv(path): df = pd.read_csv(path, dtype=str) # odstraň metadata řádky (eQueryId není číslo) df = df[pd.to_numeric(df["eQueryId"], errors="coerce").notna()].copy() rows = [] for _, r in df.iterrows(): site_code, investigator_name = parse_site(to_str(r.get("Site"))) rows.append({ "site_code": site_code, "investigator_name": investigator_name, "country": to_str(r.get("Country")), "visit": to_str(r.get("Visit")), "visit_collection_date": to_datetime(r.get("Visit Collection Date")), "accession": to_str(r.get("Accession")), "subject": to_str(r.get("Subject")), "equery_id": to_int(r.get("eQueryId")), "create_date": to_datetime(r.get("Create Date")), "response_datetime": to_datetime(r.get("Response Date Time")), "issue_type": to_str(r.get("Issue Type")), "status": to_str(r.get("Status")), "time_before_response": to_str(r.get("Time Before Response")), "user_name": to_str(r.get("User Name")), "email": to_str(r.get("Email")), "study_role": to_str(r.get("Study Role")), }) return rows # ── insert ─────────────────────────────────────────────────────────────────── def insert_equeries(cursor, import_id, rows): sql = """INSERT INTO covance_equeries (import_id, study, site_code, investigator_name, country, visit, visit_collection_date, accession, subject, equery_id, create_date, response_datetime, issue_type, status, time_before_response, user_name, email, study_role) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""" for r in rows: cursor.execute(sql, ( import_id, STUDY, r["site_code"], r["investigator_name"], r["country"], r["visit"], r["visit_collection_date"], r["accession"], r["subject"], r["equery_id"], r["create_date"], r["response_datetime"], r["issue_type"], r["status"], r["time_before_response"], r["user_name"], r["email"], r["study_role"], )) # ── main ───────────────────────────────────────────────────────────────────── def main(): csv_path = find_csv(STUDY) print(f"Soubor: {os.path.basename(csv_path)}") rows = parse_csv(csv_path) print(f"Načteno řádků: {len(rows)}") conn = get_conn() cursor = conn.cursor() import_id = insert_import(cursor, STUDY, os.path.basename(csv_path)) print(f"import_id = {import_id}") insert_equeries(cursor, import_id, rows) conn.commit() cursor.close() conn.close() print(f"Hotovo — {len(rows)} eQuery záznamů importováno.") main()