"""Import a Covance eQuery CSV into the MySQL table covance_equeries.

Strategy: versioning — every import run gets a new import_id.
"""
|
|
|
|
import os
|
|
import glob
|
|
import datetime
|
|
|
|
import numpy as np
|
|
import pandas as pd
|
|
import mysql.connector
|
|
|
|
import db_config
|
|
|
|
STUDY = "42847922MDD3003"
|
|
SOURCE_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "SourceData")
|
|
|
|
|
|
# ── type converters ──────────────────────────────────────────────────────────
|
|
|
|
def _py(val):
|
|
if isinstance(val, np.generic):
|
|
return val.item()
|
|
return val
|
|
|
|
def to_str(val):
    """Coerce *val* to a stripped string; missing/NaN-like values become None."""
    val = _py(val)
    # None and float NaN (NaN != NaN) both mean "missing".
    if val is None or (isinstance(val, float) and val != val):
        return None
    text = str(val).strip()
    if text.lower() in ("nan", "nat", "none", ""):
        return None
    return text
|
|
|
|
def to_int(val):
    """Coerce *val* to int via float, or return None when it cannot be parsed.

    NaN (which fails the v == v self-comparison) yields None. OverflowError
    is caught as well: int(float('inf')) raises it, so a cell containing
    "inf"/"-inf" is treated as missing instead of crashing the import.
    """
    val = _py(val)
    try:
        v = float(val)
        return None if (v != v) else int(v)
    except (TypeError, ValueError, OverflowError):
        return None
|
|
|
|
def to_datetime(val):
    """Coerce *val* to datetime.datetime, or None for missing/unparseable values.

    Accepts numpy scalars, pandas Timestamps, datetime/date objects and
    strings in the formats Covance exports use. The guard order matters:
    cheap None/NaN checks first, then pandas-specific types, then string
    parsing as the last resort.
    """
    val = _py(val)
    if val is None:
        return None
    # float NaN: NaN != NaN is the only float that fails self-comparison.
    if isinstance(val, float) and (val != val):
        return None
    try:
        # pd.isna raises on some inputs (e.g. array-likes) — treat that as
        # "not missing" and fall through to the type checks below.
        if pd.isna(val):
            return None
    except (TypeError, ValueError):
        pass
    if isinstance(val, pd.Timestamp):
        return None if pd.isna(val) else val.to_pydatetime()
    if isinstance(val, datetime.datetime):
        return val
    if isinstance(val, datetime.date):
        # Promote a bare date to midnight of that day.
        return datetime.datetime(val.year, val.month, val.day)
    s = str(val).strip()
    if not s or s.lower() in ("nat", "nan", "none", ""):
        return None
    # Try the Covance export formats first ("Jan 02, 2024 03:04 PM"),
    # then ISO-style fallbacks; first successful parse wins.
    for fmt in ("%b %d, %Y %I:%M %p", "%b %d, %Y",
                "%Y-%m-%d %H:%M:%S", "%Y-%m-%d"):
        try:
            return datetime.datetime.strptime(s, fmt)
        except ValueError:
            pass
    return None
|
|
|
|
|
|
# ── DB helpers ───────────────────────────────────────────────────────────────
|
|
|
|
def get_conn():
    """Open a MySQL connection using the credentials from db_config."""
    params = {
        "host": db_config.DB_HOST,
        "port": db_config.DB_PORT,
        "user": db_config.DB_USER,
        "password": db_config.DB_PASSWORD,
        "database": db_config.DB_NAME,
    }
    return mysql.connector.connect(**params)
|
|
|
|
def insert_import(cursor, study, source_file):
    """Record this import run in iwrs_import and return its autoincrement id."""
    sql = ("INSERT INTO iwrs_import (study, imported_at, source_file, report_type)"
           " VALUES (%s, %s, %s, %s)")
    params = (study, datetime.datetime.now(), source_file, "covance_equeries")
    cursor.execute(sql, params)
    return cursor.lastrowid
|
|
|
|
|
|
# ── parser ───────────────────────────────────────────────────────────────────
|
|
|
|
def find_csv(study):
    """Locate the eQuery CSV for *study* in SOURCE_DIR.

    When several files match, return the lexicographically last one
    (newest by naming convention). Raises FileNotFoundError when none match.
    """
    pattern = os.path.join(SOURCE_DIR, f"Protocol {study} *equery*.csv")
    matches = sorted(glob.glob(pattern))
    if matches:
        return matches[-1]
    raise FileNotFoundError(f"Nenalezen CSV soubor: {pattern}")
|
|
|
|
def parse_site(site_str):
    """'CZ10004 - Dr. Erik Herman' → ('CZ10004', 'Dr. Erik Herman')"""
    if not site_str:
        return None, None
    # Split on the first " - " only; everything after it is the investigator.
    code, sep, investigator = site_str.partition(" - ")
    return code.strip(), (investigator.strip() if sep else None)
|
|
|
|
def parse_csv(path):
    """Read the eQuery CSV and return a list of typed record dicts."""
    frame = pd.read_csv(path, dtype=str)
    # Drop metadata rows: keep only rows whose eQueryId parses as a number.
    numeric_ids = pd.to_numeric(frame["eQueryId"], errors="coerce").notna()
    frame = frame[numeric_ids].copy()

    # CSV column → record key, grouped by converter.
    str_cols = {
        "country": "Country",
        "visit": "Visit",
        "accession": "Accession",
        "subject": "Subject",
        "issue_type": "Issue Type",
        "status": "Status",
        "time_before_response": "Time Before Response",
        "user_name": "User Name",
        "email": "Email",
        "study_role": "Study Role",
    }
    dt_cols = {
        "visit_collection_date": "Visit Collection Date",
        "create_date": "Create Date",
        "response_datetime": "Response Date Time",
    }

    records = []
    for _, row in frame.iterrows():
        code, investigator = parse_site(to_str(row.get("Site")))
        rec = {
            "site_code": code,
            "investigator_name": investigator,
            "equery_id": to_int(row.get("eQueryId")),
        }
        for key, col in str_cols.items():
            rec[key] = to_str(row.get(col))
        for key, col in dt_cols.items():
            rec[key] = to_datetime(row.get(col))
        records.append(rec)
    return records
|
|
|
|
|
|
# ── insert ───────────────────────────────────────────────────────────────────
|
|
|
|
def insert_equeries(cursor, import_id, rows):
    """Insert the parsed eQuery records under the given import_id."""
    sql = """INSERT INTO covance_equeries
             (import_id, study, site_code, investigator_name, country,
              visit, visit_collection_date, accession, subject, equery_id,
              create_date, response_datetime, issue_type, status,
              time_before_response, user_name, email, study_role)
             VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    # Record keys in the exact order of the column list above.
    field_order = (
        "site_code", "investigator_name", "country",
        "visit", "visit_collection_date", "accession", "subject", "equery_id",
        "create_date", "response_datetime", "issue_type", "status",
        "time_before_response", "user_name", "email", "study_role",
    )
    for rec in rows:
        params = (import_id, STUDY) + tuple(rec[key] for key in field_order)
        cursor.execute(sql, params)
|
|
|
|
|
|
# ── main ─────────────────────────────────────────────────────────────────────
|
|
|
|
def main():
    """Import the newest eQuery CSV for STUDY into MySQL as a new import_id.

    Fix: the original left the cursor and connection open when anything
    between connect and close raised; try/finally now guarantees cleanup.
    An uncommitted transaction is implicitly rolled back on close.
    """
    csv_path = find_csv(STUDY)
    print(f"Soubor: {os.path.basename(csv_path)}")

    rows = parse_csv(csv_path)
    print(f"Načteno řádků: {len(rows)}")

    conn = get_conn()
    try:
        cursor = conn.cursor()
        try:
            import_id = insert_import(cursor, STUDY, os.path.basename(csv_path))
            print(f"import_id = {import_id}")

            insert_equeries(cursor, import_id, rows)
            # Single commit: the import row and its eQueries land atomically.
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()

    print(f"Hotovo — {len(rows)} eQuery záznamů importováno.")
|
|
|
|
|
|
# Guard the entry point so importing this module does not trigger the import run.
if __name__ == "__main__":
    main()
|