# janssen/Covance/create_report.py

"""
Covance samples report pro studii 42847922MDD3003.
Čte z MySQL (nejnovější import), generuje Excel s 7 listy:
  1. Přehled      — agregát per pacient+visit (Received / Not Received / Cancelled)
  2. Chybějící    — detail Not Received vzorků
  3. Kity         — kit inventory: centra × typy kitů s expirací
  4. eQueries     — přehled eQuery dotazů (Open červeně)
  5. ZDROJ Vzorky — surová data samples
  6. ZDROJ Kity   — surová data kit inventory
  7. ZDROJ eQuery — surová data eQueries
"""

import os
import datetime

import mysql.connector
import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl.utils import get_column_letter

import db_config

# Study identifier: bound to the `study` parameter of every import lookup
# query and shown in the sheet titles.
STUDY       = "42847922MDD3003"
# Directory that contains this script.
BASE_DIR    = os.path.dirname(os.path.abspath(__file__))
# Output directory for the generated workbooks (created by main() if missing).
CREATED_DIR = os.path.join(BASE_DIR, "CreatedReports")

# ── styles ───────────────────────────────────────────────────────────────────
# Column-header row of the formatted sheets: bold white text on dark blue.
HEADER_FILL = PatternFill("solid", fgColor="1F4E79")
HEADER_FONT = Font(name="Arial", bold=True, color="FFFFFF", size=10)
# Body fonts; RED_FONT marks values that need attention (e.g. missing counts).
NORMAL_FONT = Font(name="Arial", size=10)
BOLD_FONT   = Font(name="Arial", bold=True, size=10)
RED_FONT    = Font(name="Arial", bold=True, size=10, color="C00000")

# Thin light-grey border applied to all four sides of a cell.
THIN   = Side(style="thin", color="CCCCCC")
BORDER = Border(left=THIN, right=THIN, top=THIN, bottom=THIN)

# Zebra striping for data rows.
EVEN_FILL      = PatternFill("solid", fgColor="EBF3FB")
ODD_FILL       = PatternFill("solid", fgColor="FFFFFF")
# Overview rows that contain at least one "Not Received" sample.
NOTRCV_FILL    = PatternFill("solid", fgColor="FCE4D6")
# NOTE(review): not referenced anywhere in the visible code — confirm whether
# a "Cancelled" highlight is still planned or this can be dropped.
CANCELLED_FILL = PatternFill("solid", fgColor="F2F2F2")
# eQueries sheet: rows whose status is "open".
OPEN_FILL       = PatternFill("solid", fgColor="FFC7CE")
# Overview sheet: rows whose accession has an entry in the open-eQuery map.
OPEN_QUERY_FILL = PatternFill("solid", fgColor="FFD966")
# Accession cells that link to the eQueries sheet.
HYPERLINK_FONT  = Font(name="Arial", size=10, color="0563C1", underline="single")

# Shared cell alignments (both vertically centered).
CENTER = Alignment(horizontal="center", vertical="center")
LEFT   = Alignment(horizontal="left",   vertical="center")


def unique_path(stem):
    """Return a not-yet-existing ``.xlsx`` path inside CREATED_DIR for *stem*.

    The preferred name is ``<stem>.xlsx``. If that file already exists, the
    current time (``HHMM``) is appended. If that name is taken as well, a
    running counter ``(2)``, ``(3)``, ... is added so an existing report is
    never silently overwritten.

    Args:
        stem: File name without directory and without extension.

    Returns:
        Absolute or relative path (depending on CREATED_DIR) as a string.
    """
    path = os.path.join(CREATED_DIR, f"{stem}.xlsx")
    if not os.path.exists(path):
        return path

    tag = datetime.datetime.now().strftime("%H%M")
    candidate = os.path.join(CREATED_DIR, f"{stem} {tag}.xlsx")
    counter = 2
    # The time-tagged name may itself be taken (two runs within one minute).
    while os.path.exists(candidate):
        candidate = os.path.join(CREATED_DIR, f"{stem} {tag} ({counter}).xlsx")
        counter += 1
    return candidate


# ── data load ────────────────────────────────────────────────────────────────

def load_data():
    """Load the sample rows of the newest ``covance_samples`` import for STUDY.

    "Newest" means the highest ``import_id`` in ``iwrs_import`` for this study
    and report type. Rows come back ordered by investigator, patient, visit
    code and container number.

    Returns:
        pandas.DataFrame whose columns are named after the selected fields.
        The frame is empty (but has the columns) when the query yields no rows.
    """
    conn = mysql.connector.connect(
        host=db_config.DB_HOST, port=db_config.DB_PORT,
        user=db_config.DB_USER, password=db_config.DB_PASSWORD,
        database=db_config.DB_NAME,
    )
    sql = """
        SELECT
            investigator_no, investigator_name, patient_no,
            collection_date, protocol_visit_code,
            accession, container_no, container_barcode,
            specimen_type, sample_status,
            label_line1, label_line2
        FROM covance_samples
        WHERE import_id = (
            SELECT MAX(import_id) FROM iwrs_import
            WHERE study = %s AND report_type = 'covance_samples'
        )
        ORDER BY investigator_no, patient_no, protocol_visit_code, container_no
    """
    # try/finally so the cursor and connection are released even when the
    # query or the fetch raises.
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, (STUDY,))
            cols = [d[0] for d in cursor.description]
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn.close()
    return pd.DataFrame(rows, columns=cols)


def load_equery_data():
    """Load the eQuery rows of the newest ``covance_equeries`` import for STUDY.

    "Newest" means the highest ``import_id`` in ``iwrs_import`` for this study
    and report type. Rows are ordered by site code ascending and, within a
    site, by creation date descending (most recent query first).

    Returns:
        pandas.DataFrame whose columns are named after the selected fields.
        The frame is empty (but has the columns) when the query yields no rows.
    """
    conn = mysql.connector.connect(
        host=db_config.DB_HOST, port=db_config.DB_PORT,
        user=db_config.DB_USER, password=db_config.DB_PASSWORD,
        database=db_config.DB_NAME,
    )
    sql = """
        SELECT site_code, investigator_name, subject, visit,
               accession, visit_collection_date, equery_id,
               create_date, response_datetime, issue_type, status,
               time_before_response, user_name, study_role
        FROM covance_equeries
        WHERE import_id = (
            SELECT MAX(import_id) FROM iwrs_import
            WHERE study = %s AND report_type = 'covance_equeries'
        )
        ORDER BY site_code ASC, create_date DESC
    """
    # try/finally so the cursor and connection are released even when the
    # query or the fetch raises.
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, (STUDY,))
            cols = [d[0] for d in cursor.description]
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn.close()
    return pd.DataFrame(rows, columns=cols)


def load_kit_data():
    """Load the kit rows of the newest ``covance_kit_inventory`` import for STUDY.

    "Newest" means the highest ``import_id`` in ``iwrs_import`` for this study
    and report type. Rows are ordered by site code, then by the numeric value
    of ``kit_type`` (``kit_type+0``), then by ``kit_type`` and accession.

    Returns:
        pandas.DataFrame whose columns are named after the selected fields.
        The frame is empty (but has the columns) when the query yields no rows.
    """
    conn = mysql.connector.connect(
        host=db_config.DB_HOST, port=db_config.DB_PORT,
        user=db_config.DB_USER, password=db_config.DB_PASSWORD,
        database=db_config.DB_NAME,
    )
    sql = """
        SELECT site_code, investigator_name, kit_type, description,
               accession, shipped_date, expiration_date, days_to_expiration
        FROM covance_kit_inventory
        WHERE import_id = (
            SELECT MAX(import_id) FROM iwrs_import
            WHERE study = %s AND report_type = 'covance_kit_inventory'
        )
        ORDER BY site_code, kit_type+0, kit_type, accession
    """
    # try/finally so the cursor and connection are released even when the
    # query or the fetch raises.
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, (STUDY,))
            cols = [d[0] for d in cursor.description]
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn.close()
    return pd.DataFrame(rows, columns=cols)


# ── helpers ──────────────────────────────────────────────────────────────────

def test_name(row):
    l1 = str(row["label_line1"]).strip() if pd.notna(row["label_line1"]) else ""
    l2 = str(row["label_line2"]).strip() if pd.notna(row["label_line2"]) else ""
    return f"{l1} {l2}".strip() if l2 else l1

def write_headers(ws, headers, widths, row=2):
    """Write a styled header row and set the matching column widths.

    Args:
        ws: Worksheet to write to.
        headers: Captions, one per column, starting at column 1.
        widths: Column widths paired positionally with *headers*; surplus
            items on either side are ignored.
        row: Sheet row that receives the captions (default 2).
    """
    for col_no, (caption, width) in enumerate(zip(headers, widths), start=1):
        header_cell = ws.cell(row=row, column=col_no, value=caption)
        header_cell.font, header_cell.fill = HEADER_FONT, HEADER_FILL
        header_cell.alignment, header_cell.border = CENTER, BORDER
        letter = get_column_letter(col_no)
        ws.column_dimensions[letter].width = width
    ws.row_dimensions[row].height = 18

def write_title(ws, text, ncols):
    """Put a bold title into row 1, merged across the first *ncols* columns.

    Args:
        ws: Worksheet to write to.
        text: Title text placed in cell A1.
        ncols: Number of columns the merged title cell spans.
    """
    last_letter = get_column_letter(ncols)
    ws.merge_cells(f"A1:{last_letter}1")

    title_cell = ws["A1"]
    title_cell.value = text
    title_cell.font = Font(name="Arial", bold=True, size=12, color="1F4E79")
    title_cell.alignment = Alignment(horizontal="left", vertical="center")

    ws.row_dimensions[1].height = 22


# ── sheet 1: Přehled ─────────────────────────────────────────────────────────

def write_prehled(wb, df, accession_eq_row=None):
    """Create the "Přehled" sheet: one line per site/patient/visit/accession.

    Row 1 holds the title, row 2 the headers, data starts at row 3. Each line
    shows the number of samples in the group and how many of them have the
    status "Received" and "Not Received".

    Args:
        wb: Workbook that receives the new sheet.
        df: Sample rows with the columns investigator_no, investigator_name,
            patient_no, protocol_visit_code, accession, collection_date and
            sample_status.
        accession_eq_row: Optional mapping accession -> row number on the
            "eQueries" sheet. Lines whose accession is in the mapping are
            highlighted and their Accession cell links to that row.
    """
    link_rows = {} if accession_eq_row is None else accession_eq_row

    ws = wb.create_sheet("Přehled")
    ws.sheet_view.showGridLines = False

    stamp = datetime.date.today().strftime("%d-%b-%Y")
    write_title(ws, f"Covance Samples — {STUDY}   ({stamp})", 9)
    write_headers(
        ws,
        ["Site", "Investigátor", "Pacient", "Visit", "Accession",
         "Datum odběru", "Celkem", "Received", "Not Received"],
        [9, 22, 14, 12, 13, 14, 8, 10, 13],
    )

    group_keys = ["investigator_no", "investigator_name", "patient_no",
                  "protocol_visit_code", "accession", "collection_date"]
    counts = df.groupby(group_keys).agg(
        celkem=("sample_status", "count"),
        received=("sample_status", lambda s: (s == "Received").sum()),
        not_received=("sample_status", lambda s: (s == "Not Received").sum()),
    )
    agg = (
        counts.reset_index()
        .sort_values(["investigator_no", "patient_no", "protocol_visit_code"])
        .reset_index(drop=True)
    )

    centered_cols = (1, 4, 5, 6, 7, 8, 9)
    for pos, rec in agg.iterrows():
        sheet_row = pos + 3
        accession = rec["accession"]
        link_target = link_rows.get(accession)  # None when nothing to link to
        has_missing = rec["not_received"] > 0

        # Priority: linked eQuery > missing samples > plain zebra striping.
        if link_target:
            row_fill = OPEN_QUERY_FILL
        elif has_missing:
            row_fill = NOTRCV_FILL
        elif pos % 2 == 0:
            row_fill = EVEN_FILL
        else:
            row_fill = ODD_FILL

        collected = rec["collection_date"]
        if hasattr(collected, "strftime"):
            collected_text = collected.strftime("%d-%b-%Y")
        else:
            collected_text = str(collected)

        line = (
            rec["investigator_no"], rec["investigator_name"], rec["patient_no"],
            rec["protocol_visit_code"], accession, collected_text,
            int(rec["celkem"]), int(rec["received"]), int(rec["not_received"]),
        )
        for col, value in enumerate(line, 1):
            cell = ws.cell(row=sheet_row, column=col, value=value)
            cell.fill = row_fill
            cell.border = BORDER
            cell.alignment = CENTER if col in centered_cols else LEFT

            if col == 5 and link_target:
                # Internal link to the matching row on the eQueries sheet.
                cell.hyperlink = f"#'eQueries'!A{link_target}"
                cell.font = HYPERLINK_FONT
            elif col == 9 and has_missing:
                cell.font = RED_FONT
            else:
                cell.font = NORMAL_FONT
        ws.row_dimensions[sheet_row].height = 16

    ws.freeze_panes = "A3"
    ws.auto_filter.ref = f"A2:I{len(agg) + 2}"


# ── sheet 2: Chybějící ────────────────────────────────────────────────────────

def write_chybejici(wb, df):
    """Create the "Chybějící" sheet listing every "Not Received" sample.

    Row 1 holds the title, row 2 the headers, data starts at row 3. Rows are
    sorted by investigator, patient, visit code and container number.

    Args:
        wb: Workbook that receives the new sheet.
        df: Sample rows; only those with sample_status == "Not Received" are
            written.
    """
    ws = wb.create_sheet("Chybějící")
    ws.sheet_view.showGridLines = False

    stamp = datetime.date.today().strftime("%d-%b-%Y")
    write_title(ws, f"Not Received vzorky — {STUDY}   ({stamp})", 8)
    write_headers(
        ws,
        ["Site", "Pacient", "Visit", "Datum odběru",
         "Accession", "Container", "Typ vzorku", "Test"],
        [9, 14, 12, 14, 13, 10, 22, 30],
    )

    not_received = df["sample_status"] == "Not Received"
    missing = df[not_received].copy()
    missing["test"] = missing.apply(test_name, axis=1)
    sort_cols = ["investigator_no", "patient_no", "protocol_visit_code", "container_no"]
    missing = missing.sort_values(sort_cols).reset_index(drop=True)

    centered_cols = (1, 3, 4, 5, 6)
    for pos, rec in missing.iterrows():
        sheet_row = pos + 3
        stripe = ODD_FILL if pos % 2 else EVEN_FILL

        collected = rec["collection_date"]
        if hasattr(collected, "strftime"):
            collected_text = collected.strftime("%d-%b-%Y")
        else:
            collected_text = str(collected)

        # A missing container number is shown as an empty cell.
        container = rec["container_no"]
        container_value = int(container) if pd.notna(container) else ""

        line = (
            rec["investigator_no"], rec["patient_no"],
            rec["protocol_visit_code"], collected_text,
            rec["accession"], container_value,
            rec["specimen_type"], rec["test"],
        )
        for col, value in enumerate(line, 1):
            cell = ws.cell(row=sheet_row, column=col, value=value)
            cell.fill = stripe
            cell.border = BORDER
            cell.alignment = CENTER if col in centered_cols else LEFT
            cell.font = NORMAL_FONT
        ws.row_dimensions[sheet_row].height = 16

    ws.freeze_panes = "A3"
    ws.auto_filter.ref = f"A2:H{len(missing) + 2}"


# ── sheet 3: Kity (per centrum) ──────────────────────────────────────────────

def kit_sort_key(kt):
    """Return a sort key that orders kit types in three tiers.

    Tier 0: values that convert to int, ordered numerically.
    Tier 1: labels starting with "T-" (any case) followed by an integer,
            ordered by that integer.
    Tier 2: everything else, ordered by its string form.
    """
    def _to_int(value):
        # None signals "not an integer"; other errors propagate unchanged.
        try:
            return int(value)
        except ValueError:
            return None

    number = _to_int(kt)
    if number is not None:
        return (0, number, "")

    label = str(kt)
    if label.upper().startswith("T-"):
        suffix = _to_int(label[2:])
        if suffix is not None:
            return (1, suffix, "")

    return (2, 0, label)

# Styles used by the "Kity" sheet.
# Per-site banner row: bold white text on medium blue.
SITE_HDR_FILL  = PatternFill("solid", fgColor="2E75B6")
SITE_HDR_FONT  = Font(name="Arial", bold=True, color="FFFFFF", size=10)
# Per-site totals row.
TOTAL_FILL     = PatternFill("solid", fgColor="D6E4F0")
# Non-zero counts of kits expiring within the 30-day window.
SOON_FILL      = PatternFill("solid", fgColor="FCE4D6")

def _cell(ws, row, col, value, font, fill, alignment, border):
    c = ws.cell(row=row, column=col, value=value)
    c.font = font; c.fill = fill; c.alignment = alignment; c.border = border
    return c

def write_kity(wb, df_kits):
    """Create the "Kity" sheet: per-site kit counts split by expiration window.

    For every site a banner row is followed by one row per kit type found
    anywhere in *df_kits*, then a totals row and a blank spacer row. Kits are
    counted as "soon" when their expiration date lies between today and
    today + 30 days (inclusive), and as "later" when it lies after that.
    Kits with an earlier or missing expiration date are in neither count.

    Args:
        wb: Workbook that receives the new sheet.
        df_kits: Kit rows with the columns site_code, investigator_name,
            kit_type, description and expiration_date.
    """
    ws = wb.create_sheet("Kity")
    ws.sheet_view.showGridLines = False

    today = datetime.date.today()
    cutoff = today + datetime.timedelta(days=30)
    today_str = today.strftime("%d-%b-%Y")

    def expires_soon(d):
        return d is not None and today <= d <= cutoff

    def expires_later(d):
        return d is not None and d > cutoff

    # Kit types across the whole study, in display order.
    kit_types = sorted(df_kits["kit_type"].dropna().unique(), key=kit_sort_key)
    first_per_type = df_kits.drop_duplicates("kit_type").set_index("kit_type")
    kt_desc = first_per_type["description"].to_dict()

    # Distinct (site, investigator) pairs ordered by site code.
    site_pairs = df_kits[["site_code", "investigator_name"]].drop_duplicates()
    sites = site_pairs.sort_values("site_code").values.tolist()

    # Columns: A=kit type, B=description, C=<=30 days, D=>30 days, E=total.
    for letter, width in (("A", 9), ("B", 28), ("C", 14), ("D", 14), ("E", 10)):
        ws.column_dimensions[letter].width = width

    write_title(ws, f"Kit Inventory — {STUDY}   ({today_str})", 5)

    # Header row 2; no fixed height so the wrapped caption can grow.
    wrapped_center = Alignment(horizontal="center", vertical="center", wrap_text=True)
    captions = (
        "Kit Type",
        "Popis",
        f"Expiruje ≤30 dní\n({cutoff.strftime('%d-%b-%Y')})",
        "Expiruje >30 dní",
        "Celkem",
    )
    for col, txt in enumerate(captions, 1):
        hdr = ws.cell(row=2, column=col, value=txt)
        hdr.font = HEADER_FONT
        hdr.fill = HEADER_FILL
        hdr.alignment = wrapped_center
        hdr.border = BORDER

    cur_row = 3
    for site_code, investigator in sites:
        # Site banner: merge first, then style every cell of the merged range.
        ws.merge_cells(f"A{cur_row}:E{cur_row}")
        banner = ws.cell(row=cur_row, column=1,
                         value=f"{site_code}  —  {investigator}")
        banner.font = SITE_HDR_FONT
        banner.fill = SITE_HDR_FILL
        banner.alignment = LEFT
        banner.border = BORDER
        for col in range(2, 6):
            filler = ws.cell(row=cur_row, column=col)
            filler.fill = SITE_HDR_FILL
            filler.border = BORDER
        ws.row_dimensions[cur_row].height = 17
        cur_row += 1

        # Kits of this site with the expiration reduced to a plain date.
        site_df = df_kits[df_kits["site_code"] == site_code].copy()
        site_df["exp_date"] = pd.to_datetime(site_df["expiration_date"]).dt.date

        site_soon = 0
        site_later = 0
        for kt_idx, kt in enumerate(kit_types):
            exp_dates = site_df.loc[site_df["kit_type"] == kt, "exp_date"]
            soon = int(exp_dates.apply(expires_soon).sum())
            later = int(exp_dates.apply(expires_later).sum())
            site_soon += soon
            site_later += later
            total = soon + later

            stripe = ODD_FILL if kt_idx % 2 else EVEN_FILL
            # Zero counts are written as empty cells.
            row_spec = (
                (kt, BOLD_FONT, stripe, CENTER),
                (kt_desc.get(kt, ""), NORMAL_FONT, stripe, LEFT),
                (soon or None,
                 RED_FONT if soon else NORMAL_FONT,
                 SOON_FILL if soon else stripe,
                 CENTER),
                (later or None, NORMAL_FONT, stripe, CENTER),
                (total or None, BOLD_FONT, stripe, CENTER),
            )
            for col, (value, font, cell_fill, align) in enumerate(row_spec, 1):
                _cell(ws, cur_row, col, value, font, cell_fill, align, BORDER)
            ws.row_dimensions[cur_row].height = 16
            cur_row += 1

        # Site totals row.
        site_total = site_soon + site_later
        totals_spec = (
            ("Celkem", CENTER),
            ("", LEFT),
            (site_soon or None, CENTER),
            (site_later or None, CENTER),
            (site_total or None, CENTER),
        )
        for col, (value, align) in enumerate(totals_spec, 1):
            _cell(ws, cur_row, col, value, BOLD_FONT, TOTAL_FILL, align, BORDER)
        ws.row_dimensions[cur_row].height = 16
        cur_row += 2  # leave one blank row between sites

    ws.freeze_panes = "A3"


# ── sheet 4: eQueries ────────────────────────────────────────────────────────

def write_equeries(wb, df_eq):
    """Create the "eQueries" sheet with one row per eQuery.

    Row 1 holds the title, row 2 the headers; the row for a record is its
    index label + 3. Rows whose status equals "open" (ignoring case and
    surrounding whitespace) get a red fill and a bold dark-red font.

    Args:
        wb: Workbook that receives the new sheet.
        df_eq: eQuery rows with the columns site_code, investigator_name,
            subject, visit, accession, visit_collection_date, equery_id,
            create_date, response_datetime, issue_type, status,
            time_before_response, user_name and study_role.
    """
    ws = wb.create_sheet("eQueries")
    ws.sheet_view.showGridLines = False

    stamp = datetime.date.today().strftime("%d-%b-%Y")
    write_title(ws, f"eQueries — {STUDY}   ({stamp})", 14)
    write_headers(
        ws,
        ["Site", "Investigátor", "Pacient", "Visit", "Accession",
         "Visit Datum", "eQuery ID", "Vytvořeno", "Odpovězeno",
         "Issue Type", "Status", "Čas odpovědi", "Uživatel", "Role"],
        [9, 22, 14, 26, 13, 13, 10, 16, 16, 20, 9, 13, 22, 13],
    )

    def as_text(value, pattern="%d-%b-%Y %H:%M"):
        # Missing values become None; date-like values are formatted with
        # *pattern*; anything else is converted with str().
        try:
            if value is None or pd.isna(value):
                return None
        except (TypeError, ValueError):
            pass
        if hasattr(value, "strftime"):
            return value.strftime(pattern)
        return str(value)

    open_font = Font(name="Arial", bold=True, size=10, color="9C0006")
    centered_cols = (1, 6, 7, 8, 9, 11, 12)

    for idx, rec in df_eq.iterrows():
        sheet_row = idx + 3
        if str(rec.get("status", "")).strip().lower() == "open":
            row_fill, row_font = OPEN_FILL, open_font
        else:
            row_fill = ODD_FILL if idx % 2 else EVEN_FILL
            row_font = NORMAL_FONT

        line = (
            rec["site_code"], rec["investigator_name"], rec["subject"],
            rec["visit"], rec["accession"],
            as_text(rec["visit_collection_date"], "%d-%b-%Y"),
            rec["equery_id"],
            as_text(rec["create_date"]),
            as_text(rec["response_datetime"]),
            rec["issue_type"], rec["status"],
            rec["time_before_response"], rec["user_name"], rec["study_role"],
        )
        for col, raw in enumerate(line, 1):
            # NaN floats are written as empty cells.
            value = None if isinstance(raw, float) and raw != raw else raw
            cell = ws.cell(row=sheet_row, column=col, value=value)
            cell.fill = row_fill
            cell.border = BORDER
            cell.font = row_font
            cell.alignment = CENTER if col in centered_cols else LEFT
        ws.row_dimensions[sheet_row].height = 16

    ws.freeze_panes = "A3"
    ws.auto_filter.ref = f"A2:N{len(df_eq) + 2}"


# ── sheet 5: ZDROJ Vzorky ────────────────────────────────────────────────────

# ── sheet 6: ZDROJ Kity ──────────────────────────────────────────────────────

def write_zdroj_kity(wb, df_kits):
    """Dump the raw kit-inventory rows to the "ZDROJ Kity" sheet.

    Row 1 holds the DataFrame column names, data starts at row 2. Missing
    values are written as empty strings and date-like values as
    ``YYYY-MM-DD`` text.

    Args:
        wb: Workbook that receives the new sheet.
        df_kits: Kit rows; every column is written as-is.
    """
    ws = wb.create_sheet("ZDROJ Kity")
    ws.sheet_view.showGridLines = True

    # Style objects do not depend on the cell; build them once.
    header_font = Font(name="Arial", bold=True, size=9, color="FFFFFF")
    header_fill = PatternFill("solid", fgColor="404040")
    body_font = Font(name="Arial", size=9)

    headers = list(df_kits.columns)
    for c, h in enumerate(headers, 1):
        cell = ws.cell(row=1, column=c, value=h)
        cell.font      = header_font
        cell.fill      = header_fill
        cell.alignment = LEFT
        cell.border    = BORDER
        ws.column_dimensions[get_column_letter(c)].width = 20

    for r_idx, (_, row) in enumerate(df_kits.iterrows(), 2):
        fill = EVEN_FILL if r_idx % 2 == 0 else ODD_FILL
        for c_idx, col in enumerate(headers, 1):
            val = row[col]
            if pd.isna(val):
                val = ""
            elif hasattr(val, "strftime"):
                val = val.strftime("%Y-%m-%d")
            cell = ws.cell(row=r_idx, column=c_idx, value=val)
            cell.font      = body_font
            cell.fill      = fill
            cell.border    = BORDER
            cell.alignment = LEFT

    ws.freeze_panes = "A2"
    if headers:
        # Cover header + all data rows, like the formatted report sheets do,
        # instead of relying on the spreadsheet app to extend a header-only range.
        last_col = get_column_letter(len(headers))
        ws.auto_filter.ref = f"A1:{last_col}{len(df_kits) + 1}"


# ── sheet 5: ZDROJ Vzorky ────────────────────────────────────────────────────

def write_zdroj(wb, df):
    """Dump the raw sample rows to the "ZDROJ Vzorky" sheet.

    Row 1 holds the DataFrame column names, data starts at row 2. Missing
    values are written as empty strings and date-like values as
    ``YYYY-MM-DD`` text.

    Args:
        wb: Workbook that receives the new sheet.
        df: Sample rows; every column is written as-is.
    """
    ws = wb.create_sheet("ZDROJ Vzorky")
    ws.sheet_view.showGridLines = True

    # Style objects do not depend on the cell; build them once.
    header_font = Font(name="Arial", bold=True, size=9, color="FFFFFF")
    header_fill = PatternFill("solid", fgColor="404040")
    body_font = Font(name="Arial", size=9)

    headers = list(df.columns)
    for c, h in enumerate(headers, 1):
        cell = ws.cell(row=1, column=c, value=h)
        cell.font      = header_font
        cell.fill      = header_fill
        cell.alignment = LEFT
        cell.border    = BORDER
        ws.column_dimensions[get_column_letter(c)].width = 18

    for r_idx, (_, row) in enumerate(df.iterrows(), 2):
        fill = EVEN_FILL if r_idx % 2 == 0 else ODD_FILL
        for c_idx, col in enumerate(headers, 1):
            val = row[col]
            if pd.isna(val):
                val = ""
            elif hasattr(val, "strftime"):
                val = val.strftime("%Y-%m-%d")
            cell = ws.cell(row=r_idx, column=c_idx, value=val)
            cell.font      = body_font
            cell.fill      = fill
            cell.border    = BORDER
            cell.alignment = LEFT

    ws.freeze_panes = "A2"
    if headers:
        # Cover header + all data rows, like the formatted report sheets do,
        # instead of relying on the spreadsheet app to extend a header-only range.
        last_col = get_column_letter(len(headers))
        ws.auto_filter.ref = f"A1:{last_col}{len(df) + 1}"


def write_zdroj_equeries(wb, df_eq):
    """Dump the raw eQuery rows to the "ZDROJ eQuery" sheet.

    Row 1 holds the DataFrame column names, data starts at row 2. Missing
    values are written as empty strings and date-like values as
    ``YYYY-MM-DD HH:MM`` text.

    Args:
        wb: Workbook that receives the new sheet.
        df_eq: eQuery rows; every column is written as-is.
    """
    ws = wb.create_sheet("ZDROJ eQuery")
    ws.sheet_view.showGridLines = True

    # Style objects do not depend on the cell; build them once.
    header_font = Font(name="Arial", bold=True, size=9, color="FFFFFF")
    header_fill = PatternFill("solid", fgColor="404040")
    body_font = Font(name="Arial", size=9)

    headers = list(df_eq.columns)
    for c, h in enumerate(headers, 1):
        cell = ws.cell(row=1, column=c, value=h)
        cell.font      = header_font
        cell.fill      = header_fill
        cell.alignment = LEFT
        cell.border    = BORDER
        ws.column_dimensions[get_column_letter(c)].width = 20

    for r_idx, (_, row) in enumerate(df_eq.iterrows(), 2):
        fill = EVEN_FILL if r_idx % 2 == 0 else ODD_FILL
        for c_idx, col in enumerate(headers, 1):
            val = row[col]
            # pd.isna can raise or return an array for unusual values;
            # treat those as "not missing".
            try:
                is_na = pd.isna(val)
            except (TypeError, ValueError):
                is_na = False
            if is_na or val is None:
                val = ""
            elif hasattr(val, "strftime"):
                val = val.strftime("%Y-%m-%d %H:%M")
            cell = ws.cell(row=r_idx, column=c_idx, value=val)
            cell.font      = body_font
            cell.fill      = fill
            cell.border    = BORDER
            cell.alignment = LEFT

    ws.freeze_panes = "A2"
    if headers:
        # Cover header + all data rows, like the formatted report sheets do,
        # instead of relying on the spreadsheet app to extend a header-only range.
        last_col = get_column_letter(len(headers))
        ws.auto_filter.ref = f"A1:{last_col}{len(df_eq) + 1}"


# ── main ─────────────────────────────────────────────────────────────────────

def _open_status_mask(df_eq):
    """Return a boolean Series marking eQuery rows whose status is "open".

    The comparison ignores case and surrounding whitespace, matching the rule
    write_equeries() uses to highlight open rows.
    """
    return df_eq["status"].astype(str).str.strip().str.lower() == "open"


def _open_query_rows(df_eq, open_mask):
    """Map each accession to the eQueries-sheet row of its first Open query.

    The sheet row is the DataFrame index label + 3 (row 1 = title, row 2 =
    header, data from row 3), the same rule write_equeries() uses.
    """
    rows = {}
    for r_idx, acc in df_eq.loc[open_mask, "accession"].items():
        # Skip missing/empty accessions; keep only the first Open occurrence.
        if pd.notna(acc) and acc and acc not in rows:
            rows[acc] = r_idx + 3
    return rows


def main():
    """Load the newest imports from MySQL and write the Excel report.

    Creates CREATED_DIR if needed, loads samples, kit inventory and eQueries,
    writes the seven sheets and saves the workbook under a timestamped,
    collision-free name. Progress is printed to stdout.
    """
    os.makedirs(CREATED_DIR, exist_ok=True)

    print("Načítám data z MySQL...")
    df      = load_data()
    df_kits = load_kit_data()
    df_eq   = load_equery_data()

    open_mask = _open_status_mask(df_eq)
    print(f"  Vzorky:   {len(df)} řádků, {df['patient_no'].nunique()} pacientů")
    print(f"  Kity:     {len(df_kits)} kitů, {df_kits['site_code'].nunique()} center")
    print(f"  eQueries: {len(df_eq)} záznamů ({int(open_mask.sum())} Open)")

    # accession -> row on the eQueries sheet, pointing at the Open query
    # itself (not at a closed query that happens to share the accession).
    accession_eq_row = _open_query_rows(df_eq, open_mask)

    wb = Workbook()
    wb.remove(wb.active)  # drop the default empty sheet

    write_prehled(wb, df, accession_eq_row)
    write_chybejici(wb, df)
    write_kity(wb, df_kits)
    write_equeries(wb, df_eq)
    write_zdroj(wb, df)
    write_zdroj_kity(wb, df_kits)
    write_zdroj_equeries(wb, df_eq)

    now = datetime.datetime.now()
    stamp = now.strftime("%Y-%m-%d %H%M%S")
    out_path = unique_path(f"{stamp} {STUDY} Covance")
    wb.save(out_path)
    print(f"Uloženo: {out_path}")


# Script entry point: generate the report.
# NOTE(review): this call is unguarded, so importing the module also runs the
# report. Consider `if __name__ == "__main__":` once it is confirmed that no
# caller relies on the import-time run.
main()