Initial commit — clean history (removed large test files, browser profiles, Medidata/Clario downloads)

This commit is contained in:
2026-06-01 15:36:31 +02:00
commit bb604e593e
1304 changed files with 116480 additions and 0 deletions
+604
View File
@@ -0,0 +1,604 @@
"""
Covance samples report pro studii 42847922MDD3003.
Čte z MySQL (nejnovější import), generuje Excel s 7 listy:
1. Přehled — agregát per pacient+visit (Received / Not Received / Cancelled)
2. Chybějící — detail Not Received vzorků
3. Kity — kit inventory: centra × typy kitů s expirací
4. eQueries — přehled eQuery dotazů (Open červeně)
5. ZDROJ Vzorky — surová data samples
6. ZDROJ Kity — surová data kit inventory
7. ZDROJ eQuery — surová data eQueries
"""
import os
import datetime
import mysql.connector
import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl.utils import get_column_letter
import db_config
# Study protocol number: bound into the SQL queries, shown in the sheet titles
# and used in the output file name.
STUDY = "42847922MDD3003"
# Directory that contains this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
# Folder the generated workbooks are written to (created in main() if missing).
CREATED_DIR = os.path.join(BASE_DIR, "CreatedReports")
# ── styles ───────────────────────────────────────────────────────────────────
# Header row of the formatted sheets: bold white text on a dark blue fill.
HEADER_FILL = PatternFill("solid", fgColor="1F4E79")
HEADER_FONT = Font(name="Arial", bold=True, color="FFFFFF", size=10)
# Fonts for data cells; RED_FONT marks a non-zero "Not Received" count and
# kits that expire soon.
NORMAL_FONT = Font(name="Arial", size=10)
BOLD_FONT = Font(name="Arial", bold=True, size=10)
RED_FONT = Font(name="Arial", bold=True, size=10, color="C00000")
# Thin light-grey border applied to every styled cell.
THIN = Side(style="thin", color="CCCCCC")
BORDER = Border(left=THIN, right=THIN, top=THIN, bottom=THIN)
# Alternating row fills (zebra striping).
EVEN_FILL = PatternFill("solid", fgColor="EBF3FB")
ODD_FILL = PatternFill("solid", fgColor="FFFFFF")
# Row highlights: NOTRCV_FILL for overview rows with Not Received samples,
# OPEN_FILL for Open rows in the eQueries sheet, OPEN_QUERY_FILL for overview
# rows whose accession has an Open eQuery.
NOTRCV_FILL = PatternFill("solid", fgColor="FCE4D6")
# NOTE(review): CANCELLED_FILL is not referenced in the visible code.
CANCELLED_FILL = PatternFill("solid", fgColor="F2F2F2")
OPEN_FILL = PatternFill("solid", fgColor="FFC7CE")
OPEN_QUERY_FILL = PatternFill("solid", fgColor="FFD966")
# Font of the accession cells that link into the eQueries sheet.
HYPERLINK_FONT = Font(name="Arial", size=10, color="0563C1", underline="single")
# Shared cell alignments.
CENTER = Alignment(horizontal="center", vertical="center")
LEFT = Alignment(horizontal="left", vertical="center")
def unique_path(stem, directory=None):
    """Return a path for ``<stem>.xlsx`` that does not exist yet.

    Candidates are tried in this order:

    1. ``<stem>.xlsx``
    2. ``<stem> HHMM.xlsx`` (current local time)
    3. ``<stem> HHMM (2).xlsx``, ``<stem> HHMM (3).xlsx``, ...

    The numbered step guarantees that an existing report is never overwritten
    when the time-tagged name is already taken as well.

    Args:
        stem: File name without the ``.xlsx`` extension.
        directory: Target folder; defaults to ``CREATED_DIR``.

    Returns:
        Path (str) of a file that did not exist at the time of the check.
    """
    if directory is None:
        directory = CREATED_DIR

    candidate = os.path.join(directory, f"{stem}.xlsx")
    if not os.path.exists(candidate):
        return candidate

    tag = datetime.datetime.now().strftime("%H%M")
    candidate = os.path.join(directory, f"{stem} {tag}.xlsx")
    counter = 2
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f"{stem} {tag} ({counter}).xlsx")
        counter += 1
    return candidate
# ── data load ────────────────────────────────────────────────────────────────
def load_data():
    """Load the sample rows of the latest ``covance_samples`` import for STUDY.

    The latest import is the highest ``import_id`` in ``iwrs_import`` for the
    study with ``report_type = 'covance_samples'``.

    Returns:
        pandas.DataFrame with one row per sample container, ordered by
        investigator, patient, visit code and container number.
    """
    sql = """
        SELECT
            investigator_no, investigator_name, patient_no,
            collection_date, protocol_visit_code,
            accession, container_no, container_barcode,
            specimen_type, sample_status,
            label_line1, label_line2
        FROM covance_samples
        WHERE import_id = (
            SELECT MAX(import_id) FROM iwrs_import
            WHERE study = %s AND report_type = 'covance_samples'
        )
        ORDER BY investigator_no, patient_no, protocol_visit_code, container_no
    """
    conn = mysql.connector.connect(
        host=db_config.DB_HOST, port=db_config.DB_PORT,
        user=db_config.DB_USER, password=db_config.DB_PASSWORD,
        database=db_config.DB_NAME,
    )
    # try/finally so a failing query does not leak the cursor or connection.
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, (STUDY,))
            cols = [d[0] for d in cursor.description]
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn.close()
    return pd.DataFrame(rows, columns=cols)
def load_equery_data():
    """Load the eQuery rows of the latest ``covance_equeries`` import for STUDY.

    The latest import is the highest ``import_id`` in ``iwrs_import`` for the
    study with ``report_type = 'covance_equeries'``.

    Returns:
        pandas.DataFrame ordered by site code (ascending) and creation date
        (newest first).
    """
    sql = """
        SELECT site_code, investigator_name, subject, visit,
               accession, visit_collection_date, equery_id,
               create_date, response_datetime, issue_type, status,
               time_before_response, user_name, study_role
        FROM covance_equeries
        WHERE import_id = (
            SELECT MAX(import_id) FROM iwrs_import
            WHERE study = %s AND report_type = 'covance_equeries'
        )
        ORDER BY site_code ASC, create_date DESC
    """
    conn = mysql.connector.connect(
        host=db_config.DB_HOST, port=db_config.DB_PORT,
        user=db_config.DB_USER, password=db_config.DB_PASSWORD,
        database=db_config.DB_NAME,
    )
    # try/finally so a failing query does not leak the cursor or connection.
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, (STUDY,))
            cols = [d[0] for d in cursor.description]
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn.close()
    return pd.DataFrame(rows, columns=cols)
def load_kit_data():
    """Load the kit rows of the latest ``covance_kit_inventory`` import for STUDY.

    The latest import is the highest ``import_id`` in ``iwrs_import`` for the
    study with ``report_type = 'covance_kit_inventory'``.

    Returns:
        pandas.DataFrame with one row per kit, ordered by site code, kit type
        (numeric value first, then text) and accession.
    """
    sql = """
        SELECT site_code, investigator_name, kit_type, description,
               accession, shipped_date, expiration_date, days_to_expiration
        FROM covance_kit_inventory
        WHERE import_id = (
            SELECT MAX(import_id) FROM iwrs_import
            WHERE study = %s AND report_type = 'covance_kit_inventory'
        )
        ORDER BY site_code, kit_type+0, kit_type, accession
    """
    conn = mysql.connector.connect(
        host=db_config.DB_HOST, port=db_config.DB_PORT,
        user=db_config.DB_USER, password=db_config.DB_PASSWORD,
        database=db_config.DB_NAME,
    )
    # try/finally so a failing query does not leak the cursor or connection.
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(sql, (STUDY,))
            cols = [d[0] for d in cursor.description]
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn.close()
    return pd.DataFrame(rows, columns=cols)
# ── helpers ──────────────────────────────────────────────────────────────────
def test_name(row):
l1 = str(row["label_line1"]).strip() if pd.notna(row["label_line1"]) else ""
l2 = str(row["label_line2"]).strip() if pd.notna(row["label_line2"]) else ""
return f"{l1} {l2}".strip() if l2 else l1
def write_headers(ws, headers, widths, row=2):
    """Write a styled header row and set the matching column widths.

    Args:
        ws: Worksheet to write into.
        headers: Header captions, one per column starting at column A.
        widths: Column widths, paired positionally with ``headers``.
        row: Row number that receives the headers.
    """
    for col_no, (caption, width) in enumerate(zip(headers, widths), start=1):
        header_cell = ws.cell(row=row, column=col_no, value=caption)
        header_cell.border = BORDER
        header_cell.alignment = CENTER
        header_cell.fill = HEADER_FILL
        header_cell.font = HEADER_FONT
        column_letter = get_column_letter(col_no)
        ws.column_dimensions[column_letter].width = width
    ws.row_dimensions[row].height = 18
def write_title(ws, text, ncols):
    """Put a sheet title into row 1, merged across the first ``ncols`` columns.

    Args:
        ws: Worksheet to write into.
        text: Title text.
        ncols: Number of columns (from A) the title cell spans.
    """
    # Same range as "A1:<last column>1", addressed by coordinates.
    ws.merge_cells(start_row=1, start_column=1, end_row=1, end_column=ncols)
    title_cell = ws.cell(row=1, column=1)
    title_cell.value = text
    title_cell.alignment = Alignment(horizontal="left", vertical="center")
    title_cell.font = Font(name="Arial", bold=True, size=12, color="1F4E79")
    ws.row_dimensions[1].height = 22
# ── sheet 1: Přehled ─────────────────────────────────────────────────────────
def write_prehled(wb, df, accession_eq_row=None):
    """Write the "Přehled" sheet: sample counts per site / patient / visit / accession.

    Each output row aggregates the samples of one
    (investigator, patient, visit, accession, collection date) group into the
    total count and the counts of "Received" and "Not Received" samples.

    Row colouring, in priority order: accession present in
    ``accession_eq_row`` (its accession cell also links to the given row of
    the "eQueries" sheet), at least one Not Received sample, zebra striping.

    Args:
        wb: openpyxl workbook the sheet is appended to.
        df: Sample rows as returned by load_data().
        accession_eq_row: Optional mapping accession -> row number in the
            "eQueries" sheet.
    """
    if accession_eq_row is None:
        accession_eq_row = {}

    ws = wb.create_sheet("Přehled")
    ws.sheet_view.showGridLines = False
    today = datetime.date.today().strftime("%d-%b-%Y")
    write_title(ws, f"Covance Samples — {STUDY} ({today})", 9)
    headers = ["Site", "Investigátor", "Pacient", "Visit", "Accession",
               "Datum odběru", "Celkem", "Received", "Not Received"]
    widths = [9, 22, 14, 12, 13, 14, 8, 10, 13]
    write_headers(ws, headers, widths)

    group_keys = ["investigator_no", "investigator_name", "patient_no",
                  "protocol_visit_code", "accession", "collection_date"]
    flagged = df.assign(
        _received=df["sample_status"] == "Received",
        _not_received=df["sample_status"] == "Not Received",
    )
    agg = (
        # dropna=False: groupby drops groups with a missing key by default,
        # which would silently hide samples without accession/collection date.
        flagged.groupby(group_keys, dropna=False)
        .agg(
            celkem=("sample_status", "count"),
            received=("_received", "sum"),
            not_received=("_not_received", "sum"),
        )
        .reset_index()
        .sort_values(["investigator_no", "patient_no", "protocol_visit_code"])
        .reset_index(drop=True)
    )

    centered_cols = (1, 4, 5, 6, 7, 8, 9)
    for r_idx, row in agg.iterrows():
        excel_row = r_idx + 3  # row 1 = title, row 2 = headers
        has_missing = row["not_received"] > 0
        accession = row["accession"]
        # None when the accession has no linked eQuery row.
        eq_row = None if pd.isna(accession) else accession_eq_row.get(accession)

        if eq_row:
            fill = OPEN_QUERY_FILL
        elif has_missing:
            fill = NOTRCV_FILL
        else:
            fill = EVEN_FILL if r_idx % 2 == 0 else ODD_FILL

        col_date = row["collection_date"]
        if pd.isna(col_date):
            date_str = None
        elif hasattr(col_date, "strftime"):
            date_str = col_date.strftime("%d-%b-%Y")
        else:
            date_str = str(col_date)

        values = [
            row["investigator_no"], row["investigator_name"], row["patient_no"],
            row["protocol_visit_code"], accession, date_str,
            int(row["celkem"]), int(row["received"]), int(row["not_received"]),
        ]
        for c_idx, val in enumerate(values, 1):
            # Missing group keys become blank cells rather than NaN/"None".
            if not isinstance(val, str) and pd.isna(val):
                val = None
            cell = ws.cell(row=excel_row, column=c_idx, value=val)
            cell.fill = fill
            cell.border = BORDER
            cell.alignment = CENTER if c_idx in centered_cols else LEFT
            if c_idx == 5 and eq_row:
                cell.hyperlink = f"#'eQueries'!A{eq_row}"
                cell.font = HYPERLINK_FONT
            elif c_idx == 9 and has_missing:
                cell.font = RED_FONT
            else:
                cell.font = NORMAL_FONT
        ws.row_dimensions[excel_row].height = 16

    ws.freeze_panes = "A3"
    ws.auto_filter.ref = f"A2:I{len(agg) + 2}"
# ── sheet 2: Chybějící ────────────────────────────────────────────────────────
def write_chybejici(wb, df):
    """Write the "Chybějící" sheet: one row per sample with status "Not Received".

    Args:
        wb: openpyxl workbook the sheet is appended to.
        df: Sample rows as returned by load_data().
    """
    ws = wb.create_sheet("Chybějící")
    ws.sheet_view.showGridLines = False
    today = datetime.date.today().strftime("%d-%b-%Y")
    write_title(ws, f"Not Received vzorky — {STUDY} ({today})", 8)
    headers = ["Site", "Pacient", "Visit", "Datum odběru",
               "Accession", "Container", "Typ vzorku", "Test"]
    widths = [9, 14, 12, 14, 13, 10, 22, 30]
    write_headers(ws, headers, widths)

    missing = df[df["sample_status"] == "Not Received"].copy()
    # A plain list instead of DataFrame.apply(axis=1): on an empty frame apply
    # may hand back a DataFrame, which cannot be assigned to a single column.
    missing["test"] = [test_name(sample) for _, sample in missing.iterrows()]
    missing = missing.sort_values(
        ["investigator_no", "patient_no", "protocol_visit_code", "container_no"]
    ).reset_index(drop=True)

    centered_cols = (1, 3, 4, 5, 6)
    for r_idx, row in missing.iterrows():
        excel_row = r_idx + 3  # row 1 = title, row 2 = headers
        fill = EVEN_FILL if r_idx % 2 == 0 else ODD_FILL

        col_date = row["collection_date"]
        if pd.isna(col_date):
            date_str = None  # blank cell instead of the text "None"/"NaT"
        elif hasattr(col_date, "strftime"):
            date_str = col_date.strftime("%d-%b-%Y")
        else:
            date_str = str(col_date)

        values = [
            row["investigator_no"], row["patient_no"],
            row["protocol_visit_code"], date_str,
            row["accession"],
            int(row["container_no"]) if pd.notna(row["container_no"]) else "",
            row["specimen_type"], row["test"],
        ]
        for c_idx, val in enumerate(values, 1):
            cell = ws.cell(row=excel_row, column=c_idx, value=val)
            cell.fill = fill
            cell.border = BORDER
            cell.alignment = CENTER if c_idx in centered_cols else LEFT
            cell.font = NORMAL_FONT
        ws.row_dimensions[excel_row].height = 16

    ws.freeze_panes = "A3"
    ws.auto_filter.ref = f"A2:H{len(missing) + 2}"
# ── sheet 3: Kity (per centrum) ──────────────────────────────────────────────
def kit_sort_key(kt):
    """Return a sort key that orders kit types in three buckets.

    Bucket 0: values ``int()`` accepts, ordered numerically.
    Bucket 1: ``T-<number>`` (prefix matched case-insensitively), ordered by
    the number.
    Bucket 2: everything else, ordered by its text.
    """
    text = str(kt)
    candidates = [(0, kt)]
    if text.upper().startswith("T-"):
        candidates.append((1, text[2:]))
    for bucket, raw in candidates:
        try:
            number = int(raw)
        except ValueError:
            continue
        return (bucket, number, "")
    return (2, 0, text)
# Styles used by write_kity().
# Site header band: bold white text on a mid-blue fill.
SITE_HDR_FILL = PatternFill("solid", fgColor="2E75B6")
SITE_HDR_FONT = Font(name="Arial", bold=True, color="FFFFFF", size=10)
# Background of the per-site "Celkem" (total) row.
TOTAL_FILL = PatternFill("solid", fgColor="D6E4F0")
# Highlight for a non-zero count of kits expiring within the next 30 days.
SOON_FILL = PatternFill("solid", fgColor="FCE4D6")
def _cell(ws, row, col, value, font, fill, alignment, border):
c = ws.cell(row=row, column=col, value=value)
c.font = font; c.fill = fill; c.alignment = alignment; c.border = border
return c
def write_kity(wb, df_kits):
    """Write the "Kity" sheet: per-site kit counts by kit type and expiry bucket.

    For every site a header band is followed by one row per kit type found
    anywhere in ``df_kits`` and a "Celkem" total row. Kits are counted in two
    buckets: expiring between today and today + 30 days (inclusive), and
    expiring later. Kits without an expiration date, or with one before
    today, fall into neither bucket.

    Args:
        wb: openpyxl workbook the sheet is appended to.
        df_kits: Kit rows as returned by load_kit_data().
    """
    ws = wb.create_sheet("Kity")
    ws.sheet_view.showGridLines = False
    today = datetime.date.today()
    cutoff = today + datetime.timedelta(days=30)
    today_str = today.strftime("%d-%b-%Y")

    # Classify every kit once. `.dt.date` yields NaT (not None) for a missing
    # date, and ordering NaT against datetime.date can raise, so missing
    # values are excluded with pd.notna() before comparing.
    kits = df_kits.copy()
    exp_dates = pd.to_datetime(kits["expiration_date"]).dt.date
    kits["_soon"] = exp_dates.apply(
        lambda d: pd.notna(d) and today <= d <= cutoff).astype(bool)
    kits["_later"] = exp_dates.apply(
        lambda d: pd.notna(d) and d > cutoff).astype(bool)

    # Kit types across the whole study, in display order.
    kit_types = sorted(kits["kit_type"].dropna().unique(), key=kit_sort_key)
    kt_desc = (kits.drop_duplicates("kit_type")
               .set_index("kit_type")["description"].to_dict())

    # One block per site code (first investigator name wins), so a site whose
    # investigator name is spelled two ways is not rendered twice.
    sites = (kits[["site_code", "investigator_name"]]
             .drop_duplicates("site_code")
             .sort_values("site_code")
             .values.tolist())

    # Columns: A=kit type, B=description, C=≤30 days, D=>30 days, E=total.
    ws.column_dimensions["A"].width = 9
    ws.column_dimensions["B"].width = 28
    ws.column_dimensions["C"].width = 14
    ws.column_dimensions["D"].width = 14
    ws.column_dimensions["E"].width = 10
    write_title(ws, f"Kit Inventory — {STUDY} ({today_str})", 5)

    # Sub-header (row 2): no fixed height, the caption in column C wraps.
    wrap_center = Alignment(horizontal="center", vertical="center", wrap_text=True)
    for col, txt in [(1, "Kit Type"), (2, "Popis"),
                     (3, f"Expiruje ≤30 dní\n({cutoff.strftime('%d-%b-%Y')})"),
                     (4, "Expiruje >30 dní"),
                     (5, "Celkem")]:
        c = ws.cell(row=2, column=col, value=txt)
        c.font = HEADER_FONT
        c.fill = HEADER_FILL
        c.alignment = wrap_center
        c.border = BORDER

    cur_row = 3
    for site_code, investigator in sites:
        # ── site header ──────────────────────────────────────────────────────
        # Separate the code from the name; skip the name when it is missing.
        label = f"{site_code}"
        if pd.notna(investigator) and str(investigator).strip():
            label = f"{label} — {investigator}"
        ws.merge_cells(f"A{cur_row}:E{cur_row}")
        c = ws.cell(row=cur_row, column=1, value=label)
        c.font = SITE_HDR_FONT
        c.fill = SITE_HDR_FILL
        c.alignment = LEFT
        c.border = BORDER
        for col in range(2, 6):
            ws.cell(row=cur_row, column=col).fill = SITE_HDR_FILL
            ws.cell(row=cur_row, column=col).border = BORDER
        ws.row_dimensions[cur_row].height = 17
        cur_row += 1

        # Kits of this site.
        site_df = kits[kits["site_code"] == site_code]
        site_soon = 0
        site_later = 0
        for kt_idx, kt in enumerate(kit_types):
            kt_df = site_df[site_df["kit_type"] == kt]
            soon = int(kt_df["_soon"].sum())
            later = int(kt_df["_later"].sum())
            site_soon += soon
            site_later += later
            total = soon + later
            fill = EVEN_FILL if kt_idx % 2 == 0 else ODD_FILL
            # Zero counts are written as empty cells.
            _cell(ws, cur_row, 1, kt, BOLD_FONT, fill, CENTER, BORDER)
            _cell(ws, cur_row, 2, kt_desc.get(kt, ""), NORMAL_FONT, fill, LEFT, BORDER)
            _cell(ws, cur_row, 3, soon if soon else None,
                  RED_FONT if soon else NORMAL_FONT,
                  SOON_FILL if soon else fill, CENTER, BORDER)
            _cell(ws, cur_row, 4, later if later else None,
                  NORMAL_FONT, fill, CENTER, BORDER)
            _cell(ws, cur_row, 5, total if total else None,
                  BOLD_FONT, fill, CENTER, BORDER)
            ws.row_dimensions[cur_row].height = 16
            cur_row += 1

        # ── site total ───────────────────────────────────────────────────────
        site_total = site_soon + site_later
        _cell(ws, cur_row, 1, "Celkem", BOLD_FONT, TOTAL_FILL, CENTER, BORDER)
        _cell(ws, cur_row, 2, "", BOLD_FONT, TOTAL_FILL, LEFT, BORDER)
        _cell(ws, cur_row, 3, site_soon if site_soon else None,
              BOLD_FONT, TOTAL_FILL, CENTER, BORDER)
        _cell(ws, cur_row, 4, site_later if site_later else None,
              BOLD_FONT, TOTAL_FILL, CENTER, BORDER)
        _cell(ws, cur_row, 5, site_total if site_total else None,
              BOLD_FONT, TOTAL_FILL, CENTER, BORDER)
        ws.row_dimensions[cur_row].height = 16
        cur_row += 2  # leave an empty row between sites

    ws.freeze_panes = "A3"
# ── sheet 4: eQueries ────────────────────────────────────────────────────────
def write_equeries(wb, df_eq):
    """Write the "eQueries" sheet: one formatted row per eQuery record.

    Rows whose status is "open" (trimmed, case-insensitive) get the OPEN_FILL
    background and a bold dark-red font; other rows are zebra-striped.

    Args:
        wb: openpyxl workbook the sheet is appended to.
        df_eq: eQuery rows as returned by load_equery_data().
    """
    ws = wb.create_sheet("eQueries")
    ws.sheet_view.showGridLines = False
    today = datetime.date.today().strftime("%d-%b-%Y")
    write_title(ws, f"eQueries — {STUDY} ({today})", 14)
    headers = ["Site", "Investigátor", "Pacient", "Visit", "Accession",
               "Visit Datum", "eQuery ID", "Vytvořeno", "Odpovězeno",
               "Issue Type", "Status", "Čas odpovědi", "Uživatel", "Role"]
    widths = [9, 22, 14, 26, 13, 13, 10, 16, 16, 20, 9, 13, 22, 13]
    write_headers(ws, headers, widths)

    def fmt_dt(val, fmt="%d-%b-%Y %H:%M"):
        """Format a date/datetime as text; missing values become None."""
        is_nan_float = isinstance(val, float) and val != val
        if val is None or is_nan_float:
            return None
        try:
            missing = pd.isna(val)
        except (TypeError, ValueError):
            missing = False
        else:
            # Truth-testing is kept inside the guard as well: a non-scalar
            # result raises ValueError here and is treated as "not missing".
            try:
                missing = bool(missing)
            except (TypeError, ValueError):
                missing = False
        if missing:
            return None
        return val.strftime(fmt) if hasattr(val, "strftime") else str(val)

    centered_cols = frozenset((1, 6, 7, 8, 9, 11, 12))
    for idx, record in df_eq.iterrows():
        target_row = idx + 3  # row 1 = title, row 2 = headers
        open_query = str(record.get("status", "")).strip().lower() == "open"
        if open_query:
            row_fill = OPEN_FILL
            row_font = Font(name="Arial", bold=True, size=10, color="9C0006")
        else:
            row_fill = ODD_FILL if idx % 2 else EVEN_FILL
            row_font = NORMAL_FONT

        cells = [
            record["site_code"], record["investigator_name"], record["subject"],
            record["visit"], record["accession"],
            fmt_dt(record["visit_collection_date"], "%d-%b-%Y"),
            record["equery_id"],
            fmt_dt(record["create_date"]),
            fmt_dt(record["response_datetime"]),
            record["issue_type"], record["status"],
            record["time_before_response"], record["user_name"], record["study_role"],
        ]
        for col_no, content in enumerate(cells, start=1):
            # NaN floats are written as empty cells.
            if isinstance(content, float) and content != content:
                content = None
            target = ws.cell(row=target_row, column=col_no, value=content)
            target.fill = row_fill
            target.border = BORDER
            target.font = row_font
            target.alignment = CENTER if col_no in centered_cols else LEFT
        ws.row_dimensions[target_row].height = 16

    ws.freeze_panes = "A3"
    ws.auto_filter.ref = f"A2:N{len(df_eq) + 2}"
# ── sheets 5–7: ZDROJ (raw data dumps) ───────────────────────────────────────
def _write_raw_sheet(wb, title, df, col_width, date_fmt):
    """Dump ``df`` unchanged into a new sheet: header in row 1, data from row 2.

    Args:
        wb: openpyxl workbook the sheet is appended to.
        title: Sheet name.
        df: Data to dump; its column names become the header row.
        col_width: Width applied to every column.
        date_fmt: ``strftime`` format used for values that have ``strftime``.
    """
    ws = wb.create_sheet(title)
    ws.sheet_view.showGridLines = True
    headers = list(df.columns)

    # Style objects are built once instead of once per cell.
    header_font = Font(name="Arial", bold=True, size=9, color="FFFFFF")
    header_fill = PatternFill("solid", fgColor="404040")
    body_font = Font(name="Arial", size=9)

    for c, h in enumerate(headers, 1):
        cell = ws.cell(row=1, column=c, value=h)
        cell.font = header_font
        cell.fill = header_fill
        cell.alignment = LEFT
        cell.border = BORDER
        ws.column_dimensions[get_column_letter(c)].width = col_width

    for r_idx, (_, row) in enumerate(df.iterrows(), 2):
        fill = EVEN_FILL if r_idx % 2 == 0 else ODD_FILL
        for c_idx, col in enumerate(headers, 1):
            val = row[col]
            # pd.isna() on a non-scalar gives an array whose truth value
            # raises; treat such values as present.
            try:
                is_na = bool(pd.isna(val))
            except (TypeError, ValueError):
                is_na = False
            if is_na or val is None:
                val = ""
            elif hasattr(val, "strftime"):
                val = val.strftime(date_fmt)
            cell = ws.cell(row=r_idx, column=c_idx, value=val)
            cell.font = body_font
            cell.fill = fill
            cell.border = BORDER
            cell.alignment = LEFT

    ws.freeze_panes = "A2"
    # Filter range spans header + data rows, like the formatted sheets.
    ws.auto_filter.ref = f"A1:{get_column_letter(len(headers))}{len(df) + 1}"


# ── sheet 6: ZDROJ Kity ──────────────────────────────────────────────────────
def write_zdroj_kity(wb, df_kits):
    """Write the "ZDROJ Kity" sheet with the raw kit inventory rows."""
    _write_raw_sheet(wb, "ZDROJ Kity", df_kits, 20, "%Y-%m-%d")


# ── sheet 5: ZDROJ Vzorky ────────────────────────────────────────────────────
def write_zdroj(wb, df):
    """Write the "ZDROJ Vzorky" sheet with the raw sample rows."""
    _write_raw_sheet(wb, "ZDROJ Vzorky", df, 18, "%Y-%m-%d")


# ── sheet 7: ZDROJ eQuery ────────────────────────────────────────────────────
def write_zdroj_equeries(wb, df_eq):
    """Write the "ZDROJ eQuery" sheet with the raw eQuery rows."""
    _write_raw_sheet(wb, "ZDROJ eQuery", df_eq, 20, "%Y-%m-%d %H:%M")
# ── main ─────────────────────────────────────────────────────────────────────
def _open_query_mask(df_eq):
    """Return a boolean Series marking the eQuery rows whose status is Open.

    Uses the same normalisation as write_equeries(): trimmed, case-insensitive.
    """
    return df_eq["status"].astype(str).str.strip().str.lower() == "open"


def main():
    """Load the latest imports from MySQL and write the 7-sheet Excel report.

    The workbook is saved into CREATED_DIR under a timestamped name and the
    resulting path is printed.
    """
    os.makedirs(CREATED_DIR, exist_ok=True)
    print("Načítám data z MySQL...")
    df = load_data()
    df_kits = load_kit_data()
    df_eq = load_equery_data()
    open_mask = _open_query_mask(df_eq)
    print(f" Vzorky: {len(df)} řádků, {df['patient_no'].nunique()} pacientů")
    print(f" Kity: {len(df_kits)} kitů, {df_kits['site_code'].nunique()} center")
    print(f" eQueries: {len(df_eq)} záznamů ({int(open_mask.sum())} Open)")

    # Mapping accession → row in the eQueries sheet of its first *Open* query.
    # Only Open rows are scanned, so the link never lands on a closed query
    # that happens to precede the Open one for the same accession.
    accession_eq_row = {}
    for r_idx, acc in df_eq.loc[open_mask, "accession"].items():
        if pd.isna(acc) or not acc:
            continue
        # Row 1 = title, row 2 = headers, data starts at row 3.
        accession_eq_row.setdefault(acc, r_idx + 3)

    wb = Workbook()
    wb.remove(wb.active)  # drop the default empty sheet
    write_prehled(wb, df, accession_eq_row)
    write_chybejici(wb, df)
    write_kity(wb, df_kits)
    write_equeries(wb, df_eq)
    write_zdroj(wb, df)
    write_zdroj_kity(wb, df_kits)
    write_zdroj_equeries(wb, df_eq)

    now = datetime.datetime.now()
    stamp = now.strftime("%Y-%m-%d %H%M%S")
    out_path = unique_path(f"{stamp} {STUDY} Covance")
    wb.save(out_path)
    print(f"Uloženo: {out_path}")
# Script entry point: generates the report.
# NOTE(review): this also runs when the module is imported; consider an
# `if __name__ == "__main__":` guard once it is confirmed that no caller
# relies on import-time execution.
main()