This commit is contained in:
2026-06-16 14:32:28 +02:00
parent b825e4ee7c
commit f385d7bf0b
95 changed files with 43120 additions and 0 deletions
+530
View File
@@ -0,0 +1,530 @@
# create_report_v2.0.py — v2.0 — 2026-05-29
# UCO3001 Covance specimen & kit report — zdroj dat: MongoDB (covance + edc)
import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Border, Side, Alignment
from openpyxl.utils import get_column_letter
from datetime import date, datetime
from pymongo import MongoClient
# ── Configuration ──────────────────────────────────────────────────────────────
MONGO_URI = "mongodb://192.168.1.76:27017"
out_dir = "U:/Dropbox/!!!Days/Downloads Z230/"

# ── MongoDB connection ─────────────────────────────────────────────────────────
client = MongoClient(MONGO_URI)
covance_db, edc_db = client["covance"], client["edc"]

# ── Load data from MongoDB ─────────────────────────────────────────────────────
print("Načítám data z MongoDB...")

# Each collection is read in full; only the "fields" sub-document of every
# record becomes a DataFrame row.
samples_docs = list(covance_db["allsamples"].find())
df = pd.DataFrame([sample["fields"] for sample in samples_docs])
df = df.reset_index(drop=True)
print(f" allsamples: {len(df)} záznamů")

kit_docs = list(covance_db["kits"].find())
kit_df_raw = pd.DataFrame([kit_doc["fields"] for kit_doc in kit_docs])
kit_df_raw = kit_df_raw.reset_index(drop=True)
print(f" kits: {len(kit_df_raw)} záznamů")

# EDC visit documents are nested; flatten the values the report needs into one
# dict per visit. The two "fields" entries are optional (None when absent).
edc_docs = list(edc_db["UCO3001.DateofVisit"].find())
edc_rows = [
    {
        "SiteNumber": visit["site"]["number"],
        "Subject": visit["subject"]["label"],
        "InstanceName": visit["form"]["instanceName"],
        "Field4Value": visit["fields"].get("Visit Start Date"),
        "Field5Value": visit["fields"].get("Type of Contact"),
    }
    for visit in edc_docs
]
edc_df_raw = pd.DataFrame(edc_rows)
print(f" DateofVisit: {len(edc_df_raw)} záznamů")
# ── Output file ────────────────────────────────────────────────────────────────
# The file name is prefixed with the generation time so reports never collide.
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
out_filename = f"{timestamp} 77242113UCO3001 CZE Labcorp samples and kit inventory report.xlsx"
out_path = "".join((out_dir, out_filename))

# ── Data preparation — allsamples ──────────────────────────────────────────────
_patient_numbers = df['Patient No.'].dropna()
all_patients = sorted(_patient_numbers.unique())

# Per-visit slices reused by the sample overview sheet.
_visit_codes = df['Protocol Visit Code']
bxscr = df[_visit_codes == 'BXSCR']
dna = df[_visit_codes == 'DNA']
def fmt_date(val):
    """Convert a raw sample date into a naive ``datetime``.

    Accepts ``None``/NaN, ``datetime`` instances and strings. Strings are
    tried against the formats 'DD-Mon-YYYY', ISO date-time and ISO date, then
    handed to ``pd.to_datetime`` as a last resort.

    Returns:
        A ``datetime`` without tzinfo, or ``None`` when the value is missing
        or cannot be parsed. ``NaT`` is never returned.
    """
    if val is None:
        return None
    if isinstance(val, float) and pd.isna(val):
        return None
    if isinstance(val, datetime):
        # pd.NaT is a datetime subclass; treat it as "no date".
        if pd.isna(val):
            return None
        # The tzinfo is dropped without shifting the clock time.
        return val.replace(tzinfo=None)
    if isinstance(val, str):
        text = val.strip()
        for fmt in ('%d-%b-%Y', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                pass
    # Best-effort fallback: any failure here means "no usable date".
    try:
        parsed = pd.to_datetime(val)
        # pd.to_datetime yields NaT for empty / NaT-like input; NaT is truthy
        # and not a real date, so it must not leak to the callers.
        if pd.isna(parsed):
            return None
        return parsed.to_pydatetime().replace(tzinfo=None)
    except Exception:
        return None
# Sample statuses that count as "we have the specimen".
OK_STATUSES = {'Received', 'In Inventory', 'Shipped'}


def get_specimen_info(visit_df, patient, specimen_type=None):
    """Find the first acceptable specimen of a patient in ``visit_df``.

    Args:
        visit_df: sample rows (already limited to one visit by the caller).
        patient: value matched against the 'Patient No.' column.
        specimen_type: optional 'Specimen Type' to require; falsy = any type.

    Returns:
        ``(receipt_date, sheet_row)`` for the first matching row whose
        'Sample Status' is in ``OK_STATUSES``; ``sheet_row`` is the frame
        index label plus 2. ``('', None)`` when nothing matches.
    """
    keep = (visit_df['Patient No.'] == patient) & visit_df['Sample Status'].isin(OK_STATUSES)
    if specimen_type:
        keep &= visit_df['Specimen Type'] == specimen_type
    hits = visit_df[keep]
    if hits.empty:
        return '', None
    first_hit = hits.iloc[0]
    return fmt_date(first_hit['Container Receipt Date']), hits.index[0] + 2
def get_label_info(patient, label_code, visit_code):
    """Find the first acceptable container of a patient by label and visit.

    Searches the module-level ``df`` for rows matching the patient, the
    'Protocol Visit Code' and the 'Container Label Line 1', restricted to
    statuses in ``OK_STATUSES``.

    Returns:
        ``(receipt_date, sheet_row)`` for the first match, where
        ``sheet_row`` is the ``df`` index label plus 2; ``('', None)`` when
        nothing matches.
    """
    wanted = (
        df['Patient No.'].eq(patient)
        & df['Protocol Visit Code'].eq(visit_code)
        & df['Container Label Line 1'].eq(label_code)
        & df['Sample Status'].isin(OK_STATUSES)
    )
    hits = df[wanted]
    if hits.empty:
        return '', None
    first_hit = hits.iloc[0]
    return fmt_date(first_hit['Container Receipt Date']), hits.index[0] + 2
# ── Data preparation — kit inventory ───────────────────────────────────────────
# Work on an independent copy of the kits whose Country is "CZE", so the column
# rewrites that follow do not touch kit_df_raw.
cze = kit_df_raw[kit_df_raw["Country"] == "CZE"].copy()
def parse_kit_date(val):
    """Parse a kit date written like 'Mar 17, 2026'.

    Returns:
        ``None`` for ``None``/NaN or text in any other format; ``datetime``
        inputs are returned with their tzinfo removed; otherwise the parsed
        naive ``datetime``.
    """
    is_missing = val is None or (isinstance(val, float) and pd.isna(val))
    if is_missing:
        return None
    if isinstance(val, datetime):
        return val.replace(tzinfo=None)
    text = str(val).strip()
    try:
        parsed = datetime.strptime(text, "%b %d, %Y")
    except ValueError:
        parsed = None
    return parsed
# Normalise the two date columns, make the day counter numeric (unparseable
# values become NaN) and order the listing by site, kit type and expiry.
for _kit_date_col in ("Shipped Date", "Expiration Date"):
    cze[_kit_date_col] = cze[_kit_date_col].apply(parse_kit_date)
cze["Days to Expiration"] = pd.to_numeric(cze["Days to Expiration"], errors="coerce")
_kit_sort_keys = ["Site", "Kit Type", "Expiration Date"]
cze = cze.sort_values(_kit_sort_keys).reset_index(drop=True)
# Midnight of the current day; reference point for the expiry buckets.
today_dt = datetime.combine(date.today(), datetime.min.time())


def bucket(exp_date):
    """Classify an expiration date relative to ``today_dt``.

    Returns:
        ``"soon"`` when the date is at most 30 days after ``today_dt`` (dates
        already in the past included), ``"ok"`` when it is later, and ``None``
        when there is no date.
    """
    # A datetime-typed pandas column holds missing dates as NaT rather than
    # None, so both forms have to be recognised here. Otherwise a kit with no
    # expiration date would fall through and be reported as "ok".
    if exp_date is None or pd.isna(exp_date):
        return None
    return "soon" if (exp_date - today_dt).days <= 30 else "ok"
# Tag every kit with the result of bucket() for its expiration date.
cze["_bucket"] = cze["Expiration Date"].apply(bucket)
# Kit types ordered by their code, zero-padded to 5 characters so that numeric
# codes sort numerically; the raw string is the tie-breaker.
# NOTE(review): str.lstrip("T-") strips any leading run of the characters "T"
# and "-", not the literal prefix "T-" — confirm this suits all kit type codes.
kit_order = sorted(cze["Kit Type"].unique(), key=lambda x: (str(x).lstrip("T-").zfill(5), str(x)))
# Description of the first row found for each kit type.
kit_desc = cze.drop_duplicates("Kit Type").set_index("Kit Type")["Description"].to_dict()
# Distinct sites in sorted order.
kit_sites = sorted(cze["Site"].unique())
# ── Data preparation — EDC patients ────────────────────────────────────────────
def fmt_date_edc(val):
    """Convert a raw EDC visit date into a naive ``datetime``.

    Accepts ``None``/NaN, ``datetime`` instances and strings. Strings are
    tried against the formats 'DD Mon YYYY', ISO date-time and ISO date, then
    handed to ``pd.to_datetime`` as a last resort.

    Returns:
        A ``datetime`` without tzinfo, or ``None`` when the value is missing
        or cannot be parsed. ``NaT`` is never returned.
    """
    if val is None or (isinstance(val, float) and pd.isna(val)):
        return None
    if isinstance(val, datetime):
        # pd.NaT is a datetime subclass; treat it as "no date".
        if pd.isna(val):
            return None
        # The tzinfo is dropped without shifting the clock time.
        return val.replace(tzinfo=None)
    if isinstance(val, str):
        text = val.strip()
        for fmt in ('%d %b %Y', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                pass
    # Best-effort fallback: any failure here means "no usable date".
    try:
        parsed = pd.to_datetime(val)
        # pd.to_datetime yields NaT for empty / NaT-like input; NaT is truthy
        # and not a real date, so it must not leak to the callers.
        if pd.isna(parsed):
            return None
        return parsed.to_pydatetime().replace(tzinfo=None)
    except Exception:
        return None
# Pre-compute, for every subject, the sheet row of its first visit in the
# sorted visit list (position + 2: one header row, 1-based rows). The patient
# hyperlinks on the sample overview sheet point at these rows.
_pat_sort_keys = ['SiteNumber', 'Subject', 'Field4Value']
_pat_pre = edc_df_raw[_pat_sort_keys].copy()
_pat_pre['Field4Value'] = _pat_pre['Field4Value'].apply(fmt_date_edc)
_pat_pre = _pat_pre.sort_values(_pat_sort_keys).reset_index(drop=True)

patient_row_map = {}
for _position, _subject in enumerate(_pat_pre['Subject']):
    # setdefault keeps the first (lowest) row seen for each subject.
    patient_row_map.setdefault(_subject, _position + 2)

# Patients with at least one BXSCR record; one overview row is written for each.
bxscr_patients = sorted(bxscr['Patient No.'].dropna().unique())
# ── Workbook ───────────────────────────────────────────────────────────────────
# Start from a workbook with no sheets: the sheet that is active right after
# creation is removed, every report sheet is created explicitly further on.
out_wb = Workbook()
out_wb.remove(out_wb.active)
# ── Styles ─────────────────────────────────────────────────────────────────────
# Shared style objects reused by all sheets.
thin = Side(style='thin')
border = Border(left=thin, right=thin, top=thin, bottom=thin)
# Column header row: white bold text on blue.
header_fill = PatternFill("solid", fgColor="4472C4")
header_font = Font(name='Calibri', bold=True, size=11, color="FFFFFF")
data_font = Font(name='Calibri', size=11)
# Black underlined text for date cells that carry a hyperlink.
date_font_link = Font(name='Calibri', size=11, color="000000", underline='single')
# Cell backgrounds of the sample overview: sample present / sample missing.
yes_fill = PatternFill("solid", fgColor="E2EFDA")
no_fill = PatternFill("solid", fgColor="FFE7E7")
# Fonts and fills of the kit summary tables.
sum_header_font = Font(name='Calibri', bold=True, size=11, color="000000")
sum_total_font = Font(name='Calibri', bold=True, size=11)
zero_font = Font(name='Calibri', size=11, color="BFBFBF")
zero_red_font = Font(name='Calibri', size=11, color="C00000")
dark_blue_fill = PatternFill("solid", fgColor="203764")
orange_fill = PatternFill("solid", fgColor="FFF2CC")
green_fill = PatternFill("solid", fgColor="E2EFDA")
total_fill = PatternFill("solid", fgColor="D9E1F2")
# Backgrounds of the "Days to Expiration" column in the kit listing.
exp_fill = PatternFill("solid", fgColor="FFE7E7")
ok_fill = PatternFill("solid", fgColor="E2EFDA")
# ── Sheet: Zdroj ───────────────────────────────────────────────────────────────
# Built from covance.allsamples — rows are written in df.index order, which is
# why the hyperlinks from the sample overview (index + 2) hit the right rows.
src_sheet_name = "Zdroj"
pat_sheet_name = "Seznam pacientů"
src_ws = out_wb.create_sheet("Zdroj")
zdroj_columns = [
    "Protocol Code", "Investigator No.", "Investigator Name", "Patient No.",
    "Collection Date", "Protocol Visit Code", "Kit Receipt Date",
    "Container Receipt Date", "Accession", "Container No.", "Container Barcode No.",
    "Specimen Type", "Sample Status", "Expected Receipt Condition",
    "Actual Receipt Condition", "Container Label Line 1", "Container Label Line 2",
    "SM Sample Status", "SMART Specimen Class Description", "Parent Barcode", "Children Barcode",
]

# Header row; each column is at least 14 wide, or its title length plus 2.
_src_header_align = Alignment(horizontal='center', vertical='center', wrap_text=True)
for col_idx, col_name in enumerate(zdroj_columns, 1):
    header_cell = src_ws.cell(row=1, column=col_idx, value=col_name)
    header_cell.font = header_font
    header_cell.fill = header_fill
    header_cell.border = border
    header_cell.alignment = _src_header_align
    column_letter = get_column_letter(col_idx)
    src_ws.column_dimensions[column_letter].width = max(len(col_name) + 2, 14)

src_ws.row_dimensions[1].height = 30
src_ws.freeze_panes = "A2"
def clean(v):
    """Return ``None`` for missing scalars (None, NaN, NaT), else ``v`` as is.

    Values for which ``pd.isna`` has no single truth value (e.g. lists) are
    passed through untouched.
    """
    try:
        is_missing = bool(pd.isna(v))
    except (TypeError, ValueError):
        is_missing = False
    return None if is_missing else v
# Data rows: one sheet row per df row, starting right below the header.
# Columns that do not exist in df are written as empty cells.
_src_data_align = Alignment(horizontal='center', vertical='center')
for row_idx, (_, record) in enumerate(df.iterrows(), start=2):
    row_values = (clean(record.get(name)) for name in zdroj_columns)
    for col_idx, val in enumerate(row_values, start=1):
        cell = src_ws.cell(row=row_idx, column=col_idx, value=val)
        cell.font = data_font
        cell.border = border
        cell.alignment = _src_data_align

# Filter drop-downs across the whole header row.
_src_last_col = get_column_letter(len(zdroj_columns))
src_ws.auto_filter.ref = f"A1:{_src_last_col}1"
# ── Sheet: Přehled vzorků (sample overview) ────────────────────────────────────
analysis_ws = out_wb.create_sheet("Přehled vzorků")
# (header text, column width) for each column of the overview, in sheet order.
# The SM06 and SM07 headers appear twice: once among the I-0 columns and once
# among the I-4 columns.
columns = [
("Investigator Name", 24),
("Číslo pacienta", 20),
("Máme biopsii SM11", 20),
("Máme RNA", 16),
("Máme Cryostor", 16),
("DNA", 14),
("PLASMPK I-0 TROUGH", 18),
("PLASMA PK I-0 PEAK", 18),
("SERUM ADA I-0 PRE", 18),
("SM06/SERUM BIOM", 16),
("SM07/WB RNA", 14),
("SM10/FECAL", 14),
("PLASMPK I-2 TROUGH", 18),
("PLASMA PK I-2 PEAK", 18),
("SERUM ADA I-2 PRE", 18),
("STOOL I-2", 12),
("PLASMPK I-4 TROUGH", 18),
("PLASMA PK I-4 PEAK", 18),
("SERUM ADA I-4 PRE", 18),
("SM06/SERUM BIOM", 16),
("SM07/WB RNA", 14),
("STOOL I-4", 12),
]
group_font = Font(name='Calibri', bold=True, size=11)
group_fill = PatternFill("solid", fgColor="FFFFFF")
group_border = Border(left=thin, right=thin, top=thin, bottom=thin)
# (first column, last column, caption) of each merged group cell in row 1.
groups = [
    (3, 5, "SCREENING"),
    (7, 12, "RANDOMIZACE I-0"),
    (13, 16, "I-2"),
    (17, 22, "I-4"),
]

# Row 1: merged group captions.
_group_align = Alignment(horizontal='center', vertical='center')
for first_col, last_col, caption in groups:
    analysis_ws.merge_cells(
        start_row=1, start_column=first_col, end_row=1, end_column=last_col
    )
    anchor = analysis_ws.cell(row=1, column=first_col, value=caption)
    anchor.font = group_font
    anchor.fill = group_fill
    anchor.alignment = _group_align
    anchor.border = group_border
    # The border is set on every cell of the merged range, not only the anchor.
    for c in range(first_col, last_col + 1):
        analysis_ws.cell(row=1, column=c).border = group_border
analysis_ws.row_dimensions[1].height = 20

# Row 2: the actual column headers and the column widths.
_analysis_header_align = Alignment(horizontal='center', vertical='center', wrap_text=True)
for col_idx, (hdr, width) in enumerate(columns, 1):
    header_cell = analysis_ws.cell(row=2, column=col_idx, value=hdr)
    header_cell.font = header_font
    header_cell.fill = header_fill
    header_cell.border = border
    header_cell.alignment = _analysis_header_align
    analysis_ws.column_dimensions[get_column_letter(col_idx)].width = width
analysis_ws.row_dimensions[2].height = 30

# Keep both header rows and the first two columns visible while scrolling.
analysis_ws.freeze_panes = "C3"
# Columns 3-6 are looked up by specimen type: (source frame, specimen type);
# None means "any specimen type".
_specimen_columns = (
    (bxscr, 'Tissue , Paraffin Block'),
    (bxscr, 'Biopsy RNA Later'),
    (bxscr, 'Biopsy, Frozen Tissue'),
    (dna, None),
)
# Columns 7-22 are looked up by (Container Label Line 1, Protocol Visit Code).
_label_columns = (
    ('PLASMPK I-0 TROUGH', 'I-0'),
    ('PLASMA PK I-0 PEAK', 'I-0'),
    ('SERUM ADA I-0 PRE', 'I-0'),
    ('SM06/SERUM BIOM', 'I-0'),
    ('SM07/WB RNA', 'I-0'),
    ('SM10/FECAL', 'I-0'),
    ('PLASMPK I-2 TROUGH', 'I-2'),
    ('PLASMA PK I-2 PEAK', 'I-2'),
    ('SERUM ADA I-2 PRE', 'I-2'),
    ('STOOL I-2', 'I-2'),
    ('PLASMPK I-4 TROUGH', 'I-4'),
    ('PLASMA PK I-4 PEAK', 'I-4'),
    ('SERUM ADA I-4 PRE', 'I-4'),
    ('SM06/SERUM BIOM', 'I-4'),
    ('SM07/WB RNA', 'I-4'),
    ('STOOL I-4', 'I-4'),
)
_patient_link_font = Font(name='Calibri', size=11, underline='single')
_missing_font = Font(name='Calibri', size=11, color="C00000")
_overview_align = Alignment(horizontal='center', vertical='center')

# One row per BXSCR patient, starting below the two header rows.
for row_idx, patient in enumerate(bxscr_patients, 3):
    investigator = bxscr[bxscr['Patient No.'] == patient].iloc[0]['Investigator Name']

    # Each lookup yields (receipt date, row on the source sheet).
    lookups = [
        get_specimen_info(frame, patient, specimen_type)
        for frame, specimen_type in _specimen_columns
    ]
    lookups += [
        get_label_info(patient, label_code, visit_code)
        for label_code, visit_code in _label_columns
    ]

    row_data = [investigator, patient, *lookups]
    for col_idx, value in enumerate(row_data, 1):
        if col_idx > 2:
            # Sample column: the date links to the matching source-sheet row.
            dt, excel_row = value
            cell = analysis_ws.cell(row=row_idx, column=col_idx, value=dt)
            have_sample = bool(dt) and excel_row is not None
            if have_sample:
                cell.hyperlink = f"#'{src_sheet_name}'!A{excel_row}"
                cell.font = date_font_link
                cell.fill = yes_fill
                cell.number_format = 'DD-MMM-YYYY'
            else:
                cell.font = _missing_font
                cell.fill = no_fill
        else:
            # Investigator / patient number; the patient number links to the
            # patient sheet when the patient is present in patient_row_map.
            cell = analysis_ws.cell(row=row_idx, column=col_idx, value=value)
            link_patient = col_idx == 2 and patient in patient_row_map
            if link_patient:
                cell.hyperlink = f"#'{pat_sheet_name}'!B{patient_row_map[patient]}"
                cell.font = _patient_link_font
            else:
                cell.font = data_font
        cell.border = border
        cell.alignment = _overview_align
# ── Sheet: Seznam pacientů (patient visit list) ────────────────────────────────
patients_ws = out_wb.create_sheet("Seznam pacientů")
# (header text, column width)
pat_columns = [
    ("Číslo centra", 20),
    ("Číslo pacienta", 20),
    ("Kód návštěvy", 20),
    ("Datum návštěvy", 16),
    ("Typ návštěvy", 16),
]

_pat_header_align = Alignment(horizontal='center', vertical='center', wrap_text=True)
for col_idx, (col_name, width) in enumerate(pat_columns, 1):
    header_cell = patients_ws.cell(row=1, column=col_idx, value=col_name)
    header_cell.font = header_font
    header_cell.fill = header_fill
    header_cell.border = border
    header_cell.alignment = _pat_header_align
    patients_ws.column_dimensions[get_column_letter(col_idx)].width = width

patients_ws.row_dimensions[1].height = 30
patients_ws.freeze_panes = "A2"
# Visit rows sorted by site, subject and visit date; the same sort keys are
# used when patient_row_map is built.
pat_col_keys = ['SiteNumber', 'Subject', 'InstanceName', 'Field4Value', 'Field5Value']
pat_df = edc_df_raw[pat_col_keys].copy()
pat_df['Field4Value'] = pat_df['Field4Value'].apply(fmt_date_edc)
pat_df = pat_df.sort_values(['SiteNumber', 'Subject', 'Field4Value']).reset_index(drop=True)

_pat_data_align = Alignment(horizontal='center', vertical='center')
for row_idx, (_, visit) in enumerate(pat_df.iterrows(), start=2):
    for col_idx, key in enumerate(pat_col_keys, start=1):
        value = clean(visit[key])
        cell = patients_ws.cell(row=row_idx, column=col_idx, value=value)
        cell.font = data_font
        cell.border = border
        cell.alignment = _pat_data_align
        # Column 4 is the visit date.
        if value is not None and col_idx == 4:
            cell.number_format = 'DD-MMM-YYYY'
# ── Helper for the summary tables ──────────────────────────────────────────────
def write_summary_table(ws, current_row, title, rows_data, col_a_header):
    """Write one four-column summary table onto ``ws``.

    The table consists of a merged title bar, a header row, one row per item
    of ``rows_data`` and a "CELKEM" totals row.

    Args:
        ws: worksheet to write to.
        current_row: sheet row of the title bar.
        title: text of the title bar.
        rows_data: iterable of ``(col_a, col_b, n_soon, n_ok)`` tuples.
        col_a_header: header text of the first column.

    Returns:
        The row at which the next table should start (one blank row is left
        after the totals row).
    """
    def h_align(col_idx):
        # The first column is left-aligned, the others are centred.
        return "center" if col_idx >= 2 else "left"

    # Title bar: four filled cells merged into one.
    title_row = current_row
    for c in range(1, 5):
        bar_cell = ws.cell(row=title_row, column=c)
        bar_cell.fill = dark_blue_fill
        bar_cell.border = border
    title_cell = ws.cell(row=title_row, column=1, value=title)
    title_cell.font = Font(name='Calibri', bold=True, size=12, color="FFFFFF")
    title_cell.alignment = Alignment(horizontal="left", vertical="center")
    ws.merge_cells(start_row=title_row, start_column=1, end_row=title_row, end_column=4)
    ws.row_dimensions[title_row].height = 22

    # Header row.
    header_row = title_row + 1
    header_cells = (
        (col_a_header, header_fill),
        ("Description", header_fill),
        ("Expiruje do 30 dní", orange_fill),
        ("Expiruje později", green_fill),
    )
    for col_idx, (text, fill) in enumerate(header_cells, 1):
        cell = ws.cell(row=header_row, column=col_idx, value=text)
        cell.font = sum_header_font
        cell.fill = fill
        cell.border = border
        cell.alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
    ws.row_dimensions[header_row].height = 28

    # Data rows.
    write_row = header_row + 1
    total_soon = 0
    total_ok = 0
    for col_a, col_b, n_soon, n_ok in rows_data:
        total_soon += n_soon
        total_ok += n_ok
        all_zero = (n_soon == 0 and n_ok == 0)
        row_cells = (
            (col_a, None),
            (col_b, None),
            (n_soon, orange_fill if n_soon > 0 else None),
            (n_ok, green_fill if n_ok > 0 else None),
        )
        for col_idx, (val, rfill) in enumerate(row_cells, 1):
            cell = ws.cell(row=write_row, column=col_idx, value=val)
            if col_idx >= 3 and val == 0:
                # Zero counts are greyed out, or red when the row has no kits.
                cell.font = zero_red_font if all_zero else zero_font
            else:
                cell.font = data_font
            cell.border = border
            cell.alignment = Alignment(horizontal=h_align(col_idx), vertical="center")
            if rfill:
                cell.fill = rfill
        write_row += 1

    # Totals row.
    for col_idx, val in enumerate(["CELKEM", "", total_soon, total_ok], 1):
        cell = ws.cell(row=write_row, column=col_idx, value=val)
        cell.font = sum_total_font
        cell.fill = total_fill
        cell.border = border
        cell.alignment = Alignment(horizontal=h_align(col_idx), vertical="center")

    return write_row + 2
# ── Sheet: Kit Inventory CZE ───────────────────────────────────────────────────
kit_ws = out_wb.create_sheet("Kit Inventory CZE")
# (cze column name, column width); the name doubles as the header text.
listing_columns = [
    ("Project No.", 14),
    ("Region", 10),
    ("Country", 10),
    ("Site", 38),
    ("Kit Type", 12),
    ("Description", 22),
    ("Accession", 18),
    ("Shipped Date", 16),
    ("Expiration Date", 16),
    ("Days to Expiration", 20),
]

_kit_header_align = Alignment(horizontal="center", vertical="center", wrap_text=True)
for col_idx, (hdr, width) in enumerate(listing_columns, 1):
    header_cell = kit_ws.cell(row=1, column=col_idx, value=hdr)
    header_cell.font = header_font
    header_cell.fill = header_fill
    header_cell.border = border
    header_cell.alignment = _kit_header_align
    kit_ws.column_dimensions[get_column_letter(col_idx)].width = width

kit_ws.row_dimensions[1].height = 30
kit_ws.freeze_panes = "A2"
# One sheet row per CZE kit, in the order cze was sorted.
_kit_date_columns = ("Shipped Date", "Expiration Date")
_kit_data_align = Alignment(horizontal="center", vertical="center")
for row_idx, (_, kit_row) in enumerate(cze.iterrows(), start=2):
    days = kit_row.get("Days to Expiration")
    # Kits with a known day count of 60 or less get the warning background;
    # all others (unknown count included) get the "ok" background.
    expiring = pd.notna(days) and days <= 60
    for col_idx, (col_name, _width) in enumerate(listing_columns, start=1):
        value = clean(kit_row.get(col_name))
        cell = kit_ws.cell(row=row_idx, column=col_idx, value=value)
        cell.font = data_font
        cell.border = border
        cell.alignment = _kit_data_align
        if value is not None and col_name in _kit_date_columns:
            cell.number_format = "DD-MMM-YYYY"
        if col_name == "Days to Expiration":
            cell.fill = exp_fill if expiring else ok_fill

# Filter drop-downs across the whole header row.
_kit_last_col = get_column_letter(len(listing_columns))
kit_ws.auto_filter.ref = f"A1:{_kit_last_col}1"
# ── Sheet: Přehled po centrech (summary per site) ──────────────────────────────
ctr_ws = out_wb.create_sheet("Přehled po centrech")
for _col_letter, _col_width in (("A", 22), ("B", 24), ("C", 22), ("D", 20)):
    ctr_ws.column_dimensions[_col_letter].width = _col_width

# One table per site; every kit type gets a row, even when the site has none.
current_row = 1
for site in kit_sites:
    site_df = cze[cze["Site"] == site]
    rows_data = []
    for kit in kit_order:
        desc = kit_desc.get(kit, "")
        buckets = site_df.loc[site_df["Kit Type"] == kit, "_bucket"]
        n_soon = int(buckets.eq("soon").sum())
        n_ok = int(buckets.eq("ok").sum())
        rows_data.append((f"{kit}{desc}", desc, n_soon, n_ok))
    current_row = write_summary_table(ctr_ws, current_row, site, rows_data, "Kit Type")
# ── Sheet: Přehled po typech (summary per kit type) ────────────────────────────
sum_ws = out_wb.create_sheet("Přehled po typech")
for _col_letter, _col_width in (("A", 38), ("B", 22), ("C", 22), ("D", 20)):
    sum_ws.column_dimensions[_col_letter].width = _col_width

# One table per kit type; only sites that hold that kit type get a row.
current_row = 1
for kit in kit_order:
    desc = kit_desc.get(kit, "")
    kit_df = cze[cze["Kit Type"] == kit]
    rows_data = []
    for site in sorted(kit_df["Site"].unique()):
        buckets = kit_df.loc[kit_df["Site"] == site, "_bucket"]
        n_soon = int(buckets.eq("soon").sum())
        n_ok = int(buckets.eq("ok").sum())
        rows_data.append((site, desc, n_soon, n_ok))
    current_row = write_summary_table(sum_ws, current_row, f"Kit Type {kit}{desc}", rows_data, "Centrum")
# ── Sheet: eQueries ────────────────────────────────────────────────────────────
# TODO: fill in once the eQuery data has been imported into MongoDB
# Source: covance db, collection "equeries" (per the import convention)
# Filter: Country == "CZECH REPUBLIC"
# Columns: Site, Subject, Visit, Visit Collection Date, Accession,
# eQueryId, Issue Type, Status, Create Date, Response Date Time,
# Time Before Response, User Name
# Sorting: Open → Response Received → Closed, then Site
# For now the sheet holds a single bold red placeholder message in A1.
eq_ws = out_wb.create_sheet("eQueries")
eq_ws.cell(row=1, column=1,
value="TODO: eQuery data zatím nejsou v MongoDB — doplnit po importu.").font = Font(
name='Calibri', bold=True, size=12, color="C00000"
)
eq_ws.column_dimensions["A"].width = 70
# ── Save ───────────────────────────────────────────────────────────────────────
# Write the workbook to out_path, close the MongoDB client and print a short
# summary of what went into the report.
out_wb.save(out_path)
client.close()
print(f"\nUloženo: {out_path}")
print(f"Pacienti s BXSCR: {len(bxscr_patients)}, Všichni pacienti: {len(all_patients)}")
print(f"CZE kity: {len(cze)}, Typy kitů: {len(kit_order)}, Centra: {len(kit_sites)}")
+610
View File
@@ -0,0 +1,610 @@
# create_report_v2.1.py — v2.1 — 2026-06-16
# UCO3001 Covance specimen & kit report — zdroj dat: MongoDB (covance + edc)
# Změny v2.1: doplněn list "eQueries" z covance.equeries (study 35472 = UCO3001,
# Country == "CZECH REPUBLIC"), barevné zvýraznění dle stavu, řazení
# In Progress → Response Received → Closed, pak Site, pak Create Date.
import pandas as pd
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Border, Side, Alignment
from openpyxl.utils import get_column_letter
from datetime import date, datetime
from pymongo import MongoClient
# ── Configuration ──────────────────────────────────────────────────────────────
MONGO_URI = "mongodb://192.168.1.76:27017"
out_dir = "U:/Dropbox/!!!Days/Downloads Z230/"
EQ_STUDY = "35472"  # 77242113UCO3001

# ── MongoDB connection ─────────────────────────────────────────────────────────
client = MongoClient(MONGO_URI)
covance_db, edc_db = client["covance"], client["edc"]

# ── Load data from MongoDB ─────────────────────────────────────────────────────
print("Načítám data z MongoDB...")

# Only the "fields" sub-document of every record becomes a DataFrame row.
samples_docs = list(covance_db["allsamples"].find())
df = pd.DataFrame([sample["fields"] for sample in samples_docs])
df = df.reset_index(drop=True)
print(f" allsamples: {len(df)} záznamů")

kit_docs = list(covance_db["kits"].find())
kit_df_raw = pd.DataFrame([kit_doc["fields"] for kit_doc in kit_docs])
kit_df_raw = kit_df_raw.reset_index(drop=True)
print(f" kits: {len(kit_df_raw)} záznamů")

# eQueries are restricted to the configured study already in the query.
eq_docs = list(covance_db["equeries"].find({"study": EQ_STUDY}))
eq_df_raw = pd.DataFrame([eq_doc["fields"] for eq_doc in eq_docs])
eq_df_raw = eq_df_raw.reset_index(drop=True)
print(f" equeries: {len(eq_df_raw)} záznamů (study {EQ_STUDY})")

# EDC visit documents are nested; flatten the values the report needs into one
# dict per visit. The two "fields" entries are optional (None when absent).
edc_docs = list(edc_db["UCO3001.DateofVisit"].find())
edc_rows = [
    {
        "SiteNumber": visit["site"]["number"],
        "Subject": visit["subject"]["label"],
        "InstanceName": visit["form"]["instanceName"],
        "Field4Value": visit["fields"].get("Visit Start Date"),
        "Field5Value": visit["fields"].get("Type of Contact"),
    }
    for visit in edc_docs
]
edc_df_raw = pd.DataFrame(edc_rows)
print(f" DateofVisit: {len(edc_df_raw)} záznamů")
# ── Output file ────────────────────────────────────────────────────────────────
# The file name is prefixed with the generation time so reports never collide.
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
out_filename = f"{timestamp} 77242113UCO3001 CZE Labcorp samples and kit inventory report.xlsx"
out_path = "".join((out_dir, out_filename))

# ── Data preparation — allsamples ──────────────────────────────────────────────
_patient_numbers = df['Patient No.'].dropna()
all_patients = sorted(_patient_numbers.unique())

# Per-visit slices reused by the sample overview sheet.
_visit_codes = df['Protocol Visit Code']
bxscr = df[_visit_codes == 'BXSCR']
dna = df[_visit_codes == 'DNA']
def fmt_date(val):
    """Convert a raw sample date into a naive ``datetime``.

    Accepts ``None``/NaN, ``datetime`` instances and strings. Strings are
    tried against the formats 'DD-Mon-YYYY', ISO date-time and ISO date, then
    handed to ``pd.to_datetime`` as a last resort.

    Returns:
        A ``datetime`` without tzinfo, or ``None`` when the value is missing
        or cannot be parsed. ``NaT`` is never returned.
    """
    if val is None:
        return None
    if isinstance(val, float) and pd.isna(val):
        return None
    if isinstance(val, datetime):
        # pd.NaT is a datetime subclass; treat it as "no date".
        if pd.isna(val):
            return None
        # The tzinfo is dropped without shifting the clock time.
        return val.replace(tzinfo=None)
    if isinstance(val, str):
        text = val.strip()
        for fmt in ('%d-%b-%Y', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                pass
    # Best-effort fallback: any failure here means "no usable date".
    try:
        parsed = pd.to_datetime(val)
        # pd.to_datetime yields NaT for empty / NaT-like input; NaT is truthy
        # and not a real date, so it must not leak to the callers.
        if pd.isna(parsed):
            return None
        return parsed.to_pydatetime().replace(tzinfo=None)
    except Exception:
        return None
# Sample statuses that count as "we have the specimen".
OK_STATUSES = {'Received', 'In Inventory', 'Shipped'}


def get_specimen_info(visit_df, patient, specimen_type=None):
    """Find the first acceptable specimen of a patient in ``visit_df``.

    Args:
        visit_df: sample rows (already limited to one visit by the caller).
        patient: value matched against the 'Patient No.' column.
        specimen_type: optional 'Specimen Type' to require; falsy = any type.

    Returns:
        ``(receipt_date, sheet_row)`` for the first matching row whose
        'Sample Status' is in ``OK_STATUSES``; ``sheet_row`` is the frame
        index label plus 2. ``('', None)`` when nothing matches.
    """
    keep = (visit_df['Patient No.'] == patient) & visit_df['Sample Status'].isin(OK_STATUSES)
    if specimen_type:
        keep &= visit_df['Specimen Type'] == specimen_type
    hits = visit_df[keep]
    if hits.empty:
        return '', None
    first_hit = hits.iloc[0]
    return fmt_date(first_hit['Container Receipt Date']), hits.index[0] + 2
def get_label_info(patient, label_code, visit_code):
    """Find the first acceptable container of a patient by label and visit.

    Searches the module-level ``df`` for rows matching the patient, the
    'Protocol Visit Code' and the 'Container Label Line 1', restricted to
    statuses in ``OK_STATUSES``.

    Returns:
        ``(receipt_date, sheet_row)`` for the first match, where
        ``sheet_row`` is the ``df`` index label plus 2; ``('', None)`` when
        nothing matches.
    """
    wanted = (
        df['Patient No.'].eq(patient)
        & df['Protocol Visit Code'].eq(visit_code)
        & df['Container Label Line 1'].eq(label_code)
        & df['Sample Status'].isin(OK_STATUSES)
    )
    hits = df[wanted]
    if hits.empty:
        return '', None
    first_hit = hits.iloc[0]
    return fmt_date(first_hit['Container Receipt Date']), hits.index[0] + 2
# ── Data preparation — kit inventory ───────────────────────────────────────────
# Work on an independent copy of the kits whose Country is "CZE", so the column
# rewrites that follow do not touch kit_df_raw.
cze = kit_df_raw[kit_df_raw["Country"] == "CZE"].copy()
def parse_kit_date(val):
    """Parse a kit date written like 'Mar 17, 2026'.

    Returns:
        ``None`` for ``None``/NaN or text in any other format; ``datetime``
        inputs are returned with their tzinfo removed; otherwise the parsed
        naive ``datetime``.
    """
    is_missing = val is None or (isinstance(val, float) and pd.isna(val))
    if is_missing:
        return None
    if isinstance(val, datetime):
        return val.replace(tzinfo=None)
    text = str(val).strip()
    try:
        parsed = datetime.strptime(text, "%b %d, %Y")
    except ValueError:
        parsed = None
    return parsed
# Normalise the two date columns, make the day counter numeric (unparseable
# values become NaN) and order the listing by site, kit type and expiry.
for _kit_date_col in ("Shipped Date", "Expiration Date"):
    cze[_kit_date_col] = cze[_kit_date_col].apply(parse_kit_date)
cze["Days to Expiration"] = pd.to_numeric(cze["Days to Expiration"], errors="coerce")
_kit_sort_keys = ["Site", "Kit Type", "Expiration Date"]
cze = cze.sort_values(_kit_sort_keys).reset_index(drop=True)
# Midnight of the current day; reference point for the expiry buckets.
today_dt = datetime.combine(date.today(), datetime.min.time())


def bucket(exp_date):
    """Classify an expiration date relative to ``today_dt``.

    Returns:
        ``"soon"`` when the date is at most 30 days after ``today_dt`` (dates
        already in the past included), ``"ok"`` when it is later, and ``None``
        when there is no date.
    """
    # A datetime-typed pandas column holds missing dates as NaT rather than
    # None, so both forms have to be recognised here. Otherwise a kit with no
    # expiration date would fall through and be reported as "ok".
    if exp_date is None or pd.isna(exp_date):
        return None
    return "soon" if (exp_date - today_dt).days <= 30 else "ok"
# Tag every kit with the result of bucket() for its expiration date.
cze["_bucket"] = cze["Expiration Date"].apply(bucket)
# Kit types ordered by their code, zero-padded to 5 characters so that numeric
# codes sort numerically; the raw string is the tie-breaker.
# NOTE(review): str.lstrip("T-") strips any leading run of the characters "T"
# and "-", not the literal prefix "T-" — confirm this suits all kit type codes.
kit_order = sorted(cze["Kit Type"].unique(), key=lambda x: (str(x).lstrip("T-").zfill(5), str(x)))
# Description of the first row found for each kit type.
kit_desc = cze.drop_duplicates("Kit Type").set_index("Kit Type")["Description"].to_dict()
# Distinct sites in sorted order.
kit_sites = sorted(cze["Site"].unique())
# ── Data preparation — eQueries ────────────────────────────────────────────────
def parse_eq_date(val):
    """Parse an eQuery date such as 'Mar 17, 2026 3:49 PM' (time optional).

    Returns:
        A naive ``datetime``; ``datetime`` inputs are returned with their
        tzinfo removed. ``None`` when the value is missing or cannot be
        parsed. ``NaT`` is never returned.
    """
    if val is None or (isinstance(val, float) and pd.isna(val)):
        return None
    if isinstance(val, datetime):
        # pd.NaT is a datetime subclass; treat it as "no date".
        if pd.isna(val):
            return None
        return val.replace(tzinfo=None)
    s = str(val).strip()
    for fmt in ("%b %d, %Y %I:%M %p", "%b %d, %Y"):
        try:
            return datetime.strptime(s, fmt)
        except ValueError:
            pass
    # Best-effort fallback: any failure here means "no usable date".
    try:
        parsed = pd.to_datetime(s)
        # pd.to_datetime yields NaT for empty / NaT-like input; returning it
        # would put a non-date object into the column and later the sheet.
        if pd.isna(parsed):
            return None
        return parsed.to_pydatetime().replace(tzinfo=None)
    except Exception:
        return None
if eq_df_raw.empty:
    # Nothing to convert or sort; the empty frame is used as is.
    eq_df = eq_df_raw
else:
    eq_df = eq_df_raw.copy()
    # Date columns are converted only when they are present in the data.
    for c in ("Visit Collection Date", "Create Date", "Response Date Time"):
        if c in eq_df.columns:
            eq_df[c] = eq_df[c].apply(parse_eq_date)
    # Sorting: In Progress → Response Received → Closed, then Site, then
    # Create Date. Any other status is ranked last (99).
    status_order = {"In Progress": 0, "Response Received": 1, "Closed": 2}
    eq_df["_status_rank"] = [status_order.get(s, 99) for s in eq_df["Status"]]
    _eq_sort_keys = ["_status_rank", "Site", "Create Date"]
    eq_df = eq_df.sort_values(_eq_sort_keys).reset_index(drop=True)
# ── Data preparation — EDC patients ────────────────────────────────────────────
def fmt_date_edc(val):
    """Convert a raw EDC visit date into a naive ``datetime``.

    Accepts ``None``/NaN, ``datetime`` instances and strings. Strings are
    tried against the formats 'DD Mon YYYY', ISO date-time and ISO date, then
    handed to ``pd.to_datetime`` as a last resort.

    Returns:
        A ``datetime`` without tzinfo, or ``None`` when the value is missing
        or cannot be parsed. ``NaT`` is never returned.
    """
    if val is None or (isinstance(val, float) and pd.isna(val)):
        return None
    if isinstance(val, datetime):
        # pd.NaT is a datetime subclass; treat it as "no date".
        if pd.isna(val):
            return None
        # The tzinfo is dropped without shifting the clock time.
        return val.replace(tzinfo=None)
    if isinstance(val, str):
        text = val.strip()
        for fmt in ('%d %b %Y', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                pass
    # Best-effort fallback: any failure here means "no usable date".
    try:
        parsed = pd.to_datetime(val)
        # pd.to_datetime yields NaT for empty / NaT-like input; NaT is truthy
        # and not a real date, so it must not leak to the callers.
        if pd.isna(parsed):
            return None
        return parsed.to_pydatetime().replace(tzinfo=None)
    except Exception:
        return None
# Pre-compute, for every subject, the sheet row of its first visit in the
# sorted visit list (position + 2: one header row, 1-based rows).
_pat_sort_keys = ['SiteNumber', 'Subject', 'Field4Value']
_pat_pre = edc_df_raw[_pat_sort_keys].copy()
_pat_pre['Field4Value'] = _pat_pre['Field4Value'].apply(fmt_date_edc)
_pat_pre = _pat_pre.sort_values(_pat_sort_keys).reset_index(drop=True)

patient_row_map = {}
for _position, _subject in enumerate(_pat_pre['Subject']):
    # setdefault keeps the first (lowest) row seen for each subject.
    patient_row_map.setdefault(_subject, _position + 2)

# Patients with at least one BXSCR record.
bxscr_patients = sorted(bxscr['Patient No.'].dropna().unique())
# ── Workbook ───────────────────────────────────────────────────────────────────
# Start from a workbook with no sheets: the sheet that is active right after
# creation is removed, every report sheet is created explicitly further on.
out_wb = Workbook()
out_wb.remove(out_wb.active)
# ── Styles ─────────────────────────────────────────────────────────────────────
# Shared style objects reused by the sheets.
thin = Side(style='thin')
border = Border(left=thin, right=thin, top=thin, bottom=thin)
# Column header row: white bold text on blue.
header_fill = PatternFill("solid", fgColor="4472C4")
header_font = Font(name='Calibri', bold=True, size=11, color="FFFFFF")
data_font = Font(name='Calibri', size=11)
# Black underlined text, by its name meant for hyperlinked date cells.
date_font_link = Font(name='Calibri', size=11, color="000000", underline='single')
yes_fill = PatternFill("solid", fgColor="E2EFDA")
no_fill = PatternFill("solid", fgColor="FFE7E7")
sum_header_font = Font(name='Calibri', bold=True, size=11, color="000000")
sum_total_font = Font(name='Calibri', bold=True, size=11)
# Grey and red variants of the data font.
zero_font = Font(name='Calibri', size=11, color="BFBFBF")
zero_red_font = Font(name='Calibri', size=11, color="C00000")
dark_blue_fill = PatternFill("solid", fgColor="203764")
orange_fill = PatternFill("solid", fgColor="FFF2CC")
green_fill = PatternFill("solid", fgColor="E2EFDA")
total_fill = PatternFill("solid", fgColor="D9E1F2")
exp_fill = PatternFill("solid", fgColor="FFE7E7")
ok_fill = PatternFill("solid", fgColor="E2EFDA")
# ── Sheet: Zdroj ───────────────────────────────────────────────────────────────
# Built from covance.allsamples — rows are written in df.index order, which is
# why the hyperlinks from the sample overview (index + 2) hit the right rows.
src_sheet_name = "Zdroj"
pat_sheet_name = "Seznam pacientů"
src_ws = out_wb.create_sheet("Zdroj")
zdroj_columns = [
    "Protocol Code", "Investigator No.", "Investigator Name", "Patient No.",
    "Collection Date", "Protocol Visit Code", "Kit Receipt Date",
    "Container Receipt Date", "Accession", "Container No.", "Container Barcode No.",
    "Specimen Type", "Sample Status", "Expected Receipt Condition",
    "Actual Receipt Condition", "Container Label Line 1", "Container Label Line 2",
    "SM Sample Status", "SMART Specimen Class Description", "Parent Barcode", "Children Barcode",
]

# Header row; each column is at least 14 wide, or its title length plus 2.
_src_header_align = Alignment(horizontal='center', vertical='center', wrap_text=True)
for col_idx, col_name in enumerate(zdroj_columns, 1):
    header_cell = src_ws.cell(row=1, column=col_idx, value=col_name)
    header_cell.font = header_font
    header_cell.fill = header_fill
    header_cell.border = border
    header_cell.alignment = _src_header_align
    column_letter = get_column_letter(col_idx)
    src_ws.column_dimensions[column_letter].width = max(len(col_name) + 2, 14)

src_ws.row_dimensions[1].height = 30
src_ws.freeze_panes = "A2"
def clean(v):
    """Return ``None`` for a missing scalar (NaN / NaT / None), else ``v`` unchanged.

    Values for which ``pd.isna`` cannot produce a single truth value (it raises
    ``TypeError`` or ``ValueError``, e.g. for multi-element sequences) are
    passed through untouched.
    """
    try:
        is_missing = bool(pd.isna(v))
    except (TypeError, ValueError):
        is_missing = False
    return None if is_missing else v
# Data rows of the "Zdroj" sheet: one sheet row per df row, starting at row 2;
# missing values are written as empty cells via clean().
_zdroj_data_alignment = Alignment(horizontal='center', vertical='center')
for sheet_row, (_, record) in enumerate(df.iterrows(), start=2):
    for sheet_col, field in enumerate(zdroj_columns, start=1):
        data_cell = src_ws.cell(
            row=sheet_row, column=sheet_col, value=clean(record.get(field))
        )
        data_cell.font = data_font
        data_cell.border = border
        data_cell.alignment = _zdroj_data_alignment

# Auto-filter across the whole header row.
last_zdroj_letter = get_column_letter(len(zdroj_columns))
src_ws.auto_filter.ref = f"A1:{last_zdroj_letter}1"
# ── Sheet: Přehled vzorků (sample overview) ────────────────────────────────────
analysis_ws = out_wb.create_sheet("Přehled vzorků")

# (header caption, column width) in sheet column order.
columns = [
    ("Investigator Name", 24),
    ("Číslo pacienta", 20),
    ("Máme biopsii SM11", 20),
    ("Máme RNA", 16),
    ("Máme Cryostor", 16),
    ("DNA", 14),
    ("PLASMPK I-0 TROUGH", 18),
    ("PLASMA PK I-0 PEAK", 18),
    ("SERUM ADA I-0 PRE", 18),
    ("SM06/SERUM BIOM", 16),
    ("SM07/WB RNA", 14),
    ("SM10/FECAL", 14),
    ("PLASMPK I-2 TROUGH", 18),
    ("PLASMA PK I-2 PEAK", 18),
    ("SERUM ADA I-2 PRE", 18),
    ("STOOL I-2", 12),
    ("PLASMPK I-4 TROUGH", 18),
    ("PLASMA PK I-4 PEAK", 18),
    ("SERUM ADA I-4 PRE", 18),
    ("SM06/SERUM BIOM", 16),
    ("SM07/WB RNA", 14),
    ("STOOL I-4", 12),
]

group_font = Font(name='Calibri', bold=True, size=11)
group_fill = PatternFill("solid", fgColor="FFFFFF")
group_border = Border(left=thin, right=thin, top=thin, bottom=thin)

# (first column, last column, caption) of the merged banners in row 1.
# Column 6 (DNA) is not covered by any banner.
groups = [
    (3, 5, "SCREENING"),
    (7, 12, "RANDOMIZACE I-0"),
    (13, 16, "I-2"),
    (17, 22, "I-4"),
]

_group_alignment = Alignment(horizontal='center', vertical='center')
for first_col, last_col, caption in groups:
    analysis_ws.merge_cells(
        start_row=1, start_column=first_col, end_row=1, end_column=last_col
    )
    banner = analysis_ws.cell(row=1, column=first_col, value=caption)
    banner.font = group_font
    banner.fill = group_fill
    banner.alignment = _group_alignment
    # Every cell of the merged range gets the border (this includes the anchor
    # cell), so the frame is drawn around the whole banner.
    for merged_col in range(first_col, last_col + 1):
        analysis_ws.cell(row=1, column=merged_col).border = group_border
analysis_ws.row_dimensions[1].height = 20

# Row 2: the real column headers.
_analysis_header_alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)
for position, (caption, col_width) in enumerate(columns, start=1):
    header_cell = analysis_ws.cell(row=2, column=position, value=caption)
    header_cell.font = header_font
    header_cell.fill = header_fill
    header_cell.border = border
    header_cell.alignment = _analysis_header_alignment
    analysis_ws.column_dimensions[get_column_letter(position)].width = col_width
analysis_ws.row_dimensions[2].height = 30

# Freeze both header rows and the two identifying columns.
analysis_ws.freeze_panes = "C3"
# Data rows of the sample overview: one row per patient in bxscr_patients,
# starting at sheet row 3. Columns 3-22 each hold a (value, source row) pair;
# the source row is used as a hyperlink target on the "Zdroj" sheet.

# (container label, visit) for sheet columns 7-22, in column order.
_label_specs = [
    ('PLASMPK I-0 TROUGH', 'I-0'),
    ('PLASMA PK I-0 PEAK', 'I-0'),
    ('SERUM ADA I-0 PRE', 'I-0'),
    ('SM06/SERUM BIOM', 'I-0'),
    ('SM07/WB RNA', 'I-0'),
    ('SM10/FECAL', 'I-0'),
    ('PLASMPK I-2 TROUGH', 'I-2'),
    ('PLASMA PK I-2 PEAK', 'I-2'),
    ('SERUM ADA I-2 PRE', 'I-2'),
    ('STOOL I-2', 'I-2'),
    ('PLASMPK I-4 TROUGH', 'I-4'),
    ('PLASMA PK I-4 PEAK', 'I-4'),
    ('SERUM ADA I-4 PRE', 'I-4'),
    ('SM06/SERUM BIOM', 'I-4'),
    ('SM07/WB RNA', 'I-4'),
    ('STOOL I-4', 'I-4'),
]
_patient_link_font = Font(name='Calibri', size=11, underline='single')
_missing_font = Font(name='Calibri', size=11, color="C00000")
_analysis_alignment = Alignment(horizontal='center', vertical='center')

for row_idx, patient in enumerate(bxscr_patients, 3):
    investigator = bxscr[bxscr['Patient No.'] == patient].iloc[0]['Investigator Name']

    # Columns 3-6: screening biopsies and DNA.
    sample_cells = [
        get_specimen_info(bxscr, patient, 'Tissue , Paraffin Block'),
        get_specimen_info(bxscr, patient, 'Biopsy RNA Later'),
        get_specimen_info(bxscr, patient, 'Biopsy, Frozen Tissue'),
        get_specimen_info(dna, patient),
    ]
    # Columns 7-22: visit samples looked up by container label.
    sample_cells.extend(
        get_label_info(patient, label, visit) for label, visit in _label_specs
    )

    # Columns 1-2: investigator and patient number; the patient number links to
    # its row on the patient list sheet when that row is known.
    for col_idx, text in enumerate((investigator, patient), 1):
        cell = analysis_ws.cell(row=row_idx, column=col_idx, value=text)
        has_patient_link = col_idx == 2 and patient in patient_row_map
        if has_patient_link:
            cell.hyperlink = f"#'{pat_sheet_name}'!B{patient_row_map[patient]}"
            cell.font = _patient_link_font
        else:
            cell.font = data_font
        cell.border = border
        cell.alignment = _analysis_alignment

    # Columns 3+: green linked date when the sample exists, red cell otherwise.
    for col_idx, (dt, excel_row) in enumerate(sample_cells, 3):
        cell = analysis_ws.cell(row=row_idx, column=col_idx, value=dt)
        if dt and excel_row is not None:
            cell.hyperlink = f"#'{src_sheet_name}'!A{excel_row}"
            cell.font = date_font_link
            cell.fill = yes_fill
            cell.number_format = 'DD-MMM-YYYY'
        else:
            cell.font = _missing_font
            cell.fill = no_fill
        cell.border = border
        cell.alignment = _analysis_alignment
# ── Sheet: Seznam pacientů (patient / visit list) ──────────────────────────────
patients_ws = out_wb.create_sheet("Seznam pacientů")

# (header caption, column width) in sheet column order.
pat_columns = [
    ("Číslo centra", 20),
    ("Číslo pacienta", 20),
    ("Kód návštěvy", 20),
    ("Datum návštěvy", 16),
    ("Typ návštěvy", 16),
]
_pat_header_alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)
for position, (caption, col_width) in enumerate(pat_columns, start=1):
    header_cell = patients_ws.cell(row=1, column=position, value=caption)
    header_cell.font = header_font
    header_cell.fill = header_fill
    header_cell.border = border
    header_cell.alignment = _pat_header_alignment
    patients_ws.column_dimensions[get_column_letter(position)].width = col_width
patients_ws.row_dimensions[1].height = 30
patients_ws.freeze_panes = "A2"

# EDC columns feeding the five sheet columns above, in the same order.
pat_col_keys = ['SiteNumber', 'Subject', 'InstanceName', 'Field4Value', 'Field5Value']
pat_df = edc_df_raw[pat_col_keys].copy()
pat_df['Field4Value'] = pat_df['Field4Value'].apply(fmt_date_edc)
pat_df = pat_df.sort_values(['SiteNumber', 'Subject', 'Field4Value']).reset_index(drop=True)

_pat_data_alignment = Alignment(horizontal='center', vertical='center')
for sheet_row, (_, visit) in enumerate(pat_df.iterrows(), start=2):
    for sheet_col, key in enumerate(pat_col_keys, start=1):
        value = clean(visit[key])
        cell = patients_ws.cell(row=sheet_row, column=sheet_col, value=value)
        cell.font = data_font
        cell.border = border
        cell.alignment = _pat_data_alignment
        # Only the visit date column (4th) gets a date format.
        if key == 'Field4Value' and value is not None:
            cell.number_format = 'DD-MMM-YYYY'
# ── Helper for the summary tables ──────────────────────────────────────────────
def write_summary_table(ws, current_row, title, rows_data, col_a_header):
    """Write one four-column summary table and return the next free row.

    Layout, starting at ``current_row``: a merged title banner, a header row,
    one row per entry of ``rows_data`` and a "CELKEM" totals row, followed by
    one empty spacer row.

    Args:
        ws: worksheet to write into.
        current_row: sheet row of the title banner.
        title: banner text.
        rows_data: iterable of ``(col_a, col_b, n_soon, n_ok)`` tuples.
        col_a_header: caption of the first column.

    Returns:
        The row index at which the next table can start.
    """
    centered = Alignment(horizontal="center", vertical="center")
    left_aligned = Alignment(horizontal="left", vertical="center")

    # Title banner: fill and border on A..D, then merge into one cell.
    title_row = current_row
    for col in range(1, 5):
        banner_cell = ws.cell(row=title_row, column=col)
        banner_cell.fill = dark_blue_fill
        banner_cell.border = border
    title_cell = ws.cell(row=title_row, column=1, value=title)
    title_cell.font = Font(name='Calibri', bold=True, size=12, color="FFFFFF")
    title_cell.alignment = left_aligned
    ws.merge_cells(start_row=title_row, start_column=1, end_row=title_row, end_column=4)
    ws.row_dimensions[title_row].height = 22

    # Header row; the two count columns carry their own colour.
    header_row = title_row + 1
    header_specs = (
        (col_a_header, header_fill),
        ("Description", header_fill),
        ("Expiruje do 30 dní", orange_fill),
        ("Expiruje později", green_fill),
    )
    header_alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
    for col, (caption, fill) in enumerate(header_specs, 1):
        header_cell = ws.cell(row=header_row, column=col, value=caption)
        header_cell.font = sum_header_font
        header_cell.fill = fill
        header_cell.border = border
        header_cell.alignment = header_alignment
    ws.row_dimensions[header_row].height = 28

    # Body rows; running totals feed the final row.
    row_no = header_row + 1
    total_soon = 0
    total_ok = 0
    for col_a, col_b, n_soon, n_ok in rows_data:
        total_soon += n_soon
        total_ok += n_ok
        # A zero is printed red when both counts of the row are zero, grey otherwise.
        both_zero = n_soon == 0 and n_ok == 0
        zero_count_font = zero_red_font if both_zero else zero_font
        row_cells = (
            (col_a, None),
            (col_b, None),
            (n_soon, orange_fill if n_soon > 0 else None),
            (n_ok, green_fill if n_ok > 0 else None),
        )
        for col, (val, fill) in enumerate(row_cells, 1):
            cell = ws.cell(row=row_no, column=col, value=val)
            is_zero_count = col >= 3 and val == 0
            cell.font = zero_count_font if is_zero_count else data_font
            cell.border = border
            cell.alignment = centered if col >= 2 else left_aligned
            if fill:
                cell.fill = fill
        row_no += 1

    # Totals row.
    for col, val in enumerate(("CELKEM", "", total_soon, total_ok), 1):
        total_cell = ws.cell(row=row_no, column=col, value=val)
        total_cell.font = sum_total_font
        total_cell.fill = total_fill
        total_cell.border = border
        total_cell.alignment = centered if col >= 2 else left_aligned

    # Skip the totals row plus one blank spacer row.
    return row_no + 2
# ── Sheet: Kit Inventory CZE ───────────────────────────────────────────────────
kit_ws = out_wb.create_sheet("Kit Inventory CZE")

# (header caption, column width); each caption is also the column name read
# from every row of `cze`.
listing_columns = [
    ("Project No.", 14),
    ("Region", 10),
    ("Country", 10),
    ("Site", 38),
    ("Kit Type", 12),
    ("Description", 22),
    ("Accession", 18),
    ("Shipped Date", 16),
    ("Expiration Date", 16),
    ("Days to Expiration", 20),
]
_kit_header_alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
for position, (caption, col_width) in enumerate(listing_columns, start=1):
    header_cell = kit_ws.cell(row=1, column=position, value=caption)
    header_cell.font = header_font
    header_cell.fill = header_fill
    header_cell.border = border
    header_cell.alignment = _kit_header_alignment
    kit_ws.column_dimensions[get_column_letter(position)].width = col_width
kit_ws.row_dimensions[1].height = 30
kit_ws.freeze_panes = "A2"

_kit_date_columns = ("Shipped Date", "Expiration Date")
_kit_data_alignment = Alignment(horizontal="center", vertical="center")
for sheet_row, (_, kit_row) in enumerate(cze.iterrows(), start=2):
    days = kit_row.get("Days to Expiration")
    for sheet_col, (col_name, _width) in enumerate(listing_columns, start=1):
        value = clean(kit_row.get(col_name))
        cell = kit_ws.cell(row=sheet_row, column=sheet_col, value=value)
        cell.font = data_font
        cell.border = border
        cell.alignment = _kit_data_alignment
        if value is not None and col_name in _kit_date_columns:
            cell.number_format = "DD-MMM-YYYY"
        if col_name == "Days to Expiration":
            # NOTE(review): kits with a missing "Days to Expiration" are painted
            # with ok_fill, and the threshold here is 60 days while the summary
            # tables are titled "30 dní" — confirm both are intended.
            expiring = pd.notna(days) and days <= 60
            cell.fill = exp_fill if expiring else ok_fill

kit_ws.auto_filter.ref = f"A1:{get_column_letter(len(listing_columns))}1"
# ── Sheet: Přehled po centrech (summary per site) ──────────────────────────────
ctr_ws = out_wb.create_sheet("Přehled po centrech")
for letter, col_width in (("A", 22), ("B", 24), ("C", 22), ("D", 20)):
    ctr_ws.column_dimensions[letter].width = col_width

# One summary table per site, one table row per kit type (in kit_order).
current_row = 1
for site in kit_sites:
    site_df = cze[cze["Site"] == site]
    rows_data = []
    for kit in kit_order:
        desc = kit_desc.get(kit, "")
        buckets = site_df.loc[site_df["Kit Type"] == kit, "_bucket"]
        soon_count = int((buckets == "soon").sum())
        ok_count = int((buckets == "ok").sum())
        # NOTE(review): kit type and description are joined with no separator
        # and the description is repeated in the next column — confirm intent.
        rows_data.append((f"{kit}{desc}", desc, soon_count, ok_count))
    current_row = write_summary_table(ctr_ws, current_row, site, rows_data, "Kit Type")
# ── Sheet: Přehled po typech (summary per kit type) ────────────────────────────
sum_ws = out_wb.create_sheet("Přehled po typech")
for letter, col_width in (("A", 38), ("B", 22), ("C", 22), ("D", 20)):
    sum_ws.column_dimensions[letter].width = col_width

# One summary table per kit type, one table row per site that holds that kit.
current_row = 1
for kit in kit_order:
    desc = kit_desc.get(kit, "")
    kit_df = cze[cze["Kit Type"] == kit]
    rows_data = []
    for site in sorted(kit_df["Site"].unique()):
        buckets = kit_df.loc[kit_df["Site"] == site, "_bucket"]
        soon_count = int((buckets == "soon").sum())
        ok_count = int((buckets == "ok").sum())
        rows_data.append((site, desc, soon_count, ok_count))
    # NOTE(review): kit type and description are joined with no separator in
    # the table title — confirm intent.
    table_title = f"Kit Type {kit}{desc}"
    current_row = write_summary_table(sum_ws, current_row, table_title, rows_data, "Centrum")
# ── Sheet: eQueries ────────────────────────────────────────────────────────────
# Source: covance.equeries (study 35472 = 77242113UCO3001), all CZECH REPUBLIC.
# The Status column is colour-coded: In Progress (open) = red,
# Response Received = orange, Closed = green.
eq_ws = out_wb.create_sheet("eQueries")

# (header caption, column width); each caption is also the column name read
# from every row of eq_df.
eq_columns = [
    ("Site", 30),
    ("Subject", 16),
    ("Visit", 26),
    ("Visit Collection Date", 20),
    ("Accession", 16),
    ("eQueryId", 14),
    ("Issue Type", 18),
    ("Status", 18),
    ("Create Date", 20),
    ("Response Date Time", 20),
    ("Time Before Response", 18),
    ("User Name", 22),
]
date_cols = {"Visit Collection Date", "Create Date", "Response Date Time"}
status_fill = {
    "In Progress": exp_fill,           # open — red
    "Response Received": orange_fill,  # orange
    "Closed": green_fill,              # green
}

_eq_header_alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
for position, (caption, col_width) in enumerate(eq_columns, start=1):
    header_cell = eq_ws.cell(row=1, column=position, value=caption)
    header_cell.font = header_font
    header_cell.fill = header_fill
    header_cell.border = border
    header_cell.alignment = _eq_header_alignment
    eq_ws.column_dimensions[get_column_letter(position)].width = col_width
eq_ws.row_dimensions[1].height = 30
eq_ws.freeze_panes = "A2"

_eq_data_alignment = Alignment(horizontal="center", vertical="center")
for sheet_row, (_, query) in enumerate(eq_df.iterrows(), start=2):
    status_val = query.get("Status")
    for sheet_col, (col_name, _width) in enumerate(eq_columns, start=1):
        value = clean(query.get(col_name))
        cell = eq_ws.cell(row=sheet_row, column=sheet_col, value=value)
        cell.font = data_font
        cell.border = border
        cell.alignment = _eq_data_alignment
        if value is not None and col_name in date_cols:
            cell.number_format = "DD-MMM-YYYY HH:MM"
        # Statuses without a mapped colour keep the default background.
        if col_name == "Status" and status_val in status_fill:
            cell.fill = status_fill[status_val]

eq_ws.auto_filter.ref = f"A1:{get_column_letter(len(eq_columns))}1"
# ── Save ───────────────────────────────────────────────────────────────────────
# Write the workbook to out_path, close the MongoDB client, then print a short
# run summary (output path and the sizes of the main data sets).
out_wb.save(out_path)
client.close()
print(f"\nUloženo: {out_path}")
print(f"Pacienti s BXSCR: {len(bxscr_patients)}, Všichni pacienti: {len(all_patients)}")
print(f"CZE kity: {len(cze)}, Typy kitů: {len(kit_order)}, Centra: {len(kit_sites)}")
print(f"eQueries (UCO3001): {len(eq_df)}")
@@ -0,0 +1,53 @@
# download_lab_reports_v1.0.py
**Verze:** 1.0 &nbsp;|&nbsp; **Datum:** 2026-06-16
Stahuje PDF **Lab Reports** ze `xsp.labcorp.com` pro studii **77242113UCO3001**
(interní číslo `36940`), filtrovaně na **10 českých center**.
## Princip
Stejný jako `download_test_results` — Playwright + perzistentní profil
(`browser_profile/`), jednorázový login (email → Next → heslo → Verify, jinak se
přeskočí), stahování **přes UI**: klik na odkaz **„English"** ve sloupci Download
→ `POST /api/download/documentFile` → `expect_download`.
Lab Reports je **AG Grid s virtuálním renderem** (~50 z 298 řádků v DOM).
Skript proto scrolluje viewport po indexech (`row-index`), u každého řádku
přečte metadata z buněk (`col-id`: `subjectId`, `accessionNumber`, `visit`,
`visitCollectionDate`, `siteNum`, `postedDateTime`; zdvojený text se odstraní)
a klikne na „English".
## Název PDF
Mezery místo podtržítek; první datum = **odběr** (`visitCollectionDate`),
`posted` = **datum vystavení** (`postedDateTime`, jen datum) — odliší reissue
stejného reportu (stejný accession, různé Posted):
```
77242113UCO3001 {odběr} {Site} {Subject} {Visit} {Accession} posted {posted}.pdf
```
Příklad: `77242113UCO3001 2026-06-11 CZ10009 CZ100092003 Screening 6227323331 posted 2026-06-15.pdf`
Při zbylé kolizi názvu se přidá ` (2)`, ` (3)`, …
## Spuštění
```
python download_lab_reports_v1.0.py --dry-run # jen vypíše názvy, nestahuje
python download_lab_reports_v1.0.py --limit 5 # stáhne jen prvních 5 (test)
python download_lab_reports_v1.0.py # stáhne vše (~298)
```
**Výstup:** `U:\PythonProject\Janssen\Covance\LabReports\`
## Konfigurace (v hlavičce skriptu)
- `SITES` — 10 interních ID center (z „GO TO LINK" URL).
- `STUDY` = `36940`, `STUDY_CODE` = `77242113UCO3001`.
- `OUT_DIR`, `LOGIN_URL`, `PROFILE_DIR`.
## TODO / k doladění
- Cílový adresář (možná Dropbox / per-studie podsložka).
- Případně přeskakovat už stažené (teď se přidává `(2)`).
## Pozn. ke spuštění
Skript otevírá viditelné GUI Chrome (Playwright) — musí běžet z **terminálu
uživatele s desktop session** (PowerShell / dvojklik), ne z headless/agent
prostředí (tam Chromium spadne s `exitCode 2147483651`).
+280
View File
@@ -0,0 +1,280 @@
# =============================================================================
# Název: download_lab_reports_v1.0.py
# Verze: 1.0
# Datum: 2026-06-16
# Popis: Stahuje PDF Lab Reports ze xsp.labcorp.com pro studii 77242113UCO3001
# (interni cislo 36940), filtrovane na 10 ceskych center (CZ).
# Princip stejny jako download_test_results: Playwright + perzistentni
# profil (browser_profile/), jednorazovy login, stahovani pres UI
# (klik na "English" v sloupci Download -> expect_download).
#
# Lab Reports grid je AG Grid s virtualnim renderem (~50 z 298 radku
# v DOM). Skript proto scrolluje viewport po indexech (row-index),
# u kazdeho radku precte metadata + klikne na "English".
#
# Nazev PDF (mezery, ne podtrzitka):
# "77242113UCO3001 {yyyy-mm-dd odber} {Site} {Subject} {Visit} {Accession} posted {yyyy-mm-dd posted}.pdf"
# Pri kolizi nazvu se prida " (2)", " (3)", ...
#
# Prepinace:
# --dry-run nestahuje, jen vypise metadata a vysledne nazvy souboru
# --limit N zpracuje jen prvnich N radku (test pojmenovani)
# =============================================================================
from playwright.sync_api import sync_playwright
from datetime import datetime
import argparse
import json
import os
import re
import traceback
import urllib.parse
# --- arguments ----------------------------------------------------------------
# The command line is parsed at module level; the result is kept in ARGS.
#   --dry-run  do not download, only print metadata and the resulting file names
#   --limit N  process only the first N rows (0 = all rows)
parser = argparse.ArgumentParser(description="Stahovani Lab Reports PDF (XSP) pro 77242113UCO3001.")
parser.add_argument("--dry-run", action="store_true", help="nestahovat, jen vypsat metadata + nazvy")
parser.add_argument("--limit", type=int, default=0, help="zpracovat jen prvnich N radku (0 = vse)")
ARGS = parser.parse_args()
def log(msg):
    """Print ``msg`` prefixed with the current ``[HH:MM:SS]`` time, flushing at once."""
    stamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{stamp}] {msg}", flush=True)
# --- configuration ------------------------------------------------------------
# Credentials are read from the environment (XSP_EMAIL / XSP_PASSWORD) so that
# no secret is stored in the repository. The password that used to be
# hard-coded here is part of the commit history and should be rotated.
# An empty PASSWORD is acceptable as long as the persistent browser profile
# still holds a valid session, because the login form is then never shown.
EMAIL = os.environ.get("XSP_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("XSP_PASSWORD", "")
LOGIN_URL = "https://xsp.labcorp.com/"
STUDY = "36940"                  # internal XSP study number
STUDY_CODE = "77242113UCO3001"   # protocol code used in file names
OUT_DIR = r"U:\PythonProject\Janssen\Covance\LabReports"

# 10 sites (internal site IDs) — taken from the "GO TO LINK" URL supplied by the user.
SITES = [
    "930539", "930547", "930555", "930556", "930553",
    "930549", "930525", "930536", "930557", "930531",
]

# The persistent browser profile lives next to this script.
_BASE = os.path.dirname(os.path.abspath(__file__))
PROFILE_DIR = os.path.join(_BASE, "browser_profile")
def lab_reports_url():
    """Return the Lab Reports URL of STUDY, filtered to SITES.

    The site filter is the compact JSON array of SITES (no spaces after the
    separators), percent-encoded into the ``site`` query parameter.
    """
    compact_sites = json.dumps(SITES, separators=(",", ":"))  # ["930539","930547",...]
    encoded_sites = urllib.parse.quote(compact_sites)
    return f"https://xsp.labcorp.com/sponsor/study/{STUDY}/lab-reports?site={encoded_sites}"
# --- file name helpers ----------------------------------------------------------
def safe(s: str) -> str:
    """Drop the characters Windows forbids in file names; spaces are kept.

    ``None`` or an empty value yields ``""``; leading and trailing whitespace
    is stripped from the result.
    """
    forbidden = dict.fromkeys(map(ord, '\\/:*?"<>|'))
    text = s or ""
    return text.translate(forbidden).strip()
# English three-letter month abbreviation -> two-digit month number.
_MONTHS = {
    abbr: f"{number:02d}"
    for number, abbr in enumerate(
        ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
        start=1,
    )
}


def fmt_date(s: str) -> str:
    """Convert 'Jun 10, 2026' or 'Jun 15, 2026 7:49 PM' to '2026-06-10' style.

    Only the leading date (month day, year) is used; a trailing time is
    ignored. Text that does not start with a recognised date is returned
    through ``safe()`` instead.
    """
    match = re.match(r"\s*([A-Za-z]{3})\w*\s+(\d{1,2}),\s+(\d{4})", s or "")
    if match is None:
        return safe(s)
    month_abbr, day, year = match.groups()
    month = _MONTHS.get(month_abbr)  # lookup is case-sensitive
    if month is None:
        return safe(s)
    return f"{year}-{month}-{int(day):02d}"
def build_basename(meta: dict) -> str:
    """Build the PDF base name (without extension) for one grid row.

    The posted date is part of the name so that a reissue of the same report
    (same accession, different Posted) gets a different file name; names that
    still collide are numbered by ``unique_path()``.
    """
    collected = fmt_date(meta['collected'])
    site, subject = meta['site'], meta['subject']
    visit, accession = meta['visit'], meta['accession']
    posted = fmt_date(meta['posted'])
    raw_name = f"{STUDY_CODE} {collected} {site} {subject} {visit} {accession} posted {posted}"
    return safe(raw_name)
def unique_path(out_dir: str, base: str, ext: str = ".pdf") -> str:
    """Return a path in ``out_dir`` that does not exist yet.

    The first candidate is ``base + ext``; while a candidate exists on disk,
    " (2)", " (3)", ... is appended to ``base``.
    """
    candidate = os.path.join(out_dir, f"{base}{ext}")
    counter = 1
    while os.path.exists(candidate):
        counter += 1
        candidate = os.path.join(out_dir, f"{base} ({counter}){ext}")
    return candidate
# --- JS helpers (reading the AG Grid) -------------------------------------------
# JS_GRID_INFO: returns {rowHeight, total}. rowHeight is the rendered height of
# the first '.ag-row' (25 when no row is found or the height is 0); total is the
# inline style height of '.ag-body-container' divided by that row height, rounded.
JS_GRID_INFO = r"""() => {
const c = document.querySelector('.ag-body-container');
const r = document.querySelector('.ag-body-container .ag-row');
const rh = r ? r.getBoundingClientRect().height : 25;
const ch = c ? parseFloat(c.style.height || '0') : 0;
return { rowHeight: rh || 25, total: rh ? Math.round(ch / rh) : 0 };
}"""
# JS_READ_ROW(idx): returns the metadata of the row whose row-index attribute is
# idx, or null when that row is not in the DOM. Cells are located by col-id;
# dedup() collapses whitespace and, when the text is the same string twice
# ("XX" or "X X"), keeps a single copy.
# NOTE(review): a genuine value that is itself a doubled string (e.g. "1212")
# would also be halved by dedup() — confirm this cannot occur in these columns.
JS_READ_ROW = r"""(idx) => {
const dedup = s => {
s = (s || '').replace(/\s+/g, ' ').trim();
const h = s.slice(0, Math.floor(s.length / 2));
if (s === h + h) return h;
const m = s.match(/^(.*?)\s+\1$/);
if (m) return m[1];
return s;
};
const row = document.querySelector('.ag-body-container .ag-row[row-index="' + idx + '"]');
if (!row) return null;
const get = id => {
const c = row.querySelector('[col-id="' + id + '"]');
return c ? dedup(c.textContent) : '';
};
return {
subject: get('subjectId'),
accession: get('accessionNumber'),
visit: get('visit'),
collected: get('visitCollectionDate'),
site: get('siteNum'),
posted: get('postedDateTime'),
};
}"""
# JS_SCROLL_TO([idx, rh]): sets the scrollTop of '.ag-body-viewport' to
# idx * rh minus half the viewport height (never below 0), i.e. it aims to put
# row idx in the middle of the viewport. Does nothing when the viewport is missing.
JS_SCROLL_TO = r"""(args) => {
const [idx, rh] = args;
const vp = document.querySelector('.ag-body-viewport');
if (!vp) return;
vp.scrollTop = Math.max(0, idx * rh - vp.clientHeight / 2);
}"""
# --- login ----------------------------------------------------------------------
def login(page):
    """Log in to XSP, unless the persistent profile already holds a session.

    The login page is opened and the "Email" field awaited for 12 s. If it does
    not appear in time, the session is taken to be active and the login is
    skipped. Otherwise the flow email -> Next -> password -> Verify is run and
    the post-login redirect is awaited on a best-effort basis.

    Only Playwright timeouts are treated as "field did not appear" / "redirect
    did not finish"; any other error (navigation failure, closed page,
    ambiguous locator, ...) propagates instead of being read as a valid session.

    Raises:
        RuntimeError: the login form is shown but EMAIL or PASSWORD is empty.
    """
    # Imported locally so the module-level import list stays untouched.
    from playwright.sync_api import TimeoutError as PlaywrightTimeoutError

    log("LOGIN: otviram login stranku...")
    page.goto(LOGIN_URL)
    email_field = page.get_by_label("Email")
    try:
        email_field.wait_for(state="visible", timeout=12000)
    except PlaywrightTimeoutError:
        log(f"LOGIN: Email pole se neobjevilo -> session aktivni, login preskocen ({page.url})")
        return

    # Fail fast with a clear message rather than submitting an empty form and
    # timing out much later while waiting for the grid.
    if not EMAIL or not PASSWORD:
        raise RuntimeError("LOGIN: chybi prihlasovaci udaje (EMAIL/PASSWORD), nelze se prihlasit.")

    log("LOGIN: zadavam email...")
    email_field.fill(EMAIL)
    page.get_by_role("button", name="Next").click()

    log("LOGIN: cekam na pole pro heslo...")
    password_field = page.get_by_label("Password")
    password_field.wait_for(state="visible", timeout=30000)
    log("LOGIN: zadavam heslo...")
    password_field.fill(PASSWORD)
    page.get_by_role("button", name="Verify").click()

    log("LOGIN: cekam na presmerovani po prihlaseni...")
    try:
        page.wait_for_url(lambda url: "code=" not in url or "xsp." in url, timeout=60000)
    except PlaywrightTimeoutError:
        # Deliberately best-effort: the grid loader waits for the page anyway.
        log("LOGIN: wait_for_url vyprsel, pokracuji.")
    page.wait_for_timeout(3000)
    log(f"LOGIN: prihlaseni hotovo ({page.url})")
# --- loading the grid -----------------------------------------------------------
def open_grid(page):
    """Open the Lab Reports grid and wait until its row count settles.

    After the first row appears, the grid info is polled every 2 s (at most
    20 times) until two consecutive polls report the same non-zero total.

    Returns:
        ``(total, row_height)`` taken from a final ``JS_GRID_INFO`` read.
    """
    target_url = lab_reports_url()
    log(f"GRID: navigace na Lab Reports ({len(SITES)} center)...")
    page.goto(target_url)
    log("GRID: cekam na radky (.ag-row)...")
    page.wait_for_selector(".ag-body-container .ag-row", timeout=120000)

    # Row-count stabilisation.
    previous_total = -1
    for attempt in range(1, 21):
        snapshot = page.evaluate(JS_GRID_INFO)
        current_total = snapshot["total"]
        log(f" ...kontrola #{attempt}: total={current_total}, rowHeight={snapshot['rowHeight']}")
        if current_total > 0 and current_total == previous_total:
            break
        previous_total = current_total
        page.wait_for_timeout(2000)

    final = page.evaluate(JS_GRID_INFO)
    log(f"GRID: nacteno, total={final['total']} radku, rowHeight={final['rowHeight']}px.")
    return final["total"], final["rowHeight"]
# --- downloading a single row ---------------------------------------------------
def process_row(page, idx, row_height, dry_run):
    """Scroll to grid row ``idx``, read its metadata and download its PDF.

    Args:
        page: Playwright page showing the Lab Reports grid.
        idx: row index of the grid row.
        row_height: row height in pixels, used to compute the scroll offset.
        dry_run: when true, only log the target file name.

    Returns:
        The file name (without directory) the report is, or would be, saved as.

    Raises:
        RuntimeError: the row metadata is missing or has no subject.
    """
    row_selector = f'.ag-body-container .ag-row[row-index="{idx}"]'

    # The grid renders rows virtually, so the row must be scrolled into the DOM.
    page.evaluate(JS_SCROLL_TO, [idx, row_height])
    page.wait_for_selector(row_selector, timeout=15000)
    page.wait_for_timeout(150)

    meta = page.evaluate(JS_READ_ROW, idx)
    if not (meta and meta.get("subject")):
        raise RuntimeError(f"radek {idx}: nepodarilo se precist metadata")

    dest = unique_path(OUT_DIR, build_basename(meta))
    file_name = os.path.basename(dest)
    if dry_run:
        log(f" [DRY] #{idx}: {file_name}")
        return file_name

    english_link = page.locator(f'{row_selector} a.dl-link', has_text="English").first
    with page.expect_download(timeout=60000) as pending:
        english_link.click()
    pending.value.save_as(dest)
    log(f" #{idx}: -> {file_name}")
    return file_name
# --- main -----------------------------------------------------------------------
def main():
    """Run the whole download: start the browser, log in, walk the grid rows.

    A failing row is logged and skipped; the indices of failed rows are listed
    at the end. The browser context is closed in a ``finally`` block, so it is
    shut down in an orderly way even when login or grid loading raises.
    """
    os.makedirs(OUT_DIR, exist_ok=True)
    log(f"START: studie {STUDY_CODE} ({STUDY}), vystup '{OUT_DIR}', "
        f"{'DRY-RUN' if ARGS.dry_run else 'STAHOVANI'}"
        f"{f', limit {ARGS.limit}' if ARGS.limit else ''}.")

    with sync_playwright() as p:
        context = p.chromium.launch_persistent_context(
            user_data_dir=PROFILE_DIR,
            headless=False,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--start-maximized",
                "--disable-restore-session-state",
                "--disable-session-crashed-bubble",
            ],
            no_viewport=True,
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
            accept_downloads=True,
        )
        try:
            context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
            page = context.new_page()
            log("START: prohlizec spusten.")

            login(page)
            total, row_height = open_grid(page)
            if ARGS.limit:
                total = min(total, ARGS.limit)

            ok, failed = 0, []
            for idx in range(total):
                log(f">>> Radek {idx+1}/{total}")
                try:
                    process_row(page, idx, row_height, ARGS.dry_run)
                    ok += 1
                except Exception as e:
                    # Per-row boundary: one bad row must not stop the batch.
                    failed.append(idx)
                    log(f"CHYBA u radku {idx}: {e!r} — pokracuji dalsim.")

            log(f"KONEC: hotovo {ok}/{total} radku.")
            if failed:
                log(f"KONEC: SELHALY indexy: {failed}")
        finally:
            # Previously skipped whenever anything above raised.
            context.close()
if __name__ == "__main__":
    # Top-level boundary: report any failure with its traceback, then keep the
    # console window open until the user presses Enter.
    try:
        main()
    except Exception as exc:
        log(f"FATAL: beh spadl: {exc!r}")
        traceback.print_exc()
    finally:
        try:
            input("\n[Enter] pro zavreni tohoto okna...")
        except EOFError:
            # stdin is closed (non-interactive run) — nothing to wait for.
            pass
@@ -0,0 +1,49 @@
# download_lab_reports_v1.1.py
**Verze:** 1.1 &nbsp;|&nbsp; **Datum:** 2026-06-16
Stahuje PDF **Lab Reports** ze `xsp.labcorp.com` (studie **77242113UCO3001**,
interní `36940`, 10 CZ center) a ukládá je **přímo do MongoDB** — metadata
z tabulky + skutečné PDF (inline Binary). **Na disk neukládá.**
## Princip
Playwright + perzistentní profil (`browser_profile/`), jednorázový login,
klik na „English" ve sloupci Download. PDF bajty se čtou z Playwright temp
souboru (`download.path()`), `save_as` se nevolá → nic netrvalého na disku.
Materializaci adresáře z Monga řeší samostatný (budoucí) skript.
## Inkrementálně (stop-at-known)
List je řazený **Posted DESC** (nejnovější nahoře). Skript jde shora dolů;
u každého řádku nejdřív přečte metadata a spočítá `record_id`. Jakmile narazí
na **už uložený** report, **končí** (vše pod ním je starší a už v Mongo je) —
stahuje tedy jen nové.
**Korekce výsledků** = stejný report znovu vystavený s **novým Posted**
→ nový `record_id` → stáhne se jako nový, původní zůstává.
## MongoDB
- db `covance`, kolekce **`labreports`**
- klíč `record_id = "{site}|{subject}|{accession}|{visit}|{posted}"`
(Posted vč. času odlišuje reissue)
- dokument: `study`, `studyCode`, `type`, `site`, `subject`, `accession`,
`visit`, `collected` (yyyy-mm-dd), `posted` (yyyy-mm-dd HH:MM),
`fields` (sloupce tabulky), `fileName`, **`pdf`** (Binary ~260 KB),
`pdfSize`, `pdfSha256`, `firstSeen`, `lastSeen`, `history[]`
- upsert: nový → insert; změna sha/fields → push do `history` + update;
shoda → jen `lastSeen`
## Spuštění
```
python download_lab_reports_v1.1.py --dry-run # jen vypíše NOVÉ, nestahuje, nepíše
python download_lab_reports_v1.1.py --limit 5 # test: max 5 řádků
python download_lab_reports_v1.1.py # inkrementální běh (stop-at-known)
python download_lab_reports_v1.1.py --full # projít vše (rekonciliace)
```
## Pozn. ke spuštění
Otevírá viditelné GUI Chrome — musí běžet z **terminálu uživatele s desktop
session** (ne headless/agent prostředí).
## Nahrazuje
`download_lab_reports_v1.0.py` (ukládal na disk) — po dokončení jeho běhu
přesunout v1.0 (.py i .md) do `TRASH/`.
+399
View File
@@ -0,0 +1,399 @@
# =============================================================================
# Název: download_lab_reports_v1.1.py
# Verze: 1.1
# Datum: 2026-06-16
# Popis: Stahuje PDF Lab Reports ze xsp.labcorp.com pro studii 77242113UCO3001
# (interni cislo 36940), filtrovane na 10 ceskych center (CZ),
# a uklada je PRIMO do MongoDB (db covance, kolekce labreports) —
# metadata z tabulky + skutecne PDF (inline Binary). Na disk NEUKLADA.
#
# Princip stahovani stejny jako download_test_results: Playwright +
# perzistentni profil, jednorazovy login, klik na "English" ve sloupci
# Download. PDF bajty se ctou z Playwright temp souboru (download.path()),
# save_as se nevola -> nic netrvale neni na disku.
#
# INKREMENTALNE (stop-at-known): list je Posted DESC (nejnovejsi nahore).
# Skript jde shora dolu; u kazdeho radku nejdriv precte metadata a
# spocita record_id. Jakmile narazi na uz ulozeny report, KONCI
# (vse pod nim je starsi a uz v Mongo je). Korekce vysledku = stejny
# report znovu vystaveny s NOVYM Posted => novy record_id => stahne se
# jako novy, puvodni zustava.
#
# record_id = "{site}|{subject}|{accession}|{visit}|{posted}"
# (Posted vc. casu odlisuje reissue).
#
# Prepinace:
# --full projit vsechny radky (bez predcasneho konce); upsertne
# chybejici / zmenene (rekonciliace).
# --dry-run nestahuje ani nepise do DB; jen vypise NOVE reporty.
# --limit N zpracovat max N radku (test).
# =============================================================================
from playwright.sync_api import sync_playwright
from datetime import datetime
from pymongo import MongoClient, ASCENDING
from bson.binary import Binary
import argparse
import hashlib
import json
import os
import re
import traceback
import urllib.parse
# --- arguments ----------------------------------------------------------------
# The command line is parsed at module level; the result is kept in ARGS.
#   --full     walk all rows (no stop-at-known)
#   --dry-run  neither download nor write to the DB; only list new reports
#   --limit N  process at most N rows (0 = all rows)
parser = argparse.ArgumentParser(description="Stahovani Lab Reports PDF (XSP) do MongoDB.")
parser.add_argument("--full", action="store_true", help="projit vse (bez stop-at-known)")
parser.add_argument("--dry-run", action="store_true", help="nestahovat ani nepsat do DB; jen vypsat nove")
parser.add_argument("--limit", type=int, default=0, help="max N radku (0 = vse)")
ARGS = parser.parse_args()
def log(msg):
    """Print ``msg`` prefixed with the current ``[HH:MM:SS]`` time, flushing at once."""
    stamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{stamp}] {msg}", flush=True)
# --- configuration ------------------------------------------------------------
# Credentials are read from the environment (XSP_EMAIL / XSP_PASSWORD) so that
# no secret is stored in the repository. The password that used to be
# hard-coded here is part of the commit history and should be rotated.
# An empty PASSWORD is acceptable as long as the persistent browser profile
# still holds a valid session, because the login form is then never shown.
EMAIL = os.environ.get("XSP_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("XSP_PASSWORD", "")
LOGIN_URL = "https://xsp.labcorp.com/"
STUDY = "36940"                  # internal XSP study number
STUDY_CODE = "77242113UCO3001"   # protocol code

# Target MongoDB database and collection for the downloaded reports.
MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "covance"
COLLECTION = "labreports"

# 10 sites (internal site IDs) — taken from the "GO TO LINK" URL.
SITES = [
    "930539", "930547", "930555", "930556", "930553",
    "930549", "930525", "930536", "930557", "930531",
]

# The persistent browser profile lives next to this script.
_BASE = os.path.dirname(os.path.abspath(__file__))
PROFILE_DIR = os.path.join(_BASE, "browser_profile")
def lab_reports_url():
    """Return the Lab Reports URL of STUDY, filtered to SITES.

    The site filter is the compact JSON array of SITES (no spaces after the
    separators), percent-encoded into the ``site`` query parameter.
    """
    compact_sites = json.dumps(SITES, separators=(",", ":"))
    encoded_sites = urllib.parse.quote(compact_sites)
    return f"https://xsp.labcorp.com/sponsor/study/{STUDY}/lab-reports?site={encoded_sites}"
# --- formatting -----------------------------------------------------------------
# English three-letter month abbreviation -> two-digit month number.
_MONTHS = {
    abbr: f"{number:02d}"
    for number, abbr in enumerate(
        ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
         "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
        start=1,
    )
}


def safe(s: str) -> str:
    """Drop the characters Windows forbids in file names and strip the ends.

    ``None`` or an empty value yields ``""``.
    """
    forbidden = dict.fromkeys(map(ord, '\\/:*?"<>|'))
    text = s or ""
    return text.translate(forbidden).strip()


def fmt_date(s: str) -> str:
    """Convert 'Jun 10, 2026' or 'Jun 15, 2026 7:49 PM' to '2026-06-10' style.

    Only the leading date is used. Text that does not start with a recognised
    date is returned through ``safe()`` instead.
    """
    match = re.match(r"\s*([A-Za-z]{3})\w*\s+(\d{1,2}),\s+(\d{4})", s or "")
    if match is None:
        return safe(s)
    month_abbr, day, year = match.groups()
    month = _MONTHS.get(month_abbr)  # lookup is case-sensitive
    if month is None:
        return safe(s)
    return f"{year}-{month}-{int(day):02d}"
def fmt_datetime(s: str) -> str:
    """Convert 'Jun 15, 2026 7:49 PM' to '2026-06-15 19:49'.

    Both the minute and the second precision 12-hour formats are accepted
    (seconds are dropped). Text that matches neither is handed to
    ``fmt_date()``, so a value without a time yields just the date.
    """
    text = (s or "").strip()
    for pattern in ("%b %d, %Y %I:%M %p", "%b %d, %Y %I:%M:%S %p"):
        try:
            parsed = datetime.strptime(text, pattern)
        except ValueError:
            continue
        return parsed.strftime("%Y-%m-%d %H:%M")
    return fmt_date(text)
def make_record_id(meta: dict) -> str:
    """Build the record key "{site}|{subject}|{accession}|{visit}|{posted}".

    The posted value goes through ``fmt_datetime()``, so it keeps the time of
    day when the grid provides one.
    """
    key_parts = [meta[name] for name in ("site", "subject", "accession", "visit")]
    key_parts.append(fmt_datetime(meta["posted"]))
    return "|".join(key_parts)
def build_basename(meta: dict) -> str:
    """Build the PDF file name (including ".pdf") for one grid row.

    Per the original note, the name is kept for a future script that
    materialises the files, and is stored as ``fileName``.
    """
    collected = fmt_date(meta['collected'])
    site, subject = meta['site'], meta['subject']
    visit, accession = meta['visit'], meta['accession']
    posted = fmt_date(meta['posted'])
    raw_name = f"{STUDY_CODE} {collected} {site} {subject} {visit} {accession} posted {posted}"
    return safe(raw_name) + ".pdf"
# --- JS helpery (AG Grid) ---------------------------------------------------
JS_GRID_INFO = r"""() => {
const c = document.querySelector('.ag-body-container');
const r = document.querySelector('.ag-body-container .ag-row');
const rh = r ? r.getBoundingClientRect().height : 25;
const ch = c ? parseFloat(c.style.height || '0') : 0;
return { rowHeight: rh || 25, total: rh ? Math.round(ch / rh) : 0 };
}"""
# JavaScript evaluated in the page with one argument (the row index).
# Looks up the rendered row by its `row-index` attribute and returns null when
# that row is not currently in the DOM. Otherwise it returns the text of the
# listed columns (looked up by `col-id`), with whitespace collapsed and a
# doubled value ("X X" or "XX") reduced to a single "X" by `dedup`.
# NOTE(review): `dedup` also collapses a genuine value that happens to consist
# of two identical halves (e.g. "1212" -> "12"); confirm this cannot occur for
# accession or subject numbers.
JS_READ_ROW = r"""(idx) => {
const dedup = s => {
s = (s || '').replace(/\s+/g, ' ').trim();
const h = s.slice(0, Math.floor(s.length / 2));
if (s === h + h) return h;
const m = s.match(/^(.*?)\s+\1$/);
if (m) return m[1];
return s;
};
const row = document.querySelector('.ag-body-container .ag-row[row-index="' + idx + '"]');
if (!row) return null;
const get = id => {
const c = row.querySelector('[col-id="' + id + '"]');
return c ? dedup(c.textContent) : '';
};
return {
type: get('type'),
subject: get('subjectId'),
accession: get('accessionNumber'),
visit: get('visit'),
collected: get('visitCollectionDate'),
site: get('siteNum'),
posted: get('postedDateTime'),
};
}"""
# JavaScript evaluated in the page with one argument: [rowIndex, rowHeight].
# Sets scrollTop of `.ag-body-viewport` so that the requested row lands roughly
# in the middle of the viewport (never below 0). Does nothing when the viewport
# element is missing.
JS_SCROLL_TO = r"""(args) => {
const [idx, rh] = args;
const vp = document.querySelector('.ag-body-viewport');
if (!vp) return;
vp.scrollTop = Math.max(0, idx * rh - vp.clientHeight / 2);
}"""
# --- login ------------------------------------------------------------------
def login(page):
    """Sign in to the portal unless the persistent profile is already logged in.

    Opens ``LOGIN_URL`` and waits up to 12 s for the "Email" field. If it does
    not appear, the session is assumed to be active and the function returns.
    Otherwise the e-mail and password steps are filled in and the function
    waits (best effort, 60 s) for the post-login redirect, then pauses 3 s.
    """
    log("LOGIN: otviram login stranku...")
    page.goto(LOGIN_URL)

    email_field = page.get_by_label("Email")
    try:
        email_field.wait_for(state="visible", timeout=12000)
    except Exception:
        # No e-mail field within the timeout is treated as "already signed in".
        log(f"LOGIN: Email pole se neobjevilo -> session aktivni, login preskocen ({page.url})")
        return

    log("LOGIN: zadavam email...")
    email_field.fill(EMAIL)
    page.get_by_role("button", name="Next").click()

    log("LOGIN: cekam na pole pro heslo...")
    password_field = page.get_by_label("Password")
    password_field.wait_for(state="visible", timeout=30000)
    log("LOGIN: zadavam heslo...")
    password_field.fill(PASSWORD)
    page.get_by_role("button", name="Verify").click()

    log("LOGIN: cekam na presmerovani po prihlaseni...")
    try:
        page.wait_for_url(lambda url: "code=" not in url or "xsp." in url, timeout=60000)
    except Exception:
        # Deliberately non-fatal: the grid navigation that follows will fail
        # loudly if the login did not actually succeed.
        log("LOGIN: wait_for_url vyprsel, pokracuji.")
    page.wait_for_timeout(3000)
    log(f"LOGIN: prihlaseni hotovo ({page.url})")
def open_grid(page):
    """Open the Lab Reports grid and wait until its row count stops changing.

    Polls ``JS_GRID_INFO`` up to 20 times, 2 s apart, and stops as soon as two
    consecutive polls report the same non-zero total.

    Returns:
        (total, row_height) taken from one final evaluation of ``JS_GRID_INFO``.
    """
    log(f"GRID: navigace na Lab Reports ({len(SITES)} center)...")
    page.goto(lab_reports_url())
    log("GRID: cekam na radky (.ag-row)...")
    page.wait_for_selector(".ag-body-container .ag-row", timeout=120000)

    previous_total = -1
    for attempt in range(1, 21):
        snapshot = page.evaluate(JS_GRID_INFO)
        current_total = snapshot["total"]
        log(f" ...kontrola #{attempt}: total={current_total}, rowHeight={snapshot['rowHeight']}")
        if current_total > 0 and current_total == previous_total:
            break
        previous_total = current_total
        page.wait_for_timeout(2000)

    final = page.evaluate(JS_GRID_INFO)
    log(f"GRID: nacteno, total={final['total']} radku, rowHeight={final['rowHeight']}px.")
    return final["total"], final["rowHeight"]
def read_row(page, idx, row_height):
    """Scroll row *idx* into view and return its metadata dict.

    Raises:
        RuntimeError: if the row could not be read or has no subject.
    """
    row_selector = f'.ag-body-container .ag-row[row-index="{idx}"]'
    page.evaluate(JS_SCROLL_TO, [idx, row_height])
    page.wait_for_selector(row_selector, timeout=15000)
    # Short pause after the row appears, before its cells are read.
    page.wait_for_timeout(120)
    meta = page.evaluate(JS_READ_ROW, idx)
    if meta and meta.get("subject"):
        return meta
    raise RuntimeError(f"radek {idx}: nepodarilo se precist metadata")
def download_pdf_bytes(page, idx):
    """Click the "English" download link of row *idx* and return the PDF bytes.

    The bytes are read from Playwright's temporary download file; nothing is
    saved elsewhere on disk by this function.
    """
    link_selector = f'.ag-body-container .ag-row[row-index="{idx}"] a.dl-link'
    english_link = page.locator(link_selector, has_text="English").first
    with page.expect_download(timeout=60000) as download_info:
        english_link.click()
    temp_path = download_info.value.path()  # Playwright temp file
    with open(temp_path, "rb") as pdf_file:
        return pdf_file.read()
def upsert(col, meta, rid, data, now):
    """Insert or refresh the report document identified by *rid*.

    Args:
        col: target MongoDB collection.
        meta: row metadata as returned by ``read_row()``.
        rid: value stored and looked up as ``record_id``.
        data: raw PDF bytes.
        now: timestamp string written to ``firstSeen`` / ``lastSeen``.

    Returns:
        "insert" when no document existed, "update" when the PDF hash or the
        table fields changed (the previous state is pushed to ``history``),
        "same" when only ``lastSeen`` was refreshed.
    """
    digest = hashlib.sha256(data).hexdigest()
    # Raw table columns, kept verbatim next to the normalised top-level keys.
    fields = {
        "Type": meta["type"],
        "Subject": meta["subject"],
        "Accession": meta["accession"],
        "Visit": meta["visit"],
        "Collected Date": meta["collected"],
        "Site Number": meta["site"],
        "Posted": meta["posted"],
    }
    derived = {
        "study": STUDY,
        "studyCode": STUDY_CODE,
        "type": meta["type"] or "Lab Result",
        "site": meta["site"],
        "subject": meta["subject"],
        "accession": meta["accession"],
        "visit": meta["visit"],
        "collected": fmt_date(meta["collected"]),
        "posted": fmt_datetime(meta["posted"]),
        "fields": fields,
        "fileName": build_basename(meta),
        "pdf": Binary(data),
        "pdfSize": len(data),
        "pdfSha256": digest,
    }

    projection = {"pdfSha256": 1, "fields": 1, "lastSeen": 1}
    stored = col.find_one({"record_id": rid}, projection)
    if stored is None:
        new_doc = {"record_id": rid, **derived,
                   "firstSeen": now, "lastSeen": now, "history": []}
        col.insert_one(new_doc)
        return "insert"

    by_id = {"_id": stored["_id"]}
    unchanged = stored.get("pdfSha256") == digest and stored.get("fields") == fields
    if unchanged:
        col.update_one(by_id, {"$set": {"lastSeen": now}})
        return "same"

    previous_state = {
        "date": stored.get("lastSeen"),
        "fields": stored.get("fields"),
        "pdfSha256": stored.get("pdfSha256"),
    }
    col.update_one(
        by_id,
        {"$push": {"history": previous_state},
         "$set": {**derived, "lastSeen": now}},
    )
    return "update"
def _sync_grid(page, col, existing_ids, now):
    """Walk the grid top-down and store every report not yet in *existing_ids*.

    Without ``--full`` the walk stops at the first already-known record
    (stop-at-known). With ``--full`` known records are counted as unchanged and
    skipped without downloading. ``--limit`` caps the number of grid rows
    visited. *existing_ids* is extended in place with every new record id.
    """
    total, row_height = open_grid(page)
    if ARGS.limit:
        total = min(total, ARGS.limit)

    new_cnt = upd_cnt = same_cnt = 0
    failed = []
    stopped = False
    for idx in range(total):
        try:
            meta = read_row(page, idx, row_height)
        except Exception as e:
            failed.append(idx)
            log(f"CHYBA cteni radku {idx}: {e!r} — pokracuji.")
            continue

        rid = make_record_id(meta)
        if rid in existing_ids:
            if not ARGS.full:
                log(f">>> Radek {idx+1}/{total}: '{rid}' uz v Mongo "
                    f"-> stop-at-known, koncim (zbytek je starsi).")
                stopped = True
                break
            log(f" #{idx}: znamy, [full] preskakuji download.")
            same_cnt += 1
            continue

        # New report.
        if ARGS.dry_run:
            log(f" [DRY] NOVY #{idx}: {build_basename(meta)}")
            new_cnt += 1
            existing_ids.add(rid)
            continue
        try:
            data = download_pdf_bytes(page, idx)
            action = upsert(col, meta, rid, data, now)
            existing_ids.add(rid)
            if action == "insert":
                new_cnt += 1
                log(f" #{idx}: INSERT ({len(data)//1024} KB) {build_basename(meta)}")
            elif action == "update":
                upd_cnt += 1
                log(f" #{idx}: UPDATE {build_basename(meta)}")
            else:
                same_cnt += 1
        except Exception as e:
            failed.append(idx)
            log(f"CHYBA stazeni/zapisu radku {idx}: {e!r} — pokracuji.")

    log(f"KONEC: nove={new_cnt}, update={upd_cnt}, beze zmeny={same_cnt}, "
        f"chyby={len(failed)} {'(stop-at-known)' if stopped else '(projeto vse)'}.")
    if failed:
        log(f"KONEC: SELHALY indexy: {failed}")
        if not ARGS.full:
            # A failed row that sits below a stored one is never reached again
            # by a stop-at-known run, so point the operator at --full.
            log("KONEC: selhane radky mohou byt pri dalsim stop-at-known behu "
                "preskoceny -> doplnte je behem s --full.")


def main():
    """Run one synchronisation of Lab Reports PDFs into MongoDB.

    Loads the record ids already stored for the study (tolerating an
    unreachable MongoDB only in ``--dry-run``), starts a persistent Chromium
    profile, logs in and processes the grid. The browser context and the
    MongoDB client are closed even when a step raises, so the persistent
    profile is shut down cleanly.
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log(f"START: studie {STUDY_CODE} ({STUDY}) -> {DB_NAME}.{COLLECTION}, "
        f"{'DRY-RUN' if ARGS.dry_run else 'ZAPIS'}"
        f"{' [FULL]' if ARGS.full else ' [stop-at-known]'}"
        f"{f', limit {ARGS.limit}' if ARGS.limit else ''}.")

    col = None
    existing_ids = set()
    client = None
    try:
        if not ARGS.dry_run:
            client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
            client.admin.command("ping")
            col = client[DB_NAME][COLLECTION]
            col.create_index([("record_id", ASCENDING)], unique=True)
            for field in ("study", "site", "subject", "accession",
                          "posted", "collected"):
                col.create_index([(field, ASCENDING)])
            existing_ids = {d["record_id"] for d in col.find({"study": STUDY}, {"record_id": 1})}
            log(f"START: v Mongo uz je {len(existing_ids)} reportu pro tuto studii.")
        else:
            # Even in dry-run, load what is stored so that "new" is meaningful.
            try:
                client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
                client.admin.command("ping")
                existing_ids = {d["record_id"] for d in
                                client[DB_NAME][COLLECTION].find({"study": STUDY}, {"record_id": 1})}
                log(f"START: [dry-run] v Mongo je {len(existing_ids)} reportu.")
            except Exception as e:
                log(f"START: [dry-run] Mongo nedostupne ({e!r}), beru vse jako nove.")

        with sync_playwright() as p:
            context = p.chromium.launch_persistent_context(
                user_data_dir=PROFILE_DIR,
                headless=False,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--start-maximized",
                    "--disable-restore-session-state",
                    "--disable-session-crashed-bubble",
                ],
                no_viewport=True,
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
                accept_downloads=True,
            )
            try:
                context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
                page = context.new_page()
                log("START: prohlizec spusten.")
                login(page)
                _sync_grid(page, col, existing_ids, now)
            finally:
                # Always close the persistent context, also after an error.
                context.close()
    finally:
        if client is not None:
            client.close()
if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        # Top-level boundary: report the crash instead of letting the console
        # window disappear with the traceback.
        log(f"FATAL: beh spadl: {exc!r}")
        traceback.print_exc()
    finally:
        # Wait for Enter before exiting; skipped when stdin is closed.
        try:
            input("\n[Enter] pro zavreni tohoto okna...")
        except EOFError:
            pass
@@ -0,0 +1,56 @@
# download_lab_reports_v1.2.py
**Verze:** 1.2 &nbsp;|&nbsp; **Datum:** 2026-06-16
Stahuje PDF **Lab Reports** ze `xsp.labcorp.com` (studie **77242113UCO3001**,
interní `36940`, 10 CZ center) přímo do **MongoDB** (`covance.labreports`) —
metadata + skutečné PDF (inline Binary). **Na disk neukládá.**
## Klíč `record_id = reportId`
`reportId` = stabilní 32-hex ID dokumentu z dat AG Gridu, **napříč všemi řádky
unikátní** (ověřeno: 997 řádků → 997 unikátních `reportId`/`fileId`).
**Proč ne metadata:** na portálu se reálně vyskytují **různá PDF se shodnými
viditelnými metadaty** — stejný `site|subject|accession|visit|posted` i na
minutu (korekce výsledku / reissue se shodným časem Posted). Ověřeno: 5 souborů
pro accession `6227697718` má 5 různých sha256. Klíčování podle metadat (v1.1)
by je chybně sloučilo → **ztráta dat**. `reportId` to řeší.
## Odkud data
Z **in-memory dat AG Gridu** přes grid API (`__agComponent.gridApi`):
`getDisplayedRowAtIndex(i).data` → `reportId`, `fileLinks[].fileId`+`fileName`
(server), `postedDateTime` (ISO), `siteNum`, `subjectNumber`. `reportId` jde
přečíst **bez scrollování** → levný pre-check „už mám". Accession/Visit/Collected
leží v gridu níž (ve struktuře `visits`), proto se berou z buněk (po scrollu).
**Pozn.:** `accession` se nijak nepočítá ani neodvozuje — je to reálný
identifikátor přidělený kitu v laboratoři; čte se **doslova** z buňky a ukládá
1:1 (`accession` i `fields.Accession`).
## Stahování + inkrementálně
Klik na „English" → `expect_download`, PDF z `download.path()` (bez `save_as`).
List je Posted DESC; ze seznamu (rowIndex, reportId) se shora hledá první už
uložený `reportId` → **stop-at-known** (zbytek je starší a v Mongo je). Stahují
se jen nové (nahoře). Korekce = nový `reportId` → uloží se jako nový.
## MongoDB dokument
`record_id`(=reportId), `study`, `studyCode`, `type`, `site`, `subject`,
`accession`, `visit`, `collected`, `posted`, `postedIso`, `fileId`,
`serverFileName`, `fields` (sloupce tabulky), `fileName` (náš název),
**`pdf`** (Binary ~260 KB), `pdfSize`, `pdfSha256`, `firstSeen`, `lastSeen`,
`history[]`. Upsert: nový→insert; změna sha/fields→push history+update;
shoda→jen `lastSeen`.
Název v `fileName`: `77242113UCO3001 {odběr} {Site} {Subject} {Visit} {Accession} posted {posted}.pdf`
## Spuštění (z terminálu uživatele — otevírá GUI Chrome)
```
python download_lab_reports_v1.2.py --dry-run # vypíše NOVÉ, nestahuje
python download_lab_reports_v1.2.py --limit 5 # test: 5 nových
python download_lab_reports_v1.2.py # inkrementální běh
python download_lab_reports_v1.2.py --full # rekonciliace přes vše
```
## Nahrazuje
`download_lab_reports_v1.0.py` (disk) a `v1.1` (klíč podle metadat — chybný).
Obě → `TRASH/`.
+404
View File
@@ -0,0 +1,404 @@
# =============================================================================
# Název: download_lab_reports_v1.2.py
# Verze: 1.2
# Datum: 2026-06-16
# Popis: Stahuje PDF Lab Reports ze xsp.labcorp.com pro studii 77242113UCO3001
# (interni cislo 36940), filtrovane na 10 ceskych center (CZ),
# a uklada je PRIMO do MongoDB (db covance, kolekce labreports) —
# metadata z tabulky + skutecne PDF (inline Binary). Na disk NEUKLADA.
#
# KLIC: record_id = reportId (z dat AG Gridu) — stabilni 32-hex ID
# dokumentu, NAPRIC vsemi radky UNIKATNI (overeno: 997 radku ->
# 997 unikatnich reportId/fileId). Resi pripad, kdy se na portalu
# vyskytnou ruzna PDF se SHODNYMI viditelnymi metadaty (stejny
# site|subject|accession|visit|posted i na minutu) — to skutecne
# nastava (korekce vysledku / reissue se shodnym casem Posted).
# Verze v1.1 klicovala podle metadat a tyto by chybne slucovala.
#
# Princip stahovani: Playwright + perzistentni profil, login, klik na
# "English" ve sloupci Download -> expect_download; PDF bajty se ctou
# z Playwright temp souboru (download.path()), save_as se nevola.
#
# INKREMENTALNE (stop-at-known): list je Posted DESC (nejnovejsi
# nahore). Nejdriv se z grid API precte SEZNAM (rowIndex, reportId)
# BEZ stahovani; od shora se hleda prvni uz ulozeny reportId -> vse
# pod nim je starsi a uz v Mongo je. Stahuji se jen nove (nahore).
#
# Prepinace:
# --full projit vsechny radky (bez stop-at-known); upsert
# chybejicich (rekonciliace).
# --dry-run nestahuje ani nepise do DB; jen vypise NOVE reporty.
# --limit N zpracovat max N novych radku (test).
# =============================================================================
from playwright.sync_api import sync_playwright
from datetime import datetime
from pymongo import MongoClient, ASCENDING
from bson.binary import Binary
import argparse
import hashlib
import json
import os
import re
import traceback
import urllib.parse
# Command-line interface; parsed once at import time into ARGS.
parser = argparse.ArgumentParser(description="Stahovani Lab Reports PDF (XSP) do MongoDB.")
_CLI_OPTIONS = (
    ("--full", {"action": "store_true", "help": "projit vse (bez stop-at-known)"}),
    ("--dry-run", {"action": "store_true", "help": "nestahovat ani nepsat do DB; jen vypsat nove"}),
    ("--limit", {"type": int, "default": 0, "help": "max N novych radku (0 = vse)"}),
)
for _flag, _spec in _CLI_OPTIONS:
    parser.add_argument(_flag, **_spec)
ARGS = parser.parse_args()
def log(msg):
    """Print *msg* prefixed with the current time as [HH:MM:SS], flushing at once."""
    stamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{stamp}] {msg}", flush=True)
# --- konfigurace ------------------------------------------------------------
# SECURITY: the portal credentials are committed in plain text. They can now be
# supplied through the XSP_EMAIL / XSP_PASSWORD environment variables; the
# literals below remain only as a fallback so existing runs keep working.
# TODO: rotate the password and remove the literal fallback.
EMAIL = os.environ.get("XSP_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("XSP_PASSWORD", "%zT3Wqfc9)cWua5")
LOGIN_URL = "https://xsp.labcorp.com/"
STUDY = "36940"                  # study number used in portal URLs
STUDY_CODE = "77242113UCO3001"   # protocol code used in file names
MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "covance"
COLLECTION = "labreports"
# Site ids sent in the `site` URL filter of the Lab Reports page.
SITES = [
    "930539", "930547", "930555", "930556", "930553",
    "930549", "930525", "930536", "930557", "930531",
]
# The persistent browser profile lives next to this script.
_BASE = os.path.dirname(os.path.abspath(__file__))
PROFILE_DIR = os.path.join(_BASE, "browser_profile")
def lab_reports_url():
    """Return the Lab Reports URL for ``STUDY`` filtered to ``SITES``.

    The site list is sent as a compact JSON array, percent-encoded, in the
    ``site`` query parameter.
    """
    encoded_sites = urllib.parse.quote(json.dumps(SITES, separators=(",", ":")))
    base = f"https://xsp.labcorp.com/sponsor/study/{STUDY}/lab-reports"
    return f"{base}?site={encoded_sites}"
# --- formatovani -------------------------------------------------------------
_MONTHS = {"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05",
"Jun": "06", "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10",
"Nov": "11", "Dec": "12"}
def safe(s: str) -> str:
    """Strip the characters ``\\ / : * ? " < > |`` from *s* and trim whitespace.

    A falsy input (``None`` or ``""``) yields an empty string.
    """
    text = s or ""
    cleaned = re.sub(r'[\\/:*?"<>|]', "", text)
    return cleaned.strip()
def fmt_date(s: str) -> str:
    """Convert a portal date such as 'Jun 10, 2026' to '2026-06-10'.

    A trailing time part ('Jun 15, 2026 7:49 PM' -> '2026-06-15') is ignored.
    Input that does not match the pattern, or whose month abbreviation is not
    a key of ``_MONTHS``, is returned via ``safe()`` instead.
    """
    parsed = re.match(r"\s*([A-Za-z]{3})\w*\s+(\d{1,2}),\s+(\d{4})", s or "")
    if parsed is None:
        return safe(s)
    month_abbr = parsed.group(1)[:3]
    if month_abbr not in _MONTHS:
        return safe(s)
    day, year = parsed.group(2), parsed.group(3)
    return f"{year}-{_MONTHS[month_abbr]}-{int(day):02d}"
def build_basename(meta: dict) -> str:
    """Return the human-readable PDF name that is stored as ``fileName``.

    Layout: '<STUDY_CODE> <collected> <site> <subject> <visit> <accession>
    posted <posted>.pdf', with both dates passed through ``fmt_date()`` and
    the whole stem cleaned by ``safe()``. The posted date is taken from
    ``meta['postedDisplay']``.
    """
    raw_name = (
        f"{STUDY_CODE} {fmt_date(meta['collected'])} {meta['site']} "
        f"{meta['subject']} {meta['visit']} {meta['accession']} "
        f"posted {fmt_date(meta['postedDisplay'])}"
    )
    return f"{safe(raw_name)}.pdf"
# --- JS helpery (AG Grid) ---------------------------------------------------
# Seznam vsech radku (rowIndex + reportId + data, ktera nepotrebuji vykresleni).
# JavaScript evaluated in the page (no arguments). Takes `__agComponent` from
# the first matching grid element and uses its `gridApi` when that object has
# `forEachNode`, otherwise the component itself. Returns null when no such
# component is found or it lacks `getDisplayedRowCount`. Otherwise returns one
# entry per displayed row that has data: its index, reportId, site, subject and
# the file link whose language is 'English' (falling back to the first link).
# NOTE(review): `postedDateTime` is passed through unchanged; if it is not a
# plain string, `postedIso` receives whatever structure page.evaluate
# serialises it to — confirm the stored shape.
JS_ALL_ROWS = r"""() => {
let holder = null;
for (const el of document.querySelectorAll('.ag-root-wrapper, .ag-root, ag-grid-angular')) {
if (el.__agComponent) { holder = el.__agComponent; break; }
}
if (!holder) return null;
const api = (holder.gridApi && holder.gridApi.forEachNode) ? holder.gridApi : holder;
if (!api || !api.getDisplayedRowCount) return null;
const cnt = api.getDisplayedRowCount();
const out = [];
for (let i = 0; i < cnt; i++) {
const n = api.getDisplayedRowAtIndex(i);
if (!n || !n.data) continue;
const d = n.data;
const fl = (d.fileLinks || []).find(f => f.language === 'English') || (d.fileLinks || [])[0] || {};
out.push({
rowIndex: i,
reportId: d.reportId,
fileId: fl.fileId,
serverFileName: fl.fileName,
postedIso: d.postedDateTime,
site: d.siteNum,
subject: d.subjectNumber,
});
}
return out;
}"""
# Bunky daneho radku (potrebuji vykresleni -> nejdriv scroll). Accession/Visit/
# Collected nejsou v top-level datech (jsou ve 'visits'), beru je z bunek.
# JavaScript evaluated in the page with one argument (the row index).
# Returns null when the row is not currently rendered. Otherwise returns the
# text of the listed columns (looked up by `col-id`), with whitespace collapsed
# and a doubled value ("X X" or "XX") reduced to a single "X" by `dedup`.
# NOTE(review): `dedup` also collapses a genuine value made of two identical
# halves (e.g. "1212" -> "12"); confirm this cannot occur for accession numbers.
JS_CELLS = r"""(idx) => {
const dedup = s => {
s = (s || '').replace(/\s+/g, ' ').trim();
const h = s.slice(0, Math.floor(s.length / 2));
if (s === h + h) return h;
const m = s.match(/^(.*?)\s+\1$/);
if (m) return m[1];
return s;
};
const row = document.querySelector('.ag-body-container .ag-row[row-index="' + idx + '"]');
if (!row) return null;
const get = id => { const c = row.querySelector('[col-id="' + id + '"]'); return c ? dedup(c.textContent) : ''; };
return {
type: get('type'),
accession: get('accessionNumber'),
visit: get('visit'),
collected: get('visitCollectionDate'),
postedDisplay: get('postedDateTime'),
};
}"""
# JavaScript evaluated in the page with one argument: [rowIndex, rowHeight].
# Sets scrollTop of `.ag-body-viewport` so that the requested row lands roughly
# in the middle of the viewport (never below 0). Does nothing when the viewport
# element is missing.
JS_SCROLL_TO = r"""(args) => {
const [idx, rh] = args;
const vp = document.querySelector('.ag-body-viewport');
if (!vp) return;
vp.scrollTop = Math.max(0, idx * rh - vp.clientHeight / 2);
}"""
# JavaScript evaluated in the page (no arguments). Returns the bounding-box
# height of the first rendered `.ag-row`, or 25 when no row is rendered or the
# measured height is 0.
JS_ROW_HEIGHT = r"""() => {
const r = document.querySelector('.ag-body-container .ag-row');
return r ? r.getBoundingClientRect().height || 25 : 25;
}"""
# --- login ------------------------------------------------------------------
def login(page):
    """Sign in to the portal unless the persistent profile is already logged in.

    Opens ``LOGIN_URL`` and waits up to 12 s for the "Email" field. If it does
    not appear, the session is assumed to be active and the function returns.
    Otherwise the e-mail and password steps are filled in and the function
    waits (best effort, 60 s) for the post-login redirect, then pauses 3 s.
    """
    log("LOGIN: otviram login stranku...")
    page.goto(LOGIN_URL)

    email_field = page.get_by_label("Email")
    try:
        email_field.wait_for(state="visible", timeout=12000)
    except Exception:
        # No e-mail field within the timeout is treated as "already signed in".
        log(f"LOGIN: Email pole se neobjevilo -> session aktivni, login preskocen ({page.url})")
        return

    log("LOGIN: zadavam email...")
    email_field.fill(EMAIL)
    page.get_by_role("button", name="Next").click()

    log("LOGIN: cekam na pole pro heslo...")
    password_field = page.get_by_label("Password")
    password_field.wait_for(state="visible", timeout=30000)
    log("LOGIN: zadavam heslo...")
    password_field.fill(PASSWORD)
    page.get_by_role("button", name="Verify").click()

    log("LOGIN: cekam na presmerovani po prihlaseni...")
    try:
        page.wait_for_url(lambda url: "code=" not in url or "xsp." in url, timeout=60000)
    except Exception:
        # Deliberately non-fatal: the grid navigation that follows will fail
        # loudly if the login did not actually succeed.
        log("LOGIN: wait_for_url vyprsel, pokracuji.")
    page.wait_for_timeout(3000)
    log(f"LOGIN: prihlaseni hotovo ({page.url})")
def open_grid(page):
    """Open the Lab Reports grid and read its row list through the grid API.

    Polls ``JS_ALL_ROWS`` up to 25 times, 2 s apart, until two consecutive
    polls return the same non-zero number of rows.

    Returns:
        (rows, row_height): the row dicts from ``JS_ALL_ROWS`` and the
        rendered row height in pixels from ``JS_ROW_HEIGHT``.

    Raises:
        RuntimeError: if the grid API yields no rows. At least one `.ag-row`
            is rendered at that point, so an empty result means the API hook
            was not found; returning [] would make the run end as
            "0 new reports" without any indication of the failure.
    """
    log(f"GRID: navigace na Lab Reports ({len(SITES)} center)...")
    page.goto(lab_reports_url())
    log("GRID: cekam na radky (.ag-row)...")
    page.wait_for_selector(".ag-body-container .ag-row", timeout=120000)

    # Wait until the grid API reports a stable row count.
    prev = -1
    rows = None
    for i in range(25):
        rows = page.evaluate(JS_ALL_ROWS)
        cnt = len(rows) if rows else 0
        log(f" ...kontrola #{i+1}: rows={cnt}")
        if cnt > 0 and cnt == prev:
            break
        prev = cnt
        page.wait_for_timeout(2000)
    else:
        log("GRID: POZOR - pocet radku se neustalil, pokracuji s poslednim stavem.")

    if not rows:
        raise RuntimeError(
            "GRID: grid API nevratilo zadne radky (nenalezen __agComponent/gridApi?)")

    row_height = page.evaluate(JS_ROW_HEIGHT)
    log(f"GRID: nacteno {len(rows)} radku, rowHeight={row_height}px.")
    return rows, row_height
def download_pdf_bytes(page, idx):
    """Click the "English" download link of row *idx* and return the PDF bytes.

    The bytes are read from Playwright's temporary download file; nothing is
    saved elsewhere on disk by this function.
    """
    link_selector = f'.ag-body-container .ag-row[row-index="{idx}"] a.dl-link'
    english_link = page.locator(link_selector, has_text="English").first
    with page.expect_download(timeout=60000) as download_info:
        english_link.click()
    temp_path = download_info.value.path()
    with open(temp_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()
    return pdf_bytes
def upsert(col, rec, cells, data, now):
    """Insert or refresh the report document keyed by ``rec['reportId']``.

    Args:
        col: target MongoDB collection.
        rec: row entry from ``JS_ALL_ROWS`` (reportId, site, subject, ...).
        cells: cell texts from ``JS_CELLS`` (type, accession, visit, ...).
        data: raw PDF bytes.
        now: timestamp string written to ``firstSeen`` / ``lastSeen``.

    Returns:
        "insert" when no document existed, "update" when the PDF hash or the
        table fields changed (the previous state is pushed to ``history``),
        "same" when only ``lastSeen`` was refreshed.
    """
    digest = hashlib.sha256(data).hexdigest()
    site, subject = rec["site"], rec["subject"]

    # Raw table columns, kept next to the normalised top-level keys.
    fields = {
        "Type": cells["type"],
        "Subject": subject,
        "Accession": cells["accession"],
        "Visit": cells["visit"],
        "Collected Date": cells["collected"],
        "Site Number": site,
        "Posted": cells["postedDisplay"],
    }
    # Input for build_basename().
    name_parts = {
        "site": site,
        "subject": subject,
        "accession": cells["accession"],
        "visit": cells["visit"],
        "collected": cells["collected"],
        "postedDisplay": cells["postedDisplay"],
    }
    derived = {
        "study": STUDY,
        "studyCode": STUDY_CODE,
        "type": cells["type"] or "Lab Result",
        "site": site,
        "subject": subject,
        "accession": cells["accession"],
        "visit": cells["visit"],
        "collected": fmt_date(cells["collected"]),
        "posted": cells["postedDisplay"],
        "postedIso": rec["postedIso"],
        "fileId": rec["fileId"],
        "serverFileName": rec["serverFileName"],
        "fields": fields,
        "fileName": build_basename(name_parts),
        "pdf": Binary(data),
        "pdfSize": len(data),
        "pdfSha256": digest,
    }

    report_id = rec["reportId"]
    projection = {"pdfSha256": 1, "fields": 1, "lastSeen": 1}
    stored = col.find_one({"record_id": report_id}, projection)
    if stored is None:
        new_doc = {"record_id": report_id, **derived,
                   "firstSeen": now, "lastSeen": now, "history": []}
        col.insert_one(new_doc)
        return "insert"

    by_id = {"_id": stored["_id"]}
    unchanged = stored.get("pdfSha256") == digest and stored.get("fields") == fields
    if unchanged:
        col.update_one(by_id, {"$set": {"lastSeen": now}})
        return "same"

    previous_state = {
        "date": stored.get("lastSeen"),
        "fields": stored.get("fields"),
        "pdfSha256": stored.get("pdfSha256"),
    }
    col.update_one(
        by_id,
        {"$push": {"history": previous_state},
         "$set": {**derived, "lastSeen": now}},
    )
    return "update"
def _short_id(report_id):
    """Return a 12-character prefix of *report_id* for logs; tolerates None."""
    return str(report_id)[:12]


def _select_todo(rows, existing_ids):
    """Pick the grid rows that still have to be downloaded, top-down.

    Without ``--full`` the selection stops at the first row whose reportId is
    already stored (stop-at-known); with ``--full`` known rows are skipped and
    the scan continues. Rows without a reportId are skipped with a warning,
    because they cannot be keyed. ``--limit`` truncates the result.
    """
    todo = []
    for rec in rows:
        rid = rec.get("reportId")
        if not rid:
            log(f"POZOR: rowIndex {rec['rowIndex']} nema reportId -> preskakuji.")
            continue
        if rid in existing_ids:
            if ARGS.full:
                continue
            log(f"STOP-AT-KNOWN: rowIndex {rec['rowIndex']} (reportId {_short_id(rid)}…) "
                f"uz v Mongo -> koncim vyber (zbytek je starsi).")
            break
        todo.append(rec)
    if ARGS.limit:
        todo = todo[:ARGS.limit]
    return todo


def _store_rows(page, col, todo, row_height, existing_ids, now):
    """Download and upsert every row in *todo*; log a summary at the end."""
    new_cnt = upd_cnt = same_cnt = 0
    failed = []
    for k, rec in enumerate(todo, 1):
        idx = rec["rowIndex"]
        try:
            page.evaluate(JS_SCROLL_TO, [idx, row_height])
            page.wait_for_selector(f'.ag-body-container .ag-row[row-index="{idx}"]', timeout=15000)
            page.wait_for_timeout(120)
            cells = page.evaluate(JS_CELLS, idx)
            if not cells:
                raise RuntimeError("nepodarilo se precist bunky radku")
            meta = {"site": rec["site"], "subject": rec["subject"],
                    "accession": cells["accession"], "visit": cells["visit"],
                    "collected": cells["collected"], "postedDisplay": cells["postedDisplay"]}
            fname = build_basename(meta)
            if ARGS.dry_run:
                log(f">>> {k}/{len(todo)} [DRY] NOVY rowIdx {idx} ({_short_id(rec['reportId'])}…): {fname}")
                new_cnt += 1
                continue
            data = download_pdf_bytes(page, idx)
            action = upsert(col, rec, cells, data, now)
            existing_ids.add(rec["reportId"])
            if action == "insert":
                new_cnt += 1
                log(f">>> {k}/{len(todo)} INSERT rowIdx {idx} ({len(data)//1024} KB): {fname}")
            elif action == "update":
                upd_cnt += 1
                log(f">>> {k}/{len(todo)} UPDATE rowIdx {idx}: {fname}")
            else:
                same_cnt += 1
                log(f">>> {k}/{len(todo)} SAME rowIdx {idx}: {fname}")
        except Exception as e:
            failed.append(idx)
            log(f"CHYBA rowIdx {idx} ({_short_id(rec['reportId'])}…): {e!r} — pokracuji.")

    log(f"KONEC: nove={new_cnt}, update={upd_cnt}, beze zmeny={same_cnt}, chyby={len(failed)}.")
    if failed:
        log(f"KONEC: SELHALY rowIndexy: {failed}")
        if not ARGS.full:
            # A failed row that sits below a stored one is never selected again
            # by a stop-at-known run, so point the operator at --full.
            log("KONEC: selhane radky mohou byt pri dalsim stop-at-known behu "
                "preskoceny -> doplnte je behem s --full.")


def main():
    """Run one synchronisation of Lab Reports PDFs into MongoDB.

    Connects to MongoDB (required also for ``--dry-run``), loads the stored
    record ids of the study, starts a persistent Chromium profile, logs in,
    reads the grid, keeps only rows whose site starts with "CZ", and stores
    the rows chosen by ``_select_todo()``. Because already-stored reportIds
    are never selected, the "update"/"same" outcomes of ``upsert()`` can only
    occur when the same reportId appears twice in one run.

    The browser context and the MongoDB client are closed even when a step
    raises, so the persistent profile is shut down cleanly.
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log(f"START: studie {STUDY_CODE} ({STUDY}) -> {DB_NAME}.{COLLECTION}, "
        f"{'DRY-RUN' if ARGS.dry_run else 'ZAPIS'}"
        f"{' [FULL]' if ARGS.full else ' [stop-at-known]'}"
        f"{f', limit {ARGS.limit}' if ARGS.limit else ''}.")

    col = None
    existing_ids = set()
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        if not ARGS.dry_run:
            col = client[DB_NAME][COLLECTION]
            col.create_index([("record_id", ASCENDING)], unique=True)
            for f in ("study", "site", "subject", "accession", "postedIso", "fileId"):
                col.create_index([(f, ASCENDING)])
        existing_ids = {d["record_id"] for d in
                        client[DB_NAME][COLLECTION].find({"study": STUDY}, {"record_id": 1})}
        log(f"START: v Mongo je {len(existing_ids)} reportu pro tuto studii.")

        with sync_playwright() as p:
            context = p.chromium.launch_persistent_context(
                user_data_dir=PROFILE_DIR,
                headless=False,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--start-maximized",
                    "--disable-restore-session-state",
                    "--disable-session-crashed-bubble",
                ],
                no_viewport=True,
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
                accept_downloads=True,
            )
            try:
                context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
                page = context.new_page()
                log("START: prohlizec spusten.")
                login(page)
                rows, row_height = open_grid(page)

                # SAFEGUARD: CZ sites only. The URL filter should already
                # restrict the grid to the configured sites; if it is lost
                # (e.g. after a reload), this check keeps foreign sites out.
                non_cz = [r for r in rows if not str(r["site"]).startswith("CZ")]
                if non_cz:
                    log(f"POZOR: {len(non_cz)} ne-CZ radku v gridu (napr. {non_cz[0]['site']}) "
                        f"-> filtruji jen CZ. Zkontroluj URL filtr center!")
                rows = [r for r in rows if str(r["site"]).startswith("CZ")]
                log(f"GRID: po CZ-pojistce {len(rows)} CZ radku.")

                todo = _select_todo(rows, existing_ids)
                log(f"PLAN: {len(todo)} novych radku ke stazeni "
                    f"(z {len(rows)} v gridu, {len(existing_ids)} uz v Mongo).")
                _store_rows(page, col, todo, row_height, existing_ids, now)
            finally:
                # Always close the persistent context, also after an error.
                context.close()
    finally:
        client.close()
if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        # Top-level boundary: report the crash instead of letting the console
        # window disappear with the traceback.
        log(f"FATAL: beh spadl: {exc!r}")
        traceback.print_exc()
    finally:
        # Wait for Enter before exiting; skipped when stdin is closed.
        try:
            input("\n[Enter] pro zavreni tohoto okna...")
        except EOFError:
            pass
@@ -0,0 +1,53 @@
# download_lab_reports_v1.3.py
**Verze:** 1.3 &nbsp;|&nbsp; **Datum:** 2026-06-16
Stahuje PDF **Lab Reports** ze `xsp.labcorp.com` (studie **77242113UCO3001**,
interní `36940`, 10 CZ center) přímo do **MongoDB** (`covance.labreports`) —
metadata + skutečné PDF (inline Binary). **Na disk neukládá.**
## Režim synchronizace — proměnná `SYNC_MODE`
Nahoře ve skriptu:
```python
SYNC_MODE = "delta" # "delta" | "fullsync"
```
- **delta** — jen NOVÉ reporty přes interní `reportId` (stop-at-known). List je
Posted DESC; shora se hledá první už uložený `reportId` → vše pod ním je
starší a v Mongo je. Rychlé, běžný provoz.
- **fullsync** — projde VŠECHNY řádky a doplní chybějící/změněné (rekonciliace).
CLI přepíše proměnnou: `--delta` / `--fullsync`.
## Klíč `record_id = reportId`
Stabilní 32-hex ID dokumentu z dat AG Gridu, napříč všemi řádky **unikátní**
a **perzistentní v čase** (ověřeno: stejné `reportId` vrací i jiný grid pro
totéž centrum). Řeší různá PDF se shodnými viditelnými metadaty (reissue se
shodným Posted i na minutu) — metadata na klíč nestačí.
## Odkud data
Z in-memory dat AG Gridu (`__agComponent.gridApi`): `reportId`,
`fileLinks[].fileId`+`fileName` (server), `postedDateTime` (moment.js →
převedeno na ISO string), `siteNum`, `subjectNumber`. Accession/Visit/Collected
leží v gridu níž (ve `visits`) → z buněk (po scrollu). `accession` se čte
**doslova** (reálný identifikátor kitu), neukládá se nijak odvozeně.
## MongoDB dokument
`record_id`(=reportId), `study`, `studyCode`, `type`, `site`, `subject`,
`accession`, `visit`, `collected`, `posted`, `postedIso` (string), `fileId`,
`serverFileName`, `fields`, `fileName`, **`pdf`** (Binary), `pdfSize`,
`pdfSha256`, `firstSeen`, `lastSeen`, `history[]`.
## Spuštění (z terminálu uživatele — GUI Chrome)
```
python download_lab_reports_v1.3.py # podle SYNC_MODE (default delta)
python download_lab_reports_v1.3.py --fullsync # rekonciliace
python download_lab_reports_v1.3.py --dry-run # vypíše nové, nestahuje
python download_lab_reports_v1.3.py --limit 5
```
## Změny v1.3
- `SYNC_MODE` proměnná (delta/fullsync) + CLI `--delta`/`--fullsync`.
- Oprava `postedIso`: v1.2 ukládal celý moment.js objekt; nyní čistý ISO string.
## Nahrazuje
`download_lab_reports_v1.2.py` → `TRASH/`.
+408
View File
@@ -0,0 +1,408 @@
# =============================================================================
# Název: download_lab_reports_v1.3.py
# Verze: 1.3
# Datum: 2026-06-16
# Popis: Stahuje PDF Lab Reports ze xsp.labcorp.com pro studii 77242113UCO3001
# (interni cislo 36940), filtrovane na 10 ceskych center (CZ),
# a uklada je PRIMO do MongoDB (db covance, kolekce labreports) —
# metadata z tabulky + skutecne PDF (inline Binary). Na disk NEUKLADA.
#
# REZIM SYNCHRONIZACE: promenna SYNC_MODE nahore.
# "delta" = jen NOVE reporty pres interni reportId (stop-at-known).
# List je Posted DESC; shora se hleda prvni uz ulozeny
# reportId -> vse pod nim je starsi a uz v Mongo je.
# "fullsync" = projit VSECHNY radky a doplnit chybejici / zmenene
# (rekonciliace). Pomalejsi, stahuje vse chybejici.
# CLI prepise promennou: --delta / --fullsync.
#
# KLIC: record_id = reportId (z dat AG Gridu) — stabilni 32-hex ID
# dokumentu, NAPRIC vsemi radky UNIKATNI a perzistentni v case
# (overeno: stejne reportId vraci i jiny grid pro totez centrum).
# Resi pripad ruznych PDF se SHODNYMI viditelnymi metadaty.
#
# Zmeny v1.3: + SYNC_MODE promenna (delta/fullsync); oprava postedIso
# (drive se ukladal cely moment.js objekt -> ted cisty ISO).
# =============================================================================
from playwright.sync_api import sync_playwright
from datetime import datetime
from pymongo import MongoClient, ASCENDING
from bson.binary import Binary
import argparse
import hashlib
import json
import os
import re
import traceback
import urllib.parse
# ============================================================================
# REZIM SYNCHRONIZACE — nastav zde (CLI --delta / --fullsync ma prednost)
# ============================================================================
SYNC_MODE = "delta"  # "delta"    = new reports only (stop-at-known via reportId)
                     # "fullsync" = walk every row, add missing/changed ones
# ============================================================================
parser = argparse.ArgumentParser(description="Stahovani Lab Reports PDF (XSP) do MongoDB.")
# --delta and --fullsync contradict each other, so argparse rejects the
# combination instead of one flag silently winning.
_mode_group = parser.add_mutually_exclusive_group()
_mode_group.add_argument("--delta", action="store_true", help="vynutit rezim delta")
_mode_group.add_argument("--fullsync", action="store_true", help="vynutit rezim fullsync")
parser.add_argument("--dry-run", action="store_true", help="nestahovat ani nepsat do DB; jen vypsat nove")
parser.add_argument("--limit", type=int, default=0, help="max N novych radku (0 = vse)")
ARGS = parser.parse_args()

# A typo in SYNC_MODE would otherwise silently behave like "delta".
if SYNC_MODE not in ("delta", "fullsync"):
    parser.error(f"SYNC_MODE musi byt 'delta' nebo 'fullsync', ne {SYNC_MODE!r}")

# Mode resolution: a CLI flag overrides the SYNC_MODE variable.
_mode = SYNC_MODE
if ARGS.fullsync:
    _mode = "fullsync"
if ARGS.delta:
    _mode = "delta"
FULLSYNC = (_mode == "fullsync")
def log(msg):
    """Print *msg* prefixed with the current time as [HH:MM:SS], flushing at once."""
    stamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{stamp}] {msg}", flush=True)
# --- konfigurace ------------------------------------------------------------
# SECURITY: the portal credentials are committed in plain text. They can now be
# supplied through the XSP_EMAIL / XSP_PASSWORD environment variables; the
# literals below remain only as a fallback so existing runs keep working.
# TODO: rotate the password and remove the literal fallback.
EMAIL = os.environ.get("XSP_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("XSP_PASSWORD", "%zT3Wqfc9)cWua5")
LOGIN_URL = "https://xsp.labcorp.com/"
STUDY = "36940"                  # study number used in portal URLs
STUDY_CODE = "77242113UCO3001"   # protocol code used in file names
MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "covance"
COLLECTION = "labreports"
# Site ids sent in the `site` URL filter of the Lab Reports page.
SITES = [
    "930539", "930547", "930555", "930556", "930553",
    "930549", "930525", "930536", "930557", "930531",
]
# The persistent browser profile lives next to this script.
_BASE = os.path.dirname(os.path.abspath(__file__))
PROFILE_DIR = os.path.join(_BASE, "browser_profile")
def lab_reports_url():
    """Return the Lab Reports URL for ``STUDY`` filtered to ``SITES``.

    The site list is sent as a compact JSON array, percent-encoded, in the
    ``site`` query parameter.
    """
    encoded_sites = urllib.parse.quote(json.dumps(SITES, separators=(",", ":")))
    base = f"https://xsp.labcorp.com/sponsor/study/{STUDY}/lab-reports"
    return f"{base}?site={encoded_sites}"
# --- formatovani -------------------------------------------------------------
_MONTHS = {"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05",
"Jun": "06", "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10",
"Nov": "11", "Dec": "12"}
def safe(s: str) -> str:
    """Strip the characters ``\\ / : * ? " < > |`` from *s* and trim whitespace.

    A falsy input (``None`` or ``""``) yields an empty string.
    """
    text = s or ""
    cleaned = re.sub(r'[\\/:*?"<>|]', "", text)
    return cleaned.strip()
def fmt_date(s: str) -> str:
    """Convert a portal date such as 'Jun 10, 2026' to '2026-06-10'.

    A trailing time part ('Jun 15, 2026 7:49 PM' -> '2026-06-15') is ignored.
    Input that does not match the pattern, or whose month abbreviation is not
    a key of ``_MONTHS``, is returned via ``safe()`` instead.
    """
    parsed = re.match(r"\s*([A-Za-z]{3})\w*\s+(\d{1,2}),\s+(\d{4})", s or "")
    if parsed is None:
        return safe(s)
    month_abbr = parsed.group(1)[:3]
    if month_abbr not in _MONTHS:
        return safe(s)
    day, year = parsed.group(2), parsed.group(3)
    return f"{year}-{_MONTHS[month_abbr]}-{int(day):02d}"
def build_basename(meta: dict) -> str:
    """Return the human-readable PDF name that is stored as ``fileName``.

    Layout: '<STUDY_CODE> <collected> <site> <subject> <visit> <accession>
    posted <posted>.pdf', with both dates passed through ``fmt_date()`` and
    the whole stem cleaned by ``safe()``. The posted date is taken from
    ``meta['postedDisplay']``.
    """
    raw_name = (
        f"{STUDY_CODE} {fmt_date(meta['collected'])} {meta['site']} "
        f"{meta['subject']} {meta['visit']} {meta['accession']} "
        f"posted {fmt_date(meta['postedDisplay'])}"
    )
    return f"{safe(raw_name)}.pdf"
# --- JS helpery (AG Grid) ---------------------------------------------------
# Seznam vsech radku. postedDateTime je v datech moment.js objekt -> prevedu
# na cisty ISO string (jinak by se serializoval cely moment objekt).
# JavaScript evaluated in the page (no arguments). Takes `__agComponent` from
# the first matching grid element and uses its `gridApi` when that object has
# `forEachNode`, otherwise the component itself. Returns null when no such
# component is found or it lacks `getDisplayedRowCount`. Otherwise returns one
# entry per displayed row that has data: its index, reportId, site, subject and
# the file link whose language is 'English' (falling back to the first link).
# `toIso` turns postedDateTime into a string: a string is kept as is, an object
# with a string `_i` yields that `_i`, then `toISOString()` is tried, and
# `String(v)` is the last resort; null/undefined becomes null.
JS_ALL_ROWS = r"""() => {
let holder = null;
for (const el of document.querySelectorAll('.ag-root-wrapper, .ag-root, ag-grid-angular')) {
if (el.__agComponent) { holder = el.__agComponent; break; }
}
if (!holder) return null;
const api = (holder.gridApi && holder.gridApi.forEachNode) ? holder.gridApi : holder;
if (!api || !api.getDisplayedRowCount) return null;
const toIso = v => {
if (v == null) return null;
if (typeof v === 'string') return v;
if (v._i && typeof v._i === 'string') return v._i; // puvodni serverove ISO s offsetem
if (typeof v.toISOString === 'function') { try { return v.toISOString(); } catch (e) {} }
return String(v);
};
const cnt = api.getDisplayedRowCount();
const out = [];
for (let i = 0; i < cnt; i++) {
const n = api.getDisplayedRowAtIndex(i);
if (!n || !n.data) continue;
const d = n.data;
const fl = (d.fileLinks || []).find(f => f.language === 'English') || (d.fileLinks || [])[0] || {};
out.push({
rowIndex: i,
reportId: d.reportId,
fileId: fl.fileId,
serverFileName: fl.fileName,
postedIso: toIso(d.postedDateTime),
site: d.siteNum,
subject: d.subjectNumber,
});
}
return out;
}"""
# Reads the displayed text of selected cells of the DOM row whose `row-index`
# attribute equals the argument.  Returns null when that row element is not
# found under '.ag-body-container'.  `dedup` collapses whitespace runs and,
# when the text is one string repeated twice (back to back or separated by
# whitespace), keeps a single copy.  Result keys: type, accession, visit,
# collected, postedDisplay (from col-ids type, accessionNumber, visit,
# visitCollectionDate, postedDateTime); a missing cell yields ''.
JS_CELLS = r"""(idx) => {
const dedup = s => {
s = (s || '').replace(/\s+/g, ' ').trim();
const h = s.slice(0, Math.floor(s.length / 2));
if (s === h + h) return h;
const m = s.match(/^(.*?)\s+\1$/);
if (m) return m[1];
return s;
};
const row = document.querySelector('.ag-body-container .ag-row[row-index="' + idx + '"]');
if (!row) return null;
const get = id => { const c = row.querySelector('[col-id="' + id + '"]'); return c ? dedup(c.textContent) : ''; };
return {
type: get('type'),
accession: get('accessionNumber'),
visit: get('visit'),
collected: get('visitCollectionDate'),
postedDisplay: get('postedDateTime'),
};
}"""
# Takes [row index, row height] and sets scrollTop of '.ag-body-viewport' to
# idx * rh minus half the viewport height (never below 0).  Does nothing when
# the viewport element is not found.
JS_SCROLL_TO = r"""(args) => {
const [idx, rh] = args;
const vp = document.querySelector('.ag-body-viewport');
if (!vp) return;
vp.scrollTop = Math.max(0, idx * rh - vp.clientHeight / 2);
}"""
# Returns the bounding-box height of the first '.ag-body-container .ag-row';
# falls back to 25 when no row exists or the measured height is 0.
JS_ROW_HEIGHT = r"""() => {
const r = document.querySelector('.ag-body-container .ag-row');
return r ? r.getBoundingClientRect().height || 25 : 25;
}"""
# --- login ------------------------------------------------------------------
def login(page):
    """Open LOGIN_URL and submit e-mail and password unless the form never shows.

    If the "Email" field does not become visible within 12 s the function
    logs that the login is skipped and returns.  Otherwise it fills EMAIL,
    clicks "Next", fills PASSWORD, clicks "Verify" and waits (up to 60 s) for
    a URL that either lacks "code=" or contains "xsp."; a failure of that
    wait is only logged.  A fixed 3 s pause follows before the final log line.
    """
    log("LOGIN: otviram login stranku...")
    page.goto(LOGIN_URL)

    email_box = page.get_by_label("Email")
    try:
        email_box.wait_for(state="visible", timeout=12000)
    except Exception:
        log(f"LOGIN: Email pole se neobjevilo -> session aktivni, login preskocen ({page.url})")
        return

    log("LOGIN: zadavam email...")
    email_box.fill(EMAIL)
    page.get_by_role("button", name="Next").click()

    log("LOGIN: cekam na pole pro heslo...")
    password_box = page.get_by_label("Password")
    password_box.wait_for(state="visible", timeout=30000)
    log("LOGIN: zadavam heslo...")
    password_box.fill(PASSWORD)
    page.get_by_role("button", name="Verify").click()

    log("LOGIN: cekam na presmerovani po prihlaseni...")

    def redirect_finished(url):
        return "code=" not in url or "xsp." in url

    try:
        page.wait_for_url(redirect_finished, timeout=60000)
    except Exception:
        log("LOGIN: wait_for_url vyprsel, pokracuji.")
    page.wait_for_timeout(3000)
    log(f"LOGIN: prihlaseni hotovo ({page.url})")
def open_grid(page):
    """Navigate to the Lab Reports grid and wait until its row count settles.

    After the first ``.ag-row`` appears, JS_ALL_ROWS is evaluated up to 25
    times, 2 s apart, until two consecutive evaluations return the same
    non-zero number of rows.

    Returns ``(rows, row_height)``; *rows* is ``[]`` when the last
    evaluation produced nothing.
    """
    log(f"GRID: navigace na Lab Reports ({len(SITES)} center)...")
    page.goto(lab_reports_url())
    log("GRID: cekam na radky (.ag-row)...")
    page.wait_for_selector(".ag-body-container .ag-row", timeout=120000)

    rows = None
    previous_count = -1
    for attempt in range(1, 26):
        rows = page.evaluate(JS_ALL_ROWS)
        count = len(rows) if rows else 0
        log(f" ...kontrola #{attempt}: rows={count}")
        if count > 0 and count == previous_count:
            break
        previous_count = count
        page.wait_for_timeout(2000)

    row_height = page.evaluate(JS_ROW_HEIGHT)
    loaded = len(rows) if rows else 0
    log(f"GRID: nacteno {loaded} radku, rowHeight={row_height}px.")
    return rows or [], row_height
def download_pdf_bytes(page, idx):
    """Click the "English" download link of grid row *idx* and return the file's bytes.

    Waits up to 60 s for the download started by the click, then reads the
    downloaded file from the path reported by the download object.
    """
    selector = f'.ag-body-container .ag-row[row-index="{idx}"] a.dl-link'
    english_link = page.locator(selector, has_text="English").first
    with page.expect_download(timeout=60000) as download_info:
        english_link.click()
    saved_path = download_info.value.path()
    with open(saved_path, "rb") as handle:
        payload = handle.read()
    return payload
def upsert(col, rec, cells, data, now):
    """Insert or refresh the document for one downloaded report.

    *rec* is a row from JS_ALL_ROWS, *cells* the matching JS_CELLS result,
    *data* the PDF bytes and *now* the timestamp string of this run.  The
    document is keyed by ``record_id`` (= ``rec["reportId"]``).

    Returns:
        "insert" - no document existed, a new one was written;
        "update" - PDF hash or displayed fields differ; the previous
                   ``fields``/``pdfSha256``/``lastSeen`` are pushed to
                   ``history`` and the document is overwritten;
        "same"   - nothing changed, only ``lastSeen`` is bumped.
    """
    # Values exactly as displayed in the grid; part of change detection.
    fields = {
        "Type": cells["type"],
        "Subject": rec["subject"],
        "Accession": cells["accession"],
        "Visit": cells["visit"],
        "Collected Date": cells["collected"],
        "Site Number": rec["site"],
        "Posted": cells["postedDisplay"],
    }
    sha = hashlib.sha256(data).hexdigest()
    name_meta = {
        **rec,
        "accession": cells["accession"],
        "visit": cells["visit"],
        "collected": cells["collected"],
        "postedDisplay": cells["postedDisplay"],
    }
    derived = {
        "study": STUDY,
        "studyCode": STUDY_CODE,
        "type": cells["type"] or "Lab Result",
        "site": rec["site"],
        "subject": rec["subject"],
        "accession": cells["accession"],
        "visit": cells["visit"],
        "collected": fmt_date(cells["collected"]),
        "posted": cells["postedDisplay"],
        "postedIso": rec["postedIso"],
        "fileId": rec["fileId"],
        "serverFileName": rec["serverFileName"],
        "fields": fields,
        "fileName": build_basename(name_meta),
        "pdf": Binary(data),
        "pdfSize": len(data),
        "pdfSha256": sha,
    }

    rid = rec["reportId"]
    stored = col.find_one(
        {"record_id": rid},
        {"pdfSha256": 1, "fields": 1, "lastSeen": 1},
    )
    if stored is None:
        col.insert_one({"record_id": rid, **derived,
                        "firstSeen": now, "lastSeen": now, "history": []})
        return "insert"

    selector = {"_id": stored["_id"]}
    unchanged = stored.get("pdfSha256") == sha and stored.get("fields") == fields
    if unchanged:
        col.update_one(selector, {"$set": {"lastSeen": now}})
        return "same"

    # Keep the superseded state so earlier versions stay traceable.
    previous = {
        "date": stored.get("lastSeen"),
        "fields": stored.get("fields"),
        "pdfSha256": stored.get("pdfSha256"),
    }
    col.update_one(
        selector,
        {"$push": {"history": previous},
         "$set": {**derived, "lastSeen": now}},
    )
    return "update"
def main():
    """Scrape the Lab Reports grid and store report PDFs in MongoDB.

    Steps: ping MongoDB; unless ``ARGS.dry_run`` ensure the indexes; load the
    ``record_id`` values already stored for STUDY; start a persistent
    Chromium context; log in; read the grid rows; keep only rows whose site
    starts with "CZ"; pick the rows to process (FULLSYNC skips known
    reportIds, DELTA stops at the first known one); optionally cut the list
    to ``ARGS.limit``; then download and upsert each row.  A failure on a
    single row is logged and the loop continues with the next one.

    The browser context and the Mongo client are closed in ``finally``
    blocks, so both are released even when login, grid loading or planning
    raises.
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log(f"START: studie {STUDY_CODE} ({STUDY}) -> {DB_NAME}.{COLLECTION}, "
        f"rezim={'FULLSYNC' if FULLSYNC else 'DELTA'}"
        f"{', DRY-RUN' if ARGS.dry_run else ''}"
        f"{f', limit {ARGS.limit}' if ARGS.limit else ''}.")

    col = None
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        if not ARGS.dry_run:
            col = client[DB_NAME][COLLECTION]
            col.create_index([("record_id", ASCENDING)], unique=True)
            for f in ("study", "site", "subject", "accession", "postedIso", "fileId"):
                col.create_index([(f, ASCENDING)])
        # Read even in dry-run mode: the plan below depends on what is stored.
        existing_ids = {d["record_id"] for d in
                        client[DB_NAME][COLLECTION].find({"study": STUDY}, {"record_id": 1})}
        log(f"START: v Mongo je {len(existing_ids)} reportu pro tuto studii.")

        with sync_playwright() as p:
            context = p.chromium.launch_persistent_context(
                user_data_dir=PROFILE_DIR,
                headless=False,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--start-maximized",
                    "--disable-restore-session-state",
                    "--disable-session-crashed-bubble",
                ],
                no_viewport=True,
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
                accept_downloads=True,
            )
            try:
                context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
                page = context.new_page()
                log("START: prohlizec spusten.")
                login(page)
                rows, row_height = open_grid(page)

                # SAFEGUARD: CZ sites only (in case the URL filter failed).
                non_cz = [r for r in rows if not str(r["site"]).startswith("CZ")]
                if non_cz:
                    log(f"POZOR: {len(non_cz)} ne-CZ radku v gridu (napr. {non_cz[0]['site']}) "
                        f"-> filtruji jen CZ. Zkontroluj URL filtr center!")
                rows = [r for r in rows if str(r["site"]).startswith("CZ")]
                log(f"GRID: po CZ-pojistce {len(rows)} CZ radku.")

                # Select the rows to process according to the mode.
                todo = []
                for rec in rows:
                    if rec["reportId"] in existing_ids:
                        if FULLSYNC:
                            continue  # fullsync: skip the known row, keep going
                        log(f"DELTA stop-at-known: rowIndex {rec['rowIndex']} "
                            f"(reportId {rec['reportId'][:12]}…) uz v Mongo -> koncim (zbytek je starsi).")
                        break  # delta: first known row = end
                    todo.append(rec)
                if ARGS.limit:
                    todo = todo[:ARGS.limit]
                log(f"PLAN [{'FULLSYNC' if FULLSYNC else 'DELTA'}]: {len(todo)} novych radku ke stazeni "
                    f"(z {len(rows)} v gridu, {len(existing_ids)} uz v Mongo).")

                new_cnt = upd_cnt = same_cnt = 0
                failed = []
                for k, rec in enumerate(todo, 1):
                    idx = rec["rowIndex"]
                    try:
                        # Scroll first: JS_CELLS and the download link need the
                        # row element to be present in the DOM.
                        page.evaluate(JS_SCROLL_TO, [idx, row_height])
                        page.wait_for_selector(f'.ag-body-container .ag-row[row-index="{idx}"]', timeout=15000)
                        page.wait_for_timeout(120)
                        cells = page.evaluate(JS_CELLS, idx)
                        if not cells:
                            raise RuntimeError("nepodarilo se precist bunky radku")
                        meta = {"site": rec["site"], "subject": rec["subject"],
                                "accession": cells["accession"], "visit": cells["visit"],
                                "collected": cells["collected"], "postedDisplay": cells["postedDisplay"]}
                        fname = build_basename(meta)
                        if ARGS.dry_run:
                            log(f">>> {k}/{len(todo)} [DRY] NOVY rowIdx {idx} ({rec['reportId'][:12]}…): {fname}")
                            new_cnt += 1
                            continue
                        data = download_pdf_bytes(page, idx)
                        action = upsert(col, rec, cells, data, now)
                        existing_ids.add(rec["reportId"])
                        if action == "insert":
                            new_cnt += 1
                            log(f">>> {k}/{len(todo)} INSERT rowIdx {idx} ({len(data)//1024} KB): {fname}")
                        elif action == "update":
                            upd_cnt += 1
                            log(f">>> {k}/{len(todo)} UPDATE rowIdx {idx}: {fname}")
                        else:
                            same_cnt += 1
                            log(f">>> {k}/{len(todo)} SAME rowIdx {idx}: {fname}")
                    except Exception as e:
                        # Per-row boundary: record the failure and continue.
                        failed.append(idx)
                        log(f"CHYBA rowIdx {idx} ({rec['reportId'][:12]}…): {e!r} — pokracuji.")

                log(f"KONEC: nove={new_cnt}, update={upd_cnt}, beze zmeny={same_cnt}, chyby={len(failed)}.")
                if failed:
                    log(f"KONEC: SELHALY rowIndexy: {failed}")
            finally:
                # Close the persistent context on every path, not only on success.
                context.close()
    finally:
        client.close()
if __name__ == "__main__":
    # Top-level boundary: any exception escaping main() is logged together
    # with its traceback instead of propagating out of the script.
    try:
        main()
    except Exception as exc:
        log(f"FATAL: beh spadl: {exc!r}")
        traceback.print_exc()
+349
View File
@@ -0,0 +1,349 @@
# =============================================================================
# Název: import_to_mongo_v1.4.py
# Verze: 1.4
# Datum: 2026-06-09
# Popis: Import CSV reportů do MongoDB (db: covance).
# Pipeline 1 — allSamples: kolekce allsamples, klíč Container Barcode No.
# Zdroj: Source (study 36940 + 35472)
# Pipeline 2 — kits: kolekce kits, klíč Accession
# Zdroj: Source (study 36940 + 35472)
# Pipeline 3 — results: kolekce results, laboratorní výsledky per centrum.
# Zdroj: Source, soubory test-results-{SITE}-{typ}.csv
# (1. řádek = disclaimer, hlavička je 2. řádek!)
# Dva typy (standard / microbiology) v jedné kolekci,
# rozlišené polem resultType. record_id:
# standard: STD|{Accession}|{Test Group}|{Test}|{occ}
# microbiology: MIC|{Accession}|{Test Group}|{Specimen}|
# {Test Description}|{Drug Name/Agent}|{occ}
# Pipeline 4 — equeries: kolekce equeries, eQuery report (study 36940 + 35472).
# Zdroj: Source, soubory ...-equery.csv (FULL).
# Klíč eQueryId (stabilní systémové ID, unikátní per řádek);
# řádky footeru s parametry filtru (nečíselný eQueryId) se
# přeskakují. History sleduje životní cyklus dotazu
# (Open -> Response Received -> Closed).
# Varianta ...-equery_unresponded_only.csv je jen podmnožina
# (Status=Open) téhož reportu + footer => NEIMPORTUJE se,
# pouze se přesune do Zpracovano/ (move-only pipeline).
# Upsert s historií změn, zpracovaný soubor přesunut do Zpracovano/.
# Přepínač --dry-run: nic nezapisuje do DB ani nepřesouvá soubory.
# =============================================================================
import csv
import re
import shutil
import sys
from datetime import datetime
from pathlib import Path
from pymongo import MongoClient, ASCENDING
# MongoDB connection string used by main().
MONGO_URI = "mongodb://192.168.1.76:27017"
# Database that holds every pipeline's collection.
DB_NAME = "covance"
# Input directory scanned for CSV files: "Source" next to this script.
SOURCE = Path(__file__).parent / "Source"
# ---------------------------------------------------------------------------
# Builders record_id + metadata pro jednotlivé pipeline
# ---------------------------------------------------------------------------
def make_keyed_record(upsert_key: str):
"""Jednoduchý klíč = hodnota jednoho sloupce (allsamples, kits)."""
def builder(fields: dict, fmeta: dict | None, occ: dict):
key_val = fields.get(upsert_key)
if not key_val:
return None, {}
return key_val, {}
return builder
def _norm_subject(raw: str | None) -> str:
"""'CZ100062001 - null' -> 'CZ100062001'."""
s = (raw or "").strip()
return s.split(" - null")[0].strip()
def make_results_record(fields: dict, fmeta: dict, occ: dict):
    """Build ``(record_id, extra)`` for one row of a test-results CSV.

    Rows without an ``Accession`` value give ``(None, {})``.  Otherwise the
    id is ``STD|Accession|Test Group|Test|n`` when
    ``fmeta["resultType"]`` is "standard", else
    ``MIC|Accession|Test Group|Specimen|Test Description|Drug Name/Agent|n``.
    ``n`` is the 1-based count of rows with the same key parts seen so far;
    it is kept in *occ*, which this function mutates.  *extra* carries
    study, site, the normalized subject and the result type.
    """
    rtype = fmeta["resultType"]
    accession = fields.get("Accession")
    if not accession:
        return None, {}

    if rtype == "standard":
        prefix, columns = "STD", ("Test Group", "Test")
    else:  # microbiology
        prefix, columns = "MIC", ("Test Group", "Specimen",
                                  "Test Description", "Drug Name/Agent")
    parts = (accession, *(fields.get(column, "") for column in columns))

    occ[parts] = occ.get(parts, 0) + 1
    pieces = [prefix, *(str(part or "") for part in parts), str(occ[parts])]
    record_id = "|".join(pieces)

    extra = {
        "study": fmeta["study"],
        "site": fmeta["site"],
        "subject": _norm_subject(fields.get("Subject")),
        "resultType": rtype,
    }
    return record_id, extra
def make_equery_record(fields: dict, fmeta: dict | None, occ: dict):
"""Klíč = eQueryId. Footer s parametry filtru (nečíselný eQueryId) se přeskočí."""
key_val = (fields.get("eQueryId") or "").strip()
if not key_val.isdigit():
return None, {}
extra = {"study": fmeta["study"]} if fmeta else {}
return key_val, extra
def results_file_meta(filename: str) -> dict | None:
m = re.search(r"study-(\d+)-test-results-(\d+)-(standard|microbiology)", filename, re.IGNORECASE)
if not m:
return None
return {"study": m.group(1), "site": m.group(2), "resultType": m.group(3).lower()}
def equery_file_meta(filename: str) -> dict | None:
m = re.search(r"study-(\d+)-activity-reports", filename, re.IGNORECASE)
return {"study": m.group(1)} if m else {"study": None}
# Import pipelines, run in list order by main().  Keys of each entry:
#   name        - label used in console output
#   collection  - MongoDB collection inside DB_NAME
#   pattern     - regex a file name must match to belong to the pipeline
#   sources     - directories scanned for *.csv when no CLI paths are given
#   header_skip - number of leading lines dropped before the CSV header
#   make_record - callable (fields, fmeta, occ) -> (record_id, extra)
#   file_meta   - callable (file name) -> metadata passed as fmeta, or None
#   indexes     - extra indexes created next to the unique record_id index
#   move_only   - when true the files are only moved, never imported
PIPELINES = [
    {
        "name": "allsamples",
        "collection": "allsamples",
        "pattern": re.compile(r".*-allSamples\.csv$", re.IGNORECASE),
        "sources": [SOURCE],
        "header_skip": 0,
        "make_record": make_keyed_record("Container Barcode No."),
        "file_meta": None,
        "indexes": [
            [("fields.Sample Status", ASCENDING)],
            [("fields.Specimen Type", ASCENDING)],
        ],
    },
    {
        "name": "kits",
        "collection": "kits",
        "pattern": re.compile(r".*-kit-inventory-on-hand-expiration\.csv$", re.IGNORECASE),
        "sources": [SOURCE],
        "header_skip": 0,
        "make_record": make_keyed_record("Accession"),
        "file_meta": None,
        "indexes": [
            [("fields.Kit Type", ASCENDING)],
            [("fields.Site", ASCENDING)],
            [("fields.Expiration Date", ASCENDING)],
        ],
    },
    {
        "name": "results",
        "collection": "results",
        "pattern": re.compile(r".*test-results-\d+-(standard|microbiology)\.csv$", re.IGNORECASE),
        "sources": [SOURCE],
        "header_skip": 1,  # line 1 is a disclaimer, the header is on line 2
        "make_record": make_results_record,
        "file_meta": results_file_meta,
        "indexes": [
            [("subject", ASCENDING)],
            [("study", ASCENDING)],
            [("site", ASCENDING)],
            [("resultType", ASCENDING)],
            [("fields.Accession", ASCENDING)],
            [("fields.Test Group", ASCENDING)],
        ],
    },
    {
        "name": "equeries",
        "collection": "equeries",
        # FULL report; the _unresponded_only variant is INTENTIONALLY not matched here (separate pattern below)
        "pattern": re.compile(r".*activity-reports-documents-equery\.csv$", re.IGNORECASE),
        "sources": [SOURCE],
        "header_skip": 0,
        "make_record": make_equery_record,
        "file_meta": equery_file_meta,
        "indexes": [
            [("study", ASCENDING)],
            [("fields.Status", ASCENDING)],
            [("fields.Site", ASCENDING)],
            [("fields.Subject", ASCENDING)],
            [("fields.Issue Type", ASCENDING)],
        ],
    },
    {
        "name": "equeries_unresponded",
        "move_only": True,  # subset of the FULL report -> only moved, not imported
        "pattern": re.compile(r".*activity-reports-documents-equery_unresponded_only\.csv$", re.IGNORECASE),
        "sources": [SOURCE],
    },
]
def extract_snapshot_date(filename: str) -> str:
    """Return the ``YYYY-MM-DD`` prefix of the file's base name.

    When the base name does not start with such a prefix, today's date in
    the same format is returned instead.
    """
    base_name = Path(filename).name
    prefix = re.match(r"(\d{4}-\d{2}-\d{2})", base_name)
    if prefix is None:
        return datetime.now().strftime("%Y-%m-%d")
    return prefix.group(1)
def clean_value(val: str) -> str | None:
val = val.strip()
return val if val else None
def import_file(csv_path: Path, collection, pipeline: dict, dry_run: bool) -> dict:
    """Import one CSV file into *collection* and print a summary.

    Each row is cleaned, turned into ``(record_id, extra)`` by
    ``pipeline["make_record"]`` and then inserted (unknown id), updated with
    the previous ``fields`` pushed to ``history`` (``fields`` differ) or
    just re-stamped with ``lastSeen``/``sourceFile`` (identical).  Rows
    without a record id are counted as skipped.  In dry-run mode the
    database is never touched and every keyed row counts as an insert
    candidate.

    Returns a dict with the ``inserted``, ``changed`` and ``unchanged`` counts.
    """
    snapshot_date = extract_snapshot_date(csv_path.name)
    inserted = changed = unchanged = skipped = 0

    meta_fn = pipeline["file_meta"]
    fmeta = meta_fn(csv_path.name) if meta_fn else None

    with open(csv_path, newline="", encoding="utf-8-sig") as handle:
        raw_lines = handle.readlines()
    # Drop the leading non-header lines before handing the rest to the parser.
    rows = list(csv.DictReader(raw_lines[pipeline["header_skip"]:]))

    occ: dict = {}  # occurrence counters (per file)
    for row in rows:
        fields = {column: clean_value(value) for column, value in row.items() if column}
        record_id, extra = pipeline["make_record"](fields, fmeta, occ)
        if not record_id:
            skipped += 1
            continue

        if dry_run:
            # Without a DB lookup we cannot know; count as an insert candidate.
            inserted += 1
            continue

        existing = collection.find_one({"record_id": record_id})
        if existing is None:
            collection.insert_one({
                "record_id": record_id,
                "fields": fields,
                **extra,
                "sourceFile": csv_path.name,
                "firstSeen": snapshot_date,
                "lastSeen": snapshot_date,
                "history": [],
            })
            inserted += 1
            continue

        selector = {"_id": existing["_id"]}
        if existing["fields"] != fields:
            previous = {"date": existing["lastSeen"], "fields": existing["fields"]}
            collection.update_one(
                selector,
                {
                    "$push": {"history": previous},
                    "$set": {"fields": fields, **extra, "sourceFile": csv_path.name, "lastSeen": snapshot_date},
                },
            )
            changed += 1
        else:
            collection.update_one(
                selector,
                {"$set": {"lastSeen": snapshot_date, "sourceFile": csv_path.name}},
            )
            unchanged += 1

    total_rows = len(rows)
    db_count = "-" if dry_run else collection.count_documents({})
    tag = "[DRY] " if dry_run else ""
    print(f" {tag}[{snapshot_date}]: +{inserted} new, ~{changed} changed, ={unchanged} same, -{skipped} bez klice")
    print(f" Radku v CSV: {total_rows}, dokumentu v DB: {db_count}")
    accounted = inserted + changed + unchanged + skipped
    if accounted != total_rows:
        print(f" !!! VAROVANI: soucet ({accounted}) != radku v CSV ({total_rows})")
    return {"inserted": inserted, "changed": changed, "unchanged": unchanged}
def collect_files(pipeline: dict, cli_args: list[str]) -> list[Path]:
    """List the files this pipeline should process.

    With *cli_args*, only those arguments that are existing files and whose
    name matches ``pipeline["pattern"]`` are returned, in argument order; the
    source directories are not scanned.  Without them, every existing
    directory in ``pipeline["sources"]`` is scanned for ``*.csv`` files with
    a matching name, sorted per directory.
    """
    def wanted(path: Path):
        return pipeline["pattern"].match(path.name)

    if cli_args:
        candidates = (Path(arg) for arg in cli_args)
        return [path for path in candidates if path.is_file() and wanted(path)]

    found = []
    for src_dir in pipeline["sources"]:
        if not src_dir.exists():
            continue
        found += sorted(path for path in src_dir.glob("*.csv") if wanted(path))
    return found
def move_to_processed(csv_path: Path, dry_run: bool):
    """Move *csv_path* into the ``Zpracovano`` folder next to it.

    In dry-run mode nothing is moved or created; only a notice is printed.
    The destination folder is created here when missing: files given on the
    command line may live outside the pipeline's source directories, where
    no caller has created it, and the move would otherwise fail after the
    rows were already written to the database.
    """
    if dry_run:
        print(" [DRY] -> presunul by do Zpracovano/\n")
        return
    dest_dir = csv_path.parent / "Zpracovano"
    dest_dir.mkdir(parents=True, exist_ok=True)
    shutil.move(str(csv_path), str(dest_dir / csv_path.name))
    print(" -> presunut do Zpracovano/\n")
def run_pipeline(pipeline: dict, client, cli_args: list[str], dry_run: bool):
    """Process every file that belongs to one pipeline.

    Move-only pipelines just move their files to ``Zpracovano``.  Regular
    pipelines ensure the unique ``record_id`` index plus the configured
    indexes, import each file with ``import_file`` and then move it.  In
    dry-run mode no index or directory is created and *client* is not used.
    """
    paths = collect_files(pipeline, cli_args)
    if not paths:
        print(f"[{pipeline['name']}] Zadne soubory k importu.")
        return
    print(f"\n=== Pipeline: {pipeline['name']} ({len(paths)} souboru){' [DRY-RUN]' if dry_run else ''} ===")

    def ensure_processed_dirs():
        for src_dir in pipeline["sources"]:
            (src_dir / "Zpracovano").mkdir(exist_ok=True)

    # Move-only pipeline (e.g. the unresponded subset): move only, no import.
    if pipeline.get("move_only"):
        if not dry_run:
            ensure_processed_dirs()
        for csv_path in paths:
            print(f"Move-only: {csv_path.name} [{csv_path.parent.parent.name}]")
            move_to_processed(csv_path, dry_run)
        print(f"[{pipeline['name']}] Presunuto {len(paths)} souboru (neimportuje se).")
        return

    col = None
    if not dry_run:
        col = client[DB_NAME][pipeline["collection"]]
        col.create_index([("record_id", ASCENDING)], unique=True)
        for index_spec in pipeline["indexes"]:
            col.create_index(index_spec)
        ensure_processed_dirs()

    total = {"inserted": 0, "changed": 0, "unchanged": 0}
    for csv_path in paths:
        print(f"Import: {csv_path.name} [{csv_path.parent.parent.name}]")
        stats = import_file(csv_path, col, pipeline, dry_run)
        for counter in total:
            total[counter] += stats[counter]
        move_to_processed(csv_path, dry_run)
    print(f"[{pipeline['name']}] Celkem: +{total['inserted']} new, ~{total['changed']} changed, ={total['unchanged']} same")
def main():
    """Run every pipeline in PIPELINES over the command-line arguments.

    ``--dry-run`` disables all database access and file moves; every other
    argument is passed on as a file path.  Outside dry-run mode a MongoDB
    client is opened and pinged first, and it is closed in a ``finally``
    block so the connection is released even when a pipeline raises.
    """
    args = sys.argv[1:]
    dry_run = "--dry-run" in args
    cli_args = [a for a in args if a != "--dry-run"]

    client = None
    if not dry_run:
        client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        if client is not None:
            # Fail fast when the server is unreachable.
            client.admin.command("ping")
        for pipeline in PIPELINES:
            run_pipeline(pipeline, client, cli_args, dry_run)
    finally:
        if client is not None:
            client.close()


if __name__ == "__main__":
    main()