z230
This commit is contained in:
@@ -0,0 +1,530 @@
|
||||
# create_report_v2.0.py — v2.0 — 2026-05-29
|
||||
# UCO3001 Covance specimen & kit report — zdroj dat: MongoDB (covance + edc)
|
||||
|
||||
import pandas as pd
|
||||
from openpyxl import Workbook
|
||||
from openpyxl.styles import Font, PatternFill, Border, Side, Alignment
|
||||
from openpyxl.utils import get_column_letter
|
||||
from datetime import date, datetime
|
||||
from pymongo import MongoClient
|
||||
|
||||
# ── Konfigurace ────────────────────────────────────────────────────────────────
|
||||
MONGO_URI = "mongodb://192.168.1.76:27017"
|
||||
out_dir = "U:/Dropbox/!!!Days/Downloads Z230/"
|
||||
|
||||
# ── MongoDB připojení ──────────────────────────────────────────────────────────
|
||||
client = MongoClient(MONGO_URI)
|
||||
covance_db = client["covance"]
|
||||
edc_db = client["edc"]
|
||||
|
||||
# ── Načtení dat z MongoDB ──────────────────────────────────────────────────────
|
||||
print("Načítám data z MongoDB...")
|
||||
|
||||
samples_docs = list(covance_db["allsamples"].find())
|
||||
df = pd.DataFrame([doc["fields"] for doc in samples_docs]).reset_index(drop=True)
|
||||
print(f" allsamples: {len(df)} záznamů")
|
||||
|
||||
kit_docs = list(covance_db["kits"].find())
|
||||
kit_df_raw = pd.DataFrame([doc["fields"] for doc in kit_docs]).reset_index(drop=True)
|
||||
print(f" kits: {len(kit_df_raw)} záznamů")
|
||||
|
||||
# Load every "Date of Visit" EDC form and flatten the nested document into
# one flat record per visit (site number, subject label, visit instance,
# visit start date, contact type).
edc_docs = list(edc_db["UCO3001.DateofVisit"].find())
edc_rows = [
    {
        "SiteNumber": visit_doc["site"]["number"],
        "Subject": visit_doc["subject"]["label"],
        "InstanceName": visit_doc["form"]["instanceName"],
        # .get(): these two form fields may be absent from a document.
        "Field4Value": visit_doc["fields"].get("Visit Start Date"),
        "Field5Value": visit_doc["fields"].get("Type of Contact"),
    }
    for visit_doc in edc_docs
]
edc_df_raw = pd.DataFrame(edc_rows)
print(f" DateofVisit: {len(edc_df_raw)} záznamů")
|
||||
|
||||
# ── Výstupní soubor ────────────────────────────────────────────────────────────
|
||||
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
|
||||
out_filename = f"{timestamp} 77242113UCO3001 CZE Labcorp samples and kit inventory report.xlsx"
|
||||
out_path = out_dir + out_filename
|
||||
|
||||
# ── Příprava dat — allsamples ──────────────────────────────────────────────────
|
||||
all_patients = sorted(df['Patient No.'].dropna().unique())
|
||||
bxscr = df[df['Protocol Visit Code'] == 'BXSCR']
|
||||
dna = df[df['Protocol Visit Code'] == 'DNA']
|
||||
|
||||
def fmt_date(val):
    """Convert a raw sample date value to a naive ``datetime``.

    Accepts ``datetime`` objects (the timezone is dropped, the wall-clock
    value is kept) and strings in ``DD-Mon-YYYY``, ``YYYY-MM-DDTHH:MM:SS`` or
    ``YYYY-MM-DD`` form. Anything else is handed to ``pd.to_datetime`` as a
    last resort.

    Returns ``None`` for missing values (``None``, NaN, ``pd.NaT``), for
    input that parses to ``NaT`` (e.g. an empty string) and for values that
    cannot be parsed at all.
    """
    # The null check must come first: pd.NaT is a datetime subclass and would
    # otherwise be returned as if it were a real date.
    try:
        if pd.isna(val):
            return None
    except (TypeError, ValueError):
        # Array-like input has no single truth value; fall through to the
        # generic parser below.
        pass
    if isinstance(val, datetime):
        return val.replace(tzinfo=None)
    if isinstance(val, str):
        text = val.strip()
        for fmt in ('%d-%b-%Y', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                continue
    try:
        parsed = pd.to_datetime(val)
        if parsed is pd.NaT:
            # e.g. pd.to_datetime("") -> NaT; report it as "no date".
            return None
        return parsed.to_pydatetime().replace(tzinfo=None)
    except Exception:
        # Deliberate best-effort: an unparseable value means "no date".
        return None
|
||||
|
||||
OK_STATUSES = {'Received', 'In Inventory', 'Shipped'}
|
||||
|
||||
def get_specimen_info(visit_df, patient, specimen_type=None):
    """Look up the first accepted specimen of a patient in one visit subset.

    Args:
        visit_df: Rows of the samples frame for a single visit; it must keep
            the index labels of the full frame.
        patient: Value matched against the 'Patient No.' column.
        specimen_type: Optional value matched against 'Specimen Type';
            a falsy value disables this filter.

    Returns:
        ``(receipt_date, sheet_row)`` for the first row whose 'Sample Status'
        is in ``OK_STATUSES``: the parsed 'Container Receipt Date' and the
        row's index label + 2 (row 1 of the source sheet is the header).
        ``('', None)`` when no row qualifies.
    """
    selected = visit_df['Patient No.'] == patient
    if specimen_type:
        selected = selected & (visit_df['Specimen Type'] == specimen_type)
    selected = selected & visit_df['Sample Status'].isin(OK_STATUSES)

    matches = visit_df[selected]
    if matches.empty:
        return '', None

    receipt_date = fmt_date(matches.iloc[0]['Container Receipt Date'])
    sheet_row = matches.index[0] + 2
    return receipt_date, sheet_row
|
||||
|
||||
def get_label_info(patient, label_code, visit_code):
    """Look up the first accepted container with a given label for a patient.

    Searches the module-level samples frame ``df`` for rows matching the
    patient, the protocol visit code and 'Container Label Line 1', keeping
    only rows whose 'Sample Status' is in ``OK_STATUSES``.

    Returns:
        ``(receipt_date, sheet_row)`` for the first match: the parsed
        'Container Receipt Date' and the row's index label + 2 (row 1 of the
        source sheet is the header). ``('', None)`` when nothing matches.
    """
    wanted = (
        (df['Patient No.'] == patient)
        & (df['Protocol Visit Code'] == visit_code)
        & (df['Container Label Line 1'] == label_code)
        & df['Sample Status'].isin(OK_STATUSES)
    )
    hits = df[wanted]
    if hits.empty:
        return '', None

    first_hit = hits.iloc[0]
    return fmt_date(first_hit['Container Receipt Date']), hits.index[0] + 2
|
||||
|
||||
# ── Příprava dat — kit inventory ───────────────────────────────────────────────
|
||||
cze = kit_df_raw[kit_df_raw["Country"] == "CZE"].copy()
|
||||
|
||||
def parse_kit_date(val):
    """Parse a kit-inventory date such as ``'Mar 17, 2026'``.

    ``datetime`` input is returned without its timezone. Any other value is
    converted to ``str`` and parsed with the ``'%b %d, %Y'`` format.

    Returns ``None`` for missing values (``None``, NaN, ``pd.NaT``) and for
    text that does not match the expected format.
    """
    # Null check first: pd.NaT is a datetime subclass and would otherwise be
    # passed through as a valid date.
    try:
        if pd.isna(val):
            return None
    except (TypeError, ValueError):
        # Array-like input: no scalar truth value; the strptime below
        # rejects it.
        pass
    if isinstance(val, datetime):
        return val.replace(tzinfo=None)
    try:
        return datetime.strptime(str(val).strip(), "%b %d, %Y")
    except ValueError:
        return None
|
||||
|
||||
cze["Shipped Date"] = cze["Shipped Date"].apply(parse_kit_date)
|
||||
cze["Expiration Date"] = cze["Expiration Date"].apply(parse_kit_date)
|
||||
cze["Days to Expiration"] = pd.to_numeric(cze["Days to Expiration"], errors="coerce")
|
||||
cze = cze.sort_values(["Site", "Kit Type", "Expiration Date"]).reset_index(drop=True)
|
||||
|
||||
today_dt = datetime.combine(date.today(), datetime.min.time())
|
||||
|
||||
def bucket(exp_date, today=None):
    """Classify a kit expiration date as expiring soon or later.

    Args:
        exp_date: Expiration date (``datetime`` / ``pd.Timestamp``) or a
            missing value.
        today: Reference date; defaults to the module-level ``today_dt``.

    Returns:
        ``"soon"`` when the kit expires within 30 days of the reference date
        (already expired kits included), ``"ok"`` otherwise, and ``None``
        when the expiration date is missing.
    """
    # pd.isna() also catches NaT/NaN: a missing date is not ``None`` once it
    # has been stored in a datetime column, so ``is None`` alone misses it.
    if exp_date is None or pd.isna(exp_date):
        return None
    reference = today_dt if today is None else today
    return "soon" if (exp_date - reference).days <= 30 else "ok"
|
||||
|
||||
cze["_bucket"] = cze["Expiration Date"].apply(bucket)
|
||||
|
||||
kit_order = sorted(cze["Kit Type"].unique(), key=lambda x: (str(x).lstrip("T-").zfill(5), str(x)))
|
||||
kit_desc = cze.drop_duplicates("Kit Type").set_index("Kit Type")["Description"].to_dict()
|
||||
kit_sites = sorted(cze["Site"].unique())
|
||||
|
||||
# ── Příprava dat — EDC pacienti ────────────────────────────────────────────────
|
||||
def fmt_date_edc(val):
    """Convert a raw EDC visit date to a naive ``datetime``.

    Accepts ``datetime`` objects (the timezone is dropped) and strings in
    ``DD Mon YYYY``, ``YYYY-MM-DDTHH:MM:SS`` or ``YYYY-MM-DD`` form. Anything
    else is handed to ``pd.to_datetime`` as a last resort.

    Returns ``None`` for missing values (``None``, NaN, ``pd.NaT``), for
    input that parses to ``NaT`` (e.g. an empty string) and for values that
    cannot be parsed at all.
    """
    # The null check must come first: pd.NaT is a datetime subclass and would
    # otherwise be returned as if it were a real date.
    try:
        if pd.isna(val):
            return None
    except (TypeError, ValueError):
        # Array-like input has no single truth value; fall through.
        pass
    if isinstance(val, datetime):
        return val.replace(tzinfo=None)
    if isinstance(val, str):
        text = val.strip()
        for fmt in ('%d %b %Y', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                continue
    try:
        parsed = pd.to_datetime(val)
        if parsed is pd.NaT:
            # e.g. pd.to_datetime("") -> NaT; report it as "no date".
            return None
        return parsed.to_pydatetime().replace(tzinfo=None)
    except Exception:
        # Deliberate best-effort: an unparseable value means "no date".
        return None
|
||||
|
||||
# Pre-compute, for every subject, the sheet row of its first visit in the
# patient list. The frame is sorted by the same keys as the patient sheet
# (site, subject, visit date); +2 turns the 0-based position into a 1-based
# row number below the header row.
_pat_pre = edc_df_raw[['SiteNumber', 'Subject', 'Field4Value']].copy()
_pat_pre['Field4Value'] = _pat_pre['Field4Value'].apply(fmt_date_edc)
_pat_pre = (
    _pat_pre
    .sort_values(['SiteNumber', 'Subject', 'Field4Value'])
    .reset_index(drop=True)
)

patient_row_map = {}
for position, subject in zip(_pat_pre.index, _pat_pre['Subject']):
    # setdefault keeps the first (earliest) row seen for each subject.
    patient_row_map.setdefault(subject, position + 2)
|
||||
|
||||
bxscr_patients = sorted(bxscr['Patient No.'].dropna().unique())
|
||||
|
||||
# ── Workbook ───────────────────────────────────────────────────────────────────
|
||||
out_wb = Workbook()
|
||||
out_wb.remove(out_wb.active)
|
||||
|
||||
# ── Styly ──────────────────────────────────────────────────────────────────────
|
||||
thin = Side(style='thin')
|
||||
border = Border(left=thin, right=thin, top=thin, bottom=thin)
|
||||
header_fill = PatternFill("solid", fgColor="4472C4")
|
||||
header_font = Font(name='Calibri', bold=True, size=11, color="FFFFFF")
|
||||
data_font = Font(name='Calibri', size=11)
|
||||
date_font_link = Font(name='Calibri', size=11, color="000000", underline='single')
|
||||
yes_fill = PatternFill("solid", fgColor="E2EFDA")
|
||||
no_fill = PatternFill("solid", fgColor="FFE7E7")
|
||||
sum_header_font = Font(name='Calibri', bold=True, size=11, color="000000")
|
||||
sum_total_font = Font(name='Calibri', bold=True, size=11)
|
||||
zero_font = Font(name='Calibri', size=11, color="BFBFBF")
|
||||
zero_red_font = Font(name='Calibri', size=11, color="C00000")
|
||||
dark_blue_fill = PatternFill("solid", fgColor="203764")
|
||||
orange_fill = PatternFill("solid", fgColor="FFF2CC")
|
||||
green_fill = PatternFill("solid", fgColor="E2EFDA")
|
||||
total_fill = PatternFill("solid", fgColor="D9E1F2")
|
||||
exp_fill = PatternFill("solid", fgColor="FFE7E7")
|
||||
ok_fill = PatternFill("solid", fgColor="E2EFDA")
|
||||
|
||||
# ── List: Zdroj ────────────────────────────────────────────────────────────────
|
||||
# Generován z covance.allsamples — pořadí řádků odpovídá df.index,
|
||||
# proto hyperlinky z Přehledu vzorků (index + 2) míří na správné řádky.
|
||||
src_ws = out_wb.create_sheet("Zdroj")
|
||||
src_sheet_name = "Zdroj"
|
||||
pat_sheet_name = "Seznam pacientů"
|
||||
|
||||
zdroj_columns = [
|
||||
"Protocol Code", "Investigator No.", "Investigator Name", "Patient No.",
|
||||
"Collection Date", "Protocol Visit Code", "Kit Receipt Date",
|
||||
"Container Receipt Date", "Accession", "Container No.", "Container Barcode No.",
|
||||
"Specimen Type", "Sample Status", "Expected Receipt Condition",
|
||||
"Actual Receipt Condition", "Container Label Line 1", "Container Label Line 2",
|
||||
"SM Sample Status", "SMART Specimen Class Description", "Parent Barcode", "Children Barcode",
|
||||
]
|
||||
|
||||
for col_idx, col_name in enumerate(zdroj_columns, 1):
|
||||
cell = src_ws.cell(row=1, column=col_idx, value=col_name)
|
||||
cell.font = header_font
|
||||
cell.fill = header_fill
|
||||
cell.border = border
|
||||
cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)
|
||||
src_ws.column_dimensions[get_column_letter(col_idx)].width = max(len(col_name) + 2, 14)
|
||||
|
||||
src_ws.row_dimensions[1].height = 30
|
||||
src_ws.freeze_panes = "A2"
|
||||
|
||||
def clean(v):
    """Return ``None`` for a missing scalar (None/NaN/NaT), else ``v`` itself.

    Values for which ``pd.isna`` has no single truth value (for example a
    multi-element list) are returned unchanged.
    """
    try:
        missing = bool(pd.isna(v))
    except (TypeError, ValueError):
        missing = False
    return None if missing else v
|
||||
|
||||
for row_idx, (_, row) in enumerate(df.iterrows(), 2):
|
||||
for col_idx, col_name in enumerate(zdroj_columns, 1):
|
||||
val = clean(row.get(col_name))
|
||||
cell = src_ws.cell(row=row_idx, column=col_idx, value=val)
|
||||
cell.font = data_font
|
||||
cell.border = border
|
||||
cell.alignment = Alignment(horizontal='center', vertical='center')
|
||||
|
||||
src_ws.auto_filter.ref = f"A1:{get_column_letter(len(zdroj_columns))}1"
|
||||
|
||||
# ── List: Přehled vzorků ───────────────────────────────────────────────────────
|
||||
analysis_ws = out_wb.create_sheet("Přehled vzorků")
|
||||
|
||||
columns = [
|
||||
("Investigator Name", 24),
|
||||
("Číslo pacienta", 20),
|
||||
("Máme biopsii SM11", 20),
|
||||
("Máme RNA", 16),
|
||||
("Máme Cryostor", 16),
|
||||
("DNA", 14),
|
||||
("PLASMPK I-0 TROUGH", 18),
|
||||
("PLASMA PK I-0 PEAK", 18),
|
||||
("SERUM ADA I-0 PRE", 18),
|
||||
("SM06/SERUM BIOM", 16),
|
||||
("SM07/WB RNA", 14),
|
||||
("SM10/FECAL", 14),
|
||||
("PLASMPK I-2 TROUGH", 18),
|
||||
("PLASMA PK I-2 PEAK", 18),
|
||||
("SERUM ADA I-2 PRE", 18),
|
||||
("STOOL I-2", 12),
|
||||
("PLASMPK I-4 TROUGH", 18),
|
||||
("PLASMA PK I-4 PEAK", 18),
|
||||
("SERUM ADA I-4 PRE", 18),
|
||||
("SM06/SERUM BIOM", 16),
|
||||
("SM07/WB RNA", 14),
|
||||
("STOOL I-4", 12),
|
||||
]
|
||||
|
||||
group_font = Font(name='Calibri', bold=True, size=11)
|
||||
group_fill = PatternFill("solid", fgColor="FFFFFF")
|
||||
group_border = Border(left=thin, right=thin, top=thin, bottom=thin)
|
||||
|
||||
groups = [
|
||||
(3, 5, "SCREENING"),
|
||||
(7, 12, "RANDOMIZACE I-0"),
|
||||
(13, 16, "I-2"),
|
||||
(17, 22, "I-4"),
|
||||
]
|
||||
for start_col, end_col, label in groups:
|
||||
analysis_ws.merge_cells(start_row=1, start_column=start_col, end_row=1, end_column=end_col)
|
||||
cell = analysis_ws.cell(row=1, column=start_col, value=label)
|
||||
cell.font = group_font
|
||||
cell.fill = group_fill
|
||||
cell.alignment = Alignment(horizontal='center', vertical='center')
|
||||
cell.border = group_border
|
||||
for c in range(start_col, end_col + 1):
|
||||
analysis_ws.cell(row=1, column=c).border = group_border
|
||||
|
||||
analysis_ws.row_dimensions[1].height = 20
|
||||
|
||||
for col_idx, (hdr, width) in enumerate(columns, 1):
|
||||
cell = analysis_ws.cell(row=2, column=col_idx, value=hdr)
|
||||
cell.font = header_font
|
||||
cell.fill = header_fill
|
||||
cell.border = border
|
||||
cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)
|
||||
analysis_ws.column_dimensions[get_column_letter(col_idx)].width = width
|
||||
|
||||
analysis_ws.row_dimensions[2].height = 30
|
||||
analysis_ws.freeze_panes = "C3"
|
||||
|
||||
for row_idx, patient in enumerate(bxscr_patients, 3):
|
||||
investigator = bxscr[bxscr['Patient No.'] == patient].iloc[0]['Investigator Name']
|
||||
sm11, sm11_row = get_specimen_info(bxscr, patient, 'Tissue , Paraffin Block')
|
||||
rna, rna_row = get_specimen_info(bxscr, patient, 'Biopsy RNA Later')
|
||||
cryo, cryo_row = get_specimen_info(bxscr, patient, 'Biopsy, Frozen Tissue')
|
||||
dna_date, dna_row = get_specimen_info(dna, patient)
|
||||
trough, trough_row = get_label_info(patient, 'PLASMPK I-0 TROUGH', 'I-0')
|
||||
peak, peak_row = get_label_info(patient, 'PLASMA PK I-0 PEAK', 'I-0')
|
||||
ada, ada_row = get_label_info(patient, 'SERUM ADA I-0 PRE', 'I-0')
|
||||
sm06, sm06_row = get_label_info(patient, 'SM06/SERUM BIOM', 'I-0')
|
||||
sm07, sm07_row = get_label_info(patient, 'SM07/WB RNA', 'I-0')
|
||||
sm10, sm10_row = get_label_info(patient, 'SM10/FECAL', 'I-0')
|
||||
trough2, trough2_row = get_label_info(patient, 'PLASMPK I-2 TROUGH', 'I-2')
|
||||
peak2, peak2_row = get_label_info(patient, 'PLASMA PK I-2 PEAK', 'I-2')
|
||||
ada2, ada2_row = get_label_info(patient, 'SERUM ADA I-2 PRE', 'I-2')
|
||||
stool2, stool2_row = get_label_info(patient, 'STOOL I-2', 'I-2')
|
||||
trough4, trough4_row = get_label_info(patient, 'PLASMPK I-4 TROUGH', 'I-4')
|
||||
peak4, peak4_row = get_label_info(patient, 'PLASMA PK I-4 PEAK', 'I-4')
|
||||
ada4, ada4_row = get_label_info(patient, 'SERUM ADA I-4 PRE', 'I-4')
|
||||
sm064, sm064_row = get_label_info(patient, 'SM06/SERUM BIOM', 'I-4')
|
||||
sm074, sm074_row = get_label_info(patient, 'SM07/WB RNA', 'I-4')
|
||||
stool4, stool4_row = get_label_info(patient, 'STOOL I-4', 'I-4')
|
||||
|
||||
row_data = [
|
||||
investigator, patient,
|
||||
(sm11, sm11_row), (rna, rna_row), (cryo, cryo_row), (dna_date, dna_row),
|
||||
(trough, trough_row), (peak, peak_row), (ada, ada_row),
|
||||
(sm06, sm06_row), (sm07, sm07_row), (sm10, sm10_row),
|
||||
(trough2, trough2_row),(peak2, peak2_row), (ada2, ada2_row), (stool2, stool2_row),
|
||||
(trough4, trough4_row),(peak4, peak4_row), (ada4, ada4_row),
|
||||
(sm064, sm064_row), (sm074, sm074_row), (stool4, stool4_row),
|
||||
]
|
||||
|
||||
for col_idx, value in enumerate(row_data, 1):
|
||||
if col_idx <= 2:
|
||||
cell = analysis_ws.cell(row=row_idx, column=col_idx, value=value)
|
||||
if col_idx == 2 and patient in patient_row_map:
|
||||
cell.hyperlink = f"#'{pat_sheet_name}'!B{patient_row_map[patient]}"
|
||||
cell.font = Font(name='Calibri', size=11, underline='single')
|
||||
else:
|
||||
cell.font = data_font
|
||||
else:
|
||||
dt, excel_row = value
|
||||
cell = analysis_ws.cell(row=row_idx, column=col_idx, value=dt)
|
||||
if dt and excel_row is not None:
|
||||
cell.hyperlink = f"#'{src_sheet_name}'!A{excel_row}"
|
||||
cell.font = date_font_link
|
||||
cell.fill = yes_fill
|
||||
cell.number_format = 'DD-MMM-YYYY'
|
||||
else:
|
||||
cell.font = Font(name='Calibri', size=11, color="C00000")
|
||||
cell.fill = no_fill
|
||||
cell.border = border
|
||||
cell.alignment = Alignment(horizontal='center', vertical='center')
|
||||
|
||||
# ── List: Seznam pacientů ──────────────────────────────────────────────────────
|
||||
patients_ws = out_wb.create_sheet("Seznam pacientů")
|
||||
|
||||
pat_columns = [
|
||||
("Číslo centra", 20),
|
||||
("Číslo pacienta", 20),
|
||||
("Kód návštěvy", 20),
|
||||
("Datum návštěvy", 16),
|
||||
("Typ návštěvy", 16),
|
||||
]
|
||||
|
||||
for col_idx, (col_name, width) in enumerate(pat_columns, 1):
|
||||
cell = patients_ws.cell(row=1, column=col_idx, value=col_name)
|
||||
cell.font = header_font
|
||||
cell.fill = header_fill
|
||||
cell.border = border
|
||||
cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)
|
||||
patients_ws.column_dimensions[get_column_letter(col_idx)].width = width
|
||||
|
||||
patients_ws.row_dimensions[1].height = 30
|
||||
patients_ws.freeze_panes = "A2"
|
||||
|
||||
pat_df = edc_df_raw[['SiteNumber', 'Subject', 'InstanceName', 'Field4Value', 'Field5Value']].copy()
|
||||
pat_df['Field4Value'] = pat_df['Field4Value'].apply(fmt_date_edc)
|
||||
pat_df = pat_df.sort_values(['SiteNumber', 'Subject', 'Field4Value']).reset_index(drop=True)
|
||||
|
||||
pat_col_keys = ['SiteNumber', 'Subject', 'InstanceName', 'Field4Value', 'Field5Value']
|
||||
for row_idx, (_, row) in enumerate(pat_df.iterrows(), 2):
|
||||
for col_idx, key in enumerate(pat_col_keys, 1):
|
||||
value = clean(row[key])
|
||||
cell = patients_ws.cell(row=row_idx, column=col_idx, value=value)
|
||||
cell.font = data_font
|
||||
cell.border = border
|
||||
cell.alignment = Alignment(horizontal='center', vertical='center')
|
||||
if col_idx == 4 and value is not None:
|
||||
cell.number_format = 'DD-MMM-YYYY'
|
||||
|
||||
# ── Pomocná funkce pro souhrnné tabulky ────────────────────────────────────────
|
||||
def write_summary_table(ws, current_row, title, rows_data, col_a_header):
    """Write one four-column summary table and return the next free row.

    Layout, starting at ``current_row``: a merged dark-blue title banner,
    a header row, one row per entry of ``rows_data`` and a totals row.

    Args:
        ws: Target worksheet.
        current_row: 1-based row where the title banner is written.
        title: Text of the title banner.
        rows_data: Iterable of ``(col_a, col_b, n_soon, n_ok)`` tuples.
        col_a_header: Header caption of the first column.

    Returns:
        The row two below the totals row, leaving one empty row between
        consecutive tables.
    """
    title_row = current_row
    header_row = title_row + 1
    data_row = header_row + 1

    # Title banner spanning columns A-D.
    for column in range(1, 5):
        banner_cell = ws.cell(row=title_row, column=column)
        banner_cell.fill = dark_blue_fill
        banner_cell.border = border
    title_cell = ws.cell(row=title_row, column=1, value=title)
    title_cell.font = Font(name='Calibri', bold=True, size=12, color="FFFFFF")
    title_cell.alignment = Alignment(horizontal="left", vertical="center")
    ws.merge_cells(start_row=title_row, start_column=1, end_row=title_row, end_column=4)
    ws.row_dimensions[title_row].height = 22

    # Column headers; the two count columns carry their bucket colour.
    header_spec = (
        (col_a_header, header_fill),
        ("Description", header_fill),
        ("Expiruje do 30 dní", orange_fill),
        ("Expiruje později", green_fill),
    )
    for column, (caption, caption_fill) in enumerate(header_spec, 1):
        header_cell = ws.cell(row=header_row, column=column, value=caption)
        header_cell.font = sum_header_font
        header_cell.fill = caption_fill
        header_cell.border = border
        header_cell.alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
    ws.row_dimensions[header_row].height = 28

    # Data rows, accumulating the column totals on the way.
    soon_total = 0
    ok_total = 0
    for col_a, col_b, n_soon, n_ok in rows_data:
        soon_total += n_soon
        ok_total += n_ok
        nothing_left = n_soon == 0 and n_ok == 0
        row_spec = (
            (col_a, None),
            (col_b, None),
            (n_soon, orange_fill if n_soon > 0 else None),
            (n_ok, green_fill if n_ok > 0 else None),
        )
        for column, (val, count_fill) in enumerate(row_spec, 1):
            cell = ws.cell(row=data_row, column=column, value=val)
            is_zero_count = column >= 3 and val == 0
            if not is_zero_count:
                cell.font = data_font
            elif nothing_left:
                # Both buckets empty: highlight the zeros in red.
                cell.font = zero_red_font
            else:
                cell.font = zero_font
            cell.border = border
            cell.alignment = Alignment(
                horizontal="left" if column < 2 else "center",
                vertical="center",
            )
            if count_fill:
                cell.fill = count_fill
        data_row += 1

    # Totals row.
    totals_row = data_row
    for column, val in enumerate(("CELKEM", "", soon_total, ok_total), 1):
        total_cell = ws.cell(row=totals_row, column=column, value=val)
        total_cell.font = sum_total_font
        total_cell.fill = total_fill
        total_cell.border = border
        total_cell.alignment = Alignment(
            horizontal="left" if column < 2 else "center",
            vertical="center",
        )

    return totals_row + 2
|
||||
|
||||
# ── List: Kit Inventory CZE ────────────────────────────────────────────────────
|
||||
kit_ws = out_wb.create_sheet("Kit Inventory CZE")
|
||||
|
||||
listing_columns = [
|
||||
("Project No.", 14),
|
||||
("Region", 10),
|
||||
("Country", 10),
|
||||
("Site", 38),
|
||||
("Kit Type", 12),
|
||||
("Description", 22),
|
||||
("Accession", 18),
|
||||
("Shipped Date", 16),
|
||||
("Expiration Date", 16),
|
||||
("Days to Expiration", 20),
|
||||
]
|
||||
|
||||
for col_idx, (hdr, width) in enumerate(listing_columns, 1):
|
||||
cell = kit_ws.cell(row=1, column=col_idx, value=hdr)
|
||||
cell.font = header_font
|
||||
cell.fill = header_fill
|
||||
cell.border = border
|
||||
cell.alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
|
||||
kit_ws.column_dimensions[get_column_letter(col_idx)].width = width
|
||||
|
||||
kit_ws.row_dimensions[1].height = 30
|
||||
kit_ws.freeze_panes = "A2"
|
||||
|
||||
for row_idx, (_, row) in enumerate(cze.iterrows(), 2):
|
||||
days = row.get("Days to Expiration")
|
||||
for col_idx, (col_name, _) in enumerate(listing_columns, 1):
|
||||
value = clean(row.get(col_name))
|
||||
cell = kit_ws.cell(row=row_idx, column=col_idx, value=value)
|
||||
cell.font = data_font
|
||||
cell.border = border
|
||||
cell.alignment = Alignment(horizontal="center", vertical="center")
|
||||
if col_name in ("Shipped Date", "Expiration Date") and value is not None:
|
||||
cell.number_format = "DD-MMM-YYYY"
|
||||
if col_name == "Days to Expiration":
|
||||
cell.fill = exp_fill if (pd.notna(days) and days <= 60) else ok_fill
|
||||
|
||||
kit_ws.auto_filter.ref = f"A1:{get_column_letter(len(listing_columns))}1"
|
||||
|
||||
# ── List: Přehled po centrech ──────────────────────────────────────────────────
|
||||
ctr_ws = out_wb.create_sheet("Přehled po centrech")
|
||||
ctr_ws.column_dimensions["A"].width = 22
|
||||
ctr_ws.column_dimensions["B"].width = 24
|
||||
ctr_ws.column_dimensions["C"].width = 22
|
||||
ctr_ws.column_dimensions["D"].width = 20
|
||||
|
||||
current_row = 1
|
||||
for site in kit_sites:
|
||||
site_df = cze[cze["Site"] == site]
|
||||
rows_data = []
|
||||
for kit in kit_order:
|
||||
desc = kit_desc.get(kit, "")
|
||||
kit_site_df = site_df[site_df["Kit Type"] == kit]
|
||||
n_soon = int((kit_site_df["_bucket"] == "soon").sum())
|
||||
n_ok = int((kit_site_df["_bucket"] == "ok").sum())
|
||||
rows_data.append((f"{kit} — {desc}", desc, n_soon, n_ok))
|
||||
current_row = write_summary_table(ctr_ws, current_row, site, rows_data, "Kit Type")
|
||||
|
||||
# ── List: Přehled po typech kitů ───────────────────────────────────────────────
|
||||
sum_ws = out_wb.create_sheet("Přehled po typech")
|
||||
sum_ws.column_dimensions["A"].width = 38
|
||||
sum_ws.column_dimensions["B"].width = 22
|
||||
sum_ws.column_dimensions["C"].width = 22
|
||||
sum_ws.column_dimensions["D"].width = 20
|
||||
|
||||
current_row = 1
|
||||
for kit in kit_order:
|
||||
desc = kit_desc.get(kit, "")
|
||||
kit_df = cze[cze["Kit Type"] == kit]
|
||||
rows_data = []
|
||||
for site in sorted(kit_df["Site"].unique()):
|
||||
site_df = kit_df[kit_df["Site"] == site]
|
||||
n_soon = int((site_df["_bucket"] == "soon").sum())
|
||||
n_ok = int((site_df["_bucket"] == "ok").sum())
|
||||
rows_data.append((site, desc, n_soon, n_ok))
|
||||
current_row = write_summary_table(sum_ws, current_row, f"Kit Type {kit} — {desc}", rows_data, "Centrum")
|
||||
|
||||
# ── List: eQueries ─────────────────────────────────────────────────────────────
|
||||
# TODO: doplnit až budou eQuery data importována do MongoDB
|
||||
# Zdroj: covance db, kolekce "equeries" (dle konvence importu)
|
||||
# Filtr: Country == "CZECH REPUBLIC"
|
||||
# Sloupce: Site, Subject, Visit, Visit Collection Date, Accession,
|
||||
# eQueryId, Issue Type, Status, Create Date, Response Date Time,
|
||||
# Time Before Response, User Name
|
||||
# Řazení: Open → Response Received → Closed, pak Site
|
||||
eq_ws = out_wb.create_sheet("eQueries")
|
||||
eq_ws.cell(row=1, column=1,
|
||||
value="TODO: eQuery data zatím nejsou v MongoDB — doplnit po importu.").font = Font(
|
||||
name='Calibri', bold=True, size=12, color="C00000"
|
||||
)
|
||||
eq_ws.column_dimensions["A"].width = 70
|
||||
|
||||
# ── Uložení ────────────────────────────────────────────────────────────────────
|
||||
out_wb.save(out_path)
|
||||
client.close()
|
||||
|
||||
print(f"\nUloženo: {out_path}")
|
||||
print(f"Pacienti s BXSCR: {len(bxscr_patients)}, Všichni pacienti: {len(all_patients)}")
|
||||
print(f"CZE kity: {len(cze)}, Typy kitů: {len(kit_order)}, Centra: {len(kit_sites)}")
|
||||
@@ -0,0 +1,610 @@
|
||||
# create_report_v2.1.py — v2.1 — 2026-06-16
|
||||
# UCO3001 Covance specimen & kit report — zdroj dat: MongoDB (covance + edc)
|
||||
# Změny v2.1: doplněn list "eQueries" z covance.equeries (study 35472 = UCO3001,
|
||||
# Country == "CZECH REPUBLIC"), barevné zvýraznění dle stavu, řazení
|
||||
# In Progress → Response Received → Closed, pak Site, pak Create Date.
|
||||
|
||||
import pandas as pd
|
||||
from openpyxl import Workbook
|
||||
from openpyxl.styles import Font, PatternFill, Border, Side, Alignment
|
||||
from openpyxl.utils import get_column_letter
|
||||
from datetime import date, datetime
|
||||
from pymongo import MongoClient
|
||||
|
||||
# ── Konfigurace ────────────────────────────────────────────────────────────────
|
||||
MONGO_URI = "mongodb://192.168.1.76:27017"
|
||||
out_dir = "U:/Dropbox/!!!Days/Downloads Z230/"
|
||||
EQ_STUDY = "35472" # 77242113UCO3001
|
||||
|
||||
# ── MongoDB připojení ──────────────────────────────────────────────────────────
|
||||
client = MongoClient(MONGO_URI)
|
||||
covance_db = client["covance"]
|
||||
edc_db = client["edc"]
|
||||
|
||||
# ── Načtení dat z MongoDB ──────────────────────────────────────────────────────
|
||||
print("Načítám data z MongoDB...")
|
||||
|
||||
samples_docs = list(covance_db["allsamples"].find())
|
||||
df = pd.DataFrame([doc["fields"] for doc in samples_docs]).reset_index(drop=True)
|
||||
print(f" allsamples: {len(df)} záznamů")
|
||||
|
||||
kit_docs = list(covance_db["kits"].find())
|
||||
kit_df_raw = pd.DataFrame([doc["fields"] for doc in kit_docs]).reset_index(drop=True)
|
||||
print(f" kits: {len(kit_df_raw)} záznamů")
|
||||
|
||||
eq_docs = list(covance_db["equeries"].find({"study": EQ_STUDY}))
|
||||
eq_df_raw = pd.DataFrame([doc["fields"] for doc in eq_docs]).reset_index(drop=True)
|
||||
print(f" equeries: {len(eq_df_raw)} záznamů (study {EQ_STUDY})")
|
||||
|
||||
# Load every "Date of Visit" EDC form and flatten the nested document into
# one flat record per visit (site number, subject label, visit instance,
# visit start date, contact type).
edc_docs = list(edc_db["UCO3001.DateofVisit"].find())
edc_rows = [
    {
        "SiteNumber": visit_doc["site"]["number"],
        "Subject": visit_doc["subject"]["label"],
        "InstanceName": visit_doc["form"]["instanceName"],
        # .get(): these two form fields may be absent from a document.
        "Field4Value": visit_doc["fields"].get("Visit Start Date"),
        "Field5Value": visit_doc["fields"].get("Type of Contact"),
    }
    for visit_doc in edc_docs
]
edc_df_raw = pd.DataFrame(edc_rows)
print(f" DateofVisit: {len(edc_df_raw)} záznamů")
|
||||
|
||||
# ── Výstupní soubor ────────────────────────────────────────────────────────────
|
||||
timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
|
||||
out_filename = f"{timestamp} 77242113UCO3001 CZE Labcorp samples and kit inventory report.xlsx"
|
||||
out_path = out_dir + out_filename
|
||||
|
||||
# ── Příprava dat — allsamples ──────────────────────────────────────────────────
|
||||
all_patients = sorted(df['Patient No.'].dropna().unique())
|
||||
bxscr = df[df['Protocol Visit Code'] == 'BXSCR']
|
||||
dna = df[df['Protocol Visit Code'] == 'DNA']
|
||||
|
||||
def fmt_date(val):
    """Convert a raw sample date value to a naive ``datetime``.

    Accepts ``datetime`` objects (the timezone is dropped, the wall-clock
    value is kept) and strings in ``DD-Mon-YYYY``, ``YYYY-MM-DDTHH:MM:SS`` or
    ``YYYY-MM-DD`` form. Anything else is handed to ``pd.to_datetime`` as a
    last resort.

    Returns ``None`` for missing values (``None``, NaN, ``pd.NaT``), for
    input that parses to ``NaT`` (e.g. an empty string) and for values that
    cannot be parsed at all.
    """
    # The null check must come first: pd.NaT is a datetime subclass and would
    # otherwise be returned as if it were a real date.
    try:
        if pd.isna(val):
            return None
    except (TypeError, ValueError):
        # Array-like input has no single truth value; fall through to the
        # generic parser below.
        pass
    if isinstance(val, datetime):
        return val.replace(tzinfo=None)
    if isinstance(val, str):
        text = val.strip()
        for fmt in ('%d-%b-%Y', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                continue
    try:
        parsed = pd.to_datetime(val)
        if parsed is pd.NaT:
            # e.g. pd.to_datetime("") -> NaT; report it as "no date".
            return None
        return parsed.to_pydatetime().replace(tzinfo=None)
    except Exception:
        # Deliberate best-effort: an unparseable value means "no date".
        return None
|
||||
|
||||
OK_STATUSES = {'Received', 'In Inventory', 'Shipped'}
|
||||
|
||||
def get_specimen_info(visit_df, patient, specimen_type=None):
    """Look up the first accepted specimen of a patient in one visit subset.

    Args:
        visit_df: Rows of the samples frame for a single visit; it must keep
            the index labels of the full frame.
        patient: Value matched against the 'Patient No.' column.
        specimen_type: Optional value matched against 'Specimen Type';
            a falsy value disables this filter.

    Returns:
        ``(receipt_date, sheet_row)`` for the first row whose 'Sample Status'
        is in ``OK_STATUSES``: the parsed 'Container Receipt Date' and the
        row's index label + 2 (presumably the row in the source sheet, whose
        first row is the header). ``('', None)`` when no row qualifies.
    """
    selected = visit_df['Patient No.'] == patient
    if specimen_type:
        selected = selected & (visit_df['Specimen Type'] == specimen_type)
    selected = selected & visit_df['Sample Status'].isin(OK_STATUSES)

    matches = visit_df[selected]
    if matches.empty:
        return '', None

    receipt_date = fmt_date(matches.iloc[0]['Container Receipt Date'])
    sheet_row = matches.index[0] + 2
    return receipt_date, sheet_row
|
||||
|
||||
def get_label_info(patient, label_code, visit_code):
    """Look up the first accepted container with a given label for a patient.

    Searches the module-level samples frame ``df`` for rows matching the
    patient, the protocol visit code and 'Container Label Line 1', keeping
    only rows whose 'Sample Status' is in ``OK_STATUSES``.

    Returns:
        ``(receipt_date, sheet_row)`` for the first match: the parsed
        'Container Receipt Date' and the row's index label + 2 (presumably
        the row in the source sheet, whose first row is the header).
        ``('', None)`` when nothing matches.
    """
    wanted = (
        (df['Patient No.'] == patient)
        & (df['Protocol Visit Code'] == visit_code)
        & (df['Container Label Line 1'] == label_code)
        & df['Sample Status'].isin(OK_STATUSES)
    )
    hits = df[wanted]
    if hits.empty:
        return '', None

    first_hit = hits.iloc[0]
    return fmt_date(first_hit['Container Receipt Date']), hits.index[0] + 2
|
||||
|
||||
# ── Příprava dat — kit inventory ───────────────────────────────────────────────
|
||||
cze = kit_df_raw[kit_df_raw["Country"] == "CZE"].copy()
|
||||
|
||||
def parse_kit_date(val):
    """Parse a kit-inventory date such as ``'Mar 17, 2026'``.

    ``datetime`` input is returned without its timezone. Any other value is
    converted to ``str`` and parsed with the ``'%b %d, %Y'`` format.

    Returns ``None`` for missing values (``None``, NaN, ``pd.NaT``) and for
    text that does not match the expected format.
    """
    # Null check first: pd.NaT is a datetime subclass and would otherwise be
    # passed through as a valid date.
    try:
        if pd.isna(val):
            return None
    except (TypeError, ValueError):
        # Array-like input: no scalar truth value; the strptime below
        # rejects it.
        pass
    if isinstance(val, datetime):
        return val.replace(tzinfo=None)
    try:
        return datetime.strptime(str(val).strip(), "%b %d, %Y")
    except ValueError:
        return None
|
||||
|
||||
cze["Shipped Date"] = cze["Shipped Date"].apply(parse_kit_date)
|
||||
cze["Expiration Date"] = cze["Expiration Date"].apply(parse_kit_date)
|
||||
cze["Days to Expiration"] = pd.to_numeric(cze["Days to Expiration"], errors="coerce")
|
||||
cze = cze.sort_values(["Site", "Kit Type", "Expiration Date"]).reset_index(drop=True)
|
||||
|
||||
today_dt = datetime.combine(date.today(), datetime.min.time())
|
||||
|
||||
def bucket(exp_date, today=None):
    """Classify a kit expiration date as expiring soon or later.

    Args:
        exp_date: Expiration date (``datetime`` / ``pd.Timestamp``) or a
            missing value.
        today: Reference date; defaults to the module-level ``today_dt``.

    Returns:
        ``"soon"`` when the kit expires within 30 days of the reference date
        (already expired kits included), ``"ok"`` otherwise, and ``None``
        when the expiration date is missing.
    """
    # pd.isna() also catches NaT/NaN: a missing date is not ``None`` once it
    # has been stored in a datetime column, so ``is None`` alone misses it.
    if exp_date is None or pd.isna(exp_date):
        return None
    reference = today_dt if today is None else today
    return "soon" if (exp_date - reference).days <= 30 else "ok"
|
||||
|
||||
cze["_bucket"] = cze["Expiration Date"].apply(bucket)
|
||||
|
||||
kit_order = sorted(cze["Kit Type"].unique(), key=lambda x: (str(x).lstrip("T-").zfill(5), str(x)))
|
||||
kit_desc = cze.drop_duplicates("Kit Type").set_index("Kit Type")["Description"].to_dict()
|
||||
kit_sites = sorted(cze["Site"].unique())
|
||||
|
||||
# ── Příprava dat — eQueries ────────────────────────────────────────────────────
|
||||
def parse_eq_date(val):
    """Parse an eQuery date such as ``'Mar 17, 2026 3:49 PM'``.

    The time part is optional (``'Mar 17, 2026'`` is accepted as well).
    ``datetime`` input is returned without its timezone; text in any other
    layout is handed to ``pd.to_datetime`` as a last resort.

    Returns ``None`` for missing values (``None``, NaN, ``pd.NaT``), for
    blank text and for values that cannot be parsed.
    """
    # Null check first: pd.NaT is a datetime subclass and would otherwise be
    # returned as if it were a real date.
    try:
        if pd.isna(val):
            return None
    except (TypeError, ValueError):
        # Array-like input has no single truth value; fall through.
        pass
    if isinstance(val, datetime):
        return val.replace(tzinfo=None)
    s = str(val).strip()
    for fmt in ("%b %d, %Y %I:%M %p", "%b %d, %Y"):
        try:
            return datetime.strptime(s, fmt)
        except ValueError:
            continue
    try:
        parsed = pd.to_datetime(s)
        if parsed is pd.NaT:
            # Blank text parses to NaT; report it as "no date" instead.
            return None
        return parsed.to_pydatetime().replace(tzinfo=None)
    except Exception:
        # Deliberate best-effort: an unparseable value means "no date".
        return None
|
||||
|
||||
# Prepare the eQuery frame: parse the date columns that are present and sort
# by status (In Progress -> Response Received -> Closed, anything else last),
# then by site, then by creation date.
if eq_df_raw.empty:
    eq_df = eq_df_raw
else:
    eq_df = eq_df_raw.copy()
    date_columns = [
        name
        for name in ("Visit Collection Date", "Create Date", "Response Date Time")
        if name in eq_df.columns
    ]
    for name in date_columns:
        eq_df[name] = eq_df[name].apply(parse_eq_date)

    status_order = {"In Progress": 0, "Response Received": 1, "Closed": 2}
    # Statuses missing from the mapping get rank 99 and sort to the end.
    eq_df["_status_rank"] = [
        status_order.get(status, 99) for status in eq_df["Status"]
    ]
    eq_df = eq_df.sort_values(
        by=["_status_rank", "Site", "Create Date"]
    ).reset_index(drop=True)
|
||||
|
||||
# ── Příprava dat — EDC pacienti ────────────────────────────────────────────────
|
||||
def fmt_date_edc(val):
    """Convert an EDC visit-date value to a timezone-naive ``datetime``.

    Strings are tried against '17 Mar 2026', '2026-03-17T10:20:30' and
    '2026-03-17'; anything else falls back to ``pd.to_datetime``.

    Args:
        val: Raw value — ``None``, ``NaN``, ``pd.NaT``, a ``datetime``, a
            date string, or another object ``pd.to_datetime`` understands.

    Returns:
        A naive ``datetime``, or ``None`` when the value is missing, blank or
        cannot be parsed. Timezone info is dropped, not converted.
    """
    if val is None:
        return None
    if isinstance(val, datetime):
        # pd.NaT is a datetime subclass; treat it as "missing".
        if pd.isna(val):
            return None
        return val.replace(tzinfo=None)
    if isinstance(val, float) and pd.isna(val):
        return None

    if isinstance(val, str):
        text = val.strip()
        if not text:
            # pd.to_datetime("") yields NaT rather than raising.
            return None
        for fmt in ('%d %b %Y', '%Y-%m-%dT%H:%M:%S', '%Y-%m-%d'):
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                pass

    # Best-effort fallback for other layouts and non-string inputs.
    try:
        parsed = pd.to_datetime(val)
        if pd.isna(parsed):
            return None
        return parsed.to_pydatetime().replace(tzinfo=None)
    except Exception:
        # Deliberately broad: an unparseable date becomes an empty cell.
        return None
|
||||
|
||||
# Map each patient to the Excel row of their first entry on the patient-list
# sheet; the sample overview uses it to hyperlink patient numbers.
_pat_pre = edc_df_raw[['SiteNumber', 'Subject', 'Field4Value']].copy()
_pat_pre['Field4Value'] = _pat_pre['Field4Value'].apply(fmt_date_edc)
# Same sort keys as pat_df, which is later written out starting at row 2, so
# positions here are meant to line up with rows on that sheet.
_pat_pre = _pat_pre.sort_values(['SiteNumber', 'Subject', 'Field4Value']).reset_index(drop=True)
patient_row_map = {}
for i, row in _pat_pre.iterrows():
    pat = row['Subject']
    if pat not in patient_row_map:
        # +2: the index is 0-based and sheet row 1 holds the header.
        patient_row_map[pat] = i + 2
|
||||
|
||||
bxscr_patients = sorted(bxscr['Patient No.'].dropna().unique())
|
||||
|
||||
# ── Workbook ───────────────────────────────────────────────────────────────────
|
||||
out_wb = Workbook()
|
||||
out_wb.remove(out_wb.active)
|
||||
|
||||
# ── Styly ──────────────────────────────────────────────────────────────────────
|
||||
thin = Side(style='thin')
|
||||
border = Border(left=thin, right=thin, top=thin, bottom=thin)
|
||||
header_fill = PatternFill("solid", fgColor="4472C4")
|
||||
header_font = Font(name='Calibri', bold=True, size=11, color="FFFFFF")
|
||||
data_font = Font(name='Calibri', size=11)
|
||||
date_font_link = Font(name='Calibri', size=11, color="000000", underline='single')
|
||||
yes_fill = PatternFill("solid", fgColor="E2EFDA")
|
||||
no_fill = PatternFill("solid", fgColor="FFE7E7")
|
||||
sum_header_font = Font(name='Calibri', bold=True, size=11, color="000000")
|
||||
sum_total_font = Font(name='Calibri', bold=True, size=11)
|
||||
zero_font = Font(name='Calibri', size=11, color="BFBFBF")
|
||||
zero_red_font = Font(name='Calibri', size=11, color="C00000")
|
||||
dark_blue_fill = PatternFill("solid", fgColor="203764")
|
||||
orange_fill = PatternFill("solid", fgColor="FFF2CC")
|
||||
green_fill = PatternFill("solid", fgColor="E2EFDA")
|
||||
total_fill = PatternFill("solid", fgColor="D9E1F2")
|
||||
exp_fill = PatternFill("solid", fgColor="FFE7E7")
|
||||
ok_fill = PatternFill("solid", fgColor="E2EFDA")
|
||||
|
||||
# ── List: Zdroj ────────────────────────────────────────────────────────────────
|
||||
# Generován z covance.allsamples — pořadí řádků odpovídá df.index,
|
||||
# proto hyperlinky z Přehledu vzorků (index + 2) míří na správné řádky.
|
||||
src_ws = out_wb.create_sheet("Zdroj")
|
||||
src_sheet_name = "Zdroj"
|
||||
pat_sheet_name = "Seznam pacientů"
|
||||
|
||||
zdroj_columns = [
|
||||
"Protocol Code", "Investigator No.", "Investigator Name", "Patient No.",
|
||||
"Collection Date", "Protocol Visit Code", "Kit Receipt Date",
|
||||
"Container Receipt Date", "Accession", "Container No.", "Container Barcode No.",
|
||||
"Specimen Type", "Sample Status", "Expected Receipt Condition",
|
||||
"Actual Receipt Condition", "Container Label Line 1", "Container Label Line 2",
|
||||
"SM Sample Status", "SMART Specimen Class Description", "Parent Barcode", "Children Barcode",
|
||||
]
|
||||
|
||||
for col_idx, col_name in enumerate(zdroj_columns, 1):
|
||||
cell = src_ws.cell(row=1, column=col_idx, value=col_name)
|
||||
cell.font = header_font
|
||||
cell.fill = header_fill
|
||||
cell.border = border
|
||||
cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)
|
||||
src_ws.column_dimensions[get_column_letter(col_idx)].width = max(len(col_name) + 2, 14)
|
||||
|
||||
src_ws.row_dimensions[1].height = 30
|
||||
src_ws.freeze_panes = "A2"
|
||||
|
||||
def clean(v):
    """Return ``None`` for pandas-missing scalars, otherwise ``v`` unchanged.

    Values for which ``pd.isna`` has no single truth value (e.g. multi-element
    sequences) are returned as they are.
    """
    try:
        is_missing = bool(pd.isna(v))
    except (TypeError, ValueError):
        # pd.isna() returned something without an unambiguous truth value.
        is_missing = False
    return None if is_missing else v
|
||||
|
||||
for row_idx, (_, row) in enumerate(df.iterrows(), 2):
|
||||
for col_idx, col_name in enumerate(zdroj_columns, 1):
|
||||
val = clean(row.get(col_name))
|
||||
cell = src_ws.cell(row=row_idx, column=col_idx, value=val)
|
||||
cell.font = data_font
|
||||
cell.border = border
|
||||
cell.alignment = Alignment(horizontal='center', vertical='center')
|
||||
|
||||
src_ws.auto_filter.ref = f"A1:{get_column_letter(len(zdroj_columns))}1"
|
||||
|
||||
# ── List: Přehled vzorků ───────────────────────────────────────────────────────
|
||||
analysis_ws = out_wb.create_sheet("Přehled vzorků")
|
||||
|
||||
columns = [
|
||||
("Investigator Name", 24),
|
||||
("Číslo pacienta", 20),
|
||||
("Máme biopsii SM11", 20),
|
||||
("Máme RNA", 16),
|
||||
("Máme Cryostor", 16),
|
||||
("DNA", 14),
|
||||
("PLASMPK I-0 TROUGH", 18),
|
||||
("PLASMA PK I-0 PEAK", 18),
|
||||
("SERUM ADA I-0 PRE", 18),
|
||||
("SM06/SERUM BIOM", 16),
|
||||
("SM07/WB RNA", 14),
|
||||
("SM10/FECAL", 14),
|
||||
("PLASMPK I-2 TROUGH", 18),
|
||||
("PLASMA PK I-2 PEAK", 18),
|
||||
("SERUM ADA I-2 PRE", 18),
|
||||
("STOOL I-2", 12),
|
||||
("PLASMPK I-4 TROUGH", 18),
|
||||
("PLASMA PK I-4 PEAK", 18),
|
||||
("SERUM ADA I-4 PRE", 18),
|
||||
("SM06/SERUM BIOM", 16),
|
||||
("SM07/WB RNA", 14),
|
||||
("STOOL I-4", 12),
|
||||
]
|
||||
|
||||
group_font = Font(name='Calibri', bold=True, size=11)
|
||||
group_fill = PatternFill("solid", fgColor="FFFFFF")
|
||||
group_border = Border(left=thin, right=thin, top=thin, bottom=thin)
|
||||
|
||||
groups = [
|
||||
(3, 5, "SCREENING"),
|
||||
(7, 12, "RANDOMIZACE I-0"),
|
||||
(13, 16, "I-2"),
|
||||
(17, 22, "I-4"),
|
||||
]
|
||||
for start_col, end_col, label in groups:
|
||||
analysis_ws.merge_cells(start_row=1, start_column=start_col, end_row=1, end_column=end_col)
|
||||
cell = analysis_ws.cell(row=1, column=start_col, value=label)
|
||||
cell.font = group_font
|
||||
cell.fill = group_fill
|
||||
cell.alignment = Alignment(horizontal='center', vertical='center')
|
||||
cell.border = group_border
|
||||
for c in range(start_col, end_col + 1):
|
||||
analysis_ws.cell(row=1, column=c).border = group_border
|
||||
|
||||
analysis_ws.row_dimensions[1].height = 20
|
||||
|
||||
for col_idx, (hdr, width) in enumerate(columns, 1):
|
||||
cell = analysis_ws.cell(row=2, column=col_idx, value=hdr)
|
||||
cell.font = header_font
|
||||
cell.fill = header_fill
|
||||
cell.border = border
|
||||
cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)
|
||||
analysis_ws.column_dimensions[get_column_letter(col_idx)].width = width
|
||||
|
||||
analysis_ws.row_dimensions[2].height = 30
|
||||
analysis_ws.freeze_panes = "C3"
|
||||
|
||||
for row_idx, patient in enumerate(bxscr_patients, 3):
|
||||
investigator = bxscr[bxscr['Patient No.'] == patient].iloc[0]['Investigator Name']
|
||||
sm11, sm11_row = get_specimen_info(bxscr, patient, 'Tissue , Paraffin Block')
|
||||
rna, rna_row = get_specimen_info(bxscr, patient, 'Biopsy RNA Later')
|
||||
cryo, cryo_row = get_specimen_info(bxscr, patient, 'Biopsy, Frozen Tissue')
|
||||
dna_date, dna_row = get_specimen_info(dna, patient)
|
||||
trough, trough_row = get_label_info(patient, 'PLASMPK I-0 TROUGH', 'I-0')
|
||||
peak, peak_row = get_label_info(patient, 'PLASMA PK I-0 PEAK', 'I-0')
|
||||
ada, ada_row = get_label_info(patient, 'SERUM ADA I-0 PRE', 'I-0')
|
||||
sm06, sm06_row = get_label_info(patient, 'SM06/SERUM BIOM', 'I-0')
|
||||
sm07, sm07_row = get_label_info(patient, 'SM07/WB RNA', 'I-0')
|
||||
sm10, sm10_row = get_label_info(patient, 'SM10/FECAL', 'I-0')
|
||||
trough2, trough2_row = get_label_info(patient, 'PLASMPK I-2 TROUGH', 'I-2')
|
||||
peak2, peak2_row = get_label_info(patient, 'PLASMA PK I-2 PEAK', 'I-2')
|
||||
ada2, ada2_row = get_label_info(patient, 'SERUM ADA I-2 PRE', 'I-2')
|
||||
stool2, stool2_row = get_label_info(patient, 'STOOL I-2', 'I-2')
|
||||
trough4, trough4_row = get_label_info(patient, 'PLASMPK I-4 TROUGH', 'I-4')
|
||||
peak4, peak4_row = get_label_info(patient, 'PLASMA PK I-4 PEAK', 'I-4')
|
||||
ada4, ada4_row = get_label_info(patient, 'SERUM ADA I-4 PRE', 'I-4')
|
||||
sm064, sm064_row = get_label_info(patient, 'SM06/SERUM BIOM', 'I-4')
|
||||
sm074, sm074_row = get_label_info(patient, 'SM07/WB RNA', 'I-4')
|
||||
stool4, stool4_row = get_label_info(patient, 'STOOL I-4', 'I-4')
|
||||
|
||||
row_data = [
|
||||
investigator, patient,
|
||||
(sm11, sm11_row), (rna, rna_row), (cryo, cryo_row), (dna_date, dna_row),
|
||||
(trough, trough_row), (peak, peak_row), (ada, ada_row),
|
||||
(sm06, sm06_row), (sm07, sm07_row), (sm10, sm10_row),
|
||||
(trough2, trough2_row),(peak2, peak2_row), (ada2, ada2_row), (stool2, stool2_row),
|
||||
(trough4, trough4_row),(peak4, peak4_row), (ada4, ada4_row),
|
||||
(sm064, sm064_row), (sm074, sm074_row), (stool4, stool4_row),
|
||||
]
|
||||
|
||||
for col_idx, value in enumerate(row_data, 1):
|
||||
if col_idx <= 2:
|
||||
cell = analysis_ws.cell(row=row_idx, column=col_idx, value=value)
|
||||
if col_idx == 2 and patient in patient_row_map:
|
||||
cell.hyperlink = f"#'{pat_sheet_name}'!B{patient_row_map[patient]}"
|
||||
cell.font = Font(name='Calibri', size=11, underline='single')
|
||||
else:
|
||||
cell.font = data_font
|
||||
else:
|
||||
dt, excel_row = value
|
||||
cell = analysis_ws.cell(row=row_idx, column=col_idx, value=dt)
|
||||
if dt and excel_row is not None:
|
||||
cell.hyperlink = f"#'{src_sheet_name}'!A{excel_row}"
|
||||
cell.font = date_font_link
|
||||
cell.fill = yes_fill
|
||||
cell.number_format = 'DD-MMM-YYYY'
|
||||
else:
|
||||
cell.font = Font(name='Calibri', size=11, color="C00000")
|
||||
cell.fill = no_fill
|
||||
cell.border = border
|
||||
cell.alignment = Alignment(horizontal='center', vertical='center')
|
||||
|
||||
# ── List: Seznam pacientů ──────────────────────────────────────────────────────
|
||||
patients_ws = out_wb.create_sheet("Seznam pacientů")
|
||||
|
||||
pat_columns = [
|
||||
("Číslo centra", 20),
|
||||
("Číslo pacienta", 20),
|
||||
("Kód návštěvy", 20),
|
||||
("Datum návštěvy", 16),
|
||||
("Typ návštěvy", 16),
|
||||
]
|
||||
|
||||
for col_idx, (col_name, width) in enumerate(pat_columns, 1):
|
||||
cell = patients_ws.cell(row=1, column=col_idx, value=col_name)
|
||||
cell.font = header_font
|
||||
cell.fill = header_fill
|
||||
cell.border = border
|
||||
cell.alignment = Alignment(horizontal='center', vertical='center', wrap_text=True)
|
||||
patients_ws.column_dimensions[get_column_letter(col_idx)].width = width
|
||||
|
||||
patients_ws.row_dimensions[1].height = 30
|
||||
patients_ws.freeze_panes = "A2"
|
||||
|
||||
pat_df = edc_df_raw[['SiteNumber', 'Subject', 'InstanceName', 'Field4Value', 'Field5Value']].copy()
|
||||
pat_df['Field4Value'] = pat_df['Field4Value'].apply(fmt_date_edc)
|
||||
pat_df = pat_df.sort_values(['SiteNumber', 'Subject', 'Field4Value']).reset_index(drop=True)
|
||||
|
||||
pat_col_keys = ['SiteNumber', 'Subject', 'InstanceName', 'Field4Value', 'Field5Value']
|
||||
for row_idx, (_, row) in enumerate(pat_df.iterrows(), 2):
|
||||
for col_idx, key in enumerate(pat_col_keys, 1):
|
||||
value = clean(row[key])
|
||||
cell = patients_ws.cell(row=row_idx, column=col_idx, value=value)
|
||||
cell.font = data_font
|
||||
cell.border = border
|
||||
cell.alignment = Alignment(horizontal='center', vertical='center')
|
||||
if col_idx == 4 and value is not None:
|
||||
cell.number_format = 'DD-MMM-YYYY'
|
||||
|
||||
# ── Pomocná funkce pro souhrnné tabulky ────────────────────────────────────────
|
||||
def write_summary_table(ws, current_row, title, rows_data, col_a_header):
    """Write one titled 4-column summary table and return the next free row.

    Layout: a merged title bar, a header row, one row per item in
    ``rows_data`` and a "CELKEM" totals row, followed by one blank row.

    Args:
        ws: Worksheet to write into.
        current_row: 1-based row at which the title bar is placed.
        title: Text of the title bar.
        rows_data: Iterable of ``(col_a, col_b, n_soon, n_ok)`` tuples.
        col_a_header: Header text for the first column.

    Returns:
        The row index at which the next table can start.
    """
    # Title bar: style all four cells first, then merge them into one.
    for c in range(1, 5):
        cell = ws.cell(row=current_row, column=c)
        cell.fill = dark_blue_fill
        cell.border = border
    ws.cell(row=current_row, column=1, value=title).font = Font(name='Calibri', bold=True, size=12, color="FFFFFF")
    ws.cell(row=current_row, column=1).alignment = Alignment(horizontal="left", vertical="center")
    ws.merge_cells(start_row=current_row, start_column=1, end_row=current_row, end_column=4)
    ws.row_dimensions[current_row].height = 22
    current_row += 1

    # Header row: each caption is paired with its own fill colour.
    for col_idx, (h, f) in enumerate(zip(
        [col_a_header, "Description", "Expiruje do 30 dní", "Expiruje později"],
        [header_fill, header_fill, orange_fill, green_fill]
    ), 1):
        cell = ws.cell(row=current_row, column=col_idx, value=h)
        cell.font = sum_header_font
        cell.fill = f
        cell.border = border
        cell.alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
    ws.row_dimensions[current_row].height = 28
    current_row += 1

    # Data rows; totals[0] accumulates n_soon, totals[1] accumulates n_ok.
    totals = [0, 0]
    for col_a, col_b, n_soon, n_ok in rows_data:
        totals[0] += n_soon
        totals[1] += n_ok
        all_zero = (n_soon == 0 and n_ok == 0)
        row_vals = [col_a, col_b, n_soon, n_ok]
        # Count cells are coloured only when the count is non-zero.
        row_fills = [None, None,
                     orange_fill if n_soon > 0 else None,
                     green_fill if n_ok > 0 else None]
        for col_idx, (val, rfill) in enumerate(zip(row_vals, row_fills), 1):
            cell = ws.cell(row=current_row, column=col_idx, value=val)
            if col_idx >= 3 and val == 0:
                # Zero counts use zero_red_font when both counts of the row
                # are zero, zero_font otherwise.
                cell.font = zero_red_font if all_zero else zero_font
            else:
                cell.font = data_font
            cell.border = border
            cell.alignment = Alignment(horizontal="center" if col_idx >= 2 else "left", vertical="center")
            if rfill:
                cell.fill = rfill
        current_row += 1

    # Totals row.
    for col_idx, val in enumerate(["CELKEM", "", totals[0], totals[1]], 1):
        cell = ws.cell(row=current_row, column=col_idx, value=val)
        cell.font = sum_total_font
        cell.fill = total_fill
        cell.border = border
        cell.alignment = Alignment(horizontal="center" if col_idx >= 2 else "left", vertical="center")
    # Skip the totals row plus one blank spacer row.
    current_row += 2
    return current_row
|
||||
|
||||
# ── List: Kit Inventory CZE ────────────────────────────────────────────────────
|
||||
kit_ws = out_wb.create_sheet("Kit Inventory CZE")
|
||||
|
||||
listing_columns = [
|
||||
("Project No.", 14),
|
||||
("Region", 10),
|
||||
("Country", 10),
|
||||
("Site", 38),
|
||||
("Kit Type", 12),
|
||||
("Description", 22),
|
||||
("Accession", 18),
|
||||
("Shipped Date", 16),
|
||||
("Expiration Date", 16),
|
||||
("Days to Expiration", 20),
|
||||
]
|
||||
|
||||
for col_idx, (hdr, width) in enumerate(listing_columns, 1):
|
||||
cell = kit_ws.cell(row=1, column=col_idx, value=hdr)
|
||||
cell.font = header_font
|
||||
cell.fill = header_fill
|
||||
cell.border = border
|
||||
cell.alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
|
||||
kit_ws.column_dimensions[get_column_letter(col_idx)].width = width
|
||||
|
||||
kit_ws.row_dimensions[1].height = 30
|
||||
kit_ws.freeze_panes = "A2"
|
||||
|
||||
for row_idx, (_, row) in enumerate(cze.iterrows(), 2):
|
||||
days = row.get("Days to Expiration")
|
||||
for col_idx, (col_name, _) in enumerate(listing_columns, 1):
|
||||
value = clean(row.get(col_name))
|
||||
cell = kit_ws.cell(row=row_idx, column=col_idx, value=value)
|
||||
cell.font = data_font
|
||||
cell.border = border
|
||||
cell.alignment = Alignment(horizontal="center", vertical="center")
|
||||
if col_name in ("Shipped Date", "Expiration Date") and value is not None:
|
||||
cell.number_format = "DD-MMM-YYYY"
|
||||
if col_name == "Days to Expiration":
|
||||
cell.fill = exp_fill if (pd.notna(days) and days <= 60) else ok_fill
|
||||
|
||||
kit_ws.auto_filter.ref = f"A1:{get_column_letter(len(listing_columns))}1"
|
||||
|
||||
# ── List: Přehled po centrech ──────────────────────────────────────────────────
|
||||
ctr_ws = out_wb.create_sheet("Přehled po centrech")
|
||||
ctr_ws.column_dimensions["A"].width = 22
|
||||
ctr_ws.column_dimensions["B"].width = 24
|
||||
ctr_ws.column_dimensions["C"].width = 22
|
||||
ctr_ws.column_dimensions["D"].width = 20
|
||||
|
||||
current_row = 1
|
||||
for site in kit_sites:
|
||||
site_df = cze[cze["Site"] == site]
|
||||
rows_data = []
|
||||
for kit in kit_order:
|
||||
desc = kit_desc.get(kit, "")
|
||||
kit_site_df = site_df[site_df["Kit Type"] == kit]
|
||||
n_soon = int((kit_site_df["_bucket"] == "soon").sum())
|
||||
n_ok = int((kit_site_df["_bucket"] == "ok").sum())
|
||||
rows_data.append((f"{kit} — {desc}", desc, n_soon, n_ok))
|
||||
current_row = write_summary_table(ctr_ws, current_row, site, rows_data, "Kit Type")
|
||||
|
||||
# ── List: Přehled po typech kitů ───────────────────────────────────────────────
|
||||
sum_ws = out_wb.create_sheet("Přehled po typech")
|
||||
sum_ws.column_dimensions["A"].width = 38
|
||||
sum_ws.column_dimensions["B"].width = 22
|
||||
sum_ws.column_dimensions["C"].width = 22
|
||||
sum_ws.column_dimensions["D"].width = 20
|
||||
|
||||
current_row = 1
|
||||
for kit in kit_order:
|
||||
desc = kit_desc.get(kit, "")
|
||||
kit_df = cze[cze["Kit Type"] == kit]
|
||||
rows_data = []
|
||||
for site in sorted(kit_df["Site"].unique()):
|
||||
site_df = kit_df[kit_df["Site"] == site]
|
||||
n_soon = int((site_df["_bucket"] == "soon").sum())
|
||||
n_ok = int((site_df["_bucket"] == "ok").sum())
|
||||
rows_data.append((site, desc, n_soon, n_ok))
|
||||
current_row = write_summary_table(sum_ws, current_row, f"Kit Type {kit} — {desc}", rows_data, "Centrum")
|
||||
|
||||
# ── List: eQueries ─────────────────────────────────────────────────────────────
|
||||
# Zdroj: covance.equeries (study 35472 = 77242113UCO3001), všechny CZECH REPUBLIC.
|
||||
# Barevné zvýraznění sloupce Status: In Progress (otevřená) = červená,
|
||||
# Response Received = oranžová, Closed = zelená.
|
||||
eq_ws = out_wb.create_sheet("eQueries")
|
||||
|
||||
eq_columns = [
|
||||
("Site", 30),
|
||||
("Subject", 16),
|
||||
("Visit", 26),
|
||||
("Visit Collection Date", 20),
|
||||
("Accession", 16),
|
||||
("eQueryId", 14),
|
||||
("Issue Type", 18),
|
||||
("Status", 18),
|
||||
("Create Date", 20),
|
||||
("Response Date Time", 20),
|
||||
("Time Before Response", 18),
|
||||
("User Name", 22),
|
||||
]
|
||||
|
||||
date_cols = {"Visit Collection Date", "Create Date", "Response Date Time"}
|
||||
status_fill = {
|
||||
"In Progress": exp_fill, # otevřená — červená
|
||||
"Response Received": orange_fill, # oranžová
|
||||
"Closed": green_fill, # zelená
|
||||
}
|
||||
|
||||
for col_idx, (hdr, width) in enumerate(eq_columns, 1):
|
||||
cell = eq_ws.cell(row=1, column=col_idx, value=hdr)
|
||||
cell.font = header_font
|
||||
cell.fill = header_fill
|
||||
cell.border = border
|
||||
cell.alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
|
||||
eq_ws.column_dimensions[get_column_letter(col_idx)].width = width
|
||||
|
||||
eq_ws.row_dimensions[1].height = 30
|
||||
eq_ws.freeze_panes = "A2"
|
||||
|
||||
for row_idx, (_, row) in enumerate(eq_df.iterrows(), 2):
|
||||
status_val = row.get("Status")
|
||||
for col_idx, (col_name, _) in enumerate(eq_columns, 1):
|
||||
value = clean(row.get(col_name))
|
||||
cell = eq_ws.cell(row=row_idx, column=col_idx, value=value)
|
||||
cell.font = data_font
|
||||
cell.border = border
|
||||
cell.alignment = Alignment(horizontal="center", vertical="center")
|
||||
if col_name in date_cols and value is not None:
|
||||
cell.number_format = "DD-MMM-YYYY HH:MM"
|
||||
if col_name == "Status" and status_val in status_fill:
|
||||
cell.fill = status_fill[status_val]
|
||||
|
||||
eq_ws.auto_filter.ref = f"A1:{get_column_letter(len(eq_columns))}1"
|
||||
|
||||
# ── Uložení ────────────────────────────────────────────────────────────────────
|
||||
out_wb.save(out_path)
|
||||
client.close()
|
||||
|
||||
print(f"\nUloženo: {out_path}")
|
||||
print(f"Pacienti s BXSCR: {len(bxscr_patients)}, Všichni pacienti: {len(all_patients)}")
|
||||
print(f"CZE kity: {len(cze)}, Typy kitů: {len(kit_order)}, Centra: {len(kit_sites)}")
|
||||
print(f"eQueries (UCO3001): {len(eq_df)}")
|
||||
@@ -0,0 +1,53 @@
|
||||
# download_lab_reports_v1.0.py
|
||||
|
||||
**Verze:** 1.0 | **Datum:** 2026-06-16
|
||||
|
||||
Stahuje PDF **Lab Reports** ze `xsp.labcorp.com` pro studii **77242113UCO3001**
|
||||
(interní číslo `36940`), filtrovaně na **10 českých center**.
|
||||
|
||||
## Princip
|
||||
Stejný jako `download_test_results` — Playwright + perzistentní profil
|
||||
(`browser_profile/`), jednorázový login (email → Next → heslo → Verify, jinak se
|
||||
přeskočí), stahování **přes UI**: klik na odkaz **„English"** ve sloupci Download
|
||||
→ `POST /api/download/documentFile` → `expect_download`.
|
||||
|
||||
Lab Reports je **AG Grid s virtuálním renderem** (~50 z 298 řádků v DOM).
|
||||
Skript proto scrolluje viewport po indexech (`row-index`), u každého řádku
|
||||
přečte metadata z buněk (`col-id`: `subjectId`, `accessionNumber`, `visit`,
|
||||
`visitCollectionDate`, `siteNum`, `postedDateTime`; zdvojený text se odstraní)
|
||||
a klikne na „English".
|
||||
|
||||
## Název PDF
|
||||
Mezery místo podtržítek; první datum = **odběr** (`visitCollectionDate`),
|
||||
`posted` = **datum vystavení** (`postedDateTime`, jen datum) — odliší reissue
|
||||
stejného reportu (stejný accession, různé Posted):
|
||||
|
||||
```
|
||||
77242113UCO3001 {odběr} {Site} {Subject} {Visit} {Accession} posted {posted}.pdf
|
||||
```
|
||||
Příklad: `77242113UCO3001 2026-06-11 CZ10009 CZ100092003 Screening 6227323331 posted 2026-06-15.pdf`
|
||||
|
||||
Při zbylé kolizi názvu se přidá ` (2)`, ` (3)`, …
|
||||
|
||||
## Spuštění
|
||||
```
|
||||
python download_lab_reports_v1.0.py --dry-run # jen vypíše názvy, nestahuje
|
||||
python download_lab_reports_v1.0.py --limit 5 # stáhne jen prvních 5 (test)
|
||||
python download_lab_reports_v1.0.py # stáhne vše (~298)
|
||||
```
|
||||
|
||||
**Výstup:** `U:\PythonProject\Janssen\Covance\LabReports\`
|
||||
|
||||
## Konfigurace (v hlavičce skriptu)
|
||||
- `SITES` — 10 interních ID center (z „GO TO LINK" URL).
|
||||
- `STUDY` = `36940`, `STUDY_CODE` = `77242113UCO3001`.
|
||||
- `OUT_DIR`, `LOGIN_URL`, `PROFILE_DIR`.
|
||||
|
||||
## TODO / k doladění
|
||||
- Cílový adresář (možná Dropbox / per-studie podsložka).
|
||||
- Případně přeskakovat už stažené (teď se přidává `(2)`).
|
||||
|
||||
## Pozn. ke spuštění
|
||||
Skript otevírá viditelné GUI Chrome (Playwright) — musí běžet z **terminálu
|
||||
uživatele s desktop session** (PowerShell / dvojklik), ne z headless/agent
|
||||
prostředí (tam Chromium spadne s `exitCode 2147483651`).
|
||||
@@ -0,0 +1,280 @@
|
||||
# =============================================================================
|
||||
# Název: download_lab_reports_v1.0.py
|
||||
# Verze: 1.0
|
||||
# Datum: 2026-06-16
|
||||
# Popis: Stahuje PDF Lab Reports ze xsp.labcorp.com pro studii 77242113UCO3001
|
||||
# (interni cislo 36940), filtrovane na 10 ceskych center (CZ).
|
||||
# Princip stejny jako download_test_results: Playwright + perzistentni
|
||||
# profil (browser_profile/), jednorazovy login, stahovani pres UI
|
||||
# (klik na "English" v sloupci Download -> expect_download).
|
||||
#
|
||||
# Lab Reports grid je AG Grid s virtualnim renderem (~50 z 298 radku
|
||||
# v DOM). Skript proto scrolluje viewport po indexech (row-index),
|
||||
# u kazdeho radku precte metadata + klikne na "English".
|
||||
#
|
||||
# Nazev PDF (mezery, ne podtrzitka):
|
||||
# "77242113UCO3001 {yyyy-mm-dd odber} {Site} {Subject} {Visit} {Accession} posted {yyyy-mm-dd posted}.pdf"
|
||||
# Pri kolizi nazvu se prida " (2)", " (3)", ...
|
||||
#
|
||||
# Prepinace:
|
||||
# --dry-run nestahuje, jen vypise metadata a vysledne nazvy souboru
|
||||
# --limit N zpracuje jen prvnich N radku (test pojmenovani)
|
||||
# =============================================================================
|
||||
from playwright.sync_api import sync_playwright
|
||||
from datetime import datetime
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import traceback
|
||||
import urllib.parse
|
||||
|
||||
# --- argumenty --------------------------------------------------------------
|
||||
parser = argparse.ArgumentParser(description="Stahovani Lab Reports PDF (XSP) pro 77242113UCO3001.")
|
||||
parser.add_argument("--dry-run", action="store_true", help="nestahovat, jen vypsat metadata + nazvy")
|
||||
parser.add_argument("--limit", type=int, default=0, help="zpracovat jen prvnich N radku (0 = vse)")
|
||||
ARGS = parser.parse_args()
|
||||
|
||||
|
||||
def log(msg):
    """Print ``msg`` prefixed with the current wall-clock time (HH:MM:SS)."""
    stamp = datetime.now().strftime('%H:%M:%S')
    # flush=True so progress shows up immediately during long downloads.
    print(f"[{stamp}] {msg}", flush=True)
|
||||
|
||||
|
||||
# --- konfigurace ------------------------------------------------------------
|
||||
# NOTE(security): these credentials are hard-coded in a file under version
# control. They can now be overridden with the XSP_EMAIL / XSP_PASSWORD
# environment variables; the literals remain only as a backward-compatible
# fallback and should be rotated and removed from the source.
EMAIL = os.environ.get("XSP_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("XSP_PASSWORD", "%zT3Wqfc9)cWua5")
|
||||
LOGIN_URL = "https://xsp.labcorp.com/"
|
||||
STUDY = "36940"
|
||||
STUDY_CODE = "77242113UCO3001"
|
||||
OUT_DIR = r"U:\PythonProject\Janssen\Covance\LabReports"
|
||||
|
||||
# 10 center (interni ID center) — z URL "GO TO LINK", co poslal uzivatel.
|
||||
SITES = [
|
||||
"930539", "930547", "930555", "930556", "930553",
|
||||
"930549", "930525", "930536", "930557", "930531",
|
||||
]
|
||||
|
||||
_BASE = os.path.dirname(os.path.abspath(__file__))
|
||||
PROFILE_DIR = os.path.join(_BASE, "browser_profile")
|
||||
|
||||
|
||||
def lab_reports_url():
    """Build the Lab Reports URL for STUDY, filtered to the sites in SITES."""
    # Compact JSON array without spaces, e.g. ["930539","930547",...],
    # percent-encoded for use as the "site" query parameter.
    encoded_sites = urllib.parse.quote(json.dumps(SITES, separators=(",", ":")))
    base = f"https://xsp.labcorp.com/sponsor/study/{STUDY}/lab-reports"
    return f"{base}?site={encoded_sites}"
|
||||
|
||||
|
||||
# --- pomocne funkce nazvu souboru -------------------------------------------
|
||||
def safe(s: str) -> str:
    """Drop characters Windows forbids in file names; spaces are kept.

    A falsy input (``None`` or ``""``) yields ``""``. Leading and trailing
    whitespace is trimmed from the result.
    """
    text = s if s else ""
    cleaned = re.sub(r'[\\/:*?"<>|]', "", text)
    return cleaned.strip()
|
||||
|
||||
|
||||
_MONTHS = {"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05",
           "Jun": "06", "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10",
           "Nov": "11", "Dec": "12"}


def fmt_date(s: str) -> str:
    """Turn 'Jun 10, 2026' or 'Jun 15, 2026 7:49 PM' into '2026-06-10'.

    Only the leading date (month day, year) is used; a trailing time is
    ignored. The month is matched case-insensitively on its first three
    letters, and whitespace after the comma is optional.

    Args:
        s: Date text as shown in the grid; may be empty or ``None``.

    Returns:
        The ISO date ``YYYY-MM-DD``, or ``safe(s)`` when the text does not
        start with a recognisable date.
    """
    m = re.match(r"\s*([A-Za-z]{3})\w*\s+(\d{1,2}),\s*(\d{4})", s or "")
    if m:
        # Normalise case so "JUN" / "jun" hit the same key as "Jun".
        month = _MONTHS.get(m.group(1).capitalize())
        if month is not None:
            return f"{m.group(3)}-{month}-{int(m.group(2)):02d}"
    return safe(s)
|
||||
|
||||
|
||||
def build_basename(meta: dict) -> str:
    """Compose the PDF base name (no extension) from one grid row's metadata.

    The posted date is part of the name so that a re-issued report (same
    accession, different posted date) gets its own file; identical names are
    still resolved later by the (2)/(3) suffix in unique_path().
    """
    parts = [
        STUDY_CODE,
        fmt_date(meta['collected']),
        meta['site'],
        meta['subject'],
        meta['visit'],
        meta['accession'],
        "posted",
        fmt_date(meta['posted']),
    ]
    return safe(" ".join(map(str, parts)))
|
||||
|
||||
|
||||
def unique_path(out_dir: str, base: str, ext: str = ".pdf") -> str:
    """Return a path in ``out_dir`` for ``base + ext`` that does not exist yet.

    When the plain name is taken, " (2)", " (3)", ... is appended to ``base``
    until a free name is found.
    """
    candidate = os.path.join(out_dir, base + ext)
    suffix = 2
    while True:
        if not os.path.exists(candidate):
            return candidate
        candidate = os.path.join(out_dir, f"{base} ({suffix}){ext}")
        suffix += 1
|
||||
|
||||
|
||||
# --- JS helpery (cteni AG Gridu) --------------------------------------------
|
||||
JS_GRID_INFO = r"""() => {
|
||||
const c = document.querySelector('.ag-body-container');
|
||||
const r = document.querySelector('.ag-body-container .ag-row');
|
||||
const rh = r ? r.getBoundingClientRect().height : 25;
|
||||
const ch = c ? parseFloat(c.style.height || '0') : 0;
|
||||
return { rowHeight: rh || 25, total: rh ? Math.round(ch / rh) : 0 };
|
||||
}"""
|
||||
|
||||
JS_READ_ROW = r"""(idx) => {
|
||||
const dedup = s => {
|
||||
s = (s || '').replace(/\s+/g, ' ').trim();
|
||||
const h = s.slice(0, Math.floor(s.length / 2));
|
||||
if (s === h + h) return h;
|
||||
const m = s.match(/^(.*?)\s+\1$/);
|
||||
if (m) return m[1];
|
||||
return s;
|
||||
};
|
||||
const row = document.querySelector('.ag-body-container .ag-row[row-index="' + idx + '"]');
|
||||
if (!row) return null;
|
||||
const get = id => {
|
||||
const c = row.querySelector('[col-id="' + id + '"]');
|
||||
return c ? dedup(c.textContent) : '';
|
||||
};
|
||||
return {
|
||||
subject: get('subjectId'),
|
||||
accession: get('accessionNumber'),
|
||||
visit: get('visit'),
|
||||
collected: get('visitCollectionDate'),
|
||||
site: get('siteNum'),
|
||||
posted: get('postedDateTime'),
|
||||
};
|
||||
}"""
|
||||
|
||||
JS_SCROLL_TO = r"""(args) => {
|
||||
const [idx, rh] = args;
|
||||
const vp = document.querySelector('.ag-body-viewport');
|
||||
if (!vp) return;
|
||||
vp.scrollTop = Math.max(0, idx * rh - vp.clientHeight / 2);
|
||||
}"""
|
||||
|
||||
|
||||
# --- login ------------------------------------------------------------------
|
||||
def login(page):
    """Log in through the two-step email/password form when it is shown.

    Opens LOGIN_URL and waits up to 12 s for the "Email" field. If the field
    does not become visible, the function logs that the session is active and
    returns without entering credentials.

    Args:
        page: Playwright page used to drive the browser.
    """
    log("LOGIN: otviram login stranku...")
    page.goto(LOGIN_URL)
    try:
        page.get_by_label("Email").wait_for(state="visible", timeout=12000)
    except Exception:
        # NOTE(review): every exception (not only a timeout) is treated as
        # "already logged in" here.
        log(f"LOGIN: Email pole se neobjevilo -> session aktivni, login preskocen ({page.url})")
        return
    log("LOGIN: zadavam email...")
    page.get_by_label("Email").fill(EMAIL)
    page.get_by_role("button", name="Next").click()
    log("LOGIN: cekam na pole pro heslo...")
    page.get_by_label("Password").wait_for(state="visible", timeout=30000)
    log("LOGIN: zadavam heslo...")
    page.get_by_label("Password").fill(PASSWORD)
    page.get_by_role("button", name="Verify").click()
    log("LOGIN: cekam na presmerovani po prihlaseni...")
    try:
        # The predicate passes for any URL without "code=" or containing "xsp.".
        page.wait_for_url(lambda url: "code=" not in url or "xsp." in url, timeout=60000)
    except Exception:
        # A failed wait is logged and otherwise ignored.
        log("LOGIN: wait_for_url vyprsel, pokracuji.")
    # Fixed 3 s pause before reporting the login as done.
    page.wait_for_timeout(3000)
    log(f"LOGIN: prihlaseni hotovo ({page.url})")
|
||||
|
||||
|
||||
# --- nacteni gridu ----------------------------------------------------------
|
||||
def open_grid(page):
    """Open the Lab Reports grid and wait until its row count settles.

    Args:
        page: Playwright page used to drive the browser.

    Returns:
        Tuple ``(total, row_height)`` taken from the ``total`` and
        ``rowHeight`` fields that JS_GRID_INFO reports.
    """
    url = lab_reports_url()
    log(f"GRID: navigace na Lab Reports ({len(SITES)} center)...")
    page.goto(url)
    log("GRID: cekam na radky (.ag-row)...")
    page.wait_for_selector(".ag-body-container .ag-row", timeout=120000)
    # Wait for the row count to stabilise: poll up to 20 times, 2 s apart, and
    # stop once two consecutive polls report the same non-zero total.
    prev = -1
    for i in range(20):
        info = page.evaluate(JS_GRID_INFO)
        cnt = info["total"]
        log(f" ...kontrola #{i+1}: total={cnt}, rowHeight={info['rowHeight']}")
        if cnt == prev and cnt > 0:
            break
        prev = cnt
        page.wait_for_timeout(2000)
    # Read the grid info once more; if the loop ran out without settling, the
    # latest values are used anyway.
    info = page.evaluate(JS_GRID_INFO)
    log(f"GRID: nacteno, total={info['total']} radku, rowHeight={info['rowHeight']}px.")
    return info["total"], info["rowHeight"]
|
||||
|
||||
|
||||
# --- stazeni jednoho radku --------------------------------------------------
|
||||
def process_row(page, idx, row_height, dry_run):
    """Scroll grid row ``idx`` into the DOM, name its PDF and download it.

    Args:
        page: Playwright page showing the Lab Reports grid.
        idx: Value of the row's ``row-index`` attribute.
        row_height: Row height in pixels, passed to JS_SCROLL_TO.
        dry_run: When true, only log the resulting file name; no download.

    Returns:
        The file name (without directory) chosen for this row.

    Raises:
        RuntimeError: If the row metadata is missing or has no subject.
    """
    # Rows outside the viewport are not rendered, so scroll to the row first
    # and wait until it exists in the DOM.
    page.evaluate(JS_SCROLL_TO, [idx, row_height])
    page.wait_for_selector(f'.ag-body-container .ag-row[row-index="{idx}"]', timeout=15000)
    page.wait_for_timeout(150)  # short pause before reading the cells

    meta = page.evaluate(JS_READ_ROW, idx)
    if not meta or not meta.get("subject"):
        raise RuntimeError(f"radek {idx}: nepodarilo se precist metadata")

    base = build_basename(meta)
    # Picks a name not yet present in OUT_DIR, adding " (2)", " (3)", ... if needed.
    dest = unique_path(OUT_DIR, base)

    if dry_run:
        log(f" [DRY] #{idx}: {os.path.basename(dest)}")
        return os.path.basename(dest)

    # First "English" download link inside this row.
    link = page.locator(
        f'.ag-body-container .ag-row[row-index="{idx}"] a.dl-link',
        has_text="English",
    ).first
    # The click must happen inside expect_download so the download is captured.
    with page.expect_download(timeout=60000) as dl:
        link.click()
    dl.value.save_as(dest)
    log(f" #{idx}: -> {os.path.basename(dest)}")
    return os.path.basename(dest)
|
||||
|
||||
|
||||
# --- main -------------------------------------------------------------------
|
||||
def main():
    """Log in, open the Lab Reports grid and download every row to ``OUT_DIR``.

    Honours ``ARGS.dry_run`` (only list file names) and ``ARGS.limit``
    (process at most N rows). A failure on a single row is logged and the
    loop continues; failed row indexes are reported at the end.
    """
    os.makedirs(OUT_DIR, exist_ok=True)
    log(f"START: studie {STUDY_CODE} ({STUDY}), vystup '{OUT_DIR}', "
        f"{'DRY-RUN' if ARGS.dry_run else 'STAHOVANI'}"
        f"{f', limit {ARGS.limit}' if ARGS.limit else ''}.")

    with sync_playwright() as p:
        context = p.chromium.launch_persistent_context(
            user_data_dir=PROFILE_DIR,
            headless=False,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--start-maximized",
                "--disable-restore-session-state",
                "--disable-session-crashed-bubble",
            ],
            no_viewport=True,
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
            accept_downloads=True,
        )
        # Close the persistent context even when login/open_grid raises, so the
        # browser profile is shut down cleanly instead of being torn down with
        # the Playwright driver.
        try:
            context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
            page = context.new_page()
            log("START: prohlizec spusten.")

            login(page)
            total, row_height = open_grid(page)
            if ARGS.limit:
                total = min(total, ARGS.limit)

            ok, failed = 0, []
            for idx in range(total):
                log(f">>> Radek {idx+1}/{total}")
                try:
                    process_row(page, idx, row_height, ARGS.dry_run)
                    ok += 1
                except Exception as e:
                    # Best effort per row: remember the index and carry on.
                    failed.append(idx)
                    log(f"CHYBA u radku {idx}: {e!r} — pokracuji dalsim.")

            log(f"KONEC: hotovo {ok}/{total} radku.")
            if failed:
                log(f"KONEC: SELHALY indexy: {failed}")
        finally:
            context.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Script entry point: run main(), report a fatal error, keep the console
    # window open until Enter, then exit with a status that reflects the result
    # (previously a crashed run still exited with status 0).
    exit_code = 0
    try:
        main()
    except Exception as e:
        log(f"FATAL: beh spadl: {e!r}")
        traceback.print_exc()
        exit_code = 1
    finally:
        try:
            input("\n[Enter] pro zavreni tohoto okna...")
        except EOFError:
            # No interactive stdin; nothing to wait for.
            pass
    sys.exit(exit_code)
|
||||
@@ -0,0 +1,49 @@
|
||||
# download_lab_reports_v1.1.py
|
||||
|
||||
**Verze:** 1.1 | **Datum:** 2026-06-16
|
||||
|
||||
Stahuje PDF **Lab Reports** ze `xsp.labcorp.com` (studie **77242113UCO3001**,
|
||||
interní `36940`, 10 CZ center) a ukládá je **přímo do MongoDB** — metadata
|
||||
z tabulky + skutečné PDF (inline Binary). **Na disk neukládá.**
|
||||
|
||||
## Princip
|
||||
Playwright + perzistentní profil (`browser_profile/`), jednorázový login,
|
||||
klik na „English" ve sloupci Download. PDF bajty se čtou z Playwright temp
|
||||
souboru (`download.path()`), `save_as` se nevolá → nic netrvalého na disku.
|
||||
Materializaci adresáře z Monga řeší samostatný (budoucí) skript.
|
||||
|
||||
## Inkrementálně (stop-at-known)
|
||||
List je řazený **Posted DESC** (nejnovější nahoře). Skript jde shora dolů;
|
||||
u každého řádku nejdřív přečte metadata a spočítá `record_id`. Jakmile narazí
|
||||
na **už uložený** report, **končí** (vše pod ním je starší a už v Mongo je) —
|
||||
stahuje tedy jen nové.
|
||||
|
||||
**Korekce výsledků** = stejný report znovu vystavený s **novým Posted** →
|
||||
nový `record_id` → stáhne se jako nový, původní zůstává.
|
||||
|
||||
## MongoDB
|
||||
- db `covance`, kolekce **`labreports`**
|
||||
- klíč `record_id = "{site}|{subject}|{accession}|{visit}|{posted}"`
|
||||
(Posted vč. času odlišuje reissue)
|
||||
- dokument: `study`, `studyCode`, `type`, `site`, `subject`, `accession`,
|
||||
`visit`, `collected` (yyyy-mm-dd), `posted` (yyyy-mm-dd HH:MM),
|
||||
`fields` (sloupce tabulky), `fileName`, **`pdf`** (Binary ~260 KB),
|
||||
`pdfSize`, `pdfSha256`, `firstSeen`, `lastSeen`, `history[]`
|
||||
- upsert: nový → insert; změna sha/fields → push do `history` + update;
|
||||
shoda → jen `lastSeen`
|
||||
|
||||
## Spuštění
|
||||
```
|
||||
python download_lab_reports_v1.1.py --dry-run # jen vypíše NOVÉ, nestahuje, nepíše
|
||||
python download_lab_reports_v1.1.py --limit 5 # test: max 5 řádků
|
||||
python download_lab_reports_v1.1.py # inkrementální běh (stop-at-known)
|
||||
python download_lab_reports_v1.1.py --full # projít vše (rekonciliace)
|
||||
```
|
||||
|
||||
## Pozn. ke spuštění
|
||||
Otevírá viditelné GUI Chrome — musí běžet z **terminálu uživatele s desktop
|
||||
session** (ne headless/agent prostředí).
|
||||
|
||||
## Nahrazuje
|
||||
`download_lab_reports_v1.0.py` (ukládal na disk) — po dokončení jeho běhu
|
||||
přesunout v1.0 (.py i .md) do `TRASH/`.
|
||||
@@ -0,0 +1,399 @@
|
||||
# =============================================================================
|
||||
# Název: download_lab_reports_v1.1.py
|
||||
# Verze: 1.1
|
||||
# Datum: 2026-06-16
|
||||
# Popis: Stahuje PDF Lab Reports ze xsp.labcorp.com pro studii 77242113UCO3001
|
||||
# (interni cislo 36940), filtrovane na 10 ceskych center (CZ),
|
||||
# a uklada je PRIMO do MongoDB (db covance, kolekce labreports) —
|
||||
# metadata z tabulky + skutecne PDF (inline Binary). Na disk NEUKLADA.
|
||||
#
|
||||
# Princip stahovani stejny jako download_test_results: Playwright +
|
||||
# perzistentni profil, jednorazovy login, klik na "English" ve sloupci
|
||||
# Download. PDF bajty se ctou z Playwright temp souboru (download.path()),
|
||||
# save_as se nevola -> nic netrvale neni na disku.
|
||||
#
|
||||
# INKREMENTALNE (stop-at-known): list je Posted DESC (nejnovejsi nahore).
|
||||
# Skript jde shora dolu; u kazdeho radku nejdriv precte metadata a
|
||||
# spocita record_id. Jakmile narazi na uz ulozeny report, KONCI
|
||||
# (vse pod nim je starsi a uz v Mongo je). Korekce vysledku = stejny
|
||||
# report znovu vystaveny s NOVYM Posted => novy record_id => stahne se
|
||||
# jako novy, puvodni zustava.
|
||||
#
|
||||
# record_id = "{site}|{subject}|{accession}|{visit}|{posted}"
|
||||
# (Posted vc. casu odlisuje reissue).
|
||||
#
|
||||
# Prepinace:
|
||||
# --full projit vsechny radky (bez predcasneho konce); stahne a ulozi
|
||||
# jen chybejici — jiz zname radky preskoci bez stazeni, zmeny v nich nezjisti.
|
||||
# --dry-run nestahuje ani nepise do DB; jen vypise NOVE reporty.
|
||||
# --limit N zpracovat max N radku (test).
|
||||
# =============================================================================
|
||||
from playwright.sync_api import sync_playwright
|
||||
from datetime import datetime
|
||||
from pymongo import MongoClient, ASCENDING
|
||||
from bson.binary import Binary
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import traceback
|
||||
import urllib.parse
|
||||
|
||||
# --- argumenty --------------------------------------------------------------
|
||||
parser = argparse.ArgumentParser(description="Stahovani Lab Reports PDF (XSP) do MongoDB.")
|
||||
parser.add_argument("--full", action="store_true", help="projit vse (bez stop-at-known)")
|
||||
parser.add_argument("--dry-run", action="store_true", help="nestahovat ani nepsat do DB; jen vypsat nove")
|
||||
parser.add_argument("--limit", type=int, default=0, help="max N radku (0 = vse)")
|
||||
ARGS = parser.parse_args()
|
||||
|
||||
|
||||
def log(msg):
    """Print *msg* prefixed with the current local time as ``[HH:MM:SS]``, flushing stdout."""
    stamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{stamp}] {msg}", flush=True)
|
||||
|
||||
|
||||
# --- configuration ----------------------------------------------------------
# SECURITY: the portal credentials below are committed to source control.
# They can now be overridden through the environment (XSP_EMAIL, XSP_PASSWORD,
# MONGO_URI); the literals remain only as a backward-compatible fallback and
# should be rotated and removed.
EMAIL = os.environ.get("XSP_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("XSP_PASSWORD", "%zT3Wqfc9)cWua5")
LOGIN_URL = "https://xsp.labcorp.com/"
STUDY = "36940"                  # internal study number used in portal URLs
STUDY_CODE = "77242113UCO3001"   # study code used in logs and file names

MONGO_URI = os.environ.get("MONGO_URI", "mongodb://192.168.1.76:27017")
DB_NAME = "covance"
COLLECTION = "labreports"

# 10 sites (internal site IDs) — taken from the "GO TO LINK" URL.
SITES = [
    "930539", "930547", "930555", "930556", "930553",
    "930549", "930525", "930536", "930557", "930531",
]

# Persistent browser profile lives next to this script.
_BASE = os.path.dirname(os.path.abspath(__file__))
PROFILE_DIR = os.path.join(_BASE, "browser_profile")
|
||||
|
||||
|
||||
def lab_reports_url():
    """Return the Lab Reports URL for ``STUDY`` with ``SITES`` as a URL-quoted compact JSON array."""
    compact_sites = json.dumps(SITES, separators=(",", ":"))
    quoted_sites = urllib.parse.quote(compact_sites)
    return f"https://xsp.labcorp.com/sponsor/study/{STUDY}/lab-reports?site={quoted_sites}"
|
||||
|
||||
|
||||
# --- formatovani --------------------------------------------------------------
|
||||
_MONTHS = {"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05",
|
||||
"Jun": "06", "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10",
|
||||
"Nov": "11", "Dec": "12"}
|
||||
|
||||
|
||||
def safe(s: str) -> str:
    r"""Return *s* without the characters \ / : * ? " < > | and without outer whitespace.

    A falsy *s* (``None`` or ``""``) yields ``""``.
    """
    text = s if s else ""
    cleaned = re.sub(r'[\\/:*?"<>|]', "", text)
    return cleaned.strip()
|
||||
|
||||
|
||||
def fmt_date(s: str) -> str:
    """Convert a leading 'Mon D, YYYY' date to 'YYYY-MM-DD'; a trailing time is ignored.

    Examples: 'Jun 10, 2026' -> '2026-06-10',
    'Jun 15, 2026 7:49 PM' -> '2026-06-15'.
    Input that does not match, or whose month abbreviation is not a key of
    ``_MONTHS``, is returned through ``safe()`` instead.
    """
    found = re.match(r"\s*([A-Za-z]{3})\w*\s+(\d{1,2}),\s+(\d{4})", s or "")
    if not found:
        return safe(s)
    month_key = found.group(1)[:3]
    if month_key not in _MONTHS:
        return safe(s)
    day = int(found.group(2))
    year = found.group(3)
    return f"{year}-{_MONTHS[month_key]}-{day:02d}"
|
||||
|
||||
|
||||
def fmt_datetime(s: str) -> str:
    """Convert 'Jun 15, 2026 7:49 PM' to '2026-06-15 19:49'.

    Both 'H:MM AM/PM' and 'H:MM:SS AM/PM' are accepted (seconds are dropped).
    When neither pattern parses, the stripped text is passed to ``fmt_date()``,
    so a value without a time comes back as a date only.
    """
    text = (s or "").strip()
    for pattern in ("%b %d, %Y %I:%M %p", "%b %d, %Y %I:%M:%S %p"):
        try:
            return datetime.strptime(text, pattern).strftime("%Y-%m-%d %H:%M")
        except ValueError:
            continue
    return fmt_date(text)
|
||||
|
||||
|
||||
def make_record_id(meta: dict) -> str:
    """Build the key 'site|subject|accession|visit|posted' from row metadata.

    ``posted`` is normalised with ``fmt_datetime()``; the other parts are
    taken from *meta* unchanged.
    """
    parts = [meta[key] for key in ("site", "subject", "accession", "visit")]
    parts.append(fmt_datetime(meta["posted"]))
    return "|".join(parts)
|
||||
|
||||
|
||||
def build_basename(meta: dict) -> str:
    """Return the PDF file name for a row; it is stored in MongoDB as ``fileName``.

    Layout: '<STUDY_CODE> <collected> <site> <subject> <visit> <accession>
    posted <posted>.pdf', with both dates run through ``fmt_date()`` and the
    whole stem cleaned by ``safe()``.
    """
    pieces = [
        STUDY_CODE,
        fmt_date(meta['collected']),
        meta['site'],
        meta['subject'],
        meta['visit'],
        meta['accession'],
        "posted",
        fmt_date(meta['posted']),
    ]
    stem = " ".join(f"{piece}" for piece in pieces)
    return safe(stem) + ".pdf"
|
||||
|
||||
|
||||
# --- JS helpery (AG Grid) ---------------------------------------------------
|
||||
JS_GRID_INFO = r"""() => {
|
||||
const c = document.querySelector('.ag-body-container');
|
||||
const r = document.querySelector('.ag-body-container .ag-row');
|
||||
const rh = r ? r.getBoundingClientRect().height : 25;
|
||||
const ch = c ? parseFloat(c.style.height || '0') : 0;
|
||||
return { rowHeight: rh || 25, total: rh ? Math.round(ch / rh) : 0 };
|
||||
}"""
|
||||
|
||||
JS_READ_ROW = r"""(idx) => {
|
||||
const dedup = s => {
|
||||
s = (s || '').replace(/\s+/g, ' ').trim();
|
||||
const h = s.slice(0, Math.floor(s.length / 2));
|
||||
if (s === h + h) return h;
|
||||
const m = s.match(/^(.*?)\s+\1$/);
|
||||
if (m) return m[1];
|
||||
return s;
|
||||
};
|
||||
const row = document.querySelector('.ag-body-container .ag-row[row-index="' + idx + '"]');
|
||||
if (!row) return null;
|
||||
const get = id => {
|
||||
const c = row.querySelector('[col-id="' + id + '"]');
|
||||
return c ? dedup(c.textContent) : '';
|
||||
};
|
||||
return {
|
||||
type: get('type'),
|
||||
subject: get('subjectId'),
|
||||
accession: get('accessionNumber'),
|
||||
visit: get('visit'),
|
||||
collected: get('visitCollectionDate'),
|
||||
site: get('siteNum'),
|
||||
posted: get('postedDateTime'),
|
||||
};
|
||||
}"""
|
||||
|
||||
JS_SCROLL_TO = r"""(args) => {
|
||||
const [idx, rh] = args;
|
||||
const vp = document.querySelector('.ag-body-viewport');
|
||||
if (!vp) return;
|
||||
vp.scrollTop = Math.max(0, idx * rh - vp.clientHeight / 2);
|
||||
}"""
|
||||
|
||||
|
||||
# --- login ------------------------------------------------------------------
|
||||
def login(page):
    """Sign in to the portal with ``EMAIL`` / ``PASSWORD`` unless a session already exists.

    If the "Email" field does not become visible within 12 s, the function
    logs that the session is treated as active and returns without logging in.
    """
    def _redirect_done(url):
        # Same predicate as before: no "code=" in the URL, or an "xsp." URL.
        return "code=" not in url or "xsp." in url

    log("LOGIN: otviram login stranku...")
    page.goto(LOGIN_URL)

    try:
        email_box = page.get_by_label("Email")
        email_box.wait_for(state="visible", timeout=12000)
    except Exception:
        log(f"LOGIN: Email pole se neobjevilo -> session aktivni, login preskocen ({page.url})")
        return

    log("LOGIN: zadavam email...")
    email_box.fill(EMAIL)
    page.get_by_role("button", name="Next").click()

    log("LOGIN: cekam na pole pro heslo...")
    password_box = page.get_by_label("Password")
    password_box.wait_for(state="visible", timeout=30000)

    log("LOGIN: zadavam heslo...")
    password_box.fill(PASSWORD)
    page.get_by_role("button", name="Verify").click()

    log("LOGIN: cekam na presmerovani po prihlaseni...")
    try:
        page.wait_for_url(_redirect_done, timeout=60000)
    except Exception:
        # A redirect timeout is tolerated; the run simply continues.
        log("LOGIN: wait_for_url vyprsel, pokracuji.")
    page.wait_for_timeout(3000)
    log(f"LOGIN: prihlaseni hotovo ({page.url})")
|
||||
|
||||
|
||||
def open_grid(page):
    """Open the Lab Reports grid and wait until its row count stops changing.

    Polls ``JS_GRID_INFO`` at most 20 times, 2 s apart, and stops once two
    consecutive polls report the same non-zero total.

    Returns:
        ``(total, row_height)`` taken from one final ``JS_GRID_INFO`` call.
    """
    log(f"GRID: navigace na Lab Reports ({len(SITES)} center)...")
    page.goto(lab_reports_url())
    log("GRID: cekam na radky (.ag-row)...")
    page.wait_for_selector(".ag-body-container .ag-row", timeout=120000)

    last_total = -1
    for attempt in range(1, 21):
        snapshot = page.evaluate(JS_GRID_INFO)
        current = snapshot["total"]
        log(f" ...kontrola #{attempt}: total={current}, rowHeight={snapshot['rowHeight']}")
        if current > 0 and current == last_total:
            break
        last_total = current
        page.wait_for_timeout(2000)

    final = page.evaluate(JS_GRID_INFO)
    log(f"GRID: nacteno, total={final['total']} radku, rowHeight={final['rowHeight']}px.")
    return final["total"], final["rowHeight"]
|
||||
|
||||
|
||||
def read_row(page, idx, row_height):
    """Scroll grid row *idx* into view and return its metadata from ``JS_READ_ROW``.

    Raises:
        RuntimeError: the row could not be read or has an empty ``subject``.
    """
    row_css = f'.ag-body-container .ag-row[row-index="{idx}"]'
    page.evaluate(JS_SCROLL_TO, [idx, row_height])
    page.wait_for_selector(row_css, timeout=15000)
    page.wait_for_timeout(120)  # short fixed pause (presumably lets cells render — confirm)

    meta = page.evaluate(JS_READ_ROW, idx)
    if meta and meta.get("subject"):
        return meta
    raise RuntimeError(f"radek {idx}: nepodarilo se precist metadata")
|
||||
|
||||
|
||||
def download_pdf_bytes(page, idx):
    """Click the "English" download link of grid row *idx* and return the file's bytes.

    The content is read from the path reported by the Playwright download
    object; ``save_as`` is not called.
    """
    english_link = page.locator(
        f'.ag-body-container .ag-row[row-index="{idx}"] a.dl-link',
        has_text="English",
    ).first
    with page.expect_download(timeout=60000) as pending:
        english_link.click()
    temp_path = pending.value.path()  # Playwright's temporary file
    with open(temp_path, "rb") as handle:
        return handle.read()
|
||||
|
||||
|
||||
def upsert(col, meta, rid, data, now):
    """Insert or refresh the document for report *rid* in collection *col*.

    Args:
        col: MongoDB collection holding the reports.
        meta: row metadata (type, subject, accession, visit, collected, site, posted).
        rid: value stored and looked up as ``record_id``.
        data: PDF content as bytes.
        now: timestamp written to ``firstSeen`` / ``lastSeen``.

    Returns:
        ``"insert"`` for a new document, ``"update"`` when the stored sha256
        or ``fields`` differ (the previous state is pushed to ``history``),
        ``"same"`` when only ``lastSeen`` was refreshed.
    """
    # Raw table columns exactly as shown in the grid.
    table_fields = {
        "Type": meta["type"],
        "Subject": meta["subject"],
        "Accession": meta["accession"],
        "Visit": meta["visit"],
        "Collected Date": meta["collected"],
        "Site Number": meta["site"],
        "Posted": meta["posted"],
    }
    digest = hashlib.sha256(data).hexdigest()
    document = dict(
        study=STUDY,
        studyCode=STUDY_CODE,
        type=meta["type"] or "Lab Result",
        site=meta["site"],
        subject=meta["subject"],
        accession=meta["accession"],
        visit=meta["visit"],
        collected=fmt_date(meta["collected"]),
        posted=fmt_datetime(meta["posted"]),
        fields=table_fields,
        fileName=build_basename(meta),
        pdf=Binary(data),
        pdfSize=len(data),
        pdfSha256=digest,
    )

    stored = col.find_one({"record_id": rid}, {"pdfSha256": 1, "fields": 1, "lastSeen": 1})
    if stored is None:
        col.insert_one({"record_id": rid, **document,
                        "firstSeen": now, "lastSeen": now, "history": []})
        return "insert"

    target = {"_id": stored["_id"]}
    unchanged = stored.get("pdfSha256") == digest and stored.get("fields") == table_fields
    if unchanged:
        col.update_one(target, {"$set": {"lastSeen": now}})
        return "same"

    # Content or columns changed: archive the previous state, then overwrite.
    previous = {"date": stored.get("lastSeen"),
                "fields": stored.get("fields"),
                "pdfSha256": stored.get("pdfSha256")}
    col.update_one(
        target,
        {"$push": {"history": previous},
         "$set": {**document, "lastSeen": now}},
    )
    return "update"
|
||||
|
||||
|
||||
def main():
    """Store new Lab Report PDFs from the portal grid in MongoDB.

    Flow: load the ``record_id`` values already stored for ``STUDY``, open the
    grid, then walk the rows from the top. Without ``--full`` the walk stops
    at the first known ``record_id``; with ``--full`` known rows are skipped
    without a download. ``--dry-run`` only lists new reports, and ``--limit N``
    caps the number of rows visited. A failure on a single row is logged and
    the loop continues.

    The MongoDB client and the browser context are both released in
    ``finally`` blocks, so they are closed on errors as well.
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log(f"START: studie {STUDY_CODE} ({STUDY}) -> {DB_NAME}.{COLLECTION}, "
        f"{'DRY-RUN' if ARGS.dry_run else 'ZAPIS'}"
        f"{' [FULL]' if ARGS.full else ' [stop-at-known]'}"
        f"{f', limit {ARGS.limit}' if ARGS.limit else ''}.")

    col = None
    client = None
    existing_ids = set()
    try:
        if not ARGS.dry_run:
            client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
            client.admin.command("ping")
            col = client[DB_NAME][COLLECTION]
            col.create_index([("record_id", ASCENDING)], unique=True)
            for field in ("study", "site", "subject", "accession", "posted", "collected"):
                col.create_index([(field, ASCENDING)])
            existing_ids = {d["record_id"] for d in col.find({"study": STUDY}, {"record_id": 1})}
            log(f"START: v Mongo uz je {len(existing_ids)} reportu pro tuto studii.")
        else:
            # Dry-run still loads the known ids so that only truly new rows
            # are listed; an unreachable database is tolerated here.
            try:
                client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
                client.admin.command("ping")
                existing_ids = {d["record_id"] for d in
                                client[DB_NAME][COLLECTION].find({"study": STUDY}, {"record_id": 1})}
                log(f"START: [dry-run] v Mongo je {len(existing_ids)} reportu.")
            except Exception as e:
                log(f"START: [dry-run] Mongo nedostupne ({e!r}), beru vse jako nove.")

        with sync_playwright() as p:
            context = p.chromium.launch_persistent_context(
                user_data_dir=PROFILE_DIR,
                headless=False,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--start-maximized",
                    "--disable-restore-session-state",
                    "--disable-session-crashed-bubble",
                ],
                no_viewport=True,
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
                accept_downloads=True,
            )
            try:
                context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
                page = context.new_page()
                log("START: prohlizec spusten.")

                login(page)
                total, row_height = open_grid(page)
                if ARGS.limit:
                    total = min(total, ARGS.limit)

                new_cnt = upd_cnt = same_cnt = 0
                failed = []
                stopped = False
                for idx in range(total):
                    try:
                        meta = read_row(page, idx, row_height)
                    except Exception as e:
                        failed.append(idx)
                        log(f"CHYBA cteni radku {idx}: {e!r} — pokracuji.")
                        continue

                    rid = make_record_id(meta)

                    if rid in existing_ids:
                        if not ARGS.full:
                            log(f">>> Radek {idx+1}/{total}: '{rid}' uz v Mongo "
                                f"-> stop-at-known, koncim (zbytek je starsi).")
                            stopped = True
                            break
                        log(f" #{idx}: znamy, [full] preskakuji download.")
                        same_cnt += 1
                        continue

                    # New report from here on.
                    if ARGS.dry_run:
                        log(f" [DRY] NOVY #{idx}: {build_basename(meta)}")
                        new_cnt += 1
                        existing_ids.add(rid)
                        continue

                    try:
                        data = download_pdf_bytes(page, idx)
                        action = upsert(col, meta, rid, data, now)
                        existing_ids.add(rid)
                        if action == "insert":
                            new_cnt += 1
                            log(f" #{idx}: INSERT ({len(data)//1024} KB) {build_basename(meta)}")
                        elif action == "update":
                            upd_cnt += 1
                            log(f" #{idx}: UPDATE {build_basename(meta)}")
                        else:
                            same_cnt += 1
                    except Exception as e:
                        # Best effort per row: remember the index and carry on.
                        failed.append(idx)
                        log(f"CHYBA stazeni/zapisu radku {idx}: {e!r} — pokracuji.")

                log(f"KONEC: nove={new_cnt}, update={upd_cnt}, beze zmeny={same_cnt}, "
                    f"chyby={len(failed)} {'(stop-at-known)' if stopped else '(projeto vse)'}.")
                if failed:
                    log(f"KONEC: SELHALY indexy: {failed}")
            finally:
                context.close()
    finally:
        if client is not None:
            client.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Script entry point: run main(), report a fatal error, keep the console
    # window open until Enter, then exit with a status that reflects the result
    # (previously a crashed run still exited with status 0).
    exit_code = 0
    try:
        main()
    except Exception as e:
        log(f"FATAL: beh spadl: {e!r}")
        traceback.print_exc()
        exit_code = 1
    finally:
        try:
            input("\n[Enter] pro zavreni tohoto okna...")
        except EOFError:
            # No interactive stdin; nothing to wait for.
            pass
    sys.exit(exit_code)
|
||||
@@ -0,0 +1,56 @@
|
||||
# download_lab_reports_v1.2.py
|
||||
|
||||
**Verze:** 1.2 | **Datum:** 2026-06-16
|
||||
|
||||
Stahuje PDF **Lab Reports** ze `xsp.labcorp.com` (studie **77242113UCO3001**,
|
||||
interní `36940`, 10 CZ center) přímo do **MongoDB** (`covance.labreports`) —
|
||||
metadata + skutečné PDF (inline Binary). **Na disk neukládá.**
|
||||
|
||||
## Klíč `record_id = reportId`
|
||||
`reportId` = stabilní 32-hex ID dokumentu z dat AG Gridu, **napříč všemi řádky
|
||||
unikátní** (ověřeno: 997 řádků → 997 unikátních `reportId`/`fileId`).
|
||||
|
||||
**Proč ne metadata:** na portálu se reálně vyskytují **různá PDF se shodnými
|
||||
viditelnými metadaty** — stejný `site|subject|accession|visit|posted` i na
|
||||
minutu (korekce výsledku / reissue se shodným časem Posted). Ověřeno: 5 souborů
|
||||
pro accession `6227697718` má 5 různých sha256. Klíčování podle metadat (v1.1)
|
||||
by je chybně sloučilo → **ztráta dat**. `reportId` to řeší.
|
||||
|
||||
## Odkud data
|
||||
Z **in-memory dat AG Gridu** přes grid API (`__agComponent.gridApi`):
|
||||
`getDisplayedRowAtIndex(i).data` → `reportId`, `fileLinks[].fileId`+`fileName`
|
||||
(server), `postedDateTime` (ISO), `siteNum`, `subjectNumber`. `reportId` jde
|
||||
přečíst **bez scrollování** → levný pre-check „už mám". Accession/Visit/Collected
|
||||
leží v gridu níž (ve struktuře `visits`), proto se berou z buněk (po scrollu).
|
||||
|
||||
**Pozn.:** `accession` se nijak nepočítá ani neodvozuje — je to reálný
|
||||
identifikátor přidělený kitu v laboratoři; čte se **doslova** z buňky a ukládá
|
||||
1:1 (`accession` i `fields.Accession`).
|
||||
|
||||
## Stahování + inkrementálně
|
||||
Klik na „English" → `expect_download`, PDF z `download.path()` (bez `save_as`).
|
||||
List je Posted DESC; ze seznamu (rowIndex, reportId) se shora hledá první už
|
||||
uložený `reportId` → **stop-at-known** (zbytek je starší a v Mongo je). Stahují
|
||||
se jen nové (nahoře). Korekce = nový `reportId` → uloží se jako nový.
|
||||
|
||||
## MongoDB dokument
|
||||
`record_id`(=reportId), `study`, `studyCode`, `type`, `site`, `subject`,
|
||||
`accession`, `visit`, `collected`, `posted`, `postedIso`, `fileId`,
|
||||
`serverFileName`, `fields` (sloupce tabulky), `fileName` (náš název),
|
||||
**`pdf`** (Binary ~260 KB), `pdfSize`, `pdfSha256`, `firstSeen`, `lastSeen`,
|
||||
`history[]`. Upsert: nový→insert; změna sha/fields→push history+update;
|
||||
shoda→jen `lastSeen`.
|
||||
|
||||
Název v `fileName`: `77242113UCO3001 {odběr} {Site} {Subject} {Visit} {Accession} posted {posted}.pdf`
|
||||
|
||||
## Spuštění (z terminálu uživatele — otevírá GUI Chrome)
|
||||
```
|
||||
python download_lab_reports_v1.2.py --dry-run # vypíše NOVÉ, nestahuje
|
||||
python download_lab_reports_v1.2.py --limit 5 # test: 5 nových
|
||||
python download_lab_reports_v1.2.py # inkrementální běh
|
||||
python download_lab_reports_v1.2.py --full # rekonciliace přes vše
|
||||
```
|
||||
|
||||
## Nahrazuje
|
||||
`download_lab_reports_v1.0.py` (disk) a `v1.1` (klíč podle metadat — chybný).
|
||||
Obě → `TRASH/`.
|
||||
@@ -0,0 +1,404 @@
|
||||
# =============================================================================
|
||||
# Název: download_lab_reports_v1.2.py
|
||||
# Verze: 1.2
|
||||
# Datum: 2026-06-16
|
||||
# Popis: Stahuje PDF Lab Reports ze xsp.labcorp.com pro studii 77242113UCO3001
|
||||
# (interni cislo 36940), filtrovane na 10 ceskych center (CZ),
|
||||
# a uklada je PRIMO do MongoDB (db covance, kolekce labreports) —
|
||||
# metadata z tabulky + skutecne PDF (inline Binary). Na disk NEUKLADA.
|
||||
#
|
||||
# KLIC: record_id = reportId (z dat AG Gridu) — stabilni 32-hex ID
|
||||
# dokumentu, NAPRIC vsemi radky UNIKATNI (overeno: 997 radku ->
|
||||
# 997 unikatnich reportId/fileId). Resi pripad, kdy se na portalu
|
||||
# vyskytnou ruzna PDF se SHODNYMI viditelnymi metadaty (stejny
|
||||
# site|subject|accession|visit|posted i na minutu) — to skutecne
|
||||
# nastava (korekce vysledku reissue se shodnym casem Posted).
|
||||
# Verze v1.1 klicovala podle metadat a tyto by chybne slucovala.
|
||||
#
|
||||
# Princip stahovani: Playwright + perzistentni profil, login, klik na
|
||||
# "English" ve sloupci Download -> expect_download; PDF bajty se ctou
|
||||
# z Playwright temp souboru (download.path()), save_as se nevola.
|
||||
#
|
||||
# INKREMENTALNE (stop-at-known): list je Posted DESC (nejnovejsi
|
||||
# nahore). Nejdriv se z grid API precte SEZNAM (rowIndex, reportId)
|
||||
# BEZ stahovani; od shora se hleda prvni uz ulozeny reportId -> vse
|
||||
# pod nim je starsi a uz v Mongo je. Stahuji se jen nove (nahore).
|
||||
#
|
||||
# Prepinace:
|
||||
# --full projit vsechny radky (bez stop-at-known); upsert
|
||||
# chybejicich (rekonciliace).
|
||||
# --dry-run nestahuje ani nepise do DB; jen vypise NOVE reporty.
|
||||
# --limit N zpracovat max N novych radku (test).
|
||||
# =============================================================================
|
||||
from playwright.sync_api import sync_playwright
|
||||
from datetime import datetime
|
||||
from pymongo import MongoClient, ASCENDING
|
||||
from bson.binary import Binary
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import traceback
|
||||
import urllib.parse
|
||||
|
||||
parser = argparse.ArgumentParser(description="Stahovani Lab Reports PDF (XSP) do MongoDB.")
|
||||
parser.add_argument("--full", action="store_true", help="projit vse (bez stop-at-known)")
|
||||
parser.add_argument("--dry-run", action="store_true", help="nestahovat ani nepsat do DB; jen vypsat nove")
|
||||
parser.add_argument("--limit", type=int, default=0, help="max N novych radku (0 = vse)")
|
||||
ARGS = parser.parse_args()
|
||||
|
||||
|
||||
def log(msg):
    """Print *msg* prefixed with the current local time as ``[HH:MM:SS]``, flushing stdout."""
    stamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{stamp}] {msg}", flush=True)
|
||||
|
||||
|
||||
# --- configuration ----------------------------------------------------------
# SECURITY: the portal credentials below are committed to source control.
# They can now be overridden through the environment (XSP_EMAIL, XSP_PASSWORD,
# MONGO_URI); the literals remain only as a backward-compatible fallback and
# should be rotated and removed.
EMAIL = os.environ.get("XSP_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("XSP_PASSWORD", "%zT3Wqfc9)cWua5")
LOGIN_URL = "https://xsp.labcorp.com/"
STUDY = "36940"                  # internal study number used in portal URLs
STUDY_CODE = "77242113UCO3001"   # study code used in logs and file names

MONGO_URI = os.environ.get("MONGO_URI", "mongodb://192.168.1.76:27017")
DB_NAME = "covance"
COLLECTION = "labreports"

# Internal IDs of the sites the grid is filtered to.
SITES = [
    "930539", "930547", "930555", "930556", "930553",
    "930549", "930525", "930536", "930557", "930531",
]

# Persistent browser profile lives next to this script.
_BASE = os.path.dirname(os.path.abspath(__file__))
PROFILE_DIR = os.path.join(_BASE, "browser_profile")
|
||||
|
||||
|
||||
def lab_reports_url():
    """Return the Lab Reports URL for ``STUDY`` with ``SITES`` as a URL-quoted compact JSON array."""
    compact_sites = json.dumps(SITES, separators=(",", ":"))
    quoted_sites = urllib.parse.quote(compact_sites)
    return f"https://xsp.labcorp.com/sponsor/study/{STUDY}/lab-reports?site={quoted_sites}"
|
||||
|
||||
|
||||
# --- formatovani -------------------------------------------------------------
|
||||
_MONTHS = {"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05",
|
||||
"Jun": "06", "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10",
|
||||
"Nov": "11", "Dec": "12"}
|
||||
|
||||
|
||||
def safe(s: str) -> str:
    r"""Return *s* without the characters \ / : * ? " < > | and without outer whitespace.

    A falsy *s* (``None`` or ``""``) yields ``""``.
    """
    text = s if s else ""
    cleaned = re.sub(r'[\\/:*?"<>|]', "", text)
    return cleaned.strip()
|
||||
|
||||
|
||||
def fmt_date(s: str) -> str:
    """Convert a leading 'Mon D, YYYY' date to 'YYYY-MM-DD'; a trailing time is ignored.

    Examples: 'Jun 10, 2026' -> '2026-06-10',
    'Jun 15, 2026 7:49 PM' -> '2026-06-15'.
    Input that does not match, or whose month abbreviation is not a key of
    ``_MONTHS``, is returned through ``safe()`` instead.
    """
    found = re.match(r"\s*([A-Za-z]{3})\w*\s+(\d{1,2}),\s+(\d{4})", s or "")
    if not found:
        return safe(s)
    month_key = found.group(1)[:3]
    if month_key not in _MONTHS:
        return safe(s)
    day = int(found.group(2))
    year = found.group(3)
    return f"{year}-{_MONTHS[month_key]}-{day:02d}"
|
||||
|
||||
|
||||
def build_basename(meta: dict) -> str:
    """Return the human-readable PDF file name for a report; stored as ``fileName``.

    Layout: '<STUDY_CODE> <collected> <site> <subject> <visit> <accession>
    posted <posted>.pdf', where the posted date comes from
    ``meta['postedDisplay']``. Both dates go through ``fmt_date()`` and the
    whole stem is cleaned by ``safe()``.
    """
    pieces = [
        STUDY_CODE,
        fmt_date(meta['collected']),
        meta['site'],
        meta['subject'],
        meta['visit'],
        meta['accession'],
        "posted",
        fmt_date(meta['postedDisplay']),
    ]
    stem = " ".join(f"{piece}" for piece in pieces)
    return safe(stem) + ".pdf"
|
||||
|
||||
|
||||
# --- JS helpery (AG Grid) ---------------------------------------------------
|
||||
# Seznam vsech radku (rowIndex + reportId + data, ktera nepotrebuji vykresleni).
|
||||
JS_ALL_ROWS = r"""() => {
|
||||
let holder = null;
|
||||
for (const el of document.querySelectorAll('.ag-root-wrapper, .ag-root, ag-grid-angular')) {
|
||||
if (el.__agComponent) { holder = el.__agComponent; break; }
|
||||
}
|
||||
if (!holder) return null;
|
||||
const api = (holder.gridApi && holder.gridApi.forEachNode) ? holder.gridApi : holder;
|
||||
if (!api || !api.getDisplayedRowCount) return null;
|
||||
const cnt = api.getDisplayedRowCount();
|
||||
const out = [];
|
||||
for (let i = 0; i < cnt; i++) {
|
||||
const n = api.getDisplayedRowAtIndex(i);
|
||||
if (!n || !n.data) continue;
|
||||
const d = n.data;
|
||||
const fl = (d.fileLinks || []).find(f => f.language === 'English') || (d.fileLinks || [])[0] || {};
|
||||
out.push({
|
||||
rowIndex: i,
|
||||
reportId: d.reportId,
|
||||
fileId: fl.fileId,
|
||||
serverFileName: fl.fileName,
|
||||
postedIso: d.postedDateTime,
|
||||
site: d.siteNum,
|
||||
subject: d.subjectNumber,
|
||||
});
|
||||
}
|
||||
return out;
|
||||
}"""
|
||||
|
||||
# Bunky daneho radku (potrebuji vykresleni -> nejdriv scroll). Accession/Visit/
|
||||
# Collected nejsou v top-level datech (jsou ve 'visits'), beru je z bunek.
|
||||
JS_CELLS = r"""(idx) => {
|
||||
const dedup = s => {
|
||||
s = (s || '').replace(/\s+/g, ' ').trim();
|
||||
const h = s.slice(0, Math.floor(s.length / 2));
|
||||
if (s === h + h) return h;
|
||||
const m = s.match(/^(.*?)\s+\1$/);
|
||||
if (m) return m[1];
|
||||
return s;
|
||||
};
|
||||
const row = document.querySelector('.ag-body-container .ag-row[row-index="' + idx + '"]');
|
||||
if (!row) return null;
|
||||
const get = id => { const c = row.querySelector('[col-id="' + id + '"]'); return c ? dedup(c.textContent) : ''; };
|
||||
return {
|
||||
type: get('type'),
|
||||
accession: get('accessionNumber'),
|
||||
visit: get('visit'),
|
||||
collected: get('visitCollectionDate'),
|
||||
postedDisplay: get('postedDateTime'),
|
||||
};
|
||||
}"""
|
||||
|
||||
JS_SCROLL_TO = r"""(args) => {
|
||||
const [idx, rh] = args;
|
||||
const vp = document.querySelector('.ag-body-viewport');
|
||||
if (!vp) return;
|
||||
vp.scrollTop = Math.max(0, idx * rh - vp.clientHeight / 2);
|
||||
}"""
|
||||
|
||||
JS_ROW_HEIGHT = r"""() => {
|
||||
const r = document.querySelector('.ag-body-container .ag-row');
|
||||
return r ? r.getBoundingClientRect().height || 25 : 25;
|
||||
}"""
|
||||
|
||||
|
||||
# --- login ------------------------------------------------------------------
|
||||
def login(page):
    """Sign in to the portal with ``EMAIL`` / ``PASSWORD`` unless a session already exists.

    If the "Email" field does not become visible within 12 s, the function
    logs that the session is treated as active and returns without logging in.
    """
    def _redirect_done(url):
        # Same predicate as before: no "code=" in the URL, or an "xsp." URL.
        return "code=" not in url or "xsp." in url

    log("LOGIN: otviram login stranku...")
    page.goto(LOGIN_URL)

    try:
        email_box = page.get_by_label("Email")
        email_box.wait_for(state="visible", timeout=12000)
    except Exception:
        log(f"LOGIN: Email pole se neobjevilo -> session aktivni, login preskocen ({page.url})")
        return

    log("LOGIN: zadavam email...")
    email_box.fill(EMAIL)
    page.get_by_role("button", name="Next").click()

    log("LOGIN: cekam na pole pro heslo...")
    password_box = page.get_by_label("Password")
    password_box.wait_for(state="visible", timeout=30000)

    log("LOGIN: zadavam heslo...")
    password_box.fill(PASSWORD)
    page.get_by_role("button", name="Verify").click()

    log("LOGIN: cekam na presmerovani po prihlaseni...")
    try:
        page.wait_for_url(_redirect_done, timeout=60000)
    except Exception:
        # A redirect timeout is tolerated; the run simply continues.
        log("LOGIN: wait_for_url vyprsel, pokracuji.")
    page.wait_for_timeout(3000)
    log(f"LOGIN: prihlaseni hotovo ({page.url})")
|
||||
|
||||
|
||||
def open_grid(page):
    """Open the Lab Reports grid and read its rows once their count is stable.

    Polls ``JS_ALL_ROWS`` at most 25 times, 2 s apart, and stops once two
    consecutive polls return the same non-zero number of rows.

    Returns:
        ``(rows, row_height)``: the last ``JS_ALL_ROWS`` result (``[]`` when
        it was empty or null) and the value of ``JS_ROW_HEIGHT``.
    """
    log(f"GRID: navigace na Lab Reports ({len(SITES)} center)...")
    page.goto(lab_reports_url())
    log("GRID: cekam na radky (.ag-row)...")
    page.wait_for_selector(".ag-body-container .ag-row", timeout=120000)

    # Wait until the grid API reports a stable number of rows.
    last_count = -1
    rows = None
    for attempt in range(1, 26):
        rows = page.evaluate(JS_ALL_ROWS)
        count = len(rows) if rows else 0
        log(f" ...kontrola #{attempt}: rows={count}")
        stable = bool(rows) and count > 0 and count == last_count
        if stable:
            break
        last_count = count
        page.wait_for_timeout(2000)

    row_height = page.evaluate(JS_ROW_HEIGHT)
    log(f"GRID: nacteno {len(rows) if rows else 0} radku, rowHeight={row_height}px.")
    return rows or [], row_height
|
||||
|
||||
|
||||
def download_pdf_bytes(page, idx):
    """Click the "English" download link of grid row *idx* and return the file bytes.

    Args:
        page: Playwright page showing the grid; the row must be rendered.
        idx: value of the row's ``row-index`` attribute.

    Returns:
        The downloaded file content as ``bytes``.
    """
    row_links = f'.ag-body-container .ag-row[row-index="{idx}"] a.dl-link'
    english_link = page.locator(row_links, has_text="English").first

    with page.expect_download(timeout=60000) as download_info:
        english_link.click()

    # The download is read back from the path Playwright reports for it.
    downloaded_path = download_info.value.path()
    with open(downloaded_path, "rb") as handle:
        return handle.read()
|
||||
|
||||
|
||||
def upsert(col, rec, cells, data, now):
    """Insert or update one lab report (metadata + PDF) keyed by ``record_id``.

    Args:
        col: MongoDB collection receiving the document.
        rec: row dict from the grid data (reportId, site, subject, postedIso,
            fileId, serverFileName).
        cells: visible cell texts of the row (type, accession, visit,
            collected, postedDisplay).
        data: raw PDF bytes.
        now: timestamp string stored in firstSeen / lastSeen.

    Returns:
        ``"insert"`` for a new document, ``"update"`` when the PDF hash or the
        visible fields changed (previous state is pushed to ``history``), or
        ``"same"`` when only ``lastSeen`` was refreshed.
    """
    report_type = cells["type"]
    subject = rec["subject"]
    accession = cells["accession"]
    visit = cells["visit"]
    collected = cells["collected"]
    site = rec["site"]
    posted = cells["postedDisplay"]

    # Snapshot of the values as displayed in the grid; used for change detection.
    fields = {
        "Type": report_type,
        "Subject": subject,
        "Accession": accession,
        "Visit": visit,
        "Collected Date": collected,
        "Site Number": site,
        "Posted": posted,
    }
    sha = hashlib.sha256(data).hexdigest()
    meta = {
        "site": site,
        "subject": subject,
        "accession": accession,
        "visit": visit,
        "collected": collected,
        "postedDisplay": posted,
    }
    derived = {
        "study": STUDY,
        "studyCode": STUDY_CODE,
        "type": report_type or "Lab Result",
        "site": site,
        "subject": subject,
        "accession": accession,
        "visit": visit,
        "collected": fmt_date(collected),
        "posted": posted,
        "postedIso": rec["postedIso"],
        "fileId": rec["fileId"],
        "serverFileName": rec["serverFileName"],
        "fields": fields,
        "fileName": build_basename(meta),
        "pdf": Binary(data),
        "pdfSize": len(data),
        "pdfSha256": sha,
    }

    rid = rec["reportId"]
    existing = col.find_one({"record_id": rid}, {"pdfSha256": 1, "fields": 1, "lastSeen": 1})

    if existing is None:
        new_doc = {"record_id": rid, **derived,
                   "firstSeen": now, "lastSeen": now, "history": []}
        col.insert_one(new_doc)
        return "insert"

    by_id = {"_id": existing["_id"]}
    unchanged = existing.get("pdfSha256") == sha and existing.get("fields") == fields
    if unchanged:
        col.update_one(by_id, {"$set": {"lastSeen": now}})
        return "same"

    # Keep the previous state in history before overwriting it.
    previous = {
        "date": existing.get("lastSeen"),
        "fields": existing.get("fields"),
        "pdfSha256": existing.get("pdfSha256"),
    }
    col.update_one(
        by_id,
        {"$push": {"history": previous},
         "$set": {**derived, "lastSeen": now}},
    )
    return "update"
|
||||
|
||||
|
||||
def _select_todo(rows, existing_ids):
    """Choose the grid rows to download, honouring stop-at-known and --limit."""
    todo = []
    for rec in rows:
        if rec["reportId"] in existing_ids:
            if ARGS.full:
                continue
            log(f"STOP-AT-KNOWN: rowIndex {rec['rowIndex']} (reportId {rec['reportId'][:12]}…) "
                f"uz v Mongo -> koncim vyber (zbytek je starsi).")
            break
        todo.append(rec)
    if ARGS.limit:
        if ARGS.full:
            todo = todo[:ARGS.limit]
        else:
            # Stop-at-known relies on "everything below the first known row is
            # stored". Keeping the NEWEST N rows would store rows above the
            # skipped ones and hide them from every later stop-at-known run,
            # so keep the N rows closest to the known boundary instead.
            todo = todo[-ARGS.limit:]
    return todo


def _download_rows(page, col, todo, row_height, now, existing_ids):
    """Download and store each row of *todo*; return (new, updated, same, failed_row_indexes)."""
    new_cnt = upd_cnt = same_cnt = 0
    failed = []
    total = len(todo)
    for k, rec in enumerate(todo, 1):
        idx = rec["rowIndex"]
        try:
            # The grid is virtualised: scroll the row into the DOM first.
            page.evaluate(JS_SCROLL_TO, [idx, row_height])
            page.wait_for_selector(f'.ag-body-container .ag-row[row-index="{idx}"]', timeout=15000)
            page.wait_for_timeout(120)
            cells = page.evaluate(JS_CELLS, idx)
            if not cells:
                raise RuntimeError("nepodarilo se precist bunky radku")
            meta = {"site": rec["site"], "subject": rec["subject"],
                    "accession": cells["accession"], "visit": cells["visit"],
                    "collected": cells["collected"], "postedDisplay": cells["postedDisplay"]}
            fname = build_basename(meta)

            if ARGS.dry_run:
                log(f">>> {k}/{total} [DRY] NOVY rowIdx {idx} ({rec['reportId'][:12]}…): {fname}")
                new_cnt += 1
                continue

            data = download_pdf_bytes(page, idx)
            action = upsert(col, rec, cells, data, now)
            existing_ids.add(rec["reportId"])
            if action == "insert":
                new_cnt += 1
                log(f">>> {k}/{total} INSERT rowIdx {idx} ({len(data)//1024} KB): {fname}")
            elif action == "update":
                upd_cnt += 1
                log(f">>> {k}/{total} UPDATE rowIdx {idx}: {fname}")
            else:
                same_cnt += 1
                log(f">>> {k}/{total} SAME rowIdx {idx}: {fname}")
        except Exception as e:
            # Deliberate best effort: one broken row must not stop the run.
            failed.append(idx)
            log(f"CHYBA rowIdx {idx} ({rec['reportId'][:12]}…): {e!r} — pokracuji.")
    return new_cnt, upd_cnt, same_cnt, failed


def main():
    """Run one synchronisation of Lab Report PDFs from the XSP grid into MongoDB.

    Steps: connect to MongoDB (indexes are created unless --dry-run), load the
    record_ids already stored for STUDY, start a persistent Chromium profile,
    log in, read the grid, keep only CZ sites, select the rows to fetch and
    download/upsert them one by one. The browser context and the MongoDB
    client are closed even when a step raises.
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log(f"START: studie {STUDY_CODE} ({STUDY}) -> {DB_NAME}.{COLLECTION}, "
        f"{'DRY-RUN' if ARGS.dry_run else 'ZAPIS'}"
        f"{' [FULL]' if ARGS.full else ' [stop-at-known]'}"
        f"{f', limit {ARGS.limit}' if ARGS.limit else ''}.")

    col = None
    # MongoClient is a context manager; leaving the block closes the connection.
    with MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000) as client:
        client.admin.command("ping")
        if not ARGS.dry_run:
            col = client[DB_NAME][COLLECTION]
            col.create_index([("record_id", ASCENDING)], unique=True)
            for f in ("study", "site", "subject", "accession", "postedIso", "fileId"):
                col.create_index([(f, ASCENDING)])
        existing_ids = {d["record_id"] for d in
                        client[DB_NAME][COLLECTION].find({"study": STUDY}, {"record_id": 1})}
        log(f"START: v Mongo je {len(existing_ids)} reportu pro tuto studii.")

        with sync_playwright() as p:
            context = p.chromium.launch_persistent_context(
                user_data_dir=PROFILE_DIR,
                headless=False,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--start-maximized",
                    "--disable-restore-session-state",
                    "--disable-session-crashed-bubble",
                ],
                no_viewport=True,
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
                accept_downloads=True,
            )
            try:
                context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
                page = context.new_page()
                log("START: prohlizec spusten.")

                login(page)
                rows, row_height = open_grid(page)

                # SAFETY NET: CZ sites only. The URL filter should already
                # return just the CZ sites, but if it is lost (e.g. after a
                # reload) this check prevents downloading other sites' reports.
                non_cz = [r for r in rows if not str(r["site"]).startswith("CZ")]
                if non_cz:
                    log(f"POZOR: {len(non_cz)} ne-CZ radku v gridu (napr. {non_cz[0]['site']}) "
                        f"-> filtruji jen CZ. Zkontroluj URL filtr center!")
                    rows = [r for r in rows if str(r["site"]).startswith("CZ")]
                log(f"GRID: po CZ-pojistce {len(rows)} CZ radku.")

                # Select rows top-down (stop-at-known unless the full mode is on).
                todo = _select_todo(rows, existing_ids)
                log(f"PLAN: {len(todo)} novych radku ke stazeni "
                    f"(z {len(rows)} v gridu, {len(existing_ids)} uz v Mongo).")

                new_cnt, upd_cnt, same_cnt, failed = _download_rows(
                    page, col, todo, row_height, now, existing_ids)

                log(f"KONEC: nove={new_cnt}, update={upd_cnt}, beze zmeny={same_cnt}, chyby={len(failed)}.")
                if failed:
                    log(f"KONEC: SELHALY rowIndexy: {failed}")
                    if not ARGS.full:
                        # A failed row that lies below a stored row is skipped
                        # by every later stop-at-known run.
                        log("KONEC: POZOR - selhane radky stop-at-known priste preskoci; "
                            "spust plny rezim [FULL].")
            finally:
                context.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run main(), report a fatal error, keep the console
    # window open until Enter is pressed, then exit with a meaningful status
    # (previously a crashed run still exited with status 0).
    exit_code = 0
    try:
        main()
    except Exception as e:
        log(f"FATAL: beh spadl: {e!r}")
        traceback.print_exc()
        exit_code = 1
    finally:
        try:
            input("\n[Enter] pro zavreni tohoto okna...")
        except EOFError:
            # No interactive stdin (e.g. scheduled run) -> nothing to wait for.
            pass
    raise SystemExit(exit_code)
|
||||
@@ -0,0 +1,53 @@
|
||||
# download_lab_reports_v1.3.py
|
||||
|
||||
**Verze:** 1.3 | **Datum:** 2026-06-16
|
||||
|
||||
Stahuje PDF **Lab Reports** ze `xsp.labcorp.com` (studie **77242113UCO3001**,
|
||||
interní `36940`, 10 CZ center) přímo do **MongoDB** (`covance.labreports`) —
|
||||
metadata + skutečné PDF (inline Binary). **Na disk neukládá.**
|
||||
|
||||
## Režim synchronizace — proměnná `SYNC_MODE`
|
||||
Nahoře ve skriptu:
|
||||
```python
|
||||
SYNC_MODE = "delta" # "delta" | "fullsync"
|
||||
```
|
||||
- **delta** — jen NOVÉ reporty přes interní `reportId` (stop-at-known). List je
|
||||
Posted DESC; shora se hledá první už uložený `reportId` → vše pod ním je
|
||||
starší a v Mongo je. Rychlé, běžný provoz.
|
||||
- **fullsync** — projde VŠECHNY řádky a doplní chybějící/změněné (rekonciliace).
|
||||
|
||||
CLI přepíše proměnnou: `--delta` / `--fullsync`.
|
||||
|
||||
## Klíč `record_id = reportId`
|
||||
Stabilní 32-hex ID dokumentu z dat AG Gridu, napříč všemi řádky **unikátní**
|
||||
a **perzistentní v čase** (ověřeno: stejné `reportId` vrací i jiný grid pro
|
||||
totéž centrum). Řeší různá PDF se shodnými viditelnými metadaty (reissue se
|
||||
shodným Posted i na minutu) — metadata na klíč nestačí.
|
||||
|
||||
## Odkud data
|
||||
Z in-memory dat AG Gridu (`__agComponent.gridApi`): `reportId`,
|
||||
`fileLinks[].fileId`+`fileName` (server), `postedDateTime` (moment.js →
|
||||
převedeno na ISO string), `siteNum`, `subjectNumber`. Accession/Visit/Collected
|
||||
leží v gridu níž (ve `visits`) → z buněk (po scrollu). `accession` se čte
|
||||
**doslova** (reálný identifikátor kitu), neukládá se nijak odvozeně.
|
||||
|
||||
## MongoDB dokument
|
||||
`record_id`(=reportId), `study`, `studyCode`, `type`, `site`, `subject`,
|
||||
`accession`, `visit`, `collected`, `posted`, `postedIso` (string), `fileId`,
|
||||
`serverFileName`, `fields`, `fileName`, **`pdf`** (Binary), `pdfSize`,
|
||||
`pdfSha256`, `firstSeen`, `lastSeen`, `history[]`.
|
||||
|
||||
## Spuštění (z terminálu uživatele — GUI Chrome)
|
||||
```
|
||||
python download_lab_reports_v1.3.py # podle SYNC_MODE (default delta)
|
||||
python download_lab_reports_v1.3.py --fullsync # rekonciliace
|
||||
python download_lab_reports_v1.3.py --dry-run # vypíše nové, nestahuje
|
||||
python download_lab_reports_v1.3.py --limit 5
|
||||
```
|
||||
|
||||
## Změny v1.3
|
||||
- `SYNC_MODE` proměnná (delta/fullsync) + CLI `--delta`/`--fullsync`.
|
||||
- Oprava `postedIso`: v1.2 ukládal celý moment.js objekt; nyní čistý ISO string.
|
||||
|
||||
## Nahrazuje
|
||||
`download_lab_reports_v1.2.py` → `TRASH/`.
|
||||
@@ -0,0 +1,408 @@
|
||||
# =============================================================================
|
||||
# Název: download_lab_reports_v1.3.py
|
||||
# Verze: 1.3
|
||||
# Datum: 2026-06-16
|
||||
# Popis: Stahuje PDF Lab Reports ze xsp.labcorp.com pro studii 77242113UCO3001
|
||||
# (interni cislo 36940), filtrovane na 10 ceskych center (CZ),
|
||||
# a uklada je PRIMO do MongoDB (db covance, kolekce labreports) —
|
||||
# metadata z tabulky + skutecne PDF (inline Binary). Na disk NEUKLADA.
|
||||
#
|
||||
# REZIM SYNCHRONIZACE: promenna SYNC_MODE nahore.
|
||||
# "delta" = jen NOVE reporty pres interni reportId (stop-at-known).
|
||||
# List je Posted DESC; shora se hleda prvni uz ulozeny
|
||||
# reportId -> vse pod nim je starsi a uz v Mongo je.
|
||||
# "fullsync" = projit VSECHNY radky a doplnit chybejici / zmenene
|
||||
# (rekonciliace). Pomalejsi, stahuje vse chybejici.
|
||||
# CLI prepise promennou: --delta / --fullsync.
|
||||
#
|
||||
# KLIC: record_id = reportId (z dat AG Gridu) — stabilni 32-hex ID
|
||||
# dokumentu, NAPRIC vsemi radky UNIKATNI a perzistentni v case
|
||||
# (overeno: stejne reportId vraci i jiny grid pro totez centrum).
|
||||
# Resi pripad ruznych PDF se SHODNYMI viditelnymi metadaty.
|
||||
#
|
||||
# Zmeny v1.3: + SYNC_MODE promenna (delta/fullsync); oprava postedIso
|
||||
# (drive se ukladal cely moment.js objekt -> ted cisty ISO).
|
||||
# =============================================================================
|
||||
from playwright.sync_api import sync_playwright
|
||||
from datetime import datetime
|
||||
from pymongo import MongoClient, ASCENDING
|
||||
from bson.binary import Binary
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import traceback
|
||||
import urllib.parse
|
||||
|
||||
# ============================================================================
|
||||
# REZIM SYNCHRONIZACE — nastav zde (CLI --delta / --fullsync ma prednost)
|
||||
# ============================================================================
|
||||
# "delta"    = only new reports (stop-at-known via reportId)
# "fullsync" = walk every row and fill in missing/changed reports
SYNC_MODE = "delta"
# ============================================================================

parser = argparse.ArgumentParser(description="Stahovani Lab Reports PDF (XSP) do MongoDB.")
parser.add_argument("--delta", action="store_true", help="vynutit rezim delta")
parser.add_argument("--fullsync", action="store_true", help="vynutit rezim fullsync")
parser.add_argument("--dry-run", action="store_true", help="nestahovat ani nepsat do DB; jen vypsat nove")
parser.add_argument("--limit", type=int, default=0, help="max N novych radku (0 = vse)")
ARGS = parser.parse_args()

# Mode resolution: the command line beats the SYNC_MODE variable, and --delta
# beats --fullsync when both are given.
if ARGS.delta:
    _mode = "delta"
elif ARGS.fullsync:
    _mode = "fullsync"
else:
    _mode = SYNC_MODE
FULLSYNC = _mode == "fullsync"
|
||||
|
||||
|
||||
def log(msg):
    """Print *msg* prefixed with the current local time as ``[HH:MM:SS]``.

    Output is flushed immediately so progress is visible while the run is
    still in progress.
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{stamp}] {msg}", flush=True)
|
||||
|
||||
|
||||
# --- konfigurace ------------------------------------------------------------
|
||||
# SECURITY: credentials should not live in source control. They can now be
# supplied through the XSP_EMAIL / XSP_PASSWORD environment variables; the
# literals below remain only as a backward-compatible fallback.
# TODO(review): rotate this password and remove the hard-coded fallback.
EMAIL = os.environ.get("XSP_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("XSP_PASSWORD", "%zT3Wqfc9)cWua5")
LOGIN_URL = "https://xsp.labcorp.com/"
STUDY = "36940"                  # internal study number used in portal URLs
STUDY_CODE = "77242113UCO3001"   # protocol code used in stored documents and file names

MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "covance"
COLLECTION = "labreports"

# Site identifiers passed to the Lab Reports URL filter.
SITES = [
    "930539", "930547", "930555", "930556", "930553",
    "930549", "930525", "930536", "930557", "930531",
]

# Persistent Chromium profile stored next to this script (keeps the login session).
_BASE = os.path.dirname(os.path.abspath(__file__))
PROFILE_DIR = os.path.join(_BASE, "browser_profile")
|
||||
|
||||
|
||||
def lab_reports_url():
    """Return the Lab Reports URL for STUDY, filtered to the sites in SITES.

    The site list is sent as a compact JSON array, percent-encoded, in the
    ``site`` query parameter.
    """
    compact_sites = json.dumps(SITES, separators=(",", ":"))
    encoded_sites = urllib.parse.quote(compact_sites)
    return f"https://xsp.labcorp.com/sponsor/study/{STUDY}/lab-reports?site={encoded_sites}"
|
||||
|
||||
|
||||
# --- formatovani -------------------------------------------------------------
|
||||
# English three-letter month abbreviation -> zero-padded month number.
_MONTHS = {"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05",
           "Jun": "06", "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10",
           "Nov": "11", "Dec": "12"}


def safe(s: str) -> str:
    """Return *s* without characters that are illegal in Windows file names.

    ``None`` (or any falsy value) yields an empty string; surrounding
    whitespace is stripped.
    """
    return re.sub(r'[\\/:*?"<>|]', "", s or "").strip()


def fmt_date(s: str) -> str:
    """Convert a grid date to ISO format.

    'Jun 10, 2026' and 'Jun 15, 2026 7:49 PM' -> '2026-06-10' / '2026-06-15'.
    The month name is matched case-insensitively ('JUN', 'june') and the
    whitespace after the comma is optional. Anything that does not look like
    such a date is returned through :func:`safe` unchanged otherwise.
    """
    m = re.match(r"\s*([A-Za-z]{3})\w*\s+(\d{1,2}),\s*(\d{4})", s or "")
    if m:
        # Normalise the case so that 'JUN' / 'jun' hit the same table entry.
        month = _MONTHS.get(m.group(1).title())
        if month:
            return f"{m.group(3)}-{month}-{int(m.group(2)):02d}"
    return safe(s)
|
||||
|
||||
|
||||
def build_basename(meta: dict) -> str:
    """Build the PDF file name for one report.

    Layout: ``<STUDY_CODE> <collected ISO> <site> <subject> <visit>
    <accession> posted <posted ISO>.pdf``, with characters that are illegal
    in file names removed.

    Args:
        meta: dict with keys collected, site, subject, visit, accession and
            postedDisplay.
    """
    pieces = [
        STUDY_CODE,
        fmt_date(meta['collected']),
        meta['site'],
        meta['subject'],
        meta['visit'],
        meta['accession'],
        "posted",
        fmt_date(meta['postedDisplay']),
    ]
    stem = " ".join(f"{piece}" for piece in pieces)
    return safe(stem) + ".pdf"
|
||||
|
||||
|
||||
# --- JS helpery (AG Grid) ---------------------------------------------------
|
||||
# Browser-side snippet: list every displayed grid row. It looks for an element
# carrying `__agComponent`, uses its grid API and returns one object per row
# (rowIndex, reportId, fileId, serverFileName, postedIso, site, subject), or
# null when no grid API is found. postedDateTime is a moment.js object in the
# grid data, so it is converted to a plain ISO string (otherwise the whole
# moment object would be serialised). The file link with language 'English'
# is preferred, falling back to the first link.
JS_ALL_ROWS = r"""() => {
  let holder = null;
  for (const el of document.querySelectorAll('.ag-root-wrapper, .ag-root, ag-grid-angular')) {
    if (el.__agComponent) { holder = el.__agComponent; break; }
  }
  if (!holder) return null;
  const api = (holder.gridApi && holder.gridApi.forEachNode) ? holder.gridApi : holder;
  if (!api || !api.getDisplayedRowCount) return null;
  const toIso = v => {
    if (v == null) return null;
    if (typeof v === 'string') return v;
    if (v._i && typeof v._i === 'string') return v._i; // puvodni serverove ISO s offsetem
    if (typeof v.toISOString === 'function') { try { return v.toISOString(); } catch (e) {} }
    return String(v);
  };
  const cnt = api.getDisplayedRowCount();
  const out = [];
  for (let i = 0; i < cnt; i++) {
    const n = api.getDisplayedRowAtIndex(i);
    if (!n || !n.data) continue;
    const d = n.data;
    const fl = (d.fileLinks || []).find(f => f.language === 'English') || (d.fileLinks || [])[0] || {};
    out.push({
      rowIndex: i,
      reportId: d.reportId,
      fileId: fl.fileId,
      serverFileName: fl.fileName,
      postedIso: toIso(d.postedDateTime),
      site: d.siteNum,
      subject: d.subjectNumber,
    });
  }
  return out;
}"""
|
||||
|
||||
JS_CELLS = r"""(idx) => {
|
||||
const dedup = s => {
|
||||
s = (s || '').replace(/\s+/g, ' ').trim();
|
||||
const h = s.slice(0, Math.floor(s.length / 2));
|
||||
if (s === h + h) return h;
|
||||
const m = s.match(/^(.*?)\s+\1$/);
|
||||
if (m) return m[1];
|
||||
return s;
|
||||
};
|
||||
const row = document.querySelector('.ag-body-container .ag-row[row-index="' + idx + '"]');
|
||||
if (!row) return null;
|
||||
const get = id => { const c = row.querySelector('[col-id="' + id + '"]'); return c ? dedup(c.textContent) : ''; };
|
||||
return {
|
||||
type: get('type'),
|
||||
accession: get('accessionNumber'),
|
||||
visit: get('visit'),
|
||||
collected: get('visitCollectionDate'),
|
||||
postedDisplay: get('postedDateTime'),
|
||||
};
|
||||
}"""
|
||||
|
||||
JS_SCROLL_TO = r"""(args) => {
|
||||
const [idx, rh] = args;
|
||||
const vp = document.querySelector('.ag-body-viewport');
|
||||
if (!vp) return;
|
||||
vp.scrollTop = Math.max(0, idx * rh - vp.clientHeight / 2);
|
||||
}"""
|
||||
|
||||
JS_ROW_HEIGHT = r"""() => {
|
||||
const r = document.querySelector('.ag-body-container .ag-row');
|
||||
return r ? r.getBoundingClientRect().height || 25 : 25;
|
||||
}"""
|
||||
|
||||
|
||||
# --- login ------------------------------------------------------------------
|
||||
def login(page):
    """Sign in to the XSP portal unless the browser profile is already logged in.

    Opens LOGIN_URL and waits up to 12 s for the "Email" field. If the field
    never becomes visible the function assumes an active session and returns
    without entering credentials. Otherwise it submits EMAIL, waits for the
    password field, submits PASSWORD and waits for the post-login redirect.

    Args:
        page: Playwright page used for the whole run.
    """
    log("LOGIN: otviram login stranku...")
    page.goto(LOGIN_URL)

    try:
        email_box = page.get_by_label("Email")
        email_box.wait_for(state="visible", timeout=12000)
    except Exception:
        # Best effort: any failure to see the e-mail field is treated as
        # "session still valid".
        log(f"LOGIN: Email pole se neobjevilo -> session aktivni, login preskocen ({page.url})")
        return

    log("LOGIN: zadavam email...")
    email_box.fill(EMAIL)
    page.get_by_role("button", name="Next").click()

    log("LOGIN: cekam na pole pro heslo...")
    password_box = page.get_by_label("Password")
    password_box.wait_for(state="visible", timeout=30000)

    log("LOGIN: zadavam heslo...")
    password_box.fill(PASSWORD)
    page.get_by_role("button", name="Verify").click()

    log("LOGIN: cekam na presmerovani po prihlaseni...")
    try:
        page.wait_for_url(lambda url: "code=" not in url or "xsp." in url, timeout=60000)
    except Exception:
        # A redirect timeout is not fatal; the grid wait that follows will
        # fail loudly if the login did not actually succeed.
        log("LOGIN: wait_for_url vyprsel, pokracuji.")

    # Short settle time after the redirect.
    page.wait_for_timeout(3000)
    log(f"LOGIN: prihlaseni hotovo ({page.url})")
|
||||
|
||||
|
||||
def open_grid(page):
    """Open the Lab Reports grid and wait until its row count stops changing.

    Polls the grid (JS_ALL_ROWS) up to 25 times, 2 s apart, and stops as soon
    as two consecutive polls report the same non-zero number of rows.

    Args:
        page: logged-in Playwright page.

    Returns:
        Tuple ``(rows, row_height)``: the list of row dicts from the last poll
        (empty list when none were read) and the row height in pixels.
    """
    log(f"GRID: navigace na Lab Reports ({len(SITES)} center)...")
    page.goto(lab_reports_url())
    log("GRID: cekam na radky (.ag-row)...")
    page.wait_for_selector(".ag-body-container .ag-row", timeout=120000)

    rows = None
    last_count = -1
    for attempt in range(1, 26):
        rows = page.evaluate(JS_ALL_ROWS)
        count = len(rows or [])
        log(f" ...kontrola #{attempt}: rows={count}")
        if count > 0 and count == last_count:
            break
        last_count = count
        page.wait_for_timeout(2000)

    row_height = page.evaluate(JS_ROW_HEIGHT)
    loaded = rows or []
    log(f"GRID: nacteno {len(loaded)} radku, rowHeight={row_height}px.")
    return loaded, row_height
|
||||
|
||||
|
||||
def download_pdf_bytes(page, idx):
    """Click the "English" download link of grid row *idx* and return the file bytes.

    Args:
        page: Playwright page showing the grid; the row must be rendered.
        idx: value of the row's ``row-index`` attribute.

    Returns:
        The downloaded file content as ``bytes``.
    """
    row_links = f'.ag-body-container .ag-row[row-index="{idx}"] a.dl-link'
    english_link = page.locator(row_links, has_text="English").first

    with page.expect_download(timeout=60000) as download_info:
        english_link.click()

    # The download is read back from the path Playwright reports for it.
    downloaded_path = download_info.value.path()
    with open(downloaded_path, "rb") as handle:
        return handle.read()
|
||||
|
||||
|
||||
def upsert(col, rec, cells, data, now):
    """Insert or update one lab report (metadata + PDF) keyed by ``record_id``.

    Args:
        col: MongoDB collection receiving the document.
        rec: row dict from the grid data (reportId, site, subject, postedIso,
            fileId, serverFileName).
        cells: visible cell texts of the row (type, accession, visit,
            collected, postedDisplay).
        data: raw PDF bytes.
        now: timestamp string stored in firstSeen / lastSeen.

    Returns:
        ``"insert"`` for a new document, ``"update"`` when the PDF hash or the
        visible fields changed (previous state is pushed to ``history``), or
        ``"same"`` when only ``lastSeen`` was refreshed.
    """
    report_type = cells["type"]
    subject = rec["subject"]
    accession = cells["accession"]
    visit = cells["visit"]
    collected = cells["collected"]
    site = rec["site"]
    posted = cells["postedDisplay"]

    # Snapshot of the values as displayed in the grid; used for change detection.
    fields = {
        "Type": report_type,
        "Subject": subject,
        "Accession": accession,
        "Visit": visit,
        "Collected Date": collected,
        "Site Number": site,
        "Posted": posted,
    }
    sha = hashlib.sha256(data).hexdigest()
    name_meta = {**rec, "accession": accession,
                 "visit": visit, "collected": collected,
                 "postedDisplay": posted}
    derived = {
        "study": STUDY,
        "studyCode": STUDY_CODE,
        "type": report_type or "Lab Result",
        "site": site,
        "subject": subject,
        "accession": accession,
        "visit": visit,
        "collected": fmt_date(collected),
        "posted": posted,
        "postedIso": rec["postedIso"],
        "fileId": rec["fileId"],
        "serverFileName": rec["serverFileName"],
        "fields": fields,
        "fileName": build_basename(name_meta),
        "pdf": Binary(data),
        "pdfSize": len(data),
        "pdfSha256": sha,
    }

    rid = rec["reportId"]
    existing = col.find_one({"record_id": rid}, {"pdfSha256": 1, "fields": 1, "lastSeen": 1})

    if existing is None:
        new_doc = {"record_id": rid, **derived,
                   "firstSeen": now, "lastSeen": now, "history": []}
        col.insert_one(new_doc)
        return "insert"

    by_id = {"_id": existing["_id"]}
    unchanged = existing.get("pdfSha256") == sha and existing.get("fields") == fields
    if unchanged:
        col.update_one(by_id, {"$set": {"lastSeen": now}})
        return "same"

    # Keep the previous state in history before overwriting it.
    previous = {
        "date": existing.get("lastSeen"),
        "fields": existing.get("fields"),
        "pdfSha256": existing.get("pdfSha256"),
    }
    col.update_one(
        by_id,
        {"$push": {"history": previous},
         "$set": {**derived, "lastSeen": now}},
    )
    return "update"
|
||||
|
||||
|
||||
def _select_todo(rows, existing_ids):
    """Choose the grid rows to download for the active sync mode and --limit."""
    todo = []
    for rec in rows:
        if rec["reportId"] in existing_ids:
            if FULLSYNC:
                continue    # fullsync: skip the known row and keep going
            log(f"DELTA stop-at-known: rowIndex {rec['rowIndex']} "
                f"(reportId {rec['reportId'][:12]}…) uz v Mongo -> koncim (zbytek je starsi).")
            break           # delta: the first known row ends the selection
        todo.append(rec)
    if ARGS.limit:
        if FULLSYNC:
            todo = todo[:ARGS.limit]
        else:
            # Delta relies on "everything below the first known row is
            # stored". Keeping the NEWEST N rows would store rows above the
            # skipped ones and hide them from every later delta run, so keep
            # the N rows closest to the known boundary instead.
            todo = todo[-ARGS.limit:]
    return todo


def _download_rows(page, col, todo, row_height, now, existing_ids):
    """Download and store each row of *todo*; return (new, updated, same, failed_row_indexes)."""
    new_cnt = upd_cnt = same_cnt = 0
    failed = []
    total = len(todo)
    for k, rec in enumerate(todo, 1):
        idx = rec["rowIndex"]
        try:
            # The grid is virtualised: scroll the row into the DOM first.
            page.evaluate(JS_SCROLL_TO, [idx, row_height])
            page.wait_for_selector(f'.ag-body-container .ag-row[row-index="{idx}"]', timeout=15000)
            page.wait_for_timeout(120)
            cells = page.evaluate(JS_CELLS, idx)
            if not cells:
                raise RuntimeError("nepodarilo se precist bunky radku")
            meta = {"site": rec["site"], "subject": rec["subject"],
                    "accession": cells["accession"], "visit": cells["visit"],
                    "collected": cells["collected"], "postedDisplay": cells["postedDisplay"]}
            fname = build_basename(meta)

            if ARGS.dry_run:
                log(f">>> {k}/{total} [DRY] NOVY rowIdx {idx} ({rec['reportId'][:12]}…): {fname}")
                new_cnt += 1
                continue

            data = download_pdf_bytes(page, idx)
            action = upsert(col, rec, cells, data, now)
            existing_ids.add(rec["reportId"])
            if action == "insert":
                new_cnt += 1
                log(f">>> {k}/{total} INSERT rowIdx {idx} ({len(data)//1024} KB): {fname}")
            elif action == "update":
                upd_cnt += 1
                log(f">>> {k}/{total} UPDATE rowIdx {idx}: {fname}")
            else:
                same_cnt += 1
                log(f">>> {k}/{total} SAME rowIdx {idx}: {fname}")
        except Exception as e:
            # Deliberate best effort: one broken row must not stop the run.
            failed.append(idx)
            log(f"CHYBA rowIdx {idx} ({rec['reportId'][:12]}…): {e!r} — pokracuji.")
    return new_cnt, upd_cnt, same_cnt, failed


def main():
    """Run one synchronisation of Lab Report PDFs from the XSP grid into MongoDB.

    Steps: connect to MongoDB (indexes are created unless --dry-run), load the
    record_ids already stored for STUDY, start a persistent Chromium profile,
    log in, read the grid, keep only CZ sites, select the rows for the active
    mode (delta / fullsync) and download/upsert them one by one. The browser
    context and the MongoDB client are closed even when a step raises.
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log(f"START: studie {STUDY_CODE} ({STUDY}) -> {DB_NAME}.{COLLECTION}, "
        f"rezim={'FULLSYNC' if FULLSYNC else 'DELTA'}"
        f"{', DRY-RUN' if ARGS.dry_run else ''}"
        f"{f', limit {ARGS.limit}' if ARGS.limit else ''}.")

    col = None
    # MongoClient is a context manager; leaving the block closes the connection.
    with MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000) as client:
        client.admin.command("ping")
        if not ARGS.dry_run:
            col = client[DB_NAME][COLLECTION]
            col.create_index([("record_id", ASCENDING)], unique=True)
            for f in ("study", "site", "subject", "accession", "postedIso", "fileId"):
                col.create_index([(f, ASCENDING)])
        existing_ids = {d["record_id"] for d in
                        client[DB_NAME][COLLECTION].find({"study": STUDY}, {"record_id": 1})}
        log(f"START: v Mongo je {len(existing_ids)} reportu pro tuto studii.")

        with sync_playwright() as p:
            context = p.chromium.launch_persistent_context(
                user_data_dir=PROFILE_DIR,
                headless=False,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--start-maximized",
                    "--disable-restore-session-state",
                    "--disable-session-crashed-bubble",
                ],
                no_viewport=True,
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
                accept_downloads=True,
            )
            try:
                context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
                page = context.new_page()
                log("START: prohlizec spusten.")

                login(page)
                rows, row_height = open_grid(page)

                # SAFETY NET: CZ sites only (in case the URL filter was lost).
                non_cz = [r for r in rows if not str(r["site"]).startswith("CZ")]
                if non_cz:
                    log(f"POZOR: {len(non_cz)} ne-CZ radku v gridu (napr. {non_cz[0]['site']}) "
                        f"-> filtruji jen CZ. Zkontroluj URL filtr center!")
                    rows = [r for r in rows if str(r["site"]).startswith("CZ")]
                log(f"GRID: po CZ-pojistce {len(rows)} CZ radku.")

                # Select the rows to process according to the sync mode.
                todo = _select_todo(rows, existing_ids)
                log(f"PLAN [{'FULLSYNC' if FULLSYNC else 'DELTA'}]: {len(todo)} novych radku ke stazeni "
                    f"(z {len(rows)} v gridu, {len(existing_ids)} uz v Mongo).")

                new_cnt, upd_cnt, same_cnt, failed = _download_rows(
                    page, col, todo, row_height, now, existing_ids)

                log(f"KONEC: nove={new_cnt}, update={upd_cnt}, beze zmeny={same_cnt}, chyby={len(failed)}.")
                if failed:
                    log(f"KONEC: SELHALY rowIndexy: {failed}")
                    if not FULLSYNC:
                        # A failed row that lies below a stored row is skipped
                        # by every later delta run (stop-at-known).
                        log("KONEC: POZOR - selhane radky rezim delta priste preskoci; "
                            "spust --fullsync.")
            finally:
                context.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run main() and report a fatal error with its
    # traceback. A crashed run now exits with status 1 (it used to exit 0),
    # so schedulers and wrapper scripts can detect the failure.
    try:
        main()
    except Exception as e:
        log(f"FATAL: beh spadl: {e!r}")
        traceback.print_exc()
        raise SystemExit(1)
|
||||
@@ -0,0 +1,349 @@
|
||||
# =============================================================================
|
||||
# Název: import_to_mongo_v1.4.py
|
||||
# Verze: 1.4
|
||||
# Datum: 2026-06-09
|
||||
# Popis: Import CSV reportů do MongoDB (db: covance).
|
||||
# Pipeline 1 — allSamples: kolekce allsamples, klíč Container Barcode No.
|
||||
# Zdroj: Source (study 36940 + 35472)
|
||||
# Pipeline 2 — kits: kolekce kits, klíč Accession
|
||||
# Zdroj: Source (study 36940 + 35472)
|
||||
# Pipeline 3 — results: kolekce results, laboratorní výsledky per centrum.
|
||||
# Zdroj: Source, soubory test-results-{SITE}-{typ}.csv
|
||||
# (1. řádek = disclaimer, hlavička je 2. řádek!)
|
||||
# Dva typy (standard / microbiology) v jedné kolekci,
|
||||
# rozlišené polem resultType. record_id:
|
||||
# standard: STD|{Accession}|{Test Group}|{Test}|{occ}
|
||||
# microbiology: MIC|{Accession}|{Test Group}|{Specimen}|
|
||||
# {Test Description}|{Drug Name/Agent}|{occ}
|
||||
# Pipeline 4 — equeries: kolekce equeries, eQuery report (study 36940 + 35472).
|
||||
# Zdroj: Source, soubory ...-equery.csv (FULL).
|
||||
# Klíč eQueryId (stabilní systémové ID, unikátní per řádek);
|
||||
# řádky footeru s parametry filtru (nečíselný eQueryId) se
|
||||
# přeskakují. History sleduje životní cyklus dotazu
|
||||
# (Open -> Response Received -> Closed).
|
||||
# Varianta ...-equery_unresponded_only.csv je jen podmnožina
|
||||
# (Status=Open) téhož reportu + footer => NEIMPORTUJE se,
|
||||
# pouze se přesune do Zpracovano/ (move-only pipeline).
|
||||
# Upsert s historií změn, zpracovaný soubor přesunut do Zpracovano/.
|
||||
# Přepínač --dry-run: nic nezapisuje do DB ani nepřesouvá soubory.
|
||||
# =============================================================================
|
||||
|
||||
import csv
|
||||
import re
|
||||
import shutil
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from pymongo import MongoClient, ASCENDING
|
||||
|
||||
# MongoDB connection string and target database for all import pipelines.
MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "covance"

# Directory with the CSV reports to import: "Source" next to this script.
SOURCE = Path(__file__).parent / "Source"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Builders record_id + metadata pro jednotlivé pipeline
|
||||
# ---------------------------------------------------------------------------
|
||||
def make_keyed_record(upsert_key: str):
|
||||
"""Jednoduchý klíč = hodnota jednoho sloupce (allsamples, kits)."""
|
||||
def builder(fields: dict, fmeta: dict | None, occ: dict):
|
||||
key_val = fields.get(upsert_key)
|
||||
if not key_val:
|
||||
return None, {}
|
||||
return key_val, {}
|
||||
return builder
|
||||
|
||||
|
||||
def _norm_subject(raw: str | None) -> str:
|
||||
"""'CZ100062001 - null' -> 'CZ100062001'."""
|
||||
s = (raw or "").strip()
|
||||
return s.split(" - null")[0].strip()
|
||||
|
||||
|
||||
def make_results_record(fields: dict, fmeta: dict, occ: dict):
|
||||
rtype = fmeta["resultType"]
|
||||
accession = fields.get("Accession")
|
||||
if not accession:
|
||||
return None, {}
|
||||
|
||||
if rtype == "standard":
|
||||
parts = (accession, fields.get("Test Group", ""), fields.get("Test", ""))
|
||||
prefix = "STD"
|
||||
else: # microbiology
|
||||
parts = (
|
||||
accession,
|
||||
fields.get("Test Group", ""),
|
||||
fields.get("Specimen", ""),
|
||||
fields.get("Test Description", ""),
|
||||
fields.get("Drug Name/Agent", ""),
|
||||
)
|
||||
prefix = "MIC"
|
||||
|
||||
occ[parts] = occ.get(parts, 0) + 1
|
||||
record_id = f"{prefix}|" + "|".join(str(p or "") for p in parts) + f"|{occ[parts]}"
|
||||
|
||||
extra = {
|
||||
"study": fmeta["study"],
|
||||
"site": fmeta["site"],
|
||||
"subject": _norm_subject(fields.get("Subject")),
|
||||
"resultType": rtype,
|
||||
}
|
||||
return record_id, extra
|
||||
|
||||
|
||||
def make_equery_record(fields: dict, fmeta: dict | None, occ: dict):
|
||||
"""Klíč = eQueryId. Footer s parametry filtru (nečíselný eQueryId) se přeskočí."""
|
||||
key_val = (fields.get("eQueryId") or "").strip()
|
||||
if not key_val.isdigit():
|
||||
return None, {}
|
||||
extra = {"study": fmeta["study"]} if fmeta else {}
|
||||
return key_val, extra
|
||||
|
||||
|
||||
def results_file_meta(filename: str) -> dict | None:
|
||||
m = re.search(r"study-(\d+)-test-results-(\d+)-(standard|microbiology)", filename, re.IGNORECASE)
|
||||
if not m:
|
||||
return None
|
||||
return {"study": m.group(1), "site": m.group(2), "resultType": m.group(3).lower()}
|
||||
|
||||
|
||||
def equery_file_meta(filename: str) -> dict | None:
|
||||
m = re.search(r"study-(\d+)-activity-reports", filename, re.IGNORECASE)
|
||||
return {"study": m.group(1)} if m else {"study": None}
|
||||
|
||||
|
||||
def _asc_indexes(*keys: str) -> list:
    """Return one single-field ascending index spec per given key."""
    return [[(key, ASCENDING)] for key in keys]


# Import pipelines, processed in this order. Each entry names the file pattern
# it picks up, the directories to scan and (unless it is move-only) the target
# collection, the record builder, the optional file-name parser and the
# secondary indexes to create.
PIPELINES = [
    {
        "name": "allsamples",
        "collection": "allsamples",
        "pattern": re.compile(r".*-allSamples\.csv$", re.IGNORECASE),
        "sources": [SOURCE],
        "header_skip": 0,
        "make_record": make_keyed_record("Container Barcode No."),
        "file_meta": None,
        "indexes": _asc_indexes(
            "fields.Sample Status",
            "fields.Specimen Type",
        ),
    },
    {
        "name": "kits",
        "collection": "kits",
        "pattern": re.compile(r".*-kit-inventory-on-hand-expiration\.csv$", re.IGNORECASE),
        "sources": [SOURCE],
        "header_skip": 0,
        "make_record": make_keyed_record("Accession"),
        "file_meta": None,
        "indexes": _asc_indexes(
            "fields.Kit Type",
            "fields.Site",
            "fields.Expiration Date",
        ),
    },
    {
        "name": "results",
        "collection": "results",
        "pattern": re.compile(r".*test-results-\d+-(standard|microbiology)\.csv$", re.IGNORECASE),
        "sources": [SOURCE],
        "header_skip": 1,  # line 1 is a disclaimer, the header is on line 2
        "make_record": make_results_record,
        "file_meta": results_file_meta,
        "indexes": _asc_indexes(
            "subject",
            "study",
            "site",
            "resultType",
            "fields.Accession",
            "fields.Test Group",
        ),
    },
    {
        "name": "equeries",
        "collection": "equeries",
        # FULL report; the _unresponded_only variant is DELIBERATELY not matched
        # here (it has its own pattern below)
        "pattern": re.compile(r".*activity-reports-documents-equery\.csv$", re.IGNORECASE),
        "sources": [SOURCE],
        "header_skip": 0,
        "make_record": make_equery_record,
        "file_meta": equery_file_meta,
        "indexes": _asc_indexes(
            "study",
            "fields.Status",
            "fields.Site",
            "fields.Subject",
            "fields.Issue Type",
        ),
    },
    {
        "name": "equeries_unresponded",
        "move_only": True,  # subset of the FULL report -> only moved, never imported
        "pattern": re.compile(r".*activity-reports-documents-equery_unresponded_only\.csv$", re.IGNORECASE),
        "sources": [SOURCE],
    },
]
|
||||
|
||||
|
||||
def extract_snapshot_date(filename: str) -> str:
    """Return the ``YYYY-MM-DD`` prefix of the file's base name.

    Falls back to today's local date (same format) when the base name does
    not start with such a prefix.
    """
    base_name = Path(filename).name
    found = re.match(r"(\d{4}-\d{2}-\d{2})", base_name)
    if found:
        return found.group(1)
    return datetime.now().strftime("%Y-%m-%d")
|
||||
|
||||
|
||||
def clean_value(val: str) -> str | None:
|
||||
val = val.strip()
|
||||
return val if val else None
|
||||
|
||||
|
||||
def import_file(csv_path: Path, collection, pipeline: dict, dry_run: bool) -> dict:
    """Import one CSV snapshot into ``collection`` and print a summary.

    Every row is handed to ``pipeline["make_record"]``; rows that yield no
    record id are counted as skipped. For the rest:

    * unknown id   -> a new document is inserted,
    * fields differ -> the previous fields are pushed to ``history`` and the
      document is overwritten,
    * fields equal -> only ``lastSeen`` and ``sourceFile`` are refreshed.

    In dry-run mode the database is never touched and every keyed row is
    counted as an insert candidate.

    Returns a dict with the ``inserted``, ``changed`` and ``unchanged`` counts.
    """
    source_name = csv_path.name
    snapshot_date = extract_snapshot_date(source_name)
    inserted = changed = unchanged = skipped = 0

    meta_parser = pipeline["file_meta"]
    fmeta = meta_parser(source_name) if meta_parser else None

    with open(csv_path, newline="", encoding="utf-8-sig") as f:
        lines = f.readlines()
    # Drop any preamble lines that precede the real header row.
    rows = list(csv.DictReader(lines[pipeline["header_skip"]:]))

    occ: dict = {}  # occurrence counters shared by all rows of this file

    for row in rows:
        fields = {column: clean_value(cell) for column, cell in row.items() if column}

        record_id, extra = pipeline["make_record"](fields, fmeta, occ)
        if not record_id:
            skipped += 1
            continue

        if dry_run:
            # Without a database lookup we cannot tell; count as insert candidate.
            inserted += 1
            continue

        existing = collection.find_one({"record_id": record_id})

        if existing is None:
            document = {
                "record_id": record_id,
                "fields": fields,
                **extra,
                "sourceFile": source_name,
                "firstSeen": snapshot_date,
                "lastSeen": snapshot_date,
                "history": [],
            }
            collection.insert_one(document)
            inserted += 1
            continue

        selector = {"_id": existing["_id"]}
        if existing["fields"] != fields:
            # Keep the superseded version, stamped with the date it was last seen.
            previous = {"date": existing["lastSeen"], "fields": existing["fields"]}
            collection.update_one(
                selector,
                {
                    "$push": {"history": previous},
                    "$set": {"fields": fields, **extra, "sourceFile": source_name, "lastSeen": snapshot_date},
                },
            )
            changed += 1
        else:
            collection.update_one(
                selector,
                {"$set": {"lastSeen": snapshot_date, "sourceFile": source_name}},
            )
            unchanged += 1

    total_rows = len(rows)
    db_count = "-" if dry_run else collection.count_documents({})
    tag = "[DRY] " if dry_run else ""
    print(f" {tag}[{snapshot_date}]: +{inserted} new, ~{changed} changed, ={unchanged} same, -{skipped} bez klice")
    print(f" Radku v CSV: {total_rows}, dokumentu v DB: {db_count}")

    # Sanity check: every CSV row must land in exactly one counter.
    if inserted + changed + unchanged + skipped != total_rows:
        print(f" !!! VAROVANI: soucet ({inserted+changed+unchanged+skipped}) != radku v CSV ({total_rows})")

    return {"inserted": inserted, "changed": changed, "unchanged": unchanged}
|
||||
|
||||
|
||||
def collect_files(pipeline: dict, cli_args: list[str]) -> list[Path]:
    """List the files this pipeline should process.

    When ``cli_args`` is non-empty, only those arguments are considered: each
    must be an existing file whose name matches ``pipeline["pattern"]``, and
    the argument order is kept. Otherwise every existing directory in
    ``pipeline["sources"]`` is scanned for matching ``*.csv`` files, sorted
    within each directory.
    """
    def _accepted(candidate: Path):
        return pipeline["pattern"].match(candidate.name)

    if cli_args:
        return [p for p in map(Path, cli_args) if p.is_file() and _accepted(p)]

    found: list[Path] = []
    for src_dir in pipeline["sources"]:
        if not src_dir.exists():
            continue
        found += sorted(p for p in src_dir.glob("*.csv") if _accepted(p))
    return found
|
||||
|
||||
|
||||
def move_to_processed(csv_path: Path, dry_run: bool):
    """Move a handled CSV into the ``Zpracovano`` folder next to it.

    In dry-run mode nothing is moved; only the intended action is printed.
    The target folder is created when missing, so files that were passed
    explicitly and live outside the pre-created source directories can be
    archived as well.
    """
    if dry_run:
        print(f" [DRY] -> presunul by do Zpracovano/\n")
        return
    dest = csv_path.parent / "Zpracovano" / csv_path.name
    # The archive folder is only prepared for the configured source
    # directories; make sure it exists for any other location too.
    dest.parent.mkdir(exist_ok=True)
    shutil.move(str(csv_path), str(dest))
    print(f" -> presunut do Zpracovano/\n")
|
||||
|
||||
|
||||
def run_pipeline(pipeline: dict, client, cli_args: list[str], dry_run: bool):
    """Run one pipeline over all files it matches.

    Move-only pipelines just archive their files. Import pipelines make sure
    the unique ``record_id`` index and the configured secondary indexes exist,
    import every file, archive it and print the summed counters. In dry-run
    mode no index or folder is created and ``client`` is not used.
    """
    def _prepare_archive_dirs():
        for src_dir in pipeline["sources"]:
            (src_dir / "Zpracovano").mkdir(exist_ok=True)

    paths = collect_files(pipeline, cli_args)
    if not paths:
        print(f"[{pipeline['name']}] Zadne soubory k importu.")
        return

    print(f"\n=== Pipeline: {pipeline['name']} ({len(paths)} souboru){' [DRY-RUN]' if dry_run else ''} ===")

    # Move-only pipeline (e.g. the unresponded subset): archive only, no import.
    if pipeline.get("move_only"):
        if not dry_run:
            _prepare_archive_dirs()
        for csv_path in paths:
            print(f"Move-only: {csv_path.name} [{csv_path.parent.parent.name}]")
            move_to_processed(csv_path, dry_run)
        print(f"[{pipeline['name']}] Presunuto {len(paths)} souboru (neimportuje se).")
        return

    col = None
    if not dry_run:
        col = client[DB_NAME][pipeline["collection"]]
        col.create_index([("record_id", ASCENDING)], unique=True)
        for index_spec in pipeline["indexes"]:
            col.create_index(index_spec)
        _prepare_archive_dirs()

    total = dict.fromkeys(("inserted", "changed", "unchanged"), 0)

    for csv_path in paths:
        print(f"Import: {csv_path.name} [{csv_path.parent.parent.name}]")
        stats = import_file(csv_path, col, pipeline, dry_run)
        for counter in total:
            total[counter] += stats[counter]
        move_to_processed(csv_path, dry_run)

    print(f"[{pipeline['name']}] Celkem: +{total['inserted']} new, ~{total['changed']} changed, ={total['unchanged']} same")
|
||||
|
||||
|
||||
def main():
    """Command-line entry point.

    ``--dry-run`` switches to a mode that never connects to MongoDB; every
    other argument is treated as an explicit file path to process. All
    pipelines are run in their configured order.
    """
    args = sys.argv[1:]
    dry_run = "--dry-run" in args
    cli_args = [a for a in args if a != "--dry-run"]

    client = None
    if not dry_run:
        client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)

    try:
        if client is not None:
            # Fail fast when the server is unreachable, before touching any file.
            client.admin.command("ping")

        for pipeline in PIPELINES:
            run_pipeline(pipeline, client, cli_args, dry_run)
    finally:
        # Release the connection even when the ping or a pipeline raises.
        if client is not None:
            client.close()


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user