notebook
This commit is contained in:
+11
-365
@@ -4,7 +4,7 @@ Kompletní pipeline pro Drugs:
|
||||
2. IP destruction (per košík, přeskočí již existující soubory)
|
||||
3. Shipments report (jeden soubor na studii, přepisuje)
|
||||
4. Shipment details (per zásilka CZ, vždy přepisuje)
|
||||
5. Import do MySQL
|
||||
5. Import do MongoDB (studie.iwrs_shipments / iwrs_shipment_items / iwrs_inventory / iwrs_destruction)
|
||||
|
||||
Spusť tento skript — zpracuje obě studie automaticky.
|
||||
"""
|
||||
@@ -14,12 +14,11 @@ import glob
|
||||
import re
|
||||
import datetime
|
||||
|
||||
import numpy as np
|
||||
import sys
|
||||
import pandas as pd
|
||||
from playwright.sync_api import sync_playwright
|
||||
import mysql.connector
|
||||
|
||||
import db_config
|
||||
import import_to_mongo as drugs_mongo
|
||||
|
||||
BASE_URL = "https://janssen.4gclinical.com"
|
||||
EMAIL = "vbuzalka@its.jnj.com"
|
||||
@@ -42,357 +41,6 @@ SITES = {
|
||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
|
||||
# ── type converters ──────────────────────────────────────────────────────────
|
||||
|
||||
def _py(val):
    """Unwrap a NumPy scalar into its native Python equivalent.

    Anything that is not an ``np.generic`` instance is handed back untouched.
    """
    return val.item() if isinstance(val, np.generic) else val
|
||||
|
||||
def to_date(val):
    """Coerce a spreadsheet cell into a ``datetime.date``, or ``None``.

    Missing values (``None``, NaN, anything ``pd.isna`` flags, and the texts
    "nat" / "nan" / "none" / "") give ``None``.  Timestamps and datetimes are
    truncated to their date part, dates pass through, and any other value is
    stringified and tried against a fixed list of formats.  Text matching no
    format gives ``None``.
    """
    val = _py(val)
    if val is None or (isinstance(val, float) and val != val):
        return None
    try:
        if pd.isna(val):
            return None
    except (TypeError, ValueError):
        # pd.isna on a container yields an array whose truth value may be
        # ambiguous; such values simply fall through to the checks below.
        pass

    # pd.Timestamp subclasses datetime.datetime, so one check covers both;
    # missing timestamps were already filtered out by the pd.isna guard.
    if isinstance(val, (pd.Timestamp, datetime.datetime)):
        return val.date()
    if isinstance(val, datetime.date):
        return val

    text = str(val).strip()
    if text.lower() in ("", "nat", "nan", "none"):
        return None
    for pattern in ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S"):
        try:
            parsed = datetime.datetime.strptime(text, pattern)
        except ValueError:
            continue
        return parsed.date()
    return None
|
||||
|
||||
def to_int(val):
    """Coerce a cell value to ``int``, or ``None`` when that is not possible.

    The value is first unwrapped with ``_py`` and converted through ``float``
    so that numeric strings and floats such as ``"3.0"`` / ``3.0`` become
    ``3``.

    Returns ``None`` for ``None``, NaN, non-numeric input and infinite or
    out-of-range values.
    """
    val = _py(val)
    try:
        number = float(val)
        # NaN is the only float that is not equal to itself.
        return None if number != number else int(number)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf")) and float(<huge int>) both raise
        # it; treat those like any other unusable cell instead of crashing.
        return None
|
||||
|
||||
def to_str(val):
    """Coerce a cell value to a stripped string, or ``None`` when empty.

    ``None``, float NaN and the texts "nan" / "nat" / "none" / ""
    (case-insensitive, after stripping) all map to ``None``.
    """
    val = _py(val)
    nan_float = isinstance(val, float) and val != val
    if val is None or nan_float:
        return None
    text = str(val).strip()
    if text.lower() in ("nan", "nat", "none", ""):
        return None
    return text
|
||||
|
||||
|
||||
# ── DB helpers ───────────────────────────────────────────────────────────────
|
||||
|
||||
def get_conn():
    """Open and return a new MySQL connection built from ``db_config``."""
    settings = {
        "host": db_config.DB_HOST,
        "port": db_config.DB_PORT,
        "user": db_config.DB_USER,
        "password": db_config.DB_PASSWORD,
        "database": db_config.DB_NAME,
    }
    return mysql.connector.connect(**settings)
|
||||
|
||||
def insert_import(cursor, study, source_label):
    """Insert one ``iwrs_import`` row tagged as a "drugs" import.

    The row is stamped with the current local time.  Returns the cursor's
    ``lastrowid`` after the insert.
    """
    statement = "INSERT INTO iwrs_import (study, imported_at, source_file, report_type) VALUES (%s, %s, %s, %s)"
    values = (study, datetime.datetime.now(), source_label, "drugs")
    cursor.execute(statement, values)
    return cursor.lastrowid
|
||||
|
||||
def basket_already_imported(cursor, study, basket_id):
    """Return True when ``iwrs_destruction`` already holds this basket.

    ``basket_id`` is stringified before being bound to the query.
    """
    query = "SELECT 1 FROM iwrs_destruction WHERE study=%s AND basket_id=%s LIMIT 1"
    cursor.execute(query, (study, str(basket_id)))
    found = cursor.fetchone()
    return found is not None
|
||||
|
||||
|
||||
# ── parsery ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def parse_shipments_report(study):
    """Parse the study's shipments report workbook into a list of row dicts.

    The workbook is read from
    ``<BASE_DIR>/xls_shipments_<study>/shipments_report_<study>.xlsx``.
    The header row is the first row containing a cell equal to
    "Shipment ID"; only rows whose ``Location`` contains "Czech"
    (case-insensitive) are kept.

    Returns an empty list when the file is missing (a message is printed)
    or when no header row can be found.
    """
    report_path = os.path.join(
        BASE_DIR, f"xls_shipments_{study}", f"shipments_report_{study}.xlsx"
    )
    if not os.path.exists(report_path):
        print(f" CHYBÍ: {report_path}")
        return []

    sheet = pd.read_excel(report_path, header=None)
    header_idx = next(
        (
            idx
            for idx, cells in sheet.iterrows()
            if "Shipment ID" in [str(cell).strip() for cell in cells]
        ),
        None,
    )
    if header_idx is None:
        return []

    table = pd.read_excel(report_path, header=header_idx).dropna(how="all")
    czech_only = table["Location"].astype(str).str.contains("Czech", na=False, case=False)
    table = table[czech_only]
    present = table.columns.tolist()

    def optional(record, column, convert):
        # Columns that are absent from this export map to None.
        return convert(record[column]) if column in present else None

    parsed = []
    for _, rec in table.iterrows():
        entry = {
            "shipment_id": to_str(rec["Shipment ID"]),
            "status": to_str(rec["IRT Shipment Status"]),
            "type": to_str(rec["Type"]),
            "ship_from": to_str(rec["Shipment From"]),
            "ship_to_site": to_str(rec["Ship To:"]),
            "location": to_str(rec["Location"]),
            "request_date": to_date(rec["Request Date"]),
            "shipped_date": to_date(rec["Shipped Date"]),
            "received_date": optional(rec, "Received Date", to_date),
            "received_by": optional(rec, "Received by", to_str),
            "delivered_date_utc": optional(rec, "Delivered Date [UTC]", to_date),
            "delivery_recipient": optional(rec, "Delivery Recipient", to_str),
            "delivery_details": optional(rec, "Delivery Details", to_str),
            "cancelled_date": optional(rec, "Cancelled Date", to_date),
            "total_medication_ids": optional(rec, "Total Medication IDs", to_int),
            "tracking_no": optional(rec, "Tracking #", to_str),
            "shipping_category": optional(rec, "Shipping Category", to_str),
            "expected_arrival": optional(rec, "Expected Arrival", to_date),
        }
        parsed.append(entry)
    return parsed
|
||||
|
||||
|
||||
def parse_shipment_details(study):
    """Parse every per-shipment detail workbook of a study into item dicts.

    Files are taken from ``<BASE_DIR>/xls_shipment_details_<study>`` and must
    match ``shipment_details_*.xlsx``; the shipment id comes from the file
    name ("UNKNOWN" if the name does not match the expected pattern).  The
    header row of each sheet is the first row containing a "Medication ID"
    cell; workbooks without such a row are skipped.
    """
    folder = os.path.join(BASE_DIR, f"xls_shipment_details_{study}")
    parsed = []
    for workbook in sorted(glob.glob(os.path.join(folder, "shipment_details_*.xlsx"))):
        name_match = re.search(r"shipment_details_(.+)\.xlsx", os.path.basename(workbook))
        shipment_id = name_match.group(1) if name_match else "UNKNOWN"

        sheet = pd.read_excel(workbook, header=None)
        header_idx = next(
            (
                idx
                for idx, cells in sheet.iterrows()
                if "Medication ID" in [str(cell).strip() for cell in cells]
            ),
            None,
        )
        if header_idx is None:
            continue

        table = pd.read_excel(workbook, header=header_idx).dropna(how="all")
        for _, rec in table.iterrows():
            # The description/type columns come under two alternative
            # headers; use the second only when the first yields nothing.
            description = to_str(rec.get("Medication Description"))
            if not description:
                description = to_str(rec.get("Medication ID Description"))
            kind = to_str(rec.get("Medication type"))
            if not kind:
                kind = to_str(rec.get("Medication ID type"))

            entry = {"shipment_id": shipment_id}
            entry["destination_location"] = to_str(rec.get("Destination Location"))
            entry["shipment_status"] = to_str(rec.get("IRT Shipment Status"))
            entry["shipment_type"] = to_str(rec.get("Type"))
            entry["destination_site"] = to_str(rec.get("Destination Site"))
            entry["investigator"] = to_str(rec.get("Investigator"))
            entry["medication_description"] = description
            entry["medication_type"] = kind
            entry["medication_id"] = to_str(rec.get("Medication ID"))
            entry["packaged_lot_no"] = to_str(rec.get("Packaged Lot number"))
            entry["packaged_lot_description"] = to_str(rec.get("Packaged Lot description"))
            entry["container_id"] = to_str(rec.get("Container ID"))
            entry["quantity"] = to_int(rec.get("Quantity of Medication IDs"))
            entry["expiration_date"] = to_date(rec.get("Expiration Date"))
            entry["item_status"] = to_str(rec.get("Status"))
            parsed.append(entry)
    return parsed
|
||||
|
||||
|
||||
def parse_inventory(study):
    """Parse the on-site inventory detail workbooks of a study.

    Files are taken from ``<BASE_DIR>/xls_reports_<study>`` and must match
    ``onsite_inventory_detail_*.xlsx``.  Each sheet is scanned top to bottom:
    first-column cells starting with "Site:", "Investigator:" or "Location:"
    supply metadata applied to every row of that file, and the first row
    whose first cell is "Medication" or "Medication ID" is the table header.
    Workbooks without a header row are skipped.

    Returns a list of dicts, one per inventory row.
    """
    inv_dir = os.path.join(BASE_DIR, f"xls_reports_{study}")
    files = sorted(glob.glob(os.path.join(inv_dir, "onsite_inventory_detail_*.xlsx")))
    labels = (
        ("Site:", "site"),
        ("Investigator:", "investigator"),
        ("Location:", "location"),
    )
    rows = []
    for path in files:
        raw = pd.read_excel(path, header=None)
        meta = {"site": None, "investigator": None, "location": None}
        header_row = None
        for i, row in raw.iterrows():
            first = str(row.iloc[0]).strip() if pd.notna(row.iloc[0]) else ""
            for label, key in labels:
                if first.startswith(label):
                    # Drop only the leading label; str.replace would also
                    # remove any later occurrence inside the value itself.
                    meta[key] = first[len(label):].strip()
                    break
            if first in ("Medication", "Medication ID") and header_row is None:
                header_row = i
        if header_row is None:
            continue

        df = pd.read_excel(path, header=header_row).dropna(how="all")
        # The first column's header varies between exports; normalise it.
        df = df.rename(columns={df.columns[0]: "medication_id"})
        for _, r in df.iterrows():
            # Convert each spelling of the column separately before falling
            # back: a raw NaN cell is truthy, so `a or b` on the unconverted
            # values would never reach the second column.
            dispensing_date = (to_date(r.get("Dispensing date"))
                               or to_date(r.get("Dispensing Date")))
            rows.append({
                "site": meta["site"],
                "investigator": meta["investigator"],
                "location": meta["location"],
                "medication_id": to_str(r["medication_id"]),
                "packaged_lot_no": to_str(r.get("Packaged Lot number")),
                "original_expiration_date": to_date(r.get("Original Expiration Date when Packaged Lot was Added")),
                "expiration_date": to_date(r.get("Expiration date")),
                "received_date": to_date(r.get("Received Date")),
                "receipt_user": to_str(r.get("Shipment Receipt User")),
                "subject_identifier": to_str(r.get("Subject Identifier")),
                "quantity_assigned": to_int(r.get("Quantity Assigned")),
                "irt_transaction": to_str(r.get("IRT Transaction")),
                "date_assigned": to_date(r.get("Date Assigned")),
                "assignment_user": to_str(r.get("Assignment User")),
                "dispensation_status": to_str(r.get("Dispensation Status")),
                "dispensing_date": dispensing_date,
                "quantity_dispensed": to_int(r.get("Quantity Dispensed")),
                "dispensing_user": to_str(r.get("Dispensing User")),
                "quantity_returned": to_int(r.get("Quantity Returned")),
                "date_returned": to_date(r.get("Date Returned")),
                "return_user": to_str(r.get("Return User")),
            })
    return rows
|
||||
|
||||
|
||||
def parse_destruction_files(study):
    """Parse the IP-destruction basket workbooks of a study.

    Files are taken from ``<BASE_DIR>/xls_ip_destruction_<study>`` and must
    match ``ip_destruction_basket_*.xlsx``.  First-column cells starting with
    one of the known labels supply the basket metadata; the first row whose
    first cell equals "Medication ID Description" is the item-table header.
    Workbooks without a header row are skipped.

    Returns one dict per basket, holding its metadata and an ``items`` list.
    """
    label_to_field = {
        "Investigator Name:": "investigator",
        "Site ID:": "site_id",
        "Location:": "location",
        "Basket ID:": "basket_id",
        "Drug Destruction Created Date:": "destruction_date",
    }
    folder = os.path.join(BASE_DIR, f"xls_ip_destruction_{study}")
    result = []
    for workbook in sorted(glob.glob(os.path.join(folder, "ip_destruction_basket_*.xlsx"))):
        sheet = pd.read_excel(workbook, header=None)
        info = {}
        header_idx = None
        for idx, cells in sheet.iterrows():
            lead = cells.iloc[0]
            text = str(lead).strip() if pd.notna(lead) else ""
            for label, field in label_to_field.items():
                if text.startswith(label):
                    info[field] = text.replace(label, "").strip()
            if header_idx is None and text == "Medication ID Description":
                header_idx = idx
        if header_idx is None:
            continue

        table = pd.read_excel(workbook, header=header_idx).dropna(how="all")
        entries = [
            {
                "medication_description": to_str(rec.get("Medication ID Description")),
                "medication_id": to_str(rec.get("Medication ID")),
                "packaged_lot_description": to_str(rec.get("Packaged Lot description")),
                "comments": to_str(rec.get("Comments")),
            }
            for _, rec in table.iterrows()
        ]
        result.append({
            "site_id": info.get("site_id"),
            "investigator": info.get("investigator"),
            "location": info.get("location"),
            "basket_id": info.get("basket_id"),
            "destruction_date": to_date(info.get("destruction_date")),
            "items": entries,
        })
    return result
|
||||
|
||||
|
||||
# ── insertery ────────────────────────────────────────────────────────────────
|
||||
|
||||
def insert_shipments(cursor, import_id, study, rows):
    """Insert parsed shipment rows into ``iwrs_shipments``, one per execute.

    Every row dict must provide all keys listed in ``fields``; the bound
    parameters are ``import_id``, ``study`` and then those values in order.
    """
    statement = """INSERT INTO iwrs_shipments
        (import_id, study, shipment_id, status, type, ship_from, ship_to_site,
         location, request_date, shipped_date, received_date, received_by,
         delivered_date_utc, delivery_recipient, delivery_details, cancelled_date,
         total_medication_ids, tracking_no, shipping_category, expected_arrival)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    # Order mirrors the column list in the statement (after import_id, study).
    fields = (
        "shipment_id", "status", "type", "ship_from", "ship_to_site",
        "location", "request_date", "shipped_date", "received_date",
        "received_by", "delivered_date_utc", "delivery_recipient",
        "delivery_details", "cancelled_date", "total_medication_ids",
        "tracking_no", "shipping_category", "expected_arrival",
    )
    for record in rows:
        values = (import_id, study, *[record[name] for name in fields])
        cursor.execute(statement, values)
|
||||
|
||||
|
||||
def insert_shipment_items(cursor, import_id, study, rows):
    """Insert parsed shipment detail rows into ``iwrs_shipment_items``.

    One ``execute`` per row; the bound parameters are ``import_id``,
    ``study`` and then the row values in the order given by ``fields``.
    """
    statement = """INSERT INTO iwrs_shipment_items
        (import_id, study, shipment_id, destination_location, shipment_status,
         shipment_type, destination_site, investigator, medication_description,
         medication_type, medication_id, packaged_lot_no, packaged_lot_description,
         container_id, quantity, expiration_date, item_status)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    # Order mirrors the column list in the statement (after import_id, study).
    fields = (
        "shipment_id", "destination_location", "shipment_status",
        "shipment_type", "destination_site", "investigator",
        "medication_description", "medication_type", "medication_id",
        "packaged_lot_no", "packaged_lot_description", "container_id",
        "quantity", "expiration_date", "item_status",
    )
    for record in rows:
        values = (import_id, study, *[record[name] for name in fields])
        cursor.execute(statement, values)
|
||||
|
||||
|
||||
def insert_inventory(cursor, import_id, study, rows):
    """Insert parsed on-site inventory rows into ``iwrs_inventory``.

    One ``execute`` per row; the bound parameters are ``import_id``,
    ``study`` and then the row values in the order given by ``fields``.
    """
    statement = """INSERT INTO iwrs_inventory
        (import_id, study, site, investigator, location, medication_id,
         packaged_lot_no, original_expiration_date, expiration_date, received_date,
         receipt_user, subject_identifier, quantity_assigned, irt_transaction,
         date_assigned, assignment_user, dispensation_status, dispensing_date,
         quantity_dispensed, dispensing_user, quantity_returned, date_returned, return_user)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    # Order mirrors the column list in the statement (after import_id, study).
    fields = (
        "site", "investigator", "location", "medication_id",
        "packaged_lot_no", "original_expiration_date", "expiration_date",
        "received_date", "receipt_user", "subject_identifier",
        "quantity_assigned", "irt_transaction", "date_assigned",
        "assignment_user", "dispensation_status", "dispensing_date",
        "quantity_dispensed", "dispensing_user", "quantity_returned",
        "date_returned", "return_user",
    )
    for record in rows:
        values = (import_id, study, *[record[name] for name in fields])
        cursor.execute(statement, values)
|
||||
|
||||
|
||||
def insert_destruction(cursor, study, baskets):
    """Insert destruction items for every basket not yet in the database.

    A basket for which ``basket_already_imported`` reports True is skipped
    as a whole.  Items of the remaining baskets are written with
    ``INSERT IGNORE``.

    Returns an ``(imported, skipped)`` tuple; ``skipped`` counts baskets.
    """
    statement = """INSERT IGNORE INTO iwrs_destruction
        (study, site_id, investigator, location, basket_id, destruction_date,
         medication_description, medication_id, packaged_lot_description, comments)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    imported = 0
    skipped = 0
    for basket in baskets:
        if basket_already_imported(cursor, study, basket["basket_id"]):
            skipped += 1
            continue
        for entry in basket["items"]:
            values = (
                study,
                basket["site_id"],
                basket["investigator"],
                basket["location"],
                basket["basket_id"],
                basket["destruction_date"],
                entry["medication_description"],
                entry["medication_id"],
                entry["packaged_lot_description"],
                entry["comments"],
            )
            cursor.execute(statement, values)
            # NOTE(review): counted per executed item; the flattened source
            # does not show whether this was per item or per basket - confirm.
            imported += 1
    return imported, skipped
|
||||
|
||||
|
||||
def import_study(study):
    """Parse all downloaded drug reports of one study and load them into MySQL.

    Shipments, shipment items, inventory and destruction baskets are parsed
    from disk first, then written in a single transaction under a fresh
    ``iwrs_import`` row.  If any insert fails, the transaction is rolled back
    and the exception propagates to the caller; the cursor and connection are
    closed in every case.
    """
    print(f"\n Parsování dat pro {study}...")
    shipments = parse_shipments_report(study)
    items = parse_shipment_details(study)
    inventory = parse_inventory(study)
    baskets = parse_destruction_files(study)
    print(f" Zásilky: {len(shipments)} | Položky: {len(items)} | Sklad: {len(inventory)} | Destrukce: {len(baskets)} košíků")

    conn = get_conn()
    try:
        cursor = conn.cursor()
        try:
            import_id = insert_import(cursor, study, f"drugs_{study}")
            print(f" import_id = {import_id}")
            insert_shipments(cursor, import_id, study, shipments)
            insert_shipment_items(cursor, import_id, study, items)
            insert_inventory(cursor, import_id, study, inventory)
            dest_imported, dest_skipped = insert_destruction(cursor, study, baskets)
            conn.commit()
        except Exception:
            # Do not leave a half-written import behind.
            conn.rollback()
            raise
        finally:
            cursor.close()
    finally:
        # Previously the connection leaked whenever an insert raised.
        conn.close()
    print(f" Destrukce: {dest_imported} nových | {dest_skipped} košíků přeskočeno")
|
||||
|
||||
|
||||
# ── login ────────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -577,19 +225,17 @@ def main():
|
||||
finally:
|
||||
browser.close()
|
||||
|
||||
# ── Import do MySQL ───────────────────────────────────────────────────────
|
||||
# ── Import do MongoDB ─────────────────────────────────────────────────────
|
||||
print(f"\n{'='*60}")
|
||||
print("IMPORT DO MySQL")
|
||||
print("IMPORT DO MongoDB")
|
||||
print(f"{'='*60}")
|
||||
|
||||
for study in STUDIES:
|
||||
print(f"\n[{study}]")
|
||||
try:
|
||||
import_study(study)
|
||||
except Exception as e:
|
||||
import traceback
|
||||
print(f" CHYBA při importu: {e}")
|
||||
traceback.print_exc()
|
||||
try:
|
||||
drugs_mongo.run(STUDIES)
|
||||
except Exception as e:
|
||||
import traceback
|
||||
print(f" CHYBA při importu: {e}")
|
||||
traceback.print_exc()
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print("Vše hotovo.")
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
@@ -156,38 +156,62 @@ def run(page, study):
|
||||
total_notif = 0
|
||||
for subject in subjects:
|
||||
filename = os.path.join(out_dir, f"{today} {study} {subject} Subject Detail.xlsx")
|
||||
print(f" [{subject}] Stahuji...")
|
||||
input_field = page.locator('input[placeholder="search"], input[type="text"]').first
|
||||
input_field.click()
|
||||
input_field.fill(subject)
|
||||
page.wait_for_timeout(500)
|
||||
|
||||
# Zachytíme table_1 response při výběru subjektu
|
||||
if api_base:
|
||||
success = False
|
||||
table1_data = None
|
||||
for attempt in range(1, 4):
|
||||
try:
|
||||
with page.expect_response(
|
||||
lambda r: "report_data" in r.url and "table_1" in r.url,
|
||||
timeout=60000
|
||||
) as resp_info:
|
||||
print(f" [{subject}] Stahuji... (pokus {attempt}/3)")
|
||||
input_field = page.locator('input[placeholder="search"], input[type="text"]').first
|
||||
input_field.click()
|
||||
input_field.fill(subject)
|
||||
page.wait_for_timeout(500)
|
||||
|
||||
# Zachytíme table_1 response při výběru subjektu
|
||||
if api_base:
|
||||
try:
|
||||
with page.expect_response(
|
||||
lambda r: "report_data" in r.url and "table_1" in r.url,
|
||||
timeout=60000
|
||||
) as resp_info:
|
||||
page.locator("mat-option").first.dispatch_event("click")
|
||||
table1_data = resp_info.value.json()
|
||||
except Exception as e:
|
||||
print(f" [{subject}] CHYBA zachycení table_1: {e}")
|
||||
page.locator("mat-option").first.dispatch_event("click")
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
table1_data = None
|
||||
else:
|
||||
page.locator("mat-option").first.dispatch_event("click")
|
||||
table1_data = resp_info.value.json()
|
||||
except Exception as e:
|
||||
print(f" [{subject}] CHYBA zachycení table_1: {e}")
|
||||
page.locator("mat-option").first.dispatch_event("click")
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
table1_data = None
|
||||
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
table1_data = None
|
||||
else:
|
||||
page.locator("mat-option").first.dispatch_event("click")
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
table1_data = None
|
||||
page.wait_for_timeout(2000)
|
||||
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
page.wait_for_timeout(1000)
|
||||
with page.expect_download(timeout=60000) as dl:
|
||||
page.get_by_role("button", name="Download XLS").click()
|
||||
dl.value.save_as(filename)
|
||||
print(f" [{subject}] XLS OK")
|
||||
success = True
|
||||
break
|
||||
except Exception as e:
|
||||
print(f" [{subject}] pokus {attempt} selhal: {e}")
|
||||
if attempt < 3:
|
||||
try:
|
||||
page.goto(f"{BASE_URL}/report/patient_detail_report")
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
except Exception as ge:
|
||||
print(f" [{subject}] refresh selhal: {ge}")
|
||||
|
||||
with page.expect_download(timeout=120000) as dl:
|
||||
page.get_by_role("button", name="Download XLS").click()
|
||||
dl.value.save_as(filename)
|
||||
print(f" [{subject}] XLS OK")
|
||||
if not success:
|
||||
print(f" [{subject}] PŘESKAKUJI po 3 neúspěšných pokusech")
|
||||
try:
|
||||
page.goto(f"{BASE_URL}/report/patient_detail_report")
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
except Exception:
|
||||
pass
|
||||
continue
|
||||
|
||||
# Stáhnout notifikace pro tohoto subjekta
|
||||
if api_base and table1_data:
|
||||
@@ -196,8 +220,13 @@ def run(page, study):
|
||||
)
|
||||
total_notif += n
|
||||
|
||||
page.get_by_role("button", name="Clear").click()
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
try:
|
||||
page.get_by_role("button", name="Clear").click()
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
except Exception as e:
|
||||
print(f" [{subject}] Clear selhal: {e} — refresh")
|
||||
page.goto(f"{BASE_URL}/report/patient_detail_report")
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
|
||||
print(f" [{study}] Subject details hotovo. Nových notifikací: {total_notif}")
|
||||
|
||||
|
||||
+24
-292
@@ -2,23 +2,21 @@
|
||||
Kompletní pipeline:
|
||||
1. Stažení Subject Summary Reportů (obě studie)
|
||||
2. Stažení Subject Detail Reportů + notifikací (obě studie)
|
||||
3. Import do MySQL (summary, visits, notifikace)
|
||||
3. Import do MongoDB (subject_summary + visits + notifications)
|
||||
|
||||
Spusť tento skript místo samostatných skriptů.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import datetime
|
||||
import glob
|
||||
import re
|
||||
|
||||
from playwright.sync_api import sync_playwright
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
import db_config
|
||||
import mysql.connector
|
||||
import download_subject_details as dsd
|
||||
import import_to_mongo
|
||||
import import_notifications_to_mongo
|
||||
|
||||
# ── CONFIG ───────────────────────────────────────────────────────────────────
|
||||
BASE_URL = "https://janssen.4gclinical.com"
|
||||
@@ -72,6 +70,7 @@ def download_summary(page, study, today):
|
||||
# ── KROK 2: Subject Details ───────────────────────────────────────────────────
|
||||
|
||||
def get_subjects_from_summary(summary_path):
|
||||
import pandas as pd
|
||||
raw = pd.read_excel(summary_path, header=None)
|
||||
header_row = None
|
||||
for i, row in raw.iterrows():
|
||||
@@ -112,277 +111,7 @@ def download_details(page, study, summary_path, today):
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
|
||||
|
||||
# ── KROK 3: Import do MySQL ───────────────────────────────────────────────────
|
||||
|
||||
def get_conn():
    """Open and return a new MySQL connection built from ``db_config``."""
    connect_kwargs = dict(
        host=db_config.DB_HOST,
        port=db_config.DB_PORT,
        user=db_config.DB_USER,
        password=db_config.DB_PASSWORD,
        database=db_config.DB_NAME,
    )
    return mysql.connector.connect(**connect_kwargs)
|
||||
|
||||
|
||||
def _py(val):
    """Convert a NumPy scalar to a native Python type.

    Values that are not ``np.generic`` instances are returned unchanged.
    """
    if not isinstance(val, np.generic):
        return val
    return val.item()
|
||||
|
||||
|
||||
def to_date(val):
    """Coerce a spreadsheet cell into a ``datetime.date``, or ``None``.

    Missing values (``None``, NaN, anything ``pd.isna`` flags, and the texts
    "nat" / "nan" / "none" / "") give ``None``.  Timestamps and datetimes are
    reduced to their date part, dates pass through, and other values are
    stringified and tried against a fixed list of formats.  Text matching no
    format gives ``None``.
    """
    val = _py(val)
    if val is None:
        return None
    if isinstance(val, float) and val != val:
        return None
    try:
        if pd.isna(val):
            return None
    except (TypeError, ValueError):
        # Containers make pd.isna return an array with an ambiguous truth
        # value; they fall through to the remaining checks.
        pass

    # pd.Timestamp is a datetime.datetime subclass, so this covers both;
    # missing timestamps never get here because of the pd.isna guard above.
    if isinstance(val, datetime.datetime):
        return val.date()
    if isinstance(val, datetime.date):
        return val

    text = str(val).strip()
    if not text or text.lower() in ("nat", "nan", "none"):
        return None
    known_formats = ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S")
    for pattern in known_formats:
        try:
            parsed = datetime.datetime.strptime(text, pattern)
        except ValueError:
            continue
        return parsed.date()
    return None
|
||||
|
||||
|
||||
def to_int(val):
    """Coerce a cell value to ``int``, or ``None`` when that is not possible.

    The value is first unwrapped with ``_py`` and converted through ``float``
    so that numeric strings and floats such as ``"3.0"`` / ``3.0`` become
    ``3``.

    Returns ``None`` for ``None``, NaN, non-numeric input and infinite or
    out-of-range values.
    """
    val = _py(val)
    try:
        number = float(val)
        # NaN is the only float that is not equal to itself.
        return None if number != number else int(number)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf")) and float(<huge int>) both raise
        # it; treat those like any other unusable cell instead of crashing.
        return None
|
||||
|
||||
|
||||
def to_float(val):
    """Coerce a cell value to ``float``, or ``None`` for NaN / bad input.

    ``TypeError`` and ``ValueError`` raised by the conversion are swallowed
    and reported as ``None``.
    """
    val = _py(val)
    try:
        number = float(val)
    except (TypeError, ValueError):
        return None
    # NaN is the only float that is not equal to itself.
    if number != number:
        return None
    return float(number)
|
||||
|
||||
|
||||
def to_str(val):
    """Coerce a cell value to a stripped string, or ``None`` when empty.

    ``None``, float NaN and the texts "nan" / "nat" / "none" / ""
    (case-insensitive, after stripping) all map to ``None``.
    """
    val = _py(val)
    if val is None or (isinstance(val, float) and val != val):
        return None
    text = str(val).strip()
    placeholder = text.lower() in ("nan", "nat", "none", "")
    return None if placeholder else text
|
||||
|
||||
|
||||
def read_summary_df(path):
    """Load a subject summary workbook as a DataFrame with real headers.

    The header row is the first row containing a cell equal to "Subject".
    The sheet is then re-read with that row as header and fully empty rows
    are dropped.

    Raises:
        ValueError: if no row contains a "Subject" cell.
    """
    sheet = pd.read_excel(path, header=None)
    header_idx = next(
        (
            idx
            for idx, cells in sheet.iterrows()
            if "Subject" in [str(cell).strip() for cell in cells]
        ),
        None,
    )
    if header_idx is None:
        raise ValueError(f"Hlavičkový řádek nenalezen v {path}")
    table = pd.read_excel(path, header=header_idx)
    return table.dropna(how="all")
|
||||
|
||||
|
||||
def parse_detail_visits(path):
    """Extract the visit rows from a subject detail workbook.

    Reads the ``patient_detail_report`` sheet, locates the first row holding
    a "Visit Type" cell and treats everything below it as the visit table,
    addressed by column position.  Only rows whose first column is "Past" or
    "Upcoming" are kept.

    Returns a list of visit dicts, or an empty list when no header is found.
    """
    sheet = pd.read_excel(path, sheet_name="patient_detail_report", header=None)
    header_idx = next(
        (
            idx
            for idx, cells in sheet.iterrows()
            if "Visit Type" in [str(cell).strip() for cell in cells]
        ),
        None,
    )
    if header_idx is None:
        return []

    body = sheet.iloc[header_idx + 1:].copy()
    # Re-label columns 0..n-1 so fields can be fetched by position.
    body.columns = range(body.shape[1])

    # (output key, column position, converter) for every field after the type.
    layout = (
        ("scheduled_date", 1, to_date),
        ("window_days", 2, to_str),
        ("actual_date", 3, to_date),
        ("irt_transaction_no", 4, to_int),
        ("irt_transaction_description", 5, to_str),
        ("medication_assignment", 6, to_str),
        ("quantity_assigned", 7, to_int),
        ("medication_id", 8, to_str),
    )
    visits = []
    for _, rec in body.iterrows():
        kind = to_str(rec.get(0))
        if kind not in ("Past", "Upcoming"):
            continue
        entry = {"visit_type": kind}
        for key, position, convert in layout:
            entry[key] = convert(rec.get(position))
        visits.append(entry)
    return visits
|
||||
|
||||
|
||||
def insert_import(cursor, study, source_file):
    """Insert one ``iwrs_import`` row and return its id.

    Only the base name of ``source_file`` is stored; the row is stamped with
    the current local time.  Returns the cursor's ``lastrowid``.
    """
    statement = "INSERT INTO iwrs_import (study, imported_at, source_file) VALUES (%s, %s, %s)"
    file_name = os.path.basename(source_file)
    cursor.execute(statement, (study, datetime.datetime.now(), file_name))
    return cursor.lastrowid
|
||||
|
||||
|
||||
def insert_uco3001_summary(cursor, import_id, df):
    """Insert every row of a UCO3001 subject summary DataFrame.

    Each DataFrame row becomes one ``iwrs_uco3001_subject_summary`` row.
    Columns marked required in ``layout`` are read directly, so a missing
    one raises ``KeyError``; optional columns that are absent from ``df``
    are stored as NULL.
    """
    statement = """INSERT INTO iwrs_uco3001_subject_summary (
        import_id, subject, prior_subject_identifier, site, investigator, location,
        cohort_per_irt, informed_consent_date, adolescent_assent_date, age, weight,
        rescreened_subject, adt_ir, three_or_more_advanced_therapies,
        only_oral_5asa_compounds, ustekinumab, isolated_proctitis,
        clinical_responder_status_i12_m0, irt_subject_status,
        i0_rand_date_local, last_irt_transaction,
        last_irt_transaction_date_local, last_irt_transaction_date_utc,
        next_irt_transaction, next_irt_transaction_date_local,
        most_recent_med_assignment_date, days_since_last_med_assignment,
        patient_forecast_status, patient_forecast_status_changed_date
    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    # (spreadsheet header, converter, required?) in the statement's column
    # order, starting right after import_id.
    layout = (
        ("Subject", to_str, True),
        ("Prior Subject Identifier", to_str, False),
        ("Site", to_str, True),
        ("Investigator", to_str, True),
        ("Location", to_str, True),
        ("Cohort per IRT", to_str, True),
        ("Informed Consent Date", to_date, True),
        ("Adolescent Assent Date", to_date, False),
        ("Subject's age collection", to_int, True),
        ("Subject's weight collection", to_float, False),
        ("Rescreened Subject", to_str, False),
        ("ADT-IR", to_str, False),
        ("3 or More Advanced Therapies", to_str, False),
        ("Only Oral 5-ASA Compounds", to_str, False),
        ("Ustekinumab", to_str, False),
        ("Isolated Proctitis", to_str, False),
        ("Clinical Responder Status at I-12 / M-0", to_str, False),
        ("IRT Subject Status", to_str, True),
        ("I0_RAND_TIMESTAMP_LOCAL [Local]", to_date, False),
        ("Last Recorded IRT Transaction", to_str, True),
        ("Last Recorded IRT Transaction Date [Local]", to_date, True),
        ("Last Recorded IRT Transaction Date (UTC)", to_date, True),
        ("Next Expected IRT Transaction", to_str, True),
        ("Next Expected IRT Transaction Date [Local]", to_date, True),
        ("Most Recent Medication Assignment Transaction [Local]", to_date, False),
        ("Days Since Last Medication Assignment Transaction", to_int, False),
        ("Patient Forecast Status", to_str, False),
        ("Patient Forecast Status Changed Date (UTC)", to_date, False),
    )
    present = df.columns.tolist()
    for _, rec in df.iterrows():
        values = [import_id]
        for header, convert, required in layout:
            if required or header in present:
                values.append(convert(rec[header]))
            else:
                values.append(None)
        cursor.execute(statement, tuple(values))
|
||||
|
||||
|
||||
def insert_mdd3003_summary(cursor, import_id, df):
    """Insert one ``iwrs_mdd3003_subject_summary`` row per DataFrame row.

    Args:
        cursor: DB-API style cursor; ``execute(sql, params)`` is called once
            for every row of ``df``.
        import_id: Value written to the ``import_id`` column of every row.
        df: Subject summary report as a DataFrame. Columns marked as
            required in the layout below are indexed unconditionally;
            optional ones are stored as ``None`` when the report does not
            contain them.
    """
    insert_sql = """INSERT INTO iwrs_mdd3003_subject_summary (
        import_id, subject, prior_subject_identifier, site, investigator, location,
        cohort_per_irt, madrs_criteria_integrated, informed_consent_date, age,
        madrs_criteria_v15, madrs_criteria_v16, madrs_criteria_v17,
        stratification_country, age_group, stable_remitters, irt_subject_status,
        last_irt_transaction, last_irt_transaction_date_local,
        last_irt_transaction_date_utc, next_irt_transaction,
        next_irt_transaction_date_local, date_screened, date_screen_failed,
        date_randomized_part1, date_early_withdraw_randomized_part1,
        date_open_label_induction, date_early_withdraw_open_label_induction,
        date_randomized_part2, date_early_withdraw_randomized_part2,
        date_completed, date_unblinded
    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""

    required, optional = True, False
    # (converter, report column, required?) in the same order as the SQL
    # column list above, starting right after import_id.
    layout = (
        (to_str, "Subject", required),
        (to_str, "Prior Subject Identifier", optional),
        (to_str, "Site", required),
        (to_str, "Investigator", required),
        (to_str, "Location", required),
        (to_str, "Cohort per IRT", required),
        (to_str, "MADRS response criteria integrated or manually entered", optional),
        (to_date, "Informed Consent Date", required),
        (to_int, "Subject's age collection", required),
        (to_str, "MADRS response criteria v1.5 from RAVE", optional),
        (to_str, "MADRS response criteria v1.6 from RAVE", optional),
        (to_str, "MADRS response criteria v1.7 from RAVE", optional),
        (to_str, "Stratification Country", optional),
        (to_str, "Age Group", optional),
        (to_str, "Stable Remitters vs. Non Stable Remitters", optional),
        (to_str, "IRT Subject Status", required),
        (to_str, "Last Recorded IRT Transaction", required),
        (to_date, "Last Recorded IRT Transaction Date [Local]", required),
        (to_date, "Last Recorded IRT Transaction Date (UTC)", required),
        (to_str, "Next Expected IRT Transaction", required),
        (to_date, "Next Expected IRT Transaction Date [Local]", required),
        (to_date, "Date Screened [Local]", optional),
        (to_date, "Date Screen Failed [Local]", optional),
        (to_date, "Date Randomized Part 1 [Local]", optional),
        (to_date, "Date Early Withdraw Randomized Part 1 [Local]", optional),
        (to_date, "Date Open Label Induction [Local]", optional),
        (to_date, "Date Early Withdraw Open Label Induction [Local]", optional),
        (to_date, "Date Randomized Part 2 [Local]", optional),
        (to_date, "Date Early Withdraw Randomized Part 2 [Local]", optional),
        (to_date, "Date Completed [Local]", optional),
        (to_date, "Date Unblinded [Local]", optional),
    )

    available = df.columns.tolist()

    for _, row in df.iterrows():
        params = [import_id]
        for convert, column, must_exist in layout:
            if must_exist or column in available:
                params.append(convert(row[column]))
            else:
                # Optional column absent from this report -> NULL.
                params.append(None)
        cursor.execute(insert_sql, tuple(params))
|
||||
|
||||
|
||||
def insert_visits(cursor, import_id, study, subject, visits):
    """Insert the parsed visit rows of one subject into ``iwrs_subject_visits``.

    Args:
        cursor: DB-API style cursor; ``execute(sql, params)`` is called once
            per visit.
        import_id: Value written to the ``import_id`` column.
        study: Value written to the ``study`` column.
        subject: Value written to the ``subject`` column.
        visits: Iterable of mappings providing every key listed in
            ``visit_keys`` below. A falsy value (``None``, empty list)
            results in no database calls.
    """
    insert_sql = """INSERT INTO iwrs_subject_visits (
        import_id, study, subject, visit_type, scheduled_date, window_days,
        actual_date, irt_transaction_no, irt_transaction_description,
        medication_assignment, quantity_assigned, medication_id
    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""

    # Per-visit keys, ordered like the SQL column list after `subject`.
    visit_keys = (
        "visit_type",
        "scheduled_date",
        "window_days",
        "actual_date",
        "irt_transaction_no",
        "irt_transaction_description",
        "medication_assignment",
        "quantity_assigned",
        "medication_id",
    )
    row_prefix = (import_id, study, subject)

    for visit in visits or ():
        visit_values = tuple(visit[key] for key in visit_keys)
        cursor.execute(insert_sql, row_prefix + visit_values)
|
||||
|
||||
|
||||
def import_to_mysql(summary_path, detail_files, study):
    """Import one study's subject summary and visit details into MySQL.

    All inserts run on a single connection and are committed together. If
    any step fails, the transaction is rolled back, the cursor and the
    connection are closed, and the original exception is re-raised to the
    caller.

    Args:
        summary_path: Path of the subject summary report; read with
            ``read_summary_df`` and recorded via ``insert_import``.
        detail_files: Paths of "Subject Detail" workbooks. The subject is
            the third whitespace-separated token of the file name
            (``<date> <study> <subject> Subject Detail.xlsx``); files that
            do not match are stored under the subject ``"UNKNOWN"``.
        study: Study identifier. ``"77242113UCO3001"`` uses the UCO3001
            summary layout; every other value uses the MDD3003 layout.

    Returns:
        The ``import_id`` returned by ``insert_import``.
    """
    print(f"\n [MySQL] Importuji {study}...")
    df_summary = read_summary_df(summary_path)

    # Compiled once; the same pattern is applied to every detail file name.
    detail_name_re = re.compile(r"\d{4}-\d{2}-\d{2} \S+ (\S+) Subject Detail\.xlsx")

    conn = get_conn()
    try:
        cursor = conn.cursor()
        try:
            import_id = insert_import(cursor, study, summary_path)

            if study == "77242113UCO3001":
                insert_uco3001_summary(cursor, import_id, df_summary)
            else:
                insert_mdd3003_summary(cursor, import_id, df_summary)

            total_visits = 0
            for path in detail_files:
                m = detail_name_re.search(os.path.basename(path))
                subject = m.group(1) if m else "UNKNOWN"
                visits = parse_detail_visits(path)
                insert_visits(cursor, import_id, study, subject, visits)
                total_visits += len(visits)

            conn.commit()
        except Exception:
            # Do not leave a half-written import behind; the caller still
            # sees the original error.
            conn.rollback()
            raise
        finally:
            cursor.close()
    finally:
        # Always release the connection, also on failure.
        conn.close()

    print(f" [MySQL] import_id={import_id} | pacientů={len(df_summary)} | transakcí={total_visits}")
    return import_id
|
||||
|
||||
|
||||
# ── MAIN ─────────────────────────────────────────────────────────────────────
|
||||
# ── KROK 3: Import do MongoDB ────────────────────────────────────────────────
|
||||
|
||||
def main():
|
||||
today = datetime.date.today().strftime("%Y-%m-%d")
|
||||
@@ -391,12 +120,12 @@ def main():
|
||||
|
||||
summary_paths = {}
|
||||
|
||||
# ── Krok 1 + 2: stahování (Playwright, každá studie zvlášť kvůli session) ──
|
||||
# Krok 1 + 2: stahování (Playwright, každá studie zvlášť kvůli session)
|
||||
with sync_playwright() as p:
|
||||
for study in STUDIES:
|
||||
print(f"\n{'='*60}")
|
||||
print("\n" + "=" * 60)
|
||||
print(f"[{study}] KROK 1: Subject Summary Report")
|
||||
print(f"{'='*60}")
|
||||
print("=" * 60)
|
||||
browser = p.chromium.launch(headless=False)
|
||||
context = browser.new_context(accept_downloads=True)
|
||||
page = context.new_page()
|
||||
@@ -415,10 +144,10 @@ def main():
|
||||
finally:
|
||||
browser.close()
|
||||
|
||||
# ── Krok 3: import do MySQL ──────────────────────────────────────────────
|
||||
print(f"\n{'='*60}")
|
||||
print("KROK 3: Import do MySQL")
|
||||
print(f"{'='*60}")
|
||||
# Krok 3: import do MongoDB
|
||||
print("\n" + "=" * 60)
|
||||
print("KROK 3: Import do MongoDB")
|
||||
print("=" * 60)
|
||||
|
||||
for study in STUDIES:
|
||||
summary_path = summary_paths.get(study)
|
||||
@@ -426,18 +155,21 @@ def main():
|
||||
print(f" [{study}] PŘESKOČENO — stahování selhalo")
|
||||
continue
|
||||
|
||||
detail_files = sorted(glob.glob(
|
||||
os.path.join(DETAILS_DIR, study, f"{today} {study} * Subject Detail.xlsx")
|
||||
))
|
||||
|
||||
try:
|
||||
import_to_mysql(summary_path, detail_files, study)
|
||||
import_to_mongo.run(study, summary_path, DETAILS_DIR, today)
|
||||
except Exception as e:
|
||||
print(f" [{study}] CHYBA při importu: {e}")
|
||||
print(f" [{study}] CHYBA při importu summary/visits: {e}")
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
# Notifikace: PDF/JSON z disku rovnou do Mongo iwrs_notifications
|
||||
print("\n [notifikace] import PDF/JSON do Mongo...")
|
||||
try:
|
||||
import_notifications_to_mongo.main(STUDIES)
|
||||
except Exception as e:
|
||||
print(f" CHYBA při importu notifikací: {e}")
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("Vše hotovo.")
|
||||
print(f"{'='*60}")
|
||||
print("=" * 60)
|
||||
|
||||
|
||||
main()
|
||||
|
||||
Reference in New Issue
Block a user