# janssen/IWRS/Drugs/run_all.py

"""
Kompletní pipeline pro Drugs:
  1. Onsite inventory detail    (per site, vždy přepisuje)
  2. IP destruction             (per košík, přeskočí již existující soubory)
  3. Shipments report           (jeden soubor na studii, přepisuje)
  4. Shipment details           (per zásilka CZ, vždy přepisuje)
  5. Import do MySQL

Spusť tento skript — zpracuje obě studie automaticky.
"""

import os
import glob
import re
import datetime

import numpy as np
import pandas as pd
from playwright.sync_api import sync_playwright
import mysql.connector

import db_config

# Root URL of the IWRS web application that every report page hangs off.
BASE_URL = "https://janssen.4gclinical.com"

# SECURITY: credentials should not live in source control. Set the
# IWRS_EMAIL / IWRS_PASSWORD environment variables to override them; the
# literals below are kept only as a backward-compatible fallback and should
# be rotated and removed from the file.
EMAIL    = os.environ.get("IWRS_EMAIL", "vbuzalka@its.jnj.com")
PASSWORD = os.environ.get("IWRS_PASSWORD", "Vlado123++-+")

# Site identifiers whose onsite inventory is downloaded, keyed by study code.
SITES = {
    "77242113UCO3001": [
        "DD5-CZ10001", "DD5-CZ10003", "DD5-CZ10006", "DD5-CZ10009",
        "DD5-CZ10010", "DD5-CZ10012", "DD5-CZ10013", "DD5-CZ10015",
        "DD5-CZ10016", "DD5-CZ10020", "DD5-CZ10021", "DD5-CZ10022",
    ],
    "42847922MDD3003": [
        "S10-CZ10002", "S10-CZ10004", "S10-CZ10005",
        "S10-CZ10008", "S10-CZ10011", "S10-CZ10012",
    ],
}

# Study codes to process, in the order they are declared in SITES.
STUDIES = list(SITES)

# Absolute directory containing this script; all XLSX folders are created under it.
BASE_DIR = os.path.split(os.path.abspath(__file__))[0]


# ── type converters ──────────────────────────────────────────────────────────

def _py(val):
    """Unwrap a NumPy scalar into the equivalent built-in Python object.

    Any value that is not an ``np.generic`` instance is returned unchanged.
    """
    return val.item() if isinstance(val, np.generic) else val

def to_date(val):
    """Convert *val* to a ``datetime.date``, or ``None`` when it has no usable date.

    Accepts NumPy scalars (unwrapped via ``_py``), pandas timestamps,
    ``datetime``/``date`` objects and strings in one of the formats
    ``%Y-%m-%d``, ``%d-%b-%Y``, ``%d-%m-%Y`` or ``%Y-%m-%d %H:%M:%S``.
    Missing values (``None``, NaN, NaT, the strings "nat"/"nan"/"none"/"")
    and strings matching none of the formats yield ``None``.
    """
    value = _py(val)
    if value is None or (isinstance(value, float) and value != value):
        return None
    try:
        if pd.isna(value):
            return None
    except (TypeError, ValueError):
        # pd.isna on a non-scalar gives an array whose truth value is ambiguous
        pass

    # pd.Timestamp subclasses datetime.datetime, and NaT has already been
    # rejected above, so a single branch covers both.
    if isinstance(value, datetime.datetime):
        return value.date()
    if isinstance(value, datetime.date):
        return value

    text = str(value).strip()
    if text.lower() in ("", "nat", "nan", "none"):
        return None
    for pattern in ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S"):
        try:
            parsed = datetime.datetime.strptime(text, pattern)
        except ValueError:
            continue
        return parsed.date()
    return None

def to_int(val):
    """Convert *val* to a built-in ``int``, or ``None`` when that is impossible.

    The value is unwrapped with ``_py`` and passed through ``float`` first, so
    numeric strings and floats such as ``3.0`` are accepted; ``int`` then
    truncates any fractional part.

    Returns ``None`` for ``None``, NaN, infinities, and anything ``float``
    cannot convert.
    """
    val = _py(val)
    try:
        v = float(val)
        # NaN is the only float that is not equal to itself
        return None if (v != v) else int(v)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int(float("inf")), or an int too large for float()
        return None

def to_str(val):
    """Return *val* as a stripped string, or ``None`` for missing values.

    ``None``, float NaN and values whose stripped text is empty or equals
    "nan", "nat" or "none" (case-insensitive) all map to ``None``.
    """
    value = _py(val)
    is_nan_float = isinstance(value, float) and value != value
    if value is None or is_nan_float:
        return None
    text = str(value).strip()
    if text.lower() in ("nan", "nat", "none", ""):
        return None
    return text


# ── DB helpers ───────────────────────────────────────────────────────────────

def get_conn():
    """Open and return a MySQL connection configured from ``db_config``."""
    settings = {
        "host": db_config.DB_HOST,
        "port": db_config.DB_PORT,
        "user": db_config.DB_USER,
        "password": db_config.DB_PASSWORD,
        "database": db_config.DB_NAME,
    }
    return mysql.connector.connect(**settings)

def insert_import(cursor, study, source_label):
    """Insert one ``iwrs_import`` row for *study* and return its row id.

    The row is stamped with the current local time and the fixed report
    type ``"drugs"``; the returned value is ``cursor.lastrowid``.
    """
    params = (study, datetime.datetime.now(), source_label, "drugs")
    cursor.execute(
        "INSERT INTO iwrs_import (study, imported_at, source_file, report_type) VALUES (%s, %s, %s, %s)",
        params,
    )
    return cursor.lastrowid

def basket_already_imported(cursor, study, basket_id):
    """Return ``True`` if ``iwrs_destruction`` already holds a row for this basket.

    *basket_id* is converted to ``str`` before being bound to the query.
    """
    query = "SELECT 1 FROM iwrs_destruction WHERE study=%s AND basket_id=%s LIMIT 1"
    cursor.execute(query, (study, str(basket_id)))
    hit = cursor.fetchone()
    return hit is not None


# ── parsery ──────────────────────────────────────────────────────────────────

def parse_shipments_report(study):
    """Parse the downloaded shipments report of *study* into a list of dicts.

    The workbook is read from ``xls_shipments_<study>/shipments_report_<study>.xlsx``
    under ``BASE_DIR``. The header row is the first row containing a
    "Shipment ID" cell; only rows whose "Location" contains "Czech"
    (case-insensitive) are kept.

    Returns an empty list when the file is missing (a message is printed)
    or when no header row is found.
    """
    path = os.path.join(BASE_DIR, f"xls_shipments_{study}", f"shipments_report_{study}.xlsx")
    if not os.path.exists(path):
        print(f"  CHYBÍ: {path}")
        return []

    # First pass without a header, just to find where the real table starts.
    sheet = pd.read_excel(path, header=None)
    header_row = next(
        (idx for idx, cells in sheet.iterrows()
         if "Shipment ID" in [str(cell).strip() for cell in cells]),
        None,
    )
    if header_row is None:
        return []

    df = pd.read_excel(path, header=header_row).dropna(how="all")
    is_czech = df["Location"].astype(str).str.contains("Czech", na=False, case=False)
    df = df[is_czech]

    # (output key, spreadsheet column, converter, required?)
    # Required columns are indexed directly; optional ones give None when absent.
    fields = (
        ("shipment_id",          "Shipment ID",          to_str,  True),
        ("status",               "IRT Shipment Status",  to_str,  True),
        ("type",                 "Type",                 to_str,  True),
        ("ship_from",            "Shipment From",        to_str,  True),
        ("ship_to_site",         "Ship To:",             to_str,  True),
        ("location",             "Location",             to_str,  True),
        ("request_date",         "Request Date",         to_date, True),
        ("shipped_date",         "Shipped Date",         to_date, True),
        ("received_date",        "Received Date",        to_date, False),
        ("received_by",          "Received by",          to_str,  False),
        ("delivered_date_utc",   "Delivered Date [UTC]", to_date, False),
        ("delivery_recipient",   "Delivery Recipient",   to_str,  False),
        ("delivery_details",     "Delivery Details",     to_str,  False),
        ("cancelled_date",       "Cancelled Date",       to_date, False),
        ("total_medication_ids", "Total Medication IDs", to_int,  False),
        ("tracking_no",          "Tracking #",           to_str,  False),
        ("shipping_category",    "Shipping Category",    to_str,  False),
        ("expected_arrival",     "Expected Arrival",     to_date, False),
    )
    available = df.columns.tolist()

    rows = []
    for _, record in df.iterrows():
        rows.append({
            key: convert(record[column]) if (required or column in available) else None
            for key, column, convert, required in fields
        })
    return rows


def parse_shipment_details(study):
    """Parse every ``shipment_details_*.xlsx`` file of *study* into item dicts.

    Files are read from ``xls_shipment_details_<study>`` under ``BASE_DIR``
    in sorted order. The shipment id is taken from the file name; the header
    row is the first row containing a "Medication ID" cell, and files
    without one are skipped.
    """
    detail_dir = os.path.join(BASE_DIR, f"xls_shipment_details_{study}")
    pattern = os.path.join(detail_dir, "shipment_details_*.xlsx")

    rows = []
    for path in sorted(glob.glob(pattern)):
        match = re.search(r"shipment_details_(.+)\.xlsx", os.path.basename(path))
        shipment_id = "UNKNOWN" if not match else match.group(1)

        sheet = pd.read_excel(path, header=None)
        header_row = next(
            (idx for idx, cells in sheet.iterrows()
             if "Medication ID" in [str(cell).strip() for cell in cells]),
            None,
        )
        if header_row is None:
            continue

        table = pd.read_excel(path, header=header_row).dropna(how="all")
        for _, record in table.iterrows():
            # The description/type columns appear under two alternative names.
            description = to_str(record.get("Medication Description"))
            if not description:
                description = to_str(record.get("Medication ID Description"))
            kind = to_str(record.get("Medication type"))
            if not kind:
                kind = to_str(record.get("Medication ID type"))

            rows.append({
                "shipment_id":             shipment_id,
                "destination_location":    to_str(record.get("Destination Location")),
                "shipment_status":         to_str(record.get("IRT Shipment Status")),
                "shipment_type":           to_str(record.get("Type")),
                "destination_site":        to_str(record.get("Destination Site")),
                "investigator":            to_str(record.get("Investigator")),
                "medication_description":  description,
                "medication_type":         kind,
                "medication_id":           to_str(record.get("Medication ID")),
                "packaged_lot_no":         to_str(record.get("Packaged Lot number")),
                "packaged_lot_description": to_str(record.get("Packaged Lot description")),
                "container_id":            to_str(record.get("Container ID")),
                "quantity":                to_int(record.get("Quantity of Medication IDs")),
                "expiration_date":         to_date(record.get("Expiration Date")),
                "item_status":             to_str(record.get("Status")),
            })
    return rows


def parse_inventory(study):
    """Parse every onsite-inventory workbook of *study* into row dicts.

    Files matching ``onsite_inventory_detail_*.xlsx`` are read from
    ``xls_reports_<study>`` under ``BASE_DIR`` in sorted order. For each file
    the first column is scanned for "Site:", "Investigator:" and "Location:"
    lines (metadata copied onto every row of that file) and for the header
    row, which is the first row whose first cell is "Medication" or
    "Medication ID". Files without a header row are skipped.

    Returns:
        A list of dicts, one per data row, with values normalised through
        ``to_str`` / ``to_int`` / ``to_date``.
    """
    inv_dir = os.path.join(BASE_DIR, f"xls_reports_{study}")
    files = sorted(glob.glob(os.path.join(inv_dir, "onsite_inventory_detail_*.xlsx")))
    meta_prefixes = (
        ("Site:", "site"),
        ("Investigator:", "investigator"),
        ("Location:", "location"),
    )
    rows = []
    for path in files:
        raw = pd.read_excel(path, header=None)
        meta = {"site": None, "investigator": None, "location": None}
        header_row = None
        for i, row in raw.iterrows():
            first = str(row.iloc[0]).strip() if pd.notna(row.iloc[0]) else ""
            for prefix, key in meta_prefixes:
                if first.startswith(prefix):
                    # Strip only the leading prefix; str.replace would also
                    # remove the same text if it occurred inside the value.
                    meta[key] = first[len(prefix):].strip()
                    break
            if first in ("Medication", "Medication ID") and header_row is None:
                header_row = i
        if header_row is None:
            continue
        df = pd.read_excel(path, header=header_row).dropna(how="all")
        # The first column is named "Medication" or "Medication ID" depending
        # on the file, so give it one stable name.
        df = df.rename(columns={df.columns[0]: "medication_id"})
        for _, r in df.iterrows():
            # Convert first, then fall back: a NaN cell is truthy, so `or` on
            # the raw values would never reach the alternate column name.
            dispensing_date = to_date(r.get("Dispensing date"))
            if dispensing_date is None:
                dispensing_date = to_date(r.get("Dispensing Date"))
            rows.append({
                "site":                    meta["site"],
                "investigator":            meta["investigator"],
                "location":                meta["location"],
                "medication_id":           to_str(r["medication_id"]),
                "packaged_lot_no":         to_str(r.get("Packaged Lot number")),
                "original_expiration_date": to_date(r.get("Original Expiration Date when Packaged Lot was Added")),
                "expiration_date":         to_date(r.get("Expiration date")),
                "received_date":           to_date(r.get("Received Date")),
                "receipt_user":            to_str(r.get("Shipment Receipt User")),
                "subject_identifier":      to_str(r.get("Subject Identifier")),
                "quantity_assigned":       to_int(r.get("Quantity Assigned")),
                "irt_transaction":         to_str(r.get("IRT Transaction")),
                "date_assigned":           to_date(r.get("Date Assigned")),
                "assignment_user":         to_str(r.get("Assignment User")),
                "dispensation_status":     to_str(r.get("Dispensation Status")),
                "dispensing_date":         dispensing_date,
                "quantity_dispensed":      to_int(r.get("Quantity Dispensed")),
                "dispensing_user":         to_str(r.get("Dispensing User")),
                "quantity_returned":       to_int(r.get("Quantity Returned")),
                "date_returned":           to_date(r.get("Date Returned")),
                "return_user":             to_str(r.get("Return User")),
            })
    return rows


def parse_destruction_files(study):
    """Parse every IP-destruction basket workbook of *study*.

    Files matching ``ip_destruction_basket_*.xlsx`` are read from
    ``xls_ip_destruction_<study>`` under ``BASE_DIR`` in sorted order. Basket
    metadata comes from labelled lines in the first column; the header row is
    the first row whose first cell is "Medication ID Description". Files
    without a header row are skipped.

    Returns:
        A list of basket dicts, each carrying its metadata and an ``items``
        list with one dict per medication row.
    """
    dest_dir = os.path.join(BASE_DIR, f"xls_ip_destruction_{study}")
    pattern = os.path.join(dest_dir, "ip_destruction_basket_*.xlsx")

    # label in the sheet -> key in the collected metadata
    labels = (
        ("Investigator Name:", "investigator"),
        ("Site ID:", "site_id"),
        ("Location:", "location"),
        ("Basket ID:", "basket_id"),
        ("Drug Destruction Created Date:", "destruction_date"),
    )

    baskets = []
    for path in sorted(glob.glob(pattern)):
        sheet = pd.read_excel(path, header=None)
        meta = {}
        header_row = None
        for idx, cells in sheet.iterrows():
            head = cells.iloc[0]
            first = "" if not pd.notna(head) else str(head).strip()
            meta.update({
                attr: first.replace(label, "").strip()
                for label, attr in labels
                if first.startswith(label)
            })
            if header_row is None and first == "Medication ID Description":
                header_row = idx
        if header_row is None:
            continue

        table = pd.read_excel(path, header=header_row).dropna(how="all")
        items = [
            {
                "medication_description":   to_str(record.get("Medication ID Description")),
                "medication_id":            to_str(record.get("Medication ID")),
                "packaged_lot_description": to_str(record.get("Packaged Lot description")),
                "comments":                 to_str(record.get("Comments")),
            }
            for _, record in table.iterrows()
        ]
        baskets.append({
            "site_id":          meta.get("site_id"),
            "investigator":     meta.get("investigator"),
            "location":         meta.get("location"),
            "basket_id":        meta.get("basket_id"),
            "destruction_date": to_date(meta.get("destruction_date")),
            "items":            items,
        })
    return baskets


# ── insertery ────────────────────────────────────────────────────────────────

def insert_shipments(cursor, import_id, study, rows):
    """Insert one ``iwrs_shipments`` row per dict in *rows*.

    Each dict must provide every key listed in ``columns``; *import_id* and
    *study* are prepended to the bound parameters of every row.
    """
    columns = (
        "shipment_id", "status", "type", "ship_from", "ship_to_site",
        "location", "request_date", "shipped_date", "received_date",
        "received_by", "delivered_date_utc", "delivery_recipient",
        "delivery_details", "cancelled_date", "total_medication_ids",
        "tracking_no", "shipping_category", "expected_arrival",
    )
    sql = """INSERT INTO iwrs_shipments
        (import_id, study, shipment_id, status, type, ship_from, ship_to_site,
         location, request_date, shipped_date, received_date, received_by,
         delivered_date_utc, delivery_recipient, delivery_details, cancelled_date,
         total_medication_ids, tracking_no, shipping_category, expected_arrival)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    for record in rows:
        cursor.execute(sql, (import_id, study, *[record[name] for name in columns]))


def insert_shipment_items(cursor, import_id, study, rows):
    """Insert one ``iwrs_shipment_items`` row per dict in *rows*.

    Each dict must provide every key listed in ``columns``; *import_id* and
    *study* are prepended to the bound parameters of every row.
    """
    columns = (
        "shipment_id", "destination_location", "shipment_status",
        "shipment_type", "destination_site", "investigator",
        "medication_description", "medication_type", "medication_id",
        "packaged_lot_no", "packaged_lot_description", "container_id",
        "quantity", "expiration_date", "item_status",
    )
    sql = """INSERT INTO iwrs_shipment_items
        (import_id, study, shipment_id, destination_location, shipment_status,
         shipment_type, destination_site, investigator, medication_description,
         medication_type, medication_id, packaged_lot_no, packaged_lot_description,
         container_id, quantity, expiration_date, item_status)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    for record in rows:
        cursor.execute(sql, (import_id, study, *[record[name] for name in columns]))


def insert_inventory(cursor, import_id, study, rows):
    """Insert one ``iwrs_inventory`` row per dict in *rows*.

    Each dict must provide every key listed in ``columns``; *import_id* and
    *study* are prepended to the bound parameters of every row.
    """
    columns = (
        "site", "investigator", "location", "medication_id",
        "packaged_lot_no", "original_expiration_date", "expiration_date",
        "received_date", "receipt_user", "subject_identifier",
        "quantity_assigned", "irt_transaction", "date_assigned",
        "assignment_user", "dispensation_status", "dispensing_date",
        "quantity_dispensed", "dispensing_user", "quantity_returned",
        "date_returned", "return_user",
    )
    sql = """INSERT INTO iwrs_inventory
        (import_id, study, site, investigator, location, medication_id,
         packaged_lot_no, original_expiration_date, expiration_date, received_date,
         receipt_user, subject_identifier, quantity_assigned, irt_transaction,
         date_assigned, assignment_user, dispensation_status, dispensing_date,
         quantity_dispensed, dispensing_user, quantity_returned, date_returned, return_user)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    for record in rows:
        cursor.execute(sql, (import_id, study, *[record[name] for name in columns]))


def insert_destruction(cursor, study, baskets):
    """Insert the items of every basket that is not yet in ``iwrs_destruction``.

    A basket for which ``basket_already_imported`` reports a hit is skipped
    as a whole; otherwise one ``INSERT IGNORE`` is executed per item.

    Returns:
        ``(imported, skipped)``: the number of item inserts executed and the
        number of baskets skipped.
    """
    basket_fields = ("site_id", "investigator", "location", "basket_id", "destruction_date")
    item_fields = ("medication_description", "medication_id",
                   "packaged_lot_description", "comments")
    sql = """INSERT IGNORE INTO iwrs_destruction
        (study, site_id, investigator, location, basket_id, destruction_date,
         medication_description, medication_id, packaged_lot_description, comments)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""

    imported = 0
    skipped = 0
    for basket in baskets:
        if basket_already_imported(cursor, study, basket["basket_id"]):
            skipped += 1
            continue
        for item in basket["items"]:
            params = (
                study,
                *[basket[name] for name in basket_fields],
                *[item[name] for name in item_fields],
            )
            cursor.execute(sql, params)
            imported += 1
    return imported, skipped


def import_study(study):
    """Parse all downloaded files of *study* and load them into MySQL.

    The four parsers run first; then a single connection inserts the
    ``iwrs_import`` header row, the shipments, shipment items, inventory and
    destruction baskets, and commits once at the end.

    Raises:
        Any exception raised by the parsers or the database layer. On a
        database error the transaction is rolled back before re-raising, and
        the cursor and connection are always closed.
    """
    print(f"\n  Parsování dat pro {study}...")
    shipments = parse_shipments_report(study)
    items     = parse_shipment_details(study)
    inventory = parse_inventory(study)
    baskets   = parse_destruction_files(study)
    print(f"  Zásilky: {len(shipments)}  |  Položky: {len(items)}  |  Sklad: {len(inventory)}  |  Destrukce: {len(baskets)} košíků")

    conn = get_conn()
    try:
        cursor = conn.cursor()
        try:
            import_id = insert_import(cursor, study, f"drugs_{study}")
            print(f"  import_id = {import_id}")
            insert_shipments(cursor, import_id, study, shipments)
            insert_shipment_items(cursor, import_id, study, items)
            insert_inventory(cursor, import_id, study, inventory)
            dest_imported, dest_skipped = insert_destruction(cursor, study, baskets)
            conn.commit()
        except Exception:
            # Do not leave a half-written import behind; the caller decides
            # how to report the failure.
            conn.rollback()
            raise
        finally:
            cursor.close()
    finally:
        conn.close()
    print(f"  Destrukce: {dest_imported} nových | {dest_skipped} košíků přeskočeno")


# ── login ────────────────────────────────────────────────────────────────────

def login(page, study):
    """Sign in to the IWRS site on *page* and select *study*.

    Opens ``BASE_URL``, submits the e-mail/password form, then picks the
    study in the study selector and confirms with the SELECT button, waiting
    for the network to go idle after each navigation step.
    """
    page.goto(BASE_URL)
    page.wait_for_load_state("networkidle")

    # credentials form
    for label, value in (("Email *", EMAIL), ("Password *", PASSWORD)):
        page.get_by_label(label).fill(value)
    page.locator("#login__submit").click()
    page.wait_for_load_state("networkidle")

    # study selection
    study_selector = page.get_by_label("Study *")
    study_selector.click()
    page.get_by_role("option", name=study).click()
    page.get_by_role("button", name="SELECT").click()
    page.wait_for_load_state("networkidle")


# ── download funkce ──────────────────────────────────────────────────────────

def download_inventory(page, study):
    """Download the onsite inventory detail XLS for every site of *study*.

    One file per entry of ``SITES[study]`` is saved as
    ``xls_reports_<study>/onsite_inventory_detail_<site>.xlsx`` under
    ``BASE_DIR``; an existing file with that name is overwritten by the save.
    """
    out_dir = os.path.join(BASE_DIR, f"xls_reports_{study}")
    os.makedirs(out_dir, exist_ok=True)

    page.goto(f"{BASE_URL}/report/onsite_inventory_detail")
    page.wait_for_load_state("networkidle", timeout=120000)

    # Locators are resolved again on every action, so they can be built once.
    site_picker = page.locator('input[placeholder="search"], input[type="text"]').first
    download_button = page.get_by_role("button", name="Download XLS")
    clear_button = page.get_by_role("button", name="Clear")

    site_ids = SITES[study]
    for site_id in site_ids:
        print(f"    [{site_id}] inventory...")
        site_picker.click()
        page.get_by_role("option", name=site_id).click()
        page.wait_for_load_state("networkidle", timeout=120000)

        target = os.path.join(out_dir, f"onsite_inventory_detail_{site_id}.xlsx")
        with page.expect_download(timeout=120000) as pending:
            download_button.click()
        pending.value.save_as(target)

        # reset the filter before choosing the next site
        clear_button.click()
        page.wait_for_load_state("networkidle", timeout=120000)
    print(f"    Inventory OK ({len(site_ids)} center)")


def download_destruction(page, study):
    """Download the IP destruction form XLS for every basket not yet on disk.

    The basket list is read from the options offered by the report's search
    box. A basket whose ``ip_destruction_basket_<basket>.xlsx`` file already
    exists in ``xls_ip_destruction_<study>`` is skipped.
    """
    out_dir = os.path.join(BASE_DIR, f"xls_ip_destruction_{study}")
    os.makedirs(out_dir, exist_ok=True)

    page.goto(f"{BASE_URL}/report/ip_destruction_form")
    page.wait_for_load_state("networkidle", timeout=120000)

    options = page.locator("mat-option")

    # Open the dropdown once just to read which baskets exist.
    page.locator('input[placeholder="search"], input[type="text"]').first.click()
    page.wait_for_timeout(1000)
    baskets = []
    for text in options.all_inner_texts():
        label = text.strip()
        if label and label != "No results found":
            baskets.append(label)
    page.keyboard.press("Escape")
    page.wait_for_timeout(500)

    if not baskets:
        print("    Žádné destruction košíky")
        return

    new_count = 0
    for basket in baskets:
        target = os.path.join(out_dir, f"ip_destruction_basket_{basket}.xlsx")
        if os.path.exists(target):
            # destruction records do not change, so skip what is already there
            continue
        print(f"    [košík {basket}] stahování...")
        search_box = page.locator('input[placeholder="search"], input[type="text"]').first
        search_box.click()
        search_box.fill(basket)
        page.wait_for_timeout(500)
        options.first.dispatch_event("click")
        page.wait_for_load_state("networkidle", timeout=120000)

        with page.expect_download(timeout=120000) as pending:
            page.get_by_role("button", name="Download XLS").click()
        pending.value.save_as(target)
        new_count += 1

        page.get_by_role("button", name="Clear").click()
        page.wait_for_load_state("networkidle", timeout=120000)

    skipped = len(baskets) - new_count
    print(f"    Destruction OK ({new_count} nových, {skipped} přeskočeno)")


def download_shipments_report(page, study):
    """Download the shipments report XLS of *study*.

    The file is saved as ``xls_shipments_<study>/shipments_report_<study>.xlsx``
    under ``BASE_DIR``.
    """
    out_dir = os.path.join(BASE_DIR, f"xls_shipments_{study}")
    os.makedirs(out_dir, exist_ok=True)
    target = os.path.join(out_dir, f"shipments_report_{study}.xlsx")

    page.goto(f"{BASE_URL}/report/shipments_report")
    page.wait_for_load_state("networkidle", timeout=120000)

    download_button = page.get_by_role("button", name="Download XLS")
    with page.expect_download(timeout=120000) as pending:
        download_button.click()
    pending.value.save_as(target)
    print("    Shipments report OK")


def download_shipment_details(page, study):
    """Download the shipment details XLS for every Czech shipment of *study*.

    Shipment ids are read from the shipments report already saved at
    ``xls_shipments_<study>/shipments_report_<study>.xlsx`` (rows whose
    "Location" contains "Czech", case-insensitive). Each shipment is saved as
    ``xls_shipment_details_<study>/shipment_details_<id>.xlsx``.

    Raises:
        ValueError: if the report has no row containing a "Shipment ID" cell.
        Errors from reading the report (e.g. a missing file) propagate.
    """
    out_dir = os.path.join(BASE_DIR, f"xls_shipment_details_{study}")
    os.makedirs(out_dir, exist_ok=True)

    # Read the CZ shipment ids from the shipments report downloaded just before.
    report_path = os.path.join(BASE_DIR, f"xls_shipments_{study}", f"shipments_report_{study}.xlsx")
    raw = pd.read_excel(report_path, header=None)
    header_row = None
    for i, row in raw.iterrows():
        if "Shipment ID" in [str(v).strip() for v in row]:
            header_row = i
            break
    if header_row is None:
        # Without this guard read_excel(header=None) yields integer column
        # names and the failure surfaces as an opaque KeyError: 'Location'.
        raise ValueError(
            f"V souboru {report_path} nebyl nalezen řádek hlavičky se sloupcem 'Shipment ID'"
        )
    df = pd.read_excel(report_path, header=header_row)
    df = df.dropna(how="all")
    df = df[df["Location"].astype(str).str.contains("Czech", na=False, case=False)]
    raw_ids = df["Shipment ID"].astype(str).str.strip().tolist()
    # Drop blank cells (astype(str) turns NaN into "nan") and repeated ids,
    # keeping first-seen order, so nothing bogus or redundant is requested.
    cz_ids = list(dict.fromkeys(
        s for s in raw_ids if s and s.lower() not in ("nan", "none", "nat")
    ))
    print(f"    CZ zásilek ke stažení: {len(cz_ids)}")

    page.goto(f"{BASE_URL}/report/shipment_details_report")
    page.wait_for_load_state("networkidle", timeout=120000)

    for shipment in cz_ids:
        filename = os.path.join(out_dir, f"shipment_details_{shipment}.xlsx")
        input_field = page.locator('input[placeholder="search"], input[type="text"]').first
        input_field.click()
        input_field.fill(shipment)
        page.wait_for_timeout(500)
        # NOTE(review): picks the first suggestion, presumably the exact
        # match for the typed id; confirm for ids that prefix other ids.
        page.locator("mat-option").first.dispatch_event("click")
        page.wait_for_load_state("networkidle", timeout=120000)

        with page.expect_download(timeout=120000) as dl:
            page.get_by_role("button", name="Download XLS").click()
        dl.value.save_as(filename)
        print(f"    [{shipment}] OK")

        # reset the filter before the next shipment
        page.get_by_role("button", name="Clear").click()
        page.wait_for_load_state("networkidle", timeout=120000)

# ── main ─────────────────────────────────────────────────────────────────────

def main():
    """Run the whole pipeline: download all reports, then import into MySQL.

    For each study a fresh Chromium browser is launched, the four download
    steps run in order, and the browser is closed. A download error is
    printed with its traceback and does not stop the remaining studies or the
    import phase; import errors are reported per study in the same way.
    """
    import traceback

    os.chdir(BASE_DIR)
    bar = "=" * 60

    # (progress banner, download step), executed in this order per study
    download_steps = (
        ("\n  [1/4] Onsite inventory...", download_inventory),
        ("\n  [2/4] IP destruction...", download_destruction),
        ("\n  [3/4] Shipments report...", download_shipments_report),
        ("\n  [4/4] Shipment details (CZ)...", download_shipment_details),
    )

    # ── Downloads ────────────────────────────────────────────────────────────
    with sync_playwright() as p:
        for study in STUDIES:
            print(f"\n{bar}")
            print(f"[{study}] STAHOVÁNÍ")
            print(bar)

            browser = p.chromium.launch(headless=False)
            context = browser.new_context(accept_downloads=True)
            page = context.new_page()

            try:
                print("  Přihlášení...")
                login(page, study)
                for banner, step in download_steps:
                    print(banner)
                    step(page, study)
            except Exception as e:
                print(f"  CHYBA při stahování: {e}")
                traceback.print_exc()
            finally:
                browser.close()

    # ── Import into MySQL ────────────────────────────────────────────────────
    print(f"\n{bar}")
    print("IMPORT DO MySQL")
    print(bar)

    for study in STUDIES:
        print(f"\n[{study}]")
        try:
            import_study(study)
        except Exception as e:
            print(f"  CHYBA při importu: {e}")
            traceback.print_exc()

    print(f"\n{bar}")
    print("Vše hotovo.")
    print(bar)


# Run the pipeline only when executed as a script, so importing this module
# (e.g. to reuse the parsers) does not launch a browser or touch the database.
if __name__ == "__main__":
    main()