diff --git a/IWRS/Drugs/run_all.py b/IWRS/Drugs/run_all.py index f50fec9..136d2f3 100644 --- a/IWRS/Drugs/run_all.py +++ b/IWRS/Drugs/run_all.py @@ -4,7 +4,7 @@ Kompletní pipeline pro Drugs: 2. IP destruction (per košík, přeskočí již existující soubory) 3. Shipments report (jeden soubor na studii, přepisuje) 4. Shipment details (per zásilka CZ, vždy přepisuje) - 5. Import do MySQL + 5. Import do MongoDB (studie.iwrs_shipments / iwrs_shipment_items / iwrs_inventory / iwrs_destruction) Spusť tento skript — zpracuje obě studie automaticky. """ @@ -14,12 +14,11 @@ import glob import re import datetime -import numpy as np +import sys import pandas as pd from playwright.sync_api import sync_playwright -import mysql.connector -import db_config +import import_to_mongo as drugs_mongo BASE_URL = "https://janssen.4gclinical.com" EMAIL = "vbuzalka@its.jnj.com" @@ -42,357 +41,6 @@ SITES = { BASE_DIR = os.path.dirname(os.path.abspath(__file__)) -# ── type converters ────────────────────────────────────────────────────────── - -def _py(val): - if isinstance(val, np.generic): - return val.item() - return val - -def to_date(val): - val = _py(val) - if val is None: - return None - if isinstance(val, float) and (val != val): - return None - try: - if pd.isna(val): - return None - except (TypeError, ValueError): - pass - if isinstance(val, pd.Timestamp): - return None if pd.isna(val) else val.date() - if isinstance(val, datetime.datetime): - return val.date() - if isinstance(val, datetime.date): - return val - s = str(val).strip() - if not s or s.lower() in ("nat", "nan", "none", ""): - return None - for fmt in ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S"): - try: - return datetime.datetime.strptime(s, fmt).date() - except ValueError: - pass - return None - -def to_int(val): - val = _py(val) - try: - v = float(val) - return None if (v != v) else int(v) - except (TypeError, ValueError): - return None - -def to_str(val): - val = _py(val) - if val is None: - return None - if isinstance(val, float) and (val != val): - return None - s = str(val).strip() - return None if s.lower() in ("nan", "nat", "none", "") else s - - -# ── DB helpers ─────────────────────────────────────────────────────────────── - -def get_conn(): - return mysql.connector.connect( - host=db_config.DB_HOST, port=db_config.DB_PORT, - user=db_config.DB_USER, password=db_config.DB_PASSWORD, - database=db_config.DB_NAME, - ) - -def insert_import(cursor, study, source_label): - cursor.execute( - "INSERT INTO iwrs_import (study, imported_at, source_file, report_type) VALUES (%s, %s, %s, %s)", - (study, datetime.datetime.now(), source_label, "drugs"), - ) - return cursor.lastrowid - -def basket_already_imported(cursor, study, basket_id): - cursor.execute( - "SELECT 1 FROM iwrs_destruction WHERE study=%s AND basket_id=%s LIMIT 1", - (study, str(basket_id)), - ) - return cursor.fetchone() is not None - - -# ── parsery ────────────────────────────────────────────────────────────────── - -def parse_shipments_report(study): - path = os.path.join(BASE_DIR, f"xls_shipments_{study}", f"shipments_report_{study}.xlsx") - if not os.path.exists(path): - print(f" CHYBÍ: {path}") - return [] - raw = pd.read_excel(path, header=None) - header_row = None - for i, row in raw.iterrows(): - if "Shipment ID" in [str(v).strip() for v in row]: - header_row = i - break - if header_row is None: - return [] - df = pd.read_excel(path, header=header_row).dropna(how="all") - df = df[df["Location"].astype(str).str.contains("Czech", na=False, case=False)] - col = df.columns.tolist() - rows = [] - for _, r in df.iterrows(): - rows.append({ - "shipment_id": to_str(r["Shipment ID"]), - "status": to_str(r["IRT Shipment Status"]), - "type": to_str(r["Type"]), - "ship_from": to_str(r["Shipment From"]), - "ship_to_site": to_str(r["Ship To:"]), - "location": to_str(r["Location"]), - "request_date": to_date(r["Request Date"]), - "shipped_date": to_date(r["Shipped Date"]), - "received_date": to_date(r["Received Date"]) if "Received Date" in col else None, - "received_by": to_str(r["Received by"]) if "Received by" in col else None, - "delivered_date_utc": to_date(r["Delivered Date [UTC]"]) if "Delivered Date [UTC]" in col else None, - "delivery_recipient": to_str(r["Delivery Recipient"]) if "Delivery Recipient" in col else None, - "delivery_details": to_str(r["Delivery Details"]) if "Delivery Details" in col else None, - "cancelled_date": to_date(r["Cancelled Date"]) if "Cancelled Date" in col else None, - "total_medication_ids": to_int(r["Total Medication IDs"]) if "Total Medication IDs" in col else None, - "tracking_no": to_str(r["Tracking #"]) if "Tracking #" in col else None, - "shipping_category": to_str(r["Shipping Category"]) if "Shipping Category" in col else None, - "expected_arrival": to_date(r["Expected Arrival"]) if "Expected Arrival" in col else None, - }) - return rows - - -def parse_shipment_details(study): - detail_dir = os.path.join(BASE_DIR, f"xls_shipment_details_{study}") - files = sorted(glob.glob(os.path.join(detail_dir, "shipment_details_*.xlsx"))) - rows = [] - for path in files: - m = re.search(r"shipment_details_(.+)\.xlsx", os.path.basename(path)) - shipment_id = m.group(1) if m else "UNKNOWN" - raw = pd.read_excel(path, header=None) - header_row = None - for i, row in raw.iterrows(): - if "Medication ID" in [str(v).strip() for v in row]: - header_row = i - break - if header_row is None: - continue - df = pd.read_excel(path, header=header_row).dropna(how="all") - for _, r in df.iterrows(): - med_desc = (to_str(r.get("Medication Description")) - or to_str(r.get("Medication ID Description"))) - med_type = (to_str(r.get("Medication type")) - or to_str(r.get("Medication ID type"))) - rows.append({ - "shipment_id": shipment_id, - "destination_location": to_str(r.get("Destination Location")), - "shipment_status": to_str(r.get("IRT Shipment Status")), - "shipment_type": to_str(r.get("Type")), - "destination_site": to_str(r.get("Destination Site")), - "investigator": to_str(r.get("Investigator")), - "medication_description": med_desc, - "medication_type": med_type, - "medication_id": to_str(r.get("Medication ID")), - "packaged_lot_no": to_str(r.get("Packaged Lot number")), - "packaged_lot_description": to_str(r.get("Packaged Lot description")), - "container_id": to_str(r.get("Container ID")), - "quantity": to_int(r.get("Quantity of Medication IDs")), - "expiration_date": to_date(r.get("Expiration Date")), - "item_status": to_str(r.get("Status")), - }) - return rows - - -def parse_inventory(study): - inv_dir = os.path.join(BASE_DIR, f"xls_reports_{study}") - files = sorted(glob.glob(os.path.join(inv_dir, "onsite_inventory_detail_*.xlsx"))) - rows = [] - for path in files: - raw = pd.read_excel(path, header=None) - site = investigator = location = None - header_row = None - for i, row in raw.iterrows(): - first = str(row.iloc[0]).strip() if pd.notna(row.iloc[0]) else "" - if first.startswith("Site:"): - site = first.replace("Site:", "").strip() - elif first.startswith("Investigator:"): - investigator = first.replace("Investigator:", "").strip() - elif first.startswith("Location:"): - location = first.replace("Location:", "").strip() - if first in ("Medication", "Medication ID") and header_row is None: - header_row = i - if header_row is None: - continue - df = pd.read_excel(path, header=header_row).dropna(how="all") - df = df.rename(columns={df.columns[0]: "medication_id"}) - for _, r in df.iterrows(): - rows.append({ - "site": site, - "investigator": investigator, - "location": location, - "medication_id": to_str(r["medication_id"]), - "packaged_lot_no": to_str(r.get("Packaged Lot number")), - "original_expiration_date": to_date(r.get("Original Expiration Date when Packaged Lot was Added")), - "expiration_date": to_date(r.get("Expiration date")), - "received_date": to_date(r.get("Received Date")), - "receipt_user": to_str(r.get("Shipment Receipt User")), - "subject_identifier": to_str(r.get("Subject Identifier")), - "quantity_assigned": to_int(r.get("Quantity Assigned")), - "irt_transaction": to_str(r.get("IRT Transaction")), - "date_assigned": to_date(r.get("Date Assigned")), - "assignment_user": to_str(r.get("Assignment User")), - "dispensation_status": to_str(r.get("Dispensation Status")), - "dispensing_date": to_date(r.get("Dispensing date") or r.get("Dispensing Date")), - "quantity_dispensed": to_int(r.get("Quantity Dispensed")), - "dispensing_user": to_str(r.get("Dispensing User")), - "quantity_returned": to_int(r.get("Quantity Returned")), - "date_returned": to_date(r.get("Date Returned")), - "return_user": to_str(r.get("Return User")), - }) - return rows - - -def parse_destruction_files(study): - dest_dir = os.path.join(BASE_DIR, f"xls_ip_destruction_{study}") - files = sorted(glob.glob(os.path.join(dest_dir, "ip_destruction_basket_*.xlsx"))) - baskets = [] - for path in files: - raw = pd.read_excel(path, header=None) - meta = {} - header_row = None - for i, row in raw.iterrows(): - first = str(row.iloc[0]).strip() if pd.notna(row.iloc[0]) else "" - for key, attr in [ - ("Investigator Name:", "investigator"), - ("Site ID:", "site_id"), - ("Location:", "location"), - ("Basket ID:", "basket_id"), - ("Drug Destruction Created Date:", "destruction_date"), - ]: - if first.startswith(key): - meta[attr] = first.replace(key, "").strip() - if first == "Medication ID Description" and header_row is None: - header_row = i - if header_row is None: - continue - df = pd.read_excel(path, header=header_row).dropna(how="all") - items = [] - for _, r in df.iterrows(): - items.append({ - "medication_description": to_str(r.get("Medication ID Description")), - "medication_id": to_str(r.get("Medication ID")), - "packaged_lot_description": to_str(r.get("Packaged Lot description")), - "comments": to_str(r.get("Comments")), - }) - baskets.append({ - "site_id": meta.get("site_id"), - "investigator": meta.get("investigator"), - "location": meta.get("location"), - "basket_id": meta.get("basket_id"), - "destruction_date": to_date(meta.get("destruction_date")), - "items": items, - }) - return baskets - - -# ── insertery ──────────────────────────────────────────────────────────────── - -def insert_shipments(cursor, import_id, study, rows): - sql = """INSERT INTO iwrs_shipments - (import_id, study, shipment_id, status, type, ship_from, ship_to_site, - location, request_date, shipped_date, received_date, received_by, - delivered_date_utc, delivery_recipient, delivery_details, cancelled_date, - total_medication_ids, tracking_no, shipping_category, expected_arrival) - VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""" - for r in rows: - cursor.execute(sql, ( - import_id, study, r["shipment_id"], r["status"], r["type"], - r["ship_from"], r["ship_to_site"], r["location"], - r["request_date"], r["shipped_date"], r["received_date"], - r["received_by"], r["delivered_date_utc"], r["delivery_recipient"], - r["delivery_details"], r["cancelled_date"], r["total_medication_ids"], - r["tracking_no"], r["shipping_category"], r["expected_arrival"], - )) - - -def insert_shipment_items(cursor, import_id, study, rows): - sql = """INSERT INTO iwrs_shipment_items - (import_id, study, shipment_id, destination_location, shipment_status, - shipment_type, destination_site, investigator, medication_description, - medication_type, medication_id, packaged_lot_no, packaged_lot_description, - container_id, quantity, expiration_date, item_status) - VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""" - for r in rows: - cursor.execute(sql, ( - import_id, study, r["shipment_id"], r["destination_location"], - r["shipment_status"], r["shipment_type"], r["destination_site"], - r["investigator"], r["medication_description"], r["medication_type"], - r["medication_id"], r["packaged_lot_no"], r["packaged_lot_description"], - r["container_id"], r["quantity"], r["expiration_date"], r["item_status"], - )) - - -def insert_inventory(cursor, import_id, study, rows): - sql = """INSERT INTO iwrs_inventory - (import_id, study, site, investigator, location, medication_id, - packaged_lot_no, original_expiration_date, expiration_date, received_date, - receipt_user, subject_identifier, quantity_assigned, irt_transaction, - date_assigned, assignment_user, dispensation_status, dispensing_date, - quantity_dispensed, dispensing_user, quantity_returned, date_returned, return_user) - VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""" - for r in rows: - cursor.execute(sql, ( - import_id, study, r["site"], r["investigator"], r["location"], - r["medication_id"], r["packaged_lot_no"], r["original_expiration_date"], - r["expiration_date"], r["received_date"], r["receipt_user"], - r["subject_identifier"], r["quantity_assigned"], r["irt_transaction"], - r["date_assigned"], r["assignment_user"], r["dispensation_status"], - r["dispensing_date"], r["quantity_dispensed"], r["dispensing_user"], - r["quantity_returned"], r["date_returned"], r["return_user"], - )) - - -def insert_destruction(cursor, study, baskets): - sql = """INSERT IGNORE INTO iwrs_destruction - (study, site_id, investigator, location, basket_id, destruction_date, - medication_description, medication_id, packaged_lot_description, comments) - VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""" - skipped = imported = 0 - for b in baskets: - if basket_already_imported(cursor, study, b["basket_id"]): - skipped += 1 - continue - for item in b["items"]: - cursor.execute(sql, ( - study, b["site_id"], b["investigator"], b["location"], - b["basket_id"], b["destruction_date"], - item["medication_description"], item["medication_id"], - item["packaged_lot_description"], item["comments"], - )) - imported += 1 - return imported, skipped - - -def import_study(study): - print(f"\n Parsování dat pro {study}...") - shipments = parse_shipments_report(study) - items = parse_shipment_details(study) - inventory = parse_inventory(study) - baskets = parse_destruction_files(study) - print(f" Zásilky: {len(shipments)} | Položky: {len(items)} | Sklad: {len(inventory)} | Destrukce: {len(baskets)} košíků") - - conn = get_conn() - cursor = conn.cursor() - import_id = insert_import(cursor, study, f"drugs_{study}") - print(f" import_id = {import_id}") - insert_shipments(cursor, import_id, study, shipments) - insert_shipment_items(cursor, import_id, study, items) - insert_inventory(cursor, import_id, study, inventory) - dest_imported, dest_skipped = insert_destruction(cursor, study, baskets) - conn.commit() - cursor.close() - conn.close() - print(f" Destrukce: {dest_imported} nových | {dest_skipped} košíků přeskočeno") - # ── login ──────────────────────────────────────────────────────────────────── @@ -577,19 +225,17 @@ def main(): finally: browser.close() - # ── Import do MySQL ─────────────────────────────────────────────────────── + # ── Import do MongoDB ───────────────────────────────────────────────────── print(f"\n{'='*60}") - print("IMPORT DO MySQL") + print("IMPORT DO MongoDB") print(f"{'='*60}") - for study in STUDIES: - print(f"\n[{study}]") - try: - import_study(study) - except Exception as e: - import traceback - print(f" CHYBA při importu: {e}") - traceback.print_exc() + try: + drugs_mongo.run(STUDIES) + except Exception as e: + import traceback + print(f" CHYBA při importu: {e}") + traceback.print_exc() print(f"\n{'='*60}") print("Vše hotovo.") diff --git a/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10002.xlsx b/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10002.xlsx index b5ab1b6..e1afa1e 100644 Binary files a/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10002.xlsx and b/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10002.xlsx differ diff --git a/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10004.xlsx b/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10004.xlsx index afb8652..5d52606 100644 Binary files a/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10004.xlsx and b/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10004.xlsx differ diff --git a/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10005.xlsx b/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10005.xlsx index 6503ae0..c8f3f9a 100644 Binary files a/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10005.xlsx and b/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10005.xlsx differ diff --git a/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10008.xlsx b/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10008.xlsx index d7d7e99..6dff02a 100644 Binary files a/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10008.xlsx and b/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10008.xlsx differ diff --git a/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10011.xlsx b/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10011.xlsx index 8d5b240..ee072cd 100644 Binary files a/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10011.xlsx and b/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10011.xlsx differ diff --git a/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10012.xlsx b/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10012.xlsx index 0bbf9be..49d483c 100644 Binary files a/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10012.xlsx and b/IWRS/Drugs/xls_reports_42847922MDD3003/onsite_inventory_detail_S10-CZ10012.xlsx differ diff --git a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10001.xlsx b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10001.xlsx index 88d9093..b775c12 100644 Binary files a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10001.xlsx and b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10001.xlsx differ diff --git a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10003.xlsx b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10003.xlsx index c60b87e..ae07e82 100644 Binary files a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10003.xlsx and b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10003.xlsx differ diff --git a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10006.xlsx b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10006.xlsx index a2412c0..282822d 100644 Binary files a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10006.xlsx and b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10006.xlsx differ diff --git a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10009.xlsx b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10009.xlsx index 60ca5fe..81966e7 100644 Binary files a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10009.xlsx and b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10009.xlsx differ diff --git a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10010.xlsx b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10010.xlsx index a4b281a..9692302 100644 Binary files a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10010.xlsx and b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10010.xlsx differ diff --git a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10012.xlsx b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10012.xlsx index ab09276..e42e16a 100644 Binary files a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10012.xlsx and b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10012.xlsx differ diff --git a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10013.xlsx b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10013.xlsx index 21054f2..8f68fd5 100644 Binary files a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10013.xlsx and b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10013.xlsx differ diff --git a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10015.xlsx b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10015.xlsx index 2620c1d..04a4646 100644 Binary files a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10015.xlsx and b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10015.xlsx differ diff --git a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10016.xlsx b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10016.xlsx index 9912902..f618e6e 100644 Binary files a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10016.xlsx and b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10016.xlsx differ diff --git a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10020.xlsx b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10020.xlsx index 2289574..262eb33 100644 Binary files a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10020.xlsx and b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10020.xlsx differ diff --git a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10021.xlsx b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10021.xlsx index bf2e694..4120a06 100644 Binary files a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10021.xlsx and b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10021.xlsx differ diff --git a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10022.xlsx b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10022.xlsx index 2ac4434..fc87ed6 100644 Binary files a/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10022.xlsx and b/IWRS/Drugs/xls_reports_77242113UCO3001/onsite_inventory_detail_DD5-CZ10022.xlsx differ diff --git a/IWRS/Drugs/xls_shipment_details_42847922MDD3003/shipment_details_102758.xlsx b/IWRS/Drugs/xls_shipment_details_42847922MDD3003/shipment_details_102758.xlsx index a0a559c..600e413 100644 Binary files a/IWRS/Drugs/xls_shipment_details_42847922MDD3003/shipment_details_102758.xlsx and b/IWRS/Drugs/xls_shipment_details_42847922MDD3003/shipment_details_102758.xlsx differ diff --git a/IWRS/Drugs/xls_shipment_details_77242113UCO3001/shipment_details_101204.xlsx b/IWRS/Drugs/xls_shipment_details_77242113UCO3001/shipment_details_101204.xlsx index 02cf6c7..9548536 100644 Binary files a/IWRS/Drugs/xls_shipment_details_77242113UCO3001/shipment_details_101204.xlsx and b/IWRS/Drugs/xls_shipment_details_77242113UCO3001/shipment_details_101204.xlsx differ diff --git a/IWRS/Drugs/xls_shipments_42847922MDD3003/shipments_report_42847922MDD3003.xlsx b/IWRS/Drugs/xls_shipments_42847922MDD3003/shipments_report_42847922MDD3003.xlsx index db2fe36..5ae7223 100644 Binary files a/IWRS/Drugs/xls_shipments_42847922MDD3003/shipments_report_42847922MDD3003.xlsx and b/IWRS/Drugs/xls_shipments_42847922MDD3003/shipments_report_42847922MDD3003.xlsx differ diff --git a/IWRS/Drugs/xls_shipments_77242113UCO3001/shipments_report_77242113UCO3001.xlsx b/IWRS/Drugs/xls_shipments_77242113UCO3001/shipments_report_77242113UCO3001.xlsx index 25ee5c7..60123d2 100644 Binary files a/IWRS/Drugs/xls_shipments_77242113UCO3001/shipments_report_77242113UCO3001.xlsx and b/IWRS/Drugs/xls_shipments_77242113UCO3001/shipments_report_77242113UCO3001.xlsx differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-08-26_Janssen_42847922MDD3003_Subject_CZ100080002_dispensing_confirmation_has_occurred_at_site_S10-CZ10008_pk8785.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-08-26_Janssen_42847922MDD3003_Subject_CZ100080002_dispensing_confirmation_has_occurred_at_site_S10-CZ10008_pk8785.pdf index 45347d1..bd83887 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-08-26_Janssen_42847922MDD3003_Subject_CZ100080002_dispensing_confirmation_has_occurred_at_site_S10-CZ10008_pk8785.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-08-26_Janssen_42847922MDD3003_Subject_CZ100080002_dispensing_confirmation_has_occurred_at_site_S10-CZ10008_pk8785.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-09-09_Janssen_42847922MDD3003_Subject_CZ100080002_dispensing_confirmation_has_occurred_at_site_S10-CZ10008_pk9416.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-09-09_Janssen_42847922MDD3003_Subject_CZ100080002_dispensing_confirmation_has_occurred_at_site_S10-CZ10008_pk9416.pdf index 2e2ff20..0ff1592 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-09-09_Janssen_42847922MDD3003_Subject_CZ100080002_dispensing_confirmation_has_occurred_at_site_S10-CZ10008_pk9416.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-09-09_Janssen_42847922MDD3003_Subject_CZ100080002_dispensing_confirmation_has_occurred_at_site_S10-CZ10008_pk9416.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-09-22_Janssen_42847922MDD3003_Subject_CZ100040001_dispensing_confirmation_has_occurred_at_site_S10-CZ10004_pk9348.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-09-22_Janssen_42847922MDD3003_Subject_CZ100040001_dispensing_confirmation_has_occurred_at_site_S10-CZ10004_pk9348.pdf index abe0277..5cf5e4a 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-09-22_Janssen_42847922MDD3003_Subject_CZ100040001_dispensing_confirmation_has_occurred_at_site_S10-CZ10004_pk9348.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-09-22_Janssen_42847922MDD3003_Subject_CZ100040001_dispensing_confirmation_has_occurred_at_site_S10-CZ10004_pk9348.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-09-22_Janssen_42847922MDD3003_Subject_CZ100040001_has_returned_a_medication_at_site_S10-CZ10004_pk9349.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-09-22_Janssen_42847922MDD3003_Subject_CZ100040001_has_returned_a_medication_at_site_S10-CZ10004_pk9349.pdf index 6b5301c..f9695d7 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-09-22_Janssen_42847922MDD3003_Subject_CZ100040001_has_returned_a_medication_at_site_S10-CZ10004_pk9349.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-09-22_Janssen_42847922MDD3003_Subject_CZ100040001_has_returned_a_medication_at_site_S10-CZ10004_pk9349.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-10-08_Janssen_42847922MDD3003_Subject_CZ100080005_dispensing_confirmation_has_occurred_at_site_S10-CZ10008_pk11269.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-10-08_Janssen_42847922MDD3003_Subject_CZ100080005_dispensing_confirmation_has_occurred_at_site_S10-CZ10008_pk11269.pdf index 6609511..139832e 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-10-08_Janssen_42847922MDD3003_Subject_CZ100080005_dispensing_confirmation_has_occurred_at_site_S10-CZ10008_pk11269.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-10-08_Janssen_42847922MDD3003_Subject_CZ100080005_dispensing_confirmation_has_occurred_at_site_S10-CZ10008_pk11269.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-10-22_Janssen_42847922MDD3003_Subject_CZ100080005_dispensing_confirmation_has_occurred_at_site_S10-CZ10008_pk11621.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-10-22_Janssen_42847922MDD3003_Subject_CZ100080005_dispensing_confirmation_has_occurred_at_site_S10-CZ10008_pk11621.pdf index a7b4e8f..977a26c 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-10-22_Janssen_42847922MDD3003_Subject_CZ100080005_dispensing_confirmation_has_occurred_at_site_S10-CZ10008_pk11621.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-10-22_Janssen_42847922MDD3003_Subject_CZ100080005_dispensing_confirmation_has_occurred_at_site_S10-CZ10008_pk11621.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-10-29_Janssen_42847922MDD3003_Subject_CZ100080006_dispensing_confirmation_has_occurred_at_site_S10-CZ10008_pk12065.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-10-29_Janssen_42847922MDD3003_Subject_CZ100080006_dispensing_confirmation_has_occurred_at_site_S10-CZ10008_pk12065.pdf index 54e633a..db835b5 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-10-29_Janssen_42847922MDD3003_Subject_CZ100080006_dispensing_confirmation_has_occurred_at_site_S10-CZ10008_pk12065.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-10-29_Janssen_42847922MDD3003_Subject_CZ100080006_dispensing_confirmation_has_occurred_at_site_S10-CZ10008_pk12065.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080002_has_returned_a_medication_at_site_S10-CZ10008_pk12040.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080002_has_returned_a_medication_at_site_S10-CZ10008_pk12040.pdf index 76dc7e6..6de6219 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080002_has_returned_a_medication_at_site_S10-CZ10008_pk12040.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080002_has_returned_a_medication_at_site_S10-CZ10008_pk12040.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080002_has_returned_a_medication_at_site_S10-CZ10008_pk12041.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080002_has_returned_a_medication_at_site_S10-CZ10008_pk12041.pdf index 066af2c..b4d6f13 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080002_has_returned_a_medication_at_site_S10-CZ10008_pk12041.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080002_has_returned_a_medication_at_site_S10-CZ10008_pk12041.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080002_has_returned_a_medication_at_site_S10-CZ10008_pk12042.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080002_has_returned_a_medication_at_site_S10-CZ10008_pk12042.pdf index 1d2cf90..c617bfe 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080002_has_returned_a_medication_at_site_S10-CZ10008_pk12042.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080002_has_returned_a_medication_at_site_S10-CZ10008_pk12042.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080002_has_returned_a_medication_at_site_S10-CZ10008_pk12043.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080002_has_returned_a_medication_at_site_S10-CZ10008_pk12043.pdf index 9d445b9..202da4e 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080002_has_returned_a_medication_at_site_S10-CZ10008_pk12043.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080002_has_returned_a_medication_at_site_S10-CZ10008_pk12043.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080002_has_returned_a_medication_at_site_S10-CZ10008_pk12044.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080002_has_returned_a_medication_at_site_S10-CZ10008_pk12044.pdf index 7fdff0c..43a81b3 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080002_has_returned_a_medication_at_site_S10-CZ10008_pk12044.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080002_has_returned_a_medication_at_site_S10-CZ10008_pk12044.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080004_has_returned_a_medication_at_site_S10-CZ10008_pk12046.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080004_has_returned_a_medication_at_site_S10-CZ10008_pk12046.pdf index 2bcc10f..a40da01 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080004_has_returned_a_medication_at_site_S10-CZ10008_pk12046.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080004_has_returned_a_medication_at_site_S10-CZ10008_pk12046.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080005_has_returned_a_medication_at_site_S10-CZ10008_pk12050.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080005_has_returned_a_medication_at_site_S10-CZ10008_pk12050.pdf index 5e583cf..0208433 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080005_has_returned_a_medication_at_site_S10-CZ10008_pk12050.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080005_has_returned_a_medication_at_site_S10-CZ10008_pk12050.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080005_has_returned_a_medication_at_site_S10-CZ10008_pk12052.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080005_has_returned_a_medication_at_site_S10-CZ10008_pk12052.pdf index 1830288..e1c0609 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080005_has_returned_a_medication_at_site_S10-CZ10008_pk12052.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-11-25_Janssen_42847922MDD3003_Subject_CZ100080005_has_returned_a_medication_at_site_S10-CZ10008_pk12052.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-12-11_Janssen_42847922MDD3003_Subject_CZ100110007_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk14220.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-12-11_Janssen_42847922MDD3003_Subject_CZ100110007_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk14220.pdf index c916925..815f4a0 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-12-11_Janssen_42847922MDD3003_Subject_CZ100110007_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk14220.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-12-11_Janssen_42847922MDD3003_Subject_CZ100110007_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk14220.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-12-22_Janssen_42847922MDD3003_Subject_CZ100110007_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk15012.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-12-22_Janssen_42847922MDD3003_Subject_CZ100110007_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk15012.pdf index fe56fdf..0d51455 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-12-22_Janssen_42847922MDD3003_Subject_CZ100110007_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk15012.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-12-22_Janssen_42847922MDD3003_Subject_CZ100110007_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk15012.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-12-22_Janssen_42847922MDD3003_Subject_CZ100110007_has_returned_a_medication_at_site_S10-CZ10011_pk14219.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-12-22_Janssen_42847922MDD3003_Subject_CZ100110007_has_returned_a_medication_at_site_S10-CZ10011_pk14219.pdf index c6e12bb..16382df 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-12-22_Janssen_42847922MDD3003_Subject_CZ100110007_has_returned_a_medication_at_site_S10-CZ10011_pk14219.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2025-12-22_Janssen_42847922MDD3003_Subject_CZ100110007_has_returned_a_medication_at_site_S10-CZ10011_pk14219.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-02-10_Janssen_42847922MDD3003_Subject_CZ100110006_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk16495.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-02-10_Janssen_42847922MDD3003_Subject_CZ100110006_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk16495.pdf index 0dd5fe3..452bef7 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-02-10_Janssen_42847922MDD3003_Subject_CZ100110006_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk16495.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-02-10_Janssen_42847922MDD3003_Subject_CZ100110006_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk16495.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-02-17_Janssen_42847922MDD3003_Subject_CZ100120005_dispensing_confirmation_has_occurred_at_site_S10-CZ10012_pk20767.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-02-17_Janssen_42847922MDD3003_Subject_CZ100120005_dispensing_confirmation_has_occurred_at_site_S10-CZ10012_pk20767.pdf index a7f56c7..5a00c2b 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-02-17_Janssen_42847922MDD3003_Subject_CZ100120005_dispensing_confirmation_has_occurred_at_site_S10-CZ10012_pk20767.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-02-17_Janssen_42847922MDD3003_Subject_CZ100120005_dispensing_confirmation_has_occurred_at_site_S10-CZ10012_pk20767.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-02-27_Janssen_42847922MDD3003_Subject_CZ100120005_has_returned_a_medication_at_site_S10-CZ10012_pk19358.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-02-27_Janssen_42847922MDD3003_Subject_CZ100120005_has_returned_a_medication_at_site_S10-CZ10012_pk19358.pdf index 6b55b59..372b6c2 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-02-27_Janssen_42847922MDD3003_Subject_CZ100120005_has_returned_a_medication_at_site_S10-CZ10012_pk19358.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-02-27_Janssen_42847922MDD3003_Subject_CZ100120005_has_returned_a_medication_at_site_S10-CZ10012_pk19358.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-03-15_Janssen_42847922MDD3003_Subject_CZ100120005_dispensing_confirmation_has_occurred_at_site_S10-CZ10012_pk18456.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-03-15_Janssen_42847922MDD3003_Subject_CZ100120005_dispensing_confirmation_has_occurred_at_site_S10-CZ10012_pk18456.pdf index e70f9da..0f33a66 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-03-15_Janssen_42847922MDD3003_Subject_CZ100120005_dispensing_confirmation_has_occurred_at_site_S10-CZ10012_pk18456.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-03-15_Janssen_42847922MDD3003_Subject_CZ100120005_dispensing_confirmation_has_occurred_at_site_S10-CZ10012_pk18456.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-03-15_Janssen_42847922MDD3003_Subject_CZ100120005_dispensing_confirmation_has_occurred_at_site_S10-CZ10012_pk20768.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-03-15_Janssen_42847922MDD3003_Subject_CZ100120005_dispensing_confirmation_has_occurred_at_site_S10-CZ10012_pk20768.pdf index 8323a4e..5e5ec10 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-03-15_Janssen_42847922MDD3003_Subject_CZ100120005_dispensing_confirmation_has_occurred_at_site_S10-CZ10012_pk20768.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-03-15_Janssen_42847922MDD3003_Subject_CZ100120005_dispensing_confirmation_has_occurred_at_site_S10-CZ10012_pk20768.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-03-15_Janssen_42847922MDD3003_Subject_CZ100120005_dispensing_confirmation_has_occurred_at_site_S10-CZ10012_pk21110.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-03-15_Janssen_42847922MDD3003_Subject_CZ100120005_dispensing_confirmation_has_occurred_at_site_S10-CZ10012_pk21110.pdf index 9c631e8..1af58ab 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-03-15_Janssen_42847922MDD3003_Subject_CZ100120005_dispensing_confirmation_has_occurred_at_site_S10-CZ10012_pk21110.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-03-15_Janssen_42847922MDD3003_Subject_CZ100120005_dispensing_confirmation_has_occurred_at_site_S10-CZ10012_pk21110.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-03-15_Janssen_42847922MDD3003_Subject_CZ100120005_has_returned_a_medication_at_site_S10-CZ10012_pk21349.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-03-15_Janssen_42847922MDD3003_Subject_CZ100120005_has_returned_a_medication_at_site_S10-CZ10012_pk21349.pdf index 61b53f1..0cc4395 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-03-15_Janssen_42847922MDD3003_Subject_CZ100120005_has_returned_a_medication_at_site_S10-CZ10012_pk21349.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-03-15_Janssen_42847922MDD3003_Subject_CZ100120005_has_returned_a_medication_at_site_S10-CZ10012_pk21349.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-03-16_Janssen_42847922MDD3003_Subject_CZ100110007_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk18472.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-03-16_Janssen_42847922MDD3003_Subject_CZ100110007_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk18472.pdf index 0bb1003..d0ec4cc 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-03-16_Janssen_42847922MDD3003_Subject_CZ100110007_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk18472.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-03-16_Janssen_42847922MDD3003_Subject_CZ100110007_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk18472.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-04-07_Janssen_42847922MDD3003_Subject_CZ100110006_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk20028.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-04-07_Janssen_42847922MDD3003_Subject_CZ100110006_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk20028.pdf index a904fab..13aac67 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-04-07_Janssen_42847922MDD3003_Subject_CZ100110006_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk20028.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-04-07_Janssen_42847922MDD3003_Subject_CZ100110006_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk20028.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-04-28_Janssen_42847922MDD3003_Subject_CZ100110006_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk21486.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-04-28_Janssen_42847922MDD3003_Subject_CZ100110006_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk21486.pdf index 83ac0ef..b12dfa9 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-04-28_Janssen_42847922MDD3003_Subject_CZ100110006_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk21486.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-04-28_Janssen_42847922MDD3003_Subject_CZ100110006_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk21486.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-04-28_Janssen_42847922MDD3003_Subject_CZ100110006_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk21488.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-04-28_Janssen_42847922MDD3003_Subject_CZ100110006_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk21488.pdf index 2dfe522..59fc9a8 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-04-28_Janssen_42847922MDD3003_Subject_CZ100110006_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk21488.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-04-28_Janssen_42847922MDD3003_Subject_CZ100110006_dispensing_confirmation_has_occurred_at_site_S10-CZ10011_pk21488.pdf differ diff --git a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-04-28_Janssen_42847922MDD3003_Subject_CZ100110006_has_returned_a_medication_at_site_S10-CZ10011_pk21487.pdf b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-04-28_Janssen_42847922MDD3003_Subject_CZ100110006_has_returned_a_medication_at_site_S10-CZ10011_pk21487.pdf index 243b8b3..ad66008 100644 Binary files a/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-04-28_Janssen_42847922MDD3003_Subject_CZ100110006_has_returned_a_medication_at_site_S10-CZ10011_pk21487.pdf and b/IWRS/Patients/IncomingSourceReportsDetails/42847922MDD3003/2026-04-28_Janssen_42847922MDD3003_Subject_CZ100110006_has_returned_a_medication_at_site_S10-CZ10011_pk21487.pdf differ diff --git a/IWRS/Patients/download_subject_details.py b/IWRS/Patients/download_subject_details.py index f22b2d7..b478a23 100644 --- a/IWRS/Patients/download_subject_details.py +++ b/IWRS/Patients/download_subject_details.py @@ -156,38 +156,62 @@ def run(page, study): total_notif = 0 for subject in subjects: filename = os.path.join(out_dir, f"{today} {study} {subject} Subject Detail.xlsx") - print(f" [{subject}] Stahuji...") - input_field = page.locator('input[placeholder="search"], input[type="text"]').first - input_field.click() - input_field.fill(subject) - page.wait_for_timeout(500) - # Zachytíme table_1 response při výběru subjektu - if api_base: + success = False + table1_data = None + for attempt in range(1, 4): try: - with page.expect_response( - lambda r: "report_data" in r.url and "table_1" in r.url, - timeout=60000 - ) as resp_info: + print(f" [{subject}] Stahuji... (pokus {attempt}/3)") + input_field = page.locator('input[placeholder="search"], input[type="text"]').first + input_field.click() + input_field.fill(subject) + page.wait_for_timeout(500) + + # Zachytíme table_1 response při výběru subjektu + if api_base: + try: + with page.expect_response( + lambda r: "report_data" in r.url and "table_1" in r.url, + timeout=60000 + ) as resp_info: + page.locator("mat-option").first.dispatch_event("click") + table1_data = resp_info.value.json() + except Exception as e: + print(f" [{subject}] CHYBA zachycení table_1: {e}") + page.locator("mat-option").first.dispatch_event("click") + page.wait_for_load_state("networkidle", timeout=120000) + table1_data = None + else: page.locator("mat-option").first.dispatch_event("click") - table1_data = resp_info.value.json() - except Exception as e: - print(f" [{subject}] CHYBA zachycení table_1: {e}") - page.locator("mat-option").first.dispatch_event("click") + page.wait_for_load_state("networkidle", timeout=120000) + table1_data = None + page.wait_for_load_state("networkidle", timeout=120000) - table1_data = None - else: - page.locator("mat-option").first.dispatch_event("click") - page.wait_for_load_state("networkidle", timeout=120000) - table1_data = None + page.wait_for_timeout(2000) - page.wait_for_load_state("networkidle", timeout=120000) - page.wait_for_timeout(1000) + with page.expect_download(timeout=60000) as dl: + page.get_by_role("button", name="Download XLS").click() + dl.value.save_as(filename) + print(f" [{subject}] XLS OK") + success = True + break + except Exception as e: + print(f" [{subject}] pokus {attempt} selhal: {e}") + if attempt < 3: + try: + page.goto(f"{BASE_URL}/report/patient_detail_report") + page.wait_for_load_state("networkidle", timeout=120000) + except Exception as ge: + print(f" [{subject}] refresh selhal: {ge}") - with page.expect_download(timeout=120000) as dl: - page.get_by_role("button", name="Download XLS").click() - dl.value.save_as(filename) - print(f" [{subject}] XLS OK") + if not success: + print(f" [{subject}] PŘESKAKUJI po 3 neúspěšných pokusech") + try: + page.goto(f"{BASE_URL}/report/patient_detail_report") + page.wait_for_load_state("networkidle", timeout=120000) + except Exception: + pass + continue # Stáhnout notifikace pro tohoto subjekta if api_base and table1_data: @@ -196,8 +220,13 @@ def run(page, study): ) total_notif += n - page.get_by_role("button", name="Clear").click() - page.wait_for_load_state("networkidle", timeout=120000) + try: + page.get_by_role("button", name="Clear").click() + page.wait_for_load_state("networkidle", timeout=120000) + except Exception as e: + print(f" [{subject}] Clear selhal: {e} — refresh") + page.goto(f"{BASE_URL}/report/patient_detail_report") + page.wait_for_load_state("networkidle", timeout=120000) print(f" [{study}] Subject details hotovo. Nových notifikací: {total_notif}") diff --git a/IWRS/Patients/run_all.py b/IWRS/Patients/run_all.py index eb57c3a..8ea266a 100644 --- a/IWRS/Patients/run_all.py +++ b/IWRS/Patients/run_all.py @@ -2,23 +2,21 @@ Kompletní pipeline: 1. Stažení Subject Summary Reportů (obě studie) 2. Stažení Subject Detail Reportů + notifikací (obě studie) - 3. Import do MySQL (summary, visits, notifikace) + 3. Import do MongoDB (subject_summary + visits + notifications) Spusť tento skript místo samostatných skriptů. """ import os +import sys import datetime import glob -import re from playwright.sync_api import sync_playwright -import numpy as np -import pandas as pd -import db_config -import mysql.connector import download_subject_details as dsd +import import_to_mongo +import import_notifications_to_mongo # ── CONFIG ─────────────────────────────────────────────────────────────────── BASE_URL = "https://janssen.4gclinical.com" @@ -72,6 +70,7 @@ def download_summary(page, study, today): # ── KROK 2: Subject Details ─────────────────────────────────────────────────── def get_subjects_from_summary(summary_path): + import pandas as pd raw = pd.read_excel(summary_path, header=None) header_row = None for i, row in raw.iterrows(): @@ -112,277 +111,7 @@ def download_details(page, study, summary_path, today): page.wait_for_load_state("networkidle", timeout=120000) -# ── KROK 3: Import do MySQL ─────────────────────────────────────────────────── - -def get_conn(): - return mysql.connector.connect( - host=db_config.DB_HOST, - port=db_config.DB_PORT, - user=db_config.DB_USER, - password=db_config.DB_PASSWORD, - database=db_config.DB_NAME, - ) - - -def _py(val): - """Převede numpy skalár na Python nativní typ.""" - if isinstance(val, np.generic): - return val.item() - return val - - -def to_date(val): - val = _py(val) - if val is None or (isinstance(val, float) and (val != val)): - return None - try: - if pd.isna(val): - return None - except (TypeError, ValueError): - pass - if isinstance(val, pd.Timestamp): - return None if pd.isna(val) else val.date() - if isinstance(val, datetime.datetime): - return val.date() - if isinstance(val, datetime.date): - return val - s = str(val).strip() - if not s or s.lower() in ("nat", "nan", "none", ""): - return None - for fmt in ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S"): - try: - return datetime.datetime.strptime(s, fmt).date() - except ValueError: - pass - return None - - -def to_int(val): - val = _py(val) - try: - v = float(val) - return None if (v != v) else int(v) - except (TypeError, ValueError): - return None - - -def to_float(val): - val = _py(val) - try: - v = float(val) - return None if (v != v) else float(v) - except (TypeError, ValueError): - return None - - -def to_str(val): - val = _py(val) - if val is None: - return None - if isinstance(val, float) and (val != val): - return None - s = str(val).strip() - return None if s.lower() in ("nan", "nat", "none", "") else s - - -def read_summary_df(path): - raw = pd.read_excel(path, header=None) - header_row = None - for i, row in raw.iterrows(): - if "Subject" in [str(v).strip() for v in row]: - header_row = i - break - if header_row is None: - raise ValueError(f"Hlavičkový řádek nenalezen v {path}") - return pd.read_excel(path, header=header_row).dropna(how="all") - - -def parse_detail_visits(path): - df = pd.read_excel(path, sheet_name="patient_detail_report", header=None) - header_row = None - for i, row in df.iterrows(): - if "Visit Type" in [str(v).strip() for v in row]: - header_row = i - break - if header_row is None: - return [] - visits_df = df.iloc[header_row + 1:].copy() - visits_df.columns = range(visits_df.shape[1]) - rows = [] - for _, r in visits_df.iterrows(): - visit_type = to_str(r.get(0)) - if visit_type not in ("Past", "Upcoming"): - continue - rows.append({ - "visit_type": visit_type, - "scheduled_date": to_date(r.get(1)), - "window_days": to_str(r.get(2)), - "actual_date": to_date(r.get(3)), - "irt_transaction_no": to_int(r.get(4)), - "irt_transaction_description": to_str(r.get(5)), - "medication_assignment": to_str(r.get(6)), - "quantity_assigned": to_int(r.get(7)), - "medication_id": to_str(r.get(8)), - }) - return rows - - -def insert_import(cursor, study, source_file): - cursor.execute( - "INSERT INTO iwrs_import (study, imported_at, source_file) VALUES (%s, %s, %s)", - (study, datetime.datetime.now(), os.path.basename(source_file)), - ) - return cursor.lastrowid - - -def insert_uco3001_summary(cursor, import_id, df): - sql = """INSERT INTO iwrs_uco3001_subject_summary ( - import_id, subject, prior_subject_identifier, site, investigator, location, - cohort_per_irt, informed_consent_date, adolescent_assent_date, age, weight, - rescreened_subject, adt_ir, three_or_more_advanced_therapies, - only_oral_5asa_compounds, ustekinumab, isolated_proctitis, - clinical_responder_status_i12_m0, irt_subject_status, - i0_rand_date_local, last_irt_transaction, - last_irt_transaction_date_local, last_irt_transaction_date_utc, - next_irt_transaction, next_irt_transaction_date_local, - most_recent_med_assignment_date, days_since_last_med_assignment, - patient_forecast_status, patient_forecast_status_changed_date - ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""" - col = df.columns.tolist() - for _, r in df.iterrows(): - cursor.execute(sql, ( - import_id, - to_str(r["Subject"]), - to_str(r["Prior Subject Identifier"]) if "Prior Subject Identifier" in col else None, - to_str(r["Site"]), - to_str(r["Investigator"]), - to_str(r["Location"]), - to_str(r["Cohort per IRT"]), - to_date(r["Informed Consent Date"]), - to_date(r["Adolescent Assent Date"]) if "Adolescent Assent Date" in col else None, - to_int(r["Subject's age collection"]), - to_float(r["Subject's weight collection"]) if "Subject's weight collection" in col else None, - to_str(r["Rescreened Subject"]) if "Rescreened Subject" in col else None, - to_str(r["ADT-IR"]) if "ADT-IR" in col else None, - to_str(r["3 or More Advanced Therapies"]) if "3 or More Advanced Therapies" in col else None, - to_str(r["Only Oral 5-ASA Compounds"]) if "Only Oral 5-ASA Compounds" in col else None, - to_str(r["Ustekinumab"]) if "Ustekinumab" in col else None, - to_str(r["Isolated Proctitis"]) if "Isolated Proctitis" in col else None, - to_str(r["Clinical Responder Status at I-12 / M-0"]) if "Clinical Responder Status at I-12 / M-0" in col else None, - to_str(r["IRT Subject Status"]), - to_date(r["I0_RAND_TIMESTAMP_LOCAL [Local]"]) if "I0_RAND_TIMESTAMP_LOCAL [Local]" in col else None, - to_str(r["Last Recorded IRT Transaction"]), - to_date(r["Last Recorded IRT Transaction Date [Local]"]), - to_date(r["Last Recorded IRT Transaction Date (UTC)"]), - to_str(r["Next Expected IRT Transaction"]), - to_date(r["Next Expected IRT Transaction Date [Local]"]), - to_date(r["Most Recent Medication Assignment Transaction [Local]"]) if "Most Recent Medication Assignment Transaction [Local]" in col else None, - to_int(r["Days Since Last Medication Assignment Transaction"]) if "Days Since Last Medication Assignment Transaction" in col else None, - to_str(r["Patient Forecast Status"]) if "Patient Forecast Status" in col else None, - to_date(r["Patient Forecast Status Changed Date (UTC)"]) if "Patient Forecast Status Changed Date (UTC)" in col else None, - )) - - -def insert_mdd3003_summary(cursor, import_id, df): - sql = """INSERT INTO iwrs_mdd3003_subject_summary ( - import_id, subject, prior_subject_identifier, site, investigator, location, - cohort_per_irt, madrs_criteria_integrated, informed_consent_date, age, - madrs_criteria_v15, madrs_criteria_v16, madrs_criteria_v17, - stratification_country, age_group, stable_remitters, irt_subject_status, - last_irt_transaction, last_irt_transaction_date_local, - last_irt_transaction_date_utc, next_irt_transaction, - next_irt_transaction_date_local, date_screened, date_screen_failed, - date_randomized_part1, date_early_withdraw_randomized_part1, - date_open_label_induction, date_early_withdraw_open_label_induction, - date_randomized_part2, date_early_withdraw_randomized_part2, - date_completed, date_unblinded - ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""" - col = df.columns.tolist() - for _, r in df.iterrows(): - cursor.execute(sql, ( - import_id, - to_str(r["Subject"]), - to_str(r["Prior Subject Identifier"]) if "Prior Subject Identifier" in col else None, - to_str(r["Site"]), - to_str(r["Investigator"]), - to_str(r["Location"]), - to_str(r["Cohort per IRT"]), - to_str(r["MADRS response criteria integrated or manually entered"]) if "MADRS response criteria integrated or manually entered" in col else None, - to_date(r["Informed Consent Date"]), - to_int(r["Subject's age collection"]), - to_str(r["MADRS response criteria v1.5 from RAVE"]) if "MADRS response criteria v1.5 from RAVE" in col else None, - to_str(r["MADRS response criteria v1.6 from RAVE"]) if "MADRS response criteria v1.6 from RAVE" in col else None, - to_str(r["MADRS response criteria v1.7 from RAVE"]) if "MADRS response criteria v1.7 from RAVE" in col else None, - to_str(r["Stratification Country"]) if "Stratification Country" in col else None, - to_str(r["Age Group"]) if "Age Group" in col else None, - to_str(r["Stable Remitters vs. Non Stable Remitters"]) if "Stable Remitters vs. Non Stable Remitters" in col else None, - to_str(r["IRT Subject Status"]), - to_str(r["Last Recorded IRT Transaction"]), - to_date(r["Last Recorded IRT Transaction Date [Local]"]), - to_date(r["Last Recorded IRT Transaction Date (UTC)"]), - to_str(r["Next Expected IRT Transaction"]), - to_date(r["Next Expected IRT Transaction Date [Local]"]), - to_date(r["Date Screened [Local]"]) if "Date Screened [Local]" in col else None, - to_date(r["Date Screen Failed [Local]"]) if "Date Screen Failed [Local]" in col else None, - to_date(r["Date Randomized Part 1 [Local]"]) if "Date Randomized Part 1 [Local]" in col else None, - to_date(r["Date Early Withdraw Randomized Part 1 [Local]"]) if "Date Early Withdraw Randomized Part 1 [Local]" in col else None, - to_date(r["Date Open Label Induction [Local]"]) if "Date Open Label Induction [Local]" in col else None, - to_date(r["Date Early Withdraw Open Label Induction [Local]"]) if "Date Early Withdraw Open Label Induction [Local]" in col else None, - to_date(r["Date Randomized Part 2 [Local]"]) if "Date Randomized Part 2 [Local]" in col else None, - to_date(r["Date Early Withdraw Randomized Part 2 [Local]"]) if "Date Early Withdraw Randomized Part 2 [Local]" in col else None, - to_date(r["Date Completed [Local]"]) if "Date Completed [Local]" in col else None, - to_date(r["Date Unblinded [Local]"]) if "Date Unblinded [Local]" in col else None, - )) - - -def insert_visits(cursor, import_id, study, subject, visits): - if not visits: - return - sql = """INSERT INTO iwrs_subject_visits ( - import_id, study, subject, visit_type, scheduled_date, window_days, - actual_date, irt_transaction_no, irt_transaction_description, - medication_assignment, quantity_assigned, medication_id - ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)""" - for v in visits: - cursor.execute(sql, ( - import_id, study, subject, - v["visit_type"], v["scheduled_date"], v["window_days"], - v["actual_date"], v["irt_transaction_no"], - v["irt_transaction_description"], v["medication_assignment"], - v["quantity_assigned"], v["medication_id"], - )) - - -def import_to_mysql(summary_path, detail_files, study): - print(f"\n [MySQL] Importuji {study}...") - df_summary = read_summary_df(summary_path) - conn = get_conn() - cursor = conn.cursor() - - import_id = insert_import(cursor, study, summary_path) - - if study == "77242113UCO3001": - insert_uco3001_summary(cursor, import_id, df_summary) - else: - insert_mdd3003_summary(cursor, import_id, df_summary) - - total_visits = 0 - for path in detail_files: - fname = os.path.basename(path) - m = re.search(r"\d{4}-\d{2}-\d{2} \S+ (\S+) Subject Detail\.xlsx", fname) - subject = m.group(1) if m else "UNKNOWN" - visits = parse_detail_visits(path) - insert_visits(cursor, import_id, study, subject, visits) - total_visits += len(visits) - - conn.commit() - cursor.close() - conn.close() - print(f" [MySQL] import_id={import_id} | pacientů={len(df_summary)} | transakcí={total_visits}") - return import_id - - -# ── MAIN ───────────────────────────────────────────────────────────────────── +# ── KROK 3: Import do MongoDB ──────────────────────────────────────────────── def main(): today = datetime.date.today().strftime("%Y-%m-%d") @@ -391,12 +120,12 @@ def main(): summary_paths = {} - # ── Krok 1 + 2: stahování (Playwright, každá studie zvlášť kvůli session) ── + # Krok 1 + 2: stahování (Playwright, každá studie zvlášť kvůli session) with sync_playwright() as p: for study in STUDIES: - print(f"\n{'='*60}") + print("\n" + "=" * 60) print(f"[{study}] KROK 1: Subject Summary Report") - print(f"{'='*60}") + print("=" * 60) browser = p.chromium.launch(headless=False) context = browser.new_context(accept_downloads=True) page = context.new_page() @@ -415,10 +144,10 @@ def main(): finally: browser.close() - # ── Krok 3: import do MySQL ────────────────────────────────────────────── - print(f"\n{'='*60}") - print("KROK 3: Import do MySQL") - print(f"{'='*60}") + # Krok 3: import do MongoDB + print("\n" + "=" * 60) + print("KROK 3: Import do MongoDB") + print("=" * 60) for study in STUDIES: summary_path = summary_paths.get(study) @@ -426,18 +155,21 @@ def main(): print(f" [{study}] PŘESKOČENO — stahování selhalo") continue - detail_files = sorted(glob.glob( - os.path.join(DETAILS_DIR, study, f"{today} {study} * Subject Detail.xlsx") - )) - try: - import_to_mysql(summary_path, detail_files, study) + import_to_mongo.run(study, summary_path, DETAILS_DIR, today) except Exception as e: - print(f" [{study}] CHYBA při importu: {e}") + print(f" [{study}] CHYBA při importu summary/visits: {e}") - print(f"\n{'='*60}") + # Notifikace: PDF/JSON z disku rovnou do Mongo iwrs_notifications + print("\n [notifikace] import PDF/JSON do Mongo...") + try: + import_notifications_to_mongo.main(STUDIES) + except Exception as e: + print(f" CHYBA při importu notifikací: {e}") + + print("\n" + "=" * 60) print("Vše hotovo.") - print(f"{'='*60}") + print("=" * 60) main() diff --git a/Python-runner/download_attachments_v1.0.py b/Python-runner/download_attachments_v1.0.py deleted file mode 100644 index 75901f7..0000000 --- a/Python-runner/download_attachments_v1.0.py +++ /dev/null @@ -1,449 +0,0 @@ -""" -download_attachments_v1.0.py -Nazev: download_attachments_v1.0.py -Verze: 1.0 -Datum: 2026-06-02 -Autor: vladimir.buzalka - -Popis: - Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB kolekce - ordinace@buzalkova.cz primo pres Microsoft Graph API a uklada je do - adresare /mnt/Emails/ordinace@buzalkova.cz/Attachments/. - - Deduplikace podle SHA256 hashe obsahu: - - stejny hash = soubor uz existuje -> preskoci - - prvni vyskytu souboru: ulozi pod puvodnimnazvem - - kolize nazvu (stejny nazev, jiny hash): faktura_2.pdf, faktura_3.pdf ... - - Po ulozeni aktualizuje MongoDB: - - v email dokumentu: kazda priloha dostane file_hash + local_path - - kolekce emaily.attachments_index: _id=hash, filename, path, size_bytes, - mime_type, first_seen_at, ref_count (pocet emailu ktery ji obsahuje) - - Bezpecne prerusit a opakovat: - - zpravy kde jsou vsechny prilohy uz stazene (maji file_hash) se preskoci - - --force-recheck znovu overi i uz stazene (pro pripad zmen na disku) - - POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky! - -Spousteni: - python download_attachments_v1.0.py # stahni vse co chybi - python download_attachments_v1.0.py --limit 50 # test na prvnich 50 emailech - python download_attachments_v1.0.py --force-recheck # overi i uz stazene - -Docker (po pridani mountu /mnt/user/Emails -> /mnt/Emails): - docker exec -it python-runner python /scripts/download_attachments_v1.0.py - -Zavislosti: - msal, requests, pymongo, python-dateutil - Python 3.10+ - -Struktura na disku: - /mnt/Emails/ - └── ordinace@buzalkova.cz/ - └── Attachments/ - ├── faktura_2026.pdf - ├── vysledky_lab.pdf - ├── vysledky_lab_2.pdf <- kolize nazvu, jiny obsah - └── ... - -Kolekce emaily.attachments_index: - _id SHA256 hash (hex) - filename nazev souboru na disku (prvni vyskytu) - local_path relativni cesta od Attachments/ (zatim = filename) - size_bytes velikost souboru - mime_type MIME typ - first_seen_at datetime UTC - ref_count v kolika emailech se tato priloha vyskytuje - -Aktualizace v email dokumentu (kolekce ordinace@buzalkova.cz): - attachments[i].file_hash SHA256 hash - attachments[i].local_path cesta relativni od Attachments/ - -Historie verzi: - 1.0 2026-06-02 Inicialni verze -""" - -import sys -import hashlib -import logging -import argparse -from pathlib import Path -from datetime import datetime, timezone -from typing import Optional - -import msal -import requests -from pymongo import MongoClient, UpdateOne - -if hasattr(sys.stdout, "reconfigure"): - sys.stdout.reconfigure(encoding="utf-8", errors="replace") - -# ─── KONFIGURACE ────────────────────────────────────────────────────────────── -GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9" -GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f" -GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk" -GRAPH_MAILBOX = "ordinace@buzalkova.cz" -GRAPH_URL = "https://graph.microsoft.com/v1.0" - -MONGO_URI = "mongodb://192.168.1.76:27017" -MONGO_DB = "emaily" -MONGO_COL_EMAILS = "ordinace@buzalkova.cz" -MONGO_COL_INDEX = "attachments_index" - -ATTACHMENTS_DIR = Path("/mnt/Emails/ordinace@buzalkova.cz/Attachments") -LOG_FILE = Path(__file__).parent / "parse_emails_errors.log" -SCRIPT_VERSION = "1.0" -BATCH_SIZE = 50 -# ────────────────────────────────────────────────────────────────────────────── - -logging.basicConfig( - filename=str(LOG_FILE), - level=logging.ERROR, - format="%(asctime)s | %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - encoding="utf-8", -) - -_graph_token: Optional[str] = None - - -# ─── Graph API ──────────────────────────────────────────────────────────────── - -def get_token() -> str: - global _graph_token - app = msal.ConfidentialClientApplication( - GRAPH_CLIENT_ID, - authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}", - client_credential=GRAPH_CLIENT_SECRET, - ) - result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"]) - if "access_token" not in result: - raise RuntimeError(f"Graph auth failed: {result}") - _graph_token = result["access_token"] - return _graph_token - - -def graph_get_bytes(url: str) -> bytes: - """Stahne binarni obsah prilohy.""" - global _graph_token - if not _graph_token: - get_token() - for attempt in range(2): - r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, timeout=120, stream=True) - if r.status_code == 401: - get_token() - continue - r.raise_for_status() - return r.content - raise RuntimeError(f"Graph GET bytes failed: {url}") - - -def graph_get_json(url: str, params: dict = None) -> dict: - global _graph_token - if not _graph_token: - get_token() - for attempt in range(2): - r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, params=params, timeout=30) - if r.status_code == 401: - get_token() - continue - r.raise_for_status() - return r.json() - raise RuntimeError(f"Graph GET json failed: {url}") - - -def fetch_attachment_content(graph_message_id: str, attachment_id: str) -> Optional[bytes]: - """Stahne obsah prilohy pres Graph API.""" - url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/messages/{graph_message_id}/attachments/{attachment_id}/$value" - try: - return graph_get_bytes(url) - except Exception as e: - logging.error("fetch_attachment_content failed [msg=%s att=%s]: %s", graph_message_id, attachment_id, e) - return None - - -def fetch_message_attachments(graph_message_id: str) -> list[dict]: - """Nacte seznam priloh zpravy z Graph API (metadata vcetne attachment ID).""" - url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/messages/{graph_message_id}/attachments" - try: - data = graph_get_json(url, {"$select": "id,name,contentType,size,isInline,contentId"}) - return data.get("value", []) - except Exception as e: - logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, e) - return [] - - -# ─── Dedup + ukládání ───────────────────────────────────────────────────────── - -def sha256(data: bytes) -> str: - return hashlib.sha256(data).hexdigest() - - -def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, index_col) -> str: - """ - Vrati nazev souboru ktery pouzit pro ulozeni. - Pokud desired_name jiz existuje s jinym hashem, prida suffix _2, _3 ... - """ - # Zkontroluj jestli existujici soubor se stejnym nazvem ma stejny hash - existing = index_col.find_one({"filename": desired_name}) - if existing: - if existing["_id"] == hash_val: - return desired_name # Stejny hash, stejne jmeno — dedup hit - # Jiny hash — hledej volny suffix - stem = Path(desired_name).stem - suffix = Path(desired_name).suffix - n = 2 - while True: - candidate = f"{stem}_{n}{suffix}" - if not (att_dir / candidate).exists(): - # Overi ze ani v indexu neni tento kandidat s jinym hashem - ex2 = index_col.find_one({"filename": candidate}) - if not ex2 or ex2["_id"] == hash_val: - return candidate - n += 1 - return desired_name - - -def save_attachment(content: bytes, original_name: str, att_dir: Path, index_col) -> tuple[str, str, bool]: - """ - Ulozi prilohu s deduplikaci. - Vraci (hash, local_path, was_new): - was_new=True -> soubor byl ulozen - was_new=False -> hash uz existoval, soubor preskocen - """ - hash_val = sha256(content) - - # Zkontroluj index — pokud hash uz existuje, vrat existujici zaznam - existing = index_col.find_one({"_id": hash_val}) - if existing: - # Zvys pocitadlo referenci - index_col.update_one({"_id": hash_val}, {"$inc": {"ref_count": 1}}) - return hash_val, existing["local_path"], False - - # Novy soubor — urcit nazev - safe_name = "".join(c if c.isalnum() or c in "._- " else "_" for c in original_name).strip() - if not safe_name: - safe_name = f"attachment_{hash_val[:8]}" - - filename = resolve_filename(safe_name, att_dir, hash_val, index_col) - file_path = att_dir / filename - - # Uloz soubor - file_path.write_bytes(content) - - # Zaznamenej do indexu - index_col.insert_one({ - "_id": hash_val, - "filename": filename, - "local_path": filename, - "size_bytes": len(content), - "mime_type": "", - "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None), - "ref_count": 1, - }) - - return hash_val, filename, True - - -# ─── MAIN ───────────────────────────────────────────────────────────────────── - -def main(): - ap = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}") - ap.add_argument("--limit", type=int, default=0, - help="Zpracovat max N emailu (0 = vse)") - ap.add_argument("--force-recheck", action="store_true", - help="Znovu overi i emaily kde prilohy uz maji file_hash") - ap.add_argument("--no-indexes", action="store_true", - help="Nevytvorit indexy na konci") - args = ap.parse_args() - - start = datetime.now() - print(f"=== download_attachments v{SCRIPT_VERSION} ===") - print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}") - print(f"Schránka: {GRAPH_MAILBOX}") - print(f"Cilovy adresar: {ATTACHMENTS_DIR}") - print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}") - - # Adresar - ATTACHMENTS_DIR.mkdir(parents=True, exist_ok=True) - print(f" Adresar OK") - - # Graph - print("\nPřipojuji se k Graph API...") - try: - get_token() - print(" Graph API OK") - except Exception as e: - print(f" CHYBA: {e}") - sys.exit(1) - - # MongoDB - client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000) - try: - client.admin.command("ping") - print(" MongoDB OK") - except Exception as e: - print(f" CHYBA: MongoDB neni dostupna -- {e}") - sys.exit(1) - - col_emails = client[MONGO_DB][MONGO_COL_EMAILS] - col_index = client[MONGO_DB][MONGO_COL_INDEX] - - # Indexy na attachment index kolekci - if not args.no_indexes: - col_index.create_index("filename") - col_index.create_index("mime_type") - - # Dotaz — emaily s prilohou ktere jeste nebyly zpracovany - if args.force_recheck: - query = {"has_attachments": True} - else: - query = { - "has_attachments": True, - "attachments": { - "$elemMatch": { - "is_inline": False, - "file_hash": {"$exists": False}, - } - } - } - - total = col_emails.count_documents(query) - print(f"\nEmailu ke zpracovani: {total}") - if total == 0: - print("Neni co stahnout.") - client.close() - return - - cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1}) - if args.limit: - cursor = cursor.limit(args.limit) - - ok_count = 0 - new_count = 0 - skip_count = 0 - err_count = 0 - email_i = 0 - batch = [] - - def flush(): - if not batch: - return - try: - col_emails.bulk_write(batch, ordered=False) - except Exception as e: - logging.error("bulk_write: %s", e) - print(f" CHYBA bulk_write: {e}") - batch.clear() - - for email_doc in cursor: - email_i += 1 - email_id = email_doc["_id"] - graph_id = email_doc.get("graph_id", "") - subject = (email_doc.get("subject") or "")[:60] - att_list = email_doc.get("attachments") or [] - - # Jen skutecne prilohy - real_atts = [a for a in att_list if not a.get("is_inline", False)] - if not real_atts: - continue - - print(f"\n {email_i:>5}/{total} {subject}") - - # Nacti attachment IDs z Graph API - graph_atts = fetch_message_attachments(graph_id) - graph_att_map = {a["name"]: a for a in graph_atts if not a.get("isInline", False)} - - updated_atts = list(att_list) - email_ok = True - - for i, att in enumerate(updated_atts): - if att.get("is_inline", False): - continue - if not args.force_recheck and att.get("file_hash"): - skip_count += 1 - print(f" SKIP {att['filename']}") - continue - - att_name = att.get("filename", "") - graph_att = graph_att_map.get(att_name) - - if not graph_att: - # Zkus najit podle casti nazvu - for gname, ga in graph_att_map.items(): - if att_name.lower() in gname.lower(): - graph_att = ga - break - - if not graph_att: - logging.error("attachment not found in Graph [email=%s att=%s]", email_id, att_name) - print(f" ERR {att_name} (nenalezeno v Graph)") - err_count += 1 - email_ok = False - continue - - # Stahni obsah - content = fetch_attachment_content(graph_id, graph_att["id"]) - if content is None: - err_count += 1 - email_ok = False - print(f" ERR {att_name} (stazeni selhalo)") - continue - - # Uloz s dedupem - hash_val, local_path, was_new = save_attachment(content, att_name, ATTACHMENTS_DIR, col_index) - - # Aktualizuj MIME typ v indexu - col_index.update_one( - {"_id": hash_val}, - {"$set": {"mime_type": att.get("mime_type", graph_att.get("contentType", ""))}}, - ) - - # Zaznamenej do emailu - updated_atts[i] = {**att, "file_hash": hash_val, "local_path": local_path} - - if was_new: - new_count += 1 - print(f" NEW {local_path} ({len(content):,} B)") - else: - skip_count += 1 - print(f" DUP {att_name} -> {local_path}") - - if email_ok: - ok_count += 1 - - # Uloz aktualizovane prilohy zpet do emailu - batch.append(UpdateOne( - {"_id": email_id}, - {"$set": {"attachments": updated_atts}} - )) - - if len(batch) >= BATCH_SIZE: - flush() - - if email_i % 100 == 0: - elapsed = (datetime.now() - start).total_seconds() - print(f" {'─'*60}") - print(f" Průběh: emaily={email_i}/{total} nove={new_count} dup={skip_count} err={err_count}") - print(f" {'─'*60}") - - flush() - - elapsed_total = (datetime.now() - start).total_seconds() - files_total = col_index.count_documents({}) - size_total = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1})) - - print(f"\n{'='*52}") - print(f"Vysledek: emaily={ok_count} | nove soubory={new_count} | duplikaty={skip_count} | err={err_count}") - print(f"Souboru v indexu: {files_total} ({size_total/1024/1024:.1f} MB)") - print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s") - print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") - if err_count: - print(f"Chyby logovany do: {LOG_FILE}") - - client.close() - - -if __name__ == "__main__": - main() diff --git a/Python-runner/download_attachments_v1.1.py b/Python-runner/download_attachments_v1.1.py deleted file mode 100644 index cbbaed3..0000000 --- a/Python-runner/download_attachments_v1.1.py +++ /dev/null @@ -1,428 +0,0 @@ -""" -download_attachments_v1.1.py -Nazev: download_attachments_v1.1.py -Verze: 1.1 -Datum: 2026-06-02 -Autor: vladimir.buzalka - -Popis: - Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB - pres Microsoft Graph API a uklada je do adresare - /mnt/Emails//Attachments/. - - Schránka se predava jako povinny parametr --mailbox. - - Deduplikace podle SHA256 hashe obsahu: - - stejny hash = soubor uz existuje -> preskoci - - prvni vyskytu souboru: ulozi pod puvodnimnazvem - - kolize nazvu (stejny nazev, jiny hash): faktura_2.pdf, faktura_3.pdf ... - - Po ulozeni aktualizuje MongoDB: - - v email dokumentu: kazda priloha dostane file_hash + local_path - - kolekce emaily.attachments_index: _id=hash, filename, path, size_bytes, - mime_type, mailbox, first_seen_at, ref_count - - Bezpecne prerusit a opakovat — emaily kde vsechny prilohy maji file_hash - se preskoci. --force-recheck znovu overi i uz stazene. - - POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky! - -Spousteni: - python download_attachments_v1.1.py --mailbox ordinace@buzalkova.cz - python download_attachments_v1.1.py --mailbox vladimir.buzalka@buzalka.cz --limit 50 - python download_attachments_v1.1.py --mailbox ordinace@buzalkova.cz --force-recheck - -Docker: - docker exec -it python-runner python /scripts/download_attachments_v1.1.py \\ - --mailbox ordinace@buzalkova.cz - -Zavislosti: - msal, requests, pymongo - Python 3.10+ - -Struktura na disku: - /mnt/Emails/ - └── / - └── Attachments/ - ├── faktura_2026.pdf - ├── vysledky_lab.pdf - ├── vysledky_lab_2.pdf - └── ... - -Kolekce emaily.attachments_index: - _id SHA256 hash (hex) - filename nazev souboru na disku - local_path relativni cesta od Attachments/ - size_bytes velikost souboru - mime_type MIME typ - mailbox schránka ze ktere pochazi prvni vyskytu - first_seen_at datetime UTC - ref_count v kolika emailech se tato priloha vyskytuje - -Historie verzi: - 1.0 2026-06-02 Inicialni verze - 1.1 2026-06-02 Schránka jako parametr --mailbox (univerzalni pouziti) -""" - -import sys -import hashlib -import logging -import argparse -from pathlib import Path -from datetime import datetime, timezone -from typing import Optional - -import msal -import requests -from pymongo import MongoClient, UpdateOne - -if hasattr(sys.stdout, "reconfigure"): - sys.stdout.reconfigure(encoding="utf-8", errors="replace") - -# ─── KONFIGURACE ────────────────────────────────────────────────────────────── -GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9" -GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f" -GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk" -GRAPH_URL = "https://graph.microsoft.com/v1.0" - -MONGO_URI = "mongodb://192.168.1.76:27017" -MONGO_DB = "emaily" -MONGO_COL_INDEX = "attachments_index" - -EMAILS_BASE_DIR = Path("/mnt/Emails") -LOG_FILE = Path(__file__).parent / "parse_emails_errors.log" -SCRIPT_VERSION = "1.1" -BATCH_SIZE = 50 -# ────────────────────────────────────────────────────────────────────────────── - -logging.basicConfig( - filename=str(LOG_FILE), - level=logging.ERROR, - format="%(asctime)s | %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - encoding="utf-8", -) - -_graph_token: Optional[str] = None - - -# ─── Graph API ──────────────────────────────────────────────────────────────── - -def get_token() -> str: - global _graph_token - app = msal.ConfidentialClientApplication( - GRAPH_CLIENT_ID, - authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}", - client_credential=GRAPH_CLIENT_SECRET, - ) - result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"]) - if "access_token" not in result: - raise RuntimeError(f"Graph auth failed: {result}") - _graph_token = result["access_token"] - return _graph_token - - -def graph_get_bytes(url: str) -> bytes: - global _graph_token - if not _graph_token: - get_token() - for attempt in range(2): - r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, timeout=120, stream=True) - if r.status_code == 401: - get_token() - continue - r.raise_for_status() - return r.content - raise RuntimeError(f"Graph GET bytes failed: {url}") - - -def graph_get_json(url: str, params: dict = None) -> dict: - global _graph_token - if not _graph_token: - get_token() - for attempt in range(2): - r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, params=params, timeout=30) - if r.status_code == 401: - get_token() - continue - r.raise_for_status() - return r.json() - raise RuntimeError(f"Graph GET json failed: {url}") - - -def fetch_message_attachments(mailbox: str, graph_message_id: str) -> list[dict]: - url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments" - try: - data = graph_get_json(url, {"$select": "id,name,contentType,size,isInline,contentId"}) - return data.get("value", []) - except Exception as e: - logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, e) - return [] - - -def fetch_attachment_content(mailbox: str, graph_message_id: str, attachment_id: str) -> Optional[bytes]: - url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments/{attachment_id}/$value" - try: - return graph_get_bytes(url) - except Exception as e: - logging.error("fetch_attachment_content failed [msg=%s att=%s]: %s", graph_message_id, attachment_id, e) - return None - - -# ─── Dedup + ukládání ───────────────────────────────────────────────────────── - -def sha256(data: bytes) -> str: - return hashlib.sha256(data).hexdigest() - - -def safe_filename(name: str) -> str: - safe = "".join(c if c.isalnum() or c in "._- " else "_" for c in name).strip() - return safe or "attachment" - - -def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, col_index) -> str: - """Vrati nazev souboru pro ulozeni — resi kolize (stejny nazev, jiny hash).""" - existing = col_index.find_one({"filename": desired_name}) - if existing: - if existing["_id"] == hash_val: - return desired_name # Dedup hit — stejny hash - # Kolize — hledej volny suffix - stem = Path(desired_name).stem - suffix = Path(desired_name).suffix - n = 2 - while True: - candidate = f"{stem}_{n}{suffix}" - ex2 = col_index.find_one({"filename": candidate}) - if not ex2 or ex2["_id"] == hash_val: - if not (att_dir / candidate).exists() or (ex2 and ex2["_id"] == hash_val): - return candidate - n += 1 - return desired_name - - -def save_attachment( - content: bytes, - original_name: str, - mime_type: str, - mailbox: str, - att_dir: Path, - col_index, -) -> tuple[str, str, bool]: - """ - Ulozi prilohu s deduplikaci. - Vraci (hash, local_path, was_new). - """ - hash_val = sha256(content) - - existing = col_index.find_one({"_id": hash_val}) - if existing: - col_index.update_one({"_id": hash_val}, {"$inc": {"ref_count": 1}}) - return hash_val, existing["local_path"], False - - filename = resolve_filename(safe_filename(original_name), att_dir, hash_val, col_index) - file_path = att_dir / filename - file_path.write_bytes(content) - - col_index.insert_one({ - "_id": hash_val, - "filename": filename, - "local_path": filename, - "size_bytes": len(content), - "mime_type": mime_type, - "mailbox": mailbox, - "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None), - "ref_count": 1, - }) - - return hash_val, filename, True - - -# ─── MAIN ───────────────────────────────────────────────────────────────────── - -def main(): - ap = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}") - ap.add_argument("--mailbox", required=True, - help="Emailova schranka (napr. ordinace@buzalkova.cz)") - ap.add_argument("--limit", type=int, default=0, - help="Zpracovat max N emailu (0 = vse)") - ap.add_argument("--force-recheck", action="store_true", - help="Znovu overi i emaily kde prilohy uz maji file_hash") - ap.add_argument("--no-indexes", action="store_true", - help="Nevytvorit indexy na attachments_index kolekci") - args = ap.parse_args() - - mailbox = args.mailbox - att_dir = EMAILS_BASE_DIR / mailbox / "Attachments" - mongo_col = mailbox - - start = datetime.now() - print(f"=== download_attachments v{SCRIPT_VERSION} ===") - print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}") - print(f"Schránka: {mailbox}") - print(f"Cilovy adresar: {att_dir}") - print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{mongo_col}") - - att_dir.mkdir(parents=True, exist_ok=True) - print(" Adresar OK") - - print("\nPřipojuji se k Graph API...") - try: - get_token() - print(" Graph API OK") - except Exception as e: - print(f" CHYBA: {e}") - sys.exit(1) - - client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000) - try: - client.admin.command("ping") - print(" MongoDB OK") - except Exception as e: - print(f" CHYBA: MongoDB neni dostupna -- {e}") - sys.exit(1) - - col_emails = client[MONGO_DB][mongo_col] - col_index = client[MONGO_DB][MONGO_COL_INDEX] - - if not args.no_indexes: - col_index.create_index("filename") - col_index.create_index("mime_type") - col_index.create_index("mailbox") - - # Dotaz - if args.force_recheck: - query = {"has_attachments": True} - else: - query = { - "has_attachments": True, - "attachments": { - "$elemMatch": { - "is_inline": False, - "file_hash": {"$exists": False}, - } - } - } - - total = col_emails.count_documents(query) - print(f"\nEmailu ke zpracovani: {total}") - if total == 0: - print("Neni co stahnout.") - client.close() - return - - cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1}) - if args.limit: - cursor = cursor.limit(args.limit) - - ok_count = 0 - new_count = 0 - dup_count = 0 - err_count = 0 - email_i = 0 - batch = [] - - def flush(): - if not batch: - return - try: - col_emails.bulk_write(batch, ordered=False) - except Exception as e: - logging.error("bulk_write: %s", e) - print(f" CHYBA bulk_write: {e}") - batch.clear() - - for email_doc in cursor: - email_i += 1 - email_id = email_doc["_id"] - graph_id = email_doc.get("graph_id", "") - subject = (email_doc.get("subject") or "")[:60] - att_list = email_doc.get("attachments") or [] - - real_atts = [a for a in att_list if not a.get("is_inline", False)] - if not real_atts: - continue - - print(f"\n {email_i:>5}/{total} {subject}") - - graph_atts = fetch_message_attachments(mailbox, graph_id) - graph_att_map = {a["name"]: a for a in graph_atts if not a.get("isInline", False)} - - updated_atts = list(att_list) - email_ok = True - - for i, att in enumerate(updated_atts): - if att.get("is_inline", False): - continue - if not args.force_recheck and att.get("file_hash"): - print(f" SKIP {att['filename']}") - continue - - att_name = att.get("filename", "") - graph_att = graph_att_map.get(att_name) - if not graph_att: - for gname, ga in graph_att_map.items(): - if att_name.lower() in gname.lower(): - graph_att = ga - break - - if not graph_att: - logging.error("attachment not found in Graph [email=%s att=%s]", email_id, att_name) - print(f" ERR {att_name} (nenalezeno v Graph)") - err_count += 1 - email_ok = False - continue - - content = fetch_attachment_content(mailbox, graph_id, graph_att["id"]) - if content is None: - err_count += 1 - email_ok = False - print(f" ERR {att_name} (stazeni selhalo)") - continue - - mime_type = att.get("mime_type") or graph_att.get("contentType", "") - hash_val, local_path, was_new = save_attachment( - content, att_name, mime_type, mailbox, att_dir, col_index - ) - - updated_atts[i] = {**att, "file_hash": hash_val, "local_path": local_path} - - if was_new: - new_count += 1 - print(f" NEW {local_path} ({len(content):,} B)") - else: - dup_count += 1 - print(f" DUP {att_name} -> {local_path}") - - if email_ok: - ok_count += 1 - - batch.append(UpdateOne({"_id": email_id}, {"$set": {"attachments": updated_atts}})) - - if len(batch) >= BATCH_SIZE: - flush() - - if email_i % 100 == 0: - elapsed = (datetime.now() - start).total_seconds() - print(f" {'─'*60}") - print(f" Průběh: emaily={email_i}/{total} nove={new_count} dup={dup_count} err={err_count}") - print(f" {'─'*60}") - - flush() - - elapsed_total = (datetime.now() - start).total_seconds() - files_total = col_index.count_documents({}) - size_total = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1})) - - print(f"\n{'='*52}") - print(f"Vysledek: emaily={ok_count} | nove={new_count} | dup={dup_count} | err={err_count}") - print(f"Souboru v indexu: {files_total} ({size_total / 1024 / 1024:.1f} MB)") - print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s") - print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") - if err_count: - print(f"Chyby logovany do: {LOG_FILE}") - - client.close() - - -if __name__ == "__main__": - main() diff --git a/Python-runner/parse_emails_graph_v1.0.py b/Python-runner/parse_emails_graph_v1.0.py deleted file mode 100644 index 1653b05..0000000 --- a/Python-runner/parse_emails_graph_v1.0.py +++ /dev/null @@ -1,560 +0,0 @@ -""" -parse_emails_graph_v1.0.py -Nazev: parse_emails_graph_v1.0.py -Verze: 1.0 -Datum: 2026-06-02 -Autor: vladimir.buzalka - -Popis: - Cte vsechny emaily ze schranky ordinace@buzalkova.cz primo pres - Microsoft Graph API a importuje je jako dokumenty do MongoDB. - Ze kazde zpravy extrahuje vsechny dostupne vlastnosti: - - - predmet, odesilatel, prijemci (To/CC/BCC s typy) - - cas doruceni, odeslani, vytvoreni, modifikace (UTC) - - telo HTML (max 2 MB) + textovy preview - - prilohy (metadata: jmeno, velikost, MIME typ, inline flag) - - internet headers (SPF, DKIM, Received, X-*, ...) - - MAPI-ekvivalenty: dulezitost, priznak, konverzacni vlakno, - kategorie, In-Reply-To, References, ... - - navic: isRead, isDraft, folder_path, inferenceClassification - - Prochazi VSECHNY slozky schranky rekurzivne (Inbox, Sent, Deleted, - archivni slozky, ...). - - DB: emaily - Kolekce: ordinace@buzalkova.cz - _id: Internet Message-ID (nebo "graphid:" jako fallback) - - Bezpecne prerusit a opakovat: - - upsert podle _id — duplicity se automaticky prepisi - - --skip-existing nacte seznam hotovych _id z MongoDB a preskoci je - - POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky! - -Spousteni: - python parse_emails_graph_v1.0.py # kompletni import - python parse_emails_graph_v1.0.py --limit 50 # test na prvnich 50 - python parse_emails_graph_v1.0.py --skip-existing # pokracovani po preruseni - python parse_emails_graph_v1.0.py --folder Inbox # jen jedna slozka - python parse_emails_graph_v1.0.py --no-indexes # bez indexu na konci - -Zavislosti: - msal, requests, pymongo, python-dateutil - Python 3.10+ - -Struktura dokumentu v MongoDB: - _id Internet Message-ID (nebo graphid: fallback) - graph_id Graph API message ID (pro pripadne dalsi operace) - subject predmet zpravy - normalized_subject predmet bez RE:/FW:/AW: prefixu - importance 0=nizka 1=normalni 2=vysoka - flag_status 0=bez priznaku 1=oznaceno 2=dokonceno - is_read bool — aktualni stav precteni ve schrance - is_draft bool - has_attachments bool - attachment_count int - inference_classification focused / other (Outlook AI trideni) - categories [str] - conversation_id Graph conversationId - conversation_index base64 conversationIndex - conversation_topic tema vlakna (z internet headers Thread-Topic) - in_reply_to Message-ID predchozi zpravy - internet_references [Message-ID] — cela historia vlakna - received_at datetime UTC - sent_at datetime UTC - created_at datetime UTC — cas vytvoreni zaznamu v M365 - modified_at datetime UTC — cas posledni modifikace - folder_id Graph parentFolderId - folder_path cela cesta slozky (napr. Inbox/Subfolder) - sender.email emailova adresa odesilatele - sender.name zobrazovane jmeno odesilatele - to retezec To (joined) - cc retezec CC - bcc retezec BCC - recipients [{type, email, name}] — to/cc/bcc s typy - body_html HTML telo (max 2 MB) - body_preview textovy nahled (max 255 znaku z Graph) - attachments [{filename, size_bytes, mime_type, - content_id, is_inline}] - headers dict internet headers (lowercase_s_podtrzitky) - parsed_at datetime UTC — cas parsovani - -Indexy: - received_at, sent_at, sender.email, graph_id (unique), - conversation_id, folder_path, has_attachments, categories, - importance, flag_status, is_read, - text_search (subject + body_preview + to + cc) - -Historie verzi: - 1.0 2026-06-02 Inicialni verze — Graph API jako zdroj -""" - -import sys -import re -import logging -import argparse -import base64 -from pathlib import Path -from datetime import datetime, timezone -from typing import Optional - -import msal -import requests -from dateutil import parser as dtparser -from pymongo import MongoClient, UpdateOne, ASCENDING, TEXT - -if hasattr(sys.stdout, "reconfigure"): - sys.stdout.reconfigure(encoding="utf-8", errors="replace") - -# ─── KONFIGURACE ────────────────────────────────────────────────────────────── -GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9" -GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f" -GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk" -GRAPH_MAILBOX = "ordinace@buzalkova.cz" -GRAPH_URL = "https://graph.microsoft.com/v1.0" - -MONGO_URI = "mongodb://192.168.1.76:27017" -MONGO_DB = "emaily" -MONGO_COL = "ordinace@buzalkova.cz" -BATCH_SIZE = 100 -PAGE_SIZE = 50 -LOG_FILE = Path(__file__).parent / "parse_emails_errors.log" -SCRIPT_VERSION = "1.0" -# ────────────────────────────────────────────────────────────────────────────── - -logging.basicConfig( - filename=str(LOG_FILE), - level=logging.ERROR, - format="%(asctime)s | %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - encoding="utf-8", -) - -IMPORTANCE_MAP = {"low": 0, "normal": 1, "high": 2} -FLAG_STATUS_MAP = {"notFlagged": 0, "flagged": 1, "complete": 2} -RE_SUBJECT = re.compile(r"^(RE|FW|AW|SV|VS|TR|WG|odpov[eě]d[ťt]|fwd?)[:\s]+", re.IGNORECASE) - -MSG_SELECT = ( - "id,internetMessageId,subject,bodyPreview,body," - "importance,isRead,isDraft,hasAttachments," - "receivedDateTime,sentDateTime,createdDateTime,lastModifiedDateTime," - "sender,from,toRecipients,ccRecipients,bccRecipients,replyTo," - "conversationId,conversationIndex,parentFolderId," - "categories,flag,inferenceClassification,internetMessageHeaders" -) - - -# ─── Graph API helpers ──────────────────────────────────────────────────────── - -_graph_token: Optional[str] = None - - -def get_token() -> str: - global _graph_token - app = msal.ConfidentialClientApplication( - GRAPH_CLIENT_ID, - authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}", - client_credential=GRAPH_CLIENT_SECRET, - ) - result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"]) - if "access_token" not in result: - raise RuntimeError(f"Graph auth failed: {result}") - _graph_token = result["access_token"] - return _graph_token - - -def graph_get(url: str, params: dict = None) -> dict: - global _graph_token - if not _graph_token: - get_token() - for attempt in range(2): - r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, params=params, timeout=30) - if r.status_code == 401: - get_token() - continue - r.raise_for_status() - return r.json() - raise RuntimeError(f"Graph GET failed after retry: {url}") - - -def get_all_folders(parent_id: str = None, parent_path: str = "") -> list[dict]: - """Rekurzivne nacte vsechny slozky schranky. Vraci [{id, path}].""" - if parent_id is None: - url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders" - else: - url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{parent_id}/childFolders" - - folders = [] - params = {"$top": 100, "$select": "id,displayName,childFolderCount"} - while url: - data = graph_get(url, params) - for f in data.get("value", []): - path = f"{parent_path}/{f['displayName']}".lstrip("/") - folders.append({"id": f["id"], "path": path}) - if f.get("childFolderCount", 0) > 0: - folders.extend(get_all_folders(f["id"], path)) - url = data.get("@odata.nextLink") - params = None - return folders - - -def iter_folder_messages(folder_id: str): - """Generator: vraci zpravy ze slozky po strankach.""" - url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{folder_id}/messages" - params = {"$top": PAGE_SIZE, "$select": MSG_SELECT, "$expand": "attachments"} - while url: - data = graph_get(url, params) - for msg in data.get("value", []): - yield msg - url = data.get("@odata.nextLink") - params = None - - -# ─── Pomocné funkce ─────────────────────────────────────────────────────────── - -def parse_date(raw) -> Optional[datetime]: - if raw is None: - return None - if isinstance(raw, datetime): - if raw.tzinfo: - return raw.astimezone(timezone.utc).replace(tzinfo=None) - return raw - try: - dt = dtparser.parse(str(raw)) - if dt.tzinfo: - return dt.astimezone(timezone.utc).replace(tzinfo=None) - return dt - except Exception: - return None - - -def normalize_subject(subject: str) -> str: - s = subject.strip() - while True: - m = RE_SUBJECT.match(s) - if not m: - break - s = s[m.end():].strip() - return s - - -def parse_headers(raw_headers: list) -> dict: - result = {} - for h in raw_headers: - k = h["name"].lower().replace("-", "_") - v = h["value"] - if k in result: - existing = result[k] - if isinstance(existing, list): - existing.append(v) - else: - result[k] = [existing, v] - else: - result[k] = v - return result - - -def format_recipients(lst: list) -> str: - return "; ".join( - f'{r["emailAddress"].get("name", "")} <{r["emailAddress"].get("address", "")}>'.strip() - for r in lst - ) - - -# ─── Hlavní extrakce ───────────────────────────────────────────────────────── - -def extract_message(msg: dict, folder_path: str) -> Optional[dict]: - try: - # _id - mid = (msg.get("internetMessageId") or "").strip() - if not mid: - mid = f"graphid:{msg['id']}" - - subject = msg.get("subject") or "" - norm_subject = normalize_subject(subject) - - # tělo - body_html = None - body_preview = msg.get("bodyPreview") or "" - body = msg.get("body", {}) - if body.get("contentType") == "html": - content = body.get("content") or "" - body_html = content if len(content) <= 2 * 1024 * 1024 else content[:2 * 1024 * 1024] - elif body.get("contentType") == "text": - body_preview = (body.get("content") or "")[:2000] - - # odesílatel - sender_ea = (msg.get("from") or msg.get("sender") or {}).get("emailAddress", {}) - sender_email = sender_ea.get("address", "") - sender_name = sender_ea.get("name", "") - - # příjemci - to_list = msg.get("toRecipients", []) - cc_list = msg.get("ccRecipients", []) - bcc_list = msg.get("bccRecipients", []) - - recipients = ( - [{"type": "to", "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in to_list] + - [{"type": "cc", "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in cc_list] + - [{"type": "bcc", "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in bcc_list] - ) - - # příznaky - importance = IMPORTANCE_MAP.get(msg.get("importance", "normal"), 1) - flag_status = FLAG_STATUS_MAP.get((msg.get("flag") or {}).get("flagStatus", "notFlagged"), 0) - - # internet headers - raw_headers = msg.get("internetMessageHeaders") or [] - headers = parse_headers(raw_headers) - - in_reply_to = headers.get("in_reply_to", "") - if isinstance(in_reply_to, list): - in_reply_to = in_reply_to[0] - - refs_raw = headers.get("references", "") - if isinstance(refs_raw, list): - refs_raw = " ".join(refs_raw) - internet_refs = [r.strip() for r in refs_raw.split() if r.strip()] if refs_raw else [] - - conv_topic = headers.get("thread_topic", "") - if isinstance(conv_topic, list): - conv_topic = conv_topic[0] - - # conversation index - conv_index = "" - ci_raw = msg.get("conversationIndex") - if ci_raw: - try: - conv_index = base64.b64encode(base64.b64decode(ci_raw)).decode() - except Exception: - conv_index = ci_raw - - # přílohy (jen metadata, bez obsahu) - attachments = [] - for att in msg.get("attachments") or []: - fname = att.get("name") or "" - if not fname: - continue - attachments.append({ - "filename": fname, - "size_bytes": att.get("size", 0), - "mime_type": att.get("contentType", "application/octet-stream"), - "content_id": att.get("contentId"), - "is_inline": att.get("isInline", False), - }) - - return { - "_id": mid, - "graph_id": msg["id"], - - "subject": subject, - "normalized_subject": norm_subject, - "importance": importance, - "flag_status": flag_status, - "is_read": msg.get("isRead", False), - "is_draft": msg.get("isDraft", False), - "has_attachments": msg.get("hasAttachments", False), - "attachment_count": len(attachments), - "inference_classification": msg.get("inferenceClassification", ""), - "categories": msg.get("categories") or [], - - "conversation_id": msg.get("conversationId", ""), - "conversation_index": conv_index, - "conversation_topic": conv_topic, - "in_reply_to": in_reply_to, - "internet_references": internet_refs, - - "received_at": parse_date(msg.get("receivedDateTime")), - "sent_at": parse_date(msg.get("sentDateTime")), - "created_at": parse_date(msg.get("createdDateTime")), - "modified_at": parse_date(msg.get("lastModifiedDateTime")), - - "folder_id": msg.get("parentFolderId", ""), - "folder_path": folder_path, - - "sender": { - "email": sender_email, - "name": sender_name, - }, - "to": format_recipients(to_list), - "cc": format_recipients(cc_list), - "bcc": format_recipients(bcc_list), - "recipients": recipients, - - "body_html": body_html, - "body_preview": body_preview, - - "attachments": attachments, - "headers": headers, - - "parsed_at": datetime.now(timezone.utc).replace(tzinfo=None), - } - - except Exception as e: - logging.error("extract_message failed [%s]: %s", msg.get("id", "?"), e) - return None - - -# ─── MongoDB indexy ─────────────────────────────────────────────────────────── - -def create_indexes(col): - print(" Vytvarim indexy...") - col.create_index([("received_at", ASCENDING)]) - col.create_index([("sent_at", ASCENDING)]) - col.create_index([("sender.email", ASCENDING)]) - col.create_index([("graph_id", ASCENDING)], unique=True, sparse=True) - col.create_index([("conversation_id", ASCENDING)]) - col.create_index([("folder_path", ASCENDING)]) - col.create_index([("has_attachments", ASCENDING)]) - col.create_index([("categories", ASCENDING)]) - col.create_index([("importance", ASCENDING)]) - col.create_index([("flag_status", ASCENDING)]) - col.create_index([("is_read", ASCENDING)]) - col.create_index([ - ("subject", TEXT), - ("body_preview", TEXT), - ("to", TEXT), - ("cc", TEXT), - ], name="text_search", default_language="none") - print(" Indexy hotovy.") - - -# ─── MAIN ───────────────────────────────────────────────────────────────────── - -def main(): - ap = argparse.ArgumentParser(description=f"parse_emails_graph v{SCRIPT_VERSION}") - ap.add_argument("--limit", type=int, default=0, - help="Zpracovat max N zprav (0 = vse)") - ap.add_argument("--skip-existing", action="store_true", - help="Preskocit zpravy ktere jiz jsou v MongoDB") - ap.add_argument("--folder", default="", - help="Zpracovat jen slozku se zadanym nazvem (napr. Inbox)") - ap.add_argument("--no-indexes", action="store_true", - help="Nevytvorit indexy na konci") - args = ap.parse_args() - - start = datetime.now() - print(f"=== parse_emails_graph v{SCRIPT_VERSION} ===") - print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}") - print(f"Schránka: {GRAPH_MAILBOX}") - print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{MONGO_COL}") - - # Graph token - print("\nPřipojuji se k Graph API...") - try: - get_token() - print(" Graph API OK") - except Exception as e: - print(f" CHYBA: {e}") - sys.exit(1) - - # MongoDB - client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000) - try: - client.admin.command("ping") - print(" MongoDB OK") - except Exception as e: - print(f" CHYBA: MongoDB neni dostupna -- {e}") - sys.exit(1) - col = client[MONGO_DB][MONGO_COL] - - # Skip existing - existing: set = set() - if args.skip_existing: - print(" Nacitam existujici zaznamy z MongoDB...") - existing = set(col.distinct("_id")) - print(f" {len(existing)} jiz importovano") - - # Slozky - print("\nNacitam seznam slozek...") - all_folders = get_all_folders() - if args.folder: - all_folders = [f for f in all_folders if args.folder.lower() in f["path"].lower()] - print(f" Slozek ke zpracovani: {len(all_folders)}") - for f in all_folders: - print(f" {f['path']}") - - # Import - batch = [] - ok_count = 0 - err_count = 0 - skip_count = 0 - total_i = 0 - - def flush(): - if not batch: - return - try: - col.bulk_write(batch, ordered=False) - except Exception as e: - logging.error("bulk_write: %s", e) - print(f" CHYBA bulk_write: {e}") - batch.clear() - - print() - for folder in all_folders: - print(f"--- Složka: {folder['path']} ---") - folder_count = 0 - - for msg in iter_folder_messages(folder["id"]): - if args.limit and total_i >= args.limit: - break - - mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}" - - if mid in existing: - skip_count += 1 - total_i += 1 - continue - - doc = extract_message(msg, folder["path"]) - total_i += 1 - folder_count += 1 - - if doc is None: - err_count += 1 - else: - batch.append(UpdateOne({"_id": doc["_id"]}, {"$set": doc}, upsert=True)) - ok_count += 1 - - if len(batch) >= BATCH_SIZE: - flush() - - status = "ERR " if doc is None else "OK " - subject_str = (doc.get("subject") or "")[:60] if doc else "?" - sender_str = (doc.get("sender", {}).get("email") or "")[:40] if doc else "?" - print(f" {total_i:>6} {status} {subject_str:<60} {sender_str}") - - if total_i % 500 == 0: - elapsed = (datetime.now() - start).total_seconds() - rate = total_i / elapsed if elapsed > 0 else 0 - print(f" {'─'*80}") - print(f" Průběh: ok={ok_count} skip={skip_count} err={err_count} {rate:.1f} msg/s") - print(f" {'─'*80}") - - flush() - print(f" → {folder_count} zprav ze slozky {folder['path']}") - - if args.limit and total_i >= args.limit: - break - - elapsed_total = (datetime.now() - start).total_seconds() - print(f"\n{'='*52}") - print(f"Vysledek: ok={ok_count} | skip={skip_count} | err={err_count}") - print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s") - print(f"Dokumentu v kolekci: {col.count_documents({})}") - - if not args.no_indexes: - print() - create_indexes(col) - - print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}") - if err_count: - print(f"Chyby logovany do: {LOG_FILE}") - - client.close() - - -if __name__ == "__main__": - main() diff --git a/Python-runner/python_runner.md b/Python-runner/python_runner.md index 00423ff..415e09d 100644 --- a/Python-runner/python_runner.md +++ b/Python-runner/python_runner.md @@ -39,53 +39,138 @@ c.close() ## Volume mounty -| Host (Unraid) | Kontejner | Popis | -|-----------------------|-------------------|------------------------------| -| `/mnt/user/Scripts` | `/scripts` | Skripty, logy — working dir | -| `/mnt/user/JNJEMAILS` | `/mnt/JNJEMAILS` | .msg soubory emailů (JNJ) | +| Host (Unraid) | Kontejner | Popis | +|-----------------------|-------------------|----------------------------------| +| `/mnt/user/Scripts` | `/scripts` | Skripty, logy — working dir | +| `/mnt/user/Emails` | `/mnt/Emails` | Stažené přílohy `/Attachments/` | ---- - -## Spouštění skriptů - -```bash -# Interaktivně (vidíš výstup): -docker exec -it python-runner python /scripts/parse_emails_tower_v1.1.py --limit 50 --no-indexes - -# Na pozadí (log do souboru): -docker exec -d python-runner bash -c \ - "python /scripts/parse_emails_tower_v1.1.py > /scripts/parse_emails.log 2>&1" - -# Pokračování po přerušení (skip hotových): -docker exec -d python-runner bash -c \ - "python /scripts/parse_emails_tower_v1.1.py --skip-existing > /scripts/parse_emails.log 2>&1" - -# Sledování průběhu: -docker exec -it python-runner tail -f /scripts/parse_emails.log -``` +> Skripty čtou emaily **přímo přes Microsoft Graph API** — lokální `.msg` soubory už nejsou potřeba. --- ## Aktuální skripty v /scripts -| Soubor | Popis | -|-------------------------------|------------------------------------------------| -| `parse_emails_tower_v1.1.py` | Import .msg → MongoDB (db: emaily, kolekce: vbuzalka@its.jnj.com) | -| `parse_emails_tower_v1.1.md` | Dokumentace ke skriptu | -| `parse_emails.log` | Log průběhu importu | -| `parse_emails_errors.log` | Log chyb (soubory které selhaly) | +| Soubor | Popis | +|---------------------------------|--------------------------------------------------------------| +| `parse_emails_graph_v1.3.py` | Import emailů ze schránky přes Graph API → MongoDB | +| `download_attachments_v1.3.py` | Stažení skutečných příloh emailů (Graph API) → `/mnt/Emails` | +| `python_runner.md` | Tato dokumentace | +| `parse_emails_errors.log` | Log chyb (soubory/zprávy které selhaly) | -Lokální protějšek: `EmailsImport/parse_emails_v1.0.py` — identický kód, liší se jen cestou -(`\\tower\JNJEMAILS` SMB vs. `/mnt/JNJEMAILS` lokální mount) a verzí hlavičky. +> **POZOR:** oba skripty pouze **čtou** ze schránky — žádný zápis do schránky. + +--- + +## Microsoft Graph API — konfigurace (v obou skriptech) + +| Parametr | Hodnota | +|-----------------|----------------------------------------| +| Graph URL | `https://graph.microsoft.com/v1.0` | +| Tenant ID | `7d269944-37a4-43a1-8140-c7517dc426e9` | +| Client ID | `4b222bfd-78c9-4239-a53f-43006b3ed07f` | +| Auth | client credentials (msal) | + +| MongoDB | Hodnota | +|-----------------|----------------------------------------| +| URI | `mongodb://192.168.1.76:27017` | +| DB | `emaily` | +| Kolekce emailů | `` (např. `ordinace@buzalkova.cz`) | +| Index příloh | `attachments_index` | + +--- + +## 1) parse_emails_graph_v1.3.py — import emailů → MongoDB + +Čte **všechny složky** schránky rekurzivně (Inbox, Sent, Deleted, archivy …) přes +Graph API a importuje každou zprávu jako dokument do MongoDB. `_id` = Internet +Message-ID (fallback `graphid:`). Upsert → bezpečné přerušit a opakovat. + +Z každé zprávy extrahuje: předmět, odesílatel, příjemci To/CC/BCC, časy (UTC), +HTML tělo (max 2 MB) + text preview, přílohy (metadata + `graph_att_id`), +internet headers (SPF/DKIM/Received/X-*), MAPI-ekvivalenty (důležitost, příznak, +konverzační vlákno, kategorie, In-Reply-To, References), `isRead`, `isDraft`, +`folder_path`, `inferenceClassification`. + +```bash +# První import (vše): +docker exec -it python-runner python /scripts/parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz + +# Test na 50 zprávách bez indexů: +docker exec -it python-runner python /scripts/parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes + +# Pravidelný sync na pozadí (log do souboru): +docker exec -d python-runner bash -c "python /scripts/parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --mode sync > /scripts/parse_emails.log 2>&1" +``` + +> **`-d` = detached:** příkaz se hned vrátí a skript běží dál v kontejneru i po +> zavření terminálu / odpojení SSH. Bez `-d` (resp. s `-it`) skript skončí ve chvíli, +> kdy se spojení zavře. Pro dlouhé běhy vždy pouštěj s `-d` a logem do souboru, +> průběh pak sleduj přes `tail -f` (viz [Sledování průběhu](#sledování-průběhu)). + +### Parametry + +| Parametr | Popis | +|---|---| +| `--mailbox` | **Povinný.** Schránka (e-mail), zároveň název kolekce v MongoDB. | +| `--mode` | `full` (výchozí — plný upsert), `new-only` (jen nové), `sync` (existující: aktualizuje `is_read`/`flag_status`/`categories`/`modified_at`/`folder_path`; nové importuje celé — ideální pro pravidelné spouštění). | +| `--folder` | Import jen jedné složky (např. `Inbox`). | +| `--limit N` | Zpracuje jen prvních N zpráv (test). | +| `--no-indexes` | Nevytváří indexy na konci. | + +--- + +## 2) download_attachments_v1.3.py — stažení příloh → /mnt/Emails + +Stahuje skutečné přílohy (`is_inline=False`) všech emailů z MongoDB přes Graph API +do `/mnt/Emails//Attachments/`. Primárně přes `graph_att_id` (přímé ID), +name-matching jako fallback pro staré emaily. + +Deduplikace podle **SHA256** obsahu: +- stejný hash → soubor už existuje → přeskočí +- kolize názvu (stejný název, jiný hash) → `faktura_2.pdf`, `faktura_3.pdf` … + +Po uložení aktualizuje MongoDB: každá příloha dostane `file_hash` + `local_path`; +kolekce `emaily.attachments_index` (`_id`=hash, filename, path, size_bytes, +mime_type, mailbox, first_seen_at, ref_count). Emaily kde mají všechny přílohy +`file_hash` se přeskočí → bezpečné opakovat. + +```bash +# Interaktivně (vidíš výstup, skončí zavřením terminálu): +docker exec -it python-runner python /scripts/download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz + +# Na pozadí (běží dál i po zavření terminálu, log do souboru): +docker exec -d python-runner bash -c "python /scripts/download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz > /scripts/download_attachments.log 2>&1" +``` + +> `-d` = detached — stejné chování jako u skriptu výše (viz poznámka v sekci 1). + +### Parametry + +| Parametr | Popis | +|---|---| +| `--mailbox` | **Povinný.** Schránka (e-mail) = kolekce v MongoDB. | +| `--limit N` | Zpracuje jen prvních N emailů (test). | +| `--force-recheck` | Znovu ověří i už stažené přílohy. | +| `--no-indexes` | Nevytváří indexy na konci. | + +--- + +## Sledování průběhu + +```bash +docker exec -it python-runner tail -f /scripts/parse_emails.log +``` --- ## Nainstalované Python balíčky ``` -extract-msg 0.55.0 +msal (Graph API auth) +requests pymongo 4.17.0 python-dateutil 2.9.0.post0 +extract-msg 0.55.0 cryptography 48.0.0 beautifulsoup4 4.13.5 oletools 0.60.2 @@ -112,11 +197,8 @@ docker exec python-runner pip install --- -## Logika parse_emails (oba skripty) +## Historie -- Čte všechny `.msg` soubory z MSGS_DIR -- Extrahuje: předmět, odesílatel, příjemci (To/CC/BCC), tělo (text+HTML), přílohy, internet headers, všechny raw MAPI properties -- Ukládá do MongoDB: `emaily` → `vbuzalka@its.jnj.com` -- `_id` = Internet Message-ID (nebo `filename:` jako fallback) -- Upsert → bezpečné opakování, `--skip-existing` pro pokračování -- Indexy: received_at, sent_at, sender.email, filename (unique), full-text (subject+body+to+cc) +| Datum | Změna | +|---|---| +| 2026-06-02 | Přechod z `.msg` souborů na Microsoft Graph API. Skript `parse_emails_tower_v1.1.py` (import lokálních `.msg`) nahrazen `parse_emails_graph_v1.3.py`; přidán `download_attachments_v1.3.py`. Staré verze v `Trash/`. | diff --git a/claude-memory/MEMORY.md b/claude-memory/MEMORY.md index 2dea2cb..ac65006 100644 --- a/claude-memory/MEMORY.md +++ b/claude-memory/MEMORY.md @@ -3,6 +3,8 @@ - [Pracovat v maintree](feedback_worktree.md) — vždy pracuj v `U:/janssen/`, ne ve worktree větvích - [Projekt Covance UCO3001](project_covance.md) — report vzorků studie 77242113UCO3001, skript `create_report.py`, zdroj + logika OK statusů - [EDC import do MongoDB](project_edc_mongo.md) — skript `medidata/edc_import.py`, import Data Listing + QueryDetails CSV do MongoDB (192.168.1.76), kolekce `queries` + `queries_snapshots` pro tracking vývoje queries v čase +- [IWRS notifikace v Mongo](project_iwrs_mongo.md) — parser `IWRS/Patients/parse_notifications_to_mongo.py` čte texty notifikací z MySQL a ukládá strukturovaná data do `studie.iwrs` (lot, expirace, clinical response, audit trail) - [Dropbox file transfer](project_dropbox_file_transfer.md) — přenos souborů z JNJ PC do Dropboxu přes msgreceiver kontejner na Unraidu - [Graph email import](project_graph_email_import.md) — import JNJ emailů do schránky vladimir.buzalka@buzalka.cz přes Graph API - [Memory sync přes Giteu](setup_memory_sync.md) — paměť je v `claude-memory/` v janssen repu, junction + git push synchronizuje mezi PC +- [Claude Code learning path](project_claude_learning.md) — Level 2 Intermediate, mezery: Skills/Subagenty/Hooks/Print mode, tutoriál v `claude-howto/`