notebook

2026-06-03 07:10:15 +02:00
parent 9ed9f97140
commit 681095d557
60 changed files with 215 additions and 2161 deletions
@@ -4,7 +4,7 @@ Kompletní pipeline pro Drugs:
  2. IP destruction             (per košík, přeskočí již existující soubory)
  3. Shipments report           (jeden soubor na studii, přepisuje)
  4. Shipment details           (per zásilka CZ, vždy přepisuje)
-  5. Import do MySQL
+  5. Import do MongoDB (studie.iwrs_shipments / iwrs_shipment_items / iwrs_inventory / iwrs_destruction)

 Spusť tento skript — zpracuje obě studie automaticky.
 """
@@ -14,12 +14,11 @@ import glob
 import re
 import datetime

-import numpy as np
+import sys
 import pandas as pd
 from playwright.sync_api import sync_playwright
-import mysql.connector

-import db_config
+import import_to_mongo as drugs_mongo

 BASE_URL = "https://janssen.4gclinical.com"
 EMAIL    = "vbuzalka@its.jnj.com"
@@ -42,357 +41,6 @@ SITES = {
 BASE_DIR = os.path.dirname(os.path.abspath(__file__))


-# ── type converters ──────────────────────────────────────────────────────────
-
-def _py(val):
-    if isinstance(val, np.generic):
-        return val.item()
-    return val
-
-def to_date(val):
-    val = _py(val)
-    if val is None:
-        return None
-    if isinstance(val, float) and (val != val):
-        return None
-    try:
-        if pd.isna(val):
-            return None
-    except (TypeError, ValueError):
-        pass
-    if isinstance(val, pd.Timestamp):
-        return None if pd.isna(val) else val.date()
-    if isinstance(val, datetime.datetime):
-        return val.date()
-    if isinstance(val, datetime.date):
-        return val
-    s = str(val).strip()
-    if not s or s.lower() in ("nat", "nan", "none", ""):
-        return None
-    for fmt in ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S"):
-        try:
-            return datetime.datetime.strptime(s, fmt).date()
-        except ValueError:
-            pass
-    return None
-
-def to_int(val):
-    val = _py(val)
-    try:
-        v = float(val)
-        return None if (v != v) else int(v)
-    except (TypeError, ValueError):
-        return None
-
-def to_str(val):
-    val = _py(val)
-    if val is None:
-        return None
-    if isinstance(val, float) and (val != val):
-        return None
-    s = str(val).strip()
-    return None if s.lower() in ("nan", "nat", "none", "") else s
-
-
-# ── DB helpers ───────────────────────────────────────────────────────────────
-
-def get_conn():
-    return mysql.connector.connect(
-        host=db_config.DB_HOST, port=db_config.DB_PORT,
-        user=db_config.DB_USER, password=db_config.DB_PASSWORD,
-        database=db_config.DB_NAME,
-    )
-
-def insert_import(cursor, study, source_label):
-    cursor.execute(
-        "INSERT INTO iwrs_import (study, imported_at, source_file, report_type) VALUES (%s, %s, %s, %s)",
-        (study, datetime.datetime.now(), source_label, "drugs"),
-    )
-    return cursor.lastrowid
-
-def basket_already_imported(cursor, study, basket_id):
-    cursor.execute(
-        "SELECT 1 FROM iwrs_destruction WHERE study=%s AND basket_id=%s LIMIT 1",
-        (study, str(basket_id)),
-    )
-    return cursor.fetchone() is not None
-
-
-# ── parsery ──────────────────────────────────────────────────────────────────
-
-def parse_shipments_report(study):
-    path = os.path.join(BASE_DIR, f"xls_shipments_{study}", f"shipments_report_{study}.xlsx")
-    if not os.path.exists(path):
-        print(f"  CHYBÍ: {path}")
-        return []
-    raw = pd.read_excel(path, header=None)
-    header_row = None
-    for i, row in raw.iterrows():
-        if "Shipment ID" in [str(v).strip() for v in row]:
-            header_row = i
-            break
-    if header_row is None:
-        return []
-    df = pd.read_excel(path, header=header_row).dropna(how="all")
-    df = df[df["Location"].astype(str).str.contains("Czech", na=False, case=False)]
-    col = df.columns.tolist()
-    rows = []
-    for _, r in df.iterrows():
-        rows.append({
-            "shipment_id":          to_str(r["Shipment ID"]),
-            "status":               to_str(r["IRT Shipment Status"]),
-            "type":                 to_str(r["Type"]),
-            "ship_from":            to_str(r["Shipment From"]),
-            "ship_to_site":         to_str(r["Ship To:"]),
-            "location":             to_str(r["Location"]),
-            "request_date":         to_date(r["Request Date"]),
-            "shipped_date":         to_date(r["Shipped Date"]),
-            "received_date":        to_date(r["Received Date"]) if "Received Date" in col else None,
-            "received_by":          to_str(r["Received by"]) if "Received by" in col else None,
-            "delivered_date_utc":   to_date(r["Delivered Date [UTC]"]) if "Delivered Date [UTC]" in col else None,
-            "delivery_recipient":   to_str(r["Delivery Recipient"]) if "Delivery Recipient" in col else None,
-            "delivery_details":     to_str(r["Delivery Details"]) if "Delivery Details" in col else None,
-            "cancelled_date":       to_date(r["Cancelled Date"]) if "Cancelled Date" in col else None,
-            "total_medication_ids": to_int(r["Total Medication IDs"]) if "Total Medication IDs" in col else None,
-            "tracking_no":          to_str(r["Tracking #"]) if "Tracking #" in col else None,
-            "shipping_category":    to_str(r["Shipping Category"]) if "Shipping Category" in col else None,
-            "expected_arrival":     to_date(r["Expected Arrival"]) if "Expected Arrival" in col else None,
-        })
-    return rows
-
-
-def parse_shipment_details(study):
-    detail_dir = os.path.join(BASE_DIR, f"xls_shipment_details_{study}")
-    files = sorted(glob.glob(os.path.join(detail_dir, "shipment_details_*.xlsx")))
-    rows = []
-    for path in files:
-        m = re.search(r"shipment_details_(.+)\.xlsx", os.path.basename(path))
-        shipment_id = m.group(1) if m else "UNKNOWN"
-        raw = pd.read_excel(path, header=None)
-        header_row = None
-        for i, row in raw.iterrows():
-            if "Medication ID" in [str(v).strip() for v in row]:
-                header_row = i
-                break
-        if header_row is None:
-            continue
-        df = pd.read_excel(path, header=header_row).dropna(how="all")
-        for _, r in df.iterrows():
-            med_desc = (to_str(r.get("Medication Description"))
-                        or to_str(r.get("Medication ID Description")))
-            med_type = (to_str(r.get("Medication type"))
-                        or to_str(r.get("Medication ID type")))
-            rows.append({
-                "shipment_id":             shipment_id,
-                "destination_location":    to_str(r.get("Destination Location")),
-                "shipment_status":         to_str(r.get("IRT Shipment Status")),
-                "shipment_type":           to_str(r.get("Type")),
-                "destination_site":        to_str(r.get("Destination Site")),
-                "investigator":            to_str(r.get("Investigator")),
-                "medication_description":  med_desc,
-                "medication_type":         med_type,
-                "medication_id":           to_str(r.get("Medication ID")),
-                "packaged_lot_no":         to_str(r.get("Packaged Lot number")),
-                "packaged_lot_description": to_str(r.get("Packaged Lot description")),
-                "container_id":            to_str(r.get("Container ID")),
-                "quantity":                to_int(r.get("Quantity of Medication IDs")),
-                "expiration_date":         to_date(r.get("Expiration Date")),
-                "item_status":             to_str(r.get("Status")),
-            })
-    return rows
-
-
-def parse_inventory(study):
-    inv_dir = os.path.join(BASE_DIR, f"xls_reports_{study}")
-    files = sorted(glob.glob(os.path.join(inv_dir, "onsite_inventory_detail_*.xlsx")))
-    rows = []
-    for path in files:
-        raw = pd.read_excel(path, header=None)
-        site = investigator = location = None
-        header_row = None
-        for i, row in raw.iterrows():
-            first = str(row.iloc[0]).strip() if pd.notna(row.iloc[0]) else ""
-            if first.startswith("Site:"):
-                site = first.replace("Site:", "").strip()
-            elif first.startswith("Investigator:"):
-                investigator = first.replace("Investigator:", "").strip()
-            elif first.startswith("Location:"):
-                location = first.replace("Location:", "").strip()
-            if first in ("Medication", "Medication ID") and header_row is None:
-                header_row = i
-        if header_row is None:
-            continue
-        df = pd.read_excel(path, header=header_row).dropna(how="all")
-        df = df.rename(columns={df.columns[0]: "medication_id"})
-        for _, r in df.iterrows():
-            rows.append({
-                "site":                    site,
-                "investigator":            investigator,
-                "location":                location,
-                "medication_id":           to_str(r["medication_id"]),
-                "packaged_lot_no":         to_str(r.get("Packaged Lot number")),
-                "original_expiration_date": to_date(r.get("Original Expiration Date when Packaged Lot was Added")),
-                "expiration_date":         to_date(r.get("Expiration date")),
-                "received_date":           to_date(r.get("Received Date")),
-                "receipt_user":            to_str(r.get("Shipment Receipt User")),
-                "subject_identifier":      to_str(r.get("Subject Identifier")),
-                "quantity_assigned":       to_int(r.get("Quantity Assigned")),
-                "irt_transaction":         to_str(r.get("IRT Transaction")),
-                "date_assigned":           to_date(r.get("Date Assigned")),
-                "assignment_user":         to_str(r.get("Assignment User")),
-                "dispensation_status":     to_str(r.get("Dispensation Status")),
-                "dispensing_date":         to_date(r.get("Dispensing date") or r.get("Dispensing Date")),
-                "quantity_dispensed":      to_int(r.get("Quantity Dispensed")),
-                "dispensing_user":         to_str(r.get("Dispensing User")),
-                "quantity_returned":       to_int(r.get("Quantity Returned")),
-                "date_returned":           to_date(r.get("Date Returned")),
-                "return_user":             to_str(r.get("Return User")),
-            })
-    return rows
-
-
-def parse_destruction_files(study):
-    dest_dir = os.path.join(BASE_DIR, f"xls_ip_destruction_{study}")
-    files = sorted(glob.glob(os.path.join(dest_dir, "ip_destruction_basket_*.xlsx")))
-    baskets = []
-    for path in files:
-        raw = pd.read_excel(path, header=None)
-        meta = {}
-        header_row = None
-        for i, row in raw.iterrows():
-            first = str(row.iloc[0]).strip() if pd.notna(row.iloc[0]) else ""
-            for key, attr in [
-                ("Investigator Name:", "investigator"),
-                ("Site ID:", "site_id"),
-                ("Location:", "location"),
-                ("Basket ID:", "basket_id"),
-                ("Drug Destruction Created Date:", "destruction_date"),
-            ]:
-                if first.startswith(key):
-                    meta[attr] = first.replace(key, "").strip()
-            if first == "Medication ID Description" and header_row is None:
-                header_row = i
-        if header_row is None:
-            continue
-        df = pd.read_excel(path, header=header_row).dropna(how="all")
-        items = []
-        for _, r in df.iterrows():
-            items.append({
-                "medication_description":   to_str(r.get("Medication ID Description")),
-                "medication_id":            to_str(r.get("Medication ID")),
-                "packaged_lot_description": to_str(r.get("Packaged Lot description")),
-                "comments":                 to_str(r.get("Comments")),
-            })
-        baskets.append({
-            "site_id":          meta.get("site_id"),
-            "investigator":     meta.get("investigator"),
-            "location":         meta.get("location"),
-            "basket_id":        meta.get("basket_id"),
-            "destruction_date": to_date(meta.get("destruction_date")),
-            "items":            items,
-        })
-    return baskets
-
-
-# ── insertery ────────────────────────────────────────────────────────────────
-
-def insert_shipments(cursor, import_id, study, rows):
-    sql = """INSERT INTO iwrs_shipments
-        (import_id, study, shipment_id, status, type, ship_from, ship_to_site,
-         location, request_date, shipped_date, received_date, received_by,
-         delivered_date_utc, delivery_recipient, delivery_details, cancelled_date,
-         total_medication_ids, tracking_no, shipping_category, expected_arrival)
-        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
-    for r in rows:
-        cursor.execute(sql, (
-            import_id, study, r["shipment_id"], r["status"], r["type"],
-            r["ship_from"], r["ship_to_site"], r["location"],
-            r["request_date"], r["shipped_date"], r["received_date"],
-            r["received_by"], r["delivered_date_utc"], r["delivery_recipient"],
-            r["delivery_details"], r["cancelled_date"], r["total_medication_ids"],
-            r["tracking_no"], r["shipping_category"], r["expected_arrival"],
-        ))
-
-
-def insert_shipment_items(cursor, import_id, study, rows):
-    sql = """INSERT INTO iwrs_shipment_items
-        (import_id, study, shipment_id, destination_location, shipment_status,
-         shipment_type, destination_site, investigator, medication_description,
-         medication_type, medication_id, packaged_lot_no, packaged_lot_description,
-         container_id, quantity, expiration_date, item_status)
-        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
-    for r in rows:
-        cursor.execute(sql, (
-            import_id, study, r["shipment_id"], r["destination_location"],
-            r["shipment_status"], r["shipment_type"], r["destination_site"],
-            r["investigator"], r["medication_description"], r["medication_type"],
-            r["medication_id"], r["packaged_lot_no"], r["packaged_lot_description"],
-            r["container_id"], r["quantity"], r["expiration_date"], r["item_status"],
-        ))
-
-
-def insert_inventory(cursor, import_id, study, rows):
-    sql = """INSERT INTO iwrs_inventory
-        (import_id, study, site, investigator, location, medication_id,
-         packaged_lot_no, original_expiration_date, expiration_date, received_date,
-         receipt_user, subject_identifier, quantity_assigned, irt_transaction,
-         date_assigned, assignment_user, dispensation_status, dispensing_date,
-         quantity_dispensed, dispensing_user, quantity_returned, date_returned, return_user)
-        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
-    for r in rows:
-        cursor.execute(sql, (
-            import_id, study, r["site"], r["investigator"], r["location"],
-            r["medication_id"], r["packaged_lot_no"], r["original_expiration_date"],
-            r["expiration_date"], r["received_date"], r["receipt_user"],
-            r["subject_identifier"], r["quantity_assigned"], r["irt_transaction"],
-            r["date_assigned"], r["assignment_user"], r["dispensation_status"],
-            r["dispensing_date"], r["quantity_dispensed"], r["dispensing_user"],
-            r["quantity_returned"], r["date_returned"], r["return_user"],
-        ))
-
-
-def insert_destruction(cursor, study, baskets):
-    sql = """INSERT IGNORE INTO iwrs_destruction
-        (study, site_id, investigator, location, basket_id, destruction_date,
-         medication_description, medication_id, packaged_lot_description, comments)
-        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
-    skipped = imported = 0
-    for b in baskets:
-        if basket_already_imported(cursor, study, b["basket_id"]):
-            skipped += 1
-            continue
-        for item in b["items"]:
-            cursor.execute(sql, (
-                study, b["site_id"], b["investigator"], b["location"],
-                b["basket_id"], b["destruction_date"],
-                item["medication_description"], item["medication_id"],
-                item["packaged_lot_description"], item["comments"],
-            ))
-            imported += 1
-    return imported, skipped
-
-
-def import_study(study):
-    print(f"\n  Parsování dat pro {study}...")
-    shipments = parse_shipments_report(study)
-    items     = parse_shipment_details(study)
-    inventory = parse_inventory(study)
-    baskets   = parse_destruction_files(study)
-    print(f"  Zásilky: {len(shipments)}  |  Položky: {len(items)}  |  Sklad: {len(inventory)}  |  Destrukce: {len(baskets)} košíků")
-
-    conn = get_conn()
-    cursor = conn.cursor()
-    import_id = insert_import(cursor, study, f"drugs_{study}")
-    print(f"  import_id = {import_id}")
-    insert_shipments(cursor, import_id, study, shipments)
-    insert_shipment_items(cursor, import_id, study, items)
-    insert_inventory(cursor, import_id, study, inventory)
-    dest_imported, dest_skipped = insert_destruction(cursor, study, baskets)
-    conn.commit()
-    cursor.close()
-    conn.close()
-    print(f"  Destrukce: {dest_imported} nových | {dest_skipped} košíků přeskočeno")
-

 # ── login ────────────────────────────────────────────────────────────────────

@@ -577,19 +225,17 @@ def main():
            finally:
                browser.close()

-    # ── Import do MySQL ───────────────────────────────────────────────────────
+    # ── Import do MongoDB ─────────────────────────────────────────────────────
    print(f"\n{'='*60}")
-    print("IMPORT DO MySQL")
+    print("IMPORT DO MongoDB")
    print(f"{'='*60}")

-    for study in STUDIES:
-        print(f"\n[{study}]")
-        try:
-            import_study(study)
-        except Exception as e:
-            import traceback
-            print(f"  CHYBA při importu: {e}")
-            traceback.print_exc()
+    try:
+        drugs_mongo.run(STUDIES)
+    except Exception as e:
+        import traceback
+        print(f"  CHYBA při importu: {e}")
+        traceback.print_exc()

    print(f"\n{'='*60}")
    print("Vše hotovo.")
@@ -156,38 +156,62 @@ def run(page, study):
    total_notif = 0
    for subject in subjects:
        filename = os.path.join(out_dir, f"{today} {study} {subject} Subject Detail.xlsx")
-        print(f"  [{subject}] Stahuji...")
-        input_field = page.locator('input[placeholder="search"], input[type="text"]').first
-        input_field.click()
-        input_field.fill(subject)
-        page.wait_for_timeout(500)

-        # Zachytíme table_1 response při výběru subjektu
-        if api_base:
+        success = False
+        table1_data = None
+        for attempt in range(1, 4):
            try:
-                with page.expect_response(
-                    lambda r: "report_data" in r.url and "table_1" in r.url,
-                    timeout=60000
-                ) as resp_info:
+                print(f"  [{subject}] Stahuji... (pokus {attempt}/3)")
+                input_field = page.locator('input[placeholder="search"], input[type="text"]').first
+                input_field.click()
+                input_field.fill(subject)
+                page.wait_for_timeout(500)
+
+                # Zachytíme table_1 response při výběru subjektu
+                if api_base:
+                    try:
+                        with page.expect_response(
+                            lambda r: "report_data" in r.url and "table_1" in r.url,
+                            timeout=60000
+                        ) as resp_info:
+                            page.locator("mat-option").first.dispatch_event("click")
+                        table1_data = resp_info.value.json()
+                    except Exception as e:
+                        print(f"  [{subject}] CHYBA zachycení table_1: {e}")
+                        page.locator("mat-option").first.dispatch_event("click")
+                        page.wait_for_load_state("networkidle", timeout=120000)
+                        table1_data = None
+                else:
                    page.locator("mat-option").first.dispatch_event("click")
-                table1_data = resp_info.value.json()
-            except Exception as e:
-                print(f"  [{subject}] CHYBA zachycení table_1: {e}")
-                page.locator("mat-option").first.dispatch_event("click")
+                    page.wait_for_load_state("networkidle", timeout=120000)
+                    table1_data = None
+
                page.wait_for_load_state("networkidle", timeout=120000)
-                table1_data = None
-        else:
-            page.locator("mat-option").first.dispatch_event("click")
-            page.wait_for_load_state("networkidle", timeout=120000)
-            table1_data = None
+                page.wait_for_timeout(2000)

-        page.wait_for_load_state("networkidle", timeout=120000)
-        page.wait_for_timeout(1000)
+                with page.expect_download(timeout=60000) as dl:
+                    page.get_by_role("button", name="Download XLS").click()
+                dl.value.save_as(filename)
+                print(f"  [{subject}] XLS OK")
+                success = True
+                break
+            except Exception as e:
+                print(f"  [{subject}] pokus {attempt} selhal: {e}")
+                if attempt < 3:
+                    try:
+                        page.goto(f"{BASE_URL}/report/patient_detail_report")
+                        page.wait_for_load_state("networkidle", timeout=120000)
+                    except Exception as ge:
+                        print(f"  [{subject}] refresh selhal: {ge}")

-        with page.expect_download(timeout=120000) as dl:
-            page.get_by_role("button", name="Download XLS").click()
-        dl.value.save_as(filename)
-        print(f"  [{subject}] XLS OK")
+        if not success:
+            print(f"  [{subject}] PŘESKAKUJI po 3 neúspěšných pokusech")
+            try:
+                page.goto(f"{BASE_URL}/report/patient_detail_report")
+                page.wait_for_load_state("networkidle", timeout=120000)
+            except Exception:
+                pass
+            continue

        # Stáhnout notifikace pro tohoto subjekta
        if api_base and table1_data:
@@ -196,8 +220,13 @@ def run(page, study):
            )
            total_notif += n

-        page.get_by_role("button", name="Clear").click()
-        page.wait_for_load_state("networkidle", timeout=120000)
+        try:
+            page.get_by_role("button", name="Clear").click()
+            page.wait_for_load_state("networkidle", timeout=120000)
+        except Exception as e:
+            print(f"  [{subject}] Clear selhal: {e} — refresh")
+            page.goto(f"{BASE_URL}/report/patient_detail_report")
+            page.wait_for_load_state("networkidle", timeout=120000)

    print(f"  [{study}] Subject details hotovo. Nových notifikací: {total_notif}")

@@ -2,23 +2,21 @@
 Kompletní pipeline:
  1. Stažení Subject Summary Reportů (obě studie)
  2. Stažení Subject Detail Reportů + notifikací (obě studie)
-  3. Import do MySQL (summary, visits, notifikace)
+  3. Import do MongoDB (subject_summary + visits + notifications)

 Spusť tento skript místo samostatných skriptů.
 """

 import os
+import sys
 import datetime
 import glob
-import re

 from playwright.sync_api import sync_playwright
-import numpy as np
-import pandas as pd

-import db_config
-import mysql.connector
 import download_subject_details as dsd
+import import_to_mongo
+import import_notifications_to_mongo

 # ── CONFIG ───────────────────────────────────────────────────────────────────
 BASE_URL = "https://janssen.4gclinical.com"
@@ -72,6 +70,7 @@ def download_summary(page, study, today):
 # ── KROK 2: Subject Details ───────────────────────────────────────────────────

 def get_subjects_from_summary(summary_path):
+    import pandas as pd
    raw = pd.read_excel(summary_path, header=None)
    header_row = None
    for i, row in raw.iterrows():
@@ -112,277 +111,7 @@ def download_details(page, study, summary_path, today):
        page.wait_for_load_state("networkidle", timeout=120000)


-# ── KROK 3: Import do MySQL ───────────────────────────────────────────────────
-
-def get_conn():
-    return mysql.connector.connect(
-        host=db_config.DB_HOST,
-        port=db_config.DB_PORT,
-        user=db_config.DB_USER,
-        password=db_config.DB_PASSWORD,
-        database=db_config.DB_NAME,
-    )
-
-
-def _py(val):
-    """Převede numpy skalár na Python nativní typ."""
-    if isinstance(val, np.generic):
-        return val.item()
-    return val
-
-
-def to_date(val):
-    val = _py(val)
-    if val is None or (isinstance(val, float) and (val != val)):
-        return None
-    try:
-        if pd.isna(val):
-            return None
-    except (TypeError, ValueError):
-        pass
-    if isinstance(val, pd.Timestamp):
-        return None if pd.isna(val) else val.date()
-    if isinstance(val, datetime.datetime):
-        return val.date()
-    if isinstance(val, datetime.date):
-        return val
-    s = str(val).strip()
-    if not s or s.lower() in ("nat", "nan", "none", ""):
-        return None
-    for fmt in ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S"):
-        try:
-            return datetime.datetime.strptime(s, fmt).date()
-        except ValueError:
-            pass
-    return None
-
-
-def to_int(val):
-    val = _py(val)
-    try:
-        v = float(val)
-        return None if (v != v) else int(v)
-    except (TypeError, ValueError):
-        return None
-
-
-def to_float(val):
-    val = _py(val)
-    try:
-        v = float(val)
-        return None if (v != v) else float(v)
-    except (TypeError, ValueError):
-        return None
-
-
-def to_str(val):
-    val = _py(val)
-    if val is None:
-        return None
-    if isinstance(val, float) and (val != val):
-        return None
-    s = str(val).strip()
-    return None if s.lower() in ("nan", "nat", "none", "") else s
-
-
-def read_summary_df(path):
-    raw = pd.read_excel(path, header=None)
-    header_row = None
-    for i, row in raw.iterrows():
-        if "Subject" in [str(v).strip() for v in row]:
-            header_row = i
-            break
-    if header_row is None:
-        raise ValueError(f"Hlavičkový řádek nenalezen v {path}")
-    return pd.read_excel(path, header=header_row).dropna(how="all")
-
-
-def parse_detail_visits(path):
-    df = pd.read_excel(path, sheet_name="patient_detail_report", header=None)
-    header_row = None
-    for i, row in df.iterrows():
-        if "Visit Type" in [str(v).strip() for v in row]:
-            header_row = i
-            break
-    if header_row is None:
-        return []
-    visits_df = df.iloc[header_row + 1:].copy()
-    visits_df.columns = range(visits_df.shape[1])
-    rows = []
-    for _, r in visits_df.iterrows():
-        visit_type = to_str(r.get(0))
-        if visit_type not in ("Past", "Upcoming"):
-            continue
-        rows.append({
-            "visit_type":                  visit_type,
-            "scheduled_date":              to_date(r.get(1)),
-            "window_days":                 to_str(r.get(2)),
-            "actual_date":                 to_date(r.get(3)),
-            "irt_transaction_no":          to_int(r.get(4)),
-            "irt_transaction_description": to_str(r.get(5)),
-            "medication_assignment":       to_str(r.get(6)),
-            "quantity_assigned":           to_int(r.get(7)),
-            "medication_id":               to_str(r.get(8)),
-        })
-    return rows
-
-
-def insert_import(cursor, study, source_file):
-    cursor.execute(
-        "INSERT INTO iwrs_import (study, imported_at, source_file) VALUES (%s, %s, %s)",
-        (study, datetime.datetime.now(), os.path.basename(source_file)),
-    )
-    return cursor.lastrowid
-
-
-def insert_uco3001_summary(cursor, import_id, df):
-    sql = """INSERT INTO iwrs_uco3001_subject_summary (
-        import_id, subject, prior_subject_identifier, site, investigator, location,
-        cohort_per_irt, informed_consent_date, adolescent_assent_date, age, weight,
-        rescreened_subject, adt_ir, three_or_more_advanced_therapies,
-        only_oral_5asa_compounds, ustekinumab, isolated_proctitis,
-        clinical_responder_status_i12_m0, irt_subject_status,
-        i0_rand_date_local, last_irt_transaction,
-        last_irt_transaction_date_local, last_irt_transaction_date_utc,
-        next_irt_transaction, next_irt_transaction_date_local,
-        most_recent_med_assignment_date, days_since_last_med_assignment,
-        patient_forecast_status, patient_forecast_status_changed_date
-    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
-    col = df.columns.tolist()
-    for _, r in df.iterrows():
-        cursor.execute(sql, (
-            import_id,
-            to_str(r["Subject"]),
-            to_str(r["Prior Subject Identifier"]) if "Prior Subject Identifier" in col else None,
-            to_str(r["Site"]),
-            to_str(r["Investigator"]),
-            to_str(r["Location"]),
-            to_str(r["Cohort per IRT"]),
-            to_date(r["Informed Consent Date"]),
-            to_date(r["Adolescent Assent Date"]) if "Adolescent Assent Date" in col else None,
-            to_int(r["Subject's age collection"]),
-            to_float(r["Subject's weight collection"]) if "Subject's weight collection" in col else None,
-            to_str(r["Rescreened Subject"]) if "Rescreened Subject" in col else None,
-            to_str(r["ADT-IR"]) if "ADT-IR" in col else None,
-            to_str(r["3 or More Advanced Therapies"]) if "3 or More Advanced Therapies" in col else None,
-            to_str(r["Only Oral 5-ASA Compounds"]) if "Only Oral 5-ASA Compounds" in col else None,
-            to_str(r["Ustekinumab"]) if "Ustekinumab" in col else None,
-            to_str(r["Isolated Proctitis"]) if "Isolated Proctitis" in col else None,
-            to_str(r["Clinical Responder Status at I-12 / M-0"]) if "Clinical Responder Status at I-12 / M-0" in col else None,
-            to_str(r["IRT Subject Status"]),
-            to_date(r["I0_RAND_TIMESTAMP_LOCAL [Local]"]) if "I0_RAND_TIMESTAMP_LOCAL [Local]" in col else None,
-            to_str(r["Last Recorded IRT Transaction"]),
-            to_date(r["Last Recorded IRT Transaction Date [Local]"]),
-            to_date(r["Last Recorded IRT Transaction Date (UTC)"]),
-            to_str(r["Next Expected IRT Transaction"]),
-            to_date(r["Next Expected IRT Transaction Date [Local]"]),
-            to_date(r["Most Recent Medication Assignment Transaction [Local]"]) if "Most Recent Medication Assignment Transaction [Local]" in col else None,
-            to_int(r["Days Since Last Medication Assignment Transaction"]) if "Days Since Last Medication Assignment Transaction" in col else None,
-            to_str(r["Patient Forecast Status"]) if "Patient Forecast Status" in col else None,
-            to_date(r["Patient Forecast Status Changed Date (UTC)"]) if "Patient Forecast Status Changed Date (UTC)" in col else None,
-        ))
-
-
-def insert_mdd3003_summary(cursor, import_id, df):
-    sql = """INSERT INTO iwrs_mdd3003_subject_summary (
-        import_id, subject, prior_subject_identifier, site, investigator, location,
-        cohort_per_irt, madrs_criteria_integrated, informed_consent_date, age,
-        madrs_criteria_v15, madrs_criteria_v16, madrs_criteria_v17,
-        stratification_country, age_group, stable_remitters, irt_subject_status,
-        last_irt_transaction, last_irt_transaction_date_local,
-        last_irt_transaction_date_utc, next_irt_transaction,
-        next_irt_transaction_date_local, date_screened, date_screen_failed,
-        date_randomized_part1, date_early_withdraw_randomized_part1,
-        date_open_label_induction, date_early_withdraw_open_label_induction,
-        date_randomized_part2, date_early_withdraw_randomized_part2,
-        date_completed, date_unblinded
-    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
-    col = df.columns.tolist()
-    for _, r in df.iterrows():
-        cursor.execute(sql, (
-            import_id,
-            to_str(r["Subject"]),
-            to_str(r["Prior Subject Identifier"]) if "Prior Subject Identifier" in col else None,
-            to_str(r["Site"]),
-            to_str(r["Investigator"]),
-            to_str(r["Location"]),
-            to_str(r["Cohort per IRT"]),
-            to_str(r["MADRS response criteria integrated or manually entered"]) if "MADRS response criteria integrated or manually entered" in col else None,
-            to_date(r["Informed Consent Date"]),
-            to_int(r["Subject's age collection"]),
-            to_str(r["MADRS response criteria v1.5 from RAVE"]) if "MADRS response criteria v1.5 from RAVE" in col else None,
-            to_str(r["MADRS response criteria v1.6 from RAVE"]) if "MADRS response criteria v1.6 from RAVE" in col else None,
-            to_str(r["MADRS response criteria v1.7 from RAVE"]) if "MADRS response criteria v1.7 from RAVE" in col else None,
-            to_str(r["Stratification Country"]) if "Stratification Country" in col else None,
-            to_str(r["Age Group"]) if "Age Group" in col else None,
-            to_str(r["Stable Remitters vs. Non Stable Remitters"]) if "Stable Remitters vs. Non Stable Remitters" in col else None,
-            to_str(r["IRT Subject Status"]),
-            to_str(r["Last Recorded IRT Transaction"]),
-            to_date(r["Last Recorded IRT Transaction Date [Local]"]),
-            to_date(r["Last Recorded IRT Transaction Date (UTC)"]),
-            to_str(r["Next Expected IRT Transaction"]),
-            to_date(r["Next Expected IRT Transaction Date [Local]"]),
-            to_date(r["Date Screened [Local]"]) if "Date Screened [Local]" in col else None,
-            to_date(r["Date Screen Failed [Local]"]) if "Date Screen Failed [Local]" in col else None,
-            to_date(r["Date Randomized Part 1 [Local]"]) if "Date Randomized Part 1 [Local]" in col else None,
-            to_date(r["Date Early Withdraw Randomized Part 1 [Local]"]) if "Date Early Withdraw Randomized Part 1 [Local]" in col else None,
-            to_date(r["Date Open Label Induction [Local]"]) if "Date Open Label Induction [Local]" in col else None,
-            to_date(r["Date Early Withdraw Open Label Induction [Local]"]) if "Date Early Withdraw Open Label Induction [Local]" in col else None,
-            to_date(r["Date Randomized Part 2 [Local]"]) if "Date Randomized Part 2 [Local]" in col else None,
-            to_date(r["Date Early Withdraw Randomized Part 2 [Local]"]) if "Date Early Withdraw Randomized Part 2 [Local]" in col else None,
-            to_date(r["Date Completed [Local]"]) if "Date Completed [Local]" in col else None,
-            to_date(r["Date Unblinded [Local]"]) if "Date Unblinded [Local]" in col else None,
-        ))
-
-
-def insert_visits(cursor, import_id, study, subject, visits):
-    if not visits:
-        return
-    sql = """INSERT INTO iwrs_subject_visits (
-        import_id, study, subject, visit_type, scheduled_date, window_days,
-        actual_date, irt_transaction_no, irt_transaction_description,
-        medication_assignment, quantity_assigned, medication_id
-    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
-    for v in visits:
-        cursor.execute(sql, (
-            import_id, study, subject,
-            v["visit_type"], v["scheduled_date"], v["window_days"],
-            v["actual_date"], v["irt_transaction_no"],
-            v["irt_transaction_description"], v["medication_assignment"],
-            v["quantity_assigned"], v["medication_id"],
-        ))
-
-
-def import_to_mysql(summary_path, detail_files, study):
-    print(f"\n  [MySQL] Importuji {study}...")
-    df_summary = read_summary_df(summary_path)
-    conn = get_conn()
-    cursor = conn.cursor()
-
-    import_id = insert_import(cursor, study, summary_path)
-
-    if study == "77242113UCO3001":
-        insert_uco3001_summary(cursor, import_id, df_summary)
-    else:
-        insert_mdd3003_summary(cursor, import_id, df_summary)
-
-    total_visits = 0
-    for path in detail_files:
-        fname = os.path.basename(path)
-        m = re.search(r"\d{4}-\d{2}-\d{2} \S+ (\S+) Subject Detail\.xlsx", fname)
-        subject = m.group(1) if m else "UNKNOWN"
-        visits = parse_detail_visits(path)
-        insert_visits(cursor, import_id, study, subject, visits)
-        total_visits += len(visits)
-
-    conn.commit()
-    cursor.close()
-    conn.close()
-    print(f"  [MySQL] import_id={import_id} | pacientů={len(df_summary)} | transakcí={total_visits}")
-    return import_id
-
-
-# ── MAIN ─────────────────────────────────────────────────────────────────────
+# ── KROK 3: Import do MongoDB ────────────────────────────────────────────────

 def main():
    today = datetime.date.today().strftime("%Y-%m-%d")
@@ -391,12 +120,12 @@ def main():

    summary_paths = {}

-    # ── Krok 1 + 2: stahování (Playwright, každá studie zvlášť kvůli session) ──
+    # Krok 1 + 2: stahování (Playwright, každá studie zvlášť kvůli session)
    with sync_playwright() as p:
        for study in STUDIES:
-            print(f"\n{'='*60}")
+            print("\n" + "=" * 60)
            print(f"[{study}] KROK 1: Subject Summary Report")
-            print(f"{'='*60}")
+            print("=" * 60)
            browser = p.chromium.launch(headless=False)
            context = browser.new_context(accept_downloads=True)
            page = context.new_page()
@@ -415,10 +144,10 @@ def main():
            finally:
                browser.close()

-    # ── Krok 3: import do MySQL ──────────────────────────────────────────────
-    print(f"\n{'='*60}")
-    print("KROK 3: Import do MySQL")
-    print(f"{'='*60}")
+    # Krok 3: import do MongoDB
+    print("\n" + "=" * 60)
+    print("KROK 3: Import do MongoDB")
+    print("=" * 60)

    for study in STUDIES:
        summary_path = summary_paths.get(study)
@@ -426,18 +155,21 @@ def main():
            print(f"  [{study}] PŘESKOČENO — stahování selhalo")
            continue

-        detail_files = sorted(glob.glob(
-            os.path.join(DETAILS_DIR, study, f"{today} {study} * Subject Detail.xlsx")
-        ))
-
        try:
-            import_to_mysql(summary_path, detail_files, study)
+            import_to_mongo.run(study, summary_path, DETAILS_DIR, today)
        except Exception as e:
-            print(f"  [{study}] CHYBA při importu: {e}")
+            print(f"  [{study}] CHYBA při importu summary/visits: {e}")

-    print(f"\n{'='*60}")
+    # Notifikace: PDF/JSON z disku rovnou do Mongo iwrs_notifications
+    print("\n  [notifikace] import PDF/JSON do Mongo...")
+    try:
+        import_notifications_to_mongo.main(STUDIES)
+    except Exception as e:
+        print(f"  CHYBA při importu notifikací: {e}")
+
+    print("\n" + "=" * 60)
    print("Vše hotovo.")
-    print(f"{'='*60}")
+    print("=" * 60)


 main()
@@ -1,449 +0,0 @@
-"""
-download_attachments_v1.0.py
-Nazev:  download_attachments_v1.0.py
-Verze:  1.0
-Datum:  2026-06-02
-Autor:  vladimir.buzalka
-
-Popis:
-    Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB kolekce
-    ordinace@buzalkova.cz primo pres Microsoft Graph API a uklada je do
-    adresare /mnt/Emails/ordinace@buzalkova.cz/Attachments/.
-
-    Deduplikace podle SHA256 hashe obsahu:
-        - stejny hash = soubor uz existuje -> preskoci
-        - prvni vyskytu souboru: ulozi pod puvodnimnazvem
-        - kolize nazvu (stejny nazev, jiny hash): faktura_2.pdf, faktura_3.pdf ...
-
-    Po ulozeni aktualizuje MongoDB:
-        - v email dokumentu: kazda priloha dostane file_hash + local_path
-        - kolekce emaily.attachments_index: _id=hash, filename, path, size_bytes,
-          mime_type, first_seen_at, ref_count (pocet emailu ktery ji obsahuje)
-
-    Bezpecne prerusit a opakovat:
-        - zpravy kde jsou vsechny prilohy uz stazene (maji file_hash) se preskoci
-        - --force-recheck znovu overi i uz stazene (pro pripad zmen na disku)
-
-    POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
-
-Spousteni:
-    python download_attachments_v1.0.py               # stahni vse co chybi
-    python download_attachments_v1.0.py --limit 50    # test na prvnich 50 emailech
-    python download_attachments_v1.0.py --force-recheck  # overi i uz stazene
-
-Docker (po pridani mountu /mnt/user/Emails -> /mnt/Emails):
-    docker exec -it python-runner python /scripts/download_attachments_v1.0.py
-
-Zavislosti:
-    msal, requests, pymongo, python-dateutil
-    Python 3.10+
-
-Struktura na disku:
-    /mnt/Emails/
-    └── ordinace@buzalkova.cz/
-        └── Attachments/
-            ├── faktura_2026.pdf
-            ├── vysledky_lab.pdf
-            ├── vysledky_lab_2.pdf   <- kolize nazvu, jiny obsah
-            └── ...
-
-Kolekce emaily.attachments_index:
-    _id          SHA256 hash (hex)
-    filename     nazev souboru na disku (prvni vyskytu)
-    local_path   relativni cesta od Attachments/ (zatim = filename)
-    size_bytes   velikost souboru
-    mime_type    MIME typ
-    first_seen_at  datetime UTC
-    ref_count    v kolika emailech se tato priloha vyskytuje
-
-Aktualizace v email dokumentu (kolekce ordinace@buzalkova.cz):
-    attachments[i].file_hash    SHA256 hash
-    attachments[i].local_path   cesta relativni od Attachments/
-
-Historie verzi:
-    1.0  2026-06-02  Inicialni verze
-"""
-
-import sys
-import hashlib
-import logging
-import argparse
-from pathlib import Path
-from datetime import datetime, timezone
-from typing import Optional
-
-import msal
-import requests
-from pymongo import MongoClient, UpdateOne
-
-if hasattr(sys.stdout, "reconfigure"):
-    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
-
-# ─── KONFIGURACE ──────────────────────────────────────────────────────────────
-GRAPH_TENANT_ID     = "7d269944-37a4-43a1-8140-c7517dc426e9"
-GRAPH_CLIENT_ID     = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
-GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
-GRAPH_MAILBOX       = "ordinace@buzalkova.cz"
-GRAPH_URL           = "https://graph.microsoft.com/v1.0"
-
-MONGO_URI           = "mongodb://192.168.1.76:27017"
-MONGO_DB            = "emaily"
-MONGO_COL_EMAILS    = "ordinace@buzalkova.cz"
-MONGO_COL_INDEX     = "attachments_index"
-
-ATTACHMENTS_DIR     = Path("/mnt/Emails/ordinace@buzalkova.cz/Attachments")
-LOG_FILE            = Path(__file__).parent / "parse_emails_errors.log"
-SCRIPT_VERSION      = "1.0"
-BATCH_SIZE          = 50
-# ──────────────────────────────────────────────────────────────────────────────
-
-logging.basicConfig(
-    filename=str(LOG_FILE),
-    level=logging.ERROR,
-    format="%(asctime)s | %(message)s",
-    datefmt="%Y-%m-%d %H:%M:%S",
-    encoding="utf-8",
-)
-
-_graph_token: Optional[str] = None
-
-
-# ─── Graph API ────────────────────────────────────────────────────────────────
-
-def get_token() -> str:
-    global _graph_token
-    app = msal.ConfidentialClientApplication(
-        GRAPH_CLIENT_ID,
-        authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}",
-        client_credential=GRAPH_CLIENT_SECRET,
-    )
-    result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
-    if "access_token" not in result:
-        raise RuntimeError(f"Graph auth failed: {result}")
-    _graph_token = result["access_token"]
-    return _graph_token
-
-
-def graph_get_bytes(url: str) -> bytes:
-    """Stahne binarni obsah prilohy."""
-    global _graph_token
-    if not _graph_token:
-        get_token()
-    for attempt in range(2):
-        r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, timeout=120, stream=True)
-        if r.status_code == 401:
-            get_token()
-            continue
-        r.raise_for_status()
-        return r.content
-    raise RuntimeError(f"Graph GET bytes failed: {url}")
-
-
-def graph_get_json(url: str, params: dict = None) -> dict:
-    global _graph_token
-    if not _graph_token:
-        get_token()
-    for attempt in range(2):
-        r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, params=params, timeout=30)
-        if r.status_code == 401:
-            get_token()
-            continue
-        r.raise_for_status()
-        return r.json()
-    raise RuntimeError(f"Graph GET json failed: {url}")
-
-
-def fetch_attachment_content(graph_message_id: str, attachment_id: str) -> Optional[bytes]:
-    """Stahne obsah prilohy pres Graph API."""
-    url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/messages/{graph_message_id}/attachments/{attachment_id}/$value"
-    try:
-        return graph_get_bytes(url)
-    except Exception as e:
-        logging.error("fetch_attachment_content failed [msg=%s att=%s]: %s", graph_message_id, attachment_id, e)
-        return None
-
-
-def fetch_message_attachments(graph_message_id: str) -> list[dict]:
-    """Nacte seznam priloh zpravy z Graph API (metadata vcetne attachment ID)."""
-    url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/messages/{graph_message_id}/attachments"
-    try:
-        data = graph_get_json(url, {"$select": "id,name,contentType,size,isInline,contentId"})
-        return data.get("value", [])
-    except Exception as e:
-        logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, e)
-        return []
-
-
-# ─── Dedup + ukládání ─────────────────────────────────────────────────────────
-
-def sha256(data: bytes) -> str:
-    return hashlib.sha256(data).hexdigest()
-
-
-def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, index_col) -> str:
-    """
-    Vrati nazev souboru ktery pouzit pro ulozeni.
-    Pokud desired_name jiz existuje s jinym hashem, prida suffix _2, _3 ...
-    """
-    # Zkontroluj jestli existujici soubor se stejnym nazvem ma stejny hash
-    existing = index_col.find_one({"filename": desired_name})
-    if existing:
-        if existing["_id"] == hash_val:
-            return desired_name  # Stejny hash, stejne jmeno — dedup hit
-        # Jiny hash — hledej volny suffix
-        stem   = Path(desired_name).stem
-        suffix = Path(desired_name).suffix
-        n = 2
-        while True:
-            candidate = f"{stem}_{n}{suffix}"
-            if not (att_dir / candidate).exists():
-                # Overi ze ani v indexu neni tento kandidat s jinym hashem
-                ex2 = index_col.find_one({"filename": candidate})
-                if not ex2 or ex2["_id"] == hash_val:
-                    return candidate
-            n += 1
-    return desired_name
-
-
-def save_attachment(content: bytes, original_name: str, att_dir: Path, index_col) -> tuple[str, str, bool]:
-    """
-    Ulozi prilohu s deduplikaci.
-    Vraci (hash, local_path, was_new):
-        was_new=True  -> soubor byl ulozen
-        was_new=False -> hash uz existoval, soubor preskocen
-    """
-    hash_val = sha256(content)
-
-    # Zkontroluj index — pokud hash uz existuje, vrat existujici zaznam
-    existing = index_col.find_one({"_id": hash_val})
-    if existing:
-        # Zvys pocitadlo referenci
-        index_col.update_one({"_id": hash_val}, {"$inc": {"ref_count": 1}})
-        return hash_val, existing["local_path"], False
-
-    # Novy soubor — urcit nazev
-    safe_name = "".join(c if c.isalnum() or c in "._- " else "_" for c in original_name).strip()
-    if not safe_name:
-        safe_name = f"attachment_{hash_val[:8]}"
-
-    filename  = resolve_filename(safe_name, att_dir, hash_val, index_col)
-    file_path = att_dir / filename
-
-    # Uloz soubor
-    file_path.write_bytes(content)
-
-    # Zaznamenej do indexu
-    index_col.insert_one({
-        "_id":          hash_val,
-        "filename":     filename,
-        "local_path":   filename,
-        "size_bytes":   len(content),
-        "mime_type":    "",
-        "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None),
-        "ref_count":    1,
-    })
-
-    return hash_val, filename, True
-
-
-# ─── MAIN ─────────────────────────────────────────────────────────────────────
-
-def main():
-    ap = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}")
-    ap.add_argument("--limit",         type=int, default=0,
-                    help="Zpracovat max N emailu (0 = vse)")
-    ap.add_argument("--force-recheck", action="store_true",
-                    help="Znovu overi i emaily kde prilohy uz maji file_hash")
-    ap.add_argument("--no-indexes",    action="store_true",
-                    help="Nevytvorit indexy na konci")
-    args = ap.parse_args()
-
-    start = datetime.now()
-    print(f"=== download_attachments v{SCRIPT_VERSION} ===")
-    print(f"Start:    {start.strftime('%Y-%m-%d %H:%M:%S')}")
-    print(f"Schránka: {GRAPH_MAILBOX}")
-    print(f"Cilovy adresar: {ATTACHMENTS_DIR}")
-    print(f"MongoDB:  {MONGO_URI} -> {MONGO_DB}")
-
-    # Adresar
-    ATTACHMENTS_DIR.mkdir(parents=True, exist_ok=True)
-    print(f"  Adresar OK")
-
-    # Graph
-    print("\nPřipojuji se k Graph API...")
-    try:
-        get_token()
-        print("  Graph API OK")
-    except Exception as e:
-        print(f"  CHYBA: {e}")
-        sys.exit(1)
-
-    # MongoDB
-    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
-    try:
-        client.admin.command("ping")
-        print("  MongoDB OK")
-    except Exception as e:
-        print(f"  CHYBA: MongoDB neni dostupna -- {e}")
-        sys.exit(1)
-
-    col_emails = client[MONGO_DB][MONGO_COL_EMAILS]
-    col_index  = client[MONGO_DB][MONGO_COL_INDEX]
-
-    # Indexy na attachment index kolekci
-    if not args.no_indexes:
-        col_index.create_index("filename")
-        col_index.create_index("mime_type")
-
-    # Dotaz — emaily s prilohou ktere jeste nebyly zpracovany
-    if args.force_recheck:
-        query = {"has_attachments": True}
-    else:
-        query = {
-            "has_attachments": True,
-            "attachments": {
-                "$elemMatch": {
-                    "is_inline": False,
-                    "file_hash":  {"$exists": False},
-                }
-            }
-        }
-
-    total = col_emails.count_documents(query)
-    print(f"\nEmailu ke zpracovani: {total}")
-    if total == 0:
-        print("Neni co stahnout.")
-        client.close()
-        return
-
-    cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1})
-    if args.limit:
-        cursor = cursor.limit(args.limit)
-
-    ok_count   = 0
-    new_count  = 0
-    skip_count = 0
-    err_count  = 0
-    email_i    = 0
-    batch      = []
-
-    def flush():
-        if not batch:
-            return
-        try:
-            col_emails.bulk_write(batch, ordered=False)
-        except Exception as e:
-            logging.error("bulk_write: %s", e)
-            print(f"  CHYBA bulk_write: {e}")
-        batch.clear()
-
-    for email_doc in cursor:
-        email_i += 1
-        email_id   = email_doc["_id"]
-        graph_id   = email_doc.get("graph_id", "")
-        subject    = (email_doc.get("subject") or "")[:60]
-        att_list   = email_doc.get("attachments") or []
-
-        # Jen skutecne prilohy
-        real_atts = [a for a in att_list if not a.get("is_inline", False)]
-        if not real_atts:
-            continue
-
-        print(f"\n  {email_i:>5}/{total}  {subject}")
-
-        # Nacti attachment IDs z Graph API
-        graph_atts = fetch_message_attachments(graph_id)
-        graph_att_map = {a["name"]: a for a in graph_atts if not a.get("isInline", False)}
-
-        updated_atts = list(att_list)
-        email_ok = True
-
-        for i, att in enumerate(updated_atts):
-            if att.get("is_inline", False):
-                continue
-            if not args.force_recheck and att.get("file_hash"):
-                skip_count += 1
-                print(f"         SKIP  {att['filename']}")
-                continue
-
-            att_name    = att.get("filename", "")
-            graph_att   = graph_att_map.get(att_name)
-
-            if not graph_att:
-                # Zkus najit podle casti nazvu
-                for gname, ga in graph_att_map.items():
-                    if att_name.lower() in gname.lower():
-                        graph_att = ga
-                        break
-
-            if not graph_att:
-                logging.error("attachment not found in Graph [email=%s att=%s]", email_id, att_name)
-                print(f"         ERR   {att_name} (nenalezeno v Graph)")
-                err_count += 1
-                email_ok = False
-                continue
-
-            # Stahni obsah
-            content = fetch_attachment_content(graph_id, graph_att["id"])
-            if content is None:
-                err_count += 1
-                email_ok = False
-                print(f"         ERR   {att_name} (stazeni selhalo)")
-                continue
-
-            # Uloz s dedupem
-            hash_val, local_path, was_new = save_attachment(content, att_name, ATTACHMENTS_DIR, col_index)
-
-            # Aktualizuj MIME typ v indexu
-            col_index.update_one(
-                {"_id": hash_val},
-                {"$set": {"mime_type": att.get("mime_type", graph_att.get("contentType", ""))}},
-            )
-
-            # Zaznamenej do emailu
-            updated_atts[i] = {**att, "file_hash": hash_val, "local_path": local_path}
-
-            if was_new:
-                new_count += 1
-                print(f"         NEW   {local_path}  ({len(content):,} B)")
-            else:
-                skip_count += 1
-                print(f"         DUP   {att_name} -> {local_path}")
-
-        if email_ok:
-            ok_count += 1
-
-        # Uloz aktualizovane prilohy zpet do emailu
-        batch.append(UpdateOne(
-            {"_id": email_id},
-            {"$set": {"attachments": updated_atts}}
-        ))
-
-        if len(batch) >= BATCH_SIZE:
-            flush()
-
-        if email_i % 100 == 0:
-            elapsed = (datetime.now() - start).total_seconds()
-            print(f"  {'─'*60}")
-            print(f"  Průběh: emaily={email_i}/{total}  nove={new_count}  dup={skip_count}  err={err_count}")
-            print(f"  {'─'*60}")
-
-    flush()
-
-    elapsed_total = (datetime.now() - start).total_seconds()
-    files_total   = col_index.count_documents({})
-    size_total    = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1}))
-
-    print(f"\n{'='*52}")
-    print(f"Vysledek:  emaily={ok_count}  |  nove soubory={new_count}  |  duplikaty={skip_count}  |  err={err_count}")
-    print(f"Souboru v indexu: {files_total}  ({size_total/1024/1024:.1f} MB)")
-    print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
-    print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-    if err_count:
-        print(f"Chyby logovany do: {LOG_FILE}")
-
-    client.close()
-
-
-if __name__ == "__main__":
-    main()
@@ -1,428 +0,0 @@
-"""
-download_attachments_v1.1.py
-Nazev:  download_attachments_v1.1.py
-Verze:  1.1
-Datum:  2026-06-02
-Autor:  vladimir.buzalka
-
-Popis:
-    Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB
-    pres Microsoft Graph API a uklada je do adresare
-    /mnt/Emails/<schránka>/Attachments/.
-
-    Schránka se predava jako povinny parametr --mailbox.
-
-    Deduplikace podle SHA256 hashe obsahu:
-        - stejny hash = soubor uz existuje -> preskoci
-        - prvni vyskytu souboru: ulozi pod puvodnimnazvem
-        - kolize nazvu (stejny nazev, jiny hash): faktura_2.pdf, faktura_3.pdf ...
-
-    Po ulozeni aktualizuje MongoDB:
-        - v email dokumentu: kazda priloha dostane file_hash + local_path
-        - kolekce emaily.attachments_index: _id=hash, filename, path, size_bytes,
-          mime_type, mailbox, first_seen_at, ref_count
-
-    Bezpecne prerusit a opakovat — emaily kde vsechny prilohy maji file_hash
-    se preskoci. --force-recheck znovu overi i uz stazene.
-
-    POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
-
-Spousteni:
-    python download_attachments_v1.1.py --mailbox ordinace@buzalkova.cz
-    python download_attachments_v1.1.py --mailbox vladimir.buzalka@buzalka.cz --limit 50
-    python download_attachments_v1.1.py --mailbox ordinace@buzalkova.cz --force-recheck
-
-Docker:
-    docker exec -it python-runner python /scripts/download_attachments_v1.1.py \\
-        --mailbox ordinace@buzalkova.cz
-
-Zavislosti:
-    msal, requests, pymongo
-    Python 3.10+
-
-Struktura na disku:
-    /mnt/Emails/
-    └── <mailbox>/
-        └── Attachments/
-            ├── faktura_2026.pdf
-            ├── vysledky_lab.pdf
-            ├── vysledky_lab_2.pdf
-            └── ...
-
-Kolekce emaily.attachments_index:
-    _id            SHA256 hash (hex)
-    filename       nazev souboru na disku
-    local_path     relativni cesta od Attachments/
-    size_bytes     velikost souboru
-    mime_type      MIME typ
-    mailbox        schránka ze ktere pochazi prvni vyskytu
-    first_seen_at  datetime UTC
-    ref_count      v kolika emailech se tato priloha vyskytuje
-
-Historie verzi:
-    1.0  2026-06-02  Inicialni verze
-    1.1  2026-06-02  Schránka jako parametr --mailbox (univerzalni pouziti)
-"""
-
-import sys
-import hashlib
-import logging
-import argparse
-from pathlib import Path
-from datetime import datetime, timezone
-from typing import Optional
-
-import msal
-import requests
-from pymongo import MongoClient, UpdateOne
-
-if hasattr(sys.stdout, "reconfigure"):
-    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
-
-# ─── KONFIGURACE ──────────────────────────────────────────────────────────────
-GRAPH_TENANT_ID     = "7d269944-37a4-43a1-8140-c7517dc426e9"
-GRAPH_CLIENT_ID     = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
-GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
-GRAPH_URL           = "https://graph.microsoft.com/v1.0"
-
-MONGO_URI           = "mongodb://192.168.1.76:27017"
-MONGO_DB            = "emaily"
-MONGO_COL_INDEX     = "attachments_index"
-
-EMAILS_BASE_DIR     = Path("/mnt/Emails")
-LOG_FILE            = Path(__file__).parent / "parse_emails_errors.log"
-SCRIPT_VERSION      = "1.1"
-BATCH_SIZE          = 50
-# ──────────────────────────────────────────────────────────────────────────────
-
-logging.basicConfig(
-    filename=str(LOG_FILE),
-    level=logging.ERROR,
-    format="%(asctime)s | %(message)s",
-    datefmt="%Y-%m-%d %H:%M:%S",
-    encoding="utf-8",
-)
-
-_graph_token: Optional[str] = None
-
-
-# ─── Graph API ────────────────────────────────────────────────────────────────
-
-def get_token() -> str:
-    global _graph_token
-    app = msal.ConfidentialClientApplication(
-        GRAPH_CLIENT_ID,
-        authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}",
-        client_credential=GRAPH_CLIENT_SECRET,
-    )
-    result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
-    if "access_token" not in result:
-        raise RuntimeError(f"Graph auth failed: {result}")
-    _graph_token = result["access_token"]
-    return _graph_token
-
-
-def graph_get_bytes(url: str) -> bytes:
-    global _graph_token
-    if not _graph_token:
-        get_token()
-    for attempt in range(2):
-        r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, timeout=120, stream=True)
-        if r.status_code == 401:
-            get_token()
-            continue
-        r.raise_for_status()
-        return r.content
-    raise RuntimeError(f"Graph GET bytes failed: {url}")
-
-
-def graph_get_json(url: str, params: dict = None) -> dict:
-    global _graph_token
-    if not _graph_token:
-        get_token()
-    for attempt in range(2):
-        r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, params=params, timeout=30)
-        if r.status_code == 401:
-            get_token()
-            continue
-        r.raise_for_status()
-        return r.json()
-    raise RuntimeError(f"Graph GET json failed: {url}")
-
-
-def fetch_message_attachments(mailbox: str, graph_message_id: str) -> list[dict]:
-    url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments"
-    try:
-        data = graph_get_json(url, {"$select": "id,name,contentType,size,isInline,contentId"})
-        return data.get("value", [])
-    except Exception as e:
-        logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, e)
-        return []
-
-
-def fetch_attachment_content(mailbox: str, graph_message_id: str, attachment_id: str) -> Optional[bytes]:
-    url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments/{attachment_id}/$value"
-    try:
-        return graph_get_bytes(url)
-    except Exception as e:
-        logging.error("fetch_attachment_content failed [msg=%s att=%s]: %s", graph_message_id, attachment_id, e)
-        return None
-
-
-# ─── Dedup + ukládání ─────────────────────────────────────────────────────────
-
-def sha256(data: bytes) -> str:
-    return hashlib.sha256(data).hexdigest()
-
-
-def safe_filename(name: str) -> str:
-    safe = "".join(c if c.isalnum() or c in "._- " else "_" for c in name).strip()
-    return safe or "attachment"
-
-
-def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, col_index) -> str:
-    """Vrati nazev souboru pro ulozeni — resi kolize (stejny nazev, jiny hash)."""
-    existing = col_index.find_one({"filename": desired_name})
-    if existing:
-        if existing["_id"] == hash_val:
-            return desired_name  # Dedup hit — stejny hash
-        # Kolize — hledej volny suffix
-        stem   = Path(desired_name).stem
-        suffix = Path(desired_name).suffix
-        n = 2
-        while True:
-            candidate = f"{stem}_{n}{suffix}"
-            ex2 = col_index.find_one({"filename": candidate})
-            if not ex2 or ex2["_id"] == hash_val:
-                if not (att_dir / candidate).exists() or (ex2 and ex2["_id"] == hash_val):
-                    return candidate
-            n += 1
-    return desired_name
-
-
-def save_attachment(
-    content: bytes,
-    original_name: str,
-    mime_type: str,
-    mailbox: str,
-    att_dir: Path,
-    col_index,
-) -> tuple[str, str, bool]:
-    """
-    Ulozi prilohu s deduplikaci.
-    Vraci (hash, local_path, was_new).
-    """
-    hash_val = sha256(content)
-
-    existing = col_index.find_one({"_id": hash_val})
-    if existing:
-        col_index.update_one({"_id": hash_val}, {"$inc": {"ref_count": 1}})
-        return hash_val, existing["local_path"], False
-
-    filename  = resolve_filename(safe_filename(original_name), att_dir, hash_val, col_index)
-    file_path = att_dir / filename
-    file_path.write_bytes(content)
-
-    col_index.insert_one({
-        "_id":          hash_val,
-        "filename":     filename,
-        "local_path":   filename,
-        "size_bytes":   len(content),
-        "mime_type":    mime_type,
-        "mailbox":      mailbox,
-        "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None),
-        "ref_count":    1,
-    })
-
-    return hash_val, filename, True
-
-
-# ─── MAIN ─────────────────────────────────────────────────────────────────────
-
-def main():
-    ap = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}")
-    ap.add_argument("--mailbox",       required=True,
-                    help="Emailova schranka (napr. ordinace@buzalkova.cz)")
-    ap.add_argument("--limit",         type=int, default=0,
-                    help="Zpracovat max N emailu (0 = vse)")
-    ap.add_argument("--force-recheck", action="store_true",
-                    help="Znovu overi i emaily kde prilohy uz maji file_hash")
-    ap.add_argument("--no-indexes",    action="store_true",
-                    help="Nevytvorit indexy na attachments_index kolekci")
-    args = ap.parse_args()
-
-    mailbox     = args.mailbox
-    att_dir     = EMAILS_BASE_DIR / mailbox / "Attachments"
-    mongo_col   = mailbox
-
-    start = datetime.now()
-    print(f"=== download_attachments v{SCRIPT_VERSION} ===")
-    print(f"Start:    {start.strftime('%Y-%m-%d %H:%M:%S')}")
-    print(f"Schránka: {mailbox}")
-    print(f"Cilovy adresar: {att_dir}")
-    print(f"MongoDB:  {MONGO_URI} -> {MONGO_DB}.{mongo_col}")
-
-    att_dir.mkdir(parents=True, exist_ok=True)
-    print("  Adresar OK")
-
-    print("\nPřipojuji se k Graph API...")
-    try:
-        get_token()
-        print("  Graph API OK")
-    except Exception as e:
-        print(f"  CHYBA: {e}")
-        sys.exit(1)
-
-    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
-    try:
-        client.admin.command("ping")
-        print("  MongoDB OK")
-    except Exception as e:
-        print(f"  CHYBA: MongoDB neni dostupna -- {e}")
-        sys.exit(1)
-
-    col_emails = client[MONGO_DB][mongo_col]
-    col_index  = client[MONGO_DB][MONGO_COL_INDEX]
-
-    if not args.no_indexes:
-        col_index.create_index("filename")
-        col_index.create_index("mime_type")
-        col_index.create_index("mailbox")
-
-    # Dotaz
-    if args.force_recheck:
-        query = {"has_attachments": True}
-    else:
-        query = {
-            "has_attachments": True,
-            "attachments": {
-                "$elemMatch": {
-                    "is_inline": False,
-                    "file_hash": {"$exists": False},
-                }
-            }
-        }
-
-    total = col_emails.count_documents(query)
-    print(f"\nEmailu ke zpracovani: {total}")
-    if total == 0:
-        print("Neni co stahnout.")
-        client.close()
-        return
-
-    cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1})
-    if args.limit:
-        cursor = cursor.limit(args.limit)
-
-    ok_count   = 0
-    new_count  = 0
-    dup_count  = 0
-    err_count  = 0
-    email_i    = 0
-    batch      = []
-
-    def flush():
-        if not batch:
-            return
-        try:
-            col_emails.bulk_write(batch, ordered=False)
-        except Exception as e:
-            logging.error("bulk_write: %s", e)
-            print(f"  CHYBA bulk_write: {e}")
-        batch.clear()
-
-    for email_doc in cursor:
-        email_i   += 1
-        email_id   = email_doc["_id"]
-        graph_id   = email_doc.get("graph_id", "")
-        subject    = (email_doc.get("subject") or "")[:60]
-        att_list   = email_doc.get("attachments") or []
-
-        real_atts = [a for a in att_list if not a.get("is_inline", False)]
-        if not real_atts:
-            continue
-
-        print(f"\n  {email_i:>5}/{total}  {subject}")
-
-        graph_atts    = fetch_message_attachments(mailbox, graph_id)
-        graph_att_map = {a["name"]: a for a in graph_atts if not a.get("isInline", False)}
-
-        updated_atts = list(att_list)
-        email_ok     = True
-
-        for i, att in enumerate(updated_atts):
-            if att.get("is_inline", False):
-                continue
-            if not args.force_recheck and att.get("file_hash"):
-                print(f"         SKIP  {att['filename']}")
-                continue
-
-            att_name  = att.get("filename", "")
-            graph_att = graph_att_map.get(att_name)
-            if not graph_att:
-                for gname, ga in graph_att_map.items():
-                    if att_name.lower() in gname.lower():
-                        graph_att = ga
-                        break
-
-            if not graph_att:
-                logging.error("attachment not found in Graph [email=%s att=%s]", email_id, att_name)
-                print(f"         ERR   {att_name} (nenalezeno v Graph)")
-                err_count += 1
-                email_ok = False
-                continue
-
-            content = fetch_attachment_content(mailbox, graph_id, graph_att["id"])
-            if content is None:
-                err_count += 1
-                email_ok = False
-                print(f"         ERR   {att_name} (stazeni selhalo)")
-                continue
-
-            mime_type = att.get("mime_type") or graph_att.get("contentType", "")
-            hash_val, local_path, was_new = save_attachment(
-                content, att_name, mime_type, mailbox, att_dir, col_index
-            )
-
-            updated_atts[i] = {**att, "file_hash": hash_val, "local_path": local_path}
-
-            if was_new:
-                new_count += 1
-                print(f"         NEW   {local_path}  ({len(content):,} B)")
-            else:
-                dup_count += 1
-                print(f"         DUP   {att_name} -> {local_path}")
-
-        if email_ok:
-            ok_count += 1
-
-        batch.append(UpdateOne({"_id": email_id}, {"$set": {"attachments": updated_atts}}))
-
-        if len(batch) >= BATCH_SIZE:
-            flush()
-
-        if email_i % 100 == 0:
-            elapsed = (datetime.now() - start).total_seconds()
-            print(f"  {'─'*60}")
-            print(f"  Průběh: emaily={email_i}/{total}  nove={new_count}  dup={dup_count}  err={err_count}")
-            print(f"  {'─'*60}")
-
-    flush()
-
-    elapsed_total = (datetime.now() - start).total_seconds()
-    files_total   = col_index.count_documents({})
-    size_total    = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1}))
-
-    print(f"\n{'='*52}")
-    print(f"Vysledek:  emaily={ok_count}  |  nove={new_count}  |  dup={dup_count}  |  err={err_count}")
-    print(f"Souboru v indexu: {files_total}  ({size_total / 1024 / 1024:.1f} MB)")
-    print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
-    print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-    if err_count:
-        print(f"Chyby logovany do: {LOG_FILE}")
-
-    client.close()
-
-
-if __name__ == "__main__":
-    main()
@@ -1,560 +0,0 @@
-"""
-parse_emails_graph_v1.0.py
-Nazev:  parse_emails_graph_v1.0.py
-Verze:  1.0
-Datum:  2026-06-02
-Autor:  vladimir.buzalka
-
-Popis:
-    Cte vsechny emaily ze schranky ordinace@buzalkova.cz primo pres
-    Microsoft Graph API a importuje je jako dokumenty do MongoDB.
-    Ze kazde zpravy extrahuje vsechny dostupne vlastnosti:
-
-        - predmet, odesilatel, prijemci (To/CC/BCC s typy)
-        - cas doruceni, odeslani, vytvoreni, modifikace (UTC)
-        - telo HTML (max 2 MB) + textovy preview
-        - prilohy (metadata: jmeno, velikost, MIME typ, inline flag)
-        - internet headers (SPF, DKIM, Received, X-*, ...)
-        - MAPI-ekvivalenty: dulezitost, priznak, konverzacni vlakno,
-          kategorie, In-Reply-To, References, ...
-        - navic: isRead, isDraft, folder_path, inferenceClassification
-
-    Prochazi VSECHNY slozky schranky rekurzivne (Inbox, Sent, Deleted,
-    archivni slozky, ...).
-
-    DB:       emaily
-    Kolekce:  ordinace@buzalkova.cz
-    _id:      Internet Message-ID (nebo "graphid:<id>" jako fallback)
-
-    Bezpecne prerusit a opakovat:
-        - upsert podle _id — duplicity se automaticky prepisi
-        - --skip-existing nacte seznam hotovych _id z MongoDB a preskoci je
-
-    POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
-
-Spousteni:
-    python parse_emails_graph_v1.0.py                    # kompletni import
-    python parse_emails_graph_v1.0.py --limit 50         # test na prvnich 50
-    python parse_emails_graph_v1.0.py --skip-existing    # pokracovani po preruseni
-    python parse_emails_graph_v1.0.py --folder Inbox     # jen jedna slozka
-    python parse_emails_graph_v1.0.py --no-indexes       # bez indexu na konci
-
-Zavislosti:
-    msal, requests, pymongo, python-dateutil
-    Python 3.10+
-
-Struktura dokumentu v MongoDB:
-    _id                     Internet Message-ID (nebo graphid: fallback)
-    graph_id                Graph API message ID (pro pripadne dalsi operace)
-    subject                 predmet zpravy
-    normalized_subject      predmet bez RE:/FW:/AW: prefixu
-    importance              0=nizka 1=normalni 2=vysoka
-    flag_status             0=bez priznaku 1=oznaceno 2=dokonceno
-    is_read                 bool — aktualni stav precteni ve schrance
-    is_draft                bool
-    has_attachments         bool
-    attachment_count        int
-    inference_classification focused / other (Outlook AI trideni)
-    categories              [str]
-    conversation_id         Graph conversationId
-    conversation_index      base64 conversationIndex
-    conversation_topic      tema vlakna (z internet headers Thread-Topic)
-    in_reply_to             Message-ID predchozi zpravy
-    internet_references     [Message-ID] — cela historia vlakna
-    received_at             datetime UTC
-    sent_at                 datetime UTC
-    created_at              datetime UTC — cas vytvoreni zaznamu v M365
-    modified_at             datetime UTC — cas posledni modifikace
-    folder_id               Graph parentFolderId
-    folder_path             cela cesta slozky (napr. Inbox/Subfolder)
-    sender.email            emailova adresa odesilatele
-    sender.name             zobrazovane jmeno odesilatele
-    to                      retezec To (joined)
-    cc                      retezec CC
-    bcc                     retezec BCC
-    recipients              [{type, email, name}] — to/cc/bcc s typy
-    body_html               HTML telo (max 2 MB)
-    body_preview            textovy nahled (max 255 znaku z Graph)
-    attachments             [{filename, size_bytes, mime_type,
-                              content_id, is_inline}]
-    headers                 dict internet headers (lowercase_s_podtrzitky)
-    parsed_at               datetime UTC — cas parsovani
-
-Indexy:
-    received_at, sent_at, sender.email, graph_id (unique),
-    conversation_id, folder_path, has_attachments, categories,
-    importance, flag_status, is_read,
-    text_search (subject + body_preview + to + cc)
-
-Historie verzi:
-    1.0  2026-06-02  Inicialni verze — Graph API jako zdroj
-"""
-
-import sys
-import re
-import logging
-import argparse
-import base64
-from pathlib import Path
-from datetime import datetime, timezone
-from typing import Optional
-
-import msal
-import requests
-from dateutil import parser as dtparser
-from pymongo import MongoClient, UpdateOne, ASCENDING, TEXT
-
-if hasattr(sys.stdout, "reconfigure"):
-    sys.stdout.reconfigure(encoding="utf-8", errors="replace")
-
-# ─── KONFIGURACE ──────────────────────────────────────────────────────────────
-GRAPH_TENANT_ID     = "7d269944-37a4-43a1-8140-c7517dc426e9"
-GRAPH_CLIENT_ID     = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
-GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
-GRAPH_MAILBOX       = "ordinace@buzalkova.cz"
-GRAPH_URL           = "https://graph.microsoft.com/v1.0"
-
-MONGO_URI      = "mongodb://192.168.1.76:27017"
-MONGO_DB       = "emaily"
-MONGO_COL      = "ordinace@buzalkova.cz"
-BATCH_SIZE     = 100
-PAGE_SIZE      = 50
-LOG_FILE       = Path(__file__).parent / "parse_emails_errors.log"
-SCRIPT_VERSION = "1.0"
-# ──────────────────────────────────────────────────────────────────────────────
-
-logging.basicConfig(
-    filename=str(LOG_FILE),
-    level=logging.ERROR,
-    format="%(asctime)s | %(message)s",
-    datefmt="%Y-%m-%d %H:%M:%S",
-    encoding="utf-8",
-)
-
-IMPORTANCE_MAP  = {"low": 0, "normal": 1, "high": 2}
-FLAG_STATUS_MAP = {"notFlagged": 0, "flagged": 1, "complete": 2}
-RE_SUBJECT      = re.compile(r"^(RE|FW|AW|SV|VS|TR|WG|odpov[eě]d[ťt]|fwd?)[:\s]+", re.IGNORECASE)
-
-MSG_SELECT = (
-    "id,internetMessageId,subject,bodyPreview,body,"
-    "importance,isRead,isDraft,hasAttachments,"
-    "receivedDateTime,sentDateTime,createdDateTime,lastModifiedDateTime,"
-    "sender,from,toRecipients,ccRecipients,bccRecipients,replyTo,"
-    "conversationId,conversationIndex,parentFolderId,"
-    "categories,flag,inferenceClassification,internetMessageHeaders"
-)
-
-
-# ─── Graph API helpers ────────────────────────────────────────────────────────
-
-_graph_token: Optional[str] = None
-
-
-def get_token() -> str:
-    global _graph_token
-    app = msal.ConfidentialClientApplication(
-        GRAPH_CLIENT_ID,
-        authority=f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}",
-        client_credential=GRAPH_CLIENT_SECRET,
-    )
-    result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
-    if "access_token" not in result:
-        raise RuntimeError(f"Graph auth failed: {result}")
-    _graph_token = result["access_token"]
-    return _graph_token
-
-
-def graph_get(url: str, params: dict = None) -> dict:
-    global _graph_token
-    if not _graph_token:
-        get_token()
-    for attempt in range(2):
-        r = requests.get(url, headers={"Authorization": f"Bearer {_graph_token}"}, params=params, timeout=30)
-        if r.status_code == 401:
-            get_token()
-            continue
-        r.raise_for_status()
-        return r.json()
-    raise RuntimeError(f"Graph GET failed after retry: {url}")
-
-
-def get_all_folders(parent_id: str = None, parent_path: str = "") -> list[dict]:
-    """Rekurzivne nacte vsechny slozky schranky. Vraci [{id, path}]."""
-    if parent_id is None:
-        url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders"
-    else:
-        url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{parent_id}/childFolders"
-
-    folders = []
-    params = {"$top": 100, "$select": "id,displayName,childFolderCount"}
-    while url:
-        data = graph_get(url, params)
-        for f in data.get("value", []):
-            path = f"{parent_path}/{f['displayName']}".lstrip("/")
-            folders.append({"id": f["id"], "path": path})
-            if f.get("childFolderCount", 0) > 0:
-                folders.extend(get_all_folders(f["id"], path))
-        url = data.get("@odata.nextLink")
-        params = None
-    return folders
-
-
-def iter_folder_messages(folder_id: str):
-    """Generator: vraci zpravy ze slozky po strankach."""
-    url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{folder_id}/messages"
-    params = {"$top": PAGE_SIZE, "$select": MSG_SELECT, "$expand": "attachments"}
-    while url:
-        data = graph_get(url, params)
-        for msg in data.get("value", []):
-            yield msg
-        url = data.get("@odata.nextLink")
-        params = None
-
-
-# ─── Pomocné funkce ───────────────────────────────────────────────────────────
-
-def parse_date(raw) -> Optional[datetime]:
-    if raw is None:
-        return None
-    if isinstance(raw, datetime):
-        if raw.tzinfo:
-            return raw.astimezone(timezone.utc).replace(tzinfo=None)
-        return raw
-    try:
-        dt = dtparser.parse(str(raw))
-        if dt.tzinfo:
-            return dt.astimezone(timezone.utc).replace(tzinfo=None)
-        return dt
-    except Exception:
-        return None
-
-
-def normalize_subject(subject: str) -> str:
-    s = subject.strip()
-    while True:
-        m = RE_SUBJECT.match(s)
-        if not m:
-            break
-        s = s[m.end():].strip()
-    return s
-
-
-def parse_headers(raw_headers: list) -> dict:
-    result = {}
-    for h in raw_headers:
-        k = h["name"].lower().replace("-", "_")
-        v = h["value"]
-        if k in result:
-            existing = result[k]
-            if isinstance(existing, list):
-                existing.append(v)
-            else:
-                result[k] = [existing, v]
-        else:
-            result[k] = v
-    return result
-
-
-def format_recipients(lst: list) -> str:
-    return "; ".join(
-        f'{r["emailAddress"].get("name", "")} <{r["emailAddress"].get("address", "")}>'.strip()
-        for r in lst
-    )
-
-
-# ─── Hlavní extrakce ─────────────────────────────────────────────────────────
-
-def extract_message(msg: dict, folder_path: str) -> Optional[dict]:
-    try:
-        # _id
-        mid = (msg.get("internetMessageId") or "").strip()
-        if not mid:
-            mid = f"graphid:{msg['id']}"
-
-        subject = msg.get("subject") or ""
-        norm_subject = normalize_subject(subject)
-
-        # tělo
-        body_html = None
-        body_preview = msg.get("bodyPreview") or ""
-        body = msg.get("body", {})
-        if body.get("contentType") == "html":
-            content = body.get("content") or ""
-            body_html = content if len(content) <= 2 * 1024 * 1024 else content[:2 * 1024 * 1024]
-        elif body.get("contentType") == "text":
-            body_preview = (body.get("content") or "")[:2000]
-
-        # odesílatel
-        sender_ea = (msg.get("from") or msg.get("sender") or {}).get("emailAddress", {})
-        sender_email = sender_ea.get("address", "")
-        sender_name  = sender_ea.get("name", "")
-
-        # příjemci
-        to_list  = msg.get("toRecipients", [])
-        cc_list  = msg.get("ccRecipients", [])
-        bcc_list = msg.get("bccRecipients", [])
-
-        recipients = (
-            [{"type": "to",  "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in to_list] +
-            [{"type": "cc",  "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in cc_list] +
-            [{"type": "bcc", "email": r["emailAddress"].get("address",""), "name": r["emailAddress"].get("name","")} for r in bcc_list]
-        )
-
-        # příznaky
-        importance  = IMPORTANCE_MAP.get(msg.get("importance", "normal"), 1)
-        flag_status = FLAG_STATUS_MAP.get((msg.get("flag") or {}).get("flagStatus", "notFlagged"), 0)
-
-        # internet headers
-        raw_headers = msg.get("internetMessageHeaders") or []
-        headers = parse_headers(raw_headers)
-
-        in_reply_to = headers.get("in_reply_to", "")
-        if isinstance(in_reply_to, list):
-            in_reply_to = in_reply_to[0]
-
-        refs_raw = headers.get("references", "")
-        if isinstance(refs_raw, list):
-            refs_raw = " ".join(refs_raw)
-        internet_refs = [r.strip() for r in refs_raw.split() if r.strip()] if refs_raw else []
-
-        conv_topic = headers.get("thread_topic", "")
-        if isinstance(conv_topic, list):
-            conv_topic = conv_topic[0]
-
-        # conversation index
-        conv_index = ""
-        ci_raw = msg.get("conversationIndex")
-        if ci_raw:
-            try:
-                conv_index = base64.b64encode(base64.b64decode(ci_raw)).decode()
-            except Exception:
-                conv_index = ci_raw
-
-        # přílohy (jen metadata, bez obsahu)
-        attachments = []
-        for att in msg.get("attachments") or []:
-            fname = att.get("name") or ""
-            if not fname:
-                continue
-            attachments.append({
-                "filename":   fname,
-                "size_bytes": att.get("size", 0),
-                "mime_type":  att.get("contentType", "application/octet-stream"),
-                "content_id": att.get("contentId"),
-                "is_inline":  att.get("isInline", False),
-            })
-
-        return {
-            "_id":     mid,
-            "graph_id": msg["id"],
-
-            "subject":            subject,
-            "normalized_subject": norm_subject,
-            "importance":         importance,
-            "flag_status":        flag_status,
-            "is_read":            msg.get("isRead", False),
-            "is_draft":           msg.get("isDraft", False),
-            "has_attachments":    msg.get("hasAttachments", False),
-            "attachment_count":   len(attachments),
-            "inference_classification": msg.get("inferenceClassification", ""),
-            "categories":         msg.get("categories") or [],
-
-            "conversation_id":    msg.get("conversationId", ""),
-            "conversation_index": conv_index,
-            "conversation_topic": conv_topic,
-            "in_reply_to":        in_reply_to,
-            "internet_references": internet_refs,
-
-            "received_at": parse_date(msg.get("receivedDateTime")),
-            "sent_at":     parse_date(msg.get("sentDateTime")),
-            "created_at":  parse_date(msg.get("createdDateTime")),
-            "modified_at": parse_date(msg.get("lastModifiedDateTime")),
-
-            "folder_id":   msg.get("parentFolderId", ""),
-            "folder_path": folder_path,
-
-            "sender": {
-                "email": sender_email,
-                "name":  sender_name,
-            },
-            "to":         format_recipients(to_list),
-            "cc":         format_recipients(cc_list),
-            "bcc":        format_recipients(bcc_list),
-            "recipients": recipients,
-
-            "body_html":    body_html,
-            "body_preview": body_preview,
-
-            "attachments": attachments,
-            "headers":     headers,
-
-            "parsed_at": datetime.now(timezone.utc).replace(tzinfo=None),
-        }
-
-    except Exception as e:
-        logging.error("extract_message failed [%s]: %s", msg.get("id", "?"), e)
-        return None
-
-
-# ─── MongoDB indexy ───────────────────────────────────────────────────────────
-
-def create_indexes(col):
-    print("  Vytvarim indexy...")
-    col.create_index([("received_at",    ASCENDING)])
-    col.create_index([("sent_at",        ASCENDING)])
-    col.create_index([("sender.email",   ASCENDING)])
-    col.create_index([("graph_id",       ASCENDING)], unique=True, sparse=True)
-    col.create_index([("conversation_id", ASCENDING)])
-    col.create_index([("folder_path",    ASCENDING)])
-    col.create_index([("has_attachments", ASCENDING)])
-    col.create_index([("categories",     ASCENDING)])
-    col.create_index([("importance",     ASCENDING)])
-    col.create_index([("flag_status",    ASCENDING)])
-    col.create_index([("is_read",        ASCENDING)])
-    col.create_index([
-        ("subject",       TEXT),
-        ("body_preview",  TEXT),
-        ("to",            TEXT),
-        ("cc",            TEXT),
-    ], name="text_search", default_language="none")
-    print("  Indexy hotovy.")
-
-
-# ─── MAIN ─────────────────────────────────────────────────────────────────────
-
-def main():
-    ap = argparse.ArgumentParser(description=f"parse_emails_graph v{SCRIPT_VERSION}")
-    ap.add_argument("--limit",         type=int, default=0,
-                    help="Zpracovat max N zprav (0 = vse)")
-    ap.add_argument("--skip-existing", action="store_true",
-                    help="Preskocit zpravy ktere jiz jsou v MongoDB")
-    ap.add_argument("--folder",        default="",
-                    help="Zpracovat jen slozku se zadanym nazvem (napr. Inbox)")
-    ap.add_argument("--no-indexes",    action="store_true",
-                    help="Nevytvorit indexy na konci")
-    args = ap.parse_args()
-
-    start = datetime.now()
-    print(f"=== parse_emails_graph v{SCRIPT_VERSION} ===")
-    print(f"Start:    {start.strftime('%Y-%m-%d %H:%M:%S')}")
-    print(f"Schránka: {GRAPH_MAILBOX}")
-    print(f"MongoDB:  {MONGO_URI} -> {MONGO_DB}.{MONGO_COL}")
-
-    # Graph token
-    print("\nPřipojuji se k Graph API...")
-    try:
-        get_token()
-        print("  Graph API OK")
-    except Exception as e:
-        print(f"  CHYBA: {e}")
-        sys.exit(1)
-
-    # MongoDB
-    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
-    try:
-        client.admin.command("ping")
-        print("  MongoDB OK")
-    except Exception as e:
-        print(f"  CHYBA: MongoDB neni dostupna -- {e}")
-        sys.exit(1)
-    col = client[MONGO_DB][MONGO_COL]
-
-    # Skip existing
-    existing: set = set()
-    if args.skip_existing:
-        print("  Nacitam existujici zaznamy z MongoDB...")
-        existing = set(col.distinct("_id"))
-        print(f"  {len(existing)} jiz importovano")
-
-    # Slozky
-    print("\nNacitam seznam slozek...")
-    all_folders = get_all_folders()
-    if args.folder:
-        all_folders = [f for f in all_folders if args.folder.lower() in f["path"].lower()]
-    print(f"  Slozek ke zpracovani: {len(all_folders)}")
-    for f in all_folders:
-        print(f"    {f['path']}")
-
-    # Import
-    batch     = []
-    ok_count  = 0
-    err_count = 0
-    skip_count = 0
-    total_i   = 0
-
-    def flush():
-        if not batch:
-            return
-        try:
-            col.bulk_write(batch, ordered=False)
-        except Exception as e:
-            logging.error("bulk_write: %s", e)
-            print(f"  CHYBA bulk_write: {e}")
-        batch.clear()
-
-    print()
-    for folder in all_folders:
-        print(f"--- Složka: {folder['path']} ---")
-        folder_count = 0
-
-        for msg in iter_folder_messages(folder["id"]):
-            if args.limit and total_i >= args.limit:
-                break
-
-            mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}"
-
-            if mid in existing:
-                skip_count += 1
-                total_i += 1
-                continue
-
-            doc = extract_message(msg, folder["path"])
-            total_i += 1
-            folder_count += 1
-
-            if doc is None:
-                err_count += 1
-            else:
-                batch.append(UpdateOne({"_id": doc["_id"]}, {"$set": doc}, upsert=True))
-                ok_count += 1
-
-            if len(batch) >= BATCH_SIZE:
-                flush()
-
-            status      = "ERR " if doc is None else "OK  "
-            subject_str = (doc.get("subject") or "")[:60] if doc else "?"
-            sender_str  = (doc.get("sender", {}).get("email") or "")[:40] if doc else "?"
-            print(f"  {total_i:>6}  {status}  {subject_str:<60}  {sender_str}")
-
-            if total_i % 500 == 0:
-                elapsed = (datetime.now() - start).total_seconds()
-                rate    = total_i / elapsed if elapsed > 0 else 0
-                print(f"  {'─'*80}")
-                print(f"  Průběh: ok={ok_count}  skip={skip_count}  err={err_count}  {rate:.1f} msg/s")
-                print(f"  {'─'*80}")
-
-        flush()
-        print(f"  → {folder_count} zprav ze slozky {folder['path']}")
-
-        if args.limit and total_i >= args.limit:
-            break
-
-    elapsed_total = (datetime.now() - start).total_seconds()
-    print(f"\n{'='*52}")
-    print(f"Vysledek:  ok={ok_count}  |  skip={skip_count}  |  err={err_count}")
-    print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
-    print(f"Dokumentu v kolekci: {col.count_documents({})}")
-
-    if not args.no_indexes:
-        print()
-        create_indexes(col)
-
-    print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-    if err_count:
-        print(f"Chyby logovany do: {LOG_FILE}")
-
-    client.close()
-
-
-if __name__ == "__main__":
-    main()
@@ -39,53 +39,138 @@ c.close()

 ## Volume mounty

-| Host (Unraid)         | Kontejner         | Popis                        |
-|-----------------------|-------------------|------------------------------|
-| `/mnt/user/Scripts`   | `/scripts`        | Skripty, logy — working dir  |
-| `/mnt/user/JNJEMAILS` | `/mnt/JNJEMAILS`  | .msg soubory emailů (JNJ)    |
+| Host (Unraid)         | Kontejner         | Popis                            |
+|-----------------------|-------------------|----------------------------------|
+| `/mnt/user/Scripts`   | `/scripts`        | Skripty, logy — working dir      |
+| `/mnt/user/Emails`    | `/mnt/Emails`     | Stažené přílohy `<schránka>/Attachments/` |

---
-
-## Spouštění skriptů
-
-```bash
-# Interaktivně (vidíš výstup):
-docker exec -it python-runner python /scripts/parse_emails_tower_v1.1.py --limit 50 --no-indexes
-
-# Na pozadí (log do souboru):
-docker exec -d python-runner bash -c \
-  "python /scripts/parse_emails_tower_v1.1.py > /scripts/parse_emails.log 2>&1"
-
-# Pokračování po přerušení (skip hotových):
-docker exec -d python-runner bash -c \
-  "python /scripts/parse_emails_tower_v1.1.py --skip-existing > /scripts/parse_emails.log 2>&1"
-
-# Sledování průběhu:
-docker exec -it python-runner tail -f /scripts/parse_emails.log
-```
+> Skripty čtou emaily **přímo přes Microsoft Graph API** — lokální `.msg` soubory už nejsou potřeba.

 ---

 ## Aktuální skripty v /scripts

-| Soubor                        | Popis                                          |
-|-------------------------------|------------------------------------------------|
-| `parse_emails_tower_v1.1.py`  | Import .msg → MongoDB (db: emaily, kolekce: vbuzalka@its.jnj.com) |
-| `parse_emails_tower_v1.1.md`  | Dokumentace ke skriptu                         |
-| `parse_emails.log`            | Log průběhu importu                            |
-| `parse_emails_errors.log`     | Log chyb (soubory které selhaly)               |
+| Soubor                          | Popis                                                        |
+|---------------------------------|--------------------------------------------------------------|
+| `parse_emails_graph_v1.3.py`    | Import emailů ze schránky přes Graph API → MongoDB           |
+| `download_attachments_v1.3.py`  | Stažení skutečných příloh emailů (Graph API) → `/mnt/Emails` |
+| `python_runner.md`              | Tato dokumentace                                             |
+| `parse_emails_errors.log`       | Log chyb (soubory/zprávy, které selhaly)                     |

-Lokální protějšek: `EmailsImport/parse_emails_v1.0.py` — identický kód, liší se jen cestou
-(`\\tower\JNJEMAILS` SMB vs. `/mnt/JNJEMAILS` lokální mount) a verzí hlavičky.
+> **POZOR:** oba skripty pouze **čtou** ze schránky — žádný zápis do schránky.
+
+---
+
+## Microsoft Graph API — konfigurace (v obou skriptech)
+
+| Parametr        | Hodnota                                |
+|-----------------|----------------------------------------|
+| Graph URL       | `https://graph.microsoft.com/v1.0`     |
+| Tenant ID       | `7d269944-37a4-43a1-8140-c7517dc426e9` |
+| Client ID       | `4b222bfd-78c9-4239-a53f-43006b3ed07f` |
+| Auth            | client credentials (msal)              |
+
+| MongoDB         | Hodnota                                |
+|-----------------|----------------------------------------|
+| URI             | `mongodb://192.168.1.76:27017`         |
+| DB              | `emaily`                               |
+| Kolekce emailů  | `<mailbox>` (např. `ordinace@buzalkova.cz`) |
+| Index příloh    | `attachments_index`                    |
+
+---
+
+## 1) parse_emails_graph_v1.3.py — import emailů → MongoDB
+
+Čte **všechny složky** schránky rekurzivně (Inbox, Sent, Deleted, archivy …) přes
+Graph API a importuje každou zprávu jako dokument do MongoDB. `_id` = Internet
+Message-ID (fallback `graphid:<id>`). Upsert → bezpečné přerušit a opakovat.
+
+Z každé zprávy extrahuje: předmět, odesílatel, příjemci To/CC/BCC, časy (UTC),
+HTML tělo (max 2 MB) + text preview, přílohy (metadata + `graph_att_id`),
+internet headers (SPF/DKIM/Received/X-*), MAPI-ekvivalenty (důležitost, příznak,
+konverzační vlákno, kategorie, In-Reply-To, References), `isRead`, `isDraft`,
+`folder_path`, `inferenceClassification`.
+
+```bash
+# První import (vše):
+docker exec -it python-runner python /scripts/parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz
+
+# Test na 50 zprávách bez indexů:
+docker exec -it python-runner python /scripts/parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes
+
+# Pravidelný sync na pozadí (log do souboru):
+docker exec -d python-runner bash -c "python /scripts/parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --mode sync > /scripts/parse_emails.log 2>&1"
+```
+
+> **`-d` = detached:** příkaz se hned vrátí a skript běží dál v kontejneru i po
+> zavření terminálu / odpojení SSH. Bez `-d` (resp. s `-it`) skript skončí ve chvíli,
+> kdy se spojení zavře. Pro dlouhé běhy vždy pouštěj s `-d` a logem do souboru,
+> průběh pak sleduj přes `tail -f` (viz [Sledování průběhu](#sledování-průběhu)).
+
+### Parametry
+
+| Parametr | Popis |
+|---|---|
+| `--mailbox` | **Povinný.** Schránka (e-mail), zároveň název kolekce v MongoDB. |
+| `--mode` | `full` (výchozí — plný upsert), `new-only` (jen nové), `sync` (existující: aktualizuje `is_read`/`flag_status`/`categories`/`modified_at`/`folder_path`; nové importuje celé — ideální pro pravidelné spouštění). |
+| `--folder` | Import jen jedné složky (např. `Inbox`). |
+| `--limit N` | Zpracuje jen prvních N zpráv (test). |
+| `--no-indexes` | Nevytváří indexy na konci. |
+
+---
+
+## 2) download_attachments_v1.3.py — stažení příloh → /mnt/Emails
+
+Stahuje skutečné přílohy (`is_inline=False`) všech emailů z MongoDB přes Graph API
+do `/mnt/Emails/<schránka>/Attachments/`. Primárně přes `graph_att_id` (přímé ID),
+name-matching jako fallback pro staré emaily.
+
+Deduplikace podle **SHA256** obsahu:
+- stejný hash → soubor už existuje → přeskočí
+- kolize názvu (stejný název, jiný hash) → `faktura_2.pdf`, `faktura_3.pdf` …
+
+Po uložení aktualizuje MongoDB: každá příloha dostane `file_hash` + `local_path`
+a zároveň se doplní kolekce `emaily.attachments_index` (`_id`=hash, filename, path,
+size_bytes, mime_type, mailbox, first_seen_at, ref_count). Emaily, jejichž všechny
+přílohy už mají `file_hash`, se přeskočí → bezpečné opakovat.
+
+```bash
+# Interaktivně (vidíš výstup, skončí zavřením terminálu):
+docker exec -it python-runner python /scripts/download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz
+
+# Na pozadí (běží dál i po zavření terminálu, log do souboru):
+docker exec -d python-runner bash -c "python /scripts/download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz > /scripts/download_attachments.log 2>&1"
+```
+
+> `-d` = detached — stejné chování jako u skriptu výše (viz poznámka v sekci 1).
+
+### Parametry
+
+| Parametr | Popis |
+|---|---|
+| `--mailbox` | **Povinný.** Schránka (e-mail) = kolekce v MongoDB. |
+| `--limit N` | Zpracuje jen prvních N emailů (test). |
+| `--force-recheck` | Znovu ověří i už stažené přílohy. |
+| `--no-indexes` | Nevytváří indexy na konci. |
+
+---
+
+## Sledování průběhu
+
+```bash
+docker exec -it python-runner tail -f /scripts/parse_emails.log /scripts/download_attachments.log
+```

 ---

 ## Nainstalované Python balíčky

 ```
-extract-msg        0.55.0
+msal               (Graph API auth)
+requests
 pymongo            4.17.0
 python-dateutil    2.9.0.post0
+extract-msg        0.55.0
 cryptography       48.0.0
 beautifulsoup4     4.13.5
 oletools           0.60.2
@@ -112,11 +197,8 @@ docker exec python-runner pip install <balicek>

 ---

-## Logika parse_emails (oba skripty)
+## Historie

- Čte všechny `.msg` soubory z MSGS_DIR
- Extrahuje: předmět, odesílatel, příjemci (To/CC/BCC), tělo (text+HTML), přílohy, internet headers, všechny raw MAPI properties
- Ukládá do MongoDB: `emaily` → `vbuzalka@its.jnj.com`
- `_id` = Internet Message-ID (nebo `filename:<stem>` jako fallback)
- Upsert → bezpečné opakování, `--skip-existing` pro pokračování
- Indexy: received_at, sent_at, sender.email, filename (unique), full-text (subject+body+to+cc)
+| Datum | Změna |
+|---|---|
+| 2026-06-02 | Přechod z `.msg` souborů na Microsoft Graph API. Skript `parse_emails_tower_v1.1.py` (import lokálních `.msg`) nahrazen `parse_emails_graph_v1.3.py`; přidán `download_attachments_v1.3.py`. Staré verze v `Trash/`. |
@@ -3,6 +3,8 @@
 - [Pracovat v maintree](feedback_worktree.md) — vždy pracuj v `U:/janssen/`, ne ve worktree větvích
 - [Projekt Covance UCO3001](project_covance.md) — report vzorků studie 77242113UCO3001, skript `create_report.py`, zdroj + logika OK statusů
 - [EDC import do MongoDB](project_edc_mongo.md) — skript `medidata/edc_import.py`, import Data Listing + QueryDetails CSV do MongoDB (192.168.1.76), kolekce `queries` + `queries_snapshots` pro tracking vývoje queries v čase
+- [IWRS notifikace v Mongo](project_iwrs_mongo.md) — parser `IWRS/Patients/parse_notifications_to_mongo.py` čte texty notifikací z MySQL a ukládá strukturovaná data do `studie.iwrs` (lot, expirace, clinical response, audit trail)
 - [Dropbox file transfer](project_dropbox_file_transfer.md) — přenos souborů z JNJ PC do Dropboxu přes msgreceiver kontejner na Unraidu
 - [Graph email import](project_graph_email_import.md) — import JNJ emailů do schránky vladimir.buzalka@buzalka.cz přes Graph API
 - [Memory sync přes Giteu](setup_memory_sync.md) — paměť je v `claude-memory/` v janssen repu, junction + git push synchronizuje mezi PC
+- [Claude Code learning path](project_claude_learning.md) — Level 2 Intermediate, mezery: Skills/Subagenty/Hooks/Print mode, tutoriál v `claude-howto/`