notebook
This commit is contained in:
+11
-365
@@ -4,7 +4,7 @@ Kompletní pipeline pro Drugs:
|
||||
2. IP destruction (per košík, přeskočí již existující soubory)
|
||||
3. Shipments report (jeden soubor na studii, přepisuje)
|
||||
4. Shipment details (per zásilka CZ, vždy přepisuje)
|
||||
5. Import do MySQL
|
||||
5. Import do MongoDB (studie.iwrs_shipments / iwrs_shipment_items / iwrs_inventory / iwrs_destruction)
|
||||
|
||||
Spusť tento skript — zpracuje obě studie automaticky.
|
||||
"""
|
||||
@@ -14,12 +14,11 @@ import glob
|
||||
import re
|
||||
import datetime
|
||||
|
||||
import numpy as np
|
||||
import sys
|
||||
import pandas as pd
|
||||
from playwright.sync_api import sync_playwright
|
||||
import mysql.connector
|
||||
|
||||
import db_config
|
||||
import import_to_mongo as drugs_mongo
|
||||
|
||||
BASE_URL = "https://janssen.4gclinical.com"
|
||||
EMAIL = "vbuzalka@its.jnj.com"
|
||||
@@ -42,357 +41,6 @@ SITES = {
|
||||
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
|
||||
# ── type converters ──────────────────────────────────────────────────────────
|
||||
|
||||
def _py(val):
|
||||
if isinstance(val, np.generic):
|
||||
return val.item()
|
||||
return val
|
||||
|
||||
def to_date(val):
    """Coerce a spreadsheet cell into a ``datetime.date``.

    Returns ``None`` for ``None``, NaN/NaT-like values, the literal strings
    "nat", "nan", "none" and "" (case-insensitive), and for text that matches
    none of the supported formats. Date-time values are truncated to their
    date part.
    """
    val = _py(val)
    if val is None:
        return None
    if isinstance(val, float) and val != val:  # NaN never equals itself
        return None
    try:
        missing = pd.isna(val)
        if missing:
            return None
    except (TypeError, ValueError):
        # pd.isna can yield an array for list-likes, whose truth value is
        # ambiguous; such values fall through to the string parsing below.
        pass

    # pd.Timestamp subclasses datetime.datetime and NaT was rejected above,
    # so a single branch covers both kinds of date-time value.
    if isinstance(val, datetime.datetime):
        return val.date()
    if isinstance(val, datetime.date):
        return val

    text = str(val).strip()
    if text.lower() in ("nat", "nan", "none", ""):
        return None
    for pattern in ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S"):
        try:
            parsed = datetime.datetime.strptime(text, pattern)
        except ValueError:
            continue
        return parsed.date()
    return None
|
||||
|
||||
def to_int(val):
    """Coerce a spreadsheet cell into an ``int``, or ``None`` if impossible.

    Numeric text and floats are truncated toward zero (``"12.9"`` -> ``12``).
    ``None``, NaN, infinities and non-numeric values all yield ``None``.
    """
    val = _py(val)
    # Real integers (including bool) are returned directly: routing them
    # through float() would lose precision above 2**53 and overflow for very
    # large values.
    if isinstance(val, int):
        return int(val)
    try:
        # int() raises ValueError for NaN and OverflowError for +/-inf, so a
        # single except clause covers every "not a usable number" case.
        return int(float(val))
    except (TypeError, ValueError, OverflowError):
        return None
|
||||
|
||||
def to_str(val):
    """Coerce a spreadsheet cell into a stripped string.

    Returns ``None`` for ``None``, float NaN, and for values whose stripped
    text is empty or equals "nan", "nat" or "none" (case-insensitive).
    """
    val = _py(val)
    if val is None or (isinstance(val, float) and val != val):
        return None
    text = str(val).strip()
    if text.lower() in ("nan", "nat", "none", ""):
        return None
    return text
|
||||
|
||||
|
||||
# ── DB helpers ───────────────────────────────────────────────────────────────
|
||||
|
||||
def get_conn():
    """Open a new MySQL connection using the settings from ``db_config``."""
    settings = {
        "host": db_config.DB_HOST,
        "port": db_config.DB_PORT,
        "user": db_config.DB_USER,
        "password": db_config.DB_PASSWORD,
        "database": db_config.DB_NAME,
    }
    return mysql.connector.connect(**settings)
|
||||
|
||||
def insert_import(cursor, study, source_label):
    """Insert one ``iwrs_import`` row of report type "drugs".

    The row is stamped with the current local time. Returns the cursor's
    ``lastrowid`` after the insert.
    """
    statement = "INSERT INTO iwrs_import (study, imported_at, source_file, report_type) VALUES (%s, %s, %s, %s)"
    params = (study, datetime.datetime.now(), source_label, "drugs")
    cursor.execute(statement, params)
    return cursor.lastrowid
|
||||
|
||||
def basket_already_imported(cursor, study, basket_id):
    """Return True if ``iwrs_destruction`` already holds a row for this basket.

    The basket id is compared in its string form.
    """
    query = "SELECT 1 FROM iwrs_destruction WHERE study=%s AND basket_id=%s LIMIT 1"
    cursor.execute(query, (study, str(basket_id)))
    hit = cursor.fetchone()
    return hit is not None
|
||||
|
||||
|
||||
# ── parsery ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def parse_shipments_report(study):
    """Parse the shipments report workbook of ``study`` into row dicts.

    The header row is the first row containing a "Shipment ID" cell. Only
    rows whose "Location" contains "Czech" (case-insensitive) are kept.
    Returns an empty list when the file or the header row is missing.
    A missing mandatory column raises ``KeyError``; a missing optional
    column yields ``None`` for that field.
    """
    report_path = os.path.join(
        BASE_DIR, f"xls_shipments_{study}", f"shipments_report_{study}.xlsx"
    )
    if not os.path.exists(report_path):
        print(f" CHYBÍ: {report_path}")
        return []

    sheet = pd.read_excel(report_path, header=None)
    header_idx = next(
        (
            idx
            for idx, cells in sheet.iterrows()
            if "Shipment ID" in [str(cell).strip() for cell in cells]
        ),
        None,
    )
    if header_idx is None:
        return []

    table = pd.read_excel(report_path, header=header_idx).dropna(how="all")
    is_czech = table["Location"].astype(str).str.contains("Czech", na=False, case=False)
    table = table[is_czech]
    available = table.columns.tolist()

    # (output key, spreadsheet column, converter, mandatory column?)
    layout = (
        ("shipment_id", "Shipment ID", to_str, True),
        ("status", "IRT Shipment Status", to_str, True),
        ("type", "Type", to_str, True),
        ("ship_from", "Shipment From", to_str, True),
        ("ship_to_site", "Ship To:", to_str, True),
        ("location", "Location", to_str, True),
        ("request_date", "Request Date", to_date, True),
        ("shipped_date", "Shipped Date", to_date, True),
        ("received_date", "Received Date", to_date, False),
        ("received_by", "Received by", to_str, False),
        ("delivered_date_utc", "Delivered Date [UTC]", to_date, False),
        ("delivery_recipient", "Delivery Recipient", to_str, False),
        ("delivery_details", "Delivery Details", to_str, False),
        ("cancelled_date", "Cancelled Date", to_date, False),
        ("total_medication_ids", "Total Medication IDs", to_int, False),
        ("tracking_no", "Tracking #", to_str, False),
        ("shipping_category", "Shipping Category", to_str, False),
        ("expected_arrival", "Expected Arrival", to_date, False),
    )

    parsed = []
    for _, record in table.iterrows():
        entry = {}
        for key, column, convert, mandatory in layout:
            if mandatory or column in available:
                entry[key] = convert(record[column])
            else:
                entry[key] = None
        parsed.append(entry)
    return parsed
|
||||
|
||||
|
||||
def parse_shipment_details(study):
    """Parse every ``shipment_details_*.xlsx`` workbook of ``study``.

    The shipment id comes from the file name. The header row is the first
    row containing a "Medication ID" cell; workbooks without one are
    skipped. Returns one dict per data row across all workbooks.
    """
    folder = os.path.join(BASE_DIR, f"xls_shipment_details_{study}")
    collected = []
    for workbook in sorted(glob.glob(os.path.join(folder, "shipment_details_*.xlsx"))):
        match = re.search(r"shipment_details_(.+)\.xlsx", os.path.basename(workbook))
        shipment_id = "UNKNOWN" if match is None else match.group(1)

        sheet = pd.read_excel(workbook, header=None)
        header_idx = next(
            (
                idx
                for idx, cells in sheet.iterrows()
                if "Medication ID" in [str(cell).strip() for cell in cells]
            ),
            None,
        )
        if header_idx is None:
            continue

        table = pd.read_excel(workbook, header=header_idx).dropna(how="all")
        for _, record in table.iterrows():
            # Description and type may appear under either of two column names.
            description = to_str(record.get("Medication Description"))
            if not description:
                description = to_str(record.get("Medication ID Description"))
            kind = to_str(record.get("Medication type"))
            if not kind:
                kind = to_str(record.get("Medication ID type"))

            entry = {
                "shipment_id": shipment_id,
                "destination_location": to_str(record.get("Destination Location")),
                "shipment_status": to_str(record.get("IRT Shipment Status")),
                "shipment_type": to_str(record.get("Type")),
                "destination_site": to_str(record.get("Destination Site")),
                "investigator": to_str(record.get("Investigator")),
                "medication_description": description,
                "medication_type": kind,
                "medication_id": to_str(record.get("Medication ID")),
                "packaged_lot_no": to_str(record.get("Packaged Lot number")),
                "packaged_lot_description": to_str(record.get("Packaged Lot description")),
                "container_id": to_str(record.get("Container ID")),
                "quantity": to_int(record.get("Quantity of Medication IDs")),
                "expiration_date": to_date(record.get("Expiration Date")),
                "item_status": to_str(record.get("Status")),
            }
            collected.append(entry)
    return collected
|
||||
|
||||
|
||||
def parse_inventory(study):
    """Parse every ``onsite_inventory_detail_*.xlsx`` workbook of ``study``.

    "Site:", "Investigator:" and "Location:" values are read from the first
    column of each workbook and attached to all of its rows. The header row
    is the first row whose first cell is "Medication" or "Medication ID";
    workbooks without one are skipped. The first table column is renamed to
    ``medication_id``.
    """
    folder = os.path.join(BASE_DIR, f"xls_reports_{study}")
    collected = []
    for workbook in sorted(glob.glob(os.path.join(folder, "onsite_inventory_detail_*.xlsx"))):
        sheet = pd.read_excel(workbook, header=None)
        # Metadata is reset per workbook and keyed by its cell prefix.
        meta = {"Site:": None, "Investigator:": None, "Location:": None}
        header_idx = None
        for idx, cells in sheet.iterrows():
            lead = cells.iloc[0]
            first = str(lead).strip() if pd.notna(lead) else ""
            for prefix in meta:
                if first.startswith(prefix):
                    meta[prefix] = first.replace(prefix, "").strip()
                    break
            if header_idx is None and first in ("Medication", "Medication ID"):
                header_idx = idx
        if header_idx is None:
            continue

        table = pd.read_excel(workbook, header=header_idx).dropna(how="all")
        table = table.rename(columns={table.columns[0]: "medication_id"})
        for _, record in table.iterrows():
            entry = {
                "site": meta["Site:"],
                "investigator": meta["Investigator:"],
                "location": meta["Location:"],
                "medication_id": to_str(record["medication_id"]),
                "packaged_lot_no": to_str(record.get("Packaged Lot number")),
                "original_expiration_date": to_date(record.get("Original Expiration Date when Packaged Lot was Added")),
                "expiration_date": to_date(record.get("Expiration date")),
                "received_date": to_date(record.get("Received Date")),
                "receipt_user": to_str(record.get("Shipment Receipt User")),
                "subject_identifier": to_str(record.get("Subject Identifier")),
                "quantity_assigned": to_int(record.get("Quantity Assigned")),
                "irt_transaction": to_str(record.get("IRT Transaction")),
                "date_assigned": to_date(record.get("Date Assigned")),
                "assignment_user": to_str(record.get("Assignment User")),
                "dispensation_status": to_str(record.get("Dispensation Status")),
                # The column name appears with either capitalisation.
                "dispensing_date": to_date(record.get("Dispensing date") or record.get("Dispensing Date")),
                "quantity_dispensed": to_int(record.get("Quantity Dispensed")),
                "dispensing_user": to_str(record.get("Dispensing User")),
                "quantity_returned": to_int(record.get("Quantity Returned")),
                "date_returned": to_date(record.get("Date Returned")),
                "return_user": to_str(record.get("Return User")),
            }
            collected.append(entry)
    return collected
|
||||
|
||||
|
||||
def parse_destruction_files(study):
    """Parse every ``ip_destruction_basket_*.xlsx`` workbook of ``study``.

    Basket metadata (investigator, site id, location, basket id, destruction
    date) is read from prefixed cells in the first column. The item table
    starts at the first row whose first cell is "Medication ID Description";
    workbooks without one are skipped. Returns one dict per basket, with the
    parsed rows under ``"items"``.
    """
    folder = os.path.join(BASE_DIR, f"xls_ip_destruction_{study}")
    prefixes = {
        "Investigator Name:": "investigator",
        "Site ID:": "site_id",
        "Location:": "location",
        "Basket ID:": "basket_id",
        "Drug Destruction Created Date:": "destruction_date",
    }
    result = []
    for workbook in sorted(glob.glob(os.path.join(folder, "ip_destruction_basket_*.xlsx"))):
        sheet = pd.read_excel(workbook, header=None)
        meta = {}
        header_idx = None
        for idx, cells in sheet.iterrows():
            lead = cells.iloc[0]
            first = str(lead).strip() if pd.notna(lead) else ""
            for prefix, field in prefixes.items():
                if first.startswith(prefix):
                    meta[field] = first.replace(prefix, "").strip()
            if header_idx is None and first == "Medication ID Description":
                header_idx = idx
        if header_idx is None:
            continue

        table = pd.read_excel(workbook, header=header_idx).dropna(how="all")
        entries = [
            {
                "medication_description": to_str(record.get("Medication ID Description")),
                "medication_id": to_str(record.get("Medication ID")),
                "packaged_lot_description": to_str(record.get("Packaged Lot description")),
                "comments": to_str(record.get("Comments")),
            }
            for _, record in table.iterrows()
        ]
        result.append({
            "site_id": meta.get("site_id"),
            "investigator": meta.get("investigator"),
            "location": meta.get("location"),
            "basket_id": meta.get("basket_id"),
            "destruction_date": to_date(meta.get("destruction_date")),
            "items": entries,
        })
    return result
|
||||
|
||||
|
||||
# ── insertery ────────────────────────────────────────────────────────────────
|
||||
|
||||
def insert_shipments(cursor, import_id, study, rows):
    """Insert one ``iwrs_shipments`` row per parsed shipment dict.

    Each dict in ``rows`` must provide every key listed in ``fields``.
    """
    sql = """INSERT INTO iwrs_shipments
        (import_id, study, shipment_id, status, type, ship_from, ship_to_site,
        location, request_date, shipped_date, received_date, received_by,
        delivered_date_utc, delivery_recipient, delivery_details, cancelled_date,
        total_medication_ids, tracking_no, shipping_category, expected_arrival)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    # Dict keys in the same order as the column list after import_id and study.
    fields = (
        "shipment_id", "status", "type", "ship_from", "ship_to_site",
        "location", "request_date", "shipped_date", "received_date",
        "received_by", "delivered_date_utc", "delivery_recipient",
        "delivery_details", "cancelled_date", "total_medication_ids",
        "tracking_no", "shipping_category", "expected_arrival",
    )
    for record in rows:
        params = (import_id, study, *(record[name] for name in fields))
        cursor.execute(sql, params)
|
||||
|
||||
|
||||
def insert_shipment_items(cursor, import_id, study, rows):
    """Insert one ``iwrs_shipment_items`` row per parsed shipment item dict.

    Each dict in ``rows`` must provide every key listed in ``fields``.
    """
    sql = """INSERT INTO iwrs_shipment_items
        (import_id, study, shipment_id, destination_location, shipment_status,
        shipment_type, destination_site, investigator, medication_description,
        medication_type, medication_id, packaged_lot_no, packaged_lot_description,
        container_id, quantity, expiration_date, item_status)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    # Dict keys in the same order as the column list after import_id and study.
    fields = (
        "shipment_id", "destination_location", "shipment_status",
        "shipment_type", "destination_site", "investigator",
        "medication_description", "medication_type", "medication_id",
        "packaged_lot_no", "packaged_lot_description", "container_id",
        "quantity", "expiration_date", "item_status",
    )
    for record in rows:
        params = (import_id, study, *(record[name] for name in fields))
        cursor.execute(sql, params)
|
||||
|
||||
|
||||
def insert_inventory(cursor, import_id, study, rows):
    """Insert one ``iwrs_inventory`` row per parsed inventory dict.

    Each dict in ``rows`` must provide every key listed in ``fields``.
    """
    sql = """INSERT INTO iwrs_inventory
        (import_id, study, site, investigator, location, medication_id,
        packaged_lot_no, original_expiration_date, expiration_date, received_date,
        receipt_user, subject_identifier, quantity_assigned, irt_transaction,
        date_assigned, assignment_user, dispensation_status, dispensing_date,
        quantity_dispensed, dispensing_user, quantity_returned, date_returned, return_user)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    # Dict keys in the same order as the column list after import_id and study.
    fields = (
        "site", "investigator", "location", "medication_id",
        "packaged_lot_no", "original_expiration_date", "expiration_date",
        "received_date", "receipt_user", "subject_identifier",
        "quantity_assigned", "irt_transaction", "date_assigned",
        "assignment_user", "dispensation_status", "dispensing_date",
        "quantity_dispensed", "dispensing_user", "quantity_returned",
        "date_returned", "return_user",
    )
    for record in rows:
        params = (import_id, study, *(record[name] for name in fields))
        cursor.execute(sql, params)
|
||||
|
||||
|
||||
def insert_destruction(cursor, study, baskets):
    """Insert destruction items for baskets not yet present in the database.

    A basket is skipped entirely when ``basket_already_imported`` reports an
    existing row for it. Returns ``(imported, skipped)``, where ``skipped``
    counts baskets.
    """
    sql = """INSERT IGNORE INTO iwrs_destruction
        (study, site_id, investigator, location, basket_id, destruction_date,
        medication_description, medication_id, packaged_lot_description, comments)
        VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    basket_fields = ("site_id", "investigator", "location", "basket_id", "destruction_date")
    item_fields = ("medication_description", "medication_id", "packaged_lot_description", "comments")

    imported = 0
    skipped = 0
    for basket in baskets:
        if basket_already_imported(cursor, study, basket["basket_id"]):
            skipped += 1
            continue
        for entry in basket["items"]:
            params = (
                study,
                *(basket[name] for name in basket_fields),
                *(entry[name] for name in item_fields),
            )
            cursor.execute(sql, params)
            # NOTE(review): the reviewed view of this file had lost its
            # indentation; counting once per executed item (not once per
            # basket) is assumed here -- confirm against the original.
            imported += 1
    return imported, skipped
|
||||
|
||||
|
||||
def import_study(study):
    """Parse all drug reports of ``study`` and load them into MySQL.

    Shipments, shipment items, inventory and destruction baskets are parsed
    from the downloaded workbooks and written inside one transaction. If any
    insert fails, the transaction is rolled back and the exception is
    re-raised. The cursor and connection are closed on every path, so a
    failed import no longer leaks an open connection.
    """
    print(f"\n Parsování dat pro {study}...")
    shipments = parse_shipments_report(study)
    items = parse_shipment_details(study)
    inventory = parse_inventory(study)
    baskets = parse_destruction_files(study)
    print(f" Zásilky: {len(shipments)} | Položky: {len(items)} | Sklad: {len(inventory)} | Destrukce: {len(baskets)} košíků")

    conn = get_conn()
    try:
        cursor = conn.cursor()
        try:
            import_id = insert_import(cursor, study, f"drugs_{study}")
            print(f" import_id = {import_id}")
            insert_shipments(cursor, import_id, study, shipments)
            insert_shipment_items(cursor, import_id, study, items)
            insert_inventory(cursor, import_id, study, inventory)
            dest_imported, dest_skipped = insert_destruction(cursor, study, baskets)
            conn.commit()
        except Exception:
            # Roll back the partial import before the error propagates.
            conn.rollback()
            raise
        finally:
            cursor.close()
    finally:
        conn.close()
    print(f" Destrukce: {dest_imported} nových | {dest_skipped} košíků přeskočeno")
|
||||
|
||||
|
||||
# ── login ────────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -577,19 +225,17 @@ def main():
|
||||
finally:
|
||||
browser.close()
|
||||
|
||||
# ── Import do MySQL ───────────────────────────────────────────────────────
|
||||
# ── Import do MongoDB ─────────────────────────────────────────────────────
|
||||
print(f"\n{'='*60}")
|
||||
print("IMPORT DO MySQL")
|
||||
print("IMPORT DO MongoDB")
|
||||
print(f"{'='*60}")
|
||||
|
||||
for study in STUDIES:
|
||||
print(f"\n[{study}]")
|
||||
try:
|
||||
import_study(study)
|
||||
except Exception as e:
|
||||
import traceback
|
||||
print(f" CHYBA při importu: {e}")
|
||||
traceback.print_exc()
|
||||
try:
|
||||
drugs_mongo.run(STUDIES)
|
||||
except Exception as e:
|
||||
import traceback
|
||||
print(f" CHYBA při importu: {e}")
|
||||
traceback.print_exc()
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print("Vše hotovo.")
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
BIN
Binary file not shown.
@@ -156,38 +156,62 @@ def run(page, study):
|
||||
total_notif = 0
|
||||
for subject in subjects:
|
||||
filename = os.path.join(out_dir, f"{today} {study} {subject} Subject Detail.xlsx")
|
||||
print(f" [{subject}] Stahuji...")
|
||||
input_field = page.locator('input[placeholder="search"], input[type="text"]').first
|
||||
input_field.click()
|
||||
input_field.fill(subject)
|
||||
page.wait_for_timeout(500)
|
||||
|
||||
# Zachytíme table_1 response při výběru subjektu
|
||||
if api_base:
|
||||
success = False
|
||||
table1_data = None
|
||||
for attempt in range(1, 4):
|
||||
try:
|
||||
with page.expect_response(
|
||||
lambda r: "report_data" in r.url and "table_1" in r.url,
|
||||
timeout=60000
|
||||
) as resp_info:
|
||||
print(f" [{subject}] Stahuji... (pokus {attempt}/3)")
|
||||
input_field = page.locator('input[placeholder="search"], input[type="text"]').first
|
||||
input_field.click()
|
||||
input_field.fill(subject)
|
||||
page.wait_for_timeout(500)
|
||||
|
||||
# Zachytíme table_1 response při výběru subjektu
|
||||
if api_base:
|
||||
try:
|
||||
with page.expect_response(
|
||||
lambda r: "report_data" in r.url and "table_1" in r.url,
|
||||
timeout=60000
|
||||
) as resp_info:
|
||||
page.locator("mat-option").first.dispatch_event("click")
|
||||
table1_data = resp_info.value.json()
|
||||
except Exception as e:
|
||||
print(f" [{subject}] CHYBA zachycení table_1: {e}")
|
||||
page.locator("mat-option").first.dispatch_event("click")
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
table1_data = None
|
||||
else:
|
||||
page.locator("mat-option").first.dispatch_event("click")
|
||||
table1_data = resp_info.value.json()
|
||||
except Exception as e:
|
||||
print(f" [{subject}] CHYBA zachycení table_1: {e}")
|
||||
page.locator("mat-option").first.dispatch_event("click")
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
table1_data = None
|
||||
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
table1_data = None
|
||||
else:
|
||||
page.locator("mat-option").first.dispatch_event("click")
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
table1_data = None
|
||||
page.wait_for_timeout(2000)
|
||||
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
page.wait_for_timeout(1000)
|
||||
with page.expect_download(timeout=60000) as dl:
|
||||
page.get_by_role("button", name="Download XLS").click()
|
||||
dl.value.save_as(filename)
|
||||
print(f" [{subject}] XLS OK")
|
||||
success = True
|
||||
break
|
||||
except Exception as e:
|
||||
print(f" [{subject}] pokus {attempt} selhal: {e}")
|
||||
if attempt < 3:
|
||||
try:
|
||||
page.goto(f"{BASE_URL}/report/patient_detail_report")
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
except Exception as ge:
|
||||
print(f" [{subject}] refresh selhal: {ge}")
|
||||
|
||||
with page.expect_download(timeout=120000) as dl:
|
||||
page.get_by_role("button", name="Download XLS").click()
|
||||
dl.value.save_as(filename)
|
||||
print(f" [{subject}] XLS OK")
|
||||
if not success:
|
||||
print(f" [{subject}] PŘESKAKUJI po 3 neúspěšných pokusech")
|
||||
try:
|
||||
page.goto(f"{BASE_URL}/report/patient_detail_report")
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
except Exception:
|
||||
pass
|
||||
continue
|
||||
|
||||
# Stáhnout notifikace pro tohoto subjekta
|
||||
if api_base and table1_data:
|
||||
@@ -196,8 +220,13 @@ def run(page, study):
|
||||
)
|
||||
total_notif += n
|
||||
|
||||
page.get_by_role("button", name="Clear").click()
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
try:
|
||||
page.get_by_role("button", name="Clear").click()
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
except Exception as e:
|
||||
print(f" [{subject}] Clear selhal: {e} — refresh")
|
||||
page.goto(f"{BASE_URL}/report/patient_detail_report")
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
|
||||
print(f" [{study}] Subject details hotovo. Nových notifikací: {total_notif}")
|
||||
|
||||
|
||||
+24
-292
@@ -2,23 +2,21 @@
|
||||
Kompletní pipeline:
|
||||
1. Stažení Subject Summary Reportů (obě studie)
|
||||
2. Stažení Subject Detail Reportů + notifikací (obě studie)
|
||||
3. Import do MySQL (summary, visits, notifikace)
|
||||
3. Import do MongoDB (subject_summary + visits + notifications)
|
||||
|
||||
Spusť tento skript místo samostatných skriptů.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import datetime
|
||||
import glob
|
||||
import re
|
||||
|
||||
from playwright.sync_api import sync_playwright
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
import db_config
|
||||
import mysql.connector
|
||||
import download_subject_details as dsd
|
||||
import import_to_mongo
|
||||
import import_notifications_to_mongo
|
||||
|
||||
# ── CONFIG ───────────────────────────────────────────────────────────────────
|
||||
BASE_URL = "https://janssen.4gclinical.com"
|
||||
@@ -72,6 +70,7 @@ def download_summary(page, study, today):
|
||||
# ── KROK 2: Subject Details ───────────────────────────────────────────────────
|
||||
|
||||
def get_subjects_from_summary(summary_path):
|
||||
import pandas as pd
|
||||
raw = pd.read_excel(summary_path, header=None)
|
||||
header_row = None
|
||||
for i, row in raw.iterrows():
|
||||
@@ -112,277 +111,7 @@ def download_details(page, study, summary_path, today):
|
||||
page.wait_for_load_state("networkidle", timeout=120000)
|
||||
|
||||
|
||||
# ── KROK 3: Import do MySQL ───────────────────────────────────────────────────
|
||||
|
||||
def get_conn():
    """Open a new MySQL connection using the settings from ``db_config``."""
    settings = {
        "host": db_config.DB_HOST,
        "port": db_config.DB_PORT,
        "user": db_config.DB_USER,
        "password": db_config.DB_PASSWORD,
        "database": db_config.DB_NAME,
    }
    return mysql.connector.connect(**settings)
|
||||
|
||||
|
||||
def _py(val):
|
||||
"""Převede numpy skalár na Python nativní typ."""
|
||||
if isinstance(val, np.generic):
|
||||
return val.item()
|
||||
return val
|
||||
|
||||
|
||||
def to_date(val):
    """Coerce a spreadsheet cell into a ``datetime.date``.

    Returns ``None`` for ``None``, NaN/NaT-like values, the literal strings
    "nat", "nan", "none" and "" (case-insensitive), and for text that matches
    none of the supported formats. Date-time values are truncated to their
    date part.
    """
    val = _py(val)
    if val is None:
        return None
    if isinstance(val, float) and val != val:  # NaN never equals itself
        return None
    try:
        missing = pd.isna(val)
        if missing:
            return None
    except (TypeError, ValueError):
        # pd.isna can yield an array for list-likes, whose truth value is
        # ambiguous; such values fall through to the string parsing below.
        pass

    # pd.Timestamp subclasses datetime.datetime and NaT was rejected above,
    # so a single branch covers both kinds of date-time value.
    if isinstance(val, datetime.datetime):
        return val.date()
    if isinstance(val, datetime.date):
        return val

    text = str(val).strip()
    if text.lower() in ("nat", "nan", "none", ""):
        return None
    for pattern in ("%Y-%m-%d", "%d-%b-%Y", "%d-%m-%Y", "%Y-%m-%d %H:%M:%S"):
        try:
            parsed = datetime.datetime.strptime(text, pattern)
        except ValueError:
            continue
        return parsed.date()
    return None
|
||||
|
||||
|
||||
def to_int(val):
    """Coerce a spreadsheet cell into an ``int``, or ``None`` if impossible.

    Numeric text and floats are truncated toward zero (``"12.9"`` -> ``12``).
    ``None``, NaN, infinities and non-numeric values all yield ``None``.
    """
    val = _py(val)
    # Real integers (including bool) are returned directly: routing them
    # through float() would lose precision above 2**53 and overflow for very
    # large values.
    if isinstance(val, int):
        return int(val)
    try:
        # int() raises ValueError for NaN and OverflowError for +/-inf, so a
        # single except clause covers every "not a usable number" case.
        return int(float(val))
    except (TypeError, ValueError, OverflowError):
        return None
|
||||
|
||||
|
||||
def to_float(val):
    """Coerce a spreadsheet cell into a ``float``.

    Returns ``None`` for NaN and for values that ``float()`` rejects.
    """
    val = _py(val)
    try:
        number = float(val)
    except (TypeError, ValueError):
        return None
    if number != number:  # NaN never equals itself
        return None
    return number
|
||||
|
||||
|
||||
def to_str(val):
    """Coerce a spreadsheet cell into a stripped string.

    Returns ``None`` for ``None``, float NaN, and for values whose stripped
    text is empty or equals "nan", "nat" or "none" (case-insensitive).
    """
    val = _py(val)
    if val is None or (isinstance(val, float) and val != val):
        return None
    text = str(val).strip()
    if text.lower() in ("nan", "nat", "none", ""):
        return None
    return text
|
||||
|
||||
|
||||
def read_summary_df(path):
    """Load a subject summary workbook with the correct header row.

    The header is the first row containing a cell equal to "Subject" after
    stripping. Fully empty rows are dropped from the result.

    Raises:
        ValueError: if no such header row exists in the workbook.
    """
    probe = pd.read_excel(path, header=None)
    for idx, cells in probe.iterrows():
        labels = [str(cell).strip() for cell in cells]
        if "Subject" in labels:
            return pd.read_excel(path, header=idx).dropna(how="all")
    raise ValueError(f"Hlavičkový řádek nenalezen v {path}")
|
||||
|
||||
|
||||
def parse_detail_visits(path):
    """Extract visit rows from the "patient_detail_report" sheet of ``path``.

    The visit table starts below the first row containing a "Visit Type"
    cell, and its columns are read by position. Only rows whose first cell
    is "Past" or "Upcoming" are kept. Returns an empty list when no header
    row is found.
    """
    sheet = pd.read_excel(path, sheet_name="patient_detail_report", header=None)
    header_idx = next(
        (
            idx
            for idx, cells in sheet.iterrows()
            if "Visit Type" in [str(cell).strip() for cell in cells]
        ),
        None,
    )
    if header_idx is None:
        return []

    body = sheet.iloc[header_idx + 1:].copy()
    body.columns = range(body.shape[1])

    # Output key and converter for table columns 1..8, in positional order.
    layout = (
        ("scheduled_date", to_date),
        ("window_days", to_str),
        ("actual_date", to_date),
        ("irt_transaction_no", to_int),
        ("irt_transaction_description", to_str),
        ("medication_assignment", to_str),
        ("quantity_assigned", to_int),
        ("medication_id", to_str),
    )
    visits = []
    for _, record in body.iterrows():
        kind = to_str(record.get(0))
        if kind not in ("Past", "Upcoming"):
            continue
        visit = {"visit_type": kind}
        for position, (key, convert) in enumerate(layout, start=1):
            visit[key] = convert(record.get(position))
        visits.append(visit)
    return visits
|
||||
|
||||
|
||||
def insert_import(cursor, study, source_file):
    """Insert one ``iwrs_import`` row for ``source_file``.

    Only the base name of ``source_file`` is stored, and the row is stamped
    with the current local time. Returns the cursor's ``lastrowid``.
    """
    statement = "INSERT INTO iwrs_import (study, imported_at, source_file) VALUES (%s, %s, %s)"
    params = (study, datetime.datetime.now(), os.path.basename(source_file))
    cursor.execute(statement, params)
    return cursor.lastrowid
|
||||
|
||||
|
||||
def insert_uco3001_summary(cursor, import_id, df):
    """Insert every row of a UCO3001 summary frame into MySQL.

    Rows go into ``iwrs_uco3001_subject_summary``. Mandatory columns are read
    unconditionally, so a missing one raises ``KeyError``. Optional columns
    that are absent from ``df`` are stored as ``None``.
    """
    sql = """INSERT INTO iwrs_uco3001_subject_summary (
        import_id, subject, prior_subject_identifier, site, investigator, location,
        cohort_per_irt, informed_consent_date, adolescent_assent_date, age, weight,
        rescreened_subject, adt_ir, three_or_more_advanced_therapies,
        only_oral_5asa_compounds, ustekinumab, isolated_proctitis,
        clinical_responder_status_i12_m0, irt_subject_status,
        i0_rand_date_local, last_irt_transaction,
        last_irt_transaction_date_local, last_irt_transaction_date_utc,
        next_irt_transaction, next_irt_transaction_date_local,
        most_recent_med_assignment_date, days_since_last_med_assignment,
        patient_forecast_status, patient_forecast_status_changed_date
        ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    available = df.columns.tolist()

    # (converter, spreadsheet column, mandatory?) in SQL column order after
    # import_id.
    layout = (
        (to_str, "Subject", True),
        (to_str, "Prior Subject Identifier", False),
        (to_str, "Site", True),
        (to_str, "Investigator", True),
        (to_str, "Location", True),
        (to_str, "Cohort per IRT", True),
        (to_date, "Informed Consent Date", True),
        (to_date, "Adolescent Assent Date", False),
        (to_int, "Subject's age collection", True),
        (to_float, "Subject's weight collection", False),
        (to_str, "Rescreened Subject", False),
        (to_str, "ADT-IR", False),
        (to_str, "3 or More Advanced Therapies", False),
        (to_str, "Only Oral 5-ASA Compounds", False),
        (to_str, "Ustekinumab", False),
        (to_str, "Isolated Proctitis", False),
        (to_str, "Clinical Responder Status at I-12 / M-0", False),
        (to_str, "IRT Subject Status", True),
        (to_date, "I0_RAND_TIMESTAMP_LOCAL [Local]", False),
        (to_str, "Last Recorded IRT Transaction", True),
        (to_date, "Last Recorded IRT Transaction Date [Local]", True),
        (to_date, "Last Recorded IRT Transaction Date (UTC)", True),
        (to_str, "Next Expected IRT Transaction", True),
        (to_date, "Next Expected IRT Transaction Date [Local]", True),
        (to_date, "Most Recent Medication Assignment Transaction [Local]", False),
        (to_int, "Days Since Last Medication Assignment Transaction", False),
        (to_str, "Patient Forecast Status", False),
        (to_date, "Patient Forecast Status Changed Date (UTC)", False),
    )

    for _, record in df.iterrows():
        params = [import_id]
        for convert, column, mandatory in layout:
            if mandatory or column in available:
                params.append(convert(record[column]))
            else:
                params.append(None)
        cursor.execute(sql, tuple(params))
|
||||
|
||||
|
||||
def insert_mdd3003_summary(cursor, import_id, df):
    """Insert one ``iwrs_mdd3003_subject_summary`` row per row of *df*.

    Args:
        cursor: DB-API cursor; ``execute(sql, params)`` is called once per row.
        import_id: value stored in the ``import_id`` column of every row.
        df: summary DataFrame. Columns flagged optional below are stored as
            NULL when the DataFrame does not contain them; the remaining
            columns are read unconditionally.
    """
    sql = """INSERT INTO iwrs_mdd3003_subject_summary (
        import_id, subject, prior_subject_identifier, site, investigator, location,
        cohort_per_irt, madrs_criteria_integrated, informed_consent_date, age,
        madrs_criteria_v15, madrs_criteria_v16, madrs_criteria_v17,
        stratification_country, age_group, stable_remitters, irt_subject_status,
        last_irt_transaction, last_irt_transaction_date_local,
        last_irt_transaction_date_utc, next_irt_transaction,
        next_irt_transaction_date_local, date_screened, date_screen_failed,
        date_randomized_part1, date_early_withdraw_randomized_part1,
        date_open_label_induction, date_early_withdraw_open_label_induction,
        date_randomized_part2, date_early_withdraw_randomized_part2,
        date_completed, date_unblinded
    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""

    # (source column, converter, optional?) in the same order as the SQL
    # column list after import_id.
    field_spec = (
        ("Subject", to_str, False),
        ("Prior Subject Identifier", to_str, True),
        ("Site", to_str, False),
        ("Investigator", to_str, False),
        ("Location", to_str, False),
        ("Cohort per IRT", to_str, False),
        ("MADRS response criteria integrated or manually entered", to_str, True),
        ("Informed Consent Date", to_date, False),
        ("Subject's age collection", to_int, False),
        ("MADRS response criteria v1.5 from RAVE", to_str, True),
        ("MADRS response criteria v1.6 from RAVE", to_str, True),
        ("MADRS response criteria v1.7 from RAVE", to_str, True),
        ("Stratification Country", to_str, True),
        ("Age Group", to_str, True),
        ("Stable Remitters vs. Non Stable Remitters", to_str, True),
        ("IRT Subject Status", to_str, False),
        ("Last Recorded IRT Transaction", to_str, False),
        ("Last Recorded IRT Transaction Date [Local]", to_date, False),
        ("Last Recorded IRT Transaction Date (UTC)", to_date, False),
        ("Next Expected IRT Transaction", to_str, False),
        ("Next Expected IRT Transaction Date [Local]", to_date, False),
        ("Date Screened [Local]", to_date, True),
        ("Date Screen Failed [Local]", to_date, True),
        ("Date Randomized Part 1 [Local]", to_date, True),
        ("Date Early Withdraw Randomized Part 1 [Local]", to_date, True),
        ("Date Open Label Induction [Local]", to_date, True),
        ("Date Early Withdraw Open Label Induction [Local]", to_date, True),
        ("Date Randomized Part 2 [Local]", to_date, True),
        ("Date Early Withdraw Randomized Part 2 [Local]", to_date, True),
        ("Date Completed [Local]", to_date, True),
        ("Date Unblinded [Local]", to_date, True),
    )

    present = df.columns.tolist()
    for _, row in df.iterrows():
        values = [import_id]
        for column, convert, optional in field_spec:
            if optional and column not in present:
                values.append(None)
            else:
                values.append(convert(row[column]))
        cursor.execute(sql, tuple(values))
|
||||
|
||||
|
||||
def insert_visits(cursor, import_id, study, subject, visits):
    """Insert one ``iwrs_subject_visits`` row per entry of *visits*.

    Args:
        cursor: DB-API cursor; ``execute(sql, params)`` is called once per visit.
        import_id, study, subject: values repeated on every inserted row.
        visits: iterable of mappings holding the per-visit keys listed in
            ``visit_keys``; a falsy value (``None`` / empty) inserts nothing.
    """
    if not visits:
        return

    sql = """INSERT INTO iwrs_subject_visits (
        import_id, study, subject, visit_type, scheduled_date, window_days,
        actual_date, irt_transaction_no, irt_transaction_description,
        medication_assignment, quantity_assigned, medication_id
    ) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""

    # Per-visit keys, in SQL column order after the three shared columns.
    visit_keys = (
        "visit_type", "scheduled_date", "window_days",
        "actual_date", "irt_transaction_no",
        "irt_transaction_description", "medication_assignment",
        "quantity_assigned", "medication_id",
    )
    shared = (import_id, study, subject)
    for visit in visits:
        cursor.execute(sql, shared + tuple(visit[key] for key in visit_keys))
|
||||
|
||||
|
||||
def import_to_mysql(summary_path, detail_files, study):
    """Import one study's summary report and subject-detail files into MySQL.

    Everything is written in a single transaction: the import header, the
    study-specific summary rows and the visits parsed from each detail file.
    On any failure the transaction is rolled back, and the cursor and
    connection are always closed before the exception propagates.

    Args:
        summary_path: path of the downloaded summary report.
        detail_files: paths of "<date> <study> <subject> Subject Detail.xlsx" files.
        study: study identifier; "77242113UCO3001" selects the UCO3001 summary
            layout, anything else the MDD3003 layout.

    Returns:
        The ``import_id`` returned by ``insert_import``.
    """
    print(f"\n [MySQL] Importuji {study}...")
    df_summary = read_summary_df(summary_path)
    detail_name = re.compile(r"\d{4}-\d{2}-\d{2} \S+ (\S+) Subject Detail\.xlsx")

    conn = get_conn()
    try:
        cursor = conn.cursor()
        try:
            import_id = insert_import(cursor, study, summary_path)

            if study == "77242113UCO3001":
                insert_uco3001_summary(cursor, import_id, df_summary)
            else:
                insert_mdd3003_summary(cursor, import_id, df_summary)

            total_visits = 0
            for path in detail_files:
                m = detail_name.search(os.path.basename(path))
                # Files whose name does not follow the pattern are still imported.
                subject = m.group(1) if m else "UNKNOWN"
                visits = parse_detail_visits(path)
                insert_visits(cursor, import_id, study, subject, visits)
                total_visits += len(visits)

            conn.commit()
        except Exception:
            # Do not leave a half-written import behind.
            conn.rollback()
            raise
        finally:
            cursor.close()
    finally:
        conn.close()

    print(f" [MySQL] import_id={import_id} | pacientů={len(df_summary)} | transakcí={total_visits}")
    return import_id
|
||||
|
||||
|
||||
# ── MAIN ─────────────────────────────────────────────────────────────────────
|
||||
# ── KROK 3: Import do MongoDB ────────────────────────────────────────────────
|
||||
|
||||
def main():
|
||||
today = datetime.date.today().strftime("%Y-%m-%d")
|
||||
@@ -391,12 +120,12 @@ def main():
|
||||
|
||||
summary_paths = {}
|
||||
|
||||
# ── Krok 1 + 2: stahování (Playwright, každá studie zvlášť kvůli session) ──
|
||||
# Krok 1 + 2: stahování (Playwright, každá studie zvlášť kvůli session)
|
||||
with sync_playwright() as p:
|
||||
for study in STUDIES:
|
||||
print(f"\n{'='*60}")
|
||||
print("\n" + "=" * 60)
|
||||
print(f"[{study}] KROK 1: Subject Summary Report")
|
||||
print(f"{'='*60}")
|
||||
print("=" * 60)
|
||||
browser = p.chromium.launch(headless=False)
|
||||
context = browser.new_context(accept_downloads=True)
|
||||
page = context.new_page()
|
||||
@@ -415,10 +144,10 @@ def main():
|
||||
finally:
|
||||
browser.close()
|
||||
|
||||
# ── Krok 3: import do MySQL ──────────────────────────────────────────────
|
||||
print(f"\n{'='*60}")
|
||||
print("KROK 3: Import do MySQL")
|
||||
print(f"{'='*60}")
|
||||
# Krok 3: import do MongoDB
|
||||
print("\n" + "=" * 60)
|
||||
print("KROK 3: Import do MongoDB")
|
||||
print("=" * 60)
|
||||
|
||||
for study in STUDIES:
|
||||
summary_path = summary_paths.get(study)
|
||||
@@ -426,18 +155,21 @@ def main():
|
||||
print(f" [{study}] PŘESKOČENO — stahování selhalo")
|
||||
continue
|
||||
|
||||
detail_files = sorted(glob.glob(
|
||||
os.path.join(DETAILS_DIR, study, f"{today} {study} * Subject Detail.xlsx")
|
||||
))
|
||||
|
||||
try:
|
||||
import_to_mysql(summary_path, detail_files, study)
|
||||
import_to_mongo.run(study, summary_path, DETAILS_DIR, today)
|
||||
except Exception as e:
|
||||
print(f" [{study}] CHYBA při importu: {e}")
|
||||
print(f" [{study}] CHYBA při importu summary/visits: {e}")
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
# Notifikace: PDF/JSON z disku rovnou do Mongo iwrs_notifications
|
||||
print("\n [notifikace] import PDF/JSON do Mongo...")
|
||||
try:
|
||||
import_notifications_to_mongo.main(STUDIES)
|
||||
except Exception as e:
|
||||
print(f" CHYBA při importu notifikací: {e}")
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print("Vše hotovo.")
|
||||
print(f"{'='*60}")
|
||||
print("=" * 60)
|
||||
|
||||
|
||||
main()
|
||||
|
||||
@@ -1,449 +0,0 @@
|
||||
"""
|
||||
download_attachments_v1.0.py
|
||||
Nazev: download_attachments_v1.0.py
|
||||
Verze: 1.0
|
||||
Datum: 2026-06-02
|
||||
Autor: vladimir.buzalka
|
||||
|
||||
Popis:
|
||||
Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB kolekce
|
||||
ordinace@buzalkova.cz primo pres Microsoft Graph API a uklada je do
|
||||
adresare /mnt/Emails/ordinace@buzalkova.cz/Attachments/.
|
||||
|
||||
Deduplikace podle SHA256 hashe obsahu:
|
||||
- stejny hash = soubor uz existuje -> preskoci
|
||||
- prvni vyskyt souboru: ulozi pod puvodnim nazvem
|
||||
- kolize nazvu (stejny nazev, jiny hash): faktura_2.pdf, faktura_3.pdf ...
|
||||
|
||||
Po ulozeni aktualizuje MongoDB:
|
||||
- v email dokumentu: kazda priloha dostane file_hash + local_path
|
||||
- kolekce emaily.attachments_index: _id=hash, filename, path, size_bytes,
|
||||
mime_type, first_seen_at, ref_count (pocet emailu ktery ji obsahuje)
|
||||
|
||||
Bezpecne prerusit a opakovat:
|
||||
- zpravy kde jsou vsechny prilohy uz stazene (maji file_hash) se preskoci
|
||||
- --force-recheck znovu overi i uz stazene (pro pripad zmen na disku)
|
||||
|
||||
POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
|
||||
|
||||
Spousteni:
|
||||
python download_attachments_v1.0.py # stahni vse co chybi
|
||||
python download_attachments_v1.0.py --limit 50 # test na prvnich 50 emailech
|
||||
python download_attachments_v1.0.py --force-recheck # overi i uz stazene
|
||||
|
||||
Docker (po pridani mountu /mnt/user/Emails -> /mnt/Emails):
|
||||
docker exec -it python-runner python /scripts/download_attachments_v1.0.py
|
||||
|
||||
Zavislosti:
|
||||
msal, requests, pymongo, python-dateutil
|
||||
Python 3.10+
|
||||
|
||||
Struktura na disku:
|
||||
/mnt/Emails/
|
||||
└── ordinace@buzalkova.cz/
|
||||
└── Attachments/
|
||||
├── faktura_2026.pdf
|
||||
├── vysledky_lab.pdf
|
||||
├── vysledky_lab_2.pdf <- kolize nazvu, jiny obsah
|
||||
└── ...
|
||||
|
||||
Kolekce emaily.attachments_index:
|
||||
_id SHA256 hash (hex)
|
||||
filename nazev souboru na disku (prvni vyskytu)
|
||||
local_path relativni cesta od Attachments/ (zatim = filename)
|
||||
size_bytes velikost souboru
|
||||
mime_type MIME typ
|
||||
first_seen_at datetime UTC
|
||||
ref_count v kolika emailech se tato priloha vyskytuje
|
||||
|
||||
Aktualizace v email dokumentu (kolekce ordinace@buzalkova.cz):
|
||||
attachments[i].file_hash SHA256 hash
|
||||
attachments[i].local_path cesta relativni od Attachments/
|
||||
|
||||
Historie verzi:
|
||||
1.0 2026-06-02 Inicialni verze
|
||||
"""
|
||||
|
||||
import sys
|
||||
import hashlib
|
||||
import logging
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import msal
|
||||
import requests
|
||||
from pymongo import MongoClient, UpdateOne
|
||||
|
||||
if hasattr(sys.stdout, "reconfigure"):
|
||||
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
|
||||
|
||||
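# ─── KONFIGURACE ──────────────────────────────────────────────────────────────
# NOTE(review): GRAPH_CLIENT_SECRET below appears to be a real client secret committed
# in plain text; rotate it and load it from the environment or a secret store.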
|
||||
GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9"
|
||||
GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
|
||||
GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
|
||||
GRAPH_MAILBOX = "ordinace@buzalkova.cz"
|
||||
GRAPH_URL = "https://graph.microsoft.com/v1.0"
|
||||
|
||||
MONGO_URI = "mongodb://192.168.1.76:27017"
|
||||
MONGO_DB = "emaily"
|
||||
MONGO_COL_EMAILS = "ordinace@buzalkova.cz"
|
||||
MONGO_COL_INDEX = "attachments_index"
|
||||
|
||||
ATTACHMENTS_DIR = Path("/mnt/Emails/ordinace@buzalkova.cz/Attachments")
|
||||
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
|
||||
SCRIPT_VERSION = "1.0"
|
||||
BATCH_SIZE = 50
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
logging.basicConfig(
|
||||
filename=str(LOG_FILE),
|
||||
level=logging.ERROR,
|
||||
format="%(asctime)s | %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
_graph_token: Optional[str] = None
|
||||
|
||||
|
||||
# ─── Graph API ────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_token() -> str:
    """Acquire a Graph access token and cache it in the module-level ``_graph_token``.

    Returns:
        The freshly acquired access token.

    Raises:
        RuntimeError: if the MSAL response contains no ``access_token``.
    """
    global _graph_token
    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
    msal_app = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=authority_url,
        client_credential=GRAPH_CLIENT_SECRET,
    )
    response = msal_app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
    if "access_token" in response:
        _graph_token = response["access_token"]
        return _graph_token
    raise RuntimeError(f"Graph auth failed: {response}")
|
||||
|
||||
|
||||
def graph_get_bytes(url: str) -> bytes:
    """Download the raw response body of *url* using the cached Graph token.

    A 401 response triggers one token refresh and a single retry.

    Raises:
        requests.HTTPError: for any non-401 error status.
        RuntimeError: if both attempts are answered with 401.
    """
    if not _graph_token:
        get_token()
    for _ in range(2):
        # The context manager closes the streamed response on every path
        # (401 retry, HTTP error, success) so its connection is released.
        with requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            timeout=120,
            stream=True,
        ) as resp:
            if resp.status_code == 401:
                get_token()
                continue
            resp.raise_for_status()
            return resp.content
    raise RuntimeError(f"Graph GET bytes failed: {url}")
|
||||
|
||||
|
||||
def graph_get_json(url: str, params: dict = None) -> dict:
    """GET *url* with the cached Graph token and return the decoded JSON body.

    A 401 response triggers a token refresh; the request is tried at most twice.

    Raises:
        requests.HTTPError: for any non-401 error status.
        RuntimeError: if both attempts are answered with 401.
    """
    if not _graph_token:
        get_token()
    attempts_left = 2
    while attempts_left:
        attempts_left -= 1
        auth_header = {"Authorization": f"Bearer {_graph_token}"}
        response = requests.get(url, headers=auth_header, params=params, timeout=30)
        if response.status_code != 401:
            response.raise_for_status()
            return response.json()
        get_token()
    raise RuntimeError(f"Graph GET json failed: {url}")
|
||||
|
||||
|
||||
def fetch_attachment_content(graph_message_id: str, attachment_id: str) -> Optional[bytes]:
    """Download one attachment's raw bytes from the Graph ``$value`` endpoint.

    Returns:
        The attachment content, or ``None`` when the download fails; the
        failure is written to the error log instead of being raised.
    """
    message_url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/messages/{graph_message_id}"
    content_url = f"{message_url}/attachments/{attachment_id}/$value"
    payload = None
    try:
        payload = graph_get_bytes(content_url)
    except Exception as exc:
        logging.error("fetch_attachment_content failed [msg=%s att=%s]: %s", graph_message_id, attachment_id, exc)
    return payload
|
||||
|
||||
|
||||
def fetch_message_attachments(graph_message_id: str) -> list[dict]:
    """List the attachment metadata (including attachment IDs) of one message.

    Returns:
        The ``value`` array of the Graph response, or an empty list when the
        request fails; the failure is written to the error log.
    """
    listing_url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/messages/{graph_message_id}/attachments"
    found = []
    try:
        listing = graph_get_json(listing_url, {"$select": "id,name,contentType,size,isInline,contentId"})
        found = listing.get("value", [])
    except Exception as exc:
        logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, exc)
    return found
|
||||
|
||||
|
||||
# ─── Dedup + ukládání ─────────────────────────────────────────────────────────
|
||||
|
||||
def sha256(data: bytes) -> str:
    """Return the SHA-256 digest of *data* as a lowercase hex string."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
|
||||
|
||||
|
||||
def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, index_col) -> str:
    """Pick the file name under which content with hash *hash_val* is stored.

    *desired_name* is returned unchanged unless the index already lists that
    name for a different hash. In that case the first ``<stem>_<n><suffix>``
    (n = 2, 3, ...) is returned that does not exist in *att_dir* and is not
    indexed for a different hash.
    """
    owner = index_col.find_one({"filename": desired_name})
    if not owner or owner["_id"] == hash_val:
        return desired_name

    wanted = Path(desired_name)
    base, ext = wanted.stem, wanted.suffix
    counter = 1
    while True:
        counter += 1
        candidate = f"{base}_{counter}{ext}"
        if (att_dir / candidate).exists():
            continue
        # Free on disk; make sure the index does not reserve it for other content.
        holder = index_col.find_one({"filename": candidate})
        if not holder or holder["_id"] == hash_val:
            return candidate
|
||||
|
||||
|
||||
def save_attachment(content: bytes, original_name: str, att_dir: Path, index_col) -> tuple[str, str, bool]:
    """Store *content* in *att_dir*, deduplicating by the SHA-256 of the bytes.

    Args:
        content: raw attachment bytes.
        original_name: attachment name as reported by the mail; sanitised
            before it is used as a file name.
        att_dir: directory the file is written to.
        index_col: index collection keyed by content hash (``_id``).

    Returns:
        ``(hash, local_path, was_new)``. ``was_new`` is False when the hash was
        already indexed: nothing is written, the existing ``local_path`` is
        returned and the entry's ``ref_count`` is incremented.
    """
    hash_val = sha256(content)

    # Known content: only bump the reference counter.
    existing = index_col.find_one({"_id": hash_val})
    if existing:
        index_col.update_one({"_id": hash_val}, {"$inc": {"ref_count": 1}})
        return hash_val, existing["local_path"], False

    # New content: keep alphanumerics and "._- ", replace everything else.
    safe_name = "".join(c if c.isalnum() or c in "._- " else "_" for c in original_name).strip()
    # Names made only of dots/spaces ("", ".", "..") are not usable file names:
    # att_dir / ".." is a directory and write_bytes() on it would raise.
    if not safe_name.strip(". "):
        safe_name = f"attachment_{hash_val[:8]}"

    filename = resolve_filename(safe_name, att_dir, hash_val, index_col)
    file_path = att_dir / filename

    file_path.write_bytes(content)

    # mime_type is stored empty here; this function has no MIME information.
    index_col.insert_one({
        "_id": hash_val,
        "filename": filename,
        "local_path": filename,
        "size_bytes": len(content),
        "mime_type": "",
        "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None),
        "ref_count": 1,
    })

    return hash_val, filename, True
|
||||
|
||||
|
||||
# ─── MAIN ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def _find_graph_attachment(att_name: str, graph_att_map: dict) -> Optional[dict]:
    """Return the Graph attachment matching *att_name* (exact, then substring), or None."""
    exact = graph_att_map.get(att_name)
    if exact:
        return exact
    if not att_name:
        # "" is a substring of every name; without this guard an unnamed
        # attachment would silently be matched to an arbitrary Graph attachment.
        return None
    needle = att_name.lower()
    for graph_name, graph_att in graph_att_map.items():
        if needle in graph_name.lower():
            return graph_att
    return None


def _download_pending(client, args, start) -> None:
    """Download the attachments of all matching emails and print the summary."""
    col_emails = client[MONGO_DB][MONGO_COL_EMAILS]
    col_index = client[MONGO_DB][MONGO_COL_INDEX]

    # Indexes on the attachment index collection
    if not args.no_indexes:
        col_index.create_index("filename")
        col_index.create_index("mime_type")

    # Emails that still have a non-inline attachment without file_hash,
    # or every email with attachments when --force-recheck is given.
    # NOTE(review): with --force-recheck, attachments whose hash is already indexed
    # go through save_attachment again, which increments ref_count once more.
    if args.force_recheck:
        query = {"has_attachments": True}
    else:
        query = {
            "has_attachments": True,
            "attachments": {
                "$elemMatch": {
                    "is_inline": False,
                    "file_hash": {"$exists": False},
                }
            }
        }

    total = col_emails.count_documents(query)
    print(f"\nEmailu ke zpracovani: {total}")
    if total == 0:
        print("Neni co stahnout.")
        return

    cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1})
    if args.limit:
        cursor = cursor.limit(args.limit)

    ok_count = 0
    new_count = 0
    skip_count = 0
    err_count = 0
    email_i = 0
    batch = []

    def flush():
        """Write the queued email updates; a failed bulk write is logged, not raised."""
        if not batch:
            return
        try:
            col_emails.bulk_write(batch, ordered=False)
        except Exception as e:
            logging.error("bulk_write: %s", e)
            print(f" CHYBA bulk_write: {e}")
        batch.clear()

    for email_doc in cursor:
        email_i += 1
        email_id = email_doc["_id"]
        graph_id = email_doc.get("graph_id", "")
        subject = (email_doc.get("subject") or "")[:60]
        att_list = email_doc.get("attachments") or []

        # Only real (non-inline) attachments are downloaded
        real_atts = [a for a in att_list if not a.get("is_inline", False)]
        if not real_atts:
            continue

        print(f"\n {email_i:>5}/{total} {subject}")

        # Attachment IDs come from Graph; entries without a name cannot be
        # matched against the stored file names and are left out of the map.
        graph_atts = fetch_message_attachments(graph_id)
        graph_att_map = {
            a["name"]: a
            for a in graph_atts
            if a.get("name") and not a.get("isInline", False)
        }

        updated_atts = list(att_list)
        email_ok = True

        for i, att in enumerate(updated_atts):
            if att.get("is_inline", False):
                continue
            att_name = att.get("filename", "")
            if not args.force_recheck and att.get("file_hash"):
                skip_count += 1
                print(f" SKIP {att_name}")
                continue

            graph_att = _find_graph_attachment(att_name, graph_att_map)
            if not graph_att:
                logging.error("attachment not found in Graph [email=%s att=%s]", email_id, att_name)
                print(f" ERR {att_name} (nenalezeno v Graph)")
                err_count += 1
                email_ok = False
                continue

            # Download the content
            content = fetch_attachment_content(graph_id, graph_att["id"])
            if content is None:
                err_count += 1
                email_ok = False
                print(f" ERR {att_name} (stazeni selhalo)")
                continue

            # Store with deduplication
            hash_val, local_path, was_new = save_attachment(content, att_name, ATTACHMENTS_DIR, col_index)

            # save_attachment stores an empty MIME type; fill it in here
            col_index.update_one(
                {"_id": hash_val},
                {"$set": {"mime_type": att.get("mime_type", graph_att.get("contentType", ""))}},
            )

            # Record hash and path on the email's attachment entry
            updated_atts[i] = {**att, "file_hash": hash_val, "local_path": local_path}

            if was_new:
                new_count += 1
                print(f" NEW {local_path} ({len(content):,} B)")
            else:
                skip_count += 1
                print(f" DUP {att_name} -> {local_path}")

        if email_ok:
            ok_count += 1

        # Queue the updated attachment list for this email
        batch.append(UpdateOne(
            {"_id": email_id},
            {"$set": {"attachments": updated_atts}}
        ))

        if len(batch) >= BATCH_SIZE:
            flush()

        if email_i % 100 == 0:
            print(f" {'─'*60}")
            print(f" Průběh: emaily={email_i}/{total} nove={new_count} dup={skip_count} err={err_count}")
            print(f" {'─'*60}")

    flush()

    elapsed_total = (datetime.now() - start).total_seconds()
    files_total = col_index.count_documents({})
    size_total = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1}))

    print(f"\n{'='*52}")
    print(f"Vysledek: emaily={ok_count} | nove soubory={new_count} | duplikaty={skip_count} | err={err_count}")
    print(f"Souboru v indexu: {files_total} ({size_total/1024/1024:.1f} MB)")
    print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
    print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    if err_count:
        print(f"Chyby logovany do: {LOG_FILE}")


def main():
    """Command-line entry point: download missing attachments of the configured mailbox.

    Parses ``--limit``, ``--force-recheck`` and ``--no-indexes``, checks that
    the Graph API and MongoDB are reachable (exits with status 1 otherwise) and
    then processes the matching emails. The MongoDB client is closed on every
    exit path.
    """
    ap = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}")
    ap.add_argument("--limit", type=int, default=0,
                    help="Zpracovat max N emailu (0 = vse)")
    ap.add_argument("--force-recheck", action="store_true",
                    help="Znovu overi i emaily kde prilohy uz maji file_hash")
    ap.add_argument("--no-indexes", action="store_true",
                    help="Nevytvorit indexy na attachments_index kolekci")
    args = ap.parse_args()

    start = datetime.now()
    print(f"=== download_attachments v{SCRIPT_VERSION} ===")
    print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Schránka: {GRAPH_MAILBOX}")
    print(f"Cilovy adresar: {ATTACHMENTS_DIR}")
    print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}")

    # Target directory
    ATTACHMENTS_DIR.mkdir(parents=True, exist_ok=True)
    print(" Adresar OK")

    # Graph
    print("\nPřipojuji se k Graph API...")
    try:
        get_token()
        print(" Graph API OK")
    except Exception as e:
        print(f" CHYBA: {e}")
        sys.exit(1)

    # MongoDB
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        try:
            client.admin.command("ping")
        except Exception as e:
            print(f" CHYBA: MongoDB neni dostupna -- {e}")
            sys.exit(1)
        print(" MongoDB OK")
        _download_pending(client, args, start)
    finally:
        client.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,428 +0,0 @@
|
||||
"""
|
||||
download_attachments_v1.1.py
|
||||
Nazev: download_attachments_v1.1.py
|
||||
Verze: 1.1
|
||||
Datum: 2026-06-02
|
||||
Autor: vladimir.buzalka
|
||||
|
||||
Popis:
|
||||
Stahuje skutecne prilohy (is_inline=False) vsech emailu z MongoDB
|
||||
pres Microsoft Graph API a uklada je do adresare
|
||||
/mnt/Emails/<schránka>/Attachments/.
|
||||
|
||||
Schránka se predava jako povinny parametr --mailbox.
|
||||
|
||||
Deduplikace podle SHA256 hashe obsahu:
|
||||
- stejny hash = soubor uz existuje -> preskoci
|
||||
- prvni vyskyt souboru: ulozi pod puvodnim nazvem
|
||||
- kolize nazvu (stejny nazev, jiny hash): faktura_2.pdf, faktura_3.pdf ...
|
||||
|
||||
Po ulozeni aktualizuje MongoDB:
|
||||
- v email dokumentu: kazda priloha dostane file_hash + local_path
|
||||
- kolekce emaily.attachments_index: _id=hash, filename, path, size_bytes,
|
||||
mime_type, mailbox, first_seen_at, ref_count
|
||||
|
||||
Bezpecne prerusit a opakovat — emaily kde vsechny prilohy maji file_hash
|
||||
se preskoci. --force-recheck znovu overi i uz stazene.
|
||||
|
||||
POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
|
||||
|
||||
Spousteni:
|
||||
python download_attachments_v1.1.py --mailbox ordinace@buzalkova.cz
|
||||
python download_attachments_v1.1.py --mailbox vladimir.buzalka@buzalka.cz --limit 50
|
||||
python download_attachments_v1.1.py --mailbox ordinace@buzalkova.cz --force-recheck
|
||||
|
||||
Docker:
|
||||
docker exec -it python-runner python /scripts/download_attachments_v1.1.py \\
|
||||
--mailbox ordinace@buzalkova.cz
|
||||
|
||||
Zavislosti:
|
||||
msal, requests, pymongo
|
||||
Python 3.10+
|
||||
|
||||
Struktura na disku:
|
||||
/mnt/Emails/
|
||||
└── <mailbox>/
|
||||
└── Attachments/
|
||||
├── faktura_2026.pdf
|
||||
├── vysledky_lab.pdf
|
||||
├── vysledky_lab_2.pdf
|
||||
└── ...
|
||||
|
||||
Kolekce emaily.attachments_index:
|
||||
_id SHA256 hash (hex)
|
||||
filename nazev souboru na disku
|
||||
local_path relativni cesta od Attachments/
|
||||
size_bytes velikost souboru
|
||||
mime_type MIME typ
|
||||
mailbox schránka ze ktere pochazi prvni vyskytu
|
||||
first_seen_at datetime UTC
|
||||
ref_count v kolika emailech se tato priloha vyskytuje
|
||||
|
||||
Historie verzi:
|
||||
1.0 2026-06-02 Inicialni verze
|
||||
1.1 2026-06-02 Schránka jako parametr --mailbox (univerzalni pouziti)
|
||||
"""
|
||||
|
||||
import sys
|
||||
import hashlib
|
||||
import logging
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import msal
|
||||
import requests
|
||||
from pymongo import MongoClient, UpdateOne
|
||||
|
||||
if hasattr(sys.stdout, "reconfigure"):
|
||||
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
|
||||
|
||||
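# ─── KONFIGURACE ──────────────────────────────────────────────────────────────
# NOTE(review): GRAPH_CLIENT_SECRET below appears to be a real client secret committed
# in plain text; rotate it and load it from the environment or a secret store.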
|
||||
GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9"
|
||||
GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
|
||||
GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
|
||||
GRAPH_URL = "https://graph.microsoft.com/v1.0"
|
||||
|
||||
MONGO_URI = "mongodb://192.168.1.76:27017"
|
||||
MONGO_DB = "emaily"
|
||||
MONGO_COL_INDEX = "attachments_index"
|
||||
|
||||
EMAILS_BASE_DIR = Path("/mnt/Emails")
|
||||
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
|
||||
SCRIPT_VERSION = "1.1"
|
||||
BATCH_SIZE = 50
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
logging.basicConfig(
|
||||
filename=str(LOG_FILE),
|
||||
level=logging.ERROR,
|
||||
format="%(asctime)s | %(message)s",
|
||||
datefmt="%Y-%m-%d %H:%M:%S",
|
||||
encoding="utf-8",
|
||||
)
|
||||
|
||||
_graph_token: Optional[str] = None
|
||||
|
||||
|
||||
# ─── Graph API ────────────────────────────────────────────────────────────────
|
||||
|
||||
def get_token() -> str:
    """Acquire a Graph access token and cache it in the module-level ``_graph_token``.

    Returns:
        The freshly acquired access token.

    Raises:
        RuntimeError: if the MSAL response contains no ``access_token``.
    """
    global _graph_token
    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
    msal_app = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=authority_url,
        client_credential=GRAPH_CLIENT_SECRET,
    )
    response = msal_app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
    if "access_token" in response:
        _graph_token = response["access_token"]
        return _graph_token
    raise RuntimeError(f"Graph auth failed: {response}")
|
||||
|
||||
|
||||
def graph_get_bytes(url: str) -> bytes:
    """Download the raw response body of *url* using the cached Graph token.

    A 401 response triggers one token refresh and a single retry.

    Raises:
        requests.HTTPError: for any non-401 error status.
        RuntimeError: if both attempts are answered with 401.
    """
    if not _graph_token:
        get_token()
    for _ in range(2):
        # The context manager closes the streamed response on every path
        # (401 retry, HTTP error, success) so its connection is released.
        with requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            timeout=120,
            stream=True,
        ) as resp:
            if resp.status_code == 401:
                get_token()
                continue
            resp.raise_for_status()
            return resp.content
    raise RuntimeError(f"Graph GET bytes failed: {url}")
|
||||
|
||||
|
||||
def graph_get_json(url: str, params: dict = None) -> dict:
    """GET *url* with the cached Graph token and return the decoded JSON body.

    A 401 response triggers a token refresh; the request is tried at most twice.

    Raises:
        requests.HTTPError: for any non-401 error status.
        RuntimeError: if both attempts are answered with 401.
    """
    if not _graph_token:
        get_token()
    attempts_left = 2
    while attempts_left:
        attempts_left -= 1
        auth_header = {"Authorization": f"Bearer {_graph_token}"}
        response = requests.get(url, headers=auth_header, params=params, timeout=30)
        if response.status_code != 401:
            response.raise_for_status()
            return response.json()
        get_token()
    raise RuntimeError(f"Graph GET json failed: {url}")
|
||||
|
||||
|
||||
def fetch_message_attachments(mailbox: str, graph_message_id: str) -> list[dict]:
    """List the attachment metadata of one message in *mailbox*.

    Returns:
        The ``value`` array of the Graph response, or an empty list when the
        request fails; the failure is written to the error log.
    """
    listing_url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}/attachments"
    found = []
    try:
        listing = graph_get_json(listing_url, {"$select": "id,name,contentType,size,isInline,contentId"})
        found = listing.get("value", [])
    except Exception as exc:
        logging.error("fetch_message_attachments failed [%s]: %s", graph_message_id, exc)
    return found
|
||||
|
||||
|
||||
def fetch_attachment_content(mailbox: str, graph_message_id: str, attachment_id: str) -> Optional[bytes]:
    """Download one attachment's raw bytes from the Graph ``$value`` endpoint.

    Returns:
        The attachment content, or ``None`` when the download fails; the
        failure is written to the error log instead of being raised.
    """
    message_url = f"{GRAPH_URL}/users/{mailbox}/messages/{graph_message_id}"
    content_url = f"{message_url}/attachments/{attachment_id}/$value"
    payload = None
    try:
        payload = graph_get_bytes(content_url)
    except Exception as exc:
        logging.error("fetch_attachment_content failed [msg=%s att=%s]: %s", graph_message_id, attachment_id, exc)
    return payload
|
||||
|
||||
|
||||
# ─── Dedup + ukládání ─────────────────────────────────────────────────────────
|
||||
|
||||
def sha256(data: bytes) -> str:
    """Return the SHA-256 digest of *data* as a lowercase hex string."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return hasher.hexdigest()
|
||||
|
||||
|
||||
def safe_filename(name: str) -> str:
    """Turn an attachment name into a name that is safe to use as a file name.

    Alphanumerics and the characters ``._- `` are kept, everything else becomes
    ``_``, and surrounding whitespace is stripped. Names that end up empty or
    consist only of dots and spaces (".", "..") are replaced by "attachment",
    because joining them to a directory does not yield a writable file path.
    """
    cleaned = "".join(ch if ch.isalnum() or ch in "._- " else "_" for ch in name).strip()
    if not cleaned.strip(". "):
        return "attachment"
    return cleaned
|
||||
|
||||
|
||||
def resolve_filename(desired_name: str, att_dir: Path, hash_val: str, col_index) -> str:
    """Pick the file name under which content with hash *hash_val* is stored.

    *desired_name* is returned unchanged unless the index already lists that
    name for a different hash. Otherwise ``<stem>_<n><suffix>`` candidates
    (n = 2, 3, ...) are tried; a candidate is accepted when the index already
    maps it to *hash_val*, or when it is neither indexed nor present in
    *att_dir*.
    """
    owner = col_index.find_one({"filename": desired_name})
    if not owner or owner["_id"] == hash_val:
        return desired_name

    wanted = Path(desired_name)
    base, ext = wanted.stem, wanted.suffix
    counter = 2
    while True:
        candidate = f"{base}_{counter}{ext}"
        holder = col_index.find_one({"filename": candidate})
        same_content = bool(holder) and holder["_id"] == hash_val
        if not holder or same_content:
            on_disk = (att_dir / candidate).exists()
            if same_content or not on_disk:
                return candidate
        counter += 1
|
||||
|
||||
|
||||
def save_attachment(
    content: bytes,
    original_name: str,
    mime_type: str,
    mailbox: str,
    att_dir: Path,
    col_index,
) -> tuple[str, str, bool]:
    """Store *content* in *att_dir*, deduplicating by the SHA-256 of the bytes.

    Returns:
        ``(hash, local_path, was_new)``. When the hash is already indexed,
        nothing is written: the entry's ``ref_count`` is incremented and its
        existing ``local_path`` is returned with ``was_new=False``. Otherwise
        the file is written and a new index entry (with *mime_type* and
        *mailbox*) is inserted.
    """
    digest = sha256(content)
    by_hash = {"_id": digest}

    known = col_index.find_one(by_hash)
    if known:
        col_index.update_one({"_id": digest}, {"$inc": {"ref_count": 1}})
        return digest, known["local_path"], False

    stored_name = resolve_filename(safe_filename(original_name), att_dir, digest, col_index)
    (att_dir / stored_name).write_bytes(content)

    # The index entry is built after the file is on disk.
    index_entry = {
        "_id": digest,
        "filename": stored_name,
        "local_path": stored_name,
        "size_bytes": len(content),
        "mime_type": mime_type,
        "mailbox": mailbox,
        "first_seen_at": datetime.now(timezone.utc).replace(tzinfo=None),
        "ref_count": 1,
    }
    col_index.insert_one(index_entry)

    return digest, stored_name, True
|
||||
|
||||
|
||||
# ─── MAIN ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Command-line entry point: download e-mail attachments and index them.

    Steps:

    1. Parse ``--mailbox`` (required), ``--limit``, ``--force-recheck`` and
       ``--no-indexes``.
    2. Create ``EMAILS_BASE_DIR/<mailbox>/Attachments`` and verify that a
       Graph token can be obtained and that MongoDB answers a ping; the
       process exits with status 1 when either check fails.
    3. Select e-mails with ``has_attachments`` set (by default only those
       with a non-inline attachment that has no ``file_hash`` yet).
    4. For each e-mail, list its attachments through Graph, download every
       non-inline one, store it via ``save_attachment`` and queue an update
       that writes ``file_hash`` / ``local_path`` back into the e-mail
       document.
    5. Print a summary.

    Queued updates are flushed and the Mongo client is closed even when the
    run is interrupted by an exception, so finished downloads are not lost.
    """
    ap = argparse.ArgumentParser(description=f"download_attachments v{SCRIPT_VERSION}")
    ap.add_argument("--mailbox", required=True,
                    help="Emailova schranka (napr. ordinace@buzalkova.cz)")
    ap.add_argument("--limit", type=int, default=0,
                    help="Zpracovat max N emailu (0 = vse)")
    ap.add_argument("--force-recheck", action="store_true",
                    help="Znovu overi i emaily kde prilohy uz maji file_hash")
    ap.add_argument("--no-indexes", action="store_true",
                    help="Nevytvorit indexy na attachments_index kolekci")
    args = ap.parse_args()

    mailbox = args.mailbox
    att_dir = EMAILS_BASE_DIR / mailbox / "Attachments"
    mongo_col = mailbox  # the e-mail collection is named after the mailbox

    start = datetime.now()
    print(f"=== download_attachments v{SCRIPT_VERSION} ===")
    print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Schránka: {mailbox}")
    print(f"Cilovy adresar: {att_dir}")
    print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{mongo_col}")

    att_dir.mkdir(parents=True, exist_ok=True)
    print(" Adresar OK")

    print("\nPřipojuji se k Graph API...")
    try:
        get_token()
        print(" Graph API OK")
    except Exception as e:
        print(f" CHYBA: {e}")
        sys.exit(1)

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        print(" MongoDB OK")
    except Exception as e:
        print(f" CHYBA: MongoDB neni dostupna -- {e}")
        sys.exit(1)

    col_emails = client[MONGO_DB][mongo_col]
    col_index = client[MONGO_DB][MONGO_COL_INDEX]

    ok_count = 0
    new_count = 0
    dup_count = 0
    err_count = 0
    email_i = 0
    batch = []

    def flush():
        """Send the queued e-mail updates in one bulk write and empty the queue."""
        if not batch:
            return
        try:
            col_emails.bulk_write(batch, ordered=False)
        except Exception as e:
            logging.error("bulk_write: %s", e)
            print(f" CHYBA bulk_write: {e}")
        batch.clear()

    try:
        if not args.no_indexes:
            col_index.create_index("filename")
            col_index.create_index("mime_type")
            col_index.create_index("mailbox")

        # Query: with --force-recheck every e-mail with attachments is
        # visited; otherwise only those still missing a file_hash on some
        # non-inline attachment.
        if args.force_recheck:
            query = {"has_attachments": True}
        else:
            query = {
                "has_attachments": True,
                "attachments": {
                    "$elemMatch": {
                        "is_inline": False,
                        "file_hash": {"$exists": False},
                    }
                }
            }

        total = col_emails.count_documents(query)
        print(f"\nEmailu ke zpracovani: {total}")
        if total == 0:
            print("Neni co stahnout.")
            return

        cursor = col_emails.find(query, {"_id": 1, "graph_id": 1, "subject": 1, "attachments": 1})
        if args.limit:
            cursor = cursor.limit(args.limit)

        for email_doc in cursor:
            email_i += 1
            email_id = email_doc["_id"]
            graph_id = email_doc.get("graph_id", "")
            subject = (email_doc.get("subject") or "")[:60]
            att_list = email_doc.get("attachments") or []

            real_atts = [a for a in att_list if not a.get("is_inline", False)]
            if not real_atts:
                continue

            print(f"\n {email_i:>5}/{total} {subject}")

            # A failure to list one message's attachments must not abort the
            # whole run; the e-mail stays without file_hash and is picked up
            # again next time.
            try:
                graph_atts = fetch_message_attachments(mailbox, graph_id) or []
            except Exception as e:
                logging.error("attachment listing failed [email=%s]: %s", email_id, e)
                print(f" ERR (nacteni seznamu priloh z Graph selhalo: {e})")
                err_count += 1
                continue

            graph_att_map = {
                a["name"]: a
                for a in graph_atts
                if a.get("name") and not a.get("isInline", False)
            }

            updated_atts = list(att_list)
            email_ok = True

            for i, att in enumerate(updated_atts):
                if att.get("is_inline", False):
                    continue

                att_name = att.get("filename", "")

                if not args.force_recheck and att.get("file_hash"):
                    print(f" SKIP {att_name}")
                    continue

                graph_att = graph_att_map.get(att_name)
                # Fallback: case-insensitive substring match. Skipped for an
                # empty name, which would otherwise match every attachment.
                if not graph_att and att_name:
                    for gname, ga in graph_att_map.items():
                        if att_name.lower() in gname.lower():
                            graph_att = ga
                            break

                if not graph_att:
                    logging.error("attachment not found in Graph [email=%s att=%s]", email_id, att_name)
                    print(f" ERR {att_name} (nenalezeno v Graph)")
                    err_count += 1
                    email_ok = False
                    continue

                content = fetch_attachment_content(mailbox, graph_id, graph_att["id"])
                if content is None:
                    err_count += 1
                    email_ok = False
                    print(f" ERR {att_name} (stazeni selhalo)")
                    continue

                mime_type = att.get("mime_type") or graph_att.get("contentType", "")
                hash_val, local_path, was_new = save_attachment(
                    content, att_name, mime_type, mailbox, att_dir, col_index
                )

                updated_atts[i] = {**att, "file_hash": hash_val, "local_path": local_path}

                if was_new:
                    new_count += 1
                    print(f" NEW {local_path} ({len(content):,} B)")
                else:
                    dup_count += 1
                    print(f" DUP {att_name} -> {local_path}")

            if email_ok:
                ok_count += 1

            # The attachment list is written back even when some attachments
            # failed, so the successful ones keep their file_hash.
            batch.append(UpdateOne({"_id": email_id}, {"$set": {"attachments": updated_atts}}))

            if len(batch) >= BATCH_SIZE:
                flush()

            if email_i % 100 == 0:
                print(f" {'─'*60}")
                print(f" Průběh: emaily={email_i}/{total} nove={new_count} dup={dup_count} err={err_count}")
                print(f" {'─'*60}")

        flush()

        elapsed_total = (datetime.now() - start).total_seconds()
        files_total = col_index.count_documents({})
        size_total = sum(d.get("size_bytes", 0) for d in col_index.find({}, {"size_bytes": 1}))

        print(f"\n{'='*52}")
        print(f"Vysledek: emaily={ok_count} | nove={new_count} | dup={dup_count} | err={err_count}")
        print(f"Souboru v indexu: {files_total} ({size_total / 1024 / 1024:.1f} MB)")
        print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
        print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        if err_count:
            print(f"Chyby logovany do: {LOG_FILE}")
    finally:
        # Runs on success, early return and on any exception: persist what
        # is still queued (no-op when the queue is empty) and release the
        # connection.
        flush()
        client.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,560 +0,0 @@
|
||||
"""
|
||||
parse_emails_graph_v1.0.py
|
||||
Nazev: parse_emails_graph_v1.0.py
|
||||
Verze: 1.0
|
||||
Datum: 2026-06-02
|
||||
Autor: vladimir.buzalka
|
||||
|
||||
Popis:
|
||||
Cte vsechny emaily ze schranky ordinace@buzalkova.cz primo pres
|
||||
Microsoft Graph API a importuje je jako dokumenty do MongoDB.
|
||||
Z kazde zpravy extrahuje vsechny dostupne vlastnosti:
|
||||
|
||||
- predmet, odesilatel, prijemci (To/CC/BCC s typy)
|
||||
- cas doruceni, odeslani, vytvoreni, modifikace (UTC)
|
||||
- telo HTML (max 2 MB) + textovy preview
|
||||
- prilohy (metadata: jmeno, velikost, MIME typ, inline flag)
|
||||
- internet headers (SPF, DKIM, Received, X-*, ...)
|
||||
- MAPI-ekvivalenty: dulezitost, priznak, konverzacni vlakno,
|
||||
kategorie, In-Reply-To, References, ...
|
||||
- navic: isRead, isDraft, folder_path, inferenceClassification
|
||||
|
||||
Prochazi VSECHNY slozky schranky rekurzivne (Inbox, Sent, Deleted,
|
||||
archivni slozky, ...).
|
||||
|
||||
DB: emaily
|
||||
Kolekce: ordinace@buzalkova.cz
|
||||
_id: Internet Message-ID (nebo "graphid:<id>" jako fallback)
|
||||
|
||||
Bezpecne prerusit a opakovat:
|
||||
- upsert podle _id — duplicity se automaticky prepisi
|
||||
- --skip-existing nacte seznam hotovych _id z MongoDB a preskoci je
|
||||
|
||||
POZOR: Skript pouze CIST ze schranky — zadny zapis do schranky!
|
||||
|
||||
Spousteni:
|
||||
python parse_emails_graph_v1.0.py # kompletni import
|
||||
python parse_emails_graph_v1.0.py --limit 50 # test na prvnich 50
|
||||
python parse_emails_graph_v1.0.py --skip-existing # pokracovani po preruseni
|
||||
python parse_emails_graph_v1.0.py --folder Inbox # jen jedna slozka
|
||||
python parse_emails_graph_v1.0.py --no-indexes # bez indexu na konci
|
||||
|
||||
Zavislosti:
|
||||
msal, requests, pymongo, python-dateutil
|
||||
Python 3.10+
|
||||
|
||||
Struktura dokumentu v MongoDB:
|
||||
_id Internet Message-ID (nebo graphid: fallback)
|
||||
graph_id Graph API message ID (pro pripadne dalsi operace)
|
||||
subject predmet zpravy
|
||||
normalized_subject predmet bez RE:/FW:/AW: prefixu
|
||||
importance 0=nizka 1=normalni 2=vysoka
|
||||
flag_status 0=bez priznaku 1=oznaceno 2=dokonceno
|
||||
is_read bool — aktualni stav precteni ve schrance
|
||||
is_draft bool
|
||||
has_attachments bool
|
||||
attachment_count int
|
||||
inference_classification focused / other (Outlook AI trideni)
|
||||
categories [str]
|
||||
conversation_id Graph conversationId
|
||||
conversation_index base64 conversationIndex
|
||||
conversation_topic tema vlakna (z internet headers Thread-Topic)
|
||||
in_reply_to Message-ID predchozi zpravy
|
||||
internet_references [Message-ID] — cela historie vlakna
|
||||
received_at datetime UTC
|
||||
sent_at datetime UTC
|
||||
created_at datetime UTC — cas vytvoreni zaznamu v M365
|
||||
modified_at datetime UTC — cas posledni modifikace
|
||||
folder_id Graph parentFolderId
|
||||
folder_path cela cesta slozky (napr. Inbox/Subfolder)
|
||||
sender.email emailova adresa odesilatele
|
||||
sender.name zobrazovane jmeno odesilatele
|
||||
to retezec To (joined)
|
||||
cc retezec CC
|
||||
bcc retezec BCC
|
||||
recipients [{type, email, name}] — to/cc/bcc s typy
|
||||
body_html HTML telo (max 2 MB)
|
||||
body_preview textovy nahled (max 255 znaku z Graph)
|
||||
attachments [{filename, size_bytes, mime_type,
|
||||
content_id, is_inline}]
|
||||
headers dict internet headers (lowercase_s_podtrzitky)
|
||||
parsed_at datetime UTC — cas parsovani
|
||||
|
||||
Indexy:
|
||||
received_at, sent_at, sender.email, graph_id (unique),
|
||||
conversation_id, folder_path, has_attachments, categories,
|
||||
importance, flag_status, is_read,
|
||||
text_search (subject + body_preview + to + cc)
|
||||
|
||||
Historie verzi:
|
||||
1.0 2026-06-02 Inicialni verze — Graph API jako zdroj
|
||||
"""
|
||||
|
||||
import sys
|
||||
import re
|
||||
import logging
|
||||
import argparse
|
||||
import base64
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import msal
|
||||
import requests
|
||||
from dateutil import parser as dtparser
|
||||
from pymongo import MongoClient, UpdateOne, ASCENDING, TEXT
|
||||
|
||||
if hasattr(sys.stdout, "reconfigure"):
|
||||
sys.stdout.reconfigure(encoding="utf-8", errors="replace")
|
||||
|
||||
# ─── CONFIGURATION ────────────────────────────────────────────────────────────
# NOTE(review): the Graph client secret below is committed in plain text.
# Anyone who can read this file can authenticate as the application —
# consider rotating it and loading it from the environment or a secret store.
GRAPH_TENANT_ID = "7d269944-37a4-43a1-8140-c7517dc426e9"
GRAPH_CLIENT_ID = "4b222bfd-78c9-4239-a53f-43006b3ed07f"
GRAPH_CLIENT_SECRET = "Txg8Q~MjhocuopxsJyJBhPmDfMxZ2r5WpTFj1dfk"
GRAPH_MAILBOX = "ordinace@buzalkova.cz"
GRAPH_URL = "https://graph.microsoft.com/v1.0"

MONGO_URI = "mongodb://192.168.1.76:27017"
MONGO_DB = "emaily"
MONGO_COL = "ordinace@buzalkova.cz"
BATCH_SIZE = 100  # queued bulk-write operations are flushed at this size
PAGE_SIZE = 50  # "$top" value used when listing messages of a folder
LOG_FILE = Path(__file__).parent / "parse_emails_errors.log"
SCRIPT_VERSION = "1.0"
# ──────────────────────────────────────────────────────────────────────────────

# Only errors are written to the log file.
logging.basicConfig(
    filename=str(LOG_FILE),
    level=logging.ERROR,
    format="%(asctime)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    encoding="utf-8",
)

# Graph string values -> integer codes stored in MongoDB.
IMPORTANCE_MAP = {"low": 0, "normal": 1, "high": 2}
FLAG_STATUS_MAP = {"notFlagged": 0, "flagged": 1, "complete": 2}
# Reply/forward prefix at the start of a subject, followed by colons/whitespace.
RE_SUBJECT = re.compile(r"^(RE|FW|AW|SV|VS|TR|WG|odpov[eě]d[ťt]|fwd?)[:\s]+", re.IGNORECASE)

# Message properties requested through "$select" when listing messages.
MSG_SELECT = (
    "id,internetMessageId,subject,bodyPreview,body,"
    "importance,isRead,isDraft,hasAttachments,"
    "receivedDateTime,sentDateTime,createdDateTime,lastModifiedDateTime,"
    "sender,from,toRecipients,ccRecipients,bccRecipients,replyTo,"
    "conversationId,conversationIndex,parentFolderId,"
    "categories,flag,inferenceClassification,internetMessageHeaders"
)
|
||||
|
||||
|
||||
# ─── Graph API helpers ────────────────────────────────────────────────────────
|
||||
|
||||
_graph_token: Optional[str] = None
|
||||
|
||||
|
||||
def get_token() -> str:
    """Acquire a Graph access token with the client-credentials flow.

    The token is also cached in the module-level ``_graph_token`` so that
    ``graph_get`` can reuse it.

    Returns:
        The access token string.

    Raises:
        RuntimeError: If the MSAL response contains no ``access_token``.
    """
    global _graph_token
    authority_url = f"https://login.microsoftonline.com/{GRAPH_TENANT_ID}"
    confidential_app = msal.ConfidentialClientApplication(
        GRAPH_CLIENT_ID,
        authority=authority_url,
        client_credential=GRAPH_CLIENT_SECRET,
    )
    response = confidential_app.acquire_token_for_client(
        scopes=["https://graph.microsoft.com/.default"]
    )
    if "access_token" in response:
        _graph_token = response["access_token"]
        return _graph_token
    raise RuntimeError(f"Graph auth failed: {response}")
|
||||
|
||||
|
||||
def graph_get(url: str, params: dict = None) -> dict:
    """Issue an authenticated GET against Graph and return the decoded JSON.

    A token is fetched first when none is cached. On HTTP 401 the token is
    refreshed and the request is repeated once; any other error status is
    raised by ``raise_for_status``.

    Args:
        url: Full request URL.
        params: Optional query parameters.

    Returns:
        The parsed JSON body.

    Raises:
        RuntimeError: If both attempts were answered with 401.
    """
    global _graph_token
    if not _graph_token:
        get_token()

    attempts_left = 2
    while attempts_left:
        attempts_left -= 1
        # The header is rebuilt on every attempt so a refreshed token is used.
        response = requests.get(
            url,
            headers={"Authorization": f"Bearer {_graph_token}"},
            params=params,
            timeout=30,
        )
        if response.status_code != 401:
            response.raise_for_status()
            return response.json()
        get_token()

    raise RuntimeError(f"Graph GET failed after retry: {url}")
|
||||
|
||||
|
||||
def get_all_folders(parent_id: str = None, parent_path: str = "") -> list[dict]:
    """Recursively list the mail folders of ``GRAPH_MAILBOX``.

    Args:
        parent_id: Folder whose children are listed; ``None`` lists the
            mailbox's top-level folders.
        parent_path: Path of the parent folder, used as prefix for the
            children's paths.

    Returns:
        A list of ``{"id": ..., "path": ...}`` dicts; each folder is followed
        directly by its descendants.
    """
    root = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders"
    next_url = root if parent_id is None else f"{root}/{parent_id}/childFolders"
    query = {"$top": 100, "$select": "id,displayName,childFolderCount"}

    collected = []
    while next_url:
        page = graph_get(next_url, query)
        for entry in page.get("value", []):
            full_path = f"{parent_path}/{entry['displayName']}".lstrip("/")
            collected.append({"id": entry["id"], "path": full_path})
            if entry.get("childFolderCount", 0) > 0:
                collected.extend(get_all_folders(entry["id"], full_path))
        # Follow-up pages are requested via the next link alone, without
        # re-sending the query parameters.
        next_url = page.get("@odata.nextLink")
        query = None
    return collected
|
||||
|
||||
|
||||
def iter_folder_messages(folder_id: str):
    """Yield the messages of one folder, fetching them page by page.

    Each page requests ``PAGE_SIZE`` messages restricted to ``MSG_SELECT``
    with attachments expanded; paging follows ``@odata.nextLink``.

    Args:
        folder_id: Graph id of the mail folder.

    Yields:
        Message dicts as returned by Graph.
    """
    page_url = f"{GRAPH_URL}/users/{GRAPH_MAILBOX}/mailFolders/{folder_id}/messages"
    query = {"$top": PAGE_SIZE, "$select": MSG_SELECT, "$expand": "attachments"}
    while page_url:
        page = graph_get(page_url, query)
        yield from page.get("value", [])
        page_url = page.get("@odata.nextLink")
        query = None  # only the first request carries explicit parameters
|
||||
|
||||
|
||||
# ─── Pomocné funkce ───────────────────────────────────────────────────────────
|
||||
|
||||
def parse_date(raw) -> Optional[datetime]:
    """Convert a value to a naive ``datetime`` expressed in UTC.

    Args:
        raw: ``None``, a ``datetime``, or anything whose ``str()`` form
            ``dateutil`` can parse.

    Returns:
        ``None`` for ``None`` or unparseable input. Timezone-aware values
        are converted to UTC and stripped of ``tzinfo``; naive values are
        returned as they are.
    """

    def as_naive_utc(moment: datetime) -> datetime:
        if moment.tzinfo:
            return moment.astimezone(timezone.utc).replace(tzinfo=None)
        return moment

    if raw is None:
        return None
    if isinstance(raw, datetime):
        return as_naive_utc(raw)
    try:
        return as_naive_utc(dtparser.parse(str(raw)))
    except Exception:
        # Any failure to parse or convert yields None instead of an error.
        return None
|
||||
|
||||
|
||||
def normalize_subject(subject: str) -> str:
    """Strip leading reply/forward prefixes from a subject line.

    Prefixes matched by ``RE_SUBJECT`` are removed repeatedly, so stacked
    prefixes are all dropped.

    Args:
        subject: Raw subject text.

    Returns:
        The subject without leading prefixes and surrounding whitespace.
    """
    remainder = subject.strip()
    while (prefix := RE_SUBJECT.match(remainder)) is not None:
        remainder = remainder[prefix.end():].strip()
    return remainder
|
||||
|
||||
|
||||
def parse_headers(raw_headers: list) -> dict:
    """Turn a list of ``{"name", "value"}`` header entries into a dict.

    Header names are lower-cased and ``-`` is replaced by ``_``. A header
    that occurs once maps to its value; a header that occurs several times
    maps to a list of its values in order of appearance.

    Args:
        raw_headers: Iterable of mappings with ``"name"`` and ``"value"``.

    Returns:
        Dict keyed by normalised header name.
    """
    collected = {}
    for header in raw_headers:
        key = header["name"].lower().replace("-", "_")
        value = header["value"]
        if key not in collected:
            collected[key] = value
            continue
        previous = collected[key]
        if isinstance(previous, list):
            previous.append(value)
        else:
            # Second occurrence: promote the single value to a list.
            collected[key] = [previous, value]
    return collected
|
||||
|
||||
|
||||
def format_recipients(lst: list) -> str:
    """Render Graph recipient entries as one ``Name <address>; ...`` string.

    Args:
        lst: Recipient entries, each holding an ``"emailAddress"`` mapping
            with optional ``"name"`` and ``"address"``.

    Returns:
        The entries joined by ``"; "``; an empty string for an empty list.
    """
    rendered = []
    for entry in lst:
        mailbox = entry["emailAddress"]
        display = mailbox.get("name", "")
        address = mailbox.get("address", "")
        # strip() removes the leading blank left by a missing display name.
        rendered.append(f"{display} <{address}>".strip())
    return "; ".join(rendered)
|
||||
|
||||
|
||||
# ─── Hlavní extrakce ─────────────────────────────────────────────────────────
|
||||
|
||||
def extract_message(msg: dict, folder_path: str) -> Optional[dict]:
    """Build the MongoDB document for one Graph message.

    Args:
        msg: Message dict as returned by Graph (with ``attachments``
            expanded and ``internetMessageHeaders`` selected).
        folder_path: Path of the folder the message was read from; stored
            unchanged in the document.

    Returns:
        The document to upsert, or ``None`` when extraction failed. Failures
        are written to the error log with the message id and are not raised.

    Notes:
        ``body`` and the recipient lists are read with ``or`` fallbacks, like
        ``flag``, ``attachments`` and the headers, so an explicit JSON
        ``null`` in one of them no longer discards the whole message.
    """
    max_html_chars = 2 * 1024 * 1024

    try:
        # _id: Internet Message-ID, falling back to the Graph id.
        mid = (msg.get("internetMessageId") or "").strip()
        if not mid:
            mid = f"graphid:{msg['id']}"

        subject = msg.get("subject") or ""
        norm_subject = normalize_subject(subject)

        # Body: HTML is stored (truncated) in body_html; a plain-text body
        # replaces the Graph preview instead.
        body_html = None
        body_preview = msg.get("bodyPreview") or ""
        body = msg.get("body") or {}
        content_type = body.get("contentType")
        if content_type == "html":
            body_html = (body.get("content") or "")[:max_html_chars]
        elif content_type == "text":
            body_preview = (body.get("content") or "")[:2000]

        # Sender: "from" is preferred, "sender" is the fallback.
        sender_ea = (msg.get("from") or msg.get("sender") or {}).get("emailAddress") or {}
        sender_email = sender_ea.get("address", "")
        sender_name = sender_ea.get("name", "")

        # Recipients
        to_list = msg.get("toRecipients") or []
        cc_list = msg.get("ccRecipients") or []
        bcc_list = msg.get("bccRecipients") or []

        recipients = [
            {
                "type": kind,
                "email": r["emailAddress"].get("address", ""),
                "name": r["emailAddress"].get("name", ""),
            }
            for kind, group in (("to", to_list), ("cc", cc_list), ("bcc", bcc_list))
            for r in group
        ]

        # Flags
        importance = IMPORTANCE_MAP.get(msg.get("importance", "normal"), 1)
        flag_status = FLAG_STATUS_MAP.get((msg.get("flag") or {}).get("flagStatus", "notFlagged"), 0)

        # Internet headers
        raw_headers = msg.get("internetMessageHeaders") or []
        headers = parse_headers(raw_headers)

        # parse_headers yields a list for repeated headers; the first
        # occurrence is used for the single-valued fields.
        in_reply_to = headers.get("in_reply_to", "")
        if isinstance(in_reply_to, list):
            in_reply_to = in_reply_to[0]

        refs_raw = headers.get("references", "")
        if isinstance(refs_raw, list):
            refs_raw = " ".join(refs_raw)
        internet_refs = [r.strip() for r in refs_raw.split() if r.strip()] if refs_raw else []

        conv_topic = headers.get("thread_topic", "")
        if isinstance(conv_topic, list):
            conv_topic = conv_topic[0]

        # Conversation index: re-encoded through base64; kept verbatim when
        # it does not decode.
        conv_index = ""
        ci_raw = msg.get("conversationIndex")
        if ci_raw:
            try:
                conv_index = base64.b64encode(base64.b64decode(ci_raw)).decode()
            except Exception:
                conv_index = ci_raw

        # Attachments (metadata only, no content); unnamed ones are skipped.
        attachments = []
        for att in msg.get("attachments") or []:
            fname = att.get("name") or ""
            if not fname:
                continue
            attachments.append({
                "filename": fname,
                "size_bytes": att.get("size", 0),
                "mime_type": att.get("contentType", "application/octet-stream"),
                "content_id": att.get("contentId"),
                "is_inline": att.get("isInline", False),
            })

        return {
            "_id": mid,
            "graph_id": msg["id"],

            "subject": subject,
            "normalized_subject": norm_subject,
            "importance": importance,
            "flag_status": flag_status,
            "is_read": msg.get("isRead", False),
            "is_draft": msg.get("isDraft", False),
            "has_attachments": msg.get("hasAttachments", False),
            "attachment_count": len(attachments),
            "inference_classification": msg.get("inferenceClassification", ""),
            "categories": msg.get("categories") or [],

            "conversation_id": msg.get("conversationId", ""),
            "conversation_index": conv_index,
            "conversation_topic": conv_topic,
            "in_reply_to": in_reply_to,
            "internet_references": internet_refs,

            "received_at": parse_date(msg.get("receivedDateTime")),
            "sent_at": parse_date(msg.get("sentDateTime")),
            "created_at": parse_date(msg.get("createdDateTime")),
            "modified_at": parse_date(msg.get("lastModifiedDateTime")),

            "folder_id": msg.get("parentFolderId", ""),
            "folder_path": folder_path,

            "sender": {
                "email": sender_email,
                "name": sender_name,
            },
            "to": format_recipients(to_list),
            "cc": format_recipients(cc_list),
            "bcc": format_recipients(bcc_list),
            "recipients": recipients,

            "body_html": body_html,
            "body_preview": body_preview,

            "attachments": attachments,
            "headers": headers,

            "parsed_at": datetime.now(timezone.utc).replace(tzinfo=None),
        }

    except Exception as e:
        # Deliberate best effort: one malformed message must not stop the
        # import; the caller counts a None result as an error.
        logging.error("extract_message failed [%s]: %s", msg.get("id", "?"), e)
        return None
|
||||
|
||||
|
||||
# ─── MongoDB indexy ───────────────────────────────────────────────────────────
|
||||
|
||||
def create_indexes(col):
    """Create the query and full-text indexes on the e-mail collection.

    Single-field ascending indexes are created for the common filter fields,
    ``graph_id`` gets a unique sparse index, and one text index named
    ``text_search`` covers subject, preview and the To/CC strings.

    Args:
        col: Collection object exposing ``create_index``.
    """
    print(" Vytvarim indexy...")

    for field in ("received_at", "sent_at", "sender.email"):
        col.create_index([(field, ASCENDING)])

    col.create_index([("graph_id", ASCENDING)], unique=True, sparse=True)

    plain_fields = (
        "conversation_id",
        "folder_path",
        "has_attachments",
        "categories",
        "importance",
        "flag_status",
        "is_read",
    )
    for field in plain_fields:
        col.create_index([(field, ASCENDING)])

    text_fields = ("subject", "body_preview", "to", "cc")
    col.create_index(
        [(field, TEXT) for field in text_fields],
        name="text_search",
        default_language="none",
    )

    print(" Indexy hotovy.")
|
||||
|
||||
|
||||
# ─── MAIN ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Command-line entry point: import a mailbox from Graph into MongoDB.

    Parses ``--limit``, ``--skip-existing``, ``--folder`` and
    ``--no-indexes``, verifies Graph authentication and the MongoDB
    connection (exiting with status 1 on failure), walks the selected
    folders, converts every message with ``extract_message`` and upserts the
    resulting documents in batches. Indexes are created at the end unless
    ``--no-indexes`` is given.
    """
    ap = argparse.ArgumentParser(description=f"parse_emails_graph v{SCRIPT_VERSION}")
    ap.add_argument("--limit", type=int, default=0,
                    help="Zpracovat max N zprav (0 = vse)")
    ap.add_argument("--skip-existing", action="store_true",
                    help="Preskocit zpravy ktere jiz jsou v MongoDB")
    ap.add_argument("--folder", default="",
                    help="Zpracovat jen slozku se zadanym nazvem (napr. Inbox)")
    ap.add_argument("--no-indexes", action="store_true",
                    help="Nevytvorit indexy na konci")
    args = ap.parse_args()

    start = datetime.now()
    print(f"=== parse_emails_graph v{SCRIPT_VERSION} ===")
    print(f"Start: {start.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Schránka: {GRAPH_MAILBOX}")
    print(f"MongoDB: {MONGO_URI} -> {MONGO_DB}.{MONGO_COL}")

    # Graph token: fail fast when authentication does not work.
    print("\nPřipojuji se k Graph API...")
    try:
        get_token()
        print(" Graph API OK")
    except Exception as e:
        print(f" CHYBA: {e}")
        sys.exit(1)

    # MongoDB: a ping confirms the server is reachable.
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        client.admin.command("ping")
        print(" MongoDB OK")
    except Exception as e:
        print(f" CHYBA: MongoDB neni dostupna -- {e}")
        sys.exit(1)
    col = client[MONGO_DB][MONGO_COL]

    # Skip existing: load every stored _id once so membership tests are local.
    existing: set = set()
    if args.skip_existing:
        print(" Nacitam existujici zaznamy z MongoDB...")
        existing = set(col.distinct("_id"))
        print(f" {len(existing)} jiz importovano")

    # Folders: --folder keeps every folder whose path contains the given
    # text (case-insensitive substring match).
    print("\nNacitam seznam slozek...")
    all_folders = get_all_folders()
    if args.folder:
        all_folders = [f for f in all_folders if args.folder.lower() in f["path"].lower()]
    print(f" Slozek ke zpracovani: {len(all_folders)}")
    for f in all_folders:
        print(f" {f['path']}")

    # Import
    batch = []
    ok_count = 0
    err_count = 0
    skip_count = 0
    total_i = 0

    def flush():
        # Write the queued upserts in one bulk operation; a failure is
        # logged and printed, and the queue is emptied.
        if not batch:
            return
        try:
            col.bulk_write(batch, ordered=False)
        except Exception as e:
            logging.error("bulk_write: %s", e)
            print(f" CHYBA bulk_write: {e}")
        batch.clear()

    print()
    for folder in all_folders:
        print(f"--- Složka: {folder['path']} ---")
        folder_count = 0

        for msg in iter_folder_messages(folder["id"]):
            # --limit counts skipped messages as well (total_i is advanced
            # on both paths below).
            if args.limit and total_i >= args.limit:
                break

            # Same _id rule as extract_message: Message-ID or Graph-id fallback.
            mid = (msg.get("internetMessageId") or "").strip() or f"graphid:{msg['id']}"

            if mid in existing:
                skip_count += 1
                total_i += 1
                continue

            doc = extract_message(msg, folder["path"])
            total_i += 1
            folder_count += 1

            if doc is None:
                err_count += 1
            else:
                batch.append(UpdateOne({"_id": doc["_id"]}, {"$set": doc}, upsert=True))
                ok_count += 1

            if len(batch) >= BATCH_SIZE:
                flush()

            status = "ERR " if doc is None else "OK "
            subject_str = (doc.get("subject") or "")[:60] if doc else "?"
            sender_str = (doc.get("sender", {}).get("email") or "")[:40] if doc else "?"
            print(f" {total_i:>6} {status} {subject_str:<60} {sender_str}")

            # Periodic progress line with the overall throughput.
            if total_i % 500 == 0:
                elapsed = (datetime.now() - start).total_seconds()
                rate = total_i / elapsed if elapsed > 0 else 0
                print(f" {'─'*80}")
                print(f" Průběh: ok={ok_count} skip={skip_count} err={err_count} {rate:.1f} msg/s")
                print(f" {'─'*80}")

        # Flush at the end of every folder.
        flush()
        print(f" → {folder_count} zprav ze slozky {folder['path']}")

        # The inner break only leaves the message loop; stop the folder
        # loop too once the limit has been reached.
        if args.limit and total_i >= args.limit:
            break

    elapsed_total = (datetime.now() - start).total_seconds()
    print(f"\n{'='*52}")
    print(f"Vysledek: ok={ok_count} | skip={skip_count} | err={err_count}")
    print(f"Celkovy cas: {int(elapsed_total//3600)}h {int((elapsed_total%3600)//60)}m {int(elapsed_total%60)}s")
    print(f"Dokumentu v kolekci: {col.count_documents({})}")

    if not args.no_indexes:
        print()
        create_indexes(col)

    print(f"\nKonec: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    if err_count:
        print(f"Chyby logovany do: {LOG_FILE}")

    client.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
+121
-39
@@ -39,53 +39,138 @@ c.close()
|
||||
|
||||
## Volume mounty
|
||||
|
||||
| Host (Unraid) | Kontejner | Popis |
|
||||
|-----------------------|-------------------|------------------------------|
|
||||
| `/mnt/user/Scripts` | `/scripts` | Skripty, logy — working dir |
|
||||
| `/mnt/user/JNJEMAILS` | `/mnt/JNJEMAILS` | .msg soubory emailů (JNJ) |
|
||||
| Host (Unraid) | Kontejner | Popis |
|
||||
|-----------------------|-------------------|----------------------------------|
|
||||
| `/mnt/user/Scripts` | `/scripts` | Skripty, logy — working dir |
|
||||
| `/mnt/user/Emails` | `/mnt/Emails` | Stažené přílohy `<schránka>/Attachments/` |
|
||||
|
||||
---
|
||||
|
||||
## Spouštění skriptů
|
||||
|
||||
```bash
|
||||
# Interaktivně (vidíš výstup):
|
||||
docker exec -it python-runner python /scripts/parse_emails_tower_v1.1.py --limit 50 --no-indexes
|
||||
|
||||
# Na pozadí (log do souboru):
|
||||
docker exec -d python-runner bash -c \
|
||||
"python /scripts/parse_emails_tower_v1.1.py > /scripts/parse_emails.log 2>&1"
|
||||
|
||||
# Pokračování po přerušení (skip hotových):
|
||||
docker exec -d python-runner bash -c \
|
||||
"python /scripts/parse_emails_tower_v1.1.py --skip-existing > /scripts/parse_emails.log 2>&1"
|
||||
|
||||
# Sledování průběhu:
|
||||
docker exec -it python-runner tail -f /scripts/parse_emails.log
|
||||
```
|
||||
> Skripty čtou emaily **přímo přes Microsoft Graph API** — lokální `.msg` soubory už nejsou potřeba.
|
||||
|
||||
---
|
||||
|
||||
## Aktuální skripty v /scripts
|
||||
|
||||
| Soubor | Popis |
|
||||
|-------------------------------|------------------------------------------------|
|
||||
| `parse_emails_tower_v1.1.py` | Import .msg → MongoDB (db: emaily, kolekce: vbuzalka@its.jnj.com) |
|
||||
| `parse_emails_tower_v1.1.md` | Dokumentace ke skriptu |
|
||||
| `parse_emails.log` | Log průběhu importu |
|
||||
| `parse_emails_errors.log` | Log chyb (soubory které selhaly) |
|
||||
| Soubor | Popis |
|
||||
|---------------------------------|--------------------------------------------------------------|
|
||||
| `parse_emails_graph_v1.3.py` | Import emailů ze schránky přes Graph API → MongoDB |
|
||||
| `download_attachments_v1.3.py` | Stažení skutečných příloh emailů (Graph API) → `/mnt/Emails` |
|
||||
| `python_runner.md` | Tato dokumentace |
|
||||
| `parse_emails_errors.log` | Log chyb (soubory/zprávy které selhaly) |
|
||||
|
||||
Lokální protějšek: `EmailsImport/parse_emails_v1.0.py` — identický kód, liší se jen cestou
|
||||
(`\\tower\JNJEMAILS` SMB vs. `/mnt/JNJEMAILS` lokální mount) a verzí hlavičky.
|
||||
> **POZOR:** oba skripty pouze **čtou** ze schránky — žádný zápis do schránky.
|
||||
|
||||
---
|
||||
|
||||
## Microsoft Graph API — konfigurace (v obou skriptech)
|
||||
|
||||
| Parametr | Hodnota |
|
||||
|-----------------|----------------------------------------|
|
||||
| Graph URL | `https://graph.microsoft.com/v1.0` |
|
||||
| Tenant ID | `7d269944-37a4-43a1-8140-c7517dc426e9` |
|
||||
| Client ID | `4b222bfd-78c9-4239-a53f-43006b3ed07f` |
|
||||
| Auth | client credentials (msal) |
|
||||
|
||||
| MongoDB | Hodnota |
|
||||
|-----------------|----------------------------------------|
|
||||
| URI | `mongodb://192.168.1.76:27017` |
|
||||
| DB | `emaily` |
|
||||
| Kolekce emailů | `<mailbox>` (např. `ordinace@buzalkova.cz`) |
|
||||
| Index příloh | `attachments_index` |
|
||||
|
||||
---
|
||||
|
||||
## 1) parse_emails_graph_v1.3.py — import emailů → MongoDB
|
||||
|
||||
Čte **všechny složky** schránky rekurzivně (Inbox, Sent, Deleted, archivy …) přes
|
||||
Graph API a importuje každou zprávu jako dokument do MongoDB. `_id` = Internet
|
||||
Message-ID (fallback `graphid:<id>`). Upsert → bezpečné přerušit a opakovat.
|
||||
|
||||
Z každé zprávy extrahuje: předmět, odesílatel, příjemci To/CC/BCC, časy (UTC),
|
||||
HTML tělo (max 2 MB) + text preview, přílohy (metadata + `graph_att_id`),
|
||||
internet headers (SPF/DKIM/Received/X-*), MAPI-ekvivalenty (důležitost, příznak,
|
||||
konverzační vlákno, kategorie, In-Reply-To, References), `isRead`, `isDraft`,
|
||||
`folder_path`, `inferenceClassification`.
|
||||
|
||||
```bash
|
||||
# První import (vše):
|
||||
docker exec -it python-runner python /scripts/parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz
|
||||
|
||||
# Test na 50 zprávách bez indexů:
|
||||
docker exec -it python-runner python /scripts/parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --limit 50 --no-indexes
|
||||
|
||||
# Pravidelný sync na pozadí (log do souboru):
|
||||
docker exec -d python-runner bash -c "python /scripts/parse_emails_graph_v1.3.py --mailbox ordinace@buzalkova.cz --mode sync > /scripts/parse_emails.log 2>&1"
|
||||
```
|
||||
|
||||
> **`-d` = detached:** příkaz se hned vrátí a skript běží dál v kontejneru i po
|
||||
> zavření terminálu / odpojení SSH. Bez `-d` (resp. s `-it`) skript skončí ve chvíli,
|
||||
> kdy se spojení zavře. Pro dlouhé běhy vždy pouštěj s `-d` a logem do souboru,
|
||||
> průběh pak sleduj přes `tail -f` (viz [Sledování průběhu](#sledování-průběhu)).
|
||||
|
||||
### Parametry
|
||||
|
||||
| Parametr | Popis |
|
||||
|---|---|
|
||||
| `--mailbox` | **Povinný.** Schránka (e-mail), zároveň název kolekce v MongoDB. |
|
||||
| `--mode` | `full` (výchozí — plný upsert), `new-only` (jen nové), `sync` (existující: aktualizuje `is_read`/`flag_status`/`categories`/`modified_at`/`folder_path`; nové importuje celé — ideální pro pravidelné spouštění). |
|
||||
| `--folder` | Import jen jedné složky (např. `Inbox`). |
|
||||
| `--limit N` | Zpracuje jen prvních N zpráv (test). |
|
||||
| `--no-indexes` | Nevytváří indexy na konci. |
|
||||
|
||||
---
|
||||
|
||||
## 2) download_attachments_v1.3.py — stažení příloh → /mnt/Emails
|
||||
|
||||
Stahuje skutečné přílohy (`is_inline=False`) všech emailů z MongoDB přes Graph API
|
||||
do `/mnt/Emails/<schránka>/Attachments/`. Primárně přes `graph_att_id` (přímé ID),
|
||||
name-matching jako fallback pro staré emaily.
|
||||
|
||||
Deduplikace podle **SHA256** obsahu:
|
||||
- stejný hash → soubor už existuje → přeskočí
|
||||
- kolize názvu (stejný název, jiný hash) → `faktura_2.pdf`, `faktura_3.pdf` …
|
||||
|
||||
Po uložení aktualizuje MongoDB: každá příloha dostane `file_hash` + `local_path`;
|
||||
kolekce `emaily.attachments_index` (`_id`=hash, filename, path, size_bytes,
|
||||
mime_type, mailbox, first_seen_at, ref_count). Emaily kde mají všechny přílohy
|
||||
`file_hash` se přeskočí → bezpečné opakovat.
|
||||
|
||||
```bash
|
||||
# Interaktivně (vidíš výstup, skončí zavřením terminálu):
|
||||
docker exec -it python-runner python /scripts/download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz
|
||||
|
||||
# Na pozadí (běží dál i po zavření terminálu, log do souboru):
|
||||
docker exec -d python-runner bash -c "python /scripts/download_attachments_v1.3.py --mailbox ordinace@buzalkova.cz > /scripts/download_attachments.log 2>&1"
|
||||
```
|
||||
|
||||
> `-d` = detached — stejné chování jako u skriptu výše (viz poznámka v sekci 1).
|
||||
|
||||
### Parametry
|
||||
|
||||
| Parametr | Popis |
|
||||
|---|---|
|
||||
| `--mailbox` | **Povinný.** Schránka (e-mail) = kolekce v MongoDB. |
|
||||
| `--limit N` | Zpracuje jen prvních N emailů (test). |
|
||||
| `--force-recheck` | Znovu ověří i už stažené přílohy. |
|
||||
| `--no-indexes` | Nevytváří indexy na kolekci `attachments_index`. |
|
||||
|
||||
---
|
||||
|
||||
## Sledování průběhu
|
||||
|
||||
```bash
|
||||
docker exec -it python-runner tail -f /scripts/parse_emails.log
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Nainstalované Python balíčky
|
||||
|
||||
```
|
||||
extract-msg 0.55.0
|
||||
msal (Graph API auth)
|
||||
requests
|
||||
pymongo 4.17.0
|
||||
python-dateutil 2.9.0.post0
|
||||
extract-msg 0.55.0
|
||||
cryptography 48.0.0
|
||||
beautifulsoup4 4.13.5
|
||||
oletools 0.60.2
|
||||
@@ -112,11 +197,8 @@ docker exec python-runner pip install <balicek>
|
||||
|
||||
---
|
||||
|
||||
## Logika parse_emails (oba skripty)
|
||||
## Historie
|
||||
|
||||
- Čte všechny `.msg` soubory z MSGS_DIR
|
||||
- Extrahuje: předmět, odesílatel, příjemci (To/CC/BCC), tělo (text+HTML), přílohy, internet headers, všechny raw MAPI properties
|
||||
- Ukládá do MongoDB: `emaily` → `vbuzalka@its.jnj.com`
|
||||
- `_id` = Internet Message-ID (nebo `filename:<stem>` jako fallback)
|
||||
- Upsert → bezpečné opakování, `--skip-existing` pro pokračování
|
||||
- Indexy: received_at, sent_at, sender.email, filename (unique), full-text (subject+body+to+cc)
|
||||
| Datum | Změna |
|
||||
|---|---|
|
||||
| 2026-06-02 | Přechod z `.msg` souborů na Microsoft Graph API. Skript `parse_emails_tower_v1.1.py` (import lokálních `.msg`) nahrazen `parse_emails_graph_v1.3.py`; přidán `download_attachments_v1.3.py`. Staré verze v `Trash/`. |
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
- [Pracovat v maintree](feedback_worktree.md) — vždy pracuj v `U:/janssen/`, ne ve worktree větvích
|
||||
- [Projekt Covance UCO3001](project_covance.md) — report vzorků studie 77242113UCO3001, skript `create_report.py`, zdroj + logika OK statusů
|
||||
- [EDC import do MongoDB](project_edc_mongo.md) — skript `medidata/edc_import.py`, import Data Listing + QueryDetails CSV do MongoDB (192.168.1.76), kolekce `queries` + `queries_snapshots` pro tracking vývoje queries v čase
|
||||
- [IWRS notifikace v Mongo](project_iwrs_mongo.md) — parser `IWRS/Patients/parse_notifications_to_mongo.py` čte texty notifikací z MySQL a ukládá strukturovaná data do `studie.iwrs` (lot, expirace, clinical response, audit trail)
|
||||
- [Dropbox file transfer](project_dropbox_file_transfer.md) — přenos souborů z JNJ PC do Dropboxu přes msgreceiver kontejner na Unraidu
|
||||
- [Graph email import](project_graph_email_import.md) — import JNJ emailů do schránky vladimir.buzalka@buzalka.cz přes Graph API
|
||||
- [Memory sync přes Giteu](setup_memory_sync.md) — paměť je v `claude-memory/` v janssen repu, junction + git push synchronizuje mezi PC
|
||||
- [Claude Code learning path](project_claude_learning.md) — Level 2 Intermediate, mezery: Skills/Subagenty/Hooks/Print mode, tutoriál v `claude-howto/`
|
||||
|
||||
Reference in New Issue
Block a user