z230

2026-06-09 08:22:49 +02:00
parent cf5e681a42
commit 915357cca9
2110 changed files with 54778 additions and 1 deletions
@@ -0,0 +1,181 @@
+# =============================================================================
+# Název:   import_to_mongo_v1.2.py
+# Verze:   1.2
+# Datum:   2026-05-28
+# Popis:   Import CSV reportů do MongoDB (db: covance).
+#          Pipeline 1 — allSamples: kolekce allsamples, klíč Container Barcode No.
+#                       Zdroj: UCO3001/Source + MDD3003/Source
+#          Pipeline 2 — kits: kolekce kits, klíč Accession
+#                       Zdroj: UCO3001/Source (oba study 36940 + 35472)
+#          Upsert s historií změn, zpracovaný soubor přesunut do Zpracovano/.
+# =============================================================================
+
+import csv
+import re
+import shutil
+import sys
+from datetime import datetime
+from pathlib import Path
+
+from pymongo import MongoClient, ASCENDING
+
+MONGO_URI = "mongodb://192.168.1.76:27017"
+DB_NAME   = "covance"
+
+UCO3001_SOURCE = Path(__file__).parent / "Source"
+MDD3003_SOURCE = Path(__file__).parent.parent / "Covance_MDD3003" / "Source"
+
+PIPELINES = [
+    {
+        "name":       "allsamples",
+        "collection": "allsamples",
+        "upsert_key": "Container Barcode No.",
+        "pattern":    re.compile(r".*-allSamples\.csv$", re.IGNORECASE),
+        "sources":    [UCO3001_SOURCE, MDD3003_SOURCE],
+        "indexes":    [
+            [("fields.Sample Status", ASCENDING)],
+            [("fields.Specimen Type", ASCENDING)],
+        ],
+    },
+    {
+        "name":       "kits",
+        "collection": "kits",
+        "upsert_key": "Accession",
+        "pattern":    re.compile(r".*-kit-inventory-on-hand-expiration\.csv$", re.IGNORECASE),
+        "sources":    [UCO3001_SOURCE],
+        "indexes":    [
+            [("fields.Kit Type", ASCENDING)],
+            [("fields.Site", ASCENDING)],
+            [("fields.Expiration Date", ASCENDING)],
+        ],
+    },
+]
+
+
+def extract_snapshot_date(filename: str) -> str:
+    match = re.match(r"(\d{4}-\d{2}-\d{2})", Path(filename).name)
+    return match.group(1) if match else datetime.now().strftime("%Y-%m-%d")
+
+
+def clean_value(val: str) -> str | None:
+    val = val.strip()
+    return val if val else None
+
+
+def import_file(csv_path: Path, collection, upsert_key: str) -> dict:
+    snapshot_date = extract_snapshot_date(csv_path.name)
+    inserted = changed = unchanged = skipped = 0
+
+    with open(csv_path, newline="", encoding="utf-8-sig") as f:
+        reader = csv.DictReader(f)
+        rows = list(reader)
+
+    for row in rows:
+        fields = {k: clean_value(v) for k, v in row.items() if k}
+
+        key_val = fields.get(upsert_key)
+        if not key_val:
+            skipped += 1
+            continue
+
+        existing = collection.find_one({"record_id": key_val})
+
+        if existing is None:
+            collection.insert_one({
+                "record_id":  key_val,
+                "fields":     fields,
+                "sourceFile": csv_path.name,
+                "firstSeen":  snapshot_date,
+                "lastSeen":   snapshot_date,
+                "history":    [],
+            })
+            inserted += 1
+
+        elif existing["fields"] != fields:
+            collection.update_one(
+                {"_id": existing["_id"]},
+                {
+                    "$push": {"history": {"date": existing["lastSeen"], "fields": existing["fields"]}},
+                    "$set":  {"fields": fields, "sourceFile": csv_path.name, "lastSeen": snapshot_date},
+                },
+            )
+            changed += 1
+
+        else:
+            collection.update_one(
+                {"_id": existing["_id"]},
+                {"$set": {"lastSeen": snapshot_date, "sourceFile": csv_path.name}},
+            )
+            unchanged += 1
+
+    total_rows = len(rows)
+    db_count   = collection.count_documents({})
+    print(f"  [{snapshot_date}]: +{inserted} new, ~{changed} changed, ={unchanged} same, -{skipped} bez klice")
+    print(f"  Radku v CSV: {total_rows}, dokumentu v DB: {db_count}")
+
+    if inserted + changed + unchanged + skipped != total_rows:
+        print(f"  !!! VAROVANI: soucet ({inserted+changed+unchanged+skipped}) != radku v CSV ({total_rows})")
+
+    return {"inserted": inserted, "changed": changed, "unchanged": unchanged}
+
+
+def collect_files(pipeline: dict, cli_args: list[str]) -> list[Path]:
+    if cli_args:
+        paths = []
+        for arg in cli_args:
+            p = Path(arg)
+            if p.is_file() and pipeline["pattern"].match(p.name):
+                paths.append(p)
+        return paths
+
+    paths = []
+    for src_dir in pipeline["sources"]:
+        if src_dir.exists():
+            paths.extend(sorted(p for p in src_dir.glob("*.csv") if pipeline["pattern"].match(p.name)))
+    return paths
+
+
+def run_pipeline(pipeline: dict, client, cli_args: list[str]):
+    paths = collect_files(pipeline, cli_args)
+    if not paths:
+        print(f"[{pipeline['name']}] Zadne soubory k importu.")
+        return
+
+    print(f"\n=== Pipeline: {pipeline['name']} ({len(paths)} souboru) ===")
+
+    col = client[DB_NAME][pipeline["collection"]]
+    col.create_index([("record_id", ASCENDING)], unique=True)
+    for idx in pipeline["indexes"]:
+        col.create_index(idx)
+
+    for src_dir in pipeline["sources"]:
+        (src_dir / "Zpracovano").mkdir(exist_ok=True)
+
+    total = {"inserted": 0, "changed": 0, "unchanged": 0}
+
+    for csv_path in paths:
+        print(f"Import: {csv_path.name}  [{csv_path.parent.parent.name}]")
+        stats = import_file(csv_path, col, pipeline["upsert_key"])
+        for k in total:
+            total[k] += stats[k]
+        dest = csv_path.parent / "Zpracovano" / csv_path.name
+        shutil.move(str(csv_path), str(dest))
+        print(f"  -> presunut do Zpracovano/\n")
+
+    print(f"[{pipeline['name']}] Celkem: +{total['inserted']} new, ~{total['changed']} changed, ={total['unchanged']} same")
+
+
+def main():
+    cli_args = sys.argv[1:]
+
+    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+    client.admin.command("ping")
+
+    for pipeline in PIPELINES:
+        run_pipeline(pipeline, client, cli_args)
+
+    client.close()
+
+
+if __name__ == "__main__":
+    main()