# =============================================================================
# Name:        import_to_mongo_v1.2.py
# Version:     1.2
# Date:        2026-05-28
# Description: Import CSV reports into MongoDB (db: covance).
#   Pipeline 1 - allSamples: collection allsamples, key Container Barcode No.
#     Source: UCO3001/Source + MDD3003/Source
#   Pipeline 2 - kits: collection kits, key Accession
#     Source: UCO3001/Source (both studies 36940 + 35472)
#   Upsert with change history; each processed file is moved to Zpracovano/.
# =============================================================================

import csv
import re
import shutil
import sys
from datetime import datetime
from pathlib import Path

from pymongo import MongoClient, ASCENDING

MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "covance"

# Source directories are resolved relative to this script's location.
UCO3001_SOURCE = Path(__file__).parent / "Source"
MDD3003_SOURCE = Path(__file__).parent.parent / "Covance_MDD3003" / "Source"

# One entry per import pipeline:
#   name        - label used in console output
#   collection  - target MongoDB collection
#   upsert_key  - CSV column whose value becomes the document's record_id
#   pattern     - regex a file name must match to belong to this pipeline
#   sources     - directories scanned when no files are given on the CLI
#   indexes     - secondary indexes ensured on the collection before import
PIPELINES = [
    {
        "name": "allsamples",
        "collection": "allsamples",
        "upsert_key": "Container Barcode No.",
        "pattern": re.compile(r".*-allSamples\.csv$", re.IGNORECASE),
        "sources": [UCO3001_SOURCE, MDD3003_SOURCE],
        "indexes": [
            [("fields.Sample Status", ASCENDING)],
            [("fields.Specimen Type", ASCENDING)],
        ],
    },
    {
        "name": "kits",
        "collection": "kits",
        "upsert_key": "Accession",
        "pattern": re.compile(r".*-kit-inventory-on-hand-expiration\.csv$", re.IGNORECASE),
        "sources": [UCO3001_SOURCE],
        "indexes": [
            [("fields.Kit Type", ASCENDING)],
            [("fields.Site", ASCENDING)],
            [("fields.Expiration Date", ASCENDING)],
        ],
    },
]


def extract_snapshot_date(filename: str) -> str:
    """Return the leading YYYY-MM-DD of the file name, or today's date.

    Only the final path component is inspected. When the name does not start
    with a date-shaped prefix, the current local date is used instead.
    """
    match = re.match(r"(\d{4}-\d{2}-\d{2})", Path(filename).name)
    return match.group(1) if match else datetime.now().strftime("%Y-%m-%d")


def clean_value(val: str | None) -> str | None:
    """Strip surrounding whitespace; map empty or missing values to None.

    ``csv.DictReader`` fills the missing cells of a short row with ``None``,
    so ``None`` must be accepted here rather than crashing on ``.strip()``.
    """
    if val is None:
        return None
    val = val.strip()
    return val if val else None


def import_file(csv_path: Path, collection, upsert_key: str) -> dict:
    """Upsert every row of one CSV file into ``collection``.

    Each row is stored as a document keyed by ``record_id`` (the row's
    ``upsert_key`` value). New keys are inserted; for existing keys whose
    fields differ, the previous fields are appended to ``history`` before
    being replaced; unchanged rows only get ``lastSeen``/``sourceFile``
    refreshed. Rows without a key value are skipped.

    Args:
        csv_path: CSV file to read (decoded as UTF-8, optional BOM removed).
        collection: Target collection object (find_one / insert_one /
            update_one / count_documents are called on it).
        upsert_key: Name of the CSV column holding the record key.

    Returns:
        Dict with the counts ``inserted``, ``changed`` and ``unchanged``.
    """
    # NOTE(review): lastSeen is always overwritten with this file's snapshot
    # date, so importing an older file after a newer one moves it backwards.
    snapshot_date = extract_snapshot_date(csv_path.name)
    inserted = changed = unchanged = skipped = 0

    # Read everything first so the file is closed before any DB round trips.
    with open(csv_path, newline="", encoding="utf-8-sig") as f:
        rows = list(csv.DictReader(f))

    for row in rows:
        # Falsy keys (empty header names, DictReader's None key for surplus
        # cells) are dropped.
        fields = {k: clean_value(v) for k, v in row.items() if k}
        key_val = fields.get(upsert_key)
        if not key_val:
            skipped += 1
            continue

        existing = collection.find_one({"record_id": key_val})
        if existing is None:
            collection.insert_one({
                "record_id": key_val,
                "fields": fields,
                "sourceFile": csv_path.name,
                "firstSeen": snapshot_date,
                "lastSeen": snapshot_date,
                "history": [],
            })
            inserted += 1
        elif existing["fields"] != fields:
            # Archive the previous state under the date it was last seen.
            collection.update_one(
                {"_id": existing["_id"]},
                {
                    "$push": {"history": {"date": existing["lastSeen"], "fields": existing["fields"]}},
                    "$set": {"fields": fields, "sourceFile": csv_path.name, "lastSeen": snapshot_date},
                },
            )
            changed += 1
        else:
            collection.update_one(
                {"_id": existing["_id"]},
                {"$set": {"lastSeen": snapshot_date, "sourceFile": csv_path.name}},
            )
            unchanged += 1

    total_rows = len(rows)
    db_count = collection.count_documents({})
    print(f" [{snapshot_date}]: +{inserted} new, ~{changed} changed, ={unchanged} same, -{skipped} bez klice")
    print(f" Radku v CSV: {total_rows}, dokumentu v DB: {db_count}")
    # Defensive consistency check: every row must land in exactly one counter.
    accounted = inserted + changed + unchanged + skipped
    if accounted != total_rows:
        print(f" !!! VAROVANI: soucet ({accounted}) != radku v CSV ({total_rows})")
    return {"inserted": inserted, "changed": changed, "unchanged": unchanged}


def collect_files(pipeline: dict, cli_args: list[str]) -> list[Path]:
    """Return the CSV files this pipeline should import.

    With ``cli_args``, only those arguments that are existing files and whose
    name matches the pipeline pattern are returned (in argument order).
    Without arguments, every existing source directory is scanned for
    matching ``*.csv`` files, sorted within each directory.
    """
    pattern = pipeline["pattern"]
    if cli_args:
        candidates = (Path(arg) for arg in cli_args)
        return [p for p in candidates if p.is_file() and pattern.match(p.name)]

    paths = []
    for src_dir in pipeline["sources"]:
        if src_dir.exists():
            paths.extend(sorted(p for p in src_dir.glob("*.csv") if pattern.match(p.name)))
    return paths


def run_pipeline(pipeline: dict, client, cli_args: list[str]):
    """Import all files of one pipeline and archive each one afterwards.

    Ensures the unique ``record_id`` index plus the pipeline's secondary
    indexes, imports every collected file, moves it into a ``Zpracovano``
    directory next to it, and prints per-pipeline totals.
    """
    paths = collect_files(pipeline, cli_args)
    if not paths:
        print(f"[{pipeline['name']}] Zadne soubory k importu.")
        return

    print(f"\n=== Pipeline: {pipeline['name']} ({len(paths)} souboru) ===")
    col = client[DB_NAME][pipeline["collection"]]
    col.create_index([("record_id", ASCENDING)], unique=True)
    for idx in pipeline["indexes"]:
        col.create_index(idx)

    total = {"inserted": 0, "changed": 0, "unchanged": 0}
    for csv_path in paths:
        print(f"Import: {csv_path.name} [{csv_path.parent.parent.name}]")
        stats = import_file(csv_path, col, pipeline["upsert_key"])
        for k in total:
            total[k] += stats[k]

        dest = csv_path.parent / "Zpracovano" / csv_path.name
        # Create the archive directory next to the file actually processed:
        # a configured source directory may be absent, and files passed on
        # the command line may live outside the configured sources.
        dest.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(csv_path), str(dest))
        print(" -> presunut do Zpracovano/\n")

    print(f"[{pipeline['name']}] Celkem: +{total['inserted']} new, ~{total['changed']} changed, ={total['unchanged']} same")


def main():
    """Connect to MongoDB, verify the connection and run every pipeline."""
    cli_args = sys.argv[1:]
    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        # Fail fast if the server is unreachable before touching any file.
        client.admin.command("ping")
        for pipeline in PIPELINES:
            run_pipeline(pipeline, client, cli_args)
    finally:
        # Release the connection even when the ping or an import raises.
        client.close()


if __name__ == "__main__":
    main()