Add Panorama MongoDB import + post-import move to Zpracovano

New Panorama/import_to_mongo.py: imports Issues & Deviations XLSX into
MongoDB (db: Panorama, collection: IssuesAndDeviations), all countries,
upsert on record ID with field change history tracking.

Both import scripts (Medidata + Panorama) now move processed files to
Downloads/Zpracovano/ after import to avoid re-processing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-26 14:11:37 +02:00
parent 08d8cd75ee
commit c46815cea7
16 changed files with 205 additions and 0 deletions
+8
View File
@@ -12,6 +12,7 @@ Použití:
import csv import csv
import re import re
import shutil
import sys import sys
from datetime import datetime, timezone from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
@@ -21,6 +22,7 @@ from pymongo import MongoClient, ASCENDING
MONGO_URI = "mongodb://192.168.1.76:27017" MONGO_URI = "mongodb://192.168.1.76:27017"
DB_NAME = "edc" DB_NAME = "edc"
DOWNLOADS_DIR = Path(__file__).parent / "downloads" DOWNLOADS_DIR = Path(__file__).parent / "downloads"
PROCESSED_DIR = DOWNLOADS_DIR / "Zpracovano"
COUNTRY_FILTER = "CZE" COUNTRY_FILTER = "CZE"
@@ -266,6 +268,8 @@ def main():
client.admin.command("ping") client.admin.command("ping")
db = client[DB_NAME] db = client[DB_NAME]
PROCESSED_DIR.mkdir(exist_ok=True)
total = {"inserted": 0, "changed": 0, "unchanged": 0} total = {"inserted": 0, "changed": 0, "unchanged": 0}
for csv_path in paths: for csv_path in paths:
@@ -275,6 +279,10 @@ def main():
for k in total: for k in total:
total[k] += stats.get(k, 0) total[k] += stats.get(k, 0)
dest = PROCESSED_DIR / csv_path.name
shutil.move(str(csv_path), str(dest))
print(f" -> presunut do Zpracovano/")
client.close() client.close()
print(f"\nCelkem: +{total['inserted']} new, ~{total['changed']} changed, ={total['unchanged']} same") print(f"\nCelkem: +{total['inserted']} new, ~{total['changed']} changed, ={total['unchanged']} same")
+197
View File
@@ -0,0 +1,197 @@
"""
Import Panorama Issues & Deviations XLSX files into MongoDB (database: Panorama).
Collection: IssuesAndDeviations
Filter: controlled by COUNTRY_FILTER; with the current value (None) rows for all countries are imported.
History: when a record's fields change, the previous version is appended to its history[] array.
After import, each processed file is moved to Downloads/Zpracovano/.
Usage:
    python import_to_mongo.py # imports every *.xlsx file in Downloads/
    python import_to_mongo.py Downloads/konkretni.xlsx # a single file
"""
import re
import shutil
import sys
from datetime import datetime, date, time
from pathlib import Path

import openpyxl
from pymongo import MongoClient, ASCENDING
# Connection string handed to MongoClient in main().
MONGO_URI = "mongodb://192.168.1.76:27017"
# Target database and collection for the imported records.
DB_NAME = "Panorama"
COLLECTION_NAME = "IssuesAndDeviations"
# Directory scanned for *.xlsx files when no paths are given on the command line.
DOWNLOADS_DIR = Path(__file__).parent / "Downloads"
# Files are moved here after they have been imported.
PROCESSED_DIR = DOWNLOADS_DIR / "Zpracovano"
# Value the "Country Name" column must equal for a row to be imported.
COUNTRY_FILTER = None  # None = all countries
HEADER_ROW = 5  # 0-indexed row that holds the column headers
DATA_START_ROW = 6  # 0-indexed first data row
UPSERT_KEY = "ID"  # column whose value is the unique key used for the upsert
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def extract_snapshot_date(filename: str) -> str:
    """Return the snapshot date (YYYY-MM-DD) encoded at the start of a file name.

    Only the final path component is inspected. When it does not begin with
    a ``YYYY-MM-DD`` prefix, today's local date is returned instead.
    """
    base_name = Path(filename).name
    prefix = re.match(r"(\d{4}-\d{2}-\d{2})", base_name)
    if prefix is None:
        # No date prefix in the name: fall back to the current date.
        return datetime.now().strftime("%Y-%m-%d")
    return prefix.group(1)
def clean_value(val):
    """Normalize a spreadsheet cell value before it is stored in MongoDB.

    - ``None`` is returned unchanged.
    - ``datetime``, ``date`` and ``time`` values are converted to ISO 8601
      strings.
    - Strings are stripped; a string that is empty after stripping becomes
      ``None``.
    - Any other value (numbers, booleans, ...) is returned unchanged.
    """
    if val is None:
        return None
    # datetime is a subclass of date, so a single check covers both.
    # time is included because openpyxl yields datetime.time for time-only
    # cells and BSON has no encoding for it (nor for plain date).
    if isinstance(val, (date, time)):
        return val.isoformat()
    if isinstance(val, str):
        stripped = val.strip()
        return stripped or None
    return val
# ---------------------------------------------------------------------------
# Import jednoho souboru
# ---------------------------------------------------------------------------
def _normalize_record_id(value):
    """Return the upsert key as a trimmed string, or None when it is blank."""
    if value is None:
        return None
    if isinstance(value, int):
        return str(int(value))
    if isinstance(value, float):
        # Numeric cells may arrive as floats such as 1234.0. Only whole
        # numbers are collapsed to an integer string, so 12.5 can never
        # collide with 12 and NaN/inf do not crash int().
        return str(int(value)) if value.is_integer() else str(value)
    text = str(value).strip()
    # A whitespace-only ID is treated like a missing one.
    return text or None


def import_file(xlsx_path: str, collection) -> dict:
    """Import one Issues & Deviations workbook into ``collection``.

    The first worksheet is read in full. The row at index ``HEADER_ROW``
    supplies the column names and rows from ``DATA_START_ROW`` onwards are
    treated as records. Each record is keyed by its ``UPSERT_KEY`` column:

    - unknown key: a new document is inserted with an empty ``history``;
    - known key, different ``fields``: the stored ``fields`` are pushed to
      ``history`` (dated with the stored ``lastSeen``) and replaced;
    - known key, same ``fields``: only ``lastSeen``/``sourceFile`` are updated.

    Rows without a usable key are skipped silently. Rows whose
    "Country Name" differs from a non-empty ``COUNTRY_FILTER`` are counted
    as filtered out.

    Args:
        xlsx_path: Path of the XLSX file to import.
        collection: Target collection; must provide ``find_one``,
            ``insert_one`` and ``update_one``.

    Returns:
        A dict with the keys ``snapshot``, ``inserted``, ``changed``,
        ``unchanged`` and ``filtered_out``.

    Raises:
        IndexError: If the sheet has no row at index ``HEADER_ROW``.
    """
    filename = Path(xlsx_path).name
    snapshot_date = extract_snapshot_date(filename)

    wb = openpyxl.load_workbook(xlsx_path, read_only=True)
    try:
        ws = wb[wb.sheetnames[0]]
        rows = list(ws.iter_rows(values_only=True))
    finally:
        # A read-only workbook keeps the file open until it is closed;
        # release it even when reading fails.
        wb.close()

    header = rows[HEADER_ROW]
    inserted = changed = unchanged = filtered_out = 0

    for row in rows[DATA_START_ROW:]:
        raw = dict(zip(header, row))

        country = raw.get("Country Name") or ""
        if COUNTRY_FILTER and country != COUNTRY_FILTER:
            filtered_out += 1
            continue

        record_id = _normalize_record_id(raw.get(UPSERT_KEY))
        if record_id is None:
            continue

        # Columns without a header cell are dropped.
        fields = {k: clean_value(v) for k, v in raw.items() if k is not None}

        existing = collection.find_one({"record_id": record_id})
        if existing is None:
            doc = {
                "record_id": record_id,
                "fields": fields,
                "sourceFile": filename,
                "firstSeen": snapshot_date,
                "lastSeen": snapshot_date,
                "history": [],
            }
            collection.insert_one(doc)
            inserted += 1
        elif existing.get("fields") != fields:
            # Archive the previous version before overwriting it.
            old_entry = {
                "date": existing.get("lastSeen", snapshot_date),
                "fields": existing.get("fields"),
            }
            collection.update_one(
                {"_id": existing["_id"]},
                {
                    "$push": {"history": old_entry},
                    "$set": {
                        "fields": fields,
                        "sourceFile": filename,
                        "lastSeen": snapshot_date,
                    },
                },
            )
            changed += 1
        else:
            collection.update_one(
                {"_id": existing["_id"]},
                {"$set": {"lastSeen": snapshot_date, "sourceFile": filename}},
            )
            unchanged += 1

    stats = {
        "snapshot": snapshot_date,
        "inserted": inserted,
        "changed": changed,
        "unchanged": unchanged,
        "filtered_out": filtered_out,
    }
    print(f" {COLLECTION_NAME} [{snapshot_date}]: +{inserted} new, ~{changed} changed, ={unchanged} same, -{filtered_out} non-CZ")
    return stats
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Import XLSX files into MongoDB and move them to the processed folder.

    Files are taken from the command-line arguments when any are given
    (arguments that are not existing files are reported and ignored);
    otherwise every ``*.xlsx`` file in ``DOWNLOADS_DIR`` is used, in sorted
    order. Each file is imported with ``import_file`` and then moved to
    ``PROCESSED_DIR``. A summary of inserted/changed/unchanged counts is
    printed at the end.
    """
    paths: list[Path] = []
    if len(sys.argv) > 1:
        for arg in sys.argv[1:]:
            p = Path(arg)
            if p.is_file():
                paths.append(p)
            else:
                print(f"Soubor nenalezen: {arg}")
    else:
        paths = sorted(DOWNLOADS_DIR.glob("*.xlsx"))

    if not paths:
        print("Zadne XLSX soubory k importu.")
        return

    print(f"Nalezeno {len(paths)} souboru.\n")

    client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
    try:
        # Issue a command up front so connection problems surface before
        # any file is touched.
        client.admin.command("ping")
        collection = client[DB_NAME][COLLECTION_NAME]

        collection.create_index([("record_id", ASCENDING)], unique=True)
        for field_name in (
            "fields.Country Name",
            "fields.Site ID",
            "fields.Status",
            "fields.Brief Description - Subject ID",
        ):
            collection.create_index([(field_name, ASCENDING)])

        # parents=True: the Downloads directory itself may not exist yet,
        # e.g. when explicit paths from elsewhere are passed in.
        PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

        total = {"inserted": 0, "changed": 0, "unchanged": 0}
        for xlsx_path in paths:
            print(f"Import: {xlsx_path.name}")
            stats = import_file(str(xlsx_path), collection)
            for k in total:
                total[k] += stats.get(k, 0)
            # Move the file away only after its import finished without error.
            dest = PROCESSED_DIR / xlsx_path.name
            shutil.move(str(xlsx_path), str(dest))
            print(" -> presunut do Zpracovano/")
    finally:
        # Close the connection even when an import or move fails.
        client.close()

    print(f"\nCelkem: +{total['inserted']} new, ~{total['changed']} changed, ={total['unchanged']} same")
# Run the import only when this file is executed as a script.
if __name__ == "__main__":
    main()