z230
This commit is contained in:
@@ -1,9 +1,10 @@
|
|||||||
"""
|
"""
|
||||||
import_CZ_contacts.py
|
import_CZ_contacts.py
|
||||||
Importuje kontakty středisek Czechia z PANORAMA Dashboard xlsx do MySQL tabulky CTMS_contacts.
|
Importuje kontakty středisek Czechia z PANORAMA Dashboard xlsx do MySQL tabulky CTMS_contacts.
|
||||||
|
- Zpracuje všechny *.xlsx soubory ve SOURCE_DIR
|
||||||
- Filtruje pouze řádky Country Name == 'Czechia'
|
- Filtruje pouze řádky Country Name == 'Czechia'
|
||||||
- file_date bere z document properties xlsx (dcterms:created)
|
- file_date bere z document properties xlsx (dcterms:created)
|
||||||
- Před importem smaže stávající záznamy se stejným file_date + country_name == 'Czechia'
|
- Každý soubor vždy přepíše (delete + insert podle file_date + protocol_id + country)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import zipfile
|
import zipfile
|
||||||
@@ -15,7 +16,7 @@ import pandas as pd
|
|||||||
import mysql.connector
|
import mysql.connector
|
||||||
|
|
||||||
# ── Konfigurace ────────────────────────────────────────────────────────────────
|
# ── Konfigurace ────────────────────────────────────────────────────────────────
|
||||||
SOURCE_FILE = Path(r"U:\PythonProject\Janssen\CTMS\PanoramaContacts\SourceData\PANORAMA Dashboard (33).xlsx")
|
SOURCE_DIR = Path(r"U:\PythonProject\Janssen\CTMS\PanoramaContacts\SourceData")
|
||||||
|
|
||||||
DB_CONFIG = {
|
DB_CONFIG = {
|
||||||
"host": "192.168.1.76",
|
"host": "192.168.1.76",
|
||||||
@@ -31,57 +32,7 @@ COUNTRY = "Czechia"
|
|||||||
SHEET = "Site Contacts"
|
SHEET = "Site Contacts"
|
||||||
HEADER_ROW = 5 # 0-based → řádek č. 6 v Excelu
|
HEADER_ROW = 5 # 0-based → řádek č. 6 v Excelu
|
||||||
|
|
||||||
|
COL_MAP = {
|
||||||
# ── Pomocné funkce ─────────────────────────────────────────────────────────────
|
|
||||||
def get_file_created_date(xlsx_path: Path) -> datetime.date:
    """Return the creation date recorded in the workbook's document properties.

    Opens the xlsx file as a zip archive, parses ``docProps/core.xml`` and
    converts the ``dcterms:created`` timestamp to a calendar date in UTC.

    Args:
        xlsx_path: Path to the ``.xlsx`` file.

    Returns:
        The date part of the ``dcterms:created`` timestamp, expressed in UTC.

    Raises:
        ValueError: If ``docProps/core.xml`` contains no non-empty
            ``dcterms:created`` element, or its text is not an ISO timestamp.
        KeyError: If the archive has no ``docProps/core.xml`` member.
    """
    with zipfile.ZipFile(xlsx_path) as archive:
        with archive.open("docProps/core.xml") as core_xml:
            root = ET.parse(core_xml).getroot()

    created_el = root.find("{http://purl.org/dc/terms/}created")
    created_text = (created_el.text or "").strip() if created_el is not None else ""
    if not created_text:
        # Fail with a message naming the file instead of an AttributeError on None.
        raise ValueError(
            f"{xlsx_path}: docProps/core.xml has no dcterms:created value"
        )

    # datetime.fromisoformat() only accepts a trailing "Z" from Python 3.11 on,
    # so it is rewritten as an explicit UTC offset.
    created = datetime.fromisoformat(created_text.replace("Z", "+00:00"))
    return created.astimezone(timezone.utc).date()
|
|
||||||
|
|
||||||
|
|
||||||
def clean_value(val):
    """Normalise a cell value read by pandas before it is used as a SQL parameter.

    Args:
        val: A single cell value (any type).

    Returns:
        ``None`` for missing markers (``None``, ``NaN``, ``NaT``), a
        ``datetime.date`` for a ``pandas.Timestamp``, and the value itself
        otherwise.
    """
    if val is None:
        return None
    # Missing values are detected with pd.isna() instead of probing the private
    # ``_value`` attribute: valid Timestamps carry that attribute too on
    # pandas 2.x, so real dates would be turned into None.
    # pd.isna() returns an array for list-likes, hence the scalar guard.
    if pd.api.types.is_scalar(val) and pd.isna(val):
        return None
    if isinstance(val, pd.Timestamp):
        return val.date()
    return val
|
|
||||||
|
|
||||||
|
|
||||||
# ── Hlavní logika ──────────────────────────────────────────────────────────────
|
|
||||||
def main():
|
|
||||||
print(f"Soubor : {SOURCE_FILE}")
|
|
||||||
|
|
||||||
# 1) Datum vytvoření z properties
|
|
||||||
file_date = get_file_created_date(SOURCE_FILE)
|
|
||||||
print(f"file_date (z docProps): {file_date}")
|
|
||||||
|
|
||||||
# 2) Načtení dat
|
|
||||||
print("Načítám Excel…")
|
|
||||||
df = pd.read_excel(SOURCE_FILE, sheet_name=SHEET, header=HEADER_ROW)
|
|
||||||
|
|
||||||
# 3) Filtr CZ
|
|
||||||
df_cz = df[df["Country Name"] == COUNTRY].copy()
|
|
||||||
print(f"Řádků CZ: {len(df_cz)}")
|
|
||||||
|
|
||||||
# 4) Mapování Excel sloupců → DB sloupce
|
|
||||||
col_map = {
|
|
||||||
"Sector": "sector",
|
"Sector": "sector",
|
||||||
"TA": "ta",
|
"TA": "ta",
|
||||||
"Protocol ID": "protocol_id",
|
"Protocol ID": "protocol_id",
|
||||||
@@ -119,39 +70,122 @@ def main():
|
|||||||
"Contact Zip/Postal Code": "zip_postal_code",
|
"Contact Zip/Postal Code": "zip_postal_code",
|
||||||
}
|
}
|
||||||
|
|
||||||
df_cz = df_cz.rename(columns=col_map)
|
|
||||||
db_cols = list(col_map.values())
|
|
||||||
|
|
||||||
# 5) Připojení k DB
|
# ── Pomocné funkce ─────────────────────────────────────────────────────────────
|
||||||
print("Připojuji se k MySQL…")
|
def get_file_created_date(xlsx_path: Path):
|
||||||
conn = mysql.connector.connect(**DB_CONFIG)
|
"""Vrátí date z dcterms:created v docProps/core.xml."""
|
||||||
cursor = conn.cursor()
|
with zipfile.ZipFile(xlsx_path) as z:
|
||||||
|
with z.open("docProps/core.xml") as f:
|
||||||
|
root = ET.parse(f).getroot()
|
||||||
|
el = root.find("{http://purl.org/dc/terms/}created")
|
||||||
|
dt = datetime.fromisoformat(el.text.replace("Z", "+00:00"))
|
||||||
|
return dt.astimezone(timezone.utc).date()
|
||||||
|
|
||||||
# 6) Smazání stávajících záznamů pro stejný file_date + CZ (idempotentní import)
|
|
||||||
|
def clean_value(val):
    """Convert a pandas cell value into a plain value usable as a SQL parameter.

    Args:
        val: A single cell value (any type).

    Returns:
        ``None`` when the value is a missing marker (``None``, ``NaN``,
        ``NaT``); the calendar date when it is a ``pandas.Timestamp``;
        otherwise the value unchanged.
    """
    if val is None:
        return None
    # pd.isna() yields an array for list-likes, which cannot be used in a
    # boolean test; checking is_scalar() first replaces the former blanket
    # ``except Exception: pass`` around that call.
    is_missing = pd.api.types.is_scalar(val) and pd.isna(val)
    if is_missing:
        return None
    if isinstance(val, pd.Timestamp):
        # Missing timestamps (NaT) were already handled by the check above.
        return val.date()
    return val
|
||||||
|
|
||||||
|
|
||||||
|
def get_protocol_id(xlsx_path: Path) -> str:
    """Return the protocol ID from the first data row of the contacts sheet.

    Only the ``Protocol ID`` column and one row below the header are requested
    from ``pd.read_excel``, so the whole sheet is not loaded into a DataFrame.

    Args:
        xlsx_path: Path to the ``.xlsx`` file.

    Returns:
        The first ``Protocol ID`` cell converted to ``str``.

    Raises:
        ValueError: If the sheet has no data row under the header, or the
            first ``Protocol ID`` cell is empty.
    """
    first_row = pd.read_excel(
        xlsx_path,
        sheet_name=SHEET,
        header=HEADER_ROW,
        usecols=["Protocol ID"],
        nrows=1,
    )
    if first_row.empty:
        raise ValueError(f"{xlsx_path}: sheet '{SHEET}' has no data rows")

    protocol_id = first_row["Protocol ID"].iloc[0]
    if pd.isna(protocol_id):
        # str(NaN) would yield the literal "nan", which would then be used as
        # a real protocol ID by the caller.
        raise ValueError(f"{xlsx_path}: first 'Protocol ID' cell is empty")
    return str(protocol_id)
|
||||||
|
|
||||||
|
|
||||||
|
def import_file(xlsx_path: Path, cursor, conn):
|
||||||
|
"""Zpracuje jeden xlsx soubor — vždy přepíše (delete + insert)."""
|
||||||
|
file_date = get_file_created_date(xlsx_path)
|
||||||
|
protocol_id = get_protocol_id(xlsx_path)
|
||||||
|
print(f" file_date : {file_date}")
|
||||||
|
print(f" protocol_id : {protocol_id}")
|
||||||
|
|
||||||
|
df = pd.read_excel(xlsx_path, sheet_name=SHEET, header=HEADER_ROW)
|
||||||
|
df_cz = df[df["Country Name"] == COUNTRY].copy()
|
||||||
|
print(f" radku CZ : {len(df_cz)}")
|
||||||
|
|
||||||
|
if df_cz.empty:
|
||||||
|
print(" -> zadne CZ radky, preskoceno")
|
||||||
|
return 0
|
||||||
|
|
||||||
|
df_cz = df_cz.rename(columns=COL_MAP)
|
||||||
|
db_cols = list(COL_MAP.values())
|
||||||
|
|
||||||
|
# Smazání stávajících záznamů pro tento soubor (přepis)
|
||||||
cursor.execute(
|
cursor.execute(
|
||||||
f"DELETE FROM {TABLE} WHERE file_date = %s AND country_name = %s",
|
f"DELETE FROM {TABLE} "
|
||||||
(file_date, COUNTRY)
|
f"WHERE file_date = %s AND protocol_id = %s AND country_name = %s",
|
||||||
|
(file_date, protocol_id, COUNTRY)
|
||||||
)
|
)
|
||||||
deleted = cursor.rowcount
|
deleted = cursor.rowcount
|
||||||
print(f"Smazáno starých záznamů: {deleted}")
|
if deleted:
|
||||||
|
print(f" prepis : smazano {deleted} starych radku")
|
||||||
|
|
||||||
# 7) Insert
|
placeholders = ", ".join(["%s"] * (len(db_cols) + 1))
|
||||||
placeholders = ", ".join(["%s"] * (len(db_cols) + 1)) # +1 pro file_date
|
|
||||||
insert_cols = "file_date, " + ", ".join(db_cols)
|
insert_cols = "file_date, " + ", ".join(db_cols)
|
||||||
sql_insert = f"INSERT INTO {TABLE} ({insert_cols}) VALUES ({placeholders})"
|
sql_insert = f"INSERT INTO {TABLE} ({insert_cols}) VALUES ({placeholders})"
|
||||||
|
|
||||||
inserted = 0
|
|
||||||
for _, row in df_cz.iterrows():
|
for _, row in df_cz.iterrows():
|
||||||
values = [file_date] + [clean_value(row.get(col)) for col in db_cols]
|
values = [file_date] + [clean_value(row.get(col)) for col in db_cols]
|
||||||
cursor.execute(sql_insert, values)
|
cursor.execute(sql_insert, values)
|
||||||
inserted += 1
|
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
return len(df_cz)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Hlavní logika ──────────────────────────────────────────────────────────────
|
||||||
|
def main():
|
||||||
|
files = sorted(SOURCE_DIR.glob("*.xlsx"))
|
||||||
|
if not files:
|
||||||
|
print(f"Zadne xlsx soubory v {SOURCE_DIR}")
|
||||||
|
return
|
||||||
|
|
||||||
|
today = datetime.now(timezone.utc).date()
|
||||||
|
print(f"Nalezeno souboru: {len(files)} | dnesni datum: {today}")
|
||||||
|
print(f"Pripojuji se k MySQL...")
|
||||||
|
conn = mysql.connector.connect(**DB_CONFIG)
|
||||||
|
cursor = conn.cursor()
|
||||||
|
|
||||||
|
summary = []
|
||||||
|
for xlsx in files:
|
||||||
|
print(f"\n[{xlsx.name}]")
|
||||||
|
try:
|
||||||
|
file_date = get_file_created_date(xlsx)
|
||||||
|
if file_date != today:
|
||||||
|
print(f" file_date : {file_date} -> PRESKOCENO (neni dnesni datum)")
|
||||||
|
summary.append((xlsx.name, f"preskoceno (file_date={file_date})"))
|
||||||
|
continue
|
||||||
|
n = import_file(xlsx, cursor, conn)
|
||||||
|
if n is None:
|
||||||
|
summary.append((xlsx.name, "preskoceno"))
|
||||||
|
else:
|
||||||
|
summary.append((xlsx.name, f"importovano {n} radku"))
|
||||||
|
except Exception as e:
|
||||||
|
conn.rollback()
|
||||||
|
summary.append((xlsx.name, f"CHYBA: {e}"))
|
||||||
|
print(f" CHYBA: {e}")
|
||||||
|
|
||||||
cursor.close()
|
cursor.close()
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
print(f"Importováno záznamů : {inserted}")
|
print("\n" + "=" * 60)
|
||||||
print("Hotovo OK")
|
print("SOUHRN:")
|
||||||
|
for name, status in summary:
|
||||||
|
print(f" {name:<45} {status}")
|
||||||
|
print("=" * 60)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user