This commit is contained in:
2026-04-21 13:39:36 +02:00
parent ac21a7c84a
commit 8638f98748
2 changed files with 111 additions and 0 deletions
+111
View File
@@ -0,0 +1,111 @@
import re
import copy
import datetime
from collections import Counter
from pathlib import Path
from openpyxl import load_workbook, Workbook
from openpyxl.utils import get_column_letter
# Input and output folders sit next to this script.
TO_PROCESS = Path(__file__).with_name("ToProcess")
PROCESSED = Path(__file__).with_name("Processed")

# Matches a single site identifier such as "DD5-CZ10001".
SITE_PATTERN = re.compile(r'DD5-CZ\d+')

# Sites treated as active; any other matched site is treated as inactive.
ACTIVE_SITES = {
    "DD5-CZ10001", "DD5-CZ10003", "DD5-CZ10006",
    "DD5-CZ10009", "DD5-CZ10010", "DD5-CZ10012",
    "DD5-CZ10013", "DD5-CZ10015", "DD5-CZ10016",
    "DD5-CZ10020", "DD5-CZ10021", "DD5-CZ10022",
}

# Worksheet tab colours (hex RGB): green for active sites, red for the rest.
TAB_GREEN = "00B050"
TAB_RED = "FF0000"
def copy_row(ws_dst, dst_row_num, src_row):
    """Copy the cells of *src_row* into row *dst_row_num* of *ws_dst*.

    Every source cell is written to the same column it came from. The value
    is always copied; when the source cell reports ``has_style``, its font,
    fill, border, alignment and number format are carried over as well.
    """
    copied_style_attrs = ("font", "fill", "border", "alignment")
    for source in src_row:
        target = ws_dst.cell(row=dst_row_num, column=source.column)
        target.value = source.value
        if not source.has_style:
            continue
        for attr in copied_style_attrs:
            # Assign a shallow copy rather than the source's own style object.
            setattr(target, attr, copy.copy(getattr(source, attr)))
        target.number_format = source.number_format
def _sites_in(cell_value):
    """Return the distinct DD5-CZ site IDs mentioned in a "Site" cell value."""
    return set(SITE_PATTERN.findall(str(cell_value or "")))


def _approval_sort_key(cell_value):
    """Map an "Approval Complete Date" cell value to a comparable datetime."""
    if isinstance(cell_value, datetime.datetime):
        return cell_value
    if isinstance(cell_value, datetime.date):
        return datetime.datetime.combine(cell_value, datetime.time.min)
    # Blank cells and non-date content (e.g. free text) sort as the oldest
    # possible value instead of raising TypeError during comparison.
    return datetime.datetime.min


def process_file(src_path: Path):
    """Split one source workbook into a per-site workbook under ``Processed``.

    Reads sheet "Sheet0" of *src_path*, finds every DD5-CZ site ID mentioned
    in the "Site" column and creates one output sheet per site (active sites
    first, each group sorted by ID). Each sheet gets the header row plus the
    rows mentioning that site, newest "Approval Complete Date" first, with
    the source column widths, an auto-filter and a green (active) or red
    (inactive) tab. The result is saved as
    ``"<timestamp> <most common Study value> <source file name>"``.

    Nothing is saved when the sheet mentions no DD5-CZ site.

    Raises:
        KeyError: if the workbook has no sheet named "Sheet0".
        ValueError: if the "Site", "Approval Complete Date" or "Study"
            header is missing from the first row.
    """
    print(f"Processing: {src_path.name}")
    wb_src = load_workbook(src_path)
    ws_src = wb_src["Sheet0"]

    column_count = ws_src.max_column
    headers = [ws_src.cell(1, c).value for c in range(1, column_count + 1)]
    # Resolve every required column up front so a malformed export fails
    # before any output is built.
    site_pos = headers.index("Site")
    date_pos = headers.index("Approval Complete Date")
    study_pos = headers.index("Study")

    all_rows = list(ws_src.iter_rows(min_row=2))

    # Group rows by exact site ID in a single pass. Matching whole regex hits
    # (rather than substrings) keeps a site such as "DD5-CZ1000" from also
    # collecting the rows of "DD5-CZ10001".
    rows_by_site = {}
    for row in all_rows:
        for site in _sites_in(row[site_pos].value):
            rows_by_site.setdefault(site, []).append(row)

    active = sorted(s for s in rows_by_site if s in ACTIVE_SITES)
    inactive = sorted(s for s in rows_by_site if s not in ACTIVE_SITES)
    print(f"  Found {len(active)} active, {len(inactive)} inactive DD5-CZ sites")
    if not rows_by_site:
        # A workbook without any sheet cannot be saved; skip the file.
        print("  No DD5-CZ sites found; nothing to save")
        return

    # Column widths are the same for every output sheet, so read them once.
    src_widths = {
        letter: ws_src.column_dimensions[letter].width
        for letter in map(get_column_letter, range(1, column_count + 1))
        if letter in ws_src.column_dimensions
    }

    wb_out = Workbook()
    wb_out.remove(wb_out.active)  # drop the default empty sheet
    header_row = next(ws_src.iter_rows(min_row=1, max_row=1))

    for site in active + inactive:
        # sorted() is stable, so rows with equal dates keep their source order.
        site_rows = sorted(
            rows_by_site[site],
            key=lambda r: _approval_sort_key(r[date_pos].value),
            reverse=True,
        )
        ws = wb_out.create_sheet(title=site)
        ws.sheet_properties.tabColor = TAB_GREEN if site in ACTIVE_SITES else TAB_RED
        copy_row(ws, 1, header_row)
        for row_num, row in enumerate(site_rows, start=2):
            copy_row(ws, row_num, row)
        for letter, width in src_widths.items():
            ws.column_dimensions[letter].width = width
        ws.auto_filter.ref = ws.dimensions
        print(f"  {site}: {len(site_rows)} rows")

    # A "Study" cell may hold several comma-separated values; the most
    # frequent one across the whole sheet names the output file.
    study_values = [
        part.strip()
        for row in all_rows
        for part in str(row[study_pos].value or "").split(",")
        if part.strip()
    ]
    if study_values:
        study_number = Counter(study_values).most_common(1)[0][0]
    else:
        study_number = "UnknownStudy"
    # Replace characters that are not allowed in file names.
    study_number = re.sub(r'[\\/:*?"<>|]', "_", study_number)

    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H_%M_%S")
    filename = f"{timestamp} {study_number} {src_path.name}"
    PROCESSED.mkdir(parents=True, exist_ok=True)
    out_path = PROCESSED / filename
    wb_out.save(out_path)
    print(f"  Saved: {out_path}")
def main():
    """Run ``process_file`` on every workbook found in the ToProcess folder.

    Prints a notice and returns when there is nothing to process.
    """
    # Excel creates "~$<name>.xlsx" lock files next to an open workbook; they
    # match the glob but are not real workbooks, so leave them out. Sorting
    # makes the processing order independent of the filesystem.
    xlsx_files = sorted(
        f for f in TO_PROCESS.glob("*.xlsx") if not f.name.startswith("~$")
    )
    if not xlsx_files:
        print("No .xlsx files found in ToProcess/")
        return
    for f in xlsx_files:
        process_file(f)


if __name__ == "__main__":
    main()