"""Split an exported workbook into one worksheet per DD5-CZ site.

Every ``.xlsx`` file found in ``ToProcess/`` is read from its ``Sheet0``
worksheet.  Rows are grouped by the site identifiers found in the ``Site``
column, sorted newest-first by ``Approval Complete Date`` and written to a new
workbook in ``Processed/`` with one colour-coded tab per site.
"""
import copy
import datetime
import re
from collections import Counter
from pathlib import Path

from openpyxl import load_workbook, Workbook
from openpyxl.utils import get_column_letter

TO_PROCESS = Path(__file__).parent / "ToProcess"
PROCESSED = Path(__file__).parent / "Processed"

SITE_PATTERN = re.compile(r'DD5-CZ\d+')

# Sites listed here get a green tab; every other site found gets a red tab.
ACTIVE_SITES = {
    "DD5-CZ10001",
    "DD5-CZ10003",
    "DD5-CZ10006",
    "DD5-CZ10009",
    "DD5-CZ10010",
    "DD5-CZ10012",
    "DD5-CZ10013",
    "DD5-CZ10015",
    "DD5-CZ10016",
    "DD5-CZ10020",
    "DD5-CZ10021",
    "DD5-CZ10022",
}

TAB_GREEN = "00B050"
TAB_RED = "FF0000"

# Used in the output file name when the "Study" column holds no values.
_UNKNOWN_STUDY = "UnknownStudy"

# Characters that are not allowed in file names on common file systems.
_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*]')


def copy_row(ws_dst, dst_row_num, src_row):
    """Copy the values and basic styling of *src_row* into *ws_dst*.

    Args:
        ws_dst: Destination worksheet.
        dst_row_num: 1-based row number to write to in *ws_dst*.
        src_row: Iterable of source cells; each cell is written to the same
            column it occupies in its source sheet.
    """
    for src_cell in src_row:
        dst_cell = ws_dst.cell(row=dst_row_num, column=src_cell.column)
        dst_cell.value = src_cell.value
        if src_cell.has_style:
            # Style objects are copied so the destination workbook does not
            # share instances with the source workbook.
            dst_cell.font = copy.copy(src_cell.font)
            dst_cell.fill = copy.copy(src_cell.fill)
            dst_cell.border = copy.copy(src_cell.border)
            dst_cell.alignment = copy.copy(src_cell.alignment)
            dst_cell.number_format = src_cell.number_format


def _find_column(headers, name):
    """Return the 0-based position of *name* in *headers*.

    Raises:
        ValueError: If the header is missing, with a message naming it.
    """
    try:
        return headers.index(name)
    except ValueError:
        raise ValueError(
            f"Required column {name!r} not found in header row"
        ) from None


def _sites_in(cell_value):
    """Return the set of site identifiers mentioned in a cell value."""
    # str() guards against non-text cells (numbers, dates) in the Site column.
    return set(SITE_PATTERN.findall(str(cell_value or "")))


def _approval_sort_key(cell_value):
    """Normalise an approval-date cell value to a naive ``datetime``.

    Datetimes are used as-is, plain dates are taken at midnight and ISO-8601
    text is parsed.  Anything else (blank cells, free text) maps to
    ``datetime.min`` so it lands at the end of a newest-first sort instead of
    raising ``TypeError`` during comparison.
    """
    if isinstance(cell_value, datetime.datetime):
        return cell_value.replace(tzinfo=None)
    if isinstance(cell_value, datetime.date):
        return datetime.datetime.combine(cell_value, datetime.time.min)
    if isinstance(cell_value, str):
        try:
            parsed = datetime.datetime.fromisoformat(cell_value.strip())
        except ValueError:
            return datetime.datetime.min
        return parsed.replace(tzinfo=None)
    return datetime.datetime.min


def process_file(src_path: Path):
    """Split one source workbook into per-site sheets and save the result.

    The output is written to ``PROCESSED`` as
    ``"<timestamp> <most common study> <source file name>"``.  Nothing is
    saved when the ``Site`` column contains no DD5-CZ identifiers.

    Args:
        src_path: Path of the ``.xlsx`` file to process.

    Raises:
        KeyError: If the workbook has no ``Sheet0`` worksheet.
        ValueError: If the ``Site``, ``Approval Complete Date`` or ``Study``
            header is missing from the first row.
    """
    print(f"Processing: {src_path.name}")
    wb_src = load_workbook(src_path)
    ws_src = wb_src["Sheet0"]

    header_row = next(ws_src.iter_rows(min_row=1, max_row=1))
    headers = [cell.value for cell in header_row]
    # Resolve every required column up front so a malformed file fails
    # before any output is built.
    site_idx = _find_column(headers, "Site")
    date_idx = _find_column(headers, "Approval Complete Date")
    study_idx = _find_column(headers, "Study")

    all_rows = list(ws_src.iter_rows(min_row=2))

    # Group rows by exact site identifier in a single pass.  A substring test
    # would attribute rows of "DD5-CZ10001" to a site named "DD5-CZ1000".
    rows_by_site = {}
    for row in all_rows:
        for site in _sites_in(row[site_idx].value):
            rows_by_site.setdefault(site, []).append(row)

    active = sorted(s for s in rows_by_site if s in ACTIVE_SITES)
    inactive = sorted(s for s in rows_by_site if s not in ACTIVE_SITES)
    ordered_sites = active + inactive
    print(f"  Found {len(active)} active, {len(inactive)} inactive DD5-CZ sites")

    if not ordered_sites:
        # A workbook without any sheet cannot be saved.
        print("  No DD5-CZ sites found; nothing to save")
        return

    # Column widths are the same for every output sheet; read them once.
    source_widths = {}
    for col_idx in range(1, ws_src.max_column + 1):
        col_letter = get_column_letter(col_idx)
        if col_letter in ws_src.column_dimensions:
            source_widths[col_letter] = ws_src.column_dimensions[col_letter].width

    wb_out = Workbook()
    wb_out.remove(wb_out.active)

    for site in ordered_sites:
        # Newest approvals first; the sort is stable, so rows with equal
        # dates keep their source order.
        site_rows = sorted(
            rows_by_site[site],
            key=lambda r: _approval_sort_key(r[date_idx].value),
            reverse=True,
        )

        ws = wb_out.create_sheet(title=site)
        ws.sheet_properties.tabColor = TAB_GREEN if site in ACTIVE_SITES else TAB_RED

        copy_row(ws, 1, header_row)
        for dst_row_num, row in enumerate(site_rows, start=2):
            copy_row(ws, dst_row_num, row)

        for col_letter, width in source_widths.items():
            ws.column_dimensions[col_letter].width = width

        ws.auto_filter.ref = ws.dimensions
        print(f"  {site}: {len(site_rows)} rows")

    # The most frequent entry of the comma-separated "Study" column names
    # the output file.
    study_counts = Counter(
        part.strip()
        for row in all_rows
        for part in str(row[study_idx].value or "").split(",")
        if part.strip()
    )
    if study_counts:
        study_number = study_counts.most_common(1)[0][0]
    else:
        study_number = _UNKNOWN_STUDY
    # The study value comes from cell content and may hold characters that
    # are invalid in file names.
    study_number = _UNSAFE_FILENAME_CHARS.sub("_", study_number)

    timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H_%M_%S")
    filename = f"{timestamp} {study_number} {src_path.name}"
    PROCESSED.mkdir(parents=True, exist_ok=True)
    out_path = PROCESSED / filename
    wb_out.save(out_path)
    print(f"  Saved: {out_path}")


def main():
    """Process every workbook waiting in the ``ToProcess`` directory."""
    # Excel creates "~$name.xlsx" lock files next to open workbooks; they
    # match the glob but are not loadable workbooks.
    xlsx_files = sorted(
        path for path in TO_PROCESS.glob("*.xlsx")
        if not path.name.startswith("~$")
    )
    if not xlsx_files:
        print("No .xlsx files found in ToProcess/")
        return
    for path in xlsx_files:
        process_file(path)


if __name__ == "__main__":
    main()