From 40e8fb7d1b467c316ab5b19e1be187a3174e94cc Mon Sep 17 00:00:00 2001 From: lucindasisk Date: Tue, 10 Feb 2026 13:25:40 -0500 Subject: [PATCH] Update RoxygenNote to 7.3.3 and enhance writeResults function with reentry capabilities. Added backward compatibility for column names in ModelArray and included tests for reentry mode functionality. --- DESCRIPTION | 2 +- R/ModelArray_Constructor.R | 194 +++++++++++++++++- man/writeResults.Rd | 34 ++- tests/testthat/test-writeResults-reentry.R | 227 +++++++++++++++++++++ 4 files changed, 452 insertions(+), 5 deletions(-) create mode 100644 tests/testthat/test-writeResults-reentry.R diff --git a/DESCRIPTION b/DESCRIPTION index e4cc7cb..77882dd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,7 +40,7 @@ Imports: rlang, tibble, tidyr -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.3 Suggests: rmarkdown, knitr, diff --git a/R/ModelArray_Constructor.R b/R/ModelArray_Constructor.R index 05cd092..58bb845 100644 --- a/R/ModelArray_Constructor.R +++ b/R/ModelArray_Constructor.R @@ -195,6 +195,10 @@ ModelArray <- function(filepath, # load source filenames (column_names): prefer attribute; fallback to dataset attrs <- rhdf5::h5readAttributes(filepath, name = sprintf("scalars/%s/values", scalar_types[x])) colnames_attr <- attrs$column_names + if (is.null(colnames_attr)) { + # Backward-compatibility fallback for older writers + colnames_attr <- attrs$colnames + } if (is.null(colnames_attr)) { # Fallback: attempt to read from dataset-based column names # Try multiple plausible locations for compatibility across writers @@ -296,7 +300,11 @@ ModelArray <- function(filepath, attrs <- rhdf5::h5readAttributes(filepath, name = sprintf("results/%s/results_matrix", analysis_name) ) - names_results_matrix <- attrs$colnames + names_results_matrix <- attrs$column_names + if (is.null(names_results_matrix)) { + # Backward-compatibility fallback for older writers + names_results_matrix <- attrs$colnames + } if (is.null(names_results_matrix)) { # Fallback to dataset-based column names (similar to scalar handling) paths_to_try <- c( @@ -357,7 +365,14 @@ ModelArray <- function(filepath, results_data[[x]]$results_matrix <- t(results_data[[x]]$results_matrix) } - colnames(results_data[[x]]$results_matrix) <- as.character(DelayedArray::realize(names_results_matrix)) # designate the column names + if (inherits(names_results_matrix, "DelayedArray")) { + names_results_matrix <- DelayedArray::realize(names_results_matrix) + } + names_results_matrix <- as.vector(names_results_matrix) + names_results_matrix <- as.character(names_results_matrix) + names_results_matrix <- gsub("[\\x00]+$", "", names_results_matrix, perl = TRUE, useBytes = TRUE) + names_results_matrix <- trimws(names_results_matrix) + colnames(results_data[[x]]$results_matrix) <- names_results_matrix # designate the column names # /results//lut_col?: # LOOP OVER # OF COL OF $RESULTS_MATRIX, AND SEE IF THERE IS LUT_COL @@ -1370,12 +1385,40 @@ analyseOneElement.wrap <- function(i_element, #' @param analysis_name A character, the name of the results #' @param overwrite If a group with the same analysis_name exists in HDF5 file, #' whether overwrite it (TRUE) or not (FALSE) +#' @param reentry Logical. If TRUE, also write a scalar-style dataset under +#' `/scalars/` so outputs can be loaded back via `ModelArray()`. +#' Default: FALSE. +#' @param reentry_scalar_name Character scalar name to use for reentry output under +#' `/scalars`. If NULL and `reentry=TRUE`, defaults to `paste0(analysis_name, "_reentry")`. +#' @param reentry_col Character. Optional single result column in `df.output` to write +#' as the reentry scalar values. If NULL, reentry auto-selects: +#' 1) all non-`element_id` columns when they match source count, or +#' 2) the single non-`element_id` column when only one exists. +#' @param reentry_ref_scalar Character. Optional existing scalar name under `/scalars` +#' to use as the reentry reference for output dimensions and source ordering. If NULL, +#' the first scalar group is used. +#' @param reentry_overwrite Logical. If TRUE and reentry scalar group exists, overwrite it. +#' If FALSE, error when the target reentry scalar already exists. +#' +#' @details +#' Reentry mode expects `df.output` to contain an `element_id` column (0-based), which is +#' mapped back to scalar row indices (`element_id + 1` in R indexing). Rows not present in +#' `df.output` are filled with `NaN` in the reentry scalar matrix. +#' +#' Column names metadata are written in both canonical and legacy forms for compatibility: +#' - dataset: `/scalars//column_names` +#' - attributes on `/scalars//values`: `column_names` and `colnames` #' @import hdf5r #' @export writeResults <- function(fn.output, df.output, analysis_name = "myAnalysis", - overwrite = TRUE) { + overwrite = TRUE, + reentry = FALSE, + reentry_scalar_name = NULL, + reentry_col = NULL, + reentry_ref_scalar = NULL, + reentry_overwrite = TRUE) { # This is enhanced version with: 1) change to hdf5r; 2) write results with only one row for one element # check "df.output" @@ -1463,6 +1506,151 @@ writeResults <- function(fn.output, # results_matrix_ds = results_matrix_ds) # return(output_list) + if (isTRUE(reentry)) { + if (is.null(reentry_scalar_name)) { + reentry_scalar_name <- paste0(analysis_name, "_reentry") + } + if (!("element_id" %in% colnames(df.output))) { + stop("reentry=TRUE requires df.output to include column 'element_id'") + } + + # guard against ambiguous mappings caused by duplicate result column names + if (anyDuplicated(colnames(df.output)) > 0) { + dup_cols <- unique(colnames(df.output)[duplicated(colnames(df.output))]) + stop(paste0( + "reentry=TRUE does not allow duplicate column names in df.output: ", + paste(dup_cols, collapse = ", ") + )) + } + + # infer the dimensions of a scalar matrix from an existing scalar dataset + if (!fn.output.h5$exists("scalars")) { + stop("reentry=TRUE requires group '/scalars' to exist in fn.output") + } + scalars.grp <- fn.output.h5$open("scalars") + scalar_group_names <- names(scalars.grp) + if (length(scalar_group_names) == 0) { + stop("reentry=TRUE requires at least one existing scalar under '/scalars'") + } + if (!is.null(reentry_ref_scalar)) { + if (!(reentry_ref_scalar %in% scalar_group_names)) { + stop(paste0( + "reentry_ref_scalar not found under /scalars: ", + reentry_ref_scalar, + ". Available: ", + paste(scalar_group_names, collapse = ", ") + )) + } + ref_scalar_name <- reentry_ref_scalar + } else { + ref_scalar_name <- scalar_group_names[[1]] + } + ref_values <- fn.output.h5[[sprintf("scalars/%s/values", ref_scalar_name)]] + ref_dims <- ref_values$dims + n_elements <- ref_dims[1] + n_sources_ref <- ref_dims[2] + + # get source names from the same reference scalar + ref_attrs <- tryCatch( + { + hdf5r::h5attr(ref_values) + }, + error = function(e) list() + ) + source_names <- ref_attrs$column_names + if (is.null(source_names)) { + source_names <- ref_attrs$colnames + } + if (is.null(source_names)) { + source_names <- tryCatch( + { + as.character(fn.output.h5[[sprintf("scalars/%s/column_names", ref_scalar_name)]][]) + }, + error = function(e) NULL + ) + } + if (!is.null(source_names)) { + source_names <- as.character(source_names) + } + + # element ids in ModelArray outputs are 0-based + element_id_vec <- suppressWarnings(as.integer(df.output$element_id)) + if (any(is.na(element_id_vec))) { + stop("reentry=TRUE requires integer-convertible values in df.output$element_id") + } + if (anyDuplicated(element_id_vec) > 0) { + dup_ids <- unique(element_id_vec[duplicated(element_id_vec)]) + stop(paste0( + "reentry=TRUE does not allow duplicate element_id values: ", + paste(dup_ids, collapse = ", ") + )) + } + row_idx <- element_id_vec + 1L + if (any(row_idx < 1L | row_idx > n_elements)) { + stop("reentry=TRUE found element_id outside valid range for output scalar matrix") + } + + result_colnames <- setdiff(colnames(df.output), "element_id") + if (length(result_colnames) == 0) { + stop("reentry=TRUE requires at least one non-element_id column in df.output") + } + + # Select columns for reentry: + # 1) explicit reentry_col; 2) all columns if they match source count; 3) single column fallback + if (!is.null(reentry_col)) { + if (!(reentry_col %in% result_colnames)) { + stop(paste0("reentry_col not found in df.output: ", reentry_col)) + } + selected_colnames <- reentry_col + } else if (length(result_colnames) == n_sources_ref) { + selected_colnames <- result_colnames + } else if (length(result_colnames) == 1) { + selected_colnames <- result_colnames + } else { + stop(paste0( + "Ambiguous reentry columns in df.output (", length(result_colnames), " columns). ", + "Provide reentry_col, or provide exactly 1 column, or exactly ", n_sources_ref, " columns." + )) + } + + # Build the output scalar matrix (all elements x selected columns), default NaN + n_selected <- length(selected_colnames) + out_mat <- matrix(NaN, nrow = n_elements, ncol = n_selected) + selected_df <- df.output[, selected_colnames, drop = FALSE] + for (i_col in seq_len(ncol(selected_df))) { + col_class <- as.character(class(selected_df[[i_col]])[1]) + if (!(col_class %in% c("numeric", "integer"))) { + selected_df[[i_col]] <- as.numeric(factor(selected_df[[i_col]])) + } else { + selected_df[[i_col]] <- as.numeric(selected_df[[i_col]]) + } + } + out_mat[row_idx, ] <- as.matrix(selected_df) + + # If selected columns are source names, align to source_names order for robust downstream matching + out_colnames <- selected_colnames + if (!is.null(source_names) && length(source_names) == n_selected && + all(source_names %in% selected_colnames) && all(selected_colnames %in% source_names)) { + reorder_idx <- match(source_names, selected_colnames) + out_mat <- out_mat[, reorder_idx, drop = FALSE] + out_colnames <- source_names + } + + # create/overwrite /scalars/ + if (scalars.grp$exists(reentry_scalar_name) && !isTRUE(reentry_overwrite)) { + stop(paste0("reentry scalar already exists and reentry_overwrite=FALSE: ", reentry_scalar_name)) + } + if (scalars.grp$exists(reentry_scalar_name) && isTRUE(reentry_overwrite)) { + scalars.grp$link_delete(reentry_scalar_name) + } + reentry.grp <- scalars.grp$create_group(reentry_scalar_name) + reentry.grp[["values"]] <- out_mat + reentry.grp[["column_names"]] <- as.character(out_colnames) + hdf5r::h5attr(reentry.grp[["values"]], "column_names") <- as.character(out_colnames) + # Backward-compatibility alias for older readers + hdf5r::h5attr(reentry.grp[["values"]], "colnames") <- as.character(out_colnames) + } + fn.output.h5$close_all() diff --git a/man/writeResults.Rd b/man/writeResults.Rd index 927de8e..7f7f003 100644 --- a/man/writeResults.Rd +++ b/man/writeResults.Rd @@ -8,7 +8,12 @@ writeResults( fn.output, df.output, analysis_name = "myAnalysis", - overwrite = TRUE + overwrite = TRUE, + reentry = FALSE, + reentry_scalar_name = NULL, + reentry_col = NULL, + reentry_ref_scalar = NULL, + reentry_overwrite = TRUE ) } \arguments{ @@ -20,6 +25,25 @@ writeResults( \item{overwrite}{If a group with the same analysis_name exists in HDF5 file, whether overwrite it (TRUE) or not (FALSE)} + +\item{reentry}{Logical. If TRUE, also write a scalar-style dataset under +`/scalars/` so outputs can be loaded back via `ModelArray()`. +Default: FALSE.} + +\item{reentry_scalar_name}{Character scalar name to use for reentry output under +`/scalars`. If NULL and `reentry=TRUE`, defaults to `paste0(analysis_name, "_reentry")`.} + +\item{reentry_col}{Character. Optional single result column in `df.output` to write +as the reentry scalar values. If NULL, reentry auto-selects: +1) all non-`element_id` columns when they match source count, or +2) the single non-`element_id` column when only one exists.} + +\item{reentry_ref_scalar}{Character. Optional existing scalar name under `/scalars` +to use as the reentry reference for output dimensions and source ordering. If NULL, +the first scalar group is used.} + +\item{reentry_overwrite}{Logical. If TRUE and reentry scalar group exists, overwrite it. +If FALSE, error when the target reentry scalar already exists.} } \description{ Create a group named `analysis_name` in HDF5 file, @@ -28,4 +52,12 @@ then write the statistical results data.frame (i.e. for one analysis) in it. \details{ debug tip: For "Error in H5File.open(filename, mode, file_create_pl, file_access_pl)", check if there is message 'No such file or directory'. Try absolute .h5 filename. + +Reentry mode expects `df.output` to contain an `element_id` column (0-based), which is +mapped back to scalar row indices (`element_id + 1` in R indexing). Rows not present in +`df.output` are filled with `NaN` in the reentry scalar matrix. + +Column names metadata are written in both canonical and legacy forms for compatibility: +- dataset: `/scalars//column_names` +- attributes on `/scalars//values`: `column_names` and `colnames` } diff --git a/tests/testthat/test-writeResults-reentry.R b/tests/testthat/test-writeResults-reentry.R new file mode 100644 index 0000000..d89786c --- /dev/null +++ b/tests/testthat/test-writeResults-reentry.R @@ -0,0 +1,227 @@ +test_that("writeResults reentry mode writes scalar-style output (single-column)", { + h5_path <- system.file("extdata", "n50_fixels.h5", package = "ModelArray") + csv_path <- system.file("extdata", "n50_cohort.csv", package = "ModelArray") + phenotypes <- read.csv(csv_path) + + h5_tmp <- tempfile(fileext = ".h5") + file.copy(h5_path, h5_tmp, overwrite = TRUE) + on.exit(unlink(h5_tmp), add = TRUE) + + modelarray <- ModelArray(h5_tmp, scalar_types = c("FD")) + element.subset <- 1:10 + mylm <- ModelArray.lm(FD ~ age, + data = modelarray, + phenotypes = phenotypes, + scalar = "FD", + element.subset = element.subset, + n_cores = 1, + pbar = FALSE + ) + + writeResults( + h5_tmp, + df.output = mylm, + analysis_name = "result_lm", + overwrite = TRUE, + reentry = TRUE, + reentry_scalar_name = "FD_reentry_single", + reentry_col = "age.estimate", + reentry_overwrite = TRUE + ) + + modelarray_new <- ModelArray( + h5_tmp, + scalar_types = c("FD", "FD_reentry_single"), + analysis_names = c("result_lm") + ) + + fd_reentry <- scalars(modelarray_new)[["FD_reentry_single"]] + expect_equal(nrow(fd_reentry), nrow(scalars(modelarray_new)[["FD"]])) + expect_equal(ncol(fd_reentry), 1) + + idx <- mylm$element_id + 1L + reentry_vals <- as.numeric(fd_reentry[idx, 1]) + expect_equal(reentry_vals, mylm$age.estimate, tolerance = 1e-12) + + # check one row outside subset remains NaN + outside_idx <- max(idx) + 1L + expect_true(is.nan(as.numeric(fd_reentry[outside_idx, 1]))) +}) + +test_that("writeResults reentry mode writes scalar-style output (matrix-style columns)", { + h5_path <- system.file("extdata", "n50_fixels.h5", package = "ModelArray") + h5_tmp <- tempfile(fileext = ".h5") + file.copy(h5_path, h5_tmp, overwrite = TRUE) + on.exit(unlink(h5_tmp), add = TRUE) + + modelarray <- ModelArray(h5_tmp, scalar_types = c("FD")) + src_names <- sources(modelarray)[["FD"]] + + element.subset <- 1:10 + element_id <- element.subset - 1L + set.seed(20260209) + mock_mat <- matrix( + rnorm(length(element.subset) * length(src_names)), + nrow = length(element.subset), + ncol = length(src_names) + ) + colnames(mock_mat) <- src_names + df_mock <- cbind(data.frame(element_id = element_id), as.data.frame(mock_mat)) + + writeResults( + h5_tmp, + df.output = df_mock, + analysis_name = "result_mock", + overwrite = TRUE, + reentry = TRUE, + reentry_scalar_name = "FD_reentry_matrix", + reentry_overwrite = TRUE + ) + + modelarray_new <- ModelArray(h5_tmp, scalar_types = c("FD_reentry_matrix")) + fd_reentry <- scalars(modelarray_new)[["FD_reentry_matrix"]] + expect_equal(dim(fd_reentry), c(nrow(scalars(modelarray)[["FD"]]), length(src_names))) + expect_equal(colnames(fd_reentry), src_names) + + idx <- element_id + 1L + loaded <- as.matrix(fd_reentry[idx, ]) + expect_equal(loaded, mock_mat, tolerance = 1e-12) +}) + +test_that("ModelArray scalar loader accepts legacy 'colnames' attribute", { + h5_path <- system.file("extdata", "n50_fixels.h5", package = "ModelArray") + h5_tmp <- tempfile(fileext = ".h5") + file.copy(h5_path, h5_tmp, overwrite = TRUE) + on.exit(unlink(h5_tmp), add = TRUE) + + modelarray <- ModelArray(h5_tmp, scalar_types = c("FD")) + n_elements <- nrow(scalars(modelarray)[["FD"]]) + legacy_vals <- matrix(NaN, nrow = n_elements, ncol = 1) + legacy_vals[1:10, 1] <- seq_len(10) + + rhdf5::h5createGroup(h5_tmp, "scalars/FD_reentry_legacy") + rhdf5::h5write(legacy_vals, h5_tmp, "scalars/FD_reentry_legacy/values") + rhdf5::h5writeAttribute( + attr = "legacy_source", + h5obj = h5_tmp, + h5loc = "/scalars/FD_reentry_legacy/values", + name = "colnames" + ) + + legacy_loaded <- ModelArray(h5_tmp, scalar_types = c("FD_reentry_legacy")) + expect_equal(dim(scalars(legacy_loaded)[["FD_reentry_legacy"]]), c(n_elements, 1)) + expect_equal(sources(legacy_loaded)[["FD_reentry_legacy"]], "legacy_source") +}) + +test_that("writeResults reentry rejects duplicate element_id values", { + h5_path <- system.file("extdata", "n50_fixels.h5", package = "ModelArray") + h5_tmp <- tempfile(fileext = ".h5") + file.copy(h5_path, h5_tmp, overwrite = TRUE) + on.exit(unlink(h5_tmp), add = TRUE) + + df_bad <- data.frame( + element_id = c(0, 0, 1), + age.estimate = c(1.1, 2.2, 3.3) + ) + + expect_error( + writeResults( + h5_tmp, + df.output = df_bad, + analysis_name = "result_dup_element_id", + overwrite = TRUE, + reentry = TRUE, + reentry_scalar_name = "FD_reentry_dup_element_id" + ), + "duplicate element_id" + ) +}) + +test_that("writeResults reentry rejects duplicate non-element_id column names", { + h5_path <- system.file("extdata", "n50_fixels.h5", package = "ModelArray") + h5_tmp <- tempfile(fileext = ".h5") + file.copy(h5_path, h5_tmp, overwrite = TRUE) + on.exit(unlink(h5_tmp), add = TRUE) + + df_bad <- data.frame( + element_id = c(0, 1, 2), + x = c(10, 20, 30), + x = c(40, 50, 60), + check.names = FALSE + ) + + expect_error( + writeResults( + h5_tmp, + df.output = df_bad, + analysis_name = "result_dup_colnames", + overwrite = TRUE, + reentry = TRUE, + reentry_scalar_name = "FD_reentry_dup_colnames" + ), + "duplicate column names" + ) +}) + +test_that("writeResults reentry uses explicit reentry_ref_scalar", { + h5_path <- system.file("extdata", "n50_fixels.h5", package = "ModelArray") + h5_tmp <- tempfile(fileext = ".h5") + file.copy(h5_path, h5_tmp, overwrite = TRUE) + on.exit(unlink(h5_tmp), add = TRUE) + + # add a tiny scalar so default-first behavior could be wrong/non-deterministic + tiny_vals <- matrix(c(1, 2, 3, 4), nrow = 2, ncol = 2) + tiny_src <- c("tiny_a", "tiny_b") + rhdf5::h5createGroup(h5_tmp, "scalars/AA_tiny") + rhdf5::h5write(tiny_vals, h5_tmp, "scalars/AA_tiny/values") + rhdf5::h5write(tiny_src, h5_tmp, "scalars/AA_tiny/column_names") + rhdf5::h5writeAttribute( + attr = tiny_src, + h5obj = h5_tmp, + h5loc = "/scalars/AA_tiny/values", + name = "column_names" + ) + + modelarray <- ModelArray(h5_tmp, scalar_types = c("FD")) + fd_src <- sources(modelarray)[["FD"]] + element_id <- 0:4 + payload <- matrix( + seq_len(length(element_id) * length(fd_src)), + nrow = length(element_id), + ncol = length(fd_src) + ) + colnames(payload) <- fd_src + df_ok <- cbind(data.frame(element_id = element_id), as.data.frame(payload)) + + writeResults( + h5_tmp, + df.output = df_ok, + analysis_name = "result_ref_scalar_ok", + overwrite = TRUE, + reentry = TRUE, + reentry_scalar_name = "FD_reentry_ref_scalar", + reentry_ref_scalar = "FD", + reentry_overwrite = TRUE + ) + + ma_new <- ModelArray(h5_tmp, scalar_types = c("FD_reentry_ref_scalar")) + expect_equal(colnames(scalars(ma_new)[["FD_reentry_ref_scalar"]]), fd_src) + expect_equal( + as.matrix(scalars(ma_new)[["FD_reentry_ref_scalar"]])[element_id + 1L, ], + payload + ) + + expect_error( + writeResults( + h5_tmp, + df.output = df_ok, + analysis_name = "result_ref_scalar_bad", + overwrite = TRUE, + reentry = TRUE, + reentry_scalar_name = "FD_reentry_ref_scalar_bad", + reentry_ref_scalar = "DOES_NOT_EXIST", + reentry_overwrite = TRUE + ), + "reentry_ref_scalar not found" + ) +})