From 0b1f79aa32f6d809d4853427994650ed7f1aeea6 Mon Sep 17 00:00:00 2001 From: asizemore Date: Fri, 3 Jan 2025 10:24:33 -0500 Subject: [PATCH 1/8] pca skeleton --- R/method-pca.R | 34 ++++++++++++++++++++++++++++++++ tests/testthat/test-method-pca.R | 4 ++++ 2 files changed, 38 insertions(+) create mode 100644 R/method-pca.R create mode 100644 tests/testthat/test-method-pca.R diff --git a/R/method-pca.R b/R/method-pca.R new file mode 100644 index 0000000..af68d40 --- /dev/null +++ b/R/method-pca.R @@ -0,0 +1,34 @@ +# PCA + +# Args: ntop, data, nPCs, +# test with testCountDataCollection +testData <- testCountDataCollection + + +assay <- getCollectionData(testData) +recordIdColumn <- testData@recordIdColumn +ancestorIdColumns <- testData@ancestorIdColumns +allIdColumns <- c(recordIdColumn, ancestorIdColumns) +# remove id columns +features <- assay[, -..allIdColumns] +# Currently taxa/genes are columns and samples are rows + +# From deseq plotpca, +rv <- matrixStats::rowVars(t(as.matrix(features))) +# Now we have a vector of variances for each gene +ntop = 500 # Default value. Should we make this a variable? +select <- order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))] +pca <- prcomp(features[, ..select]) + +# The good stuff is in pca$x +x <- pca$x[, 1] +y <- pca$x[, 2] + +# assemble data frame +dt <- assay[, ..allIdColumns] +# combine with the first 20 columns of pca$x +dt <- cbind(dt, pca$x[, 1:20]) # this works fine even with one id column + +# Then we send to plot.data? +# have to merge with overlay variable + diff --git a/tests/testthat/test-method-pca.R b/tests/testthat/test-method-pca.R new file mode 100644 index 0000000..5e14259 --- /dev/null +++ b/tests/testthat/test-method-pca.R @@ -0,0 +1,4 @@ +# Test PCA + +testData <- testCountDataCollection + From 5c5430da972ab82a752dd331bb6ecff6432956c0 Mon Sep 17 00:00:00 2001 From: asizemore Date: Fri, 3 Jan 2025 10:37:22 -0500 Subject: [PATCH 2/8] created pca function --- R/method-pca.R | 104 +++++++++++++++++++++---------- tests/testthat/test-method-pca.R | 2 + 2 files changed, 73 insertions(+), 33 deletions(-) diff --git a/R/method-pca.R b/R/method-pca.R index af68d40..de5a6ea 100644 --- a/R/method-pca.R +++ b/R/method-pca.R @@ -1,34 +1,72 @@ -# PCA - -# Args: ntop, data, nPCs, -# test with testCountDataCollection -testData <- testCountDataCollection - - -assay <- getCollectionData(testData) -recordIdColumn <- testData@recordIdColumn -ancestorIdColumns <- testData@ancestorIdColumns -allIdColumns <- c(recordIdColumn, ancestorIdColumns) -# remove id columns -features <- assay[, -..allIdColumns] -# Currently taxa/genes are columns and samples are rows - -# From deseq plotpca, -rv <- matrixStats::rowVars(t(as.matrix(features))) -# Now we have a vector of variances for each gene -ntop = 500 # Default value. Should we make this a variable? -select <- order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))] -pca <- prcomp(features[, ..select]) - -# The good stuff is in pca$x -x <- pca$x[, 1] -y <- pca$x[, 2] - -# assemble data frame -dt <- assay[, ..allIdColumns] -# combine with the first 20 columns of pca$x -dt <- cbind(dt, pca$x[, 1:20]) # this works fine even with one id column - -# Then we send to plot.data? -# have to merge with overlay variable + +#' PCA +#' +#' @param collection A Collection object +#' @param nPCs Number of principal components to return. Default 10 +#' @param ntop Use the top ntop genes with the highest variance for the pca computation. Mirrors the deseq2 plotPCA argument. Default 500. +#' @param verbose Boolean indicating if extra messaging should be printed. +#' @return A data frame with the first two principal components +#' @export +setGeneric("pca", + function(collection, nPCs = 10, ntop = 500, verbose = c(TRUE, FALSE)) standardGeneric("pca"), + signature = c("collection") +) + +#' @export +setMethod(pca, "Collection", + function(collection, nPCs = 10, ntop = 500, verbose = c(TRUE, FALSE)) { + + # Get the assay data + assay <- getCollectionData(collection) + recordIdColumn <- collection@recordIdColumn + ancestorIdColumns <- collection@ancestorIdColumns + allIdColumns <- c(recordIdColumn, ancestorIdColumns) + # remove id columns + features <- assay[, -..allIdColumns] + # Currently taxa/genes are columns and samples are rows + + # From deseq plotpca, + rv <- matrixStats::rowVars(t(as.matrix(features))) + # Now we have a vector of variances for each gene + select <- order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))] + pca <- prcomp(features[, ..select]) + + + # assemble data frame + dt <- assay[, ..allIdColumns] + # combine with the first 20 columns of pca$x + dt <- cbind(dt, pca$x[, 1:20]) # this works fine even with one id column + + # Then we send to plot.data? + # have to merge with overlay variable + return(dt) + } +) + +# assay <- getCollectionData(testData) +# recordIdColumn <- testData@recordIdColumn +# ancestorIdColumns <- testData@ancestorIdColumns +# allIdColumns <- c(recordIdColumn, ancestorIdColumns) +# # remove id columns +# features <- assay[, -..allIdColumns] +# # Currently taxa/genes are columns and samples are rows + +# # From deseq plotpca, +# rv <- matrixStats::rowVars(t(as.matrix(features))) +# # Now we have a vector of variances for each gene +# ntop = 500 # Default value. Should we make this a variable? +# select <- order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))] +# pca <- prcomp(features[, ..select]) + +# # The good stuff is in pca$x +# x <- pca$x[, 1] +# y <- pca$x[, 2] + +# # assemble data frame +# dt <- assay[, ..allIdColumns] +# # combine with the first 20 columns of pca$x +# dt <- cbind(dt, pca$x[, 1:20]) # this works fine even with one id column + +# # Then we send to plot.data? +# # have to merge with overlay variable diff --git a/tests/testthat/test-method-pca.R b/tests/testthat/test-method-pca.R index 5e14259..37917c8 100644 --- a/tests/testthat/test-method-pca.R +++ b/tests/testthat/test-method-pca.R @@ -2,3 +2,5 @@ testData <- testCountDataCollection +output <- pca(testData) +head(output) From 4538df5e02e9eee54ffb75007b567ccf2b980e16 Mon Sep 17 00:00:00 2001 From: asizemore Date: Fri, 3 Jan 2025 11:10:02 -0500 Subject: [PATCH 3/8] cleaned and added first tests --- DESCRIPTION | 1 + NAMESPACE | 2 + R/method-pca.R | 73 ++++++++++++-------------------- man/pca.Rd | 23 ++++++++++ tests/testthat/test-method-pca.R | 31 ++++++++++++-- 5 files changed, 80 insertions(+), 50 deletions(-) create mode 100644 man/pca.Rd diff --git a/DESCRIPTION b/DESCRIPTION index c59bb6b..d75502c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -54,6 +54,7 @@ Collate: 'data.R' 'method-correlation.R' 'method-differentialExpression.R' + 'method-pca.R' 'methods-Bin.R' 'methods-CollectionWithMetadata.R' 'methods-Collections.R' diff --git a/NAMESPACE b/NAMESPACE index 5708f1d..7c82203 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -97,6 +97,7 @@ export(naToZero) export(name) export(new_data_frame) export(nonZeroRound) +export(pca) export(predicateFactory) export(pruneFeatures) export(removeAttr) @@ -170,6 +171,7 @@ exportMethods(getStudyIdColumnName) exportMethods(getVariableSpec) exportMethods(getVariableSpecColumnName) exportMethods(merge) +exportMethods(pca) exportMethods(predicateFactory) exportMethods(toJSON) exportMethods(whichValuesInBin) diff --git a/R/method-pca.R b/R/method-pca.R index de5a6ea..c52492c 100644 --- a/R/method-pca.R +++ b/R/method-pca.R @@ -5,7 +5,7 @@ #' @param nPCs Number of principal components to return. Default 10 #' @param ntop Use the top ntop genes with the highest variance for the pca computation. Mirrors the deseq2 plotPCA argument. Default 500. #' @param verbose Boolean indicating if extra messaging should be printed. -#' @return A data frame with the first two principal components +#' @return A data table with the id columns and the first nPCs principal components. #' @export setGeneric("pca", function(collection, nPCs = 10, ntop = 500, verbose = c(TRUE, FALSE)) standardGeneric("pca"), @@ -16,57 +16,36 @@ setGeneric("pca", setMethod(pca, "Collection", function(collection, nPCs = 10, ntop = 500, verbose = c(TRUE, FALSE)) { - # Get the assay data + verbose <- veupathUtils::matchArg(verbose) assay <- getCollectionData(collection) recordIdColumn <- collection@recordIdColumn ancestorIdColumns <- collection@ancestorIdColumns allIdColumns <- c(recordIdColumn, ancestorIdColumns) - # remove id columns - features <- assay[, -..allIdColumns] - # Currently taxa/genes are columns and samples are rows - - # From deseq plotpca, - rv <- matrixStats::rowVars(t(as.matrix(features))) - # Now we have a vector of variances for each gene - select <- order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))] - pca <- prcomp(features[, ..select]) - - - # assemble data frame + + # Remove id columns from the assay to get only the features. + features <- assay[, -..allIdColumns] # features has samples as rows. + + # Update ntop if it's too large. + if (ntop > ncol(features)) { + if (verbose) { + message("ntop is larger than the number of features. Using all features.") + } + ntop <- min(ntop, ncol(features)) + } + + # Use prcomp to perform PCA. + # The following is heavily borrowed from the deseq2 plotPCA function. + rowVariances <- matrixStats::rowVars(t(as.matrix(features))) + print(length(rowVariances)) + keepFeatures <- order(rowVariances, decreasing=TRUE)[seq_len(ntop)] + pcaResult <- prcomp(features[, ..keepFeatures]) + + + # Assemble output data dt <- assay[, ..allIdColumns] - # combine with the first 20 columns of pca$x - dt <- cbind(dt, pca$x[, 1:20]) # this works fine even with one id column + # The PCA results are in pcaResult$x. Keep the first nPCs PCS. + dt <- cbind(dt, pcaResult$x[, 1:nPCs]) # this works fine even with one id column - # Then we send to plot.data? - # have to merge with overlay variable return(dt) } -) - -# assay <- getCollectionData(testData) -# recordIdColumn <- testData@recordIdColumn -# ancestorIdColumns <- testData@ancestorIdColumns -# allIdColumns <- c(recordIdColumn, ancestorIdColumns) -# # remove id columns -# features <- assay[, -..allIdColumns] -# # Currently taxa/genes are columns and samples are rows - -# # From deseq plotpca, -# rv <- matrixStats::rowVars(t(as.matrix(features))) -# # Now we have a vector of variances for each gene -# ntop = 500 # Default value. Should we make this a variable? -# select <- order(rv, decreasing=TRUE)[seq_len(min(ntop, length(rv)))] -# pca <- prcomp(features[, ..select]) - -# # The good stuff is in pca$x -# x <- pca$x[, 1] -# y <- pca$x[, 2] - -# # assemble data frame -# dt <- assay[, ..allIdColumns] -# # combine with the first 20 columns of pca$x -# dt <- cbind(dt, pca$x[, 1:20]) # this works fine even with one id column - -# # Then we send to plot.data? -# # have to merge with overlay variable - +) \ No newline at end of file diff --git a/man/pca.Rd b/man/pca.Rd new file mode 100644 index 0000000..62805f9 --- /dev/null +++ b/man/pca.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/method-pca.R +\name{pca} +\alias{pca} +\title{PCA} +\usage{ +pca(collection, nPCs = 10, ntop = 500, verbose = c(TRUE, FALSE)) +} +\arguments{ +\item{collection}{A Collection object} + +\item{nPCs}{Number of principal components to return. Default 10} + +\item{ntop}{Use the top ntop genes with the highest variance for the pca computation. Mirrors the deseq2 plotPCA argument. Default 500.} + +\item{verbose}{Boolean indicating if extra messaging should be printed.} +} +\value{ +A data table with the id columns and the first nPCs principal components. +} +\description{ +PCA +} diff --git a/tests/testthat/test-method-pca.R b/tests/testthat/test-method-pca.R index 37917c8..4e4c971 100644 --- a/tests/testthat/test-method-pca.R +++ b/tests/testthat/test-method-pca.R @@ -1,6 +1,31 @@ # Test PCA -testData <- testCountDataCollection +test_that("The pca function works", { + + testData <- testCountDataCollection + + # Run pca + output <- pca(testData, nPCs = 2) + expect_is(output, "data.table") + expect_equal(nrow(output), nrow(getCollectionData(testData))) + expect_equal(ncol(output), 3) + expect_equal(names(output), c("entity.SampleID", "PC1", "PC2")) -output <- pca(testData) -head(output) + + # Generate some fake data + nSamples <- 500 + fakeData <- data.table( + entity.SampleID = paste0("Sample", 1:nSamples), + feat1 = rnorm(nSamples), + feat2 = rnorm(nSamples), + feat3 = rnorm(nSamples) + ) + + fakeCollection <- new("Collection", data = fakeData, recordIdColumn = "entity.SampleID", name="test") + + output <- pca(fakeCollection, nPCs = 2, ntop=20) + expect_is(output, "data.table") + expect_equal(nrow(output), nSamples) + expect_equal(ncol(output), 6) + expect_equal(names(output), c("entity.SampleID", "PC1", "PC2", "PC3", "PC4", "PC5")) +}) From 8876aab6d26e932e7fcaf6e90afa5a69726163e6 Mon Sep 17 00:00:00 2001 From: asizemore Date: Fri, 10 Jan 2025 16:27:06 -0500 Subject: [PATCH 4/8] add tests for messy data --- R/method-pca.R | 6 ++++- tests/testthat/test-method-pca.R | 39 ++++++++++++++++++++++++++++---- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/R/method-pca.R b/R/method-pca.R index c52492c..d2d489f 100644 --- a/R/method-pca.R +++ b/R/method-pca.R @@ -33,10 +33,14 @@ setMethod(pca, "Collection", ntop <- min(ntop, ncol(features)) } + # Ensure ntop is at least 1. + if (ntop <= 1) { + stop("ntop must be at least 2.") + } + # Use prcomp to perform PCA. # The following is heavily borrowed from the deseq2 plotPCA function. rowVariances <- matrixStats::rowVars(t(as.matrix(features))) - print(length(rowVariances)) keepFeatures <- order(rowVariances, decreasing=TRUE)[seq_len(ntop)] pcaResult <- prcomp(features[, ..keepFeatures]) diff --git a/tests/testthat/test-method-pca.R b/tests/testthat/test-method-pca.R index 4e4c971..f39f485 100644 --- a/tests/testthat/test-method-pca.R +++ b/tests/testthat/test-method-pca.R @@ -6,7 +6,7 @@ test_that("The pca function works", { # Run pca output <- pca(testData, nPCs = 2) - expect_is(output, "data.table") + expect_s3_class(output, "data.table") expect_equal(nrow(output), nrow(getCollectionData(testData))) expect_equal(ncol(output), 3) expect_equal(names(output), c("entity.SampleID", "PC1", "PC2")) @@ -24,8 +24,39 @@ test_that("The pca function works", { fakeCollection <- new("Collection", data = fakeData, recordIdColumn = "entity.SampleID", name="test") output <- pca(fakeCollection, nPCs = 2, ntop=20) - expect_is(output, "data.table") + expect_s3_class(output, "data.table") expect_equal(nrow(output), nSamples) - expect_equal(ncol(output), 6) - expect_equal(names(output), c("entity.SampleID", "PC1", "PC2", "PC3", "PC4", "PC5")) + expect_equal(ncol(output), 3) + expect_equal(names(output), c("entity.SampleID", "PC1", "PC2")) +}) + +test_that("The pca function can handle messy data", { + + # Generate some fake data + nSamples <- 500 + fakeData <- data.table( + entity.SampleID = paste0("Sample", 1:nSamples), + feat1 = rnorm(nSamples), + feat2 = rnorm(nSamples), + feat3 = rnorm(nSamples) + ) + + fakeCollection <- new("Collection", data = fakeData, recordIdColumn = "entity.SampleID", name="test", imputeZero=T) + + # Add some missing values and ensure one sample gets dropped + fakeCollection@data[1:50, "feat1"] <- NA + fakeCollection@data[25:100, "feat2"] <- 0 + fakeCollection@data[(nSamples-50):nSamples, "feat3"] <- NA + fakeCollection@data[26, "feat3"] <- NA + + output <- pca(fakeCollection, nPCs = 2, ntop=5, verbose=T) + expect_s3_class(output, "data.table") + expect_equal(nrow(output), nSamples-1) + expect_equal(ncol(output), 3) + expect_equal(names(output), c("entity.SampleID", "PC1", "PC2")) + + + # Test with not enough features + expect_error(pca(fakeCollection, nPCs = 2, ntop=1, verbose=T), "ntop must be at least 2.") + }) From 5991845db4d885a57471ad1f0302fcca13ae1c7f Mon Sep 17 00:00:00 2001 From: asizemore Date: Thu, 16 Jan 2025 09:33:30 -0500 Subject: [PATCH 5/8] make the output a ComputeResult --- R/method-pca.R | 26 ++++++++++-- tests/testthat/test-method-pca.R | 68 ++++++++++++++++++++++---------- 2 files changed, 71 insertions(+), 23 deletions(-) diff --git a/R/method-pca.R b/R/method-pca.R index d2d489f..2280331 100644 --- a/R/method-pca.R +++ b/R/method-pca.R @@ -5,7 +5,7 @@ #' @param nPCs Number of principal components to return. Default 10 #' @param ntop Use the top ntop genes with the highest variance for the pca computation. Mirrors the deseq2 plotPCA argument. Default 500. #' @param verbose Boolean indicating if extra messaging should be printed. -#' @return A data table with the id columns and the first nPCs principal components. +#' @return A ComputeResult object. The data slot contains a data.table with the id columns and the first nPCs principal components. #' @export setGeneric("pca", function(collection, nPCs = 10, ntop = 500, verbose = c(TRUE, FALSE)) standardGeneric("pca"), @@ -45,11 +45,31 @@ setMethod(pca, "Collection", pcaResult <- prcomp(features[, ..keepFeatures]) - # Assemble output data + # Assemble the output ComputeResult data and variable metadata. dt <- assay[, ..allIdColumns] # The PCA results are in pcaResult$x. Keep the first nPCs PCS. dt <- cbind(dt, pcaResult$x[, 1:nPCs]) # this works fine even with one id column - return(dt) + variableMetadataList <- lapply(1:nPCs, function(i) { + veupathUtils::VariableMetadata( + variableClass = veupathUtils::VariableClass(value = "computed"), + variableSpec = veupathUtils::VariableSpec(variableId = paste0("PC",i), entityId = ''), # computed var so not assinging entity. + displayName = paste0("PC ",i), + displayRangeMin = min(pcaResult$x[,i]), + displayRangeMax = max(pcaResult$x[,i]), + dataType = veupathUtils::DataType(value = "NUMBER"), + dataShape = veupathUtils::DataShape(value = "CONTINUOUS") + ) + }) + + result <- new("ComputeResult") + result@name <- 'pca' + result@recordIdColumn <- recordIdColumn + result@ancestorIdColumns <- ancestorIdColumns + result@data <- dt + result@parameters <- paste0('recordIdColumn = ', recordIdColumn,", nPCs = ", nPCs, ', ntop = ', ntop) + result@computedVariableMetadata <- veupathUtils::VariableMetadataList(S4Vectors::SimpleList(variableMetadataList)) + + return(result) } ) \ No newline at end of file diff --git a/tests/testthat/test-method-pca.R b/tests/testthat/test-method-pca.R index f39f485..6052313 100644 --- a/tests/testthat/test-method-pca.R +++ b/tests/testthat/test-method-pca.R @@ -1,15 +1,25 @@ # Test PCA -test_that("The pca function works", { +test_that("The pca function produces the expected output", { testData <- testCountDataCollection # Run pca - output <- pca(testData, nPCs = 2) - expect_s3_class(output, "data.table") - expect_equal(nrow(output), nrow(getCollectionData(testData))) - expect_equal(ncol(output), 3) - expect_equal(names(output), c("entity.SampleID", "PC1", "PC2")) + load_all() + output <- veupathUtils::pca(testData, nPCs = 2) + expect_s4_class(output, "ComputeResult") + outputData <- output@data + expect_s3_class(outputData, "data.table") + expect_equal(nrow(outputData), nrow(getCollectionData(testData))) + expect_equal(ncol(outputData), 3) + expect_equal(names(outputData), c("entity.SampleID", "PC1", "PC2")) + computedVariableMetadata <- output@computedVariableMetadata + expect_s4_class(computedVariableMetadata, "VariableMetadataList") + expect_equal(length(computedVariableMetadata), 2) + expect_s4_class(computedVariableMetadata[[1]], "VariableMetadata") + expect_s4_class(computedVariableMetadata[[2]], "VariableMetadata") + expect_equal(computedVariableMetadata[[1]]@variableSpec@variableId, "PC1") + expect_equal(computedVariableMetadata[[2]]@variableSpec@variableId, "PC2") # Generate some fake data @@ -23,11 +33,20 @@ test_that("The pca function works", { fakeCollection <- new("Collection", data = fakeData, recordIdColumn = "entity.SampleID", name="test") - output <- pca(fakeCollection, nPCs = 2, ntop=20) - expect_s3_class(output, "data.table") - expect_equal(nrow(output), nSamples) - expect_equal(ncol(output), 3) - expect_equal(names(output), c("entity.SampleID", "PC1", "PC2")) + output <- veupathUtils::pca(fakeCollection, nPCs = 2, ntop=20) + expect_s4_class(output, "ComputeResult") + outputData <- output@data + expect_s3_class(outputData, "data.table") + expect_equal(nrow(outputData), nSamples) + expect_equal(ncol(outputData), 3) + expect_equal(names(outputData), c("entity.SampleID", "PC1", "PC2")) + computedVariableMetadata <- output@computedVariableMetadata + expect_s4_class(computedVariableMetadata, "VariableMetadataList") + expect_equal(length(computedVariableMetadata), 2) + expect_s4_class(computedVariableMetadata[[1]], "VariableMetadata") + expect_s4_class(computedVariableMetadata[[2]], "VariableMetadata") + expect_equal(computedVariableMetadata[[1]]@variableSpec@variableId, "PC1") + expect_equal(computedVariableMetadata[[2]]@variableSpec@variableId, "PC2") }) test_that("The pca function can handle messy data", { @@ -36,9 +55,9 @@ test_that("The pca function can handle messy data", { nSamples <- 500 fakeData <- data.table( entity.SampleID = paste0("Sample", 1:nSamples), - feat1 = rnorm(nSamples), - feat2 = rnorm(nSamples), - feat3 = rnorm(nSamples) + entity.feat1 = rnorm(nSamples), + entity.feat2 = rnorm(nSamples), + entity.feat3 = rnorm(nSamples) ) fakeCollection <- new("Collection", data = fakeData, recordIdColumn = "entity.SampleID", name="test", imputeZero=T) @@ -49,14 +68,23 @@ test_that("The pca function can handle messy data", { fakeCollection@data[(nSamples-50):nSamples, "feat3"] <- NA fakeCollection@data[26, "feat3"] <- NA - output <- pca(fakeCollection, nPCs = 2, ntop=5, verbose=T) - expect_s3_class(output, "data.table") - expect_equal(nrow(output), nSamples-1) - expect_equal(ncol(output), 3) - expect_equal(names(output), c("entity.SampleID", "PC1", "PC2")) + output <- veupathUtils::pca(fakeCollection, nPCs = 2, ntop=5, verbose=T) + expect_s4_class(output, "ComputeResult") + outputData <- output@data + expect_s3_class(outputData, "data.table") + expect_equal(nrow(outputData), nSamples-1) + expect_equal(ncol(outputData), 3) + expect_equal(names(outputData), c("entity.SampleID", "PC1", "PC2")) + computedVariableMetadata <- output@computedVariableMetadata + expect_s4_class(computedVariableMetadata, "VariableMetadataList") + expect_equal(length(computedVariableMetadata), 2) + expect_s4_class(computedVariableMetadata[[1]], "VariableMetadata") + expect_s4_class(computedVariableMetadata[[2]], "VariableMetadata") + expect_equal(computedVariableMetadata[[1]]@variableSpec@variableId, "PC1") + expect_equal(computedVariableMetadata[[2]]@variableSpec@variableId, "PC2") # Test with not enough features - expect_error(pca(fakeCollection, nPCs = 2, ntop=1, verbose=T), "ntop must be at least 2.") + expect_error(veupathUtils::pca(fakeCollection, nPCs = 2, ntop=1, verbose=T), "ntop must be at least 2.") }) From d23800d772651f768c7fe02d3291697b8a881235 Mon Sep 17 00:00:00 2001 From: asizemore Date: Fri, 17 Jan 2025 09:43:26 -0500 Subject: [PATCH 6/8] fix failing empty data test --- R/method-pca.R | 2 +- tests/testthat/test-method-pca.R | 25 ++++++++++++++++--------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/R/method-pca.R b/R/method-pca.R index 2280331..be2a66f 100644 --- a/R/method-pca.R +++ b/R/method-pca.R @@ -21,7 +21,7 @@ setMethod(pca, "Collection", recordIdColumn <- collection@recordIdColumn ancestorIdColumns <- collection@ancestorIdColumns allIdColumns <- c(recordIdColumn, ancestorIdColumns) - + # Remove id columns from the assay to get only the features. features <- assay[, -..allIdColumns] # features has samples as rows. diff --git a/tests/testthat/test-method-pca.R b/tests/testthat/test-method-pca.R index 6052313..ae56f52 100644 --- a/tests/testthat/test-method-pca.R +++ b/tests/testthat/test-method-pca.R @@ -60,28 +60,35 @@ test_that("The pca function can handle messy data", { entity.feat3 = rnorm(nSamples) ) - fakeCollection <- new("Collection", data = fakeData, recordIdColumn = "entity.SampleID", name="test", imputeZero=T) + fakeCollection <- new("Collection", + data = fakeData, + recordIdColumn = "entity.SampleID", + name="test", + imputeZero=TRUE + ) # Add some missing values and ensure one sample gets dropped - fakeCollection@data[1:50, "feat1"] <- NA - fakeCollection@data[25:100, "feat2"] <- 0 - fakeCollection@data[(nSamples-50):nSamples, "feat3"] <- NA - fakeCollection@data[26, "feat3"] <- NA + fakeCollection@data[1:50, "entity.feat1"] <- NA + fakeCollection@data[25:100, "entity.feat2"] <- 0 + fakeCollection@data[(nSamples-50):nSamples, "entity.feat3"] <- NA + fakeCollection@data[26, "entity.feat3"] <- NA - output <- veupathUtils::pca(fakeCollection, nPCs = 2, ntop=5, verbose=T) + output <- pca(fakeCollection, nPCs = 3, ntop=5, verbose=T) expect_s4_class(output, "ComputeResult") outputData <- output@data expect_s3_class(outputData, "data.table") expect_equal(nrow(outputData), nSamples-1) - expect_equal(ncol(outputData), 3) - expect_equal(names(outputData), c("entity.SampleID", "PC1", "PC2")) + expect_equal(ncol(outputData), 4) + expect_equal(names(outputData), c("entity.SampleID", "PC1", "PC2", "PC3")) computedVariableMetadata <- output@computedVariableMetadata expect_s4_class(computedVariableMetadata, "VariableMetadataList") - expect_equal(length(computedVariableMetadata), 2) + expect_equal(length(computedVariableMetadata), 3) expect_s4_class(computedVariableMetadata[[1]], "VariableMetadata") expect_s4_class(computedVariableMetadata[[2]], "VariableMetadata") + expect_s4_class(computedVariableMetadata[[3]], "VariableMetadata") expect_equal(computedVariableMetadata[[1]]@variableSpec@variableId, "PC1") expect_equal(computedVariableMetadata[[2]]@variableSpec@variableId, "PC2") + expect_equal(computedVariableMetadata[[3]]@variableSpec@variableId, "PC3") # Test with not enough features From 29fc9ddb1f314124ac0f3592727a6ff004623d14 Mon Sep 17 00:00:00 2001 From: asizemore Date: Tue, 21 Jan 2025 16:00:11 -0500 Subject: [PATCH 7/8] add entity to pca output vars --- R/method-pca.R | 7 +++++-- tests/testthat/test-method-pca.R | 3 +++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/R/method-pca.R b/R/method-pca.R index be2a66f..ed49b2c 100644 --- a/R/method-pca.R +++ b/R/method-pca.R @@ -21,6 +21,7 @@ setMethod(pca, "Collection", recordIdColumn <- collection@recordIdColumn ancestorIdColumns <- collection@ancestorIdColumns allIdColumns <- c(recordIdColumn, ancestorIdColumns) + entity <- veupathUtils::strSplit(recordIdColumn,".", 4, 1) # Remove id columns from the assay to get only the features. features <- assay[, -..allIdColumns] # features has samples as rows. @@ -53,7 +54,7 @@ setMethod(pca, "Collection", variableMetadataList <- lapply(1:nPCs, function(i) { veupathUtils::VariableMetadata( variableClass = veupathUtils::VariableClass(value = "computed"), - variableSpec = veupathUtils::VariableSpec(variableId = paste0("PC",i), entityId = ''), # computed var so not assinging entity. + variableSpec = veupathUtils::VariableSpec(variableId = paste0("PC",i), entityId = entity), displayName = paste0("PC ",i), displayRangeMin = min(pcaResult$x[,i]), displayRangeMax = max(pcaResult$x[,i]), @@ -68,7 +69,9 @@ setMethod(pca, "Collection", result@ancestorIdColumns <- ancestorIdColumns result@data <- dt result@parameters <- paste0('recordIdColumn = ', recordIdColumn,", nPCs = ", nPCs, ', ntop = ', ntop) - result@computedVariableMetadata <- veupathUtils::VariableMetadataList(S4Vectors::SimpleList(variableMetadataList)) + result@computedVariableMetadata <- veupathUtils::VariableMetadataList( + S4Vectors::SimpleList(variableMetadataList) + ) return(result) } diff --git a/tests/testthat/test-method-pca.R b/tests/testthat/test-method-pca.R index ae56f52..a2b7360 100644 --- a/tests/testthat/test-method-pca.R +++ b/tests/testthat/test-method-pca.R @@ -87,8 +87,11 @@ test_that("The pca function can handle messy data", { expect_s4_class(computedVariableMetadata[[2]], "VariableMetadata") expect_s4_class(computedVariableMetadata[[3]], "VariableMetadata") expect_equal(computedVariableMetadata[[1]]@variableSpec@variableId, "PC1") + expect_equal(computedVariableMetadata[[1]]@variableSpec@entityId, "entity") expect_equal(computedVariableMetadata[[2]]@variableSpec@variableId, "PC2") + expect_equal(computedVariableMetadata[[2]]@variableSpec@entityId, "entity") expect_equal(computedVariableMetadata[[3]]@variableSpec@variableId, "PC3") + expect_equal(computedVariableMetadata[[3]]@variableSpec@entityId, "entity") # Test with not enough features From adc86b89a8b27240d7cd2c032a2c9af7d49873d1 Mon Sep 17 00:00:00 2001 From: asizemore Date: Mon, 27 Jan 2025 18:14:19 -0500 Subject: [PATCH 8/8] add a PlotReference value to pca outputs --- R/class-VariableMetadata.R | 2 +- R/method-pca.R | 3 ++- man/pca.Rd | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/R/class-VariableMetadata.R b/R/class-VariableMetadata.R index 2db8f78..af20343 100644 --- a/R/class-VariableMetadata.R +++ b/R/class-VariableMetadata.R @@ -3,7 +3,7 @@ variable_classes <- c('native', 'derived', 'computed') #the other option is to just let this be any character vector so long as it has only a single value.. -plot_references <- c('xAxis', 'yAxis', 'zAxis', 'overlay', 'facet1', 'facet2', 'geo', 'latitude', 'longitude') +plot_references <- c('xAxis', 'yAxis', 'zAxis', 'overlay', 'facet1', 'facet2', 'geo', 'latitude', 'longitude', 'undefined') data_types <- c('NUMBER', 'STRING', 'INTEGER', 'DATE', 'LONGITUDE') data_shapes <- c('CONTINUOUS', 'CATEGORICAL', 'ORDINAL', 'BINARY') diff --git a/R/method-pca.R b/R/method-pca.R index ed49b2c..de2609a 100644 --- a/R/method-pca.R +++ b/R/method-pca.R @@ -59,7 +59,8 @@ setMethod(pca, "Collection", displayRangeMin = min(pcaResult$x[,i]), displayRangeMax = max(pcaResult$x[,i]), dataType = veupathUtils::DataType(value = "NUMBER"), - dataShape = veupathUtils::DataShape(value = "CONTINUOUS") + dataShape = veupathUtils::DataShape(value = "CONTINUOUS"), + plotReference = veupathUtils::PlotReference(value = "undefined") ) }) diff --git a/man/pca.Rd b/man/pca.Rd index 62805f9..c8333fd 100644 --- a/man/pca.Rd +++ b/man/pca.Rd @@ -16,7 +16,7 @@ pca(collection, nPCs = 10, ntop = 500, verbose = c(TRUE, FALSE)) \item{verbose}{Boolean indicating if extra messaging should be printed.} } \value{ -A data table with the id columns and the first nPCs principal components. +A ComputeResult object. The data slot contains a data.table with the id columns and the first nPCs principal components. } \description{ PCA