Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature 54 add pca #56

Merged
merged 8 commits into from
Feb 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ Collate:
'data.R'
'method-correlation.R'
'method-differentialExpression.R'
'method-pca.R'
'methods-Bin.R'
'methods-CollectionWithMetadata.R'
'methods-Collections.R'
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ export(naToZero)
export(name)
export(new_data_frame)
export(nonZeroRound)
export(pca)
export(predicateFactory)
export(pruneFeatures)
export(removeAttr)
Expand Down Expand Up @@ -170,6 +171,7 @@ exportMethods(getStudyIdColumnName)
exportMethods(getVariableSpec)
exportMethods(getVariableSpecColumnName)
exportMethods(merge)
exportMethods(pca)
exportMethods(predicateFactory)
exportMethods(toJSON)
exportMethods(whichValuesInBin)
Expand Down
2 changes: 1 addition & 1 deletion R/class-VariableMetadata.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

variable_classes <- c('native', 'derived', 'computed')
#the other option is to just let this be any character vector so long as it has only a single value..
plot_references <- c('xAxis', 'yAxis', 'zAxis', 'overlay', 'facet1', 'facet2', 'geo', 'latitude', 'longitude')
plot_references <- c('xAxis', 'yAxis', 'zAxis', 'overlay', 'facet1', 'facet2', 'geo', 'latitude', 'longitude', 'undefined')
data_types <- c('NUMBER', 'STRING', 'INTEGER', 'DATE', 'LONGITUDE')
data_shapes <- c('CONTINUOUS', 'CATEGORICAL', 'ORDINAL', 'BINARY')

Expand Down
79 changes: 79 additions & 0 deletions R/method-pca.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@

#' PCA
#'
#' @param collection A Collection object
#' @param nPCs Number of principal components to return. Default 10
#' @param ntop Use the top ntop genes with the highest variance for the pca computation. Mirrors the deseq2 plotPCA argument. Default 500.
#' @param verbose Boolean indicating if extra messaging should be printed.
#' @return A ComputeResult object. The data slot contains a data.table with the id columns and the first nPCs principal components.
#' @export
setGeneric("pca",
function(collection, nPCs = 10, ntop = 500, verbose = c(TRUE, FALSE)) standardGeneric("pca"),
signature = c("collection")
)

#' @export
setMethod(pca, "Collection",
function(collection, nPCs = 10, ntop = 500, verbose = c(TRUE, FALSE)) {

verbose <- veupathUtils::matchArg(verbose)
assay <- getCollectionData(collection)
recordIdColumn <- collection@recordIdColumn
ancestorIdColumns <- collection@ancestorIdColumns
allIdColumns <- c(recordIdColumn, ancestorIdColumns)
entity <- veupathUtils::strSplit(recordIdColumn,".", 4, 1)

# Remove id columns from the assay to get only the features.
features <- assay[, -..allIdColumns] # features has samples as rows.

# Update ntop if it's too large.
if (ntop > ncol(features)) {
if (verbose) {
message("ntop is larger than the number of features. Using all features.")
}
ntop <- min(ntop, ncol(features))
}

# Ensure ntop is at least 1.
if (ntop <= 1) {
stop("ntop must be at least 2.")
}

# Use prcomp to perform PCA.
# The following is heavily borrowed from the deseq2 plotPCA function.
rowVariances <- matrixStats::rowVars(t(as.matrix(features)))
keepFeatures <- order(rowVariances, decreasing=TRUE)[seq_len(ntop)]
pcaResult <- prcomp(features[, ..keepFeatures])


# Assemble the output ComputeResult data and variable metadata.
dt <- assay[, ..allIdColumns]
# The PCA results are in pcaResult$x. Keep the first nPCs PCS.
dt <- cbind(dt, pcaResult$x[, 1:nPCs]) # this works fine even with one id column

variableMetadataList <- lapply(1:nPCs, function(i) {
veupathUtils::VariableMetadata(
variableClass = veupathUtils::VariableClass(value = "computed"),
variableSpec = veupathUtils::VariableSpec(variableId = paste0("PC",i), entityId = entity),
displayName = paste0("PC ",i),
displayRangeMin = min(pcaResult$x[,i]),
displayRangeMax = max(pcaResult$x[,i]),
dataType = veupathUtils::DataType(value = "NUMBER"),
dataShape = veupathUtils::DataShape(value = "CONTINUOUS"),
plotReference = veupathUtils::PlotReference(value = "undefined")
)
})

result <- new("ComputeResult")
result@name <- 'pca'
result@recordIdColumn <- recordIdColumn
result@ancestorIdColumns <- ancestorIdColumns
result@data <- dt
result@parameters <- paste0('recordIdColumn = ', recordIdColumn,", nPCs = ", nPCs, ', ntop = ', ntop)
result@computedVariableMetadata <- veupathUtils::VariableMetadataList(
S4Vectors::SimpleList(variableMetadataList)
)

return(result)
}
)
23 changes: 23 additions & 0 deletions man/pca.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

100 changes: 100 additions & 0 deletions tests/testthat/test-method-pca.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Test PCA

test_that("The pca function produces the expected output", {

testData <- testCountDataCollection

# Run pca
load_all()
output <- veupathUtils::pca(testData, nPCs = 2)
expect_s4_class(output, "ComputeResult")
outputData <- output@data
expect_s3_class(outputData, "data.table")
expect_equal(nrow(outputData), nrow(getCollectionData(testData)))
expect_equal(ncol(outputData), 3)
expect_equal(names(outputData), c("entity.SampleID", "PC1", "PC2"))
computedVariableMetadata <- output@computedVariableMetadata
expect_s4_class(computedVariableMetadata, "VariableMetadataList")
expect_equal(length(computedVariableMetadata), 2)
expect_s4_class(computedVariableMetadata[[1]], "VariableMetadata")
expect_s4_class(computedVariableMetadata[[2]], "VariableMetadata")
expect_equal(computedVariableMetadata[[1]]@variableSpec@variableId, "PC1")
expect_equal(computedVariableMetadata[[2]]@variableSpec@variableId, "PC2")


# Generate some fake data
nSamples <- 500
fakeData <- data.table(
entity.SampleID = paste0("Sample", 1:nSamples),
feat1 = rnorm(nSamples),
feat2 = rnorm(nSamples),
feat3 = rnorm(nSamples)
)

fakeCollection <- new("Collection", data = fakeData, recordIdColumn = "entity.SampleID", name="test")

output <- veupathUtils::pca(fakeCollection, nPCs = 2, ntop=20)
expect_s4_class(output, "ComputeResult")
outputData <- output@data
expect_s3_class(outputData, "data.table")
expect_equal(nrow(outputData), nSamples)
expect_equal(ncol(outputData), 3)
expect_equal(names(outputData), c("entity.SampleID", "PC1", "PC2"))
computedVariableMetadata <- output@computedVariableMetadata
expect_s4_class(computedVariableMetadata, "VariableMetadataList")
expect_equal(length(computedVariableMetadata), 2)
expect_s4_class(computedVariableMetadata[[1]], "VariableMetadata")
expect_s4_class(computedVariableMetadata[[2]], "VariableMetadata")
expect_equal(computedVariableMetadata[[1]]@variableSpec@variableId, "PC1")
expect_equal(computedVariableMetadata[[2]]@variableSpec@variableId, "PC2")
})

test_that("The pca function can handle messy data", {

# Generate some fake data
nSamples <- 500
fakeData <- data.table(
entity.SampleID = paste0("Sample", 1:nSamples),
entity.feat1 = rnorm(nSamples),
entity.feat2 = rnorm(nSamples),
entity.feat3 = rnorm(nSamples)
)

fakeCollection <- new("Collection",
data = fakeData,
recordIdColumn = "entity.SampleID",
name="test",
imputeZero=TRUE
)

# Add some missing values and ensure one sample gets dropped
fakeCollection@data[1:50, "entity.feat1"] <- NA
fakeCollection@data[25:100, "entity.feat2"] <- 0
fakeCollection@data[(nSamples-50):nSamples, "entity.feat3"] <- NA
fakeCollection@data[26, "entity.feat3"] <- NA

output <- pca(fakeCollection, nPCs = 3, ntop=5, verbose=T)
expect_s4_class(output, "ComputeResult")
outputData <- output@data
expect_s3_class(outputData, "data.table")
expect_equal(nrow(outputData), nSamples-1)
expect_equal(ncol(outputData), 4)
expect_equal(names(outputData), c("entity.SampleID", "PC1", "PC2", "PC3"))
computedVariableMetadata <- output@computedVariableMetadata
expect_s4_class(computedVariableMetadata, "VariableMetadataList")
expect_equal(length(computedVariableMetadata), 3)
expect_s4_class(computedVariableMetadata[[1]], "VariableMetadata")
expect_s4_class(computedVariableMetadata[[2]], "VariableMetadata")
expect_s4_class(computedVariableMetadata[[3]], "VariableMetadata")
expect_equal(computedVariableMetadata[[1]]@variableSpec@variableId, "PC1")
expect_equal(computedVariableMetadata[[1]]@variableSpec@entityId, "entity")
expect_equal(computedVariableMetadata[[2]]@variableSpec@variableId, "PC2")
expect_equal(computedVariableMetadata[[2]]@variableSpec@entityId, "entity")
expect_equal(computedVariableMetadata[[3]]@variableSpec@variableId, "PC3")
expect_equal(computedVariableMetadata[[3]]@variableSpec@entityId, "entity")


# Test with not enough features
expect_error(veupathUtils::pca(fakeCollection, nPCs = 2, ntop=1, verbose=T), "ntop must be at least 2.")

})
Loading