Skip to content

Commit

Permalink
Merge pull request #86 from stemangiola/speedup-aggregate_cells
Browse files Browse the repository at this point in the history
Speedup aggregate cells
  • Loading branch information
stemangiola authored Aug 25, 2023
2 parents 325b0a9 + dfc16a9 commit f38d190
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 41 deletions.
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ importFrom(Matrix,rowSums)
importFrom(S4Vectors,"metadata<-")
importFrom(S4Vectors,DataFrame)
importFrom(S4Vectors,metadata)
importFrom(S4Vectors,split)
importFrom(SingleCellExperiment,cbind)
importFrom(SingleCellExperiment,reducedDims)
importFrom(SummarizedExperiment,"assays<-")
Expand All @@ -69,6 +70,7 @@ importFrom(dplyr,filter)
importFrom(dplyr,full_join)
importFrom(dplyr,group_by)
importFrom(dplyr,group_by_drop_default)
importFrom(dplyr,group_split)
importFrom(dplyr,inner_join)
importFrom(dplyr,left_join)
importFrom(dplyr,mutate)
Expand Down Expand Up @@ -123,6 +125,7 @@ importFrom(rlang,quo_squash)
importFrom(stats,setNames)
importFrom(stringr,regex)
importFrom(stringr,str_detect)
importFrom(stringr,str_remove)
importFrom(stringr,str_replace_all)
importFrom(tibble,as_tibble)
importFrom(tibble,enframe)
Expand Down
112 changes: 76 additions & 36 deletions R/methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ setMethod("join_features", "SingleCellExperiment", function(.data,
# CRAN Note
.cell <- NULL
.feature <- NULL

# Shape is long
if (shape == "long") {
.data %>%
Expand All @@ -50,11 +50,11 @@ setMethod("join_features", "SingleCellExperiment", function(.data,
features=features,
all=all,
exclude_zeros=exclude_zeros)) %>%
select(!!c_(.data)$symbol, .feature,
select(!!c_(.data)$symbol, .feature,
contains(".abundance"), everything())
# Shape if wide
} else {
.data %>%
.data %>%
left_join(
by=c_(.data)$name,
get_abundance_sc_wide(
Expand Down Expand Up @@ -84,7 +84,7 @@ tidy <- function(object) {
#' @importFrom lifecycle deprecate_warn
#' @export
tidy.SingleCellExperiment <- function(object) {

# DEPRECATE
deprecate_warn(
when="1.1.1",
Expand All @@ -98,8 +98,8 @@ tidy.SingleCellExperiment <- function(object) {
#' @rdname aggregate_cells
#' @inherit ttservice::aggregate_cells
#' @aliases aggregate_cells,SingleCellExperiment-method
#'
#' @examples
#'
#' @examples
#' data(pbmc_small)
#' pbmc_small_pseudo_bulk <- pbmc_small |>
#' aggregate_cells(c(groups, ident), assays="counts")
Expand All @@ -110,48 +110,88 @@ tidy.SingleCellExperiment <- function(object) {
#' @importFrom Matrix rowSums
#' @importFrom ttservice aggregate_cells
#' @importFrom SummarizedExperiment assays assays<- assayNames
#' @importFrom S4Vectors split
#' @importFrom stringr str_remove
#' @importFrom dplyr group_split
#'
#'
#' @export
setMethod("aggregate_cells", "SingleCellExperiment", function(.data,
.sample=NULL, slot="data", assays=NULL,
.sample=NULL, slot="data", assays=NULL,
aggregation_function=Matrix::rowSums,
...) {

# Fix NOTEs
feature <- NULL
.sample <- enquo(.sample)

# Subset only wanted assays
if (!is.null(assays)) {
assays(.data) <- assays(.data)[assays]
}

.data %>%
nest(data=-!!.sample) %>%
mutate(.aggregated_cells=as.integer(map(data, ~ ncol(.x)))) %>%
mutate(
data=map(data, ~ {
# Loop over assays
map2(as.list(assays(.x)), assayNames(.x), ~ {
# Get counts
.x %>%
aggregation_function(na.rm=TRUE) %>%
enframe(
name ="feature",
value=sprintf("%s", .y)) %>%
mutate(feature=as.character(feature))
}) %>%
Reduce(function(...) full_join(..., by="feature"), .)
})
) %>%


grouping_factor =
.data |>
colData() |>
as_tibble() |>
select(!!.sample) |>
suppressMessages() |>
unite("my_id_to_split_by___", !!.sample, sep = "___") |>
pull(my_id_to_split_by___) |>
as.factor()

list_count_cells = table(grouping_factor) |> as.list()

# New method
list_assays =
.data |>
assays() |>
as.list() |>
map(~ .x |> splitColData(grouping_factor)) |>
unlist(recursive=FALSE)

list_assays =
list_assays |>
map2(names(list_assays), ~ {
# Get counts
.x %>%
aggregation_function(na.rm=TRUE) %>%
enframe(
name =".feature",
value="x") %>% # sprintf("%s", .y)) %>%

# In case we don't have rownames
mutate(.feature=as.character(.feature))
}) |>
enframe(name = ".sample") |>

# Clean groups
mutate(assay_name = assayNames(!!.data) |> rep(each=length(levels(grouping_factor)))) |>
mutate(.sample = .sample |> str_remove(assay_name) |> str_remove("\\.")) |>
group_split(.sample) |>
map(~ .x |> unnest(value) |> pivot_wider(names_from = assay_name, values_from = x) ) |>

# Add cell count
map2(
list_count_cells,
~ .x |> mutate(.aggregated_cells = .y)
)


do.call(rbind, list_assays) |>

left_join(
.data %>%
as_tibble() %>%
subset(!!.sample),
by=quo_names(.sample)) %>%
unnest(data) %>%
drop_class("tidySingleCellExperiment_nested") %>%
.data |>
colData() |>
as_tibble() |>
subset(!!.sample) |>
unite("my_id_to_split_by___", !!.sample, remove=FALSE, sep = "___"),
by= join_by(".sample" == "my_id_to_split_by___")
) |>

as_SummarizedExperiment(
.sample=!!.sample,
.transcript=feature,
.sample=.sample,
.transcript=.feature,
.abundance=!!as.symbol(names(.data@assays)))
})
32 changes: 27 additions & 5 deletions R/utilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,6 @@ get_special_column_name_cell <- function(name) {
list(name=name, symbol=as.symbol(name))
}

cell__ <- get_special_column_name_symbol(".cell")

#' @importFrom S4Vectors metadata
c_ <- function(x) {
Expand Down Expand Up @@ -442,12 +441,17 @@ get_specific_annotation_columns <- function(.data, .col) {
# x-annotation df
n_x <- .data |> distinct_at(vars(!!.col)) |> nrow()

# Exclude columns that have more values than my .col
columns_unique_length = .data |> select(-!!.col) |> lapply(function(x) unique(x) |> length())
columns_unique_length = columns_unique_length[columns_unique_length<=n_x]

.sample = .data |> select(!!.col) |> unite(".sample", !!.col) |> pull(.sample)

# element wise columns
.data |>
select(-!!.col) |>
colnames() |>
columns_unique_length |>
names() |>
map(~ {
n_.x <- .data |> distinct_at(vars(!!.col, .x)) |> nrow()
n_.x <- .data |> pull(all_of(.x)) |> paste(.sample) |> unique() |> length()
if (n_.x == n_x) .x else NULL
}) %>%
# Drop NULL
Expand Down Expand Up @@ -482,5 +486,23 @@ subset <- function(.data, .column) {
distinct()
}


splitColData <- function(x, f) {
# This is by @jma1991
# at https://github.com/drisso/SingleCellExperiment/issues/55

i <- split(seq_along(f), f)

v <- vector(mode = "list", length = length(i))

names(v) <- names(i)

for (n in names(i)) { v[[n]] <- x[, i[[n]]] }

return(v)

}

cell__ <- get_special_column_name_symbol(".cell")
feature__ <- get_special_column_name_symbol(".feature")
sample__ <- get_special_column_name_symbol(".sample")

0 comments on commit f38d190

Please sign in to comment.