From 5f48a4adf97c70eddd680f281f71317b3915c2de Mon Sep 17 00:00:00 2001 From: ehennestad Date: Sat, 23 Nov 2024 10:43:56 +0100 Subject: [PATCH 01/19] Add initial chunk parameters and function to read it from file --- +io/+config/readDefaultChunkConfiguration.m | 19 ++++++++++++++++ configuration/chunk_params.json | 25 +++++++++++++++++++++ 2 files changed, 44 insertions(+) create mode 100644 +io/+config/readDefaultChunkConfiguration.m create mode 100644 configuration/chunk_params.json diff --git a/+io/+config/readDefaultChunkConfiguration.m b/+io/+config/readDefaultChunkConfiguration.m new file mode 100644 index 00000000..46365403 --- /dev/null +++ b/+io/+config/readDefaultChunkConfiguration.m @@ -0,0 +1,19 @@ +function configObject = readDefaultChunkConfiguration() +% READDEFAULTCHUNKCONFIGURATION Reads the default chunking configuration from a JSON file. +% +% configObject = READDEFAULTCHUNKCONFIGURATION() loads the default chunking +% parameters from a JSON configuration file located in the 'configuration' +% directory within the MatNWB directory. +% +% Output: +% configObject - A MATLAB structure containing the chunking parameters +% defined in the JSON configuration file. +% +% Example: +% % Load the default chunk configuration +% config = readDefaultChunkConfiguration(); +% disp(config); + + configFilePath = fullfile(misc.getMatnwbDir, 'configuration', 'chunk_params.json'); + configObject = jsondecode(fileread(configFilePath)); +end diff --git a/configuration/chunk_params.json b/configuration/chunk_params.json new file mode 100644 index 00000000..f9f2e3c8 --- /dev/null +++ b/configuration/chunk_params.json @@ -0,0 +1,25 @@ +{ + "NWBContainer": { + "chunk_compression": "gzip", + "chunk_compression_args": 4, + "chunk_default_size": 10000000.0, + "chunk_default_size_unit": "bytes" + }, + "ElectricalSeries": { + "data": { + "chunk_dimensions": [ + null, + 32 + ] + } + }, + "ImageSeries": { + "data": { + "chunk_dimensions": [ + null, + "max", + "max" + ] + } + } +} From 4822417601ba34f95442a921e6925c28ae96d9fc Mon Sep 17 00:00:00 2001 From: ehennestad Date: Sat, 23 Nov 2024 12:38:52 +0100 Subject: [PATCH 02/19] First draft of applying chunk configurations --- .../+internal/computeChunkSizeFromConfig.m | 62 +++++++++++++++ +io/+config/+internal/getDataByteSize.m | 7 ++ .../+internal/resolveDataTypeChunkConfig.m | 76 +++++++++++++++++++ +io/+config/applyChunkConfiguration.m | 49 ++++++++++++ +io/+config/readDefaultChunkConfiguration.m | 2 +- configuration/chunk_params.json | 26 ++++++- 6 files changed, 220 insertions(+), 2 deletions(-) create mode 100644 +io/+config/+internal/computeChunkSizeFromConfig.m create mode 100644 +io/+config/+internal/getDataByteSize.m create mode 100644 +io/+config/+internal/resolveDataTypeChunkConfig.m create mode 100644 +io/+config/applyChunkConfiguration.m diff --git a/+io/+config/+internal/computeChunkSizeFromConfig.m b/+io/+config/+internal/computeChunkSizeFromConfig.m new file mode 100644 index 00000000..14f0d7d7 --- /dev/null +++ b/+io/+config/+internal/computeChunkSizeFromConfig.m @@ -0,0 +1,62 @@ +function chunkSize = computeChunkSizeFromConfig(A, chunkSpecification) +% computeChunkSizeFromConfig - Compute the chunk size for a dataset using the provided specification. +% This function determines the chunk size for a dataset based on the chunk +% dimensions provided in the chunkSpecification. 
It adjusts dimensions according +% to rules: 'max' uses the dataset size, fixed numbers use their value, and 'null' +% calculates the dimension size to approximate the target chunk size in bytes. +% +% Inputs: +% A - A numeric dataset whose chunk size is to be computed. +% chunkSpecification (1,1) struct - Struct defining chunk dimensions and settings. +% +% Output: +% chunkSize - A vector specifying the chunk size for each dimension. + + arguments + A {mustBeNumeric} + chunkSpecification (1,1) struct + end + + % Get dataset size + dataSize = size(A); + dataSize = fliplr(dataSize); % matnwb quirk + numDimensions = numel(dataSize); + + % Extract relevant configuration parameters + chunkDimensions = squeeze(chunkSpecification.data.chunk_dimensions); + defaultChunkSize = chunkSpecification.chunk_default_size; % in bytes + dataByteSize = io.config.internal.getDataByteSize(A); + + % Initialize chunk size array + chunkSize = zeros(1, numDimensions); + + % Calculate chunk size for each dimension + for dim = 1:numDimensions + if dim > numel(chunkDimensions) + % Use full size for dimensions beyond the specification + chunkSize(dim) = dataSize(dim); + else + dimSpec = chunkDimensions{dim}; + if isempty(dimSpec) + % Compute chunk size for 'null' dimensions + % Estimate proportional size based on remaining chunk size + remainingChunkSize = defaultChunkSize / dataByteSize; % scale factor for all dimensions + nullDimensions = find(cellfun(@isempty, chunkDimensions)); + proportionalSize = nthroot(remainingChunkSize, numel(nullDimensions)); + chunkSize(dim) = max(1, round(proportionalSize*dataSize(dim))); + elseif isnumeric(dimSpec) + % Fixed chunk size + chunkSize(dim) = dimSpec; + elseif ischar(dimSpec) && strcmp(dimSpec, 'max') + % Use full dimension size + chunkSize(dim) = dataSize(dim); + else + error('Invalid chunk specification for dimension %d.', dim); + end + end + end + + % Ensure chunk size does not exceed dataset dimensions + chunkSize = min(chunkSize, dataSize); + chunkSize = fliplr(chunkSize); +end diff --git a/+io/+config/+internal/getDataByteSize.m b/+io/+config/+internal/getDataByteSize.m new file mode 100644 index 00000000..b24a6617 --- /dev/null +++ b/+io/+config/+internal/getDataByteSize.m @@ -0,0 +1,7 @@ +function byteSize = getDataByteSize(data) +% getDataByteSize - Get bytesize of a numeric array + dataType = class(data); + bytesPerDataPoint = io.getMatTypeSize(dataType); + + byteSize = numel(data) .* bytesPerDataPoint; +end diff --git a/+io/+config/+internal/resolveDataTypeChunkConfig.m b/+io/+config/+internal/resolveDataTypeChunkConfig.m new file mode 100644 index 00000000..f5d8df2e --- /dev/null +++ b/+io/+config/+internal/resolveDataTypeChunkConfig.m @@ -0,0 +1,76 @@ +function resolvedOptions = resolveDataTypeChunkConfig(chunkSpecification, nwbObject) +% resolveDataTypeChunkConfig - Resolve the chunk options for individual datatypes +% This function resolves the chunk configuration options for a given NWB object +% by traversing the object hierarchy and combining options from the most specific +% type to the base type, as defined in the chunkSpecification. +% +% Input: +% chunkSpecification (struct): A struct representation of the chunk configuration JSON. +% nwbObject (types.untyped.MetaClass): An NWB object whose chunk configuration will be resolved. +% +% Output: +% resolvedOptions (struct): A struct containing the resolved chunk configuration options. 
+ + arguments + chunkSpecification (1,1) struct + nwbObject (1,1) types.untyped.MetaClass + end + + % Initialize resolvedOptions with an empty struct + resolvedOptions = struct(); + + % Get the NWB object type hierarchy (from most specific to base type) + typeHierarchy = getTypeHierarchy(nwbObject); + + % Traverse the type hierarchy to resolve options + for i = numel(typeHierarchy):-1:1 + typeName = typeHierarchy{i}; + + % Check if the type has a chunkSpecification + if isfield(chunkSpecification, typeName) + typeOptions = chunkSpecification.(typeName); + + % Merge options into resolvedOptions + resolvedOptions = mergeStructs(resolvedOptions, typeOptions); + end + end +end + +function typeHierarchy = getTypeHierarchy(nwbObject) +% getTypeHierarchy - Retrieve the type hierarchy of an NWB object. +% This function returns a cell array of type names, starting from the specific +% type of the given NWB object up to its base type. + + typeHierarchy = {}; % Initialize an empty cell array + currentType = class(nwbObject); % Start with the specific type + + while ~isempty(currentType) + shortClassName = regexp(currentType, '[^.]+$', 'match', 'once'); + typeHierarchy{end+1} = shortClassName; %#ok + + % Use MetaClass information to get the parent type + metaClass = meta.class.fromName(currentType); + if isempty(metaClass.SuperclassList) + break; % Reached the base type + end + currentType = metaClass.SuperclassList(1).Name; + end +end + +function merged = mergeStructs(baseStruct, newStruct) +% mergeStructs - Merge two structs, with fields in newStruct overriding those in baseStruct. + + merged = baseStruct; % Start with the base struct + + fields = fieldnames(newStruct); + for i = 1:numel(fields) + field = fields{i}; + if isstruct(newStruct.(field)) && isfield(baseStruct, field) && isstruct(baseStruct.(field)) + % Recursively merge if both fields are structs + merged.(field) = mergeStructs(baseStruct.(field), newStruct.(field)); + else + % Otherwise, override the field + merged.(field) = newStruct.(field); + end + end +end diff --git a/+io/+config/applyChunkConfiguration.m b/+io/+config/applyChunkConfiguration.m new file mode 100644 index 00000000..6f59fb45 --- /dev/null +++ b/+io/+config/applyChunkConfiguration.m @@ -0,0 +1,49 @@ +function applyChunkConfiguration(nwbObject, chunkConfiguration) + arguments + nwbObject (1,1) NwbFile + chunkConfiguration (1,1) struct = io.config.readDefaultChunkConfiguration() + end + + objectMap = nwbObject.searchFor(''); + objectKeys = objectMap.keys(); + + filteredObjectMap = containers.Map(); + for i = 1:numel(objectKeys) + thisObjectKey = objectKeys{i}; + thisNwbObject = objectMap(thisObjectKey); + if startsWith(class(thisNwbObject), "types.") && ~startsWith(class(thisNwbObject), "types.untyped") + filteredObjectMap(thisObjectKey) = thisNwbObject; + end + end + clear objectMap + + objectKeys = filteredObjectMap.keys(); + for i = 1:numel(objectKeys) + thisObjectKey = objectKeys{i}; + thisNwbObject = filteredObjectMap(thisObjectKey); + + % Todo: Find dataset properties where it makes sense to do chunking + % I.e data, timestamps etc. Can this be determined automatically, + % or do we need a lookup? + + dataTypeChunkOptions = io.config.internal.resolveDataTypeChunkConfig(chunkConfiguration, thisNwbObject); + + if isprop(thisNwbObject, 'data') + if ~isa(thisNwbObject.data, 'types.untyped.DataPipe') + % Create a datapipe object for the property value. 
+ dataByteSize = io.config.internal.getDataByteSize(thisNwbObject.data); + if dataByteSize > dataTypeChunkOptions.chunk_default_size + chunkSize = io.config.internal.computeChunkSizeFromConfig(thisNwbObject.data, dataTypeChunkOptions); + maxSize = size(thisNwbObject.data); + + dataPipe = types.untyped.DataPipe( ... + 'data', thisNwbObject.data, ... + 'maxSize', maxSize, ... + 'chunkSize', chunkSize, ... + 'compressionLevel', dataTypeChunkOptions.chunk_compression_args); + thisNwbObject.data = dataPipe; + end + end + end + end +end diff --git a/+io/+config/readDefaultChunkConfiguration.m b/+io/+config/readDefaultChunkConfiguration.m index 46365403..1c08c375 100644 --- a/+io/+config/readDefaultChunkConfiguration.m +++ b/+io/+config/readDefaultChunkConfiguration.m @@ -2,7 +2,7 @@ % READDEFAULTCHUNKCONFIGURATION Reads the default chunking configuration from a JSON file. % % configObject = READDEFAULTCHUNKCONFIGURATION() loads the default chunking -% parameters from a JSON configuration file located in the 'configuration' +% parameters from a JSON configuration file located in the 'configuration' % directory within the MatNWB directory. % % Output: diff --git a/configuration/chunk_params.json b/configuration/chunk_params.json index f9f2e3c8..2ce804d9 100644 --- a/configuration/chunk_params.json +++ b/configuration/chunk_params.json @@ -3,7 +3,23 @@ "chunk_compression": "gzip", "chunk_compression_args": 4, "chunk_default_size": 10000000.0, - "chunk_default_size_unit": "bytes" + "chunk_default_size_unit": "bytes", + "data": { + "chunk_dimensions": [ + null + ] + } + }, + "Data": { + "chunk_compression": "gzip", + "chunk_compression_args": 4, + "chunk_default_size": 10000000.0, + "chunk_default_size_unit": "bytes", + "data": { + "chunk_dimensions": [ + null + ] + } }, "ElectricalSeries": { "data": { @@ -13,6 +29,14 @@ ] } }, + "TimeSeries": { + "data": { + "chunk_dimensions": [ + null, + 32 + ] + } + }, "ImageSeries": { "data": { "chunk_dimensions": [ From cfdefd6e4eb4de4540d4edd127486f8d4cb7fdbf Mon Sep 17 00:00:00 2001 From: ehennestad Date: Sat, 23 Nov 2024 12:55:12 +0100 Subject: [PATCH 03/19] Minor fixes --- +io/+config/+internal/computeChunkSizeFromConfig.m | 2 +- +io/+config/applyChunkConfiguration.m | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/+io/+config/+internal/computeChunkSizeFromConfig.m b/+io/+config/+internal/computeChunkSizeFromConfig.m index 14f0d7d7..fdfa5a34 100644 --- a/+io/+config/+internal/computeChunkSizeFromConfig.m +++ b/+io/+config/+internal/computeChunkSizeFromConfig.m @@ -23,7 +23,7 @@ numDimensions = numel(dataSize); % Extract relevant configuration parameters - chunkDimensions = squeeze(chunkSpecification.data.chunk_dimensions); + chunkDimensions = chunkSpecification.data.chunk_dimensions; defaultChunkSize = chunkSpecification.chunk_default_size; % in bytes dataByteSize = io.config.internal.getDataByteSize(A); diff --git a/+io/+config/applyChunkConfiguration.m b/+io/+config/applyChunkConfiguration.m index 6f59fb45..e0d0856e 100644 --- a/+io/+config/applyChunkConfiguration.m +++ b/+io/+config/applyChunkConfiguration.m @@ -29,7 +29,7 @@ function applyChunkConfiguration(nwbObject, chunkConfiguration) dataTypeChunkOptions = io.config.internal.resolveDataTypeChunkConfig(chunkConfiguration, thisNwbObject); if isprop(thisNwbObject, 'data') - if ~isa(thisNwbObject.data, 'types.untyped.DataPipe') + if isnumeric(thisNwbObject.data) % Create a datapipe object for the property value. 
dataByteSize = io.config.internal.getDataByteSize(thisNwbObject.data); if dataByteSize > dataTypeChunkOptions.chunk_default_size From e164ce020a15464e93837196b36f97e116d0af58 Mon Sep 17 00:00:00 2001 From: ehennestad Date: Tue, 21 Jan 2025 11:13:53 +0100 Subject: [PATCH 04/19] Create listDatasetsOfNeurodataType.m --- +schemes/listDatasetsOfNeurodataType.m | 32 ++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 +schemes/listDatasetsOfNeurodataType.m diff --git a/+schemes/listDatasetsOfNeurodataType.m b/+schemes/listDatasetsOfNeurodataType.m new file mode 100644 index 00000000..8a6ee902 --- /dev/null +++ b/+schemes/listDatasetsOfNeurodataType.m @@ -0,0 +1,32 @@ +function datasetNames = listDatasetsOfNeurodataType(typeClassName) +% listDatasetsOfNeurodataType - List names of datasets of a neurodata type +% +% Input Arguments: +% - typeClassName (string) - +% Full MatNWB class name for a neurodata type, i.e "types.core.TimeSeries" +% +% Output Arguments: +% - datasetNames (string) - +% Names of datasets contained in the specified neurodata type + + arguments + typeClassName (1,1) string + end + + classNameSplit = string( split(typeClassName, '.') ); + typesIdx = find(classNameSplit == "types"); + + assert(~isempty(typesIdx), 'Expected class name to contain "types"') + namespaceName = classNameSplit(typesIdx+1); + namespace = schemes.loadNamespace(namespaceName, misc.getMatnwbDir); + + neurodataTypeName = classNameSplit(typesIdx+2); + typeScheme = namespace.registry(neurodataTypeName); + + datasetMaps = typeScheme('datasets'); + + datasetNames = repmat("", size(datasetMaps)); + for i = 1:numel(datasetMaps) + datasetNames(i) = datasetMaps{i}('name'); + end +end From e5f9bc74a56ce7c3e7f70d37f23b9d3c9fe5a102 Mon Sep 17 00:00:00 2001 From: ehennestad Date: Tue, 21 Jan 2025 17:07:42 +0100 Subject: [PATCH 05/19] Add new template for dataset configuration json --- .../cloud_dataset_configuration.json | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 configuration/cloud_dataset_configuration.json diff --git a/configuration/cloud_dataset_configuration.json b/configuration/cloud_dataset_configuration.json new file mode 100644 index 00000000..b410b2fc --- /dev/null +++ b/configuration/cloud_dataset_configuration.json @@ -0,0 +1,48 @@ +{ + "Default": { + "layout": "chunked", + "target_chunk_size": { + "value": 10000000, + "unit": "bytes" + }, + "chunk_dimensions": [ + [null], + [null, "max"], + [null, "max", "max"], + [null, "max", "max", "max"] + ], + "compression": { + "algorithm": "deflate", + "level": 3, + "parameters": {}, + "prefilters": ["shuffle"] + } + }, + "TimeSeries": { + "data": { + "chunk_dimensions": [[null, 32], [null, 32, "max"]], + "compression": { + "algorithm": "deflate", + "level": 4 + } + }, + "timestamps": { + "chunk_dimensions": [null] + } + }, + "ImageSeries": { + "data": { + "chunk_dimensions": [[null, "max", "max"], [null, "max", "max", "max"]] + } + }, + "ElectricalSeries": { + "data": { + "chunk_dimensions": [[null, 32], [null, 32, "max"]] + } + }, + "SpikeEventSeries": { + "data": { + "chunk_dimensions": [1000] + } + } +} \ No newline at end of file From c7402d896b911ac02b20f7a84d173e3df1b68097 Mon Sep 17 00:00:00 2001 From: ehennestad Date: Tue, 21 Jan 2025 17:09:45 +0100 Subject: [PATCH 06/19] Update applyChunkConfiguration and dependent functions to work with new template --- .../+internal/computeChunkSizeFromConfig.m | 9 +- .../+internal/configureDataPipeFromData.m | 41 ++++++ 
+io/+config/+internal/reconfigureDataPipe.m | 4 + .../+internal/resolveDataTypeChunkConfig.m | 17 ++- +io/+config/applyChunkConfiguration.m | 119 ++++++++++++------ +io/+config/readDefaultChunkConfiguration.m | 25 ++-- +matnwb/+common/getParentType.m | 7 ++ +schemes/listDatasetsOfNeurodataType.m | 26 +++- 8 files changed, 187 insertions(+), 61 deletions(-) create mode 100644 +io/+config/+internal/configureDataPipeFromData.m create mode 100644 +io/+config/+internal/reconfigureDataPipe.m create mode 100644 +matnwb/+common/getParentType.m diff --git a/+io/+config/+internal/computeChunkSizeFromConfig.m b/+io/+config/+internal/computeChunkSizeFromConfig.m index fdfa5a34..f25158dc 100644 --- a/+io/+config/+internal/computeChunkSizeFromConfig.m +++ b/+io/+config/+internal/computeChunkSizeFromConfig.m @@ -23,8 +23,13 @@ numDimensions = numel(dataSize); % Extract relevant configuration parameters - chunkDimensions = chunkSpecification.data.chunk_dimensions; - defaultChunkSize = chunkSpecification.chunk_default_size; % in bytes + chunkDimensions = chunkSpecification.chunk_dimensions; + if iscell(chunkDimensions) + numChunkDimensions = cellfun(@numel, chunkDimensions); + chunkDimensions = chunkDimensions{numChunkDimensions == numDimensions}; + end + + defaultChunkSize = chunkSpecification.target_chunk_size.value; % in bytes dataByteSize = io.config.internal.getDataByteSize(A); % Initialize chunk size array diff --git a/+io/+config/+internal/configureDataPipeFromData.m b/+io/+config/+internal/configureDataPipeFromData.m new file mode 100644 index 00000000..cbc7a46f --- /dev/null +++ b/+io/+config/+internal/configureDataPipeFromData.m @@ -0,0 +1,41 @@ +function dataPipe = configureDataPipeFromData(numericData, datasetConfig) +% configureDataPipeFromData - Configure a DataPipe from numeric data and dataset configuration + + import io.config.internal.computeChunkSizeFromConfig + import types.untyped.datapipe.properties.DynamicFilter + + chunkSize = computeChunkSizeFromConfig(numericData, datasetConfig); + maxSize = size(numericData); + + dataPipeArgs = {... + "data", numericData, ... + "maxSize", maxSize, ... + "chunkSize", chunkSize }; + + hasShuffle = contains(datasetConfig.compression.prefilters, 'shuffle'); + + if strcmpi(datasetConfig.compression.algorithm, "Deflate") + % Use standard compression filters + dataPipeArgs = [ dataPipeArgs, ... + {'hasShuffle', hasShuffle, ... + 'compressionLevel', datasetConfig.compression.level} ... + ]; + else + % Create property list of custom filters for dataset creation + compressionFilter = DynamicFilter( ... + datasetConfig.compression.algorithm, ... + datasetConfig.compression.level ); + + if hasShuffle + shuffleFilter = types.untyped.datapipe.properties.Shuffle(); + filters = [shuffleFilter compressionFilter]; + else + filters = compressionFilter; + end + dataPipeArgs = [ dataPipeArgs, ... + {'filters', filters} ]; + end + + % Create the datapipe. 
+ dataPipe = types.untyped.DataPipe( dataPipeArgs{:} ); +end \ No newline at end of file diff --git a/+io/+config/+internal/reconfigureDataPipe.m b/+io/+config/+internal/reconfigureDataPipe.m new file mode 100644 index 00000000..183c0994 --- /dev/null +++ b/+io/+config/+internal/reconfigureDataPipe.m @@ -0,0 +1,4 @@ +function dataPipe = reconfigureDataPipe(dataPipe, datasetConfig) + % todo +end + diff --git a/+io/+config/+internal/resolveDataTypeChunkConfig.m b/+io/+config/+internal/resolveDataTypeChunkConfig.m index f5d8df2e..f701450b 100644 --- a/+io/+config/+internal/resolveDataTypeChunkConfig.m +++ b/+io/+config/+internal/resolveDataTypeChunkConfig.m @@ -1,4 +1,4 @@ -function resolvedOptions = resolveDataTypeChunkConfig(chunkSpecification, nwbObject) +function resolvedOptions = resolveDataTypeChunkConfig(chunkSpecification, nwbObject, datasetName) % resolveDataTypeChunkConfig - Resolve the chunk options for individual datatypes % This function resolves the chunk configuration options for a given NWB object % by traversing the object hierarchy and combining options from the most specific @@ -14,10 +14,11 @@ arguments chunkSpecification (1,1) struct nwbObject (1,1) types.untyped.MetaClass + datasetName (1,1) string end - % Initialize resolvedOptions with an empty struct - resolvedOptions = struct(); + % Initialize resolvedOptions with default options. + resolvedOptions = chunkSpecification.Default; % Get the NWB object type hierarchy (from most specific to base type) typeHierarchy = getTypeHierarchy(nwbObject); @@ -26,12 +27,16 @@ for i = numel(typeHierarchy):-1:1 typeName = typeHierarchy{i}; - % Check if the type has a chunkSpecification + % Check if the neurodata type has a chunkSpecification if isfield(chunkSpecification, typeName) typeOptions = chunkSpecification.(typeName); - % Merge options into resolvedOptions - resolvedOptions = mergeStructs(resolvedOptions, typeOptions); + % Is datasetName part of typeOptions? + if isfield(typeOptions, datasetName) + % Merge options into resolvedOptions + datasetOptions = typeOptions.(datasetName); + resolvedOptions = mergeStructs(resolvedOptions, datasetOptions); + end end end end diff --git a/+io/+config/applyChunkConfiguration.m b/+io/+config/applyChunkConfiguration.m index e0d0856e..568620fc 100644 --- a/+io/+config/applyChunkConfiguration.m +++ b/+io/+config/applyChunkConfiguration.m @@ -1,49 +1,90 @@ -function applyChunkConfiguration(nwbObject, chunkConfiguration) +function applyChunkConfiguration(nwbObject, chunkConfiguration, options) +% applyChunkConfiguration - Apply chunk configuration to datasets of an NWB object + arguments - nwbObject (1,1) NwbFile - chunkConfiguration (1,1) struct = io.config.readDefaultChunkConfiguration() + nwbObject (1,1) types.untyped.MetaClass + chunkConfiguration (1,1) struct = io.config.readDefaultChunkConfiguration() % Todo: class for this...? 
+ options.OverrideExisting (1,1) logical = false end + + import io.config.internal.resolveDataTypeChunkConfig - objectMap = nwbObject.searchFor(''); - objectKeys = objectMap.keys(); - - filteredObjectMap = containers.Map(); - for i = 1:numel(objectKeys) - thisObjectKey = objectKeys{i}; - thisNwbObject = objectMap(thisObjectKey); - if startsWith(class(thisNwbObject), "types.") && ~startsWith(class(thisNwbObject), "types.untyped") - filteredObjectMap(thisObjectKey) = thisNwbObject; - end + if isa(nwbObject, 'NwbFile') + neurodataObjects = getNeurodataObjectsFromNwbFile(nwbObject); + else + neurodataObjects = {nwbObject}; end - clear objectMap + + for iNeurodataObject = 1:numel(neurodataObjects) + thisNeurodataObject = neurodataObjects{iNeurodataObject}; + thisNeurodataClassName = class(thisNeurodataObject); + + % Need to keep track of this. A dataset can be defined across + % multiple levels of the class hierarchy, the lowest class should + % take precedence + processedDatasets = string.empty; + + isFinished = false; + while ~isFinished % Iterate over type and it's ancestor types (superclasses) + + datasetNames = schemes.listDatasetsOfNeurodataType( thisNeurodataClassName ); + + for thisDatasetName = datasetNames % Iterate over all datasets of a type... + + if ismember(thisDatasetName, processedDatasets) + continue + end + + datasetConfig = resolveDataTypeChunkConfig(... + chunkConfiguration, ... + thisNeurodataObject, ... + thisDatasetName); - objectKeys = filteredObjectMap.keys(); - for i = 1:numel(objectKeys) - thisObjectKey = objectKeys{i}; - thisNwbObject = filteredObjectMap(thisObjectKey); - - % Todo: Find dataset properties where it makes sense to do chunking - % I.e data, timestamps etc. Can this be determined automatically, - % or do we need a lookup? - - dataTypeChunkOptions = io.config.internal.resolveDataTypeChunkConfig(chunkConfiguration, thisNwbObject); - - if isprop(thisNwbObject, 'data') - if isnumeric(thisNwbObject.data) - % Create a datapipe object for the property value. - dataByteSize = io.config.internal.getDataByteSize(thisNwbObject.data); - if dataByteSize > dataTypeChunkOptions.chunk_default_size - chunkSize = io.config.internal.computeChunkSizeFromConfig(thisNwbObject.data, dataTypeChunkOptions); - maxSize = size(thisNwbObject.data); - - dataPipe = types.untyped.DataPipe( ... - 'data', thisNwbObject.data, ... - 'maxSize', maxSize, ... - 'chunkSize', chunkSize, ... - 'compressionLevel', dataTypeChunkOptions.chunk_compression_args); - thisNwbObject.data = dataPipe; + datasetData = thisNeurodataObject.(thisDatasetName); + + if isnumeric(datasetData) + % Create a datapipe object for a numeric dataset value. 
+ dataByteSize = io.config.internal.getDataByteSize(datasetData); + if dataByteSize > datasetConfig.target_chunk_size.value + dataPipe = io.config.internal.configureDataPipeFromData(datasetData, datasetConfig); + end + elseif isa(datasetData, 'types.untyped.DataPipe') + if options.OverrideExisting + dataPipe = io.config.internal.reconfigureDataPipe(datasetData, datasetConfig); + end + elseif isa(datasetData, 'types.untyped.DataStub') + % pass + %error('Not implemented for files obtained by nwbRead') + else + disp( class(datasetData) ) end + + if exist('dataPipe', 'var') + thisNeurodataObject.(thisDatasetName) = dataPipe; + processedDatasets = [processedDatasets, thisDatasetName]; %#ok + clear dataPipe + end + end + + parentType = matnwb.common.getParentType(thisNeurodataClassName); + + if isempty(parentType) + isFinished = true; + else + thisNeurodataClassName = parentType; end end end end + +function neurodataObjects = getNeurodataObjectsFromNwbFile(nwbObject) +% getNeurodataObjectsFromNwbObject - Return all neurodata objects in a NwbFile object + + objectMap = nwbObject.searchFor('types.'); + + neurodataObjects = objectMap.values(); + neurodataClassNames = cellfun(@(c) class(c), neurodataObjects, 'uni', 0); + + toIgnore = startsWith(neurodataClassNames, "types.untyped"); + neurodataObjects(toIgnore) = []; +end diff --git a/+io/+config/readDefaultChunkConfiguration.m b/+io/+config/readDefaultChunkConfiguration.m index 1c08c375..6ee3a720 100644 --- a/+io/+config/readDefaultChunkConfiguration.m +++ b/+io/+config/readDefaultChunkConfiguration.m @@ -1,19 +1,24 @@ function configObject = readDefaultChunkConfiguration() % READDEFAULTCHUNKCONFIGURATION Reads the default chunking configuration from a JSON file. % -% configObject = READDEFAULTCHUNKCONFIGURATION() loads the default chunking -% parameters from a JSON configuration file located in the 'configuration' -% directory within the MatNWB directory. +% Syntax: +% configObject = io.config.READDEFAULTCHUNKCONFIGURATION() loads the default +% chunking parameters from a JSON configuration file located in the +% "configuration" folder inside the MatNWB directory. % -% Output: -% configObject - A MATLAB structure containing the chunking parameters +% Output Arguments: +% - configObject - A MATLAB structure containing the chunking parameters % defined in the JSON configuration file. % -% Example: -% % Load the default chunk configuration -% config = readDefaultChunkConfiguration(); -% disp(config); +% Example 1 - Load default dataset configurations:: +% % Load the default chunk configuration +% config = readDefaultChunkConfiguration(); +% disp(config); + + configFilePath = fullfile(... + misc.getMatnwbDir, ... + 'configuration', ... 
+ 'cloud_dataset_configuration.json'); - configFilePath = fullfile(misc.getMatnwbDir, 'configuration', 'chunk_params.json'); configObject = jsondecode(fileread(configFilePath)); end diff --git a/+matnwb/+common/getParentType.m b/+matnwb/+common/getParentType.m new file mode 100644 index 00000000..816d30ea --- /dev/null +++ b/+matnwb/+common/getParentType.m @@ -0,0 +1,7 @@ +function parentTypeClassName = getParentType(typeClassName) + mc = meta.class.fromName(typeClassName); + parentTypeClassName = mc.SuperclassList(1).Name; + if strcmp(parentTypeClassName, "types.untyped.MetaClass") + parentTypeClassName = string.empty; + end +end \ No newline at end of file diff --git a/+schemes/listDatasetsOfNeurodataType.m b/+schemes/listDatasetsOfNeurodataType.m index 8a6ee902..1d5c6654 100644 --- a/+schemes/listDatasetsOfNeurodataType.m +++ b/+schemes/listDatasetsOfNeurodataType.m @@ -18,15 +18,33 @@ assert(~isempty(typesIdx), 'Expected class name to contain "types"') namespaceName = classNameSplit(typesIdx+1); + namespaceName = strrep(namespaceName, '_', '-'); namespace = schemes.loadNamespace(namespaceName, misc.getMatnwbDir); neurodataTypeName = classNameSplit(typesIdx+2); typeScheme = namespace.registry(neurodataTypeName); - datasetMaps = typeScheme('datasets'); + switch typeScheme('class_type') + case 'groups' + if isKey(typeScheme, 'datasets') + datasetMaps = typeScheme('datasets'); + + datasetNames = repmat("", size(datasetMaps)); + for i = 1:numel(datasetMaps) + if isKey(datasetMaps{i}, 'name') + datasetNames(i) = datasetMaps{i}('name'); + else + keyboard + end + end + datasetNames(datasetNames=="") = []; + else + datasetNames = string.empty; + end - datasetNames = repmat("", size(datasetMaps)); - for i = 1:numel(datasetMaps) - datasetNames(i) = datasetMaps{i}('name'); + case 'datasets' + datasetNames = "data"; + otherwise + error('Unexpected class type') end end From b32c3c42ac67d7b52b08e6bebccd8bf97a05f57a Mon Sep 17 00:00:00 2001 From: ehennestad Date: Tue, 21 Jan 2025 22:03:33 +0100 Subject: [PATCH 07/19] Remove unused condition in applyChunkConfiguration --- +io/+config/applyChunkConfiguration.m | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/+io/+config/applyChunkConfiguration.m b/+io/+config/applyChunkConfiguration.m index 568620fc..9a467a6c 100644 --- a/+io/+config/applyChunkConfiguration.m +++ b/+io/+config/applyChunkConfiguration.m @@ -2,34 +2,28 @@ function applyChunkConfiguration(nwbObject, chunkConfiguration, options) % applyChunkConfiguration - Apply chunk configuration to datasets of an NWB object arguments - nwbObject (1,1) types.untyped.MetaClass + nwbObject (1,1) NwbFile chunkConfiguration (1,1) struct = io.config.readDefaultChunkConfiguration() % Todo: class for this...? options.OverrideExisting (1,1) logical = false end import io.config.internal.resolveDataTypeChunkConfig - if isa(nwbObject, 'NwbFile') - neurodataObjects = getNeurodataObjectsFromNwbFile(nwbObject); - else - neurodataObjects = {nwbObject}; - end + neurodataObjects = getNeurodataObjectsFromNwbFile(nwbObject); for iNeurodataObject = 1:numel(neurodataObjects) thisNeurodataObject = neurodataObjects{iNeurodataObject}; thisNeurodataClassName = class(thisNeurodataObject); - % Need to keep track of this. A dataset can be defined across - % multiple levels of the class hierarchy, the lowest class should - % take precedence + % A dataset can be defined on multiple levels of the class hierarchy, + % so need to keep track of which datasets have been processed. 
processedDatasets = string.empty; isFinished = false; while ~isFinished % Iterate over type and it's ancestor types (superclasses) datasetNames = schemes.listDatasetsOfNeurodataType( thisNeurodataClassName ); - - for thisDatasetName = datasetNames % Iterate over all datasets of a type... + for thisDatasetName = datasetNames % Iterate over all datasets of a type if ismember(thisDatasetName, processedDatasets) continue @@ -53,8 +47,8 @@ function applyChunkConfiguration(nwbObject, chunkConfiguration, options) dataPipe = io.config.internal.reconfigureDataPipe(datasetData, datasetConfig); end elseif isa(datasetData, 'types.untyped.DataStub') - % pass - %error('Not implemented for files obtained by nwbRead') + % todo + % error('Not implemented for files obtained by nwbRead') else disp( class(datasetData) ) end From 37e68e1e8705e9bd94a480f03aaa2fa648cc077b Mon Sep 17 00:00:00 2001 From: ehennestad Date: Tue, 21 Jan 2025 22:03:39 +0100 Subject: [PATCH 08/19] Update getParentType.m --- +matnwb/+common/getParentType.m | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/+matnwb/+common/getParentType.m b/+matnwb/+common/getParentType.m index 816d30ea..ba087ab5 100644 --- a/+matnwb/+common/getParentType.m +++ b/+matnwb/+common/getParentType.m @@ -4,4 +4,4 @@ if strcmp(parentTypeClassName, "types.untyped.MetaClass") parentTypeClassName = string.empty; end -end \ No newline at end of file +end From 3d5d2ac94ce816df31b8ab1e8420ac52ae41ecda Mon Sep 17 00:00:00 2001 From: ehennestad Date: Wed, 22 Jan 2025 12:00:06 +0100 Subject: [PATCH 09/19] Add different dataset configuration profiles --- .../archive_dataset_configuration.json | 21 ++++++++ configuration/chunk_params.json | 49 ------------------- .../default_dataset_configuration.json | 21 ++++++++ 3 files changed, 42 insertions(+), 49 deletions(-) create mode 100644 configuration/archive_dataset_configuration.json delete mode 100644 configuration/chunk_params.json create mode 100644 configuration/default_dataset_configuration.json diff --git a/configuration/archive_dataset_configuration.json b/configuration/archive_dataset_configuration.json new file mode 100644 index 00000000..94b7bd23 --- /dev/null +++ b/configuration/archive_dataset_configuration.json @@ -0,0 +1,21 @@ +{ + "Default": { + "layout": "chunked", + "target_chunk_size": { + "value": 100000000, + "unit": "bytes" + }, + "chunk_dimensions": [ + [null], + [null, "max"], + [null, "max", "max"], + [null, "max", "max", "max"] + ], + "compression": { + "algorithm": "ZStandard", + "level": 9, + "parameters": {}, + "prefilters": ["shuffle"] + } + } +} \ No newline at end of file diff --git a/configuration/chunk_params.json b/configuration/chunk_params.json deleted file mode 100644 index 2ce804d9..00000000 --- a/configuration/chunk_params.json +++ /dev/null @@ -1,49 +0,0 @@ -{ - "NWBContainer": { - "chunk_compression": "gzip", - "chunk_compression_args": 4, - "chunk_default_size": 10000000.0, - "chunk_default_size_unit": "bytes", - "data": { - "chunk_dimensions": [ - null - ] - } - }, - "Data": { - "chunk_compression": "gzip", - "chunk_compression_args": 4, - "chunk_default_size": 10000000.0, - "chunk_default_size_unit": "bytes", - "data": { - "chunk_dimensions": [ - null - ] - } - }, - "ElectricalSeries": { - "data": { - "chunk_dimensions": [ - null, - 32 - ] - } - }, - "TimeSeries": { - "data": { - "chunk_dimensions": [ - null, - 32 - ] - } - }, - "ImageSeries": { - "data": { - "chunk_dimensions": [ - null, - "max", - "max" - ] - } - } -} diff --git 
a/configuration/default_dataset_configuration.json b/configuration/default_dataset_configuration.json new file mode 100644 index 00000000..8443ffd7 --- /dev/null +++ b/configuration/default_dataset_configuration.json @@ -0,0 +1,21 @@ +{ + "Default": { + "layout": "chunked", + "target_chunk_size": { + "value": null, + "unit": "bytes" + }, + "chunk_dimensions": [ + [null], + [null, "max"], + [null, "max", "max"], + [null, "max", "max", "max"] + ], + "compression": { + "algorithm": "deflate", + "level": 3, + "parameters": {}, + "prefilters": [] + } + } +} \ No newline at end of file From 9e623a24e58b94711910838d8a40ec90eaa287ee Mon Sep 17 00:00:00 2001 From: ehennestad Date: Wed, 22 Jan 2025 12:05:30 +0100 Subject: [PATCH 10/19] Consistently name functions and code using datasetConfiguration instead of chunkConfiguration --- .../+internal/computeChunkSizeFromConfig.m | 26 +-- .../+internal/configureDataPipeFromData.m | 4 +- +io/+config/+internal/reconfigureDataPipe.m | 1 - ...ig.m => resolveDatasetConfigForDataType.m} | 24 +-- ...guration.m => applyDatasetConfiguration.m} | 14 +- +io/+config/readDatasetConfiguration.m | 44 +++++ +io/+config/readDefaultChunkConfiguration.m | 24 --- .../+io/+config/DatasetConfigurationTest.m | 166 ++++++++++++++++++ 8 files changed, 247 insertions(+), 56 deletions(-) rename +io/+config/+internal/{resolveDataTypeChunkConfig.m => resolveDatasetConfigForDataType.m} (76%) rename +io/+config/{applyChunkConfiguration.m => applyDatasetConfiguration.m} (87%) create mode 100644 +io/+config/readDatasetConfiguration.m delete mode 100644 +io/+config/readDefaultChunkConfiguration.m create mode 100644 +tests/+unit/+io/+config/DatasetConfigurationTest.m diff --git a/+io/+config/+internal/computeChunkSizeFromConfig.m b/+io/+config/+internal/computeChunkSizeFromConfig.m index f25158dc..2fb76843 100644 --- a/+io/+config/+internal/computeChunkSizeFromConfig.m +++ b/+io/+config/+internal/computeChunkSizeFromConfig.m @@ -1,35 +1,41 @@ -function chunkSize = computeChunkSizeFromConfig(A, chunkSpecification) -% computeChunkSizeFromConfig - Compute the chunk size for a dataset using the provided specification. +function chunkSize = computeChunkSizeFromConfig(A, datasetConfig) +% computeChunkSizeFromConfig - Compute the chunk size for a dataset using the provided configuration. % This function determines the chunk size for a dataset based on the chunk -% dimensions provided in the chunkSpecification. It adjusts dimensions according -% to rules: 'max' uses the dataset size, fixed numbers use their value, and 'null' -% calculates the dimension size to approximate the target chunk size in bytes. +% dimensions provided in the datasetConfig structure. It adjusts dimensions +% according to rules: 'max' uses the dataset size, fixed numbers use their +% value, and 'null' calculates the dimension size to approximate the target +% chunk size in bytes. % % Inputs: % A - A numeric dataset whose chunk size is to be computed. -% chunkSpecification (1,1) struct - Struct defining chunk dimensions and settings. +% datasetConfig (1,1) struct - Struct defining chunk dimensions and chunk target size. % % Output: % chunkSize - A vector specifying the chunk size for each dimension. arguments A {mustBeNumeric} - chunkSpecification (1,1) struct + datasetConfig (1,1) struct end + + assert(isfield(datasetConfig, 'chunk_dimensions')), ... + 'Expected datasetConfig to have field "chunk_dimensions"') + assert(isfield(datasetConfig, 'target_chunk_size'), ... 
+ 'Expected datasetConfig to have field "target_chunk_size"') % Get dataset size dataSize = size(A); dataSize = fliplr(dataSize); % matnwb quirk numDimensions = numel(dataSize); - + % Extract relevant configuration parameters - chunkDimensions = chunkSpecification.chunk_dimensions; + chunkDimensions = datasetConfig.chunk_dimensions; if iscell(chunkDimensions) numChunkDimensions = cellfun(@numel, chunkDimensions); chunkDimensions = chunkDimensions{numChunkDimensions == numDimensions}; end - defaultChunkSize = chunkSpecification.target_chunk_size.value; % in bytes + defaultChunkSize = datasetConfig.target_chunk_size.value; % in bytes dataByteSize = io.config.internal.getDataByteSize(A); % Initialize chunk size array diff --git a/+io/+config/+internal/configureDataPipeFromData.m b/+io/+config/+internal/configureDataPipeFromData.m index cbc7a46f..04f321ba 100644 --- a/+io/+config/+internal/configureDataPipeFromData.m +++ b/+io/+config/+internal/configureDataPipeFromData.m @@ -20,7 +20,7 @@ {'hasShuffle', hasShuffle, ... 'compressionLevel', datasetConfig.compression.level} ... ]; - else + else % Create property list of custom filters for dataset creation compressionFilter = DynamicFilter( ... datasetConfig.compression.algorithm, ... @@ -38,4 +38,4 @@ % Create the datapipe. dataPipe = types.untyped.DataPipe( dataPipeArgs{:} ); -end \ No newline at end of file +end diff --git a/+io/+config/+internal/reconfigureDataPipe.m b/+io/+config/+internal/reconfigureDataPipe.m index 183c0994..3c68a046 100644 --- a/+io/+config/+internal/reconfigureDataPipe.m +++ b/+io/+config/+internal/reconfigureDataPipe.m @@ -1,4 +1,3 @@ function dataPipe = reconfigureDataPipe(dataPipe, datasetConfig) % todo end - diff --git a/+io/+config/+internal/resolveDataTypeChunkConfig.m b/+io/+config/+internal/resolveDatasetConfigForDataType.m similarity index 76% rename from +io/+config/+internal/resolveDataTypeChunkConfig.m rename to +io/+config/+internal/resolveDatasetConfigForDataType.m index f701450b..402ba0a1 100644 --- a/+io/+config/+internal/resolveDataTypeChunkConfig.m +++ b/+io/+config/+internal/resolveDatasetConfigForDataType.m @@ -1,24 +1,24 @@ -function resolvedOptions = resolveDataTypeChunkConfig(chunkSpecification, nwbObject, datasetName) -% resolveDataTypeChunkConfig - Resolve the chunk options for individual datatypes -% This function resolves the chunk configuration options for a given NWB object +function resolvedOptions = resolveDatasetConfigForDataType(datasetConfig, nwbObject, datasetName) +% resolveDatasetConfigForDataType - Resolve the dataset configuration for individual neurodata types +% This function resolves the dataset configuration options for a given NWB object % by traversing the object hierarchy and combining options from the most specific -% type to the base type, as defined in the chunkSpecification. +% type to the base type, as defined in the datasetConfig structure. % % Input: -% chunkSpecification (struct): A struct representation of the chunk configuration JSON. -% nwbObject (types.untyped.MetaClass): An NWB object whose chunk configuration will be resolved. +% datasetConfig (struct): A struct representation of the dataset configuration JSON. +% nwbObject (types.untyped.MetaClass): An NWB object whose dataset configuration will be resolved. % % Output: -% resolvedOptions (struct): A struct containing the resolved chunk configuration options. +% resolvedOptions (struct): A struct containing the resolved dataset configuration options. 
arguments - chunkSpecification (1,1) struct + datasetConfig (1,1) struct nwbObject (1,1) types.untyped.MetaClass datasetName (1,1) string end % Initialize resolvedOptions with default options. - resolvedOptions = chunkSpecification.Default; + resolvedOptions = datasetConfig.Default; % Get the NWB object type hierarchy (from most specific to base type) typeHierarchy = getTypeHierarchy(nwbObject); @@ -27,9 +27,9 @@ for i = numel(typeHierarchy):-1:1 typeName = typeHierarchy{i}; - % Check if the neurodata type has a chunkSpecification - if isfield(chunkSpecification, typeName) - typeOptions = chunkSpecification.(typeName); + % Check if the neurodata type has a datasetConfig + if isfield(datasetConfig, typeName) + typeOptions = datasetConfig.(typeName); % Is datasetName part of typeOptions? if isfield(typeOptions, datasetName) diff --git a/+io/+config/applyChunkConfiguration.m b/+io/+config/applyDatasetConfiguration.m similarity index 87% rename from +io/+config/applyChunkConfiguration.m rename to +io/+config/applyDatasetConfiguration.m index 9a467a6c..cbba8193 100644 --- a/+io/+config/applyChunkConfiguration.m +++ b/+io/+config/applyDatasetConfiguration.m @@ -1,13 +1,13 @@ -function applyChunkConfiguration(nwbObject, chunkConfiguration, options) -% applyChunkConfiguration - Apply chunk configuration to datasets of an NWB object +function applyDatasetConfiguration(nwbObject, datasetConfiguration, options) +% applyDatasetConfiguration - Apply dataset configuration to datasets of an NWB object arguments nwbObject (1,1) NwbFile - chunkConfiguration (1,1) struct = io.config.readDefaultChunkConfiguration() % Todo: class for this...? + datasetConfiguration (1,1) struct = io.config.readDatasetConfiguration() options.OverrideExisting (1,1) logical = false end - import io.config.internal.resolveDataTypeChunkConfig + import io.config.internal.resolveDatasetConfigForDataType neurodataObjects = getNeurodataObjectsFromNwbFile(nwbObject); @@ -29,8 +29,8 @@ function applyChunkConfiguration(nwbObject, chunkConfiguration, options) continue end - datasetConfig = resolveDataTypeChunkConfig(... - chunkConfiguration, ... + datasetConfig = resolveDatasetConfigForDataType(... + datasetConfiguration, ... thisNeurodataObject, ... thisDatasetName); @@ -77,7 +77,7 @@ function applyChunkConfiguration(nwbObject, chunkConfiguration, options) objectMap = nwbObject.searchFor('types.'); neurodataObjects = objectMap.values(); - neurodataClassNames = cellfun(@(c) class(c), neurodataObjects, 'uni', 0); + neurodataClassNames = cellfun(@(c) class(c), neurodataObjects, 'uni', 0); toIgnore = startsWith(neurodataClassNames, "types.untyped"); neurodataObjects(toIgnore) = []; diff --git a/+io/+config/readDatasetConfiguration.m b/+io/+config/readDatasetConfiguration.m new file mode 100644 index 00000000..06d5ace1 --- /dev/null +++ b/+io/+config/readDatasetConfiguration.m @@ -0,0 +1,44 @@ +function datasetConfig = readDatasetConfiguration(profile) +% READDATASETCONFIGURATION Reads the default dataset configuration from a JSON file. +% +% Syntax: +% configObject = io.config.READDATASETCONFIGURATION() loads the default +% dataset configuration parameters from a JSON file located in the +% "configuration" folder in the MatNWB root directory. +% +% configObject = io.config.READDATASETCONFIGURATION(profile) loads the +% dataset configuration parameters for the specified "configuration profile" +% from a JSON file located in the "configuration" folder in the MatNWB root +% directory. 
+% +% Output Arguments: +% - datasetConfig - A MATLAB structure containing the dataset configuration +% parameters (chunking & compression) defined in the JSON +% configuration file. +% +% Example 1 - Load default dataset configurations:: +% +% % Load the default dataset configuration +% datasetConfig = io.config.readDatasetConfiguration(); +% disp(datasetConfig); + + arguments + profile (1,1) string {mustBeMember(profile, [ ... + "default", ... + "cloud", ... + "archive" + ])} = "default" + end + + switch profile + case "default" + filename = 'default_dataset_configuration.json'; + case "cloud" + filename = 'cloud_dataset_configuration.json'; + case "archive" + filename = 'archive_dataset_configuration.json'; + end + + configFilePath = fullfile(misc.getMatnwbDir, 'configuration', filename); + datasetConfig = jsondecode(fileread(configFilePath)); +end diff --git a/+io/+config/readDefaultChunkConfiguration.m b/+io/+config/readDefaultChunkConfiguration.m deleted file mode 100644 index 6ee3a720..00000000 --- a/+io/+config/readDefaultChunkConfiguration.m +++ /dev/null @@ -1,24 +0,0 @@ -function configObject = readDefaultChunkConfiguration() -% READDEFAULTCHUNKCONFIGURATION Reads the default chunking configuration from a JSON file. -% -% Syntax: -% configObject = io.config.READDEFAULTCHUNKCONFIGURATION() loads the default -% chunking parameters from a JSON configuration file located in the -% "configuration" folder inside the MatNWB directory. -% -% Output Arguments: -% - configObject - A MATLAB structure containing the chunking parameters -% defined in the JSON configuration file. -% -% Example 1 - Load default dataset configurations:: -% % Load the default chunk configuration -% config = readDefaultChunkConfiguration(); -% disp(config); - - configFilePath = fullfile(... - misc.getMatnwbDir, ... - 'configuration', ... - 'cloud_dataset_configuration.json'); - - configObject = jsondecode(fileread(configFilePath)); -end diff --git a/+tests/+unit/+io/+config/DatasetConfigurationTest.m b/+tests/+unit/+io/+config/DatasetConfigurationTest.m new file mode 100644 index 00000000..ed2057ba --- /dev/null +++ b/+tests/+unit/+io/+config/DatasetConfigurationTest.m @@ -0,0 +1,166 @@ +classdef DatasetConfigurationTest < matlab.unittest.TestCase +% Tests for io.config.applyDatasetConfiguration function + + properties + DefaultConfig + end + + methods(TestMethodSetup) + function setup(testCase) + % Setup default configuration before each test + testCase.DefaultConfig = io.config.readDatasetConfiguration(); + end + end + + methods(Test) + function testBasicFunctionality(testCase) + % Test basic functionality with default configuration + nwbFile = NwbFile( ... + 'identifier', 'TEST123', ... + 'session_description', 'test session', ... + 'session_start_time', datetime()); + + % Should not throw any errors + io.config.applyDatasetConfiguration(nwbFile, testCase.DefaultConfig); + end + + function testNumericDatasetConfiguration(testCase) + % Test configuration of numeric datasets + nwbFile = NwbFile( ... + 'identifier', 'TEST123', ... + 'session_description', 'test session', ... + 'session_start_time', datetime()); + + % Create a large numeric dataset + data = types.core.TimeSeries( ... + 'data', rand(1000, 1000), ... + 'data_unit', 'n/a', ... 
+ 'timestamps', 1:1000); + + nwbFile.acquisition.set('test_data', data); + + % Apply configuration + io.config.applyDatasetConfiguration(nwbFile, testCase.DefaultConfig); + + % Verify the dataset was converted to DataPipe + testCase.verifyTrue(isa(nwbFile.acquisition.get('test_data').data, ... + 'types.untyped.DataPipe'), ... + 'Large numeric dataset should be converted to DataPipe'); + end + + function testSmallNumericDataset(testCase) + % Test that small numeric datasets remain unchanged + nwbFile = NwbFile( ... + 'identifier', 'TEST123', ... + 'session_description', 'test session', ... + 'session_start_time', datetime()); + + % Create a small numeric dataset + data = types.core.TimeSeries( ... + 'data', rand(10, 10), ... + 'data_unit', 'n/a', ... + 'timestamps', 1:10); + + nwbFile.acquisition.set('test_data', data); + + % Apply configuration + io.config.applyDatasetConfiguration(nwbFile, testCase.DefaultConfig); + + % Verify the dataset remains numeric + testCase.verifyTrue(isnumeric(nwbFile.acquisition.get('test_data').data), ... + 'Small numeric dataset should remain numeric'); + end + + function testOverrideExisting(testCase) + % Test override behavior for existing DataPipe objects + nwbFile = NwbFile( ... + 'identifier', 'TEST123', ... + 'session_description', 'test session', ... + 'session_start_time', datetime()); + + % Create a DataPipe object + rawData = rand(1000, 1000); + dataPipe = types.untyped.DataPipe('data', rawData, 'axis', 1, 'chunk_size', 100); + + data = types.core.TimeSeries( ... + 'data', dataPipe, ... + 'data_unit', 'n/a', ... + 'timestamps', 1:1000); + + nwbFile.acquisition.set('test_data', data); + + % Apply configuration with override + io.config.applyDatasetConfiguration(nwbFile, testCase.DefaultConfig, ... + 'OverrideExisting', true); + + % Verify the DataPipe was reconfigured + resultPipe = nwbFile.acquisition.get('test_data').data; + testCase.verifyTrue(isa(resultPipe, 'types.untyped.DataPipe'), ... + 'Result should still be a DataPipe'); + end + + function testNoOverrideExisting(testCase) + % Test that existing DataPipe objects are not modified without override + nwbFile = NwbFile( ... + 'identifier', 'TEST123', ... + 'session_description', 'test session', ... + 'session_start_time', datetime()); + + % Create a DataPipe object with specific configuration + rawData = rand(1000, 1000); + originalChunkSize = 100; + dataPipe = types.untyped.DataPipe('data', rawData, 'axis', 1, ... + 'chunk_size', originalChunkSize); + + data = types.core.TimeSeries( ... + 'data', dataPipe, ... + 'data_unit', 'n/a', ... + 'timestamps', 1:1000); + + nwbFile.acquisition.set('test_data', data); + + % Apply configuration without override + io.config.applyDatasetConfiguration(nwbFile, testCase.DefaultConfig, ... + 'OverrideExisting', false); + + % Verify the DataPipe configuration remains unchanged + resultPipe = nwbFile.acquisition.get('test_data').data; + testCase.verifyEqual(resultPipe.chunk_size, originalChunkSize, ... + 'DataPipe configuration should remain unchanged without override'); + end + + function testGetNeurodataObjects(testCase) + % Test the nested getNeurodataObjectsFromNwbFile function + nwbFile = NwbFile( ... + 'identifier', 'TEST123', ... + 'session_description', 'test session', ... + 'session_start_time', datetime()); + + % Add various types of objects + timeseries = types.core.TimeSeries( ... + 'data', rand(10, 10), ... + 'data_unit', 'n/a', ... 
+ 'timestamps', 1:10); + + nwbFile.acquisition.set('test_timeseries', timeseries); + + % Add an untyped object that should be ignored + untypedObj = types.untyped.Group(); + nwbFile.acquisition.set('untyped_obj', untypedObj); + + % Get private access to the nested function + metaClass = metaclass(nwbFile); + methodList = metaClass.MethodList; + getNeurodataObjectsFcn = str2func('io.config.applyDatasetConfiguration>getNeurodataObjectsFromNwbFile'); + + % Call the function + neurodataObjects = getNeurodataObjectsFcn(nwbFile); + + % Verify results + testCase.verifySize(neurodataObjects, [1 1], ... + 'Should find one neurodata object'); + testCase.verifyTrue(isa(neurodataObjects{1}, 'types.core.TimeSeries'), ... + 'Should find TimeSeries object'); + end + end +end From 665bf5c60df446a3cac3a3077cf2a2bdd0c02de5 Mon Sep 17 00:00:00 2001 From: ehennestad Date: Wed, 22 Jan 2025 12:17:19 +0100 Subject: [PATCH 11/19] Test-related fixes --- .../+internal/computeChunkSizeFromConfig.m | 2 +- .../+internal/configureDataPipeFromData.m | 3 +- .../+io/+config/DatasetConfigurationTest.m | 42 ++----------------- .../default_dataset_configuration.json | 2 +- 4 files changed, 8 insertions(+), 41 deletions(-) diff --git a/+io/+config/+internal/computeChunkSizeFromConfig.m b/+io/+config/+internal/computeChunkSizeFromConfig.m index 2fb76843..4a6de83c 100644 --- a/+io/+config/+internal/computeChunkSizeFromConfig.m +++ b/+io/+config/+internal/computeChunkSizeFromConfig.m @@ -18,7 +18,7 @@ datasetConfig (1,1) struct end - assert(isfield(datasetConfig, 'chunk_dimensions')), ... + assert(isfield(datasetConfig, 'chunk_dimensions'), ... 'Expected datasetConfig to have field "chunk_dimensions"') assert(isfield(datasetConfig, 'target_chunk_size'), ... 'Expected datasetConfig to have field "target_chunk_size"') diff --git a/+io/+config/+internal/configureDataPipeFromData.m b/+io/+config/+internal/configureDataPipeFromData.m index 04f321ba..d4b8eff9 100644 --- a/+io/+config/+internal/configureDataPipeFromData.m +++ b/+io/+config/+internal/configureDataPipeFromData.m @@ -12,7 +12,8 @@ "maxSize", maxSize, ... "chunkSize", chunkSize }; - hasShuffle = contains(datasetConfig.compression.prefilters, 'shuffle'); + hasShuffle = ~isempty(datasetConfig.compression.prefilters)... + && contains(datasetConfig.compression.prefilters, 'shuffle'); if strcmpi(datasetConfig.compression.algorithm, "Deflate") % Use standard compression filters diff --git a/+tests/+unit/+io/+config/DatasetConfigurationTest.m b/+tests/+unit/+io/+config/DatasetConfigurationTest.m index ed2057ba..c285a36a 100644 --- a/+tests/+unit/+io/+config/DatasetConfigurationTest.m +++ b/+tests/+unit/+io/+config/DatasetConfigurationTest.m @@ -80,7 +80,7 @@ function testOverrideExisting(testCase) % Create a DataPipe object rawData = rand(1000, 1000); - dataPipe = types.untyped.DataPipe('data', rawData, 'axis', 1, 'chunk_size', 100); + dataPipe = types.untyped.DataPipe('data', rawData, 'axis', 1, 'chunkSize', 100); data = types.core.TimeSeries( ... 'data', dataPipe, ... @@ -108,9 +108,9 @@ function testNoOverrideExisting(testCase) % Create a DataPipe object with specific configuration rawData = rand(1000, 1000); - originalChunkSize = 100; + originalChunkSize = [100, 100]; dataPipe = types.untyped.DataPipe('data', rawData, 'axis', 1, ... - 'chunk_size', originalChunkSize); + 'chunkSize', originalChunkSize); data = types.core.TimeSeries( ... 'data', dataPipe, ... 
@@ -125,42 +125,8 @@ function testNoOverrideExisting(testCase) % Verify the DataPipe configuration remains unchanged resultPipe = nwbFile.acquisition.get('test_data').data; - testCase.verifyEqual(resultPipe.chunk_size, originalChunkSize, ... + testCase.verifyEqual(resultPipe.chunkSize, originalChunkSize, ... 'DataPipe configuration should remain unchanged without override'); end - - function testGetNeurodataObjects(testCase) - % Test the nested getNeurodataObjectsFromNwbFile function - nwbFile = NwbFile( ... - 'identifier', 'TEST123', ... - 'session_description', 'test session', ... - 'session_start_time', datetime()); - - % Add various types of objects - timeseries = types.core.TimeSeries( ... - 'data', rand(10, 10), ... - 'data_unit', 'n/a', ... - 'timestamps', 1:10); - - nwbFile.acquisition.set('test_timeseries', timeseries); - - % Add an untyped object that should be ignored - untypedObj = types.untyped.Group(); - nwbFile.acquisition.set('untyped_obj', untypedObj); - - % Get private access to the nested function - metaClass = metaclass(nwbFile); - methodList = metaClass.MethodList; - getNeurodataObjectsFcn = str2func('io.config.applyDatasetConfiguration>getNeurodataObjectsFromNwbFile'); - - % Call the function - neurodataObjects = getNeurodataObjectsFcn(nwbFile); - - % Verify results - testCase.verifySize(neurodataObjects, [1 1], ... - 'Should find one neurodata object'); - testCase.verifyTrue(isa(neurodataObjects{1}, 'types.core.TimeSeries'), ... - 'Should find TimeSeries object'); - end end end diff --git a/configuration/default_dataset_configuration.json b/configuration/default_dataset_configuration.json index 8443ffd7..df12d3e7 100644 --- a/configuration/default_dataset_configuration.json +++ b/configuration/default_dataset_configuration.json @@ -2,7 +2,7 @@ "Default": { "layout": "chunked", "target_chunk_size": { - "value": null, + "value": 1000000, "unit": "bytes" }, "chunk_dimensions": [ From 32771ea938188f5c9ac110ea6b6ee9ff5c66c16b Mon Sep 17 00:00:00 2001 From: ehennestad Date: Wed, 22 Jan 2025 12:35:30 +0100 Subject: [PATCH 12/19] simplify readDatasetConfiguration Replaces switch block with formatted string --- +io/+config/readDatasetConfiguration.m | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/+io/+config/readDatasetConfiguration.m b/+io/+config/readDatasetConfiguration.m index 06d5ace1..4a05e8b6 100644 --- a/+io/+config/readDatasetConfiguration.m +++ b/+io/+config/readDatasetConfiguration.m @@ -30,14 +30,7 @@ ])} = "default" end - switch profile - case "default" - filename = 'default_dataset_configuration.json'; - case "cloud" - filename = 'cloud_dataset_configuration.json'; - case "archive" - filename = 'archive_dataset_configuration.json'; - end + filename = sprintf('%s_dataset_configuration.json', profile); configFilePath = fullfile(misc.getMatnwbDir, 'configuration', filename); datasetConfig = jsondecode(fileread(configFilePath)); From 8bde77596f96e89c56cef4a7a9143222571fc8e9 Mon Sep 17 00:00:00 2001 From: ehennestad Date: Sat, 22 Feb 2025 11:43:19 +0100 Subject: [PATCH 13/19] Create applyCustomMatNWBPropertyNames.m Function that will ensure the dataset configuration conforms with MatNWB specific implementation details --- .../applyCustomMatNWBPropertyNames.m | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 +io/+config/+internal/applyCustomMatNWBPropertyNames.m diff --git a/+io/+config/+internal/applyCustomMatNWBPropertyNames.m b/+io/+config/+internal/applyCustomMatNWBPropertyNames.m new file mode 100644 index 
00000000..d56ece38 --- /dev/null +++ b/+io/+config/+internal/applyCustomMatNWBPropertyNames.m @@ -0,0 +1,73 @@ +function datasetConfiguration = applyCustomMatNWBPropertyNames(datasetConfiguration) + + arguments + datasetConfiguration (1,1) struct + end + + fields = fieldnames(datasetConfiguration); + classNameMap = getNwbTypesClassnameMap(); + + for i = 1:numel(fields) + + thisField = fields{i}; + if ~isKey(classNameMap, thisField) + continue + end + + fullClassName = classNameMap(thisField); + superclassNames = superclasses(fullClassName); + + if any(strcmp(superclassNames, "types.untyped.MetaClass")) + thisSubConfig = datasetConfiguration.(thisField); + if any(strcmp(superclassNames, "types.untyped.GroupClass")) + % Recursively process subgroups + datasetConfiguration.(thisField) = ... + io.config.internal.applyCustomMatNWBPropertyNames(thisSubConfig); + elseif any(strcmp(superclassNames, "types.untyped.DatasetClass")) + % MatNWB adds a "data" property on Dataset type classes, + % which is not originally part of the schema. + datasetConfiguration.(thisField) = struct('data', thisSubConfig); + else + error('NWB:UnexpectedError', 'Something unexpected happened.') + end + else + % Do nothing. + end + end +end + +function ancestorPath = getAncestorPath(initialPath, numSteps) + arguments + initialPath (1,1) string + numSteps (1,1) double + end + splitPath = split(initialPath, filesep); + + ancestorPath = fullfile(splitPath{1:end-numSteps}); % char output + if isunix && ~startsWith(ancestorPath, filesep) + ancestorPath = [filesep ancestorPath]; + end +end + +function map = getNwbTypesClassnameMap() + + typesClassDirectory = getAncestorPath( which('types.core.NWBFile'), 2 ); + + % Find names of all nwb types: + L = dir(fullfile(typesClassDirectory, '**', '*.m')); + ignore = contains({L.folder}, fullfile('+types', '+untyped')) | ... 
+ contains({L.folder}, fullfile('+types', '+util')); + L(ignore) = []; + + + [~, namespaceNames] = fileparts({L.folder}); + namespaceNames = string( strrep(namespaceNames, '+', '') ); + classNames = string( strrep( {L.name}, '.m', '') ); + + fullClassNames = compose("types.%s.%s", namespaceNames', classNames'); + try + map = dictionary(classNames', fullClassNames); + catch % If older version of MATLAB + map = containers.Map(classNames, fullClassNames); + end +end \ No newline at end of file From 12f0453f23747f892128269cd9234ebab9ae57a7 Mon Sep 17 00:00:00 2001 From: ehennestad Date: Sat, 22 Feb 2025 14:06:56 +0100 Subject: [PATCH 14/19] Update configuration/archive_dataset_configuration.json Co-authored-by: Ben Dichter --- configuration/archive_dataset_configuration.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configuration/archive_dataset_configuration.json b/configuration/archive_dataset_configuration.json index 94b7bd23..0b26f018 100644 --- a/configuration/archive_dataset_configuration.json +++ b/configuration/archive_dataset_configuration.json @@ -13,7 +13,7 @@ ], "compression": { "algorithm": "ZStandard", - "level": 9, + "level": 5, "parameters": {}, "prefilters": ["shuffle"] } From 7bfc2c8c8c26404eb4510d867be8aed56ee63047 Mon Sep 17 00:00:00 2001 From: ehennestad Date: Sat, 22 Feb 2025 14:20:15 +0100 Subject: [PATCH 15/19] Add docstring for function applyCustomMatNWBPropertyNames.m --- .../applyCustomMatNWBPropertyNames.m | 81 ++++++++++++++++--- +matnwb/+common/composeFullClassName.m | 9 +++ 2 files changed, 78 insertions(+), 12 deletions(-) create mode 100644 +matnwb/+common/composeFullClassName.m diff --git a/+io/+config/+internal/applyCustomMatNWBPropertyNames.m b/+io/+config/+internal/applyCustomMatNWBPropertyNames.m index d56ece38..4fa82c3b 100644 --- a/+io/+config/+internal/applyCustomMatNWBPropertyNames.m +++ b/+io/+config/+internal/applyCustomMatNWBPropertyNames.m @@ -1,17 +1,42 @@ function datasetConfiguration = applyCustomMatNWBPropertyNames(datasetConfiguration) - +% applyCustomMatNWBPropertyNames - Processes a dataset configuration structure to apply custom MatNWB property names. +% +% datasetConfiguration = applyCustomMatNWBPropertyNames(datasetConfiguration) +% +% This function iterates through each field of the input structure and checks +% if the field corresponds to a known NWB type (using a mapping from short +% names to fully qualified class names). For each recognized field: +% +% - It retrieves the full class name and determines its superclasses. +% - If the class is a subclass of "types.untyped.MetaClass": +% * If it is also a "types.untyped.GroupClass", the function recursively +% processes the subgroup configuration. +% * If it is a "types.untyped.DatasetClass", it wraps the existing +% configuration in a structure with a "data" property. +% - If the field is not associated with a recognized NWB type, it remains +% unchanged. +% +% Input: +% datasetConfiguration - A 1x1 struct containing dataset configuration +% data. +% +% Output: +% datasetConfiguration - The updated configuration structure with custom +% property names. 
+ arguments datasetConfiguration (1,1) struct end fields = fieldnames(datasetConfiguration); + classNameMap = getNwbTypesClassnameMap(); for i = 1:numel(fields) thisField = fields{i}; if ~isKey(classNameMap, thisField) - continue + continue % Not a neurodata / nwb type end fullClassName = classNameMap(thisField); @@ -20,23 +45,34 @@ if any(strcmp(superclassNames, "types.untyped.MetaClass")) thisSubConfig = datasetConfiguration.(thisField); if any(strcmp(superclassNames, "types.untyped.GroupClass")) - % Recursively process subgroups + % Recursively process subgroups. datasetConfiguration.(thisField) = ... io.config.internal.applyCustomMatNWBPropertyNames(thisSubConfig); elseif any(strcmp(superclassNames, "types.untyped.DatasetClass")) - % MatNWB adds a "data" property on Dataset type classes, - % which is not originally part of the schema. + % Wrap Dataset type configurations in a struct with a "data" field. datasetConfiguration.(thisField) = struct('data', thisSubConfig); else error('NWB:UnexpectedError', 'Something unexpected happened.') end else - % Do nothing. + % For non-NWB types, leave the field unmodified. end end end function ancestorPath = getAncestorPath(initialPath, numSteps) +% getAncestorPath - Get an ancestor directory path. +% +% ancestorPath = GETANCESTORPATH(initialPath, numSteps) +% +% Input: +% initialPath - A string representing the starting file or directory path. +% numSteps - A positive integer indicating the number of directory +% levels to move up. +% +% Output: +% ancestorPath - A string representing the ancestor directory path. + arguments initialPath (1,1) string numSteps (1,1) double @@ -44,30 +80,51 @@ splitPath = split(initialPath, filesep); ancestorPath = fullfile(splitPath{1:end-numSteps}); % char output + + % Ensure the path starts with a file separator on Unix systems. if isunix && ~startsWith(ancestorPath, filesep) ancestorPath = [filesep ancestorPath]; end end function map = getNwbTypesClassnameMap() +% getNwbTypesClassnameMap - Constructs a mapping between NWB type short names +% and their fully qualified class names. +% +% map = GETNWBTYPESCLASSNAMEMAP() +% +% The function locates the directory containing NWB type definitions +% (using the location of 'types.core.NWBFile' as a reference) and searches +% recursively for all MATLAB class definition files (*.m). It then filters +% out files in the '+types/+untyped' and '+types/+util' folders. +% +% Output: +% map - A mapping object (either a dictionary or containers.Map) where: +% * Keys : Short class names (derived from file names without the .m extension). +% * Values : Fully qualified class names in the format "types.namespace.ClassName". typesClassDirectory = getAncestorPath( which('types.core.NWBFile'), 2 ); - % Find names of all nwb types: + % Find all MATLAB class files recursively within the directory. L = dir(fullfile(typesClassDirectory, '**', '*.m')); + + % Exclude files from the '+types/+untyped' and '+types/+util' directories. ignore = contains({L.folder}, fullfile('+types', '+untyped')) | ... contains({L.folder}, fullfile('+types', '+util')); L(ignore) = []; - + % Extract namespace and class names from the file paths. [~, namespaceNames] = fileparts({L.folder}); namespaceNames = string( strrep(namespaceNames, '+', '') ); classNames = string( strrep( {L.name}, '.m', '') ); - fullClassNames = compose("types.%s.%s", namespaceNames', classNames'); + % Compose fully qualified class names using the namespace and class name. 
+ fullClassNames = matnwb.common.composeFullClassName(namespaceNames, classNames); + + % Create a mapping from the short class names to the fully qualified class names. try - map = dictionary(classNames', fullClassNames); - catch % If older version of MATLAB + map = dictionary(classNames, fullClassNames); + catch % Fallback for older versions of MATLAB. map = containers.Map(classNames, fullClassNames); end -end \ No newline at end of file +end diff --git a/+matnwb/+common/composeFullClassName.m b/+matnwb/+common/composeFullClassName.m new file mode 100644 index 00000000..1c08a09a --- /dev/null +++ b/+matnwb/+common/composeFullClassName.m @@ -0,0 +1,9 @@ +function fullClassName = composeFullClassName(namespaceName, neurodataType) + arguments + namespaceName (:, 1) string + neurodataType (:, 1) string + end + + fullClassName = compose("types.%s.%s", namespaceName, neurodataType); + fullClassName = transpose(fullClassName); % Return as row vector +end From a986a95ff14ef1412f83a680317fa409c311aa5e Mon Sep 17 00:00:00 2001 From: ehennestad Date: Sat, 22 Feb 2025 20:54:10 +0100 Subject: [PATCH 16/19] Update listDatasetsOfNeurodataType.m Resolve name for dataset if the name field is missing --- +schemes/listDatasetsOfNeurodataType.m | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/+schemes/listDatasetsOfNeurodataType.m b/+schemes/listDatasetsOfNeurodataType.m index 1d5c6654..216f5045 100644 --- a/+schemes/listDatasetsOfNeurodataType.m +++ b/+schemes/listDatasetsOfNeurodataType.m @@ -33,8 +33,13 @@ for i = 1:numel(datasetMaps) if isKey(datasetMaps{i}, 'name') datasetNames(i) = datasetMaps{i}('name'); + elseif isKey(datasetMaps{i}, 'data_type_inc') + datasetNames(i) = lower( datasetMaps{i}('data_type_inc') ); + elseif isKey(datasetMaps{i}, 'data_type_def') + datasetNames(i) = lower( datasetMaps{i}('data_type_def') ); else keyboard + error('NWB:UnexpectedError', 'Something unexpected happened.') end end datasetNames(datasetNames=="") = []; From 9e7f242b9b65aaee873c6f2bfab3aadf9ca3c201 Mon Sep 17 00:00:00 2001 From: ehennestad Date: Sat, 22 Feb 2025 20:56:31 +0100 Subject: [PATCH 17/19] Fix compute chunk size Rename flexible dimension to "flex" Use product of fixed dimensions to compute size of flex dimension --- .../+internal/computeChunkSizeFromConfig.m | 77 +++++++++++++++---- .../cloud_dataset_configuration.json | 24 +++--- 2 files changed, 77 insertions(+), 24 deletions(-) diff --git a/+io/+config/+internal/computeChunkSizeFromConfig.m b/+io/+config/+internal/computeChunkSizeFromConfig.m index 4a6de83c..a9eec0e9 100644 --- a/+io/+config/+internal/computeChunkSizeFromConfig.m +++ b/+io/+config/+internal/computeChunkSizeFromConfig.m @@ -3,7 +3,7 @@ % This function determines the chunk size for a dataset based on the chunk % dimensions provided in the datasetConfig structure. It adjusts dimensions % according to rules: 'max' uses the dataset size, fixed numbers use their -% value, and 'null' calculates the dimension size to approximate the target +% value, and 'flex' calculates the dimension size to approximate the target % chunk size in bytes. 
% % Inputs: @@ -28,18 +28,47 @@ dataSize = fliplr(dataSize); % matnwb quirk numDimensions = numel(dataSize); - % Extract relevant configuration parameters + % Extract chunk dimensions configuration chunkDimensions = datasetConfig.chunk_dimensions; - if iscell(chunkDimensions) - numChunkDimensions = cellfun(@numel, chunkDimensions); + if ~iscell(chunkDimensions) + if isscalar(chunkDimensions) + chunkDimensions = {chunkDimensions}; + else + error('Unexpected chunk_dimensions format.'); + end + end + + % Find the chunk dimensions specification matching the number of + % dimensions of the input array A + numChunkDimensions = cellfun(@numel, chunkDimensions); + if any(ismember(numChunkDimensions, numDimensions)) chunkDimensions = chunkDimensions{numChunkDimensions == numDimensions}; + elseif all(numDimensions > numChunkDimensions) + chunkDimensions = chunkDimensions{end}; + else + error('NWB:UnexpectedError', 'Unexpected chunk dimension size.') + end + + if ~iscell(chunkDimensions) + chunkDimensions = arrayfun(@(x) x, chunkDimensions, 'UniformOutput', false); end defaultChunkSize = datasetConfig.target_chunk_size.value; % in bytes dataByteSize = io.config.internal.getDataByteSize(A); + elementSize = io.config.internal.getDataByteSize(A) / numel(A); % bytes per element + + % Determine the target number of elements per chunk. + targetNumElements = defaultChunkSize / elementSize; + % Initialize chunk size array chunkSize = zeros(1, numDimensions); + flexDims = false(1, numDimensions); + + assert(iscell(chunkDimensions), "Something unexpected happened") + + isFlex = @(x) ischar(x) && strcmp(x, 'flex'); + isMax = @(x) ischar(x) && strcmp(x, 'max'); % Calculate chunk size for each dimension for dim = 1:numDimensions @@ -48,18 +77,12 @@ chunkSize(dim) = dataSize(dim); else dimSpec = chunkDimensions{dim}; - if isempty(dimSpec) - % Compute chunk size for 'null' dimensions - % Estimate proportional size based on remaining chunk size - remainingChunkSize = defaultChunkSize / dataByteSize; % scale factor for all dimensions - nullDimensions = find(cellfun(@isempty, chunkDimensions)); - proportionalSize = nthroot(remainingChunkSize, numel(nullDimensions)); - chunkSize(dim) = max(1, round(proportionalSize*dataSize(dim))); + if isFlex(dimSpec) + flexDims(dim) = true; + % Leave chunkSize(dim) to be determined. elseif isnumeric(dimSpec) - % Fixed chunk size chunkSize(dim) = dimSpec; - elseif ischar(dimSpec) && strcmp(dimSpec, 'max') - % Use full dimension size + elseif isMax(dimSpec) chunkSize(dim) = dataSize(dim); else error('Invalid chunk specification for dimension %d.', dim); @@ -67,7 +90,31 @@ end end + % Compute the product of fixed dimensions (number of elements per chunk). + if any(~flexDims) + fixedProduct = prod(chunkSize(~flexDims)); + else + fixedProduct = 1; + end + + % For flex dimensions, compute the remaining number of elements + % and allocate them equally in the exponent space. + nFlex = sum(flexDims); + if nFlex > 0 + remainingElements = targetNumElements / fixedProduct; + % Ensure remainingElements is at least 1. + remainingElements = max(remainingElements, 1); + % Compute an equal allocation factor for each flex dimension. + elementsPerFlexDimension = nthroot(remainingElements, nFlex); + % Assign computed chunk size for each flex dimension. + for dim = find(flexDims) + proposedSize = max(1, round(elementsPerFlexDimension)); + % Do not exceed the full dimension size. 
+ chunkSize(dim) = min(proposedSize, dataSize(dim)); + end + end + % Ensure chunk size does not exceed dataset dimensions - chunkSize = min(chunkSize, dataSize); chunkSize = fliplr(chunkSize); + chunkSize = min(chunkSize, dataSize); end diff --git a/configuration/cloud_dataset_configuration.json b/configuration/cloud_dataset_configuration.json index b410b2fc..d162cd8c 100644 --- a/configuration/cloud_dataset_configuration.json +++ b/configuration/cloud_dataset_configuration.json @@ -6,10 +6,10 @@ "unit": "bytes" }, "chunk_dimensions": [ - [null], - [null, "max"], - [null, "max", "max"], - [null, "max", "max", "max"] + ["flex"], + ["flex", "max"], + ["flex", "max", "max"], + ["flex", "max", "max", "max"] ], "compression": { "algorithm": "deflate", @@ -18,26 +18,32 @@ "prefilters": ["shuffle"] } }, + "VectorData": { + "compression": { + "algorithm": "deflate", + "level": 7 + } + }, "TimeSeries": { "data": { - "chunk_dimensions": [[null, 32], [null, 32, "max"]], + "chunk_dimensions": [["flex", 32], ["flex", 32, "max"]], "compression": { "algorithm": "deflate", "level": 4 } }, "timestamps": { - "chunk_dimensions": [null] + "chunk_dimensions": ["flex"] } }, "ImageSeries": { "data": { - "chunk_dimensions": [[null, "max", "max"], [null, "max", "max", "max"]] + "chunk_dimensions": [["flex", "max", "max"], ["flex", "max", "max", "max"]] } }, "ElectricalSeries": { "data": { - "chunk_dimensions": [[null, 32], [null, 32, "max"]] + "chunk_dimensions": [["flex", 64], ["flex", 64, "max"]] } }, "SpikeEventSeries": { @@ -45,4 +51,4 @@ "chunk_dimensions": [1000] } } -} \ No newline at end of file +} From 58698fe3f2ca24dd8e67aee1d52c40ef6c7e806b Mon Sep 17 00:00:00 2001 From: ehennestad Date: Sat, 22 Feb 2025 20:58:02 +0100 Subject: [PATCH 18/19] Update readDatasetConfiguration.m Add function to update dataset configuration to conform with MatNWB specific implementation (i.e, Dataset types (like VectorData) having a data property) --- +io/+config/readDatasetConfiguration.m | 2 ++ 1 file changed, 2 insertions(+) diff --git a/+io/+config/readDatasetConfiguration.m b/+io/+config/readDatasetConfiguration.m index 4a05e8b6..38c44ead 100644 --- a/+io/+config/readDatasetConfiguration.m +++ b/+io/+config/readDatasetConfiguration.m @@ -34,4 +34,6 @@ configFilePath = fullfile(misc.getMatnwbDir, 'configuration', filename); datasetConfig = jsondecode(fileread(configFilePath)); + + datasetConfig = io.config.internal.applyCustomMatNWBPropertyNames(datasetConfig); end From 560b501bba2a9a17318c5e48b067ce358bff802c Mon Sep 17 00:00:00 2001 From: ehennestad Date: Sat, 22 Feb 2025 20:58:27 +0100 Subject: [PATCH 19/19] Update resolveDatasetConfigForDataType.m --- +io/+config/+internal/resolveDatasetConfigForDataType.m | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/+io/+config/+internal/resolveDatasetConfigForDataType.m b/+io/+config/+internal/resolveDatasetConfigForDataType.m index 402ba0a1..b058519d 100644 --- a/+io/+config/+internal/resolveDatasetConfigForDataType.m +++ b/+io/+config/+internal/resolveDatasetConfigForDataType.m @@ -1,8 +1,9 @@ function resolvedOptions = resolveDatasetConfigForDataType(datasetConfig, nwbObject, datasetName) % resolveDatasetConfigForDataType - Resolve the dataset configuration for individual neurodata types -% This function resolves the dataset configuration options for a given NWB object -% by traversing the object hierarchy and combining options from the most specific -% type to the base type, as defined in the datasetConfig structure. 
+% This function resolves the dataset configuration options for a given NWB +% object by traversing the object hierarchy and combining options from the +% most specific type to the base type, as defined in the datasetConfig +% structure. % % Input: % datasetConfig (struct): A struct representation of the dataset configuration JSON.
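
A minimal usage sketch of the configuration machinery introduced in the patches above, for orientation only: the "cloud" profile, the array shape, and the literal 1 MB target are example choices, and the calls assume the generated NWB type classes are on the path and that the functions behave as shown in the diffs.

% Profile names map directly onto file names, so "cloud" resolves to
% configuration/cloud_dataset_configuration.json.
cloudConfig = io.config.readDatasetConfiguration("cloud");

% Dataset-type entries such as VectorData are rewrapped by
% applyCustomMatNWBPropertyNames, so their options sit under a "data" field,
% mirroring the extra "data" property MatNWB adds to Dataset type classes.
disp(cloudConfig.VectorData.data.compression.algorithm)  % "deflate" in the cloud profile

% Chunk sizes are derived from the resolved options. The target chunk size is
% supplied literally here; in normal use it would be resolved from the profile
% (see resolveDatasetConfigForDataType). Dimension order in chunk_dimensions
% follows the flipped (HDF5-style) convention noted in computeChunkSizeFromConfig.
A = rand(10000, 64);
resolvedConfig = struct( ...
    'chunk_dimensions', {cloudConfig.ElectricalSeries.data.chunk_dimensions}, ...
    'target_chunk_size', struct('value', 1e6, 'unit', 'bytes'));
chunkSize = io.config.internal.computeChunkSizeFromConfig(A, resolvedConfig);
disp(chunkSize)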