From fe079044de5e6afb7e6e3168256bb094dfd0d816 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Wed, 21 Oct 2020 23:49:09 +0100 Subject: [PATCH 1/5] refactor codecs into a single registry spec --- docs/codecs.rst | 208 ++++++++++++++++++++++++++++++++++++-- docs/codecs/gzip/v1.0.rst | 122 ---------------------- docs/index.rst | 2 +- 3 files changed, 203 insertions(+), 129 deletions(-) delete mode 100644 docs/codecs/gzip/v1.0.rst diff --git a/docs/codecs.rst b/docs/codecs.rst index 5e6702a8..a9f1312e 100644 --- a/docs/codecs.rst +++ b/docs/codecs.rst @@ -1,11 +1,207 @@ -====== +============== +Codec registry +============== +------------------------------ +Editor's Draft 21 October 2020 +------------------------------ + +Specification URI: + https://purl.org/zarr/specs/codecs +Issue tracking: + `GitHub issues `_ +Suggest an edit for this spec: + `GitHub editor `_ + +Copyright 2020 `Zarr core development team +`_. This work +is licensed under a `Creative Commons Attribution 3.0 Unported License +`_. + +---- + + +Abstract +======== + +This document defines codecs for use as compressors and/or filters as +part of a Zarr implementation. + + +Status of this document +======================== + +This document is a **Work in Progress**. It may be updated, replaced +or obsoleted by other documents at any time. It is inappropriate to +cite this document as other than work in progress. + +Comments, questions or contributions to this document are very +welcome. Comments and questions should be raised via `GitHub issues +`_. + +This document is maintained by the `Zarr core development team +`_. + + +Document conventions +==================== + +This document lists a collection of codecs. For each codec, the +following information is provided: + +* A URI which can be used to uniquely identify the codec in Zarr array + metadata. +* Any configuration parameters which can be set in Zarr array + metadata. 
+* A definition of encoding/decoding algorithm and the encoded format, + or a citation to an existing specification where this is defined. +* Any additional headers added to the encoded data. + +Conformance requirements are expressed with a combination of +descriptive assertions and [RFC2119]_ terminology. The key words +"MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", +"SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in the normative +parts of this document are to be interpreted as described in +[RFC2119]_. However, for readability, these words do not appear in all +uppercase letters in this specification. + +All of the text of this specification is normative except sections +explicitly marked as non-normative, examples, and notes. Examples in +this specification are introduced with the words "for example". + + Codecs ====== -Under construction. +Gzip +---- + +Codec URI: + https://purl.org/zarr/spec/codecs/gzip + + +Configuration parameters +~~~~~~~~~~~~~~~~~~~~~~~~ + +level: + An integer from 0 to 9 which controls the speed and level of + compression. A level of 1 is the fastest compression method and + produces the least compression, while 9 is slowest and produces + the most compression. Compression is turned off completely when + level is 0. + +For example, the array metadata below specifies that the compressor is +the Gzip codec configured with a compression level of 1:: + + { + "compressor": { + "codec": "https://purl.org/zarr/spec/codecs/gzip", + "configuration": { + "level": 1 + } + } + } + + +Format and algorithm +~~~~~~~~~~~~~~~~~~~~ + +Encoding and decoding is performed using the algorithm defined in +[RFC1951]_. + +Encoded data should conform to the Gzip file format [RFC1952]_. + + +Blosc +----- + +Codec URI: + https://purl.org/zarr/spec/codecs/blosc + + +Configuration parameters +~~~~~~~~~~~~~~~~~~~~~~~~ + +cname: + A string identifying the internal compression algorithm to be + used. 
At the time of writing, the following values are supported + by the c-blosc library: "lz4", "lz4hc", "blosclz", "zstd", + "snappy", "zlib". + +clevel: + An integer between 1 and 9 indicating the compression level. + +shuffle: + An integer value in the set {0, 1, 2, -1}. A value of 1 + indicates that byte-wise shuffling is performed in addition to + compression. A value of 2 indicates the bit-wise shuffling is + performed in addition to compression. If a value of -1 is given, + then default shuffling is used: bit-wise shuffling for buffers + with item size of 1 byte, byte-wise shuffling otherwise. + Shuffling is turned off completely when the value is 0. + +blocksize: + An integer giving the size in bytes of blocks into which a + buffer is divided before compression. + +For example, the array metadata document below specifies that the +compressor is the Blosc codec configured with a compression level of +1, byte-wise shuffling, the ``lz4`` compression algorithm and the +default block size:: + + { + "compressor": { + "codec": "https://purl.org/zarr/spec/codecs/blosc", + "configuration": { + "cname": "lz4", + "clevel": 1, + "shuffle": 1, + "blocksize": 0 + } + } + } + + +Format and algorithm +~~~~~~~~~~~~~~~~~~~~ + +Blosc is a meta-compressor, which divides an input buffer into blocks, +then applies an internal compression algorithm to each block, then +packs the encoded blocks together into a single output buffer with a +header. The format of the encoded buffer is defined in [BLOSC]_. The +reference implementation is provided by the `c-blosc library +`_. + + +Deprecated codecs +================= + +There are no deprecated codecs at this time. + + +References +========== + +.. [RFC2119] S. Bradner. Key words for use in RFCs to Indicate + Requirement Levels. March 1997. Best Current Practice. URL: + https://tools.ietf.org/html/rfc2119 + +.. [RFC1951] P. Deutsch. DEFLATE Compressed Data Format Specification version + 1.3. May 1996. Informational. 
URL: + https://tools.ietf.org/html/rfc1951 + +.. [RFC1952] P. Deutsch. GZIP file format specification version 4.3. + May 1996. Informational. URL: + https://tools.ietf.org/html/rfc1952 + +.. [BLOSC] F. Alted. Blosc Chunk Format. URL: + https://github.com/Blosc/c-blosc/blob/master/README_CHUNK_FORMAT.rst + + +Change log +========== -.. toctree:: - :maxdepth: 1 - :caption: Contents: +Editor's Draft 21 October 2020 +------------------------------ - codecs/gzip/v1.0 +* Added Gzip codec. +* Added Blosc codec. diff --git a/docs/codecs/gzip/v1.0.rst b/docs/codecs/gzip/v1.0.rst deleted file mode 100644 index b0cfe808..00000000 --- a/docs/codecs/gzip/v1.0.rst +++ /dev/null @@ -1,122 +0,0 @@ -======================== -Gzip Codec (version 1.0) -======================== ------------------------------ - Editor's draft 31 March 2020 ------------------------------ - -Specification URI: - https://purl.org/zarr/spec/codecs/gzip/1.0 -Issue tracking: - `GitHub issues `_ -Suggest an edit for this spec: - `GitHub editor `_ - -Copyright 2020 `Zarr core development -team `_ (@@TODO -list institutions?). This work is licensed under a `Creative Commons -Attribution 3.0 Unported -License `_. - ----- - - -Abstract -======== - -This specification defines an codec for chunk compression using Gzip - - -Status of this document -======================= - -This document is a **Work in Progress**. It may be updated, replaced -or obsoleted by other documents at any time. It is inappapropriate to -cite this document as other than work in progress. - -Comments, questions or contributions to this document are very -welcome. Comments and questions should be raised via `GitHub issues -`_. When -raising an issue, please add the label "codecs-gzip-v1.0". - -This document was produced by the `Zarr core development team -`_. - - -Document conventions -==================== - -Conformance requirements are expressed with a combination of -descriptive assertions and [RFC2119]_ terminology. 
The key words -"MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", -"SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in the normative -parts of this document are to be interpreted as described in -[RFC2119]_. However, for readability, these words do not appear in all -uppercase letters in this specification. - -All of the text of this specification is normative except sections -explicitly marked as non-normative, examples, and notes. Examples in -this specification are introduced with the words "for example". - - -Chunk encoding/decoding with Gzip -================================= - -@@TODO define how chunks are encoded and decoded -@@TODO be sure to clarify that the encoded data should conform to the gzip file format - -Chunks are encoded and decoded using the compression algorithm defined in -[RFC1951]_ and encoded data should conform to the Gzip file format [RFC1952]_. -The compression level is an integer from 0 to 9 which controls the speed and -level of compression. A level of 1 is the fastest compression method and -produces the least compressions, while 9 is slowest and produces the most -compression. Compression is turned off completely when level is 0. - - -Configuring codec in array metadata -=================================== - -@@TODO define how to specify in array metadata documents. - -The Gzip codec can be specified as a compressor for a Zarr array under the -``compressor`` name in the corresponding array metadata document. The URI for -the Gzip codec defined in this specification is -https://purl.org/zarr/spec/codecs/gzip/1.0. - -Additionally, the compression level must be specified as the value of the -``level`` name in the ``configuration`` metadata name. For example, the array -metadata document below specifies a Gzip codec configured with a compression -level of 1:: - - - { - "compressor": { - "codec": "https://purl.org/zarr/spec/codecs/gzip/1.0", - "configuration": { - "level": 1 - } - }, - } - - -References -========== - -.. 
[RFC2119] S. Bradner. Key words for use in RFCs to Indicate - Requirement Levels. March 1997. Best Current Practice. URL: - https://tools.ietf.org/html/rfc2119 - -.. [RFC1951] P. Deutsch. DEFLATE Compressed Data Format Specification version - 1.3. Requirement Levels. May 1996. Informational. URL: - https://tools.ietf.org/html/rfc1951 - -.. [RFC1952] P. Deutsch. GZIP file format specification version 4.3. - Requirement Levels. May 1996. Informational. URL: - https://tools.ietf.org/html/rfc1952 - - - -Change log -========== - -@@TODO diff --git a/docs/index.rst b/docs/index.rst index 7a985d07..498ae301 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,8 +10,8 @@ Under construction. :caption: Contents: protocol - stores codecs + stores Indices and tables From 571cbc02afb29d9b2232f071c999c09736feee33 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 23 Oct 2020 16:42:29 +0100 Subject: [PATCH 2/5] allow compression level 0 Co-authored-by: David Brochart --- docs/codecs.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/codecs.rst b/docs/codecs.rst index a9f1312e..98ae17f3 100644 --- a/docs/codecs.rst +++ b/docs/codecs.rst @@ -128,7 +128,11 @@ cname: "snappy", "zlib". clevel: - An integer between 1 and 9 indicating the compression level. + An integer from 0 to 9 which controls the speed and level of + compression. A level of 1 is the fastest compression method and + produces the least compression, while 9 is slowest and produces + the most compression. Compression is turned off completely when + level is 0. shuffle: An integer value in the set {0, 1, 2, -1}. 
A value of 1 From b646e1b8d81c6420351bf1a0e6c3aa10a4be25fc Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 23 Oct 2020 16:42:59 +0100 Subject: [PATCH 3/5] clarify shuffling Co-authored-by: David Brochart --- docs/codecs.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/codecs.rst b/docs/codecs.rst index 98ae17f3..d007d860 100644 --- a/docs/codecs.rst +++ b/docs/codecs.rst @@ -135,10 +135,12 @@ clevel: level is 0. shuffle: - An integer value in the set {0, 1, 2, -1}. A value of 1 - indicates that byte-wise shuffling is performed in addition to + An integer value in the set {0, 1, 2, -1} indicating the way + bytes or bits are rearranged, which can lead to faster + and/or greater compression. A value of 1 + indicates that byte-wise shuffling is performed prior to compression. A value of 2 indicates the bit-wise shuffling is - performed in addition to compression. If a value of -1 is given, + performed prior to compression. If a value of -1 is given, then default shuffling is used: bit-wise shuffling for buffers with item size of 1 byte, byte-wise shuffling otherwise. Shuffling is turned off completely when the value is 0. From dffe91d9f20e800575599d5684395149203d8bb2 Mon Sep 17 00:00:00 2001 From: Alistair Miles Date: Fri, 23 Oct 2020 16:43:25 +0100 Subject: [PATCH 4/5] Clarify automatic block size Co-authored-by: David Brochart --- docs/codecs.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/codecs.rst b/docs/codecs.rst index d007d860..2f3371a3 100644 --- a/docs/codecs.rst +++ b/docs/codecs.rst @@ -147,7 +147,8 @@ shuffle: blocksize: An integer giving the size in bytes of blocks into which a - buffer is divided before compression. + buffer is divided before compression. A value of 0 + indicates that an automatic size will be used. 
For example, the array metadata document below specifies that the compressor is the Blosc codec configured with a compression level of From 231419248f688c4c7f154c5e2b84c0c960b62f84 Mon Sep 17 00:00:00 2001 From: jmoore Date: Fri, 6 May 2022 15:38:10 +0200 Subject: [PATCH 5/5] Re-apply s/codecs/codec/ change --- docs/codecs.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/codecs.rst b/docs/codecs.rst index 2f3371a3..b2734e3c 100644 --- a/docs/codecs.rst +++ b/docs/codecs.rst @@ -6,7 +6,7 @@ Editor's Draft 21 October 2020 ------------------------------ Specification URI: - https://purl.org/zarr/specs/codecs + https://purl.org/zarr/specs/codec Issue tracking: `GitHub issues `_ Suggest an edit for this spec: @@ -76,7 +76,7 @@ Gzip ---- Codec URI: - https://purl.org/zarr/spec/codecs/gzip + https://purl.org/zarr/spec/codec/gzip Configuration parameters @@ -94,7 +94,7 @@ the Gzip codec configured with a compression level of 1:: { "compressor": { - "codec": "https://purl.org/zarr/spec/codecs/gzip", + "codec": "https://purl.org/zarr/spec/codec/gzip", "configuration": { "level": 1 } @@ -115,7 +115,7 @@ Blosc ----- Codec URI: - https://purl.org/zarr/spec/codecs/blosc + https://purl.org/zarr/spec/codec/blosc Configuration parameters @@ -157,7 +157,7 @@ default block size:: { "compressor": { - "codec": "https://purl.org/zarr/spec/codecs/blosc", + "codec": "https://purl.org/zarr/spec/codec/blosc", "configuration": { "cname": "lz4", "clevel": 1,