Fix data-corruption error in CBF compression (#314)
The byte-offset compression introduced by #238, #228 was incorrect, causing
problems for anything _writing_ CBF files, i.e. dxtbx.dlsnxs2cbf (and probably
dials.merge_cbf).

- -0xf777 -> -0x7fff, i.e. one greater than -0x8000: deltas with too large a
  negative value were being incorrectly encoded (the encoding is sketched in
  Python after the file summary below)
- rework the fall-through / last case in compression to match the layout of
  decompression
- more thorough testing, in particular tests with negative deltas of magnitude
  greater than 0x8000
- add a test verifying that the HDF5 and CBF representations of the data
  written by dlsnxs2cbf are the same
graeme-winter authored Feb 22, 2021
1 parent af49159 commit d1f9583
Showing 4 changed files with 59 additions and 24 deletions.
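For context, CBF byte-offset compression stores each pixel as a delta from the
previous pixel, escaping to wider integers through sentinel values. Below is a
minimal Python sketch of the corrected encoder logic; byte_offset_compress is a
hypothetical name for illustration only, not the dxtbx API (the real
implementation is the C++ cbf_compress in the diff that follows):

import struct


def byte_offset_compress(values):
    """Encode pixel values with the CBF byte-offset scheme (little-endian).

    Each pixel is stored as the delta from the previous pixel: one signed
    byte if it fits, else the sentinel -0x80 followed by a signed short,
    else a second sentinel -0x8000 followed by a signed 32-bit int.
    """
    packed = bytearray()
    current = 0
    for value in values:
        delta = value - current
        if -0x7F <= delta < 0x80:  # one byte; -0x80 is reserved as sentinel
            packed += struct.pack("<b", delta)
        elif -0x7FFF <= delta < 0x8000:  # the corrected bound (was -0xf777)
            packed += struct.pack("<bh", -0x80, delta)
        else:  # fall through: the 2-byte sentinel, then the full delta
            packed += struct.pack("<bhi", -0x80, -0x8000, delta)
        current = value
    return bytes(packed)

Under the old -0xf777 bound, a delta of e.g. -40000 passed the short-range
test and was truncated to 16 bits, silently wrapping to +25536; a delta of
exactly -0x8000 was emitted as the bytes of the sentinel itself. Both cases
now fall through to the four-byte form.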
31 changes: 16 additions & 15 deletions boost_python/compression.cc
@@ -1,4 +1,5 @@
#include "compression.h"
#include <assert.h>

typedef union {
char b[2];
@@ -62,7 +63,7 @@ std::vector<char> dxtbx::boost_python::cbf_compress(const int *values,
 
     packed.push_back(-0x80);
 
-    if ((-0xf777 <= delta) && (delta < 0x8000)) {
+    if ((-0x7fff <= delta) && (delta < 0x8000)) {
       s = (short)delta;
       b = ((union_short *)&s)[0].b;
 
@@ -75,6 +76,7 @@ std::vector<char> dxtbx::boost_python::cbf_compress(const int *values,
       current += delta;
       continue;
     }
+
     s = -0x8000;
     b = ((union_short *)&s)[0].b;
 
@@ -85,21 +87,20 @@ std::vector<char> dxtbx::boost_python::cbf_compress(const int *values,
     packed.push_back(b[0]);
     packed.push_back(b[1]);
 
-    if ((-0x7fffffff <= delta) && (delta < 0x80000000)) {
-      i = delta;
-      b = ((union_int *)&i)[0].b;
+    assert((-0x7fffffff <= delta) && (delta < 0x80000000));
 
-      if (!le) {
-        byte_swap_int(b);
-      }
+    i = delta;
+    b = ((union_int *)&i)[0].b;
 
-      packed.push_back(b[0]);
-      packed.push_back(b[1]);
-      packed.push_back(b[2]);
-      packed.push_back(b[3]);
-      current += delta;
-      continue;
+    if (!le) {
+      byte_swap_int(b);
     }
+
+    packed.push_back(b[0]);
+    packed.push_back(b[1]);
+    packed.push_back(b[2]);
+    packed.push_back(b[3]);
+    current += delta;
   }
 
   return packed;
@@ -119,7 +120,7 @@ void dxtbx::boost_python::cbf_decompress(const char *packed,
     c = packed[j];
     j += 1;
 
-    if (c != -128) {
+    if (c != -0x80) {
       current += c;
       *values = current;
       values++;
@@ -134,7 +135,7 @@ void dxtbx::boost_python::cbf_decompress(const char *packed,
       byte_swap_short((char *)&s);
     }
 
-    if (s != -32768) {
+    if (s != -0x8000) {
       current += s;
       *values = current;
       values++;
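To see why the fall-through case was rearranged "for similarity with
decompression layout", a matching minimal decoder sketch helps (again
hypothetical Python mirroring the C++ cbf_decompress above, not the dxtbx
API): each sentinel simply widens the next read, so the encoder's final case
must unconditionally emit the four-byte form.

import struct


def byte_offset_decompress(packed, n):
    """Decode n pixel values from a byte-offset stream (little-endian)."""
    values = []
    current = 0
    j = 0
    for _ in range(n):
        (c,) = struct.unpack_from("<b", packed, j)
        j += 1
        if c != -0x80:  # plain one-byte delta
            current += c
        else:
            (s,) = struct.unpack_from("<h", packed, j)
            j += 2
            if s != -0x8000:  # two-byte delta
                current += s
            else:  # four-byte delta
                (i,) = struct.unpack_from("<i", packed, j)
                j += 4
                current += i
        values.append(current)
    return values

Paired with the encoder sketch above,
byte_offset_decompress(byte_offset_compress([1, 44369, 214]), 3) round-trips
correctly, including the -44155 delta that the old bound corrupted.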
2 changes: 2 additions & 0 deletions newsfragments/314.bugfix
@@ -0,0 +1,2 @@
+Fix error corrupting data when writing CBF files with large pixel values.
+This affected ``dxtbx.dlsnxs2cbf`` and ``dials.merge_cbf``
35 changes: 26 additions & 9 deletions tests/command_line/test_dlsnxs2cbf.py
@@ -1,24 +1,41 @@
+import h5py
+import numpy as np
 import procrunner
 import pytest
 
+from dxtbx.format.FormatCBFMiniEigerDLS16MSN160 import FormatCBFMiniEigerDLS16MSN160
+from dxtbx.model.experiment_list import ExperimentListFactory
+
 
 @pytest.mark.xfail(
     "os.name == 'nt'", reason="crashes python process on Windows", run=False
 )
-def test_dlsnxs2cbf(dials_data, tmpdir):
+def test_dlsnxs2cbf(dials_data, tmp_path):
     screen = dials_data("thaumatin_eiger_screen")
     master = screen.join("Therm_6_1_master.h5")
     result = procrunner.run(
-        ["dxtbx.dlsnxs2cbf", master, "junk_%04d.cbf"], working_directory=tmpdir
+        ["dxtbx.dlsnxs2cbf", master, "junk_%04d.cbf"], working_directory=tmp_path
     )
     assert not result.returncode and not result.stderr
 
-    expected_output = "\n".join("junk_%04d.cbf" % j for j in (1, 2, 3))
-
-    for record in expected_output.split("\n"):
-        assert record.strip().encode("latin-1") in result.stdout, record
+    output_files = ["junk_%04d.cbf" % j for j in (1, 2, 3)]
+    for file in output_files:
+        assert file.encode("latin-1") in result.stdout
+        assert tmp_path.joinpath(file).is_file()
 
+    # check files on disk
+    expts = ExperimentListFactory.from_filenames(
+        str(tmp_path / file) for file in output_files
+    )
+    assert all(
+        imgset.get_format_class() == FormatCBFMiniEigerDLS16MSN160
+        for imgset in expts.imagesets()
+    )
 
-    for j in (1, 2, 3):
-        assert tmpdir.join("junk_%04d.cbf" % j).check()
+    with h5py.File(master) as fh:
+        for i, imgset in enumerate(expts.imagesets()):
+            original = fh["/entry/data/data_000001"][i][()]
+            sel = np.where(original < original.max())
+            np.testing.assert_equal(
+                fh["/entry/data/data_000001"][i][sel],
+                imgset.get_raw_data(0)[0].as_numpy_array()[sel],
+            )
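One detail of the new HDF5-vs-CBF check is the sel mask: Eiger HDF5 frames
flag masked or defective pixels with the maximum value in the frame, and the
CBF conversion is free to remap those, which is presumably why the comparison
is restricted to pixels below original.max(). A toy illustration of the same
masked comparison, using hypothetical values rather than data from the test
set:

import numpy as np

# 65535 marks a masked pixel in the HDF5 frame; the CBF writer may remap
# it (here, to -1), so it must be excluded from the comparison.
hdf5_frame = np.array([10, 44369, 65535, 214], dtype=np.int64)
cbf_frame = np.array([10, 44369, -1, 214], dtype=np.int64)

sel = np.where(hdf5_frame < hdf5_frame.max())  # keep only unmasked pixels
np.testing.assert_equal(hdf5_frame[sel], cbf_frame[sel])  # passes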
15 changes: 15 additions & 0 deletions tests/test_compression.py
@@ -0,0 +1,15 @@
+from scitbx.array_family import flex
+
+from dxtbx.ext import compress, uncompress
+
+
+def test_compress_decompress():
+    x, y = 10, 10
+
+    data = flex.int(x * y, 1)
+    data[10] = 44369
+    data[11] = 214
+    compressed = compress(data)
+    uncompressed = uncompress(compressed, x, y)
+
+    assert list(data) == list(uncompressed)
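The magic numbers in this test target the bug directly: 44369 - 1 = 44368
requires a four-byte delta, and 214 - 44369 = -44155 is a negative delta of
magnitude greater than 0x8000, which the old -0xf777 bound silently wrapped
to +21381. A sketch of a further boundary-value test one might add, assuming
the same compress/uncompress interface as above (test name and parameter
values are illustrative, not part of this commit):

import pytest
from scitbx.array_family import flex

from dxtbx.ext import compress, uncompress


@pytest.mark.parametrize("delta", [-0x8001, -0x8000, -0x7FFF, 0x7FFF, 0x8000])
def test_compress_decompress_boundary(delta):
    # Two pixels whose difference sits at a sentinel boundary of the
    # byte-offset encoding; 0x10000 keeps both pixel values positive.
    data = flex.int([0x10000, 0x10000 + delta])
    assert list(uncompress(compress(data), 2, 1)) == list(data)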
