Added support for EXIF orientation transform in read_image for PNG

vfdev-5 · vfdev-5 · commit cd6baeb85d5b · 2024-03-06T08:52:57.000+01:00
Description:
- added support for EXIF orientation transform in read_image for PNG
- restructured exif.h: the EXIF parser (fetch_exif_orientation) is now shared by JPEG- and PNG-specific fetchers
- added tests
diff --git a/test/test_image.py b/test/test_image.py
@@ -100,14 +100,15 @@ def test_decode_jpeg(img_path, pil_mode, mode):
     assert abs_mean_diff < 2
 
 
+@pytest.mark.parametrize("codec", [("png", "PNG"), ("jpg", "JPEG")])
 @pytest.mark.parametrize("orientation", [1, 2, 3, 4, 5, 6, 7, 8, 0])
-def test_decode_jpeg_with_exif_orientation(tmpdir, orientation):
-    fp = os.path.join(tmpdir, f"exif_oriented_{orientation}.jpg")
+def test_decode_with_exif_orientation(tmpdir, codec, orientation):
+    fp = os.path.join(tmpdir, f"exif_oriented_{orientation}.{codec[0]}")
     t = torch.randint(0, 256, size=(3, 256, 257), dtype=torch.uint8)
     im = F.to_pil_image(t)
     exif = im.getexif()
     exif[0x0112] = orientation  # set exif orientation
-    im.save(fp, "JPEG", exif=exif.tobytes())
+    im.save(fp, codec[1], exif=exif.tobytes())
 
     data = read_file(fp)
     output = decode_image(data, apply_exif_orientation=True)
diff --git a/torchvision/csrc/io/image/cpu/decode_image.cpp b/torchvision/csrc/io/image/cpu/decode_image.cpp
@@ -27,7 +27,8 @@ torch::Tensor decode_image(
   if (memcmp(jpeg_signature, datap, 3) == 0) {
     return decode_jpeg(data, mode, apply_exif_orientation);
   } else if (memcmp(png_signature, datap, 4) == 0) {
-    return decode_png(data, mode);
+    return decode_png(
+        data, mode, /*allow_16_bits=*/false, apply_exif_orientation);
   } else {
     TORCH_CHECK(
         false,
diff --git a/torchvision/csrc/io/image/cpu/decode_jpeg.cpp b/torchvision/csrc/io/image/cpu/decode_jpeg.cpp
@@ -203,7 +203,7 @@ torch::Tensor decode_jpeg(
 
   int exif_orientation = -1;
   if (apply_exif_orientation) {
-    exif_orientation = fetch_exif_orientation(&cinfo);
+    exif_orientation = fetch_jpeg_exif_orientation(&cinfo);
   }
 
   jpeg_start_decompress(&cinfo);
diff --git a/torchvision/csrc/io/image/cpu/decode_png.cpp b/torchvision/csrc/io/image/cpu/decode_png.cpp
@@ -1,14 +1,18 @@
 #include "decode_png.h"
 #include "common_png.h"
+#include "exif.h"
 
 namespace vision {
 namespace image {
 
+using namespace exif_private;
+
 #if !PNG_FOUND
 torch::Tensor decode_png(
     const torch::Tensor& data,
     ImageReadMode mode,
-    bool allow_16_bits) {
+    bool allow_16_bits,
+    bool apply_exif_orientation) {
   TORCH_CHECK(
       false, "decode_png: torchvision not compiled with libPNG support");
 }
@@ -22,7 +26,8 @@ bool is_little_endian() {
 torch::Tensor decode_png(
     const torch::Tensor& data,
     ImageReadMode mode,
-    bool allow_16_bits) {
+    bool allow_16_bits,
+    bool apply_exif_orientation) {
   C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.image.cpu.decode_png.decode_png");
   // Check that the input tensor dtype is uint8
   TORCH_CHECK(data.dtype() == torch::kU8, "Expected a torch.uint8 tensor");
@@ -234,8 +239,19 @@ torch::Tensor decode_png(
       t_ptr = tensor.accessor<int32_t, 3>().data();
     }
   }
+
+  int exif_orientation = -1;
+  if (apply_exif_orientation) {
+    exif_orientation = fetch_png_exif_orientation(png_ptr, info_ptr);
+  }
+
   png_destroy_read_struct(&png_ptr, &info_ptr, nullptr);
-  return tensor.permute({2, 0, 1});
+
+  auto output = tensor.permute({2, 0, 1});
+  if (apply_exif_orientation) {
+    return exif_orientation_transform(output, exif_orientation);
+  }
+  return output;
 }
 #endif
 
diff --git a/torchvision/csrc/io/image/cpu/decode_png.h b/torchvision/csrc/io/image/cpu/decode_png.h
@@ -9,7 +9,8 @@ namespace image {
 C10_EXPORT torch::Tensor decode_png(
     const torch::Tensor& data,
     ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED,
-    bool allow_16_bits = false);
+    bool allow_16_bits = false,
+    bool apply_exif_orientation = false);
 
 } // namespace image
 } // namespace vision
diff --git a/torchvision/csrc/io/image/cpu/exif.h b/torchvision/csrc/io/image/cpu/exif.h
@@ -51,8 +51,12 @@ direct,
 // https://github.com/opencv/opencv/blob/097891e311fae1d8354eb092a0fd0171e630d78c/modules/imgcodecs/src/exif.cpp
 
 #if JPEG_FOUND
-
 #include <jpeglib.h>
+#endif
+#if PNG_FOUND
+#include <png.h>
+#endif
+
 #include <torch/types.h>
 
 namespace vision {
@@ -125,8 +129,48 @@ inline uint32_t get_uint32(
       (exif_data[offset + 2] << 8) + exif_data[offset + 3];
 }
 
-inline int fetch_exif_orientation(j_decompress_ptr cinfo) {
+inline int fetch_exif_orientation(unsigned char* exif_data_ptr, size_t size) {
   int exif_orientation = -1;
+
+  // Exif binary structure looks like this
+  // First 6 bytes: [E, x, i, f, 0, 0] (JPEG APP1 only, skipped by the caller)
+  // Endianness, 2 bytes : [M, M] or [I, I]
+  // Tag mark, 2 bytes: [0, 0x2a]
+  // Offset, 4 bytes
+  // Num entries, 2 bytes
+  // Tag entries and data, tag has 2 bytes and its data has 10 bytes
+  // For more details:
+  // http://www.media.mit.edu/pia/Research/deepview/exif.html
+
+  ExifDataReader exif_data(exif_data_ptr, size);
+  auto endianness = get_endianness(exif_data);
+
+  // Checking whether Tag Mark (0x002A) corresponds to the one contained in
+  // the Exif data
+  uint16_t tag_mark = get_uint16(exif_data, endianness, 2);
+  if (tag_mark == REQ_EXIF_TAG_MARK) {
+    auto offset = get_uint32(exif_data, endianness, 4);
+    size_t num_entry = get_uint16(exif_data, endianness, offset);
+    offset += 2; // go to start of tag fields
+    constexpr size_t tiff_field_size = 12;
+    for (size_t entry = 0; entry < num_entry; entry++) {
+      // Here we just search for orientation tag and parse it
+      auto tag_num = get_uint16(exif_data, endianness, offset);
+      if (tag_num == INCORRECT_TAG) {
+        break;
+      }
+      if (tag_num == ORIENTATION_EXIF_TAG) {
+        exif_orientation = get_uint16(exif_data, endianness, offset + 8);
+        break;
+      }
+      offset += tiff_field_size;
+    }
+  }
+  return exif_orientation;
+}
+
+#if JPEG_FOUND
+inline int fetch_jpeg_exif_orientation(j_decompress_ptr cinfo) {
   // Check for Exif marker APP1
   jpeg_saved_marker_ptr exif_marker = 0;
   jpeg_saved_marker_ptr cmarker = cinfo->marker_list;
@@ -137,51 +181,48 @@ inline int fetch_exif_orientation(j_decompress_ptr cinfo) {
     cmarker = cmarker->next;
   }
 
-  if (exif_marker) {
-    // Exif binary structure looks like this
-    // First 6 bytes: [E, x, i, f, 0, 0]
-    // Endianness, 2 bytes : [M, M] or [I, I]
-    // Tag mark, 2 bytes: [0, 0x2a]
-    // Offset, 4 bytes
-    // Num entries, 2 bytes
-    // Tag entries and data, tag has 2 bytes and its data has 10 bytes
-    // For more details:
-    // http://www.media.mit.edu/pia/Research/deepview/exif.html
-
-    // Bytes from Exif size field to the first TIFF header
-    constexpr size_t start_offset = 6;
-    if (exif_marker->data_length > start_offset) {
-      auto* exif_data_ptr = exif_marker->data + start_offset;
-      auto size = exif_marker->data_length - start_offset;
-
-      ExifDataReader exif_data(exif_data_ptr, size);
-      auto endianness = get_endianness(exif_data);
-
-      // Checking whether Tag Mark (0x002A) correspond to one contained in the
-      // Jpeg file
-      uint16_t tag_mark = get_uint16(exif_data, endianness, 2);
-      if (tag_mark == REQ_EXIF_TAG_MARK) {
-        auto offset = get_uint32(exif_data, endianness, 4);
-        size_t num_entry = get_uint16(exif_data, endianness, offset);
-        offset += 2; // go to start of tag fields
-        constexpr size_t tiff_field_size = 12;
-        for (size_t entry = 0; entry < num_entry; entry++) {
-          // Here we just search for orientation tag and parse it
-          auto tag_num = get_uint16(exif_data, endianness, offset);
-          if (tag_num == INCORRECT_TAG) {
-            break;
-          }
-          if (tag_num == ORIENTATION_EXIF_TAG) {
-            exif_orientation = get_uint16(exif_data, endianness, offset + 8);
-            break;
-          }
-          offset += tiff_field_size;
-        }
-      }
-    }
+  if (!exif_marker) {
+    return -1;
   }
-  return exif_orientation;
+
+  constexpr size_t start_offset = 6;
+  if (exif_marker->data_length <= start_offset) {
+    return -1;
+  }
+
+  auto* exif_data_ptr = exif_marker->data + start_offset;
+  auto size = exif_marker->data_length - start_offset;
+
+  return fetch_exif_orientation(exif_data_ptr, size);
+}
+#else
+inline int fetch_jpeg_exif_orientation(j_decompress_ptr cinfo) {
+  return -1;
+}
+#endif // #if JPEG_FOUND
+
+#if PNG_FOUND
+inline int fetch_png_exif_orientation(png_structp png_ptr, png_infop info_ptr) {
+#ifdef PNG_eXIf_SUPPORTED
+  png_uint_32 num_exif = 0;
+  png_bytep exif = 0;
+
+  // Exif info could be in info_ptr
+  if (png_get_valid(png_ptr, info_ptr, PNG_INFO_eXIf)) {
+    png_get_eXIf_1(png_ptr, info_ptr, &num_exif, &exif);
+  }
+
+  if (exif && num_exif > 0) {
+    return fetch_exif_orientation(exif, num_exif);
+  }
+#endif // #ifdef PNG_eXIf_SUPPORTED
+  return -1;
+}
+#else
+inline int fetch_png_exif_orientation(j_decompress_ptr cinfo) {
+  return -1;
 }
+#endif // #if PNG_FOUND
 
 constexpr uint16_t IMAGE_ORIENTATION_TL = 1; // normal orientation
 constexpr uint16_t IMAGE_ORIENTATION_TR = 2; // needs horizontal flip
@@ -222,5 +263,3 @@ inline torch::Tensor exif_orientation_transform(
 } // namespace exif_private
 } // namespace image
 } // namespace vision
-
-#endif
diff --git a/torchvision/csrc/io/image/image.cpp b/torchvision/csrc/io/image/image.cpp
@@ -21,7 +21,8 @@ namespace image {
 
 static auto registry =
     torch::RegisterOperators()
-        .op("image::decode_png", &decode_png)
+        .op("image::decode_png(Tensor data, int mode, bool allow_16_bits = False, bool apply_exif_orientation=False) -> Tensor",
+            &decode_png)
         .op("image::encode_png", &encode_png)
         .op("image::decode_jpeg(Tensor data, int mode, bool apply_exif_orientation=False) -> Tensor",
             &decode_jpeg)
diff --git a/torchvision/io/image.py b/torchvision/io/image.py
@@ -67,7 +67,9 @@ def write_file(filename: str, data: torch.Tensor) -> None:
     torch.ops.image.write_file(filename, data)
 
 
-def decode_png(input: torch.Tensor, mode: ImageReadMode = ImageReadMode.UNCHANGED) -> torch.Tensor:
+def decode_png(
+    input: torch.Tensor, mode: ImageReadMode = ImageReadMode.UNCHANGED, apply_exif_orientation: bool = False
+) -> torch.Tensor:
     """
     Decodes a PNG image into a 3 dimensional RGB or grayscale Tensor.
     Optionally converts the image to the desired format.
@@ -80,13 +82,15 @@ def decode_png(input: torch.Tensor, mode: ImageReadMode = ImageReadMode.UNCHANGE
             converting the image. Default: ``ImageReadMode.UNCHANGED``.
             See `ImageReadMode` class for more information on various
             available modes.
+        apply_exif_orientation (bool): apply EXIF orientation transformation to the output tensor.
+            Default: False.
 
     Returns:
         output (Tensor[image_channels, image_height, image_width])
     """
     if not torch.jit.is_scripting() and not torch.jit.is_tracing():
         _log_api_usage_once(decode_png)
-    output = torch.ops.image.decode_png(input, mode.value, False)
+    output = torch.ops.image.decode_png(input, mode.value, False, apply_exif_orientation)
     return output
 
 

Original file line number	Diff line number	Diff line change
`@@ -203,7 +203,7 @@ torch::Tensor decode_jpeg(`
`203`	`203`
`204`	`204`	`int exif_orientation = -1;`
`205`	`205`	`if (apply_exif_orientation) {`
`206`		`- exif_orientation = fetch_exif_orientation(&cinfo);`
	`206`	`+ exif_orientation = fetch_jpeg_exif_orientation(&cinfo);`
`207`	`207`	`}`
`208`	`208`
`209`	`209`	`jpeg_start_decompress(&cinfo);`