From 8a8fa522b27d694e6b91a3d9b8078a6c8da986d3 Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Sun, 20 Oct 2024 09:16:29 -0700
Subject: [PATCH] perf: file read performance -- double reads, extra copies

Doing some benchmarking and adding extra LoggedTimer instrumentation
(included in another PR), I discovered that quite frequently, the
in-memory flavor of ImageBuf was reading its image multiple times --
once upon construction, and then again upon later calls to read(),
even though the latter should have taken early outs when the read was
unnecessary.

So I put some additional mechanism for the IB to internally know if it
read the pixels from disk into the buffer, and more effectively make
calls to IB.read() be a no-op. To help facilitate this, I also
clarified in the docs that read() should not be used to move an
already-read image to a different subimage or MIP level or a different
data type -- reset() really should be used for that purpose. (I don't
know that anybody ever relied on read() to do reset's job, but
assuming that they won't helps eliminate some tricky cases.)

Also, I noticed that inside ImageInput::read_image(), depending on
whether or not a data type conversion is needed, it can either read
directly into the user's buffer, or must read the native data into a
temp buffer and subsequently do a copy-and-type-convert into the
user's buffer. The latter is obviously more expensive. The direct read
approach was used when the user asked for no type conversion (just to
receive the native data type in the fiel), but it neglected to take
the same shortcut if the user asked for an explicit type conversion,
but it just so happened that the requested type was the type in the
file.

Additional changes:

The evaluation of whether the buffer was contiguous (for various
shortcuts) only applied to LOCALBUFFER style storage, not APPBUFFER.

Fixing these two problems dramatically speeds up some reads. A benchmark
I was looking at where it reads a 10k x 10k uint8 uncompressed TIFF file
into an ImageBuf while requesting uint8 data went from (timings on my
laptop):

    function                  calls    total
    IB::read                      2   0.505s  (avg   0.25s)
    II::read_image                2   0.478s  (avg   0.24s)

to:

    IB::read                      1   0.172s  (avg   0.17s)
    II::read_image                1   0.172s  (avg   0.17s)

So almost 3x faster. Most of which was due doing 2 reads instead of 1,
but still each individual read improved from 0.25s to 0.17s because of
the fix that lets us avoid the unnecessary buffer copies.

Grain of salt: this was a very large image, and it was stored
uncompressed (so NO decompression that can sometimes be a significant
amount of overhead), so you may not see this much improvement in real
world scenarios.

This is only scratching the surface, I believe there are lots of
other areas to optimize more carefully.

Signed-off-by: Larry Gritz <lg@larrygritz.com>
---
 src/include/OpenImageIO/imagebuf.h       | 71 +++++++++++++-----------
 src/libOpenImageIO/imagebuf.cpp          | 48 ++++++++++++++--
 src/libOpenImageIO/imagebufalgo.cpp      |  2 +-
 src/libOpenImageIO/imagebufalgo_test.cpp |  7 ++-
 src/libOpenImageIO/imageinput.cpp        |  6 +-
 5 files changed, 92 insertions(+), 42 deletions(-)

diff --git a/src/include/OpenImageIO/imagebuf.h b/src/include/OpenImageIO/imagebuf.h
index 4fd4a4db30..be6548c370 100644
--- a/src/include/OpenImageIO/imagebuf.h
+++ b/src/include/OpenImageIO/imagebuf.h
@@ -416,38 +416,44 @@ class OIIO_API ImageBuf {
     /// @{
     /// @name  Reading and Writing disk images
 
-    /// Read the particular subimage and MIP level of the image.  Generally,
-    /// this will skip the expensive read if the file has already been read
-    /// into the ImageBuf (at the specified subimage and MIP level).  It
-    /// will clear and re-allocate memory if the previously allocated space
-    /// was not appropriate for the size or data type of the image being
-    /// read.
+    /// Read the particular subimage and MIP level of the image, if it has not
+    /// already been read. It will clear and re-allocate memory if the
+    /// previously allocated space was not appropriate for the size or data
+    /// type of the image being read.
     ///
-    /// In general, `read()` will try not to do any I/O at the time of the
-    /// `read()` call, but rather to have the ImageBuf "backed" by an
-    /// ImageCache, which will do the file I/O on demand, as pixel values
-    /// are needed, and in that case the ImageBuf doesn't actually allocate
-    /// memory for the pixels (the data lives in the ImageCache).  However,
-    /// there are several conditions for which the ImageCache will be
-    /// bypassed, the ImageBuf will allocate "local" memory, and the disk
-    /// file will be read directly into allocated buffer at the time of the
-    /// `read()` call: (a) if the `force` parameter is `true`; (b) if the
-    /// `convert` parameter requests a data format conversion to a type that
-    /// is not the native file type and also is not one of the internal
-    /// types supported by the ImageCache (specifically, `float` and
-    /// `uint8`); (c) if the ImageBuf already has local pixel memory
-    /// allocated, or "wraps" an application buffer.
+    /// In general, calling `read()` should be unnecessary for most uses of
+    /// ImageBuf. When an ImageBuf is created (or when `reset()` is called),
+    /// usually the opening of the file and reading of the header is deferred
+    /// until the spec is accessed or needed, and the reading of the pixel
+    /// values is usually deferred until pixel values are needed, at which
+    /// point these things happen automatically. That is, every ImageBuf
+    /// method that needs pixel values will call `read()` itself if it has
+    /// not previously been called.
     ///
-    /// Note that `read()` is not strictly necessary. If you are happy with
-    /// the filename, subimage and MIP level specified by the ImageBuf
-    /// constructor (or the last call to `reset()`), and you want the
-    /// storage to be backed by the ImageCache (including storing the
-    /// pixels in whatever data format that implies), then the file contents
-    /// will be automatically read the first time you make any other
-    /// ImageBuf API call that requires the spec or pixel values.  The only
-    /// reason to call `read()` yourself is if you are changing the
-    /// filename, subimage, or MIP level, or if you want to use `force=true`
-    /// or a specific `convert` value to force data format conversion.
+    /// There are a few situations where you want to call read() explicitly,
+    /// after the ImageBuf is constructed but before any other methods have
+    /// been called that would implicitly read the file:
+    ///
+    /// 1. If you want to request that the internal buffer be a specific
+    ///    pixel data type that might differ from the pixel data type in
+    ///    the file itself (conveyed by the `convert` parameter).
+    /// 2. You want the ImageBuf to read and contain only a subset of the
+    ///    channels in the file (conveyed by the `chmin` and `chmax`
+    ///    parameters on the version of `read()` that accepts them).
+    /// 3. The ImageBuf has been set up to be backed by ImageCache, but you
+    ///    want to force it to read the whole file into memory now (conveyed
+    ///    by the `force` parameter, or if the `convert` parameter specifies
+    ///    a type that is not the native file type and also cannot be
+    ///    accommodated directly by the cache).
+    /// 4. For whatever reason, you want to force a full read of the pixels
+    ///    to occur at this point in program execution, rather than at some
+    ///    undetermined future time when you first need to access those
+    ///    pixels.
+    ///
+    /// The `read()` function should not be used to *change* an already
+    /// established subimage, MIP level, pixel data type, or channel range
+    /// of a file that has already read its pixels. You should use the
+    /// `reset()` method for that purpose.
     ///
     /// @param subimage/miplevel
     ///             The subimage and MIP level to read.
@@ -460,7 +466,7 @@ class OIIO_API ImageBuf {
     ///             `IMAGECACHE`, if the ImageBuf was originally constructed
     ///             or reset with an ImageCache specified).
     /// @param  convert
-    ///             If set to a specific type (not`UNKNOWN`), the ImageBuf
+    ///             If set to a specific type (not `UNKNOWN`), the ImageBuf
     ///             memory will be allocated for that type specifically and
     ///             converted upon read.
     /// @param  progress_callback/progress_callback_data
@@ -1219,6 +1225,7 @@ class OIIO_API ImageBuf {
     bool contains_roi(const ROI& roi) const;
 
     bool pixels_valid(void) const;
+    bool pixels_read(void) const;
 
     /// The data type of the pixels stored in the buffer (equivalent to
     /// `spec().format`).
@@ -1242,7 +1249,7 @@ class OIIO_API ImageBuf {
     /// Z plane stride within the localpixels memory.
     stride_t z_stride() const;
 
-    /// Is the data layout "contiguous", i.e.,
+    /// Is this an in-memory buffer with the data layout "contiguous", i.e.,
     /// ```
     ///     pixel_stride == nchannels * pixeltype().size()
     ///     scanline_stride == pixel_stride * spec().width
diff --git a/src/libOpenImageIO/imagebuf.cpp b/src/libOpenImageIO/imagebuf.cpp
index 798b6857c9..47ff3eba53 100644
--- a/src/libOpenImageIO/imagebuf.cpp
+++ b/src/libOpenImageIO/imagebuf.cpp
@@ -317,7 +317,9 @@ class ImageBufImpl {
 
     void eval_contiguous()
     {
-        m_contiguous = m_localpixels && m_storage == ImageBuf::LOCALBUFFER
+        m_contiguous = m_localpixels
+                       && (m_storage == ImageBuf::LOCALBUFFER
+                           || m_storage == ImageBuf::APPBUFFER)
                        && m_xstride == m_spec.nchannels * m_channel_stride
                        && m_ystride == m_xstride * m_spec.width
                        && m_zstride == m_ystride * m_spec.height;
@@ -347,6 +349,7 @@ class ImageBufImpl {
     mutable mutex_t m_mutex;      ///< Thread safety for this ImageBuf
     mutable bool m_spec_valid;    ///< Is the spec valid
     mutable bool m_pixels_valid;  ///< Image is valid
+    mutable bool m_pixels_read;   ///< Is file already in the local pixels?
     bool m_readonly;              ///< The bufspan is read-only
     bool m_badfile;               ///< File not found
     float m_pixelaspect;          ///< Pixel aspect ratio of the image
@@ -421,6 +424,7 @@ ImageBufImpl::ImageBufImpl(string_view filename, int subimage, int miplevel,
     , m_localpixels(NULL)
     , m_spec_valid(false)
     , m_pixels_valid(false)
+    , m_pixels_read(false)
     , m_readonly(readonly)
     , m_badfile(false)
     , m_pixelaspect(1)
@@ -506,6 +510,7 @@ ImageBufImpl::ImageBufImpl(const ImageBufImpl& src)
 {
     m_spec_valid   = src.m_spec_valid;
     m_pixels_valid = src.m_pixels_valid;
+    m_pixels_read  = src.m_pixels_read;
     if (src.m_localpixels) {
         // Source had the image fully in memory (no cache)
         if (m_storage == ImageBuf::APPBUFFER) {
@@ -533,6 +538,7 @@ ImageBufImpl::ImageBufImpl(const ImageBufImpl& src)
         m_nmiplevels       = 0;
         m_spec.erase_attribute("oiio:subimages");
         m_nativespec.erase_attribute("oiio:subimages");
+        m_pixels_read = true;
     }
     if (src.m_configspec)
         m_configspec.reset(new ImageSpec(*src.m_configspec));
@@ -804,6 +810,7 @@ ImageBufImpl::clear()
     m_spec_valid     = false;
     m_pixels_valid   = false;
     m_badfile        = false;
+    m_pixels_read    = false;
     m_pixelaspect    = 1;
     m_xstride        = 0;
     m_ystride        = 0;
@@ -856,11 +863,14 @@ ImageBufImpl::reset(string_view filename, int subimage, int miplevel,
         m_configspec->attribute("oiio:ioproxy", TypeDesc::PTR, &m_rioproxy);
     }
     m_bufspan = {};
+    m_storage = ImageBuf::LOCALBUFFER;
     if (m_name.length() > 0) {
-        // filename non-empty means this ImageBuf refers to a file.
-        read(subimage, miplevel);
-        // FIXME: investigate if the above read is really necessary, or if
-        // it can be eliminated and done fully lazily.
+        // For IC-backed file ImageBuf's, call read now. For other file-based
+        // images, just init the spec.
+        if (m_imagecache)
+            read(subimage, miplevel);
+        else
+            init_spec(m_name, subimage, miplevel, DoLock(true));
     }
 }
 
@@ -1004,7 +1014,8 @@ ImageBufImpl::realloc()
         m_deepdata.init(m_spec);
         m_storage = ImageBuf::LOCALBUFFER;
     }
-    m_readonly = false;
+    m_readonly    = false;
+    m_pixels_read = false;
     eval_contiguous();
 #if 0
     std::cerr << "ImageBuf " << m_name << " local allocation: " << m_allocated_size << "\n";
@@ -1138,6 +1149,7 @@ ImageBufImpl::init_spec(string_view filename, int subimage, int miplevel,
         m_badfile          = true;
         m_pixels_valid     = false;
         m_spec_valid       = false;
+        m_pixels_read      = false;
         m_nsubimages       = 0;
         m_nmiplevels       = 0;
         m_badfile          = false;
@@ -1207,13 +1219,27 @@ ImageBufImpl::read(int subimage, int miplevel, int chbegin, int chend,
     if (do_lock)
         lock.lock();
 
+    // If this doesn't reference a file in any way, nothing to do here.
     if (!m_name.length())
         return true;
 
+    // If the pixels have already been read and we aren't switching
+    // subimage/miplevel or being force to read (for example, turning a cached
+    // image into an in-memory image), then there is nothing to do.
     if (m_pixels_valid && !force && subimage == m_current_subimage
         && miplevel == m_current_miplevel)
         return true;
 
+    // If it's a local buffer from a file and we've already read the pixels
+    // into memory, we're done, provided that we aren't asking it to force
+    // a read with a different data type conversion or different number of
+    // channels.
+    if (m_storage == ImageBuf::LOCALBUFFER && m_pixels_valid && m_pixels_read
+        && (convert == TypeUnknown || convert == m_spec.format)
+        && subimage == m_current_subimage && miplevel == m_current_miplevel
+        && ((chend - chbegin) == m_spec.nchannels || (chend <= chbegin)))
+        return true;
+
     if (!init_spec(m_name.string(), subimage, miplevel,
                    DoLock(false) /* we already hold the lock */)) {
         m_badfile    = true;
@@ -1241,6 +1267,7 @@ ImageBufImpl::read(int subimage, int miplevel, int chbegin, int chend,
         if (ok) {
             m_spec = m_nativespec;  // Deep images always use native data
             m_pixels_valid = true;
+            m_pixels_read  = true;
             m_storage      = ImageBuf::LOCALBUFFER;
         } else {
             error(input->geterror());
@@ -1348,6 +1375,7 @@ ImageBufImpl::read(int subimage, int miplevel, int chbegin, int chend,
             in->close();
             if (ok) {
                 m_pixels_valid = true;
+                m_pixels_read  = true;
             } else {
                 m_pixels_valid = false;
                 error(in->geterror());
@@ -1943,6 +1971,14 @@ ImageBuf::pixels_valid(void) const
 
 
 
+bool
+ImageBuf::pixels_read(void) const
+{
+    return m_impl->m_pixels_read;
+}
+
+
+
 TypeDesc
 ImageBuf::pixeltype() const
 {
diff --git a/src/libOpenImageIO/imagebufalgo.cpp b/src/libOpenImageIO/imagebufalgo.cpp
index 9ca9f406ee..c3f3c50b92 100644
--- a/src/libOpenImageIO/imagebufalgo.cpp
+++ b/src/libOpenImageIO/imagebufalgo.cpp
@@ -354,7 +354,7 @@ ImageBufAlgo::IBAprep(ROI& roi, ImageBuf& dst, cspan<const ImageBuf*> srcs,
                       KWArgs options, ImageSpec* force_spec)
 {
     // OIIO_ASSERT(dst);
-    // Helper: ANY_SRC returns true if any of the source images s satisfy the
+    // Helper: ANY_SRC returns true if any of the source images satisfy the
     // condition cond.
 #define ANY_SRC(cond)                     \
     std::any_of(srcs.begin(), srcs.end(), \
diff --git a/src/libOpenImageIO/imagebufalgo_test.cpp b/src/libOpenImageIO/imagebufalgo_test.cpp
index 25dd865c96..9d02969da5 100644
--- a/src/libOpenImageIO/imagebufalgo_test.cpp
+++ b/src/libOpenImageIO/imagebufalgo_test.cpp
@@ -16,6 +16,7 @@
 #include <OpenImageIO/argparse.h>
 #include <OpenImageIO/benchmark.h>
 #include <OpenImageIO/color.h>
+#include <OpenImageIO/filesystem.h>
 #include <OpenImageIO/half.h>
 #include <OpenImageIO/imagebuf.h>
 #include <OpenImageIO/imagebufalgo.h>
@@ -1109,8 +1110,10 @@ test_opencv()
     OIIO_CHECK_EQUAL(comp.maxerror, 0.0f);
 
     // Regression test: reading from ImageBuf-backed image to OpenCV
-    auto loaded_image = OIIO::ImageBuf("../../testsuite/common/tahoe-tiny.tif",
-                                       0, 0, ImageCache::create());
+    std::string filename = "testsuite/common/tahoe-tiny.tif";
+    if (!Filesystem::exists(filename))
+        filename = "../../testsuite/common/tahoe-tiny.tif";
+    auto loaded_image = OIIO::ImageBuf(filename, 0, 0, ImageCache::create());
     OIIO_CHECK_ASSERT(loaded_image.initialized());
     if (!loaded_image.initialized()) {
         std::cout << loaded_image.geterror() << 'n';
diff --git a/src/libOpenImageIO/imageinput.cpp b/src/libOpenImageIO/imageinput.cpp
index bccf7eb7b0..8033bd51bd 100644
--- a/src/libOpenImageIO/imageinput.cpp
+++ b/src/libOpenImageIO/imageinput.cpp
@@ -302,6 +302,7 @@ ImageInput::read_scanlines(int subimage, int miplevel, int ybegin, int yend,
         // For scanline files, we also need one piece of metadata
         if (!spec.tile_width)
             rps = m_spec.get_int_attribute("tiff:RowsPerStrip", 64);
+        // FIXME: does the above search of metadata have a significant cost?
     }
     if (spec.image_bytes() < 1) {
         errorfmt("Invalid image size {} x {} ({} chans)", m_spec.width,
@@ -330,7 +331,10 @@ ImageInput::read_scanlines(int subimage, int miplevel, int ybegin, int yend,
     bool contiguous                = (xstride == (stride_t)buffer_pixel_bytes
                        && ystride == (stride_t)buffer_scanline_bytes);
 
-    if (native && contiguous) {
+    // no_type_convert is true if asking for data in the native format
+    bool no_type_convert = (format == spec.format
+                            && spec.channelformats.empty());
+    if ((native || no_type_convert) && contiguous) {
         if (chbegin == 0 && chend == spec.nchannels)
             return read_native_scanlines(subimage, miplevel, ybegin, yend, z,
                                          data);