From 3cfddb7acc7472d2eba6455500e3391280496417 Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Wed, 4 Feb 2026 18:18:45 +0100
Subject: [PATCH 1/9] OpenCL helper dt_opencl_local_buffer_opt() returns error
 code

For readability and maintainability, dt_opencl_local_buffer_opt() now returns a valid OpenCL error code instead of a gboolean.
Some error codes returned by related helpers and callers have been corrected as well.
---
 src/common/bilateralcl.c      |  2 +-
 src/common/gaussian.c         |  2 +-
 src/common/interpolation.c    |  2 +-
 src/common/nlmeans_core.c     |  4 +--
 src/common/opencl.c           | 31 +++++++++----------
 src/common/opencl.h           |  9 +++---
 src/iop/bloom.c               |  4 +--
 src/iop/colorreconstruction.c |  4 +--
 src/iop/demosaicing/basics.c  | 51 ++++++++++---------------------
 src/iop/demosaicing/rcd.c     | 16 ++++------
 src/iop/demosaicing/vng.c     | 16 ++++------
 src/iop/demosaicing/xtrans.c  | 56 ++++++++++-------------------------
 src/iop/denoiseprofile.c      | 18 ++++-------
 src/iop/globaltonemap.c       | 14 +++------
 src/iop/highlights.c          |  2 +-
 src/iop/highpass.c            |  4 +--
 src/iop/nlmeans.c             |  4 +--
 src/iop/sharpen.c             |  4 +--
 src/iop/soften.c              |  4 +--
 19 files changed, 90 insertions(+), 157 deletions(-)

diff --git a/src/common/bilateralcl.c b/src/common/bilateralcl.c
index e93261a85ede..19b59a83a568 100644
--- a/src/common/bilateralcl.c
+++ b/src/common/bilateralcl.c
@@ -84,7 +84,7 @@ dt_bilateral_cl_t *dt_bilateral_init_cl(const int devid,
                                   .cellsize = 8 * sizeof(float) + sizeof(int), .overhead = 0,
                                   .sizex = 1 << 6, .sizey = 1 << 6 };
 
-  if(!dt_opencl_local_buffer_opt(devid, darktable.opencl->bilateral->kernel_splat, &locopt))
+  if(dt_opencl_local_buffer_opt(devid, darktable.opencl->bilateral->kernel_splat, &locopt) != CL_SUCCESS)
   {
     dt_print(DT_DEBUG_OPENCL,
              "[opencl_bilateral] can not identify resource limits for device %d in bilateral grid", devid);
diff --git a/src/common/gaussian.c b/src/common/gaussian.c
index 5b4dc202f5e3..12565630d9db 100644
--- a/src/common/gaussian.c
+++ b/src/common/gaussian.c
@@ -849,7 +849,7 @@ dt_gaussian_cl_t *dt_gaussian_init_cl(const int devid,
                                   .cellsize = channels * sizeof(float), .overhead = 0,
                                   .sizex = BLOCKSIZE, .sizey = BLOCKSIZE };
 
-  if(dt_opencl_local_buffer_opt(devid, kernel_gaussian_transpose, &locopt))
+  if(dt_opencl_local_buffer_opt(devid, kernel_gaussian_transpose, &locopt) == CL_SUCCESS)
     blocksize = MIN(locopt.sizex, locopt.sizey);
   else
     blocksize = 1;
diff --git a/src/common/interpolation.c b/src/common/interpolation.c
index 31afdd30bb70..59deae83a3c9 100644
--- a/src/common/interpolation.c
+++ b/src/common/interpolation.c
@@ -1335,7 +1335,7 @@ int dt_interpolation_resample_cl(const dt_interpolation_t *itor,
           .sizex = 1,
           .sizey = (1 << 16) * taps };
 
-  if(dt_opencl_local_buffer_opt(devid, kernel, &locopt))
+  if(dt_opencl_local_buffer_opt(devid, kernel, &locopt) == CL_SUCCESS)
     vblocksize = locopt.sizey;
   else
     vblocksize = 1;
diff --git a/src/common/nlmeans_core.c b/src/common/nlmeans_core.c
index b54387555d05..089f2c5a8d68 100644
--- a/src/common/nlmeans_core.c
+++ b/src/common/nlmeans_core.c
@@ -572,14 +572,14 @@ static void get_blocksizes(
                                   .cellsize = sizeof(float), .overhead = 0,
                                   .sizex = 1 << 16, .sizey = 1 };
 
-  *h = dt_opencl_local_buffer_opt(devid, horiz_kernel, &hlocopt) ? hlocopt.sizex : 1;
+  *h = dt_opencl_local_buffer_opt(devid, horiz_kernel, &hlocopt) == CL_SUCCESS ? hlocopt.sizex : 1;
 
   dt_opencl_local_buffer_t vlocopt
     = (dt_opencl_local_buffer_t){ .xoffset = 1, .xfactor = 1, .yoffset = 2 * radius, .yfactor = 1,
                                   .cellsize = sizeof(float), .overhead = 0,
                                   .sizex = 1, .sizey = 1 << 16 };
 
-  *v = dt_opencl_local_buffer_opt(devid, vert_kernel, &vlocopt) ? vlocopt.sizey : 1;
+  *v = dt_opencl_local_buffer_opt(devid, vert_kernel, &vlocopt) == CL_SUCCESS ? vlocopt.sizey : 1;
   return;
 }
 
diff --git a/src/common/opencl.c b/src/common/opencl.c
index afc2204d6f49..362026b4d285 100644
--- a/src/common/opencl.c
+++ b/src/common/opencl.c
@@ -2657,7 +2657,7 @@ int dt_opencl_get_max_work_item_sizes(const int dev,
                                       size_t *sizes)
 {
   dt_opencl_t *cl = darktable.opencl;
-  if(!cl->inited || dev < 0) return -1;
+  if(!cl->inited || dev < 0) return CL_DEVICE_NOT_AVAILABLE;
   return (cl->dlocl->symbols->dt_clGetDeviceInfo)(cl->dev[dev].devid,
                                                   CL_DEVICE_MAX_WORK_ITEM_SIZES,
                                                   sizeof(size_t) * 3, sizes, NULL);
@@ -2670,7 +2670,7 @@ int dt_opencl_get_work_group_limits(const int dev,
                                     unsigned long *localmemsize)
 {
   dt_opencl_t *cl = darktable.opencl;
-  if(!cl->inited || dev < 0) return -1;
+  if(!cl->inited || dev < 0) return CL_DEVICE_NOT_AVAILABLE;
   cl_ulong lmemsize;
   cl_int err = (cl->dlocl->symbols->dt_clGetDeviceInfo)(cl->dev[dev].devid,
                                                         CL_DEVICE_LOCAL_MEM_SIZE,
@@ -2693,7 +2693,7 @@ int dt_opencl_get_kernel_work_group_size(const int dev,
                                          const int kernel,
                                          size_t *kernelworkgroupsize)
 {
-  if(!_check_kernel(dev, kernel)) return -1;
+  if(!_check_kernel(dev, kernel)) return CL_DEVICE_NOT_AVAILABLE;
 
   dt_opencl_t *cl = darktable.opencl;
   return (cl->dlocl->symbols->dt_clGetKernelWorkGroupInfo)(cl->dev[dev].kernel[kernel],
@@ -4076,13 +4076,13 @@ static int _nextpow2(const int n)
 
 // utility function to calculate optimal work group dimensions for a given kernel
 // taking device specific restrictions and local memory limitations into account
-// returns TRUE in case of success
-gboolean dt_opencl_local_buffer_opt(const int devid,
-                                    const int kernel,
-                                    dt_opencl_local_buffer_t *factors)
+// returns CL_SUCCESS or an error code
+cl_int dt_opencl_local_buffer_opt(const int devid,
+                                  const int kernel,
+                                  dt_opencl_local_buffer_t *factors)
 {
   dt_opencl_t *cl = darktable.opencl;
-  if(!cl->inited || devid < 0) return FALSE;
+  if(!cl->inited || devid < 0) return DT_OPENCL_NODEVICE;
 
   size_t maxsizes[3] = { 0 };     // the maximum dimensions for a work group
   size_t workgroupsize = 0;       // the maximum number of items in a work group
@@ -4098,10 +4098,8 @@ gboolean dt_opencl_local_buffer_opt(const int devid,
   *blocksizex = CLAMP(_nextpow2(*blocksizex), 1, 1 << 16);
   *blocksizey = CLAMP(_nextpow2(*blocksizey), 1, 1 << 16);
 
-  if(dt_opencl_get_work_group_limits
-      (devid, maxsizes, &workgroupsize, &localmemsize) == CL_SUCCESS
-     && dt_opencl_get_kernel_work_group_size
-          (devid, kernel, &kernelworkgroupsize) == CL_SUCCESS)
+  if(dt_opencl_get_work_group_limits(devid, maxsizes, &workgroupsize, &localmemsize) == CL_SUCCESS
+     && dt_opencl_get_kernel_work_group_size(devid, kernel, &kernelworkgroupsize) == CL_SUCCESS)
   {
     while(maxsizes[0] < *blocksizex
           || maxsizes[1] < *blocksizey
@@ -4114,9 +4112,8 @@ gboolean dt_opencl_local_buffer_opt(const int devid,
       if(*blocksizex == 1 && *blocksizey == 1)
       {
         dt_print(DT_DEBUG_OPENCL,
-             "[dt_opencl_local_buffer_opt] no valid resource limits for device %d",
-                 devid);
-        return FALSE;
+             "[dt_opencl_local_buffer_opt] no valid resource limits for device %d", devid);
+        return CL_INVALID_WORK_GROUP_SIZE;
       }
 
       if(*blocksizex > *blocksizey)
@@ -4130,10 +4127,10 @@ gboolean dt_opencl_local_buffer_opt(const int devid,
     dt_print(DT_DEBUG_OPENCL,
              "[dt_opencl_local_buffer_opt] can not identify"
              " resource limits for device %d", devid);
-    return FALSE;
+    return CL_INVALID_WORK_GROUP_SIZE;
   }
 
-  return TRUE;
+  return CL_SUCCESS;
 }
 
 #endif
diff --git a/src/common/opencl.h b/src/common/opencl.h
index 901b2fe0f220..45f30f828ea6 100644
--- a/src/common/opencl.h
+++ b/src/common/opencl.h
@@ -595,10 +595,11 @@ cl_int dt_opencl_events_flush(const int devid,
                               const gboolean reset);
 
 /** utility function to calculate optimal work group dimensions for a
- * given kernel */
-gboolean dt_opencl_local_buffer_opt(const int devid,
-                               const int kernel,
-                               dt_opencl_local_buffer_t *factors);
+    given kernel, returns an error code
+*/
+cl_int dt_opencl_local_buffer_opt(const int devid,
+                                  const int kernel,
+                                  dt_opencl_local_buffer_t *factors);
 
 /** utility functions handling device specific properties */
 void dt_opencl_write_device_config(const int devid);
diff --git a/src/iop/bloom.c b/src/iop/bloom.c
index 81fd69808e40..d2116c8b611c 100644
--- a/src/iop/bloom.c
+++ b/src/iop/bloom.c
@@ -215,7 +215,7 @@ int process_cl(dt_iop_module_t *self,
                                   .sizex = 1 << 16,
                                   .sizey = 1 };
 
-  if(dt_opencl_local_buffer_opt(devid, gd->kernel_bloom_hblur, &hlocopt))
+  if(dt_opencl_local_buffer_opt(devid, gd->kernel_bloom_hblur, &hlocopt) == CL_SUCCESS)
     hblocksize = hlocopt.sizex;
   else
     hblocksize = 1;
@@ -231,7 +231,7 @@ int process_cl(dt_iop_module_t *self,
                                   .sizex = 1,
                                   .sizey = 1 << 16 };
 
-  if(dt_opencl_local_buffer_opt(devid, gd->kernel_bloom_vblur, &vlocopt))
+  if(dt_opencl_local_buffer_opt(devid, gd->kernel_bloom_vblur, &vlocopt) == CL_SUCCESS)
     vblocksize = vlocopt.sizey;
   else
     vblocksize = 1;
diff --git a/src/iop/colorreconstruction.c b/src/iop/colorreconstruction.c
index 3217eaad49f6..2a0c64347c70 100644
--- a/src/iop/colorreconstruction.c
+++ b/src/iop/colorreconstruction.c
@@ -712,7 +712,7 @@ static dt_iop_colorreconstruct_bilateral_cl_t *dt_iop_colorreconstruct_bilateral
                                   .cellsize = 4 * sizeof(float) + sizeof(int), .overhead = 0,
                                   .sizex = 1 << 6, .sizey = 1 << 6 };
 
-  if(dt_opencl_local_buffer_opt(devid, global->kernel_colorreconstruct_splat, &locopt))
+  if(dt_opencl_local_buffer_opt(devid, global->kernel_colorreconstruct_splat, &locopt) == CL_SUCCESS)
   {
     blocksizex = locopt.sizex;
     blocksizey = locopt.sizey;
@@ -852,7 +852,7 @@ static dt_iop_colorreconstruct_bilateral_cl_t *dt_iop_colorreconstruct_bilateral
                                   .cellsize = 4 * sizeof(float) + sizeof(int), .overhead = 0,
                                   .sizex = 1 << 6, .sizey = 1 << 6 };
 
-  if(dt_opencl_local_buffer_opt(devid, global->kernel_colorreconstruct_splat, &locopt))
+  if(dt_opencl_local_buffer_opt(devid, global->kernel_colorreconstruct_splat, &locopt) == CL_SUCCESS)
   {
     blocksizex = locopt.sizex;
     blocksizey = locopt.sizey;
diff --git a/src/iop/demosaicing/basics.c b/src/iop/demosaicing/basics.c
index 948fe1e43a53..dffe0ac9117c 100644
--- a/src/iop/demosaicing/basics.c
+++ b/src/iop/demosaicing/basics.c
@@ -249,7 +249,7 @@ static int color_smoothing_cl(const dt_iop_module_t *self,
 
   const int devid = piece->pipe->devid;
 
-  cl_int err = DT_OPENCL_DEFAULT_ERROR;
+  cl_int err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
 
   cl_mem dev_tmp = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4);
   if(dev_tmp == NULL) goto error;
@@ -259,11 +259,8 @@ static int color_smoothing_cl(const dt_iop_module_t *self,
                                   .cellsize = 4 * sizeof(float), .overhead = 0,
                                   .sizex = 1 << 8, .sizey = 1 << 8 };
 
-  if(!dt_opencl_local_buffer_opt(devid, gd->kernel_color_smoothing, &locopt))
-  {
-    err = CL_INVALID_WORK_DIMENSION;
-    goto error;
-  }
+  err = dt_opencl_local_buffer_opt(devid, gd->kernel_color_smoothing, &locopt);
+  if(err != CL_SUCCESS) goto error;
 
   // two buffer references for our ping-pong
   cl_mem dev_t1 = dev_out;
@@ -360,11 +357,8 @@ static int green_equilibration_cl(const dt_iop_module_t *self,
                                     .cellsize = 2 * sizeof(float), .overhead = 0,
                                     .sizex = 1 << 4, .sizey = 1 << 4 };
 
-    if(!dt_opencl_local_buffer_opt(devid, gd->kernel_green_eq_favg_reduce_first, &flocopt))
-    {
-      err = CL_INVALID_WORK_DIMENSION;
-      goto error;
-    }
+    err = dt_opencl_local_buffer_opt(devid, gd->kernel_green_eq_favg_reduce_first, &flocopt);
+    if(err != CL_SUCCESS) goto error;
 
     const size_t bwidth = ROUNDUP(width, flocopt.sizex);
     const size_t bheight = ROUNDUP(height, flocopt.sizey);
@@ -392,11 +386,8 @@ static int green_equilibration_cl(const dt_iop_module_t *self,
                                     .cellsize = sizeof(float) * 2, .overhead = 0,
                                     .sizex = 1 << 16, .sizey = 1 };
 
-    if(!dt_opencl_local_buffer_opt(devid, gd->kernel_green_eq_favg_reduce_second, &slocopt))
-    {
-      err = CL_INVALID_WORK_DIMENSION;
-      goto error;
-    }
+    err = dt_opencl_local_buffer_opt(devid, gd->kernel_green_eq_favg_reduce_second, &slocopt);
+    if(err != CL_SUCCESS) goto error;
 
     const int reducesize = MIN(DT_REDUCESIZE_MIN, ROUNDUP(bufsize, slocopt.sizex) / slocopt.sizex);
 
@@ -451,11 +442,8 @@ static int green_equilibration_cl(const dt_iop_module_t *self,
                                     .cellsize = 1 * sizeof(float), .overhead = 0,
                                     .sizex = 1 << 8, .sizey = 1 << 8 };
 
-    if(!dt_opencl_local_buffer_opt(devid, gd->kernel_green_eq_lavg, &locopt))
-    {
-      err = CL_INVALID_WORK_DIMENSION;
-      goto error;
-    }
+    err = dt_opencl_local_buffer_opt(devid, gd->kernel_green_eq_lavg, &locopt);
+    if(err != CL_SUCCESS) goto error;
 
     const size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
     const size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
@@ -538,11 +526,8 @@ static int process_default_cl(const dt_iop_module_t *self,
                                         .cellsize = 1 * sizeof(float), .overhead = 0,
                                         .sizex = 1 << 8, .sizey = 1 << 8 };
 
-        if(!dt_opencl_local_buffer_opt(devid, gd->kernel_pre_median, &locopt))
-        {
-          err = CL_INVALID_WORK_DIMENSION;
-          goto error;
-        }
+        err = dt_opencl_local_buffer_opt(devid, gd->kernel_pre_median, &locopt);
+        if(err != CL_SUCCESS) goto error;
 
         const size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
         const size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
@@ -561,11 +546,8 @@ static int process_default_cl(const dt_iop_module_t *self,
                                         .cellsize = sizeof(float) * 1, .overhead = 0,
                                         .sizex = 1 << 8, .sizey = 1 << 8 };
 
-        if(!dt_opencl_local_buffer_opt(devid, gd->kernel_ppg_green, &locopt))
-        {
-          err = CL_INVALID_WORK_DIMENSION;
-          goto error;
-        }
+        err = dt_opencl_local_buffer_opt(devid, gd->kernel_ppg_green, &locopt);
+        if(err != CL_SUCCESS) goto error;
 
         const size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
         const size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
@@ -583,11 +565,8 @@ static int process_default_cl(const dt_iop_module_t *self,
                                         .cellsize = 4 * sizeof(float), .overhead = 0,
                                         .sizex = 1 << 8, .sizey = 1 << 8 };
 
-        if(!dt_opencl_local_buffer_opt(devid, gd->kernel_ppg_redblue, &locopt))
-        {
-          err = CL_INVALID_WORK_DIMENSION;
-          goto error;
-        }
+        err = dt_opencl_local_buffer_opt(devid, gd->kernel_ppg_redblue, &locopt);
+        if(err != CL_SUCCESS) goto error;
 
         const size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
         const size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
diff --git a/src/iop/demosaicing/rcd.c b/src/iop/demosaicing/rcd.c
index eb548a794861..eaf026681f1e 100644
--- a/src/iop/demosaicing/rcd.c
+++ b/src/iop/demosaicing/rcd.c
@@ -626,11 +626,9 @@ static cl_int process_rcd_cl(dt_iop_module_t *self,
                                       .cellsize = sizeof(float) * 1, .overhead = 0,
                                       .sizex = 64, .sizey = 64 };
 
-    if(!dt_opencl_local_buffer_opt(devid, gd->kernel_rcd_border_green, &locopt))
-    {
-      err = CL_INVALID_WORK_DIMENSION;
-      goto error;
-    }
+    err = dt_opencl_local_buffer_opt(devid, gd->kernel_rcd_border_green, &locopt);
+    if(err != CL_SUCCESS) goto error;
+
     myborder = 32;
     size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
     size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
@@ -647,11 +645,9 @@ static cl_int process_rcd_cl(dt_iop_module_t *self,
                                       .cellsize = 4 * sizeof(float), .overhead = 0,
                                       .sizex = 64, .sizey = 64 };
 
-    if(!dt_opencl_local_buffer_opt(devid, gd->kernel_rcd_border_redblue, &locopt))
-    {
-      err = CL_INVALID_WORK_DIMENSION;
-      goto error;
-    }
+    err = dt_opencl_local_buffer_opt(devid, gd->kernel_rcd_border_redblue, &locopt);
+    if(err != CL_SUCCESS) goto error;
+
     myborder = 16;
     size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
     size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
diff --git a/src/iop/demosaicing/vng.c b/src/iop/demosaicing/vng.c
index be1086ee1542..4f09623f27ba 100644
--- a/src/iop/demosaicing/vng.c
+++ b/src/iop/demosaicing/vng.c
@@ -482,11 +482,9 @@ static cl_int process_vng_cl(const dt_iop_module_t *self,
                                       .cellsize = 1 * sizeof(float), .overhead = 0,
                                       .sizex = 1 << 8, .sizey = 1 << 8 };
 
-    if(!dt_opencl_local_buffer_opt(devid, gd->kernel_vng_lin_interpolate, &locopt))
-    {
-      err = CL_INVALID_WORK_DIMENSION;
-      goto finish;
-    }
+    err = dt_opencl_local_buffer_opt(devid, gd->kernel_vng_lin_interpolate, &locopt);
+    if(err != CL_SUCCESS) goto finish;
+
     size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
     size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
     dt_opencl_set_kernel_args(devid, gd->kernel_vng_lin_interpolate, 0,
@@ -508,11 +506,9 @@ static cl_int process_vng_cl(const dt_iop_module_t *self,
                                       .cellsize = 4 * sizeof(float), .overhead = 0,
                                       .sizex = 1 << 8, .sizey = 1 << 8 };
 
-  if(!dt_opencl_local_buffer_opt(devid, gd->kernel_vng_interpolate, &locopt))
-  {
-    err = CL_INVALID_WORK_DIMENSION;
-    goto finish;
-  }
+  err = dt_opencl_local_buffer_opt(devid, gd->kernel_vng_interpolate, &locopt);
+  if(err != CL_SUCCESS) goto finish;
+
   size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
   size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
   dt_opencl_set_kernel_args(devid, gd->kernel_vng_interpolate, 0,
diff --git a/src/iop/demosaicing/xtrans.c b/src/iop/demosaicing/xtrans.c
index a3be483326f1..5b84ec098a2e 100644
--- a/src/iop/demosaicing/xtrans.c
+++ b/src/iop/demosaicing/xtrans.c
@@ -1733,11 +1733,8 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self,
                                     .cellsize = 1 * sizeof(float), .overhead = 0,
                                     .sizex = 1 << 8, .sizey = 1 << 8 };
 
-    if(!dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_green_minmax, &locopt_g1_g3))
-    {
-      err = CL_INVALID_WORK_DIMENSION;
-      goto error;
-    }
+    err = dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_green_minmax, &locopt_g1_g3);
+    if(err != CL_SUCCESS) goto error;
 
     {
       const size_t sizes[3] = { ROUNDUP(width, locopt_g1_g3.sizex), ROUNDUP(height, locopt_g1_g3.sizey), 1 };
@@ -1757,11 +1754,8 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self,
                                     .cellsize = 4 * sizeof(float), .overhead = 0,
                                     .sizex = 1 << 8, .sizey = 1 << 8 };
 
-    if(!dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_interpolate_green, &locopt_g_interp))
-    {
-      err = CL_INVALID_WORK_DIMENSION;
-      goto error;
-    }
+    err = dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_interpolate_green, &locopt_g_interp);
+    if(err != CL_SUCCESS) goto error;
 
     {
       const size_t sizes[3] = { ROUNDUP(width, locopt_g_interp.sizex), ROUNDUP(height, locopt_g_interp.sizey), 1 };
@@ -1811,11 +1805,8 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self,
                                       .cellsize = 4 * sizeof(float), .overhead = 0,
                                       .sizex = 1 << 8, .sizey = 1 << 8 };
 
-      if(!dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_solitary_green, &locopt_rb_g))
-      {
-        err = CL_INVALID_WORK_DIMENSION;
-        goto error;
-      }
+      err = dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_solitary_green, &locopt_rb_g);
+      if(err != CL_SUCCESS) goto error;
 
       cl_mem *dev_trgb = dev_rgb;
       for(int d = 0, i = 1, h = 0; d < 6; d++, i ^= 1, h ^= 2)
@@ -1841,11 +1832,8 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self,
                                       .cellsize = 4 * sizeof(float), .overhead = 0,
                                       .sizex = 1 << 8, .sizey = 1 << 8 };
 
-      if(!dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_red_and_blue, &locopt_rb_br))
-      {
-        err = CL_INVALID_WORK_DIMENSION;
-        goto error;
-      }
+      err = dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_red_and_blue, &locopt_rb_br);
+      if(err != CL_SUCCESS) goto error;
 
       for(int d = 0; d < 4; d++)
       {
@@ -1865,11 +1853,8 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self,
                                       .cellsize = 4 * sizeof(float), .overhead = 0,
                                       .sizex = 1 << 8, .sizey = 1 << 8 };
 
-      if(!dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_interpolate_twoxtwo, &locopt_g22))
-      {
-        err = CL_INVALID_WORK_DIMENSION;
-        goto error;
-      }
+      err = dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_interpolate_twoxtwo, &locopt_g22);
+      if(err != CL_SUCCESS) goto error;
 
       for(int d = 0, n = 0; d < ndir; d += 2, n++)
       {
@@ -1906,11 +1891,8 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self,
                                     .cellsize = 4 * sizeof(float), .overhead = 0,
                                     .sizex = 1 << 8, .sizey = 1 << 8 };
 
-    if(!dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_differentiate, &locopt_diff))
-    {
-      err = CL_INVALID_WORK_DIMENSION;
-      goto error;
-    }
+    err = dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_differentiate, &locopt_diff);
+    if(err != CL_SUCCESS) goto error;
 
     for(int d = 0; d < ndir; d++)
     {
@@ -1957,11 +1939,8 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self,
                                     .cellsize = 1 * sizeof(float), .overhead = 0,
                                     .sizex = 1 << 8, .sizey = 1 << 8 };
 
-    if(!dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_homo_set, &locopt_homo))
-    {
-      err = CL_INVALID_WORK_DIMENSION;
-      goto error;
-    }
+    err = dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_homo_set, &locopt_homo);
+    if(err != CL_SUCCESS) goto error;
 
     for(int d = 0; d < ndir; d++)
     {
@@ -1987,11 +1966,8 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self,
                                     .cellsize = 1 * sizeof(float), .overhead = 0,
                                     .sizex = 1 << 8, .sizey = 1 << 8 };
 
-    if(!dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_homo_sum, &locopt_homo_sum))
-    {
-      err = CL_INVALID_WORK_DIMENSION;
-      goto error;
-    }
+    err = dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_homo_sum, &locopt_homo_sum);
+    if(err != CL_SUCCESS) goto error;
 
     for(int d = 0; d < ndir; d++)
     {
diff --git a/src/iop/denoiseprofile.c b/src/iop/denoiseprofile.c
index 1f57e36269e7..73a5932af7b0 100644
--- a/src/iop/denoiseprofile.c
+++ b/src/iop/denoiseprofile.c
@@ -2126,7 +2126,7 @@ static int process_nlmeans_cl(dt_iop_module_t *self,
                                   .sizex = 1u << 16,
                                   .sizey = 1 };
 
-  if(dt_opencl_local_buffer_opt(devid, gd->kernel_denoiseprofile_horiz, &hlocopt))
+  if(dt_opencl_local_buffer_opt(devid, gd->kernel_denoiseprofile_horiz, &hlocopt) == CL_SUCCESS)
     hblocksize = hlocopt.sizex;
   else
     hblocksize = 1;
@@ -2142,7 +2142,7 @@ static int process_nlmeans_cl(dt_iop_module_t *self,
                                   .sizex = 1,
                                   .sizey = 1u << 16 };
 
-  if(dt_opencl_local_buffer_opt(devid, gd->kernel_denoiseprofile_vert, &vlocopt))
+  if(dt_opencl_local_buffer_opt(devid, gd->kernel_denoiseprofile_vert, &vlocopt) == CL_SUCCESS)
     vblocksize = vlocopt.sizey;
   else
     vblocksize = 1;
@@ -2355,11 +2355,8 @@ static int process_wavelets_cl(dt_iop_module_t *self,
                                   .sizex = 1u << 4,
                                   .sizey = 1u << 4 };
 
-  if(!dt_opencl_local_buffer_opt(devid, gd->kernel_denoiseprofile_reduce_first, &flocopt))
-  {
-    err = CL_INVALID_WORK_DIMENSION;
-    goto error;
-  }
+  err = dt_opencl_local_buffer_opt(devid, gd->kernel_denoiseprofile_reduce_first, &flocopt);
+  if(err != CL_SUCCESS) goto error;
 
   const size_t bwidth = ROUNDUP(width, flocopt.sizex);
   const size_t bheight = ROUNDUP(height, flocopt.sizey);
@@ -2376,11 +2373,8 @@ static int process_wavelets_cl(dt_iop_module_t *self,
                                   .sizex = 1u << 16,
                                   .sizey = 1 };
 
-  if(!dt_opencl_local_buffer_opt(devid, gd->kernel_denoiseprofile_reduce_first, &slocopt))
-  {
-    err = CL_INVALID_WORK_DIMENSION;
-    goto error;
-  }
+  err = dt_opencl_local_buffer_opt(devid, gd->kernel_denoiseprofile_reduce_first, &slocopt);
+  if(err != CL_SUCCESS) goto error;
 
   const int reducesize = MIN(REDUCESIZE, ROUNDUP(bufsize, slocopt.sizex) / slocopt.sizex);
   err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
diff --git a/src/iop/globaltonemap.c b/src/iop/globaltonemap.c
index 4364ba77b0f1..5de25273f116 100644
--- a/src/iop/globaltonemap.c
+++ b/src/iop/globaltonemap.c
@@ -383,11 +383,8 @@ int process_cl(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_
                                       .cellsize = sizeof(float), .overhead = 0,
                                       .sizex = 1 << 4, .sizey = 1 << 4 };
 
-      if(!dt_opencl_local_buffer_opt(devid, gd->kernel_pixelmax_first, &flocopt))
-      {
-        err = CL_INVALID_WORK_DIMENSION;
-        goto finally;
-      }
+      err = dt_opencl_local_buffer_opt(devid, gd->kernel_pixelmax_first, &flocopt);
+      if(err != CL_SUCCESS) goto finally;
 
       const size_t bwidth = ROUNDUP(width, flocopt.sizex);
       const size_t bheight = ROUNDUP(height, flocopt.sizey);
@@ -399,11 +396,8 @@ int process_cl(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_
                                       .cellsize = sizeof(float), .overhead = 0,
                                       .sizex = 1 << 16, .sizey = 1 };
 
-      if(!dt_opencl_local_buffer_opt(devid, gd->kernel_pixelmax_second, &slocopt))
-      {
-        err = CL_INVALID_WORK_DIMENSION;
-        goto finally;
-      }
+      err = dt_opencl_local_buffer_opt(devid, gd->kernel_pixelmax_second, &slocopt);
+      if(err != CL_SUCCESS) goto finally;
 
       const int reducesize = MIN(REDUCESIZE, ROUNDUP(bufsize, slocopt.sizex) / slocopt.sizex);
 
diff --git a/src/iop/highlights.c b/src/iop/highlights.c
index 15b6ff094493..ec1b5372a09c 100644
--- a/src/iop/highlights.c
+++ b/src/iop/highlights.c
@@ -627,7 +627,7 @@ int process_cl(dt_iop_module_t *self,
                                     .cellsize = sizeof(float), .overhead = 0,
                                     .sizex = 1 << 8, .sizey = 1 << 8 };
 
-    if(dt_opencl_local_buffer_opt(devid, gd->kernel_highlights_1f_lch_xtrans, &locopt))
+    if(dt_opencl_local_buffer_opt(devid, gd->kernel_highlights_1f_lch_xtrans, &locopt) == CL_SUCCESS)
     {
       blocksizex = locopt.sizex;
       blocksizey = locopt.sizey;
diff --git a/src/iop/highpass.c b/src/iop/highpass.c
index bb2fbf58630f..c463197d4eb6 100644
--- a/src/iop/highpass.c
+++ b/src/iop/highpass.c
@@ -162,7 +162,7 @@ int process_cl(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_
                                   .cellsize = sizeof(float), .overhead = 0,
                                   .sizex = 1 << 16, .sizey = 1 };
 
-  if(dt_opencl_local_buffer_opt(devid, gd->kernel_highpass_hblur, &hlocopt))
+  if(dt_opencl_local_buffer_opt(devid, gd->kernel_highpass_hblur, &hlocopt) == CL_SUCCESS)
     hblocksize = hlocopt.sizex;
   else
     hblocksize = 1;
@@ -173,7 +173,7 @@ int process_cl(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_
                                   .cellsize = sizeof(float), .overhead = 0,
                                   .sizex = 1, .sizey = 1 << 16 };
 
-  if(dt_opencl_local_buffer_opt(devid, gd->kernel_highpass_vblur, &vlocopt))
+  if(dt_opencl_local_buffer_opt(devid, gd->kernel_highpass_vblur, &vlocopt) == CL_SUCCESS)
     vblocksize = vlocopt.sizey;
   else
     vblocksize = 1;
diff --git a/src/iop/nlmeans.c b/src/iop/nlmeans.c
index e4030e04dcf3..dd3f084c10f7 100644
--- a/src/iop/nlmeans.c
+++ b/src/iop/nlmeans.c
@@ -248,7 +248,7 @@ int process_cl(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_
                                   .cellsize = sizeof(float), .overhead = 0,
                                   .sizex = 1 << 16, .sizey = 1 };
 
-  if(dt_opencl_local_buffer_opt(devid, gd->kernel_nlmeans_horiz, &hlocopt))
+  if(dt_opencl_local_buffer_opt(devid, gd->kernel_nlmeans_horiz, &hlocopt) == CL_SUCCESS)
     hblocksize = hlocopt.sizex;
   else
     hblocksize = 1;
@@ -259,7 +259,7 @@ int process_cl(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_
                                   .cellsize = sizeof(float), .overhead = 0,
                                   .sizex = 1, .sizey = 1 << 16 };
 
-  if(dt_opencl_local_buffer_opt(devid, gd->kernel_nlmeans_vert, &vlocopt))
+  if(dt_opencl_local_buffer_opt(devid, gd->kernel_nlmeans_vert, &vlocopt) == CL_SUCCESS)
     vblocksize = vlocopt.sizey;
   else
     vblocksize = 1;
diff --git a/src/iop/sharpen.c b/src/iop/sharpen.c
index 544909fc1fc9..5229a54c8446 100644
--- a/src/iop/sharpen.c
+++ b/src/iop/sharpen.c
@@ -171,7 +171,7 @@ int process_cl(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_
                                   .cellsize = sizeof(float), .overhead = 0,
                                   .sizex = 1 << 16, .sizey = 1 };
 
-  if(dt_opencl_local_buffer_opt(devid, gd->kernel_sharpen_hblur, &hlocopt))
+  if(dt_opencl_local_buffer_opt(devid, gd->kernel_sharpen_hblur, &hlocopt) == CL_SUCCESS)
     hblocksize = hlocopt.sizex;
   else
     hblocksize = 1;
@@ -182,7 +182,7 @@ int process_cl(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_
                                   .cellsize = sizeof(float), .overhead = 0,
                                   .sizex = 1, .sizey = 1 << 16 };
 
-  if(dt_opencl_local_buffer_opt(devid, gd->kernel_sharpen_vblur, &vlocopt))
+  if(dt_opencl_local_buffer_opt(devid, gd->kernel_sharpen_vblur, &vlocopt) == CL_SUCCESS)
     vblocksize = vlocopt.sizey;
   else
     vblocksize = 1;
diff --git a/src/iop/soften.c b/src/iop/soften.c
index 1a28a30dbbfc..5c9c62f2c627 100644
--- a/src/iop/soften.c
+++ b/src/iop/soften.c
@@ -208,7 +208,7 @@ int process_cl(dt_iop_module_t *self,
                                   .sizex = 1 << 16,
                                   .sizey = 1 };
 
-  if(dt_opencl_local_buffer_opt(devid, gd->kernel_soften_hblur, &hlocopt))
+  if(dt_opencl_local_buffer_opt(devid, gd->kernel_soften_hblur, &hlocopt) == CL_SUCCESS)
     hblocksize = hlocopt.sizex;
   else
     hblocksize = 1;
@@ -224,7 +224,7 @@ int process_cl(dt_iop_module_t *self,
                                   .sizex = 1,
                                   .sizey = 1 << 16 };
 
-  if(dt_opencl_local_buffer_opt(devid, gd->kernel_soften_vblur, &vlocopt))
+  if(dt_opencl_local_buffer_opt(devid, gd->kernel_soften_vblur, &vlocopt) == CL_SUCCESS)
     vblocksize = vlocopt.sizey;
   else
     vblocksize = 1;

From e1ad1c0dc7083a6ab6a38c50b4aff946aec851eb Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Wed, 4 Feb 2026 16:39:59 +0100
Subject: [PATCH 2/9] Introduce OpenCL CLARGINT() and CLARGFLOAT() macros

As parameters for OpenCL kernels must be passed as 'pointer-to-value', we use
compound literals for readability.
---
 src/common/opencl.h          |  6 ++++++
 src/iop/demosaicing/dual.c   |  3 +--
 src/iop/demosaicing/rcd.c    | 10 +++-------
 src/iop/demosaicing/xtrans.c | 38 ++++++++++++++++--------------------
 4 files changed, 27 insertions(+), 30 deletions(-)

diff --git a/src/common/opencl.h b/src/common/opencl.h
index 45f30f828ea6..a3aff0148148 100644
--- a/src/common/opencl.h
+++ b/src/common/opencl.h
@@ -359,6 +359,12 @@ int dt_opencl_get_kernel_work_group_size(const int dev,
 /** wrap opencl single argument */
 #define CLARG(arg) CLWRAP(sizeof(arg), &arg)
 
+/** wrap an inline value (a #define or literal constant) in a C99 compound literal so its address can be taken.
+  See https://en.cppreference.com/w/c/language/compound_literal.html
+*/
+#define CLARGINT(arg) CLWRAP(sizeof(int), &((int){arg}))
+#define CLARGFLOAT(arg) CLWRAP(sizeof(float), &((float){arg}))
+
 /** wrap opencl argument array */
 #define CLARRAY(num, arg) CLWRAP(num * sizeof(*arg), arg)
 
diff --git a/src/iop/demosaicing/dual.c b/src/iop/demosaicing/dual.c
index 0ce80430e3f2..cf39c210fb62 100644
--- a/src/iop/demosaicing/dual.c
+++ b/src/iop/demosaicing/dual.c
@@ -126,9 +126,8 @@ int dual_demosaic_cl(const dt_iop_module_t *self,
      CLARG(mask), CLARG(tmp), CLARG(width), CLARG(height));
   if(err != CL_SUCCESS) goto finish;
 
-  const int detail = 1;
   err = dt_opencl_enqueue_kernel_2d_args(devid, darktable.opencl->blendop->kernel_calc_blend, width, height,
-      CLARG(tmp), CLARG(mask), CLARG(width), CLARG(height), CLARG(contrastf), CLARG(detail));
+      CLARG(tmp), CLARG(mask), CLARG(width), CLARG(height), CLARG(contrastf), CLARGINT(1));
   if(err != CL_SUCCESS) goto finish;
 
   err = dt_gaussian_fast_blur_cl_buffer(devid, mask, tmp, width, height, 2.0f, 1, 0.0f, 1.0f);
diff --git a/src/iop/demosaicing/rcd.c b/src/iop/demosaicing/rcd.c
index eaf026681f1e..30b82e40eb7d 100644
--- a/src/iop/demosaicing/rcd.c
+++ b/src/iop/demosaicing/rcd.c
@@ -615,7 +615,6 @@ static cl_int process_rcd_cl(dt_iop_module_t *self,
   dev_tmp = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4);
   if(dev_tmp == NULL) goto error;
 
-  int myborder = 3;
   err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_border_interpolate, width, height,
         CLARG(dev_in), CLARG(dev_tmp), CLARG(width), CLARG(height), CLARG(filters));
   if(err != CL_SUCCESS) goto error;
@@ -629,12 +628,11 @@ static cl_int process_rcd_cl(dt_iop_module_t *self,
     err = dt_opencl_local_buffer_opt(devid, gd->kernel_rcd_border_green, &locopt);
     if(err != CL_SUCCESS) goto error;
 
-    myborder = 32;
     size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
     size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
     dt_opencl_set_kernel_args(devid, gd->kernel_rcd_border_green, 0, CLARG(dev_in), CLARG(dev_tmp),
         CLARG(width), CLARG(height), CLARG(filters), CLLOCAL(sizeof(float) * (locopt.sizex + 2*3) * (locopt.sizey + 2*3)),
-        CLARG(myborder));
+        CLARGINT(32));
     err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_rcd_border_green, sizes, local);
     if(err != CL_SUCCESS) goto error;
   }
@@ -648,12 +646,11 @@ static cl_int process_rcd_cl(dt_iop_module_t *self,
     err = dt_opencl_local_buffer_opt(devid, gd->kernel_rcd_border_redblue, &locopt);
     if(err != CL_SUCCESS) goto error;
 
-    myborder = 16;
     size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 };
     size_t local[3] = { locopt.sizex, locopt.sizey, 1 };
     dt_opencl_set_kernel_args(devid, gd->kernel_rcd_border_redblue, 0, CLARG(dev_tmp), CLARG(dev_out),
       CLARG(width), CLARG(height), CLARG(filters), CLLOCAL(sizeof(float) * 4 * (locopt.sizex + 2) * (locopt.sizey + 2)),
-      CLARG(myborder));
+      CLARGINT(16));
     err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_rcd_border_redblue, sizes, local);
     if(err != CL_SUCCESS) goto error;
   }
@@ -722,9 +719,8 @@ static cl_int process_rcd_cl(dt_iop_module_t *self,
 
   scaler = dt_iop_get_processed_maximum(piece);
   // write output
-  myborder = RCD_MARGIN;
   err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_rcd_write_output, width, height,
-        CLARG(dev_out), CLARG(rgb0), CLARG(rgb1), CLARG(rgb2), CLARG(width), CLARG(height), CLARG(scaler), CLARG(myborder));
+        CLARG(dev_out), CLARG(rgb0), CLARG(rgb1), CLARG(rgb2), CLARG(width), CLARG(height), CLARG(scaler), CLARGINT(RCD_MARGIN));
 
 error:
   dt_opencl_release_mem_object(dev_tmp);
diff --git a/src/iop/demosaicing/xtrans.c b/src/iop/demosaicing/xtrans.c
index 5b84ec098a2e..e0d329318642 100644
--- a/src/iop/demosaicing/xtrans.c
+++ b/src/iop/demosaicing/xtrans.c
@@ -21,6 +21,10 @@
 // tile size, optimized to keep data in L2 cache
 #define TS 122
 
+#define PAD_G1_G3 3
+#define PAD_G_INTERP 3
+#define PAD_G_RECALC 6
+
 /** Lookup for allhex[], making sure that row/col aren't negative **/
 static inline const short *_hexmap(const int row,
                                    const int col,
@@ -186,13 +190,12 @@ static void xtrans_markesteijn_interpolate(float *out,
       // and g3 values to the min/max of green pixels surrounding the
       // pair. Use a 3 pixel border as gmin/gmax is used by
       // interpolate green which has a 3 pixel border.
-      const int pad_g1_g3 = 3;
-      for(int row = top + pad_g1_g3; row < mrow - pad_g1_g3; row++)
+      for(int row = top + PAD_G1_G3; row < mrow - PAD_G1_G3; row++)
       {
         // setting max to 0.0f signifies that this is a new pair, which
         // requires a new min/max calculation of its neighboring greens
         float min = FLT_MAX, max = 0.0f;
-        for(int col = left + pad_g1_g3; col < mcol - pad_g1_g3; col++)
+        for(int col = left + PAD_G1_G3; col < mcol - PAD_G1_G3; col++)
         {
           // if in row of horizontal red & blue pairs (or processing
           // vertical red & blue pairs near image bottom), reset min/max
@@ -238,9 +241,8 @@ static void xtrans_markesteijn_interpolate(float *out,
 
       /* Interpolate green horizontally, vertically, and along both diagonals: */
       // need a 3 pixel border here as 3*hex[] can have a 3 unit offset
-      const int pad_g_interp = 3;
-      for(int row = top + pad_g_interp; row < mrow - pad_g_interp; row++)
-        for(int col = left + pad_g_interp; col < mcol - pad_g_interp; col++)
+      for(int row = top + PAD_G_INTERP; row < mrow - PAD_G_INTERP; row++)
+        for(int col = left + PAD_G_INTERP; col < mcol - PAD_G_INTERP; col++)
         {
           float color[8];
           const int f = FCNxtrans(row, col, xtrans);
@@ -274,9 +276,8 @@ static void xtrans_markesteijn_interpolate(float *out,
         /* Recalculate green from interpolated values of closer pixels: */
         if(pass)
         {
-          const int pad_g_recalc = 6;
-          for(int row = top + pad_g_recalc; row < mrow - pad_g_recalc; row++)
-            for(int col = left + pad_g_recalc; col < mcol - pad_g_recalc; col++)
+          for(int row = top + PAD_G_RECALC; row < mrow - PAD_G_RECALC; row++)
+            for(int col = left + PAD_G_RECALC; col < mcol - PAD_G_RECALC; col++)
             {
               const int f = FCNxtrans(row, col, xtrans);
               if(f == 1) continue;
@@ -1227,13 +1228,12 @@ static void xtrans_fdc_interpolate(float *out,
       // and g3 values to the min/max of green pixels surrounding the
       // pair. Use a 3 pixel border as gmin/gmax is used by
       // interpolate green which has a 3 pixel border.
-      const int pad_g1_g3 = 3;
-      for(int row = top + pad_g1_g3; row < mrow - pad_g1_g3; row++)
+      for(int row = top + PAD_G1_G3; row < mrow - PAD_G1_G3; row++)
       {
         // setting max to 0.0f signifies that this is a new pair, which
         // requires a new min/max calculation of its neighboring greens
         float min = FLT_MAX, max = 0.0f;
-        for(int col = left + pad_g1_g3; col < mcol - pad_g1_g3; col++)
+        for(int col = left + PAD_G1_G3; col < mcol - PAD_G1_G3; col++)
         {
           // if in row of horizontal red & blue pairs (or processing
           // vertical red & blue pairs near image bottom), reset min/max
@@ -1279,9 +1279,8 @@ static void xtrans_fdc_interpolate(float *out,
 
       /* Interpolate green horizontally, vertically, and along both diagonals: */
       // need a 3 pixel border here as 3*hex[] can have a 3 unit offset
-      const int pad_g_interp = 3;
-      for(int row = top + pad_g_interp; row < mrow - pad_g_interp; row++)
-        for(int col = left + pad_g_interp; col < mcol - pad_g_interp; col++)
+      for(int row = top + PAD_G_INTERP; row < mrow - PAD_G_INTERP; row++)
+        for(int col = left + PAD_G_INTERP; col < mcol - PAD_G_INTERP; col++)
         {
           float color[8];
           const int f = FCNxtrans(row, col, xtrans);
@@ -1727,7 +1726,6 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self,
     }
 
     // find minimum and maximum allowed green values of red/blue pixel pairs
-    const int pad_g1_g3 = 3;
     dt_opencl_local_buffer_t locopt_g1_g3
       = (dt_opencl_local_buffer_t){ .xoffset = 2*3, .xfactor = 1, .yoffset = 2*3, .yfactor = 1,
                                     .cellsize = 1 * sizeof(float), .overhead = 0,
@@ -1741,14 +1739,13 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self,
       const size_t local[3] = { locopt_g1_g3.sizex, locopt_g1_g3.sizey, 1 };
       dt_opencl_set_kernel_args(devid, gd->kernel_markesteijn_green_minmax, 0,
         CLARG(dev_rgb[0]), CLARG(dev_gminmax),
-        CLARG(width), CLARG(height), CLARG(pad_g1_g3), CLARRAY(2, sgreen),
+        CLARG(width), CLARG(height), CLARGINT(PAD_G1_G3), CLARRAY(2, sgreen),
         CLARG(dev_xtrans), CLARG(dev_allhex), CLLOCAL(sizeof(float) * (locopt_g1_g3.sizex + 2*3) * (locopt_g1_g3.sizey + 2*3)));
       err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_markesteijn_green_minmax, sizes, local);
       if(err != CL_SUCCESS) goto error;
     }
 
     // interpolate green horizontally, vertically, and along both diagonals
-    const int pad_g_interp = 3;
     dt_opencl_local_buffer_t locopt_g_interp
       = (dt_opencl_local_buffer_t){ .xoffset = 2*6, .xfactor = 1, .yoffset = 2*6, .yfactor = 1,
                                     .cellsize = 4 * sizeof(float), .overhead = 0,
@@ -1763,7 +1760,7 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self,
       dt_opencl_set_kernel_args(devid, gd->kernel_markesteijn_interpolate_green, 0,
         CLARG(dev_rgb[0]), CLARG(dev_rgb[1]), CLARG(dev_rgb[2]), CLARG(dev_rgb[3]),
         CLARG(dev_gminmax), CLARG(width), CLARG(height),
-        CLARG(pad_g_interp), CLARRAY(2, sgreen), CLARG(dev_xtrans),
+        CLARGINT(PAD_G_INTERP), CLARRAY(2, sgreen), CLARG(dev_xtrans),
         CLARG(dev_allhex), CLLOCAL(sizeof(float) * 4 * (locopt_g_interp.sizex + 2*6) * (locopt_g_interp.sizey + 2*6)));
       err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_markesteijn_interpolate_green, sizes, local);
       if(err != CL_SUCCESS) goto error;
@@ -1790,10 +1787,9 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self,
       if(pass)
       {
         // recalculate green from interpolated values of closer pixels
-        const int pad_g_recalc = 6;
         err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_markesteijn_recalculate_green, width, height,
           CLARG(dev_rgb[0]), CLARG(dev_rgb[1]), CLARG(dev_rgb[2]), CLARG(dev_rgb[3]), CLARG(dev_gminmax),
-          CLARG(width), CLARG(height), CLARG(pad_g_recalc), CLARRAY(2, sgreen),
+          CLARG(width), CLARG(height), CLARGINT(PAD_G_RECALC), CLARRAY(2, sgreen),
           CLARG(dev_xtrans), CLARG(dev_allhex));
         if(err != CL_SUCCESS) goto error;
       }

From bd74658be9144ef448acf543dd9786563e9a9ffc Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Wed, 4 Feb 2026 16:43:15 +0100
Subject: [PATCH 3/9] OpenCL basecurve maintenance

- kernels called with the _args() variant
- a missing mem allocation check added
- using CLARGFLOAT
---
 src/iop/basecurve.c | 32 +++++++++++---------------------
 1 file changed, 11 insertions(+), 21 deletions(-)

diff --git a/src/iop/basecurve.c b/src/iop/basecurve.c
index a222747199a6..54eeb94e7533 100644
--- a/src/iop/basecurve.c
+++ b/src/iop/basecurve.c
@@ -1,6 +1,6 @@
 /*
     This file is part of darktable,
-    Copyright (C) 2010-2025 darktable developers.
+    Copyright (C) 2010-2026 darktable developers.
 
     darktable is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -920,7 +920,7 @@ int process_cl_lut(dt_iop_module_t *self,
 
   cl_mem dev_m = NULL;
   cl_mem dev_coeffs = NULL;
-  cl_int err = DT_OPENCL_DEFAULT_ERROR;
+  cl_int err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
 
   cl_mem dev_profile_info = NULL;
   cl_mem dev_profile_lut = NULL;
@@ -933,36 +933,26 @@ int process_cl_lut(dt_iop_module_t *self,
   const int height = roi_in->height;
   const int preserve_colors = d->preserve_colors;
 
-  const float mul = 1.0f;
-
-  size_t sizes[] = { ROUNDUPDWD(width, devid), ROUNDUPDHT(height, devid), 1 };
   dev_m = dt_opencl_copy_host_to_device(devid, d->table, 256, 256, sizeof(float));
-  if(dev_m == NULL) goto error;
+  dev_coeffs = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 3, d->unbounded_coeffs);
+  if(!dev_m || !dev_coeffs) goto error;
 
   err = dt_ioppr_build_iccprofile_params_cl(work_profile, devid, &profile_info_cl, &profile_lut_cl,
                                             &dev_profile_info, &dev_profile_lut);
   if(err != CL_SUCCESS) goto error;
 
-  dev_coeffs = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 3, d->unbounded_coeffs);
-
-  if(dev_coeffs == NULL) goto error;
-
   // read data/kernels/basecurve.cl for a description of "legacy" vs current
   // Conditional is moved outside of the OpenCL operations for performance.
   if(d->preserve_colors == DT_RGB_NORM_NONE)
-  {
-    dt_opencl_set_kernel_args(devid, gd->kernel_basecurve_legacy_lut, 0, CLARG(dev_in), CLARG(dev_out),
-      CLARG(width), CLARG(height), CLARG(mul), CLARG(dev_m), CLARG(dev_coeffs));
-    err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_basecurve_legacy_lut, sizes);
-  }
+    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_basecurve_legacy_lut, width, height,
+          CLARG(dev_in), CLARG(dev_out),
+          CLARG(width), CLARG(height), CLARGFLOAT(1.0f), CLARG(dev_m), CLARG(dev_coeffs));
   else
-  {
-    //FIXME:  There are still conditionals on d->preserve_colors within this flow that could impact performance
-    dt_opencl_set_kernel_args(devid, gd->kernel_basecurve_lut, 0, CLARG(dev_in), CLARG(dev_out), CLARG(width),
-      CLARG(height), CLARG(mul), CLARG(dev_m), CLARG(dev_coeffs), CLARG(preserve_colors), CLARG(dev_profile_info),
+    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_basecurve_lut, width, height,
+      CLARG(dev_in), CLARG(dev_out),
+      CLARG(width), CLARG(height),
+      CLARGFLOAT(1.0f), CLARG(dev_m), CLARG(dev_coeffs), CLARG(preserve_colors), CLARG(dev_profile_info),
       CLARG(dev_profile_lut), CLARG(use_work_profile));
-    err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_basecurve_lut, sizes);
-  }
 
 error:
   dt_opencl_release_mem_object(dev_m);

From 495d748d98eb01023a3ea9c146cb798b7fccbaf4 Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Wed, 4 Feb 2026 08:08:02 +0100
Subject: [PATCH 4/9] OpenCL bilateral and colorreconstruct maintenance

- deduplicated code for atomic floats to common.h
  preparation for better support also for modern devices instead of that workaround
- use _args() variants for kernel calls in bilateral
- constify in both related kernels
---
 data/kernels/bilateral.cl           | 103 +++++++++-------------------
 data/kernels/colorreconstruction.cl |  67 ++++--------------
 data/kernels/common.h               |  38 ++++++++++
 src/common/bilateralcl.c            |  33 ++++-----
 4 files changed, 95 insertions(+), 146 deletions(-)

diff --git a/data/kernels/bilateral.cl b/data/kernels/bilateral.cl
index 3c8d7c45dde1..8c48678bbbe8 100644
--- a/data/kernels/bilateral.cl
+++ b/data/kernels/bilateral.cl
@@ -1,6 +1,6 @@
 /*
     This file is part of darktable,
-    copyright (c) 2012-2025 darktable developers.
+    copyright (c) 2012-2026 darktable developers.
 
     darktable is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -30,45 +30,6 @@ image_to_grid(
     clamp(p.z/sigma.z, 0.0f, size.z-1.0f), 0.0f);
 }
 
-void
-atomic_add_f(
-    global float *val,
-    const  float  delta)
-{
-#ifdef NVIDIA_SM_20
-  // buys me another 3x--10x over the `algorithmic' improvements in the splat kernel below,
-  // depending on configuration (sigma_s and sigma_r)
-  float res = 0;
-  asm volatile ("atom.global.add.f32 %0, [%1], %2;" : "=f"(res) : "l"(val), "f"(delta));
-
-#else
-  union
-  {
-    float f;
-    unsigned int i;
-  }
-  old_val;
-  union
-  {
-    float f;
-    unsigned int i;
-  }
-  new_val;
-
-  global volatile unsigned int *ival = (global volatile unsigned int *)val;
-
-  do
-  {
-    // the following is equivalent to old_val.f = *val. however, as according to the opencl standard
-    // we can not rely on global buffer val to be consistently cached (relaxed memory consistency) we 
-    // access it via a slower but consistent atomic operation.
-    old_val.i = atomic_add(ival, 0);
-    new_val.f = old_val.f + delta;
-  }
-  while (atomic_cmpxchg (ival, old_val.i, new_val.i) != old_val.i);
-#endif
-}
-
 kernel void
 zero(
     global float *grid,
@@ -102,29 +63,29 @@ splat(
   const int j = get_local_id(1);
   int li = lszx*j + i;
 
-  int4   size  = (int4)(sizex, sizey, sizez, 0);
-  float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0);
+  const int4   size  = (int4)(sizex, sizey, sizez, 0);
+  const float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0);
 
   int ox = 1;
-  int oy = size.x;
-  int oz = size.y*size.x;
+  const int oy = size.x;
+  const int oz = size.y*size.x;
 
   if(x < width && y < height)
   {
     // splat into downsampled grid
 
     const float4 pixel = read_imagef (in, samplerc, (int2)(x, y));
-    float L = pixel.x;
-    float4 p = (float4)(x, y, L, 0);
-    float4 gridp = image_to_grid(p, size, sigma);
-    int4 xi = min(size - 2, (int4)(gridp.x, gridp.y, gridp.z, 0));
-    float fx = gridp.x - xi.x;
-    float fy = gridp.y - xi.y;
-    float fz = gridp.z - xi.z;
+    const float L = pixel.x;
+    const float4 p = (float4)(x, y, L, 0.0f);
+    const float4 gridp = image_to_grid(p, size, sigma);
+    const int4 xi = min(size - 2, (int4)(gridp.x, gridp.y, gridp.z, 0));
+    const float fx = gridp.x - xi.x;
+    const float fy = gridp.y - xi.y;
+    const float fz = gridp.z - xi.z;
 
     // first accumulate into local memory
     gi[li] = xi.x + oy*xi.y + oz*xi.z;
-    float contrib = 100.0f/(sigma_s*sigma_s);
+    const float contrib = 100.0f/(sigma_s*sigma_s);
     li *= 8;
     accum[li++] = contrib * (1.0f-fx) * (1.0f-fy) * (1.0f-fz);
     accum[li++] = contrib * (     fx) * (1.0f-fy) * (1.0f-fz);
@@ -301,18 +262,18 @@ slice_to_output(
   const int oy = sizex;
   const int oz = sizey*sizex;
 
-  int4   size  = (int4)(sizex, sizey, sizez, 0);
-  float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0);
+  const int4   size  = (int4)(sizex, sizey, sizez, 0);
+  const float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0);
 
-  float4 pixel  = read_imagef (in,   samplerc, (int2)(x, y));
+  const float4 pixel  = read_imagef (in,   samplerc, (int2)(x, y));
   float4 pixel2 = read_imagef (target, samplerc, (int2)(x, y));
-  float L = pixel.x;
-  float4 p = (float4)(x, y, L, 0);
-  float4 gridp = image_to_grid(p, size, sigma);
-  int4 gridi = min(size - 2, (int4)(gridp.x, gridp.y, gridp.z, 0));
-  float fx = gridp.x - gridi.x;
-  float fy = gridp.y - gridi.y;
-  float fz = gridp.z - gridi.z;
+  const float L = pixel.x;
+  const float4 p = (float4)(x, y, L, 0);
+  const float4 gridp = image_to_grid(p, size, sigma);
+  const int4 gridi = min(size - 2, (int4)(gridp.x, gridp.y, gridp.z, 0));
+  const float fx = gridp.x - gridi.x;
+  const float fy = gridp.y - gridi.y;
+  const float fz = gridp.z - gridi.z;
 
   // trilinear lookup (wouldn't read/write access to 3d textures be cool)
   // could actually use an array of 2d textures, these only require opencl 1.2
@@ -354,17 +315,17 @@ slice(
   const int oy = sizex;
   const int oz = sizey*sizex;
 
-  int4   size  = (int4)(sizex, sizey, sizez, 0);
-  float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0);
+  const int4   size  = (int4)(sizex, sizey, sizez, 0);
+  const float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0);
 
   float4 pixel = read_imagef (in, samplerc, (int2)(x, y));
-  float L = pixel.x;
-  float4 p = (float4)(x, y, L, 0);
-  float4 gridp = image_to_grid(p, size, sigma);
-  int4 gridi = min(size - 2, (int4)(gridp.x, gridp.y, gridp.z, 0));
-  float fx = gridp.x - gridi.x;
-  float fy = gridp.y - gridi.y;
-  float fz = gridp.z - gridi.z;
+  const float L = pixel.x;
+  const float4 p = (float4)(x, y, L, 0);
+  const float4 gridp = image_to_grid(p, size, sigma);
+  const int4 gridi = min(size - 2, (int4)(gridp.x, gridp.y, gridp.z, 0));
+  const float fx = gridp.x - gridi.x;
+  const float fy = gridp.y - gridi.y;
+  const float fz = gridp.z - gridi.z;
 
   // trilinear lookup (wouldn't read/write access to 3d textures be cool)
   // could actually use an array of 2d textures, these only require opencl 1.2
diff --git a/data/kernels/colorreconstruction.cl b/data/kernels/colorreconstruction.cl
index 32a291414deb..c7a582fae39d 100644
--- a/data/kernels/colorreconstruction.cl
+++ b/data/kernels/colorreconstruction.cl
@@ -1,7 +1,6 @@
 /*
     This file is part of darktable,
-    copyright (c) 2012 johannes hanika.
-    copyright (c) 2015 Ulrich Pegelow.
+    Copyright (C) 2012-2026 darktable developers.
 
     darktable is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -48,46 +47,6 @@ grid_rescale(
   return convert_float2(roixy + pxy) * scale - convert_float2(bxy);
 }
 
-
-void
-atomic_add_f(
-    global float *val,
-    const  float  delta)
-{
-#ifdef NVIDIA_SM_20
-  // buys me another 3x--10x over the `algorithmic' improvements in the splat kernel below,
-  // depending on configuration (sigma_s and sigma_r)
-  float res = 0;
-  asm volatile ("atom.global.add.f32 %0, [%1], %2;" : "=f"(res) : "l"(val), "f"(delta));
-
-#else
-  union
-  {
-    float f;
-    unsigned int i;
-  }
-  old_val;
-  union
-  {
-    float f;
-    unsigned int i;
-  }
-  new_val;
-
-  global volatile unsigned int *ival = (global volatile unsigned int *)val;
-
-  do
-  {
-    // the following is equivalent to old_val.f = *val. however, as according to the opencl standard
-    // we can not rely on global buffer val to be consistently cached (relaxed memory consistency) we
-    // access it via a slower but consistent atomic operation.
-    old_val.i = atomic_add(ival, 0);
-    new_val.f = old_val.f + delta;
-  }
-  while (atomic_cmpxchg (ival, old_val.i, new_val.i) != old_val.i);
-#endif
-}
-
 kernel void
 colorreconstruction_zero(
     global float  *grid,
@@ -125,8 +84,8 @@ colorreconstruction_splat(
   const int j = get_local_id(1);
   int li = lszx*j + i;
 
-  int4   size  = (int4)(sizex, sizey, sizez, 0);
-  float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0);
+  const int4   size  = (int4)(sizex, sizey, sizez, 0);
+  const float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0);
 
   const float4 pixel = read_imagef (in, samplerc, (int2)(x, y));
   float weight, m;
@@ -153,11 +112,11 @@ colorreconstruction_splat(
   if(x < width && y < height)
   {
     // splat into downsampled grid
-    float4 p = (float4)(x, y, pixel.x, 0);
-    float4 gridp = image_to_grid(p, size, sigma);
+    const float4 p = (float4)(x, y, pixel.x, 0);
+    const float4 gridp = image_to_grid(p, size, sigma);
 
     // closest integer splatting:
-    int4 xi = clamp(convert_int4(round(gridp)), 0, size - 1);
+    const int4 xi = clamp(convert_int4(round(gridp)), 0, size - 1);
 
     // first accumulate into local memory
     gi[li] = xi.x + size.x*xi.y + size.x*size.y*xi.z;
@@ -280,15 +239,15 @@ colorreconstruction_slice(
   const int oy = sizex;
   const int oz = sizey*sizex;
 
-  int4   size  = (int4)(sizex, sizey, sizez, 0);
-  float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0);
+  const int4   size  = (int4)(sizex, sizey, sizez, 0);
+  const float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0);
 
   float4 pixel = read_imagef (in, samplerc, (int2)(x, y));
-  float blend = clipf(20.0f / threshold * pixel.x - 19.0f);
-  float2 pxy = grid_rescale((int2)(x, y), roixy, bxy, scale);
-  float4 p = (float4)(pxy.x, pxy.y, pixel.x, 0);
-  float4 gridp = image_to_grid(p, size, sigma);
-  int4 gridi = min(size - 2, (int4)(gridp.x, gridp.y, gridp.z, 0));
+  const float blend = clipf(20.0f / threshold * pixel.x - 19.0f);
+  const float2 pxy = grid_rescale((int2)(x, y), roixy, bxy, scale);
+  const float4 p = (float4)(pxy.x, pxy.y, pixel.x, 0);
+  const float4 gridp = image_to_grid(p, size, sigma);
+  const int4 gridi = min(size - 2, (int4)(gridp.x, gridp.y, gridp.z, 0));
   float fx = gridp.x - gridi.x;
   float fy = gridp.y - gridi.y;
   float fz = gridp.z - gridi.z;
diff --git a/data/kernels/common.h b/data/kernels/common.h
index 83bdf28ee4ee..234f00b44149 100644
--- a/data/kernels/common.h
+++ b/data/kernels/common.h
@@ -85,6 +85,44 @@ fcol(const int row, const int col, const unsigned int filters, global const unsi
                         : filters >> ((((row) << 1 & 14) + ((col) & 1)) << 1) & 3;
 }
 
+void
+atomic_add_f(
+    global float *val,
+    const  float  delta)
+{
+#ifdef NVIDIA_SM_20
+  // buys me another 3x--10x over the `algorithmic' improvements in the splat kernel below,
+  // depending on configuration (sigma_s and sigma_r)
+  float res = 0;
+  asm volatile ("atom.global.add.f32 %0, [%1], %2;" : "=f"(res) : "l"(val), "f"(delta));
+
+#else
+  union
+  {
+    float f;
+    unsigned int i;
+  }
+  old_val;
+  union
+  {
+    float f;
+    unsigned int i;
+  }
+  new_val;
+
+  global volatile unsigned int *ival = (global volatile unsigned int *)val;
+
+  do
+  {
+    // the following is equivalent to old_val.f = *val. however, as according to the opencl standard
+    // we can not rely on global buffer val to be consistently cached (relaxed memory consistency) we 
+    // access it via a slower but consistent atomic operation.
+    old_val.i = atomic_add(ival, 0);
+    new_val.f = old_val.f + delta;
+  }
+  while (atomic_cmpxchg (ival, old_val.i, new_val.i) != old_val.i);
+#endif
+}
 
 static inline float
 dt_fast_hypot(const float x, const float y)
diff --git a/src/common/bilateralcl.c b/src/common/bilateralcl.c
index 19b59a83a568..5932d41d6f1d 100644
--- a/src/common/bilateralcl.c
+++ b/src/common/bilateralcl.c
@@ -166,41 +166,32 @@ cl_int dt_bilateral_splat_cl(dt_bilateral_cl_t *b, cl_mem in)
 
 cl_int dt_bilateral_blur_cl(dt_bilateral_cl_t *b)
 {
-  size_t sizes[3] = { 0, 0, 1 };
-
   cl_int err = dt_opencl_enqueue_copy_buffer_to_buffer(b->devid, b->dev_grid, b->dev_grid_tmp, 0, 0,
                                                 sizeof(float) * b->size_x * b->size_y * b->size_z);
   if(err != CL_SUCCESS) return err;
 
-  sizes[0] = ROUNDUPDWD(b->size_z, b->devid);
-  sizes[1] = ROUNDUPDHT(b->size_y, b->devid);
-  int stride1, stride2, stride3;
-  stride1 = b->size_x * b->size_y;
-  stride2 = b->size_x;
-  stride3 = 1;
-  dt_opencl_set_kernel_args(b->devid, b->global->kernel_blur_line, 0, CLARG(b->dev_grid_tmp), CLARG(b->dev_grid),
-    CLARG(stride1), CLARG(stride2), CLARG(stride3), CLARG(b->size_z), CLARG(b->size_y), CLARG(b->size_x));
-  err = dt_opencl_enqueue_kernel_2d(b->devid, b->global->kernel_blur_line, sizes);
+  int stride1 = b->size_x * b->size_y;
+  int stride2 = b->size_x;
+  int stride3 = 1;
+  err = dt_opencl_enqueue_kernel_2d_args(b->devid, b->global->kernel_blur_line, b->size_z, b->size_y,
+        CLARG(b->dev_grid_tmp), CLARG(b->dev_grid),
+        CLARG(stride1), CLARG(stride2), CLARG(stride3), CLARG(b->size_z), CLARG(b->size_y), CLARG(b->size_x));
   if(err != CL_SUCCESS) return err;
 
   stride1 = b->size_x * b->size_y;
   stride2 = 1;
   stride3 = b->size_x;
-  sizes[0] = ROUNDUPDWD(b->size_z, b->devid);
-  sizes[1] = ROUNDUPDHT(b->size_x, b->devid);
-  dt_opencl_set_kernel_args(b->devid, b->global->kernel_blur_line, 0, CLARG(b->dev_grid), CLARG(b->dev_grid_tmp),
-    CLARG(stride1), CLARG(stride2), CLARG(stride3), CLARG(b->size_z), CLARG(b->size_x), CLARG(b->size_y));
-  err = dt_opencl_enqueue_kernel_2d(b->devid, b->global->kernel_blur_line, sizes);
+  err = dt_opencl_enqueue_kernel_2d_args(b->devid, b->global->kernel_blur_line, b->size_z, b->size_x,
+        CLARG(b->dev_grid), CLARG(b->dev_grid_tmp),
+        CLARG(stride1), CLARG(stride2), CLARG(stride3), CLARG(b->size_z), CLARG(b->size_x), CLARG(b->size_y));
   if(err != CL_SUCCESS) return err;
 
   stride1 = 1;
   stride2 = b->size_x;
   stride3 = b->size_x * b->size_y;
-  sizes[0] = ROUNDUPDWD(b->size_x, b->devid);
-  sizes[1] = ROUNDUPDHT(b->size_y, b->devid);
-  dt_opencl_set_kernel_args(b->devid, b->global->kernel_blur_line_z, 0, CLARG(b->dev_grid_tmp), CLARG(b->dev_grid),
-    CLARG(stride1), CLARG(stride2), CLARG(stride3), CLARG(b->size_x), CLARG(b->size_y), CLARG(b->size_z));
-  return dt_opencl_enqueue_kernel_2d(b->devid, b->global->kernel_blur_line_z, sizes);
+  return dt_opencl_enqueue_kernel_2d_args(b->devid, b->global->kernel_blur_line_z, b->size_x, b->size_y,
+          CLARG(b->dev_grid_tmp), CLARG(b->dev_grid),
+          CLARG(stride1), CLARG(stride2), CLARG(stride3), CLARG(b->size_x), CLARG(b->size_y), CLARG(b->size_z));   
 }
 
 cl_int dt_bilateral_slice_to_output_cl(dt_bilateral_cl_t *b, cl_mem in, cl_mem out, const float detail)

From e0e93f17bc563fb0fdcb99875fb5a71777b9ec9d Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Fri, 6 Feb 2026 08:07:25 +0100
Subject: [PATCH 5/9] Atrous OpenCL maintenance

1. All non-local OpenCL kernel calls use _args() interface
2. new kernels as asked for by @ralfbrown (code does not work)
   - eaw_zero
   - eaw_addbuffers
---
 data/kernels/atrous.cl |  75 +++++++++++-----
 src/iop/atrous.c       | 197 +++++++++++++++--------------------------
 2 files changed, 126 insertions(+), 146 deletions(-)

diff --git a/data/kernels/atrous.cl b/data/kernels/atrous.cl
index 504250f2056f..79a4f17a47b1 100644
--- a/data/kernels/atrous.cl
+++ b/data/kernels/atrous.cl
@@ -1,6 +1,6 @@
 /*
     This file is part of darktable,
-    copyright (c) 2009--2010 johannes hanika.
+    Copyright (C) 2009-2026 darktable developers.
 
     darktable is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -19,20 +19,24 @@
 #include "common.h"
 
 
-
 float4
 weight(const float4 c1, const float4 c2, const float sharpen)
 {
-  // native_exp is faster than the cpu floating point aliasing hack:
-  const float wc = native_exp(-((c1.y - c2.y)*(c1.y - c2.y) + (c1.z - c2.z)*(c1.z - c2.z)) * sharpen);
-  const float wl = native_exp(- (c1.x - c2.x)*(c1.x - c2.x) * sharpen);
+  const float wc = dtcl_exp(-((c1.y - c2.y)*(c1.y - c2.y) + (c1.z - c2.z)*(c1.z - c2.z)) * sharpen);
+  const float wl = dtcl_exp(- (c1.x - c2.x)*(c1.x - c2.x) * sharpen);
   return (float4)(wl, wc, wc, 1.0f);
 }
 
 
 __kernel void
-eaw_decompose (__read_only image2d_t in, __write_only image2d_t coarse, __write_only image2d_t detail,
-     const int width, const int height, const int scale, const float sharpen, global const float *filter)
+eaw_decompose(__read_only image2d_t in,
+              __write_only image2d_t coarse,
+              __write_only image2d_t detail,
+              const int width,
+              const int height,
+              const int scale,
+              const float sharpen,
+              global const float *filter)
 {
   const int x = get_global_id(0);
   const int y = get_global_id(1);
@@ -41,17 +45,17 @@ eaw_decompose (__read_only image2d_t in, __write_only image2d_t coarse, __write_
 
   const int mult = 1<<scale;
 
-  float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
+  const float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
   float4 sum = (float4)(0.0f);
   float4 wgt = (float4)(0.0f);
   for(int j=0;j<5;j++) for(int i=0;i<5;i++)
   {
-    int xx = mad24(mult, i - 2, x);
-    int yy = mad24(mult, j - 2, y);
-    int k  = mad24(j, 5, i);
+    const int xx = mad24(mult, i - 2, x);
+    const int yy = mad24(mult, j - 2, y);
+    const int k  = mad24(j, 5, i);
 
-    float4 px = read_imagef(in, sampleri, (int2)(xx, yy));
-    float4 w = filter[k]*weight(pixel, px, sharpen);
+    const float4 px = read_imagef(in, sampleri, (int2)(xx, yy));
+    const float4 w = filter[k]*weight(pixel, px, sharpen);
 
     sum += w*px;
     wgt += w;
@@ -65,23 +69,50 @@ eaw_decompose (__read_only image2d_t in, __write_only image2d_t coarse, __write_
 
 
 __kernel void
-eaw_synthesize (__write_only image2d_t out, __read_only image2d_t coarse, __read_only image2d_t detail,
-     const int width, const int height,
-     const float t0, const float t1, const float t2, const float t3,
-     const float b0, const float b1, const float b2, const float b3)
+eaw_synthesize(__write_only image2d_t out,
+               __read_only image2d_t coarse,
+               __read_only image2d_t detail,
+               const int width,
+               const int height,
+               const float4 threshold,
+               const float4 boost)
 {
   const int x = get_global_id(0);
   const int y = get_global_id(1);
 
   if(x >= width || y >= height) return;
 
-  const float4 threshold = (float4)(t0, t1, t2, t3);
-  const float4 boost     = (float4)(b0, b1, b2, b3);
-  float4 c = read_imagef(coarse, sampleri, (int2)(x, y));
-  float4 d = read_imagef(detail, sampleri, (int2)(x, y));
-  float4 amount = copysign(max((float4)(0.0f), fabs(d) - threshold), d);
+  const float4 c = read_imagef(coarse, sampleri, (int2)(x, y));
+  const float4 d = read_imagef(detail, sampleri, (int2)(x, y));
+  const float4 amount = copysign(fmax((float4)(0.0f), fabs(d) - threshold), d);
   float4 sum = c + boost*amount;
   sum.w = c.w;
   write_imagef (out, (int2)(x, y), sum);
 }
 
+__kernel void
+eaw_zero(__write_only image2d_t out,
+         const int width,
+         const int height)
+{
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+  if(x >= width || y >= height) return;
+  write_imagef(out, (int2)(x, y), (float4)0.0f);
+}
+
+__kernel void
+eaw_addbuffers(__write_only image2d_t out_out,
+               __read_only image2d_t out_in,  
+               __read_only image2d_t diff,
+               const int width,
+               const int height)
+{
+  const int x = get_global_id(0);
+  const int y = get_global_id(1);
+  if(x >= width || y >= height) return;
+
+  const float4 cs = read_imagef(diff, sampleri, (int2)(x, y));
+  const float4 o = read_imagef(out_in, sampleri, (int2)(x, y));
+  write_imagef(out_out, (int2)(x, y), (cs + o));  
+}
\ No newline at end of file
diff --git a/src/iop/atrous.c b/src/iop/atrous.c
index 4b5e2a3b28d2..8471e4f3a370 100644
--- a/src/iop/atrous.c
+++ b/src/iop/atrous.c
@@ -1,6 +1,6 @@
 /*
     This file is part of darktable,
-    Copyright (C) 2010-2025 darktable developers.
+    Copyright (C) 2010-2026 darktable developers.
 
     darktable is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -279,7 +279,7 @@ static void process_wavelets(dt_iop_module_t *self,
                              const dt_iop_roi_t *const roi_in,
                              const dt_iop_roi_t *const roi_out)
 {
-  dt_iop_atrous_data_t *d = piece->data;
+  const dt_iop_atrous_data_t *d = piece->data;
   dt_aligned_pixel_t thrs[MAX_NUM_SCALES];
   dt_aligned_pixel_t boost[MAX_NUM_SCALES];
   float sharp[MAX_NUM_SCALES];
@@ -360,8 +360,6 @@ void process(dt_iop_module_t *self,
 #ifdef HAVE_OPENCL
 
 #ifdef USE_NEW_CL
-/* this version is adapted to the new global tiling mechanism. it no
- * longer does tiling by itself. */
 int process_cl(dt_iop_module_t *self,
                dt_dev_pixelpipe_iop_t *piece,
                cl_mem dev_in,
@@ -369,7 +367,10 @@ int process_cl(dt_iop_module_t *self,
                const dt_iop_roi_t *const roi_in,
                const dt_iop_roi_t *const roi_out)
 {
-  dt_iop_atrous_data_t *d = piece->data;
+  const dt_iop_atrous_data_t *d = piece->data;
+  const dt_iop_atrous_global_data_t *gd = self->global_data;
+  const int devid = piece->pipe->devid;
+
   dt_aligned_pixel_t thrs[MAX_NUM_SCALES];
   dt_aligned_pixel_t boost[MAX_NUM_SCALES];
   float sharp[MAX_NUM_SCALES];
@@ -384,100 +385,81 @@ int process_cl(dt_iop_module_t *self,
     // dt_control_queue_draw(GTK_WIDGET(g->area));
   }
 
-  dt_iop_atrous_global_data_t *gd = self->global_data;
-
-  const int devid = piece->pipe->devid;
   cl_int err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
   cl_mem dev_filter = NULL;
   cl_mem dev_tmp = NULL;
   cl_mem dev_tmp2 = NULL;
   cl_mem dev_detail = NULL;
 
-  float m[] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f }; // 1/16, 4/16, 6/16, 4/16, 1/16
+  const float m[5] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f }; // 1/16, 4/16, 6/16, 4/16, 1/16
   float mm[5][5];
   for(int j = 0; j < 5; j++)
     for(int i = 0; i < 5; i++) mm[j][i] = m[i] * m[j];
 
+  const int width = roi_out->width;
+  const int height = roi_out->height;
   dev_filter = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 25, mm);
-  if(dev_filter == NULL) goto error;
 
   /* allocate space for two temporary buffer to participate_in in the
-     buffer ping-pong below.  We need dev_out to accumulate the result
-     and dev_in needs to stay unchanged for blendops */
-  dev_tmp = dt_opencl_alloc_device
-    (devid, roi_out->width, roi_out->height, sizeof(float) * 4);
-  if(dev_tmp == NULL) goto error;
-  dev_tmp2 = dt_opencl_alloc_device
-    (devid, roi_out->width, roi_out->height, sizeof(float) * 4);
-  if(dev_tmp2 == NULL) goto error;
-
+     buffer ping-pong below.
+     We need dev_out to accumulate the result
+     and dev_in must stay unchanged
+  */
+  dev_tmp = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4);
+  dev_tmp2 = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4);
   /* allocate a buffer for storing the detail information. */
-  dev_detail = dt_opencl_alloc_device
-    (devid, roi_out->width, roi_out->height, sizeof(float) * 4);
-  if(dev_detail == NULL) goto error;
-
-  const int width = roi_out->width;
-  const int height = roi_out->height;
-  size_t sizes[] = { ROUNDUPDWD(width, devid), ROUNDUPDHT(height, devid), 1 };
+  dev_detail = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4);
+  if(!dev_detail || !dev_tmp || !dev_tmp2 || !dev_filter) goto error;
 
   // clear dev_out to zeros, as we will be incrementally accumulating results there
-  dt_opencl_set_kernel_args(devid, gd->kernel_zero, 0, CLARG(dev_out));
-  err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_zero, sizes);
+  err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_zero, width, height,
+    CLARG(dev_out), CLARG(width), CLARG(height));
   if(err != CL_SUCCESS) goto error;
 
   // the buffers for the buffer ping-pong.  We start with dev_in as
   // the input half for the first scale, then switch to using dev_tmp
-  // and dev_tmp2 as the two scratch buffers
-  void* dev_buf1 = &dev_in;
-  void* dev_buf2 = &dev_tmp;
+  // and dev_tmp2 as the two scratch buffers at the end of scaling loop
+  cl_mem pp_in = dev_in;
+  cl_mem pp_coarse = dev_tmp;
 
   /* decompose image into detail scales and coarse (the latter is left
-   * in dev_tmp or dev_out) */
+     in dev_tmp or dev_out)
+  */
   for(int s = 0; s < max_scale; s++)
   {
     const int scale = s;
 
     // run the decomposition
-    dt_opencl_set_kernel_args(devid, gd->kernel_decompose, 0,
-                              CLARG(dev_buf2), CLARG(dev_buf1), CLARG(dev_detail),
+    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_decompose, width, height,
+                              CLARG(pp_in), CLARG(pp_coarse), CLARG(dev_detail),
                               CLARG(width), CLARG(height),
                               CLARG(scale), CLARG(sharp[s]), CLARG(dev_filter));
-
-    err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_decompose, sizes);
     if(err != CL_SUCCESS) goto error;
 
-    // indirectly give gpu some air to breathe (and to do display related stuff)
-    dt_opencl_micro_nap(devid);
-
     // now immediately run the synthesis for the current scale, accumulating the details into dev_out
-    dt_opencl_set_kernel_args(devid, gd->kernel_synthesize, 0,
-                              CLARG(dev_out), CLARG(dev_out), CLARG(dev_detail),
+    // dev_out as the accumulator must be given twice as an OpenCL 1.2 workaround
+    // Is this safe here? Or would we need another temp buffer and accumulate?
+    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_synthesize, width, height,
+                              CLARG(dev_out), CLARG(pp_coarse), CLARG(dev_detail),
                               CLARG(width), CLARG(height),
-                              CLARG(thrs[scale][0]), CLARG(thrs[scale][1]),
-                              CLARG(thrs[scale][2]), CLARG(thrs[scale][3]),
-                              CLARG(boost[scale][0]), CLARG(boost[scale][1]),
-                              CLARG(boost[scale][2]), CLARG(boost[scale][3]));
-
-    err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_synthesize, sizes);
+                              CLFLARRAY(4, &thrs[scale]), CLFLARRAY(4, &boost[scale]));
     if(err != CL_SUCCESS) goto error;
 
-    // indirectly give gpu some air to breathe (and to do display related stuff)
-    dt_opencl_micro_nap(devid);
-
-    // swap scratch buffers
-    if(scale == 0) dev_buf1 = dev_tmp2;
-    void* tmp = dev_buf2;
-    dev_buf2 = dev_buf1;
-    dev_buf1 = tmp;
+    // swap scratch buffers but leave as is for the final round to keep pp_coarse correct
+    if(s != max_scale -1)
+    {
+      cl_mem tmp = (s == 0) ? dev_tmp2 : pp_in;
+      pp_in = pp_coarse;
+      pp_coarse = tmp;
+    }
   }
 
   // add the residue (the coarse scale from the final decomposition)
   // to the accumulated details
-  dt_opencl_set_kernel_args
-    (devid, gd->kernel_addbuffers, 0, CLARG(dev_out), CLARG(dev_buf1));
-
-  err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_addbuffers, sizes);
-
+  // the workaround for the OpenCL 1.2 restriction (dev_out passed as both output and input) is safe with this kernel
+  err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_addbuffers, width, height,
+          CLARG(dev_out), CLARG(dev_out), CLARG(pp_coarse),
+          CLARG(width), CLARG(height));
 error:
   dt_opencl_release_mem_object(dev_filter);
   dt_opencl_release_mem_object(dev_tmp);
@@ -495,7 +477,10 @@ int process_cl(dt_iop_module_t *self,
                const dt_iop_roi_t *const roi_in,
                const dt_iop_roi_t *const roi_out)
 {
-  dt_iop_atrous_data_t *d = piece->data;
+  const dt_iop_atrous_data_t *d = piece->data;
+  const dt_iop_atrous_global_data_t *gd = self->global_data;
+  const int devid = piece->pipe->devid;
+
   dt_aligned_pixel_t thrs[MAX_NUM_SCALES];
   dt_aligned_pixel_t boost[MAX_NUM_SCALES];
   float sharp[MAX_NUM_SCALES];
@@ -510,45 +495,38 @@ int process_cl(dt_iop_module_t *self,
     // dt_control_queue_draw(GTK_WIDGET(g->area));
   }
 
-  dt_iop_atrous_global_data_t *gd = self->global_data;
-
-  const int devid = piece->pipe->devid;
   cl_int err = DT_OPENCL_DEFAULT_ERROR;
   cl_mem dev_filter = NULL;
   cl_mem dev_tmp = NULL;
   cl_mem *dev_detail = calloc(max_scale, sizeof(cl_mem));
 
-  float m[] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f }; // 1/16, 4/16, 6/16, 4/16, 1/16
+  const float m[5] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f }; // 1/16, 4/16, 6/16, 4/16, 1/16
   float mm[5][5];
   for(int j = 0; j < 5; j++)
     for(int i = 0; i < 5; i++)
       mm[j][i] = m[i] * m[j];
 
+  const int width = roi_out->width;
+  const int height = roi_out->height;
+
   dev_filter = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 25, mm);
-  if(dev_filter == NULL) goto error;
 
   /* allocate space for a temporary buffer. we don't want to use
      dev_in in the buffer ping-pong below, as we need to keep it for
      blendops */
-  dev_tmp = dt_opencl_alloc_device
-    (devid, roi_out->width, roi_out->height, sizeof(float) * 4);
-  if(dev_tmp == NULL) goto error;
+  dev_tmp = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4);
+  if(!dev_tmp || !dev_filter) goto error;
 
   /* allocate space to store detail information. Requires a number of
    * additional buffers, each with full image size */
   for(int k = 0; k < max_scale; k++)
   {
-    dev_detail[k] = dt_opencl_alloc_device
-      (devid, roi_out->width, roi_out->height, sizeof(float) * 4);
+    dev_detail[k] = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4);
     if(dev_detail[k] == NULL) goto error;
   }
 
-  const int width = roi_out->width;
-  const int height = roi_out->height;
-  size_t sizes[] = { ROUNDUPDWD(width, devid), ROUNDUPDHT(height, devid), 1 };
   size_t origin[] = { 0, 0, 0 };
   size_t region[] = { width, height, 1 };
-
   // copy original input from dev_in -> dev_out as starting point
   err = dt_opencl_enqueue_copy_image(devid, dev_in, dev_out, origin, origin, region);
   if(err != CL_SUCCESS) goto error;
@@ -560,55 +538,36 @@ int process_cl(dt_iop_module_t *self,
     const int scale = s;
 
     if(s & 1)
-    {
-      dt_opencl_set_kernel_args(devid, gd->kernel_decompose, 0,
-                                CLARG(dev_tmp), CLARG(dev_out));
-    }
+      err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_decompose, width, height,
+                                CLARG(dev_tmp), CLARG(dev_out),
+                                CLARG(dev_detail[s]), CLARG(width), CLARG(height),
+                                CLARG(scale), CLARG(sharp[s]), CLARG(dev_filter));
     else
-    {
-      dt_opencl_set_kernel_args(devid, gd->kernel_decompose, 0,
-                                CLARG(dev_out), CLARG(dev_tmp));
-    }
-    dt_opencl_set_kernel_args(devid, gd->kernel_decompose, 2,
-                              CLARG(dev_detail[s]), CLARG(width), CLARG(height),
-                              CLARG(scale), CLARG(sharp[s]), CLARG(dev_filter));
-
-    err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_decompose, sizes);
+      err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_decompose, width, height,
+                                CLARG(dev_out), CLARG(dev_tmp),
+                                CLARG(dev_detail[s]), CLARG(width), CLARG(height),
+                                CLARG(scale), CLARG(sharp[s]), CLARG(dev_filter));
     if(err != CL_SUCCESS) goto error;
-
-    // indirectly give gpu some air to breathe (and to do display related stuff)
-    dt_opencl_micro_nap(devid);
   }
 
   /* now synthesize again */
   for(int scale = max_scale - 1; scale >= 0; scale--)
   {
     if(scale & 1)
-    {
-      dt_opencl_set_kernel_args(devid, gd->kernel_synthesize, 0,
-                                CLARG(dev_tmp), CLARG(dev_out));
-    }
-    else
-    {
-      dt_opencl_set_kernel_args(devid, gd->kernel_synthesize, 0,
-                                CLARG(dev_out), CLARG(dev_tmp));
-    }
-
-    dt_opencl_set_kernel_args(devid, gd->kernel_synthesize, 2,
-                              CLARG(dev_detail[scale]), CLARG(width),
-                              CLARG(height), CLARG(thrs[scale][0]),
-                              CLARG(thrs[scale][1]), CLARG(thrs[scale][2]),
-                              CLARG(thrs[scale][3]), CLARG(boost[scale][0]),
-                              CLARG(boost[scale][1]), CLARG(boost[scale][2]),
-                              CLARG(boost[scale][3]));
+      err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_synthesize, width, height,
+                              CLARG(dev_tmp), CLARG(dev_out),
+                              CLARG(dev_detail[scale]),
+                              CLARG(width), CLARG(height),
+                              CLFLARRAY(4, &thrs[scale]), CLFLARRAY(4, &boost[scale]));  
 
-    err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_synthesize, sizes);
+    else
+      err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_synthesize, width, height,
+                              CLARG(dev_out), CLARG(dev_tmp),
+                              CLARG(dev_detail[scale]),
+                              CLARG(width), CLARG(height),
+                              CLFLARRAY(4, &thrs[scale]), CLFLARRAY(4, &boost[scale]));  
     if(err != CL_SUCCESS) goto error;
-
-    // indirectly give gpu some air to breathe (and to do display related stuff)
-    dt_opencl_micro_nap(devid);
   }
-
   dt_opencl_finish_sync_pipe(devid, piece->pipe->type);
 
 error:
@@ -637,7 +596,7 @@ void tiling_callback(dt_iop_module_t *self,
   const int max_filter_radius = 2 * (1 << max_scale); // 2 * 2^max_scale
 
   tiling->factor = 4.0f;                // in + out + 2*tmp
-  tiling->factor_cl = 3.0f + max_scale; // in + out + tmp + scale buffers
+  tiling->factor_cl = 5.0f; // in + out + details + 2*tmp
   tiling->maxbuf = 1.0f;
   tiling->maxbuf_cl = 1.0f;
   tiling->overhead = 0;
@@ -709,16 +668,6 @@ void commit_params(dt_iop_module_t *self,
   dt_iop_atrous_params_t *p = (dt_iop_atrous_params_t *)params;
   dt_iop_atrous_data_t *d = piece->data;
 
-#if 0
-  printf("---------- atrous preset begin\n");
-  printf("p.octaves = %d;  p.mix = %.2f\n", p->octaves, p->mix);
-  for(int ch=0; ch<atrous_none; ch++) for(int k=0; k<BANDS; k++)
-    {
-      printf("p.x[%d][%d] = %f;\n", ch, k, p->x[ch][k]);
-      printf("p.y[%d][%d] = %f;\n", ch, k, p->y[ch][k]);
-    }
-  printf("---------- atrous preset end\n");
-#endif
   d->octaves = p->octaves;
   for(int ch = 0; ch < atrous_none; ch++)
     for(int k = 0; k < BANDS; k++)

From 9aec41a0d116aa658d21caa7ec25172f5a52d8fc Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Sat, 7 Feb 2026 19:24:15 +0100
Subject: [PATCH 6/9] Denoise profile OpenCL maintenance

1. All non-local OpenCL kernel calls now use the _args() variant
2. Deduplicated code for the old/new variants (still using the "old" code)
3. Fixed some possible cl_mem leaks if kernels fail
---
 src/iop/denoiseprofile.c | 336 ++++++++++++---------------------------
 1 file changed, 101 insertions(+), 235 deletions(-)

diff --git a/src/iop/denoiseprofile.c b/src/iop/denoiseprofile.c
index 73a5932af7b0..5ed4e4b035c8 100644
--- a/src/iop/denoiseprofile.c
+++ b/src/iop/denoiseprofile.c
@@ -1,6 +1,6 @@
 /*
     This file is part of darktable,
-    Copyright (C) 2012-2024 darktable developers.
+    Copyright (C) 2012-2026 darktable developers.
 
     darktable is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -1976,13 +1976,14 @@ static int process_nlmeans_cl(dt_iop_module_t *self,
                               const dt_iop_roi_t *const roi_in,
                               const dt_iop_roi_t *const roi_out)
 {
-  dt_iop_denoiseprofile_data_t *d = piece->data;
-  dt_iop_denoiseprofile_global_data_t *gd = self->global_data;
-#if USE_NEW_IMPL_CL
+  const dt_iop_denoiseprofile_data_t *d = piece->data;
+  const dt_iop_denoiseprofile_global_data_t *gd = self->global_data;
+
   const int width = roi_in->width;
   const int height = roi_in->height;
+  const int devid = piece->pipe->devid;
 
-  cl_int err = DT_OPENCL_DEFAULT_ERROR;
+  cl_int err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
 
   const float scale = fminf(fminf(roi_in->scale, 2.0f) / fmaxf(piece->iscale, 1.0f), 1.0f);
   const int P = ceilf(d->radius * scale); // pixel filter size
@@ -1995,43 +1996,32 @@ static int process_nlmeans_cl(dt_iop_module_t *self,
   dt_aligned_pixel_t p;
   dt_aligned_pixel_t aa;
   dt_aligned_pixel_t bb;
-  (void)nlmeans_precondition_cl(d,piece,wb,scale,aa,bb,p);
 
-  // allocate a buffer for a preconditioned copy of the image
-  const int devid = piece->pipe->devid;
+  nlmeans_precondition_cl(d,piece,wb,scale,aa,bb,p);
+
   cl_mem dev_tmp = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4);
-  if(dev_tmp == NULL) return CL_MEM_OBJECT_ALLOCATION_FAILURE;
+  cl_mem dev_U2 = dt_opencl_alloc_device_buffer(devid, sizeof(float) * 4 * width * height);
+  if(!dev_tmp || !dev_U2) goto final;
 
-  const size_t sizes[] = { ROUNDUPDWD(width, devid), ROUNDUPDHT(height, devid), 1 };
-  const float sigma2[4] = { (bb[0] / aa[0]) * (bb[0] / aa[0]),
-                            (bb[1] / aa[1]) * (bb[1] / aa[1]),
-                            (bb[2] / aa[2]) * (bb[2] / aa[2]),
-                            0.0f };
+  const dt_aligned_pixel_t sigma2 = { (bb[0] / aa[0]) * (bb[0] / aa[0]),
+                                      (bb[1] / aa[1]) * (bb[1] / aa[1]),
+                                      (bb[2] / aa[2]) * (bb[2] / aa[2]),
+                                      0.0f };
 
   if(!d->use_new_vst)
-  {
-    dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_precondition,
-                              0, CLARG(dev_in), CLARG(dev_tmp),
-      CLARG(width), CLARG(height), CLARG(aa), CLARG(sigma2));
-    err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_precondition, sizes);
-  }
+    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_precondition, width, height,
+            CLARG(dev_in), CLARG(dev_tmp),
+            CLARG(width), CLARG(height), CLARG(aa), CLARG(sigma2));      
   else
-  {
-    dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_precondition_v2,
-                              0, CLARG(dev_in), CLARG(dev_tmp),
-      CLARG(width), CLARG(height), CLARG(aa), CLARG(p), CLARG(bb), CLARG(wb));
-    err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_precondition_v2,
-                                      sizes);
-  }
+    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_precondition_v2, width, height,
+            CLARG(dev_in), CLARG(dev_tmp),
+            CLARG(width), CLARG(height), CLARG(aa), CLARG(p), CLARG(bb), CLARG(wb));      
+  if(err != CL_SUCCESS) goto final;
 
-  // allocate a buffer to receive the denoised image
-  cl_mem dev_U2 = dt_opencl_alloc_device_buffer(devid, sizeof(float) * 4 * width * height);
-  if(dev_U2 == NULL) err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+#if USE_NEW_IMPL_CL
 
-  if(err == CL_SUCCESS)
-  {
-    const dt_aligned_pixel_t norm2 = { 1.0f, 1.0f, 1.0f, 1.0f };
-    const dt_nlmeans_param_t params =
+  const dt_aligned_pixel_t norm2 = { 1.0f, 1.0f, 1.0f, 1.0f };
+  const dt_nlmeans_param_t params =
       {
         .scattering = scattering,
         .scale = scale,
@@ -2050,64 +2040,13 @@ static int process_nlmeans_cl(dt_iop_module_t *self,
         .kernel_vert = gd->kernel_denoiseprofile_vert,
         .kernel_accu = gd->kernel_denoiseprofile_accu
       };
-    err = nlmeans_denoiseprofile_cl(&params, devid, dev_tmp, dev_U2, roi_in);
-  }
-  if(err == CL_SUCCESS)
-  {
-    if(!d->use_new_vst)
-    {
-      dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_finish,
-                                0, CLARG(dev_in), CLARG(dev_U2),
-        CLARG(dev_out), CLARG(width), CLARG(height), CLARG(aa), CLARG(sigma2));
-      err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_finish, sizes);
-    }
-    else
-    {
-      const float bias = d->bias - 0.5 * logf(scale);
-      dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_finish_v2, 0,
-                                CLARG(dev_in), CLARG(dev_U2),
-                                CLARG(dev_out), CLARG(width), CLARG(height),
-                                CLARG(aa), CLARG(p),
-                                CLARG(bb), CLARG(bias), CLARG(wb));
-      err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_finish_v2, sizes);
-    }
-  }
-  dt_opencl_release_mem_object(dev_U2);
-  dt_opencl_release_mem_object(dev_tmp);
-  return err;
+  err = nlmeans_denoiseprofile_cl(&params, devid, dev_tmp, dev_U2, roi_in);
+  if(err != CL_SUCCESS) goto final;
 
-#else
-  const int width = roi_in->width;
-  const int height = roi_in->height;
-
-  cl_int err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
-
-  const float scale = fminf(fminf(roi_in->scale, 2.0f) / fmaxf(piece->iscale, 1.0f), 1.0f);
-  const int P = ceilf(d->radius * scale); // pixel filter size
-  int K = d->nbhood; // nbhood
-  const float scattering = nlmeans_scattering(&K,d,piece,scale);
-  const float norm = nlmeans_norm(P,d);
-  const float central_pixel_weight = d->central_pixel_weight * scale;
-
-  dt_aligned_pixel_t wb;
-  dt_aligned_pixel_t p;
-  dt_aligned_pixel_t aa;
-  dt_aligned_pixel_t bb;
-  (void)nlmeans_precondition_cl(d,piece,wb,scale,aa,bb,p);
-
-  const dt_aligned_pixel_t sigma2 = { (bb[0] / aa[0]) * (bb[0] / aa[0]),
-                                      (bb[1] / aa[1]) * (bb[1] / aa[1]),
-                                      (bb[2] / aa[2]) * (bb[2] / aa[2]),
-                                      0.0f };
-
-  const int devid = piece->pipe->devid;
-  cl_mem dev_tmp = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4);
-  if(dev_tmp == NULL) goto error;
-
-  cl_mem dev_U2 = dt_opencl_alloc_device_buffer(devid, sizeof(float) * 4 * width * height);
-  if(dev_U2 == NULL) goto error;
+#else // old&current code
 
   cl_mem buckets[NUM_BUCKETS] = { NULL };
+
   unsigned int state = 0;
   for(int k = 0; k < NUM_BUCKETS; k++)
   {
@@ -2147,38 +2086,14 @@ static int process_nlmeans_cl(dt_iop_module_t *self,
   else
     vblocksize = 1;
 
-
-  const size_t sizes[] = { ROUNDUPDWD(width, devid), ROUNDUPDHT(height, devid), 1 };
-  size_t sizesl[3];
-  size_t local[3];
-
-  if(!d->use_new_vst)
-  {
-    dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_precondition,
-                              0, CLARG(dev_in), CLARG(dev_tmp),
-      CLARG(width), CLARG(height), CLARG(aa), CLARG(sigma2));
-    err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_precondition, sizes);
-    if(err != CL_SUCCESS) goto error;
-  }
-  else
-  {
-    dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_precondition_v2, 0,
-                              CLARG(dev_in), CLARG(dev_tmp),
-                              CLARG(width), CLARG(height),
-                              CLARG(aa), CLARG(p), CLARG(bb), CLARG(wb));
-    err = dt_opencl_enqueue_kernel_2d(devid,
-                                      gd->kernel_denoiseprofile_precondition_v2, sizes);
-    if(err != CL_SUCCESS) goto error;
-  }
-
-  dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_init, 0,
-                            CLARG(dev_U2), CLARG(width),
-                            CLARG(height));
-  err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_init, sizes);
+  err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_init, width, height,
+            CLARG(dev_U2), CLARG(width), CLARG(height));
   if(err != CL_SUCCESS) goto error;
 
   const size_t bwidth = ROUNDUP(width, hblocksize);
   const size_t bheight = ROUNDUP(height, vblocksize);
+  size_t sizesl[3];
+  size_t local[3];
 
   for(int kj_index = -K; kj_index <= 0; kj_index++)
   {
@@ -2200,10 +2115,8 @@ static int process_nlmeans_cl(dt_iop_module_t *self,
       int q[2] = { i, j };
 
       cl_mem dev_U4 = buckets[bucket_next(&state, NUM_BUCKETS)];
-      dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_dist, 0,
-                                CLARG(dev_tmp), CLARG(dev_U4),
-                                CLARG(width), CLARG(height), CLARG(q));
-      err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_dist, sizes);
+      err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_dist, width, height,
+                CLARG(dev_tmp), CLARG(dev_U4), CLARG(width), CLARG(height), CLARG(q));
       if(err != CL_SUCCESS) goto error;
 
       sizesl[0] = bwidth;
@@ -2235,53 +2148,48 @@ static int process_nlmeans_cl(dt_iop_module_t *self,
                                 CLARG(q), CLARG(P), CLARG(norm),
                                 CLLOCAL(sizeof(float) * (vblocksize + 2 * P)),
                                 CLARG(central_pixel_weight), CLARG(dev_U4));
-      err = dt_opencl_enqueue_kernel_2d_with_local
-        (devid,
-         gd->kernel_denoiseprofile_vert, sizesl, local);
+      err = dt_opencl_enqueue_kernel_2d_with_local(devid,
+                                                   gd->kernel_denoiseprofile_vert,
+                                                   sizesl, local);
       if(err != CL_SUCCESS) goto error;
 
-
-      dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_accu,
-                                0, CLARG(dev_tmp), CLARG(dev_U2),
-                                CLARG(dev_U4_tt), CLARG(width),
-                                CLARG(height), CLARG(q));
-      err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_accu, sizes);
+      err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_accu, width, height,
+                          CLARG(dev_tmp), CLARG(dev_U2), CLARG(dev_U4_tt),
+                          CLARG(width), CLARG(height), CLARG(q));        
       if(err != CL_SUCCESS) goto error;
-
       dt_opencl_finish_sync_pipe(devid, piece->pipe->type);
-
-      // indirectly give gpu some air to breathe (and to do display related stuff)
-      dt_opencl_micro_nap(devid);
     }
   }
 
-  if(!d->use_new_vst)
+ error:
+  for(int k = 0; k < NUM_BUCKETS; k++)
+    dt_opencl_release_mem_object(buckets[k]);
+
+#endif /* shared finalize USE_NEW_IMPL_CL */
+
+  if(err == CL_SUCCESS)
   {
-    dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_finish, 0,
+    if(!d->use_new_vst)
+    {
+      err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_finish, width, height,
                               CLARG(dev_in), CLARG(dev_U2),
                               CLARG(dev_out), CLARG(width), CLARG(height),
                               CLARG(aa), CLARG(sigma2));
-    err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_finish, sizes);
-  }
-  else
-  {
-    const float bias = d->bias - 0.5 * logf(scale);
-    dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_finish_v2, 0,
+    }
+    else
+    {
+      const float bias = d->bias - 0.5 * logf(scale);
+      err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_finish_v2, width, height,
                               CLARG(dev_in), CLARG(dev_U2),
                               CLARG(dev_out), CLARG(width), CLARG(height),
                               CLARG(aa), CLARG(p), CLARG(bb), CLARG(bias), CLARG(wb));
-    err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_finish_v2, sizes);
+    }
   }
 
- error:
-  for(int k = 0; k < NUM_BUCKETS; k++)
-  {
-    dt_opencl_release_mem_object(buckets[k]);
-  }
+final:
   dt_opencl_release_mem_object(dev_U2);
   dt_opencl_release_mem_object(dev_tmp);
   return err;
-#endif /* USE_NEW_IMPL_CL */
 }
 
 
@@ -2292,8 +2200,8 @@ static int process_wavelets_cl(dt_iop_module_t *self,
                                const dt_iop_roi_t *const roi_in,
                                const dt_iop_roi_t *const roi_out)
 {
-  dt_iop_denoiseprofile_data_t *d = piece->data;
-  dt_iop_denoiseprofile_global_data_t *gd = self->global_data;
+  const dt_iop_denoiseprofile_data_t *d = piece->data;
+  const dt_iop_denoiseprofile_global_data_t *gd = self->global_data;
 
   const int max_max_scale = DT_IOP_DENOISE_PROFILE_BANDS; // hard limit
   int max_scale = 0;
@@ -2328,6 +2236,8 @@ static int process_wavelets_cl(dt_iop_module_t *self,
   cl_mem dev_m = NULL;
   cl_mem dev_r = NULL;
   cl_mem dev_filter = NULL;
+  cl_mem dev_Y0U0V0 = NULL;
+  cl_mem dev_RGB = NULL;
   cl_mem *dev_detail = calloc(max_max_scale, sizeof(cl_mem));
   float *sumsum = NULL;
 
@@ -2379,18 +2289,13 @@ static int process_wavelets_cl(dt_iop_module_t *self,
   const int reducesize = MIN(REDUCESIZE, ROUNDUP(bufsize, slocopt.sizex) / slocopt.sizex);
   err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
   dev_m = dt_opencl_alloc_device_buffer(devid, sizeof(float) * 4 * bufsize);
-  if(dev_m == NULL) goto error;
-
   dev_r = dt_opencl_alloc_device_buffer(devid, sizeof(float) * 4 * reducesize);
-  if(dev_r == NULL) goto error;
-
   sumsum = dt_alloc_align_float((size_t)4 * reducesize);
-  if(sumsum == NULL) goto error;
-
   dev_tmp = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4);
-  if(dev_tmp == NULL) goto error;
 
-  float m[] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f }; // 1/16, 4/16, 6/16, 4/16, 1/16
+  if(!dev_tmp || !dev_r || !dev_m || !sumsum) goto error;
+
+  const float m[] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f }; // 1/16, 4/16, 6/16, 4/16, 1/16
   float mm[5][5];
   for(int j = 0; j < 5; j++)
     for(int i = 0; i < 5; i++) mm[j][i] = m[i] * m[j];
@@ -2463,41 +2368,27 @@ static int process_wavelets_cl(dt_iop_module_t *self,
     }
   }
 
-  size_t sizes[] = { ROUNDUPDWD(width, devid), ROUNDUPDHT(height, devid), 1 };
-
   if(!d->use_new_vst)
   {
-    dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_precondition,
-                              0, CLARG(dev_in), CLARG(dev_out),
-                              CLARG(width), CLARG(height), CLARG(aa), CLARG(sigma2));
-    err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_precondition, sizes);
-    if(err != CL_SUCCESS) goto error;
+    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_precondition, width, height,
+            CLARG(dev_in), CLARG(dev_out), CLARG(width), CLARG(height), CLARG(aa), CLARG(sigma2));
   }
   else if(d->wavelet_color_mode == MODE_RGB)
   {
-    dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_precondition_v2,
-                              0, CLARG(dev_in), CLARG(dev_out),
-                              CLARG(width), CLARG(height),
-                              CLARG(aa), CLARG(p), CLARG(bb), CLARG(wb));
-    err = dt_opencl_enqueue_kernel_2d(devid,
-                                      gd->kernel_denoiseprofile_precondition_v2, sizes);
-    if(err != CL_SUCCESS) goto error;
+    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_precondition_v2, width, height,
+            CLARG(dev_in), CLARG(dev_out), CLARG(width), CLARG(height), CLARG(aa), CLARG(p), CLARG(bb), CLARG(wb));
   }
   else
   {
     err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
-    cl_mem dev_Y0U0V0 = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 9, toY0U0V0);
+    dev_Y0U0V0 = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 9, toY0U0V0);
     if(dev_Y0U0V0 == NULL) goto error;
-
-    dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_precondition_Y0U0V0,
-                                0, CLARG(dev_in),
-                                CLARG(dev_out), CLARG(width), CLARG(height),
-                                CLARG(aa), CLARG(p), CLARG(bb), CLARG(dev_Y0U0V0));
-    err = dt_opencl_enqueue_kernel_2d(devid,
-                                      gd->kernel_denoiseprofile_precondition_Y0U0V0,
-                                      sizes);
+    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_precondition_Y0U0V0, width, height,
+            CLARG(dev_in), CLARG(dev_out), CLARG(width), CLARG(height),
+            CLARG(aa), CLARG(p), CLARG(bb), CLARG(dev_Y0U0V0));
     dt_opencl_release_mem_object(dev_Y0U0V0);
   }
+  if(err != CL_SUCCESS) goto error;
 
   dev_buf1 = dev_out;
   dev_buf2 = dev_tmp;
@@ -2510,16 +2401,12 @@ static int process_wavelets_cl(dt_iop_module_t *self,
     const float sigma_band = powf(varf, s) * sigma;
     const float inv_sigma2 = 1.0f / (sigma_band * sigma_band);
 
-    dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_decompose,
-                              0, CLARG(dev_buf1), CLARG(dev_buf2),
-                              CLARG(dev_detail[s]), CLARG(width), CLARG(height),
-                              CLARG(s), CLARG(inv_sigma2), CLARG(dev_filter));
-    err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_decompose, sizes);
+    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_decompose, width, height,
+            CLARG(dev_buf1), CLARG(dev_buf2),
+            CLARG(dev_detail[s]), CLARG(width), CLARG(height),
+            CLARG(s), CLARG(inv_sigma2), CLARG(dev_filter));
     if(err != CL_SUCCESS) goto error;
 
-    // indirectly give gpu some air to breathe (and to do display related stuff)
-    dt_opencl_micro_nap(devid);
-
     // swap buffers
     cl_mem dev_buf3 = dev_buf2;
     dev_buf2 = dev_buf1;
@@ -2547,15 +2434,14 @@ static int process_wavelets_cl(dt_iop_module_t *self,
     llocal[0] = flocopt.sizex;
     llocal[1] = flocopt.sizey;
     llocal[2] = 1;
-    dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_reduce_first,
-                              0, CLARG((dev_detail[s])),
+    dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_reduce_first, 0,
+                              CLARG((dev_detail[s])),
                               CLARG(width), CLARG(height),
                               CLARG(dev_m),
                               CLLOCAL(sizeof(float) * 4 * flocopt.sizex * flocopt.sizey));
-    err = dt_opencl_enqueue_kernel_2d_with_local
-      (devid,
-       gd->kernel_denoiseprofile_reduce_first, lsizes,
-       llocal);
+    err = dt_opencl_enqueue_kernel_2d_with_local(devid,
+                                                 gd->kernel_denoiseprofile_reduce_first,
+                                                 lsizes, llocal);
     if(err != CL_SUCCESS) goto error;
 
 
@@ -2565,19 +2451,17 @@ static int process_wavelets_cl(dt_iop_module_t *self,
     llocal[0] = slocopt.sizex;
     llocal[1] = 1;
     llocal[2] = 1;
-    dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_reduce_second,
-                              0, CLARG(dev_m), CLARG(dev_r),
+    dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_reduce_second, 0,
+                              CLARG(dev_m), CLARG(dev_r),
                               CLARG(bufsize), CLLOCAL(sizeof(float) * 4 * slocopt.sizex));
-    err = dt_opencl_enqueue_kernel_2d_with_local
-      (devid,
-       gd->kernel_denoiseprofile_reduce_second, lsizes,
-       llocal);
+    err = dt_opencl_enqueue_kernel_2d_with_local(devid,
+                                                 gd->kernel_denoiseprofile_reduce_second,
+                                                 lsizes, llocal);
     if(err != CL_SUCCESS) goto error;
 
     err = dt_opencl_read_buffer_from_device(devid, (void *)sumsum, dev_r, 0,
                                             sizeof(float) * 4 * reducesize, CL_TRUE);
-    if(err != CL_SUCCESS)
-      goto error;
+    if(err != CL_SUCCESS) goto error;
 
     for(int k = 0; k < reducesize; k++)
     {
@@ -2651,19 +2535,14 @@ static int process_wavelets_cl(dt_iop_module_t *self,
     // dt_print(DT_DEBUG_ALWAYS, "scale %d thrs %f %f %f", s, thrs[0], thrs[1], thrs[2]);
 
     const dt_aligned_pixel_t boost = { 1.0f, 1.0f, 1.0f, 1.0f };
-
-    dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_synthesize,
-                              0, CLARG(dev_buf1), CLARG(dev_detail[s]),
+    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_synthesize, width, height,
+                              CLARG(dev_buf1), CLARG(dev_detail[s]),
                               CLARG(dev_buf2), CLARG(width), CLARG(height),
                               CLARG(thrs[0]), CLARG(thrs[1]), CLARG(thrs[2]),
                               CLARG(thrs[3]), CLARG(boost[0]), CLARG(boost[1]),
-                              CLARG(boost[2]), CLARG(boost[3]));
-    err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_synthesize, sizes);
+                              CLARG(boost[2]), CLARG(boost[3]));
     if(err != CL_SUCCESS) goto error;
 
-    // indirectly give gpu some air to breathe (and to do display related stuff)
-    dt_opencl_micro_nap(devid);
-
     // swap buffers
     cl_mem dev_buf3 = dev_buf2;
     dev_buf2 = dev_buf1;
@@ -2683,47 +2562,34 @@ static int process_wavelets_cl(dt_iop_module_t *self,
 
   if(!d->use_new_vst)
   {
-    dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_backtransform,
-                              0, CLARG(dev_tmp), CLARG(dev_out),
-                              CLARG(width), CLARG(height), CLARG(aa), CLARG(sigma2));
-    err = dt_opencl_enqueue_kernel_2d(devid,
-                                      gd->kernel_denoiseprofile_backtransform, sizes);
-    if(err != CL_SUCCESS) goto error;
+    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_backtransform, width, height,
+            CLARG(dev_tmp), CLARG(dev_out), CLARG(width), CLARG(height), CLARG(aa), CLARG(sigma2));
   }
   else if(d->wavelet_color_mode == MODE_RGB)
   {
     const float bias = d->bias - 0.5 * logf(scale);
-    dt_opencl_set_kernel_args(devid,
-                              gd->kernel_denoiseprofile_backtransform_v2, 0,
-                              CLARG(dev_tmp),
-                              CLARG(dev_out), CLARG(width), CLARG(height),
+    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_backtransform_v2, width, height,
+                              CLARG(dev_tmp), CLARG(dev_out), CLARG(width), CLARG(height),
                               CLARG(aa), CLARG(p), CLARG(bb), CLARG(bias), CLARG(wb));
-    err = dt_opencl_enqueue_kernel_2d(devid,
-                                      gd->kernel_denoiseprofile_backtransform_v2, sizes);
-    if(err != CL_SUCCESS) goto error;
   }
   else
   {
-    cl_mem dev_RGB = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 9, toRGB);
+    err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+    dev_RGB = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 9, toRGB);
     if(dev_RGB == NULL) goto error;
 
     const float bias = d->bias - 0.5 * logf(scale);
-    dt_opencl_set_kernel_args(devid,
-                              gd->kernel_denoiseprofile_backtransform_Y0U0V0, 0,
+    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_backtransform_Y0U0V0, width, height,
                               CLARG(dev_tmp),
                               CLARG(dev_out), CLARG(width), CLARG(height),
                               CLARG(aa), CLARG(p), CLARG(bb), CLARG(bias), CLARG(wb),
                               CLARG(dev_RGB));
-    err = dt_opencl_enqueue_kernel_2d(devid,
-                                        gd->kernel_denoiseprofile_backtransform_Y0U0V0,
-                                        sizes);
-    dt_opencl_release_mem_object(dev_RGB);
-    if(err != CL_SUCCESS) goto error;
   }
-
-  dt_opencl_finish_sync_pipe(devid, piece->pipe->type);
+  if(err == CL_SUCCESS)
+    dt_opencl_finish_sync_pipe(devid, piece->pipe->type);
 
 error:
+  dt_opencl_release_mem_object(dev_RGB);
   dt_opencl_release_mem_object(dev_r);
   dt_opencl_release_mem_object(dev_m);
   dt_opencl_release_mem_object(dev_tmp);

From aa8921136f275e04b35645b44915266eb33e71aa Mon Sep 17 00:00:00 2001
From: Hanno Schwalm <hanno@schwalm-bremen.de>
Date: Mon, 9 Feb 2026 11:20:43 +0100
Subject: [PATCH 7/9] Proper mix of OpenCL blend requirements

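The OpenCL tiling requirements of blendop (factor_cl, maxbuf_cl) can now be lower
than its CPU requirements (factor, maxbuf), so aggregate the module's OpenCL values
with the blendop OpenCL values instead of the blendop CPU values.
This often avoids 1x1 OpenCL tiling in the pipe for blended modules.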
---
 src/develop/pixelpipe_hb.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/develop/pixelpipe_hb.c b/src/develop/pixelpipe_hb.c
index 486e189266d3..c70579f2816b 100644
--- a/src/develop/pixelpipe_hb.c
+++ b/src/develop/pixelpipe_hb.c
@@ -1961,9 +1961,9 @@ static gboolean _dev_pixelpipe_process_rec(dt_dev_pixelpipe_t *pipe,
 
     /* aggregate in structure tiling */
     tiling.factor = MAX(tiling.factor, tiling_blendop.factor);
-    tiling.factor_cl = MAX(tiling.factor_cl, tiling_blendop.factor);
+    tiling.factor_cl = MAX(tiling.factor_cl, tiling_blendop.factor_cl);
     tiling.maxbuf = MAX(tiling.maxbuf, tiling_blendop.maxbuf);
-    tiling.maxbuf_cl = MAX(tiling.maxbuf_cl, tiling_blendop.maxbuf);
+    tiling.maxbuf_cl = MAX(tiling.maxbuf_cl, tiling_blendop.maxbuf_cl);
     tiling.overhead = MAX(tiling.overhead, tiling_blendop.overhead);
     tiling.overlap = MAX(tiling.overlap, tiling_blendop.overlap);
   }

From f0361227888648b350eeb7e3d094b7bce15c3bcf Mon Sep 17 00:00:00 2001
From: Pascal Obry <pascal@obry.net>
Date: Mon, 9 Feb 2026 22:33:47 +0100
Subject: [PATCH 8/9] Make sure we do not mess with gtk tree path.

gtk_tree_path_prev() modifies the path it is given, and it0 (selected->data)
is still needed later. Work on a copy of the path when checking for the
first/last row, and free the copy afterwards.

Closes #20290.
---
 src/libs/masks.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/libs/masks.c b/src/libs/masks.c
index 03b9c879e805..acbfb9c0383a 100644
--- a/src/libs/masks.c
+++ b/src/libs/masks.c
@@ -941,13 +941,15 @@ static int _tree_button_pressed(GtkWidget *treeview,
         // feature only meaningful for rows with prev/next.
 
         GtkTreeIter it;
-        gtk_tree_model_get_iter(model, &it, it0);
+        GtkTreePath *item = gtk_tree_path_copy(it0);
+        gtk_tree_model_get_iter(model, &it, item);
         is_last_row = !gtk_tree_model_iter_next(model, &it);
 
-        if(!is_last_row && !gtk_tree_path_prev(it0))
+        if(!is_last_row && !gtk_tree_path_prev(item))
         {
           is_first_row = TRUE;
         }
+        gtk_tree_path_free(item);
       }
 
       for(const GList *items_iter = selected;

From 4a6dfb52dbf44cd6e756f636cebbf365f44ae33d Mon Sep 17 00:00:00 2001
From: Christian Bouhon <christian.bouhon@outlook.be>
Date: Wed, 18 Feb 2026 18:20:50 +0100
Subject: [PATCH 9/9] 20260218 implement adaptive JzAzBz shoulder extension

---
 data/kernels/basecurve.cl |  260 ++++++++-
 src/iop/basecurve.c       | 1058 +++++++++++++++++++++++++++++++++----
 2 files changed, 1212 insertions(+), 106 deletions(-)
 mode change 100644 => 100755 data/kernels/basecurve.cl
 mode change 100644 => 100755 src/iop/basecurve.c

diff --git a/data/kernels/basecurve.cl b/data/kernels/basecurve.cl
old mode 100644
new mode 100755
index 24a675fe73ab..30c222bb316c
--- a/data/kernels/basecurve.cl
+++ b/data/kernels/basecurve.cl
@@ -1,6 +1,6 @@
 /*
     This file is part of darktable,
-    copyright (c) 2016-2025 darktable developers.
+    copyright (c) 2016-2026 darktable developers.
 
     darktable is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -19,6 +19,28 @@
 #include "color_conversion.h"
 #include "rgb_norms.h"
 
+inline float _aces_tone_map(const float x)
+{
+  // Rational tone curve (2.51*x^2 + 0.03*x) / (2.43*x^2 + 0.59*x + 0.14),
+  // with the result limited to the [0, 1] range.
+  const float numerator = x * (2.51f * x + 0.03f);
+  const float denominator = x * (2.43f * x + 0.59f) + 0.14f;
+  const float mapped = numerator / denominator;
+
+  return clamp(mapped, 0.0f, 1.0f);
+}
+
+inline float _aces_20_tonemap(const float x)
+{
+  // Rational tone curve (x^2 + 0.0245786*x - 0.000090537) / (0.983729*x^2 + 0.4329510*x + 0.238081),
+  // with the result limited to the [0, 1] range.
+  const float numerator = x * (x + 0.0245786f) - 0.000090537f;
+  const float denominator = x * (0.983729f * x + 0.4329510f) + 0.238081f;
+  const float mapped = numerator / denominator;
+
+  return clamp(mapped, 0.0f, 1.0f);
+}
+
 /*
   Primary LUT lookup.  Measures the luminance of a given pixel using a selectable function, looks up that
   luminance in the configured basecurve, and then scales each channel by the result.
@@ -86,9 +108,11 @@ basecurve_legacy_lut(read_only image2d_t in, write_only image2d_t out, const int
   float4 pixel = read_imagef(in, sampleri, (int2)(x, y));
 
   // apply ev multiplier and use lut or extrapolation:
-  pixel.x = lookup_unbounded(table, mul * pixel.x, a);
-  pixel.y = lookup_unbounded(table, mul * pixel.y, a);
-  pixel.z = lookup_unbounded(table, mul * pixel.z, a);
+  float3 f = pixel.xyz * mul;
+
+  pixel.x = lookup_unbounded(table, f.x, a);
+  pixel.y = lookup_unbounded(table, f.y, a);
+  pixel.z = lookup_unbounded(table, f.z, a);
   pixel = fmax(pixel, 0.f);
   write_imagef (out, (int2)(x, y), pixel);
 }
@@ -298,14 +322,238 @@ basecurve_reconstruct(read_only image2d_t in, read_only image2d_t tmp, write_onl
 }
 
 kernel void
-basecurve_finalize(read_only image2d_t in, read_only image2d_t comb, write_only image2d_t out, const int width, const int height)
+basecurve_finalize(read_only image2d_t in, read_only image2d_t comb, write_only image2d_t out, const int width,
+                   const int height, const int workflow_mode, const float shadow_lift, const float highlight_gain,
+                   const float ucs_saturation_balance, const float gamut_strength, const float highlight_corr, const int target_gamut, const float look_opacity, const float16 look_mat, const float alpha)
 {
   const int x = get_global_id(0);
   const int y = get_global_id(1);
 
   if(x >= width || y >= height) return;
 
-  float4 pixel = fmax(read_imagef(comb, sampleri, (int2)(x, y)), 0.f);
+  float4 pixel = read_imagef(comb, sampleri, (int2)(x, y));
+
+  // Bound the magnitude; legacy mode (workflow_mode == 0) also keeps the former clamp of negatives to zero
+  pixel.xyz = clamp(pixel.xyz, (workflow_mode > 0) ? -1e6f : 0.0f, 1e6f);
+
+  if(workflow_mode > 0)
+  {
+    float3 pixel_in = pixel.xyz;
+    float3 look_transformed;
+    look_transformed.x = dot(pixel_in, (float3)(look_mat.s0, look_mat.s1, look_mat.s2));
+    look_transformed.y = dot(pixel_in, (float3)(look_mat.s3, look_mat.s4, look_mat.s5));
+    look_transformed.z = dot(pixel_in, (float3)(look_mat.s6, look_mat.s7, look_mat.s8));
+
+    // Mix between original and transformed
+    pixel.xyz = mix(pixel_in, look_transformed, look_opacity);
+    pixel.xyz = fmax(pixel.xyz, 0.0f); // Anti-black artifacts
+
+    if(highlight_gain != 1.0f)
+      pixel.xyz *= highlight_gain;
+
+    if(shadow_lift != 1.0f)
+    {
+      pixel.x = (pixel.x > 0.0f) ? native_powr(pixel.x, shadow_lift) : pixel.x;
+      pixel.y = (pixel.y > 0.0f) ? native_powr(pixel.y, shadow_lift) : pixel.y;
+      pixel.z = (pixel.z > 0.0f) ? native_powr(pixel.z, shadow_lift) : pixel.z;
+    }
+
+    const float r_coeff = 0.2627f;
+    const float g_coeff = 0.6780f;
+    const float b_coeff = 0.0593f;
+
+    float y_in = pixel.x * r_coeff + pixel.y * g_coeff + pixel.z * b_coeff;
+    float y_out = y_in;
+
+    /* Scene-referred: luminance-adaptive shoulder extension for ACES-like
+       tonemapping using perceptual luminance Jz. */
+    if(workflow_mode == 1 || workflow_mode == 2)
+    {
+      float3 xyz;
+      xyz.x = 0.636958f * pixel.x + 0.144617f * pixel.y + 0.168881f * pixel.z;
+      xyz.y = 0.262700f * pixel.x + 0.677998f * pixel.y + 0.059302f * pixel.z;
+      xyz.z = 0.000000f * pixel.x + 0.028073f * pixel.y + 1.060985f * pixel.z;
+
+      xyz = fmax(xyz, (float3)(0.0f));
+
+      float4 xyz_scaled = (float4)(xyz.x * 400.0f, xyz.y * 400.0f, xyz.z * 400.0f, 0.0f);
+      float4 jab = XYZ_to_JzAzBz(xyz_scaled);
+
+      const float L = clamp(jab.x, 0.0f, 1.0f);
+      const float k = 1.0f + alpha * L * L;
+
+      const float x_scaled = y_in / k;
+      if(workflow_mode == 1)
+        y_out = _aces_tone_map(x_scaled) * k;
+      else
+        y_out = _aces_20_tonemap(x_scaled * 1.257f) * k;
+    }
+
+    float gain = y_out / fmax(y_in, 1e-6f);
+    pixel.xyz *= gain;
+
+    const float threshold = 0.80f;
+    if(y_out > threshold)
+    {
+      float factor = (y_out - threshold) / (1.0f - threshold);
+      factor = clamp(factor, 0.0f, 1.0f);
+      pixel.xyz = mix(pixel.xyz, (float3)y_out, factor);
+    }
+
+    float4 jab = (float4)(0.0f);
+    if(ucs_saturation_balance != 0.0f || gamut_strength > 0.0f || highlight_corr != 0.0f)
+    {
+      // RGB Rec2020 to XYZ D65
+      float3 xyz;
+      xyz.x = 0.636958f * pixel.x + 0.144617f * pixel.y + 0.168881f * pixel.z;
+      xyz.y = 0.262700f * pixel.x + 0.677998f * pixel.y + 0.059302f * pixel.z;
+      xyz.z = 0.000000f * pixel.x + 0.028073f * pixel.y + 1.060985f * pixel.z;
+
+      xyz = fmax(xyz, 0.0f);
+
+      // XYZ to JzAzBz
+      float4 xyz_scaled = (float4)(xyz.x * 400.0f, xyz.y * 400.0f, xyz.z * 400.0f, 0.0f);
+      jab = XYZ_to_JzAzBz(xyz_scaled);
+
+      int modified = 0;
+
+      if(ucs_saturation_balance != 0.0f)
+      {
+        // Chroma-based modulation for saturation balance
+        const float chroma = fmax(fmax(pixel.x, pixel.y), pixel.z) - fmin(fmin(pixel.x, pixel.y), pixel.z);
+        const float effective_saturation = ucs_saturation_balance * fmin(chroma * 2.0f, 1.0f);
+
+        // Apply saturation balance
+        const float Y = xyz.y;
+        const float L = native_sqrt(fmax(Y, 0.0f));
+        const float fulcrum = 0.5f;
+        const float n = (L - fulcrum) / fulcrum;
+        const float mask_shadow = 1.0f / (1.0f + dtcl_exp(n * 4.0f));
+
+        float sat_adjust = effective_saturation * (2.0f * mask_shadow - 1.0f);
+        sat_adjust *= fmin(L * 4.0f, 1.0f);
+        const float sat_factor = 1.0f + sat_adjust;
+        jab.y *= sat_factor;
+        jab.z *= sat_factor;
+        modified = 1;
+      }
+
+      if(gamut_strength > 0.0f)
+      {
+        const float Y = xyz.y;
+        const float L = native_sqrt(fmax(Y, 0.0f));
+        const float chroma_factor = 1.0f - gamut_strength * (0.2f + 0.2f * L);
+        jab.y *= chroma_factor;
+        jab.z *= chroma_factor;
+        modified = 1;
+      }
+
+      // HIGH SENSITIVITY CORRECTION
+      // Start effect at 0.20 up to 0.90. Linear transition.
+      float hl_mask = clamp((jab.x - 0.20f) / 0.70f, 0.0f, 1.0f);
+
+      if(hl_mask > 0.0f && highlight_corr != 0.0f)
+      {
+        // 1. Soft symmetric desaturation (0.75 factor)
+        float desat = 1.0f - (fabs(highlight_corr) * hl_mask * 0.75f);
+        jab.y *= desat;
+        jab.z *= desat;
+
+        // 2. Controlled Hue Rotation (2.0 factor)
+        float angle = highlight_corr * hl_mask * 2.0f;
+        float ca = native_cos(angle);
+        float sa = native_sin(angle);
+        float az = jab.y;
+        float bz = jab.z;
+
+        jab.y = az * ca - bz * sa;
+        jab.z = az * sa + bz * ca;
+        modified = 1;
+      }
+
+      if(jab.x > 0.95f)
+      {
+        const float desat = clamp((1.0f - jab.x) * 20.0f, 0.0f, 1.0f);
+        jab.y *= desat;
+        jab.z *= desat;
+        modified = 1;
+      }
+
+      if(modified)
+      {
+        // JzAzBz to XYZ
+        xyz = JzAzBz_2_XYZ(jab).xyz / 400.0f;
+
+        // XYZ D65 to RGB Rec2020
+        pixel.x =  1.716651f * xyz.x - 0.355671f * xyz.y - 0.253366f * xyz.z;
+        pixel.y = -0.666684f * xyz.x + 1.616481f * xyz.y + 0.015768f * xyz.z;
+        pixel.z =  0.017640f * xyz.x - 0.042771f * xyz.y + 0.942103f * xyz.z;
+
+        float min_val = fmin(pixel.x, fmin(pixel.y, pixel.z));
+        if(min_val < 0.0f)
+        {
+          float lum = 0.2627f * pixel.x + 0.6780f * pixel.y + 0.0593f * pixel.z;
+          if(lum > 0.0f)
+          {
+            float factor = lum / (lum - min_val);
+            pixel.xyz = lum + factor * (pixel.xyz - lum);
+          }
+        }
+        pixel.xyz = clamp(pixel.xyz, 0.0f, 1.0f);
+      }
+    }
+
+    if(gamut_strength > 0.0f)
+    {
+      float4 orig = pixel;
+
+      float Y = 0.2126f * pixel.x + 0.7152f * pixel.y + 0.0722f * pixel.z;
+      float lum_weight = clamp((Y - 0.3f) / (0.8f - 0.3f), 0.0f, 1.0f);
+      lum_weight = lum_weight * lum_weight * (3.0f - 2.0f * lum_weight);
+      float effective_strength = gamut_strength * lum_weight;
+
+      float limit = 0.90f;
+      if (target_gamut == 1) limit = 0.95f;
+      else if (target_gamut == 2) limit = 1.00f;
+
+      float threshold = limit * (1.0f - (effective_strength * 0.25f));
+      float max_val = fmax(pixel.x, fmax(pixel.y, pixel.z));
+
+      if (max_val > threshold)
+      {
+        float range = limit - threshold;
+        float delta = max_val - threshold;
+        const float compressed = threshold + range * delta / (delta + range);
+        const float factor = compressed / max_val;
+
+        float range_blue = 1.1f * range;
+        const float compressed_blue = threshold + range * delta / (delta + range_blue);
+        const float factor_blue = compressed_blue / max_val;
+
+        pixel.x *= factor;
+        pixel.y *= factor;
+        pixel.z *= factor_blue;
+      }
+      pixel = mix(orig, pixel, effective_strength);
+    }
+
+    // Final gamut check to preserve hue
+    if(pixel.x < 0.0f || pixel.x > 1.0f || pixel.y < 0.0f || pixel.y > 1.0f || pixel.z < 0.0f || pixel.z > 1.0f)
+    {
+      const float luma = 0.2627f * pixel.x + 0.6780f * pixel.y + 0.0593f * pixel.z;
+      const float target_luma = clamp(luma, 0.0f, 1.0f);
+      float t = 1.0f;
+      if (pixel.x < 0.0f) t = fmin(t, target_luma / (target_luma - pixel.x));
+      if (pixel.y < 0.0f) t = fmin(t, target_luma / (target_luma - pixel.y));
+      if (pixel.z < 0.0f) t = fmin(t, target_luma / (target_luma - pixel.z));
+      if (pixel.x > 1.0f) t = fmin(t, (1.0f - target_luma) / (pixel.x - target_luma));
+      if (pixel.y > 1.0f) t = fmin(t, (1.0f - target_luma) / (pixel.y - target_luma));
+      if (pixel.z > 1.0f) t = fmin(t, (1.0f - target_luma) / (pixel.z - target_luma));
+      t = fmax(0.0f, t);
+      pixel.xyz = target_luma + t * (pixel.xyz - target_luma);
+    }
+  }
+
   pixel.w = read_imagef(in, sampleri, (int2)(x, y)).w;
 
   write_imagef (out, (int2)(x, y), pixel);
diff --git a/src/iop/basecurve.c b/src/iop/basecurve.c
old mode 100644
new mode 100755
index 54eeb94e7533..b917b3ddfde7
--- a/src/iop/basecurve.c
+++ b/src/iop/basecurve.c
@@ -48,7 +48,7 @@
 #define MAXNODES 20
 
 
-DT_MODULE_INTROSPECTION(6, dt_iop_basecurve_params_t)
+DT_MODULE_INTROSPECTION(7, dt_iop_basecurve_params_t)
 
 typedef struct dt_iop_basecurve_node_t
 {
@@ -63,15 +63,34 @@ typedef struct dt_iop_basecurve_params_t
   dt_iop_basecurve_node_t basecurve[3][MAXNODES];
   int basecurve_nodes[3]; // $MIN: 0 $MAX: MAXNODES $DEFAULT: 0
   int basecurve_type[3];  // $MIN: 0 $MAX: MONOTONE_HERMITE $DEFAULT: MONOTONE_HERMITE
-  int exposure_fusion;    /* number of exposure fusion steps
-                             $DEFAULT: 0 $DESCRIPTION: "fusion" */
-  float exposure_stops;   /* number of stops between fusion images
-                             $MIN: 0.01 $MAX: 4.0 $DEFAULT: 1.0 $DESCRIPTION: "exposure shift" */
-  float exposure_bias;    /* whether to do exposure-fusion with over or under-exposure
-                             $MIN: -1.0 $MAX: 1.0 $DEFAULT: 1.0 $DESCRIPTION: "exposure bias" */
+  int exposure_fusion;    // number of exposure fusion steps $DEFAULT: 0 $DESCRIPTION: "fusion"
+  float exposure_stops;   // number of stops between fusion images $MIN: 0.01 $MAX: 4.0 $DEFAULT: 1.0 $DESCRIPTION: "exposure shift"
+  float exposure_bias;    // whether to do exposure-fusion with over or under-exposure $MIN: -1.0 $MAX: 1.0 $DEFAULT: 1.0 $DESCRIPTION: "exposure bias"
   dt_iop_rgb_norms_t preserve_colors; /* $DEFAULT: DT_RGB_NORM_LUMINANCE $DESCRIPTION: "preserve colors" */
+  int workflow_mode;      // $DEFAULT: 1
+  float shadow_lift;      // $MIN: 0.25 $MAX: 1.75 $DEFAULT: 1.0 $DESCRIPTION: "shadow correction"
+  float highlight_gain;   // $MIN: 0.25 $MAX: 1.75 $DEFAULT: 1.0 $DESCRIPTION: "highlight gain"
+  float ucs_saturation_balance; // $MIN: -0.75 $MAX: 0.75 $DEFAULT: 0.2 $DESCRIPTION: "balance saturation ucs"
+  float gamut_strength;   // $MIN: 0.0 $MAX: 1.0 $DEFAULT: 0.0 $DESCRIPTION: "gamut compression"
+  float highlight_corr;   // $MIN: -1.0 $MAX: 1.0 $DEFAULT: 0.0 $DESCRIPTION: "Highlight Hue/Sat"
+  int target_gamut;       // $DEFAULT: 0 $DESCRIPTION: "target gamut"
+  int color_look;         // $DEFAULT: 1 $DESCRIPTION: "color look style"
+  float look_opacity;     // $MIN: 0.1 $MAX: 1.0 $DEFAULT: 1.0 $DESCRIPTION: "look opacity"
 } dt_iop_basecurve_params_t;
 
+static const float color_looks[10][10] = { // each row lists 9 coefficients (presumably a row-major 3x3 RGB matrix - confirm against the consumer); the 10th entry of every row is left zero-initialized
+  {1.000f, 0.000f, 0.000f,  0.000f, 1.000f, 0.000f,  0.000f, 0.000f, 1.000f}, // 1. Neutral
+  {0.932f, 0.051f, 0.017f,  0.021f, 0.945f, 0.034f,  0.011f, 0.025f, 0.964f}, // 2. Natural look
+  {1.029f, -0.008f, -0.074f, -0.023f, 1.008f, 0.046f, -0.002f, 0.007f, 1.010f}, // 3. Portrait
+  {1.084f, -0.006f, -0.093f, -0.074f, 1.008f, 0.060f, -0.011f, 0.005f, 1.024f}, // 4. Nature
+  {1.074f, 0.006f, -0.103f, -0.054f, 1.009f, 0.060f, -0.071f, -0.059f, 1.086f}, // 5. Vibrant
+  {1.218f, 0.007f, -0.192f, -0.119f, 1.076f, 0.048f, -0.099f, -0.069f, 1.154f}, // 6. Blue Sky
+  {1.082f, -0.020f, 0.103f, -0.051f, 1.052f, 0.042f, -0.047f, -0.045f, 1.073f}, // 7. Soft Warm
+  {1.050f, 0.020f, -0.010f, -0.020f, 1.020f, 0.000f, -0.010f, -0.020f, 1.030f}, // 8. Soft
+  {0.980f, -0.010f, -0.010f,  0.000f, 1.050f, -0.020f,  0.020f, 0.010f, 1.100f}, // 9. Deep Cool
+  {1.020f, -0.010f, -0.010f, -0.030f, 1.040f, -0.010f, 0.000f, -0.030f, 1.030f}  // 10. Authentic Cinema
+};
+
 int legacy_params(dt_iop_module_t *self,
                   const void *const old_params,
                   const int old_version,
@@ -227,6 +246,20 @@ int legacy_params(dt_iop_module_t *self,
     *new_version = 6;
     return 0;
   }
+  if(old_version == 6)
+  {
+    const dt_iop_basecurve_params_v6_t *o = (dt_iop_basecurve_params_v6_t *)old_params;
+    dt_iop_basecurve_params_t *n = calloc(1, sizeof(dt_iop_basecurve_params_t));
+    memcpy(n, o, sizeof(dt_iop_basecurve_params_v6_t));
+    n->workflow_mode = 0;
+    n->shadow_lift = 1.0f;
+    n->highlight_gain = 1.0f;
+
+    *new_params = n;
+    *new_params_size = sizeof(dt_iop_basecurve_params_t);
+    *new_version = 7;
+    return 0;
+  }
   return 1;
 }
 
@@ -234,18 +267,21 @@ typedef struct dt_iop_basecurve_gui_data_t
 {
   dt_draw_curve_t *minmax_curve; // curve for gui to draw
   int minmax_curve_type, minmax_curve_nodes;
-  GtkBox *hbox;
   GtkDrawingArea *area;
-  GtkWidget *fusion, *exposure_step, *exposure_bias;
+  GtkWidget *fusion, *exposure_step, *exposure_bias, *shadow_lift, *highlight_gain;
   GtkWidget *cmb_preserve_colors;
+  GtkWidget *workflow_mode;
   double mouse_x, mouse_y;
   int selected;
-  double selected_offset, selected_y, selected_min, selected_max;
-  float draw_xs[DT_IOP_TONECURVE_RES], draw_ys[DT_IOP_TONECURVE_RES];
-  float draw_min_xs[DT_IOP_TONECURVE_RES], draw_min_ys[DT_IOP_TONECURVE_RES];
-  float draw_max_xs[DT_IOP_TONECURVE_RES], draw_max_ys[DT_IOP_TONECURVE_RES];
+  float draw_ys[DT_IOP_TONECURVE_RES];
   float loglogscale;
   GtkWidget *logbase;
+  GtkWidget *ucs_saturation_balance;
+  GtkWidget *gamut_strength;
+  GtkWidget *highlight_corr;
+  GtkWidget *target_gamut;
+  GtkWidget *color_look;
+  GtkWidget *look_opacity;
 } dt_iop_basecurve_gui_data_t;
 
 typedef struct basecurve_preset_t
@@ -335,6 +371,15 @@ typedef struct dt_iop_basecurve_data_t
   float exposure_stops;
   float exposure_bias;
   int preserve_colors;
+  int workflow_mode;
+  float shadow_lift;
+  float highlight_gain;
+  float ucs_saturation_balance;
+  float gamut_strength;
+  float highlight_corr;
+  int target_gamut;
+  int color_look;
+  float look_opacity;
 } dt_iop_basecurve_data_t;
 
 typedef struct dt_iop_basecurve_global_data_t
@@ -370,8 +415,8 @@ const char **description(dt_iop_module_t *self)
      _("apply a view transform based on personal or camera maker look,\n"
        "for corrective purposes, to prepare images for display"),
      _("corrective"),
-     _("linear, RGB, display-referred"),
-     _("non-linear, RGB"),
+     _("linear, RGB, scene-referred"),
+     _("linear, non-linear, RGB"),
      _("non-linear, RGB, display-referred"));
 }
 
@@ -490,6 +535,8 @@ void reload_defaults(dt_iop_module_t *self)
 {
   dt_iop_basecurve_params_t *const d = self->default_params;
 
+  *d = basecurve_presets[0].params;
+
   if(self->multi_priority == 0)
   {
     const dt_image_t *const image = &(self->dev->image_storage);
@@ -529,6 +576,21 @@ void reload_defaults(dt_iop_module_t *self)
     d->exposure_stops = 1.0f;
     d->exposure_bias = 1.0f;
   }
+
+  if(!dt_is_display_referred())
+  {
+    // Force ACES defaults on top of whatever curve was found
+    d->workflow_mode = 1;
+    d->shadow_lift = 1.0f;
+    d->highlight_gain = 1.0f;
+    d->ucs_saturation_balance = 0.2f;
+    d->color_look = 1; // Natural look
+
+    d->basecurve_nodes[0] = 2;
+    d->basecurve_type[0] = CUBIC_SPLINE;
+    d->basecurve[0][0].x = 0.0f; d->basecurve[0][0].y = 0.0f;
+    d->basecurve[0][1].x = 1.0f; d->basecurve[0][1].y = 1.0f;
+  }
 }
 
 void init_presets(dt_iop_module_so_t *self)
@@ -583,12 +645,12 @@ int gauss_blur_cl(dt_iop_module_t *self,
   cl_int err = DT_OPENCL_DEFAULT_ERROR;
   const int devid = piece->pipe->devid;
 
-  /* horizontal blur */
+  // horizontal blur
   err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_basecurve_blur_h, width, height,
     CLARG(dev_in), CLARG(dev_tmp), CLARG(width), CLARG(height));
   if(err != CL_SUCCESS) return FALSE;
 
-  /* vertical blur */
+  // vertical blur
   err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_basecurve_blur_v, width, height,
     CLARG(dev_tmp), CLARG(dev_out), CLARG(width), CLARG(height));
   if(err != CL_SUCCESS) return FALSE;
@@ -693,6 +755,11 @@ int process_cl_fusion(dt_iop_module_t *self,
   cl_mem dev_m = NULL;
   cl_mem dev_coeffs = NULL;
 
+  // Prepare Color Look matrix (9 floats packed into float16 for OpenCL)
+  float look_mat_buf[16] = {0.0f};
+  for(int i=0; i<9; i++) look_mat_buf[i] = color_looks[d->color_look][i];
+  const float alpha = 0.5f;
+
   const int use_work_profile = (work_profile == NULL) ? 0 : 1;
   const int preserve_colors = d->preserve_colors;
 
@@ -743,7 +810,6 @@ int process_cl_fusion(dt_iop_module_t *self,
 
   for(int e = 0; e < d->exposure_fusion + 1; e++)
   {
-    // for every exposure fusion image: push by some ev, apply base curve and compute features
     {
       const float mul = exposure_increment(d->exposure_stops, e, d->exposure_fusion, d->exposure_bias);
 
@@ -886,8 +952,11 @@ int process_cl_fusion(dt_iop_module_t *self,
   }
 
   // copy output buffer
+  // Apply ACES/shadow_lift here if needed
   err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_basecurve_finalize, width, height,
-      CLARG(dev_in), CLARG(dev_comb[0]), CLARG(dev_out), CLARG(width), CLARG(height));
+      CLARG(dev_in), CLARG(dev_comb[0]), CLARG(dev_out), CLARG(width), CLARG(height), CLARG(d->workflow_mode),
+      CLARG(d->shadow_lift), CLARG(d->highlight_gain), CLARG(d->ucs_saturation_balance), CLARG(d->gamut_strength),
+      CLARG(d->highlight_corr), CLARG(d->target_gamut), CLARG(d->look_opacity), CLARG(look_mat_buf), CLARG(alpha));
 
 error:
   for(int k = 0; k < num_levels_max; k++)
@@ -920,7 +989,8 @@ int process_cl_lut(dt_iop_module_t *self,
 
   cl_mem dev_m = NULL;
   cl_mem dev_coeffs = NULL;
-  cl_int err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
+  cl_int err = DT_OPENCL_DEFAULT_ERROR;
+  cl_mem dev_tmp = NULL;
 
   cl_mem dev_profile_info = NULL;
   cl_mem dev_profile_lut = NULL;
@@ -933,28 +1003,61 @@ int process_cl_lut(dt_iop_module_t *self,
   const int height = roi_in->height;
   const int preserve_colors = d->preserve_colors;
 
+  const float mul = 1.0f;
+
+  size_t sizes[] = { ROUNDUPDWD(width, devid), ROUNDUPDHT(height, devid), 1 };
   dev_m = dt_opencl_copy_host_to_device(devid, d->table, 256, 256, sizeof(float));
-  dev_coeffs = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 3, d->unbounded_coeffs);
-  if(!dev_m || !dev_coeffs) goto error;
+  if(dev_m == NULL) goto error;
 
   err = dt_ioppr_build_iccprofile_params_cl(work_profile, devid, &profile_info_cl, &profile_lut_cl,
                                             &dev_profile_info, &dev_profile_lut);
   if(err != CL_SUCCESS) goto error;
 
+  dev_coeffs = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 3, d->unbounded_coeffs);
+
+  if(dev_coeffs == NULL) goto error;
+
+  cl_mem dev_dest = dev_out;
+
+  float look_mat_buf[16] = {0.0f};
+  for(int i=0; i<9; i++) look_mat_buf[i] = color_looks[d->color_look][i];
+  const float alpha = 0.75f;
+
+  if(d->workflow_mode > 0)
+  {
+    dev_tmp = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4);
+    if(dev_tmp == NULL) goto error;
+    dev_dest = dev_tmp;
+  }
+
   // read data/kernels/basecurve.cl for a description of "legacy" vs current
   // Conditional is moved outside of the OpenCL operations for performance.
   if(d->preserve_colors == DT_RGB_NORM_NONE)
-    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_basecurve_legacy_lut, width, height,
-          CLARG(dev_in), CLARG(dev_out),
-          CLARG(width), CLARG(height), CLARGFLOAT(1.0f), CLARG(dev_m), CLARG(dev_coeffs));
+  {
+    dt_opencl_set_kernel_args(devid, gd->kernel_basecurve_legacy_lut, 0, CLARG(dev_in), CLARG(dev_dest),
+      CLARG(width), CLARG(height), CLARG(mul), CLARG(dev_m), CLARG(dev_coeffs));
+    err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_basecurve_legacy_lut, sizes);
+  }
   else
-    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_basecurve_lut, width, height,
-      CLARG(dev_in), CLARG(dev_out),
-      CLARG(width), CLARG(height),
-      CLARGFLOAT(1.0f), CLARG(dev_m), CLARG(dev_coeffs), CLARG(preserve_colors), CLARG(dev_profile_info),
+  {
+    //FIXME:  There are still conditionals on d->preserve_colors within this flow that could impact performance
+    dt_opencl_set_kernel_args(devid, gd->kernel_basecurve_lut, 0, CLARG(dev_in), CLARG(dev_dest), CLARG(width),
+      CLARG(height), CLARG(mul), CLARG(dev_m), CLARG(dev_coeffs), CLARG(preserve_colors), CLARG(dev_profile_info),
       CLARG(dev_profile_lut), CLARG(use_work_profile));
+    err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_basecurve_lut, sizes);
+  }
+
+  if(err == CL_SUCCESS && d->workflow_mode > 0) // skip finalize if the LUT kernel failed, so its error is not overwritten
+  {
+    err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_basecurve_finalize, width, height,
+        CLARG(dev_in), CLARG(dev_tmp), CLARG(dev_out), CLARG(width), CLARG(height), CLARG(d->workflow_mode),
+        CLARG(d->shadow_lift), CLARG(d->highlight_gain), CLARG(d->ucs_saturation_balance), CLARG(d->gamut_strength),
+        CLARG(d->highlight_corr), CLARG(d->target_gamut), CLARG(d->look_opacity), CLARG(look_mat_buf), CLARG(alpha));
+    if(err != CL_SUCCESS) goto error;
+  }
 
 error:
+  dt_opencl_release_mem_object(dev_tmp);
   dt_opencl_release_mem_object(dev_m);
   dt_opencl_release_mem_object(dev_coeffs);
   dt_ioppr_free_iccprofile_params_cl(&profile_info_cl, &profile_lut_cl, &dev_profile_info, &dev_profile_lut);
@@ -1007,6 +1110,28 @@ void tiling_callback(dt_iop_module_t *self,
   }
 }
 
+// Rational tone curve (x*(2.51x + 0.03)) / (x*(2.43x + 0.59) + 0.14); the result is clamped to [0, 1].
+static inline float _aces_tone_map(const float x)
+{
+  const float num_quad = 2.51f, num_lin = 0.03f;
+  const float den_quad = 2.43f, den_lin = 0.59f, den_const = 0.14f;
+  const float numerator = x * (num_quad * x + num_lin);
+  const float denominator = x * (den_quad * x + den_lin) + den_const;
+  const float mapped = numerator / denominator;
+  return CLAMP(mapped, 0.0f, 1.0f);
+}
+
+// Rational tone curve (x*(x + a) - b) / (x*(c*x + d) + e) with fixed coefficients; the result is clamped to [0, 1].
+static inline float _aces_20_tonemap(const float x)
+{
+  const float num_offset = 0.0245786f, num_bias = 0.000090537f;
+  const float den_quad = 0.983729f, den_lin = 0.4329510f, den_const = 0.238081f;
+  const float numerator = x * (x + num_offset) - num_bias;
+  const float denominator = x * (den_quad * x + den_lin) + den_const;
+  const float mapped = numerator / denominator;
+  return CLAMP(mapped, 0.0f, 1.0f);
+}
+
 // See comments of opencl version in data/kernels/basecurve.cl for description of the meaning of "legacy"
 static inline void apply_legacy_curve(
     const float *const in,
@@ -1023,12 +1148,16 @@ static inline void apply_legacy_curve(
   {
     for(int i = 0; i < 3; i++)
     {
-      const float f = in[k+i] * mul;
+      float f = in[k+i] * mul;
+
+      float val;
       // use base curve for values < 1, else use extrapolation.
       if(f < 1.0f)
-        out[k+i] = fmaxf(table[CLAMP((int)(f * 0x10000ul), 0, 0xffff)], 0.f);
+        val = fmaxf(table[CLAMP((int)(f * 0x10000ul), 0, 0xffff)], 0.f);
       else
-        out[k+i] = fmaxf(dt_iop_eval_exp(unbounded_coeffs, f), 0.f);
+        val = fmaxf(dt_iop_eval_exp(unbounded_coeffs, f), 0.f);
+
+      out[k+i] = val;
     }
     out[k+3] = in[k+3];
   }
@@ -1059,6 +1188,7 @@ static inline void apply_curve(
       const float curve_lum = (lum < 1.0f)
         ? table[CLAMP((int)(lum * 0x10000ul), 0, 0xffff)]
         : dt_iop_eval_exp(unbounded_coeffs, lum);
+
       ratio = mul * curve_lum / lum;
     }
     for(size_t c = 0; c < 3; c++)
@@ -1204,7 +1334,300 @@ static inline void gauss_reduce(
   }
 }
 
-void process_fusion(dt_iop_module_t *self,
+// CPU path without exposure fusion: applies the base curve LUT to the whole ROI and, when
+// d->workflow_mode > 0, post-processes every pixel of out in place (color look, gain/lift,
+// tone map, JzAzBz adjustments, gamut compression). self and roi_out are not used here.
+static void process_lut(dt_iop_module_t *self,
+                 dt_dev_pixelpipe_iop_t *piece,
+                 const void *const ivoid,
+                 void *const ovoid,
+                 const dt_iop_roi_t *const roi_in,
+                 const dt_iop_roi_t *const roi_out)
+{
+  const float *const in = (const float *)ivoid;
+  float *const out = (float *)ovoid;
+  dt_iop_basecurve_data_t *const d = piece->data;
+  const dt_iop_order_iccprofile_info_t *const work_profile = dt_ioppr_get_iop_work_profile_info(piece->module, piece->module->dev->iop);
+
+  const int wd = roi_in->width, ht = roi_in->height;
+
+  if(d->preserve_colors == DT_RGB_NORM_NONE)
+    apply_legacy_curve(in, out, wd, ht, 1.0, d->table, d->unbounded_coeffs);
+  else
+    apply_curve(in, out, wd, ht, d->preserve_colors, 1.0, d->table, d->unbounded_coeffs, work_profile);
+
+  if(d->workflow_mode > 0)
+  {
+    const float *mat = color_looks[d->color_look]; // read below as a row-major 3x3 matrix
+
+    const size_t npixels = (size_t)wd * ht;
+    DT_OMP_FOR()
+    for(size_t k = 0; k < 4 * npixels; k += 4)
+    {
+      float r = out[k], g = out[k+1], b = out[k+2];
+
+      // Clamp to [-1e6, 1e6] to avoid Inf/NaN issues further down
+      r = fmaxf(-1e6f, fminf(r, 1e6f));
+      g = fmaxf(-1e6f, fminf(g, 1e6f));
+      b = fmaxf(-1e6f, fminf(b, 1e6f));
+
+      // Apply Color Look (3x3 matrix times RGB)
+      float tr = r * mat[0] + g * mat[1] + b * mat[2];
+      float tg = r * mat[3] + g * mat[4] + b * mat[5];
+      float tb = r * mat[6] + g * mat[7] + b * mat[8];
+
+      // Blend the untouched and the look-transformed color by look_opacity
+      out[k]   = r * (1.0f - d->look_opacity) + tr * d->look_opacity;
+      out[k+1] = g * (1.0f - d->look_opacity) + tg * d->look_opacity;
+      out[k+2] = b * (1.0f - d->look_opacity) + tb * d->look_opacity;
+
+      out[k]   = fmaxf(out[k], 0.0f);
+      out[k+1] = fmaxf(out[k+1], 0.0f);
+      out[k+2] = fmaxf(out[k+2], 0.0f);
+
+      // Reload for next steps
+      r = out[k];
+      g = out[k+1];
+      b = out[k+2];
+
+      if(d->highlight_gain != 1.0f) {
+        r *= d->highlight_gain;
+        g *= d->highlight_gain;
+        b *= d->highlight_gain;
+      }
+      if(d->shadow_lift != 1.0f) {
+        r = powf(r, d->shadow_lift);
+        g = powf(g, d->shadow_lift);
+        b = powf(b, d->shadow_lift);
+      }
+
+      const float r_coeff = 0.2627f, g_coeff = 0.6780f, b_coeff = 0.0593f;
+      float y_in = r * r_coeff + g * g_coeff + b * b_coeff;
+      float y_out = y_in;
+
+      /* Scene-referred: apply luminance-adaptive shoulder extension for
+         ACES-like tonemapping. Compute perceptual luminance Jz from RGB
+         and derive scale k = 1 + alpha * L^2 where L = clamp(Jz,0,1).
+         Then tone-map x_scaled = y_in / k and rescale result by k to
+         extend the shoulder progressively. Keep alpha constant and
+         avoid changing UI or legacy/display-referred behavior. */
+      if(d->workflow_mode == 1 || d->workflow_mode == 2)
+      {
+        // compute Jz from current RGB (Rec2020 -> XYZ -> JzAzBz)
+        float xyz[3];
+        xyz[0] = 0.636958f * r + 0.144617f * g + 0.168881f * b;
+        xyz[1] = 0.262700f * r + 0.677998f * g + 0.059302f * b;
+        xyz[2] = 0.000000f * r + 0.028073f * g + 1.060985f * b;
+        for(int i=0;i<3;i++) xyz[i] = fmaxf(xyz[i], 0.0f);
+
+        float xyz_scaled[4];
+        xyz_scaled[0] = xyz[0] * 400.0f;
+        xyz_scaled[1] = xyz[1] * 400.0f;
+        xyz_scaled[2] = xyz[2] * 400.0f;
+        xyz_scaled[3] = 0.0f;
+
+        float jab[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+        dt_XYZ_2_JzAzBz(xyz_scaled, jab);
+
+        const float L = fminf(fmaxf(jab[0], 0.0f), 1.0f);
+        const float alpha = 0.75f;
+        const float k_scale = 1.0f + alpha * L * L;
+
+        // scale luminance, apply selected tonemap, then undo scaling
+        const float x_scaled = y_in / k_scale;
+        if(d->workflow_mode == 1)
+          y_out = _aces_tone_map(x_scaled) * k_scale;
+        else /* workflow_mode == 2 */
+          y_out = _aces_20_tonemap(x_scaled * 1.257f) * k_scale;
+      }
+
+      float gain = y_out / fmaxf(y_in, 1e-6f);
+
+      out[k] = r * gain;
+      out[k+1] = g * gain;
+      out[k+2] = b * gain;
+
+      const float threshold = 0.80f;
+      if(y_out > threshold)
+      {
+        float factor = (y_out - threshold) / (1.0f - threshold);
+        factor = CLAMP(factor, 0.0f, 1.0f);
+        out[k] = out[k] * (1.0f - factor) + y_out * factor;
+        out[k+1] = out[k+1] * (1.0f - factor) + y_out * factor;
+        out[k+2] = out[k+2] * (1.0f - factor) + y_out * factor;
+      }
+
+      if(d->ucs_saturation_balance != 0.0f || d->gamut_strength > 0.0f || d->highlight_corr != 0.0f)
+      {
+        // RGB Rec2020 to XYZ D65
+        float xyz[4];
+        xyz[0] = 0.636958f * out[k] + 0.144617f * out[k+1] + 0.168881f * out[k+2];
+        xyz[1] = 0.262700f * out[k] + 0.677998f * out[k+1] + 0.059302f * out[k+2];
+        xyz[2] = 0.000000f * out[k] + 0.028073f * out[k+1] + 1.060985f * out[k+2];
+
+        for(int i=0; i<3; i++) xyz[i] = fmaxf(xyz[i], 0.0f);
+
+        // XYZ to JzAzBz
+        float jab[4] = { 0.0f }, xyz_scaled[4] = { 0.0f }; // zero-init so the unused 4th element is defined, as in the tone-map step above
+        for(int i=0; i<3; i++) xyz_scaled[i] = xyz[i] * 400.0f; // Scale to 400 nits for JzAzBz
+        dt_XYZ_2_JzAzBz(xyz_scaled, jab);
+
+        int modified = 0;
+
+        if(d->ucs_saturation_balance != 0.0f)
+        {
+          // Chroma-based modulation for saturation balance
+          const float r_sat = out[k];
+          const float g_sat = out[k+1];
+          const float b_sat = out[k+2];
+          const float chroma = fmaxf(fmaxf(r_sat, g_sat), b_sat) - fminf(fminf(r_sat, g_sat), b_sat);
+          const float effective_saturation = d->ucs_saturation_balance * fminf(chroma * 2.0f, 1.0f);
+
+          // Apply saturation balance
+          // Use Rec2020 Luminance Y for mask
+          const float Y = xyz[1];
+          const float L = powf(fmaxf(Y, 0.0f), 0.5f);
+          const float fulcrum = 0.5f;
+          const float n = (L - fulcrum) / fulcrum;
+          const float mask_shadow = 1.0f / (1.0f + expf(n * 4.0f));
+          float sat_adjust = effective_saturation * (2.0f * mask_shadow - 1.0f);
+          sat_adjust *= fminf(L * 4.0f, 1.0f);
+          const float sat_factor = 1.0f + sat_adjust;
+          jab[1] *= sat_factor;
+          jab[2] *= sat_factor;
+          modified = 1;
+        }
+
+        if(d->gamut_strength > 0.0f)
+        {
+          const float Y = xyz[1];
+          const float L = powf(fmaxf(Y, 0.0f), 0.5f);
+          const float chroma_factor = 1.0f - d->gamut_strength * (0.2f + 0.2f * L);
+          jab[1] *= chroma_factor;
+          jab[2] *= chroma_factor;
+          modified = 1;
+        }
+
+        if(d->highlight_corr != 0.0f)
+        {
+          // HIGHLIGHT HUE AND SATURATION CORRECTION (sync with OpenCL)
+          // Mask starts at Jz = 0.20 and is full at Jz = 0.90. Linear transition.
+          float hl_mask = CLAMP((jab[0] - 0.20f) / 0.70f, 0.0f, 1.0f);
+
+          if(hl_mask > 0.0f)
+          {
+            // 1. Soft symmetric desaturation (0.75 factor)
+            float desat = 1.0f - (fabsf(d->highlight_corr) * hl_mask * 0.75f);
+            jab[1] *= desat;
+            jab[2] *= desat;
+
+            // 2. Controlled Hue Rotation (2.0 factor)
+            float angle = d->highlight_corr * hl_mask * 2.0f;
+            float ca = cosf(angle);
+            float sa = sinf(angle);
+            float az = jab[1];
+            float bz = jab[2];
+            jab[1] = az * ca - bz * sa;
+            jab[2] = az * sa + bz * ca;
+            modified = 1;
+          }
+        }
+
+        if(jab[0] > 0.95f)
+        {
+          const float desat = CLAMP((1.0f - jab[0]) * 20.0f, 0.0f, 1.0f);
+          jab[1] *= desat;
+          jab[2] *= desat;
+          modified = 1;
+        }
+
+        if(modified)
+        {
+          // JzAzBz to XYZ
+          dt_JzAzBz_2_XYZ(jab, xyz_scaled);
+          for(int i=0; i<3; i++) xyz[i] = xyz_scaled[i] / 400.0f;
+
+          // XYZ D65 to RGB Rec2020
+          out[k]   =  1.716651f * xyz[0] - 0.355671f * xyz[1] - 0.253366f * xyz[2];
+          out[k+1] = -0.666684f * xyz[0] + 1.616481f * xyz[1] + 0.015768f * xyz[2];
+          out[k+2] =  0.017640f * xyz[0] - 0.042771f * xyz[1] + 0.942103f * xyz[2];
+
+          float min_val = fminf(out[k], fminf(out[k+1], out[k+2]));
+          if(min_val < 0.0f)
+          {
+            float lum = 0.2627f * out[k] + 0.6780f * out[k+1] + 0.0593f * out[k+2];
+            if(lum > 0.0f)
+            {
+              float factor = lum / (lum - min_val);
+              out[k] = lum + factor * (out[k] - lum);
+              out[k+1] = lum + factor * (out[k+1] - lum);
+              out[k+2] = lum + factor * (out[k+2] - lum);
+            }
+          }
+        }
+      // NOTE(review): despite their indentation, the two blocks below are still inside the if() above - confirm intended
+      if(d->gamut_strength > 0.0f)
+      {
+        const float orig_r = out[k];
+        const float orig_g = out[k+1];
+        const float orig_b = out[k+2];
+
+        const float Y = 0.2126f * orig_r + 0.7152f * orig_g + 0.0722f * orig_b;
+        float lum_weight = CLAMP((Y - 0.3f) / (0.8f - 0.3f), 0.0f, 1.0f);
+        lum_weight = lum_weight * lum_weight * (3.0f - 2.0f * lum_weight);
+        const float effective_strength = d->gamut_strength * lum_weight;
+
+        float limit = 0.90f;
+        if (d->target_gamut == 1) limit = 0.95f;
+        else if (d->target_gamut == 2) limit = 1.00f;
+
+        float gamut_threshold = limit * (1.0f - (effective_strength * 0.25f));
+        float max_val = fmaxf(out[k], fmaxf(out[k+1], out[k+2]));
+
+        if (max_val > gamut_threshold)
+        {
+          float range = limit - gamut_threshold;
+          float delta = max_val - gamut_threshold;
+          const float compressed = gamut_threshold + range * delta / (delta + range);
+          const float factor = compressed / max_val;
+
+          float range_blue = 1.1f * range;
+          const float compressed_blue = gamut_threshold + range * delta / (delta + range_blue);
+          const float factor_blue = compressed_blue / max_val;
+
+          out[k] *= factor;
+          out[k+1] *= factor;
+          out[k+2] *= factor_blue;
+        }
+
+        out[k] = orig_r * (1.0f - effective_strength) + out[k] * effective_strength;
+        out[k+1] = orig_g * (1.0f - effective_strength) + out[k+1] * effective_strength;
+        out[k+2] = orig_b * (1.0f - effective_strength) + out[k+2] * effective_strength;
+      }
+
+      // Final gamut check: pull the pixel toward its clamped luma until every channel fits [0, 1]
+      if(out[k] < 0.0f || out[k] > 1.0f || out[k+1] < 0.0f || out[k+1] > 1.0f || out[k+2] < 0.0f || out[k+2] > 1.0f)
+      {
+        const float luma = 0.2627f * out[k] + 0.6780f * out[k+1] + 0.0593f * out[k+2];
+        const float target_luma = CLAMP(luma, 0.0f, 1.0f);
+        float t = 1.0f;
+        if (out[k] < 0.0f) t = fminf(t, target_luma / (target_luma - out[k]));
+        if (out[k+1] < 0.0f) t = fminf(t, target_luma / (target_luma - out[k+1]));
+        if (out[k+2] < 0.0f) t = fminf(t, target_luma / (target_luma - out[k+2]));
+        if (out[k] > 1.0f) t = fminf(t, (1.0f - target_luma) / (out[k] - target_luma));
+        if (out[k+1] > 1.0f) t = fminf(t, (1.0f - target_luma) / (out[k+1] - target_luma));
+        if (out[k+2] > 1.0f) t = fminf(t, (1.0f - target_luma) / (out[k+2] - target_luma));
+        t = fmaxf(0.0f, t);
+        out[k] = target_luma + t * (out[k] - target_luma);
+        out[k+1] = target_luma + t * (out[k+1] - target_luma);
+        out[k+2] = target_luma + t * (out[k+2] - target_luma);
+      }
+      }
+    }
+  }
+}
+
+static void process_fusion(dt_iop_module_t *self,
                     dt_dev_pixelpipe_iop_t *piece,
                     const void *const ivoid,
                     void *const ovoid,
@@ -1270,17 +1693,6 @@ void process_fusion(dt_iop_module_t *self,
     for(size_t k = 0; k < 4ul * wd * ht; k += 4)
       col[0][k + 3] *= .1f + sqrtf(out[k] * out[k] + out[k + 1] * out[k + 1] + out[k + 2] * out[k + 2]);
 
-// #define DEBUG_VIS2
-#ifdef DEBUG_VIS2 // transform weights in channels
-    for(size_t k = 0; k < 4ul * w * h; k += 4) col[0][k + e] = col[0][k + 3];
-#endif
-
-// #define DEBUG_VIS
-#ifdef DEBUG_VIS // DEBUG visualise weight buffer
-    for(size_t k = 0; k < 4ul * w * h; k += 4) comb[0][k + e] = col[0][k + 3];
-    continue;
-#endif
-
     for(int k = 1; k < num_levels; k++)
     {
       gauss_reduce(col[k - 1], col[k], 0, w, h);
@@ -1307,14 +1719,10 @@ void process_fusion(dt_iop_module_t *self,
       {
         // blend images into output pyramid
         if(k == num_levels - 1) // blend gaussian base
-#ifdef DEBUG_VIS2
-          ;
-#else
         {
         for(int c = 0; c < 3; c++)
           comb[k][x + c] += col[k][x + 3] * col[k][x + c];
         }
-#endif
         else // laplacian
         {
           for(int c = 0; c < 3; c++)
@@ -1325,7 +1733,6 @@ void process_fusion(dt_iop_module_t *self,
     }
   }
 
-#ifndef DEBUG_VIS // DEBUG: switch off when visualising weight buf
   // normalise and reconstruct output pyramid buffer coarse to fine
   for(int k = num_levels - 1; k >= 0; k--)
   {
@@ -1354,14 +1761,267 @@ void process_fusion(dt_iop_module_t *self,
         }
     }
   }
-#endif
+
   // copy output buffer
+  const float *mat = color_looks[d->color_look];
   DT_OMP_FOR()
   for(size_t k = 0; k < (size_t)4 * wd * ht; k += 4)
   {
-    out[k + 0] = fmaxf(comb[0][k + 0], 0.f);
-    out[k + 1] = fmaxf(comb[0][k + 1], 0.f);
-    out[k + 2] = fmaxf(comb[0][k + 2], 0.f);
+    float val[3];
+    val[0] = fmaxf(comb[0][k + 0], 0.f);
+    val[1] = fmaxf(comb[0][k + 1], 0.f);
+    val[2] = fmaxf(comb[0][k + 2], 0.f);
+
+    // Sanitize to avoid Inf/NaN issues
+    val[0] = fminf(val[0], 1e6f);
+    val[1] = fminf(val[1], 1e6f);
+    val[2] = fminf(val[2], 1e6f);
+
+    // If using ACES workflow, we apply shadow lift and tone mapping here, after fusion
+    if(d->workflow_mode > 0)
+    {
+      // Apply Color Look
+      float r = val[0], g = val[1], b = val[2];
+      float tr = r * mat[0] + g * mat[1] + b * mat[2];
+      float tg = r * mat[3] + g * mat[4] + b * mat[5];
+      float tb = r * mat[6] + g * mat[7] + b * mat[8];
+
+      // Mix with opacity
+      val[0] = r * (1.0f - d->look_opacity) + tr * d->look_opacity;
+      val[1] = g * (1.0f - d->look_opacity) + tg * d->look_opacity;
+      val[2] = b * (1.0f - d->look_opacity) + tb * d->look_opacity;
+
+      val[0] = fmaxf(val[0], 0.0f);
+      val[1] = fmaxf(val[1], 0.0f);
+      val[2] = fmaxf(val[2], 0.0f);
+
+      if(d->highlight_gain != 1.0f) {
+        val[0] *= d->highlight_gain;
+        val[1] *= d->highlight_gain;
+        val[2] *= d->highlight_gain;
+      }
+      if(d->shadow_lift != 1.0f) {
+        val[0] = powf(val[0], d->shadow_lift);
+        val[1] = powf(val[1], d->shadow_lift);
+        val[2] = powf(val[2], d->shadow_lift);
+      }
+
+      const float r_coeff = 0.2627f, g_coeff = 0.6780f, b_coeff = 0.0593f;
+      float y_in = val[0] * r_coeff + val[1] * g_coeff + val[2] * b_coeff;
+      float y_out = y_in;
+
+      if(d->workflow_mode == 1 || d->workflow_mode == 2)
+      {
+        float xyz_local[3];
+        xyz_local[0] = 0.636958f * val[0] + 0.144617f * val[1] + 0.168881f * val[2];
+        xyz_local[1] = 0.262700f * val[0] + 0.677998f * val[1] + 0.059302f * val[2];
+        xyz_local[2] = 0.000000f * val[0] + 0.028073f * val[1] + 1.060985f * val[2];
+        for(int i=0;i<3;i++) xyz_local[i] = fmaxf(xyz_local[i], 0.0f);
+
+        float xyz_scaled_local[4];
+        xyz_scaled_local[0] = xyz_local[0] * 400.0f;
+        xyz_scaled_local[1] = xyz_local[1] * 400.0f;
+        xyz_scaled_local[2] = xyz_local[2] * 400.0f;
+        xyz_scaled_local[3] = 0.0f;
+
+        float jab_local[4] = {0.0f,0.0f,0.0f,0.0f};
+        dt_XYZ_2_JzAzBz(xyz_scaled_local, jab_local);
+
+        const float L = fminf(fmaxf(jab_local[0], 0.0f), 1.0f);
+        const float alpha = 0.5f;
+        const float k_scale = 1.0f + alpha * L * L;
+
+        const float x_scaled = y_in / k_scale;
+        if(d->workflow_mode == 1)
+          y_out = _aces_tone_map(x_scaled) * k_scale;
+        else
+          y_out = _aces_20_tonemap(x_scaled * 1.257f) * k_scale;
+      }
+
+      float gain = y_out / fmaxf(y_in, 1e-6f);
+
+      val[0] *= gain;
+      val[1] *= gain;
+      val[2] *= gain;
+
+      const float threshold = 0.80f;
+      if(y_out > threshold)
+      {
+        float factor = (y_out - threshold) / (1.0f - threshold);
+        factor = CLAMP(factor, 0.0f, 1.0f);
+        val[0] = val[0] * (1.0f - factor) + y_out * factor;
+        val[1] = val[1] * (1.0f - factor) + y_out * factor;
+        val[2] = val[2] * (1.0f - factor) + y_out * factor;
+      }
+
+      if(d->ucs_saturation_balance != 0.0f || d->gamut_strength > 0.0f || d->highlight_corr != 0.0f)
+      {
+        // RGB Rec2020 to XYZ D65
+        float xyz[4];
+        xyz[0] = 0.636958f * val[0] + 0.144617f * val[1] + 0.168881f * val[2];
+        xyz[1] = 0.262700f * val[0] + 0.677998f * val[1] + 0.059302f * val[2];
+        xyz[2] = 0.000000f * val[0] + 0.028073f * val[1] + 1.060985f * val[2];
+
+        for(int i=0; i<3; i++) xyz[i] = fmaxf(xyz[i], 0.0f);
+
+        // XYZ to JzAzBz
+        float jab[4];
+        float xyz_scaled[4];
+        for(int i=0; i<3; i++) xyz_scaled[i] = xyz[i] * 400.0f; // Scale to 400 nits for JzAzBz
+        dt_XYZ_2_JzAzBz(xyz_scaled, jab);
+
+        int modified = 0;
+
+        if(d->ucs_saturation_balance != 0.0f)
+        {
+          // Chroma-based modulation for saturation balance
+          const float r_sat = val[0];
+          const float g_sat = val[1];
+          const float b_sat = val[2];
+          const float chroma = fmaxf(fmaxf(r_sat, g_sat), b_sat) - fminf(fminf(r_sat, g_sat), b_sat);
+          const float effective_saturation = d->ucs_saturation_balance * fminf(chroma * 2.0f, 1.0f);
+
+          // Apply saturation balance
+          // Use Rec2020 Luminance Y for mask
+          const float Y = xyz[1];
+          const float L = powf(fmaxf(Y, 0.0f), 0.5f);
+          const float fulcrum = 0.5f;
+          const float n = (L - fulcrum) / fulcrum;
+          const float mask_shadow = 1.0f / (1.0f + expf(n * 4.0f));
+          const float sat_factor = 1.0f + effective_saturation * (2.0f * mask_shadow - 1.0f);
+          jab[1] *= sat_factor;
+          jab[2] *= sat_factor;
+          modified = 1;
+        }
+
+        if(d->gamut_strength > 0.0f)
+        {
+          const float Y = xyz[1];
+          const float L = powf(fmaxf(Y, 0.0f), 0.5f);
+          const float chroma_factor = 1.0f - d->gamut_strength * (0.2f + 0.2f * L);
+          jab[1] *= chroma_factor;
+          jab[2] *= chroma_factor;
+          modified = 1;
+        }
+
+        if(d->highlight_corr != 0.0f)
+        {
+          // HIGHLIGHT HUE AND SATURATION CORRECTION (sync with OpenCL)
+          // Mask starts at Jz = 0.20 and is full at Jz = 0.90. Linear transition.
+          float hl_mask = CLAMP((jab[0] - 0.20f) / 0.70f, 0.0f, 1.0f);
+
+          if(hl_mask > 0.0f)
+          {
+            // 1. Soft symmetric desaturation (0.75 factor)
+            float desat = 1.0f - (fabsf(d->highlight_corr) * hl_mask * 0.75f);
+            jab[1] *= desat;
+            jab[2] *= desat;
+
+            // 2. Controlled Hue Rotation (2.0 factor)
+            float angle = d->highlight_corr * hl_mask * 2.0f;
+            float ca = cosf(angle);
+            float sa = sinf(angle);
+            float az = jab[1];
+            float bz = jab[2];
+            jab[1] = az * ca - bz * sa;
+            jab[2] = az * sa + bz * ca;
+            modified = 1;
+          }
+        }
+
+        if(jab[0] > 0.95f)
+        {
+          const float desat = CLAMP((1.0f - jab[0]) * 20.0f, 0.0f, 1.0f);
+          jab[1] *= desat;
+          jab[2] *= desat;
+          modified = 1;
+        }
+
+        if(modified)
+        {
+          // JzAzBz to XYZ
+          dt_JzAzBz_2_XYZ(jab, xyz_scaled);
+          for(int i=0; i<3; i++) xyz[i] = xyz_scaled[i] / 400.0f;
+
+          // XYZ D65 to RGB Rec2020
+          val[0] =  1.716651f * xyz[0] - 0.355671f * xyz[1] - 0.253366f * xyz[2];
+          val[1] = -0.666684f * xyz[0] + 1.616481f * xyz[1] + 0.015768f * xyz[2];
+          val[2] =  0.017640f * xyz[0] - 0.042771f * xyz[1] + 0.942103f * xyz[2];
+
+          float min_val = fminf(val[0], fminf(val[1], val[2]));
+          if(min_val < 0.0f)
+          {
+            float lum = 0.2627f * val[0] + 0.6780f * val[1] + 0.0593f * val[2];
+            if(lum > 0.0f)
+            {
+              float factor = lum / (lum - min_val);
+              val[0] = lum + factor * (val[0] - lum);
+              val[1] = lum + factor * (val[1] - lum);
+              val[2] = lum + factor * (val[2] - lum);
+            }
+          }
+        }
+
+      if(d->gamut_strength > 0.0f)
+      {
+        const float orig_r = val[0];
+        const float orig_g = val[1];
+        const float orig_b = val[2];
+
+        const float Y = 0.2126f * orig_r + 0.7152f * orig_g + 0.0722f * orig_b;
+        float lum_weight = CLAMP((Y - 0.3f) / (0.8f - 0.3f), 0.0f, 1.0f);
+        lum_weight = lum_weight * lum_weight * (3.0f - 2.0f * lum_weight);
+        const float effective_strength = d->gamut_strength * lum_weight;
+
+        float limit = 0.90f;
+        if (d->target_gamut == 1) limit = 0.95f;
+        else if (d->target_gamut == 2) limit = 1.00f;
+
+        float gamut_threshold = limit * (1.0f - (effective_strength * 0.25f));
+        float max_val = fmaxf(val[0], fmaxf(val[1], val[2]));
+
+        if (max_val > gamut_threshold)
+        {
+          float range = limit - gamut_threshold;
+          float delta = max_val - gamut_threshold;
+          const float compressed = gamut_threshold + range * delta / (delta + range);
+          const float factor = compressed / max_val;
+
+          float range_blue = 1.1f * range;
+          const float compressed_blue = gamut_threshold + range * delta / (delta + range_blue);
+          const float factor_blue = compressed_blue / max_val;
+
+          val[0] *= factor;
+          val[1] *= factor;
+          val[2] *= factor_blue;
+        }
+
+        val[0] = orig_r * (1.0f - effective_strength) + val[0] * effective_strength;
+        val[1] = orig_g * (1.0f - effective_strength) + val[1] * effective_strength;
+        val[2] = orig_b * (1.0f - effective_strength) + val[2] * effective_strength;
+      }
+
+      // Final gamut check to preserve hue (exact color)
+      if(val[0] < 0.0f || val[0] > 1.0f || val[1] < 0.0f || val[1] > 1.0f || val[2] < 0.0f || val[2] > 1.0f)
+      {
+        const float luma = 0.2627f * val[0] + 0.6780f * val[1] + 0.0593f * val[2];
+        const float target_luma = CLAMP(luma, 0.0f, 1.0f);
+        float t = 1.0f;
+        if (val[0] < 0.0f) t = fminf(t, target_luma / (target_luma - val[0]));
+        if (val[1] < 0.0f) t = fminf(t, target_luma / (target_luma - val[1]));
+        if (val[2] < 0.0f) t = fminf(t, target_luma / (target_luma - val[2]));
+        if (val[0] > 1.0f) t = fminf(t, (1.0f - target_luma) / (val[0] - target_luma));
+        if (val[1] > 1.0f) t = fminf(t, (1.0f - target_luma) / (val[1] - target_luma));
+        if (val[2] > 1.0f) t = fminf(t, (1.0f - target_luma) / (val[2] - target_luma));
+        t = fmaxf(0.0f, t);
+        val[0] = target_luma + t * (val[0] - target_luma);
+        val[1] = target_luma + t * (val[1] - target_luma);
+        val[2] = target_luma + t * (val[2] - target_luma);
+      }
+      }
+    }
+
+    for(int i = 0; i < 3; i++) out[k + i] = val[i];
     out[k + 3] = in[k + 3]; // pass on 4th channel
   }
 
@@ -1376,32 +2036,6 @@ void process_fusion(dt_iop_module_t *self,
   free(comb);
 }
 
-void process_lut(dt_iop_module_t *self,
-                 dt_dev_pixelpipe_iop_t *piece,
-                 const void *const ivoid,
-                 void *const ovoid,
-                 const dt_iop_roi_t *const roi_in,
-                 const dt_iop_roi_t *const roi_out)
-{
-  const float *const in = (const float *)ivoid;
-  float *const out = (float *)ovoid;
-  //const int ch = piece->colors; <-- it appears someone was trying to make this handle monochrome data,
-  //however the for loops only handled RGBA - FIXME, determine what possible data formats and channel
-  //configurations we might encounter here and handle those too
-  dt_iop_basecurve_data_t *const d = piece->data;
-  const dt_iop_order_iccprofile_info_t *const work_profile = dt_ioppr_get_iop_work_profile_info(piece->module, piece->module->dev->iop);
-
-  const int wd = roi_in->width, ht = roi_in->height;
-
-  // Compared to previous implementation, we've at least moved this conditional outside of the image processing loops
-  // so that it is evaluated only once.  See FIXME comments in apply_curve for more potential performance improvements
-  if(d->preserve_colors == DT_RGB_NORM_NONE)
-    apply_legacy_curve(in, out, wd, ht, 1.0, d->table, d->unbounded_coeffs);
-  else
-    apply_curve(in, out, wd, ht, d->preserve_colors, 1.0, d->table, d->unbounded_coeffs, work_profile);
-}
-
-
 void process(dt_iop_module_t *self,
              dt_dev_pixelpipe_iop_t *piece,
              const void *const ivoid,
@@ -1430,6 +2064,15 @@ void commit_params(dt_iop_module_t *self,
   d->exposure_stops = p->exposure_stops;
   d->exposure_bias = p->exposure_bias;
   d->preserve_colors = p->preserve_colors;
+  d->workflow_mode = p->workflow_mode; // the workflow parameters below are copied verbatim into the pipe data ...
+  d->shadow_lift = 2.0f - p->shadow_lift; // ... except shadow_lift, which is mirrored around 1.0 (1.0 stays 1.0). NOTE(review): presumably so a higher slider value lifts shadows — confirm in process()
+  d->highlight_gain = p->highlight_gain;
+  d->ucs_saturation_balance = p->ucs_saturation_balance;
+  d->gamut_strength = p->gamut_strength;
+  d->highlight_corr = p->highlight_corr;
+  d->target_gamut = p->target_gamut;
+  d->color_look = p->color_look;
+  d->look_opacity = p->look_opacity;
 
   const int ch = 0;
   // take care of possible change of curve type or number of nodes (not yet implemented in UI)
@@ -1484,18 +2127,6 @@ void cleanup_pipe(dt_iop_module_t *self,
   piece->data = NULL;
 }
 
-void gui_update(dt_iop_module_t *self)
-{
-  dt_iop_basecurve_params_t *p = self->params;
-  dt_iop_basecurve_gui_data_t *g = self->gui_data;
-
-  gtk_widget_set_visible(g->exposure_step, p->exposure_fusion != 0);
-  gtk_widget_set_visible(g->exposure_bias, p->exposure_fusion != 0);
-
-  // gui curve is read directly from params during expose event.
-  gtk_widget_queue_draw(GTK_WIDGET(g->area));
-}
-
 static float eval_grey(float x)
 {
   // "log base" is a combined scaling and offset change so that x->[0,1], with
@@ -1509,6 +2140,8 @@ void init(dt_iop_module_t *self)
   dt_iop_basecurve_params_t *d = self->default_params;
   d->basecurve[0][1].x = d->basecurve[0][1].y = 1.0;
   d->basecurve_nodes[0] = 2;
+  d->shadow_lift = 1.0f;    // same value gui_init() sets as the shadow lift slider default
+  d->highlight_gain = 1.0f; // same value gui_init() sets as the highlight gain slider default
 }
 
 void init_global(dt_iop_module_so_t *self)
@@ -1604,7 +2237,7 @@ static gboolean dt_iop_basecurve_draw(GtkWidget *widget, cairo_t *crf, dt_iop_mo
       dt_draw_curve_set_point(g->minmax_curve, k, p->basecurve[0][k].x, p->basecurve[0][k].y);
   }
   dt_draw_curve_t *minmax_curve = g->minmax_curve;
-  dt_draw_curve_calc_values(minmax_curve, 0.0, 1.0, DT_IOP_TONECURVE_RES, g->draw_xs, g->draw_ys);
+  dt_draw_curve_calc_values(minmax_curve, 0.0, 1.0, DT_IOP_TONECURVE_RES, NULL, g->draw_ys);
 
   float unbounded_coeffs[3];
   const float xm = basecurve[nodes - 1].x;
@@ -1828,7 +2461,7 @@ static gboolean dt_iop_basecurve_motion_notify(GtkWidget *widget,
     // got a vertex selected:
     if(g->selected >= 0)
     {
-      // this is used to translate mause position in loglogscale to make this behavior unified with linear scale.
+      // this is used to translate mouse position in loglogscale to make this behavior unified with linear scale.
       const float translate_mouse_x = old_m_x / width - to_log(basecurve[g->selected].x, g->loglogscale);
       const float translate_mouse_y = 1 - old_m_y / height - to_log(basecurve[g->selected].y, g->loglogscale);
       // dx & dy are in linear coordinates
@@ -1946,12 +2579,22 @@ static gboolean dt_iop_basecurve_button_press(GtkWidget *widget,
     else if(event->type == GDK_2BUTTON_PRESS)
     {
       // reset current curve
-      p->basecurve_nodes[ch] = d->basecurve_nodes[ch];
-      p->basecurve_type[ch] = d->basecurve_type[ch];
-      for(int k = 0; k < d->basecurve_nodes[ch]; k++)
+      if(p->workflow_mode > 0) // workflow modes > 0: reset to a two-node cubic spline through (0,0) and (1,1)
+      {
+        p->basecurve_nodes[ch] = 2;
+        p->basecurve_type[ch] = CUBIC_SPLINE;
+        p->basecurve[ch][0].x = 0.0f; p->basecurve[ch][0].y = 0.0f;
+        p->basecurve[ch][1].x = 1.0f; p->basecurve[ch][1].y = 1.0f;
+      }
+      else // workflow mode 0: copy node count, curve type and all nodes back from d
       {
-        p->basecurve[ch][k].x = d->basecurve[ch][k].x;
-        p->basecurve[ch][k].y = d->basecurve[ch][k].y;
+        p->basecurve_nodes[ch] = d->basecurve_nodes[ch];
+        p->basecurve_type[ch] = d->basecurve_type[ch];
+        for(int k = 0; k < d->basecurve_nodes[ch]; k++)
+        {
+          p->basecurve[ch][k].x = d->basecurve[ch][k].x;
+          p->basecurve[ch][k].y = d->basecurve[ch][k].y;
+        }
       }
       g->selected = -2; // avoid motion notify re-inserting immediately.
       dt_dev_add_history_item_target(darktable.develop, self, TRUE, widget);
@@ -2088,6 +2731,117 @@ void gui_changed(dt_iop_module_t *self, GtkWidget *w, void *previous)
       gtk_widget_set_visible(g->exposure_bias, FALSE);
     }
   }
+
+  if(!w || w == g->workflow_mode || w == g->color_look) // w == NULL is the full refresh requested by gui_update()
+  {
+    if(p->workflow_mode == 1 || p->workflow_mode == 2)
+    {
+      gtk_widget_set_visible(g->cmb_preserve_colors, FALSE); // hidden and forced to DT_RGB_NORM_NONE in these workflows
+      if(p->preserve_colors != DT_RGB_NORM_NONE)
+        dt_bauhaus_combobox_set(g->cmb_preserve_colors, DT_RGB_NORM_NONE);
+      gtk_widget_set_visible(g->shadow_lift, TRUE);
+      gtk_widget_set_visible(g->highlight_gain, TRUE);
+      gtk_widget_set_visible(g->ucs_saturation_balance, TRUE);
+      gtk_widget_set_visible(g->gamut_strength, TRUE);
+      gtk_widget_set_visible(g->highlight_corr, TRUE);
+      gtk_widget_set_visible(g->target_gamut, TRUE);
+      gtk_widget_set_visible(g->color_look, TRUE);
+      gtk_widget_set_visible(g->look_opacity, TRUE);
+      gtk_widget_set_sensitive(g->shadow_lift, TRUE);
+      gtk_widget_set_sensitive(g->highlight_gain, TRUE);
+      gtk_widget_set_sensitive(g->ucs_saturation_balance, TRUE);
+      gtk_widget_set_sensitive(g->gamut_strength, TRUE);
+      gtk_widget_set_sensitive(g->highlight_corr, TRUE);
+      gtk_widget_set_sensitive(g->target_gamut, TRUE);
+      gtk_widget_set_sensitive(g->color_look, TRUE);
+      gtk_widget_set_sensitive(g->look_opacity, p->color_look > 0);
+      if(w == g->color_look) // choosing a look resets its opacity to 1.0
+      {
+        p->look_opacity = 1.0f;
+        dt_bauhaus_slider_set(g->look_opacity, 1.0f);
+      }
+      gtk_widget_set_tooltip_text(g->fusion, _("Exposure fusion operates in linear scene-referred space as a luminance normalization step,\n"
+                                               "providing a stable radiometric reference prior to the final tone-mapping curve.\n"
+                                               "It does not perform HDR blending nor exposure compensation."));
+      if(w == g->workflow_mode) // switching the workflow resets its controls and the curve to a straight 2-node line
+      {
+        p->shadow_lift = 1.0f;
+        dt_bauhaus_slider_set(g->shadow_lift, 1.0f);
+        p->highlight_gain = 1.0f;
+        dt_bauhaus_slider_set(g->highlight_gain, 1.0f);
+        p->ucs_saturation_balance = 0.2f;
+        dt_bauhaus_slider_set(g->ucs_saturation_balance, 0.2f);
+        // Set default color look when switching to this workflow
+        p->color_look = 1; // Natural look
+        dt_bauhaus_combobox_set(g->color_look, 1);
+        p->look_opacity = 1.0f;
+        dt_bauhaus_slider_set(g->look_opacity, 1.0f);
+        p->basecurve_type[0] = CUBIC_SPLINE;
+        p->basecurve_nodes[0] = 2;
+        p->basecurve[0][0].x = 0.0f; p->basecurve[0][0].y = 0.0f;
+        p->basecurve[0][1].x = 1.0f; p->basecurve[0][1].y = 1.0f;
+
+        gtk_widget_queue_draw(GTK_WIDGET(g->area));
+      }
+    }
+    else // any other workflow_mode: show preserve_colors again, hide and disable the workflow-only controls
+    {
+      gtk_widget_set_visible(g->cmb_preserve_colors, TRUE);
+      gtk_widget_set_visible(g->shadow_lift, FALSE);
+      gtk_widget_set_visible(g->highlight_gain, FALSE);
+      gtk_widget_set_visible(g->ucs_saturation_balance, FALSE);
+      gtk_widget_set_visible(g->gamut_strength, FALSE);
+      gtk_widget_set_visible(g->highlight_corr, FALSE);
+      gtk_widget_set_visible(g->target_gamut, FALSE);
+      gtk_widget_set_visible(g->color_look, FALSE);
+      gtk_widget_set_visible(g->look_opacity, FALSE);
+      gtk_widget_set_sensitive(g->shadow_lift, FALSE);
+      gtk_widget_set_sensitive(g->highlight_gain, FALSE);
+      gtk_widget_set_sensitive(g->ucs_saturation_balance, FALSE);
+      gtk_widget_set_sensitive(g->gamut_strength, FALSE);
+      gtk_widget_set_sensitive(g->highlight_corr, FALSE);
+      gtk_widget_set_sensitive(g->target_gamut, FALSE);
+      gtk_widget_set_sensitive(g->color_look, FALSE);
+      gtk_widget_set_sensitive(g->look_opacity, FALSE);
+      gtk_widget_set_tooltip_text(g->fusion, _("fuse this image stopped up/down a couple of times with itself, to "
+                                               "compress high dynamic range. expose for the highlights before use."));
+    }
+  }
+
+  // The graph scale slider (g->logbase) is shown for workflow_mode 0 only.
+  // Re-evaluated on a full refresh (w == NULL) or when the workflow changes.
+  const gboolean refresh_logbase = !w || w == g->workflow_mode;
+  if(refresh_logbase)
+  {
+    const gboolean display_workflow = (p->workflow_mode == 0);
+    if(display_workflow)
+      gtk_widget_show(g->logbase);
+    else
+      gtk_widget_hide(g->logbase);
+  }
+}
+
+void gui_update(dt_iop_module_t *self)
+{
+  dt_iop_basecurve_params_t *p = self->params;
+  dt_iop_basecurve_gui_data_t *g = self->gui_data;
+  // the fusion sub-controls are only visible while exposure fusion is enabled (exposure_fusion != 0)
+  gtk_widget_set_visible(g->exposure_step, p->exposure_fusion != 0);
+  gtk_widget_set_visible(g->exposure_bias, p->exposure_fusion != 0);
+  // copy the stored parameters into the workflow widgets. NOTE(review): these widgets come from *_from_params — confirm the explicit sets are needed
+  dt_bauhaus_slider_set(g->gamut_strength, p->gamut_strength);
+  dt_bauhaus_slider_set(g->highlight_corr, p->highlight_corr);
+  dt_bauhaus_combobox_set(g->target_gamut, p->target_gamut);
+  dt_bauhaus_combobox_set(g->workflow_mode, p->workflow_mode);
+  dt_bauhaus_slider_set(g->shadow_lift, p->shadow_lift);
+  dt_bauhaus_slider_set(g->highlight_gain, p->highlight_gain);
+  dt_bauhaus_slider_set(g->ucs_saturation_balance, p->ucs_saturation_balance);
+  dt_bauhaus_combobox_set(g->color_look, p->color_look);
+  dt_bauhaus_slider_set(g->look_opacity, p->look_opacity);
+  gui_changed(self, NULL, NULL); // w == NULL makes gui_changed() refresh visibility, sensitivity and the fusion tooltip
+
+  // gui curve is read directly from params during expose event.
+  gtk_widget_queue_draw(GTK_WIDGET(g->area));
 }
 
 static void logbase_callback(GtkWidget *slider, dt_iop_module_t *self)
@@ -2111,15 +2865,68 @@ void gui_init(dt_iop_module_t *self)
   g->selected = -1;
   g->loglogscale = 0;
 
-  g->area = GTK_DRAWING_AREA(dtgtk_drawing_area_new_with_height(0));
+  g->area = GTK_DRAWING_AREA(dt_ui_resize_wrap(NULL, DT_PIXEL_APPLY_DPI(100), "plugins/darkroom/basecurve/graph_height"));
   gtk_widget_set_tooltip_text(GTK_WIDGET(g->area), _("abscissa: input, ordinate: output. works on RGB channels"));
   g_object_set_data(G_OBJECT(g->area), "iop-instance", self);
   dt_action_define_iop(self, NULL, N_("curve"), GTK_WIDGET(g->area), NULL);
 
-  self->widget = dt_gui_vbox(g->area);
+  self->widget = dt_gui_vbox(GTK_WIDGET(g->area));
 
   g->cmb_preserve_colors = dt_bauhaus_combobox_from_params(self, "preserve_colors");
   gtk_widget_set_tooltip_text(g->cmb_preserve_colors, _("method to preserve colors when applying contrast"));
+  dt_gui_box_add(self->widget, g->cmb_preserve_colors); // NOTE(review): *_from_params widgets were not added explicitly before this patch — confirm they are not packed twice
+
+  g->workflow_mode = dt_bauhaus_combobox_from_params(self, "workflow_mode");
+  dt_bauhaus_combobox_add(g->workflow_mode, _("display"));
+  dt_bauhaus_combobox_add(g->workflow_mode, _("Kinematics (ACES-like)"));
+  dt_bauhaus_combobox_add(g->workflow_mode, _("Kinematics (Narkowicz)"));
+  gtk_widget_set_tooltip_text(g->workflow_mode, _("tone mapping method applied after the curve"));
+  dt_gui_box_add(self->widget, g->workflow_mode);
+
+  g->color_look = dt_bauhaus_combobox_from_params(self, "color_look");
+  dt_bauhaus_widget_set_label(g->color_look, NULL, N_("Color Look")); // labels are only marked with N_(), like the "scale for graph" label
+  dt_bauhaus_combobox_add(g->color_look, _("Neutral")); // entries are translated like those of the other comboboxes
+  dt_bauhaus_combobox_add(g->color_look, _("Natural look"));
+  dt_bauhaus_combobox_add(g->color_look, _("Portrait"));
+  dt_bauhaus_combobox_add(g->color_look, _("Vibrant"));
+  dt_bauhaus_combobox_add(g->color_look, _("Nature"));
+  dt_bauhaus_combobox_add(g->color_look, _("Blue Sky"));
+  dt_bauhaus_combobox_add(g->color_look, _("Soft Warm"));
+  dt_bauhaus_combobox_add(g->color_look, _("Soft"));
+  dt_bauhaus_combobox_add(g->color_look, _("Deep Cool"));
+  dt_bauhaus_combobox_add(g->color_look, _("Authentic Cinema"));
+  gtk_widget_set_tooltip_text(g->color_look, _("Apply a color style: Neutral (none), Portrait (skin tones), Nature (landscapes), Blue Sky (depth), Soft (organic), or Warm/Cool artistic tints."));
+  dt_gui_box_add(self->widget, g->color_look);
+
+  g->look_opacity = dt_bauhaus_slider_from_params(self, "look_opacity");
+  dt_bauhaus_widget_set_label(g->look_opacity, NULL, N_("Look Opacity"));
+  dt_bauhaus_slider_set_format(g->look_opacity, "%");
+  dt_bauhaus_slider_set_factor(g->look_opacity, 100.0);
+  gtk_widget_set_tooltip_text(g->look_opacity, _("Adjust the strength of the selected color style (10% to 100%)."));
+  dt_gui_box_add(self->widget, g->look_opacity);
+
+  g->highlight_gain = dt_bauhaus_slider_from_params(self, "highlight_gain");
+  dt_bauhaus_widget_set_label(g->highlight_gain, NULL, N_("highlight gain"));
+  gtk_widget_set_tooltip_text(g->highlight_gain, _("Adjusts the gain before tone mapping.\n"
+                                                   "Higher values push more data into highlights compression."));
+  dt_bauhaus_slider_set_soft_range(g->highlight_gain, 0.25, 1.75);
+  dt_bauhaus_slider_set_format(g->highlight_gain, "%");
+  dt_bauhaus_slider_set_factor(g->highlight_gain, 100.0);
+  dt_bauhaus_slider_set_offset(g->highlight_gain, -100.0); // presumably displays the default 1.0 as 0% (factor 100, offset -100) — confirm
+  dt_bauhaus_slider_set_default(g->highlight_gain, 1.0);
+  dt_gui_box_add(self->widget, g->highlight_gain);
+
+  g->shadow_lift = dt_bauhaus_slider_from_params(self, "shadow_lift");
+  dt_bauhaus_widget_set_label(g->shadow_lift, NULL, N_("shadow lift"));
+  gtk_widget_set_tooltip_text(g->shadow_lift, _("Adjusts the shadows brightness.\n"
+                                                 "Positive values lift shadows,\n"
+                                                 "while negative values darken them."));
+  dt_bauhaus_slider_set_soft_range(g->shadow_lift, 0.25, 1.75);
+  dt_bauhaus_slider_set_format(g->shadow_lift, "%");
+  dt_bauhaus_slider_set_factor(g->shadow_lift, 100.0);
+  dt_bauhaus_slider_set_offset(g->shadow_lift, -100.0); // same display mapping as the highlight gain slider
+  dt_bauhaus_slider_set_default(g->shadow_lift, 1.0);
+  dt_gui_box_add(self->widget, g->shadow_lift);
 
   g->fusion = dt_bauhaus_combobox_from_params(self, "exposure_fusion");
   dt_bauhaus_combobox_add(g->fusion, _("none"));
@@ -2127,12 +2934,15 @@ void gui_init(dt_iop_module_t *self)
   dt_bauhaus_combobox_add(g->fusion, _("three exposures"));
   gtk_widget_set_tooltip_text(g->fusion, _("fuse this image stopped up/down a couple of times with itself, to "
                                            "compress high dynamic range. expose for the highlights before use."));
+  gtk_widget_set_margin_bottom(g->fusion, DT_PIXEL_APPLY_DPI(10));
+  dt_gui_box_add(self->widget, g->fusion);
 
   g->exposure_step = dt_bauhaus_slider_from_params(self, "exposure_stops");
   dt_bauhaus_slider_set_digits(g->exposure_step, 3);
   gtk_widget_set_tooltip_text(g->exposure_step, _("how many stops to shift the individual exposures apart"));
   gtk_widget_set_no_show_all(g->exposure_step, TRUE);
   gtk_widget_set_visible(g->exposure_step, p->exposure_fusion != 0 ? TRUE : FALSE);
+  dt_gui_box_add(self->widget, g->exposure_step);
 
   // initially set to 1 (consistency with previous versions), but double-click resets to 0
   // to get a quick way to reach 0 with the mouse.
@@ -2143,6 +2953,54 @@ void gui_init(dt_iop_module_t *self)
                                                   "(-1: reduce highlight, +1: reduce shadows)"));
   gtk_widget_set_no_show_all(g->exposure_bias, TRUE);
   gtk_widget_set_visible(g->exposure_bias, p->exposure_fusion != 0 ? TRUE : FALSE);
+  dt_gui_box_add(self->widget, g->exposure_bias);
+
+  g->ucs_saturation_balance = dt_bauhaus_slider_from_params(self, "ucs_saturation_balance");
+  dt_bauhaus_widget_set_label(g->ucs_saturation_balance, NULL, _("balance saturation ucs"));
+  gtk_widget_set_tooltip_text(g->ucs_saturation_balance,
+                              _("Balances saturation between shadows and highlights (JzAzBz space).\n"
+                                " Move right to boost shadow saturation while taming highlights.\n"
+                                " Move left to boost highlight saturation while taming shadows.\n"
+                                " Ideal for making dark colors pop without clipping speculars."));
+  dt_bauhaus_slider_set_format(g->ucs_saturation_balance, "%");
+  dt_bauhaus_slider_set_factor(g->ucs_saturation_balance, 100.0);
+  dt_bauhaus_slider_set_soft_range(g->ucs_saturation_balance, -0.75, 0.75);
+  dt_bauhaus_slider_set_default(g->ucs_saturation_balance, 0.2);
+  dt_gui_box_add(self->widget, g->ucs_saturation_balance);
+
+  g->highlight_corr = dt_bauhaus_slider_from_params(self, "highlight_corr");
+  dt_bauhaus_widget_set_label(g->highlight_corr, NULL, _("Highlight Hue/Sat"));
+  dt_bauhaus_slider_set_format(g->highlight_corr, "%");
+  dt_bauhaus_slider_set_factor(g->highlight_corr, 100.0);
+  dt_bauhaus_slider_set_digits(g->highlight_corr, 1);
+  dt_bauhaus_slider_set_soft_range(g->highlight_corr, -1.0, 1.0);
+  dt_bauhaus_slider_set_default(g->highlight_corr, 0.0);
+  dt_bauhaus_slider_set_step(g->highlight_corr, 0.001);
+  gtk_widget_set_tooltip_text(g->highlight_corr, _("corrects hue and saturation in highlights to mitigate color shifts\n"
+                                                   "(e.g. salmon sunsets or magenta blues)"));
+  dt_gui_box_add(self->widget, g->highlight_corr);
+
+  g->target_gamut = dt_bauhaus_combobox_from_params(self, "target_gamut");
+  dt_bauhaus_combobox_add(g->target_gamut, "sRGB (Rec.709)");
+  dt_bauhaus_combobox_add(g->target_gamut, "AdobeRGB");
+  dt_bauhaus_combobox_add(g->target_gamut, "Rec.2020");
+  gtk_widget_set_tooltip_text(g->target_gamut, _("Select the destination color space (sRGB, AdobeRGB,\n" 
+                                                  "or Rec.2020). This sets the legal boundary for color saturation."));
+  dt_gui_box_add(self->widget, g->target_gamut);
+
+  g->gamut_strength = dt_bauhaus_slider_from_params(self, "gamut_strength");
+  dt_bauhaus_widget_set_label(g->gamut_strength, NULL, _("compression smoothness"));
+  gtk_widget_set_tooltip_text(g->gamut_strength,
+                              _("Defines how high in the highlights the compression starts.\n"
+                                " Lower values keep more saturation but may clip;\n"
+                                " higher values create a professional roll-off\n"
+                                " in the brightest colors without affecting midtones."));
+  dt_bauhaus_slider_set_format(g->gamut_strength, "%");
+  dt_bauhaus_slider_set_factor(g->gamut_strength, 100.0);
+  dt_bauhaus_slider_set_digits(g->gamut_strength, 1);
+  dt_bauhaus_slider_set_step(g->gamut_strength, 0.001);
+  dt_bauhaus_slider_set_soft_range(g->gamut_strength, 0.0, 1.0);
+  dt_gui_box_add(self->widget, g->gamut_strength);
 
   g->logbase = dt_bauhaus_slider_new_with_range(self, 0.0f, 40.0f, 0, 0.0f, 2);
   dt_bauhaus_widget_set_label(g->logbase, NULL, N_("scale for graph"));