From 3cfddb7acc7472d2eba6455500e3391280496417 Mon Sep 17 00:00:00 2001 From: Hanno Schwalm Date: Wed, 4 Feb 2026 18:18:45 +0100 Subject: [PATCH 1/9] OpenCL helper dt_opencl_local_buffer_opt() returns error code For readability and maintenance this returns a valid OpenCL error code instead of a gboolean. Some error codes fixed. --- src/common/bilateralcl.c | 2 +- src/common/gaussian.c | 2 +- src/common/interpolation.c | 2 +- src/common/nlmeans_core.c | 4 +-- src/common/opencl.c | 31 +++++++++---------- src/common/opencl.h | 9 +++--- src/iop/bloom.c | 4 +-- src/iop/colorreconstruction.c | 4 +-- src/iop/demosaicing/basics.c | 51 ++++++++++--------------------- src/iop/demosaicing/rcd.c | 16 ++++------ src/iop/demosaicing/vng.c | 16 ++++------ src/iop/demosaicing/xtrans.c | 56 ++++++++++------------------------- src/iop/denoiseprofile.c | 18 ++++------- src/iop/globaltonemap.c | 14 +++------ src/iop/highlights.c | 2 +- src/iop/highpass.c | 4 +-- src/iop/nlmeans.c | 4 +-- src/iop/sharpen.c | 4 +-- src/iop/soften.c | 4 +-- 19 files changed, 90 insertions(+), 157 deletions(-) diff --git a/src/common/bilateralcl.c b/src/common/bilateralcl.c index e93261a85ede..19b59a83a568 100644 --- a/src/common/bilateralcl.c +++ b/src/common/bilateralcl.c @@ -84,7 +84,7 @@ dt_bilateral_cl_t *dt_bilateral_init_cl(const int devid, .cellsize = 8 * sizeof(float) + sizeof(int), .overhead = 0, .sizex = 1 << 6, .sizey = 1 << 6 }; - if(!dt_opencl_local_buffer_opt(devid, darktable.opencl->bilateral->kernel_splat, &locopt)) + if(dt_opencl_local_buffer_opt(devid, darktable.opencl->bilateral->kernel_splat, &locopt) != CL_SUCCESS) { dt_print(DT_DEBUG_OPENCL, "[opencl_bilateral] can not identify resource limits for device %d in bilateral grid", devid); diff --git a/src/common/gaussian.c b/src/common/gaussian.c index 5b4dc202f5e3..12565630d9db 100644 --- a/src/common/gaussian.c +++ b/src/common/gaussian.c @@ -849,7 +849,7 @@ dt_gaussian_cl_t *dt_gaussian_init_cl(const int devid, .cellsize = channels * 
sizeof(float), .overhead = 0, .sizex = BLOCKSIZE, .sizey = BLOCKSIZE }; - if(dt_opencl_local_buffer_opt(devid, kernel_gaussian_transpose, &locopt)) + if(dt_opencl_local_buffer_opt(devid, kernel_gaussian_transpose, &locopt) == CL_SUCCESS) blocksize = MIN(locopt.sizex, locopt.sizey); else blocksize = 1; diff --git a/src/common/interpolation.c b/src/common/interpolation.c index 31afdd30bb70..59deae83a3c9 100644 --- a/src/common/interpolation.c +++ b/src/common/interpolation.c @@ -1335,7 +1335,7 @@ int dt_interpolation_resample_cl(const dt_interpolation_t *itor, .sizex = 1, .sizey = (1 << 16) * taps }; - if(dt_opencl_local_buffer_opt(devid, kernel, &locopt)) + if(dt_opencl_local_buffer_opt(devid, kernel, &locopt) == CL_SUCCESS) vblocksize = locopt.sizey; else vblocksize = 1; diff --git a/src/common/nlmeans_core.c b/src/common/nlmeans_core.c index b54387555d05..089f2c5a8d68 100644 --- a/src/common/nlmeans_core.c +++ b/src/common/nlmeans_core.c @@ -572,14 +572,14 @@ static void get_blocksizes( .cellsize = sizeof(float), .overhead = 0, .sizex = 1 << 16, .sizey = 1 }; - *h = dt_opencl_local_buffer_opt(devid, horiz_kernel, &hlocopt) ? hlocopt.sizex : 1; + *h = dt_opencl_local_buffer_opt(devid, horiz_kernel, &hlocopt) == CL_SUCCESS ? hlocopt.sizex : 1; dt_opencl_local_buffer_t vlocopt = (dt_opencl_local_buffer_t){ .xoffset = 1, .xfactor = 1, .yoffset = 2 * radius, .yfactor = 1, .cellsize = sizeof(float), .overhead = 0, .sizex = 1, .sizey = 1 << 16 }; - *v = dt_opencl_local_buffer_opt(devid, vert_kernel, &vlocopt) ? vlocopt.sizey : 1; + *v = dt_opencl_local_buffer_opt(devid, vert_kernel, &vlocopt) == CL_SUCCESS ? 
vlocopt.sizey : 1; return; } diff --git a/src/common/opencl.c b/src/common/opencl.c index afc2204d6f49..362026b4d285 100644 --- a/src/common/opencl.c +++ b/src/common/opencl.c @@ -2657,7 +2657,7 @@ int dt_opencl_get_max_work_item_sizes(const int dev, size_t *sizes) { dt_opencl_t *cl = darktable.opencl; - if(!cl->inited || dev < 0) return -1; + if(!cl->inited || dev < 0) return CL_DEVICE_NOT_AVAILABLE; return (cl->dlocl->symbols->dt_clGetDeviceInfo)(cl->dev[dev].devid, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * 3, sizes, NULL); @@ -2670,7 +2670,7 @@ int dt_opencl_get_work_group_limits(const int dev, unsigned long *localmemsize) { dt_opencl_t *cl = darktable.opencl; - if(!cl->inited || dev < 0) return -1; + if(!cl->inited || dev < 0) return CL_DEVICE_NOT_AVAILABLE; cl_ulong lmemsize; cl_int err = (cl->dlocl->symbols->dt_clGetDeviceInfo)(cl->dev[dev].devid, CL_DEVICE_LOCAL_MEM_SIZE, @@ -2693,7 +2693,7 @@ int dt_opencl_get_kernel_work_group_size(const int dev, const int kernel, size_t *kernelworkgroupsize) { - if(!_check_kernel(dev, kernel)) return -1; + if(!_check_kernel(dev, kernel)) return CL_DEVICE_NOT_AVAILABLE; dt_opencl_t *cl = darktable.opencl; return (cl->dlocl->symbols->dt_clGetKernelWorkGroupInfo)(cl->dev[dev].kernel[kernel], @@ -4076,13 +4076,13 @@ static int _nextpow2(const int n) // utility function to calculate optimal work group dimensions for a given kernel // taking device specific restrictions and local memory limitations into account -// returns TRUE in case of success -gboolean dt_opencl_local_buffer_opt(const int devid, - const int kernel, - dt_opencl_local_buffer_t *factors) +// returns CL_SUCCESS or an error code +cl_int dt_opencl_local_buffer_opt(const int devid, + const int kernel, + dt_opencl_local_buffer_t *factors) { dt_opencl_t *cl = darktable.opencl; - if(!cl->inited || devid < 0) return FALSE; + if(!cl->inited || devid < 0) return DT_OPENCL_NODEVICE; size_t maxsizes[3] = { 0 }; // the maximum dimensions for a work group size_t 
workgroupsize = 0; // the maximum number of items in a work group @@ -4098,10 +4098,8 @@ gboolean dt_opencl_local_buffer_opt(const int devid, *blocksizex = CLAMP(_nextpow2(*blocksizex), 1, 1 << 16); *blocksizey = CLAMP(_nextpow2(*blocksizey), 1, 1 << 16); - if(dt_opencl_get_work_group_limits - (devid, maxsizes, &workgroupsize, &localmemsize) == CL_SUCCESS - && dt_opencl_get_kernel_work_group_size - (devid, kernel, &kernelworkgroupsize) == CL_SUCCESS) + if(dt_opencl_get_work_group_limits(devid, maxsizes, &workgroupsize, &localmemsize) == CL_SUCCESS + && dt_opencl_get_kernel_work_group_size(devid, kernel, &kernelworkgroupsize) == CL_SUCCESS) { while(maxsizes[0] < *blocksizex || maxsizes[1] < *blocksizey @@ -4114,9 +4112,8 @@ gboolean dt_opencl_local_buffer_opt(const int devid, if(*blocksizex == 1 && *blocksizey == 1) { dt_print(DT_DEBUG_OPENCL, - "[dt_opencl_local_buffer_opt] no valid resource limits for device %d", - devid); - return FALSE; + "[dt_opencl_local_buffer_opt] no valid resource limits for device %d", devid); + return CL_INVALID_WORK_GROUP_SIZE; } if(*blocksizex > *blocksizey) @@ -4130,10 +4127,10 @@ gboolean dt_opencl_local_buffer_opt(const int devid, dt_print(DT_DEBUG_OPENCL, "[dt_opencl_local_buffer_opt] can not identify" " resource limits for device %d", devid); - return FALSE; + return CL_INVALID_WORK_GROUP_SIZE; } - return TRUE; + return CL_SUCCESS; } #endif diff --git a/src/common/opencl.h b/src/common/opencl.h index 901b2fe0f220..45f30f828ea6 100644 --- a/src/common/opencl.h +++ b/src/common/opencl.h @@ -595,10 +595,11 @@ cl_int dt_opencl_events_flush(const int devid, const gboolean reset); /** utility function to calculate optimal work group dimensions for a - * given kernel */ -gboolean dt_opencl_local_buffer_opt(const int devid, - const int kernel, - dt_opencl_local_buffer_t *factors); + given kernel, returns an error code +*/ +cl_int dt_opencl_local_buffer_opt(const int devid, + const int kernel, + dt_opencl_local_buffer_t *factors); /** 
utility functions handling device specific properties */ void dt_opencl_write_device_config(const int devid); diff --git a/src/iop/bloom.c b/src/iop/bloom.c index 81fd69808e40..d2116c8b611c 100644 --- a/src/iop/bloom.c +++ b/src/iop/bloom.c @@ -215,7 +215,7 @@ int process_cl(dt_iop_module_t *self, .sizex = 1 << 16, .sizey = 1 }; - if(dt_opencl_local_buffer_opt(devid, gd->kernel_bloom_hblur, &hlocopt)) + if(dt_opencl_local_buffer_opt(devid, gd->kernel_bloom_hblur, &hlocopt) == CL_SUCCESS) hblocksize = hlocopt.sizex; else hblocksize = 1; @@ -231,7 +231,7 @@ int process_cl(dt_iop_module_t *self, .sizex = 1, .sizey = 1 << 16 }; - if(dt_opencl_local_buffer_opt(devid, gd->kernel_bloom_vblur, &vlocopt)) + if(dt_opencl_local_buffer_opt(devid, gd->kernel_bloom_vblur, &vlocopt) == CL_SUCCESS) vblocksize = vlocopt.sizey; else vblocksize = 1; diff --git a/src/iop/colorreconstruction.c b/src/iop/colorreconstruction.c index 3217eaad49f6..2a0c64347c70 100644 --- a/src/iop/colorreconstruction.c +++ b/src/iop/colorreconstruction.c @@ -712,7 +712,7 @@ static dt_iop_colorreconstruct_bilateral_cl_t *dt_iop_colorreconstruct_bilateral .cellsize = 4 * sizeof(float) + sizeof(int), .overhead = 0, .sizex = 1 << 6, .sizey = 1 << 6 }; - if(dt_opencl_local_buffer_opt(devid, global->kernel_colorreconstruct_splat, &locopt)) + if(dt_opencl_local_buffer_opt(devid, global->kernel_colorreconstruct_splat, &locopt) == CL_SUCCESS) { blocksizex = locopt.sizex; blocksizey = locopt.sizey; @@ -852,7 +852,7 @@ static dt_iop_colorreconstruct_bilateral_cl_t *dt_iop_colorreconstruct_bilateral .cellsize = 4 * sizeof(float) + sizeof(int), .overhead = 0, .sizex = 1 << 6, .sizey = 1 << 6 }; - if(dt_opencl_local_buffer_opt(devid, global->kernel_colorreconstruct_splat, &locopt)) + if(dt_opencl_local_buffer_opt(devid, global->kernel_colorreconstruct_splat, &locopt) == CL_SUCCESS) { blocksizex = locopt.sizex; blocksizey = locopt.sizey; diff --git a/src/iop/demosaicing/basics.c b/src/iop/demosaicing/basics.c index 
948fe1e43a53..dffe0ac9117c 100644 --- a/src/iop/demosaicing/basics.c +++ b/src/iop/demosaicing/basics.c @@ -249,7 +249,7 @@ static int color_smoothing_cl(const dt_iop_module_t *self, const int devid = piece->pipe->devid; - cl_int err = DT_OPENCL_DEFAULT_ERROR; + cl_int err = CL_MEM_OBJECT_ALLOCATION_FAILURE; cl_mem dev_tmp = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4); if(dev_tmp == NULL) goto error; @@ -259,11 +259,8 @@ static int color_smoothing_cl(const dt_iop_module_t *self, .cellsize = 4 * sizeof(float), .overhead = 0, .sizex = 1 << 8, .sizey = 1 << 8 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_color_smoothing, &locopt)) - { - err = CL_INVALID_WORK_DIMENSION; - goto error; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_color_smoothing, &locopt); + if(err != CL_SUCCESS) goto error; // two buffer references for our ping-pong cl_mem dev_t1 = dev_out; @@ -360,11 +357,8 @@ static int green_equilibration_cl(const dt_iop_module_t *self, .cellsize = 2 * sizeof(float), .overhead = 0, .sizex = 1 << 4, .sizey = 1 << 4 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_green_eq_favg_reduce_first, &flocopt)) - { - err = CL_INVALID_WORK_DIMENSION; - goto error; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_green_eq_favg_reduce_first, &flocopt); + if(err != CL_SUCCESS) goto error; const size_t bwidth = ROUNDUP(width, flocopt.sizex); const size_t bheight = ROUNDUP(height, flocopt.sizey); @@ -392,11 +386,8 @@ static int green_equilibration_cl(const dt_iop_module_t *self, .cellsize = sizeof(float) * 2, .overhead = 0, .sizex = 1 << 16, .sizey = 1 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_green_eq_favg_reduce_second, &slocopt)) - { - err = CL_INVALID_WORK_DIMENSION; - goto error; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_green_eq_favg_reduce_second, &slocopt); + if(err != CL_SUCCESS) goto error; const int reducesize = MIN(DT_REDUCESIZE_MIN, ROUNDUP(bufsize, slocopt.sizex) / slocopt.sizex); @@ -451,11 
+442,8 @@ static int green_equilibration_cl(const dt_iop_module_t *self, .cellsize = 1 * sizeof(float), .overhead = 0, .sizex = 1 << 8, .sizey = 1 << 8 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_green_eq_lavg, &locopt)) - { - err = CL_INVALID_WORK_DIMENSION; - goto error; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_green_eq_lavg, &locopt); + if(err != CL_SUCCESS) goto error; const size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 }; const size_t local[3] = { locopt.sizex, locopt.sizey, 1 }; @@ -538,11 +526,8 @@ static int process_default_cl(const dt_iop_module_t *self, .cellsize = 1 * sizeof(float), .overhead = 0, .sizex = 1 << 8, .sizey = 1 << 8 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_pre_median, &locopt)) - { - err = CL_INVALID_WORK_DIMENSION; - goto error; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_pre_median, &locopt); + if(err != CL_SUCCESS) goto error; const size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 }; const size_t local[3] = { locopt.sizex, locopt.sizey, 1 }; @@ -561,11 +546,8 @@ static int process_default_cl(const dt_iop_module_t *self, .cellsize = sizeof(float) * 1, .overhead = 0, .sizex = 1 << 8, .sizey = 1 << 8 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_ppg_green, &locopt)) - { - err = CL_INVALID_WORK_DIMENSION; - goto error; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_ppg_green, &locopt); + if(err != CL_SUCCESS) goto error; const size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 }; const size_t local[3] = { locopt.sizex, locopt.sizey, 1 }; @@ -583,11 +565,8 @@ static int process_default_cl(const dt_iop_module_t *self, .cellsize = 4 * sizeof(float), .overhead = 0, .sizex = 1 << 8, .sizey = 1 << 8 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_ppg_redblue, &locopt)) - { - err = CL_INVALID_WORK_DIMENSION; - goto error; - } + err = dt_opencl_local_buffer_opt(devid, 
gd->kernel_ppg_redblue, &locopt); + if(err != CL_SUCCESS) goto error; const size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 }; const size_t local[3] = { locopt.sizex, locopt.sizey, 1 }; diff --git a/src/iop/demosaicing/rcd.c b/src/iop/demosaicing/rcd.c index eb548a794861..eaf026681f1e 100644 --- a/src/iop/demosaicing/rcd.c +++ b/src/iop/demosaicing/rcd.c @@ -626,11 +626,9 @@ static cl_int process_rcd_cl(dt_iop_module_t *self, .cellsize = sizeof(float) * 1, .overhead = 0, .sizex = 64, .sizey = 64 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_rcd_border_green, &locopt)) - { - err = CL_INVALID_WORK_DIMENSION; - goto error; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_rcd_border_green, &locopt); + if(err != CL_SUCCESS) goto error; + myborder = 32; size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 }; size_t local[3] = { locopt.sizex, locopt.sizey, 1 }; @@ -647,11 +645,9 @@ static cl_int process_rcd_cl(dt_iop_module_t *self, .cellsize = 4 * sizeof(float), .overhead = 0, .sizex = 64, .sizey = 64 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_rcd_border_redblue, &locopt)) - { - err = CL_INVALID_WORK_DIMENSION; - goto error; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_rcd_border_redblue, &locopt); + if(err != CL_SUCCESS) goto error; + myborder = 16; size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 }; size_t local[3] = { locopt.sizex, locopt.sizey, 1 }; diff --git a/src/iop/demosaicing/vng.c b/src/iop/demosaicing/vng.c index be1086ee1542..4f09623f27ba 100644 --- a/src/iop/demosaicing/vng.c +++ b/src/iop/demosaicing/vng.c @@ -482,11 +482,9 @@ static cl_int process_vng_cl(const dt_iop_module_t *self, .cellsize = 1 * sizeof(float), .overhead = 0, .sizex = 1 << 8, .sizey = 1 << 8 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_vng_lin_interpolate, &locopt)) - { - err = CL_INVALID_WORK_DIMENSION; - goto finish; - } + err = 
dt_opencl_local_buffer_opt(devid, gd->kernel_vng_lin_interpolate, &locopt); + if(err != CL_SUCCESS) goto finish; + size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 }; size_t local[3] = { locopt.sizex, locopt.sizey, 1 }; dt_opencl_set_kernel_args(devid, gd->kernel_vng_lin_interpolate, 0, @@ -508,11 +506,9 @@ static cl_int process_vng_cl(const dt_iop_module_t *self, .cellsize = 4 * sizeof(float), .overhead = 0, .sizex = 1 << 8, .sizey = 1 << 8 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_vng_interpolate, &locopt)) - { - err = CL_INVALID_WORK_DIMENSION; - goto finish; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_vng_interpolate, &locopt); + if(err != CL_SUCCESS) goto finish; + size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 }; size_t local[3] = { locopt.sizex, locopt.sizey, 1 }; dt_opencl_set_kernel_args(devid, gd->kernel_vng_interpolate, 0, diff --git a/src/iop/demosaicing/xtrans.c b/src/iop/demosaicing/xtrans.c index a3be483326f1..5b84ec098a2e 100644 --- a/src/iop/demosaicing/xtrans.c +++ b/src/iop/demosaicing/xtrans.c @@ -1733,11 +1733,8 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self, .cellsize = 1 * sizeof(float), .overhead = 0, .sizex = 1 << 8, .sizey = 1 << 8 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_green_minmax, &locopt_g1_g3)) - { - err = CL_INVALID_WORK_DIMENSION; - goto error; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_green_minmax, &locopt_g1_g3); + if(err != CL_SUCCESS) goto error; { const size_t sizes[3] = { ROUNDUP(width, locopt_g1_g3.sizex), ROUNDUP(height, locopt_g1_g3.sizey), 1 }; @@ -1757,11 +1754,8 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self, .cellsize = 4 * sizeof(float), .overhead = 0, .sizex = 1 << 8, .sizey = 1 << 8 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_interpolate_green, &locopt_g_interp)) - { - err = CL_INVALID_WORK_DIMENSION; - goto 
error; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_interpolate_green, &locopt_g_interp); + if(err != CL_SUCCESS) goto error; { const size_t sizes[3] = { ROUNDUP(width, locopt_g_interp.sizex), ROUNDUP(height, locopt_g_interp.sizey), 1 }; @@ -1811,11 +1805,8 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self, .cellsize = 4 * sizeof(float), .overhead = 0, .sizex = 1 << 8, .sizey = 1 << 8 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_solitary_green, &locopt_rb_g)) - { - err = CL_INVALID_WORK_DIMENSION; - goto error; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_solitary_green, &locopt_rb_g); + if(err != CL_SUCCESS) goto error; cl_mem *dev_trgb = dev_rgb; for(int d = 0, i = 1, h = 0; d < 6; d++, i ^= 1, h ^= 2) @@ -1841,11 +1832,8 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self, .cellsize = 4 * sizeof(float), .overhead = 0, .sizex = 1 << 8, .sizey = 1 << 8 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_red_and_blue, &locopt_rb_br)) - { - err = CL_INVALID_WORK_DIMENSION; - goto error; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_red_and_blue, &locopt_rb_br); + if(err != CL_SUCCESS) goto error; for(int d = 0; d < 4; d++) { @@ -1865,11 +1853,8 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self, .cellsize = 4 * sizeof(float), .overhead = 0, .sizex = 1 << 8, .sizey = 1 << 8 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_interpolate_twoxtwo, &locopt_g22)) - { - err = CL_INVALID_WORK_DIMENSION; - goto error; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_interpolate_twoxtwo, &locopt_g22); + if(err != CL_SUCCESS) goto error; for(int d = 0, n = 0; d < ndir; d += 2, n++) { @@ -1906,11 +1891,8 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self, .cellsize = 4 * sizeof(float), .overhead = 0, .sizex = 1 << 8, .sizey = 1 << 8 }; - 
if(!dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_differentiate, &locopt_diff)) - { - err = CL_INVALID_WORK_DIMENSION; - goto error; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_differentiate, &locopt_diff); + if(err != CL_SUCCESS) goto error; for(int d = 0; d < ndir; d++) { @@ -1957,11 +1939,8 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self, .cellsize = 1 * sizeof(float), .overhead = 0, .sizex = 1 << 8, .sizey = 1 << 8 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_homo_set, &locopt_homo)) - { - err = CL_INVALID_WORK_DIMENSION; - goto error; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_homo_set, &locopt_homo); + if(err != CL_SUCCESS) goto error; for(int d = 0; d < ndir; d++) { @@ -1987,11 +1966,8 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self, .cellsize = 1 * sizeof(float), .overhead = 0, .sizex = 1 << 8, .sizey = 1 << 8 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_homo_sum, &locopt_homo_sum)) - { - err = CL_INVALID_WORK_DIMENSION; - goto error; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_markesteijn_homo_sum, &locopt_homo_sum); + if(err != CL_SUCCESS) goto error; for(int d = 0; d < ndir; d++) { diff --git a/src/iop/denoiseprofile.c b/src/iop/denoiseprofile.c index 1f57e36269e7..73a5932af7b0 100644 --- a/src/iop/denoiseprofile.c +++ b/src/iop/denoiseprofile.c @@ -2126,7 +2126,7 @@ static int process_nlmeans_cl(dt_iop_module_t *self, .sizex = 1u << 16, .sizey = 1 }; - if(dt_opencl_local_buffer_opt(devid, gd->kernel_denoiseprofile_horiz, &hlocopt)) + if(dt_opencl_local_buffer_opt(devid, gd->kernel_denoiseprofile_horiz, &hlocopt) == CL_SUCCESS) hblocksize = hlocopt.sizex; else hblocksize = 1; @@ -2142,7 +2142,7 @@ static int process_nlmeans_cl(dt_iop_module_t *self, .sizex = 1, .sizey = 1u << 16 }; - if(dt_opencl_local_buffer_opt(devid, gd->kernel_denoiseprofile_vert, &vlocopt)) + if(dt_opencl_local_buffer_opt(devid, 
gd->kernel_denoiseprofile_vert, &vlocopt) == CL_SUCCESS) vblocksize = vlocopt.sizey; else vblocksize = 1; @@ -2355,11 +2355,8 @@ static int process_wavelets_cl(dt_iop_module_t *self, .sizex = 1u << 4, .sizey = 1u << 4 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_denoiseprofile_reduce_first, &flocopt)) - { - err = CL_INVALID_WORK_DIMENSION; - goto error; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_denoiseprofile_reduce_first, &flocopt); + if(err != CL_SUCCESS) goto error; const size_t bwidth = ROUNDUP(width, flocopt.sizex); const size_t bheight = ROUNDUP(height, flocopt.sizey); @@ -2376,11 +2373,8 @@ static int process_wavelets_cl(dt_iop_module_t *self, .sizex = 1u << 16, .sizey = 1 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_denoiseprofile_reduce_first, &slocopt)) - { - err = CL_INVALID_WORK_DIMENSION; - goto error; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_denoiseprofile_reduce_first, &slocopt); + if(err != CL_SUCCESS) goto error; const int reducesize = MIN(REDUCESIZE, ROUNDUP(bufsize, slocopt.sizex) / slocopt.sizex); err = CL_MEM_OBJECT_ALLOCATION_FAILURE; diff --git a/src/iop/globaltonemap.c b/src/iop/globaltonemap.c index 4364ba77b0f1..5de25273f116 100644 --- a/src/iop/globaltonemap.c +++ b/src/iop/globaltonemap.c @@ -383,11 +383,8 @@ int process_cl(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_ .cellsize = sizeof(float), .overhead = 0, .sizex = 1 << 4, .sizey = 1 << 4 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_pixelmax_first, &flocopt)) - { - err = CL_INVALID_WORK_DIMENSION; - goto finally; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_pixelmax_first, &flocopt); + if(err != CL_SUCCESS) goto finally; const size_t bwidth = ROUNDUP(width, flocopt.sizex); const size_t bheight = ROUNDUP(height, flocopt.sizey); @@ -399,11 +396,8 @@ int process_cl(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_ .cellsize = sizeof(float), .overhead = 0, .sizex = 1 << 16, .sizey = 
1 }; - if(!dt_opencl_local_buffer_opt(devid, gd->kernel_pixelmax_second, &slocopt)) - { - err = CL_INVALID_WORK_DIMENSION; - goto finally; - } + err = dt_opencl_local_buffer_opt(devid, gd->kernel_pixelmax_second, &slocopt); + if(err != CL_SUCCESS) goto finally; const int reducesize = MIN(REDUCESIZE, ROUNDUP(bufsize, slocopt.sizex) / slocopt.sizex); diff --git a/src/iop/highlights.c b/src/iop/highlights.c index 15b6ff094493..ec1b5372a09c 100644 --- a/src/iop/highlights.c +++ b/src/iop/highlights.c @@ -627,7 +627,7 @@ int process_cl(dt_iop_module_t *self, .cellsize = sizeof(float), .overhead = 0, .sizex = 1 << 8, .sizey = 1 << 8 }; - if(dt_opencl_local_buffer_opt(devid, gd->kernel_highlights_1f_lch_xtrans, &locopt)) + if(dt_opencl_local_buffer_opt(devid, gd->kernel_highlights_1f_lch_xtrans, &locopt) == CL_SUCCESS) { blocksizex = locopt.sizex; blocksizey = locopt.sizey; diff --git a/src/iop/highpass.c b/src/iop/highpass.c index bb2fbf58630f..c463197d4eb6 100644 --- a/src/iop/highpass.c +++ b/src/iop/highpass.c @@ -162,7 +162,7 @@ int process_cl(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_ .cellsize = sizeof(float), .overhead = 0, .sizex = 1 << 16, .sizey = 1 }; - if(dt_opencl_local_buffer_opt(devid, gd->kernel_highpass_hblur, &hlocopt)) + if(dt_opencl_local_buffer_opt(devid, gd->kernel_highpass_hblur, &hlocopt) == CL_SUCCESS) hblocksize = hlocopt.sizex; else hblocksize = 1; @@ -173,7 +173,7 @@ int process_cl(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_ .cellsize = sizeof(float), .overhead = 0, .sizex = 1, .sizey = 1 << 16 }; - if(dt_opencl_local_buffer_opt(devid, gd->kernel_highpass_vblur, &vlocopt)) + if(dt_opencl_local_buffer_opt(devid, gd->kernel_highpass_vblur, &vlocopt) == CL_SUCCESS) vblocksize = vlocopt.sizey; else vblocksize = 1; diff --git a/src/iop/nlmeans.c b/src/iop/nlmeans.c index e4030e04dcf3..dd3f084c10f7 100644 --- a/src/iop/nlmeans.c +++ b/src/iop/nlmeans.c @@ -248,7 +248,7 @@ int process_cl(dt_iop_module_t 
*self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_ .cellsize = sizeof(float), .overhead = 0, .sizex = 1 << 16, .sizey = 1 }; - if(dt_opencl_local_buffer_opt(devid, gd->kernel_nlmeans_horiz, &hlocopt)) + if(dt_opencl_local_buffer_opt(devid, gd->kernel_nlmeans_horiz, &hlocopt) == CL_SUCCESS) hblocksize = hlocopt.sizex; else hblocksize = 1; @@ -259,7 +259,7 @@ int process_cl(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_ .cellsize = sizeof(float), .overhead = 0, .sizex = 1, .sizey = 1 << 16 }; - if(dt_opencl_local_buffer_opt(devid, gd->kernel_nlmeans_vert, &vlocopt)) + if(dt_opencl_local_buffer_opt(devid, gd->kernel_nlmeans_vert, &vlocopt) == CL_SUCCESS) vblocksize = vlocopt.sizey; else vblocksize = 1; diff --git a/src/iop/sharpen.c b/src/iop/sharpen.c index 544909fc1fc9..5229a54c8446 100644 --- a/src/iop/sharpen.c +++ b/src/iop/sharpen.c @@ -171,7 +171,7 @@ int process_cl(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_ .cellsize = sizeof(float), .overhead = 0, .sizex = 1 << 16, .sizey = 1 }; - if(dt_opencl_local_buffer_opt(devid, gd->kernel_sharpen_hblur, &hlocopt)) + if(dt_opencl_local_buffer_opt(devid, gd->kernel_sharpen_hblur, &hlocopt) == CL_SUCCESS) hblocksize = hlocopt.sizex; else hblocksize = 1; @@ -182,7 +182,7 @@ int process_cl(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_ .cellsize = sizeof(float), .overhead = 0, .sizex = 1, .sizey = 1 << 16 }; - if(dt_opencl_local_buffer_opt(devid, gd->kernel_sharpen_vblur, &vlocopt)) + if(dt_opencl_local_buffer_opt(devid, gd->kernel_sharpen_vblur, &vlocopt) == CL_SUCCESS) vblocksize = vlocopt.sizey; else vblocksize = 1; diff --git a/src/iop/soften.c b/src/iop/soften.c index 1a28a30dbbfc..5c9c62f2c627 100644 --- a/src/iop/soften.c +++ b/src/iop/soften.c @@ -208,7 +208,7 @@ int process_cl(dt_iop_module_t *self, .sizex = 1 << 16, .sizey = 1 }; - if(dt_opencl_local_buffer_opt(devid, gd->kernel_soften_hblur, &hlocopt)) + if(dt_opencl_local_buffer_opt(devid, 
gd->kernel_soften_hblur, &hlocopt) == CL_SUCCESS) hblocksize = hlocopt.sizex; else hblocksize = 1; @@ -224,7 +224,7 @@ int process_cl(dt_iop_module_t *self, .sizex = 1, .sizey = 1 << 16 }; - if(dt_opencl_local_buffer_opt(devid, gd->kernel_soften_vblur, &vlocopt)) + if(dt_opencl_local_buffer_opt(devid, gd->kernel_soften_vblur, &vlocopt) == CL_SUCCESS) vblocksize = vlocopt.sizey; else vblocksize = 1; From e1ad1c0dc7083a6ab6a38c50b4aff946aec851eb Mon Sep 17 00:00:00 2001 From: Hanno Schwalm Date: Wed, 4 Feb 2026 16:39:59 +0100 Subject: [PATCH 2/9] Introduce OpenCL CLARGINT() and CLARGFLOAT() macros As parameters for OpenCL kernels must be presented as 'pointer-to-value' we use compound literals for readability. --- src/common/opencl.h | 6 ++++++ src/iop/demosaicing/dual.c | 3 +-- src/iop/demosaicing/rcd.c | 10 +++------- src/iop/demosaicing/xtrans.c | 38 ++++++++++++++++-------------------- 4 files changed, 27 insertions(+), 30 deletions(-) diff --git a/src/common/opencl.h b/src/common/opencl.h index 45f30f828ea6..a3aff0148148 100644 --- a/src/common/opencl.h +++ b/src/common/opencl.h @@ -359,6 +359,12 @@ int dt_opencl_get_kernel_work_group_size(const int dev, /** wrap opencl single argument */ #define CLARG(arg) CLWRAP(sizeof(arg), &arg) +/** wrap inline parameters as compound literals (C99) used for #defines / constants .. 
+ See https://en.cppreference.com/w/c/language/compound_literal.html +*/ +#define CLARGINT(arg) CLWRAP(sizeof(int), &((int){arg})) +#define CLARGFLOAT(arg) CLWRAP(sizeof(float), &((float){arg})) + /** wrap opencl argument array */ #define CLARRAY(num, arg) CLWRAP(num * sizeof(*arg), arg) diff --git a/src/iop/demosaicing/dual.c b/src/iop/demosaicing/dual.c index 0ce80430e3f2..cf39c210fb62 100644 --- a/src/iop/demosaicing/dual.c +++ b/src/iop/demosaicing/dual.c @@ -126,9 +126,8 @@ int dual_demosaic_cl(const dt_iop_module_t *self, CLARG(mask), CLARG(tmp), CLARG(width), CLARG(height)); if(err != CL_SUCCESS) goto finish; - const int detail = 1; err = dt_opencl_enqueue_kernel_2d_args(devid, darktable.opencl->blendop->kernel_calc_blend, width, height, - CLARG(tmp), CLARG(mask), CLARG(width), CLARG(height), CLARG(contrastf), CLARG(detail)); + CLARG(tmp), CLARG(mask), CLARG(width), CLARG(height), CLARG(contrastf), CLARGINT(1)); if(err != CL_SUCCESS) goto finish; err = dt_gaussian_fast_blur_cl_buffer(devid, mask, tmp, width, height, 2.0f, 1, 0.0f, 1.0f); diff --git a/src/iop/demosaicing/rcd.c b/src/iop/demosaicing/rcd.c index eaf026681f1e..30b82e40eb7d 100644 --- a/src/iop/demosaicing/rcd.c +++ b/src/iop/demosaicing/rcd.c @@ -615,7 +615,6 @@ static cl_int process_rcd_cl(dt_iop_module_t *self, dev_tmp = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4); if(dev_tmp == NULL) goto error; - int myborder = 3; err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_border_interpolate, width, height, CLARG(dev_in), CLARG(dev_tmp), CLARG(width), CLARG(height), CLARG(filters)); if(err != CL_SUCCESS) goto error; @@ -629,12 +628,11 @@ static cl_int process_rcd_cl(dt_iop_module_t *self, err = dt_opencl_local_buffer_opt(devid, gd->kernel_rcd_border_green, &locopt); if(err != CL_SUCCESS) goto error; - myborder = 32; size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 }; size_t local[3] = { locopt.sizex, locopt.sizey, 1 }; 
dt_opencl_set_kernel_args(devid, gd->kernel_rcd_border_green, 0, CLARG(dev_in), CLARG(dev_tmp), CLARG(width), CLARG(height), CLARG(filters), CLLOCAL(sizeof(float) * (locopt.sizex + 2*3) * (locopt.sizey + 2*3)), - CLARG(myborder)); + CLARGINT(32)); err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_rcd_border_green, sizes, local); if(err != CL_SUCCESS) goto error; } @@ -648,12 +646,11 @@ static cl_int process_rcd_cl(dt_iop_module_t *self, err = dt_opencl_local_buffer_opt(devid, gd->kernel_rcd_border_redblue, &locopt); if(err != CL_SUCCESS) goto error; - myborder = 16; size_t sizes[3] = { ROUNDUP(width, locopt.sizex), ROUNDUP(height, locopt.sizey), 1 }; size_t local[3] = { locopt.sizex, locopt.sizey, 1 }; dt_opencl_set_kernel_args(devid, gd->kernel_rcd_border_redblue, 0, CLARG(dev_tmp), CLARG(dev_out), CLARG(width), CLARG(height), CLARG(filters), CLLOCAL(sizeof(float) * 4 * (locopt.sizex + 2) * (locopt.sizey + 2)), - CLARG(myborder)); + CLARGINT(16)); err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_rcd_border_redblue, sizes, local); if(err != CL_SUCCESS) goto error; } @@ -722,9 +719,8 @@ static cl_int process_rcd_cl(dt_iop_module_t *self, scaler = dt_iop_get_processed_maximum(piece); // write output - myborder = RCD_MARGIN; err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_rcd_write_output, width, height, - CLARG(dev_out), CLARG(rgb0), CLARG(rgb1), CLARG(rgb2), CLARG(width), CLARG(height), CLARG(scaler), CLARG(myborder)); + CLARG(dev_out), CLARG(rgb0), CLARG(rgb1), CLARG(rgb2), CLARG(width), CLARG(height), CLARG(scaler), CLARGINT(RCD_MARGIN)); error: dt_opencl_release_mem_object(dev_tmp); diff --git a/src/iop/demosaicing/xtrans.c b/src/iop/demosaicing/xtrans.c index 5b84ec098a2e..e0d329318642 100644 --- a/src/iop/demosaicing/xtrans.c +++ b/src/iop/demosaicing/xtrans.c @@ -21,6 +21,10 @@ // tile size, optimized to keep data in L2 cache #define TS 122 +#define PAD_G1_G3 3 +#define PAD_G_INTERP 3 +#define PAD_G_RECALC 6 + /** Lookup for 
allhex[], making sure that row/col aren't negative **/ static inline const short *_hexmap(const int row, const int col, @@ -186,13 +190,12 @@ static void xtrans_markesteijn_interpolate(float *out, // and g3 values to the min/max of green pixels surrounding the // pair. Use a 3 pixel border as gmin/gmax is used by // interpolate green which has a 3 pixel border. - const int pad_g1_g3 = 3; - for(int row = top + pad_g1_g3; row < mrow - pad_g1_g3; row++) + for(int row = top + PAD_G1_G3; row < mrow - PAD_G1_G3; row++) { // setting max to 0.0f signifies that this is a new pair, which // requires a new min/max calculation of its neighboring greens float min = FLT_MAX, max = 0.0f; - for(int col = left + pad_g1_g3; col < mcol - pad_g1_g3; col++) + for(int col = left + PAD_G1_G3; col < mcol - PAD_G1_G3; col++) { // if in row of horizontal red & blue pairs (or processing // vertical red & blue pairs near image bottom), reset min/max @@ -238,9 +241,8 @@ static void xtrans_markesteijn_interpolate(float *out, /* Interpolate green horizontally, vertically, and along both diagonals: */ // need a 3 pixel border here as 3*hex[] can have a 3 unit offset - const int pad_g_interp = 3; - for(int row = top + pad_g_interp; row < mrow - pad_g_interp; row++) - for(int col = left + pad_g_interp; col < mcol - pad_g_interp; col++) + for(int row = top + PAD_G_INTERP; row < mrow - PAD_G_INTERP; row++) + for(int col = left + PAD_G_INTERP; col < mcol - PAD_G_INTERP; col++) { float color[8]; const int f = FCNxtrans(row, col, xtrans); @@ -274,9 +276,8 @@ static void xtrans_markesteijn_interpolate(float *out, /* Recalculate green from interpolated values of closer pixels: */ if(pass) { - const int pad_g_recalc = 6; - for(int row = top + pad_g_recalc; row < mrow - pad_g_recalc; row++) - for(int col = left + pad_g_recalc; col < mcol - pad_g_recalc; col++) + for(int row = top + PAD_G_RECALC; row < mrow - PAD_G_RECALC; row++) + for(int col = left + PAD_G_RECALC; col < mcol - PAD_G_RECALC; col++) { const 
int f = FCNxtrans(row, col, xtrans); if(f == 1) continue; @@ -1227,13 +1228,12 @@ static void xtrans_fdc_interpolate(float *out, // and g3 values to the min/max of green pixels surrounding the // pair. Use a 3 pixel border as gmin/gmax is used by // interpolate green which has a 3 pixel border. - const int pad_g1_g3 = 3; - for(int row = top + pad_g1_g3; row < mrow - pad_g1_g3; row++) + for(int row = top + PAD_G1_G3; row < mrow - PAD_G1_G3; row++) { // setting max to 0.0f signifies that this is a new pair, which // requires a new min/max calculation of its neighboring greens float min = FLT_MAX, max = 0.0f; - for(int col = left + pad_g1_g3; col < mcol - pad_g1_g3; col++) + for(int col = left + PAD_G1_G3; col < mcol - PAD_G1_G3; col++) { // if in row of horizontal red & blue pairs (or processing // vertical red & blue pairs near image bottom), reset min/max @@ -1279,9 +1279,8 @@ static void xtrans_fdc_interpolate(float *out, /* Interpolate green horizontally, vertically, and along both diagonals: */ // need a 3 pixel border here as 3*hex[] can have a 3 unit offset - const int pad_g_interp = 3; - for(int row = top + pad_g_interp; row < mrow - pad_g_interp; row++) - for(int col = left + pad_g_interp; col < mcol - pad_g_interp; col++) + for(int row = top + PAD_G_INTERP; row < mrow - PAD_G_INTERP; row++) + for(int col = left + PAD_G_INTERP; col < mcol - PAD_G_INTERP; col++) { float color[8]; const int f = FCNxtrans(row, col, xtrans); @@ -1727,7 +1726,6 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self, } // find minimum and maximum allowed green values of red/blue pixel pairs - const int pad_g1_g3 = 3; dt_opencl_local_buffer_t locopt_g1_g3 = (dt_opencl_local_buffer_t){ .xoffset = 2*3, .xfactor = 1, .yoffset = 2*3, .yfactor = 1, .cellsize = 1 * sizeof(float), .overhead = 0, @@ -1741,14 +1739,13 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self, const size_t local[3] = { locopt_g1_g3.sizex, locopt_g1_g3.sizey, 1 }; 
dt_opencl_set_kernel_args(devid, gd->kernel_markesteijn_green_minmax, 0, CLARG(dev_rgb[0]), CLARG(dev_gminmax), - CLARG(width), CLARG(height), CLARG(pad_g1_g3), CLARRAY(2, sgreen), + CLARG(width), CLARG(height), CLARGINT(PAD_G1_G3), CLARRAY(2, sgreen), CLARG(dev_xtrans), CLARG(dev_allhex), CLLOCAL(sizeof(float) * (locopt_g1_g3.sizex + 2*3) * (locopt_g1_g3.sizey + 2*3))); err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_markesteijn_green_minmax, sizes, local); if(err != CL_SUCCESS) goto error; } // interpolate green horizontally, vertically, and along both diagonals - const int pad_g_interp = 3; dt_opencl_local_buffer_t locopt_g_interp = (dt_opencl_local_buffer_t){ .xoffset = 2*6, .xfactor = 1, .yoffset = 2*6, .yfactor = 1, .cellsize = 4 * sizeof(float), .overhead = 0, @@ -1763,7 +1760,7 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self, dt_opencl_set_kernel_args(devid, gd->kernel_markesteijn_interpolate_green, 0, CLARG(dev_rgb[0]), CLARG(dev_rgb[1]), CLARG(dev_rgb[2]), CLARG(dev_rgb[3]), CLARG(dev_gminmax), CLARG(width), CLARG(height), - CLARG(pad_g_interp), CLARRAY(2, sgreen), CLARG(dev_xtrans), + CLARGINT(PAD_G_INTERP), CLARRAY(2, sgreen), CLARG(dev_xtrans), CLARG(dev_allhex), CLLOCAL(sizeof(float) * 4 * (locopt_g_interp.sizex + 2*6) * (locopt_g_interp.sizey + 2*6))); err = dt_opencl_enqueue_kernel_2d_with_local(devid, gd->kernel_markesteijn_interpolate_green, sizes, local); if(err != CL_SUCCESS) goto error; @@ -1790,10 +1787,9 @@ static cl_int process_markesteijn_cl(const dt_iop_module_t *self, if(pass) { // recalculate green from interpolated values of closer pixels - const int pad_g_recalc = 6; err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_markesteijn_recalculate_green, width, height, CLARG(dev_rgb[0]), CLARG(dev_rgb[1]), CLARG(dev_rgb[2]), CLARG(dev_rgb[3]), CLARG(dev_gminmax), - CLARG(width), CLARG(height), CLARG(pad_g_recalc), CLARRAY(2, sgreen), + CLARG(width), CLARG(height), CLARGINT(PAD_G_RECALC), CLARRAY(2, 
sgreen), CLARG(dev_xtrans), CLARG(dev_allhex)); if(err != CL_SUCCESS) goto error; } From bd74658be9144ef448acf543dd9786563e9a9ffc Mon Sep 17 00:00:00 2001 From: Hanno Schwalm Date: Wed, 4 Feb 2026 16:43:15 +0100 Subject: [PATCH 3/9] OpenCL basecurve maintenance - kernels called with the _args() variant - a missing mem allocation check added - using CLARGFLOAT --- src/iop/basecurve.c | 32 +++++++++++--------------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/src/iop/basecurve.c b/src/iop/basecurve.c index a222747199a6..54eeb94e7533 100644 --- a/src/iop/basecurve.c +++ b/src/iop/basecurve.c @@ -1,6 +1,6 @@ /* This file is part of darktable, - Copyright (C) 2010-2025 darktable developers. + Copyright (C) 2010-2026 darktable developers. darktable is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -920,7 +920,7 @@ int process_cl_lut(dt_iop_module_t *self, cl_mem dev_m = NULL; cl_mem dev_coeffs = NULL; - cl_int err = DT_OPENCL_DEFAULT_ERROR; + cl_int err = CL_MEM_OBJECT_ALLOCATION_FAILURE; cl_mem dev_profile_info = NULL; cl_mem dev_profile_lut = NULL; @@ -933,36 +933,26 @@ int process_cl_lut(dt_iop_module_t *self, const int height = roi_in->height; const int preserve_colors = d->preserve_colors; - const float mul = 1.0f; - - size_t sizes[] = { ROUNDUPDWD(width, devid), ROUNDUPDHT(height, devid), 1 }; dev_m = dt_opencl_copy_host_to_device(devid, d->table, 256, 256, sizeof(float)); - if(dev_m == NULL) goto error; + dev_coeffs = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 3, d->unbounded_coeffs); + if(!dev_m || !dev_coeffs) goto error; err = dt_ioppr_build_iccprofile_params_cl(work_profile, devid, &profile_info_cl, &profile_lut_cl, &dev_profile_info, &dev_profile_lut); if(err != CL_SUCCESS) goto error; - dev_coeffs = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 3, d->unbounded_coeffs); - - if(dev_coeffs == NULL) goto error; - // read 
data/kernels/basecurve.cl for a description of "legacy" vs current // Conditional is moved outside of the OpenCL operations for performance. if(d->preserve_colors == DT_RGB_NORM_NONE) - { - dt_opencl_set_kernel_args(devid, gd->kernel_basecurve_legacy_lut, 0, CLARG(dev_in), CLARG(dev_out), - CLARG(width), CLARG(height), CLARG(mul), CLARG(dev_m), CLARG(dev_coeffs)); - err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_basecurve_legacy_lut, sizes); - } + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_basecurve_legacy_lut, width, height, + CLARG(dev_in), CLARG(dev_out), + CLARG(width), CLARG(height), CLARGFLOAT(1.0f), CLARG(dev_m), CLARG(dev_coeffs)); else - { - //FIXME: There are still conditionals on d->preserve_colors within this flow that could impact performance - dt_opencl_set_kernel_args(devid, gd->kernel_basecurve_lut, 0, CLARG(dev_in), CLARG(dev_out), CLARG(width), - CLARG(height), CLARG(mul), CLARG(dev_m), CLARG(dev_coeffs), CLARG(preserve_colors), CLARG(dev_profile_info), + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_basecurve_lut, width, height, + CLARG(dev_in), CLARG(dev_out), + CLARG(width), CLARG(height), + CLARGFLOAT(1.0f), CLARG(dev_m), CLARG(dev_coeffs), CLARG(preserve_colors), CLARG(dev_profile_info), CLARG(dev_profile_lut), CLARG(use_work_profile)); - err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_basecurve_lut, sizes); - } error: dt_opencl_release_mem_object(dev_m); From 495d748d98eb01023a3ea9c146cb798b7fccbaf4 Mon Sep 17 00:00:00 2001 From: Hanno Schwalm Date: Wed, 4 Feb 2026 08:08:02 +0100 Subject: [PATCH 4/9] OpenCL bilateral and colorreconstruct maintenance - deduplicated code for atomic floats to common.h preparation for better support also for modern devices instead of that workaround - use _args() variants for kernel calls in bilateral - constify in both related kernels --- data/kernels/bilateral.cl | 103 +++++++++------------------- data/kernels/colorreconstruction.cl | 67 ++++-------------- 
data/kernels/common.h | 38 ++++++++++ src/common/bilateralcl.c | 33 ++++----- 4 files changed, 95 insertions(+), 146 deletions(-) diff --git a/data/kernels/bilateral.cl b/data/kernels/bilateral.cl index 3c8d7c45dde1..8c48678bbbe8 100644 --- a/data/kernels/bilateral.cl +++ b/data/kernels/bilateral.cl @@ -1,6 +1,6 @@ /* This file is part of darktable, - copyright (c) 2012-2025 darktable developers. + copyright (c) 2012-2026 darktable developers. darktable is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -30,45 +30,6 @@ image_to_grid( clamp(p.z/sigma.z, 0.0f, size.z-1.0f), 0.0f); } -void -atomic_add_f( - global float *val, - const float delta) -{ -#ifdef NVIDIA_SM_20 - // buys me another 3x--10x over the `algorithmic' improvements in the splat kernel below, - // depending on configuration (sigma_s and sigma_r) - float res = 0; - asm volatile ("atom.global.add.f32 %0, [%1], %2;" : "=f"(res) : "l"(val), "f"(delta)); - -#else - union - { - float f; - unsigned int i; - } - old_val; - union - { - float f; - unsigned int i; - } - new_val; - - global volatile unsigned int *ival = (global volatile unsigned int *)val; - - do - { - // the following is equivalent to old_val.f = *val. however, as according to the opencl standard - // we can not rely on global buffer val to be consistently cached (relaxed memory consistency) we - // access it via a slower but consistent atomic operation. 
- old_val.i = atomic_add(ival, 0); - new_val.f = old_val.f + delta; - } - while (atomic_cmpxchg (ival, old_val.i, new_val.i) != old_val.i); -#endif -} - kernel void zero( global float *grid, @@ -102,29 +63,29 @@ splat( const int j = get_local_id(1); int li = lszx*j + i; - int4 size = (int4)(sizex, sizey, sizez, 0); - float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0); + const int4 size = (int4)(sizex, sizey, sizez, 0); + const float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0); int ox = 1; - int oy = size.x; - int oz = size.y*size.x; + const int oy = size.x; + const int oz = size.y*size.x; if(x < width && y < height) { // splat into downsampled grid const float4 pixel = read_imagef (in, samplerc, (int2)(x, y)); - float L = pixel.x; - float4 p = (float4)(x, y, L, 0); - float4 gridp = image_to_grid(p, size, sigma); - int4 xi = min(size - 2, (int4)(gridp.x, gridp.y, gridp.z, 0)); - float fx = gridp.x - xi.x; - float fy = gridp.y - xi.y; - float fz = gridp.z - xi.z; + const float L = pixel.x; + const float4 p = (float4)(x, y, L, 0.0f); + const float4 gridp = image_to_grid(p, size, sigma); + const int4 xi = min(size - 2, (int4)(gridp.x, gridp.y, gridp.z, 0)); + const float fx = gridp.x - xi.x; + const float fy = gridp.y - xi.y; + const float fz = gridp.z - xi.z; // first accumulate into local memory gi[li] = xi.x + oy*xi.y + oz*xi.z; - float contrib = 100.0f/(sigma_s*sigma_s); + const float contrib = 100.0f/(sigma_s*sigma_s); li *= 8; accum[li++] = contrib * (1.0f-fx) * (1.0f-fy) * (1.0f-fz); accum[li++] = contrib * ( fx) * (1.0f-fy) * (1.0f-fz); @@ -301,18 +262,18 @@ slice_to_output( const int oy = sizex; const int oz = sizey*sizex; - int4 size = (int4)(sizex, sizey, sizez, 0); - float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0); + const int4 size = (int4)(sizex, sizey, sizez, 0); + const float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0); - float4 pixel = read_imagef (in, samplerc, (int2)(x, y)); + const float4 pixel = read_imagef (in, samplerc, (int2)(x, 
y)); float4 pixel2 = read_imagef (target, samplerc, (int2)(x, y)); - float L = pixel.x; - float4 p = (float4)(x, y, L, 0); - float4 gridp = image_to_grid(p, size, sigma); - int4 gridi = min(size - 2, (int4)(gridp.x, gridp.y, gridp.z, 0)); - float fx = gridp.x - gridi.x; - float fy = gridp.y - gridi.y; - float fz = gridp.z - gridi.z; + const float L = pixel.x; + const float4 p = (float4)(x, y, L, 0); + const float4 gridp = image_to_grid(p, size, sigma); + const int4 gridi = min(size - 2, (int4)(gridp.x, gridp.y, gridp.z, 0)); + const float fx = gridp.x - gridi.x; + const float fy = gridp.y - gridi.y; + const float fz = gridp.z - gridi.z; // trilinear lookup (wouldn't read/write access to 3d textures be cool) // could actually use an array of 2d textures, these only require opencl 1.2 @@ -354,17 +315,17 @@ slice( const int oy = sizex; const int oz = sizey*sizex; - int4 size = (int4)(sizex, sizey, sizez, 0); - float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0); + const int4 size = (int4)(sizex, sizey, sizez, 0); + const float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0); float4 pixel = read_imagef (in, samplerc, (int2)(x, y)); - float L = pixel.x; - float4 p = (float4)(x, y, L, 0); - float4 gridp = image_to_grid(p, size, sigma); - int4 gridi = min(size - 2, (int4)(gridp.x, gridp.y, gridp.z, 0)); - float fx = gridp.x - gridi.x; - float fy = gridp.y - gridi.y; - float fz = gridp.z - gridi.z; + const float L = pixel.x; + const float4 p = (float4)(x, y, L, 0); + const float4 gridp = image_to_grid(p, size, sigma); + const int4 gridi = min(size - 2, (int4)(gridp.x, gridp.y, gridp.z, 0)); + const float fx = gridp.x - gridi.x; + const float fy = gridp.y - gridi.y; + const float fz = gridp.z - gridi.z; // trilinear lookup (wouldn't read/write access to 3d textures be cool) // could actually use an array of 2d textures, these only require opencl 1.2 diff --git a/data/kernels/colorreconstruction.cl b/data/kernels/colorreconstruction.cl index 32a291414deb..c7a582fae39d 
100644 --- a/data/kernels/colorreconstruction.cl +++ b/data/kernels/colorreconstruction.cl @@ -1,7 +1,6 @@ /* This file is part of darktable, - copyright (c) 2012 johannes hanika. - copyright (c) 2015 Ulrich Pegelow. + Copyright (C) 2012-2026 darktable developers. darktable is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -48,46 +47,6 @@ grid_rescale( return convert_float2(roixy + pxy) * scale - convert_float2(bxy); } - -void -atomic_add_f( - global float *val, - const float delta) -{ -#ifdef NVIDIA_SM_20 - // buys me another 3x--10x over the `algorithmic' improvements in the splat kernel below, - // depending on configuration (sigma_s and sigma_r) - float res = 0; - asm volatile ("atom.global.add.f32 %0, [%1], %2;" : "=f"(res) : "l"(val), "f"(delta)); - -#else - union - { - float f; - unsigned int i; - } - old_val; - union - { - float f; - unsigned int i; - } - new_val; - - global volatile unsigned int *ival = (global volatile unsigned int *)val; - - do - { - // the following is equivalent to old_val.f = *val. however, as according to the opencl standard - // we can not rely on global buffer val to be consistently cached (relaxed memory consistency) we - // access it via a slower but consistent atomic operation. 
- old_val.i = atomic_add(ival, 0); - new_val.f = old_val.f + delta; - } - while (atomic_cmpxchg (ival, old_val.i, new_val.i) != old_val.i); -#endif -} - kernel void colorreconstruction_zero( global float *grid, @@ -125,8 +84,8 @@ colorreconstruction_splat( const int j = get_local_id(1); int li = lszx*j + i; - int4 size = (int4)(sizex, sizey, sizez, 0); - float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0); + const int4 size = (int4)(sizex, sizey, sizez, 0); + const float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0); const float4 pixel = read_imagef (in, samplerc, (int2)(x, y)); float weight, m; @@ -153,11 +112,11 @@ colorreconstruction_splat( if(x < width && y < height) { // splat into downsampled grid - float4 p = (float4)(x, y, pixel.x, 0); - float4 gridp = image_to_grid(p, size, sigma); + const float4 p = (float4)(x, y, pixel.x, 0); + const float4 gridp = image_to_grid(p, size, sigma); // closest integer splatting: - int4 xi = clamp(convert_int4(round(gridp)), 0, size - 1); + const int4 xi = clamp(convert_int4(round(gridp)), 0, size - 1); // first accumulate into local memory gi[li] = xi.x + size.x*xi.y + size.x*size.y*xi.z; @@ -280,15 +239,15 @@ colorreconstruction_slice( const int oy = sizex; const int oz = sizey*sizex; - int4 size = (int4)(sizex, sizey, sizez, 0); - float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0); + const int4 size = (int4)(sizex, sizey, sizez, 0); + const float4 sigma = (float4)(sigma_s, sigma_s, sigma_r, 0); float4 pixel = read_imagef (in, samplerc, (int2)(x, y)); - float blend = clipf(20.0f / threshold * pixel.x - 19.0f); - float2 pxy = grid_rescale((int2)(x, y), roixy, bxy, scale); - float4 p = (float4)(pxy.x, pxy.y, pixel.x, 0); - float4 gridp = image_to_grid(p, size, sigma); - int4 gridi = min(size - 2, (int4)(gridp.x, gridp.y, gridp.z, 0)); + const float blend = clipf(20.0f / threshold * pixel.x - 19.0f); + const float2 pxy = grid_rescale((int2)(x, y), roixy, bxy, scale); + const float4 p = (float4)(pxy.x, pxy.y, pixel.x, 
0); + const float4 gridp = image_to_grid(p, size, sigma); + const int4 gridi = min(size - 2, (int4)(gridp.x, gridp.y, gridp.z, 0)); float fx = gridp.x - gridi.x; float fy = gridp.y - gridi.y; float fz = gridp.z - gridi.z; diff --git a/data/kernels/common.h b/data/kernels/common.h index 83bdf28ee4ee..234f00b44149 100644 --- a/data/kernels/common.h +++ b/data/kernels/common.h @@ -85,6 +85,44 @@ fcol(const int row, const int col, const unsigned int filters, global const unsi : filters >> ((((row) << 1 & 14) + ((col) & 1)) << 1) & 3; } +void +atomic_add_f( + global float *val, + const float delta) +{ +#ifdef NVIDIA_SM_20 + // buys me another 3x--10x over the `algorithmic' improvements in the splat kernel below, + // depending on configuration (sigma_s and sigma_r) + float res = 0; + asm volatile ("atom.global.add.f32 %0, [%1], %2;" : "=f"(res) : "l"(val), "f"(delta)); + +#else + union + { + float f; + unsigned int i; + } + old_val; + union + { + float f; + unsigned int i; + } + new_val; + + global volatile unsigned int *ival = (global volatile unsigned int *)val; + + do + { + // the following is equivalent to old_val.f = *val. however, as according to the opencl standard + // we can not rely on global buffer val to be consistently cached (relaxed memory consistency) we + // access it via a slower but consistent atomic operation. 
+ old_val.i = atomic_add(ival, 0); + new_val.f = old_val.f + delta; + } + while (atomic_cmpxchg (ival, old_val.i, new_val.i) != old_val.i); +#endif +} static inline float dt_fast_hypot(const float x, const float y) diff --git a/src/common/bilateralcl.c b/src/common/bilateralcl.c index 19b59a83a568..5932d41d6f1d 100644 --- a/src/common/bilateralcl.c +++ b/src/common/bilateralcl.c @@ -166,41 +166,32 @@ cl_int dt_bilateral_splat_cl(dt_bilateral_cl_t *b, cl_mem in) cl_int dt_bilateral_blur_cl(dt_bilateral_cl_t *b) { - size_t sizes[3] = { 0, 0, 1 }; - cl_int err = dt_opencl_enqueue_copy_buffer_to_buffer(b->devid, b->dev_grid, b->dev_grid_tmp, 0, 0, sizeof(float) * b->size_x * b->size_y * b->size_z); if(err != CL_SUCCESS) return err; - sizes[0] = ROUNDUPDWD(b->size_z, b->devid); - sizes[1] = ROUNDUPDHT(b->size_y, b->devid); - int stride1, stride2, stride3; - stride1 = b->size_x * b->size_y; - stride2 = b->size_x; - stride3 = 1; - dt_opencl_set_kernel_args(b->devid, b->global->kernel_blur_line, 0, CLARG(b->dev_grid_tmp), CLARG(b->dev_grid), - CLARG(stride1), CLARG(stride2), CLARG(stride3), CLARG(b->size_z), CLARG(b->size_y), CLARG(b->size_x)); - err = dt_opencl_enqueue_kernel_2d(b->devid, b->global->kernel_blur_line, sizes); + int stride1 = b->size_x * b->size_y; + int stride2 = b->size_x; + int stride3 = 1; + err = dt_opencl_enqueue_kernel_2d_args(b->devid, b->global->kernel_blur_line, b->size_z, b->size_y, + CLARG(b->dev_grid_tmp), CLARG(b->dev_grid), + CLARG(stride1), CLARG(stride2), CLARG(stride3), CLARG(b->size_z), CLARG(b->size_y), CLARG(b->size_x)); if(err != CL_SUCCESS) return err; stride1 = b->size_x * b->size_y; stride2 = 1; stride3 = b->size_x; - sizes[0] = ROUNDUPDWD(b->size_z, b->devid); - sizes[1] = ROUNDUPDHT(b->size_x, b->devid); - dt_opencl_set_kernel_args(b->devid, b->global->kernel_blur_line, 0, CLARG(b->dev_grid), CLARG(b->dev_grid_tmp), - CLARG(stride1), CLARG(stride2), CLARG(stride3), CLARG(b->size_z), CLARG(b->size_x), CLARG(b->size_y)); - err = 
dt_opencl_enqueue_kernel_2d(b->devid, b->global->kernel_blur_line, sizes); + err = dt_opencl_enqueue_kernel_2d_args(b->devid, b->global->kernel_blur_line, b->size_z, b->size_x, + CLARG(b->dev_grid), CLARG(b->dev_grid_tmp), + CLARG(stride1), CLARG(stride2), CLARG(stride3), CLARG(b->size_z), CLARG(b->size_x), CLARG(b->size_y)); if(err != CL_SUCCESS) return err; stride1 = 1; stride2 = b->size_x; stride3 = b->size_x * b->size_y; - sizes[0] = ROUNDUPDWD(b->size_x, b->devid); - sizes[1] = ROUNDUPDHT(b->size_y, b->devid); - dt_opencl_set_kernel_args(b->devid, b->global->kernel_blur_line_z, 0, CLARG(b->dev_grid_tmp), CLARG(b->dev_grid), - CLARG(stride1), CLARG(stride2), CLARG(stride3), CLARG(b->size_x), CLARG(b->size_y), CLARG(b->size_z)); - return dt_opencl_enqueue_kernel_2d(b->devid, b->global->kernel_blur_line_z, sizes); + return dt_opencl_enqueue_kernel_2d_args(b->devid, b->global->kernel_blur_line_z, b->size_x, b->size_y, + CLARG(b->dev_grid_tmp), CLARG(b->dev_grid), + CLARG(stride1), CLARG(stride2), CLARG(stride3), CLARG(b->size_x), CLARG(b->size_y), CLARG(b->size_z)); } cl_int dt_bilateral_slice_to_output_cl(dt_bilateral_cl_t *b, cl_mem in, cl_mem out, const float detail) From e0e93f17bc563fb0fdcb99875fb5a71777b9ec9d Mon Sep 17 00:00:00 2001 From: Hanno Schwalm Date: Fri, 6 Feb 2026 08:07:25 +0100 Subject: [PATCH 5/9] Atrous OpenCL maintenance 1. All non-local OpenCL kernel calls use _args() interface 2. new kernels as asked for by @ralfbrown (code does not work) - eaw_zero - eaw_addbuffers --- data/kernels/atrous.cl | 75 +++++++++++----- src/iop/atrous.c | 197 +++++++++++++++-------------------------- 2 files changed, 126 insertions(+), 146 deletions(-) diff --git a/data/kernels/atrous.cl b/data/kernels/atrous.cl index 504250f2056f..79a4f17a47b1 100644 --- a/data/kernels/atrous.cl +++ b/data/kernels/atrous.cl @@ -1,6 +1,6 @@ /* This file is part of darktable, - copyright (c) 2009--2010 johannes hanika. + Copyright (C) 2009-2026 darktable developers. 
darktable is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,20 +19,24 @@ #include "common.h" - float4 weight(const float4 c1, const float4 c2, const float sharpen) { - // native_exp is faster than the cpu floating point aliasing hack: - const float wc = native_exp(-((c1.y - c2.y)*(c1.y - c2.y) + (c1.z - c2.z)*(c1.z - c2.z)) * sharpen); - const float wl = native_exp(- (c1.x - c2.x)*(c1.x - c2.x) * sharpen); + const float wc = dtcl_exp(-((c1.y - c2.y)*(c1.y - c2.y) + (c1.z - c2.z)*(c1.z - c2.z)) * sharpen); + const float wl = dtcl_exp(- (c1.x - c2.x)*(c1.x - c2.x) * sharpen); return (float4)(wl, wc, wc, 1.0f); } __kernel void -eaw_decompose (__read_only image2d_t in, __write_only image2d_t coarse, __write_only image2d_t detail, - const int width, const int height, const int scale, const float sharpen, global const float *filter) +eaw_decompose(__read_only image2d_t in, + __write_only image2d_t coarse, + __write_only image2d_t detail, + const int width, + const int height, + const int scale, + const float sharpen, + global const float *filter) { const int x = get_global_id(0); const int y = get_global_id(1); @@ -41,17 +45,17 @@ eaw_decompose (__read_only image2d_t in, __write_only image2d_t coarse, __write_ const int mult = 1<= width || y >= height) return; - const float4 threshold = (float4)(t0, t1, t2, t3); - const float4 boost = (float4)(b0, b1, b2, b3); - float4 c = read_imagef(coarse, sampleri, (int2)(x, y)); - float4 d = read_imagef(detail, sampleri, (int2)(x, y)); - float4 amount = copysign(max((float4)(0.0f), fabs(d) - threshold), d); + const float4 c = read_imagef(coarse, sampleri, (int2)(x, y)); + const float4 d = read_imagef(detail, sampleri, (int2)(x, y)); + const float4 amount = copysign(fmax((float4)(0.0f), fabs(d) - threshold), d); float4 sum = c + boost*amount; sum.w = c.w; write_imagef (out, (int2)(x, y), sum); } +__kernel void +eaw_zero(__write_only image2d_t out, + 
const int width, + const int height) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + if(x >= width || y >= height) return; + write_imagef(out, (int2)(x, y), (float4)0.0f); +} + +__kernel void +eaw_addbuffers(__write_only image2d_t out_out, + __read_only image2d_t out_in, + __read_only image2d_t diff, + const int width, + const int height) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); + if(x >= width || y >= height) return; + + const float4 cs = read_imagef(diff, sampleri, (int2)(x, y)); + const float4 o = read_imagef(out_in, sampleri, (int2)(x, y)); + write_imagef(out_out, (int2)(x, y), (cs + o)); +} \ No newline at end of file diff --git a/src/iop/atrous.c b/src/iop/atrous.c index 4b5e2a3b28d2..8471e4f3a370 100644 --- a/src/iop/atrous.c +++ b/src/iop/atrous.c @@ -1,6 +1,6 @@ /* This file is part of darktable, - Copyright (C) 2010-2025 darktable developers. + Copyright (C) 2010-2026 darktable developers. darktable is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -279,7 +279,7 @@ static void process_wavelets(dt_iop_module_t *self, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out) { - dt_iop_atrous_data_t *d = piece->data; + const dt_iop_atrous_data_t *d = piece->data; dt_aligned_pixel_t thrs[MAX_NUM_SCALES]; dt_aligned_pixel_t boost[MAX_NUM_SCALES]; float sharp[MAX_NUM_SCALES]; @@ -360,8 +360,6 @@ void process(dt_iop_module_t *self, #ifdef HAVE_OPENCL #ifdef USE_NEW_CL -/* this version is adapted to the new global tiling mechanism. it no - * longer does tiling by itself. 
*/ int process_cl(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, cl_mem dev_in, @@ -369,7 +367,10 @@ int process_cl(dt_iop_module_t *self, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out) { - dt_iop_atrous_data_t *d = piece->data; + const dt_iop_atrous_data_t *d = piece->data; + const dt_iop_atrous_global_data_t *gd = self->global_data; + const int devid = piece->pipe->devid; + dt_aligned_pixel_t thrs[MAX_NUM_SCALES]; dt_aligned_pixel_t boost[MAX_NUM_SCALES]; float sharp[MAX_NUM_SCALES]; @@ -384,100 +385,81 @@ int process_cl(dt_iop_module_t *self, // dt_control_queue_draw(GTK_WIDGET(g->area)); } - dt_iop_atrous_global_data_t *gd = self->global_data; - - const int devid = piece->pipe->devid; cl_int err = CL_MEM_OBJECT_ALLOCATION_FAILURE; cl_mem dev_filter = NULL; cl_mem dev_tmp = NULL; cl_mem dev_tmp2 = NULL; cl_mem dev_detail = NULL; - float m[] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f }; // 1/16, 4/16, 6/16, 4/16, 1/16 + const float m[5] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f }; // 1/16, 4/16, 6/16, 4/16, 1/16 float mm[5][5]; for(int j = 0; j < 5; j++) for(int i = 0; i < 5; i++) mm[j][i] = m[i] * m[j]; + const int width = roi_out->width; + const int height = roi_out->height; dev_filter = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 25, mm); - if(dev_filter == NULL) goto error; /* allocate space for two temporary buffer to participate_in in the - buffer ping-pong below. We need dev_out to accumulate the result - and dev_in needs to stay unchanged for blendops */ - dev_tmp = dt_opencl_alloc_device - (devid, roi_out->width, roi_out->height, sizeof(float) * 4); - if(dev_tmp == NULL) goto error; - dev_tmp2 = dt_opencl_alloc_device - (devid, roi_out->width, roi_out->height, sizeof(float) * 4); - if(dev_tmp2 == NULL) goto error; - + buffer ping-pong below. 
+ We need dev_out to accumulate the result + and dev_in must stay unchanged + */ + dev_tmp = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4); + dev_tmp2 = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4); /* allocate a buffer for storing the detail information. */ - dev_detail = dt_opencl_alloc_device - (devid, roi_out->width, roi_out->height, sizeof(float) * 4); - if(dev_detail == NULL) goto error; - - const int width = roi_out->width; - const int height = roi_out->height; - size_t sizes[] = { ROUNDUPDWD(width, devid), ROUNDUPDHT(height, devid), 1 }; + dev_detail = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4); + if(!dev_detail || !dev_tmp || !dev_tmp2 || !dev_filter) goto error; // clear dev_out to zeros, as we will be incrementally accumulating results there - dt_opencl_set_kernel_args(devid, gd->kernel_zero, 0, CLARG(dev_out)); - err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_zero, sizes); + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_zero, width, height, + CLARG(dev_out), CLARG(width), CLARG(height)); if(err != CL_SUCCESS) goto error; // the buffers for the buffer ping-pong. 
We start with dev_in as // the input half for the first scale, then switch to using dev_tmp - // and dev_tmp2 as the two scratch buffers - void* dev_buf1 = &dev_in; - void* dev_buf2 = &dev_tmp; + // and dev_tmp2 as the two scratch buffers at the end of scaling loop + cl_mem pp_in = dev_in; + cl_mem pp_coarse = dev_tmp; /* decompose image into detail scales and coarse (the latter is left - * in dev_tmp or dev_out) */ + in dev_tmp or dev_out) + */ for(int s = 0; s < max_scale; s++) { const int scale = s; // run the decomposition - dt_opencl_set_kernel_args(devid, gd->kernel_decompose, 0, - CLARG(dev_buf2), CLARG(dev_buf1), CLARG(dev_detail), + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_decompose, width, height, + CLARG(pp_in), CLARG(pp_coarse), CLARG(dev_detail), CLARG(width), CLARG(height), CLARG(scale), CLARG(sharp[s]), CLARG(dev_filter)); - - err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_decompose, sizes); if(err != CL_SUCCESS) goto error; - // indirectly give gpu some air to breathe (and to do display related stuff) - dt_opencl_micro_nap(devid); - // now immediately run the synthesis for the current scale, accumulating the details into dev_out - dt_opencl_set_kernel_args(devid, gd->kernel_synthesize, 0, - CLARG(dev_out), CLARG(dev_out), CLARG(dev_detail), + // dev_out as the accumulator must be given twice as an OpenCL 1.2 workaround + // Is this safe here? Or would we need another temp buffer and accumulate? 
+ err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_synthesize, width, height, + CLARG(dev_out), CLARG(pp_coarse), CLARG(dev_detail), CLARG(width), CLARG(height), - CLARG(thrs[scale][0]), CLARG(thrs[scale][1]), - CLARG(thrs[scale][2]), CLARG(thrs[scale][3]), - CLARG(boost[scale][0]), CLARG(boost[scale][1]), - CLARG(boost[scale][2]), CLARG(boost[scale][3])); - - err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_synthesize, sizes); + CLFLARRAY(4, &thrs[scale]), CLFLARRAY(4, &boost[scale])); if(err != CL_SUCCESS) goto error; - // indirectly give gpu some air to breathe (and to do display related stuff) - dt_opencl_micro_nap(devid); - - // swap scratch buffers - if(scale == 0) dev_buf1 = dev_tmp2; - void* tmp = dev_buf2; - dev_buf2 = dev_buf1; - dev_buf1 = tmp; + // swap scratch buffers but leave as is for the final round to keep pp_coarse correct + if(s != max_scale -1) + { + cl_mem tmp = (s == 0) ? dev_tmp2 : pp_in; + pp_in = pp_coarse; + pp_coarse = tmp; + } } // add the residue (the coarse scale from the final decomposition) // to the accumulated details - dt_opencl_set_kernel_args - (devid, gd->kernel_addbuffers, 0, CLARG(dev_out), CLARG(dev_buf1)); - - err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_addbuffers, sizes); - + // OpenCL 1.2 workaround: dev_out is passed as both output and input, which is safe with this kernel + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_addbuffers, width, height, + CLARG(dev_out), CLARG(dev_out), CLARG(pp_coarse), + CLARG(width), CLARG(height)); error: dt_opencl_release_mem_object(dev_filter); dt_opencl_release_mem_object(dev_tmp); @@ -495,7 +477,10 @@ int process_cl(dt_iop_module_t *self, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out) { - dt_iop_atrous_data_t *d = piece->data; + const dt_iop_atrous_data_t *d = piece->data; + const dt_iop_atrous_global_data_t *gd = self->global_data; + const int devid = piece->pipe->devid; + dt_aligned_pixel_t thrs[MAX_NUM_SCALES]; dt_aligned_pixel_t boost[MAX_NUM_SCALES]; float 
sharp[MAX_NUM_SCALES]; @@ -510,45 +495,38 @@ int process_cl(dt_iop_module_t *self, // dt_control_queue_draw(GTK_WIDGET(g->area)); } - dt_iop_atrous_global_data_t *gd = self->global_data; - - const int devid = piece->pipe->devid; cl_int err = DT_OPENCL_DEFAULT_ERROR; cl_mem dev_filter = NULL; cl_mem dev_tmp = NULL; cl_mem *dev_detail = calloc(max_scale, sizeof(cl_mem)); - float m[] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f }; // 1/16, 4/16, 6/16, 4/16, 1/16 + const float m[5] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f }; // 1/16, 4/16, 6/16, 4/16, 1/16 float mm[5][5]; for(int j = 0; j < 5; j++) for(int i = 0; i < 5; i++) mm[j][i] = m[i] * m[j]; + const int width = roi_out->width; + const int height = roi_out->height; + dev_filter = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 25, mm); - if(dev_filter == NULL) goto error; /* allocate space for a temporary buffer. we don't want to use dev_in in the buffer ping-pong below, as we need to keep it for blendops */ - dev_tmp = dt_opencl_alloc_device - (devid, roi_out->width, roi_out->height, sizeof(float) * 4); - if(dev_tmp == NULL) goto error; + dev_tmp = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4); + if(!dev_tmp || !dev_filter) goto error; /* allocate space to store detail information. 
Requires a number of * additional buffers, each with full image size */ for(int k = 0; k < max_scale; k++) { - dev_detail[k] = dt_opencl_alloc_device - (devid, roi_out->width, roi_out->height, sizeof(float) * 4); + dev_detail[k] = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4); if(dev_detail[k] == NULL) goto error; } - const int width = roi_out->width; - const int height = roi_out->height; - size_t sizes[] = { ROUNDUPDWD(width, devid), ROUNDUPDHT(height, devid), 1 }; size_t origin[] = { 0, 0, 0 }; size_t region[] = { width, height, 1 }; - // copy original input from dev_in -> dev_out as starting point err = dt_opencl_enqueue_copy_image(devid, dev_in, dev_out, origin, origin, region); if(err != CL_SUCCESS) goto error; @@ -560,55 +538,36 @@ int process_cl(dt_iop_module_t *self, const int scale = s; if(s & 1) - { - dt_opencl_set_kernel_args(devid, gd->kernel_decompose, 0, - CLARG(dev_tmp), CLARG(dev_out)); - } + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_decompose, width, height, + CLARG(dev_tmp), CLARG(dev_out), + CLARG(dev_detail[s]), CLARG(width), CLARG(height), + CLARG(scale), CLARG(sharp[s]), CLARG(dev_filter)); else - { - dt_opencl_set_kernel_args(devid, gd->kernel_decompose, 0, - CLARG(dev_out), CLARG(dev_tmp)); - } - dt_opencl_set_kernel_args(devid, gd->kernel_decompose, 2, - CLARG(dev_detail[s]), CLARG(width), CLARG(height), - CLARG(scale), CLARG(sharp[s]), CLARG(dev_filter)); - - err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_decompose, sizes); + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_decompose, width, height, + CLARG(dev_out), CLARG(dev_tmp), + CLARG(dev_detail[s]), CLARG(width), CLARG(height), + CLARG(scale), CLARG(sharp[s]), CLARG(dev_filter)); if(err != CL_SUCCESS) goto error; - - // indirectly give gpu some air to breathe (and to do display related stuff) - dt_opencl_micro_nap(devid); } /* now synthesize again */ for(int scale = max_scale - 1; scale >= 0; scale--) { if(scale & 1) - { - 
dt_opencl_set_kernel_args(devid, gd->kernel_synthesize, 0, - CLARG(dev_tmp), CLARG(dev_out)); - } - else - { - dt_opencl_set_kernel_args(devid, gd->kernel_synthesize, 0, - CLARG(dev_out), CLARG(dev_tmp)); - } - - dt_opencl_set_kernel_args(devid, gd->kernel_synthesize, 2, - CLARG(dev_detail[scale]), CLARG(width), - CLARG(height), CLARG(thrs[scale][0]), - CLARG(thrs[scale][1]), CLARG(thrs[scale][2]), - CLARG(thrs[scale][3]), CLARG(boost[scale][0]), - CLARG(boost[scale][1]), CLARG(boost[scale][2]), - CLARG(boost[scale][3])); + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_synthesize, width, height, + CLARG(dev_tmp), CLARG(dev_out), + CLARG(dev_detail[scale]), + CLARG(width), CLARG(height), + CLFLARRAY(4, &thrs[scale]), CLFLARRAY(4, &boost[scale])); - err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_synthesize, sizes); + else + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_synthesize, width, height, + CLARG(dev_out), CLARG(dev_tmp), + CLARG(dev_detail[scale]), + CLARG(width), CLARG(height), + CLFLARRAY(4, &thrs[scale]), CLFLARRAY(4, &boost[scale])); if(err != CL_SUCCESS) goto error; - - // indirectly give gpu some air to breathe (and to do display related stuff) - dt_opencl_micro_nap(devid); } - dt_opencl_finish_sync_pipe(devid, piece->pipe->type); error: @@ -637,7 +596,7 @@ void tiling_callback(dt_iop_module_t *self, const int max_filter_radius = 2 * (1 << max_scale); // 2 * 2^max_scale tiling->factor = 4.0f; // in + out + 2*tmp - tiling->factor_cl = 3.0f + max_scale; // in + out + tmp + scale buffers + tiling->factor_cl = 5.0f; // in + out + details + 2*tmp tiling->maxbuf = 1.0f; tiling->maxbuf_cl = 1.0f; tiling->overhead = 0; @@ -709,16 +668,6 @@ void commit_params(dt_iop_module_t *self, dt_iop_atrous_params_t *p = (dt_iop_atrous_params_t *)params; dt_iop_atrous_data_t *d = piece->data; -#if 0 - printf("---------- atrous preset begin\n"); - printf("p.octaves = %d; p.mix = %.2f\n", p->octaves, p->mix); - for(int ch=0; ch<atrous_none; ch++) for(int k=0; k<BANDS; k++) - { - printf("p.x[%d][%d] = %f;\n", ch, k, p->x[ch][k]); - 
printf("p.y[%d][%d] = %f;\n", ch, k, p->y[ch][k]); - } - printf("---------- atrous preset end\n"); -#endif d->octaves = p->octaves; for(int ch = 0; ch < atrous_none; ch++) for(int k = 0; k < BANDS; k++) From 9aec41a0d116aa658d21caa7ec25172f5a52d8fc Mon Sep 17 00:00:00 2001 From: Hanno Schwalm Date: Sat, 7 Feb 2026 19:24:15 +0100 Subject: [PATCH 6/9] Denoise profile OpenCL maintenance 1. All non-local OpenCL kernel calls now use the _args() variant 2. Deduplicated code for old/new variant (still using "old" code) 3. Fixed some possible clmem-leaks if kernels would fail --- src/iop/denoiseprofile.c | 336 ++++++++++++--------------------------- 1 file changed, 101 insertions(+), 235 deletions(-) diff --git a/src/iop/denoiseprofile.c b/src/iop/denoiseprofile.c index 73a5932af7b0..5ed4e4b035c8 100644 --- a/src/iop/denoiseprofile.c +++ b/src/iop/denoiseprofile.c @@ -1,6 +1,6 @@ /* This file is part of darktable, - Copyright (C) 2012-2024 darktable developers. + Copyright (C) 2012-2026 darktable developers. 
darktable is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -1976,13 +1976,14 @@ static int process_nlmeans_cl(dt_iop_module_t *self, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out) { - dt_iop_denoiseprofile_data_t *d = piece->data; - dt_iop_denoiseprofile_global_data_t *gd = self->global_data; -#if USE_NEW_IMPL_CL + const dt_iop_denoiseprofile_data_t *d = piece->data; + const dt_iop_denoiseprofile_global_data_t *gd = self->global_data; + const int width = roi_in->width; const int height = roi_in->height; + const int devid = piece->pipe->devid; - cl_int err = DT_OPENCL_DEFAULT_ERROR; + cl_int err = CL_MEM_OBJECT_ALLOCATION_FAILURE; const float scale = fminf(fminf(roi_in->scale, 2.0f) / fmaxf(piece->iscale, 1.0f), 1.0f); const int P = ceilf(d->radius * scale); // pixel filter size @@ -1995,43 +1996,32 @@ static int process_nlmeans_cl(dt_iop_module_t *self, dt_aligned_pixel_t p; dt_aligned_pixel_t aa; dt_aligned_pixel_t bb; - (void)nlmeans_precondition_cl(d,piece,wb,scale,aa,bb,p); - // allocate a buffer for a preconditioned copy of the image - const int devid = piece->pipe->devid; + nlmeans_precondition_cl(d,piece,wb,scale,aa,bb,p); + cl_mem dev_tmp = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4); - if(dev_tmp == NULL) return CL_MEM_OBJECT_ALLOCATION_FAILURE; + cl_mem dev_U2 = dt_opencl_alloc_device_buffer(devid, sizeof(float) * 4 * width * height); + if(!dev_tmp || !dev_U2) goto final; - const size_t sizes[] = { ROUNDUPDWD(width, devid), ROUNDUPDHT(height, devid), 1 }; - const float sigma2[4] = { (bb[0] / aa[0]) * (bb[0] / aa[0]), - (bb[1] / aa[1]) * (bb[1] / aa[1]), - (bb[2] / aa[2]) * (bb[2] / aa[2]), - 0.0f }; + const dt_aligned_pixel_t sigma2 = { (bb[0] / aa[0]) * (bb[0] / aa[0]), + (bb[1] / aa[1]) * (bb[1] / aa[1]), + (bb[2] / aa[2]) * (bb[2] / aa[2]), + 0.0f }; if(!d->use_new_vst) - { - dt_opencl_set_kernel_args(devid, 
gd->kernel_denoiseprofile_precondition, - 0, CLARG(dev_in), CLARG(dev_tmp), - CLARG(width), CLARG(height), CLARG(aa), CLARG(sigma2)); - err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_precondition, sizes); - } + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_precondition, width, height, + CLARG(dev_in), CLARG(dev_tmp), + CLARG(width), CLARG(height), CLARG(aa), CLARG(sigma2)); else - { - dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_precondition_v2, - 0, CLARG(dev_in), CLARG(dev_tmp), - CLARG(width), CLARG(height), CLARG(aa), CLARG(p), CLARG(bb), CLARG(wb)); - err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_precondition_v2, - sizes); - } + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_precondition_v2, width, height, + CLARG(dev_in), CLARG(dev_tmp), + CLARG(width), CLARG(height), CLARG(aa), CLARG(p), CLARG(bb), CLARG(wb)); + if(err != CL_SUCCESS) goto final; - // allocate a buffer to receive the denoised image - cl_mem dev_U2 = dt_opencl_alloc_device_buffer(devid, sizeof(float) * 4 * width * height); - if(dev_U2 == NULL) err = CL_MEM_OBJECT_ALLOCATION_FAILURE; +#if USE_NEW_IMPL_CL - if(err == CL_SUCCESS) - { - const dt_aligned_pixel_t norm2 = { 1.0f, 1.0f, 1.0f, 1.0f }; - const dt_nlmeans_param_t params = + const dt_aligned_pixel_t norm2 = { 1.0f, 1.0f, 1.0f, 1.0f }; + const dt_nlmeans_param_t params = { .scattering = scattering, .scale = scale, @@ -2050,64 +2040,13 @@ static int process_nlmeans_cl(dt_iop_module_t *self, .kernel_vert = gd->kernel_denoiseprofile_vert, .kernel_accu = gd->kernel_denoiseprofile_accu }; - err = nlmeans_denoiseprofile_cl(&params, devid, dev_tmp, dev_U2, roi_in); - } - if(err == CL_SUCCESS) - { - if(!d->use_new_vst) - { - dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_finish, - 0, CLARG(dev_in), CLARG(dev_U2), - CLARG(dev_out), CLARG(width), CLARG(height), CLARG(aa), CLARG(sigma2)); - err = dt_opencl_enqueue_kernel_2d(devid, 
gd->kernel_denoiseprofile_finish, sizes); - } - else - { - const float bias = d->bias - 0.5 * logf(scale); - dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_finish_v2, 0, - CLARG(dev_in), CLARG(dev_U2), - CLARG(dev_out), CLARG(width), CLARG(height), - CLARG(aa), CLARG(p), - CLARG(bb), CLARG(bias), CLARG(wb)); - err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_finish_v2, sizes); - } - } - dt_opencl_release_mem_object(dev_U2); - dt_opencl_release_mem_object(dev_tmp); - return err; + err = nlmeans_denoiseprofile_cl(&params, devid, dev_tmp, dev_U2, roi_in); + if(err != CL_SUCCESS) goto final; -#else - const int width = roi_in->width; - const int height = roi_in->height; - - cl_int err = CL_MEM_OBJECT_ALLOCATION_FAILURE; - - const float scale = fminf(fminf(roi_in->scale, 2.0f) / fmaxf(piece->iscale, 1.0f), 1.0f); - const int P = ceilf(d->radius * scale); // pixel filter size - int K = d->nbhood; // nbhood - const float scattering = nlmeans_scattering(&K,d,piece,scale); - const float norm = nlmeans_norm(P,d); - const float central_pixel_weight = d->central_pixel_weight * scale; - - dt_aligned_pixel_t wb; - dt_aligned_pixel_t p; - dt_aligned_pixel_t aa; - dt_aligned_pixel_t bb; - (void)nlmeans_precondition_cl(d,piece,wb,scale,aa,bb,p); - - const dt_aligned_pixel_t sigma2 = { (bb[0] / aa[0]) * (bb[0] / aa[0]), - (bb[1] / aa[1]) * (bb[1] / aa[1]), - (bb[2] / aa[2]) * (bb[2] / aa[2]), - 0.0f }; - - const int devid = piece->pipe->devid; - cl_mem dev_tmp = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4); - if(dev_tmp == NULL) goto error; - - cl_mem dev_U2 = dt_opencl_alloc_device_buffer(devid, sizeof(float) * 4 * width * height); - if(dev_U2 == NULL) goto error; +#else // old&current code cl_mem buckets[NUM_BUCKETS] = { NULL }; + unsigned int state = 0; for(int k = 0; k < NUM_BUCKETS; k++) { @@ -2147,38 +2086,14 @@ static int process_nlmeans_cl(dt_iop_module_t *self, else vblocksize = 1; - - const size_t sizes[] = { ROUNDUPDWD(width, devid), 
ROUNDUPDHT(height, devid), 1 }; - size_t sizesl[3]; - size_t local[3]; - - if(!d->use_new_vst) - { - dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_precondition, - 0, CLARG(dev_in), CLARG(dev_tmp), - CLARG(width), CLARG(height), CLARG(aa), CLARG(sigma2)); - err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_precondition, sizes); - if(err != CL_SUCCESS) goto error; - } - else - { - dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_precondition_v2, 0, - CLARG(dev_in), CLARG(dev_tmp), - CLARG(width), CLARG(height), - CLARG(aa), CLARG(p), CLARG(bb), CLARG(wb)); - err = dt_opencl_enqueue_kernel_2d(devid, - gd->kernel_denoiseprofile_precondition_v2, sizes); - if(err != CL_SUCCESS) goto error; - } - - dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_init, 0, - CLARG(dev_U2), CLARG(width), - CLARG(height)); - err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_init, sizes); + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_init, width, height, + CLARG(dev_U2), CLARG(width), CLARG(height)); if(err != CL_SUCCESS) goto error; const size_t bwidth = ROUNDUP(width, hblocksize); const size_t bheight = ROUNDUP(height, vblocksize); + size_t sizesl[3]; + size_t local[3]; for(int kj_index = -K; kj_index <= 0; kj_index++) { @@ -2200,10 +2115,8 @@ static int process_nlmeans_cl(dt_iop_module_t *self, int q[2] = { i, j }; cl_mem dev_U4 = buckets[bucket_next(&state, NUM_BUCKETS)]; - dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_dist, 0, - CLARG(dev_tmp), CLARG(dev_U4), - CLARG(width), CLARG(height), CLARG(q)); - err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_dist, sizes); + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_dist, width, height, + CLARG(dev_tmp), CLARG(dev_U4), CLARG(width), CLARG(height), CLARG(q)); if(err != CL_SUCCESS) goto error; sizesl[0] = bwidth; @@ -2235,53 +2148,48 @@ static int process_nlmeans_cl(dt_iop_module_t *self, 
CLARG(q), CLARG(P), CLARG(norm), CLLOCAL(sizeof(float) * (vblocksize + 2 * P)), CLARG(central_pixel_weight), CLARG(dev_U4)); - err = dt_opencl_enqueue_kernel_2d_with_local - (devid, - gd->kernel_denoiseprofile_vert, sizesl, local); + err = dt_opencl_enqueue_kernel_2d_with_local(devid, + gd->kernel_denoiseprofile_vert, + sizesl, local); if(err != CL_SUCCESS) goto error; - - dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_accu, - 0, CLARG(dev_tmp), CLARG(dev_U2), - CLARG(dev_U4_tt), CLARG(width), - CLARG(height), CLARG(q)); - err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_accu, sizes); + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_accu, width, height, + CLARG(dev_tmp), CLARG(dev_U2), CLARG(dev_U4_tt), + CLARG(width), CLARG(height), CLARG(q)); if(err != CL_SUCCESS) goto error; - dt_opencl_finish_sync_pipe(devid, piece->pipe->type); - - // indirectly give gpu some air to breathe (and to do display related stuff) - dt_opencl_micro_nap(devid); } } - if(!d->use_new_vst) + error: + for(int k = 0; k < NUM_BUCKETS; k++) + dt_opencl_release_mem_object(buckets[k]); + +#endif /* shared finalize USE_NEW_IMPL_CL */ + + if(err == CL_SUCCESS) { - dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_finish, 0, + if(!d->use_new_vst) + { + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_finish, width, height, CLARG(dev_in), CLARG(dev_U2), CLARG(dev_out), CLARG(width), CLARG(height), CLARG(aa), CLARG(sigma2)); - err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_finish, sizes); - } - else - { - const float bias = d->bias - 0.5 * logf(scale); - dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_finish_v2, 0, + } + else + { + const float bias = d->bias - 0.5 * logf(scale); + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_finish_v2, width, height, CLARG(dev_in), CLARG(dev_U2), CLARG(dev_out), CLARG(width), CLARG(height), CLARG(aa), CLARG(p), CLARG(bb), 
CLARG(bias), CLARG(wb)); - err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_finish_v2, sizes); + } } - error: - for(int k = 0; k < NUM_BUCKETS; k++) - { - dt_opencl_release_mem_object(buckets[k]); - } +final: dt_opencl_release_mem_object(dev_U2); dt_opencl_release_mem_object(dev_tmp); return err; -#endif /* USE_NEW_IMPL_CL */ } @@ -2292,8 +2200,8 @@ static int process_wavelets_cl(dt_iop_module_t *self, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out) { - dt_iop_denoiseprofile_data_t *d = piece->data; - dt_iop_denoiseprofile_global_data_t *gd = self->global_data; + const dt_iop_denoiseprofile_data_t *d = piece->data; + const dt_iop_denoiseprofile_global_data_t *gd = self->global_data; const int max_max_scale = DT_IOP_DENOISE_PROFILE_BANDS; // hard limit int max_scale = 0; @@ -2328,6 +2236,8 @@ static int process_wavelets_cl(dt_iop_module_t *self, cl_mem dev_m = NULL; cl_mem dev_r = NULL; cl_mem dev_filter = NULL; + cl_mem dev_Y0U0V0 = NULL; + cl_mem dev_RGB = NULL; cl_mem *dev_detail = calloc(max_max_scale, sizeof(cl_mem)); float *sumsum = NULL; @@ -2379,18 +2289,13 @@ static int process_wavelets_cl(dt_iop_module_t *self, const int reducesize = MIN(REDUCESIZE, ROUNDUP(bufsize, slocopt.sizex) / slocopt.sizex); err = CL_MEM_OBJECT_ALLOCATION_FAILURE; dev_m = dt_opencl_alloc_device_buffer(devid, sizeof(float) * 4 * bufsize); - if(dev_m == NULL) goto error; - dev_r = dt_opencl_alloc_device_buffer(devid, sizeof(float) * 4 * reducesize); - if(dev_r == NULL) goto error; - sumsum = dt_alloc_align_float((size_t)4 * reducesize); - if(sumsum == NULL) goto error; - dev_tmp = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4); - if(dev_tmp == NULL) goto error; - float m[] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f }; // 1/16, 4/16, 6/16, 4/16, 1/16 + if(!dev_tmp || !dev_r || !dev_m || !sumsum) goto error; + + const float m[] = { 0.0625f, 0.25f, 0.375f, 0.25f, 0.0625f }; // 1/16, 4/16, 6/16, 4/16, 1/16 float mm[5][5]; for(int j = 
0; j < 5; j++) for(int i = 0; i < 5; i++) mm[j][i] = m[i] * m[j]; @@ -2463,41 +2368,27 @@ static int process_wavelets_cl(dt_iop_module_t *self, } } - size_t sizes[] = { ROUNDUPDWD(width, devid), ROUNDUPDHT(height, devid), 1 }; - if(!d->use_new_vst) { - dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_precondition, - 0, CLARG(dev_in), CLARG(dev_out), - CLARG(width), CLARG(height), CLARG(aa), CLARG(sigma2)); - err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_precondition, sizes); - if(err != CL_SUCCESS) goto error; + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_precondition, width, height, + CLARG(dev_in), CLARG(dev_out), CLARG(width), CLARG(height), CLARG(aa), CLARG(sigma2)); } else if(d->wavelet_color_mode == MODE_RGB) { - dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_precondition_v2, - 0, CLARG(dev_in), CLARG(dev_out), - CLARG(width), CLARG(height), - CLARG(aa), CLARG(p), CLARG(bb), CLARG(wb)); - err = dt_opencl_enqueue_kernel_2d(devid, - gd->kernel_denoiseprofile_precondition_v2, sizes); - if(err != CL_SUCCESS) goto error; + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_precondition_v2, width, height, + CLARG(dev_in), CLARG(dev_out), CLARG(width), CLARG(height), CLARG(aa), CLARG(p), CLARG(bb), CLARG(wb)); } else { err = CL_MEM_OBJECT_ALLOCATION_FAILURE; - cl_mem dev_Y0U0V0 = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 9, toY0U0V0); + dev_Y0U0V0 = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 9, toY0U0V0); if(dev_Y0U0V0 == NULL) goto error; - - dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_precondition_Y0U0V0, - 0, CLARG(dev_in), - CLARG(dev_out), CLARG(width), CLARG(height), - CLARG(aa), CLARG(p), CLARG(bb), CLARG(dev_Y0U0V0)); - err = dt_opencl_enqueue_kernel_2d(devid, - gd->kernel_denoiseprofile_precondition_Y0U0V0, - sizes); + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_precondition_Y0U0V0, 
width, height, + CLARG(dev_in), CLARG(dev_out), CLARG(width), CLARG(height), + CLARG(aa), CLARG(p), CLARG(bb), CLARG(dev_Y0U0V0)); dt_opencl_release_mem_object(dev_Y0U0V0); } + if(err != CL_SUCCESS) goto error; dev_buf1 = dev_out; dev_buf2 = dev_tmp; @@ -2510,16 +2401,12 @@ static int process_wavelets_cl(dt_iop_module_t *self, const float sigma_band = powf(varf, s) * sigma; const float inv_sigma2 = 1.0f / (sigma_band * sigma_band); - dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_decompose, - 0, CLARG(dev_buf1), CLARG(dev_buf2), - CLARG(dev_detail[s]), CLARG(width), CLARG(height), - CLARG(s), CLARG(inv_sigma2), CLARG(dev_filter)); - err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_decompose, sizes); + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_decompose, width, height, + CLARG(dev_buf1), CLARG(dev_buf2), + CLARG(dev_detail[s]), CLARG(width), CLARG(height), + CLARG(s), CLARG(inv_sigma2), CLARG(dev_filter)); if(err != CL_SUCCESS) goto error; - // indirectly give gpu some air to breathe (and to do display related stuff) - dt_opencl_micro_nap(devid); - // swap buffers cl_mem dev_buf3 = dev_buf2; dev_buf2 = dev_buf1; @@ -2547,15 +2434,14 @@ static int process_wavelets_cl(dt_iop_module_t *self, llocal[0] = flocopt.sizex; llocal[1] = flocopt.sizey; llocal[2] = 1; - dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_reduce_first, - 0, CLARG((dev_detail[s])), + dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_reduce_first, 0, + CLARG((dev_detail[s])), CLARG(width), CLARG(height), CLARG(dev_m), CLLOCAL(sizeof(float) * 4 * flocopt.sizex * flocopt.sizey)); - err = dt_opencl_enqueue_kernel_2d_with_local - (devid, - gd->kernel_denoiseprofile_reduce_first, lsizes, - llocal); + err = dt_opencl_enqueue_kernel_2d_with_local(devid, + gd->kernel_denoiseprofile_reduce_first, + lsizes, llocal); if(err != CL_SUCCESS) goto error; @@ -2565,19 +2451,17 @@ static int process_wavelets_cl(dt_iop_module_t *self, 
llocal[0] = slocopt.sizex; llocal[1] = 1; llocal[2] = 1; - dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_reduce_second, - 0, CLARG(dev_m), CLARG(dev_r), + dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_reduce_second, 0, + CLARG(dev_m), CLARG(dev_r), CLARG(bufsize), CLLOCAL(sizeof(float) * 4 * slocopt.sizex)); - err = dt_opencl_enqueue_kernel_2d_with_local - (devid, - gd->kernel_denoiseprofile_reduce_second, lsizes, - llocal); + err = dt_opencl_enqueue_kernel_2d_with_local(devid, + gd->kernel_denoiseprofile_reduce_second, + lsizes, llocal); if(err != CL_SUCCESS) goto error; err = dt_opencl_read_buffer_from_device(devid, (void *)sumsum, dev_r, 0, sizeof(float) * 4 * reducesize, CL_TRUE); - if(err != CL_SUCCESS) - goto error; + if(err != CL_SUCCESS) goto error; for(int k = 0; k < reducesize; k++) { @@ -2651,19 +2535,14 @@ static int process_wavelets_cl(dt_iop_module_t *self, // dt_print(DT_DEBUG_ALWAYS, "scale %d thrs %f %f %f", s, thrs[0], thrs[1], thrs[2]); const dt_aligned_pixel_t boost = { 1.0f, 1.0f, 1.0f, 1.0f }; - - dt_opencl_set_kernel_args(devid, gd->kernel_denoiseprofile_synthesize, - 0, CLARG(dev_buf1), CLARG(dev_detail[s]), + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_synthesize, width, height, + CLARG(dev_buf1), CLARG(dev_detail[s]), CLARG(dev_buf2), CLARG(width), CLARG(height), CLARG(thrs[0]), CLARG(thrs[1]), CLARG(thrs[2]), CLARG(thrs[3]), CLARG(boost[0]), CLARG(boost[1]), - CLARG(boost[2]), CLARG(boost[3])); - err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_denoiseprofile_synthesize, sizes); + CLARG(boost[2]), CLARG(boost[3])); if(err != CL_SUCCESS) goto error; - // indirectly give gpu some air to breathe (and to do display related stuff) - dt_opencl_micro_nap(devid); - // swap buffers cl_mem dev_buf3 = dev_buf2; dev_buf2 = dev_buf1; @@ -2683,47 +2562,34 @@ static int process_wavelets_cl(dt_iop_module_t *self, if(!d->use_new_vst) { - dt_opencl_set_kernel_args(devid, 
gd->kernel_denoiseprofile_backtransform, - 0, CLARG(dev_tmp), CLARG(dev_out), - CLARG(width), CLARG(height), CLARG(aa), CLARG(sigma2)); - err = dt_opencl_enqueue_kernel_2d(devid, - gd->kernel_denoiseprofile_backtransform, sizes); - if(err != CL_SUCCESS) goto error; + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_backtransform, width, height, + CLARG(dev_tmp), CLARG(dev_out), CLARG(width), CLARG(height), CLARG(aa), CLARG(sigma2)); } else if(d->wavelet_color_mode == MODE_RGB) { const float bias = d->bias - 0.5 * logf(scale); - dt_opencl_set_kernel_args(devid, - gd->kernel_denoiseprofile_backtransform_v2, 0, - CLARG(dev_tmp), - CLARG(dev_out), CLARG(width), CLARG(height), + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_backtransform_v2, width, height, + CLARG(dev_tmp), CLARG(dev_out), CLARG(width), CLARG(height), CLARG(aa), CLARG(p), CLARG(bb), CLARG(bias), CLARG(wb)); - err = dt_opencl_enqueue_kernel_2d(devid, - gd->kernel_denoiseprofile_backtransform_v2, sizes); - if(err != CL_SUCCESS) goto error; } else { - cl_mem dev_RGB = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 9, toRGB); + err = CL_MEM_OBJECT_ALLOCATION_FAILURE; + dev_RGB = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 9, toRGB); if(dev_RGB == NULL) goto error; const float bias = d->bias - 0.5 * logf(scale); - dt_opencl_set_kernel_args(devid, - gd->kernel_denoiseprofile_backtransform_Y0U0V0, 0, + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_denoiseprofile_backtransform_Y0U0V0, width, height, CLARG(dev_tmp), CLARG(dev_out), CLARG(width), CLARG(height), CLARG(aa), CLARG(p), CLARG(bb), CLARG(bias), CLARG(wb), CLARG(dev_RGB)); - err = dt_opencl_enqueue_kernel_2d(devid, - gd->kernel_denoiseprofile_backtransform_Y0U0V0, - sizes); - dt_opencl_release_mem_object(dev_RGB); - if(err != CL_SUCCESS) goto error; } - - dt_opencl_finish_sync_pipe(devid, piece->pipe->type); + if(err == CL_SUCCESS) + 
dt_opencl_finish_sync_pipe(devid, piece->pipe->type); error: + dt_opencl_release_mem_object(dev_RGB); dt_opencl_release_mem_object(dev_r); dt_opencl_release_mem_object(dev_m); dt_opencl_release_mem_object(dev_tmp); From aa8921136f275e04b35645b44915266eb33e71aa Mon Sep 17 00:00:00 2001 From: Hanno Schwalm Date: Mon, 9 Feb 2026 11:20:43 +0100 Subject: [PATCH 7/9] Proper mix of OpenCL blend requirements As the tiling factor_cl for blendop might be less than the overall now we have to mix properly. This often avoids 1x1 OpenCL tiling in the pipe for blended modules. --- src/develop/pixelpipe_hb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/develop/pixelpipe_hb.c b/src/develop/pixelpipe_hb.c index 486e189266d3..c70579f2816b 100644 --- a/src/develop/pixelpipe_hb.c +++ b/src/develop/pixelpipe_hb.c @@ -1961,9 +1961,9 @@ static gboolean _dev_pixelpipe_process_rec(dt_dev_pixelpipe_t *pipe, /* aggregate in structure tiling */ tiling.factor = MAX(tiling.factor, tiling_blendop.factor); - tiling.factor_cl = MAX(tiling.factor_cl, tiling_blendop.factor); + tiling.factor_cl = MAX(tiling.factor_cl, tiling_blendop.factor_cl); tiling.maxbuf = MAX(tiling.maxbuf, tiling_blendop.maxbuf); - tiling.maxbuf_cl = MAX(tiling.maxbuf_cl, tiling_blendop.maxbuf); + tiling.maxbuf_cl = MAX(tiling.maxbuf_cl, tiling_blendop.maxbuf_cl); tiling.overhead = MAX(tiling.overhead, tiling_blendop.overhead); tiling.overlap = MAX(tiling.overlap, tiling_blendop.overlap); } From f0361227888648b350eeb7e3d094b7bce15c3bcf Mon Sep 17 00:00:00 2001 From: Pascal Obry Date: Mon, 9 Feb 2026 22:33:47 +0100 Subject: [PATCH 8/9] Make sure we do not mess with gtk tree path. As we need to use it0 (selected->data) later don't mess with it. We do a copy before checking for first/last item. Closes #20290. 
--- src/libs/masks.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/libs/masks.c b/src/libs/masks.c index 03b9c879e805..acbfb9c0383a 100644 --- a/src/libs/masks.c +++ b/src/libs/masks.c @@ -941,13 +941,15 @@ static int _tree_button_pressed(GtkWidget *treeview, // feature only meaningful for rows with prev/next. GtkTreeIter it; - gtk_tree_model_get_iter(model, &it, it0); + GtkTreePath *item = gtk_tree_path_copy(it0); + gtk_tree_model_get_iter(model, &it, item); is_last_row = !gtk_tree_model_iter_next(model, &it); - if(!is_last_row && !gtk_tree_path_prev(it0)) + if(!is_last_row && !gtk_tree_path_prev(item)) { is_first_row = TRUE; } + gtk_tree_path_free(item); } for(const GList *items_iter = selected; From 4a6dfb52dbf44cd6e756f636cebbf365f44ae33d Mon Sep 17 00:00:00 2001 From: Christian Bouhon Date: Wed, 18 Feb 2026 18:20:50 +0100 Subject: [PATCH 9/9] 20260218 implement adaptive JzAzBz shoulder extension --- data/kernels/basecurve.cl | 260 ++++++++- src/iop/basecurve.c | 1058 +++++++++++++++++++++++++++++++++---- 2 files changed, 1212 insertions(+), 106 deletions(-) mode change 100644 => 100755 data/kernels/basecurve.cl mode change 100644 => 100755 src/iop/basecurve.c diff --git a/data/kernels/basecurve.cl b/data/kernels/basecurve.cl old mode 100644 new mode 100755 index 24a675fe73ab..30c222bb316c --- a/data/kernels/basecurve.cl +++ b/data/kernels/basecurve.cl @@ -1,6 +1,6 @@ /* This file is part of darktable, - copyright (c) 2016-2025 darktable developers. + copyright (c) 2016-2026 darktable developers. 
darktable is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,6 +19,28 @@ #include "color_conversion.h" #include "rgb_norms.h" +inline float _aces_tone_map(const float x) +{ + const float a = 2.51f; + const float b = 0.03f; + const float c = 2.43f; + const float d = 0.59f; + const float e = 0.14f; + + return clamp((x * (a * x + b)) / (x * (c * x + d) + e), 0.0f, 1.0f); +} + +inline float _aces_20_tonemap(const float x) +{ + const float a = 0.0245786f; + const float b = 0.000090537f; + const float c = 0.983729f; + const float d = 0.4329510f; + const float e = 0.238081f; + + return clamp((x * (x + a) - b) / (x * (c * x + d) + e), 0.0f, 1.0f); +} + /* Primary LUT lookup. Measures the luminance of a given pixel using a selectable function, looks up that luminance in the configured basecurve, and then scales each channel by the result. @@ -86,9 +108,11 @@ basecurve_legacy_lut(read_only image2d_t in, write_only image2d_t out, const int float4 pixel = read_imagef(in, sampleri, (int2)(x, y)); // apply ev multiplier and use lut or extrapolation: - pixel.x = lookup_unbounded(table, mul * pixel.x, a); - pixel.y = lookup_unbounded(table, mul * pixel.y, a); - pixel.z = lookup_unbounded(table, mul * pixel.z, a); + float3 f = pixel.xyz * mul; + + pixel.x = lookup_unbounded(table, f.x, a); + pixel.y = lookup_unbounded(table, f.y, a); + pixel.z = lookup_unbounded(table, f.z, a); pixel = fmax(pixel, 0.f); write_imagef (out, (int2)(x, y), pixel); } @@ -298,14 +322,238 @@ basecurve_reconstruct(read_only image2d_t in, read_only image2d_t tmp, write_onl } kernel void -basecurve_finalize(read_only image2d_t in, read_only image2d_t comb, write_only image2d_t out, const int width, const int height) +basecurve_finalize(read_only image2d_t in, read_only image2d_t comb, write_only image2d_t out, const int width, + const int height, const int workflow_mode, const float shadow_lift, const float highlight_gain, + 
const float ucs_saturation_balance, const float gamut_strength, const float highlight_corr, const int target_gamut, const float look_opacity, const float16 look_mat, const float alpha) { const int x = get_global_id(0); const int y = get_global_id(1); if(x >= width || y >= height) return; - float4 pixel = fmax(read_imagef(comb, sampleri, (int2)(x, y)), 0.f); + float4 pixel = read_imagef(comb, sampleri, (int2)(x, y)); + + // Sanitize to avoid Inf/NaN propagation + pixel.xyz = clamp(pixel.xyz, -1e6f, 1e6f); + + if(workflow_mode > 0) + { + float3 pixel_in = pixel.xyz; + float3 look_transformed; + look_transformed.x = dot(pixel_in, (float3)(look_mat.s0, look_mat.s1, look_mat.s2)); + look_transformed.y = dot(pixel_in, (float3)(look_mat.s3, look_mat.s4, look_mat.s5)); + look_transformed.z = dot(pixel_in, (float3)(look_mat.s6, look_mat.s7, look_mat.s8)); + + // Mix between original and transformed + pixel.xyz = mix(pixel_in, look_transformed, look_opacity); + pixel.xyz = fmax(pixel.xyz, 0.0f); // Anti-black artifacts + + if(highlight_gain != 1.0f) + pixel.xyz *= highlight_gain; + + if(shadow_lift != 1.0f) + { + pixel.x = (pixel.x > 0.0f) ? native_powr(pixel.x, shadow_lift) : pixel.x; + pixel.y = (pixel.y > 0.0f) ? native_powr(pixel.y, shadow_lift) : pixel.y; + pixel.z = (pixel.z > 0.0f) ? native_powr(pixel.z, shadow_lift) : pixel.z; + } + + const float r_coeff = 0.2627f; + const float g_coeff = 0.6780f; + const float b_coeff = 0.0593f; + + float y_in = pixel.x * r_coeff + pixel.y * g_coeff + pixel.z * b_coeff; + float y_out = y_in; + + /* Scene-referred: luminance-adaptive shoulder extension for ACES-like + tonemapping using perceptual luminance Jz. 
*/ + if(workflow_mode == 1 || workflow_mode == 2) + { + float3 xyz; + xyz.x = 0.636958f * pixel.x + 0.144617f * pixel.y + 0.168881f * pixel.z; + xyz.y = 0.262700f * pixel.x + 0.677998f * pixel.y + 0.059302f * pixel.z; + xyz.z = 0.000000f * pixel.x + 0.028073f * pixel.y + 1.060985f * pixel.z; + + xyz = fmax(xyz, (float3)(0.0f)); + + float4 xyz_scaled = (float4)(xyz.x * 400.0f, xyz.y * 400.0f, xyz.z * 400.0f, 0.0f); + float4 jab = XYZ_to_JzAzBz(xyz_scaled); + + const float L = clamp(jab.x, 0.0f, 1.0f); + const float k = 1.0f + alpha * L * L; + + const float x_scaled = y_in / k; + if(workflow_mode == 1) + y_out = _aces_tone_map(x_scaled) * k; + else + y_out = _aces_20_tonemap(x_scaled * 1.257f) * k; + } + + float gain = y_out / fmax(y_in, 1e-6f); + pixel.xyz *= gain; + + const float threshold = 0.80f; + if(y_out > threshold) + { + float factor = (y_out - threshold) / (1.0f - threshold); + factor = clamp(factor, 0.0f, 1.0f); + pixel.xyz = mix(pixel.xyz, (float3)y_out, factor); + } + + float4 jab = (float4)(0.0f); + if(ucs_saturation_balance != 0.0f || gamut_strength > 0.0f || highlight_corr != 0.0f) + { + // RGB Rec2020 to XYZ D65 + float3 xyz; + xyz.x = 0.636958f * pixel.x + 0.144617f * pixel.y + 0.168881f * pixel.z; + xyz.y = 0.262700f * pixel.x + 0.677998f * pixel.y + 0.059302f * pixel.z; + xyz.z = 0.000000f * pixel.x + 0.028073f * pixel.y + 1.060985f * pixel.z; + + xyz = fmax(xyz, 0.0f); + + // XYZ to JzAzBz + float4 xyz_scaled = (float4)(xyz.x * 400.0f, xyz.y * 400.0f, xyz.z * 400.0f, 0.0f); + jab = XYZ_to_JzAzBz(xyz_scaled); + + int modified = 0; + + if(ucs_saturation_balance != 0.0f) + { + // Chroma-based modulation for saturation balance + const float chroma = fmax(fmax(pixel.x, pixel.y), pixel.z) - fmin(fmin(pixel.x, pixel.y), pixel.z); + const float effective_saturation = ucs_saturation_balance * fmin(chroma * 2.0f, 1.0f); + + // Apply saturation balance + const float Y = xyz.y; + const float L = native_sqrt(fmax(Y, 0.0f)); + const float fulcrum = 0.5f; + 
const float n = (L - fulcrum) / fulcrum; + const float mask_shadow = 1.0f / (1.0f + dtcl_exp(n * 4.0f)); + + float sat_adjust = effective_saturation * (2.0f * mask_shadow - 1.0f); + sat_adjust *= fmin(L * 4.0f, 1.0f); + const float sat_factor = 1.0f + sat_adjust; + jab.y *= sat_factor; + jab.z *= sat_factor; + modified = 1; + } + + if(gamut_strength > 0.0f) + { + const float Y = xyz.y; + const float L = native_sqrt(fmax(Y, 0.0f)); + const float chroma_factor = 1.0f - gamut_strength * (0.2f + 0.2f * L); + jab.y *= chroma_factor; + jab.z *= chroma_factor; + modified = 1; + } + + // HIGH SENSITIVITY CORRECTION + // Start effect at 0.20 up to 0.90. Linear transition. + float hl_mask = clamp((jab.x - 0.20f) / 0.70f, 0.0f, 1.0f); + + if(hl_mask > 0.0f && highlight_corr != 0.0f) + { + // 1. Soft symmetric desaturation (0.75 factor) + float desat = 1.0f - (fabs(highlight_corr) * hl_mask * 0.75f); + jab.y *= desat; + jab.z *= desat; + + // 2. Controlled Hue Rotation (2.0 factor) + float angle = highlight_corr * hl_mask * 2.0f; + float ca = native_cos(angle); + float sa = native_sin(angle); + float az = jab.y; + float bz = jab.z; + + jab.y = az * ca - bz * sa; + jab.z = az * sa + bz * ca; + modified = 1; + } + + if(jab.x > 0.95f) + { + const float desat = clamp((1.0f - jab.x) * 20.0f, 0.0f, 1.0f); + jab.y *= desat; + jab.z *= desat; + modified = 1; + } + + if(modified) + { + // JzAzBz to XYZ + xyz = JzAzBz_2_XYZ(jab).xyz / 400.0f; + + // XYZ D65 to RGB Rec2020 + pixel.x = 1.716651f * xyz.x - 0.355671f * xyz.y - 0.253366f * xyz.z; + pixel.y = -0.666684f * xyz.x + 1.616481f * xyz.y + 0.015768f * xyz.z; + pixel.z = 0.017640f * xyz.x - 0.042771f * xyz.y + 0.942103f * xyz.z; + + float min_val = fmin(pixel.x, fmin(pixel.y, pixel.z)); + if(min_val < 0.0f) + { + float lum = 0.2627f * pixel.x + 0.6780f * pixel.y + 0.0593f * pixel.z; + if(lum > 0.0f) + { + float factor = lum / (lum - min_val); + pixel.xyz = lum + factor * (pixel.xyz - lum); + } + } + pixel.xyz = clamp(pixel.xyz, 0.0f, 
1.0f); + } + } + + if(gamut_strength > 0.0f) + { + float4 orig = pixel; + + float Y = 0.2126f * pixel.x + 0.7152f * pixel.y + 0.0722f * pixel.z; + float lum_weight = clamp((Y - 0.3f) / (0.8f - 0.3f), 0.0f, 1.0f); + lum_weight = lum_weight * lum_weight * (3.0f - 2.0f * lum_weight); + float effective_strength = gamut_strength * lum_weight; + + float limit = 0.90f; + if (target_gamut == 1) limit = 0.95f; + else if (target_gamut == 2) limit = 1.00f; + + float threshold = limit * (1.0f - (effective_strength * 0.25f)); + float max_val = fmax(pixel.x, fmax(pixel.y, pixel.z)); + + if (max_val > threshold) + { + float range = limit - threshold; + float delta = max_val - threshold; + const float compressed = threshold + range * delta / (delta + range); + const float factor = compressed / max_val; + + float range_blue = 1.1f * range; + const float compressed_blue = threshold + range * delta / (delta + range_blue); + const float factor_blue = compressed_blue / max_val; + + pixel.x *= factor; + pixel.y *= factor; + pixel.z *= factor_blue; + } + pixel = mix(orig, pixel, effective_strength); + } + + // Final gamut check to preserve hue + if(pixel.x < 0.0f || pixel.x > 1.0f || pixel.y < 0.0f || pixel.y > 1.0f || pixel.z < 0.0f || pixel.z > 1.0f) + { + const float luma = 0.2627f * pixel.x + 0.6780f * pixel.y + 0.0593f * pixel.z; + const float target_luma = clamp(luma, 0.0f, 1.0f); + float t = 1.0f; + if (pixel.x < 0.0f) t = fmin(t, target_luma / (target_luma - pixel.x)); + if (pixel.y < 0.0f) t = fmin(t, target_luma / (target_luma - pixel.y)); + if (pixel.z < 0.0f) t = fmin(t, target_luma / (target_luma - pixel.z)); + if (pixel.x > 1.0f) t = fmin(t, (1.0f - target_luma) / (pixel.x - target_luma)); + if (pixel.y > 1.0f) t = fmin(t, (1.0f - target_luma) / (pixel.y - target_luma)); + if (pixel.z > 1.0f) t = fmin(t, (1.0f - target_luma) / (pixel.z - target_luma)); + t = fmax(0.0f, t); + pixel.xyz = target_luma + t * (pixel.xyz - target_luma); + } + } + pixel.w = read_imagef(in, 
sampleri, (int2)(x, y)).w; write_imagef (out, (int2)(x, y), pixel); diff --git a/src/iop/basecurve.c b/src/iop/basecurve.c old mode 100644 new mode 100755 index 54eeb94e7533..b917b3ddfde7 --- a/src/iop/basecurve.c +++ b/src/iop/basecurve.c @@ -48,7 +48,7 @@ #define MAXNODES 20 -DT_MODULE_INTROSPECTION(6, dt_iop_basecurve_params_t) +DT_MODULE_INTROSPECTION(7, dt_iop_basecurve_params_t) typedef struct dt_iop_basecurve_node_t { @@ -63,15 +63,34 @@ typedef struct dt_iop_basecurve_params_t dt_iop_basecurve_node_t basecurve[3][MAXNODES]; int basecurve_nodes[3]; // $MIN: 0 $MAX: MAXNODES $DEFAULT: 0 int basecurve_type[3]; // $MIN: 0 $MAX: MONOTONE_HERMITE $DEFAULT: MONOTONE_HERMITE - int exposure_fusion; /* number of exposure fusion steps - $DEFAULT: 0 $DESCRIPTION: "fusion" */ - float exposure_stops; /* number of stops between fusion images - $MIN: 0.01 $MAX: 4.0 $DEFAULT: 1.0 $DESCRIPTION: "exposure shift" */ - float exposure_bias; /* whether to do exposure-fusion with over or under-exposure - $MIN: -1.0 $MAX: 1.0 $DEFAULT: 1.0 $DESCRIPTION: "exposure bias" */ + int exposure_fusion; // number of exposure fusion steps $DEFAULT: 0 $DESCRIPTION: "fusion" + float exposure_stops; // number of stops between fusion images $MIN: 0.01 $MAX: 4.0 $DEFAULT: 1.0 $DESCRIPTION: "exposure shift" + float exposure_bias; // whether to do exposure-fusion with over or under-exposure $MIN: -1.0 $MAX: 1.0 $DEFAULT: 1.0 $DESCRIPTION: "exposure bias" dt_iop_rgb_norms_t preserve_colors; /* $DEFAULT: DT_RGB_NORM_LUMINANCE $DESCRIPTION: "preserve colors" */ + int workflow_mode; // $DEFAULT: 1 + float shadow_lift; // $MIN: 0.25 $MAX: 1.75 $DEFAULT: 1.0 $DESCRIPTION: "shadow correction" + float highlight_gain; // $MIN: 0.25 $MAX: 1.75 $DEFAULT: 1.0 $DESCRIPTION: "highlight gain" + float ucs_saturation_balance; // $MIN: -0.75 $MAX: 0.75 $DEFAULT: 0.2 $DESCRIPTION: "balance saturation ucs" + float gamut_strength; // $MIN: 0.0 $MAX: 1.0 $DEFAULT: 0.0 $DESCRIPTION: "gamut compression" + float 
highlight_corr; // $MIN: -1.0 $MAX: 1.0 $DEFAULT: 0.0 $DESCRIPTION: "Highlight Hue/Sat" + int target_gamut; // $DEFAULT: 0 $DESCRIPTION: "target gamut" + int color_look; // $DEFAULT: 1 $DESCRIPTION: "color look style" + float look_opacity; // $MIN: 0.1 $MAX: 1.0 $DEFAULT: 1.0 $DESCRIPTION: "look opacity" } dt_iop_basecurve_params_t; +static const float color_looks[10][10] = { + {1.000f, 0.000f, 0.000f, 0.000f, 1.000f, 0.000f, 0.000f, 0.000f, 1.000f}, // 1. Neutral + {0.932f, 0.051f, 0.017f, 0.021f, 0.945f, 0.034f, 0.011f, 0.025f, 0.964f}, // 2. Natural look + {1.029f, -0.008f, -0.074f, -0.023f, 1.008f, 0.046f, -0.002f, 0.007f, 1.010f}, // 3. Portrait + {1.084f, -0.006f, -0.093f, -0.074f, 1.008f, 0.060f, -0.011f, 0.005f, 1.024f}, // 4. Nature + {1.074f, 0.006f, -0.103f, -0.054f, 1.009f, 0.060f, -0.071f, -0.059f, 1.086f}, // 5. Vibrant + {1.218f, 0.007f, -0.192f, -0.119f, 1.076f, 0.048f, -0.099f, -0.069f, 1.154f}, // 6. Blue Sky + {1.082f, -0.020f, 0.103f, -0.051f, 1.052f, 0.042f, -0.047f, -0.045f, 1.073f}, // 7. Soft Warm + {1.050f, 0.020f, -0.010f, -0.020f, 1.020f, 0.000f, -0.010f, -0.020f, 1.030f}, // 8. Soft + {0.980f, -0.010f, -0.010f, 0.000f, 1.050f, -0.020f, 0.020f, 0.010f, 1.100f}, // 9. Deep Cool + {1.020f, -0.010f, -0.010f, -0.030f, 1.040f, -0.010f, 0.000f, -0.030f, 1.030f} // 10. 
Authentic Cinema +}; + int legacy_params(dt_iop_module_t *self, const void *const old_params, const int old_version, @@ -227,6 +246,20 @@ int legacy_params(dt_iop_module_t *self, *new_version = 6; return 0; } + if(old_version == 6) + { + const dt_iop_basecurve_params_v6_t *o = (dt_iop_basecurve_params_v6_t *)old_params; + dt_iop_basecurve_params_t *n = calloc(1, sizeof(dt_iop_basecurve_params_t)); + memcpy(n, o, sizeof(dt_iop_basecurve_params_v6_t)); + n->workflow_mode = 0; + n->shadow_lift = 1.0f; + n->highlight_gain = 1.0f; + n->look_opacity = 1.0f; /* calloc leaves 0.0f, which is below the declared minimum of 0.1 */ + *new_params = n; + *new_params_size = sizeof(dt_iop_basecurve_params_t); + *new_version = 7; + return 0; + } return 1; } @@ -234,18 +267,21 @@ typedef struct dt_iop_basecurve_gui_data_t { dt_draw_curve_t *minmax_curve; // curve for gui to draw int minmax_curve_type, minmax_curve_nodes; - GtkBox *hbox; GtkDrawingArea *area; - GtkWidget *fusion, *exposure_step, *exposure_bias; + GtkWidget *fusion, *exposure_step, *exposure_bias, *shadow_lift, *highlight_gain; GtkWidget *cmb_preserve_colors; + GtkWidget *workflow_mode; double mouse_x, mouse_y; int selected; - double selected_offset, selected_y, selected_min, selected_max; - float draw_xs[DT_IOP_TONECURVE_RES], draw_ys[DT_IOP_TONECURVE_RES]; - float draw_min_xs[DT_IOP_TONECURVE_RES], draw_min_ys[DT_IOP_TONECURVE_RES]; - float draw_max_xs[DT_IOP_TONECURVE_RES], draw_max_ys[DT_IOP_TONECURVE_RES]; + float draw_ys[DT_IOP_TONECURVE_RES]; float loglogscale; GtkWidget *logbase; + GtkWidget *ucs_saturation_balance; + GtkWidget *gamut_strength; + GtkWidget *highlight_corr; + GtkWidget *target_gamut; + GtkWidget *color_look; + GtkWidget *look_opacity; } dt_iop_basecurve_gui_data_t; typedef struct basecurve_preset_t @@ -335,6 +371,15 @@ typedef struct dt_iop_basecurve_data_t float exposure_stops; float exposure_bias; int preserve_colors; + int workflow_mode; + float shadow_lift; + float highlight_gain; + float ucs_saturation_balance; + float gamut_strength; + float highlight_corr; + int target_gamut; +
int color_look; + float look_opacity; } dt_iop_basecurve_data_t; typedef struct dt_iop_basecurve_global_data_t @@ -370,8 +415,8 @@ const char **description(dt_iop_module_t *self) _("apply a view transform based on personal or camera maker look,\n" "for corrective purposes, to prepare images for display"), _("corrective"), - _("linear, RGB, display-referred"), - _("non-linear, RGB"), + _("linear, RGB, scene-referred"), + _("linear, non-linear, RGB"), _("non-linear, RGB, display-referred")); } @@ -490,6 +535,8 @@ void reload_defaults(dt_iop_module_t *self) { dt_iop_basecurve_params_t *const d = self->default_params; + *d = basecurve_presets[0].params; + if(self->multi_priority == 0) { const dt_image_t *const image = &(self->dev->image_storage); @@ -529,6 +576,21 @@ void reload_defaults(dt_iop_module_t *self) d->exposure_stops = 1.0f; d->exposure_bias = 1.0f; } + + if(!dt_is_display_referred()) + { + // Force ACES defaults on top of whatever curve was found + d->workflow_mode = 1; + d->shadow_lift = 1.0f; + d->highlight_gain = 1.0f; + d->ucs_saturation_balance = 0.2f; + d->color_look = 1; // Natural look + + d->basecurve_nodes[0] = 2; + d->basecurve_type[0] = CUBIC_SPLINE; + d->basecurve[0][0].x = 0.0f; d->basecurve[0][0].y = 0.0f; + d->basecurve[0][1].x = 1.0f; d->basecurve[0][1].y = 1.0f; + } } void init_presets(dt_iop_module_so_t *self) @@ -583,12 +645,12 @@ int gauss_blur_cl(dt_iop_module_t *self, cl_int err = DT_OPENCL_DEFAULT_ERROR; const int devid = piece->pipe->devid; - /* horizontal blur */ + //horizontal blur err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_basecurve_blur_h, width, height, CLARG(dev_in), CLARG(dev_tmp), CLARG(width), CLARG(height)); if(err != CL_SUCCESS) return FALSE; - /* vertical blur */ + // vertical blur err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_basecurve_blur_v, width, height, CLARG(dev_tmp), CLARG(dev_out), CLARG(width), CLARG(height)); if(err != CL_SUCCESS) return FALSE; @@ -693,6 +755,11 @@ int 
process_cl_fusion(dt_iop_module_t *self, cl_mem dev_m = NULL; cl_mem dev_coeffs = NULL; + // Prepare Color Look matrix (9 floats packed into float16 for OpenCL) + float look_mat_buf[16] = {0.0f}; + for(int i=0; i<9; i++) look_mat_buf[i] = color_looks[d->color_look][i]; + const float alpha = 0.5f; + const int use_work_profile = (work_profile == NULL) ? 0 : 1; const int preserve_colors = d->preserve_colors; @@ -743,7 +810,6 @@ int process_cl_fusion(dt_iop_module_t *self, for(int e = 0; e < d->exposure_fusion + 1; e++) { - // for every exposure fusion image: push by some ev, apply base curve and compute features { const float mul = exposure_increment(d->exposure_stops, e, d->exposure_fusion, d->exposure_bias); @@ -886,8 +952,11 @@ int process_cl_fusion(dt_iop_module_t *self, } // copy output buffer + // Apply ACES/shadow_lift here if needed err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_basecurve_finalize, width, height, - CLARG(dev_in), CLARG(dev_comb[0]), CLARG(dev_out), CLARG(width), CLARG(height)); + CLARG(dev_in), CLARG(dev_comb[0]), CLARG(dev_out), CLARG(width), CLARG(height), CLARG(d->workflow_mode), + CLARG(d->shadow_lift), CLARG(d->highlight_gain), CLARG(d->ucs_saturation_balance), CLARG(d->gamut_strength), + CLARG(d->highlight_corr), CLARG(d->target_gamut), CLARG(d->look_opacity), CLARG(look_mat_buf), CLARG(alpha)); error: for(int k = 0; k < num_levels_max; k++) @@ -920,7 +989,8 @@ int process_cl_lut(dt_iop_module_t *self, cl_mem dev_m = NULL; cl_mem dev_coeffs = NULL; - cl_int err = CL_MEM_OBJECT_ALLOCATION_FAILURE; + cl_int err = DT_OPENCL_DEFAULT_ERROR; + cl_mem dev_tmp = NULL; cl_mem dev_profile_info = NULL; cl_mem dev_profile_lut = NULL; @@ -933,28 +1003,61 @@ int process_cl_lut(dt_iop_module_t *self, const int height = roi_in->height; const int preserve_colors = d->preserve_colors; + const float mul = 1.0f; + + size_t sizes[] = { ROUNDUPDWD(width, devid), ROUNDUPDHT(height, devid), 1 }; dev_m = dt_opencl_copy_host_to_device(devid, d->table, 
256, 256, sizeof(float)); - dev_coeffs = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 3, d->unbounded_coeffs); - if(!dev_m || !dev_coeffs) goto error; + if(dev_m == NULL) goto error; err = dt_ioppr_build_iccprofile_params_cl(work_profile, devid, &profile_info_cl, &profile_lut_cl, &dev_profile_info, &dev_profile_lut); if(err != CL_SUCCESS) goto error; + dev_coeffs = dt_opencl_copy_host_to_device_constant(devid, sizeof(float) * 3, d->unbounded_coeffs); + + if(dev_coeffs == NULL) goto error; + + cl_mem dev_dest = dev_out; + + float look_mat_buf[16] = {0.0f}; + for(int i=0; i<9; i++) look_mat_buf[i] = color_looks[d->color_look][i]; + const float alpha = 0.75f; + + if(d->workflow_mode > 0) + { + dev_tmp = dt_opencl_alloc_device(devid, width, height, sizeof(float) * 4); + if(dev_tmp == NULL) goto error; + dev_dest = dev_tmp; + } + // read data/kernels/basecurve.cl for a description of "legacy" vs current // Conditional is moved outside of the OpenCL operations for performance. 
if(d->preserve_colors == DT_RGB_NORM_NONE) - err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_basecurve_legacy_lut, width, height, - CLARG(dev_in), CLARG(dev_out), - CLARG(width), CLARG(height), CLARGFLOAT(1.0f), CLARG(dev_m), CLARG(dev_coeffs)); + { + dt_opencl_set_kernel_args(devid, gd->kernel_basecurve_legacy_lut, 0, CLARG(dev_in), CLARG(dev_dest), + CLARG(width), CLARG(height), CLARG(mul), CLARG(dev_m), CLARG(dev_coeffs)); + err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_basecurve_legacy_lut, sizes); + } else - err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_basecurve_lut, width, height, - CLARG(dev_in), CLARG(dev_out), - CLARG(width), CLARG(height), - CLARGFLOAT(1.0f), CLARG(dev_m), CLARG(dev_coeffs), CLARG(preserve_colors), CLARG(dev_profile_info), + { + //FIXME: There are still conditionals on d->preserve_colors within this flow that could impact performance + dt_opencl_set_kernel_args(devid, gd->kernel_basecurve_lut, 0, CLARG(dev_in), CLARG(dev_dest), CLARG(width), + CLARG(height), CLARG(mul), CLARG(dev_m), CLARG(dev_coeffs), CLARG(preserve_colors), CLARG(dev_profile_info), CLARG(dev_profile_lut), CLARG(use_work_profile)); + err = dt_opencl_enqueue_kernel_2d(devid, gd->kernel_basecurve_lut, sizes); + } + + if(err == CL_SUCCESS && d->workflow_mode > 0) /* skip the finalize pass when the LUT kernel failed, so its error code is not overwritten */ + { + err = dt_opencl_enqueue_kernel_2d_args(devid, gd->kernel_basecurve_finalize, width, height, + CLARG(dev_in), CLARG(dev_tmp), CLARG(dev_out), CLARG(width), CLARG(height), CLARG(d->workflow_mode), + CLARG(d->shadow_lift), CLARG(d->highlight_gain), CLARG(d->ucs_saturation_balance), CLARG(d->gamut_strength), + CLARG(d->highlight_corr), CLARG(d->target_gamut), CLARG(d->look_opacity), CLARG(look_mat_buf), CLARG(alpha)); + if(err != CL_SUCCESS) goto error; + } error: dt_opencl_release_mem_object(dev_tmp); dt_opencl_release_mem_object(dev_m); dt_opencl_release_mem_object(dev_coeffs); dt_ioppr_free_iccprofile_params_cl(&profile_info_cl, &profile_lut_cl, &dev_profile_info, &dev_profile_lut); @@ -1007,6 +1110,28
@@ void tiling_callback(dt_iop_module_t *self, } } +static inline float _aces_tone_map(const float x) +{ + const float a = 2.51f; + const float b = 0.03f; + const float c = 2.43f; + const float d = 0.59f; + const float e = 0.14f; + + return CLAMP((x * (a * x + b)) / (x * (c * x + d) + e), 0.0f, 1.0f); +} + +static inline float _aces_20_tonemap(const float x) +{ + const float a = 0.0245786f; + const float b = 0.000090537f; + const float c = 0.983729f; + const float d = 0.4329510f; + const float e = 0.238081f; + + return CLAMP((x * (x + a) - b) / (x * (c * x + d) + e), 0.0f, 1.0f); +} + // See comments of opencl version in data/kernels/basecurve.cl for description of the meaning of "legacy" static inline void apply_legacy_curve( const float *const in, @@ -1023,12 +1148,16 @@ static inline void apply_legacy_curve( { for(int i = 0; i < 3; i++) { - const float f = in[k+i] * mul; + float f = in[k+i] * mul; + + float val; // use base curve for values < 1, else use extrapolation. if(f < 1.0f) - out[k+i] = fmaxf(table[CLAMP((int)(f * 0x10000ul), 0, 0xffff)], 0.f); + val = fmaxf(table[CLAMP((int)(f * 0x10000ul), 0, 0xffff)], 0.f); else - out[k+i] = fmaxf(dt_iop_eval_exp(unbounded_coeffs, f), 0.f); + val = fmaxf(dt_iop_eval_exp(unbounded_coeffs, f), 0.f); + + out[k+i] = val; } out[k+3] = in[k+3]; } @@ -1059,6 +1188,7 @@ static inline void apply_curve( const float curve_lum = (lum < 1.0f) ? 
table[CLAMP((int)(lum * 0x10000ul), 0, 0xffff)] : dt_iop_eval_exp(unbounded_coeffs, lum); + ratio = mul * curve_lum / lum; } for(size_t c = 0; c < 3; c++) @@ -1204,7 +1334,300 @@ static inline void gauss_reduce( } } -void process_fusion(dt_iop_module_t *self, +static void process_lut(dt_iop_module_t *self, + dt_dev_pixelpipe_iop_t *piece, + const void *const ivoid, + void *const ovoid, + const dt_iop_roi_t *const roi_in, + const dt_iop_roi_t *const roi_out) +{ + const float *const in = (const float *)ivoid; + float *const out = (float *)ovoid; + dt_iop_basecurve_data_t *const d = piece->data; + const dt_iop_order_iccprofile_info_t *const work_profile = dt_ioppr_get_iop_work_profile_info(piece->module, piece->module->dev->iop); + + const int wd = roi_in->width, ht = roi_in->height; + + if(d->preserve_colors == DT_RGB_NORM_NONE) + apply_legacy_curve(in, out, wd, ht, 1.0, d->table, d->unbounded_coeffs); + else + apply_curve(in, out, wd, ht, d->preserve_colors, 1.0, d->table, d->unbounded_coeffs, work_profile); + + if(d->workflow_mode > 0) + { + const float *mat = color_looks[d->color_look]; + + const size_t npixels = (size_t)wd * ht; + DT_OMP_FOR() + for(size_t k = 0; k < 4 * npixels; k += 4) + { + float r = out[k]; + float g = out[k+1]; + float b = out[k+2]; + + // Sanitize to avoid Inf/NaN issues + r = fmaxf(-1e6f, fminf(r, 1e6f)); + g = fmaxf(-1e6f, fminf(g, 1e6f)); + b = fmaxf(-1e6f, fminf(b, 1e6f)); + + // Apply Color Look + float tr = r * mat[0] + g * mat[1] + b * mat[2]; + float tg = r * mat[3] + g * mat[4] + b * mat[5]; + float tb = r * mat[6] + g * mat[7] + b * mat[8]; + + // Mix with opacity + out[k] = r * (1.0f - d->look_opacity) + tr * d->look_opacity; + out[k+1] = g * (1.0f - d->look_opacity) + tg * d->look_opacity; + out[k+2] = b * (1.0f - d->look_opacity) + tb * d->look_opacity; + + out[k] = fmaxf(out[k], 0.0f); + out[k+1] = fmaxf(out[k+1], 0.0f); + out[k+2] = fmaxf(out[k+2], 0.0f); + + // Reload for next steps + r = out[k]; + g = out[k+1]; + b = 
out[k+2]; + + if(d->highlight_gain != 1.0f) { + r *= d->highlight_gain; + g *= d->highlight_gain; + b *= d->highlight_gain; + } + if(d->shadow_lift != 1.0f) { + r = powf(r, d->shadow_lift); + g = powf(g, d->shadow_lift); + b = powf(b, d->shadow_lift); + } + + const float r_coeff = 0.2627f, g_coeff = 0.6780f, b_coeff = 0.0593f; + float y_in = r * r_coeff + g * g_coeff + b * b_coeff; + float y_out = y_in; + + /* Scene-referred: apply luminance-adaptive shoulder extension for + ACES-like tonemapping. Compute perceptual luminance Jz from RGB + and derive scale k = 1 + alpha * L^2 where L = clamp(Jz,0,1). + Then tone-map x_scaled = y_in / k and rescale result by k to + extend the shoulder progressively. Keep alpha constant and + avoid changing UI or legacy/display-referred behavior. */ + if(d->workflow_mode == 1 || d->workflow_mode == 2) + { + // compute Jz from current RGB (Rec2020 -> XYZ -> JzAzBz) + float xyz[3]; + xyz[0] = 0.636958f * r + 0.144617f * g + 0.168881f * b; + xyz[1] = 0.262700f * r + 0.677998f * g + 0.059302f * b; + xyz[2] = 0.000000f * r + 0.028073f * g + 1.060985f * b; + for(int i=0;i<3;i++) xyz[i] = fmaxf(xyz[i], 0.0f); + + float xyz_scaled[4]; + xyz_scaled[0] = xyz[0] * 400.0f; + xyz_scaled[1] = xyz[1] * 400.0f; + xyz_scaled[2] = xyz[2] * 400.0f; + xyz_scaled[3] = 0.0f; + + float jab[4] = {0.0f, 0.0f, 0.0f, 0.0f}; + dt_XYZ_2_JzAzBz(xyz_scaled, jab); + + const float L = fminf(fmaxf(jab[0], 0.0f), 1.0f); + const float alpha = 0.75f; + const float k_scale = 1.0f + alpha * L * L; + + // scale luminance, apply selected tonemap, then undo scaling + const float x_scaled = y_in / k_scale; + if(d->workflow_mode == 1) + y_out = _aces_tone_map(x_scaled) * k_scale; + else /* workflow_mode == 2 */ + y_out = _aces_20_tonemap(x_scaled * 1.257f) * k_scale; + } + + float gain = y_out / fmaxf(y_in, 1e-6f); + + out[k] = r * gain; + out[k+1] = g * gain; + out[k+2] = b * gain; + + const float threshold = 0.80f; + if(y_out > threshold) + { + float factor = (y_out - 
threshold) / (1.0f - threshold); + factor = CLAMP(factor, 0.0f, 1.0f); + out[k] = out[k] * (1.0f - factor) + y_out * factor; + out[k+1] = out[k+1] * (1.0f - factor) + y_out * factor; + out[k+2] = out[k+2] * (1.0f - factor) + y_out * factor; + } + + if(d->ucs_saturation_balance != 0.0f || d->gamut_strength > 0.0f || d->highlight_corr != 0.0f) + { + // RGB Rec2020 to XYZ D65 + float xyz[4]; + xyz[0] = 0.636958f * out[k] + 0.144617f * out[k+1] + 0.168881f * out[k+2]; + xyz[1] = 0.262700f * out[k] + 0.677998f * out[k+1] + 0.059302f * out[k+2]; + xyz[2] = 0.000000f * out[k] + 0.028073f * out[k+1] + 1.060985f * out[k+2]; + + for(int i=0; i<3; i++) xyz[i] = fmaxf(xyz[i], 0.0f); + + // XYZ to JzAzBz + float jab[4]; + float xyz_scaled[4]; + for(int i=0; i<3; i++) xyz_scaled[i] = xyz[i] * 400.0f; // Scale to 400 nits for JzAzBz + dt_XYZ_2_JzAzBz(xyz_scaled, jab); + + int modified = 0; + + if(d->ucs_saturation_balance != 0.0f) + { + // Chroma-based modulation for saturation balance + const float r_sat = out[k]; + const float g_sat = out[k+1]; + const float b_sat = out[k+2]; + const float chroma = fmaxf(fmaxf(r_sat, g_sat), b_sat) - fminf(fminf(r_sat, g_sat), b_sat); + const float effective_saturation = d->ucs_saturation_balance * fminf(chroma * 2.0f, 1.0f); + + // Apply saturation balance + // Use Rec2020 Luminance Y for mask + const float Y = xyz[1]; + const float L = powf(fmaxf(Y, 0.0f), 0.5f); + const float fulcrum = 0.5f; + const float n = (L - fulcrum) / fulcrum; + const float mask_shadow = 1.0f / (1.0f + expf(n * 4.0f)); + float sat_adjust = effective_saturation * (2.0f * mask_shadow - 1.0f); + sat_adjust *= fminf(L * 4.0f, 1.0f); + const float sat_factor = 1.0f + sat_adjust; + jab[1] *= sat_factor; + jab[2] *= sat_factor; + modified = 1; + } + + if(d->gamut_strength > 0.0f) + { + const float Y = xyz[1]; + const float L = powf(fmaxf(Y, 0.0f), 0.5f); + const float chroma_factor = 1.0f - d->gamut_strength * (0.2f + 0.2f * L); + jab[1] *= chroma_factor; + jab[2] *= 
chroma_factor; + modified = 1; + } + + if(d->highlight_corr != 0.0f) + { + // HIGHLIGHT HUE AND SATURATION CORRECTION (sync with OpenCL) + // Mask starts at Jz = 0.20 and is full at Jz = 0.90. Linear transition. + float hl_mask = CLAMP((jab[0] - 0.20f) / 0.70f, 0.0f, 1.0f); + + if(hl_mask > 0.0f) + { + // 1. Soft symmetric desaturation (0.75 factor) + float desat = 1.0f - (fabsf(d->highlight_corr) * hl_mask * 0.75f); + jab[1] *= desat; + jab[2] *= desat; + + // 2. Controlled Hue Rotation (2.0 factor) + float angle = d->highlight_corr * hl_mask * 2.0f; + float ca = cosf(angle); + float sa = sinf(angle); + float az = jab[1]; + float bz = jab[2]; + jab[1] = az * ca - bz * sa; + jab[2] = az * sa + bz * ca; + modified = 1; + } + } + + if(jab[0] > 0.95f) + { + const float desat = CLAMP((1.0f - jab[0]) * 20.0f, 0.0f, 1.0f); + jab[1] *= desat; + jab[2] *= desat; + modified = 1; + } + + if(modified) + { + // JzAzBz to XYZ + dt_JzAzBz_2_XYZ(jab, xyz_scaled); + for(int i=0; i<3; i++) xyz[i] = xyz_scaled[i] / 400.0f; + + // XYZ D65 to RGB Rec2020 + out[k] = 1.716651f * xyz[0] - 0.355671f * xyz[1] - 0.253366f * xyz[2]; + out[k+1] = -0.666684f * xyz[0] + 1.616481f * xyz[1] + 0.015768f * xyz[2]; + out[k+2] = 0.017640f * xyz[0] - 0.042771f * xyz[1] + 0.942103f * xyz[2]; + + float min_val = fminf(out[k], fminf(out[k+1], out[k+2])); + if(min_val < 0.0f) + { + float lum = 0.2627f * out[k] + 0.6780f * out[k+1] + 0.0593f * out[k+2]; + if(lum > 0.0f) + { + float factor = lum / (lum - min_val); + out[k] = lum + factor * (out[k] - lum); + out[k+1] = lum + factor * (out[k+1] - lum); + out[k+2] = lum + factor * (out[k+2] - lum); + } + } + } + + if(d->gamut_strength > 0.0f) + { + const float orig_r = out[k]; + const float orig_g = out[k+1]; + const float orig_b = out[k+2]; + + const float Y = 0.2126f * orig_r + 0.7152f * orig_g + 0.0722f * orig_b; + float lum_weight = CLAMP((Y - 0.3f) / (0.8f - 0.3f), 0.0f, 1.0f); + lum_weight = lum_weight * lum_weight * (3.0f - 2.0f * lum_weight); + const 
float effective_strength = d->gamut_strength * lum_weight; + + float limit = 0.90f; + if (d->target_gamut == 1) limit = 0.95f; + else if (d->target_gamut == 2) limit = 1.00f; + + float gamut_threshold = limit * (1.0f - (effective_strength * 0.25f)); + float max_val = fmaxf(out[k], fmaxf(out[k+1], out[k+2])); + + if (max_val > gamut_threshold) + { + float range = limit - gamut_threshold; + float delta = max_val - gamut_threshold; + const float compressed = gamut_threshold + range * delta / (delta + range); + const float factor = compressed / max_val; + + float range_blue = 1.1f * range; + const float compressed_blue = gamut_threshold + range * delta / (delta + range_blue); + const float factor_blue = compressed_blue / max_val; + + out[k] *= factor; + out[k+1] *= factor; + out[k+2] *= factor_blue; + } + + out[k] = orig_r * (1.0f - effective_strength) + out[k] * effective_strength; + out[k+1] = orig_g * (1.0f - effective_strength) + out[k+1] * effective_strength; + out[k+2] = orig_b * (1.0f - effective_strength) + out[k+2] * effective_strength; + } + + // Final gamut check to preserve hue (exact color) + if(out[k] < 0.0f || out[k] > 1.0f || out[k+1] < 0.0f || out[k+1] > 1.0f || out[k+2] < 0.0f || out[k+2] > 1.0f) + { + const float luma = 0.2627f * out[k] + 0.6780f * out[k+1] + 0.0593f * out[k+2]; + const float target_luma = CLAMP(luma, 0.0f, 1.0f); + float t = 1.0f; + if (out[k] < 0.0f) t = fminf(t, target_luma / (target_luma - out[k])); + if (out[k+1] < 0.0f) t = fminf(t, target_luma / (target_luma - out[k+1])); + if (out[k+2] < 0.0f) t = fminf(t, target_luma / (target_luma - out[k+2])); + if (out[k] > 1.0f) t = fminf(t, (1.0f - target_luma) / (out[k] - target_luma)); + if (out[k+1] > 1.0f) t = fminf(t, (1.0f - target_luma) / (out[k+1] - target_luma)); + if (out[k+2] > 1.0f) t = fminf(t, (1.0f - target_luma) / (out[k+2] - target_luma)); + t = fmaxf(0.0f, t); + out[k] = target_luma + t * (out[k] - target_luma); + out[k+1] = target_luma + t * (out[k+1] - target_luma); 
+ out[k+2] = target_luma + t * (out[k+2] - target_luma); + } + } + } + } +} + +static void process_fusion(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid, @@ -1270,17 +1693,6 @@ void process_fusion(dt_iop_module_t *self, for(size_t k = 0; k < 4ul * wd * ht; k += 4) col[0][k + 3] *= .1f + sqrtf(out[k] * out[k] + out[k + 1] * out[k + 1] + out[k + 2] * out[k + 2]); -// #define DEBUG_VIS2 -#ifdef DEBUG_VIS2 // transform weights in channels - for(size_t k = 0; k < 4ul * w * h; k += 4) col[0][k + e] = col[0][k + 3]; -#endif - -// #define DEBUG_VIS -#ifdef DEBUG_VIS // DEBUG visualise weight buffer - for(size_t k = 0; k < 4ul * w * h; k += 4) comb[0][k + e] = col[0][k + 3]; - continue; -#endif - for(int k = 1; k < num_levels; k++) { gauss_reduce(col[k - 1], col[k], 0, w, h); @@ -1307,14 +1719,10 @@ void process_fusion(dt_iop_module_t *self, { // blend images into output pyramid if(k == num_levels - 1) // blend gaussian base -#ifdef DEBUG_VIS2 - ; -#else { for(int c = 0; c < 3; c++) comb[k][x + c] += col[k][x + 3] * col[k][x + c]; } -#endif else // laplacian { for(int c = 0; c < 3; c++) @@ -1325,7 +1733,6 @@ void process_fusion(dt_iop_module_t *self, } } -#ifndef DEBUG_VIS // DEBUG: switch off when visualising weight buf // normalise and reconstruct output pyramid buffer coarse to fine for(int k = num_levels - 1; k >= 0; k--) { @@ -1354,14 +1761,267 @@ void process_fusion(dt_iop_module_t *self, } } } -#endif + // copy output buffer + const float *mat = color_looks[d->color_look]; DT_OMP_FOR() for(size_t k = 0; k < (size_t)4 * wd * ht; k += 4) { - out[k + 0] = fmaxf(comb[0][k + 0], 0.f); - out[k + 1] = fmaxf(comb[0][k + 1], 0.f); - out[k + 2] = fmaxf(comb[0][k + 2], 0.f); + float val[3]; + val[0] = fmaxf(comb[0][k + 0], 0.f); + val[1] = fmaxf(comb[0][k + 1], 0.f); + val[2] = fmaxf(comb[0][k + 2], 0.f); + + // Sanitize to avoid Inf/NaN issues + val[0] = fminf(val[0], 1e6f); + val[1] = fminf(val[1], 1e6f); + val[2] = 
fminf(val[2], 1e6f); + + // If using ACES workflow, we apply shadow lift and tone mapping here, after fusion + if(d->workflow_mode > 0) + { + // Apply Color Look + float r = val[0], g = val[1], b = val[2]; + float tr = r * mat[0] + g * mat[1] + b * mat[2]; + float tg = r * mat[3] + g * mat[4] + b * mat[5]; + float tb = r * mat[6] + g * mat[7] + b * mat[8]; + + // Mix with opacity + val[0] = r * (1.0f - d->look_opacity) + tr * d->look_opacity; + val[1] = g * (1.0f - d->look_opacity) + tg * d->look_opacity; + val[2] = b * (1.0f - d->look_opacity) + tb * d->look_opacity; + + val[0] = fmaxf(val[0], 0.0f); + val[1] = fmaxf(val[1], 0.0f); + val[2] = fmaxf(val[2], 0.0f); + + if(d->highlight_gain != 1.0f) { + val[0] *= d->highlight_gain; + val[1] *= d->highlight_gain; + val[2] *= d->highlight_gain; + } + if(d->shadow_lift != 1.0f) { + val[0] = powf(val[0], d->shadow_lift); + val[1] = powf(val[1], d->shadow_lift); + val[2] = powf(val[2], d->shadow_lift); + } + + const float r_coeff = 0.2627f, g_coeff = 0.6780f, b_coeff = 0.0593f; + float y_in = val[0] * r_coeff + val[1] * g_coeff + val[2] * b_coeff; + float y_out = y_in; + + if(d->workflow_mode == 1 || d->workflow_mode == 2) + { + float xyz_local[3]; + xyz_local[0] = 0.636958f * val[0] + 0.144617f * val[1] + 0.168881f * val[2]; + xyz_local[1] = 0.262700f * val[0] + 0.677998f * val[1] + 0.059302f * val[2]; + xyz_local[2] = 0.000000f * val[0] + 0.028073f * val[1] + 1.060985f * val[2]; + for(int i=0;i<3;i++) xyz_local[i] = fmaxf(xyz_local[i], 0.0f); + + float xyz_scaled_local[4]; + xyz_scaled_local[0] = xyz_local[0] * 400.0f; + xyz_scaled_local[1] = xyz_local[1] * 400.0f; + xyz_scaled_local[2] = xyz_local[2] * 400.0f; + xyz_scaled_local[3] = 0.0f; + + float jab_local[4] = {0.0f,0.0f,0.0f,0.0f}; + dt_XYZ_2_JzAzBz(xyz_scaled_local, jab_local); + + const float L = fminf(fmaxf(jab_local[0], 0.0f), 1.0f); + const float alpha = 0.5f; + const float k_scale = 1.0f + alpha * L * L; + + const float x_scaled = y_in / k_scale; + 
if(d->workflow_mode == 1) + y_out = _aces_tone_map(x_scaled) * k_scale; + else + y_out = _aces_20_tonemap(x_scaled * 1.257f) * k_scale; + } + + float gain = y_out / fmaxf(y_in, 1e-6f); + + val[0] *= gain; + val[1] *= gain; + val[2] *= gain; + + const float threshold = 0.80f; + if(y_out > threshold) + { + float factor = (y_out - threshold) / (1.0f - threshold); + factor = CLAMP(factor, 0.0f, 1.0f); + val[0] = val[0] * (1.0f - factor) + y_out * factor; + val[1] = val[1] * (1.0f - factor) + y_out * factor; + val[2] = val[2] * (1.0f - factor) + y_out * factor; + } + + if(d->ucs_saturation_balance != 0.0f || d->gamut_strength > 0.0f || d->highlight_corr != 0.0f) + { + // RGB Rec2020 to XYZ D65 + float xyz[4]; + xyz[0] = 0.636958f * val[0] + 0.144617f * val[1] + 0.168881f * val[2]; + xyz[1] = 0.262700f * val[0] + 0.677998f * val[1] + 0.059302f * val[2]; + xyz[2] = 0.000000f * val[0] + 0.028073f * val[1] + 1.060985f * val[2]; + + for(int i=0; i<3; i++) xyz[i] = fmaxf(xyz[i], 0.0f); + + // XYZ to JzAzBz + float jab[4]; + float xyz_scaled[4]; + for(int i=0; i<3; i++) xyz_scaled[i] = xyz[i] * 400.0f; // Scale to 400 nits for JzAzBz + dt_XYZ_2_JzAzBz(xyz_scaled, jab); + + int modified = 0; + + if(d->ucs_saturation_balance != 0.0f) + { + // Chroma-based modulation for saturation balance + const float r_sat = val[0]; + const float g_sat = val[1]; + const float b_sat = val[2]; + const float chroma = fmaxf(fmaxf(r_sat, g_sat), b_sat) - fminf(fminf(r_sat, g_sat), b_sat); + const float effective_saturation = d->ucs_saturation_balance * fminf(chroma * 2.0f, 1.0f); + + // Apply saturation balance + // Use Rec2020 Luminance Y for mask + const float Y = xyz[1]; + const float L = powf(fmaxf(Y, 0.0f), 0.5f); + const float fulcrum = 0.5f; + const float n = (L - fulcrum) / fulcrum; + const float mask_shadow = 1.0f / (1.0f + expf(n * 4.0f)); + const float sat_factor = 1.0f + effective_saturation * (2.0f * mask_shadow - 1.0f); + jab[1] *= sat_factor; + jab[2] *= sat_factor; + modified = 1; + 
} + + if(d->gamut_strength > 0.0f) + { + const float Y = xyz[1]; + const float L = powf(fmaxf(Y, 0.0f), 0.5f); + const float chroma_factor = 1.0f - d->gamut_strength * (0.2f + 0.2f * L); + jab[1] *= chroma_factor; + jab[2] *= chroma_factor; + modified = 1; + } + + if(d->highlight_corr != 0.0f) + { + // HIGHLIGHT HUE AND SATURATION CORRECTION (sync with OpenCL) + // Mask starts at Jz = 0.20 and is full at Jz = 0.90. Linear transition. + float hl_mask = CLAMP((jab[0] - 0.20f) / 0.70f, 0.0f, 1.0f); + + if(hl_mask > 0.0f) + { + // 1. Soft symmetric desaturation (0.75 factor) + float desat = 1.0f - (fabsf(d->highlight_corr) * hl_mask * 0.75f); + jab[1] *= desat; + jab[2] *= desat; + + // 2. Controlled Hue Rotation (2.0 factor) + float angle = d->highlight_corr * hl_mask * 2.0f; + float ca = cosf(angle); + float sa = sinf(angle); + float az = jab[1]; + float bz = jab[2]; + jab[1] = az * ca - bz * sa; + jab[2] = az * sa + bz * ca; + modified = 1; + } + } + + if(jab[0] > 0.95f) + { + const float desat = CLAMP((1.0f - jab[0]) * 20.0f, 0.0f, 1.0f); + jab[1] *= desat; + jab[2] *= desat; + modified = 1; + } + + if(modified) + { + // JzAzBz to XYZ + dt_JzAzBz_2_XYZ(jab, xyz_scaled); + for(int i=0; i<3; i++) xyz[i] = xyz_scaled[i] / 400.0f; + + // XYZ D65 to RGB Rec2020 + val[0] = 1.716651f * xyz[0] - 0.355671f * xyz[1] - 0.253366f * xyz[2]; + val[1] = -0.666684f * xyz[0] + 1.616481f * xyz[1] + 0.015768f * xyz[2]; + val[2] = 0.017640f * xyz[0] - 0.042771f * xyz[1] + 0.942103f * xyz[2]; + + float min_val = fminf(val[0], fminf(val[1], val[2])); + if(min_val < 0.0f) + { + float lum = 0.2627f * val[0] + 0.6780f * val[1] + 0.0593f * val[2]; + if(lum > 0.0f) + { + float factor = lum / (lum - min_val); + val[0] = lum + factor * (val[0] - lum); + val[1] = lum + factor * (val[1] - lum); + val[2] = lum + factor * (val[2] - lum); + } + } + } + + if(d->gamut_strength > 0.0f) + { + const float orig_r = val[0]; + const float orig_g = val[1]; + const float orig_b = val[2]; + + const float Y = 
0.2126f * orig_r + 0.7152f * orig_g + 0.0722f * orig_b; + float lum_weight = CLAMP((Y - 0.3f) / (0.8f - 0.3f), 0.0f, 1.0f); + lum_weight = lum_weight * lum_weight * (3.0f - 2.0f * lum_weight); + const float effective_strength = d->gamut_strength * lum_weight; + + float limit = 0.90f; + if (d->target_gamut == 1) limit = 0.95f; + else if (d->target_gamut == 2) limit = 1.00f; + + float gamut_threshold = limit * (1.0f - (effective_strength * 0.25f)); + float max_val = fmaxf(val[0], fmaxf(val[1], val[2])); + + if (max_val > gamut_threshold) + { + float range = limit - gamut_threshold; + float delta = max_val - gamut_threshold; + const float compressed = gamut_threshold + range * delta / (delta + range); + const float factor = compressed / max_val; + + float range_blue = 1.1f * range; + const float compressed_blue = gamut_threshold + range * delta / (delta + range_blue); + const float factor_blue = compressed_blue / max_val; + + val[0] *= factor; + val[1] *= factor; + val[2] *= factor_blue; + } + + val[0] = orig_r * (1.0f - effective_strength) + val[0] * effective_strength; + val[1] = orig_g * (1.0f - effective_strength) + val[1] * effective_strength; + val[2] = orig_b * (1.0f - effective_strength) + val[2] * effective_strength; + } + + // Final gamut check to preserve hue (exact color) + if(val[0] < 0.0f || val[0] > 1.0f || val[1] < 0.0f || val[1] > 1.0f || val[2] < 0.0f || val[2] > 1.0f) + { + const float luma = 0.2627f * val[0] + 0.6780f * val[1] + 0.0593f * val[2]; + const float target_luma = CLAMP(luma, 0.0f, 1.0f); + float t = 1.0f; + if (val[0] < 0.0f) t = fminf(t, target_luma / (target_luma - val[0])); + if (val[1] < 0.0f) t = fminf(t, target_luma / (target_luma - val[1])); + if (val[2] < 0.0f) t = fminf(t, target_luma / (target_luma - val[2])); + if (val[0] > 1.0f) t = fminf(t, (1.0f - target_luma) / (val[0] - target_luma)); + if (val[1] > 1.0f) t = fminf(t, (1.0f - target_luma) / (val[1] - target_luma)); + if (val[2] > 1.0f) t = fminf(t, (1.0f - target_luma) / 
(val[2] - target_luma)); + t = fmaxf(0.0f, t); + val[0] = target_luma + t * (val[0] - target_luma); + val[1] = target_luma + t * (val[1] - target_luma); + val[2] = target_luma + t * (val[2] - target_luma); + } + } + } + + for(int i = 0; i < 3; i++) out[k + i] = val[i]; out[k + 3] = in[k + 3]; // pass on 4th channel } @@ -1376,32 +2036,6 @@ void process_fusion(dt_iop_module_t *self, free(comb); } -void process_lut(dt_iop_module_t *self, - dt_dev_pixelpipe_iop_t *piece, - const void *const ivoid, - void *const ovoid, - const dt_iop_roi_t *const roi_in, - const dt_iop_roi_t *const roi_out) -{ - const float *const in = (const float *)ivoid; - float *const out = (float *)ovoid; - //const int ch = piece->colors; <-- it appears someone was trying to make this handle monochrome data, - //however the for loops only handled RGBA - FIXME, determine what possible data formats and channel - //configurations we might encounter here and handle those too - dt_iop_basecurve_data_t *const d = piece->data; - const dt_iop_order_iccprofile_info_t *const work_profile = dt_ioppr_get_iop_work_profile_info(piece->module, piece->module->dev->iop); - - const int wd = roi_in->width, ht = roi_in->height; - - // Compared to previous implementation, we've at least moved this conditional outside of the image processing loops - // so that it is evaluated only once. 
See FIXME comments in apply_curve for more potential performance improvements - if(d->preserve_colors == DT_RGB_NORM_NONE) - apply_legacy_curve(in, out, wd, ht, 1.0, d->table, d->unbounded_coeffs); - else - apply_curve(in, out, wd, ht, d->preserve_colors, 1.0, d->table, d->unbounded_coeffs, work_profile); -} - - void process(dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, @@ -1430,6 +2064,15 @@ void commit_params(dt_iop_module_t *self, d->exposure_stops = p->exposure_stops; d->exposure_bias = p->exposure_bias; d->preserve_colors = p->preserve_colors; + d->workflow_mode = p->workflow_mode; + d->shadow_lift = 2.0f - p->shadow_lift; + d->highlight_gain = p->highlight_gain; + d->ucs_saturation_balance = p->ucs_saturation_balance; + d->gamut_strength = p->gamut_strength; + d->highlight_corr = p->highlight_corr; + d->target_gamut = p->target_gamut; + d->color_look = p->color_look; + d->look_opacity = p->look_opacity; const int ch = 0; // take care of possible change of curve type or number of nodes (not yet implemented in UI) @@ -1484,18 +2127,6 @@ void cleanup_pipe(dt_iop_module_t *self, piece->data = NULL; } -void gui_update(dt_iop_module_t *self) -{ - dt_iop_basecurve_params_t *p = self->params; - dt_iop_basecurve_gui_data_t *g = self->gui_data; - - gtk_widget_set_visible(g->exposure_step, p->exposure_fusion != 0); - gtk_widget_set_visible(g->exposure_bias, p->exposure_fusion != 0); - - // gui curve is read directly from params during expose event. 
- gtk_widget_queue_draw(GTK_WIDGET(g->area)); -} - static float eval_grey(float x) { // "log base" is a combined scaling and offset change so that x->[0,1], with @@ -1509,6 +2140,8 @@ void init(dt_iop_module_t *self) dt_iop_basecurve_params_t *d = self->default_params; d->basecurve[0][1].x = d->basecurve[0][1].y = 1.0; d->basecurve_nodes[0] = 2; + d->shadow_lift = 1.0f; + d->highlight_gain = 1.0f; } void init_global(dt_iop_module_so_t *self) @@ -1604,7 +2237,7 @@ static gboolean dt_iop_basecurve_draw(GtkWidget *widget, cairo_t *crf, dt_iop_mo dt_draw_curve_set_point(g->minmax_curve, k, p->basecurve[0][k].x, p->basecurve[0][k].y); } dt_draw_curve_t *minmax_curve = g->minmax_curve; - dt_draw_curve_calc_values(minmax_curve, 0.0, 1.0, DT_IOP_TONECURVE_RES, g->draw_xs, g->draw_ys); + dt_draw_curve_calc_values(minmax_curve, 0.0, 1.0, DT_IOP_TONECURVE_RES, NULL, g->draw_ys); float unbounded_coeffs[3]; const float xm = basecurve[nodes - 1].x; @@ -1828,7 +2461,7 @@ static gboolean dt_iop_basecurve_motion_notify(GtkWidget *widget, // got a vertex selected: if(g->selected >= 0) { - // this is used to translate mause position in loglogscale to make this behavior unified with linear scale. + // this is used to translate mouse position in loglogscale to make this behavior unified with linear scale. 
const float translate_mouse_x = old_m_x / width - to_log(basecurve[g->selected].x, g->loglogscale); const float translate_mouse_y = 1 - old_m_y / height - to_log(basecurve[g->selected].y, g->loglogscale); // dx & dy are in linear coordinates @@ -1946,12 +2579,22 @@ static gboolean dt_iop_basecurve_button_press(GtkWidget *widget, else if(event->type == GDK_2BUTTON_PRESS) { // reset current curve - p->basecurve_nodes[ch] = d->basecurve_nodes[ch]; - p->basecurve_type[ch] = d->basecurve_type[ch]; - for(int k = 0; k < d->basecurve_nodes[ch]; k++) + if(p->workflow_mode > 0) + { + p->basecurve_nodes[ch] = 2; + p->basecurve_type[ch] = CUBIC_SPLINE; + p->basecurve[ch][0].x = 0.0f; p->basecurve[ch][0].y = 0.0f; + p->basecurve[ch][1].x = 1.0f; p->basecurve[ch][1].y = 1.0f; + } + else { - p->basecurve[ch][k].x = d->basecurve[ch][k].x; - p->basecurve[ch][k].y = d->basecurve[ch][k].y; + p->basecurve_nodes[ch] = d->basecurve_nodes[ch]; + p->basecurve_type[ch] = d->basecurve_type[ch]; + for(int k = 0; k < d->basecurve_nodes[ch]; k++) + { + p->basecurve[ch][k].x = d->basecurve[ch][k].x; + p->basecurve[ch][k].y = d->basecurve[ch][k].y; + } } g->selected = -2; // avoid motion notify re-inserting immediately. 
dt_dev_add_history_item_target(darktable.develop, self, TRUE, widget); @@ -2088,6 +2731,117 @@ void gui_changed(dt_iop_module_t *self, GtkWidget *w, void *previous) gtk_widget_set_visible(g->exposure_bias, FALSE); } } + + if(!w || w == g->workflow_mode || w == g->color_look) + { + if(p->workflow_mode == 1 || p->workflow_mode == 2) + { + gtk_widget_set_visible(g->cmb_preserve_colors, FALSE); + if(p->preserve_colors != DT_RGB_NORM_NONE) + dt_bauhaus_combobox_set(g->cmb_preserve_colors, DT_RGB_NORM_NONE); + gtk_widget_set_visible(g->shadow_lift, TRUE); + gtk_widget_set_visible(g->highlight_gain, TRUE); + gtk_widget_set_visible(g->ucs_saturation_balance, TRUE); + gtk_widget_set_visible(g->gamut_strength, TRUE); + gtk_widget_set_visible(g->highlight_corr, TRUE); + gtk_widget_set_visible(g->target_gamut, TRUE); + gtk_widget_set_visible(g->color_look, TRUE); + gtk_widget_set_visible(g->look_opacity, TRUE); + gtk_widget_set_sensitive(g->shadow_lift, TRUE); + gtk_widget_set_sensitive(g->highlight_gain, TRUE); + gtk_widget_set_sensitive(g->ucs_saturation_balance, TRUE); + gtk_widget_set_sensitive(g->gamut_strength, TRUE); + gtk_widget_set_sensitive(g->highlight_corr, TRUE); + gtk_widget_set_sensitive(g->target_gamut, TRUE); + gtk_widget_set_sensitive(g->color_look, TRUE); + gtk_widget_set_sensitive(g->look_opacity, p->color_look > 0); + if(w == g->color_look) + { + p->look_opacity = 1.0f; + dt_bauhaus_slider_set(g->look_opacity, 1.0f); + } + gtk_widget_set_tooltip_text(g->fusion, _("Exposure fusion operates in linear scene-referred space as a luminance normalization step,\n" + "providing a stable radiometric reference prior to the final tone-mapping curve.\n" + "It does not perform HDR blending nor exposure compensation.")); + if(w == g->workflow_mode) + { + p->shadow_lift = 1.0f; + dt_bauhaus_slider_set(g->shadow_lift, 1.0f); + p->highlight_gain = 1.0f; + dt_bauhaus_slider_set(g->highlight_gain, 1.0f); + p->ucs_saturation_balance = 0.2f; + 
dt_bauhaus_slider_set(g->ucs_saturation_balance, 0.2f); + // Set default color look when switching to this workflow + p->color_look = 1; // Natural look + dt_bauhaus_combobox_set(g->color_look, 1); + p->look_opacity = 1.0f; + dt_bauhaus_slider_set(g->look_opacity, 1.0f); + p->basecurve_type[0] = CUBIC_SPLINE; + p->basecurve_nodes[0] = 2; + p->basecurve[0][0].x = 0.0f; p->basecurve[0][0].y = 0.0f; + p->basecurve[0][1].x = 1.0f; p->basecurve[0][1].y = 1.0f; + + gtk_widget_queue_draw(GTK_WIDGET(g->area)); + } + } + else + { + gtk_widget_set_visible(g->cmb_preserve_colors, TRUE); + gtk_widget_set_visible(g->shadow_lift, FALSE); + gtk_widget_set_visible(g->highlight_gain, FALSE); + gtk_widget_set_visible(g->ucs_saturation_balance, FALSE); + gtk_widget_set_visible(g->gamut_strength, FALSE); + gtk_widget_set_visible(g->highlight_corr, FALSE); + gtk_widget_set_visible(g->target_gamut, FALSE); + gtk_widget_set_visible(g->color_look, FALSE); + gtk_widget_set_visible(g->look_opacity, FALSE); + gtk_widget_set_sensitive(g->shadow_lift, FALSE); + gtk_widget_set_sensitive(g->highlight_gain, FALSE); + gtk_widget_set_sensitive(g->ucs_saturation_balance, FALSE); + gtk_widget_set_sensitive(g->gamut_strength, FALSE); + gtk_widget_set_sensitive(g->highlight_corr, FALSE); + gtk_widget_set_sensitive(g->target_gamut, FALSE); + gtk_widget_set_sensitive(g->color_look, FALSE); + gtk_widget_set_sensitive(g->look_opacity, FALSE); + gtk_widget_set_tooltip_text(g->fusion, _("fuse this image stopped up/down a couple of times with itself, to " + "compress high dynamic range. 
expose for the highlights before use.")); + } + } + + if(!w || w == g->workflow_mode) + { + if(p->workflow_mode != 0) + { + gtk_widget_hide(g->logbase); + } + else + { + gtk_widget_show(g->logbase); + } + } +} + +void gui_update(dt_iop_module_t *self) +{ + dt_iop_basecurve_params_t *p = self->params; + dt_iop_basecurve_gui_data_t *g = self->gui_data; + + gtk_widget_set_visible(g->exposure_step, p->exposure_fusion != 0); + gtk_widget_set_visible(g->exposure_bias, p->exposure_fusion != 0); + + dt_bauhaus_slider_set(g->gamut_strength, p->gamut_strength); + dt_bauhaus_slider_set(g->highlight_corr, p->highlight_corr); + dt_bauhaus_combobox_set(g->target_gamut, p->target_gamut); + dt_bauhaus_combobox_set(g->workflow_mode, p->workflow_mode); + dt_bauhaus_slider_set(g->shadow_lift, p->shadow_lift); + dt_bauhaus_slider_set(g->highlight_gain, p->highlight_gain); + dt_bauhaus_slider_set(g->ucs_saturation_balance, p->ucs_saturation_balance); + dt_bauhaus_combobox_set(g->color_look, p->color_look); + dt_bauhaus_slider_set(g->look_opacity, p->look_opacity); + gui_changed(self, NULL, NULL); + + // gui curve is read directly from params during expose event. + gtk_widget_queue_draw(GTK_WIDGET(g->area)); } static void logbase_callback(GtkWidget *slider, dt_iop_module_t *self) @@ -2111,15 +2865,68 @@ void gui_init(dt_iop_module_t *self) g->selected = -1; g->loglogscale = 0; - g->area = GTK_DRAWING_AREA(dtgtk_drawing_area_new_with_height(0)); + g->area = GTK_DRAWING_AREA(dt_ui_resize_wrap(NULL, DT_PIXEL_APPLY_DPI(100), "plugins/darkroom/basecurve/graph_height")); gtk_widget_set_tooltip_text(GTK_WIDGET(g->area), _("abscissa: input, ordinate: output. 
works on RGB channels")); g_object_set_data(G_OBJECT(g->area), "iop-instance", self); dt_action_define_iop(self, NULL, N_("curve"), GTK_WIDGET(g->area), NULL); - self->widget = dt_gui_vbox(g->area); + self->widget = dt_gui_vbox(GTK_WIDGET(g->area)); g->cmb_preserve_colors = dt_bauhaus_combobox_from_params(self, "preserve_colors"); gtk_widget_set_tooltip_text(g->cmb_preserve_colors, _("method to preserve colors when applying contrast")); + dt_gui_box_add(self->widget, g->cmb_preserve_colors); + + g->workflow_mode = dt_bauhaus_combobox_from_params(self, "workflow_mode"); + dt_bauhaus_combobox_add(g->workflow_mode, _("display")); + dt_bauhaus_combobox_add(g->workflow_mode, _("Kinematics (ACES-like)")); + dt_bauhaus_combobox_add(g->workflow_mode, _("Kinematics (Narkowicz)")); + gtk_widget_set_tooltip_text(g->workflow_mode, _("tone mapping method applied after the curve")); + dt_gui_box_add(self->widget, g->workflow_mode); + + g->color_look = dt_bauhaus_combobox_from_params(self, "color_look"); + dt_bauhaus_widget_set_label(g->color_look, NULL, _("Color Look")); + dt_bauhaus_combobox_add(g->color_look, "Neutral"); + dt_bauhaus_combobox_add(g->color_look, "Natural look"); + dt_bauhaus_combobox_add(g->color_look, "Portrait"); + dt_bauhaus_combobox_add(g->color_look, "Vibrant"); + dt_bauhaus_combobox_add(g->color_look, "Nature"); + dt_bauhaus_combobox_add(g->color_look, "Blue Sky"); + dt_bauhaus_combobox_add(g->color_look, "Soft Warm"); + dt_bauhaus_combobox_add(g->color_look, "Soft"); + dt_bauhaus_combobox_add(g->color_look, "Deep Cool"); + dt_bauhaus_combobox_add(g->color_look, "Authentic Cinema"); + gtk_widget_set_tooltip_text(g->color_look, _("Apply a color style: Neutral (none), Portrait (skin tones), Nature (landscapes), Blue Sky (depth), Soft (organic), or Warm/Cool artistic tints.")); + dt_gui_box_add(self->widget, g->color_look); + + g->look_opacity = dt_bauhaus_slider_from_params(self, "look_opacity"); + dt_bauhaus_widget_set_label(g->look_opacity, NULL, _("Look 
Opacity")); + dt_bauhaus_slider_set_format(g->look_opacity, "%"); + dt_bauhaus_slider_set_factor(g->look_opacity, 100.0); + gtk_widget_set_tooltip_text(g->look_opacity, _("Adjust the strength of the selected color style (10% to 100%).")); + dt_gui_box_add(self->widget, g->look_opacity); + + g->highlight_gain = dt_bauhaus_slider_from_params(self, "highlight_gain"); + dt_bauhaus_widget_set_label(g->highlight_gain, NULL, _("highlight gain")); + gtk_widget_set_tooltip_text(g->highlight_gain, _("Adjusts the gain before tone mapping.\n" + "Higher values push more data into highlights compression.")); + dt_bauhaus_slider_set_soft_range(g->highlight_gain, 0.25, 1.75); + dt_bauhaus_slider_set_format(g->highlight_gain, "%"); + dt_bauhaus_slider_set_factor(g->highlight_gain, 100.0); + dt_bauhaus_slider_set_offset(g->highlight_gain, -100.0); + dt_bauhaus_slider_set_default(g->highlight_gain, 1.0); + dt_gui_box_add(self->widget, g->highlight_gain); + + g->shadow_lift = dt_bauhaus_slider_from_params(self, "shadow_lift"); + dt_bauhaus_widget_set_label(g->shadow_lift, NULL, _("shadow lift")); + gtk_widget_set_tooltip_text(g->shadow_lift, _("Adjusts the shadows brightness.\n" + "Positive values lift shadows,\n" + "while negative values darken them.")); + dt_bauhaus_slider_set_soft_range(g->shadow_lift, 0.25, 1.75); + dt_bauhaus_slider_set_format(g->shadow_lift, "%"); + dt_bauhaus_slider_set_factor(g->shadow_lift, 100.0); + dt_bauhaus_slider_set_offset(g->shadow_lift, -100.0); + dt_bauhaus_slider_set_default(g->shadow_lift, 1.0); + dt_gui_box_add(self->widget, g->shadow_lift); g->fusion = dt_bauhaus_combobox_from_params(self, "exposure_fusion"); dt_bauhaus_combobox_add(g->fusion, _("none")); @@ -2127,12 +2934,15 @@ void gui_init(dt_iop_module_t *self) dt_bauhaus_combobox_add(g->fusion, _("three exposures")); gtk_widget_set_tooltip_text(g->fusion, _("fuse this image stopped up/down a couple of times with itself, to " "compress high dynamic range. 
expose for the highlights before use.")); + gtk_widget_set_margin_bottom(g->fusion, DT_PIXEL_APPLY_DPI(10)); + dt_gui_box_add(self->widget, g->fusion); g->exposure_step = dt_bauhaus_slider_from_params(self, "exposure_stops"); dt_bauhaus_slider_set_digits(g->exposure_step, 3); gtk_widget_set_tooltip_text(g->exposure_step, _("how many stops to shift the individual exposures apart")); gtk_widget_set_no_show_all(g->exposure_step, TRUE); gtk_widget_set_visible(g->exposure_step, p->exposure_fusion != 0 ? TRUE : FALSE); + dt_gui_box_add(self->widget, g->exposure_step); // initially set to 1 (consistency with previous versions), but double-click resets to 0 // to get a quick way to reach 0 with the mouse. @@ -2143,6 +2953,54 @@ void gui_init(dt_iop_module_t *self) "(-1: reduce highlight, +1: reduce shadows)")); gtk_widget_set_no_show_all(g->exposure_bias, TRUE); gtk_widget_set_visible(g->exposure_bias, p->exposure_fusion != 0 ? TRUE : FALSE); + dt_gui_box_add(self->widget, g->exposure_bias); + + g->ucs_saturation_balance = dt_bauhaus_slider_from_params(self, "ucs_saturation_balance"); + dt_bauhaus_widget_set_label(g->ucs_saturation_balance, NULL, _("balance saturation ucs")); + gtk_widget_set_tooltip_text(g->ucs_saturation_balance, + _("Balances saturation between shadows and highlights (JzAzBz space).\n" + " Move right to boost shadow saturation while taming highlights.\n" + " Move left to boost highlight saturation while taming shadows.\n" + " Ideal for making dark colors pop without clipping speculars.")); + dt_bauhaus_slider_set_format(g->ucs_saturation_balance, "%"); + dt_bauhaus_slider_set_factor(g->ucs_saturation_balance, 100.0); + dt_bauhaus_slider_set_soft_range(g->ucs_saturation_balance, -0.75, 0.75); + dt_bauhaus_slider_set_default(g->ucs_saturation_balance, 0.2); + dt_gui_box_add(self->widget, g->ucs_saturation_balance); + + g->highlight_corr = dt_bauhaus_slider_from_params(self, "highlight_corr"); + dt_bauhaus_widget_set_label(g->highlight_corr, NULL, 
_("Highlight Hue/Sat")); + dt_bauhaus_slider_set_format(g->highlight_corr, "%"); + dt_bauhaus_slider_set_factor(g->highlight_corr, 100.0); + dt_bauhaus_slider_set_digits(g->highlight_corr, 1); + dt_bauhaus_slider_set_soft_range(g->highlight_corr, -1.0, 1.0); + dt_bauhaus_slider_set_default(g->highlight_corr, 0.0); + dt_bauhaus_slider_set_step(g->highlight_corr, 0.001); + gtk_widget_set_tooltip_text(g->highlight_corr, _("corrects hue and saturation in highlights to mitigate color shifts\n" + "(e.g. salmon sunsets or magenta blues)")); + dt_gui_box_add(self->widget, g->highlight_corr); + + g->target_gamut = dt_bauhaus_combobox_from_params(self, "target_gamut"); + dt_bauhaus_combobox_add(g->target_gamut, "sRGB (Rec.709)"); + dt_bauhaus_combobox_add(g->target_gamut, "AdobeRGB"); + dt_bauhaus_combobox_add(g->target_gamut, "Rec.2020"); + gtk_widget_set_tooltip_text(g->target_gamut, _("Select the destination color space (sRGB, AdobeRGB,\n" + "or Rec.2020). This sets the legal boundary for color saturation.")); + dt_gui_box_add(self->widget, g->target_gamut); + + g->gamut_strength = dt_bauhaus_slider_from_params(self, "gamut_strength"); + dt_bauhaus_widget_set_label(g->gamut_strength, NULL, _("compression smoothness")); + gtk_widget_set_tooltip_text(g->gamut_strength, + _("Defines how high in the highlights the compression starts.\n" + " Lower values keep more saturation but may clip;\n" + " higher values create a professional roll-off\n" + " in the brightest colors without affecting midtones.")); + dt_bauhaus_slider_set_format(g->gamut_strength, "%"); + dt_bauhaus_slider_set_factor(g->gamut_strength, 100.0); + dt_bauhaus_slider_set_digits(g->gamut_strength, 1); + dt_bauhaus_slider_set_step(g->gamut_strength, 0.001); + dt_bauhaus_slider_set_soft_range(g->gamut_strength, 0.0, 1.0); + dt_gui_box_add(self->widget, g->gamut_strength); g->logbase = dt_bauhaus_slider_new_with_range(self, 0.0f, 40.0f, 0, 0.0f, 2); dt_bauhaus_widget_set_label(g->logbase, NULL, N_("scale for 
graph"));