From c4bc5708909de3e357ff60382da70de323082f48 Mon Sep 17 00:00:00 2001
From: "Sydir, Jerry" <jerry.sydir@intel.com>
Date: Tue, 10 Jun 2025 18:24:22 -0700
Subject: [PATCH 1/3] Modified support for memmove to allow user to select
 whether operations with overlapping buffers are performed entirely on CPU or
 on DSA. Modified support for memmove to disregard memmove operations with
 overlapping buffers because they are not split between CPU and DSA. Modified
 dsa_execute function to not call dsa_wait_and_adjust since it is called when
 the operation wasn't split and the autotune algorithm should not be used.

Signed-off-by: Sydir, Jerry <jerry.sydir@intel.com>
---
 README.md |   1 +
 dto.c     | 179 +++++++++++++++++++++++++++++++++---------------------
 2 files changed, 112 insertions(+), 68 deletions(-)
diff --git a/README.md b/README.md
index c9f3fe0..576f4de 100644
--- a/README.md
+++ b/README.md
@@ -56,6 +56,7 @@ Following environment variables control the behavior of DTO library:
    DTO_DSA_MEMSET=0/1, 1 (default) - DTO uses DSA to process memset, 0 - DTO use system memset
    DTO_DSA_MEMCMP=0/1, 1 (default) - DTO uses DSA to process memcmp, 0 - DTO use system memcmp
    DTO_DSA_CC=0/1, 1 (default) - DTO sets DSA Cache Control flag to 1 if DSA supports cache control, 0 - DTO sets DSA Cache Control flag to 0
+   DTO_OVERLAPPING_MEMMOVE_ACTION=0/1, 0 (default) - DTO submits memmove operations with overlapping buffers entirely to CPU, 1 - DTO submits them entirely to DSA
    DTO_UMWAIT_DELAY=xxxx defines delay for umwait command (check max possible value at: /sys/devices/system/cpu/umwait_control/max_time), default is 100000
 	DTO_LOG_FILE=<dto log file path> Redirect the DTO output to the specified file instead of std output (useful for debugging and statistics collection). file name is suffixed by process pid.
 	DTO_LOG_LEVEL=0/1/2 controls the log level. higher value means more verbose logging (default 0).
diff --git a/dto.c b/dto.c
index b7a3a1c..3c67eef 100644
--- a/dto.c
+++ b/dto.c
@@ -89,6 +89,12 @@ enum numa_aware {
 	NA_LAST_ENTRY
 };
 
+enum overlapping_memmove_actions {
+	OVERLAPPING_CPU = 0,	/* overlapping memmove is performed entirely on CPU (default) */
+	OVERLAPPING_DSA,	/* overlapping memmove is performed entirely on DSA */
+	OVERLAPPING_LAST_ENTRY
+};
+
 static const char * const numa_aware_names[] = {
 	[NA_NONE] = "none",
 	[NA_BUFFER_CENTRIC] = "buffer-centric",
@@ -117,6 +123,8 @@ static uint8_t dto_dsa_cc = 1;
 
 static unsigned long dto_umwait_delay = UMWAIT_DELAY_DEFAULT;
 
+static uint8_t dto_overlapping_memmove_action = OVERLAPPING_CPU;
+
 static uint8_t fork_handler_registered;
 
 enum memop {
@@ -188,7 +196,7 @@ static struct timespec dto_start_time;
 	} while (0)						\
 
 
-#define DTO_COLLECT_STATS_DSA_END(cs, st, et, op, n, tbc, r)				\
+#define DTO_COLLECT_STATS_DSA_END(cs, st, et, op, n, overlap, tbc, r)				\
 	do {										\
 		if (unlikely(cs)) {							\
 			uint64_t t;							\
@@ -196,9 +204,9 @@ static struct timespec dto_start_time;
 			t = (((et.tv_sec*1000000000) + et.tv_nsec) -			\
 					((st.tv_sec*1000000000) + st.tv_nsec));		\
 			if (unlikely(r != SUCCESS))					\
-				update_stats(op, n, tbc, t, DSA_CALL_FAILED, r);	\
+				update_stats(op, n, overlap, tbc, t, DSA_CALL_FAILED, r);	\
 			else								\
-				update_stats(op, n, tbc, t, DSA_CALL_SUCCESS, 0);	\
+				update_stats(op, n, overlap, tbc, t, DSA_CALL_SUCCESS, 0);	\
 		}									\
 	} while (0)									\
 
@@ -209,7 +217,7 @@ static struct timespec dto_start_time;
 			clock_gettime(CLOCK_BOOTTIME, &et);			\
 			t = (((et.tv_sec*1000000000) + et.tv_nsec) -		\
 				((st.tv_sec*1000000000) + st.tv_nsec));		\
-			update_stats(op, orig_n, n, t, STDC_CALL, 0);		\
+			update_stats(op, orig_n, 0, n, t, STDC_CALL, 0);		\
 		}								\
 	} while (0)								\
 
@@ -513,10 +521,7 @@ static __always_inline int dsa_execute(struct dto_wq *wq,
 			ret = 0;
 	}
 	if (!ret) {
-		if (auto_adjust_knobs)
-			dsa_wait_and_adjust(comp);
-		else
-			dsa_wait_no_adjust(comp);
+		dsa_wait_no_adjust(comp);
 
 		if (*comp == DSA_COMP_SUCCESS) {
 			thr_bytes_completed += hw->xfer_size;
@@ -532,9 +537,13 @@ static __always_inline int dsa_execute(struct dto_wq *wq,
 }
 
 #ifdef DTO_STATS_SUPPORT
-static void update_stats(int op, size_t n, size_t bytes_completed,
+static void update_stats(int op, size_t n, uint8_t overlapping, size_t bytes_completed,
 		uint64_t elapsed_ns, int group, int error_code)
 {
+	// dto_memcpymove didn't actually submit the request to DSA, so there is nothing to log. This will be captured by a second call
+	if(op==MEMMOVE && overlapping && dto_overlapping_memmove_action==OVERLAPPING_CPU && group==DSA_CALL_SUCCESS)
+		return;
+
 	int bucket = (n / HIST_BUCKET_SIZE);
 
 	if (bucket >= HIST_NO_BUCKETS)  /* last bucket includes remaining sizes */
@@ -1240,6 +1249,14 @@ static int init_dto(void)
 			dto_dsa_memcmp = !!dto_dsa_memcmp;
 		}
 
+		env_str = getenv("DTO_OVERLAPPING_MEMMOVE_ACTION");
+		if (env_str != NULL) {
+			errno = 0;
+			dto_overlapping_memmove_action = strtoul(env_str, NULL, 10);
+			if (errno)
+				dto_overlapping_memmove_action = OVERLAPPING_CPU;
+		}
+
 #ifdef DTO_STATS_SUPPORT
 		env_str = getenv("DTO_COLLECT_STATS");
 		if (env_str != NULL) {
@@ -1484,10 +1501,11 @@ static bool is_overlapping_buffers (void *dest, const void *src, size_t n)
 	return true;
 }
 
-static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, int *result)
+static uint8_t dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, int *result)
 {
 	struct dto_wq *wq = get_wq(dest);
 	size_t cpu_size, dsa_size;
+	uint8_t is_overlapping = 0;
 
 	thr_desc.opcode = DSA_OPCODE_MEMMOVE;
 	thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR;
@@ -1496,75 +1514,99 @@ static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy
 	thr_desc.completion_addr = (uint64_t)&thr_comp;
 
 	/* cpu_size_fraction guaranteed to be >= 0 and < 1 */
-	if (!is_memcpy && is_overlapping_buffers(dest, src, n))
+	if (!is_memcpy && is_overlapping_buffers(dest, src, n)) {
 		cpu_size = 0;
-	else
+		is_overlapping = 1;
+	} else {
 		cpu_size = n * cpu_size_fraction / 100;
+	}
 
 	dsa_size = n - cpu_size;
 
 	thr_bytes_completed = 0;
 
-	if (dsa_size <= wq->max_transfer_size) {
-		thr_desc.src_addr = (uint64_t) src + cpu_size;
-		thr_desc.dst_addr = (uint64_t) dest + cpu_size;
-		thr_desc.xfer_size = (uint32_t) dsa_size;
-		thr_comp.status = 0;
-		*result = dsa_submit(wq, &thr_desc);
-		if (*result == SUCCESS) {
-			if (cpu_size) {
-				if (is_memcpy)
-					orig_memcpy(dest, src, cpu_size);
-				else
-					orig_memmove(dest, src, cpu_size);
-				thr_bytes_completed += cpu_size;
-			}
-			*result = dsa_wait(wq, &thr_desc, &thr_comp.status);
-		}
+	// If this is an overlapping memmove and the action is to perform on CPU, return having done nothing and
+	// memmove will perform the copy and correctly attribute statistics to stdlib call group
+	if (is_overlapping && dto_overlapping_memmove_action == OVERLAPPING_CPU) {
+		thr_bytes_completed = 0;
+		*result = SUCCESS;
 	} else {
-		uint32_t threshold;
-		size_t current_cpu_size_fraction = cpu_size_fraction;  // the cpu_size_fraction might be changed by the auto tune algorithm 
-		threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction);
-		do {
-			size_t len;
 
-			len = n <= threshold ? n : threshold;
-
-			if (!is_memcpy && is_overlapping_buffers(dest, src, len))
+		if (dsa_size <= wq->max_transfer_size) {
+			thr_desc.src_addr = (uint64_t) src + cpu_size;
+			thr_desc.dst_addr = (uint64_t) dest + cpu_size;
+			thr_desc.xfer_size = (uint32_t) dsa_size;
+			thr_comp.status = 0;
+			if (is_overlapping){
+				*result = dsa_execute(wq, &thr_desc, &thr_comp.status);
+			} else {
+				*result = dsa_submit(wq, &thr_desc);
+				if (*result == SUCCESS) {
+					if (cpu_size) {
+						if (is_memcpy)
+							orig_memcpy(dest, src, cpu_size);
+						else
+							orig_memmove(dest, src, cpu_size);
+						thr_bytes_completed += cpu_size;
+					}
+					*result = dsa_wait(wq, &thr_desc, &thr_comp.status);
+				}
+			}
+		} else {
+			uint32_t threshold;
+			size_t current_cpu_size_fraction = cpu_size_fraction;  // the cpu_size_fraction might be changed by the auto tune algorithm 
+			if (is_overlapping) {
+				threshold = wq->max_transfer_size;
 				cpu_size = 0;
-			else
-				cpu_size = len * current_cpu_size_fraction / 100;
+			} else {
+				threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction);
+			}
 
-			dsa_size = len - cpu_size;
+			do {
+				size_t len;
 
-			thr_desc.src_addr = (uint64_t) src + cpu_size + thr_bytes_completed;
-			thr_desc.dst_addr = (uint64_t) dest + cpu_size + thr_bytes_completed;
-			thr_desc.xfer_size = (uint32_t) dsa_size;
-			thr_comp.status = 0;
-			*result = dsa_submit(wq, &thr_desc);
-			if (*result == SUCCESS) {
-				if (cpu_size) {
-					const void *src1 = src + thr_bytes_completed;
-					void *dest1 = dest + thr_bytes_completed;
+				len = n <= threshold ? n : threshold;
 
-					if (is_memcpy)
-						orig_memcpy(dest1, src1, cpu_size);
-					else
-						orig_memmove(dest1, src1, cpu_size);
-					thr_bytes_completed += cpu_size;
+				if (!is_overlapping)
+					cpu_size = len * current_cpu_size_fraction / 100;
+
+				dsa_size = len - cpu_size;
+
+				thr_desc.src_addr = (uint64_t) src + cpu_size + thr_bytes_completed;
+				thr_desc.dst_addr = (uint64_t) dest + cpu_size + thr_bytes_completed;
+				thr_desc.xfer_size = (uint32_t) dsa_size;
+				thr_comp.status = 0;
+				if (is_overlapping){
+					*result = dsa_execute(wq, &thr_desc, &thr_comp.status);
+				} else {
+					*result = dsa_submit(wq, &thr_desc);
+					if (*result == SUCCESS) {
+						if (cpu_size) {
+							const void *src1 = src + thr_bytes_completed;
+							void *dest1 = dest + thr_bytes_completed;
+
+							if (is_memcpy)
+								orig_memcpy(dest1, src1, cpu_size);
+							else
+								orig_memmove(dest1, src1, cpu_size);
+							thr_bytes_completed += cpu_size;
+						}
+						*result = dsa_wait(wq, &thr_desc, &thr_comp.status);
+					}
 				}
-				*result = dsa_wait(wq, &thr_desc, &thr_comp.status);
-			}
 
-			if (*result != SUCCESS)
-				break;
-			n -= len;
-			/* If remaining bytes are less than dsa_min_size,
-			 * dont submit to DSA. Instead, complete remaining
-			 * bytes on CPU
-			 */
-		} while (n >= dsa_min_size);
+				if (*result != SUCCESS)
+					break;
+				n -= len;
+				/* If remaining bytes are less than dsa_min_size,
+				* dont submit to DSA. Instead, complete remaining
+				* bytes on CPU
+				*/
+			} while (n >= dsa_min_size);
+		}
 	}
+
+	return is_overlapping;
 }
 
 static int dto_memcmp(const void *s1, const void *s2, size_t n, int *result)
@@ -1694,7 +1736,7 @@ void *memset(void *s1, int c, size_t n)
 		dto_memset(s1, c, n, &result);
 
 #ifdef DTO_STATS_SUPPORT
-		DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMSET, n, thr_bytes_completed, result);
+		DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMSET, n, 0, thr_bytes_completed, result);
 #endif
 		if (thr_bytes_completed != n) {
 			/* fallback to std call if job is only partially completed */
@@ -1744,7 +1786,7 @@ void *memcpy(void *dest, const void *src, size_t n)
 		dto_memcpymove(dest, src, n, 1, &result);
 
 #ifdef DTO_STATS_SUPPORT
-		DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY, n, thr_bytes_completed, result);
+		DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY, n, 0, thr_bytes_completed, result);
 #endif
 		if (thr_bytes_completed != n) {
 			/* fallback to std call if job is only partially completed */
@@ -1776,6 +1818,7 @@ void *memmove(void *dest, const void *src, size_t n)
 	int result = 0;
 	void *ret = dest;
 	int use_orig_func = USE_ORIG_FUNC(n, dto_dsa_memmove);
+	uint8_t is_overlapping;
 #ifdef DTO_STATS_SUPPORT
 	struct timespec st, et;
 	size_t orig_n = n;
@@ -1794,10 +1837,10 @@ void *memmove(void *dest, const void *src, size_t n)
 #ifdef DTO_STATS_SUPPORT
 		DTO_COLLECT_STATS_START(collect_stats, st);
 #endif
-		dto_memcpymove(dest, src, n, 0, &result);
+		is_overlapping = dto_memcpymove(dest, src, n, 0, &result);
 
 #ifdef DTO_STATS_SUPPORT
-		DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMMOVE, n, thr_bytes_completed, result);
+		DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMMOVE, n, is_overlapping, thr_bytes_completed, result);
 #endif
 		if (thr_bytes_completed != n) {
 			/* fallback to std call if job is only partially completed */
@@ -1850,7 +1893,7 @@ int memcmp(const void *s1, const void *s2, size_t n)
 		ret = dto_memcmp(s1, s2, n, &result);
 
 #ifdef DTO_STATS_SUPPORT
-		DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCMP, n, thr_bytes_completed, result);
+		DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCMP, n, 0, thr_bytes_completed, result);
 #endif
 		if (thr_bytes_completed != n) {
 			/* fallback to std call if job is only partially completed */

From ab82ecf96765441ad230015718d60ff4441702f3 Mon Sep 17 00:00:00 2001
From: "Sydir, Jerry" <jerry.sydir@intel.com>
Date: Thu, 12 Jun 2025 15:33:49 -0700
Subject: [PATCH 2/3] Modified flow in dto_memcpymove function to avoid calling
 get_wq in the case where the overlapping memmove is performed entirely on CPU

Signed-off-by: Sydir, Jerry <jerry.sydir@intel.com>
---
 dto.c | 158 +++++++++++++++++++++++++++++-----------------------------
 1 file changed, 80 insertions(+), 78 deletions(-)

diff --git a/dto.c b/dto.c
index 3c67eef..ff4db9a 100644
--- a/dto.c
+++ b/dto.c
@@ -217,7 +217,7 @@ static struct timespec dto_start_time;
 			clock_gettime(CLOCK_BOOTTIME, &et);			\
 			t = (((et.tv_sec*1000000000) + et.tv_nsec) -		\
 				((st.tv_sec*1000000000) + st.tv_nsec));		\
-			update_stats(op, orig_n, 0, n, t, STDC_CALL, 0);		\
+			update_stats(op, orig_n, false, n, t, STDC_CALL, 0);		\
 		}								\
 	} while (0)								\
 
@@ -537,12 +537,13 @@ static __always_inline int dsa_execute(struct dto_wq *wq,
 }
 
 #ifdef DTO_STATS_SUPPORT
-static void update_stats(int op, size_t n, uint8_t overlapping, size_t bytes_completed,
+static void update_stats(int op, size_t n, bool overlapping, size_t bytes_completed,
 		uint64_t elapsed_ns, int group, int error_code)
 {
 	// dto_memcpymove didn't actually submit the request to DSA, so there is nothing to log. This will be captured by a second call
-	if(op==MEMMOVE && overlapping && dto_overlapping_memmove_action==OVERLAPPING_CPU && group==DSA_CALL_SUCCESS)
+	if (op == MEMMOVE && overlapping && dto_overlapping_memmove_action == OVERLAPPING_CPU && group == DSA_CALL_SUCCESS) {
 		return;
+	}
 
 	int bucket = (n / HIST_BUCKET_SIZE);
 
@@ -1501,40 +1502,81 @@ static bool is_overlapping_buffers (void *dest, const void *src, size_t n)
 	return true;
 }
 
-static uint8_t dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, int *result)
+static bool dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, int *result)
 {
-	struct dto_wq *wq = get_wq(dest);
+	struct dto_wq *wq;
 	size_t cpu_size, dsa_size;
-	uint8_t is_overlapping = 0;
+	bool is_overlapping;
 
-	thr_desc.opcode = DSA_OPCODE_MEMMOVE;
-	thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR;
-	if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY))
-		thr_desc.flags |= IDXD_OP_FLAG_CC;
-	thr_desc.completion_addr = (uint64_t)&thr_comp;
+	thr_bytes_completed = 0;
 
-	/* cpu_size_fraction guaranteed to be >= 0 and < 1 */
 	if (!is_memcpy && is_overlapping_buffers(dest, src, n)) {
 		cpu_size = 0;
-		is_overlapping = 1;
+		is_overlapping = true;
 	} else {
+		/* cpu_size_fraction is a percentage, guaranteed to be >= 0 and < 100 */
 		cpu_size = n * cpu_size_fraction / 100;
+		is_overlapping = false;
 	}
 
-	dsa_size = n - cpu_size;
-
-	thr_bytes_completed = 0;
-
 	// If this is an overlapping memmove and the action is to perform on CPU, return having done nothing and
 	// memmove will perform the copy and correctly attribute statistics to stdlib call group
 	if (is_overlapping && dto_overlapping_memmove_action == OVERLAPPING_CPU) {
-		thr_bytes_completed = 0;
 		*result = SUCCESS;
+		return true;
+	}
+
+	dsa_size = n - cpu_size;
+	wq = get_wq(dest);
+
+	thr_desc.opcode = DSA_OPCODE_MEMMOVE;
+	thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR;
+	if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY))
+		thr_desc.flags |= IDXD_OP_FLAG_CC;
+	thr_desc.completion_addr = (uint64_t)&thr_comp;
+
+	if (dsa_size <= wq->max_transfer_size) {
+		thr_desc.src_addr = (uint64_t) src + cpu_size;
+		thr_desc.dst_addr = (uint64_t) dest + cpu_size;
+		thr_desc.xfer_size = (uint32_t) dsa_size;
+		thr_comp.status = 0;
+		if (is_overlapping) {
+			*result = dsa_execute(wq, &thr_desc, &thr_comp.status);
+		} else {
+			*result = dsa_submit(wq, &thr_desc);
+			if (*result == SUCCESS) {
+				if (cpu_size) {
+					if (is_memcpy)
+						orig_memcpy(dest, src, cpu_size);
+					else
+						orig_memmove(dest, src, cpu_size);
+					thr_bytes_completed += cpu_size;
+				}
+				*result = dsa_wait(wq, &thr_desc, &thr_comp.status);
+			}
+		}
 	} else {
+		uint32_t threshold;
+		size_t current_cpu_size_fraction = cpu_size_fraction;  // the cpu_size_fraction might be changed by the auto tune algorithm 
+		if (is_overlapping) {
+			threshold = wq->max_transfer_size;
+			cpu_size = 0;
+		} else {
+			threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction);
+		}
+
+		do {
+			size_t len;
 
-		if (dsa_size <= wq->max_transfer_size) {
-			thr_desc.src_addr = (uint64_t) src + cpu_size;
-			thr_desc.dst_addr = (uint64_t) dest + cpu_size;
+			len = n <= threshold ? n : threshold;
+
+			if (!is_overlapping)
+				cpu_size = len * current_cpu_size_fraction / 100;
+
+			dsa_size = len - cpu_size;
+
+			thr_desc.src_addr = (uint64_t) src + cpu_size + thr_bytes_completed;
+			thr_desc.dst_addr = (uint64_t) dest + cpu_size + thr_bytes_completed;
 			thr_desc.xfer_size = (uint32_t) dsa_size;
 			thr_comp.status = 0;
 			if (is_overlapping){
@@ -1543,67 +1585,27 @@ static uint8_t dto_memcpymove(void *dest, const void *src, size_t n, bool is_mem
 				*result = dsa_submit(wq, &thr_desc);
 				if (*result == SUCCESS) {
 					if (cpu_size) {
+						const void *src1 = src + thr_bytes_completed;
+						void *dest1 = dest + thr_bytes_completed;
+
 						if (is_memcpy)
-							orig_memcpy(dest, src, cpu_size);
+							orig_memcpy(dest1, src1, cpu_size);
 						else
-							orig_memmove(dest, src, cpu_size);
+							orig_memmove(dest1, src1, cpu_size);
 						thr_bytes_completed += cpu_size;
 					}
 					*result = dsa_wait(wq, &thr_desc, &thr_comp.status);
 				}
 			}
-		} else {
-			uint32_t threshold;
-			size_t current_cpu_size_fraction = cpu_size_fraction;  // the cpu_size_fraction might be changed by the auto tune algorithm 
-			if (is_overlapping) {
-				threshold = wq->max_transfer_size;
-				cpu_size = 0;
-			} else {
-				threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction);
-			}
-
-			do {
-				size_t len;
-
-				len = n <= threshold ? n : threshold;
-
-				if (!is_overlapping)
-					cpu_size = len * current_cpu_size_fraction / 100;
-
-				dsa_size = len - cpu_size;
-
-				thr_desc.src_addr = (uint64_t) src + cpu_size + thr_bytes_completed;
-				thr_desc.dst_addr = (uint64_t) dest + cpu_size + thr_bytes_completed;
-				thr_desc.xfer_size = (uint32_t) dsa_size;
-				thr_comp.status = 0;
-				if (is_overlapping){
-					*result = dsa_execute(wq, &thr_desc, &thr_comp.status);
-				} else {
-					*result = dsa_submit(wq, &thr_desc);
-					if (*result == SUCCESS) {
-						if (cpu_size) {
-							const void *src1 = src + thr_bytes_completed;
-							void *dest1 = dest + thr_bytes_completed;
-
-							if (is_memcpy)
-								orig_memcpy(dest1, src1, cpu_size);
-							else
-								orig_memmove(dest1, src1, cpu_size);
-							thr_bytes_completed += cpu_size;
-						}
-						*result = dsa_wait(wq, &thr_desc, &thr_comp.status);
-					}
-				}
 
-				if (*result != SUCCESS)
-					break;
-				n -= len;
-				/* If remaining bytes are less than dsa_min_size,
-				* dont submit to DSA. Instead, complete remaining
-				* bytes on CPU
-				*/
-			} while (n >= dsa_min_size);
-		}
+			if (*result != SUCCESS)
+				break;
+			n -= len;
+			/* If remaining bytes are less than dsa_min_size,
+			* dont submit to DSA. Instead, complete remaining
+			* bytes on CPU
+			*/
+		} while (n >= dsa_min_size);
 	}
 
 	return is_overlapping;
@@ -1736,7 +1738,7 @@ void *memset(void *s1, int c, size_t n)
 		dto_memset(s1, c, n, &result);
 
 #ifdef DTO_STATS_SUPPORT
-		DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMSET, n, 0, thr_bytes_completed, result);
+		DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMSET, n, false, thr_bytes_completed, result);
 #endif
 		if (thr_bytes_completed != n) {
 			/* fallback to std call if job is only partially completed */
@@ -1786,7 +1788,7 @@ void *memcpy(void *dest, const void *src, size_t n)
 		dto_memcpymove(dest, src, n, 1, &result);
 
 #ifdef DTO_STATS_SUPPORT
-		DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY, n, 0, thr_bytes_completed, result);
+		DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY, n, false, thr_bytes_completed, result);
 #endif
 		if (thr_bytes_completed != n) {
 			/* fallback to std call if job is only partially completed */
@@ -1818,7 +1820,7 @@ void *memmove(void *dest, const void *src, size_t n)
 	int result = 0;
 	void *ret = dest;
 	int use_orig_func = USE_ORIG_FUNC(n, dto_dsa_memmove);
-	uint8_t is_overlapping;
+	bool is_overlapping;
 #ifdef DTO_STATS_SUPPORT
 	struct timespec st, et;
 	size_t orig_n = n;
@@ -1893,7 +1895,7 @@ int memcmp(const void *s1, const void *s2, size_t n)
 		ret = dto_memcmp(s1, s2, n, &result);
 
 #ifdef DTO_STATS_SUPPORT
-		DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCMP, n, 0, thr_bytes_completed, result);
+		DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCMP, n, false, thr_bytes_completed, result);
 #endif
 		if (thr_bytes_completed != n) {
 			/* fallback to std call if job is only partially completed */

From a9c8b75a3bfab5ed005b818586837d9029276cee Mon Sep 17 00:00:00 2001
From: "Sydir, Jerry" <jerry.sydir@intel.com>
Date: Mon, 16 Jun 2025 13:50:17 -0700
Subject: [PATCH 3/3] When a memmove operation is overlapping, we set the cpu
 fraction to 0. This is done at the top of the dto_memcpymove function. This
 was done a second time within the loop. This commit removes that second
 instance.

Signed-off-by: Sydir, Jerry <jerry.sydir@intel.com>
---
 dto.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/dto.c b/dto.c
index ff4db9a..a7fd50c 100644
--- a/dto.c
+++ b/dto.c
@@ -1560,7 +1560,6 @@ static bool dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy
 		size_t current_cpu_size_fraction = cpu_size_fraction;  // the cpu_size_fraction might be changed by the auto tune algorithm 
 		if (is_overlapping) {
 			threshold = wq->max_transfer_size;
-			cpu_size = 0;
 		} else {
 			threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction);
 		}