diff --git a/README.md b/README.md index c9f3fe0..576f4de 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,7 @@ Following environment variables control the behavior of DTO library: DTO_DSA_MEMSET=0/1, 1 (default) - DTO uses DSA to process memset, 0 - DTO use system memset DTO_DSA_MEMCMP=0/1, 1 (default) - DTO uses DSA to process memcmp, 0 - DTO use system memcmp DTO_DSA_CC=0/1, 1 (default) - DTO sets DSA Cache Control flag to 1 if DSA supports cache control, 0 - DTO sets DSA Cache Control flag to 0 + DTO_OVERLAPPING_MEMMOVE_ACTION=0/1, 0 (default) - DTO submits memmove operations with overlapping buffers entirely to CPU, 1 - DTO submits them entirely to DSA DTO_UMWAIT_DELAY=xxxx defines delay for umwait command (check max possible value at: /sys/devices/system/cpu/umwait_control/max_time), default is 100000 DTO_LOG_FILE= Redirect the DTO output to the specified file instead of std output (useful for debugging and statistics collection). file name is suffixed by process pid. DTO_LOG_LEVEL=0/1/2 controls the log level. higher value means more verbose logging (default 0). 
diff --git a/dto.c b/dto.c index b7a3a1c..a7fd50c 100644 --- a/dto.c +++ b/dto.c @@ -89,6 +89,12 @@ enum numa_aware { NA_LAST_ENTRY }; +enum overlapping_memmove_actions { + OVERLAPPING_CPU = 0, + OVERLAPPING_DSA, + OVERLAPPING_LAST_ENTRY +}; + static const char * const numa_aware_names[] = { [NA_NONE] = "none", [NA_BUFFER_CENTRIC] = "buffer-centric", @@ -117,6 +123,8 @@ static uint8_t dto_dsa_cc = 1; static unsigned long dto_umwait_delay = UMWAIT_DELAY_DEFAULT; +static uint8_t dto_overlapping_memmove_action = OVERLAPPING_CPU; + static uint8_t fork_handler_registered; enum memop { @@ -188,7 +196,7 @@ static struct timespec dto_start_time; } while (0) \ -#define DTO_COLLECT_STATS_DSA_END(cs, st, et, op, n, tbc, r) \ +#define DTO_COLLECT_STATS_DSA_END(cs, st, et, op, n, overlap, tbc, r) \ do { \ if (unlikely(cs)) { \ uint64_t t; \ @@ -196,9 +204,9 @@ static struct timespec dto_start_time; t = (((et.tv_sec*1000000000) + et.tv_nsec) - \ ((st.tv_sec*1000000000) + st.tv_nsec)); \ if (unlikely(r != SUCCESS)) \ - update_stats(op, n, tbc, t, DSA_CALL_FAILED, r); \ + update_stats(op, n, overlap, tbc, t, DSA_CALL_FAILED, r); \ else \ - update_stats(op, n, tbc, t, DSA_CALL_SUCCESS, 0); \ + update_stats(op, n, overlap, tbc, t, DSA_CALL_SUCCESS, 0); \ } \ } while (0) \ @@ -209,7 +217,7 @@ static struct timespec dto_start_time; clock_gettime(CLOCK_BOOTTIME, &et); \ t = (((et.tv_sec*1000000000) + et.tv_nsec) - \ ((st.tv_sec*1000000000) + st.tv_nsec)); \ - update_stats(op, orig_n, n, t, STDC_CALL, 0); \ + update_stats(op, orig_n, false, n, t, STDC_CALL, 0); \ } \ } while (0) \ @@ -513,10 +521,7 @@ static __always_inline int dsa_execute(struct dto_wq *wq, ret = 0; } if (!ret) { - if (auto_adjust_knobs) - dsa_wait_and_adjust(comp); - else - dsa_wait_no_adjust(comp); + dsa_wait_no_adjust(comp); if (*comp == DSA_COMP_SUCCESS) { thr_bytes_completed += hw->xfer_size; @@ -532,9 +537,14 @@ static __always_inline int dsa_execute(struct dto_wq *wq, } #ifdef DTO_STATS_SUPPORT -static void 
update_stats(int op, size_t n, size_t bytes_completed, +static void update_stats(int op, size_t n, bool overlapping, size_t bytes_completed, uint64_t elapsed_ns, int group, int error_code) { + // dto_memcpymove didn't actually submit the request to DSA, so there is nothing to log. This will be captured by a second call + if (op == MEMMOVE && overlapping && dto_overlapping_memmove_action == OVERLAPPING_CPU && group == DSA_CALL_SUCCESS) { + return; + } + int bucket = (n / HIST_BUCKET_SIZE); if (bucket >= HIST_NO_BUCKETS) /* last bucket includes remaining sizes */ @@ -1240,6 +1250,14 @@ static int init_dto(void) dto_dsa_memcmp = !!dto_dsa_memcmp; } + env_str = getenv("DTO_OVERLAPPING_MEMMOVE_ACTION"); + if (env_str != NULL) { + errno = 0; + dto_overlapping_memmove_action = strtoul(env_str, NULL, 10); + if (errno) + dto_overlapping_memmove_action = OVERLAPPING_CPU; + } + #ifdef DTO_STATS_SUPPORT env_str = getenv("DTO_COLLECT_STATS"); if (env_str != NULL) { @@ -1484,55 +1502,74 @@ static bool is_overlapping_buffers (void *dest, const void *src, size_t n) return true; } -static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, int *result) +static bool dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, int *result) { - struct dto_wq *wq = get_wq(dest); + struct dto_wq *wq; size_t cpu_size, dsa_size; + bool is_overlapping; - thr_desc.opcode = DSA_OPCODE_MEMMOVE; - thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR; - if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY)) - thr_desc.flags |= IDXD_OP_FLAG_CC; - thr_desc.completion_addr = (uint64_t)&thr_comp; + thr_bytes_completed = 0; - /* cpu_size_fraction guaranteed to be >= 0 and < 1 */ - if (!is_memcpy && is_overlapping_buffers(dest, src, n)) + if (!is_memcpy && is_overlapping_buffers(dest, src, n)) { cpu_size = 0; - else + is_overlapping = true; + } else { + /* cpu_size_fraction guaranteed to be >= 0 and < 1 */ cpu_size = n * cpu_size_fraction / 100; + is_overlapping = 
false; + } + + // If this is an overlapping memmove and the action is to perform on CPU, return having done nothing and + // memmove will perform the copy and correctly attribute statistics to stdlib call group + if (is_overlapping && dto_overlapping_memmove_action == OVERLAPPING_CPU) { + *result = SUCCESS; + return true; + } dsa_size = n - cpu_size; + wq = get_wq(dest); - thr_bytes_completed = 0; + thr_desc.opcode = DSA_OPCODE_MEMMOVE; + thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR; + if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY)) + thr_desc.flags |= IDXD_OP_FLAG_CC; + thr_desc.completion_addr = (uint64_t)&thr_comp; if (dsa_size <= wq->max_transfer_size) { thr_desc.src_addr = (uint64_t) src + cpu_size; thr_desc.dst_addr = (uint64_t) dest + cpu_size; thr_desc.xfer_size = (uint32_t) dsa_size; thr_comp.status = 0; - *result = dsa_submit(wq, &thr_desc); - if (*result == SUCCESS) { - if (cpu_size) { - if (is_memcpy) - orig_memcpy(dest, src, cpu_size); - else - orig_memmove(dest, src, cpu_size); - thr_bytes_completed += cpu_size; + if (is_overlapping) { + *result = dsa_execute(wq, &thr_desc, &thr_comp.status); + } else { + *result = dsa_submit(wq, &thr_desc); + if (*result == SUCCESS) { + if (cpu_size) { + if (is_memcpy) + orig_memcpy(dest, src, cpu_size); + else + orig_memmove(dest, src, cpu_size); + thr_bytes_completed += cpu_size; + } + *result = dsa_wait(wq, &thr_desc, &thr_comp.status); } - *result = dsa_wait(wq, &thr_desc, &thr_comp.status); } } else { uint32_t threshold; size_t current_cpu_size_fraction = cpu_size_fraction; // the cpu_size_fraction might be changed by the auto tune algorithm - threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction); + if (is_overlapping) { + threshold = wq->max_transfer_size; + } else { + threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction); + } + do { size_t len; len = n <= threshold ? 
n : threshold; - if (!is_memcpy && is_overlapping_buffers(dest, src, len)) - cpu_size = 0; - else + if (!is_overlapping) cpu_size = len * current_cpu_size_fraction / 100; dsa_size = len - cpu_size; @@ -1541,30 +1578,36 @@ static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy thr_desc.dst_addr = (uint64_t) dest + cpu_size + thr_bytes_completed; thr_desc.xfer_size = (uint32_t) dsa_size; thr_comp.status = 0; - *result = dsa_submit(wq, &thr_desc); - if (*result == SUCCESS) { - if (cpu_size) { - const void *src1 = src + thr_bytes_completed; - void *dest1 = dest + thr_bytes_completed; - - if (is_memcpy) - orig_memcpy(dest1, src1, cpu_size); - else - orig_memmove(dest1, src1, cpu_size); - thr_bytes_completed += cpu_size; + if (is_overlapping){ + *result = dsa_execute(wq, &thr_desc, &thr_comp.status); + } else { + *result = dsa_submit(wq, &thr_desc); + if (*result == SUCCESS) { + if (cpu_size) { + const void *src1 = src + thr_bytes_completed; + void *dest1 = dest + thr_bytes_completed; + + if (is_memcpy) + orig_memcpy(dest1, src1, cpu_size); + else + orig_memmove(dest1, src1, cpu_size); + thr_bytes_completed += cpu_size; + } + *result = dsa_wait(wq, &thr_desc, &thr_comp.status); } - *result = dsa_wait(wq, &thr_desc, &thr_comp.status); } if (*result != SUCCESS) break; n -= len; /* If remaining bytes are less than dsa_min_size, - * dont submit to DSA. Instead, complete remaining - * bytes on CPU - */ + * dont submit to DSA. 
Instead, complete remaining + * bytes on CPU + */ } while (n >= dsa_min_size); } + + return is_overlapping; } static int dto_memcmp(const void *s1, const void *s2, size_t n, int *result) @@ -1694,7 +1737,7 @@ void *memset(void *s1, int c, size_t n) dto_memset(s1, c, n, &result); #ifdef DTO_STATS_SUPPORT - DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMSET, n, thr_bytes_completed, result); + DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMSET, n, false, thr_bytes_completed, result); #endif if (thr_bytes_completed != n) { /* fallback to std call if job is only partially completed */ @@ -1744,7 +1787,7 @@ void *memcpy(void *dest, const void *src, size_t n) dto_memcpymove(dest, src, n, 1, &result); #ifdef DTO_STATS_SUPPORT - DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY, n, thr_bytes_completed, result); + DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY, n, false, thr_bytes_completed, result); #endif if (thr_bytes_completed != n) { /* fallback to std call if job is only partially completed */ @@ -1776,6 +1819,7 @@ void *memmove(void *dest, const void *src, size_t n) int result = 0; void *ret = dest; int use_orig_func = USE_ORIG_FUNC(n, dto_dsa_memmove); + bool is_overlapping; #ifdef DTO_STATS_SUPPORT struct timespec st, et; size_t orig_n = n; @@ -1794,10 +1838,10 @@ void *memmove(void *dest, const void *src, size_t n) #ifdef DTO_STATS_SUPPORT DTO_COLLECT_STATS_START(collect_stats, st); #endif - dto_memcpymove(dest, src, n, 0, &result); + is_overlapping = dto_memcpymove(dest, src, n, 0, &result); #ifdef DTO_STATS_SUPPORT - DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMMOVE, n, thr_bytes_completed, result); + DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMMOVE, n, is_overlapping, thr_bytes_completed, result); #endif if (thr_bytes_completed != n) { /* fallback to std call if job is only partially completed */ @@ -1850,7 +1894,7 @@ int memcmp(const void *s1, const void *s2, size_t n) ret = dto_memcmp(s1, s2, n, &result); #ifdef 
DTO_STATS_SUPPORT - DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCMP, n, thr_bytes_completed, result); + DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCMP, n, false, thr_bytes_completed, result); #endif if (thr_bytes_completed != n) { /* fallback to std call if job is only partially completed */