Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ Following environment variables control the behavior of DTO library:
DTO_DSA_MEMSET=0/1, 1 (default) - DTO uses DSA to process memset, 0 - DTO uses the system memset
DTO_DSA_MEMCMP=0/1, 1 (default) - DTO uses DSA to process memcmp, 0 - DTO uses the system memcmp
DTO_DSA_CC=0/1, 1 (default) - DTO sets DSA Cache Control flag to 1 if DSA supports cache control, 0 - DTO sets DSA Cache Control flag to 0
DTO_OVERLAPPING_MEMMOVE_ACTION=0/1, 0 (default) - DTO performs memmove operations with overlapping buffers entirely on the CPU, 1 - DTO submits them entirely to DSA
DTO_UMWAIT_DELAY=xxxx defines delay for umwait command (check max possible value at: /sys/devices/system/cpu/umwait_control/max_time), default is 100000
DTO_LOG_FILE=<dto log file path> Redirects the DTO output to the specified file instead of standard output (useful for debugging and statistics collection). The file name is suffixed with the process PID.
DTO_LOG_LEVEL=0/1/2 Controls the log level. A higher value means more verbose logging (default 0).
Expand Down
150 changes: 97 additions & 53 deletions dto.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,12 @@ enum numa_aware {
NA_LAST_ENTRY
};

/*
 * Policy for memmove calls whose source and destination buffers overlap.
 * The numeric values match the DTO_OVERLAPPING_MEMMOVE_ACTION setting.
 */
enum overlapping_memmove_actions {
	/* Perform the whole overlapping move on the CPU. */
	OVERLAPPING_CPU = 0,
	/* Submit the whole overlapping move to DSA. */
	OVERLAPPING_DSA = 1,
	/* One past the last action listed above. */
	OVERLAPPING_LAST_ENTRY = 2
};

static const char * const numa_aware_names[] = {
[NA_NONE] = "none",
[NA_BUFFER_CENTRIC] = "buffer-centric",
Expand Down Expand Up @@ -117,6 +123,8 @@ static uint8_t dto_dsa_cc = 1;

static unsigned long dto_umwait_delay = UMWAIT_DELAY_DEFAULT;

/*
 * Action taken for memmove calls with overlapping buffers (one of
 * enum overlapping_memmove_actions). Defaults to the CPU path; it is
 * overridden from the DTO_OVERLAPPING_MEMMOVE_ACTION environment variable.
 */
static uint8_t dto_overlapping_memmove_action = OVERLAPPING_CPU;

static uint8_t fork_handler_registered;

enum memop {
Expand Down Expand Up @@ -188,17 +196,17 @@ static struct timespec dto_start_time;
} while (0) \


#define DTO_COLLECT_STATS_DSA_END(cs, st, et, op, n, tbc, r) \
#define DTO_COLLECT_STATS_DSA_END(cs, st, et, op, n, overlap, tbc, r) \
do { \
if (unlikely(cs)) { \
uint64_t t; \
clock_gettime(CLOCK_BOOTTIME, &et); \
t = (((et.tv_sec*1000000000) + et.tv_nsec) - \
((st.tv_sec*1000000000) + st.tv_nsec)); \
if (unlikely(r != SUCCESS)) \
update_stats(op, n, tbc, t, DSA_CALL_FAILED, r); \
update_stats(op, n, overlap, tbc, t, DSA_CALL_FAILED, r); \
else \
update_stats(op, n, tbc, t, DSA_CALL_SUCCESS, 0); \
update_stats(op, n, overlap, tbc, t, DSA_CALL_SUCCESS, 0); \
} \
} while (0) \

Expand All @@ -209,7 +217,7 @@ static struct timespec dto_start_time;
clock_gettime(CLOCK_BOOTTIME, &et); \
t = (((et.tv_sec*1000000000) + et.tv_nsec) - \
((st.tv_sec*1000000000) + st.tv_nsec)); \
update_stats(op, orig_n, n, t, STDC_CALL, 0); \
update_stats(op, orig_n, false, n, t, STDC_CALL, 0); \
} \
} while (0) \

Expand Down Expand Up @@ -513,10 +521,7 @@ static __always_inline int dsa_execute(struct dto_wq *wq,
ret = 0;
}
if (!ret) {
if (auto_adjust_knobs)
dsa_wait_and_adjust(comp);
else
dsa_wait_no_adjust(comp);
dsa_wait_no_adjust(comp);

if (*comp == DSA_COMP_SUCCESS) {
thr_bytes_completed += hw->xfer_size;
Expand All @@ -532,9 +537,14 @@ static __always_inline int dsa_execute(struct dto_wq *wq,
}

#ifdef DTO_STATS_SUPPORT
static void update_stats(int op, size_t n, size_t bytes_completed,
static void update_stats(int op, size_t n, bool overlapping, size_t bytes_completed,
uint64_t elapsed_ns, int group, int error_code)
{
// dto_memcpymove didn't actually submit the request to DSA, so there is nothing to log. This will be captured by a second call
if (op == MEMMOVE && overlapping && dto_overlapping_memmove_action == OVERLAPPING_CPU && group == DSA_CALL_SUCCESS) {
return;
}

int bucket = (n / HIST_BUCKET_SIZE);

if (bucket >= HIST_NO_BUCKETS) /* last bucket includes remaining sizes */
Expand Down Expand Up @@ -1240,6 +1250,14 @@ static int init_dto(void)
dto_dsa_memcmp = !!dto_dsa_memcmp;
}

/*
 * DTO_OVERLAPPING_MEMMOVE_ACTION selects how overlapping memmove calls are
 * handled. Parse into an unsigned long first: assigning the strtoul()
 * result straight to the uint8_t setting would silently truncate values
 * such as 256 or 257 into a different, valid-looking action.
 */
env_str = getenv("DTO_OVERLAPPING_MEMMOVE_ACTION");
if (env_str != NULL) {
	unsigned long action;

	errno = 0;
	action = strtoul(env_str, NULL, 10);
	/* Conversion errors and unknown actions fall back to the default. */
	if (errno || action >= OVERLAPPING_LAST_ENTRY)
		dto_overlapping_memmove_action = OVERLAPPING_CPU;
	else
		dto_overlapping_memmove_action = (uint8_t)action;
}

#ifdef DTO_STATS_SUPPORT
env_str = getenv("DTO_COLLECT_STATS");
if (env_str != NULL) {
Expand Down Expand Up @@ -1484,55 +1502,74 @@ static bool is_overlapping_buffers (void *dest, const void *src, size_t n)
return true;
}

static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, int *result)
static bool dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy, int *result)
{
struct dto_wq *wq = get_wq(dest);
struct dto_wq *wq;
size_t cpu_size, dsa_size;
bool is_overlapping;

thr_desc.opcode = DSA_OPCODE_MEMMOVE;
thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR;
if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY))
thr_desc.flags |= IDXD_OP_FLAG_CC;
thr_desc.completion_addr = (uint64_t)&thr_comp;
thr_bytes_completed = 0;

/* cpu_size_fraction guaranteed to be >= 0 and < 1 */
if (!is_memcpy && is_overlapping_buffers(dest, src, n))
if (!is_memcpy && is_overlapping_buffers(dest, src, n)) {
cpu_size = 0;
else
is_overlapping = true;
} else {
/* cpu_size_fraction guaranteed to be >= 0 and < 1 */
cpu_size = n * cpu_size_fraction / 100;
is_overlapping = false;
}

// If this is an overlapping memmove and the action is to perform on CPU, return having done nothing and
// memmove will perform the copy and correctly attribute statistics to stdlib call group.
// Report zero bytes completed before returning: the caller decides whether to fall back to the
// standard memmove by comparing thr_bytes_completed against n, so a count left over from an
// earlier operation must not be visible here.
if (is_overlapping && dto_overlapping_memmove_action == OVERLAPPING_CPU) {
	thr_bytes_completed = 0;
	*result = SUCCESS;
	return true;
}

dsa_size = n - cpu_size;
wq = get_wq(dest);

thr_bytes_completed = 0;
thr_desc.opcode = DSA_OPCODE_MEMMOVE;
thr_desc.flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR;
if (dto_dsa_cc && (wq->dsa_gencap & GENCAP_CC_MEMORY))
thr_desc.flags |= IDXD_OP_FLAG_CC;
thr_desc.completion_addr = (uint64_t)&thr_comp;

if (dsa_size <= wq->max_transfer_size) {
thr_desc.src_addr = (uint64_t) src + cpu_size;
thr_desc.dst_addr = (uint64_t) dest + cpu_size;
thr_desc.xfer_size = (uint32_t) dsa_size;
thr_comp.status = 0;
*result = dsa_submit(wq, &thr_desc);
if (*result == SUCCESS) {
if (cpu_size) {
if (is_memcpy)
orig_memcpy(dest, src, cpu_size);
else
orig_memmove(dest, src, cpu_size);
thr_bytes_completed += cpu_size;
if (is_overlapping) {
*result = dsa_execute(wq, &thr_desc, &thr_comp.status);
} else {
*result = dsa_submit(wq, &thr_desc);
if (*result == SUCCESS) {
if (cpu_size) {
if (is_memcpy)
orig_memcpy(dest, src, cpu_size);
else
orig_memmove(dest, src, cpu_size);
thr_bytes_completed += cpu_size;
}
*result = dsa_wait(wq, &thr_desc, &thr_comp.status);
}
*result = dsa_wait(wq, &thr_desc, &thr_comp.status);
}
} else {
uint32_t threshold;
size_t current_cpu_size_fraction = cpu_size_fraction; // the cpu_size_fraction might be changed by the auto tune algorithm
threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction);
if (is_overlapping) {
threshold = wq->max_transfer_size;
} else {
threshold = wq->max_transfer_size * 100 / (100 - current_cpu_size_fraction);
}

do {
size_t len;

len = n <= threshold ? n : threshold;

if (!is_memcpy && is_overlapping_buffers(dest, src, len))
cpu_size = 0;
else
if (!is_overlapping)
cpu_size = len * current_cpu_size_fraction / 100;

dsa_size = len - cpu_size;
Expand All @@ -1541,30 +1578,36 @@ static void dto_memcpymove(void *dest, const void *src, size_t n, bool is_memcpy
thr_desc.dst_addr = (uint64_t) dest + cpu_size + thr_bytes_completed;
thr_desc.xfer_size = (uint32_t) dsa_size;
thr_comp.status = 0;
*result = dsa_submit(wq, &thr_desc);
if (*result == SUCCESS) {
if (cpu_size) {
const void *src1 = src + thr_bytes_completed;
void *dest1 = dest + thr_bytes_completed;

if (is_memcpy)
orig_memcpy(dest1, src1, cpu_size);
else
orig_memmove(dest1, src1, cpu_size);
thr_bytes_completed += cpu_size;
if (is_overlapping){
*result = dsa_execute(wq, &thr_desc, &thr_comp.status);
} else {
*result = dsa_submit(wq, &thr_desc);
if (*result == SUCCESS) {
if (cpu_size) {
const void *src1 = src + thr_bytes_completed;
void *dest1 = dest + thr_bytes_completed;

if (is_memcpy)
orig_memcpy(dest1, src1, cpu_size);
else
orig_memmove(dest1, src1, cpu_size);
thr_bytes_completed += cpu_size;
}
*result = dsa_wait(wq, &thr_desc, &thr_comp.status);
}
*result = dsa_wait(wq, &thr_desc, &thr_comp.status);
}

if (*result != SUCCESS)
break;
n -= len;
/* If remaining bytes are less than dsa_min_size,
* dont submit to DSA. Instead, complete remaining
* bytes on CPU
*/
* dont submit to DSA. Instead, complete remaining
* bytes on CPU
*/
} while (n >= dsa_min_size);
}

return is_overlapping;
}

static int dto_memcmp(const void *s1, const void *s2, size_t n, int *result)
Expand Down Expand Up @@ -1694,7 +1737,7 @@ void *memset(void *s1, int c, size_t n)
dto_memset(s1, c, n, &result);

#ifdef DTO_STATS_SUPPORT
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMSET, n, thr_bytes_completed, result);
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMSET, n, false, thr_bytes_completed, result);
#endif
if (thr_bytes_completed != n) {
/* fallback to std call if job is only partially completed */
Expand Down Expand Up @@ -1744,7 +1787,7 @@ void *memcpy(void *dest, const void *src, size_t n)
dto_memcpymove(dest, src, n, 1, &result);

#ifdef DTO_STATS_SUPPORT
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY, n, thr_bytes_completed, result);
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCOPY, n, false, thr_bytes_completed, result);
#endif
if (thr_bytes_completed != n) {
/* fallback to std call if job is only partially completed */
Expand Down Expand Up @@ -1776,6 +1819,7 @@ void *memmove(void *dest, const void *src, size_t n)
int result = 0;
void *ret = dest;
int use_orig_func = USE_ORIG_FUNC(n, dto_dsa_memmove);
bool is_overlapping;
#ifdef DTO_STATS_SUPPORT
struct timespec st, et;
size_t orig_n = n;
Expand All @@ -1794,10 +1838,10 @@ void *memmove(void *dest, const void *src, size_t n)
#ifdef DTO_STATS_SUPPORT
DTO_COLLECT_STATS_START(collect_stats, st);
#endif
dto_memcpymove(dest, src, n, 0, &result);
is_overlapping = dto_memcpymove(dest, src, n, 0, &result);

#ifdef DTO_STATS_SUPPORT
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMMOVE, n, thr_bytes_completed, result);
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMMOVE, n, is_overlapping, thr_bytes_completed, result);
#endif
if (thr_bytes_completed != n) {
/* fallback to std call if job is only partially completed */
Expand Down Expand Up @@ -1850,7 +1894,7 @@ int memcmp(const void *s1, const void *s2, size_t n)
ret = dto_memcmp(s1, s2, n, &result);

#ifdef DTO_STATS_SUPPORT
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCMP, n, thr_bytes_completed, result);
DTO_COLLECT_STATS_DSA_END(collect_stats, st, et, MEMCMP, n, false, thr_bytes_completed, result);
#endif
if (thr_bytes_completed != n) {
/* fallback to std call if job is only partially completed */
Expand Down