Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 41 additions & 6 deletions ebpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ TRACEPOINT_PROBE(sched, sched_switch) {
u64 ts = bpf_ktime_get_ns();
u32 cpu = bpf_get_smp_processor_id();

// TASK_COMM_LEN +1 because of \0 ?
// https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#6-bpf_get_current_comm
char prev_comm[TASK_COMM_LEN];
char next_comm[TASK_COMM_LEN];
bpf_probe_read_kernel(&prev_comm, sizeof(prev_comm), args->prev_comm);
Expand All @@ -48,17 +50,33 @@ TRACEPOINT_PROBE(sched, sched_switch) {
// Handle the idle task being scheduled out
if (prev_pid == 0) {
// Idle task is being scheduled out
// For a lookup in the hash map, do I really have to pass the address of the int? Really? Not the value itself?
// https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#19-maplookup
u64 *start_ns = idle_start_time_ns.lookup(&cpu);
// what happens if start_ns is 0? not better compare with NULL ?
// true, but start_ns shouldn't be 0. But you could make it more explicit. As lookup returns NULL if the key is not found
if (start_ns) {
u64 delta = ts - *start_ns;
u64 *total_ns = idle_time_ns.lookup(&cpu);
// what happens if total_ns is 0? This should be more likely than start_ns not better compare with NULL ?
if (total_ns) {
*total_ns += delta;
// missing update command? should it not be idle_time_ns.update(&cpu, total_ns);
// I update the pointed to value. So it should update in the map
} else {
// if you store the addresses and not the values, why are these u64?
// Becaue the map stores the acutal value not the pointer
idle_time_ns.update(&cpu, &delta);

}
idle_start_time_ns.delete(&cpu);
}

// missing return statement? Why continue here?
// if prev_pid = 0 can it also be that next_pid = ?
// the event takes place per CPU, right? So I guess it cannot be ...
// But I want to look at the next task?

}

// Handle the idle task being scheduled in
Expand All @@ -70,31 +88,44 @@ TRACEPOINT_PROBE(sched, sched_switch) {

// Handle the task that is being switched out
u64 *start_ns = start_time.lookup(&prev_pid);
// what happens if start_ns is 0? not better compare with NULL ?
if (start_ns) {
u64 delta = ts - *start_ns;
// why do you use a different command here lookup => lookup_or_try_init ?
// Because the value can not exist here
u64 *total_ns = cpu_time_ns.lookup_or_try_init(&prev_pid, &delta);
if (total_ns) {
*total_ns += delta;
// do you not need to write total_ns somewhere?
// It is saved by the pointer
}
start_time.delete(&prev_pid);
} else {
u64 zero = 0;
u64 *total_ns = cpu_time_ns.lookup_or_try_init(&prev_pid, &zero);
// do you not need to write total_ns somewhere?
}


// Record the start time for the task being switched in
// This call is relevant for including or excluding the overhead of this eBPF script and is somehow in nowhere land ... Neither as early as possible, nor as late as possible. Where is it placed best? Does the event happen AFTER being scheduled on the CPU or BEFORE. In the former case I would argue to push this call to the end of the function. In the latter I would push it more to the start.
// True, I just wanted things that belong togehter to be at the same place in the code
start_time.update(&next_pid, &ts);

// Increment wakeup count for the task being switched in
// is that really all the wakeups? When a process is assigned to a CPU and has I/O wait it is not scheduled out, but still can have wakeups happening from I/O being available. Can we have a validation script for eBPF and perf_events that compares the wakeups?
// you could also compare with sched:sched_wakeup
u64 zero = 0;
u64 *wakeup_count = cpu_wakeups.lookup_or_try_init(&next_pid, &zero);
if (wakeup_count) {
*wakeup_count += 1;
}

// what is coming out of bpf_get_current_task? A void pointer? suprising that you have to cast here
// https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#7-bpf_get_current_task
struct task_struct *task = (struct task_struct *)bpf_get_current_task();
if ((task->flags & PF_KTHREAD) == 0) {
if ((task->flags & PF_KTHREAD) == 0) { // what does this line mean?
// It checks if this is a kernel thread
u32 pid = task->pid;
u64 zero = 0;
u64 *mem_usage = mem_usage_bytes.lookup_or_try_init(&pid, &zero);
Expand Down Expand Up @@ -133,19 +164,21 @@ TRACEPOINT_PROBE(sched, sched_switch) {
// }

// Network packet monitoring
// can you document the event netif_receive_skb and why you chose this over other possible events?
TRACEPOINT_PROBE(net, netif_receive_skb) {
u32 pid = bpf_get_current_pid_tgid() >> 32;
u64 one = 1;
u32 pid = bpf_get_current_pid_tgid() >> 32; // what does this right shift do exactly?
u64 one = 1; // why do you supply one here instead of zero in the sched Tracepoint? Is this tracepoint called for EVERY packet?
u64 *rx_packets = net_rx_packets.lookup_or_try_init(&pid, &one);
if (rx_packets) {
*rx_packets += 1;
}
return 0;
}

// can you document the event net_dev_queue and why you chose this over other possible events?
TRACEPOINT_PROBE(net, net_dev_queue) {
u32 pid = bpf_get_current_pid_tgid() >> 32;
u64 one = 1;
u32 pid = bpf_get_current_pid_tgid() >> 32; // what does this right shift do exactly?
u64 one = 1; // why do you supply one here instead of zero in the sched Tracepoint? Is this tracepoint called for EVERY packet?
u64 *tx_packets = net_tx_packets.lookup_or_try_init(&pid, &one);
if (tx_packets) {
*tx_packets += 1;
Expand All @@ -154,6 +187,7 @@ TRACEPOINT_PROBE(net, net_dev_queue) {
}

// Disk I/O monitoring
// can you document the event block_rq_issue and why you chose this over other possible events?
TRACEPOINT_PROBE(block, block_rq_issue) {
u32 pid = bpf_get_current_pid_tgid() >> 32;
u64 bytes = args->bytes;
Expand All @@ -162,6 +196,7 @@ TRACEPOINT_PROBE(block, block_rq_issue) {
bpf_probe_read_kernel(&rwbs, sizeof(rwbs), args->rwbs);

// Determine if the operation is a read or write
// Are we ignoring other types like flush and sync? Or do they also trigger a write?
if (rwbs[0] == 'R') {
u64 *read_bytes = disk_read_bytes.lookup_or_try_init(&pid, &bytes);
if (read_bytes) {
Expand All @@ -174,4 +209,4 @@ TRACEPOINT_PROBE(block, block_rq_issue) {
}
}
return 0;
}
}
27 changes: 17 additions & 10 deletions powerletrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ class PidComm(ctypes.Structure):
config = configparser.ConfigParser()
config.read('weights.conf')

## Explanation for the weights?

# Weights for each component
CPU_WEIGHT = float(config['Weights'].get('CPU_WEIGHT', 1))
WAKEUP_WEIGHT = float(config['Weights'].get('WAKEUP_WEIGHT', 0))
Expand All @@ -77,6 +79,7 @@ class PidComm(ctypes.Structure):
page_size = os.sysconf('SC_PAGE_SIZE')


# What does this object represent? A process? Maybe name it process then?
class Data:
def __init__(self):
self.pid = -1
Expand All @@ -93,10 +96,10 @@ def __init__(self):
self.is_kernel_thread = False

def memory_usage_mb(self):
return self.memory_usage / (1024 * 1024)
return self.memory_usage / (1024 * 1024) # Think about defining a constant here? BYTES_TO_MB instead of function?

def ebpf_memory_usage_mb(self):
return self.ebpf_memory_usage / (1024 * 1024)
return self.ebpf_memory_usage / (1024 * 1024) # Think about defining a constant here? BYTES_TO_MB instead of function?

def cpu_utilization(self):
if self.pid == 0:
Expand All @@ -105,13 +108,16 @@ def cpu_utilization(self):
idle_times = b.get_table("idle_time_ns")
self.cpu_time_ns = sum([idle_times[cpu_id].value for cpu_id in idle_times.keys()])

return (data.cpu_time_ns / (interval_ns * num_cpus)) * 100
# What is data ?
return (data.cpu_time_ns / (interval_ns * num_cpus)) * 100 # Is this wall time or is this cycles?

def energy_impact(self):
# I do not understand why the idle thread has no energy impact? Does it not have any of these metrics like wakeups?
if self.pid == 0:
# We can't really assign a value to the system being idle on one to n cores
return 0

# What is data?
return (CPU_WEIGHT * data.cpu_utilization()) + \
(WAKEUP_WEIGHT * data.cpu_wakeups) + \
(DISK_WRITE_WEIGHT * data.disk_write_bytes) + \
Expand All @@ -129,8 +135,8 @@ def energy_impact(self):
print("Starting powerletrics monitoring. Press Ctrl+C to stop.")

while True:
start_loop_time = datetime.datetime.now()
time.sleep(sample_interval_sec)
start_loop_time = datetime.datetime.now() # maybe use a true monotonic timer here? Thinking leap seconds and daylight savings time ...
time.sleep(sample_interval_sec) # how granular does this sleep need to be? Maybe use better python timer?

# Retrieve data from eBPF maps
cpu_times = b.get_table("cpu_time_ns")
Expand All @@ -149,7 +155,7 @@ def energy_impact(self):
is_kernel_thread = False

pid = pid_key.value
data = Data()
data = Data() # this would be nicer in an __init__ function as submitted params, but I see the issue with .value
data.pid = pid
data.cpu_time_ns = cpu_times[pid_key].value
data.cpu_wakeups = wakeups[pid_key].value if pid_key in wakeups else 0
Expand All @@ -159,7 +165,7 @@ def energy_impact(self):
data.disk_write_bytes = disk_writes[pid_key].value if pid_key in disk_writes else 0

pid_comm = pid_comm_map.get(ctypes.c_uint32(pid))
if pid_comm:
if pid_comm: # can this not false to 0 ?
data.comm = pid_comm.comm.decode('utf-8', 'replace')
else:
data.comm = "<unknown>"
Expand Down Expand Up @@ -187,6 +193,7 @@ def energy_impact(self):

if not data.is_kernel_thread:
try:
# I do not understand the limitation here ... why do it if it will not be useful?
# We don't get the memory through ebpf in the default case as there is no way to iterate over all
# processes in eBPF for security reasons.
# If you want to still use eBPD you can enable it with --ebpf-memory.
Expand Down Expand Up @@ -214,7 +221,7 @@ def energy_impact(self):
data_list.sort(key=lambda x: x.energy_impact(), reverse=True)

elapsed_time = datetime.datetime.now() - start_loop_time
elapsed_time_ms = elapsed_time.total_seconds() * 1000
elapsed_time_ms = elapsed_time.total_seconds() * 1000 # Think about defining a constant here? FROM_S_TO_MS ?

current_time = datetime.datetime.now().strftime("%a %b %d %H:%M:%S %Y %z")
if args.format == 'text':
Expand Down Expand Up @@ -327,13 +334,13 @@ def energy_impact(self):

plist_data.append(data_dict)

if args.output_file:
if args.output_file: # is this appended?
with open(args.output_file, 'wb') as f:
plistlib.dump(plist_data, f, fmt=plistlib.FMT_XML)
else:
plistlib.dump(plist_data, sys.stdout.buffer, fmt=plistlib.FMT_XML)

cpu_times.clear()
cpu_times.clear() # What does this do exactly, given hat an eBPF structure is requested?
wakeups.clear()
rx_packets.clear()
tx_packets.clear()
Expand Down