From 7240d589389aa1493d40f01a87e9ff37ed97187e Mon Sep 17 00:00:00 2001 From: Arne Tarara Date: Thu, 10 Oct 2024 16:08:00 +0200 Subject: [PATCH 1/3] Added code review of ebpf.c --- ebpf.c | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/ebpf.c b/ebpf.c index 9726faf..b7ac9ef 100644 --- a/ebpf.c +++ b/ebpf.c @@ -29,6 +29,7 @@ TRACEPOINT_PROBE(sched, sched_switch) { u64 ts = bpf_ktime_get_ns(); u32 cpu = bpf_get_smp_processor_id(); + // TASK_COMM_LEN +1 because of \0 ? char prev_comm[TASK_COMM_LEN]; char next_comm[TASK_COMM_LEN]; bpf_probe_read_kernel(&prev_comm, sizeof(prev_comm), args->prev_comm); @@ -48,17 +49,26 @@ TRACEPOINT_PROBE(sched, sched_switch) { // Handle the idle task being scheduled out if (prev_pid == 0) { // Idle task is being scheduled out + // For a lookup in the hash map, do I really have to pass the address of the int? Really? Not the value itself? u64 *start_ns = idle_start_time_ns.lookup(&cpu); + // what happens if start_ns is 0? not better compare with NULL ? if (start_ns) { u64 delta = ts - *start_ns; u64 *total_ns = idle_time_ns.lookup(&cpu); + // what happens if total_ns is 0? This should be more likely than start_ns not better compare with NULL ? if (total_ns) { *total_ns += delta; + // missing update command? should it not be idle_time_ns.update(&cpu, total_ns); } else { + // if you store the addresses and not the values, why are these u64? idle_time_ns.update(&cpu, &delta); } idle_start_time_ns.delete(&cpu); } + + // missing return statement? Why continue here? + // if prev_pid = 0 can it also be that next_pid = ? + // the event takes place per CPU, right? So I guess it cannot be ... } // Handle the idle task being scheduled in @@ -70,31 +80,39 @@ TRACEPOINT_PROBE(sched, sched_switch) { // Handle the task that is being switched out u64 *start_ns = start_time.lookup(&prev_pid); + // what happens if start_ns is 0? not better compare with NULL ? if (start_ns) { u64 delta = ts - *start_ns; + // why do you use a different command here lookup => lookup_or_try_init ? u64 *total_ns = cpu_time_ns.lookup_or_try_init(&prev_pid, &delta); if (total_ns) { *total_ns += delta; + // do you not need to write total_ns somewhere? } start_time.delete(&prev_pid); } else { u64 zero = 0; u64 *total_ns = cpu_time_ns.lookup_or_try_init(&prev_pid, &zero); + // do you not need to write total_ns somewhere? } // Record the start time for the task being switched in + // This call is relevant for including or excluding the overhead of this eBPF script and is somehow in nowhere land ... Neither as early as possible, nor as late as possible. Where is it placed best? Does the event happen AFTER being scheduled on the CPU or BEFORE. In the former case I would argue to push this call to the end of the function. In the latter I would push it more to the start. start_time.update(&next_pid, &ts); // Increment wakeup count for the task being switched in + // is that really all the wakeups? When a process is assigned to a CPU and has I/O wait it is not scheduled out, but still can have wakeups happening from I/O being available. Can we have a validation script for eBPF and perf_events that compares the wakeups? + // you could also compare with sched:sched_wakeup u64 zero = 0; u64 *wakeup_count = cpu_wakeups.lookup_or_try_init(&next_pid, &zero); if (wakeup_count) { *wakeup_count += 1; } + // what is coming out of bpf_get_current_task? A void pointer? suprising that you have to cast here struct task_struct *task = (struct task_struct *)bpf_get_current_task(); - if ((task->flags & PF_KTHREAD) == 0) { + if ((task->flags & PF_KTHREAD) == 0) { // what does this line mean? u32 pid = task->pid; u64 zero = 0; u64 *mem_usage = mem_usage_bytes.lookup_or_try_init(&pid, &zero); @@ -133,9 +151,10 @@ TRACEPOINT_PROBE(sched, sched_switch) { // } // Network packet monitoring +// can you document the event netif_receive_skb and why you chose this over other possible events? TRACEPOINT_PROBE(net, netif_receive_skb) { - u32 pid = bpf_get_current_pid_tgid() >> 32; - u64 one = 1; + u32 pid = bpf_get_current_pid_tgid() >> 32; // what does this right shift do exactly? + u64 one = 1; // why do you supply one here instead of zero in the sched Tracepoint? Is this tracepoint called for EVERY packet? u64 *rx_packets = net_rx_packets.lookup_or_try_init(&pid, &one); if (rx_packets) { *rx_packets += 1; @@ -143,9 +162,10 @@ TRACEPOINT_PROBE(net, netif_receive_skb) { return 0; } +// can you document the event net_dev_queue and why you chose this over other possible events? TRACEPOINT_PROBE(net, net_dev_queue) { - u32 pid = bpf_get_current_pid_tgid() >> 32; - u64 one = 1; + u32 pid = bpf_get_current_pid_tgid() >> 32; // what does this right shift do exactly? + u64 one = 1; // why do you supply one here instead of zero in the sched Tracepoint? Is this tracepoint called for EVERY packet? u64 *tx_packets = net_tx_packets.lookup_or_try_init(&pid, &one); if (tx_packets) { *tx_packets += 1; @@ -154,6 +174,7 @@ TRACEPOINT_PROBE(net, net_dev_queue) { } // Disk I/O monitoring +// can you document the event block_rq_issue and why you chose this over other possible events? TRACEPOINT_PROBE(block, block_rq_issue) { u32 pid = bpf_get_current_pid_tgid() >> 32; u64 bytes = args->bytes; @@ -162,6 +183,7 @@ TRACEPOINT_PROBE(block, block_rq_issue) { bpf_probe_read_kernel(&rwbs, sizeof(rwbs), args->rwbs); // Determine if the operation is a read or write + // Are we ignoring other types like flush and sync? Or do they also trigger a write? if (rwbs[0] == 'R') { u64 *read_bytes = disk_read_bytes.lookup_or_try_init(&pid, &bytes); if (read_bytes) { From b057aa2f616849a151c7d60b616f2618d7cc1b43 Mon Sep 17 00:00:00 2001 From: Didi Hoffmann <39629+ribalba@users.noreply.github.com> Date: Tue, 29 Oct 2024 12:01:20 +0100 Subject: [PATCH 2/3] Didi comments --- ebpf.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/ebpf.c b/ebpf.c index b7ac9ef..0360d78 100644 --- a/ebpf.c +++ b/ebpf.c @@ -30,6 +30,7 @@ TRACEPOINT_PROBE(sched, sched_switch) { u32 cpu = bpf_get_smp_processor_id(); // TASK_COMM_LEN +1 because of \0 ? + // https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#6-bpf_get_current_comm char prev_comm[TASK_COMM_LEN]; char next_comm[TASK_COMM_LEN]; bpf_probe_read_kernel(&prev_comm, sizeof(prev_comm), args->prev_comm); @@ -50,8 +51,10 @@ TRACEPOINT_PROBE(sched, sched_switch) { if (prev_pid == 0) { // Idle task is being scheduled out // For a lookup in the hash map, do I really have to pass the address of the int? Really? Not the value itself? + // https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#19-maplookup u64 *start_ns = idle_start_time_ns.lookup(&cpu); // what happens if start_ns is 0? not better compare with NULL ? + // true, but start_ns shouldn't be 0. But you could make it more explicit. As lookup returns NULL if the key is not found if (start_ns) { u64 delta = ts - *start_ns; u64 *total_ns = idle_time_ns.lookup(&cpu); @@ -59,9 +62,12 @@ TRACEPOINT_PROBE(sched, sched_switch) { if (total_ns) { *total_ns += delta; // missing update command? should it not be idle_time_ns.update(&cpu, total_ns); + // I update the pointed to value. So it should update in the map } else { // if you store the addresses and not the values, why are these u64? + // Becaue the map stores the acutal value not the pointer idle_time_ns.update(&cpu, &delta); + } idle_start_time_ns.delete(&cpu); } @@ -69,6 +75,8 @@ TRACEPOINT_PROBE(sched, sched_switch) { // missing return statement? Why continue here? // if prev_pid = 0 can it also be that next_pid = ? // the event takes place per CPU, right? So I guess it cannot be ... + // But I want to look at the next task? + } // Handle the idle task being scheduled in @@ -84,10 +92,12 @@ TRACEPOINT_PROBE(sched, sched_switch) { if (start_ns) { u64 delta = ts - *start_ns; // why do you use a different command here lookup => lookup_or_try_init ? + // Because the value can not exist here u64 *total_ns = cpu_time_ns.lookup_or_try_init(&prev_pid, &delta); if (total_ns) { *total_ns += delta; // do you not need to write total_ns somewhere? + // It is saved by the pointer } start_time.delete(&prev_pid); } else { @@ -99,6 +109,7 @@ TRACEPOINT_PROBE(sched, sched_switch) { // Record the start time for the task being switched in // This call is relevant for including or excluding the overhead of this eBPF script and is somehow in nowhere land ... Neither as early as possible, nor as late as possible. Where is it placed best? Does the event happen AFTER being scheduled on the CPU or BEFORE. In the former case I would argue to push this call to the end of the function. In the latter I would push it more to the start. + // True, I just wanted things that belong togehter to be at the same place in the code start_time.update(&next_pid, &ts); // Increment wakeup count for the task being switched in @@ -111,8 +122,10 @@ TRACEPOINT_PROBE(sched, sched_switch) { } // what is coming out of bpf_get_current_task? A void pointer? suprising that you have to cast here + // https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#7-bpf_get_current_task struct task_struct *task = (struct task_struct *)bpf_get_current_task(); if ((task->flags & PF_KTHREAD) == 0) { // what does this line mean? + // It checks if this is a kernel thread u32 pid = task->pid; u64 zero = 0; u64 *mem_usage = mem_usage_bytes.lookup_or_try_init(&pid, &zero); @@ -196,4 +209,4 @@ TRACEPOINT_PROBE(block, block_rq_issue) { } } return 0; -} \ No newline at end of file +} From bddb29a9190bc00cd53ea115f0864ba0c182a449 Mon Sep 17 00:00:00 2001 From: Arne Tarara Date: Tue, 29 Oct 2024 13:36:06 +0100 Subject: [PATCH 3/3] Python code review --- powerletrics.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/powerletrics.py b/powerletrics.py index 9669b44..42a2dba 100755 --- a/powerletrics.py +++ b/powerletrics.py @@ -57,6 +57,8 @@ class PidComm(ctypes.Structure): config = configparser.ConfigParser() config.read('weights.conf') +## Explanation for the weights? + # Weights for each component CPU_WEIGHT = float(config['Weights'].get('CPU_WEIGHT', 1)) WAKEUP_WEIGHT = float(config['Weights'].get('WAKEUP_WEIGHT', 0)) @@ -77,6 +79,7 @@ class PidComm(ctypes.Structure): page_size = os.sysconf('SC_PAGE_SIZE') +# What does this object represent? A process? Maybe name it process then? class Data: def __init__(self): self.pid = -1 @@ -93,10 +96,10 @@ def __init__(self): self.is_kernel_thread = False def memory_usage_mb(self): - return self.memory_usage / (1024 * 1024) + return self.memory_usage / (1024 * 1024) # Think about defining a constant here? BYTES_TO_MB instead of function? def ebpf_memory_usage_mb(self): - return self.ebpf_memory_usage / (1024 * 1024) + return self.ebpf_memory_usage / (1024 * 1024) # Think about defining a constant here? BYTES_TO_MB instead of function? def cpu_utilization(self): if self.pid == 0: @@ -105,13 +108,16 @@ def cpu_utilization(self): idle_times = b.get_table("idle_time_ns") self.cpu_time_ns = sum([idle_times[cpu_id].value for cpu_id in idle_times.keys()]) - return (data.cpu_time_ns / (interval_ns * num_cpus)) * 100 + # What is data ? + return (data.cpu_time_ns / (interval_ns * num_cpus)) * 100 # Is this wall time or is this cycles? def energy_impact(self): + # I do not understand why the idle thread has no energy impact? Does it not have any of these metrics like wakeups? if self.pid == 0: # We can't really assign a value to the system being idle on one to n cores return 0 + # What is data? return (CPU_WEIGHT * data.cpu_utilization()) + \ (WAKEUP_WEIGHT * data.cpu_wakeups) + \ (DISK_WRITE_WEIGHT * data.disk_write_bytes) + \ @@ -129,8 +135,8 @@ def energy_impact(self): print("Starting powerletrics monitoring. Press Ctrl+C to stop.") while True: - start_loop_time = datetime.datetime.now() - time.sleep(sample_interval_sec) + start_loop_time = datetime.datetime.now() # maybe use a true monotonic timer here? Thinking leap seconds and daylight savings time ... + time.sleep(sample_interval_sec) # how granular does this sleep need to be? Maybe use better python timer? # Retrieve data from eBPF maps cpu_times = b.get_table("cpu_time_ns") @@ -149,7 +155,7 @@ def energy_impact(self): is_kernel_thread = False pid = pid_key.value - data = Data() + data = Data() # this would be nicer in an __init__ function as submitted params, but I see the issue with .value data.pid = pid data.cpu_time_ns = cpu_times[pid_key].value data.cpu_wakeups = wakeups[pid_key].value if pid_key in wakeups else 0 @@ -159,7 +165,7 @@ def energy_impact(self): data.disk_write_bytes = disk_writes[pid_key].value if pid_key in disk_writes else 0 pid_comm = pid_comm_map.get(ctypes.c_uint32(pid)) - if pid_comm: + if pid_comm: # can this not false to 0 ? data.comm = pid_comm.comm.decode('utf-8', 'replace') else: data.comm = "" @@ -187,6 +193,7 @@ def energy_impact(self): if not data.is_kernel_thread: try: + # I do not understand the limitation here ... why do it if it will not be useful? # We don't get the memory through ebpf in the default case as there is no way to iterate over all # processes in eBPF for security reasons. # If you want to still use eBPD you can enable it with --ebpf-memory. @@ -214,7 +221,7 @@ def energy_impact(self): data_list.sort(key=lambda x: x.energy_impact(), reverse=True) elapsed_time = datetime.datetime.now() - start_loop_time - elapsed_time_ms = elapsed_time.total_seconds() * 1000 + elapsed_time_ms = elapsed_time.total_seconds() * 1000 # Think about defining a constant here? FROM_S_TO_MS ? current_time = datetime.datetime.now().strftime("%a %b %d %H:%M:%S %Y %z") if args.format == 'text': @@ -327,13 +334,13 @@ def energy_impact(self): plist_data.append(data_dict) - if args.output_file: + if args.output_file: # is this appended? with open(args.output_file, 'wb') as f: plistlib.dump(plist_data, f, fmt=plistlib.FMT_XML) else: plistlib.dump(plist_data, sys.stdout.buffer, fmt=plistlib.FMT_XML) - cpu_times.clear() + cpu_times.clear() # What does this do exactly, given hat an eBPF structure is requested? wakeups.clear() rx_packets.clear() tx_packets.clear()