diff --git a/ebpf.c b/ebpf.c index 9726faf..0360d78 100644 --- a/ebpf.c +++ b/ebpf.c @@ -29,6 +29,8 @@ TRACEPOINT_PROBE(sched, sched_switch) { u64 ts = bpf_ktime_get_ns(); u32 cpu = bpf_get_smp_processor_id(); + // TASK_COMM_LEN +1 because of \0 ? + // https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#6-bpf_get_current_comm char prev_comm[TASK_COMM_LEN]; char next_comm[TASK_COMM_LEN]; bpf_probe_read_kernel(&prev_comm, sizeof(prev_comm), args->prev_comm); @@ -48,17 +50,33 @@ TRACEPOINT_PROBE(sched, sched_switch) { // Handle the idle task being scheduled out if (prev_pid == 0) { // Idle task is being scheduled out + // For a lookup in the hash map, do I really have to pass the address of the int? Really? Not the value itself? + // https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#19-maplookup u64 *start_ns = idle_start_time_ns.lookup(&cpu); + // what happens if start_ns is 0? not better compare with NULL ? + // true, but start_ns shouldn't be 0. But you could make it more explicit. As lookup returns NULL if the key is not found if (start_ns) { u64 delta = ts - *start_ns; u64 *total_ns = idle_time_ns.lookup(&cpu); + // what happens if total_ns is 0? This should be more likely than start_ns not better compare with NULL ? if (total_ns) { *total_ns += delta; + // missing update command? should it not be idle_time_ns.update(&cpu, total_ns); + // I update the pointed to value. So it should update in the map } else { + // if you store the addresses and not the values, why are these u64? + // Becaue the map stores the acutal value not the pointer idle_time_ns.update(&cpu, &delta); + } idle_start_time_ns.delete(&cpu); } + + // missing return statement? Why continue here? + // if prev_pid = 0 can it also be that next_pid = ? + // the event takes place per CPU, right? So I guess it cannot be ... + // But I want to look at the next task? + } // Handle the idle task being scheduled in @@ -70,31 +88,44 @@ TRACEPOINT_PROBE(sched, sched_switch) { // Handle the task that is being switched out u64 *start_ns = start_time.lookup(&prev_pid); + // what happens if start_ns is 0? not better compare with NULL ? if (start_ns) { u64 delta = ts - *start_ns; + // why do you use a different command here lookup => lookup_or_try_init ? + // Because the value can not exist here u64 *total_ns = cpu_time_ns.lookup_or_try_init(&prev_pid, &delta); if (total_ns) { *total_ns += delta; + // do you not need to write total_ns somewhere? + // It is saved by the pointer } start_time.delete(&prev_pid); } else { u64 zero = 0; u64 *total_ns = cpu_time_ns.lookup_or_try_init(&prev_pid, &zero); + // do you not need to write total_ns somewhere? } // Record the start time for the task being switched in + // This call is relevant for including or excluding the overhead of this eBPF script and is somehow in nowhere land ... Neither as early as possible, nor as late as possible. Where is it placed best? Does the event happen AFTER being scheduled on the CPU or BEFORE. In the former case I would argue to push this call to the end of the function. In the latter I would push it more to the start. + // True, I just wanted things that belong togehter to be at the same place in the code start_time.update(&next_pid, &ts); // Increment wakeup count for the task being switched in + // is that really all the wakeups? When a process is assigned to a CPU and has I/O wait it is not scheduled out, but still can have wakeups happening from I/O being available. Can we have a validation script for eBPF and perf_events that compares the wakeups? + // you could also compare with sched:sched_wakeup u64 zero = 0; u64 *wakeup_count = cpu_wakeups.lookup_or_try_init(&next_pid, &zero); if (wakeup_count) { *wakeup_count += 1; } + // what is coming out of bpf_get_current_task? A void pointer? suprising that you have to cast here + // https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#7-bpf_get_current_task struct task_struct *task = (struct task_struct *)bpf_get_current_task(); - if ((task->flags & PF_KTHREAD) == 0) { + if ((task->flags & PF_KTHREAD) == 0) { // what does this line mean? + // It checks if this is a kernel thread u32 pid = task->pid; u64 zero = 0; u64 *mem_usage = mem_usage_bytes.lookup_or_try_init(&pid, &zero); @@ -133,9 +164,10 @@ TRACEPOINT_PROBE(sched, sched_switch) { // } // Network packet monitoring +// can you document the event netif_receive_skb and why you chose this over other possible events? TRACEPOINT_PROBE(net, netif_receive_skb) { - u32 pid = bpf_get_current_pid_tgid() >> 32; - u64 one = 1; + u32 pid = bpf_get_current_pid_tgid() >> 32; // what does this right shift do exactly? + u64 one = 1; // why do you supply one here instead of zero in the sched Tracepoint? Is this tracepoint called for EVERY packet? u64 *rx_packets = net_rx_packets.lookup_or_try_init(&pid, &one); if (rx_packets) { *rx_packets += 1; @@ -143,9 +175,10 @@ TRACEPOINT_PROBE(net, netif_receive_skb) { return 0; } +// can you document the event net_dev_queue and why you chose this over other possible events? TRACEPOINT_PROBE(net, net_dev_queue) { - u32 pid = bpf_get_current_pid_tgid() >> 32; - u64 one = 1; + u32 pid = bpf_get_current_pid_tgid() >> 32; // what does this right shift do exactly? + u64 one = 1; // why do you supply one here instead of zero in the sched Tracepoint? Is this tracepoint called for EVERY packet? u64 *tx_packets = net_tx_packets.lookup_or_try_init(&pid, &one); if (tx_packets) { *tx_packets += 1; @@ -154,6 +187,7 @@ TRACEPOINT_PROBE(net, net_dev_queue) { } // Disk I/O monitoring +// can you document the event block_rq_issue and why you chose this over other possible events? TRACEPOINT_PROBE(block, block_rq_issue) { u32 pid = bpf_get_current_pid_tgid() >> 32; u64 bytes = args->bytes; @@ -162,6 +196,7 @@ TRACEPOINT_PROBE(block, block_rq_issue) { bpf_probe_read_kernel(&rwbs, sizeof(rwbs), args->rwbs); // Determine if the operation is a read or write + // Are we ignoring other types like flush and sync? Or do they also trigger a write? if (rwbs[0] == 'R') { u64 *read_bytes = disk_read_bytes.lookup_or_try_init(&pid, &bytes); if (read_bytes) { @@ -174,4 +209,4 @@ TRACEPOINT_PROBE(block, block_rq_issue) { } } return 0; -} \ No newline at end of file +} diff --git a/powerletrics.py b/powerletrics.py index 9669b44..42a2dba 100755 --- a/powerletrics.py +++ b/powerletrics.py @@ -57,6 +57,8 @@ class PidComm(ctypes.Structure): config = configparser.ConfigParser() config.read('weights.conf') +## Explanation for the weights? + # Weights for each component CPU_WEIGHT = float(config['Weights'].get('CPU_WEIGHT', 1)) WAKEUP_WEIGHT = float(config['Weights'].get('WAKEUP_WEIGHT', 0)) @@ -77,6 +79,7 @@ class PidComm(ctypes.Structure): page_size = os.sysconf('SC_PAGE_SIZE') +# What does this object represent? A process? Maybe name it process then? class Data: def __init__(self): self.pid = -1 @@ -93,10 +96,10 @@ def __init__(self): self.is_kernel_thread = False def memory_usage_mb(self): - return self.memory_usage / (1024 * 1024) + return self.memory_usage / (1024 * 1024) # Think about defining a constant here? BYTES_TO_MB instead of function? def ebpf_memory_usage_mb(self): - return self.ebpf_memory_usage / (1024 * 1024) + return self.ebpf_memory_usage / (1024 * 1024) # Think about defining a constant here? BYTES_TO_MB instead of function? def cpu_utilization(self): if self.pid == 0: @@ -105,13 +108,16 @@ def cpu_utilization(self): idle_times = b.get_table("idle_time_ns") self.cpu_time_ns = sum([idle_times[cpu_id].value for cpu_id in idle_times.keys()]) - return (data.cpu_time_ns / (interval_ns * num_cpus)) * 100 + # What is data ? + return (data.cpu_time_ns / (interval_ns * num_cpus)) * 100 # Is this wall time or is this cycles? def energy_impact(self): + # I do not understand why the idle thread has no energy impact? Does it not have any of these metrics like wakeups? if self.pid == 0: # We can't really assign a value to the system being idle on one to n cores return 0 + # What is data? return (CPU_WEIGHT * data.cpu_utilization()) + \ (WAKEUP_WEIGHT * data.cpu_wakeups) + \ (DISK_WRITE_WEIGHT * data.disk_write_bytes) + \ @@ -129,8 +135,8 @@ def energy_impact(self): print("Starting powerletrics monitoring. Press Ctrl+C to stop.") while True: - start_loop_time = datetime.datetime.now() - time.sleep(sample_interval_sec) + start_loop_time = datetime.datetime.now() # maybe use a true monotonic timer here? Thinking leap seconds and daylight savings time ... + time.sleep(sample_interval_sec) # how granular does this sleep need to be? Maybe use better python timer? # Retrieve data from eBPF maps cpu_times = b.get_table("cpu_time_ns") @@ -149,7 +155,7 @@ def energy_impact(self): is_kernel_thread = False pid = pid_key.value - data = Data() + data = Data() # this would be nicer in an __init__ function as submitted params, but I see the issue with .value data.pid = pid data.cpu_time_ns = cpu_times[pid_key].value data.cpu_wakeups = wakeups[pid_key].value if pid_key in wakeups else 0 @@ -159,7 +165,7 @@ def energy_impact(self): data.disk_write_bytes = disk_writes[pid_key].value if pid_key in disk_writes else 0 pid_comm = pid_comm_map.get(ctypes.c_uint32(pid)) - if pid_comm: + if pid_comm: # can this not false to 0 ? data.comm = pid_comm.comm.decode('utf-8', 'replace') else: data.comm = "" @@ -187,6 +193,7 @@ def energy_impact(self): if not data.is_kernel_thread: try: + # I do not understand the limitation here ... why do it if it will not be useful? # We don't get the memory through ebpf in the default case as there is no way to iterate over all # processes in eBPF for security reasons. # If you want to still use eBPD you can enable it with --ebpf-memory. @@ -214,7 +221,7 @@ def energy_impact(self): data_list.sort(key=lambda x: x.energy_impact(), reverse=True) elapsed_time = datetime.datetime.now() - start_loop_time - elapsed_time_ms = elapsed_time.total_seconds() * 1000 + elapsed_time_ms = elapsed_time.total_seconds() * 1000 # Think about defining a constant here? FROM_S_TO_MS ? current_time = datetime.datetime.now().strftime("%a %b %d %H:%M:%S %Y %z") if args.format == 'text': @@ -327,13 +334,13 @@ def energy_impact(self): plist_data.append(data_dict) - if args.output_file: + if args.output_file: # is this appended? with open(args.output_file, 'wb') as f: plistlib.dump(plist_data, f, fmt=plistlib.FMT_XML) else: plistlib.dump(plist_data, sys.stdout.buffer, fmt=plistlib.FMT_XML) - cpu_times.clear() + cpu_times.clear() # What does this do exactly, given hat an eBPF structure is requested? wakeups.clear() rx_packets.clear() tx_packets.clear()