Subject: Provide CPU load measurement based on idle time
From: Carsten Emde <C.Emde@osadl.org>
Date: Sun, 22 Apr 2012 15:19:46 +0100

The standard method to measure CPU load has a number of problems (for
details refer to Documentation/cpu-load.txt). This patch adds an
additional CPU load measuring method that is based on idle time
accounting. The data are available for every CPU in /proc/cpuload/cpuN.
The counters can be reset by writing anything to /proc/cpuload/resetall
for all CPUs and to /proc/cpuload/cpuN/reset for a particular CPU,
respectively. The load value represents the average load since the most
recent reset. It may take up to a second after reset until the load data
reach their final precision.
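
For illustration only: the value in /proc/cpuload/cpuN/load is the run
time as a percentage of run time plus idle time accumulated since the
most recent reset. A hypothetical userspace sketch (not part of this
patch) that reads the load of CPU 0 and then restarts its measurement
could look like this:

  /* Hypothetical example, not part of the patch: print the load of
   * CPU 0 and reset its counters afterwards.
   */
  #include <stdio.h>

  int main(void)
  {
          char buf[32];
          FILE *f = fopen("/proc/cpuload/cpu0/load", "r");

          if (!f) {
                  perror("/proc/cpuload/cpu0/load");
                  return 1;
          }
          if (fgets(buf, sizeof(buf), f))
                  printf("cpu0 load since last reset: %s", buf);
          fclose(f);

          /* Writing anything to the reset file restarts the measurement. */
          f = fopen("/proc/cpuload/cpu0/reset", "w");
          if (!f) {
                  perror("/proc/cpuload/cpu0/reset");
                  return 1;
          }
          fputs("1\n", f);
          fclose(f);
          return 0;
  }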

Signed-off-by: Carsten Emde <C.Emde@osadl.org>

---
 include/linux/sched.h  |    8 +
 init/Kconfig           |   14 ++
 kernel/Makefile        |    1 
 kernel/sched.c         |   24 ++++
 kernel/sched_cpuload.c |  256 +++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 303 insertions(+)

Index: linux-3.2.46-rt67-32/include/linux/sched.h
===================================================================
--- linux-3.2.46-rt67-32.orig/include/linux/sched.h
+++ linux-3.2.46-rt67-32/include/linux/sched.h
@@ -2869,3 +2869,11 @@ static inline unsigned long rlimit_max(u
 #endif /* __KERNEL__ */
 
 #endif
+
+#ifdef CONFIG_IDLETIME_CPULOAD
+DECLARE_PER_CPU(unsigned long long, idlestart);
+DECLARE_PER_CPU(unsigned long long, idlestop);
+DECLARE_PER_CPU(unsigned long long, idletime);
+DECLARE_PER_CPU(unsigned long long, runtime);
+DECLARE_PER_CPU(raw_spinlock_t, cpuload_lock);
+#endif
Index: linux-3.2.46-rt67-32/init/Kconfig
===================================================================
--- linux-3.2.46-rt67-32.orig/init/Kconfig
+++ linux-3.2.46-rt67-32/init/Kconfig
@@ -302,6 +302,20 @@ config FHANDLE
          get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2)
          syscalls.
 
+config IDLETIME_CPULOAD
+       bool "Provide CPU load measurement based on idle time"
+       help
+         The standard method to measure CPU load has a number of problems (for
+         details refer to Documentation/cpu-load.txt). If you say Y here,
+         additional CPU load data will be provided that are based on idle
+         time. The data are available for every CPU in /proc/cpuload/cpuN. The
+         counters can be reset by writing anything to /proc/cpuload/resetall
+         for all CPUs and to /proc/cpuload/cpuN/reset for a particular CPU,
+         respectively. The load value represents the average load since the
+         most recent reset. Note that a small performance penalty
+         cannot be avoided when this additional CPU load calculation
+         is enabled.
+
 config TASKSTATS
        bool "Export task/process statistics through netlink (EXPERIMENTAL)"
        depends on NET
Index: linux-3.2.46-rt67-32/kernel/Makefile
===================================================================
--- linux-3.2.46-rt67-32.orig/kernel/Makefile
+++ linux-3.2.46-rt67-32/kernel/Makefile
@@ -107,6 +107,7 @@ obj-$(CONFIG_TRACEPOINTS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_IRQ_WORK) += irq_work.o
 obj-$(CONFIG_CPU_PM) += cpu_pm.o
+obj-$(CONFIG_IDLETIME_CPULOAD) += sched_cpuload.o
 
 obj-$(CONFIG_PERF_EVENTS) += events/
 
Index: linux-3.2.46-rt67-32/kernel/sched.c
===================================================================
--- linux-3.2.46-rt67-32.orig/kernel/sched.c
+++ linux-3.2.46-rt67-32/kernel/sched.c
@@ -3161,6 +3161,30 @@ prepare_task_switch(struct rq *rq, struc
        prepare_lock_switch(rq, next);
        prepare_arch_switch(next);
        trace_sched_switch(prev, next);
+
+#ifdef CONFIG_IDLETIME_CPULOAD
+       if (next->pid == 0) {
+               int cpu = raw_smp_processor_id();
+
+               raw_spin_lock(&per_cpu(cpuload_lock, cpu));
+               per_cpu(idlestart, cpu) = cpu_clock(cpu);
+               if (per_cpu(idlestop, cpu)) {
+                       per_cpu(runtime, cpu) +=
+                           per_cpu(idlestart, cpu) - per_cpu(idlestop, cpu);
+               }
+               raw_spin_unlock(&per_cpu(cpuload_lock, cpu));
+       } else if (prev->pid == 0) {
+               int cpu = raw_smp_processor_id();
+
+               raw_spin_lock(&per_cpu(cpuload_lock, cpu));
+               per_cpu(idlestop, cpu) = cpu_clock(cpu);
+               if (per_cpu(idlestart, cpu)) {
+                       per_cpu(idletime, cpu) +=
+                           per_cpu(idlestop, cpu) - per_cpu(idlestart, cpu);
+               }
+               raw_spin_unlock(&per_cpu(cpuload_lock, cpu));
+       }
+#endif
 }
 
 /**
Index: linux-3.2.46-rt67-32/kernel/sched_cpuload.c
===================================================================
--- /dev/null
+++ linux-3.2.46-rt67-32/kernel/sched_cpuload.c
@@ -0,0 +1,256 @@
+/*
+   sched_cpuload.c: calculate CPU load data that are derived from
+                    the idle time
+
+   Copyright (C) 2012 Carsten Emde <C.Emde@osadl.org>
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU General Public License
+   as published by the Free Software Foundation; either version 2
+   of the License, or (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
+*/
+
+#include <linux/sched.h>
+#include <linux/proc_fs.h>
+#include <linux/stringify.h>
+
+/*
+ * Total precision of percentage output
+ * If PRECISION is 5, for example, output will be "NNN.NN%"
+ */
+
+#define PRECISION 5
+#if PRECISION < 3
+#error PRECISION must not be smaller than 3
+#endif
+
+#if PRECISION == 3
+#define PRECFACTOR ((int)1E2)
+#elif PRECISION == 4
+#define PRECFACTOR ((int)1E3)
+#elif PRECISION == 5
+#define PRECFACTOR ((int)1E4)
+#elif PRECISION == 6
+#define PRECFACTOR ((int)1E5)
+#elif PRECISION == 7
+#define PRECFACTOR ((int)1E6)
+#else
+#error PRECISION must not be larger than 7
+#endif
+
+#define PRECFORMAT "%0"__stringify(PRECISION)"llu"
+#define MAXINTPRECISION 3
+#define DECPRECISION (PRECISION-MAXINTPRECISION)
+
+enum cpuload_action_index {
+       CPULOAD_IDLETIME,
+       CPULOAD_RUNTIME,
+       CPULOAD_LOAD,
+       CPULOAD_RESET,
+       CPULOAD_RESETALL,
+};
+
+DEFINE_PER_CPU(unsigned long long, idlestart);
+DEFINE_PER_CPU(unsigned long long, idlestop);
+DEFINE_PER_CPU(unsigned long long, idletime);
+DEFINE_PER_CPU(unsigned long long, runtime);
+DEFINE_PER_CPU(raw_spinlock_t, cpuload_lock);
+
+struct cpuload_data {
+       int cpu;
+       int action;
+};
+
+static DEFINE_PER_CPU(struct cpuload_data, cpuload_data_idletime);
+static DEFINE_PER_CPU(struct cpuload_data, cpuload_data_runtime);
+static DEFINE_PER_CPU(struct cpuload_data, cpuload_data_load);
+static DEFINE_PER_CPU(struct cpuload_data, cpuload_data_reset);
+static struct cpuload_data cpuload_data_resetall = {
+       .cpu = 0,
+       .action = CPULOAD_RESETALL,
+};
+
+static int show_cpuload(char *buf, char **start, off_t off,
+                       int count, int *eof, void *data)
+{
+       int r;
+       struct cpuload_data *cpuload_data = (struct cpuload_data *) data;
+       int cpu = cpuload_data->cpu;
+       unsigned long long now = cpu_clock(cpu);
+
+       raw_spin_lock(&per_cpu(cpuload_lock, cpu));
+
+       /* Update counters */
+       if (per_cpu(idlestart, cpu) > per_cpu(idlestop, cpu)) {
+               /* CPU is idle */
+               per_cpu(idletime, cpu) += now - per_cpu(idlestart, cpu);
+               per_cpu(idlestart, cpu) = now;
+       } else {
+               /* CPU is running */
+               per_cpu(runtime, cpu) += now - per_cpu(idlestop, cpu);
+               per_cpu(idlestop, cpu) = now;
+       }
+
+       switch (cpuload_data->action) {
+       case CPULOAD_IDLETIME:
+               r = snprintf(buf, count, "%llu\n",
+                   per_cpu(idletime, cpu));
+               raw_spin_unlock(&per_cpu(cpuload_lock, cpu));
+               break;
+
+       case CPULOAD_RUNTIME:
+               r = snprintf(buf, count, "%llu\n",
+                   per_cpu(runtime, cpu));
+               raw_spin_unlock(&per_cpu(cpuload_lock, cpu));
+               break;
+
+       case CPULOAD_LOAD: {
+               char str[8], *firstdigit;
+               int intdigits;
+               unsigned long long idletime1, runtime1, alltime;
+
+               idletime1 = per_cpu(idletime, cpu);
+               runtime1 = per_cpu(runtime, cpu);
+               raw_spin_unlock(&per_cpu(cpuload_lock, cpu));
+
+               alltime = idletime1 + runtime1;
+
+               if (alltime) {
+                       /* Fake precision, if measurement was just started */
+                       if (alltime < PRECFACTOR) {
+                               runtime1 *= PRECFACTOR;
+                               alltime *= PRECFACTOR;
+                       }
+
+                       /* Format examples: 100.00%, 1.66%, 0.12%, 0.00% */
+                       snprintf(str, sizeof(str), PRECFORMAT,
+                           div64_u64(runtime1, div_u64(alltime, PRECFACTOR)));
+
+                       firstdigit = str;
+                       while (*firstdigit == '0' && *firstdigit != '\0')
+                               firstdigit++;
+
+                       if (strlen(firstdigit) < DECPRECISION+1)
+                               firstdigit = str + MAXINTPRECISION-1;
+
+                       intdigits = strlen(firstdigit) - DECPRECISION;
+                       strncpy(buf, firstdigit, intdigits);
+                       buf[intdigits] = '.';
+                       strcpy(buf + intdigits + 1, firstdigit + intdigits);
+                       strcat(buf, "%\n");
+               } else
+                       strcpy(buf, "n.a.\n");
+
+               r = strlen(buf);
+               break;
+       }
+
+       default:
+               raw_spin_unlock(&per_cpu(cpuload_lock, cpu));
+               r = 0;
+               break;
+       }
+
+       *eof = 1;
+       return r;
+}
+
+static inline void reset_cpuload1(int cpu)
+{
+       per_cpu(idletime, cpu) = per_cpu(runtime, cpu) = 0;
+}
+
+static int reset_cpuload(struct file *file, const char __user *buffer,
+                          unsigned long count, void *data)
+{
+       struct cpuload_data *cpuload_data = (struct cpuload_data *) data;
+       int cpu = cpuload_data->cpu;
+
+       switch (cpuload_data->action) {
+       case CPULOAD_RESET:
+               reset_cpuload1(cpu);
+               break;
+
+       case CPULOAD_RESETALL:
+               for_each_online_cpu(cpu)
+                       reset_cpuload1(cpu);
+               break;
+       }
+       return count;
+}
+
+static int __init proc_cpuload_init(void)
+{
+       int cpu;
+       struct proc_dir_entry *root_cpuload_dir, *entry;
+
+       root_cpuload_dir = proc_mkdir("cpuload", NULL);
+       if (!root_cpuload_dir)
+               return 0;
+
+       entry = create_proc_entry("resetall", S_IWUGO, root_cpuload_dir);
+       if (entry) {
+               entry->write_proc = reset_cpuload;
+               entry->data = (void *) &cpuload_data_resetall;
+       }
+
+       for_each_possible_cpu(cpu) {
+               char name[32];
+               struct proc_dir_entry *cpuload_cpudir;
+
+               raw_spin_lock_init(&per_cpu(cpuload_lock, cpu));
+
+               snprintf(name, sizeof(name), "cpu%d", cpu);
+               cpuload_cpudir = proc_mkdir(name, root_cpuload_dir);
+               if (!cpuload_cpudir)
+                       return 0;
+
+               per_cpu(cpuload_data_idletime, cpu).cpu = cpu;
+               per_cpu(cpuload_data_idletime, cpu).action = CPULOAD_IDLETIME;
+               entry = create_proc_entry("idletime", S_IRUGO, cpuload_cpudir);
+               if (entry) {
+                       entry->read_proc = show_cpuload;
+                       entry->data = (void *)
+                           &per_cpu(cpuload_data_idletime, cpu);
+               }
+
+               per_cpu(cpuload_data_runtime, cpu).cpu = cpu;
+               per_cpu(cpuload_data_runtime, cpu).action = CPULOAD_RUNTIME;
+               entry = create_proc_entry("runtime", S_IRUGO, cpuload_cpudir);
+               if (entry) {
+                       entry->read_proc = show_cpuload;
+                       entry->data = (void *)
+                           &per_cpu(cpuload_data_runtime, cpu);
+               }
+
+               per_cpu(cpuload_data_load, cpu).cpu = cpu;
+               per_cpu(cpuload_data_load, cpu).action = CPULOAD_LOAD;
+               entry = create_proc_entry("load", S_IRUGO, cpuload_cpudir);
+               if (entry) {
+                       entry->read_proc = show_cpuload;
+                       entry->data = (void *)
+                           &per_cpu(cpuload_data_load, cpu);
+               }
+
+               per_cpu(cpuload_data_reset, cpu).cpu = cpu;
+               per_cpu(cpuload_data_reset, cpu).action = CPULOAD_RESET;
+               entry = create_proc_entry("reset", S_IWUGO, cpuload_cpudir);
+               if (entry) {
+                       entry->write_proc = reset_cpuload;
+                       entry->data = (void *)
+                           &per_cpu(cpuload_data_reset, cpu);
+               }
+       }
+       return 0;
+}
+
+module_init(proc_cpuload_init);