Subject: Allow to disable particular sleep states
From: Carsten Emde <C.Emde@osadl.org>
Date: Fri, July 20 2012 16:12:29 +0100

Deeper sleep states reduce power consumption, but may lead to longer
latencies. In cases where both energy saving and deterministic
responsiveness are needed, it may be necessary to balance real-time
capabilities against power consumption. For this purpose, it is highly
desirable to disable particular sleep states on particular cores.

Such functionality was added for debugging purposes in later kernel
versions. This is a backport to the 3.2.x RT tree from:
3a53396b0381ec9d5180fd8fe7a681c8ce95fd9a (3.4)
dc7fd275ae60ef8edf952aff2a62462f5d892fd4 (3.6)
yet unqueued https://lkml.org/lkml/2012/7/19/388 (3.7)

Signed-off-by: Carsten Emde <C.Emde@osadl.org>

---
 Documentation/cpuidle/sysfs.txt    |   13 ++++++++++
 drivers/cpuidle/governors/ladder.c |    4 ++-
 drivers/cpuidle/governors/menu.c   |    8 ++++--
 drivers/cpuidle/sysfs.c            |   45 ++++++++++++++++++++++++++++++++++++-
 include/linux/cpuidle.h            |    1 
 5 files changed, 67 insertions(+), 4 deletions(-)

Index: linux-3.2.0/Documentation/cpuidle/sysfs.txt
===================================================================
@ linux-3.2.0/Documentation/cpuidle/sysfs.txt:39 @ drwxr-xr-x 2 root root 0 Feb  8 10:42 st
 /sys/devices/system/cpu/cpu0/cpuidle/state0:
 total 0
 -r--r--r-- 1 root root 4096 Feb  8 10:42 desc
+-rw-r--r-- 1 root root 4096 Feb  8 10:42 disable
 -r--r--r-- 1 root root 4096 Feb  8 10:42 latency
 -r--r--r-- 1 root root 4096 Feb  8 10:42 name
 -r--r--r-- 1 root root 4096 Feb  8 10:42 power
@ linux-3.2.0/Documentation/cpuidle/sysfs.txt:49 @ total 0
 /sys/devices/system/cpu/cpu0/cpuidle/state1:
 total 0
 -r--r--r-- 1 root root 4096 Feb  8 10:42 desc
+-rw-r--r-- 1 root root 4096 Feb  8 10:42 disable
 -r--r--r-- 1 root root 4096 Feb  8 10:42 latency
 -r--r--r-- 1 root root 4096 Feb  8 10:42 name
 -r--r--r-- 1 root root 4096 Feb  8 10:42 power
@ linux-3.2.0/Documentation/cpuidle/sysfs.txt:59 @ total 0
 /sys/devices/system/cpu/cpu0/cpuidle/state2:
 total 0
 -r--r--r-- 1 root root 4096 Feb  8 10:42 desc
+-rw-r--r-- 1 root root 4096 Feb  8 10:42 disable
 -r--r--r-- 1 root root 4096 Feb  8 10:42 latency
 -r--r--r-- 1 root root 4096 Feb  8 10:42 name
 -r--r--r-- 1 root root 4096 Feb  8 10:42 power
@ linux-3.2.0/Documentation/cpuidle/sysfs.txt:69 @ total 0
 /sys/devices/system/cpu/cpu0/cpuidle/state3:
 total 0
 -r--r--r-- 1 root root 4096 Feb  8 10:42 desc
+-rw-r--r-- 1 root root 4096 Feb  8 10:42 disable
 -r--r--r-- 1 root root 4096 Feb  8 10:42 latency
 -r--r--r-- 1 root root 4096 Feb  8 10:42 name
 -r--r--r-- 1 root root 4096 Feb  8 10:42 power
@ linux-3.2.0/Documentation/cpuidle/sysfs.txt:79 @ total 0
 
 
 * desc : Small description about the idle state (string)
+* disable : Option to disable this idle state (bool) -> see note below
 * latency : Latency to exit out of this idle state (in microseconds)
 * name : Name of the idle state (string)
 * power : Power consumed while in this idle state (in milliwatts)
 * time : Total time spent in this idle state (in microseconds)
 * usage : Number of times this state was entered (count)
+
+Note:
+The behavior and the effect of the disable variable depends on the
+implementation of a particular governor. In the ladder governor, for
+example, it is not coherent, i.e. if one is disabling a light state,
+then all deeper states are disabled as well, but the disable variable
+does not reflect it. Likewise, if one enables a deep state but a lighter
+state still is disabled, then this has no effect.
Index: linux-3.2.0/drivers/cpuidle/governors/ladder.c
===================================================================
--- linux-3.2.0.orig/drivers/cpuidle/governors/ladder.c
+++ linux-3.2.0/drivers/cpuidle/governors/ladder.c
@ linux-3.2.0/Documentation/cpuidle/sysfs.txt:91 @ static int ladder_select_state(struct cp
 
 	/* consider promotion */
 	if (last_idx < drv->state_count - 1 &&
+	    !dev->states_usage[last_idx + 1].disable &&
 	    last_residency > last_state->threshold.promotion_time &&
 	    drv->states[last_idx + 1].exit_latency <= latency_req) {
 		last_state->stats.promotion_count++;
@ linux-3.2.0/Documentation/cpuidle/sysfs.txt:104 @ static int ladder_select_state(struct cp
 
 	/* consider demotion */
 	if (last_idx > CPUIDLE_DRIVER_STATE_START &&
-	    drv->states[last_idx].exit_latency > latency_req) {
+	    (dev->states_usage[last_idx].disable ||
+	    drv->states[last_idx].exit_latency > latency_req)) {
 		int i;
 
 		for (i = last_idx - 1; i > CPUIDLE_DRIVER_STATE_START; i--) {
Index: linux-3.2.0/drivers/cpuidle/governors/menu.c
===================================================================
--- linux-3.2.0.orig/drivers/cpuidle/governors/menu.c
+++ linux-3.2.0/drivers/cpuidle/governors/menu.c
@ linux-3.2.0/Documentation/cpuidle/sysfs.txt:239 @ static int menu_select(struct cpuidle_dr
 {
 	struct menu_device *data = &__get_cpu_var(menu_devices);
 	int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY);
-	unsigned int power_usage = -1;
+	int power_usage = -1;
 	int i;
 	int multiplier;
 	struct timespec t;
@ linux-3.2.0/Documentation/cpuidle/sysfs.txt:283 @ static int menu_select(struct cpuidle_dr
 	 * We want to default to C1 (hlt), not to busy polling
 	 * unless the timer is happening really really soon.
 	 */
-	if (data->expected_us > 5)
+	if (data->expected_us > 5 &&
+		dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable == 0)
 		data->last_state_idx = CPUIDLE_DRIVER_STATE_START;
 
 	/*
@ linux-3.2.0/Documentation/cpuidle/sysfs.txt:293 @ static int menu_select(struct cpuidle_dr
 	 */
 	for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
 		struct cpuidle_state *s = &drv->states[i];
+		struct cpuidle_state_usage *su = &dev->states_usage[i];
 
+		if (su->disable)
+			continue;
 		if (s->target_residency > data->predicted_us)
 			continue;
 		if (s->exit_latency > latency_req)
Index: linux-3.2.0/drivers/cpuidle/sysfs.c
===================================================================
--- linux-3.2.0.orig/drivers/cpuidle/sysfs.c
+++ linux-3.2.0/drivers/cpuidle/sysfs.c
@ linux-3.2.0/Documentation/cpuidle/sysfs.txt:14 @
 #include <linux/sysfs.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
+#include <linux/capability.h>
 
 #include "cpuidle.h"
 
@ linux-3.2.0/Documentation/cpuidle/sysfs.txt:222 @ struct cpuidle_state_attr {
 	struct attribute attr;
 	ssize_t (*show)(struct cpuidle_state *, \
 					struct cpuidle_state_usage *, char *);
-	ssize_t (*store)(struct cpuidle_state *, const char *, size_t);
+	ssize_t (*store)(struct cpuidle_state *, \
+			struct cpuidle_state_usage *, const char *, size_t);
 };
 
 #define define_one_state_ro(_name, show) \
 static struct cpuidle_state_attr attr_##_name = __ATTR(_name, 0444, show, NULL)
 
+#define define_one_state_rw(_name, show, store) \
+static struct cpuidle_state_attr attr_##_name = __ATTR(_name, 0644, show, store)
+
 #define define_show_state_function(_name) \
 static ssize_t show_state_##_name(struct cpuidle_state *state, \
 			 struct cpuidle_state_usage *state_usage, char *buf) \
@ linux-3.2.0/Documentation/cpuidle/sysfs.txt:239 @ static ssize_t show_state_##_name(struct
 	return sprintf(buf, "%u\n", state->_name);\
 }
 
+#define define_store_state_ull_function(_name) \
+static ssize_t store_state_##_name(struct cpuidle_state *state, \
+		struct cpuidle_state_usage *state_usage, \
+		const char *buf, size_t size) \
+{ \
+	unsigned long long value; \
+	int err; \
+	if (!capable(CAP_SYS_ADMIN)) \
+		return -EPERM; \
+	err = kstrtoull(buf, 0, &value); \
+	if (err) \
+		return err; \
+	if (value) \
+		state_usage->_name = 1; \
+	else \
+		state_usage->_name = 0; \
+	return size; \
+}
+
 #define define_show_state_ull_function(_name) \
 static ssize_t show_state_##_name(struct cpuidle_state *state, \
 			struct cpuidle_state_usage *state_usage, char *buf) \
@ linux-3.2.0/Documentation/cpuidle/sysfs.txt:280 @ define_show_state_ull_function(usage)
 define_show_state_ull_function(time)
 define_show_state_str_function(name)
 define_show_state_str_function(desc)
+define_show_state_ull_function(disable)
+define_store_state_ull_function(disable)
 
 define_one_state_ro(name, show_state_name);
 define_one_state_ro(desc, show_state_desc);
@ linux-3.2.0/Documentation/cpuidle/sysfs.txt:289 @ define_one_state_ro(latency, show_state_
 define_one_state_ro(power, show_state_power_usage);
 define_one_state_ro(usage, show_state_usage);
 define_one_state_ro(time, show_state_time);
+define_one_state_rw(disable, show_state_disable, store_state_disable);
 
 static struct attribute *cpuidle_state_default_attrs[] = {
 	&attr_name.attr,
@ linux-3.2.0/Documentation/cpuidle/sysfs.txt:298 @ static struct attribute *cpuidle_state_d
 	&attr_power.attr,
 	&attr_usage.attr,
 	&attr_time.attr,
+	&attr_disable.attr,
 	NULL
 };
 
@ linux-3.2.0/Documentation/cpuidle/sysfs.txt:320 @ static ssize_t cpuidle_state_show(struct
 	return ret;
 }
 
+static ssize_t cpuidle_state_store(struct kobject *kobj,
+	struct attribute *attr, const char *buf, size_t size)
+{
+	int ret = -EIO;
+	struct cpuidle_state *state = kobj_to_state(kobj);
+	struct cpuidle_state_usage *state_usage = kobj_to_state_usage(kobj);
+	struct cpuidle_state_attr *cattr = attr_to_stateattr(attr);
+
+	if (cattr->store)
+		ret = cattr->store(state, state_usage, buf, size);
+
+	return ret;
+}
+
 static const struct sysfs_ops cpuidle_state_sysfs_ops = {
 	.show = cpuidle_state_show,
+	.store = cpuidle_state_store,
 };
 
 static void cpuidle_state_sysfs_release(struct kobject *kobj)
Index: linux-3.2.0/include/linux/cpuidle.h
===================================================================
--- linux-3.2.0.orig/include/linux/cpuidle.h
+++ linux-3.2.0/include/linux/cpuidle.h
@ linux-3.2.0/Documentation/cpuidle/sysfs.txt:36 @ struct cpuidle_driver;
 struct cpuidle_state_usage {
 	void		*driver_data;
 
+	unsigned long long	disable;
 	unsigned long long	usage;
 	unsigned long long	time; /* in US */
 };