x86/intel_rdt: Limit C-states dynamically when pseudo-locking active
author		Reinette Chatre <reinette.chatre@intel.com>
		Fri, 22 Jun 2018 22:42:30 +0000 (15:42 -0700)
committer	Thomas Gleixner <tglx@linutronix.de>
		Sun, 24 Jun 2018 13:35:48 +0000 (15:35 +0200)
Deeper C-states impact cache content by shrinking the cache or flushing
the entire cache to memory before reducing power to the cache. Deeper
C-states will thus negatively impact pseudo-locked regions.

To avoid impacting pseudo-locked regions, C-states are limited on
pseudo-locked region creation so that cores associated with the
pseudo-locked region are prevented from entering deeper C-states.
This is accomplished by requesting a CPU latency target that prevents
the core from entering C6 across all supported platforms.
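
As a quick sanity check (not part of this patch), the effect can be
observed from user space through the cpuidle statistics in sysfs: once
a pseudo-locked region exists, the usage counters of the deeper idle
states on the affected cores should stop increasing. A minimal sketch,
assuming the standard cpuidle sysfs layout and cpu0 as an affected
core:

        #include <stdio.h>

        int main(void)
        {
                char path[128], name[32];
                unsigned long long usage;
                int state;

                for (state = 0; ; state++) {
                        FILE *f;

                        snprintf(path, sizeof(path),
                                 "/sys/devices/system/cpu/cpu0/cpuidle/state%d/name",
                                 state);
                        f = fopen(path, "r");
                        if (!f)
                                break;  /* no more idle states */
                        fscanf(f, "%31s", name);
                        fclose(f);

                        snprintf(path, sizeof(path),
                                 "/sys/devices/system/cpu/cpu0/cpuidle/state%d/usage",
                                 state);
                        f = fopen(path, "r");
                        if (!f)
                                break;
                        fscanf(f, "%llu", &usage);
                        fclose(f);

                        /* Sample twice over an interval: deep states
                         * (e.g. C6) should show no new entries while
                         * the pseudo-locked region exists. */
                        printf("%s: entered %llu times\n", name, usage);
                }
                return 0;
        }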

Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: fenghua.yu@intel.com
Cc: tony.luck@intel.com
Cc: vikas.shivappa@linux.intel.com
Cc: gavin.hindman@intel.com
Cc: jithu.joseph@intel.com
Cc: dave.hansen@intel.com
Cc: hpa@zytor.com
Link: https://lkml.kernel.org/r/1ef4f99dd6ba12fa6fb44c5a1141e75f952b9cd9.1529706536.git.reinette.chatre@intel.com
Documentation/x86/intel_rdt_ui.txt
arch/x86/kernel/cpu/intel_rdt.h
arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c

diff --git a/Documentation/x86/intel_rdt_ui.txt b/Documentation/x86/intel_rdt_ui.txt
index bcd0a6d2fcf88fabfb33faf1d7bad5b8783aca00..acac30b67c62547acef60a95a83ce11e96bd4ddf 100644
--- a/Documentation/x86/intel_rdt_ui.txt
+++ b/Documentation/x86/intel_rdt_ui.txt
@@ -461,8 +461,8 @@ in the cache via carefully configuring the CAT feature and controlling
 application behavior. There is no guarantee that data is placed in
 cache. Instructions like INVD, WBINVD, CLFLUSH, etc. can still evict
 "locked" data from cache. Power management C-states may shrink or
-power off cache. It is thus recommended to limit the processor maximum
-C-state, for example, by setting the processor.max_cstate kernel parameter.
+power off cache. Deeper C-states will automatically be restricted on
+pseudo-locked region creation.
 
 It is required that an application using a pseudo-locked region runs
 with affinity to the cores (or a subset of the cores) associated
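
For illustration (not part of the patch): an application could
establish the required affinity with sched_setaffinity() before
mapping the pseudo-locked region. The core number below is a
placeholder; the actual cores are those listed in the resctrl group's
"cpus" file.

        #define _GNU_SOURCE
        #include <sched.h>
        #include <stdio.h>

        int main(void)
        {
                cpu_set_t set;

                CPU_ZERO(&set);
                CPU_SET(2, &set);       /* placeholder: a core of the region's cache domain */
                if (sched_setaffinity(0, sizeof(set), &set)) {
                        perror("sched_setaffinity");
                        return 1;
                }
                /* ... now open and mmap() the region's character device ... */
                return 0;
        }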
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index b8e490a4329080e73835ce4f62da48dec98fd449..2d9cbb9d7a58d233b1feb44ca9e20db40eb43cc5 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -142,6 +142,7 @@ struct mongroup {
  *                     region
  * @debugfs_dir:       pointer to this region's directory in the debugfs
  *                     filesystem
+ * @pm_reqs:           Power management QoS requests related to this region
  */
 struct pseudo_lock_region {
        struct rdt_resource     *r;
@@ -155,6 +156,7 @@ struct pseudo_lock_region {
        void                    *kmem;
        unsigned int            minor;
        struct dentry           *debugfs_dir;
+       struct list_head        pm_reqs;
 };
 
 /**
diff --git a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c
index dd1341557c9d1176903a8c59b326769f6e2629aa..6e83f61552a5ea54cd6ab913a9e65b08d5785740 100644
--- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c
+++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c
@@ -17,6 +17,7 @@
 #include <linux/debugfs.h>
 #include <linux/kthread.h>
 #include <linux/mman.h>
+#include <linux/pm_qos.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 
@@ -175,6 +176,76 @@ static struct rdtgroup *region_find_by_minor(unsigned int minor)
        return rdtgrp_match;
 }
 
+/**
+ * pseudo_lock_pm_req - A power management QoS request list entry
+ * @list:      Entry within the @pm_reqs list for a pseudo-locked region
+ * @req:       PM QoS request
+ */
+struct pseudo_lock_pm_req {
+       struct list_head list;
+       struct dev_pm_qos_request req;
+};
+
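+/*
+ * pseudo_lock_cstates_relax - Release the C-state constraints of @plr by
+ * removing each PM QoS request added on the region's behalf and freeing
+ * the list entries.
+ */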
+static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
+{
+       struct pseudo_lock_pm_req *pm_req, *next;
+
+       list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) {
+               dev_pm_qos_remove_request(&pm_req->req);
+               list_del(&pm_req->list);
+               kfree(pm_req);
+       }
+}
+
+/**
+ * pseudo_lock_cstates_constrain - Restrict cores from entering C6
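+ * @plr: Pseudo-locked region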
+ *
+ * To prevent the cache from being affected by power management, entering
+ * C6 has to be avoided. This is accomplished by requesting a latency
+ * requirement lower than the lowest C6 exit latency of all supported
+ * platforms as found in the cpuidle state tables in the intel_idle driver.
+ * At this time it is possible to do so with a single latency requirement
+ * for all supported platforms.
+ *
+ * Since Goldmont, which is affected by X86_BUG_MONITOR, is supported,
+ * the ACPI latencies need to be considered while keeping in mind that C2
+ * may be set to map to deeper sleep states. In that case the latency
+ * requirement needs to prevent entering C2 also.
+ */
+static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
+{
+       struct pseudo_lock_pm_req *pm_req;
+       int cpu;
+       int ret;
+
+       for_each_cpu(cpu, &plr->d->cpu_mask) {
+               pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL);
+               if (!pm_req) {
+                       rdt_last_cmd_puts("fail allocating mem for PM QoS\n");
+                       ret = -ENOMEM;
+                       goto out_err;
+               }
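+               /*
+                * A 30 usec latency limit is below the C6 exit latency
+                * of all supported platforms (see comment above), so
+                * the cpuidle governor will not select C6 or deeper.
+                */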
+               ret = dev_pm_qos_add_request(get_cpu_device(cpu),
+                                            &pm_req->req,
+                                            DEV_PM_QOS_RESUME_LATENCY,
+                                            30);
+               if (ret < 0) {
+                       rdt_last_cmd_printf("fail to add latency req cpu%d\n",
+                                           cpu);
+                       kfree(pm_req);
+                       ret = -1;
+                       goto out_err;
+               }
+               list_add(&pm_req->list, &plr->pm_reqs);
+       }
+
+       return 0;
+
+out_err:
+       pseudo_lock_cstates_relax(plr);
+       return ret;
+}
+
 /**
  * pseudo_lock_region_init - Initialize pseudo-lock region information
  * @plr: pseudo-lock region
@@ -242,6 +313,7 @@ static int pseudo_lock_init(struct rdtgroup *rdtgrp)
                return -ENOMEM;
 
        init_waitqueue_head(&plr->lock_thread_wq);
+       INIT_LIST_HEAD(&plr->pm_reqs);
        rdtgrp->plr = plr;
        return 0;
 }
@@ -1135,6 +1207,12 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
        if (ret < 0)
                return ret;
 
+       ret = pseudo_lock_cstates_constrain(plr);
+       if (ret < 0) {
+               ret = -EINVAL;
+               goto out_region;
+       }
+
        plr->thread_done = 0;
 
        thread = kthread_create_on_node(pseudo_lock_fn, rdtgrp,
@@ -1143,7 +1221,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
        if (IS_ERR(thread)) {
                ret = PTR_ERR(thread);
                rdt_last_cmd_printf("locking thread returned error %d\n", ret);
-               goto out_region;
+               goto out_cstates;
        }
 
        kthread_bind(thread, plr->cpu);
@@ -1161,7 +1239,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
                 * empty pseudo-locking loop.
                 */
                rdt_last_cmd_puts("locking thread interrupted\n");
-               goto out_region;
+               goto out_cstates;
        }
 
        if (!IS_ERR_OR_NULL(debugfs_resctrl)) {
@@ -1222,6 +1300,8 @@ out_minor:
        pseudo_lock_minor_release(new_minor);
 out_debugfs:
        debugfs_remove_recursive(plr->debugfs_dir);
+out_cstates:
+       pseudo_lock_cstates_relax(plr);
 out_region:
        pseudo_lock_region_clear(plr);
 out:
@@ -1255,6 +1335,7 @@ void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp)
                goto free;
        }
 
+       pseudo_lock_cstates_relax(plr);
        debugfs_remove_recursive(rdtgrp->plr->debugfs_dir);
        device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor));
        pseudo_lock_minor_release(plr->minor);