From cc1790cf541553263bda024295d7600c7cd7c45d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 21 May 2015 10:57:17 +0200 Subject: [PATCH] perf/x86: Improve HT workaround GP counter constraint The (SNB/IVB/HSW) HT bug only affects events that can be programmed onto GP counters, therefore we should only limit the number of GP counters that can be used per cpu -- iow we should not constrain the FP counters. Furthermore, we should only enfore such a limit when there are in fact exclusive events being scheduled on either sibling. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner [ Fixed build fail for the !CONFIG_CPU_SUP_INTEL case. ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 36 +++++++++++++++---- arch/x86/kernel/cpu/perf_event.h | 15 ++++++-- arch/x86/kernel/cpu/perf_event_intel.c | 30 ++++++---------- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 +- 4 files changed, 53 insertions(+), 30 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1664eeea65e0..2eca19422454 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -611,6 +611,7 @@ struct sched_state { int event; /* event index */ int counter; /* counter index */ int unassigned; /* number of events to be assigned left */ + int nr_gp; /* number of GP counters used */ unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; }; @@ -620,9 +621,10 @@ struct sched_state { struct perf_sched { int max_weight; int max_events; + int max_gp; + int saved_states; struct event_constraint **constraints; struct sched_state state; - int saved_states; struct sched_state saved[SCHED_STATES_MAX]; }; @@ -630,13 +632,14 @@ struct perf_sched { * Initialize interator that runs through all events and counters. */ static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints, - int num, int wmin, int wmax) + int num, int wmin, int wmax, int gpmax) { int idx; memset(sched, 0, sizeof(*sched)); sched->max_events = num; sched->max_weight = wmax; + sched->max_gp = gpmax; sched->constraints = constraints; for (idx = 0; idx < num; idx++) { @@ -696,11 +699,16 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) goto done; } } + /* Grab the first unused counter starting with idx */ idx = sched->state.counter; for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { - if (!__test_and_set_bit(idx, sched->state.used)) + if (!__test_and_set_bit(idx, sched->state.used)) { + if (sched->state.nr_gp++ >= sched->max_gp) + return false; + goto done; + } } return false; @@ -757,11 +765,11 @@ static bool perf_sched_next_event(struct perf_sched *sched) * Assign a counter for each event. */ int perf_assign_events(struct event_constraint **constraints, int n, - int wmin, int wmax, int *assign) + int wmin, int wmax, int gpmax, int *assign) { struct perf_sched sched; - perf_sched_init(&sched, constraints, n, wmin, wmax); + perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax); do { if (!perf_sched_find_counter(&sched)) @@ -822,8 +830,24 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) /* slow path */ if (i != n) { + int gpmax = x86_pmu.num_counters; + + /* + * Do not allow scheduling of more than half the available + * generic counters. + * + * This helps avoid counter starvation of sibling thread by + * ensuring at most half the counters cannot be in exclusive + * mode. There is no designated counters for the limits. Any + * N/2 counters can be used. This helps with events with + * specific counter constraints. + */ + if (is_ht_workaround_enabled() && !cpuc->is_fake && + READ_ONCE(cpuc->excl_cntrs->exclusive_present)) + gpmax /= 2; + unsched = perf_assign_events(cpuc->event_constraint, n, wmin, - wmax, assign); + wmax, gpmax, assign); } /* diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index fdfaab7c5e55..ef78516850fb 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -74,6 +74,7 @@ struct event_constraint { #define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */ #define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */ #define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ +#define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */ struct amd_nb { @@ -134,8 +135,6 @@ enum intel_excl_state_type { struct intel_excl_states { enum intel_excl_state_type init_state[X86_PMC_IDX_MAX]; enum intel_excl_state_type state[X86_PMC_IDX_MAX]; - int num_alloc_cntrs;/* #counters allocated */ - int max_alloc_cntrs;/* max #counters allowed */ bool sched_started; /* true if scheduling has started */ }; @@ -144,6 +143,11 @@ struct intel_excl_cntrs { struct intel_excl_states states[2]; + union { + u16 has_exclusive[2]; + u32 exclusive_present; + }; + int refcnt; /* per-core: #HT threads */ unsigned core_id; /* per-core: core id */ }; @@ -176,6 +180,7 @@ struct cpu_hw_events { struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ struct event_constraint *event_constraint[X86_PMC_IDX_MAX]; + int n_excl; /* the number of exclusive events */ unsigned int group_flag; int is_fake; @@ -719,7 +724,7 @@ static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, void x86_pmu_enable_all(int added); int perf_assign_events(struct event_constraint **constraints, int n, - int wmin, int wmax, int *assign); + int wmin, int wmax, int gpmax, int *assign); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); void x86_pmu_stop(struct perf_event *event, int flags); @@ -930,4 +935,8 @@ static inline struct intel_shared_regs *allocate_shared_regs(int cpu) return NULL; } +static inline int is_ht_workaround_enabled(void) +{ + return 0; +} #endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 7a58fb5df15c..a1e35c9f06b9 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1923,7 +1923,6 @@ intel_start_scheduling(struct cpu_hw_events *cpuc) xl = &excl_cntrs->states[tid]; xl->sched_started = true; - xl->num_alloc_cntrs = 0; /* * lock shared state until we are done scheduling * in stop_event_scheduling() @@ -2000,6 +1999,11 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * across HT threads */ is_excl = c->flags & PERF_X86_EVENT_EXCL; + if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) { + event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT; + if (!cpuc->n_excl++) + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1); + } /* * xl = state of current HT @@ -2008,18 +2012,6 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, xl = &excl_cntrs->states[tid]; xlo = &excl_cntrs->states[o_tid]; - /* - * do not allow scheduling of more than max_alloc_cntrs - * which is set to half the available generic counters. - * this helps avoid counter starvation of sibling thread - * by ensuring at most half the counters cannot be in - * exclusive mode. There is not designated counters for the - * limits. Any N/2 counters can be used. This helps with - * events with specifix counter constraints - */ - if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs) - return &emptyconstraint; - cx = c; /* @@ -2150,6 +2142,11 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, xl = &excl_cntrs->states[tid]; xlo = &excl_cntrs->states[o_tid]; + if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) { + hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT; + if (!--cpuc->n_excl) + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 0); + } /* * put_constraint may be called from x86_schedule_events() @@ -2632,8 +2629,6 @@ static void intel_pmu_cpu_starting(int cpu) cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) { - int h = x86_pmu.num_counters >> 1; - for_each_cpu(i, topology_thread_cpumask(cpu)) { struct intel_excl_cntrs *c; @@ -2647,11 +2642,6 @@ static void intel_pmu_cpu_starting(int cpu) } cpuc->excl_cntrs->core_id = core_id; cpuc->excl_cntrs->refcnt++; - /* - * set hard limit to half the number of generic counters - */ - cpuc->excl_cntrs->states[0].max_alloc_cntrs = h; - cpuc->excl_cntrs->states[1].max_alloc_cntrs = h; } } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index ec2ba578d286..dd319e59246b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -395,7 +395,7 @@ static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int /* slow path */ if (i != n) ret = perf_assign_events(box->event_constraint, n, - wmin, wmax, assign); + wmin, wmax, n, assign); if (!assign || ret) { for (i = 0; i < n; i++) -- 2.30.2