memcg: reduce memcg tree traversals for stats collection
authorShakeel Butt <shakeelb@google.com>
Wed, 22 Aug 2018 04:53:17 +0000 (21:53 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 22 Aug 2018 17:52:44 +0000 (10:52 -0700)
Currently cgroup-v1's memcg_stat_show traverses the memcg tree ~17 times
to collect the stats while cgroup-v2's memory_stat_show traverses the
memcg tree thrice.  On a large machine, a couple thousand memcgs is very
normal and if the churn is high and memcgs stick around during to several
reasons, tens of thousands of nodes in memcg tree can exist.  This patch
has refactored and shared the stat collection code between cgroup-v1 and
cgroup-v2 and has reduced the tree traversal to just one.

I ran a simple benchmark which reads the root_mem_cgroup's stat file
1000 times in the presense of 2500 memcgs on cgroup-v1. The results are:

Without the patch:
$ time ./read-root-stat-1000-times

real    0m1.663s
user    0m0.000s
sys     0m1.660s

With the patch:
$ time ./read-root-stat-1000-times

real    0m0.468s
user    0m0.000s
sys     0m0.467s

Link: http://lkml.kernel.org/r/20180724224635.143944-1-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Bruce Merry <bmerry@ska.ac.za>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
mm/memcontrol.c

index 6a921890739f7e1c0acad44d5916c5f43487a2a1..59c14c988143b2ce6cfe4b90fc6bf6c2ff564be5 100644 (file)
@@ -2899,29 +2899,34 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
        return retval;
 }
 
-static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
-{
-       struct mem_cgroup *iter;
-       int i;
-
-       memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
-
-       for_each_mem_cgroup_tree(iter, memcg) {
-               for (i = 0; i < MEMCG_NR_STAT; i++)
-                       stat[i] += memcg_page_state(iter, i);
-       }
-}
+struct accumulated_stats {
+       unsigned long stat[MEMCG_NR_STAT];
+       unsigned long events[NR_VM_EVENT_ITEMS];
+       unsigned long lru_pages[NR_LRU_LISTS];
+       const unsigned int *stats_array;
+       const unsigned int *events_array;
+       int stats_size;
+       int events_size;
+};
 
-static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
+static void accumulate_memcg_tree(struct mem_cgroup *memcg,
+                                 struct accumulated_stats *acc)
 {
-       struct mem_cgroup *iter;
+       struct mem_cgroup *mi;
        int i;
 
-       memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS);
+       for_each_mem_cgroup_tree(mi, memcg) {
+               for (i = 0; i < acc->stats_size; i++)
+                       acc->stat[i] += memcg_page_state(mi,
+                               acc->stats_array ? acc->stats_array[i] : i);
 
-       for_each_mem_cgroup_tree(iter, memcg) {
-               for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
-                       events[i] += memcg_sum_events(iter, i);
+               for (i = 0; i < acc->events_size; i++)
+                       acc->events[i] += memcg_sum_events(mi,
+                               acc->events_array ? acc->events_array[i] : i);
+
+               for (i = 0; i < NR_LRU_LISTS; i++)
+                       acc->lru_pages[i] +=
+                               mem_cgroup_nr_lru_pages(mi, BIT(i));
        }
 }
 
@@ -3332,6 +3337,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
        unsigned long memory, memsw;
        struct mem_cgroup *mi;
        unsigned int i;
+       struct accumulated_stats acc;
 
        BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
        BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
@@ -3364,32 +3370,27 @@ static int memcg_stat_show(struct seq_file *m, void *v)
                seq_printf(m, "hierarchical_memsw_limit %llu\n",
                           (u64)memsw * PAGE_SIZE);
 
-       for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
-               unsigned long long val = 0;
+       memset(&acc, 0, sizeof(acc));
+       acc.stats_size = ARRAY_SIZE(memcg1_stats);
+       acc.stats_array = memcg1_stats;
+       acc.events_size = ARRAY_SIZE(memcg1_events);
+       acc.events_array = memcg1_events;
+       accumulate_memcg_tree(memcg, &acc);
 
+       for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
                if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
                        continue;
-               for_each_mem_cgroup_tree(mi, memcg)
-                       val += memcg_page_state(mi, memcg1_stats[i]) *
-                       PAGE_SIZE;
-               seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val);
+               seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
+                          (u64)acc.stat[i] * PAGE_SIZE);
        }
 
-       for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) {
-               unsigned long long val = 0;
-
-               for_each_mem_cgroup_tree(mi, memcg)
-                       val += memcg_sum_events(mi, memcg1_events[i]);
-               seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val);
-       }
-
-       for (i = 0; i < NR_LRU_LISTS; i++) {
-               unsigned long long val = 0;
+       for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
+               seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
+                          (u64)acc.events[i]);
 
-               for_each_mem_cgroup_tree(mi, memcg)
-                       val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
-               seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
-       }
+       for (i = 0; i < NR_LRU_LISTS; i++)
+               seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
+                          (u64)acc.lru_pages[i] * PAGE_SIZE);
 
 #ifdef CONFIG_DEBUG_VM
        {
@@ -5486,8 +5487,7 @@ static int memory_events_show(struct seq_file *m, void *v)
 static int memory_stat_show(struct seq_file *m, void *v)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
-       unsigned long stat[MEMCG_NR_STAT];
-       unsigned long events[NR_VM_EVENT_ITEMS];
+       struct accumulated_stats acc;
        int i;
 
        /*
@@ -5501,66 +5501,62 @@ static int memory_stat_show(struct seq_file *m, void *v)
         * Current memory state:
         */
 
-       tree_stat(memcg, stat);
-       tree_events(memcg, events);
+       memset(&acc, 0, sizeof(acc));
+       acc.stats_size = MEMCG_NR_STAT;
+       acc.events_size = NR_VM_EVENT_ITEMS;
+       accumulate_memcg_tree(memcg, &acc);
 
        seq_printf(m, "anon %llu\n",
-                  (u64)stat[MEMCG_RSS] * PAGE_SIZE);
+                  (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
        seq_printf(m, "file %llu\n",
-                  (u64)stat[MEMCG_CACHE] * PAGE_SIZE);
+                  (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
        seq_printf(m, "kernel_stack %llu\n",
-                  (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
+                  (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
        seq_printf(m, "slab %llu\n",
-                  (u64)(stat[NR_SLAB_RECLAIMABLE] +
-                        stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
+                  (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
+                        acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
        seq_printf(m, "sock %llu\n",
-                  (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
+                  (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
 
        seq_printf(m, "shmem %llu\n",
-                  (u64)stat[NR_SHMEM] * PAGE_SIZE);
+                  (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
        seq_printf(m, "file_mapped %llu\n",
-                  (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE);
+                  (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
        seq_printf(m, "file_dirty %llu\n",
-                  (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE);
+                  (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
        seq_printf(m, "file_writeback %llu\n",
-                  (u64)stat[NR_WRITEBACK] * PAGE_SIZE);
+                  (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
 
-       for (i = 0; i < NR_LRU_LISTS; i++) {
-               struct mem_cgroup *mi;
-               unsigned long val = 0;
-
-               for_each_mem_cgroup_tree(mi, memcg)
-                       val += mem_cgroup_nr_lru_pages(mi, BIT(i));
-               seq_printf(m, "%s %llu\n",
-                          mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
-       }
+       for (i = 0; i < NR_LRU_LISTS; i++)
+               seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
+                          (u64)acc.lru_pages[i] * PAGE_SIZE);
 
        seq_printf(m, "slab_reclaimable %llu\n",
-                  (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
+                  (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
        seq_printf(m, "slab_unreclaimable %llu\n",
-                  (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
+                  (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
 
        /* Accumulated memory events */
 
-       seq_printf(m, "pgfault %lu\n", events[PGFAULT]);
-       seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
+       seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
+       seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
 
-       seq_printf(m, "pgrefill %lu\n", events[PGREFILL]);
-       seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] +
-                  events[PGSCAN_DIRECT]);
-       seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
-                  events[PGSTEAL_DIRECT]);
-       seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]);
-       seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
-       seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
-       seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
+       seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
+       seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
+                  acc.events[PGSCAN_DIRECT]);
+       seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
+                  acc.events[PGSTEAL_DIRECT]);
+       seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
+       seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
+       seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
+       seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
 
        seq_printf(m, "workingset_refault %lu\n",
-                  stat[WORKINGSET_REFAULT]);
+                  acc.stat[WORKINGSET_REFAULT]);
        seq_printf(m, "workingset_activate %lu\n",
-                  stat[WORKINGSET_ACTIVATE]);
+                  acc.stat[WORKINGSET_ACTIVATE]);
        seq_printf(m, "workingset_nodereclaim %lu\n",
-                  stat[WORKINGSET_NODERECLAIM]);
+                  acc.stat[WORKINGSET_NODERECLAIM]);
 
        return 0;
 }