mm: memcg/slab: rework non-root kmem_cache lifecycle management

author Roman Gushchin <guro@fb.com>

Fri, 12 Jul 2019 03:56:27 +0000 (20:56 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 12 Jul 2019 18:05:44 +0000 (11:05 -0700)
author Roman Gushchin <guro@fb.com>
Fri, 12 Jul 2019 03:56:27 +0000 (20:56 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 12 Jul 2019 18:05:44 +0000 (11:05 -0700)
diff --git a/include/linux/slab.h b/include/linux/slab.h

index 6008d884e6210d307802c56f66f94939266e99e2..bc189a43e680fe3fb6cc36a3ca6cff47da2adf0c 100644 (file)
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -16,6 +16,7 @@
  #include <linux/overflow.h>
  #include <linux/types.h>
  #include <linux/workqueue.h>
+#include <linux/percpu-refcount.h>
  
  
  /*
@@ -152,7 +153,6 @@ int kmem_cache_shrink(struct kmem_cache *);
  
  void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *);
  void memcg_deactivate_kmem_caches(struct mem_cgroup *);
-void memcg_destroy_kmem_caches(struct mem_cgroup *);
  
  /*
   * Please use this macro to create slab caches. Simply specify the
@@ -642,6 +642,7 @@ struct memcg_cache_params {
                         struct mem_cgroup *memcg;
                         struct list_head children_node;
                         struct list_head kmem_caches_node;
+                       struct percpu_ref refcnt;
  
                         void (*work_fn)(struct kmem_cache *);
                         union {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 25e35a8b8ba296b5f9c49f74d8c0f450ecc6ac8a..ce4ce5e7937b9bcf614d66a0c9b68968f6eabf66 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2667,12 +2667,13 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
  {
         struct memcg_kmem_cache_create_work *cw;
  
+       if (!css_tryget_online(&memcg->css))
+               return;
+
         cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
         if (!cw)
                 return;
  
-       css_get(&memcg->css);
-
         cw->memcg = memcg;
         cw->cachep = cachep;
         INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
@@ -2707,6 +2708,7 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
  {
         struct mem_cgroup *memcg;
         struct kmem_cache *memcg_cachep;
+       struct memcg_cache_array *arr;
         int kmemcg_id;
  
         VM_BUG_ON(!is_root_cache(cachep));
@@ -2714,14 +2716,28 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
         if (memcg_kmem_bypass())
                 return cachep;
  
-       memcg = get_mem_cgroup_from_current();
+       rcu_read_lock();
+
+       if (unlikely(current->active_memcg))
+               memcg = current->active_memcg;
+       else
+               memcg = mem_cgroup_from_task(current);
+
+       if (!memcg || memcg == root_mem_cgroup)
+               goto out_unlock;
+
         kmemcg_id = READ_ONCE(memcg->kmemcg_id);
         if (kmemcg_id < 0)
-               goto out;
+               goto out_unlock;
+
+       arr = rcu_dereference(cachep->memcg_params.memcg_caches);
  
-       memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
-       if (likely(memcg_cachep))
-               return memcg_cachep;
+       /*
+        * Make sure we will access the up-to-date value. The code updating
+        * memcg_caches issues a write barrier to match the data dependency
+        * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
+        */
+       memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
  
         /*
          * If we are in a safe context (can wait, and not in interrupt
@@ -2734,10 +2750,20 @@ struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
          * memcg_create_kmem_cache, this means no further allocation
          * could happen with the slab_mutex held. So it's better to
          * defer everything.
+        *
+        * If the memcg is dying or memcg_cache is about to be released,
+        * don't bother creating new kmem_caches. Because memcg_cachep
+        * is ZEROed as the first step of kmem offlining, we don't need
+        * percpu_ref_tryget_live() here. css_tryget_online() check in
+        * memcg_schedule_kmem_cache_create() will prevent us from
+        * creation of a new kmem_cache.
          */
-       memcg_schedule_kmem_cache_create(memcg, cachep);
-out:
-       css_put(&memcg->css);
+       if (unlikely(!memcg_cachep))
+               memcg_schedule_kmem_cache_create(memcg, cachep);
+       else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
+               cachep = memcg_cachep;
+out_unlock:
+       rcu_read_unlock();
         return cachep;
  }
  
@@ -2748,7 +2774,7 @@ out:
  void memcg_kmem_put_cache(struct kmem_cache *cachep)
  {
         if (!is_root_cache(cachep))
-               css_put(&cachep->memcg_params.memcg->css);
+               percpu_ref_put(&cachep->memcg_params.refcnt);
  }
  
  /**
@@ -3295,7 +3321,7 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
                 memcg_offline_kmem(memcg);
  
         if (memcg->kmem_state == KMEM_ALLOCATED) {
-               memcg_destroy_kmem_caches(memcg);
+               WARN_ON(!list_empty(&memcg->kmem_caches));
                 static_branch_dec(&memcg_kmem_enabled_key);
                 WARN_ON(page_counter_read(&memcg->kmem));
         }
diff --git a/mm/slab.h b/mm/slab.h

index 46623a576a3c295303531c8ff554d6bcec345e81..5d2b8511e6fb39e272e1ce22338b0c90788849b9 100644 (file)
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -248,31 +248,6 @@ static inline const char *cache_name(struct kmem_cache *s)
         return s->name;
  }
  
-/*
- * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
- * That said the caller must assure the memcg's cache won't go away by either
- * taking a css reference to the owner cgroup, or holding the slab_mutex.
- */
-static inline struct kmem_cache *
-cache_from_memcg_idx(struct kmem_cache *s, int idx)
-{
-       struct kmem_cache *cachep;
-       struct memcg_cache_array *arr;
-
-       rcu_read_lock();
-       arr = rcu_dereference(s->memcg_params.memcg_caches);
-
-       /*
-        * Make sure we will access the up-to-date value. The code updating
-        * memcg_caches issues a write barrier to match this (see
-        * memcg_create_kmem_cache()).
-        */
-       cachep = READ_ONCE(arr->entries[idx]);
-       rcu_read_unlock();
-
-       return cachep;
-}
-
  static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
  {
         if (is_root_cache(s))
@@ -284,14 +259,25 @@ static __always_inline int memcg_charge_slab(struct page *page,
                                              gfp_t gfp, int order,
                                              struct kmem_cache *s)
  {
+       int ret;
+
         if (is_root_cache(s))
                 return 0;
-       return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
+
+       ret = memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
+       if (ret)
+               return ret;
+
+       percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
+
+       return 0;
  }
  
  static __always_inline void memcg_uncharge_slab(struct page *page, int order,
                                                 struct kmem_cache *s)
  {
+       if (!is_root_cache(s))
+               percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order);
         memcg_kmem_uncharge(page, order);
  }
  
@@ -323,12 +309,6 @@ static inline const char *cache_name(struct kmem_cache *s)
         return s->name;
  }
  
-static inline struct kmem_cache *
-cache_from_memcg_idx(struct kmem_cache *s, int idx)
-{
-       return NULL;
-}
-
  static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
  {
         return s;
diff --git a/mm/slab_common.c b/mm/slab_common.c

index a15557776d7d13a90069d6bd3a074f938c893606..ee3971f7fabce18607b5408349634d593a40859d 100644 (file)
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -132,6 +132,8 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
  LIST_HEAD(slab_root_caches);
  static DEFINE_SPINLOCK(memcg_kmem_wq_lock);
  
+static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref);
+
  void slab_init_memcg_params(struct kmem_cache *s)
  {
         s->memcg_params.root_cache = NULL;
@@ -146,6 +148,12 @@ static int init_memcg_params(struct kmem_cache *s,
         struct memcg_cache_array *arr;
  
         if (root_cache) {
+               int ret = percpu_ref_init(&s->memcg_params.refcnt,
+                                         kmemcg_cache_shutdown,
+                                         0, GFP_KERNEL);
+               if (ret)
+                       return ret;
+
                 s->memcg_params.root_cache = root_cache;
                 INIT_LIST_HEAD(&s->memcg_params.children_node);
                 INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
@@ -171,6 +179,8 @@ static void destroy_memcg_params(struct kmem_cache *s)
  {
         if (is_root_cache(s))
                 kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
+       else
+               percpu_ref_exit(&s->memcg_params.refcnt);
  }
  
  static void free_memcg_params(struct rcu_head *rcu)
@@ -226,6 +236,7 @@ void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg)
         if (is_root_cache(s)) {
                 list_add(&s->root_caches_node, &slab_root_caches);
         } else {
+               css_get(&memcg->css);
                 s->memcg_params.memcg = memcg;
                 list_add(&s->memcg_params.children_node,
                          &s->memcg_params.root_cache->memcg_params.children);
@@ -241,6 +252,7 @@ static void memcg_unlink_cache(struct kmem_cache *s)
         } else {
                 list_del(&s->memcg_params.children_node);
                 list_del(&s->memcg_params.kmem_caches_node);
+               css_put(&s->memcg_params.memcg->css);
         }
  }
  #else
@@ -678,7 +690,7 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
         }
  
         /*
-        * Since readers won't lock (see cache_from_memcg_idx()), we need a
+        * Since readers won't lock (see memcg_kmem_get_cache()), we need a
          * barrier here to ensure nobody will see the kmem_cache partially
          * initialized.
          */
@@ -701,16 +713,11 @@ static void kmemcg_workfn(struct work_struct *work)
         get_online_mems();
  
         mutex_lock(&slab_mutex);
-
         s->memcg_params.work_fn(s);
-
         mutex_unlock(&slab_mutex);
  
         put_online_mems();
         put_online_cpus();
-
-       /* done, put the ref from kmemcg_cache_deactivate() */
-       css_put(&s->memcg_params.memcg->css);
  }
  
  static void kmemcg_rcufn(struct rcu_head *head)
@@ -727,10 +734,38 @@ static void kmemcg_rcufn(struct rcu_head *head)
         queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
  }
  
+static void kmemcg_cache_shutdown_fn(struct kmem_cache *s)
+{
+       WARN_ON(shutdown_cache(s));
+}
+
+static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref)
+{
+       struct kmem_cache *s = container_of(percpu_ref, struct kmem_cache,
+                                           memcg_params.refcnt);
+       unsigned long flags;
+
+       spin_lock_irqsave(&memcg_kmem_wq_lock, flags);
+       if (s->memcg_params.root_cache->memcg_params.dying)
+               goto unlock;
+
+       s->memcg_params.work_fn = kmemcg_cache_shutdown_fn;
+       INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
+       queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
+
+unlock:
+       spin_unlock_irqrestore(&memcg_kmem_wq_lock, flags);
+}
+
+static void kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
+{
+       __kmemcg_cache_deactivate_after_rcu(s);
+       percpu_ref_kill(&s->memcg_params.refcnt);
+}
+
  static void kmemcg_cache_deactivate(struct kmem_cache *s)
  {
-       if (WARN_ON_ONCE(is_root_cache(s)) ||
-           WARN_ON_ONCE(s->memcg_params.work_fn))
+       if (WARN_ON_ONCE(is_root_cache(s)))
                 return;
  
         __kmemcg_cache_deactivate(s);
@@ -744,10 +779,7 @@ static void kmemcg_cache_deactivate(struct kmem_cache *s)
         if (s->memcg_params.root_cache->memcg_params.dying)
                 goto unlock;
  
-       /* pin memcg so that @s doesn't get destroyed in the middle */
-       css_get(&s->memcg_params.memcg->css);
-
-       s->memcg_params.work_fn = __kmemcg_cache_deactivate_after_rcu;
+       s->memcg_params.work_fn = kmemcg_cache_deactivate_after_rcu;
         call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn);
  unlock:
         spin_unlock_irq(&memcg_kmem_wq_lock);
@@ -781,28 +813,6 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
         put_online_cpus();
  }
  
-void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
-{
-       struct kmem_cache *s, *s2;
-
-       get_online_cpus();
-       get_online_mems();
-
-       mutex_lock(&slab_mutex);
-       list_for_each_entry_safe(s, s2, &memcg->kmem_caches,
-                                memcg_params.kmem_caches_node) {
-               /*
-                * The cgroup is about to be freed and therefore has no charges
-                * left. Hence, all its caches must be empty by now.
-                */
-               BUG_ON(shutdown_cache(s));
-       }
-       mutex_unlock(&slab_mutex);
-
-       put_online_mems();
-       put_online_cpus();
-}
-
  static int shutdown_memcg_caches(struct kmem_cache *s)
  {
         struct memcg_cache_array *arr;
author	Roman Gushchin <guro@fb.com>
	Fri, 12 Jul 2019 03:56:27 +0000 (20:56 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 12 Jul 2019 18:05:44 +0000 (11:05 -0700)
include/linux/slab.h		patch \| blob \| history
mm/memcontrol.c		patch \| blob \| history
mm/slab.h		patch \| blob \| history
mm/slab_common.c		patch \| blob \| history