kernfs: cache atomic_write_len in kernfs_open_file
author Tejun Heo <tj@kernel.org>
Tue, 4 Mar 2014 20:38:46 +0000 (15:38 -0500)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 9 Mar 2014 06:08:29 +0000 (22:08 -0800)
While implementing atomic_write_len, 4d3773c4bb41 ("kernfs: implement
kernfs_ops->atomic_write_len") moved the data copy from userland inside
the section protected by kernfs_get_active() and
kernfs_open_file->mutex so that kernfs_ops->atomic_write_len could be
accessed before copying the buffer from userland.  Unfortunately, this
can lead to a locking order inversion involving mmap_sem if
copy_from_user() takes a page fault.

  ======================================================
  [ INFO: possible circular locking dependency detected ]
  3.14.0-rc4-next-20140228-sasha-00011-g4077c67-dirty #26 Tainted: G        W
  -------------------------------------------------------
  trinity-c236/10658 is trying to acquire lock:
   (&of->mutex#2){+.+.+.}, at: [<fs/kernfs/file.c:487>] kernfs_fop_mmap+0x54/0x120

  but task is already holding lock:
   (&mm->mmap_sem){++++++}, at: [<mm/util.c:397>] vm_mmap_pgoff+0x6e/0xe0

  which lock already depends on the new lock.

  the existing dependency chain (in reverse order) is:

 -> #1 (&mm->mmap_sem){++++++}:
 [<kernel/locking/lockdep.c:1945 kernel/locking/lockdep.c:2131>] validate_chain+0x6c5/0x7b0
 [<kernel/locking/lockdep.c:3182>] __lock_acquire+0x4cd/0x5a0
 [<arch/x86/include/asm/current.h:14 kernel/locking/lockdep.c:3602>] lock_acquire+0x182/0x1d0
 [<mm/memory.c:4188>] might_fault+0x7e/0xb0
 [<arch/x86/include/asm/uaccess.h:713 fs/kernfs/file.c:291>] kernfs_fop_write+0xd8/0x190
 [<fs/read_write.c:473>] vfs_write+0xe3/0x1d0
 [<fs/read_write.c:523 fs/read_write.c:515>] SyS_write+0x5d/0xa0
 [<arch/x86/kernel/entry_64.S:749>] tracesys+0xdd/0xe2

 -> #0 (&of->mutex#2){+.+.+.}:
 [<kernel/locking/lockdep.c:1840>] check_prev_add+0x13f/0x560
 [<kernel/locking/lockdep.c:1945 kernel/locking/lockdep.c:2131>] validate_chain+0x6c5/0x7b0
 [<kernel/locking/lockdep.c:3182>] __lock_acquire+0x4cd/0x5a0
 [<arch/x86/include/asm/current.h:14 kernel/locking/lockdep.c:3602>] lock_acquire+0x182/0x1d0
 [<kernel/locking/mutex.c:470 kernel/locking/mutex.c:571>] mutex_lock_nested+0x6a/0x510
 [<fs/kernfs/file.c:487>] kernfs_fop_mmap+0x54/0x120
 [<mm/mmap.c:1573>] mmap_region+0x310/0x5c0
 [<mm/mmap.c:1365>] do_mmap_pgoff+0x385/0x430
 [<mm/util.c:399>] vm_mmap_pgoff+0x8f/0xe0
 [<mm/mmap.c:1416 mm/mmap.c:1374>] SyS_mmap_pgoff+0x1b0/0x210
 [<arch/x86/kernel/sys_x86_64.c:72>] SyS_mmap+0x1d/0x20
 [<arch/x86/kernel/entry_64.S:749>] tracesys+0xdd/0xe2

  other info that might help us debug this:

   Possible unsafe locking scenario:

         CPU0                    CPU1
         ----                    ----
    lock(&mm->mmap_sem);
                                 lock(&of->mutex#2);
                                 lock(&mm->mmap_sem);
    lock(&of->mutex#2);

   *** DEADLOCK ***

  1 lock held by trinity-c236/10658:
   #0:  (&mm->mmap_sem){++++++}, at: [<mm/util.c:397>] vm_mmap_pgoff+0x6e/0xe0

  stack backtrace:
  CPU: 2 PID: 10658 Comm: trinity-c236 Tainted: G        W 3.14.0-rc4-next-20140228-sasha-00011-g4077c67-dirty #26
   0000000000000000 ffff88011911fa48 ffffffff8438e945 0000000000000000
   0000000000000000 ffff88011911fa98 ffffffff811a0109 ffff88011911fab8
   ffff88011911fab8 ffff88011911fa98 ffff880119128cc0 ffff880119128cf8
  Call Trace:
   [<lib/dump_stack.c:52>] dump_stack+0x52/0x7f
   [<kernel/locking/lockdep.c:1213>] print_circular_bug+0x129/0x160
   [<kernel/locking/lockdep.c:1840>] check_prev_add+0x13f/0x560
   [<include/linux/spinlock.h:343 mm/slub.c:1933>] ? deactivate_slab+0x511/0x550
   [<kernel/locking/lockdep.c:1945 kernel/locking/lockdep.c:2131>] validate_chain+0x6c5/0x7b0
   [<kernel/locking/lockdep.c:3182>] __lock_acquire+0x4cd/0x5a0
   [<mm/mmap.c:1552>] ? mmap_region+0x24a/0x5c0
   [<arch/x86/include/asm/current.h:14 kernel/locking/lockdep.c:3602>] lock_acquire+0x182/0x1d0
   [<fs/kernfs/file.c:487>] ? kernfs_fop_mmap+0x54/0x120
   [<kernel/locking/mutex.c:470 kernel/locking/mutex.c:571>] mutex_lock_nested+0x6a/0x510
   [<fs/kernfs/file.c:487>] ? kernfs_fop_mmap+0x54/0x120
   [<kernel/sched/core.c:2477>] ? get_parent_ip+0x11/0x50
   [<fs/kernfs/file.c:487>] ? kernfs_fop_mmap+0x54/0x120
   [<fs/kernfs/file.c:487>] kernfs_fop_mmap+0x54/0x120
   [<mm/mmap.c:1573>] mmap_region+0x310/0x5c0
   [<mm/mmap.c:1365>] do_mmap_pgoff+0x385/0x430
   [<mm/util.c:397>] ? vm_mmap_pgoff+0x6e/0xe0
   [<mm/util.c:399>] vm_mmap_pgoff+0x8f/0xe0
   [<kernel/rcu/update.c:97>] ? __rcu_read_unlock+0x44/0xb0
   [<fs/file.c:641>] ? dup_fd+0x3c0/0x3c0
   [<mm/mmap.c:1416 mm/mmap.c:1374>] SyS_mmap_pgoff+0x1b0/0x210
   [<arch/x86/kernel/sys_x86_64.c:72>] SyS_mmap+0x1d/0x20
   [<arch/x86/kernel/entry_64.S:749>] tracesys+0xdd/0xe2

Fix it by caching atomic_write_len in kernfs_open_file during open so
that it can be determined without accessing kernfs_ops in
kernfs_fop_write().  This restores the structure of kernfs_fop_write()
before 4d3773c4bb41 with updated @len determination logic.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Sasha Levin <sasha.levin@oracle.com>
References: http://lkml.kernel.org/g/53113485.2090407@oracle.com
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
fs/kernfs/file.c
include/linux/kernfs.h

index ddcb471b9cc95ab8ef9d3035acc0cb443fadcf0d..8034706a7af87523bfc40e8660f21cc54238563f 100644 (file)
@@ -253,55 +253,50 @@ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
 {
        struct kernfs_open_file *of = kernfs_of(file);
        const struct kernfs_ops *ops;
-       char *buf = NULL;
-       ssize_t len;
-
-       /*
-        * @of->mutex nests outside active ref and is just to ensure that
-        * the ops aren't called concurrently for the same open file.
-        */
-       mutex_lock(&of->mutex);
-       if (!kernfs_get_active(of->kn)) {
-               mutex_unlock(&of->mutex);
-               return -ENODEV;
-       }
-
-       ops = kernfs_ops(of->kn);
-       if (!ops->write) {
-               len = -EINVAL;
-               goto out_unlock;
-       }
+       size_t len;
+       char *buf;
 
-       if (ops->atomic_write_len) {
+       if (of->atomic_write_len) {
                len = count;
-               if (len > ops->atomic_write_len) {
-                       len = -E2BIG;
-                       goto out_unlock;
-               }
+               if (len > of->atomic_write_len)
+                       return -E2BIG;
        } else {
                len = min_t(size_t, count, PAGE_SIZE);
        }
 
        buf = kmalloc(len + 1, GFP_KERNEL);
-       if (!buf) {
-               len = -ENOMEM;
-               goto out_unlock;
-       }
+       if (!buf)
+               return -ENOMEM;
 
        if (copy_from_user(buf, user_buf, len)) {
                len = -EFAULT;
-               goto out_unlock;
+               goto out_free;
        }
        buf[len] = '\0';        /* guarantee string termination */
 
-       len = ops->write(of, buf, len, *ppos);
-out_unlock:
+       /*
+        * @of->mutex nests outside active ref and is just to ensure that
+        * the ops aren't called concurrently for the same open file.
+        */
+       mutex_lock(&of->mutex);
+       if (!kernfs_get_active(of->kn)) {
+               mutex_unlock(&of->mutex);
+               len = -ENODEV;
+               goto out_free;
+       }
+
+       ops = kernfs_ops(of->kn);
+       if (ops->write)
+               len = ops->write(of, buf, len, *ppos);
+       else
+               len = -EINVAL;
+
        kernfs_put_active(of->kn);
        mutex_unlock(&of->mutex);
 
        if (len > 0)
                *ppos += len;
-
+out_free:
        kfree(buf);
        return len;
 }
@@ -665,6 +660,12 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
        of->kn = kn;
        of->file = file;
 
+       /*
+        * Write path needs to access atomic_write_len outside active reference.
+        * Cache it in open_file.  See kernfs_fop_write() for details.
+        */
+       of->atomic_write_len = ops->atomic_write_len;
+
        /*
         * Always instantiate seq_file even if read access doesn't use
         * seq_file or is not requested.  This unifies private data access
index 09669d09274857e4e950438151f4d095b467140e..b0122dc6f96a0a21324f86b5a28c725ac3ea74cf 100644 (file)
@@ -158,6 +158,7 @@ struct kernfs_open_file {
        int                     event;
        struct list_head        list;
 
+       size_t                  atomic_write_len;
        bool                    mmapped;
        const struct vm_operations_struct *vm_ops;
 };