btrfs: move the dio_sem higher up the callchain
authorJosef Bacik <josef@toxicpanda.com>
Fri, 12 Oct 2018 19:32:32 +0000 (15:32 -0400)
committerDavid Sterba <dsterba@suse.com>
Fri, 19 Oct 2018 10:20:04 +0000 (12:20 +0200)
We're getting a lockdep splat because we take the dio_sem under the
log_mutex.  What we really need is to protect fsync() from logging an
extent map for an extent we never waited on higher up, so just guard the
whole thing with dio_sem.

======================================================
WARNING: possible circular locking dependency detected
4.18.0-rc4-xfstests-00025-g5de5edbaf1d4 #411 Not tainted
------------------------------------------------------
aio-dio-invalid/30928 is trying to acquire lock:
0000000092621cfd (&mm->mmap_sem){++++}, at: get_user_pages_unlocked+0x5a/0x1e0

but task is already holding lock:
00000000cefe6b35 (&ei->dio_sem){++++}, at: btrfs_direct_IO+0x3be/0x400

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #5 (&ei->dio_sem){++++}:
       lock_acquire+0xbd/0x220
       down_write+0x51/0xb0
       btrfs_log_changed_extents+0x80/0xa40
       btrfs_log_inode+0xbaf/0x1000
       btrfs_log_inode_parent+0x26f/0xa80
       btrfs_log_dentry_safe+0x50/0x70
       btrfs_sync_file+0x357/0x540
       do_fsync+0x38/0x60
       __ia32_sys_fdatasync+0x12/0x20
       do_fast_syscall_32+0x9a/0x2f0
       entry_SYSENTER_compat+0x84/0x96

-> #4 (&ei->log_mutex){+.+.}:
       lock_acquire+0xbd/0x220
       __mutex_lock+0x86/0xa10
       btrfs_record_unlink_dir+0x2a/0xa0
       btrfs_unlink+0x5a/0xc0
       vfs_unlink+0xb1/0x1a0
       do_unlinkat+0x264/0x2b0
       do_fast_syscall_32+0x9a/0x2f0
       entry_SYSENTER_compat+0x84/0x96

-> #3 (sb_internal#2){.+.+}:
       lock_acquire+0xbd/0x220
       __sb_start_write+0x14d/0x230
       start_transaction+0x3e6/0x590
       btrfs_evict_inode+0x475/0x640
       evict+0xbf/0x1b0
       btrfs_run_delayed_iputs+0x6c/0x90
       cleaner_kthread+0x124/0x1a0
       kthread+0x106/0x140
       ret_from_fork+0x3a/0x50

-> #2 (&fs_info->cleaner_delayed_iput_mutex){+.+.}:
       lock_acquire+0xbd/0x220
       __mutex_lock+0x86/0xa10
       btrfs_alloc_data_chunk_ondemand+0x197/0x530
       btrfs_check_data_free_space+0x4c/0x90
       btrfs_delalloc_reserve_space+0x20/0x60
       btrfs_page_mkwrite+0x87/0x520
       do_page_mkwrite+0x31/0xa0
       __handle_mm_fault+0x799/0xb00
       handle_mm_fault+0x7c/0xe0
       __do_page_fault+0x1d3/0x4a0
       async_page_fault+0x1e/0x30

-> #1 (sb_pagefaults){.+.+}:
       lock_acquire+0xbd/0x220
       __sb_start_write+0x14d/0x230
       btrfs_page_mkwrite+0x6a/0x520
       do_page_mkwrite+0x31/0xa0
       __handle_mm_fault+0x799/0xb00
       handle_mm_fault+0x7c/0xe0
       __do_page_fault+0x1d3/0x4a0
       async_page_fault+0x1e/0x30

-> #0 (&mm->mmap_sem){++++}:
       __lock_acquire+0x42e/0x7a0
       lock_acquire+0xbd/0x220
       down_read+0x48/0xb0
       get_user_pages_unlocked+0x5a/0x1e0
       get_user_pages_fast+0xa4/0x150
       iov_iter_get_pages+0xc3/0x340
       do_direct_IO+0xf93/0x1d70
       __blockdev_direct_IO+0x32d/0x1c20
       btrfs_direct_IO+0x227/0x400
       generic_file_direct_write+0xcf/0x180
       btrfs_file_write_iter+0x308/0x58c
       aio_write+0xf8/0x1d0
       io_submit_one+0x3a9/0x620
       __ia32_compat_sys_io_submit+0xb2/0x270
       do_int80_syscall_32+0x5b/0x1a0
       entry_INT80_compat+0x88/0xa0

other info that might help us debug this:

Chain exists of:
  &mm->mmap_sem --> &ei->log_mutex --> &ei->dio_sem

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&ei->dio_sem);
                               lock(&ei->log_mutex);
                               lock(&ei->dio_sem);
  lock(&mm->mmap_sem);

 *** DEADLOCK ***

1 lock held by aio-dio-invalid/30928:
 #0: 00000000cefe6b35 (&ei->dio_sem){++++}, at: btrfs_direct_IO+0x3be/0x400

stack backtrace:
CPU: 0 PID: 30928 Comm: aio-dio-invalid Not tainted 4.18.0-rc4-xfstests-00025-g5de5edbaf1d4 #411
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014
Call Trace:
 dump_stack+0x7c/0xbb
 print_circular_bug.isra.37+0x297/0x2a4
 check_prev_add.constprop.45+0x781/0x7a0
 ? __lock_acquire+0x42e/0x7a0
 validate_chain.isra.41+0x7f0/0xb00
 __lock_acquire+0x42e/0x7a0
 lock_acquire+0xbd/0x220
 ? get_user_pages_unlocked+0x5a/0x1e0
 down_read+0x48/0xb0
 ? get_user_pages_unlocked+0x5a/0x1e0
 get_user_pages_unlocked+0x5a/0x1e0
 get_user_pages_fast+0xa4/0x150
 iov_iter_get_pages+0xc3/0x340
 do_direct_IO+0xf93/0x1d70
 ? __alloc_workqueue_key+0x358/0x490
 ? __blockdev_direct_IO+0x14b/0x1c20
 __blockdev_direct_IO+0x32d/0x1c20
 ? btrfs_run_delalloc_work+0x40/0x40
 ? can_nocow_extent+0x490/0x490
 ? kvm_clock_read+0x1f/0x30
 ? can_nocow_extent+0x490/0x490
 ? btrfs_run_delalloc_work+0x40/0x40
 btrfs_direct_IO+0x227/0x400
 ? btrfs_run_delalloc_work+0x40/0x40
 generic_file_direct_write+0xcf/0x180
 btrfs_file_write_iter+0x308/0x58c
 aio_write+0xf8/0x1d0
 ? kvm_clock_read+0x1f/0x30
 ? __might_fault+0x3e/0x90
 io_submit_one+0x3a9/0x620
 ? io_submit_one+0xe5/0x620
 __ia32_compat_sys_io_submit+0xb2/0x270
 do_int80_syscall_32+0x5b/0x1a0
 entry_INT80_compat+0x88/0xa0

CC: stable@vger.kernel.org # 4.14+
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/file.c
fs/btrfs/tree-log.c

index 15b925142793b67b0153ec031dd9b01bf1856bd4..97c7a086f7bd69f1dddb68fbdbf5c8bcc995024c 100644 (file)
@@ -2078,6 +2078,14 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
                goto out;
 
        inode_lock(inode);
+
+       /*
+        * We take the dio_sem here because the tree log stuff can race with
+        * lockless dio writes and get an extent map logged for an extent we
+        * never waited on.  We need it this high up for lockdep reasons.
+        */
+       down_write(&BTRFS_I(inode)->dio_sem);
+
        atomic_inc(&root->log_batch);
 
        /*
@@ -2086,6 +2094,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
         */
        ret = btrfs_wait_ordered_range(inode, start, len);
        if (ret) {
+               up_write(&BTRFS_I(inode)->dio_sem);
                inode_unlock(inode);
                goto out;
        }
@@ -2109,6 +2118,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
                 * checked called fsync.
                 */
                ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
+               up_write(&BTRFS_I(inode)->dio_sem);
                inode_unlock(inode);
                goto out;
        }
@@ -2127,6 +2137,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
                ret = PTR_ERR(trans);
+               up_write(&BTRFS_I(inode)->dio_sem);
                inode_unlock(inode);
                goto out;
        }
@@ -2148,6 +2159,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
         * file again, but that will end up using the synchronization
         * inside btrfs_sync_log to keep things safe.
         */
+       up_write(&BTRFS_I(inode)->dio_sem);
        inode_unlock(inode);
 
        /*
index 7147d329ddbc648263e86e99b10544dcbf88412f..e07f3376b7dfc0c9350117bf3db956781e57d45e 100644 (file)
@@ -4390,7 +4390,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 
        INIT_LIST_HEAD(&extents);
 
-       down_write(&inode->dio_sem);
        write_lock(&tree->lock);
        test_gen = root->fs_info->last_trans_committed;
        logged_start = start;
@@ -4456,7 +4455,6 @@ process:
        }
        WARN_ON(!list_empty(&extents));
        write_unlock(&tree->lock);
-       up_write(&inode->dio_sem);
 
        btrfs_release_path(path);
        if (!ret)