ext4: fix races between changing inode journal mode and ext4_writepages
authorDaeho Jeong <daeho.jeong@samsung.com>
Tue, 26 Apr 2016 03:22:35 +0000 (23:22 -0400)
committerTheodore Ts'o <tytso@mit.edu>
Tue, 26 Apr 2016 03:22:35 +0000 (23:22 -0400)
In ext4, there is a race condition between changing inode journal mode
and ext4_writepages(). While ext4_writepages() is executed on a
non-journalled mode inode, the inode's journal mode could be enabled
by ioctl() and then, some pages dirtied after switching the journal
mode will be still exposed to ext4_writepages() in non-journaled mode.
To resolve this problem, we use fs-wide per-cpu rw semaphore by Jan
Kara's suggestion because we don't want to waste ext4_inode_info's
space for this extra rare case.

Signed-off-by: Daeho Jeong <daeho.jeong@samsung.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
fs/ext4/ext4.h
fs/ext4/inode.c
fs/ext4/super.c
kernel/locking/percpu-rwsem.c

index cb00b1119ec935ae36dd927d93af6c4d7db2dbcc..ba5aecc07fbc842e61a5176be8b693fed3e840d3 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/ratelimit.h>
 #include <crypto/hash.h>
 #include <linux/falloc.h>
+#include <linux/percpu-rwsem.h>
 #ifdef __KERNEL__
 #include <linux/compat.h>
 #endif
@@ -1508,6 +1509,9 @@ struct ext4_sb_info {
        struct ratelimit_state s_err_ratelimit_state;
        struct ratelimit_state s_warning_ratelimit_state;
        struct ratelimit_state s_msg_ratelimit_state;
+
+       /* Barrier between changing inodes' journal flags and writepages ops. */
+       struct percpu_rw_semaphore s_journal_flag_rwsem;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
index 779ef4c11bc17c66620aa2f5facd86a15779409a..4d8ebbe0045674410be430476e811b8fb27545db 100644 (file)
@@ -2612,11 +2612,14 @@ static int ext4_writepages(struct address_space *mapping,
        struct blk_plug plug;
        bool give_up_on_write = false;
 
+       percpu_down_read(&sbi->s_journal_flag_rwsem);
        trace_ext4_writepages(inode, wbc);
 
-       if (dax_mapping(mapping))
-               return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
-                                                  wbc);
+       if (dax_mapping(mapping)) {
+               ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
+                                                 wbc);
+               goto out_writepages;
+       }
 
        /*
         * No pages to write? This is mainly a kludge to avoid starting
@@ -2786,6 +2789,7 @@ retry:
 out_writepages:
        trace_ext4_writepages_result(inode, wbc, ret,
                                     nr_to_write - wbc->nr_to_write);
+       percpu_up_read(&sbi->s_journal_flag_rwsem);
        return ret;
 }
 
@@ -5436,6 +5440,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
        journal_t *journal;
        handle_t *handle;
        int err;
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
        /*
         * We have to be very careful here: changing a data block's
@@ -5475,6 +5480,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
                }
        }
 
+       percpu_down_write(&sbi->s_journal_flag_rwsem);
        jbd2_journal_lock_updates(journal);
 
        /*
@@ -5491,6 +5497,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
                err = jbd2_journal_flush(journal);
                if (err < 0) {
                        jbd2_journal_unlock_updates(journal);
+                       percpu_up_write(&sbi->s_journal_flag_rwsem);
                        ext4_inode_resume_unlocked_dio(inode);
                        return err;
                }
@@ -5499,6 +5506,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
        ext4_set_aops(inode);
 
        jbd2_journal_unlock_updates(journal);
+       percpu_up_write(&sbi->s_journal_flag_rwsem);
+
        if (val)
                up_write(&EXT4_I(inode)->i_mmap_sem);
        ext4_inode_resume_unlocked_dio(inode);
index 304c712dbe12e8ba7d5c5abfaca30f7ef3f5e5a6..20c5d52253b4924c9c1d0cf27110a566e5e3f234 100644 (file)
@@ -859,6 +859,7 @@ static void ext4_put_super(struct super_block *sb)
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
        percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
+       percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
        brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
        for (i = 0; i < EXT4_MAXQUOTAS; i++)
@@ -3930,6 +3931,9 @@ no_journal:
        if (!err)
                err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
                                          GFP_KERNEL);
+       if (!err)
+               err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem);
+
        if (err) {
                ext4_msg(sb, KERN_ERR, "insufficient memory");
                goto failed_mount6;
index f231e0bb311ce0827d281d34f737a3a06405c072..bec0b647f9cc5291df0211c6532edd5ffe167eb2 100644 (file)
@@ -37,6 +37,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
        free_percpu(brw->fast_read_ctr);
        brw->fast_read_ctr = NULL; /* catch use after free bugs */
 }
+EXPORT_SYMBOL_GPL(percpu_free_rwsem);
 
 /*
  * This is the fast-path for down_read/up_read. If it succeeds we rely