Btrfs: fix snapshot inconsistency after a file write followed by truncate

author Filipe Manana <fdmanana@suse.com>
Wed, 29 Oct 2014 11:57:59 +0000 (11:57 +0000)
committer Chris Mason <clm@fb.com>
Tue, 25 Nov 2014 15:41:23 +0000 (07:41 -0800)
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h

index 9918ba3ec2b28ce5a07c68d350f523c3df524ee2..fc73e86235e8882dc43682563a6a75d4c1d93fef 100644 (file)
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3480,8 +3480,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
  int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
                                          struct btrfs_fs_info *fs_info);
  int __get_raid_index(u64 flags);
-int btrfs_start_nocow_write(struct btrfs_root *root);
-void btrfs_end_nocow_write(struct btrfs_root *root);
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
  /* ctree.c */
  int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                      int level, int *slot);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c

index 5e81e3694d92711f30d449057010f68df413c8e5..b4e3ab115f5f5b1cf9e8db46d3e8153a957b2b20 100644 (file)
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -9656,12 +9656,14 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
  }
  
  /*
- * btrfs_{start,end}_write() is similar to mnt_{want, drop}_write(),
- * they are used to prevent the some tasks writing data into the page cache
- * by nocow before the subvolume is snapshoted, but flush the data into
- * the disk after the snapshot creation.
+ * btrfs_{start,end}_write_no_snapshoting() are similar to
+ * mnt_{want,drop}_write(). They prevent tasks from writing data into the
+ * page cache through nocow before the subvolume is snapshoted and then
+ * flushing it to disk after the snapshot is created. They also block
+ * operations that would make an ongoing snapshot inconsistent (for example,
+ * writes followed by expanding truncates).
   */
-void btrfs_end_nocow_write(struct btrfs_root *root)
+void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
  {
         percpu_counter_dec(&root->subv_writers->counter);
         /*
@@ -9673,7 +9675,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root)
                 wake_up(&root->subv_writers->wait);
  }
  
-int btrfs_start_nocow_write(struct btrfs_root *root)
+int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
  {
         if (atomic_read(&root->will_be_snapshoted))
                 return 0;
@@ -9684,7 +9686,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root)
          */
         smp_mb();
         if (atomic_read(&root->will_be_snapshoted)) {
-               btrfs_end_nocow_write(root);
+               btrfs_end_write_no_snapshoting(root);
                 return 0;
         }
         return 1;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c

index 0fbf0e7bc606eac0cb03dd56c70620cd096a5d09..e4090259569bccfdb3ec788481814e769ff84615 100644 (file)
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1428,7 +1428,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
         u64 num_bytes;
         int ret;
  
-       ret = btrfs_start_nocow_write(root);
+       ret = btrfs_start_write_no_snapshoting(root);
         if (!ret)
                 return -ENOSPC;
  
@@ -1451,7 +1451,7 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
         ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
         if (ret <= 0) {
                 ret = 0;
-               btrfs_end_nocow_write(root);
+               btrfs_end_write_no_snapshoting(root);
         } else {
                 *write_bytes = min_t(size_t, *write_bytes ,
                                      num_bytes - pos + lockstart);
@@ -1543,7 +1543,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
                                 btrfs_free_reserved_data_space(inode,
                                                                reserve_bytes);
                         else
-                               btrfs_end_nocow_write(root);
+                               btrfs_end_write_no_snapshoting(root);
                         break;
                 }
  
@@ -1632,7 +1632,7 @@ again:
  
                 release_bytes = 0;
                 if (only_release_metadata)
-                       btrfs_end_nocow_write(root);
+                       btrfs_end_write_no_snapshoting(root);
  
                 if (only_release_metadata && copied > 0) {
                         u64 lockstart = round_down(pos, root->sectorsize);
@@ -1661,7 +1661,7 @@ again:
  
         if (release_bytes) {
                 if (only_release_metadata) {
-                       btrfs_end_nocow_write(root);
+                       btrfs_end_write_no_snapshoting(root);
                         btrfs_delalloc_release_metadata(inode, release_bytes);
                 } else {
                         btrfs_delalloc_release_space(inode, release_bytes);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index a5374c2bb94328c08d5d7db3337e5c2fd3ba20b1..8de23355f6cf66d8c97531157301ec7e8384d61a 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1337,7 +1337,7 @@ next_slot:
                          * we fall into common COW way.
                          */
                         if (!nolock) {
-                               err = btrfs_start_nocow_write(root);
+                               err = btrfs_start_write_no_snapshoting(root);
                                 if (!err)
                                         goto out_check;
                         }
@@ -1361,7 +1361,7 @@ out_check:
                 if (extent_end <= start) {
                         path->slots[0]++;
                         if (!nolock && nocow)
-                               btrfs_end_nocow_write(root);
+                               btrfs_end_write_no_snapshoting(root);
                         goto next_slot;
                 }
                 if (!nocow) {
@@ -1381,7 +1381,7 @@ out_check:
                                              page_started, nr_written, 1);
                         if (ret) {
                                 if (!nolock && nocow)
-                                       btrfs_end_nocow_write(root);
+                                       btrfs_end_write_no_snapshoting(root);
                                 goto error;
                         }
                         cow_start = (u64)-1;
@@ -1432,7 +1432,7 @@ out_check:
                                                       num_bytes);
                         if (ret) {
                                 if (!nolock && nocow)
-                                       btrfs_end_nocow_write(root);
+                                       btrfs_end_write_no_snapshoting(root);
                                 goto error;
                         }
                 }
@@ -1443,7 +1443,7 @@ out_check:
                                              EXTENT_DELALLOC, PAGE_UNLOCK |
                                              PAGE_SET_PRIVATE2);
                 if (!nolock && nocow)
-                       btrfs_end_nocow_write(root);
+                       btrfs_end_write_no_snapshoting(root);
                 cur_offset = extent_end;
                 if (cur_offset > end)
                         break;
@@ -4599,6 +4599,26 @@ next:
         return err;
  }
  
+static int wait_snapshoting_atomic_t(atomic_t *a)
+{
+       schedule();
+       return 0;
+}
+
+static void wait_for_snapshot_creation(struct btrfs_root *root)
+{
+       while (true) {
+               int ret;
+
+               ret = btrfs_start_write_no_snapshoting(root);
+               if (ret)
+                       break;
+               wait_on_atomic_t(&root->will_be_snapshoted,
+                                wait_snapshoting_atomic_t,
+                                TASK_UNINTERRUPTIBLE);
+       }
+}
+
  static int btrfs_setsize(struct inode *inode, struct iattr *attr)
  {
         struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4623,17 +4643,30 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
  
         if (newsize > oldsize) {
                 truncate_pagecache(inode, newsize);
+               /*
+                * Don't do an expanding truncate while snapshoting is ongoing.
+                * This is to ensure the snapshot captures a fully consistent
+                * state of this file - if the snapshot captures this expanding
+                * truncation, it must capture all writes that happened before
+                * this truncation.
+                */
+               wait_for_snapshot_creation(root);
                 ret = btrfs_cont_expand(inode, oldsize, newsize);
-               if (ret)
+               if (ret) {
+                       btrfs_end_write_no_snapshoting(root);
                         return ret;
+               }
  
                 trans = btrfs_start_transaction(root, 1);
-               if (IS_ERR(trans))
+               if (IS_ERR(trans)) {
+                       btrfs_end_write_no_snapshoting(root);
                         return PTR_ERR(trans);
+               }
  
                 i_size_write(inode, newsize);
                 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
                 ret = btrfs_update_inode(trans, root, inode);
+               btrfs_end_write_no_snapshoting(root);
                 btrfs_end_transaction(trans, root);
         } else {
  
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c

index 3abc068c55437140fdc54d9a959b140aaa7ed04a..b590e23fa03e402c904b68c1f08a949d00ee4b33 100644 (file)
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -617,7 +617,7 @@ fail:
         return ret;
  }
  
-static void btrfs_wait_nocow_write(struct btrfs_root *root)
+static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root)
  {
         s64 writers;
         DEFINE_WAIT(wait);
@@ -649,7 +649,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
  
         atomic_inc(&root->will_be_snapshoted);
         smp_mb__after_atomic();
-       btrfs_wait_nocow_write(root);
+       btrfs_wait_for_no_snapshoting_writes(root);
  
         ret = btrfs_start_delalloc_inodes(root, 0);
         if (ret)
@@ -732,7 +732,8 @@ fail:
  free:
         kfree(pending_snapshot);
  out:
-       atomic_dec(&root->will_be_snapshoted);
+       if (atomic_dec_and_test(&root->will_be_snapshoted))
+               wake_up_atomic_t(&root->will_be_snapshoted);
         return ret;
  }
author	Filipe Manana <fdmanana@suse.com>
	Wed, 29 Oct 2014 11:57:59 +0000 (11:57 +0000)
committer	Chris Mason <clm@fb.com>
	Tue, 25 Nov 2014 15:41:23 +0000 (07:41 -0800)
fs/btrfs/ctree.h		patch \| blob \| history
fs/btrfs/extent-tree.c		patch \| blob \| history
fs/btrfs/file.c		patch \| blob \| history
fs/btrfs/inode.c		patch \| blob \| history
fs/btrfs/ioctl.c		patch \| blob \| history