Btrfs: add support for fallocate's zero range operation
author		Filipe Manana <fdmanana@suse.com>
		Wed, 25 Oct 2017 10:55:28 +0000 (11:55 +0100)
committer	David Sterba <dsterba@suse.com>
		Mon, 22 Jan 2018 15:08:20 +0000 (16:08 +0100)
This implements support for fallocate's zero range operation. For now it
is kept as simple as possible while reusing most of the existing
fallocate and hole punching infrastructure.
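
From user space the operation is requested through fallocate(2) with the
FALLOC_FL_ZERO_RANGE flag. A minimal sketch (illustrative only, not part
of the change itself; the file path and sizes are made up):

    /*
     * Zero 64K of /mnt/foo starting at offset 4K. Without
     * FALLOC_FL_KEEP_SIZE, i_size is updated if the range extends
     * beyond the end of the file.
     */
    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <linux/falloc.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/mnt/foo", O_WRONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (fallocate(fd, FALLOC_FL_ZERO_RANGE, 4096, 65536) < 0) {
                    perror("fallocate");
                    return 1;
            }
            close(fd);
            return 0;
    }

The same range can be zeroed from the command line with xfs_io, e.g.
xfs_io -c "fzero 4k 64k" /mnt/foo.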

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/file.c

index 1ed2e6e9e2046d3b850276bfc2ccb46d65a98ee4..16c8031db645fa74ed9836bb2bf2265d2184e133 100644
@@ -2458,6 +2458,46 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
        return ret;
 }
 
+static int btrfs_punch_hole_lock_range(struct inode *inode,
+                                      const u64 lockstart,
+                                      const u64 lockend,
+                                      struct extent_state **cached_state)
+{
+       while (1) {
+               struct btrfs_ordered_extent *ordered;
+               int ret;
+
+               truncate_pagecache_range(inode, lockstart, lockend);
+
+               lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                cached_state);
+               ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+
+               /*
+                * We need to make sure we have no ordered extents in this range
+                * and nobody raced in and read a page in this range, if we did
+                * we need to try again.
+                */
+               if ((!ordered ||
+                   (ordered->file_offset + ordered->len <= lockstart ||
+                    ordered->file_offset > lockend)) &&
+                    !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
+                       if (ordered)
+                               btrfs_put_ordered_extent(ordered);
+                       break;
+               }
+               if (ordered)
+                       btrfs_put_ordered_extent(ordered);
+               unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+                                    lockend, cached_state);
+               ret = btrfs_wait_ordered_range(inode, lockstart,
+                                              lockend - lockstart + 1);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
 static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -2574,38 +2614,11 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
                goto out_only_mutex;
        }
 
-       while (1) {
-               struct btrfs_ordered_extent *ordered;
-
-               truncate_pagecache_range(inode, lockstart, lockend);
-
-               lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-                                &cached_state);
-               ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
-
-               /*
-                * We need to make sure we have no ordered extents in this range
-                * and nobody raced in and read a page in this range, if we did
-                * we need to try again.
-                */
-               if ((!ordered ||
-                   (ordered->file_offset + ordered->len <= lockstart ||
-                    ordered->file_offset > lockend)) &&
-                    !btrfs_page_exists_in_range(inode, lockstart, lockend)) {
-                       if (ordered)
-                               btrfs_put_ordered_extent(ordered);
-                       break;
-               }
-               if (ordered)
-                       btrfs_put_ordered_extent(ordered);
-               unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
-                                    lockend, &cached_state);
-               ret = btrfs_wait_ordered_range(inode, lockstart,
-                                              lockend - lockstart + 1);
-               if (ret) {
-                       inode_unlock(inode);
-                       return ret;
-               }
+       ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+                                         &cached_state);
+       if (ret) {
+               inode_unlock(inode);
+               goto out_only_mutex;
        }
 
        path = btrfs_alloc_path();
@@ -2814,6 +2827,217 @@ insert:
        return 0;
 }
 
+static int btrfs_fallocate_update_isize(struct inode *inode,
+                                       const u64 end,
+                                       const int mode)
+{
+       struct btrfs_trans_handle *trans;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       int ret;
+       int ret2;
+
+       if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
+               return 0;
+
+       trans = btrfs_start_transaction(root, 1);
+       if (IS_ERR(trans))
+               return PTR_ERR(trans);
+
+       inode->i_ctime = current_time(inode);
+       i_size_write(inode, end);
+       btrfs_ordered_update_i_size(inode, end, NULL);
+       ret = btrfs_update_inode(trans, root, inode);
+       ret2 = btrfs_end_transaction(trans);
+
+       return ret ? ret : ret2;
+}
+
+static int btrfs_zero_range_check_range_boundary(struct inode *inode,
+                                                u64 offset)
+{
+       const u64 sectorsize = btrfs_inode_sectorsize(inode);
+       struct extent_map *em;
+       int ret = 0;
+
+       offset = round_down(offset, sectorsize);
+       em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0);
+       if (IS_ERR(em))
+               return PTR_ERR(em);
+
+       if (em->block_start == EXTENT_MAP_HOLE)
+               ret = 1;
+
+       free_extent_map(em);
+       return ret;
+}
+
+static int btrfs_zero_range(struct inode *inode,
+                           loff_t offset,
+                           loff_t len,
+                           const int mode)
+{
+       struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+       struct extent_map *em;
+       struct extent_changeset *data_reserved = NULL;
+       int ret;
+       u64 alloc_hint = 0;
+       const u64 sectorsize = btrfs_inode_sectorsize(inode);
+       u64 alloc_start = round_down(offset, sectorsize);
+       u64 alloc_end = round_up(offset + len, sectorsize);
+       u64 bytes_to_reserve = 0;
+       bool space_reserved = false;
+
+       inode_dio_wait(inode);
+
+       em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+                             alloc_start, alloc_end - alloc_start, 0);
+       if (IS_ERR(em)) {
+               ret = PTR_ERR(em);
+               goto out;
+       }
+
+       /*
+        * Avoid hole punching and extent allocation for some cases. More cases
+        * could be considered, but these are unlikely common and we keep things
+        * as simple as possible for now. Also, intentionally, if the target
+        * range contains one or more prealloc extents together with regular
+        * extents and holes, we drop all the existing extents and allocate a
+        * new prealloc extent, so that we get a larger contiguous disk extent.
+        */
+       if (em->start <= alloc_start &&
+           test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+               const u64 em_end = em->start + em->len;
+
+               if (em_end >= offset + len) {
+                       /*
+                        * The whole range is already a prealloc extent,
+                        * do nothing except updating the inode's i_size if
+                        * needed.
+                        */
+                       free_extent_map(em);
+                       ret = btrfs_fallocate_update_isize(inode, offset + len,
+                                                          mode);
+                       goto out;
+               }
+               /*
+                * Part of the range is already a prealloc extent, so operate
+                * only on the remaining part of the range.
+                */
+               alloc_start = em_end;
+               ASSERT(IS_ALIGNED(alloc_start, sectorsize));
+               len = offset + len - alloc_start;
+               offset = alloc_start;
+               alloc_hint = em->block_start + em->len;
+       }
+       free_extent_map(em);
+
+       if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
+           BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
+               em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
+                                     alloc_start, sectorsize, 0);
+               if (IS_ERR(em)) {
+                       ret = PTR_ERR(em);
+                       goto out;
+               }
+
+               if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+                       free_extent_map(em);
+                       ret = btrfs_fallocate_update_isize(inode, offset + len,
+                                                          mode);
+                       goto out;
+               }
+               if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
+                       free_extent_map(em);
+                       ret = btrfs_truncate_block(inode, offset, len, 0);
+                       if (!ret)
+                               ret = btrfs_fallocate_update_isize(inode,
+                                                                  offset + len,
+                                                                  mode);
+                       return ret;
+               }
+               free_extent_map(em);
+               alloc_start = round_down(offset, sectorsize);
+               alloc_end = alloc_start + sectorsize;
+               goto reserve_space;
+       }
+
+       alloc_start = round_up(offset, sectorsize);
+       alloc_end = round_down(offset + len, sectorsize);
+
+       /*
+        * For unaligned ranges, check the pages at the boundaries, they might
+        * map to an extent, in which case we need to partially zero them, or
+        * they might map to a hole, in which case we need our allocation range
+        * to cover them.
+        */
+       if (!IS_ALIGNED(offset, sectorsize)) {
+               ret = btrfs_zero_range_check_range_boundary(inode, offset);
+               if (ret < 0)
+                       goto out;
+               if (ret) {
+                       alloc_start = round_down(offset, sectorsize);
+                       ret = 0;
+               } else {
+                       ret = btrfs_truncate_block(inode, offset, 0, 0);
+                       if (ret)
+                               goto out;
+               }
+       }
+
+       if (!IS_ALIGNED(offset + len, sectorsize)) {
+               ret = btrfs_zero_range_check_range_boundary(inode,
+                                                           offset + len);
+               if (ret < 0)
+                       goto out;
+               if (ret) {
+                       alloc_end = round_up(offset + len, sectorsize);
+                       ret = 0;
+               } else {
+                       ret = btrfs_truncate_block(inode, offset + len, 0, 1);
+                       if (ret)
+                               goto out;
+               }
+       }
+
+reserve_space:
+       if (alloc_start < alloc_end) {
+               struct extent_state *cached_state = NULL;
+               const u64 lockstart = alloc_start;
+               const u64 lockend = alloc_end - 1;
+
+               bytes_to_reserve = alloc_end - alloc_start;
+               ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+                                                     bytes_to_reserve);
+               if (ret < 0)
+                       goto out;
+               space_reserved = true;
+               ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
+                                               alloc_start, bytes_to_reserve);
+               if (ret)
+                       goto out;
+               ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
+                                                 &cached_state);
+               if (ret)
+                       goto out;
+               ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
+                                               alloc_end - alloc_start,
+                                               i_blocksize(inode),
+                                               offset + len, &alloc_hint);
+               unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+                                    lockend, &cached_state);
+               /* btrfs_prealloc_file_range releases reserved space on error */
+               if (ret)
+                       space_reserved = false;
+       }
+ out:
+       if (ret && space_reserved)
+               btrfs_free_reserved_data_space(inode, data_reserved,
+                                              alloc_start, bytes_to_reserve);
+       extent_changeset_free(data_reserved);
+
+       return ret;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
                            loff_t offset, loff_t len)
 {
@@ -2839,7 +3063,8 @@ static long btrfs_fallocate(struct file *file, int mode,
        cur_offset = alloc_start;
 
        /* Make sure we aren't being given some crap mode */
-       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+                    FALLOC_FL_ZERO_RANGE))
                return -EOPNOTSUPP;
 
        if (mode & FALLOC_FL_PUNCH_HOLE)
@@ -2850,10 +3075,12 @@ static long btrfs_fallocate(struct file *file, int mode,
         *
         * For qgroup space, it will be checked later.
         */
-       ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
-                       alloc_end - alloc_start);
-       if (ret < 0)
-               return ret;
+       if (!(mode & FALLOC_FL_ZERO_RANGE)) {
+               ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
+                                                     alloc_end - alloc_start);
+               if (ret < 0)
+                       return ret;
+       }
 
        inode_lock(inode);
 
@@ -2895,6 +3122,12 @@ static long btrfs_fallocate(struct file *file, int mode,
        if (ret)
                goto out;
 
+       if (mode & FALLOC_FL_ZERO_RANGE) {
+               ret = btrfs_zero_range(inode, offset, len, mode);
+               inode_unlock(inode);
+               return ret;
+       }
+
        locked_end = alloc_end - 1;
        while (1) {
                struct btrfs_ordered_extent *ordered;
@@ -2988,37 +3221,18 @@ static long btrfs_fallocate(struct file *file, int mode,
        if (ret < 0)
                goto out_unlock;
 
-       if (actual_end > inode->i_size &&
-           !(mode & FALLOC_FL_KEEP_SIZE)) {
-               struct btrfs_trans_handle *trans;
-               struct btrfs_root *root = BTRFS_I(inode)->root;
-
-               /*
-                * We didn't need to allocate any more space, but we
-                * still extended the size of the file so we need to
-                * update i_size and the inode item.
-                */
-               trans = btrfs_start_transaction(root, 1);
-               if (IS_ERR(trans)) {
-                       ret = PTR_ERR(trans);
-               } else {
-                       inode->i_ctime = current_time(inode);
-                       i_size_write(inode, actual_end);
-                       btrfs_ordered_update_i_size(inode, actual_end, NULL);
-                       ret = btrfs_update_inode(trans, root, inode);
-                       if (ret)
-                               btrfs_end_transaction(trans);
-                       else
-                               ret = btrfs_end_transaction(trans);
-               }
-       }
+       /*
+        * We didn't need to allocate any more space, but we still extended the
+        * size of the file so we need to update i_size and the inode item.
+        */
+       ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
 out_unlock:
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
                             &cached_state);
 out:
        inode_unlock(inode);
        /* Let go of our reservation. */
-       if (ret != 0)
+       if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
                btrfs_free_reserved_data_space(inode, data_reserved,
                                alloc_start, alloc_end - cur_offset);
        extent_changeset_free(data_reserved);