ext4: Transfer initialized block to right neighbor if possible
authorLukas Czerner <lczerner@redhat.com>
Thu, 4 Apr 2013 03:33:27 +0000 (23:33 -0400)
committerTheodore Ts'o <tytso@mit.edu>
Thu, 4 Apr 2013 03:33:27 +0000 (23:33 -0400)
Currently when converting extent to initialized we attempt to transfer
initialized block to the left neighbour if possible when certain
criteria are met. However we do not attempt to do the same for the
right neighbor.

This commit adds the possibility to transfer initialized block to the
right neighbour if:

1. We're not converting the whole extent
2. Both extents are stored in the same extent tree node
3. Right neighbor is initialized
4. Right neighbor is logically abutting the current one
5. Right neighbor is physically abutting the current one
6. Right neighbor would not overflow the length limit

This is basically the same logic as with transferring to the left. This
will gain us some performance benefits since it is faster than inserting
extent and then merging it.

It would also prevent some situation in delalloc patch when we might run
out of metadata reservation. This is due to the fact that we would
attempt to split the extent first (possibly allocating new metadata
block) even though we did not counted for that because it can (and will)
be merged again. This commit fix that scenario, because we no longer
need to split the extent in such case.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
fs/ext4/extents.c

index 23524671907442222b47a1a1373c179e88f7a255..892662334be9e998c85cc2b202df408eb02d741a 100644 (file)
@@ -3153,29 +3153,28 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
        struct ext4_extent_header *eh;
        struct ext4_map_blocks split_map;
        struct ext4_extent zero_ex;
-       struct ext4_extent *ex;
+       struct ext4_extent *ex, *abut_ex;
        ext4_lblk_t ee_block, eof_block;
-       unsigned int ee_len, depth;
-       int allocated, max_zeroout = 0;
+       unsigned int ee_len, depth, map_len = map->m_len;
+       int allocated = 0, max_zeroout = 0;
        int err = 0;
        int split_flag = 0;
 
        ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
                "block %llu, max_blocks %u\n", inode->i_ino,
-               (unsigned long long)map->m_lblk, map->m_len);
+               (unsigned long long)map->m_lblk, map_len);
 
        sbi = EXT4_SB(inode->i_sb);
        eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
                inode->i_sb->s_blocksize_bits;
-       if (eof_block < map->m_lblk + map->m_len)
-               eof_block = map->m_lblk + map->m_len;
+       if (eof_block < map->m_lblk + map_len)
+               eof_block = map->m_lblk + map_len;
 
        depth = ext_depth(inode);
        eh = path[depth].p_hdr;
        ex = path[depth].p_ext;
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
-       allocated = ee_len - (map->m_lblk - ee_block);
        zero_ex.ee_len = 0;
 
        trace_ext4_ext_convert_to_initialized_enter(inode, map, ex);
@@ -3186,77 +3185,121 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 
        /*
         * Attempt to transfer newly initialized blocks from the currently
-        * uninitialized extent to its left neighbor. This is much cheaper
+        * uninitialized extent to its neighbor. This is much cheaper
         * than an insertion followed by a merge as those involve costly
-        * memmove() calls. This is the common case in steady state for
-        * workloads doing fallocate(FALLOC_FL_KEEP_SIZE) followed by append
-        * writes.
+        * memmove() calls. Transferring to the left is the common case in
+        * steady state for workloads doing fallocate(FALLOC_FL_KEEP_SIZE)
+        * followed by append writes.
         *
         * Limitations of the current logic:
-        *  - L1: we only deal with writes at the start of the extent.
-        *    The approach could be extended to writes at the end
-        *    of the extent but this scenario was deemed less common.
-        *  - L2: we do not deal with writes covering the whole extent.
+        *  - L1: we do not deal with writes covering the whole extent.
         *    This would require removing the extent if the transfer
         *    is possible.
-        *  - L3: we only attempt to merge with an extent stored in the
+        *  - L2: we only attempt to merge with an extent stored in the
         *    same extent tree node.
         */
-       if ((map->m_lblk == ee_block) &&        /*L1*/
-               (map->m_len < ee_len) &&        /*L2*/
-               (ex > EXT_FIRST_EXTENT(eh))) {  /*L3*/
-               struct ext4_extent *prev_ex;
+       if ((map->m_lblk == ee_block) &&
+               /* See if we can merge left */
+               (map_len < ee_len) &&           /*L1*/
+               (ex > EXT_FIRST_EXTENT(eh))) {  /*L2*/
                ext4_lblk_t prev_lblk;
                ext4_fsblk_t prev_pblk, ee_pblk;
-               unsigned int prev_len, write_len;
+               unsigned int prev_len;
 
-               prev_ex = ex - 1;
-               prev_lblk = le32_to_cpu(prev_ex->ee_block);
-               prev_len = ext4_ext_get_actual_len(prev_ex);
-               prev_pblk = ext4_ext_pblock(prev_ex);
+               abut_ex = ex - 1;
+               prev_lblk = le32_to_cpu(abut_ex->ee_block);
+               prev_len = ext4_ext_get_actual_len(abut_ex);
+               prev_pblk = ext4_ext_pblock(abut_ex);
                ee_pblk = ext4_ext_pblock(ex);
-               write_len = map->m_len;
 
                /*
-                * A transfer of blocks from 'ex' to 'prev_ex' is allowed
+                * A transfer of blocks from 'ex' to 'abut_ex' is allowed
                 * upon those conditions:
-                * - C1: prev_ex is initialized,
-                * - C2: prev_ex is logically abutting ex,
-                * - C3: prev_ex is physically abutting ex,
-                * - C4: prev_ex can receive the additional blocks without
+                * - C1: abut_ex is initialized,
+                * - C2: abut_ex is logically abutting ex,
+                * - C3: abut_ex is physically abutting ex,
+                * - C4: abut_ex can receive the additional blocks without
                 *   overflowing the (initialized) length limit.
                 */
-               if ((!ext4_ext_is_uninitialized(prev_ex)) &&            /*C1*/
+               if ((!ext4_ext_is_uninitialized(abut_ex)) &&            /*C1*/
                        ((prev_lblk + prev_len) == ee_block) &&         /*C2*/
                        ((prev_pblk + prev_len) == ee_pblk) &&          /*C3*/
-                       (prev_len < (EXT_INIT_MAX_LEN - write_len))) {  /*C4*/
+                       (prev_len < (EXT_INIT_MAX_LEN - map_len))) {    /*C4*/
                        err = ext4_ext_get_access(handle, inode, path + depth);
                        if (err)
                                goto out;
 
                        trace_ext4_ext_convert_to_initialized_fastpath(inode,
-                               map, ex, prev_ex);
+                               map, ex, abut_ex);
 
-                       /* Shift the start of ex by 'write_len' blocks */
-                       ex->ee_block = cpu_to_le32(ee_block + write_len);
-                       ext4_ext_store_pblock(ex, ee_pblk + write_len);
-                       ex->ee_len = cpu_to_le16(ee_len - write_len);
+                       /* Shift the start of ex by 'map_len' blocks */
+                       ex->ee_block = cpu_to_le32(ee_block + map_len);
+                       ext4_ext_store_pblock(ex, ee_pblk + map_len);
+                       ex->ee_len = cpu_to_le16(ee_len - map_len);
                        ext4_ext_mark_uninitialized(ex); /* Restore the flag */
 
-                       /* Extend prev_ex by 'write_len' blocks */
-                       prev_ex->ee_len = cpu_to_le16(prev_len + write_len);
+                       /* Extend abut_ex by 'map_len' blocks */
+                       abut_ex->ee_len = cpu_to_le16(prev_len + map_len);
 
-                       /* Mark the block containing both extents as dirty */
-                       ext4_ext_dirty(handle, inode, path + depth);
+                       /* Result: number of initialized blocks past m_lblk */
+                       allocated = map_len;
+               }
+       } else if (((map->m_lblk + map_len) == (ee_block + ee_len)) &&
+                  (map_len < ee_len) &&        /*L1*/
+                  ex < EXT_LAST_EXTENT(eh)) {  /*L2*/
+               /* See if we can merge right */
+               ext4_lblk_t next_lblk;
+               ext4_fsblk_t next_pblk, ee_pblk;
+               unsigned int next_len;
+
+               abut_ex = ex + 1;
+               next_lblk = le32_to_cpu(abut_ex->ee_block);
+               next_len = ext4_ext_get_actual_len(abut_ex);
+               next_pblk = ext4_ext_pblock(abut_ex);
+               ee_pblk = ext4_ext_pblock(ex);
 
-                       /* Update path to point to the right extent */
-                       path[depth].p_ext = prev_ex;
+               /*
+                * A transfer of blocks from 'ex' to 'abut_ex' is allowed
+                * upon those conditions:
+                * - C1: abut_ex is initialized,
+                * - C2: abut_ex is logically abutting ex,
+                * - C3: abut_ex is physically abutting ex,
+                * - C4: abut_ex can receive the additional blocks without
+                *   overflowing the (initialized) length limit.
+                */
+               if ((!ext4_ext_is_uninitialized(abut_ex)) &&            /*C1*/
+                   ((map->m_lblk + map_len) == next_lblk) &&           /*C2*/
+                   ((ee_pblk + ee_len) == next_pblk) &&                /*C3*/
+                   (next_len < (EXT_INIT_MAX_LEN - map_len))) {        /*C4*/
+                       err = ext4_ext_get_access(handle, inode, path + depth);
+                       if (err)
+                               goto out;
+
+                       trace_ext4_ext_convert_to_initialized_fastpath(inode,
+                               map, ex, abut_ex);
+
+                       /* Shift the start of abut_ex by 'map_len' blocks */
+                       abut_ex->ee_block = cpu_to_le32(next_lblk - map_len);
+                       ext4_ext_store_pblock(abut_ex, next_pblk - map_len);
+                       ex->ee_len = cpu_to_le16(ee_len - map_len);
+                       ext4_ext_mark_uninitialized(ex); /* Restore the flag */
+
+                       /* Extend abut_ex by 'map_len' blocks */
+                       abut_ex->ee_len = cpu_to_le16(next_len + map_len);
 
                        /* Result: number of initialized blocks past m_lblk */
-                       allocated = write_len;
-                       goto out;
+                       allocated = map_len;
                }
        }
+       if (allocated) {
+               /* Mark the block containing both extents as dirty */
+               ext4_ext_dirty(handle, inode, path + depth);
+
+               /* Update path to point to the right extent */
+               path[depth].p_ext = abut_ex;
+               goto out;
+       } else
+               allocated = ee_len - (map->m_lblk - ee_block);
 
        WARN_ON(map->m_lblk < ee_block);
        /*