ext4: reduce reserved cluster count by number of allocated clusters
author Eric Whitney <enwlinux@gmail.com>
Mon, 1 Oct 2018 18:24:08 +0000 (14:24 -0400)
committer Theodore Ts'o <tytso@mit.edu>
Mon, 1 Oct 2018 18:24:08 +0000 (14:24 -0400)
Ext4 does not always reduce the reserved cluster count by the number
of clusters allocated when mapping a delayed extent.  It sometimes
adds back one or more clusters after allocation if delalloc blocks
adjacent to the range allocated by ext4_ext_map_blocks() share the
clusters newly allocated for that range.  However, this overcounts
the number of clusters needed to satisfy future mapping requests
(holding one or more reservations for clusters that have already been
allocated) and premature ENOSPC and quota failures, etc., result.

Ext4 also does not reduce the reserved cluster count when allocating
clusters for non-delayed allocated writes that have previously been
reserved for delayed writes.  This also results in overcounts.

To make it possible to handle reserved cluster accounting for
fallocated regions in the same manner as used for other non-delayed
writes, do the reserved cluster accounting for them at the time of
allocation.  In the current code, this is only done later when a
delayed extent sharing the fallocated region is finally mapped.

Address Jan Kara's comment, from his review of the RFC version of this
patch, correcting the handling of an unsigned long long constant.

Signed-off-by: Eric Whitney <enwlinux@gmail.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
fs/ext4/extents.c
fs/ext4/extents_status.c
fs/ext4/extents_status.h

index 26481e543312bac2fb760d1faf7ab9f7b79a17ec..b52ac813ca2078846825fc2a86beb25f22f6f56a 100644 (file)
@@ -3819,83 +3819,6 @@ out:
        return ext4_mark_inode_dirty(handle, inode);
 }
 
-/**
- * Determines how many complete clusters (out of those specified by the 'map')
- * are under delalloc and were reserved quota for.
- * This function is called when we are writing out the blocks that were
- * originally written with their allocation delayed, but then the space was
- * allocated using fallocate() before the delayed allocation could be resolved.
- * The cases to look for are:
- * ('=' indicated delayed allocated blocks
- *  '-' indicates non-delayed allocated blocks)
- * (a) partial clusters towards beginning and/or end outside of allocated range
- *     are not delalloc'ed.
- *     Ex:
- *     |----c---=|====c====|====c====|===-c----|
- *              |++++++ allocated ++++++|
- *     ==> 4 complete clusters in above example
- *
- * (b) partial cluster (outside of allocated range) towards either end is
- *     marked for delayed allocation. In this case, we will exclude that
- *     cluster.
- *     Ex:
- *     |----====c========|========c========|
- *          |++++++ allocated ++++++|
- *     ==> 1 complete clusters in above example
- *
- *     Ex:
- *     |================c================|
- *            |++++++ allocated ++++++|
- *     ==> 0 complete clusters in above example
- *
- * The ext4_da_update_reserve_space will be called only if we
- * determine here that there were some "entire" clusters that span
- * this 'allocated' range.
- * In the non-bigalloc case, this function will just end up returning num_blks
- * without ever calling ext4_find_delalloc_range.
- */
-static unsigned int
-get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
-                          unsigned int num_blks)
-{
-       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-       ext4_lblk_t alloc_cluster_start, alloc_cluster_end;
-       ext4_lblk_t lblk_from, lblk_to, c_offset;
-       unsigned int allocated_clusters = 0;
-
-       alloc_cluster_start = EXT4_B2C(sbi, lblk_start);
-       alloc_cluster_end = EXT4_B2C(sbi, lblk_start + num_blks - 1);
-
-       /* max possible clusters for this allocation */
-       allocated_clusters = alloc_cluster_end - alloc_cluster_start + 1;
-
-       trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
-
-       /* Check towards left side */
-       c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
-       if (c_offset) {
-               lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
-               lblk_to = lblk_from + c_offset - 1;
-
-               if (ext4_es_scan_range(inode, &ext4_es_is_delayed, lblk_from,
-                                      lblk_to))
-                       allocated_clusters--;
-       }
-
-       /* Now check towards right. */
-       c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
-       if (allocated_clusters && c_offset) {
-               lblk_from = lblk_start + num_blks;
-               lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
-
-               if (ext4_es_scan_range(inode, &ext4_es_is_delayed, lblk_from,
-                                      lblk_to))
-                       allocated_clusters--;
-       }
-
-       return allocated_clusters;
-}
-
 static int
 convert_initialized_extent(handle_t *handle, struct inode *inode,
                           struct ext4_map_blocks *map,
@@ -4077,23 +4000,6 @@ out:
        }
        map->m_len = allocated;
 
-       /*
-        * If we have done fallocate with the offset that is already
-        * delayed allocated, we would have block reservation
-        * and quota reservation done in the delayed write path.
-        * But fallocate would have already updated quota and block
-        * count for this offset. So cancel these reservation
-        */
-       if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
-               unsigned int reserved_clusters;
-               reserved_clusters = get_reserved_cluster_alloc(inode,
-                               map->m_lblk, map->m_len);
-               if (reserved_clusters)
-                       ext4_da_update_reserve_space(inode,
-                                                    reserved_clusters,
-                                                    0);
-       }
-
 map_out:
        map->m_flags |= EXT4_MAP_MAPPED;
        if ((flags & EXT4_GET_BLOCKS_KEEP_SIZE) == 0) {
@@ -4482,77 +4388,39 @@ got_allocated_blocks:
        map->m_flags |= EXT4_MAP_NEW;
 
        /*
-        * Update reserved blocks/metadata blocks after successful
-        * block allocation which had been deferred till now.
+        * Reduce the reserved cluster count to reflect successful deferred
+        * allocation of delayed allocated clusters or direct allocation of
+        * clusters discovered to be delayed allocated.  Once allocated, a
+        * cluster is not included in the reserved count.
         */
-       if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
-               unsigned int reserved_clusters;
-               /*
-                * Check how many clusters we had reserved this allocated range
-                */
-               reserved_clusters = get_reserved_cluster_alloc(inode,
-                                               map->m_lblk, allocated);
-               if (!map_from_cluster) {
-                       BUG_ON(allocated_clusters < reserved_clusters);
-                       if (reserved_clusters < allocated_clusters) {
-                               struct ext4_inode_info *ei = EXT4_I(inode);
-                               int reservation = allocated_clusters -
-                                                 reserved_clusters;
-                               /*
-                                * It seems we claimed few clusters outside of
-                                * the range of this allocation. We should give
-                                * it back to the reservation pool. This can
-                                * happen in the following case:
-                                *
-                                * * Suppose s_cluster_ratio is 4 (i.e., each
-                                *   cluster has 4 blocks. Thus, the clusters
-                                *   are [0-3],[4-7],[8-11]...
-                                * * First comes delayed allocation write for
-                                *   logical blocks 10 & 11. Since there were no
-                                *   previous delayed allocated blocks in the
-                                *   range [8-11], we would reserve 1 cluster
-                                *   for this write.
-                                * * Next comes write for logical blocks 3 to 8.
-                                *   In this case, we will reserve 2 clusters
-                                *   (for [0-3] and [4-7]; and not for [8-11] as
-                                *   that range has a delayed allocated blocks.
-                                *   Thus total reserved clusters now becomes 3.
-                                * * Now, during the delayed allocation writeout
-                                *   time, we will first write blocks [3-8] and
-                                *   allocate 3 clusters for writing these
-                                *   blocks. Also, we would claim all these
-                                *   three clusters above.
-                                * * Now when we come here to writeout the
-                                *   blocks [10-11], we would expect to claim
-                                *   the reservation of 1 cluster we had made
-                                *   (and we would claim it since there are no
-                                *   more delayed allocated blocks in the range
-                                *   [8-11]. But our reserved cluster count had
-                                *   already gone to 0.
-                                *
-                                *   Thus, at the step 4 above when we determine
-                                *   that there are still some unwritten delayed
-                                *   allocated blocks outside of our current
-                                *   block range, we should increment the
-                                *   reserved clusters count so that when the
-                                *   remaining blocks finally gets written, we
-                                *   could claim them.
-                                */
-                               dquot_reserve_block(inode,
-                                               EXT4_C2B(sbi, reservation));
-                               spin_lock(&ei->i_block_reservation_lock);
-                               ei->i_reserved_data_blocks += reservation;
-                               spin_unlock(&ei->i_block_reservation_lock);
-                       }
+       if (test_opt(inode->i_sb, DELALLOC) && !map_from_cluster) {
+               if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
                        /*
-                        * We will claim quota for all newly allocated blocks.
-                        * We're updating the reserved space *after* the
-                        * correction above so we do not accidentally free
-                        * all the metadata reservation because we might
-                        * actually need it later on.
+                        * When allocating delayed allocated clusters, simply
+                        * reduce the reserved cluster count and claim quota
                         */
                        ext4_da_update_reserve_space(inode, allocated_clusters,
                                                        1);
+               } else {
+                       ext4_lblk_t lblk, len;
+                       unsigned int n;
+
+                       /*
+                        * When allocating non-delayed allocated clusters
+                        * (from fallocate, filemap, DIO, or clusters
+                        * allocated when delalloc has been disabled by
+                        * ext4_nonda_switch), reduce the reserved cluster
+                        * count by the number of allocated clusters that
+                        * have previously been delayed allocated.  Quota
+                        * has been claimed by ext4_mb_new_blocks() above,
+                        * so release the quota reservations made for any
+                        * previously delayed allocated clusters.
+                        */
+                       lblk = EXT4_LBLK_CMASK(sbi, map->m_lblk);
+                       len = allocated_clusters << sbi->s_cluster_bits;
+                       n = ext4_es_delayed_clu(inode, lblk, len);
+                       if (n > 0)
+                               ext4_da_update_reserve_space(inode, (int) n, 0);
                }
        }
 
index c5d456e12062a438c9db7d6a7fcd00b233f85b94..c92fbf444d0878110dce8ea200251172b56701f9 100644 (file)
@@ -150,6 +150,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
 static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
                       struct ext4_inode_info *locked_ei);
+static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+                            ext4_lblk_t len);
 
 int __init ext4_init_es(void)
 {
@@ -808,6 +810,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
        struct extent_status newes;
        ext4_lblk_t end = lblk + len - 1;
        int err = 0;
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 
        es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
                 lblk, len, pblk, status, inode->i_ino);
@@ -844,6 +847,11 @@ retry:
        if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
                err = 0;
 
+       if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
+           (status & EXTENT_STATUS_WRITTEN ||
+            status & EXTENT_STATUS_UNWRITTEN))
+               __revise_pending(inode, lblk, len);
+
 error:
        write_unlock(&EXT4_I(inode)->i_es_lock);
 
@@ -1605,3 +1613,170 @@ error:
 
        return err;
 }
+
+/*
+ * __es_delayed_clu - count number of clusters containing blocks that
+ *                    are delayed only
+ *
+ * @inode - file containing block range
+ * @start - logical block defining start of range
+ * @end - logical block defining end of range
+ *
+ * Returns the number of clusters holding at least one delayed-only
+ * (delayed and not unwritten) block in the inclusive range @start..@end.
+ * A cluster only partly covered by the range counts as a whole cluster,
+ * and a cluster shared by consecutive delayed-only extents counts once.
+ */
+static unsigned int __es_delayed_clu(struct inode *inode, ext4_lblk_t start,
+                                    ext4_lblk_t end)
+{
+       struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
+       struct extent_status *es;
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       struct rb_node *node;
+       ext4_lblk_t first_lclu, last_lclu;
+       unsigned long long last_counted_lclu;
+       unsigned int n = 0;
+
+       /* guaranteed to be unequal to any ext4_lblk_t value */
+       last_counted_lclu = ~0ULL;
+
+       es = __es_tree_search(&tree->root, start);
+
+       while (es && (es->es_lblk <= end)) {
+               if (ext4_es_is_delonly(es)) {
+                       if (es->es_lblk <= start)
+                               first_lclu = EXT4_B2C(sbi, start);
+                       else
+                               first_lclu = EXT4_B2C(sbi, es->es_lblk);
+
+                       if (ext4_es_end(es) >= end)
+                               last_lclu = EXT4_B2C(sbi, end);
+                       else
+                               last_lclu = EXT4_B2C(sbi, ext4_es_end(es));
+
+                       if (first_lclu == last_counted_lclu)
+                               n += last_lclu - first_lclu;
+                       else
+                               n += last_lclu - first_lclu + 1;
+                       last_counted_lclu = last_lclu;
+               }
+               node = rb_next(&es->rb_node);
+               if (!node)
+                       break;
+               es = rb_entry(node, struct extent_status, rb_node);
+       }
+
+       return n;
+}
+
+/*
+ * ext4_es_delayed_clu - count number of clusters containing blocks that
+ *                       are delayed only
+ *
+ * @inode - file containing block range
+ * @lblk - logical block defining start of range
+ * @len - number of blocks in range
+ *
+ * Read-locks i_es_lock around __es_delayed_clu(); returns 0 if @len is 0.
+ */
+unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
+                                ext4_lblk_t len)
+{
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       ext4_lblk_t end;
+       unsigned int n;
+
+       if (len == 0)
+               return 0;
+
+       end = lblk + len - 1;
+       WARN_ON(end < lblk);
+
+       read_lock(&ei->i_es_lock);
+
+       n = __es_delayed_clu(inode, lblk, end);
+
+       read_unlock(&ei->i_es_lock);
+
+       return n;
+}
+
+/*
+ * __revise_pending - makes, cancels, or leaves unchanged pending cluster
+ *                    reservations for a specified block range depending
+ *                    upon the presence or absence of delayed blocks
+ *                    outside the range within clusters at the ends of the
+ *                    range
+ *
+ * @inode - file containing the range
+ * @lblk - logical block defining the start of range
+ * @len  - length of range in blocks; nothing is done when it is 0
+ *
+ * Used after a newly allocated extent is added to the extents status tree.
+ * Requires that the extents in the range have either written or unwritten
+ * status.  Must be called while holding i_es_lock.
+ */
+static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+                            ext4_lblk_t len)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       ext4_lblk_t end = lblk + len - 1;
+       ext4_lblk_t first, last;
+       bool f_del = false, l_del = false;
+
+       if (len == 0)
+               return;
+
+       /*
+        * Two cases - block range within single cluster and block range
+        * spanning two or more clusters.  Note that a cluster belonging
+        * to a range starting and/or ending on a cluster boundary is treated
+        * as if it does not contain a delayed extent.  The new range may
+        * have allocated space for previously delayed blocks out to the
+        * cluster boundary, requiring that any pre-existing pending
+        * reservation be canceled.  Because this code only looks at blocks
+        * outside the range, it should revise pending reservations
+        * correctly even if the extent represented by the range can't be
+        * inserted in the extents status tree due to ENOSPC.
+        */
+
+       if (EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) {
+               first = EXT4_LBLK_CMASK(sbi, lblk);
+               if (first != lblk)
+                       f_del = __es_scan_range(inode, &ext4_es_is_delonly,
+                                               first, lblk - 1);
+               if (f_del) {
+                       __insert_pending(inode, first);
+               } else {
+                       last = EXT4_LBLK_CMASK(sbi, end) +
+                              sbi->s_cluster_ratio - 1;
+                       if (last != end)
+                               l_del = __es_scan_range(inode,
+                                                       &ext4_es_is_delonly,
+                                                       end + 1, last);
+                       if (l_del)
+                               __insert_pending(inode, last);
+                       else
+                               __remove_pending(inode, last);
+               }
+       } else {
+               first = EXT4_LBLK_CMASK(sbi, lblk);
+               if (first != lblk)
+                       f_del = __es_scan_range(inode, &ext4_es_is_delonly,
+                                               first, lblk - 1);
+               if (f_del)
+                       __insert_pending(inode, first);
+               else
+                       __remove_pending(inode, first);
+
+               last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
+               if (last != end)
+                       l_del = __es_scan_range(inode, &ext4_es_is_delonly,
+                                               end + 1, last);
+               if (l_del)
+                       __insert_pending(inode, last);
+               else
+                       __remove_pending(inode, last);
+       }
+}
index 9d3c676ec623af3b6a8afa9b11acb6129b78b493..131a8b7df265ff1a8dd504ab104a77cb12fb4c77 100644 (file)
@@ -244,5 +244,9 @@ extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
 extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
 extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
                                        bool allocated);
+extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
+                                       ext4_lblk_t len);
+extern void ext4_es_remove_blks(struct inode *inode, ext4_lblk_t lblk,
+                               ext4_lblk_t len);
 
 #endif /* _EXT4_EXTENTS_STATUS_H */