atomic_inc(&root->log_batch);
full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
+
/*
- * We might have have had more pages made dirty after calling
- * start_ordered_ops and before acquiring the inode's i_mutex.
+ * We have to do this here to avoid the priority inversion of waiting on
+ * IO of a lower priority task while holding a transaciton open.
*/
- if (full_sync) {
- /*
- * For a full sync, we need to make sure any ordered operations
- * start and finish before we start logging the inode, so that
- * all extents are persisted and the respective file extent
- * items are in the fs/subvol btree.
- */
- ret = btrfs_wait_ordered_range(inode, start, len);
- } else {
- /*
- * Start any new ordered operations before starting to log the
- * inode. We will wait for them to finish in btrfs_sync_log().
- *
- * Right before acquiring the inode's mutex, we might have new
- * writes dirtying pages, which won't immediately start the
- * respective ordered operations - that is done through the
- * fill_delalloc callbacks invoked from the writepage and
- * writepages address space operations. So make sure we start
- * all ordered operations before starting to log our inode. Not
- * doing this means that while logging the inode, writeback
- * could start and invoke writepage/writepages, which would call
- * the fill_delalloc callbacks (cow_file_range,
- * submit_compressed_extents). These callbacks add first an
- * extent map to the modified list of extents and then create
- * the respective ordered operation, which means in
- * tree-log.c:btrfs_log_inode() we might capture all existing
- * ordered operations (with btrfs_get_logged_extents()) before
- * the fill_delalloc callback adds its ordered operation, and by
- * the time we visit the modified list of extent maps (with
- * btrfs_log_changed_extents()), we see and process the extent
- * map they created. We then use the extent map to construct a
- * file extent item for logging without waiting for the
- * respective ordered operation to finish - this file extent
- * item points to a disk location that might not have yet been
- * written to, containing random data - so after a crash a log
- * replay will make our inode have file extent items that point
- * to disk locations containing invalid data, as we returned
- * success to userspace without waiting for the respective
- * ordered operation to finish, because it wasn't captured by
- * btrfs_get_logged_extents().
- */
- ret = start_ordered_ops(inode, start, end);
- }
+ ret = btrfs_wait_ordered_range(inode, start, len);
if (ret) {
inode_unlock(inode);
goto out;
goto out;
}
}
- if (!full_sync) {
- ret = btrfs_wait_ordered_range(inode, start, len);
- if (ret) {
- btrfs_end_transaction(trans);
- goto out;
- }
- }
ret = btrfs_commit_transaction(trans);
} else {
ret = btrfs_end_transaction(trans);