The fs_lock array is used to block other operations (e.g. recovery) while a
checkpoint is in progress, and every other operation routine (besides checkpoint)
must acquire one of the fs_locks. This causes a serious problem: when too many
threads concurrently contend for the fs_locks, they block one another, which may
lead to performance problems, and this is not the behavior we want to see.
Although some optimization patches have been introduced to improve the usage of
fs_lock, the thorough solution is to replace the fs_lock array with a single
*rw_sem*. The checkpoint routine takes the write semaphore, and all other
operations take the read semaphore, so we can still block other ops (e.g. recovery)
while doing a checkpoint, but the other ops no longer disturb each other;
this avoids the problem described above completely.
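In outline, the locking model after this change looks as follows (a minimal
sketch of the helpers introduced by this patch; see the f2fs.h hunk below for
the real implementations):

	/* normal FS operation (write, truncate, create, ...): shared access */
	f2fs_lock_op(sbi);		/* down_read(&sbi->cp_rwsem) */
	err = do_write_data_page(page);
	f2fs_unlock_op(sbi);		/* up_read(&sbi->cp_rwsem) */

	/* checkpoint: exclusive access, blocks all the operations above */
	f2fs_lock_all(sbi);		/* down_write(&sbi->cp_rwsem) */
	/* ... flush dirty dents/nodes, write the checkpoint ... */
	f2fs_unlock_all(sbi);		/* up_write(), then wake_up_all(&sbi->cp_wait) */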
Because of a weakness of rw_sem, the above change may introduce a potential
problem: the checkpoint thread might get starved if other threads are intensively
taking the read semaphore for I/O (pointed out by Xu Jin).
In order to avoid this, a wait list is introduced: newly arriving read-semaphore
takers are parked on the wait list while the checkpoint thread is waiting for the
write semaphore, and they are woken up when the checkpoint thread gives up the
write semaphore.
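The gating scheme can be sketched in userspace with pthreads (illustrative only:
gate_mutex and the writer_waiting flag are stand-ins for checking
cp_rwsem.wait_list in the patch, and cp_wait plays the same role):

	#include <pthread.h>
	#include <stdbool.h>

	static pthread_rwlock_t cp_rwsem = PTHREAD_RWLOCK_INITIALIZER;
	static pthread_mutex_t gate_mutex = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t cp_wait = PTHREAD_COND_INITIALIZER;
	static bool writer_waiting;

	static void lock_op(void)		/* FS operation, reader side */
	{
		pthread_mutex_lock(&gate_mutex);
		while (writer_waiting)		/* park behind a waiting checkpoint */
			pthread_cond_wait(&cp_wait, &gate_mutex);
		pthread_mutex_unlock(&gate_mutex);
		pthread_rwlock_rdlock(&cp_rwsem);
	}

	static void unlock_op(void)
	{
		pthread_rwlock_unlock(&cp_rwsem);
	}

	static void lock_all(void)		/* checkpoint, writer side */
	{
		pthread_mutex_lock(&gate_mutex);
		writer_waiting = true;		/* stop admitting new readers */
		pthread_mutex_unlock(&gate_mutex);
		pthread_rwlock_wrlock(&cp_rwsem);
	}

	static void unlock_all(void)
	{
		pthread_rwlock_unlock(&cp_rwsem);
		pthread_mutex_lock(&gate_mutex);
		writer_waiting = false;
		pthread_cond_broadcast(&cp_wait);	/* wake all parked readers */
		pthread_mutex_unlock(&gate_mutex);
	}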
Thanks to Kim for the previous review and testing; I will be very glad to see
performance test results for this patch from others.
V2:
- fix the potential starvation problem.
- use more suitable function names, as suggested by Xu Jin.
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
[Jaegeuk Kim: adjust minor coding standard]
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
blk_start_plug(&plug);
retry_flush_dents:
- mutex_lock_all(sbi);
-
+ f2fs_lock_all(sbi);
/* write all the dirty dentry pages */
if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
- mutex_unlock_all(sbi);
+ f2fs_unlock_all(sbi);
sync_dirty_dir_inodes(sbi);
goto retry_flush_dents;
}
static void unblock_operations(struct f2fs_sb_info *sbi)
{
mutex_unlock(&sbi->node_write);
- mutex_unlock_all(sbi);
+ f2fs_unlock_all(sbi);
}
static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount)
inode_dec_dirty_dents(inode);
err = do_write_data_page(page);
} else {
- int ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
err = do_write_data_page(page);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
need_balance_fs = true;
}
if (err == -ENOENT)
pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
struct dnode_of_data dn;
int err = 0;
- int ilock;
f2fs_balance_fs(sbi);
repeat:
return -ENOMEM;
*pagep = page;
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, index, ALLOC_NODE);
if (err)
goto err;
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
return 0;
return 0;
err:
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
f2fs_put_page(page, 1);
return err;
}
#include <linux/crc32.h>
#include <linux/magic.h>
#include <linux/kobject.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
/*
* For mount options
NR_COUNT_TYPE,
};
-/*
- * Uses as sbi->fs_lock[NR_GLOBAL_LOCKS].
- * The checkpoint procedure blocks all the locks in this fs_lock array.
- * Some FS operations grab free locks, and if there is no free lock,
- * then wait to grab a lock in a round-robin manner.
- */
-#define NR_GLOBAL_LOCKS 8
-
/*
* The below are the page types of bios used in submti_bio().
* The available types are:
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
struct inode *meta_inode; /* cache meta blocks */
struct mutex cp_mutex; /* checkpoint procedure lock */
- struct mutex fs_lock[NR_GLOBAL_LOCKS]; /* blocking FS operations */
+ struct rw_semaphore cp_rwsem; /* blocking FS operations */
+ wait_queue_head_t cp_wait; /* checkpoint wait queue */
struct mutex node_write; /* locking node writes */
struct mutex writepages; /* mutex for writepages() */
- unsigned char next_lock_num; /* round-robin global locks */
int por_doing; /* recovery is doing or not */
int on_build_free_nids; /* build_free_nids is doing */
cp->ckpt_flags = cpu_to_le32(ckpt_flags);
}
-static inline void mutex_lock_all(struct f2fs_sb_info *sbi)
+static inline void f2fs_lock_op(struct f2fs_sb_info *sbi)
{
- int i;
-
- for (i = 0; i < NR_GLOBAL_LOCKS; i++) {
- /*
- * This is the only time we take multiple fs_lock[]
- * instances; the order is immaterial since we
- * always hold cp_mutex, which serializes multiple
- * such operations.
- */
- mutex_lock_nest_lock(&sbi->fs_lock[i], &sbi->cp_mutex);
- }
+ /*
+	 * If the checkpoint thread is waiting for cp_rwsem, add the current
+	 * task to the wait list to avoid starving the checkpoint thread
+ */
+ while (!list_empty(&sbi->cp_rwsem.wait_list))
+ wait_event_interruptible(sbi->cp_wait,
+ list_empty(&sbi->cp_rwsem.wait_list));
+ down_read(&sbi->cp_rwsem);
}
-static inline void mutex_unlock_all(struct f2fs_sb_info *sbi)
+static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi)
{
- int i = 0;
- for (; i < NR_GLOBAL_LOCKS; i++)
- mutex_unlock(&sbi->fs_lock[i]);
+ up_read(&sbi->cp_rwsem);
}
-static inline int mutex_lock_op(struct f2fs_sb_info *sbi)
+static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
{
- unsigned char next_lock;
- int i = 0;
-
- for (; i < NR_GLOBAL_LOCKS; i++)
- if (mutex_trylock(&sbi->fs_lock[i]))
- return i;
-
- next_lock = sbi->next_lock_num++ % NR_GLOBAL_LOCKS;
- mutex_lock(&sbi->fs_lock[next_lock]);
- return next_lock;
+ down_write_nest_lock(&sbi->cp_rwsem, &sbi->cp_mutex);
}
-static inline void mutex_unlock_op(struct f2fs_sb_info *sbi, int ilock)
+static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
{
- if (ilock < 0)
- return;
- BUG_ON(ilock >= NR_GLOBAL_LOCKS);
- mutex_unlock(&sbi->fs_lock[ilock]);
+ up_write(&sbi->cp_rwsem);
+
+ /* wake up all tasks blocked by checkpoint */
+ wake_up_all(&sbi->cp_wait);
}
/*
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
block_t old_blk_addr;
struct dnode_of_data dn;
- int err, ilock;
+ int err;
f2fs_balance_fs(sbi);
sb_start_pagefault(inode->i_sb);
/* block allocation */
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, page->index, ALLOC_NODE);
if (err) {
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
goto out;
}
err = reserve_new_block(&dn);
if (err) {
f2fs_put_dnode(&dn);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
goto out;
}
}
f2fs_put_dnode(&dn);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
file_update_time(vma->vm_file);
lock_page(page);
unsigned int blocksize = inode->i_sb->s_blocksize;
struct dnode_of_data dn;
pgoff_t free_from;
- int count = 0, ilock = -1;
+ int count = 0;
int err;
trace_f2fs_truncate_blocks_enter(inode, from);
free_from = (pgoff_t)
((from + blocksize - 1) >> (sbi->log_blocksize));
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE);
if (err) {
if (err == -ENOENT)
goto free_next;
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
trace_f2fs_truncate_blocks_exit(inode, err);
return err;
}
f2fs_put_dnode(&dn);
free_next:
err = truncate_inode_blocks(inode, free_from);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
/* lastly zero out the first data page */
truncate_partial_data_page(inode, from);
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
struct page *page;
- int ilock;
if (!len)
return;
f2fs_balance_fs(sbi);
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
page = get_new_data_page(inode, NULL, index, false);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
if (!IS_ERR(page)) {
wait_on_page_writeback(page);
struct address_space *mapping = inode->i_mapping;
loff_t blk_start, blk_end;
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- int ilock;
f2fs_balance_fs(sbi);
truncate_inode_pages_range(mapping, blk_start,
blk_end - 1);
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
ret = truncate_hole(inode, pg_start, pg_end);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
}
}
for (index = pg_start; index <= pg_end; index++) {
struct dnode_of_data dn;
- int ilock;
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
ret = get_dnode_of_data(&dn, index, ALLOC_NODE);
if (ret) {
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
break;
}
ret = reserve_new_block(&dn);
if (ret) {
f2fs_put_dnode(&dn);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
break;
}
}
f2fs_put_dnode(&dn);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
if (pg_start == pg_end)
new_size = offset + len;
int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- int ret, ilock;
+ int ret;
if (inode->i_ino == F2FS_NODE_INO(sbi) ||
inode->i_ino == F2FS_META_INO(sbi))
* We need to lock here to prevent from producing dirty node pages
* during the urgent cleaning time when runing out of free sections.
*/
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
ret = update_inode_page(inode);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
if (wbc)
f2fs_balance_fs(sbi);
void f2fs_evict_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- int ilock;
trace_f2fs_evict_inode(inode);
truncate_inode_pages(&inode->i_data, 0);
if (F2FS_HAS_BLOCKS(inode))
f2fs_truncate(inode);
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
remove_inode_page(inode);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
sb_end_intwrite(inode->i_sb);
no_delete:
nid_t ino;
struct inode *inode;
bool nid_free = false;
- int err, ilock;
+ int err;
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
if (!alloc_nid(sbi, &ino)) {
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
err = -ENOSPC;
goto fail;
}
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
inode->i_uid = current_fsuid();
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct inode *inode;
nid_t ino = 0;
- int err, ilock;
+ int err;
f2fs_balance_fs(sbi);
inode->i_mapping->a_ops = &f2fs_dblock_aops;
ino = inode->i_ino;
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
if (err)
goto out;
struct inode *inode = old_dentry->d_inode;
struct super_block *sb = dir->i_sb;
struct f2fs_sb_info *sbi = F2FS_SB(sb);
- int err, ilock;
+ int err;
f2fs_balance_fs(sbi);
ihold(inode);
set_inode_flag(F2FS_I(inode), FI_INC_LINK);
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
if (err)
goto out;
struct f2fs_dir_entry *de;
struct page *page;
int err = -ENOENT;
- int ilock;
trace_f2fs_unlink_enter(dir, dentry);
f2fs_balance_fs(sbi);
goto fail;
}
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
f2fs_delete_entry(de, page, inode);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
/* In order to evict this inode, we set it dirty */
mark_inode_dirty(inode);
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct inode *inode;
size_t symlen = strlen(symname) + 1;
- int err, ilock;
+ int err;
f2fs_balance_fs(sbi);
inode->i_op = &f2fs_symlink_inode_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
if (err)
goto out;
{
struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb);
struct inode *inode;
- int err, ilock;
+ int err;
f2fs_balance_fs(sbi);
mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
set_inode_flag(F2FS_I(inode), FI_INC_LINK);
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
if (err)
goto out_fail;
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct inode *inode;
int err = 0;
- int ilock;
if (!new_valid_dev(rdev))
return -EINVAL;
init_special_inode(inode, inode->i_mode, rdev);
inode->i_op = &f2fs_special_inode_operations;
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
err = f2fs_add_link(dentry, inode);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
if (err)
goto out;
struct f2fs_dir_entry *old_dir_entry = NULL;
struct f2fs_dir_entry *old_entry;
struct f2fs_dir_entry *new_entry;
- int err = -ENOENT, ilock = -1;
+ int err = -ENOENT;
f2fs_balance_fs(sbi);
goto out_old;
}
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
if (new_inode) {
update_inode_page(old_dir);
}
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
return 0;
put_out_dir:
kunmap(old_dir_page);
f2fs_put_page(old_dir_page, 0);
}
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
out_old:
kunmap(old_page);
f2fs_put_page(old_page, 0);
struct f2fs_summary sum;
struct node_info ni;
int err = 0, recovered = 0;
- int ilock;
start = start_bidx_of_node(ofs_of_node(page), fi);
if (IS_INODE(page))
else
end = start + ADDRS_PER_BLOCK;
- ilock = mutex_lock_op(sbi);
+ f2fs_lock_op(sbi);
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, start, ALLOC_NODE);
if (err) {
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
return err;
}
recover_node_page(sbi, dn.node_page, &sum, &ni, blkaddr);
err:
f2fs_put_dnode(&dn);
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
f2fs_msg(sbi->sb, KERN_NOTICE, "recover_data: ino = %lx, "
"recovered_data = %d blocks, err = %d",
struct buffer_head *raw_super_buf;
struct inode *root;
long err = -EINVAL;
- int i;
/* allocate memory for f2fs-specific super block info */
sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL);
mutex_init(&sbi->gc_mutex);
mutex_init(&sbi->writepages);
mutex_init(&sbi->cp_mutex);
- for (i = 0; i < NR_GLOBAL_LOCKS; i++)
- mutex_init(&sbi->fs_lock[i]);
mutex_init(&sbi->node_write);
sbi->por_doing = 0;
spin_lock_init(&sbi->stat_lock);
init_rwsem(&sbi->bio_sem);
+ init_rwsem(&sbi->cp_rwsem);
+ init_waitqueue_head(&sbi->cp_wait);
init_sb_info(sbi);
/* get an inode for meta space */
const void *value, size_t value_len, struct page *ipage)
{
struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
- int ilock;
int err;
f2fs_balance_fs(sbi);
- ilock = mutex_lock_op(sbi);
-
+ f2fs_lock_op(sbi);
err = __f2fs_setxattr(inode, name_index, name, value, value_len, ipage);
-
- mutex_unlock_op(sbi, ilock);
+ f2fs_unlock_op(sbi);
return err;
}