ocfs2: shared writeable mmap
author Mark Fasheh <mark.fasheh@oracle.com>
Wed, 9 May 2007 22:16:19 +0000 (15:16 -0700)
committer Mark Fasheh <mark.fasheh@oracle.com>
Wed, 11 Jul 2007 00:31:51 +0000 (17:31 -0700)
Implement cluster consistent shared writeable mappings using the
->page_mkwrite() callback.

Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
fs/ocfs2/aops.c
fs/ocfs2/aops.h
fs/ocfs2/file.c
fs/ocfs2/mmap.c

index fc723fb9c981e2ac1dd75592441c807b342a42dd..b8869fd0884f595f4f9d36e82f7f042f146b817a 100644 (file)
@@ -1034,7 +1034,8 @@ out:
  */
 static int ocfs2_grab_pages_for_write(struct address_space *mapping,
                                      struct ocfs2_write_ctxt *wc,
-                                     u32 cpos, loff_t user_pos, int new)
+                                     u32 cpos, loff_t user_pos, int new,
+                                     struct page *mmap_page)
 {
        int ret = 0, i;
        unsigned long start, target_index, index;
@@ -1058,11 +1059,36 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
        for(i = 0; i < wc->w_num_pages; i++) {
                index = start + i;
 
-               wc->w_pages[i] = find_or_create_page(mapping, index, GFP_NOFS);
-               if (!wc->w_pages[i]) {
-                       ret = -ENOMEM;
-                       mlog_errno(ret);
-                       goto out;
+               if (index == target_index && mmap_page) {
+                       /*
+                        * ocfs2_pagemkwrite() is a little different
+                        * and wants us to directly use the page
+                        * passed in.
+                        */
+                       lock_page(mmap_page);
+
+                       if (mmap_page->mapping != mapping) {
+                               unlock_page(mmap_page);
+                               /*
+                                * Sanity check - the locking in
+                                * ocfs2_pagemkwrite() should ensure
+                                * that this code doesn't trigger.
+                                */
+                               ret = -EINVAL;
+                               mlog_errno(ret);
+                               goto out;
+                       }
+
+                       page_cache_get(mmap_page);
+                       wc->w_pages[i] = mmap_page;
+               } else {
+                       wc->w_pages[i] = find_or_create_page(mapping, index,
+                                                            GFP_NOFS);
+                       if (!wc->w_pages[i]) {
+                               ret = -ENOMEM;
+                               mlog_errno(ret);
+                               goto out;
+                       }
                }
 
                if (index == target_index)
@@ -1213,10 +1239,10 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
        }
 }
 
-static int ocfs2_write_begin_nolock(struct address_space *mapping,
-                                   loff_t pos, unsigned len, unsigned flags,
-                                   struct page **pagep, void **fsdata,
-                                   struct buffer_head *di_bh)
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+                            loff_t pos, unsigned len, unsigned flags,
+                            struct page **pagep, void **fsdata,
+                            struct buffer_head *di_bh, struct page *mmap_page)
 {
        int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS;
        unsigned int num_clusters = 0, clusters_to_alloc = 0;
@@ -1318,7 +1344,7 @@ static int ocfs2_write_begin_nolock(struct address_space *mapping,
         * extent.
         */
        ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
-                                        clusters_to_alloc);
+                                        clusters_to_alloc, mmap_page);
        if (ret) {
                mlog_errno(ret);
                goto out_commit;
@@ -1386,7 +1412,7 @@ int ocfs2_write_begin(struct file *file, struct address_space *mapping,
        }
 
        ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
-                                      fsdata, di_bh);
+                                      fsdata, di_bh, NULL);
        if (ret) {
                mlog_errno(ret);
                goto out_fail_data;
@@ -1407,9 +1433,9 @@ out_fail:
        return ret;
 }
 
-static int ocfs2_write_end_nolock(struct address_space *mapping,
-                                 loff_t pos, unsigned len, unsigned copied,
-                                 struct page *page, void *fsdata)
+int ocfs2_write_end_nolock(struct address_space *mapping,
+                          loff_t pos, unsigned len, unsigned copied,
+                          struct page *page, void *fsdata)
 {
        int i;
        unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
index bdcdd1ae63a9c418dd41255bd784d8d482e7ca18..389579bd64e372e8294858d63d5b0d913fee648a 100644 (file)
@@ -50,6 +50,15 @@ int ocfs2_write_end(struct file *file, struct address_space *mapping,
                    loff_t pos, unsigned len, unsigned copied,
                    struct page *page, void *fsdata);
 
+int ocfs2_write_end_nolock(struct address_space *mapping,
+                          loff_t pos, unsigned len, unsigned copied,
+                          struct page *page, void *fsdata);
+
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+                            loff_t pos, unsigned len, unsigned flags,
+                            struct page **pagep, void **fsdata,
+                            struct buffer_head *di_bh, struct page *mmap_page);
+
 /* all ocfs2_dio_end_io()'s fault */
 #define ocfs2_iocb_is_rw_locked(iocb) \
        test_bit(0, (unsigned long *)&iocb->private)
index 4c850d00c26975ee7dd6f165ba704e9b55cbb290..a80f31776d94beacc2383b53fe04dd7b1808077b 100644 (file)
@@ -1001,6 +1001,13 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
                goto bail_unlock;
        }
 
+       /*
+        * This will intentionally not wind up calling vmtruncate(),
+        * since all the work for a size change has been done above.
+        * Otherwise, we could get into problems with truncate as
+        * ip_alloc_sem is used there to protect against i_size
+        * changes.
+        */
        status = inode_setattr(inode, attr);
        if (status < 0) {
                mlog_errno(status);
index af01158b39f553fd7f04f7970ce0a88d4293953c..d79aa12137d205868bbc7d868132dd421203e9d0 100644 (file)
 
 #include "ocfs2.h"
 
+#include "aops.h"
 #include "dlmglue.h"
 #include "file.h"
 #include "inode.h"
 #include "mmap.h"
 
+static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
+{
+       /* The best way to deal with signals in the vm path is
+        * to block them all upfront, rather than allowing the
+        * locking paths to return -ERESTARTSYS. */
+       sigfillset(blocked);
+
+       /* The previous mask is saved in oldset for the caller to
+        * restore; sigprocmask should technically never fail. */
+       return sigprocmask(SIG_BLOCK, blocked, oldset);
+}
+
+static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
+{
+       return sigprocmask(SIG_SETMASK, oldset, NULL);
+}
+
 static struct page *ocfs2_nopage(struct vm_area_struct * area,
                                 unsigned long address,
                                 int *type)
@@ -53,14 +71,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area,
        mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address,
                   type);
 
-       /* The best way to deal with signals in this path is
-        * to block them upfront, rather than allowing the
-        * locking paths to return -ERESTARTSYS. */
-       sigfillset(&blocked);
-
-       /* We should technically never get a bad ret return
-        * from sigprocmask */
-       ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
+       ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
        if (ret < 0) {
                mlog_errno(ret);
                goto out;
@@ -68,7 +79,7 @@ static struct page *ocfs2_nopage(struct vm_area_struct * area,
 
        page = filemap_nopage(area, address, type);
 
-       ret = sigprocmask(SIG_SETMASK, &oldset, NULL);
+       ret = ocfs2_vm_op_unblock_sigs(&oldset);
        if (ret < 0)
                mlog_errno(ret);
 out:
@@ -76,28 +87,136 @@ out:
        return page;
 }
 
-static struct vm_operations_struct ocfs2_file_vm_ops = {
-       .nopage = ocfs2_nopage,
-};
+static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
+                               struct page *page)
+{
+       int ret;
+       struct address_space *mapping = inode->i_mapping;
+       loff_t pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
+       unsigned int len = PAGE_CACHE_SIZE;
+       pgoff_t last_index;
+       struct page *locked_page = NULL;
+       void *fsdata;
+       loff_t size = i_size_read(inode);
 
-int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
+       /*
+        * Another node might have truncated while we were waiting on
+        * cluster locks. pos is computed in loff_t, so no 32-bit wrap.
+        */
+       if (pos >= size) {
+               ret = -EINVAL;
+               goto out;
+       }
+       last_index = (size - 1) >> PAGE_CACHE_SHIFT;
+
+       /*
+        * The i_size check above doesn't catch the case where nodes
+        * truncated and then re-extended the file. We'll re-check the
+        * page mapping after taking the page lock inside of
+        * ocfs2_write_begin_nolock().
+        */
+       if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /*
+        * Call ocfs2_write_begin_nolock() and ocfs2_write_end_nolock()
+        * to take advantage of the allocation code there. We pass a
+        * write length of the whole page (chopped to i_size on the
+        * last page, never zero) so the whole thing is allocated.
+        *
+        * Since we know the page is up to date, we don't have to
+        * worry about ocfs2_write_begin() skipping some buffer reads
+        * because the "write" would invalidate their data.
+        */
+       if (page->index == last_index)
+               len = ((size - 1) & ~PAGE_CACHE_MASK) + 1;
+
+       ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
+                                      &fsdata, di_bh, page);
+       if (ret) {
+               if (ret != -ENOSPC)
+                       mlog_errno(ret);
+               goto out;
+       }
+
+       ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
+                                    fsdata);
+       if (ret < 0) {
+               mlog_errno(ret);
+               goto out;
+       }
+       BUG_ON(ret != len);
+       ret = 0;
+out:
+       return ret;
+}
+
+static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 {
-       int ret = 0, lock_level = 0;
-       struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb);
+       struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+       struct buffer_head *di_bh = NULL;
+       sigset_t blocked, oldset;
+       int ret, ret2;
+
+       ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
+       if (ret < 0) {
+               mlog_errno(ret);
+               return ret;
+       }
+
+       /*
+        * The cluster locks taken will block a truncate from another
+        * node. Taking the data lock (further down) will also ensure
+        * we don't attempt page truncation as part of a downconvert.
+        */
+       ret = ocfs2_meta_lock(inode, &di_bh, 1);
+       if (ret < 0) {
+               mlog_errno(ret);
+               goto out;
+       }
 
        /*
-        * Only support shared writeable mmap for local mounts which
-        * don't know about holes.
+        * Taken for write before the data lock. The alloc sem should
+        * be enough to serialize with ocfs2_truncate_file() changing
+        * i_size as well as any thread modifying the inode btree.
         */
-       if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) &&
-           ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) &&
-           ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
-               mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
-               /* This is -EINVAL because generic_file_readonly_mmap
-                * returns it in a similar situation. */
-               return -EINVAL;
+       down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+       ret = ocfs2_data_lock(inode, 1);
+       if (ret < 0) {
+               mlog_errno(ret);
+               goto out_meta_unlock;
        }
 
+       ret = __ocfs2_page_mkwrite(inode, di_bh, page);
+
+       ocfs2_data_unlock(inode, 1);
+
+out_meta_unlock:
+       up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+       brelse(di_bh);
+       ocfs2_meta_unlock(inode, 1);
+
+out:
+       ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
+       if (ret2 < 0)
+               mlog_errno(ret2);
+
+       return ret;
+}
+
+static struct vm_operations_struct ocfs2_file_vm_ops = {
+       .nopage         = ocfs2_nopage,
+       .page_mkwrite   = ocfs2_page_mkwrite,
+};
+
+int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       int ret = 0, lock_level = 0;
+
        ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode,
                                    file->f_vfsmnt, &lock_level);
        if (ret < 0) {