orangefs: write range tracking
author Martin Brandenburg <martin@omnibond.com>
Fri, 14 Dec 2018 20:24:43 +0000 (15:24 -0500)
committer Mike Marshall <hubcap@omnibond.com>
Fri, 3 May 2019 18:32:38 +0000 (14:32 -0400)
Attach the actual range of bytes written, plus the responsible uid/gid,
to each dirty page.  This information must be sent to the server when
the page is written out.

Now write_begin, page_mkwrite, and invalidatepage keep up with this
information.  There are several conditions where they must write out the
page immediately to store the new range.  Two non-contiguous ranges
cannot be stored on a single page.

Signed-off-by: Martin Brandenburg <martin@omnibond.com>
Signed-off-by: Mike Marshall <hubcap@omnibond.com>
fs/orangefs/file.c
fs/orangefs/inode.c
fs/orangefs/orangefs-kernel.h

index 26d8ff410b0a29f4623e99237b31717e6f06ad45..f409ac5d3410661c0557f6c7f3226d0c98aa545e 100644 (file)
@@ -46,8 +46,8 @@ static int flush_racache(struct inode *inode)
  * Post and wait for the I/O upcall to finish
  */
 ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode,
-               loff_t *offset, struct iov_iter *iter,
-               size_t total_size, loff_t readahead_size)
+    loff_t *offset, struct iov_iter *iter, size_t total_size,
+    loff_t readahead_size, struct orangefs_write_range *wr)
 {
        struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
        struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
@@ -85,6 +85,10 @@ populate_shared_memory:
        new_op->upcall.req.io.buf_index = buffer_index;
        new_op->upcall.req.io.count = total_size;
        new_op->upcall.req.io.offset = *offset;
+       if (type == ORANGEFS_IO_WRITE && wr) {
+               new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid);
+               new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid);
+       }
 
        gossip_debug(GOSSIP_FILE_DEBUG,
                     "%s(%pU): offset: %llu total_size: %zd\n",
@@ -329,7 +333,7 @@ static vm_fault_t orangefs_fault(struct vm_fault *vmf)
 static const struct vm_operations_struct orangefs_file_vm_ops = {
        .fault = orangefs_fault,
        .map_pages = filemap_map_pages,
-       .page_mkwrite = filemap_page_mkwrite,
+       /* Write faults use the orangefs handler instead of the generic one. */
+       .page_mkwrite = orangefs_page_mkwrite,
 };
 
 /*
index 1c72aa38317ddae2454050a7cb3f56391a827dce..add9c569a7dce0f1f1de50ae7f4ff8a500322c01 100644 (file)
 #include "orangefs-kernel.h"
 #include "orangefs-bufmap.h"
 
-static int orangefs_writepage(struct page *page, struct writeback_control *wbc)
+static int orangefs_writepage_locked(struct page *page,
+    struct writeback_control *wbc)
 {
        struct inode *inode = page->mapping->host;
+       struct orangefs_write_range *wr = NULL;
        struct iov_iter iter;
        struct bio_vec bv;
        size_t len, wlen;
@@ -26,34 +28,52 @@ static int orangefs_writepage(struct page *page, struct writeback_control *wbc)
 
        set_page_writeback(page);
 
-       off = page_offset(page);
        len = i_size_read(inode);
-       if (off > len) {
-               /* The file was truncated; there is nothing to write. */
-               unlock_page(page);
-               end_page_writeback(page);
-               return 0;
+       if (PagePrivate(page)) {
+               wr = (struct orangefs_write_range *)page_private(page);
+               off = wr->pos;
+               if (off + wr->len > len)
+                       wlen = len - off;
+               else
+                       wlen = wr->len;
+       } else {
+               WARN_ON(1);
+               off = page_offset(page);
+               if (off + PAGE_SIZE > len)
+                       wlen = len - off;
+               else
+                       wlen = PAGE_SIZE;
        }
-       if (off + PAGE_SIZE > len)
-               wlen = len - off;
-       else
-               wlen = PAGE_SIZE;
+       /* Should've been handled in orangefs_invalidatepage. */
+       WARN_ON(off == len || off + wlen > len);
 
        bv.bv_page = page;
        bv.bv_len = wlen;
        bv.bv_offset = off % PAGE_SIZE;
-       if (wlen == 0)
-               dump_stack();
+       WARN_ON(wlen == 0);
        iov_iter_bvec(&iter, WRITE, &bv, 1, wlen);
 
        ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen,
-           len);
+           len, wr);
        if (ret < 0) {
                SetPageError(page);
                mapping_set_error(page->mapping, ret);
        } else {
                ret = 0;
        }
+       if (wr) {
+               kfree(wr);
+               set_page_private(page, 0);
+               ClearPagePrivate(page);
+               put_page(page);
+       }
+       return ret;
+}
+
+static int orangefs_writepage(struct page *page, struct writeback_control *wbc)
+{
+       int ret;
+       ret = orangefs_writepage_locked(page, wbc);
        unlock_page(page);
        end_page_writeback(page);
        return ret;
@@ -74,7 +94,7 @@ static int orangefs_readpage(struct file *file, struct page *page)
        iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE);
 
        ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
-           PAGE_SIZE, inode->i_size);
+           PAGE_SIZE, inode->i_size, NULL);
        /* this will only zero remaining unread portions of the page data */
        iov_iter_zero(~0U, &iter);
        /* takes care of potential aliasing */
@@ -92,6 +112,73 @@ static int orangefs_readpage(struct file *file, struct page *page)
        return ret;
 }
 
+static int orangefs_launder_page(struct page *);
+
+/*
+ * Prepare the page cache page covering pos for a buffered write of len
+ * bytes and record the written byte range plus the writer's fsuid/fsgid
+ * on the page.
+ *
+ * A page carries at most one struct orangefs_write_range.  If the page
+ * already has a range that ends exactly at pos and was created by the same
+ * fsuid/fsgid, that range is extended by len.  Otherwise the page is first
+ * written out through orangefs_launder_page and a fresh range is attached.
+ *
+ * Returns 0 with the page stored in *pagep, or a negative errno.  On
+ * failure the page obtained from grab_cache_page_write_begin is unlocked
+ * and released here, and *pagep is reset to NULL, because the caller gets
+ * no chance to clean it up when write_begin fails.
+ */
+static int orangefs_write_begin(struct file *file,
+    struct address_space *mapping,
+    loff_t pos, unsigned len, unsigned flags, struct page **pagep,
+    void **fsdata)
+{
+       struct orangefs_write_range *wr;
+       struct page *page;
+       pgoff_t index;
+       int ret;
+
+       index = pos >> PAGE_SHIFT;
+
+       page = grab_cache_page_write_begin(mapping, index, flags);
+       if (!page)
+               return -ENOMEM;
+
+       *pagep = page;
+
+       if (PageDirty(page) && !PagePrivate(page)) {
+               /*
+                * Should be impossible.  If it happens, launder the page
+                * since we don't know what's dirty.  This will WARN in
+                * orangefs_writepage_locked.
+                */
+               ret = orangefs_launder_page(page);
+               if (ret)
+                       goto out_release;
+       }
+       if (PagePrivate(page)) {
+               wr = (struct orangefs_write_range *)page_private(page);
+               if (wr->pos + wr->len == pos &&
+                   uid_eq(wr->uid, current_fsuid()) &&
+                   gid_eq(wr->gid, current_fsgid())) {
+                       /* Contiguous write by the same owner: just grow. */
+                       wr->len += len;
+                       goto okay;
+               }
+               /*
+                * Non-contiguous range or different owner: the old range
+                * must be written out before a new one can be stored.
+                */
+               ret = orangefs_launder_page(page);
+               if (ret)
+                       goto out_release;
+       }
+
+       wr = kmalloc(sizeof *wr, GFP_KERNEL);
+       if (!wr) {
+               ret = -ENOMEM;
+               goto out_release;
+       }
+
+       wr->pos = pos;
+       wr->len = len;
+       wr->uid = current_fsuid();
+       wr->gid = current_fsgid();
+       SetPagePrivate(page);
+       set_page_private(page, (unsigned long)wr);
+       /* The attached range holds its own reference on the page. */
+       get_page(page);
+okay:
+
+       if (!PageUptodate(page) && (len != PAGE_SIZE)) {
+               unsigned from = pos & (PAGE_SIZE - 1);
+
+               /* Zero the parts of the page outside [from, from + len). */
+               zero_user_segments(page, 0, from, from + len, PAGE_SIZE);
+       }
+       return 0;
+
+out_release:
+       /* Undo grab_cache_page_write_begin: drop the lock and reference. */
+       unlock_page(page);
+       put_page(page);
+       *pagep = NULL;
+       return ret;
+}
+
 static int orangefs_write_end(struct file *file, struct address_space *mapping,
     loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata)
 {
@@ -105,24 +192,96 @@ static void orangefs_invalidatepage(struct page *page,
                                 unsigned int offset,
                                 unsigned int length)
 {
-       gossip_debug(GOSSIP_INODE_DEBUG,
-                    "orangefs_invalidatepage called on page %p "
-                    "(offset is %u)\n",
-                    page,
-                    offset);
+       struct orangefs_write_range *wr;
+       wr = (struct orangefs_write_range *)page_private(page);
+
+       if (offset == 0 && length == PAGE_SIZE) {
+               kfree((struct orangefs_write_range *)page_private(page));
+               set_page_private(page, 0);
+               ClearPagePrivate(page);
+               put_page(page);
+       /* write range entirely within invalidate range (or equal) */
+       } else if (page_offset(page) + offset <= wr->pos &&
+           wr->pos + wr->len <= page_offset(page) + offset + length) {
+               kfree((struct orangefs_write_range *)page_private(page));
+               set_page_private(page, 0);
+               ClearPagePrivate(page);
+               put_page(page);
+               /* XXX is this right? only caller in fs */
+               cancel_dirty_page(page);
+       /* invalidate range chops off end of write range */
+       } else if (wr->pos < page_offset(page) + offset &&
+           wr->pos + wr->len <= page_offset(page) + offset + length &&
+            page_offset(page) + offset < wr->pos + wr->len) {
+               size_t x;
+               x = wr->pos + wr->len - (page_offset(page) + offset);
+               WARN_ON(x > wr->len);
+               wr->len -= x;
+               wr->uid = current_fsuid();
+               wr->gid = current_fsgid();
+       /* invalidate range chops off beginning of write range */
+       } else if (page_offset(page) + offset <= wr->pos &&
+           page_offset(page) + offset + length < wr->pos + wr->len &&
+           wr->pos < page_offset(page) + offset + length) {
+               size_t x;
+               x = page_offset(page) + offset + length - wr->pos;
+               WARN_ON(x > wr->len);
+               wr->pos += x;
+               wr->len -= x;
+               wr->uid = current_fsuid();
+               wr->gid = current_fsgid();
+       /* invalidate range entirely within write range (punch hole) */
+       } else if (wr->pos < page_offset(page) + offset &&
+           page_offset(page) + offset + length < wr->pos + wr->len) {
+               /* XXX what do we do here... should not WARN_ON */
+               WARN_ON(1);
+               /* punch hole */
+               /*
+                * should we just ignore this and write it out anyway?
+                * it hardly makes sense
+                */
+       /* non-overlapping ranges */
+       } else {
+               /* WARN if they do overlap */
+               if (!((page_offset(page) + offset + length <= wr->pos) ^
+                   (wr->pos + wr->len <= page_offset(page) + offset))) {
+                       WARN_ON(1);
+                       printk("invalidate range offset %llu length %u\n",
+                           page_offset(page) + offset, length);
+                       printk("write range offset %llu length %zu\n",
+                           wr->pos, wr->len);
+               }
+       }
+}
 
-       ClearPageUptodate(page);
-       ClearPageMappedToDisk(page);
-       return;
+/*
+ * Tell the caller whether the page may be released: returns 1 when no
+ * private data is attached to the page (PagePrivate clear), 0 otherwise.
+ * The gfp argument is unused.
+ */
+static int orangefs_releasepage(struct page *page, gfp_t foo)
+{
+       if (PagePrivate(page))
+               return 0;
+       return 1;
+}
 
+/*
+ * Tear down any write range still attached to a page: free the
+ * struct orangefs_write_range stored in page_private, clear the private
+ * state and drop one page reference.  Pages without PagePrivate set are
+ * left untouched.
+ */
+static void orangefs_freepage(struct page *page)
+{
+       struct orangefs_write_range *range;
+
+       if (!PagePrivate(page))
+               return;
+
+       range = (struct orangefs_write_range *)page_private(page);
+       kfree(range);
+       set_page_private(page, 0);
+       ClearPagePrivate(page);
+       put_page(page);
+}
 
-static int orangefs_releasepage(struct page *page, gfp_t foo)
+/*
+ * Write a page out synchronously.  Waits for any writeback already in
+ * flight, then, if the page was dirty, writes it through
+ * orangefs_writepage_locked and ends writeback.  Returns 0 when there was
+ * nothing to write, otherwise the result of orangefs_writepage_locked.
+ */
+static int orangefs_launder_page(struct page *page)
 {
-       gossip_debug(GOSSIP_INODE_DEBUG,
-                    "orangefs_releasepage called on page %p\n",
-                    page);
-       return 0;
+       struct writeback_control wbc = {
+               .sync_mode = WB_SYNC_ALL,
+               .nr_to_write = 0,
+       };
+       int status;
+
+       wait_on_page_writeback(page);
+       if (!clear_page_dirty_for_io(page))
+               return 0;
+
+       status = orangefs_writepage_locked(page, &wbc);
+       end_page_writeback(page);
+       return status;
 }
 
 static ssize_t orangefs_direct_IO(struct kiocb *iocb,
@@ -145,7 +304,6 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
        struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
        struct orangefs_khandle *handle = &orangefs_inode->refn.khandle;
        size_t count = iov_iter_count(iter);
-       size_t ORIGINALcount = iov_iter_count(iter);
        ssize_t total_count = 0;
        ssize_t ret = -EINVAL;
        int i = 0;
@@ -192,7 +350,7 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
                             (int)*offset);
 
                ret = wait_for_direct_io(type, inode, offset, iter,
-                               each_count, 0);
+                               each_count, 0, NULL);
                gossip_debug(GOSSIP_FILE_DEBUG,
                             "%s(%pU): return from wait_for_io:%d\n",
                             __func__,
@@ -247,13 +405,82 @@ static const struct address_space_operations orangefs_address_operations = {
        .writepage = orangefs_writepage,
        .readpage = orangefs_readpage,
        .set_page_dirty = __set_page_dirty_nobuffers,
-       .write_begin = simple_write_begin,
+       .write_begin = orangefs_write_begin,
        .write_end = orangefs_write_end,
        .invalidatepage = orangefs_invalidatepage,
        .releasepage = orangefs_releasepage,
+       .freepage = orangefs_freepage,
+       .launder_page = orangefs_launder_page,
        .direct_IO = orangefs_direct_IO,
 };
 
+/*
+ * page_mkwrite handler: called when a page of a shared mapping is about
+ * to become writable.  Records a write range covering the whole page,
+ * owned by the current fsuid/fsgid, and marks the page dirty.
+ *
+ * If the page already carries a range from a different fsuid/fsgid (or is
+ * dirty without any range), it is written out through
+ * orangefs_launder_page first.
+ *
+ * Returns VM_FAULT_LOCKED with the page locked on success.  Returns
+ * VM_FAULT_NOPAGE if the page no longer belongs to this inode's mapping,
+ * and VM_FAULT_RETRY if laundering or allocation fails; in both of those
+ * cases the page is unlocked before returning.
+ */
+vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
+{
+       struct page *page = vmf->page;
+       struct inode *inode = file_inode(vmf->vma->vm_file);
+       vm_fault_t ret = VM_FAULT_LOCKED;
+       struct orangefs_write_range *wr;
+
+       /*
+        * Taken before anything can fail so that the sb_end_pagefault at
+        * out: is balanced on every exit path.
+        */
+       sb_start_pagefault(inode->i_sb);
+
+       lock_page(page);
+       if (PageDirty(page) && !PagePrivate(page)) {
+               /*
+                * Should be impossible.  If it happens, launder the page
+                * since we don't know what's dirty.  This will WARN in
+                * orangefs_writepage_locked.
+                */
+               if (orangefs_launder_page(page)) {
+                       ret = VM_FAULT_RETRY;
+                       goto out_unlock;
+               }
+       }
+       if (PagePrivate(page)) {
+               wr = (struct orangefs_write_range *)page_private(page);
+               if (uid_eq(wr->uid, current_fsuid()) &&
+                   gid_eq(wr->gid, current_fsgid())) {
+                       /* Same owner: widen the range to the whole page. */
+                       wr->pos = page_offset(page);
+                       wr->len = PAGE_SIZE;
+                       goto okay;
+               }
+               /* Different owner: flush the old range first. */
+               if (orangefs_launder_page(page)) {
+                       ret = VM_FAULT_RETRY;
+                       goto out_unlock;
+               }
+       }
+       wr = kmalloc(sizeof *wr, GFP_KERNEL);
+       if (!wr) {
+               ret = VM_FAULT_RETRY;
+               goto out_unlock;
+       }
+       wr->pos = page_offset(page);
+       wr->len = PAGE_SIZE;
+       wr->uid = current_fsuid();
+       wr->gid = current_fsgid();
+       SetPagePrivate(page);
+       set_page_private(page, (unsigned long)wr);
+       /* The attached range holds its own reference on the page. */
+       get_page(page);
+okay:
+
+       file_update_time(vmf->vma->vm_file);
+       if (page->mapping != inode->i_mapping) {
+               ret = VM_FAULT_NOPAGE;
+               goto out_unlock;
+       }
+
+       /*
+        * We mark the page dirty already here so that when freeze is in
+        * progress, we are guaranteed that writeback during freezing will
+        * see the dirty page and writeprotect it again.
+        */
+       set_page_dirty(page);
+       wait_for_stable_page(page);
+       goto out;
+
+out_unlock:
+       /* Never hand back a locked page without VM_FAULT_LOCKED set. */
+       unlock_page(page);
+out:
+       sb_end_pagefault(inode->i_sb);
+       return ret;
+}
+
 static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr)
 {
        struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
index 307bbb61819af5021e5a4af91fd064125dba4dae..336a3ec0b83e584f44b489ed33f8bf2a71673249 100644 (file)
@@ -230,6 +230,13 @@ struct orangefs_cached_xattr {
        unsigned long timeout;
 };
 
+/*
+ * Describes the single contiguous dirty byte range of a page cache page
+ * and who wrote it.  A pointer to this structure is stored in the page's
+ * private field while the page is dirty.
+ */
+struct orangefs_write_range {
+       /* File offset of the first dirty byte; used as the write offset. */
+       loff_t pos;
+       /* Number of dirty bytes starting at pos. */
+       size_t len;
+       /* fsuid of the writer, sent with the write upcall. */
+       kuid_t uid;
+       /* fsgid of the writer, sent with the write upcall. */
+       kgid_t gid;
+};
+
 extern struct orangefs_stats orangefs_stats;
 
 /*
@@ -342,6 +349,7 @@ void fsid_key_table_finalize(void);
 /*
  * defined in inode.c
  */
+vm_fault_t orangefs_page_mkwrite(struct vm_fault *);
 struct inode *orangefs_new_inode(struct super_block *sb,
                              struct inode *dir,
                              int mode,
@@ -383,7 +391,7 @@ bool __is_daemon_in_service(void);
  * defined in file.c
  */
 ssize_t wait_for_direct_io(enum ORANGEFS_io_type, struct inode *, loff_t *,
-    struct iov_iter *, size_t, loff_t);
+    struct iov_iter *, size_t, loff_t, struct orangefs_write_range *);
 ssize_t do_readv_writev(enum ORANGEFS_io_type, struct file *, loff_t *,
     struct iov_iter *);