orangefs: add orangefs_revalidate_mapping
author Martin Brandenburg <martin@omnibond.com>
Tue, 12 Feb 2019 20:19:06 +0000 (20:19 +0000)
committer Mike Marshall <hubcap@omnibond.com>
Fri, 3 May 2019 18:32:39 +0000 (14:32 -0400)
This is modeled after NFS, except our method is different.  We use a
simple timer to determine whether to invalidate the page cache.  This
is bound to perform.

This adds a sysfs parameter, cache_timeout_msecs, which controls the time
between page cache invalidations.

Signed-off-by: Martin Brandenburg <martin@omnibond.com>
Signed-off-by: Mike Marshall <hubcap@omnibond.com>
fs/orangefs/file.c
fs/orangefs/inode.c
fs/orangefs/orangefs-kernel.h
fs/orangefs/orangefs-mod.c
fs/orangefs/orangefs-sysfs.c

index 405449ce4b0264da1c4bdb138f6b1b31062ecdc3..faa5b61cdfd601d11e03b945174570a1c2a727c8 100644 (file)
@@ -241,18 +241,78 @@ out:
        return ret;
 }
 
+int orangefs_revalidate_mapping(struct inode *inode)
+{
+       struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+       struct address_space *mapping = inode->i_mapping;
+       unsigned long *bitlock = &orangefs_inode->bitlock;
+       int ret;
+
+       while (1) {
+               ret = wait_on_bit(bitlock, 1, TASK_KILLABLE);
+               if (ret)
+                       return ret;
+               spin_lock(&inode->i_lock);
+               if (test_bit(1, bitlock)) {
+                       spin_unlock(&inode->i_lock);
+                       continue;
+               }
+               if (!time_before(jiffies, orangefs_inode->mapping_time))
+                       break;
+               spin_unlock(&inode->i_lock);
+               return 0;
+       }
+
+       set_bit(1, bitlock);
+       smp_wmb();
+       spin_unlock(&inode->i_lock);
+
+       unmap_mapping_range(mapping, 0, 0, 0);
+       ret = filemap_write_and_wait(mapping);
+       if (!ret)
+               ret = invalidate_inode_pages2(mapping);
+
+       orangefs_inode->mapping_time = jiffies +
+           orangefs_cache_timeout_msecs*HZ/1000;
+
+       clear_bit(1, bitlock);
+       smp_mb__after_atomic();
+       wake_up_bit(bitlock, 1);
+
+       return ret;
+}
+
 static ssize_t orangefs_file_read_iter(struct kiocb *iocb,
     struct iov_iter *iter)
 {
+       int ret;
        orangefs_stats.reads++;
-       return generic_file_read_iter(iocb, iter);
+
+       down_read(&file_inode(iocb->ki_filp)->i_rwsem);
+       ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp));
+       if (ret)
+               goto out;
+
+       ret = generic_file_read_iter(iocb, iter);
+out:
+       up_read(&file_inode(iocb->ki_filp)->i_rwsem);
+       return ret;
 }
 
 static ssize_t orangefs_file_write_iter(struct kiocb *iocb,
     struct iov_iter *iter)
 {
+       int ret;
        orangefs_stats.writes++;
-       return generic_file_write_iter(iocb, iter);
+
+       if (iocb->ki_pos > i_size_read(file_inode(iocb->ki_filp))) {
+               ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp));
+               if (ret)
+                       return ret;
+       }
+
+       ret = generic_file_write_iter(iocb, iter);
+       return ret;
 }
 
 /*
@@ -341,6 +401,12 @@ static const struct vm_operations_struct orangefs_file_vm_ops = {
  */
 static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
+       int ret;
+
+       ret = orangefs_revalidate_mapping(file_inode(file));
+       if (ret)
+               return ret;
+
        gossip_debug(GOSSIP_FILE_DEBUG,
                     "orangefs_file_mmap: called on %s\n",
                     (file ?
index add9c569a7dce0f1f1de50ae7f4ff8a500322c01..7ed2ea093c4ea5fb47baf692e0a78e17617fe8a5 100644 (file)
@@ -31,6 +31,7 @@ static int orangefs_writepage_locked(struct page *page,
        len = i_size_read(inode);
        if (PagePrivate(page)) {
                wr = (struct orangefs_write_range *)page_private(page);
+               WARN_ON(wr->pos >= len);
                off = wr->pos;
                if (off + wr->len > len)
                        wlen = len - off;
@@ -79,6 +80,173 @@ static int orangefs_writepage(struct page *page, struct writeback_control *wbc)
        return ret;
 }
 
+struct orangefs_writepages {
+       loff_t off;
+       size_t len;
+       kuid_t uid;
+       kgid_t gid;
+       int maxpages;
+       int npages;
+       struct page **pages;
+       struct bio_vec *bv;
+};
+
+static int orangefs_writepages_work(struct orangefs_writepages *ow,
+    struct writeback_control *wbc)
+{
+       struct inode *inode = ow->pages[0]->mapping->host;
+       struct orangefs_write_range *wrp, wr;
+       struct iov_iter iter;
+       ssize_t ret;
+       size_t len;
+       loff_t off;
+       int i;
+
+       len = i_size_read(inode);
+
+       for (i = 0; i < ow->npages; i++) {
+               set_page_writeback(ow->pages[i]);
+               ow->bv[i].bv_page = ow->pages[i];
+               ow->bv[i].bv_len = min(page_offset(ow->pages[i]) + PAGE_SIZE,
+                   ow->off + ow->len) -
+                   max(ow->off, page_offset(ow->pages[i]));
+               if (i == 0)
+                       ow->bv[i].bv_offset = ow->off -
+                           page_offset(ow->pages[i]);
+               else
+                       ow->bv[i].bv_offset = 0;
+       }
+       iov_iter_bvec(&iter, WRITE, ow->bv, ow->npages, ow->len);
+
+       WARN_ON(ow->off >= len);
+       if (ow->off + ow->len > len)
+               ow->len = len - ow->off;
+
+       off = ow->off;
+       wr.uid = ow->uid;
+       wr.gid = ow->gid;
+       ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, ow->len,
+           0, &wr);
+       if (ret < 0) {
+               for (i = 0; i < ow->npages; i++) {
+                       SetPageError(ow->pages[i]);
+                       mapping_set_error(ow->pages[i]->mapping, ret);
+                       if (PagePrivate(ow->pages[i])) {
+                               wrp = (struct orangefs_write_range *)
+                                   page_private(ow->pages[i]);
+                               ClearPagePrivate(ow->pages[i]);
+                               put_page(ow->pages[i]);
+                               kfree(wrp);
+                       }
+                       end_page_writeback(ow->pages[i]);
+                       unlock_page(ow->pages[i]);
+               }
+       } else {
+               ret = 0;
+               for (i = 0; i < ow->npages; i++) {
+                       if (PagePrivate(ow->pages[i])) {
+                               wrp = (struct orangefs_write_range *)
+                                   page_private(ow->pages[i]);
+                               ClearPagePrivate(ow->pages[i]);
+                               put_page(ow->pages[i]);
+                               kfree(wrp);
+                       }
+                       end_page_writeback(ow->pages[i]);
+                       unlock_page(ow->pages[i]);
+               }
+       }
+       return ret;
+}
+
+static int orangefs_writepages_callback(struct page *page,
+    struct writeback_control *wbc, void *data)
+{
+       struct orangefs_writepages *ow = data;
+       struct orangefs_write_range *wr;
+       int ret;
+
+       if (!PagePrivate(page)) {
+               unlock_page(page);
+               /* It's not private so there's nothing to write, right? */
+               printk("writepages_callback not private!\n");
+               BUG();
+               return 0;
+       }
+       wr = (struct orangefs_write_range *)page_private(page);
+
+       ret = -1;
+       if (ow->npages == 0) {
+               ow->off = wr->pos;
+               ow->len = wr->len;
+               ow->uid = wr->uid;
+               ow->gid = wr->gid;
+               ow->pages[ow->npages++] = page;
+               ret = 0;
+               goto done;
+       }
+       if (!uid_eq(ow->uid, wr->uid) || !gid_eq(ow->gid, wr->gid)) {
+               orangefs_writepages_work(ow, wbc);
+               ow->npages = 0;
+               ret = -1;
+               goto done;
+       }
+       if (ow->off + ow->len == wr->pos) {
+               ow->len += wr->len;
+               ow->pages[ow->npages++] = page;
+               ret = 0;
+               goto done;
+       }
+done:
+       if (ret == -1) {
+               if (ow->npages) {
+                       orangefs_writepages_work(ow, wbc);
+                       ow->npages = 0;
+               }
+               ret = orangefs_writepage_locked(page, wbc);
+               mapping_set_error(page->mapping, ret);
+               unlock_page(page);
+               end_page_writeback(page);
+       } else {
+               if (ow->npages == ow->maxpages) {
+                       orangefs_writepages_work(ow, wbc);
+                       ow->npages = 0;
+               }
+       }
+       return ret;
+}
+
+static int orangefs_writepages(struct address_space *mapping,
+    struct writeback_control *wbc)
+{
+       struct orangefs_writepages *ow;
+       struct blk_plug plug;
+       int ret;
+       ow = kzalloc(sizeof(struct orangefs_writepages), GFP_KERNEL);
+       if (!ow)
+               return -ENOMEM;
+       ow->maxpages = orangefs_bufmap_size_query()/PAGE_SIZE;
+       ow->pages = kcalloc(ow->maxpages, sizeof(struct page *), GFP_KERNEL);
+       if (!ow->pages) {
+               kfree(ow);
+               return -ENOMEM;
+       }
+       ow->bv = kcalloc(ow->maxpages, sizeof(struct bio_vec), GFP_KERNEL);
+       if (!ow->bv) {
+               kfree(ow->pages);
+               kfree(ow);
+               return -ENOMEM;
+       }
+       blk_start_plug(&plug);
+       ret = write_cache_pages(mapping, wbc, orangefs_writepages_callback, ow);
+       if (ow->npages)
+               ret = orangefs_writepages_work(ow, wbc);
+       blk_finish_plug(&plug);
+       kfree(ow->pages);
+       kfree(ow->bv);
+       kfree(ow);
+       return ret;
+}
+
 static int orangefs_readpage(struct file *file, struct page *page)
 {
        struct inode *inode = page->mapping->host;
@@ -93,6 +261,9 @@ static int orangefs_readpage(struct file *file, struct page *page)
        bv.bv_offset = 0;
        iov_iter_bvec(&iter, READ, &bv, 1, PAGE_SIZE);
 
+       if (PageDirty(page))
+               orangefs_launder_page(page);
+
        ret = wait_for_direct_io(ORANGEFS_IO_READ, inode, &off, &iter,
            PAGE_SIZE, inode->i_size, NULL);
        /* this will only zero remaining unread portions of the page data */
@@ -170,22 +341,42 @@ static int orangefs_write_begin(struct file *file,
        set_page_private(page, (unsigned long)wr);
        get_page(page);
 okay:
-
-       if (!PageUptodate(page) && (len != PAGE_SIZE)) {
-               unsigned from = pos & (PAGE_SIZE - 1);
-
-               zero_user_segments(page, 0, from, from + len, PAGE_SIZE);
-       }
        return 0;
 }
 
 static int orangefs_write_end(struct file *file, struct address_space *mapping,
     loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata)
 {
-       int r;
-       r = simple_write_end(file, mapping, pos, len, copied, page, fsdata);
+       struct inode *inode = page->mapping->host;
+       loff_t last_pos = pos + copied;
+
+       /*
+        * No need to use i_size_read() here, the i_size
+        * cannot change under us because we hold the i_mutex.
+        */
+       if (last_pos > inode->i_size)
+               i_size_write(inode, last_pos);
+
+       /* zero the stale part of the page if we did a short copy */
+       if (!PageUptodate(page)) {
+               unsigned from = pos & (PAGE_SIZE - 1);
+               if (copied < len) {
+                       zero_user(page, from + copied, len - copied);
+               }
+               /* Set fully written pages uptodate. */
+               if (pos == page_offset(page) &&
+                   (len == PAGE_SIZE || pos + len == inode->i_size)) {
+                       zero_user_segment(page, from + copied, PAGE_SIZE);
+                       SetPageUptodate(page);
+               }
+       }
+
+       set_page_dirty(page);
+       unlock_page(page);
+       put_page(page);
+
        mark_inode_dirty_sync(file_inode(file));
-       return r;
+       return copied;
 }
 
 static void orangefs_invalidatepage(struct page *page,
@@ -200,6 +391,7 @@ static void orangefs_invalidatepage(struct page *page,
                set_page_private(page, 0);
                ClearPagePrivate(page);
                put_page(page);
+               return;
        /* write range entirely within invalidate range (or equal) */
        } else if (page_offset(page) + offset <= wr->pos &&
            wr->pos + wr->len <= page_offset(page) + offset + length) {
@@ -209,6 +401,7 @@ static void orangefs_invalidatepage(struct page *page,
                put_page(page);
                /* XXX is this right? only caller in fs */
                cancel_dirty_page(page);
+               return;
        /* invalidate range chops off end of write range */
        } else if (wr->pos < page_offset(page) + offset &&
            wr->pos + wr->len <= page_offset(page) + offset + length &&
@@ -240,6 +433,7 @@ static void orangefs_invalidatepage(struct page *page,
                 * should we just ignore this and write it out anyway?
                 * it hardly makes sense
                 */
+               return;
        /* non-overlapping ranges */
        } else {
                /* WARN if they do overlap */
@@ -251,7 +445,15 @@ static void orangefs_invalidatepage(struct page *page,
                        printk("write range offset %llu length %zu\n",
                            wr->pos, wr->len);
                }
+               return;
        }
+
+       /*
+        * Above there are returns where wr is freed or where we WARN.
+        * Thus the following runs if wr was modified above.
+        */
+
+       orangefs_launder_page(page);
 }
 
 static int orangefs_releasepage(struct page *page, gfp_t foo)
@@ -404,6 +606,7 @@ out:
 static const struct address_space_operations orangefs_address_operations = {
        .writepage = orangefs_writepage,
        .readpage = orangefs_readpage,
+       .writepages = orangefs_writepages,
        .set_page_dirty = __set_page_dirty_nobuffers,
        .write_begin = orangefs_write_begin,
        .write_end = orangefs_write_end,
@@ -418,9 +621,18 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
 {
        struct page *page = vmf->page;
        struct inode *inode = file_inode(vmf->vma->vm_file);
-       vm_fault_t ret = VM_FAULT_LOCKED;
+       struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+       unsigned long *bitlock = &orangefs_inode->bitlock;
+       vm_fault_t ret;
        struct orangefs_write_range *wr;
 
+       sb_start_pagefault(inode->i_sb);
+
+       if (wait_on_bit(bitlock, 1, TASK_KILLABLE)) {
+               ret = VM_FAULT_RETRY;
+               goto out;
+       }
+
        lock_page(page);
        if (PageDirty(page) && !PagePrivate(page)) {
                /*
@@ -429,7 +641,7 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
                 * orangefs_writepage_locked.
                 */
                if (orangefs_launder_page(page)) {
-                       ret = VM_FAULT_RETRY;
+                       ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
                        goto out;
                }
        }
@@ -442,14 +654,14 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
                        goto okay;
                } else {
                        if (orangefs_launder_page(page)) {
-                               ret = VM_FAULT_RETRY;
+                               ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
                                goto out;
                        }
                }
        }
        wr = kmalloc(sizeof *wr, GFP_KERNEL);
        if (!wr) {
-               ret = VM_FAULT_RETRY;
+               ret = VM_FAULT_LOCKED|VM_FAULT_RETRY;
                goto out;
        }
        wr->pos = page_offset(page);
@@ -461,11 +673,10 @@ vm_fault_t orangefs_page_mkwrite(struct vm_fault *vmf)
        get_page(page);
 okay:
 
-       sb_start_pagefault(inode->i_sb);
        file_update_time(vmf->vma->vm_file);
        if (page->mapping != inode->i_mapping) {
                unlock_page(page);
-               ret = VM_FAULT_NOPAGE;
+               ret = VM_FAULT_LOCKED|VM_FAULT_NOPAGE;
                goto out;
        }
 
@@ -476,6 +687,7 @@ okay:
         */
        set_page_dirty(page);
        wait_for_stable_page(page);
+       ret = VM_FAULT_LOCKED;
 out:
        sb_end_pagefault(inode->i_sb);
        return ret;
@@ -553,13 +765,15 @@ int __orangefs_setattr(struct inode *inode, struct iattr *iattr)
                        } else {
                                gossip_debug(GOSSIP_UTILS_DEBUG,
                                             "User attempted to set sticky bit on non-root directory; returning EINVAL.\n");
-                               return -EINVAL;
+                               ret = -EINVAL;
+                               goto out;
                        }
                }
                if (iattr->ia_mode & (S_ISUID)) {
                        gossip_debug(GOSSIP_UTILS_DEBUG,
                                     "Attempting to set setuid bit (not supported); returning EINVAL.\n");
-                       return -EINVAL;
+                       ret = -EINVAL;
+                       goto out;
                }
        }
 
@@ -741,6 +955,8 @@ static int orangefs_set_inode(struct inode *inode, void *data)
        ORANGEFS_I(inode)->refn.khandle = ref->khandle;
        ORANGEFS_I(inode)->attr_valid = 0;
        hash_init(ORANGEFS_I(inode)->xattr_cache);
+       ORANGEFS_I(inode)->mapping_time = jiffies - 1;
+       ORANGEFS_I(inode)->bitlock = 0;
        return 0;
 }
 
index 336a3ec0b83e584f44b489ed33f8bf2a71673249..87beab10326a77def01c5a4743d91ce68a87e7e4 100644 (file)
@@ -193,9 +193,11 @@ struct orangefs_inode_s {
        sector_t last_failed_block_index_read;
 
        unsigned long getattr_time;
+       unsigned long mapping_time;
        int attr_valid;
        kuid_t attr_uid;
        kgid_t attr_gid;
+       unsigned long bitlock;
 
        DECLARE_HASHTABLE(xattr_cache, 4);
 };
@@ -390,6 +392,7 @@ bool __is_daemon_in_service(void);
 /*
  * defined in file.c
  */
+int orangefs_revalidate_mapping(struct inode *);
 ssize_t wait_for_direct_io(enum ORANGEFS_io_type, struct inode *, loff_t *,
     struct iov_iter *, size_t, loff_t, struct orangefs_write_range *);
 ssize_t do_readv_writev(enum ORANGEFS_io_type, struct file *, loff_t *,
@@ -427,6 +430,7 @@ int orangefs_normalize_to_errno(__s32 error_code);
 extern struct mutex orangefs_request_mutex;
 extern int op_timeout_secs;
 extern int slot_timeout_secs;
+extern int orangefs_cache_timeout_msecs;
 extern int orangefs_dcache_timeout_msecs;
 extern int orangefs_getattr_timeout_msecs;
 extern struct list_head orangefs_superblocks;
index 85ef87245a872e51de68a6ae84df88a1416ef3a3..82cf8b3e568b4a5096ebb3ac31d240832905a7eb 100644 (file)
@@ -30,6 +30,7 @@ static ulong module_parm_debug_mask;
 __u64 orangefs_gossip_debug_mask;
 int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS;
 int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS;
+int orangefs_cache_timeout_msecs = 50;
 int orangefs_dcache_timeout_msecs = 50;
 int orangefs_getattr_timeout_msecs = 50;
 
index 19739aaee67554fff5bbac1e793cc6b803a017c3..3627ea946402c01299ec0bde3fa50127f1ab0e21 100644 (file)
  *                     Slots are requested and waited for,
  *                     the wait times out after slot_timeout_secs.
  *
+ * What:               /sys/fs/orangefs/cache_timeout_msecs
+ * Date:               Mar 2018
+ * Contact:            Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ *                     Minimum time in milliseconds between page
+ *                     cache invalidations performed by
+ *                     orangefs_revalidate_mapping.
+ *
  * What:               /sys/fs/orangefs/dcache_timeout_msecs
  * Date:               Jul 2016
  * Contact:            Martin Brandenburg <martin@omnibond.com>
@@ -221,6 +229,13 @@ static ssize_t sysfs_int_show(struct kobject *kobj,
                                       "%d\n",
                                       slot_timeout_secs);
                        goto out;
+               } else if (!strcmp(attr->attr.name,
+                                  "cache_timeout_msecs")) {
+                       rc = scnprintf(buf,
+                                      PAGE_SIZE,
+                                      "%d\n",
+                                      orangefs_cache_timeout_msecs);
+                       goto out;
                } else if (!strcmp(attr->attr.name,
                                   "dcache_timeout_msecs")) {
                        rc = scnprintf(buf,
@@ -277,6 +292,9 @@ static ssize_t sysfs_int_store(struct kobject *kobj,
        } else if (!strcmp(attr->attr.name, "slot_timeout_secs")) {
                rc = kstrtoint(buf, 0, &slot_timeout_secs);
                goto out;
+       } else if (!strcmp(attr->attr.name, "cache_timeout_msecs")) {
+               rc = kstrtoint(buf, 0, &orangefs_cache_timeout_msecs);
+               goto out;
        } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) {
                rc = kstrtoint(buf, 0, &orangefs_dcache_timeout_msecs);
                goto out;
@@ -818,6 +836,9 @@ static struct orangefs_attribute op_timeout_secs_attribute =
 static struct orangefs_attribute slot_timeout_secs_attribute =
        __ATTR(slot_timeout_secs, 0664, sysfs_int_show, sysfs_int_store);
 
+static struct orangefs_attribute cache_timeout_msecs_attribute =
+       __ATTR(cache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store);
+
 static struct orangefs_attribute dcache_timeout_msecs_attribute =
        __ATTR(dcache_timeout_msecs, 0664, sysfs_int_show, sysfs_int_store);
 
@@ -861,6 +882,7 @@ static struct orangefs_attribute perf_time_interval_secs_attribute =
 static struct attribute *orangefs_default_attrs[] = {
        &op_timeout_secs_attribute.attr,
        &slot_timeout_secs_attribute.attr,
+       &cache_timeout_msecs_attribute.attr,
        &dcache_timeout_msecs_attribute.attr,
        &getattr_timeout_msecs_attribute.attr,
        &readahead_count_attribute.attr,