nfs: add support for multiple nfs reqs per page

author Weston Andros Adamson <dros@primarydata.com>

Thu, 15 May 2014 15:56:45 +0000 (11:56 -0400)

committer Trond Myklebust <trond.myklebust@primarydata.com>

Thu, 29 May 2014 15:11:44 +0000 (11:11 -0400)
author Weston Andros Adamson <dros@primarydata.com>
Thu, 15 May 2014 15:56:45 +0000 (11:56 -0400)
committer Trond Myklebust <trond.myklebust@primarydata.com>
Thu, 29 May 2014 15:11:44 +0000 (11:11 -0400)
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c

index 1dd8c622d719d3ff00a4bbf14d70f79d41659be9..2c0e08f4cf712e439e0790e8be35607774a8f557 100644 (file)
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -380,7 +380,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
                         struct nfs_page *req;
                         unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
                         /* XXX do we need to do the eof zeroing found in async_filler? */
-                       req = nfs_create_request(dreq->ctx, pagevec[i],
+                       req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
                                                  pgbase, req_len);
                         if (IS_ERR(req)) {
                                 result = PTR_ERR(req);
@@ -749,7 +749,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
                         struct nfs_page *req;
                         unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
  
-                       req = nfs_create_request(dreq->ctx, pagevec[i],
+                       req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
                                                  pgbase, req_len);
                         if (IS_ERR(req)) {
                                 result = PTR_ERR(req);
@@ -827,6 +827,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
         spin_unlock(&dreq->lock);
  
         while (!list_empty(&hdr->pages)) {
+               bool do_destroy = true;
+
                 req = nfs_list_entry(hdr->pages.next);
                 nfs_list_remove_request(req);
                 switch (bit) {
@@ -834,6 +836,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
                 case NFS_IOHDR_NEED_COMMIT:
                         kref_get(&req->wb_kref);
                         nfs_mark_request_commit(req, hdr->lseg, &cinfo);
+                       do_destroy = false;
                 }
                 nfs_unlock_and_release_request(req);
         }
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c

index f343f49ff596b71875331ef0a4723a31719dd882..015fb7b48dfeab66fa42779fc917012b7b0f546d 100644 (file)
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -29,6 +29,8 @@
  static struct kmem_cache *nfs_page_cachep;
  static const struct rpc_call_ops nfs_pgio_common_ops;
  
+static void nfs_free_request(struct nfs_page *);
+
  static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
  {
         p->npages = pagecount;
@@ -136,10 +138,151 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
         return __nfs_iocounter_wait(c);
  }
  
+/*
+ * nfs_page_group_lock - lock the head of the page group
+ * @req - request in group that is to be locked
+ *
+ * this lock must be held if modifying the page group list
+ */
+void
+nfs_page_group_lock(struct nfs_page *req)
+{
+       struct nfs_page *head = req->wb_head;
+       int err = -EAGAIN;
+
+       WARN_ON_ONCE(head != head->wb_head);
+
+       while (err)
+               err = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
+                       nfs_wait_bit_killable, TASK_KILLABLE);
+}
+
+/*
+ * nfs_page_group_unlock - unlock the head of the page group
+ * @req - request in group that is to be unlocked
+ */
+void
+nfs_page_group_unlock(struct nfs_page *req)
+{
+       struct nfs_page *head = req->wb_head;
+
+       WARN_ON_ONCE(head != head->wb_head);
+
+       smp_mb__before_clear_bit();
+       clear_bit(PG_HEADLOCK, &head->wb_flags);
+       smp_mb__after_clear_bit();
+       wake_up_bit(&head->wb_flags, PG_HEADLOCK);
+}
+
+/*
+ * nfs_page_group_sync_on_bit_locked
+ *
+ * must be called with page group lock held
+ */
+static bool
+nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
+{
+       struct nfs_page *head = req->wb_head;
+       struct nfs_page *tmp;
+
+       WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags));
+       WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags));
+
+       tmp = req->wb_this_page;
+       while (tmp != req) {
+               if (!test_bit(bit, &tmp->wb_flags))
+                       return false;
+               tmp = tmp->wb_this_page;
+       }
+
+       /* true! reset all bits */
+       tmp = req;
+       do {
+               clear_bit(bit, &tmp->wb_flags);
+               tmp = tmp->wb_this_page;
+       } while (tmp != req);
+
+       return true;
+}
+
+/*
+ * nfs_page_group_sync_on_bit - set bit on current request, but only
+ *   return true if the bit is set for all requests in page group
+ * @req - request in page group
+ * @bit - PG_* bit that is used to sync page group
+ */
+bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
+{
+       bool ret;
+
+       nfs_page_group_lock(req);
+       ret = nfs_page_group_sync_on_bit_locked(req, bit);
+       nfs_page_group_unlock(req);
+
+       return ret;
+}
+
+/*
+ * nfs_page_group_init - Initialize the page group linkage for @req
+ * @req - a new nfs request
+ * @prev - the previous request in page group, or NULL if @req is the first
+ *         or only request in the group (the head).
+ */
+static inline void
+nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
+{
+       WARN_ON_ONCE(prev == req);
+
+       if (!prev) {
+               req->wb_head = req;
+               req->wb_this_page = req;
+       } else {
+               WARN_ON_ONCE(prev->wb_this_page != prev->wb_head);
+               WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags));
+               req->wb_head = prev->wb_head;
+               req->wb_this_page = prev->wb_this_page;
+               prev->wb_this_page = req;
+
+               /* grab extra ref if head request has extra ref from
+                * the write/commit path to handle handoff between write
+                * and commit lists */
+               if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags))
+                       kref_get(&req->wb_kref);
+       }
+}
+
+/*
+ * nfs_page_group_destroy - sync the destruction of page groups
+ * @req - request that no longer needs the page group
+ *
+ * releases the page group reference from each member once all
+ * members have called this function.
+ */
+static void
+nfs_page_group_destroy(struct kref *kref)
+{
+       struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
+       struct nfs_page *tmp, *next;
+
+       if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN))
+               return;
+
+       tmp = req;
+       do {
+               next = tmp->wb_this_page;
+               /* unlink and free */
+               tmp->wb_this_page = tmp;
+               tmp->wb_head = tmp;
+               nfs_free_request(tmp);
+               tmp = next;
+       } while (tmp != req);
+}
+
  /**
   * nfs_create_request - Create an NFS read/write request.
   * @ctx: open context to use
   * @page: page to write
+ * @last: last nfs request created for this page group or NULL if head
   * @offset: starting offset within the page for the write
   * @count: number of bytes to read/write
   *
@@ -149,7 +292,8 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
   */
  struct nfs_page *
  nfs_create_request(struct nfs_open_context *ctx, struct page *page,
-                  unsigned int offset, unsigned int count)
+                  struct nfs_page *last, unsigned int offset,
+                  unsigned int count)
  {
         struct nfs_page         *req;
         struct nfs_lock_context *l_ctx;
@@ -181,6 +325,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
         req->wb_bytes   = count;
         req->wb_context = get_nfs_open_context(ctx);
         kref_init(&req->wb_kref);
+       nfs_page_group_init(req, last);
         return req;
  }
  
@@ -238,16 +383,18 @@ static void nfs_clear_request(struct nfs_page *req)
         }
  }
  
-
  /**
   * nfs_release_request - Release the count on an NFS read/write request
   * @req: request to release
   *
   * Note: Should never be called with the spinlock held!
   */
-static void nfs_free_request(struct kref *kref)
+static void nfs_free_request(struct nfs_page *req)
  {
-       struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
+       WARN_ON_ONCE(req->wb_this_page != req);
+
+       /* extra debug: make sure no sync bits are still set */
+       WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
  
         /* Release struct file and open context */
         nfs_clear_request(req);
@@ -256,7 +403,7 @@ static void nfs_free_request(struct kref *kref)
  
  void nfs_release_request(struct nfs_page *req)
  {
-       kref_put(&req->wb_kref, nfs_free_request);
+       kref_put(&req->wb_kref, nfs_page_group_destroy);
  }
  
  static int nfs_wait_bit_uninterruptible(void *word)
@@ -832,21 +979,66 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
   * @desc: destination io descriptor
   * @req: request
   *
+ * This may split a request into subrequests which are all part of the
+ * same page group.
+ *
   * Returns true if the request 'req' was successfully coalesced into the
   * existing list of pages 'desc'.
   */
  static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
                            struct nfs_page *req)
  {
-       while (!nfs_pageio_do_add_request(desc, req)) {
-               desc->pg_moreio = 1;
-               nfs_pageio_doio(desc);
-               if (desc->pg_error < 0)
-                       return 0;
-               desc->pg_moreio = 0;
-               if (desc->pg_recoalesce)
-                       return 0;
-       }
+       struct nfs_page *subreq;
+       unsigned int bytes_left = 0;
+       unsigned int offset, pgbase;
+
+       nfs_page_group_lock(req);
+
+       subreq = req;
+       bytes_left = subreq->wb_bytes;
+       offset = subreq->wb_offset;
+       pgbase = subreq->wb_pgbase;
+
+       do {
+               if (!nfs_pageio_do_add_request(desc, subreq)) {
+                       /* make sure pg_test call(s) did nothing */
+                       WARN_ON_ONCE(subreq->wb_bytes != bytes_left);
+                       WARN_ON_ONCE(subreq->wb_offset != offset);
+                       WARN_ON_ONCE(subreq->wb_pgbase != pgbase);
+
+                       nfs_page_group_unlock(req);
+                       desc->pg_moreio = 1;
+                       nfs_pageio_doio(desc);
+                       if (desc->pg_error < 0)
+                               return 0;
+                       desc->pg_moreio = 0;
+                       if (desc->pg_recoalesce)
+                               return 0;
+                       /* retry add_request for this subreq */
+                       nfs_page_group_lock(req);
+                       continue;
+               }
+
+               /* check for buggy pg_test call(s) */
+               WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE);
+               WARN_ON_ONCE(subreq->wb_bytes > bytes_left);
+               WARN_ON_ONCE(subreq->wb_bytes == 0);
+
+               bytes_left -= subreq->wb_bytes;
+               offset += subreq->wb_bytes;
+               pgbase += subreq->wb_bytes;
+
+               if (bytes_left) {
+                       subreq = nfs_create_request(req->wb_context,
+                                       req->wb_page,
+                                       subreq, pgbase, bytes_left);
+                       nfs_lock_request(subreq);
+                       subreq->wb_offset  = offset;
+                       subreq->wb_index = req->wb_index;
+               }
+       } while (bytes_left > 0);
+
+       nfs_page_group_unlock(req);
         return 1;
  }
  
diff --git a/fs/nfs/read.c b/fs/nfs/read.c

index 46d90448f69b214de0426f2a35e36c50b634473d..902ba2c63d05dc8a4a13774f54ab388e1f3bb6a3 100644 (file)
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -85,7 +85,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
         len = nfs_page_length(page);
         if (len == 0)
                 return nfs_return_empty_page(page);
-       new = nfs_create_request(ctx, page, 0, len);
+       new = nfs_create_request(ctx, page, NULL, 0, len);
         if (IS_ERR(new)) {
                 unlock_page(page);
                 return PTR_ERR(new);
@@ -311,7 +311,7 @@ readpage_async_filler(void *data, struct page *page)
         if (len == 0)
                 return nfs_return_empty_page(page);
  
-       new = nfs_create_request(desc->ctx, page, 0, len);
+       new = nfs_create_request(desc->ctx, page, NULL, 0, len);
         if (IS_ERR(new))
                 goto out_error;
  
diff --git a/fs/nfs/write.c b/fs/nfs/write.c

index e773df207c054034c371fbdfacfd7c0cf1b87b69..d0f30f12a8b3f381f9825b50be00ccc8060f0e33 100644 (file)
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -367,6 +367,8 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
  {
         struct nfs_inode *nfsi = NFS_I(inode);
  
+       WARN_ON_ONCE(req->wb_this_page != req);
+
         /* Lock the request! */
         nfs_lock_request(req);
  
@@ -383,6 +385,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
                 set_page_private(req->wb_page, (unsigned long)req);
         }
         nfsi->npages++;
+       set_bit(PG_INODE_REF, &req->wb_flags);
         kref_get(&req->wb_kref);
         spin_unlock(&inode->i_lock);
  }
@@ -567,6 +570,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
  {
         struct nfs_commit_info cinfo;
         unsigned long bytes = 0;
+       bool do_destroy;
  
         if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
                 goto out;
@@ -596,6 +600,7 @@ remove_req:
  next:
                 nfs_unlock_request(req);
                 nfs_end_page_writeback(req->wb_page);
+               do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags);
                 nfs_release_request(req);
         }
  out:
@@ -700,6 +705,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
                 if (req == NULL)
                         goto out_unlock;
  
+               /* should be handled by nfs_flush_incompatible */
+               WARN_ON_ONCE(req->wb_head != req);
+               WARN_ON_ONCE(req->wb_this_page != req);
+
                 rqend = req->wb_offset + req->wb_bytes;
                 /*
                  * Tell the caller to flush out the request if
@@ -761,7 +770,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
         req = nfs_try_to_update_request(inode, page, offset, bytes);
         if (req != NULL)
                 goto out;
-       req = nfs_create_request(ctx, page, offset, bytes);
+       req = nfs_create_request(ctx, page, NULL, offset, bytes);
         if (IS_ERR(req))
                 goto out;
         nfs_inode_add_request(inode, req);
@@ -805,6 +814,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
                         return 0;
                 l_ctx = req->wb_lock_context;
                 do_flush = req->wb_page != page || req->wb_context != ctx;
+               /* for now, flush if more than 1 request in page_group */
+               do_flush |= req->wb_this_page != req;
                 if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
                         do_flush |= l_ctx->lockowner.l_owner != current->files
                                 || l_ctx->lockowner.l_pid != current->tgid;
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h

index 13d59af561f624a99a2a5e4b1df6805b16b9e1fe..986c0c279d0e9cf8d56adaf6c17e04061958ccd6 100644 (file)
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -26,6 +26,9 @@ enum {
         PG_MAPPED,              /* page private set for buffered io */
         PG_CLEAN,               /* write succeeded */
         PG_COMMIT_TO_DS,        /* used by pnfs layouts */
+       PG_INODE_REF,           /* extra ref held by inode (head req only) */
+       PG_HEADLOCK,            /* page group lock of wb_head */
+       PG_TEARDOWN,            /* page group sync for destroy */
  };
  
  struct nfs_inode;
@@ -41,6 +44,8 @@ struct nfs_page {
         struct kref             wb_kref;        /* reference count */
         unsigned long           wb_flags;
         struct nfs_write_verifier       wb_verf;        /* Commit cookie */
+       struct nfs_page         *wb_this_page;  /* list of reqs for this page */
+       struct nfs_page         *wb_head;       /* head pointer for req list */
  };
  
  struct nfs_pageio_descriptor;
@@ -87,9 +92,10 @@ struct nfs_pageio_descriptor {
  
  extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx,
                                             struct page *page,
+                                           struct nfs_page *last,
                                             unsigned int offset,
                                             unsigned int count);
-extern void nfs_release_request(struct nfs_page *req);
+extern void nfs_release_request(struct nfs_page *);
  
  
  extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
@@ -108,7 +114,10 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
                                 struct nfs_page *req);
  extern  int nfs_wait_on_request(struct nfs_page *);
  extern void nfs_unlock_request(struct nfs_page *req);
-extern void nfs_unlock_and_release_request(struct nfs_page *req);
+extern void nfs_unlock_and_release_request(struct nfs_page *);
+extern void nfs_page_group_lock(struct nfs_page *);
+extern void nfs_page_group_unlock(struct nfs_page *);
+extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
  
  /*
   * Lock the page of an asynchronous request
author	Weston Andros Adamson <dros@primarydata.com>
	Thu, 15 May 2014 15:56:45 +0000 (11:56 -0400)
committer	Trond Myklebust <trond.myklebust@primarydata.com>
	Thu, 29 May 2014 15:11:44 +0000 (11:11 -0400)
fs/nfs/direct.c		patch \| blob \| history
fs/nfs/pagelist.c		patch \| blob \| history
fs/nfs/read.c		patch \| blob \| history
fs/nfs/write.c		patch \| blob \| history
include/linux/nfs_page.h		patch \| blob \| history