perf: Optimize ring-buffer write by depending on control dependencies

author Peter Zijlstra <peterz@infradead.org>

Mon, 25 Nov 2013 10:49:10 +0000 (11:49 +0100)

committer Ingo Molnar <mingo@kernel.org>

Wed, 11 Dec 2013 14:53:22 +0000 (15:53 +0100)
author Peter Zijlstra <peterz@infradead.org>
Mon, 25 Nov 2013 10:49:10 +0000 (11:49 +0100)
committer Ingo Molnar <mingo@kernel.org>
Wed, 11 Dec 2013 14:53:22 +0000 (15:53 +0100)
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c

index e8b168af135ba3ec7d72c780746ce02f051be2e3..146a5792b1d2aaf9412eaf2610f1f4cd1f05a037 100644 (file)
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -61,19 +61,20 @@ again:
          *
          *   kernel                             user
          *
-        *   READ ->data_tail                   READ ->data_head
-        *   smp_mb()   (A)                     smp_rmb()       (C)
-        *   WRITE $data                        READ $data
-        *   smp_wmb()  (B)                     smp_mb()        (D)
-        *   STORE ->data_head                  WRITE ->data_tail
+        *   if (LOAD ->data_tail) {            LOAD ->data_head
+        *                      (A)             smp_rmb()       (C)
+        *      STORE $data                     LOAD $data
+        *      smp_wmb()       (B)             smp_mb()        (D)
+        *      STORE ->data_head               STORE ->data_tail
+        *   }
          *
          * Where A pairs with D, and B pairs with C.
          *
-        * I don't think A needs to be a full barrier because we won't in fact
-        * write data until we see the store from userspace. So we simply don't
-        * issue the data WRITE until we observe it. Be conservative for now.
+        * In our case (A) is a control dependency that separates the load of
+        * the ->data_tail and the stores of $data. In case ->data_tail
+        * indicates there is no room in the buffer to store $data we do not.
          *
-        * OTOH, D needs to be a full barrier since it separates the data READ
+        * D needs to be a full barrier since it separates the data READ
          * from the tail WRITE.
          *
          * For B a WMB is sufficient since it separates two WRITEs, and for C
@@ -81,7 +82,7 @@ again:
          *
          * See perf_output_begin().
          */
-       smp_wmb();
+       smp_wmb(); /* B, matches C */
         rb->user_page->data_head = head;
  
         /*
@@ -144,17 +145,26 @@ int perf_output_begin(struct perf_output_handle *handle,
                 if (!rb->overwrite &&
                     unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
                         goto fail;
+
+               /*
+                * The above forms a control dependency barrier separating the
+                * @tail load above from the data stores below. Since the @tail
+                * load is required to compute the branch to fail below.
+                *
+                * A, matches D; the full memory barrier userspace SHOULD issue
+                * after reading the data and before storing the new tail
+                * position.
+                *
+                * See perf_output_put_handle().
+                */
+
                 head += size;
         } while (local_cmpxchg(&rb->head, offset, head) != offset);
  
         /*
-        * Separate the userpage->tail read from the data stores below.
-        * Matches the MB userspace SHOULD issue after reading the data
-        * and before storing the new tail position.
-        *
-        * See perf_output_put_handle().
+        * We rely on the implied barrier() by local_cmpxchg() to ensure
+        * none of the data stores below can be lifted up by the compiler.
          */
-       smp_mb();
  
         if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
                 local_add(rb->watermark, &rb->wakeup);
author	Peter Zijlstra <peterz@infradead.org>
	Mon, 25 Nov 2013 10:49:10 +0000 (11:49 +0100)
committer	Ingo Molnar <mingo@kernel.org>
	Wed, 11 Dec 2013 14:53:22 +0000 (15:53 +0100)