perf record: Enable asynchronous trace writing
authorAlexey Budankov <alexey.budankov@linux.intel.com>
Tue, 6 Nov 2018 09:04:58 +0000 (12:04 +0300)
committerArnaldo Carvalho de Melo <acme@redhat.com>
Mon, 17 Dec 2018 17:55:08 +0000 (14:55 -0300)
The trace file offset is read once before the loop that iterates over the
mmaps and is written back after all performance data is enqueued for aio writing.

The trace file offset is incremented linearly after every successful aio
write operation.

record__aio_sync() blocks till completion of the started AIO operation
and then proceeds.

record__aio_mmap_read_sync() implements a barrier for all incomplete
aio write requests.

Signed-off-by: Alexey Budankov <alexey.budankov@linux.intel.com>
Reviewed-by: Jiri Olsa <jolsa@redhat.com>
Acked-by: Namhyung Kim <namhyung@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/ce2d45e9-d236-871c-7c8f-1bed2d37e8ac@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
tools/perf/Documentation/perf-record.txt
tools/perf/builtin-record.c
tools/perf/perf.h
tools/perf/util/evlist.c
tools/perf/util/evlist.h
tools/perf/util/mmap.c
tools/perf/util/mmap.h

index 246dee081efda287e39060a55e4bfc8ca7bfde8e..7efb4af88a689e2b6599da0c75c250bc9f7b29ec 100644 (file)
@@ -435,6 +435,11 @@ Specify vmlinux path which has debuginfo.
 --buildid-all::
 Record build-id of all DSOs regardless whether it's actually hit or not.
 
+--aio::
+Enable asynchronous (POSIX AIO) trace writing mode.
+Asynchronous mode is supported only when the perf tool is linked with a libc
+library that provides an implementation of the POSIX AIO API.
+
 --all-kernel::
 Configure all used events to run in kernel space.
 
index 488779bc4c8d2f6ed8dbcad69e1de5e477ede138..408d6477c960b21694993b5eef26e283d0765398 100644 (file)
@@ -124,6 +124,183 @@ static int record__write(struct record *rec, struct perf_mmap *map __maybe_unuse
        return 0;
 }
 
+#ifdef HAVE_AIO_SUPPORT
+static int record__aio_write(struct aiocb *cblock, int trace_fd,
+               void *buf, size_t size, off_t off)
+{
+       int rc;
+
+       cblock->aio_fildes = trace_fd;
+       cblock->aio_buf    = buf;
+       cblock->aio_nbytes = size;
+       cblock->aio_offset = off;
+       cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
+
+       do {
+               rc = aio_write(cblock);
+               if (rc == 0) {
+                       break;
+               } else if (errno != EAGAIN) {
+                       cblock->aio_fildes = -1;
+                       pr_err("failed to queue perf data, error: %m\n");
+                       break;
+               }
+       } while (1);
+
+       return rc;
+}
+
+static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
+{
+       void *rem_buf;
+       off_t rem_off;
+       size_t rem_size;
+       int rc, aio_errno;
+       ssize_t aio_ret, written;
+
+       aio_errno = aio_error(cblock);
+       if (aio_errno == EINPROGRESS)
+               return 0;
+
+       written = aio_ret = aio_return(cblock);
+       if (aio_ret < 0) {
+               if (aio_errno != EINTR)
+                       pr_err("failed to write perf data, error: %m\n");
+               written = 0;
+       }
+
+       rem_size = cblock->aio_nbytes - written;
+
+       if (rem_size == 0) {
+               cblock->aio_fildes = -1;
+               /*
+                * md->refcount is incremented in perf_mmap__aio_push() for
+                * every enqueued aio write request so decrement it because
+                * the request is now complete.
+                */
+               perf_mmap__put(md);
+               rc = 1;
+       } else {
+               /*
+                * aio write request may require restart with the
+                * remainder if the kernel didn't write the whole
+                * chunk at once.
+                */
+               rem_off = cblock->aio_offset + written;
+               rem_buf = (void *)(cblock->aio_buf + written);
+               record__aio_write(cblock, cblock->aio_fildes,
+                               rem_buf, rem_size, rem_off);
+               rc = 0;
+       }
+
+       return rc;
+}
+
+static void record__aio_sync(struct perf_mmap *md)
+{
+       struct aiocb *cblock = &md->aio.cblock;
+       struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
+
+       do {
+               if (cblock->aio_fildes == -1 || record__aio_complete(md, cblock))
+                       return;
+
+               while (aio_suspend((const struct aiocb**)&cblock, 1, &timeout)) {
+                       if (!(errno == EAGAIN || errno == EINTR))
+                               pr_err("failed to sync perf data, error: %m\n");
+               }
+       } while (1);
+}
+
+static int record__aio_pushfn(void *to, struct aiocb *cblock, void *bf, size_t size, off_t off)
+{
+       struct record *rec = to;
+       int ret, trace_fd = rec->session->data->file.fd;
+
+       rec->samples++;
+
+       ret = record__aio_write(cblock, trace_fd, bf, size, off);
+       if (!ret) {
+               rec->bytes_written += size;
+               if (switch_output_size(rec))
+                       trigger_hit(&switch_output_trigger);
+       }
+
+       return ret;
+}
+
+static off_t record__aio_get_pos(int trace_fd)
+{
+       return lseek(trace_fd, 0, SEEK_CUR);
+}
+
+static void record__aio_set_pos(int trace_fd, off_t pos)
+{
+       lseek(trace_fd, pos, SEEK_SET);
+}
+
+static void record__aio_mmap_read_sync(struct record *rec)
+{
+       int i;
+       struct perf_evlist *evlist = rec->evlist;
+       struct perf_mmap *maps = evlist->mmap;
+
+       if (!rec->opts.nr_cblocks)
+               return;
+
+       for (i = 0; i < evlist->nr_mmaps; i++) {
+               struct perf_mmap *map = &maps[i];
+
+               if (map->base)
+                       record__aio_sync(map);
+       }
+}
+
+static int nr_cblocks_default = 1;
+
+static int record__aio_parse(const struct option *opt,
+                            const char *str __maybe_unused,
+                            int unset)
+{
+       struct record_opts *opts = (struct record_opts *)opt->value;
+
+       if (unset)
+               opts->nr_cblocks = 0;
+       else
+               opts->nr_cblocks = nr_cblocks_default;
+
+       return 0;
+}
+#else /* HAVE_AIO_SUPPORT */
+static void record__aio_sync(struct perf_mmap *md __maybe_unused)
+{
+}
+
+static int record__aio_pushfn(void *to __maybe_unused, struct aiocb *cblock __maybe_unused,
+               void *bf __maybe_unused, size_t size __maybe_unused, off_t off __maybe_unused)
+{
+       return -1;
+}
+
+static off_t record__aio_get_pos(int trace_fd __maybe_unused)
+{
+       return -1;
+}
+
+static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
+{
+}
+
+static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
+{
+}
+#endif
+
+static int record__aio_enabled(struct record *rec)
+{
+       return rec->opts.nr_cblocks > 0;
+}
+
 static int process_synthesized_event(struct perf_tool *tool,
                                     union perf_event *event,
                                     struct perf_sample *sample __maybe_unused,
@@ -329,7 +506,7 @@ static int record__mmap_evlist(struct record *rec,
 
        if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
                                 opts->auxtrace_mmap_pages,
-                                opts->auxtrace_snapshot_mode) < 0) {
+                                opts->auxtrace_snapshot_mode, opts->nr_cblocks) < 0) {
                if (errno == EPERM) {
                        pr_err("Permission error mapping pages.\n"
                               "Consider increasing "
@@ -525,6 +702,8 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli
        int i;
        int rc = 0;
        struct perf_mmap *maps;
+       int trace_fd = rec->data.file.fd;
+       off_t off;
 
        if (!evlist)
                return 0;
@@ -536,13 +715,29 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli
        if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
                return 0;
 
+       if (record__aio_enabled(rec))
+               off = record__aio_get_pos(trace_fd);
+
        for (i = 0; i < evlist->nr_mmaps; i++) {
                struct perf_mmap *map = &maps[i];
 
                if (map->base) {
-                       if (perf_mmap__push(map, rec, record__pushfn) != 0) {
-                               rc = -1;
-                               goto out;
+                       if (!record__aio_enabled(rec)) {
+                               if (perf_mmap__push(map, rec, record__pushfn) != 0) {
+                                       rc = -1;
+                                       goto out;
+                               }
+                       } else {
+                               /*
+                                * Call record__aio_sync() to wait till map->aio.data buffer
+                                * becomes available after previous aio write request.
+                                */
+                               record__aio_sync(map);
+                               if (perf_mmap__aio_push(map, rec, record__aio_pushfn, &off) != 0) {
+                                       record__aio_set_pos(trace_fd, off);
+                                       rc = -1;
+                                       goto out;
+                               }
                        }
                }
 
@@ -553,6 +748,9 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli
                }
        }
 
+       if (record__aio_enabled(rec))
+               record__aio_set_pos(trace_fd, off);
+
        /*
         * Mark the round finished in case we wrote
         * at least one event.
@@ -658,6 +856,8 @@ record__switch_output(struct record *rec, bool at_exit)
        /* Same Size:      "2015122520103046"*/
        char timestamp[] = "InvalidTimestamp";
 
+       record__aio_mmap_read_sync(rec);
+
        record__synthesize(rec, true);
        if (target__none(&rec->opts.target))
                record__synthesize_workload(rec, true);
@@ -1168,6 +1368,8 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
                record__synthesize_workload(rec, true);
 
 out_child:
+       record__aio_mmap_read_sync(rec);
+
        if (forks) {
                int exit_status;
 
@@ -1706,6 +1908,11 @@ static struct option __record_options[] = {
                          "signal"),
        OPT_BOOLEAN(0, "dry-run", &dry_run,
                    "Parse options then exit"),
+#ifdef HAVE_AIO_SUPPORT
+       OPT_CALLBACK_NOOPT(0, "aio", &record.opts,
+                    NULL, "Enable asynchronous trace writing mode",
+                    record__aio_parse),
+#endif
        OPT_END()
 };
 
@@ -1898,6 +2105,9 @@ int cmd_record(int argc, const char **argv)
                goto out;
        }
 
+       if (verbose > 0)
+               pr_info("nr_cblocks: %d\n", rec->opts.nr_cblocks);
+
        err = __cmd_record(&record, argc, argv);
 out:
        perf_evlist__delete(rec->evlist);
index 0ed4a34c74c4bc6d2c38457c05a59dd9f6ea8551..4d40baa45a5f500bb5d5f3e5329f294ab2099b01 100644 (file)
@@ -83,6 +83,7 @@ struct record_opts {
        clockid_t    clockid;
        u64          clockid_res_ns;
        unsigned int proc_map_timeout;
+       int          nr_cblocks;
 };
 
 struct option;
index 6f010b9f0a81ff4ef3123ecd9fdbd4f50e16792a..e90575192209c54db7fadf8408c5885799b70387 100644 (file)
@@ -1018,7 +1018,7 @@ int perf_evlist__parse_mmap_pages(const struct option *opt, const char *str,
  */
 int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
                         unsigned int auxtrace_pages,
-                        bool auxtrace_overwrite)
+                        bool auxtrace_overwrite, int nr_cblocks)
 {
        struct perf_evsel *evsel;
        const struct cpu_map *cpus = evlist->cpus;
@@ -1028,7 +1028,7 @@ int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
         * Its value is decided by evsel's write_backward.
         * So &mp should not be passed through const pointer.
         */
-       struct mmap_params mp = { .nr_cblocks = 0 };
+       struct mmap_params mp = { .nr_cblocks = nr_cblocks };
 
        if (!evlist->mmap)
                evlist->mmap = perf_evlist__alloc_mmap(evlist, false);
@@ -1060,7 +1060,7 @@ int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
 
 int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages)
 {
-       return perf_evlist__mmap_ex(evlist, pages, 0, false);
+       return perf_evlist__mmap_ex(evlist, pages, 0, false, 0);
 }
 
 int perf_evlist__create_maps(struct perf_evlist *evlist, struct target *target)
index d108d167eb364aff656f5e8e52143ae2b967162a..8682944911944004e64e7138596ed94c292c3c89 100644 (file)
@@ -162,7 +162,7 @@ unsigned long perf_event_mlock_kb_in_pages(void);
 
 int perf_evlist__mmap_ex(struct perf_evlist *evlist, unsigned int pages,
                         unsigned int auxtrace_pages,
-                        bool auxtrace_overwrite);
+                        bool auxtrace_overwrite, int nr_cblocks);
 int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages);
 void perf_evlist__munmap(struct perf_evlist *evlist);
 
index 47cdc3ad654609b18c0e5f3ba5a99d838eb84eeb..61aa381d05d0a8a26204628268637716a03eb1e4 100644 (file)
@@ -158,7 +158,8 @@ static int perf_mmap__aio_mmap(struct perf_mmap *map, struct mmap_params *mp)
 {
        int delta_max;
 
-       if (mp->nr_cblocks) {
+       map->aio.nr_cblocks = mp->nr_cblocks;
+       if (map->aio.nr_cblocks) {
                map->aio.data = malloc(perf_mmap__mmap_len(map));
                if (!map->aio.data) {
                        pr_debug2("failed to allocate data buffer, error %m\n");
@@ -187,6 +188,80 @@ static void perf_mmap__aio_munmap(struct perf_mmap *map)
        if (map->aio.data)
                zfree(&map->aio.data);
 }
+
+int perf_mmap__aio_push(struct perf_mmap *md, void *to,
+                       int push(void *to, struct aiocb *cblock, void *buf, size_t size, off_t off),
+                       off_t *off)
+{
+       u64 head = perf_mmap__read_head(md);
+       unsigned char *data = md->base + page_size;
+       unsigned long size, size0 = 0;
+       void *buf;
+       int rc = 0;
+
+       rc = perf_mmap__read_init(md);
+       if (rc < 0)
+               return (rc == -EAGAIN) ? 0 : -1;
+
+       /*
+        * md->base data is copied into the md->aio.data buffer to
+        * release space in the kernel buffer as fast as possible,
+        * through perf_mmap__consume() below.
+        *
+        * That lets the kernel proceed with storing more
+        * profiling data into the kernel buffer before the other
+        * per-cpu kernel buffers are handled.
+        *
+        * Copying can be done in two steps in case the chunk of
+        * profiling data crosses the upper bound of the kernel buffer.
+        * In this case we first move part of the data from md->start
+        * till the upper bound and then the remainder from the
+        * beginning of the kernel buffer till the end of
+        * the data chunk.
+        */
+
+       size = md->end - md->start;
+
+       if ((md->start & md->mask) + size != (md->end & md->mask)) {
+               buf = &data[md->start & md->mask];
+               size = md->mask + 1 - (md->start & md->mask);
+               md->start += size;
+               memcpy(md->aio.data, buf, size);
+               size0 = size;
+       }
+
+       buf = &data[md->start & md->mask];
+       size = md->end - md->start;
+       md->start += size;
+       memcpy(md->aio.data + size0, buf, size);
+
+       /*
+        * Increment md->refcount to guard the md->aio.data buffer
+        * from premature deallocation because the md object can be
+        * released earlier than the aio write request started
+        * on md->aio.data is complete.
+        *
+        * perf_mmap__put() is done at record__aio_complete()
+        * after started request completion.
+        */
+       perf_mmap__get(md);
+
+       md->prev = head;
+       perf_mmap__consume(md);
+
+       rc = push(to, &md->aio.cblock, md->aio.data, size0 + size, *off);
+       if (!rc) {
+               *off += size0 + size;
+       } else {
+               /*
+                * Decrement md->refcount back if aio write
+                * operation failed to start.
+                */
+               perf_mmap__put(md);
+       }
+
+       return rc;
+}
 #else
 static int perf_mmap__aio_mmap(struct perf_mmap *map __maybe_unused,
                               struct mmap_params *mp __maybe_unused)
index 3f10ad030c5e23b2a4f491e733056cb88816995f..b99213ba11b5f072c54ae3177114fc65960fab09 100644 (file)
@@ -12,6 +12,7 @@
 #include "auxtrace.h"
 #include "event.h"
 
+struct aiocb;
 /**
  * struct perf_mmap - perf's ring buffer mmap details
  *
@@ -33,6 +34,7 @@ struct perf_mmap {
        struct {
                void             *data;
                struct aiocb     cblock;
+               int              nr_cblocks;
        } aio;
 #endif
 };
@@ -94,6 +96,18 @@ union perf_event *perf_mmap__read_event(struct perf_mmap *map);
 
 int perf_mmap__push(struct perf_mmap *md, void *to,
                    int push(struct perf_mmap *map, void *to, void *buf, size_t size));
+#ifdef HAVE_AIO_SUPPORT
+int perf_mmap__aio_push(struct perf_mmap *md, void *to,
+                       int push(void *to, struct aiocb *cblock, void *buf, size_t size, off_t off),
+                       off_t *off);
+#else
+static inline int perf_mmap__aio_push(struct perf_mmap *md __maybe_unused, void *to __maybe_unused,
+       int push(void *to, struct aiocb *cblock, void *buf, size_t size, off_t off) __maybe_unused,
+       off_t *off __maybe_unused)
+{
+       return 0;
+}
+#endif
 
 size_t perf_mmap__mmap_len(struct perf_mmap *map);