md: Runtime support for multiple PPLs
Author:     Pawel Baldysiak <pawel.baldysiak@intel.com>
AuthorDate: Wed, 16 Aug 2017 15:13:45 +0000 (17:13 +0200)
Committer:  Shaohua Li <shli@fb.com>
CommitDate: Mon, 28 Aug 2017 14:45:48 +0000 (07:45 -0700)
Increase the PPL area to 1 MB and use it as a circular buffer to store
PPL entries. The entry with the highest generation number is the latest
one. If the PPL to be written is larger than the space left in the
buffer, rewind the buffer to the start (don't wrap it).

Signed-off-by: Pawel Baldysiak <pawel.baldysiak@intel.com>
Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
drivers/md/md.c
drivers/md/md.h
drivers/md/raid0.c
drivers/md/raid1.c
drivers/md/raid5-ppl.c
drivers/md/raid5.c
include/uapi/linux/raid/md_p.h

diff --git a/drivers/md/md.c b/drivers/md/md.c
index a74dc99638229ad09f9f30ccd5e3bac18d81e499..a7876237de10cf922c858a8f4ce3067090512bdd 100644
@@ -1536,7 +1536,8 @@ static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_
        } else if (sb->bblog_offset != 0)
                rdev->badblocks.shift = 0;
 
-       if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) {
+       if ((le32_to_cpu(sb->feature_map) &
+           (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
                rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
                rdev->ppl.size = le16_to_cpu(sb->ppl.size);
                rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
@@ -1655,10 +1656,15 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
                if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
                        set_bit(MD_HAS_JOURNAL, &mddev->flags);
 
-               if (le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) {
+               if (le32_to_cpu(sb->feature_map) &
+                   (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
                        if (le32_to_cpu(sb->feature_map) &
                            (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
                                return -EINVAL;
+                       if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
+                           (le32_to_cpu(sb->feature_map) &
+                                           MD_FEATURE_MULTIPLE_PPLS))
+                               return -EINVAL;
                        set_bit(MD_HAS_PPL, &mddev->flags);
                }
        } else if (mddev->pers == NULL) {
@@ -1875,7 +1881,11 @@ retry:
                sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
 
        if (test_bit(MD_HAS_PPL, &mddev->flags)) {
-               sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
+               if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
+                       sb->feature_map |=
+                           cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
+               else
+                       sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
                sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
                sb->ppl.size = cpu_to_le16(rdev->ppl.size);
        }
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 09db034558017e3d327960e8333d5defa35acdcb..d4bdfa5c223b9aa34931dd57753efd395506b5f8 100644
@@ -236,6 +236,7 @@ enum mddev_flags {
                                 * never cause the array to become failed.
                                 */
        MD_HAS_PPL,             /* The raid array has PPL feature set */
+       MD_HAS_MULTIPLE_PPLS,   /* The raid array has multiple PPLs feature set */
 };
 
 enum mddev_sb_flags {
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 6fb81704aff4f221138a8d70da311cc4626e6c67..fd5e8e5efbeff0f5c7682e69ebe586f55e3ae178 100644
@@ -30,7 +30,8 @@
        ((1L << MD_HAS_JOURNAL) |       \
         (1L << MD_JOURNAL_CLEAN) |     \
         (1L << MD_FAILFAST_SUPPORTED) |\
-        (1L << MD_HAS_PPL))
+        (1L << MD_HAS_PPL) |           \
+        (1L << MD_HAS_MULTIPLE_PPLS))
 
 static int raid0_congested(struct mddev *mddev, int bits)
 {
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 79474f47eeefad3477d1009857022c574c7a2dc9..1f5bd9475dc1306d8eb98bd55ddc16ed33e6bb16 100644
@@ -48,7 +48,8 @@
 #define UNSUPPORTED_MDDEV_FLAGS                \
        ((1L << MD_HAS_JOURNAL) |       \
         (1L << MD_JOURNAL_CLEAN) |     \
-        (1L << MD_HAS_PPL))
+        (1L << MD_HAS_PPL) |           \
+        (1L << MD_HAS_MULTIPLE_PPLS))
 
 /*
  * Number of guaranteed r1bios in case of extreme VM load:
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 44ad5baf320684e61b1aee1549d15b6e68c0a732..b313f17a62601640817f8ba1bb3629c1a80e29dd 100644
@@ -87,6 +87,8 @@
  * The current io_unit accepting new stripes is always at the end of the list.
  */
 
+#define PPL_SPACE_SIZE (128 * 1024)
+
 struct ppl_conf {
        struct mddev *mddev;
 
@@ -122,6 +124,10 @@ struct ppl_log {
                                         * always at the end of io_list */
        spinlock_t io_list_lock;
        struct list_head io_list;       /* all io_units of this log */
+
+       sector_t next_io_sector;
+       unsigned int entry_space;
+       bool use_multippl;
 };
 
 #define PPL_IO_INLINE_BVECS 32
@@ -264,13 +270,12 @@ static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh)
        int i;
        sector_t data_sector = 0;
        int data_disks = 0;
-       unsigned int entry_space = (log->rdev->ppl.size << 9) - PPL_HEADER_SIZE;
        struct r5conf *conf = sh->raid_conf;
 
        pr_debug("%s: stripe: %llu\n", __func__, (unsigned long long)sh->sector);
 
        /* check if current io_unit is full */
-       if (io && (io->pp_size == entry_space ||
+       if (io && (io->pp_size == log->entry_space ||
                   io->entries_count == PPL_HDR_MAX_ENTRIES)) {
                pr_debug("%s: add io_unit blocked by seq: %llu\n",
                         __func__, io->seq);
@@ -451,12 +456,24 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
        pplhdr->entries_count = cpu_to_le32(io->entries_count);
        pplhdr->checksum = cpu_to_le32(~crc32c_le(~0, pplhdr, PPL_HEADER_SIZE));
 
+       /* Rewind the buffer if current PPL is larger than remaining space */
+       if (log->use_multippl &&
+           log->rdev->ppl.sector + log->rdev->ppl.size - log->next_io_sector <
+           (PPL_HEADER_SIZE + io->pp_size) >> 9)
+               log->next_io_sector = log->rdev->ppl.sector;
+
        bio->bi_end_io = ppl_log_endio;
        bio->bi_opf = REQ_OP_WRITE | REQ_FUA;
        bio->bi_bdev = log->rdev->bdev;
-       bio->bi_iter.bi_sector = log->rdev->ppl.sector;
+       bio->bi_iter.bi_sector = log->next_io_sector;
        bio_add_page(bio, io->header_page, PAGE_SIZE, 0);
 
+       pr_debug("%s: log->next_io_sector: %llu\n", __func__,
+           (unsigned long long)log->next_io_sector);
+
+       if (log->use_multippl)
+               log->next_io_sector += (PPL_HEADER_SIZE + io->pp_size) >> 9;
+
        list_for_each_entry(sh, &io->stripe_list, log_list) {
                /* entries for full stripe writes have no partial parity */
                if (test_bit(STRIPE_FULL_WRITE, &sh->state))
@@ -1031,6 +1049,7 @@ static int ppl_load(struct ppl_conf *ppl_conf)
 static void __ppl_exit_log(struct ppl_conf *ppl_conf)
 {
        clear_bit(MD_HAS_PPL, &ppl_conf->mddev->flags);
+       clear_bit(MD_HAS_MULTIPLE_PPLS, &ppl_conf->mddev->flags);
 
        kfree(ppl_conf->child_logs);
 
@@ -1099,6 +1118,22 @@ static int ppl_validate_rdev(struct md_rdev *rdev)
        return 0;
 }
 
+static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev)
+{
+       if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE +
+                                     PPL_HEADER_SIZE) * 2) {
+               log->use_multippl = true;
+               set_bit(MD_HAS_MULTIPLE_PPLS,
+                       &log->ppl_conf->mddev->flags);
+               log->entry_space = PPL_SPACE_SIZE;
+       } else {
+               log->use_multippl = false;
+               log->entry_space = (log->rdev->ppl.size << 9) -
+                                  PPL_HEADER_SIZE;
+       }
+       log->next_io_sector = rdev->ppl.sector;
+}
+
 int ppl_init_log(struct r5conf *conf)
 {
        struct ppl_conf *ppl_conf;
@@ -1196,6 +1231,7 @@ int ppl_init_log(struct r5conf *conf)
                        q = bdev_get_queue(rdev->bdev);
                        if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
                                need_cache_flush = true;
+                       ppl_init_child_log(log, rdev);
                }
        }
 
@@ -1261,6 +1297,7 @@ int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add)
                if (!ret) {
                        log->rdev = rdev;
                        ret = ppl_write_empty_header(log);
+                       ppl_init_child_log(log, rdev);
                }
        } else {
                log->rdev = NULL;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6af57c6c0533bd9029e822d4413341d9f23c4256..049a958d3c1e4389190fb6d39d8d25a22fdab582 100644
@@ -7236,6 +7236,7 @@ static int raid5_run(struct mddev *mddev)
                pr_warn("md/raid:%s: using journal device and PPL not allowed - disabling PPL\n",
                        mdname(mddev));
                clear_bit(MD_HAS_PPL, &mddev->flags);
+               clear_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags);
        }
 
        if (mddev->private == NULL)
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index d500bd224979d923f15732bd19a5bcde11c8f91f..b9197976b660b41c7e728c361dec5890feee8864 100644
@@ -324,9 +324,10 @@ struct mdp_superblock_1 {
 #define        MD_FEATURE_RECOVERY_BITMAP      128 /* recovery that is happening
                                             * is guided by bitmap.
                                             */
-#define MD_FEATURE_CLUSTERED           256 /* clustered MD */
+#define        MD_FEATURE_CLUSTERED            256 /* clustered MD */
 #define        MD_FEATURE_JOURNAL              512 /* support write cache */
 #define        MD_FEATURE_PPL                  1024 /* support PPL */
+#define        MD_FEATURE_MULTIPLE_PPLS        2048 /* support for multiple PPLs */
 #define        MD_FEATURE_ALL                  (MD_FEATURE_BITMAP_OFFSET       \
                                        |MD_FEATURE_RECOVERY_OFFSET     \
                                        |MD_FEATURE_RESHAPE_ACTIVE      \
@@ -338,6 +339,7 @@ struct mdp_superblock_1 {
                                        |MD_FEATURE_CLUSTERED           \
                                        |MD_FEATURE_JOURNAL             \
                                        |MD_FEATURE_PPL                 \
+                                       |MD_FEATURE_MULTIPLE_PPLS       \
                                        )
 
 struct r5l_payload_header {