drbd: don't forget error completion when "unsuspending" IO
authorLars Ellenberg <lars.ellenberg@linbit.com>
Mon, 13 Jun 2016 22:26:26 +0000 (00:26 +0200)
committerJens Axboe <axboe@fb.com>
Tue, 14 Jun 2016 03:43:06 +0000 (21:43 -0600)
Possible sequence of events:
SyncTarget is made Primary, then loses replication link
(only path to good data on SyncSource).

Behavior is then controlled by the on-no-data-accessible policy,
which defaults to OND_IO_ERROR (may be set to OND_SUSPEND_IO).

If OND_IO_ERROR is in fact the current policy, we clear the susp_fen
(IO suspended due to fencing policy) flag, do NOT set the susp_nod
(IO suspended due to no data) flag.

But we forgot to call the IO error completion for all pending,
suspended requests.

While at it, also add a race check for a theoretically possible
race with a new handshake (network hiccup): we may be able to
re-send requests, and can avoid passing IO errors up the stack.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
drivers/block/drbd/drbd_nl.c

index 4a4eb80621ef24a57751bff65a9f806ce60b3a9a..e5fdcc6c2e672e77be031c2cb41d73084a76395a 100644 (file)
@@ -442,19 +442,17 @@ static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connec
        }
        rcu_read_unlock();
 
-       if (fp == FP_NOT_AVAIL) {
-               /* IO Suspending works on the whole resource.
-                  Do it only for one device. */
-               vnr = 0;
-               peer_device = idr_get_next(&connection->peer_devices, &vnr);
-               drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD, NS(susp_fen, 0));
-       }
-
        return fp;
 }
 
+static bool resource_is_supended(struct drbd_resource *resource)
+{
+       return resource->susp || resource->susp_fen || resource->susp_nod;
+}
+
 bool conn_try_outdate_peer(struct drbd_connection *connection)
 {
+       struct drbd_resource * const resource = connection->resource;
        unsigned int connect_cnt;
        union drbd_state mask = { };
        union drbd_state val = { };
@@ -462,21 +460,41 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
        char *ex_to_string;
        int r;
 
-       spin_lock_irq(&connection->resource->req_lock);
+       spin_lock_irq(&resource->req_lock);
        if (connection->cstate >= C_WF_REPORT_PARAMS) {
                drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n");
-               spin_unlock_irq(&connection->resource->req_lock);
+               spin_unlock_irq(&resource->req_lock);
                return false;
        }
 
        connect_cnt = connection->connect_cnt;
-       spin_unlock_irq(&connection->resource->req_lock);
+       spin_unlock_irq(&resource->req_lock);
 
        fp = highest_fencing_policy(connection);
        switch (fp) {
        case FP_NOT_AVAIL:
                drbd_warn(connection, "Not fencing peer, I'm not even Consistent myself.\n");
-               goto out;
+               spin_lock_irq(&resource->req_lock);
+               if (connection->cstate < C_WF_REPORT_PARAMS) {
+                       _conn_request_state(connection,
+                                           (union drbd_state) { { .susp_fen = 1 } },
+                                           (union drbd_state) { { .susp_fen = 0 } },
+                                           CS_VERBOSE | CS_HARD | CS_DC_SUSP);
+                       /* We are no longer suspended due to the fencing policy.
+                        * We may still be suspended due to the on-no-data-accessible policy.
+                        * If that was OND_IO_ERROR, fail pending requests. */
+                       if (!resource_is_supended(resource))
+                               _tl_restart(connection, CONNECTION_LOST_WHILE_PENDING);
+               }
+               /* Else: in case we raced with a connection handshake,
+                * let the handshake figure out if we maybe can RESEND,
+                * and do not resume/fail pending requests here.
+                * Worst case is we stay suspended for now, which may be
+                * resolved by either re-establishing the replication link, or
+                * the next link failure, or eventually the administrator.  */
+               spin_unlock_irq(&resource->req_lock);
+               return false;
+
        case FP_DONT_CARE:
                return true;
        default: ;
@@ -529,13 +547,11 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
        drbd_info(connection, "fence-peer helper returned %d (%s)\n",
                  (r>>8) & 0xff, ex_to_string);
 
- out:
-
        /* Not using
           conn_request_state(connection, mask, val, CS_VERBOSE);
           here, because we might were able to re-establish the connection in the
           meantime. */
-       spin_lock_irq(&connection->resource->req_lock);
+       spin_lock_irq(&resource->req_lock);
        if (connection->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &connection->flags)) {
                if (connection->connect_cnt != connect_cnt)
                        /* In case the connection was established and droped
@@ -544,7 +560,7 @@ bool conn_try_outdate_peer(struct drbd_connection *connection)
                else
                        _conn_request_state(connection, mask, val, CS_VERBOSE);
        }
-       spin_unlock_irq(&connection->resource->req_lock);
+       spin_unlock_irq(&resource->req_lock);
 
        return conn_highest_pdsk(connection) <= D_OUTDATED;
 }