* @retry: whether to retry, if not enough pages are available right now
*
* Tries to allocate number pages, first from our own page pool, then from
- * the kernel, unless this allocation would exceed the max_buffers setting.
+ * the kernel.
* Possibly retry until DRBD frees sufficient pages somewhere else.
*
+ * If this allocation would exceed the max_buffers setting, we throttle
+ * allocation (schedule_timeout) to give the system some room to breathe.
+ *
+ * We do not use max-buffers as hard limit, because it could lead to
+ * congestion and further to a distributed deadlock during online-verify or
+ * (checksum based) resync, if the max-buffers, socket buffer sizes and
+ * resync-rate settings are mis-configured.
+ *
* Returns a page chain linked via page->private.
*/
struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number,
struct page *page = NULL;
struct net_conf *nc;
DEFINE_WAIT(wait);
- int mxb;
+ unsigned int mxb;
- /* Yes, we may run up to @number over max_buffers. If we
- * follow it strictly, the admin will get it wrong anyways. */
rcu_read_lock();
nc = rcu_dereference(peer_device->connection->net_conf);
mxb = nc ? nc->max_buffers : 1000000;
break;
}
- schedule();
+ if (schedule_timeout(HZ/10) == 0)
+ mxb = UINT_MAX;
}
finish_wait(&drbd_pp_wait, &wait);
return fb;
}
-static int drbd_rs_controller(struct drbd_device *device)
+static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
{
struct disk_conf *dc;
- unsigned int sect_in; /* Number of sectors that came in since the last turn */
unsigned int want; /* The number of sectors we want in the proxy */
int req_sect; /* Number of sectors to request in this turn */
int correction; /* Number of sectors more we need in the proxy*/
int max_sect;
struct fifo_buffer *plan;
- sect_in = atomic_xchg(&device->rs_sect_in, 0); /* Number of sectors that came in */
- device->rs_in_flight -= sect_in;
-
dc = rcu_dereference(device->ldev->disk_conf);
plan = rcu_dereference(device->rs_plan_s);
static int drbd_rs_number_requests(struct drbd_device *device)
{
- int number;
+ unsigned int sect_in; /* Number of sectors that came in since the last turn */
+ int number, mxb;
+
+ sect_in = atomic_xchg(&device->rs_sect_in, 0);
+ device->rs_in_flight -= sect_in;
rcu_read_lock();
+ mxb = drbd_get_max_buffers(device) / 2;
if (rcu_dereference(device->rs_plan_s)->size) {
- number = drbd_rs_controller(device) >> (BM_BLOCK_SHIFT - 9);
+ number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9);
device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
} else {
device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate;
}
rcu_read_unlock();
- /* ignore the amount of pending requests, the resync controller should
- * throttle down to incoming reply rate soon enough anyways. */
+ /* Don't have more than "max-buffers"/2 in-flight.
+ * Otherwise we may cause the remote site to stall on drbd_alloc_pages(),
+ * potentially causing a distributed deadlock on congestion during
+ * online-verify or (checksum-based) resync, if max-buffers,
+ * socket buffer sizes and resync rate settings are mis-configured. */
+ if (mxb - device->rs_in_flight < number)
+ number = mxb - device->rs_in_flight;
+
return number;
}
max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
number = drbd_rs_number_requests(device);
- if (number == 0)
+ if (number <= 0)
goto requeue;
for (i = 0; i < number; i++) {