If peer is down don't retry indefinitely. Use the retry_count
parameter to restrict the number of retries. After which the
connection fails and error is propagated up.
This prevents long timeouts when mounting a file system with
nodes which might have their NIDs configured in the FS, but the
nodes have been taken offline.
Test-Parameters: trivial
Signed-off-by: Amir Shehata <ashehata@whamcloud.com>
Change-Id: I04faf690ed13357e3ed50c2adaadee265db269c7
Reviewed-on: https://review.whamcloud.com/39981
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
unsigned char ibp_races;
/* # consecutive reconnection attempts to this peer */
unsigned int ibp_reconnected;
unsigned char ibp_races;
/* # consecutive reconnection attempts to this peer */
unsigned int ibp_reconnected;
+ /* number of total active retries */
+ unsigned int ibp_retries;
/* errno on closing this peer_ni */
int ibp_error;
/* max map_on_demand */
/* errno on closing this peer_ni */
int ibp_error;
/* max map_on_demand */
(conn->ibc_state == IBLND_CONN_PASSIVE_WAIT &&
peer_ni->ibp_accepting > 0));
(conn->ibc_state == IBLND_CONN_PASSIVE_WAIT &&
peer_ni->ibp_accepting > 0));
- LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
- conn->ibc_connvars = NULL;
+ LIBCFS_FREE(conn->ibc_connvars, sizeof(*conn->ibc_connvars));
+ conn->ibc_connvars = NULL;
- if (status != 0) {
- /* failed to establish connection */
- kiblnd_peer_connect_failed(peer_ni, active, status);
- kiblnd_finalise_conn(conn);
- return;
- }
+ if (status != 0) {
+ /* failed to establish connection */
+ kiblnd_peer_connect_failed(peer_ni, active, status);
+ kiblnd_finalise_conn(conn);
+ return;
+ }
- /* connection established */
+ /* connection established */
write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
write_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
+ /* reset retry count */
+ peer_ni->ibp_retries = 0;
+
conn->ibc_last_send = ktime_get();
conn->ibc_last_send = ktime_get();
- kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED);
- kiblnd_peer_alive(peer_ni);
+ kiblnd_set_conn_state(conn, IBLND_CONN_ESTABLISHED);
+ kiblnd_peer_alive(peer_ni);
/* Add conn to peer_ni's list and nuke any dangling conns from a different
* peer_ni instance... */
/* Add conn to peer_ni's list and nuke any dangling conns from a different
* peer_ni instance... */
- switch (why) {
- default:
- reason = "Unknown";
- break;
+ if (peer_ni->ibp_retries > *kiblnd_tunables.kib_retry_count) {
+ reason = "retry count exceeded due to no listener";
+ goto out;
+ }
+
+ switch (why) {
+ default:
+ reason = "Unknown";
+ break;
case IBLND_REJECT_RDMA_FRAGS: {
struct lnet_ioctl_config_o2iblnd_tunables *tunables;
case IBLND_REJECT_RDMA_FRAGS: {
struct lnet_ioctl_config_o2iblnd_tunables *tunables;
IBLND_REJECT_CONN_STALE, NULL);
break;
IBLND_REJECT_CONN_STALE, NULL);
break;
- case IB_CM_REJ_INVALID_SERVICE_ID:
+ case IB_CM_REJ_INVALID_SERVICE_ID:
+ peer_ni->ibp_retries++;
kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0,
IBLND_REJECT_INVALID_SRV_ID, NULL);
kiblnd_check_reconnect(conn, IBLND_MSG_VERSION, 0,
IBLND_REJECT_INVALID_SRV_ID, NULL);
- CNETERR("%s rejected: no listener at %d\n",
- libcfs_nid2str(peer_ni->ibp_nid),
- *kiblnd_tunables.kib_service);
- break;
+ CNETERR("%s rejected: no listener at %d\n",
+ libcfs_nid2str(peer_ni->ibp_nid),
+ *kiblnd_tunables.kib_service);
+ break;
case IB_CM_REJ_CONSUMER_DEFINED:
if (priv_nob >= offsetof(struct kib_rej, ibr_padding)) {
case IB_CM_REJ_CONSUMER_DEFINED:
if (priv_nob >= offsetof(struct kib_rej, ibr_padding)) {
static int retry_count = 5;
module_param(retry_count, int, 0644);
static int retry_count = 5;
module_param(retry_count, int, 0644);
-MODULE_PARM_DESC(retry_count, "Retransmissions when no ACK received");
+MODULE_PARM_DESC(retry_count, "Number of times to retry connection operations");
static int rnr_retry_count = 6;
module_param(rnr_retry_count, int, 0644);
static int rnr_retry_count = 6;
module_param(rnr_retry_count, int, 0644);