From: eeb Date: Fri, 3 Jun 2005 09:03:44 +0000 (+0000) Subject: * merged in 5858 fix (vibnal arp retries) X-Git-Tag: v1_7_100~1^25~6^2~249 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=c91c70b391f911485e6a4657b0350dd142351dcb;p=fs%2Flustre-release.git * merged in 5858 fix (vibnal arp retries) --- diff --git a/lnet/klnds/viblnd/viblnd.h b/lnet/klnds/viblnd/viblnd.h index bc1790c..57ebdb8 100644 --- a/lnet/klnds/viblnd/viblnd.h +++ b/lnet/klnds/viblnd/viblnd.h @@ -101,6 +101,7 @@ #define IBNAL_TIMEOUT 50 /* default comms timeout (seconds) */ #define IBNAL_NTX 64 /* # tx descs */ #define IBNAL_NTX_NBLK 128 /* # reserved tx descs */ +#define IBNAL_ARP_RETRIES 3 /* # times to retry ARP */ /* tunables fixed at compile time */ #define IBNAL_PEER_HASH_SIZE 101 /* # peer lists */ @@ -172,6 +173,7 @@ typedef struct int *kib_timeout; /* comms timeout (seconds) */ int *kib_ntx; /* # tx descs */ int *kib_ntx_nblk; /* # reserved tx descs */ + int *kib_arp_retries; /* # times to retry ARP */ struct ctl_table_header *kib_sysctl; /* sysctl interface */ } kib_tunables_t; @@ -401,6 +403,7 @@ typedef struct kib_peer struct list_head ibp_conns; /* all active connections */ struct list_head ibp_tx_queue; /* msgs waiting for a conn */ int ibp_connecting; /* connecting+accepting */ + int ibp_arp_count; /* # arp attempts */ unsigned long ibp_reconnect_time; /* when reconnect may be attempted */ unsigned long ibp_reconnect_interval; /* exponential backoff */ } kib_peer_t; diff --git a/lnet/klnds/viblnd/viblnd_cb.c b/lnet/klnds/viblnd/viblnd_cb.c index c55671fa..6de75ad 100644 --- a/lnet/klnds/viblnd/viblnd_cb.c +++ b/lnet/klnds/viblnd/viblnd_cb.c @@ -1223,6 +1223,24 @@ kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn) } void +kibnal_schedule_peer_arp (kib_peer_t *peer) +{ + unsigned long flags; + + LASSERT (peer->ibp_connecting != 0); + LASSERT (peer->ibp_arp_count > 0); + + kibnal_peer_addref(peer); /* extra ref for connd */ + + spin_lock_irqsave(&kibnal_data.kib_connd_lock, 
flags); + + list_add_tail (&peer->ibp_connd_list, &kibnal_data.kib_connd_peers); + wake_up (&kibnal_data.kib_connd_waitq); + + spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); +} + +void kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) { kib_peer_t *peer; @@ -1306,15 +1324,8 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid) } peer->ibp_connecting = 1; - kibnal_peer_addref(peer); /* extra ref for connd */ - - spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); - - list_add_tail (&peer->ibp_connd_list, - &kibnal_data.kib_connd_peers); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); + peer->ibp_arp_count = 1 + *kibnal_tunables.kib_arp_retries; + kibnal_schedule_peer_arp(peer); } /* A connection is being established; queue the message... */ @@ -1717,6 +1728,21 @@ kibnal_thread_fini (void) } void +kibnal_schedule_conn (kib_conn_t *conn) +{ + unsigned long flags; + + kibnal_conn_addref(conn); /* ++ref for connd */ + + spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); + + list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns); + wake_up (&kibnal_data.kib_connd_waitq); + + spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); +} + +void kibnal_close_conn_locked (kib_conn_t *conn, int error) { /* This just does the immmediate housekeeping. 'error' is zero for a @@ -1735,9 +1761,10 @@ kibnal_close_conn_locked (kib_conn_t *conn, int error) if (conn->ibc_state != IBNAL_CONN_ESTABLISHED) return; /* already being handled */ - - spin_lock(&conn->ibc_lock); + /* NB Can't take ibc_lock here (could be in IRQ context), without + * risking deadlock, so access to ibc_{tx_queue,active_txs} is racy */ + if (error == 0 && list_empty(&conn->ibc_tx_queue) && list_empty(&conn->ibc_active_txs)) { @@ -1752,6 +1779,8 @@ kibnal_close_conn_locked (kib_conn_t *conn, int error) list_empty(&conn->ibc_active_txs) ? 
"" : "(waiting)", conn->ibc_txseq, conn->ibc_rxseq); +#if 0 + /* can't skip down the queue without holding ibc_lock (see above) */ list_for_each(tmp, &conn->ibc_tx_queue) { kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list); @@ -1771,11 +1800,9 @@ kibnal_close_conn_locked (kib_conn_t *conn, int error) tx->tx_sending, tx->tx_waiting, (long)(tx->tx_deadline - jiffies), HZ); } +#endif } - spin_unlock(&conn->ibc_lock); - - /* connd takes ibc_list's ref */ list_del (&conn->ibc_list); if (list_empty (&peer->ibp_conns) && /* no more conns */ @@ -1786,12 +1813,8 @@ kibnal_close_conn_locked (kib_conn_t *conn, int error) kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECT1); - spin_lock(&kibnal_data.kib_connd_lock); - - list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns); - wake_up (&kibnal_data.kib_connd_waitq); - - spin_unlock(&kibnal_data.kib_connd_lock); + kibnal_schedule_conn(conn); + kibnal_conn_decref(conn); /* lose ibc_list's ref */ } void @@ -1991,6 +2014,7 @@ kibnal_connreq_done(kib_conn_t *conn, int active, int status) switch (conn->ibc_state) { default: LBUG(); + case IBNAL_CONN_ACTIVE_CHECK_REPLY: /* got a connection reply but failed checks */ LASSERT (active); @@ -2132,33 +2156,27 @@ kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg) case IBNAL_CONN_ACTIVE_RTU: /* kibnal_connreq_done is getting there; It'll see * ibc_disconnect set... */ - kibnal_conn_decref(conn); /* lose my ref */ break; case IBNAL_CONN_ESTABLISHED: /* kibnal_connreq_done got there already; get * disconnect going... */ kibnal_close_conn_locked(conn, 0); - kibnal_conn_decref(conn); /* lose my ref */ break; case IBNAL_CONN_DISCONNECT1: /* kibnal_terminate_conn is getting there; It'll see * ibc_disconnect set... */ - kibnal_conn_decref(conn); /* lose my ref */ break; case IBNAL_CONN_DISCONNECT2: /* kibnal_terminate_conn got there already; complete - * the disconnect. 
NB kib_connd_conns takes my ref */ - spin_lock(&kibnal_data.kib_connd_lock); - list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns); - wake_up(&kibnal_data.kib_connd_waitq); - spin_unlock(&kibnal_data.kib_connd_lock); + * the disconnect. */ + kibnal_schedule_conn(conn); break; } write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); - return; + break; case cm_event_disconn_timeout: case cm_event_disconn_reply: @@ -2167,12 +2185,8 @@ kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg) LASSERT (!conn->ibc_disconnect); conn->ibc_disconnect = 1; - /* kibnal_terminate_conn sent the disconnect request. - * NB kib_connd_conns takes my ref */ - spin_lock(&kibnal_data.kib_connd_lock); - list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns); - wake_up(&kibnal_data.kib_connd_waitq); - spin_unlock(&kibnal_data.kib_connd_lock); + /* kibnal_terminate_conn sent the disconnect request. */ + kibnal_schedule_conn(conn); write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags); break; @@ -2182,13 +2196,12 @@ kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg) case cm_event_conn_reject: LASSERT (conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT); conn->ibc_connvars->cv_conndata = *cmdata; - - spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); - list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns); - wake_up(&kibnal_data.kib_connd_waitq); - spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); + + kibnal_schedule_conn(conn); break; } + + kibnal_conn_decref(conn); /* lose my ref */ } void @@ -2439,11 +2452,8 @@ kibnal_active_connect_callback (cm_cep_handle_t cep, cm_conn_data_t *cd, LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT); cv->cv_conndata = *cd; - spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); - /* connd takes my ref */ - list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns); - wake_up(&kibnal_data.kib_connd_waitq); - spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, 
flags); + kibnal_schedule_conn(conn); + kibnal_conn_decref(conn); } void @@ -2672,7 +2682,7 @@ kibnal_check_connreply (kib_conn_t *conn) } void -kibnal_send_connreq (kib_conn_t *conn) +kibnal_arp_done (kib_conn_t *conn) { kib_peer_t *peer = conn->ibc_peer; kib_connvars_t *cv = conn->ibc_connvars; @@ -2680,15 +2690,39 @@ kibnal_send_connreq (kib_conn_t *conn) ib_path_record_v2_t *path = &cv->cv_path; vv_return_t vvrc; int rc; + unsigned long flags; - /* Only called by connd => statics OK */ LASSERT (!in_interrupt()); LASSERT (current == kibnal_data.kib_connd); LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP); + LASSERT (peer->ibp_arp_count > 0); if (cv->cv_arprc != ibat_stat_ok) { CERROR("Can't Arp "LPX64"@%u.%u.%u.%u: %d\n", peer->ibp_nid, HIPQUAD(peer->ibp_ip), cv->cv_arprc); + + write_lock_irqsave(&kibnal_data.kib_global_lock, flags); + peer->ibp_arp_count--; + if (peer->ibp_arp_count == 0) { + /* final ARP attempt failed */ + write_unlock_irqrestore(&kibnal_data.kib_global_lock, + flags); + CERROR("Arp "LPX64"@%u.%u.%u.%u failed: %d\n", + peer->ibp_nid, HIPQUAD(peer->ibp_ip), + cv->cv_arprc); + } else { + /* Retry ARP: ibp_connecting++ so terminating conn + * doesn't end peer's connection attempt */ + peer->ibp_connecting++; + write_unlock_irqrestore(&kibnal_data.kib_global_lock, + flags); + CWARN("Arp "LPX64"@%u.%u.%u.%u failed: %d " + "(%d attempts left)\n", + peer->ibp_nid, HIPQUAD(peer->ibp_ip), + cv->cv_arprc, peer->ibp_arp_count); + + kibnal_schedule_peer_arp(peer); + } kibnal_connreq_done(conn, 1, -ENETUNREACH); return; } @@ -2778,13 +2812,8 @@ kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg) if (arprc == ibat_stat_ok) conn->ibc_connvars->cv_arp = *arp_data; - /* connd takes over my ref on conn */ - spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags); - - list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns); - wake_up(&kibnal_data.kib_connd_waitq); - - spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags); + 
kibnal_schedule_conn(conn); + kibnal_conn_decref(conn); } void @@ -2797,6 +2826,7 @@ kibnal_arp_peer (kib_peer_t *peer) /* Only the connd does this (i.e. single threaded) */ LASSERT (current == kibnal_data.kib_connd); LASSERT (peer->ibp_connecting != 0); + LASSERT (peer->ibp_arp_count > 0); cep = cm_create_cep(cm_cep_transp_rc); if (cep == NULL) { @@ -2834,18 +2864,13 @@ kibnal_arp_peer (kib_peer_t *peer) break; case ibat_stat_ok: - /* Immediate return (ARP cache hit) == no callback. */ - conn->ibc_connvars->cv_arprc = ibat_stat_ok; - kibnal_send_connreq(conn); - kibnal_conn_decref(conn); - break; - case ibat_stat_error: case ibat_stat_timeout: case ibat_stat_not_found: - CERROR("Arp "LPX64"@%u.%u.%u.%u failed: %d\n", peer->ibp_nid, - HIPQUAD(peer->ibp_ip), ibatrc); - kibnal_connreq_done(conn, 1, -ENETUNREACH); + /* Immediate return (ARP cache hit or failure) == no callback. + * Do the next stage directly... */ + conn->ibc_connvars->cv_arprc = ibatrc; + kibnal_arp_done(conn); kibnal_conn_decref(conn); break; } @@ -3068,7 +3093,7 @@ kibnal_connd (void *arg) LBUG(); case IBNAL_CONN_ACTIVE_ARP: - kibnal_send_connreq(conn); + kibnal_arp_done(conn); break; case IBNAL_CONN_ACTIVE_CONNECT: diff --git a/lnet/klnds/viblnd/viblnd_modparams.c b/lnet/klnds/viblnd/viblnd_modparams.c index b084d48..2da22d8 100644 --- a/lnet/klnds/viblnd/viblnd_modparams.c +++ b/lnet/klnds/viblnd/viblnd_modparams.c @@ -55,6 +55,10 @@ static int ntx_nblk = IBNAL_NTX_NBLK; CFS_MODULE_PARM(ntx_nblk, "i", int, 0444, "# of 'reserved' message descriptors"); +static int arp_retries = IBNAL_ARP_RETRIES; +CFS_MODULE_PARM(arp_retries, "i", int, 0644, + "# of times to retry ARP"); + kib_tunables_t kibnal_tunables = { .kib_service_number = &service_number, .kib_min_reconnect_interval = &min_reconnect_interval, @@ -64,6 +68,7 @@ kib_tunables_t kibnal_tunables = { .kib_timeout = &timeout, .kib_ntx = &ntx, .kib_ntx_nblk = &ntx_nblk, + .kib_arp_retries = &arp_retries, }; #if CONFIG_SYSCTL && 
!CFS_SYSFS_MODULE_PARM @@ -84,6 +89,8 @@ static ctl_table kibnal_ctl_table[] = { sizeof(int), 0444, NULL, &proc_dointvec}, {8, "ntx_nblk", &ntx_nblk, sizeof(int), 0444, NULL, &proc_dointvec}, + {9, "arp_retries", &arp_retries, + sizeof(int), 0644, NULL, &proc_dointvec}, {0} };