Whamcloud - gitweb
* Fixed 5858: vibnal arp retries
authoreeb <eeb>
Fri, 3 Jun 2005 09:36:56 +0000 (09:36 +0000)
committereeb <eeb>
Fri, 3 Jun 2005 09:36:56 +0000 (09:36 +0000)
lnet/klnds/viblnd/viblnd.h
lnet/klnds/viblnd/viblnd_cb.c

index 59fa4c3..e3720b6 100644 (file)
 #define IBNAL_MSG_QUEUE_SIZE      8             /* # messages/RDMAs in-flight */
 #define IBNAL_CREDIT_HIGHWATER    7             /* when to eagerly return credits */
 
+#define IBNAL_ARP_RETRIES         3             /* How many times to retry ARP */
+
 #define IBNAL_NTX                 64            /* # tx descs */
 #define IBNAL_NTX_NBLK            128           /* # reserved tx descs */
 /* reduced from 256 to ensure we register < 255 pages per region.  
@@ -401,6 +403,7 @@ typedef struct kib_peer
         struct list_head    ibp_conns;          /* all active connections */
         struct list_head    ibp_tx_queue;       /* msgs waiting for a conn */
         int                 ibp_connecting;     /* connecting+accepting */
+        int                 ibp_arp_count;      /* # arp attempts */
         unsigned long       ibp_reconnect_time; /* when reconnect may be attempted */
         unsigned long       ibp_reconnect_interval; /* exponential backoff */
 } kib_peer_t;
index 8a9c8a4..b3a8dd4 100644 (file)
@@ -1219,6 +1219,24 @@ kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
 }
 
 void
+kibnal_schedule_peer_arp (kib_peer_t *peer)
+{
+        unsigned long flags;
+
+        LASSERT (peer->ibp_connecting != 0);
+        LASSERT (peer->ibp_arp_count > 0);
+
+        kibnal_peer_addref(peer); /* extra ref for connd */
+
+        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
+
+        list_add_tail (&peer->ibp_connd_list, &kibnal_data.kib_connd_peers);
+        wake_up (&kibnal_data.kib_connd_waitq);
+
+        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
+}
+
+void
 kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
 {
         kib_peer_t      *peer;
@@ -1287,15 +1305,8 @@ kibnal_launch_tx (kib_tx_t *tx, ptl_nid_t nid)
                 }
         
                 peer->ibp_connecting = 1;
-                kibnal_peer_addref(peer); /* extra ref for connd */
-        
-                spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
-        
-                list_add_tail (&peer->ibp_connd_list,
-                               &kibnal_data.kib_connd_peers);
-                wake_up (&kibnal_data.kib_connd_waitq);
-        
-                spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
+                peer->ibp_arp_count = 1 + IBNAL_ARP_RETRIES;
+                kibnal_schedule_peer_arp(peer);
         }
         
         /* A connection is being established; queue the message... */
@@ -1709,6 +1720,21 @@ kibnal_thread_fini (void)
 }
 
 void
+kibnal_schedule_conn (kib_conn_t *conn)
+{
+        unsigned long flags;
+
+        kibnal_conn_addref(conn);               /* ++ref for connd */
+        
+        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
+
+        list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+        wake_up (&kibnal_data.kib_connd_waitq);
+                
+        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
+}
+
+void
 kibnal_close_conn_locked (kib_conn_t *conn, int error)
 {
         /* This just does the immmediate housekeeping.  'error' is zero for a
@@ -1727,9 +1753,10 @@ kibnal_close_conn_locked (kib_conn_t *conn, int error)
 
         if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
                 return; /* already being handled  */
-
-        spin_lock(&conn->ibc_lock);
         
+        /* NB Can't take ibc_lock here (could be in IRQ context), without
+         * risking deadlock, so access to ibc_{tx_queue,active_txs} is racey */
+
         if (error == 0 &&
             list_empty(&conn->ibc_tx_queue) &&
             list_empty(&conn->ibc_active_txs)) {
@@ -1744,6 +1771,8 @@ kibnal_close_conn_locked (kib_conn_t *conn, int error)
                        list_empty(&conn->ibc_active_txs) ? "" : "(waiting)",
                        conn->ibc_txseq, conn->ibc_rxseq);
 
+#if 0
+                /* can't skip down the queue without holding ibc_lock (see above) */
                 list_for_each(tmp, &conn->ibc_tx_queue) {
                         kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
                         
@@ -1763,11 +1792,9 @@ kibnal_close_conn_locked (kib_conn_t *conn, int error)
                                tx->tx_sending, tx->tx_waiting,
                                (long)(tx->tx_deadline - jiffies), HZ);
                 }
+#endif
         }
 
-        spin_unlock(&conn->ibc_lock);
-
-        /* connd takes ibc_list's ref */
         list_del (&conn->ibc_list);
         
         if (list_empty (&peer->ibp_conns) &&    /* no more conns */
@@ -1778,12 +1805,8 @@ kibnal_close_conn_locked (kib_conn_t *conn, int error)
 
         kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECT1);
 
-        spin_lock(&kibnal_data.kib_connd_lock);
-
-        list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
-        wake_up (&kibnal_data.kib_connd_waitq);
-                
-        spin_unlock(&kibnal_data.kib_connd_lock);
+        kibnal_schedule_conn(conn);
+        kibnal_conn_decref(conn);               /* lose ibc_list's ref */
 }
 
 void
@@ -1983,6 +2006,7 @@ kibnal_connreq_done(kib_conn_t *conn, int active, int status)
                 switch (conn->ibc_state) {
                 default:
                         LBUG();
+
                 case IBNAL_CONN_ACTIVE_CHECK_REPLY:
                         /* got a connection reply but failed checks */
                         LASSERT (active);
@@ -2123,33 +2147,27 @@ kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg)
                 case IBNAL_CONN_ACTIVE_RTU:
                         /* kibnal_connreq_done is getting there; It'll see
                          * ibc_disconnect set... */
-                        kibnal_conn_decref(conn); /* lose my ref */
                         break;
 
                 case IBNAL_CONN_ESTABLISHED:
                         /* kibnal_connreq_done got there already; get
                          * disconnect going... */
                         kibnal_close_conn_locked(conn, 0);
-                        kibnal_conn_decref(conn); /* lose my ref */
                         break;
 
                 case IBNAL_CONN_DISCONNECT1:
                         /* kibnal_terminate_conn is getting there; It'll see
                          * ibc_disconnect set... */
-                        kibnal_conn_decref(conn); /* lose my ref */
                         break;
 
                 case IBNAL_CONN_DISCONNECT2:
                         /* kibnal_terminate_conn got there already; complete
-                         * the disconnect.  NB kib_connd_conns takes my ref */
-                        spin_lock(&kibnal_data.kib_connd_lock);
-                        list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
-                        wake_up(&kibnal_data.kib_connd_waitq);
-                        spin_unlock(&kibnal_data.kib_connd_lock);
+                         * the disconnect. */
+                        kibnal_schedule_conn(conn);
                         break;
                 }
                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
-                return;
+                break;
                 
         case cm_event_disconn_timeout:
         case cm_event_disconn_reply:
@@ -2158,12 +2176,8 @@ kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg)
                 LASSERT (!conn->ibc_disconnect);
                 conn->ibc_disconnect = 1;
 
-                /* kibnal_terminate_conn sent the disconnect request.  
-                 * NB kib_connd_conns takes my ref */
-                spin_lock(&kibnal_data.kib_connd_lock);
-                list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
-                wake_up(&kibnal_data.kib_connd_waitq);
-                spin_unlock(&kibnal_data.kib_connd_lock);
+                /* kibnal_terminate_conn sent the disconnect request. */
+                kibnal_schedule_conn(conn);
 
                 write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
                 break;
@@ -2173,13 +2187,12 @@ kibnal_cm_callback(cm_cep_handle_t cep, cm_conn_data_t *cmdata, void *arg)
         case cm_event_conn_reject:
                 LASSERT (conn->ibc_state == IBNAL_CONN_PASSIVE_WAIT);
                 conn->ibc_connvars->cv_conndata = *cmdata;
-                
-                spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
-                list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
-                wake_up(&kibnal_data.kib_connd_waitq);
-                spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
+
+                kibnal_schedule_conn(conn);
                 break;
         }
+
+        kibnal_conn_decref(conn); /* lose my ref */
 }
 
 void
@@ -2430,11 +2443,8 @@ kibnal_active_connect_callback (cm_cep_handle_t cep, cm_conn_data_t *cd,
         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_CONNECT);
         cv->cv_conndata = *cd;
 
-        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
-        /* connd takes my ref */
-        list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
-        wake_up(&kibnal_data.kib_connd_waitq);
-        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
+        kibnal_schedule_conn(conn);
+        kibnal_conn_decref(conn);
 }
 
 void
@@ -2663,7 +2673,7 @@ kibnal_check_connreply (kib_conn_t *conn)
 }
 
 void
-kibnal_send_connreq (kib_conn_t *conn)
+kibnal_arp_done (kib_conn_t *conn)
 {
         kib_peer_t           *peer = conn->ibc_peer;
         kib_connvars_t       *cv = conn->ibc_connvars;
@@ -2671,15 +2681,39 @@ kibnal_send_connreq (kib_conn_t *conn)
         ib_path_record_v2_t  *path = &cv->cv_path;
         vv_return_t           vvrc;
         int                   rc;
+        unsigned long         flags;
 
-        /* Only called by connd => statics OK */
         LASSERT (!in_interrupt());
         LASSERT (current == kibnal_data.kib_connd);
         LASSERT (conn->ibc_state == IBNAL_CONN_ACTIVE_ARP);
+        LASSERT (peer->ibp_arp_count > 0);
         
         if (cv->cv_arprc != ibat_stat_ok) {
                 CERROR("Can't Arp "LPX64"@%u.%u.%u.%u: %d\n", peer->ibp_nid,
                        HIPQUAD(peer->ibp_ip), cv->cv_arprc);
+
+                write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+                peer->ibp_arp_count--;
+                if (peer->ibp_arp_count == 0) {
+                        /* final ARP attempt failed */
+                        write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
+                                                flags);
+                        CERROR("Arp "LPX64"@%u.%u.%u.%u failed: %d\n", 
+                               peer->ibp_nid, HIPQUAD(peer->ibp_ip), 
+                               cv->cv_arprc);
+                } else {
+                        /* Retry ARP: ibp_connecting++ so terminating conn
+                         * doesn't end peer's connection attempt */
+                        peer->ibp_connecting++;
+                        write_unlock_irqrestore(&kibnal_data.kib_global_lock, 
+                                                flags);
+                        CWARN("Arp "LPX64"@%u.%u.%u.%u failed: %d "
+                              "(%d attempts left)\n", 
+                              peer->ibp_nid, HIPQUAD(peer->ibp_ip), 
+                              cv->cv_arprc, peer->ibp_arp_count);
+
+                        kibnal_schedule_peer_arp(peer);
+                }
                 kibnal_connreq_done(conn, 1, -ENETUNREACH);
                 return;
         }
@@ -2769,13 +2803,8 @@ kibnal_arp_callback (ibat_stat_t arprc, ibat_arp_data_t *arp_data, void *arg)
         if (arprc == ibat_stat_ok)
                 conn->ibc_connvars->cv_arp = *arp_data;
         
-        /* connd takes over my ref on conn */
-        spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
-        
-        list_add_tail(&conn->ibc_list, &kibnal_data.kib_connd_conns);
-        wake_up(&kibnal_data.kib_connd_waitq);
-        
-        spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
+        kibnal_schedule_conn(conn);
+        kibnal_conn_decref(conn);
 }
 
 void
@@ -2788,6 +2817,7 @@ kibnal_arp_peer (kib_peer_t *peer)
         /* Only the connd does this (i.e. single threaded) */
         LASSERT (current == kibnal_data.kib_connd);
         LASSERT (peer->ibp_connecting != 0);
+        LASSERT (peer->ibp_arp_count > 0);
 
         cep = cm_create_cep(cm_cep_transp_rc);
         if (cep == NULL) {
@@ -2825,18 +2855,13 @@ kibnal_arp_peer (kib_peer_t *peer)
                 break;
                 
         case ibat_stat_ok:
-                /* Immediate return (ARP cache hit) == no callback. */
-                conn->ibc_connvars->cv_arprc = ibat_stat_ok;
-                kibnal_send_connreq(conn);
-                kibnal_conn_decref(conn);
-                break;
-
         case ibat_stat_error:
         case ibat_stat_timeout:
         case ibat_stat_not_found:
-                CERROR("Arp "LPX64"@%u.%u.%u.%u failed: %d\n", peer->ibp_nid,
-                       HIPQUAD(peer->ibp_ip), ibatrc);
-                kibnal_connreq_done(conn, 1, -ENETUNREACH);
+                /* Immediate return (ARP cache hit or failure) == no callback. 
+                 * Do the next stage directly... */
+                conn->ibc_connvars->cv_arprc = ibatrc;
+                kibnal_arp_done(conn);
                 kibnal_conn_decref(conn);
                 break;
         }
@@ -3059,7 +3084,7 @@ kibnal_connd (void *arg)
                                 LBUG();
                                 
                         case IBNAL_CONN_ACTIVE_ARP:
-                                kibnal_send_connreq(conn);
+                                kibnal_arp_done(conn);
                                 break;
 
                         case IBNAL_CONN_ACTIVE_CONNECT: