Whamcloud - gitweb
Branch HEAD
authorliangzhen <liangzhen>
Mon, 5 May 2008 14:54:30 +0000 (14:54 +0000)
committerliangzhen <liangzhen>
Mon, 5 May 2008 14:54:30 +0000 (14:54 +0000)
Backout patch of b13983, checking and updating ksnp_proto and
adding conn to ksnp_conns should always happen atomically, keep
extra reference count of socket during establishing of connection.

b=15712
i=adilger
i=liangzhen

lnet/klnds/socklnd/socklnd.c
lnet/klnds/socklnd/socklnd_cb.c

index a04bb48..710ebd2 100644 (file)
@@ -1045,7 +1045,9 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
         conn->ksnc_peer = NULL;
         conn->ksnc_route = NULL;
         conn->ksnc_sock = sock;
-        atomic_set (&conn->ksnc_sock_refcount, 1); /* 1 ref for conn */
+        /* 2 ref, 1 for conn, another extra ref prevents socket
+         * being closed before establishment of connection */
+        atomic_set (&conn->ksnc_sock_refcount, 2);
         conn->ksnc_type = type;
         ksocknal_lib_save_callback(sock, conn);
         atomic_set (&conn->ksnc_conn_refcount, 1); /* 1 ref for me */
@@ -1154,6 +1156,14 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
                 }
         }
 
+        if (peer->ksnp_closing ||
+            (active && route->ksnr_deleted)) {
+                /* peer/route got closed under me */
+                rc = -ESTALE;
+                warn = "peer/route removed";
+                goto failed_2;
+        }
+
         if (peer->ksnp_proto == NULL) {
                 /* Never connected before.
                  * NB recv_hello may have returned EPROTO to signal my peer
@@ -1191,40 +1201,6 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
                 goto failed_2;
         }
 
-        write_unlock_bh(global_lock);
-
-        /* No more race (or won the race), has compatible version with peer */
-        if (active) {
-                /* additional routes after interface exchange? */
-                ksocknal_create_routes(peer, conn->ksnc_port,
-                                       hello->kshm_ips, hello->kshm_nips);
-        } else {
-                hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips,
-                                                       hello->kshm_nips);
-                rc = ksocknal_send_hello(ni, conn, peerid.nid, hello);
-        }
-        
-        /* setup the socket AFTER I've received hello (it disables
-         * SO_LINGER).  I might call back to the acceptor who may want
-         * to send a protocol version response and then close the
-         * socket; this ensures the socket only tears down after the
-         * response has been sent. */
-        if (rc == 0)
-                rc = ksocknal_lib_setup_sock(sock);
-
-        write_lock_bh(global_lock);
-
-        if (rc != 0) 
-                goto failed_2;
-
-        if (peer->ksnp_closing ||
-            (active && route->ksnr_deleted)) {
-                /* peer/route got closed under me */
-                rc = -ESTALE;
-                warn = "peer/route removed";
-                goto failed_2;
-        }
-
         /* Refuse to duplicate an existing connection, unless this is a
          * loopback connection */
         if (conn->ksnc_ipaddr != conn->ksnc_myipaddr) {
@@ -1301,21 +1277,17 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
                 ksocknal_queue_tx_locked (tx, conn);
         }
 
-        /* NB my callbacks block while I hold ksnd_global_lock */
-        ksocknal_lib_set_callback(sock, conn);
-
-        if (!active)
-                peer->ksnp_accepting--;
+        write_unlock_bh (global_lock);
 
-        write_unlock_bh(global_lock);
+        /* We've now got a new connection.  Any errors from here on are just
+         * like "normal" comms errors and we close the connection normally.
+         * NB (a) we still have to send the reply HELLO for passive
+         *        connections, 
+         *    (b) normal I/O on the conn is blocked until I setup and call the
+         *        socket callbacks.
+         */
 
-        if (ksocknal_connsock_addref(conn) == 0) {
-                ksocknal_lib_bind_irq (irq);
-                /* Allow I/O to proceed. */
-                ksocknal_read_callback(conn);
-                ksocknal_write_callback(conn);
-                ksocknal_connsock_decref(conn);
-        } 
+        ksocknal_lib_bind_irq (irq);
 
         CDEBUG(D_NET, "New conn %s p %d.x %u.%u.%u.%u -> %u.%u.%u.%u/%d"
                " incarnation:"LPD64" sched[%d]/%d\n",
@@ -1324,11 +1296,51 @@ ksocknal_create_conn (lnet_ni_t *ni, ksock_route_t *route,
                conn->ksnc_port, incarnation,
                (int)(conn->ksnc_scheduler - ksocknal_data.ksnd_schedulers), irq);
 
+        if (active) {
+                /* additional routes after interface exchange? */
+                ksocknal_create_routes(peer, conn->ksnc_port,
+                                       hello->kshm_ips, hello->kshm_nips);
+        } else {
+                hello->kshm_nips = ksocknal_select_ips(peer, hello->kshm_ips,
+                                                       hello->kshm_nips);
+                rc = ksocknal_send_hello(ni, conn, peerid.nid, hello);
+        }
+
         LIBCFS_FREE(hello, offsetof(ksock_hello_msg_t,
                                     kshm_ips[LNET_MAX_INTERFACES]));
 
+        /* setup the socket AFTER I've received hello (it disables
+         * SO_LINGER).  I might call back to the acceptor who may want
+         * to send a protocol version response and then close the
+         * socket; this ensures the socket only tears down after the
+         * response has been sent. */
+        if (rc == 0)
+                rc = ksocknal_lib_setup_sock(sock);
+
+        write_lock_bh(global_lock);
+
+        /* NB my callbacks block while I hold ksnd_global_lock */
+        ksocknal_lib_set_callback(sock, conn);
+
+        if (!active)
+                peer->ksnp_accepting--;
+
+        write_unlock_bh(global_lock);
+
+        if (rc != 0) {
+                write_lock_bh(global_lock);
+                ksocknal_close_conn_locked(conn, rc);
+                write_unlock_bh(global_lock);
+        } else if (ksocknal_connsock_addref(conn) == 0) {
+                /* Allow I/O to proceed. */
+                ksocknal_read_callback(conn);
+                ksocknal_write_callback(conn);
+                ksocknal_connsock_decref(conn);
+        }
+
+        ksocknal_connsock_decref(conn);
         ksocknal_conn_decref(conn);
-        return 0;
+        return rc;
 
  failed_2:
         if (!peer->ksnp_closing &&
index 8217d6b..bf4cd08 100644 (file)
@@ -2104,8 +2104,7 @@ ksocknal_send_hello (lnet_ni_t *ni, ksock_conn_t *conn,
 
         LASSERT (0 <= hello->kshm_nips && hello->kshm_nips <= LNET_MAX_INTERFACES);
 
-        /* No need for getconnsock/putconnsock */
-        LASSERT (!conn->ksnc_closing);
+        /* rely on caller to hold a ref on socket so it wouldn't disappear */
         LASSERT (conn->ksnc_proto != NULL);
 
         srcnid = lnet_ptlcompat_srcnid(ni->ni_nid, peer_nid);