From 38de8da72c4c227cca809c7712165016c4fa8269 Mon Sep 17 00:00:00 2001 From: eeb Date: Fri, 11 Mar 2005 11:10:46 +0000 Subject: [PATCH] * fixed 5000: socknal autoconnect race (outgoing v. incoming) * minor socknal cleanups --- lnet/klnds/socklnd/socklnd.c | 6 +- lnet/klnds/socklnd/socklnd.h | 4 +- lnet/klnds/socklnd/socklnd_cb.c | 121 +++++++++++++++++++--------------------- 3 files changed, 61 insertions(+), 70 deletions(-) diff --git a/lnet/klnds/socklnd/socklnd.c b/lnet/klnds/socklnd/socklnd.c index 6a71d07..448871e 100644 --- a/lnet/klnds/socklnd/socklnd.c +++ b/lnet/klnds/socklnd/socklnd.c @@ -342,7 +342,6 @@ ksocknal_associate_route_conn_locked(ksock_route_t *route, ksock_conn_t *conn) } route->ksnr_connected |= (1<ksnr_connecting &= ~(1<ksnr_conn_count++; /* Successful connection => further attempts can @@ -360,7 +359,7 @@ ksocknal_add_route_locked (ksock_peer_t *peer, ksock_route_t *route) ksock_route_t *route2; LASSERT (route->ksnr_peer == NULL); - LASSERT (route->ksnr_connecting == 0); + LASSERT (!route->ksnr_connecting); LASSERT (route->ksnr_connected == 0); /* LASSERT(unique) */ @@ -1196,7 +1195,6 @@ ksocknal_close_conn_locked (ksock_conn_t *conn, int error) if (route != NULL) { /* dissociate conn from route... */ LASSERT (!route->ksnr_deleted); - LASSERT ((route->ksnr_connecting & (1 << conn->ksnc_type)) == 0); LASSERT ((route->ksnr_connected & (1 << conn->ksnc_type)) != 0); conn2 = NULL; @@ -1795,7 +1793,7 @@ ksocknal_cmd(struct portals_cfg *pcfg, void * private) int rxmem; int nagle; - ksocknal_get_conn_tunables(conn, &txmem, &rxmem, &nagle); + ksocknal_lib_get_conn_tunables(conn, &txmem, &rxmem, &nagle); rc = 0; pcfg->pcfg_nid = conn->ksnc_peer->ksnp_nid; diff --git a/lnet/klnds/socklnd/socklnd.h b/lnet/klnds/socklnd/socklnd.h index 517f553..da7014e 100644 --- a/lnet/klnds/socklnd/socklnd.h +++ b/lnet/klnds/socklnd/socklnd.h @@ -345,7 +345,7 @@ typedef struct ksock_route __u32 ksnr_myipaddr; /* my IP */ __u32 ksnr_ipaddr; /* IP address to connect to */ int ksnr_port; /* port to connect to */ - unsigned int ksnr_connecting:4; /* autoconnects in progress by type */ + unsigned int ksnr_connecting:1; /* autoconnect in progress */ unsigned int ksnr_connected:4; /* connections established by type */ unsigned int ksnr_deleted:1; /* been removed from peer? */ unsigned int ksnr_share_count; /* created explicitly? */ @@ -428,8 +428,6 @@ extern int ksocknal_new_packet (ksock_conn_t *conn, int skip); extern int ksocknal_scheduler (void *arg); extern int ksocknal_autoconnectd (void *arg); extern int ksocknal_reaper (void *arg); -extern int ksocknal_get_conn_tunables (ksock_conn_t *conn, int *txmem, - int *rxmem, int *nagle); extern int ksocknal_setup_sock (struct socket *sock); extern int ksocknal_send_hello (ksock_conn_t *conn, __u32 *ipaddrs, int nipaddrs); extern int ksocknal_recv_hello (ksock_conn_t *conn, diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index f40ee0f..8c61b16 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -513,19 +513,10 @@ ksocknal_launch_autoconnect_locked (ksock_route_t *route) unsigned long flags; /* called holding write lock on ksnd_global_lock */ - - LASSERT (!route->ksnr_deleted); - LASSERT ((route->ksnr_connected & (1 << SOCKNAL_CONN_ANY)) == 0); - LASSERT ((route->ksnr_connected & KSNR_TYPED_ROUTES) != KSNR_TYPED_ROUTES); - LASSERT (route->ksnr_connecting == 0); + LASSERT (!route->ksnr_connecting); - if (ksocknal_tunables.ksnd_typed_conns) - route->ksnr_connecting = - KSNR_TYPED_ROUTES & ~route->ksnr_connected; - else - route->ksnr_connecting = (1 << SOCKNAL_CONN_ANY); - - atomic_inc (&route->ksnr_refcount); /* extra ref for asynchd */ + route->ksnr_connecting = 1; /* scheduling conn for autoconnectd */ + atomic_inc (&route->ksnr_refcount); /* extra ref for autoconnectd */ spin_lock_irqsave (&ksocknal_data.ksnd_autoconnectd_lock, flags); @@ -698,16 +689,18 @@ ksocknal_find_connectable_route_locked (ksock_peer_t *peer) route = list_entry (tmp, ksock_route_t, ksnr_list); bits = route->ksnr_connected; - /* All typed connections established? */ - if ((bits & KSNR_TYPED_ROUTES) == KSNR_TYPED_ROUTES) - continue; - - /* Untyped connection established? */ - if ((bits & (1 << SOCKNAL_CONN_ANY)) != 0) - continue; - + if (ksocknal_tunables.ksnd_typed_conns) { + /* All typed connections established? */ + if ((bits & KSNR_TYPED_ROUTES) == KSNR_TYPED_ROUTES) + continue; + } else { + /* Untyped connection established? */ + if ((bits & (1 << SOCKNAL_CONN_ANY)) != 0) + continue; + } + /* connection being established? */ - if (route->ksnr_connecting != 0) + if (route->ksnr_connecting) continue; /* too soon to retry this guy? */ @@ -729,7 +722,7 @@ ksocknal_find_connecting_route_locked (ksock_peer_t *peer) list_for_each (tmp, &peer->ksnp_routes) { route = list_entry (tmp, ksock_route_t, ksnr_list); - if (route->ksnr_connecting != 0) + if (route->ksnr_connecting) return (route); } @@ -1946,12 +1939,6 @@ ksocknal_recv_hello (ksock_conn_t *conn, ptl_nid_t *nid, } int -ksocknal_get_conn_tunables (ksock_conn_t *conn, int *txmem, int *rxmem, int *nagle) -{ - return ksocknal_lib_get_conn_tunables(conn, txmem, rxmem, nagle); -} - -int ksocknal_connect_peer (ksock_route_t *route, int type) { struct socket *sock; @@ -1989,37 +1976,51 @@ ksocknal_autoconnect (ksock_route_t *route) ksock_tx_t *tx; ksock_peer_t *peer; unsigned long flags; - int rc; int type; - char *err_msg = NULL; - + int mask; + int rc = 0; + + write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); + for (;;) { - for (type = 0; type < SOCKNAL_CONN_NTYPES; type++) - if ((route->ksnr_connecting & (1 << type)) != 0) - break; - LASSERT (type < SOCKNAL_CONN_NTYPES); + if (!ksocknal_tunables.ksnd_typed_conns) { + if ((route->ksnr_connected & (1<ksnr_connected & (1<ksnr_connected & (1<ksnr_connected & (1<ksnr_connecting == 0) { - /* No more connections required */ - return; - } + write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); } + LASSERT (route->ksnr_connecting); + route->ksnr_connecting = 0; + write_unlock_irqrestore (&ksocknal_data.ksnd_global_lock, flags); + return; + + failed: switch (rc) { /* "normal" errors */ case -ECONNREFUSED: LCONSOLE_ERROR("Connection was refused by host %u.%u.%u.%u on " "port %d; check that Lustre is running on that " "node.\n", - HIPQUAD(route->ksnr_ipaddr), - route->ksnr_port); + HIPQUAD(route->ksnr_ipaddr), route->ksnr_port); break; case -EHOSTUNREACH: case -ENETUNREACH: @@ -2032,37 +2033,31 @@ ksocknal_autoconnect (ksock_route_t *route) LCONSOLE_ERROR("Connecting to host %u.%u.%u.%u on port %d took " "too long; that node may be hung or " "experiencing high load.\n", - HIPQUAD(route->ksnr_ipaddr), - route->ksnr_port); + HIPQUAD(route->ksnr_ipaddr), route->ksnr_port); break; /* errors that should be rare */ case -EPROTO: - err_msg = "Portals could not negotiate a connection"; + LCONSOLE_ERROR("Protocol error connecting to host %u.%u.%u.%u " + "on port %d: Is it running a compatible version" + " of Lustre?\n", + HIPQUAD(route->ksnr_ipaddr), route->ksnr_port); break; - case -EAGAIN: case -EADDRINUSE: - /* -EAGAIN is out of ports, but we specify the ports - * manually. we really should never get this */ - err_msg = "no privileged ports were available"; + LCONSOLE_ERROR("No privileged ports available to connect to " + "host %u.%u.%u.%u on port %d\n", + HIPQUAD(route->ksnr_ipaddr), route->ksnr_port); break; default: - err_msg = "unknown error"; + LCONSOLE_ERROR("Unexpected error %d connecting to " + "host %u.%u.%u.%u on port %d\n", rc, + HIPQUAD(route->ksnr_ipaddr), route->ksnr_port); break; } - if (err_msg) { - LCONSOLE_ERROR("There was an unexpected error connecting to host " - "%u.%u.%u.%u on port %d: %s (error code %d).\n", - HIPQUAD(route->ksnr_ipaddr), - route->ksnr_port, - err_msg, -rc); - } - - /* Connection attempt failed */ - write_lock_irqsave (&ksocknal_data.ksnd_global_lock, flags); peer = route->ksnr_peer; + LASSERT (route->ksnr_connecting); route->ksnr_connecting = 0; /* This is a retry rather than a new connection */ -- 1.8.3.1