Whamcloud - gitweb
LU-12815 socklnd: add conns_per_peer parameter 56/41056/9
author Serguei Smirnov <ssmirnov@whamcloud.com>
Tue, 30 Mar 2021 16:58:57 +0000 (12:58 -0400)
committer Oleg Drokin <green@whamcloud.com>
Wed, 5 May 2021 02:49:50 +0000 (02:49 +0000)
Introduce conns_per_peer ksocklnd module parameter.
In typed mode, this parameter shall control
the number of BULK_IN and BULK_OUT tcp connections,
while the number of CONTROL connections shall stay
at 1. In untyped mode, this parameter shall control
the number of untyped connections.
The default conns_per_peer is 1. Max is 255 (limited by the 8-bit per-type connection counters).

Test-Parameters: trivial testlist=sanity-lnet
Signed-off-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Change-Id: I70bbaf7899ae1fbc41de34553c8c4ad1c7d55f7e
Reviewed-on: https://review.whamcloud.com/41056
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: James Simmons <jsimmons@infradead.org>
Reviewed-by: Chris Horn <chris.horn@hpe.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/klnds/socklnd/socklnd.c
lnet/klnds/socklnd/socklnd.h
lnet/klnds/socklnd/socklnd_cb.c
lnet/klnds/socklnd/socklnd_modparams.c

index a0aef2d..cdc15e9 100644 (file)
@@ -131,6 +131,9 @@ ksocknal_create_conn_cb(struct sockaddr *addr)
        conn_cb->ksnr_connected = 0;
        conn_cb->ksnr_deleted = 0;
        conn_cb->ksnr_conn_count = 0;
+       conn_cb->ksnr_ctrl_conn_count = 0;
+       conn_cb->ksnr_blki_conn_count = 0;
+       conn_cb->ksnr_blko_conn_count = 0;
 
        return conn_cb;
 }
@@ -363,6 +366,73 @@ out:
        return rc;
 }
 
+/**
+ * Return the number of connections of a given type recorded in \a conn_cb.
+ *
+ * \param conn_cb  connection control block whose counters are read
+ * \param type     SOCKLND_CONN_CONTROL, SOCKLND_CONN_BULK_IN,
+ *                 SOCKLND_CONN_BULK_OUT or SOCKLND_CONN_ANY
+ *
+ * \retval  the matching per-type counter for CONTROL/BULK_IN/BULK_OUT,
+ *          or the total ksnr_conn_count for SOCKLND_CONN_ANY.
+ *          Any other \a type value triggers LBUG().
+ */
+static unsigned int
+ksocknal_get_conn_count_by_type(struct ksock_conn_cb *conn_cb,
+                               int type)
+{
+       unsigned int count = 0;
+
+       switch (type) {
+       case SOCKLND_CONN_CONTROL:
+               count = conn_cb->ksnr_ctrl_conn_count;
+               break;
+       case SOCKLND_CONN_BULK_IN:
+               count = conn_cb->ksnr_blki_conn_count;
+               break;
+       case SOCKLND_CONN_BULK_OUT:
+               count = conn_cb->ksnr_blko_conn_count;
+               break;
+       case SOCKLND_CONN_ANY:
+               /* untyped mode: only the total count is tracked */
+               count = conn_cb->ksnr_conn_count;
+               break;
+       default:
+               LBUG();
+               break;
+       }
+
+       return count;
+}
+
+/**
+ * Account for a newly associated connection of \a type on \a conn_cb.
+ *
+ * Always bumps the total ksnr_conn_count, bumps the per-type counter for
+ * typed connections, and sets BIT(type) in ksnr_connected once the count
+ * for that type has reached the conns_per_peer tunable (a CONTROL
+ * connection sets its bit immediately).  The resulting state is logged
+ * with CDEBUG(D_NET, ...).
+ *
+ * \param conn_cb  connection control block to update
+ * \param type     SOCKLND_CONN_CONTROL, SOCKLND_CONN_BULK_IN,
+ *                 SOCKLND_CONN_BULK_OUT or SOCKLND_CONN_ANY; any other
+ *                 value triggers LBUG()
+ *
+ * NOTE(review): ksnr_ctrl_conn_count is declared as a 1-bit field in
+ * socklnd.h, so a second CONTROL increment would wrap it back to 0 -
+ * confirm callers never associate more than one control connection.
+ */
+static void
+ksocknal_incr_conn_count(struct ksock_conn_cb *conn_cb,
+                        int type)
+{
+       conn_cb->ksnr_conn_count++;
+
+       /* check if all connections of the given type got created */
+       switch (type) {
+       case SOCKLND_CONN_CONTROL:
+               conn_cb->ksnr_ctrl_conn_count++;
+               /* there's a single control connection per peer */
+               conn_cb->ksnr_connected |= BIT(type);
+               break;
+       case SOCKLND_CONN_BULK_IN:
+               conn_cb->ksnr_blki_conn_count++;
+               if (conn_cb->ksnr_blki_conn_count >=
+                   *ksocknal_tunables.ksnd_conns_per_peer)
+                       conn_cb->ksnr_connected |= BIT(type);
+               break;
+       case SOCKLND_CONN_BULK_OUT:
+               conn_cb->ksnr_blko_conn_count++;
+               if (conn_cb->ksnr_blko_conn_count >=
+                   *ksocknal_tunables.ksnd_conns_per_peer)
+                       conn_cb->ksnr_connected |= BIT(type);
+               break;
+       case SOCKLND_CONN_ANY:
+               /* untyped mode: compare the total count (already
+                * incremented above) against the tunable
+                */
+               if (conn_cb->ksnr_conn_count >=
+                   *ksocknal_tunables.ksnd_conns_per_peer)
+                       conn_cb->ksnr_connected |= BIT(type);
+               break;
+       default:
+               LBUG();
+               break;
+
+       }
+
+       CDEBUG(D_NET, "Add conn type %d, ksnr_connected %x conns_per_peer %d\n",
+              type, conn_cb->ksnr_connected, *ksocknal_tunables.ksnd_conns_per_peer);
+}
+
 static void
 ksocknal_associate_cb_conn_locked(struct ksock_conn_cb *conn_cb,
                                  struct ksock_conn *conn)
@@ -404,8 +474,7 @@ ksocknal_associate_cb_conn_locked(struct ksock_conn_cb *conn_cb,
                        iface->ksni_nroutes++;
        }
 
-       conn_cb->ksnr_connected |= (1<<type);
-       conn_cb->ksnr_conn_count++;
+       ksocknal_incr_conn_count(conn_cb, type);
 
        /* Successful connection => further attempts can
         * proceed immediately
@@ -727,6 +796,7 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_conn_cb *conn_cb,
        int rc;
        int rc2;
        int active;
+       int num_dup = 0;
        char *warn = NULL;
 
        active = (conn_cb != NULL);
@@ -845,9 +915,9 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_conn_cb *conn_cb,
                        peer_ni = peer2;
                }
 
-                /* +1 ref for me */
-                ksocknal_peer_addref(peer_ni);
-                peer_ni->ksnp_accepting++;
+               /* +1 ref for me */
+               ksocknal_peer_addref(peer_ni);
+               peer_ni->ksnp_accepting++;
 
                /* Am I already connecting to this guy?  Resolve in
                 * favour of higher NID...
@@ -859,14 +929,14 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_conn_cb *conn_cb,
                        warn = "connection race resolution";
                        goto failed_2;
                }
-        }
+       }
 
-        if (peer_ni->ksnp_closing ||
+       if (peer_ni->ksnp_closing ||
            (active && conn_cb->ksnr_deleted)) {
                /* peer_ni/conn_cb got closed under me */
-                rc = -ESTALE;
+               rc = -ESTALE;
                warn = "peer_ni/conn_cb removed";
-                goto failed_2;
+               goto failed_2;
         }
 
        if (peer_ni->ksnp_proto == NULL) {
@@ -893,18 +963,18 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_conn_cb *conn_cb,
                goto failed_2;
        }
 
-        switch (rc) {
-        default:
-                LBUG();
-        case 0:
-                break;
-        case EALREADY:
-                warn = "lost conn race";
-                goto failed_2;
-        case EPROTO:
-                warn = "retry with different protocol version";
-                goto failed_2;
-        }
+       switch (rc) {
+       default:
+               LBUG();
+       case 0:
+               break;
+       case EALREADY:
+               warn = "lost conn race";
+               goto failed_2;
+       case EPROTO:
+               warn = "retry with different protocol version";
+               goto failed_2;
+       }
 
        /* Refuse to duplicate an existing connection, unless this is a
         * loopback connection */
@@ -922,29 +992,33 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_conn_cb *conn_cb,
                            conn2->ksnc_type != conn->ksnc_type)
                                continue;
 
-                        /* Reply on a passive connection attempt so the peer_ni
-                         * realises we're connected. */
-                        LASSERT (rc == 0);
-                        if (!active)
-                                rc = EALREADY;
+                       num_dup++;
+                       if (num_dup < *ksocknal_tunables.ksnd_conns_per_peer)
+                               continue;
 
-                        warn = "duplicate";
-                        goto failed_2;
-                }
-        }
+                       /* Reply on a passive connection attempt so the peer_ni
+                        * realises we're connected.
+                        */
+                       LASSERT(rc == 0);
+                       if (!active)
+                               rc = EALREADY;
 
-        /* If the connection created by this route didn't bind to the IP
-         * address the route connected to, the connection/route matching
+                       warn = "duplicate";
+                       goto failed_2;
+               }
+       }
+       /* If the connection created by this route didn't bind to the IP
+        * address the route connected to, the connection/route matching
         * code below probably isn't going to work.
         */
-        if (active &&
+       if (active &&
            !rpc_cmp_addr((struct sockaddr *)&conn_cb->ksnr_addr,
                          (struct sockaddr *)&conn->ksnc_peeraddr)) {
                CERROR("Route %s %pIS connected to %pIS\n",
                       libcfs_id2str(peer_ni->ksnp_id),
                       &conn_cb->ksnr_addr,
                       &conn->ksnc_peeraddr);
-        }
+       }
 
        /* Search for a conn_cb corresponding to the new connection and
         * create an association.  This allows incoming connections created
@@ -1017,8 +1091,8 @@ ksocknal_create_conn(struct lnet_ni *ni, struct ksock_conn_cb *conn_cb,
 
        if (!active) {
                hello->kshm_nips = 0;
-                rc = ksocknal_send_hello(ni, conn, peerid.nid, hello);
-        }
+               rc = ksocknal_send_hello(ni, conn, peerid.nid, hello);
+       }
 
        LIBCFS_FREE(hello, offsetof(struct ksock_hello_msg,
                                    kshm_ips[LNET_INTERFACES_NUM]));
@@ -1139,7 +1213,14 @@ ksocknal_close_conn_locked(struct ksock_conn *conn, int error)
        if (conn_cb != NULL) {
                /* dissociate conn from cb... */
                LASSERT(!conn_cb->ksnr_deleted);
-               LASSERT((conn_cb->ksnr_connected & BIT(conn->ksnc_type)) != 0);
+
+               /* connected bit is set only if all connections
+                * of the given type got created
+                */
+               if (ksocknal_get_conn_count_by_type(conn_cb, conn->ksnc_type) ==
+                   *ksocknal_tunables.ksnd_conns_per_peer)
+                       LASSERT((conn_cb->ksnr_connected &
+                               BIT(conn->ksnc_type)) != 0);
 
                conn2 = NULL;
                list_for_each(tmp, &peer_ni->ksnp_conns) {
@@ -1606,9 +1687,9 @@ ksocknal_ctl(struct lnet_ni *ni, unsigned int cmd, void *arg)
                read_lock(&ksocknal_data.ksnd_global_lock);
 
                if (data->ioc_count >= 1) {
-                        rc = -ENOENT;
-                } else {
-                        rc = 0;
+                       rc = -ENOENT;
+               } else {
+                       rc = 0;
                        iface = &net->ksnn_interface;
 
                        sa = (void *)&iface->ksni_addr;
index 49055d0..08bbab9 100644 (file)
@@ -156,6 +156,11 @@ struct ksock_tunables {
 #if SOCKNAL_VERSION_DEBUG
         int              *ksnd_protocol;        /* protocol version */
 #endif
+       int              *ksnd_conns_per_peer;  /* for typed mode, yields:
+                                                * 1 + 2*conns_per_peer total
+                                                * for untyped:
+                                                * conns_per_peer total
+                                                */
 };
 
 struct ksock_net {
@@ -357,6 +362,8 @@ struct ksock_conn {
        time64_t                ksnc_tx_last_post;
 };
 
+#define SOCKNAL_CONN_COUNT_MAX_BITS    8       /* max conn count bits */
+
 struct ksock_conn_cb {
        struct list_head        ksnr_connd_list;/* chain on ksnr_connd_routes */
        struct ksock_peer_ni   *ksnr_peer;      /* owning peer_ni */
@@ -371,7 +378,11 @@ struct ksock_conn_cb {
        unsigned int            ksnr_connecting:1;/* connection in progress */
        unsigned int            ksnr_connected:4;/* connections by type */
        unsigned int            ksnr_deleted:1; /* been removed from peer_ni? */
-       int                     ksnr_conn_count;/* # conns for this route */
+       unsigned int            ksnr_ctrl_conn_count:1; /* # conns by type */
+       unsigned int            ksnr_blki_conn_count:8;
+       unsigned int            ksnr_blko_conn_count:8;
+       int                     ksnr_conn_count;/* total # conns for this cb */
+
 };
 
 #define SOCKNAL_KEEPALIVE_PING          1       /* cookie for keepalive ping */
@@ -557,9 +568,12 @@ ksocknal_peer_decref(struct ksock_peer_ni *peer_ni)
 
+/* Effective socklnd timeout: the ksnd_timeout tunable when it is non-zero,
+ * otherwise whatever lnet_get_lnd_timeout() returns.
+ */
 static inline int ksocknal_timeout(void)
 {
-       return *ksocknal_tunables.ksnd_timeout ?
-               *ksocknal_tunables.ksnd_timeout :
-               lnet_get_lnd_timeout();
+       return *ksocknal_tunables.ksnd_timeout ?: lnet_get_lnd_timeout();
+}
+
+/* Effective connections-per-peer value: the ksnd_conns_per_peer tunable,
+ * with 0 replaced by 1 (GNU "?:" shorthand).
+ * NOTE(review): the socklnd.c hunks of this change dereference
+ * ksocknal_tunables.ksnd_conns_per_peer directly rather than calling this
+ * helper, so the zero fallback is not applied there - confirm intent.
+ */
+static inline int ksocknal_conns_per_peer(void)
+{
+       return *ksocknal_tunables.ksnd_conns_per_peer ?: 1;
 }
 
 int ksocknal_startup(struct lnet_ni *ni);
index 502d127..a0c79d2 100644 (file)
@@ -1943,7 +1943,8 @@ ksocknal_connect(struct ksock_conn_cb *conn_cb)
                        type = SOCKLND_CONN_ANY;
                } else if ((wanted & BIT(SOCKLND_CONN_CONTROL)) != 0) {
                        type = SOCKLND_CONN_CONTROL;
-               } else if ((wanted & BIT(SOCKLND_CONN_BULK_IN)) != 0) {
+               } else if ((wanted & BIT(SOCKLND_CONN_BULK_IN)) != 0 &&
+                          conn_cb->ksnr_blki_conn_count <= conn_cb->ksnr_blko_conn_count) {
                        type = SOCKLND_CONN_BULK_IN;
                } else {
                        LASSERT ((wanted & BIT(SOCKLND_CONN_BULK_OUT)) != 0);
index 2204280..6d274a0 100644 (file)
@@ -142,6 +142,10 @@ static unsigned int zc_recv_min_nfrags = 16;
 module_param(zc_recv_min_nfrags, int, 0644);
 MODULE_PARM_DESC(zc_recv_min_nfrags, "minimum # of fragments to enable ZC recv");
 
+/* Connections kept per peer (default 1).  Per the ksnd_conns_per_peer
+ * description in socklnd.h: typed mode yields 1 + 2*conns_per_peer
+ * connections in total, untyped mode yields conns_per_peer in total.
+ * Registered with permissions 0444.
+ */
+static unsigned int conns_per_peer = 1;
+module_param(conns_per_peer, uint, 0444);
+MODULE_PARM_DESC(conns_per_peer, "number of connections per peer");
+
 #ifdef SOCKNAL_BACKOFF
 static int backoff_init = 3;
 module_param(backoff_init, int, 0644);
@@ -201,6 +205,11 @@ int ksocknal_tunables_init(void)
        ksocknal_tunables.ksnd_zc_min_payload     = &zc_min_payload;
        ksocknal_tunables.ksnd_zc_recv            = &zc_recv;
        ksocknal_tunables.ksnd_zc_recv_min_nfrags = &zc_recv_min_nfrags;
+       /* The per-type connection counters in struct ksock_conn_cb are
+        * 8-bit fields (SOCKNAL_CONN_COUNT_MAX_BITS), so a larger
+        * conns_per_peer could never be reached and the "connected" bits
+        * for BULK_IN/BULK_OUT would never be set.  Actually clamp the
+        * tunable here rather than only warning about it.
+        */
+       if (conns_per_peer > ((1 << SOCKNAL_CONN_COUNT_MAX_BITS)-1)) {
+               CWARN("socklnd conns_per_peer is capped at %u.\n",
+                     (1 << SOCKNAL_CONN_COUNT_MAX_BITS)-1);
+               conns_per_peer = (1 << SOCKNAL_CONN_COUNT_MAX_BITS)-1;
+       }
+       ksocknal_tunables.ksnd_conns_per_peer     = &conns_per_peer;
 
        if (enable_irq_affinity) {
                CWARN("irq_affinity is removed from socklnd because modern "