Whamcloud - gitweb
LU-10391 lnet: allow ping packet to contain large nids 28/44628/20
authorMr NeilBrown <neilb@suse.de>
Thu, 27 Oct 2022 13:58:02 +0000 (09:58 -0400)
committerOleg Drokin <green@whamcloud.com>
Tue, 8 Nov 2022 08:50:14 +0000 (08:50 +0000)
The ping packet has an array of fixed-size status entries that only
have room for a 4-byte-address nid.

This patch adds a feature flag which activates a list of variable
sized entries after the initial array.

Each entry contains a 4-byte status and then a nid, rounded to a
multiple of 4 bytes.  The total number of bytes of the ping_info
(header, first array, subsequent list) is stored in the ns_unused field
(renamed ns_msg_size by this patch) of the first entry in the array.

The user-space interfaces only see the initial array.

Test-Parameters: trivial testlist=sanity-lnet
Test-Parameters: serverversion=2.12 serverdistro=el7.9 testlist=runtests
Test-Parameters: clientversion=2.12 testlist=runtests
Signed-off-by: Mr NeilBrown <neilb@suse.de>
Change-Id: I774641d8cda24251337ce2d055caf05a14a9e088
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/44628
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Tested-by: James Simmons <jsimmons@infradead.org>
Reviewed-by: James Simmons <jsimmons@infradead.org>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lnet/include/lnet/lib-types.h
lnet/include/uapi/linux/lnet/lnet-idl.h
lnet/lnet/api-ni.c
lnet/lnet/lib-msg.c
lnet/utils/wirecheck.c

index 2aa823a..8b4a8c2 100644 (file)
@@ -675,6 +675,45 @@ struct lnet_ping_buffer {
 #define LNET_PING_INFO_TO_BUFFER(PINFO)        \
        container_of((PINFO), struct lnet_ping_buffer, pb_info)
 
+static inline int
+lnet_ping_sts_size(const struct lnet_nid *nid)
+{
+       int size;
+
+       if (nid_is_nid4(nid))
+               return sizeof(struct lnet_ni_status);
+
+       size = offsetof(struct lnet_ni_large_status, ns_nid) +
+              NID_BYTES(nid);
+
+       return round_up(size, 4);
+}
+
+static inline struct lnet_ni_large_status *
+lnet_ping_sts_next(const struct lnet_ni_large_status *nis)
+{
+       return (void *)nis + lnet_ping_sts_size(&nis->ns_nid);
+}
+
+static inline bool
+lnet_ping_at_least_two_entries(const struct lnet_ping_info *pi)
+{
+       /* Despite the name, return true if there are at most two entries:
+        * the 4-byte lo0 interface, which is always present, plus <= 1 other.
+        */
+       struct lnet_ni_large_status *lns;
+
+       if ((pi->pi_features & LNET_PING_FEAT_LARGE_ADDR) == 0)
+               return pi->pi_nnis <= 2;
+       /* There is at least 1 large-address entry */
+       if (pi->pi_nnis != 1)
+               return false;
+       lns = (void *)&pi->pi_ni[1];
+       lns = lnet_ping_sts_next(lns);
+
+       return ((void *)pi + lnet_ping_info_size(pi) <= (void *)lns);
+}
+
 struct lnet_nid_list {
        struct list_head nl_list;
        struct lnet_nid nl_nid;
index 0e2b1f8..ff44538 100644 (file)
@@ -247,7 +247,6 @@ struct lnet_counters_common {
        __u64   lcc_drop_length;
 } __attribute__((packed));
 
-
 #define LNET_NI_STATUS_UP      0x15aac0de
 #define LNET_NI_STATUS_DOWN    0xdeadface
 #define LNET_NI_STATUS_INVALID 0x00000000
@@ -255,19 +254,32 @@ struct lnet_counters_common {
 struct lnet_ni_status {
        lnet_nid_t ns_nid;
        __u32      ns_status;
-       __u32      ns_unused;
+       __u32      ns_msg_size; /* represents ping buffer size if message
+                                * contains large NID addresses.
+                                */
 } __attribute__((packed));
 
-/*
- * NB: value of these features equal to LNET_PROTO_PING_VERSION_x
+/* When this appears in lnet_ping_info, it will be large
+ * enough to hold whatever nid is present, rounded up
+ * to a multiple of 4 bytes.
+ * NOTE: all users MUST check ns_nid.nid_size is usable.
+ */
+struct lnet_ni_large_status {
+       __u32           ns_status;
+       struct lnet_nid ns_nid;
+} __attribute__((packed));
+
+/* NB: value of these features equal to LNET_PROTO_PING_VERSION_x
  * of old LNet, so there shouldn't be any compatibility issue
  */
 #define LNET_PING_FEAT_INVAL           (0)             /* no feature */
 #define LNET_PING_FEAT_BASE            (1 << 0)        /* just a ping */
 #define LNET_PING_FEAT_NI_STATUS       (1 << 1)        /* return NI status */
-#define LNET_PING_FEAT_RTE_DISABLED    (1 << 2)        /* Routing enabled */
-#define LNET_PING_FEAT_MULTI_RAIL      (1 << 3)        /* Multi-Rail aware */
+#define LNET_PING_FEAT_RTE_DISABLED    (1 << 2)        /* Routing disabled */
+#define LNET_PING_FEAT_MULTI_RAIL      (1 << 3)        /* Multi-Rail aware */
 #define LNET_PING_FEAT_DISCOVERY       (1 << 4)        /* Supports Discovery */
+#define LNET_PING_FEAT_LARGE_ADDR      (1 << 5)        /* Large addr nids present */
+#define LNET_PING_FEAT_PRIMARY_LARGE   (1 << 6)        /* Primary is first Large addr */
 
 /*
  * All ping feature bits fit to hit the wire.
@@ -277,17 +289,26 @@ struct lnet_ni_status {
  * New feature bits can be added, just be aware that this does change the
  * over-the-wire protocol.
  */
-#define LNET_PING_FEAT_BITS            (LNET_PING_FEAT_BASE | \
-                                        LNET_PING_FEAT_NI_STATUS | \
-                                        LNET_PING_FEAT_RTE_DISABLED | \
-                                        LNET_PING_FEAT_MULTI_RAIL | \
-                                        LNET_PING_FEAT_DISCOVERY)
-
+#define LNET_PING_FEAT_BITS            (LNET_PING_FEAT_BASE |          \
+                                        LNET_PING_FEAT_NI_STATUS |     \
+                                        LNET_PING_FEAT_RTE_DISABLED |  \
+                                        LNET_PING_FEAT_MULTI_RAIL |    \
+                                        LNET_PING_FEAT_DISCOVERY |     \
+                                        LNET_PING_FEAT_LARGE_ADDR |    \
+                                        LNET_PING_FEAT_PRIMARY_LARGE)
+
+/* NOTE:
+ * The first address in pi_ni *must* be the loop-back nid: LNET_NID_LO_0
+ * The second address must be the primary nid for the host unless
+ * LNET_PING_FEAT_PRIMARY_LARGE is set, then the first large address
+ * is the preferred primary.  However nodes that do not recognise that
+ * flag will quietly ignore it.
+ */
 struct lnet_ping_info {
        __u32                   pi_magic;
        __u32                   pi_features;
        lnet_pid_t              pi_pid;
-       __u32                   pi_nnis;
+       __u32                   pi_nnis;        /* number of nid4 entries */
        struct lnet_ni_status   pi_ni[0];
 } __attribute__((packed));
 
@@ -297,7 +318,14 @@ struct lnet_ping_info {
        offsetof(struct lnet_ping_info, pi_ni[LNET_INTERFACES_MIN])
 #define LNET_PING_INFO_LONI(PINFO)      ((PINFO)->pi_ni[0].ns_nid)
 #define LNET_PING_INFO_SEQNO(PINFO)     ((PINFO)->pi_ni[0].ns_status)
-#define lnet_ping_info_size(pinfo)     \
-       offsetof(struct lnet_ping_info, pi_ni[(pinfo)->pi_nnis])
+/* If LNET_PING_FEAT_LARGE_ADDR set, pi_nnis is the number of nid4 entries
+ * and pi_ni[0].ns_msg_size is the total number of bytes, including header and
+ * lnet_ni_large_status entries which follow the lnet_ni_status entries.
+ * This must be a multiple of 4.
+ */
+#define lnet_ping_info_size(pinfo)                             \
+       (((pinfo)->pi_features & LNET_PING_FEAT_LARGE_ADDR)     \
+       ? ((pinfo)->pi_ni[0].ns_msg_size & ~3)                  \
+       : offsetof(struct lnet_ping_info, pi_ni[(pinfo)->pi_nnis]))
 
 #endif
index 2c5683c..970bfed 100644 (file)
@@ -903,8 +903,15 @@ static void lnet_assert_wire_constants(void)
        BUILD_BUG_ON((int)sizeof(((struct lnet_ni_status *)0)->ns_nid) != 8);
        BUILD_BUG_ON((int)offsetof(struct lnet_ni_status, ns_status) != 8);
        BUILD_BUG_ON((int)sizeof(((struct lnet_ni_status *)0)->ns_status) != 4);
-       BUILD_BUG_ON((int)offsetof(struct lnet_ni_status, ns_unused) != 12);
-       BUILD_BUG_ON((int)sizeof(((struct lnet_ni_status *)0)->ns_unused) != 4);
+       BUILD_BUG_ON((int)offsetof(struct lnet_ni_status, ns_msg_size) != 12);
+       BUILD_BUG_ON((int)sizeof(((struct lnet_ni_status *)0)->ns_msg_size) != 4);
+
+       /* Checks for struct lnet_ni_large_status */
+       BUILD_BUG_ON((int)sizeof(struct lnet_ni_large_status) != 24);
+       BUILD_BUG_ON((int)offsetof(struct lnet_ni_large_status, ns_status) != 0);
+       BUILD_BUG_ON((int)sizeof(((struct lnet_ni_large_status *)0)->ns_status) != 4);
+       BUILD_BUG_ON((int)offsetof(struct lnet_ni_large_status, ns_nid) != 4);
+       BUILD_BUG_ON((int)sizeof(((struct lnet_ni_large_status *)0)->ns_nid) != 20);
 
        /* Checks for struct lnet_ping_info and related constants */
        BUILD_BUG_ON(LNET_PROTO_PING_MAGIC != 0x70696E67);
@@ -914,7 +921,9 @@ static void lnet_assert_wire_constants(void)
        BUILD_BUG_ON(LNET_PING_FEAT_RTE_DISABLED != 4);
        BUILD_BUG_ON(LNET_PING_FEAT_MULTI_RAIL != 8);
        BUILD_BUG_ON(LNET_PING_FEAT_DISCOVERY != 16);
-       BUILD_BUG_ON(LNET_PING_FEAT_BITS != 31);
+       BUILD_BUG_ON(LNET_PING_FEAT_LARGE_ADDR != 32);
+       BUILD_BUG_ON(LNET_PING_FEAT_PRIMARY_LARGE != 64);
+       BUILD_BUG_ON(LNET_PING_FEAT_BITS != 127);
 
        /* Checks for struct lnet_ping_info */
        BUILD_BUG_ON((int)sizeof(struct lnet_ping_info) != 16);
@@ -1854,21 +1863,7 @@ lnet_get_net_ni_bytes_locked(struct lnet_net *net)
        int bytes = 0;
 
        list_for_each_entry(ni, &net->net_ni_list, ni_netlist)
-               if (nid_is_nid4(&ni->ni_nid))
-                       bytes += sizeof(struct lnet_ni_status);
-
-       return bytes;
-}
-
-static inline int
-lnet_get_net_ni_bytes_pre(struct lnet_net *net)
-{
-       struct lnet_ni *ni;
-       int bytes = 0;
-
-       list_for_each_entry(ni, &net->net_ni_added, ni_netlist)
-               if (nid_is_nid4(&ni->ni_nid))
-                       bytes += sizeof(struct lnet_ni_status);
+               bytes += lnet_ping_sts_size(&ni->ni_nid);
 
        return bytes;
 }
@@ -1884,8 +1879,7 @@ lnet_get_ni_bytes(void)
 
        list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
                list_for_each_entry(ni, &net->net_ni_list, ni_netlist)
-                       if (nid_is_nid4(&ni->ni_nid))
-                               bytes += sizeof(struct lnet_ni_status);
+                       bytes += lnet_ping_sts_size(&ni->ni_nid);
        }
 
        lnet_net_unlock(0);
@@ -1896,6 +1890,7 @@ lnet_get_ni_bytes(void)
 void
 lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf)
 {
+       struct lnet_ni_large_status *lstat, *lend;
        struct lnet_ni_status *stat, *end;
        int nnis;
        int i;
@@ -1910,6 +1905,19 @@ lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf)
        for (i = 0; i < nnis && stat + 1 <= end; i++, stat++) {
                __swab64s(&stat->ns_nid);
                __swab32s(&stat->ns_status);
+               if (i == 0)
+                       /* Might be total size */
+                       __swab32s(&stat->ns_msg_size);
+       }
+       if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_LARGE_ADDR))
+               return;
+
+       lstat = (struct lnet_ni_large_status *)stat;
+       lend = (void *)end;
+       while (lstat + 1 <= lend) {
+               __swab32s(&lstat->ns_status);
+               /* struct lnet_nid never needs to be swabed */
+               lstat = lnet_ping_sts_next(lstat);
        }
 }
 
@@ -2040,6 +2048,7 @@ lnet_ping_target_install_locked(struct lnet_ping_buffer *pbuf)
        struct lnet_ni *ni;
        struct lnet_net *net;
        struct lnet_ni_status *ns, *end;
+       struct lnet_ni_large_status *lns, *lend;
        int rc;
 
        pbuf->pb_info.pi_nnis = 0;
@@ -2047,8 +2056,14 @@ lnet_ping_target_install_locked(struct lnet_ping_buffer *pbuf)
        end = (void *)&pbuf->pb_info + pbuf->pb_nbytes;
        list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
                list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
-                       if (!nid_is_nid4(&ni->ni_nid))
+                       if (!nid_is_nid4(&ni->ni_nid)) {
+                               if (ns == &pbuf->pb_info.pi_ni[1]) {
+                                       /* This is primary, and it is long */
+                                       pbuf->pb_info.pi_features |=
+                                               LNET_PING_FEAT_PRIMARY_LARGE;
+                               }
                                continue;
+                       }
                        LASSERT(ns + 1 <= end);
                        ns->ns_nid = lnet_nid_to_nid4(&ni->ni_nid);
 
@@ -2062,6 +2077,31 @@ lnet_ping_target_install_locked(struct lnet_ping_buffer *pbuf)
                }
        }
 
+       /* Second pass: append one variable-sized lnet_ni_large_status entry
+        * for each NI whose nid is not a nid4, starting directly after the
+        * fixed-size lnet_ni_status entries written above.
+        */
+       lns = (void *)ns;
+       lend = (void *)end;
+       list_for_each_entry(net, &the_lnet.ln_nets, net_list) {
+               list_for_each_entry(ni, &net->net_ni_list, ni_netlist) {
+                       if (nid_is_nid4(&ni->ni_nid))
+                               continue;
+                       LASSERT(lns + 1 <= lend);
+
+                       lns->ns_nid = ni->ni_nid;
+
+                       lnet_ni_lock(ni);
+                       /* Store the status in the large entry itself.  'ns'
+                        * still points at the start of the large-entry area,
+                        * so writing through it would overwrite part of the
+                        * first large entry's nid and leave this entry's
+                        * status unset.
+                        */
+                       lns->ns_status = lnet_ni_get_status_locked(ni);
+                       ni->ni_status = &lns->ns_status;
+                       lnet_ni_unlock(ni);
+
+                       lns = lnet_ping_sts_next(lns);
+               }
+       }
+       if ((void *)lns > (void *)ns) {
+               /* At least one large entry was written: record the total
+                * info size and advertise the large-address feature.
+                */
+               pbuf->pb_info.pi_ni[0].ns_msg_size =
+                       (void *)lns - (void *)&pbuf->pb_info;
+               pbuf->pb_info.pi_features |= LNET_PING_FEAT_LARGE_ADDR;
+       }
+
        /* We (ab)use the ns_status of the loopback interface to
         * transmit the sequence number. The first interface listed
         * must be the loopback interface.
@@ -3479,8 +3519,7 @@ static int lnet_add_net_common(struct lnet_net *net,
        struct lnet_ping_buffer *pbuf;
        struct lnet_remotenet *rnet;
        struct lnet_ni *ni;
-       int net_ni_bytes;
-       __u32 net_id;
+       u32 net_id;
        int rc;
 
        lnet_net_lock(LNET_LOCK_EX);
@@ -3497,26 +3536,6 @@ static int lnet_add_net_common(struct lnet_net *net,
                return -EUSERS;
        }
 
-       /*
-        * make sure you calculate the correct number of slots in the ping
-        * buffer. Since the ping info is a flattened list of all the NIs,
-        * we should allocate enough slots to accomodate the number of NIs
-        * which will be added.
-        *
-        * since ni hasn't been configured yet, use
-        * lnet_get_net_ni_bytes_pre() which checks the net_ni_added list
-        */
-       net_ni_bytes = lnet_get_net_ni_bytes_pre(net);
-
-       rc = lnet_ping_target_setup(&pbuf, &ping_mdh,
-                                   LNET_PING_INFO_HDR_SIZE +
-                                   net_ni_bytes + lnet_get_ni_bytes(),
-                                   false);
-       if (rc < 0) {
-               lnet_net_free(net);
-               return rc;
-       }
-
        if (tun)
                memcpy(&net->net_tunables,
                       &tun->lt_cmn, sizeof(net->net_tunables));
@@ -3528,7 +3547,21 @@ static int lnet_add_net_common(struct lnet_net *net,
        rc = lnet_startup_lndnet(net,
                                 (tun) ? &tun->lt_tun : NULL);
        if (rc < 0)
-               goto failed;
+               return rc;
+
+       /* make sure you calculate the correct number of slots in the ping
+        * buffer. Since the ping info is a flattened list of all the NIs,
+        * we should allocate enough slots to accommodate the number of NIs
+        * which will be added.
+        */
+       rc = lnet_ping_target_setup(&pbuf, &ping_mdh,
+                                   LNET_PING_INFO_HDR_SIZE +
+                                   lnet_get_ni_bytes(),
+                                   false);
+       if (rc < 0) {
+               lnet_shutdown_lndnet(net);
+               return rc;
+       }
 
        lnet_net_lock(LNET_LOCK_EX);
        net = lnet_get_net_locked(net_id);
@@ -3762,7 +3795,7 @@ int lnet_dyn_del_ni(struct lnet_nid *nid)
        rc = lnet_ping_target_setup(&pbuf, &ping_mdh,
                                    (LNET_PING_INFO_HDR_SIZE +
                                     lnet_get_ni_bytes() -
-                                    sizeof(pbuf->pb_info.pi_ni[0])),
+                                    lnet_ping_sts_size(&ni->ni_nid)),
                                    false);
        if (rc != 0)
                goto unlock_api_mutex;
@@ -5545,10 +5578,12 @@ static int lnet_ping(struct lnet_process_id id4, struct lnet_nid *src_nid,
                goto fail_ping_buffer_decref;
        }
 
-       /* Test if smaller than lnet_pinginfo with no pi_ni status info */
-       if (nob < LNET_PING_INFO_HDR_SIZE) {
+       /* Test if smaller than lnet_pinginfo with just one pi_ni status info.
+        * That one might contain size when large nids are used.
+        */
+       if (nob < LNET_PING_INFO_SIZE(1)) {
                CERROR("%s: Short reply %d(%lu min)\n",
-                      libcfs_idstr(&id), nob, LNET_PING_INFO_HDR_SIZE);
+                      libcfs_idstr(&id), nob, LNET_PING_INFO_SIZE(1));
                goto fail_ping_buffer_decref;
        }
 
index f54acbf..245499e 100644 (file)
@@ -831,7 +831,7 @@ lnet_health_check(struct lnet_msg *msg)
                 * I only have a single (non-lolnd) interface.
                 */
                pi = &the_lnet.ln_ping_target->pb_info;
-               if (pi->pi_nnis <= 2) {
+               if (lnet_ping_at_least_two_entries(pi)) {
                        handle_local_health = false;
                        attempt_local_resend = false;
                }
index 2a23e74..4edc523 100644 (file)
@@ -165,7 +165,11 @@ check_lnet_ni_status(void)
        CHECK_STRUCT(struct lnet_ni_status);
        CHECK_MEMBER(struct lnet_ni_status, ns_nid);
        CHECK_MEMBER(struct lnet_ni_status, ns_status);
-       CHECK_MEMBER(struct lnet_ni_status, ns_unused);
+       CHECK_MEMBER(struct lnet_ni_status, ns_msg_size);
+
+       CHECK_STRUCT(struct lnet_ni_large_status);
+       CHECK_MEMBER(struct lnet_ni_large_status, ns_status);
+       CHECK_MEMBER(struct lnet_ni_large_status, ns_nid);
 }
 
 void
@@ -181,6 +185,8 @@ check_lnet_ping_info(void)
        CHECK_VALUE(LNET_PING_FEAT_RTE_DISABLED);
        CHECK_VALUE(LNET_PING_FEAT_MULTI_RAIL);
        CHECK_VALUE(LNET_PING_FEAT_DISCOVERY);
+       CHECK_VALUE(LNET_PING_FEAT_LARGE_ADDR);
+       CHECK_VALUE(LNET_PING_FEAT_PRIMARY_LARGE);
        CHECK_VALUE(LNET_PING_FEAT_BITS);
 
        CHECK_STRUCT(struct lnet_ping_info);