From: Mr NeilBrown Date: Thu, 27 Oct 2022 13:58:02 +0000 (-0400) Subject: LU-10391 lnet: allow ping packet to contain large nids X-Git-Tag: 2.15.53~101 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=refs%2Fchanges%2F28%2F44628%2F20 LU-10391 lnet: allow ping packet to contain large nids The ping packet has an array of fixed-size status entries that only have room for a 4-byte-address nid. This patch adds a feature flag which activates a list of variable-sized entries after the initial array. Each entry contains a 4-byte status and then a nid, rounded to a multiple of 4 bytes. The total number of bytes of the ping_info (header, first array, subsequent list) is stored in the ns_unused field (renamed ns_msg_size by this patch) of the first entry in the array. The user-space interfaces only see the initial array. Test-Parameters: trivial testlist=sanity-lnet Test-Parameters: serverversion=2.12 serverdistro=el7.9 testlist=runtests Test-Parameters: clientversion=2.12 testlist=runtests Signed-off-by: Mr NeilBrown Change-Id: I774641d8cda24251337ce2d055caf05a14a9e088 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/44628 Tested-by: jenkins Tested-by: Maloo Tested-by: James Simmons Reviewed-by: James Simmons Reviewed-by: Serguei Smirnov Reviewed-by: Oleg Drokin --- diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 2aa823a..8b4a8c2 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -675,6 +675,45 @@ struct lnet_ping_buffer { #define LNET_PING_INFO_TO_BUFFER(PINFO) \ container_of((PINFO), struct lnet_ping_buffer, pb_info) +static inline int +lnet_ping_sts_size(const struct lnet_nid *nid) +{ + int size; + + if (nid_is_nid4(nid)) + return sizeof(struct lnet_ni_status); + + size = offsetof(struct lnet_ni_large_status, ns_nid) + + NID_BYTES(nid); + + return round_up(size, 4); +} + +static inline struct lnet_ni_large_status * +lnet_ping_sts_next(const struct lnet_ni_large_status *nis) +{ + return (void 
*)nis + lnet_ping_sts_size(&nis->ns_nid); +} + +static inline bool +lnet_ping_at_least_two_entries(const struct lnet_ping_info *pi) +{ + /* Return true if we have at lease two entries. There is always a + * least one, a 4-byte lo0 interface. + */ + struct lnet_ni_large_status *lns; + + if ((pi->pi_features & LNET_PING_FEAT_LARGE_ADDR) == 0) + return pi->pi_nnis <= 2; + /* There is at least 1 large-address entry */ + if (pi->pi_nnis != 1) + return false; + lns = (void *)&pi->pi_ni[1]; + lns = lnet_ping_sts_next(lns); + + return ((void *)pi + lnet_ping_info_size(pi) <= (void *)lns); +} + struct lnet_nid_list { struct list_head nl_list; struct lnet_nid nl_nid; diff --git a/lnet/include/uapi/linux/lnet/lnet-idl.h b/lnet/include/uapi/linux/lnet/lnet-idl.h index 0e2b1f8..ff44538 100644 --- a/lnet/include/uapi/linux/lnet/lnet-idl.h +++ b/lnet/include/uapi/linux/lnet/lnet-idl.h @@ -247,7 +247,6 @@ struct lnet_counters_common { __u64 lcc_drop_length; } __attribute__((packed)); - #define LNET_NI_STATUS_UP 0x15aac0de #define LNET_NI_STATUS_DOWN 0xdeadface #define LNET_NI_STATUS_INVALID 0x00000000 @@ -255,19 +254,32 @@ struct lnet_counters_common { struct lnet_ni_status { lnet_nid_t ns_nid; __u32 ns_status; - __u32 ns_unused; + __u32 ns_msg_size; /* represents ping buffer size if message + * contains large NID addresses. + */ } __attribute__((packed)); -/* - * NB: value of these features equal to LNET_PROTO_PING_VERSION_x +/* When this appears in lnet_ping_info, it will be large + * enough to hold whatever nid is present, rounded up + * to a multiple of 4 bytes. + * NOTE: all users MUST check ns_nid.nid_size is usable. 
+ */ +struct lnet_ni_large_status { + __u32 ns_status; + struct lnet_nid ns_nid; +} __attribute__((packed)); + +/* NB: value of these features equal to LNET_PROTO_PING_VERSION_x * of old LNet, so there shouldn't be any compatibility issue */ #define LNET_PING_FEAT_INVAL (0) /* no feature */ #define LNET_PING_FEAT_BASE (1 << 0) /* just a ping */ #define LNET_PING_FEAT_NI_STATUS (1 << 1) /* return NI status */ -#define LNET_PING_FEAT_RTE_DISABLED (1 << 2) /* Routing enabled */ -#define LNET_PING_FEAT_MULTI_RAIL (1 << 3) /* Multi-Rail aware */ +#define LNET_PING_FEAT_RTE_DISABLED (1 << 2) /* Routing enabled */ +#define LNET_PING_FEAT_MULTI_RAIL (1 << 3) /* Multi-Rail aware */ #define LNET_PING_FEAT_DISCOVERY (1 << 4) /* Supports Discovery */ +#define LNET_PING_FEAT_LARGE_ADDR (1 << 5) /* Large addr nids present */ +#define LNET_PING_FEAT_PRIMARY_LARGE (1 << 6) /* Primary is first Large addr */ /* * All ping feature bits fit to hit the wire. @@ -277,17 +289,26 @@ struct lnet_ni_status { * New feature bits can be added, just be aware that this does change the * over-the-wire protocol. */ -#define LNET_PING_FEAT_BITS (LNET_PING_FEAT_BASE | \ - LNET_PING_FEAT_NI_STATUS | \ - LNET_PING_FEAT_RTE_DISABLED | \ - LNET_PING_FEAT_MULTI_RAIL | \ - LNET_PING_FEAT_DISCOVERY) - +#define LNET_PING_FEAT_BITS (LNET_PING_FEAT_BASE | \ + LNET_PING_FEAT_NI_STATUS | \ + LNET_PING_FEAT_RTE_DISABLED | \ + LNET_PING_FEAT_MULTI_RAIL | \ + LNET_PING_FEAT_DISCOVERY | \ + LNET_PING_FEAT_LARGE_ADDR | \ + LNET_PING_FEAT_PRIMARY_LARGE) + +/* NOTE: + * The first address in pi_ni *must* be the loop-back nid: LNET_NID_LO_0 + * The second address must be the primary nid for the host unless + * LNET_PING_FEAT_PRIMARY_LARGE is set, then the first large address + * is the preferred primary. However nodes that do not recognise that + * flag will quietly ignore it. 
+ */ struct lnet_ping_info { __u32 pi_magic; __u32 pi_features; lnet_pid_t pi_pid; - __u32 pi_nnis; + __u32 pi_nnis; /* number of nid4 entries */ struct lnet_ni_status pi_ni[0]; } __attribute__((packed)); @@ -297,7 +318,14 @@ struct lnet_ping_info { offsetof(struct lnet_ping_info, pi_ni[LNET_INTERFACES_MIN]) #define LNET_PING_INFO_LONI(PINFO) ((PINFO)->pi_ni[0].ns_nid) #define LNET_PING_INFO_SEQNO(PINFO) ((PINFO)->pi_ni[0].ns_status) -#define lnet_ping_info_size(pinfo) \ - offsetof(struct lnet_ping_info, pi_ni[(pinfo)->pi_nnis]) +/* If LNET_PING_FEAT_LARGE_ADDR set, pi_nnis is the number of nid4 entries + * and pi_ni[0].ns_msg_size is the total number of bytes, including header and + * lnet_ni_large_status entries which follow the lnet_ni_status entries. + * This must be a multiple of 4. + */ +#define lnet_ping_info_size(pinfo) \ + (((pinfo)->pi_features & LNET_PING_FEAT_LARGE_ADDR) \ + ? ((pinfo)->pi_ni[0].ns_msg_size & ~3) \ + : offsetof(struct lnet_ping_info, pi_ni[(pinfo)->pi_nnis])) #endif diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c index 2c5683c..970bfed 100644 --- a/lnet/lnet/api-ni.c +++ b/lnet/lnet/api-ni.c @@ -903,8 +903,15 @@ static void lnet_assert_wire_constants(void) BUILD_BUG_ON((int)sizeof(((struct lnet_ni_status *)0)->ns_nid) != 8); BUILD_BUG_ON((int)offsetof(struct lnet_ni_status, ns_status) != 8); BUILD_BUG_ON((int)sizeof(((struct lnet_ni_status *)0)->ns_status) != 4); - BUILD_BUG_ON((int)offsetof(struct lnet_ni_status, ns_unused) != 12); - BUILD_BUG_ON((int)sizeof(((struct lnet_ni_status *)0)->ns_unused) != 4); + BUILD_BUG_ON((int)offsetof(struct lnet_ni_status, ns_msg_size) != 12); + BUILD_BUG_ON((int)sizeof(((struct lnet_ni_status *)0)->ns_msg_size) != 4); + + /* Checks for struct lnet_ni_large_status */ + BUILD_BUG_ON((int)sizeof(struct lnet_ni_large_status) != 24); + BUILD_BUG_ON((int)offsetof(struct lnet_ni_large_status, ns_status) != 0); + BUILD_BUG_ON((int)sizeof(((struct lnet_ni_large_status *)0)->ns_status) != 4); + 
BUILD_BUG_ON((int)offsetof(struct lnet_ni_large_status, ns_nid) != 4); + BUILD_BUG_ON((int)sizeof(((struct lnet_ni_large_status *)0)->ns_nid) != 20); /* Checks for struct lnet_ping_info and related constants */ BUILD_BUG_ON(LNET_PROTO_PING_MAGIC != 0x70696E67); @@ -914,7 +921,9 @@ static void lnet_assert_wire_constants(void) BUILD_BUG_ON(LNET_PING_FEAT_RTE_DISABLED != 4); BUILD_BUG_ON(LNET_PING_FEAT_MULTI_RAIL != 8); BUILD_BUG_ON(LNET_PING_FEAT_DISCOVERY != 16); - BUILD_BUG_ON(LNET_PING_FEAT_BITS != 31); + BUILD_BUG_ON(LNET_PING_FEAT_LARGE_ADDR != 32); + BUILD_BUG_ON(LNET_PING_FEAT_PRIMARY_LARGE != 64); + BUILD_BUG_ON(LNET_PING_FEAT_BITS != 127); /* Checks for struct lnet_ping_info */ BUILD_BUG_ON((int)sizeof(struct lnet_ping_info) != 16); @@ -1854,21 +1863,7 @@ lnet_get_net_ni_bytes_locked(struct lnet_net *net) int bytes = 0; list_for_each_entry(ni, &net->net_ni_list, ni_netlist) - if (nid_is_nid4(&ni->ni_nid)) - bytes += sizeof(struct lnet_ni_status); - - return bytes; -} - -static inline int -lnet_get_net_ni_bytes_pre(struct lnet_net *net) -{ - struct lnet_ni *ni; - int bytes = 0; - - list_for_each_entry(ni, &net->net_ni_added, ni_netlist) - if (nid_is_nid4(&ni->ni_nid)) - bytes += sizeof(struct lnet_ni_status); + bytes += lnet_ping_sts_size(&ni->ni_nid); return bytes; } @@ -1884,8 +1879,7 @@ lnet_get_ni_bytes(void) list_for_each_entry(net, &the_lnet.ln_nets, net_list) { list_for_each_entry(ni, &net->net_ni_list, ni_netlist) - if (nid_is_nid4(&ni->ni_nid)) - bytes += sizeof(struct lnet_ni_status); + bytes += lnet_ping_sts_size(&ni->ni_nid); } lnet_net_unlock(0); @@ -1896,6 +1890,7 @@ lnet_get_ni_bytes(void) void lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf) { + struct lnet_ni_large_status *lstat, *lend; struct lnet_ni_status *stat, *end; int nnis; int i; @@ -1910,6 +1905,19 @@ lnet_swap_pinginfo(struct lnet_ping_buffer *pbuf) for (i = 0; i < nnis && stat + 1 <= end; i++, stat++) { __swab64s(&stat->ns_nid); __swab32s(&stat->ns_status); + if (i == 0) + /* 
Might be total size */ + __swab32s(&stat->ns_msg_size); + } + if (!(pbuf->pb_info.pi_features & LNET_PING_FEAT_LARGE_ADDR)) + return; + + lstat = (struct lnet_ni_large_status *)stat; + lend = (void *)end; + while (lstat + 1 <= lend) { + __swab32s(&lstat->ns_status); + /* struct lnet_nid never needs to be swabed */ + lstat = lnet_ping_sts_next(lstat); } } @@ -2040,6 +2048,7 @@ lnet_ping_target_install_locked(struct lnet_ping_buffer *pbuf) struct lnet_ni *ni; struct lnet_net *net; struct lnet_ni_status *ns, *end; + struct lnet_ni_large_status *lns, *lend; int rc; pbuf->pb_info.pi_nnis = 0; @@ -2047,8 +2056,14 @@ lnet_ping_target_install_locked(struct lnet_ping_buffer *pbuf) end = (void *)&pbuf->pb_info + pbuf->pb_nbytes; list_for_each_entry(net, &the_lnet.ln_nets, net_list) { list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { - if (!nid_is_nid4(&ni->ni_nid)) + if (!nid_is_nid4(&ni->ni_nid)) { + if (ns == &pbuf->pb_info.pi_ni[1]) { + /* This is primary, and it is long */ + pbuf->pb_info.pi_features |= + LNET_PING_FEAT_PRIMARY_LARGE; + } continue; + } LASSERT(ns + 1 <= end); ns->ns_nid = lnet_nid_to_nid4(&ni->ni_nid); @@ -2062,6 +2077,31 @@ lnet_ping_target_install_locked(struct lnet_ping_buffer *pbuf) } } + lns = (void *)ns; + lend = (void *)end; + list_for_each_entry(net, &the_lnet.ln_nets, net_list) { + list_for_each_entry(ni, &net->net_ni_list, ni_netlist) { + if (nid_is_nid4(&ni->ni_nid)) + continue; + LASSERT(lns + 1 <= lend); + + lns->ns_nid = ni->ni_nid; + + lnet_ni_lock(ni); + ns->ns_status = lnet_ni_get_status_locked(ni); + ni->ni_status = &lns->ns_status; + lnet_ni_unlock(ni); + + lns = lnet_ping_sts_next(lns); + } + } + if ((void *)lns > (void *)ns) { + /* Record total info size */ + pbuf->pb_info.pi_ni[0].ns_msg_size = + (void *)lns - (void *)&pbuf->pb_info; + pbuf->pb_info.pi_features |= LNET_PING_FEAT_LARGE_ADDR; + } + /* We (ab)use the ns_status of the loopback interface to * transmit the sequence number. 
The first interface listed * must be the loopback interface. @@ -3479,8 +3519,7 @@ static int lnet_add_net_common(struct lnet_net *net, struct lnet_ping_buffer *pbuf; struct lnet_remotenet *rnet; struct lnet_ni *ni; - int net_ni_bytes; - __u32 net_id; + u32 net_id; int rc; lnet_net_lock(LNET_LOCK_EX); @@ -3497,26 +3536,6 @@ static int lnet_add_net_common(struct lnet_net *net, return -EUSERS; } - /* - * make sure you calculate the correct number of slots in the ping - * buffer. Since the ping info is a flattened list of all the NIs, - * we should allocate enough slots to accomodate the number of NIs - * which will be added. - * - * since ni hasn't been configured yet, use - * lnet_get_net_ni_bytes_pre() which checks the net_ni_added list - */ - net_ni_bytes = lnet_get_net_ni_bytes_pre(net); - - rc = lnet_ping_target_setup(&pbuf, &ping_mdh, - LNET_PING_INFO_HDR_SIZE + - net_ni_bytes + lnet_get_ni_bytes(), - false); - if (rc < 0) { - lnet_net_free(net); - return rc; - } - if (tun) memcpy(&net->net_tunables, &tun->lt_cmn, sizeof(net->net_tunables)); @@ -3528,7 +3547,21 @@ static int lnet_add_net_common(struct lnet_net *net, rc = lnet_startup_lndnet(net, (tun) ? &tun->lt_tun : NULL); if (rc < 0) - goto failed; + return rc; + + /* make sure you calculate the correct number of slots in the ping + * buffer. Since the ping info is a flattened list of all the NIs, + * we should allocate enough slots to accomodate the number of NIs + * which will be added. 
+ */ + rc = lnet_ping_target_setup(&pbuf, &ping_mdh, + LNET_PING_INFO_HDR_SIZE + + lnet_get_ni_bytes(), + false); + if (rc < 0) { + lnet_shutdown_lndnet(net); + return rc; + } lnet_net_lock(LNET_LOCK_EX); net = lnet_get_net_locked(net_id); @@ -3762,7 +3795,7 @@ int lnet_dyn_del_ni(struct lnet_nid *nid) rc = lnet_ping_target_setup(&pbuf, &ping_mdh, (LNET_PING_INFO_HDR_SIZE + lnet_get_ni_bytes() - - sizeof(pbuf->pb_info.pi_ni[0])), + lnet_ping_sts_size(&ni->ni_nid)), false); if (rc != 0) goto unlock_api_mutex; @@ -5545,10 +5578,12 @@ static int lnet_ping(struct lnet_process_id id4, struct lnet_nid *src_nid, goto fail_ping_buffer_decref; } - /* Test if smaller than lnet_pinginfo with no pi_ni status info */ - if (nob < LNET_PING_INFO_HDR_SIZE) { + /* Test if smaller than lnet_pinginfo with just one pi_ni status info. + * That one might contain size when large nids are used. + */ + if (nob < LNET_PING_INFO_SIZE(1)) { CERROR("%s: Short reply %d(%lu min)\n", - libcfs_idstr(&id), nob, LNET_PING_INFO_HDR_SIZE); + libcfs_idstr(&id), nob, LNET_PING_INFO_SIZE(1)); goto fail_ping_buffer_decref; } diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index f54acbf..245499e 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -831,7 +831,7 @@ lnet_health_check(struct lnet_msg *msg) * I only have a single (non-lolnd) interface. 
*/ pi = &the_lnet.ln_ping_target->pb_info; - if (pi->pi_nnis <= 2) { + if (lnet_ping_at_least_two_entries(pi)) { handle_local_health = false; attempt_local_resend = false; } diff --git a/lnet/utils/wirecheck.c b/lnet/utils/wirecheck.c index 2a23e74..4edc523 100644 --- a/lnet/utils/wirecheck.c +++ b/lnet/utils/wirecheck.c @@ -165,7 +165,11 @@ check_lnet_ni_status(void) CHECK_STRUCT(struct lnet_ni_status); CHECK_MEMBER(struct lnet_ni_status, ns_nid); CHECK_MEMBER(struct lnet_ni_status, ns_status); - CHECK_MEMBER(struct lnet_ni_status, ns_unused); + CHECK_MEMBER(struct lnet_ni_status, ns_msg_size); + + CHECK_STRUCT(struct lnet_ni_large_status); + CHECK_MEMBER(struct lnet_ni_large_status, ns_status); + CHECK_MEMBER(struct lnet_ni_large_status, ns_nid); } void @@ -181,6 +185,8 @@ check_lnet_ping_info(void) CHECK_VALUE(LNET_PING_FEAT_RTE_DISABLED); CHECK_VALUE(LNET_PING_FEAT_MULTI_RAIL); CHECK_VALUE(LNET_PING_FEAT_DISCOVERY); + CHECK_VALUE(LNET_PING_FEAT_LARGE_ADDR); + CHECK_VALUE(LNET_PING_FEAT_PRIMARY_LARGE); CHECK_VALUE(LNET_PING_FEAT_BITS); CHECK_STRUCT(struct lnet_ping_info);