Whamcloud - gitweb
LU-9680 lnet: Fault injection list ioctls to netlink 33/53733/11
author Chris Horn <chris.horn@hpe.com>
Wed, 12 Jun 2024 14:12:34 +0000 (10:12 -0400)
committer Oleg Drokin <green@whamcloud.com>
Tue, 25 Jun 2024 03:23:28 +0000 (03:23 +0000)
Convert the fault injection list ioctls to a netlink implementation.

sanity-lnet tests that use fault injection can now be enabled for
large NIDs.

Test-Parameters: trivial testlist=sanity-lnet
Signed-off-by: Chris Horn <chris.horn@hpe.com>
Change-Id: Ieaf9c01401fc0841c1e5805667531ba3455e8110
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53733
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: James Simmons <jsimmons@infradead.org>
Reviewed-by: Serguei Smirnov <ssmirnov@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
libcfs/autoconf/lustre-libcfs.m4
libcfs/include/libcfs/linux/linux-net.h
lnet/include/lnet/lib-lnet.h
lnet/include/lnet/lib-types.h
lnet/include/uapi/linux/lnet/lnet-dlc.h
lnet/lnet/api-ni.c
lnet/lnet/net_fault.c
lnet/utils/lnetconfig/liblnetconfig.c
lnet/utils/lnetconfig/liblnetconfig.h
lnet/utils/lnetctl.c
lustre/utils/portals.c

index 6cc8c3b..91542f9 100644 (file)
@@ -1057,6 +1057,27 @@ AC_DEFUN([LIBCFS_REFCOUNT_T], [
 ]) # LIBCFS_REFCOUNT_T
 
 #
+# HAVE_NLA_PUT_U64_64BIT
+#
+# Kernel version 4.10 commit 73520786b0793c612ef4de3e9addb2ec411bea20
+# added nla_put_u64_64bit
+#
+# LIBCFS_SRC_NLA_PUT_U64_64BIT registers a test source that calls
+# nla_put_u64_64bit(); LIBCFS_NLA_PUT_U64_64BIT reports the outcome of
+# that test and defines HAVE_NLA_PUT_U64_64BIT when the check passes.
+#
+AC_DEFUN([LIBCFS_SRC_NLA_PUT_U64_64BIT], [
+       LB2_LINUX_TEST_SRC([nla_put_u64_64bit], [
+               #include <net/genetlink.h>
+       ],[
+               nla_put_u64_64bit(NULL, 0, 0, 0)
+       ])
+])
+AC_DEFUN([LIBCFS_NLA_PUT_U64_64BIT], [
+       LB2_MSG_LINUX_TEST_RESULT([if 'nla_put_u64_64bit()' exists],
+       [nla_put_u64_64bit], [
+               AC_DEFINE(HAVE_NLA_PUT_U64_64BIT, 1,
+                       ['nla_put_u64_64bit' is available])
+       ])
+]) # LIBCFS_NLA_PUT_U64_64BIT
+
+#
 # Kernel version 4.12 commit 499118e966f1d2150bd66647c8932343c4e9a0b8
 # introduce memalloc_noreclaim_{save,restore}
 #
@@ -2484,6 +2505,7 @@ AC_DEFUN([LIBCFS_PROG_LINUX_SRC], [
        LIBCFS_SRC_RHASHTABLE_WALK_ENTER
        # 4.10
        LIBCFS_SRC_HOTPLUG_STATE_MACHINE
+       LIBCFS_SRC_NLA_PUT_U64_64BIT
        # 4.11
        LIBCFS_SRC_NL_EXT_ACK
        LIBCFS_SRC_RHASHTABLE_LOOKUP_GET_INSERT_FAST
@@ -2635,6 +2657,7 @@ AC_DEFUN([LIBCFS_PROG_LINUX_RESULTS], [
        LIBCFS_RHASHTABLE_WALK_ENTER
        # 4.10
        LIBCFS_HOTPLUG_STATE_MACHINE
+       LIBCFS_NLA_PUT_U64_64BIT
        # 4.11
        LIBCFS_NL_EXT_ACK
        LIBCFS_RHASHTABLE_LOOKUP_GET_INSERT_FAST
index 6785b7f..6f9daf7 100644 (file)
@@ -71,6 +71,11 @@ char *nla_strdup(const struct nlattr *nla, gfp_t flags);
 #define nla_strscpy    nla_strlcpy
 #endif /* HAVE_NLA_STRLCPY */
 
+/* When configure did not find nla_put_u64_64bit(), provide a fallback
+ * that forwards to nla_put_u64() and ignores the pad attribute argument.
+ */
+#ifndef HAVE_NLA_PUT_U64_64BIT
+#define nla_put_u64_64bit(skb, type, value, padattr) \
+       nla_put_u64(skb, type, value)
+#endif
+
 #ifndef HAVE_NL_PARSE_WITH_EXT_ACK
 
 #define NL_SET_BAD_ATTR(extack, attr)
index 58b49b6..88037e2 100644 (file)
@@ -19,6 +19,7 @@
 #define CFS_FAIL_MATCH_MD_NID          0xe001
 #define CFS_FAIL_DELAY_MSG_FORWARD     0xe002
 
+#include <linux/generic-radix-tree.h>
 #include <linux/netdevice.h>
 
 #include <libcfs/libcfs.h>
@@ -816,18 +817,31 @@ struct lnet_fault_large_attr {
        } u;
 };
 
+/* Snapshot of one fault rule: its attributes together with its statistics */
+struct lnet_rule_properties {
+       struct lnet_fault_large_attr attr;
+       struct lnet_fault_stat stat;
+};
+
+/* State carried across the callbacks of a fault rule Netlink dump */
+struct lnet_genl_fault_rule_list {
+       /* index of the next lgfrl_list entry to be sent by the dump */
+       unsigned int                            lgfrl_index;
+       /* number of rule snapshots stored in lgfrl_list */
+       unsigned int                            lgfrl_count;
+       /* requested operation: LNET_CTL_DROP_LIST or LNET_CTL_DELAY_LIST */
+       u32                                     lgfrl_opc;
+       /* collected rule snapshots, indexed from 0 */
+       GENRADIX(struct lnet_rule_properties)   lgfrl_list;
+};
+
 int lnet_fault_ctl(int cmd, struct libcfs_ioctl_data *data);
 int lnet_fault_init(void);
 void lnet_fault_fini(void);
 
 bool lnet_drop_rule_match(struct lnet_hdr *hdr, struct lnet_nid *local_nid,
                          enum lnet_msg_hstatus *hstatus);
-
+int lnet_drop_rule_collect(struct lnet_genl_fault_rule_list *rlist);
 int lnet_delay_rule_add(struct lnet_fault_large_attr *attr);
 int lnet_delay_rule_del(struct lnet_nid *src, struct lnet_nid *dst,
                        bool shutdown);
 int lnet_delay_rule_list(int pos, struct lnet_fault_large_attr *attr,
                         struct lnet_fault_stat *stat);
+int lnet_delay_rule_collect(struct lnet_genl_fault_rule_list *rlist);
 void lnet_delay_rule_reset(void);
 void lnet_delay_rule_check(void);
 bool lnet_delay_rule_match_locked(struct lnet_hdr *hdr, struct lnet_msg *msg);
index ddd1dac..72820ee 100644 (file)
@@ -1084,7 +1084,7 @@ enum lnet_peer_dist_attr {
 /** enum lnet_debug_recovery_attr              Attributes to report contents of
  *                                             the LNet health recovery queues
  *
- * @LNET_DBG_RECOV_ATTR_UNSPEC                 Unspecified attribyute to catch
+ * @LNET_DBG_RECOV_ATTR_UNSPEC                 Unspecified attribute to catch
  *                                             errors
  * @LNET_DBG_RECOV_ATTR_HDR                    Grouping for NI recovery queue
  *                                             (NLA_NUL_STRING)
@@ -1101,6 +1101,62 @@ enum lnet_debug_recovery_attr {
 
 #define LNET_DBG_RECOV_ATTR_MAX (__LNET_DBG_RECOV_ATTR_MAX_PLUS_ONE - 1)
 
+
+/** enum lnet_fault_rule_attr          Attributes to report LNet fault
+ *                                     injection.
+ *
+ * @LNET_FAULT_ATTR_UNSPEC             Unspecified attribute to catch errors
+ * @LNET_FAULT_ATTR_PAD                        Pad attribute for 64b alignment
+ *                                     (same value as
+ *                                     LNET_FAULT_ATTR_UNSPEC)
+ *
+ * @LNET_FAULT_ATTR_HDR                        Grouping for "fault"
+ *                                     (NLA_NUL_STRING)
+ * @LNET_FAULT_ATTR_FA_TYPE            The type of fault injection rule. i.e.
+ *                                     either a "drop" rule or a "delay" rule.
+ *                                     (NLA_STRING)
+ * @LNET_FAULT_ATTR_FA_SRC             For a description of this field, and
+ *                                     the ones below, refer to
+ *                                     struct lnet_fault_attr
+ *                                     (NLA_STRING)
+ * @LNET_FAULT_ATTR_FA_DST             (NLA_STRING)
+ * @LNET_FAULT_ATTR_FA_PTL_MASK                (NLA_U64)
+ * @LNET_FAULT_ATTR_FA_MSG_MASK                (NLA_U32)
+ * @LNET_FAULT_ATTR_DA_RATE            Drop rules only (NLA_U32)
+ * @LNET_FAULT_ATTR_DA_INTERVAL                Drop rules only (NLA_U32)
+ * @LNET_FAULT_ATTR_DS_DROPPED         Drop rules only (NLA_U64)
+ * @LNET_FAULT_ATTR_LA_RATE            Delay rules only (NLA_U32)
+ * @LNET_FAULT_ATTR_LA_INTERVAL                Delay rules only (NLA_U32)
+ * @LNET_FAULT_ATTR_LA_LATENCY         Delay rules only (NLA_U32)
+ * @LNET_FAULT_ATTR_LS_DELAYED         Delay rules only (NLA_U64)
+ * @LNET_FAULT_ATTR_FS_COUNT           (NLA_U64)
+ * @LNET_FAULT_ATTR_FS_PUT             (NLA_U64)
+ * @LNET_FAULT_ATTR_FS_ACK             (NLA_U64)
+ * @LNET_FAULT_ATTR_FS_GET             (NLA_U64)
+ * @LNET_FAULT_ATTR_FS_REPLY           (NLA_U64)
+ */
+enum lnet_fault_rule_attr {
+       LNET_FAULT_ATTR_UNSPEC = 0,
+       LNET_FAULT_ATTR_PAD = LNET_FAULT_ATTR_UNSPEC,
+
+       LNET_FAULT_ATTR_HDR,
+       LNET_FAULT_ATTR_FA_TYPE,
+       LNET_FAULT_ATTR_FA_SRC,
+       LNET_FAULT_ATTR_FA_DST,
+       LNET_FAULT_ATTR_FA_PTL_MASK,
+       LNET_FAULT_ATTR_FA_MSG_MASK,
+       LNET_FAULT_ATTR_DA_RATE,
+       LNET_FAULT_ATTR_DA_INTERVAL,
+       LNET_FAULT_ATTR_DS_DROPPED,
+       LNET_FAULT_ATTR_LA_RATE,
+       LNET_FAULT_ATTR_LA_INTERVAL,
+       LNET_FAULT_ATTR_LA_LATENCY,
+       LNET_FAULT_ATTR_LS_DELAYED,
+       LNET_FAULT_ATTR_FS_COUNT,
+       LNET_FAULT_ATTR_FS_PUT,
+       LNET_FAULT_ATTR_FS_ACK,
+       LNET_FAULT_ATTR_FS_GET,
+       LNET_FAULT_ATTR_FS_REPLY,
+       __LNET_FAULT_ATTR_MAX_PLUS_ONE,
+};
+
+#define LNET_FAULT_ATTR_MAX (__LNET_FAULT_ATTR_MAX_PLUS_ONE - 1)
+
 struct lnet_ni {
        /* chain on the lnet_net structure */
        struct list_head        ni_netlist;
index b9da1b5..6312427 100644 (file)
@@ -43,7 +43,8 @@
  * @LNET_CMD_PEER_DIST:                command to find distance between LNet peers
  * @LNET_CMD_UDSP:             command to manage LNet UDSP rules
  * @LNET_CMD_PEER_FAIL:                command to fail LNet peers
- * @LNET_CMD_DBG_RECOV:                command to debug peers
+ * @LNET_CMD_DBG_RECOV:                command to print recovery queues
+ * @LNET_CMD_FAULT:            command to inject LNet message failures
  */
 enum lnet_commands {
        LNET_CMD_UNSPEC         = 0,
@@ -59,6 +60,7 @@ enum lnet_commands {
        LNET_CMD_UDSP           = 9,
        LNET_CMD_PEER_FAIL      = 10,
        LNET_CMD_DBG_RECOV      = 11,
+       LNET_CMD_FAULT          = 12,
 
        __LNET_CMD_MAX_PLUS_ONE
 };
index fadf9d4..300887c 100644 (file)
@@ -9261,6 +9261,300 @@ static int lnet_old_debug_recovery_show_dump(struct sk_buff *msg,
 }
 #endif
 
+/* Fetch the fault rule dump context stashed in the first callback arg */
+static inline struct lnet_genl_fault_rule_list *
+lnet_fault_dump_ctx(struct netlink_callback *cb)
+{
+       long ctx = cb->args[0];
+
+       return (struct lnet_genl_fault_rule_list *)ctx;
+}
+
+/* Netlink dump completion: release the rule snapshot attached to @cb.
+ * Always returns 0.
+ */
+int lnet_fault_show_done(struct netlink_callback *cb)
+{
+       struct lnet_genl_fault_rule_list *ctx = lnet_fault_dump_ctx(cb);
+
+       ENTRY;
+       /* the context pointer was captured above, so detach it right away */
+       cb->args[0] = 0;
+       if (!ctx)
+               RETURN(0);
+
+       genradix_free(&ctx->lgfrl_list);
+       CFS_FREE_PTR(ctx);
+
+       RETURN(0);
+}
+
+/* Netlink dump start handler for LNET_CMD_FAULT.
+ *
+ * Reads the "rule_type" value from the request, allocates the dump
+ * context, stores it in cb->args[0] and fills it with a snapshot of the
+ * drop or delay rules.
+ *
+ * Returns 0 on success. On failure a negative errno is returned:
+ * -ENOMSG for an empty request, -EINVAL for a malformed request or an
+ * unknown rule type, -ENOMEM when the context cannot be allocated, or
+ * the error reported by nla_extract_val() or the collect helper. Any
+ * context already attached to @cb is released before returning an error.
+ */
+int lnet_fault_show_start(struct netlink_callback *cb)
+{
+       struct genlmsghdr *gnlh = nlmsg_data(cb->nlh);
+       struct netlink_ext_ack *extack = NULL;
+       struct nlattr *params = genlmsg_data(gnlh);
+       struct lnet_genl_fault_rule_list *rlist;
+       int msg_len, rem, rc = 0;
+       struct nlattr *entry;
+       s64 opc = 0;
+
+       ENTRY;
+#ifdef HAVE_NL_DUMP_WITH_EXT_ACK
+       extack = cb->extack;
+#endif
+       msg_len = genlmsg_len(gnlh);
+       if (!msg_len) {
+               NL_SET_ERR_MSG(extack, "no configuration");
+               RETURN(-ENOMSG);
+       }
+
+       if (!(nla_type(params) & LN_SCALAR_ATTR_LIST)) {
+               NL_SET_ERR_MSG(extack, "invalid configuration");
+               RETURN(-EINVAL);
+       }
+
+       /* scan the request for the "rule_type" key and read its value */
+       nla_for_each_attr(entry, params, msg_len, rem) {
+               if (nla_type(entry) != LN_SCALAR_ATTR_VALUE)
+                       continue;
+
+               if (nla_strcmp(entry, "rule_type") == 0) {
+                       rc = nla_extract_val(&entry, &rem,
+                                            LN_SCALAR_ATTR_INT_VALUE,
+                                            (void *)&opc, sizeof(opc), extack);
+                       if (rc < 0)
+                               GOTO(report_error, rc);
+               }
+       }
+
+       CDEBUG(D_NET, "Got opc %lld\n", opc);
+
+       if (opc != LNET_CTL_DROP_LIST && opc != LNET_CTL_DELAY_LIST) {
+               NL_SET_ERR_MSG(extack, "invalid operation");
+               GOTO(report_error, rc = -EINVAL);
+       }
+
+       CFS_ALLOC_PTR(rlist);
+       if (!rlist) {
+               NL_SET_ERR_MSG(extack, "No memory for rule list");
+               RETURN(-ENOMEM);
+       }
+
+       genradix_init(&rlist->lgfrl_list);
+       rlist->lgfrl_count = 0;
+       rlist->lgfrl_index = 0;
+       rlist->lgfrl_opc = opc;
+       /* attach the context so the dump and done callbacks can find it */
+       cb->args[0] = (long)rlist;
+
+       rc = -ENOENT;
+       if (opc == LNET_CTL_DROP_LIST)
+               rc = lnet_drop_rule_collect(rlist);
+       else if (opc == LNET_CTL_DELAY_LIST)
+               rc = lnet_delay_rule_collect(rlist);
+report_error:
+       /* on failure free whatever context is attached to the callback */
+       if (rc < 0)
+               lnet_fault_show_done(cb);
+
+       RETURN(rc);
+}
+
+/* Key table for LNET_CMD_FAULT replies: maps each LNET_FAULT_ATTR_*
+ * attribute to the key name and Netlink data type used when reporting
+ * fault rules. The "fault" header entry is a sequence of mappings.
+ */
+static const struct ln_key_list fault_attr_list = {
+       .lkl_maxattr                    = LNET_FAULT_ATTR_MAX,
+       .lkl_list                       = {
+               [LNET_FAULT_ATTR_HDR]           = {
+                       .lkp_value              = "fault",
+                       .lkp_key_format         = LNKF_SEQUENCE | LNKF_MAPPING,
+                       .lkp_data_type          = NLA_NUL_STRING,
+               },
+               [LNET_FAULT_ATTR_FA_TYPE]       = {
+                       .lkp_value              = "rule_type",
+                       .lkp_data_type          = NLA_STRING
+               },
+               [LNET_FAULT_ATTR_FA_SRC]        = {
+                       .lkp_value              = "fa_src",
+                       .lkp_data_type          = NLA_STRING
+               },
+               [LNET_FAULT_ATTR_FA_DST]        = {
+                       .lkp_value              = "fa_dst",
+                       .lkp_data_type          = NLA_STRING
+               },
+               [LNET_FAULT_ATTR_FA_PTL_MASK]   = {
+                       .lkp_value              = "fa_ptl_mask",
+                       .lkp_data_type          = NLA_U64
+               },
+               [LNET_FAULT_ATTR_FA_MSG_MASK]   = {
+                       .lkp_value              = "fa_msg_mask",
+                       .lkp_data_type          = NLA_U32
+               },
+               [LNET_FAULT_ATTR_DA_RATE]       = {
+                       .lkp_value              = "da_rate",
+                       .lkp_data_type          = NLA_U32
+               },
+               [LNET_FAULT_ATTR_DA_INTERVAL]   = {
+                       .lkp_value              = "da_interval",
+                       .lkp_data_type          = NLA_U32
+               },
+               [LNET_FAULT_ATTR_DS_DROPPED]    = {
+                       .lkp_value              = "ds_dropped",
+                       .lkp_data_type          = NLA_U64
+               },
+               [LNET_FAULT_ATTR_LA_RATE]       = {
+                       .lkp_value              = "la_rate",
+                       .lkp_data_type          = NLA_U32
+               },
+               [LNET_FAULT_ATTR_LA_INTERVAL]   = {
+                       .lkp_value              = "la_interval",
+                       .lkp_data_type          = NLA_U32
+               },
+               [LNET_FAULT_ATTR_LA_LATENCY]    = {
+                       .lkp_value              = "la_latency",
+                       .lkp_data_type          = NLA_U32
+               },
+               [LNET_FAULT_ATTR_LS_DELAYED]    = {
+                       .lkp_value              = "ls_delayed",
+                       .lkp_data_type          = NLA_U64
+               },
+               [LNET_FAULT_ATTR_FS_COUNT]      = {
+                       .lkp_value              = "fs_count",
+                       .lkp_data_type          = NLA_U64
+               },
+               [LNET_FAULT_ATTR_FS_PUT]        = {
+                       .lkp_value              = "fs_put",
+                       .lkp_data_type          = NLA_U64
+               },
+               [LNET_FAULT_ATTR_FS_ACK]        = {
+                       .lkp_value              = "fs_ack",
+                       .lkp_data_type          = NLA_U64
+               },
+               [LNET_FAULT_ATTR_FS_GET]        = {
+                       .lkp_value              = "fs_get",
+                       .lkp_data_type          = NLA_U64
+               },
+               [LNET_FAULT_ATTR_FS_REPLY]      = {
+                       .lkp_value              = "fs_reply",
+                       .lkp_data_type          = NLA_U64
+               },
+       },
+};
+
+/* Netlink dump handler for LNET_CMD_FAULT.
+ *
+ * On the first pass (lgfrl_index == 0) the key table is sent, then one
+ * message is emitted for every rule snapshot gathered by
+ * lnet_fault_show_start(), resuming from lgfrl_index. Attributes that
+ * only apply to drop rules or to delay rules are emitted according to
+ * the requested operation.
+ *
+ * An empty rule list is reported as -ENOENT. The result is whatever
+ * lnet_nl_send_error() returns for the final status.
+ */
+int lnet_fault_show_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+       struct lnet_genl_fault_rule_list *rlist = lnet_fault_dump_ctx(cb);
+#ifdef HAVE_NL_PARSE_WITH_EXT_ACK
+       struct netlink_ext_ack *extack = NULL;
+#endif
+       int portid = NETLINK_CB(cb->skb).portid;
+       int seq = cb->nlh->nlmsg_seq;
+       int idx, rc = 0;
+       u32 opc;
+
+       ENTRY;
+#ifdef HAVE_NL_DUMP_WITH_EXT_ACK
+       extack = cb->extack;
+#endif
+       if (!rlist->lgfrl_count) {
+               /* this dump reports fault rules, not routes */
+               NL_SET_ERR_MSG(extack, "No fault rules found");
+               GOTO(send_error, rc = -ENOENT);
+       }
+
+       idx = rlist->lgfrl_index;
+       if (!idx) {
+               /* first pass: describe the keys before any values */
+               const struct ln_key_list *all[] = {
+                       &fault_attr_list, NULL
+               };
+
+               rc = lnet_genl_send_scalar_list(msg, portid, seq,
+                                               &lnet_family,
+                                               NLM_F_CREATE | NLM_F_MULTI,
+                                               LNET_CMD_FAULT, all);
+               if (rc < 0) {
+                       NL_SET_ERR_MSG(extack, "failed to send key table");
+                       GOTO(send_error, rc);
+               }
+       }
+       opc = rlist->lgfrl_opc;
+
+       while (idx < rlist->lgfrl_count) {
+               struct lnet_rule_properties *prop;
+               void *hdr;
+
+               prop = genradix_ptr(&rlist->lgfrl_list, idx++);
+
+               hdr = genlmsg_put(msg, portid, seq, &lnet_family,
+                                 NLM_F_MULTI, LNET_CMD_FAULT);
+               if (!hdr) {
+                       NL_SET_ERR_MSG(extack, "failed to send values");
+                       genlmsg_cancel(msg, hdr);
+                       GOTO(send_error, rc = -EMSGSIZE);
+               }
+
+               /* idx was already advanced, so 1 identifies the first rule */
+               if (idx == 1)
+                       nla_put_string(msg, LNET_FAULT_ATTR_HDR, "");
+
+               nla_put_string(msg, LNET_FAULT_ATTR_FA_TYPE,
+                              opc == LNET_CTL_DROP_LIST ? "drop" : "delay");
+
+               nla_put_string(msg, LNET_FAULT_ATTR_FA_SRC,
+                              libcfs_nidstr(&prop->attr.fa_src));
+               nla_put_string(msg, LNET_FAULT_ATTR_FA_DST,
+                              libcfs_nidstr(&prop->attr.fa_dst));
+
+               nla_put_u64_64bit(msg, LNET_FAULT_ATTR_FA_PTL_MASK,
+                                 prop->attr.fa_ptl_mask,
+                                 LNET_FAULT_ATTR_PAD);
+               nla_put_u32(msg, LNET_FAULT_ATTR_FA_MSG_MASK,
+                           prop->attr.fa_msg_mask);
+
+               if (opc == LNET_CTL_DROP_LIST) {
+                       nla_put_u32(msg, LNET_FAULT_ATTR_DA_RATE,
+                                   prop->attr.u.drop.da_rate);
+                       nla_put_u32(msg, LNET_FAULT_ATTR_DA_INTERVAL,
+                                   prop->attr.u.drop.da_interval);
+                       nla_put_u64_64bit(msg, LNET_FAULT_ATTR_DS_DROPPED,
+                                         prop->stat.u.drop.ds_dropped,
+                                         LNET_FAULT_ATTR_PAD);
+               } else if (opc == LNET_CTL_DELAY_LIST) {
+                       nla_put_u32(msg, LNET_FAULT_ATTR_LA_RATE,
+                                   prop->attr.u.delay.la_rate);
+                       nla_put_u32(msg, LNET_FAULT_ATTR_LA_INTERVAL,
+                                   prop->attr.u.delay.la_interval);
+                       nla_put_u32(msg, LNET_FAULT_ATTR_LA_LATENCY,
+                                   prop->attr.u.delay.la_latency);
+                       nla_put_u64_64bit(msg, LNET_FAULT_ATTR_LS_DELAYED,
+                                         prop->stat.u.delay.ls_delayed,
+                                         LNET_FAULT_ATTR_PAD);
+               }
+               nla_put_u64_64bit(msg, LNET_FAULT_ATTR_FS_COUNT,
+                                 prop->stat.fs_count,
+                                 LNET_FAULT_ATTR_PAD);
+               nla_put_u64_64bit(msg, LNET_FAULT_ATTR_FS_PUT,
+                                 prop->stat.fs_put,
+                                 LNET_FAULT_ATTR_PAD);
+               nla_put_u64_64bit(msg, LNET_FAULT_ATTR_FS_ACK,
+                                 prop->stat.fs_ack,
+                                 LNET_FAULT_ATTR_PAD);
+               nla_put_u64_64bit(msg, LNET_FAULT_ATTR_FS_GET,
+                                 prop->stat.fs_get,
+                                 LNET_FAULT_ATTR_PAD);
+               nla_put_u64_64bit(msg, LNET_FAULT_ATTR_FS_REPLY,
+                                 prop->stat.fs_reply,
+                                 LNET_FAULT_ATTR_PAD);
+               genlmsg_end(msg, hdr);
+       }
+       /* remember where a follow-up dump call has to resume */
+       rlist->lgfrl_index = idx;
+send_error:
+       return lnet_nl_send_error(cb->skb, portid, seq, rc);
+}
+
+#ifndef HAVE_NETLINK_CALLBACK_START
+/* Dump entry point used when HAVE_NETLINK_CALLBACK_START is not defined:
+ * run the start handler here the first time, i.e. while no context is
+ * attached to the callback yet, then continue with the regular dump.
+ */
+int lnet_old_fault_show_dump(struct sk_buff *msg, struct netlink_callback *cb)
+{
+       int rc;
+
+       if (cb->args[0])
+               return lnet_fault_show_dump(msg, cb);
+
+       rc = lnet_fault_show_start(cb);
+       if (rc < 0)
+               return lnet_nl_send_error(cb->skb,
+                                         NETLINK_CB(cb->skb).portid,
+                                         cb->nlh->nlmsg_seq,
+                                         rc);
+
+       return lnet_fault_show_dump(msg, cb);
+}
+#endif
+
 static const struct genl_multicast_group lnet_mcast_grps[] = {
        { .name =       "ip2net",       },
        { .name =       "net",          },
@@ -9270,6 +9564,7 @@ static const struct genl_multicast_group lnet_mcast_grps[] = {
        { .name =       "discover",     },
        { .name =       "cpt-of-nid",   },
        { .name =       "dbg-recov",    },
+       { .name =       "fault",        },
 };
 
 static const struct genl_ops lnet_genl_ops[] = {
@@ -9362,6 +9657,17 @@ static const struct genl_ops lnet_genl_ops[] = {
 #endif
                .done           = lnet_debug_recovery_show_done,
        },
+       {
+               .cmd            = LNET_CMD_FAULT,
+               .flags          = GENL_ADMIN_PERM,
+#ifdef HAVE_NETLINK_CALLBACK_START
+               .start          = lnet_fault_show_start,
+               .dumpit         = lnet_fault_show_dump,
+#else
+               .dumpit         = lnet_old_fault_show_dump,
+#endif
+               .done           = lnet_fault_show_done,
+       },
 };
 
 static struct genl_family lnet_family = {
index 5f165a1..a611d40 100644 (file)
@@ -288,6 +288,33 @@ lnet_drop_rule_list(int pos, struct lnet_fault_large_attr *attr,
        RETURN(rc);
 }
 
+/**
+ * Snapshot every drop rule into @rlist for a later Netlink dump.
+ *
+ * One entry holding the rule attributes and statistics is stored in
+ * rlist->lgfrl_list per rule, and rlist->lgfrl_count is advanced for
+ * each entry stored.
+ *
+ * Returns 0 on success or -ENOMEM if an entry could not be allocated.
+ */
+int lnet_drop_rule_collect(struct lnet_genl_fault_rule_list *rlist)
+{
+       struct lnet_drop_rule *rule;
+       int cpt, rc = 0;
+
+       ENTRY;
+       cpt = lnet_net_lock_current();
+       list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) {
+               struct lnet_rule_properties *prop;
+
+               /* The net lock is held across the whole walk, so the
+                * allocation must not sleep: GFP_ATOMIC, not GFP_KERNEL.
+                */
+               prop = genradix_ptr_alloc(&rlist->lgfrl_list,
+                                         rlist->lgfrl_count,
+                                         GFP_ATOMIC);
+               if (!prop) {
+                       rc = -ENOMEM;
+                       break;
+               }
+               /* only count entries that really exist in the list */
+               rlist->lgfrl_count++;
+
+               /* copy attributes and counters as one consistent pair */
+               spin_lock(&rule->dr_lock);
+               prop->attr = rule->dr_attr;
+               prop->stat = rule->dr_stat;
+               spin_unlock(&rule->dr_lock);
+       }
+
+       lnet_net_unlock(cpt);
+       RETURN(rc);
+}
+
 /**
  * reset counters for all drop rules
  */
@@ -1013,6 +1040,33 @@ lnet_delay_rule_list(int pos, struct lnet_fault_large_attr *attr,
        RETURN(rc);
 }
 
+/**
+ * Snapshot every delay rule into @rlist for a later Netlink dump.
+ *
+ * One entry holding the rule attributes and statistics is stored in
+ * rlist->lgfrl_list per rule, and rlist->lgfrl_count is advanced for
+ * each entry stored.
+ *
+ * Returns 0 on success or -ENOMEM if an entry could not be allocated.
+ */
+int lnet_delay_rule_collect(struct lnet_genl_fault_rule_list *rlist)
+{
+       struct lnet_delay_rule *rule;
+       int cpt, rc = 0;
+
+       ENTRY;
+       cpt = lnet_net_lock_current();
+       list_for_each_entry(rule, &the_lnet.ln_delay_rules, dl_link) {
+               struct lnet_rule_properties *prop;
+
+               /* The net lock is held across the whole walk, so the
+                * allocation must not sleep: GFP_ATOMIC, not GFP_KERNEL.
+                */
+               prop = genradix_ptr_alloc(&rlist->lgfrl_list,
+                                         rlist->lgfrl_count,
+                                         GFP_ATOMIC);
+               if (!prop) {
+                       rc = -ENOMEM;
+                       break;
+               }
+               /* only count entries that really exist in the list */
+               rlist->lgfrl_count++;
+
+               /* copy attributes and counters as one consistent pair */
+               spin_lock(&rule->dl_lock);
+               prop->attr = rule->dl_attr;
+               prop->stat = rule->dl_stat;
+               spin_unlock(&rule->dl_lock);
+       }
+
+       lnet_net_unlock(cpt);
+       RETURN(rc);
+}
+
 /**
  * reset counters for all Delay Rules
  */
index ec7decb..6df3100 100644 (file)
@@ -375,7 +375,6 @@ int yaml_netlink_setup_emitter(yaml_emitter_t *output, struct nl_sock *sk,
                goto emitter_error;
 
        yaml_emitter_open(output);
-       yaml_emitter_set_indent(output, 6);
        yaml_document_start_event_initialize(&hdr, NULL, NULL, NULL, 0);
        rc = yaml_emitter_emit(output, &hdr);
        if (rc == 0)
@@ -529,6 +528,131 @@ free_reply:
        return rc == 1 ? 0 : rc;
 }
 
+/* Describe a fault list operation for use in "failed to ..." messages.
+ * Unknown opcodes yield a generic placeholder string.
+ */
+static char *
+fault_opc_to_str(__u32 opc)
+{
+       switch (opc) {
+       case LNET_CTL_DROP_LIST:
+               return "list drop rules";
+       case LNET_CTL_DELAY_LIST:
+               return "list delay rules";
+       default:
+               return "unrecognized command";
+       }
+}
+
+/* Ask the kernel, over Netlink, for the list of LNet fault rules.
+ *
+ * The request sent is the YAML mapping "fault: { rule_type: <opc> }"
+ * issued as an NLM_F_DUMP of LNET_CMD_FAULT; the reply is loaded into
+ * @results.
+ *
+ * @results    document that receives the parsed reply
+ * @opc                rule list operation, written as the "rule_type" value
+ * @src, @dst, @local_nid, @attr
+ *             not referenced by this function
+ *
+ * Returns 0 on success, and also when loading the reply fails with errno
+ * equal to -ENOENT, which the dump uses for an empty rule list.
+ * Returns -EOPNOTSUPP if the socket or the parser cannot be created,
+ * -EINVAL if the request could not be emitted, otherwise the errno
+ * value left by the failed operation.
+ */
+int yaml_lnet_fault_rule(yaml_document_t *results, __u32 opc, char *src,
+                        char *dst, char *local_nid,
+                        struct lnet_fault_attr *attr)
+{
+       struct nl_sock *sk = NULL;
+       char num[INT_STRING_LEN];
+       const char *msg = NULL;
+       int flags = NLM_F_DUMP;
+       yaml_emitter_t output;
+       yaml_parser_t reply;
+       yaml_event_t event;
+       int rc;
+
+       /* Create Netlink emitter to send request to kernel */
+       sk = nl_socket_alloc();
+       if (!sk)
+               return -EOPNOTSUPP;
+
+       /* Setup parser to receive Netlink packets */
+       rc = yaml_parser_initialize(&reply);
+       if (rc == 0) {
+               nl_socket_free(sk);
+               return -EOPNOTSUPP;
+       }
+
+       rc = yaml_parser_set_input_netlink(&reply, sk, false);
+       if (rc == 0)
+               goto free_reply;
+
+       rc = yaml_netlink_setup_emitter(&output, sk, LNET_GENL_NAME,
+                                       LNET_GENL_VERSION, flags,
+                                       LNET_CMD_FAULT, false);
+       if (rc == 0)
+               goto emitter_error;
+
+       /* key "fault" opens the request */
+       yaml_scalar_event_initialize(&event, NULL,
+                                    (yaml_char_t *)YAML_STR_TAG,
+                                    (yaml_char_t *)"fault",
+                                    strlen("fault"), 1, 0,
+                                    YAML_PLAIN_SCALAR_STYLE);
+       rc = yaml_emitter_emit(&output, &event);
+       if (rc == 0)
+               goto emitter_error;
+
+       yaml_mapping_start_event_initialize(&event, NULL,
+                                           (yaml_char_t *)YAML_MAP_TAG,
+                                           1, YAML_ANY_MAPPING_STYLE);
+       rc = yaml_emitter_emit(&output, &event);
+       if (rc == 0)
+               goto emitter_error;
+
+       /* "rule_type: <opc>" is the only entry of the mapping */
+       yaml_scalar_event_initialize(&event, NULL,
+                                    (yaml_char_t *)YAML_STR_TAG,
+                                    (yaml_char_t *)"rule_type",
+                                    strlen("rule_type"), 1, 0,
+                                    YAML_PLAIN_SCALAR_STYLE);
+       rc = yaml_emitter_emit(&output, &event);
+       if (rc == 0)
+               goto emitter_error;
+
+       snprintf(num, sizeof(num), "%d", opc);
+       yaml_scalar_event_initialize(&event, NULL,
+                                    (yaml_char_t *)YAML_INT_TAG,
+                                    (yaml_char_t *)num,
+                                    strlen(num), 1, 0,
+                                    YAML_PLAIN_SCALAR_STYLE);
+       rc = yaml_emitter_emit(&output, &event);
+       if (rc == 0)
+               goto emitter_error;
+
+       yaml_mapping_end_event_initialize(&event);
+       rc = yaml_emitter_emit(&output, &event);
+       if (rc == 0)
+               goto emitter_error;
+
+       rc = yaml_netlink_complete_emitter(&output);
+emitter_error:
+       /* rc == 0 here means one of the emitter steps above failed */
+       if (rc == 0) {
+               yaml_emitter_log_error(&output, stderr);
+               rc = -EINVAL;
+       } else {
+               rc = yaml_parser_load(&reply, results);
+               if (rc == 0) {
+                       msg = yaml_parser_get_reader_error(&reply);
+                       /* dump routine returns ENOENT if the rule list is
+                        * empty. This is not an error condition.
+                        */
+                       if (errno == -ENOENT && (flags & NLM_F_DUMP))
+                               rc = 1;
+               }
+       }
+       yaml_emitter_delete(&output);
+free_reply:
+       /* rc == 0 here means the parser setup or the reply load failed */
+       if (rc == 0) {
+               if (!msg)
+                       msg = yaml_parser_get_reader_error(&reply);
+
+               if (strcmp(msg, "Unspecific failure") != 0) {
+                       fprintf(stdout, "failed to %s: %s\n",
+                               fault_opc_to_str(opc), msg);
+               } else {
+                       fprintf(stdout, "failed to %s: %s\n",
+                               fault_opc_to_str(opc), strerror(errno));
+               }
+               rc = errno;
+       }
+       yaml_parser_delete(&reply);
+       nl_socket_free(sk);
+
+       /* 1 is the internal success marker; callers see 0 */
+       return rc == 1 ? 0 : rc;
+}
+
 static int dispatch_peer_ni_cmd(__u32 cmd, struct lnet_ioctl_peer_cfg *data,
                                char *err_str, char *cmd_str)
 {
index 8a852a2..e9e2198 100644 (file)
@@ -25,6 +25,7 @@
 #include <libcfs/util/ioctl.h>
 #include <libcfs/util/string.h>
 #include <linux/lnet/lnet-dlc.h>
+#include <linux/lnet/lnetctl.h>
 #include <linux/lnet/nidstr.h>
 
 #define LUSTRE_CFG_RC_NO_ERR                    0 /* success */
@@ -792,6 +793,10 @@ void lustre_lnet_free_list(struct nid_node *head);
  */
 int yaml_lnet_configure(int flags, const char **msg);
 
+/**
+ * yaml_lnet_fault_rule
+ *
+ * Request the list of LNet fault injection rules from the kernel using
+ * Netlink and load the reply into @results.
+ *
+ * results - document that receives the reply
+ * opc - rule list operation (LNET_CTL_DROP_LIST or LNET_CTL_DELAY_LIST)
+ * src, dst, local_nid, attr - callers in lnetctl pass NULL
+ *
+ * Returns 0 on success and a non-zero error value otherwise.
+ */
+int yaml_lnet_fault_rule(yaml_document_t *results, __u32 opc, char *src,
+                        char *dst, char *local_nid,
+                        struct lnet_fault_attr *attr);
+
 /**
  * yaml_emitter_set_output_netlink
  *
index 152a694..7b59686 100644 (file)
@@ -43,6 +43,7 @@ static int jt_show_peer(int argc, char **argv);
 static int jt_show_recovery(int argc, char **argv);
 static int jt_show_global(int argc, char **argv);
 static int jt_show_udsp(int argc, char **argv);
+static int jt_show_fault(int argc, char **argv);
 static int jt_set_tiny(int argc, char **argv);
 static int jt_set_small(int argc, char **argv);
 static int jt_set_large(int argc, char **argv);
@@ -62,7 +63,6 @@ static int jt_set_drop_asym_route(int argc, char **argv);
 static int jt_list_peer(int argc, char **argv);
 static int jt_add_udsp(int argc, char **argv);
 static int jt_del_udsp(int argc, char **argv);
-/*static int jt_show_peer(int argc, char **argv);*/
 static int jt_import(int argc, char **argv);
 static int jt_export(int argc, char **argv);
 static int jt_ping(int argc, char **argv);
@@ -82,6 +82,7 @@ static int jt_calc_service_id(int argc, char **argv);
 static int jt_set_response_tracking(int argc, char **argv);
 static int jt_set_recovery_limit(int argc, char **argv);
 static int jt_udsp(int argc, char **argv);
+static int jt_fault(int argc, char **argv);
 static int jt_setup_mrrouting(int argc, char **argv);
 static int jt_calc_cpt_of_nid(int argc, char **argv);
 static int jt_show_peer_debug_info(int argc, char **argv);
@@ -107,6 +108,7 @@ command_t cmd_list[] = {
        {"discover", jt_discover, 0, "discover nid[,nid,...]"},
        {"service-id", jt_calc_service_id, 0, "Calculate IB Lustre service ID\n"},
        {"udsp", jt_udsp, 0, "udsp {add | del | help}"},
+       {"fault", jt_fault, 0, "fault {show | help}"},
        {"setup-mrrouting", jt_setup_mrrouting, 0,
         "setup linux routing tables\n"},
        {"cpt-of-nid", jt_calc_cpt_of_nid, 0,
@@ -296,6 +298,12 @@ command_t udsp_cmds[] = {
        { 0, 0, 0, NULL }
 };
 
+/* Sub-commands of "lnetctl fault"; the zeroed entry terminates the list */
+command_t fault_cmds[] = {
+       {"show", jt_show_fault, 0, "show fault rules\n"
+        "\t--rule_type t: Show LNet fault rules of type t.\n"},
+       { 0, 0, 0, NULL }
+};
+
 static int parse_long(const char *number, long int *value)
 {
        char *end;
@@ -4011,6 +4019,17 @@ static int jt_udsp(int argc, char **argv)
        return cfs_parser(argc, argv, udsp_cmds);
 }
 
+/* Entry point of "lnetctl fault": validate the invocation, then hand the
+ * arguments to the fault sub-command table.
+ */
+static int jt_fault(int argc, char **argv)
+{
+       int rc = check_cmd(fault_cmds, "fault", NULL, 2, argc, argv);
+
+       return rc ? rc : cfs_parser(argc, argv, fault_cmds);
+}
+
 static int yaml_import_global_settings(char *key, unsigned long value,
                                       char cmd, struct cYAML *show_rc,
                                       struct cYAML *err_rc)
@@ -5475,6 +5494,57 @@ static int jt_del_udsp(int argc, char **argv)
        return rc;
 }
 
+/*
+ * Handler for "fault show": request the LNet fault rules through
+ * yaml_lnet_fault_rule() and dump the resulting YAML document to stdout.
+ *
+ * Options:
+ *   -t, --rule_type <type>  "delay" selects LNET_CTL_DELAY_LIST, "drop"
+ *                           selects LNET_CTL_DROP_LIST; any other value
+ *                           is rejected with -EINVAL.
+ *
+ * Returns 0 on success, and also when getopt_long() reports anything
+ * other than -t; returns a negative value on failure.
+ *
+ * NOTE(review): when -t is not given the request is issued with opc == 0;
+ * confirm that yaml_lnet_fault_rule() treats that as intended.
+ */
+int jt_show_fault(int argc, char **argv)
+{
+       const char *const short_options = "t:";
+       static const struct option long_options[] = {
+               { .name = "rule_type",  .has_arg = required_argument, .val = 't' },
+               { .name = NULL }
+       };
+       yaml_document_t results;
+       yaml_emitter_t debug;
+       int opc = 0, opt, rc;
+
+       /*
+        * NOTE(review): jt_fault() returns on any non-zero check_cmd()
+        * result while only negative values stop us here; confirm whether
+        * a positive result should end processing as well.
+        */
+       rc = check_cmd(fault_cmds, "fault", "show", 2, argc, argv);
+       if (rc < 0)
+               return rc;
+
+       while ((opt = getopt_long(argc, argv, short_options,
+                                 long_options, NULL)) != -1) {
+               switch (opt) {
+               case 't':
+                       if (strcmp(optarg, "delay") == 0)
+                               opc = LNET_CTL_DELAY_LIST;
+                       else if (strcmp(optarg, "drop") == 0)
+                               opc = LNET_CTL_DROP_LIST;
+                       else
+                               return -EINVAL;
+                       break;
+               default:
+                       return 0;
+               }
+       }
+
+       rc = yaml_lnet_fault_rule(&results, opc, NULL, NULL, NULL, NULL);
+       if (rc < 0)
+               return rc;
+
+       /* yaml_emitter_initialize() reports failure by returning 0 */
+       if (!yaml_emitter_initialize(&debug)) {
+               /* the document fetched above must not be leaked */
+               yaml_document_delete(&results);
+               return -EINVAL;
+       }
+
+       yaml_emitter_set_indent(&debug, LNET_DEFAULT_INDENT);
+       yaml_emitter_set_output_file(&debug, stdout);
+       rc = yaml_emitter_dump(&debug, &results);
+
+       yaml_emitter_delete(&debug);
+       yaml_document_delete(&results);
+
+       /* yaml_emitter_dump() likewise returns 0 on failure */
+       return rc == 0 ? -EINVAL : 0;
+}
+
 int main(int argc, char **argv)
 {
        int rc = 0;
index 0f259bc..5475d98 100644 (file)
@@ -2812,18 +2812,158 @@ jt_ptl_delay_reset(int argc, char **argv)
                                      argc, argv);
 }
 
+/*
+ * Print one fault rule and its counters on a single stdout line.
+ *
+ * @opc   LNET_CTL_DROP_LIST prints the drop view of @attr/@stat,
+ *        LNET_CTL_DELAY_LIST prints the delay view (with latency);
+ *        any other value prints nothing.
+ * @src   source NID of the rule
+ * @dst   destination NID of the rule
+ * @attr  rule attributes (rate, interval, portal and message masks)
+ * @stat  rule statistics (totals and per-message-type counters)
+ */
+static void print_fault_rules(__u32 opc, struct lnet_nid *src,
+                             struct lnet_nid *dst,
+                             struct lnet_fault_attr *attr,
+                             struct lnet_fault_stat *stat)
+{
+       switch (opc) {
+       case LNET_CTL_DROP_LIST:
+               printf("%s->%s (1/%d | %d) ptl %#jx, msg %x, %ju/%ju, PUT %ju, ACK %ju, GET %ju, REP %ju\n",
+                      libcfs_nidstr(src),
+                      libcfs_nidstr(dst),
+                      attr->u.drop.da_rate, attr->u.drop.da_interval,
+                      (uintmax_t)attr->fa_ptl_mask, attr->fa_msg_mask,
+                      (uintmax_t)stat->u.drop.ds_dropped,
+                      (uintmax_t)stat->fs_count,
+                      (uintmax_t)stat->fs_put,
+                      (uintmax_t)stat->fs_ack,
+                      (uintmax_t)stat->fs_get,
+                      (uintmax_t)stat->fs_reply);
+               break;
+       case LNET_CTL_DELAY_LIST:
+               printf("%s->%s (1/%d | %d, latency %d) ptl %#jx, msg %x, %ju/%ju, PUT %ju, ACK %ju, GET %ju, REP %ju\n",
+                      libcfs_nidstr(src),
+                      libcfs_nidstr(dst),
+                      attr->u.delay.la_rate, attr->u.delay.la_interval,
+                      attr->u.delay.la_latency,
+                      (uintmax_t)attr->fa_ptl_mask, attr->fa_msg_mask,
+                      (uintmax_t)stat->u.delay.ls_delayed,
+                      (uintmax_t)stat->fs_count,
+                      (uintmax_t)stat->fs_put,
+                      (uintmax_t)stat->fs_ack,
+                      (uintmax_t)stat->fs_get,
+                      (uintmax_t)stat->fs_reply);
+               break;
+       default:
+               /* other opcodes have no printable form here */
+               break;
+       }
+}
+
 static int
 fault_simul_rule_list(__u32 opc, char *name, int argc, char **argv)
 {
        struct libcfs_ioctl_data data = { { 0 } };
-       struct lnet_fault_attr   attr;
-       struct lnet_fault_stat   stat;
-       int pos;
+       struct lnet_nid src = {}, dst = {};
+       struct lnet_fault_attr attr;
+       struct lnet_fault_stat stat;
+       yaml_document_t results;
+       yaml_node_t *node;
+       bool first = true;
+       int pos, rc;
+       int i = 2;
+
+       rc = yaml_lnet_fault_rule(&results, opc, NULL, NULL, NULL, NULL);
+       if (rc < 0) {
+               if (rc == -EOPNOTSUPP)
+                       goto old_api;
+               return rc;
+       }
 
+       memset(&attr, 0, sizeof(attr));
+       memset(&stat, 0, sizeof(stat));
+       pos = 0;
+
+       while ((node = yaml_document_get_node(&results, i++)) != NULL) {
+               yaml_node_t *next;
+               char *tmp;
+
+               if (node->type == YAML_MAPPING_NODE) {
+                       if (first) {
+                               fprintf(stderr, "LNet %s rules:\n",
+                                       opc == LNET_CTL_DELAY_LIST ?
+                                       "delay" : "drop");
+                               first = false;
+                       } else {
+                               print_fault_rules(opc, &src, &dst,
+                                                 &attr, &stat);
+                       }
+                       pos++;
+               }
+
+               if (node->type != YAML_SCALAR_NODE)
+                       continue;
+
+               tmp = (char *)node->data.scalar.value;
+               if (strcmp("fa_src", tmp) == 0) {
+                       next = yaml_document_get_node(&results, i);
+                       tmp = (char *)next->data.scalar.value;
+                       libcfs_strnid(&src, tmp);
+               } else if (strcmp("fa_dst", tmp) == 0) {
+                       next = yaml_document_get_node(&results, i);
+                       tmp = (char *)next->data.scalar.value;
+                       libcfs_strnid(&dst, tmp);
+               } else if (strcmp("fa_ptl_mask", tmp) == 0) {
+                       next = yaml_document_get_node(&results, i);
+                       tmp = (char *)next->data.scalar.value;
+                       attr.fa_ptl_mask = strtoul(tmp, NULL, 0);
+               } else if (strcmp("fa_msg_mask", tmp) == 0) {
+                       next = yaml_document_get_node(&results, i);
+                       tmp = (char *)next->data.scalar.value;
+                       attr.fa_msg_mask = strtoul(tmp, NULL, 0);
+               } else if (strcmp("la_rate", tmp) == 0) {
+                       next = yaml_document_get_node(&results, i);
+                       tmp = (char *)next->data.scalar.value;
+                       attr.u.delay.la_rate = strtoul(tmp, NULL, 0);
+               } else if (strcmp("la_interval", tmp) == 0) {
+                       next = yaml_document_get_node(&results, i);
+                       tmp = (char *)next->data.scalar.value;
+                       attr.u.delay.la_interval = strtoul(tmp, NULL, 0);
+               } else if (strcmp("la_latency", tmp) == 0) {
+                       next = yaml_document_get_node(&results, i);
+                       tmp = (char *)next->data.scalar.value;
+                       attr.u.delay.la_latency = strtoul(tmp, NULL, 0);
+               } else if (strcmp("da_rate", tmp) == 0) {
+                       next = yaml_document_get_node(&results, i);
+                       tmp = (char *)next->data.scalar.value;
+                       attr.u.drop.da_rate = strtoul(tmp, NULL, 0);
+               } else if (strcmp("da_interval", tmp) == 0) {
+                       next = yaml_document_get_node(&results, i);
+                       tmp = (char *)next->data.scalar.value;
+                       attr.u.drop.da_interval = strtoul(tmp, NULL, 0);
+               } else if (strcmp("ds_dropped", tmp) == 0) {
+                       next = yaml_document_get_node(&results, i);
+                       tmp = (char *)next->data.scalar.value;
+                       stat.u.drop.ds_dropped = strtoul(tmp, NULL, 0);
+               } else if (strcmp("ls_delayed", tmp) == 0) {
+                       next = yaml_document_get_node(&results, i);
+                       tmp = (char *)next->data.scalar.value;
+                       stat.u.delay.ls_delayed = strtoul(tmp, NULL, 0);
+               } else if (strcmp("fs_count", tmp) == 0) {
+                       next = yaml_document_get_node(&results, i);
+                       tmp = (char *)next->data.scalar.value;
+                       stat.fs_count = strtoul(tmp, NULL, 0);
+               } else if (strcmp("fs_put", tmp) == 0) {
+                       next = yaml_document_get_node(&results, i);
+                       tmp = (char *)next->data.scalar.value;
+                       stat.fs_put = strtoul(tmp, NULL, 0);
+               } else if (strcmp("fs_ack", tmp) == 0) {
+                       next = yaml_document_get_node(&results, i);
+                       tmp = (char *)next->data.scalar.value;
+                       stat.fs_ack = strtoul(tmp, NULL, 0);
+               } else if (strcmp("fs_get", tmp) == 0) {
+                       next = yaml_document_get_node(&results, i);
+                       tmp = (char *)next->data.scalar.value;
+                       stat.fs_get = strtoul(tmp, NULL, 0);
+               } else if (strcmp("fs_reply", tmp) == 0) {
+                       next = yaml_document_get_node(&results, i);
+                       tmp = (char *)next->data.scalar.value;
+                       stat.fs_reply = strtoul(tmp, NULL, 0);
+               }
+       }
+
+       print_fault_rules(opc, &src, &dst, &attr, &stat);
+       printf("found total %d\n", pos);
+       return rc == 0 ? -EINVAL : 0;
+old_api:
+       rc = 0;
        printf("LNet %s rules:\n", name);
        for (pos = 0;; pos++) {
-               int rc;
-
                memset(&attr, 0, sizeof(attr));
                memset(&stat, 0, sizeof(stat));
 
@@ -2844,37 +2984,14 @@ fault_simul_rule_list(__u32 opc, char *name, int argc, char **argv)
 
                libcfs_ioctl_unpack(&data, ioc_buf);
 
-               if (opc == LNET_CTL_DROP_LIST) {
-                       printf("%s->%s (1/%d | %d) ptl %#jx, msg %x, %ju/%ju, PUT %ju, ACK %ju, GET %ju, REP %ju\n",
-                              libcfs_nid2str(attr.fa_src),
-                              libcfs_nid2str(attr.fa_dst),
-                              attr.u.drop.da_rate, attr.u.drop.da_interval,
-                              (uintmax_t)attr.fa_ptl_mask, attr.fa_msg_mask,
-                              (uintmax_t)stat.u.drop.ds_dropped,
-                              (uintmax_t)stat.fs_count,
-                              (uintmax_t)stat.fs_put,
-                              (uintmax_t)stat.fs_ack,
-                              (uintmax_t)stat.fs_get,
-                              (uintmax_t)stat.fs_reply);
-
-               } else if (opc == LNET_CTL_DELAY_LIST) {
-                       printf("%s->%s (1/%d | %d, latency %d) ptl %#jx, msg %x, %ju/%ju, PUT %ju, ACK %ju, GET %ju, REP %ju\n",
-                              libcfs_nid2str(attr.fa_src),
-                              libcfs_nid2str(attr.fa_dst),
-                              attr.u.delay.la_rate, attr.u.delay.la_interval,
-                              attr.u.delay.la_latency,
-                              (uintmax_t)attr.fa_ptl_mask, attr.fa_msg_mask,
-                              (uintmax_t)stat.u.delay.ls_delayed,
-                              (uintmax_t)stat.fs_count,
-                              (uintmax_t)stat.fs_put,
-                              (uintmax_t)stat.fs_ack,
-                              (uintmax_t)stat.fs_get,
-                              (uintmax_t)stat.fs_reply);
-               }
+               lnet_nid4_to_nid(attr.fa_src, &src);
+               lnet_nid4_to_nid(attr.fa_dst, &dst);
+
+               print_fault_rules(opc, &src, &dst, &attr, &stat);
        }
        printf("found total %d\n", pos);
 
-       return 0;
+       return rc;
 }
 
 int