From 826ea19c077b2a3e1a32464a7eb63fba6e460946 Mon Sep 17 00:00:00 2001
From: Amir Shehata
Date: Sun, 5 Aug 2018 14:25:47 -0700
Subject: [PATCH] LU-9120 lnet: print recovery queues content

Add commands to lnetctl to print recovery queues content
from user space. Associated code to handle the IOCTL is added
in LNet module.

for local NIs:
lnetctl debug recovery --local
for peer NIs:
lnetctl debug recovery --peer

Test-Parameters: forbuildonly
Signed-off-by: Amir Shehata
Change-Id: Id136d506772d95381fd5d8346d772177442a84fb
Reviewed-on: https://review.whamcloud.com/32950
Tested-by: Jenkins
Reviewed-by: Sonia Sharma
Reviewed-by: Olaf Weber
---
 lnet/include/uapi/linux/lnet/libcfs_ioctl.h |  3 +-
 lnet/include/uapi/linux/lnet/lnet-dlc.h     |  8 +++
 lnet/lnet/api-ni.c                          | 72 +++++++++++++++++++++++++
 lnet/utils/lnetconfig/liblnetconfig.c       | 84 +++++++++++++++++++++++++++++
 lnet/utils/lnetconfig/liblnetconfig.h       |  6 +++
 lnet/utils/lnetctl.c                        | 61 +++++++++++++++++++++
 6 files changed, 233 insertions(+), 1 deletion(-)

diff --git a/lnet/include/uapi/linux/lnet/libcfs_ioctl.h b/lnet/include/uapi/linux/lnet/libcfs_ioctl.h
index d8080a8..cdac10f 100644
--- a/lnet/include/uapi/linux/lnet/libcfs_ioctl.h
+++ b/lnet/include/uapi/linux/lnet/libcfs_ioctl.h
@@ -149,7 +149,8 @@ struct libcfs_debug_ioctl_data {
 #define IOC_LIBCFS_GET_LOCAL_NI_MSG_STATS _IOWR(IOC_LIBCFS_TYPE, 101, IOCTL_CONFIG_SIZE)
 #define IOC_LIBCFS_SET_HEALHV _IOWR(IOC_LIBCFS_TYPE, 102, IOCTL_CONFIG_SIZE)
 #define IOC_LIBCFS_GET_LOCAL_HSTATS _IOWR(IOC_LIBCFS_TYPE, 103, IOCTL_CONFIG_SIZE)
-#define IOC_LIBCFS_MAX_NR 103
+#define IOC_LIBCFS_GET_RECOVERY_QUEUE _IOWR(IOC_LIBCFS_TYPE, 104, IOCTL_CONFIG_SIZE)
+#define IOC_LIBCFS_MAX_NR 104
 
 extern int libcfs_ioctl_data_adjust(struct libcfs_ioctl_data *data);
 
diff --git a/lnet/include/uapi/linux/lnet/lnet-dlc.h b/lnet/include/uapi/linux/lnet/lnet-dlc.h
index 1d4b98d..52d0c81 100644
--- a/lnet/include/uapi/linux/lnet/lnet-dlc.h
+++ b/lnet/include/uapi/linux/lnet/lnet-dlc.h
@@ -44,6 +44,7 @@
 #define MAX_NUM_SHOW_ENTRIES 32
 #define LNET_MAX_STR_LEN 128
 #define LNET_MAX_SHOW_NUM_CPT 128
+#define LNET_MAX_SHOW_NUM_NID 128
 #define LNET_UNDEFINED_HOPS ((__u32) -1)
 
 /*
@@ -272,6 +273,13 @@ struct lnet_ioctl_reset_health_cfg {
 	lnet_nid_t rh_nid;
 };
 
+struct lnet_ioctl_recovery_list {
+	struct libcfs_ioctl_hdr rlst_hdr;	/* libcfs ioctl header */
+	enum lnet_health_type rlst_type;	/* local or peer NI queue */
+	int rlst_num_nids;			/* entries used in rlst_nid_array */
+	lnet_nid_t rlst_nid_array[LNET_MAX_SHOW_NUM_NID];
+};
+
 struct lnet_ioctl_set_value {
 	struct libcfs_ioctl_hdr sv_hdr;
 	__u32 sv_value;
diff --git a/lnet/lnet/api-ni.c b/lnet/lnet/api-ni.c
index cc7b16c..9a56605 100644
--- a/lnet/lnet/api-ni.c
+++ b/lnet/lnet/api-ni.c
@@ -3303,6 +3303,64 @@ unlock:
 	return rc;
 }
 
+/**
+ * Copy the NIDs of the local NIs on the recovery queue into \a list.
+ *
+ * The queue the_lnet.ln_mt_localNIRecovq is walked under LNET_LOCK_EX. At
+ * most LNET_MAX_SHOW_NUM_NID NIDs are stored; the walk stops once the array
+ * is full.
+ *
+ * \param[out] list	rlst_nid_array and rlst_num_nids are filled in
+ *
+ * \retval 0		always
+ */
+static int
+lnet_get_local_ni_recovery_list(struct lnet_ioctl_recovery_list *list)
+{
+	struct lnet_ni *cur;
+	int count = 0;
+
+	lnet_net_lock(LNET_LOCK_EX);
+	list_for_each_entry(cur, &the_lnet.ln_mt_localNIRecovq, ni_recovery) {
+		list->rlst_nid_array[count++] = cur->ni_nid;
+		if (count == LNET_MAX_SHOW_NUM_NID)
+			break;
+	}
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	list->rlst_num_nids = count;
+	return 0;
+}
+
+/**
+ * Copy the NIDs of the peer NIs on the recovery queue into \a list.
+ *
+ * The queue the_lnet.ln_mt_peerNIRecovq is walked under LNET_LOCK_EX. At
+ * most LNET_MAX_SHOW_NUM_NID NIDs are stored; the walk stops once the array
+ * is full.
+ *
+ * \param[out] list	rlst_nid_array and rlst_num_nids are filled in
+ *
+ * \retval 0		always
+ */
+static int
+lnet_get_peer_ni_recovery_list(struct lnet_ioctl_recovery_list *list)
+{
+	struct lnet_peer_ni *cur;
+	int count = 0;
+
+	lnet_net_lock(LNET_LOCK_EX);
+	list_for_each_entry(cur, &the_lnet.ln_mt_peerNIRecovq, lpni_recovery) {
+		list->rlst_nid_array[count++] = cur->lpni_nid;
+		if (count == LNET_MAX_SHOW_NUM_NID)
+			break;
+	}
+	lnet_net_unlock(LNET_LOCK_EX);
+
+	list->rlst_num_nids = count;
+	return 0;
+}
+
 /**
  * LNet ioctl handler.
* @@ -3526,6 +3564,20 @@ LNetCtl(unsigned int cmd, void *arg) return rc; } + case IOC_LIBCFS_GET_RECOVERY_QUEUE: { + struct lnet_ioctl_recovery_list *list = arg; + if (list->rlst_hdr.ioc_len < sizeof(*list)) + return -EINVAL; + + mutex_lock(&the_lnet.ln_api_mutex); + if (list->rlst_type == LNET_HEALTH_TYPE_LOCAL_NI) + rc = lnet_get_local_ni_recovery_list(list); + else + rc = lnet_get_peer_ni_recovery_list(list); + mutex_unlock(&the_lnet.ln_api_mutex); + return rc; + } + case IOC_LIBCFS_ADD_PEER_NI: { struct lnet_ioctl_peer_cfg *cfg = arg; diff --git a/lnet/utils/lnetconfig/liblnetconfig.c b/lnet/utils/lnetconfig/liblnetconfig.c index ce460cf..c2b3667 100644 --- a/lnet/utils/lnetconfig/liblnetconfig.c +++ b/lnet/utils/lnetconfig/liblnetconfig.c @@ -3453,6 +3453,90 @@ int lustre_lnet_show_retry_count(int seq_no, struct cYAML **show_rc, err_rc, l_errno); } +int show_recovery_queue(enum lnet_health_type type, char *name, int seq_no, + struct cYAML **show_rc, struct cYAML **err_rc) +{ + struct lnet_ioctl_recovery_list nid_list; + struct cYAML *root = NULL, *nids = NULL; + int rc, i; + char err_str[LNET_MAX_STR_LEN]; + + snprintf(err_str, sizeof(err_str), "failed to print recovery queue\n"); + + LIBCFS_IOC_INIT_V2(nid_list, rlst_hdr); + nid_list.rlst_type = type; + + rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_RECOVERY_QUEUE, &nid_list); + if (rc) { + rc = errno; + goto out; + } + + root = cYAML_create_object(NULL, NULL); + if (root == NULL) + goto out; + + nids = cYAML_create_object(root, name); + if (nids == NULL) + goto out; + + rc = -EINVAL; + + for (i = 0; i < nid_list.rlst_num_nids; i++) { + char nidenum[LNET_MAX_STR_LEN]; + snprintf(nidenum, sizeof(nidenum), "nid-%d", i); + if (!cYAML_create_string(nids, nidenum, + libcfs_nid2str(nid_list.rlst_nid_array[i]))) + goto out; + } + + snprintf(err_str, sizeof(err_str), "success\n"); + + rc = 0; + +out: + if (show_rc == NULL || rc != LUSTRE_CFG_RC_NO_ERR) { + cYAML_free_tree(root); + } else if (show_rc != NULL && *show_rc 
!= NULL) { + struct cYAML *show_node; + /* find the net node, if one doesn't exist + * then insert one. Otherwise add to the one there + */ + show_node = cYAML_get_object_item(*show_rc, name); + if (show_node != NULL && cYAML_is_sequence(show_node)) { + cYAML_insert_child(show_node, nids); + free(nids); + free(root); + } else if (show_node == NULL) { + cYAML_insert_sibling((*show_rc)->cy_child, + nids); + free(root); + } else { + cYAML_free_tree(root); + } + } else { + *show_rc = root; + } + + cYAML_build_error(rc, seq_no, SHOW_CMD, name, err_str, err_rc); + + return rc; +} + +int lustre_lnet_show_local_ni_recovq(int seq_no, struct cYAML **show_rc, + struct cYAML **err_rc) +{ + return show_recovery_queue(LNET_HEALTH_TYPE_LOCAL_NI, "local NI recovery", + seq_no, show_rc, err_rc); +} + +int lustre_lnet_show_peer_ni_recovq(int seq_no, struct cYAML **show_rc, + struct cYAML **err_rc) +{ + return show_recovery_queue(LNET_HEALTH_TYPE_PEER_NI, "peer NI recovery", + seq_no, show_rc, err_rc); +} + int lustre_lnet_show_max_intf(int seq_no, struct cYAML **show_rc, struct cYAML **err_rc) { diff --git a/lnet/utils/lnetconfig/liblnetconfig.h b/lnet/utils/lnetconfig/liblnetconfig.h index 8ec4069..4cd6d5e 100644 --- a/lnet/utils/lnetconfig/liblnetconfig.h +++ b/lnet/utils/lnetconfig/liblnetconfig.h @@ -325,6 +325,12 @@ int lustre_lnet_config_retry_count(int count, int seq_no, struct cYAML **err_rc) int lustre_lnet_show_retry_count(int seq_no, struct cYAML **show_rc, struct cYAML **err_rc); +int lustre_lnet_show_local_ni_recovq(int seq_no, struct cYAML **show_rc, + struct cYAML **err_rc); + +int lustre_lnet_show_peer_ni_recovq(int seq_no, struct cYAML **show_rc, + struct cYAML **err_rc); + /* * lustre_lnet_config_max_intf * Sets the maximum number of interfaces per node. 
this tunable is
diff --git a/lnet/utils/lnetctl.c b/lnet/utils/lnetctl.c
index e6e4255..e6975af 100644
--- a/lnet/utils/lnetctl.c
+++ b/lnet/utils/lnetctl.c
@@ -48,6 +48,7 @@ static int jt_show_net(int argc, char **argv);
 static int jt_show_routing(int argc, char **argv);
 static int jt_show_stats(int argc, char **argv);
 static int jt_show_peer(int argc, char **argv);
+static int jt_show_recovery(int argc, char **argv);
 static int jt_show_global(int argc, char **argv);
 static int jt_set_tiny(int argc, char **argv);
 static int jt_set_small(int argc, char **argv);
@@ -72,6 +73,7 @@ static int jt_route(int argc, char **argv);
 static int jt_net(int argc, char **argv);
 static int jt_routing(int argc, char **argv);
 static int jt_set(int argc, char **argv);
+static int jt_debug(int argc, char **argv);
 static int jt_stats(int argc, char **argv);
 static int jt_global(int argc, char **argv);
 static int jt_peers(int argc, char **argv);
@@ -89,6 +91,7 @@ command_t cmd_list[] = {
 	{"import", jt_import, 0, "import FILE.yaml"},
 	{"export", jt_export, 0, "export FILE.yaml"},
 	{"stats", jt_stats, 0, "stats {show | help}"},
+	{"debug", jt_debug, 0, "debug recovery {--local | --peer}"},
 	{"global", jt_global, 0, "global {show | help}"},
 	{"peer", jt_peers, 0, "peer {add | del | show | help}"},
 	{"ping", jt_ping, 0, "ping nid,[nid,...]"},
@@ -159,6 +162,13 @@ command_t stats_cmds[] = {
 	{ 0, 0, 0, NULL }
 };
 
+command_t debug_cmds[] = {
+	{"recovery", jt_show_recovery, 0, "list recovery queues\n"
+		"\t--local : list local recovery queue\n"
+		"\t--peer : list peer recovery queue\n"},
+	{ 0, 0, 0, NULL }
+};
+
 command_t global_cmds[] = {
 	{"show", jt_show_global, 0, "show global variables\n"},
 	{ 0, 0, 0, NULL }
@@ -1019,6 +1029,58 @@ static int jt_set_peer_ni_value(int argc, char **argv)
 	return set_value_helper(argc, argv, lustre_lnet_config_peer_ni_healthv);
 }
 
+/*
+ * Handler for "lnetctl debug recovery".
+ *
+ * Each --local/-l or --peer/-p option adds the matching recovery queue to
+ * the YAML output. Returns the rc of the last queue requested; an unknown
+ * option ends processing with 0 and prints nothing.
+ */
+static int jt_show_recovery(int argc, char **argv)
+{
+	int rc, opt;
+	struct cYAML *err_rc = NULL, *show_rc = NULL;
+
+	const char *const short_options = "lp";
+	static const struct option long_options[] = {
+		{ .name = "local", .has_arg = no_argument, .val = 'l' },
+		{ .name = "peer", .has_arg = no_argument, .val = 'p' },
+		{ .name = NULL } };
+
+	rc = check_cmd(debug_cmds, "debug", "recovery", 0, argc, argv);
+	if (rc)
+		return rc;
+
+	while ((opt = getopt_long(argc, argv, short_options,
+				  long_options, NULL)) != -1) {
+		switch (opt) {
+		case 'l':
+			rc = lustre_lnet_show_local_ni_recovq(-1, &show_rc, &err_rc);
+			break;
+		case 'p':
+			rc = lustre_lnet_show_peer_ni_recovq(-1, &show_rc, &err_rc);
+			break;
+		default:
+			/* unrecognized option: skip the output but still free the
+			 * trees built by options that were already processed
+			 */
+			rc = 0;
+			goto out;
+		}
+	}
+
+	if (rc != LUSTRE_CFG_RC_NO_ERR)
+		cYAML_print_tree2file(stderr, err_rc);
+	else if (show_rc)
+		cYAML_print_tree(show_rc);
+
+out:
+	cYAML_free_tree(err_rc);
+	cYAML_free_tree(show_rc);
+
+	return rc;
+}
+
 static int jt_show_net(int argc, char **argv)
 {
 	char *network = NULL;
@@ -1224,6 +1286,18 @@ static int jt_stats(int argc, char **argv)
 	return Parser_execarg(argc - 1, &argv[1], stats_cmds);
 }
 
+/* Dispatch "lnetctl debug <sub-command>" through the debug_cmds table. */
+static int jt_debug(int argc, char **argv)
+{
+	int rc;
+
+	rc = check_cmd(debug_cmds, "debug", NULL, 2, argc, argv);
+	if (rc)
+		return rc;
+
+	return Parser_execarg(argc - 1, &argv[1], debug_cmds);
+}
+
 static int jt_global(int argc, char **argv)
 {
 	int rc;
-- 
1.8.3.1