Whamcloud - gitweb
LU-11468 lnet: set recovery interval from lnetctl
[fs/lustre-release.git] / lnet / utils / lnetconfig / liblnetconfig.c
index 42c0181..a194ce0 100644 (file)
@@ -2028,6 +2028,62 @@ out:
        return rc;
 }
 
+/*
+ * Issue IOC_LIBCFS_SET_HEALHV with the given health settings.
+ *
+ * value, all, nid and type are copied unchanged into the ioctl request
+ * (rh_value, rh_all, rh_nid and rh_type).  name, seq_no and err_rc are
+ * handed to cYAML_build_error() together with the outcome.
+ *
+ * Returns 0 when the ioctl succeeds and -errno when it fails.
+ */
+static int
+lustre_lnet_config_healthv(int value, bool all, lnet_nid_t nid,
+                          enum lnet_health_type type, char *name,
+                          int seq_no, struct cYAML **err_rc)
+{
+       struct lnet_ioctl_reset_health_cfg data;
+       int rc = LUSTRE_CFG_RC_NO_ERR;
+       char err_str[LNET_MAX_STR_LEN];
+
+       snprintf(err_str, sizeof(err_str), "\"success\"");
+
+       LIBCFS_IOC_INIT_V2(data, rh_hdr);
+       data.rh_type = type;
+       data.rh_all = all;
+       data.rh_value = value;
+       data.rh_nid = nid;
+
+       rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_SET_HEALHV, &data);
+       if (rc != 0) {
+               /* read errno once so the code and the text cannot diverge */
+               int l_errno = errno;
+
+               rc = -l_errno;
+               /*
+                * Wrapped in escaped quotes like the other descriptions in
+                * this file; the text contains ": ".
+                */
+               snprintf(err_str, sizeof(err_str),
+                        "\"Can not configure health value: %s\"",
+                        strerror(l_errno));
+       }
+
+       cYAML_build_error(rc, seq_no, ADD_CMD, name, err_str, err_rc);
+
+       return rc;
+}
+
+/*
+ * Request a health value change of type LNET_HEALTH_TYPE_LOCAL_NI.
+ *
+ * A NULL ni_nid is sent as LNET_NID_ANY; any other string is converted with
+ * libcfs_str2nid().  Returns what lustre_lnet_config_healthv() returns.
+ */
+int lustre_lnet_config_ni_healthv(int value, bool all, char *ni_nid, int seq_no,
+                                 struct cYAML **err_rc)
+{
+       lnet_nid_t target = ni_nid ? libcfs_str2nid(ni_nid) : LNET_NID_ANY;
+
+       return lustre_lnet_config_healthv(value, all, target,
+                                         LNET_HEALTH_TYPE_LOCAL_NI,
+                                         "ni healthv", seq_no, err_rc);
+}
+
+/*
+ * Request a health value change of type LNET_HEALTH_TYPE_PEER_NI.
+ *
+ * A NULL lpni_nid is sent as LNET_NID_ANY; any other string is converted
+ * with libcfs_str2nid().  Returns what lustre_lnet_config_healthv() returns.
+ */
+int lustre_lnet_config_peer_ni_healthv(int value, bool all, char *lpni_nid,
+                                      int seq_no, struct cYAML **err_rc)
+{
+       lnet_nid_t target = lpni_nid ? libcfs_str2nid(lpni_nid) : LNET_NID_ANY;
+
+       return lustre_lnet_config_healthv(value, all, target,
+                                         LNET_HEALTH_TYPE_PEER_NI,
+                                         "peer_ni healthv", seq_no, err_rc);
+}
+
 static bool
 add_msg_stats_to_yaml_blk(struct cYAML *yaml,
                          struct lnet_ioctl_comm_count *counts)
@@ -2473,6 +2529,28 @@ int ioctl_set_value(__u32 val, int ioc, char *name,
        return rc;
 }
 
+/*
+ * Write intrv, formatted as a decimal string, to the
+ * "lnet_recovery_interval" entry under modparam_path and record the outcome
+ * through cYAML_build_error().
+ *
+ * Returns the value returned by write_sysfs_file().
+ */
+int lustre_lnet_config_recov_intrv(int intrv, int seq_no, struct cYAML **err_rc)
+{
+       char intrv_str[LNET_MAX_STR_LEN];
+       char err_str[LNET_MAX_STR_LEN];
+       int rc;
+
+       snprintf(intrv_str, sizeof(intrv_str), "%d", intrv);
+
+       /* the last argument is strlen + 1, so it covers the trailing NUL */
+       rc = write_sysfs_file(modparam_path, "lnet_recovery_interval",
+                             intrv_str, 1, strlen(intrv_str) + 1);
+       if (rc != 0)
+               snprintf(err_str, sizeof(err_str),
+                        "\"cannot configure recovery interval: %s\"",
+                        strerror(errno));
+       else
+               snprintf(err_str, sizeof(err_str), "\"success\"");
+
+       cYAML_build_error(rc, seq_no, ADD_CMD, "recovery_interval", err_str,
+                         err_rc);
+
+       return rc;
+}
+
 int lustre_lnet_config_hsensitivity(int sen, int seq_no, struct cYAML **err_rc)
 {
        int rc = LUSTRE_CFG_RC_NO_ERR;
@@ -2816,6 +2894,7 @@ int lustre_lnet_show_peer(char *knid, int detail, int seq_no,
        struct lnet_peer_ni_credit_info *lpni_cri;
        struct lnet_ioctl_element_stats *lpni_stats;
        struct lnet_ioctl_element_msg_stats *msg_stats;
+       struct lnet_ioctl_peer_ni_hstats *hstats;
        lnet_nid_t *nidp;
        int rc = LUSTRE_CFG_RC_OUT_OF_MEM;
        int i, j, k;
@@ -2824,11 +2903,13 @@ int lustre_lnet_show_peer(char *knid, int detail, int seq_no,
        __u32 size;
        struct cYAML *root = NULL, *peer = NULL, *peer_ni = NULL,
                     *first_seq = NULL, *peer_root = NULL, *tmp = NULL,
-                    *msg_statistics = NULL, *statistics = NULL;
+                    *msg_statistics = NULL, *statistics = NULL,
+                    *yhstats;
        char err_str[LNET_MAX_STR_LEN];
        struct lnet_process_id *list = NULL;
        void *data = NULL;
        void *lpni_data;
+       bool exist = false;
 
        snprintf(err_str, sizeof(err_str),
                 "\"out of memory\"");
@@ -2891,6 +2972,7 @@ int lustre_lnet_show_peer(char *knid, int detail, int seq_no,
                l_errno = ENOMEM;
                goto out;
        }
+
        for (i = 0; i < count; i++) {
                for (;;) {
                        memset(&peer_info, 0, sizeof(peer_info));
@@ -2922,6 +3004,7 @@ int lustre_lnet_show_peer(char *knid, int detail, int seq_no,
                                goto out;
                        }
                }
+               exist = true;
 
                peer = cYAML_create_seq_item(peer_root);
                if (peer == NULL)
@@ -2961,7 +3044,8 @@ int lustre_lnet_show_peer(char *knid, int detail, int seq_no,
                        lpni_cri = (void*)nidp + sizeof(nidp);
                        lpni_stats = (void *)lpni_cri + sizeof(*lpni_cri);
                        msg_stats = (void *)lpni_stats + sizeof(*lpni_stats);
-                       lpni_data = (void *)msg_stats + sizeof(*msg_stats);
+                       hstats = (void *)msg_stats + sizeof(*msg_stats);
+                       lpni_data = (void *)hstats + sizeof(*hstats);
 
                        peer_ni = cYAML_create_seq_item(tmp);
                        if (peer_ni == NULL)
@@ -3056,6 +3140,29 @@ int lustre_lnet_show_peer(char *knid, int detail, int seq_no,
                                        goto out;
                        }
 
+                       yhstats = cYAML_create_object(peer_ni, "health stats");
+                       if (!yhstats)
+                               goto out;
+                       if (cYAML_create_number(yhstats, "health value",
+                                               hstats->hlpni_health_value)
+                                                       == NULL)
+                               goto out;
+                       if (cYAML_create_number(yhstats, "dropped",
+                                               hstats->hlpni_remote_dropped)
+                                                       == NULL)
+                               goto out;
+                       if (cYAML_create_number(yhstats, "timeout",
+                                               hstats->hlpni_remote_timeout)
+                                                       == NULL)
+                               goto out;
+                       if (cYAML_create_number(yhstats, "error",
+                                               hstats->hlpni_remote_error)
+                                                       == NULL)
+                               goto out;
+                       if (cYAML_create_number(yhstats, "network timeout",
+                                               hstats->hlpni_network_timeout)
+                                                       == NULL)
+                               goto out;
                }
        }
 
@@ -3069,7 +3176,7 @@ int lustre_lnet_show_peer(char *knid, int detail, int seq_no,
 out:
        free(list);
        free(data);
-       if (show_rc == NULL || rc != LUSTRE_CFG_RC_NO_ERR) {
+       if (show_rc == NULL || rc != LUSTRE_CFG_RC_NO_ERR || !exist) {
                cYAML_free_tree(root);
        } else if (show_rc != NULL && *show_rc != NULL) {
                struct cYAML *show_node;
@@ -3297,6 +3404,31 @@ static int ioctl_show_global_values(int ioc, int seq_no, char *name,
                                       data.sv_value, show_rc, err_rc, l_errno);
 }
 
+/*
+ * Read the "lnet_recovery_interval" entry under modparam_path and report it
+ * as "recovery_interval" through build_global_yaml_entry().
+ *
+ * When the read fails the reported value stays -1, the error text carries
+ * the read_sysfs_file() return code and -errno is passed on as l_errno.
+ *
+ * Returns whatever build_global_yaml_entry() returns.
+ */
+int lustre_lnet_show_recov_intrv(int seq_no, struct cYAML **show_rc,
+                                struct cYAML **err_rc)
+{
+       char err_str[LNET_MAX_STR_LEN];
+       char buf[LNET_MAX_STR_LEN];
+       int l_errno = 0;
+       int intrv = -1;
+       int rc;
+
+       /* default text handed to build_global_yaml_entry() */
+       snprintf(err_str, sizeof(err_str), "\"out of memory\"");
+
+       rc = read_sysfs_file(modparam_path, "lnet_recovery_interval", buf,
+                            1, sizeof(buf));
+       if (rc == 0) {
+               intrv = atoi(buf);
+       } else {
+               l_errno = -errno;
+               snprintf(err_str, sizeof(err_str),
+                        "\"cannot get recovery interval: %d\"", rc);
+       }
+
+       return build_global_yaml_entry(err_str, sizeof(err_str), seq_no,
+                                      "recovery_interval", intrv, show_rc,
+                                      err_rc, l_errno);
+}
+
 int lustre_lnet_show_hsensitivity(int seq_no, struct cYAML **show_rc,
                                  struct cYAML **err_rc)
 {
@@ -3372,6 +3504,93 @@ int lustre_lnet_show_retry_count(int seq_no, struct cYAML **show_rc,
                                       err_rc, l_errno);
 }
 
+/*
+ * Fetch the recovery queue of the given health type with
+ * IOC_LIBCFS_GET_RECOVERY_QUEUE and render it as a YAML object called name,
+ * holding one "nid-<index>" string per NID returned by the ioctl.
+ *
+ * If show_rc is NULL the tree is freed and nothing is handed back.  If
+ * *show_rc is NULL it receives the new tree; otherwise the new object is
+ * merged into the existing tree.  The outcome is recorded through
+ * cYAML_build_error().
+ *
+ * Returns 0 on success (including an empty queue, for which no YAML is
+ * produced), -errno if the ioctl fails, LUSTRE_CFG_RC_OUT_OF_MEM if the
+ * root or the name object cannot be created, and -EINVAL if a NID entry
+ * cannot be added.
+ */
+int show_recovery_queue(enum lnet_health_type type, char *name, int seq_no,
+                       struct cYAML **show_rc, struct cYAML **err_rc)
+{
+       struct lnet_ioctl_recovery_list nid_list;
+       struct cYAML *root = NULL, *nids = NULL;
+       int rc, i;
+       char err_str[LNET_MAX_STR_LEN];
+
+       snprintf(err_str, sizeof(err_str), "failed to print recovery queue\n");
+
+       LIBCFS_IOC_INIT_V2(nid_list, rlst_hdr);
+       nid_list.rlst_type = type;
+
+       rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_RECOVERY_QUEUE, &nid_list);
+       if (rc) {
+               /* negative, like the other ioctl failure codes in this file */
+               rc = -errno;
+               goto out;
+       }
+
+       if (nid_list.rlst_num_nids == 0) {
+               /* an empty queue is not a failure; rc is already 0 */
+               snprintf(err_str, sizeof(err_str), "success\n");
+               goto out;
+       }
+
+       /* do not report success when the YAML objects cannot be created */
+       rc = LUSTRE_CFG_RC_OUT_OF_MEM;
+
+       root = cYAML_create_object(NULL, NULL);
+       if (root == NULL)
+               goto out;
+
+       nids = cYAML_create_object(root, name);
+       if (nids == NULL)
+               goto out;
+
+       rc = -EINVAL;
+
+       for (i = 0; i < nid_list.rlst_num_nids; i++) {
+               char nidenum[LNET_MAX_STR_LEN];
+
+               snprintf(nidenum, sizeof(nidenum), "nid-%d", i);
+               if (!cYAML_create_string(nids, nidenum,
+                       libcfs_nid2str(nid_list.rlst_nid_array[i])))
+                       goto out;
+       }
+
+       snprintf(err_str, sizeof(err_str), "success\n");
+
+       rc = 0;
+
+out:
+       /*
+        * root is NULL when the queue was empty; there is then nothing to
+        * hand back or merge, so take the same path as the error case.
+        */
+       if (show_rc == NULL || rc != LUSTRE_CFG_RC_NO_ERR || root == NULL) {
+               cYAML_free_tree(root);
+       } else if (show_rc != NULL && *show_rc != NULL) {
+               struct cYAML *show_node;
+               /* look for an existing node called name in the caller's
+                * tree; add to it if it is a sequence, insert the new
+                * object as a sibling if there is none
+                */
+               show_node = cYAML_get_object_item(*show_rc, name);
+               if (show_node != NULL && cYAML_is_sequence(show_node)) {
+                       /*
+                        * NOTE(review): nids is freed right after being
+                        * passed to cYAML_insert_child(); confirm that
+                        * call does not keep a reference to it.
+                        */
+                       cYAML_insert_child(show_node, nids);
+                       free(nids);
+                       free(root);
+               } else if (show_node == NULL) {
+                       cYAML_insert_sibling((*show_rc)->cy_child,
+                                               nids);
+                       free(root);
+               } else {
+                       cYAML_free_tree(root);
+               }
+       } else {
+               *show_rc = root;
+       }
+
+       cYAML_build_error(rc, seq_no, SHOW_CMD, name, err_str, err_rc);
+
+       return rc;
+}
+
+/*
+ * Show the recovery queue of type LNET_HEALTH_TYPE_LOCAL_NI under the YAML
+ * key "local NI recovery".  Returns what show_recovery_queue() returns.
+ */
+int lustre_lnet_show_local_ni_recovq(int seq_no, struct cYAML **show_rc,
+                                    struct cYAML **err_rc)
+{
+       char *key = "local NI recovery";
+
+       return show_recovery_queue(LNET_HEALTH_TYPE_LOCAL_NI, key, seq_no,
+                                  show_rc, err_rc);
+}
+
+/*
+ * Show the recovery queue of type LNET_HEALTH_TYPE_PEER_NI under the YAML
+ * key "peer NI recovery".  Returns what show_recovery_queue() returns.
+ */
+int lustre_lnet_show_peer_ni_recovq(int seq_no, struct cYAML **show_rc,
+                                   struct cYAML **err_rc)
+{
+       char *key = "peer NI recovery";
+
+       return show_recovery_queue(LNET_HEALTH_TYPE_PEER_NI, key, seq_no,
+                                  show_rc, err_rc);
+}
+
 int lustre_lnet_show_max_intf(int seq_no, struct cYAML **show_rc,
                              struct cYAML **err_rc)
 {
@@ -3438,6 +3657,7 @@ int lustre_lnet_show_stats(int seq_no, struct cYAML **show_rc,
                           struct cYAML **err_rc)
 {
        struct lnet_ioctl_lnet_stats data;
+       struct lnet_counters *cntrs;
        int rc;
        int l_errno;
        char err_str[LNET_MAX_STR_LEN];
@@ -3448,7 +3668,7 @@ int lustre_lnet_show_stats(int seq_no, struct cYAML **show_rc,
        LIBCFS_IOC_INIT_V2(data, st_hdr);
 
        rc = l_ioctl(LNET_DEV_ID, IOC_LIBCFS_GET_LNET_STATS, &data);
-       if (rc != 0) {
+       if (rc) {
                l_errno = errno;
                snprintf(err_str,
                         sizeof(err_str),
@@ -3460,59 +3680,113 @@ int lustre_lnet_show_stats(int seq_no, struct cYAML **show_rc,
 
        rc = LUSTRE_CFG_RC_OUT_OF_MEM;
 
+       cntrs = &data.st_cntrs;
+
        root = cYAML_create_object(NULL, NULL);
-       if (root == NULL)
+       if (!root)
                goto out;
 
        stats = cYAML_create_object(root, "statistics");
-       if (stats == NULL)
+       if (!stats)
                goto out;
 
-       if (cYAML_create_number(stats, "msgs_alloc",
-                               data.st_cntrs.msgs_alloc) == NULL)
+       if (!cYAML_create_number(stats, "msgs_alloc",
+                                cntrs->lct_common.lcc_msgs_alloc))
                goto out;
 
-       if (cYAML_create_number(stats, "msgs_max",
-                               data.st_cntrs.msgs_max) == NULL)
+       if (!cYAML_create_number(stats, "msgs_max",
+                                cntrs->lct_common.lcc_msgs_max))
                goto out;
 
-       if (cYAML_create_number(stats, "errors",
-                               data.st_cntrs.errors) == NULL)
+       if (!cYAML_create_number(stats, "rst_alloc",
+                                cntrs->lct_health.lch_rst_alloc))
                goto out;
 
-       if (cYAML_create_number(stats, "send_count",
-                               data.st_cntrs.send_count) == NULL)
+       if (!cYAML_create_number(stats, "errors",
+                                cntrs->lct_common.lcc_errors))
                goto out;
 
-       if (cYAML_create_number(stats, "recv_count",
-                               data.st_cntrs.recv_count) == NULL)
+       if (!cYAML_create_number(stats, "send_count",
+                                cntrs->lct_common.lcc_send_count))
                goto out;
 
-       if (cYAML_create_number(stats, "route_count",
-                               data.st_cntrs.route_count) == NULL)
+       if (!cYAML_create_number(stats, "resend_count",
+                                cntrs->lct_health.lch_resend_count))
                goto out;
 
-       if (cYAML_create_number(stats, "drop_count",
-                               data.st_cntrs.drop_count) == NULL)
+       if (!cYAML_create_number(stats, "response_timeout_count",
+                                cntrs->lct_health.lch_response_timeout_count))
                goto out;
 
-       if (cYAML_create_number(stats, "send_length",
-                               data.st_cntrs.send_length) == NULL)
+       if (!cYAML_create_number(stats, "local_interrupt_count",
+                                cntrs->lct_health.lch_local_interrupt_count))
                goto out;
 
-       if (cYAML_create_number(stats, "recv_length",
-                               data.st_cntrs.recv_length) == NULL)
+       if (!cYAML_create_number(stats, "local_dropped_count",
+                                cntrs->lct_health.lch_local_dropped_count))
                goto out;
 
-       if (cYAML_create_number(stats, "route_length",
-                               data.st_cntrs.route_length) == NULL)
+       if (!cYAML_create_number(stats, "local_aborted_count",
+                                cntrs->lct_health.lch_local_aborted_count))
                goto out;
 
-       if (cYAML_create_number(stats, "drop_length",
-                               data.st_cntrs.drop_length) == NULL)
+       if (!cYAML_create_number(stats, "local_no_route_count",
+                                cntrs->lct_health.lch_local_no_route_count))
                goto out;
 
-       if (show_rc == NULL)
+       if (!cYAML_create_number(stats, "local_timeout_count",
+                                cntrs->lct_health.lch_local_timeout_count))
+               goto out;
+
+       if (!cYAML_create_number(stats, "local_error_count",
+                                cntrs->lct_health.lch_local_error_count))
+               goto out;
+
+       if (!cYAML_create_number(stats, "remote_dropped_count",
+                                cntrs->lct_health.lch_remote_dropped_count))
+               goto out;
+
+       if (!cYAML_create_number(stats, "remote_error_count",
+                                cntrs->lct_health.lch_remote_error_count))
+               goto out;
+
+       if (!cYAML_create_number(stats, "remote_timeout_count",
+                                cntrs->lct_health.lch_remote_timeout_count))
+               goto out;
+
+       if (!cYAML_create_number(stats, "network_timeout_count",
+                                cntrs->lct_health.lch_network_timeout_count))
+               goto out;
+
+       if (!cYAML_create_number(stats, "recv_count",
+                                cntrs->lct_common.lcc_recv_count))
+               goto out;
+
+       if (!cYAML_create_number(stats, "route_count",
+                                cntrs->lct_common.lcc_route_count))
+               goto out;
+
+       if (!cYAML_create_number(stats, "drop_count",
+                                cntrs->lct_common.lcc_drop_count))
+               goto out;
+
+       if (!cYAML_create_number(stats, "send_length",
+                                cntrs->lct_common.lcc_send_length))
+               goto out;
+
+       if (!cYAML_create_number(stats, "recv_length",
+                                cntrs->lct_common.lcc_recv_length))
+               goto out;
+
+       if (!cYAML_create_number(stats, "route_length",
+                                cntrs->lct_common.lcc_route_length))
+               goto out;
+
+       if (!cYAML_create_number(stats, "drop_length",
+                                cntrs->lct_common.lcc_drop_length))
+               goto out;
+
+       if (!show_rc)
                cYAML_print_tree(root);
 
        snprintf(err_str, sizeof(err_str), "\"success\"");
@@ -3996,12 +4270,13 @@ static int handle_yaml_config_peer(struct cYAML *tree, struct cYAML **show_rc,
 {
        char **nids = NULL;
        int num, rc;
-       struct cYAML *seq_no, *prim_nid, *non_mr, *ip2nets, *peer_nis;
+       struct cYAML *seq_no, *prim_nid, *mr, *ip2nets, *peer_nis;
        char err_str[LNET_MAX_STR_LEN];
+       bool mr_value;
 
        seq_no = cYAML_get_object_item(tree, "seq_no");
        prim_nid = cYAML_get_object_item(tree, "primary nid");
-       non_mr = cYAML_get_object_item(tree, "non_mr");
+       mr = cYAML_get_object_item(tree, "Multi-Rail");
        ip2nets = cYAML_get_object_item(tree, "ip2nets");
        peer_nis = cYAML_get_object_item(tree, "peer ni");
 
@@ -4015,6 +4290,22 @@ static int handle_yaml_config_peer(struct cYAML *tree, struct cYAML **show_rc,
                return rc;
        }
 
+       if (!mr)
+               mr_value = true;
+       else {
+               if (!mr->cy_valuestring || !strcmp(mr->cy_valuestring, "True"))
+                       mr_value = true;
+               else if (!strcmp(mr->cy_valuestring, "False"))
+                       mr_value = false;
+               else {
+                       rc = LUSTRE_CFG_RC_BAD_PARAM;
+                       snprintf(err_str, sizeof(err_str), "Bad MR value");
+                       cYAML_build_error(rc, (seq_no) ? seq_no->cy_valueint : -1,
+                                         ADD_CMD, "peer", err_str, err_rc);
+                       return rc;
+               }
+       }
+
        num = yaml_copy_peer_nids((ip2nets) ? ip2nets : peer_nis, &nids,
                                  (prim_nid) ? prim_nid->cy_valuestring : NULL,
                                   false);
@@ -4028,8 +4319,7 @@ static int handle_yaml_config_peer(struct cYAML *tree, struct cYAML **show_rc,
        }
 
        rc = lustre_lnet_config_peer_nid((prim_nid) ? prim_nid->cy_valuestring : NULL,
-                                        nids, num,
-                                        (non_mr) ? false : true,
+                                        nids, num, mr_value,
                                         (ip2nets) ? true : false,
                                         (seq_no) ? seq_no->cy_valueint : -1,
                                         err_rc);
@@ -4269,7 +4559,7 @@ static int handle_yaml_config_global_settings(struct cYAML *tree,
                                              struct cYAML **err_rc)
 {
        struct cYAML *max_intf, *numa, *discovery, *retry, *tto, *seq_no,
-                    *sen;
+                    *sen, *recov;
        int rc = 0;
 
        seq_no = cYAML_get_object_item(tree, "seq_no");
@@ -4315,6 +4605,13 @@ static int handle_yaml_config_global_settings(struct cYAML *tree,
                                                        : -1,
                                                     err_rc);
 
+       recov = cYAML_get_object_item(tree, "recovery_interval");
+       if (recov)
+               rc = lustre_lnet_config_recov_intrv(recov->cy_valueint,
+                                                   seq_no ? seq_no->cy_valueint
+                                                       : -1,
+                                                   err_rc);
+
        return rc;
 }
 
@@ -4356,7 +4653,7 @@ static int handle_yaml_show_global_settings(struct cYAML *tree,
                                            struct cYAML **err_rc)
 {
        struct cYAML *max_intf, *numa, *discovery, *retry, *tto, *seq_no,
-                    *sen;
+                    *sen, *recov;
        int rc = 0;
 
        seq_no = cYAML_get_object_item(tree, "seq_no");
@@ -4396,6 +4693,12 @@ static int handle_yaml_show_global_settings(struct cYAML *tree,
                                                        : -1,
                                                     show_rc, err_rc);
 
+       recov = cYAML_get_object_item(tree, "recovery_interval");
+       if (recov)
+               rc = lustre_lnet_show_recov_intrv(seq_no ? seq_no->cy_valueint
+                                                       : -1,
+                                                 show_rc, err_rc);
+
        return rc;
 }