From: Amir Shehata Date: Sun, 5 Aug 2018 21:37:29 +0000 (-0700) Subject: LU-9120 lnet: health error simulation X-Git-Tag: 2.11.55~65^2^2 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=5c17777d97bd20cde68771c6186320b5eae90e62;hp=826ea19c077b2a3e1a32464a7eb63fba6e460946 LU-9120 lnet: health error simulation Modified the error simulation code to simulate health errors for testing purposes. The specific error can be set. If multiple errors are configured then one is chosen at random from the set. EX: lctl net_drop_add -s *@tcp -d *@tcp -m GET -i 1 -e local_interrupt The -e option can be repeated multiple times to specify different errors to simulate. The available errors are: local_interrupt, local_dropped, local_aborted, local_no_route, local_error, local_timeout, remote_error, remote_dropped, remote_timeout, network_timeout, random. A -n ("--random") option has been added to randomize error generation for drop rules. It relies on an interval value provided via -i: a random number smaller than the interval is generated. If the number is smaller than half of the interval then the rule isn't matched, otherwise it is. The reason for this is that drop matching can happen multiple times in the path of sending the message, and using time-based or rate-based matching will not result in even error generation across the multiple calls. 
Signed-off-by: Amir Shehata Change-Id: If070e29f68c3de10100a9d5eaa49d10cdb76a59a Reviewed-on: https://review.whamcloud.com/32951 Tested-by: Jenkins Reviewed-by: Sonia Sharma Reviewed-by: Olaf Weber Tested-by: Maloo --- diff --git a/lnet/include/lnet/lib-lnet.h b/lnet/include/lnet/lib-lnet.h index fb9aa02..aeb0e4b 100644 --- a/lnet/include/lnet/lib-lnet.h +++ b/lnet/include/lnet/lib-lnet.h @@ -706,6 +706,8 @@ void lnet_set_reply_msg_len(struct lnet_ni *ni, struct lnet_msg *msg, void lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt); void lnet_finalize(struct lnet_msg *msg, int rc); +bool lnet_send_error_simulation(struct lnet_msg *msg, + enum lnet_msg_hstatus *hstatus); void lnet_drop_message(struct lnet_ni *ni, int cpt, void *private, unsigned int nob, __u32 msg_type); @@ -728,7 +730,7 @@ int lnet_fault_ctl(int cmd, struct libcfs_ioctl_data *data); int lnet_fault_init(void); void lnet_fault_fini(void); -bool lnet_drop_rule_match(struct lnet_hdr *hdr); +bool lnet_drop_rule_match(struct lnet_hdr *hdr, enum lnet_msg_hstatus *hstatus); int lnet_delay_rule_add(struct lnet_fault_attr *attr); int lnet_delay_rule_del(lnet_nid_t src, lnet_nid_t dst, bool shutdown); diff --git a/lnet/include/lnet/lib-types.h b/lnet/include/lnet/lib-types.h index 463c9a3..4ee8c60 100644 --- a/lnet/include/lnet/lib-types.h +++ b/lnet/include/lnet/lib-types.h @@ -75,7 +75,8 @@ enum lnet_msg_hstatus { LNET_MSG_STATUS_REMOTE_ERROR, LNET_MSG_STATUS_REMOTE_DROPPED, LNET_MSG_STATUS_REMOTE_TIMEOUT, - LNET_MSG_STATUS_NETWORK_TIMEOUT + LNET_MSG_STATUS_NETWORK_TIMEOUT, + LNET_MSG_STATUS_END, }; struct lnet_rsp_tracker { diff --git a/lnet/include/uapi/linux/lnet/lnetctl.h b/lnet/include/uapi/linux/lnet/lnetctl.h index 611bf02..7e211e1 100644 --- a/lnet/include/uapi/linux/lnet/lnetctl.h +++ b/lnet/include/uapi/linux/lnet/lnetctl.h @@ -32,6 +32,8 @@ # include #endif +#include + /** \addtogroup lnet_fault_simulation * @{ */ @@ -51,6 +53,19 @@ enum { #define LNET_GET_BIT (1 << 2) #define LNET_REPLY_BIT 
(1 << 3) +#define HSTATUS_END 11 +#define HSTATUS_LOCAL_INTERRUPT_BIT (1 << 1) +#define HSTATUS_LOCAL_DROPPED_BIT (1 << 2) +#define HSTATUS_LOCAL_ABORTED_BIT (1 << 3) +#define HSTATUS_LOCAL_NO_ROUTE_BIT (1 << 4) +#define HSTATUS_LOCAL_ERROR_BIT (1 << 5) +#define HSTATUS_LOCAL_TIMEOUT_BIT (1 << 6) +#define HSTATUS_REMOTE_ERROR_BIT (1 << 7) +#define HSTATUS_REMOTE_DROPPED_BIT (1 << 8) +#define HSTATUS_REMOTE_TIMEOUT_BIT (1 << 9) +#define HSTATUS_NETWORK_TIMEOUT_BIT (1 << 10) +#define HSTATUS_RANDOM 0xffffffff + /** ioctl parameter for LNet fault simulation */ struct lnet_fault_attr { /** @@ -88,6 +103,10 @@ struct lnet_fault_attr { * with da_rate */ __u32 da_interval; + /** error type mask */ + __u32 da_health_error_mask; + /** randomize error generation */ + bool da_random; } drop; /** message latency simulation */ struct { diff --git a/lnet/klnds/o2iblnd/o2iblnd_cb.c b/lnet/klnds/o2iblnd/o2iblnd_cb.c index de09e6d..65eb85e 100644 --- a/lnet/klnds/o2iblnd/o2iblnd_cb.c +++ b/lnet/klnds/o2iblnd/o2iblnd_cb.c @@ -969,7 +969,10 @@ __must_hold(&conn->ibc_lock) libcfs_nid2str(conn->ibc_peer->ibp_nid)); bad = NULL; - rc = ib_post_send(conn->ibc_cmid->qp, wr, &bad); + if (lnet_send_error_simulation(tx->tx_lntmsg[0], &tx->tx_hstatus)) + rc = -EINVAL; + else + rc = ib_post_send(conn->ibc_cmid->qp, wr, &bad); } conn->ibc_last_send = ktime_get(); diff --git a/lnet/klnds/socklnd/socklnd_cb.c b/lnet/klnds/socklnd/socklnd_cb.c index 2eed8b1..aa32b2a 100644 --- a/lnet/klnds/socklnd/socklnd_cb.c +++ b/lnet/klnds/socklnd/socklnd_cb.c @@ -400,7 +400,8 @@ ksocknal_tx_done(struct lnet_ni *ni, struct ksock_tx *tx, int rc) if (!rc && (tx->tx_resid != 0 || tx->tx_zc_aborted)) { rc = -EIO; - hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + if (hstatus == LNET_MSG_STATUS_OK) + hstatus = LNET_MSG_STATUS_LOCAL_ERROR; } if (tx->tx_conn != NULL) @@ -533,6 +534,13 @@ static int ksocknal_process_transmit(struct ksock_conn *conn, struct ksock_tx *tx) { int rc; + bool error_sim = false; + + if 
(lnet_send_error_simulation(tx->tx_lnetmsg, &tx->tx_hstatus)) { + error_sim = true; + rc = -EINVAL; + goto simulate_error; + } if (tx->tx_zc_capable && !tx->tx_zc_checked) ksocknal_check_zc_req(tx); @@ -580,17 +588,21 @@ ksocknal_process_transmit(struct ksock_conn *conn, struct ksock_tx *tx) return (rc); } +simulate_error: + /* Actual error */ LASSERT(rc < 0); - /* - * set the health status of the message which determines - * whether we should retry the transmit - */ - if (rc == -ETIMEDOUT) - tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_TIMEOUT; - else - tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + if (!error_sim) { + /* + * set the health status of the message which determines + * whether we should retry the transmit + */ + if (rc == -ETIMEDOUT) + tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_TIMEOUT; + else + tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR; + } if (!conn->ksnc_closing) { switch (rc) { diff --git a/lnet/lnet/lib-move.c b/lnet/lnet/lib-move.c index 1aea403..a89fa41 100644 --- a/lnet/lnet/lib-move.c +++ b/lnet/lnet/lib-move.c @@ -757,7 +757,7 @@ static void lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg) { void *priv = msg->msg_private; - int rc; + int rc; LASSERT (!in_interrupt ()); LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND || @@ -4135,7 +4135,7 @@ lnet_parse(struct lnet_ni *ni, struct lnet_hdr *hdr, lnet_nid_t from_nid, } if (!list_empty(&the_lnet.ln_drop_rules) && - lnet_drop_rule_match(hdr)) { + lnet_drop_rule_match(hdr, NULL)) { CDEBUG(D_NET, "%s, src %s, dst %s: Dropping %s to simulate" "silent message loss\n", libcfs_nid2str(from_nid), libcfs_nid2str(src_nid), diff --git a/lnet/lnet/lib-msg.c b/lnet/lnet/lib-msg.c index 5f988be..8fed27f 100644 --- a/lnet/lnet/lib-msg.c +++ b/lnet/lnet/lib-msg.c @@ -817,6 +817,30 @@ lnet_health_error2str(enum lnet_msg_hstatus hstatus) } } +bool +lnet_send_error_simulation(struct lnet_msg *msg, + enum lnet_msg_hstatus *hstatus) +{ + if (!msg) + return false; + + if (list_empty(&the_lnet.ln_drop_rules)) 
+ return false; + + /* match only health rules */ + if (!lnet_drop_rule_match(&msg->msg_hdr, hstatus)) + return false; + + CDEBUG(D_NET, "src %s, dst %s: %s simulate health error: %s\n", + libcfs_nid2str(msg->msg_hdr.src_nid), + libcfs_nid2str(msg->msg_hdr.dest_nid), + lnet_msgtyp2str(msg->msg_type), + lnet_health_error2str(*hstatus)); + + return true; +} +EXPORT_SYMBOL(lnet_send_error_simulation); + void lnet_finalize(struct lnet_msg *msg, int status) { diff --git a/lnet/lnet/net_fault.c b/lnet/lnet/net_fault.c index 05daed2..04c98d5 100644 --- a/lnet/lnet/net_fault.c +++ b/lnet/lnet/net_fault.c @@ -294,13 +294,58 @@ lnet_drop_rule_reset(void) EXIT; } +static void +lnet_fault_match_health(enum lnet_msg_hstatus *hstatus, __u32 mask) +{ + unsigned int random; + int choice; + int delta; + int best_delta; + int i; + + /* assign a random failure */ + random = cfs_rand(); + choice = random % (LNET_MSG_STATUS_END - LNET_MSG_STATUS_OK); + if (choice == 0) + choice++; + + if (mask == HSTATUS_RANDOM) { + *hstatus = choice; + return; + } + + if (mask & (1 << choice)) { + *hstatus = choice; + return; + } + + /* round to the closest ON bit */ + i = HSTATUS_END; + best_delta = HSTATUS_END; + while (i > 0) { + if (mask & (1 << i)) { + delta = choice - i; + if (delta < 0) + delta *= -1; + if (delta < best_delta) { + best_delta = delta; + choice = i; + } + } + i--; + } + + *hstatus = choice; +} + /** * check source/destination NID, portal, message type and drop rate, * decide whether should drop this message or not */ static bool drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, - lnet_nid_t dst, unsigned int type, unsigned int portal) + lnet_nid_t dst, unsigned int type, unsigned int portal, + enum lnet_msg_hstatus *hstatus) { struct lnet_fault_attr *attr = &rule->dr_attr; bool drop; @@ -308,9 +353,23 @@ drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, if (!lnet_fault_attr_match(attr, src, dst, type, portal)) return false; + /* + * if we're trying to match 
a health status error but it hasn't + * been set in the rule, then don't match + */ + if ((hstatus && !attr->u.drop.da_health_error_mask) || + (!hstatus && attr->u.drop.da_health_error_mask)) + return false; + /* match this rule, check drop rate now */ spin_lock(&rule->dr_lock); - if (rule->dr_drop_time != 0) { /* time based drop */ + if (attr->u.drop.da_random) { + int value = cfs_rand() % attr->u.drop.da_interval; + if (value >= (attr->u.drop.da_interval / 2)) + drop = true; + else + drop = false; + } else if (rule->dr_drop_time != 0) { /* time based drop */ time64_t now = ktime_get_seconds(); rule->dr_stat.fs_count++; @@ -344,6 +403,9 @@ drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, } if (drop) { /* drop this message, update counters */ + if (hstatus) + lnet_fault_match_health(hstatus, + attr->u.drop.da_health_error_mask); lnet_fault_stat_inc(&rule->dr_stat, type); rule->dr_stat.u.drop.ds_dropped++; } @@ -356,15 +418,15 @@ drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, * Check if message from \a src to \a dst can match any existed drop rule */ bool -lnet_drop_rule_match(struct lnet_hdr *hdr) +lnet_drop_rule_match(struct lnet_hdr *hdr, enum lnet_msg_hstatus *hstatus) { - struct lnet_drop_rule *rule; - lnet_nid_t src = le64_to_cpu(hdr->src_nid); - lnet_nid_t dst = le64_to_cpu(hdr->dest_nid); - unsigned int typ = le32_to_cpu(hdr->type); - unsigned int ptl = -1; - bool drop = false; - int cpt; + lnet_nid_t src = le64_to_cpu(hdr->src_nid); + lnet_nid_t dst = le64_to_cpu(hdr->dest_nid); + unsigned int typ = le32_to_cpu(hdr->type); + struct lnet_drop_rule *rule; + unsigned int ptl = -1; + bool drop = false; + int cpt; /* NB: if Portal is specified, then only PUT and GET will be * filtered by drop rule */ @@ -375,12 +437,13 @@ lnet_drop_rule_match(struct lnet_hdr *hdr) cpt = lnet_net_lock_current(); list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) { - drop = drop_rule_match(rule, src, dst, typ, ptl); + drop = drop_rule_match(rule, 
src, dst, typ, ptl, + hstatus); if (drop) break; } - lnet_net_unlock(cpt); + return drop; } diff --git a/lustre/utils/lctl.c b/lustre/utils/lctl.c index c9632b9..2a2939b 100644 --- a/lustre/utils/lctl.c +++ b/lustre/utils/lctl.c @@ -117,7 +117,8 @@ command_t cmdlist[] = { " <<-r | --rate DROP_RATE> |\n" " <-i | --interval SECONDS>>\n" " [<-p | --portal> PORTAL...]\n" - " [<-m | --message> ...]\n"}, + " [<-m | --message> ...]\n" + " [< -e | --health_error]\n"}, {"net_drop_del", jt_ptl_drop_del, 0, "remove LNet drop rule\n" "usage: net_drop_del <[-a | --all] |\n" " <-s | --source NID>\n" diff --git a/lustre/utils/portals.c b/lustre/utils/portals.c index e544588..9349b6b 100644 --- a/lustre/utils/portals.c +++ b/lustre/utils/portals.c @@ -1315,6 +1315,57 @@ fault_attr_ptl_parse(char *ptl_str, __u64 *mask_p) } static int +fault_attr_health_error_parse(char *error, __u32 *mask) +{ + if (!strcasecmp(error, "local_interrupt")) { + *mask |= HSTATUS_LOCAL_INTERRUPT_BIT; + return 0; + } + if (!strcasecmp(error, "local_dropped")) { + *mask |= HSTATUS_LOCAL_DROPPED_BIT; + return 0; + } + if (!strcasecmp(error, "local_aborted")) { + *mask |= HSTATUS_LOCAL_ABORTED_BIT; + return 0; + } + if (!strcasecmp(error, "local_no_route")) { + *mask |= HSTATUS_LOCAL_NO_ROUTE_BIT; + return 0; + } + if (!strcasecmp(error, "local_error")) { + *mask |= HSTATUS_LOCAL_ERROR_BIT; + return 0; + } + if (!strcasecmp(error, "local_timeout")) { + *mask |= HSTATUS_LOCAL_TIMEOUT_BIT; + return 0; + } + if (!strcasecmp(error, "remote_error")) { + *mask |= HSTATUS_REMOTE_ERROR_BIT; + return 0; + } + if (!strcasecmp(error, "remote_dropped")) { + *mask |= HSTATUS_REMOTE_DROPPED_BIT; + return 0; + } + if (!strcasecmp(error, "remote_timeout")) { + *mask |= HSTATUS_REMOTE_TIMEOUT_BIT; + return 0; + } + if (!strcasecmp(error, "network_timeout")) { + *mask |= HSTATUS_NETWORK_TIMEOUT_BIT; + return 0; + } + if (!strcasecmp(error, "random")) { + *mask = HSTATUS_RANDOM; + return 0; + } + + return -1; +} + +static int 
fault_simul_rule_add(__u32 opc, char *name, int argc, char **argv) { struct libcfs_ioctl_data data = { { 0 } }; @@ -1327,9 +1378,11 @@ fault_simul_rule_add(__u32 opc, char *name, int argc, char **argv) { .name = "dest", .has_arg = required_argument, .val = 'd' }, { .name = "rate", .has_arg = required_argument, .val = 'r' }, { .name = "interval", .has_arg = required_argument, .val = 'i' }, + { .name = "random", .has_arg = no_argument, .val = 'n' }, { .name = "latency", .has_arg = required_argument, .val = 'l' }, { .name = "portal", .has_arg = required_argument, .val = 'p' }, { .name = "message", .has_arg = required_argument, .val = 'm' }, + { .name = "health_error", .has_arg = required_argument, .val = 'e' }, { .name = NULL } }; if (argc == 1) { @@ -1338,7 +1391,7 @@ fault_simul_rule_add(__u32 opc, char *name, int argc, char **argv) return -1; } - optstr = opc == LNET_CTL_DROP_ADD ? "s:d:r:i:p:m:" : "s:d:r:l:p:m:"; + optstr = opc == LNET_CTL_DROP_ADD ? "s:d:r:i:p:m:e:n" : "s:d:r:l:p:m:"; memset(&attr, 0, sizeof(attr)); while (1) { char c = getopt_long(argc, argv, optstr, opts, NULL); @@ -1366,6 +1419,20 @@ fault_simul_rule_add(__u32 opc, char *name, int argc, char **argv) attr.u.delay.la_rate = strtoul(optarg, NULL, 0); break; + case 'e': + if (opc == LNET_CTL_DROP_ADD) { + rc = fault_attr_health_error_parse(optarg, + &attr.u.drop.da_health_error_mask); + if (rc) + goto getopt_failed; + } + break; + + case 'n': + if (opc == LNET_CTL_DROP_ADD) + attr.u.drop.da_random = true; + break; + case 'i': /* time interval (# seconds) for message drop */ if (opc == LNET_CTL_DROP_ADD) attr.u.drop.da_interval = strtoul(optarg, @@ -1408,6 +1475,12 @@ fault_simul_rule_add(__u32 opc, char *name, int argc, char **argv) "but not both at the same time.\n"); return -1; } + + if (attr.u.drop.da_random && + attr.u.drop.da_interval == 0) { + fprintf(stderr, "please provide an interval to randomize\n"); + return -1; + } } else if (opc == LNET_CTL_DELAY_ADD) { if (!((attr.u.delay.la_rate == 
0) ^ (attr.u.delay.la_interval == 0))) {