void lnet_detach_rsp_tracker(struct lnet_libmd *md, int cpt);
void lnet_finalize(struct lnet_msg *msg, int rc);
+bool lnet_send_error_simulation(struct lnet_msg *msg,
+ enum lnet_msg_hstatus *hstatus);
void lnet_drop_message(struct lnet_ni *ni, int cpt, void *private,
unsigned int nob, __u32 msg_type);
int lnet_fault_init(void);
void lnet_fault_fini(void);
-bool lnet_drop_rule_match(struct lnet_hdr *hdr);
+bool lnet_drop_rule_match(struct lnet_hdr *hdr, enum lnet_msg_hstatus *hstatus);
int lnet_delay_rule_add(struct lnet_fault_attr *attr);
int lnet_delay_rule_del(lnet_nid_t src, lnet_nid_t dst, bool shutdown);
LNET_MSG_STATUS_REMOTE_ERROR,
LNET_MSG_STATUS_REMOTE_DROPPED,
LNET_MSG_STATUS_REMOTE_TIMEOUT,
- LNET_MSG_STATUS_NETWORK_TIMEOUT
+ LNET_MSG_STATUS_NETWORK_TIMEOUT,
+ LNET_MSG_STATUS_END,
};
struct lnet_rsp_tracker {
# include <linux/lnet/lnet-types.h>
#endif
+#include <stdbool.h>
+
/** \addtogroup lnet_fault_simulation
* @{ */
#define LNET_GET_BIT (1 << 2)
#define LNET_REPLY_BIT (1 << 3)
+#define HSTATUS_END 11 /* number of bit positions (0..10) spanned by the HSTATUS_*_BIT flags; used as the scan bound in lnet_fault_match_health() */
+#define HSTATUS_LOCAL_INTERRUPT_BIT (1 << 1) /* bit N of the mask selects health status value N: lnet_fault_match_health() tests (mask & (1 << status)) */
+#define HSTATUS_LOCAL_DROPPED_BIT (1 << 2) /* set by the "local_dropped" keyword */
+#define HSTATUS_LOCAL_ABORTED_BIT (1 << 3) /* set by the "local_aborted" keyword */
+#define HSTATUS_LOCAL_NO_ROUTE_BIT (1 << 4) /* set by the "local_no_route" keyword */
+#define HSTATUS_LOCAL_ERROR_BIT (1 << 5) /* set by the "local_error" keyword */
+#define HSTATUS_LOCAL_TIMEOUT_BIT (1 << 6) /* set by the "local_timeout" keyword */
+#define HSTATUS_REMOTE_ERROR_BIT (1 << 7) /* set by the "remote_error" keyword */
+#define HSTATUS_REMOTE_DROPPED_BIT (1 << 8) /* set by the "remote_dropped" keyword */
+#define HSTATUS_REMOTE_TIMEOUT_BIT (1 << 9) /* set by the "remote_timeout" keyword */
+#define HSTATUS_NETWORK_TIMEOUT_BIT (1 << 10) /* set by the "network_timeout" keyword */
+#define HSTATUS_RANDOM 0xffffffff /* whole-mask value (not a single bit): lnet_fault_match_health() then returns a randomly chosen status */
+
/** ioctl parameter for LNet fault simulation */
struct lnet_fault_attr {
/**
* with da_rate
*/
__u32 da_interval;
+ /** error type mask */
+ __u32 da_health_error_mask;
+ /** randomize error generation */
+ bool da_random;
} drop;
/** message latency simulation */
struct {
libcfs_nid2str(conn->ibc_peer->ibp_nid));
bad = NULL;
- rc = ib_post_send(conn->ibc_cmid->qp, wr, &bad);
+ if (lnet_send_error_simulation(tx->tx_lntmsg[0], &tx->tx_hstatus))
+ rc = -EINVAL;
+ else
+ rc = ib_post_send(conn->ibc_cmid->qp, wr, &bad);
}
conn->ibc_last_send = ktime_get();
if (!rc && (tx->tx_resid != 0 || tx->tx_zc_aborted)) {
rc = -EIO;
- hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
+ if (hstatus == LNET_MSG_STATUS_OK)
+ hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
}
if (tx->tx_conn != NULL)
ksocknal_process_transmit(struct ksock_conn *conn, struct ksock_tx *tx)
{
int rc;
+ bool error_sim = false;
+
+ if (lnet_send_error_simulation(tx->tx_lnetmsg, &tx->tx_hstatus)) {
+ error_sim = true;
+ rc = -EINVAL;
+ goto simulate_error;
+ }
if (tx->tx_zc_capable && !tx->tx_zc_checked)
ksocknal_check_zc_req(tx);
return (rc);
}
+simulate_error:
+
/* Actual error */
LASSERT(rc < 0);
- /*
- * set the health status of the message which determines
- * whether we should retry the transmit
- */
- if (rc == -ETIMEDOUT)
- tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_TIMEOUT;
- else
- tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
+ if (!error_sim) {
+ /*
+ * set the health status of the message which determines
+ * whether we should retry the transmit
+ */
+ if (rc == -ETIMEDOUT)
+ tx->tx_hstatus = LNET_MSG_STATUS_REMOTE_TIMEOUT;
+ else
+ tx->tx_hstatus = LNET_MSG_STATUS_LOCAL_ERROR;
+ }
if (!conn->ksnc_closing) {
switch (rc) {
lnet_ni_send(struct lnet_ni *ni, struct lnet_msg *msg)
{
void *priv = msg->msg_private;
- int rc;
+ int rc;
LASSERT (!in_interrupt ());
LASSERT (LNET_NETTYP(LNET_NIDNET(ni->ni_nid)) == LOLND ||
}
if (!list_empty(&the_lnet.ln_drop_rules) &&
- lnet_drop_rule_match(hdr)) {
+ lnet_drop_rule_match(hdr, NULL)) {
CDEBUG(D_NET, "%s, src %s, dst %s: Dropping %s to simulate"
"silent message loss\n",
libcfs_nid2str(from_nid), libcfs_nid2str(src_nid),
}
}
+/**
+ * Decide whether the send of \a msg should fail with a simulated health
+ * error.
+ *
+ * The message header is checked against the_lnet.ln_drop_rules through
+ * lnet_drop_rule_match(). Because a status pointer is handed down, only
+ * rules that carry a health error mask are eligible to match.
+ *
+ * \param msg		message about to be sent; may be NULL
+ * \param hstatus	receives the simulated health status on a match
+ *
+ * \retval true		a health rule matched and \a *hstatus has been set
+ * \retval false	nothing to simulate: \a msg or \a hstatus is NULL, no
+ *			drop rules are configured, or no health rule matched
+ */
+bool
+lnet_send_error_simulation(struct lnet_msg *msg,
+			   enum lnet_msg_hstatus *hstatus)
+{
+	/*
+	 * Without somewhere to store the status there is nothing to
+	 * simulate.  A NULL hstatus would also make lnet_drop_rule_match()
+	 * consider plain drop rules instead of health rules, and it is
+	 * dereferenced for the debug message below.
+	 */
+	if (!msg || !hstatus)
+		return false;
+
+	if (list_empty(&the_lnet.ln_drop_rules))
+		return false;
+
+	/* match only health rules */
+	if (!lnet_drop_rule_match(&msg->msg_hdr, hstatus))
+		return false;
+
+	/*
+	 * NOTE(review): lnet_drop_rule_match() converts the header NIDs with
+	 * le64_to_cpu() before using them; they are printed unconverted
+	 * here - confirm whether that is intended on big-endian hosts.
+	 */
+	CDEBUG(D_NET, "src %s, dst %s: %s simulate health error: %s\n",
+		libcfs_nid2str(msg->msg_hdr.src_nid),
+		libcfs_nid2str(msg->msg_hdr.dest_nid),
+		lnet_msgtyp2str(msg->msg_type),
+		lnet_health_error2str(*hstatus));
+
+	return true;
+}
+EXPORT_SYMBOL(lnet_send_error_simulation);
+
void
lnet_finalize(struct lnet_msg *msg, int status)
{
EXIT;
}
+/**
+ * Pick the health status to simulate for a matched drop rule.
+ *
+ * A random status value is drawn first.  It is returned unchanged when
+ * \a mask is HSTATUS_RANDOM or when the bit for that value is set in
+ * \a mask.  Otherwise the status whose bit is set in \a mask and whose
+ * value is numerically closest to the random draw is returned; on a tie
+ * the higher value wins.  If \a mask has none of the bits
+ * 1 .. HSTATUS_END - 1 set, the random draw is returned as is.
+ *
+ * \param hstatus	receives the selected status
+ * \param mask		HSTATUS_*_BIT flags, or HSTATUS_RANDOM
+ */
+static void
+lnet_fault_match_health(enum lnet_msg_hstatus *hstatus, __u32 mask)
+{
+	unsigned int random;
+	int target;
+	int best;
+	int best_delta;
+	int i;
+
+	/*
+	 * assign a random failure; a draw of 0 is bumped to 1
+	 * (presumably so that LNET_MSG_STATUS_OK is never simulated)
+	 */
+	random = cfs_rand();
+	target = random % (LNET_MSG_STATUS_END - LNET_MSG_STATUS_OK);
+	if (target == 0)
+		target++;
+
+	if (mask == HSTATUS_RANDOM || (mask & (1U << target))) {
+		*hstatus = target;
+		return;
+	}
+
+	/*
+	 * Round to the closest ON bit.  The distance is always measured
+	 * from the original random draw, so the winner does not depend on
+	 * which candidates were seen earlier in the scan.  Bit HSTATUS_END
+	 * itself is not a status bit and is left out of the scan.
+	 */
+	best = target;
+	best_delta = HSTATUS_END;
+	for (i = HSTATUS_END - 1; i > 0; i--) {
+		int delta;
+
+		if (!(mask & (1U << i)))
+			continue;
+
+		delta = target > i ? target - i : i - target;
+		if (delta < best_delta) {
+			best_delta = delta;
+			best = i;
+		}
+	}
+
+	*hstatus = best;
+}
+
/**
* check source/destination NID, portal, message type and drop rate,
* decide whether should drop this message or not
*/
static bool
drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src,
- lnet_nid_t dst, unsigned int type, unsigned int portal)
+ lnet_nid_t dst, unsigned int type, unsigned int portal,
+ enum lnet_msg_hstatus *hstatus)
{
struct lnet_fault_attr *attr = &rule->dr_attr;
bool drop;
if (!lnet_fault_attr_match(attr, src, dst, type, portal))
return false;
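+	/*
+	 * Health rules and plain drop rules are kept apart: a caller asking
+	 * for a health status (hstatus != NULL) only matches rules that carry
+	 * a health error mask, and a caller that is not (hstatus == NULL)
+	 * only matches rules without one.
+	 */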
+ if ((hstatus && !attr->u.drop.da_health_error_mask) ||
+ (!hstatus && attr->u.drop.da_health_error_mask))
+ return false;
+
/* match this rule, check drop rate now */
spin_lock(&rule->dr_lock);
- if (rule->dr_drop_time != 0) { /* time based drop */
+ if (attr->u.drop.da_random) {
+ int value = cfs_rand() % attr->u.drop.da_interval;
+ if (value >= (attr->u.drop.da_interval / 2))
+ drop = true;
+ else
+ drop = false;
+ } else if (rule->dr_drop_time != 0) { /* time based drop */
time64_t now = ktime_get_seconds();
rule->dr_stat.fs_count++;
}
if (drop) { /* drop this message, update counters */
+ if (hstatus)
+ lnet_fault_match_health(hstatus,
+ attr->u.drop.da_health_error_mask);
lnet_fault_stat_inc(&rule->dr_stat, type);
rule->dr_stat.u.drop.ds_dropped++;
}
* Check if message from \a src to \a dst can match any existed drop rule
*/
bool
-lnet_drop_rule_match(struct lnet_hdr *hdr)
+lnet_drop_rule_match(struct lnet_hdr *hdr, enum lnet_msg_hstatus *hstatus)
{
- struct lnet_drop_rule *rule;
- lnet_nid_t src = le64_to_cpu(hdr->src_nid);
- lnet_nid_t dst = le64_to_cpu(hdr->dest_nid);
- unsigned int typ = le32_to_cpu(hdr->type);
- unsigned int ptl = -1;
- bool drop = false;
- int cpt;
+ lnet_nid_t src = le64_to_cpu(hdr->src_nid);
+ lnet_nid_t dst = le64_to_cpu(hdr->dest_nid);
+ unsigned int typ = le32_to_cpu(hdr->type);
+ struct lnet_drop_rule *rule;
+ unsigned int ptl = -1;
+ bool drop = false;
+ int cpt;
/* NB: if Portal is specified, then only PUT and GET will be
* filtered by drop rule */
cpt = lnet_net_lock_current();
list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) {
- drop = drop_rule_match(rule, src, dst, typ, ptl);
+ drop = drop_rule_match(rule, src, dst, typ, ptl,
+ hstatus);
if (drop)
break;
}
-
lnet_net_unlock(cpt);
+
return drop;
}
" <<-r | --rate DROP_RATE> |\n"
" <-i | --interval SECONDS>>\n"
" [<-p | --portal> PORTAL...]\n"
- " [<-m | --message> <PUT|ACK|GET|REPLY>...]\n"},
+ " [<-m | --message> <PUT|ACK|GET|REPLY>...]\n"
+ " [< -e | --health_error]\n"},
{"net_drop_del", jt_ptl_drop_del, 0, "remove LNet drop rule\n"
"usage: net_drop_del <[-a | --all] |\n"
" <-s | --source NID>\n"
}
+/**
+ * Translate one health error keyword into its bit in \a mask.
+ *
+ * Keywords are compared case-insensitively.  Every keyword except
+ * "random" ORs its HSTATUS_*_BIT into \a *mask, so repeated calls
+ * accumulate; "random" overwrites \a *mask with HSTATUS_RANDOM.
+ *
+ * \param error	keyword to translate
+ * \param mask		mask to update
+ *
+ * \retval 0		keyword recognised, \a *mask updated
+ * \retval -1		unknown keyword, \a *mask untouched
+ */
static int
+fault_attr_health_error_parse(char *error, __u32 *mask)
+{
+	static const struct {
+		const char	*he_name;
+		__u32		 he_bit;
+	} health_errors[] = {
+		{ "local_interrupt",	HSTATUS_LOCAL_INTERRUPT_BIT },
+		{ "local_dropped",	HSTATUS_LOCAL_DROPPED_BIT },
+		{ "local_aborted",	HSTATUS_LOCAL_ABORTED_BIT },
+		{ "local_no_route",	HSTATUS_LOCAL_NO_ROUTE_BIT },
+		{ "local_error",	HSTATUS_LOCAL_ERROR_BIT },
+		{ "local_timeout",	HSTATUS_LOCAL_TIMEOUT_BIT },
+		{ "remote_error",	HSTATUS_REMOTE_ERROR_BIT },
+		{ "remote_dropped",	HSTATUS_REMOTE_DROPPED_BIT },
+		{ "remote_timeout",	HSTATUS_REMOTE_TIMEOUT_BIT },
+		{ "network_timeout",	HSTATUS_NETWORK_TIMEOUT_BIT },
+	};
+	unsigned int idx;
+
+	for (idx = 0;
+	     idx < sizeof(health_errors) / sizeof(health_errors[0]);
+	     idx++) {
+		if (strcasecmp(error, health_errors[idx].he_name) != 0)
+			continue;
+
+		*mask |= health_errors[idx].he_bit;
+		return 0;
+	}
+
+	/* "random" replaces the mask instead of adding a bit to it */
+	if (strcasecmp(error, "random") == 0) {
+		*mask = HSTATUS_RANDOM;
+		return 0;
+	}
+
+	return -1;
+}
+
+static int
fault_simul_rule_add(__u32 opc, char *name, int argc, char **argv)
{
struct libcfs_ioctl_data data = { { 0 } };
{ .name = "dest", .has_arg = required_argument, .val = 'd' },
{ .name = "rate", .has_arg = required_argument, .val = 'r' },
{ .name = "interval", .has_arg = required_argument, .val = 'i' },
+ { .name = "random", .has_arg = no_argument, .val = 'n' },
{ .name = "latency", .has_arg = required_argument, .val = 'l' },
{ .name = "portal", .has_arg = required_argument, .val = 'p' },
{ .name = "message", .has_arg = required_argument, .val = 'm' },
+ { .name = "health_error", .has_arg = required_argument, .val = 'e' },
{ .name = NULL } };
if (argc == 1) {
return -1;
}
- optstr = opc == LNET_CTL_DROP_ADD ? "s:d:r:i:p:m:" : "s:d:r:l:p:m:";
+ optstr = opc == LNET_CTL_DROP_ADD ? "s:d:r:i:p:m:e:n" : "s:d:r:l:p:m:";
memset(&attr, 0, sizeof(attr));
while (1) {
char c = getopt_long(argc, argv, optstr, opts, NULL);
attr.u.delay.la_rate = strtoul(optarg, NULL, 0);
break;
+ case 'e':
+ if (opc == LNET_CTL_DROP_ADD) {
+ rc = fault_attr_health_error_parse(optarg,
+ &attr.u.drop.da_health_error_mask);
+ if (rc)
+ goto getopt_failed;
+ }
+ break;
+
+ case 'n':
+ if (opc == LNET_CTL_DROP_ADD)
+ attr.u.drop.da_random = true;
+ break;
+
case 'i': /* time interval (# seconds) for message drop */
if (opc == LNET_CTL_DROP_ADD)
attr.u.drop.da_interval = strtoul(optarg,
"but not both at the same time.\n");
return -1;
}
+
+ if (attr.u.drop.da_random &&
+ attr.u.drop.da_interval == 0) {
+ fprintf(stderr, "please provide an interval to randomize\n");
+ return -1;
+ }
} else if (opc == LNET_CTL_DELAY_ADD) {
if (!((attr.u.delay.la_rate == 0) ^
(attr.u.delay.la_interval == 0))) {