X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lnet%2Flnet%2Fnet_fault.c;h=56365fd6c148aa03c5368fc2339f8f433dcfce05;hb=1477027d073be1efb6ffbb368ed68b97a65c72f1;hp=05daed27518090924f54313a1f90477b608c04a1;hpb=782ff36d50c77652d0358dc2d0bbddf81fac8759;p=fs%2Flustre-release.git diff --git a/lnet/lnet/net_fault.c b/lnet/lnet/net_fault.c index 05daed2..56365fd6 100644 --- a/lnet/lnet/net_fault.c +++ b/lnet/lnet/net_fault.c @@ -36,6 +36,7 @@ #define DEBUG_SUBSYSTEM S_LNET +#include #include #include @@ -79,10 +80,12 @@ lnet_fault_nid_match(lnet_nid_t nid, lnet_nid_t msg_nid) static bool lnet_fault_attr_match(struct lnet_fault_attr *attr, lnet_nid_t src, - lnet_nid_t dst, unsigned int type, unsigned int portal) + lnet_nid_t local_nid, lnet_nid_t dst, + unsigned int type, unsigned int portal) { if (!lnet_fault_nid_match(attr->fa_src, src) || - !lnet_fault_nid_match(attr->fa_dst, dst)) + !lnet_fault_nid_match(attr->fa_dst, dst) || + !lnet_fault_nid_match(attr->fa_local_nid, local_nid)) return false; if (!(attr->fa_msg_mask & (1 << type))) @@ -172,9 +175,9 @@ lnet_drop_rule_add(struct lnet_fault_attr *attr) if (attr->u.drop.da_interval != 0) { rule->dr_time_base = ktime_get_seconds() + attr->u.drop.da_interval; rule->dr_drop_time = ktime_get_seconds() + - cfs_rand() % attr->u.drop.da_interval; + prandom_u32_max(attr->u.drop.da_interval); } else { - rule->dr_drop_at = cfs_rand() % attr->u.drop.da_rate; + rule->dr_drop_at = prandom_u32_max(attr->u.drop.da_rate); } lnet_net_lock(LNET_LOCK_EX); @@ -199,12 +202,10 @@ lnet_drop_rule_del(lnet_nid_t src, lnet_nid_t dst) { struct lnet_drop_rule *rule; struct lnet_drop_rule *tmp; - struct list_head zombies; - int n = 0; + LIST_HEAD(zombies); + int n = 0; ENTRY; - INIT_LIST_HEAD(&zombies); - lnet_net_lock(LNET_LOCK_EX); list_for_each_entry_safe(rule, tmp, &the_lnet.ln_drop_rules, dr_link) { if (rule->dr_attr.fa_src != src && src != 0) @@ -281,10 +282,10 @@ lnet_drop_rule_reset(void) memset(&rule->dr_stat, 0, sizeof(rule->dr_stat)); if (attr->u.drop.da_rate != 0) { - rule->dr_drop_at = cfs_rand() % attr->u.drop.da_rate; + rule->dr_drop_at = prandom_u32_max(attr->u.drop.da_rate); } else { rule->dr_drop_time = ktime_get_seconds() + - cfs_rand() % attr->u.drop.da_interval; + prandom_u32_max(attr->u.drop.da_interval); rule->dr_time_base = ktime_get_seconds() + attr->u.drop.da_interval; } spin_unlock(&rule->dr_lock); @@ -294,23 +295,87 @@ lnet_drop_rule_reset(void) EXIT; } +static void +lnet_fault_match_health(enum lnet_msg_hstatus *hstatus, __u32 mask) +{ + int choice; + int delta; + int best_delta; + int i; + + /* assign a random failure */ + choice = prandom_u32_max(LNET_MSG_STATUS_END - LNET_MSG_STATUS_OK); + if (choice == 0) + choice++; + + if (mask == HSTATUS_RANDOM) { + *hstatus = choice; + return; + } + + if (mask & (1 << choice)) { + *hstatus = choice; + return; + } + + /* round to the closest ON bit */ + i = HSTATUS_END; + best_delta = HSTATUS_END; + while (i > 0) { + if (mask & (1 << i)) { + delta = choice - i; + if (delta < 0) + delta *= -1; + if (delta < best_delta) { + best_delta = delta; + choice = i; + } + } + i--; + } + + *hstatus = choice; +} + /** * check source/destination NID, portal, message type and drop rate, * decide whether should drop this message or not */ static bool drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, - lnet_nid_t dst, unsigned int type, unsigned int portal) + lnet_nid_t local_nid, lnet_nid_t dst, + unsigned int type, unsigned int portal, + enum lnet_msg_hstatus *hstatus) { struct lnet_fault_attr *attr = &rule->dr_attr; bool drop; - if (!lnet_fault_attr_match(attr, src, dst, type, portal)) + if (!lnet_fault_attr_match(attr, src, local_nid, dst, type, portal)) + return false; + + if (attr->u.drop.da_drop_all) { + CDEBUG(D_NET, "set to drop all messages\n"); + drop = true; + goto drop_matched; + } + + /* + * if we're trying to match a health status error but it hasn't + * been set in the rule, then don't match + */ + if ((hstatus && !attr->u.drop.da_health_error_mask) || + (!hstatus && attr->u.drop.da_health_error_mask)) return false; /* match this rule, check drop rate now */ spin_lock(&rule->dr_lock); - if (rule->dr_drop_time != 0) { /* time based drop */ + if (attr->u.drop.da_random) { + int value = prandom_u32_max(attr->u.drop.da_interval); + if (value >= (attr->u.drop.da_interval / 2)) + drop = true; + else + drop = false; + } else if (rule->dr_drop_time != 0) { /* time based drop */ time64_t now = ktime_get_seconds(); rule->dr_stat.fs_count++; @@ -320,7 +385,7 @@ drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, rule->dr_time_base = now; rule->dr_drop_time = rule->dr_time_base + - cfs_rand() % attr->u.drop.da_interval; + prandom_u32_max(attr->u.drop.da_interval); rule->dr_time_base += attr->u.drop.da_interval; CDEBUG(D_NET, "Drop Rule %s->%s: next drop : %lld\n", @@ -336,14 +401,19 @@ drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, count = rule->dr_stat.fs_count; if (do_div(count, attr->u.drop.da_rate) == 0) { rule->dr_drop_at = rule->dr_stat.fs_count + - cfs_rand() % attr->u.drop.da_rate; + prandom_u32_max(attr->u.drop.da_rate); CDEBUG(D_NET, "Drop Rule %s->%s: next drop: %lu\n", libcfs_nid2str(attr->fa_src), libcfs_nid2str(attr->fa_dst), rule->dr_drop_at); } } +drop_matched: + if (drop) { /* drop this message, update counters */ + if (hstatus) + lnet_fault_match_health(hstatus, + attr->u.drop.da_health_error_mask); lnet_fault_stat_inc(&rule->dr_stat, type); rule->dr_stat.u.drop.ds_dropped++; } @@ -356,15 +426,17 @@ drop_rule_match(struct lnet_drop_rule *rule, lnet_nid_t src, * Check if message from \a src to \a dst can match any existed drop rule */ bool -lnet_drop_rule_match(struct lnet_hdr *hdr) +lnet_drop_rule_match(struct lnet_hdr *hdr, + lnet_nid_t local_nid, + enum lnet_msg_hstatus *hstatus) { - struct lnet_drop_rule *rule; - lnet_nid_t src = le64_to_cpu(hdr->src_nid); - lnet_nid_t dst = le64_to_cpu(hdr->dest_nid); - unsigned int typ = le32_to_cpu(hdr->type); - unsigned int ptl = -1; - bool drop = false; - int cpt; + lnet_nid_t src = le64_to_cpu(hdr->src_nid); + lnet_nid_t dst = le64_to_cpu(hdr->dest_nid); + unsigned int typ = le32_to_cpu(hdr->type); + struct lnet_drop_rule *rule; + unsigned int ptl = -1; + bool drop = false; + int cpt; /* NB: if Portal is specified, then only PUT and GET will be * filtered by drop rule */ @@ -375,12 +447,13 @@ lnet_drop_rule_match(struct lnet_hdr *hdr) cpt = lnet_net_lock_current(); list_for_each_entry(rule, &the_lnet.ln_drop_rules, dr_link) { - drop = drop_rule_match(rule, src, dst, typ, ptl); + drop = drop_rule_match(rule, src, local_nid, dst, typ, ptl, + hstatus); if (drop) break; } - lnet_net_unlock(cpt); + return drop; } @@ -412,8 +485,8 @@ struct lnet_delay_rule { time64_t dl_delay_time; /** baseline to caculate dl_delay_time */ time64_t dl_time_base; - /** jiffies to send the next delayed message */ - unsigned long dl_msg_send; + /** seconds until we send the next delayed message */ + time64_t dl_msg_send; /** delayed message list */ struct list_head dl_msg_list; /** statistic of delayed messages */ @@ -465,7 +538,8 @@ delay_rule_match(struct lnet_delay_rule *rule, lnet_nid_t src, struct lnet_fault_attr *attr = &rule->dl_attr; bool delay; - if (!lnet_fault_attr_match(attr, src, dst, type, portal)) + if (!lnet_fault_attr_match(attr, src, LNET_NID_ANY, + dst, type, portal)) return false; /* match this rule, check delay rate now */ @@ -480,7 +554,7 @@ delay_rule_match(struct lnet_delay_rule *rule, lnet_nid_t src, rule->dl_time_base = now; rule->dl_delay_time = rule->dl_time_base + - cfs_rand() % attr->u.delay.la_interval; + prandom_u32_max(attr->u.delay.la_interval); rule->dl_time_base += attr->u.delay.la_interval; CDEBUG(D_NET, "Delay Rule %s->%s: next delay : %lld\n", @@ -497,7 +571,7 @@ delay_rule_match(struct lnet_delay_rule *rule, lnet_nid_t src, count = rule->dl_stat.fs_count; if (do_div(count, attr->u.delay.la_rate) == 0) { rule->dl_delay_at = rule->dl_stat.fs_count + - cfs_rand() % attr->u.delay.la_rate; + prandom_u32_max(attr->u.delay.la_rate); CDEBUG(D_NET, "Delay Rule %s->%s: next delay: %lu\n", libcfs_nid2str(attr->fa_src), libcfs_nid2str(attr->fa_dst), rule->dl_delay_at); @@ -517,7 +591,8 @@ delay_rule_match(struct lnet_delay_rule *rule, lnet_nid_t src, msg->msg_delay_send = ktime_get_seconds() + attr->u.delay.la_latency; if (rule->dl_msg_send == -1) { rule->dl_msg_send = msg->msg_delay_send; - mod_timer(&rule->dl_timer, rule->dl_msg_send); + mod_timer(&rule->dl_timer, + jiffies + cfs_time_seconds(rule->dl_msg_send)); } spin_unlock(&rule->dl_lock); @@ -585,7 +660,8 @@ delayed_msg_check(struct lnet_delay_rule *rule, bool all, msg = list_entry(rule->dl_msg_list.next, struct lnet_msg, msg_list); rule->dl_msg_send = msg->msg_delay_send; - mod_timer(&rule->dl_timer, rule->dl_msg_send); + mod_timer(&rule->dl_timer, + jiffies + cfs_time_seconds(rule->dl_msg_send)); } spin_unlock(&rule->dl_lock); } @@ -625,6 +701,7 @@ delayed_msg_process(struct list_head *msg_list, bool drop) case LNET_CREDIT_OK: lnet_ni_recv(ni, msg->msg_private, msg, 0, 0, msg->msg_len, msg->msg_len); + /* fallthrough */ case LNET_CREDIT_WAIT: continue; default: /* failures */ @@ -645,10 +722,9 @@ delayed_msg_process(struct list_head *msg_list, bool drop) void lnet_delay_rule_check(void) { - struct lnet_delay_rule *rule; - struct list_head msgs; + struct lnet_delay_rule *rule; + LIST_HEAD(msgs); - INIT_LIST_HEAD(&msgs); while (1) { if (list_empty(&delay_dd.dd_sched_rules)) break; @@ -772,9 +848,9 @@ lnet_delay_rule_add(struct lnet_fault_attr *attr) rule->dl_time_base = ktime_get_seconds() + attr->u.delay.la_interval; rule->dl_delay_time = ktime_get_seconds() + - cfs_rand() % attr->u.delay.la_interval; + prandom_u32_max(attr->u.delay.la_interval); } else { - rule->dl_delay_at = cfs_rand() % attr->u.delay.la_rate; + rule->dl_delay_at = prandom_u32_max(attr->u.delay.la_rate); } rule->dl_msg_send = -1; @@ -810,16 +886,13 @@ int lnet_delay_rule_del(lnet_nid_t src, lnet_nid_t dst, bool shutdown) { struct lnet_delay_rule *rule; - struct lnet_delay_rule *tmp; - struct list_head rule_list; - struct list_head msg_list; - int n = 0; - bool cleanup; + struct lnet_delay_rule *tmp; + LIST_HEAD(rule_list); + LIST_HEAD(msg_list); + int n = 0; + bool cleanup; ENTRY; - INIT_LIST_HEAD(&rule_list); - INIT_LIST_HEAD(&msg_list); - if (shutdown) src = dst = 0; @@ -921,10 +994,10 @@ lnet_delay_rule_reset(void) memset(&rule->dl_stat, 0, sizeof(rule->dl_stat)); if (attr->u.delay.la_rate != 0) { - rule->dl_delay_at = cfs_rand() % attr->u.delay.la_rate; + rule->dl_delay_at = prandom_u32_max(attr->u.delay.la_rate); } else { rule->dl_delay_time = ktime_get_seconds() + - cfs_rand() % attr->u.delay.la_interval; + prandom_u32_max(attr->u.delay.la_interval); rule->dl_time_base = ktime_get_seconds() + attr->u.delay.la_interval; } @@ -1002,10 +1075,10 @@ lnet_fault_ctl(int opc, struct libcfs_ioctl_data *data) int lnet_fault_init(void) { - CLASSERT(LNET_PUT_BIT == 1 << LNET_MSG_PUT); - CLASSERT(LNET_ACK_BIT == 1 << LNET_MSG_ACK); - CLASSERT(LNET_GET_BIT == 1 << LNET_MSG_GET); - CLASSERT(LNET_REPLY_BIT == 1 << LNET_MSG_REPLY); + BUILD_BUG_ON(LNET_PUT_BIT != 1 << LNET_MSG_PUT); + BUILD_BUG_ON(LNET_ACK_BIT != 1 << LNET_MSG_ACK); + BUILD_BUG_ON(LNET_GET_BIT != 1 << LNET_MSG_GET); + BUILD_BUG_ON(LNET_REPLY_BIT != 1 << LNET_MSG_REPLY); mutex_init(&delay_dd.dd_mutex); spin_lock_init(&delay_dd.dd_lock);