From: James Shimek Date: Thu, 13 Oct 2016 03:24:29 +0000 (-0500) Subject: LU-8734 gnilnd: Handle dla credits exhaustion X-Git-Tag: 2.9.54~10 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=4ae5fa81f75e66f57f1cfcaae7b692e66285d79c;p=fs%2Flustre-release.git LU-8734 gnilnd: Handle dla credits exhaustion Add delay_list so that when dla_credits are exhausted we retry sends regardless of whether there is a CQ event targeted at the connection that tried to send. Remove retry limit and rely on connection timeouts to tell us when to close a connection. Change max_retransmits such that it is the number of attempts before the connection enters the delay queue. Retransmits are now expected to occur, so remove NETERROR log and always use D_NET when logging a retransmit. Move state changes and retransmit counter changes out of the logging function. Remove log_retrans_level and replace usages with D_NET directly. Test-parameters: trivial Signed-off-by: James Shimek Signed-off-by: Chris Horn Change-Id: Id0aeb1add8e761fc7351c28a04c20ca40c51b1e1 Reviewed-on: https://review.whamcloud.com/23258 Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger Reviewed-by: Chuck Fossen Reviewed-by: James Simmons Reviewed-by: Oleg Drokin --- diff --git a/lnet/klnds/gnilnd/gnilnd.c b/lnet/klnds/gnilnd/gnilnd.c index 74d0fa2b..a41aafb 100644 --- a/lnet/klnds/gnilnd/gnilnd.c +++ b/lnet/klnds/gnilnd/gnilnd.c @@ -266,6 +266,7 @@ kgnilnd_create_conn(kgn_conn_t **connp, kgn_device_t *dev) INIT_LIST_HEAD(&conn->gnc_schedlist); INIT_LIST_HEAD(&conn->gnc_fmaq); INIT_LIST_HEAD(&conn->gnc_mdd_list); + INIT_LIST_HEAD(&conn->gnc_delaylist); spin_lock_init(&conn->gnc_list_lock); spin_lock_init(&conn->gnc_tx_lock); conn->gnc_magic = GNILND_CONN_MAGIC; @@ -451,8 +452,9 @@ kgnilnd_destroy_conn(kgn_conn_t *conn) list_empty(&conn->gnc_hashlist) && list_empty(&conn->gnc_schedlist) && list_empty(&conn->gnc_mdd_list) && + list_empty(&conn->gnc_delaylist) && conn->gnc_magic == GNILND_CONN_MAGIC, - 
"conn 0x%p->%s IRQ %d sched %d purg %d ep 0x%p Mg %d lists %d/%d/%d/%d\n", + "conn 0x%p->%s IRQ %d sched %d purg %d ep 0x%p Mg %d lists %d/%d/%d/%d/%d\n", conn, conn->gnc_peer ? libcfs_nid2str(conn->gnc_peer->gnp_nid) : "", !!in_interrupt(), conn->gnc_scheduled, @@ -462,7 +464,8 @@ kgnilnd_destroy_conn(kgn_conn_t *conn) list_empty(&conn->gnc_list), list_empty(&conn->gnc_hashlist), list_empty(&conn->gnc_schedlist), - list_empty(&conn->gnc_mdd_list)); + list_empty(&conn->gnc_mdd_list), + list_empty(&conn->gnc_delaylist)); /* Tripping these is especially bad, as it means we have items on the * lists that didn't keep their refcount on the connection - or @@ -738,6 +741,11 @@ kgnilnd_complete_closed_conn(kgn_conn_t *conn) kgnilnd_conn_state2str(conn)); LASSERT(list_empty(&conn->gnc_hashlist)); + /* We shouldnt be on the delay list, the conn can + * get added to this list during a retransmit, and retransmits + * only occur within scheduler threads. + */ + LASSERT(list_empty(&conn->gnc_delaylist)); /* we've sent the close, start nuking */ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SCHEDULE_COMPLETE)) @@ -2114,9 +2122,12 @@ kgnilnd_dev_fini(kgn_device_t *dev) /* At quiesce or rest time, need to loop through and clear gnd_ready_conns ?*/ LASSERTF(list_empty(&dev->gnd_ready_conns) && list_empty(&dev->gnd_map_tx) && - list_empty(&dev->gnd_rdmaq), - "dev 0x%p ready_conns %d@0x%p map_tx %d@0x%p rdmaq %d@0x%p\n", + list_empty(&dev->gnd_rdmaq) && + list_empty(&dev->gnd_delay_conns), + "dev 0x%p ready_conns %d@0x%p delay_conns %d@0x%p" + "map_tx %d@0x%p rdmaq %d@0x%p\n", dev, kgnilnd_count_list(&dev->gnd_ready_conns), &dev->gnd_ready_conns, + kgnilnd_count_list(&dev->gnd_delay_conns), &dev->gnd_delay_conns, kgnilnd_count_list(&dev->gnd_map_tx), &dev->gnd_map_tx, kgnilnd_count_list(&dev->gnd_rdmaq), &dev->gnd_rdmaq); @@ -2240,6 +2251,7 @@ int kgnilnd_base_startup(void) dev->gnd_id = i; INIT_LIST_HEAD(&dev->gnd_ready_conns); + INIT_LIST_HEAD(&dev->gnd_delay_conns); 
INIT_LIST_HEAD(&dev->gnd_map_tx); INIT_LIST_HEAD(&dev->gnd_fma_buffs); mutex_init(&dev->gnd_cq_mutex); diff --git a/lnet/klnds/gnilnd/gnilnd.h b/lnet/klnds/gnilnd/gnilnd.h index dd9669c..9589ae8 100644 --- a/lnet/klnds/gnilnd/gnilnd.h +++ b/lnet/klnds/gnilnd/gnilnd.h @@ -462,7 +462,7 @@ typedef struct kgn_tunables { int *kgn_bte_relaxed_ordering; /* relaxed ordering (PASSPW) on BTE transfers */ int *kgn_ptag; /* PTAG for cdm_create */ int *kgn_pkey; /* PKEY for cdm_create */ - int *kgn_max_retransmits; /* max number of FMA retransmits */ + int *kgn_max_retransmits; /* max number of FMA retransmits before entering delay list */ int *kgn_nwildcard; /* # wildcard per net to post */ int *kgn_nice; /* nice value for kgnilnd threads */ int *kgn_rdmaq_intervals; /* # intervals per second for rdmaq throttle */ @@ -541,6 +541,7 @@ typedef struct kgn_device { atomic_t gnd_neps; /* # EP allocated to conns */ short gnd_ready; /* stuff to do in scheduler thread */ struct list_head gnd_ready_conns; /* connections ready to tx/rx */ + struct list_head gnd_delay_conns; /* connections in need of dla/or smsg credits */ struct list_head gnd_map_tx; /* TX: needing buffer mapping */ wait_queue_head_t gnd_waitq; /* scheduler wakeup */ spinlock_t gnd_lock; /* serialise gnd_ready_conns */ @@ -706,6 +707,7 @@ typedef struct kgn_conn { struct list_head gnc_schedlist; /* schedule (on gnd_?_conns) for attention */ struct list_head gnc_fmaq; /* txs queued for FMA */ struct list_head gnc_mdd_list; /* hold list for MDD on hard conn reset */ + struct list_head gnc_delaylist; /* If on this list schedule anytime we get interrupted */ __u64 gnc_peerstamp; /* peer's unique stamp */ __u64 gnc_peer_connstamp; /* peer's unique connection stamp */ __u64 gnc_my_connstamp; /* my unique connection stamp */ @@ -879,7 +881,8 @@ extern kgn_tunables_t kgnilnd_tunables; extern void kgnilnd_destroy_peer(kgn_peer_t *peer); extern void kgnilnd_destroy_conn(kgn_conn_t *conn); -extern int 
_kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld); +extern int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld, int lock_held); +extern int _kgnilnd_schedule_delay_conn(kgn_conn_t *conn); /* Macro wrapper for _kgnilnd_schedule_conn. This will store the function * and the line of the calling function to allow us to debug problematic @@ -887,10 +890,20 @@ extern int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line * the location manually. */ #define kgnilnd_schedule_conn(conn) \ - _kgnilnd_schedule_conn(conn, __func__, __LINE__, 0); + _kgnilnd_schedule_conn(conn, __func__, __LINE__, 0, 0); #define kgnilnd_schedule_conn_refheld(conn, refheld) \ - _kgnilnd_schedule_conn(conn, __func__, __LINE__, refheld); + _kgnilnd_schedule_conn(conn, __func__, __LINE__, refheld, 0); + +#define kgnilnd_schedule_conn_nolock(conn) \ + _kgnilnd_schedule_conn(conn, __func__, __LINE__, 0, 1); + + +/* Macro wrapper for _kgnilnd_schedule_delay_conn. This will allow us to store + * extra data if we need to. 
+ */ +#define kgnilnd_schedule_delay_conn(conn) \ + _kgnilnd_schedule_delay_conn(conn); static inline void kgnilnd_thread_fini(void) @@ -1764,7 +1777,7 @@ kgn_tx_t *kgnilnd_new_tx_msg(int type, lnet_nid_t source); void kgnilnd_tx_done(kgn_tx_t *tx, int completion); void kgnilnd_txlist_done(struct list_head *txlist, int error); void kgnilnd_unlink_peer_locked(kgn_peer_t *peer); -int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld); +int _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld, int lock_held); int kgnilnd_schedule_process_conn(kgn_conn_t *conn, int sched_intent); void kgnilnd_schedule_dgram(kgn_device_t *dev); diff --git a/lnet/klnds/gnilnd/gnilnd_cb.c b/lnet/klnds/gnilnd/gnilnd_cb.c index c8b4fe6..8ff626a 100644 --- a/lnet/klnds/gnilnd/gnilnd_cb.c +++ b/lnet/klnds/gnilnd/gnilnd_cb.c @@ -151,7 +151,7 @@ kgnilnd_schedule_process_conn(kgn_conn_t *conn, int sched_intent) * as scheduled */ int -_kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld) +_kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld, int lock_held) { kgn_device_t *dev = conn->gnc_device; int sched; @@ -184,10 +184,11 @@ _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refhe conn, sched); CDEBUG(D_INFO, "scheduling conn 0x%p caller %s:%d\n", conn, caller, line); - - spin_lock(&dev->gnd_lock); + if (!lock_held) + spin_lock(&dev->gnd_lock); list_add_tail(&conn->gnc_schedlist, &dev->gnd_ready_conns); - spin_unlock(&dev->gnd_lock); + if (!lock_held) + spin_unlock(&dev->gnd_lock); set_mb(conn->gnc_last_sched_ask, jiffies); rc = 1; } else { @@ -197,6 +198,23 @@ _kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refhe /* make sure thread(s) going to process conns - but let it make * separate decision from conn schedule */ + if (!lock_held) + kgnilnd_schedule_device(dev); + return rc; +} + +int 
+_kgnilnd_schedule_delay_conn(kgn_conn_t *conn) +{ + kgn_device_t *dev = conn->gnc_device; + int rc = 0; + spin_lock(&dev->gnd_lock); + if (list_empty(&conn->gnc_delaylist)) { + list_add_tail(&conn->gnc_delaylist, &dev->gnd_delay_conns); + rc = 1; + } + spin_unlock(&dev->gnd_lock); + kgnilnd_schedule_device(dev); return rc; } @@ -1343,70 +1361,35 @@ search_again: return 0; } -static inline int -kgnilnd_tx_should_retry(kgn_conn_t *conn, kgn_tx_t *tx) +static inline void +kgnilnd_tx_log_retrans(kgn_conn_t *conn, kgn_tx_t *tx) { - int max_retrans = *kgnilnd_tunables.kgn_max_retransmits; int log_retrans; - int log_retrans_level; - /* I need kgni credits to send this. Replace tx at the head of the - * fmaq and I'll get rescheduled when credits appear */ - tx->tx_state = 0; - tx->tx_retrans++; - conn->gnc_tx_retrans++; - log_retrans = ((tx->tx_retrans < 25) || ((tx->tx_retrans % 25) == 0) || - (tx->tx_retrans > (max_retrans / 2))); - log_retrans_level = tx->tx_retrans < (max_retrans / 2) ? D_NET : D_NETERROR; - - /* Decision time - either error, warn or just retransmit */ + log_retrans = ((tx->tx_retrans < 25) || ((tx->tx_retrans % 25) == 0)); /* we don't care about TX timeout - it could be that the network is slower * or throttled. We'll keep retranmitting - so if the network is so slow * that we fill up our mailbox, we'll keep trying to resend that msg * until we exceed the max_retrans _or_ gnc_last_rx expires, indicating * that he hasn't send us any traffic in return */ - - if (tx->tx_retrans > max_retrans) { - /* this means we are not backing off the retransmits - * in a healthy manner and are likely chewing up the - * CPU cycles quite badly */ - GNIDBG_TOMSG(D_ERROR, &tx->tx_msg, - "SOFTWARE BUG: too many retransmits (%d) for tx id %x " - "conn 0x%p->%s\n", - tx->tx_retrans, tx->tx_id, conn, - libcfs_nid2str(conn->gnc_peer->gnp_nid)); - - /* yes - double errors to help debug this condition */ - GNIDBG_TOMSG(D_NETERROR, &tx->tx_msg, "connection dead. 
" - "unable to send to %s for %lu secs (%d tries)", - libcfs_nid2str(tx->tx_conn->gnc_peer->gnp_nid), - cfs_duration_sec(jiffies - tx->tx_cred_wait), - tx->tx_retrans); - - kgnilnd_close_conn(conn, -ETIMEDOUT); - - /* caller should terminate */ - RETURN(0); - } else { - /* some reasonable throttling of the debug message */ - if (log_retrans) { - unsigned long now = jiffies; - /* XXX Nic: Mystical TX debug here... */ - GNIDBG_SMSG_CREDS(log_retrans_level, conn); - GNIDBG_TOMSG(log_retrans_level, &tx->tx_msg, - "NOT_DONE on conn 0x%p->%s id %x retrans %d wait %dus" - " last_msg %uus/%uus last_cq %uus/%uus", - conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), - tx->tx_id, tx->tx_retrans, - jiffies_to_usecs(now - tx->tx_cred_wait), - jiffies_to_usecs(now - conn->gnc_last_tx), - jiffies_to_usecs(now - conn->gnc_last_rx), - jiffies_to_usecs(now - conn->gnc_last_tx_cq), - jiffies_to_usecs(now - conn->gnc_last_rx_cq)); - } - /* caller should retry */ - RETURN(1); + + /* some reasonable throttling of the debug message */ + if (log_retrans) { + unsigned long now = jiffies; + /* XXX Nic: Mystical TX debug here... 
*/ + /* We expect retransmissions so only log when D_NET is enabled */ + GNIDBG_SMSG_CREDS(D_NET, conn); + GNIDBG_TOMSG(D_NET, &tx->tx_msg, + "NOT_DONE on conn 0x%p->%s id %x retrans %d wait %dus" + " last_msg %uus/%uus last_cq %uus/%uus", + conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), + tx->tx_id, tx->tx_retrans, + jiffies_to_usecs(now - tx->tx_cred_wait), + jiffies_to_usecs(now - conn->gnc_last_tx), + jiffies_to_usecs(now - conn->gnc_last_rx), + jiffies_to_usecs(now - conn->gnc_last_tx_cq), + jiffies_to_usecs(now - conn->gnc_last_rx_cq)); } } @@ -1419,7 +1402,6 @@ kgnilnd_sendmsg_nolock(kgn_tx_t *tx, void *immediate, unsigned int immediatenob, { kgn_conn_t *conn = tx->tx_conn; kgn_msg_t *msg = &tx->tx_msg; - int retry_send; gni_return_t rrc; unsigned long newest_last_rx, timeout; unsigned long now; @@ -1529,9 +1511,11 @@ kgnilnd_sendmsg_nolock(kgn_tx_t *tx, void *immediate, unsigned int immediatenob, return 0; case GNI_RC_NOT_DONE: - /* XXX Nic: We need to figure out how to track this - * - there are bound to be good reasons for it, - * but we want to know when it happens */ + /* Jshimek: We can get GNI_RC_NOT_DONE for 3 reasons currently + * 1: out of mbox credits + * 2: out of mbox payload credits + * 3: On Aries out of dla credits + */ kgnilnd_conn_mutex_unlock(&conn->gnc_smsg_mutex); kgnilnd_gl_mutex_unlock(&conn->gnc_device->gnd_cq_mutex); /* We'll handle this error inline - makes the calling logic much more @@ -1542,31 +1526,36 @@ kgnilnd_sendmsg_nolock(kgn_tx_t *tx, void *immediate, unsigned int immediatenob, return -EAGAIN; } - retry_send = kgnilnd_tx_should_retry(conn, tx); - if (retry_send) { - /* add to head of list for the state and retries */ - spin_lock(state_lock); - kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, state, 0); - spin_unlock(state_lock); - - /* We only reschedule for a certain number of retries, then - * we will wait for the CQ events indicating a release of SMSG - * credits */ - if (tx->tx_retrans < 
(*kgnilnd_tunables.kgn_max_retransmits/4)) { - kgnilnd_schedule_conn(conn); - return 0; - } else { - /* CQ event coming in signifies either TX completed or - * RX receive. Either of these *could* free up credits - * in the SMSG mbox and we should try sending again */ - GNIDBG_TX(D_NET, tx, "waiting for CQID %u event to resend", - tx->tx_conn->gnc_cqid); - /* use +ve return code to let upper layers know they - * should stop looping on sends */ - return EAGAIN; - } + /* I need kgni credits to send this. Replace tx at the head of the + * fmaq and I'll get rescheduled when credits appear. Reset the tx_state + * and bump retrans counts since we are requeueing the tx. + */ + tx->tx_state = 0; + tx->tx_retrans++; + conn->gnc_tx_retrans++; + + kgnilnd_tx_log_retrans(conn, tx); + /* add to head of list for the state and retries */ + spin_lock(state_lock); + kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, state, 0); + spin_unlock(state_lock); + + /* We only reschedule for a certain number of retries, then + * we will wait for the CQ events indicating a release of SMSG + * credits */ + if (tx->tx_retrans < *kgnilnd_tunables.kgn_max_retransmits) { + kgnilnd_schedule_conn(conn); + return 0; } else { - return -EAGAIN; + /* CQ event coming in signifies either TX completed or + * RX receive. 
Either of these *could* free up credits + * in the SMSG mbox and we should try sending again */ + GNIDBG_TX(D_NET, tx, "waiting for CQID %u event to resend", + tx->tx_conn->gnc_cqid); + kgnilnd_schedule_delay_conn(conn); + /* use +ve return code to let upper layers know they + * should stop looping on sends */ + return EAGAIN; } default: /* handle bad retcode gracefully */ @@ -2080,6 +2069,8 @@ kgnilnd_release_msg(kgn_conn_t *conn) LASSERTF(rrc == GNI_RC_SUCCESS, "bad rrc %d\n", rrc); GNIDBG_SMSG_CREDS(D_NET, conn); + kgnilnd_schedule_conn(conn); + return; } @@ -3338,6 +3329,7 @@ kgnilnd_check_fma_send_cq(kgn_device_t *dev) kgn_conn_t *conn = NULL; int queued_fma, saw_reply, rc; long num_processed = 0; + struct list_head *ctmp, *ctmpN; for (;;) { /* make sure we don't keep looping if we need to reset */ @@ -3360,6 +3352,22 @@ kgnilnd_check_fma_send_cq(kgn_device_t *dev) "SMSG send CQ %d not ready (data %#llx) " "processed %ld\n", dev->gnd_id, event_data, num_processed); + + if (num_processed > 0) { + spin_lock(&dev->gnd_lock); + if (!list_empty(&dev->gnd_delay_conns)) { + list_for_each_safe(ctmp, ctmpN, &dev->gnd_delay_conns) { + conn = list_entry(ctmp, kgn_conn_t, gnc_delaylist); + list_del_init(&conn->gnc_delaylist); + CDEBUG(D_NET, "Moving Conn %p from delay queue to ready_queue\n", conn); + kgnilnd_schedule_conn_nolock(conn); + } + spin_unlock(&dev->gnd_lock); + kgnilnd_schedule_device(dev); + } else { + spin_unlock(&dev->gnd_lock); + } + } return num_processed; } @@ -4900,6 +4908,12 @@ kgnilnd_process_conns(kgn_device_t *dev, unsigned long deadline) conn = list_first_entry(&dev->gnd_ready_conns, kgn_conn_t, gnc_schedlist); list_del_init(&conn->gnc_schedlist); + /* + * Since we are processing conn now, we don't need to be on the delaylist any longer. 
+ */ + + if (!list_empty(&conn->gnc_delaylist)) + list_del_init(&conn->gnc_delaylist); spin_unlock(&dev->gnd_lock); conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_PROCESS); @@ -4926,7 +4940,7 @@ kgnilnd_process_conns(kgn_device_t *dev, unsigned long deadline) kgnilnd_conn_decref(conn); up_write(&dev->gnd_conn_sem); } else if (rc != 1) { - kgnilnd_conn_decref(conn); + kgnilnd_conn_decref(conn); } /* clear this so that scheduler thread doesn't spin */ found_work = 0; @@ -4977,7 +4991,7 @@ kgnilnd_process_conns(kgn_device_t *dev, unsigned long deadline) kgnilnd_conn_decref(conn); up_write(&dev->gnd_conn_sem); } else if (rc != 1) { - kgnilnd_conn_decref(conn); + kgnilnd_conn_decref(conn); } /* check list again with lock held */ diff --git a/lnet/klnds/gnilnd/gnilnd_modparams.c b/lnet/klnds/gnilnd/gnilnd_modparams.c index c4d6458..68659d1 100644 --- a/lnet/klnds/gnilnd/gnilnd_modparams.c +++ b/lnet/klnds/gnilnd/gnilnd_modparams.c @@ -102,9 +102,10 @@ static int pkey = GNI_JOB_CREATE_COOKIE(GNI_PKEY_LND, 0); module_param(pkey, int, 0444); MODULE_PARM_DESC(pkey, "pkey for CDM"); -static int max_retransmits = 1024; +static int max_retransmits = 128; module_param(max_retransmits, int, 0444); -MODULE_PARM_DESC(max_retransmits, "max retransmits for FMA"); +MODULE_PARM_DESC(max_retransmits, + "max retransmits for FMA before entering delay queue"); static int nwildcard = 4; module_param(nwildcard, int, 0444); diff --git a/lnet/klnds/gnilnd/gnilnd_stack.c b/lnet/klnds/gnilnd/gnilnd_stack.c index 68024e1..bdec685 100644 --- a/lnet/klnds/gnilnd/gnilnd_stack.c +++ b/lnet/klnds/gnilnd/gnilnd_stack.c @@ -254,6 +254,9 @@ kgnilnd_reset_stack(void) list_del_init(&conn->gnc_schedlist); + if (!list_empty(&conn->gnc_delaylist)) + list_del_init(&conn->gnc_delaylist); + if (conn->gnc_state == GNILND_CONN_CLOSING) { /* bump to CLOSED to fake out send of CLOSE */ conn->gnc_state = GNILND_CONN_CLOSED;