Whamcloud - gitweb
LU-13276 lnet: Update nnis to avoid infinite loop
[fs/lustre-release.git] / lnet / lnet / api-ni.c
index 6ced2aa..f8310e4 100644 (file)
@@ -548,7 +548,6 @@ lnet_init_locks(void)
 {
        spin_lock_init(&the_lnet.ln_eq_wait_lock);
        spin_lock_init(&the_lnet.ln_msg_resend_lock);
-       init_waitqueue_head(&the_lnet.ln_eq_waitq);
        init_completion(&the_lnet.ln_mt_wait_complete);
        mutex_init(&the_lnet.ln_lnd_mutex);
 }
@@ -1679,7 +1678,7 @@ lnet_ping_target_setup(struct lnet_ping_buffer **ppbuf,
 
        if (set_eq) {
                the_lnet.ln_ping_target_eq =
-                       LNetEQAlloc(0, lnet_ping_target_event_handler);
+                       LNetEQAlloc(lnet_ping_target_event_handler);
                if (IS_ERR(the_lnet.ln_ping_target_eq)) {
                        rc = PTR_ERR(the_lnet.ln_ping_target_eq);
                        CERROR("Can't allocate ping buffer EQ: %d\n", rc);
@@ -1855,14 +1854,16 @@ int lnet_push_target_resize(void)
        struct lnet_handle_md old_mdh;
        struct lnet_ping_buffer *pbuf;
        struct lnet_ping_buffer *old_pbuf;
-       int nnis = the_lnet.ln_push_target_nnis;
+       int nnis;
        int rc;
 
+again:
+       nnis = the_lnet.ln_push_target_nnis;
        if (nnis <= 0) {
                rc = -EINVAL;
                goto fail_return;
        }
-again:
+
        pbuf = lnet_ping_buffer_alloc(nnis, GFP_NOFS);
        if (!pbuf) {
                rc = -ENOMEM;
@@ -1945,7 +1946,7 @@ static int lnet_push_target_init(void)
                return -EALREADY;
 
        the_lnet.ln_push_target_eq =
-               LNetEQAlloc(0, lnet_push_target_event_handler);
+               LNetEQAlloc(lnet_push_target_event_handler);
        if (IS_ERR(the_lnet.ln_push_target_eq)) {
                rc = PTR_ERR(the_lnet.ln_push_target_eq);
                CERROR("Can't allocated push target EQ: %d\n", rc);
@@ -2646,7 +2647,7 @@ LNetNIInit(lnet_pid_t requested_pid)
 
        lnet_ping_target_update(pbuf, ping_mdh);
 
-       the_lnet.ln_mt_eq = LNetEQAlloc(0, lnet_mt_event_handler);
+       the_lnet.ln_mt_eq = LNetEQAlloc(lnet_mt_event_handler);
        if (IS_ERR(the_lnet.ln_mt_eq)) {
                rc = PTR_ERR(the_lnet.ln_mt_eq);
                CERROR("Can't allocate monitor thread EQ: %d\n", rc);
@@ -4077,24 +4078,45 @@ LNetGetId(unsigned int index, struct lnet_process_id *id)
 }
 EXPORT_SYMBOL(LNetGetId);
 
+struct ping_data { /* state shared between lnet_ping() and lnet_ping_event_handler() */
+       int rc; /* first non-zero event status (stored only while rc is 0); set to the reply's mlength on a successful REPLY */
+       int replied; /* set to 1 when an LNET_EVENT_REPLY with zero status is seen */
+       struct lnet_handle_md mdh; /* MD handle filled in by LNetMDBind() in lnet_ping() */
+       struct completion completion; /* completed by the handler on the event flagged unlinked */
+};
+
+static void
+lnet_ping_event_handler(struct lnet_event *event) /* EQ callback passed to LNetEQAlloc() by lnet_ping() */
+{
+       struct ping_data *pd = event->md.user_ptr; /* lnet_ping() sets md.user_ptr to its local struct ping_data */
+
+       CDEBUG(D_NET, "ping event (%d %d)%s\n",
+              event->type, event->status,
+              event->unlinked ? " unlinked" : "");
+
+       if (event->status) { /* non-zero status: the event reports a failure */
+               if (!pd->rc) /* record only while rc is still 0, so an earlier stored value is not overwritten */
+                       pd->rc = event->status;
+       } else if (event->type == LNET_EVENT_REPLY) { /* successful reply */
+               pd->replied = 1;
+               pd->rc = event->mlength; /* lnet_ping() reads this back as the reply size (nob) */
+       }
+       if (event->unlinked) /* event flagged unlinked: wake lnet_ping(), which waits on this completion */
+               complete(&pd->completion);
+}
+
 static int lnet_ping(struct lnet_process_id id, signed long timeout,
                     struct lnet_process_id __user *ids, int n_ids)
 {
        struct lnet_eq *eq;
-       struct lnet_handle_md mdh;
-       struct lnet_event event;
        struct lnet_md md = { NULL };
-       int which;
-       int unlinked = 0;
-       int replied = 0;
-       const signed long a_long_time = cfs_time_seconds(60);
+       struct ping_data pd = { 0 };
        struct lnet_ping_buffer *pbuf;
        struct lnet_process_id tmpid;
        int i;
        int nob;
        int rc;
        int rc2;
-       sigset_t blocked;
 
        /* n_ids limit is arbitrary */
        if (n_ids <= 0 || id.nid == LNET_NID_ANY)
@@ -4114,8 +4136,7 @@ static int lnet_ping(struct lnet_process_id id, signed long timeout,
        if (!pbuf)
                return -ENOMEM;
 
-       /* NB 2 events max (including any unlink event) */
-       eq = LNetEQAlloc(2, LNET_EQ_HANDLER_NONE);
+       eq = LNetEQAlloc(lnet_ping_event_handler);
        if (IS_ERR(eq)) {
                rc = PTR_ERR(eq);
                CERROR("Can't allocate EQ: %d\n", rc);
@@ -4128,83 +4149,40 @@ static int lnet_ping(struct lnet_process_id id, signed long timeout,
        md.threshold = 2; /* GET/REPLY */
        md.max_size  = 0;
        md.options   = LNET_MD_TRUNCATE;
-       md.user_ptr  = NULL;
+       md.user_ptr  = &pd;
        md.eq_handle = eq;
 
-       rc = LNetMDBind(md, LNET_UNLINK, &mdh);
+       init_completion(&pd.completion);
+
+       rc = LNetMDBind(md, LNET_UNLINK, &pd.mdh);
        if (rc != 0) {
                CERROR("Can't bind MD: %d\n", rc);
                goto fail_free_eq;
        }
 
-       rc = LNetGet(LNET_NID_ANY, mdh, id,
+       rc = LNetGet(LNET_NID_ANY, pd.mdh, id,
                     LNET_RESERVED_PORTAL,
                     LNET_PROTO_PING_MATCHBITS, 0, false);
 
        if (rc != 0) {
                /* Don't CERROR; this could be deliberate! */
-               rc2 = LNetMDUnlink(mdh);
+               rc2 = LNetMDUnlink(pd.mdh);
                LASSERT(rc2 == 0);
 
                /* NB must wait for the UNLINK event below... */
-               unlinked = 1;
-               timeout = a_long_time;
        }
 
-       do {
-               /* MUST block for unlink to complete */
-               if (unlinked) {
-                       sigset_t set;
-
-                       sigfillset(&set);
-                       sigprocmask(SIG_SETMASK, &set, &blocked);
-               }
-
-               rc2 = LNetEQPoll(&eq, 1, timeout, &event, &which);
-
-               if (unlinked)
-                       cfs_restore_sigs(blocked);
-
-               CDEBUG(D_NET, "poll %d(%d %d)%s\n", rc2,
-                      (rc2 <= 0) ? -1 : event.type,
-                      (rc2 <= 0) ? -1 : event.status,
-                      (rc2 > 0 && event.unlinked) ? " unlinked" : "");
-
-               LASSERT(rc2 != -EOVERFLOW);     /* can't miss anything */
-
-               if (rc2 <= 0 || event.status != 0) {
-                       /* timeout or error */
-                       if (!replied && rc == 0)
-                               rc = (rc2 < 0) ? rc2 :
-                                    (rc2 == 0) ? -ETIMEDOUT :
-                                    event.status;
-
-                       if (!unlinked) {
-                               /* Ensure completion in finite time... */
-                               LNetMDUnlink(mdh);
-                               /* No assertion (racing with network) */
-                               unlinked = 1;
-                               timeout = a_long_time;
-                       } else if (rc2 == 0) {
-                               /* timed out waiting for unlink */
-                               CWARN("ping %s: late network completion\n",
-                                     libcfs_id2str(id));
-                       }
-               } else if (event.type == LNET_EVENT_REPLY) {
-                       replied = 1;
-                       rc = event.mlength;
-               }
-       } while (rc2 <= 0 || !event.unlinked);
-
-       if (!replied) {
-               if (rc >= 0)
-                       CWARN("%s: Unexpected rc >= 0 but no reply!\n",
-                             libcfs_id2str(id));
+       if (wait_for_completion_timeout(&pd.completion, timeout) == 0) {
+               /* Ensure completion in finite time... */
+               LNetMDUnlink(pd.mdh);
+               wait_for_completion(&pd.completion);
+       }
+       if (!pd.replied) {
                rc = -EIO;
                goto fail_free_eq;
        }
 
-       nob = rc;
+       nob = pd.rc;
        LASSERT(nob >= 0 && nob <= LNET_PING_INFO_SIZE(n_ids));
 
        rc = -EPROTO;           /* if I can't parse... */