Whamcloud - gitweb
LU-15509 lnet: Ping buffer ref leak in lnet_peer_data_present
[fs/lustre-release.git] / lnet / lnet / peer.c
index ba2f84d..ae71e25 100644 (file)
@@ -1439,21 +1439,20 @@ unlock:
 }
 EXPORT_SYMBOL(LNetAddPeer);
 
-/* FIXME support large-addr nid */
-lnet_nid_t
-LNetPrimaryNID(lnet_nid_t nid)
+void LNetPrimaryNID(struct lnet_nid *nid)
 {
        struct lnet_peer *lp;
        struct lnet_peer_ni *lpni;
-       lnet_nid_t primary_nid = nid;
+       struct lnet_nid orig;
        int rc = 0;
        int cpt;
 
-       if (nid == LNET_NID_LO_0)
-               return LNET_NID_LO_0;
+       if (!nid || nid_is_lo0(nid))
+               return;
+       orig = *nid;
 
        cpt = lnet_net_lock_current();
-       lpni = lnet_nid2peerni_locked(nid, LNET_NID_ANY, cpt);
+       lpni = lnet_peerni_by_nid_locked(nid, NULL, cpt);
        if (IS_ERR(lpni)) {
                rc = PTR_ERR(lpni);
                goto out_unlock;
@@ -1480,7 +1479,7 @@ LNetPrimaryNID(lnet_nid_t nid)
                 * and lookup the lpni again
                 */
                lnet_peer_ni_decref_locked(lpni);
-               lpni = lnet_find_peer_ni_locked(nid);
+               lpni = lnet_peer_ni_find_locked(nid);
                if (!lpni) {
                        rc = -ENOENT;
                        goto out_unlock;
@@ -1495,15 +1494,14 @@ LNetPrimaryNID(lnet_nid_t nid)
                if (lnet_is_discovery_disabled(lp))
                        break;
        }
-       primary_nid = lnet_nid_to_nid4(&lp->lp_primary_nid);
+       *nid = lp->lp_primary_nid;
 out_decref:
        lnet_peer_ni_decref_locked(lpni);
 out_unlock:
        lnet_net_unlock(cpt);
 
-       CDEBUG(D_NET, "NID %s primary NID %s rc %d\n", libcfs_nid2str(nid),
-              libcfs_nid2str(primary_nid), rc);
-       return primary_nid;
+       CDEBUG(D_NET, "NID %s primary NID %s rc %d\n", libcfs_nidstr(&orig),
+              libcfs_nidstr(nid), rc);
 }
 EXPORT_SYMBOL(LNetPrimaryNID);
 
@@ -1846,19 +1844,22 @@ out:
 
 /*
  * lpni creation initiated due to traffic either sending or receiving.
+ * Callers must hold ln_api_mutex.
+ * A reference is taken on the lnet_peer_ni returned by this function.
  */
-static int
+static struct lnet_peer_ni *
 lnet_peer_ni_traffic_add(struct lnet_nid *nid, struct lnet_nid *pref)
+__must_hold(&the_lnet.ln_api_mutex)
 {
-       struct lnet_peer *lp;
-       struct lnet_peer_net *lpn;
+       struct lnet_peer *lp = NULL;
+       struct lnet_peer_net *lpn = NULL;
        struct lnet_peer_ni *lpni;
        unsigned flags = 0;
        int rc = 0;
 
        if (LNET_NID_IS_ANY(nid)) {
                rc = -EINVAL;
-               goto out;
+               goto out_err;
        }
 
        /* lnet_net_lock is not needed here because ln_api_lock is held */
@@ -1870,7 +1871,6 @@ lnet_peer_ni_traffic_add(struct lnet_nid *nid, struct lnet_nid *pref)
                 * traffic, we just assume everything is ok and
                 * return.
                 */
-               lnet_peer_ni_decref_locked(lpni);
                goto out;
        }
 
@@ -1878,24 +1878,31 @@ lnet_peer_ni_traffic_add(struct lnet_nid *nid, struct lnet_nid *pref)
        rc = -ENOMEM;
        lp = lnet_peer_alloc(nid);
        if (!lp)
-               goto out;
+               goto out_err;
        lpn = lnet_peer_net_alloc(LNET_NID_NET(nid));
        if (!lpn)
-               goto out_free_lp;
+               goto out_err;
        lpni = lnet_peer_ni_alloc(nid);
        if (!lpni)
-               goto out_free_lpn;
+               goto out_err;
        lnet_peer_ni_set_non_mr_pref_nid(lpni, pref);
 
-       return lnet_peer_attach_peer_ni(lp, lpn, lpni, flags);
+       /* lnet_peer_attach_peer_ni() always returns 0 */
+       rc = lnet_peer_attach_peer_ni(lp, lpn, lpni, flags);
 
-out_free_lpn:
-       LIBCFS_FREE(lpn, sizeof(*lpn));
-out_free_lp:
-       LIBCFS_FREE(lp, sizeof(*lp));
+       lnet_peer_ni_addref_locked(lpni);
+
+out_err:
+       if (rc) {
+               if (lpn)
+                       LIBCFS_FREE(lpn, sizeof(*lpn));
+               if (lp)
+                       LIBCFS_FREE(lp, sizeof(*lp));
+               lpni = ERR_PTR(rc);
+       }
 out:
        CDEBUG(D_NET, "peer %s: %d\n", libcfs_nidstr(nid), rc);
-       return rc;
+       return lpni;
 }
 
 /*
@@ -2064,10 +2071,10 @@ lnet_destroy_peer_ni_locked(struct kref *ref)
 }
 
 struct lnet_peer_ni *
-lnet_nid2peerni_ex(struct lnet_nid *nid, int cpt)
+lnet_nid2peerni_ex(struct lnet_nid *nid)
+__must_hold(&the_lnet.ln_api_mutex)
 {
        struct lnet_peer_ni *lpni = NULL;
-       int rc;
 
        if (the_lnet.ln_state != LNET_STATE_RUNNING)
                return ERR_PTR(-ESHUTDOWN);
@@ -2080,19 +2087,11 @@ lnet_nid2peerni_ex(struct lnet_nid *nid, int cpt)
        if (lpni)
                return lpni;
 
-       lnet_net_unlock(cpt);
-
-       rc = lnet_peer_ni_traffic_add(nid, NULL);
-       if (rc) {
-               lpni = ERR_PTR(rc);
-               goto out_net_relock;
-       }
+       lnet_net_unlock(LNET_LOCK_EX);
 
-       lpni = lnet_peer_ni_find_locked(nid);
-       LASSERT(lpni);
+       lpni = lnet_peer_ni_traffic_add(nid, NULL);
 
-out_net_relock:
-       lnet_net_lock(cpt);
+       lnet_net_lock(LNET_LOCK_EX);
 
        return lpni;
 }
@@ -2106,7 +2105,6 @@ lnet_peerni_by_nid_locked(struct lnet_nid *nid,
                        struct lnet_nid *pref, int cpt)
 {
        struct lnet_peer_ni *lpni = NULL;
-       int rc;
 
        if (the_lnet.ln_state != LNET_STATE_RUNNING)
                return ERR_PTR(-ESHUTDOWN);
@@ -2134,30 +2132,18 @@ lnet_peerni_by_nid_locked(struct lnet_nid *nid,
        lnet_net_unlock(cpt);
        mutex_lock(&the_lnet.ln_api_mutex);
        /*
-        * Shutdown is only set under the ln_api_lock, so a single
+        * the_lnet.ln_state is only modified under ln_api_mutex, so a single
         * check here is sufficent.
         */
-       if (the_lnet.ln_state != LNET_STATE_RUNNING) {
-               lpni = ERR_PTR(-ESHUTDOWN);
-               goto out_mutex_unlock;
-       }
+       if (the_lnet.ln_state == LNET_STATE_RUNNING)
+               lpni = lnet_peer_ni_traffic_add(nid, pref);
 
-       rc = lnet_peer_ni_traffic_add(nid, pref);
-       if (rc) {
-               lpni = ERR_PTR(rc);
-               goto out_mutex_unlock;
-       }
-
-       lpni = lnet_peer_ni_find_locked(nid);
-       LASSERT(lpni);
-
-out_mutex_unlock:
        mutex_unlock(&the_lnet.ln_api_mutex);
        lnet_net_lock(cpt);
 
        /* Lock has been dropped, check again for shutdown. */
        if (the_lnet.ln_state != LNET_STATE_RUNNING) {
-               if (!IS_ERR(lpni))
+               if (!IS_ERR_OR_NULL(lpni))
                        lnet_peer_ni_decref_locked(lpni);
                lpni = ERR_PTR(-ESHUTDOWN);
        }
@@ -2593,6 +2579,8 @@ again:
                        break;
                if (lnet_peer_is_uptodate(lp))
                        break;
+               if (lp->lp_state & LNET_PEER_MARK_DELETED)
+                       break;
                lnet_peer_queue_for_discovery(lp);
                count++;
                CDEBUG(D_NET, "Discovery attempt # %d\n", count);
@@ -2637,7 +2625,9 @@ again:
                rc = lp->lp_dc_error;
        else if (!block)
                CDEBUG(D_NET, "non-blocking discovery\n");
-       else if (!lnet_peer_is_uptodate(lp) && !lnet_is_discovery_disabled(lp))
+       else if (!lnet_peer_is_uptodate(lp) &&
+                !(lnet_is_discovery_disabled(lp) ||
+                  (lp->lp_state & LNET_PEER_MARK_DELETED)))
                goto again;
 
        CDEBUG(D_NET, "peer %s NID %s: %d. %s\n",
@@ -3258,12 +3248,15 @@ __must_hold(&lp->lp_lock)
        if (lp->lp_state & LNET_PEER_MARK_DELETED)
                return 0;
 
-       if (the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING)
-               return -ESHUTDOWN;
-
        spin_unlock(&lp->lp_lock);
 
        mutex_lock(&the_lnet.ln_api_mutex);
+       if (the_lnet.ln_state != LNET_STATE_RUNNING ||
+           the_lnet.ln_dc_state != LNET_DC_STATE_RUNNING) {
+               mutex_unlock(&the_lnet.ln_api_mutex);
+               spin_lock(&lp->lp_lock);
+               return -ESHUTDOWN;
+       }
 
        lnet_net_lock(LNET_LOCK_EX);
        /* remove the peer from the discovery work
@@ -3341,8 +3334,10 @@ __must_hold(&lp->lp_lock)
         * down, and our reference count may be all that is keeping it
         * alive. Don't do any work on it.
         */
-       if (list_empty(&lp->lp_peer_list))
+       if (list_empty(&lp->lp_peer_list)) {
+               lnet_ping_buffer_decref(pbuf);
                goto out;
+       }
 
        flags = LNET_PEER_DISCOVERED;
        if (pbuf->pb_info.pi_features & LNET_PING_FEAT_MULTI_RAIL)
@@ -3369,7 +3364,9 @@ __must_hold(&lp->lp_lock)
        nid = pbuf->pb_info.pi_ni[1].ns_nid;
        if (nid_is_lo0(&lp->lp_primary_nid)) {
                rc = lnet_peer_set_primary_nid(lp, nid, flags);
-               if (!rc)
+               if (rc)
+                       lnet_ping_buffer_decref(pbuf);
+               else
                        rc = lnet_peer_merge_data(lp, pbuf);
        /*
         * if the primary nid of the peer is present in the ping info returned
@@ -3392,6 +3389,7 @@ __must_hold(&lp->lp_lock)
                                CERROR("Primary NID error %s versus %s: %d\n",
                                       libcfs_nidstr(&lp->lp_primary_nid),
                                       libcfs_nid2str(nid), rc);
+                               lnet_ping_buffer_decref(pbuf);
                        } else {
                                rc = lnet_peer_merge_data(lp, pbuf);
                        }
@@ -3574,7 +3572,7 @@ static int lnet_peer_send_push(struct lnet_peer *lp)
 __must_hold(&lp->lp_lock)
 {
        struct lnet_ping_buffer *pbuf;
-       struct lnet_process_id id;
+       struct lnet_processid id;
        struct lnet_md md;
        int cpt;
        int rc;
@@ -3621,13 +3619,13 @@ __must_hold(&lp->lp_lock)
        lnet_peer_addref_locked(lp);
        id.pid = LNET_PID_LUSTRE;
        if (!LNET_NID_IS_ANY(&lp->lp_disc_dst_nid))
-               id.nid = lnet_nid_to_nid4(&lp->lp_disc_dst_nid);
+               id.nid = lp->lp_disc_dst_nid;
        else
-               id.nid = lnet_nid_to_nid4(&lp->lp_primary_nid);
+               id.nid = lp->lp_primary_nid;
        lnet_net_unlock(cpt);
 
-       rc = LNetPut(lnet_nid_to_nid4(&lp->lp_disc_src_nid), lp->lp_push_mdh,
-                    LNET_ACK_REQ, id, LNET_RESERVED_PORTAL,
+       rc = LNetPut(&lp->lp_disc_src_nid, lp->lp_push_mdh,
+                    LNET_ACK_REQ, &id, LNET_RESERVED_PORTAL,
                     LNET_PROTO_PING_MATCHBITS, 0, 0);
 
        /*
@@ -3954,8 +3952,10 @@ void lnet_peer_discovery_stop(void)
        else
                wake_up(&the_lnet.ln_dc_waitq);
 
+       mutex_unlock(&the_lnet.ln_api_mutex);
        wait_event(the_lnet.ln_dc_waitq,
                   the_lnet.ln_dc_state == LNET_DC_STATE_SHUTDOWN);
+       mutex_lock(&the_lnet.ln_api_mutex);
 
        LASSERT(list_empty(&the_lnet.ln_dc_request));
        LASSERT(list_empty(&the_lnet.ln_dc_working));