From 2740cf66c88e2d04126f7016cb1958ca976cd323 Mon Sep 17 00:00:00 2001 From: Chris Horn Date: Tue, 7 Nov 2023 15:19:26 -0700 Subject: [PATCH] LU-17271 kfilnd: Allocate tn_mr_key before kfilnd_peer A race exists between kfilnd_peer and tn_mr_key allocation that could result in RKEY re-use and data corruption. Thread 1: Posts tagged receive with RKEY based on peerA::kp_local_session_key X and tn_mr_key Y Thread 2: Fetches peerA with kp_local_session_key X Thread 1: Cancels tagged receive, marks peerA for removal, and releases tn_mr_key Y Thread 2: allocates tn_mr_key Y At this point, thread 2 has the same RKEY used by thread 1. The fix is to always allocate the tn_mr_key before looking up the peer, and always mark peers for removal before releasing tn_mr_key. This commit modifies the TN allocation to ensure the tn_mr_key is allocated before looking up the target peer. HPE-bug-id: LUS-11972 Test-Parameters: trivial Signed-off-by: Chris Horn Change-Id: I2e0948ae4fe7c5dfb86e297a3437213f193bf67c Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53029 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Ron Gredvig Reviewed-by: Ian Ziemba Reviewed-by: Oleg Drokin --- lnet/klnds/kfilnd/kfilnd.c | 2 +- lnet/klnds/kfilnd/kfilnd_tn.c | 205 ++++++++++++++++++++++++++---------------- lnet/klnds/kfilnd/kfilnd_tn.h | 9 +- 3 files changed, 132 insertions(+), 84 deletions(-) diff --git a/lnet/klnds/kfilnd/kfilnd.c b/lnet/klnds/kfilnd/kfilnd.c index 69cc3ce..4be2e9b 100644 --- a/lnet/klnds/kfilnd/kfilnd.c +++ b/lnet/klnds/kfilnd/kfilnd.c @@ -69,7 +69,7 @@ int kfilnd_send_hello_request(struct kfilnd_dev *dev, int cpt, return 0; } - tn = kfilnd_tn_alloc_for_peer(dev, cpt, kp, true, true, false); + tn = kfilnd_tn_alloc_for_hello(dev, cpt, kp); if (IS_ERR(tn)) { rc = PTR_ERR(tn); CERROR("Failed to allocate transaction struct: rc=%d\n", rc); diff --git a/lnet/klnds/kfilnd/kfilnd_tn.c b/lnet/klnds/kfilnd/kfilnd_tn.c index 243ca29..cb9fbbe 100644 --- 
a/lnet/klnds/kfilnd/kfilnd_tn.c +++ b/lnet/klnds/kfilnd/kfilnd_tn.c @@ -1523,6 +1523,93 @@ void kfilnd_tn_free(struct kfilnd_transaction *tn) kmem_cache_free(tn_cache, tn); } +/* + * Allocation logic common to kfilnd_tn_alloc() and kfilnd_tn_alloc_for_hello(). + * @ep: The KFI LND endpoint to associate with the transaction. + * @kp: The kfilnd peer to associate with the transaction. + * See kfilnd_tn_alloc() for a description of the other fields + * Note: Caller must have a reference on @kp + */ +static struct kfilnd_transaction *kfilnd_tn_alloc_common(struct kfilnd_ep *ep, + struct kfilnd_peer *kp, + bool alloc_msg, + bool is_initiator, + u16 key) +{ + struct kfilnd_transaction *tn; + int rc; + ktime_t tn_alloc_ts; + + tn_alloc_ts = ktime_get(); + + tn = kmem_cache_zalloc(tn_cache, GFP_KERNEL); + if (!tn) { + rc = -ENOMEM; + goto err; + } + + if (alloc_msg) { + tn->tn_tx_msg.msg = kmem_cache_alloc(imm_buf_cache, GFP_KERNEL); + if (!tn->tn_tx_msg.msg) { + rc = -ENOMEM; + goto err_free_tn; + } + } + + tn->tn_mr_key = key; + + tn->tn_kp = kp; + + mutex_init(&tn->tn_lock); + tn->tn_ep = ep; + tn->tn_response_rx = ep->end_context_id; + tn->tn_state = TN_STATE_IDLE; + tn->hstatus = LNET_MSG_STATUS_OK; + tn->deadline = ktime_get_seconds() + lnet_get_lnd_timeout(); + tn->tn_replay_deadline = ktime_sub(tn->deadline, + (lnet_get_lnd_timeout() / 2)); + tn->is_initiator = is_initiator; + INIT_WORK(&tn->timeout_work, kfilnd_tn_timeout_work); + + /* Add the transaction to an endpoint. This is like + * incrementing a ref counter. 
+ */ + spin_lock(&ep->tn_list_lock); + list_add_tail(&tn->tn_entry, &ep->tn_list); + spin_unlock(&ep->tn_list_lock); + + tn->tn_alloc_ts = tn_alloc_ts; + tn->tn_state_ts = ktime_get(); + + KFILND_EP_DEBUG(ep, "Transaction ID %u allocated", tn->tn_mr_key); + + return tn; + +err_free_tn: + if (tn->tn_tx_msg.msg) + kmem_cache_free(imm_buf_cache, tn->tn_tx_msg.msg); + kmem_cache_free(tn_cache, tn); +err: + return ERR_PTR(rc); +} + +static struct kfilnd_ep *kfilnd_dev_to_ep(struct kfilnd_dev *dev, int cpt) +{ + struct kfilnd_ep *ep; + + if (!dev) + return ERR_PTR(-EINVAL); + + ep = dev->cpt_to_endpoint[cpt]; + if (!ep) { + CWARN("%s used invalid cpt=%d\n", + libcfs_nidstr(&dev->kfd_ni->ni_nid), cpt); + ep = dev->kfd_endpoints[0]; + } + + return ep; +} + /** * kfilnd_tn_alloc() - Allocate a new KFI LND transaction. * @dev: KFI LND device used to look the KFI LND endpoint to associate with the @@ -1531,7 +1618,7 @@ void kfilnd_tn_free(struct kfilnd_transaction *tn) * @target_nid: Target NID of the transaction. * @alloc_msg: Allocate an immediate message for the transaction. * @is_initiator: Is initiator of LNet transaction. - * @key: Is transaction memory region key need. + * @need_key: Is transaction memory region key needed. * * During transaction allocation, each transaction is associated with a KFI LND * endpoint use to post data transfer operations. 
The CPT argument is used to @@ -1542,121 +1629,85 @@ void kfilnd_tn_free(struct kfilnd_transaction *tn) struct kfilnd_transaction *kfilnd_tn_alloc(struct kfilnd_dev *dev, int cpt, lnet_nid_t target_nid, bool alloc_msg, bool is_initiator, - bool key) + bool need_key) { struct kfilnd_transaction *tn; + struct kfilnd_ep *ep; struct kfilnd_peer *kp; int rc; + u16 key = 0; - if (!dev) { - rc = -EINVAL; + ep = kfilnd_dev_to_ep(dev, cpt); + if (IS_ERR(ep)) { + rc = PTR_ERR(ep); goto err; } + /* Consider the following: + * Thread 1: Posts tagged receive with RKEY based on + * peerA::kp_local_session_key X and tn_mr_key Y + * Thread 2: Fetches peerA with kp_local_session_key X + * Thread 1: Cancels tagged receive, marks peerA for removal, and + * releases tn_mr_key Y + * Thread 2: allocates tn_mr_key Y + * At this point, thread 2 has the same RKEY used by thread 1. + * Thus, we always allocate the tn_mr_key before looking up the peer, + * and we always mark peers for removal before releasing tn_mr_key. + */ + if (need_key) { + rc = kfilnd_ep_get_key(ep); + if (rc < 0) + goto err; + key = rc; + } + kp = kfilnd_peer_get(dev, target_nid); if (IS_ERR(kp)) { rc = PTR_ERR(kp); - goto err; + goto err_put_key; } - tn = kfilnd_tn_alloc_for_peer(dev, cpt, kp, alloc_msg, is_initiator, - key); + tn = kfilnd_tn_alloc_common(ep, kp, alloc_msg, is_initiator, key); if (IS_ERR(tn)) { rc = PTR_ERR(tn); kfilnd_peer_put(kp); - goto err; + goto err_put_key; } return tn; +err_put_key: + kfilnd_ep_put_key(ep, key); err: return ERR_PTR(rc); } -/* See kfilnd_tn_alloc() +/* Like kfilnd_tn_alloc(), but caller already looked up the kfilnd_peer. + * Used only to allocate a TN for a hello request. 
+ * See kfilnd_tn_alloc()/kfilnd_tn_alloc_common() * Note: Caller must have a reference on @kp */ -struct kfilnd_transaction *kfilnd_tn_alloc_for_peer(struct kfilnd_dev *dev, - int cpt, - struct kfilnd_peer *kp, - bool alloc_msg, - bool is_initiator, - bool key) +struct kfilnd_transaction *kfilnd_tn_alloc_for_hello(struct kfilnd_dev *dev, int cpt, + struct kfilnd_peer *kp) { struct kfilnd_transaction *tn; struct kfilnd_ep *ep; int rc; - ktime_t tn_alloc_ts; - if (!dev) { - rc = -EINVAL; + ep = kfilnd_dev_to_ep(dev, cpt); + if (IS_ERR(ep)) { + rc = PTR_ERR(ep); goto err; } - tn_alloc_ts = ktime_get(); - - /* If the CPT does not fall into the LNet NI CPT range, force the CPT - * into the LNet NI CPT range. This should never happen. - */ - ep = dev->cpt_to_endpoint[cpt]; - if (!ep) { - CWARN("%s used invalid cpt=%d\n", - libcfs_nidstr(&dev->kfd_ni->ni_nid), cpt); - ep = dev->kfd_endpoints[0]; - } - - tn = kmem_cache_zalloc(tn_cache, GFP_KERNEL); - if (!tn) { - rc = -ENOMEM; + tn = kfilnd_tn_alloc_common(ep, kp, true, true, 0); + if (IS_ERR(tn)) { + rc = PTR_ERR(tn); goto err; } - if (alloc_msg) { - tn->tn_tx_msg.msg = kmem_cache_alloc(imm_buf_cache, GFP_KERNEL); - if (!tn->tn_tx_msg.msg) { - rc = -ENOMEM; - goto err_free_tn; - } - } - - if (key) { - rc = kfilnd_ep_get_key(ep); - if (rc < 0) - goto err_free_tn; - tn->tn_mr_key = rc; - } - - tn->tn_kp = kp; - - mutex_init(&tn->tn_lock); - tn->tn_ep = ep; - tn->tn_response_rx = ep->end_context_id; - tn->tn_state = TN_STATE_IDLE; - tn->hstatus = LNET_MSG_STATUS_OK; - tn->deadline = ktime_get_seconds() + lnet_get_lnd_timeout(); - tn->tn_replay_deadline = ktime_sub(tn->deadline, - (lnet_get_lnd_timeout() / 2)); - tn->is_initiator = is_initiator; - INIT_WORK(&tn->timeout_work, kfilnd_tn_timeout_work); - - /* Add the transaction to an endpoint. This is like - * incrementing a ref counter. 
- */ - spin_lock(&ep->tn_list_lock); - list_add_tail(&tn->tn_entry, &ep->tn_list); - spin_unlock(&ep->tn_list_lock); - - tn->tn_alloc_ts = tn_alloc_ts; - tn->tn_state_ts = ktime_get(); - - KFILND_EP_DEBUG(ep, "Transaction ID %u allocated", tn->tn_mr_key); - return tn; -err_free_tn: - if (tn->tn_tx_msg.msg) - kmem_cache_free(imm_buf_cache, tn->tn_tx_msg.msg); - kmem_cache_free(tn_cache, tn); err: return ERR_PTR(rc); } diff --git a/lnet/klnds/kfilnd/kfilnd_tn.h b/lnet/klnds/kfilnd/kfilnd_tn.h index 2f460a7..4b3b104 100644 --- a/lnet/klnds/kfilnd/kfilnd_tn.h +++ b/lnet/klnds/kfilnd/kfilnd_tn.h @@ -40,12 +40,9 @@ struct kfilnd_transaction *kfilnd_tn_alloc(struct kfilnd_dev *dev, int cpt, lnet_nid_t target_nid, bool alloc_msg, bool is_initiator, bool key); -struct kfilnd_transaction *kfilnd_tn_alloc_for_peer(struct kfilnd_dev *dev, - int cpt, - struct kfilnd_peer *kp, - bool alloc_msg, - bool is_initiator, - bool key); +struct kfilnd_transaction *kfilnd_tn_alloc_for_hello(struct kfilnd_dev *dev, + int cpt, + struct kfilnd_peer *kp); void kfilnd_tn_event_handler(struct kfilnd_transaction *tn, enum tn_events event, int status); void kfilnd_tn_cleanup(void); -- 1.8.3.1