4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright 2022 Hewlett Packard Enterprise Development LP
26 * This file is part of Lustre, http://www.lustre.org/
29 * kfilnd main interface.
35 #include <linux/version.h>
36 #include <linux/module.h>
37 #include <linux/kernel.h>
38 #include <linux/kthread.h>
40 #include <linux/string.h>
41 #include <linux/stat.h>
42 #include <linux/errno.h>
43 #include <linux/unistd.h>
44 #include <linux/uio.h>
45 #include <linux/rwsem.h>
46 #include <linux/mutex.h>
47 #include <linux/rhashtable.h>
48 #include <linux/workqueue.h>
49 #include <linux/debugfs.h>
50 #include <linux/seq_file.h>
51 #include <linux/ktime.h>
53 #include <asm/uaccess.h>
56 #include <linux/init.h>
58 #include <linux/file.h>
59 #include <linux/stat.h>
60 #include <linux/list.h>
61 #include <linux/kmod.h>
62 #include <linux/sysctl.h>
63 #include <linux/pci.h>
68 #define KFILND_VERSION "0.2.0"
70 #define DEBUG_SUBSYSTEM S_LND
72 #include <libcfs/libcfs.h>
73 #include <lnet/lib-lnet.h>
74 #include "kfi_endpoint.h"
75 #include "kfi_errno.h"
77 #include "kfi_tagged.h"
78 #include "kfi_cxi_ext.h"
80 /* KFILND CFS fail range 0xF100 - 0xF1FF. */
82 #define CFS_KFI_FAIL_SEND_EVENT 0xF100
83 #define CFS_KFI_FAIL_READ_EVENT 0xF101
84 #define CFS_KFI_FAIL_WRITE_EVENT 0xF102
85 #define CFS_KFI_FAIL_TAGGED_SEND_EVENT 0xF103
86 #define CFS_KFI_FAIL_TAGGED_RECV_EVENT 0xF104
87 #define CFS_KFI_FAIL_BULK_TIMEOUT 0xF105
88 #define CFS_KFI_FAIL_SEND 0xF106
89 #define CFS_KFI_FAIL_READ 0xF107
90 #define CFS_KFI_FAIL_WRITE 0xF108
91 #define CFS_KFI_FAIL_TAGGED_SEND 0xF109
92 #define CFS_KFI_FAIL_TAGGED_RECV 0xF10A
93 #define CFS_KFI_FAIL_SEND_EAGAIN 0xF10B
94 #define CFS_KFI_FAIL_READ_EAGAIN 0xF10C
95 #define CFS_KFI_FAIL_WRITE_EAGAIN 0xF10D
96 #define CFS_KFI_FAIL_TAGGED_SEND_EAGAIN 0xF10E
97 #define CFS_KFI_FAIL_TAGGED_RECV_EAGAIN 0xF10F
98 #define CFS_KFI_FAIL_TAGGED_RECV_CANCEL_EAGAIN 0xF110
99 #define CFS_KFI_FAIL_RECV_EAGAIN 0xF111
100 #define CFS_KFI_FAIL_RECV 0xF112
101 #define CFS_KFI_FAIL_MSG_UNPACK 0xF113
103 /* Maximum number of transaction keys supported. */
104 #define KFILND_EP_KEY_BITS 16U
105 #define KFILND_EP_KEY_MAX (BIT(KFILND_EP_KEY_BITS) - 1)
107 /* Some constants which should be turned into tunables */
108 #define KFILND_IMMEDIATE_MSG_SIZE 4096
110 #define KFILND_MY_PROCID 49152
/* 256 Rx contexts max */
#define KFILND_FAB_RX_CTX_BITS 8

/* Get the KFI base address from a KFI RX address. RX context information is
 * stored in the MSBs of the KFI address, so the base address is what remains
 * in the low (64 - KFILND_FAB_RX_CTX_BITS) bits.
 *
 * The mask is built from a 64-bit constant so that the shift stays well
 * defined even on targets where unsigned long is only 32 bits wide.
 */
#define KFILND_BASE_ADDR(addr) \
	((addr) & ((1ULL << (64 - KFILND_FAB_RX_CTX_BITS)) - 1))
121 /* States used by all kfilnd structures */
122 enum kfilnd_object_states {
123 KFILND_STATE_UNINITIALIZED,
124 KFILND_STATE_INITIALIZED,
125 KFILND_STATE_SHUTTING_DOWN
128 extern struct dentry *kfilnd_debug_dir;
129 extern const struct file_operations kfilnd_initiator_state_stats_file_ops;
130 extern const struct file_operations kfilnd_target_state_stats_file_ops;
131 extern const struct file_operations kfilnd_target_stats_file_ops;
132 extern const struct file_operations kfilnd_initiator_stats_file_ops;
133 extern const struct file_operations kfilnd_reset_stats_file_ops;
135 extern struct workqueue_struct *kfilnd_wq;
137 extern unsigned int cksum;
138 extern unsigned int tx_scale_factor;
139 extern unsigned int rx_cq_scale_factor;
140 extern unsigned int tx_cq_scale_factor;
141 extern unsigned int eq_size;
142 extern unsigned int immediate_rx_buf_count;
144 int kfilnd_tunables_setup(struct lnet_ni *ni);
145 int kfilnd_tunables_init(void);
147 struct kfilnd_transaction;
151 /* Multi-receive buffers for immediate receives */
152 struct kfilnd_immediate_buffer {
154 size_t immed_buf_size;
155 struct page *immed_buf_page;
157 bool immed_no_repost;
158 struct list_head replay_entry;
159 struct kfilnd_ep *immed_end;
162 extern atomic_t kfilnd_rx_count;
166 struct kfilnd_cq_work {
167 struct kfilnd_cq *cq;
168 unsigned int work_cpu;
169 struct work_struct work;
173 struct kfilnd_ep *ep;
175 unsigned int cq_work_count;
176 struct kfilnd_cq_work cq_works[];
180 /* The contexts for this CPT */
181 struct kfid_ep *end_tx;
182 struct kfid_ep *end_rx;
184 /* Corresponding CQs */
185 struct kfilnd_cq *end_tx_cq;
186 struct kfilnd_cq *end_rx_cq;
188 /* Specific config values for this endpoint */
189 struct kfilnd_dev *end_dev;
193 /* List of transactions. */
194 struct list_head tn_list;
195 spinlock_t tn_list_lock;
198 struct list_head tn_replay;
199 struct list_head imm_buffer_replay;
200 spinlock_t replay_lock;
201 struct timer_list replay_timer;
202 struct work_struct replay_work;
203 atomic_t replay_count;
205 /* Key used to build the tag for tagged buffers. */
208 /* Pre-posted immediate buffers */
209 struct kfilnd_immediate_buffer end_immed_bufs[];
213 struct rhash_head node;
214 struct rcu_head rcu_head;
215 struct kfilnd_dev *dev;
219 atomic_t remove_peer;
223 u32 local_session_key;
224 u32 remote_session_key;
227 static inline bool kfilnd_peer_is_new_peer(struct kfilnd_peer *peer)
229 return peer->version == 0;
232 static inline void kfilnd_peer_set_version(struct kfilnd_peer *peer,
235 peer->version = version;
238 static inline void kfilnd_peer_set_remote_session_key(struct kfilnd_peer *peer,
241 peer->remote_session_key = session_key;
245 struct list_head entry;
246 struct list_head dom_list;
247 struct mutex dom_list_lock;
248 struct kfid_fabric *fabric;
253 struct list_head entry;
254 struct list_head dev_list;
256 struct kfilnd_fab *fab;
257 struct kfid_domain *domain;
261 /* Transaction States */
265 /* Shared initiator and target states. */
267 TN_STATE_WAIT_TAG_COMP,
269 /* Initiator immediate states. */
272 /* Initiator bulk states. */
273 TN_STATE_TAGGED_RECV_POSTED,
274 TN_STATE_SEND_FAILED,
276 TN_STATE_WAIT_TIMEOUT_COMP,
277 TN_STATE_WAIT_SEND_COMP,
278 TN_STATE_WAIT_TIMEOUT_TAG_COMP,
283 TN_STATE_WAIT_TAG_RMA_COMP,
285 /* Invalid max value. */
289 /* Base duration state stats. */
290 struct kfilnd_tn_duration_stat {
291 atomic64_t accumulated_duration;
292 atomic_t accumulated_count;
295 /* Transaction state stats are grouped into 22 buckets. Bucket zero
296 * corresponds to an LNet message size of 0 bytes and buckets 1 through 21
297 * correspond to LNet message sizes of 1 to 1048576 bytes increasing by a power
298 * of 2. LNet message sizes are rounded up to the nearest power of 2.
300 #define KFILND_DATA_SIZE_BUCKETS 22U
301 #define KFILND_DATA_SIZE_MAX_SIZE (1U << (KFILND_DATA_SIZE_BUCKETS - 2))
302 struct kfilnd_tn_data_size_duration_stats {
303 struct kfilnd_tn_duration_stat data_size[KFILND_DATA_SIZE_BUCKETS];
306 static inline unsigned int kfilnd_msg_len_to_data_size_bucket(size_t size)
312 if (size >= KFILND_DATA_SIZE_MAX_SIZE)
313 return KFILND_DATA_SIZE_BUCKETS - 1;
315 /* Round size up to the nearest power of 2. */
320 return (unsigned int)bit;
323 /* One data size duration stats bucket for each transaction state. */
324 struct kfilnd_tn_state_data_size_duration_stats {
325 struct kfilnd_tn_data_size_duration_stats state[TN_STATE_MAX];
329 struct list_head kfd_list; /* chain on kfid_devs */
330 struct lnet_ni *kfd_ni;
331 enum kfilnd_object_states kfd_state;
333 /* KFI LND domain the device is associated with. */
334 struct kfilnd_dom *dom;
336 /* Fields specific to kfabric operation */
338 struct kfid_ep *kfd_sep;
339 struct kfid_av *kfd_av;
340 struct kfilnd_ep **kfd_endpoints;
342 /* Map of LNet NI CPTs to endpoints. */
343 struct kfilnd_ep **cpt_to_endpoint;
345 /* Hash of LNet NIDs to KFI addresses. */
346 struct rhashtable peer_cache;
348 /* Per LNet NI states. */
349 struct kfilnd_tn_state_data_size_duration_stats initiator_state_stats;
350 struct kfilnd_tn_state_data_size_duration_stats target_state_stats;
351 struct kfilnd_tn_data_size_duration_stats initiator_stats;
352 struct kfilnd_tn_data_size_duration_stats target_stats;
354 /* Per LNet NI debugfs stats. */
355 struct dentry *dev_dir;
356 struct dentry *initiator_state_stats_file;
357 struct dentry *initiator_stats_file;
358 struct dentry *target_state_stats_file;
359 struct dentry *target_stats_file;
360 struct dentry *reset_stats_file;
362 /* Physical NIC address. */
363 unsigned int nic_addr;
364 atomic_t session_keys;
367 /* Invalid checksum value is treated as no checksum. */
368 /* TODO: Module parameter to disable checksum? */
369 #define NO_CHECKSUM 0x0
371 /* Hello message header. */
372 struct kfilnd_hello_msg {
373 /* Supported kfilnd version. */
376 /* Base RX context the peer should use. */
379 /* Session key used by peer. */
382 /* RX context count peer can target. */
386 /* Immediate message header. */
387 struct kfilnd_immed_msg {
388 /* Entire LNet header needed by the destination to match incoming
391 struct lnet_hdr_nid4 hdr;
393 /* Entire LNet message payload. */
397 /* Bulk request message header. */
398 struct kfilnd_bulk_req_msg {
399 /* Entire LNet header needed by the destination to match incoming
402 struct lnet_hdr_nid4 hdr;
404 /* Specific RX context the target must target to push/pull LNet
409 /* Memory key needed by the target to push/pull LNet payload. */
413 /* Kfilnd message. Includes base transport header plus embedded protocol
417 /* Unique kfilnd magic. */
420 /* Version of the kfilnd protocol. */
423 /* Specific kfilnd protocol type. */
429 /* Number of bytes in message. */
432 /* Checksum of entire message. 0 is checksum disabled. */
435 /* Message LNet source NID. */
438 /* Message LNet target NID. */
441 /* Embedded protocol headers. Must remain at bottom. */
443 struct kfilnd_immed_msg immed;
444 struct kfilnd_bulk_req_msg bulk_req;
445 struct kfilnd_hello_msg hello;
449 #define KFILND_MSG_MAGIC LNET_PROTO_KFI_MAGIC /* unique magic */
451 #define KFILND_MSG_VERSION_1 0x1
452 #define KFILND_MSG_VERSION KFILND_MSG_VERSION_1
454 /* Get the KFI RX context from a KFI RX address. RX context information is
455 * stored in the MSBs of the KFI address.
457 #define KFILND_RX_CONTEXT(addr) ((addr) >> (64 - KFILND_FAB_RX_CTX_BITS))
/* Emit a D_NET debug message on behalf of an endpoint. Every line is prefixed
 * with "<local NID>:<endpoint context id> " and terminated with a newline;
 * @fmt and the trailing arguments supply the rest of the message.
 *
 * @ep is evaluated more than once, so it must be free of side effects.
 */
#define KFILND_EP_DEBUG(ep, fmt, ...)					\
	CDEBUG(D_NET,							\
	       "%s:%d " fmt "\n",					\
	       libcfs_nidstr(&(ep)->end_dev->kfd_ni->ni_nid),		\
	       (ep)->end_context_id,					\
	       ##__VA_ARGS__)
/* Emit a network error message (via CNETERR) on behalf of an endpoint. Every
 * line is prefixed with "<local NID>:<endpoint context id> " and terminated
 * with a newline; @fmt and the trailing arguments supply the rest.
 *
 * @ep is evaluated more than once, so it must be free of side effects.
 */
#define KFILND_EP_ERROR(ep, fmt, ...)					\
	CNETERR("%s:%d " fmt "\n",					\
		libcfs_nidstr(&(ep)->end_dev->kfd_ni->ni_nid),		\
		(ep)->end_context_id,					\
		##__VA_ARGS__)
/* True when the transaction's peer pointer can be dereferenced, i.e. it is
 * neither NULL nor an error pointer as judged by IS_ERR_OR_NULL().
 *
 * The expansion is fully parenthesized so the macro can be dropped into any
 * larger expression without depending on operator precedence.
 */
#define KFILND_TN_PEER_VALID(tn) \
	(!IS_ERR_OR_NULL((tn)->peer))
/* Emit a D_NET debug message for a transaction. The line carries, in order:
 * the transaction pointer, the local NID and endpoint context id, the
 * direction marker @dir, the peer NID and the peer RX context, followed by the
 * caller's @fmt message and a newline.
 *
 * Both peer fields are read only when KFILND_TN_PEER_VALID(tn) holds; with no
 * usable peer the NID is printed as "NA" and the RX context as 0, so logging
 * never dereferences a NULL or error peer pointer.
 *
 * @tn is evaluated more than once, so it must be free of side effects.
 */
#define KFILND_TN_DIR_DEBUG(tn, fmt, dir, ...) \
	CDEBUG(D_NET, "Transaction ID %p: %s:%u %s %s:%llu " fmt "\n", \
	       (tn), \
	       libcfs_nidstr(&(tn)->tn_ep->end_dev->kfd_ni->ni_nid), \
	       (tn)->tn_ep->end_context_id, dir, \
	       KFILND_TN_PEER_VALID(tn) ? \
		libcfs_nid2str((tn)->peer->nid) : "NA", \
	       KFILND_TN_PEER_VALID(tn) ? \
		KFILND_RX_CONTEXT((tn)->peer->addr) : 0, \
	       ##__VA_ARGS__)
/* Log a transaction debug message, choosing the direction marker from the
 * transaction's role: "->" when this side initiated the transfer
 * ((tn)->is_initiator is set) and "<-" otherwise.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly one
 * statement and stays safe under an unbraced if/else in the caller.
 */
#define KFILND_TN_DEBUG(tn, fmt, ...) \
	do { \
		if ((tn)->is_initiator) \
			KFILND_TN_DIR_DEBUG(tn, fmt, "->", ##__VA_ARGS__); \
		else \
			KFILND_TN_DIR_DEBUG(tn, fmt, "<-", ##__VA_ARGS__); \
	} while (0)
/* Emit a network error message (via CNETERR) for a transaction. The line
 * carries, in order: the transaction pointer, the local NID and endpoint
 * context id, the direction marker @dir, the peer NID and the peer RX context,
 * followed by the caller's @fmt message and a newline.
 *
 * Both peer fields are read only when KFILND_TN_PEER_VALID(tn) holds; with no
 * usable peer the NID is printed as "NA" and the RX context as 0, so error
 * reporting never dereferences a NULL or error peer pointer.
 *
 * @tn is evaluated more than once, so it must be free of side effects.
 */
#define KFILND_TN_DIR_ERROR(tn, fmt, dir, ...) \
	CNETERR("Transaction ID %p: %s:%u %s %s:%llu " fmt "\n", \
		(tn), \
		libcfs_nidstr(&(tn)->tn_ep->end_dev->kfd_ni->ni_nid), \
		(tn)->tn_ep->end_context_id, dir, \
		KFILND_TN_PEER_VALID(tn) ? \
			libcfs_nid2str((tn)->peer->nid) : "NA", \
		KFILND_TN_PEER_VALID(tn) ? \
			KFILND_RX_CONTEXT((tn)->peer->addr) : 0, \
		##__VA_ARGS__)
/* Log a transaction error message, choosing the direction marker from the
 * transaction's role: "->" when this side initiated the transfer
 * ((tn)->is_initiator is set) and "<-" otherwise.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly one
 * statement and stays safe under an unbraced if/else in the caller.
 */
#define KFILND_TN_ERROR(tn, fmt, ...) \
	do { \
		if ((tn)->is_initiator) \
			KFILND_TN_DIR_ERROR(tn, fmt, "->", ##__VA_ARGS__); \
		else \
			KFILND_TN_DIR_ERROR(tn, fmt, "<-", ##__VA_ARGS__); \
	} while (0)
508 /* TODO: Support NOOPs? */
509 enum kfilnd_msg_type {
510 /* Valid message types start at 1. */
513 /* Valid message types. */
514 KFILND_MSG_IMMEDIATE,
515 KFILND_MSG_BULK_PUT_REQ,
516 KFILND_MSG_BULK_GET_REQ,
517 KFILND_MSG_HELLO_REQ,
518 KFILND_MSG_HELLO_RSP,
520 /* Invalid max value. */
524 static inline const char *msg_type_to_str(enum kfilnd_msg_type type)
526 static const char *str[KFILND_MSG_MAX] = {
527 [KFILND_MSG_INVALID] = "KFILND_MSG_INVALID",
528 [KFILND_MSG_IMMEDIATE] = "KFILND_MSG_IMMEDIATE",
529 [KFILND_MSG_BULK_PUT_REQ] = "KFILND_MSG_BULK_PUT_REQ",
530 [KFILND_MSG_BULK_GET_REQ] = "KFILND_MSG_BULK_GET_REQ",
531 [KFILND_MSG_HELLO_REQ] = "KFILND_MSG_HELLO_REQ",
532 [KFILND_MSG_HELLO_RSP] = "KFILND_MSG_HELLO_RSP",
535 if (type >= KFILND_MSG_MAX)
536 return "KFILND_MSG_INVALID";
541 static inline const char *tn_state_to_str(enum tn_states type)
543 static const char *str[TN_STATE_MAX] = {
544 [TN_STATE_INVALID] = "TN_STATE_INVALID",
545 [TN_STATE_IDLE] = "TN_STATE_IDLE",
546 [TN_STATE_WAIT_TAG_COMP] = "TN_STATE_WAIT_TAG_COMP",
547 [TN_STATE_IMM_SEND] = "TN_STATE_IMM_SEND",
548 [TN_STATE_TAGGED_RECV_POSTED] = "TN_STATE_TAGGED_RECV_POSTED",
549 [TN_STATE_SEND_FAILED] = "TN_STATE_SEND_FAILED",
550 [TN_STATE_WAIT_COMP] = "TN_STATE_WAIT_COMP",
551 [TN_STATE_WAIT_TIMEOUT_COMP] = "TN_STATE_WAIT_TIMEOUT_COMP",
552 [TN_STATE_WAIT_SEND_COMP] = "TN_STATE_WAIT_SEND_COMP",
553 [TN_STATE_WAIT_TIMEOUT_TAG_COMP] = "TN_STATE_WAIT_TIMEOUT_TAG_COMP",
554 [TN_STATE_FAIL] = "TN_STATE_FAIL",
555 [TN_STATE_IMM_RECV] = "TN_STATE_IMM_RECV",
556 [TN_STATE_WAIT_TAG_RMA_COMP] = "TN_STATE_WAIT_TAG_RMA_COMP",
562 /* Transaction Events */
566 /* Initiator events. */
567 TN_EVENT_INIT_IMMEDIATE,
573 TN_EVENT_TAG_RX_FAIL,
574 TN_EVENT_TAG_RX_CANCEL,
581 TN_EVENT_INIT_TAG_RMA,
582 TN_EVENT_SKIP_TAG_RMA,
584 TN_EVENT_TAG_TX_FAIL,
586 /* Invalid max value. */
590 static inline const char *tn_event_to_str(enum tn_events type)
592 static const char *str[TN_EVENT_MAX] = {
593 [TN_EVENT_INVALID] = "TN_EVENT_INVALID",
594 [TN_EVENT_INIT_IMMEDIATE] = "TN_EVENT_INIT_IMMEDIATE",
595 [TN_EVENT_INIT_BULK] = "TN_EVENT_INIT_BULK",
596 [TN_EVENT_TX_HELLO] = "TN_EVENT_TX_HELLO",
597 [TN_EVENT_TX_OK] = "TN_EVENT_TX_OK",
598 [TN_EVENT_TX_FAIL] = "TN_EVENT_TX_FAIL",
599 [TN_EVENT_TAG_RX_OK] = "TN_EVENT_TAG_RX_OK",
600 [TN_EVENT_TAG_RX_FAIL] = "TN_EVENT_TAG_RX_FAIL",
601 [TN_EVENT_TAG_RX_CANCEL] = "TN_EVENT_TAG_RX_CANCEL",
602 [TN_EVENT_TIMEOUT] = "TN_EVENT_TIMEOUT",
603 [TN_EVENT_RX_HELLO] = "TN_EVENT_RX_HELLO",
604 [TN_EVENT_RX_OK] = "TN_EVENT_RX_OK",
605 [TN_EVENT_RX_FAIL] = "TN_EVENT_RX_FAIL",
606 [TN_EVENT_INIT_TAG_RMA] = "TN_EVENT_INIT_TAG_RMA",
607 [TN_EVENT_SKIP_TAG_RMA] = "TN_EVENT_SKIP_TAG_RMA",
608 [TN_EVENT_TAG_TX_FAIL] = "TN_EVENT_TAG_TX_FAIL",
614 struct kfilnd_transaction_msg {
615 struct kfilnd_msg *msg;
619 /* Initiator and target transaction structure. */
620 struct kfilnd_transaction {
621 /* Endpoint list transaction lives on. */
622 struct list_head tn_entry;
623 struct mutex tn_lock; /* to serialize events */
624 int tn_status; /* return code from ops */
625 struct kfilnd_ep *tn_ep; /* endpoint we operate under */
626 enum tn_states tn_state; /* current state of Tn */
627 struct lnet_msg *tn_lntmsg; /* LNet msg to finalize */
628 struct lnet_msg *tn_getreply; /* GET LNet msg to finalize */
630 bool is_initiator; /* Initiated LNet transfer. */
632 /* Transaction send message and target address. */
633 kfi_addr_t tn_target_addr;
634 struct kfilnd_peer *peer;
635 struct kfilnd_transaction_msg tn_tx_msg;
637 /* Transaction multi-receive buffer and associated receive message. */
638 struct kfilnd_immediate_buffer *tn_posted_buf;
639 struct kfilnd_transaction_msg tn_rx_msg;
641 /* LNet buffer used to register a memory region or perform a RMA
644 struct bio_vec tn_kiov[LNET_MAX_IOV];
645 unsigned int tn_num_iovec;
647 /* LNet transaction payload byte count. */
650 /* Bulk transaction buffer is sink or source buffer. */
653 /* Memory region and remote key used to cover initiator's buffer. */
656 /* RX context used to perform response operations to a Put/Get
657 * request. This is required since the request initiator locks a
658 * transaction to a specific RX context.
660 u16 tn_response_mr_key;
663 /* Immediate data used to convey transaction state from LNet target to
668 /* Bulk operation timeout timer. */
669 struct timer_list timeout_timer;
670 struct work_struct timeout_work;
672 /* Transaction health status. */
673 enum lnet_msg_hstatus hstatus;
675 /* Transaction deadline. */
682 /* Fields used to replay transaction. */
683 struct list_head replay_entry;
684 enum tn_events replay_event;
687 enum kfilnd_msg_type msg_type;
690 int kfilnd_send_hello_request(struct kfilnd_dev *dev, int cpt, lnet_nid_t nid);
692 #endif /* _KFILND_ */