4 #include <asm/system.h>
5 #include <asm/uaccess.h>
6 #include <asm/segment.h>
7 #include <linux/config.h>
8 #include <linux/module.h>
9 #include <linux/kernel.h>
11 #include <linux/string.h>
13 #include <linux/ipc.h>
14 #include <linux/shm.h>
16 #include <linux/stat.h>
17 #include <linux/errno.h>
18 #include <linux/locks.h>
19 #include <linux/unistd.h>
20 #include <linux/init.h>
22 #include <linux/file.h>
23 #include <linux/list.h>
27 #define DEBUG_SUBSYSTEM S_IBNAL
29 #include <portals/p30.h>
30 #include <portals/lib-p30.h>
31 #include <linux/kp30.h>
32 #include <linux/kpr.h>
34 // Infiniband VAPI/EVAPI header files
35 // Mellanox MT23108 VAPI
37 #include <vapi_types.h>
38 #include <vapi_common.h>
41 // pick a port for this RDMA information exchange between two hosts
42 #define HOST_PORT 11211
43 #define QUEUE_SIZE 1024
46 #define DEBUG_SUBSYSTEM S_IBNAL
48 #define START_SEND_WRQ_ID 0
49 #define START_RECV_WRQ_ID 0
50 #define START_RDMA_WRQ_ID 0
52 #define DEFAULT_PRIORITY 100
54 #define WAIT_FOT_R_RDMA_TIMEOUT 10000
55 #define MAX_NUM_TRY 3000
57 #define MAX_NUM_POLL 300
58 #define MAX_LOOP_COUNT 500
61 #define MCG_BUF_LENGTH 128
63 #define SHARED_SEGMENT_SIZE 0x10000
64 #define HCA_EXCHANGE_SHM_KEY 999 // shared memory key for HCA data exchange
66 // some internal opcodes for IB operations used in IBNAL
67 #define SEND_QP_INFO 0X00000001
68 #define RECV_QP_INFO 0X00000010
70 // Mellanox InfiniHost MT23108
71 // QP/CQ related information
74 #define MTU_256 1 /* 1-256,2-512,3-1024,4-2048 */
75 #define MTU_512 2 /* 1-256,2-512,3-1024,4-2048 */
76 #define MTU_1024 3 /* 1-256,2-512,3-1024,4-2048 */
77 #define MTU_2048 4 /* 1-256,2-512,3-1024,4-2048 */
79 // number of entries for each CQ and WQ
80 // how much do we need ?
87 #define NUM_RDMA_RESERVED_ENTRY 128
90 #define INVALID_WR_ID ((VAPI_wr_id_t) -1)
95 // Portals can support up to 64 IO-Vectors
96 // how much do we need ?
// Byte-size helpers.
// Every replacement list that contains an operator is wrapped in
// parentheses so the macro keeps its value when it is used inside a larger
// expression (e.g. len / KB_4 or len % ONE_MB).  ONE_KB is not defined in
// this group and must be visible wherever ONE_MB / ONE_GB are expanded.
#define ONE_MB (1024 * ONE_KB)
#define ONE_GB (1024 * ONE_MB)

#define KB_4   (1024 * 4)
#define KB_8   (1024 * 8)
#define KB_16  (1024 * 16)
#define KB_32  (1024 * 32)
#define KB_64  (1024 * 64)
#define KB_128 (1024 * 128)
#define KB_256 (1024 * 256)

// 256 entries in the registered buffer list
// small-size message threshold
#define SMALL_MSG_SIZE KB_32

// largest message size: 512 MB
#define MAX_MSG_SIZE (ONE_MB * 512)
128 // 128 64KB buffers for send
129 // 128 64KB buffers for recv
130 // used in RDMA operations only
132 #define NUM_ENTRY 128
// Running totals of the Num_*_KB counts: each End_x is the previous End
// plus the count for size class x.  The Num_*_KB names are not defined in
// this group and must be visible wherever these macros are expanded.
// Each replacement list is parenthesized so the sum is not re-associated
// when the macro appears next to a higher-precedence operator
// (e.g. 2 * End_8_kb or idx % End_256_kb).
#define End_4_kb   (Num_4_KB)
#define End_8_kb   (End_4_kb + Num_8_KB)
#define End_16_kb  (End_8_kb + Num_16_KB)
#define End_32_kb  (End_16_kb + Num_32_KB)
#define End_64_kb  (End_32_kb + Num_64_KB)
#define End_128_kb (End_64_kb + Num_128_KB)
#define End_256_kb (End_128_kb + Num_256_KB)
143 #define SEND_BUF_SIZE KB_32
144 #define RECV_BUF_SIZE SEND_BUF_SIZE
146 // #define POLL_BASED_CQE_HANDLING 1
147 #define EVENT_BASED_CQE_HANDLING 1
148 #define IBNAL_SELF_TESTING 1
150 #ifdef IBNAL_SELF_TESTING
151 #undef IBNAL_SELF_TESTING
155 #define MSG_SIZE_SMALL 1
156 #define MSG_SIZE_LARGE 2
160 // some default configuration values for early testing
161 #define DEFAULT_DLID 1 // default destination link ID
162 #define DEFAULT_QP_NUM 4 // default QP number
163 #define P_KEY 0xFFFF // do we need default value
164 #define PKEY_IX 0x0 // do we need default value
165 #define Q_KEY 0x012 // do we need default value
166 #define L_KEY 0x12345678 // do we need default value
167 #define R_KEY 0x87654321 // do we need default value
168 #define HCA_ID "InfiniHost0" // default
170 #define START_SQ_PSN 0
171 #define START_RQ_PSN 0
174 #define __u_long_long unsigned long long
176 #define IBNAL_DEBUG 1
178 #define USE_SHARED_MEMORY_AND_SOCKET 1
181 #define TRY_SEND_ONLY 1
187 // Common per-QP state used for IB queue-pair operations.
188 // Each QP is associated with one QP_info structure.
190 typedef struct QP_info
192 VAPI_hca_hndl_t hca_hndl; // HCA handle
193 IB_port_t port; // port number
194 VAPI_qp_hndl_t qp_hndl; // QP's handle (a single handle, not a list)
195 VAPI_qp_state_t qp_state; // QP's current state
196 VAPI_pd_hndl_t pd_hndl; // protection domain
197 VAPI_cq_hndl_t cq_hndl; // CQ handle -- NOTE(review): was labelled "send-queue" like sq_cq_hndl; confirm its role
198 VAPI_cq_hndl_t sq_cq_hndl; // send-queue CQ's handle
199 VAPI_cq_hndl_t rq_cq_hndl; // receive-queue CQ's handle
200 VAPI_ud_av_hndl_t av_hndl; // UD address-vector handle (per its type); old comment was a copy of rq_cq_hndl's
201 VAPI_qp_init_attr_t qp_init_attr; // QP's init attribute
202 VAPI_qp_attr_t qp_attr; // QP's attribute - dlid
203 VAPI_qp_prop_t qp_prop; // QP's properties
204 VAPI_hca_port_t hca_port; // HCA port properties (per its type)
205 VAPI_qp_num_t qp_num; // QP's number
206 VAPI_qp_num_t rqp_num; // remote QP's number
212 VAPI_virt_addr_t buf_addr; // virtual address of the buffer (per field name)
215 VAPI_mr_hndl_t mr_hndl; // memory-region handle (per its type)
216 VAPI_virt_addr_t raddr; // presumably the remote address -- TODO confirm
220 VAPI_wr_id_t last_posted_send_id; // user defined work request ID of the last posted send
221 VAPI_wr_id_t last_posted_rcv_id; // user defined work request ID of the last posted receive
222 VAPI_mw_hndl_t mw_hndl; // memory window handle
223 VAPI_rkey_t mw_rkey; // memory window rkey
224 VAPI_sg_lst_entry_t sg_lst[256]; // scatter and gather list
225 int sg_list_sz; // set as NUM_SGE
226 VAPI_wr_id_t wr_id; // work request ID -- NOTE(review): which request this tracks is not visible here
227 spinlock_t snd_mutex; // spinlock despite the name; presumably guards send-side state -- confirm
228 spinlock_t rcv_mutex; // spinlock despite the name; presumably guards receive-side state -- confirm
230 spinlock_t cln_mutex; // spinlock despite the name; NOTE(review): protected state not visible here
231 int cur_RDMA_outstanding; // counter; presumably RDMA operations currently outstanding -- confirm
232 int cur_send_outstanding; // counter; presumably sends currently outstanding -- confirm
233 int cur_posted_rcv_bufs; // counter; presumably receive buffers currently posted -- confirm
239 #define BUF_REGISTERED 0x10000000
240 #define BUF_INUSE 0x01000000
241 #define BUF_UNREGISTERED 0x00100000
244 #define REG_BUF 0x10000000
245 #define RDMA_BUF 0x01000000
/*
 * IMM_nnn: the value nnn placed in bits 32 and up of a 64-bit word.
 *
 * The constants are written as unsigned long long because shifting a plain
 * int by 32 is undefined behaviour when int is 32 bits wide.  The
 * replacement lists deliberately carry no trailing ';' so the macros can be
 * used inside expressions (x | IMM_001) as well as in plain assignments
 * (x = IMM_001;).
 */
#define IMM_000 (0ULL << 32)
#define IMM_001 (1ULL << 32)
#define IMM_002 (2ULL << 32)
#define IMM_003 (3ULL << 32)
#define IMM_004 (4ULL << 32)
#define IMM_005 (5ULL << 32)
#define IMM_006 (6ULL << 32)
#define IMM_007 (7ULL << 32)
#define IMM_008 (8ULL << 32)
#define IMM_009 (9ULL << 32)
#define IMM_010 (10ULL << 32)
#define IMM_011 (11ULL << 32)
#define IMM_012 (12ULL << 32)
#define IMM_013 (13ULL << 32)
#define IMM_014 (14ULL << 32)
#define IMM_015 (15ULL << 32)
#define IMM_016 (16ULL << 32)
#define IMM_017 (17ULL << 32)
#define IMM_018 (18ULL << 32)
#define IMM_019 (19ULL << 32)
#define IMM_020 (20ULL << 32)
#define IMM_021 (21ULL << 32)
#define IMM_022 (22ULL << 32)
#define IMM_023 (23ULL << 32)
#define IMM_024 (24ULL << 32)
#define IMM_025 (25ULL << 32)
#define IMM_026 (26ULL << 32)
#define IMM_027 (27ULL << 32)
#define IMM_028 (28ULL << 32)
#define IMM_029 (29ULL << 32)
#define IMM_030 (30ULL << 32)
#define IMM_031 (31ULL << 32)
285 typedef struct Memory_buffer_info{ // element type of the MRbuf_list[] and MSbuf_list[] arrays declared in this header
287 VAPI_virt_addr_t buf_addr; // virtual address of the buffer (per field name)
290 VAPI_mr_hndl_t mr_hndl; // memory-region handle (per its type)
294 VAPI_virt_addr_t raddr; // presumably the remote address -- TODO confirm
297 } Memory_buffer_info;
299 typedef struct RDMA_Info_Exchange { // type of Rdma_info / Local_rdma_info and of the CTS_handshaking_protocol() argument
302 VAPI_mrw_t recv_rdma_mr; // receive-side RDMA memory region descriptor (per field name)
303 VAPI_mr_hndl_t recv_rdma_mr_hndl; // handle for the receive-side RDMA memory region
304 VAPI_mrw_t send_rdma_mr; // send-side RDMA memory region descriptor (per field name)
305 VAPI_mr_hndl_t send_rdma_mr_hndl; // handle for the send-side RDMA memory region
306 VAPI_virt_addr_t raddr; // presumably the remote address -- TODO confirm
309 } RDMA_Info_Exchange;
311 // opcode for Rdma info exchange RTS/CTS
312 #define Ready_To_send 0x10000000
313 #define Clear_To_send 0x01000000
315 #define RDMA_RTS_ID 5555
316 #define RDMA_CTS_ID 7777
317 #define RDMA_OP_ID 9999
318 #define SEND_RECV_TEST_ID 2222
319 #define SEND_RECV_TEST_BUF_ID 0
321 #define TEST_SEND_MESSAGE 0x00000001
322 #define TEST_RECV_MESSAGE 0x00000002
325 #define RTS_CTS_TIMEOUT 50
326 #define RECEIVING_THREAD_TIMEOUT 50
327 #define WAIT_FOR_SEND_BUF_TIMEOUT 50
// Debug masks.
// LEVEL_1 enables every bit.  LEVEL_2 is the OR of the listed D_* flags
// (defined outside this group).  The OR chain is parenthesized so that
// "mask & IBNAL_DEBUG_LEVEL_2" tests against the whole set instead of
// binding '&' to D_PORTALS alone.
#define IBNAL_DEBUG_LEVEL_1 0XFFFFFFFF
#define IBNAL_DEBUG_LEVEL_2 (D_PORTALS | D_NET | D_WARNING | D_MALLOC | \
                             D_ERROR | D_OTHER | D_TRACE | D_INFO)
334 // flag for RDMA info exchange
335 #define RDMA_BUFFER_RESERVED 0x10000000
336 #define RDMA_BUFFER_UNAVAILABLE 0x01000000
339 // receiving data structure
341 ptl_hdr_t *krx_buffer; // pointer to receiving buffer
342 unsigned long krx_len; // length of buffer
343 unsigned int krx_size; //
344 unsigned int krx_priority; // do we need this
345 struct list_head krx_item;
348 // transmitting data structure
352 lib_msg_t *ktx_cookie;
355 unsigned long ktx_size;
357 unsigned int ktx_priority;
358 unsigned int ktx_tgt_node;
359 unsigned int ktx_tgt_port_id;
365 char kib_shuttingdown;
366 IB_port_t port_num; // IB port information
367 struct list_head kib_list;
371 struct kib_trans *kib_trans; // do I need this
372 struct tq_struct kib_ready_tq;
373 spinlock_t kib_dispatch_lock;
378 // A data structure for keeping the HCA information in system
379 // information related to HCA and hca_handle will be kept here
381 typedef struct HCA_Info
383 VAPI_hca_hndl_t hca_hndl; // HCA handle
384 VAPI_pd_hndl_t pd_hndl; // protection domain
385 IB_port_t port; // port number
386 int num_qp; // number of qp used
387 QP_info *qp_ptr[NUM_QPS]; // point to QP_list
388 int num_cq; // number of cq used
389 VAPI_cq_hndl_t cq_hndl;
390 VAPI_cq_hndl_t sq_cq_hndl;
391 VAPI_cq_hndl_t rq_cq_hndl;
394 kibnal_data_t *kib_data; // for PORTALS operations
400 // Remote HCA Info information
401 typedef struct Remote_HCA_Info {
402 unsigned long opcode;
403 unsigned long length;
404 IB_lid_t dlid[NUM_QPS];
405 VAPI_qp_num_t rqp_num[NUM_QPS];
408 typedef struct Bucket_index{
413 // functional prototypes
414 // infiniband initialization
415 int kib_init(kibnal_data_t *);
418 void kibnal_recv_thread(HCA_info *);
419 void recv_thread(HCA_info *);
421 // forward data packet
422 void kibnal_fwd_packet (void *, kpr_fwd_desc_t *);
424 // global data structures
425 extern kibnal_data_t kibnal_data;
426 extern ptl_handle_ni_t kibnal_ni;
427 extern nal_t kibnal_api;
428 extern nal_cb_t kibnal_lib;
429 extern QP_info QP_list[];
430 extern QP_info CQ_list[];
431 extern HCA_info Hca_data;
432 extern VAPI_hca_hndl_t Hca_hndl;
433 extern VAPI_pd_hndl_t Pd_hndl;
434 extern VAPI_hca_vendor_t Hca_vendor;
435 extern VAPI_hca_cap_t Hca_cap;
436 extern VAPI_hca_port_t Hca_port_1_props;
437 extern VAPI_hca_port_t Hca_port_2_props;
438 extern VAPI_hca_attr_t Hca_attr;
439 extern VAPI_hca_attr_mask_t Hca_attr_mask;
440 extern VAPI_cq_hndl_t Cq_SQ_hndl;
441 extern VAPI_cq_hndl_t Cq_RQ_hndl;
442 extern VAPI_cq_hndl_t Cq_hndl;
443 extern unsigned long User_Defined_Small_Msg_Size;
444 extern Remote_QP_Info L_HCA_RDMA_Info;
445 extern Remote_QP_Info R_HCA_RDMA_Info;
446 extern unsigned int Num_posted_recv_buf;
447 extern int R_RDMA_DATA_ARRIVED;
448 extern Memory_buffer_info MRbuf_list[];
449 extern Memory_buffer_info MSbuf_list[];
450 extern Bucket_index Bucket[];
451 extern RDMA_Info_Exchange Rdma_info;
452 extern int Cts_Message_arrived;
453 extern RDMA_Info_Exchange Local_rdma_info;
454 extern spinlock_t MSB_mutex[];
458 // kernel NAL API function prototype
459 int kibnal_forward(nal_t *,int ,void *,size_t ,void *,size_t );
460 void kibnal_lock(nal_t *, unsigned long *);
461 void kibnal_unlock(nal_t *, unsigned long *);
462 int kibnal_shutdown(nal_t *, int );
463 void kibnal_yield( nal_t * );
464 void kibnal_invalidate(nal_cb_t *,void *,size_t ,void *);
465 int kibnal_validate(nal_cb_t *,void *,size_t ,void **);
469 nal_t *kibnal_init(int , ptl_pt_index_t , ptl_ac_index_t , ptl_pid_t );
470 void __exit kibnal_finalize(void );
471 VAPI_ret_t create_qp(QP_info *, int );
472 VAPI_ret_t init_qp(QP_info *, int );
473 VAPI_ret_t IB_Open_HCA(kibnal_data_t *);
474 VAPI_ret_t IB_Close_HCA(void );
475 VAPI_ret_t createMemRegion(VAPI_hca_hndl_t, VAPI_pd_hndl_t);
476 VAPI_ret_t deleteMemRegion(QP_info *, int );
478 void ibnal_send_recv_self_testing(int *);
480 int __init kibnal_initialize(void);
484 /* CB NAL functions */
485 int kibnal_send(nal_cb_t *,
496 int kibnal_send_pages(nal_cb_t *,
506 int kibnal_recv(nal_cb_t *, void *, lib_msg_t *,
507 unsigned int, struct iovec *, size_t, size_t);
508 int kibnal_recv_pages(nal_cb_t *, void *, lib_msg_t *,
509 unsigned int, ptl_kiov_t *, size_t, size_t);
510 int kibnal_read(nal_cb_t *,void *,void *,user_ptr ,size_t );
511 int kibnal_write(nal_cb_t *,void *,user_ptr ,void *,size_t );
512 int kibnal_callback(nal_cb_t * , void *, lib_eq_t *, ptl_event_t *);
513 void *kibnal_malloc(nal_cb_t *,size_t );
514 void kibnal_free(nal_cb_t *,void *,size_t );
515 int kibnal_map(nal_cb_t *, unsigned int , struct iovec *, void **);
516 void kibnal_unmap(nal_cb_t *, unsigned int , struct iovec *, void **);
517 int kibnal_map_pages(nal_cb_t *, unsigned int , ptl_kiov_t *, void **);
518 void kibnal_unmap_pages(nal_cb_t * , unsigned int , ptl_kiov_t *, void **);
519 void kibnal_printf(nal_cb_t *, const char *, ...);
520 void kibnal_cli(nal_cb_t *,unsigned long *);
521 void kibnal_sti(nal_cb_t *,unsigned long *);
522 int kibnal_dist(nal_cb_t *,ptl_nid_t ,unsigned long *);
524 void kibnal_fwd_packet (void *, kpr_fwd_desc_t *);
525 void kibnal_rx(kibnal_data_t *,
531 int kibnal_end(kibnal_data_t *);
533 void async_event_handler(VAPI_hca_hndl_t , VAPI_event_record_t *,void *);
535 void CQE_event_handler(VAPI_hca_hndl_t ,VAPI_cq_hndl_t , void *);
538 VAPI_ret_t Send_Small_Msg(char *, int );
539 VAPI_ret_t Send_Large_Msg(char *, int );
541 VAPI_ret_t repost_recv_buf(QP_info *, VAPI_wr_id_t );
542 int post_recv_bufs(VAPI_wr_id_t );
543 int server_listen_thread(void *);
544 VAPI_wr_id_t RTS_handshaking_protocol(int );
545 VAPI_wr_id_t CTS_handshaking_protocol(RDMA_Info_Exchange *);
547 VAPI_ret_t createMemRegion_RDMA(VAPI_hca_hndl_t ,
555 VAPI_ret_t IB_Set_Event_Handler(HCA_info , kibnal_data_t *);
557 VAPI_ret_t IB_Set_Async_Event_Handler(HCA_info ,kibnal_data_t *);
559 VAPI_wr_id_t find_available_buf(int );
560 VAPI_wr_id_t search_send_buf(int );
561 VAPI_wr_id_t find_filler_list(int ,int );
562 int insert_MRbuf_list(int );
565 #endif /* _IBNAL_H */