4 #include <asm/system.h>
5 #include <asm/uaccess.h>
6 #include <asm/segment.h>
7 #include <linux/config.h>
8 #include <linux/module.h>
9 #include <linux/kernel.h>
11 #include <linux/string.h>
13 #include <linux/ipc.h>
14 #include <linux/shm.h>
16 #include <linux/stat.h>
17 #include <linux/errno.h>
18 #include <linux/locks.h>
19 #include <linux/unistd.h>
20 #include <linux/init.h>
22 #include <linux/file.h>
23 #include <linux/list.h>
27 #define DEBUG_SUBSYSTEM S_IBNAL
29 #include <portals/p30.h>
30 #include <portals/lib-p30.h>
31 #include <linux/kp30.h>
33 // Infiniband VAPI/EVAPI header files
34 // Mellanox MT23108 VAPI
36 #include <vapi_types.h>
37 #include <vapi_common.h>
40 // pick a port for this RDMA information exchange between two hosts
41 #define HOST_PORT 11211
42 #define QUEUE_SIZE 1024
// NOTE(review): DEBUG_SUBSYSTEM is also defined, with the same value, ahead of
// the portals includes earlier in this header; one of the two is redundant.
45 #define DEBUG_SUBSYSTEM S_IBNAL
// starting IDs (per the names) for send / receive / RDMA work requests
47 #define START_SEND_WRQ_ID 0
48 #define START_RECV_WRQ_ID 0
49 #define START_RDMA_WRQ_ID 0
51 #define DEFAULT_PRIORITY 100
// NOTE(review): "FOT" looks like a typo for "FOR"; the name is left unchanged
// because other files may reference it. Units are not stated in this header.
53 #define WAIT_FOT_R_RDMA_TIMEOUT 10000
// retry / polling limits; the loops that consume them are not in this header
54 #define MAX_NUM_TRY 3000
56 #define MAX_NUM_POLL 300
57 #define MAX_LOOP_COUNT 500
60 #define MCG_BUF_LENGTH 128
62 #define SHARED_SEGMENT_SIZE 0x10000
63 #define HCA_EXCHANGE_SHM_KEY 999 // shared memory key for HCA data exchange
65 // some internal opcodes for IB operations used in IBNAL
66 #define SEND_QP_INFO 0X00000001
67 #define RECV_QP_INFO 0X00000010
69 // Mellanox InfiniHost MT23108
70 // QP/CQ related information
// MTU codes: the code value maps to an MTU in bytes as listed on each line
73 #define MTU_256 1 /* code 1 = 256 bytes (2 = 512, 3 = 1024, 4 = 2048) */
74 #define MTU_512 2 /* code 2 = 512 bytes (1 = 256, 3 = 1024, 4 = 2048) */
75 #define MTU_1024 3 /* code 3 = 1024 bytes (1 = 256, 2 = 512, 4 = 2048) */
76 #define MTU_2048 4 /* code 4 = 2048 bytes (1 = 256, 2 = 512, 3 = 1024) */
78 // number of entries for each CQ and WQ
79 // how much do we need ?
86 #define NUM_RDMA_RESERVED_ENTRY 128
// all-ones work-request ID, used as the "invalid" marker value
89 #define INVALID_WR_ID ((VAPI_wr_id_t) -1)
94 // Portals can support up to 64 IO-Vectors
95 // how much do we need ?
// Byte-size helpers.
// Every expansion is wrapped in parentheses so that the macro keeps its value
// when it appears inside a larger expression (for example `len / KB_4` or
// `len % SMALL_MSG_SIZE`); without them `len / KB_4` expanded to
// `len / 1024 * 4`.
// NOTE(review): ONE_KB is not defined in this span - confirm it is defined
// before ONE_MB / ONE_GB / MAX_MSG_SIZE are first used.
#define ONE_MB (1024 * ONE_KB)
#define ONE_GB (1024 * ONE_MB)

#define KB_4 (1024 * 4)
#define KB_8 (1024 * 8)
#define KB_16 (1024 * 16)
#define KB_32 (1024 * 32)
#define KB_64 (1024 * 64)
#define KB_128 (1024 * 128)
#define KB_256 (1024 * 256)

// 256 entry in registered buffer list
// small size message
#define SMALL_MSG_SIZE KB_32

// largest message size: 512 * ONE_MB
#define MAX_MSG_SIZE (ONE_MB * 512)
127 // 128 buffers of 64KB each for send
128 // 128 buffers of 64KB each for recv
129 // used in RDMA operation only
// NOTE(review): SEND_BUF_SIZE / RECV_BUF_SIZE in this header are KB_32, not
// 64KB - confirm which buffers the "64KB" in the comments above refers to.
131 #define NUM_ENTRY 128
// Running totals over the per-size entry counts: each End_* is the previous
// End_* plus the Num_*_KB count for that size (the Num_*_KB names are defined
// outside this span).
// Each expansion is parenthesized so the sum is evaluated as a unit when the
// macro is used next to operators such as `*`, `-` or `<`.
#define End_4_kb (Num_4_KB)
#define End_8_kb (End_4_kb + Num_8_KB)
#define End_16_kb (End_8_kb + Num_16_KB)
#define End_32_kb (End_16_kb + Num_32_KB)
#define End_64_kb (End_32_kb + Num_64_KB)
#define End_128_kb (End_64_kb + Num_128_KB)
#define End_256_kb (End_128_kb + Num_256_KB)
// send and receive buffers share one size (KB_32)
142 #define SEND_BUF_SIZE KB_32
143 #define RECV_BUF_SIZE SEND_BUF_SIZE
// completion handling mode: the poll-based switch is left commented out,
// the event-based switch is the one defined
145 // #define POLL_BASED_CQE_HANDLING 1
146 #define EVENT_BASED_CQE_HANDLING 1
// IBNAL_SELF_TESTING is defined and then immediately undefined again by the
// #ifdef/#undef pair below, so it ends up undefined.
// NOTE(review): the matching #endif is not visible in this view.
147 #define IBNAL_SELF_TESTING 1
149 #ifdef IBNAL_SELF_TESTING
150 #undef IBNAL_SELF_TESTING
154 #define MSG_SIZE_SMALL 1
155 #define MSG_SIZE_LARGE 2
159 // some default configuration values for early testing
160 #define DEFAULT_DLID 1 // default destination link ID
161 #define DEFAULT_QP_NUM 4 // default QP number
162 #define P_KEY 0xFFFF // do we need default value
163 #define PKEY_IX 0x0 // do we need default value
164 #define Q_KEY 0x012 // do we need default value
165 #define L_KEY 0x12345678 // do we need default value
166 #define R_KEY 0x87654321 // do we need default value
167 #define HCA_ID "InfiniHost0" // default
// starting packet sequence numbers (per the names) for send / receive queues
169 #define START_SQ_PSN 0
170 #define START_RQ_PSN 0
// NOTE(review): identifiers beginning with a double underscore are reserved
// for the implementation; kept as-is because users of this name are not visible.
173 #define __u_long_long unsigned long long
175 #define IBNAL_DEBUG 1
177 #define USE_SHARED_MEMORY_AND_SOCKET 1
180 #define TRY_SEND_ONLY 1
186 // a common data structure for IB QP's operation
187 // each QP is associated with a QP_info structure
// NOTE(review): the lines holding this struct's opening and closing braces
// (and possibly some fields) are not visible in this view.
189 typedef struct QP_info
191 VAPI_hca_hndl_t hca_hndl; // HCA handle
192 IB_port_t port; // port number
193 VAPI_qp_hndl_t qp_hndl; // QP's handle (a single handle, not a list)
194 VAPI_qp_state_t qp_state; // QP's current state
195 VAPI_pd_hndl_t pd_hndl; // protection domain
196 VAPI_cq_hndl_t cq_hndl; // CQ handle; NOTE(review): the original comment repeated sq_cq_hndl's text - confirm which queue(s) this CQ serves
197 VAPI_cq_hndl_t sq_cq_hndl; // send-queue CQ's handle
198 VAPI_cq_hndl_t rq_cq_hndl; // receive-queue CQ's handle
199 VAPI_ud_av_hndl_t av_hndl; // presumably the UD address-vector handle (per its type name); the original comment was a copy of rq_cq_hndl's - confirm
200 VAPI_qp_init_attr_t qp_init_attr; // QP's init attribute
201 VAPI_qp_attr_t qp_attr; // QP's attribute - dlid
202 VAPI_qp_prop_t qp_prop; // QP's properties
203 VAPI_hca_port_t hca_port;
204 VAPI_qp_num_t qp_num; // QP's number
205 VAPI_qp_num_t rqp_num; // remote QP's number
211 VAPI_virt_addr_t buf_addr;
214 VAPI_mr_hndl_t mr_hndl;
215 VAPI_virt_addr_t raddr;
219 VAPI_wr_id_t last_posted_send_id; // user defined work request ID
220 VAPI_wr_id_t last_posted_rcv_id; // user defined work request ID
221 VAPI_mw_hndl_t mw_hndl; // memory window handle
222 VAPI_rkey_t mw_rkey; // memory window rkey
223 VAPI_sg_lst_entry_t sg_lst[256]; // scatter and gather list
224 int sg_list_sz; // set as NUM_SGE
225 VAPI_wr_id_t wr_id; //
// the three "mutex" members below are spinlock_t, not sleeping mutexes
226 spinlock_t snd_mutex;
227 spinlock_t rcv_mutex;
229 spinlock_t cln_mutex;
// outstanding-operation counters (per the names); how they are maintained is
// not visible in this header
230 int cur_RDMA_outstanding;
231 int cur_send_outstanding;
232 int cur_posted_rcv_bufs;
// buffer state flags
238 #define BUF_REGISTERED 0x10000000
239 #define BUF_INUSE 0x01000000
240 #define BUF_UNREGISTERED 0x00100000
// buffer kind flags
// NOTE(review): REG_BUF has the same value as BUF_REGISTERED and RDMA_BUF the
// same value as BUF_INUSE - confirm the two groups are never stored in the
// same field.
243 #define REG_BUF 0x10000000
244 #define RDMA_BUF 0x01000000
// IMM_nnn: the number nnn placed in bits 32 and above of a 64-bit value.
//
// The previous definitions shifted a plain int by 32, which is undefined when
// int is 32 bits wide, and each expansion ended in ';', so the macros could
// not be used inside an expression or an argument list. The operand is now
// unsigned long long and the trailing ';' is gone; a statement such as
// `x = IMM_001;` still compiles unchanged.
// NOTE(review): if any of these is stored into a 32-bit field the upper bits
// are truncated away - confirm the destination type at each use.
#define IMM_000 (0ULL << 32)
#define IMM_001 (1ULL << 32)
#define IMM_002 (2ULL << 32)
#define IMM_003 (3ULL << 32)
#define IMM_004 (4ULL << 32)
#define IMM_005 (5ULL << 32)
#define IMM_006 (6ULL << 32)
#define IMM_007 (7ULL << 32)
#define IMM_008 (8ULL << 32)
#define IMM_009 (9ULL << 32)
#define IMM_010 (10ULL << 32)
#define IMM_011 (11ULL << 32)
#define IMM_012 (12ULL << 32)
#define IMM_013 (13ULL << 32)
#define IMM_014 (14ULL << 32)
#define IMM_015 (15ULL << 32)
#define IMM_016 (16ULL << 32)
#define IMM_017 (17ULL << 32)
#define IMM_018 (18ULL << 32)
#define IMM_019 (19ULL << 32)
#define IMM_020 (20ULL << 32)
#define IMM_021 (21ULL << 32)
#define IMM_022 (22ULL << 32)
#define IMM_023 (23ULL << 32)
#define IMM_024 (24ULL << 32)
#define IMM_025 (25ULL << 32)
#define IMM_026 (26ULL << 32)
#define IMM_027 (27ULL << 32)
#define IMM_028 (28ULL << 32)
#define IMM_029 (29ULL << 32)
#define IMM_030 (30ULL << 32)
#define IMM_031 (31ULL << 32)
// Per-buffer bookkeeping record; MRbuf_list[] and MSbuf_list[] are declared
// in this header as arrays of this type.
// NOTE(review): some member lines of this struct are not visible in this view.
284 typedef struct Memory_buffer_info{
286 VAPI_virt_addr_t buf_addr; // virtual address of the buffer (per the name)
289 VAPI_mr_hndl_t mr_hndl; // memory-region handle (per the type name)
293 VAPI_virt_addr_t raddr; // presumably the remote-side address - confirm
296 } Memory_buffer_info;
// Record holding the memory-region data for the receive and send RDMA sides;
// Rdma_info and Local_rdma_info are declared with this type and
// CTS_handshaking_protocol() takes a pointer to one.
// NOTE(review): some member lines of this struct are not visible in this view.
298 typedef struct RDMA_Info_Exchange {
301 VAPI_mrw_t recv_rdma_mr; // receive-side RDMA memory region
302 VAPI_mr_hndl_t recv_rdma_mr_hndl; // handle for recv_rdma_mr
303 VAPI_mrw_t send_rdma_mr; // send-side RDMA memory region
304 VAPI_mr_hndl_t send_rdma_mr_hndl; // handle for send_rdma_mr
305 VAPI_virt_addr_t raddr; // presumably the remote-side address - confirm
308 } RDMA_Info_Exchange;
310 // opcodes for RDMA info exchange RTS/CTS
311 #define Ready_To_send 0x10000000
312 #define Clear_To_send 0x01000000
// fixed IDs for the RTS / CTS / RDMA operations and for the send-recv test
314 #define RDMA_RTS_ID 5555
315 #define RDMA_CTS_ID 7777
316 #define RDMA_OP_ID 9999
317 #define SEND_RECV_TEST_ID 2222
318 #define SEND_RECV_TEST_BUF_ID 0
// message kinds for the send/recv test
320 #define TEST_SEND_MESSAGE 0x00000001
321 #define TEST_RECV_MESSAGE 0x00000002
// timeouts; NOTE(review): the unit is not stated anywhere in this header
324 #define RTS_CTS_TIMEOUT 50
325 #define RECEIVING_THREAD_TIMEOUT 50
326 #define WAIT_FOR_SEND_BUF_TIMEOUT 50
// Debug masks.
// IBNAL_DEBUG_LEVEL_1 has all 32 bits set; IBNAL_DEBUG_LEVEL_2 is the union
// of the listed D_* flags (defined outside this header).
// The union is parenthesized so that `mask & IBNAL_DEBUG_LEVEL_2` tests
// against the whole set; unparenthesized it parsed as
// `(mask & D_PORTALS) | D_NET | ...`, which is always non-zero.
#define IBNAL_DEBUG_LEVEL_1 0XFFFFFFFF
#define IBNAL_DEBUG_LEVEL_2 (D_PORTALS | D_NET | D_WARNING | D_MALLOC | \
                             D_ERROR | D_OTHER | D_TRACE | D_INFO)
333 // flags for RDMA info exchange
334 #define RDMA_BUFFER_RESERVED 0x10000000
335 #define RDMA_BUFFER_UNAVAILABLE 0x01000000
338 // receiving data structure
340 ptl_hdr_t *krx_buffer; // pointer to receiving buffer
341 unsigned long krx_len; // length of buffer
342 unsigned int krx_size; //
343 unsigned int krx_priority; // do we need this
344 struct list_head krx_item;
347 // transmitting data structure
351 lib_msg_t *ktx_cookie;
354 unsigned long ktx_size;
356 unsigned int ktx_priority;
357 unsigned int ktx_tgt_node;
358 unsigned int ktx_tgt_port_id;
364 char kib_shuttingdown;
365 IB_port_t port_num; // IB port information
366 struct list_head kib_list;
370 struct kib_trans *kib_trans; // do I need this
371 struct tq_struct kib_ready_tq;
372 spinlock_t kib_dispatch_lock;
377 // A data structure for keeping the HCA information in system
378 // information related to HCA and hca_handle will be kept here
380 typedef struct HCA_Info
382 VAPI_hca_hndl_t hca_hndl; // HCA handle
383 VAPI_pd_hndl_t pd_hndl; // protection domain
384 IB_port_t port; // port number
385 int num_qp; // number of qp used
386 QP_info *qp_ptr[NUM_QPS]; // point to QP_list
387 int num_cq; // number of cq used
388 VAPI_cq_hndl_t cq_hndl;
389 VAPI_cq_hndl_t sq_cq_hndl;
390 VAPI_cq_hndl_t rq_cq_hndl;
393 kibnal_data_t *kib_data; // for PORTALS operations
399 // Remote HCA Info information
400 typedef struct Remote_HCA_Info {
401 unsigned long opcode;
402 unsigned long length;
403 IB_lid_t dlid[NUM_QPS];
404 VAPI_qp_num_t rqp_num[NUM_QPS];
407 typedef struct Bucket_index{
412 // function prototypes
413 // infiniband initialization
414 int kib_init(kibnal_data_t *);
// receive-thread entry points; both take a pointer to the HCA record
417 void kibnal_recv_thread(HCA_info *);
418 void recv_thread(HCA_info *);
420 // forward data packet
// NOTE(review): kibnal_fwd_packet is declared a second time, with the same
// signature, in the CB NAL section of this header.
421 void kibnal_fwd_packet (void *, kpr_fwd_desc_t *);
423 // global data structures
// Declarations only: every object below is defined in some other translation unit.
424 extern kibnal_data_t kibnal_data;
425 extern ptl_handle_ni_t kibnal_ni;
426 extern nal_t kibnal_api;
427 extern nal_cb_t kibnal_lib;
428 extern QP_info QP_list[];
// NOTE(review): CQ_list is declared with element type QP_info despite its
// name - confirm this is intended.
429 extern QP_info CQ_list[];
430 extern HCA_info Hca_data;
431 extern VAPI_hca_hndl_t Hca_hndl;
432 extern VAPI_pd_hndl_t Pd_hndl;
433 extern VAPI_hca_vendor_t Hca_vendor;
434 extern VAPI_hca_cap_t Hca_cap;
435 extern VAPI_hca_port_t Hca_port_1_props;
436 extern VAPI_hca_port_t Hca_port_2_props;
437 extern VAPI_hca_attr_t Hca_attr;
438 extern VAPI_hca_attr_mask_t Hca_attr_mask;
439 extern VAPI_cq_hndl_t Cq_SQ_hndl;
440 extern VAPI_cq_hndl_t Cq_RQ_hndl;
441 extern VAPI_cq_hndl_t Cq_hndl;
442 extern unsigned long User_Defined_Small_Msg_Size;
// NOTE(review): no declaration of the type Remote_QP_Info is visible in this
// view (only a struct tagged Remote_HCA_Info is) - confirm where it is declared.
443 extern Remote_QP_Info L_HCA_RDMA_Info;
444 extern Remote_QP_Info R_HCA_RDMA_Info;
445 extern unsigned int Num_posted_recv_buf;
446 extern int R_RDMA_DATA_ARRIVED;
447 extern Memory_buffer_info MRbuf_list[];
448 extern Memory_buffer_info MSbuf_list[];
449 extern Bucket_index Bucket[];
450 extern RDMA_Info_Exchange Rdma_info;
451 extern int Cts_Message_arrived;
452 extern RDMA_Info_Exchange Local_rdma_info;
453 extern spinlock_t MSB_mutex[];
457 // kernel NAL API function prototypes
458 int kibnal_forward(nal_t *,int ,void *,size_t ,void *,size_t );
459 void kibnal_lock(nal_t *, unsigned long *);
460 void kibnal_unlock(nal_t *, unsigned long *);
461 int kibnal_shutdown(nal_t *, int );
462 void kibnal_yield( nal_t * );
463 void kibnal_invalidate(nal_cb_t *,void *,size_t ,void *);
464 int kibnal_validate(nal_cb_t *,void *,size_t ,void **);
// NAL setup / teardown and HCA, QP and memory-region management
468 nal_t *kibnal_init(int , ptl_pt_index_t , ptl_ac_index_t , ptl_pid_t );
469 void __exit kibnal_finalize(void );
470 VAPI_ret_t create_qp(QP_info *, int );
471 VAPI_ret_t init_qp(QP_info *, int );
472 VAPI_ret_t IB_Open_HCA(kibnal_data_t *);
473 VAPI_ret_t IB_Close_HCA(void );
474 VAPI_ret_t createMemRegion(VAPI_hca_hndl_t, VAPI_pd_hndl_t);
475 VAPI_ret_t deleteMemRegion(QP_info *, int );
477 void ibnal_send_recv_self_testing(int *);
479 int __init kibnal_initialize(void);
483 /* CB NAL functions */
// NOTE(review): the prototypes of kibnal_send, kibnal_send_pages and kibnal_rx
// continue on lines that are not visible in this view.
484 int kibnal_send(nal_cb_t *,
495 int kibnal_send_pages(nal_cb_t *,
505 int kibnal_recv(nal_cb_t *, void *, lib_msg_t *,
506 unsigned int, struct iovec *, size_t, size_t);
507 int kibnal_recv_pages(nal_cb_t *, void *, lib_msg_t *,
508 unsigned int, ptl_kiov_t *, size_t, size_t);
509 int kibnal_read(nal_cb_t *,void *,void *,user_ptr ,size_t );
510 int kibnal_write(nal_cb_t *,void *,user_ptr ,void *,size_t );
511 int kibnal_callback(nal_cb_t * , void *, lib_eq_t *, ptl_event_t *);
512 void *kibnal_malloc(nal_cb_t *,size_t );
513 void kibnal_free(nal_cb_t *,void *,size_t );
// map/unmap come in two flavours: struct iovec based and ptl_kiov_t (pages) based
514 int kibnal_map(nal_cb_t *, unsigned int , struct iovec *, void **);
515 void kibnal_unmap(nal_cb_t *, unsigned int , struct iovec *, void **);
516 int kibnal_map_pages(nal_cb_t *, unsigned int , ptl_kiov_t *, void **);
517 void kibnal_unmap_pages(nal_cb_t * , unsigned int , ptl_kiov_t *, void **);
518 void kibnal_printf(nal_cb_t *, const char *, ...);
519 void kibnal_cli(nal_cb_t *,unsigned long *);
520 void kibnal_sti(nal_cb_t *,unsigned long *);
521 int kibnal_dist(nal_cb_t *,ptl_nid_t ,unsigned long *);
// NOTE(review): duplicate of the kibnal_fwd_packet prototype declared earlier
// in this header (same signature).
523 void kibnal_fwd_packet (void *, kpr_fwd_desc_t *);
524 void kibnal_rx(kibnal_data_t *,
530 int kibnal_end(kibnal_data_t *);
// event handlers: one receives a VAPI_event_record_t, the other a CQ handle
532 void async_event_handler(VAPI_hca_hndl_t , VAPI_event_record_t *,void *);
534 void CQE_event_handler(VAPI_hca_hndl_t ,VAPI_cq_hndl_t , void *);
// message send entry points; presumably (buffer, length) - confirm against the definitions
537 VAPI_ret_t Send_Small_Msg(char *, int );
538 VAPI_ret_t Send_Large_Msg(char *, int );
// receive-buffer posting, listener thread and RTS/CTS handshake
540 VAPI_ret_t repost_recv_buf(QP_info *, VAPI_wr_id_t );
541 int post_recv_bufs(VAPI_wr_id_t );
542 int server_listen_thread(void *);
543 VAPI_wr_id_t RTS_handshaking_protocol(int );
544 VAPI_wr_id_t CTS_handshaking_protocol(RDMA_Info_Exchange *);
// NOTE(review): the createMemRegion_RDMA prototype continues on lines that are
// not visible in this view.
546 VAPI_ret_t createMemRegion_RDMA(VAPI_hca_hndl_t ,
// NOTE(review): both setters below take HCA_info by value, not by pointer.
554 VAPI_ret_t IB_Set_Event_Handler(HCA_info , kibnal_data_t *);
556 VAPI_ret_t IB_Set_Async_Event_Handler(HCA_info ,kibnal_data_t *);
// buffer lookup helpers; each search returns a VAPI_wr_id_t
558 VAPI_wr_id_t find_available_buf(int );
559 VAPI_wr_id_t search_send_buf(int );
560 VAPI_wr_id_t find_filler_list(int ,int );
561 int insert_MRbuf_list(int );
564 #endif /* _IBNAL_H */