#ifndef _IBNAL_H #define _IBNAL_H #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define DEBUG_SUBSYSTEM S_IBNAL #include #include #include #include // Infiniband VAPI/EVAPI header files // Mellanox MT23108 VAPI #include #include #include #include // pick a port for this RDMA information exhange between two hosts #define HOST_PORT 11211 #define QUEUE_SIZE 1024 #define HCA_PORT_1 1 #define HCA_PORT_2 2 #define DEBUG_SUBSYSTEM S_IBNAL #define START_SEND_WRQ_ID 0 #define START_RECV_WRQ_ID 0 #define START_RDMA_WRQ_ID 0 #define DEFAULT_PRIORITY 100 #define WAIT_FOT_R_RDMA_TIMEOUT 10000 #define MAX_NUM_TRY 3000 #define MAX_NUM_POLL 300 #define MAX_LOOP_COUNT 500 #define MAX_GID 32 #define MCG_BUF_LENGTH 128 #define SHARED_SEGMENT_SIZE 0x10000 #define HCA_EXCHANGE_SHM_KEY 999 // shared memory key for HCA data exchange // some internals opcodes for IB operations used in IBNAL #define SEND_QP_INFO 0X00000001 #define RECV_QP_INFO 0X00000010 // Mellanox InfiniHost MT23108 // QP/CQ related information // #define MTU_256 1 /* 1-256,2-512,3-1024,4-2048 */ #define MTU_512 2 /* 1-256,2-512,3-1024,4-2048 */ #define MTU_1024 3 /* 1-256,2-512,3-1024,4-2048 */ #define MTU_2048 4 /* 1-256,2-512,3-1024,4-2048 */ // number of entries for each CQ and WQ // how much do we need ? #define NUM_CQE 1024 #define NUM_WQE 1024 #define MAX_OUT_SQ 64 #define MAX_OUT_RQ 64 #define NUM_MBUF 256 #define NUM_RDMA_RESERVED_ENTRY 128 #define NUM_QPS 256 #define INVALID_WR_ID ((VAPI_wr_id_t) -1) // for Vector IO // scatter and gather // Portals can support upto 64 IO-Vectors // how much do we need ? 
#define NUM_SGE                     1
#define NUM_SG                      1
#define NUM_CQ                      1

// Byte-size helpers.
// Every macro whose replacement list is an expression is wrapped in
// parentheses so that it keeps its value when it is expanded inside a
// larger expression (for example x / ONE_MB or 2 * End_8_kb).
#define ONE_KB                      1024
#define ONE_MB                      (1024 * ONE_KB)
#define ONE_GB                      (1024 * ONE_MB)

#define KB_4                        (1024 * 4)
#define KB_8                        (1024 * 8)
#define KB_16                       (1024 * 16)
#define KB_32                       (1024 * 32)
#define KB_64                       (1024 * 64)
#define KB_128                      (1024 * 128)
#define KB_256                      (1024 * 256)

// 256 entries in the registered buffer list
// small size message
#define Num_4_KB                    64
#define Num_8_KB                    64
#define Num_16_KB                   40
#define Num_32_KB                   40
#define Num_64_KB                   40
#define Num_128_KB                  4
#define Num_256_KB                  4

#define SMALL_MSG_SIZE              KB_32

#define MAX_MSG_SIZE                (ONE_MB * 512)

// 128 64KB buffers for send
// 128 64KB buffers for recv
// used in RDMA operation only
#define NUM_ENTRY                   128

// running totals of the Num_*_KB counts above: End_X_kb is the sum of all
// counts up to and including size class X
#define End_4_kb                    Num_4_KB
#define End_8_kb                    (End_4_kb + Num_8_KB)
#define End_16_kb                   (End_8_kb + Num_16_KB)
#define End_32_kb                   (End_16_kb + Num_32_KB)
#define End_64_kb                   (End_32_kb + Num_64_KB)
#define End_128_kb                  (End_64_kb + Num_128_KB)
#define End_256_kb                  (End_128_kb + Num_256_KB)

#define SEND_BUF_SIZE               KB_32
#define RECV_BUF_SIZE               SEND_BUF_SIZE

// #define POLL_BASED_CQE_HANDLING  1
#define EVENT_BASED_CQE_HANDLING    1

// IBNAL_SELF_TESTING is defined and then immediately undefined, so it is
// NOT defined for the rest of this header; drop the #undef group to turn
// it on
#define IBNAL_SELF_TESTING          1
#ifdef IBNAL_SELF_TESTING
#undef IBNAL_SELF_TESTING
#endif

#define MSG_SIZE_SMALL              1
#define MSG_SIZE_LARGE              2

// some default configuration values for early testing
#define DEFAULT_DLID                1           // default destination link ID
#define DEFAULT_QP_NUM              4           // default QP number
#define P_KEY                       0xFFFF      // do we need default value
#define PKEY_IX                     0x0         // do we need default value
#define Q_KEY                       0x012       // do we need default value
#define L_KEY                       0x12345678  // do we need default value
#define R_KEY                       0x87654321  // do we need default value
#define HCA_ID                      "InfiniHost0" // default

#define START_PSN                   0
#define START_SQ_PSN                0
#define START_RQ_PSN                0

// type alias, deliberately not parenthesized (it expands to a type name)
#define __u_long_long               unsigned long long

#define IBNAL_DEBUG                 1

#define USE_SHARED_MEMORY_AND_SOCKET 1

// operation type
#define TRY_SEND_ONLY               1

#define YES                         1
#define NO                          0

//
// a common data structure for IB QP's operation
// each QP is associated with an QP_info
structure // typedef struct QP_info { VAPI_hca_hndl_t hca_hndl; // HCA handle IB_port_t port; // port number VAPI_qp_hndl_t qp_hndl; // QP's handle list VAPI_qp_state_t qp_state; // QP's current state VAPI_pd_hndl_t pd_hndl; // protection domain VAPI_cq_hndl_t cq_hndl; // send-queue CQ's handle VAPI_cq_hndl_t sq_cq_hndl; // send-queue CQ's handle VAPI_cq_hndl_t rq_cq_hndl; // receive-queue CQ's handle VAPI_ud_av_hndl_t av_hndl; // receive-queue CQ's handle VAPI_qp_init_attr_t qp_init_attr; // QP's init attribute VAPI_qp_attr_t qp_attr; // QP's attribute - dlid VAPI_qp_prop_t qp_prop; // QP's propertities VAPI_hca_port_t hca_port; VAPI_qp_num_t qp_num; // QP's number VAPI_qp_num_t rqp_num; // remote QP's number IB_lid_t slid; IB_lid_t dlid; VAPI_gid_t src_gid; u_int32_t buf_size; VAPI_virt_addr_t buf_addr; char *bufptr; VAPI_mrw_t mr; VAPI_mr_hndl_t mr_hndl; VAPI_virt_addr_t raddr; VAPI_rkey_t rkey; VAPI_lkey_t lkey; VAPI_wr_id_t last_posted_send_id; // user defined work request ID VAPI_wr_id_t last_posted_rcv_id; // user defined work request ID VAPI_mw_hndl_t mw_hndl; // memory window handle VAPI_rkey_t mw_rkey; // memory window rkey VAPI_sg_lst_entry_t sg_lst[256]; // scatter and gather list int sg_list_sz; // set as NUM_SGE VAPI_wr_id_t wr_id; // spinlock_t snd_mutex; spinlock_t rcv_mutex; spinlock_t bl_mutex; spinlock_t cln_mutex; int cur_RDMA_outstanding; int cur_send_outstanding; int cur_posted_rcv_bufs; int snd_rcv_balance; } QP_info; // buffer status #define BUF_REGISTERED 0x10000000 #define BUF_INUSE 0x01000000 #define BUF_UNREGISTERED 0x00100000 // buffer type #define REG_BUF 0x10000000 #define RDMA_BUF 0x01000000 // // IMM data // #define IMM_000 (0 << 32); #define IMM_001 (1 << 32); #define IMM_002 (2 << 32); #define IMM_003 (3 << 32); #define IMM_004 (4 << 32); #define IMM_005 (5 << 32); #define IMM_006 (6 << 32); #define IMM_007 (7 << 32); #define IMM_008 (8 << 32); #define IMM_009 (9 << 32); #define IMM_010 (10 << 32); #define IMM_011 (11 << 32); 
#define IMM_012 (12 << 32); #define IMM_013 (13 << 32); #define IMM_014 (14 << 32); #define IMM_015 (15 << 32); #define IMM_016 (16 << 32); #define IMM_017 (17 << 32); #define IMM_018 (18 << 32); #define IMM_019 (19 << 32); #define IMM_020 (20 << 32); #define IMM_021 (21 << 32); #define IMM_022 (22 << 32); #define IMM_023 (23 << 32); #define IMM_024 (24 << 32); #define IMM_025 (25 << 32); #define IMM_026 (26 << 32); #define IMM_027 (27 << 32); #define IMM_028 (28 << 32); #define IMM_029 (29 << 32); #define IMM_030 (30 << 32); #define IMM_031 (31 << 32); typedef struct Memory_buffer_info{ u_int32_t buf_size; VAPI_virt_addr_t buf_addr; char *bufptr; VAPI_mrw_t mr; VAPI_mr_hndl_t mr_hndl; int status; int ref_count; int buf_type; VAPI_virt_addr_t raddr; VAPI_rkey_t rkey; VAPI_lkey_t lkey; } Memory_buffer_info; typedef struct RDMA_Info_Exchange { int opcode; int buf_length; VAPI_mrw_t recv_rdma_mr; VAPI_mr_hndl_t recv_rdma_mr_hndl; VAPI_mrw_t send_rdma_mr; VAPI_mr_hndl_t send_rdma_mr_hndl; VAPI_virt_addr_t raddr; VAPI_rkey_t rkey; int flag; } RDMA_Info_Exchange; // opcode for Rdma info exchange RTS/CTS #define Ready_To_send 0x10000000 #define Clear_To_send 0x01000000 #define RDMA_RTS_ID 5555 #define RDMA_CTS_ID 7777 #define RDMA_OP_ID 9999 #define SEND_RECV_TEST_ID 2222 #define SEND_RECV_TEST_BUF_ID 0 #define TEST_SEND_MESSAGE 0x00000001 #define TEST_RECV_MESSAGE 0x00000002 #define RTS_CTS_TIMEOUT 50 #define RECEIVING_THREAD_TIMEOUT 50 #define WAIT_FOR_SEND_BUF_TIMEOUT 50 #define IBNAL_DEBUG_LEVEL_1 0XFFFFFFFF #define IBNAL_DEBUG_LEVEL_2 D_PORTALS | D_NET | D_WARNING | D_MALLOC | \ D_ERROR | D_OTHER | D_TRACE | D_INFO // flag for Rdma info exhange #define RDMA_BUFFER_RESERVED 0x10000000 #define RDMA_BUFFER_UNAVAILABLE 0x01000000 // receiving data structure typedef struct { ptl_hdr_t *krx_buffer; // pointer to receiving buffer unsigned long krx_len; // length of buffer unsigned int krx_size; // unsigned int krx_priority; // do we need this struct list_head krx_item; } 
kibnal_rx_t; // transmitting data structure typedef struct { nal_cb_t *ktx_nal; void *ktx_private; lib_msg_t *ktx_cookie; char *ktx_buffer; size_t ktx_len; unsigned long ktx_size; int ktx_ndx; unsigned int ktx_priority; unsigned int ktx_tgt_node; unsigned int ktx_tgt_port_id; } kibnal_tx_t; typedef struct { char kib_init; char kib_shuttingdown; IB_port_t port_num; // IB port information struct list_head kib_list; ptl_nid_t kib_nid; nal_t *kib_nal; nal_cb_t *kib_cb; struct kib_trans *kib_trans; // do I need this struct tq_struct kib_ready_tq; spinlock_t kib_dispatch_lock; } kibnal_data_t; // // A data structure for keeping the HCA information in system // information related to HCA and hca_handle will be kept here // typedef struct HCA_Info { VAPI_hca_hndl_t hca_hndl; // HCA handle VAPI_pd_hndl_t pd_hndl; // protection domain IB_port_t port; // port number int num_qp; // number of qp used QP_info *qp_ptr[NUM_QPS]; // point to QP_list int num_cq; // number of cq used VAPI_cq_hndl_t cq_hndl; VAPI_cq_hndl_t sq_cq_hndl; VAPI_cq_hndl_t rq_cq_hndl; IB_lid_t dlid; IB_lid_t slid; kibnal_data_t *kib_data; // for PORTALS operations } HCA_info; // Remote HCA Info information typedef struct Remote_HCA_Info { unsigned long opcode; unsigned long length; IB_lid_t dlid[NUM_QPS]; VAPI_qp_num_t rqp_num[NUM_QPS]; } Remote_QP_Info; typedef struct Bucket_index{ int start; int end; } Bucket_index; // functional prototypes // infiniband initialization int kib_init(kibnal_data_t *); // receiving thread void kibnal_recv_thread(HCA_info *); void recv_thread(HCA_info *); // forward data packet void kibnal_fwd_packet (void *, kpr_fwd_desc_t *); // global data structures extern kibnal_data_t kibnal_data; extern ptl_handle_ni_t kibnal_ni; extern nal_t kibnal_api; extern nal_cb_t kibnal_lib; extern QP_info QP_list[]; extern QP_info CQ_list[]; extern HCA_info Hca_data; extern VAPI_hca_hndl_t Hca_hndl; extern VAPI_pd_hndl_t Pd_hndl; extern VAPI_hca_vendor_t Hca_vendor; extern VAPI_hca_cap_t 
Hca_cap; extern VAPI_hca_port_t Hca_port_1_props; extern VAPI_hca_port_t Hca_port_2_props; extern VAPI_hca_attr_t Hca_attr; extern VAPI_hca_attr_mask_t Hca_attr_mask; extern VAPI_cq_hndl_t Cq_SQ_hndl; extern VAPI_cq_hndl_t Cq_RQ_hndl; extern VAPI_cq_hndl_t Cq_hndl; extern unsigned long User_Defined_Small_Msg_Size; extern Remote_QP_Info L_HCA_RDMA_Info; extern Remote_QP_Info R_HCA_RDMA_Info; extern unsigned int Num_posted_recv_buf; extern int R_RDMA_DATA_ARRIVED; extern Memory_buffer_info MRbuf_list[]; extern Memory_buffer_info MSbuf_list[]; extern Bucket_index Bucket[]; extern RDMA_Info_Exchange Rdma_info; extern int Cts_Message_arrived; extern RDMA_Info_Exchange Local_rdma_info; extern spinlock_t MSB_mutex[]; // kernel NAL API function prototype int kibnal_forward(nal_t *,int ,void *,size_t ,void *,size_t ); void kibnal_lock(nal_t *, unsigned long *); void kibnal_unlock(nal_t *, unsigned long *); int kibnal_shutdown(nal_t *, int ); void kibnal_yield( nal_t * ); void kibnal_invalidate(nal_cb_t *,void *,size_t ,void *); int kibnal_validate(nal_cb_t *,void *,size_t ,void **); nal_t *kibnal_init(int , ptl_pt_index_t , ptl_ac_index_t , ptl_pid_t ); void __exit kibnal_finalize(void ); VAPI_ret_t create_qp(QP_info *, int ); VAPI_ret_t init_qp(QP_info *, int ); VAPI_ret_t IB_Open_HCA(kibnal_data_t *); VAPI_ret_t IB_Close_HCA(void ); VAPI_ret_t createMemRegion(VAPI_hca_hndl_t, VAPI_pd_hndl_t); VAPI_ret_t deleteMemRegion(QP_info *, int ); void ibnal_send_recv_self_testing(int *); int __init kibnal_initialize(void); /* CB NAL functions */ int kibnal_send(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *, int, ptl_nid_t, ptl_pid_t, unsigned int, ptl_kiov_t *, size_t); int kibnal_send_pages(nal_cb_t *, void *, lib_msg_t *, ptl_hdr_t *, int, ptl_nid_t, ptl_pid_t, unsigned int, ptl_kiov_t *, size_t); int kibnal_recv(nal_cb_t *, void *, lib_msg_t *, unsigned int, struct iovec *, size_t, size_t); int kibnal_recv_pages(nal_cb_t *, void *, lib_msg_t *, unsigned int, ptl_kiov_t *, size_t, 
size_t); int kibnal_read(nal_cb_t *,void *,void *,user_ptr ,size_t ); int kibnal_write(nal_cb_t *,void *,user_ptr ,void *,size_t ); int kibnal_callback(nal_cb_t * , void *, lib_eq_t *, ptl_event_t *); void *kibnal_malloc(nal_cb_t *,size_t ); void kibnal_free(nal_cb_t *,void *,size_t ); int kibnal_map(nal_cb_t *, unsigned int , struct iovec *, void **); void kibnal_unmap(nal_cb_t *, unsigned int , struct iovec *, void **); int kibnal_map_pages(nal_cb_t *, unsigned int , ptl_kiov_t *, void **); void kibnal_unmap_pages(nal_cb_t * , unsigned int , ptl_kiov_t *, void **); void kibnal_printf(nal_cb_t *, const char *, ...); void kibnal_cli(nal_cb_t *,unsigned long *); void kibnal_sti(nal_cb_t *,unsigned long *); int kibnal_dist(nal_cb_t *,ptl_nid_t ,unsigned long *); void kibnal_fwd_packet (void *, kpr_fwd_desc_t *); void kibnal_rx(kibnal_data_t *, VAPI_virt_addr_t , u_int32_t, u_int32_t, unsigned int); int kibnal_end(kibnal_data_t *); void async_event_handler(VAPI_hca_hndl_t , VAPI_event_record_t *,void *); void CQE_event_handler(VAPI_hca_hndl_t ,VAPI_cq_hndl_t , void *); VAPI_ret_t Send_Small_Msg(char *, int ); VAPI_ret_t Send_Large_Msg(char *, int ); VAPI_ret_t repost_recv_buf(QP_info *, VAPI_wr_id_t ); int post_recv_bufs(VAPI_wr_id_t ); int server_listen_thread(void *); VAPI_wr_id_t RTS_handshaking_protocol(int ); VAPI_wr_id_t CTS_handshaking_protocol(RDMA_Info_Exchange *); VAPI_ret_t createMemRegion_RDMA(VAPI_hca_hndl_t , VAPI_pd_hndl_t , char *, int , VAPI_mr_hndl_t *, VAPI_mrw_t *); VAPI_ret_t IB_Set_Event_Handler(HCA_info , kibnal_data_t *); VAPI_ret_t IB_Set_Async_Event_Handler(HCA_info ,kibnal_data_t *); VAPI_wr_id_t find_available_buf(int ); VAPI_wr_id_t search_send_buf(int ); VAPI_wr_id_t find_filler_list(int ,int ); int insert_MRbuf_list(int ); #endif /* _IBNAL_H */