1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Based on ksocknal, qswnal, and gmnal
6 * Copyright (C) 2003 LANL
7 * Author: HB Chen <hbchen@lanl.gov>
8 * Los Alamos National Lab
10 * Portals is free software; you can redistribute it and/or
11 * modify it under the terms of version 2 of the GNU General Public
12 * License as published by the Free Software Foundation.
14 * Portals is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with Portals; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
/*
 * File-scope state for the IB (VAPI) NAL: the portal NI handle, per-buffer
 * spinlocks, the NAL private data, and the registered send/recv buffer
 * tables (NUM_MBUF entries each), plus one HCA's handles and QP list.
 *
 * NOTE(review): this is an extraction-damaged fragment — the leading numeral
 * on each line is the original file's line number and lines are missing
 * between jumps in that numbering.  Code is preserved verbatim.
 */
27 // portal handle ID for this IB-NAL
28 ptl_handle_ni_t kibnal_ni;
30 // message send buffer mutex
31 spinlock_t MSBuf_mutex[NUM_MBUF];
// NOTE(review): later code (IB_Open_HCA, k_recv_thread) locks "MSB_mutex[i]",
// not "MSBuf_mutex" — one of the two names is almost certainly a typo; confirm
// which spelling the rest of the module declares.
33 // message recv buffer mutex
34 spinlock_t MRBuf_mutex[NUM_MBUF];
36 // IB-NAL API information
40 kibnal_data_t kibnal_data;
// Protection-domain handle and receive-buffer counter shared by the HCA setup
// and data-path code below.
43 VAPI_pd_hndl_t Pd_hndl;
44 unsigned int Num_posted_recv_buf;
46 // registered send buffer list
47 Memory_buffer_info MSbuf_list[NUM_MBUF];
49 // registered recv buffer list
50 Memory_buffer_info MRbuf_list[NUM_MBUF];
// Router interface registration (initializer is truncated in this fragment).
54 // currently there is no need fo IBA
56 kpr_nal_interface_t kibnal_router_interface = {
58 kprni_arg: &kibnal_data,
59 kprni_fwd: kibnal_fwd_packet, // forward data to router
60 // is router invloving the
// One QP per peer/channel; fields are filled in by IB_Open_HCA()/create_qp().
66 QP_info QP_list[NUM_QPS];
68 // information associated with a HCA
71 // something about HCA
72 VAPI_hca_hndl_t Hca_hndl; // assume we only use one HCA now
73 VAPI_hca_vendor_t Hca_vendor;
74 VAPI_hca_cap_t Hca_cap;
75 VAPI_hca_port_t Hca_port_1_props;
76 VAPI_hca_port_t Hca_port_2_props;
77 VAPI_hca_attr_t Hca_attr;
78 VAPI_hca_attr_mask_t Hca_attr_mask;
// Send and receive queues share a single CQ (see IB_Open_HCA, which aliases
// Cq_SQ_hndl and Cq_RQ_hndl to Cq_hndl).
79 VAPI_cq_hndl_t Cq_RQ_hndl; // CQ's handle
80 VAPI_cq_hndl_t Cq_SQ_hndl; // CQ's handle
81 VAPI_cq_hndl_t Cq_hndl; // CQ's handle
// Local and remote QP endpoint info exchanged with the peer at startup.
82 Remote_QP_Info L_QP_data;
83 Remote_QP_Info R_QP_data;
/*
 * kibnal_forward - dispatch an API-side request into the NAL library.
 *
 * Validates that this NAL instance is the module's singleton, then hands the
 * request to lib_dispatch().  Always returns PTL_OK.
 *
 * NOTE(review): the parameter list is truncated in this fragment; the body
 * uses `id`, `args` and `ret`, which are presumably further parameters of
 * this function — confirm against the complete source.
 */
90 kibnal_forward(nal_t *nal,
97 kibnal_data_t *knal_data = nal->nal_data;
98 nal_cb_t *nal_cb = knal_data->kib_cb;
// Sanity: this module supports exactly one NAL instance.
101 LASSERT (nal == &kibnal_api);
102 LASSERT (knal_data == &kibnal_data);
103 LASSERT (nal_cb == &kibnal_lib);
105 // dispatch forward API function
107 CDEBUG(D_NET,"kibnal_forward: function id = %d\n", id);
109 lib_dispatch(nal_cb, knal_data, id, args, ret);
111 CDEBUG(D_TRACE,"IBNAL- Done kibnal_forward\n");
113 return PTL_OK; // always return PTL_OK
/*
 * kibnal_lock - enter the NAL critical section.
 *
 * Delegates to the library callback cb_cli() ("client lock"), which saves
 * interrupt state into *flags; paired with kibnal_unlock() below.
 */
120 kibnal_lock(nal_t *nal, unsigned long *flags)
122 kibnal_data_t *knal_data = nal->nal_data;
123 nal_cb_t *nal_cb = knal_data->kib_cb;
// Sanity: singleton NAL instance only.
126 LASSERT (nal == &kibnal_api);
127 LASSERT (knal_data == &kibnal_data);
128 LASSERT (nal_cb == &kibnal_lib);
130 // disable logical interrrupt
131 nal_cb->cb_cli(nal_cb,flags);
133 CDEBUG(D_TRACE,"IBNAL-Done kibnal_lock\n");
/*
 * kibnal_unlock - leave the NAL critical section.
 *
 * Delegates to cb_sti() ("set interrupts"), restoring the state saved in
 * *flags by kibnal_lock().
 */
141 kibnal_unlock(nal_t *nal, unsigned long *flags)
143 kibnal_data_t *k = nal->nal_data;
144 nal_cb_t *nal_cb = k->kib_cb;
// Sanity: singleton NAL instance only.
147 LASSERT (nal == &kibnal_api);
148 LASSERT (k == &kibnal_data);
149 LASSERT (nal_cb == &kibnal_lib);
151 // enable logical interrupt
152 nal_cb->cb_sti(nal_cb,flags);
// NOTE(review): trace message lacks a trailing '\n' unlike its siblings.
154 CDEBUG(D_TRACE,"IBNAL-Done kibnal_unlock");
160 // showdown this network interface
/*
 * kibnal_shutdown - take down this IB network interface.
 *
 * Closes the HCA via IB_Close_HCA(); failure is logged but (in the visible
 * code) not propagated to the caller.  The `ni` parameter is unused here.
 */
163 kibnal_shutdown(nal_t *nal, int ni)
166 kibnal_data_t *k = nal->nal_data;
167 nal_cb_t *nal_cb = k->kib_cb;
// Sanity: singleton NAL instance only.
170 LASSERT (nal == &kibnal_api);
171 LASSERT (k == &kibnal_data);
172 LASSERT (nal_cb == &kibnal_lib);
174 // take down this IB network interface
175 // there is not corresponding cb function to hande this
176 // do we actually need this one
177 // reference to IB network interface shutdown
180 vstat = IB_Close_HCA();
182 if (vstat != VAPI_OK) {
183 CERROR("Failed to close HCA - %s\n",VAPI_strerror(vstat));
187 CDEBUG(D_TRACE,"IBNAL- Done kibnal_shutdown\n");
194 // when do we call this yield function
/*
 * kibnal_yield - yield the CPU if the scheduler wants it.
 *
 * Blocking (milliseconds != 0) yields are not implemented — only an error is
 * logged.  Otherwise the visible code checks current->need_resched
 * (pre-2.6-kernel style) and presumably calls schedule() on the missing line.
 *
 * NOTE(review): "yeild" in the CERROR string is a typo in a runtime string;
 * fix it in the full source (not altered here — comments-only pass).
 */
197 kibnal_yield( nal_t *nal, unsigned long *flags, int milliseconds )
199 kibnal_data_t *k = nal->nal_data;
200 nal_cb_t *nal_cb = k->kib_cb;
// Sanity: singleton NAL instance only.
203 LASSERT (nal == &kibnal_api);
204 LASSERT (k == &kibnal_data);
205 LASSERT (nal_cb == &kibnal_lib);
207 if (milliseconds != 0) {
208 CERROR("Blocking yeild not implemented yet\n");
212 // check under what condition that we need to
214 // who set this need_resched
215 if (current->need_resched)
// NOTE(review): trace message lacks a trailing '\n' unlike its siblings.
218 CDEBUG(D_TRACE,"IBNAL-Done kibnal_yield");
/*
 * kibnal_init - initialize the NAL library layer for this interface.
 *
 * Calls lib_init() with the requested pid and the globally configured nid
 * (kibnal_data.kib_nid); logs and reports failure.
 *
 * NOTE(review): the debug format strings are inconsistent with their
 * arguments — "0x%u" treats a value as unsigned while printing it with a hex
 * prefix, and "0x%x%x" (line 258) passes two conversions but only one
 * argument, which is undefined behavior in printk-style formatting.  Confirm
 * kib_nid's width (likely 64-bit) and use the matching specifier in the full
 * source.
 */
227 kibnal_init(int interface, // no use here
228 ptl_pt_index_t ptl_size,
229 ptl_ac_index_t ac_size,
230 ptl_pid_t requested_pid // no use here
234 nal_cb_t *nal_cb = NULL;
235 kibnal_data_t *nal_data = NULL;
238 CDEBUG(D_NET, "kibnal_init:calling lib_init with nid 0x%u\n",
239 kibnal_data.kib_nid);
242 CDEBUG(D_NET, "kibnal_init: interface [%d], ptl_size [%d], ac_size[%d]\n",
243 interface, ptl_size, ac_size);
244 CDEBUG(D_NET, "kibnal_init: &kibnal_lib 0x%X\n", &kibnal_lib);
245 CDEBUG(D_NET, "kibnal_init: kibnal_data.kib_nid %d\n", kibnal_data.kib_nid);
247 rc = lib_init(&kibnal_lib,
249 requested_pid , // process id is set as requested_pid instead of 0
254 CERROR("kibnal_init: Failed lib_init with nid 0x%u, rc=%d\n",
255 kibnal_data.kib_nid,rc);
258 CDEBUG(D_NET,"kibnal_init: DONE lib_init with nid 0x%x%x\n",
259 kibnal_data.kib_nid);
268 // called before remove ibnal kernel module
/*
 * kibnal_finalize - module-unload cleanup.
 *
 * Unregisters the inter-module NI handle, shuts down the Portals NI and the
 * NAL library, then walks kib_list freeing every queued receive descriptor
 * (kibnal_rx_t).  Finally reports the remaining portals memory count.
 */
271 kibnal_finalize(void)
273 struct list_head *tmp;
275 inter_module_unregister("kibnal_ni");
277 // release resources allocated to this Infiniband network interface
278 PtlNIFini(kibnal_ni);
280 lib_fini(&kibnal_lib);
284 // how much do we need to do here?
// NOTE(review): list_for_each() with list_del() inside normally requires
// list_for_each_safe(); the deletion here advances past a freed node's links.
// Confirm against the full source.
285 list_for_each(tmp, &kibnal_data.kib_list) {
287 conn = list_entry(tmp, kibnal_rx_t, krx_item);
288 CDEBUG(D_IOCTL, "freeing conn %p\n",conn);
290 list_del(&conn->krx_item);
291 PORTAL_FREE(conn, sizeof(*conn));
294 CDEBUG(D_MALLOC,"done kmem %d\n",atomic_read(&portal_kmemory));
295 CDEBUG(D_TRACE,"IBNAL-Done kibnal_finalize\n");
302 // * k_server_thread is a kernel thread
303 // use a shared memory ro exchange HCA's data with a pthread in user
305 // * will be replaced when CM is used to handle communication management
/*
 * k_server_thread - exchange local/remote QP info with a user-space agent
 * via System V shared memory.
 *
 * Creates (sys_shmget) and attaches (sys_shmat) a segment keyed by
 * HCA_EXCHANGE_SHM_KEY, copies the local HCA/QP data in, then polls until
 * the user-space side rewrites opcode to RECV_QP_INFO, at which point the
 * remote side's dlid/rqp_num arrays are copied back into *hca_data.
 *
 * NOTE(review): the polling loop busy-waits with schedule_timeout(1000) but
 * (in the visible code) never sets the task state first, so the timeout may
 * not actually sleep; also there is no exit path if the agent never answers.
 * Interim mechanism — to be replaced by CM-based connection management.
 */
308 void k_server_thread(Remote_QP_Info *hca_data)
311 const int shared_segment_size = sizeof(Remote_QP_Info);
312 key_t key = HCA_EXCHANGE_SHM_KEY;
314 int exchanged_done = NO;
317 Remote_QP_Info *exchange_hca_data;
323 // create a shared memory with pre-agreement key
324 segment_id = sys_shmget(key,
329 // attached to shared memoru
330 // raddr is pointed to an user address space
331 // use this address to update shared menory content
332 ret = sys_shmat(segment_id, 0 , SHM_RND, &raddr);
336 CDEBUG(D_NET,"k_server_thread: Shared memory attach success ret = 0X%d,&raddr"
337 " 0X%x (*(&raddr))=0x%x \n", ret, &raddr, (*(&raddr)));
338 printk("k_server_thread: Shared memory attach success ret = 0X%d, &raddr"
339 " 0X%x (*(&raddr))=0x%x \n", ret, &raddr, (*(&raddr)));
342 CERROR("k_server_thread: Shared memory attach failed ret = 0x%d \n", ret);
343 printk("k_server_thread: Shared memory attach failed ret = 0x%d \n", ret);
// The shmat'ed address is in user address space; reinterpret it as the
// exchange record.
349 uaddr = *n; // get the U-address
350 /* cast uaddr to exchange_hca_data */
351 exchange_hca_data = (Remote_QP_Info *) uaddr;
353 /* copy data from local HCA to shared memory */
354 exchange_hca_data->opcode = hca_data->opcode;
355 exchange_hca_data->length = hca_data->length;
357 for(i=0; i < NUM_QPS; i++) {
358 exchange_hca_data->dlid[i] = hca_data->dlid[i];
359 exchange_hca_data->rqp_num[i] = hca_data->rqp_num[i];
362 // periodically check shared memory until get updated
363 // remote HCA's data from user mode pthread
364 while(exchanged_done == NO) {
365 if(exchange_hca_data->opcode == RECV_QP_INFO){
366 exchanged_done = YES;
367 /* copy data to local buffer from shared memory */
368 hca_data->opcode = exchange_hca_data->opcode;
369 hca_data->length = exchange_hca_data->length;
371 for(i=0; i < NUM_QPS; i++) {
372 hca_data->dlid[i] = exchange_hca_data->dlid[i];
373 hca_data->rqp_num[i] = exchange_hca_data->rqp_num[i];
378 schedule_timeout(1000);
382 // detached shared memory
385 CDEBUG(D_NET, "Exit from kernel thread: k_server_thread \n");
386 printk("Exit from kernel thread: k_server_thread \n");
/*
 * create_qp - create one reliable-connection (RC) QP and initialize its
 * bookkeeping in *qp.
 *
 * Fills VAPI_qp_init_attr_t from the qp's PD/CQ handles (SQ and RQ share a
 * CQ, signalled on every WR so the receive thread can poll completions),
 * calls VAPI_create_qp(), and records the resulting qp number plus zeroed
 * counters and per-QP spinlocks.
 */
396 create_qp(QP_info *qp, int qp_index)
400 VAPI_qp_init_attr_t qp_init_attr;
401 VAPI_qp_prop_t qp_prop;
// Bind this QP to the (single) HCA and to port 1's LID.
403 qp->hca_hndl = Hca_hndl;
404 qp->port = 1; // default
405 qp->slid = Hca_port_1_props.lid;
406 qp->hca_port = Hca_port_1_props;
409 /* Queue Pair Creation Attributes */
410 qp_init_attr.cap.max_oust_wr_rq = NUM_WQE;
411 qp_init_attr.cap.max_oust_wr_sq = NUM_WQE;
412 qp_init_attr.cap.max_sg_size_rq = NUM_SG;
413 qp_init_attr.cap.max_sg_size_sq = NUM_SG;
414 qp_init_attr.pd_hndl = qp->pd_hndl;
415 qp_init_attr.rdd_hndl = 0;
416 qp_init_attr.rq_cq_hndl = qp->rq_cq_hndl;
417 /* we use here polling */
418 //qp_init_attr.rq_sig_type = VAPI_SIGNAL_REQ_WR;
419 qp_init_attr.rq_sig_type = VAPI_SIGNAL_ALL_WR;
420 qp_init_attr.sq_cq_hndl = qp->sq_cq_hndl;
421 /* we use here polling */
422 //qp_init_attr.sq_sig_type = VAPI_SIGNAL_REQ_WR;
423 qp_init_attr.sq_sig_type = VAPI_SIGNAL_ALL_WR;
424 // transport servce - reliable connection
426 qp_init_attr.ts_type = VAPI_TS_RC;
428 vstat = VAPI_create_qp(qp->hca_hndl,
430 &qp->qp_hndl, &qp_prop);
432 if (vstat != VAPI_OK) {
433 CERROR("Failed creating QP. Return Failed - %s\n",VAPI_strerror(vstat));
// Record the HCA-assigned QP number and reset per-QP accounting.
437 qp->qp_num = qp_prop.qp_num; // the qp number
438 qp->last_posted_send_id = 0; // user defined work request ID
439 qp->last_posted_rcv_id = 0; // user defined work request ID
440 qp->cur_send_outstanding = 0;
441 qp->cur_posted_rcv_bufs = 0;
442 qp->snd_rcv_balance = 0;
444 CDEBUG(D_OTHER, "create_qp: qp_num = %d, slid = %d, qp_hndl = 0X%X",
445 qp->qp_num, qp->slid, qp->qp_hndl);
447 // initialize spin-lock mutex variables
448 spin_lock_init(&(qp->snd_mutex));
449 spin_lock_init(&(qp->rcv_mutex));
450 spin_lock_init(&(qp->bl_mutex));
451 spin_lock_init(&(qp->cln_mutex));
// NOTE(review): the three counters below were already zeroed at lines
// 440-442 above — the re-initialization is harmless but redundant.
452 // number of outstanding requests on the send Q
453 qp->cur_send_outstanding = 0;
454 // number of posted receive buffers
455 qp->cur_posted_rcv_bufs = 0;
456 qp->snd_rcv_balance = 0;
463 // initialize a UD qp state to RTR and RTS
/*
 * init_qp_UD - drive an unreliable-datagram QP through the standard state
 * machine RST -> INIT -> RTR -> RTS.
 *
 * Each transition builds a fresh attribute mask (QP_ATTR_MASK_CLR_ALL then
 * QP_ATTR_MASK_SET per field), calls VAPI_modify_qp(), and follows up with a
 * VAPI_query_qp() sanity read.  Errors are logged via CERROR; the visible
 * lines do not show what is returned on failure.
 */
466 init_qp_UD(QP_info *qp, int qp_index)
468 VAPI_qp_attr_t qp_attr;
469 VAPI_qp_init_attr_t qp_init_attr;
470 VAPI_qp_attr_mask_t qp_attr_mask;
471 VAPI_qp_cap_t qp_cap;
474 /* Move from RST to INIT */
475 /* Change QP to INIT */
477 CDEBUG(D_OTHER, "Changing QP state to INIT qp-index = %d\n", qp_index);
479 QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
481 qp_attr.qp_state = VAPI_INIT;
482 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
484 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
// pkey index, port and qkey are required for the RST->INIT transition of a
// UD QP (the value assignments are on lines missing from this fragment).
487 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PKEY_IX);
489 CDEBUG(D_OTHER, "pkey_ix qp_attr_mask = 0X%x\n", qp_attr_mask);
491 qp_attr.port = qp->port;
492 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PORT);
494 CDEBUG(D_OTHER, "port qp_attr_mask = 0X%x\n", qp_attr_mask);
497 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QKEY);
499 CDEBUG(D_OTHER, "qkey qp_attr_mask = 0X%x\n", qp_attr_mask);
501 /* If I do not set this mask, I get an error from HH. QPM should catch it */
503 vstat = VAPI_modify_qp(qp->hca_hndl,
509 if (vstat != VAPI_OK) {
510 CERROR("Failed modifying QP from RST to INIT. %s\n",VAPI_strerror(vstat));
514 CDEBUG(D_OTHER, "Modifying QP from RST to INIT.\n");
// Read back the QP state as a sanity check.
516 vstat= VAPI_query_qp(qp->hca_hndl,
522 if (vstat != VAPI_OK) {
523 CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
527 /* Move from INIT to RTR */
528 /* Change QP to RTR */
529 CDEBUG(D_OTHER, "Changing QP state to RTR\n");
531 QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
533 qp_attr.qp_state = VAPI_RTR;
534 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
536 CDEBUG(D_OTHER, "INIT to RTR- qp_state : qp_attr_mask = 0X%x\n", qp_attr_mask);
538 vstat = VAPI_modify_qp(qp->hca_hndl,
544 if (vstat != VAPI_OK) {
545 CERROR("Failed modifying QP from INIT to RTR. %s\n",VAPI_strerror(vstat));
549 CDEBUG(D_OTHER, "Modifying QP from INIT to RTR.\n");
551 vstat= VAPI_query_qp(qp->hca_hndl,
557 if (vstat != VAPI_OK) {
558 CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
562 /* RTR to RTS - Change QP to RTS */
563 CDEBUG(D_OTHER, "Changing QP state to RTS\n");
565 QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
567 qp_attr.qp_state = VAPI_RTS;
568 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
// Initial send PSN for the RTS transition.
570 qp_attr.sq_psn = START_SQ_PSN;
571 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_SQ_PSN);
573 vstat = VAPI_modify_qp(qp->hca_hndl,
579 if (vstat != VAPI_OK) {
580 CERROR("Failed modifying QP from RTR to RTS. %s:%s\n",
581 VAPI_strerror_sym(vstat),
582 VAPI_strerror(vstat));
586 CDEBUG(D_OTHER, "Modifying QP from RTR to RTS. \n");
588 vstat= VAPI_query_qp(qp->hca_hndl,
594 if (vstat != VAPI_OK) {
595 CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
600 // a QP is at RTS state NOW
603 CDEBUG(D_OTHER, "IBNAL- UD qp is at RTS NOW\n");
612 // initialize a RC qp state to RTR and RTS
613 // RC transport service
/*
 * init_qp_RC - drive a reliable-connection QP through RST -> INIT -> RTR ->
 * RTS.
 *
 * Same masked-attribute pattern as init_qp_UD(), but the RC transitions
 * additionally require: remote-access flags (INIT); address vector, path
 * MTU, receive PSN, responder resources, min RNR timer and the peer's QP
 * number (RTR, taken from qp->dlid / qp->rqp_num exchanged earlier); and
 * timeout/retry/RNR-retry/initiator-depth settings (RTS).  Every modify is
 * followed by a VAPI_query_qp() sanity read.
 */
616 init_qp_RC(QP_info *qp, int qp_index)
618 VAPI_qp_attr_t qp_attr;
619 VAPI_qp_init_attr_t qp_init_attr;
620 VAPI_qp_attr_mask_t qp_attr_mask;
621 VAPI_qp_cap_t qp_cap;
624 /* Move from RST to INIT */
625 /* Change QP to INIT */
627 CDEBUG(D_OTHER, "Changing QP state to INIT qp-index = %d\n", qp_index);
629 QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
631 qp_attr.qp_state = VAPI_INIT;
632 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
634 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
637 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PKEY_IX);
639 CDEBUG(D_OTHER, "pkey_ix qp_attr_mask = 0X%x\n", qp_attr_mask);
641 qp_attr.port = qp->port;
642 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PORT);
644 CDEBUG(D_OTHER, "port qp_attr_mask = 0X%x\n", qp_attr_mask);
// Allow the remote peer RDMA write/read on this QP.
646 qp_attr.remote_atomic_flags = VAPI_EN_REM_WRITE | VAPI_EN_REM_READ;
647 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_REMOTE_ATOMIC_FLAGS);
649 CDEBUG(D_OTHER, "remote_atomic_flags qp_attr_mask = 0X%x\n", qp_attr_mask);
651 /* If I do not set this mask, I get an error from HH. QPM should catch it */
653 vstat = VAPI_modify_qp(qp->hca_hndl,
659 if (vstat != VAPI_OK) {
660 CERROR("Failed modifying QP from RST to INIT. %s\n",VAPI_strerror(vstat));
664 vstat= VAPI_query_qp(qp->hca_hndl,
670 if (vstat != VAPI_OK) {
671 CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
675 /* Move from INIT to RTR */
676 /* Change QP to RTR */
677 CDEBUG(D_OTHER, "Changing QP state to RTR qp_indexi %d\n", qp_index);
679 QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
680 qp_attr.qp_state = VAPI_RTR;
682 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
684 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
// Address vector for the peer: LID-routed (no GRH), dlid obtained from the
// startup QP-info exchange.
686 qp_attr.av.sl = 0;/* RESPONDER_SL */
687 qp_attr.av.grh_flag = FALSE;
688 qp_attr.av.dlid = qp->dlid;/*RESPONDER_LID;*/
689 qp_attr.av.static_rate = 0;
690 qp_attr.av.src_path_bits = 0;
691 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_AV);
693 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
695 qp_attr.path_mtu = MTU_2048;// default is MTU_2048
696 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PATH_MTU);
698 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
700 qp_attr.rq_psn = START_RQ_PSN;
701 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_RQ_PSN);
703 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
705 qp_attr.qp_ous_rd_atom = NUM_WQE;
706 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_OUS_RD_ATOM);
708 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
711 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PKEY_IX);
713 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
715 qp_attr.min_rnr_timer = 10;
716 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_MIN_RNR_TIMER);
718 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
// Destination QP number, also from the startup exchange.
720 qp_attr.dest_qp_num = qp->rqp_num;
722 CDEBUG(D_OTHER, "remore qp num %d\n", qp->rqp_num);
724 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_DEST_QP_NUM);
726 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
728 vstat = VAPI_modify_qp(qp->hca_hndl,
735 if (vstat != VAPI_OK) {
736 CERROR("Failed modifying QP from INIT to RTR. qp_index %d - %s\n",
737 qp_index, VAPI_strerror(vstat));
741 vstat= VAPI_query_qp(qp->hca_hndl,
747 if (vstat != VAPI_OK) {
748 CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
752 /* RTR to RTS - Change QP to RTS */
753 CDEBUG(D_OTHER, "Changing QP state to RTS\n");
755 QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
757 qp_attr.qp_state = VAPI_RTS;
758 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
760 qp_attr.sq_psn = START_SQ_PSN;
761 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_SQ_PSN);
// Local ACK timeout exponent (4.096us * 2^0x18).
763 qp_attr.timeout = 0x18;
764 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_TIMEOUT);
// NOTE(review): IBA retry_count is a 3-bit field (max 7); 10 likely gets
// truncated by the HCA driver — confirm intended value.
766 qp_attr.retry_count = 10;
767 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_RETRY_COUNT);
// NOTE(review): rnr_retry is also 3-bit; 14 is out of range (7 = infinite).
769 qp_attr.rnr_retry = 14;
770 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_RNR_RETRY);
772 qp_attr.ous_dst_rd_atom = 100;
773 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_OUS_DST_RD_ATOM);
775 qp_attr.min_rnr_timer = 5;
776 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_MIN_RNR_TIMER);
778 vstat = VAPI_modify_qp(qp->hca_hndl,
784 if (vstat != VAPI_OK) {
785 CERROR("Failed modifying QP from RTR to RTS. %s:%s\n",
786 VAPI_strerror_sym(vstat), VAPI_strerror(vstat));
790 vstat= VAPI_query_qp(qp->hca_hndl,
796 if (vstat != VAPI_OK) {
797 CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
802 // a QP is at RTS state NOW
805 CDEBUG(D_OTHER, "IBNAL- RC qp is at RTS NOW\n");
/*
 * IB_Open_HCA - one-shot bring-up of the whole IB stack for this NAL.
 *
 * Sequence: open/get HCA handle, query capabilities and both ports, allocate
 * a PD, register the send/recv memory regions (createMemRegion), create one
 * CQ shared by all SQs and RQs, create and connect NUM_QPS RC QPs (QP info
 * exchanged with the remote node through k_server_thread + a user agent),
 * post initial receive buffers, and install the async event handler.
 */
813 IB_Open_HCA(kibnal_data_t *kib_data)
817 VAPI_cqe_num_t cqe_active_num;
// NOTE(review): this local shadows the file-scope Num_posted_recv_buf —
// the assignment near line 1002 therefore never reaches the global.
820 int Num_posted_recv_buf;
823 CDEBUG(D_PORTALS, "Opening an HCA\n");
// NOTE(review): the result of VAPI_open_hca() is immediately overwritten by
// EVAPI_get_hca_hndl(); an open failure is silently ignored (only the second
// call's status is checked).  If both calls are really needed, check each.
825 vstat = VAPI_open_hca(HCA_ID, &Hca_hndl);
826 vstat = EVAPI_get_hca_hndl(HCA_ID, &Hca_hndl);
827 if (vstat != VAPI_OK) {
828 CERROR("Failed opening the HCA: %s. %s...\n",HCA_ID,VAPI_strerror(vstat));
833 vstat = VAPI_query_hca_cap(Hca_hndl, &Hca_vendor, &Hca_cap);
834 if (vstat != VAPI_OK) {
835 CERROR("Failed query hca cap %s\n",VAPI_strerror(vstat));
839 /* Get port 1 info */
840 vstat = VAPI_query_hca_port_prop(Hca_hndl, HCA_PORT_1 , &Hca_port_1_props);
841 if (vstat != VAPI_OK) {
842 CERROR("Failed query port cap %s\n",VAPI_strerror(vstat));
846 /* Get port 2 info */
847 vstat = VAPI_query_hca_port_prop(Hca_hndl, HCA_PORT_2, &Hca_port_2_props);
848 if (vstat != VAPI_OK) {
849 CERROR("Failed query port cap %s\n",VAPI_strerror(vstat));
854 CDEBUG(D_PORTALS, "Allocating PD \n");
855 vstat = VAPI_alloc_pd(Hca_hndl,&Pd_hndl);
856 if (vstat != VAPI_OK) {
857 CERROR("Failed allocating a PD. %s\n",VAPI_strerror(vstat));
861 vstat = createMemRegion(Hca_hndl, Pd_hndl);
862 if (vstat != VAPI_OK) {
863 CERROR("Failed registering a memory region.%s\n",VAPI_strerror(vstat));
867 /* Create CQ for RQ*/
868 CDEBUG(D_PORTALS, "Creating a send completion queue\n");
870 vstat = VAPI_create_cq(Hca_hndl,
875 if (vstat != VAPI_OK) {
876 CERROR("Failed creating a CQ. %s\n",VAPI_strerror(vstat));
// If the HCA granted exactly the requested depth there is no headroom.
880 if(NUM_CQE == cqe_active_num) {
881 CERROR("VAPI_create_cq: NUM_CQE EQ cqe_active_num \n");
884 CDEBUG(D_NET, "VAPI_create_cq: NUM_CQE %d , actual cqe_active_num %d \n",
885 NUM_CQE, cqe_active_num);
// One CQ serves both send and receive queues.
888 Cq_SQ_hndl = Cq_hndl;
889 Cq_RQ_hndl = Cq_hndl;
894 for(i=0; i < NUM_QPS; i++) {
895 QP_list[i].pd_hndl = Pd_hndl;
896 QP_list[i].hca_hndl = Hca_hndl;
897 // sq rq use the same Cq_hndl
898 QP_list[i].sq_cq_hndl = Cq_hndl;
899 QP_list[i].rq_cq_hndl = Cq_hndl;
900 vstat = create_qp(&QP_list[i], i);
901 if (vstat != VAPI_OK) {
902 CERROR("Failed creating a QP %d %s\n",i, VAPI_strerror(vstat));
// Package the handles for the event handlers and helper threads.
911 Hca_data.hca_hndl = Hca_hndl; // HCA handle
912 Hca_data.pd_hndl = Pd_hndl; // protection domain
913 Hca_data.port = 1; // port number
914 Hca_data.num_qp = NUM_QPS; // number of qp used
916 for(i=0; i < NUM_QPS; i++) {
917 Hca_data.qp_ptr[i] = &QP_list[i]; // point to QP_list
920 Hca_data.num_cq = NUM_CQ; // number of cq used
921 Hca_data.cq_hndl = Cq_hndl; //
922 Hca_data.sq_cq_hndl = Cq_SQ_hndl; //
923 Hca_data.rq_cq_hndl = Cq_RQ_hndl; //
924 Hca_data.kib_data = kib_data; //
925 Hca_data.slid = QP_list[0].slid;//
929 #ifdef USE_SHARED_MEMORY_AND_SOCKET
932 * + use a shared-memory between a user thread and a kernel thread
933 * for HCA's data exchange on the same node
934 * + use socket in user mode to exhange HCA's data with a remote node
// Stage local QP endpoint data for the exchange: our slid/qp_num become the
// remote side's dlid/rqp_num.
938 R_QP_data.opcode = SEND_QP_INFO;
939 R_QP_data.length = sizeof(L_QP_data);
941 for(i=0; i < NUM_QPS; i++) {
942 // my slid will be used in a remote node as dlid
943 R_QP_data.dlid[i] = QP_list[i].slid;
944 // my qp_num will be used in remode node as remote_qp_number
945 // RC is used here so we need dlid and rqp_num
946 R_QP_data.rqp_num[i] = QP_list[i].qp_num ;
949 // create a kernel thread for exchanging HCA's data
950 // R_QP_data will be exchanged with a remoe node
952 kernel_thread(k_server_thread, &R_QP_data, 0); //
953 // check if the HCA'data have been updated by kernel_thread
954 // loop until the HCA's data is updated
955 // make sure that uagent is running
957 // QP info is exchanged with a remote node
// NOTE(review): busy-poll with schedule_timeout() and no task-state setup or
// timeout bound; hangs forever if the user agent never responds.
959 schedule_timeout(1000);
960 if(R_QP_data.opcode == RECV_QP_INFO) {
961 CDEBUG(D_NET, "HCA's data is being updated\n");
968 #ifdef USE_SHARED_MEMORY_AND_MULTICAST
971 * + use a shared-memory between a user thread and a kernel thread
972 * for HCA's data exchange on the same node
973 * + use Infinoband UR/multicast in user mode to exhange HCA's data with i
// Install the peer's endpoint data into our QP table ...
982 for(i=0; i < NUM_QPS; i++) {
983 qp = (QP_info *) &QP_list[i];
984 QP_list[i].rqp_num = R_QP_data.rqp_num[i]; // remoter qp number
985 QP_list[i].dlid = R_QP_data.dlid[i]; // remote dlid
988 // already have remote_qp_num adn dlid information
989 // initialize QP to RTR/RTS state
// ... then connect every QP (RC state machine to RTS).
991 for(i=0; i < NUM_QPS; i++) {
992 vstat = init_qp_RC(&QP_list[i], i);
993 if (vstat != VAPI_OK) {
994 CERROR("Failed change a QP %d to RTS state%s\n",
995 i,VAPI_strerror(vstat));
1000 // post receiving buffer before any send happened
1002 Num_posted_recv_buf = post_recv_bufs( (VAPI_wr_id_t ) START_RECV_WRQ_ID);
1004 // for irregular completion event or some unexpected failure event
1005 vstat = IB_Set_Async_Event_Handler(Hca_data, &kibnal_data);
1006 if (vstat != VAPI_OK) {
1007 CERROR("IB_Set_Async_Event_Handler failed: %d\n", vstat);
1012 CDEBUG(D_PORTALS, "IBNAL- done with IB_Open_HCA\n");
// NOTE(review): "MSB_mutex" does not match the file-scope declaration
// "MSBuf_mutex" seen earlier — confirm which identifier actually exists.
1014 for(i=0; i < NUM_MBUF; i++) {
1015 spin_lock_init(&MSB_mutex[i]);
1024 Function: IB_Set_Event_Handler()
1026 IN Hca_info hca_data
1027 IN kibnal_data_t *kib_data -- private data
1030 return: VAPI_OK - success
/*
 * Registers the completion-event handler for the shared CQ and requests the
 * first completion notification; both failures are logged with the HCA id.
 */
1036 IB_Set_Event_Handler(HCA_info hca_data, kibnal_data_t *kib_data)
1039 EVAPI_compl_handler_hndl_t comp_handler_hndl;
1041 // register CQE_Event_Hnadler
1043 vstat = VAPI_set_comp_event_handler(hca_data.hca_hndl,
1048 or use extended VAPI function
1049 vstat = EVAPI_set_comp_eventh(hca_data.hca_hndl,
1057 if (vstat != VAPI_OK) {
1058 CERROR("IB_Set_Event_Handler: failed EVAPI_set_comp_eventh for"
1059 " HCA ID = %s (%s).\n", HCA_ID, VAPI_strerror(vstat));
1063 // issue a request for completion ievent notification
// Re-arm the CQ so the handler fires on the next completion.
1064 vstat = VAPI_req_comp_notif(hca_data.hca_hndl,
1068 if (vstat != VAPI_OK) {
1069 CERROR("IB_Set_Event_Handler: failed VAPI_req_comp_notif for HCA ID"
1070 " = %s (%s).\n", HCA_ID, VAPI_strerror(vstat));
1079 Function: IB_Set_Async_Event_Handler()
1081 IN HCA_info hca_data
1082 IN kibnal_data_t *kib_data -- private data
1085 return: VAPI_OK - success
/*
 * Registers async_event_handler for irregular/fatal HCA events (CQ errors,
 * port state changes, etc.); failure is logged with the HCA id.
 */
1092 IB_Set_Async_Event_Handler(HCA_info hca_data, kibnal_data_t *kib_data)
1097 // register an asynchronous event handler for this HCA
1100 vstat= VAPI_set_async_event_handler(hca_data.hca_hndl,
1101 async_event_handler,
1104 if (vstat != VAPI_OK) {
1105 CERROR("IB_Set_Async_Event_Handler: failed VAPI_set_async_comp_event_handler"
1106 " for HCA ID = %s (%s).\n", HCA_ID, VAPI_strerror(vstat));
1114 // close this Infiniband HCA interface
1115 // release allocated resources to system
/*
 * NOTE(review): the function signature line is missing from this fragment;
 * by the comments and the call in kibnal_shutdown() this is presumably
 * IB_Close_HCA().  Teardown order: destroy all QPs, destroy the shared CQ
 * (once per QP slot — see note below), deregister memory regions, then
 * close the HCA handle.
 */
1126 CDEBUG(D_PORTALS, "Destroying QP\n");
1128 for(i=0; i < NUM_QPS; i++) {
1129 vstat = VAPI_destroy_qp(QP_list[i].hca_hndl, QP_list[i].qp_hndl);
1130 if (vstat != VAPI_OK) {
1131 CERROR("Failed destroying QP %d. %s\n", i, VAPI_strerror(vstat));
1138 CDEBUG(D_PORTALS, "Destroying CQ\n");
// NOTE(review): all QPs share ONE CQ (aliased in IB_Open_HCA), yet this loop
// attempts to destroy sq_cq_hndl once per QP — after the first iteration the
// handle is already destroyed; iterations 2..NUM_QPS-1 should fail.  Confirm
// whether the full source breaks out of the loop.
1139 for(i=0; i < NUM_QPS; i++) {
1140 // send_cq adn receive_cq are shared the same CQ
1141 // so only destroy one of them
1142 vstat = VAPI_destroy_cq(QP_list[i].hca_hndl, QP_list[i].sq_cq_hndl);
1143 if (vstat != VAPI_OK) {
1144 CERROR("Failed destroying CQ %d. %s\n", i, VAPI_strerror(vstat));
1151 /* Destroy Memory Region */
1152 CDEBUG(D_PORTALS, "Deregistering MR\n");
1153 for(i=0; i < NUM_QPS; i++) {
1154 vstat = deleteMemRegion(&QP_list[i], i);
1155 if (vstat != VAPI_OK) {
1156 CERROR("Failed deregister mem reg %d. %s\n",i, VAPI_strerror(vstat));
1166 CDEBUG(D_PORTALS, "Closing HCA\n");
1167 vstat = VAPI_close_hca(Hca_hndl);
1168 if (vstat != VAPI_OK) {
1169 CERROR("Failed to close HCA. %s\n", VAPI_strerror(vstat));
1174 CDEBUG(D_PORTALS, "IBNAL- Done with closing HCA \n");
/*
 * createMemRegion - allocate and register the fixed send/recv buffer pools.
 *
 * For each of NUM_ENTRY slots in MSbuf_list (send) and MRbuf_list (recv):
 * allocate a KB_32 buffer, register it with the HCA (local write + remote
 * read/write access), and record handle/keys/status in the table.  Slots
 * NUM_ENTRY..NUM_MBUF-1 of the send table are left unregistered for on-demand
 * RDMA buffers.  Finally the first NUM_QPS send-slot registrations are
 * mirrored into QP_list for convenience.  Returns VAPI_ENOMEM on allocation
 * failure (registration-failure return path is on lines missing from this
 * fragment).
 */
1181 createMemRegion(VAPI_hca_hndl_t hca_hndl,
1182 VAPI_pd_hndl_t pd_hndl)
1187 VAPI_mr_hndl_t rep_mr_hndl;
1192 // send registered memory region
1193 for(i=0; i < NUM_ENTRY; i++) {
1194 MSbuf_list[i].buf_size = KB_32;
1195 PORTAL_ALLOC(bufptr, MSbuf_list[i].buf_size);
1196 if(bufptr == NULL) {
1197 CDEBUG(D_MALLOC,"Failed to malloc a block of send memory, qix %d size %d\n",
1198 i, MSbuf_list[i].buf_size);
1199 CERROR("Failed to malloc a block of send memory, qix %d size %d\n",
1200 i, MSbuf_list[i].buf_size);
// NOTE(review): returning here leaks the buffers allocated in earlier
// iterations — confirm intended (module-load failure path may tolerate it).
1201 return(VAPI_ENOMEM);
1205 mrw.pd_hndl= pd_hndl;
1206 mrw.start = MSbuf_list[i].buf_addr = (VAPI_virt_addr_t)(MT_virt_addr_t) bufptr;
1207 mrw.size = MSbuf_list[i].buf_size;
1208 mrw.acl = VAPI_EN_LOCAL_WRITE |
1209 VAPI_EN_REMOTE_WRITE |
1210 VAPI_EN_REMOTE_READ;
1212 // register send memory region
1213 vstat = VAPI_register_mr(hca_hndl,
1218 // this memory region is going to be reused until deregister is called
1219 if(vstat != VAPI_OK) {
1220 CERROR("Failed registering a mem region qix %d Addr=%p, Len=%d. %s\n",
1221 i, mrw.start, mrw.size, VAPI_strerror(vstat));
// Record the registration results for this send slot.
1225 MSbuf_list[i].mr = rep_mr;
1226 MSbuf_list[i].mr_hndl = rep_mr_hndl;
1227 MSbuf_list[i].bufptr = bufptr;
1228 MSbuf_list[i].buf_addr = rep_mr.start;
1229 MSbuf_list[i].status = BUF_REGISTERED;
1230 MSbuf_list[i].ref_count = 0;
1231 MSbuf_list[i].buf_type = REG_BUF;
1232 MSbuf_list[i].raddr = 0x0;
1233 MSbuf_list[i].rkey = 0x0;
1236 // RDAM buffer is not reserved for RDAM WRITE/READ
1238 for(i=NUM_ENTRY; i< NUM_MBUF; i++) {
1239 MSbuf_list[i].status = BUF_UNREGISTERED;
1240 MSbuf_list[i].buf_type = RDMA_BUF;
1244 // recv registered memory region
1245 for(i=0; i < NUM_ENTRY; i++) {
1246 MRbuf_list[i].buf_size = KB_32;
1247 PORTAL_ALLOC(bufptr, MRbuf_list[i].buf_size);
1249 if(bufptr == NULL) {
1250 CDEBUG(D_MALLOC, "Failed to malloc a block of send memory, qix %d size %d\n",
1251 i, MRbuf_list[i].buf_size);
1252 return(VAPI_ENOMEM);
1256 mrw.pd_hndl= pd_hndl;
1257 mrw.start = (VAPI_virt_addr_t)(MT_virt_addr_t) bufptr;
1258 mrw.size = MRbuf_list[i].buf_size;
1259 mrw.acl = VAPI_EN_LOCAL_WRITE |
1260 VAPI_EN_REMOTE_WRITE |
1261 VAPI_EN_REMOTE_READ;
1263 // register send memory region
1264 vstat = VAPI_register_mr(hca_hndl,
1269 // this memory region is going to be reused until deregister is called
1270 if(vstat != VAPI_OK) {
1271 CERROR("Failed registering a mem region qix %d Addr=%p, Len=%d. %s\n",
1272 i, mrw.start, mrw.size, VAPI_strerror(vstat));
// Record the registration results for this recv slot (recv slots keep both
// keys; send slots zero theirs above).
1276 MRbuf_list[i].mr = rep_mr;
1277 MRbuf_list[i].mr_hndl = rep_mr_hndl;
1278 MRbuf_list[i].bufptr = bufptr;
1279 MRbuf_list[i].buf_addr = rep_mr.start;
1280 MRbuf_list[i].status = BUF_REGISTERED;
1281 MRbuf_list[i].ref_count = 0;
1282 MRbuf_list[i].buf_type = REG_BUF;
1283 MRbuf_list[i].raddr = 0x0;
1284 MRbuf_list[i].rkey = rep_mr.r_key;
1285 MRbuf_list[i].lkey = rep_mr.l_key;
1289 // keep extra information for a qp
1290 for(i=0; i < NUM_QPS; i++) {
1291 QP_list[i].mr_hndl = MSbuf_list[i].mr_hndl;
1292 QP_list[i].mr = MSbuf_list[i].mr;
1293 QP_list[i].bufptr = MSbuf_list[i].bufptr;
1294 QP_list[i].buf_addr = MSbuf_list[i].buf_addr;
1295 QP_list[i].buf_size = MSbuf_list[i].buf_size;
1296 QP_list[i].raddr = MSbuf_list[i].raddr;
1297 QP_list[i].rkey = MSbuf_list[i].rkey;
1298 QP_list[i].lkey = MSbuf_list[i].lkey;
1301 CDEBUG(D_PORTALS, "IBNAL- done VAPI_ret_t createMemRegion \n");
1305 } /* createMemRegion */
/*
 * deleteMemRegion - free and deregister the send and recv buffers of slot
 * `qix` (counterpart of createMemRegion, called per-QP from teardown).
 *
 * NOTE(review): each buffer is PORTAL_FREE'd BEFORE its memory region is
 * deregistered with the HCA.  The conventional (and safe) order is
 * deregister-then-free, since the HCA may still reference the memory until
 * VAPI_deregister_mr() returns — confirm and fix in the full source.
 */
1310 deleteMemRegion(QP_info *qp, int qix)
1315 // free send memory assocaited with this memory region
1317 PORTAL_FREE(MSbuf_list[qix].bufptr, MSbuf_list[qix].buf_size);
1320 vstat = VAPI_deregister_mr(qp->hca_hndl, MSbuf_list[qix].mr_hndl);
1322 if(vstat != VAPI_OK) {
1323 CERROR("Failed deregistering a send mem region qix %d %s\n",
1324 qix, VAPI_strerror(vstat));
1329 // free recv memory assocaited with this memory region
1331 PORTAL_FREE(MRbuf_list[qix].bufptr, MRbuf_list[qix].buf_size);
1334 vstat = VAPI_deregister_mr(qp->hca_hndl, MRbuf_list[qix].mr_hndl);
1336 if(vstat != VAPI_OK) {
1337 CERROR("Failed deregistering a recv mem region qix %d %s\n",
1338 qix, VAPI_strerror(vstat));
1347 // polling based event handling
1348 // + a daemon process
1349 // + poll the CQ and check what is in the CQ
1350 // + process incoming CQ event
/*
 * Shared RDMA handshake state: Rdma_info holds the most recent RTS/CTS
 * exchange record; Cts_Message_arrived flags CTS receipt to the sender path.
 * NOTE(review): both are written here and presumably read by sender code not
 * in this fragment, with no visible locking — confirm synchronization.
 */
1355 RDMA_Info_Exchange Rdma_info;
1356 int Cts_Message_arrived = NO;
/*
 * k_recv_thread - daemonized polling loop that drains the shared CQ and
 * dispatches each completion by opcode.
 *
 * Send-side completions (SQ_SEND_DATA) release/deregister the send buffer;
 * receive completions (RQ_SEND_DATA) either handle RTS/CTS handshake
 * messages locally (work-request id in [RDMA_CTS_ID, RDMA_OP_ID)) or pass
 * the payload up to the portals layer.  The function runs past the end of
 * this fragment — its loop structure and exit condition are not visible.
 */
1358 void k_recv_thread(HCA_info *hca_data)
1361 VAPI_wc_desc_t comp_desc;
1362 unsigned long polling_count = 0;
1363 u_int32_t timeout_usec;
1364 unsigned int priority = 100;
1365 unsigned int length;
1366 VAPI_wr_id_t wrq_id;
1367 u_int32_t transferred_data_length; /* Num. of bytes transferred */
1369 VAPI_virt_addr_t bufaddr;
1370 unsigned long buf_size = 0;
1371 QP_info *qp; // point to QP_list
1373 kportal_daemonize("k_recv_thread"); // make it as a daemon process
1376 timeout_usec = 100; // how is the impact on the performance
1378 // send Q and receive Q are using the same CQ
1379 // so only poll one CQ for both operations
1381 CDEBUG(D_NET, "IBNAL- enter kibnal_recv_thread\n");
1382 CDEBUG(D_NET, "hca_hndl = 0X%x, cq_hndl=0X%x\n",
1383 hca_data->hca_hndl,hca_data->cq_hndl);
1385 qp = hca_data->qp_ptr;
1387 CDEBUG(D_NET, "in recv_thread qp is NULL\n");
1388 CDEBUG(D_NET, "Exit from recv_thread qp is NULL\n");
1392 CDEBUG(D_NET, "in recv_thread qp is 0X%X\n", qp);
1395 CDEBUG(D_NET, "kibnal_recv_thread - enter event driver polling loop\n");
1407 // send Q and receive Q are using the same CQ
1408 // so only poll one CQ for both operations
// Non-blocking poll; VAPI_CQ_EMPTY just means nothing pending.
1411 vstat = VAPI_poll_cq(hca_data->hca_hndl,hca_data->cq_hndl, &comp_desc);
1413 if (vstat == VAPI_CQ_EMPTY) {
1414 // there is no event in CQE
1418 if (vstat != (VAPI_OK)) {
1419 CERROR("error while polling completion queuei vstat %d \n", vstat);
1424 // process the complete event
1425 switch(comp_desc.opcode) {
1426 case VAPI_CQE_SQ_SEND_DATA:
1427 // about the Send Q ,POST SEND completion
1428 // who needs this information
1430 // mark MSbuf_list[wr_id].status = BUF_REGISTERED
1432 wrq_id = comp_desc.id;
// Work-request ids encode the message class: ids above RDMA_OP_ID are RDMA
// payload sends whose temporary MR must be deregistered.
1434 if(RDMA_OP_ID < wrq_id) {
1435 // this RDMA message id, adjust it to the right entry
1436 wrq_id = wrq_id - RDMA_OP_ID;
1437 vstat = VAPI_deregister_mr(qp->hca_hndl, Local_rdma_info.send_rdma_mr_hndl);
1440 if(vstat != VAPI_OK) {
1441 CERROR("VAPI_CQE_SQ_SEND_DATA: Failed deregistering a RDMAi recv" " mem region %s\n", VAPI_strerror(vstat));
1444 if((RDMA_CTS_ID <= wrq_id) && (RDMA_OP_ID < wrq_id)) {
1445 // RTS or CTS send complete, release send buffer
1446 if(wrq_id >= RDMA_RTS_ID)
1447 wrq_id = wrq_id - RDMA_RTS_ID;
1449 wrq_id = wrq_id - RDMA_CTS_ID;
// NOTE(review): a SEND completion releasing a *send* buffer updates
// MRbuf_list (the recv table) under MSB_mutex — likely should be
// MSbuf_list; also MSB_mutex does not match the declared MSBuf_mutex.
1452 spin_lock(&MSB_mutex[(int) wrq_id]);
1453 MRbuf_list[wrq_id].status = BUF_REGISTERED;
1454 spin_unlock(&MSB_mutex[(int) wrq_id]);
1456 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_SEND_DATA\n");
1459 case VAPI_CQE_SQ_RDMA_WRITE:
1460 // about the Send Q, RDMA write completion
1461 // who needs this information
1462 // data is successfully write from pource to destionation
1465 // mark MSbuf_list[wr_id].status = BUF_REGISTERED
1466 // de-register rdma buffer
1469 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_RDMA_WRITE\n");
1472 case VAPI_CQE_SQ_RDMA_READ:
1474 // RDMA read completion
1475 // who needs this information
1476 // data is successfully read from destionation to source
1477 CDEBUG(D_NET, "CQE opcode- VAPI_CQE_SQ_RDMA_READ\n");
1480 case VAPI_CQE_SQ_COMP_SWAP:
1482 // RDMA write completion
1483 // who needs this information
1485 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_COMP_SWAP\n");
1488 case VAPI_CQE_SQ_FETCH_ADD:
1490 // RDMA write completion
1491 // who needs this information
1493 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_FETCH_ADD\n");
1496 case VAPI_CQE_SQ_BIND_MRW:
1498 // RDMA write completion
1499 // who needs this information
1501 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_BIND_MRW\n");
1504 case VAPI_CQE_RQ_SEND_DATA:
1505 // about the Receive Q
1506 // process the incoming data and
1507 // forward it to .....
1508 // a completion recevie event is arriving at CQ
1509 // issue a recevie to get this arriving data out from CQ
1510 // pass the receiving data for further processing
1511 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_SEND_DATA\n")
1512 wrq_id = comp_desc.id ;
1513 transferred_data_length = comp_desc.byte_len;
// ids in [RDMA_CTS_ID, RDMA_OP_ID) are handshake control messages handled
// here; all other ids are payload for the portals layer.
1515 if((wrq_id >= RDMA_CTS_ID) && (wrq_id < RDMA_OP_ID)) {
1516 // this is RTS/CTS message
1517 // process it locally and don't pass it to portals layer
1518 // adjust wrq_id to get the right entry in MRbfu_list
1520 if(wrq_id >= RDMA_RTS_ID)
1521 wrq_id = wrq_id - RDMA_RTS_ID;
1523 wrq_id = wrq_id - RDMA_CTS_ID;
1525 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) MRbuf_list[wrq_id].buf_addr;
1526 MRbuf_list[wrq_id].status = BUF_INUSE;
// NOTE(review): memcpy source is &bufaddr (the address of the local
// variable), not the buffer it points to — this copies the pointer value
// plus adjacent stack bytes into Rdma_info.  Confirm intended cast in the
// full source; a plain (void *)(MT_virt_addr_t)bufaddr source looks right.
1527 memcpy(&Rdma_info, &bufaddr, sizeof(RDMA_Info_Exchange));
1529 if(Ready_To_send == Rdma_info.opcode)
1530 // an RTS request message from remote node
1531 // prepare local RDMA buffer and send local rdma info to
1533 CTS_handshaking_protocol(&Rdma_info);
1535 if((Clear_To_send == Rdma_info.opcode) &&
1536 (RDMA_BUFFER_RESERVED == Rdma_info.flag))
1537 Cts_Message_arrived = YES;
1539 if(RDMA_BUFFER_UNAVAILABLE == Rdma_info.flag)
1540 CERROR("RDMA operation abort-RDMA_BUFFER_UNAVAILABLE\n");
1544 // this is an incoming mesage for portals layer
1545 // move to PORTALS layer for further processing
1548 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t)
1549 MRbuf_list[wrq_id].buf_addr;
1551 MRbuf_list[wrq_id].status = BUF_INUSE;
1552 transferred_data_length = comp_desc.byte_len;
1554 kibnal_rx(hca_data->kib_data,
1556 transferred_data_length,
1557 MRbuf_list[wrq_id].buf_size,
1561 // repost this receiving buffer and makr it at BUF_REGISTERED
1563 vstat = repost_recv_buf(qp, wrq_id);
1564 if(vstat != (VAPI_OK)) {
1565 CERROR("error while polling completion queue\n");
1568 MRbuf_list[wrq_id].status = BUF_REGISTERED;
1573 case VAPI_CQE_RQ_RDMA_WITH_IMM:
1574 // about the Receive Q
1575 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_RDMA_WITH_IMM\n");
1577 wrq_id = comp_desc.id ;
1578 transferred_data_length = comp_desc.byte_len;
1580 if(wrq_id == RDMA_OP_ID) {
1581 // this is RDAM op , locate the RDAM memory buffer address
1583 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) Local_rdma_info.raddr;
1585 transferred_data_length = comp_desc.byte_len;
1587 kibnal_rx(hca_data->kib_data,
1589 transferred_data_length,
1590 Local_rdma_info.buf_length,
1593 // de-regiser this RDAM receiving memory buffer
1594 // too early ?? test & check
1595 vstat = VAPI_deregister_mr(qp->hca_hndl, Local_rdma_info.recv_rdma_mr_hndl);
1596 if(vstat != VAPI_OK) {
1597 CERROR("VAPI_CQE_RQ_RDMA_WITH_IMM: Failed deregistering a RDMA"
1598 " recv mem region %s\n", VAPI_strerror(vstat));
1602 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_RDMA_WITH_IMM\n");
1605 case VAPI_CQE_INVAL_OPCODE:
1607 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_INVAL_OPCODE\n");
1611 CDEBUG(D_NET, "CQE opcode-unknown opcode\n");
1615 schedule_timeout(RECEIVING_THREAD_TIMEOUT);//how often do we need to poll CQ
1617 }// receiving while loop
/*
 * CQE_event_handler - completion-queue event callback registered with the HCA
 * (installed via IB_Set_Event_Handler when EVENT_BASED_CQE_HANDLING is used).
 *
 * Polls the single shared CQ (send and receive queues post to the same CQ, so
 * one poll serves both directions), dispatches on the completion opcode, and
 * finally re-arms completion notification with VAPI_req_comp_notif.
 *
 * NOTE(review): several lines of this function (a trailing parameter of the
 * signature, some declarations such as 'hca_data'/'vstat', and closing braces)
 * are elided in this extract; comments below describe only the visible code.
 * NOTE(review): 'qp' is declared below but never assigned before it is
 * dereferenced (qp->hca_hndl) and passed to repost_recv_buf() - this looks
 * like an uninitialized-pointer bug; confirm against the full file.
 */
1623 void CQE_event_handler(VAPI_hca_hndl_t hca_hndl,
1624 VAPI_cq_hndl_t cq_hndl,
1628 VAPI_wc_desc_t comp_desc;
1629 unsigned long polling_count = 0;
1630 u_int32_t timeout_usec;
1631 unsigned int priority = 100;
1632 unsigned int length;
1633 VAPI_wr_id_t wrq_id;
1634 u_int32_t transferred_data_length; /* Num. of bytes transferred */
1636 VAPI_virt_addr_t bufaddr;
1637 unsigned long buf_size = 0;
1638 QP_info *qp; // points into QP_list; NOTE(review): never assigned here
1641 // send Q and receive Q are using the same CQ
1642 // so only poll one CQ for both operations
1644 CDEBUG(D_NET, "IBNAL- enter CQE_event_handler\n");
1645 printk("IBNAL- enter CQE_event_handler\n");
1647 hca_data = (HCA_info *) private;
1654 vstat = VAPI_poll_cq(hca_data->hca_hndl,hca_data->cq_hndl, &comp_desc);
// VAPI_CQ_EMPTY means we were notified but the CQ held no entry -
// unexpected in an event-driven handler, hence the loud diagnostics.
1656 if (vstat == VAPI_CQ_EMPTY) {
1657 CDEBUG(D_NET, "CQE_event_handler: there is no event in CQE, how could"
1658 " this " "happened \n");
1659 printk("CQE_event_handler: there is no event in CQE, how could"
1660 " this " "happened \n");
1664 if (vstat != (VAPI_OK)) {
1665 CDEBUG(D_NET, "error while polling completion queue vstat %d - %s\n",
1666 vstat, VAPI_strerror(vstat));
1667 printk("error while polling completion queue vstat %d - %s\n",
1668 vstat, VAPI_strerror(vstat));
1673 // process the completion event
1674 switch(comp_desc.opcode) {
1675 case VAPI_CQE_SQ_SEND_DATA:
1676 // Send Q: POST SEND completion
1677 // who needs this information
1679 // mark MSbuf_list[wr_id].status = BUF_REGISTERED
1681 wrq_id = comp_desc.id;
1683 #ifdef IBNAL_SELF_TESTING
1684 if(wrq_id == SEND_RECV_TEST_ID) {
1685 printk("IBNAL_SELF_TESTING - VAPI_CQE_SQ_SEND_DATA \n");
// Work-request ids above RDMA_OP_ID encode RDMA sends; rebase the id
// and release the send-side RDMA memory region.
1688 if(RDMA_OP_ID < wrq_id) {
1689 // this is an RDMA message id; adjust it to the right entry
1690 wrq_id = wrq_id - RDMA_OP_ID;
1691 vstat = VAPI_deregister_mr(qp->hca_hndl,
1692 Local_rdma_info.send_rdma_mr_hndl);
1695 if(vstat != VAPI_OK) {
1696 CERROR(" VAPI_CQE_SQ_SEND_DATA: Failed deregistering a RDMA"
1697 " recv mem region %s\n", VAPI_strerror(vstat));
1700 if((RDMA_CTS_ID <= wrq_id) && (RDMA_OP_ID < wrq_id)) {
1701 // RTS or CTS send complete, release send buffer
1702 if(wrq_id >= RDMA_RTS_ID)
1703 wrq_id = wrq_id - RDMA_RTS_ID;
1705 wrq_id = wrq_id - RDMA_CTS_ID;
// NOTE(review): comment above says MSbuf_list, but the code below
// updates MRbuf_list while holding MSB_mutex - one of the two looks
// wrong; confirm which list a send completion should release.
1708 spin_lock(&MSB_mutex[(int) wrq_id]);
1709 MRbuf_list[wrq_id].status = BUF_REGISTERED;
1710 spin_unlock(&MSB_mutex[(int) wrq_id]);
1713 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_SEND_DATA\n");
1717 case VAPI_CQE_SQ_RDMA_WRITE:
1718 // Send Q: RDMA write completion
1719 // who needs this information
1720 // data was successfully written from source to destination
1723 // mark MSbuf_list[wr_id].status = BUF_REGISTERED
1724 // de-register rdma buffer
1727 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_RDMA_WRITE\n");
1730 case VAPI_CQE_SQ_RDMA_READ:
1732 // RDMA read completion
1733 // who needs this information
1734 // data was successfully read from destination to source
1735 CDEBUG(D_NET, "CQE opcode- VAPI_CQE_SQ_RDMA_READ\n");
1738 case VAPI_CQE_SQ_COMP_SWAP:
1740 // atomic compare-and-swap completion (logged only)
1741 // who needs this information
1743 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_COMP_SWAP\n");
1746 case VAPI_CQE_SQ_FETCH_ADD:
1748 // atomic fetch-and-add completion (logged only)
1749 // who needs this information
1751 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_FETCH_ADD\n");
1754 case VAPI_CQE_SQ_BIND_MRW:
1756 // memory-window bind completion (logged only)
1757 // who needs this information
1759 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_BIND_MRW\n");
1762 case VAPI_CQE_RQ_SEND_DATA:
1763 // Receive Q completion:
1764 // process the incoming data and
1765 // forward it to .....
1766 // a completion receive event has arrived at the CQ
1767 // issue a receive to get this arriving data out from the CQ
1768 // pass the received data on for further processing
1770 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_SEND_DATA\n");
1772 wrq_id = comp_desc.id ;
1774 #ifdef IBNAL_SELF_TESTING
1779 if(wrq_id == SEND_RECV_TEST_ID) {
1780 printk("IBNAL_SELF_TESTING - VAPI_CQE_RQ_SEND_DATA\n");
1783 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t)
1784 MRbuf_list[ SEND_RECV_TEST_BUF_ID].buf_addr;
1785 MRbuf_list[SEND_RECV_TEST_BUF_ID].status = BUF_INUSE;
// NOTE(review): copies from the address of the local 'bufaddr'
// variable, not from the buffer it points to - presumably intended
// to read the registered receive buffer; confirm.
1786 memcpy(&rbuf, &bufaddr, KB_32);
1789 for(i=0; i < 16; i++)
1790 printk("rbuf[%d]=%c, ", rbuf[i]);
1793 // repost this receive buffer and mark it as BUF_REGISTERED
1794 vstat = repost_recv_buf(qp,SEND_RECV_TEST_BUF_ID);
1795 if(vstat != (VAPI_OK)) {
1796 printk("error while polling completion queue\n");
1799 MRbuf_list[SEND_RECV_TEST_BUF_ID].status = BUF_REGISTERED;
1802 transferred_data_length = comp_desc.byte_len;
// Work-request ids in [RDMA_CTS_ID, RDMA_OP_ID) carry the RTS/CTS
// handshake; they are consumed here and never reach the portals layer.
1804 if((wrq_id >= RDMA_CTS_ID) && (wrq_id < RDMA_OP_ID)) {
1805 // this is an RTS/CTS message
1806 // process it locally and don't pass it to the portals layer
1807 // adjust wrq_id to get the right entry in MRbuf_list
1809 if(wrq_id >= RDMA_RTS_ID)
1810 wrq_id = wrq_id - RDMA_RTS_ID;
1812 wrq_id = wrq_id - RDMA_CTS_ID;
1814 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t)
1815 MRbuf_list[wrq_id].buf_addr;
1816 MRbuf_list[wrq_id].status = BUF_INUSE;
// NOTE(review): copies sizeof(RDMA_Info_Exchange) bytes from the
// address of 'bufaddr' itself; only correct if that size fits -
// verify this is meant to read the message in the receive buffer.
1817 memcpy(&Rdma_info, &bufaddr, sizeof(RDMA_Info_Exchange));
1819 if(Ready_To_send == Rdma_info.opcode)
1820 // an RTS request message from the remote node:
1821 // prepare local RDMA buffer and send local rdma info back
1823 CTS_handshaking_protocol(&Rdma_info);
1825 if((Clear_To_send == Rdma_info.opcode) &&
1826 (RDMA_BUFFER_RESERVED == Rdma_info.flag))
1827 Cts_Message_arrived = YES;
1829 if(RDMA_BUFFER_UNAVAILABLE == Rdma_info.flag)
1830 CERROR("RDMA operation abort-RDMA_BUFFER_UNAVAILABLE\n");
1834 // this is an incoming message for the portals layer
1835 // move to PORTALS layer for further processing
1838 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t)
1839 MRbuf_list[wrq_id].buf_addr;
1841 MRbuf_list[wrq_id].status = BUF_INUSE;
1842 transferred_data_length = comp_desc.byte_len;
1844 kibnal_rx(hca_data->kib_data,
1846 transferred_data_length,
1847 MRbuf_list[wrq_id].buf_size,
1851 // repost this receive buffer and mark it as BUF_REGISTERED
1852 vstat = repost_recv_buf(qp, wrq_id);
1853 if(vstat != (VAPI_OK)) {
1854 CERROR("error while polling completion queue\n");
1857 MRbuf_list[wrq_id].status = BUF_REGISTERED;
1863 case VAPI_CQE_RQ_RDMA_WITH_IMM:
1864 // Receive Q: RDMA-write-with-immediate completion
1865 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_RDMA_WITH_IMM\n");
1867 wrq_id = comp_desc.id ;
1868 transferred_data_length = comp_desc.byte_len;
1870 if(wrq_id == RDMA_OP_ID) {
1871 // this is an RDMA op; locate the RDMA memory buffer address
1873 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) Local_rdma_info.raddr;
1875 transferred_data_length = comp_desc.byte_len;
1877 kibnal_rx(hca_data->kib_data,
1879 transferred_data_length,
1880 Local_rdma_info.buf_length,
1883 // de-register this RDMA receive memory buffer
1884 // too early ?? test & check
1885 vstat = VAPI_deregister_mr(qp->hca_hndl, Local_rdma_info.recv_rdma_mr_hndl);
1886 if(vstat != VAPI_OK) {
1887 CERROR("VAPI_CQE_RQ_RDMA_WITH_IMM: Failed deregistering a RDMA"
1888 " recv mem region %s\n", VAPI_strerror(vstat));
1892 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_RDMA_WITH_IMM\n");
1895 case VAPI_CQE_INVAL_OPCODE:
1897 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_INVAL_OPCODE\n");
1901 CDEBUG(D_NET, "CQE opcode-unknown opcode\n");
// Re-arm: request the next completion event notification, otherwise no
// further CQE events would be delivered to this handler.
1906 // issue a new request for completion event notification
1907 vstat = VAPI_req_comp_notif(hca_data->hca_hndl,
1912 if(vstat != VAPI_OK) {
1913 CERROR("PI_req_comp_notif: Failed %s\n", VAPI_strerror(vstat));
1916 return; // end of event handler
/*
 * kibnal_cmd - ioctl entry point for the IB NAL.
 * Body is elided in this extract apart from the trace below; presumably a
 * stub or dispatcher for portal_ioctl_data commands - confirm in full file.
 */
1923 kibnal_cmd(struct portal_ioctl_data * data, void * private)
1927 CDEBUG(D_NET, "kibnal_cmd \n");
/*
 * ibnal_send_recv_self_testing - HCA loopback smoke test run before normal
 * event handling (only when IBNAL_SELF_TESTING is defined).
 *
 * If *my_role == TEST_SEND_MESSAGE, posts a single 32 KB VAPI_SEND on the
 * test QP using the pre-registered send buffer; otherwise just logs that it
 * is the receiver (receive completion is handled in CQE_event_handler under
 * SEND_RECV_TEST_ID).
 *
 * NOTE(review): some declarations (sbuf/rbuf/qp/buf_id/vstat) and the
 * closing brace are elided in this extract.
 */
1934 void ibnal_send_recv_self_testing(int *my_role)
1937 VAPI_sr_desc_t sr_desc;
1938 VAPI_sg_lst_entry_t sr_sg;
1940 VAPI_wr_id_t send_id;
1945 int buf_length = KB_32;
1946 VAPI_wc_desc_t comp_desc;
1950 // make it a daemon process
1951 // kportal_daemonize("ibnal_send_recv_self_testing");
1953 printk("My role is 0X%X\n", *my_role);
1955 if(*my_role == TEST_SEND_MESSAGE) {
1956 printk("Enter ibnal_send_recv_self_testing\n");
1958 memset(&sbuf, 'a', KB_32);
1959 memset(&rbuf, ' ', KB_32);
1961 send_id = SEND_RECV_TEST_ID;
1962 buf_id = SEND_RECV_TEST_BUF_ID;
1964 qp = &QP_list[buf_id];
1966 sr_desc.opcode = VAPI_SEND;
1967 sr_desc.comp_type = VAPI_SIGNALED; // completion will raise a CQE
1968 sr_desc.id = send_id;
1970 // scatter and gather info
1972 sr_sg.lkey = MSbuf_list[buf_id].mr.l_key; // use send MR
1973 sr_sg.addr = (VAPI_virt_addr_t)(MT_virt_addr_t) MSbuf_list[buf_id].buf_addr;
1975 // copy data to the registered send buffer
// NOTE(review): this copies KB_32 bytes over the scalar sr_sg.addr field
// (buffer overflow) - almost certainly meant to copy into the registered
// send buffer that sr_sg.addr points to; confirm and fix in full file.
1976 memcpy(&sr_sg.addr, &sbuf, buf_length);
1978 sr_desc.sg_lst_p = &sr_sg;
1979 sr_desc.sg_lst_len = 1; // only 1 entry is used
1980 sr_desc.fence = TRUE;
1981 sr_desc.set_se = FALSE;
1984 // call VAPI_post_sr to send out this data
1985 vstat = VAPI_post_sr(qp->hca_hndl, qp->qp_hndl, &sr_desc);
1987 if (vstat != VAPI_OK) {
1988 printk("VAPI_post_sr failed (%s).\n",VAPI_strerror(vstat));
1991 printk("VAPI_post_sr success.\n");
1996 printk("I am a receiver and doing nothing here\n");
1999 printk("ibnal_send_recv_self_testing thread exit \n");
2007 // ibnal initialization process:
2009 // 1. Bring up the Infiniband network interface
2011 // 2. Initialize a PORTALS nal interface
/*
 * kibnal_initialize - module init entry point (see module_init below).
 *
 * Wires up the kibnal_api function pointers, zeroes and initializes the
 * global kibnal_data, opens the HCA, registers the NAL with portals via
 * PtlNIInit, and starts CQE handling (polling thread or event handler,
 * depending on build flags).
 *
 * NOTE(review): the return type, some declarations (rc, vstat, my_role),
 * several closing braces/#endif lines and the final return are elided in
 * this extract.
 */
2015 kibnal_initialize(void)
2019 unsigned long sizemask;
2024 portals_debug_set_level(IBNAL_DEBUG_LEVEL_1);
2026 CDEBUG(D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory));
2028 CDEBUG(D_PORTALS, "kibnal_initialize: Enter kibnal_initialize\n");
2030 // set api function pointers
2031 kibnal_api.startup = kibnal_startup;
2032 kibnal_api.forward = kibnal_forward;
2033 kibnal_api.shutdown = kibnal_shutdown;
2034 kibnal_api.yield = kibnal_yield;
2035 kibnal_api.lock = kibnal_lock;
2036 kibnal_api.unlock = kibnal_unlock;
2037 kibnal_api.nal_data = &kibnal_data; // this is the so-called private data
2039 memset(&kibnal_data, 0, sizeof(kibnal_data));
2041 // initialize the kib_list list data structure
2042 INIT_LIST_HEAD(&kibnal_data.kib_list);
2044 kibnal_data.kib_cb = &kibnal_lib;
2046 spin_lock_init(&kibnal_data.kib_dispatch_lock);
2050 // bring up the IB inter-connect network interface
2053 vstat = IB_Open_HCA(&kibnal_data);
2055 if(vstat != VAPI_OK) {
2056 CERROR("kibnal_initialize: IB_Open_HCA failed: %d- %s\n",
2057 vstat, VAPI_strerror(vstat));
2059 printk("kibnal_initialize: IB_Open_HCA failed: %d- %s\n",
2060 vstat, VAPI_strerror(vstat));
2064 kibnal_data.kib_nid = (__u64 )Hca_hndl;//convert Hca_hndl to 64-bit format
2065 kibnal_data.kib_init = 1;
// NOTE(review): format "0x%x%x" is given a single __u64 argument - a
// format/argument mismatch (undefined output); should be a 64-bit format
// such as LPX64/"%llx" with one argument. Fix in full file.
2067 CDEBUG(D_NET, " kibnal_data.kib_nid 0x%x%x\n", kibnal_data.kib_nid);
2068 printk(" kibnal_data.kib_nid 0x%x%x\n", kibnal_data.kib_nid);
2070 /* Network interface ready to initialise */
2071 // get an entry in the PORTALS table for this IB protocol
2073 CDEBUG(D_PORTALS,"Call PtlNIInit to register this Infiniband Interface\n");
2074 printk("Call PtlNIInit to register this Infiniband Interface\n");
2076 rc = PtlNIInit(kibnal_init, 32, 4, 0, &kibnal_ni);
2079 CERROR("kibnal_initialize: PtlNIInit failed %d\n", rc);
2080 printk("kibnal_initialize: PtlNIInit failed %d\n", rc);
2085 CDEBUG(D_PORTALS,"kibnal_initialize: PtlNIInit DONE\n");
2086 printk("kibnal_initialize: PtlNIInit DONE\n");
2090 #ifdef POLL_BASED_CQE_HANDLING
2091 // create a receiving thread: main loop
2092 // this is the polling-based main loop
2093 kernel_thread(k_recv_thread, &Hca_data, 0);
2096 #ifdef EVENT_BASED_CQE_HANDLING
2097 // for completion event handling; this is event-based CQE handling
2098 vstat = IB_Set_Event_Handler(Hca_data, &kibnal_data);
2100 if (vstat != VAPI_OK) {
2101 CERROR("IB_Set_Event_Handler failed: %d - %s \n",
2102 vstat, VAPI_strerror(vstat));
2106 CDEBUG(D_PORTALS,"IB_Set_Event_Handler Done \n");
2107 printk("IB_Set_Event_Handler Done \n");
2111 PORTAL_SYMBOL_REGISTER(kibnal_ni);
2113 #ifdef IBNAL_SELF_TESTING
2115 // test HCA send/recv before normal event handling
2118 my_role = TEST_SEND_MESSAGE;
2120 printk("my role is TEST_RECV_MESSAGE\n");
2122 // kernel_thread(ibnal_send_recv_self_testing, &my_role, 0);
2124 ibnal_send_recv_self_testing(&my_role);
/* Kernel module metadata, init/exit hooks, and exported symbol. */
2134 MODULE_AUTHOR("Hsingbung(HB) Chen <hbchen@lanl.gov>");
2135 MODULE_DESCRIPTION("Kernel Infiniband NAL v0.1");
2136 MODULE_LICENSE("GPL");
2138 module_init (kibnal_initialize);
2139 module_exit (kibnal_finalize);
2141 EXPORT_SYMBOL(kibnal_ni);