1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Based on ksocknal, qswnal, and gmnal
6 * Copyright (C) 2003 LANL
7 * Author: HB Chen <hbchen@lanl.gov>
8 * Los Alamos National Lab
10 * Portals is free software; you can redistribute it and/or
11 * modify it under the terms of version 2 of the GNU General Public
12 * License as published by the Free Software Foundation.
14 * Portals is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with Portals; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
// NOTE(review): this listing is sampled — original line numbers jump, so
// declarations between the visible lines are missing from this view.
27 // portal handle ID for this IB-NAL
28 ptl_handle_ni_t kibnal_ni;
30 // message send buffer mutex
// NOTE(review): later code (IB_Open_HCA, k_recv_thread) uses the name
// MSB_mutex[] — it does not match MSBuf_mutex declared here. One of the
// two spellings is presumably wrong; confirm against the full source.
31 spinlock_t MSBuf_mutex[NUM_MBUF];
33 // message recv buffer mutex
34 spinlock_t MRBuf_mutex[NUM_MBUF];
36 // IB-NAL API information
40 kibnal_data_t kibnal_data;
// protection-domain handle and receive-buffer counter shared by the file
43 VAPI_pd_hndl_t Pd_hndl;
44 unsigned int Num_posted_recv_buf;
46 // registered send buffer list
47 Memory_buffer_info MSbuf_list[NUM_MBUF];
49 // registered recv buffer list
50 Memory_buffer_info MRbuf_list[NUM_MBUF];
54 // currently there is no need fo IBA
// router interface hooks; initializer is truncated in this view
56 kpr_nal_interface_t kibnal_router_interface = {
58 kprni_arg: &kibnal_data,
59 kprni_fwd: kibnal_fwd_packet, // forward data to router
60 // is router involving the
// one QP_info per queue pair used by this NAL
66 QP_info QP_list[NUM_QPS];
68 // information associated with a HCA
71 // something about HCA
72 VAPI_hca_hndl_t Hca_hndl; // assume we only use one HCA now
73 VAPI_hca_vendor_t Hca_vendor;
74 VAPI_hca_cap_t Hca_cap;
75 VAPI_hca_port_t Hca_port_1_props;
76 VAPI_hca_port_t Hca_port_2_props;
77 VAPI_hca_attr_t Hca_attr;
78 VAPI_hca_attr_mask_t Hca_attr_mask;
79 VAPI_cq_hndl_t Cq_RQ_hndl; // CQ's handle
80 VAPI_cq_hndl_t Cq_SQ_hndl; // CQ's handle
81 VAPI_cq_hndl_t Cq_hndl; // CQ's handle
// local and remote queue-pair exchange records (filled by k_server_thread)
82 Remote_QP_Info L_QP_data;
83 Remote_QP_Info R_QP_data;
// Dispatch a forwarded Portals API call into the NAL library layer.
// NOTE(review): the parameter list, return type and braces are missing
// from this sampled view; visible parameters are nal, id, args, ret.
90 kibnal_forward(nal_t *nal,
97 kibnal_data_t *knal_data = nal->nal_data;
98 nal_cb_t *nal_cb = knal_data->kib_cb;
// sanity: this NAL only ever runs with its own static api/data/lib objects
101 LASSERT (nal == &kibnal_api);
102 LASSERT (knal_data == &kibnal_data);
103 LASSERT (nal_cb == &kibnal_lib);
105 // dispatch forward API function
107 CDEBUG(D_NET,"kibnal_forward: function id = %d\n", id);
109 lib_dispatch(nal_cb, knal_data, id, args, ret);
111 CDEBUG(D_TRACE,"IBNAL- Done kibnal_forward\n");
113 return PTL_OK; // always return PTL_OK
// Take the NAL lock by delegating to the library's cb_cli callback;
// *flags receives the saved interrupt state for the matching unlock.
// NOTE(review): return type and braces not visible in this sampled view.
120 kibnal_lock(nal_t *nal, unsigned long *flags)
122 kibnal_data_t *knal_data = nal->nal_data;
123 nal_cb_t *nal_cb = knal_data->kib_cb;
126 LASSERT (nal == &kibnal_api);
127 LASSERT (knal_data == &kibnal_data);
128 LASSERT (nal_cb == &kibnal_lib);
130 // disable logical interrupt
131 nal_cb->cb_cli(nal_cb,flags);
133 CDEBUG(D_TRACE,"IBNAL-Done kibnal_lock\n")
// Release the NAL lock via the library's cb_sti callback, restoring the
// interrupt state previously saved into *flags by kibnal_lock.
141 kibnal_unlock(nal_t *nal, unsigned long *flags)
143 kibnal_data_t *k = nal->nal_data;
144 nal_cb_t *nal_cb = k->kib_cb;
147 LASSERT (nal == &kibnal_api);
148 LASSERT (k == &kibnal_data);
149 LASSERT (nal_cb == &kibnal_lib);
151 // enable logical interrupt
152 nal_cb->cb_sti(nal_cb,flags);
// NOTE(review): trace message lacks a trailing \n unlike its siblings
154 CDEBUG(D_TRACE,"IBNAL-Done kibnal_unlock")
160 // shutdown this network interface
// Tear down the IB network interface by closing the HCA; the lib layer
// has no corresponding callback, so teardown is done directly here.
163 kibnal_shutdown(nal_t *nal, int ni)
166 kibnal_data_t *k = nal->nal_data;
167 nal_cb_t *nal_cb = k->kib_cb;
170 LASSERT (nal == &kibnal_api);
171 LASSERT (k == &kibnal_data);
172 LASSERT (nal_cb == &kibnal_lib);
174 // take down this IB network interface
175 // there is no corresponding cb function to handle this
176 // do we actually need this one
177 // reference to IB network interface shutdown
// close the HCA; on failure we only log — there is no recovery path here
180 vstat = IB_Close_HCA();
182 if (vstat != VAPI_OK) {
183 CERROR("Failed to close HCA - %s\n",VAPI_strerror(vstat));
187 CDEBUG(D_TRACE,"IBNAL- Done kibnal_shutdown\n")
194 // when do we call this yield function
// Cooperative yield: only the non-blocking (milliseconds == 0) case is
// implemented; a blocking yield is reported as unimplemented.
197 kibnal_yield( nal_t *nal, unsigned long *flags, int milliseconds )
199 kibnal_data_t *k = nal->nal_data;
200 nal_cb_t *nal_cb = k->kib_cb;
203 LASSERT (nal == &kibnal_api);
204 LASSERT (k == &kibnal_data);
205 LASSERT (nal_cb == &kibnal_lib);
207 if (milliseconds != 0) {
208 CERROR("Blocking yeild not implemented yet\n");
212 // check under what condition that we need to
214 // who set this need_resched
// NOTE(review): current->need_resched is the pre-2.6 kernel field; newer
// kernels use need_resched()/TIF_NEED_RESCHED — confirm target kernel.
215 if (current->need_resched)
218 CDEBUG(D_TRACE,"IBNAL-Done kibnal_yield")
// Initialize the NAL: hand the static lib object, table sizes and pid to
// lib_init(). The interface and requested_pid parameters are unused here.
// NOTE(review): the lib_init() argument list is truncated in this view.
227 kibnal_init(int interface, // no use here
228 ptl_pt_index_t ptl_size,
229 ptl_ac_index_t ac_size,
230 ptl_pid_t requested_pid // no use here
234 nal_cb_t *nal_cb = NULL;
235 kibnal_data_t *nal_data = NULL;
238 unsigned int nnids = 1; // number of nids
239 // do we know how many nodes are in this
240 // system related to this kib_nid
243 CDEBUG(D_NET, "kibnal_init:calling lib_init with nid 0x%u\n",
244 kibnal_data.kib_nid);
247 CDEBUG(D_NET, "kibnal_init: interface [%d], ptl_size [%d], ac_size[%d]\n",
248 interface, ptl_size, ac_size);
249 CDEBUG(D_NET, "kibnal_init: &kibnal_lib 0x%X\n", &kibnal_lib);
250 CDEBUG(D_NET, "kibnal_init: kibnal_data.kib_nid %d\n", kibnal_data.kib_nid);
252 rc = lib_init(&kibnal_lib,
254 0, // process id is set as 0
// on failure, log the nid and return code
260 CERROR("kibnal_init: Failed lib_init with nid 0x%u, rc=%d\n",
261 kibnal_data.kib_nid,rc);
// NOTE(review): "0x%x%x" with a single argument looks like a format-string
// bug — the second %x has no matching argument; confirm in the full source.
264 CDEBUG(D_NET,"kibnal_init: DONE lib_init with nid 0x%x%x\n",
265 kibnal_data.kib_nid);
274 // called before remove ibnal kernel module
// Module-unload cleanup: unregister the inter-module handle, shut down the
// Portals NI, finalize the lib layer, then free every queued receive conn.
277 kibnal_finalize(void)
279 struct list_head *tmp;
281 inter_module_unregister("kibnal_ni");
283 // release resources allocated to this Infiniband network interface
284 PtlNIFini(kibnal_ni);
286 lib_fini(&kibnal_lib);
290 // how much do we need to do here?
// NOTE(review): list_for_each() is unsafe while deleting entries; the
// full source may use list_for_each_safe — confirm, since krx_item is
// unlinked inside the loop.
291 list_for_each(tmp, &kibnal_data.kib_list) {
293 conn = list_entry(tmp, kibnal_rx_t, krx_item);
294 CDEBUG(D_IOCTL, "freeing conn %p\n",conn);
296 list_del(&conn->krx_item);
297 PORTAL_FREE(conn, sizeof(*conn));
300 CDEBUG(D_MALLOC,"done kmem %d\n",atomic_read(&portal_kmemory));
301 CDEBUG(D_TRACE,"IBNAL-Done kibnal_finalize\n")
308 // * k_server_thread is a kernel thread
309 // use a shared memory to exchange HCA's data with a pthread in user
311 // * will be replaced when CM is used to handle communication management
// Kernel thread: publishes local HCA/QP info into a SysV shared-memory
// segment (key HCA_EXCHANGE_SHM_KEY) and then polls that segment until a
// user-space agent writes back the remote node's QP info (RECV_QP_INFO).
314 void k_server_thread(Remote_QP_Info *hca_data)
317 const int shared_segment_size = sizeof(Remote_QP_Info);
318 key_t key = HCA_EXCHANGE_SHM_KEY;
320 int exchanged_done = NO;
323 Remote_QP_Info *exchange_hca_data;
329 // create a shared memory with pre-agreement key
330 segment_id = sys_shmget(key,
335 // attach to shared memory
336 // raddr is pointed to an user address space
337 // use this address to update shared memory content
338 ret = sys_shmat(segment_id, 0 , SHM_RND, &raddr);
342 CDEBUG(D_NET,"k_server_thread: Shared memory attach success ret = 0X%d,&raddr"
343 " 0X%x (*(&raddr))=0x%x \n", ret, &raddr, (*(&raddr)));
344 printk("k_server_thread: Shared memory attach success ret = 0X%d, &raddr"
345 " 0X%x (*(&raddr))=0x%x \n", ret, &raddr, (*(&raddr)));
348 CERROR("k_server_thread: Shared memory attach failed ret = 0x%d \n", ret);
349 printk("k_server_thread: Shared memory attach failed ret = 0x%d \n", ret);
// NOTE(review): 'n' is not declared in the visible lines; presumably set
// from raddr above — confirm against the full source.
355 uaddr = *n; // get the U-address
356 /* cast uaddr to exchange_hca_data */
357 exchange_hca_data = (Remote_QP_Info *) uaddr;
359 /* copy data from local HCA to shared memory */
360 exchange_hca_data->opcode = hca_data->opcode;
361 exchange_hca_data->length = hca_data->length;
// publish per-QP lid and qp number for the remote side to read
363 for(i=0; i < NUM_QPS; i++) {
364 exchange_hca_data->dlid[i] = hca_data->dlid[i];
365 exchange_hca_data->rqp_num[i] = hca_data->rqp_num[i];
368 // periodically check shared memory until get updated
369 // remote HCA's data from user mode pthread
370 while(exchanged_done == NO) {
371 if(exchange_hca_data->opcode == RECV_QP_INFO){
372 exchanged_done = YES;
373 /* copy data to local buffer from shared memory */
374 hca_data->opcode = exchange_hca_data->opcode;
375 hca_data->length = exchange_hca_data->length;
377 for(i=0; i < NUM_QPS; i++) {
378 hca_data->dlid[i] = exchange_hca_data->dlid[i];
379 hca_data->rqp_num[i] = exchange_hca_data->rqp_num[i];
// busy-wait throttle between polls of the shared segment
384 schedule_timeout(1000);
388 // detach shared memory
391 CDEBUG(D_NET, "Exit from kernel thread: k_server_thread \n");
392 printk("Exit from kernel thread: k_server_thread \n")
// Create one reliable-connection (RC) queue pair on HCA port 1 and record
// its handle/number and bookkeeping fields in *qp. qp_index is the slot in
// QP_list (visible here only for identification/debugging).
402 create_qp(QP_info *qp, int qp_index)
406 VAPI_qp_init_attr_t qp_init_attr;
407 VAPI_qp_prop_t qp_prop;
409 qp->hca_hndl = Hca_hndl;
410 qp->port = 1; // default
411 qp->slid = Hca_port_1_props.lid;
412 qp->hca_port = Hca_port_1_props;
415 /* Queue Pair Creation Attributes */
416 qp_init_attr.cap.max_oust_wr_rq = NUM_WQE;
417 qp_init_attr.cap.max_oust_wr_sq = NUM_WQE;
418 qp_init_attr.cap.max_sg_size_rq = NUM_SG;
419 qp_init_attr.cap.max_sg_size_sq = NUM_SG;
420 qp_init_attr.pd_hndl = qp->pd_hndl;
421 qp_init_attr.rdd_hndl = 0;
422 qp_init_attr.rq_cq_hndl = qp->rq_cq_hndl;
423 /* we use here polling */
424 //qp_init_attr.rq_sig_type = VAPI_SIGNAL_REQ_WR;
425 qp_init_attr.rq_sig_type = VAPI_SIGNAL_ALL_WR;
426 qp_init_attr.sq_cq_hndl = qp->sq_cq_hndl;
427 /* we use here polling */
428 //qp_init_attr.sq_sig_type = VAPI_SIGNAL_REQ_WR;
429 qp_init_attr.sq_sig_type = VAPI_SIGNAL_ALL_WR;
430 // transport service - reliable connection
432 qp_init_attr.ts_type = VAPI_TS_RC;
434 vstat = VAPI_create_qp(qp->hca_hndl,
436 &qp->qp_hndl, &qp_prop);
438 if (vstat != VAPI_OK) {
439 CERROR("Failed creating QP. Return Failed - %s\n",VAPI_strerror(vstat));
443 qp->qp_num = qp_prop.qp_num; // the qp number
444 qp->last_posted_send_id = 0; // user defined work request ID
445 qp->last_posted_rcv_id = 0; // user defined work request ID
446 qp->cur_send_outstanding = 0;
447 qp->cur_posted_rcv_bufs = 0;
448 qp->snd_rcv_balance = 0;
450 CDEBUG(D_OTHER, "create_qp: qp_num = %d, slid = %d, qp_hndl = 0X%X",
451 qp->qp_num, qp->slid, qp->qp_hndl);
453 // initialize spin-lock mutex variables
454 spin_lock_init(&(qp->snd_mutex));
455 spin_lock_init(&(qp->rcv_mutex));
456 spin_lock_init(&(qp->bl_mutex));
457 spin_lock_init(&(qp->cln_mutex));
458 // number of outstanding requests on the send Q
// NOTE(review): the three counters below were already zeroed above
// (orig. lines 446-448) — this second initialization is redundant.
459 qp->cur_send_outstanding = 0;
460 // number of posted receive buffers
461 qp->cur_posted_rcv_bufs = 0;
462 qp->snd_rcv_balance = 0;
469 // initialize a UD qp state to RTR and RTS
// Walk an unreliable-datagram QP through the standard state machine
// RESET -> INIT -> RTR -> RTS, querying the QP after each transition.
// NOTE(review): the VAPI_modify_qp / VAPI_query_qp argument lists are
// truncated in this sampled view.
472 init_qp_UD(QP_info *qp, int qp_index)
474 VAPI_qp_attr_t qp_attr;
475 VAPI_qp_init_attr_t qp_init_attr;
476 VAPI_qp_attr_mask_t qp_attr_mask;
477 VAPI_qp_cap_t qp_cap;
480 /* Move from RST to INIT */
481 /* Change QP to INIT */
483 CDEBUG(D_OTHER, "Changing QP state to INIT qp-index = %d\n", qp_index);
485 QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
487 qp_attr.qp_state = VAPI_INIT;
488 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
490 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
493 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PKEY_IX);
495 CDEBUG(D_OTHER, "pkey_ix qp_attr_mask = 0X%x\n", qp_attr_mask);
497 qp_attr.port = qp->port;
498 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PORT);
500 CDEBUG(D_OTHER, "port qp_attr_mask = 0X%x\n", qp_attr_mask);
// UD QPs additionally need a Q_Key at INIT
503 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QKEY);
505 CDEBUG(D_OTHER, "qkey qp_attr_mask = 0X%x\n", qp_attr_mask);
507 /* If I do not set this mask, I get an error from HH. QPM should catch it */
509 vstat = VAPI_modify_qp(qp->hca_hndl,
515 if (vstat != VAPI_OK) {
516 CERROR("Failed modifying QP from RST to INIT. %s\n",VAPI_strerror(vstat));
520 CDEBUG(D_OTHER, "Modifying QP from RST to INIT.\n");
// read back the QP state to confirm the transition took effect
522 vstat= VAPI_query_qp(qp->hca_hndl,
528 if (vstat != VAPI_OK) {
529 CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
533 /* Move from INIT to RTR */
534 /* Change QP to RTR */
535 CDEBUG(D_OTHER, "Changing QP state to RTR\n");
537 QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
539 qp_attr.qp_state = VAPI_RTR;
540 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
542 CDEBUG(D_OTHER, "INIT to RTR- qp_state : qp_attr_mask = 0X%x\n", qp_attr_mask);
544 vstat = VAPI_modify_qp(qp->hca_hndl,
550 if (vstat != VAPI_OK) {
551 CERROR("Failed modifying QP from INIT to RTR. %s\n",VAPI_strerror(vstat));
555 CDEBUG(D_OTHER, "Modifying QP from INIT to RTR.\n");
557 vstat= VAPI_query_qp(qp->hca_hndl,
563 if (vstat != VAPI_OK) {
564 CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
568 /* RTR to RTS - Change QP to RTS */
569 CDEBUG(D_OTHER, "Changing QP state to RTS\n");
571 QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
573 qp_attr.qp_state = VAPI_RTS;
574 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
// the send-queue packet sequence number starts at START_SQ_PSN
576 qp_attr.sq_psn = START_SQ_PSN;
577 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_SQ_PSN);
579 vstat = VAPI_modify_qp(qp->hca_hndl,
585 if (vstat != VAPI_OK) {
586 CERROR("Failed modifying QP from RTR to RTS. %s:%s\n",
587 VAPI_strerror_sym(vstat),
588 VAPI_strerror(vstat));
592 CDEBUG(D_OTHER, "Modifying QP from RTR to RTS. \n");
594 vstat= VAPI_query_qp(qp->hca_hndl,
600 if (vstat != VAPI_OK) {
601 CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
606 // a QP is at RTS state NOW
609 CDEBUG(D_OTHER, "IBNAL- UD qp is at RTS NOW\n")
618 // initialize a RC qp state to RTR and RTS
619 // RC transport service
// Walk a reliable-connection QP through RESET -> INIT -> RTR -> RTS.
// RTR needs the remote side's address vector (dlid) and dest_qp_num, which
// must already be stored in *qp by the QP-info exchange; RTS sets the
// retry/timeout parameters. The QP is queried after each transition.
622 init_qp_RC(QP_info *qp, int qp_index)
624 VAPI_qp_attr_t qp_attr;
625 VAPI_qp_init_attr_t qp_init_attr;
626 VAPI_qp_attr_mask_t qp_attr_mask;
627 VAPI_qp_cap_t qp_cap;
630 /* Move from RST to INIT */
631 /* Change QP to INIT */
633 CDEBUG(D_OTHER, "Changing QP state to INIT qp-index = %d\n", qp_index);
635 QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
637 qp_attr.qp_state = VAPI_INIT;
638 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
640 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
643 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PKEY_IX);
645 CDEBUG(D_OTHER, "pkey_ix qp_attr_mask = 0X%x\n", qp_attr_mask);
647 qp_attr.port = qp->port;
648 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PORT);
650 CDEBUG(D_OTHER, "port qp_attr_mask = 0X%x\n", qp_attr_mask);
// enable incoming RDMA WRITE and READ on this QP
652 qp_attr.remote_atomic_flags = VAPI_EN_REM_WRITE | VAPI_EN_REM_READ;
653 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_REMOTE_ATOMIC_FLAGS);
655 CDEBUG(D_OTHER, "remote_atomic_flags qp_attr_mask = 0X%x\n", qp_attr_mask);
657 /* If I do not set this mask, I get an error from HH. QPM should catch it */
659 vstat = VAPI_modify_qp(qp->hca_hndl,
665 if (vstat != VAPI_OK) {
666 CERROR("Failed modifying QP from RST to INIT. %s\n",VAPI_strerror(vstat));
670 vstat= VAPI_query_qp(qp->hca_hndl,
676 if (vstat != VAPI_OK) {
677 CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
681 /* Move from INIT to RTR */
682 /* Change QP to RTR */
683 CDEBUG(D_OTHER, "Changing QP state to RTR qp_indexi %d\n", qp_index);
685 QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
686 qp_attr.qp_state = VAPI_RTR;
688 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
690 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
// address vector for the remote peer: service level 0, no GRH,
// destination LID taken from the earlier QP-info exchange
692 qp_attr.av.sl = 0;/* RESPONDER_SL */
693 qp_attr.av.grh_flag = FALSE;
694 qp_attr.av.dlid = qp->dlid;/*RESPONDER_LID;*/
695 qp_attr.av.static_rate = 0;
696 qp_attr.av.src_path_bits = 0;
697 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_AV);
699 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
701 qp_attr.path_mtu = MTU_2048;// default is MTU_2048
702 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PATH_MTU);
704 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
// expected first receive-queue packet sequence number
706 qp_attr.rq_psn = START_RQ_PSN;
707 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_RQ_PSN);
709 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
711 qp_attr.qp_ous_rd_atom = NUM_WQE;
712 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_OUS_RD_ATOM);
714 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
717 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PKEY_IX);
719 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
721 qp_attr.min_rnr_timer = 10;
722 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_MIN_RNR_TIMER);
724 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
// remote QP number obtained from the peer during the exchange phase
726 qp_attr.dest_qp_num = qp->rqp_num;
728 CDEBUG(D_OTHER, "remore qp num %d\n", qp->rqp_num);
730 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_DEST_QP_NUM);
732 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
734 vstat = VAPI_modify_qp(qp->hca_hndl,
741 if (vstat != VAPI_OK) {
742 CERROR("Failed modifying QP from INIT to RTR. qp_index %d - %s\n",
743 qp_index, VAPI_strerror(vstat));
747 vstat= VAPI_query_qp(qp->hca_hndl,
753 if (vstat != VAPI_OK) {
754 CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
758 /* RTR to RTS - Change QP to RTS */
759 CDEBUG(D_OTHER, "Changing QP state to RTS\n");
761 QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
763 qp_attr.qp_state = VAPI_RTS;
764 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
766 qp_attr.sq_psn = START_SQ_PSN;
767 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_SQ_PSN);
// local ACK timeout (4.096us * 2^0x18), retry/RNR-retry counts and
// outstanding-RDMA limits for the send side
769 qp_attr.timeout = 0x18;
770 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_TIMEOUT);
772 qp_attr.retry_count = 10;
773 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_RETRY_COUNT);
775 qp_attr.rnr_retry = 14;
776 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_RNR_RETRY);
778 qp_attr.ous_dst_rd_atom = 100;
779 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_OUS_DST_RD_ATOM);
781 qp_attr.min_rnr_timer = 5;
782 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_MIN_RNR_TIMER);
784 vstat = VAPI_modify_qp(qp->hca_hndl,
790 if (vstat != VAPI_OK) {
791 CERROR("Failed modifying QP from RTR to RTS. %s:%s\n",
792 VAPI_strerror_sym(vstat), VAPI_strerror(vstat));
796 vstat= VAPI_query_qp(qp->hca_hndl,
802 if (vstat != VAPI_OK) {
803 CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
808 // a QP is at RTS state NOW
811 CDEBUG(D_OTHER, "IBNAL- RC qp is at RTS NOW\n")
// Bring up the whole IB stack for this NAL: open the HCA, query its
// capabilities and both ports, allocate a PD, register memory regions,
// create one shared CQ, create and connect NUM_QPS RC QPs (exchanging QP
// info with the remote node via k_server_thread), post receive buffers,
// and install the asynchronous event handler.
819 IB_Open_HCA(kibnal_data_t *kib_data)
823 VAPI_cqe_num_t cqe_active_num;
// NOTE(review): this local shadows the file-scope Num_posted_recv_buf
// declared near the top of the file — the global is never updated here.
826 int Num_posted_recv_buf;
829 CDEBUG(D_PORTALS, "Opening an HCA\n");
// NOTE(review): the return value of VAPI_open_hca is immediately
// overwritten by EVAPI_get_hca_hndl, so an open failure other than
// "already open" would go unnoticed here.
831 vstat = VAPI_open_hca(HCA_ID, &Hca_hndl);
832 vstat = EVAPI_get_hca_hndl(HCA_ID, &Hca_hndl);
833 if (vstat != VAPI_OK) {
834 CERROR("Failed opening the HCA: %s. %s...\n",HCA_ID,VAPI_strerror(vstat));
839 vstat = VAPI_query_hca_cap(Hca_hndl, &Hca_vendor, &Hca_cap);
840 if (vstat != VAPI_OK) {
841 CERROR("Failed query hca cap %s\n",VAPI_strerror(vstat));
845 /* Get port 1 info */
846 vstat = VAPI_query_hca_port_prop(Hca_hndl, HCA_PORT_1 , &Hca_port_1_props);
847 if (vstat != VAPI_OK) {
848 CERROR("Failed query port cap %s\n",VAPI_strerror(vstat));
852 /* Get port 2 info */
853 vstat = VAPI_query_hca_port_prop(Hca_hndl, HCA_PORT_2, &Hca_port_2_props);
854 if (vstat != VAPI_OK) {
855 CERROR("Failed query port cap %s\n",VAPI_strerror(vstat));
860 CDEBUG(D_PORTALS, "Allocating PD \n");
861 vstat = VAPI_alloc_pd(Hca_hndl,&Pd_hndl);
862 if (vstat != VAPI_OK) {
863 CERROR("Failed allocating a PD. %s\n",VAPI_strerror(vstat));
// register the pre-allocated send/recv buffer pools with this PD
867 vstat = createMemRegion(Hca_hndl, Pd_hndl);
868 if (vstat != VAPI_OK) {
869 CERROR("Failed registering a memory region.%s\n",VAPI_strerror(vstat));
873 /* Create CQ for RQ*/
874 CDEBUG(D_PORTALS, "Creating a send completion queue\n");
876 vstat = VAPI_create_cq(Hca_hndl,
881 if (vstat != VAPI_OK) {
882 CERROR("Failed creating a CQ. %s\n",VAPI_strerror(vstat));
886 if(NUM_CQE == cqe_active_num) {
887 CERROR("VAPI_create_cq: NUM_CQE EQ cqe_active_num \n");
890 CDEBUG(D_NET, "VAPI_create_cq: NUM_CQE %d , actual cqe_active_num %d \n",
891 NUM_CQE, cqe_active_num);
// a single CQ is shared by both send and receive queues
894 Cq_SQ_hndl = Cq_hndl;
895 Cq_RQ_hndl = Cq_hndl;
900 for(i=0; i < NUM_QPS; i++) {
901 QP_list[i].pd_hndl = Pd_hndl;
902 QP_list[i].hca_hndl = Hca_hndl;
903 // sq rq use the same Cq_hndl
904 QP_list[i].sq_cq_hndl = Cq_hndl;
905 QP_list[i].rq_cq_hndl = Cq_hndl;
906 vstat = create_qp(&QP_list[i], i);
907 if (vstat != VAPI_OK) {
908 CERROR("Failed creating a QP %d %s\n",i, VAPI_strerror(vstat));
// snapshot of everything the event handlers / recv thread need
917 Hca_data.hca_hndl = Hca_hndl; // HCA handle
918 Hca_data.pd_hndl = Pd_hndl; // protection domain
919 Hca_data.port = 1; // port number
920 Hca_data.num_qp = NUM_QPS; // number of qp used
922 for(i=0; i < NUM_QPS; i++) {
923 Hca_data.qp_ptr[i] = &QP_list[i]; // point to QP_list
926 Hca_data.num_cq = NUM_CQ; // number of cq used
927 Hca_data.cq_hndl = Cq_hndl; //
928 Hca_data.sq_cq_hndl = Cq_SQ_hndl; //
929 Hca_data.rq_cq_hndl = Cq_RQ_hndl; //
930 Hca_data.kib_data = kib_data; //
931 Hca_data.slid = QP_list[0].slid;//
935 #ifdef USE_SHARED_MEMORY_AND_SOCKET
938 * + use a shared-memory between a user thread and a kernel thread
939 * for HCA's data exchange on the same node
940 * + use socket in user mode to exchange HCA's data with a remote node
// advertise our own slid/qp_num so the peer can address us
944 R_QP_data.opcode = SEND_QP_INFO;
945 R_QP_data.length = sizeof(L_QP_data);
947 for(i=0; i < NUM_QPS; i++) {
948 // my slid will be used in a remote node as dlid
949 R_QP_data.dlid[i] = QP_list[i].slid;
950 // my qp_num will be used in remote node as remote_qp_number
951 // RC is used here so we need dlid and rqp_num
952 R_QP_data.rqp_num[i] = QP_list[i].qp_num ;
955 // create a kernel thread for exchanging HCA's data
956 // R_QP_data will be exchanged with a remote node
958 kernel_thread(k_server_thread, &R_QP_data, 0); //
959 // check if the HCA'data have been updated by kernel_thread
960 // loop until the HCA's data is updated
961 // make sure that uagent is running
963 // QP info is exchanged with a remote node
965 schedule_timeout(1000);
966 if(R_QP_data.opcode == RECV_QP_INFO) {
967 CDEBUG(D_NET, "HCA's data is being updated\n");
974 #ifdef USE_SHARED_MEMORY_AND_MULTICAST
977 * + use a shared-memory between a user thread and a kernel thread
978 * for HCA's data exchange on the same node
979 * + use Infiniband UR/multicast in user mode to exchange HCA's data with i
// copy the peer's answers into each local QP record
988 for(i=0; i < NUM_QPS; i++) {
989 qp = (QP_info *) &QP_list[i];
990 QP_list[i].rqp_num = R_QP_data.rqp_num[i]; // remote qp number
991 QP_list[i].dlid = R_QP_data.dlid[i]; // remote dlid
994 // already have remote_qp_num and dlid information
995 // initialize QP to RTR/RTS state
997 for(i=0; i < NUM_QPS; i++) {
998 vstat = init_qp_RC(&QP_list[i], i);
999 if (vstat != VAPI_OK) {
1000 CERROR("Failed change a QP %d to RTS state%s\n",
1001 i,VAPI_strerror(vstat));
1006 // post receiving buffer before any send happened
1008 Num_posted_recv_buf = post_recv_bufs( (VAPI_wr_id_t ) START_RECV_WRQ_ID);
1010 // for irregular completion event or some unexpected failure event
1011 vstat = IB_Set_Async_Event_Handler(Hca_data, &kibnal_data);
1012 if (vstat != VAPI_OK) {
1013 CERROR("IB_Set_Async_Event_Handler failed: %d\n", vstat);
1018 CDEBUG(D_PORTALS, "IBNAL- done with IB_Open_HCA\n");
1020 for(i=0; i < NUM_MBUF; i++) {
// NOTE(review): MSB_mutex does not match the MSBuf_mutex array declared
// at the top of this file — confirm which spelling the full source uses.
1021 spin_lock_init(&MSB_mutex[i]);
1030 Function: IB_Set_Event_Handler()
1032 IN Hca_info hca_data
1033 IN kibnal_data_t *kib_data -- private data
1036 return: VAPI_OK - success
// Register a completion-event handler on the shared CQ and request
// notification for the next completion. kib_data is passed through as
// the handler's private context.
1042 IB_Set_Event_Handler(HCA_info hca_data, kibnal_data_t *kib_data)
1045 EVAPI_compl_handler_hndl_t comp_handler_hndl;
1047 // register CQE_Event_Handler
1049 vstat = VAPI_set_comp_event_handler(hca_data.hca_hndl,
1054 or use extended VAPI function
1055 vstat = EVAPI_set_comp_eventh(hca_data.hca_hndl,
1063 if (vstat != VAPI_OK) {
1064 CERROR("IB_Set_Event_Handler: failed EVAPI_set_comp_eventh for"
1065 " HCA ID = %s (%s).\n", HCA_ID, VAPI_strerror(vstat));
1069 // issue a request for completion event notification
1070 vstat = VAPI_req_comp_notif(hca_data.hca_hndl,
1074 if (vstat != VAPI_OK) {
1075 CERROR("IB_Set_Event_Handler: failed VAPI_req_comp_notif for HCA ID"
1076 " = %s (%s).\n", HCA_ID, VAPI_strerror(vstat));
1085 Function: IB_Set_Async_Event_Handler()
1087 IN HCA_info hca_data
1088 IN kibnal_data_t *kib_data -- private data
1091 return: VAPI_OK - success
// Install async_event_handler for unexpected/irregular HCA events
// (affiliated and unaffiliated errors) on this HCA.
1098 IB_Set_Async_Event_Handler(HCA_info hca_data, kibnal_data_t *kib_data)
1103 // register an asynchronous event handler for this HCA
1106 vstat= VAPI_set_async_event_handler(hca_data.hca_hndl,
1107 async_event_handler,
1110 if (vstat != VAPI_OK) {
1111 CERROR("IB_Set_Async_Event_Handler: failed VAPI_set_async_comp_event_handler"
1112 " for HCA ID = %s (%s).\n", HCA_ID, VAPI_strerror(vstat));
1120 // close this Infiniband HCA interface
1121 // release allocated resources to system
// Teardown in reverse order of IB_Open_HCA: destroy QPs, destroy the
// (shared) CQ, deregister memory regions, close the HCA.
// NOTE(review): the function signature is not visible in this sampled
// view — presumably this is IB_Close_HCA(), which kibnal_shutdown calls.
1132 CDEBUG(D_PORTALS, "Destroying QP\n");
1134 for(i=0; i < NUM_QPS; i++) {
1135 vstat = VAPI_destroy_qp(QP_list[i].hca_hndl, QP_list[i].qp_hndl);
1136 if (vstat != VAPI_OK) {
1137 CERROR("Failed destroying QP %d. %s\n", i, VAPI_strerror(vstat));
1144 CDEBUG(D_PORTALS, "Destroying CQ\n");
// NOTE(review): every QP shares one CQ, yet this loop calls
// VAPI_destroy_cq once per QP — iterations after the first would act on
// an already-destroyed handle; confirm against the full source.
1145 for(i=0; i < NUM_QPS; i++) {
1146 // send_cq and receive_cq are shared the same CQ
1147 // so only destroy one of them
1148 vstat = VAPI_destroy_cq(QP_list[i].hca_hndl, QP_list[i].sq_cq_hndl);
1149 if (vstat != VAPI_OK) {
1150 CERROR("Failed destroying CQ %d. %s\n", i, VAPI_strerror(vstat));
1157 /* Destroy Memory Region */
1158 CDEBUG(D_PORTALS, "Deregistering MR\n");
1159 for(i=0; i < NUM_QPS; i++) {
1160 vstat = deleteMemRegion(&QP_list[i], i);
1161 if (vstat != VAPI_OK) {
1162 CERROR("Failed deregister mem reg %d. %s\n",i, VAPI_strerror(vstat));
1172 CDEBUG(D_PORTALS, "Closing HCA\n");
1173 vstat = VAPI_close_hca(Hca_hndl);
1174 if (vstat != VAPI_OK) {
1175 CERROR("Failed to close HCA. %s\n", VAPI_strerror(vstat));
1180 CDEBUG(D_PORTALS, "IBNAL- Done with closing HCA \n")
// Allocate and register the fixed pools of send (MSbuf_list) and receive
// (MRbuf_list) buffers with the given PD, then mirror the send-buffer
// registration info into QP_list. Entries NUM_ENTRY..NUM_MBUF-1 of the
// send list are left unregistered for later on-demand RDMA buffers.
// Returns VAPI_OK on success, VAPI_ENOMEM when an allocation fails.
1187 createMemRegion(VAPI_hca_hndl_t hca_hndl,
1188 VAPI_pd_hndl_t pd_hndl)
1193 VAPI_mr_hndl_t rep_mr_hndl;
1198 // send registered memory region
1199 for(i=0; i < NUM_ENTRY; i++) {
1200 MSbuf_list[i].buf_size = KB_32;
1201 PORTAL_ALLOC(bufptr, MSbuf_list[i].buf_size);
1202 if(bufptr == NULL) {
1203 CDEBUG(D_MALLOC,"Failed to malloc a block of send memory, qix %d size %d\n",
1204 i, MSbuf_list[i].buf_size);
1205 CERROR("Failed to malloc a block of send memory, qix %d size %d\n",
1206 i, MSbuf_list[i].buf_size);
// NOTE(review): buffers registered by earlier iterations are not
// released on this early-return path.
1207 return(VAPI_ENOMEM);
1211 mrw.pd_hndl= pd_hndl;
1212 mrw.start = MSbuf_list[i].buf_addr = (VAPI_virt_addr_t)(MT_virt_addr_t) bufptr;
1213 mrw.size = MSbuf_list[i].buf_size;
1214 mrw.acl = VAPI_EN_LOCAL_WRITE |
1215 VAPI_EN_REMOTE_WRITE |
1216 VAPI_EN_REMOTE_READ;
1218 // register send memory region
1219 vstat = VAPI_register_mr(hca_hndl,
1224 // this memory region is going to be reused until deregister is called
1225 if(vstat != VAPI_OK) {
1226 CERROR("Failed registering a mem region qix %d Addr=%p, Len=%d. %s\n",
1227 i, mrw.start, mrw.size, VAPI_strerror(vstat));
// record the registration so senders can use this slot directly
1231 MSbuf_list[i].mr = rep_mr;
1232 MSbuf_list[i].mr_hndl = rep_mr_hndl;
1233 MSbuf_list[i].bufptr = bufptr;
1234 MSbuf_list[i].buf_addr = rep_mr.start;
1235 MSbuf_list[i].status = BUF_REGISTERED;
1236 MSbuf_list[i].ref_count = 0;
1237 MSbuf_list[i].buf_type = REG_BUF;
1238 MSbuf_list[i].raddr = 0x0;
1239 MSbuf_list[i].rkey = 0x0;
1242 // RDMA buffer is not reserved for RDMA WRITE/READ
1244 for(i=NUM_ENTRY; i< NUM_MBUF; i++) {
1245 MSbuf_list[i].status = BUF_UNREGISTERED;
1246 MSbuf_list[i].buf_type = RDMA_BUF;
1250 // recv registered memory region
1251 for(i=0; i < NUM_ENTRY; i++) {
1252 MRbuf_list[i].buf_size = KB_32;
1253 PORTAL_ALLOC(bufptr, MRbuf_list[i].buf_size);
1255 if(bufptr == NULL) {
1256 CDEBUG(D_MALLOC, "Failed to malloc a block of send memory, qix %d size %d\n",
1257 i, MRbuf_list[i].buf_size);
1258 return(VAPI_ENOMEM);
1262 mrw.pd_hndl= pd_hndl;
1263 mrw.start = (VAPI_virt_addr_t)(MT_virt_addr_t) bufptr;
1264 mrw.size = MRbuf_list[i].buf_size;
1265 mrw.acl = VAPI_EN_LOCAL_WRITE |
1266 VAPI_EN_REMOTE_WRITE |
1267 VAPI_EN_REMOTE_READ;
1269 // register recv memory region
1270 vstat = VAPI_register_mr(hca_hndl,
1275 // this memory region is going to be reused until deregister is called
1276 if(vstat != VAPI_OK) {
1277 CERROR("Failed registering a mem region qix %d Addr=%p, Len=%d. %s\n",
1278 i, mrw.start, mrw.size, VAPI_strerror(vstat));
1282 MRbuf_list[i].mr = rep_mr;
1283 MRbuf_list[i].mr_hndl = rep_mr_hndl;
1284 MRbuf_list[i].bufptr = bufptr;
1285 MRbuf_list[i].buf_addr = rep_mr.start;
1286 MRbuf_list[i].status = BUF_REGISTERED;
1287 MRbuf_list[i].ref_count = 0;
1288 MRbuf_list[i].buf_type = REG_BUF;
1289 MRbuf_list[i].raddr = 0x0;
// receive buffers keep both keys so remote peers can RDMA into them
1290 MRbuf_list[i].rkey = rep_mr.r_key;
1291 MRbuf_list[i].lkey = rep_mr.l_key;
1295 // keep extra information for a qp
1296 for(i=0; i < NUM_QPS; i++) {
1297 QP_list[i].mr_hndl = MSbuf_list[i].mr_hndl;
1298 QP_list[i].mr = MSbuf_list[i].mr;
1299 QP_list[i].bufptr = MSbuf_list[i].bufptr;
1300 QP_list[i].buf_addr = MSbuf_list[i].buf_addr;
1301 QP_list[i].buf_size = MSbuf_list[i].buf_size;
1302 QP_list[i].raddr = MSbuf_list[i].raddr;
1303 QP_list[i].rkey = MSbuf_list[i].rkey;
1304 QP_list[i].lkey = MSbuf_list[i].lkey;
1307 CDEBUG(D_PORTALS, "IBNAL- done VAPI_ret_t createMemRegion \n");
1311 } /* createMemRegion */
// Free and deregister the send and receive buffers for slot qix.
// NOTE(review): the buffers are PORTAL_FREE'd *before* the matching
// VAPI_deregister_mr — the HCA still holds a mapping over freed memory
// for a moment; confirm ordering against the full source.
1316 deleteMemRegion(QP_info *qp, int qix)
1321 // free send memory associated with this memory region
1323 PORTAL_FREE(MSbuf_list[qix].bufptr, MSbuf_list[qix].buf_size);
1326 vstat = VAPI_deregister_mr(qp->hca_hndl, MSbuf_list[qix].mr_hndl);
1328 if(vstat != VAPI_OK) {
1329 CERROR("Failed deregistering a send mem region qix %d %s\n",
1330 qix, VAPI_strerror(vstat));
1335 // free recv memory associated with this memory region
1337 PORTAL_FREE(MRbuf_list[qix].bufptr, MRbuf_list[qix].buf_size);
1340 vstat = VAPI_deregister_mr(qp->hca_hndl, MRbuf_list[qix].mr_hndl);
1342 if(vstat != VAPI_OK) {
1343 CERROR("Failed deregistering a recv mem region qix %d %s\n",
1344 qix, VAPI_strerror(vstat));
1353 // polling based event handling
1354 // + a daemon process
1355 // + poll the CQ and check what is in the CQ
1356 // + process incoming CQ event
// shared state between the CQ polling thread and the RTS/CTS handshake:
// the last received RDMA info message and a flag set when a CTS arrives
1361 RDMA_Info_Exchange Rdma_info;
1362 int Cts_Message_arrived = NO;
1364 void k_recv_thread(HCA_info *hca_data)
1367 VAPI_wc_desc_t comp_desc;
1368 unsigned long polling_count = 0;
1369 u_int32_t timeout_usec;
1370 unsigned int priority = 100;
1371 unsigned int length;
1372 VAPI_wr_id_t wrq_id;
1373 u_int32_t transferred_data_length; /* Num. of bytes transferred */
1375 VAPI_virt_addr_t bufaddr;
1376 unsigned long buf_size = 0;
1377 QP_info *qp; // point to QP_list
1379 kportal_daemonize("k_recv_thread"); // make it as a daemon process
1382 timeout_usec = 100; // how is the impact on the performance
1384 // send Q and receive Q are using the same CQ
1385 // so only poll one CQ for both operations
1387 CDEBUG(D_NET, "IBNAL- enter kibnal_recv_thread\n");
1388 CDEBUG(D_NET, "hca_hndl = 0X%x, cq_hndl=0X%x\n",
1389 hca_data->hca_hndl,hca_data->cq_hndl);
1391 qp = hca_data->qp_ptr;
1393 CDEBUG(D_NET, "in recv_thread qp is NULL\n");
1394 CDEBUG(D_NET, "Exit from recv_thread qp is NULL\n");
1398 CDEBUG(D_NET, "in recv_thread qp is 0X%X\n", qp);
1401 CDEBUG(D_NET, "kibnal_recv_thread - enter event driver polling loop\n");
1413 // send Q and receive Q are using the same CQ
1414 // so only poll one CQ for both operations
1417 vstat = VAPI_poll_cq(hca_data->hca_hndl,hca_data->cq_hndl, &comp_desc);
1419 if (vstat == VAPI_CQ_EMPTY) {
1420 // there is no event in CQE
1424 if (vstat != (VAPI_OK)) {
1425 CERROR("error while polling completion queuei vstat %d \n", vstat);
1430 // process the complete event
1431 switch(comp_desc.opcode) {
1432 case VAPI_CQE_SQ_SEND_DATA:
1433 // about the Send Q ,POST SEND completion
1434 // who needs this information
1436 // mark MSbuf_list[wr_id].status = BUF_REGISTERED
1438 wrq_id = comp_desc.id;
1440 if(RDMA_OP_ID < wrq_id) {
1441 // this RDMA message id, adjust it to the right entry
1442 wrq_id = wrq_id - RDMA_OP_ID;
1443 vstat = VAPI_deregister_mr(qp->hca_hndl, Local_rdma_info.send_rdma_mr_hndl);
1446 if(vstat != VAPI_OK) {
1447 CERROR("VAPI_CQE_SQ_SEND_DATA: Failed deregistering a RDMAi recv" " mem region %s\n", VAPI_strerror(vstat));
1450 if((RDMA_CTS_ID <= wrq_id) && (RDMA_OP_ID < wrq_id)) {
1451 // RTS or CTS send complete, release send buffer
1452 if(wrq_id >= RDMA_RTS_ID)
1453 wrq_id = wrq_id - RDMA_RTS_ID;
1455 wrq_id = wrq_id - RDMA_CTS_ID;
1458 spin_lock(&MSB_mutex[(int) wrq_id]);
1459 MRbuf_list[wrq_id].status = BUF_REGISTERED;
1460 spin_unlock(&MSB_mutex[(int) wrq_id]);
1462 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_SEND_DATA\n");
1465 case VAPI_CQE_SQ_RDMA_WRITE:
1466 // about the Send Q, RDMA write completion
1467 // who needs this information
1468 // data is successfully write from pource to destionation
1471 // mark MSbuf_list[wr_id].status = BUF_REGISTERED
1472 // de-register rdma buffer
1475 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_RDMA_WRITE\n");
1478 case VAPI_CQE_SQ_RDMA_READ:
1480 // RDMA read completion
1481 // who needs this information
1482 // data is successfully read from destionation to source
1483 CDEBUG(D_NET, "CQE opcode- VAPI_CQE_SQ_RDMA_READ\n");
1486 case VAPI_CQE_SQ_COMP_SWAP:
1488 // RDMA write completion
1489 // who needs this information
1491 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_COMP_SWAP\n");
1494 case VAPI_CQE_SQ_FETCH_ADD:
1496 // RDMA write completion
1497 // who needs this information
1499 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_FETCH_ADD\n");
1502 case VAPI_CQE_SQ_BIND_MRW:
1504 // RDMA write completion
1505 // who needs this information
1507 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_BIND_MRW\n");
1510 case VAPI_CQE_RQ_SEND_DATA:
1511 // about the Receive Q
1512 // process the incoming data and
1513 // forward it to .....
1514 // a completion recevie event is arriving at CQ
1515 // issue a recevie to get this arriving data out from CQ
1516 // pass the receiving data for further processing
1517 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_SEND_DATA\n");
1518 wrq_id = comp_desc.id ;
1519 transferred_data_length = comp_desc.byte_len;
1521 if((wrq_id >= RDMA_CTS_ID) && (wrq_id < RDMA_OP_ID)) {
1522 // this is RTS/CTS message
1523 // process it locally and don't pass it to portals layer
1524 // adjust wrq_id to get the right entry in MRbfu_list
1526 if(wrq_id >= RDMA_RTS_ID)
1527 wrq_id = wrq_id - RDMA_RTS_ID;
1529 wrq_id = wrq_id - RDMA_CTS_ID;
1531 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) MRbuf_list[wrq_id].buf_addr;
1532 MRbuf_list[wrq_id].status = BUF_INUSE;
1533 memcpy(&Rdma_info, &bufaddr, sizeof(RDMA_Info_Exchange));
1535 if(Ready_To_send == Rdma_info.opcode)
1536 // an RTS request message from remote node
1537 // prepare local RDMA buffer and send local rdma info to
1539 CTS_handshaking_protocol(&Rdma_info);
1541 if((Clear_To_send == Rdma_info.opcode) &&
1542 (RDMA_BUFFER_RESERVED == Rdma_info.flag))
1543 Cts_Message_arrived = YES;
1545 if(RDMA_BUFFER_UNAVAILABLE == Rdma_info.flag)
1546 CERROR("RDMA operation abort-RDMA_BUFFER_UNAVAILABLE\n");
1550 // this is an incoming mesage for portals layer
1551 // move to PORTALS layer for further processing
1554 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t)
1555 MRbuf_list[wrq_id].buf_addr;
1557 MRbuf_list[wrq_id].status = BUF_INUSE;
1558 transferred_data_length = comp_desc.byte_len;
1560 kibnal_rx(hca_data->kib_data,
1562 transferred_data_length,
1563 MRbuf_list[wrq_id].buf_size,
1567 // repost this receiving buffer and makr it at BUF_REGISTERED
1569 vstat = repost_recv_buf(qp, wrq_id);
1570 if(vstat != (VAPI_OK)) {
1571 CERROR("error while polling completion queue\n");
1574 MRbuf_list[wrq_id].status = BUF_REGISTERED;
1579 case VAPI_CQE_RQ_RDMA_WITH_IMM:
1580 // about the Receive Q
1581 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_RDMA_WITH_IMM\n");
1583 wrq_id = comp_desc.id ;
1584 transferred_data_length = comp_desc.byte_len;
1586 if(wrq_id == RDMA_OP_ID) {
1587 // this is RDAM op , locate the RDAM memory buffer address
1589 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) Local_rdma_info.raddr;
1591 transferred_data_length = comp_desc.byte_len;
1593 kibnal_rx(hca_data->kib_data,
1595 transferred_data_length,
1596 Local_rdma_info.buf_length,
1599 // de-regiser this RDAM receiving memory buffer
1600 // too early ?? test & check
1601 vstat = VAPI_deregister_mr(qp->hca_hndl, Local_rdma_info.recv_rdma_mr_hndl);
1602 if(vstat != VAPI_OK) {
1603 CERROR("VAPI_CQE_RQ_RDMA_WITH_IMM: Failed deregistering a RDMA"
1604 " recv mem region %s\n", VAPI_strerror(vstat));
1608 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_RDMA_WITH_IMM\n");
1611 case VAPI_CQE_INVAL_OPCODE:
1613 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_INVAL_OPCODE\n");
1617 CDEBUG(D_NET, "CQE opcode-unknown opcode\n");
1621 schedule_timeout(RECEIVING_THREAD_TIMEOUT);//how often do we need to poll CQ
1623 }// receiving while loop
// CQE_event_handler(): completion-event callback registered with the HCA
// (see IB_Set_Event_Handler in kibnal_initialize).  The send queue and
// receive queue share one CQ, so a single poll services both directions.
// Polls one completion descriptor, dispatches on comp_desc.opcode, and
// finally re-arms completion notification via VAPI_req_comp_notif().
// NOTE(review): this listing is lossy — blank lines, closing braces,
// `break;` statements and `#endif`s from the original file are elided,
// and each line carries its original file line number as a prefix.
1629 void CQE_event_handler(VAPI_hca_hndl_t hca_hndl,
1630 VAPI_cq_hndl_t cq_hndl,
1634 VAPI_wc_desc_t comp_desc;
1635 unsigned long polling_count = 0;
1636 u_int32_t timeout_usec;
1637 unsigned int priority = 100;
1638 unsigned int length;
1639 VAPI_wr_id_t wrq_id;
1640 u_int32_t transferred_data_length; /* Num. of bytes transferred */
1642 VAPI_virt_addr_t bufaddr;
1643 unsigned long buf_size = 0;
// NOTE(review): qp is dereferenced below (qp->hca_hndl at lines 1697 and
// 1891, and passed to repost_recv_buf) but no assignment to it is visible
// in this listing — confirm it is initialized in the elided lines,
// otherwise this is an uninitialized-pointer dereference.
1644 QP_info *qp; // point to QP_list
1647 // send Q and receive Q are using the same CQ
1648 // so only poll one CQ for both operations
1650 CDEBUG(D_NET, "IBNAL- enter CQE_event_handler\n");
1651 printk("IBNAL- enter CQE_event_handler\n");
// 'private' is the opaque callback context; it carries the HCA_info that
// holds the HCA and CQ handles used for polling below.
1653 hca_data = (HCA_info *) private;
1660 vstat = VAPI_poll_cq(hca_data->hca_hndl,hca_data->cq_hndl, &comp_desc);
// An empty CQ inside the notification callback is unexpected but benign.
1662 if (vstat == VAPI_CQ_EMPTY) {
1663 CDEBUG(D_NET, "CQE_event_handler: there is no event in CQE, how could"
1664 " this " "happened \n");
1665 printk("CQE_event_handler: there is no event in CQE, how could"
1666 " this " "happened \n");
1670 if (vstat != (VAPI_OK)) {
1671 CDEBUG(D_NET, "error while polling completion queue vstat %d - %s\n",
1672 vstat, VAPI_strerror(vstat));
1673 printk("error while polling completion queue vstat %d - %s\n",
1674 vstat, VAPI_strerror(vstat));
1679 // process the complete event
1680 switch(comp_desc.opcode) {
// --- Send-queue completion: a previously posted SEND finished. ---
1681 case VAPI_CQE_SQ_SEND_DATA:
1682 // about the Send Q ,POST SEND completion
1683 // who needs this information
1685 // mark MSbuf_list[wr_id].status = BUF_REGISTERED
1687 wrq_id = comp_desc.id;
1689 #ifdef IBNAL_SELF_TESTING
1690 if(wrq_id == SEND_RECV_TEST_ID) {
1691 printk("IBNAL_SELF_TESTING - VAPI_CQE_SQ_SEND_DATA \n");
// Work-request ids above RDMA_OP_ID identify RDMA sends: rebase the id
// and drop the send-side RDMA memory registration.
1694 if(RDMA_OP_ID < wrq_id) {
1695 // this RDMA message id, adjust it to the right entry
1696 wrq_id = wrq_id - RDMA_OP_ID;
1697 vstat = VAPI_deregister_mr(qp->hca_hndl,
1698 Local_rdma_info.send_rdma_mr_hndl);
1701 if(vstat != VAPI_OK) {
1702 CERROR(" VAPI_CQE_SQ_SEND_DATA: Failed deregistering a RDMA"
1703 " recv mem region %s\n", VAPI_strerror(vstat));
// NOTE(review): "RDMA_OP_ID < wrq_id" here duplicates the branch above
// (whose body already rebased such ids); given the comment it likely
// should read "wrq_id < RDMA_OP_ID" as in the RQ_SEND_DATA case at line
// 1810 — verify against the original source.
1706 if((RDMA_CTS_ID <= wrq_id) && (RDMA_OP_ID < wrq_id)) {
1707 // RTS or CTS send complete, release send buffer
1708 if(wrq_id >= RDMA_RTS_ID)
1709 wrq_id = wrq_id - RDMA_RTS_ID;
1711 wrq_id = wrq_id - RDMA_CTS_ID;
// NOTE(review): the comment at line 1685 says the *send* list
// (MSbuf_list) should be marked BUF_REGISTERED, and the lock taken is
// the send-buffer mutex (MSB_mutex), yet the update below touches the
// *receive* list MRbuf_list — likely a typo for MSbuf_list; confirm.
1714 spin_lock(&MSB_mutex[(int) wrq_id]);
1715 MRbuf_list[wrq_id].status = BUF_REGISTERED;
1716 spin_unlock(&MSB_mutex[(int) wrq_id]);
1719 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_SEND_DATA\n");
// --- Remaining send-queue opcodes are trace-only placeholders. ---
1723 case VAPI_CQE_SQ_RDMA_WRITE:
1724 // about the Send Q, RDMA write completion
1725 // who needs this information
1726 // data is successfully write from pource to destionation
1729 // mark MSbuf_list[wr_id].status = BUF_REGISTERED
1730 // de-register rdma buffer
1733 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_RDMA_WRITE\n");
1736 case VAPI_CQE_SQ_RDMA_READ:
1738 // RDMA read completion
1739 // who needs this information
1740 // data is successfully read from destionation to source
1741 CDEBUG(D_NET, "CQE opcode- VAPI_CQE_SQ_RDMA_READ\n");
1744 case VAPI_CQE_SQ_COMP_SWAP:
1746 // RDMA write completion
1747 // who needs this information
1749 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_COMP_SWAP\n");
1752 case VAPI_CQE_SQ_FETCH_ADD:
1754 // RDMA write completion
1755 // who needs this information
1757 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_FETCH_ADD\n");
1760 case VAPI_CQE_SQ_BIND_MRW:
1762 // RDMA write completion
1763 // who needs this information
1765 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_BIND_MRW\n");
// --- Receive-queue completion: an incoming message landed in a
// pre-posted receive buffer.  RTS/CTS handshake ids are consumed
// locally; everything else is handed up to the portals layer. ---
1768 case VAPI_CQE_RQ_SEND_DATA:
1769 // about the Receive Q
1770 // process the incoming data and
1771 // forward it to .....
1772 // a completion recevie event is arriving at CQ
1773 // issue a recevie to get this arriving data out from CQ
1774 // pass the receiving data for further processing
1776 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_SEND_DATA\n");
1778 wrq_id = comp_desc.id ;
1780 #ifdef IBNAL_SELF_TESTING
1785 if(wrq_id == SEND_RECV_TEST_ID) {
1786 printk("IBNAL_SELF_TESTING - VAPI_CQE_RQ_SEND_DATA\n");
1789 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t)
1790 MRbuf_list[ SEND_RECV_TEST_BUF_ID].buf_addr;
1791 MRbuf_list[SEND_RECV_TEST_BUF_ID].status = BUF_INUSE;
// NOTE(review): this copies KB_32 bytes starting at the *address of the
// local variable* bufaddr, not from the receive buffer it points to —
// presumably (void *)bufaddr was intended; verify.
1792 memcpy(&rbuf, &bufaddr, KB_32);
1795 for(i=0; i < 16; i++)
1796 printk("rbuf[%d]=%c, ", rbuf[i]);
1799 // repost this receiving buffer and makr it at BUF_REGISTERED
1800 vstat = repost_recv_buf(qp,SEND_RECV_TEST_BUF_ID);
1801 if(vstat != (VAPI_OK)) {
1802 printk("error while polling completion queue\n");
1805 MRbuf_list[SEND_RECV_TEST_BUF_ID].status = BUF_REGISTERED;
1808 transferred_data_length = comp_desc.byte_len;
// Ids in [RDMA_CTS_ID, RDMA_OP_ID) are handshake control messages.
1810 if((wrq_id >= RDMA_CTS_ID) && (wrq_id < RDMA_OP_ID)) {
1811 // this is RTS/CTS message
1812 // process it locally and don't pass it to portals layer
1813 // adjust wrq_id to get the right entry in MRbfu_list
1815 if(wrq_id >= RDMA_RTS_ID)
1816 wrq_id = wrq_id - RDMA_RTS_ID;
1818 wrq_id = wrq_id - RDMA_CTS_ID;
1820 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t)
1821 MRbuf_list[wrq_id].buf_addr;
1822 MRbuf_list[wrq_id].status = BUF_INUSE;
// NOTE(review): as at line 1792, this copies from &bufaddr (the local
// variable) rather than from the buffer address it holds — verify.
1823 memcpy(&Rdma_info, &bufaddr, sizeof(RDMA_Info_Exchange));
1825 if(Ready_To_send == Rdma_info.opcode)
1826 // an RTS request message from remote node
1827 // prepare local RDMA buffer and send local rdma info to
1829 CTS_handshaking_protocol(&Rdma_info);
1831 if((Clear_To_send == Rdma_info.opcode) &&
1832 (RDMA_BUFFER_RESERVED == Rdma_info.flag))
1833 Cts_Message_arrived = YES;
1835 if(RDMA_BUFFER_UNAVAILABLE == Rdma_info.flag)
1836 CERROR("RDMA operation abort-RDMA_BUFFER_UNAVAILABLE\n");
1840 // this is an incoming mesage for portals layer
1841 // move to PORTALS layer for further processing
1844 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t)
1845 MRbuf_list[wrq_id].buf_addr;
1847 MRbuf_list[wrq_id].status = BUF_INUSE;
1848 transferred_data_length = comp_desc.byte_len;
// Hand the received payload to the portals layer (some kibnal_rx
// arguments are on elided lines).
1850 kibnal_rx(hca_data->kib_data,
1852 transferred_data_length,
1853 MRbuf_list[wrq_id].buf_size,
1857 // repost this receiving buffer and makr it at BUF_REGISTERED
1858 vstat = repost_recv_buf(qp, wrq_id);
1859 if(vstat != (VAPI_OK)) {
1860 CERROR("error while polling completion queue\n");
1863 MRbuf_list[wrq_id].status = BUF_REGISTERED;
// --- RDMA-write-with-immediate landed in the locally prepared RDMA
// buffer; deliver it upward and drop the receive-side registration. ---
1869 case VAPI_CQE_RQ_RDMA_WITH_IMM:
1870 // about the Receive Q
1871 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_RDMA_WITH_IMM\n");
1873 wrq_id = comp_desc.id ;
1874 transferred_data_length = comp_desc.byte_len;
1876 if(wrq_id == RDMA_OP_ID) {
1877 // this is RDAM op , locate the RDAM memory buffer address
1879 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) Local_rdma_info.raddr;
1881 transferred_data_length = comp_desc.byte_len;
1883 kibnal_rx(hca_data->kib_data,
1885 transferred_data_length,
1886 Local_rdma_info.buf_length,
1889 // de-regiser this RDAM receiving memory buffer
1890 // too early ?? test & check
1891 vstat = VAPI_deregister_mr(qp->hca_hndl, Local_rdma_info.recv_rdma_mr_hndl);
1892 if(vstat != VAPI_OK) {
1893 CERROR("VAPI_CQE_RQ_RDMA_WITH_IMM: Failed deregistering a RDMA"
1894 " recv mem region %s\n", VAPI_strerror(vstat));
1898 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_RDMA_WITH_IMM\n");
1901 case VAPI_CQE_INVAL_OPCODE:
1903 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_INVAL_OPCODE\n");
1907 CDEBUG(D_NET, "CQE opcode-unknown opcode\n");
// Re-arm the CQ so the HCA fires this handler on the next completion
// (notification requests are one-shot in VAPI).
1912 // issue a new request for completion ievent notification
1913 vstat = VAPI_req_comp_notif(hca_data->hca_hndl,
1918 if(vstat != VAPI_OK) {
1919 CERROR("PI_req_comp_notif: Failed %s\n", VAPI_strerror(vstat));
1922 return; // end of event handler
// kibnal_cmd(): ioctl command entry point for the IB NAL.  Only a debug
// trace is visible here; the return type, return statement, and closing
// brace are on lines elided from this listing.
1929 kibnal_cmd(struct portal_ioctl_data * data, void * private)
1933 CDEBUG(D_NET, "kibnal_cmd \n");
// ibnal_send_recv_self_testing(): loopback smoke test for the HCA.
// When *my_role == TEST_SEND_MESSAGE, posts a single KB_32-byte SEND on
// QP_list[SEND_RECV_TEST_BUF_ID] using the registered send buffer
// MSbuf_list[buf_id]; any other role just logs and exits.  Completion is
// observed by the CQE handler's IBNAL_SELF_TESTING branches.
// NOTE(review): several declarations (sbuf, rbuf, buf_id, qp, vstat) and
// the closing braces are on lines elided from this listing.
1940 void ibnal_send_recv_self_testing(int *my_role)
1943 VAPI_sr_desc_t sr_desc;
1944 VAPI_sg_lst_entry_t sr_sg;
1946 VAPI_wr_id_t send_id;
1951 int buf_length = KB_32;
1952 VAPI_wc_desc_t comp_desc;
1956 // make it as a daemon process
1957 // kportal_daemonize("ibnal_send_recv_self_testing");
1959 printk("My role is 0X%X\n", *my_role);
1961 if(*my_role == TEST_SEND_MESSAGE) {
1962 printk("Enter ibnal_send_recv_self_testing\n");
// Fill the payload with 'a' and clear the receive staging buffer.
1964 memset(&sbuf, 'a', KB_32);
1965 memset(&rbuf, ' ', KB_32);
1967 send_id = SEND_RECV_TEST_ID;
1968 buf_id = SEND_RECV_TEST_BUF_ID;
1970 qp = &QP_list[buf_id];
// Build the send work request: signaled so a CQE is generated.
1972 sr_desc.opcode = VAPI_SEND;
1973 sr_desc.comp_type = VAPI_SIGNALED;
1974 sr_desc.id = send_id;
1976 // scatter and gather info
1978 sr_sg.lkey = MSbuf_list[buf_id].mr.l_key; // use send MR
1979 sr_sg.addr = (VAPI_virt_addr_t)(MT_virt_addr_t) MSbuf_list[buf_id].buf_addr;
1981 // copy data to register send buffer
// NOTE(review): this copies buf_length (KB_32) bytes over the 8-byte
// sr_sg.addr *field itself* (overrunning the SG entry on the stack)
// instead of into the registered buffer that sr_sg.addr points to —
// presumably memcpy((void *)sr_sg.addr, &sbuf, buf_length) was intended;
// verify against the original source.
1982 memcpy(&sr_sg.addr, &sbuf, buf_length);
1984 sr_desc.sg_lst_p = &sr_sg;
1985 sr_desc.sg_lst_len = 1; // only 1 entry is used
1986 sr_desc.fence = TRUE;
1987 sr_desc.set_se = FALSE;
1990 // call VAPI_post_sr to send out this data
1991 vstat = VAPI_post_sr(qp->hca_hndl, qp->qp_hndl, &sr_desc);
1993 if (vstat != VAPI_OK) {
1994 printk("VAPI_post_sr failed (%s).\n",VAPI_strerror(vstat));
1997 printk("VAPI_post_sr success.\n");
// Receiver role: buffers are pre-posted elsewhere, so nothing to do.
2002 printk("I am a receiver and doing nothing here\n");
2005 printk("ibnal_send_recv_self_testing thread exit \n");
2013 // ibnal initialize process
2015 // 1. Bring up Infiniband network interface
2017 // 2. Initialize a PORTALS nal interface
// kibnal_initialize(): module-init entry point (see module_init below).
// Wires up the kibnal_api function table, zeroes/initializes the shared
// kibnal_data state, opens the HCA, registers the NAL with portals via
// PtlNIInit, and starts CQE handling (polling thread or event handler,
// selected at compile time).  Return type and error-path lines are on
// lines elided from this listing.
2021 kibnal_initialize(void)
2025 unsigned long sizemask;
2030 portals_debug_set_level(IBNAL_DEBUG_LEVEL_1);
2032 CDEBUG(D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory));
2034 CDEBUG(D_PORTALS, "kibnal_initialize: Enter kibnal_initialize\n");
2036 // set api functional pointers
2037 kibnal_api.forward = kibnal_forward;
2038 kibnal_api.shutdown = kibnal_shutdown;
2039 kibnal_api.yield = kibnal_yield;
2040 kibnal_api.validate = NULL; /* our api validate is a NOOP */
2041 kibnal_api.lock = kibnal_lock;
2042 kibnal_api.unlock = kibnal_unlock;
2043 kibnal_api.nal_data = &kibnal_data; // this is so called private data
2044 kibnal_api.refct = 1;
2045 kibnal_api.timeout = NULL;
2046 kibnal_lib.nal_data = &kibnal_data;
2048 memset(&kibnal_data, 0, sizeof(kibnal_data));
2050 // initialize kib_list list data structure
2051 INIT_LIST_HEAD(&kibnal_data.kib_list);
2053 kibnal_data.kib_cb = &kibnal_lib;
2055 spin_lock_init(&kibnal_data.kib_dispatch_lock);
2059 // bring up the IB inter-connect network interface
2062 vstat = IB_Open_HCA(&kibnal_data);
2064 if(vstat != VAPI_OK) {
2065 CERROR("kibnal_initialize: IB_Open_HCA failed: %d- %s\n",
2066 vstat, VAPI_strerror(vstat));
2068 printk("kibnal_initialize: IB_Open_HCA failed: %d- %s\n",
2069 vstat, VAPI_strerror(vstat));
// The HCA handle doubles as this node's NID (a placeholder scheme).
2073 kibnal_data.kib_nid = (__u64 )Hca_hndl;//convert Hca_hndl to 64-bit format
2074 kibnal_data.kib_init = 1;
// NOTE(review): "0x%x%x" consumes two 32-bit varargs but only one __u64
// is passed — undefined varargs behavior; should be a single 64-bit
// conversion (e.g. "0x%llx" with a cast) or two explicit halves. Verify.
2076 CDEBUG(D_NET, " kibnal_data.kib_nid 0x%x%x\n", kibnal_data.kib_nid);
2077 printk(" kibnal_data.kib_nid 0x%x%x\n", kibnal_data.kib_nid);
2079 /* Network interface ready to initialise */
2080 // get an entery in the PORTALS table for this IB protocol
2082 CDEBUG(D_PORTALS,"Call PtlNIInit to register this Infiniband Interface\n");
2083 printk("Call PtlNIInit to register this Infiniband Interface\n");
2085 rc = PtlNIInit(kibnal_init, 32, 4, 0, &kibnal_ni);
2088 CERROR("kibnal_initialize: PtlNIInit failed %d\n", rc);
2089 printk("kibnal_initialize: PtlNIInit failed %d\n", rc);
2094 CDEBUG(D_PORTALS,"kibnal_initialize: PtlNIInit DONE\n");
2095 printk("kibnal_initialize: PtlNIInit DONE\n");
// Exactly one CQE-handling strategy is compiled in:
2099 #ifdef POLL_BASED_CQE_HANDLING
2100 // create a receiving thread: main loopa
2101 // this is polling based mail loop
2102 kernel_thread(k_recv_thread, &Hca_data, 0);
2105 #ifdef EVENT_BASED_CQE_HANDLING
2106 // for completion event handling, this is event based CQE handling
2107 vstat = IB_Set_Event_Handler(Hca_data, &kibnal_data);
2109 if (vstat != VAPI_OK) {
2110 CERROR("IB_Set_Event_Handler failed: %d - %s \n",
2111 vstat, VAPI_strerror(vstat));
2115 CDEBUG(D_PORTALS,"IB_Set_Event_Handler Done \n");
2116 printk("IB_Set_Event_Handler Done \n");
// Make the NI handle visible to other portals modules.
2120 PORTAL_SYMBOL_REGISTER(kibnal_ni);
2122 #ifdef IBNAL_SELF_TESTING
2124 // test HCA send recv before normal event handling
// Role selection logic is partially elided; only the sender assignment
// and the receiver log line are visible here.
2127 my_role = TEST_SEND_MESSAGE;
2129 printk("my role is TEST_RECV_MESSAGE\n");
2131 // kernel_thread(ibnal_send_recv_self_testing, &my_role, 0);
2133 ibnal_send_recv_self_testing(&my_role);
/* Kernel module metadata, load/unload hooks, and exported symbols. */
2143 MODULE_AUTHOR("Hsingbung(HB) Chen <hbchen@lanl.gov>");
2144 MODULE_DESCRIPTION("Kernel Infiniband NAL v0.1");
2145 MODULE_LICENSE("GPL");
// kibnal_initialize runs at insmod; kibnal_finalize (defined elsewhere
// in this file) runs at rmmod.
2147 module_init (kibnal_initialize);
2148 module_exit (kibnal_finalize);
// Export the portals NI handle for use by other portals kernel modules.
2150 EXPORT_SYMBOL(kibnal_ni);