1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Based on ksocknal, qswnal, and gmnal
6 * Copyright (C) 2003 LANL
7 * Author: HB Chen <hbchen@lanl.gov>
8 * Los Alamos National Lab
10 * Portals is free software; you can redistribute it and/or
11 * modify it under the terms of version 2 of the GNU General Public
12 * License as published by the Free Software Foundation.
14 * Portals is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with Portals; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
/*
 * Module-scope state for the IB (VAPI) NAL.
 * NOTE(review): this is a sampled excerpt — the integer at the start of each
 * line is the original file's line number fused in by extraction, and some
 * lines are missing (e.g. the kibnal_router_interface initializer is visibly
 * truncated).  Code tokens are left untouched; only comments are edited.
 */
27 // portal handle ID for this IB-NAL
28 ptl_handle_ni_t kibnal_ni;
30 // message send buffer mutex (one spinlock per registered send buffer)
31 spinlock_t MSBuf_mutex[NUM_MBUF];
33 // message recv buffer mutex (one spinlock per registered recv buffer)
34 spinlock_t MRBuf_mutex[NUM_MBUF];
36 // IB-NAL API information
40 kibnal_data_t kibnal_data;
// Protection-domain handle and posted-recv counter shared by all QPs below.
43 VAPI_pd_hndl_t Pd_hndl;
44 unsigned int Num_posted_recv_buf;
46 // registered send buffer list
47 Memory_buffer_info MSbuf_list[NUM_MBUF];
49 // registered recv buffer list
50 Memory_buffer_info MRbuf_list[NUM_MBUF];
54 // currently there is no need for IBA routing; interface kept for the portals router
// Router hook table: forwards packets via kibnal_fwd_packet with kibnal_data
// as the callback argument.  (Initializer is truncated in this view.)
56 kpr_nal_interface_t kibnal_router_interface = {
58 kprni_arg: &kibnal_data,
59 kprni_fwd: kibnal_fwd_packet, // forward data to router
60 // is router involving the
// One QP_info per queue pair used by this NAL.
66 QP_info QP_list[NUM_QPS];
68 // information associated with a HCA
71 // something about HCA
72 VAPI_hca_hndl_t Hca_hndl; // assume we only use one HCA now
73 VAPI_hca_vendor_t Hca_vendor;
74 VAPI_hca_cap_t Hca_cap;
75 VAPI_hca_port_t Hca_port_1_props;
76 VAPI_hca_port_t Hca_port_2_props;
77 VAPI_hca_attr_t Hca_attr;
78 VAPI_hca_attr_mask_t Hca_attr_mask;
79 VAPI_cq_hndl_t Cq_RQ_hndl; // CQ's handle
80 VAPI_cq_hndl_t Cq_SQ_hndl; // CQ's handle
81 VAPI_cq_hndl_t Cq_hndl; // CQ's handle
// Local and remote QP connection info exchanged during bring-up.
82 Remote_QP_Info L_QP_data;
83 Remote_QP_Info R_QP_data;
/*
 * kibnal_forward() — NAL API "forward" hook: sanity-checks the nal/data/cb
 * triple, then dispatches the API call (id, args, ret) into the portals
 * library via lib_dispatch().  Always returns PTL_OK.
 * NOTE(review): return type and the id/args/ret parameter lines are missing
 * from this sampled view — confirm the full signature against the original.
 */
90 kibnal_forward(nal_t *nal,
97 kibnal_data_t *knal_data = nal->nal_data;
98 nal_cb_t *nal_cb = knal_data->kib_cb;
// This NAL supports exactly one API/data/cb instance.
101 LASSERT (nal == &kibnal_api);
102 LASSERT (knal_data == &kibnal_data);
103 LASSERT (nal_cb == &kibnal_lib);
105 // dispatch forward API function
107 CDEBUG(D_NET,"kibnal_forward: function id = %d\n", id);
109 lib_dispatch(nal_cb, knal_data, id, args, ret);
111 CDEBUG(D_TRACE,"IBNAL- Done kibnal_forward\n");
113 return PTL_OK; // always return PTL_OK
/*
 * kibnal_lock() — NAL API lock hook: validates the singleton instances and
 * enters the library critical section via the cb_cli callback (saving IRQ
 * state into *flags).  Paired with kibnal_unlock() below.
 */
120 kibnal_lock(nal_t *nal, unsigned long *flags)
122 kibnal_data_t *knal_data = nal->nal_data;
123 nal_cb_t *nal_cb = knal_data->kib_cb;
126 LASSERT (nal == &kibnal_api);
127 LASSERT (knal_data == &kibnal_data);
128 LASSERT (nal_cb == &kibnal_lib);
130 // disable logical interrupt
131 nal_cb->cb_cli(nal_cb,flags);
133 CDEBUG(D_TRACE,"IBNAL-Done kibnal_lock\n");
/*
 * kibnal_unlock() — NAL API unlock hook: counterpart of kibnal_lock();
 * leaves the library critical section via cb_sti, restoring *flags.
 */
141 kibnal_unlock(nal_t *nal, unsigned long *flags)
143 kibnal_data_t *k = nal->nal_data;
144 nal_cb_t *nal_cb = k->kib_cb;
147 LASSERT (nal == &kibnal_api);
148 LASSERT (k == &kibnal_data);
149 LASSERT (nal_cb == &kibnal_lib);
151 // enable logical interrupt
152 nal_cb->cb_sti(nal_cb,flags);
// NOTE(review): trace message lacks the trailing '\n' other CDEBUGs have.
154 CDEBUG(D_TRACE,"IBNAL-Done kibnal_unlock");
160 // shut down this network interface
/*
 * kibnal_shutdown() — NAL API shutdown hook: tears down the IB interface by
 * closing the HCA (IB_Close_HCA()).  The 'ni' argument is unused in the
 * visible code.  Errors from IB_Close_HCA are logged but not propagated in
 * the lines shown here.
 */
163 kibnal_shutdown(nal_t *nal, int ni)
166 kibnal_data_t *k = nal->nal_data;
167 nal_cb_t *nal_cb = k->kib_cb;
170 LASSERT (nal == &kibnal_api);
171 LASSERT (k == &kibnal_data);
172 LASSERT (nal_cb == &kibnal_lib);
174 // take down this IB network interface
175 // there is no corresponding cb function to handle this
176 // do we actually need this one
177 // reference to IB network interface shutdown
180 vstat = IB_Close_HCA();
182 if (vstat != VAPI_OK) {
183 CERROR("Failed to close HCA - %s\n",VAPI_strerror(vstat));
187 CDEBUG(D_TRACE,"IBNAL- Done kibnal_shutdown\n");
194 // when do we call this yield function
/*
 * kibnal_yield() — NAL API yield hook: gives up the CPU if the scheduler has
 * flagged this task for rescheduling (pre-2.6 'current->need_resched' idiom;
 * the schedule() call itself is among the missing lines of this view).
 */
197 kibnal_yield( nal_t *nal )
199 kibnal_data_t *k = nal->nal_data;
200 nal_cb_t *nal_cb = k->kib_cb;
203 LASSERT (nal == &kibnal_api);
204 LASSERT (k == &kibnal_data);
205 LASSERT (nal_cb == &kibnal_lib);
207 // check under what condition that we need to
209 // who set this need_resched
210 if (current->need_resched)
213 CDEBUG(D_TRACE,"IBNAL-Done kibnal_yield");
/*
 * kibnal_init() — NAL API startup hook: initializes the portals library for
 * this NAL via lib_init() with kibnal_data.kib_nid as the local NID.
 * 'interface' and 'requested_pid' are accepted but unused (see inline notes).
 * NOTE(review): the lib_init() argument list and the rc error-check branch
 * structure are partially missing from this sampled view.
 * NOTE(review): CDEBUG format specifiers look mismatched for kib_nid
 * (0x%u / %d / 0x%x%x with a single argument) — verify against the real
 * kib_nid type.
 */
222 kibnal_init(int interface, // no use here
223 ptl_pt_index_t ptl_size,
224 ptl_ac_index_t ac_size,
225 ptl_pid_t requested_pid // no use here
229 nal_cb_t *nal_cb = NULL;
230 kibnal_data_t *nal_data = NULL;
233 unsigned int nnids = 1; // number of nids
234 // do we know how many nodes are in this
235 // system related to this kib_nid
238 CDEBUG(D_NET, "kibnal_init:calling lib_init with nid 0x%u\n",
239 kibnal_data.kib_nid);
242 CDEBUG(D_NET, "kibnal_init: interface [%d], ptl_size [%d], ac_size[%d]\n",
243 interface, ptl_size, ac_size);
244 CDEBUG(D_NET, "kibnal_init: &kibnal_lib 0x%X\n", &kibnal_lib);
245 CDEBUG(D_NET, "kibnal_init: kibnal_data.kib_nid %d\n", kibnal_data.kib_nid);
247 rc = lib_init(&kibnal_lib,
249 0, // process id is set as 0
255 CERROR("kibnal_init: Failed lib_init with nid 0x%u, rc=%d\n",
256 kibnal_data.kib_nid,rc);
259 CDEBUG(D_NET,"kibnal_init: DONE lib_init with nid 0x%x%x\n",
260 kibnal_data.kib_nid);
269 // called before remove ibnal kernel module
/*
 * kibnal_finalize() — module-unload cleanup: unregisters the inter-module
 * handle, shuts the NI down via PtlNIFini()/lib_fini(), then frees every
 * queued receive descriptor on kibnal_data.kib_list.
 * NOTE(review): list_del() is called on the cursor inside a plain
 * list_for_each() walk — the safe idiom is list_for_each_safe(); confirm
 * against the full original before relying on this loop.
 */
272 kibnal_finalize(void)
274 struct list_head *tmp;
276 inter_module_unregister("kibnal_ni");
278 // release resources allocated to this Infiniband network interface
279 PtlNIFini(kibnal_ni);
281 lib_fini(&kibnal_lib);
285 // how much do we need to do here?
286 list_for_each(tmp, &kibnal_data.kib_list) {
288 conn = list_entry(tmp, kibnal_rx_t, krx_item);
289 CDEBUG(D_IOCTL, "freeing conn %p\n",conn);
291 list_del(&conn->krx_item);
292 PORTAL_FREE(conn, sizeof(*conn));
295 CDEBUG(D_MALLOC,"done kmem %d\n",atomic_read(&portal_kmemory));
296 CDEBUG(D_TRACE,"IBNAL-Done kibnal_finalize\n");
303 // * k_server_thread is a kernel thread
304 // use a shared memory to exchange HCA's data with a pthread in user space
306 // * will be replaced when CM is used to handle communication management
/*
 * k_server_thread() — bootstrap kernel thread for QP-info exchange:
 *   1. creates/attaches a SysV shared-memory segment at a well-known key,
 *   2. publishes the local HCA's QP data (opcode/length/dlid[]/rqp_num[])
 *      into it for a user-space agent,
 *   3. busy-polls (with schedule_timeout) until the agent writes back the
 *      remote side's data (opcode == RECV_QP_INFO), copying it into
 *      *hca_data.
 * NOTE(review): the loop never times out — if the user agent is absent this
 * thread polls forever.  Several lines (segment_id/ret declarations, shmget
 * arguments, detach call) are missing from this sampled view.
 */
309 void k_server_thread(Remote_QP_Info *hca_data)
312 const int shared_segment_size = sizeof(Remote_QP_Info);
313 key_t key = HCA_EXCHANGE_SHM_KEY;
315 int exchanged_done = NO;
318 Remote_QP_Info *exchange_hca_data;
324 // create a shared memory with pre-agreement key
325 segment_id = sys_shmget(key,
330 // attach to shared memory
331 // raddr is pointed to an user address space
332 // use this address to update shared memory content
333 ret = sys_shmat(segment_id, 0 , SHM_RND, &raddr);
337 CDEBUG(D_NET,"k_server_thread: Shared memory attach success ret = 0X%d,&raddr"
338 " 0X%x (*(&raddr))=0x%x \n", ret, &raddr, (*(&raddr)));
339 printk("k_server_thread: Shared memory attach success ret = 0X%d, &raddr"
340 " 0X%x (*(&raddr))=0x%x \n", ret, &raddr, (*(&raddr)));
343 CERROR("k_server_thread: Shared memory attach failed ret = 0x%d \n", ret);
344 printk("k_server_thread: Shared memory attach failed ret = 0x%d \n", ret);
350 uaddr = *n; // get the U-address
351 /* cast uaddr to exchange_hca_data */
352 exchange_hca_data = (Remote_QP_Info *) uaddr;
354 /* copy data from local HCA to shared memory */
355 exchange_hca_data->opcode = hca_data->opcode;
356 exchange_hca_data->length = hca_data->length;
358 for(i=0; i < NUM_QPS; i++) {
359 exchange_hca_data->dlid[i] = hca_data->dlid[i];
360 exchange_hca_data->rqp_num[i] = hca_data->rqp_num[i];
363 // periodically check shared memory until get updated
364 // remote HCA's data from user mode pthread
365 while(exchanged_done == NO) {
366 if(exchange_hca_data->opcode == RECV_QP_INFO){
367 exchanged_done = YES;
368 /* copy data to local buffer from shared memory */
369 hca_data->opcode = exchange_hca_data->opcode;
370 hca_data->length = exchange_hca_data->length;
372 for(i=0; i < NUM_QPS; i++) {
373 hca_data->dlid[i] = exchange_hca_data->dlid[i];
374 hca_data->rqp_num[i] = exchange_hca_data->rqp_num[i];
// Not updated yet: sleep a tick and re-poll the shared segment.
379 schedule_timeout(1000);
383 // detach shared memory
386 CDEBUG(D_NET, "Exit from kernel thread: k_server_thread \n");
387 printk("Exit from kernel thread: k_server_thread \n");
/*
 * create_qp() — builds one reliable-connection (RC) queue pair:
 * fills VAPI_qp_init_attr_t (WQE depth NUM_WQE, SG size NUM_SG, PD and
 * send/recv CQ handles taken from *qp, signal-all completion on both
 * queues), calls VAPI_create_qp(), then records the resulting qp number
 * and zeroes the per-QP bookkeeping counters and spinlocks.
 * NOTE(review): cur_send_outstanding / cur_posted_rcv_bufs /
 * snd_rcv_balance are initialized twice (before and after the CDEBUG) —
 * harmless but redundant; candidate cleanup once the full file is in view.
 */
397 create_qp(QP_info *qp, int qp_index)
401 VAPI_qp_init_attr_t qp_init_attr;
402 VAPI_qp_prop_t qp_prop;
404 qp->hca_hndl = Hca_hndl;
405 qp->port = 1; // default
406 qp->slid = Hca_port_1_props.lid;
407 qp->hca_port = Hca_port_1_props;
410 /* Queue Pair Creation Attributes */
411 qp_init_attr.cap.max_oust_wr_rq = NUM_WQE;
412 qp_init_attr.cap.max_oust_wr_sq = NUM_WQE;
413 qp_init_attr.cap.max_sg_size_rq = NUM_SG;
414 qp_init_attr.cap.max_sg_size_sq = NUM_SG;
415 qp_init_attr.pd_hndl = qp->pd_hndl;
416 qp_init_attr.rdd_hndl = 0;
417 qp_init_attr.rq_cq_hndl = qp->rq_cq_hndl;
418 /* we use here polling */
419 //qp_init_attr.rq_sig_type = VAPI_SIGNAL_REQ_WR;
420 qp_init_attr.rq_sig_type = VAPI_SIGNAL_ALL_WR;
421 qp_init_attr.sq_cq_hndl = qp->sq_cq_hndl;
422 /* we use here polling */
423 //qp_init_attr.sq_sig_type = VAPI_SIGNAL_REQ_WR;
424 qp_init_attr.sq_sig_type = VAPI_SIGNAL_ALL_WR;
425 // transport service - reliable connection
427 qp_init_attr.ts_type = VAPI_TS_RC;
429 vstat = VAPI_create_qp(qp->hca_hndl,
431 &qp->qp_hndl, &qp_prop);
433 if (vstat != VAPI_OK) {
434 CERROR("Failed creating QP. Return Failed - %s\n",VAPI_strerror(vstat));
438 qp->qp_num = qp_prop.qp_num; // the qp number
439 qp->last_posted_send_id = 0; // user defined work request ID
440 qp->last_posted_rcv_id = 0; // user defined work request ID
441 qp->cur_send_outstanding = 0;
442 qp->cur_posted_rcv_bufs = 0;
443 qp->snd_rcv_balance = 0;
445 CDEBUG(D_OTHER, "create_qp: qp_num = %d, slid = %d, qp_hndl = 0X%X",
446 qp->qp_num, qp->slid, qp->qp_hndl);
448 // initialize spin-lock mutex variables
449 spin_lock_init(&(qp->snd_mutex));
450 spin_lock_init(&(qp->rcv_mutex));
451 spin_lock_init(&(qp->bl_mutex));
452 spin_lock_init(&(qp->cln_mutex));
453 // number of outstanding requests on the send Q
454 qp->cur_send_outstanding = 0;
455 // number of posted receive buffers
456 qp->cur_posted_rcv_bufs = 0;
457 qp->snd_rcv_balance = 0;
464 // initialize a UD qp state to RTR and RTS
/*
 * init_qp_UD() — walks an unreliable-datagram QP through the standard IB
 * state machine: RST -> INIT (pkey/port/qkey), INIT -> RTR, RTR -> RTS
 * (send PSN).  Each transition is a VAPI_modify_qp() with the appropriate
 * attribute mask, followed by a VAPI_query_qp() sanity read-back.
 * NOTE(review): the VAPI_modify_qp()/VAPI_query_qp() argument tails, the
 * pkey_ix/qkey assignment lines, and the error-return statements are
 * missing from this sampled view.
 */
467 init_qp_UD(QP_info *qp, int qp_index)
469 VAPI_qp_attr_t qp_attr;
470 VAPI_qp_init_attr_t qp_init_attr;
471 VAPI_qp_attr_mask_t qp_attr_mask;
472 VAPI_qp_cap_t qp_cap;
475 /* Move from RST to INIT */
476 /* Change QP to INIT */
478 CDEBUG(D_OTHER, "Changing QP state to INIT qp-index = %d\n", qp_index);
480 QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
482 qp_attr.qp_state = VAPI_INIT;
483 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
485 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
488 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PKEY_IX);
490 CDEBUG(D_OTHER, "pkey_ix qp_attr_mask = 0X%x\n", qp_attr_mask);
492 qp_attr.port = qp->port;
493 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PORT);
495 CDEBUG(D_OTHER, "port qp_attr_mask = 0X%x\n", qp_attr_mask);
498 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QKEY);
500 CDEBUG(D_OTHER, "qkey qp_attr_mask = 0X%x\n", qp_attr_mask);
502 /* If I do not set this mask, I get an error from HH. QPM should catch it */
504 vstat = VAPI_modify_qp(qp->hca_hndl,
510 if (vstat != VAPI_OK) {
511 CERROR("Failed modifying QP from RST to INIT. %s\n",VAPI_strerror(vstat));
515 CDEBUG(D_OTHER, "Modifying QP from RST to INIT.\n");
// Read the state back to confirm the transition took effect.
517 vstat= VAPI_query_qp(qp->hca_hndl,
523 if (vstat != VAPI_OK) {
524 CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
528 /* Move from INIT to RTR */
529 /* Change QP to RTR */
530 CDEBUG(D_OTHER, "Changing QP state to RTR\n");
532 QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
534 qp_attr.qp_state = VAPI_RTR;
535 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
537 CDEBUG(D_OTHER, "INIT to RTR- qp_state : qp_attr_mask = 0X%x\n", qp_attr_mask);
539 vstat = VAPI_modify_qp(qp->hca_hndl,
545 if (vstat != VAPI_OK) {
546 CERROR("Failed modifying QP from INIT to RTR. %s\n",VAPI_strerror(vstat));
550 CDEBUG(D_OTHER, "Modifying QP from INIT to RTR.\n");
552 vstat= VAPI_query_qp(qp->hca_hndl,
558 if (vstat != VAPI_OK) {
559 CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
563 /* RTR to RTS - Change QP to RTS */
564 CDEBUG(D_OTHER, "Changing QP state to RTS\n");
566 QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
568 qp_attr.qp_state = VAPI_RTS;
569 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
571 qp_attr.sq_psn = START_SQ_PSN;
572 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_SQ_PSN);
574 vstat = VAPI_modify_qp(qp->hca_hndl,
580 if (vstat != VAPI_OK) {
581 CERROR("Failed modifying QP from RTR to RTS. %s:%s\n",
582 VAPI_strerror_sym(vstat),
583 VAPI_strerror(vstat));
587 CDEBUG(D_OTHER, "Modifying QP from RTR to RTS. \n");
589 vstat= VAPI_query_qp(qp->hca_hndl,
595 if (vstat != VAPI_OK) {
596 CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
601 // a QP is at RTS state NOW
604 CDEBUG(D_OTHER, "IBNAL- UD qp is at RTS NOW\n");
613 // initialize a RC qp state to RTR and RTS
614 // RC transport service
/*
 * init_qp_RC() — walks a reliable-connection QP through RST -> INIT -> RTR
 * -> RTS.  Compared to the UD variant above, RTR additionally programs the
 * address vector (dlid from the peer), path MTU, receive PSN, RDMA
 * read/atomic depth and destination QP number; RTS adds timeout/retry/RNR
 * parameters.  Peer dlid and rqp_num must already be filled in *qp (done in
 * IB_Open_HCA() after the QP-info exchange).
 * NOTE(review): VAPI call argument tails, pkey_ix assignment, and error
 * return statements are missing from this sampled view.
 */
617 init_qp_RC(QP_info *qp, int qp_index)
619 VAPI_qp_attr_t qp_attr;
620 VAPI_qp_init_attr_t qp_init_attr;
621 VAPI_qp_attr_mask_t qp_attr_mask;
622 VAPI_qp_cap_t qp_cap;
625 /* Move from RST to INIT */
626 /* Change QP to INIT */
628 CDEBUG(D_OTHER, "Changing QP state to INIT qp-index = %d\n", qp_index);
630 QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
632 qp_attr.qp_state = VAPI_INIT;
633 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
635 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
638 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PKEY_IX);
640 CDEBUG(D_OTHER, "pkey_ix qp_attr_mask = 0X%x\n", qp_attr_mask);
642 qp_attr.port = qp->port;
643 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PORT);
645 CDEBUG(D_OTHER, "port qp_attr_mask = 0X%x\n", qp_attr_mask);
// Allow incoming RDMA WRITE and RDMA READ on this QP.
647 qp_attr.remote_atomic_flags = VAPI_EN_REM_WRITE | VAPI_EN_REM_READ;
648 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_REMOTE_ATOMIC_FLAGS);
650 CDEBUG(D_OTHER, "remote_atomic_flags qp_attr_mask = 0X%x\n", qp_attr_mask);
652 /* If I do not set this mask, I get an error from HH. QPM should catch it */
654 vstat = VAPI_modify_qp(qp->hca_hndl,
660 if (vstat != VAPI_OK) {
661 CERROR("Failed modifying QP from RST to INIT. %s\n",VAPI_strerror(vstat));
665 vstat= VAPI_query_qp(qp->hca_hndl,
671 if (vstat != VAPI_OK) {
672 CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
676 /* Move from INIT to RTR */
677 /* Change QP to RTR */
678 CDEBUG(D_OTHER, "Changing QP state to RTR qp_indexi %d\n", qp_index);
680 QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
681 qp_attr.qp_state = VAPI_RTR;
683 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
685 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
// Address vector: route to the peer's LID recorded in qp->dlid.
687 qp_attr.av.sl = 0;/* RESPONDER_SL */
688 qp_attr.av.grh_flag = FALSE;
689 qp_attr.av.dlid = qp->dlid;/*RESPONDER_LID;*/
690 qp_attr.av.static_rate = 0;
691 qp_attr.av.src_path_bits = 0;
692 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_AV);
694 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
696 qp_attr.path_mtu = MTU_2048;// default is MTU_2048
697 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PATH_MTU);
699 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
701 qp_attr.rq_psn = START_RQ_PSN;
702 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_RQ_PSN);
704 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
706 qp_attr.qp_ous_rd_atom = NUM_WQE;
707 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_OUS_RD_ATOM);
709 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
712 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_PKEY_IX);
714 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
716 qp_attr.min_rnr_timer = 10;
717 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_MIN_RNR_TIMER);
719 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
// Connect to the remote QP number learned during bring-up.
721 qp_attr.dest_qp_num = qp->rqp_num;
723 CDEBUG(D_OTHER, "remote qp num %d\n", qp->rqp_num);
725 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_DEST_QP_NUM);
727 CDEBUG(D_OTHER, "qp_state qp_attr_mask = 0X%x\n", qp_attr_mask);
729 vstat = VAPI_modify_qp(qp->hca_hndl,
736 if (vstat != VAPI_OK) {
737 CERROR("Failed modifying QP from INIT to RTR. qp_index %d - %s\n",
738 qp_index, VAPI_strerror(vstat));
742 vstat= VAPI_query_qp(qp->hca_hndl,
748 if (vstat != VAPI_OK) {
749 CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
753 /* RTR to RTS - Change QP to RTS */
754 CDEBUG(D_OTHER, "Changing QP state to RTS\n");
756 QP_ATTR_MASK_CLR_ALL(qp_attr_mask);
758 qp_attr.qp_state = VAPI_RTS;
759 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_QP_STATE);
761 qp_attr.sq_psn = START_SQ_PSN;
762 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_SQ_PSN);
764 qp_attr.timeout = 0x18;
765 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_TIMEOUT);
767 qp_attr.retry_count = 10;
768 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_RETRY_COUNT);
770 qp_attr.rnr_retry = 14;
771 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_RNR_RETRY);
773 qp_attr.ous_dst_rd_atom = 100;
774 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_OUS_DST_RD_ATOM);
776 qp_attr.min_rnr_timer = 5;
777 QP_ATTR_MASK_SET(qp_attr_mask,QP_ATTR_MIN_RNR_TIMER);
779 vstat = VAPI_modify_qp(qp->hca_hndl,
785 if (vstat != VAPI_OK) {
786 CERROR("Failed modifying QP from RTR to RTS. %s:%s\n",
787 VAPI_strerror_sym(vstat), VAPI_strerror(vstat));
791 vstat= VAPI_query_qp(qp->hca_hndl,
797 if (vstat != VAPI_OK) {
798 CERROR("Failed query QP. %s\n",VAPI_strerror(vstat));
803 // a QP is at RTS state NOW
806 CDEBUG(D_OTHER, "IBNAL- RC qp is at RTS NOW\n");
/*
 * IB_Open_HCA() — full bring-up of the IB interface:
 *   open/acquire the HCA handle, query caps and both ports, allocate a PD,
 *   register send/recv memory (createMemRegion), create one CQ shared by SQ
 *   and RQ, create NUM_QPS QPs, publish local QP info and exchange it with
 *   the peer (shared-memory/socket agent via k_server_thread), drive each QP
 *   to RTS (init_qp_RC), pre-post receive buffers, and install the async
 *   event handler.
 * NOTE(review): vstat from VAPI_open_hca() is immediately overwritten by
 * EVAPI_get_hca_hndl() before being checked — an open failure is masked.
 * NOTE(review): the final loop initializes MSB_mutex[], while the globals at
 * the top of the file declare MSBuf_mutex[] — verify the intended name.
 * NOTE(review): many argument tails, error returns and #endif lines are
 * missing from this sampled view.
 */
814 IB_Open_HCA(kibnal_data_t *kib_data)
818 VAPI_cqe_num_t cqe_active_num;
821 int Num_posted_recv_buf;
824 CDEBUG(D_PORTALS, "Opening an HCA\n");
826 vstat = VAPI_open_hca(HCA_ID, &Hca_hndl);
827 vstat = EVAPI_get_hca_hndl(HCA_ID, &Hca_hndl);
828 if (vstat != VAPI_OK) {
829 CERROR("Failed opening the HCA: %s. %s...\n",HCA_ID,VAPI_strerror(vstat));
834 vstat = VAPI_query_hca_cap(Hca_hndl, &Hca_vendor, &Hca_cap);
835 if (vstat != VAPI_OK) {
836 CERROR("Failed query hca cap %s\n",VAPI_strerror(vstat));
840 /* Get port 1 info */
841 vstat = VAPI_query_hca_port_prop(Hca_hndl, HCA_PORT_1 , &Hca_port_1_props);
842 if (vstat != VAPI_OK) {
843 CERROR("Failed query port cap %s\n",VAPI_strerror(vstat));
847 /* Get port 2 info */
848 vstat = VAPI_query_hca_port_prop(Hca_hndl, HCA_PORT_2, &Hca_port_2_props);
849 if (vstat != VAPI_OK) {
850 CERROR("Failed query port cap %s\n",VAPI_strerror(vstat));
855 CDEBUG(D_PORTALS, "Allocating PD \n");
856 vstat = VAPI_alloc_pd(Hca_hndl,&Pd_hndl);
857 if (vstat != VAPI_OK) {
858 CERROR("Failed allocating a PD. %s\n",VAPI_strerror(vstat));
862 vstat = createMemRegion(Hca_hndl, Pd_hndl);
863 if (vstat != VAPI_OK) {
864 CERROR("Failed registering a memory region.%s\n",VAPI_strerror(vstat));
868 /* Create CQ for RQ*/
869 CDEBUG(D_PORTALS, "Creating a send completion queue\n");
871 vstat = VAPI_create_cq(Hca_hndl,
876 if (vstat != VAPI_OK) {
877 CERROR("Failed creating a CQ. %s\n",VAPI_strerror(vstat));
// Warn if the HCA gave us exactly the requested depth (no headroom).
881 if(NUM_CQE == cqe_active_num) {
882 CERROR("VAPI_create_cq: NUM_CQE EQ cqe_active_num \n");
885 CDEBUG(D_NET, "VAPI_create_cq: NUM_CQE %d , actual cqe_active_num %d \n",
886 NUM_CQE, cqe_active_num);
// One CQ is shared for both send and receive completions.
889 Cq_SQ_hndl = Cq_hndl;
890 Cq_RQ_hndl = Cq_hndl;
895 for(i=0; i < NUM_QPS; i++) {
896 QP_list[i].pd_hndl = Pd_hndl;
897 QP_list[i].hca_hndl = Hca_hndl;
898 // sq rq use the same Cq_hndl
899 QP_list[i].sq_cq_hndl = Cq_hndl;
900 QP_list[i].rq_cq_hndl = Cq_hndl;
901 vstat = create_qp(&QP_list[i], i);
902 if (vstat != VAPI_OK) {
903 CERROR("Failed creating a QP %d %s\n",i, VAPI_strerror(vstat));
// Summarize the HCA configuration for handlers and the exchange thread.
912 Hca_data.hca_hndl = Hca_hndl; // HCA handle
913 Hca_data.pd_hndl = Pd_hndl; // protection domain
914 Hca_data.port = 1; // port number
915 Hca_data.num_qp = NUM_QPS; // number of qp used
917 for(i=0; i < NUM_QPS; i++) {
918 Hca_data.qp_ptr[i] = &QP_list[i]; // point to QP_list
921 Hca_data.num_cq = NUM_CQ; // number of cq used
922 Hca_data.cq_hndl = Cq_hndl; //
923 Hca_data.sq_cq_hndl = Cq_SQ_hndl; //
924 Hca_data.rq_cq_hndl = Cq_RQ_hndl; //
925 Hca_data.kib_data = kib_data; //
926 Hca_data.slid = QP_list[0].slid;//
930 #ifdef USE_SHARED_MEMORY_AND_SOCKET
933 * + use a shared-memory between a user thread and a kernel thread
934 * for HCA's data exchange on the same node
935 * + use socket in user mode to exhange HCA's data with a remote node
939 R_QP_data.opcode = SEND_QP_INFO;
940 R_QP_data.length = sizeof(L_QP_data);
942 for(i=0; i < NUM_QPS; i++) {
943 // my slid will be used in a remote node as dlid
944 R_QP_data.dlid[i] = QP_list[i].slid;
945 // my qp_num will be used in remote node as remote_qp_number
946 // RC is used here so we need dlid and rqp_num
947 R_QP_data.rqp_num[i] = QP_list[i].qp_num ;
950 // create a kernel thread for exchanging HCA's data
951 // R_QP_data will be exchanged with a remote node
953 kernel_thread(k_server_thread, &R_QP_data, 0); //
954 // check if the HCA's data have been updated by kernel_thread
955 // loop until the HCA's data is updated
956 // make sure that uagent is running
958 // QP info is exchanged with a remote node
960 schedule_timeout(1000);
961 if(R_QP_data.opcode == RECV_QP_INFO) {
962 CDEBUG(D_NET, "HCA's data is being updated\n");
969 #ifdef USE_SHARED_MEMORY_AND_MULTICAST
972 * + use a shared-memory between a user thread and a kernel thread
973 * for HCA's data exchange on the same node
974 * + use Infiniband UD/multicast in user mode to exchange HCA's data
// Install the peer's QP numbers and LIDs received during the exchange.
983 for(i=0; i < NUM_QPS; i++) {
984 qp = (QP_info *) &QP_list[i];
985 QP_list[i].rqp_num = R_QP_data.rqp_num[i]; // remote qp number
986 QP_list[i].dlid = R_QP_data.dlid[i]; // remote dlid
989 // already have remote_qp_num and dlid information
990 // initialize QP to RTR/RTS state
992 for(i=0; i < NUM_QPS; i++) {
993 vstat = init_qp_RC(&QP_list[i], i);
994 if (vstat != VAPI_OK) {
995 CERROR("Failed change a QP %d to RTS state%s\n",
996 i,VAPI_strerror(vstat));
1001 // post receiving buffer before any send happened
1003 Num_posted_recv_buf = post_recv_bufs( (VAPI_wr_id_t ) START_RECV_WRQ_ID);
1005 // for irregular completion event or some unexpected failure event
1006 vstat = IB_Set_Async_Event_Handler(Hca_data, &kibnal_data);
1007 if (vstat != VAPI_OK) {
1008 CERROR("IB_Set_Async_Event_Handler failed: %d\n", vstat);
1013 CDEBUG(D_PORTALS, "IBNAL- done with IB_Open_HCA\n");
1015 for(i=0; i < NUM_MBUF; i++) {
1016 spin_lock_init(&MSB_mutex[i]);
/*
 * IB_Set_Event_Handler() — registers the CQ completion-event handler on the
 * HCA (VAPI_set_comp_event_handler, with the EVAPI_set_comp_eventh variant
 * shown as the documented alternative), then arms the CQ for the next
 * completion notification with VAPI_req_comp_notif().
 * NOTE(review): call argument tails and the error-return statements are
 * missing from this sampled view.
 */
1025 Function: IB_Set_Event_Handler()
1027 IN Hca_info hca_data
1028 IN kibnal_data_t *kib_data -- private data
1031 return: VAPI_OK - success
1037 IB_Set_Event_Handler(HCA_info hca_data, kibnal_data_t *kib_data)
1040 EVAPI_compl_handler_hndl_t comp_handler_hndl;
1042 // register CQE_Event_Handler
1044 vstat = VAPI_set_comp_event_handler(hca_data.hca_hndl,
1049 or use extended VAPI function
1050 vstat = EVAPI_set_comp_eventh(hca_data.hca_hndl,
1058 if (vstat != VAPI_OK) {
1059 CERROR("IB_Set_Event_Handler: failed EVAPI_set_comp_eventh for"
1060 " HCA ID = %s (%s).\n", HCA_ID, VAPI_strerror(vstat));
1064 // issue a request for completion event notification
1065 vstat = VAPI_req_comp_notif(hca_data.hca_hndl,
1069 if (vstat != VAPI_OK) {
1070 CERROR("IB_Set_Event_Handler: failed VAPI_req_comp_notif for HCA ID"
1071 " = %s (%s).\n", HCA_ID, VAPI_strerror(vstat));
/*
 * IB_Set_Async_Event_Handler() — installs async_event_handler() on the HCA
 * via VAPI_set_async_event_handler() to catch irregular completion or
 * unexpected failure events.  Returns VAPI_OK on success.
 * NOTE(review): trailing call arguments and return statements are missing
 * from this sampled view.
 */
1080 Function: IB_Set_Async_Event_Handler()
1082 IN HCA_info hca_data
1083 IN kibnal_data_t *kib_data -- private data
1086 return: VAPI_OK - success
1093 IB_Set_Async_Event_Handler(HCA_info hca_data, kibnal_data_t *kib_data)
1098 // register an asynchronous event handler for this HCA
1101 vstat= VAPI_set_async_event_handler(hca_data.hca_hndl,
1102 async_event_handler,
1105 if (vstat != VAPI_OK) {
1106 CERROR("IB_Set_Async_Event_Handler: failed VAPI_set_async_comp_event_handler"
1107 " for HCA ID = %s (%s).\n", HCA_ID, VAPI_strerror(vstat));
1115 // close this Infiniband HCA interface
1116 // release allocated resources to system
/*
 * IB_Close_HCA() (function header line missing in this view) — teardown in
 * reverse bring-up order: destroy all QPs, destroy the CQ, deregister memory
 * regions, then close the HCA handle.  Errors are logged per step.
 * NOTE(review): all QPs share ONE CQ (see IB_Open_HCA), yet VAPI_destroy_cq
 * is invoked once per QP — iterations after the first will try to destroy an
 * already-destroyed CQ; confirm against the full original.
 */
1127 CDEBUG(D_PORTALS, "Destroying QP\n");
1129 for(i=0; i < NUM_QPS; i++) {
1130 vstat = VAPI_destroy_qp(QP_list[i].hca_hndl, QP_list[i].qp_hndl);
1131 if (vstat != VAPI_OK) {
1132 CERROR("Failed destroying QP %d. %s\n", i, VAPI_strerror(vstat));
1139 CDEBUG(D_PORTALS, "Destroying CQ\n");
1140 for(i=0; i < NUM_QPS; i++) {
1141 // send_cq and receive_cq share the same CQ
1142 // so only destroy one of them
1143 vstat = VAPI_destroy_cq(QP_list[i].hca_hndl, QP_list[i].sq_cq_hndl);
1144 if (vstat != VAPI_OK) {
1145 CERROR("Failed destroying CQ %d. %s\n", i, VAPI_strerror(vstat));
1152 /* Destroy Memory Region */
1153 CDEBUG(D_PORTALS, "Deregistering MR\n");
1154 for(i=0; i < NUM_QPS; i++) {
1155 vstat = deleteMemRegion(&QP_list[i], i);
1156 if (vstat != VAPI_OK) {
1157 CERROR("Failed deregister mem reg %d. %s\n",i, VAPI_strerror(vstat));
1167 CDEBUG(D_PORTALS, "Closing HCA\n");
1168 vstat = VAPI_close_hca(Hca_hndl);
1169 if (vstat != VAPI_OK) {
1170 CERROR("Failed to close HCA. %s\n", VAPI_strerror(vstat));
1175 CDEBUG(D_PORTALS, "IBNAL- Done with closing HCA \n");
/*
 * createMemRegion() — allocates and registers the pre-pinned buffer pools:
 *   - NUM_ENTRY send buffers (MSbuf_list), each KB_32, registered with
 *     local-write + remote-write/read access; entries NUM_ENTRY..NUM_MBUF-1
 *     are left unregistered and reserved for on-demand RDMA buffers,
 *   - NUM_ENTRY recv buffers (MRbuf_list), registered identically,
 *   - copies per-QP MR bookkeeping from MSbuf_list into QP_list.
 * Returns VAPI_ENOMEM on allocation failure; registration-failure paths are
 * truncated in this view.
 * NOTE(review): the recv-pool CDEBUG says "send memory" — copy/paste slip.
 * NOTE(review): on a mid-loop failure, buffers allocated/registered by
 * earlier iterations are not released in the visible code.
 */
1182 createMemRegion(VAPI_hca_hndl_t hca_hndl,
1183 VAPI_pd_hndl_t pd_hndl)
1188 VAPI_mr_hndl_t rep_mr_hndl;
1193 // send registered memory region
1194 for(i=0; i < NUM_ENTRY; i++) {
1195 MSbuf_list[i].buf_size = KB_32;
1196 PORTAL_ALLOC(bufptr, MSbuf_list[i].buf_size);
1197 if(bufptr == NULL) {
1198 CDEBUG(D_MALLOC,"Failed to malloc a block of send memory, qix %d size %d\n",
1199 i, MSbuf_list[i].buf_size);
1200 CERROR("Failed to malloc a block of send memory, qix %d size %d\n",
1201 i, MSbuf_list[i].buf_size);
1202 return(VAPI_ENOMEM);
1206 mrw.pd_hndl= pd_hndl;
1207 mrw.start = MSbuf_list[i].buf_addr = (VAPI_virt_addr_t)(MT_virt_addr_t) bufptr;
1208 mrw.size = MSbuf_list[i].buf_size;
1209 mrw.acl = VAPI_EN_LOCAL_WRITE |
1210 VAPI_EN_REMOTE_WRITE |
1211 VAPI_EN_REMOTE_READ;
1213 // register send memory region
1214 vstat = VAPI_register_mr(hca_hndl,
1219 // this memory region is going to be reused until deregister is called
1220 if(vstat != VAPI_OK) {
1221 CERROR("Failed registering a mem region qix %d Addr=%p, Len=%d. %s\n",
1222 i, mrw.start, mrw.size, VAPI_strerror(vstat));
1226 MSbuf_list[i].mr = rep_mr;
1227 MSbuf_list[i].mr_hndl = rep_mr_hndl;
1228 MSbuf_list[i].bufptr = bufptr;
1229 MSbuf_list[i].buf_addr = rep_mr.start;
1230 MSbuf_list[i].status = BUF_REGISTERED;
1231 MSbuf_list[i].ref_count = 0;
1232 MSbuf_list[i].buf_type = REG_BUF;
1233 MSbuf_list[i].raddr = 0x0;
1234 MSbuf_list[i].rkey = 0x0;
1237 // RDMA buffer is not reserved for RDMA WRITE/READ
1239 for(i=NUM_ENTRY; i< NUM_MBUF; i++) {
1240 MSbuf_list[i].status = BUF_UNREGISTERED;
1241 MSbuf_list[i].buf_type = RDMA_BUF;
1245 // recv registered memory region
1246 for(i=0; i < NUM_ENTRY; i++) {
1247 MRbuf_list[i].buf_size = KB_32;
1248 PORTAL_ALLOC(bufptr, MRbuf_list[i].buf_size);
1250 if(bufptr == NULL) {
1251 CDEBUG(D_MALLOC, "Failed to malloc a block of send memory, qix %d size %d\n",
1252 i, MRbuf_list[i].buf_size);
1253 return(VAPI_ENOMEM);
1257 mrw.pd_hndl= pd_hndl;
1258 mrw.start = (VAPI_virt_addr_t)(MT_virt_addr_t) bufptr;
1259 mrw.size = MRbuf_list[i].buf_size;
1260 mrw.acl = VAPI_EN_LOCAL_WRITE |
1261 VAPI_EN_REMOTE_WRITE |
1262 VAPI_EN_REMOTE_READ;
1264 // register recv memory region
1265 vstat = VAPI_register_mr(hca_hndl,
1270 // this memory region is going to be reused until deregister is called
1271 if(vstat != VAPI_OK) {
1272 CERROR("Failed registering a mem region qix %d Addr=%p, Len=%d. %s\n",
1273 i, mrw.start, mrw.size, VAPI_strerror(vstat));
1277 MRbuf_list[i].mr = rep_mr;
1278 MRbuf_list[i].mr_hndl = rep_mr_hndl;
1279 MRbuf_list[i].bufptr = bufptr;
1280 MRbuf_list[i].buf_addr = rep_mr.start;
1281 MRbuf_list[i].status = BUF_REGISTERED;
1282 MRbuf_list[i].ref_count = 0;
1283 MRbuf_list[i].buf_type = REG_BUF;
1284 MRbuf_list[i].raddr = 0x0;
1285 MRbuf_list[i].rkey = rep_mr.r_key;
1286 MRbuf_list[i].lkey = rep_mr.l_key;
1290 // keep extra information for a qp
1291 for(i=0; i < NUM_QPS; i++) {
1292 QP_list[i].mr_hndl = MSbuf_list[i].mr_hndl;
1293 QP_list[i].mr = MSbuf_list[i].mr;
1294 QP_list[i].bufptr = MSbuf_list[i].bufptr;
1295 QP_list[i].buf_addr = MSbuf_list[i].buf_addr;
1296 QP_list[i].buf_size = MSbuf_list[i].buf_size;
1297 QP_list[i].raddr = MSbuf_list[i].raddr;
1298 QP_list[i].rkey = MSbuf_list[i].rkey;
1299 QP_list[i].lkey = MSbuf_list[i].lkey;
1302 CDEBUG(D_PORTALS, "IBNAL- done VAPI_ret_t createMemRegion \n");
1306 } /* createMemRegion */
/*
 * deleteMemRegion() — releases the send and recv buffers for slot 'qix':
 * frees the buffer memory, then deregisters the corresponding MR on the
 * QP's HCA handle; same sequence for MSbuf_list and MRbuf_list.
 * NOTE(review): PORTAL_FREE runs BEFORE VAPI_deregister_mr for both pools —
 * the memory is released while still registered with the HCA.  The usual
 * safe order is deregister first, then free; confirm against the original.
 */
1311 deleteMemRegion(QP_info *qp, int qix)
1316 // free send memory associated with this memory region
1318 PORTAL_FREE(MSbuf_list[qix].bufptr, MSbuf_list[qix].buf_size);
1321 vstat = VAPI_deregister_mr(qp->hca_hndl, MSbuf_list[qix].mr_hndl);
1323 if(vstat != VAPI_OK) {
1324 CERROR("Failed deregistering a send mem region qix %d %s\n",
1325 qix, VAPI_strerror(vstat));
1330 // free recv memory associated with this memory region
1332 PORTAL_FREE(MRbuf_list[qix].bufptr, MRbuf_list[qix].buf_size);
1335 vstat = VAPI_deregister_mr(qp->hca_hndl, MRbuf_list[qix].mr_hndl);
1337 if(vstat != VAPI_OK) {
1338 CERROR("Failed deregistering a recv mem region qix %d %s\n",
1339 qix, VAPI_strerror(vstat));
1348 // polling based event handling
1349 // + a daemon process
1350 // + poll the CQ and check what is in the CQ
1351 // + process incoming CQ event
// Shared state for the polling receive thread below: the most recent
// RTS/CTS handshake payload, and a flag set when a CTS reply has arrived.
1356 RDMA_Info_Exchange Rdma_info;
1357 int Cts_Message_arrived = NO;
1359 void k_recv_thread(HCA_info *hca_data)
1362 VAPI_wc_desc_t comp_desc;
1363 unsigned long polling_count = 0;
1364 u_int32_t timeout_usec;
1365 unsigned int priority = 100;
1366 unsigned int length;
1367 VAPI_wr_id_t wrq_id;
1368 u_int32_t transferred_data_length; /* Num. of bytes transferred */
1370 VAPI_virt_addr_t bufaddr;
1371 unsigned long buf_size = 0;
1372 QP_info *qp; // point to QP_list
1374 kportal_daemonize("k_recv_thread"); // make it as a daemon process
1377 timeout_usec = 100; // how is the impact on the performance
1379 // send Q and receive Q are using the same CQ
1380 // so only poll one CQ for both operations
1382 CDEBUG(D_NET, "IBNAL- enter kibnal_recv_thread\n");
1383 CDEBUG(D_NET, "hca_hndl = 0X%x, cq_hndl=0X%x\n",
1384 hca_data->hca_hndl,hca_data->cq_hndl);
1386 qp = hca_data->qp_ptr;
1388 CDEBUG(D_NET, "in recv_thread qp is NULL\n");
1389 CDEBUG(D_NET, "Exit from recv_thread qp is NULL\n");
1393 CDEBUG(D_NET, "in recv_thread qp is 0X%X\n", qp);
1396 CDEBUG(D_NET, "kibnal_recv_thread - enter event driver polling loop\n");
1408 // send Q and receive Q are using the same CQ
1409 // so only poll one CQ for both operations
1412 vstat = VAPI_poll_cq(hca_data->hca_hndl,hca_data->cq_hndl, &comp_desc);
1414 if (vstat == VAPI_CQ_EMPTY) {
1415 // there is no event in CQE
1419 if (vstat != (VAPI_OK)) {
1420 CERROR("error while polling completion queuei vstat %d \n", vstat);
1425 // process the complete event
1426 switch(comp_desc.opcode) {
1427 case VAPI_CQE_SQ_SEND_DATA:
1428 // about the Send Q ,POST SEND completion
1429 // who needs this information
1431 // mark MSbuf_list[wr_id].status = BUF_REGISTERED
1433 wrq_id = comp_desc.id;
1435 if(RDMA_OP_ID < wrq_id) {
1436 // this RDMA message id, adjust it to the right entry
1437 wrq_id = wrq_id - RDMA_OP_ID;
1438 vstat = VAPI_deregister_mr(qp->hca_hndl, Local_rdma_info.send_rdma_mr_hndl);
1441 if(vstat != VAPI_OK) {
1442 CERROR("VAPI_CQE_SQ_SEND_DATA: Failed deregistering a RDMAi recv" " mem region %s\n", VAPI_strerror(vstat));
1445 if((RDMA_CTS_ID <= wrq_id) && (RDMA_OP_ID < wrq_id)) {
1446 // RTS or CTS send complete, release send buffer
1447 if(wrq_id >= RDMA_RTS_ID)
1448 wrq_id = wrq_id - RDMA_RTS_ID;
1450 wrq_id = wrq_id - RDMA_CTS_ID;
1453 spin_lock(&MSB_mutex[(int) wrq_id]);
1454 MRbuf_list[wrq_id].status = BUF_REGISTERED;
1455 spin_unlock(&MSB_mutex[(int) wrq_id]);
1457 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_SEND_DATA\n");
1460 case VAPI_CQE_SQ_RDMA_WRITE:
1461 // about the Send Q, RDMA write completion
1462 // who needs this information
1463 // data is successfully write from pource to destionation
1466 // mark MSbuf_list[wr_id].status = BUF_REGISTERED
1467 // de-register rdma buffer
1470 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_RDMA_WRITE\n");
1473 case VAPI_CQE_SQ_RDMA_READ:
1475 // RDMA read completion
1476 // who needs this information
1477 // data is successfully read from destionation to source
1478 CDEBUG(D_NET, "CQE opcode- VAPI_CQE_SQ_RDMA_READ\n");
1481 case VAPI_CQE_SQ_COMP_SWAP:
1483 // RDMA write completion
1484 // who needs this information
1486 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_COMP_SWAP\n");
1489 case VAPI_CQE_SQ_FETCH_ADD:
1491 // RDMA write completion
1492 // who needs this information
1494 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_FETCH_ADD\n");
1497 case VAPI_CQE_SQ_BIND_MRW:
1499 // RDMA write completion
1500 // who needs this information
1502 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_BIND_MRW\n");
1505 case VAPI_CQE_RQ_SEND_DATA:
1506 // about the Receive Q
1507 // process the incoming data and
1508 // forward it to .....
1509 // a completion recevie event is arriving at CQ
1510 // issue a recevie to get this arriving data out from CQ
1511 // pass the receiving data for further processing
1512 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_SEND_DATA\n");
1513 wrq_id = comp_desc.id ;
1514 transferred_data_length = comp_desc.byte_len;
1516 if((wrq_id >= RDMA_CTS_ID) && (wrq_id < RDMA_OP_ID)) {
1517 // this is RTS/CTS message
1518 // process it locally and don't pass it to portals layer
1519 // adjust wrq_id to get the right entry in MRbfu_list
1521 if(wrq_id >= RDMA_RTS_ID)
1522 wrq_id = wrq_id - RDMA_RTS_ID;
1524 wrq_id = wrq_id - RDMA_CTS_ID;
1526 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) MRbuf_list[wrq_id].buf_addr;
1527 MRbuf_list[wrq_id].status = BUF_INUSE;
1528 memcpy(&Rdma_info, &bufaddr, sizeof(RDMA_Info_Exchange));
1530 if(Ready_To_send == Rdma_info.opcode)
1531 // an RTS request message from remote node
1532 // prepare local RDMA buffer and send local rdma info to
1534 CTS_handshaking_protocol(&Rdma_info);
1536 if((Clear_To_send == Rdma_info.opcode) &&
1537 (RDMA_BUFFER_RESERVED == Rdma_info.flag))
1538 Cts_Message_arrived = YES;
1540 if(RDMA_BUFFER_UNAVAILABLE == Rdma_info.flag)
1541 CERROR("RDMA operation abort-RDMA_BUFFER_UNAVAILABLE\n");
1545 // this is an incoming mesage for portals layer
1546 // move to PORTALS layer for further processing
1549 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t)
1550 MRbuf_list[wrq_id].buf_addr;
1552 MRbuf_list[wrq_id].status = BUF_INUSE;
1553 transferred_data_length = comp_desc.byte_len;
1555 kibnal_rx(hca_data->kib_data,
1557 transferred_data_length,
1558 MRbuf_list[wrq_id].buf_size,
1562 // repost this receiving buffer and makr it at BUF_REGISTERED
1564 vstat = repost_recv_buf(qp, wrq_id);
1565 if(vstat != (VAPI_OK)) {
1566 CERROR("error while polling completion queue\n");
1569 MRbuf_list[wrq_id].status = BUF_REGISTERED;
1574 case VAPI_CQE_RQ_RDMA_WITH_IMM:
1575 // about the Receive Q
1576 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_RDMA_WITH_IMM\n");
1578 wrq_id = comp_desc.id ;
1579 transferred_data_length = comp_desc.byte_len;
1581 if(wrq_id == RDMA_OP_ID) {
1582 // this is RDAM op , locate the RDAM memory buffer address
1584 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) Local_rdma_info.raddr;
1586 transferred_data_length = comp_desc.byte_len;
1588 kibnal_rx(hca_data->kib_data,
1590 transferred_data_length,
1591 Local_rdma_info.buf_length,
1594 // de-regiser this RDAM receiving memory buffer
1595 // too early ?? test & check
1596 vstat = VAPI_deregister_mr(qp->hca_hndl, Local_rdma_info.recv_rdma_mr_hndl);
1597 if(vstat != VAPI_OK) {
1598 CERROR("VAPI_CQE_RQ_RDMA_WITH_IMM: Failed deregistering a RDMA"
1599 " recv mem region %s\n", VAPI_strerror(vstat));
1603 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_RDMA_WITH_IMM\n");
1606 case VAPI_CQE_INVAL_OPCODE:
1608 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_INVAL_OPCODE\n");
1612 CDEBUG(D_NET, "CQE opcode-unknown opcode\n");
1616 schedule_timeout(RECEIVING_THREAD_TIMEOUT);//how often do we need to poll CQ
1618 }// receiving while loop
// CQE completion-event handler registered with the HCA (event-driven CQE
// handling path). Polls the shared send/receive CQ once, dispatches on the
// completion opcode, then re-arms completion notification via
// VAPI_req_comp_notif() before returning.
// NOTE: this listing is elided — some lines (parameters, braces, break
// statements) are omitted from the visible text.
1624 void CQE_event_handler(VAPI_hca_hndl_t hca_hndl,
1625 VAPI_cq_hndl_t cq_hndl,
1629 VAPI_wc_desc_t comp_desc;
1630 unsigned long polling_count = 0;
1631 u_int32_t timeout_usec;
1632 unsigned int priority = 100;
1633 unsigned int length;
1634 VAPI_wr_id_t wrq_id;
1635 u_int32_t transferred_data_length; /* Num. of bytes transferred */
1637 VAPI_virt_addr_t bufaddr;
1638 unsigned long buf_size = 0;
// NOTE(review): `qp` is never assigned in the visible code before it is
// dereferenced (qp->hca_hndl) and passed to repost_recv_buf() below —
// confirm it is initialized in the elided lines.
1639 QP_info *qp; // points into QP_list
1642 // send Q and receive Q are using the same CQ
1643 // so only poll one CQ for both operations
1645 CDEBUG(D_NET, "IBNAL- enter CQE_event_handler\n");
1646 printk("IBNAL- enter CQE_event_handler\n");
// the opaque callback argument carries the per-HCA state
1648 hca_data = (HCA_info *) private;
1655 vstat = VAPI_poll_cq(hca_data->hca_hndl,hca_data->cq_hndl, &comp_desc);
1657 if (vstat == VAPI_CQ_EMPTY) {
1658 CDEBUG(D_NET, "CQE_event_handler: there is no event in CQE, how could"
1659 " this " "happened \n");
1660 printk("CQE_event_handler: there is no event in CQE, how could"
1661 " this " "happened \n");
1665 if (vstat != (VAPI_OK)) {
1666 CDEBUG(D_NET, "error while polling completion queue vstat %d - %s\n",
1667 vstat, VAPI_strerror(vstat));
1668 printk("error while polling completion queue vstat %d - %s\n",
1669 vstat, VAPI_strerror(vstat));
1674 // process the completion event
1675 switch(comp_desc.opcode) {
1676 case VAPI_CQE_SQ_SEND_DATA:
1677 // about the Send Q, POST SEND completion
1678 // who needs this information
1680 // mark MSbuf_list[wr_id].status = BUF_REGISTERED
1682 wrq_id = comp_desc.id;
1684 #ifdef IBNAL_SELF_TESTING
1685 if(wrq_id == SEND_RECV_TEST_ID) {
1686 printk("IBNAL_SELF_TESTING - VAPI_CQE_SQ_SEND_DATA \n");
// work-request ids above RDMA_OP_ID encode an RDMA send completion
1689 if(RDMA_OP_ID < wrq_id) {
1690 // this RDMA message id, adjust it to the right entry
1691 wrq_id = wrq_id - RDMA_OP_ID;
1692 vstat = VAPI_deregister_mr(qp->hca_hndl,
1693 Local_rdma_info.send_rdma_mr_hndl);
1696 if(vstat != VAPI_OK) {
1697 CERROR(" VAPI_CQE_SQ_SEND_DATA: Failed deregistering a RDMA"
1698 " recv mem region %s\n", VAPI_strerror(vstat));
// NOTE(review): after the adjustment above this condition can only hold
// for ids that were originally above RDMA_OP_ID; the intent was likely
// `(RDMA_CTS_ID <= wrq_id) && (wrq_id < RDMA_OP_ID)` — confirm.
1701 if((RDMA_CTS_ID <= wrq_id) && (RDMA_OP_ID < wrq_id)) {
1702 // RTS or CTS send complete, release send buffer
1703 if(wrq_id >= RDMA_RTS_ID)
1704 wrq_id = wrq_id - RDMA_RTS_ID;
1706 wrq_id = wrq_id - RDMA_CTS_ID;
// NOTE(review): the lock is the send-buffer mutex (MSB_mutex) and the
// comment at 1680 says MSbuf_list, but the RECV list is updated here —
// presumably this should be MSbuf_list[wrq_id]; confirm.
1709 spin_lock(&MSB_mutex[(int) wrq_id]);
1710 MRbuf_list[wrq_id].status = BUF_REGISTERED;
1711 spin_unlock(&MSB_mutex[(int) wrq_id]);
1714 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_SEND_DATA\n");
1718 case VAPI_CQE_SQ_RDMA_WRITE:
1719 // about the Send Q, RDMA write completion
1720 // who needs this information
1721 // data was successfully written from source to destination
1724 // mark MSbuf_list[wr_id].status = BUF_REGISTERED
1725 // de-register rdma buffer
1728 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_RDMA_WRITE\n");
1731 case VAPI_CQE_SQ_RDMA_READ:
1733 // RDMA read completion
1734 // who needs this information
1735 // data was successfully read from destination to source
1736 CDEBUG(D_NET, "CQE opcode- VAPI_CQE_SQ_RDMA_READ\n");
1739 case VAPI_CQE_SQ_COMP_SWAP:
1741 // atomic compare-and-swap completion
1742 // who needs this information
1744 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_COMP_SWAP\n");
1747 case VAPI_CQE_SQ_FETCH_ADD:
1749 // atomic fetch-and-add completion
1750 // who needs this information
1752 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_FETCH_ADD\n");
1755 case VAPI_CQE_SQ_BIND_MRW:
1757 // memory-window bind completion
1758 // who needs this information
1760 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_SQ_BIND_MRW\n");
1763 case VAPI_CQE_RQ_SEND_DATA:
1764 // about the Receive Q
1765 // process the incoming data and
1766 // forward it to .....
1767 // a completion receive event has arrived at the CQ
1768 // issue a receive to get this arriving data out from CQ
1769 // pass the receiving data for further processing
1771 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_SEND_DATA\n");
1773 wrq_id = comp_desc.id ;
1775 #ifdef IBNAL_SELF_TESTING
1780 if(wrq_id == SEND_RECV_TEST_ID) {
1781 printk("IBNAL_SELF_TESTING - VAPI_CQE_RQ_SEND_DATA\n");
1784 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t)
1785 MRbuf_list[ SEND_RECV_TEST_BUF_ID].buf_addr;
1786 MRbuf_list[SEND_RECV_TEST_BUF_ID].status = BUF_INUSE;
// NOTE(review): this copies KB_32 bytes starting at the ADDRESS VARIABLE
// itself (&bufaddr), not from the buffer it points to — presumably the
// source should be the buffer memory; also rbuf must be >= KB_32. Confirm.
1787 memcpy(&rbuf, &bufaddr, KB_32);
1790 for(i=0; i < 16; i++)
1791 printk("rbuf[%d]=%c, ", rbuf[i]);
1794 // repost this receiving buffer and mark it as BUF_REGISTERED
1795 vstat = repost_recv_buf(qp,SEND_RECV_TEST_BUF_ID);
1796 if(vstat != (VAPI_OK)) {
1797 printk("error while polling completion queue\n");
1800 MRbuf_list[SEND_RECV_TEST_BUF_ID].status = BUF_REGISTERED;
1803 transferred_data_length = comp_desc.byte_len;
1805 if((wrq_id >= RDMA_CTS_ID) && (wrq_id < RDMA_OP_ID)) {
1806 // this is RTS/CTS message
1807 // process it locally and don't pass it to portals layer
1808 // adjust wrq_id to get the right entry in MRbuf_list
1810 if(wrq_id >= RDMA_RTS_ID)
1811 wrq_id = wrq_id - RDMA_RTS_ID;
1813 wrq_id = wrq_id - RDMA_CTS_ID;
1815 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t)
1816 MRbuf_list[wrq_id].buf_addr;
1817 MRbuf_list[wrq_id].status = BUF_INUSE;
// NOTE(review): copies from &bufaddr (the local pointer-sized variable),
// not from the message buffer it points to — confirm the intended source.
1818 memcpy(&Rdma_info, &bufaddr, sizeof(RDMA_Info_Exchange));
1820 if(Ready_To_send == Rdma_info.opcode)
1821 // an RTS request message from remote node
1822 // prepare local RDMA buffer and send local rdma info to the peer
1824 CTS_handshaking_protocol(&Rdma_info);
1826 if((Clear_To_send == Rdma_info.opcode) &&
1827 (RDMA_BUFFER_RESERVED == Rdma_info.flag))
1828 Cts_Message_arrived = YES;
1830 if(RDMA_BUFFER_UNAVAILABLE == Rdma_info.flag)
1831 CERROR("RDMA operation abort-RDMA_BUFFER_UNAVAILABLE\n");
1835 // this is an incoming message for portals layer
1836 // move to PORTALS layer for further processing
1839 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t)
1840 MRbuf_list[wrq_id].buf_addr;
1842 MRbuf_list[wrq_id].status = BUF_INUSE;
1843 transferred_data_length = comp_desc.byte_len;
1845 kibnal_rx(hca_data->kib_data,
1847 transferred_data_length,
1848 MRbuf_list[wrq_id].buf_size,
1852 // repost this receiving buffer and mark it as BUF_REGISTERED
1853 vstat = repost_recv_buf(qp, wrq_id);
1854 if(vstat != (VAPI_OK)) {
1855 CERROR("error while polling completion queue\n");
1858 MRbuf_list[wrq_id].status = BUF_REGISTERED;
1864 case VAPI_CQE_RQ_RDMA_WITH_IMM:
1865 // about the Receive Q
1866 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_RDMA_WITH_IMM\n");
1868 wrq_id = comp_desc.id ;
1869 transferred_data_length = comp_desc.byte_len;
1871 if(wrq_id == RDMA_OP_ID) {
1872 // this is an RDMA op, locate the RDMA memory buffer address
1874 bufaddr = (VAPI_virt_addr_t)(MT_virt_addr_t) Local_rdma_info.raddr;
1876 transferred_data_length = comp_desc.byte_len;
1878 kibnal_rx(hca_data->kib_data,
1880 transferred_data_length,
1881 Local_rdma_info.buf_length,
1884 // de-register this RDMA receiving memory buffer
1885 // too early ?? test & check
1886 vstat = VAPI_deregister_mr(qp->hca_hndl, Local_rdma_info.recv_rdma_mr_hndl);
1887 if(vstat != VAPI_OK) {
1888 CERROR("VAPI_CQE_RQ_RDMA_WITH_IMM: Failed deregistering a RDMA"
1889 " recv mem region %s\n", VAPI_strerror(vstat));
1893 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_RQ_RDMA_WITH_IMM\n");
1896 case VAPI_CQE_INVAL_OPCODE:
1898 CDEBUG(D_NET, "CQE opcode-VAPI_CQE_INVAL_OPCODE\n");
1902 CDEBUG(D_NET, "CQE opcode-unknown opcode\n");
1907 // issue a new request for completion event notification
1908 vstat = VAPI_req_comp_notif(hca_data->hca_hndl,
1913 if(vstat != VAPI_OK) {
1914 CERROR("PI_req_comp_notif: Failed %s\n", VAPI_strerror(vstat));
1917 return; // end of event handler
// ioctl entry point for the IB NAL. Currently a stub: the visible body
// only logs the call; command dispatch is not implemented here.
1924 kibnal_cmd(struct portal_ioctl_data * data, void * private)
1928 CDEBUG(D_NET, "kibnal_cmd \n");
// HCA loop-back send/recv smoke test, run before normal event handling
// when IBNAL_SELF_TESTING is defined. The sender role fills sbuf with
// 'a', posts a signaled VAPI_SEND on the test QP, and relies on the CQE
// handler's SEND_RECV_TEST_ID branch to observe completion; the receiver
// role does nothing here.
1935 void ibnal_send_recv_self_testing(int *my_role)
1938 VAPI_sr_desc_t sr_desc;
1939 VAPI_sg_lst_entry_t sr_sg;
1941 VAPI_wr_id_t send_id;
1946 int buf_length = KB_32;
1947 VAPI_wc_desc_t comp_desc;
1951 // make it a daemon process (currently disabled)
1952 // kportal_daemonize("ibnal_send_recv_self_testing");
1954 printk("My role is 0X%X\n", *my_role);
1956 if(*my_role == TEST_SEND_MESSAGE) {
1957 printk("Enter ibnal_send_recv_self_testing\n");
1959 memset(&sbuf, 'a', KB_32);
1960 memset(&rbuf, ' ', KB_32);
1962 send_id = SEND_RECV_TEST_ID;
1963 buf_id = SEND_RECV_TEST_BUF_ID;
1965 qp = &QP_list[buf_id];
1967 sr_desc.opcode = VAPI_SEND;
1968 sr_desc.comp_type = VAPI_SIGNALED;
1969 sr_desc.id = send_id;
1971 // scatter and gather info
1973 sr_sg.lkey = MSbuf_list[buf_id].mr.l_key; // use send MR
1974 sr_sg.addr = (VAPI_virt_addr_t)(MT_virt_addr_t) MSbuf_list[buf_id].buf_addr;
1976 // copy data into the registered send buffer
// NOTE(review): this copies buf_length (KB_32) bytes over the 8-byte
// sr_sg.addr field itself, smashing the descriptor and neighboring stack —
// presumably the destination should be the registered buffer at
// MSbuf_list[buf_id].buf_addr. Confirm before relying on this test.
1977 memcpy(&sr_sg.addr, &sbuf, buf_length);
1979 sr_desc.sg_lst_p = &sr_sg;
1980 sr_desc.sg_lst_len = 1; // only 1 entry is used
1981 sr_desc.fence = TRUE;
1982 sr_desc.set_se = FALSE;
1985 // call VAPI_post_sr to send out this data
1986 vstat = VAPI_post_sr(qp->hca_hndl, qp->qp_hndl, &sr_desc);
1988 if (vstat != VAPI_OK) {
1989 printk("VAPI_post_sr failed (%s).\n",VAPI_strerror(vstat));
1992 printk("VAPI_post_sr success.\n");
1997 printk("I am a receiver and doing nothing here\n");
2000 printk("ibnal_send_recv_self_testing thread exit \n");
2008 // ibnal initialization process:
2010 // 1. Bring up the InfiniBand network interface
2012 // 2. Initialize a PORTALS NAL interface
// Module init entry point (see module_init below). Wires up the NAL API
// function table, initializes the global kibnal_data, opens the HCA,
// registers the interface with PORTALS via PtlNIInit, and starts either a
// polling receive thread or the event-based CQE handler depending on the
// build configuration.
2016 kibnal_initialize(void)
2020 unsigned long sizemask;
2025 portals_debug_set_level(IBNAL_DEBUG_LEVEL_1);
2027 CDEBUG(D_MALLOC, "start kmem %d\n", atomic_read (&portal_kmemory));
2029 CDEBUG(D_PORTALS, "kibnal_initialize: Enter kibnal_initialize\n");
2031 // set api function pointers
2032 kibnal_api.forward = kibnal_forward;
2033 kibnal_api.shutdown = kibnal_shutdown;
2034 kibnal_api.yield = kibnal_yield;
2035 kibnal_api.validate = NULL; /* our api validate is a NOOP */
2036 kibnal_api.lock = kibnal_lock;
2037 kibnal_api.unlock = kibnal_unlock;
2038 kibnal_api.nal_data = &kibnal_data; // this is so called private data
2039 kibnal_api.refct = 1;
2040 kibnal_api.timeout = NULL;
2041 kibnal_lib.nal_data = &kibnal_data;
2043 memset(&kibnal_data, 0, sizeof(kibnal_data));
2045 // initialize kib_list list data structure
2046 INIT_LIST_HEAD(&kibnal_data.kib_list);
2048 kibnal_data.kib_cb = &kibnal_lib;
2050 spin_lock_init(&kibnal_data.kib_dispatch_lock);
2054 // bring up the IB inter-connect network interface
2057 vstat = IB_Open_HCA(&kibnal_data);
2059 if(vstat != VAPI_OK) {
2060 CERROR("kibnal_initialize: IB_Open_HCA failed: %d- %s\n",
2061 vstat, VAPI_strerror(vstat));
2063 printk("kibnal_initialize: IB_Open_HCA failed: %d- %s\n",
2064 vstat, VAPI_strerror(vstat));
2068 kibnal_data.kib_nid = (__u64 )Hca_hndl;//convert Hca_hndl to 64-bit format
2069 kibnal_data.kib_init = 1;
// NOTE(review): "0x%x%x" expects two 32-bit arguments but only one 64-bit
// value is passed — a format/argument mismatch that prints garbage on most
// ABIs; should be a single 64-bit specifier (e.g. "%llx"). Confirm.
2071 CDEBUG(D_NET, " kibnal_data.kib_nid 0x%x%x\n", kibnal_data.kib_nid);
2072 printk(" kibnal_data.kib_nid 0x%x%x\n", kibnal_data.kib_nid);
2074 /* Network interface ready to initialise */
2075 // get an entry in the PORTALS table for this IB protocol
2077 CDEBUG(D_PORTALS,"Call PtlNIInit to register this Infiniband Interface\n");
2078 printk("Call PtlNIInit to register this Infiniband Interface\n");
2080 rc = PtlNIInit(kibnal_init, 32, 4, 0, &kibnal_ni);
2083 CERROR("kibnal_initialize: PtlNIInit failed %d\n", rc);
2084 printk("kibnal_initialize: PtlNIInit failed %d\n", rc);
2089 CDEBUG(D_PORTALS,"kibnal_initialize: PtlNIInit DONE\n");
2090 printk("kibnal_initialize: PtlNIInit DONE\n");
2094 #ifdef POLL_BASED_CQE_HANDLING
2095 // create a receiving thread: main loop
2096 // this is the polling-based main loop
2097 kernel_thread(k_recv_thread, &Hca_data, 0);
2100 #ifdef EVENT_BASED_CQE_HANDLING
2101 // for completion event handling, this is event based CQE handling
2102 vstat = IB_Set_Event_Handler(Hca_data, &kibnal_data);
2104 if (vstat != VAPI_OK) {
2105 CERROR("IB_Set_Event_Handler failed: %d - %s \n",
2106 vstat, VAPI_strerror(vstat));
2110 CDEBUG(D_PORTALS,"IB_Set_Event_Handler Done \n");
2111 printk("IB_Set_Event_Handler Done \n");
2115 PORTAL_SYMBOL_REGISTER(kibnal_ni);
2117 #ifdef IBNAL_SELF_TESTING
2119 // test HCA send recv before normal event handling
2122 my_role = TEST_SEND_MESSAGE;
2124 printk("my role is TEST_RECV_MESSAGE\n");
2126 // kernel_thread(ibnal_send_recv_self_testing, &my_role, 0);
2128 ibnal_send_recv_self_testing(&my_role);
// Kernel-module metadata and entry points: kibnal_initialize runs at
// insmod, kibnal_finalize at rmmod; the NI handle is exported for other
// kernel code (e.g. Lustre) to look up this interface.
2138 MODULE_AUTHOR("Hsingbung(HB) Chen <hbchen@lanl.gov>");
2139 MODULE_DESCRIPTION("Kernel Infiniband NAL v0.1");
2140 MODULE_LICENSE("GPL");
2142 module_init (kibnal_initialize);
2143 module_exit (kibnal_finalize);
2145 EXPORT_SYMBOL(kibnal_ni);