.lnd_recv = kibnal_recv,
};
-kib_tunables_t kibnal_tunables;
+kib_data_t kibnal_data;
-kib_data_t kibnal_data = {
- .kib_service_id = IBNAL_SERVICE_NUMBER,
-};
-
-#ifdef CONFIG_SYSCTL
-#define IBNAL_SYSCTL 202
+__u32
+kibnal_cksum (void *ptr, int nob)
+{
+ char *c = ptr;
+ __u32 sum = 0;
-#define IBNAL_SYSCTL_TIMEOUT 1
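+        /* simple rolling checksum: rotate left one bit, then add each byte */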
+ while (nob-- > 0)
+ sum = ((sum << 1) | (sum >> 31)) + *c++;
+
+ /* ensure I don't return 0 (== no checksum) */
+ return (sum == 0) ? 1 : sum;
+}
-static ctl_table kibnal_ctl_table[] = {
- {IBNAL_SYSCTL_TIMEOUT, "timeout",
- &kibnal_tunables.kib_io_timeout, sizeof (int),
- 0644, NULL, &proc_dointvec},
- { 0 }
-};
+void
+kibnal_init_msg(kib_msg_t *msg, int type, int body_nob)
+{
+ msg->ibm_type = type;
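+        /* total wire size = common header (up to the union) + type-specific body */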
+ msg->ibm_nob = offsetof(kib_msg_t, ibm_u) + body_nob;
+}
-static ctl_table kibnal_top_ctl_table[] = {
- {IBNAL_SYSCTL, "iibnal", NULL, 0, 0555, kibnal_ctl_table},
- { 0 }
-};
-#endif
+void
+kibnal_pack_msg(kib_msg_t *msg, int credits, lnet_nid_t dstnid,
+ __u64 dststamp, __u64 seq)
+{
+ /* CAVEAT EMPTOR! all message fields not set here should have been
+ * initialised previously. */
+ msg->ibm_magic = IBNAL_MSG_MAGIC;
+ msg->ibm_version = IBNAL_MSG_VERSION;
+ /* ibm_type */
+ msg->ibm_credits = credits;
+ /* ibm_nob */
+ msg->ibm_cksum = 0;
+ msg->ibm_srcnid = lnet_ptlcompat_srcnid(kibnal_data.kib_ni->ni_nid,
+ dstnid);
+ msg->ibm_srcstamp = kibnal_data.kib_incarnation;
+ msg->ibm_dstnid = dstnid;
+ msg->ibm_dststamp = dststamp;
+ msg->ibm_seq = seq;
+
+ if (*kibnal_tunables.kib_cksum) {
+ /* NB ibm_cksum zero while computing cksum */
+ msg->ibm_cksum = kibnal_cksum(msg, msg->ibm_nob);
+ }
+}
-#ifdef unused
void
-print_service(IB_SERVICE_RECORD *service, char *tag, int rc)
+kibnal_pack_connmsg(kib_msg_t *msg, int nob, int type,
+ lnet_nid_t dstnid, __u64 dststamp)
{
- char name[32];
- lnet_nid_t nid;
+ LASSERT (nob >= offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t));
- if (service == NULL)
- {
- CWARN("tag : %s\n"
- "status : %d (NULL)\n", tag, rc);
- return;
+ memset(msg, 0, nob);
+ kibnal_init_msg(msg, type, sizeof(kib_connparams_t));
+
+ msg->ibm_u.connparams.ibcp_queue_depth = IBNAL_MSG_QUEUE_SIZE;
+ msg->ibm_u.connparams.ibcp_max_msg_size = IBNAL_MSG_SIZE;
+ msg->ibm_u.connparams.ibcp_max_frags = IBNAL_MAX_RDMA_FRAGS;
+
+ kibnal_pack_msg(msg, 0, dstnid, dststamp, 0);
+}
+
+int
+kibnal_unpack_msg(kib_msg_t *msg, int nob)
+{
+ const int hdr_size = offsetof(kib_msg_t, ibm_u);
+ __u32 msg_cksum;
+ int flip;
+ int msg_nob;
+#if !IBNAL_USE_FMR
+ int i;
+ int n;
+#endif
+ /* 6 bytes are enough to have received magic + version */
+ if (nob < 6) {
+ CERROR("Short message: %d\n", nob);
+ return -EPROTO;
+ }
+
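+        /* the magic doubles as an endianness probe: flip != 0 means the peer
+         * is opposite-endian and every multi-byte field must be swapped */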
+ if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
+ flip = 0;
+ } else if (msg->ibm_magic == __swab32(IBNAL_MSG_MAGIC)) {
+ flip = 1;
+ } else {
+ CERROR("Bad magic: %08x\n", msg->ibm_magic);
+ return -EPROTO;
+ }
+
+ if (msg->ibm_version !=
+ (flip ? __swab16(IBNAL_MSG_VERSION) : IBNAL_MSG_VERSION)) {
+ CERROR("Bad version: %d\n", msg->ibm_version);
+ return -EPROTO;
+ }
+
+ if (nob < hdr_size) {
+ CERROR("Short message: %d\n", nob);
+ return -EPROTO;
+ }
+
+ msg_nob = flip ? __swab32(msg->ibm_nob) : msg->ibm_nob;
+ if (msg_nob > nob) {
+ CERROR("Short message: got %d, wanted %d\n", nob, msg_nob);
+ return -EPROTO;
+ }
+
+ /* checksum must be computed with ibm_cksum zero and BEFORE anything
+ * gets flipped */
+ msg_cksum = flip ? __swab32(msg->ibm_cksum) : msg->ibm_cksum;
+ msg->ibm_cksum = 0;
+ if (msg_cksum != 0 &&
+ msg_cksum != kibnal_cksum(msg, msg_nob)) {
+ CERROR("Bad checksum\n");
+ return -EPROTO;
}
- strncpy (name, service->ServiceName, sizeof(name)-1);
- name[sizeof(name)-1] = 0;
- nid = *kibnal_service_nid_field(service);
+ msg->ibm_cksum = msg_cksum;
- CWARN("tag : %s\n"
- "status : %d\n"
- "service id: "LPX64"\n"
- "name : %s\n"
- "NID : %s\n", tag, rc,
- service->RID.ServiceID, name,
- libcfs_nid2str(nid));
-}
+ if (flip) {
+ /* leave magic unflipped as a clue to peer endianness */
+ __swab16s(&msg->ibm_version);
+ CLASSERT (sizeof(msg->ibm_type) == 1);
+ CLASSERT (sizeof(msg->ibm_credits) == 1);
+ msg->ibm_nob = msg_nob;
+ __swab64s(&msg->ibm_srcnid);
+ __swab64s(&msg->ibm_srcstamp);
+ __swab64s(&msg->ibm_dstnid);
+ __swab64s(&msg->ibm_dststamp);
+ __swab64s(&msg->ibm_seq);
+ }
+
+ if (msg->ibm_srcnid == LNET_NID_ANY) {
+ CERROR("Bad src nid: %s\n", libcfs_nid2str(msg->ibm_srcnid));
+ return -EPROTO;
+ }
+
+ switch (msg->ibm_type) {
+ default:
+ CERROR("Unknown message type %x\n", msg->ibm_type);
+ return -EPROTO;
+
+ case IBNAL_MSG_NOOP:
+ break;
+
+ case IBNAL_MSG_IMMEDIATE:
+ if (msg_nob < offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0])) {
+ CERROR("Short IMMEDIATE: %d(%d)\n", msg_nob,
+ (int)offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[0]));
+ return -EPROTO;
+ }
+ break;
+
+ case IBNAL_MSG_PUT_REQ:
+ if (msg_nob < hdr_size + sizeof(msg->ibm_u.putreq)) {
+ CERROR("Short PUT_REQ: %d(%d)\n", msg_nob,
+ (int)(hdr_size + sizeof(msg->ibm_u.putreq)));
+ return -EPROTO;
+ }
+ break;
+
+ case IBNAL_MSG_PUT_ACK:
+#if IBNAL_USE_FMR
+ if (msg_nob < hdr_size + sizeof(msg->ibm_u.putack)) {
+ CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
+ (int)(hdr_size + sizeof(msg->ibm_u.putack)));
+ return -EPROTO;
+ }
+
+ if (flip) {
+ __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_addr);
+ __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nob);
+ __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
+ }
+#else
+ if (flip) {
+ __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_key);
+ __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_nfrag);
+ }
+
+ n = msg->ibm_u.putack.ibpam_rd.rd_nfrag;
+ if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
+ CERROR("Bad PUT_ACK nfrags: %d, should be 0 < n <= %d\n",
+ n, IBNAL_MAX_RDMA_FRAGS);
+ return -EPROTO;
+ }
+
+ if (msg_nob < offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n])) {
+ CERROR("Short PUT_ACK: %d(%d)\n", msg_nob,
+ (int)offsetof(kib_msg_t, ibm_u.putack.ibpam_rd.rd_frags[n]));
+ return -EPROTO;
+ }
+
+ if (flip) {
+ for (i = 0; i < n; i++) {
+ __swab32s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_nob);
+ __swab64s(&msg->ibm_u.putack.ibpam_rd.rd_frags[i].rf_addr);
+ }
+ }
#endif
+ break;
-static void
-kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
- FSTATUS frc, uint32 madrc)
+ case IBNAL_MSG_GET_REQ:
+ if (msg_nob < hdr_size + sizeof(msg->ibm_u.get)) {
+ CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
+ (int)(hdr_size + sizeof(msg->ibm_u.get)));
+ return -EPROTO;
+ }
+#if IBNAL_USE_FMR
+ if (flip) {
+ __swab64s(&msg->ibm_u.get.ibgm_rd.rd_addr);
+ __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nob);
+ __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
+ }
+#else
+ if (flip) {
+ __swab32s(&msg->ibm_u.get.ibgm_rd.rd_key);
+ __swab32s(&msg->ibm_u.get.ibgm_rd.rd_nfrag);
+ }
+
+ n = msg->ibm_u.get.ibgm_rd.rd_nfrag;
+ if (n <= 0 || n > IBNAL_MAX_RDMA_FRAGS) {
+ CERROR("Bad GET_REQ nfrags: %d, should be 0 < n <= %d\n",
+ n, IBNAL_MAX_RDMA_FRAGS);
+ return -EPROTO;
+ }
+
+ if (msg_nob < offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n])) {
+ CERROR("Short GET_REQ: %d(%d)\n", msg_nob,
+ (int)offsetof(kib_msg_t, ibm_u.get.ibgm_rd.rd_frags[n]));
+ return -EPROTO;
+ }
+
+ if (flip)
+ for (i = 0; i < msg->ibm_u.get.ibgm_rd.rd_nfrag; i++) {
+ __swab32s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_nob);
+ __swab64s(&msg->ibm_u.get.ibgm_rd.rd_frags[i].rf_addr);
+ }
+#endif
+ break;
+
+ case IBNAL_MSG_PUT_NAK:
+ case IBNAL_MSG_PUT_DONE:
+ case IBNAL_MSG_GET_DONE:
+ if (msg_nob < hdr_size + sizeof(msg->ibm_u.completion)) {
+ CERROR("Short RDMA completion: %d(%d)\n", msg_nob,
+ (int)(hdr_size + sizeof(msg->ibm_u.completion)));
+ return -EPROTO;
+ }
+ if (flip)
+ __swab32s(&msg->ibm_u.completion.ibcm_status);
+ break;
+
+ case IBNAL_MSG_CONNREQ:
+ case IBNAL_MSG_CONNACK:
+ if (msg_nob < hdr_size + sizeof(msg->ibm_u.connparams)) {
+ CERROR("Short connreq/ack: %d(%d)\n", msg_nob,
+ (int)(hdr_size + sizeof(msg->ibm_u.connparams)));
+ return -EPROTO;
+ }
+ if (flip) {
+ __swab32s(&msg->ibm_u.connparams.ibcp_queue_depth);
+ __swab32s(&msg->ibm_u.connparams.ibcp_max_msg_size);
+ __swab32s(&msg->ibm_u.connparams.ibcp_max_frags);
+ }
+ break;
+ }
+ return 0;
+}
+
+IB_HANDLE
+kibnal_create_cep(lnet_nid_t nid)
{
- *(FSTATUS *)arg = frc;
- up (&kibnal_data.kib_nid_signal);
+ FSTATUS frc;
+ __u32 u32val;
+ IB_HANDLE cep;
+
+ cep = iibt_cm_create_cep(CM_RC_TYPE);
+ if (cep == NULL) {
+ CERROR ("Can't create CEP for %s\n",
+ (nid == LNET_NID_ANY) ? "listener" :
+ libcfs_nid2str(nid));
+ return NULL;
+ }
+
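+        /* a CEP created with LNET_NID_ANY is the listener: accept
+         * asynchronously and take the system max listen backlog */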
+ if (nid == LNET_NID_ANY) {
+ u32val = 1;
+ frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT,
+ (char *)&u32val, sizeof(u32val), 0);
+ if (frc != FSUCCESS) {
+ CERROR("Can't set async_accept: %d\n", frc);
+ goto failed;
+ }
+
+ u32val = 0; /* sets system max */
+ frc = iibt_cm_modify_cep(cep, CM_FLAG_LISTEN_BACKLOG,
+ (char *)&u32val, sizeof(u32val), 0);
+ if (frc != FSUCCESS) {
+ CERROR("Can't set listen backlog: %d\n", frc);
+ goto failed;
+ }
+ }
+
+ u32val = 1;
+ frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK,
+ (char *)&u32val, sizeof(u32val), 0);
+ if (frc != FSUCCESS) {
+ CERROR("Can't set timewait_callback for %s: %d\n",
+ (nid == LNET_NID_ANY) ? "listener" :
+ libcfs_nid2str(nid), frc);
+ goto failed;
+ }
+
+ return cep;
+
+ failed:
+ iibt_cm_destroy_cep(cep);
+ return NULL;
}
#if IBNAL_CHECK_ADVERT
-static void
+void
kibnal_service_query_done (void *arg, QUERY *qry,
QUERY_RESULT_VALUES *qry_result)
{
- FSTATUS frc = qry_result->Status;
+ int *rcp = arg;
+ FSTATUS frc = qry_result->Status;
+ SERVICE_RECORD_RESULTS *svc_rslt;
+ IB_SERVICE_RECORD *svc;
+ lnet_nid_t nid;
+
+ if (frc != FSUCCESS || qry_result->ResultDataSize == 0) {
+ CERROR("Error checking advert: status %d data size %d\n",
+ frc, qry_result->ResultDataSize);
+ *rcp = -EIO;
+ goto out;
+ }
+
+ svc_rslt = (SERVICE_RECORD_RESULTS *)qry_result->QueryResult;
- if (frc != FSUCCESS &&
- qry_result->ResultDataSize == 0)
- frc = FERROR;
+ if (svc_rslt->NumServiceRecords < 1) {
+ CERROR("Check advert: %d records\n",
+ svc_rslt->NumServiceRecords);
+ *rcp = -ENOENT;
+ goto out;
+ }
+
+ svc = &svc_rslt->ServiceRecords[0];
+ nid = le64_to_cpu(*kibnal_service_nid_field(svc));
- *(FSTATUS *)arg = frc;
- up (&kibnal_data.kib_nid_signal);
+ if (nid != kibnal_data.kib_ni->ni_nid) {
+ CERROR("Check advert: Bad NID %s (%s expected)\n",
+ libcfs_nid2str(nid),
+ libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
+ *rcp = -EINVAL;
+ goto out;
+ }
+
+ if (svc->RID.ServiceID != *kibnal_tunables.kib_service_number) {
+ CERROR("Check advert: Bad ServiceID "LPX64" ("LPX64" expected)\n",
+ svc->RID.ServiceID,
+ *kibnal_tunables.kib_service_number);
+ *rcp = -EINVAL;
+ goto out;
+ }
+
+ if (svc->RID.ServiceGID.Type.Global.InterfaceID !=
+ kibnal_data.kib_port_guid) {
+ CERROR("Check advert: Bad GUID "LPX64" ("LPX64" expected)\n",
+ svc->RID.ServiceGID.Type.Global.InterfaceID,
+ kibnal_data.kib_port_guid);
+ *rcp = -EINVAL;
+ goto out;
+ }
+
+ if (svc->RID.ServiceP_Key != kibnal_data.kib_port_pkey) {
+ CERROR("Check advert: Bad PKEY %04x (%04x expected)\n",
+ svc->RID.ServiceP_Key, kibnal_data.kib_port_pkey);
+ *rcp = -EINVAL;
+ goto out;
+ }
+
+ CDEBUG(D_WARNING, "Check advert OK\n");
+ *rcp = 0;
+
+ out:
+ up (&kibnal_data.kib_listener_signal);
}
-static void
+int
kibnal_check_advert (void)
{
- QUERY *qry;
- IB_SERVICE_RECORD *svc;
- FSTATUS frc;
- FSTATUS frc2;
+ /* single-threaded */
+ static QUERY qry;
- LIBCFS_ALLOC(qry, sizeof(*qry));
- if (qry == NULL)
- return;
+ FSTATUS frc;
+ int rc;
- memset (qry, 0, sizeof(*qry));
- qry->InputType = InputTypeServiceRecord;
- qry->OutputType = OutputTypeServiceRecord;
- qry->InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
- svc = &qry->InputValue.ServiceRecordValue.ServiceRecord;
- kibnal_set_service_keys(svc, kibnal_data.kib_ni->ni_nid);
+ memset (&qry, 0, sizeof(qry));
+ qry.InputType = InputTypeServiceRecord;
+ qry.OutputType = OutputTypeServiceRecord;
+ kibnal_set_service_keys(&qry.InputValue.ServiceRecordValue.ServiceRecord,
+ kibnal_data.kib_ni->ni_nid);
+ qry.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
- frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
+ frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
kibnal_data.kib_port_guid,
- qry,
+ &qry,
kibnal_service_query_done,
- NULL, &frc2);
- if (frc != FSUCCESS && frc != FPENDING) {
+ &kibnal_data.kib_sdretry,
+ &rc);
+ if (frc != FPENDING) {
CERROR ("Immediate error %d checking SM service\n", frc);
- } else {
- down (&kibnal_data.kib_nid_signal);
- frc = frc2;
-
- if (frc != 0)
- CERROR ("Error %d checking SM service\n", rc);
+ return -EIO;
}
-
- return (rc);
+
+ down (&kibnal_data.kib_listener_signal);
+
+ if (rc != 0)
+ CERROR ("Error %d checking SM service\n", rc);
+ return rc;
+}
+#else
+int
+kibnal_check_advert(void)
+{
+ return 0;
}
#endif
-static void fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
+void
+kibnal_fill_fod(FABRIC_OPERATION_DATA *fod, FABRIC_OPERATION_TYPE type)
{
IB_SERVICE_RECORD *svc;
fod->Type = type;
svc = &fod->Value.ServiceRecordValue.ServiceRecord;
- svc->RID.ServiceID = kibnal_data.kib_service_id;
+ svc->RID.ServiceID = *kibnal_tunables.kib_service_number;
svc->RID.ServiceGID.Type.Global.InterfaceID = kibnal_data.kib_port_guid;
svc->RID.ServiceGID.Type.Global.SubnetPrefix = DEFAULT_SUBNET_PREFIX;
svc->RID.ServiceP_Key = kibnal_data.kib_port_pkey;
kibnal_set_service_keys(svc, kibnal_data.kib_ni->ni_nid);
}
-static int
-kibnal_advertise (void)
+void
+kibnal_service_setunset_done (void *arg, FABRIC_OPERATION_DATA *fod,
+ FSTATUS frc, uint32 madrc)
{
- FABRIC_OPERATION_DATA *fod;
- IB_SERVICE_RECORD *svc;
- FSTATUS frc;
- FSTATUS frc2;
-
- LASSERT (kibnal_data.kib_ni->ni_nid != LNET_NID_ANY);
+ *(FSTATUS *)arg = frc;
+ up (&kibnal_data.kib_listener_signal);
+}
- LIBCFS_ALLOC(fod, sizeof(*fod));
- if (fod == NULL)
- return (-ENOMEM);
+int
+kibnal_advertise (void)
+{
+ /* Single threaded here */
+ static FABRIC_OPERATION_DATA fod;
+
+ IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord;
+ FSTATUS frc;
+ FSTATUS frc2;
+
+ if (strlen(*kibnal_tunables.kib_service_name) >=
+ sizeof(svc->ServiceName)) {
+ CERROR("Service name '%s' too long (%d chars max)\n",
+ *kibnal_tunables.kib_service_name,
+ (int)sizeof(svc->ServiceName) - 1);
+ return -EINVAL;
+ }
- fill_fod(fod, FabOpSetServiceRecord);
- svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+ kibnal_fill_fod(&fod, FabOpSetServiceRecord);
CDEBUG(D_NET, "Advertising service id "LPX64" %s:%s\n",
- svc->RID.ServiceID,
- svc->ServiceName,
- libcfs_nid2str(*kibnal_service_nid_field(svc)));
+ svc->RID.ServiceID, svc->ServiceName,
+ libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc))));
frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
kibnal_data.kib_port_guid,
- fod, kibnal_service_setunset_done,
- NULL, &frc2);
+ &fod,
+ kibnal_service_setunset_done,
+ &kibnal_data.kib_sdretry,
+ &frc2);
if (frc != FSUCCESS && frc != FPENDING) {
CERROR ("Immediate error %d advertising NID %s\n",
frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
- goto out;
+ return -EIO;
}
- down (&kibnal_data.kib_nid_signal);
+ down (&kibnal_data.kib_listener_signal);
frc = frc2;
- if (frc != FSUCCESS)
- CERROR ("Error %d advertising BUD %s\n",
- frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
-out:
- LIBCFS_FREE(fod, sizeof(*fod));
- return (frc == FSUCCESS) ? 0 : -EINVAL;
+ if (frc == FSUCCESS)
+ return 0;
+
+ CERROR ("Error %d advertising %s\n",
+ frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
+ return -EIO;
}
-static void
+void
kibnal_unadvertise (int expect_success)
{
- FABRIC_OPERATION_DATA *fod;
- IB_SERVICE_RECORD *svc;
- FSTATUS frc;
- FSTATUS frc2;
+ /* single threaded */
+ static FABRIC_OPERATION_DATA fod;
- LASSERT (kibnal_data.kib_ni->ni_nid != LNET_NID_ANY);
+ IB_SERVICE_RECORD *svc = &fod.Value.ServiceRecordValue.ServiceRecord;
+ FSTATUS frc;
+ FSTATUS frc2;
- LIBCFS_ALLOC(fod, sizeof(*fod));
- if (fod == NULL)
- return;
+ LASSERT (kibnal_data.kib_ni->ni_nid != LNET_NID_ANY);
- fill_fod(fod, FabOpDeleteServiceRecord);
- svc = &fod->Value.ServiceRecordValue.ServiceRecord;
+ kibnal_fill_fod(&fod, FabOpDeleteServiceRecord);
CDEBUG(D_NET, "Unadvertising service %s:%s\n",
svc->ServiceName,
- libcfs_nid2str(*kibnal_service_nid_field(svc)));
+ libcfs_nid2str(le64_to_cpu(*kibnal_service_nid_field(svc))));
frc = iibt_sd_port_fabric_operation(kibnal_data.kib_sd,
kibnal_data.kib_port_guid,
- fod, kibnal_service_setunset_done,
- NULL, &frc2);
-
+ &fod,
+ kibnal_service_setunset_done,
+ &kibnal_data.kib_sdretry,
+ &frc2);
if (frc != FSUCCESS && frc != FPENDING) {
CERROR ("Immediate error %d unadvertising NID %s\n",
frc, libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
- goto out;
+ return;
}
- down (&kibnal_data.kib_nid_signal);
+ down (&kibnal_data.kib_listener_signal);
- if ((frc2 == FSUCCESS) == !!expect_success)
- goto out;
+ if ((frc2 == FSUCCESS) == !!expect_success)
+ return;
if (expect_success)
CERROR("Error %d unadvertising NID %s\n",
else
CWARN("Removed conflicting NID %s\n",
libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
- out:
- LIBCFS_FREE(fod, sizeof(*fod));
}
-static int
-kibnal_set_mynid(lnet_nid_t nid)
+void
+kibnal_stop_listener(int normal_shutdown)
{
- struct timeval tv;
- int rc;
+ /* NB this also disables peer creation and destroys all existing
+ * peers */
+ IB_HANDLE cep = kibnal_data.kib_listener_cep;
+ unsigned long flags;
FSTATUS frc;
- CDEBUG(D_IOCTL, "setting mynid to %s (old nid=%s)\n",
- libcfs_nid2str(nid),
- libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
+ LASSERT (cep != NULL);
- do_gettimeofday(&tv);
+ kibnal_unadvertise(normal_shutdown);
- down (&kibnal_data.kib_nid_mutex);
+ frc = iibt_cm_cancel(cep);
+ if (frc != FSUCCESS && frc != FPENDING)
+ CERROR ("Error %d stopping listener\n", frc);
- if (nid == kibnal_data.kib_ni->ni_nid) {
- /* no change of NID */
- up (&kibnal_data.kib_nid_mutex);
- return (0);
- }
+ down(&kibnal_data.kib_listener_signal);
- CDEBUG(D_NET, "NID %s(%s)\n",
- libcfs_nid2str(kibnal_data.kib_ni->ni_nid),
- libcfs_nid2str(nid));
-
- if (kibnal_data.kib_ni->ni_nid != LNET_NID_ANY) {
+ frc = iibt_cm_destroy_cep(cep);
+ if (frc != FSUCCESS)
+ CERROR ("Error %d destroying listener CEP\n", frc);
- kibnal_unadvertise (1);
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+ /* This assignment disables peer creation */
+ kibnal_data.kib_listener_cep = NULL;
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
- frc = iibt_cm_cancel(kibnal_data.kib_cep);
- if (frc != FSUCCESS && frc != FPENDING)
- CERROR ("Error %d stopping listener\n", frc);
+ /* Start to tear down any peers created while the listener was
+ * running */
+ kibnal_del_peer(LNET_NID_ANY);
+}
- frc = iibt_cm_destroy_cep(kibnal_data.kib_cep);
- if (frc != FSUCCESS)
- CERROR ("Error %d destroying CEP\n", frc);
+int
+kibnal_start_listener(void)
+{
+ /* NB this also enables peer creation */
- kibnal_data.kib_cep = NULL;
- }
-
- kibnal_data.kib_ni->ni_nid = nid;
- kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
-
- /* Delete all existing peers and their connections after new
- * NID/incarnation set to ensure no old connections in our brave
- * new world. */
- kibnal_del_peer(LNET_NID_ANY);
+ IB_HANDLE cep;
+ CM_LISTEN_INFO info;
+ unsigned long flags;
+ int rc;
+ FSTATUS frc;
+ __u32 u32val;
- if (kibnal_data.kib_ni->ni_nid == LNET_NID_ANY) {
- /* No new NID to install */
- up (&kibnal_data.kib_nid_mutex);
- return (0);
- }
+ LASSERT (kibnal_data.kib_listener_cep == NULL);
+ init_MUTEX_LOCKED (&kibnal_data.kib_listener_signal);
/* remove any previous advert (crashed node etc) */
kibnal_unadvertise(0);
- kibnal_data.kib_cep = iibt_cm_create_cep(CM_RC_TYPE);
- if (kibnal_data.kib_cep == NULL) {
- CERROR ("Can't create CEP\n");
- rc = -ENOMEM;
- } else {
- CM_LISTEN_INFO info;
- memset (&info, 0, sizeof(info));
- info.ListenAddr.EndPt.SID = kibnal_data.kib_service_id;
-
- frc = iibt_cm_listen(kibnal_data.kib_cep, &info,
- kibnal_listen_callback, NULL);
- if (frc != FSUCCESS && frc != FPENDING) {
- CERROR ("iibt_cm_listen error: %d\n", frc);
- rc = -EINVAL;
- } else {
- rc = 0;
- }
- }
-
- if (rc == 0) {
- rc = kibnal_advertise();
- if (rc == 0) {
-#if IBNAL_CHECK_ADVERT
- kibnal_check_advert();
-#endif
- up (&kibnal_data.kib_nid_mutex);
- return (0);
- }
-
- iibt_cm_cancel (kibnal_data.kib_cep);
- iibt_cm_destroy_cep (kibnal_data.kib_cep);
- /* remove any peers that sprung up while I failed to
- * advertise myself */
- kibnal_del_peer(LNET_NID_ANY);
+ cep = kibnal_create_cep(LNET_NID_ANY);
+ if (cep == NULL)
+ return -ENOMEM;
+
+ memset (&info, 0, sizeof(info));
+ info.ListenAddr.EndPt.SID = *kibnal_tunables.kib_service_number;
+
+ frc = iibt_cm_listen(cep, &info, kibnal_listen_callback, NULL);
+ if (frc != FSUCCESS && frc != FPENDING) {
+ CERROR ("iibt_cm_listen error: %d\n", frc);
+
+ iibt_cm_destroy_cep(cep);
+ return -EIO;
}
- kibnal_data.kib_ni->ni_nid = LNET_NID_ANY;
- up (&kibnal_data.kib_nid_mutex);
- return (rc);
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+ /* This assignment enables peer creation */
+ kibnal_data.kib_listener_cep = cep;
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+
+ rc = kibnal_advertise();
+ if (rc == 0)
+ rc = kibnal_check_advert();
+
+ if (rc == 0)
+ return 0;
+
+ kibnal_stop_listener(0);
+ return rc;
}
-kib_peer_t *
-kibnal_create_peer (lnet_nid_t nid)
+int
+kibnal_create_peer (kib_peer_t **peerp, lnet_nid_t nid)
{
- kib_peer_t *peer;
+ kib_peer_t *peer;
+ unsigned long flags;
+ int rc;
LASSERT (nid != LNET_NID_ANY);
LIBCFS_ALLOC (peer, sizeof (*peer));
- if (peer == NULL)
- return (NULL);
+ if (peer == NULL) {
+ CERROR("Cannot allocate peer\n");
+ return -ENOMEM;
+ }
memset(peer, 0, sizeof(*peer)); /* zero flags etc */
peer->ibp_reconnect_interval = 0; /* OK to connect at any time */
- atomic_inc (&kibnal_data.kib_npeers);
- return (peer);
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+
+ if (atomic_read(&kibnal_data.kib_npeers) >=
+ *kibnal_tunables.kib_concurrent_peers) {
+ rc = -EOVERFLOW; /* !! but at least it distinguishes */
+ } else if (kibnal_data.kib_listener_cep == NULL) {
+ rc = -ESHUTDOWN; /* shutdown has started */
+ } else {
+ rc = 0;
+ /* npeers only grows with the global lock held */
+ atomic_inc(&kibnal_data.kib_npeers);
+ }
+
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+
+ if (rc != 0) {
+ CERROR("Can't create peer: %s\n",
+ (rc == -ESHUTDOWN) ? "shutting down" :
+ "too many peers");
+ LIBCFS_FREE(peer, sizeof(*peer));
+ } else {
+ *peerp = peer;
+ }
+
+ return rc;
}
void
return (NULL);
}
-kib_peer_t *
-kibnal_get_peer (lnet_nid_t nid)
-{
- kib_peer_t *peer;
- unsigned long flags;
-
- read_lock_irqsave(&kibnal_data.kib_global_lock, flags);
- peer = kibnal_find_peer_locked (nid);
- if (peer != NULL) /* +1 ref for caller? */
- kib_peer_addref(peer);
- read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
-
- return (peer);
-}
-
void
kibnal_unlink_peer_locked (kib_peer_t *peer)
{
LASSERT (kibnal_peer_active(peer));
list_del_init (&peer->ibp_list);
/* lose peerlist's ref */
- kib_peer_decref(peer);
+ kibnal_peer_decref(peer);
}
-static int
+int
kibnal_get_peer_info (int index, lnet_nid_t *nidp, int *persistencep)
{
kib_peer_t *peer;
return (-ENOENT);
}
-static int
+int
kibnal_add_persistent_peer (lnet_nid_t nid)
{
unsigned long flags;
kib_peer_t *peer;
kib_peer_t *peer2;
+ int rc;
if (nid == LNET_NID_ANY)
return (-EINVAL);
- peer = kibnal_create_peer (nid);
- if (peer == NULL)
- return (-ENOMEM);
+ rc = kibnal_create_peer(&peer, nid);
+ if (rc != 0)
+ return rc;
write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
peer2 = kibnal_find_peer_locked (nid);
if (peer2 != NULL) {
- kib_peer_decref (peer);
+ kibnal_peer_decref (peer);
peer = peer2;
} else {
/* peer table takes existing ref on peer */
return (0);
}
-static void
+void
kibnal_del_peer_locked (kib_peer_t *peer)
{
struct list_head *ctmp;
return (rc);
}
-static kib_conn_t *
+kib_conn_t *
kibnal_get_conn_by_idx (int index)
{
kib_peer_t *peer;
continue;
conn = list_entry (ctmp, kib_conn_t, ibc_list);
- CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
- conn, conn->ibc_state,
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- atomic_read (&conn->ibc_refcount));
- atomic_inc (&conn->ibc_refcount);
+ kibnal_conn_addref(conn);
read_unlock_irqrestore(&kibnal_data.kib_global_lock,
flags);
return (conn);
return (NULL);
}
+int
+kibnal_conn_rts(kib_conn_t *conn,
+ __u32 qpn, __u8 resp_res, __u8 init_depth, __u32 psn)
+{
+ IB_PATH_RECORD *path = &conn->ibc_cvars->cv_path;
+ IB_HANDLE qp = conn->ibc_qp;
+ IB_QP_ATTRIBUTES_MODIFY modify_attr;
+ FSTATUS frc;
+ int rc;
+
+ if (resp_res > kibnal_data.kib_hca_attrs.MaxQPResponderResources)
+ resp_res = kibnal_data.kib_hca_attrs.MaxQPResponderResources;
+
+ if (init_depth > kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth)
+ init_depth = kibnal_data.kib_hca_attrs.MaxQPInitiatorDepth;
+
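+        /* first hop: INIT -> RTR (ready to receive); needs the peer QPN,
+         * receive PSN and the address vector/MTU derived from the path */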
+ modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+ .RequestState = QPStateReadyToRecv,
+ .RecvPSN = IBNAL_STARTING_PSN,
+ .DestQPNumber = qpn,
+ .ResponderResources = resp_res,
+ .MinRnrTimer = UsecToRnrNakTimer(2000), /* 2 ms */
+ .Attrs = (IB_QP_ATTR_RECVPSN |
+ IB_QP_ATTR_DESTQPNUMBER |
+ IB_QP_ATTR_RESPONDERRESOURCES |
+ IB_QP_ATTR_DESTAV |
+ IB_QP_ATTR_PATHMTU |
+ IB_QP_ATTR_MINRNRTIMER),
+ };
+ GetAVFromPath(0, path, &modify_attr.PathMTU, NULL,
+ &modify_attr.DestAV);
+
+ frc = iibt_qp_modify(qp, &modify_attr, NULL);
+ if (frc != FSUCCESS) {
+ CERROR("Can't set QP %s ready to receive: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+ return -EIO;
+ }
+
+ rc = kibnal_post_receives(conn);
+ if (rc != 0) {
+ CERROR("Can't post receives for %s: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+ return rc;
+ }
+
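+        /* second hop: RTR -> RTS (ready to send); needs the send PSN,
+         * local ack timeout and retry counts */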
+ modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
+ .RequestState = QPStateReadyToSend,
+ .FlowControl = TRUE,
+ .InitiatorDepth = init_depth,
+ .SendPSN = psn,
+ .LocalAckTimeout = path->PktLifeTime + 2, /* 2 or 1? */
+ .RetryCount = IBNAL_RETRY,
+ .RnrRetryCount = IBNAL_RNR_RETRY,
+ .Attrs = (IB_QP_ATTR_FLOWCONTROL |
+ IB_QP_ATTR_INITIATORDEPTH |
+ IB_QP_ATTR_SENDPSN |
+ IB_QP_ATTR_LOCALACKTIMEOUT |
+ IB_QP_ATTR_RETRYCOUNT |
+ IB_QP_ATTR_RNRRETRYCOUNT),
+ };
+
+ frc = iibt_qp_modify(qp, &modify_attr, NULL);
+ if (frc != FSUCCESS) {
+ CERROR("Can't set QP %s ready to send: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+ return -EIO;
+ }
+
+ frc = iibt_qp_query(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't query QP %s attributes: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+ return -EIO;
+ }
+
+ return 0;
+}
+
kib_conn_t *
-kibnal_create_conn (void)
+kibnal_create_conn (lnet_nid_t nid)
{
kib_conn_t *conn;
int i;
- __u64 vaddr = 0;
- __u64 vaddr_base;
int page_offset;
int ipage;
int rc;
LIBCFS_ALLOC (conn, sizeof (*conn));
if (conn == NULL) {
- CERROR ("Can't allocate connection\n");
+ CERROR ("Can't allocate connection for %s\n",
+ libcfs_nid2str(nid));
return (NULL);
}
/* zero flags, NULL pointers etc... */
memset (conn, 0, sizeof (*conn));
+ conn->ibc_state = IBNAL_CONN_INIT_NOTHING;
+ INIT_LIST_HEAD (&conn->ibc_early_rxs);
INIT_LIST_HEAD (&conn->ibc_tx_queue);
INIT_LIST_HEAD (&conn->ibc_active_txs);
spin_lock_init (&conn->ibc_lock);
atomic_inc (&kibnal_data.kib_nconns);
/* well not really, but I call destroy() on failure, which decrements */
- LIBCFS_ALLOC (conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
- if (conn->ibc_rxs == NULL)
+ LIBCFS_ALLOC (conn->ibc_cvars, sizeof (*conn->ibc_cvars));
+ if (conn->ibc_cvars == NULL) {
+ CERROR ("Can't allocate connvars for %s\n",
+ libcfs_nid2str(nid));
goto failed;
- memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
+ }
+ memset(conn->ibc_cvars, 0, sizeof (*conn->ibc_cvars));
- rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES, 1);
- if (rc != 0)
+ LIBCFS_ALLOC(conn->ibc_rxs, IBNAL_RX_MSGS * sizeof (kib_rx_t));
+ if (conn->ibc_rxs == NULL) {
+ CERROR("Cannot allocate RX descriptors for %s\n",
+ libcfs_nid2str(nid));
goto failed;
+ }
+ memset (conn->ibc_rxs, 0, IBNAL_RX_MSGS * sizeof(kib_rx_t));
- vaddr_base = vaddr = conn->ibc_rx_pages->ibp_vaddr;
-
+ rc = kibnal_alloc_pages(&conn->ibc_rx_pages, IBNAL_RX_MSG_PAGES);
+ if (rc != 0) {
+ CERROR("Can't allocate RX buffers for %s\n",
+ libcfs_nid2str(nid));
+ goto failed;
+ }
+
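+        /* carve the rx pages into IBNAL_MSG_SIZE buffers; rx_hca_msg is the
+         * HCA-visible address inside the whole-memory region */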
for (i = ipage = page_offset = 0; i < IBNAL_RX_MSGS; i++) {
struct page *page = conn->ibc_rx_pages->ibp_pages[ipage];
- kib_rx_t *rx = &conn->ibc_rxs[i];
+ kib_rx_t *rx = &conn->ibc_rxs[i];
rx->rx_conn = conn;
rx->rx_msg = (kib_msg_t *)(((char *)page_address(page)) +
page_offset);
- if (kibnal_whole_mem())
- rx->rx_vaddr = kibnal_page2phys(page) +
- page_offset +
- kibnal_data.kib_md.md_addr;
- else
- rx->rx_vaddr = vaddr;
-
- vaddr += IBNAL_MSG_SIZE;
- LASSERT (vaddr <= vaddr_base + IBNAL_RX_MSG_BYTES);
+ rx->rx_hca_msg = kibnal_data.kib_whole_mem.md_addr +
+ kibnal_page2phys(page) + page_offset;
page_offset += IBNAL_MSG_SIZE;
LASSERT (page_offset <= PAGE_SIZE);
params.qp_create = (IB_QP_ATTRIBUTES_CREATE) {
.Type = QPTypeReliableConnected,
- .SendQDepth = IBNAL_TX_MAX_SG *
+ .SendQDepth = (IBNAL_MAX_RDMA_FRAGS + 1) *
IBNAL_MSG_QUEUE_SIZE,
.RecvQDepth = IBNAL_MSG_QUEUE_SIZE,
.SendDSListDepth = 1,
.SendSignaledCompletions = TRUE,
};
 frc = iibt_qp_create(kibnal_data.kib_hca, &params.qp_create, NULL,
- &conn->ibc_qp, &conn->ibc_qp_attrs);
- if (rc != 0) {
- CERROR ("Failed to create queue pair: %d\n", rc);
+ &conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs);
+ if (frc != 0) {
+ CERROR ("Can't create QP %s: %d\n", libcfs_nid2str(nid), frc);
goto failed;
}
/* Mark QP created */
- conn->ibc_state = IBNAL_CONN_INIT_QP;
+ kibnal_set_conn_state(conn, IBNAL_CONN_INIT_QP);
params.qp_attr = (IB_QP_ATTRIBUTES_MODIFY) {
.RequestState = QPStateInit,
IB_QP_ATTR_ACCESSCONTROL),
.PortGUID = kibnal_data.kib_port_guid,
.PkeyIndex = 0,
- .AccessControl = {
+ .AccessControl = {
.s = {
.RdmaWrite = 1,
.RdmaRead = 1,
},
},
};
- rc = iibt_qp_modify(conn->ibc_qp, &params.qp_attr, NULL);
- if (rc != 0) {
- CERROR ("Failed to modify queue pair: %d\n", rc);
+ frc = iibt_qp_modify(conn->ibc_qp, &params.qp_attr, NULL);
+ if (frc != 0) {
+ CERROR ("Can't set QP %s state to INIT: %d\n",
+ libcfs_nid2str(nid), frc);
+ goto failed;
+ }
+
+ frc = iibt_qp_query(conn->ibc_qp, &conn->ibc_cvars->cv_qpattrs, NULL);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't query QP %s attributes: %d\n",
+ libcfs_nid2str(nid), frc);
goto failed;
}
void
kibnal_destroy_conn (kib_conn_t *conn)
{
- int rc;
+ int rc;
FSTATUS frc;
+
+ LASSERT (!in_interrupt());
CDEBUG (D_NET, "connection %p\n", conn);
LASSERT (atomic_read (&conn->ibc_refcount) == 0);
+ LASSERT (list_empty(&conn->ibc_early_rxs));
LASSERT (list_empty(&conn->ibc_tx_queue));
LASSERT (list_empty(&conn->ibc_active_txs));
LASSERT (conn->ibc_nsends_posted == 0);
- LASSERT (conn->ibc_connreq == NULL);
switch (conn->ibc_state) {
- case IBNAL_CONN_DISCONNECTED:
- /* called after connection sequence initiated */
- /* fall through */
-
- case IBNAL_CONN_INIT_QP:
- /* _destroy includes an implicit Reset of the QP which
- * discards posted work */
- rc = iibt_qp_destroy(conn->ibc_qp);
- if (rc != 0)
- CERROR("Can't destroy QP: %d\n", rc);
- /* fall through */
-
case IBNAL_CONN_INIT_NOTHING:
+ case IBNAL_CONN_INIT_QP:
+ case IBNAL_CONN_DISCONNECTED:
break;
default:
- LASSERT (0);
+ /* conn must either have never engaged with the CM, or have
+ * completely disengaged from it */
+ CERROR("Bad conn %s state %d\n",
+ (conn->ibc_peer) == NULL ? "<anon>" :
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), conn->ibc_state);
+ LBUG();
}
if (conn->ibc_cep != NULL) {
frc = iibt_cm_destroy_cep(conn->ibc_cep);
- if (frc != 0)
- CERROR("Can't destroy CEP %p: %d\n", conn->ibc_cep,
- frc);
+ if (frc != FSUCCESS)
+ CERROR("Error destroying CEP %p: %d\n",
+ conn->ibc_cep, frc);
+ }
+
+ if (conn->ibc_qp != NULL) {
+ frc = iibt_qp_destroy(conn->ibc_qp);
+ if (frc != FSUCCESS)
+ CERROR("Error destroying QP %p: %d\n",
+ conn->ibc_qp, frc);
}
if (conn->ibc_rx_pages != NULL)
LIBCFS_FREE(conn->ibc_rxs,
IBNAL_RX_MSGS * sizeof(kib_rx_t));
+ if (conn->ibc_cvars != NULL)
+ LIBCFS_FREE(conn->ibc_cvars, sizeof(*conn->ibc_cvars));
+
if (conn->ibc_peer != NULL)
- kib_peer_decref(conn->ibc_peer);
+ kibnal_peer_decref(conn->ibc_peer);
LIBCFS_FREE(conn, sizeof (*conn));
atomic_dec(&kibnal_data.kib_nconns);
-
- if (atomic_read (&kibnal_data.kib_nconns) == 0 &&
- kibnal_data.kib_shutdown) {
- /* I just nuked the last connection on shutdown; wake up
- * everyone so they can exit. */
- wake_up_all(&kibnal_data.kib_sched_waitq);
- wake_up_all(&kibnal_data.kib_connd_waitq);
- }
-}
-
-void
-kibnal_put_conn (kib_conn_t *conn)
-{
- unsigned long flags;
-
- CDEBUG (D_NET, "putting conn[%p] state %d -> %s (%d)\n",
- conn, conn->ibc_state,
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- atomic_read (&conn->ibc_refcount));
-
- LASSERT (atomic_read (&conn->ibc_refcount) > 0);
- if (!atomic_dec_and_test (&conn->ibc_refcount))
- return;
-
- /* must disconnect before dropping the final ref */
- LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTED);
-
- spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
-
- list_add (&conn->ibc_list, &kibnal_data.kib_connd_conns);
- wake_up (&kibnal_data.kib_connd_waitq);
-
- spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
}
-static int
+int
kibnal_close_peer_conns_locked (kib_peer_t *peer, int why)
{
kib_conn_t *conn;
return (count);
}
-static int
+int
kibnal_close_matching_conns (lnet_nid_t nid)
{
unsigned long flags;
switch(cmd) {
case IOC_LIBCFS_GET_PEER: {
lnet_nid_t nid = 0;
- int share_count = 0;
+ int share_count = 0;
rc = kibnal_get_peer_info(data->ioc_count,
&nid, &share_count);
else {
rc = 0;
data->ioc_nid = conn->ibc_peer->ibp_nid;
- kibnal_put_conn (conn);
+ kibnal_conn_decref(conn);
}
break;
}
break;
}
case IOC_LIBCFS_REGISTER_MYNID: {
- if (data->ioc_nid == LNET_NID_ANY)
+ if (ni->ni_nid == data->ioc_nid) {
+ rc = 0;
+ } else {
+ CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID: %s(%s)\n",
+ libcfs_nid2str(data->ioc_nid),
+ libcfs_nid2str(ni->ni_nid));
rc = -EINVAL;
- else
- rc = kibnal_set_mynid (data->ioc_nid);
+ }
break;
}
}
kibnal_free_pages (kib_pages_t *p)
{
int npages = p->ibp_npages;
- int rc;
int i;
- if (p->ibp_mapped) {
- rc = iibt_deregister_memory(p->ibp_handle);
- if (rc != 0)
- CERROR ("Deregister error: %d\n", rc);
- }
-
for (i = 0; i < npages; i++)
if (p->ibp_pages[i] != NULL)
__free_page(p->ibp_pages[i]);
}
int
-kibnal_alloc_pages (kib_pages_t **pp, int npages, int allow_write)
+kibnal_alloc_pages (kib_pages_t **pp, int npages)
{
- kib_pages_t *p;
- __u64 *phys_pages;
- int i;
- FSTATUS frc;
- IB_ACCESS_CONTROL access;
-
- memset(&access, 0, sizeof(access));
- access.s.MWBindable = 1;
- access.s.LocalWrite = 1;
- access.s.RdmaRead = 1;
- access.s.RdmaWrite = 1;
+ kib_pages_t *p;
+ int i;
LIBCFS_ALLOC(p, offsetof(kib_pages_t, ibp_pages[npages]));
if (p == NULL) {
}
}
- if (kibnal_whole_mem())
- goto out;
-
- LIBCFS_ALLOC(phys_pages, npages * sizeof(*phys_pages));
- if (phys_pages == NULL) {
- CERROR ("Can't allocate physarray for %d pages\n", npages);
- /* XXX free ibp_pages? */
- kibnal_free_pages(p);
- return (-ENOMEM);
- }
+ *pp = p;
+ return (0);
+}
- /* if we were using the _contig_ registration variant we would have
- * an array of PhysAddr/Length pairs, but the discontiguous variant
- * just takes the PhysAddr */
- for (i = 0; i < npages; i++)
- phys_pages[i] = kibnal_page2phys(p->ibp_pages[i]);
-
- frc = iibt_register_physical_memory(kibnal_data.kib_hca,
- 0, /* requested vaddr */
- phys_pages, npages,
- 0, /* offset */
- kibnal_data.kib_pd,
- access,
- &p->ibp_handle, &p->ibp_vaddr,
- &p->ibp_lkey, &p->ibp_rkey);
+int
+kibnal_alloc_tx_descs (void)
+{
+ int i;
- LIBCFS_FREE(phys_pages, npages * sizeof(*phys_pages));
+ LIBCFS_ALLOC (kibnal_data.kib_tx_descs,
+ IBNAL_TX_MSGS() * sizeof(kib_tx_t));
+ if (kibnal_data.kib_tx_descs == NULL)
+ return -ENOMEM;
- if (frc != FSUCCESS) {
- CERROR ("Error %d mapping %d pages\n", frc, npages);
- kibnal_free_pages(p);
- return (-ENOMEM);
+ memset(kibnal_data.kib_tx_descs, 0,
+ IBNAL_TX_MSGS() * sizeof(kib_tx_t));
+
+ for (i = 0; i < IBNAL_TX_MSGS(); i++) {
+ kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
+
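+                /* per-tx mapping state: a page array under FMR, otherwise work
+                 * requests, a gather list and an rdma descriptor sized for
+                 * IBNAL_MAX_RDMA_FRAGS */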
+#if IBNAL_USE_FMR
+ LIBCFS_ALLOC(tx->tx_pages, LNET_MAX_IOV *
+ sizeof(*tx->tx_pages));
+ if (tx->tx_pages == NULL)
+ return -ENOMEM;
+#else
+ LIBCFS_ALLOC(tx->tx_wrq,
+ (1 + IBNAL_MAX_RDMA_FRAGS) *
+ sizeof(*tx->tx_wrq));
+ if (tx->tx_wrq == NULL)
+ return -ENOMEM;
+
+ LIBCFS_ALLOC(tx->tx_gl,
+ (1 + IBNAL_MAX_RDMA_FRAGS) *
+ sizeof(*tx->tx_gl));
+ if (tx->tx_gl == NULL)
+ return -ENOMEM;
+
+ LIBCFS_ALLOC(tx->tx_rd,
+ offsetof(kib_rdma_desc_t,
+ rd_frags[IBNAL_MAX_RDMA_FRAGS]));
+ if (tx->tx_rd == NULL)
+ return -ENOMEM;
+#endif
}
- CDEBUG(D_NET, "registered %d pages; handle: %p vaddr "LPX64" "
- "lkey %x rkey %x\n", npages, p->ibp_handle,
- p->ibp_vaddr, p->ibp_lkey, p->ibp_rkey);
-
- p->ibp_mapped = 1;
-out:
- *pp = p;
- return (0);
+ return 0;
+}
+
+void
+kibnal_free_tx_descs (void)
+{
+ int i;
+
+ if (kibnal_data.kib_tx_descs == NULL)
+ return;
+
+ for (i = 0; i < IBNAL_TX_MSGS(); i++) {
+ kib_tx_t *tx = &kibnal_data.kib_tx_descs[i];
+
+#if IBNAL_USE_FMR
+ if (tx->tx_pages != NULL)
+ LIBCFS_FREE(tx->tx_pages, LNET_MAX_IOV *
+ sizeof(*tx->tx_pages));
+#else
+ if (tx->tx_wrq != NULL)
+ LIBCFS_FREE(tx->tx_wrq,
+ (1 + IBNAL_MAX_RDMA_FRAGS) *
+ sizeof(*tx->tx_wrq));
+
+ if (tx->tx_gl != NULL)
+ LIBCFS_FREE(tx->tx_gl,
+ (1 + IBNAL_MAX_RDMA_FRAGS) *
+ sizeof(*tx->tx_gl));
+
+ if (tx->tx_rd != NULL)
+ LIBCFS_FREE(tx->tx_rd,
+ offsetof(kib_rdma_desc_t,
+ rd_frags[IBNAL_MAX_RDMA_FRAGS]));
+#endif
+ }
+
+ LIBCFS_FREE(kibnal_data.kib_tx_descs,
+ IBNAL_TX_MSGS() * sizeof(kib_tx_t));
}
-static int
+int
kibnal_setup_tx_descs (void)
{
int ipage = 0;
int page_offset = 0;
- __u64 vaddr;
- __u64 vaddr_base;
struct page *page;
kib_tx_t *tx;
int i;
int rc;
/* pre-mapped messages are not bigger than 1 page */
- LASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
+ CLASSERT (IBNAL_MSG_SIZE <= PAGE_SIZE);
/* No fancy arithmetic when we do the buffer calculations */
- LASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
+ CLASSERT (PAGE_SIZE % IBNAL_MSG_SIZE == 0);
- rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages, IBNAL_TX_MSG_PAGES,
- 0);
+ rc = kibnal_alloc_pages(&kibnal_data.kib_tx_pages,
+ IBNAL_TX_MSG_PAGES());
if (rc != 0)
return (rc);
- /* ignored for the whole_mem case */
- vaddr = vaddr_base = kibnal_data.kib_tx_pages->ibp_vaddr;
-
- for (i = 0; i < IBNAL_TX_MSGS; i++) {
+ for (i = 0; i < IBNAL_TX_MSGS(); i++) {
page = kibnal_data.kib_tx_pages->ibp_pages[ipage];
tx = &kibnal_data.kib_tx_descs[i];
- memset (tx, 0, sizeof(*tx)); /* zero flags etc */
-
+#if IBNAL_USE_FMR
+ /* Allocate an FMR for this TX so it can map src/sink buffers
+ * for large transfers */
+#endif
tx->tx_msg = (kib_msg_t *)(((char *)page_address(page)) +
page_offset);
- if (kibnal_whole_mem())
- tx->tx_vaddr = kibnal_page2phys(page) +
- page_offset +
- kibnal_data.kib_md.md_addr;
- else
- tx->tx_vaddr = vaddr;
-
- tx->tx_mapped = KIB_TX_UNMAPPED;
+ tx->tx_hca_msg = kibnal_data.kib_whole_mem.md_addr +
+ kibnal_page2phys(page) + page_offset;
CDEBUG(D_NET, "Tx[%d] %p->%p - "LPX64"\n",
- i, tx, tx->tx_msg, tx->tx_vaddr);
+ i, tx, tx->tx_msg, tx->tx_hca_msg);
list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
- vaddr += IBNAL_MSG_SIZE;
- LASSERT (vaddr <= vaddr_base + IBNAL_TX_MSG_BYTES);
-
page_offset += IBNAL_MSG_SIZE;
LASSERT (page_offset <= PAGE_SIZE);
if (page_offset == PAGE_SIZE) {
page_offset = 0;
ipage++;
- LASSERT (ipage <= IBNAL_TX_MSG_PAGES);
+ LASSERT (ipage <= IBNAL_TX_MSG_PAGES());
}
}
return (0);
}
+int
+kibnal_register_all_memory(void)
+{
+ /* CAVEAT EMPTOR: this assumes all physical memory is in 1 contiguous
+ * chunk starting at 0 */
+ struct sysinfo si;
+ __u64 total;
+ __u64 roundup = (128<<20); /* round up in big chunks */
+ IB_MR_PHYS_BUFFER phys;
+ IB_ACCESS_CONTROL access;
+ FSTATUS frc;
+ int rc;
+
+ memset(&access, 0, sizeof(access));
+ access.s.MWBindable = 1;
+ access.s.LocalWrite = 1;
+ access.s.RdmaRead = 1;
+ access.s.RdmaWrite = 1;
+
+ /* XXX we don't bother with first-gen cards */
+ if (kibnal_data.kib_hca_attrs.VendorId == 0xd0b7 &&
+ kibnal_data.kib_hca_attrs.DeviceId == 0x3101) {
+ CERROR("Can't register all memory on first generation HCAs\n");
+ return -EINVAL;
+ }
+
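+        /* size the registration from total RAM, taking the larger of the
+         * si_meminfo and max_mapnr views */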
+ si_meminfo(&si);
+ total = ((__u64)si.totalram) * si.mem_unit;
+
+ if (total < ((__u64)max_mapnr) * PAGE_SIZE)
+ total = ((__u64)max_mapnr) * PAGE_SIZE;
+
+ if (total == 0) {
+ CERROR("Can't determine memory size\n");
+ return -ENOMEM;
+ }
+
+ total = (total + (roundup - 1)) & ~(roundup - 1);
+
+ phys.PhysAddr = 0;
+ phys.Length = total;
+
+ frc = iibt_register_contig_physical_memory(
+ kibnal_data.kib_hca, 0, &phys, 1, 0,
+ kibnal_data.kib_pd, access,
+ &kibnal_data.kib_whole_mem.md_handle,
+ &kibnal_data.kib_whole_mem.md_addr,
+ &kibnal_data.kib_whole_mem.md_lkey,
+ &kibnal_data.kib_whole_mem.md_rkey);
+
+ if (frc != FSUCCESS) {
+ CERROR("registering physical memory failed: %d\n", frc);
+ return -EIO;
+ }
+
+ CDEBUG(D_WARNING, "registered phys mem from "LPX64" for "LPU64"\n",
+ phys.PhysAddr, phys.Length);
+
+ return 0;
+}
+
void
kibnal_shutdown (lnet_ni_t *ni)
{
int i;
int rc;
- LASSERT (ni->ni_data == &kibnal_data);
LASSERT (ni == kibnal_data.kib_ni);
+ LASSERT (ni->ni_data == &kibnal_data);
CDEBUG(D_MALLOC, "before NAL cleanup: kmem %d\n",
atomic_read (&libcfs_kmemory));
LBUG();
case IBNAL_INIT_ALL:
- /* resetting my NID to unadvertises me, removes my
- * listener and nukes all current peers */
- kibnal_set_mynid (LNET_NID_ANY);
+ /* stop accepting connections, prevent new peers and start to
+ * tear down all existing ones... */
+ kibnal_stop_listener(1);
- /* Wait for all peer state to clean up (crazy) */
+ /* Wait for all peer state to clean up */
i = 2;
while (atomic_read (&kibnal_data.kib_npeers) != 0) {
i++;
CDEBUG(((i & (-i)) == i) ? D_WARNING : D_NET, /* power of 2? */
- "waiting for %d peers to disconnect (can take a few seconds)\n",
+ "waiting for %d peers to disconnect\n",
atomic_read (&kibnal_data.kib_npeers));
set_current_state (TASK_UNINTERRUPTIBLE);
schedule_timeout (HZ);
kibnal_free_pages (kibnal_data.kib_tx_pages);
/* fall through */
- case IBNAL_INIT_MR:
- if (kibnal_data.kib_md.md_handle != NULL) {
- rc = iibt_deregister_memory(kibnal_data.kib_md.md_handle);
- if (rc != FSUCCESS)
- CERROR ("Deregister memory: %d\n", rc);
- }
+ case IBNAL_INIT_MD:
+ rc = iibt_deregister_memory(kibnal_data.kib_whole_mem.md_handle);
+ if (rc != FSUCCESS)
+ CERROR ("Deregister memory: %d\n", rc);
/* fall through */
-#if IBNAL_FMR
- case IBNAL_INIT_FMR:
- rc = ib_fmr_pool_destroy (kibnal_data.kib_fmr_pool);
- if (rc != 0)
- CERROR ("Destroy FMR pool error: %d\n", rc);
- /* fall through */
-#endif
case IBNAL_INIT_PD:
rc = iibt_pd_free(kibnal_data.kib_pd);
if (rc != 0)
CERROR ("Deregister SD error: %d\n", rc);
/* fall through */
- case IBNAL_INIT_PORT:
- /* XXX ??? */
- /* fall through */
-
case IBNAL_INIT_PORTATTRS:
LIBCFS_FREE(kibnal_data.kib_hca_attrs.PortAttributesList,
kibnal_data.kib_hca_attrs.PortAttributesListSize);
/* fall through */
case IBNAL_INIT_DATA:
- /* Module refcount only gets to zero when all peers
- * have been closed so all lists must be empty */
LASSERT (atomic_read (&kibnal_data.kib_npeers) == 0);
LASSERT (kibnal_data.kib_peers != NULL);
for (i = 0; i < kibnal_data.kib_peer_hash_size; i++) {
LASSERT (list_empty (&kibnal_data.kib_peers[i]));
}
LASSERT (atomic_read (&kibnal_data.kib_nconns) == 0);
- LASSERT (list_empty (&kibnal_data.kib_sched_rxq));
- LASSERT (list_empty (&kibnal_data.kib_sched_txq));
+ LASSERT (list_empty (&kibnal_data.kib_connd_zombies));
LASSERT (list_empty (&kibnal_data.kib_connd_conns));
LASSERT (list_empty (&kibnal_data.kib_connd_peers));
break;
}
- if (kibnal_data.kib_tx_descs != NULL)
- LIBCFS_FREE (kibnal_data.kib_tx_descs,
- IBNAL_TX_MSGS * sizeof(kib_tx_t));
+ kibnal_free_tx_descs();
if (kibnal_data.kib_peers != NULL)
LIBCFS_FREE (kibnal_data.kib_peers,
PORTAL_MODULE_UNUSE;
}
-#define roundup_power(val, power) \
- ( (val + (__u64)(power - 1)) & ~((__u64)(power - 1)) )
-
-/* this isn't very portable or sturdy in the face of funny mem/bus configs */
-static __u64 max_phys_mem(IB_CA_ATTRIBUTES *ca_attr)
-{
- struct sysinfo si;
- __u64 ret;
-
- /* XXX we don't bother with first-gen cards */
- if (ca_attr->VendorId == 0xd0b7 && ca_attr->DeviceId == 0x3101)
- return 0ULL;
-
- si_meminfo(&si);
- ret = (__u64)max(si.totalram, max_mapnr) * si.mem_unit;
- return roundup_power(ret, 128 * 1024 * 1024);
-}
-#undef roundup_power
-
int
kibnal_startup (lnet_ni_t *ni)
{
+ char ipif_name[32];
+ __u32 ip;
+ __u32 netmask;
+ int up;
+ int nob;
+ struct timeval tv;
IB_PORT_ATTRIBUTES *pattr;
FSTATUS frc;
int rc;
- int n;
+ __u32 n;
int i;
LASSERT (ni->ni_lnd == &the_kiblnd);
return -EPERM;
}
- if (ni->ni_interfaces[0] != NULL) {
- CERROR("Explicit interface config not supported\n");
- return -EPERM;
- }
-
if (IBNAL_CREDITS > IBNAL_NTX) {
CERROR ("Can't set credits(%d) > ntx(%d)\n",
IBNAL_CREDITS, IBNAL_NTX);
return -EINVAL;
}
- ni->ni_maxtxcredits = IBNAL_CREDITS;
- ni->ni_peertxcredits = IBNAL_PEERCREDITS;
+ if (*kibnal_tunables.kib_credits > *kibnal_tunables.kib_ntx) {
+ CERROR ("Can't set credits(%d) > ntx(%d)\n",
+ *kibnal_tunables.kib_credits,
+ *kibnal_tunables.kib_ntx);
+ return -EINVAL;
+ }
+
+ ni->ni_maxtxcredits = *kibnal_tunables.kib_credits;
+ ni->ni_peertxcredits = *kibnal_tunables.kib_peercredits;
+
+ CLASSERT (LNET_MAX_INTERFACES > 1);
+
+ if (ni->ni_interfaces[0] == NULL) {
+ kibnal_data.kib_hca_idx = 0;
+ } else {
+ /* Use the HCA specified in 'networks=' */
+ if (ni->ni_interfaces[1] != NULL) {
+ CERROR("Multiple interfaces not supported\n");
+ return -EPERM;
+ }
+
+ /* Parse <number> into kib_hca_idx */
+ nob = strlen(ni->ni_interfaces[0]);
+ if (sscanf(ni->ni_interfaces[0], "%d%n",
+ &kibnal_data.kib_hca_idx, &nob) < 1 ||
+ nob != strlen(ni->ni_interfaces[0])) {
+ CERROR("Can't parse interface '%s'\n",
+ ni->ni_interfaces[0]);
+ return -EINVAL;
+ }
+ }
+
+ /* Find IP address from <ipif base name><number> */
+ snprintf(ipif_name, sizeof(ipif_name), "%s%d",
+ *kibnal_tunables.kib_ipif_basename, kibnal_data.kib_hca_idx);
+ if (strlen(ipif_name) == sizeof(ipif_name) - 1) {
+ CERROR("IPoIB interface name %s truncated\n", ipif_name);
+ return -EINVAL;
+ }
+
+ rc = libcfs_ipif_query(ipif_name, &up, &ip, &netmask);
+ if (rc != 0) {
+ CERROR("Can't query IPoIB interface %s: %d\n", ipif_name, rc);
+ return -ENETDOWN;
+ }
+
+ if (!up) {
+ CERROR("Can't query IPoIB interface %s: it's down\n", ipif_name);
+ return -ENETDOWN;
+ }
+
+ ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip);
ni->ni_data = &kibnal_data;
kibnal_data.kib_ni = ni;
+ do_gettimeofday(&tv);
+ kibnal_data.kib_incarnation = (((__u64)tv.tv_sec) * 1000000) + tv.tv_usec;
+
frc = IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2,
&kibnal_data.kib_interfaces);
if (frc != FSUCCESS) {
CERROR("IbtGetInterfaceByVersion(IBT_INTERFACE_VERSION_2) = %d\n",
- frc);
+ frc);
return -ENOSYS;
}
PORTAL_MODULE_USE;
- init_MUTEX (&kibnal_data.kib_nid_mutex);
- init_MUTEX_LOCKED (&kibnal_data.kib_nid_signal);
- kibnal_data.kib_ni->ni_nid = LNET_NID_ANY;
-
rwlock_init(&kibnal_data.kib_global_lock);
kibnal_data.kib_peer_hash_size = IBNAL_PEER_HASH_SIZE;
spin_lock_init (&kibnal_data.kib_connd_lock);
INIT_LIST_HEAD (&kibnal_data.kib_connd_peers);
INIT_LIST_HEAD (&kibnal_data.kib_connd_conns);
+ INIT_LIST_HEAD (&kibnal_data.kib_connd_zombies);
init_waitqueue_head (&kibnal_data.kib_connd_waitq);
spin_lock_init (&kibnal_data.kib_sched_lock);
- INIT_LIST_HEAD (&kibnal_data.kib_sched_txq);
- INIT_LIST_HEAD (&kibnal_data.kib_sched_rxq);
init_waitqueue_head (&kibnal_data.kib_sched_waitq);
spin_lock_init (&kibnal_data.kib_tx_lock);
INIT_LIST_HEAD (&kibnal_data.kib_idle_txs);
- LIBCFS_ALLOC (kibnal_data.kib_tx_descs,
- IBNAL_TX_MSGS * sizeof(kib_tx_t));
- if (kibnal_data.kib_tx_descs == NULL) {
- CERROR ("Can't allocate tx descs\n");
+ rc = kibnal_alloc_tx_descs();
+ if (rc != 0) {
+ CERROR("Can't allocate tx descs\n");
goto failed;
}
kibnal_data.kib_init = IBNAL_INIT_DATA;
/*****************************************************/
+ kibnal_data.kib_sdretry.RetryCount = *kibnal_tunables.kib_sd_retries;
+ kibnal_data.kib_sdretry.Timeout = (*kibnal_tunables.kib_timeout * 1000)/
+ *kibnal_tunables.kib_sd_retries;
+
for (i = 0; i < IBNAL_N_SCHED; i++) {
rc = kibnal_thread_start (kibnal_scheduler, (void *)i);
if (rc != 0) {
sizeof(kibnal_data.kib_hca_guids[0]);
frc = iibt_get_hca_guids(&n, kibnal_data.kib_hca_guids);
if (frc != FSUCCESS) {
- CERROR ("Can't get channel adapter guids: %d\n", frc);
+ CERROR ("Can't get HCA guids: %d\n", frc);
goto failed;
}
+
if (n == 0) {
- CERROR ("No channel adapters found\n");
+ CERROR ("No HCAs found\n");
goto failed;
}
- /* Infinicon has per-HCA rather than per CQ completion handlers */
- frc = iibt_open_hca(kibnal_data.kib_hca_guids[0],
- kibnal_ca_callback,
- kibnal_ca_async_callback,
- &kibnal_data.kib_hca,
+ if (n <= kibnal_data.kib_hca_idx) {
+ CERROR("Invalid HCA %d requested: (must be 0 - %d inclusive)\n",
+ kibnal_data.kib_hca_idx, n - 1);
+ goto failed;
+ }
+
+ /* Infinicon has per-HCA notification callbacks */
+ frc = iibt_open_hca(kibnal_data.kib_hca_guids[kibnal_data.kib_hca_idx],
+ kibnal_hca_callback,
+ kibnal_hca_async_callback,
+ NULL,
&kibnal_data.kib_hca);
if (frc != FSUCCESS) {
- CERROR ("Can't open CA[0]: %d\n", frc);
+ CERROR ("Can't open HCA[%d]: %d\n",
+ kibnal_data.kib_hca_idx, frc);
goto failed;
}
frc = iibt_query_hca(kibnal_data.kib_hca, &kibnal_data.kib_hca_attrs,
NULL);
if (frc != FSUCCESS) {
- CERROR ("Can't get port attrs for CA 0: %d\n", frc);
+ CERROR ("Can't get port attrs for HCA %d: %d\n",
+ kibnal_data.kib_hca_idx, frc);
goto failed;
}
CDEBUG(D_NET, "got guid "LPX64"\n", kibnal_data.kib_port_guid);
- /* Active port found */
- kibnal_data.kib_init = IBNAL_INIT_PORT;
- /*****************************************************/
-
frc = iibt_sd_register(&kibnal_data.kib_sd, NULL);
if (frc != FSUCCESS) {
CERROR ("Can't register with SD: %d\n", frc);
kibnal_data.kib_init = IBNAL_INIT_PD;
/*****************************************************/
-#if IBNAL_FMR
- {
- const int pool_size = IBNAL_NTX;
- struct ib_fmr_pool_param params = {
- .max_pages_per_fmr = LNET_MTU/PAGE_SIZE,
- .access = (IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_REMOTE_READ),
- .pool_size = pool_size,
- .dirty_watermark = (pool_size * 3)/4,
- .flush_function = NULL,
- .flush_arg = NULL,
- .cache = 1,
- };
- rc = ib_fmr_pool_create(kibnal_data.kib_pd, ¶ms,
- &kibnal_data.kib_fmr_pool);
- if (rc != 0) {
- CERROR ("Can't create FMR pool size %d: %d\n",
- pool_size, rc);
- goto failed;
- }
- }
-
- /* flag FMR pool initialised */
- kibnal_data.kib_init = IBNAL_INIT_FMR;
-#endif
- /*****************************************************/
- if (IBNAL_WHOLE_MEM) {
- IB_MR_PHYS_BUFFER phys;
- IB_ACCESS_CONTROL access;
- kib_md_t *md = &kibnal_data.kib_md;
-
- memset(&access, 0, sizeof(access));
- access.s.MWBindable = 1;
- access.s.LocalWrite = 1;
- access.s.RdmaRead = 1;
- access.s.RdmaWrite = 1;
-
- phys.PhysAddr = 0;
- phys.Length = max_phys_mem(&kibnal_data.kib_hca_attrs);
- if (phys.Length == 0) {
- CERROR ("couldn't determine the end of phys mem\n");
- goto failed;
- }
-
- rc = iibt_register_contig_physical_memory(kibnal_data.kib_hca,
- 0,
- &phys, 1,
- 0,
- kibnal_data.kib_pd,
- access,
- &md->md_handle,
- &md->md_addr,
- &md->md_lkey,
- &md->md_rkey);
- if (rc != FSUCCESS) {
- CERROR("registering physical memory failed: %d\n",
- rc);
- CERROR("falling back to registration per-rdma\n");
- md->md_handle = NULL;
- } else {
- CDEBUG(D_NET, "registered "LPU64" bytes of mem\n",
- phys.Length);
- kibnal_data.kib_init = IBNAL_INIT_MR;
- }
+ rc = kibnal_register_all_memory();
+ if (rc != 0) {
+ CERROR ("Can't register all memory\n");
+ goto failed;
}
-
+
+ /* flag whole memory MD initialised */
+ kibnal_data.kib_init = IBNAL_INIT_MD;
/*****************************************************/
rc = kibnal_setup_tx_descs();
kibnal_data.kib_init = IBNAL_INIT_TXD;
/*****************************************************/
- {
- uint32 nentries;
-
- frc = iibt_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES,
- &kibnal_data.kib_cq, &kibnal_data.kib_cq,
- &nentries);
- if (frc != FSUCCESS) {
- CERROR ("Can't create RX CQ: %d\n", frc);
- goto failed;
- }
-
- /* flag CQ initialised */
- kibnal_data.kib_init = IBNAL_INIT_CQ;
+ frc = iibt_cq_create(kibnal_data.kib_hca, IBNAL_CQ_ENTRIES(),
+ &kibnal_data.kib_cq, &kibnal_data.kib_cq,
+ &n);
+ if (frc != FSUCCESS) {
+ CERROR ("Can't create RX CQ: %d\n", frc);
+ goto failed;
+ }
- if (nentries < IBNAL_CQ_ENTRIES) {
- CERROR ("CQ only has %d entries, need %d\n",
- nentries, IBNAL_CQ_ENTRIES);
- goto failed;
- }
+ /* flag CQ initialised */
+ kibnal_data.kib_init = IBNAL_INIT_CQ;
+ /*****************************************************/
+
+ if (n < IBNAL_CQ_ENTRIES()) {
+ CERROR ("CQ only has %d entries: %d needed\n",
+ n, IBNAL_CQ_ENTRIES());
+ goto failed;
+ }
- rc = iibt_cq_rearm(kibnal_data.kib_cq, CQEventSelNextWC);
- if (rc != 0) {
- CERROR ("Failed to re-arm completion queue: %d\n", rc);
- goto failed;
- }
+ rc = iibt_cq_rearm(kibnal_data.kib_cq, CQEventSelNextWC);
+ if (rc != 0) {
+ CERROR ("Failed to re-arm completion queue: %d\n", rc);
+ goto failed;
}
+ rc = kibnal_start_listener();
+ if (rc != 0) {
+ CERROR("Can't start listener: %d\n", rc);
+ goto failed;
+ }
+
/* flag everything initialised */
kibnal_data.kib_init = IBNAL_INIT_ALL;
/*****************************************************/
void __exit
kibnal_module_fini (void)
{
-#ifdef CONFIG_SYSCTL
- if (kibnal_tunables.kib_sysctl != NULL)
- unregister_sysctl_table (kibnal_tunables.kib_sysctl);
-#endif
lnet_unregister_lnd(&the_kiblnd);
+ kibnal_tunables_fini();
}
int __init
return -ENODEV;
}
- if (sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN) {
- CERROR("sizeof(kib_wire_connreq_t) > CM_REQUEST_INFO_USER_LEN\n");
- return -EINVAL;
- }
-
- /* the following must be sizeof(int) for proc_dointvec() */
- if (sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)) {
- CERROR("sizeof (kibnal_tunables.kib_io_timeout) != sizeof (int)\n");
- return -EINVAL;
- }
-
- /* Initialise dynamic tunables to defaults once only */
- kibnal_tunables.kib_io_timeout = IBNAL_IO_TIMEOUT;
+ rc = kibnal_tunables_init();
+ if (rc != 0)
+ return rc;
lnet_register_lnd(&the_kiblnd);
-
-#ifdef CONFIG_SYSCTL
- /* Press on regardless even if registering sysctl doesn't work */
- kibnal_tunables.kib_sysctl =
- register_sysctl_table (kibnal_top_ctl_table, 0);
-#endif
- return (0);
+
+ return 0;
}
MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
-MODULE_DESCRIPTION("Kernel Infinicon IB NAL v0.01");
+MODULE_DESCRIPTION("Kernel Infinicon IB LND v1.00");
MODULE_LICENSE("GPL");
module_init(kibnal_module_init);
#include "iiblnd.h"
-/*
- * LIB functions follow
- *
- */
-static void
-kibnal_schedule_tx_done (kib_tx_t *tx)
+void
+hexdump(char *string, void *ptr, int len)
{
- unsigned long flags;
+ unsigned char *c = ptr;
+ int i;
- spin_lock_irqsave (&kibnal_data.kib_sched_lock, flags);
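+        /* debug helper, disabled by the early return below; remove it to
+         * enable dumps */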
+ return;
- list_add_tail(&tx->tx_list, &kibnal_data.kib_sched_txq);
- wake_up (&kibnal_data.kib_sched_waitq);
+ if (len < 0 || len > 2048) {
+ printk("XXX what the hell? %d\n",len);
+ return;
+ }
- spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+ printk("%d bytes of '%s' from 0x%p\n", len, string, ptr);
+
+ for (i = 0; i < len;) {
+ printk("%02x",*(c++));
+ i++;
+ if (!(i & 15)) {
+ printk("\n");
+ } else if (!(i&1)) {
+ printk(" ");
+ }
+ }
+
+ if(len & 15) {
+ printk("\n");
+ }
}
-static void
+void
kibnal_tx_done (kib_tx_t *tx)
{
- unsigned long flags;
- int i;
- FSTATUS frc;
+ int rc = tx->tx_status;
+ int i;
+ FSTATUS frc;
- LASSERT (tx->tx_sending == 0); /* mustn't be awaiting callback */
- LASSERT (!tx->tx_passive_rdma_wait); /* mustn't be awaiting RDMA */
+ LASSERT (!in_interrupt());
+ LASSERT (!tx->tx_queued); /* mustn't be queued for sending */
+ LASSERT (tx->tx_sending == 0); /* mustn't be awaiting sent callback */
+ LASSERT (!tx->tx_waiting); /* mustn't be awaiting peer response */
- switch (tx->tx_mapped) {
- default:
- LBUG();
-
- case KIB_TX_UNMAPPED:
- break;
-
- case KIB_TX_MAPPED:
- if (in_interrupt()) {
- /* can't deregister memory in IRQ context... */
- kibnal_schedule_tx_done(tx);
- return;
- }
- frc = iibt_deregister_memory(tx->tx_md.md_handle);
- LASSERT (frc == FSUCCESS);
- tx->tx_mapped = KIB_TX_UNMAPPED;
- break;
-
-#if IBNAL_FMR
- case KIB_TX_MAPPED_FMR:
- if (in_interrupt() && tx->tx_status != 0) {
- /* can't flush FMRs in IRQ context... */
- kibnal_schedule_tx_done(tx);
- return;
- }
-
- rc = ib_fmr_deregister(tx->tx_md.md_handle.fmr);
- LASSERT (rc == 0);
-
- if (tx->tx_status != 0)
- ib_fmr_pool_force_flush(kibnal_data.kib_fmr_pool);
- tx->tx_mapped = KIB_TX_UNMAPPED;
- break;
+#if IBNAL_USE_FMR
+ /* Handle unmapping if required */
#endif
- }
-
for (i = 0; i < 2; i++) {
/* tx may have up to 2 ptlmsgs to finalise */
if (tx->tx_lntmsg[i] == NULL)
continue;
- lnet_finalize (kibnal_data.kib_ni, tx->tx_lntmsg[i],
- tx->tx_status);
+ lnet_finalize (kibnal_data.kib_ni, tx->tx_lntmsg[i], rc);
tx->tx_lntmsg[i] = NULL;
}
if (tx->tx_conn != NULL) {
- kibnal_put_conn (tx->tx_conn);
+ kibnal_conn_decref(tx->tx_conn);
tx->tx_conn = NULL;
}
- tx->tx_nsp = 0;
- tx->tx_passive_rdma = 0;
+ tx->tx_nwrq = 0;
tx->tx_status = 0;
- spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+ spin_lock(&kibnal_data.kib_tx_lock);
- list_add_tail (&tx->tx_list, &kibnal_data.kib_idle_txs);
+ list_add (&tx->tx_list, &kibnal_data.kib_idle_txs);
- spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+ spin_unlock(&kibnal_data.kib_tx_lock);
}
-static kib_tx_t *
+kib_tx_t *
kibnal_get_idle_tx (void)
{
- unsigned long flags;
- kib_tx_t *tx = NULL;
+ kib_tx_t *tx;
- spin_lock_irqsave (&kibnal_data.kib_tx_lock, flags);
+ spin_lock(&kibnal_data.kib_tx_lock);
if (list_empty (&kibnal_data.kib_idle_txs)) {
- spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+ spin_unlock(&kibnal_data.kib_tx_lock);
return NULL;
}
-
+
tx = list_entry (kibnal_data.kib_idle_txs.next, kib_tx_t, tx_list);
list_del (&tx->tx_list);
- /* Allocate a new passive RDMA completion cookie. It might not be
- * needed, but we've got a lock right now and we're unlikely to
+ /* Allocate a new completion cookie. It might not be needed,
+ * but we've got a lock right now and we're unlikely to
* wrap... */
- tx->tx_passive_rdma_cookie = kibnal_data.kib_next_tx_cookie++;
+ tx->tx_cookie = kibnal_data.kib_next_tx_cookie++;
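+ /* NB: the cookie travels in PUT/GET messages and is matched against
+ * ibc_active_txs by kibnal_find_waiting_tx_locked() when the peer's
+ * completion arrives. */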
- spin_unlock_irqrestore (&kibnal_data.kib_tx_lock, flags);
+ spin_unlock(&kibnal_data.kib_tx_lock);
- LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
- LASSERT (tx->tx_nsp == 0);
+ LASSERT (tx->tx_nwrq == 0);
+ LASSERT (!tx->tx_queued);
LASSERT (tx->tx_sending == 0);
+ LASSERT (!tx->tx_waiting);
LASSERT (tx->tx_status == 0);
LASSERT (tx->tx_conn == NULL);
- LASSERT (!tx->tx_passive_rdma);
- LASSERT (!tx->tx_passive_rdma_wait);
LASSERT (tx->tx_lntmsg[0] == NULL);
LASSERT (tx->tx_lntmsg[1] == NULL);
- RETURN(tx);
+ return tx;
}
-static void
-kibnal_complete_passive_rdma(kib_conn_t *conn, __u64 cookie, int status)
+int
+kibnal_post_rx (kib_rx_t *rx, int credit)
{
- struct list_head *ttmp;
- unsigned long flags;
- int idle;
-
- spin_lock_irqsave (&conn->ibc_lock, flags);
+ kib_conn_t *conn = rx->rx_conn;
+ int rc = 0;
+ unsigned long flags;
+ FSTATUS frc;
- list_for_each (ttmp, &conn->ibc_active_txs) {
- kib_tx_t *tx = list_entry(ttmp, kib_tx_t, tx_list);
+ LASSERT (!in_interrupt());
+
+ rx->rx_gl = (IB_LOCAL_DATASEGMENT) {
+ .Address = rx->rx_hca_msg,
+ .Lkey = kibnal_data.kib_whole_mem.md_lkey,
+ .Length = IBNAL_MSG_SIZE,
+ };
- LASSERT (tx->tx_passive_rdma ||
- !tx->tx_passive_rdma_wait);
+ rx->rx_wrq = (IB_WORK_REQ) {
+ .WorkReqId = kibnal_ptr2wreqid(rx, IBNAL_WID_RX),
+ .MessageLen = IBNAL_MSG_SIZE,
+ .DSList = &rx->rx_gl,
+ .DSListDepth = 1,
+ .Operation = WROpRecv,
+ };
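+ /* NB: the work request ID encodes the descriptor pointer together
+ * with a type tag (IBNAL_WID_RX here) so completions can be
+ * dispatched by type; see kibnal_wreqid2ptr(). */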
- LASSERT (tx->tx_passive_rdma_wait ||
- tx->tx_sending != 0);
+ LASSERT (conn->ibc_state >= IBNAL_CONN_CONNECTING);
+ LASSERT (rx->rx_nob >= 0); /* not posted */
- if (!tx->tx_passive_rdma_wait ||
- tx->tx_passive_rdma_cookie != cookie)
- continue;
+ CDEBUG(D_NET, "posting rx [%d %x "LPX64"]\n",
+ rx->rx_wrq.DSList->Length,
+ rx->rx_wrq.DSList->Lkey,
+ rx->rx_wrq.DSList->Address);
- CDEBUG(D_NET, "Complete %p "LPD64": %d\n", tx, cookie, status);
+ if (conn->ibc_state > IBNAL_CONN_ESTABLISHED) {
+ /* No more posts for this rx; so lose its ref */
+ kibnal_conn_decref(conn);
+ return 0;
+ }
+
+ rx->rx_nob = -1; /* flag posted */
+ mb();
- /* XXX Set mlength of REPLY here */
+ frc = iibt_postrecv(conn->ibc_qp, &rx->rx_wrq);
+ if (frc == FSUCCESS) {
+ if (credit) {
+ spin_lock(&conn->ibc_lock);
+ conn->ibc_outstanding_credits++;
+ spin_unlock(&conn->ibc_lock);
- tx->tx_status = status;
- tx->tx_passive_rdma_wait = 0;
- idle = (tx->tx_sending == 0);
+ kibnal_check_sends(conn);
+ }
+ return 0;
+ }
+
+ CERROR ("post rx -> %s failed %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+ rc = -EIO;
+ kibnal_close_conn(rx->rx_conn, rc);
+ /* No more posts for this rx; so lose its ref */
+ kibnal_conn_decref(conn);
+ return rc;
+}
- if (idle)
- list_del (&tx->tx_list);
+int
+kibnal_post_receives (kib_conn_t *conn)
+{
+ int i;
+ int rc;
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
+ LASSERT (conn->ibc_state == IBNAL_CONN_CONNECTING);
- /* I could be racing with tx callbacks. It's whoever
- * _makes_ tx idle that frees it */
- if (idle)
- kibnal_tx_done (tx);
- return;
+ for (i = 0; i < IBNAL_RX_MSGS; i++) {
+ /* +1 ref for rx desc. This ref remains until kibnal_post_rx
+ * fails (i.e. actual failure or we're disconnecting) */
+ kibnal_conn_addref(conn);
+ rc = kibnal_post_rx (&conn->ibc_rxs[i], 0);
+ if (rc != 0)
+ return rc;
}
-
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
- CERROR ("Unmatched (late?) RDMA completion "LPX64" from %s\n",
- cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ return 0;
}
-static __u32
-kibnal_lkey(kib_pages_t *ibp)
+kib_tx_t *
+kibnal_find_waiting_tx_locked(kib_conn_t *conn, int txtype, __u64 cookie)
{
- if (kibnal_whole_mem())
- return kibnal_data.kib_md.md_lkey;
+ struct list_head *tmp;
+
+ list_for_each(tmp, &conn->ibc_active_txs) {
+ kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
+
+ LASSERT (!tx->tx_queued);
+ LASSERT (tx->tx_sending != 0 || tx->tx_waiting);
+
+ if (tx->tx_cookie != cookie)
+ continue;
- return ibp->ibp_lkey;
+ if (tx->tx_waiting &&
+ tx->tx_msg->ibm_type == txtype)
+ return tx;
+
+ CWARN("Bad completion: %swaiting, type %x (wanted %x)\n",
+ tx->tx_waiting ? "" : "NOT ",
+ tx->tx_msg->ibm_type, txtype);
+ }
+ return NULL;
}
-static void
-kibnal_post_rx (kib_rx_t *rx, int do_credits)
+void
+kibnal_handle_completion(kib_conn_t *conn, int txtype, int status, __u64 cookie)
{
- kib_conn_t *conn = rx->rx_conn;
- int rc = 0;
- unsigned long flags;
- FSTATUS frc;
- ENTRY;
+ kib_tx_t *tx;
+ int idle;
- rx->rx_gl = (IB_LOCAL_DATASEGMENT) {
- .Address = rx->rx_vaddr,
- .Length = IBNAL_MSG_SIZE,
- .Lkey = kibnal_lkey(conn->ibc_rx_pages),
- };
+ spin_lock(&conn->ibc_lock);
- rx->rx_wrq = (IB_WORK_REQ) {
- .Operation = WROpRecv,
- .DSListDepth = 1,
- .MessageLen = IBNAL_MSG_SIZE,
- .WorkReqId = kibnal_ptr2wreqid(rx, 1),
- .DSList = &rx->rx_gl,
- };
-
- KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED,
- IBNAL_CONN_DREP);
- LASSERT (rx->rx_nob >= 0);
- rx->rx_nob = -1; /* flag posted */
- mb();
+ tx = kibnal_find_waiting_tx_locked(conn, txtype, cookie);
+ if (tx == NULL) {
+ spin_unlock(&conn->ibc_lock);
- if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
- rc = -ECONNABORTED;
- else {
- frc = iibt_postrecv(conn->ibc_qp, &rx->rx_wrq);
- if (frc != FSUCCESS) {
- CDEBUG(D_NET, "post failed %d\n", frc);
- rc = -EINVAL;
- }
- CDEBUG(D_NET, "posted rx %p\n", &rx->rx_wrq);
+ CWARN("Unmatched completion type %x cookie "LPX64" from %s\n",
+ txtype, cookie, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ kibnal_close_conn (conn, -EPROTO);
+ return;
}
- if (rc == 0) {
- if (do_credits) {
- spin_lock_irqsave(&conn->ibc_lock, flags);
- conn->ibc_outstanding_credits++;
- spin_unlock_irqrestore(&conn->ibc_lock, flags);
+ if (tx->tx_status == 0) { /* success so far */
+ if (status < 0) { /* failed? */
+ tx->tx_status = status;
+ } else if (txtype == IBNAL_MSG_GET_REQ) {
+ /* XXX layering violation: set REPLY data length */
+ LASSERT (tx->tx_lntmsg[1] != NULL);
+ LASSERT (tx->tx_lntmsg[1]->msg_ev.type ==
+ LNET_EVENT_REPLY);
- kibnal_check_sends(conn);
+ tx->tx_lntmsg[1]->msg_ev.mlength = status;
}
- EXIT;
- return;
}
+
+ tx->tx_waiting = 0;
- if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
- CERROR ("Error posting receive -> %s: %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
- kibnal_close_conn (rx->rx_conn, rc);
- } else {
- CDEBUG (D_NET, "Error posting receive -> %s: %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
- }
+ idle = !tx->tx_queued && (tx->tx_sending == 0);
+ if (idle)
+ list_del(&tx->tx_list);
- /* Drop rx's ref */
- kibnal_put_conn (conn);
- EXIT;
+ spin_unlock(&conn->ibc_lock);
+
+ if (idle)
+ kibnal_tx_done(tx);
}
-#if IBNAL_CKSUM
-static inline __u32 kibnal_cksum (void *ptr, int nob)
+void
+kibnal_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie)
{
- char *c = ptr;
- __u32 sum = 0;
-
- while (nob-- > 0)
- sum = ((sum << 1) | (sum >> 31)) + *c++;
+ kib_tx_t *tx = kibnal_get_idle_tx();
+
+ if (tx == NULL) {
+ CERROR("Can't get tx for completion %x for %s\n",
+ type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ return;
+ }
+
+ tx->tx_msg->ibm_u.completion.ibcm_status = status;
+ tx->tx_msg->ibm_u.completion.ibcm_cookie = cookie;
+ kibnal_init_tx_msg(tx, type, sizeof(kib_completion_msg_t));
- return (sum);
+ kibnal_queue_tx(tx, conn);
}
-#endif
-static void hexdump(char *string, void *ptr, int len)
+void
+kibnal_handle_rx (kib_rx_t *rx)
{
- unsigned char *c = ptr;
- int i;
+ kib_msg_t *msg = rx->rx_msg;
+ kib_conn_t *conn = rx->rx_conn;
+ int credits = msg->ibm_credits;
+ kib_tx_t *tx;
+ int rc = 0;
+ int repost = 1;
+ int rc2;
- return;
+ LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
- if (len < 0 || len > 2048) {
- printk("XXX what the hell? %d\n",len);
- return;
+ CDEBUG (D_NET, "Received %x[%d] from %s\n",
+ msg->ibm_type, credits, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+
+ if (credits != 0) {
+ /* Have I received credits that will let me send? */
+ spin_lock(&conn->ibc_lock);
+ conn->ibc_credits += credits;
+ spin_unlock(&conn->ibc_lock);
+
+ kibnal_check_sends(conn);
}
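+ /* NB: flow control is credit-based: every message carries the buffer
+ * credits accumulated in ibc_outstanding_credits (see
+ * kibnal_post_rx()), and credits arriving here may unblock queued
+ * sends, hence kibnal_check_sends(). */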
- printk("%d bytes of '%s' from 0x%p\n", len, string, ptr);
+ /* clear flag so GET_REQ can see if it caused a REPLY */
+ rx->rx_responded = 0;
- for (i = 0; i < len;) {
- printk("%02x",*(c++));
- i++;
- if (!(i & 15)) {
- printk("\n");
- } else if (!(i&1)) {
- printk(" ");
+ switch (msg->ibm_type) {
+ default:
+ CERROR("Bad IBNAL message type %x from %s\n",
+ msg->ibm_type, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ rc = -EPROTO;
+ break;
+
+ case IBNAL_MSG_NOOP:
+ break;
+
+ case IBNAL_MSG_IMMEDIATE:
+ rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr,
+ msg->ibm_srcnid, rx);
+ repost = rc < 0; /* repost on error */
+ break;
+
+ case IBNAL_MSG_PUT_REQ:
+ rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.putreq.ibprm_hdr,
+ msg->ibm_srcnid, rx);
+ repost = rc < 0; /* repost on error */
+ break;
+
+ case IBNAL_MSG_PUT_NAK:
+ CWARN ("PUT_NACK from %s\n", libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ kibnal_handle_completion(conn, IBNAL_MSG_PUT_REQ,
+ msg->ibm_u.completion.ibcm_status,
+ msg->ibm_u.completion.ibcm_cookie);
+ break;
+
+ case IBNAL_MSG_PUT_ACK:
+ spin_lock(&conn->ibc_lock);
+ tx = kibnal_find_waiting_tx_locked(conn, IBNAL_MSG_PUT_REQ,
+ msg->ibm_u.putack.ibpam_src_cookie);
+ if (tx != NULL)
+ list_del(&tx->tx_list);
+ spin_unlock(&conn->ibc_lock);
+
+ if (tx == NULL) {
+ CERROR("Unmatched PUT_ACK from %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ rc = -EPROTO;
+ break;
}
- }
- if(len & 15) {
- printk("\n");
+ LASSERT (tx->tx_waiting);
+ /* CAVEAT EMPTOR: I could be racing with tx_complete, but...
+ * (a) I can overwrite tx_msg since my peer has received it!
+ * (b) tx_waiting set tells tx_complete() it's not done. */
+
+ tx->tx_nwrq = 0; /* overwrite PUT_REQ */
+
+ rc2 = kibnal_init_rdma(tx, IBNAL_MSG_PUT_DONE,
+ kibnal_rd_size(&msg->ibm_u.putack.ibpam_rd),
+ &msg->ibm_u.putack.ibpam_rd,
+ msg->ibm_u.putack.ibpam_dst_cookie);
+ if (rc2 < 0)
+ CERROR("Can't setup rdma for PUT to %s: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+
+ spin_lock(&conn->ibc_lock);
+ if (tx->tx_status == 0 && rc2 < 0)
+ tx->tx_status = rc2;
+ tx->tx_waiting = 0; /* clear waiting and queue atomically */
+ kibnal_queue_tx_locked(tx, conn);
+ spin_unlock(&conn->ibc_lock);
+ break;
+
+ case IBNAL_MSG_PUT_DONE:
+ kibnal_handle_completion(conn, IBNAL_MSG_PUT_ACK,
+ msg->ibm_u.completion.ibcm_status,
+ msg->ibm_u.completion.ibcm_cookie);
+ break;
+
+ case IBNAL_MSG_GET_REQ:
+ rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.get.ibgm_hdr,
+ msg->ibm_srcnid, rx);
+ repost = rc < 0; /* repost on error */
+ break;
+
+ case IBNAL_MSG_GET_DONE:
+ kibnal_handle_completion(conn, IBNAL_MSG_GET_REQ,
+ msg->ibm_u.completion.ibcm_status,
+ msg->ibm_u.completion.ibcm_cookie);
+ break;
}
+
+ if (rc < 0) /* protocol error */
+ kibnal_close_conn(conn, rc);
+
+ if (repost)
+ kibnal_post_rx(rx, 1);
}
-static void
-kibnal_rx_callback (IB_WORK_COMPLETION *wc)
+void
+kibnal_rx_complete (IB_WORK_COMPLETION *wc, __u64 rxseq)
{
kib_rx_t *rx = (kib_rx_t *)kibnal_wreqid2ptr(wc->WorkReqId);
+ int nob = wc->Length;
kib_msg_t *msg = rx->rx_msg;
kib_conn_t *conn = rx->rx_conn;
- int nob = wc->Length;
- const int base_nob = offsetof(kib_msg_t, ibm_u);
- int credits;
- int flipped;
unsigned long flags;
- __u32 i;
-#if IBNAL_CKSUM
- __u32 msg_cksum;
- __u32 computed_cksum;
-#endif
-
- /* we set the QP to erroring after we've finished disconnecting,
- * maybe we should do so sooner. */
- KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_ESTABLISHED,
- IBNAL_CONN_DISCONNECTED);
+ int rc;
CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
LASSERT (rx->rx_nob < 0); /* was posted */
/* receives complete with error in any case after we've started
* disconnecting */
if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
- goto failed;
+ goto ignore;
if (wc->Status != WRStatusSuccess) {
CERROR("Rx from %s failed: %d\n",
goto failed;
}
- if (nob < base_nob) {
- CERROR ("Short rx from %s: %d < expected %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- nob, base_nob);
+ rc = kibnal_unpack_msg(msg, nob);
+ if (rc != 0) {
+ CERROR ("Error %d unpacking rx from %s\n",
+ rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
goto failed;
}
- rx->rx_nob = nob;
+ rx->rx_nob = nob; /* Now I know nob > 0 */
mb();
- hexdump("rx", rx->rx_msg, sizeof(kib_msg_t));
-
- /* Receiver does any byte flipping if necessary... */
-
- if (msg->ibm_magic == IBNAL_MSG_MAGIC) {
- flipped = 0;
- } else {
- if (msg->ibm_magic != __swab32(IBNAL_MSG_MAGIC)) {
- CERROR ("Unrecognised magic: %08x from %s\n",
- msg->ibm_magic,
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- goto failed;
- }
- flipped = 1;
- __swab16s (&msg->ibm_version);
- LASSERT (sizeof(msg->ibm_type) == 1);
- LASSERT (sizeof(msg->ibm_credits) == 1);
- }
-
- if (msg->ibm_version != IBNAL_MSG_VERSION) {
- CERROR ("Incompatible msg version %d (%d expected)\n",
- msg->ibm_version, IBNAL_MSG_VERSION);
+ if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
+ msg->ibm_dstnid != kibnal_data.kib_ni->ni_nid ||
+ msg->ibm_srcstamp != conn->ibc_incarnation ||
+ msg->ibm_dststamp != kibnal_data.kib_incarnation) {
+ CERROR ("Stale rx from %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
goto failed;
}
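+ /* NB: the incarnation stamps identify a particular boot of each
+ * node, so this also rejects traffic from (or addressed to) an
+ * earlier instance after a restart. */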
-#if IBNAL_CKSUM
- if (nob != msg->ibm_nob) {
- CERROR ("Unexpected # bytes %d (%d expected)\n", nob, msg->ibm_nob);
+ if (msg->ibm_seq != rxseq) {
+ CERROR ("Out-of-sequence rx from %s"
+ ": got "LPD64" but expected "LPD64"\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ msg->ibm_seq, rxseq);
goto failed;
}
- msg_cksum = le32_to_cpu(msg->ibm_cksum);
- msg->ibm_cksum = 0;
- computed_cksum = kibnal_cksum (msg, nob);
-
- if (msg_cksum != computed_cksum) {
- CERROR ("Checksum failure %d: (%d expected)\n",
- computed_cksum, msg_cksum);
-// goto failed;
- }
- CDEBUG(D_NET, "cksum %x, nob %d\n", computed_cksum, nob);
-#endif
-
- /* Have I received credits that will let me send? */
- credits = msg->ibm_credits;
- if (credits != 0) {
- spin_lock_irqsave(&conn->ibc_lock, flags);
- conn->ibc_credits += credits;
- spin_unlock_irqrestore(&conn->ibc_lock, flags);
-
- kibnal_check_sends(conn);
- }
-
- switch (msg->ibm_type) {
- case IBNAL_MSG_NOOP:
- kibnal_post_rx (rx, 1);
- return;
+ /* racing with connection establishment/teardown! */
- case IBNAL_MSG_IMMEDIATE:
- if (nob < base_nob + sizeof (kib_immediate_msg_t)) {
- CERROR ("Short IMMEDIATE from %s: %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- nob);
- goto failed;
- }
- break;
-
- case IBNAL_MSG_PUT_RDMA:
- case IBNAL_MSG_GET_RDMA:
- if (nob < base_nob + sizeof (kib_rdma_msg_t)) {
- CERROR ("Short RDMA msg from %s: %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), nob);
- goto failed;
- }
- if (flipped)
- __swab32(msg->ibm_u.rdma.ibrm_num_descs);
-
- CDEBUG(D_NET, "%d RDMA: cookie "LPX64":\n",
- msg->ibm_type, msg->ibm_u.rdma.ibrm_cookie);
-
- if ((msg->ibm_u.rdma.ibrm_num_descs > LNET_MAX_IOV) ||
- (kib_rdma_msg_len(msg->ibm_u.rdma.ibrm_num_descs) >
- min(nob, IBNAL_MSG_SIZE))) {
- CERROR ("num_descs %d too large\n",
- msg->ibm_u.rdma.ibrm_num_descs);
- goto failed;
- }
-
- if (flipped) {
- __swab32(msg->ibm_u.rdma.rd_key);
- }
-
- for(i = 0; i < msg->ibm_u.rdma.ibrm_num_descs; i++) {
- kib_rdma_desc_t *desc = &msg->ibm_u.rdma.ibrm_desc[i];
-
- if (flipped) {
- __swab32(desc->rd_nob);
- __swab64(desc->rd_addr);
- }
-
- CDEBUG(D_NET, " key %x, " "addr "LPX64", nob %u\n",
- msg->ibm_u.rdma.rd_key, desc->rd_addr, desc->rd_nob);
- }
- break;
-
- case IBNAL_MSG_PUT_DONE:
- case IBNAL_MSG_GET_DONE:
- if (nob < base_nob + sizeof (kib_completion_msg_t)) {
- CERROR ("Short COMPLETION msg from %s: %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), nob);
- goto failed;
+ if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+ /* must check holding global lock to eliminate race */
+ if (conn->ibc_state < IBNAL_CONN_ESTABLISHED) {
+ list_add_tail(&rx->rx_list, &conn->ibc_early_rxs);
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock,
+ flags);
+ return;
}
- if (flipped)
- __swab32s(&msg->ibm_u.completion.ibcm_status);
-
- CDEBUG(D_NET, "%d DONE: cookie "LPX64", status %d\n",
- msg->ibm_type, msg->ibm_u.completion.ibcm_cookie,
- msg->ibm_u.completion.ibcm_status);
-
- kibnal_complete_passive_rdma (conn,
- msg->ibm_u.completion.ibcm_cookie,
- msg->ibm_u.completion.ibcm_status);
- kibnal_post_rx (rx, 1);
- return;
-
- default:
- CERROR ("Can't parse type from %s: %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid), msg->ibm_type);
- goto failed;
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock,
+ flags);
}
-
- /* schedule for kibnal_rx() in thread context */
- spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
-
- list_add_tail (&rx->rx_list, &kibnal_data.kib_sched_rxq);
- wake_up (&kibnal_data.kib_sched_waitq);
-
- spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+ kibnal_handle_rx(rx);
return;
failed:
CDEBUG(D_NET, "rx %p conn %p\n", rx, conn);
- kibnal_close_conn(conn, -ECONNABORTED);
-
+ kibnal_close_conn(conn, -EIO);
+ ignore:
/* Don't re-post rx & drop its ref on conn */
- kibnal_put_conn(conn);
-}
-
-void
-kibnal_rx (kib_rx_t *rx)
-{
- int rc = 0;
- kib_msg_t *msg = rx->rx_msg;
-
- /* Clear flag so I can detect if I've sent an RDMA completion */
- rx->rx_rdma = 0;
-
- switch (msg->ibm_type) {
- case IBNAL_MSG_GET_RDMA:
- rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.rdma.ibrm_hdr,
- rx->rx_conn->ibc_peer->ibp_nid, rx);
- break;
-
- case IBNAL_MSG_PUT_RDMA:
- rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.rdma.ibrm_hdr,
- rx->rx_conn->ibc_peer->ibp_nid, rx);
- break;
-
- case IBNAL_MSG_IMMEDIATE:
- rc = lnet_parse(kibnal_data.kib_ni, &msg->ibm_u.immediate.ibim_hdr,
- rx->rx_conn->ibc_peer->ibp_nid, rx);
- break;
-
- default:
- LBUG();
- break;
- }
-
- if (rc < 0) {
- kibnal_close_conn(rx->rx_conn, rc);
- kibnal_post_rx (rx, 1);
- }
+ kibnal_conn_decref(conn);
}
-static struct page *
+struct page *
kibnal_kvaddr_to_page (unsigned long vaddr)
{
struct page *page;
if (vaddr >= VMALLOC_START &&
- vaddr < VMALLOC_END)
+ vaddr < VMALLOC_END) {
page = vmalloc_to_page ((void *)vaddr);
+ LASSERT (page != NULL);
+ return page;
+ }
#if CONFIG_HIGHMEM
- else if (vaddr >= PKMAP_BASE &&
- vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE))
- page = vmalloc_to_page ((void *)vaddr);
- /* in 2.4 ^ just walks the page tables */
+ if (vaddr >= PKMAP_BASE &&
+ vaddr < (PKMAP_BASE + LAST_PKMAP * PAGE_SIZE)) {
+ /* No highmem addresses expected here; highmem pages are only
+ * used for bulk (kiov) I/O */
+ CERROR("Can't find page for highmem address 0x%lx\n", vaddr);
+ LBUG();
+ }
#endif
- else
- page = virt_to_page (vaddr);
-
- if (!VALID_PAGE (page))
- page = NULL;
-
+ page = virt_to_page (vaddr);
+ LASSERT (page != NULL);
return page;
}
-static void
-kibnal_fill_ibrm(kib_tx_t *tx, struct page *page, unsigned long page_offset,
- unsigned long len, int active)
+#if !IBNAL_USE_FMR
+int
+kibnal_append_rdfrag(kib_rdma_desc_t *rd, int active, struct page *page,
+ unsigned long page_offset, unsigned long len)
{
- kib_rdma_msg_t *ibrm = &tx->tx_msg->ibm_u.rdma;
- kib_rdma_desc_t *desc;
+ kib_rdma_frag_t *frag = &rd->rd_frags[rd->rd_nfrag];
- LASSERTF(ibrm->ibrm_num_descs < LNET_MAX_IOV, "%u\n",
- ibrm->ibrm_num_descs);
+ if (rd->rd_nfrag >= IBNAL_MAX_RDMA_FRAGS) {
+ CERROR ("Too many RDMA fragments\n");
+ return -EMSGSIZE;
+ }
- desc = &ibrm->ibrm_desc[ibrm->ibrm_num_descs];
- if (active)
- ibrm->rd_key = kibnal_data.kib_md.md_lkey;
- else
- ibrm->rd_key = kibnal_data.kib_md.md_rkey;
- desc->rd_nob = len; /*PAGE_SIZE - kiov->kiov_offset; */
- desc->rd_addr = kibnal_page2phys(page) + page_offset +
- kibnal_data.kib_md.md_addr;
+ if (active) {
+ if (rd->rd_nfrag == 0)
+ rd->rd_key = kibnal_data.kib_whole_mem.md_lkey;
+ } else {
+ if (rd->rd_nfrag == 0)
+ rd->rd_key = kibnal_data.kib_whole_mem.md_rkey;
+ }
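+ /* NB: the active (RDMA-initiating) side needs its local key, while
+ * the passive side is describing this buffer for the peer to RDMA
+ * into or out of, so it advertises the remote key instead. */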
+
+ frag->rf_nob = len;
+ frag->rf_addr = kibnal_data.kib_whole_mem.md_addr +
+ kibnal_page2phys(page) + page_offset;
- ibrm->ibrm_num_descs++;
+ CDEBUG(D_NET,"map key %x frag [%d]["LPX64" for %d]\n",
+ rd->rd_key, rd->rd_nfrag, frag->rf_addr, frag->rf_nob);
+
+ rd->rd_nfrag++;
+ return 0;
}
-static int
-kibnal_map_rdma_iov(kib_tx_t *tx, unsigned long vaddr, int nob, int active)
+int
+kibnal_setup_rd_iov(kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
+ unsigned int niov, struct iovec *iov, int offset, int nob)
+
{
- struct page *page;
- int page_offset, len;
+ int fragnob;
+ int rc;
+ unsigned long vaddr;
+ struct page *page;
+ int page_offset;
+
+ LASSERT (nob > 0);
+ LASSERT (niov > 0);
+ LASSERT ((rd != tx->tx_rd) == !active);
+
+ while (offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ niov--;
+ iov++;
+ LASSERT (niov > 0);
+ }
- while (nob > 0) {
+ rd->rd_nfrag = 0;
+ do {
+ LASSERT (niov > 0);
+
+ vaddr = ((unsigned long)iov->iov_base) + offset;
+ page_offset = vaddr & (PAGE_SIZE - 1);
page = kibnal_kvaddr_to_page(vaddr);
- if (page == NULL)
+ if (page == NULL) {
+ CERROR ("Can't find page\n");
return -EFAULT;
+ }
- page_offset = vaddr & (PAGE_SIZE - 1);
- len = min(nob, (int)PAGE_SIZE - page_offset);
-
- kibnal_fill_ibrm(tx, page, page_offset, len, active);
- nob -= len;
- vaddr += len;
+ fragnob = min((int)(iov->iov_len - offset), nob);
+ fragnob = min(fragnob, (int)PAGE_SIZE - page_offset);
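+ /* fragment can extend neither past the current iov entry nor
+ * past the end of the page containing it */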
+
+ rc = kibnal_append_rdfrag(rd, active, page,
+ page_offset, fragnob);
+ if (rc != 0)
+ return rc;
+
+ if (offset + fragnob < iov->iov_len) {
+ offset += fragnob;
+ } else {
+ offset = 0;
+ iov++;
+ niov--;
+ }
+ nob -= fragnob;
+ } while (nob > 0);
+
+ return 0;
+}
+
+int
+kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
+ int nkiov, lnet_kiov_t *kiov, int offset, int nob)
+{
+ int fragnob;
+ int rc;
+
+ CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
+
+ LASSERT (nob > 0);
+ LASSERT (nkiov > 0);
+ LASSERT ((rd != tx->tx_rd) == !active);
+
+ while (offset >= kiov->kiov_len) {
+ offset -= kiov->kiov_len;
+ nkiov--;
+ kiov++;
+ LASSERT (nkiov > 0);
}
+
+ rd->rd_nfrag = 0;
+ do {
+ LASSERT (nkiov > 0);
+ fragnob = min((int)(kiov->kiov_len - offset), nob);
+
+ rc = kibnal_append_rdfrag(rd, active, kiov->kiov_page,
+ kiov->kiov_offset + offset,
+ fragnob);
+ if (rc != 0)
+ return rc;
+
+ offset = 0;
+ kiov++;
+ nkiov--;
+ nob -= fragnob;
+ } while (nob > 0);
+
return 0;
}
+#else
+int
+kibnal_map_tx (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
+ int npages, unsigned long page_offset, int nob)
+{
+ IB_ACCESS_CONTROL access = {0,};
+ FSTATUS frc;
-static int
-kibnal_map_iov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
- unsigned int niov, struct iovec *iov, int offset, int nob, int active)
+ LASSERT ((rd != tx->tx_rd) == !active);
+ LASSERT (!tx->tx_md.md_active);
+ LASSERT (tx->tx_md.md_fmrcount > 0);
+ LASSERT (page_offset < PAGE_SIZE);
+ LASSERT (npages >= (1 + ((page_offset + nob - 1)>>PAGE_SHIFT)));
+ LASSERT (npages <= LNET_MAX_IOV);
+
+ if (!active) {
+ // access.s.MWBindable = 1;
+ access.s.LocalWrite = 1;
+ access.s.RdmaWrite = 1;
+ }
+
+ /* Map the memory described by tx->tx_pages
+ frc = iibt_register_physical_memory(kibnal_data.kib_hca,
+ IBNAL_RDMA_BASE,
+ tx->tx_pages, npages,
+ page_offset,
+ kibnal_data.kib_pd,
+ access,
+ &tx->tx_md.md_handle,
+ &tx->tx_md.md_addr,
+ &tx->tx_md.md_lkey,
+ &tx->tx_md.md_rkey);
+ */
+ return -EINVAL; /* FMR mapping not implemented; registration call above left disabled */
+}
+
+int
+kibnal_setup_rd_iov (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
+ unsigned int niov, struct iovec *iov, int offset, int nob)
{
- void *vaddr;
- FSTATUS frc;
+ int resid;
+ int fragnob;
+ struct page *page;
+ int npages;
+ unsigned long page_offset;
+ unsigned long vaddr;
LASSERT (nob > 0);
LASSERT (niov > 0);
- LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
while (offset >= iov->iov_len) {
offset -= iov->iov_len;
return (-EMSGSIZE);
}
- /* our large contiguous iov could be backed by multiple physical
- * pages. */
- if (kibnal_whole_mem()) {
- int rc;
- tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
- rc = kibnal_map_rdma_iov(tx, (unsigned long)iov->iov_base +
- offset, nob, active);
- if (rc != 0) {
- CERROR ("Can't map iov: %d\n", rc);
- return rc;
+ vaddr = ((unsigned long)iov->iov_base) + offset;
+
+ page_offset = vaddr & (PAGE_SIZE - 1);
+ resid = nob;
+ npages = 0;
+
+ do {
+ LASSERT (npages < LNET_MAX_IOV);
+
+ page = kibnal_kvaddr_to_page(vaddr);
+ if (page == NULL) {
+ CERROR("Can't find page for %lu\n", vaddr);
+ return -EFAULT;
}
- return 0;
- }
- vaddr = (void *)(((unsigned long)iov->iov_base) + offset);
- tx->tx_md.md_addr = (__u64)((unsigned long)vaddr);
+ tx->tx_pages[npages++] = kibnal_page2phys(page);
- frc = iibt_register_memory(kibnal_data.kib_hca, vaddr, nob,
- kibnal_data.kib_pd, access,
- &tx->tx_md.md_handle, &tx->tx_md.md_lkey,
- &tx->tx_md.md_rkey);
- if (frc != 0) {
- CERROR ("Can't map vaddr %p: %d\n", vaddr, frc);
- return -EINVAL;
- }
+ fragnob = PAGE_SIZE - (vaddr & (PAGE_SIZE - 1));
+ vaddr += fragnob;
+ resid -= fragnob;
- tx->tx_mapped = KIB_TX_MAPPED;
- return (0);
+ } while (resid > 0);
+
+ return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
}
-static int
-kibnal_map_kiov (kib_tx_t *tx, IB_ACCESS_CONTROL access,
- int nkiov, lnet_kiov_t *kiov,
- int offset, int nob, int active)
+int
+kibnal_setup_rd_kiov (kib_tx_t *tx, kib_rdma_desc_t *rd, int active,
+ int nkiov, lnet_kiov_t *kiov, int offset, int nob)
{
- __u64 *phys = NULL;
- int page_offset;
- int nphys;
- int resid;
- int phys_size = 0;
- FSTATUS frc;
- int i, rc = 0;
-
+ int resid;
+ int npages;
+ unsigned long page_offset;
+
CDEBUG(D_NET, "niov %d offset %d nob %d\n", nkiov, offset, nob);
LASSERT (nob > 0);
LASSERT (nkiov > 0);
- LASSERT (tx->tx_mapped == KIB_TX_UNMAPPED);
+ LASSERT (nkiov <= LNET_MAX_IOV);
+ LASSERT (!tx->tx_md.md_active);
+ LASSERT ((rd != tx->tx_rd) == !active);
while (offset >= kiov->kiov_len) {
offset -= kiov->kiov_len;
}
page_offset = kiov->kiov_offset + offset;
- nphys = 1;
-
- if (!kibnal_whole_mem()) {
- phys_size = nkiov * sizeof (*phys);
- LIBCFS_ALLOC(phys, phys_size);
- if (phys == NULL) {
- CERROR ("Can't allocate tmp phys\n");
- return (-ENOMEM);
- }
-
- phys[0] = kibnal_page2phys(kiov->kiov_page);
- } else {
- tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 0;
- kibnal_fill_ibrm(tx, kiov->kiov_page, kiov->kiov_offset,
- kiov->kiov_len, active);
- }
-
- resid = nob - (kiov->kiov_len - offset);
+
+ resid = offset + nob;
+ npages = 0;
- while (resid > 0) {
- kiov++;
- nkiov--;
+ do {
+ LASSERT (npages < LNET_MAX_IOV);
LASSERT (nkiov > 0);
- if (kiov->kiov_offset != 0 ||
- ((resid > PAGE_SIZE) &&
- kiov->kiov_len < PAGE_SIZE)) {
+ if ((npages > 0 && kiov->kiov_offset != 0) ||
+ (resid > kiov->kiov_len &&
+ (kiov->kiov_offset + kiov->kiov_len) != PAGE_SIZE)) {
/* Can't have gaps */
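+ /* i.e. all fragments but the first must start on a page
+ * boundary and all but the last must end on one, so the
+ * region maps to one contiguous run of whole pages */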
CERROR ("Can't make payload contiguous in I/O VM:"
- "page %d, offset %d, len %d \n", nphys,
- kiov->kiov_offset, kiov->kiov_len);
-
- for (i = -nphys; i < nkiov; i++)
- {
- CERROR("kiov[%d] %p +%d for %d\n",
- i, kiov[i].kiov_page, kiov[i].kiov_offset, kiov[i].kiov_len);
- }
+ "page %d, offset %d, len %d \n",
+ npages, kiov->kiov_offset, kiov->kiov_len);
- rc = -EINVAL;
- goto out;
+ return -EINVAL;
}
- if (nphys == LNET_MAX_IOV) {
- CERROR ("payload too big (%d)\n", nphys);
- rc = -EMSGSIZE;
- goto out;
- }
-
- if (!kibnal_whole_mem()) {
- LASSERT (nphys * sizeof (*phys) < phys_size);
- phys[nphys] = kibnal_page2phys(kiov->kiov_page);
- } else {
- if (kib_rdma_msg_len(nphys) > IBNAL_MSG_SIZE) {
- CERROR ("payload too big (%d)\n", nphys);
- rc = -EMSGSIZE;
- goto out;
- }
- kibnal_fill_ibrm(tx, kiov->kiov_page,
- kiov->kiov_offset, kiov->kiov_len,
- active);
- }
-
- nphys ++;
- resid -= PAGE_SIZE;
- }
-
- if (kibnal_whole_mem())
- goto out;
-
-#if 0
- CWARN ("nphys %d, nob %d, page_offset %d\n", nphys, nob, page_offset);
- for (i = 0; i < nphys; i++)
- CWARN (" [%d] "LPX64"\n", i, phys[i]);
-#endif
-
-#if IBNAL_FMR
-#error "iibnal hasn't learned about FMR yet"
- rc = ib_fmr_register_physical (kibnal_data.kib_fmr_pool,
- phys, nphys,
- &tx->tx_md.md_addr,
- page_offset,
- &tx->tx_md.md_handle.fmr,
- &tx->tx_md.md_lkey,
- &tx->tx_md.md_rkey);
-#else
- frc = iibt_register_physical_memory(kibnal_data.kib_hca,
- IBNAL_RDMA_BASE,
- phys, nphys,
- 0, /* offset */
- kibnal_data.kib_pd,
- access,
- &tx->tx_md.md_handle,
- &tx->tx_md.md_addr,
- &tx->tx_md.md_lkey,
- &tx->tx_md.md_rkey);
-#endif
- if (frc == FSUCCESS) {
- CDEBUG(D_NET, "Mapped %d pages %d bytes @ offset %d: lkey %x, rkey %x\n",
- nphys, nob, page_offset, tx->tx_md.md_lkey, tx->tx_md.md_rkey);
-#if IBNAL_FMR
- tx->tx_mapped = KIB_TX_MAPPED_FMR;
-#else
- tx->tx_mapped = KIB_TX_MAPPED;
-#endif
- } else {
- CERROR ("Can't map phys: %d\n", frc);
- rc = -EFAULT;
- }
+ tx->tx_pages[npages++] = kibnal_page2phys(kiov->kiov_page);
+ resid -= kiov->kiov_len;
+ kiov++;
+ nkiov--;
+ } while (resid > 0);
- out:
- if (phys != NULL)
- LIBCFS_FREE(phys, phys_size);
- return (rc);
+ return kibnal_map_tx(tx, rd, active, npages, page_offset, nob);
}
+#endif
-static kib_conn_t *
+kib_conn_t *
kibnal_find_conn_locked (kib_peer_t *peer)
{
struct list_head *tmp;
void
kibnal_check_sends (kib_conn_t *conn)
{
- unsigned long flags;
kib_tx_t *tx;
+ FSTATUS frc;
int rc;
- int i;
int done;
+ int i;
int nwork;
- ENTRY;
- spin_lock_irqsave (&conn->ibc_lock, flags);
+ LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
+
+ spin_lock(&conn->ibc_lock);
LASSERT (conn->ibc_nsends_posted <= IBNAL_MSG_QUEUE_SIZE);
if (list_empty(&conn->ibc_tx_queue) &&
conn->ibc_outstanding_credits >= IBNAL_CREDIT_HIGHWATER) {
- spin_unlock_irqrestore(&conn->ibc_lock, flags);
+ spin_unlock(&conn->ibc_lock);
tx = kibnal_get_idle_tx();
if (tx != NULL)
kibnal_init_tx_msg(tx, IBNAL_MSG_NOOP, 0);
- spin_lock_irqsave(&conn->ibc_lock, flags);
+ spin_lock(&conn->ibc_lock);
- if (tx != NULL) {
- atomic_inc(&conn->ibc_refcount);
+ if (tx != NULL)
kibnal_queue_tx_locked(tx, conn);
- }
}
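+ /* NB: the NOOP above exists purely to return accumulated credits
+ * once they pass the high-water mark and there is no other traffic
+ * to piggyback them on. */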
while (!list_empty (&conn->ibc_tx_queue)) {
tx = list_entry (conn->ibc_tx_queue.next, kib_tx_t, tx_list);
+ LASSERT (tx->tx_queued);
/* We rely on this for QP sizing */
- LASSERT (tx->tx_nsp > 0 && tx->tx_nsp <= IBNAL_TX_MAX_SG);
+ LASSERT (tx->tx_nwrq > 0 && tx->tx_nwrq <= 1 + IBNAL_MAX_RDMA_FRAGS);
LASSERT (conn->ibc_outstanding_credits >= 0);
LASSERT (conn->ibc_outstanding_credits <= IBNAL_MSG_QUEUE_SIZE);
LASSERT (conn->ibc_credits >= 0);
LASSERT (conn->ibc_credits <= IBNAL_MSG_QUEUE_SIZE);
- /* Not on ibc_rdma_queue */
- LASSERT (!tx->tx_passive_rdma_wait);
-
- if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE)
- GOTO(out, 0);
-
- if (conn->ibc_credits == 0) /* no credits */
- GOTO(out, 1);
+ if (conn->ibc_nsends_posted == IBNAL_MSG_QUEUE_SIZE) {
+ CDEBUG(D_NET, "%s: posted enough\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ break;
+ }
+
+ if (conn->ibc_credits == 0) { /* no credits */
+ CDEBUG(D_NET, "%s: no credits\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ break;
+ }
if (conn->ibc_credits == 1 && /* last credit reserved for */
- conn->ibc_outstanding_credits == 0) /* giving back credits */
- GOTO(out, 2);
-
+ conn->ibc_outstanding_credits == 0) { /* giving back credits */
+ CDEBUG(D_NET, "%s: not using last credit\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ break;
+ }
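+ /* NB: spending the last credit on a message that returns none could
+ * leave both peers with zero credits and no way to replenish, so it
+ * is reserved for a credit-returning message. */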
+
list_del (&tx->tx_list);
+ tx->tx_queued = 0;
+
+ /* NB don't drop ibc_lock before bumping tx_sending */
if (tx->tx_msg->ibm_type == IBNAL_MSG_NOOP &&
(!list_empty(&conn->ibc_tx_queue) ||
conn->ibc_outstanding_credits < IBNAL_CREDIT_HIGHWATER)) {
/* redundant NOOP */
- spin_unlock_irqrestore(&conn->ibc_lock, flags);
+ spin_unlock(&conn->ibc_lock);
kibnal_tx_done(tx);
- spin_lock_irqsave(&conn->ibc_lock, flags);
+ spin_lock(&conn->ibc_lock);
+ CDEBUG(D_NET, "%s: redundant noop\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
continue;
}
- tx->tx_msg->ibm_credits = conn->ibc_outstanding_credits;
- conn->ibc_outstanding_credits = 0;
+ kibnal_pack_msg(tx->tx_msg, conn->ibc_outstanding_credits,
+ conn->ibc_peer->ibp_nid, conn->ibc_incarnation,
+ conn->ibc_txseq);
+ conn->ibc_txseq++;
+ conn->ibc_outstanding_credits = 0;
conn->ibc_nsends_posted++;
conn->ibc_credits--;
- /* we only get a tx completion for the final rdma op */
- tx->tx_sending = min(tx->tx_nsp, 2);
- tx->tx_passive_rdma_wait = tx->tx_passive_rdma;
+ /* CAVEAT EMPTOR! This tx could be the PUT_DONE of an RDMA
+ * PUT. If so, it was first queued here as a PUT_REQ, sent and
+ * stashed on ibc_active_txs, matched by an incoming PUT_ACK,
+ * and then re-queued here. It's (just) possible that
+ * tx_sending is non-zero if we've not done the tx_complete() from
+ * the first send; hence the ++ rather than = below. */
+ tx->tx_sending++;
+
list_add (&tx->tx_list, &conn->ibc_active_txs);
-#if IBNAL_CKSUM
- tx->tx_msg->ibm_cksum = 0;
- tx->tx_msg->ibm_cksum = kibnal_cksum(tx->tx_msg, tx->tx_msg->ibm_nob);
- CDEBUG(D_NET, "cksum %x, nob %d\n", tx->tx_msg->ibm_cksum, tx->tx_msg->ibm_nob);
-#endif
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
- /* NB the gap between removing tx from the queue and sending it
- * allows message re-ordering to occur */
+ /* Drop the lock while I send (this can re-order sends) */
+ spin_unlock(&conn->ibc_lock);
- LASSERT (tx->tx_nsp > 0);
+ LASSERT (tx->tx_nwrq > 0);
rc = -ECONNABORTED;
+ frc = FSUCCESS;
nwork = 0;
if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
- tx->tx_status = 0;
/* Driver only accepts 1 item at a time */
- for (i = 0; i < tx->tx_nsp; i++) {
- hexdump("tx", tx->tx_msg, sizeof(kib_msg_t));
- rc = iibt_postsend(conn->ibc_qp,
- &tx->tx_wrq[i]);
- if (rc != 0)
+ for (i = 0; i < tx->tx_nwrq; i++) {
+ frc = iibt_postsend(conn->ibc_qp,
+ &tx->tx_wrq[i]);
+ if (frc != FSUCCESS) {
+ rc = -EIO;
break;
- if (wrq_signals_completion(&tx->tx_wrq[i]))
- nwork++;
+ }
CDEBUG(D_NET, "posted tx wrq %p\n",
&tx->tx_wrq[i]);
}
}
- spin_lock_irqsave (&conn->ibc_lock, flags);
+ spin_lock(&conn->ibc_lock);
if (rc != 0) {
/* NB credits are transferred in the actual
* message, which can only be the last work item */
conn->ibc_nsends_posted--;
tx->tx_status = rc;
- tx->tx_passive_rdma_wait = 0;
- tx->tx_sending -= tx->tx_nsp - nwork;
-
+ tx->tx_waiting = 0;
+ tx->tx_sending--;
+
done = (tx->tx_sending == 0);
if (done)
list_del (&tx->tx_list);
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
+ spin_unlock(&conn->ibc_lock);
if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
- CERROR ("Error %d posting transmit to %s\n", rc,
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ CERROR ("Error %d posting transmit to %s\n",
+ frc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
else
CDEBUG (D_NET, "Error %d posting transmit to %s\n",
rc, libcfs_nid2str(conn->ibc_peer->ibp_nid));
kibnal_tx_done (tx);
return;
}
-
}
- EXIT;
-out:
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
+ spin_unlock(&conn->ibc_lock);
}
-static void
-kibnal_tx_callback (IB_WORK_COMPLETION *wc)
+void
+kibnal_tx_complete (IB_WORK_COMPLETION *wc)
{
kib_tx_t *tx = (kib_tx_t *)kibnal_wreqid2ptr(wc->WorkReqId);
- kib_conn_t *conn;
- unsigned long flags;
+ kib_conn_t *conn = tx->tx_conn;
+ int failed = wc->Status != WRStatusSuccess;
int idle;
- conn = tx->tx_conn;
- LASSERT (conn != NULL);
- LASSERT (tx->tx_sending != 0);
+ CDEBUG(D_NET, "tx %p conn %p sending %d nwrq %d status %d\n",
+ tx, conn, tx->tx_sending, tx->tx_nwrq, wc->Status);
+
+ LASSERT (tx->tx_sending > 0);
- spin_lock_irqsave(&conn->ibc_lock, flags);
+ if (failed &&
+ tx->tx_status == 0 &&
+ conn->ibc_state == IBNAL_CONN_ESTABLISHED)
+ CERROR("tx -> %s type %x cookie "LPX64
+ "sending %d waiting %d: failed %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ tx->tx_msg->ibm_type, tx->tx_cookie,
+ tx->tx_sending, tx->tx_waiting, wc->Status);
- CDEBUG(D_NET, "conn %p tx %p [%d/%d]: %d\n", conn, tx,
- tx->tx_sending, tx->tx_nsp, wc->Status);
+ spin_lock(&conn->ibc_lock);
/* I could be racing with rdma completion. Whoever makes 'tx' idle
- * gets to free it, which also drops its ref on 'conn'. If it's
- * not me, then I take an extra ref on conn so it can't disappear
- * under me. */
+ * gets to free it, which also drops its ref on 'conn'. */
tx->tx_sending--;
+ conn->ibc_nsends_posted--;
+
+ if (failed) {
+ tx->tx_waiting = 0;
+ tx->tx_status = -EIO;
+ }
+
idle = (tx->tx_sending == 0) && /* This is the final callback */
- (!tx->tx_passive_rdma_wait); /* Not waiting for RDMA completion */
+ !tx->tx_waiting && /* Not waiting for peer */
+ !tx->tx_queued; /* Not re-queued (PUT_DONE) */
if (idle)
list_del(&tx->tx_list);
- CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
- conn, conn->ibc_state,
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- atomic_read (&conn->ibc_refcount));
- atomic_inc (&conn->ibc_refcount);
-
- if (tx->tx_sending == 0)
- conn->ibc_nsends_posted--;
-
- if (wc->Status != WRStatusSuccess &&
- tx->tx_status == 0)
- tx->tx_status = -ECONNABORTED;
-
- spin_unlock_irqrestore(&conn->ibc_lock, flags);
-
- if (idle)
- kibnal_tx_done (tx);
+ kibnal_conn_addref(conn); /* 1 ref for me.... */
- if (wc->Status != WRStatusSuccess) {
- CERROR ("Tx completion to %s failed: %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- wc->Status);
- kibnal_close_conn (conn, -ENETDOWN);
- } else {
- /* can I shovel some more sends out the door? */
- kibnal_check_sends(conn);
- }
+ spin_unlock(&conn->ibc_lock);
- kibnal_put_conn (conn);
-}
-
-void
-kibnal_ca_async_callback (void *ca_arg, IB_EVENT_RECORD *ev)
-{
- /* XXX flesh out. this seems largely for async errors */
- CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode);
-}
-
-void
-kibnal_ca_callback (void *ca_arg, void *cq_arg)
-{
- IB_HANDLE cq = *(IB_HANDLE *)cq_arg;
- IB_HANDLE ca = *(IB_HANDLE *)ca_arg;
- IB_WORK_COMPLETION wc;
- int armed = 0;
-
- CDEBUG(D_NET, "ca %p cq %p\n", ca, cq);
-
- for(;;) {
- while (iibt_cq_poll(cq, &wc) == FSUCCESS) {
-
- /* We will need to rearm the CQ to avoid a potential race. */
- armed = 0;
-
- if (kibnal_wreqid_is_rx(wc.WorkReqId))
- kibnal_rx_callback(&wc);
- else
- kibnal_tx_callback(&wc);
- }
- if (armed)
- return;
- if (iibt_cq_rearm(cq, CQEventSelNextWC) != FSUCCESS) {
- CERROR("rearm failed?\n");
- return;
- }
- armed = 1;
- }
+ if (idle)
+ kibnal_tx_done (tx);
+
+ if (failed)
+ kibnal_close_conn (conn, -EIO);
+ else
+ kibnal_check_sends(conn);
+
+ kibnal_conn_decref(conn); /* ...until here */
}
void
kibnal_init_tx_msg (kib_tx_t *tx, int type, int body_nob)
{
- IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nsp];
- IB_WORK_REQ *wrq = &tx->tx_wrq[tx->tx_nsp];
- int fence;
- int nob = offsetof (kib_msg_t, ibm_u) + body_nob;
+ IB_LOCAL_DATASEGMENT *gl = &tx->tx_gl[tx->tx_nwrq];
+ IB_WORK_REQ *wrq = &tx->tx_wrq[tx->tx_nwrq];
+ int nob = offsetof (kib_msg_t, ibm_u) + body_nob;
- LASSERT (tx->tx_nsp >= 0 &&
- tx->tx_nsp < sizeof(tx->tx_wrq)/sizeof(tx->tx_wrq[0]));
+ LASSERT (tx->tx_nwrq >= 0 &&
+ tx->tx_nwrq < (1 + IBNAL_MAX_RDMA_FRAGS));
LASSERT (nob <= IBNAL_MSG_SIZE);
-
- tx->tx_msg->ibm_magic = IBNAL_MSG_MAGIC;
- tx->tx_msg->ibm_version = IBNAL_MSG_VERSION;
- tx->tx_msg->ibm_type = type;
-#if IBNAL_CKSUM
- tx->tx_msg->ibm_nob = nob;
-#endif
- /* Fence the message if it's bundled with an RDMA read */
- fence = (tx->tx_nsp > 0) &&
- (type == IBNAL_MSG_PUT_DONE);
+
+ kibnal_init_msg(tx->tx_msg, type, body_nob);
*gl = (IB_LOCAL_DATASEGMENT) {
- .Address = tx->tx_vaddr,
+ .Address = tx->tx_hca_msg,
.Length = IBNAL_MSG_SIZE,
- .Lkey = kibnal_lkey(kibnal_data.kib_tx_pages),
+ .Lkey = kibnal_data.kib_whole_mem.md_lkey,
};
- wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0);
+ wrq->WorkReqId = kibnal_ptr2wreqid(tx, IBNAL_WID_TX);
wrq->Operation = WROpSend;
wrq->DSList = gl;
wrq->DSListDepth = 1;
wrq->Req.SendRC.Options.s.SolicitedEvent = 1;
wrq->Req.SendRC.Options.s.SignaledCompletion = 1;
wrq->Req.SendRC.Options.s.ImmediateData = 0;
- wrq->Req.SendRC.Options.s.Fence = fence;
-
- tx->tx_nsp++;
+ wrq->Req.SendRC.Options.s.Fence = 0;
+ /* fence only needed on RDMA reads */
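+ /* NB: this send is signalled, while any RDMA work requests queued
+ * ahead of it by kibnal_init_rdma() are not; an RC queue pair
+ * executes work requests in order, so the send completion also
+ * confirms the RDMA has completed. */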
+
+ tx->tx_nwrq++;
}
-static void
-kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+int
+kibnal_init_rdma (kib_tx_t *tx, int type, int nob,
+ kib_rdma_desc_t *dstrd, __u64 dstcookie)
{
- unsigned long flags;
+ kib_msg_t *ibmsg = tx->tx_msg;
+ kib_rdma_desc_t *srcrd = tx->tx_rd;
+ IB_LOCAL_DATASEGMENT *gl;
+ IB_WORK_REQ *wrq;
+ int rc;
- spin_lock_irqsave(&conn->ibc_lock, flags);
+#if IBNAL_USE_FMR
+ LASSERT (tx->tx_nwrq == 0);
- kibnal_queue_tx_locked (tx, conn);
-
- spin_unlock_irqrestore(&conn->ibc_lock, flags);
-
- kibnal_check_sends(conn);
-}
+ gl = &tx->tx_gl[0];
+ gl->Length = nob;
+ gl->Address = srcrd->rd_addr;
+ gl->Lkey = srcrd->rd_key;
-static void
-kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid)
-{
- unsigned long flags;
- kib_peer_t *peer;
- kib_conn_t *conn;
- rwlock_t *g_lock = &kibnal_data.kib_global_lock;
+ wrq = &tx->tx_wrq[0];
- /* If I get here, I've committed to send, so I complete the tx with
- * failure on any problems */
-
- LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */
- LASSERT (tx->tx_nsp > 0); /* work items have been set up */
+ wrq->WorkReqId = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
+ wrq->Operation = WROpRdmaWrite;
+ wrq->DSList = gl;
+ wrq->DSListDepth = 1;
+ wrq->MessageLen = nob;
- read_lock_irqsave(g_lock, flags);
-
- peer = kibnal_find_peer_locked (nid);
- if (peer == NULL) {
- read_unlock_irqrestore(g_lock, flags);
- tx->tx_status = -EHOSTUNREACH;
- kibnal_tx_done (tx);
- return;
- }
+ wrq->Req.SendRC.ImmediateData = 0;
+ wrq->Req.SendRC.Options.s.SolicitedEvent = 0;
+ wrq->Req.SendRC.Options.s.SignaledCompletion = 0;
+ wrq->Req.SendRC.Options.s.ImmediateData = 0;
+ wrq->Req.SendRC.Options.s.Fence = 0;
- conn = kibnal_find_conn_locked (peer);
- if (conn != NULL) {
- CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
- conn, conn->ibc_state,
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- atomic_read (&conn->ibc_refcount));
- atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
- read_unlock_irqrestore(g_lock, flags);
-
- kibnal_queue_tx (tx, conn);
- return;
- }
-
- /* Making one or more connections; I'll need a write lock... */
- read_unlock(g_lock);
- write_lock(g_lock);
+ wrq->Req.SendRC.RemoteDS.Address = dstrd->rd_addr;
+ wrq->Req.SendRC.RemoteDS.Rkey = dstrd->rd_key;
- peer = kibnal_find_peer_locked (nid);
- if (peer == NULL) {
- write_unlock_irqrestore (g_lock, flags);
- tx->tx_status = -EHOSTUNREACH;
- kibnal_tx_done (tx);
- return;
- }
+ tx->tx_nwrq = 1;
+ rc = nob;
+#else
+ /* CAVEAT EMPTOR: this 'consumes' the frags in 'dstrd' */
+ int resid = nob;
+ kib_rdma_frag_t *srcfrag;
+ int srcidx;
+ kib_rdma_frag_t *dstfrag;
+ int dstidx;
+ int wrknob;
- conn = kibnal_find_conn_locked (peer);
- if (conn != NULL) {
- /* Connection exists; queue message on it */
- CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
- conn, conn->ibc_state,
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- atomic_read (&conn->ibc_refcount));
- atomic_inc (&conn->ibc_refcount); /* 1 ref for the tx */
- write_unlock_irqrestore (g_lock, flags);
+ /* Called by scheduler */
+ LASSERT (!in_interrupt());
+
+ LASSERT (type == IBNAL_MSG_GET_DONE ||
+ type == IBNAL_MSG_PUT_DONE);
+
+ srcidx = dstidx = 0;
+ srcfrag = &srcrd->rd_frags[0];
+ dstfrag = &dstrd->rd_frags[0];
+ rc = resid;
+
+ while (resid > 0) {
+ if (srcidx >= srcrd->rd_nfrag) {
+ CERROR("Src buffer exhausted: %d frags\n", srcidx);
+ rc = -EPROTO;
+ break;
+ }
- kibnal_queue_tx (tx, conn);
- return;
- }
+ if (dstidx == dstrd->rd_nfrag) {
+ CERROR("Dst buffer exhausted: %d frags\n", dstidx);
+ rc = -EPROTO;
+ break;
+ }
- if (peer->ibp_connecting == 0) {
- if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */
- time_after_eq(jiffies, peer->ibp_reconnect_time))) {
- write_unlock_irqrestore (g_lock, flags);
- tx->tx_status = -EHOSTUNREACH;
- kibnal_tx_done (tx);
- return;
+ if (tx->tx_nwrq == IBNAL_MAX_RDMA_FRAGS) {
+ CERROR("RDMA too fragmented: %d/%d src %d/%d dst frags\n",
+ srcidx, srcrd->rd_nfrag,
+ dstidx, dstrd->rd_nfrag);
+ rc = -EMSGSIZE;
+ break;
}
-
- peer->ibp_connecting = 1;
- kib_peer_addref(peer); /* extra ref for connd */
-
- spin_lock (&kibnal_data.kib_connd_lock);
-
- list_add_tail (&peer->ibp_connd_list,
- &kibnal_data.kib_connd_peers);
- wake_up (&kibnal_data.kib_connd_waitq);
-
- spin_unlock (&kibnal_data.kib_connd_lock);
- }
-
- /* A connection is being established; queue the message... */
- list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
- write_unlock_irqrestore (g_lock, flags);
-}
+ wrknob = MIN(MIN(srcfrag->rf_nob, dstfrag->rf_nob), resid);
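+ /* each work request covers the overlap of the current src and
+ * dst fragments; whichever is exhausted advances below, so at
+ * most (src frags + dst frags) work requests are needed */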
-static int
-kibnal_start_passive_rdma (int type, int may_block, lnet_msg_t *lntmsg)
-{
- lnet_nid_t nid = lntmsg->msg_target.nid;
- int nob = lntmsg->msg_md->md_length;
- kib_tx_t *tx;
- kib_msg_t *ibmsg;
- int rc;
- IB_ACCESS_CONTROL access = {0,};
-
- LASSERT (type == IBNAL_MSG_PUT_RDMA || type == IBNAL_MSG_GET_RDMA);
- LASSERT (nob > 0);
- LASSERT (!in_interrupt()); /* Mapping could block */
+ gl = &tx->tx_gl[tx->tx_nwrq];
+ gl->Length = wrknob;
+ gl->Address = srcfrag->rf_addr;
+ gl->Lkey = srcrd->rd_key;
- access.s.MWBindable = 1;
- access.s.LocalWrite = 1;
- access.s.RdmaRead = 1;
- access.s.RdmaWrite = 1;
+ wrq = &tx->tx_wrq[tx->tx_nwrq];
- tx = kibnal_get_idle_tx ();
- if (tx == NULL) {
- CERROR("Can't allocate %s txd for %s\n",
- (type == IBNAL_MSG_PUT_RDMA) ? "PUT/REPLY" : "GET",
- libcfs_nid2str(nid));
- return -ENOMEM;
- }
+ wrq->WorkReqId = kibnal_ptr2wreqid(tx, IBNAL_WID_RDMA);
+ wrq->Operation = WROpRdmaWrite;
+ wrq->DSList = gl;
+ wrq->DSListDepth = 1;
+ wrq->MessageLen = nob;
- if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
- rc = kibnal_map_iov (tx, access,
- lntmsg->msg_md->md_niov,
- lntmsg->msg_md->md_iov.iov,
- 0, nob, 0);
- else
- rc = kibnal_map_kiov (tx, access,
- lntmsg->msg_md->md_niov,
- lntmsg->msg_md->md_iov.kiov,
- 0, nob, 0);
+ wrq->Req.SendRC.ImmediateData = 0;
+ wrq->Req.SendRC.Options.s.SolicitedEvent = 0;
+ wrq->Req.SendRC.Options.s.SignaledCompletion = 0;
+ wrq->Req.SendRC.Options.s.ImmediateData = 0;
+ wrq->Req.SendRC.Options.s.Fence = 0;
- if (rc != 0) {
- CERROR ("Can't map RDMA for %s: %d\n",
- libcfs_nid2str(nid), rc);
- goto failed;
- }
-
- if (type == IBNAL_MSG_GET_RDMA) {
- /* reply gets finalized when tx completes */
- tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni,
- lntmsg);
- if (tx->tx_lntmsg[1] == NULL) {
- CERROR ("Can't create reply for GET -> %s\n",
- libcfs_nid2str(nid));
- rc = -ENOMEM;
- goto failed;
+ wrq->Req.SendRC.RemoteDS.Address = dstfrag->rf_addr;
+ wrq->Req.SendRC.RemoteDS.Rkey = dstrd->rd_key;
+
+ resid -= wrknob;
+ if (wrknob < srcfrag->rf_nob) {
+ srcfrag->rf_addr += wrknob;
+ srcfrag->rf_nob -= wrknob;
+ } else {
+ srcfrag++;
+ srcidx++;
+ }
+
+ if (wrknob < dstfrag->rf_nob) {
+ dstfrag->rf_addr += wrknob;
+ dstfrag->rf_nob -= wrknob;
+ } else {
+ dstfrag++;
+ dstidx++;
}
+
+ tx->tx_nwrq++;
}
+
+ if (rc < 0) /* no RDMA if completing with failure */
+ tx->tx_nwrq = 0;
+#endif
- tx->tx_passive_rdma = 1;
+ ibmsg->ibm_u.completion.ibcm_status = rc;
+ ibmsg->ibm_u.completion.ibcm_cookie = dstcookie;
+ kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
- ibmsg = tx->tx_msg;
+ return rc;
+}
- ibmsg->ibm_u.rdma.ibrm_hdr = lntmsg->msg_hdr;
- ibmsg->ibm_u.rdma.ibrm_cookie = tx->tx_passive_rdma_cookie;
- /* map_kiov alrady filled the rdma descs for the whole_mem case */
- if (!kibnal_whole_mem()) {
- ibmsg->ibm_u.rdma.rd_key = tx->tx_md.md_rkey;
- ibmsg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
- ibmsg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
- ibmsg->ibm_u.rdma.ibrm_num_descs = 1;
- }
+void
+kibnal_queue_tx (kib_tx_t *tx, kib_conn_t *conn)
+{
+ spin_lock(&conn->ibc_lock);
+ kibnal_queue_tx_locked (tx, conn);
+ spin_unlock(&conn->ibc_lock);
+
+ kibnal_check_sends(conn);
+}
- kibnal_init_tx_msg (tx, type,
- kib_rdma_msg_len(ibmsg->ibm_u.rdma.ibrm_num_descs));
+void
+kibnal_schedule_active_connect_locked (kib_peer_t *peer)
+{
+ /* Called holding kib_global_lock exclusive */
+ unsigned long flags;
- CDEBUG(D_NET, "Passive: %p cookie "LPX64", key %x, addr "
- LPX64", nob %d\n",
- tx, tx->tx_passive_rdma_cookie, tx->tx_md.md_rkey,
- tx->tx_md.md_addr, nob);
-
- /* lntmsg gets finalized when tx completes. */
- tx->tx_lntmsg[0] = lntmsg;
+ peer->ibp_connecting++; /* I'm connecting */
+ kibnal_peer_addref(peer); /* extra ref for connd */
- kibnal_launch_tx(tx, nid);
- return (0);
+ spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
- failed:
- tx->tx_status = rc;
- kibnal_tx_done (tx);
- return (-EIO);
+ list_add_tail (&peer->ibp_connd_list, &kibnal_data.kib_connd_peers);
+ wake_up (&kibnal_data.kib_connd_waitq);
+
+ spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
}
void
-kibnal_start_active_rdma (int type, int status,
- kib_rx_t *rx, lnet_msg_t *lntmsg,
- unsigned int niov,
- struct iovec *iov, lnet_kiov_t *kiov,
- unsigned int offset, unsigned int nob)
+kibnal_launch_tx (kib_tx_t *tx, lnet_nid_t nid)
{
- kib_msg_t *rxmsg = rx->rx_msg;
- kib_msg_t *txmsg;
- kib_tx_t *tx;
- IB_ACCESS_CONTROL access = {0,};
- IB_WR_OP rdma_op;
- int rc;
- __u32 i;
+ kib_peer_t *peer;
+ kib_conn_t *conn;
+ unsigned long flags;
+ rwlock_t *g_lock = &kibnal_data.kib_global_lock;
+ int retry;
+ int rc;
- CDEBUG(D_NET, "type %d, status %d, niov %d, offset %d, nob %d\n",
- type, status, niov, offset, nob);
+ /* If I get here, I've committed to send, so I complete the tx with
+ * failure on any problems */
+
+ LASSERT (tx->tx_conn == NULL); /* only set when assigned a conn */
+ LASSERT (tx->tx_nwrq > 0); /* work items have been set up */
- /* Called by scheduler */
- LASSERT (!in_interrupt ());
+ for (retry = 0; ; retry = 1) {
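+ /* NB: at most two passes: if the peer isn't found on the first
+ * pass, add a persistent peer under the write lock and retry;
+ * a miss on the retry is fatal. */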
+ read_lock_irqsave(g_lock, flags);
+
+ peer = kibnal_find_peer_locked (nid);
+ if (peer != NULL) {
+ conn = kibnal_find_conn_locked (peer);
+ if (conn != NULL) {
+ kibnal_conn_addref(conn); /* 1 ref for me... */
+ read_unlock_irqrestore(g_lock, flags);
+
+ kibnal_queue_tx (tx, conn);
+ kibnal_conn_decref(conn); /* ...to here */
+ return;
+ }
+ }
+
+ /* Making one or more connections; I'll need a write lock... */
+ read_unlock(g_lock);
+ write_lock(g_lock);
- /* Either all pages or all vaddrs */
- LASSERT (!(kiov != NULL && iov != NULL));
+ peer = kibnal_find_peer_locked (nid);
+ if (peer != NULL)
+ break;
- /* No data if we're completing with failure */
- LASSERT (status == 0 || nob == 0);
+ write_unlock_irqrestore(g_lock, flags);
- LASSERT (type == IBNAL_MSG_GET_DONE ||
- type == IBNAL_MSG_PUT_DONE);
+ if (retry) {
+ CERROR("Can't find peer %s\n", libcfs_nid2str(nid));
- /* Flag I'm completing the RDMA. Even if I fail to send the
- * completion message, I will have tried my best so further
- * attempts shouldn't be tried. */
- LASSERT (!rx->rx_rdma);
- rx->rx_rdma = 1;
+ tx->tx_status = -EHOSTUNREACH;
+ tx->tx_waiting = 0;
+ kibnal_tx_done (tx);
+ return;
+ }
- if (type == IBNAL_MSG_GET_DONE) {
- rdma_op = WROpRdmaWrite;
- LASSERT (rxmsg->ibm_type == IBNAL_MSG_GET_RDMA);
- } else {
- access.s.LocalWrite = 1;
- rdma_op = WROpRdmaRead;
- LASSERT (rxmsg->ibm_type == IBNAL_MSG_PUT_RDMA);
+ rc = kibnal_add_persistent_peer(nid);
+ if (rc != 0) {
+ CERROR("Can't add peer %s: %d\n",
+ libcfs_nid2str(nid), rc);
+
+ tx->tx_status = -EHOSTUNREACH;
+ tx->tx_waiting = 0;
+ kibnal_tx_done (tx);
+ return;
+ }
}
- tx = kibnal_get_idle_tx ();
- if (tx == NULL) {
- CERROR ("tx descs exhausted on RDMA from %s"
- " completing locally with failure\n",
- libcfs_nid2str(rx->rx_conn->ibc_peer->ibp_nid));
- lnet_finalize (kibnal_data.kib_ni, lntmsg, -ENOMEM);
+ conn = kibnal_find_conn_locked (peer);
+ if (conn != NULL) {
+ /* Connection exists; queue message on it */
+ kibnal_conn_addref(conn); /* 1 ref for me... */
+ write_unlock_irqrestore(g_lock, flags);
+
+ kibnal_queue_tx (tx, conn);
+ kibnal_conn_decref(conn); /* ...until here */
return;
}
- LASSERT (tx->tx_nsp == 0);
-
- if (nob == 0)
- GOTO(init_tx, 0);
-
- /* We actually need to transfer some data (the transfer
- * size could get truncated to zero when the incoming
- * message is matched) */
- if (kiov != NULL)
- rc = kibnal_map_kiov (tx, access, niov, kiov, offset, nob, 1);
- else
- rc = kibnal_map_iov (tx, access, niov, iov, offset, nob, 1);
-
- if (rc != 0) {
- CERROR ("Can't map RDMA -> %s: %d\n",
- libcfs_nid2str(rx->rx_conn->ibc_peer->ibp_nid), rc);
- /* We'll skip the RDMA and complete with failure. */
- status = rc;
- nob = 0;
- GOTO(init_tx, rc);
- }
-
- if (!kibnal_whole_mem()) {
- tx->tx_msg->ibm_u.rdma.rd_key = tx->tx_md.md_lkey;
- tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_addr = tx->tx_md.md_addr;
- tx->tx_msg->ibm_u.rdma.ibrm_desc[0].rd_nob = nob;
- tx->tx_msg->ibm_u.rdma.ibrm_num_descs = 1;
- }
-
- /* XXX ugh. different page-sized hosts. */
- if (tx->tx_msg->ibm_u.rdma.ibrm_num_descs !=
- rxmsg->ibm_u.rdma.ibrm_num_descs) {
- CERROR("tx descs (%u) != rx descs (%u)\n",
- tx->tx_msg->ibm_u.rdma.ibrm_num_descs,
- rxmsg->ibm_u.rdma.ibrm_num_descs);
- /* We'll skip the RDMA and complete with failure. */
- status = rc;
- nob = 0;
- GOTO(init_tx, rc);
- }
-
- /* map_kiov filled in the rdma descs which describe our side of the
- * rdma transfer. */
- /* ibrm_num_descs was verified in rx_callback */
- for(i = 0; i < rxmsg->ibm_u.rdma.ibrm_num_descs; i++) {
- kib_rdma_desc_t *ldesc, *rdesc; /* local, remote */
- IB_LOCAL_DATASEGMENT *ds = &tx->tx_gl[i];
- IB_WORK_REQ *wrq = &tx->tx_wrq[i];
-
- ldesc = &tx->tx_msg->ibm_u.rdma.ibrm_desc[i];
- rdesc = &rxmsg->ibm_u.rdma.ibrm_desc[i];
-
- ds->Address = ldesc->rd_addr;
- ds->Length = ldesc->rd_nob;
- ds->Lkey = tx->tx_msg->ibm_u.rdma.rd_key;
-
- memset(wrq, 0, sizeof(*wrq));
- wrq->WorkReqId = kibnal_ptr2wreqid(tx, 0);
- wrq->Operation = rdma_op;
- wrq->DSList = ds;
- wrq->DSListDepth = 1;
- wrq->MessageLen = ds->Length;
- wrq->Req.SendRC.ImmediateData = 0;
- wrq->Req.SendRC.Options.s.SolicitedEvent = 0;
- wrq->Req.SendRC.Options.s.SignaledCompletion = 0;
- wrq->Req.SendRC.Options.s.ImmediateData = 0;
- wrq->Req.SendRC.Options.s.Fence = 0;
- wrq->Req.SendRC.RemoteDS.Address = rdesc->rd_addr;
- wrq->Req.SendRC.RemoteDS.Rkey = rxmsg->ibm_u.rdma.rd_key;
- /* only the last rdma post triggers tx completion */
- if (i == rxmsg->ibm_u.rdma.ibrm_num_descs - 1)
- wrq->Req.SendRC.Options.s.SignaledCompletion = 1;
+ if (peer->ibp_connecting == 0) {
+ if (!(peer->ibp_reconnect_interval == 0 || /* first attempt */
+ time_after_eq(jiffies, peer->ibp_reconnect_time))) {
+ write_unlock_irqrestore(g_lock, flags);
+ tx->tx_status = -EHOSTUNREACH;
+ tx->tx_waiting = 0;
+ kibnal_tx_done (tx);
+ return;
+ }
- tx->tx_nsp++;
+ kibnal_schedule_active_connect_locked(peer);
}
-
-init_tx:
- txmsg = tx->tx_msg;
-
- txmsg->ibm_u.completion.ibcm_cookie = rxmsg->ibm_u.rdma.ibrm_cookie;
- txmsg->ibm_u.completion.ibcm_status = status;
- kibnal_init_tx_msg(tx, type, sizeof (kib_completion_msg_t));
+ /* A connection is being established; queue the message... */
+ list_add_tail (&tx->tx_list, &peer->ibp_tx_queue);
- if (status == 0 && nob != 0) {
- LASSERT (tx->tx_nsp > 1);
- /* RDMA: lntmsg gets finalized when the tx completes. This
- * is after the completion message has been sent, which in
- * turn is after the RDMA has finished. */
- tx->tx_lntmsg[0] = lntmsg;
- } else {
- LASSERT (tx->tx_nsp == 1);
- /* No RDMA: local completion happens now! */
- CDEBUG(D_WARNING,"No data: immediate completion\n");
- lnet_finalize (kibnal_data.kib_ni, lntmsg,
- status == 0 ? 0 : -EIO);
- }
-
- /* +1 ref for this tx... */
- CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
- rx->rx_conn, rx->rx_conn->ibc_state,
- libcfs_nid2str(rx->rx_conn->ibc_peer->ibp_nid),
- atomic_read (&rx->rx_conn->ibc_refcount));
- atomic_inc (&rx->rx_conn->ibc_refcount);
- /* ...and queue it up */
- kibnal_queue_tx(tx, rx->rx_conn);
+ write_unlock_irqrestore(g_lock, flags);
}
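/*
 * Illustrative sketch (editor's addition, not part of this patch): the gate
 * above only launches a new connection attempt once the peer's backoff
 * window has expired; otherwise the tx completes immediately with
 * -EHOSTUNREACH.  A minimal user-space model of that test, with invented
 * names, assuming the interval doubles on failure and is clamped by the
 * max_reconnect_interval tunable seen later in kibnal_peer_connect_failed():
 */
#include <stdio.h>

#define HZ            100
#define MIN_INTERVAL  1                         /* seconds */
#define MAX_INTERVAL  64                        /* seconds */
#define time_after_eq(a, b)  ((long)(a) - (long)(b) >= 0)

struct peer_model {
        int           reconnect_interval;       /* 0 == first attempt */
        unsigned long reconnect_time;           /* earliest next attempt */
};

static int may_connect(struct peer_model *p, unsigned long now)
{
        return p->reconnect_interval == 0 ||    /* first attempt */
               time_after_eq(now, p->reconnect_time);
}

static void connect_failed(struct peer_model *p, unsigned long now)
{
        p->reconnect_interval = p->reconnect_interval == 0 ?
                                MIN_INTERVAL : p->reconnect_interval * 2;
        if (p->reconnect_interval > MAX_INTERVAL)
                p->reconnect_interval = MAX_INTERVAL;
        p->reconnect_time = now + p->reconnect_interval * HZ;
}

int main(void)
{
        struct peer_model p = { 0, 0 };
        unsigned long now = 0;
        int i;

        for (i = 0; i < 5; i++) {
                printf("t=%-5lu may_connect=%d\n", now, may_connect(&p, now));
                connect_failed(&p, now);
                now = p.reconnect_time;         /* sleep out the backoff */
        }
        return 0;
}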
int
lnet_process_id_t target = lntmsg->msg_target;
int target_is_router = lntmsg->msg_target_is_router;
int routing = lntmsg->msg_routing;
- unsigned int payload_niov = lntmsg->msg_niov;
+ unsigned int payload_niov = lntmsg->msg_niov;
struct iovec *payload_iov = lntmsg->msg_iov;
lnet_kiov_t *payload_kiov = lntmsg->msg_kiov;
unsigned int payload_offset = lntmsg->msg_offset;
kib_msg_t *ibmsg;
kib_tx_t *tx;
int nob;
+ int rc;
/* NB 'private' is different depending on what we're sending.... */
- CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
+ CDEBUG(D_NET, "sending %d bytes in %d frags to %s\n",
payload_nob, payload_niov, libcfs_id2str(target));
LASSERT (payload_nob == 0 || payload_niov > 0);
LASSERT (payload_niov <= LNET_MAX_IOV);
- /* Thread context if we're sending payload */
- LASSERT (!in_interrupt() || payload_nob == 0);
+ /* Thread context */
+ LASSERT (!in_interrupt());
/* payload is either all vaddrs or all pages */
LASSERT (!(payload_kiov != NULL && payload_iov != NULL));
case LNET_MSG_GET:
if (routing || target_is_router)
break; /* send IMMEDIATE */
-
+
/* is the REPLY message too small for RDMA? */
nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
if (nob <= IBNAL_MSG_SIZE)
break; /* send IMMEDIATE */
- return kibnal_start_passive_rdma(IBNAL_MSG_GET_RDMA, 1, lntmsg);
+ tx = kibnal_get_idle_tx();
+ if (tx == NULL) {
+ CERROR("Can allocate txd for GET to %s: \n",
+ libcfs_nid2str(target.nid));
+ return -ENOMEM;
+ }
+
+ ibmsg = tx->tx_msg;
+ ibmsg->ibm_u.get.ibgm_hdr = *hdr;
+ ibmsg->ibm_u.get.ibgm_cookie = tx->tx_cookie;
+
+ if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0)
+ rc = kibnal_setup_rd_iov(tx, &ibmsg->ibm_u.get.ibgm_rd,
+ 0,
+ lntmsg->msg_md->md_niov,
+ lntmsg->msg_md->md_iov.iov,
+ 0, lntmsg->msg_md->md_length);
+ else
+ rc = kibnal_setup_rd_kiov(tx, &ibmsg->ibm_u.get.ibgm_rd,
+ 0,
+ lntmsg->msg_md->md_niov,
+ lntmsg->msg_md->md_iov.kiov,
+ 0, lntmsg->msg_md->md_length);
+ if (rc != 0) {
+ CERROR("Can't setup GET sink for %s: %d\n",
+ libcfs_nid2str(target.nid), rc);
+ kibnal_tx_done(tx);
+ return -EIO;
+ }
+
+#if IBNAL_USE_FMR
+ nob = sizeof(kib_get_msg_t);
+#else
+ {
+ int n = ibmsg->ibm_u.get.ibgm_rd.rd_nfrag;
+
+ nob = offsetof(kib_get_msg_t, ibgm_rd.rd_frags[n]);
+ }
+#endif
+ kibnal_init_tx_msg(tx, IBNAL_MSG_GET_REQ, nob);
+
+ tx->tx_lntmsg[1] = lnet_create_reply_msg(kibnal_data.kib_ni,
+ lntmsg);
+ if (tx->tx_lntmsg[1] == NULL) {
+ CERROR("Can't create reply for GET -> %s\n",
+ libcfs_nid2str(target.nid));
+ kibnal_tx_done(tx);
+ return -EIO;
+ }
+
+ tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg[0,1] on completion */
+ tx->tx_waiting = 1; /* waiting for GET_DONE */
+ kibnal_launch_tx(tx, target.nid);
+ return 0;
case LNET_MSG_REPLY: {
/* reply's 'private' is the incoming receive */
kib_rx_t *rx = private;
- LASSERT (routing || rx != NULL);
+ LASSERT(routing || rx != NULL);
- /* RDMA reply expected? */
if (!routing && rx->rx_msg->ibm_type != IBNAL_MSG_IMMEDIATE) {
- /* Incoming message consistent with RDMA */
- if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_RDMA) {
- CERROR ("REPLY to %s bad ibm type %d!!!\n",
- libcfs_nid2str(target.nid),
- rx->rx_msg->ibm_type);
- return (-EIO);
+ /* Incoming message consistent with RDMA? */
+ if (rx->rx_msg->ibm_type != IBNAL_MSG_GET_REQ) {
+ CERROR("REPLY to %s bad msg type %x!!!\n",
+ libcfs_nid2str(target.nid),
+ rx->rx_msg->ibm_type);
+ return -EIO;
}
- kibnal_start_active_rdma(IBNAL_MSG_GET_DONE, 0,
- rx, lntmsg, payload_niov,
- payload_iov, payload_kiov,
- payload_offset, payload_nob);
- return (0);
+ /* NB handle_rx() will send GET_NAK when I return to
+ * it from here, unless I set rx_responded! */
+
+ tx = kibnal_get_idle_tx();
+ if (tx == NULL) {
+ CERROR("Can't get tx for REPLY to %s\n",
+ libcfs_nid2str(target.nid));
+ return -ENOMEM;
+ }
+
+ if (payload_nob == 0)
+ rc = 0;
+ else if (payload_kiov == NULL)
+ rc = kibnal_setup_rd_iov(
+ tx, tx->tx_rd, 1,
+ payload_niov, payload_iov,
+ payload_offset, payload_nob);
+ else
+ rc = kibnal_setup_rd_kiov(
+ tx, tx->tx_rd, 1,
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
+ if (rc != 0) {
+ CERROR("Can't setup GET src for %s: %d\n",
+ libcfs_nid2str(target.nid), rc);
+ kibnal_tx_done(tx);
+ return -EIO;
+ }
+
+ rc = kibnal_init_rdma(tx, IBNAL_MSG_GET_DONE,
+ payload_nob,
+ &rx->rx_msg->ibm_u.get.ibgm_rd,
+ rx->rx_msg->ibm_u.get.ibgm_cookie);
+ if (rc < 0) {
+ CERROR("Can't setup rdma for GET from %s: %d\n",
+ libcfs_nid2str(target.nid), rc);
+ } else if (rc == 0) {
+ /* No RDMA: local completion may happen now! */
+ lnet_finalize (kibnal_data.kib_ni, lntmsg, 0);
+ } else {
+ /* RDMA: lnet_finalize(lntmsg) when it
+ * completes */
+ tx->tx_lntmsg[0] = lntmsg;
+ }
+
+ kibnal_queue_tx(tx, rx->rx_conn);
+ rx->rx_responded = 1;
+ return (rc >= 0) ? 0 : -EIO;
}
- /* Fall through to handle like PUT */
+ /* fall through to handle like PUT */
}
case LNET_MSG_PUT:
nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob]);
if (nob <= IBNAL_MSG_SIZE)
break; /* send IMMEDIATE */
-
- return kibnal_start_passive_rdma(IBNAL_MSG_PUT_RDMA,
- !(routing || type == LNET_MSG_REPLY),
- lntmsg);
+
+ tx = kibnal_get_idle_tx();
+ if (tx == NULL) {
+ CERROR("Can't allocate %s txd for %s\n",
+ type == LNET_MSG_PUT ? "PUT" : "REPLY",
+ libcfs_nid2str(target.nid));
+ return -ENOMEM;
+ }
+
+ if (payload_kiov == NULL)
+ rc = kibnal_setup_rd_iov(tx, tx->tx_rd, 1,
+ payload_niov, payload_iov,
+ payload_offset, payload_nob);
+ else
+ rc = kibnal_setup_rd_kiov(tx, tx->tx_rd, 1,
+ payload_niov, payload_kiov,
+ payload_offset, payload_nob);
+ if (rc != 0) {
+ CERROR("Can't setup PUT src for %s: %d\n",
+ libcfs_nid2str(target.nid), rc);
+ kibnal_tx_done(tx);
+ return -EIO;
+ }
+
+ ibmsg = tx->tx_msg;
+ ibmsg->ibm_u.putreq.ibprm_hdr = *hdr;
+ ibmsg->ibm_u.putreq.ibprm_cookie = tx->tx_cookie;
+ kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_REQ, sizeof(kib_putreq_msg_t));
+
+ tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
+ tx->tx_waiting = 1; /* waiting for PUT_{ACK,NAK} */
+ kibnal_launch_tx(tx, target.nid);
+ return 0;
}
- /* Send IMMEDIATE */
+ /* send IMMEDIATE */
+
+ LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
+ <= IBNAL_MSG_SIZE);
tx = kibnal_get_idle_tx();
if (tx == NULL) {
- CERROR ("Can't send %d to %s: tx descs exhausted%s\n",
- type, libcfs_nid2str(target.nid),
- in_interrupt() ? " (intr)" : "");
- return (-ENOMEM);
+ CERROR ("Can't send %d to %s: tx descs exhausted\n",
+ type, libcfs_nid2str(target.nid));
+ return -ENOMEM;
}
ibmsg = tx->tx_msg;
ibmsg->ibm_u.immediate.ibim_hdr = *hdr;
if (payload_kiov != NULL)
- lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg,
+ lnet_copy_kiov2flat(IBNAL_MSG_SIZE, ibmsg,
offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
- payload_niov, payload_kiov,
+ payload_niov, payload_kiov,
payload_offset, payload_nob);
else
- lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg,
+ lnet_copy_iov2flat(IBNAL_MSG_SIZE, ibmsg,
offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
- payload_niov, payload_iov,
+ payload_niov, payload_iov,
payload_offset, payload_nob);
- kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE,
- offsetof(kib_immediate_msg_t,
- ibim_payload[payload_nob]));
-
- /* lntmsg gets finalized when tx completes */
- tx->tx_lntmsg[0] = lntmsg;
+ nob = offsetof(kib_immediate_msg_t, ibim_payload[payload_nob]);
+ kibnal_init_tx_msg (tx, IBNAL_MSG_IMMEDIATE, nob);
+ tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
kibnal_launch_tx(tx, target.nid);
- return (0);
+ return 0;
}
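/*
 * Illustrative sketch (editor's addition, not part of this patch): the
 * GET_REQ path above and the PUT_ACK path just below size their wire
 * messages with offsetof(type, frags[n]) so that only the RDMA fragments
 * actually in use are sent, rather than sizeof() of the worst-case struct.
 * Self-contained demonstration with made-up types:
 */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

typedef struct {
        uint64_t rf_addr;
        uint32_t rf_nob;
} frag_t;

typedef struct {
        uint32_t rd_key;
        uint32_t rd_nfrag;                      /* frags actually used */
        frag_t   rd_frags[16];                  /* worst-case array */
} rd_t;

typedef struct {
        uint64_t gm_cookie;
        rd_t     gm_rd;
} get_msg_t;

int main(void)
{
        int n;

        for (n = 0; n <= 3; n++)
                printf("nfrag=%d -> send %zu bytes (full struct %zu)\n",
                       n, offsetof(get_msg_t, gm_rd.rd_frags[n]),
                       sizeof(get_msg_t));
        return 0;
}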
int
-kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
- int delayed, unsigned int niov,
- struct iovec *iov, lnet_kiov_t *kiov,
+kibnal_recv (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
+ unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
unsigned int offset, unsigned int mlen, unsigned int rlen)
{
kib_rx_t *rx = private;
kib_msg_t *rxmsg = rx->rx_msg;
- int msg_nob;
+ kib_conn_t *conn = rx->rx_conn;
+ kib_tx_t *tx;
+ kib_msg_t *txmsg;
+ int nob;
int rc = 0;
-
+
LASSERT (mlen <= rlen);
- LASSERT (!in_interrupt ());
+ LASSERT (!in_interrupt());
/* Either all pages or all vaddrs */
LASSERT (!(kiov != NULL && iov != NULL));
switch (rxmsg->ibm_type) {
default:
LBUG();
-
+
case IBNAL_MSG_IMMEDIATE:
- msg_nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
- if (msg_nob > rx->rx_nob) {
+ nob = offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[rlen]);
+ if (nob > rx->rx_nob) {
CERROR ("Immediate message from %s too big: %d(%d)\n",
libcfs_nid2str(rxmsg->ibm_u.immediate.ibim_hdr.src_nid),
- msg_nob, rx->rx_nob);
+ nob, rx->rx_nob);
rc = -EPROTO;
break;
}
if (kiov != NULL)
lnet_copy_flat2kiov(niov, kiov, offset,
IBNAL_MSG_SIZE, rxmsg,
offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
mlen);
else
- lnet_copy_flat2iov(niov, iov, offset,
- IBNAL_MSG_SIZE, rxmsg,
- offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
- mlen);
+ lnet_copy_flat2iov(niov, iov, offset,
+ IBNAL_MSG_SIZE, rxmsg,
+ offsetof(kib_msg_t, ibm_u.immediate.ibim_payload),
+ mlen);
+ lnet_finalize (ni, lntmsg, 0);
+ break;
+
+ case IBNAL_MSG_PUT_REQ:
+ if (mlen == 0) {
+ lnet_finalize(ni, lntmsg, 0);
+ kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, 0,
+ rxmsg->ibm_u.putreq.ibprm_cookie);
+ break;
+ }
+
+ tx = kibnal_get_idle_tx();
+ if (tx == NULL) {
+ CERROR("Can't allocate tx for %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ /* Not replying will break the connection */
+ rc = -ENOMEM;
+ break;
+ }
+
+ txmsg = tx->tx_msg;
+ if (kiov == NULL)
+ rc = kibnal_setup_rd_iov(tx,
+ &txmsg->ibm_u.putack.ibpam_rd,
+ 0,
+ niov, iov, offset, mlen);
+ else
+ rc = kibnal_setup_rd_kiov(tx,
+ &txmsg->ibm_u.putack.ibpam_rd,
+ 0,
+ niov, kiov, offset, mlen);
+ if (rc != 0) {
+ CERROR("Can't setup PUT sink for %s: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
+ kibnal_tx_done(tx);
+ /* tell peer it's over */
+ kibnal_send_completion(rx->rx_conn, IBNAL_MSG_PUT_NAK, rc,
+ rxmsg->ibm_u.putreq.ibprm_cookie);
+ break;
+ }
- lnet_finalize (ni, lntmsg, 0);
+ txmsg->ibm_u.putack.ibpam_src_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
+ txmsg->ibm_u.putack.ibpam_dst_cookie = tx->tx_cookie;
+#if IBNAL_USE_FMR
+ nob = sizeof(kib_putack_msg_t);
+#else
+ {
+ int n = tx->tx_msg->ibm_u.putack.ibpam_rd.rd_nfrag;
+
+ nob = offsetof(kib_putack_msg_t, ibpam_rd.rd_frags[n]);
+ }
+#endif
+ kibnal_init_tx_msg(tx, IBNAL_MSG_PUT_ACK, nob);
+
+ tx->tx_lntmsg[0] = lntmsg; /* finalise lntmsg on completion */
+ tx->tx_waiting = 1; /* waiting for PUT_DONE */
+ kibnal_queue_tx(tx, conn);
break;
- case IBNAL_MSG_GET_RDMA:
+ case IBNAL_MSG_GET_REQ:
LASSERT (lntmsg == NULL); /* no need to finalise */
- if (!rx->rx_rdma) {
+ if (!rx->rx_responded) {
/* GET didn't match anything */
- kibnal_start_active_rdma (IBNAL_MSG_GET_DONE, -ENODATA,
- rx, NULL, 0, NULL, NULL, 0, 0);
+ kibnal_send_completion(rx->rx_conn, IBNAL_MSG_GET_DONE,
+ -ENODATA,
+ rxmsg->ibm_u.get.ibgm_cookie);
}
break;
-
- case IBNAL_MSG_PUT_RDMA:
- kibnal_start_active_rdma (IBNAL_MSG_PUT_DONE, 0, rx, lntmsg,
- niov, iov, kiov, offset, mlen);
- break;
}
kibnal_post_rx(rx, 1);
return rc;
}
-/*****************************************************************************
- * the rest of this file concerns connection management. active connetions
- * start with connect_peer, passive connections start with passive_callback.
- * active disconnects start with conn_close, cm_callback starts passive
- * disconnects and contains the guts of how the disconnect state machine
- * progresses.
- *****************************************************************************/
-
int
kibnal_thread_start (int (*fn)(void *arg), void *arg)
{
return (0);
}
-static void
+void
kibnal_thread_fini (void)
{
atomic_dec (&kibnal_data.kib_nthreads);
}
-/* this can be called by anyone at any time to close a connection. if
- * the connection is still established it heads to the connd to start
- * the disconnection in a safe context. It has no effect if called
- * on a connection that is already disconnecting */
void
-kibnal_close_conn_locked (kib_conn_t *conn, int error)
+kibnal_schedule_conn (kib_conn_t *conn)
{
- /* This just does the immmediate housekeeping, and schedules the
- * connection for the connd to finish off.
- * Caller holds kib_global_lock exclusively in irq context */
- kib_peer_t *peer = conn->ibc_peer;
+ unsigned long flags;
- KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_CONNECTING,
- IBNAL_CONN_DISCONNECTED);
+ kibnal_conn_addref(conn); /* ++ref for connd */
+
+ spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
- if (conn->ibc_state > IBNAL_CONN_ESTABLISHED)
- return; /* already disconnecting */
+ list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
+ wake_up (&kibnal_data.kib_connd_waitq);
+
+ spin_unlock_irqrestore(&kibnal_data.kib_connd_lock, flags);
+}
- CDEBUG (error == 0 ? D_NET : D_ERROR,
- "closing conn to %s: error %d\n",
- libcfs_nid2str(peer->ibp_nid), error);
+void
+kibnal_close_conn_locked (kib_conn_t *conn, int error)
+{
+ /* This just does the immediate housekeeping to start shutdown of an
+ * established connection. 'error' is zero for a normal shutdown.
+ * Caller holds kib_global_lock exclusively in irq context */
+ kib_peer_t *peer = conn->ibc_peer;
+
+ LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
- if (conn->ibc_state == IBNAL_CONN_ESTABLISHED) {
- /* kib_connd_conns takes ibc_list's ref */
- list_del (&conn->ibc_list);
+ if (conn->ibc_state != IBNAL_CONN_ESTABLISHED)
+ return; /* already being handled */
+
+ /* NB Can't take ibc_lock here (could be in IRQ context), without
+ * risking deadlock, so access to ibc_{tx_queue,active_txs} is racey */
+
+ if (error == 0 &&
+ list_empty(&conn->ibc_tx_queue) &&
+ list_empty(&conn->ibc_active_txs)) {
+ CDEBUG(D_NET, "closing conn to %s"
+ " rx# "LPD64" tx# "LPD64"\n",
+ libcfs_nid2str(peer->ibp_nid),
+ conn->ibc_txseq, conn->ibc_rxseq);
} else {
- /* new ref for kib_connd_conns */
- CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
- conn, conn->ibc_state,
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- atomic_read (&conn->ibc_refcount));
- atomic_inc (&conn->ibc_refcount);
+ CERROR("Closing conn to %s: error %d%s%s"
+ " rx# "LPD64" tx# "LPD64"\n",
+ libcfs_nid2str(peer->ibp_nid), error,
+ list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
+ list_empty(&conn->ibc_active_txs) ? "" : "(waiting)",
+ conn->ibc_txseq, conn->ibc_rxseq);
+#if 0
+ /* can't skip down the queue without holding ibc_lock (see above) */
+ list_for_each(tmp, &conn->ibc_tx_queue) {
+ kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
+
+ CERROR(" queued tx type %x cookie "LPX64
+ " sending %d waiting %d ticks %ld/%d\n",
+ tx->tx_msg->ibm_type, tx->tx_cookie,
+ tx->tx_sending, tx->tx_waiting,
+ (long)(tx->tx_deadline - jiffies), HZ);
+ }
+
+ list_for_each(tmp, &conn->ibc_active_txs) {
+ kib_tx_t *tx = list_entry(tmp, kib_tx_t, tx_list);
+
+ CERROR(" active tx type %x cookie "LPX64
+ " sending %d waiting %d ticks %ld/%d\n",
+ tx->tx_msg->ibm_type, tx->tx_cookie,
+ tx->tx_sending, tx->tx_waiting,
+ (long)(tx->tx_deadline - jiffies), HZ);
+ }
+#endif
}
+
+ list_del (&conn->ibc_list);
if (list_empty (&peer->ibp_conns) && /* no more conns */
peer->ibp_persistence == 0 && /* non-persistent peer */
kibnal_peer_active(peer)) { /* still in peer table */
kibnal_unlink_peer_locked (peer);
}
- conn->ibc_state = IBNAL_CONN_SEND_DREQ;
+ kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTING);
- spin_lock (&kibnal_data.kib_connd_lock);
+ kibnal_schedule_conn(conn);
+ kibnal_conn_decref(conn); /* lose ibc_list's ref */
+}
- list_add_tail (&conn->ibc_list, &kibnal_data.kib_connd_conns);
- wake_up (&kibnal_data.kib_connd_waitq);
+void
+kibnal_close_conn (kib_conn_t *conn, int error)
+{
+ unsigned long flags;
+
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+
+ kibnal_close_conn_locked (conn, error);
+
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+}
+
+void
+kibnal_handle_early_rxs(kib_conn_t *conn)
+{
+ unsigned long flags;
+ kib_rx_t *rx;
+
+ LASSERT (!in_interrupt());
+ LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
+
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+ while (!list_empty(&conn->ibc_early_rxs)) {
+ rx = list_entry(conn->ibc_early_rxs.next,
+ kib_rx_t, rx_list);
+ list_del(&rx->rx_list);
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
- spin_unlock (&kibnal_data.kib_connd_lock);
+ kibnal_handle_rx(rx);
+
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+ }
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
}
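/*
 * Illustrative sketch (editor's addition, not part of this patch):
 * kibnal_handle_early_rxs() above pops one rx at a time and drops the
 * global lock across the handler, because kibnal_handle_rx() may need
 * other locks or block.  The same drain pattern with a pthread mutex and
 * a minimal queue (all names invented):
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct item { struct item *next; int id; };

static pthread_mutex_t  lock = PTHREAD_MUTEX_INITIALIZER;
static struct item     *head;

static void handle(struct item *it)             /* called unlocked */
{
        printf("handling item %d\n", it->id);
        free(it);
}

static void drain(void)
{
        struct item *it;

        pthread_mutex_lock(&lock);
        while (head != NULL) {
                it = head;                      /* detach under the lock */
                head = it->next;

                pthread_mutex_unlock(&lock);
                handle(it);                     /* never run handlers locked */
                pthread_mutex_lock(&lock);
        }
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        int i;

        for (i = 0; i < 3; i++) {
                struct item *it = malloc(sizeof(*it));

                it->id = i;
                it->next = head;
                head = it;
        }
        drain();
        return 0;
}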
void
-kibnal_close_conn (kib_conn_t *conn, int error)
+kibnal_conn_disconnected(kib_conn_t *conn)
{
- unsigned long flags;
+ static IB_QP_ATTRIBUTES_MODIFY qpam = {.RequestState = QPStateError};
- write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+ LIST_HEAD (zombies);
+ struct list_head *tmp;
+ struct list_head *nxt;
+ kib_tx_t *tx;
+ FSTATUS frc;
- kibnal_close_conn_locked (conn, error);
+ LASSERT (conn->ibc_state >= IBNAL_CONN_INIT_QP);
+
+ kibnal_set_conn_state(conn, IBNAL_CONN_DISCONNECTED);
+
+ /* move QP to error state to make posted work items complete */
+ frc = iibt_qp_modify(conn->ibc_qp, &qpam, NULL);
+ if (frc != FSUCCESS)
+ CERROR("can't move qp state to error: %d\n", frc);
+
+ spin_lock(&conn->ibc_lock);
+
+ /* Complete all tx descs not waiting for sends to complete.
+ * NB we should be safe from RDMA now that the QP has changed state */
+
+ list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
+ tx = list_entry (tmp, kib_tx_t, tx_list);
+
+ LASSERT (tx->tx_queued);
+
+ tx->tx_status = -ECONNABORTED;
+ tx->tx_queued = 0;
+ tx->tx_waiting = 0;
+
+ if (tx->tx_sending != 0)
+ continue;
+
+ list_del (&tx->tx_list);
+ list_add (&tx->tx_list, &zombies);
+ }
+
+ list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
+ tx = list_entry (tmp, kib_tx_t, tx_list);
+
+ LASSERT (!tx->tx_queued);
+ LASSERT (tx->tx_waiting ||
+ tx->tx_sending != 0);
+
+ tx->tx_status = -ECONNABORTED;
+ tx->tx_waiting = 0;
+
+ if (tx->tx_sending != 0)
+ continue;
+
+ list_del (&tx->tx_list);
+ list_add (&tx->tx_list, &zombies);
+ }
- write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+ spin_unlock(&conn->ibc_lock);
+
+ while (!list_empty(&zombies)) {
+ tx = list_entry (zombies.next, kib_tx_t, tx_list);
+
+ list_del(&tx->tx_list);
+ kibnal_tx_done (tx);
+ }
+
+ kibnal_handle_early_rxs(conn);
}
-static void
+void
kibnal_peer_connect_failed (kib_peer_t *peer, int active, int rc)
{
LIST_HEAD (zombies);
LASSERT (rc != 0);
- write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
LASSERT (peer->ibp_connecting != 0);
peer->ibp_connecting--;
if (peer->ibp_connecting != 0) {
- /* another connection attempt under way (loopback?)... */
+ /* another connection attempt under way (e.g. STALE on first
+ * attempt)... */
write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
return;
}
MIN(peer->ibp_reconnect_interval,
*kibnal_tunables.kib_max_reconnect_interval);
- peer->ibp_reconnect_time = jiffies +
+ peer->ibp_reconnect_time = jiffies +
peer->ibp_reconnect_interval * HZ;
-
- /* Take peer's blocked blocked transmits; I'll complete
- * them with error */
- while (!list_empty (&peer->ibp_tx_queue)) {
- tx = list_entry (peer->ibp_tx_queue.next,
- kib_tx_t, tx_list);
-
- list_del (&tx->tx_list);
- list_add_tail (&tx->tx_list, &zombies);
- }
+
+ /* Take peer's blocked transmits to complete with error */
+ list_add(&zombies, &peer->ibp_tx_queue);
+ list_del_init(&peer->ibp_tx_queue);
if (kibnal_peer_active(peer) &&
(peer->ibp_persistence == 0)) {
LASSERT (list_empty(&peer->ibp_tx_queue));
}
- write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
-
- if (!list_empty (&zombies))
- CERROR ("Deleting messages for %s: connection failed\n",
- libcfs_nid2str(peer->ibp_nid));
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
- while (!list_empty (&zombies)) {
+ if (list_empty (&zombies))
+ return;
+
+ CERROR ("Deleting messages for %s: connection failed\n",
+ libcfs_nid2str(peer->ibp_nid));
+ do {
tx = list_entry (zombies.next, kib_tx_t, tx_list);
list_del (&tx->tx_list);
/* complete now */
tx->tx_status = -EHOSTUNREACH;
kibnal_tx_done (tx);
- }
+ } while (!list_empty (&zombies));
}
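/*
 * Illustrative sketch (editor's addition, not part of this patch): the new
 *     list_add(&zombies, &peer->ibp_tx_queue);
 *     list_del_init(&peer->ibp_tx_queue);
 * pair above (and again in kibnal_connreq_done() below) steals the whole
 * blocked-tx queue in O(1): the local head is threaded into the peer's
 * circular list, then the old head is unlinked, leaving every tx on
 * 'zombies'.  Self-contained circular-list version:
 */
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(n) { &(n), &(n) }

static void list_add(struct list_head *new, struct list_head *head)
{
        new->next = head->next;
        new->prev = head;
        head->next->prev = new;
        head->next = new;
}

static void list_del_init(struct list_head *e)
{
        e->prev->next = e->next;
        e->next->prev = e->prev;
        e->next = e->prev = e;
}

struct tx { struct list_head link; int id; };   /* link must come first */

int main(void)
{
        struct list_head  queue = LIST_HEAD_INIT(queue);
        struct list_head  zombies;
        struct list_head *p;
        struct tx         txs[3];
        int               i;

        for (i = 0; i < 3; i++) {
                txs[i].id = i;
                list_add(&txs[i].link, &queue);
        }

        list_add(&zombies, &queue);             /* splice local head in... */
        list_del_init(&queue);                  /* ...and detach the old one */

        for (p = zombies.next; p != &zombies; p = p->next)
                printf("zombie tx %d\n", ((struct tx *)p)->id);
        return 0;
}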
-static void
+void
kibnal_connreq_done (kib_conn_t *conn, int active, int status)
{
- int state = conn->ibc_state;
kib_peer_t *peer = conn->ibc_peer;
+ struct list_head txs;
kib_tx_t *tx;
unsigned long flags;
int i;
- /* passive connection has no connreq & vice versa */
- LASSERTF(!active == !(conn->ibc_connreq != NULL),
- "%d %p\n", active, conn->ibc_connreq);
- if (active) {
- LIBCFS_FREE (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
- conn->ibc_connreq = NULL;
+ LASSERT (conn->ibc_state >= IBNAL_CONN_INIT_QP);
+ LASSERT (conn->ibc_state < IBNAL_CONN_ESTABLISHED);
+ LASSERT (peer->ibp_connecting > 0);
+
+ LIBCFS_FREE(conn->ibc_cvars, sizeof(*conn->ibc_cvars));
+ conn->ibc_cvars = NULL;
+
+ if (status != 0) {
+ /* failed to establish connection */
+ kibnal_peer_connect_failed(conn->ibc_peer, active, status);
+ kibnal_conn_disconnected(conn);
+ kibnal_conn_decref(conn); /* Lose CM's ref */
+ return;
}
- write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
+ /* connection established */
+ LASSERT(conn->ibc_state == IBNAL_CONN_CONNECTING);
+ kibnal_set_conn_state(conn, IBNAL_CONN_ESTABLISHED);
- LASSERT (peer->ibp_connecting != 0);
-
- if (status == 0) {
- /* connection established... */
- KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_CONNECTING);
- conn->ibc_state = IBNAL_CONN_ESTABLISHED;
-
- if (!kibnal_peer_active(peer)) {
- /* ...but peer deleted meantime */
- status = -ECONNABORTED;
- }
- } else {
- KIB_ASSERT_CONN_STATE_RANGE(conn, IBNAL_CONN_INIT_QP,
- IBNAL_CONN_CONNECTING);
+ CDEBUG(D_WARNING, "Connection %p -> %s ESTABLISHED\n",
+ conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+
+ write_lock_irqsave(&kibnal_data.kib_global_lock, flags);
+
+ if (!kibnal_peer_active(peer)) {
+ /* peer has been deleted */
+ kibnal_close_conn_locked(conn, -ECONNABORTED);
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+
+ kibnal_peer_connect_failed(conn->ibc_peer, active,
+ -ECONNABORTED);
+ kibnal_conn_decref(conn); /* lose CM's ref */
+ return;
}
+
+ peer->ibp_connecting--;
+ peer->ibp_reconnect_interval = 0; /* OK to reconnect at any time */
- if (status == 0) {
- /* Everything worked! */
+ /* Add conn to peer's list and nuke any dangling conns from a different
+ * peer instance... */
+ kibnal_conn_addref(conn); /* +1 ref for ibc_list */
+ kibnal_close_stale_conns_locked(peer, conn->ibc_incarnation);
- peer->ibp_connecting--;
+ /* grab txs blocking for a conn */
+ list_add(&txs, &peer->ibp_tx_queue);
+ list_del_init(&peer->ibp_tx_queue);
- /* +1 ref for ibc_list; caller(== CM)'s ref remains until
- * the IB_CM_IDLE callback */
- CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
- conn, conn->ibc_state,
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- atomic_read (&conn->ibc_refcount));
- atomic_inc (&conn->ibc_refcount);
- list_add (&conn->ibc_list, &peer->ibp_conns);
-
- peer->ibp_reconnect_interval = 0; /* OK to reconnect at any time */
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
+
+ /* Schedule blocked txs */
+ spin_lock (&conn->ibc_lock);
+ while (!list_empty (&txs)) {
+ tx = list_entry (txs.next, kib_tx_t, tx_list);
+ list_del (&tx->tx_list);
- /* post blocked sends to the new connection */
- spin_lock (&conn->ibc_lock);
-
- while (!list_empty (&peer->ibp_tx_queue)) {
- tx = list_entry (peer->ibp_tx_queue.next,
- kib_tx_t, tx_list);
-
- list_del (&tx->tx_list);
-
- /* +1 ref for each tx */
- CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
- conn, conn->ibc_state,
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- atomic_read (&conn->ibc_refcount));
- atomic_inc (&conn->ibc_refcount);
- kibnal_queue_tx_locked (tx, conn);
+ kibnal_queue_tx_locked (tx, conn);
+ }
+ spin_unlock (&conn->ibc_lock);
+ kibnal_check_sends (conn);
+}
+
+void
+kibnal_reject (lnet_nid_t nid, IB_HANDLE cep, int reason)
+{
+ static CM_REJECT_INFO msgs[] = {{.Reason = RC_USER_REJ},
+ {.Reason = RC_NO_RESOURCES}};
+ const int nmsg = sizeof(msgs)/sizeof(msgs[0]);
+ CM_REJECT_INFO *msg;
+ FSTATUS frc;
+
+ for (msg = &msgs[0]; msg < &msgs[nmsg]; msg++)
+ if (msg->Reason == reason)
+ break;
+
+ LASSERT (msg < &msgs[nmsg]);
+
+ frc = iibt_cm_reject(cep, msg);
+ if (frc != FSUCCESS)
+ CERROR("Error %d rejecting %s\n", frc, libcfs_nid2str(nid));
+}
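/*
 * Illustrative sketch (editor's addition, not part of this patch):
 * kibnal_reject() above maps an error class onto a statically initialised
 * table of reject records, so the entry handed to the CM stays valid after
 * the function returns.  Same lookup shape in plain C with invented codes:
 */
#include <assert.h>
#include <stdio.h>

enum { REJ_USER = 1, REJ_NO_RESOURCES = 2 };

struct reject_info { int reason; const char *text; };

static struct reject_info msgs[] = {
        { .reason = REJ_USER,         .text = "rejected by user" },
        { .reason = REJ_NO_RESOURCES, .text = "out of resources" },
};

static const struct reject_info *lookup_reject(int reason)
{
        const int nmsg = sizeof(msgs) / sizeof(msgs[0]);
        const struct reject_info *m;

        for (m = &msgs[0]; m < &msgs[nmsg]; m++)
                if (m->reason == reason)
                        return m;

        assert(0);                              /* bad reason from caller */
        return NULL;
}

int main(void)
{
        printf("%s\n", lookup_reject(REJ_NO_RESOURCES)->text);
        return 0;
}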
+
+void
+kibnal_check_connreject(kib_conn_t *conn, int active, CM_REJECT_INFO *rej)
+{
+ kib_peer_t *peer = conn->ibc_peer;
+ unsigned long flags;
+ FSTATUS frc;
+
+ if (rej->Reason != RC_STALE_CONN) {
+ CERROR("%s connection to %s rejected: %d\n",
+ active ? "Active" : "Passive",
+ libcfs_nid2str(peer->ibp_nid), rej->Reason);
+ } else {
+ if (!active) {
+ CERROR("Connection to %s rejected (stale QP)\n",
+ libcfs_nid2str(peer->ibp_nid));
+ } else {
+ CWARN("Connection to %s rejected (stale QP): "
+ "retrying...\n", libcfs_nid2str(peer->ibp_nid));
+
+ /* retry from scratch to allocate a new conn
+ * which will use a different QP */
+ write_lock_irqsave(&kibnal_data.kib_global_lock,
+ flags);
+ kibnal_schedule_active_connect_locked(peer);
+ write_unlock_irqrestore(&kibnal_data.kib_global_lock,
+ flags);
}
-
- spin_unlock (&conn->ibc_lock);
- /* Nuke any dangling conns from a different peer instance... */
- kibnal_close_stale_conns_locked (conn->ibc_peer,
- conn->ibc_incarnation);
+ /* An FCM_DISCONNECTED callback is still outstanding: give it a
+ * ref since kibnal_connreq_done() drops the CM's ref on conn
+ * on failure */
+ kibnal_conn_addref(conn);
+ }
+
+ kibnal_connreq_done(conn, active, -ECONNRESET);
+}
- write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+void
+kibnal_cm_disconnect_callback(kib_conn_t *conn, CM_CONN_INFO *info)
+{
+ CDEBUG(D_NET, "status 0x%x\n", info->Status);
- /* queue up all the receives */
- for (i = 0; i < IBNAL_RX_MSGS; i++) {
- /* +1 ref for rx desc */
- CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
- conn, conn->ibc_state,
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- atomic_read (&conn->ibc_refcount));
- atomic_inc (&conn->ibc_refcount);
+ LASSERT (conn->ibc_state >= IBNAL_CONN_ESTABLISHED);
- CDEBUG(D_NET, "RX[%d] %p->%p - "LPX64"\n",
- i, &conn->ibc_rxs[i], conn->ibc_rxs[i].rx_msg,
- conn->ibc_rxs[i].rx_vaddr);
+ switch (info->Status) {
+ default:
+ LBUG();
+ break;
- kibnal_post_rx (&conn->ibc_rxs[i], 0);
- }
+ case FCM_DISCONNECT_REPLY:
+ /* You can't get this if you set TIMEWAIT */
+ CERROR("Unexpected FCM_DISCONNECT_REPLY for %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ LBUG();
+ break;
+
+ case FCM_DISCONNECT_REQUEST:
+ /* Schedule conn so the connd runs iibt_cm_disconnect(), if it wasn't already */
+ kibnal_close_conn (conn, 0);
+ break;
- kibnal_check_sends (conn);
- return;
+ case FCM_DISCONNECTED:
+ CDEBUG(D_NET, "Connection %p -> %s disconnected.\n",
+ conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ kibnal_conn_decref(conn); /* Lose CM's ref */
+ break;
}
+}
- /* connection failed */
- if (state == IBNAL_CONN_CONNECTING) {
- /* schedule for connd to close */
- kibnal_close_conn_locked (conn, status);
- } else {
- /* Don't have a CM comm_id; just wait for refs to drain */
- conn->ibc_state = IBNAL_CONN_DISCONNECTED;
- }
+void
+kibnal_cm_passive_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
+ kib_conn_t *conn = arg;
- write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
+ CDEBUG(D_NET, "status 0x%x\n", info->Status);
+
+ /* Established Connection Notifier */
+ switch (info->Status) {
+ default:
+ CERROR("Unexpected status %d on Connection %p -> %s\n",
+ info->Status, conn,
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ LBUG();
+ break;
+
+ case FCM_CONNECT_TIMEOUT:
+ kibnal_connreq_done(conn, 0, -ETIMEDOUT);
+ break;
+
+ case FCM_CONNECT_REJECT:
+ kibnal_check_connreject(conn, 0, &info->Info.Reject);
+ break;
- kibnal_peer_connect_failed (conn->ibc_peer, active, status);
+ case FCM_CONNECT_ESTABLISHED:
+ kibnal_connreq_done(conn, 0, 0);
+ break;
- /* If we didn't establish the connection we don't have to pass
- * through the disconnect protocol before dropping the CM ref */
- if (state < IBNAL_CONN_CONNECTING)
- kibnal_put_conn (conn);
+ case FCM_DISCONNECT_REQUEST:
+ case FCM_DISCONNECT_REPLY:
+ case FCM_DISCONNECTED:
+ kibnal_cm_disconnect_callback(conn, info);
+ break;
+ }
}
-static int
-kibnal_accept (kib_conn_t **connp, IB_HANDLE *cep,
- lnet_nid_t nid, __u64 incarnation, int queue_depth)
+int
+kibnal_accept (kib_conn_t **connp, kib_msg_t *msg, int nob)
{
- kib_conn_t *conn = kibnal_create_conn();
+ lnet_nid_t nid;
+ kib_conn_t *conn;
kib_peer_t *peer;
kib_peer_t *peer2;
unsigned long flags;
+ int rc;
- if (conn == NULL)
- return (-ENOMEM);
+ rc = kibnal_unpack_msg(msg, nob);
+ if (rc != 0) {
+ CERROR("Error %d unpacking connreq\n", rc);
+ return -EPROTO;
+ }
- if (queue_depth != IBNAL_MSG_QUEUE_SIZE) {
- CERROR("Can't accept %s: bad queue depth %d (%d expected)\n",
+ nid = msg->ibm_srcnid;
+
+ if (msg->ibm_type != IBNAL_MSG_CONNREQ) {
+ CERROR("Can't accept %s: bad request type %d (%d expected)\n",
+ libcfs_nid2str(nid), msg->ibm_type, IBNAL_MSG_CONNREQ);
+ return -EPROTO;
+ }
+
+ if (msg->ibm_dstnid != kibnal_data.kib_ni->ni_nid) {
+ CERROR("Can't accept %s: bad dst NID %s (%s expected)\n",
libcfs_nid2str(nid),
- queue_depth, IBNAL_MSG_QUEUE_SIZE);
- atomic_dec (&conn->ibc_refcount);
- kibnal_destroy_conn(conn);
- return (-EPROTO);
+ libcfs_nid2str(msg->ibm_dstnid),
+ libcfs_nid2str(kibnal_data.kib_ni->ni_nid));
+ return -EPROTO;
}
+ if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE ||
+ msg->ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE ||
+ msg->ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
+ CERROR("Reject %s: q %d sz %d frag %d, (%d %d %d expected)\n",
+ libcfs_nid2str(nid),
+ msg->ibm_u.connparams.ibcp_queue_depth,
+ msg->ibm_u.connparams.ibcp_max_msg_size,
+ msg->ibm_u.connparams.ibcp_max_frags,
+ IBNAL_MSG_QUEUE_SIZE,
+ IBNAL_MSG_SIZE,
+ IBNAL_MAX_RDMA_FRAGS);
+ return -EPROTO;
+ }
+
+ conn = kibnal_create_conn(nid);
+ if (conn == NULL)
+ return -ENOMEM;
+
/* assume 'nid' is a new peer */
- peer = kibnal_create_peer (nid);
- if (peer == NULL) {
- CDEBUG(D_NET, "--conn[%p] state %d -> %s (%d)\n",
- conn, conn->ibc_state,
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- atomic_read (&conn->ibc_refcount));
- atomic_dec (&conn->ibc_refcount);
- kibnal_destroy_conn(conn);
- return (-ENOMEM);
+ rc = kibnal_create_peer(&peer, nid);
+ if (rc != 0) {
+ kibnal_conn_decref(conn);
+ return rc;
}
write_lock_irqsave (&kibnal_data.kib_global_lock, flags);
/* peer table takes my ref on peer */
list_add_tail (&peer->ibp_list, kibnal_nid2peerlist(nid));
} else {
- kib_peer_decref (peer);
+ kibnal_peer_decref(peer);
peer = peer2;
}
- kib_peer_addref(peer); /* +1 ref for conn */
+ kibnal_peer_addref(peer); /* +1 ref for conn */
peer->ibp_connecting++;
- write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
-
+ kibnal_set_conn_state(conn, IBNAL_CONN_CONNECTING);
conn->ibc_peer = peer;
- conn->ibc_state = IBNAL_CONN_CONNECTING;
- /* conn->ibc_cep is set when cm_accept is called */
- conn->ibc_incarnation = incarnation;
+ conn->ibc_incarnation = msg->ibm_srcstamp;
conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
- *connp = conn;
- return (0);
-}
-
-static void kibnal_set_qp_state(IB_HANDLE *qp, IB_QP_STATE state)
-{
- IB_QP_ATTRIBUTES_MODIFY modify_attr = {0,};
- FSTATUS frc;
-
- modify_attr.RequestState = state;
-
- frc = iibt_qp_modify(qp, &modify_attr, NULL);
- if (frc != FSUCCESS)
- CERROR("couldn't set qp state to %d, error %d\n", state, frc);
-}
-
-static void kibnal_flush_pending(kib_conn_t *conn)
-{
- LIST_HEAD (zombies);
- struct list_head *tmp;
- struct list_head *nxt;
- kib_tx_t *tx;
- unsigned long flags;
- int done;
-
- /* NB we wait until the connection has closed before completing
- * outstanding passive RDMAs so we can be sure the network can't
- * touch the mapped memory any more. */
- KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_DISCONNECTED);
-
- /* set the QP to the error state so that we get flush callbacks
- * on our posted receives which can then drop their conn refs */
- kibnal_set_qp_state(conn->ibc_qp, QPStateError);
-
- spin_lock_irqsave (&conn->ibc_lock, flags);
-
- /* grab passive RDMAs not waiting for the tx callback */
- list_for_each_safe (tmp, nxt, &conn->ibc_active_txs) {
- tx = list_entry (tmp, kib_tx_t, tx_list);
-
- LASSERT (tx->tx_passive_rdma ||
- !tx->tx_passive_rdma_wait);
-
- LASSERT (tx->tx_passive_rdma_wait ||
- tx->tx_sending != 0);
-
- /* still waiting for tx callback? */
- if (!tx->tx_passive_rdma_wait)
- continue;
+ write_unlock_irqrestore (&kibnal_data.kib_global_lock, flags);
- tx->tx_status = -ECONNABORTED;
- tx->tx_passive_rdma_wait = 0;
- done = (tx->tx_sending == 0);
+ *connp = conn;
+ return 0;
+}
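/*
 * Illustrative sketch (editor's addition, not part of this patch): the
 * parameter check above (and its twin in kibnal_check_connreply() below)
 * requires the queue depth to match exactly -- it seeds the credit count --
 * while message size and fragment count only have to fit our limits.
 * Stand-alone version with invented constants:
 */
#include <stdio.h>

#define MY_QUEUE_DEPTH 8
#define MY_MSG_SIZE    4096
#define MY_MAX_FRAGS   32

struct connparams { int queue_depth, max_msg_size, max_frags; };

static int check_connparams(const struct connparams *cp)
{
        if (cp->queue_depth  != MY_QUEUE_DEPTH ||
            cp->max_msg_size >  MY_MSG_SIZE    ||
            cp->max_frags    >  MY_MAX_FRAGS) {
                fprintf(stderr,
                        "reject: q %d sz %d frag %d (%d %d %d expected)\n",
                        cp->queue_depth, cp->max_msg_size, cp->max_frags,
                        MY_QUEUE_DEPTH, MY_MSG_SIZE, MY_MAX_FRAGS);
                return -1;                      /* reject the connection */
        }
        return 0;
}

int main(void)
{
        struct connparams ok  = { 8, 4096, 16 };
        struct connparams bad = { 4, 4096, 16 };

        printf("ok:  %d\n", check_connparams(&ok));
        printf("bad: %d\n", check_connparams(&bad));
        return 0;
}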
- if (!done)
- continue;
+void
+kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+{
- list_del (&tx->tx_list);
- list_add (&tx->tx_list, &zombies);
- }
+ CM_REQUEST_INFO *req = &info->Info.Request;
+ CM_REPLY_INFO *rep;
+ kib_conn_t *conn;
+ FSTATUS frc;
+ int rc;
+
+ LASSERT(arg == NULL); /* no conn yet for passive */
- /* grab all blocked transmits */
- list_for_each_safe (tmp, nxt, &conn->ibc_tx_queue) {
- tx = list_entry (tmp, kib_tx_t, tx_list);
-
- list_del (&tx->tx_list);
- list_add (&tx->tx_list, &zombies);
+ CDEBUG(D_NET, "%x\n", info->Status);
+
+ if (info->Status == FCM_CONNECT_CANCEL) {
+ up(&kibnal_data.kib_listener_signal);
+ return;
}
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
+ LASSERT (info->Status == FCM_CONNECT_REQUEST);
- while (!list_empty(&zombies)) {
- tx = list_entry (zombies.next, kib_tx_t, tx_list);
+ rc = kibnal_accept(&conn, (kib_msg_t *)req->PrivateData,
+ CM_REQUEST_INFO_USER_LEN);
+ if (rc != 0) {
+ kibnal_reject(LNET_NID_ANY, cep,
+ (rc == -EPROTO) ? RC_USER_REJ : RC_NO_RESOURCES);
+ return;
+ }
- list_del(&tx->tx_list);
- kibnal_tx_done (tx);
+ conn->ibc_cvars->cv_path = req->PathInfo.Path;
+
+ rc = kibnal_conn_rts(conn,
+ req->CEPInfo.QPN,
+ req->CEPInfo.OfferedInitiatorDepth,
+ req->CEPInfo.OfferedResponderResources,
+ req->CEPInfo.StartingPSN);
+ if (rc != 0) {
+ kibnal_reject(conn->ibc_peer->ibp_nid, cep, RC_NO_RESOURCES);
+ kibnal_connreq_done(conn, 0, -ECONNABORTED);
+ return;
}
-}
-static void
-kibnal_reject (IB_HANDLE cep, uint16_t reason)
-{
- CM_REJECT_INFO *rej;
+ memset(&conn->ibc_cvars->cv_cmci, 0, sizeof(conn->ibc_cvars->cv_cmci));
+ rep = &conn->ibc_cvars->cv_cmci.Info.Reply;
- LIBCFS_ALLOC(rej, sizeof(*rej));
- if (rej == NULL) /* LIBCFS_ALLOC() will CERROR on failure */
- return;
+ rep->QPN = conn->ibc_cvars->cv_qpattrs.QPNumber;
+ rep->QKey = conn->ibc_cvars->cv_qpattrs.Qkey;
+ rep->StartingPSN = conn->ibc_cvars->cv_qpattrs.RecvPSN;
+ rep->EndToEndFlowControl = conn->ibc_cvars->cv_qpattrs.FlowControl;
+ rep->ArbInitiatorDepth = conn->ibc_cvars->cv_qpattrs.InitiatorDepth;
+ rep->ArbResponderResources = conn->ibc_cvars->cv_qpattrs.ResponderResources;
+ rep->TargetAckDelay = kibnal_data.kib_hca_attrs.LocalCaAckDelay;
+ rep->FailoverAccepted = IBNAL_FAILOVER_ACCEPTED;
+ rep->RnRRetryCount = req->CEPInfo.RnrRetryCount;
+
+ CLASSERT (CM_REPLY_INFO_USER_LEN >=
+ offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t));
- rej->Reason = reason;
- iibt_cm_reject(cep, rej);
- LIBCFS_FREE(rej, sizeof(*rej));
-}
+ kibnal_pack_connmsg((kib_msg_t *)rep->PrivateData,
+ CM_REPLY_INFO_USER_LEN,
+ IBNAL_MSG_CONNACK,
+ conn->ibc_peer->ibp_nid, conn->ibc_incarnation);
-static FSTATUS
-kibnal_qp_rts(IB_HANDLE qp_handle, __u32 qpn, __u8 resp_res,
- IB_PATH_RECORD *path, __u8 init_depth, __u32 send_psn)
-{
- IB_QP_ATTRIBUTES_MODIFY modify_attr;
- FSTATUS frc;
- ENTRY;
-
- modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
- .RequestState = QPStateReadyToRecv,
- .RecvPSN = IBNAL_STARTING_PSN,
- .DestQPNumber = qpn,
- .ResponderResources = resp_res,
- .MinRnrTimer = UsecToRnrNakTimer(2000), /* 20 ms */
- .Attrs = (IB_QP_ATTR_RECVPSN |
- IB_QP_ATTR_DESTQPNUMBER |
- IB_QP_ATTR_RESPONDERRESOURCES |
- IB_QP_ATTR_DESTAV |
- IB_QP_ATTR_PATHMTU |
- IB_QP_ATTR_MINRNRTIMER),
- };
- GetAVFromPath(0, path, &modify_attr.PathMTU, NULL,
- &modify_attr.DestAV);
-
- frc = iibt_qp_modify(qp_handle, &modify_attr, NULL);
- if (frc != FSUCCESS)
- RETURN(frc);
-
- modify_attr = (IB_QP_ATTRIBUTES_MODIFY) {
- .RequestState = QPStateReadyToSend,
- .FlowControl = TRUE,
- .InitiatorDepth = init_depth,
- .SendPSN = send_psn,
- .LocalAckTimeout = path->PktLifeTime + 2, /* 2 or 1? */
- .RetryCount = IBNAL_RETRY,
- .RnrRetryCount = IBNAL_RNR_RETRY,
- .Attrs = (IB_QP_ATTR_FLOWCONTROL |
- IB_QP_ATTR_INITIATORDEPTH |
- IB_QP_ATTR_SENDPSN |
- IB_QP_ATTR_LOCALACKTIMEOUT |
- IB_QP_ATTR_RETRYCOUNT |
- IB_QP_ATTR_RNRRETRYCOUNT),
- };
+ LASSERT (conn->ibc_cep == NULL);
+ kibnal_set_conn_state(conn, IBNAL_CONN_CONNECTING);
+
+ frc = iibt_cm_accept(cep,
+ &conn->ibc_cvars->cv_cmci,
+ NULL,
+ kibnal_cm_passive_callback, conn,
+ &conn->ibc_cep);
- frc = iibt_qp_modify(qp_handle, &modify_attr, NULL);
- RETURN(frc);
+ if (frc == FSUCCESS || frc == FPENDING)
+ return;
+
+ CERROR("iibt_cm_accept(%s) failed: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+ kibnal_connreq_done(conn, 0, -ECONNABORTED);
}
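/*
 * Illustrative sketch (editor's addition, not part of this patch): the
 * CLASSERT()s above and in kibnal_pathreq_callback() below make the build
 * fail if a connection message could ever outgrow the CM private-data
 * area.  A stand-alone compile-time assertion of the same shape, with
 * sizes invented for the example:
 */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define PRIVATE_DATA_LEN 92                     /* pretend CM limit */

/* array gets size -1, i.e. a compile error, when 'cond' is false */
#define STATIC_ASSERT(cond) \
        typedef char static_assert_failed[(cond) ? 1 : -1]

typedef struct {
        uint32_t magic;
        uint32_t version;
        union { char connparams[24]; } u;
} msg_t;

STATIC_ASSERT(PRIVATE_DATA_LEN >=
              offsetof(msg_t, u) + sizeof(((msg_t *)0)->u.connparams));

int main(void)
{
        printf("connreq needs %zu of %zu private bytes\n",
               offsetof(msg_t, u) + sizeof(((msg_t *)0)->u.connparams),
               (size_t)PRIVATE_DATA_LEN);
        return 0;
}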
-static void
-kibnal_connect_reply (IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+void
+kibnal_check_connreply(kib_conn_t *conn, CM_REPLY_INFO *rep)
{
- IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
- kib_conn_t *conn = arg;
- kib_wire_connreq_t *wcr;
- CM_REPLY_INFO *rep = &info->Info.Reply;
- uint16_t reason;
- FSTATUS frc;
+ kib_msg_t *msg = (kib_msg_t *)rep->PrivateData;
+ lnet_nid_t nid = conn->ibc_peer->ibp_nid;
+ FSTATUS frc;
+ int rc;
- wcr = (kib_wire_connreq_t *)info->Info.Reply.PrivateData;
+ rc = kibnal_unpack_msg(msg, CM_REPLY_INFO_USER_LEN);
+ if (rc != 0) {
+ CERROR ("Error %d unpacking connack from %s\n",
+ rc, libcfs_nid2str(nid));
+ kibnal_reject(nid, conn->ibc_cep, RC_USER_REJ);
+ kibnal_connreq_done(conn, 1, -EPROTO);
+ return;
+ }
+
+ if (msg->ibm_type != IBNAL_MSG_CONNACK) {
+ CERROR("Bad connack request type %d (%d expected) from %s\n",
+ msg->ibm_type, IBNAL_MSG_CONNREQ,
+ libcfs_nid2str(msg->ibm_srcnid));
+ kibnal_reject(nid, conn->ibc_cep, RC_USER_REJ);
+ kibnal_connreq_done(conn, 1, -EPROTO);
+ return;
+ }
- if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
- CERROR ("Can't connect %s: bad magic %08x\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- le32_to_cpu(wcr->wcr_magic));
- GOTO(reject, reason = RC_USER_REJ);
+ if (msg->ibm_srcnid != conn->ibc_peer->ibp_nid ||
+ msg->ibm_dstnid != kibnal_data.kib_ni->ni_nid ||
+ msg->ibm_dststamp != kibnal_data.kib_incarnation) {
+ CERROR("Stale connack from %s(%s): %s(%s), "LPX64"("LPX64")\n",
+ libcfs_nid2str(msg->ibm_srcnid),
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ libcfs_nid2str(msg->ibm_dstnid),
+ libcfs_nid2str(kibnal_data.kib_ni->ni_nid),
+ msg->ibm_dststamp, kibnal_data.kib_incarnation);
+ kibnal_reject(nid, conn->ibc_cep, RC_USER_REJ);
+ kibnal_connreq_done(conn, 1, -EPROTO);
+ return;
}
- if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
- CERROR ("Can't connect %s: bad version %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- le16_to_cpu(wcr->wcr_magic));
- GOTO(reject, reason = RC_USER_REJ);
- }
-
- if (wcr->wcr_queue_depth != cpu_to_le16(IBNAL_MSG_QUEUE_SIZE)) {
- CERROR ("Can't connect %s: bad queue depth %d\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- le16_to_cpu(wcr->wcr_queue_depth));
- GOTO(reject, reason = RC_USER_REJ);
+ if (msg->ibm_u.connparams.ibcp_queue_depth != IBNAL_MSG_QUEUE_SIZE ||
+ msg->ibm_u.connparams.ibcp_max_msg_size > IBNAL_MSG_SIZE ||
+ msg->ibm_u.connparams.ibcp_max_frags > IBNAL_MAX_RDMA_FRAGS) {
+ CERROR("Reject %s: q %d sz %d frag %d, (%d %d %d expected)\n",
+ libcfs_nid2str(msg->ibm_srcnid),
+ msg->ibm_u.connparams.ibcp_queue_depth,
+ msg->ibm_u.connparams.ibcp_max_msg_size,
+ msg->ibm_u.connparams.ibcp_max_frags,
+ IBNAL_MSG_QUEUE_SIZE,
+ IBNAL_MSG_SIZE,
+ IBNAL_MAX_RDMA_FRAGS);
+ kibnal_reject(nid, conn->ibc_cep, RC_USER_REJ);
+ kibnal_connreq_done(conn, 1, -EPROTO);
+ return;
}
- if (le64_to_cpu(wcr->wcr_nid) != conn->ibc_peer->ibp_nid) {
- CERROR ("Unexpected NID %s from %s\n",
- libcfs_nid2str(le64_to_cpu(wcr->wcr_nid)),
- libcfs_nid2str(conn->ibc_peer->ibp_nid));
- GOTO(reject, reason = RC_USER_REJ);
- }
-
CDEBUG(D_NET, "Connection %p -> %s REP_RECEIVED.\n",
conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
- conn->ibc_incarnation = le64_to_cpu(wcr->wcr_incarnation);
+ conn->ibc_incarnation = msg->ibm_srcstamp;
conn->ibc_credits = IBNAL_MSG_QUEUE_SIZE;
- frc = kibnal_qp_rts(conn->ibc_qp, rep->QPN,
- min_t(__u8, rep->ArbInitiatorDepth,
- ca_attr->MaxQPResponderResources),
- &conn->ibc_connreq->cr_path,
- min_t(__u8, rep->ArbResponderResources,
- ca_attr->MaxQPInitiatorDepth),
- rep->StartingPSN);
- if (frc != FSUCCESS) {
- CERROR("Connection %p -> %s QP RTS/RTR failed: %d\n",
- conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
- GOTO(reject, reason = RC_NO_QP);
- }
-
- /* the callback arguments are ignored for an active accept */
- conn->ibc_connreq->cr_discarded.Status = FSUCCESS;
- frc = iibt_cm_accept(cep, &conn->ibc_connreq->cr_discarded,
- NULL, NULL, NULL, NULL);
- if (frc != FCM_CONNECT_ESTABLISHED) {
- CERROR("Connection %p -> %s CMAccept failed: %d\n",
- conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
- kibnal_connreq_done (conn, 1, -ECONNABORTED);
- /* XXX don't call reject after accept fails? */
+ rc = kibnal_conn_rts(conn,
+ rep->QPN,
+ rep->ArbInitiatorDepth,
+ rep->ArbResponderResources,
+ rep->StartingPSN);
+ if (rc != 0) {
+ kibnal_reject(nid, conn->ibc_cep, RC_NO_RESOURCES);
+ kibnal_connreq_done(conn, 1, -EIO);
return;
}
- CDEBUG(D_NET, "Connection %p -> %s Established\n",
- conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ memset(&conn->ibc_cvars->cv_cmci, 0, sizeof(conn->ibc_cvars->cv_cmci));
+
+ frc = iibt_cm_accept(conn->ibc_cep,
+ &conn->ibc_cvars->cv_cmci,
+ NULL, NULL, NULL, NULL);
+
+ if (frc == FCM_CONNECT_ESTABLISHED) {
+ kibnal_connreq_done(conn, 1, 0);
+ return;
+ }
- kibnal_connreq_done (conn, 1, 0);
- return;
-reject:
- kibnal_reject(cep, reason);
- kibnal_connreq_done (conn, 1, -EPROTO);
+ CERROR("Connection %p -> %s CMAccept failed: %d\n",
+ conn, libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+ kibnal_connreq_done(conn, 1, -ECONNABORTED);
}
-/* ib_cm.h has a wealth of information on the CM procedures */
-static void
-kibnal_cm_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
+void
+kibnal_cm_active_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
{
kib_conn_t *conn = arg;
CDEBUG(D_NET, "status 0x%x\n", info->Status);
- /* Established Connection Notifier */
switch (info->Status) {
default:
- CERROR("unknown status %d on Connection %p -> %s\n",
+ CERROR("unknown status %d on Connection %p -> %s\n",
info->Status, conn,
libcfs_nid2str(conn->ibc_peer->ibp_nid));
LBUG();
break;
- case FCM_CONNECT_REPLY:
- kibnal_connect_reply(cep, info, arg);
+ case FCM_CONNECT_TIMEOUT:
+ kibnal_connreq_done(conn, 1, -ETIMEDOUT);
+ break;
+
+ case FCM_CONNECT_REJECT:
+ kibnal_check_connreject(conn, 1, &info->Info.Reject);
break;
- case FCM_DISCONNECT_REQUEST:
- /* XXX lock around these state management bits? */
- if (conn->ibc_state == IBNAL_CONN_ESTABLISHED)
- kibnal_close_conn (conn, 0);
- conn->ibc_state = IBNAL_CONN_DREP;
- iibt_cm_disconnect(conn->ibc_cep, NULL, NULL);
+ case FCM_CONNECT_REPLY:
+ kibnal_check_connreply(conn, &info->Info.Reply);
break;
- /* these both guarantee that no more cm callbacks will occur */
- case FCM_DISCONNECTED: /* aka FCM_DISCONNECT_TIMEOUT */
+ case FCM_DISCONNECT_REQUEST:
case FCM_DISCONNECT_REPLY:
- CDEBUG(D_NET, "Connection %p -> %s disconnect done.\n",
- conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
-
- conn->ibc_state = IBNAL_CONN_DISCONNECTED;
- kibnal_flush_pending(conn);
- kibnal_put_conn(conn); /* Lose CM's ref */
+ case FCM_DISCONNECTED:
+ kibnal_cm_disconnect_callback(conn, info);
break;
}
-
- return;
-}
-
-static int
-kibnal_set_cm_flags(IB_HANDLE cep)
-{
- FSTATUS frc;
- uint32 value = 1;
-
- frc = iibt_cm_modify_cep(cep, CM_FLAG_TIMEWAIT_CALLBACK,
- (char *)&value, sizeof(value), 0);
- if (frc != FSUCCESS) {
- CERROR("error setting timeout callback: %d\n", frc);
- return -1;
- }
-
-#if 0
- frc = iibt_cm_modify_cep(cep, CM_FLAG_ASYNC_ACCEPT, (char *)&value,
- sizeof(value), 0);
- if (frc != FSUCCESS) {
- CERROR("error setting async accept: %d\n", frc);
- return -1;
- }
-#endif
-
- return 0;
}
void
-kibnal_listen_callback(IB_HANDLE cep, CM_CONN_INFO *info, void *arg)
-{
- IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
- IB_QP_ATTRIBUTES_QUERY *query;
- CM_REQUEST_INFO *req;
- CM_CONN_INFO *rep = NULL, *rcv = NULL;
- kib_wire_connreq_t *wcr;
- kib_conn_t *conn = NULL;
- uint16_t reason = 0;
- FSTATUS frc;
- int rc = 0;
-
- LASSERT(cep);
- LASSERT(info);
- LASSERT(arg == NULL); /* no conn yet for passive */
-
- CDEBUG(D_NET, "status 0x%x\n", info->Status);
-
- req = &info->Info.Request;
- wcr = (kib_wire_connreq_t *)req->PrivateData;
-
- CDEBUG(D_NET, "%d from %s\n", info->Status,
- libcfs_nid2str(le64_to_cpu(wcr->wcr_nid)));
-
- if (info->Status == FCM_CONNECT_CANCEL)
- return;
-
- LASSERT (info->Status == FCM_CONNECT_REQUEST);
-
- if (wcr->wcr_magic != cpu_to_le32(IBNAL_MSG_MAGIC)) {
- CERROR ("Can't accept: bad magic %08x\n",
- le32_to_cpu(wcr->wcr_magic));
- GOTO(out, reason = RC_USER_REJ);
- }
-
- if (wcr->wcr_version != cpu_to_le16(IBNAL_MSG_VERSION)) {
- CERROR ("Can't accept: bad version %d\n",
- le16_to_cpu(wcr->wcr_magic));
- GOTO(out, reason = RC_USER_REJ);
- }
-
- rc = kibnal_accept(&conn, cep,
- le64_to_cpu(wcr->wcr_nid),
- le64_to_cpu(wcr->wcr_incarnation),
- le16_to_cpu(wcr->wcr_queue_depth));
- if (rc != 0) {
- CERROR ("Can't accept %s: %d\n",
- libcfs_nid2str(le64_to_cpu(wcr->wcr_nid)), rc);
- GOTO(out, reason = RC_NO_RESOURCES);
- }
-
- frc = kibnal_qp_rts(conn->ibc_qp, req->CEPInfo.QPN,
- min_t(__u8, req->CEPInfo.OfferedInitiatorDepth,
- ca_attr->MaxQPResponderResources),
- &req->PathInfo.Path,
- min_t(__u8, req->CEPInfo.OfferedResponderResources,
- ca_attr->MaxQPInitiatorDepth),
- req->CEPInfo.StartingPSN);
-
- if (frc != FSUCCESS) {
- CERROR ("Can't mark QP RTS/RTR %s: %d\n",
- libcfs_nid2str(le64_to_cpu(wcr->wcr_nid)), frc);
- GOTO(out, reason = RC_NO_QP);
- }
-
- frc = iibt_qp_query(conn->ibc_qp, &conn->ibc_qp_attrs, NULL);
- if (frc != FSUCCESS) {
- CERROR ("Couldn't query qp attributes %s: %d\n",
- libcfs_nid2str(le64_to_cpu(wcr->wcr_nid)), frc);
- GOTO(out, reason = RC_NO_QP);
- }
- query = &conn->ibc_qp_attrs;
-
- LIBCFS_ALLOC(rep, sizeof(*rep));
- LIBCFS_ALLOC(rcv, sizeof(*rcv));
- if (rep == NULL || rcv == NULL) {
- if (rep) LIBCFS_FREE(rep, sizeof(*rep));
- if (rcv) LIBCFS_FREE(rcv, sizeof(*rcv));
- CERROR ("can't allocate reply and receive buffers\n");
- GOTO(out, reason = RC_INSUFFICIENT_RESP_RES);
- }
-
- /* don't try to deref this into the incoming wcr :) */
- wcr = (kib_wire_connreq_t *)rep->Info.Reply.PrivateData;
-
- rep->Info.Reply = (CM_REPLY_INFO) {
- .QPN = query->QPNumber,
- .QKey = query->Qkey,
- .StartingPSN = query->RecvPSN,
- .EndToEndFlowControl = query->FlowControl,
- /* XXX Hmm. */
- .ArbInitiatorDepth = query->InitiatorDepth,
- .ArbResponderResources = query->ResponderResources,
- .TargetAckDelay = 0,
- .FailoverAccepted = 0,
- .RnRRetryCount = req->CEPInfo.RnrRetryCount,
- };
-
- *wcr = (kib_wire_connreq_t) {
- .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC),
- .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION),
- .wcr_queue_depth = cpu_to_le32(IBNAL_MSG_QUEUE_SIZE),
- .wcr_nid = cpu_to_le64(kibnal_data.kib_ni->ni_nid),
- .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
- };
-
- frc = iibt_cm_accept(cep, rep, rcv, kibnal_cm_callback, conn,
- &conn->ibc_cep);
-
- LIBCFS_FREE(rep, sizeof(*rep));
- LIBCFS_FREE(rcv, sizeof(*rcv));
-
- if (frc != FCM_CONNECT_ESTABLISHED) {
- /* XXX it seems we don't call reject after this point? */
- CERROR("iibt_cm_accept() failed: %d, aborting\n", frc);
- rc = -ECONNABORTED;
- goto out;
- }
-
- if (kibnal_set_cm_flags(conn->ibc_cep)) {
- rc = -ECONNABORTED;
- goto out;
- }
-
- CDEBUG(D_WARNING, "Connection %p -> %s ESTABLISHED.\n",
- conn, libcfs_nid2str(conn->ibc_peer->ibp_nid));
-
-out:
- if (reason) {
- kibnal_reject(cep, reason);
- rc = -ECONNABORTED;
- }
- if (conn != NULL)
- kibnal_connreq_done(conn, 0, rc);
-
- return;
-}
-
-static void
dump_path_records(PATH_RESULTS *results)
{
IB_PATH_RECORD *path;
int i;
- for(i = 0; i < results->NumPathRecords; i++) {
+ for (i = 0; i < results->NumPathRecords; i++) {
path = &results->PathRecords[i];
CDEBUG(D_NET, "%d: sgid "LPX64":"LPX64" dgid "
LPX64":"LPX64" pkey %x\n",
}
}
-static void
-kibnal_pathreq_callback (void *arg, QUERY *query,
- QUERY_RESULT_VALUES *query_res)
+void
+kibnal_pathreq_callback (void *arg, QUERY *qry,
+ QUERY_RESULT_VALUES *qrslt)
{
- IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
- kib_conn_t *conn = arg;
- PATH_RESULTS *path;
- FSTATUS frc;
- lnet_nid_t nid;
+ IB_CA_ATTRIBUTES *ca_attr = &kibnal_data.kib_hca_attrs;
+ kib_conn_t *conn = arg;
+ CM_REQUEST_INFO *req = &conn->ibc_cvars->cv_cmci.Info.Request;
+ PATH_RESULTS *path = (PATH_RESULTS *)qrslt->QueryResult;
+ FSTATUS frc;
- if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) {
- CERROR ("status %d data size %d\n", query_res->Status,
- query_res->ResultDataSize);
- kibnal_connreq_done (conn, 1, -EINVAL);
+ if (qrslt->Status != FSUCCESS ||
+ qrslt->ResultDataSize < sizeof(*path)) {
+ CERROR ("pathreq %s failed: status %d data size %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ qrslt->Status, qrslt->ResultDataSize);
+ kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
return;
}
- path = (PATH_RESULTS *)query_res->QueryResult;
-
if (path->NumPathRecords < 1) {
- CERROR ("expected path records: %d\n", path->NumPathRecords);
- kibnal_connreq_done (conn, 1, -EINVAL);
+ CERROR ("pathreq %s failed: no path records\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
return;
}
- dump_path_records(path);
+ /* dump_path_records(path); */
+ conn->ibc_cvars->cv_path = path->PathRecords[0];
- /* just using the first. this is probably a horrible idea. */
- conn->ibc_connreq->cr_path = path->PathRecords[0];
+ LASSERT (conn->ibc_cep == NULL);
- conn->ibc_cep = iibt_cm_create_cep(CM_RC_TYPE);
+ conn->ibc_cep = kibnal_create_cep(conn->ibc_peer->ibp_nid);
if (conn->ibc_cep == NULL) {
- CERROR ("Can't create CEP\n");
- kibnal_connreq_done (conn, 1, -EINVAL);
- return;
- }
-
- if (kibnal_set_cm_flags(conn->ibc_cep)) {
- kibnal_connreq_done (conn, 1, -EINVAL);
+ kibnal_connreq_done(conn, 1, -ENOMEM);
return;
}
- conn->ibc_connreq->cr_wcr = (kib_wire_connreq_t) {
- .wcr_magic = cpu_to_le32(IBNAL_MSG_MAGIC),
- .wcr_version = cpu_to_le16(IBNAL_MSG_VERSION),
- .wcr_queue_depth = cpu_to_le16(IBNAL_MSG_QUEUE_SIZE),
- .wcr_nid = cpu_to_le64(kibnal_data.kib_ni->ni_nid),
- .wcr_incarnation = cpu_to_le64(kibnal_data.kib_incarnation),
- };
-
- conn->ibc_connreq->cr_cmreq = (CM_REQUEST_INFO) {
- .SID = conn->ibc_connreq->cr_service.RID.ServiceID,
- .CEPInfo = (CM_CEP_INFO) {
- .CaGUID = kibnal_data.kib_hca_guids[0],
- .EndToEndFlowControl = FALSE,
- .PortGUID = conn->ibc_connreq->cr_path.SGID.Type.Global.InterfaceID,
- .RetryCount = IBNAL_RETRY,
- .RnrRetryCount = IBNAL_RNR_RETRY,
- .AckTimeout = IBNAL_ACK_TIMEOUT,
- .StartingPSN = IBNAL_STARTING_PSN,
- .QPN = conn->ibc_qp_attrs.QPNumber,
- .QKey = conn->ibc_qp_attrs.Qkey,
- .OfferedResponderResources = ca_attr->MaxQPResponderResources,
- .OfferedInitiatorDepth = ca_attr->MaxQPInitiatorDepth,
- },
- .PathInfo = (CM_CEP_PATHINFO) {
- .bSubnetLocal = TRUE,
- .Path = conn->ibc_connreq->cr_path,
- },
- };
+ memset(req, 0, sizeof(*req));
+ req->SID = conn->ibc_cvars->cv_svcrec.RID.ServiceID;
+ req->CEPInfo.CaGUID = kibnal_data.kib_hca_guids[kibnal_data.kib_hca_idx];
+ req->CEPInfo.EndToEndFlowControl = IBNAL_EE_FLOW;
+ req->CEPInfo.PortGUID = conn->ibc_cvars->cv_path.SGID.Type.Global.InterfaceID;
+ req->CEPInfo.RetryCount = IBNAL_RETRY;
+ req->CEPInfo.RnrRetryCount = IBNAL_RNR_RETRY;
+ req->CEPInfo.AckTimeout = IBNAL_ACK_TIMEOUT;
+ req->CEPInfo.StartingPSN = IBNAL_STARTING_PSN;
+ req->CEPInfo.QPN = conn->ibc_cvars->cv_qpattrs.QPNumber;
+ req->CEPInfo.QKey = conn->ibc_cvars->cv_qpattrs.Qkey;
+ req->CEPInfo.OfferedResponderResources = ca_attr->MaxQPResponderResources;
+ req->CEPInfo.OfferedInitiatorDepth = ca_attr->MaxQPInitiatorDepth;
+ req->PathInfo.bSubnetLocal = IBNAL_LOCAL_SUB;
+ req->PathInfo.Path = conn->ibc_cvars->cv_path;
+
+ CLASSERT (CM_REQUEST_INFO_USER_LEN >=
+ offsetof(kib_msg_t, ibm_u) + sizeof(kib_connparams_t));
+
+ kibnal_pack_connmsg((kib_msg_t *)req->PrivateData,
+ CM_REQUEST_INFO_USER_LEN,
+ IBNAL_MSG_CONNREQ,
+ conn->ibc_peer->ibp_nid, 0);
-#if 0
- /* XXX set timeout just like SDP!!!*/
- conn->ibc_connreq->cr_path.packet_life = 13;
-#endif
/* Flag I'm getting involved with the CM... */
- conn->ibc_state = IBNAL_CONN_CONNECTING;
-
- nid = *kibnal_service_nid_field(&conn->ibc_connreq->cr_service);
-
- CDEBUG(D_NET, "Connecting to, service id "LPX64", on %s\n",
- conn->ibc_connreq->cr_service.RID.ServiceID,
- libcfs_nid2str(nid));
+ kibnal_set_conn_state(conn, IBNAL_CONN_CONNECTING);
- memset(conn->ibc_connreq->cr_cmreq.PrivateData, 0,
- CM_REQUEST_INFO_USER_LEN);
- memcpy(conn->ibc_connreq->cr_cmreq.PrivateData,
- &conn->ibc_connreq->cr_wcr, sizeof(conn->ibc_connreq->cr_wcr));
-
- /* kibnal_cm_callback gets my conn ref */
- frc = iibt_cm_connect(conn->ibc_cep, &conn->ibc_connreq->cr_cmreq,
- kibnal_cm_callback, conn);
- if (frc != FPENDING && frc != FSUCCESS) {
- CERROR ("Connect: %d\n", frc);
- /* Back out state change as connect failed */
- conn->ibc_state = IBNAL_CONN_INIT_QP;
- kibnal_connreq_done (conn, 1, -EINVAL);
- }
+ /* cm callback gets my conn ref */
+ frc = iibt_cm_connect(conn->ibc_cep, req,
+ kibnal_cm_active_callback, conn);
+ if (frc == FPENDING || frc == FSUCCESS)
+ return;
+
+ CERROR ("Connect %s failed: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+ kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
}
-static void
-dump_service_records(SERVICE_RECORD_RESULTS *results)
+void
+kibnal_dump_service_records(SERVICE_RECORD_RESULTS *results)
{
IB_SERVICE_RECORD *svc;
int i;
- for(i = 0; i < results->NumServiceRecords; i++) {
+ for (i = 0; i < results->NumServiceRecords; i++) {
svc = &results->ServiceRecords[i];
CDEBUG(D_NET, "%d: sid "LPX64" gid "LPX64":"LPX64" pkey %x\n",
i,
}
}
-
-static void
-kibnal_service_get_callback (void *arg, QUERY *query,
- QUERY_RESULT_VALUES *query_res)
+void
+kibnal_service_get_callback (void *arg, QUERY *qry,
+ QUERY_RESULT_VALUES *qrslt)
{
- kib_conn_t *conn = arg;
- SERVICE_RECORD_RESULTS *svc;
- COMMAND_CONTROL_PARAMETERS sd_params;
- QUERY path_query;
- FSTATUS frc;
- lnet_nid_t nid;
-
- if (query_res->Status != FSUCCESS || query_res->ResultDataSize == 0) {
- CERROR ("status %d data size %d\n", query_res->Status,
- query_res->ResultDataSize);
- kibnal_connreq_done (conn, 1, -EINVAL);
+ kib_conn_t *conn = arg;
+ SERVICE_RECORD_RESULTS *svc;
+ FSTATUS frc;
+ lnet_nid_t nid;
+
+ if (qrslt->Status != FSUCCESS ||
+ qrslt->ResultDataSize < sizeof(*svc)) {
+ CERROR ("Lookup %s failed: status %d data size %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ qrslt->Status, qrslt->ResultDataSize);
+ kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
return;
}
- svc = (SERVICE_RECORD_RESULTS *)query_res->QueryResult;
-
+ svc = (SERVICE_RECORD_RESULTS *)qrslt->QueryResult;
if (svc->NumServiceRecords < 1) {
- CERROR ("%d service records\n", svc->NumServiceRecords);
- kibnal_connreq_done (conn, 1, -EINVAL);
+ CERROR ("lookup %s failed: no service records\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
return;
}
- dump_service_records(svc);
-
- conn->ibc_connreq->cr_service = svc->ServiceRecords[0];
- nid = *kibnal_service_nid_field(&conn->ibc_connreq->cr_service);
-
- CDEBUG(D_NET, "Got status %d, service id "LPX64", on %s\n",
- query_res->Status , conn->ibc_connreq->cr_service.RID.ServiceID,
- libcfs_nid2str(nid));
+ /* kibnal_dump_service_records(svc); */
+ conn->ibc_cvars->cv_svcrec = svc->ServiceRecords[0];
- memset(&path_query, 0, sizeof(path_query));
- path_query.InputType = InputTypePortGuidPair;
- path_query.OutputType = OutputTypePathRecord;
- path_query.InputValue.PortGuidPair.SourcePortGuid = kibnal_data.kib_port_guid;
- path_query.InputValue.PortGuidPair.DestPortGuid = conn->ibc_connreq->cr_service.RID.ServiceGID.Type.Global.InterfaceID;
+ qry = &conn->ibc_cvars->cv_query;
+ memset(qry, 0, sizeof(*qry));
- memset(&sd_params, 0, sizeof(sd_params));
- sd_params.RetryCount = IBNAL_RETRY;
- sd_params.Timeout = 10 * 1000; /* wait 10 seconds */
+ qry->OutputType = OutputTypePathRecord;
+ qry->InputType = InputTypePortGuidPair;
- /* kibnal_service_get_callback gets my conn ref */
+ qry->InputValue.PortGuidPair.SourcePortGuid =
+ kibnal_data.kib_port_guid;
+ qry->InputValue.PortGuidPair.DestPortGuid =
+ conn->ibc_cvars->cv_svcrec.RID.ServiceGID.Type.Global.InterfaceID;
+ /* kibnal_pathreq_callback gets my conn ref */
frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
kibnal_data.kib_port_guid,
- &path_query,
+ qry,
kibnal_pathreq_callback,
- &sd_params, conn);
+ &kibnal_data.kib_sdretry,
+ conn);
if (frc == FPENDING)
return;
- CERROR ("Path record request failed: %d\n", frc);
- kibnal_connreq_done (conn, 1, -EINVAL);
+ CERROR ("pathreq %s failed: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+ kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
}
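
Note why the QUERY moved from the old on-stack variable into the preallocated conn->ibc_cvars->cv_query: the fabric call returns FPENDING before the callback runs, so anything passed by pointer must outlive the caller's stack frame; embedding it in the connection ties its lifetime to the conn reference the callback holds. The same reasoning puts the retry parameters in the long-lived kibnal_data.kib_sdretry instead of an on-stack COMMAND_CONTROL_PARAMETERS. A runnable sketch of the hazard, with hypothetical names:

    /* Lifetime sketch: async APIs that complete via callback must not
     * be handed pointers into the caller's stack.  All names here are
     * hypothetical. */
    #include <stdio.h>

    typedef struct { int in, out; } query_t;
    typedef struct { query_t cv_query; } conn_t;

    static query_t *pending_qry;        /* what the "fabric" will read */
    static void (*pending_cb)(void *);
    static void *pending_arg;

    static int async_query(query_t *qry, void (*cb)(void *), void *arg)
    {
            pending_qry = qry;          /* think FPENDING: returns now, */
            pending_cb = cb;            /* completes later */
            pending_arg = arg;
            return 0;
    }

    static void lookup_cb(void *arg) { (void)arg; }

    static void start_lookup(conn_t *conn)
    {
            /* &conn->cv_query, NOT a stack query_t: the pointer must
             * stay valid until the callback fires */
            conn->cv_query = (query_t){ 1, 2 };
            async_query(&conn->cv_query, lookup_cb, conn);
    }

    int main(void)
    {
            conn_t conn;

            start_lookup(&conn);
            printf("fabric sees in=%d out=%d\n",
                   pending_qry->in, pending_qry->out);
            pending_cb(pending_arg);    /* the deferred completion */
            return 0;
    }
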
-static void
+void
kibnal_connect_peer (kib_peer_t *peer)
{
- COMMAND_CONTROL_PARAMETERS sd_params;
- QUERY query;
- FSTATUS frc;
- kib_conn_t *conn = kibnal_create_conn();
+ QUERY *qry;
+ FSTATUS frc;
+ kib_conn_t *conn;
LASSERT (peer->ibp_connecting != 0);
+ conn = kibnal_create_conn(peer->ibp_nid);
if (conn == NULL) {
CERROR ("Can't allocate conn\n");
kibnal_peer_connect_failed (peer, 1, -ENOMEM);
return;
}
conn->ibc_peer = peer;
- kib_peer_addref(peer);
-
- LIBCFS_ALLOC (conn->ibc_connreq, sizeof (*conn->ibc_connreq));
- if (conn->ibc_connreq == NULL) {
- CERROR ("Can't allocate connreq\n");
- kibnal_connreq_done (conn, 1, -ENOMEM);
- return;
- }
+ kibnal_peer_addref(peer);
- memset(conn->ibc_connreq, 0, sizeof (*conn->ibc_connreq));
+ qry = &conn->ibc_cvars->cv_query;
+ memset(qry, 0, sizeof(*qry));
- kibnal_set_service_keys(&conn->ibc_connreq->cr_service, peer->ibp_nid);
+ qry->OutputType = OutputTypeServiceRecord;
+ qry->InputType = InputTypeServiceRecord;
- memset(&query, 0, sizeof(query));
- query.InputType = InputTypeServiceRecord;
- query.OutputType = OutputTypeServiceRecord;
- query.InputValue.ServiceRecordValue.ServiceRecord = conn->ibc_connreq->cr_service;
- query.InputValue.ServiceRecordValue.ComponentMask = KIBNAL_SERVICE_KEY_MASK;
-
- memset(&sd_params, 0, sizeof(sd_params));
- sd_params.RetryCount = IBNAL_RETRY;
- sd_params.Timeout = 10 * 1000; /* wait 10 seconds */
+ qry->InputValue.ServiceRecordValue.ComponentMask =
+ KIBNAL_SERVICE_KEY_MASK;
+ kibnal_set_service_keys(
+ &qry->InputValue.ServiceRecordValue.ServiceRecord,
+ peer->ibp_nid);
/* kibnal_service_get_callback gets my conn ref */
frc = iibt_sd_query_port_fabric_information(kibnal_data.kib_sd,
kibnal_data.kib_port_guid,
- &query,
- kibnal_service_get_callback,
- &sd_params, conn);
+ qry,
+ kibnal_service_get_callback,
+ &kibnal_data.kib_sdretry,
+ conn);
if (frc == FPENDING)
return;
- CERROR ("iibt_sd_query_port_fabric_information(): %d\n", frc);
- kibnal_connreq_done (conn, 1, frc);
+ CERROR("Lookup %s failed: %d\n", libcfs_nid2str(peer->ibp_nid), frc);
+ kibnal_connreq_done(conn, 1, -EHOSTUNREACH);
}
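
kibnal_set_service_keys() packs the peer's NID into the service record's key fields, and KIBNAL_SERVICE_KEY_MASK tells the subnet administrator which record components must match the template. A small sketch of component-mask matching as a query consumer sees it (field names and mask bits are illustrative, not the IBTA wire format):

    /* Component-mask matching sketch: only fields whose bit is set in
     * the mask participate in the comparison.  Illustrative only. */
    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    typedef struct {
            uint64_t service_id;        /* mask bit 0 */
            uint8_t  service_key[16];   /* mask bit 1: NID packed here */
    } svc_rec_t;

    #define MATCH_SERVICE_ID  (1u << 0)
    #define MATCH_SERVICE_KEY (1u << 1)

    static int svc_matches(const svc_rec_t *rec, const svc_rec_t *tmpl,
                           uint32_t mask)
    {
            if ((mask & MATCH_SERVICE_ID) &&
                rec->service_id != tmpl->service_id)
                    return 0;
            if ((mask & MATCH_SERVICE_KEY) &&
                memcmp(rec->service_key, tmpl->service_key, 16) != 0)
                    return 0;
            return 1;
    }

    int main(void)
    {
            svc_rec_t tmpl = { .service_id = 0x0b0a }, rec = tmpl;

            rec.service_key[0] = 9;     /* differs only in the key */
            assert(svc_matches(&rec, &tmpl, MATCH_SERVICE_ID));
            assert(!svc_matches(&rec, &tmpl,
                                MATCH_SERVICE_ID | MATCH_SERVICE_KEY));
            return 0;
    }
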
-static int
+int
kibnal_conn_timed_out (kib_conn_t *conn)
{
kib_tx_t *tx;
struct list_head *ttmp;
- unsigned long flags;
- spin_lock_irqsave (&conn->ibc_lock, flags);
+ spin_lock(&conn->ibc_lock);
list_for_each (ttmp, &conn->ibc_tx_queue) {
tx = list_entry (ttmp, kib_tx_t, tx_list);
- LASSERT (!tx->tx_passive_rdma_wait);
- LASSERT (tx->tx_sending == 0);
+ LASSERT (tx->tx_queued);
if (time_after_eq (jiffies, tx->tx_deadline)) {
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
+ spin_unlock(&conn->ibc_lock);
return 1;
}
}
list_for_each (ttmp, &conn->ibc_active_txs) {
tx = list_entry (ttmp, kib_tx_t, tx_list);
- LASSERT (tx->tx_passive_rdma ||
- !tx->tx_passive_rdma_wait);
-
- LASSERT (tx->tx_passive_rdma_wait ||
+ LASSERT (!tx->tx_queued);
+ LASSERT (tx->tx_waiting ||
tx->tx_sending != 0);
if (time_after_eq (jiffies, tx->tx_deadline)) {
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
+ spin_unlock(&conn->ibc_lock);
return 1;
}
}
- spin_unlock_irqrestore (&conn->ibc_lock, flags);
-
+ spin_unlock(&conn->ibc_lock);
return 0;
}
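
The tx_deadline tests above use time_after_eq() rather than a plain >= so they stay correct across jiffies wrap: the comparison is done on the signed difference, never on raw magnitudes. A runnable illustration:

    /* Wrap-safe deadline test: subtract and check the sign, exactly
     * the idea behind the kernel's time_after_eq() macro. */
    #include <assert.h>

    static int my_time_after_eq(unsigned long a, unsigned long b)
    {
            return (long)(a - b) >= 0;
    }

    int main(void)
    {
            unsigned long deadline = (unsigned long)-10; /* pre-wrap */
            unsigned long now = 5;                       /* post-wrap */

            assert(!(now >= deadline));              /* naive test: wrong */
            assert(my_time_after_eq(now, deadline)); /* expired: right */
            return 0;
    }
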
-static void
+void
kibnal_check_conns (int idx)
{
struct list_head *peers = &kibnal_data.kib_peers[idx];
struct list_head *ptmp;
struct list_head *ctmp;
kib_peer_t *peer;
kib_conn_t *conn;
unsigned long flags;

 again:
/* Scan under a shared lock; an extra conn ref lets us drop it below */
read_lock_irqsave(&kibnal_data.kib_global_lock, flags);

list_for_each (ptmp, peers) {
peer = list_entry (ptmp, kib_peer_t, ibp_list);

list_for_each (ctmp, &peer->ibp_conns) {
conn = list_entry (ctmp, kib_conn_t, ibc_list);
- KIB_ASSERT_CONN_STATE(conn, IBNAL_CONN_ESTABLISHED);
+ LASSERT (conn->ibc_state == IBNAL_CONN_ESTABLISHED);
/* In case we have enough credits to return via a
* NOOP, but there were no non-blocking tx descs
* free to do it last time... */
kibnal_check_sends(conn);

if (!kibnal_conn_timed_out(conn))
continue;
+
+ /* Handle timeout by closing the whole connection. We
+ * can only be sure RDMA activity has ceased once the
+ * QP has been modified. */
- CDEBUG(D_NET, "++conn[%p] state %d -> %s (%d)\n",
- conn, conn->ibc_state,
- libcfs_nid2str(peer->ibp_nid),
- atomic_read (&conn->ibc_refcount));
+ kibnal_conn_addref(conn); /* 1 ref for me... */
- atomic_inc (&conn->ibc_refcount);
read_unlock_irqrestore(&kibnal_data.kib_global_lock,
flags);

CERROR("Timed out RDMA with %s\n",
libcfs_nid2str(peer->ibp_nid));

kibnal_close_conn (conn, -ETIMEDOUT);
- kibnal_put_conn (conn);
+ kibnal_conn_decref(conn); /* ...until here */
/* start again now I've dropped the lock */
goto again;
}
}

read_unlock_irqrestore(&kibnal_data.kib_global_lock, flags);
}
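
The addref before read_unlock_irqrestore() is what makes it safe to call kibnal_close_conn() without the table lock: the extra reference pins the conn while the lock is down, and the scan restarts from scratch because the lists may have changed meanwhile. The same idiom in portable form (pthreads stand in for the kernel rwlock; names hypothetical):

    /* Pin-drop-rescan idiom: take a ref, drop the shared lock, do the
     * blocking work, unpin, then rescan from the top. */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stddef.h>

    typedef struct obj {
            atomic_int  ref;
            int         expired;
            struct obj *next;
    } obj_t;

    static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;
    static obj_t *table;

    static void close_obj(obj_t *o) { o->expired = 0; /* may sleep */ }

    static void reap_expired(void)
    {
    again:
            pthread_rwlock_rdlock(&table_lock);
            for (obj_t *o = table; o != NULL; o = o->next) {
                    if (!o->expired)
                            continue;
                    atomic_fetch_add(&o->ref, 1);   /* 1 ref for me... */
                    pthread_rwlock_unlock(&table_lock);

                    close_obj(o);                   /* lock is down */
                    atomic_fetch_sub(&o->ref, 1);   /* ...until here */
                    goto again;         /* list may have changed */
            }
            pthread_rwlock_unlock(&table_lock);
    }

    int main(void)
    {
            obj_t o = { .expired = 1 };

            atomic_init(&o.ref, 1);
            table = &o;
            reap_expired();
            return o.expired;           /* 0: reaped */
    }
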
-static void
-kib_connd_handle_state(kib_conn_t *conn)
+void
+kibnal_disconnect_conn (kib_conn_t *conn)
{
- FSTATUS frc;
-
- switch (conn->ibc_state) {
- /* all refs have gone, free and be done with it */
- case IBNAL_CONN_DISCONNECTED:
- kibnal_destroy_conn (conn);
- return; /* avoid put_conn */
-
- case IBNAL_CONN_SEND_DREQ:
- frc = iibt_cm_disconnect(conn->ibc_cep, NULL, NULL);
- if (frc != FSUCCESS) /* XXX do real things */
- CERROR("disconnect failed: %d\n", frc);
- conn->ibc_state = IBNAL_CONN_DREQ;
- break;
+ FSTATUS frc;
- /* a callback got to the conn before we did */
- case IBNAL_CONN_DREP:
- break;
-
- default:
- CERROR ("Bad conn %p state: %d\n", conn,
- conn->ibc_state);
- LBUG();
- break;
- }
+ LASSERT (conn->ibc_state == IBNAL_CONN_DISCONNECTING);
- /* drop ref from close_conn */
- kibnal_put_conn(conn);
+ kibnal_conn_disconnected(conn);
+
+ frc = iibt_cm_disconnect(conn->ibc_cep, NULL, NULL);
+ switch (frc) {
+ case FSUCCESS:
+ break;
+
+ case FINSUFFICIENT_RESOURCES:
+ CERROR("ENOMEM disconnecting %s\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid));
+ /* This might cause the module to become unloadable since the
+ * FCM_DISCONNECTED callback is still outstanding */
+ break;
+
+ default:
+ CERROR("Unexpected error disconnecting %s: %d\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid), frc);
+ LBUG();
+ }
}
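
The FINSUFFICIENT_RESOURCES branch above is tolerated but dangerous for exactly the reason the comment gives: the CM may still owe an FCM_DISCONNECTED callback, and unloading the module before every outstanding callback has fired would leave the CM calling into freed text. The usual defence is to count outstanding callbacks and drain them before teardown; a userspace sketch (hypothetical names):

    /* Drain-before-teardown sketch: every registered async callback
     * bumps a counter; shutdown waits for it to reach zero.  pthreads
     * stand in for kernel primitives. */
    #include <pthread.h>

    static pthread_mutex_t cb_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cb_idle = PTHREAD_COND_INITIALIZER;
    static int outstanding_callbacks;

    static void callback_registered(void)
    {
            pthread_mutex_lock(&cb_lock);
            outstanding_callbacks++;
            pthread_mutex_unlock(&cb_lock);
    }

    static void callback_fired(void)
    {
            pthread_mutex_lock(&cb_lock);
            if (--outstanding_callbacks == 0)
                    pthread_cond_broadcast(&cb_idle);
            pthread_mutex_unlock(&cb_lock);
    }

    static void drain_callbacks_before_unload(void)
    {
            pthread_mutex_lock(&cb_lock);
            while (outstanding_callbacks != 0)
                    pthread_cond_wait(&cb_idle, &cb_lock);
            pthread_mutex_unlock(&cb_lock);
    }

    int main(void)
    {
            callback_registered();
            callback_fired();                /* e.g. FCM_DISCONNECTED */
            drain_callbacks_before_unload(); /* returns: count is 0 */
            return 0;
    }
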
int
kibnal_connd (void *arg)
{
wait_queue_t wait;
unsigned long flags;
kib_conn_t *conn;
kib_peer_t *peer;
int timeout;
int i;
+ int did_something;
int peer_index = 0;
unsigned long deadline = jiffies;
libcfs_daemonize ("kibnal_connd");
libcfs_blockallsigs ();

init_waitqueue_entry (&wait, current);
- spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+ spin_lock_irqsave(&kibnal_data.kib_connd_lock, flags);
+
+ while (!kibnal_data.kib_shutdown) {
+ did_something = 0;
+
+ if (!list_empty (&kibnal_data.kib_connd_zombies)) {
+ conn = list_entry (kibnal_data.kib_connd_zombies.next,
+ kib_conn_t, ibc_list);
+ list_del (&conn->ibc_list);
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+ did_something = 1;
+
+ kibnal_destroy_conn(conn);
+
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+ }
- for (;;) {
if (!list_empty (&kibnal_data.kib_connd_conns)) {
conn = list_entry (kibnal_data.kib_connd_conns.next,
kib_conn_t, ibc_list);
list_del (&conn->ibc_list);
-
spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
- kib_connd_handle_state(conn);
+ did_something = 1;
+ kibnal_disconnect_conn(conn);
+ kibnal_conn_decref(conn);
+
spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
- continue;
}
if (!list_empty (&kibnal_data.kib_connd_peers)) {
peer = list_entry (kibnal_data.kib_connd_peers.next,
kib_peer_t, ibp_connd_list);
list_del_init (&peer->ibp_connd_list);
spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+ did_something = 1;
kibnal_connect_peer (peer);
- kib_peer_decref (peer);
+ kibnal_peer_decref (peer);
spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
}
- /* shut down and nobody left to reap... */
- if (kibnal_data.kib_shutdown &&
- atomic_read(&kibnal_data.kib_nconns) == 0)
- break;
-
- spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
-
/* careful with the jiffy wrap... */
while ((timeout = (int)(deadline - jiffies)) <= 0) {
const int n = 4;
const int p = 1;
int chunk = kibnal_data.kib_peer_hash_size;
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
+
/* Time to check for RDMA timeouts on a few more
* peers: I do checks every 'p' seconds on a
* proportion of the peer table and I need to check
* every connection within (n+1)/n times the timeout
* interval. */
- if (kibnal_tunables.kib_io_timeout > n * p)
+ if (*kibnal_tunables.kib_timeout > n * p)
chunk = (chunk * n * p) /
- kibnal_tunables.kib_io_timeout;
+ *kibnal_tunables.kib_timeout;
if (chunk == 0)
chunk = 1;

for (i = 0; i < chunk; i++) {
kibnal_check_conns (peer_index);
peer_index = (peer_index + 1) %
kibnal_data.kib_peer_hash_size;
}
deadline += p * HZ;
+ spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
+ did_something = 1;
}
- kibnal_data.kib_connd_waketime = jiffies + timeout;
+ if (did_something)
+ continue;
+
+ spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);
set_current_state (TASK_INTERRUPTIBLE);
add_wait_queue (&kibnal_data.kib_connd_waitq, &wait);
schedule_timeout (timeout);

set_current_state (TASK_RUNNING);
remove_wait_queue (&kibnal_data.kib_connd_waitq, &wait);

spin_lock_irqsave (&kibnal_data.kib_connd_lock, flags);
}

spin_unlock_irqrestore (&kibnal_data.kib_connd_lock, flags);

kibnal_thread_fini ();
return (0);
}
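
A quick worked example of the scan-rate arithmetic above: with p = 1 second per pass, n = 4, a 101-bucket peer table, and a 50-second timeout, chunk = 101 * 4 * 1 / 50 = 8 buckets per pass, so a full sweep takes about 13 seconds, comfortably inside the (n+1)/n * timeout = 62-second bound (table size and timeout here are illustrative):

    /* Worked example of connd's proportional scan-rate calculation.
     * Values are illustrative. */
    #include <stdio.h>

    int main(void)
    {
            const int n = 4, p = 1;     /* as in kibnal_connd */
            int hash_size = 101;        /* hypothetical table size */
            int timeout = 50;           /* *kibnal_tunables.kib_timeout */
            int chunk = hash_size;

            if (timeout > n * p)
                    chunk = (chunk * n * p) / timeout;
            if (chunk == 0)
                    chunk = 1;

            printf("check %d buckets per %ds pass; full sweep in ~%ds "
                   "(must beat %ds)\n",
                   chunk, p, (hash_size + chunk - 1) / chunk * p,
                   timeout * (n + 1) / n);
            return 0;
    }
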
+
+void
+kibnal_hca_async_callback (void *hca_arg, IB_EVENT_RECORD *ev)
+{
+ /* XXX flesh out. this seems largely for async errors */
+ CERROR("type: %d code: %u\n", ev->EventType, ev->EventCode);
+}
+
+void
+kibnal_hca_callback (void *hca_arg, void *cq_arg)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+ kibnal_data.kib_ready = 1;
+ wake_up(&kibnal_data.kib_sched_waitq);
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
+}
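+
The HCA callback deliberately does no CQ work itself: it just latches kib_ready and wakes a scheduler, and kib_checking_cq then guarantees a single poller at a time while the others sleep or process completions. The handoff in portable form (pthreads stand in for the kernel waitqueue; names hypothetical):

    /* Sketch of the "ready flag + single poller" handoff fed by the
     * HCA callback. */
    #include <pthread.h>

    static pthread_mutex_t sched_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  sched_waitq = PTHREAD_COND_INITIALIZER;
    static int ready;          /* completions may be pending */
    static int checking_cq;    /* a scheduler currently owns the CQ */

    /* interrupt-context analogue: cheap, never polls the CQ itself */
    static void hca_callback(void)
    {
            pthread_mutex_lock(&sched_lock);
            ready = 1;
            pthread_cond_signal(&sched_waitq);
            pthread_mutex_unlock(&sched_lock);
    }

    /* scheduler side: claim the CQ only if nobody else has it */
    static int try_claim_cq(void)
    {
            int claimed = 0;

            pthread_mutex_lock(&sched_lock);
            if (ready && !checking_cq) {
                    checking_cq = 1;  /* take ownership of polling */
                    ready = 0;        /* assume we'll drain the CQ */
                    claimed = 1;
            }
            pthread_mutex_unlock(&sched_lock);
            return claimed;
    }

    int main(void)
    {
            hca_callback();                  /* interrupt says "work!" */
            return try_claim_cq() ? 0 : 1;   /* first scheduler wins */
    }
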
+
int
kibnal_scheduler(void *arg)
{
- long id = (long)arg;
- char name[16];
- kib_rx_t *rx;
- kib_tx_t *tx;
- unsigned long flags;
- int rc;
- int counter = 0;
- int did_something;
+ long id = (long)arg;
+ wait_queue_t wait;
+ char name[16];
+ FSTATUS frc;
+ FSTATUS frc2;
+ IB_WORK_COMPLETION wc;
+ kib_rx_t *rx;
+ unsigned long flags;
+ __u64 rxseq = 0;
+ int busy_loops = 0;
snprintf(name, sizeof(name), "kibnal_sd_%02ld", id);
libcfs_daemonize(name);
libcfs_blockallsigs();
- spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+ init_waitqueue_entry(&wait, current);
- for (;;) {
- did_something = 0;
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
- while (!list_empty(&kibnal_data.kib_sched_txq)) {
- tx = list_entry(kibnal_data.kib_sched_txq.next,
- kib_tx_t, tx_list);
- list_del(&tx->tx_list);
+ while (!kibnal_data.kib_shutdown) {
+ if (busy_loops++ >= IBNAL_RESCHED) {
spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
flags);
- kibnal_tx_done(tx);
- spin_lock_irqsave(&kibnal_data.kib_sched_lock,
- flags);
+ our_cond_resched();
+ busy_loops = 0;
+
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
}
- if (!list_empty(&kibnal_data.kib_sched_rxq)) {
- rx = list_entry(kibnal_data.kib_sched_rxq.next,
- kib_rx_t, rx_list);
- list_del(&rx->rx_list);
+ if (kibnal_data.kib_ready &&
+ !kibnal_data.kib_checking_cq) {
+ /* take ownership of completion polling */
+ kibnal_data.kib_checking_cq = 1;
+ /* Assume I'll exhaust the CQ */
+ kibnal_data.kib_ready = 0;
spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
flags);
+
+ frc = iibt_cq_poll(kibnal_data.kib_cq, &wc);
+ if (frc == FNOT_DONE) {
+ /* CQ empty */
+ frc2 = iibt_cq_rearm(kibnal_data.kib_cq,
+ CQEventSelNextWC);
+ LASSERT (frc2 == FSUCCESS);
+ }
+
+ if (frc == FSUCCESS &&
+ kibnal_wreqid2type(wc.WorkReqId) == IBNAL_WID_RX) {
+ rx = (kib_rx_t *)kibnal_wreqid2ptr(wc.WorkReqId);
+
+ /* Grab the RX sequence number NOW before
+ * anyone else can get an RX completion */
+ rxseq = rx->rx_conn->ibc_rxseq++;
+ }
+
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+ /* give up ownership of completion polling */
+ kibnal_data.kib_checking_cq = 0;
- kibnal_rx(rx);
+ if (frc == FNOT_DONE)
+ continue;
- did_something = 1;
- spin_lock_irqsave(&kibnal_data.kib_sched_lock,
- flags);
- }
+ LASSERT (frc == FSUCCESS);
+ /* Assume there's more: get another scheduler to check
+ * while I handle this completion... */
- /* shut down and no receives to complete... */
- if (kibnal_data.kib_shutdown &&
- atomic_read(&kibnal_data.kib_nconns) == 0)
- break;
+ kibnal_data.kib_ready = 1;
+ wake_up(&kibnal_data.kib_sched_waitq);
- /* nothing to do or hogging CPU */
- if (!did_something || counter++ == IBNAL_RESCHED) {
spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
flags);
- counter = 0;
-
- if (!did_something) {
- rc = wait_event_interruptible(
- kibnal_data.kib_sched_waitq,
- !list_empty(&kibnal_data.kib_sched_txq) ||
- !list_empty(&kibnal_data.kib_sched_rxq) ||
- (kibnal_data.kib_shutdown &&
- atomic_read (&kibnal_data.kib_nconns) == 0));
- } else {
- our_cond_resched();
- }
- spin_lock_irqsave(&kibnal_data.kib_sched_lock,
- flags);
+ switch (kibnal_wreqid2type(wc.WorkReqId)) {
+ case IBNAL_WID_RX:
+ kibnal_rx_complete(&wc, rxseq);
+ break;
+
+ case IBNAL_WID_TX:
+ kibnal_tx_complete(&wc);
+ break;
+
+ case IBNAL_WID_RDMA:
+ /* We only get RDMA completion notification if
+ * it fails. So we just ignore them completely
+ * because...
+ *
+ * 1) If an RDMA fails, all subsequent work
+ * items, including the final SEND will fail
+ * too, so I'm still guaranteed to notice that
+ * this connection is hosed.
+ *
+ * 2) It's positively dangerous to look inside
+ * the tx descriptor obtained from an RDMA work
+ * item. As soon as I drop the kib_sched_lock,
+ * I give a scheduler on another CPU a chance
+ * to get the final SEND completion, so the tx
+ * descriptor can get freed as I inspect it. */
+ CERROR ("RDMA failed: %d\n", wc.Status);
+ break;
+
+ default:
+ LBUG();
+ }
+
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
+ continue;
}
+
+ /* Nothing to do; sleep... */
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
+ spin_unlock_irqrestore(&kibnal_data.kib_sched_lock,
+ flags);
+
+ schedule();
+
+ remove_wait_queue(&kibnal_data.kib_sched_waitq, &wait);
+ set_current_state(TASK_RUNNING);
+ spin_lock_irqsave(&kibnal_data.kib_sched_lock, flags);
}
spin_unlock_irqrestore(&kibnal_data.kib_sched_lock, flags);
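
The kibnal_wreqid2type()/kibnal_wreqid2ptr() pairing the scheduler relies on is plain pointer tagging: descriptors are aligned, so the low bits of the 64-bit WorkReqId are free to carry IBNAL_WID_RX/TX/RDMA while the rest holds the descriptor address. A runnable sketch of the packing (mask and names are stand-ins):

    /* Pointer-tagging sketch: stash a 2-bit work-type in the low bits
     * of an aligned descriptor pointer, forming a 64-bit work-request
     * cookie.  WID_MASK and friends are hypothetical stand-ins. */
    #include <assert.h>
    #include <stdint.h>

    #define WID_RX   0
    #define WID_TX   1
    #define WID_RDMA 2
    #define WID_MASK 3ULL   /* needs >= 4-byte aligned descriptors */

    static uint64_t ptr2wreqid(void *ptr, int type)
    {
            uintptr_t lptr = (uintptr_t)ptr;

            assert((lptr & WID_MASK) == 0);  /* alignment frees the bits */
            assert((type & ~WID_MASK) == 0);
            return (uint64_t)(lptr | (unsigned)type);
    }

    static int wreqid2type(uint64_t wreqid)
    {
            return (int)(wreqid & WID_MASK);
    }

    static void *wreqid2ptr(uint64_t wreqid)
    {
            return (void *)(uintptr_t)(wreqid & ~WID_MASK);
    }

    int main(void)
    {
            static uint64_t dummy;      /* 8-aligned, low bits clear */
            uint64_t wid = ptr2wreqid(&dummy, WID_TX);

            assert(wreqid2type(wid) == WID_TX);
            assert(wreqid2ptr(wid) == &dummy);
            return 0;
    }
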