-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
+/*
* GPL HEADER START
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
}
kib_tx_t *
-kiblnd_get_idle_tx (lnet_ni_t *ni)
+kiblnd_get_idle_tx(lnet_ni_t *ni, lnet_nid_t target)
{
- kib_net_t *net = (kib_net_t *)ni->ni_data;
- cfs_list_t *node;
- kib_tx_t *tx;
+ kib_net_t *net = (kib_net_t *)ni->ni_data;
+ cfs_list_t *node;
+ kib_tx_t *tx;
+ kib_tx_poolset_t *tps;
- node = kiblnd_pool_alloc_node(&net->ibn_tx_ps.tps_poolset);
+ tps = net->ibn_tx_ps[lnet_cpt_of_nid(target)];
+ node = kiblnd_pool_alloc_node(&tps->tps_poolset);
if (node == NULL)
return NULL;
tx = container_of(node, kib_tx_t, tx_list);
}
void
-kiblnd_drop_rx (kib_rx_t *rx)
+kiblnd_drop_rx(kib_rx_t *rx)
{
- kib_conn_t *conn = rx->rx_conn;
- unsigned long flags;
-
- cfs_spin_lock_irqsave(&kiblnd_data.kib_sched_lock, flags);
- LASSERT (conn->ibc_nrx > 0);
- conn->ibc_nrx--;
- cfs_spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, flags);
+ kib_conn_t *conn = rx->rx_conn;
+ struct kib_sched_info *sched = conn->ibc_sched;
+ unsigned long flags;
- kiblnd_conn_decref(conn);
+ cfs_spin_lock_irqsave(&sched->ibs_lock, flags);
+ LASSERT(conn->ibc_nrx > 0);
+ conn->ibc_nrx--;
+ cfs_spin_unlock_irqrestore(&sched->ibs_lock, flags);
+
+ kiblnd_conn_decref(conn);
}
int
}
void
-kiblnd_send_completion (kib_conn_t *conn, int type, int status, __u64 cookie)
+kiblnd_send_completion(kib_conn_t *conn, int type, int status, __u64 cookie)
{
- lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
- kib_tx_t *tx = kiblnd_get_idle_tx(ni);
+ lnet_ni_t *ni = conn->ibc_peer->ibp_ni;
+ kib_tx_t *tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
if (tx == NULL) {
CERROR("Can't get tx for completion %x for %s\n",
conn->ibc_credits += credits;
+ /* This ensures the credit taken by NOOP can be returned */
+ if (msg->ibm_type == IBLND_MSG_NOOP &&
+ !IBLND_OOB_CAPABLE(conn->ibc_version)) /* v1 only */
+ conn->ibc_outstanding_credits++;
+
cfs_spin_unlock(&conn->ibc_lock);
kiblnd_check_sends(conn);
}
break;
case IBLND_MSG_NOOP:
- if (IBLND_OOB_CAPABLE(conn->ibc_version))
+ if (IBLND_OOB_CAPABLE(conn->ibc_version)) {
post_credit = IBLND_POSTRX_NO_CREDIT;
- else
+ break;
+ }
+
+ if (credits != 0) /* credit already posted */
+ post_credit = IBLND_POSTRX_NO_CREDIT;
+ else /* a keepalive NOOP */
post_credit = IBLND_POSTRX_PEER_CREDIT;
break;
static int
kiblnd_fmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
{
- kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev;
- __u64 *pages = tx->tx_pages;
- int npages;
- int size;
- int rc;
- int i;
+ kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev;
+ __u64 *pages = tx->tx_pages;
+ kib_fmr_poolset_t *fps;
+ int npages;
+ int size;
+ int cpt;
+ int rc;
+ int i;
for (i = 0, npages = 0; i < rd->rd_nfrags; i++) {
for (size = 0; size < rd->rd_frags[i].rf_nob;
}
}
- rc = kiblnd_fmr_pool_map(&net->ibn_fmr_ps, pages, npages, 0, &tx->tx_u.fmr);
+ LASSERT(tx->tx_pool != NULL);
+ LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL);
+
+ cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
+
+ fps = net->ibn_fmr_ps[cpt];
+ rc = kiblnd_fmr_pool_map(fps, pages, npages, 0, &tx->tx_u.fmr);
if (rc != 0) {
CERROR ("Can't map %d pages: %d\n", npages, rc);
return rc;
static int
kiblnd_pmr_map_tx(kib_net_t *net, kib_tx_t *tx, kib_rdma_desc_t *rd, int nob)
{
- kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev;
- __u64 iova;
- int rc;
+ kib_hca_dev_t *hdev = tx->tx_pool->tpo_hdev;
+ kib_pmr_poolset_t *pps;
+ __u64 iova;
+ int cpt;
+ int rc;
+
+ iova = rd->rd_frags[0].rf_addr & ~hdev->ibh_page_mask;
- iova = rd->rd_frags[0].rf_addr & ~hdev->ibh_page_mask;
+ LASSERT(tx->tx_pool != NULL);
+ LASSERT(tx->tx_pool->tpo_pool.po_owner != NULL);
- rc = kiblnd_pmr_pool_map(&net->ibn_pmr_ps, hdev, rd, &iova, &tx->tx_u.pmr);
+ cpt = tx->tx_pool->tpo_pool.po_owner->ps_cpt;
+
+ pps = net->ibn_pmr_ps[cpt];
+ rc = kiblnd_pmr_pool_map(pps, hdev, rd, &iova, &tx->tx_u.pmr);
if (rc != 0) {
CERROR("Failed to create MR by phybuf: %d\n", rc);
return rc;
void
kiblnd_unmap_tx(lnet_ni_t *ni, kib_tx_t *tx)
{
- kib_net_t *net = ni->ni_data;
+ kib_net_t *net = ni->ni_data;
- LASSERT (net != NULL);
+ LASSERT(net != NULL);
- if (net->ibn_with_fmr && tx->tx_u.fmr.fmr_pfmr != NULL) {
- kiblnd_fmr_pool_unmap(&tx->tx_u.fmr, tx->tx_status);
- tx->tx_u.fmr.fmr_pfmr = NULL;
- } else if (net->ibn_with_pmr && tx->tx_u.pmr != NULL) {
- kiblnd_pmr_pool_unmap(tx->tx_u.pmr);
- tx->tx_u.pmr = NULL;
- }
+ if (net->ibn_fmr_ps != NULL && tx->tx_u.fmr.fmr_pfmr != NULL) {
+ kiblnd_fmr_pool_unmap(&tx->tx_u.fmr, tx->tx_status);
+ tx->tx_u.fmr.fmr_pfmr = NULL;
+
+ } else if (net->ibn_pmr_ps != NULL && tx->tx_u.pmr != NULL) {
+ kiblnd_pmr_pool_unmap(tx->tx_u.pmr);
+ tx->tx_u.pmr = NULL;
+ }
if (tx->tx_nfrags != 0) {
kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev->ibh_ibdev,
return 0;
}
- if (net->ibn_with_fmr)
- return kiblnd_fmr_map_tx(net, tx, rd, nob);
- else if (net->ibn_with_pmr)
- return kiblnd_pmr_map_tx(net, tx, rd, nob);
+ if (net->ibn_fmr_ps != NULL)
+ return kiblnd_fmr_map_tx(net, tx, rd, nob);
+ else if (net->ibn_pmr_ps != NULL)
+ return kiblnd_pmr_map_tx(net, tx, rd, nob);
- return -EINVAL;
+ return -EINVAL;
}
fragnob = min((int)(kiov->kiov_len - offset), nob);
- memset(sg, 0, sizeof(*sg));
sg_set_page(sg, kiov->kiov_page, fragnob,
kiov->kiov_offset + offset);
sg++;
}
if (credit != 0 && !IBLND_OOB_CAPABLE(ver) &&
- conn->ibc_credits == 1 && /* last credit reserved for */
- conn->ibc_outstanding_credits == 0) { /* giving back credits */
+ conn->ibc_credits == 1 && /* last credit reserved */
+ msg->ibm_type != IBLND_MSG_NOOP) { /* for NOOP */
CDEBUG(D_NET, "%s: not using last credit\n",
libcfs_nid2str(peer->ibp_nid));
return -EAGAIN;
tx->tx_queued = 0;
if (msg->ibm_type == IBLND_MSG_NOOP &&
- (!kiblnd_send_noop(conn) || /* redundant NOOP */
+ (!kiblnd_need_noop(conn) || /* redundant NOOP */
(IBLND_OOB_CAPABLE(ver) && /* posted enough NOOP */
conn->ibc_noops_posted == IBLND_OOB_MSGS(ver)))) {
/* OK to drop when posted enough NOOPs, since
conn->ibc_reserved_credits--;
}
- if (kiblnd_send_noop(conn)) {
+ if (kiblnd_need_noop(conn)) {
cfs_spin_unlock(&conn->ibc_lock);
- tx = kiblnd_get_idle_tx(ni);
+ tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
if (tx != NULL)
kiblnd_init_tx_msg(ni, tx, IBLND_MSG_NOOP, 0);
credit = 0;
tx = cfs_list_entry(conn->ibc_tx_queue_nocred.next,
kib_tx_t, tx_list);
+ } else if (!cfs_list_empty(&conn->ibc_tx_noops)) {
+ LASSERT (!IBLND_OOB_CAPABLE(ver));
+ credit = 1;
+ tx = cfs_list_entry(conn->ibc_tx_noops.next,
+ kib_tx_t, tx_list);
} else if (!cfs_list_empty(&conn->ibc_tx_queue)) {
credit = 1;
tx = cfs_list_entry(conn->ibc_tx_queue.next,
if (IBLND_OOB_CAPABLE(conn->ibc_version))
q = &conn->ibc_tx_queue_nocred;
else
- q = &conn->ibc_tx_queue;
+ q = &conn->ibc_tx_noops;
break;
case IBLND_MSG_IMMEDIATE:
kiblnd_check_sends(conn);
}
+static int kiblnd_resolve_addr(struct rdma_cm_id *cmid,
+ struct sockaddr_in *srcaddr,
+ struct sockaddr_in *dstaddr,
+ int timeout_ms)
+{
+ unsigned short port;
+ int rc;
+
+#ifdef HAVE_OFED_RDMA_SET_REUSEADDR
+ /* allow the port to be reused */
+ rc = rdma_set_reuseaddr(cmid, 1);
+ if (rc != 0) {
+ CERROR("Unable to set reuse on cmid: %d\n", rc);
+ return rc;
+ }
+#endif
+
+ /* look for a free privileged port */
+ for (port = PROT_SOCK-1; port > 0; port--) {
+ srcaddr->sin_port = htons(port);
+ rc = rdma_resolve_addr(cmid,
+ (struct sockaddr *)srcaddr,
+ (struct sockaddr *)dstaddr,
+ timeout_ms);
+ if (rc == 0) {
+ CDEBUG(D_NET, "bound to port %hu\n", port);
+ return 0;
+ } else if (rc == -EADDRINUSE || rc == -EADDRNOTAVAIL) {
+ CDEBUG(D_NET, "bind to port %hu failed: %d\n",
+ port, rc);
+ } else {
+ return rc;
+ }
+ }
+
+ CERROR("Failed to bind to a free privileged port\n");
+#ifndef HAVE_OFED_RDMA_SET_REUSEADDR
+	CERROR("You may need IB verbs that support rdma_set_reuseaddr()\n");
+#endif
+ return rc;
+}
+
void
kiblnd_connect_peer (kib_peer_t *peer)
{
LASSERT (net != NULL);
LASSERT (peer->ibp_connecting > 0);
- cmid = rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP);
+ cmid = kiblnd_rdma_create_id(kiblnd_cm_callback, peer, RDMA_PS_TCP,
+ IB_QPT_RC);
+
if (IS_ERR(cmid)) {
CERROR("Can't create CMID for %s: %ld\n",
libcfs_nid2str(peer->ibp_nid), PTR_ERR(cmid));
kiblnd_peer_addref(peer); /* cmid's ref */
- rc = rdma_resolve_addr(cmid,
- (struct sockaddr *)&srcaddr,
- (struct sockaddr *)&dstaddr,
- *kiblnd_tunables.kib_timeout * 1000);
- if (rc == 0) {
- LASSERT (cmid->device != NULL);
- CDEBUG(D_NET, "%s: connection bound to %s:%u.%u.%u.%u:%s\n",
- libcfs_nid2str(peer->ibp_nid), dev->ibd_ifname,
- HIPQUAD(dev->ibd_ifip), cmid->device->name);
- return;
+ if (*kiblnd_tunables.kib_use_priv_port) {
+ rc = kiblnd_resolve_addr(cmid, &srcaddr, &dstaddr,
+ *kiblnd_tunables.kib_timeout * 1000);
+ } else {
+ rc = rdma_resolve_addr(cmid,
+ (struct sockaddr *)&srcaddr,
+ (struct sockaddr *)&dstaddr,
+ *kiblnd_tunables.kib_timeout * 1000);
+ }
+ if (rc != 0) {
+ /* Can't initiate address resolution: */
+ CERROR("Can't resolve addr for %s: %d\n",
+ libcfs_nid2str(peer->ibp_nid), rc);
+ goto failed2;
}
- /* Can't initiate address resolution: */
- CERROR("Can't resolve addr for %s: %d\n",
- libcfs_nid2str(peer->ibp_nid), rc);
+ LASSERT (cmid->device != NULL);
+ CDEBUG(D_NET, "%s: connection bound to %s:%u.%u.%u.%u:%s\n",
+ libcfs_nid2str(peer->ibp_nid), dev->ibd_ifname,
+ HIPQUAD(dev->ibd_ifip), cmid->device->name);
+
+ return;
+ failed2:
kiblnd_peer_decref(peer); /* cmid's ref */
rdma_destroy_id(cmid);
failed:
if (nob <= IBLND_MSG_SIZE)
break; /* send IMMEDIATE */
- tx = kiblnd_get_idle_tx(ni);
- if (tx == NULL) {
- CERROR("Can't allocate txd for GET to %s: \n",
- libcfs_nid2str(target.nid));
- return -ENOMEM;
- }
+ tx = kiblnd_get_idle_tx(ni, target.nid);
+ if (tx == NULL) {
+ CERROR("Can't allocate txd for GET to %s\n",
+ libcfs_nid2str(target.nid));
+ return -ENOMEM;
+ }
ibmsg = tx->tx_msg;
if (nob <= IBLND_MSG_SIZE)
break; /* send IMMEDIATE */
- tx = kiblnd_get_idle_tx(ni);
+ tx = kiblnd_get_idle_tx(ni, target.nid);
if (tx == NULL) {
CERROR("Can't allocate %s txd for %s\n",
type == LNET_MSG_PUT ? "PUT" : "REPLY",
LASSERT (offsetof(kib_msg_t, ibm_u.immediate.ibim_payload[payload_nob])
<= IBLND_MSG_SIZE);
- tx = kiblnd_get_idle_tx(ni);
+ tx = kiblnd_get_idle_tx(ni, target.nid);
if (tx == NULL) {
CERROR ("Can't send %d to %s: tx descs exhausted\n",
type, libcfs_nid2str(target.nid));
kib_tx_t *tx;
int rc;
- tx = kiblnd_get_idle_tx(ni);
+ tx = kiblnd_get_idle_tx(ni, rx->rx_conn->ibc_peer->ibp_nid);
if (tx == NULL) {
CERROR("Can't get tx for REPLY to %s\n",
libcfs_nid2str(target.nid));
break;
}
- tx = kiblnd_get_idle_tx(ni);
+ tx = kiblnd_get_idle_tx(ni, conn->ibc_peer->ibp_nid);
if (tx == NULL) {
CERROR("Can't allocate tx for %s\n",
libcfs_nid2str(conn->ibc_peer->ibp_nid));
int
kiblnd_thread_start (int (*fn)(void *arg), void *arg)
{
- long pid = cfs_kernel_thread (fn, arg, 0);
+ long pid = cfs_create_thread (fn, arg, 0);
if (pid < 0)
return ((int)pid);
return; /* already being handled */
if (error == 0 &&
+ cfs_list_empty(&conn->ibc_tx_noops) &&
cfs_list_empty(&conn->ibc_tx_queue) &&
cfs_list_empty(&conn->ibc_tx_queue_rsrvd) &&
cfs_list_empty(&conn->ibc_tx_queue_nocred) &&
CDEBUG(D_NET, "closing conn to %s\n",
libcfs_nid2str(peer->ibp_nid));
} else {
- CNETERR("Closing conn to %s: error %d%s%s%s%s\n",
+ CNETERR("Closing conn to %s: error %d%s%s%s%s%s\n",
libcfs_nid2str(peer->ibp_nid), error,
cfs_list_empty(&conn->ibc_tx_queue) ? "" : "(sending)",
+ cfs_list_empty(&conn->ibc_tx_noops) ? "" : "(sending_noops)",
cfs_list_empty(&conn->ibc_tx_queue_rsrvd) ? "" : "(sending_rsrvd)",
cfs_list_empty(&conn->ibc_tx_queue_nocred) ? "" : "(sending_nocred)",
cfs_list_empty(&conn->ibc_active_txs) ? "" : "(waiting)");
/* Complete all tx descs not waiting for sends to complete.
* NB we should be safe from RDMA now that the QP has changed state */
+ kiblnd_abort_txs(conn, &conn->ibc_tx_noops);
kiblnd_abort_txs(conn, &conn->ibc_tx_queue);
kiblnd_abort_txs(conn, &conn->ibc_tx_queue_rsrvd);
kiblnd_abort_txs(conn, &conn->ibc_tx_queue_nocred);
int version = IBLND_MSG_VERSION;
unsigned long flags;
int rc;
-
+ struct sockaddr_in *peer_addr;
LASSERT (!cfs_in_interrupt());
/* cmid inherits 'context' from the corresponding listener id */
rej.ibr_why = IBLND_REJECT_FATAL;
rej.ibr_cp.ibcp_max_msg_size = IBLND_MSG_SIZE;
+ peer_addr = (struct sockaddr_in *)&(cmid->route.addr.dst_addr);
+ if (*kiblnd_tunables.kib_require_priv_port &&
+ ntohs(peer_addr->sin_port) >= PROT_SOCK) {
+ __u32 ip = ntohl(peer_addr->sin_addr.s_addr);
+ CERROR("Peer's port (%u.%u.%u.%u:%hu) is not privileged\n",
+ HIPQUAD(ip), ntohs(peer_addr->sin_port));
+ goto failed;
+ }
+
if (priv_nob < offsetof(kib_msg_t, ibm_type)) {
CERROR("Short connection request\n");
goto failed;
}
}
-int
-kiblnd_check_txs (kib_conn_t *conn, cfs_list_t *txs)
+static int
+kiblnd_check_txs_locked(kib_conn_t *conn, cfs_list_t *txs)
{
kib_tx_t *tx;
cfs_list_t *ttmp;
- int timed_out = 0;
-
- cfs_spin_lock(&conn->ibc_lock);
cfs_list_for_each (ttmp, txs) {
tx = cfs_list_entry (ttmp, kib_tx_t, tx_list);
}
if (cfs_time_aftereq (jiffies, tx->tx_deadline)) {
- timed_out = 1;
CERROR("Timed out tx: %s, %lu seconds\n",
kiblnd_queue2str(conn, txs),
cfs_duration_sec(jiffies - tx->tx_deadline));
- break;
+ return 1;
}
}
- cfs_spin_unlock(&conn->ibc_lock);
- return timed_out;
+ return 0;
}
-int
-kiblnd_conn_timed_out (kib_conn_t *conn)
+static int
+kiblnd_conn_timed_out_locked(kib_conn_t *conn)
{
- return kiblnd_check_txs(conn, &conn->ibc_tx_queue) ||
- kiblnd_check_txs(conn, &conn->ibc_tx_queue_rsrvd) ||
- kiblnd_check_txs(conn, &conn->ibc_tx_queue_nocred) ||
- kiblnd_check_txs(conn, &conn->ibc_active_txs);
+ return kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue) ||
+ kiblnd_check_txs_locked(conn, &conn->ibc_tx_noops) ||
+ kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_rsrvd) ||
+ kiblnd_check_txs_locked(conn, &conn->ibc_tx_queue_nocred) ||
+ kiblnd_check_txs_locked(conn, &conn->ibc_active_txs);
}
void
kiblnd_check_conns (int idx)
{
- cfs_list_t *peers = &kiblnd_data.kib_peers[idx];
- cfs_list_t *ptmp;
- kib_peer_t *peer;
- kib_conn_t *conn;
- cfs_list_t *ctmp;
- unsigned long flags;
+ CFS_LIST_HEAD (closes);
+ CFS_LIST_HEAD (checksends);
+ cfs_list_t *peers = &kiblnd_data.kib_peers[idx];
+ cfs_list_t *ptmp;
+ kib_peer_t *peer;
+ kib_conn_t *conn;
+ cfs_list_t *ctmp;
+ unsigned long flags;
- again:
/* NB. We expect to have a look at all the peers and not find any
- * rdmas to time out, so we just use a shared lock while we
+ * RDMAs to time out, so we just use a shared lock while we
* take a look... */
cfs_read_lock_irqsave(&kiblnd_data.kib_global_lock, flags);
peer = cfs_list_entry (ptmp, kib_peer_t, ibp_list);
cfs_list_for_each (ctmp, &peer->ibp_conns) {
- conn = cfs_list_entry (ctmp, kib_conn_t, ibc_list);
+ int timedout;
+ int sendnoop;
+
+ conn = cfs_list_entry(ctmp, kib_conn_t, ibc_list);
LASSERT (conn->ibc_state == IBLND_CONN_ESTABLISHED);
- /* In case we have enough credits to return via a
- * NOOP, but there were no non-blocking tx descs
- * free to do it last time... */
- kiblnd_check_sends(conn);
+ cfs_spin_lock(&conn->ibc_lock);
- if (!kiblnd_conn_timed_out(conn))
+ sendnoop = kiblnd_need_noop(conn);
+ timedout = kiblnd_conn_timed_out_locked(conn);
+ if (!sendnoop && !timedout) {
+ cfs_spin_unlock(&conn->ibc_lock);
continue;
+ }
- /* Handle timeout by closing the whole connection. We
- * can only be sure RDMA activity has ceased once the
- * QP has been modified. */
-
- kiblnd_conn_addref(conn); /* 1 ref for me... */
-
- cfs_read_unlock_irqrestore(&kiblnd_data.kib_global_lock,
- flags);
-
- CERROR("Timed out RDMA with %s (%lu)\n",
- libcfs_nid2str(peer->ibp_nid),
- cfs_duration_sec(cfs_time_current() -
- peer->ibp_last_alive));
-
- kiblnd_close_conn(conn, -ETIMEDOUT);
- kiblnd_conn_decref(conn); /* ...until here */
+ if (timedout) {
+ CERROR("Timed out RDMA with %s (%lu): "
+ "c: %u, oc: %u, rc: %u\n",
+ libcfs_nid2str(peer->ibp_nid),
+ cfs_duration_sec(cfs_time_current() -
+ peer->ibp_last_alive),
+ conn->ibc_credits,
+ conn->ibc_outstanding_credits,
+ conn->ibc_reserved_credits);
+ cfs_list_add(&conn->ibc_connd_list, &closes);
+ } else {
+ cfs_list_add(&conn->ibc_connd_list,
+ &checksends);
+ }
+ /* +ref for 'closes' or 'checksends' */
+ kiblnd_conn_addref(conn);
- /* start again now I've dropped the lock */
- goto again;
+ cfs_spin_unlock(&conn->ibc_lock);
}
}
cfs_read_unlock_irqrestore(&kiblnd_data.kib_global_lock, flags);
+
+ /* Handle timeout by closing the whole
+ * connection. We can only be sure RDMA activity
+ * has ceased once the QP has been modified. */
+ while (!cfs_list_empty(&closes)) {
+ conn = cfs_list_entry(closes.next,
+ kib_conn_t, ibc_connd_list);
+ cfs_list_del(&conn->ibc_connd_list);
+ kiblnd_close_conn(conn, -ETIMEDOUT);
+ kiblnd_conn_decref(conn);
+ }
+
+ /* In case we have enough credits to return via a
+ * NOOP, but there were no non-blocking tx descs
+ * free to do it last time... */
+ while (!cfs_list_empty(&checksends)) {
+ conn = cfs_list_entry(checksends.next,
+ kib_conn_t, ibc_connd_list);
+ cfs_list_del(&conn->ibc_connd_list);
+ kiblnd_check_sends(conn);
+ kiblnd_conn_decref(conn);
+ }
}
void
}
void
-kiblnd_cq_completion (struct ib_cq *cq, void *arg)
+kiblnd_cq_completion(struct ib_cq *cq, void *arg)
{
- /* NB I'm not allowed to schedule this conn once its refcount has
- * reached 0. Since fundamentally I'm racing with scheduler threads
- * consuming my CQ I could be called after all completions have
- * occurred. But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0
- * and this CQ is about to be destroyed so I NOOP. */
- kib_conn_t *conn = (kib_conn_t *)arg;
- unsigned long flags;
+ /* NB I'm not allowed to schedule this conn once its refcount has
+ * reached 0. Since fundamentally I'm racing with scheduler threads
+ * consuming my CQ I could be called after all completions have
+ * occurred. But in this case, ibc_nrx == 0 && ibc_nsends_posted == 0
+ * and this CQ is about to be destroyed so I NOOP. */
+ kib_conn_t *conn = (kib_conn_t *)arg;
+ struct kib_sched_info *sched = conn->ibc_sched;
+ unsigned long flags;
- LASSERT (cq == conn->ibc_cq);
+ LASSERT(cq == conn->ibc_cq);
- cfs_spin_lock_irqsave(&kiblnd_data.kib_sched_lock, flags);
+ cfs_spin_lock_irqsave(&sched->ibs_lock, flags);
- conn->ibc_ready = 1;
+ conn->ibc_ready = 1;
- if (!conn->ibc_scheduled &&
- (conn->ibc_nrx > 0 ||
- conn->ibc_nsends_posted > 0)) {
- kiblnd_conn_addref(conn); /* +1 ref for sched_conns */
- conn->ibc_scheduled = 1;
- cfs_list_add_tail(&conn->ibc_sched_list,
- &kiblnd_data.kib_sched_conns);
- cfs_waitq_signal(&kiblnd_data.kib_sched_waitq);
- }
+ if (!conn->ibc_scheduled &&
+ (conn->ibc_nrx > 0 ||
+ conn->ibc_nsends_posted > 0)) {
+ kiblnd_conn_addref(conn); /* +1 ref for sched_conns */
+ conn->ibc_scheduled = 1;
+ cfs_list_add_tail(&conn->ibc_sched_list, &sched->ibs_conns);
+
+ if (cfs_waitq_active(&sched->ibs_waitq))
+ cfs_waitq_signal(&sched->ibs_waitq);
+ }
- cfs_spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, flags);
+ cfs_spin_unlock_irqrestore(&sched->ibs_lock, flags);
}
void
int
kiblnd_scheduler(void *arg)
{
- long id = (long)arg;
- cfs_waitlink_t wait;
- char name[16];
- unsigned long flags;
- kib_conn_t *conn;
- struct ib_wc wc;
- int rc;
- int did_something;
- int busy_loops = 0;
-
- snprintf(name, sizeof(name), "kiblnd_sd_%02ld", id);
- cfs_daemonize(name);
- cfs_block_allsigs();
+ long id = (long)arg;
+ struct kib_sched_info *sched;
+ kib_conn_t *conn;
+ cfs_waitlink_t wait;
+ unsigned long flags;
+ struct ib_wc wc;
+ char name[20];
+ int did_something;
+ int busy_loops = 0;
+ int rc;
- cfs_waitlink_init(&wait);
+ snprintf(name, sizeof(name), "kiblnd_sd_%02ld_%02ld",
+ KIB_THREAD_CPT(id), KIB_THREAD_TID(id));
- cfs_spin_lock_irqsave(&kiblnd_data.kib_sched_lock, flags);
+ cfs_daemonize(name);
+ cfs_block_allsigs();
- while (!kiblnd_data.kib_shutdown) {
- if (busy_loops++ >= IBLND_RESCHED) {
- cfs_spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock,
- flags);
+ cfs_waitlink_init(&wait);
- cfs_cond_resched();
- busy_loops = 0;
+ sched = kiblnd_data.kib_scheds[KIB_THREAD_CPT(id)];
- cfs_spin_lock_irqsave(&kiblnd_data.kib_sched_lock,
- flags);
- }
+ rc = cfs_cpt_bind(lnet_cpt_table(), sched->ibs_cpt);
+ if (rc != 0) {
+ CWARN("Failed to bind %s on CPT %d, please verify whether "
+ "all CPUs are healthy and reload modules if necessary, "
+		      "otherwise your system might be at risk of low "
+ "performance\n", name, sched->ibs_cpt);
+ }
- did_something = 0;
+ cfs_spin_lock_irqsave(&sched->ibs_lock, flags);
- if (!cfs_list_empty(&kiblnd_data.kib_sched_conns)) {
- conn = cfs_list_entry(kiblnd_data.kib_sched_conns.next,
- kib_conn_t, ibc_sched_list);
- /* take over kib_sched_conns' ref on conn... */
- LASSERT(conn->ibc_scheduled);
- cfs_list_del(&conn->ibc_sched_list);
- conn->ibc_ready = 0;
+ while (!kiblnd_data.kib_shutdown) {
+ if (busy_loops++ >= IBLND_RESCHED) {
+ cfs_spin_unlock_irqrestore(&sched->ibs_lock, flags);
- cfs_spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock,
- flags);
+ cfs_cond_resched();
+ busy_loops = 0;
+
+ cfs_spin_lock_irqsave(&sched->ibs_lock, flags);
+ }
+
+ did_something = 0;
+
+ if (!cfs_list_empty(&sched->ibs_conns)) {
+ conn = cfs_list_entry(sched->ibs_conns.next,
+ kib_conn_t, ibc_sched_list);
+ /* take over kib_sched_conns' ref on conn... */
+ LASSERT(conn->ibc_scheduled);
+ cfs_list_del(&conn->ibc_sched_list);
+ conn->ibc_ready = 0;
+
+ cfs_spin_unlock_irqrestore(&sched->ibs_lock, flags);
rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
if (rc == 0) {
libcfs_nid2str(conn->ibc_peer->ibp_nid), rc);
kiblnd_close_conn(conn, -EIO);
kiblnd_conn_decref(conn);
- cfs_spin_lock_irqsave(&kiblnd_data. \
- kib_sched_lock,
- flags);
- continue;
- }
-
- rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
- }
-
- if (rc < 0) {
- CWARN("%s: ib_poll_cq failed: %d, "
- "closing connection\n",
- libcfs_nid2str(conn->ibc_peer->ibp_nid),
- rc);
- kiblnd_close_conn(conn, -EIO);
- kiblnd_conn_decref(conn);
- cfs_spin_lock_irqsave(&kiblnd_data. \
- kib_sched_lock, flags);
- continue;
- }
-
- cfs_spin_lock_irqsave(&kiblnd_data.kib_sched_lock,
- flags);
-
- if (rc != 0 || conn->ibc_ready) {
- /* There may be another completion waiting; get
- * another scheduler to check while I handle
- * this one... */
- kiblnd_conn_addref(conn); /* +1 ref for sched_conns */
- cfs_list_add_tail(&conn->ibc_sched_list,
- &kiblnd_data.kib_sched_conns);
- cfs_waitq_signal(&kiblnd_data.kib_sched_waitq);
- } else {
- conn->ibc_scheduled = 0;
- }
-
- if (rc != 0) {
- cfs_spin_unlock_irqrestore(&kiblnd_data. \
- kib_sched_lock,
- flags);
-
- kiblnd_complete(&wc);
-
- cfs_spin_lock_irqsave(&kiblnd_data. \
- kib_sched_lock,
- flags);
+ cfs_spin_lock_irqsave(&sched->ibs_lock,
+ flags);
+ continue;
+ }
+
+ rc = ib_poll_cq(conn->ibc_cq, 1, &wc);
+ }
+
+ if (rc < 0) {
+ CWARN("%s: ib_poll_cq failed: %d, "
+ "closing connection\n",
+ libcfs_nid2str(conn->ibc_peer->ibp_nid),
+ rc);
+ kiblnd_close_conn(conn, -EIO);
+ kiblnd_conn_decref(conn);
+ cfs_spin_lock_irqsave(&sched->ibs_lock, flags);
+ continue;
+ }
+
+ cfs_spin_lock_irqsave(&sched->ibs_lock, flags);
+
+ if (rc != 0 || conn->ibc_ready) {
+ /* There may be another completion waiting; get
+ * another scheduler to check while I handle
+ * this one... */
+ /* +1 ref for sched_conns */
+ kiblnd_conn_addref(conn);
+ cfs_list_add_tail(&conn->ibc_sched_list,
+ &sched->ibs_conns);
+ if (cfs_waitq_active(&sched->ibs_waitq))
+ cfs_waitq_signal(&sched->ibs_waitq);
+ } else {
+ conn->ibc_scheduled = 0;
+ }
+
+ if (rc != 0) {
+ cfs_spin_unlock_irqrestore(&sched->ibs_lock,
+ flags);
+ kiblnd_complete(&wc);
+
+ cfs_spin_lock_irqsave(&sched->ibs_lock, flags);
}
kiblnd_conn_decref(conn); /* ...drop my ref from above */
if (did_something)
continue;
- cfs_set_current_state(CFS_TASK_INTERRUPTIBLE);
- cfs_waitq_add_exclusive(&kiblnd_data.kib_sched_waitq, &wait);
- cfs_spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, flags);
+ cfs_set_current_state(CFS_TASK_INTERRUPTIBLE);
+ cfs_waitq_add_exclusive(&sched->ibs_waitq, &wait);
+ cfs_spin_unlock_irqrestore(&sched->ibs_lock, flags);
- cfs_waitq_wait(&wait, CFS_TASK_INTERRUPTIBLE);
- busy_loops = 0;
+ cfs_waitq_wait(&wait, CFS_TASK_INTERRUPTIBLE);
+ busy_loops = 0;
- cfs_waitq_del(&kiblnd_data.kib_sched_waitq, &wait);
- cfs_set_current_state(CFS_TASK_RUNNING);
- cfs_spin_lock_irqsave(&kiblnd_data.kib_sched_lock, flags);
- }
+ cfs_waitq_del(&sched->ibs_waitq, &wait);
+ cfs_set_current_state(CFS_TASK_RUNNING);
+ cfs_spin_lock_irqsave(&sched->ibs_lock, flags);
+ }
- cfs_spin_unlock_irqrestore(&kiblnd_data.kib_sched_lock, flags);
+ cfs_spin_unlock_irqrestore(&sched->ibs_lock, flags);
- kiblnd_thread_fini();
- return (0);
+ kiblnd_thread_fini();
+ return 0;
}
int