/*
 * Copyright (C) 2004 Cluster File Systems, Inc.
 *
 * Copyright (C) 2009-2012 Cray, Inc.
 *
 *   Derived from work by Eric Barton
 *   Author: James Shimek
 *   Author: Nic Henke
 *
 *   This file is part of Lustre, http://www.lustre.org.
 *
 *   Lustre is free software; you can redistribute it and/or
 *   modify it under the terms of version 2 of the GNU General Public
 *   License as published by the Free Software Foundation.
 *
 *   Lustre is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with Lustre; if not, write to the Free Software
 *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 */

#include <linux/nmi.h>
#include <linux/pagemap.h>

#include "gnilnd.h"

/* this is useful when needed to debug wire corruption. */
static void
kgnilnd_dump_blob(int level, char *prefix, void *buf, int len)
{
	__u64 *ptr;

	ptr = (__u64 *) buf;

	while (len > 0) {
		if (len >= 32) {
			CDEBUG(level,
			       "%s 0x%p: 0x%16.16llx 0x%16.16llx 0x%16.16llx 0x%16.16llx\n",
			       prefix, ptr, *(ptr), *(ptr + 1), *(ptr + 2), *(ptr + 3));
			ptr += 4;
			len -= 32;
		} else if (len >= 16) {
			CDEBUG(level,
			       "%s 0x%p: 0x%16.16llx 0x%16.16llx\n",
			       prefix, ptr, *(ptr), *(ptr + 1));
			ptr += 2;
			len -= 16;
		} else {
			CDEBUG(level, "%s 0x%p: 0x%16.16llx\n",
			       prefix, ptr, *(ptr));
			ptr++;
			len -= 8;
		}
	}
}

static void
kgnilnd_dump_msg(int mask, kgn_msg_t *msg)
{
	CDEBUG(mask, "0x%8.8x 0x%4.4x 0x%4.4x 0x%16.16llx"
		" 0x%16.16llx 0x%8.8x 0x%4.4x 0x%4.4x 0x%8.8x\n",
		msg->gnm_magic, msg->gnm_version,
		msg->gnm_type, msg->gnm_srcnid,
		msg->gnm_connstamp, msg->gnm_seq,
		msg->gnm_cksum, msg->gnm_payload_cksum,
		msg->gnm_payload_len);
}

void
kgnilnd_schedule_device(kgn_device_t *dev)
{
	short         already_live = 0;

	/* we'll only want to wake if the scheduler thread
	 * has come around and set ready to zero */
	already_live = cmpxchg(&dev->gnd_ready, GNILND_DEV_IDLE, GNILND_DEV_IRQ);

	if (!already_live) {
		wake_up_all(&dev->gnd_waitq);
	}
	return;
}

void kgnilnd_schedule_device_timer(unsigned long arg)
{
	kgn_device_t *dev = (kgn_device_t *) arg;

	kgnilnd_schedule_device(dev);
}

void
kgnilnd_device_callback(__u32 devid, __u64 arg)
{
	kgn_device_t *dev;
	int           index = (int) arg;

	if (index >= kgnilnd_data.kgn_ndevs) {
		/* use _EMERG instead of an LBUG to prevent LBUG'ing in
		 * interrupt context. */
		LCONSOLE_EMERG("callback for unknown device %d->%d\n",
			       devid, index);
		return;
	}

	dev = &kgnilnd_data.kgn_devices[index];
	/* just basic sanity */
	if (dev->gnd_id == devid) {
		kgnilnd_schedule_device(dev);
	} else {
		LCONSOLE_EMERG("callback for bad device %d devid %d\n",
			       dev->gnd_id, devid);
	}
}
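/*
 * Illustrative sketch, not driver code: kgnilnd_schedule_device() above uses
 * cmpxchg() as a wake gate - only the caller that flips gnd_ready from
 * GNILND_DEV_IDLE to GNILND_DEV_IRQ pays for wake_up_all(); later callbacks
 * are no-ops until the scheduler thread sets gnd_ready back to IDLE.  A
 * minimal user-space analogue with C11 atomics (all names hypothetical):
 *
 *	#include <stdatomic.h>
 *	#include <stdio.h>
 *
 *	enum { DEV_IDLE, DEV_IRQ };
 *	static _Atomic int dev_ready = DEV_IDLE;
 *
 *	static void schedule_device(void)
 *	{
 *		int expect = DEV_IDLE;
 *
 *		// only the IDLE -> IRQ transition issues the wake
 *		if (atomic_compare_exchange_strong(&dev_ready, &expect, DEV_IRQ))
 *			printf("wake scheduler\n");
 *	}
 *
 *	int main(void)
 *	{
 *		schedule_device();			// wakes
 *		schedule_device();			// no-op, already armed
 *		atomic_store(&dev_ready, DEV_IDLE);	// scheduler re-arms
 *		schedule_device();			// wakes again
 *		return 0;
 *	}
 */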
/* sched_intent values:
 * < 0 : do not reschedule under any circumstances
 * == 0: reschedule if someone marked him WANTS_SCHED
 * > 0 : force a reschedule */
/* Return code 0 means it did not schedule the conn, 1
 * means it successfully scheduled the conn.
 */

int
kgnilnd_schedule_process_conn(kgn_conn_t *conn, int sched_intent)
{
	int     conn_sched;

	/* move back to IDLE but save previous state.
	 * if we see WANTS_SCHED, we'll call kgnilnd_schedule_conn and
	 * let the xchg there handle any racing callers to get it
	 * onto gnd_ready_conns */

	conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_IDLE);
	LASSERTF(conn_sched == GNILND_CONN_WANTS_SCHED ||
		 conn_sched == GNILND_CONN_PROCESS,
		 "conn %p after process in bad state: %d\n",
		 conn, conn_sched);

	if (sched_intent >= 0) {
		if ((sched_intent > 0 || (conn_sched == GNILND_CONN_WANTS_SCHED))) {
			return kgnilnd_schedule_conn_refheld(conn, 1);
		}
	}
	return 0;
}

/* Return of 0 for conn not scheduled, 1 returned if conn was scheduled or marked
 * as scheduled */

int
_kgnilnd_schedule_conn(kgn_conn_t *conn, const char *caller, int line, int refheld)
{
	kgn_device_t    *dev = conn->gnc_device;
	int              sched;
	int              rc;

	sched = xchg(&conn->gnc_scheduled, GNILND_CONN_WANTS_SCHED);
	/* we only care about the last person who marked want_sched since they
	 * are most likely the culprit
	 */
	memcpy(conn->gnc_sched_caller, caller, sizeof(conn->gnc_sched_caller));
	conn->gnc_sched_line = line;
	/* if we are IDLE, add to list - only one guy sees IDLE and "wins"
	 * the chance to put it onto gnd_ready_conns.
	 * otherwise, leave marked as WANTS_SCHED and the thread that "owns"
	 * the conn in process_conns will take care of moving it back to
	 * SCHED when it is done processing */

	if (sched == GNILND_CONN_IDLE) {
		/* if the conn is already scheduled, we've already requested
		 * the scheduler thread wakeup */
		if (!refheld) {
			/* Add a reference to the conn if we are not holding a reference
			 * already from the existing scheduler. We now use the same
			 * reference if we need to reschedule a conn while in a scheduler
			 * thread.
			 */
			kgnilnd_conn_addref(conn);
		}
		LASSERTF(list_empty(&conn->gnc_schedlist), "conn %p already sched state %d\n",
			 conn, sched);

		CDEBUG(D_INFO, "scheduling conn 0x%p caller %s:%d\n", conn, caller, line);

		spin_lock(&dev->gnd_lock);
		list_add_tail(&conn->gnc_schedlist, &dev->gnd_ready_conns);
		spin_unlock(&dev->gnd_lock);
		set_mb(conn->gnc_last_sched_ask, jiffies);
		rc = 1;
	} else {
		CDEBUG(D_INFO, "not scheduling conn 0x%p: %d caller %s:%d\n", conn, sched, caller, line);
		rc = 0;
	}

	/* make sure thread(s) going to process conns - but let it make
	 * separate decision from conn schedule */
	kgnilnd_schedule_device(dev);
	return rc;
}

void
kgnilnd_schedule_dgram(kgn_device_t *dev)
{
	int     wake;

	wake = xchg(&dev->gnd_dgram_ready, GNILND_DGRAM_SCHED);
	if (wake != GNILND_DGRAM_SCHED) {
		wake_up(&dev->gnd_dgram_waitq);
	} else {
		CDEBUG(D_NETTRACE, "not waking: %d\n", wake);
	}
}
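/*
 * Illustrative sketch, not driver code: the conn scheduling above is a
 * three-state machine driven entirely by xchg().  Scheduling always stamps
 * WANTS_SCHED; only the caller that saw IDLE queues the conn, and the worker
 * that xchg()s back to IDLE reschedules if anyone raced in meanwhile.  A
 * user-space analogue (names hypothetical):
 *
 *	#include <stdatomic.h>
 *	#include <stdio.h>
 *
 *	enum { IDLE, WANTS_SCHED, PROCESS };
 *	static _Atomic int state = IDLE;
 *
 *	static void schedule_conn(void)
 *	{
 *		// mirrors _kgnilnd_schedule_conn()
 *		if (atomic_exchange(&state, WANTS_SCHED) == IDLE)
 *			printf("queued for worker\n");	// we "won" IDLE
 *	}
 *
 *	static void process_done(void)
 *	{
 *		// mirrors kgnilnd_schedule_process_conn(), sched_intent == 0
 *		atomic_store(&state, PROCESS);		// worker owns it
 *		// ... work happens; a racing schedule_conn() may stamp
 *		// WANTS_SCHED over PROCESS here ...
 *		if (atomic_exchange(&state, IDLE) == WANTS_SCHED)
 *			schedule_conn();		// someone asked again
 *	}
 *
 *	int main(void) { schedule_conn(); process_done(); return 0; }
 */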
"empty" : "not empty"); atomic_dec(&kgnilnd_data.kgn_ntx); /* we only allocate this if we need to */ if (tx->tx_phys != NULL) { kmem_cache_free(kgnilnd_data.kgn_tx_phys_cache, tx->tx_phys); CDEBUG(D_MALLOC, "slab-freed 'tx_phys': %lu at %p.\n", LNET_MAX_IOV * sizeof(gni_mem_segment_t), tx->tx_phys); } /* Only free the buffer if we used it */ if (tx->tx_buffer_copy != NULL) { vfree(tx->tx_buffer_copy); tx->tx_buffer_copy = NULL; CDEBUG(D_MALLOC, "vfreed buffer2\n"); } #if 0 KGNILND_POISON(tx, 0x5a, sizeof(kgn_tx_t)); #endif CDEBUG(D_MALLOC, "slab-freed 'tx': %lu at %p.\n", sizeof(*tx), tx); kmem_cache_free(kgnilnd_data.kgn_tx_cache, tx); } kgn_tx_t * kgnilnd_alloc_tx (void) { kgn_tx_t *tx = NULL; if (CFS_FAIL_CHECK(CFS_FAIL_GNI_ALLOC_TX)) return tx; tx = kmem_cache_alloc(kgnilnd_data.kgn_tx_cache, GFP_ATOMIC); if (tx == NULL) { CERROR("failed to allocate tx\n"); return NULL; } CDEBUG(D_MALLOC, "slab-alloced 'tx': %lu at %p.\n", sizeof(*tx), tx); /* need this memset, cache alloc'd memory is not cleared */ memset(tx, 0, sizeof(*tx)); /* setup everything here to minimize time under the lock */ tx->tx_buftype = GNILND_BUF_NONE; tx->tx_msg.gnm_type = GNILND_MSG_NONE; INIT_LIST_HEAD(&tx->tx_list); INIT_LIST_HEAD(&tx->tx_map_list); tx->tx_list_state = GNILND_TX_ALLOCD; atomic_inc(&kgnilnd_data.kgn_ntx); return tx; } /* csum_fold needs to be run on the return value before shipping over the wire */ #define _kgnilnd_cksum(seed, ptr, nob) csum_partial(ptr, nob, seed) /* we don't use offset as every one is passing a buffer reference that already * includes the offset into the base address - * see kgnilnd_setup_virt_buffer and kgnilnd_setup_immediate_buffer */ static inline __u16 kgnilnd_cksum(void *ptr, size_t nob) { __u16 sum; sum = csum_fold(_kgnilnd_cksum(0, ptr, nob)); /* don't use magic 'no checksum' value */ if (sum == 0) sum = 1; CDEBUG(D_INFO, "cksum 0x%x for ptr 0x%p sz %zu\n", sum, ptr, nob); return sum; } inline __u16 kgnilnd_cksum_kiov(unsigned int nkiov, lnet_kiov_t *kiov, unsigned int offset, unsigned int nob, int dump_blob) { __wsum cksum = 0; __wsum tmpck; __u16 retsum; void *addr; unsigned int fraglen; int i, odd; LASSERT(nkiov > 0); LASSERT(nob > 0); CDEBUG(D_BUFFS, "calc cksum for kiov 0x%p nkiov %u offset %u nob %u, dump %d\n", kiov, nkiov, offset, nob, dump_blob); /* if loops changes, please change kgnilnd_setup_phys_buffer */ while (offset >= kiov->kiov_len) { offset -= kiov->kiov_len; nkiov--; kiov++; LASSERT(nkiov > 0); } /* ignore nob here, if nob < (kiov_len - offset), kiov == 1 */ odd = (unsigned long) (kiov[0].kiov_len - offset) & 1; if ((odd || *kgnilnd_tunables.kgn_vmap_cksum) && nkiov > 1) { struct page **pages = kgnilnd_data.kgn_cksum_map_pages[get_cpu()]; LASSERTF(pages != NULL, "NULL pages for cpu %d map_pages 0x%p\n", get_cpu(), kgnilnd_data.kgn_cksum_map_pages); CDEBUG(D_BUFFS, "odd %d len %u offset %u nob %u\n", odd, kiov[0].kiov_len, offset, nob); for (i = 0; i < nkiov; i++) { pages[i] = kiov[i].kiov_page; } addr = vmap(pages, nkiov, VM_MAP, PAGE_KERNEL); if (addr == NULL) { CNETERR("Couldn't vmap %d frags on %d bytes to avoid odd length fragment in cksum\n", nkiov, nob); /* return zero to avoid killing tx - we'll just get warning on console * when remote end sees zero checksum */ RETURN(0); } atomic_inc(&kgnilnd_data.kgn_nvmap_cksum); tmpck = _kgnilnd_cksum(0, (void *) addr + kiov[0].kiov_offset + offset, nob); cksum = tmpck; if (dump_blob) { kgnilnd_dump_blob(D_BUFFS, "flat kiov RDMA payload", (void *)addr + kiov[0].kiov_offset + offset, nob); } CDEBUG(D_BUFFS, 
"cksum 0x%x (+0x%x) for addr 0x%p+%u len %u offset %u\n", cksum, tmpck, addr, kiov[0].kiov_offset, nob, offset); vunmap(addr); } else { do { fraglen = min(kiov->kiov_len - offset, nob); /* make dang sure we don't send a bogus checksum if somehow we get * an odd length fragment on anything but the last entry in a kiov - * we know from kgnilnd_setup_rdma_buffer that we can't have non * PAGE_SIZE pages in the middle, so if nob < PAGE_SIZE, it is the last one */ LASSERTF(!(fraglen&1) || (nob < PAGE_SIZE), "odd fraglen %u on nkiov %d, nob %u kiov_len %u offset %u kiov 0x%p\n", fraglen, nkiov, nob, kiov->kiov_len, offset, kiov); addr = (void *)kmap(kiov->kiov_page) + kiov->kiov_offset + offset; tmpck = _kgnilnd_cksum(cksum, addr, fraglen); CDEBUG(D_BUFFS, "cksum 0x%x (+0x%x) for page 0x%p+%u (0x%p) len %u offset %u\n", cksum, tmpck, kiov->kiov_page, kiov->kiov_offset, addr, fraglen, offset); cksum = tmpck; if (dump_blob) kgnilnd_dump_blob(D_BUFFS, "kiov cksum", addr, fraglen); kunmap(kiov->kiov_page); kiov++; nkiov--; nob -= fraglen; offset = 0; /* iov must not run out before end of data */ LASSERTF(nob == 0 || nkiov > 0, "nob %u nkiov %u\n", nob, nkiov); } while (nob > 0); } retsum = csum_fold(cksum); /* don't use magic 'no checksum' value */ if (retsum == 0) retsum = 1; CDEBUG(D_BUFFS, "retsum 0x%x from cksum 0x%x\n", retsum, cksum); return retsum; } void kgnilnd_init_msg(kgn_msg_t *msg, int type, lnet_nid_t source) { msg->gnm_magic = GNILND_MSG_MAGIC; msg->gnm_version = GNILND_MSG_VERSION; msg->gnm_type = type; msg->gnm_payload_len = 0; msg->gnm_srcnid = source; /* gnm_connstamp gets set when FMA is sent */ /* gnm_srcnid is set on creation via function argument * The right interface/net and nid is passed in when the message * is created. */ } kgn_tx_t * kgnilnd_new_tx_msg(int type, lnet_nid_t source) { kgn_tx_t *tx = kgnilnd_alloc_tx(); if (tx != NULL) { kgnilnd_init_msg(&tx->tx_msg, type, source); } else { CERROR("couldn't allocate new tx type %s!\n", kgnilnd_msgtype2str(type)); } return tx; } static void kgnilnd_nak_rdma(kgn_conn_t *conn, int rx_type, int error, __u64 cookie, lnet_nid_t source) { kgn_tx_t *tx; int nak_type; switch (rx_type) { case GNILND_MSG_GET_REQ: case GNILND_MSG_GET_DONE: nak_type = GNILND_MSG_GET_NAK; break; case GNILND_MSG_PUT_REQ: case GNILND_MSG_PUT_ACK: case GNILND_MSG_PUT_DONE: nak_type = GNILND_MSG_PUT_NAK; break; case GNILND_MSG_PUT_REQ_REV: case GNILND_MSG_PUT_DONE_REV: nak_type = GNILND_MSG_PUT_NAK_REV; break; case GNILND_MSG_GET_REQ_REV: case GNILND_MSG_GET_ACK_REV: case GNILND_MSG_GET_DONE_REV: nak_type = GNILND_MSG_GET_NAK_REV; break; default: CERROR("invalid msg type %s (%d)\n", kgnilnd_msgtype2str(rx_type), rx_type); LBUG(); } /* only allow NAK on error and truncate to zero */ LASSERTF(error <= 0, "error %d conn 0x%p, cookie "LPU64"\n", error, conn, cookie); tx = kgnilnd_new_tx_msg(nak_type, source); if (tx == NULL) { CNETERR("can't get TX to NAK RDMA to %s\n", libcfs_nid2str(conn->gnc_peer->gnp_nid)); return; } tx->tx_msg.gnm_u.completion.gncm_retval = error; tx->tx_msg.gnm_u.completion.gncm_cookie = cookie; kgnilnd_queue_tx(conn, tx); } int kgnilnd_setup_immediate_buffer(kgn_tx_t *tx, unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int nob) { kgn_msg_t *msg = &tx->tx_msg; int i; /* To help save on MDDs for short messages, we'll vmap a kiov to allow * gni_smsg_send to send that as the payload */ LASSERT(tx->tx_buftype == GNILND_BUF_NONE); if (nob == 0) { tx->tx_buffer = NULL; } else if (kiov != NULL) { if ((niov > 
int
kgnilnd_setup_immediate_buffer(kgn_tx_t *tx, unsigned int niov, struct iovec *iov,
			       lnet_kiov_t *kiov, unsigned int offset, unsigned int nob)
{
	kgn_msg_t       *msg = &tx->tx_msg;
	int              i;

	/* To help save on MDDs for short messages, we'll vmap a kiov to allow
	 * gni_smsg_send to send that as the payload */

	LASSERT(tx->tx_buftype == GNILND_BUF_NONE);

	if (nob == 0) {
		tx->tx_buffer = NULL;
	} else if (kiov != NULL) {

		if ((niov > 0) && unlikely(niov > (nob/PAGE_SIZE))) {
			niov = ((nob + offset + PAGE_SIZE - 1) / PAGE_SIZE);
		}

		LASSERTF(niov > 0 && niov < GNILND_MAX_IMMEDIATE/PAGE_SIZE,
			"bad niov %d msg %p kiov %p iov %p offset %d nob %d\n",
			niov, msg, kiov, iov, offset, nob);

		while (offset >= kiov->kiov_len) {
			offset -= kiov->kiov_len;
			niov--;
			kiov++;
			LASSERT(niov > 0);
		}
		for (i = 0; i < niov; i++) {
			/* We can't have a kiov_offset on anything but the first entry,
			 * otherwise we'll have a hole at the end of the mapping as we only map
			 * whole pages.
			 * Also, if we have a kiov_len < PAGE_SIZE but we need to map more
			 * than kiov_len, we will also have a hole at the end of that page
			 * which isn't allowed */
			if ((kiov[i].kiov_offset != 0 && i > 0) ||
			    (kiov[i].kiov_offset + kiov[i].kiov_len != PAGE_SIZE && i < niov - 1)) {
				CNETERR("Can't make payload contiguous in I/O VM:"
				       "page %d, offset %u, nob %u, kiov_offset %u kiov_len %u\n",
				       i, offset, nob, kiov->kiov_offset, kiov->kiov_len);
				RETURN(-EINVAL);
			}
			tx->tx_imm_pages[i] = kiov[i].kiov_page;
		}

		/* hijack tx_phys for the later unmap */
		if (niov == 1) {
			/* tx->tx_phys being NULL is the signal for unmap to discern between kmap and vmap */
			tx->tx_phys = NULL;
			tx->tx_buffer = (void *)kmap(tx->tx_imm_pages[0]) +
					kiov[0].kiov_offset + offset;
			atomic_inc(&kgnilnd_data.kgn_nkmap_short);
			GNIDBG_TX(D_NET, tx, "kmapped page for %d bytes for kiov 0x%p, buffer 0x%p",
				nob, kiov, tx->tx_buffer);
		} else {
			tx->tx_phys = vmap(tx->tx_imm_pages, niov, VM_MAP, PAGE_KERNEL);
			if (tx->tx_phys == NULL) {
				CNETERR("Couldn't vmap %d frags on %d bytes\n", niov, nob);
				RETURN(-ENOMEM);
			}
			atomic_inc(&kgnilnd_data.kgn_nvmap_short);
			/* make sure we take into account the kiov offset as the start of the buffer */
			tx->tx_buffer = (void *)tx->tx_phys + kiov[0].kiov_offset + offset;
			GNIDBG_TX(D_NET, tx, "mapped %d pages for %d bytes from kiov 0x%p to 0x%p, buffer 0x%p",
				niov, nob, kiov, tx->tx_phys, tx->tx_buffer);
		}
		tx->tx_buftype = GNILND_BUF_IMMEDIATE_KIOV;
		tx->tx_nob = nob;

	} else {
		/* For now this is almost identical to kgnilnd_setup_virt_buffer, but we
		 * could "flatten" the payload into a single contiguous buffer ready
		 * for sending direct over an FMA if we ever needed to.
		 */
		LASSERT(niov > 0);

		while (offset >= iov->iov_len) {
			offset -= iov->iov_len;
			niov--;
			iov++;
			LASSERT(niov > 0);
		}

		if (nob > iov->iov_len - offset) {
			CERROR("Can't handle multiple vaddr fragments\n");
			return -EMSGSIZE;
		}

		tx->tx_buffer = (void *)(((unsigned long)iov->iov_base) + offset);

		tx->tx_buftype = GNILND_BUF_IMMEDIATE;
		tx->tx_nob = nob;
	}

	/* checksum payload early - it shouldn't be changing after lnd_send */
	if (*kgnilnd_tunables.kgn_checksum >= 2) {
		msg->gnm_payload_cksum = kgnilnd_cksum(tx->tx_buffer, nob);
		if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_CKSUM2)) {
			msg->gnm_payload_cksum += 0xe00e;
		}
		if (*kgnilnd_tunables.kgn_checksum_dump > 1) {
			kgnilnd_dump_blob(D_BUFFS, "payload checksum",
					  tx->tx_buffer, nob);
		}
	} else {
		msg->gnm_payload_cksum = 0;
	}

	return 0;
}

int
kgnilnd_setup_virt_buffer(kgn_tx_t *tx,
			  unsigned int niov, struct iovec *iov,
			  unsigned int offset, unsigned int nob)
{
	LASSERT(nob > 0);
	LASSERT(niov > 0);
	LASSERT(tx->tx_buftype == GNILND_BUF_NONE);

	while (offset >= iov->iov_len) {
		offset -= iov->iov_len;
		niov--;
		iov++;
		LASSERT(niov > 0);
	}

	if (nob > iov->iov_len - offset) {
		CERROR("Can't handle multiple vaddr fragments\n");
		return -EMSGSIZE;
	}

	tx->tx_buftype = GNILND_BUF_VIRT_UNMAPPED;
	tx->tx_nob = nob;
	tx->tx_buffer = (void *)(((unsigned long)iov->iov_base) + offset);
	return 0;
}
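/*
 * Illustrative sketch, not driver code: both the immediate-kiov path above
 * and kgnilnd_setup_phys_buffer below enforce the same rule for building one
 * contiguous mapping out of page fragments - only the first fragment may
 * start at a non-zero offset and only the last may stop short of PAGE_SIZE,
 * otherwise the mapping would contain holes.  Stand-alone form of the check
 * (hypothetical types, PAGE_SZ standing in for PAGE_SIZE):
 *
 *	struct frag { unsigned int offset, len; };
 *	#define PAGE_SZ 4096u
 *
 *	static int frags_contiguous(const struct frag *f, int n)
 *	{
 *		int i;
 *
 *		for (i = 0; i < n; i++) {
 *			if (i > 0 && f[i].offset != 0)
 *				return 0;	// hole before fragment i
 *			if (i < n - 1 &&
 *			    f[i].offset + f[i].len != PAGE_SZ)
 *				return 0;	// hole after fragment i
 *		}
 *		return 1;
 *	}
 */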
int
kgnilnd_setup_phys_buffer(kgn_tx_t *tx, int nkiov, lnet_kiov_t *kiov,
			  unsigned int offset, unsigned int nob)
{
	gni_mem_segment_t *phys;
	int              rc = 0;
	unsigned int     fraglen;

	GNIDBG_TX(D_NET, tx, "niov %d kiov 0x%p offset %u nob %u", nkiov, kiov, offset, nob);

	LASSERT(nob > 0);
	LASSERT(nkiov > 0);
	LASSERT(tx->tx_buftype == GNILND_BUF_NONE);

	/* only allocate this if we are going to use it */
	tx->tx_phys = kmem_cache_alloc(kgnilnd_data.kgn_tx_phys_cache,
				       GFP_ATOMIC);
	if (tx->tx_phys == NULL) {
		CERROR("failed to allocate tx_phys\n");
		rc = -ENOMEM;
		GOTO(error, rc);
	}

	CDEBUG(D_MALLOC, "slab-alloced 'tx->tx_phys': %lu at %p.\n",
	       LNET_MAX_IOV * sizeof(gni_mem_segment_t), tx->tx_phys);

	/* if loops changes, please change kgnilnd_cksum_kiov
	 * and kgnilnd_setup_immediate_buffer */

	while (offset >= kiov->kiov_len) {
		offset -= kiov->kiov_len;
		nkiov--;
		kiov++;
		LASSERT(nkiov > 0);
	}

	/* at this point, kiov points to the first page that we'll actually map
	 * now that we've seeked into the kiov for offset and dropped any
	 * leading pages that fall entirely within the offset */
	tx->tx_buftype = GNILND_BUF_PHYS_UNMAPPED;
	tx->tx_nob = nob;

	/* kiov_offset is start of 'valid' buffer, so index offset past that */
	tx->tx_buffer = (void *)((unsigned long)(kiov->kiov_offset + offset));
	phys = tx->tx_phys;

	CDEBUG(D_NET, "tx 0x%p buffer 0x%p map start kiov 0x%p+%u niov %d offset %u\n",
	       tx, tx->tx_buffer, kiov, kiov->kiov_offset, nkiov, offset);

	do {
		fraglen = min(kiov->kiov_len - offset, nob);

		/* We can't have a kiov_offset on anything but the first entry,
		 * otherwise we'll have a hole at the end of the mapping as we only map
		 * whole pages. Only the first page is allowed to have an offset -
		 * we'll add that into tx->tx_buffer and that will get used when we
		 * map in the segments (see kgnilnd_map_buffer).
		 * Also, if we have a kiov_len < PAGE_SIZE but we need to map more
		 * than kiov_len, we will also have a hole at the end of that page
		 * which isn't allowed */
		if ((phys != tx->tx_phys) &&
		    ((kiov->kiov_offset != 0) ||
		     ((kiov->kiov_len < PAGE_SIZE) && (nob > kiov->kiov_len)))) {
			CERROR("Can't make payload contiguous in I/O VM:"
			       "page %d, offset %u, nob %u, kiov_offset %u kiov_len %u\n",
			       (int)(phys - tx->tx_phys),
			       offset, nob, kiov->kiov_offset, kiov->kiov_len);
			rc = -EINVAL;
			GOTO(error, rc);
		}

		if ((phys - tx->tx_phys) == LNET_MAX_IOV) {
			CERROR("payload too big (%d)\n", (int)(phys - tx->tx_phys));
			rc = -EMSGSIZE;
			GOTO(error, rc);
		}

		if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PHYS_SETUP)) {
			rc = -EINVAL;
			GOTO(error, rc);
		}

		CDEBUG(D_BUFFS, "page 0x%p kiov_offset %u kiov_len %u nob %u "
			       "nkiov %u offset %u\n",
		       kiov->kiov_page, kiov->kiov_offset, kiov->kiov_len, nob, nkiov, offset);

		phys->address = page_to_phys(kiov->kiov_page);
		phys++;
		kiov++;
		nkiov--;
		nob -= fraglen;
		offset = 0;

		/* iov must not run out before end of data */
		LASSERTF(nob == 0 || nkiov > 0, "nob %u nkiov %u\n", nob, nkiov);

	} while (nob > 0);

	tx->tx_phys_npages = phys - tx->tx_phys;

	return 0;

error:
	if (tx->tx_phys != NULL) {
		kmem_cache_free(kgnilnd_data.kgn_tx_phys_cache, tx->tx_phys);
		CDEBUG(D_MALLOC, "slab-freed 'tx_phys': %lu at %p.\n",
		       sizeof(*tx->tx_phys), tx->tx_phys);
		tx->tx_phys = NULL;
	}
	return rc;
}

static inline int
kgnilnd_setup_rdma_buffer(kgn_tx_t *tx, unsigned int niov,
			  struct iovec *iov, lnet_kiov_t *kiov,
			  unsigned int offset, unsigned int nob)
{
	int     rc;

	LASSERTF((iov == NULL) != (kiov == NULL), "iov 0x%p, kiov 0x%p, tx 0x%p,"
		" offset %d, nob %d, niov %d\n"
		, iov, kiov, tx, offset, nob, niov);

	if (kiov != NULL) {
		rc = kgnilnd_setup_phys_buffer(tx, niov, kiov, offset, nob);
	} else {
		rc = kgnilnd_setup_virt_buffer(tx, niov, iov, offset, nob);
	}
	return rc;
}

/* kgnilnd_parse_lnet_rdma()
 * lntmsg - message passed in from lnet.
 * niov, kiov, offset - see lnd_t in lib-types.h for descriptions.
 * nob - actual number of bytes in this message.
 * put_len - It is possible for PUTs to have a different length than the
 *           length stored in lntmsg->msg_len since LNET can adjust this
 *           length based on its buffer size and offset.
 *           lnet_try_match_md() sets the mlength that we use to do the RDMA
 *           transfer.
 */
static void
kgnilnd_parse_lnet_rdma(lnet_msg_t *lntmsg, unsigned int *niov,
			unsigned int *offset, unsigned int *nob,
			lnet_kiov_t **kiov, int put_len)
{
	/* GETs are weird, see kgnilnd_send */
	if (lntmsg->msg_type == LNET_MSG_GET) {
		if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) {
			*kiov = NULL;
		} else {
			*kiov = lntmsg->msg_md->md_iov.kiov;
		}
		*niov = lntmsg->msg_md->md_niov;
		*nob = lntmsg->msg_md->md_length;
		*offset = 0;
	} else {
		*kiov = lntmsg->msg_kiov;
		*niov = lntmsg->msg_niov;
		*nob = put_len;
		*offset = lntmsg->msg_offset;
	}
}

static inline void
kgnilnd_compute_rdma_cksum(kgn_tx_t *tx, int put_len)
{
	unsigned int     niov, offset, nob;
	lnet_kiov_t     *kiov;
	lnet_msg_t      *lntmsg = tx->tx_lntmsg[0];
	int              dump_cksum = (*kgnilnd_tunables.kgn_checksum_dump > 1);

	GNITX_ASSERTF(tx, ((tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE) ||
			   (tx->tx_msg.gnm_type == GNILND_MSG_GET_DONE) ||
			   (tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE_REV) ||
			   (tx->tx_msg.gnm_type == GNILND_MSG_GET_DONE_REV) ||
			   (tx->tx_msg.gnm_type == GNILND_MSG_GET_ACK_REV) ||
			   (tx->tx_msg.gnm_type == GNILND_MSG_PUT_REQ_REV)),
		      "bad type %s", kgnilnd_msgtype2str(tx->tx_msg.gnm_type));

	if ((tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE_REV) ||
	    (tx->tx_msg.gnm_type == GNILND_MSG_GET_DONE_REV)) {
		tx->tx_msg.gnm_payload_cksum = 0;
		return;
	}

	if (*kgnilnd_tunables.kgn_checksum < 3) {
		tx->tx_msg.gnm_payload_cksum = 0;
		return;
	}

	GNITX_ASSERTF(tx, lntmsg, "no LNet message!", NULL);

	kgnilnd_parse_lnet_rdma(lntmsg, &niov, &offset, &nob, &kiov,
				put_len);

	if (kiov != NULL) {
		tx->tx_msg.gnm_payload_cksum = kgnilnd_cksum_kiov(niov, kiov, offset, nob, dump_cksum);
	} else {
		tx->tx_msg.gnm_payload_cksum = kgnilnd_cksum(tx->tx_buffer, nob);
		if (dump_cksum) {
			kgnilnd_dump_blob(D_BUFFS, "peer RDMA payload", tx->tx_buffer, nob);
		}
	}

	if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_CKSUM3)) {
		tx->tx_msg.gnm_payload_cksum += 0xd00d;
	}
}

/* kgnilnd_verify_rdma_cksum()
 * tx - PUT_DONE/GET_DONE matched tx.
 * rx_cksum - received checksum to compare against.
 * put_len - see kgnilnd_parse_lnet_rdma comments.
 */
static inline int
kgnilnd_verify_rdma_cksum(kgn_tx_t *tx, __u16 rx_cksum, int put_len)
{
	int              rc = 0;
	__u16            cksum;
	unsigned int     niov, offset, nob;
	lnet_kiov_t     *kiov;
	lnet_msg_t      *lntmsg = tx->tx_lntmsg[0];
	int              dump_on_err = *kgnilnd_tunables.kgn_checksum_dump;

	/* we can only match certain requests */
	GNITX_ASSERTF(tx, ((tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) ||
			   (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK) ||
			   (tx->tx_msg.gnm_type == GNILND_MSG_PUT_REQ_REV) ||
			   (tx->tx_msg.gnm_type == GNILND_MSG_GET_ACK_REV) ||
			   (tx->tx_msg.gnm_type == GNILND_MSG_GET_DONE_REV) ||
			   (tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE_REV)),
		      "bad type %s", kgnilnd_msgtype2str(tx->tx_msg.gnm_type));

	if ((tx->tx_msg.gnm_type == GNILND_MSG_PUT_REQ_REV) ||
	    (tx->tx_msg.gnm_type == GNILND_MSG_GET_ACK_REV)) {
		return 0;
	}

	if (rx_cksum == 0) {
		if (*kgnilnd_tunables.kgn_checksum >= 3) {
			GNIDBG_MSG(D_WARNING, &tx->tx_msg,
				   "no RDMA payload checksum when enabled");
		}
		return 0;
	}

	GNITX_ASSERTF(tx, lntmsg, "no LNet message!", NULL);

	kgnilnd_parse_lnet_rdma(lntmsg, &niov, &offset, &nob, &kiov, put_len);

	if (kiov != NULL) {
		cksum = kgnilnd_cksum_kiov(niov, kiov, offset, nob, 0);
	} else {
		cksum = kgnilnd_cksum(tx->tx_buffer, nob);
	}

	if (cksum != rx_cksum) {
		GNIDBG_MSG(D_NETERROR, &tx->tx_msg,
			   "Bad RDMA payload checksum (%x expected %x); "
			   "kiov 0x%p niov %d nob %u offset %u",
			    cksum, rx_cksum, kiov, niov, nob, offset);
		switch (dump_on_err) {
		case 2:
			if (kiov != NULL) {
				kgnilnd_cksum_kiov(niov, kiov, offset, nob, 1);
			} else {
				kgnilnd_dump_blob(D_BUFFS, "RDMA payload",
						  tx->tx_buffer, nob);
			}
			/* fall through to dump log */
		case 1:
			libcfs_debug_dumplog();
			break;
		default:
			break;
		}
		rc = -ENOKEY;
		/* kgnilnd_check_fma_rx will close conn, kill tx with error */
	}
	return rc;
}

void
kgnilnd_mem_add_map_list(kgn_device_t *dev, kgn_tx_t *tx)
{
	int     bytes;

	GNITX_ASSERTF(tx, list_empty(&tx->tx_map_list),
		      "already mapped!", NULL);

	spin_lock(&dev->gnd_map_lock);
	switch (tx->tx_buftype) {
	default:
		GNIDBG_TX(D_EMERG, tx,
			"SOFTWARE BUG: invalid mapping %d", tx->tx_buftype);
		spin_unlock(&dev->gnd_map_lock);
		LBUG();
		break;

	case GNILND_BUF_PHYS_MAPPED:
		bytes = tx->tx_phys_npages * PAGE_SIZE;
		dev->gnd_map_nphys++;
		dev->gnd_map_physnop += tx->tx_phys_npages;
		break;

	case GNILND_BUF_VIRT_MAPPED:
		bytes = tx->tx_nob;
		dev->gnd_map_nvirt++;
		dev->gnd_map_virtnob += tx->tx_nob;
		break;
	}

	if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK ||
	    tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) {
		atomic64_add(bytes, &dev->gnd_rdmaq_bytes_out);
		GNIDBG_TX(D_NETTRACE, tx, "rdma ++ %d to "LPD64"",
			  bytes, atomic64_read(&dev->gnd_rdmaq_bytes_out));
	}

	atomic_inc(&dev->gnd_n_mdd);
	atomic64_add(bytes, &dev->gnd_nbytes_map);

	/* clear retrans to prevent any SMSG goofiness as that code uses the same counter */
	tx->tx_retrans = 0;

	/* we only get here in the valid cases */
	list_add_tail(&tx->tx_map_list, &dev->gnd_map_list);
	dev->gnd_map_version++;
	spin_unlock(&dev->gnd_map_lock);
}
void
kgnilnd_mem_del_map_list(kgn_device_t *dev, kgn_tx_t *tx)
{
	int     bytes;

	GNITX_ASSERTF(tx, !list_empty(&tx->tx_map_list),
		      "not mapped!", NULL);
	spin_lock(&dev->gnd_map_lock);

	switch (tx->tx_buftype) {
	default:
		GNIDBG_TX(D_EMERG, tx,
			"SOFTWARE BUG: invalid mapping %d", tx->tx_buftype);
		spin_unlock(&dev->gnd_map_lock);
		LBUG();
		break;

	case GNILND_BUF_PHYS_UNMAPPED:
		bytes = tx->tx_phys_npages * PAGE_SIZE;
		dev->gnd_map_nphys--;
		dev->gnd_map_physnop -= tx->tx_phys_npages;
		break;

	case GNILND_BUF_VIRT_UNMAPPED:
		bytes = tx->tx_nob;
		dev->gnd_map_nvirt--;
		dev->gnd_map_virtnob -= tx->tx_nob;
		break;
	}

	if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK ||
	    tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) {
		atomic64_sub(bytes, &dev->gnd_rdmaq_bytes_out);
		LASSERTF(atomic64_read(&dev->gnd_rdmaq_bytes_out) >= 0,
			 "bytes_out negative! %ld\n",
			 atomic64_read(&dev->gnd_rdmaq_bytes_out));
		GNIDBG_TX(D_NETTRACE, tx, "rdma -- %d to "LPD64"",
			  bytes, atomic64_read(&dev->gnd_rdmaq_bytes_out));
	}

	atomic_dec(&dev->gnd_n_mdd);
	atomic64_sub(bytes, &dev->gnd_nbytes_map);

	/* we only get here in the valid cases */
	list_del_init(&tx->tx_map_list);
	dev->gnd_map_version++;
	spin_unlock(&dev->gnd_map_lock);
}

int
kgnilnd_map_buffer(kgn_tx_t *tx)
{
	kgn_conn_t       *conn = tx->tx_conn;
	kgn_device_t     *dev = conn->gnc_device;
	__u32             flags = GNI_MEM_READWRITE;
	gni_return_t      rrc;

	/* The kgnilnd_mem_register(_segments) Gemini Driver functions can
	 * be called concurrently as there are internal locks that protect
	 * any data structures or HW resources. We just need to ensure
	 * that our concurrency doesn't result in the kgn_device_t
	 * getting nuked while we are in here */

	LASSERTF(conn != NULL, "tx %p with NULL conn, someone forgot"
		" to set tx_conn before calling %s\n", tx, __FUNCTION__);

	if (unlikely(CFS_FAIL_CHECK(CFS_FAIL_GNI_MAP_TX)))
		RETURN(-ENOMEM);

	if (*kgnilnd_tunables.kgn_bte_relaxed_ordering) {
		flags |= GNI_MEM_RELAXED_PI_ORDERING;
	}

	switch (tx->tx_buftype) {
	default:
		LBUG();

	case GNILND_BUF_NONE:
	case GNILND_BUF_IMMEDIATE:
	case GNILND_BUF_IMMEDIATE_KIOV:
	case GNILND_BUF_PHYS_MAPPED:
	case GNILND_BUF_VIRT_MAPPED:
		return 0;

	case GNILND_BUF_PHYS_UNMAPPED:
		GNITX_ASSERTF(tx, tx->tx_phys != NULL, "physical buffer not there!", NULL);
		rrc = kgnilnd_mem_register_segments(dev->gnd_handle,
					tx->tx_phys, tx->tx_phys_npages, NULL,
					GNI_MEM_PHYS_SEGMENTS | flags,
					&tx->tx_map_key);
		/* could race with other uses of the map counts, but this is ok
		 * - this needs to turn into a non-fatal error soon to allow
		 * GART resource, etc starvation handling */
		if (rrc != GNI_RC_SUCCESS) {
			GNIDBG_TX(D_NET, tx, "Can't map %d pages: dev %d "
				"phys %u pp %u, virt %u nob "LPU64"",
				tx->tx_phys_npages, dev->gnd_id,
				dev->gnd_map_nphys, dev->gnd_map_physnop,
				dev->gnd_map_nvirt, dev->gnd_map_virtnob);
			RETURN(rrc == GNI_RC_ERROR_RESOURCE ? -ENOMEM : -EINVAL);
		}

		tx->tx_buftype = GNILND_BUF_PHYS_MAPPED;
		kgnilnd_mem_add_map_list(dev, tx);
		return 0;
	case GNILND_BUF_VIRT_UNMAPPED:
		rrc = kgnilnd_mem_register(dev->gnd_handle,
			(__u64)tx->tx_buffer, tx->tx_nob,
			NULL, flags, &tx->tx_map_key);
		if (rrc != GNI_RC_SUCCESS) {
			GNIDBG_TX(D_NET, tx, "Can't map %u bytes: dev %d "
				"phys %u pp %u, virt %u nob "LPU64"",
				tx->tx_nob, dev->gnd_id,
				dev->gnd_map_nphys, dev->gnd_map_physnop,
				dev->gnd_map_nvirt, dev->gnd_map_virtnob);
			RETURN(rrc == GNI_RC_ERROR_RESOURCE ? -ENOMEM : -EINVAL);
		}

		tx->tx_buftype = GNILND_BUF_VIRT_MAPPED;
		kgnilnd_mem_add_map_list(dev, tx);
		if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_ACK ||
		    tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) {
			atomic64_add(tx->tx_nob, &dev->gnd_rdmaq_bytes_out);
			GNIDBG_TX(D_NETTRACE, tx, "rdma ++ %d to %ld\n",
				tx->tx_nob, atomic64_read(&dev->gnd_rdmaq_bytes_out));
		}
		return 0;
	}
}

void
kgnilnd_add_purgatory_tx(kgn_tx_t *tx)
{
	kgn_conn_t              *conn = tx->tx_conn;
	kgn_mdd_purgatory_t     *gmp;

	LIBCFS_ALLOC(gmp, sizeof(*gmp));
	LASSERTF(gmp != NULL, "couldn't allocate MDD purgatory member;"
		" asserting to avoid data corruption\n");
	if (tx->tx_buffer_copy)
		gmp->gmp_map_key = tx->tx_buffer_copy_map_key;
	else
		gmp->gmp_map_key = tx->tx_map_key;

	atomic_inc(&conn->gnc_device->gnd_n_mdd_held);

	/* ensure that we don't have a blank purgatory - indicating the
	 * conn is not already on purgatory lists - we'd never recover these
	 * MDD if that were the case */
	GNITX_ASSERTF(tx, conn->gnc_in_purgatory,
		"conn 0x%p->%s with NULL purgatory",
		conn, libcfs_nid2str(conn->gnc_peer->gnp_nid));

	/* link 'er up! - only place we really need to lock for
	 * concurrent access */
	spin_lock(&conn->gnc_list_lock);
	list_add_tail(&gmp->gmp_list, &conn->gnc_mdd_list);
	spin_unlock(&conn->gnc_list_lock);
}

void
kgnilnd_unmap_buffer(kgn_tx_t *tx, int error)
{
	kgn_device_t     *dev;
	gni_return_t      rrc;
	int               hold_timeout = 0;

	/* code below relies on +1 relationship ... */
	CLASSERT(GNILND_BUF_PHYS_MAPPED == (GNILND_BUF_PHYS_UNMAPPED + 1));
	CLASSERT(GNILND_BUF_VIRT_MAPPED == (GNILND_BUF_VIRT_UNMAPPED + 1));

	switch (tx->tx_buftype) {
	default:
		LBUG();

	case GNILND_BUF_NONE:
	case GNILND_BUF_IMMEDIATE:
	case GNILND_BUF_PHYS_UNMAPPED:
	case GNILND_BUF_VIRT_UNMAPPED:
		break;
	case GNILND_BUF_IMMEDIATE_KIOV:
		if (tx->tx_phys != NULL) {
			vunmap(tx->tx_phys);
		} else if (tx->tx_phys == NULL && tx->tx_buffer != NULL) {
			kunmap(tx->tx_imm_pages[0]);
		}
		/* clear to prevent kgnilnd_free_tx from thinking
		 * this is a RDMA descriptor */
		tx->tx_phys = NULL;
		break;

	case GNILND_BUF_PHYS_MAPPED:
	case GNILND_BUF_VIRT_MAPPED:
		LASSERT(tx->tx_conn != NULL);

		dev = tx->tx_conn->gnc_device;

		/* only want to hold if we are closing conn without
		 * verified peer notification - the theory is that
		 * a TX error can be communicated in all other cases */
		if (tx->tx_conn->gnc_state != GNILND_CONN_ESTABLISHED &&
		    error != -GNILND_NOPURG &&
		    kgnilnd_check_purgatory_conn(tx->tx_conn)) {
			kgnilnd_add_purgatory_tx(tx);

			/* The timeout we give to kgni is a deadman stop only.
			 * we are setting high to ensure we don't have the kgni timer
			 * fire before ours fires _and_ is handled */
			hold_timeout = GNILND_TIMEOUT2DEADMAN;

			GNIDBG_TX(D_NET, tx,
				 "dev %p delaying MDD release for %dms key "LPX64"."LPX64"",
				 tx->tx_conn->gnc_device, hold_timeout,
				 tx->tx_map_key.qword1, tx->tx_map_key.qword2);
		}
		if (tx->tx_buffer_copy != NULL) {
			rrc = kgnilnd_mem_deregister(dev->gnd_handle, &tx->tx_buffer_copy_map_key, hold_timeout);
			LASSERTF(rrc == GNI_RC_SUCCESS, "rrc %d\n", rrc);
			rrc = kgnilnd_mem_deregister(dev->gnd_handle, &tx->tx_map_key, 0);
			LASSERTF(rrc == GNI_RC_SUCCESS, "rrc %d\n", rrc);
		} else {
			rrc = kgnilnd_mem_deregister(dev->gnd_handle, &tx->tx_map_key, hold_timeout);
			LASSERTF(rrc == GNI_RC_SUCCESS, "rrc %d\n", rrc);
		}

		tx->tx_buftype--;
		kgnilnd_mem_del_map_list(dev, tx);
		break;
	}
}

void
kgnilnd_tx_done(kgn_tx_t *tx, int completion)
{
	lnet_msg_t      *lntmsg0, *lntmsg1;
	int              status0, status1;
	lnet_ni_t       *ni = NULL;
	kgn_conn_t      *conn = tx->tx_conn;

	LASSERT(!in_interrupt());

	lntmsg0 = tx->tx_lntmsg[0]; tx->tx_lntmsg[0] = NULL;
	lntmsg1 = tx->tx_lntmsg[1]; tx->tx_lntmsg[1] = NULL;

	if (completion &&
	    !(tx->tx_state & GNILND_TX_QUIET_ERROR) &&
	    !kgnilnd_conn_clean_errno(completion)) {
		GNIDBG_TOMSG(D_NETERROR, &tx->tx_msg,
		       "error %d on tx 0x%p->%s id %u/%d state %s age %ds",
		       completion, tx, conn ?
		       libcfs_nid2str(conn->gnc_peer->gnp_nid) : "",
		       tx->tx_id.txe_smsg_id, tx->tx_id.txe_idx,
		       kgnilnd_tx_state2str(tx->tx_list_state),
		       cfs_duration_sec((unsigned long)jiffies - tx->tx_qtime));
	}

	/* The error codes determine if we hold onto the MDD */
	kgnilnd_unmap_buffer(tx, completion);

	/* we have to deliver a reply on lntmsg[1] for the GET, so make sure
	 * we play nice with the error codes to avoid delivering a failed
	 * REQUEST and then a REPLY event as well */

	/* return -EIO to lnet - it is the magic value for failed sends */
	if (tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ) {
		status0 = 0;
		status1 = completion;
	} else {
		status0 = status1 = completion;
	}

	tx->tx_buftype = GNILND_BUF_NONE;
	tx->tx_msg.gnm_type = GNILND_MSG_NONE;

	/* lnet_finalize doesn't do anything with the *ni, so ok for us to
	 * set NULL when we are a tx without a conn */
	if (conn != NULL) {
		ni = conn->gnc_peer->gnp_net->gnn_ni;

		spin_lock(&conn->gnc_tx_lock);

		LASSERTF(test_and_clear_bit(tx->tx_id.txe_idx,
			(volatile unsigned long *)&conn->gnc_tx_bits),
			"conn %p tx %p bit %d already cleared\n",
			conn, tx, tx->tx_id.txe_idx);

		LASSERTF(conn->gnc_tx_ref_table[tx->tx_id.txe_idx] != NULL,
			 "msg_id %d already NULL\n", tx->tx_id.txe_idx);
		conn->gnc_tx_ref_table[tx->tx_id.txe_idx] = NULL;

		spin_unlock(&conn->gnc_tx_lock);
	}

	kgnilnd_free_tx(tx);

	/* finalize AFTER freeing lnet msgs */

	/* warning - we should hold no locks here - calling lnet_finalize
	 * could free up lnet credits, resulting in a call chain back into
	 * the LND via kgnilnd_send and friends */

	lnet_finalize(ni, lntmsg0, status0);

	if (lntmsg1 != NULL) {
		lnet_finalize(ni, lntmsg1, status1);
	}
}

void
kgnilnd_txlist_done(struct list_head *txlist, int error)
{
	kgn_tx_t        *tx, *txn;
	int              err_printed = 0;

	if (list_empty(txlist))
		return;

	list_for_each_entry_safe(tx, txn, txlist, tx_list) {
		/* only print the first error */
		if (err_printed)
			tx->tx_state |= GNILND_TX_QUIET_ERROR;
		list_del_init(&tx->tx_list);
		kgnilnd_tx_done(tx, error);
		err_printed++;
	}
}
int
kgnilnd_set_tx_id(kgn_tx_t *tx, kgn_conn_t *conn)
{
	int     id;

	spin_lock(&conn->gnc_tx_lock);

	/* ID zero is NOT ALLOWED!!! */

search_again:
	id = find_next_zero_bit((unsigned long *)&conn->gnc_tx_bits,
				 GNILND_MAX_MSG_ID, conn->gnc_next_tx);
	if (id == GNILND_MAX_MSG_ID) {
		if (conn->gnc_next_tx != 1) {
			/* we only searched from next_tx to end and didn't find
			 * one, so search again from start */
			conn->gnc_next_tx = 1;
			goto search_again;
		}
		/* couldn't find one! */
		spin_unlock(&conn->gnc_tx_lock);
		return -E2BIG;
	}

	/* bump next_tx to prevent immediate reuse */
	conn->gnc_next_tx = id + 1;

	set_bit(id, (volatile unsigned long *)&conn->gnc_tx_bits);
	LASSERTF(conn->gnc_tx_ref_table[id] == NULL, "tx 0x%p already at id %d\n",
		 conn->gnc_tx_ref_table[id], id);

	/* delay these until we have a valid ID - prevents bad clear of the bit
	 * in kgnilnd_tx_done */
	tx->tx_conn = conn;
	tx->tx_id.txe_cqid = conn->gnc_cqid;

	tx->tx_id.txe_idx = id;
	conn->gnc_tx_ref_table[id] = tx;

	/* Using jiffies to help differentiate against TX reuse - with
	 * the usual minimum of a 250HZ clock, we wrap jiffies on the same TX
	 * if we are sending to the same node faster than 256000/sec.
	 * To help guard against this, we OR in the tx_seq - that is 32 bits */

	tx->tx_id.txe_chips = (__u32)(jiffies | atomic_read(&conn->gnc_tx_seq));

	GNIDBG_TX(D_NET, tx, "set cookie/id/bits", NULL);

	spin_unlock(&conn->gnc_tx_lock);
	return 0;
}
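/*
 * Illustrative sketch, not driver code: kgnilnd_set_tx_id() above scans the
 * bitmap from gnc_next_tx to the end and wraps to 1 (never 0) exactly once
 * before giving up, so IDs are handed out round-robin and reuse is delayed
 * as long as possible.  A user-space analogue over a small bitmap
 * (hypothetical names):
 *
 *	#define MAX_ID 64
 *	static unsigned long long bits;		// bit i set => id i in use
 *	static int next_id = 1;			// id 0 is never allowed
 *
 *	static int alloc_id(void)
 *	{
 *		int pass, id;
 *
 *		for (pass = 0; pass < 2; pass++) {
 *			for (id = next_id; id < MAX_ID; id++) {
 *				if (!(bits & (1ULL << id))) {
 *					bits |= 1ULL << id;
 *					next_id = id + 1;
 *					return id;
 *				}
 *			}
 *			next_id = 1;	// wrap once, skipping id 0
 *		}
 *		return -1;		// table full (driver returns -E2BIG)
 *	}
 */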
static inline int
kgnilnd_tx_should_retry(kgn_conn_t *conn, kgn_tx_t *tx)
{
	int     max_retrans = *kgnilnd_tunables.kgn_max_retransmits;
	int     log_retrans;
	int     log_retrans_level;

	/* I need kgni credits to send this.  Replace tx at the head of the
	 * fmaq and I'll get rescheduled when credits appear */
	tx->tx_state = 0;
	tx->tx_retrans++;
	conn->gnc_tx_retrans++;
	log_retrans = ((tx->tx_retrans < 25) || ((tx->tx_retrans % 25) == 0) ||
			(tx->tx_retrans > (max_retrans / 2)));
	log_retrans_level = tx->tx_retrans < (max_retrans / 2) ? D_NET : D_NETERROR;

	/* Decision time - either error, warn or just retransmit */

	/* we don't care about TX timeout - it could be that the network is slower
	 * or throttled. We'll keep retransmitting - so if the network is so slow
	 * that we fill up our mailbox, we'll keep trying to resend that msg
	 * until we exceed the max_retrans _or_ gnc_last_rx expires, indicating
	 * that he hasn't sent us any traffic in return */

	if (tx->tx_retrans > max_retrans) {
		/* this means we are not backing off the retransmits
		 * in a healthy manner and are likely chewing up the
		 * CPU cycles quite badly */
		GNIDBG_TOMSG(D_ERROR, &tx->tx_msg,
			"SOFTWARE BUG: too many retransmits (%d) for tx id %x "
			"conn 0x%p->%s\n",
			tx->tx_retrans, tx->tx_id, conn,
			libcfs_nid2str(conn->gnc_peer->gnp_nid));

		/* yes - double errors to help debug this condition */
		GNIDBG_TOMSG(D_NETERROR, &tx->tx_msg, "connection dead. "
			"unable to send to %s for %lu secs (%d tries)",
			libcfs_nid2str(tx->tx_conn->gnc_peer->gnp_nid),
			cfs_duration_sec(jiffies - tx->tx_cred_wait),
			tx->tx_retrans);

		kgnilnd_close_conn(conn, -ETIMEDOUT);

		/* caller should terminate */
		RETURN(0);
	} else {
		/* some reasonable throttling of the debug message */
		if (log_retrans) {
			unsigned long now = jiffies;
			/* XXX Nic: Mystical TX debug here... */
			GNIDBG_SMSG_CREDS(log_retrans_level, conn);
			GNIDBG_TOMSG(log_retrans_level, &tx->tx_msg,
				"NOT_DONE on conn 0x%p->%s id %x retrans %d wait %dus"
				" last_msg %uus/%uus last_cq %uus/%uus",
				conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
				tx->tx_id, tx->tx_retrans,
				jiffies_to_usecs(now - tx->tx_cred_wait),
				jiffies_to_usecs(now - conn->gnc_last_tx),
				jiffies_to_usecs(now - conn->gnc_last_rx),
				jiffies_to_usecs(now - conn->gnc_last_tx_cq),
				jiffies_to_usecs(now - conn->gnc_last_rx_cq));
		}
		/* caller should retry */
		RETURN(1);
	}
}
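/*
 * Illustrative sketch, not driver code: the log_retrans predicate above
 * prints the first 25 retransmits, then every 25th, then everything once the
 * count passes half of max_retrans.  Its behaviour in isolation:
 *
 *	static int should_log(int retrans, int max_retrans)
 *	{
 *		return retrans < 25 ||		  // early attempts: log all
 *		       retrans % 25 == 0 ||	  // then sample every 25th
 *		       retrans > max_retrans / 2; // getting close: log all
 *	}
 *
 * With max_retrans = 1024 that logs attempts 1..25, 50, 75, ... and then
 * every attempt past 512.
 */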
/* caller must be holding gnd_cq_mutex and not unlock it afterwards, as we need to drop it
 * to avoid bad ordering with state_lock */
static inline int
kgnilnd_sendmsg_nolock(kgn_tx_t *tx, void *immediate, unsigned int immediatenob,
		spinlock_t *state_lock, kgn_tx_list_state_t state)
{
	kgn_conn_t      *conn = tx->tx_conn;
	kgn_msg_t       *msg = &tx->tx_msg;
	int              retry_send;
	gni_return_t     rrc;
	unsigned long    newest_last_rx, timeout;
	unsigned long    now;

	LASSERTF((msg->gnm_type == GNILND_MSG_IMMEDIATE) ?
		immediatenob <= *kgnilnd_tunables.kgn_max_immediate :
		immediatenob == 0,
		"msg 0x%p type %d wrong payload size %d\n",
		msg, msg->gnm_type, immediatenob);

	/* make sure we catch all the cases where we'd send on a dirty old mbox
	 * but allow case for sending CLOSE. Since this check is within the CQ
	 * mutex barrier and the close message is only sent through
	 * kgnilnd_send_conn_close the last message out the door will be the
	 * close message.
	 */
	if (atomic_read(&conn->gnc_peer->gnp_dirty_eps) != 0 && msg->gnm_type != GNILND_MSG_CLOSE) {
		kgnilnd_conn_mutex_unlock(&conn->gnc_smsg_mutex);
		kgnilnd_gl_mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
		/* Return -ETIME, we are closing the connection already so we don't want to
		 * have this tx hit the wire. The tx will be killed by the calling function.
		 * Once the EP is marked dirty the close message will be the last
		 * thing to hit the wire */
		return -ETIME;
	}

	now = jiffies;
	timeout = cfs_time_seconds(conn->gnc_timeout);

	newest_last_rx = GNILND_LASTRX(conn);

	if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SEND_TIMEOUT)) {
		now = now + (GNILND_TIMEOUTRX(timeout) * 2);
	}

	if (time_after_eq(now, newest_last_rx + GNILND_TIMEOUTRX(timeout))) {
		GNIDBG_CONN(D_NETERROR|D_CONSOLE, conn,
			"Can't send to %s after timeout lapse of %lu; TO %lu\n",
			libcfs_nid2str(conn->gnc_peer->gnp_nid),
			cfs_duration_sec(now - newest_last_rx),
			cfs_duration_sec(GNILND_TIMEOUTRX(timeout)));
		kgnilnd_conn_mutex_unlock(&conn->gnc_smsg_mutex);
		kgnilnd_gl_mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
		return -ETIME;
	}

	GNITX_ASSERTF(tx, (conn != NULL) && (tx->tx_id.txe_idx != 0), "tx id unset!", NULL);
	/* msg->gnm_srcnid is set when the message is initialized by whatever function is
	 * creating the message; this allows the message to contain the correct LNET NID/NET needed
	 * instead of the one that the peer/conn uses for sending the data.
	 */
	msg->gnm_connstamp = conn->gnc_my_connstamp;
	msg->gnm_payload_len = immediatenob;
	msg->gnm_seq = atomic_read(&conn->gnc_tx_seq);

	/* always init here - kgn_checksum is a /sys module tunable
	 * and can be flipped at any point, even between msg init and sending */
	msg->gnm_cksum = 0;
	if (*kgnilnd_tunables.kgn_checksum) {
		/* We must set here and not in kgnilnd_init_msg,
		 * we could resend this msg many times
		 * (NOT_DONE from gni_smsg_send below) and wouldn't pass
		 * through init_msg again */
		msg->gnm_cksum = kgnilnd_cksum(msg, sizeof(kgn_msg_t));
		if (CFS_FAIL_CHECK(CFS_FAIL_GNI_SMSG_CKSUM1)) {
			msg->gnm_cksum += 0xf00f;
		}
	}

	GNIDBG_TOMSG(D_NET, msg, "tx 0x%p conn 0x%p->%s sending SMSG sz %u id %x/%d [%p for %u]",
	       tx, conn, libcfs_nid2str(conn->gnc_peer->gnp_nid),
	       sizeof(kgn_msg_t), tx->tx_id.txe_smsg_id,
	       tx->tx_id.txe_idx, immediate, immediatenob);

	if (unlikely(tx->tx_state & GNILND_TX_FAIL_SMSG)) {
		rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_NOT_DONE;
	} else {
		rrc = kgnilnd_smsg_send(conn->gnc_ephandle,
					msg, sizeof(*msg), immediate, immediatenob,
					tx->tx_id.txe_smsg_id);
	}

	switch (rrc) {
	case GNI_RC_SUCCESS:
		atomic_inc(&conn->gnc_tx_seq);
		conn->gnc_last_tx = jiffies;
		/* no locking here as LIVE isn't a list */
		kgnilnd_tx_add_state_locked(tx, NULL, conn, GNILND_TX_LIVE_FMAQ, 1);

		/* this needs to be checked under lock as it might be freed from a completion
		 * event.
		 */
		if (msg->gnm_type == GNILND_MSG_NOOP) {
			set_mb(conn->gnc_last_noop_sent, jiffies);
		}

		/* serialize with seeing CQ events for completion on this, as well as
		 * tx_seq */
		kgnilnd_conn_mutex_unlock(&conn->gnc_smsg_mutex);
		kgnilnd_gl_mutex_unlock(&conn->gnc_device->gnd_cq_mutex);

		atomic_inc(&conn->gnc_device->gnd_short_ntx);
		atomic64_add(immediatenob, &conn->gnc_device->gnd_short_txbytes);
		kgnilnd_peer_alive(conn->gnc_peer);
		GNIDBG_SMSG_CREDS(D_NET, conn);
		return 0;

	case GNI_RC_NOT_DONE:
		/* XXX Nic: We need to figure out how to track this
		 * - there are bound to be good reasons for it,
		 * but we want to know when it happens */
		kgnilnd_conn_mutex_unlock(&conn->gnc_smsg_mutex);
		kgnilnd_gl_mutex_unlock(&conn->gnc_device->gnd_cq_mutex);

		/* We'll handle this error inline - makes the calling logic much cleaner */

		/* If no lock, caller doesn't want us to retry */
		if (state_lock == NULL) {
			return -EAGAIN;
		}

		retry_send = kgnilnd_tx_should_retry(conn, tx);
		if (retry_send) {
			/* add to head of list for the state and retries */
			spin_lock(state_lock);
			kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, state, 0);
			spin_unlock(state_lock);

			/* We only reschedule for a certain number of retries, then
			 * we will wait for the CQ events indicating a release of SMSG
			 * credits */
			if (tx->tx_retrans < (*kgnilnd_tunables.kgn_max_retransmits/4)) {
				kgnilnd_schedule_conn(conn);
				return 0;
			} else {
				/* CQ event coming in signifies either TX completed or
				 * RX receive. Either of these *could* free up credits
				 * in the SMSG mbox and we should try sending again */
				GNIDBG_TX(D_NET, tx, "waiting for CQID %u event to resend",
					 tx->tx_conn->gnc_cqid);
				/* use +ve return code to let upper layers know they
				 * should stop looping on sends */
				return EAGAIN;
			}
		} else {
			return -EAGAIN;
		}
	default:
		/* handle bad retcode gracefully */
		kgnilnd_conn_mutex_unlock(&conn->gnc_smsg_mutex);
		kgnilnd_gl_mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
		return -EIO;
	}
}

/* kgnilnd_sendmsg has hard wait on gnd_cq_mutex */
static inline int
kgnilnd_sendmsg(kgn_tx_t *tx, void *immediate, unsigned int immediatenob,
		spinlock_t *state_lock, kgn_tx_list_state_t state)
{
	kgn_device_t    *dev = tx->tx_conn->gnc_device;
	unsigned long    timestamp;
	int              rc;

	timestamp = jiffies;
	kgnilnd_gl_mutex_lock(&dev->gnd_cq_mutex);
	kgnilnd_conn_mutex_lock(&tx->tx_conn->gnc_smsg_mutex);
	/* delay in jiffies - we are really concerned only with things that
	 * result in a schedule() or really holding this off for long times.
	 * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */
	dev->gnd_mutex_delay += (long) jiffies - timestamp;

	rc = kgnilnd_sendmsg_nolock(tx, immediate, immediatenob, state_lock, state);

	RETURN(rc);
}
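/*
 * Illustrative sketch, not driver code: the gnd_mutex_delay bookkeeping in
 * kgnilnd_sendmsg() above (and repeated at the other gnd_cq_mutex sites)
 * just accumulates how many ticks each acquisition cost, which is cheap
 * enough to leave on all the time.  A rough user-space analogue, using
 * coarse wall-clock time where the driver uses jiffies (names hypothetical):
 *
 *	#include <pthread.h>
 *	#include <time.h>
 *
 *	static long total_lock_delay;	// analogue of dev->gnd_mutex_delay
 *
 *	static void lock_timed(pthread_mutex_t *m)
 *	{
 *		time_t then = time(NULL);
 *
 *		pthread_mutex_lock(m);
 *		// only schedule()-length stalls are interesting
 *		total_lock_delay += (long)(time(NULL) - then);
 *	}
 */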
/* returns -EAGAIN for lock miss, anything else < 0 is hard error, >=0 for success */
static inline int
kgnilnd_sendmsg_trylock(kgn_tx_t *tx, void *immediate, unsigned int immediatenob,
			 spinlock_t *state_lock, kgn_tx_list_state_t state)
{
	kgn_conn_t      *conn = tx->tx_conn;
	kgn_device_t    *dev = conn->gnc_device;
	unsigned long    timestamp;
	int              rc;

	timestamp = jiffies;

	/* technically we are doing bad things with the read_lock on the peer_conn
	 * table, but we shouldn't be sleeping inside here - and we don't sleep/block
	 * for the mutex. I bet lockdep is gonna flag this one though... */

	/* there are a few cases where we don't want the immediate send - like
	 * when we are in the scheduler thread and it'd harm the latency of
	 * getting messages up to LNet */

	/* rmb for gnd_ready */
	smp_rmb();
	if (conn->gnc_device->gnd_ready == GNILND_DEV_LOOP) {
		rc = 0;
		atomic_inc(&conn->gnc_device->gnd_fast_block);
	} else if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
		/* don't hit HW during quiesce */
		rc = 0;
	} else if (unlikely(atomic_read(&conn->gnc_peer->gnp_dirty_eps))) {
		/* don't hit HW if stale EPs and conns left to close */
		rc = 0;
	} else {
		atomic_inc(&conn->gnc_device->gnd_fast_try);
		rc = kgnilnd_trylock(&conn->gnc_device->gnd_cq_mutex,
				     &conn->gnc_smsg_mutex);
	}
	if (!rc) {
		rc = -EAGAIN;
	} else {
		/* we got the mutex and weren't blocked */

		/* delay in jiffies - we are really concerned only with things that
		 * result in a schedule() or really holding this off for long times.
		 * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */
		dev->gnd_mutex_delay += (long) jiffies - timestamp;

		atomic_inc(&conn->gnc_device->gnd_fast_ok);
		tx->tx_qtime = jiffies;
		tx->tx_state = GNILND_TX_WAITING_COMPLETION;
		rc = kgnilnd_sendmsg_nolock(tx, tx->tx_buffer, tx->tx_nob, &conn->gnc_list_lock, GNILND_TX_FMAQ);
		/* _nolock unlocks the mutex for us */
	}

	RETURN(rc);
}

/* lets us know if we can push this RDMA through now */
inline int
kgnilnd_auth_rdma_bytes(kgn_device_t *dev, kgn_tx_t *tx)
{
	long    bytes_left;

	bytes_left = atomic64_sub_return(tx->tx_nob, &dev->gnd_rdmaq_bytes_ok);

	if (bytes_left < 0) {
		atomic64_add(tx->tx_nob, &dev->gnd_rdmaq_bytes_ok);
		atomic_inc(&dev->gnd_rdmaq_nstalls);
		smp_wmb();

		CDEBUG(D_NET, "no bytes to send, turning on timer for %lu\n",
		       dev->gnd_rdmaq_deadline);
		mod_timer(&dev->gnd_rdmaq_timer, dev->gnd_rdmaq_deadline);
		/* we never del this timer - at worst it schedules us.. */
		return -EAGAIN;
	} else {
		return 0;
	}
}
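/*
 * Illustrative sketch, not driver code: kgnilnd_auth_rdma_bytes() above is a
 * lock-free byte-credit gate - subtract first, and if the pool went
 * negative, give the bytes back and stall.  C11 analogue (hypothetical
 * names):
 *
 *	#include <stdatomic.h>
 *
 *	static _Atomic long bytes_ok;	// analogue of gnd_rdmaq_bytes_ok
 *
 *	static int auth_bytes(long nob)
 *	{
 *		long left = atomic_fetch_sub(&bytes_ok, nob) - nob;
 *
 *		if (left < 0) {
 *			atomic_fetch_add(&bytes_ok, nob); // roll back
 *			return -1;	// caller queues on the RDMAQ
 *		}
 *		return 0;		// authorized, proceed to mapping
 *	}
 */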
/* this adds a TX to the queue pending throttling authorization before
 * we allow our remote peer to launch a PUT at us */
void
kgnilnd_queue_rdma(kgn_conn_t *conn, kgn_tx_t *tx)
{
	int     rc;

	/* we cannot go into send_mapped_tx from here as we are holding locks
	 * and mem registration might end up allocating memory in kgni.
	 * That said, we'll push this as far as we can into the queue process */
	rc = kgnilnd_auth_rdma_bytes(conn->gnc_device, tx);

	if (rc < 0) {
		spin_lock(&conn->gnc_device->gnd_rdmaq_lock);
		kgnilnd_tx_add_state_locked(tx, NULL, conn, GNILND_TX_RDMAQ, 0);
		/* lets us know how delayed RDMA is */
		tx->tx_qtime = jiffies;
		spin_unlock(&conn->gnc_device->gnd_rdmaq_lock);
	} else {
		/* we have RDMA authorized, now it just needs a MDD and to hit the wire */
		spin_lock(&tx->tx_conn->gnc_device->gnd_lock);
		kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 0);
		/* lets us know how delayed mapping is */
		tx->tx_qtime = jiffies;
		spin_unlock(&tx->tx_conn->gnc_device->gnd_lock);
	}

	/* make sure we wake up sched to run this */
	kgnilnd_schedule_device(tx->tx_conn->gnc_device);
}

/* push TX through state machine */
void
kgnilnd_queue_tx(kgn_conn_t *conn, kgn_tx_t *tx)
{
	int     rc = 0;
	int     add_tail = 1;

	/* set the tx_id here, we delay it until we have an actual conn
	 * to fiddle with
	 * in some cases, the tx_id is already set to provide for things
	 * like RDMA completion cookies, etc */
	if (tx->tx_id.txe_idx == 0) {
		rc = kgnilnd_set_tx_id(tx, conn);
		if (rc != 0) {
			kgnilnd_tx_done(tx, rc);
			return;
		}
	}

	CDEBUG(D_NET, "%s to conn %p for %s\n", kgnilnd_msgtype2str(tx->tx_msg.gnm_type),
		conn, libcfs_nid2str(conn->gnc_peer->gnp_nid));

	/* Only let NOOPs be sent while fail loc is set, otherwise kill the tx. */
	if (CFS_FAIL_CHECK(CFS_FAIL_GNI_ONLY_NOOP) && (tx->tx_msg.gnm_type != GNILND_MSG_NOOP)) {
		kgnilnd_tx_done(tx, rc);
		return;
	}

	switch (tx->tx_msg.gnm_type) {
	case GNILND_MSG_PUT_ACK:
	case GNILND_MSG_GET_REQ:
	case GNILND_MSG_PUT_REQ_REV:
	case GNILND_MSG_GET_ACK_REV:
		/* hijacking time! If this message will authorize our peer to
		 * send his dirty little bytes in an RDMA, we need to get permission */
		kgnilnd_queue_rdma(conn, tx);
		break;
	case GNILND_MSG_IMMEDIATE:
		/* try to send right now, can help reduce latency */
		rc = kgnilnd_sendmsg_trylock(tx, tx->tx_buffer, tx->tx_nob, &conn->gnc_list_lock, GNILND_TX_FMAQ);

		if (rc >= 0) {
			/* it was sent, break out of switch to avoid default case of queueing */
			break;
		}
		/* needs to queue to try again, so fall through to default case */
	case GNILND_MSG_NOOP:
		/* Just make sure this goes out first for this conn */
		add_tail = 0;
		/* fall through... */
	default:
		spin_lock(&conn->gnc_list_lock);
		kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, GNILND_TX_FMAQ, add_tail);
		tx->tx_qtime = jiffies;
		spin_unlock(&conn->gnc_list_lock);
		kgnilnd_schedule_conn(conn);
	}
}
void
kgnilnd_launch_tx(kgn_tx_t *tx, kgn_net_t *net, lnet_process_id_t *target)
{
	kgn_peer_t      *peer;
	kgn_peer_t      *new_peer = NULL;
	kgn_conn_t      *conn = NULL;
	int              rc;
	int              node_state;

	ENTRY;

	/* If I get here, I've committed to send, so I complete the tx with
	 * failure on any problems */

	GNITX_ASSERTF(tx, tx->tx_conn == NULL,
		      "tx already has connection %p", tx->tx_conn);

	/* do all of the peer & conn searching in one swoop - this avoids
	 * nastiness when dropping locks and needing to maintain a sane state
	 * in the face of stack reset or something else nuking peers & conns */

	/* I expect to find him, so only take a read lock */
	read_lock(&kgnilnd_data.kgn_peer_conn_lock);

	peer = kgnilnd_find_peer_locked(target->nid);
	if (peer != NULL) {
		conn = kgnilnd_find_conn_locked(peer);
		/* this could be NULL during quiesce */
		if (conn != NULL) {
			/* Connection exists; queue message on it */
			kgnilnd_queue_tx(conn, tx);
			read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
			RETURN_EXIT;
		}

		/* don't create a connection if the peer is marked down */
		if (peer->gnp_down == GNILND_RCA_NODE_DOWN) {
			read_unlock(&kgnilnd_data.kgn_peer_conn_lock);
			rc = -ENETRESET;
			GOTO(no_peer, rc);
		}
	}

	/* creating peer or conn; I'll need a write lock... */
	read_unlock(&kgnilnd_data.kgn_peer_conn_lock);

	CFS_RACE(CFS_FAIL_GNI_FIND_TARGET);

	node_state = kgnilnd_get_node_state(LNET_NIDADDR(target->nid));

	/* NB - this will not block during normal operations -
	 * the only writer of this is in the startup/shutdown path. */
	rc = down_read_trylock(&kgnilnd_data.kgn_net_rw_sem);
	if (!rc) {
		rc = -ESHUTDOWN;
		GOTO(no_peer, rc);
	}

	/* ignore previous peer entirely - we cycled the lock, so we
	 * will create new peer and at worst drop it if peer is still
	 * in the tables */
	rc = kgnilnd_create_peer_safe(&new_peer, target->nid, net, node_state);
	if (rc != 0) {
		up_read(&kgnilnd_data.kgn_net_rw_sem);
		GOTO(no_peer, rc);
	}

	write_lock(&kgnilnd_data.kgn_peer_conn_lock);
	up_read(&kgnilnd_data.kgn_net_rw_sem);

	/* search for peer again now that we have the lock
	 * if we don't find it, add our new one to the list */
	kgnilnd_add_peer_locked(target->nid, new_peer, &peer);

	/* don't create a connection if the peer is not up */
	if (peer->gnp_down != GNILND_RCA_NODE_UP) {
		write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
		rc = -ENETRESET;
		GOTO(no_peer, rc);
	}

	conn = kgnilnd_find_or_create_conn_locked(peer);

	if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DGRAM_DROP_TX)) {
		write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
		GOTO(no_peer, rc);
	}

	if (conn != NULL) {
		/* oh hey, found a conn now... magical */
		kgnilnd_queue_tx(conn, tx);
	} else {
		/* no conn, must be trying to connect - so we queue for now */
		tx->tx_qtime = jiffies;
		kgnilnd_tx_add_state_locked(tx, peer, NULL, GNILND_TX_PEERQ, 1);
	}

	write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
	RETURN_EXIT;

no_peer:
	kgnilnd_tx_done(tx, rc);
	RETURN_EXIT;
}

int
kgnilnd_rdma(kgn_tx_t *tx, int type,
	    kgn_rdma_desc_t *sink, unsigned int nob, __u64 cookie)
{
	kgn_conn_t      *conn = tx->tx_conn;
	unsigned long    timestamp;
	gni_post_type_t  post_type;
	gni_return_t     rrc;
	int              rc = 0;
	unsigned int     desc_nob = nob;
	void            *desc_buffer = tx->tx_buffer;
	gni_mem_handle_t desc_map_key = tx->tx_map_key;

	LASSERTF(kgnilnd_tx_mapped(tx),
		"unmapped tx %p\n", tx);
	LASSERTF(conn != NULL,
		"NULL conn on tx %p, naughty, naughty\n", tx);
	LASSERTF(nob <= sink->gnrd_nob,
		"nob %u > sink->gnrd_nob %d (%p)\n",
		nob, sink->gnrd_nob, sink);
	LASSERTF(nob <= tx->tx_nob,
		"nob %d > tx(%p)->tx_nob %d\n",
		nob, tx, tx->tx_nob);

	switch (type) {
	case GNILND_MSG_GET_DONE:
	case GNILND_MSG_PUT_DONE:
		post_type = GNI_POST_RDMA_PUT;
		break;
	case GNILND_MSG_GET_DONE_REV:
	case GNILND_MSG_PUT_DONE_REV:
		post_type = GNI_POST_RDMA_GET;
		break;
	default:
		CERROR("invalid msg type %s (%d)\n",
			kgnilnd_msgtype2str(type), type);
		LBUG();
	}
	if (post_type == GNI_POST_RDMA_GET) {
		/* Check for remote buffer / local buffer / length alignment. All must be 4 byte
		 * aligned. If the local buffer is not aligned correctly using the copy buffer
		 * will fix that issue. If length is misaligned copy buffer will also fix the issue, we end
		 * up transferring extra bytes into the buffer but only copy the correct nob into the original
		 * buffer. Remote offset correction is done through a combination of adjusting the offset,
		 * making sure the length and addr are aligned and copying the data into the correct location
		 * once the transfer has completed. */
		if ((((__u64)((unsigned long)tx->tx_buffer)) & 3) ||
		      (sink->gnrd_addr & 3) ||
		      (nob & 3)) {

			tx->tx_offset = ((__u64)((unsigned long)sink->gnrd_addr)) & 3;
			if (tx->tx_offset)
				kgnilnd_admin_addref(kgnilnd_data.kgn_rev_offset);

			if ((nob + tx->tx_offset) & 3) {
				desc_nob = ((nob + tx->tx_offset) +
					(4 - ((nob + tx->tx_offset) & 3)));
				kgnilnd_admin_addref(kgnilnd_data.kgn_rev_length);
			} else {
				desc_nob = (nob + tx->tx_offset);
			}
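			/* Worked example of the fixup above (not driver code):
			 * a remote address ending in ...2 gives tx_offset = 2;
			 * with nob = 7, nob + tx_offset = 9, which is still
			 * misaligned (9 & 3 == 1), so desc_nob = 9 + (4 - 1) = 12.
			 * The BTE then moves 12 bytes from the aligned-down
			 * address, and the completion path copies the 7 real
			 * bytes back out of the bounce buffer at offset 2. */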
			if (tx->tx_buffer_copy == NULL) {
				/* Allocate the largest copy buffer we will need, this will prevent us from overwriting data
				 * and require at most we allocate a few extra bytes. */
				tx->tx_buffer_copy = vmalloc(desc_nob);

				if (!tx->tx_buffer_copy) {
					/* allocation of buffer failed nak the rdma */
					kgnilnd_nak_rdma(tx->tx_conn, tx->tx_msg.gnm_type, -EFAULT, cookie, tx->tx_msg.gnm_srcnid);
					kgnilnd_tx_done(tx, -EFAULT);
					return 0;
				}
				kgnilnd_admin_addref(kgnilnd_data.kgn_rev_copy_buff);
				rc = kgnilnd_mem_register(conn->gnc_device->gnd_handle, (__u64)tx->tx_buffer_copy, desc_nob, NULL, GNI_MEM_READWRITE, &tx->tx_buffer_copy_map_key);
				if (rc != GNI_RC_SUCCESS) {
					/* registration failed: NAK the RDMA and kill the tx. */
					vfree(tx->tx_buffer_copy);
					tx->tx_buffer_copy = NULL;
					kgnilnd_nak_rdma(tx->tx_conn, tx->tx_msg.gnm_type, -EFAULT, cookie, tx->tx_msg.gnm_srcnid);
					kgnilnd_tx_done(tx, -EFAULT);
					return 0;
				}
			}
			desc_map_key = tx->tx_buffer_copy_map_key;
			desc_buffer = tx->tx_buffer_copy;
		}
	}

	memset(&tx->tx_rdma_desc, 0, sizeof(tx->tx_rdma_desc));
	tx->tx_rdma_desc.post_id = tx->tx_id.txe_cookie;
	tx->tx_rdma_desc.type = post_type;
	tx->tx_rdma_desc.cq_mode = GNI_CQMODE_GLOBAL_EVENT;
	tx->tx_rdma_desc.local_addr = (__u64)((unsigned long)desc_buffer);
	tx->tx_rdma_desc.local_mem_hndl = desc_map_key;
	tx->tx_rdma_desc.remote_addr = sink->gnrd_addr - tx->tx_offset;
	tx->tx_rdma_desc.remote_mem_hndl = sink->gnrd_key;
	tx->tx_rdma_desc.length = desc_nob;
	tx->tx_nob_rdma = nob;
	if (*kgnilnd_tunables.kgn_bte_dlvr_mode)
		tx->tx_rdma_desc.dlvr_mode = *kgnilnd_tunables.kgn_bte_dlvr_mode;
	/* prep final completion message */
	kgnilnd_init_msg(&tx->tx_msg, type, tx->tx_msg.gnm_srcnid);
	tx->tx_msg.gnm_u.completion.gncm_cookie = cookie;
	/* send actual size RDMA'd in retval */
	tx->tx_msg.gnm_u.completion.gncm_retval = nob;

	kgnilnd_compute_rdma_cksum(tx, nob);

	if (nob == 0) {
		kgnilnd_queue_tx(conn, tx);
		return 0;
	}

	/* Don't lie (CLOSE == RDMA idle) */
	LASSERTF(!conn->gnc_close_sent, "tx %p on conn %p after close sent %d\n",
		 tx, conn, conn->gnc_close_sent);

	GNIDBG_TX(D_NET, tx, "Post RDMA type 0x%02x conn %p dlvr_mode "
		"0x%x cookie:"LPX64,
		type, conn, tx->tx_rdma_desc.dlvr_mode, cookie);

	/* set CQ dedicated for RDMA */
	tx->tx_rdma_desc.src_cq_hndl = conn->gnc_device->gnd_snd_rdma_cqh;

	timestamp = jiffies;
	kgnilnd_conn_mutex_lock(&conn->gnc_rdma_mutex);
	kgnilnd_gl_mutex_lock(&conn->gnc_device->gnd_cq_mutex);
	/* delay in jiffies - we are really concerned only with things that
	 * result in a schedule() or really holding this off for long times.
	 * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */
* NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */ conn->gnc_device->gnd_mutex_delay += (long) jiffies - timestamp; rrc = kgnilnd_post_rdma(conn->gnc_ephandle, &tx->tx_rdma_desc); if (rrc == GNI_RC_ERROR_RESOURCE) { kgnilnd_conn_mutex_unlock(&conn->gnc_rdma_mutex); kgnilnd_gl_mutex_unlock(&conn->gnc_device->gnd_cq_mutex); kgnilnd_unmap_buffer(tx, 0); if (tx->tx_buffer_copy != NULL) { vfree(tx->tx_buffer_copy); tx->tx_buffer_copy = NULL; } spin_lock(&tx->tx_conn->gnc_device->gnd_lock); kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 0); spin_unlock(&tx->tx_conn->gnc_device->gnd_lock); kgnilnd_schedule_device(tx->tx_conn->gnc_device); return -EAGAIN; } spin_lock(&conn->gnc_list_lock); kgnilnd_tx_add_state_locked(tx, conn->gnc_peer, conn, GNILND_TX_LIVE_RDMAQ, 1); tx->tx_qtime = jiffies; spin_unlock(&conn->gnc_list_lock); kgnilnd_gl_mutex_unlock(&conn->gnc_device->gnd_cq_mutex); kgnilnd_conn_mutex_unlock(&conn->gnc_rdma_mutex); /* XXX Nic: is this a place we should handle more errors for * robustness' sake? */ LASSERT(rrc == GNI_RC_SUCCESS); return 0; } kgn_rx_t * kgnilnd_alloc_rx(void) { kgn_rx_t *rx; rx = kmem_cache_alloc(kgnilnd_data.kgn_rx_cache, GFP_ATOMIC); if (rx == NULL) { CERROR("failed to allocate rx\n"); return NULL; } CDEBUG(D_MALLOC, "slab-alloced 'rx': %lu at %p.\n", sizeof(*rx), rx); /* no memset to zero, we'll always fill all members */ return rx; } /* release is to just free connection resources * we use this for the eager path after copying */ void kgnilnd_release_msg(kgn_conn_t *conn) { gni_return_t rrc; unsigned long timestamp; CDEBUG(D_NET, "consuming %p\n", conn); timestamp = jiffies; kgnilnd_gl_mutex_lock(&conn->gnc_device->gnd_cq_mutex); /* delay in jiffies - we are really concerned only with things that * result in a schedule() or really holding this off for long times. * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */ conn->gnc_device->gnd_mutex_delay += (long) jiffies - timestamp; rrc = kgnilnd_smsg_release(conn->gnc_ephandle); kgnilnd_gl_mutex_unlock(&conn->gnc_device->gnd_cq_mutex); LASSERTF(rrc == GNI_RC_SUCCESS, "bad rrc %d\n", rrc); GNIDBG_SMSG_CREDS(D_NET, conn); return; } void kgnilnd_consume_rx(kgn_rx_t *rx) { kgn_conn_t *conn = rx->grx_conn; kgn_msg_t *rxmsg = rx->grx_msg; /* if we are eager, free the cache alloc'd msg */ if (unlikely(rx->grx_eager)) { LIBCFS_FREE(rxmsg, sizeof(*rxmsg) + *kgnilnd_tunables.kgn_max_immediate); atomic_dec(&kgnilnd_data.kgn_neager_allocs); /* release ref from eager_recv */ kgnilnd_conn_decref(conn); } else { GNIDBG_MSG(D_NET, rxmsg, "rx %p processed", rx); kgnilnd_release_msg(conn); } kmem_cache_free(kgnilnd_data.kgn_rx_cache, rx); CDEBUG(D_MALLOC, "slab-freed 'rx': %lu at %p.\n", sizeof(*rx), rx); return; } int kgnilnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg) { lnet_hdr_t *hdr = &lntmsg->msg_hdr; int type = lntmsg->msg_type; lnet_process_id_t target = lntmsg->msg_target; int target_is_router = lntmsg->msg_target_is_router; int routing = lntmsg->msg_routing; unsigned int niov = lntmsg->msg_niov; struct iovec *iov = lntmsg->msg_iov; lnet_kiov_t *kiov = lntmsg->msg_kiov; unsigned int offset = lntmsg->msg_offset; unsigned int nob = lntmsg->msg_len; unsigned int msg_vmflush = lntmsg->msg_vmflush; kgn_net_t *net = ni->ni_data; kgn_tx_t *tx; int rc = 0; int mpflag = 0; int reverse_rdma_flag = *kgnilnd_tunables.kgn_reverse_rdma; /* NB 'private' is different depending on what we're sending....
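* (note added for clarity, inferred from the recv path rather than stated by the original author: for sends LNet issues from within lnet_parse - e.g. the REPLY to a GET - 'private' carries the kgn_rx_t of the triggering receive; for ordinary new sends it is NULL. As written, nothing in this function dereferences it.)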
*/ LASSERT(!in_interrupt()); CDEBUG(D_NET, "sending msg type %d with %d bytes in %d frags to %s\n", type, nob, niov, libcfs_id2str(target)); LASSERTF(nob == 0 || niov > 0, "lntmsg %p nob %d niov %d\n", lntmsg, nob, niov); LASSERTF(niov <= LNET_MAX_IOV, "lntmsg %p niov %d\n", lntmsg, niov); /* payload is either all vaddrs or all pages */ LASSERTF(!(kiov != NULL && iov != NULL), "lntmsg %p kiov %p iov %p\n", lntmsg, kiov, iov); if (msg_vmflush) mpflag = cfs_memory_pressure_get_and_set(); switch (type) { default: CERROR("lntmsg %p with unexpected type %d\n", lntmsg, type); LBUG(); case LNET_MSG_ACK: LASSERTF(nob == 0, "lntmsg %p nob %d\n", lntmsg, nob); break; case LNET_MSG_GET: LASSERT(niov == 0); LASSERT(nob == 0); if (routing || target_is_router) break; /* send IMMEDIATE */ /* it is safe to do a direct GET without mapping the buffer for RDMA, as we * check the eventual sink buffer here - if it is small enough, the remote * end is perfectly capable of returning the data in a short message - * the magic is that we call lnet_parse in kgnilnd_recv with rdma_req=0 * for IMMEDIATE messages, which has it send a real reply instead * of doing kgnilnd_recv to have the RDMA continued */ if (lntmsg->msg_md->md_length <= *kgnilnd_tunables.kgn_max_immediate) break; if ((reverse_rdma_flag & GNILND_REVERSE_GET) == 0) tx = kgnilnd_new_tx_msg(GNILND_MSG_GET_REQ, ni->ni_nid); else tx = kgnilnd_new_tx_msg(GNILND_MSG_GET_REQ_REV, ni->ni_nid); if (tx == NULL) { rc = -ENOMEM; goto out; } /* slightly different options as we might actually have a GET with a * MD_KIOV set but a non-NULL md_iov.iov */ if ((lntmsg->msg_md->md_options & LNET_MD_KIOV) == 0) rc = kgnilnd_setup_rdma_buffer(tx, lntmsg->msg_md->md_niov, lntmsg->msg_md->md_iov.iov, NULL, 0, lntmsg->msg_md->md_length); else rc = kgnilnd_setup_rdma_buffer(tx, lntmsg->msg_md->md_niov, NULL, lntmsg->msg_md->md_iov.kiov, 0, lntmsg->msg_md->md_length); if (rc != 0) { CERROR("unable to setup buffer: %d\n", rc); kgnilnd_tx_done(tx, rc); rc = -EIO; goto out; } tx->tx_lntmsg[1] = lnet_create_reply_msg(ni, lntmsg); if (tx->tx_lntmsg[1] == NULL) { CERROR("Can't create reply for GET to %s\n", libcfs_nid2str(target.nid)); kgnilnd_tx_done(tx, rc); rc = -EIO; goto out; } tx->tx_lntmsg[0] = lntmsg; if ((reverse_rdma_flag & GNILND_REVERSE_GET) == 0) tx->tx_msg.gnm_u.get.gngm_hdr = *hdr; else tx->tx_msg.gnm_u.putreq.gnprm_hdr = *hdr; /* rest of tx_msg is set up just before it is sent */ kgnilnd_launch_tx(tx, net, &target); goto out; case LNET_MSG_REPLY: case LNET_MSG_PUT: /* to save on MDDs, we'll handle short kiov by vmap'ing * and sending via SMSG */ if (nob <= *kgnilnd_tunables.kgn_max_immediate) break; if ((reverse_rdma_flag & GNILND_REVERSE_PUT) == 0) tx = kgnilnd_new_tx_msg(GNILND_MSG_PUT_REQ, ni->ni_nid); else tx = kgnilnd_new_tx_msg(GNILND_MSG_PUT_REQ_REV, ni->ni_nid); if (tx == NULL) { rc = -ENOMEM; goto out; } rc = kgnilnd_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob); if (rc != 0) { kgnilnd_tx_done(tx, rc); rc = -EIO; goto out; } tx->tx_lntmsg[0] = lntmsg; if ((reverse_rdma_flag & GNILND_REVERSE_PUT) == 0) tx->tx_msg.gnm_u.putreq.gnprm_hdr = *hdr; else tx->tx_msg.gnm_u.get.gngm_hdr = *hdr; /* rest of tx_msg is set up just before it is sent */ kgnilnd_launch_tx(tx, net, &target); goto out; } /* send IMMEDIATE */ LASSERTF(nob <= *kgnilnd_tunables.kgn_max_immediate, "lntmsg 0x%p too large %d\n", lntmsg, nob); tx = kgnilnd_new_tx_msg(GNILND_MSG_IMMEDIATE, ni->ni_nid); if (tx == NULL) { rc = -ENOMEM; goto out; } rc = kgnilnd_setup_immediate_buffer(tx, niov, iov, kiov, offset, nob);
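/* Added summary for clarity (not original author text): everything that falls through to here is the IMMEDIATE path - the payload fits within *kgn_max_immediate and rides inline in the SMSG message, so no RDMA descriptor or buffer mapping is needed; larger transfers were dispatched above, with kgn_reverse_rdma selecting which side drives the RDMA. A failure to set up the immediate buffer is finalized just below. */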
if (rc != 0) { kgnilnd_tx_done(tx, rc); goto out; } tx->tx_msg.gnm_u.immediate.gnim_hdr = *hdr; tx->tx_lntmsg[0] = lntmsg; kgnilnd_launch_tx(tx, net, &target); out: /* use stored value as we could have already finalized lntmsg here from a failed launch */ if (msg_vmflush) cfs_memory_pressure_restore(mpflag); return rc; } void kgnilnd_setup_rdma(lnet_ni_t *ni, kgn_rx_t *rx, lnet_msg_t *lntmsg, int mlen) { kgn_conn_t *conn = rx->grx_conn; kgn_msg_t *rxmsg = rx->grx_msg; unsigned int niov = lntmsg->msg_niov; struct iovec *iov = lntmsg->msg_iov; lnet_kiov_t *kiov = lntmsg->msg_kiov; unsigned int offset = lntmsg->msg_offset; unsigned int nob = lntmsg->msg_len; int done_type; kgn_tx_t *tx; int rc = 0; switch (rxmsg->gnm_type) { case GNILND_MSG_PUT_REQ_REV: done_type = GNILND_MSG_PUT_DONE_REV; nob = mlen; break; case GNILND_MSG_GET_REQ: done_type = GNILND_MSG_GET_DONE; break; default: CERROR("invalid msg type %s (%d)\n", kgnilnd_msgtype2str(rxmsg->gnm_type), rxmsg->gnm_type); LBUG(); } tx = kgnilnd_new_tx_msg(done_type, ni->ni_nid); if (tx == NULL) { /* finalize with an error rather than 0 - nothing was set up */ rc = -ENOMEM; goto failed_0; } rc = kgnilnd_set_tx_id(tx, conn); if (rc != 0) goto failed_1; rc = kgnilnd_setup_rdma_buffer(tx, niov, iov, kiov, offset, nob); if (rc != 0) goto failed_1; tx->tx_lntmsg[0] = lntmsg; tx->tx_getinfo = rxmsg->gnm_u.get; /* we only queue from kgnilnd_recv - we might get called from other contexts * and we don't want to block the mutex in those cases */ spin_lock(&tx->tx_conn->gnc_device->gnd_lock); kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 1); spin_unlock(&tx->tx_conn->gnc_device->gnd_lock); kgnilnd_schedule_device(tx->tx_conn->gnc_device); return; failed_1: kgnilnd_tx_done(tx, rc); kgnilnd_nak_rdma(conn, done_type, rc, rxmsg->gnm_u.get.gngm_cookie, ni->ni_nid); failed_0: lnet_finalize(ni, lntmsg, rc); } int kgnilnd_eager_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, void **new_private) { kgn_rx_t *rx = private; kgn_conn_t *conn = rx->grx_conn; kgn_msg_t *rxmsg = rx->grx_msg; kgn_msg_t *eagermsg = NULL; kgn_peer_t *peer = NULL; kgn_conn_t *found_conn = NULL; GNIDBG_MSG(D_NET, rxmsg, "eager recv for conn %p, rxmsg %p, lntmsg %p", conn, rxmsg, lntmsg); if (rxmsg->gnm_payload_len > *kgnilnd_tunables.kgn_max_immediate) { GNIDBG_MSG(D_ERROR, rxmsg, "payload too large %d", rxmsg->gnm_payload_len); return -EPROTO; } /* Grab a read lock so the connection doesn't disappear on us * while we look it up */ read_lock(&kgnilnd_data.kgn_peer_conn_lock); peer = kgnilnd_find_peer_locked(rxmsg->gnm_srcnid); if (peer != NULL) found_conn = kgnilnd_find_conn_locked(peer); /* Verify the connection found is the same one that the message * is supposed to be using; if it is not, output an error message * and return.
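* (added note) gnm_connstamp is stamped when the connection is established, so a mismatch below means this rx belongs to an earlier incarnation of the connection - most likely one now in purgatory - and must not be matched against the current conn.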
*/ if (!peer || !found_conn || found_conn->gnc_peer_connstamp != rxmsg->gnm_connstamp) { read_unlock(&kgnilnd_data.kgn_peer_conn_lock); CERROR("Couldn't find matching peer %p or conn %p / %p\n", peer, conn, found_conn); if (found_conn) { CERROR("Unexpected connstamp "LPX64"("LPX64" expected)" " from %s\n", rxmsg->gnm_connstamp, found_conn->gnc_peer_connstamp, libcfs_nid2str(peer->gnp_nid)); } return -ENOTCONN; } /* add conn ref to ensure it doesn't go away until all eager * messages processed */ kgnilnd_conn_addref(conn); /* Now that we have verified the connection is valid and added a * reference we can remove the read_lock on the peer_conn_lock */ read_unlock(&kgnilnd_data.kgn_peer_conn_lock); /* we have no credits or buffers for this message, so copy it * somewhere for a later kgnilnd_recv */ if (atomic_read(&kgnilnd_data.kgn_neager_allocs) >= *kgnilnd_tunables.kgn_eager_credits) { /* drop the ref we just took before bailing out */ kgnilnd_conn_decref(conn); CERROR("Out of eager credits to %s\n", libcfs_nid2str(conn->gnc_peer->gnp_nid)); return -ENOMEM; } atomic_inc(&kgnilnd_data.kgn_neager_allocs); LIBCFS_ALLOC(eagermsg, sizeof(*eagermsg) + *kgnilnd_tunables.kgn_max_immediate); if (eagermsg == NULL) { kgnilnd_conn_decref(conn); CERROR("couldn't allocate eager rx message for conn %p to %s\n", conn, libcfs_nid2str(conn->gnc_peer->gnp_nid)); return -ENOMEM; } /* copy msg and payload */ memcpy(eagermsg, rxmsg, sizeof(*rxmsg) + rxmsg->gnm_payload_len); rx->grx_msg = eagermsg; rx->grx_eager = 1; /* stash this for lnet_finalize on cancel-on-conn-close */ rx->grx_lntmsg = lntmsg; /* keep the same rx_t, it just has a new grx_msg now */ *new_private = private; /* release SMSG buffer */ kgnilnd_release_msg(conn); return 0; } int kgnilnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed, unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov, unsigned int offset, unsigned int mlen, unsigned int rlen) { kgn_rx_t *rx = private; kgn_conn_t *conn = rx->grx_conn; kgn_msg_t *rxmsg = rx->grx_msg; kgn_tx_t *tx; int rc = 0; __u32 pload_cksum; ENTRY; LASSERT(!in_interrupt()); LASSERTF(mlen <= rlen, "%d <= %d\n", mlen, rlen); /* Either all pages or all vaddrs */ LASSERTF(!(kiov != NULL && iov != NULL), "kiov %p iov %p\n", kiov, iov); GNIDBG_MSG(D_NET, rxmsg, "conn %p, rxmsg %p, lntmsg %p" " niov=%d kiov=%p iov=%p offset=%d mlen=%d rlen=%d", conn, rxmsg, lntmsg, niov, kiov, iov, offset, mlen, rlen); /* we need to lock here as recv can be called from any context */ read_lock(&kgnilnd_data.kgn_peer_conn_lock); if (rx->grx_eager && conn->gnc_state != GNILND_CONN_ESTABLISHED) { read_unlock(&kgnilnd_data.kgn_peer_conn_lock); /* someone closed the conn after we copied this out, nuke it */ kgnilnd_consume_rx(rx); lnet_finalize(ni, lntmsg, conn->gnc_error); RETURN(0); } read_unlock(&kgnilnd_data.kgn_peer_conn_lock); switch (rxmsg->gnm_type) { default: GNIDBG_MSG(D_NETERROR, rxmsg, "conn %p, rx %p, rxmsg %p, lntmsg %p" " niov=%d kiov=%p iov=%p offset=%d mlen=%d rlen=%d", conn, rx, rxmsg, lntmsg, niov, kiov, iov, offset, mlen, rlen); LBUG(); case GNILND_MSG_IMMEDIATE: if (mlen > rxmsg->gnm_payload_len) { GNIDBG_MSG(D_ERROR, rxmsg, "Immediate message from %s too big: %d > %d", libcfs_nid2str(conn->gnc_peer->gnp_nid), mlen, rxmsg->gnm_payload_len); rc = -EINVAL; kgnilnd_consume_rx(rx); RETURN(rc); } /* rxmsg[1] is a pointer to the payload, sitting in the buffer * right after the kgn_msg_t header - so just a 'cute' way of saying * rxmsg + sizeof(kgn_msg_t) */ /* check payload checksum if sent */ if (*kgnilnd_tunables.kgn_checksum >= 2 && !rxmsg->gnm_payload_cksum &&
rxmsg->gnm_payload_len != 0) GNIDBG_MSG(D_WARNING, rxmsg, "no msg payload checksum when enabled"); if (rxmsg->gnm_payload_cksum != 0) { /* gnm_payload_len set in kgnilnd_sendmsg from tx->tx_nob, * which is what is used to calculate the cksum on the TX side */ pload_cksum = kgnilnd_cksum(&rxmsg[1], rxmsg->gnm_payload_len); if (rxmsg->gnm_payload_cksum != pload_cksum) { GNIDBG_MSG(D_NETERROR, rxmsg, "Bad payload checksum (%x expected %x)", pload_cksum, rxmsg->gnm_payload_cksum); switch (*kgnilnd_tunables.kgn_checksum_dump) { case 2: kgnilnd_dump_blob(D_BUFFS, "bad payload checksum", &rxmsg[1], rxmsg->gnm_payload_len); /* fall through to dump */ case 1: libcfs_debug_dumplog(); break; default: break; } rc = -ENOKEY; /* checksum problems are fatal, kill the conn */ kgnilnd_consume_rx(rx); kgnilnd_close_conn(conn, rc); RETURN(rc); } } if (kiov != NULL) lnet_copy_flat2kiov( niov, kiov, offset, *kgnilnd_tunables.kgn_max_immediate, &rxmsg[1], 0, mlen); else lnet_copy_flat2iov( niov, iov, offset, *kgnilnd_tunables.kgn_max_immediate, &rxmsg[1], 0, mlen); kgnilnd_consume_rx(rx); lnet_finalize(ni, lntmsg, 0); RETURN(0); case GNILND_MSG_PUT_REQ: /* LNET wants to truncate or drop transaction, sending NAK */ if (mlen == 0) { kgnilnd_consume_rx(rx); lnet_finalize(ni, lntmsg, 0); /* only error if lntmsg == NULL, otherwise we are just * short circuiting the rdma process of 0 bytes */ kgnilnd_nak_rdma(conn, rxmsg->gnm_type, lntmsg == NULL ? -ENOENT : 0, rxmsg->gnm_u.get.gngm_cookie, ni->ni_nid); RETURN(0); } /* sending ACK with sink buff. info */ tx = kgnilnd_new_tx_msg(GNILND_MSG_PUT_ACK, ni->ni_nid); if (tx == NULL) { kgnilnd_consume_rx(rx); RETURN(-ENOMEM); } rc = kgnilnd_set_tx_id(tx, conn); if (rc != 0) { GOTO(nak_put_req, rc); } rc = kgnilnd_setup_rdma_buffer(tx, niov, iov, kiov, offset, mlen); if (rc != 0) { GOTO(nak_put_req, rc); } tx->tx_msg.gnm_u.putack.gnpam_src_cookie = rxmsg->gnm_u.putreq.gnprm_cookie; tx->tx_msg.gnm_u.putack.gnpam_dst_cookie = tx->tx_id.txe_cookie; tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_addr = (__u64)((unsigned long)tx->tx_buffer); tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_nob = mlen; tx->tx_lntmsg[0] = lntmsg; /* finalize this on RDMA_DONE */ tx->tx_qtime = jiffies; /* we only queue from kgnilnd_recv - we might get called from other contexts * and we don't want to block the mutex in those cases */ spin_lock(&tx->tx_conn->gnc_device->gnd_lock); kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 1); spin_unlock(&tx->tx_conn->gnc_device->gnd_lock); kgnilnd_schedule_device(tx->tx_conn->gnc_device); kgnilnd_consume_rx(rx); RETURN(0); nak_put_req: /* make sure we send an error back when the PUT fails */ kgnilnd_nak_rdma(conn, rxmsg->gnm_type, rc, rxmsg->gnm_u.get.gngm_cookie, ni->ni_nid); kgnilnd_tx_done(tx, rc); kgnilnd_consume_rx(rx); /* return magic LNet network error */ RETURN(-EIO); case GNILND_MSG_GET_REQ_REV: /* LNET wants to truncate or drop transaction, sending NAK */ if (mlen == 0) { kgnilnd_consume_rx(rx); lnet_finalize(ni, lntmsg, 0); /* only error if lntmsg == NULL, otherwise we are just * short circuiting the rdma process of 0 bytes */ kgnilnd_nak_rdma(conn, rxmsg->gnm_type, lntmsg == NULL ? -ENOENT : 0, rxmsg->gnm_u.get.gngm_cookie, ni->ni_nid); RETURN(0); } /* lntmsg can be null when parsing a LNET_GET */ if (lntmsg != NULL) { /* sending ACK with sink buff. 
info */ tx = kgnilnd_new_tx_msg(GNILND_MSG_GET_ACK_REV, ni->ni_nid); if (tx == NULL) { kgnilnd_consume_rx(rx); RETURN(-ENOMEM); } rc = kgnilnd_set_tx_id(tx, conn); if (rc != 0) GOTO(nak_get_req_rev, rc); rc = kgnilnd_setup_rdma_buffer(tx, niov, iov, kiov, offset, mlen); if (rc != 0) GOTO(nak_get_req_rev, rc); tx->tx_msg.gnm_u.putack.gnpam_src_cookie = rxmsg->gnm_u.putreq.gnprm_cookie; tx->tx_msg.gnm_u.putack.gnpam_dst_cookie = tx->tx_id.txe_cookie; tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_addr = (__u64)((unsigned long)tx->tx_buffer); tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_nob = mlen; tx->tx_lntmsg[0] = lntmsg; /* finalize this on RDMA_DONE */ /* we only queue from kgnilnd_recv - we might get called from other contexts * and we don't want to block the mutex in those cases */ spin_lock(&tx->tx_conn->gnc_device->gnd_lock); kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 1); spin_unlock(&tx->tx_conn->gnc_device->gnd_lock); kgnilnd_schedule_device(tx->tx_conn->gnc_device); } else { /* No match */ kgnilnd_nak_rdma(conn, rxmsg->gnm_type, -ENOENT, rxmsg->gnm_u.get.gngm_cookie, ni->ni_nid); } kgnilnd_consume_rx(rx); RETURN(0); nak_get_req_rev: /* make sure we send an error back when the GET fails */ kgnilnd_nak_rdma(conn, rxmsg->gnm_type, rc, rxmsg->gnm_u.get.gngm_cookie, ni->ni_nid); kgnilnd_tx_done(tx, rc); kgnilnd_consume_rx(rx); /* return magic LNet network error */ RETURN(-EIO); case GNILND_MSG_PUT_REQ_REV: /* LNET wants to truncate or drop transaction, sending NAK */ if (mlen == 0) { kgnilnd_consume_rx(rx); lnet_finalize(ni, lntmsg, 0); /* only error if lntmsg == NULL, otherwise we are just * short circuiting the rdma process of 0 bytes */ kgnilnd_nak_rdma(conn, rxmsg->gnm_type, lntmsg == NULL ? -ENOENT : 0, rxmsg->gnm_u.get.gngm_cookie, ni->ni_nid); RETURN(0); } if (lntmsg != NULL) { /* Matched! */ kgnilnd_setup_rdma(ni, rx, lntmsg, mlen); } else { /* No match */ kgnilnd_nak_rdma(conn, rxmsg->gnm_type, -ENOENT, rxmsg->gnm_u.get.gngm_cookie, ni->ni_nid); } kgnilnd_consume_rx(rx); RETURN(0); case GNILND_MSG_GET_REQ: if (lntmsg != NULL) { /* Matched! */ kgnilnd_setup_rdma(ni, rx, lntmsg, mlen); } else { /* No match */ kgnilnd_nak_rdma(conn, rxmsg->gnm_type, -ENOENT, rxmsg->gnm_u.get.gngm_cookie, ni->ni_nid); } kgnilnd_consume_rx(rx); RETURN(0); } RETURN(0); } /* needs write_lock on kgn_peer_conn_lock held */ int kgnilnd_check_conn_timeouts_locked(kgn_conn_t *conn) { unsigned long timeout, keepalive; unsigned long now = jiffies; unsigned long newest_last_rx; kgn_tx_t *tx; /* given that we found this conn hanging off a peer, it better damned * well be connected */ LASSERTF(conn->gnc_state == GNILND_CONN_ESTABLISHED, "conn 0x%p->%s with bad state%s\n", conn, conn->gnc_peer ? 
libcfs_nid2str(conn->gnc_peer->gnp_nid) : "", kgnilnd_conn_state2str(conn)); CDEBUG(D_NET, "checking conn %p->%s timeout %d keepalive %d " "rx_diff %lu tx_diff %lu\n", conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), conn->gnc_timeout, GNILND_TO2KA(conn->gnc_timeout), cfs_duration_sec(now - conn->gnc_last_rx_cq), cfs_duration_sec(now - conn->gnc_last_tx)); timeout = cfs_time_seconds(conn->gnc_timeout); keepalive = cfs_time_seconds(GNILND_TO2KA(conn->gnc_timeout)); /* just in case our lack of RX msg processing is gumming up the works - give the * remote an extra chance */ newest_last_rx = GNILND_LASTRX(conn); if (time_after_eq(now, newest_last_rx + timeout)) { uint32_t level = D_CONSOLE|D_NETERROR; if (conn->gnc_peer->gnp_down == GNILND_RCA_NODE_DOWN) { level = D_NET; } GNIDBG_CONN(level, conn, "No gnilnd traffic received from %s for %lu " "seconds, terminating connection. Is node down? ", libcfs_nid2str(conn->gnc_peer->gnp_nid), cfs_duration_sec(now - newest_last_rx)); return -ETIMEDOUT; } /* we don't timeout on last_tx stalls - we are going to trust the * underlying network to let us know when sends are failing. * At worst, the peer will timeout our RX stamp and drop the connection * at that point. We'll then see his CLOSE or at worst his RX * stamp stop and drop the connection on our end */ if (time_after_eq(now, conn->gnc_last_tx + keepalive)) { CDEBUG(D_NET, "sending NOOP -> %s (%p idle %lu(%lu)) " "last %lu/%lu/%lu %lus/%lus/%lus\n", libcfs_nid2str(conn->gnc_peer->gnp_nid), conn, cfs_duration_sec(jiffies - conn->gnc_last_tx), keepalive, conn->gnc_last_noop_want, conn->gnc_last_noop_sent, conn->gnc_last_noop_cq, cfs_duration_sec(jiffies - conn->gnc_last_noop_want), cfs_duration_sec(jiffies - conn->gnc_last_noop_sent), cfs_duration_sec(jiffies - conn->gnc_last_noop_cq)); set_mb(conn->gnc_last_noop_want, jiffies); atomic_inc(&conn->gnc_reaper_noop); if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NOOP_SEND)) return 0; tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, conn->gnc_peer->gnp_net->gnn_ni->ni_nid); if (tx == NULL) return 0; kgnilnd_queue_tx(conn, tx); } return 0; } /* needs write_lock on kgn_peer_conn_lock held */ void kgnilnd_check_peer_timeouts_locked(kgn_peer_t *peer, struct list_head *todie, struct list_head *souls) { unsigned long timeout; kgn_conn_t *conn, *connN = NULL; kgn_tx_t *tx, *txN; int rc = 0; int count = 0; int reconnect; int to_reconn; short releaseconn = 0; unsigned long first_rx = 0; int purgatory_conn_cnt = 0; CDEBUG(D_NET, "checking peer 0x%p->%s for timeouts; interval %lus\n", peer, libcfs_nid2str(peer->gnp_nid), peer->gnp_reconnect_interval); timeout = cfs_time_seconds(MAX(*kgnilnd_tunables.kgn_timeout, GNILND_MIN_TIMEOUT)); conn = kgnilnd_find_conn_locked(peer); if (conn) { /* if there is a valid conn, check the queues for timeouts */ rc = kgnilnd_check_conn_timeouts_locked(conn); if (rc) { if (CFS_FAIL_CHECK(CFS_FAIL_GNI_RX_CLOSE_CLOSING)) { /* simulate a RX CLOSE after the timeout but before * the scheduler thread gets it */ conn->gnc_close_recvd = GNILND_CLOSE_INJECT1; conn->gnc_peer_error = -ETIMEDOUT; } /* Once we mark closed, any of the scheduler threads could * get it and move through before we hit the fail loc code */ kgnilnd_close_conn_locked(conn, rc); } else { /* first_rx is used to decide when to release a conn from purgatory.
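* (added note) i.e. we only trust that the peer has stopped using the old mailbox once the new conn has been receiving for longer than the hardware timeout; the releaseconn check further down implements exactly that.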
*/ first_rx = conn->gnc_first_rx; } } /* now regardless of starting new conn, find tx on peer queue that * are old and smell bad - do this first so we don't trigger * reconnect on empty queue if we timeout all */ list_for_each_entry_safe(tx, txN, &peer->gnp_tx_queue, tx_list) { if (time_after_eq(jiffies, tx->tx_qtime + timeout)) { if (count == 0) { LCONSOLE_INFO("could not send to %s due to connection" " setup failure after %lu seconds\n", libcfs_nid2str(peer->gnp_nid), cfs_duration_sec(jiffies - tx->tx_qtime)); } kgnilnd_tx_del_state_locked(tx, peer, NULL, GNILND_TX_ALLOCD); list_add_tail(&tx->tx_list, todie); count++; } } if (count || peer->gnp_connecting == GNILND_PEER_KILL) { CDEBUG(D_NET, "canceling %d tx for peer 0x%p->%s\n", count, peer, libcfs_nid2str(peer->gnp_nid)); /* if we nuked all the TX, stop peer connection attempt (if there is one...) */ if (list_empty(&peer->gnp_tx_queue) || peer->gnp_connecting == GNILND_PEER_KILL) { /* we pass down todie to use a common function - but we know there are * no TX to add */ kgnilnd_cancel_peer_connect_locked(peer, todie); } } /* Don't reconnect if we are still trying to clear out old conns. * This prevents us from sending traffic on the new mbox before ensuring we are done * with the old one */ reconnect = (peer->gnp_down == GNILND_RCA_NODE_UP) && (atomic_read(&peer->gnp_dirty_eps) == 0); /* fast reconnect after a timeout */ to_reconn = !conn && (peer->gnp_last_errno == -ETIMEDOUT) && *kgnilnd_tunables.kgn_fast_reconn; /* if we are not connected and there are tx on the gnp_tx_queue waiting * to be sent, we'll check the reconnect interval and fire up a new * connection request */ if (reconnect && (peer->gnp_connecting == GNILND_PEER_IDLE) && (time_after_eq(jiffies, peer->gnp_reconnect_time)) && (!list_empty(&peer->gnp_tx_queue) || to_reconn)) { CDEBUG(D_NET, "starting connect to %s\n", libcfs_nid2str(peer->gnp_nid)); LASSERTF(peer->gnp_connecting == GNILND_PEER_IDLE, "Peer was idle and we " "have a write_lock, state issue %d\n", peer->gnp_connecting); peer->gnp_connecting = GNILND_PEER_CONNECT; kgnilnd_peer_addref(peer); /* extra ref for connd */ spin_lock(&peer->gnp_net->gnn_dev->gnd_connd_lock); list_add_tail(&peer->gnp_connd_list, &peer->gnp_net->gnn_dev->gnd_connd_peers); spin_unlock(&peer->gnp_net->gnn_dev->gnd_connd_lock); kgnilnd_schedule_dgram(peer->gnp_net->gnn_dev); } /* fail_loc to allow us to delay release of purgatory */ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PURG_REL_DELAY)) return; /* This check allows us to verify that the new conn is actually being used. This allows us to * pull the old conns out of purgatory if they have actually seen traffic. * We only release a conn from purgatory during stack reset, admin command, or when a peer reconnects */ if (first_rx && time_after(jiffies, first_rx + cfs_time_seconds(*kgnilnd_tunables.kgn_hardware_timeout))) { CDEBUG(D_INFO, "We can release peer %s conn's from purgatory %lu\n", libcfs_nid2str(peer->gnp_nid), first_rx + cfs_time_seconds(*kgnilnd_tunables.kgn_hardware_timeout)); releaseconn = 1; } list_for_each_entry_safe (conn, connN, &peer->gnp_conns, gnc_list) { /* check for purgatory timeouts */ if (conn->gnc_in_purgatory) { /* We cannot detach this conn from purgatory if it has not been closed, so we skip it; * that way the next time we check it we can detach it from purgatory */ if (conn->gnc_state != GNILND_CONN_DONE) { /* Skip over conns that are currently not DONE. If they aren't already scheduled * for completion, something in the state machine is broken.
*/ continue; } /* We only detach a conn that is in purgatory if we have received a close message, * we have a new valid connection that has successfully received data, or an admin * command tells us we need to detach. */ if (conn->gnc_close_recvd || releaseconn || conn->gnc_needs_detach) { unsigned long waiting; waiting = (long) jiffies - conn->gnc_last_rx_cq; /* C.E: The remote peer is expected to close the * connection (see kgnilnd_check_conn_timeouts) * via the reaper thread and nuke out the MDD and * FMA resources after conn->gnc_timeout has expired * without an FMA RX */ CDEBUG(D_NET, "Reconnected to %s in %lds or admin forced detach, dropping " " held resources\n", libcfs_nid2str(conn->gnc_peer->gnp_nid), cfs_duration_sec(waiting)); kgnilnd_detach_purgatory_locked(conn, souls); } else { purgatory_conn_cnt++; } } } /* If we have too many connections in purgatory we could run out of * resources. Limit the number of connections to a tunable number, * clean up to the minimum all in one fell swoop... there are * situations where DVS will retry tx's and we can eat up several * hundred connection requests at once. */ if (purgatory_conn_cnt > *kgnilnd_tunables.kgn_max_purgatory) { list_for_each_entry_safe(conn, connN, &peer->gnp_conns, gnc_list) { if (conn->gnc_in_purgatory && conn->gnc_state == GNILND_CONN_DONE) { CDEBUG(D_NET, "Dropping held resources due to" " resource limits being hit\n"); kgnilnd_detach_purgatory_locked(conn, souls); if (purgatory_conn_cnt-- < *kgnilnd_tunables.kgn_max_purgatory) break; } } } return; } void kgnilnd_reaper_check(int idx) { struct list_head *peers = &kgnilnd_data.kgn_peers[idx]; struct list_head *ctmp, *ctmpN; struct list_head geriatrics; struct list_head souls; INIT_LIST_HEAD(&geriatrics); INIT_LIST_HEAD(&souls); write_lock(&kgnilnd_data.kgn_peer_conn_lock); list_for_each_safe(ctmp, ctmpN, peers) { kgn_peer_t *peer = NULL; /* don't timeout stuff if the network is mucked or shutting down */ if (kgnilnd_check_hw_quiesce()) { break; } peer = list_entry(ctmp, kgn_peer_t, gnp_list); kgnilnd_check_peer_timeouts_locked(peer, &geriatrics, &souls); } write_unlock(&kgnilnd_data.kgn_peer_conn_lock); kgnilnd_txlist_done(&geriatrics, -EHOSTUNREACH); kgnilnd_release_purgatory_list(&souls); } void kgnilnd_update_reaper_timeout(long timeout) { LASSERT(timeout > 0); spin_lock(&kgnilnd_data.kgn_reaper_lock); if (timeout < kgnilnd_data.kgn_new_min_timeout) kgnilnd_data.kgn_new_min_timeout = timeout; spin_unlock(&kgnilnd_data.kgn_reaper_lock); } static void kgnilnd_reaper_poke_with_stick(unsigned long arg) { wake_up(&kgnilnd_data.kgn_reaper_waitq); } int kgnilnd_reaper(void *arg) { long timeout; int i; int hash_index = 0; unsigned long next_check_time = jiffies; long current_min_timeout = MAX_SCHEDULE_TIMEOUT; struct timer_list timer; DEFINE_WAIT(wait); cfs_block_allsigs(); /* all gnilnd threads need to run fairly urgently */ set_user_nice(current, *kgnilnd_tunables.kgn_nice); spin_lock(&kgnilnd_data.kgn_reaper_lock); while (!kgnilnd_data.kgn_shutdown) { /* I wake up every 'p' seconds to check for timeouts on some * more peers. I try to check every connection 'n' times * within the global minimum of all keepalive and timeout * intervals, to ensure I attend to every connection within * (n+1)/n times its timeout intervals.
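* (added illustration, hypothetical numbers: with p = 1, n = 4, a peer hash of 503 entries and a 60s minimum timeout, the chunk computation below gives 503 * 4 * 1 / 60 ~= 33 hash chains per wake-up, so each chain is visited roughly n times per timeout interval.)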
*/ const int p = GNILND_REAPER_THREAD_WAKE; const int n = GNILND_REAPER_NCHECKS; int chunk; /* to quiesce or to not quiesce, that is the question */ if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { spin_unlock(&kgnilnd_data.kgn_reaper_lock); KGNILND_SPIN_QUIESCE; spin_lock(&kgnilnd_data.kgn_reaper_lock); } /* careful with the jiffy wrap... */ timeout = (long)(next_check_time - jiffies); if (timeout > 0) { prepare_to_wait(&kgnilnd_data.kgn_reaper_waitq, &wait, TASK_INTERRUPTIBLE); spin_unlock(&kgnilnd_data.kgn_reaper_lock); setup_timer(&timer, kgnilnd_reaper_poke_with_stick, next_check_time); mod_timer(&timer, (long) jiffies + timeout); /* check flag variables before committing */ if (!kgnilnd_data.kgn_shutdown && !kgnilnd_data.kgn_quiesce_trigger) { CDEBUG(D_INFO, "schedule timeout %ld (%lu sec)\n", timeout, cfs_duration_sec(timeout)); schedule(); CDEBUG(D_INFO, "awake after schedule\n"); } del_singleshot_timer_sync(&timer); spin_lock(&kgnilnd_data.kgn_reaper_lock); finish_wait(&kgnilnd_data.kgn_reaper_waitq, &wait); continue; } /* new_min_timeout is set from the conn timeouts and keepalive * this should end up with a min timeout of * GNILND_TIMEOUT2KEEPALIVE(t) or roughly LND_TIMEOUT/2 */ if (kgnilnd_data.kgn_new_min_timeout < current_min_timeout) { current_min_timeout = kgnilnd_data.kgn_new_min_timeout; CDEBUG(D_NET, "Set new min timeout %ld\n", current_min_timeout); } spin_unlock(&kgnilnd_data.kgn_reaper_lock); /* Compute how many table entries to check now so I get round * the whole table fast enough given that I do this at fixed * intervals of 'p' seconds */ chunk = *kgnilnd_tunables.kgn_peer_hash_size; if (kgnilnd_data.kgn_new_min_timeout > n * p) chunk = (chunk * n * p) / kgnilnd_data.kgn_new_min_timeout; if (chunk == 0) chunk = 1; for (i = 0; i < chunk; i++) { kgnilnd_reaper_check(hash_index); hash_index = (hash_index + 1) % *kgnilnd_tunables.kgn_peer_hash_size; } next_check_time = (long) jiffies + cfs_time_seconds(p); CDEBUG(D_INFO, "next check at %lu or in %d sec\n", next_check_time, p); spin_lock(&kgnilnd_data.kgn_reaper_lock); } spin_unlock(&kgnilnd_data.kgn_reaper_lock); kgnilnd_thread_fini(); return 0; } int kgnilnd_recv_bte_get(kgn_tx_t *tx) { unsigned niov, offset, nob; lnet_kiov_t *kiov; lnet_msg_t *lntmsg = tx->tx_lntmsg[0]; kgnilnd_parse_lnet_rdma(lntmsg, &niov, &offset, &nob, &kiov, tx->tx_nob_rdma); if (kiov != NULL) { lnet_copy_flat2kiov( niov, kiov, offset, nob, tx->tx_buffer_copy + tx->tx_offset, 0, nob); } else { memcpy(tx->tx_buffer, tx->tx_buffer_copy + tx->tx_offset, nob); } return 0; } int kgnilnd_check_rdma_cq(kgn_device_t *dev) { gni_return_t rrc; gni_post_descriptor_t *desc; __u64 event_data; kgn_tx_ev_id_t ev_id; char err_str[256]; int should_retry, rc; long num_processed = 0; kgn_conn_t *conn = NULL; kgn_tx_t *tx = NULL; kgn_rdma_desc_t *rdesc; unsigned int rnob; __u64 rcookie; for (;;) { /* make sure we don't keep looping if we need to reset */ if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { return num_processed; } rc = kgnilnd_mutex_trylock(&dev->gnd_cq_mutex); if (!rc) { /* we didn't get the mutex, so return that there is still work * to be done */ return 1; } if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DELAY_RDMA)) { /* a bit gross - but we need a good way to test for * delayed RDMA completions and the easiest way to do * that is to delay the RDMA CQ events */ rrc = GNI_RC_NOT_DONE; } else { rrc = kgnilnd_cq_get_event(dev->gnd_snd_rdma_cqh, &event_data); } if (rrc == GNI_RC_NOT_DONE) { kgnilnd_gl_mutex_unlock(&dev->gnd_cq_mutex); CDEBUG(D_INFO, "SEND RDMA CQ %d
empty processed %ld\n", dev->gnd_id, num_processed); return num_processed; } dev->gnd_sched_alive = jiffies; num_processed++; LASSERTF(!GNI_CQ_OVERRUN(event_data), "this is bad, somehow our credits didn't protect us" " from CQ overrun\n"); LASSERTF(GNI_CQ_GET_TYPE(event_data) == GNI_CQ_EVENT_TYPE_POST, "rrc %d, GNI_CQ_GET_TYPE("LPX64") = "LPX64"\n", rrc, event_data, GNI_CQ_GET_TYPE(event_data)); rrc = kgnilnd_get_completed(dev->gnd_snd_rdma_cqh, event_data, &desc); kgnilnd_gl_mutex_unlock(&dev->gnd_cq_mutex); /* XXX Nic: Need better error handling here... */ LASSERTF((rrc == GNI_RC_SUCCESS) || (rrc == GNI_RC_TRANSACTION_ERROR), "rrc %d\n", rrc); ev_id.txe_cookie = desc->post_id; kgnilnd_validate_tx_ev_id(&ev_id, &tx, &conn); if (conn == NULL || tx == NULL) { /* either conn or tx was already nuked and this is a "late" * completion, so drop it */ continue; } GNITX_ASSERTF(tx, tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE || tx->tx_msg.gnm_type == GNILND_MSG_GET_DONE || tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE_REV || tx->tx_msg.gnm_type == GNILND_MSG_GET_DONE_REV, "tx %p with type %d\n", tx, tx->tx_msg.gnm_type); GNIDBG_TX(D_NET, tx, "RDMA completion for %d bytes", tx->tx_nob); if (tx->tx_msg.gnm_type == GNILND_MSG_GET_DONE_REV) { lnet_set_reply_msg_len(NULL, tx->tx_lntmsg[1], tx->tx_msg.gnm_u.completion.gncm_retval); } rc = 0; if (tx->tx_msg.gnm_type == GNILND_MSG_GET_DONE_REV && desc->status == GNI_RC_SUCCESS) { if (tx->tx_buffer_copy != NULL) kgnilnd_recv_bte_get(tx); rc = kgnilnd_verify_rdma_cksum(tx, tx->tx_putinfo.gnpam_payload_cksum, tx->tx_nob_rdma); } if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE_REV && desc->status == GNI_RC_SUCCESS) { if (tx->tx_buffer_copy != NULL) kgnilnd_recv_bte_get(tx); rc = kgnilnd_verify_rdma_cksum(tx, tx->tx_getinfo.gngm_payload_cksum, tx->tx_nob_rdma); } /* remove from rdmaq */ kgnilnd_conn_mutex_lock(&conn->gnc_rdma_mutex); spin_lock(&conn->gnc_list_lock); kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD); spin_unlock(&conn->gnc_list_lock); kgnilnd_conn_mutex_unlock(&conn->gnc_rdma_mutex); if (CFS_FAIL_CHECK(CFS_FAIL_GNI_RDMA_CQ_ERROR)) { event_data = 1LL << 48; rc = 1; } if (likely(desc->status == GNI_RC_SUCCESS) && rc == 0) { atomic_inc(&dev->gnd_rdma_ntx); atomic64_add(tx->tx_nob, &dev->gnd_rdma_txbytes); /* transaction succeeded, add into fmaq */ kgnilnd_queue_tx(conn, tx); kgnilnd_peer_alive(conn->gnc_peer); /* drop ref from kgnilnd_validate_tx_ev_id */ kgnilnd_admin_decref(conn->gnc_tx_in_use); kgnilnd_conn_decref(conn); continue; } /* fall through to the TRANSACTION_ERROR case */ tx->tx_retrans++; /* get stringified version for log messages */ kgnilnd_cq_error_str(event_data, &err_str, 256); kgnilnd_cq_error_recoverable(event_data, &should_retry); /* make sure we are not off in the weeds with this tx */ if (tx->tx_retrans > *kgnilnd_tunables.kgn_max_retransmits) { GNIDBG_TX(D_NETERROR, tx, "giving up on TX, too many retries", NULL); should_retry = 0; } GNIDBG_TX(D_NETERROR, tx, "RDMA %s error (%s)", should_retry ? 
"transient" : "unrecoverable", err_str); if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_DONE || tx->tx_msg.gnm_type == GNILND_MSG_GET_DONE_REV) { rdesc = &tx->tx_putinfo.gnpam_desc; rnob = tx->tx_putinfo.gnpam_desc.gnrd_nob; rcookie = tx->tx_putinfo.gnpam_dst_cookie; } else { rdesc = &tx->tx_getinfo.gngm_desc; rnob = tx->tx_lntmsg[0]->msg_len; rcookie = tx->tx_getinfo.gngm_cookie; } if (should_retry) { kgnilnd_rdma(tx, tx->tx_msg.gnm_type, rdesc, rnob, rcookie); } else { kgnilnd_nak_rdma(conn, tx->tx_msg.gnm_type, -EFAULT, rcookie, tx->tx_msg.gnm_srcnid); kgnilnd_tx_done(tx, -GNILND_NOPURG); kgnilnd_close_conn(conn, -ECOMM); } /* drop ref from kgnilnd_validate_tx_ev_id */ kgnilnd_admin_decref(conn->gnc_tx_in_use); kgnilnd_conn_decref(conn); } } int kgnilnd_check_fma_send_cq(kgn_device_t *dev) { gni_return_t rrc; __u64 event_data; kgn_tx_ev_id_t ev_id; kgn_tx_t *tx = NULL; kgn_conn_t *conn = NULL; int queued_fma, saw_reply, rc; long num_processed = 0; for (;;) { /* make sure we don't keep looping if we need to reset */ if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { return num_processed; } rc = kgnilnd_mutex_trylock(&dev->gnd_cq_mutex); if (!rc) { /* we didn't get the mutex, so return that there is still work * to be done */ return 1; } rrc = kgnilnd_cq_get_event(dev->gnd_snd_fma_cqh, &event_data); kgnilnd_gl_mutex_unlock(&dev->gnd_cq_mutex); if (rrc == GNI_RC_NOT_DONE) { CDEBUG(D_INFO, "SMSG send CQ %d not ready (data "LPX64") " "processed %ld\n", dev->gnd_id, event_data, num_processed); return num_processed; } dev->gnd_sched_alive = jiffies; num_processed++; LASSERTF(!GNI_CQ_OVERRUN(event_data), "this is bad, somehow our credits didn't " "protect us from CQ overrun\n"); LASSERTF(GNI_CQ_GET_TYPE(event_data) == GNI_CQ_EVENT_TYPE_SMSG, "rrc %d, GNI_CQ_GET_TYPE("LPX64") = "LPX64"\n", rrc, event_data, GNI_CQ_GET_TYPE(event_data)); /* if SMSG couldn't handle an error, time for conn to die */ if (unlikely(rrc == GNI_RC_TRANSACTION_ERROR)) { char err_str[256]; /* need to take the write_lock to ensure atomicity * on the conn state if we need to close it */ write_lock(&kgnilnd_data.kgn_peer_conn_lock); conn = kgnilnd_cqid2conn_locked(GNI_CQ_GET_INST_ID(event_data)); if (conn == NULL) { /* Conn was destroyed? 
*/ CDEBUG(D_NET, "SMSG CQID lookup "LPX64" failed\n", GNI_CQ_GET_INST_ID(event_data)); write_unlock(&kgnilnd_data.kgn_peer_conn_lock); continue; } kgnilnd_cq_error_str(event_data, &err_str, 256); CNETERR("SMSG send error to %s: rc %d (%s)\n", libcfs_nid2str(conn->gnc_peer->gnp_nid), rrc, err_str); kgnilnd_close_conn_locked(conn, -ECOMM); write_unlock(&kgnilnd_data.kgn_peer_conn_lock); /* no need to process rest of this tx - * it is getting canceled */ continue; } /* fall through to GNI_RC_SUCCESS case */ ev_id.txe_smsg_id = GNI_CQ_GET_MSG_ID(event_data); kgnilnd_validate_tx_ev_id(&ev_id, &tx, &conn); if (conn == NULL || tx == NULL) { /* either conn or tx was already nuked and this is a "late" * completion, so drop it */ continue; } tx->tx_conn->gnc_last_tx_cq = jiffies; if (tx->tx_msg.gnm_type == GNILND_MSG_NOOP) { set_mb(conn->gnc_last_noop_cq, jiffies); } /* lock tx_list_state and tx_state */ kgnilnd_conn_mutex_lock(&conn->gnc_smsg_mutex); spin_lock(&tx->tx_conn->gnc_list_lock); GNITX_ASSERTF(tx, tx->tx_list_state == GNILND_TX_LIVE_FMAQ, "state not GNILND_TX_LIVE_FMAQ", NULL); GNITX_ASSERTF(tx, tx->tx_state & GNILND_TX_WAITING_COMPLETION, "not waiting for completion", NULL); GNIDBG_TX(D_NET, tx, "SMSG complete tx_state %x rc %d", tx->tx_state, rrc); tx->tx_state &= ~GNILND_TX_WAITING_COMPLETION; /* This will trigger other FMA sends that were * pending this completion */ queued_fma = !list_empty(&tx->tx_conn->gnc_fmaq); /* we either did not expect reply or we already got it */ saw_reply = !(tx->tx_state & GNILND_TX_WAITING_REPLY); spin_unlock(&tx->tx_conn->gnc_list_lock); kgnilnd_conn_mutex_unlock(&conn->gnc_smsg_mutex); if (queued_fma) { CDEBUG(D_NET, "scheduling conn 0x%p->%s for fmaq\n", conn, libcfs_nid2str(conn->gnc_peer->gnp_nid)); kgnilnd_schedule_conn(conn); } /* If saw_reply is false as soon as gnc_list_lock is dropped the tx could be nuked * If saw_reply is true we know that the tx is safe to use as the other thread * is already finished with it. 
*/ if (saw_reply) { /* no longer need to track on the live_fmaq */ kgnilnd_tx_del_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_ALLOCD); if (tx->tx_state & GNILND_TX_PENDING_RDMA) { /* we already got reply & were waiting for * completion of initial send */ /* to initiate RDMA transaction */ GNIDBG_TX(D_NET, tx, "Pending RDMA 0x%p type 0x%02x", tx->tx_msg.gnm_type); tx->tx_state &= ~GNILND_TX_PENDING_RDMA; rc = kgnilnd_send_mapped_tx(tx, 0); GNITX_ASSERTF(tx, rc == 0, "RDMA send failed: %d\n", rc); } else { /* we are done with this tx */ GNIDBG_TX(D_NET, tx, "Done with tx type 0x%02x", tx->tx_msg.gnm_type); kgnilnd_tx_done(tx, tx->tx_rc); } } /* drop ref from kgnilnd_validate_tx_ev_id */ kgnilnd_admin_decref(conn->gnc_tx_in_use); kgnilnd_conn_decref(conn); /* if we are waiting for a REPLY, we'll handle the tx then */ } /* end for loop */ } int kgnilnd_check_fma_rcv_cq(kgn_device_t *dev) { kgn_conn_t *conn; gni_return_t rrc; __u64 event_data; long num_processed = 0; struct list_head *conns; struct list_head *tmp; int rc; for (;;) { /* make sure we don't keep looping if we need to reset */ if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { return num_processed; } rc = kgnilnd_mutex_trylock(&dev->gnd_cq_mutex); if (!rc) { /* we didn't get the mutex, so return that there is still work * to be done */ return 1; } rrc = kgnilnd_cq_get_event(dev->gnd_rcv_fma_cqh, &event_data); kgnilnd_gl_mutex_unlock(&dev->gnd_cq_mutex); if (rrc == GNI_RC_NOT_DONE) { CDEBUG(D_INFO, "SMSG RX CQ %d empty data "LPX64" " "processed %ld\n", dev->gnd_id, event_data, num_processed); return num_processed; } dev->gnd_sched_alive = jiffies; num_processed++; /* this is the only CQ that can really handle transient * CQ errors */ if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CQ_GET_EVENT)) { rrc = cfs_fail_val ? cfs_fail_val : GNI_RC_ERROR_RESOURCE; if (rrc == GNI_RC_ERROR_RESOURCE) { /* set overrun too */ event_data |= (1UL << 63); LASSERTF(GNI_CQ_OVERRUN(event_data), "(1UL << 63) is no longer the bit to" "set to indicate CQ_OVERRUN\n"); } } /* sender should get error event too and take care of failed transaction by re-transmitting */ if (rrc == GNI_RC_TRANSACTION_ERROR) { CDEBUG(D_NET, "SMSG RX CQ error "LPX64"\n", event_data); continue; } if (likely(!GNI_CQ_OVERRUN(event_data))) { read_lock(&kgnilnd_data.kgn_peer_conn_lock); conn = kgnilnd_cqid2conn_locked( GNI_CQ_GET_INST_ID(event_data)); if (conn == NULL) { CDEBUG(D_NET, "SMSG RX CQID lookup "LPU64" " "failed, dropping event "LPX64"\n", GNI_CQ_GET_INST_ID(event_data), event_data); } else { CDEBUG(D_NET, "SMSG RX: CQID "LPU64" " "conn %p->%s\n", GNI_CQ_GET_INST_ID(event_data), conn, conn->gnc_peer ? libcfs_nid2str(conn->gnc_peer->gnp_nid) : ""); conn->gnc_last_rx_cq = jiffies; /* stash first rx so we can clear out purgatory. */ if (conn->gnc_first_rx == 0) { conn->gnc_first_rx = jiffies; } kgnilnd_peer_alive(conn->gnc_peer); kgnilnd_schedule_conn(conn); } read_unlock(&kgnilnd_data.kgn_peer_conn_lock); continue; } /* FMA CQ has overflowed: check ALL conns */ CNETERR("SMSG RX CQ overflow: scheduling ALL " "conns on device %d\n", dev->gnd_id); for (rc = 0; rc < *kgnilnd_tunables.kgn_peer_hash_size; rc++) { read_lock(&kgnilnd_data.kgn_peer_conn_lock); conns = &kgnilnd_data.kgn_conns[rc]; list_for_each(tmp, conns) { conn = list_entry(tmp, kgn_conn_t, gnc_hashlist); if (conn->gnc_device == dev) { kgnilnd_schedule_conn(conn); conn->gnc_last_rx_cq = jiffies; } } /* don't block write lockers for too long... 
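* (added note) the read lock is dropped and re-taken for every hash chain, so a pending write_lock waiter is not starved for the duration of the whole table scan.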
*/ read_unlock(&kgnilnd_data.kgn_peer_conn_lock); } } } /* try_map_if_full should only be used when processing TX from list of * backlog TX waiting on mappings to free up * * Return Codes: * try_map_if_full = 0: 0 (sent or queued), (-|+)errno failure of kgnilnd_sendmsg * try_map_if_full = 1: 0 (sent), -ENOMEM for caller to requeue, (-|+)errno failure of kgnilnd_sendmsg */ int kgnilnd_send_mapped_tx(kgn_tx_t *tx, int try_map_if_full) { /* slight bit of race if multiple people calling, but at worst we'll have * order altered just a bit... which would not be deterministic anyways */ int rc = atomic_read(&tx->tx_conn->gnc_device->gnd_nq_map); GNIDBG_TX(D_NET, tx, "try %d nq_map %d", try_map_if_full, rc); /* We know that we have a GART reservation that should guarantee forward progress. * This means we don't need to take any extraordinary efforts if we are failing * mappings here - even if we are holding a very small number of these. */ if (try_map_if_full || (rc == 0)) { rc = kgnilnd_map_buffer(tx); } /* rc should be 0 if we mapped successfully here; if non-zero * we are queueing */ if (rc != 0) { /* if try_map_if_full set, they handle requeuing */ if (unlikely(try_map_if_full)) { RETURN(rc); } else { spin_lock(&tx->tx_conn->gnc_device->gnd_lock); kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 1); spin_unlock(&tx->tx_conn->gnc_device->gnd_lock); /* make sure we wake up sched to run this */ kgnilnd_schedule_device(tx->tx_conn->gnc_device); /* return 0 as this is now queued for later sending */ RETURN(0); } } switch (tx->tx_msg.gnm_type) { default: LBUG(); break; /* GET_REQ and PUT_ACK are outbound messages sending our mapping key to * remote node where the RDMA will be started * Special case -EAGAIN logic - this should just be queued as if the mapping couldn't * be satisfied. The rest of the errors are "hard" errors that require * upper layers to handle themselves. * If kgnilnd_post_rdma returns a resource error, kgnilnd_rdma will put * the tx back on the TX_MAPQ. When this tx is pulled back off the MAPQ, * its gnm_type will now be GNILND_MSG_PUT_DONE or * GNILND_MSG_GET_DONE_REV. */ case GNILND_MSG_GET_REQ: tx->tx_msg.gnm_u.get.gngm_desc.gnrd_key = tx->tx_map_key; tx->tx_msg.gnm_u.get.gngm_cookie = tx->tx_id.txe_cookie; tx->tx_msg.gnm_u.get.gngm_desc.gnrd_addr = (__u64)((unsigned long)tx->tx_buffer); tx->tx_msg.gnm_u.get.gngm_desc.gnrd_nob = tx->tx_nob; tx->tx_state = GNILND_TX_WAITING_COMPLETION | GNILND_TX_WAITING_REPLY; if (CFS_FAIL_CHECK(CFS_FAIL_GNI_GET_REQ_AGAIN)) { tx->tx_state |= GNILND_TX_FAIL_SMSG; } /* redirect to FMAQ on failure, no need to infinite loop here in MAPQ */ rc = kgnilnd_sendmsg(tx, NULL, 0, &tx->tx_conn->gnc_list_lock, GNILND_TX_FMAQ); break; case GNILND_MSG_PUT_ACK: tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_key = tx->tx_map_key; tx->tx_state = GNILND_TX_WAITING_COMPLETION | GNILND_TX_WAITING_REPLY; if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PUT_ACK_AGAIN)) { tx->tx_state |= GNILND_TX_FAIL_SMSG; } /* redirect to FMAQ on failure, no need to infinite loop here in MAPQ */ rc = kgnilnd_sendmsg(tx, NULL, 0, &tx->tx_conn->gnc_list_lock, GNILND_TX_FMAQ); break; /* PUT_REQ and GET_DONE are where we do the actual RDMA */ case GNILND_MSG_PUT_DONE: case GNILND_MSG_PUT_REQ: rc = kgnilnd_rdma(tx, GNILND_MSG_PUT_DONE, &tx->tx_putinfo.gnpam_desc, tx->tx_putinfo.gnpam_desc.gnrd_nob, tx->tx_putinfo.gnpam_dst_cookie); RETURN(try_map_if_full ?
rc : 0); break; case GNILND_MSG_GET_DONE: rc = kgnilnd_rdma(tx, GNILND_MSG_GET_DONE, &tx->tx_getinfo.gngm_desc, tx->tx_lntmsg[0]->msg_len, tx->tx_getinfo.gngm_cookie); RETURN(try_map_if_full ? rc : 0); break; case GNILND_MSG_PUT_REQ_REV: tx->tx_msg.gnm_u.get.gngm_desc.gnrd_key = tx->tx_map_key; tx->tx_msg.gnm_u.get.gngm_cookie = tx->tx_id.txe_cookie; tx->tx_msg.gnm_u.get.gngm_desc.gnrd_addr = (__u64)((unsigned long)tx->tx_buffer); tx->tx_msg.gnm_u.get.gngm_desc.gnrd_nob = tx->tx_nob; tx->tx_state = GNILND_TX_WAITING_COMPLETION | GNILND_TX_WAITING_REPLY; kgnilnd_compute_rdma_cksum(tx, tx->tx_nob); tx->tx_msg.gnm_u.get.gngm_payload_cksum = tx->tx_msg.gnm_payload_cksum; rc = kgnilnd_sendmsg(tx, NULL, 0, &tx->tx_conn->gnc_list_lock, GNILND_TX_FMAQ); break; case GNILND_MSG_PUT_DONE_REV: rc = kgnilnd_rdma(tx, GNILND_MSG_PUT_DONE_REV, &tx->tx_getinfo.gngm_desc, tx->tx_nob, tx->tx_getinfo.gngm_cookie); RETURN(try_map_if_full ? rc : 0); break; case GNILND_MSG_GET_ACK_REV: tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_key = tx->tx_map_key; tx->tx_state = GNILND_TX_WAITING_COMPLETION | GNILND_TX_WAITING_REPLY; /* LNET_GETS are a special case for parse */ kgnilnd_compute_rdma_cksum(tx, tx->tx_msg.gnm_u.putack.gnpam_desc.gnrd_nob); tx->tx_msg.gnm_u.putack.gnpam_payload_cksum = tx->tx_msg.gnm_payload_cksum; if (CFS_FAIL_CHECK(CFS_FAIL_GNI_PUT_ACK_AGAIN)) tx->tx_state |= GNILND_TX_FAIL_SMSG; /* redirect to FMAQ on failure, no need to infinite loop here in MAPQ */ rc = kgnilnd_sendmsg(tx, NULL, 0, &tx->tx_conn->gnc_list_lock, GNILND_TX_FMAQ); break; case GNILND_MSG_GET_DONE_REV: case GNILND_MSG_GET_REQ_REV: rc = kgnilnd_rdma(tx, GNILND_MSG_GET_DONE_REV, &tx->tx_putinfo.gnpam_desc, tx->tx_putinfo.gnpam_desc.gnrd_nob, tx->tx_putinfo.gnpam_dst_cookie); RETURN(try_map_if_full ? rc : 0); break; } RETURN(rc); } void kgnilnd_process_fmaq(kgn_conn_t *conn) { int more_to_do = 0; kgn_tx_t *tx = NULL; void *buffer = NULL; unsigned int nob = 0; int rc; /* NB 1. kgnilnd_sendmsg() may fail if I'm out of credits right now. * However I will be rescheduled by an FMA completion event * when I eventually get some. * NB 2. Sampling gnc_state here races with setting it elsewhere. * But it doesn't matter if I try to send a "real" message just * as I start closing because I'll get scheduled to send the * close anyway. */ /* Short circuit if the ep_handle is NULL - we can't send anyway.
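* (added note) a NULL ep_handle means the GNI endpoint has already been torn down - most likely because the conn is being closed as stale - so there is nothing left to post an SMSG to.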
*/ if (conn->gnc_ephandle == NULL) return; LASSERTF(!conn->gnc_close_sent, "Conn %p close was sent\n", conn); spin_lock(&conn->gnc_list_lock); if (list_empty(&conn->gnc_fmaq)) { int keepalive = GNILND_TO2KA(conn->gnc_timeout); spin_unlock(&conn->gnc_list_lock); if (time_after_eq(jiffies, conn->gnc_last_tx + cfs_time_seconds(keepalive))) { CDEBUG(D_NET, "sending NOOP -> %s (%p idle %lu(%d)) " "last %lu/%lu/%lu %lus/%lus/%lus\n", libcfs_nid2str(conn->gnc_peer->gnp_nid), conn, cfs_duration_sec(jiffies - conn->gnc_last_tx), keepalive, conn->gnc_last_noop_want, conn->gnc_last_noop_sent, conn->gnc_last_noop_cq, cfs_duration_sec(jiffies - conn->gnc_last_noop_want), cfs_duration_sec(jiffies - conn->gnc_last_noop_sent), cfs_duration_sec(jiffies - conn->gnc_last_noop_cq)); atomic_inc(&conn->gnc_sched_noop); set_mb(conn->gnc_last_noop_want, jiffies); if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NOOP_SEND)) return; tx = kgnilnd_new_tx_msg(GNILND_MSG_NOOP, conn->gnc_peer->gnp_net->gnn_ni->ni_nid); if (tx != NULL) { int rc; rc = kgnilnd_set_tx_id(tx, conn); if (rc != 0) { kgnilnd_tx_done(tx, rc); return; } } } } else { tx = list_first_entry(&conn->gnc_fmaq, kgn_tx_t, tx_list); /* move from fmaq to allocd, kgnilnd_sendmsg will move to live_fmaq */ kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD); more_to_do = !list_empty(&conn->gnc_fmaq); spin_unlock(&conn->gnc_list_lock); } /* if there is no real TX or no NOOP to send, bail */ if (tx == NULL) { return; } if (!tx->tx_retrans) tx->tx_cred_wait = jiffies; GNITX_ASSERTF(tx, tx->tx_id.txe_smsg_id != 0, "tx with zero id", NULL); CDEBUG(D_NET, "sending regular msg: %p, type %s(0x%02x), cookie "LPX64"\n", tx, kgnilnd_msgtype2str(tx->tx_msg.gnm_type), tx->tx_msg.gnm_type, tx->tx_id.txe_cookie); rc = 0; switch (tx->tx_msg.gnm_type) { default: LBUG(); case GNILND_MSG_NOOP: case GNILND_MSG_CLOSE: case GNILND_MSG_IMMEDIATE: tx->tx_state = GNILND_TX_WAITING_COMPLETION; buffer = tx->tx_buffer; nob = tx->tx_nob; break; case GNILND_MSG_GET_DONE: case GNILND_MSG_PUT_DONE: case GNILND_MSG_PUT_DONE_REV: case GNILND_MSG_GET_DONE_REV: case GNILND_MSG_PUT_NAK: case GNILND_MSG_GET_NAK: case GNILND_MSG_GET_NAK_REV: case GNILND_MSG_PUT_NAK_REV: tx->tx_state = GNILND_TX_WAITING_COMPLETION; break; case GNILND_MSG_PUT_REQ: case GNILND_MSG_GET_REQ_REV: tx->tx_msg.gnm_u.putreq.gnprm_cookie = tx->tx_id.txe_cookie; /* fall through */ case GNILND_MSG_PUT_ACK: case GNILND_MSG_PUT_REQ_REV: case GNILND_MSG_GET_ACK_REV: case GNILND_MSG_GET_REQ: /* This is really only to handle the retransmit of SMSG once these * two messages are set up in send_mapped_tx */ tx->tx_state = GNILND_TX_WAITING_COMPLETION | GNILND_TX_WAITING_REPLY; break; } if (likely(rc == 0)) { rc = kgnilnd_sendmsg(tx, buffer, nob, &conn->gnc_list_lock, GNILND_TX_FMAQ); } if (rc > 0) { /* don't explicitly reschedule here - we are short credits and will rely on * kgnilnd_sendmsg to resched the conn if need be */ more_to_do = 0; } else if (rc < 0) { /* bail: it wasn't sent and we didn't get EAGAIN indicating we should retrans - * almost certainly a software bug, but let's play nice with the other kids */ kgnilnd_tx_done(tx, rc); /* just for fun, kick peer in arse - resetting conn might help to correct * this almost certainly buggy software caused return code */ kgnilnd_close_conn(conn, rc); } if (more_to_do) { CDEBUG(D_NET, "Rescheduling %p (more to do)\n", conn); kgnilnd_schedule_conn(conn); } } int kgnilnd_process_rdmaq(kgn_device_t *dev) { int found_work = 0; kgn_tx_t *tx; if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DELAY_RDMAQ)) { RETURN(found_work); } if
(time_after_eq(jiffies, dev->gnd_rdmaq_deadline)) { unsigned long dead_bump; long new_ok; /* if we think we need to adjust, take lock to serialize and recheck */ spin_lock(&dev->gnd_rdmaq_lock); if (time_after_eq(jiffies, dev->gnd_rdmaq_deadline)) { del_singleshot_timer_sync(&dev->gnd_rdmaq_timer); dead_bump = cfs_time_seconds(1) / *kgnilnd_tunables.kgn_rdmaq_intervals; /* roll the bucket forward */ dev->gnd_rdmaq_deadline = jiffies + dead_bump; if (kgnilnd_data.kgn_rdmaq_override && (*kgnilnd_tunables.kgn_rdmaq_intervals != 0)) { new_ok = kgnilnd_data.kgn_rdmaq_override / *kgnilnd_tunables.kgn_rdmaq_intervals; } else { new_ok = ~0UL >> 1; } /* roll current outstanding forward to make sure we carry outstanding * commitment forward * new_ok starts out as the whole interval value * - first subtract bytes_out from last interval, as that would push us over * strict limits for this interval * - second, set bytes_ok to new_ok to ensure it doesn't exceed the current auth * (e.g., with an override of 800 MB/s and 4 intervals, each 250 ms bucket would * start with 200 MB of authorization, less whatever is still outstanding - * illustrative numbers, not defaults) * * there is a small race here if someone is actively processing mappings and * adding to rdmaq_bytes_out, but it should be small as the mappings are triggered * quite quickly after kgnilnd_auth_rdma_bytes gives us the go-ahead * - if this gives us problems in the future, we could use a read/write lock * to protect the resetting of these values */ new_ok -= atomic64_read(&dev->gnd_rdmaq_bytes_out); atomic64_set(&dev->gnd_rdmaq_bytes_ok, new_ok); CDEBUG(D_NET, "resetting rdmaq bytes to %ld, deadline +%lu -> %lu, " "current out %ld\n", atomic64_read(&dev->gnd_rdmaq_bytes_ok), dead_bump, dev->gnd_rdmaq_deadline, atomic64_read(&dev->gnd_rdmaq_bytes_out)); } spin_unlock(&dev->gnd_rdmaq_lock); } spin_lock(&dev->gnd_rdmaq_lock); while (!list_empty(&dev->gnd_rdmaq)) { int rc; /* make sure we break out early on quiesce */ if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) { /* always break with lock held - we unlock outside loop */ break; } tx = list_first_entry(&dev->gnd_rdmaq, kgn_tx_t, tx_list); kgnilnd_tx_del_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_ALLOCD); found_work++; /* sample with lock held, serializing with kgnilnd_complete_closed_conn */ if (tx->tx_conn->gnc_state != GNILND_CONN_ESTABLISHED) { /* if conn is dying, mark tx in tx_ref_table for * kgnilnd_complete_closed_conn to finish up */ kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_DYING, 1); /* tx was moved to DYING, get next */ continue; } spin_unlock(&dev->gnd_rdmaq_lock); rc = kgnilnd_auth_rdma_bytes(dev, tx); spin_lock(&dev->gnd_rdmaq_lock); if (rc < 0) { /* no ticket!
add back to head */ kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_RDMAQ, 0); /* clear found_work so scheduler threads wait for timer */ found_work = 0; break; } else { /* TX is GO for launch */ tx->tx_qtime = jiffies; kgnilnd_send_mapped_tx(tx, 0); found_work++; } } spin_unlock(&dev->gnd_rdmaq_lock); RETURN(found_work); } static inline void kgnilnd_swab_rdma_desc(kgn_rdma_desc_t *d) { __swab64s(&d->gnrd_key.qword1); __swab64s(&d->gnrd_key.qword2); __swab64s(&d->gnrd_addr); __swab32s(&d->gnrd_nob); } #define kgnilnd_match_reply_either(w, x, y, z) _kgnilnd_match_reply(w, x, y, z) #define kgnilnd_match_reply(x, y, z) _kgnilnd_match_reply(x, y, GNILND_MSG_NONE, z) kgn_tx_t * _kgnilnd_match_reply(kgn_conn_t *conn, int type1, int type2, __u64 cookie) { kgn_tx_ev_id_t ev_id; kgn_tx_t *tx; /* we use the cookie from the original TX, so we can find the match * by parsing that and using the txe_idx */ ev_id.txe_cookie = cookie; tx = conn->gnc_tx_ref_table[ev_id.txe_idx]; if (tx != NULL) { /* check tx to make sure kgni didn't eat it */ GNITX_ASSERTF(tx, tx->tx_msg.gnm_magic == GNILND_MSG_MAGIC, "came back from kgni with bad magic %x\n", tx->tx_msg.gnm_magic); GNITX_ASSERTF(tx, ((tx->tx_id.txe_idx == ev_id.txe_idx) && (tx->tx_id.txe_cookie == cookie)), "conn 0x%p->%s tx_ref_table hosed: wanted " "txe_cookie "LPX64" txe_idx %d " "found tx %p cookie "LPX64" txe_idx %d\n", conn, libcfs_nid2str(conn->gnc_peer->gnp_nid), cookie, ev_id.txe_idx, tx, tx->tx_id.txe_cookie, tx->tx_id.txe_idx); LASSERTF((((tx->tx_msg.gnm_type == type1) || (tx->tx_msg.gnm_type == type2)) && (tx->tx_state & GNILND_TX_WAITING_REPLY)), "Unexpected TX type (%x, %x or %x) " "or state (%x, expected +%x) " "matched reply from %s\n", tx->tx_msg.gnm_type, type1, type2, tx->tx_state, GNILND_TX_WAITING_REPLY, libcfs_nid2str(conn->gnc_peer->gnp_nid)); } else { CWARN("Unmatched reply %02x, or %02x/"LPX64" from %s\n", type1, type2, cookie, libcfs_nid2str(conn->gnc_peer->gnp_nid)); } return tx; } static inline void kgnilnd_complete_tx(kgn_tx_t *tx, int rc) { int complete = 0; kgn_conn_t *conn = tx->tx_conn; __u64 nob = tx->tx_nob; __u32 physnop = tx->tx_phys_npages; int id = tx->tx_id.txe_smsg_id; int buftype = tx->tx_buftype; gni_mem_handle_t hndl; hndl.qword1 = tx->tx_map_key.qword1; hndl.qword2 = tx->tx_map_key.qword2; spin_lock(&conn->gnc_list_lock); GNITX_ASSERTF(tx, tx->tx_state & GNILND_TX_WAITING_REPLY, "not waiting for reply", NULL); tx->tx_rc = rc; tx->tx_state &= ~GNILND_TX_WAITING_REPLY; if (rc == -EFAULT) { CDEBUG(D_NETERROR, "Error %d TX data: TX %p tx_id %x nob %16"LPF64"u physnop %8d buffertype %#8x MemHandle "LPX64"."LPX64"\n", rc, tx, id, nob, physnop, buftype, hndl.qword1, hndl.qword2); if (*kgnilnd_tunables.kgn_efault_lbug) { GNIDBG_TOMSG(D_NETERROR, &tx->tx_msg, "error %d on tx 0x%p->%s id %u/%d state %s age %ds", rc, tx, conn ?
				libcfs_nid2str(conn->gnc_peer->gnp_nid) : "",
				tx->tx_id.txe_smsg_id, tx->tx_id.txe_idx,
				kgnilnd_tx_state2str(tx->tx_list_state),
				cfs_duration_sec((unsigned long)jiffies - tx->tx_qtime));
			LBUG();
		}
	}

	if (!(tx->tx_state & GNILND_TX_WAITING_COMPLETION)) {
		kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD);
		/* sample under lock as follow on steps require gnc_list_lock
		 * - or call kgnilnd_tx_done which requires no locks held over
		 *   call to lnet_finalize */
		complete = 1;
	}
	spin_unlock(&conn->gnc_list_lock);

	if (complete) {
		kgnilnd_tx_done(tx, tx->tx_rc);
	}
}

static inline void
kgnilnd_finalize_rx_done(kgn_tx_t *tx, kgn_msg_t *msg)
{
	int rc;
	kgn_conn_t *conn = tx->tx_conn;

	atomic_inc(&conn->gnc_device->gnd_rdma_nrx);
	atomic64_add(tx->tx_nob, &conn->gnc_device->gnd_rdma_rxbytes);

	/* the gncm_retval is passed in for PUTs */
	rc = kgnilnd_verify_rdma_cksum(tx, msg->gnm_payload_cksum,
				       msg->gnm_u.completion.gncm_retval);

	kgnilnd_complete_tx(tx, rc);
}

void
kgnilnd_check_fma_rx(kgn_conn_t *conn)
{
	__u32 seq;
	kgn_tx_t *tx;
	kgn_rx_t *rx;
	kgn_msg_t *msg;
	void *prefix;
	gni_return_t rrc;
	kgn_peer_t *peer = conn->gnc_peer;
	kgn_net_t *net;
	int rc = 0;
	__u16 tmp_cksum = 0, msg_cksum = 0;
	int repost = 1, saw_complete;
	unsigned long timestamp, newest_last_rx, timeout;
	int last_seq;
	ENTRY;

	/* Short circuit if the ep_handle is null.
	 * It's likely that it's about to be closed as stale. */
	if (conn->gnc_ephandle == NULL)
		RETURN_EXIT;

	timestamp = jiffies;
	kgnilnd_gl_mutex_lock(&conn->gnc_device->gnd_cq_mutex);
	/* delay in jiffies - we are really concerned only with things that
	 * result in a schedule() or really holding this off for long times.
	 * NB - mutex_lock could spin for 2 jiffies before going to sleep to wait */
	conn->gnc_device->gnd_mutex_delay += (long) jiffies - timestamp;

	/* Resample current time as we have no idea how long it took to get the mutex */
	timestamp = jiffies;

	/* We check here how long it has been since we last received an rx;
	 * we do this before we call getnext in case the thread has been
	 * blocked for a while. If we haven't received an rx within our
	 * timeout value, we close the connection, as we should assume the
	 * other side has closed it. This will stop us from sending replies
	 * to a mailbox that is already in purgatory. */
	timeout = cfs_time_seconds(conn->gnc_timeout);
	newest_last_rx = GNILND_LASTRX(conn);

	/* Error injection to validate that the timestamp check works and closes the conn */
	if (CFS_FAIL_CHECK(CFS_FAIL_GNI_RECV_TIMEOUT)) {
		timestamp = timestamp + (GNILND_TIMEOUTRX(timeout) * 2);
	}

	if (time_after_eq(timestamp, newest_last_rx + (GNILND_TIMEOUTRX(timeout)))) {
		GNIDBG_CONN(D_NETERROR|D_CONSOLE, conn,
			"Can't receive from %s after timeout lapse of %lu; TO %lu",
			libcfs_nid2str(conn->gnc_peer->gnp_nid),
			cfs_duration_sec(timestamp - newest_last_rx),
			cfs_duration_sec(GNILND_TIMEOUTRX(timeout)));
		kgnilnd_gl_mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
		rc = -ETIME;
		kgnilnd_close_conn(conn, rc);
		RETURN_EXIT;
	}

	rrc = kgnilnd_smsg_getnext(conn->gnc_ephandle, &prefix);

	if (rrc == GNI_RC_NOT_DONE) {
		kgnilnd_gl_mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
		CDEBUG(D_INFO, "SMSG RX empty conn 0x%p\n", conn);
		RETURN_EXIT;
	}

	/* Instead of asserting when we get mailbox corruption let's attempt
	 * to close the conn and recover. We can put the conn/mailbox into
	 * purgatory and let purgatory deal with the problem.
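	 * (purgatory keeps the conn and its mailbox held out of reuse until
	 * it is safe to release them)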
	 * If we see this NETERROR reported on production systems in large
	 * amounts we will need to revisit the state machine to see if we can
	 * tighten it up further to improve data protection. */
	if (rrc == GNI_RC_INVALID_STATE) {
		kgnilnd_gl_mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
		GNIDBG_CONN(D_NETERROR | D_CONSOLE, conn, "Mailbox corruption "
			"detected, closing conn %p from peer %s\n", conn,
			libcfs_nid2str(conn->gnc_peer->gnp_nid));
		rc = -EIO;
		kgnilnd_close_conn(conn, rc);
		RETURN_EXIT;
	}

	LASSERTF(rrc == GNI_RC_SUCCESS,
		 "bad rc %d on conn %p from peer %s\n",
		 rrc, conn, libcfs_nid2str(peer->gnp_nid));

	msg = (kgn_msg_t *)prefix;

	rx = kgnilnd_alloc_rx();
	if (rx == NULL) {
		kgnilnd_gl_mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
		kgnilnd_release_msg(conn);
		GNIDBG_MSG(D_NETERROR, msg, "Dropping SMSG RX from 0x%p->%s, no RX memory",
			   conn, libcfs_nid2str(peer->gnp_nid));
		RETURN_EXIT;
	}

	GNIDBG_MSG(D_INFO, msg, "SMSG RX on %p", conn);

	timestamp = conn->gnc_last_rx;
	seq = last_seq = atomic_read(&conn->gnc_rx_seq);
	atomic_inc(&conn->gnc_rx_seq);

	conn->gnc_last_rx = jiffies;
	/* stash first rx so we can clear out purgatory */
	if (conn->gnc_first_rx == 0)
		conn->gnc_first_rx = jiffies;

	/* needs to linger to protect gnc_rx_seq like we do with gnc_tx_seq */
	kgnilnd_gl_mutex_unlock(&conn->gnc_device->gnd_cq_mutex);
	kgnilnd_peer_alive(conn->gnc_peer);

	rx->grx_msg = msg;
	rx->grx_conn = conn;
	rx->grx_eager = 0;
	rx->grx_received = current_kernel_time();

	if (CFS_FAIL_CHECK(CFS_FAIL_GNI_NET_LOOKUP)) {
		rc = -ENONET;
	} else {
		rc = kgnilnd_find_net(msg->gnm_srcnid, &net);
	}

	if (rc < 0) {
		GOTO(out, rc);
	} else {
		kgnilnd_net_decref(net);
	}

	if (*kgnilnd_tunables.kgn_checksum && !msg->gnm_cksum)
		GNIDBG_MSG(D_WARNING, msg, "no msg header checksum when enabled");

	/* XXX Nic: Do we need to swab the cksum? */
	if (msg->gnm_cksum != 0) {
		msg_cksum = msg->gnm_cksum;
		msg->gnm_cksum = 0;
		tmp_cksum = kgnilnd_cksum(msg, sizeof(kgn_msg_t));

		if (tmp_cksum != msg_cksum) {
			GNIDBG_MSG(D_NETERROR, msg, "Bad hdr checksum (%x expected %x)",
				   tmp_cksum, msg_cksum);
			kgnilnd_dump_msg(D_BUFFS, msg);
			rc = -ENOKEY;
			GOTO(out, rc);
		}
	}

	/* restore checksum for future debug messages */
	msg->gnm_cksum = tmp_cksum;

	if (msg->gnm_magic != GNILND_MSG_MAGIC) {
		if (__swab32(msg->gnm_magic) != GNILND_MSG_MAGIC) {
			GNIDBG_MSG(D_NETERROR, msg, "Unexpected magic %08x from %s",
				   msg->gnm_magic, libcfs_nid2str(peer->gnp_nid));
			rc = -EPROTO;
			GOTO(out, rc);
		}

		__swab32s(&msg->gnm_magic);
		__swab16s(&msg->gnm_version);
		__swab16s(&msg->gnm_type);
		__swab64s(&msg->gnm_srcnid);
		__swab64s(&msg->gnm_connstamp);
		__swab32s(&msg->gnm_seq);

		/* NB message type checked below; NOT here...
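		 * (gnm_type itself was byte-swapped just above, so it is safe
		 * to switch on it here to pick which union member to swab)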
		 */
		switch (msg->gnm_type) {
		case GNILND_MSG_GET_ACK_REV:
		case GNILND_MSG_PUT_ACK:
			kgnilnd_swab_rdma_desc(&msg->gnm_u.putack.gnpam_desc);
			break;
		case GNILND_MSG_PUT_REQ_REV:
		case GNILND_MSG_GET_REQ:
			kgnilnd_swab_rdma_desc(&msg->gnm_u.get.gngm_desc);
			break;
		default:
			break;
		}
	}

	if (msg->gnm_version != GNILND_MSG_VERSION) {
		GNIDBG_MSG(D_NETERROR, msg, "Unexpected protocol version %d from %s",
			   msg->gnm_version, libcfs_nid2str(peer->gnp_nid));
		rc = -EPROTO;
		GOTO(out, rc);
	}

	if (LNET_NIDADDR(msg->gnm_srcnid) != LNET_NIDADDR(peer->gnp_nid)) {
		GNIDBG_MSG(D_NETERROR, msg, "Unexpected peer %s from %s",
			   libcfs_nid2str(msg->gnm_srcnid),
			   libcfs_nid2str(peer->gnp_nid));
		rc = -EPROTO;
		GOTO(out, rc);
	}

	if (msg->gnm_connstamp != conn->gnc_peer_connstamp) {
		GNIDBG_MSG(D_NETERROR, msg, "Unexpected connstamp "LPX64"("LPX64
			   " expected) from %s",
			   msg->gnm_connstamp, conn->gnc_peer_connstamp,
			   libcfs_nid2str(peer->gnp_nid));
		rc = -EPROTO;
		GOTO(out, rc);
	}

	if (msg->gnm_seq != seq) {
		GNIDBG_MSG(D_NETERROR, msg, "Unexpected sequence number %d(%d expected) from %s",
			   msg->gnm_seq, seq, libcfs_nid2str(peer->gnp_nid));
		rc = -EPROTO;
		GOTO(out, rc);
	}

	atomic_inc(&conn->gnc_device->gnd_short_nrx);

	if (msg->gnm_type == GNILND_MSG_CLOSE) {
		CDEBUG(D_NETTRACE, "%s sent us CLOSE msg\n",
			      libcfs_nid2str(conn->gnc_peer->gnp_nid));
		write_lock(&kgnilnd_data.kgn_peer_conn_lock);
		conn->gnc_close_recvd = GNILND_CLOSE_RX;
		conn->gnc_peer_error = msg->gnm_u.completion.gncm_retval;

		/* double check state with lock held */
		if (conn->gnc_state == GNILND_CONN_ESTABLISHED) {
			/* only error if we are not already closing */
			if (conn->gnc_peer_error == -ETIMEDOUT) {
				unsigned long now = jiffies;
				CNETERR("peer 0x%p->%s closed connection 0x%p due to timeout. "
				       "Is node down? "
				       "RX %d @ %lus/%lus; TX %d @ %lus/%lus; "
				       "NOOP %lus/%lus/%lus; sched %lus/%lus/%lus ago\n",
				       conn->gnc_peer, libcfs_nid2str(conn->gnc_peer->gnp_nid),
				       conn, last_seq,
				       cfs_duration_sec(now - timestamp),
				       cfs_duration_sec(now - conn->gnc_last_rx_cq),
				       atomic_read(&conn->gnc_tx_seq),
				       cfs_duration_sec(now - conn->gnc_last_tx),
				       cfs_duration_sec(now - conn->gnc_last_tx_cq),
				       cfs_duration_sec(now - conn->gnc_last_noop_want),
				       cfs_duration_sec(now - conn->gnc_last_noop_sent),
				       cfs_duration_sec(now - conn->gnc_last_noop_cq),
				       cfs_duration_sec(now - conn->gnc_last_sched_ask),
				       cfs_duration_sec(now - conn->gnc_last_sched_do),
				       cfs_duration_sec(now - conn->gnc_device->gnd_sched_alive));
			}
			kgnilnd_close_conn_locked(conn, -ECONNRESET);
		}
		write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
		GOTO(out, rc);
	}

	if (conn->gnc_close_recvd) {
		GNIDBG_MSG(D_NETERROR, msg,
			   "Unexpected message %s(%d/%d) after CLOSE from %s",
			   kgnilnd_msgtype2str(msg->gnm_type),
			   msg->gnm_type, conn->gnc_close_recvd,
			   libcfs_nid2str(conn->gnc_peer->gnp_nid));
		rc = -EPROTO;
		GOTO(out, rc);
	}

	if (conn->gnc_state != GNILND_CONN_ESTABLISHED) {
		/* XXX Nic: log message received on bad connection state */
		GOTO(out, rc);
	}

	switch (msg->gnm_type) {
	case GNILND_MSG_NOOP:
		/* Nothing to do; just a keepalive */
		break;

	case GNILND_MSG_IMMEDIATE:
		/* only get SMSG payload for IMMEDIATE */
		atomic64_add(msg->gnm_payload_len, &conn->gnc_device->gnd_short_rxbytes);
		rc = lnet_parse(net->gnn_ni, &msg->gnm_u.immediate.gnim_hdr,
				msg->gnm_srcnid, rx, 0);
		repost = rc < 0;
		break;
	case GNILND_MSG_GET_REQ_REV:
	case GNILND_MSG_PUT_REQ:
		rc = lnet_parse(net->gnn_ni, &msg->gnm_u.putreq.gnprm_hdr,
				msg->gnm_srcnid, rx, 1);
		repost = rc < 0;
		break;
	case GNILND_MSG_GET_NAK_REV:
		tx = kgnilnd_match_reply_either(conn, GNILND_MSG_GET_REQ_REV,
						GNILND_MSG_GET_ACK_REV,
						msg->gnm_u.completion.gncm_cookie);
		if (tx == NULL)
			break;

		kgnilnd_complete_tx(tx, msg->gnm_u.completion.gncm_retval);
		break;
	case GNILND_MSG_PUT_NAK:
		tx = kgnilnd_match_reply_either(conn, GNILND_MSG_PUT_REQ,
						GNILND_MSG_PUT_ACK,
						msg->gnm_u.completion.gncm_cookie);
		if (tx == NULL)
			break;

		kgnilnd_complete_tx(tx, msg->gnm_u.completion.gncm_retval);
		break;
	case GNILND_MSG_PUT_ACK:
		tx = kgnilnd_match_reply(conn, GNILND_MSG_PUT_REQ,
					 msg->gnm_u.putack.gnpam_src_cookie);
		if (tx == NULL)
			break;

		/* store putack data for later: deferred rdma or re-try */
		tx->tx_putinfo = msg->gnm_u.putack;

		saw_complete = 0;
		spin_lock(&tx->tx_conn->gnc_list_lock);

		GNITX_ASSERTF(tx, tx->tx_state & GNILND_TX_WAITING_REPLY,
			"not waiting for reply", NULL);

		tx->tx_state &= ~GNILND_TX_WAITING_REPLY;

		if (likely(!(tx->tx_state & GNILND_TX_WAITING_COMPLETION))) {
			kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD);
			/* sample under lock as follow on steps require gnc_list_lock
			 * - or call kgnilnd_tx_done which requires no locks held over
			 *   call to lnet_finalize */
			saw_complete = 1;
		} else {
			/* cannot launch rdma if still waiting for fma-msg completion */
			CDEBUG(D_NET, "tx 0x%p type 0x%02x will need to "
				"wait for SMSG completion\n", tx, tx->tx_msg.gnm_type);
			tx->tx_state |= GNILND_TX_PENDING_RDMA;
		}
		spin_unlock(&tx->tx_conn->gnc_list_lock);

		if (saw_complete) {
			rc = kgnilnd_send_mapped_tx(tx, 0);
			if (rc < 0)
				kgnilnd_tx_done(tx, rc);
		}
		break;
	case GNILND_MSG_GET_ACK_REV:
		tx = kgnilnd_match_reply(conn, GNILND_MSG_GET_REQ_REV,
					 msg->gnm_u.putack.gnpam_src_cookie);
		if (tx == NULL)
			break;

		/* store putack data for later: deferred rdma or re-try */
		tx->tx_putinfo = msg->gnm_u.putack;
		saw_complete = 0;
		spin_lock(&tx->tx_conn->gnc_list_lock);

		GNITX_ASSERTF(tx, tx->tx_state & GNILND_TX_WAITING_REPLY,
			"not waiting for reply", NULL);

		tx->tx_state &= ~GNILND_TX_WAITING_REPLY;

		if (likely(!(tx->tx_state & GNILND_TX_WAITING_COMPLETION))) {
			kgnilnd_tx_del_state_locked(tx, NULL, conn, GNILND_TX_ALLOCD);
			/* sample under lock as follow on steps require gnc_list_lock
			 * - or call kgnilnd_tx_done which requires no locks held over
			 *   call to lnet_finalize */
			saw_complete = 1;
		} else {
			/* cannot launch rdma if still waiting for fma-msg completion */
			CDEBUG(D_NET, "tx 0x%p type 0x%02x will need to "
				"wait for SMSG completion\n", tx, tx->tx_msg.gnm_type);
			tx->tx_state |= GNILND_TX_PENDING_RDMA;
		}
		spin_unlock(&tx->tx_conn->gnc_list_lock);

		if (saw_complete) {
			rc = kgnilnd_send_mapped_tx(tx, 0);
			if (rc < 0)
				kgnilnd_tx_done(tx, rc);
		}
		break;
	case GNILND_MSG_PUT_DONE:
		tx = kgnilnd_match_reply(conn, GNILND_MSG_PUT_ACK,
					 msg->gnm_u.completion.gncm_cookie);
		if (tx == NULL)
			break;

		GNITX_ASSERTF(tx, tx->tx_buftype == GNILND_BUF_PHYS_MAPPED ||
			       tx->tx_buftype == GNILND_BUF_VIRT_MAPPED,
			       "bad tx buftype %d", tx->tx_buftype);

		kgnilnd_finalize_rx_done(tx, msg);
		break;
	case GNILND_MSG_PUT_REQ_REV:
	case GNILND_MSG_GET_REQ:
		rc = lnet_parse(net->gnn_ni, &msg->gnm_u.get.gngm_hdr,
				msg->gnm_srcnid, rx, 1);
		repost = rc < 0;
		break;
	case GNILND_MSG_GET_NAK:
		tx = kgnilnd_match_reply(conn, GNILND_MSG_GET_REQ,
					 msg->gnm_u.completion.gncm_cookie);
		if (tx == NULL)
			break;

		GNITX_ASSERTF(tx, tx->tx_buftype == GNILND_BUF_PHYS_MAPPED ||
			       tx->tx_buftype == GNILND_BUF_VIRT_MAPPED,
			       "bad tx buftype %d", tx->tx_buftype);

		kgnilnd_complete_tx(tx, msg->gnm_u.completion.gncm_retval);
		break;
	case GNILND_MSG_GET_DONE:
		tx = kgnilnd_match_reply(conn, GNILND_MSG_GET_REQ,
					 msg->gnm_u.completion.gncm_cookie);
		if (tx == NULL)
			break;

		GNITX_ASSERTF(tx, tx->tx_buftype == GNILND_BUF_PHYS_MAPPED ||
			       tx->tx_buftype == GNILND_BUF_VIRT_MAPPED,
			       "bad tx buftype %d", tx->tx_buftype);

		lnet_set_reply_msg_len(net->gnn_ni, tx->tx_lntmsg[1],
				       msg->gnm_u.completion.gncm_retval);
		kgnilnd_finalize_rx_done(tx, msg);
		break;
	case GNILND_MSG_GET_DONE_REV:
		tx = kgnilnd_match_reply(conn, GNILND_MSG_GET_ACK_REV,
					 msg->gnm_u.completion.gncm_cookie);
		if (tx == NULL)
			break;

		GNITX_ASSERTF(tx, tx->tx_buftype == GNILND_BUF_PHYS_MAPPED ||
				tx->tx_buftype == GNILND_BUF_VIRT_MAPPED,
				"bad tx buftype %d", tx->tx_buftype);

		kgnilnd_finalize_rx_done(tx, msg);
		break;
	case GNILND_MSG_PUT_DONE_REV:
		tx = kgnilnd_match_reply(conn, GNILND_MSG_PUT_REQ_REV,
					 msg->gnm_u.completion.gncm_cookie);
		if (tx == NULL)
			break;

		GNITX_ASSERTF(tx, tx->tx_buftype == GNILND_BUF_PHYS_MAPPED ||
				tx->tx_buftype == GNILND_BUF_VIRT_MAPPED,
				"bad tx buftype %d", tx->tx_buftype);

		kgnilnd_finalize_rx_done(tx, msg);
		break;
	case GNILND_MSG_PUT_NAK_REV:
		tx = kgnilnd_match_reply(conn, GNILND_MSG_PUT_REQ_REV,
					 msg->gnm_u.completion.gncm_cookie);
		if (tx == NULL)
			break;

		GNITX_ASSERTF(tx, tx->tx_buftype == GNILND_BUF_PHYS_MAPPED ||
				tx->tx_buftype == GNILND_BUF_VIRT_MAPPED,
				"bad tx buftype %d", tx->tx_buftype);

		kgnilnd_complete_tx(tx, msg->gnm_u.completion.gncm_retval);
		break;
	}

out:
	if (rc < 0)	/* protocol/comms error */
		kgnilnd_close_conn(conn, rc);

	if (repost && rx != NULL) {
		kgnilnd_consume_rx(rx);
	}

	/* we got an event so assume more there and call for reschedule */
	if (rc >= 0)
		kgnilnd_schedule_conn(conn);

	EXIT;
}

/* Perform the failure injections that we need to affect conn processing in
 * the following function.
 * When writing tests that use this function, make sure to use a fail_loc
 * with a fail mask. If you don't, you can cause the scheduler threads to
 * spin on the conn without it leaving process_conns.
 *
 * intent is used to signal the calling function whether or not the conn
 * needs to be rescheduled. */
static inline int
kgnilnd_check_conn_fail_loc(kgn_device_t *dev, kgn_conn_t *conn, int *intent)
{
	int rc = 0;

	/* short circuit out when not set */
	if (likely(!cfs_fail_loc)) {
		RETURN(rc);
	}

	/* failure injection to test for stack reset clean ups */
	if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DROP_CLOSING)) {
		/* we can't rely on busy loops being nice enough to get the
		 * stack reset triggered - it'd just spin on this conn */
		CFS_RACE(CFS_FAIL_GNI_DROP_CLOSING);
		rc = 1;
		*intent = 1;
		GOTO(did_fail_loc, rc);
	}

	if (conn->gnc_state == GNILND_CONN_DESTROY_EP) {
		/* DESTROY_EP set in kgnilnd_conn_decref on gnc_refcount = 1 */

		if (CFS_FAIL_CHECK(CFS_FAIL_GNI_DROP_DESTROY_EP)) {
			CFS_RACE(CFS_FAIL_GNI_DROP_DESTROY_EP);
			rc = 1;
			*intent = 1;
			GOTO(did_fail_loc, rc);
		}
	}

	/* CFS_FAIL_GNI_FINISH_PURG2 is used to stop a connection from fully
	 * closing. This scheduler will spin on the CFS_FAIL_TIMEOUT until the
	 * fail_loc is cleared, at which time the connection will be closed by
	 * kgnilnd_complete_closed_conn.
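	 * (the busy-wait below is intentional: it parks this scheduler thread
	 * on the conn for as long as the test fail_loc stays armed)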
	 */
	if ((conn->gnc_state == GNILND_CONN_CLOSED) &&
	    CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG2)) {
		while (CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_FINISH_PURG2, 1)) {}
		rc = 1;
		*intent = 1;
		GOTO(did_fail_loc, rc);
	}

	/* this one is a bit gross - we can't hold the mutex from process_conns
	 * across a CFS_RACE here - it'd block the conn threads from doing an
	 * ep_bind and moving on to finish_connect
	 * so, we'll just set the rc - kgnilnd_process_conns will clear
	 * found_work on a fail_loc, getting the scheduler thread to call
	 * schedule() and effectively getting this thread to sleep */
	if ((conn->gnc_state == GNILND_CONN_CLOSED) &&
	    CFS_FAIL_CHECK(CFS_FAIL_GNI_FINISH_PURG)) {
		rc = 1;
		*intent = 1;
		GOTO(did_fail_loc, rc);
	}

did_fail_loc:
	RETURN(rc);
}

static inline void
kgnilnd_send_conn_close(kgn_conn_t *conn)
{
	kgn_tx_t *tx;

	/* we are closing the conn - we will try to send the CLOSE msg
	 * but will not wait for anything else to flush */

	/* send the close if not already done so or received one */
	if (!conn->gnc_close_sent && !conn->gnc_close_recvd) {
		/* set close_sent regardless of the success of the
		 * CLOSE message. We are going to try once and then
		 * kick him out of the sandbox */
		conn->gnc_close_sent = 1;
		mb();

		/* EP might be null already if the remote side initiated a new
		 * connection. kgnilnd_finish_connect destroys existing
		 * ep_handles before wiring up the new connection, so this
		 * check is here to make sure we don't attempt to send with a
		 * null ep_handle. */
		if (conn->gnc_ephandle != NULL) {
			int rc = 0;

			tx = kgnilnd_new_tx_msg(GNILND_MSG_CLOSE,
						conn->gnc_peer->gnp_net->gnn_ni->ni_nid);
			if (tx != NULL) {
				tx->tx_msg.gnm_u.completion.gncm_retval = conn->gnc_error;
				tx->tx_state = GNILND_TX_WAITING_COMPLETION;
				tx->tx_qtime = jiffies;

				if (tx->tx_id.txe_idx == 0) {
					rc = kgnilnd_set_tx_id(tx, conn);
					if (rc != 0) {
						kgnilnd_tx_done(tx, rc);
					}
				}

				CDEBUG(D_NETTRACE, "sending close with errno %d\n",
						conn->gnc_error);

				if (CFS_FAIL_CHECK(CFS_FAIL_GNI_CLOSE_SEND)) {
					kgnilnd_tx_done(tx, -EAGAIN);
				} else if (!rc) {
					rc = kgnilnd_sendmsg(tx, NULL, 0, NULL, GNILND_TX_FMAQ);
					if (rc) {
						/* It wasn't sent and we don't care.
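						 * (teardown proceeds regardless - the
						 * peer will notice via its own timeout
						 * if the CLOSE never arrives)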
						 */
						kgnilnd_tx_done(tx, rc);
					}
				}
			}
		}
	}

	/* When changing gnc_state we need to take the kgn_peer_conn_lock */
	write_lock(&kgnilnd_data.kgn_peer_conn_lock);
	conn->gnc_state = GNILND_CONN_CLOSED;
	write_unlock(&kgnilnd_data.kgn_peer_conn_lock);
	/* mark this conn as CLOSED now that we processed it
	 * do after TX, so we can use CLOSING in asserts */
	mb();

	if (CFS_FAIL_CHECK(CFS_FAIL_GNI_RX_CLOSE_CLOSED)) {
		/* simulate a RX CLOSE after the timeout but before
		 * the scheduler thread gets it */
		conn->gnc_close_recvd = GNILND_CLOSE_INJECT2;
		conn->gnc_peer_error = -ETIMEDOUT;
	}
	/* schedule to allow potential CLOSE and get the complete phase run */
	kgnilnd_schedule_conn(conn);
}

int
kgnilnd_process_mapped_tx(kgn_device_t *dev)
{
	int found_work = 0;
	int rc = 0;
	kgn_tx_t *tx;
	int fast_remaps = GNILND_FAST_MAPPING_TRY;
	int log_retrans, log_retrans_level;
	static int last_map_version;
	ENTRY;

	spin_lock(&dev->gnd_lock);
	if (list_empty(&dev->gnd_map_tx)) {
		/* if the list is empty make sure we don't have a timer running */
		del_singleshot_timer_sync(&dev->gnd_map_timer);
		spin_unlock(&dev->gnd_lock);
		RETURN(0);
	}

	dev->gnd_sched_alive = jiffies;

	/* we'll retry as fast as possible up to 25% of the limit, then we start
	 * backing off until our map version changes - indicating we unmapped
	 * something */
	tx = list_first_entry(&dev->gnd_map_tx, kgn_tx_t, tx_list);
	if (likely(dev->gnd_map_attempt == 0) ||
		time_after_eq(jiffies, dev->gnd_next_map) ||
		last_map_version != dev->gnd_map_version) {

		/* if this is our first attempt at mapping set last mapped to current
		 * jiffies so we can timeout our attempt correctly. */
		if (dev->gnd_map_attempt == 0)
			dev->gnd_last_map = jiffies;
	} else {
		GNIDBG_TX(D_NET, tx, "waiting for mapping event to retry", NULL);
		spin_unlock(&dev->gnd_lock);
		RETURN(0);
	}

	/* delete the previous timer if it exists */
	del_singleshot_timer_sync(&dev->gnd_map_timer);
	/* stash the last map version to let us know when a good one was seen */
	last_map_version = dev->gnd_map_version;

	/* we need to take the lock and continually refresh the head of the list as
	 * kgnilnd_complete_closed_conn might be nuking stuff and we are cycling
	 * the lock, allowing them to squeeze in */
	while (!list_empty(&dev->gnd_map_tx)) {
		/* make sure we break out early on quiesce */
		if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
			/* always break with lock held - we unlock outside loop */
			break;
		}

		tx = list_first_entry(&dev->gnd_map_tx, kgn_tx_t, tx_list);

		kgnilnd_tx_del_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_ALLOCD);
		found_work++;

		/* sample with lock held, serializing with kgnilnd_complete_closed_conn */
		if (tx->tx_conn->gnc_state != GNILND_CONN_ESTABLISHED) {
			/* if conn is dying, mark tx in tx_ref_table for
			 * kgnilnd_complete_closed_conn to finish up */
			kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_DYING, 1);
			found_work++;

			/* tx was moved to DYING, get next */
			continue;
		}

		spin_unlock(&dev->gnd_lock);
		rc = kgnilnd_send_mapped_tx(tx, 1);

		/* We made it! skip error handling..
		 */
		if (rc >= 0) {
			/* OK to continue on +ve errors as it won't get seen until
			 * this function is called again - we operate on a copy of the
			 * original list and not the live list */
			spin_lock(&dev->gnd_lock);
			/* reset map attempts back to zero - we successfully
			 * mapped, so we can reset our timers */
			dev->gnd_map_attempt = 0;
			continue;
		} else if (rc == -EAGAIN) {
			spin_lock(&dev->gnd_lock);
			mod_timer(&dev->gnd_map_timer, dev->gnd_next_map);
			spin_unlock(&dev->gnd_lock);
			GOTO(get_out_mapped, rc);
		} else if (rc != -ENOMEM) {
			/* carp, failure we can't handle */

			kgnilnd_tx_done(tx, rc);
			spin_lock(&dev->gnd_lock);
			/* reset map attempts back to zero - we don't know what
			 * happened, but it wasn't a failed mapping */
			dev->gnd_map_attempt = 0;
			continue;
		}

		/* time to handle the retry cases.. lock so we don't have 2 threads
		 * mucking with gnd_map_attempt, or gnd_next_map at the same time */
		spin_lock(&dev->gnd_lock);
		dev->gnd_map_attempt++;
		if (dev->gnd_map_attempt < fast_remaps) {
			/* do nothing - we just want it to go as fast as possible.
			 * just set gnd_next_map to current jiffies so it will process
			 * as fast as possible. */
			dev->gnd_next_map = jiffies;
		} else {
			/* Retry based on GNILND_MAP_RETRY_RATE */
			dev->gnd_next_map = jiffies + GNILND_MAP_RETRY_RATE;
		}

		/* only log occasionally once we've retried fast_remaps */
		log_retrans = (dev->gnd_map_attempt >= fast_remaps) &&
			      ((dev->gnd_map_attempt % fast_remaps) == 0);
		log_retrans_level = log_retrans ? D_NETERROR : D_NET;

		/* make sure we are not off in the weeds with this tx */
		if (time_after(jiffies, dev->gnd_last_map + GNILND_MAP_TIMEOUT)) {
			GNIDBG_TX(D_NETERROR, tx,
				   "giving up on TX, too many retries", NULL);
			spin_unlock(&dev->gnd_lock);
			if (tx->tx_msg.gnm_type == GNILND_MSG_PUT_REQ ||
			    tx->tx_msg.gnm_type == GNILND_MSG_GET_REQ_REV) {
				kgnilnd_nak_rdma(tx->tx_conn, tx->tx_msg.gnm_type,
						-ENOMEM,
						tx->tx_putinfo.gnpam_dst_cookie,
						tx->tx_msg.gnm_srcnid);
			} else {
				kgnilnd_nak_rdma(tx->tx_conn, tx->tx_msg.gnm_type,
						-ENOMEM,
						tx->tx_getinfo.gngm_cookie,
						tx->tx_msg.gnm_srcnid);
			}
			kgnilnd_tx_done(tx, -ENOMEM);
			GOTO(get_out_mapped, rc);
		} else {
			GNIDBG_TX(log_retrans_level, tx,
				"transient map failure #%d %d pages/%d bytes phys %u@%u "
				"virt %u@"LPU64" "
				"nq_map %d mdd# %d/%d GART %ld",
				dev->gnd_map_attempt, tx->tx_phys_npages, tx->tx_nob,
				dev->gnd_map_nphys, dev->gnd_map_physnop * PAGE_SIZE,
				dev->gnd_map_nvirt, dev->gnd_map_virtnob,
				atomic_read(&dev->gnd_nq_map),
				atomic_read(&dev->gnd_n_mdd), atomic_read(&dev->gnd_n_mdd_held),
				atomic64_read(&dev->gnd_nbytes_map));
		}

		/* we need to stop processing the rest of the list, so add it back in */
		/* set timer to wake device when we need to schedule this tx */
		mod_timer(&dev->gnd_map_timer, dev->gnd_next_map);
		kgnilnd_tx_add_state_locked(tx, NULL, tx->tx_conn, GNILND_TX_MAPQ, 0);
		spin_unlock(&dev->gnd_lock);
		GOTO(get_out_mapped, rc);
	}
	spin_unlock(&dev->gnd_lock);
get_out_mapped:
	RETURN(found_work);
}

int
kgnilnd_process_conns(kgn_device_t *dev, unsigned long deadline)
{
	int found_work = 0;
	int conn_sched;
	int intent = 0;
	int error_inject = 0;
	int rc = 0;
	kgn_conn_t *conn;

	spin_lock(&dev->gnd_lock);
	while (!list_empty(&dev->gnd_ready_conns) && time_before(jiffies, deadline)) {
		dev->gnd_sched_alive = jiffies;
		error_inject = 0;
		rc = 0;

		if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
			/* break with lock held */
			break;
		}

		conn = list_first_entry(&dev->gnd_ready_conns, kgn_conn_t, gnc_schedlist);
		list_del_init(&conn->gnc_schedlist);
		spin_unlock(&dev->gnd_lock);

		conn_sched = xchg(&conn->gnc_scheduled, GNILND_CONN_PROCESS);

		LASSERTF(conn_sched
			 != GNILND_CONN_IDLE &&
			 conn_sched != GNILND_CONN_PROCESS,
			 "conn %p on ready list but in bad state: %d\n",
			 conn, conn_sched);

		CDEBUG(D_INFO, "conn %p@%s for processing\n",
			conn, kgnilnd_conn_state2str(conn));

		found_work++;
		set_mb(conn->gnc_last_sched_do, jiffies);

		if (kgnilnd_check_conn_fail_loc(dev, conn, &intent)) {
			/* based on intent see if we should run again. */
			rc = kgnilnd_schedule_process_conn(conn, intent);
			error_inject = 1;
			/* drop ref from gnd_ready_conns */
			if (atomic_read(&conn->gnc_refcount) == 1 && rc != 1) {
				down_write(&dev->gnd_conn_sem);
				kgnilnd_conn_decref(conn);
				up_write(&dev->gnd_conn_sem);
			} else if (rc != 1) {
				kgnilnd_conn_decref(conn);
			}
			/* clear this so that scheduler thread doesn't spin */
			found_work = 0;
			/* break with lock held... */
			spin_lock(&dev->gnd_lock);
			break;
		}

		if (unlikely(conn->gnc_state == GNILND_CONN_CLOSED)) {
			down_write(&dev->gnd_conn_sem);

			/* CONN_CLOSED set in process_fmaq when CLOSE is sent */
			if (unlikely(atomic_read(&conn->gnc_tx_in_use))) {
				/* If there are tx's currently in use in another
				 * thread we don't want to complete the close
				 * yet. Cycle this conn back through
				 * the scheduler. */
				kgnilnd_schedule_conn(conn);
			} else {
				kgnilnd_complete_closed_conn(conn);
			}
			up_write(&dev->gnd_conn_sem);
		} else if (unlikely(conn->gnc_state == GNILND_CONN_DESTROY_EP)) {
			/* DESTROY_EP set in kgnilnd_conn_decref on gnc_refcount = 1 */
			/* serialize SMSG CQs with ep_bind and smsg_release */
			down_write(&dev->gnd_conn_sem);
			kgnilnd_destroy_conn_ep(conn);
			up_write(&dev->gnd_conn_sem);
		} else if (unlikely(conn->gnc_state == GNILND_CONN_CLOSING)) {
			/* if we need to do some CLOSE sending, etc., do it here */
			down_write(&dev->gnd_conn_sem);
			kgnilnd_send_conn_close(conn);
			kgnilnd_check_fma_rx(conn);
			up_write(&dev->gnd_conn_sem);
		} else if (atomic_read(&conn->gnc_peer->gnp_dirty_eps) == 0) {
			/* start moving traffic if the old conns are cleared out */
			down_read(&dev->gnd_conn_sem);
			kgnilnd_check_fma_rx(conn);
			kgnilnd_process_fmaq(conn);
			up_read(&dev->gnd_conn_sem);
		}

		rc = kgnilnd_schedule_process_conn(conn, 0);

		/* drop ref from gnd_ready_conns */
		if (atomic_read(&conn->gnc_refcount) == 1 && rc != 1) {
			down_write(&dev->gnd_conn_sem);
			kgnilnd_conn_decref(conn);
			up_write(&dev->gnd_conn_sem);
		} else if (rc != 1) {
			kgnilnd_conn_decref(conn);
		}

		/* check list again with lock held */
		spin_lock(&dev->gnd_lock);
	}

	/* If we are short circuiting due to timing we want to be scheduled
	 * as soon as possible.
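	 * (a nonzero return keeps the caller, kgnilnd_scheduler, in its busy
	 * loop rather than letting it go to sleep on the waitq)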
	 */
	if (!list_empty(&dev->gnd_ready_conns) && !error_inject)
		found_work++;

	spin_unlock(&dev->gnd_lock);

	RETURN(found_work);
}

int
kgnilnd_scheduler(void *arg)
{
	int threadno = (long)arg;
	kgn_device_t *dev;
	int busy_loops = 0;
	unsigned long deadline = 0;
	DEFINE_WAIT(wait);

	dev = &kgnilnd_data.kgn_devices[(threadno + 1) % kgnilnd_data.kgn_ndevs];

	cfs_block_allsigs();

	/* all gnilnd threads need to run fairly urgently */
	set_user_nice(current, *kgnilnd_tunables.kgn_sched_nice);
	deadline = jiffies + cfs_time_seconds(*kgnilnd_tunables.kgn_sched_timeout);
	while (!kgnilnd_data.kgn_shutdown) {
		int found_work = 0;
		/* Safe: kgn_shutdown only set when quiescent */

		/* to quiesce or to not quiesce, that is the question */
		if (unlikely(kgnilnd_data.kgn_quiesce_trigger)) {
			KGNILND_SPIN_QUIESCE;
		}

		/* tracking for when thread goes AWOL */
		dev->gnd_sched_alive = jiffies;

		CFS_FAIL_TIMEOUT(CFS_FAIL_GNI_SCHED_DEADLINE,
			(*kgnilnd_tunables.kgn_sched_timeout + 1));

		/* let folks know we are up and kicking
		 * - they can use this for latency savings, etc.
		 * - only change if IRQ; if IDLE, leave alone as the
		 *   schedule_device calls will put us back to IRQ */
		(void)cmpxchg(&dev->gnd_ready, GNILND_DEV_IRQ, GNILND_DEV_LOOP);

		down_read(&dev->gnd_conn_sem);

		/* always check these - they are super low cost */
		found_work += kgnilnd_check_fma_send_cq(dev);
		found_work += kgnilnd_check_fma_rcv_cq(dev);

		/* rdma CQ doesn't care about eps */
		found_work += kgnilnd_check_rdma_cq(dev);

		/* move some RDMA ? */
		found_work += kgnilnd_process_rdmaq(dev);

		/* map some pending RDMA requests ? */
		found_work += kgnilnd_process_mapped_tx(dev);

		/* the EP for a conn is not destroyed until all the references
		 * to it are gone, so these checks should be safe
		 * even if run in parallel with the CQ checking functions
		 * _AND_ a thread that processes the CLOSED->DONE
		 * transition
		 * ...should.... */

		up_read(&dev->gnd_conn_sem);

		/* process all conns ready now */
		found_work += kgnilnd_process_conns(dev, deadline);

		/* do an eager check to avoid the IRQ disabling in
		 * prepare_to_wait and friends */
		if (found_work &&
		    (busy_loops++ < *kgnilnd_tunables.kgn_loops) &&
		    time_before(jiffies, deadline)) {
			found_work = 0;
			if ((busy_loops % 10) == 0) {
				/* tickle heartbeat and watchdog to ensure our
				 * piggishness doesn't turn into heartbeat failure */
				touch_nmi_watchdog();
				kgnilnd_hw_hb();
			}
			continue;
		}

		/* if we got here, either found_work was zero or busy_loops says we
		 * need to take a break. We'll clear gnd_ready but we'll check
		 * one last time if there is an IRQ that needs processing */
		prepare_to_wait(&dev->gnd_waitq, &wait, TASK_INTERRUPTIBLE);

		/* the first time this will go LOOP -> IDLE and let us do one final
		 * check during which we might get an IRQ, then IDLE->IDLE and
		 * schedule()
		 * - this might allow other threads to block us for a bit if they
		 *   try to get the mutex, but that is good as we'd need to wake
		 *   up soon to handle the CQ or other processing anyways */
		found_work += xchg(&dev->gnd_ready, GNILND_DEV_IDLE);

		if ((busy_loops >= *kgnilnd_tunables.kgn_loops) ||
		    time_after_eq(jiffies, deadline)) {
			CDEBUG(D_INFO,
			       "yielding: found_work %d busy_loops %d\n",
			       found_work, busy_loops);
			busy_loops = 0;
			/* use yield if we are bailing due to busy_loops
			 * - this will ensure we wake up soonish. This closes
			 * a race with kgnilnd_device_callback - where it'd
			 * not call wake_up() because gnd_ready == 1, but then
			 * we come down and schedule() because of busy_loops.
			 * We'd not be woken up until something poked our waitq
			 * again.
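			 * schedule() in TASK_INTERRUPTIBLE would sleep until
			 * the next explicit wake_up, while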
			 * yield() ensures we wake up without another waitq
			 * poke in that case */
			atomic_inc(&dev->gnd_n_yield);
			kgnilnd_data.kgn_last_condresched = jiffies;
			yield();
			CDEBUG(D_INFO, "awake after yield\n");
			deadline = jiffies + cfs_time_seconds(
					*kgnilnd_tunables.kgn_sched_timeout);
		} else if (found_work == GNILND_DEV_IDLE) {
			/* busy_loops is low and there is nothing to do,
			 * go to sleep and wait for a waitq poke */
			CDEBUG(D_INFO,
			       "scheduling: found_work %d busy_loops %d\n",
			       found_work, busy_loops);
			atomic_inc(&dev->gnd_n_schedule);
			kgnilnd_data.kgn_last_scheduled = jiffies;
			schedule();
			CDEBUG(D_INFO, "awake after schedule\n");
			deadline = jiffies + cfs_time_seconds(
					*kgnilnd_tunables.kgn_sched_timeout);
		}
		finish_wait(&dev->gnd_waitq, &wait);
	}

	kgnilnd_thread_fini();
	return 0;
}
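/* A minimal, illustrative sketch of the per-interval RDMA byte accounting
 * ("token bucket") that kgnilnd_process_rdmaq() resets above. It is not
 * built into the module (#if 0), and EXAMPLE_HZ plus the parameters are
 * hypothetical stand-ins for cfs_time_seconds(1) and the
 * kgn_rdmaq_override/kgn_rdmaq_intervals tunables; it only demonstrates
 * the arithmetic, not the locking or the timer handling. */
#if 0
#define EXAMPLE_HZ	250	/* stand-in for cfs_time_seconds(1) */

static long
example_rdmaq_refill(long override, int intervals, long bytes_out,
		     unsigned long *deadline, unsigned long now)
{
	/* each interval gets an even share of the per-second override,
	 * or an effectively unlimited budget when no override is set */
	long new_ok = (override && intervals != 0) ?
		      override / intervals : ~0UL >> 1;

	/* roll the bucket deadline forward by one interval */
	*deadline = now + EXAMPLE_HZ / intervals;

	/* carry the outstanding commitment forward so bytes still in
	 * flight from the last interval can't push us over this one */
	return new_ok - bytes_out;
}
#endif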