X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lnet%2Fklnds%2Fralnd%2Fralnd.c;h=0da7af4faff1abd17385da2aba6937069fe7ec14;hp=35f436e444996541f5978c7be803052e1a84453e;hb=6815097cbb06aa1a727e6bf7a8ee9e916a33ee6d;hpb=64cd6738edccfefb928825112da62b2a44db284e;ds=sidebyside diff --git a/lnet/klnds/ralnd/ralnd.c b/lnet/klnds/ralnd/ralnd.c index 35f436e..0da7af4 100644 --- a/lnet/klnds/ralnd/ralnd.c +++ b/lnet/klnds/ralnd/ralnd.c @@ -22,6 +22,8 @@ */ #include "ranal.h" +static int kranal_devids[] = {RAPK_MAIN_DEVICE_ID, + RAPK_EXPANSION_DEVICE_ID}; nal_t kranal_api; ptl_handle_ni_t kranal_ni; @@ -533,15 +535,32 @@ int kranal_set_conn_params(kra_conn_t *conn, kra_connreq_t *connreq, __u32 peer_ip, int peer_port) { - RAP_RETURN rrc; + kra_device_t *dev = conn->rac_device; + unsigned long flags; + RAP_RETURN rrc; + + /* CAVEAT EMPTOR: we're really overloading rac_last_tx + rac_keepalive + * to do RapkCompleteSync() timekeeping (see kranal_scheduler). */ + conn->rac_last_tx = jiffies; + conn->rac_keepalive = 0; + + /* Schedule conn on rad_new_conns */ + kranal_conn_addref(conn); + spin_lock_irqsave(&dev->rad_lock, flags); + list_add_tail(&conn->rac_schedlist, &dev->rad_new_conns); + wake_up(&dev->rad_waitq); + spin_unlock_irqrestore(&dev->rad_lock, flags); rrc = RapkSetRiParams(conn->rac_rihandle, &connreq->racr_riparams); if (rrc != RAP_SUCCESS) { CERROR("Error setting riparams from %u.%u.%u.%u/%d: %d\n", HIPQUAD(peer_ip), peer_port, rrc); - return -EPROTO; + return -ECONNABORTED; } + /* Scheduler doesn't touch conn other than to deschedule and decref it + * after RapkCompleteSync() returns success, so conn is all mine */ + conn->rac_peerstamp = connreq->racr_peerstamp; conn->rac_peer_connstamp = connreq->racr_connstamp; conn->rac_keepalive = RANAL_TIMEOUT2KEEPALIVE(connreq->racr_timeout); @@ -894,6 +913,9 @@ kranal_conn_handshake (struct socket *sock, kra_peer_t *peer) if (nstale != 0) CWARN("Closed %d stale conns to "LPX64"\n", nstale, 
peer_nid); + CDEBUG(D_WARNING, "New connection to "LPX64" on devid[%d] = %d\n", + peer_nid, conn->rac_device->rad_idx, conn->rac_device->rad_id); + /* Ensure conn gets checked. Transmits may have been queued and an * FMA event may have happened before it got in the cq hash table */ kranal_schedule_conn(conn); @@ -1823,13 +1845,19 @@ kranal_api_shutdown (nal_t *nal) break; } + /* Conn/Peer state all cleaned up BEFORE setting shutdown, so threads + * don't have to worry about shutdown races */ + LASSERT (atomic_read(&kranal_data.kra_nconns) == 0); + LASSERT (atomic_read(&kranal_data.kra_npeers) == 0); + /* flag threads to terminate; wake and wait for them to die */ kranal_data.kra_shutdown = 1; for (i = 0; i < kranal_data.kra_ndevs; i++) { kra_device_t *dev = &kranal_data.kra_devices[i]; - LASSERT (list_empty(&dev->rad_connq)); + LASSERT (list_empty(&dev->rad_ready_conns)); + LASSERT (list_empty(&dev->rad_new_conns)); spin_lock_irqsave(&dev->rad_lock, flags); wake_up(&dev->rad_waitq); @@ -1893,8 +1921,6 @@ kranal_api_startup (nal_t *nal, ptl_pid_t requested_pid, ptl_ni_limits_t *requested_limits, ptl_ni_limits_t *actual_limits) { - static int device_ids[] = {RAPK_MAIN_DEVICE_ID, - RAPK_EXPANSION_DEVICE_ID}; struct timeval tv; ptl_process_id_t process_id; int pkmem = atomic_read(&portal_kmemory); @@ -1935,7 +1961,8 @@ kranal_api_startup (nal_t *nal, ptl_pid_t requested_pid, kra_device_t *dev = &kranal_data.kra_devices[i]; dev->rad_idx = i; - INIT_LIST_HEAD(&dev->rad_connq); + INIT_LIST_HEAD(&dev->rad_ready_conns); + INIT_LIST_HEAD(&dev->rad_new_conns); init_waitqueue_head(&dev->rad_waitq); spin_lock_init(&dev->rad_lock); } @@ -2012,14 +2039,25 @@ kranal_api_startup (nal_t *nal, ptl_pid_t requested_pid, } } - LASSERT(kranal_data.kra_ndevs == 0); - for (i = 0; i < sizeof(device_ids)/sizeof(device_ids[0]); i++) { + LASSERT (kranal_data.kra_ndevs == 0); + + for (i = 0; i < sizeof(kranal_devids)/sizeof(kranal_devids[0]); i++) { + LASSERT (i < RANAL_MAXDEVS); + dev = 
&kranal_data.kra_devices[kranal_data.kra_ndevs]; - rc = kranal_device_init(device_ids[i], dev); + rc = kranal_device_init(kranal_devids[i], dev); if (rc == 0) kranal_data.kra_ndevs++; - + } + + if (kranal_data.kra_ndevs == 0) { + CERROR("Can't initialise any RapidArray devices\n"); + goto failed; + } + + for (i = 0; i < kranal_data.kra_ndevs; i++) { + dev = &kranal_data.kra_devices[i]; rc = kranal_thread_start(kranal_scheduler, dev); if (rc != 0) { CERROR("Can't spawn ranal scheduler[%d]: %d\n", @@ -2028,9 +2066,6 @@ kranal_api_startup (nal_t *nal, ptl_pid_t requested_pid, } } - if (kranal_data.kra_ndevs == 0) - goto failed; - rc = libcfs_nal_cmd_register(RANAL, &kranal_cmd, NULL); if (rc != 0) { CERROR("Can't initialise command interface (rc = %d)\n", rc);