/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (C) 2006 Myricom, Inc.
 *
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * lnet/klnds/mxlnd/mxlnd.c
 *
 * Author: Eric Barton <eric@bartonsoftware.com>
 * Author: Scott Atchley <atchley at myri.com>
 */
#include "mxlnd.h"

lnd_t the_kmxlnd = {
        .lnd_type       = MXLND,
        .lnd_startup    = mxlnd_startup,
        .lnd_shutdown   = mxlnd_shutdown,
        .lnd_send       = mxlnd_send,
        .lnd_recv       = mxlnd_recv,
};
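/*
 * Module-wide state. kmxlnd_data is zeroed in mxlnd_startup() and torn
 * down in mxlnd_shutdown(); only one instance of the LND may run at a
 * time (mxlnd_startup() enforces this via kmx_init).
 */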
kmx_data_t kmxlnd_data;
static void
mxlnd_free_pages(kmx_pages_t *p)
{
        int     npages = p->mxg_npages;
        int     i;

        CDEBUG(D_MALLOC, "freeing %d pages\n", npages);

        for (i = 0; i < npages; i++) {
                if (p->mxg_pages[i] != NULL) {
                        __free_page(p->mxg_pages[i]);
                        cfs_spin_lock(&kmxlnd_data.kmx_mem_lock);
                        kmxlnd_data.kmx_mem_used -= PAGE_SIZE;
                        cfs_spin_unlock(&kmxlnd_data.kmx_mem_lock);
                }
        }

        MXLND_FREE(p, offsetof(kmx_pages_t, mxg_pages[npages]));
}
static int
mxlnd_alloc_pages(kmx_pages_t **pp, int npages)
{
        kmx_pages_t     *p = NULL;
        int              i;

        CDEBUG(D_MALLOC, "allocing %d pages\n", npages);
        MXLND_ALLOC(p, offsetof(kmx_pages_t, mxg_pages[npages]));
        if (p == NULL) {
                CERROR("Can't allocate descriptor for %d pages\n", npages);
                return -ENOMEM;
        }
        memset(p, 0, offsetof(kmx_pages_t, mxg_pages[npages]));
        p->mxg_npages = npages;

        for (i = 0; i < npages; i++) {
                p->mxg_pages[i] = alloc_page(GFP_KERNEL);
                if (p->mxg_pages[i] == NULL) {
                        CERROR("Can't allocate page %d of %d\n", i, npages);
                        mxlnd_free_pages(p);
                        return -ENOMEM;
                }
                cfs_spin_lock(&kmxlnd_data.kmx_mem_lock);
                kmxlnd_data.kmx_mem_used += PAGE_SIZE;
                cfs_spin_unlock(&kmxlnd_data.kmx_mem_lock);
        }

        *pp = p;
        return 0;
}
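/*
 * Usage sketch (hypothetical caller, compiled out): how the two helpers
 * above are meant to pair up. The page count is arbitrary.
 */
#if 0
static int mxlnd_example_page_pool(void)
{
        kmx_pages_t     *pages = NULL;
        int              ret;

        ret = mxlnd_alloc_pages(&pages, 4);
        if (ret != 0)
                return ret;             /* -ENOMEM */

        /* ... hand pages->mxg_pages[0..3] to pre-mapped buffers ... */

        mxlnd_free_pages(pages);        /* also decrements kmx_mem_used */
        return 0;
}
#endif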
/**
 * mxlnd_ctx_init - reset ctx struct to the default values
 * @ctx - a kmx_ctx pointer
 */
void
mxlnd_ctx_init(kmx_ctx_t *ctx)
{
        if (ctx == NULL)
                return;

        /* do not change mxc_type */
        ctx->mxc_incarnation = 0;
        ctx->mxc_deadline = 0;
        ctx->mxc_state = MXLND_CTX_IDLE;
        if (!cfs_list_empty(&ctx->mxc_list))
                cfs_list_del_init(&ctx->mxc_list);
        /* ignore mxc_rx_list */
        if (ctx->mxc_type == MXLND_REQ_TX) {
                ctx->mxc_peer = NULL;
                ctx->mxc_conn = NULL;
        }
        ctx->mxc_lntmsg[0] = NULL;
        ctx->mxc_lntmsg[1] = NULL;
        ctx->mxc_msg_type = 0;
        ctx->mxc_cookie = 0LL;
        ctx->mxc_match = 0LL;
        /* ctx->mxc_seg.segment_ptr points to backing page */
        ctx->mxc_seg.segment_length = 0;
        if (ctx->mxc_seg_list != NULL) {
                LASSERT(ctx->mxc_nseg > 0);
                MXLND_FREE(ctx->mxc_seg_list,
                           ctx->mxc_nseg * sizeof(mx_ksegment_t));
        }
        ctx->mxc_seg_list = NULL;
        ctx->mxc_nseg = 0;
        memset(&ctx->mxc_mxreq, 0, sizeof(mx_request_t));
        memset(&ctx->mxc_status, 0, sizeof(mx_status_t));

        ctx->mxc_msg->mxm_type = 0;
        ctx->mxc_msg->mxm_credits = 0;
        ctx->mxc_msg->mxm_nob = 0;

        return;
}
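/*
 * Note that mxc_type and the pre-mapped message page (mxc_seg's
 * segment_ptr) survive a reset; everything else is cleared so the ctx
 * can be recycled through an idle list in the MXLND_CTX_IDLE state.
 */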
/**
 * mxlnd_free_txs - free kmx_txs and associated pages
 *
 * Called from mxlnd_shutdown()
 */
void
mxlnd_free_txs(void)
{
        kmx_ctx_t       *tx = NULL;
        int              i;

        if (kmxlnd_data.kmx_tx_pages) {
                for (i = 0; i < MXLND_TX_MSGS(); i++) {
                        tx = &kmxlnd_data.kmx_txs[i];
                        if (tx->mxc_seg_list != NULL) {
                                LASSERT(tx->mxc_nseg > 0);
                                MXLND_FREE(tx->mxc_seg_list,
                                           tx->mxc_nseg *
                                           sizeof(*tx->mxc_seg_list));
                        }
                }
                MXLND_FREE(kmxlnd_data.kmx_txs,
                           MXLND_TX_MSGS() * sizeof(kmx_ctx_t));
                mxlnd_free_pages(kmxlnd_data.kmx_tx_pages);
        }

        return;
}
/**
 * mxlnd_init_txs - allocate tx descriptors then stash on txs and idle tx lists
 *
 * Called from mxlnd_startup()
 * returns 0 on success, else -ENOMEM
 */
int
mxlnd_init_txs(void)
{
        int              ret = 0;
        int              i = 0;
        int              ipage = 0;
        int              offset = 0;
        void            *addr = NULL;
        kmx_ctx_t       *tx = NULL;
        kmx_pages_t     *pages = NULL;
        struct page     *page = NULL;
        /* pre-mapped messages are not bigger than 1 page */
        CLASSERT(MXLND_MSG_SIZE <= PAGE_SIZE);

        /* No fancy arithmetic when we do the buffer calculations */
        CLASSERT(PAGE_SIZE % MXLND_MSG_SIZE == 0);
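        /*
         * Worked example (sizes hypothetical): with PAGE_SIZE 4096 and
         * MXLND_MSG_SIZE 1024, each page holds exactly four pre-mapped
         * messages, so the offset arithmetic below always lands on a
         * page boundary and never splits a message across pages.
         */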
        ret = mxlnd_alloc_pages(&pages, MXLND_TX_MSG_PAGES());
        if (ret != 0) {
                CERROR("Can't allocate tx pages\n");
                return -ENOMEM;
        }
        kmxlnd_data.kmx_tx_pages = pages;

        MXLND_ALLOC(kmxlnd_data.kmx_txs, MXLND_TX_MSGS() * sizeof(kmx_ctx_t));
        if (kmxlnd_data.kmx_txs == NULL) {
                CERROR("Can't allocate %d tx descriptors\n", MXLND_TX_MSGS());
                mxlnd_free_pages(pages);
                return -ENOMEM;
        }

        memset(kmxlnd_data.kmx_txs, 0, MXLND_TX_MSGS() * sizeof(kmx_ctx_t));
        for (i = 0; i < MXLND_TX_MSGS(); i++) {
                tx = &kmxlnd_data.kmx_txs[i];
                tx->mxc_type = MXLND_REQ_TX;

                CFS_INIT_LIST_HEAD(&tx->mxc_list);

                /* map mxc_msg to page */
                page = pages->mxg_pages[ipage];
                addr = page_address(page);
                LASSERT(addr != NULL);
                tx->mxc_msg = (kmx_msg_t *)(addr + offset);
                tx->mxc_seg.segment_ptr = MX_PA_TO_U64(virt_to_phys(tx->mxc_msg));

                mxlnd_ctx_init(tx);

                offset += MXLND_MSG_SIZE;
                LASSERT(offset <= PAGE_SIZE);

                if (offset == PAGE_SIZE) {
                        offset = 0;
                        ipage++;
                        LASSERT(ipage <= MXLND_TX_MSG_PAGES());
                }

                /* in startup(), no locks required */
                cfs_list_add_tail(&tx->mxc_list, &kmxlnd_data.kmx_tx_idle);
        }

        return 0;
}
/**
 * mxlnd_free_peers - free peers
 *
 * Called from mxlnd_shutdown()
 */
void
mxlnd_free_peers(void)
{
        int              i = 0;
        int              count = 0;
        kmx_peer_t      *peer = NULL;
        kmx_peer_t      *next = NULL;

        for (i = 0; i < MXLND_HASH_SIZE; i++) {
                cfs_list_for_each_entry_safe(peer, next,
                                             &kmxlnd_data.kmx_peers[i],
                                             mxp_list) {
                        cfs_list_del_init(&peer->mxp_list);
                        if (peer->mxp_conn)
                                mxlnd_conn_decref(peer->mxp_conn);
                        mxlnd_peer_decref(peer);
                        count++;
                }
        }
        CDEBUG(D_NET, "%s: freed %d peers\n", __func__, count);
}
/**
 * mxlnd_init_mx - open the endpoint, set our ID, register the EAGER callback
 * @ni - the network interface
 *
 * Returns 0 on success, -1 on failure
 */
int
mxlnd_init_mx(lnet_ni_t *ni)
{
        mx_return_t     mxret;
        u32             board = *kmxlnd_tunables.kmx_board;
        u32             ep_id = *kmxlnd_tunables.kmx_ep_id;
        u64             nic_id = 0LL;
        char            *ifname = NULL;
        __u32           ip;
        __u32           netmask;
        int             if_up = 0;
        int             ret = 0;

        mxret = mx_init();
        if (mxret != MX_SUCCESS) {
                CERROR("mx_init() failed with %s (%d)\n",
                       mx_strerror(mxret), mxret);
                return -1;
        }

        if (ni->ni_interfaces[0] != NULL) {
                /* Use the IPoMX interface specified in 'networks=' */

                CLASSERT(LNET_MAX_INTERFACES > 1);
                if (ni->ni_interfaces[1] != NULL) {
                        CERROR("Multiple interfaces not supported\n");
                        goto failed_with_init;
                }

                ifname = ni->ni_interfaces[0];
        } else {
                ifname = *kmxlnd_tunables.kmx_default_ipif;
        }

        ret = libcfs_ipif_query(ifname, &if_up, &ip, &netmask);
        if (ret != 0) {
                CERROR("Can't query IPoMX interface %s: %d\n",
                       ifname, ret);
                goto failed_with_init;
        }

        if (!if_up) {
                CERROR("Can't query IPoMX interface %s: it's down\n",
                       ifname);
                goto failed_with_init;
        }

        mxret = mx_open_endpoint(board, ep_id, MXLND_MSG_MAGIC,
                                 NULL, 0, &kmxlnd_data.kmx_endpt);
        if (mxret != MX_SUCCESS) {
                CERROR("mx_open_endpoint() failed with %d\n", mxret);
                goto failed_with_init;
        }

        mx_get_endpoint_addr(kmxlnd_data.kmx_endpt, &kmxlnd_data.kmx_epa);
        mx_decompose_endpoint_addr(kmxlnd_data.kmx_epa, &nic_id, &ep_id);
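        /*
         * Connecting to our own endpoint address is a sanity check that
         * the board and endpoint are usable before we advertise a NID
         * (hence "unable to connect to myself" below).
         */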
        mxret = mx_connect(kmxlnd_data.kmx_endpt, nic_id, ep_id,
                           MXLND_MSG_MAGIC, MXLND_CONNECT_TIMEOUT/CFS_HZ*1000,
                           &kmxlnd_data.kmx_epa);
        if (mxret != MX_SUCCESS) {
                CDEBUG(D_NETERROR, "unable to connect to myself (%s)\n",
                       mx_strerror(mxret));
                goto failed_with_endpoint;
        }

        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), ip);
        CDEBUG(D_NET, "My NID is 0x%llx\n", ni->ni_nid);

        /* this will catch all unexpected receives. */
        mxret = mx_register_unexp_handler(kmxlnd_data.kmx_endpt,
                                          (mx_unexp_handler_t) mxlnd_unexpected_recv,
                                          NULL);
        if (mxret != MX_SUCCESS) {
                CERROR("mx_register_unexp_callback() failed with %s\n",
                       mx_strerror(mxret));
                goto failed_with_endpoint;
        }

        mxret = mx_set_request_timeout(kmxlnd_data.kmx_endpt, NULL,
                                       MXLND_COMM_TIMEOUT/CFS_HZ*1000);
        if (mxret != MX_SUCCESS) {
                CERROR("mx_set_request_timeout() failed with %s\n",
                       mx_strerror(mxret));
                goto failed_with_endpoint;
        }

        return 0;

failed_with_endpoint:
        mx_close_endpoint(kmxlnd_data.kmx_endpt);
failed_with_init:
        mx_finalize();
        return -1;
}
/**
 * mxlnd_thread_start - spawn a kernel thread with this function
 * @fn - function pointer
 * @arg - pointer to the parameter data
 *
 * Returns the non-negative pid on success, else a negative value
 */
int
mxlnd_thread_start(int (*fn)(void *arg), void *arg)
{
        int     pid = 0;
        int     i = (int) ((long) arg); /* thread index rides in arg */

        cfs_atomic_inc(&kmxlnd_data.kmx_nthreads);
        cfs_init_completion(&kmxlnd_data.kmx_completions[i]);

        pid = cfs_kernel_thread(fn, arg, 0);
        if (pid < 0) {
                CERROR("cfs_kernel_thread() failed with %d\n", pid);
                cfs_atomic_dec(&kmxlnd_data.kmx_nthreads);
        }
        return pid;
}
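/*
 * Each thread slot pairs the completion initialized in
 * mxlnd_thread_start() with the cfs_complete() below; mxlnd_shutdown()
 * waits on the same completions to know that all daemons have exited.
 */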
/**
 * mxlnd_thread_stop - decrement thread counter
 * @id - the thread's id
 *
 * The thread returns 0 when it detects shutdown.
 * We are simply decrementing the thread counter.
 */
void
mxlnd_thread_stop(long id)
{
        int     i = (int) id;

        cfs_atomic_dec(&kmxlnd_data.kmx_nthreads);
        cfs_complete(&kmxlnd_data.kmx_completions[i]);
        return;
}
/**
 * mxlnd_shutdown - stop IO, clean up state
 * @ni - LNET interface handle
 *
 * No calls to the LND should be made after calling this function.
 */
void
mxlnd_shutdown (lnet_ni_t *ni)
{
        int     i = 0;
        int     nthreads = MXLND_NDAEMONS
                           + *kmxlnd_tunables.kmx_n_waitd;

        LASSERT (ni == kmxlnd_data.kmx_ni);
        LASSERT (ni->ni_data == &kmxlnd_data);
        CDEBUG(D_NET, "in shutdown()\n");

        CDEBUG(D_MALLOC, "before MXLND cleanup: libcfs_kmemory %d "
               "kmx_mem_used %ld\n", cfs_atomic_read(&libcfs_kmemory),
               kmxlnd_data.kmx_mem_used);

        CDEBUG(D_NET, "setting shutdown = 1\n");
        cfs_atomic_set(&kmxlnd_data.kmx_shutdown, 1);
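        /*
         * Tear down in the reverse order of startup: kmx_init records
         * how far mxlnd_startup() got, and each case falls through to
         * undo the stages below it.
         */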
        switch (kmxlnd_data.kmx_init) {

        case MXLND_INIT_ALL:

                /* calls write_[un]lock(kmx_global_lock) */
                mxlnd_del_peer(LNET_NID_ANY);

                /* wakeup request_waitds */
                mx_wakeup(kmxlnd_data.kmx_endpt);
                cfs_up(&kmxlnd_data.kmx_tx_queue_sem);
                cfs_up(&kmxlnd_data.kmx_conn_sem);
                mxlnd_sleep(2 * CFS_HZ);

                /* fall through */
        case MXLND_INIT_THREADS:

                CDEBUG(D_NET, "waiting on threads\n");
                /* wait for threads to complete */
                for (i = 0; i < nthreads; i++) {
                        cfs_wait_for_completion(&kmxlnd_data.kmx_completions[i]);
                }
                LASSERT(cfs_atomic_read(&kmxlnd_data.kmx_nthreads) == 0);

                CDEBUG(D_NET, "freeing completions\n");
                MXLND_FREE(kmxlnd_data.kmx_completions,
                           nthreads * sizeof(cfs_completion_t));

                /* fall through */
        case MXLND_INIT_MX:

                CDEBUG(D_NET, "stopping mx\n");

                /* no peers left, close the endpoint */
                mx_close_endpoint(kmxlnd_data.kmx_endpt);
                mx_finalize();

                /* fall through */
        case MXLND_INIT_TXS:

                CDEBUG(D_NET, "freeing txs\n");

                /* free all txs and associated pages */
                mxlnd_free_txs();

                /* fall through */
        case MXLND_INIT_DATA:

                CDEBUG(D_NET, "freeing peers\n");

                /* peers should be gone, but check again */
                mxlnd_free_peers();

                /* conn zombies should be gone, but check again */
                mxlnd_free_conn_zombies();

                /* fall through */

        case MXLND_INIT_NOTHING:
                break;
        }
        CDEBUG(D_NET, "shutdown complete\n");

        CDEBUG(D_MALLOC, "after MXLND cleanup: libcfs_kmemory %d "
               "kmx_mem_used %ld\n", cfs_atomic_read(&libcfs_kmemory),
               kmxlnd_data.kmx_mem_used);

        kmxlnd_data.kmx_init = MXLND_INIT_NOTHING;
        return;
}
/**
 * mxlnd_startup - initialize state, open an endpoint, start IO
 * @ni - LNET interface handle
 *
 * Initialize state, open an endpoint, start monitoring threads.
 * Should only be called once.
 */
int
mxlnd_startup (lnet_ni_t *ni)
{
        int             i = 0;
        int             ret = 0;
        int             nthreads = MXLND_NDAEMONS /* tx_queued, timeoutd, connd */
                                   + *kmxlnd_tunables.kmx_n_waitd;
        struct timeval  tv;

        LASSERT (ni->ni_lnd == &the_kmxlnd);

        if (kmxlnd_data.kmx_init != MXLND_INIT_NOTHING) {
                CERROR("Only 1 instance supported\n");
                return -EPERM;
        }

        CDEBUG(D_MALLOC, "before MXLND startup: libcfs_kmemory %d "
               "kmx_mem_used %ld\n", cfs_atomic_read(&libcfs_kmemory),
               kmxlnd_data.kmx_mem_used);
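        /*
         * Credit accounting: make sure the interface-wide tx credit
         * limit is at least as large as what a single peer may consume
         * (the kmx_peercredits tunable).
         */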
        ni->ni_maxtxcredits = MXLND_TX_MSGS();
        ni->ni_peertxcredits = *kmxlnd_tunables.kmx_peercredits;
        if (ni->ni_maxtxcredits < ni->ni_peertxcredits)
                ni->ni_maxtxcredits = ni->ni_peertxcredits;
        memset (&kmxlnd_data, 0, sizeof (kmxlnd_data));

        kmxlnd_data.kmx_ni = ni;
        ni->ni_data = &kmxlnd_data;

        cfs_gettimeofday(&tv);
        kmxlnd_data.kmx_incarnation = (((__u64)tv.tv_sec) * 1000000) +
                                      tv.tv_usec;
        CDEBUG(D_NET, "my incarnation is %llu\n", kmxlnd_data.kmx_incarnation);

        cfs_rwlock_init (&kmxlnd_data.kmx_global_lock);
        cfs_spin_lock_init (&kmxlnd_data.kmx_mem_lock);

        CFS_INIT_LIST_HEAD (&kmxlnd_data.kmx_conn_reqs);
        CFS_INIT_LIST_HEAD (&kmxlnd_data.kmx_conn_zombies);
        CFS_INIT_LIST_HEAD (&kmxlnd_data.kmx_orphan_msgs);
        cfs_spin_lock_init (&kmxlnd_data.kmx_conn_lock);
        cfs_sema_init(&kmxlnd_data.kmx_conn_sem, 0);
        for (i = 0; i < MXLND_HASH_SIZE; i++) {
                CFS_INIT_LIST_HEAD (&kmxlnd_data.kmx_peers[i]);
        }

        CFS_INIT_LIST_HEAD (&kmxlnd_data.kmx_tx_idle);
        cfs_spin_lock_init (&kmxlnd_data.kmx_tx_idle_lock);
        kmxlnd_data.kmx_tx_next_cookie = 1;
        CFS_INIT_LIST_HEAD (&kmxlnd_data.kmx_tx_queue);
        cfs_spin_lock_init (&kmxlnd_data.kmx_tx_queue_lock);
        cfs_sema_init(&kmxlnd_data.kmx_tx_queue_sem, 0);
        kmxlnd_data.kmx_init = MXLND_INIT_DATA;
        /*****************************************************/

        ret = mxlnd_init_txs();
        if (ret != 0) {
                CERROR("Can't alloc tx descs: %d\n", ret);
                goto failed;
        }
        kmxlnd_data.kmx_init = MXLND_INIT_TXS;
        /*****************************************************/

        ret = mxlnd_init_mx(ni);
        if (ret != 0) {
                CERROR("Can't init mx\n");
                goto failed;
        }
        kmxlnd_data.kmx_init = MXLND_INIT_MX;
        /*****************************************************/
        /* start threads */

        MXLND_ALLOC(kmxlnd_data.kmx_completions,
                    nthreads * sizeof(cfs_completion_t));
        if (kmxlnd_data.kmx_completions == NULL) {
                CERROR("failed to alloc kmxlnd_data.kmx_completions\n");
                goto failed;
        }
        memset(kmxlnd_data.kmx_completions, 0,
               nthreads * sizeof(cfs_completion_t));

        CDEBUG(D_NET, "using %d %s in mx_wait_any()\n",
               *kmxlnd_tunables.kmx_n_waitd,
               *kmxlnd_tunables.kmx_n_waitd == 1 ? "thread" : "threads");
        for (i = 0; i < *kmxlnd_tunables.kmx_n_waitd; i++) {
                ret = mxlnd_thread_start(mxlnd_request_waitd,
                                         (void *)((long)i));
                if (ret < 0) {
                        CERROR("Starting mxlnd_request_waitd[%d] "
                               "failed with %d\n", i, ret);
                        cfs_atomic_set(&kmxlnd_data.kmx_shutdown, 1);
                        mx_wakeup(kmxlnd_data.kmx_endpt);
                        for (--i; i >= 0; i--) {
                                cfs_wait_for_completion(&kmxlnd_data.kmx_completions[i]);
                        }
                        LASSERT(cfs_atomic_read(&kmxlnd_data.kmx_nthreads) == 0);
                        MXLND_FREE(kmxlnd_data.kmx_completions,
                                   nthreads * sizeof(cfs_completion_t));
                        goto failed;
                }
        }
        ret = mxlnd_thread_start(mxlnd_tx_queued, (void *)((long)i++));
        if (ret < 0) {
                CERROR("Starting mxlnd_tx_queued failed with %d\n", ret);
                cfs_atomic_set(&kmxlnd_data.kmx_shutdown, 1);
                mx_wakeup(kmxlnd_data.kmx_endpt);
                for (--i; i >= 0; i--) {
                        cfs_wait_for_completion(&kmxlnd_data.kmx_completions[i]);
                }
                LASSERT(cfs_atomic_read(&kmxlnd_data.kmx_nthreads) == 0);
                MXLND_FREE(kmxlnd_data.kmx_completions,
                           nthreads * sizeof(cfs_completion_t));
                goto failed;
        }
        ret = mxlnd_thread_start(mxlnd_timeoutd, (void *)((long)i++));
        if (ret < 0) {
                CERROR("Starting mxlnd_timeoutd failed with %d\n", ret);
                cfs_atomic_set(&kmxlnd_data.kmx_shutdown, 1);
                mx_wakeup(kmxlnd_data.kmx_endpt);
                cfs_up(&kmxlnd_data.kmx_tx_queue_sem);
                for (--i; i >= 0; i--) {
                        cfs_wait_for_completion(&kmxlnd_data.kmx_completions[i]);
                }
                LASSERT(cfs_atomic_read(&kmxlnd_data.kmx_nthreads) == 0);
                MXLND_FREE(kmxlnd_data.kmx_completions,
                           nthreads * sizeof(cfs_completion_t));
                goto failed;
        }
        ret = mxlnd_thread_start(mxlnd_connd, (void *)((long)i++));
        if (ret < 0) {
                CERROR("Starting mxlnd_connd failed with %d\n", ret);
                cfs_atomic_set(&kmxlnd_data.kmx_shutdown, 1);
                mx_wakeup(kmxlnd_data.kmx_endpt);
                cfs_up(&kmxlnd_data.kmx_tx_queue_sem);
                for (--i; i >= 0; i--) {
                        cfs_wait_for_completion(&kmxlnd_data.kmx_completions[i]);
                }
                LASSERT(cfs_atomic_read(&kmxlnd_data.kmx_nthreads) == 0);
                MXLND_FREE(kmxlnd_data.kmx_completions,
                           nthreads * sizeof(cfs_completion_t));
                goto failed;
        }
        kmxlnd_data.kmx_init = MXLND_INIT_THREADS;
        /*****************************************************/

        kmxlnd_data.kmx_init = MXLND_INIT_ALL;
        CDEBUG(D_MALLOC, "startup complete (kmx_mem_used %ld)\n",
               kmxlnd_data.kmx_mem_used);

        return 0;
failed:
        CERROR("mxlnd_startup failed\n");
        mxlnd_shutdown(ni);
        return (-ENETDOWN);
}
static int mxlnd_init(void)
{
        lnet_register_lnd(&the_kmxlnd);
        return 0;
}

static void mxlnd_exit(void)
{
        lnet_unregister_lnd(&the_kmxlnd);
        return;
}

module_init(mxlnd_init);
module_exit(mxlnd_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Myricom, Inc. - help@myri.com");
MODULE_DESCRIPTION("Kernel MyrinetExpress LND");
MODULE_VERSION("0.6.0");