/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * lnet/klnds/qswlnd/qswlnd.c
 *
 * Author: Eric Barton <eric@bartonsoftware.com>
 */
#include "qswlnd.h"

lnd_t the_kqswlnd =
{
        .lnd_type       = QSWLND,
        .lnd_startup    = kqswnal_startup,
        .lnd_shutdown   = kqswnal_shutdown,
        .lnd_ctl        = kqswnal_ctl,
        .lnd_send       = kqswnal_send,
        .lnd_recv       = kqswnal_recv,
};

kqswnal_data_t          kqswnal_data;
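/* Debug helper backing the IOC_LIBCFS_GET_TXDESC ioctl: under the idle-txd
 * lock, walk the active transmit descriptors and copy details of the
 * data->ioc_count'th entry (peer NID, message type, payload length,
 * launcher and scheduling state) back into the ioctl buffer. */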
int
kqswnal_get_tx_desc (struct libcfs_ioctl_data *data)
{
        unsigned long      flags;
        cfs_list_t        *tmp;
        kqswnal_tx_t      *ktx;
        lnet_hdr_t        *hdr;
        int                index = data->ioc_count;
        int                rc = -ENOENT;

        spin_lock_irqsave(&kqswnal_data.kqn_idletxd_lock, flags);

        cfs_list_for_each (tmp, &kqswnal_data.kqn_activetxds) {
                if (index-- != 0)
                        continue;

                ktx = cfs_list_entry (tmp, kqswnal_tx_t, ktx_list);
                hdr = (lnet_hdr_t *)ktx->ktx_buffer;

                data->ioc_count  = le32_to_cpu(hdr->payload_length);
                data->ioc_nid    = le64_to_cpu(hdr->dest_nid);
                data->ioc_u64[0] = ktx->ktx_nid;
                data->ioc_u32[0] = le32_to_cpu(hdr->type);
                data->ioc_u32[1] = ktx->ktx_launcher;
                data->ioc_flags  =
                        (cfs_list_empty (&ktx->ktx_schedlist) ? 0 : 1) |
                        (ktx->ktx_state << 2);
                rc = 0;
                break;
        }

        spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags);
        return rc;
}
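/* LND control entry point.  IOC_LIBCFS_GET_TXDESC is a debugging hook (see
 * kqswnal_get_tx_desc() above); IOC_LIBCFS_REGISTER_MYNID is obsolete here
 * because the NID is derived from the Elan node id at startup, so it is
 * only sanity-checked and logged. */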
int
kqswnal_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg)
{
        struct libcfs_ioctl_data *data = arg;

        LASSERT (ni == kqswnal_data.kqn_ni);

        switch (cmd) {
        case IOC_LIBCFS_GET_TXDESC:
                return (kqswnal_get_tx_desc (data));

        case IOC_LIBCFS_REGISTER_MYNID:
                if (data->ioc_nid == ni->ni_nid)
                        return 0;

                LASSERT (LNET_NIDNET(data->ioc_nid) == LNET_NIDNET(ni->ni_nid));

                CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID for %s(%s)\n",
                       libcfs_nid2str(data->ioc_nid),
                       libcfs_nid2str(ni->ni_nid));
                return 0;

        default:
                return -EINVAL;
        }
}
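/* Tear down in roughly the reverse order of startup: flag shutdown and drain
 * pending sends, free the EKC receivers before the transmitter (receive
 * callbacks may still try to send), stop the scheduler threads, then unmap
 * and free every descriptor and buffer.  kqn_init records how far startup
 * got, so this is also the error-cleanup path for kqswnal_startup(). */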
void
kqswnal_shutdown(lnet_ni_t *ni)
{
        unsigned long flags;
        kqswnal_tx_t *ktx;
        kqswnal_rx_t *krx;

        CDEBUG (D_NET, "shutdown\n");
        LASSERT (ni->ni_data == &kqswnal_data);
        LASSERT (ni == kqswnal_data.kqn_ni);

        switch (kqswnal_data.kqn_init)
        {
        default:
                LASSERT (0);

        case KQN_INIT_ALL:
        case KQN_INIT_DATA:
                break;
        }

        /**********************************************************************/
        /* Signal the start of shutdown... */
        spin_lock_irqsave(&kqswnal_data.kqn_idletxd_lock, flags);
        kqswnal_data.kqn_shuttingdown = 1;
        spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags);

        /**********************************************************************/
        /* wait for sends that have allocated a tx desc to launch or give up */
        while (cfs_atomic_read (&kqswnal_data.kqn_pending_txs) != 0) {
                CDEBUG(D_NET, "waiting for %d pending sends\n",
                       cfs_atomic_read (&kqswnal_data.kqn_pending_txs));
                cfs_pause(cfs_time_seconds(1));
        }

        /**********************************************************************/
        /* close elan comms */
        /* Shut down receivers first; rx callbacks might try sending... */
        if (kqswnal_data.kqn_eprx_small != NULL)
                ep_free_rcvr (kqswnal_data.kqn_eprx_small);

        if (kqswnal_data.kqn_eprx_large != NULL)
                ep_free_rcvr (kqswnal_data.kqn_eprx_large);

        /* NB ep_free_rcvr() returns only after we've freed off all receive
         * buffers (see shutdown handling in kqswnal_requeue_rx()).  This
         * means we must have completed any messages we passed to
         * lnet_parse() */

        if (kqswnal_data.kqn_eptx != NULL)
                ep_free_xmtr (kqswnal_data.kqn_eptx);

        /* NB ep_free_xmtr() returns only after all outstanding transmits
         * have called their callback... */
        LASSERT(cfs_list_empty(&kqswnal_data.kqn_activetxds));

        /**********************************************************************/
        /* flag threads to terminate, wake them and wait for them to die */
        kqswnal_data.kqn_shuttingdown = 2;
        cfs_waitq_broadcast (&kqswnal_data.kqn_sched_waitq);

        while (cfs_atomic_read (&kqswnal_data.kqn_nthreads) != 0) {
                CDEBUG(D_NET, "waiting for %d threads to terminate\n",
                       cfs_atomic_read (&kqswnal_data.kqn_nthreads));
                cfs_pause(cfs_time_seconds(1));
        }

        /**********************************************************************/
        /* No more threads.  No more portals, router or comms callbacks!
         * I control the horizontals and the verticals... */

        LASSERT (cfs_list_empty (&kqswnal_data.kqn_readyrxds));
        LASSERT (cfs_list_empty (&kqswnal_data.kqn_donetxds));
        LASSERT (cfs_list_empty (&kqswnal_data.kqn_delayedtxds));

        /**********************************************************************/
        /* Unmap message buffers and free all descriptors and buffers */

        /* FTTB, we need to unmap any remaining mapped memory.  When
         * ep_dvma_release() gets fixed (and releases any mappings in the
         * region), we can delete all the code from here --------> */

        for (ktx = kqswnal_data.kqn_txds; ktx != NULL; ktx = ktx->ktx_alloclist) {
                /* If ktx has a buffer, it got mapped; unmap now.  NB only
                 * the pre-mapped stuff is still mapped since all tx descs
                 * must be idle */

                if (ktx->ktx_buffer != NULL)
                        ep_dvma_unload(kqswnal_data.kqn_ep,
                                       kqswnal_data.kqn_ep_tx_nmh,
                                       &ktx->ktx_ebuffer);
        }

        for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
                /* If krx_kiov[0].kiov_page got allocated, it got mapped.
                 * NB subsequent pages get merged */

                if (krx->krx_kiov[0].kiov_page != NULL)
                        ep_dvma_unload(kqswnal_data.kqn_ep,
                                       kqswnal_data.kqn_ep_rx_nmh,
                                       &krx->krx_elanbuffer);
        }
        /* <----------- to here */

        if (kqswnal_data.kqn_ep_rx_nmh != NULL)
                ep_dvma_release(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_rx_nmh);

        if (kqswnal_data.kqn_ep_tx_nmh != NULL)
                ep_dvma_release(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_tx_nmh);

        while (kqswnal_data.kqn_txds != NULL) {
                ktx = kqswnal_data.kqn_txds;

                if (ktx->ktx_buffer != NULL)
                        LIBCFS_FREE(ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);

                kqswnal_data.kqn_txds = ktx->ktx_alloclist;
                LIBCFS_FREE(ktx, sizeof(*ktx));
        }

        while (kqswnal_data.kqn_rxds != NULL) {
                int i;

                krx = kqswnal_data.kqn_rxds;
                for (i = 0; i < krx->krx_npages; i++)
                        if (krx->krx_kiov[i].kiov_page != NULL)
                                __free_page (krx->krx_kiov[i].kiov_page);

                kqswnal_data.kqn_rxds = krx->krx_alloclist;
                LIBCFS_FREE(krx, sizeof (*krx));
        }

        /* resets flags, pointers to NULL etc */
        memset(&kqswnal_data, 0, sizeof (kqswnal_data));

        CDEBUG (D_MALLOC, "done kmem %d\n", cfs_atomic_read(&libcfs_kmemory));
}
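/* Bring up the (single) kqswlnd network interface: initialise the global
 * state (KQN_INIT_DATA), attach to the Elan kernel comms (EKC), reserve
 * DVMA space, pre-allocate and pre-map the transmit buffers and receive
 * pages, post the receives and start one scheduler thread per online CPU
 * (KQN_INIT_ALL).  Every failure path calls kqswnal_shutdown(ni), which
 * undoes whatever had been set up by that point. */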
int
kqswnal_startup (lnet_ni_t *ni)
{
        EP_RAILMASK       all_rails = EP_RAILMASK_ALL;
        int               rc;
        int               i;
        kqswnal_rx_t     *krx;
        kqswnal_tx_t     *ktx;
        int               elan_page_idx;

        LASSERT (ni->ni_lnd == &the_kqswlnd);

        /* Only 1 instance supported */
        if (kqswnal_data.kqn_init != KQN_INIT_NOTHING) {
                CERROR ("Only 1 instance supported\n");
                return -EPERM;
        }

        if (ni->ni_interfaces[0] != NULL) {
                CERROR("Explicit interface config not supported\n");
                return -EPERM;
        }

        if (*kqswnal_tunables.kqn_credits >=
            *kqswnal_tunables.kqn_ntxmsgs) {
                LCONSOLE_ERROR_MSG(0x12e, "Configuration error: please set "
                                   "ntxmsgs(%d) > credits(%d)\n",
                                   *kqswnal_tunables.kqn_ntxmsgs,
                                   *kqswnal_tunables.kqn_credits);
                return -EINVAL;
        }

        CDEBUG (D_MALLOC, "start kmem %d\n", cfs_atomic_read(&libcfs_kmemory));

        /* ensure all pointers NULL etc */
        memset (&kqswnal_data, 0, sizeof (kqswnal_data));

        kqswnal_data.kqn_ni = ni;
        ni->ni_data = &kqswnal_data;
        ni->ni_peertxcredits = *kqswnal_tunables.kqn_peercredits;
        ni->ni_maxtxcredits = *kqswnal_tunables.kqn_credits;

        CFS_INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds);
        CFS_INIT_LIST_HEAD (&kqswnal_data.kqn_activetxds);
        spin_lock_init(&kqswnal_data.kqn_idletxd_lock);

        CFS_INIT_LIST_HEAD (&kqswnal_data.kqn_delayedtxds);
        CFS_INIT_LIST_HEAD (&kqswnal_data.kqn_donetxds);
        CFS_INIT_LIST_HEAD (&kqswnal_data.kqn_readyrxds);

        spin_lock_init(&kqswnal_data.kqn_sched_lock);
        cfs_waitq_init (&kqswnal_data.kqn_sched_waitq);

        /* pointers/lists/locks initialised */
        kqswnal_data.kqn_init = KQN_INIT_DATA;
        kqswnal_data.kqn_ep = ep_system();
        if (kqswnal_data.kqn_ep == NULL) {
                CERROR("Can't initialise EKC\n");
                kqswnal_shutdown(ni);
                return -ENODEV;
        }

        if (ep_waitfor_nodeid(kqswnal_data.kqn_ep) == ELAN_INVALID_NODE) {
                CERROR("Can't get elan ID\n");
                kqswnal_shutdown(ni);
                return -ENODEV;
        }

        kqswnal_data.kqn_nnodes = ep_numnodes (kqswnal_data.kqn_ep);
        kqswnal_data.kqn_elanid = ep_nodeid (kqswnal_data.kqn_ep);

        ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), kqswnal_data.kqn_elanid);

        /**********************************************************************/
        /* Get the transmitter */

        kqswnal_data.kqn_eptx = ep_alloc_xmtr (kqswnal_data.kqn_ep);
        if (kqswnal_data.kqn_eptx == NULL) {
                CERROR ("Can't allocate transmitter\n");
                kqswnal_shutdown (ni);
                return -ENOMEM;
        }

        /**********************************************************************/
        /* Get the receivers */

        kqswnal_data.kqn_eprx_small =
                ep_alloc_rcvr (kqswnal_data.kqn_ep,
                               EP_MSG_SVC_PORTALS_SMALL,
                               *kqswnal_tunables.kqn_ep_envelopes_small);
        if (kqswnal_data.kqn_eprx_small == NULL) {
                CERROR ("Can't install small msg receiver\n");
                kqswnal_shutdown (ni);
                return -ENOMEM;
        }

        kqswnal_data.kqn_eprx_large =
                ep_alloc_rcvr (kqswnal_data.kqn_ep,
                               EP_MSG_SVC_PORTALS_LARGE,
                               *kqswnal_tunables.kqn_ep_envelopes_large);
        if (kqswnal_data.kqn_eprx_large == NULL) {
                CERROR ("Can't install large msg receiver\n");
                kqswnal_shutdown (ni);
                return -ENOMEM;
        }
        /**********************************************************************/
        /* Reserve Elan address space for transmit descriptors NB we may
         * either send the contents of associated buffers immediately, or
         * map them for the peer to suck/blow... */
        kqswnal_data.kqn_ep_tx_nmh =
                ep_dvma_reserve(kqswnal_data.kqn_ep,
                                KQSW_NTXMSGPAGES*(*kqswnal_tunables.kqn_ntxmsgs),
                                EP_PERM_WRITE);
        if (kqswnal_data.kqn_ep_tx_nmh == NULL) {
                CERROR("Can't reserve tx dma space\n");
                kqswnal_shutdown(ni);
                return -ENOMEM;
        }

        /**********************************************************************/
        /* Reserve Elan address space for receive buffers */
        kqswnal_data.kqn_ep_rx_nmh =
                ep_dvma_reserve(kqswnal_data.kqn_ep,
                                KQSW_NRXMSGPAGES_SMALL *
                                (*kqswnal_tunables.kqn_nrxmsgs_small) +
                                KQSW_NRXMSGPAGES_LARGE *
                                (*kqswnal_tunables.kqn_nrxmsgs_large),
                                EP_PERM_WRITE);
        if (kqswnal_data.kqn_ep_rx_nmh == NULL) {
                CERROR("Can't reserve rx dma space\n");
                kqswnal_shutdown(ni);
                return -ENOMEM;
        }
        /**********************************************************************/
        /* Allocate/Initialise transmit descriptors */

        kqswnal_data.kqn_txds = NULL;
        for (i = 0; i < (*kqswnal_tunables.kqn_ntxmsgs); i++)
        {
                int premapped_pages;
                int basepage = i * KQSW_NTXMSGPAGES;

                LIBCFS_ALLOC (ktx, sizeof(*ktx));
                if (ktx == NULL) {
                        kqswnal_shutdown (ni);
                        return -ENOMEM;
                }

                memset(ktx, 0, sizeof(*ktx));   /* NULL pointers; zero flags */
                ktx->ktx_alloclist = kqswnal_data.kqn_txds;
                kqswnal_data.kqn_txds = ktx;

                LIBCFS_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
                if (ktx->ktx_buffer == NULL) {
                        kqswnal_shutdown (ni);
                        return -ENOMEM;
                }

                /* Map pre-allocated buffer NOW, to save latency on transmit */
                premapped_pages = kqswnal_pages_spanned(ktx->ktx_buffer,
                                                        KQSW_TX_BUFFER_SIZE);
                ep_dvma_load(kqswnal_data.kqn_ep, NULL,
                             ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE,
                             kqswnal_data.kqn_ep_tx_nmh, basepage,
                             &all_rails, &ktx->ktx_ebuffer);

                ktx->ktx_basepage = basepage + premapped_pages; /* message mapping starts here */
                ktx->ktx_npages = KQSW_NTXMSGPAGES - premapped_pages; /* for this many pages */

                CFS_INIT_LIST_HEAD (&ktx->ktx_schedlist);

                ktx->ktx_state = KTX_IDLE;
                ktx->ktx_rail = -1;             /* unset rail */

                cfs_list_add_tail (&ktx->ktx_list, &kqswnal_data.kqn_idletxds);
        }
        /**********************************************************************/
        /* Allocate/Initialise receive descriptors */
        kqswnal_data.kqn_rxds = NULL;
        elan_page_idx = 0;
        for (i = 0; i < *kqswnal_tunables.kqn_nrxmsgs_small + *kqswnal_tunables.kqn_nrxmsgs_large; i++)
        {
                EP_NMD elanbuffer;
                int    j;

                LIBCFS_ALLOC(krx, sizeof(*krx));
                if (krx == NULL) {
                        kqswnal_shutdown(ni);
                        return -ENOMEM;
                }

                memset(krx, 0, sizeof(*krx)); /* clear flags, null pointers etc */
                krx->krx_alloclist = kqswnal_data.kqn_rxds;
                kqswnal_data.kqn_rxds = krx;

                if (i < *kqswnal_tunables.kqn_nrxmsgs_small) {
                        krx->krx_npages = KQSW_NRXMSGPAGES_SMALL;
                        krx->krx_eprx   = kqswnal_data.kqn_eprx_small;
                } else {
                        krx->krx_npages = KQSW_NRXMSGPAGES_LARGE;
                        krx->krx_eprx   = kqswnal_data.kqn_eprx_large;
                }

                LASSERT (krx->krx_npages > 0);
                for (j = 0; j < krx->krx_npages; j++)
                {
                        struct page *page = alloc_page(GFP_KERNEL);

                        if (page == NULL) {
                                kqswnal_shutdown (ni);
                                return -ENOMEM;
                        }

                        krx->krx_kiov[j] = (lnet_kiov_t) {.kiov_page   = page,
                                                          .kiov_offset = 0,
                                                          .kiov_len    = PAGE_SIZE};
                        LASSERT(page_address(page) != NULL);

                        ep_dvma_load(kqswnal_data.kqn_ep, NULL,
                                     page_address(page),
                                     PAGE_SIZE, kqswnal_data.kqn_ep_rx_nmh,
                                     elan_page_idx, &all_rails, &elanbuffer);

                        if (j == 0) {
                                krx->krx_elanbuffer = elanbuffer;
                        } else {
                                rc = ep_nmd_merge(&krx->krx_elanbuffer,
                                                  &krx->krx_elanbuffer,
                                                  &elanbuffer);
                                /* NB contiguous mapping */
                                LASSERT(rc);
                        }
                        elan_page_idx++;
                }
        }
        LASSERT (elan_page_idx ==
                 (*kqswnal_tunables.kqn_nrxmsgs_small * KQSW_NRXMSGPAGES_SMALL) +
                 (*kqswnal_tunables.kqn_nrxmsgs_large * KQSW_NRXMSGPAGES_LARGE));
        /**********************************************************************/
        /* Queue receives, now that it's OK to run their completion callbacks */

        for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
                /* NB this enqueue can allocate/sleep (attr == 0) */
                krx->krx_state = KRX_POSTED;
                rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx,
                                      &krx->krx_elanbuffer, 0);
                if (rc != EP_SUCCESS) {
                        CERROR ("failed ep_queue_receive %d\n", rc);
                        kqswnal_shutdown (ni);
                        return -EIO;
                }
        }

        /**********************************************************************/
        /* Spawn scheduling threads */
        for (i = 0; i < cfs_num_online_cpus(); i++) {
                rc = kqswnal_thread_start (kqswnal_scheduler, NULL);
                if (rc != 0) {
                        CERROR ("failed to spawn scheduling thread: %d\n", rc);
                        kqswnal_shutdown (ni);
                        return rc;
                }
        }

        kqswnal_data.kqn_init = KQN_INIT_ALL;
        return 0;
}
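/* Module entry points: register the LND with LNet on load and unregister it
 * on unload.  The network interface itself is only brought up when LNet
 * calls lnd_startup (kqswnal_startup) for a configured Elan network. */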
void
kqswnal_finalise (void)
{
        lnet_unregister_lnd(&the_kqswlnd);
        kqswnal_tunables_fini();
}

int
kqswnal_initialise (void)
{
        int rc = kqswnal_tunables_init();

        if (rc != 0)
                return rc;

        lnet_register_lnd(&the_kqswlnd);
        return 0;
}

MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
MODULE_DESCRIPTION("Kernel Quadrics/Elan LND v1.01");
MODULE_LICENSE("GPL");

module_init (kqswnal_initialise);
module_exit (kqswnal_finalise);