4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lnet/klnds/qswlnd/qswlnd.c
38 * Author: Eric Barton <eric@bartonsoftware.com>
/* LND operation table entries: each LNet callback slot is bound to the
 * matching kqswnal_* handler.  Presumably these are the members of
 * the_kqswlnd (the object registered in kqswnal_initialise()); the opening
 * line of the initializer is not visible in this listing -- confirm. */
47 .lnd_startup = kqswnal_startup,
48 .lnd_shutdown = kqswnal_shutdown,
49 .lnd_ctl = kqswnal_ctl,
50 .lnd_send = kqswnal_send,
51 .lnd_recv = kqswnal_recv,
/* The one global state object of this LND; kqswnal_startup() rejects a
 * second instance and kqswnal_shutdown() zeroes it again. */
54 kqswnal_data_t kqswnal_data;
/*
 * Report one active transmit descriptor through an ioctl buffer.
 *
 * data - ioctl buffer; data->ioc_count is read on entry as an index and the
 *        buffer is then overwritten with details of a descriptor found on
 *        kqswnal_data.kqn_activetxds.
 *
 * The list walk runs with kqn_idletxd_lock held and interrupts disabled.
 *
 * NOTE(review): this listing has dropped lines (return type, local
 * declarations, the index test inside the loop, the target of the flags
 * assignment, loop exit and the return statement) -- confirm the exact
 * selection and return-code logic against the full source.
 */
57 kqswnal_get_tx_desc (struct libcfs_ioctl_data *data)
63 int index = data->ioc_count;
66 spin_lock_irqsave(&kqswnal_data.kqn_idletxd_lock, flags);
68 cfs_list_for_each (tmp, &kqswnal_data.kqn_activetxds) {
72 ktx = cfs_list_entry (tmp, kqswnal_tx_t, ktx_list);
/* The descriptor's buffer is read as an LNet header. */
73 hdr = (lnet_hdr_t *)ktx->ktx_buffer;
/* Header fields go through le32/le64_to_cpu before being reported. */
75 data->ioc_count = le32_to_cpu(hdr->payload_length);
76 data->ioc_nid = le64_to_cpu(hdr->dest_nid);
77 data->ioc_u64[0] = ktx->ktx_nid;
78 data->ioc_u32[0] = le32_to_cpu(hdr->type);
79 data->ioc_u32[1] = ktx->ktx_launcher;
/* Packed status: bit 0 is set when ktx_schedlist is non-empty, and
 * ktx_state is stored shifted left by two bits. */
81 (cfs_list_empty (&ktx->ktx_schedlist) ? 0 : 1) |
82 (ktx->ktx_state << 2);
87 spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags);
/*
 * Control (ioctl) entry point of the LND, installed as .lnd_ctl.
 *
 * ni  - network interface; asserted to be kqswnal_data.kqn_ni
 * cmd - IOC_LIBCFS_* command code
 * arg - interpreted as a struct libcfs_ioctl_data *
 *
 * IOC_LIBCFS_GET_TXDESC is delegated to kqswnal_get_tx_desc() and its result
 * returned.  IOC_LIBCFS_REGISTER_MYNID is obsolete: a request whose NID
 * differs from ni->ni_nid must at least be on the same network (asserted)
 * and is reported with CERROR.
 *
 * NOTE(review): the switch statement, the return values of the
 * REGISTER_MYNID paths and any default case are not visible in this
 * listing -- confirm against the full source.
 */
92 kqswnal_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg)
94 struct libcfs_ioctl_data *data = arg;
96 LASSERT (ni == kqswnal_data.kqn_ni);
99 case IOC_LIBCFS_GET_TXDESC:
100 return (kqswnal_get_tx_desc (data));
102 case IOC_LIBCFS_REGISTER_MYNID:
/* Re-registering the NID we already have needs no report. */
103 if (data->ioc_nid == ni->ni_nid)
106 LASSERT (LNET_NIDNET(data->ioc_nid) == LNET_NIDNET(ni->ni_nid));
108 CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID for %s(%s)\n",
109 libcfs_nid2str(data->ioc_nid),
110 libcfs_nid2str(ni->ni_nid));
/*
 * Tear down the LND instance attached to ni.  Installed as .lnd_shutdown and
 * also called by kqswnal_startup() on its failure paths, so the resource
 * handles are NULL-checked before they are released.
 *
 * Visible sequence:
 *  1. set kqn_shuttingdown to 1 under kqn_idletxd_lock;
 *  2. poll once a second until kqn_pending_txs reaches zero;
 *  3. free the small and large receivers, then the transmitter;
 *  4. set kqn_shuttingdown to 2, wake the scheduler wait queue and poll once
 *     a second until kqn_nthreads reaches zero;
 *  5. unload the DVMA mappings of tx buffers and rx pages, release the rx
 *     and tx DVMA reservations, then free every tx and rx descriptor along
 *     with its buffer or pages;
 *  6. zero kqswnal_data and drop the module reference.
 *
 * NOTE(review): this listing has dropped lines (return type, local
 * declarations, braces, the body of the switch on kqn_init, some call
 * arguments and several comment terminators) -- confirm details against the
 * full source.
 */
119 kqswnal_shutdown(lnet_ni_t *ni)
125 CDEBUG (D_NET, "shutdown\n");
126 LASSERT (ni->ni_data == &kqswnal_data);
127 LASSERT (ni == kqswnal_data.kqn_ni);
/* Dispatch on how far initialisation got; the case labels are not visible
 * in this listing. */
129 switch (kqswnal_data.kqn_init)
139 /**********************************************************************/
140 /* Signal the start of shutdown... */
141 spin_lock_irqsave(&kqswnal_data.kqn_idletxd_lock, flags);
142 kqswnal_data.kqn_shuttingdown = 1;
143 spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags);
145 /**********************************************************************/
146 /* wait for sends that have allocated a tx desc to launch or give up */
147 while (cfs_atomic_read (&kqswnal_data.kqn_pending_txs) != 0) {
148 CDEBUG(D_NET, "waiting for %d pending sends\n",
149 cfs_atomic_read (&kqswnal_data.kqn_pending_txs));
150 cfs_pause(cfs_time_seconds(1));
153 /**********************************************************************/
154 /* close elan comms */
155 /* Shut down receivers first; rx callbacks might try sending... */
156 if (kqswnal_data.kqn_eprx_small != NULL)
157 ep_free_rcvr (kqswnal_data.kqn_eprx_small);
159 if (kqswnal_data.kqn_eprx_large != NULL)
160 ep_free_rcvr (kqswnal_data.kqn_eprx_large);
162 /* NB ep_free_rcvr() returns only after we've freed off all receive
163 * buffers (see shutdown handling in kqswnal_requeue_rx()). This
164 * means we must have completed any messages we passed to
167 if (kqswnal_data.kqn_eptx != NULL)
168 ep_free_xmtr (kqswnal_data.kqn_eptx);
170 /* NB ep_free_xmtr() returns only after all outstanding transmits
171 * have called their callback... */
172 LASSERT(cfs_list_empty(&kqswnal_data.kqn_activetxds));
174 /**********************************************************************/
175 /* flag threads to terminate, wake them and wait for them to die */
176 kqswnal_data.kqn_shuttingdown = 2;
177 cfs_waitq_broadcast (&kqswnal_data.kqn_sched_waitq);
179 while (cfs_atomic_read (&kqswnal_data.kqn_nthreads) != 0) {
180 CDEBUG(D_NET, "waiting for %d threads to terminate\n",
181 cfs_atomic_read (&kqswnal_data.kqn_nthreads));
182 cfs_pause(cfs_time_seconds(1));
185 /**********************************************************************/
186 /* No more threads. No more portals, router or comms callbacks!
187 * I control the horizontals and the verticals...
190 LASSERT (cfs_list_empty (&kqswnal_data.kqn_readyrxds));
191 LASSERT (cfs_list_empty (&kqswnal_data.kqn_donetxds));
192 LASSERT (cfs_list_empty (&kqswnal_data.kqn_delayedtxds));
194 /**********************************************************************/
195 /* Unmap message buffers and free all descriptors and buffers
198 /* FTTB, we need to unmap any remaining mapped memory. When
199 * ep_dvma_release() get fixed (and releases any mappings in the
200 * region), we can delete all the code from here --------> */
202 for (ktx = kqswnal_data.kqn_txds; ktx != NULL; ktx = ktx->ktx_alloclist) {
203 /* If ktx has a buffer, it got mapped; unmap now. NB only
204 * the pre-mapped stuff is still mapped since all tx descs
207 if (ktx->ktx_buffer != NULL)
208 ep_dvma_unload(kqswnal_data.kqn_ep,
209 kqswnal_data.kqn_ep_tx_nmh,
213 for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
214 /* If krx_kiov[0].kiov_page got allocated, it got mapped.
215 * NB subsequent pages get merged */
217 if (krx->krx_kiov[0].kiov_page != NULL)
218 ep_dvma_unload(kqswnal_data.kqn_ep,
219 kqswnal_data.kqn_ep_rx_nmh,
220 &krx->krx_elanbuffer);
222 /* <----------- to here */
224 if (kqswnal_data.kqn_ep_rx_nmh != NULL)
225 ep_dvma_release(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_rx_nmh);
227 if (kqswnal_data.kqn_ep_tx_nmh != NULL)
228 ep_dvma_release(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_tx_nmh);
/* Free every tx descriptor, popping from the head of the allocation list;
 * a descriptor may exist without a buffer if startup failed part-way. */
230 while (kqswnal_data.kqn_txds != NULL) {
231 ktx = kqswnal_data.kqn_txds;
233 if (ktx->ktx_buffer != NULL)
234 LIBCFS_FREE(ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
236 kqswnal_data.kqn_txds = ktx->ktx_alloclist;
237 LIBCFS_FREE(ktx, sizeof(*ktx));
/* Free every rx descriptor together with whichever of its krx_npages pages
 * were actually allocated. */
240 while (kqswnal_data.kqn_rxds != NULL) {
243 krx = kqswnal_data.kqn_rxds;
244 for (i = 0; i < krx->krx_npages; i++)
245 if (krx->krx_kiov[i].kiov_page != NULL)
246 __free_page (krx->krx_kiov[i].kiov_page);
248 kqswnal_data.kqn_rxds = krx->krx_alloclist;
249 LIBCFS_FREE(krx, sizeof (*krx));
252 /* resets flags, pointers to NULL etc */
253 memset(&kqswnal_data, 0, sizeof (kqswnal_data));
255 CDEBUG (D_MALLOC, "done kmem %d\n", cfs_atomic_read(&libcfs_kmemory));
/* Pairs with the try_module_get() in kqswnal_startup(). */
257 module_put(THIS_MODULE);
/*
 * Bring up the single supported LND instance on ni.  Installed as
 * .lnd_startup.
 *
 * The configuration is validated first: only one instance, no explicit
 * interface list, and ntxmsgs strictly greater than credits.  kqswnal_data
 * is then zeroed and its lists, locks and wait queue initialised.  After
 * that the function acquires, in order: the EKC system handle and Elan node
 * id, the transmitter, the small and large receivers, the tx and rx DVMA
 * reservations, the tx descriptors (each with a pre-mapped buffer), the rx
 * descriptors (each with its pages mapped), the posted receives, and one
 * scheduler thread per online CPU.  Each failure after kqn_init has been set
 * to KQN_INIT_DATA calls kqswnal_shutdown(ni) to unwind.
 *
 * NOTE(review): this listing has dropped lines (return type, local
 * declarations, braces, some call arguments, NULL checks after allocations
 * and all return statements); they are left as found here, not reconstructed.
 */
261 kqswnal_startup (lnet_ni_t *ni)
263 EP_RAILMASK all_rails = EP_RAILMASK_ALL;
270 LASSERT (ni->ni_lnd == &the_kqswlnd);
272 /* Only 1 instance supported */
273 if (kqswnal_data.kqn_init != KQN_INIT_NOTHING) {
274 CERROR ("Only 1 instance supported\n");
278 if (ni->ni_interfaces[0] != NULL) {
279 CERROR("Explicit interface config not supported\n");
/* ntxmsgs must strictly exceed credits. */
283 if (*kqswnal_tunables.kqn_credits >=
284 *kqswnal_tunables.kqn_ntxmsgs) {
285 LCONSOLE_ERROR_MSG(0x12e, "Configuration error: please set "
286 "ntxmsgs(%d) > credits(%d)\n",
287 *kqswnal_tunables.kqn_ntxmsgs,
288 *kqswnal_tunables.kqn_credits);
291 CDEBUG (D_MALLOC, "start kmem %d\n", cfs_atomic_read(&libcfs_kmemory));
293 /* ensure all pointers NULL etc */
294 memset (&kqswnal_data, 0, sizeof (kqswnal_data));
296 kqswnal_data.kqn_ni = ni;
297 ni->ni_data = &kqswnal_data;
298 ni->ni_peertxcredits = *kqswnal_tunables.kqn_peercredits;
299 ni->ni_maxtxcredits = *kqswnal_tunables.kqn_credits;
301 CFS_INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds);
302 CFS_INIT_LIST_HEAD (&kqswnal_data.kqn_activetxds);
303 spin_lock_init(&kqswnal_data.kqn_idletxd_lock);
305 CFS_INIT_LIST_HEAD (&kqswnal_data.kqn_delayedtxds);
306 CFS_INIT_LIST_HEAD (&kqswnal_data.kqn_donetxds);
307 CFS_INIT_LIST_HEAD (&kqswnal_data.kqn_readyrxds);
309 spin_lock_init(&kqswnal_data.kqn_sched_lock);
310 cfs_waitq_init (&kqswnal_data.kqn_sched_waitq);
312 /* pointers/lists/locks initialised */
313 kqswnal_data.kqn_init = KQN_INIT_DATA;
/* Reference dropped by module_put() at the end of kqswnal_shutdown().
 * NOTE(review): the result of try_module_get() is not checked here. */
314 try_module_get(THIS_MODULE);
316 kqswnal_data.kqn_ep = ep_system();
317 if (kqswnal_data.kqn_ep == NULL) {
318 CERROR("Can't initialise EKC\n");
319 kqswnal_shutdown(ni);
323 if (ep_waitfor_nodeid(kqswnal_data.kqn_ep) == ELAN_INVALID_NODE) {
324 CERROR("Can't get elan ID\n");
325 kqswnal_shutdown(ni);
329 kqswnal_data.kqn_nnodes = ep_numnodes (kqswnal_data.kqn_ep);
330 kqswnal_data.kqn_elanid = ep_nodeid (kqswnal_data.kqn_ep);
/* Rebuild the NID from its existing network number and the Elan node id. */
332 ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), kqswnal_data.kqn_elanid);
334 /**********************************************************************/
335 /* Get the transmitter */
337 kqswnal_data.kqn_eptx = ep_alloc_xmtr (kqswnal_data.kqn_ep);
338 if (kqswnal_data.kqn_eptx == NULL)
340 CERROR ("Can't allocate transmitter\n");
341 kqswnal_shutdown (ni);
345 /**********************************************************************/
346 /* Get the receivers */
348 kqswnal_data.kqn_eprx_small =
349 ep_alloc_rcvr (kqswnal_data.kqn_ep,
350 EP_MSG_SVC_PORTALS_SMALL,
351 *kqswnal_tunables.kqn_ep_envelopes_small);
352 if (kqswnal_data.kqn_eprx_small == NULL)
354 CERROR ("Can't install small msg receiver\n");
355 kqswnal_shutdown (ni);
359 kqswnal_data.kqn_eprx_large =
360 ep_alloc_rcvr (kqswnal_data.kqn_ep,
361 EP_MSG_SVC_PORTALS_LARGE,
362 *kqswnal_tunables.kqn_ep_envelopes_large);
363 if (kqswnal_data.kqn_eprx_large == NULL)
365 CERROR ("Can't install large msg receiver\n");
366 kqswnal_shutdown (ni);
370 /**********************************************************************/
371 /* Reserve Elan address space for transmit descriptors NB we may
372 * either send the contents of associated buffers immediately, or
373 * map them for the peer to suck/blow... */
374 kqswnal_data.kqn_ep_tx_nmh =
375 ep_dvma_reserve(kqswnal_data.kqn_ep,
376 KQSW_NTXMSGPAGES*(*kqswnal_tunables.kqn_ntxmsgs),
378 if (kqswnal_data.kqn_ep_tx_nmh == NULL) {
379 CERROR("Can't reserve tx dma space\n");
380 kqswnal_shutdown(ni);
384 /**********************************************************************/
385 /* Reserve Elan address space for receive buffers */
386 kqswnal_data.kqn_ep_rx_nmh =
387 ep_dvma_reserve(kqswnal_data.kqn_ep,
388 KQSW_NRXMSGPAGES_SMALL *
389 (*kqswnal_tunables.kqn_nrxmsgs_small) +
390 KQSW_NRXMSGPAGES_LARGE *
391 (*kqswnal_tunables.kqn_nrxmsgs_large),
/* Check the rx handle that was just reserved.  The tx handle is already
 * known to be non-NULL at this point, so testing it here would let a failed
 * rx reservation slip through to the ep_dvma_load() calls below. */
393 if (kqswnal_data.kqn_ep_rx_nmh == NULL) {
394 CERROR("Can't reserve rx dma space\n");
395 kqswnal_shutdown(ni);
399 /**********************************************************************/
400 /* Allocate/Initialise transmit descriptors */
402 kqswnal_data.kqn_txds = NULL;
403 for (i = 0; i < (*kqswnal_tunables.kqn_ntxmsgs); i++)
/* Each descriptor owns KQSW_NTXMSGPAGES consecutive pages of the tx
 * reservation, starting at this index. */
406 int basepage = i * KQSW_NTXMSGPAGES;
408 LIBCFS_ALLOC (ktx, sizeof(*ktx));
410 kqswnal_shutdown (ni);
414 memset(ktx, 0, sizeof(*ktx)); /* NULL pointers; zero flags */
/* Link onto the allocation list at once so kqswnal_shutdown() can free the
 * descriptor if a later step fails. */
415 ktx->ktx_alloclist = kqswnal_data.kqn_txds;
416 kqswnal_data.kqn_txds = ktx;
418 LIBCFS_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
419 if (ktx->ktx_buffer == NULL)
421 kqswnal_shutdown (ni);
425 /* Map pre-allocated buffer NOW, to save latency on transmit */
426 premapped_pages = kqswnal_pages_spanned(ktx->ktx_buffer,
427 KQSW_TX_BUFFER_SIZE);
428 ep_dvma_load(kqswnal_data.kqn_ep, NULL,
429 ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE,
430 kqswnal_data.kqn_ep_tx_nmh, basepage,
431 &all_rails, &ktx->ktx_ebuffer);
433 ktx->ktx_basepage = basepage + premapped_pages; /* message mapping starts here */
434 ktx->ktx_npages = KQSW_NTXMSGPAGES - premapped_pages; /* for this many pages */
436 CFS_INIT_LIST_HEAD (&ktx->ktx_schedlist);
438 ktx->ktx_state = KTX_IDLE;
439 ktx->ktx_rail = -1; /* unset rail */
441 cfs_list_add_tail (&ktx->ktx_list, &kqswnal_data.kqn_idletxds);
444 /**********************************************************************/
445 /* Allocate/Initialise receive descriptors */
446 kqswnal_data.kqn_rxds = NULL;
448 for (i = 0; i < *kqswnal_tunables.kqn_nrxmsgs_small + *kqswnal_tunables.kqn_nrxmsgs_large; i++)
453 LIBCFS_ALLOC(krx, sizeof(*krx));
455 kqswnal_shutdown(ni);
459 memset(krx, 0, sizeof(*krx)); /* clear flags, null pointers etc */
460 krx->krx_alloclist = kqswnal_data.kqn_rxds;
461 kqswnal_data.kqn_rxds = krx;
/* The first nrxmsgs_small descriptors serve the small receiver; the
 * remainder serve the large one. */
463 if (i < *kqswnal_tunables.kqn_nrxmsgs_small)
465 krx->krx_npages = KQSW_NRXMSGPAGES_SMALL;
466 krx->krx_eprx = kqswnal_data.kqn_eprx_small;
470 krx->krx_npages = KQSW_NRXMSGPAGES_LARGE;
471 krx->krx_eprx = kqswnal_data.kqn_eprx_large;
474 LASSERT (krx->krx_npages > 0);
475 for (j = 0; j < krx->krx_npages; j++)
477 struct page *page = alloc_page(GFP_KERNEL);
480 kqswnal_shutdown (ni);
484 krx->krx_kiov[j] = (lnet_kiov_t) {.kiov_page = page,
486 .kiov_len = PAGE_SIZE};
487 LASSERT(page_address(page) != NULL);
489 ep_dvma_load(kqswnal_data.kqn_ep, NULL,
491 PAGE_SIZE, kqswnal_data.kqn_ep_rx_nmh,
492 elan_page_idx, &all_rails, &elanbuffer);
495 krx->krx_elanbuffer = elanbuffer;
497 rc = ep_nmd_merge(&krx->krx_elanbuffer,
498 &krx->krx_elanbuffer,
500 /* NB contiguous mapping */
/* Every page of the rx reservation must have been consumed. */
507 LASSERT (elan_page_idx ==
508 (*kqswnal_tunables.kqn_nrxmsgs_small * KQSW_NRXMSGPAGES_SMALL) +
509 (*kqswnal_tunables.kqn_nrxmsgs_large * KQSW_NRXMSGPAGES_LARGE));
511 /**********************************************************************/
512 /* Queue receives, now that it's OK to run their completion callbacks */
514 for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
515 /* NB this enqueue can allocate/sleep (attr == 0) */
516 krx->krx_state = KRX_POSTED;
517 rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx,
518 &krx->krx_elanbuffer, 0);
519 if (rc != EP_SUCCESS) {
520 CERROR ("failed ep_queue_receive %d\n", rc);
521 kqswnal_shutdown (ni);
526 /**********************************************************************/
527 /* Spawn scheduling threads */
528 for (i = 0; i < num_online_cpus(); i++) {
529 rc = kqswnal_thread_start(kqswnal_scheduler, NULL,
533 CERROR ("failed to spawn scheduling thread: %d\n", rc);
534 kqswnal_shutdown (ni);
539 kqswnal_data.kqn_init = KQN_INIT_ALL;
/*
 * Module exit handler (registered through module_exit): unregisters
 * the_kqswlnd from LNet and then releases the tunables, the reverse of the
 * order used in kqswnal_initialise().
 */
544 kqswnal_finalise (void)
546 lnet_unregister_lnd(&the_kqswlnd);
547 kqswnal_tunables_fini();
/*
 * Module init handler (registered through module_init): initialises the
 * tunables and registers the_kqswlnd with LNet.
 *
 * NOTE(review): what happens to rc between the two calls, and the return
 * statement, are not visible in this listing -- confirm against the full
 * source.
 */
551 kqswnal_initialise (void)
553 int rc = kqswnal_tunables_init();
558 lnet_register_lnd(&the_kqswlnd);
/* Kernel module metadata. */
562 MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
563 MODULE_DESCRIPTION("Kernel Quadrics/Elan LND v1.01");
564 MODULE_LICENSE("GPL");
/* Module load/unload entry points. */
566 module_init (kqswnal_initialise);
567 module_exit (kqswnal_finalise);