1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
3 * Copyright (C) 2002-2004 Cluster File Systems, Inc.
4 * Author: Eric Barton <eric@bartonsoftware.com>
6 * This file is part of Portals, http://www.lustre.org
8 * Portals is free software; you can redistribute it and/or
9 * modify it under the terms of version 2 of the GNU General Public
10 * License as published by the Free Software Foundation.
12 * Portals is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with Portals; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
/* NOTE(review): lnd_t method table wiring this LND's entry points into
 * LNet. The "lnd_t the_kqswlnd =" header, .lnd_type initialiser and
 * closing brace are not visible here — lines appear to be missing from
 * this extraction; confirm against the full file. */
29 .lnd_startup = kqswnal_startup,
30 .lnd_shutdown = kqswnal_shutdown,
31 .lnd_ctl = kqswnal_ctl,
32 .lnd_send = kqswnal_send,
33 .lnd_recv = kqswnal_recv,
/* Global per-module state. Exactly one instance is supported (enforced
 * in kqswnal_startup); zeroed on shutdown so KQN_INIT_NOTHING holds. */
36 kqswnal_data_t kqswnal_data;
/* Report details of the data->ioc_count'th active tx descriptor back
 * through *data (backs the IOC_LIBCFS_GET_TXDESC ioctl in kqswnal_ctl).
 * NOTE(review): the opening brace, local declarations (flags, ktx, hdr,
 * rc), the index-match/break logic inside the loop and the return are
 * missing from this chunk of the file. */
39 kqswnal_get_tx_desc (struct libcfs_ioctl_data *data)
42 struct list_head *tmp;
45 int index = data->ioc_count;
/* kqn_idletxd_lock also guards the active-txd list traversal here */
48 spin_lock_irqsave (&kqswnal_data.kqn_idletxd_lock, flags);
50 list_for_each (tmp, &kqswnal_data.kqn_activetxds) {
54 ktx = list_entry (tmp, kqswnal_tx_t, ktx_list);
/* lnet header sits at the start of the tx buffer in wire
 * (little-endian) byte order, hence the le*_to_cpu conversions */
55 hdr = (lnet_hdr_t *)ktx->ktx_buffer;
57 data->ioc_count = le32_to_cpu(hdr->payload_length);
58 data->ioc_nid = le64_to_cpu(hdr->dest_nid);
59 data->ioc_u64[0] = ktx->ktx_nid;
60 data->ioc_u32[0] = le32_to_cpu(hdr->type);
61 data->ioc_u32[1] = ktx->ktx_launcher;
/* bit 0: descriptor is queued for scheduling; bits 2..: tx state */
62 data->ioc_flags = (list_empty (&ktx->ktx_schedlist) ? 0 : 1) |
63 (ktx->ktx_state << 2);
68 spin_unlock_irqrestore (&kqswnal_data.kqn_idletxd_lock, flags);
/* LND-specific ioctl dispatcher for this NI.
 * NOTE(review): the opening brace, the "switch (cmd)" line, the
 * default case and the function's return paths are missing from this
 * chunk of the file. */
73 kqswnal_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg)
75 struct libcfs_ioctl_data *data = arg;
77 LASSERT (ni == kqswnal_data.kqn_ni);
80 case IOC_LIBCFS_GET_TXDESC:
81 return (kqswnal_get_tx_desc (data));
83 case IOC_LIBCFS_REGISTER_MYNID:
/* Silently accept a re-registration of our own NID; anything else
 * is obsolete usage — the NID is derived from the Elan node id in
 * kqswnal_startup, not set via this ioctl */
84 if (data->ioc_nid == ni->ni_nid)
87 LASSERT (LNET_NIDNET(data->ioc_nid) == LNET_NIDNET(ni->ni_nid));
89 CERROR("obsolete IOC_LIBCFS_REGISTER_MYNID for %s(%s)\n",
90 libcfs_nid2str(data->ioc_nid),
91 libcfs_nid2str(ni->ni_nid));
/* Tear down the NI: stop senders, free Elan comms objects, kill the
 * scheduler threads, unmap and free all tx/rx descriptors, and finally
 * re-zero kqswnal_data so a subsequent startup sees KQN_INIT_NOTHING.
 * Also used as the error-unwind path by kqswnal_startup, hence the
 * switch on how far initialisation (kqn_init) got.
 * NOTE(review): local declarations, the switch case labels and several
 * closing braces are missing from this chunk of the file. */
100 kqswnal_shutdown(lnet_ni_t *ni)
106 CDEBUG (D_NET, "shutdown\n");
107 LASSERT (ni->ni_data == &kqswnal_data);
108 LASSERT (ni == kqswnal_data.kqn_ni);
110 switch (kqswnal_data.kqn_init)
120 /**********************************************************************/
121 /* Signal the start of shutdown... */
/* kqn_shuttingdown == 1: no new tx descriptors may be allocated */
122 spin_lock_irqsave(&kqswnal_data.kqn_idletxd_lock, flags);
123 kqswnal_data.kqn_shuttingdown = 1;
124 spin_unlock_irqrestore(&kqswnal_data.kqn_idletxd_lock, flags);
126 /**********************************************************************/
127 /* wait for sends that have allocated a tx desc to launch or give up */
128 while (atomic_read (&kqswnal_data.kqn_pending_txs) != 0) {
129 CDEBUG(D_NET, "waiting for %d pending sends\n",
130 atomic_read (&kqswnal_data.kqn_pending_txs));
131 cfs_pause(cfs_time_seconds(1));
134 /**********************************************************************/
135 /* close elan comms */
136 /* Shut down receivers first; rx callbacks might try sending... */
137 if (kqswnal_data.kqn_eprx_small != NULL)
138 ep_free_rcvr (kqswnal_data.kqn_eprx_small);
140 if (kqswnal_data.kqn_eprx_large != NULL)
141 ep_free_rcvr (kqswnal_data.kqn_eprx_large);
143 /* NB ep_free_rcvr() returns only after we've freed off all receive
144 * buffers (see shutdown handling in kqswnal_requeue_rx()). This
145 * means we must have completed any messages we passed to
148 if (kqswnal_data.kqn_eptx != NULL)
149 ep_free_xmtr (kqswnal_data.kqn_eptx);
151 /* NB ep_free_xmtr() returns only after all outstanding transmits
152 * have called their callback... */
153 LASSERT(list_empty(&kqswnal_data.kqn_activetxds));
155 /**********************************************************************/
156 /* flag threads to terminate, wake them and wait for them to die */
/* kqn_shuttingdown == 2: scheduler threads must exit */
157 kqswnal_data.kqn_shuttingdown = 2;
158 wake_up_all (&kqswnal_data.kqn_sched_waitq);
160 while (atomic_read (&kqswnal_data.kqn_nthreads) != 0) {
161 CDEBUG(D_NET, "waiting for %d threads to terminate\n",
162 atomic_read (&kqswnal_data.kqn_nthreads));
163 cfs_pause(cfs_time_seconds(1));
166 /**********************************************************************/
167 /* No more threads. No more portals, router or comms callbacks!
168 * I control the horizontals and the verticals...
171 LASSERT (list_empty (&kqswnal_data.kqn_readyrxds));
172 LASSERT (list_empty (&kqswnal_data.kqn_donetxds));
173 LASSERT (list_empty (&kqswnal_data.kqn_delayedtxds));
175 /**********************************************************************/
176 /* Unmap message buffers and free all descriptors and buffers
179 /* FTTB, we need to unmap any remaining mapped memory. When
180 * ep_dvma_release() get fixed (and releases any mappings in the
181 * region), we can delete all the code from here --------> */
/* Walk the tx alloclist (singly linked via ktx_alloclist) */
183 for (ktx = kqswnal_data.kqn_txds; ktx != NULL; ktx = ktx->ktx_alloclist) {
184 /* If ktx has a buffer, it got mapped; unmap now. NB only
185 * the pre-mapped stuff is still mapped since all tx descs
188 if (ktx->ktx_buffer != NULL)
189 ep_dvma_unload(kqswnal_data.kqn_ep,
190 kqswnal_data.kqn_ep_tx_nmh,
194 for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
195 /* If krx_kiov[0].kiov_page got allocated, it got mapped.
196 * NB subsequent pages get merged */
198 if (krx->krx_kiov[0].kiov_page != NULL)
199 ep_dvma_unload(kqswnal_data.kqn_ep,
200 kqswnal_data.kqn_ep_rx_nmh,
201 &krx->krx_elanbuffer);
203 /* <----------- to here */
205 if (kqswnal_data.kqn_ep_rx_nmh != NULL)
206 ep_dvma_release(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_rx_nmh);
208 if (kqswnal_data.kqn_ep_tx_nmh != NULL)
209 ep_dvma_release(kqswnal_data.kqn_ep, kqswnal_data.kqn_ep_tx_nmh);
/* Pop-and-free every tx descriptor and its pre-allocated buffer */
211 while (kqswnal_data.kqn_txds != NULL) {
212 ktx = kqswnal_data.kqn_txds;
214 if (ktx->ktx_buffer != NULL)
215 LIBCFS_FREE(ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
217 kqswnal_data.kqn_txds = ktx->ktx_alloclist;
218 LIBCFS_FREE(ktx, sizeof(*ktx));
/* Pop-and-free every rx descriptor and its pages */
221 while (kqswnal_data.kqn_rxds != NULL) {
224 krx = kqswnal_data.kqn_rxds;
225 for (i = 0; i < krx->krx_npages; i++)
226 if (krx->krx_kiov[i].kiov_page != NULL)
227 __free_page (krx->krx_kiov[i].kiov_page);
229 kqswnal_data.kqn_rxds = krx->krx_alloclist;
230 LIBCFS_FREE(krx, sizeof (*krx));
233 /* resets flags, pointers to NULL etc */
234 memset(&kqswnal_data, 0, sizeof (kqswnal_data));
236 CDEBUG (D_MALLOC, "done kmem %d\n", atomic_read(&libcfs_kmemory));
/* Bring up the NI: validate configuration, initialise kqswnal_data,
 * attach to the Elan kernel comms (EKC), reserve DVMA space, allocate
 * and pre-map tx/rx descriptors, post the receives and spawn one
 * scheduler thread per online CPU. On any failure it unwinds by
 * calling kqswnal_shutdown(ni), which keys off kqn_init.
 * NOTE(review): local declarations, error-return statements and many
 * closing braces are missing from this chunk of the file. */
242 kqswnal_startup (lnet_ni_t *ni)
244 EP_RAILMASK all_rails = EP_RAILMASK_ALL;
251 LASSERT (ni->ni_lnd == &the_kqswlnd);
254 if (the_lnet.ln_ptlcompat != 0) {
255 CERROR("Checksumming version not portals compatible\n");
259 /* Only 1 instance supported */
260 if (kqswnal_data.kqn_init != KQN_INIT_NOTHING) {
261 CERROR ("Only 1 instance supported\n");
265 if (ni->ni_interfaces[0] != NULL) {
266 CERROR("Explicit interface config not supported\n");
/* Must have strictly more tx message descriptors than tx credits */
270 if (*kqswnal_tunables.kqn_credits >=
271 *kqswnal_tunables.kqn_ntxmsgs) {
272 LCONSOLE_ERROR_MSG(0x12e, "Configuration error: please set "
273 "ntxmsgs(%d) > credits(%d)\n",
274 *kqswnal_tunables.kqn_ntxmsgs,
275 *kqswnal_tunables.kqn_credits);
278 CDEBUG (D_MALLOC, "start kmem %d\n", atomic_read(&libcfs_kmemory));
280 /* ensure all pointers NULL etc */
281 memset (&kqswnal_data, 0, sizeof (kqswnal_data));
283 kqswnal_data.kqn_ni = ni;
284 ni->ni_data = &kqswnal_data;
285 ni->ni_peertxcredits = *kqswnal_tunables.kqn_peercredits;
286 ni->ni_maxtxcredits = *kqswnal_tunables.kqn_credits;
288 INIT_LIST_HEAD (&kqswnal_data.kqn_idletxds);
289 INIT_LIST_HEAD (&kqswnal_data.kqn_activetxds);
290 spin_lock_init (&kqswnal_data.kqn_idletxd_lock);
292 INIT_LIST_HEAD (&kqswnal_data.kqn_delayedtxds);
293 INIT_LIST_HEAD (&kqswnal_data.kqn_donetxds);
294 INIT_LIST_HEAD (&kqswnal_data.kqn_readyrxds);
296 spin_lock_init (&kqswnal_data.kqn_sched_lock);
297 init_waitqueue_head (&kqswnal_data.kqn_sched_waitq);
299 /* pointers/lists/locks initialised */
300 kqswnal_data.kqn_init = KQN_INIT_DATA;
/* Attach to the Elan kernel comms subsystem */
303 kqswnal_data.kqn_ep = ep_system();
304 if (kqswnal_data.kqn_ep == NULL) {
305 CERROR("Can't initialise EKC\n");
306 kqswnal_shutdown(ni);
310 if (ep_waitfor_nodeid(kqswnal_data.kqn_ep) == ELAN_INVALID_NODE) {
311 CERROR("Can't get elan ID\n");
312 kqswnal_shutdown(ni);
316 kqswnal_data.kqn_nnodes = ep_numnodes (kqswnal_data.kqn_ep);
317 kqswnal_data.kqn_elanid = ep_nodeid (kqswnal_data.kqn_ep);
/* Our NID's address part is the Elan node id */
319 ni->ni_nid = LNET_MKNID(LNET_NIDNET(ni->ni_nid), kqswnal_data.kqn_elanid);
321 /**********************************************************************/
322 /* Get the transmitter */
324 kqswnal_data.kqn_eptx = ep_alloc_xmtr (kqswnal_data.kqn_ep);
325 if (kqswnal_data.kqn_eptx == NULL)
327 CERROR ("Can't allocate transmitter\n");
328 kqswnal_shutdown (ni);
332 /**********************************************************************/
333 /* Get the receivers */
335 kqswnal_data.kqn_eprx_small =
336 ep_alloc_rcvr (kqswnal_data.kqn_ep,
337 EP_MSG_SVC_PORTALS_SMALL,
338 *kqswnal_tunables.kqn_ep_envelopes_small);
339 if (kqswnal_data.kqn_eprx_small == NULL)
341 CERROR ("Can't install small msg receiver\n");
342 kqswnal_shutdown (ni);
346 kqswnal_data.kqn_eprx_large =
347 ep_alloc_rcvr (kqswnal_data.kqn_ep,
348 EP_MSG_SVC_PORTALS_LARGE,
349 *kqswnal_tunables.kqn_ep_envelopes_large);
350 if (kqswnal_data.kqn_eprx_large == NULL)
352 CERROR ("Can't install large msg receiver\n");
353 kqswnal_shutdown (ni);
357 /**********************************************************************/
358 /* Reserve Elan address space for transmit descriptors NB we may
359 * either send the contents of associated buffers immediately, or
360 * map them for the peer to suck/blow... */
361 kqswnal_data.kqn_ep_tx_nmh =
362 ep_dvma_reserve(kqswnal_data.kqn_ep,
363 KQSW_NTXMSGPAGES*(*kqswnal_tunables.kqn_ntxmsgs),
365 if (kqswnal_data.kqn_ep_tx_nmh == NULL) {
366 CERROR("Can't reserve tx dma space\n");
367 kqswnal_shutdown(ni);
371 /**********************************************************************/
372 /* Reserve Elan address space for receive buffers */
373 kqswnal_data.kqn_ep_rx_nmh =
374 ep_dvma_reserve(kqswnal_data.kqn_ep,
375 KQSW_NRXMSGPAGES_SMALL *
376 (*kqswnal_tunables.kqn_nrxmsgs_small) +
377 KQSW_NRXMSGPAGES_LARGE *
378 (*kqswnal_tunables.kqn_nrxmsgs_large),
/* NOTE(review): BUG — this re-checks kqn_ep_tx_nmh, but the call just
 * above assigned kqn_ep_rx_nmh. A failed rx DVMA reservation goes
 * undetected here (tx_nmh is already known non-NULL), and a NULL
 * kqn_ep_rx_nmh would later be passed to ep_dvma_load(). Copy/paste
 * error from the tx block; should test kqn_ep_rx_nmh. */
380 if (kqswnal_data.kqn_ep_tx_nmh == NULL) {
381 CERROR("Can't reserve rx dma space\n");
382 kqswnal_shutdown(ni);
386 /**********************************************************************/
387 /* Allocate/Initialise transmit descriptors */
/* Descriptors are chained through ktx_alloclist for bulk free */
389 kqswnal_data.kqn_txds = NULL;
390 for (i = 0; i < (*kqswnal_tunables.kqn_ntxmsgs); i++)
393 int basepage = i * KQSW_NTXMSGPAGES;
395 LIBCFS_ALLOC (ktx, sizeof(*ktx));
397 kqswnal_shutdown (ni);
401 memset(ktx, 0, sizeof(*ktx)); /* NULL pointers; zero flags */
402 ktx->ktx_alloclist = kqswnal_data.kqn_txds;
403 kqswnal_data.kqn_txds = ktx;
405 LIBCFS_ALLOC (ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE);
406 if (ktx->ktx_buffer == NULL)
408 kqswnal_shutdown (ni);
412 /* Map pre-allocated buffer NOW, to save latency on transmit */
413 premapped_pages = kqswnal_pages_spanned(ktx->ktx_buffer,
414 KQSW_TX_BUFFER_SIZE);
415 ep_dvma_load(kqswnal_data.kqn_ep, NULL,
416 ktx->ktx_buffer, KQSW_TX_BUFFER_SIZE,
417 kqswnal_data.kqn_ep_tx_nmh, basepage,
418 &all_rails, &ktx->ktx_ebuffer);
420 ktx->ktx_basepage = basepage + premapped_pages; /* message mapping starts here */
421 ktx->ktx_npages = KQSW_NTXMSGPAGES - premapped_pages; /* for this many pages */
423 INIT_LIST_HEAD (&ktx->ktx_schedlist);
425 ktx->ktx_state = KTX_IDLE;
426 ktx->ktx_rail = -1; /* unset rail */
428 list_add_tail (&ktx->ktx_list, &kqswnal_data.kqn_idletxds);
431 /**********************************************************************/
432 /* Allocate/Initialise receive descriptors */
433 kqswnal_data.kqn_rxds = NULL;
/* First kqn_nrxmsgs_small descriptors are small-message, rest large */
435 for (i = 0; i < *kqswnal_tunables.kqn_nrxmsgs_small + *kqswnal_tunables.kqn_nrxmsgs_large; i++)
440 LIBCFS_ALLOC(krx, sizeof(*krx));
442 kqswnal_shutdown(ni);
446 memset(krx, 0, sizeof(*krx)); /* clear flags, null pointers etc */
447 krx->krx_alloclist = kqswnal_data.kqn_rxds;
448 kqswnal_data.kqn_rxds = krx;
450 if (i < *kqswnal_tunables.kqn_nrxmsgs_small)
452 krx->krx_npages = KQSW_NRXMSGPAGES_SMALL;
453 krx->krx_eprx = kqswnal_data.kqn_eprx_small;
457 krx->krx_npages = KQSW_NRXMSGPAGES_LARGE;
458 krx->krx_eprx = kqswnal_data.kqn_eprx_large;
461 LASSERT (krx->krx_npages > 0);
462 for (j = 0; j < krx->krx_npages; j++)
464 struct page *page = alloc_page(GFP_KERNEL);
467 kqswnal_shutdown (ni);
471 krx->krx_kiov[j] = (lnet_kiov_t) {.kiov_page = page,
473 .kiov_len = PAGE_SIZE};
474 LASSERT(page_address(page) != NULL);
476 ep_dvma_load(kqswnal_data.kqn_ep, NULL,
478 PAGE_SIZE, kqswnal_data.kqn_ep_rx_nmh,
479 elan_page_idx, &all_rails, &elanbuffer);
/* First page establishes the elan buffer; later pages are merged on */
482 krx->krx_elanbuffer = elanbuffer;
484 rc = ep_nmd_merge(&krx->krx_elanbuffer,
485 &krx->krx_elanbuffer,
487 /* NB contiguous mapping */
/* Every reserved rx page must have been consumed exactly once */
494 LASSERT (elan_page_idx ==
495 (*kqswnal_tunables.kqn_nrxmsgs_small * KQSW_NRXMSGPAGES_SMALL) +
496 (*kqswnal_tunables.kqn_nrxmsgs_large * KQSW_NRXMSGPAGES_LARGE));
498 /**********************************************************************/
499 /* Queue receives, now that it's OK to run their completion callbacks */
501 for (krx = kqswnal_data.kqn_rxds; krx != NULL; krx = krx->krx_alloclist) {
502 /* NB this enqueue can allocate/sleep (attr == 0) */
503 krx->krx_state = KRX_POSTED;
504 rc = ep_queue_receive(krx->krx_eprx, kqswnal_rxhandler, krx,
505 &krx->krx_elanbuffer, 0);
506 if (rc != EP_SUCCESS) {
507 CERROR ("failed ep_queue_receive %d\n", rc);
508 kqswnal_shutdown (ni);
513 /**********************************************************************/
514 /* Spawn scheduling threads */
515 for (i = 0; i < num_online_cpus(); i++) {
516 rc = kqswnal_thread_start (kqswnal_scheduler, NULL);
519 CERROR ("failed to spawn scheduling thread: %d\n", rc);
520 kqswnal_shutdown (ni);
/* Fully up: shutdown must now unwind everything */
525 kqswnal_data.kqn_init = KQN_INIT_ALL;
/* Module unload hook: unregister the LND from LNet (which shuts down
 * any active NI) and release tunables/sysctl state. */
530 kqswnal_finalise (void)
532 lnet_unregister_lnd(&the_kqswlnd);
533 kqswnal_tunables_fini();
/* Module load hook: initialise tunables then register the LND.
 * NOTE(review): the error check on rc and the function's return are
 * missing from this chunk of the file — presumably a non-zero rc is
 * returned before registration; verify against the full source. */
537 kqswnal_initialise (void)
539 int rc = kqswnal_tunables_init();
544 lnet_register_lnd(&the_kqswlnd);
/* Kernel module metadata and entry points */
548 MODULE_AUTHOR("Cluster File Systems, Inc. <info@clusterfs.com>");
549 MODULE_DESCRIPTION("Kernel Quadrics/Elan LND v1.01");
550 MODULE_LICENSE("GPL");
552 module_init (kqswnal_initialise);
553 module_exit (kqswnal_finalise);