1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lnet/klnds/qswlnd/qswlnd.h
38 * Basic library routines.
44 # define EXPORT_SYMTAB
47 #include <qsnet/kernel.h>
48 #undef printf /* nasty QSW #define */
49 #ifndef AUTOCONF_INCLUDED
50 #include <linux/config.h>
52 #include <linux/module.h>
54 #include <elan/epcomms.h>
56 #include <linux/kernel.h>
58 #include <linux/string.h>
59 #include <linux/stat.h>
60 #include <linux/errno.h>
61 #if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
62 #include <linux/locks.h> /* wait_on_buffer */
64 #include <linux/buffer_head.h> /* wait_on_buffer */
66 #include <linux/unistd.h>
68 #include <linux/uio.h>
70 #include <asm/system.h>
71 #include <asm/uaccess.h>
74 #include <linux/file.h>
75 #include <linux/stat.h>
76 #include <linux/list.h>
77 #include <linux/sysctl.h>
79 #define DEBUG_SUBSYSTEM S_LND
81 #include <libcfs/libcfs.h>
82 #include <lnet/lnet.h>
83 #include <lnet/lib-lnet.h>
84 #include <lnet/lnet-sysctl.h>
/* Message-size breakpoint, scheduler yield threshold and checksum switch. */
87 #define KQSW_SMALLMSG (4<<10) /* small/large ep receiver breakpoint (4<<10 = 4096 bytes) */
88 #define KQSW_RESCHED 100 /* # busy loops that force the scheduler to yield */
90 #define KQSW_CKSUM 0 /* enable checksumming (protocol incompatible); 0 = disabled */
/* KQSW_TX_BUFFER_SIZE: byte offset within kqswnal_msg_t of the immediate
 * payload element indexed by *kqswnal_tunables.kqn_tx_maxcontig.  It reads
 * a tunable through a pointer, so it is not a compile-time constant. */
96 #define KQSW_TX_BUFFER_SIZE (offsetof(kqswnal_msg_t, \
97 kqm_u.immediate.kqim_payload[*kqswnal_tunables.kqn_tx_maxcontig]))
98 /* The pre-allocated tx buffer (hdr + small payload) */
100 #define KQSW_NTXMSGPAGES (btopr(KQSW_TX_BUFFER_SIZE) + 1 + btopr(LNET_MAX_PAYLOAD) + 1)
101 /* Reserve elan address space for the pre-allocated and pre-mapped transmit
 * buffer and a full payload too.  The extra (+ 1) pages allow for page
 * alignment. */
104 #define KQSW_NRXMSGPAGES_SMALL (btopr(KQSW_SMALLMSG))
105 /* receive hdr/payload always contiguous and page aligned */
106 #define KQSW_NRXMSGBYTES_SMALL (KQSW_NRXMSGPAGES_SMALL * PAGE_SIZE)
108 #define KQSW_NRXMSGPAGES_LARGE (btopr(sizeof(lnet_msg_t) + LNET_MAX_PAYLOAD))
109 /* receive hdr/payload always contiguous and page aligned.
 * NOTE(review): sized from sizeof(lnet_msg_t), not kqswnal_msg_t -- confirm
 * this is the intended header size. */
110 #define KQSW_NRXMSGBYTES_LARGE (KQSW_NRXMSGPAGES_LARGE * PAGE_SIZE)
111 /* biggest complete packet we can receive (or transmit) */
114 /* Remote memory descriptor: a fragment count followed by the fragments
 * themselves (EP_NMD entries) in a trailing zero-length array. */
117 __u32 kqrmd_nfrag; /* # frags */
118 EP_NMD kqrmd_frag[0]; /* actual frags; zero-length array, so sizeof() excludes them */
119 } kqswnal_remotemd_t;
/* Immediate message body: the LNET header with the payload declared as a
 * zero-length array directly after it. */
124 lnet_hdr_t kqim_hdr; /* LNET header */
125 char kqim_payload[0]; /* piggy-backed payload; indexed by KQSW_TX_BUFFER_SIZE to size the tx buffer */
126 } WIRE_ATTR kqswnal_immediate_msg_t;
/* RDMA message body: the LNET header plus a remote memory descriptor for
 * the peer's buffer. */
131 lnet_hdr_t kqrm_hdr; /* LNET header */
132 kqswnal_remotemd_t kqrm_rmd; /* peer's buffer (fragment list) */
133 } WIRE_ATTR kqswnal_rdma_msg_t;
/* qswlnd message: common fields followed by the per-type bodies.  The
 * immediate body is reached as kqm_u.immediate (see KQSW_TX_BUFFER_SIZE).
 * NOTE(review): the lines declaring the enclosing struct and the kqm_u
 * member are not visible in this view. */
137 __u32 kqm_magic; /* I'm a qswlnd message */
138 __u16 kqm_version; /* this is my version number (presumably QSWLND_PROTO_VERSION -- confirm) */
139 __u16 kqm_type; /* msg type (presumably one of QSWLND_MSG_* -- confirm) */
141 __u32 kqm_cksum; /* crc32 checksum */
142 __u32 kqm_nob; /* original msg length */
145 kqswnal_immediate_msg_t immediate; /* header + inline payload */
146 kqswnal_rdma_msg_t rdma; /* header + peer buffer descriptor */
148 } WIRE_ATTR kqswnal_msg_t;
150 #if KQSW_CKSUM /* enable checksums ? */
151 # include <linux/crc32.h>
/* Checksum helper: folds len bytes at p into the running value crc.
 * NOTE(review): only fragments of the body are visible in this view; one
 * fragment returns crc32_le(crc, p, len), the other updates crc per byte. */
152 static inline __u32 kqswnal_csum(__u32 crc, unsigned char const *p, size_t len)
155 return crc32_le(crc, p, len);
/* Per-byte update: the low 8 bits become (crc + byte) & 0xff, while the
 * remaining bits take (crc + 0x100) with the low byte masked off. */
158 crc = ((crc + 0x100) & ~0xff) | ((crc + *p++) & 0xff) ;
/* NOTE(review): two protocol versions are defined; presumably 0xbeef applies
 * when KQSW_CKSUM is enabled and 1 otherwise -- confirm against the
 * conditional that selects between them. */
162 # define QSWLND_PROTO_VERSION 0xbeef
164 # define QSWLND_PROTO_VERSION 1
/* Message type codes.
 * NOTE(review): presumably the values carried in kqm_type and matching the
 * immediate / rdma message bodies -- confirm. */
167 #define QSWLND_MSG_IMMEDIATE 0
168 #define QSWLND_MSG_RDMA 1
/* RPC reply type; embedded in each receive descriptor as krx_rpc_reply.
 * NOTE(review): only the ep_statusblk member is visible in this view; the
 * remaining members and the enclosing declaration are not shown. */
171 EP_STATUSBLK ep_statusblk;
183 } kqswnal_rpc_reply_t;
/* Receive descriptor: one receive buffer, the receiver it is posted to,
 * and the state needed to complete it and reply to the peer. */
185 typedef struct kqswnal_rx
187 cfs_list_t krx_list; /* enqueue -> thread */
188 struct kqswnal_rx *krx_alloclist;/* stack in kqn_rxds */
189 EP_RCVR *krx_eprx; /* port to post receives to */
190 EP_RXD *krx_rxd; /* receive descriptor (for repost) */
191 EP_NMD krx_elanbuffer;/* contiguous Elan buffer */
192 int krx_npages; /* # pages in receive buffer */
193 int krx_nob; /* Number Of Bytes received into buffer */
/* NOTE(review): a 1-bit bit-field declared plain `int` may be signed, in
 * which case it holds only 0 and -1 and a stored 1 reads back as -1.
 * Consider `unsigned int`, as the ktx_* bit-fields use. */
194 int krx_rpc_reply_needed:1; /* peer waiting for EKC RPC reply */
195 int krx_state; /* what this RX is doing (presumably one of KRX_* -- confirm) */
196 cfs_atomic_t krx_refcount; /* how to tell when rpc is done; dropped via kqswnal_rx_decref() */
198 __u32 krx_cksum; /* checksum */
200 kqswnal_rpc_reply_t krx_rpc_reply; /* rpc reply status block */
201 lnet_kiov_t krx_kiov[KQSW_NRXMSGPAGES_LARGE];/* buffer frags; array sized for the 'large' receive case */
/* Receive descriptor states (presumably the values of krx_state -- confirm). */
204 #define KRX_POSTED 1 /* receiving */
205 #define KRX_PARSE 2 /* ready to be parsed */
206 #define KRX_COMPLETING 3 /* waiting to be completed */
/* Transmit descriptor: a pre-allocated buffer for the header and small
 * payloads, the Elan mappings for the message fragments, and the
 * bookkeeping for launching and completing one transmit. */
209 typedef struct kqswnal_tx
211 cfs_list_t ktx_list; /* enqueue idle/active */
212 cfs_list_t ktx_schedlist; /* enqueue on scheduler */
213 struct kqswnal_tx *ktx_alloclist; /* stack in kqn_txds */
214 unsigned int ktx_state:7; /* What I'm doing (presumably one of KTX_* -- confirm) */
215 unsigned int ktx_firsttmpfrag:1; /* ktx_frags[0] is in my ebuffer ? 0 : 1 */
216 __u32 ktx_basepage; /* page offset in reserved elan tx vaddrs for mapping pages */
217 int ktx_npages; /* pages reserved for mapping messages */
218 int ktx_nmappedpages; /* # pages mapped for current message */
219 int ktx_port; /* destination ep port */
220 lnet_nid_t ktx_nid; /* destination node */
221 void *ktx_args[3]; /* completion passthru */
222 char *ktx_buffer; /* pre-allocated contiguous buffer for hdr + small payloads */
223 cfs_time_t ktx_launchtime; /* when (in jiffies) the
 * transmit was launched */
225 int ktx_status; /* completion status */
227 __u32 ktx_cksum; /* optimized GET payload checksum */
229 /* debug/info fields */
230 pid_t ktx_launcher; /* pid of launching process */
232 int ktx_nfrag; /* # message frags */
233 int ktx_rail; /* preferred rail */
234 EP_NMD ktx_ebuffer; /* elan mapping of ktx_buffer */
235 EP_NMD ktx_frags[EP_MAXFRAG];/* elan mapping of msg frags */
/* Transmit descriptor states (presumably the values of ktx_state, which is
 * a 7-bit field -- confirm). */
238 #define KTX_IDLE 0 /* on kqn_idletxds */
239 #define KTX_SENDING 1 /* normal send */
240 #define KTX_GETTING 2 /* sending optimised get */
241 #define KTX_PUTTING 3 /* sending optimised put */
242 #define KTX_RDMA_FETCH 4 /* handling optimised put */
243 #define KTX_RDMA_STORE 5 /* handling optimised get */
/* Module tunables.  Every setting is held as a pointer to int and is read
 * by dereferencing it (e.g. *kqswnal_tunables.kqn_tx_maxcontig in
 * KQSW_TX_BUFFER_SIZE). */
247 int *kqn_tx_maxcontig; /* maximum payload to defrag */
248 int *kqn_ntxmsgs; /* # normal tx msgs */
249 int *kqn_credits; /* # concurrent sends */
250 int *kqn_peercredits; /* # concurrent sends to 1 peer */
251 int *kqn_nrxmsgs_large; /* # 'large' rx msgs */
252 int *kqn_ep_envelopes_large; /* # 'large' rx ep envelopes */
253 int *kqn_nrxmsgs_small; /* # 'small' rx msgs */
254 int *kqn_ep_envelopes_small; /* # 'small' rx ep envelopes */
255 int *kqn_optimized_puts; /* optimized PUTs? */
256 int *kqn_optimized_gets; /* optimized GETs? */
258 int *kqn_inject_csum_error; /* # csum errors to inject */
/* The sysctl handle exists only when sysctl support is configured and
 * CFS_SYSFS_MODULE_PARM is zero. */
261 #if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
262 cfs_sysctl_table_header_t *kqn_sysctl; /* sysctl interface */
264 } kqswnal_tunables_t;
/* Global state for the single qswlnd instance: initialisation progress,
 * descriptor pools, scheduler queues and the Elan handles. */
268 char kqn_init; /* what's been initialised (presumably one of KQN_INIT_* -- confirm) */
269 char kqn_shuttingdown;/* I'm trying to shut down */
270 cfs_atomic_t kqn_nthreads; /* # threads running */
271 lnet_ni_t *kqn_ni; /* _the_ instance of me */
273 kqswnal_rx_t *kqn_rxds; /* stack of all the receive descriptors */
274 kqswnal_tx_t *kqn_txds; /* stack of all the transmit descriptors */
276 cfs_list_t kqn_idletxds; /* transmit descriptors free to use */
277 cfs_list_t kqn_activetxds; /* transmit descriptors being used */
278 cfs_spinlock_t kqn_idletxd_lock; /* serialise idle txd access */
279 cfs_atomic_t kqn_pending_txs;/* # transmits being prepped */
281 cfs_spinlock_t kqn_sched_lock; /* serialise packet schedulers */
282 cfs_waitq_t kqn_sched_waitq;/* scheduler blocks here */
284 cfs_list_t kqn_readyrxds; /* rxds full of data */
285 cfs_list_t kqn_donetxds; /* completed transmits */
286 cfs_list_t kqn_delayedtxds;/* delayed transmits */
288 EP_SYS *kqn_ep; /* elan system */
289 EP_NMH *kqn_ep_tx_nmh; /* elan reserved tx vaddrs */
290 EP_NMH *kqn_ep_rx_nmh; /* elan reserved rx vaddrs */
291 EP_XMTR *kqn_eptx; /* elan transmitter */
292 EP_RCVR *kqn_eprx_small; /* elan receiver (small messages) */
293 EP_RCVR *kqn_eprx_large; /* elan receiver (large messages) */
295 int kqn_nnodes; /* this cluster's size; upper bound used by kqswnal_nid2elanid() */
296 int kqn_elanid; /* this node's elan ID */
298 EP_STATUSBLK kqn_rpc_success;/* preset RPC reply status blocks */
299 EP_STATUSBLK kqn_rpc_failed;
300 EP_STATUSBLK kqn_rpc_version;/* reply to future version query */
301 EP_STATUSBLK kqn_rpc_magic; /* NOTE(review): original comment duplicated kqn_rpc_version's
 * ("reply to future version query"); presumably this is the
 * reply for a magic mismatch -- confirm */
/* Initialisation stages (presumably the values of kqn_init -- confirm). */
305 #define KQN_INIT_NOTHING 0 /* MUST BE ZERO so zeroed state is initialised OK */
306 #define KQN_INIT_DATA 1
307 #define KQN_INIT_ALL 2
/* Globals and routines defined outside this header. */
309 extern kqswnal_tunables_t kqswnal_tunables; /* module tunables */
310 extern kqswnal_data_t kqswnal_data; /* global instance state */
312 extern int kqswnal_thread_start (int (*fn)(void *arg), void *arg);
313 extern void kqswnal_rxhandler(EP_RXD *rxd);
314 extern int kqswnal_scheduler (void *);
315 extern void kqswnal_rx_done (kqswnal_rx_t *krx); /* called by kqswnal_rx_decref() on the last reference */
/* Build an LNET NID for an Elan node id: combines LNET_NIDNET() of this
 * instance's own NID (kqswnal_data.kqn_ni->ni_nid) with elanid via
 * LNET_MKNID().  Dereferences kqn_ni without checking it. */
317 static inline lnet_nid_t
318 kqswnal_elanid2nid (int elanid)
320 return LNET_MKNID(LNET_NIDNET(kqswnal_data.kqn_ni->ni_nid), elanid);
/* Extract the Elan node id from a NID using LNET_NIDADDR(); yields -1 when
 * the id is not below kqswnal_data.kqn_nnodes.
 * NOTE(review): the return-type line is not visible in this view.  The
 * comparison mixes __u32 with the int kqn_nnodes, and the conditional mixes
 * -1 with a __u32, so both are evaluated as unsigned -- confirm the
 * declared return type is a signed int. */
324 kqswnal_nid2elanid (lnet_nid_t nid)
326 __u32 elanid = LNET_NIDADDR(nid);
328 /* not in this cluster? */
329 return (elanid >= kqswnal_data.kqn_nnodes) ? -1 : elanid;
/* NID for the node that ep_rxd_node() reports for this receive descriptor's
 * krx_rxd, converted with kqswnal_elanid2nid(). */
332 static inline lnet_nid_t
333 kqswnal_rx_nid(kqswnal_rx_t *krx)
335 return (kqswnal_elanid2nid(ep_rxd_node(krx->krx_rxd)));
/* Number of pages touched by the nob bytes starting at base: the page index
 * of the last byte minus the page index of the first byte, plus one.
 * NOTE(review): the return-type line is not visible in this view.  With
 * nob == 0 the "last byte" is base - 1, so a page-aligned base trips the
 * LASSERT and any other base yields 1; presumably callers pass nob > 0. */
339 kqswnal_pages_spanned (void *base, int nob)
341 unsigned long first_page = ((unsigned long)base) >> PAGE_SHIFT;
342 unsigned long last_page = (((unsigned long)base) + (nob - 1)) >> PAGE_SHIFT;
344 LASSERT (last_page >= first_page); /* can't wrap address space */
345 return (last_page - first_page + 1);
/* Drop one reference on a receive descriptor.  Asserts the count is still
 * positive, then calls kqswnal_rx_done(krx) when cfs_atomic_dec_and_test()
 * reports true for the decrement. */
348 static inline void kqswnal_rx_decref (kqswnal_rx_t *krx)
350 LASSERT (cfs_atomic_read (&krx->krx_refcount) > 0);
351 if (cfs_atomic_dec_and_test (&krx->krx_refcount))
352 kqswnal_rx_done(krx);
/* Prototypes for routines defined outside this header.
 * NOTE(review): presumably the per-interface handlers this LND registers
 * with LNET, plus tunables setup/teardown -- confirm against the
 * definitions. */
355 int kqswnal_startup (lnet_ni_t *ni);
356 void kqswnal_shutdown (lnet_ni_t *ni);
357 int kqswnal_ctl (lnet_ni_t *ni, unsigned int cmd, void *arg);
358 int kqswnal_send (lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
/* Receive takes both an iovec array and a kiov array alongside niov,
 * offset, mlen and rlen. */
359 int kqswnal_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
360 int delayed, unsigned int niov,
361 struct iovec *iov, lnet_kiov_t *kiov,
362 unsigned int offset, unsigned int mlen, unsigned int rlen);
364 int kqswnal_tunables_init(void);
365 void kqswnal_tunables_fini(void);
367 #endif /* _QSWNAL_H */