1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see
20 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
30 * Use is subject to license terms.
32 * Copyright (C) 2006 Myricom, Inc.
35 * This file is part of Lustre, http://www.lustre.org/
36 * Lustre is a trademark of Sun Microsystems, Inc.
38 * lnet/klnds/mxlnd/mxlnd.h
40 * Author: Eric Barton <eric@bartonsoftware.com>
41 * Author: Scott Atchley <atchley at myri.com>
47 #ifndef AUTOCONF_INCLUDED
48 #include <linux/config.h>
50 #include <linux/module.h> /* module */
51 #include <linux/kernel.h> /* module */
53 #include <linux/string.h>
54 #include <linux/stat.h>
55 #include <linux/errno.h>
56 #include <linux/smp_lock.h>
57 #include <linux/unistd.h>
58 #include <linux/uio.h>
61 #include <asm/system.h>
62 #include <asm/uaccess.h>
65 #include <linux/init.h> /* module */
67 #include <linux/file.h>
68 #include <linux/stat.h>
69 #include <linux/list.h>
70 #include <linux/kmod.h>
71 #include <linux/sysctl.h>
72 #include <linux/random.h>
73 #include <linux/utsname.h>
74 #include <linux/jiffies.h> /* msecs_to_jiffies */
79 #include <asm/byteorder.h> /* __LITTLE_ENDIAN */
80 #include <net/arp.h> /* arp table */
81 #include <linux/netdevice.h> /* get_device_by_name */
82 #include <linux/inetdevice.h> /* neigh_lookup, etc. */
83 #include <linux/net.h> /* sock_create_kern, kernel_connect, sock_release */
85 #define DEBUG_SUBSYSTEM S_LND
87 #include "libcfs/libcfs.h"
88 #include "lnet/lnet.h"
89 #include "lnet/lib-lnet.h"
90 #include <lnet/lnet-sysctl.h>
93 #include "mx_extensions.h"
94 #include "myriexpress.h"
96 #if LNET_MAX_IOV > MX_MAX_SEGMENTS
97 #error LNET_MAX_IOV is greater then MX_MAX_SEGMENTS
/* Protocol identification constants.
 * 0x4d583130 is the byte sequence 'M' 'X' '1' '0' in ASCII.
 * NOTE(review): presumably the values carried in the mxm_magic and
 * mxm_version fields of kmx_msg_t - confirm against the code that fills in
 * and checks the message header. */
100 #define MXLND_MSG_MAGIC 0x4d583130 /* unique magic 'MX10' */
101 #define MXLND_MSG_VERSION 0x02 /* protocol version number */
103 /* Using MX's 64 match bits
104 * We are using the match bits to specify message type and the cookie. The
105 * highest four bits (60-63) are reserved for message type. Below we specify
106 * the types. MXLND_MSG_ICON_REQ and MXLND_MSG_ICON_ACK are used for
107 * mx_iconnect(). We reserve the remaining combinations for future use. The
108 * next 8 bits (52-59) are reserved for returning a status code for failed
109 * GET_DATA (payload) messages. The lowest 52 bits (0-51) are used for cookies.
110 * That should allow unique cookies for 4 KB messages at 10 Gbps line rate
111 * without rollover for about 8 years. That should be enough. */
/* Message type field: bits 60-63 of the 64-bit match value. */
113 #define MXLND_MSG_OFFSET 60 /* msg type offset */
114 #define MXLND_MSG_BITS (64 - MXLND_MSG_OFFSET) /* field width: 4 bits */
115 #define MXLND_MSG_MASK (((1ULL<<MXLND_MSG_BITS) - 1) << MXLND_MSG_OFFSET) /* selects bits 60-63 */
116 #define MXLND_MSG_TYPE(x) (((x) & MXLND_MSG_MASK) >> MXLND_MSG_OFFSET) /* type field of match value x, shifted down to bit 0 */
/* Error (status) field: bits 52-59 of the 64-bit match value. */
118 #define MXLND_ERROR_OFFSET 52 /* error value offset */
119 #define MXLND_ERROR_BITS (MXLND_MSG_OFFSET - MXLND_ERROR_OFFSET) /* field width: 8 bits */
120 #define MXLND_ERROR_MASK (((1ULL<<MXLND_ERROR_BITS) - 1) << MXLND_ERROR_OFFSET) /* selects bits 52-59 */
121 #define MXLND_ERROR_VAL(x) (((x) & MXLND_ERROR_MASK) >> MXLND_ERROR_OFFSET) /* error field of match value x, shifted down to bit 0 */
/* Message type codes. Every value is at most 0xf, so each fits in a 4-bit
 * field. The values 0x0, 0x7, 0x8 and 0xf are not assigned in this list. */
124 #define MXLND_MSG_ICON_REQ 0xb /* mx_iconnect() before CONN_REQ */
125 #define MXLND_MSG_CONN_REQ 0xc /* connection request */
126 #define MXLND_MSG_ICON_ACK 0x9 /* mx_iconnect() before CONN_ACK */
127 #define MXLND_MSG_CONN_ACK 0xa /* connection request response */
128 #define MXLND_MSG_BYE 0xd /* disconnect msg */
129 #define MXLND_MSG_EAGER 0xe /* eager message */
130 #define MXLND_MSG_NOOP 0x1 /* no msg, return credits */
131 #define MXLND_MSG_PUT_REQ 0x2 /* put request src->sink */
132 #define MXLND_MSG_PUT_ACK 0x3 /* put ack src<-sink */
133 #define MXLND_MSG_PUT_DATA 0x4 /* put payload src->sink */
134 #define MXLND_MSG_GET_REQ 0x5 /* get request sink->src */
135 #define MXLND_MSG_GET_DATA 0x6 /* get payload sink<-src */
137 /* when to roll-over the cookie value: the largest value that fits in the
 * bits below MXLND_ERROR_OFFSET, i.e. 2^52 - 1 */
138 #define MXLND_MAX_COOKIE ((1ULL << MXLND_ERROR_OFFSET) - 1)
/* MXLND_N_SCHED is #defined further down in this file; that is fine because
 * the macro body is only expanded where MXLND_NCOMPLETIONS is used. */
140 #define MXLND_NCOMPLETIONS (MXLND_N_SCHED + 2) /* max threads for completion array */
142 /* defaults for configurable parameters */
143 #define MXLND_N_SCHED 1 /* # schedulers (mx_wait_any() threads) */
144 #define MXLND_MX_BOARD 0 /* Use the first MX NIC if more than 1 avail */
145 #define MXLND_MX_EP_ID 3 /* MX endpoint ID */
146 #define MXLND_COMM_TIMEOUT (20 * HZ) /* timeout for send/recv (jiffies); 20 * HZ is 20 seconds */
147 #define MXLND_WAIT_TIMEOUT HZ /* timeout for wait (jiffies); HZ is 1 second */
148 #define MXLND_POLLING 1000 /* poll iterations before blocking */
149 #define MXLND_MAX_PEERS 1024 /* number of nodes talking to me */
150 #define MXLND_EAGER_NUM MXLND_MAX_PEERS /* number of pre-posted receives (same as MXLND_MAX_PEERS) */
151 #define MXLND_EAGER_SIZE PAGE_SIZE /* pre-posted eager message size */
152 #define MXLND_MSG_QUEUE_DEPTH 8 /* msg queue depth */
153 #define MXLND_CREDIT_HIGHWATER (MXLND_MSG_QUEUE_DEPTH - 2)
154 /* when to send a noop to return credits; evaluates to 6 with the queue
 * depth defined above */
155 #define MXLND_NTX 256 /* # of kmx_tx - total sends in flight;
156 1/2 are reserved for connect messages */
158 #define MXLND_HASH_BITS 6 /* the number of bits to hash over */
159 #define MXLND_HASH_SIZE (1<<MXLND_HASH_BITS)
160 /* number of peer lists for lookup (64 with MXLND_HASH_BITS == 6).
161 we hash over the last N bits of
162 the IP address converted to an int.
 NOTE(review): mxlnd_nid_to_hash() in this header hashes an lnet_nid_t,
 not an IP address - confirm which one this comment should describe. */
163 #define MXLND_HASH_MASK (MXLND_HASH_SIZE - 1)
164 /* ensure we use only the last N bits (0x3f with MXLND_HASH_BITS == 6) */
166 /* debugging features; both switches are set to 0 here */
167 #define MXLND_CKSUM 0 /* checksum kmx_msg_t */
168 #define MXLND_DEBUG 0 /* additional CDEBUG messages */
170 /* provide wrappers around LIBCFS_ALLOC/FREE to keep MXLND specific
171 * memory usage stats that include pages
 *
 * MXLND_ALLOC(x, size): adds size to kmxlnd_data.kmx_mem_used while holding
 * kmx_mem_lock, calls LIBCFS_ALLOC(x, size), and subtracts size again (under
 * the same lock) when x is NULL afterwards.
 *
 * MXLND_FREE(x, size): subtracts size from kmxlnd_data.kmx_mem_used while
 * holding kmx_mem_lock, then calls LIBCFS_FREE(x, size).
 *
 * Both macros expand x and size more than once and without parentheses, so
 * callers should pass simple expressions that have no side effects.
 *
 * NOTE(review): every visible line of both macros ends in a continuation
 * backslash, and neither an enclosing do/while wrapper nor the closing brace
 * of the if-block is visible in this listing - confirm against the original
 * header before relying on the text below. */
173 #define MXLND_ALLOC(x, size) \
175 spin_lock(&kmxlnd_data.kmx_mem_lock); \
176 kmxlnd_data.kmx_mem_used += size; \
177 spin_unlock(&kmxlnd_data.kmx_mem_lock); \
178 LIBCFS_ALLOC(x, size); \
179 if (unlikely(x == NULL)) { \
180 spin_lock(&kmxlnd_data.kmx_mem_lock); \
181 kmxlnd_data.kmx_mem_used -= size; \
182 spin_unlock(&kmxlnd_data.kmx_mem_lock); \
186 #define MXLND_FREE(x, size) \
188 spin_lock(&kmxlnd_data.kmx_mem_lock); \
189 kmxlnd_data.kmx_mem_used -= size; \
190 spin_unlock(&kmxlnd_data.kmx_mem_lock); \
191 LIBCFS_FREE(x, size); \
195 typedef struct kmx_tunables {
/* Configurable parameters. Every member is a pointer to the setting rather
 * than the setting itself.
 * NOTE(review): the closing line of this typedef is not visible in this
 * listing; the extern declaration of kmxlnd_tunables uses the type name
 * kmx_tunables_t. */
196 int *kmx_n_waitd; /* # completion threads */
197 int *kmx_max_peers; /* max # of potential peers */
198 int *kmx_cksum; /* checksum small msgs? */
199 int *kmx_ntx; /* total # of tx (1/2 for LNET, 1/2 for CONN_REQ) */
200 int *kmx_credits; /* concurrent sends to 1 peer */
201 int *kmx_board; /* MX board (NIC) number */
202 int *kmx_ep_id; /* MX endpoint number */
203 char **kmx_default_ipif; /* IPoMX interface name */
204 int *kmx_polling; /* if 0, block. if > 0, poll this many
205 iterations before blocking */
208 /* global interface state
 * NOTE(review): the braces and the closing typedef name are not visible in
 * this listing; the extern declaration of kmxlnd_data uses kmx_data_t. */
209 typedef struct kmx_data
211 int kmx_init; /* initialization state */
212 int kmx_shutdown; /* shutting down? */
213 atomic_t kmx_nthreads; /* number of threads */
214 struct completion *kmx_completions; /* array of completion structs */
215 lnet_ni_t *kmx_ni; /* the LND instance */
216 u64 kmx_incarnation; /* my incarnation value - unused */
217 long kmx_mem_used; /* memory used; adjusted by MXLND_ALLOC/MXLND_FREE */
218 struct kmx_peer *kmx_localhost; /* pointer to my kmx_peer info */
219 mx_endpoint_t kmx_endpt; /* the MX endpoint */
221 rwlock_t kmx_global_lock; /* global lock */
222 spinlock_t kmx_mem_lock; /* memory accounting lock; held around kmx_mem_used updates */
224 struct list_head kmx_conn_req; /* list of connection requests */
225 spinlock_t kmx_conn_lock; /* connection list lock */
226 struct semaphore kmx_conn_sem; /* semaphore for connection request list */
228 struct list_head kmx_peers[MXLND_HASH_SIZE];
229 /* list of all known peers, one list head per hash bucket */
230 //rwlock_t kmx_peers_lock; /* peer list rw lock */
231 atomic_t kmx_npeers; /* number of peers */
233 struct list_head kmx_txs; /* all tx descriptors */
234 struct list_head kmx_tx_idle; /* list of idle tx */
235 spinlock_t kmx_tx_idle_lock; /* lock for idle tx list */
236 s32 kmx_tx_used; /* txs in use */
237 u64 kmx_tx_next_cookie; /* unique id for tx */
238 struct list_head kmx_tx_queue; /* generic send queue */
239 spinlock_t kmx_tx_queue_lock; /* lock for generic sends */
240 struct semaphore kmx_tx_queue_sem; /* semaphore for tx queue */
242 struct list_head kmx_rxs; /* all rx descriptors */
243 spinlock_t kmx_rxs_lock; /* lock for rxs list */
244 struct list_head kmx_rx_idle; /* list of idle rx */
245 spinlock_t kmx_rx_idle_lock; /* lock for idle rx list */
/* Initialization stages, numbered 0 through 6 in increasing order.
 * NOTE(review): presumably the values stored in kmx_data's kmx_init field
 * ("initialization state") - confirm against startup/shutdown code. */
248 #define MXLND_INIT_NOTHING 0 /* in the beginning, there was nothing... */
249 #define MXLND_INIT_DATA 1 /* main data structures created */
250 #define MXLND_INIT_TXS 2 /* tx descriptors created */
251 #define MXLND_INIT_RXS 3 /* initial rx descriptors created */
252 #define MXLND_INIT_MX 4 /* initiate MX library, open endpoint, get NIC id */
253 #define MXLND_INIT_THREADS 5 /* waitd, timeoutd, tx_queued threads */
254 #define MXLND_INIT_ALL 6 /* startup completed */
256 /************************************************************************
257 * MXLND Wire message format.
258 * These are sent in sender's byte order (i.e. receiver flips).
 * Each wire structure in this section is declared with WIRE_ATTR.
 * NOTE(review): the terminator of this banner comment and the braces of the
 * struct that follows are not visible in this listing - confirm against the
 * original header.
261 typedef struct kmx_connreq_msg
263 u32 mxcrm_queue_depth; /* per peer max messages in flight */
264 u32 mxcrm_eager_size; /* size of preposted eager messages */
265 } WIRE_ATTR kmx_connreq_msg_t;
/* Eager message body: an LNET header followed directly by the payload.
 * mxem_payload is a zero-length array, so it adds nothing to the struct
 * size and only marks where the piggy-backed bytes begin. */
267 typedef struct kmx_eager_msg
269 lnet_hdr_t mxem_hdr; /* lnet header */
270 char mxem_payload[0]; /* piggy-backed payload */
271 } WIRE_ATTR kmx_eager_msg_t;
/* Put request body: an LNET header plus a 64-bit completion cookie. */
273 typedef struct kmx_putreq_msg
275 lnet_hdr_t mxprm_hdr; /* lnet header */
276 u64 mxprm_cookie; /* opaque completion cookie */
277 } WIRE_ATTR kmx_putreq_msg_t;
/* Put ack body: two 64-bit cookies, one reflected back and one new. */
279 typedef struct kmx_putack_msg
281 u64 mxpam_src_cookie; /* reflected completion cookie */
282 u64 mxpam_dst_cookie; /* opaque completion cookie */
283 } WIRE_ATTR kmx_putack_msg_t;
/* Get request body: an LNET header plus a 64-bit completion cookie (same
 * member layout as kmx_putreq_msg). */
285 typedef struct kmx_getreq_msg
287 lnet_hdr_t mxgrm_hdr; /* lnet header */
288 u64 mxgrm_cookie; /* opaque completion cookie */
289 } WIRE_ATTR kmx_getreq_msg_t;
/* Common message header followed by one per-type body.
 * NOTE(review): the struct braces and whatever aggregate encloses the five
 * body members at the end are not visible in this listing - confirm against
 * the original header. */
291 typedef struct kmx_msg
293 /* First two fields fixed for all time */
294 u32 mxm_magic; /* MXLND message */
295 u16 mxm_version; /* version number */
297 u8 mxm_type; /* message type */
298 u8 mxm_credits; /* returned credits */
299 u32 mxm_nob; /* # of bytes in whole message */
300 u32 mxm_cksum; /* checksum (0 == no checksum) */
301 u64 mxm_srcnid; /* sender's NID */
302 u64 mxm_srcstamp; /* sender's incarnation */
303 u64 mxm_dstnid; /* destination's NID */
304 u64 mxm_dststamp; /* destination's incarnation */
305 u64 mxm_seq; /* sequence number */
308 kmx_connreq_msg_t conn_req; /* connection request body */
309 kmx_eager_msg_t eager; /* eager message body */
310 kmx_putreq_msg_t put_req; /* put request body */
311 kmx_putack_msg_t put_ack; /* put ack body */
312 kmx_getreq_msg_t get_req; /* get request body */
314 } WIRE_ATTR kmx_msg_t;
316 /***********************************************************************/
323 /* The life cycle of a request */
/* Request states, numbered 0 through 5.
 * NOTE(review): the enum header is not visible in this listing; the context
 * descriptor declares its mxc_state field as enum kmx_req_state, so these
 * are presumably that enum's members - confirm. */
325 MXLND_CTX_INIT = 0, /* just created */
326 MXLND_CTX_IDLE = 1, /* available for use */
327 MXLND_CTX_PREP = 2, /* getting ready for send/recv */
328 MXLND_CTX_PENDING = 3, /* mx_isend() or mx_irecv() called */
329 MXLND_CTX_COMPLETED = 4, /* cleaning up after completion or timeout */
330 MXLND_CTX_CANCELED = 5, /* timed out but still in ctx list */
333 /* Context Structure - generic tx/rx descriptor
334 * It represents the context (or state) of each send or receive request.
335 * Other LNDs have separate TX and RX descriptors; this replaces both.
337 * We will keep these on the global kmx_rxs and kmx_txs lists for cleanup
338 * during shutdown(). We will move them between the rx/tx idle lists and the
339 * pending list which is monitored by mxlnd_timeoutd().
342 enum kmx_req_type mxc_type; /* TX or RX */
343 u64 mxc_incarnation; /* store the peer's incarnation here
344 to verify before changing flow
345 control credits after completion */
346 unsigned long mxc_deadline; /* request time out in absolute jiffies */
347 enum kmx_req_state mxc_state; /* what is the state of the request? */
348 struct list_head mxc_global_list; /* place on kmx_rxs or kmx_txs */
349 struct list_head mxc_list; /* place on rx/tx idle list, tx q, peer tx */
350 struct list_head mxc_rx_list; /* place on mxp_rx_posted list */
351 spinlock_t mxc_lock; /* lock */
353 lnet_nid_t mxc_nid; /* dst's NID if peer is not known */
354 struct kmx_peer *mxc_peer; /* owning peer */
355 struct kmx_conn *mxc_conn; /* owning conn */
356 struct kmx_msg *mxc_msg; /* msg hdr mapped to mxc_page */
357 struct page *mxc_page; /* buffer for eager msgs */
358 lnet_msg_t *mxc_lntmsg[2]; /* lnet msgs to finalize (up to two) */
360 u8 mxc_msg_type; /* what type of message is this? */
361 u64 mxc_cookie; /* completion cookie */
362 u64 mxc_match; /* MX match info */
363 mx_ksegment_t mxc_seg; /* local MX ksegment for non-DATA */
364 mx_ksegment_t *mxc_seg_list; /* MX ksegment array for DATA */
365 int mxc_nseg; /* number of segments */
366 unsigned long mxc_pin_type; /* MX_PIN_KERNEL or MX_PIN_PHYSICAL */
367 u32 mxc_nob; /* number of bytes sent/received */
368 mx_request_t mxc_mxreq; /* MX request */
369 mx_status_t mxc_status; /* MX status */
370 s64 mxc_get; /* # of times returned from idle list */
371 s64 mxc_put; /* # of times returned from idle list
 NOTE(review): same wording as mxc_get; presumably meant
 "returned to idle list" - confirm */
/* Connection status values (MXLND_CONN_*), as referenced by the mxk_status
 * field comment in the connection state. Negative values mark a connection
 * that is being torn down or has failed; 0 through 4 are the setup stages
 * up to MXLND_CONN_READY. */
374 #define MXLND_CONN_DISCONNECT -2 /* conn is being destroyed - do not add txs */
375 #define MXLND_CONN_FAIL -1 /* connect failed (bad handshake, unavail, etc.) */
376 #define MXLND_CONN_INIT 0 /* in the beginning, there was nothing... */
377 #define MXLND_CONN_REQ 1 /* a connection request message is needed */
378 #define MXLND_CONN_ACK 2 /* a connection ack is needed */
379 #define MXLND_CONN_WAIT 3 /* waiting for req or ack to complete */
380 #define MXLND_CONN_READY 4 /* ready to send */
382 /* connection state - queues for queued and pending msgs
 * NOTE(review): the struct header and closing brace are not visible in this
 * listing; other declarations refer to this type as struct kmx_conn. */
385 u64 mxk_incarnation; /* connection's incarnation value */
386 atomic_t mxk_refcount; /* reference counting */
388 struct kmx_peer *mxk_peer; /* owning peer */
389 mx_endpoint_addr_t mxk_epa; /* peer's endpoint address */
391 struct list_head mxk_list; /* for placing on mxp_conns */
392 spinlock_t mxk_lock; /* lock */
393 unsigned long mxk_timeout; /* expiration of oldest pending tx/rx */
394 unsigned long mxk_last_tx; /* when last tx completed with success */
395 unsigned long mxk_last_rx; /* when last rx completed */
397 int mxk_credits; /* # of my credits for sending to peer */
398 int mxk_outstanding; /* # of credits to return */
400 int mxk_status; /* can we send messages? MXLND_CONN_* */
401 struct list_head mxk_tx_credit_queue; /* send queue for peer */
402 struct list_head mxk_tx_free_queue; /* send queue for peer
 NOTE(review): same wording as
 mxk_tx_credit_queue; how the two
 queues differ is not stated here */
403 int mxk_ntx_msgs; /* # of msgs on tx queues */
404 int mxk_ntx_data ; /* # of DATA on tx queues */
405 int mxk_ntx_posted; /* # of tx msgs in flight */
406 int mxk_data_posted; /* # of tx data payloads in flight */
408 struct list_head mxk_pending; /* in flight rxs and txs */
/* Per-peer state.
 * NOTE(review): the struct header and closing brace are not visible in this
 * listing; judging by the mxp_ prefix and the struct kmx_peer pointers used
 * elsewhere in this header, these are presumably the members of
 * struct kmx_peer - confirm. */
414 lnet_nid_t mxp_nid; /* peer's LNET NID */
415 u64 mxp_incarnation; /* peer's incarnation value */
416 u32 mxp_sid; /* MX session ID */
417 atomic_t mxp_refcount; /* reference counts */
419 u32 mxp_ip; /* IP address as int */
420 u32 mxp_board; /* peer's board rank */
421 u32 mxp_ep_id; /* peer's MX endpoint ID */
422 u64 mxp_nic_id; /* remote's MX nic_id for mx_connect() */
424 struct list_head mxp_peers; /* for placing on kmx_peers */
426 struct list_head mxp_conns; /* list of connections */
427 struct kmx_conn *mxp_conn; /* current connection */
429 unsigned long mxp_reconnect_time; /* when to retry connect */
430 int mxp_incompatible; /* incorrect conn_req values */
/* Global objects declared here and defined elsewhere: the interface state
 * and the table of tunable pointers. */
433 extern kmx_data_t kmxlnd_data;
434 extern kmx_tunables_t kmxlnd_tunables;
436 /* required for the LNET API: startup, shutdown, control, send and receive
 * entry points, each taking the lnet_ni_t instance as its first argument */
437 int mxlnd_startup(lnet_ni_t *ni);
438 void mxlnd_shutdown(lnet_ni_t *ni);
439 int mxlnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
440 int mxlnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
/* mxlnd_recv takes both an iovec array and an lnet_kiov_t array along with a
 * shared count, offset and lengths. */
441 int mxlnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
442 unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
443 unsigned int offset, unsigned int mlen, unsigned int rlen);
/* Internal helpers declared here and defined elsewhere. Only the signatures
 * are visible in this header; see the definitions for behaviour. */
446 extern void mxlnd_thread_stop(long id);
/* Context (tx/rx descriptor) and peer object management. */
447 extern int mxlnd_ctx_alloc(struct kmx_ctx **ctxp, enum kmx_req_type type);
448 extern void mxlnd_ctx_free(struct kmx_ctx *ctx);
449 extern void mxlnd_ctx_init(struct kmx_ctx *ctx);
450 extern int mxlnd_peer_alloc(struct kmx_peer **peerp, lnet_nid_t nid,
451 u32 board, u32 ep_id, u64 nic_id);
/* Receive-side callbacks keyed by the 64-bit match value. */
454 void mxlnd_eager_recv(void *context, uint64_t match_value, uint32_t length);
455 extern mx_unexp_handler_action_t mxlnd_unexpected_recv(void *context,
456 mx_endpoint_addr_t source, uint64_t match_value, uint32_t length,
457 void *data_if_available);
/* mxlnd_peer_free and mxlnd_conn_free are the functions the peer/conn decref
 * macros in this header call when a reference count reaches zero. */
458 extern void mxlnd_peer_free(struct kmx_peer *peer);
459 extern void mxlnd_conn_free(struct kmx_conn *conn);
460 extern void mxlnd_conn_disconnect(struct kmx_conn *conn, int mx_dis, int send_bye);
461 extern int mxlnd_close_matching_conns(lnet_nid_t nid);
462 extern void mxlnd_sleep(unsigned long timeout);
/* NOTE(review): the functions below that take a single void *arg and return
 * int look like thread entry points (the MXLND_INIT_THREADS comment names
 * waitd, timeoutd and tx_queued threads) - confirm against the definitions. */
463 extern int mxlnd_tx_queued(void *arg);
464 extern void mxlnd_handle_rx_completion(struct kmx_ctx *rx);
465 extern int mxlnd_check_sends(struct kmx_peer *peer);
466 extern int mxlnd_tx_peer_queued(void *arg);
467 extern int mxlnd_request_waitd(void *arg);
468 extern int mxlnd_unex_recvd(void *arg);
469 extern int mxlnd_timeoutd(void *arg);
470 extern int mxlnd_connd(void *arg);
/* mxlnd_nid_to_hash - fold a NID into a hash bucket index.
 * nid: the lnet_nid_t to hash.
 * The result is the lowest MXLND_HASH_BITS bits of nid XORed with the next
 * MXLND_HASH_BITS bits (shifted down), so it always lies in the range
 * 0..MXLND_HASH_MASK.
 * NOTE(review): the opening of the original doc comment, the return type
 * line and the function braces are not visible in this listing - confirm
 * against the original header. */
476 * Takes the u64 nid and XORs the lowest N bits by the next lowest N bits.
479 mxlnd_nid_to_hash(lnet_nid_t nid)
481 return (nid & MXLND_HASH_MASK) ^
482 ((nid & (MXLND_HASH_MASK << MXLND_HASH_BITS)) >> MXLND_HASH_BITS);
/* Reference counting helpers for peers (mxp_refcount) and connections
 * (mxk_refcount).
 *
 * mxlnd_peer_addref / mxlnd_conn_addref: assert the pointer is not NULL and
 * the current count is greater than zero, then atomically increment it.
 *
 * mxlnd_peer_decref / mxlnd_conn_decref: assert the current count is greater
 * than zero, atomically decrement it, and call mxlnd_peer_free() or
 * mxlnd_conn_free() when the count reaches zero. Unlike the addref macros,
 * they contain no NULL assertion.
 *
 * Each macro expands its argument several times, so pass an expression
 * without side effects.
 *
 * NOTE(review): every visible line ends in a continuation backslash and no
 * enclosing do/while wrapper is visible in this listing - confirm against
 * the original header. */
486 #define mxlnd_peer_addref(peer) \
488 LASSERT(peer != NULL); \
489 LASSERT(atomic_read(&(peer)->mxp_refcount) > 0); \
490 atomic_inc(&(peer)->mxp_refcount); \
494 #define mxlnd_peer_decref(peer) \
496 LASSERT(atomic_read(&(peer)->mxp_refcount) > 0); \
497 if (atomic_dec_and_test(&(peer)->mxp_refcount)) \
498 mxlnd_peer_free(peer); \
501 #define mxlnd_conn_addref(conn) \
503 LASSERT(conn != NULL); \
504 LASSERT(atomic_read(&(conn)->mxk_refcount) > 0); \
505 atomic_inc(&(conn)->mxk_refcount); \
509 #define mxlnd_conn_decref(conn) \
511 LASSERT(atomic_read(&(conn)->mxk_refcount) > 0); \
512 if (atomic_dec_and_test(&(conn)->mxk_refcount)) \
513 mxlnd_conn_free(conn); \