1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2004 Cluster File Systems, Inc.
5 * Author: Eric Barton <eric@bartonsoftware.com>
6 * Copyright (C) 2006 Myricom, Inc.
7 * Author: Scott Atchley <atchley at myri.com>
9 * This file is part of Lustre, http://www.lustre.org.
11 * Lustre is free software; you can redistribute it and/or
12 * modify it under the terms of version 2 of the GNU General Public
13 * License as published by the Free Software Foundation.
15 * Lustre is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with Lustre; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28 #ifndef AUTOCONF_INCLUDED
29 #include <linux/config.h>
31 #include <linux/module.h> /* module */
32 #include <linux/kernel.h> /* module */
34 #include <linux/string.h>
35 #include <linux/stat.h>
36 #include <linux/errno.h>
37 #include <linux/smp_lock.h>
38 #include <linux/unistd.h>
39 #include <linux/uio.h>
42 #include <asm/system.h>
43 #include <asm/uaccess.h>
46 #include <linux/init.h> /* module */
48 #include <linux/file.h>
49 #include <linux/stat.h>
50 #include <linux/list.h>
51 #include <linux/kmod.h>
52 #include <linux/sysctl.h>
53 #include <linux/random.h>
54 #include <linux/utsname.h>
59 #include <linux/netdevice.h> /* these are needed for ARP */
60 #include <linux/if_arp.h>
62 #include <linux/inetdevice.h>
64 #define DEBUG_SUBSYSTEM S_LND
66 #include "libcfs/libcfs.h"
67 #include "lnet/lnet.h"
68 #include "lnet/lib-lnet.h"
71 #include "mx_extensions.h"
72 #include "myriexpress.h"
/* every LNET fragment must fit in one MX segment array */
#if LNET_MAX_IOV > MX_MAX_SEGMENTS
#error LNET_MAX_IOV is greater than MX_MAX_SEGMENTS
#endif
78 /* Using MX's 64 match bits
79 * We are using the match bits to specify message type and the cookie. The
80 * highest four bits (60-63) are reserved for message type. Below we specify
81 * the types. MXLND_MASK_ICON_REQ and MXLND_MASK_ICON_ACK are used for
82 * mx_iconnect(). We reserve the remaining combinations for future use. The
83 * next 8 bits (52-59) are reserved for returning a status code for failed
84 * GET_DATA (payload) messages. The last 52 bits are used for cookies. That
85 * should allow unique cookies for 4 KB messages at 10 Gbps line rate without
86 * rollover for about 8 years. That should be enough. */
/* message type is encoded in match bits 60-63 (layout described in the
 * block comment above); remaining combinations are reserved */
#define MXLND_MASK_ICON_REQ (0xBLL << 60) /* it is a mx_iconnect() completion */
#define MXLND_MASK_CONN_REQ (0xCLL << 60) /* CONN_REQ msg */
#define MXLND_MASK_ICON_ACK (0x9LL << 60) /* it is a mx_iconnect() completion */
#define MXLND_MASK_CONN_ACK (0xALL << 60) /* CONN_ACK msg */
#define MXLND_MASK_EAGER (0xELL << 60) /* EAGER msg */
#define MXLND_MASK_NOOP (0x1LL << 60) /* NOOP msg */
#define MXLND_MASK_PUT_REQ (0x2LL << 60) /* PUT_REQ msg */
#define MXLND_MASK_PUT_ACK (0x3LL << 60) /* PUT_ACK msg */
#define MXLND_MASK_PUT_DATA (0x4LL << 60) /* PUT_DATA msg */
#define MXLND_MASK_GET_REQ (0x5LL << 60) /* GET_REQ msg */
#define MXLND_MASK_GET_DATA (0x6LL << 60) /* GET_DATA msg */
//#define MXLND_MASK_NAK (0x7LL << 60) /* NAK msg */
/* cookies occupy the low 52 bits; wrap before colliding with status bits */
#define MXLND_MAX_COOKIE ((1LL << 52) - 1) /* when to roll-over the cookie value */
/* NOTE: MXLND_N_SCHED is defined below; legal since cpp expands at use time */
#define MXLND_NCOMPLETIONS (MXLND_N_SCHED + 2) /* max threads for completion array */
/* defaults for configurable parameters
 * (presumably overridable via the kmx_tunables module options below —
 * confirm against the module's parameter setup) */
#define MXLND_N_SCHED 1 /* # schedulers (mx_wait_any() threads) */
#define MXLND_MX_BOARD 0 /* Use the first MX NIC if more than 1 avail */
#define MXLND_MX_EP_ID 3 /* MX endpoint ID */
#define MXLND_COMM_TIMEOUT (20 * HZ) /* timeout for send/recv (jiffies) */
#define MXLND_WAIT_TIMEOUT HZ /* timeout for wait (jiffies) */
#define MXLND_POLLING 1000 /* poll iterations before blocking */
#define MXLND_MAX_PEERS 1024 /* number of nodes talking to me */
#define MXLND_EAGER_NUM MXLND_MAX_PEERS /* number of pre-posted receives */
#define MXLND_EAGER_SIZE PAGE_SIZE /* pre-posted eager message size */
#define MXLND_MSG_QUEUE_DEPTH 8 /* msg queue depth */
#define MXLND_CREDIT_HIGHWATER (MXLND_MSG_QUEUE_DEPTH - 2)
/* when to send a noop to return credits */
#define MXLND_NTX 256 /* # of kmx_tx - total sends in flight
1/2 are reserved for connect messages */
#define MXLND_HASH_BITS 6 /* the number of bits to hash over */
#define MXLND_HASH_SIZE (1<<MXLND_HASH_BITS)
/* number of peer lists for lookup.
we hash over the last N bits of
the IP address converted to an int. */
#define MXLND_HASH_MASK (MXLND_HASH_SIZE - 1)
/* ensure we use only the last N bits */
/* debugging features */
#define MXLND_CKSUM 0 /* checksum kmx_msg_t */
#define MXLND_DEBUG 0 /* turn on printk()s */
/* MXLND_PRINT: real printk() when debugging, otherwise a no-op sink.
 * The two alternatives must be selected by MXLND_DEBUG — defining both
 * unconditionally silently redefines the macro. */
extern inline void mxlnd_noop(char *s, ...);

#if MXLND_DEBUG
#define MXLND_PRINT printk
#else
#define MXLND_PRINT mxlnd_noop
#endif
/* provide wrappers around LIBCFS_ALLOC/FREE to keep MXLND specific
 * memory usage stats that include pages.
 * The size is charged before the allocation and refunded if LIBCFS_ALLOC()
 * returns NULL; wrapped in do/while(0) so it is safe as a single statement. */
#define MXLND_ALLOC(x, size) \
        do { \
                spin_lock(&kmxlnd_data.kmx_global_lock); \
                kmxlnd_data.kmx_mem_used += size; \
                spin_unlock(&kmxlnd_data.kmx_global_lock); \
                LIBCFS_ALLOC(x, size); \
                if ((x) == NULL) { \
                        /* allocation failed - undo the accounting */ \
                        spin_lock(&kmxlnd_data.kmx_global_lock); \
                        kmxlnd_data.kmx_mem_used -= size; \
                        spin_unlock(&kmxlnd_data.kmx_global_lock); \
                } \
        } while (0)
/* free (x) via LIBCFS_FREE() and credit size back to the usage stats */
#define MXLND_FREE(x, size) \
        do { \
                spin_lock(&kmxlnd_data.kmx_global_lock); \
                kmxlnd_data.kmx_mem_used -= size; \
                spin_unlock(&kmxlnd_data.kmx_global_lock); \
                LIBCFS_FREE(x, size); \
        } while (0)
/* runtime tunables; each member is a pointer, presumably into module
 * parameter storage — confirm against the module's option handling */
typedef struct kmx_tunables {
        int *kmx_n_waitd; /* # completion threads */
        int *kmx_max_peers; /* max # of potential peers */
        int *kmx_cksum; /* checksum small msgs? */
        int *kmx_ntx; /* total # of tx (1/2 for LNET, 1/2 for CONN_REQ) */
        int *kmx_credits; /* concurrent sends to 1 peer */
        int *kmx_board; /* MX board (NIC) number */
        int *kmx_ep_id; /* MX endpoint number */
        int *kmx_polling; /* if 0, block. if > 0, poll this many
                             iterations before blocking */
        char **kmx_hosts; /* Location of hosts file, if used */
} kmx_tunables_t;
178 /* structure to hold IP-to-hostname resolution data */
180 struct kmx_peer *mxh_peer; /* pointer to matching peer */
181 u32 mxh_addr; /* IP address as int */
182 char *mxh_hostname; /* peer's hostname */
183 u32 mxh_board; /* peer's board rank */
184 u32 mxh_ep_id; /* peer's MX endpoint ID */
185 struct list_head mxh_list; /* position on kmx_hosts */
186 spinlock_t mxh_lock; /* lock */
189 /* global interface state */
190 typedef struct kmx_data
192 int kmx_init; /* initialization state */
193 int kmx_shutdown; /* shutting down? */
194 atomic_t kmx_nthreads; /* number of threads */
195 struct completion *kmx_completions; /* array of completion structs */
196 lnet_ni_t *kmx_ni; /* the LND instance */
197 u64 kmx_incarnation; /* my incarnation value - unused */
198 long kmx_mem_used; /* memory used */
199 struct kmx_host *kmx_localhost; /* pointer to my kmx_host info */
200 mx_endpoint_t kmx_endpt; /* the MX endpoint */
202 spinlock_t kmx_global_lock; /* global lock */
204 struct list_head kmx_conn_req; /* list of connection requests */
205 spinlock_t kmx_conn_lock; /* connection list lock */
206 struct semaphore kmx_conn_sem; /* semaphore for connection request list */
208 struct list_head kmx_hosts; /* host lookup info */
209 spinlock_t kmx_hosts_lock; /* hosts list lock */
211 struct list_head kmx_peers[MXLND_HASH_SIZE];
212 /* list of all known peers */
213 rwlock_t kmx_peers_lock; /* peer list rw lock */
214 atomic_t kmx_npeers; /* number of peers */
216 struct list_head kmx_txs; /* all tx descriptors */
217 struct list_head kmx_tx_idle; /* list of idle tx */
218 spinlock_t kmx_tx_idle_lock; /* lock for idle tx list */
219 s32 kmx_tx_used; /* txs in use */
220 u64 kmx_tx_next_cookie; /* unique id for tx */
221 struct list_head kmx_tx_queue; /* generic send queue */
222 spinlock_t kmx_tx_queue_lock; /* lock for generic sends */
223 struct semaphore kmx_tx_queue_sem; /* semaphore for tx queue */
225 struct list_head kmx_rxs; /* all rx descriptors */
226 spinlock_t kmx_rxs_lock; /* lock for rxs list */
227 struct list_head kmx_rx_idle; /* list of idle tx */
228 spinlock_t kmx_rx_idle_lock; /* lock for idle rx list */
/* values for kmx_data.kmx_init - how far module startup has progressed
 * (shutdown presumably unwinds these stages in reverse - confirm in
 * mxlnd_shutdown()) */
#define MXLND_INIT_NOTHING 0 /* in the beginning, there was nothing... */
#define MXLND_INIT_DATA 1 /* main data structures created */
#define MXLND_INIT_TXS 2 /* tx descriptors created */
#define MXLND_INIT_RXS 3 /* initial rx descriptors created */
#define MXLND_INIT_MX 4 /* initiate MX library, open endpoint, get NIC id */
#define MXLND_INIT_THREADS 5 /* waitd, timeoutd, tx_queued threads */
#define MXLND_INIT_ALL 6 /* startup completed */
239 #include "mxlnd_wire.h"
/* NOTE(review): kmx_ctx.mxc_type uses enum kmx_req_type but no definition
 * is visible in this header - restored here; confirm the values against the
 * rest of the MXLND sources */
enum kmx_req_type {
        MXLND_REQ_TX = 0, /* context is a send */
        MXLND_REQ_RX = 1, /* context is a receive */
};

/* The life cycle of a request */
enum kmx_req_state {
        MXLND_CTX_INIT = 0, /* just created */
        MXLND_CTX_IDLE = 1, /* available for use */
        MXLND_CTX_PREP = 2, /* getting ready for send/recv */
        MXLND_CTX_PENDING = 3, /* mx_isend() or mx_irecv() called */
        MXLND_CTX_COMPLETED = 4, /* cleaning up after completion or timeout */
        MXLND_CTX_CANCELED = 5, /* timed out but still in ctx list */
};
256 /* Context Structure - generic tx/rx descriptor
257 * It represents the context (or state) of each send or receive request.
258 * In other LNDs, they have separate TX and RX descriptors and this replaces both.
260 * We will keep the these on the global kmx_rxs and kmx_txs lists for cleanup
261 * during shutdown(). We will move them between the rx/tx idle lists and the
262 * pending list which is monitored by mxlnd_timeoutd().
265 enum kmx_req_type mxc_type; /* TX or RX */
266 u64 mxc_incarnation; /* store the peer's incarnation here
267 to verify before changing flow
268 control credits after completion */
269 unsigned long mxc_deadline; /* request time out in absolute jiffies */
270 enum kmx_req_state mxc_state; /* what is the state of the request? */
271 struct list_head mxc_global_list; /* place on kmx_rxs or kmx_txs */
272 struct list_head mxc_list; /* place on rx/tx idle list, tx q, peer tx */
273 struct list_head mxc_rx_list; /* place on mxp_rx_posted list */
274 spinlock_t mxc_lock; /* lock */
276 lnet_nid_t mxc_nid; /* dst's NID if peer is not known */
277 struct kmx_peer *mxc_peer; /* owning peer */
278 struct kmx_conn *mxc_conn; /* owning conn */
279 struct kmx_msg *mxc_msg; /* msg hdr mapped to mxc_page */
280 struct page *mxc_page; /* buffer for eager msgs */
281 lnet_msg_t *mxc_lntmsg[2]; /* lnet msgs to finalize */
283 u8 mxc_msg_type; /* what type of message is this? */
284 u64 mxc_cookie; /* completion cookie */
285 u64 mxc_match; /* MX match info */
286 mx_ksegment_t mxc_seg; /* local MX ksegment for non-DATA */
287 mx_ksegment_t *mxc_seg_list; /* MX ksegment array for DATA */
288 int mxc_nseg; /* number of segments */
289 unsigned long mxc_pin_type; /* MX_PIN_KERNEL or MX_PIN_PHYSICAL */
290 u32 mxc_nob; /* number of bytes sent/received */
291 mx_request_t mxc_mxreq; /* MX request */
292 mx_status_t mxc_status; /* MX status */
293 s64 mxc_get; /* # of times returned from idle list */
294 s64 mxc_put; /* # of times returned from idle list */
/* values for kmx_conn.mxk_status - connection state machine */
#define MXLND_CONN_DISCONNECT -2 /* conn is being destroyed - do not add txs */
#define MXLND_CONN_FAIL -1 /* connect failed (bad handshake, unavail, etc.) */
#define MXLND_CONN_INIT 0 /* in the beginning, there was nothing... */
#define MXLND_CONN_REQ 1 /* a connection request message is needed */
#define MXLND_CONN_ACK 2 /* a connection ack is needed */
#define MXLND_CONN_WAIT 3 /* waiting for req or ack to complete */
#define MXLND_CONN_READY 4 /* ready to send */
305 /* connection state - queues for queued and pending msgs */
308 u64 mxk_incarnation; /* connections's incarnation value */
309 atomic_t mxk_refcount; /* reference counting */
311 struct kmx_peer *mxk_peer; /* owning peer */
312 mx_endpoint_addr_t mxk_epa; /* peer's endpoint address */
314 struct list_head mxk_list; /* for placing on mxp_conns */
315 spinlock_t mxk_lock; /* lock */
316 unsigned long mxk_timeout; /* expiration of oldest pending tx/rx */
317 unsigned long mxk_last_tx; /* when last tx completed with success */
318 unsigned long mxk_last_rx; /* when last rx completed */
320 int mxk_credits; /* # of my credits for sending to peer */
321 int mxk_outstanding; /* # of credits to return */
323 int mxk_status; /* can we send messages? MXLND_CONN_* */
324 struct list_head mxk_tx_credit_queue; /* send queue for peer */
325 struct list_head mxk_tx_free_queue; /* send queue for peer */
326 int mxk_ntx_msgs; /* # of msgs on tx queues */
327 int mxk_ntx_data ; /* # of DATA on tx queues */
328 int mxk_ntx_posted; /* # of tx msgs in flight */
329 int mxk_data_posted; /* # of tx data payloads in flight */
331 struct list_head mxk_pending; /* in flight rxs and txs */
337 lnet_nid_t mxp_nid; /* peer's LNET NID */
338 u64 mxp_incarnation; /* peer's incarnation value */
339 atomic_t mxp_refcount; /* reference counts */
341 struct kmx_host *mxp_host; /* peer lookup info */
342 u64 mxp_nic_id; /* remote's MX nic_id for mx_connect() */
344 struct list_head mxp_peers; /* for placing on kmx_peers */
345 spinlock_t mxp_lock; /* lock */
347 struct list_head mxp_conns; /* list of connections */
348 struct kmx_conn *mxp_conn; /* current connection */
350 unsigned long mxp_reconnect_time; /* when to retry connect */
351 int mxp_incompatible; /* incorrect conn_req values */
/* module-wide state and tunables (defined in one of the MXLND .c files) */
extern kmx_data_t kmxlnd_data;
extern kmx_tunables_t kmxlnd_tunables;
/* required for the LNET API */
int mxlnd_startup(lnet_ni_t *ni);
void mxlnd_shutdown(lnet_ni_t *ni);
int mxlnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
int mxlnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
int mxlnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg, int delayed,
               unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
               unsigned int offset, unsigned int mlen, unsigned int rlen);
/* thread and context (tx/rx descriptor) management */
extern void mxlnd_thread_stop(long id);
extern int mxlnd_ctx_alloc(struct kmx_ctx **ctxp, enum kmx_req_type type);
extern void mxlnd_ctx_free(struct kmx_ctx *ctx);
extern void mxlnd_ctx_init(struct kmx_ctx *ctx);
/* NID <-> MX nic_id translation */
extern lnet_nid_t mxlnd_nic_id2nid(lnet_ni_t *ni, u64 nic_id);
extern u64 mxlnd_nid2nic_id(lnet_nid_t nid);
/* MX receive handlers (mxlnd_unexpected_recv matches the MX unexpected
 * handler signature) */
void mxlnd_eager_recv(void *context, uint64_t match_value, uint32_t length);
extern mx_unexp_handler_action_t mxlnd_unexpected_recv(void *context,
        mx_endpoint_addr_t source, uint64_t match_value, uint32_t length,
        void *data_if_available);
extern void mxlnd_peer_free(struct kmx_peer *peer);
extern void mxlnd_conn_free(struct kmx_conn *conn);
extern void mxlnd_sleep(unsigned long timeout);
/* worker thread entry points and send/recv helpers */
extern int mxlnd_tx_queued(void *arg);
extern void mxlnd_handle_rx_completion(struct kmx_ctx *rx);
extern int mxlnd_check_sends(struct kmx_peer *peer);
extern int mxlnd_tx_peer_queued(void *arg);
extern int mxlnd_request_waitd(void *arg);
extern int mxlnd_unex_recvd(void *arg);
extern int mxlnd_timeoutd(void *arg);
extern int mxlnd_connd(void *arg);
/* take a reference on peer; caller must already hold one */
#define mxlnd_peer_addref(peer) \
        do { \
                LASSERT(peer != NULL); \
                LASSERT(atomic_read(&(peer)->mxp_refcount) > 0); \
                atomic_inc(&(peer)->mxp_refcount); \
        } while (0)
/* drop a reference on peer; frees it when the last reference goes */
#define mxlnd_peer_decref(peer) \
        do { \
                LASSERT(atomic_read(&(peer)->mxp_refcount) > 0); \
                if (atomic_dec_and_test(&(peer)->mxp_refcount)) \
                        mxlnd_peer_free(peer); \
        } while (0)
/* take a reference on conn; caller must already hold one */
#define mxlnd_conn_addref(conn) \
        do { \
                LASSERT(conn != NULL); \
                LASSERT(atomic_read(&(conn)->mxk_refcount) > 0); \
                atomic_inc(&(conn)->mxk_refcount); \
        } while (0)
/* drop a reference on conn; frees it when the last reference goes */
#define mxlnd_conn_decref(conn) \
        do { \
                LASSERT(atomic_read(&(conn)->mxk_refcount) > 0); \
                if (atomic_dec_and_test(&(conn)->mxk_refcount)) \
                        mxlnd_conn_free(conn); \
        } while (0)