4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
31 * This file is part of Lustre, http://www.lustre.org/
32 * Lustre is a trademark of Sun Microsystems, Inc.
34 * lnet/klnds/ptllnd/ptllnd.h
36 * Author: PJ Kirner <pjkirner@clusterfs.com>
40 # define EXPORT_SYMTAB
42 #ifndef AUTOCONF_INCLUDED
43 #include <linux/config.h>
45 #include <linux/module.h>
46 #include <linux/kernel.h>
48 #include <linux/string.h>
49 #include <linux/stat.h>
50 #include <linux/errno.h>
51 #include <linux/smp_lock.h>
52 #include <linux/unistd.h>
53 #include <linux/uio.h>
55 #include <asm/system.h>
56 #include <asm/uaccess.h>
59 #include <linux/init.h>
61 #include <linux/file.h>
62 #include <linux/stat.h>
63 #include <linux/list.h>
64 #include <linux/kmod.h>
65 #include <linux/sysctl.h>
66 #include <linux/random.h>
72 #define DEBUG_SUBSYSTEM S_LND
74 #include <libcfs/libcfs.h>
75 #include <lnet/lnet.h>
76 #include <lnet/lib-lnet.h>
77 #include <lnet/lnet-sysctl.h>
78 #include <portals/p30.h>
80 #include <portals/ptltrace.h>
82 #include <lnet/ptllnd.h> /* Depends on portals/p30.h */
85 * Define this to enable console debug logging
88 //#define PJK_DEBUGGING
/* Number of scheduler threads: one per online CPU on SMP, else a single
 * thread (the #ifdef lines selecting between these two are not visible in
 * this view). */
91 # define PTLLND_N_SCHED cfs_num_online_cpus() /* # schedulers */
93 # define PTLLND_N_SCHED 1 /* # schedulers */
/* Credit threshold at which we eagerly return credits to the peer: one
 * less than the peer's total tx credits, so the last credit is always
 * kept in hand for a NOOP/credit-return message. */
96 #define PTLLND_CREDIT_HIGHWATER ((*kptllnd_tunables.kptl_peertxcredits)-1)
97 /* when eagerly to return credits */
/* Tunable parameters.  Every field is a pointer — presumably into the
 * module-parameter / sysctl storage so that runtime changes are seen
 * immediately (NOTE(review): storage location not visible here; confirm
 * against the tunables .c file). */
101 int *kptl_ntx; /* # tx descs to pre-allocate */
102 int *kptl_max_nodes; /* max # nodes all talking to me */
103 int *kptl_max_procs_per_node; /* max # processes per node */
104 int *kptl_checksum; /* checksum kptl_msg_t? */
105 int *kptl_timeout; /* comms timeout (seconds) */
106 int *kptl_portal; /* portal number */
107 int *kptl_pid; /* portals PID (self + kernel peers) */
108 int *kptl_rxb_npages; /* number of pages for rx buffer */
109 int *kptl_rxb_nspare; /* number of spare rx buffers */
110 int *kptl_credits; /* number of credits */
111 int *kptl_peertxcredits; /* number of peer tx credits */
112 int *kptl_peerrtrcredits; /* number of peer router credits */
113 int *kptl_max_msg_size; /* max immd message size*/
114 int *kptl_peer_hash_table_size; /* # slots in peer hash table */
115 int *kptl_reschedule_loops; /* scheduler yield loops */
116 int *kptl_ack_puts; /* make portals ack PUTs */
/* ptltrace controls: dump portals trace state on failure paths */
118 int *kptl_ptltrace_on_timeout; /* dump pltrace on timeout? */
119 int *kptl_ptltrace_on_fail; /* dump pltrace on PTL_NAL_FAILED? */
120 char **kptl_ptltrace_basename; /* ptltrace dump file basename */
/* fault-injection control; bit meanings are the SIMULATION_* indices
 * defined at the bottom of this header */
123 int *kptl_simulation_bitmap;/* simulation bitmap */
126 #if defined(CONFIG_SYSCTL) && !CFS_SYSFS_MODULE_PARM
127 cfs_sysctl_table_header_t *kptl_sysctl; /* sysctl interface */
131 #include "lnet/ptllnd_wire.h"
133 /***********************************************************************/
/* Forward typedefs for the main LND objects defined below. */
135 typedef struct kptl_data kptl_data_t;
136 typedef struct kptl_net kptl_net_t;
137 typedef struct kptl_rx_buffer kptl_rx_buffer_t;
138 typedef struct kptl_peer kptl_peer_t;
/* Discriminator values stored in a kptl_eventarg_t (event->md.user_ptr)
 * so kptllnd_eventarg2obj() can recover the containing object. */
144 #define PTLLND_EVENTARG_TYPE_MSG 0x1
145 #define PTLLND_EVENTARG_TYPE_RDMA 0x2
146 #define PTLLND_EVENTARG_TYPE_BUF 0x3
/* Receive descriptor: one incoming message carved out of an rx buffer.
 * rx_space[] holds a copy of the request when the buffer must be
 * reposted before processing completes. */
148 typedef struct kptl_rx /* receive message */
150 cfs_list_t rx_list; /* queue for attention */
151 kptl_rx_buffer_t *rx_rxb; /* the rx buffer pointer */
152 kptl_msg_t *rx_msg; /* received message */
153 int rx_nob; /* received message size */
154 unsigned long rx_treceived; /* time received */
155 ptl_process_id_t rx_initiator; /* sender's address */
157 ptl_uid_t rx_uid; /* sender's uid */
159 kptl_peer_t *rx_peer; /* pointer to peer */
160 char rx_space[0]; /* copy of incoming request */
/* Disposition codes passed to kptllnd_rx_done(): whether/how to repost
 * the buffer and whether the peer gets a credit back. */
163 #define PTLLND_POSTRX_DONT_POST 0 /* don't post */
164 #define PTLLND_POSTRX_NO_CREDIT 1 /* post: no credits */
165 #define PTLLND_POSTRX_PEER_CREDIT 2 /* post: give peer back 1 credit */
/* Pool of posted receive buffers, guarded by rxbp_lock. */
167 typedef struct kptl_rx_buffer_pool
169 cfs_spinlock_t rxbp_lock;
170 cfs_list_t rxbp_list; /* all allocated buffers */
171 int rxbp_count; /* # allocated buffers */
172 int rxbp_reserved; /* # requests to buffer */
173 int rxbp_shutdown; /* shutdown flag */
174 } kptl_rx_buffer_pool_t;
/* One receive buffer; holds the portals MD handle and the raw pages it
 * covers.  rxb_posted/rxb_idle are single-bit state flags. */
176 struct kptl_rx_buffer
178 kptl_rx_buffer_pool_t *rxb_pool;
179 cfs_list_t rxb_list; /* for the rxb_pool list */
180 cfs_list_t rxb_repost_list;/* for the kptl_sched_rxbq list */
181 int rxb_posted:1; /* on the net */
182 int rxb_idle:1; /* all done */
183 kptl_eventarg_t rxb_eventarg; /* event->md.user_ptr */
184 int rxb_refcount; /* reference count */
185 ptl_handle_md_t rxb_mdh; /* the portals memory descriptor (MD) handle */
186 char *rxb_buffer; /* the buffer */
/* Transmit descriptor types: small immediate messages vs. the four
 * halves of PUT/GET RDMA exchanges. */
192 TX_TYPE_RESERVED = 0,
193 TX_TYPE_SMALL_MESSAGE = 1,
194 TX_TYPE_PUT_REQUEST = 2,
195 TX_TYPE_GET_REQUEST = 3,
196 TX_TYPE_PUT_RESPONSE = 4,
197 TX_TYPE_GET_RESPONSE = 5,
/* Fragment vector: iovec/kiov pair under Lustre portals, a single
 * ptl_md_iovec_t array otherwise (the enclosing union/struct lines are
 * not visible in this view). */
201 #ifdef _USING_LUSTRE_PORTALS_
202 struct iovec iov[PTL_MD_MAX_IOV];
203 lnet_kiov_t kiov[PTL_MD_MAX_IOV];
205 ptl_md_iovec_t iov[PTL_MD_MAX_IOV];
/* Transmit descriptor.  Carries separate eventargs and MD handles for
 * the initial message and for the RDMA it may trigger, plus the LNET
 * message(s) to finalize on completion. */
209 typedef struct kptl_tx /* transmit message */
211 cfs_list_t tx_list; /* queue on idle_txs etc */
212 cfs_atomic_t tx_refcount; /* reference count*/
213 enum kptl_tx_type tx_type; /* small msg/{put,get}{req,resp} */
214 int tx_active:1; /* queued on the peer */
215 int tx_idle:1; /* on the free list */
216 int tx_acked:1; /* portals ACK wanted (for debug only) */
217 kptl_eventarg_t tx_msg_eventarg; /* event->md.user_ptr */
218 kptl_eventarg_t tx_rdma_eventarg; /* event->md.user_ptr */
219 int tx_status; /* the status of this tx descriptor */
220 ptl_handle_md_t tx_rdma_mdh; /* RDMA buffer */
221 ptl_handle_md_t tx_msg_mdh; /* the portals MD handle for the initial message */
222 lnet_msg_t *tx_lnet_msg; /* LNET message to finalize */
223 lnet_msg_t *tx_lnet_replymsg; /* LNET reply message to finalize */
224 kptl_msg_t *tx_msg; /* the message data */
225 kptl_peer_t *tx_peer; /* the peer this is waiting on */
226 unsigned long tx_deadline; /* deadline */
227 unsigned long tx_tposted; /* time posted */
228 ptl_md_t tx_rdma_md; /* rdma descriptor */
229 kptl_fragvec_t *tx_frags; /* buffer fragments */
/* Peer life cycle: allocated -> waiting for HELLO handshake -> active ->
 * closing -> zombie (waiting for the last reference to drain). */
232 enum kptllnd_peer_state
234 PEER_STATE_UNINITIALIZED = 0,
235 PEER_STATE_ALLOCATED = 1,
236 PEER_STATE_WAITING_HELLO = 2,
237 PEER_STATE_ACTIVE = 3,
238 PEER_STATE_CLOSING = 4,
239 PEER_STATE_ZOMBIE = 5,
/* Per-peer state (struct kptl_peer; the struct header line is outside
 * this view).  Send-side flow control is credit based: peer_credits is
 * what I may send, peer_outstanding_credits is what I owe back. */
244 cfs_list_t peer_list;
245 cfs_atomic_t peer_refcount; /* The current references */
246 enum kptllnd_peer_state peer_state;
247 cfs_spinlock_t peer_lock; /* serialize */
248 cfs_list_t peer_noops; /* PTLLND_MSG_TYPE_NOOP txs */
249 cfs_list_t peer_sendq; /* txs waiting for mh handles */
250 cfs_list_t peer_activeq; /* txs awaiting completion */
251 lnet_process_id_t peer_id; /* Peer's LNET id */
252 ptl_process_id_t peer_ptlid; /* Peer's portals id */
253 __u64 peer_incarnation; /* peer's incarnation */
254 __u64 peer_myincarnation; /* my incarnation at HELLO */
255 int peer_sent_hello; /* have I sent HELLO? */
256 int peer_credits; /* number of send credits */
257 int peer_outstanding_credits;/* number of peer credits to return */
258 int peer_sent_credits; /* #msg buffers posted for peer */
259 int peer_max_msg_size; /* peer's rx buffer size */
260 int peer_error; /* errno on closing this peer */
261 int peer_retry_noop; /* need to retry returning credits */
262 int peer_check_stamp; /* watchdog check stamp */
263 cfs_time_t peer_last_alive; /* when (in jiffies) I was last alive */
264 __u64 peer_next_matchbits; /* Next value to register RDMA from peer */
265 __u64 peer_last_matchbits_seen; /* last matchbits used to RDMA to peer */
/* Global LND state (struct kptl_data; single instance kptllnd_data
 * declared below — the struct header line is outside this view). */
270 int kptl_init; /* initialisation state */
271 volatile int kptl_shutdown; /* shut down? */
272 cfs_atomic_t kptl_nthreads; /* # live threads */
273 ptl_handle_ni_t kptl_nih; /* network inteface handle */
274 ptl_process_id_t kptl_portals_id; /* Portals ID of interface */
275 __u64 kptl_incarnation; /* which one am I */
276 ptl_handle_eq_t kptl_eqh; /* Event Queue (EQ) */
278 cfs_rwlock_t kptl_net_rw_lock; /* serialise... */
279 cfs_list_t kptl_nets; /* kptl_net instances */
/* scheduler work queues: txs, rxs and buffers-to-repost, all fed to
 * threads sleeping on kptl_sched_waitq */
281 cfs_spinlock_t kptl_sched_lock; /* serialise... */
282 cfs_waitq_t kptl_sched_waitq; /* schedulers sleep here */
283 cfs_list_t kptl_sched_txq; /* tx requiring attention */
284 cfs_list_t kptl_sched_rxq; /* rx requiring attention */
285 cfs_list_t kptl_sched_rxbq; /* rxb requiring reposting */
287 cfs_waitq_t kptl_watchdog_waitq; /* watchdog sleeps here */
288 cfs_atomic_t kptl_needs_ptltrace; /* watchdog thread to dump ptltrace */
290 kptl_rx_buffer_pool_t kptl_rx_buffer_pool; /* rx buffer pool */
291 cfs_mem_cache_t* kptl_rx_cache; /* rx descripter cache */
293 cfs_atomic_t kptl_ntx; /* # tx descs allocated */
294 cfs_spinlock_t kptl_tx_lock; /* serialise idle tx list*/
295 cfs_list_t kptl_idle_txs; /* idle tx descriptors */
/* peer table: hash of active peers plus lists for those shutting down */
297 cfs_rwlock_t kptl_peer_rw_lock; /* lock for peer table */
298 cfs_list_t *kptl_peers; /* hash table of all my known peers */
299 cfs_list_t kptl_closing_peers; /* peers being closed */
300 cfs_list_t kptl_zombie_peers; /* peers waiting for refs to drain */
301 int kptl_peer_hash_size; /* size of kptl_peers */
302 int kptl_npeers; /* # peers extant */
303 int kptl_n_active_peers; /* # active peers */
304 int kptl_expected_peers; /* # peers I can buffer HELLOs from */
306 kptl_msg_t *kptl_nak_msg; /* common NAK message */
307 cfs_spinlock_t kptl_ptlid2str_lock; /* serialise str ops */
/* Per-LNET-network instance (struct kptl_net; header line outside this
 * view) chained on kptl_data::kptl_nets. */
312 cfs_list_t net_list; /* chain on kptl_data:: kptl_nets */
314 cfs_atomic_t net_refcount; /* # current references */
315 int net_shutdown; /* lnd_shutdown called */
/* initialisation states recorded in kptl_data::kptl_init */
320 PTLLND_INIT_NOTHING = 0,
/* The single global tunables and state instances. */
325 extern kptl_tunables_t kptllnd_tunables;
326 extern kptl_data_t kptllnd_data;
/* Convert a portals NID into an LNET NID on ni_nid's network.  Under
 * Lustre portals only the address part of the portals NID is used;
 * otherwise the raw portals NID becomes the address. */
328 static inline lnet_nid_t
329 kptllnd_ptl2lnetnid(lnet_nid_t ni_nid, ptl_nid_t ptl_nid)
331 #ifdef _USING_LUSTRE_PORTALS_
332 return LNET_MKNID(LNET_NIDNET(ni_nid), LNET_NIDADDR(ptl_nid));
334 return LNET_MKNID(LNET_NIDNET(ni_nid), ptl_nid);
/* Inverse conversion: extract the portals NID for an LNET NID, using my
 * own interface's network under Lustre portals. */
338 static inline ptl_nid_t
339 kptllnd_lnet2ptlnid(lnet_nid_t lnet_nid)
341 #ifdef _USING_LUSTRE_PORTALS_
342 return LNET_MKNID(LNET_NIDNET(kptllnd_data.kptl_portals_id.nid),
343 LNET_NIDADDR(lnet_nid));
345 return LNET_NIDADDR(lnet_nid);
/* If the kptl_ptltrace_on_fail tunable is set, flag the watchdog thread
 * (via kptl_needs_ptltrace) to dump portals tracing and wake it. */
350 kptllnd_schedule_ptltrace_dump (void)
353 if (*kptllnd_tunables.kptl_ptltrace_on_fail) {
354 cfs_atomic_inc(&kptllnd_data.kptl_needs_ptltrace);
355 cfs_waitq_signal(&kptllnd_data.kptl_watchdog_waitq);
/*
 * LND entry points (wired into the lnd_t by the module) plus thread and
 * tunables lifecycle helpers.
 */
360 int kptllnd_startup(lnet_ni_t *ni);
361 void kptllnd_shutdown(lnet_ni_t *ni);
362 int kptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
363 void kptllnd_query (struct lnet_ni *ni, lnet_nid_t nid, cfs_time_t *when);
364 int kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
365 int kptllnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
366 int delayed, unsigned int niov,
367 struct iovec *iov, lnet_kiov_t *kiov,
368 unsigned int offset, unsigned int mlen, unsigned int rlen);
369 int kptllnd_eager_recv(struct lnet_ni *ni, void *private,
370 lnet_msg_t *msg, void **new_privatep);
/* Portals event-queue callback and the scheduler/watchdog threads. */
371 void kptllnd_eq_callback(ptl_event_t *evp);
372 int kptllnd_scheduler(void *arg);
373 int kptllnd_watchdog(void *arg);
374 int kptllnd_thread_start(int (*fn)(void *arg), void *arg);
375 int kptllnd_tunables_init(void);
376 void kptllnd_tunables_fini(void);
/* Debug pretty-printers for portals event/message/error codes. */
378 const char *kptllnd_evtype2str(int evtype);
379 const char *kptllnd_msgtype2str(int msgtype);
380 const char *kptllnd_errtype2str(int errtype);
/* Recover the object containing an embedded kptl_eventarg_t (stashed in
 * event->md.user_ptr) from its eva_type tag: an rx buffer, or the tx
 * owning the RDMA or message MD. */
383 kptllnd_eventarg2obj (kptl_eventarg_t *eva)
385 switch (eva->eva_type) {
388 case PTLLND_EVENTARG_TYPE_BUF:
389 return cfs_list_entry(eva, kptl_rx_buffer_t, rxb_eventarg);
390 case PTLLND_EVENTARG_TYPE_RDMA:
391 return cfs_list_entry(eva, kptl_tx_t, tx_rdma_eventarg);
392 case PTLLND_EVENTARG_TYPE_MSG:
393 return cfs_list_entry(eva, kptl_tx_t, tx_msg_eventarg);
398 * RX BUFFER SUPPORT FUNCTIONS
400 void kptllnd_rx_buffer_pool_init(kptl_rx_buffer_pool_t *rxbp);
401 void kptllnd_rx_buffer_pool_fini(kptl_rx_buffer_pool_t *rxbp);
402 int kptllnd_rx_buffer_pool_reserve(kptl_rx_buffer_pool_t *rxbp, int count);
403 void kptllnd_rx_buffer_pool_unreserve(kptl_rx_buffer_pool_t *rxbp, int count);
404 void kptllnd_rx_buffer_callback(ptl_event_t *ev);
405 void kptllnd_rx_buffer_post(kptl_rx_buffer_t *rxb);
/* Size in bytes of one rx buffer: kptl_rxb_npages whole pages. */
408 kptllnd_rx_buffer_size(void)
410 return PAGE_SIZE * (*kptllnd_tunables.kptl_rxb_npages);
/* Take a reference on an rx buffer; the rxb_refcount update is done
 * under the pool's irq-safe spinlock. */
414 kptllnd_rx_buffer_addref(kptl_rx_buffer_t *rxb)
418 cfs_spin_lock_irqsave(&rxb->rxb_pool->rxbp_lock, flags);
420 cfs_spin_unlock_irqrestore(&rxb->rxb_pool->rxbp_lock, flags);
/* Drop a reference with the pool lock already held; on the last ref the
 * buffer is queued for reposting by a scheduler thread, which is woken. */
424 kptllnd_rx_buffer_decref_locked(kptl_rx_buffer_t *rxb)
426 if (--(rxb->rxb_refcount) == 0) {
427 cfs_spin_lock(&kptllnd_data.kptl_sched_lock);
429 cfs_list_add_tail(&rxb->rxb_repost_list,
430 &kptllnd_data.kptl_sched_rxbq);
431 cfs_waitq_signal(&kptllnd_data.kptl_sched_waitq);
433 cfs_spin_unlock(&kptllnd_data.kptl_sched_lock);
/* Drop a reference taking the pool lock ourselves; reposts the buffer
 * directly when the count hits zero (NOTE(review): the guard on `count`
 * before the repost call is not visible in this view — confirm). */
438 kptllnd_rx_buffer_decref(kptl_rx_buffer_t *rxb)
443 cfs_spin_lock_irqsave(&rxb->rxb_pool->rxbp_lock, flags);
444 count = --(rxb->rxb_refcount);
445 cfs_spin_unlock_irqrestore(&rxb->rxb_pool->rxbp_lock, flags);
448 kptllnd_rx_buffer_post(rxb);
452 * RX SUPPORT FUNCTIONS
454 void kptllnd_rx_parse(kptl_rx_t *rx);
455 void kptllnd_rx_done(kptl_rx_t *rx, int post_credit);
458 * PEER SUPPORT FUNCTIONS
/* Snapshot the Nth peer's state for the ioctl/proc interface. */
460 int kptllnd_get_peer_info(int index,
461 lnet_process_id_t *id,
462 int *state, int *sent_hello,
463 int *refcount, __u64 *incarnation,
464 __u64 *next_matchbits, __u64 *last_matchbits_seen,
465 int *nsendq, int *nactiveq,
466 int *credits, int *outstanding_credits);
467 void kptllnd_peer_destroy(kptl_peer_t *peer);
468 int kptllnd_peer_del(lnet_process_id_t id);
469 void kptllnd_peer_close_locked(kptl_peer_t *peer, int why);
470 void kptllnd_peer_close(kptl_peer_t *peer, int why);
471 void kptllnd_handle_closing_peers(void);
472 int kptllnd_peer_connect(kptl_tx_t *tx, lnet_nid_t nid);
473 void kptllnd_peer_check_sends(kptl_peer_t *peer);
474 void kptllnd_peer_check_bucket(int idx, int stamp);
475 void kptllnd_tx_launch(kptl_peer_t *peer, kptl_tx_t *tx, int nfrag);
476 int kptllnd_find_target(kptl_net_t *net, lnet_process_id_t target,
477 kptl_peer_t **peerp);
478 kptl_peer_t *kptllnd_peer_handle_hello(kptl_net_t *net,
479 ptl_process_id_t initiator,
481 kptl_peer_t *kptllnd_id2peer_locked(lnet_process_id_t id);
482 void kptllnd_peer_alive(kptl_peer_t *peer);
/* Take a reference on a peer. */
485 kptllnd_peer_addref (kptl_peer_t *peer)
487 cfs_atomic_inc(&peer->peer_refcount);
/* Drop a peer reference; the last reference destroys the peer. */
491 kptllnd_peer_decref (kptl_peer_t *peer)
493 if (cfs_atomic_dec_and_test(&peer->peer_refcount))
494 kptllnd_peer_destroy(peer);
/* Take a reference on a net instance; caller must already hold one
 * (asserted), so the count can never be resurrected from zero. */
498 kptllnd_net_addref (kptl_net_t *net)
500 LASSERT (cfs_atomic_read(&net->net_refcount) > 0);
501 cfs_atomic_inc(&net->net_refcount);
/* Drop a net reference (no destructor here — teardown is elsewhere). */
505 kptllnd_net_decref (kptl_net_t *net)
507 LASSERT (cfs_atomic_read(&net->net_refcount) > 0);
508 cfs_atomic_dec(&net->net_refcount);
/* Bind a tx to a peer exactly once, taking a peer ref on its behalf. */
512 kptllnd_set_tx_peer(kptl_tx_t *tx, kptl_peer_t *peer)
514 LASSERT (tx->tx_peer == NULL);
516 kptllnd_peer_addref(peer);
/* Hash a NID to its bucket in the peer table (address part only, so all
 * networks of the same host share one bucket chain). */
520 static inline cfs_list_t *
521 kptllnd_nid2peerlist(lnet_nid_t nid)
523 /* Only one copy of peer state for all logical peers, so the net part
524 * of NIDs is ignored; e.g. A@ptl0 and A@ptl2 share peer state */
525 unsigned int hash = ((unsigned int)LNET_NIDADDR(nid)) %
526 kptllnd_data.kptl_peer_hash_size;
528 return &kptllnd_data.kptl_peers[hash];
/* Look up a peer by LNET id, taking the peer table read lock around the
 * locked lookup helper. */
531 static inline kptl_peer_t *
532 kptllnd_id2peer(lnet_process_id_t id)
537 cfs_read_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags);
538 peer = kptllnd_id2peer_locked(id);
539 cfs_read_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags);
/* Reserve n rx buffers in the global pool (may allocate/post more). */
545 kptllnd_reserve_buffers(int n)
547 return kptllnd_rx_buffer_pool_reserve(&kptllnd_data.kptl_rx_buffer_pool,
/* Reserve enough buffers for one new peer: one per peer tx credit. */
552 kptllnd_peer_reserve_buffers(void)
554 return kptllnd_reserve_buffers(*kptllnd_tunables.kptl_peertxcredits);
/* Release one peer's worth of rx buffer reservations. */
558 kptllnd_peer_unreserve_buffers(void)
560 kptllnd_rx_buffer_pool_unreserve(&kptllnd_data.kptl_rx_buffer_pool,
561 *kptllnd_tunables.kptl_peertxcredits);
565 * TX SUPPORT FUNCTIONS
567 int kptllnd_setup_tx_descs(void);
568 void kptllnd_cleanup_tx_descs(void);
569 void kptllnd_tx_fini(kptl_tx_t *tx);
570 void kptllnd_cancel_txlist(cfs_list_t *peerq, cfs_list_t *txs);
571 void kptllnd_restart_txs(kptl_net_t *net, lnet_process_id_t id,
572 cfs_list_t *restarts);
573 kptl_tx_t *kptllnd_get_idle_tx(enum kptl_tx_type purpose);
574 void kptllnd_tx_callback(ptl_event_t *ev);
575 const char *kptllnd_tx_typestr(int type);
/* Take a reference on a tx descriptor. */
578 kptllnd_tx_addref(kptl_tx_t *tx)
580 cfs_atomic_inc(&tx->tx_refcount);
/* Drop a tx reference; must be called in thread context since the last
 * reference finalizes the tx (presumably via kptllnd_tx_fini — the call
 * on the dec-and-test branch is not visible in this view). */
584 kptllnd_tx_decref(kptl_tx_t *tx)
586 LASSERT (!cfs_in_interrupt()); /* Thread context only */
588 if (cfs_atomic_dec_and_test(&tx->tx_refcount))
593 * MESSAGE SUPPORT FUNCTIONS
595 void kptllnd_init_msg(kptl_msg_t *msg, int type,
596 lnet_process_id_t target, int body_nob);
597 void kptllnd_msg_pack(kptl_msg_t *msg, kptl_peer_t *peer);
598 int kptllnd_msg_unpack(kptl_msg_t *msg, int nob);
601 * MISC SUPPORT FUNCTIONS
/* Build the RDMA memory descriptor for a tx from an iovec/kiov. */
603 void kptllnd_init_rdma_md(kptl_tx_t *tx, unsigned int niov,
604 struct iovec *iov, lnet_kiov_t *kiov,
605 unsigned int offset, unsigned int nob);
/* Format a portals process id for logging (uses kptl_ptlid2str_lock). */
606 char *kptllnd_ptlid2str(ptl_process_id_t id);
608 void kptllnd_init_ptltrace(void);
609 void kptllnd_dump_ptltrace(void);
/* Fault-injection bit indices into *kptl_simulation_bitmap; each value
 * is a bit position (mask shown in the trailing comment). */
612 #define SIMULATION_FAIL_TX_PUT_ALLOC 0 /* 0x00000001 */
613 #define SIMULATION_FAIL_TX_GET_ALLOC 1 /* 0x00000002 */
614 #define SIMULATION_FAIL_TX 2 /* 0x00000004 */
615 #define SIMULATION_FAIL_RX_ALLOC 3 /* 0x00000008 */
/* Test a simulation bit; compiles to constant 0 when fault injection is
 * disabled (the surrounding #ifdef lines are not visible in this view). */
617 #define IS_SIMULATION_ENABLED(x) \
618 (((*kptllnd_tunables.kptl_simulation_bitmap) & 1<< SIMULATION_##x) != 0)
620 #define IS_SIMULATION_ENABLED(x) 0