1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved.
5 * Author: PJ Kirner <pjkirner@clusterfs.com>
7 * This file is part of the Lustre file system, http://www.lustre.org
8 * Lustre is a trademark of Cluster File Systems, Inc.
10 * This file is confidential source code owned by Cluster File Systems.
11 * No viewing, modification, compilation, redistribution, or any other
12 * form of use is permitted except through a signed license agreement.
14 * If you have not signed such an agreement, then you have no rights to
15 * this file. Please destroy it immediately and contact CFS.
20 # define EXPORT_SYMTAB
22 #ifndef AUTOCONF_INCLUDED
23 #include <linux/config.h>
25 #include <linux/module.h>
26 #include <linux/kernel.h>
28 #include <linux/string.h>
29 #include <linux/stat.h>
30 #include <linux/errno.h>
31 #include <linux/smp_lock.h>
32 #include <linux/unistd.h>
33 #include <linux/uio.h>
35 #include <asm/system.h>
36 #include <asm/uaccess.h>
39 #include <linux/init.h>
41 #include <linux/file.h>
42 #include <linux/stat.h>
43 #include <linux/list.h>
44 #include <linux/kmod.h>
45 #include <linux/sysctl.h>
46 #include <linux/random.h>
52 #define DEBUG_SUBSYSTEM S_LND
54 #include <libcfs/kp30.h>
55 #include <lnet/lnet.h>
56 #include <lnet/lib-lnet.h>
57 #include <portals/p30.h>
59 #include <portals/ptltrace.h>
61 #include <lnet/ptllnd.h> /* Depends on portals/p30.h */
64 * Define this to enable console debug logging
67 //#define PJK_DEBUGGING
70 # define PTLLND_N_SCHED num_online_cpus() /* # schedulers */
72 # define PTLLND_N_SCHED 1 /* # schedulers */
75 #define PTLLND_CREDIT_HIGHWATER ((*kptllnd_tunables.kptl_peercredits)-1)
76 /* threshold at which credits are eagerly returned */
80 int *kptl_ntx; /* # tx descs to pre-allocate */
81 int *kptl_max_nodes; /* max # nodes all talking to me */
82 int *kptl_max_procs_per_node; /* max # processes per node */
83 int *kptl_checksum; /* checksum kptl_msg_t? */
84 int *kptl_timeout; /* comms timeout (seconds) */
85 int *kptl_portal; /* portal number */
86 int *kptl_pid; /* portals PID (self + kernel peers) */
87 int *kptl_rxb_npages; /* number of pages for rx buffer */
88 int *kptl_rxb_nspare; /* number of spare rx buffers */
89 int *kptl_credits; /* number of credits */
90 int *kptl_peercredits; /* # credits per peer (rx buffers reserved for each peer) */
91 int *kptl_max_msg_size; /* max immediate message size */
92 int *kptl_peer_hash_table_size; /* # slots in peer hash table */
93 int *kptl_reschedule_loops; /* scheduler yield loops */
94 int *kptl_ack_puts; /* make portals ack PUTs */
96 int *kptl_ptltrace_on_timeout; /* dump ptltrace on timeout? */
97 char **kptl_ptltrace_basename; /* ptltrace dump file basename */
100 int *kptl_simulation_bitmap;/* simulation bitmap (see SIMULATION_* bit numbers below) */
103 #if CONFIG_SYSCTL && !CFS_SYSFS_MODULE_PARM
104 struct ctl_table_header *kptl_sysctl; /* sysctl interface */
108 #include "lnet/ptllnd_wire.h"
110 /***********************************************************************/
112 typedef struct kptl_data kptl_data_t;
113 typedef struct kptl_rx_buffer kptl_rx_buffer_t;
114 typedef struct kptl_peer kptl_peer_t;
120 #define PTLLND_EVENTARG_TYPE_MSG 0x1
121 #define PTLLND_EVENTARG_TYPE_RDMA 0x2
122 #define PTLLND_EVENTARG_TYPE_BUF 0x3
124 typedef struct kptl_rx /* receive message */
126 struct list_head rx_list; /* queue for attention */
127 kptl_rx_buffer_t *rx_rxb; /* the rx buffer pointer */
128 kptl_msg_t *rx_msg; /* received message */
129 int rx_nob; /* received message size (bytes) */
130 unsigned long rx_treceived; /* time received (presumably jiffies -- verify) */
131 ptl_process_id_t rx_initiator; /* sender's address */
133 ptl_uid_t rx_uid; /* sender's uid */
135 kptl_peer_t *rx_peer; /* pointer to peer */
136 char rx_space[0]; /* copy of incoming request (old-style variable-length trailing array) */
139 typedef struct kptl_rx_buffer_pool
141 spinlock_t rxbp_lock; /* serialises pool state and buffer refcounts */
142 struct list_head rxbp_list; /* all allocated buffers */
143 int rxbp_count; /* # allocated buffers */
144 int rxbp_reserved; /* # requests to buffer */
145 int rxbp_shutdown; /* shutdown flag */
146 } kptl_rx_buffer_pool_t;
148 struct kptl_rx_buffer
150 kptl_rx_buffer_pool_t *rxb_pool; /* owning pool */
151 struct list_head rxb_list; /* for the rxb_pool list */
152 struct list_head rxb_repost_list;/* for the kptl_sched_rxbq list */
153 int rxb_posted:1; /* on the net */
154 int rxb_idle:1; /* all done */
155 kptl_eventarg_t rxb_eventarg; /* event->md.user_ptr */
156 int rxb_refcount; /* reference count (protected by rxb_pool->rxbp_lock) */
157 ptl_handle_md_t rxb_mdh; /* the portals memory descriptor (MD) handle */
158 char *rxb_buffer; /* the buffer */
/* What a tx descriptor is being used for: small msg / {put,get}{req,resp} */
164 TX_TYPE_RESERVED = 0,
165 TX_TYPE_SMALL_MESSAGE = 1,
166 TX_TYPE_PUT_REQUEST = 2,
167 TX_TYPE_GET_REQUEST = 3,
168 TX_TYPE_PUT_RESPONSE = 4,
169 TX_TYPE_GET_RESPONSE = 5,
173 #ifdef _USING_LUSTRE_PORTALS_
174 struct iovec iov[PTL_MD_MAX_IOV];
175 lnet_kiov_t kiov[PTL_MD_MAX_IOV];
177 ptl_md_iovec_t iov[PTL_MD_MAX_IOV];
181 typedef struct kptl_tx /* transmit message */
183 struct list_head tx_list; /* queue on idle_txs etc */
184 atomic_t tx_refcount; /* reference count */
185 enum kptl_tx_type tx_type; /* small msg/{put,get}{req,resp} */
186 int tx_active:1; /* queued on the peer */
187 int tx_idle:1; /* on the free list */
188 int tx_acked:1; /* portals ACK wanted (for debug only) */
189 kptl_eventarg_t tx_msg_eventarg; /* event->md.user_ptr for message MD */
190 kptl_eventarg_t tx_rdma_eventarg; /* event->md.user_ptr for RDMA MD */
191 int tx_status; /* the status of this tx descriptor */
192 ptl_handle_md_t tx_rdma_mdh; /* RDMA buffer */
193 ptl_handle_md_t tx_msg_mdh; /* the portals MD handle for the initial message */
194 lnet_msg_t *tx_lnet_msg; /* LNET message to finalize */
195 lnet_msg_t *tx_lnet_replymsg; /* LNET reply message to finalize */
196 kptl_msg_t *tx_msg; /* the message data */
197 kptl_peer_t *tx_peer; /* the peer this is waiting on */
198 unsigned long tx_deadline; /* deadline (presumably jiffies, like tx_tposted -- verify) */
199 unsigned long tx_tposted; /* time posted */
200 ptl_md_t tx_rdma_md; /* rdma descriptor */
201 kptl_fragvec_t *tx_frags; /* buffer fragments */
204 enum kptllnd_peer_state
206 PEER_STATE_UNINITIALIZED = 0,
207 PEER_STATE_ALLOCATED = 1,
208 PEER_STATE_WAITING_HELLO = 2, /* HELLO handshake not yet complete */
209 PEER_STATE_ACTIVE = 3,
210 PEER_STATE_CLOSING = 4, /* on kptl_closing_peers */
211 PEER_STATE_ZOMBIE = 5, /* on kptl_zombie_peers, waiting for refs to drain */
216 struct list_head peer_list;
217 atomic_t peer_refcount; /* The current references */
218 enum kptllnd_peer_state peer_state;
219 spinlock_t peer_lock; /* serialize */
220 struct list_head peer_sendq; /* txs waiting for mh handles */
221 struct list_head peer_activeq; /* txs awaiting completion */
222 lnet_process_id_t peer_id; /* Peer's LNET id */
223 ptl_process_id_t peer_ptlid; /* Peer's portals id */
224 __u64 peer_incarnation; /* peer's incarnation */
225 __u64 peer_myincarnation; /* my incarnation at HELLO */
226 int peer_sent_hello; /* have I sent HELLO? */
227 int peer_credits; /* number of send credits */
228 int peer_outstanding_credits;/* number of peer credits to return */
229 int peer_sent_credits; /* #msg buffers posted for peer */
230 int peer_max_msg_size; /* peer's rx buffer size */
231 int peer_error; /* errno on closing this peer */
232 int peer_retry_noop; /* need to retry returning credits */
233 int peer_check_stamp; /* watchdog check stamp */
234 cfs_time_t peer_last_alive; /* when (in jiffies) I was last alive */
235 __u64 peer_next_matchbits; /* Next value to register RDMA from peer */
236 __u64 peer_last_matchbits_seen; /* last matchbits used to RDMA to peer */
241 int kptl_init; /* initialisation state */
242 volatile int kptl_shutdown; /* shut down? */
243 atomic_t kptl_nthreads; /* # live threads */
244 lnet_ni_t *kptl_ni; /* _the_ LND instance */
245 ptl_handle_ni_t kptl_nih; /* network interface handle */
246 ptl_process_id_t kptl_portals_id; /* Portals ID of interface */
247 __u64 kptl_incarnation; /* which one am I */
248 ptl_handle_eq_t kptl_eqh; /* Event Queue (EQ) */
250 spinlock_t kptl_sched_lock; /* serialise the scheduler queues below */
251 wait_queue_head_t kptl_sched_waitq; /* schedulers sleep here */
252 struct list_head kptl_sched_txq; /* tx requiring attention */
253 struct list_head kptl_sched_rxq; /* rx requiring attention */
254 struct list_head kptl_sched_rxbq; /* rxb requiring reposting */
256 wait_queue_head_t kptl_watchdog_waitq; /* watchdog sleeps here */
258 kptl_rx_buffer_pool_t kptl_rx_buffer_pool; /* rx buffer pool */
259 cfs_mem_cache_t* kptl_rx_cache; /* rx descriptor cache */
261 atomic_t kptl_ntx; /* # tx descs allocated */
262 spinlock_t kptl_tx_lock; /* serialise idle tx list */
263 struct list_head kptl_idle_txs; /* idle tx descriptors */
265 rwlock_t kptl_peer_rw_lock; /* lock for peer table */
266 struct list_head *kptl_peers; /* hash table of all my known peers */
267 struct list_head kptl_closing_peers; /* peers being closed */
268 struct list_head kptl_zombie_peers; /* peers waiting for refs to drain */
269 int kptl_peer_hash_size; /* size of kptl_peers */
270 int kptl_npeers; /* # peers extant */
271 int kptl_n_active_peers; /* # active peers */
272 int kptl_expected_peers; /* # peers I can buffer HELLOs from */
274 kptl_msg_t *kptl_nak_msg; /* common NAK message */
275 spinlock_t kptl_ptlid2str_lock; /* serialise str ops */
280 PTLLND_INIT_NOTHING = 0,
285 extern kptl_tunables_t kptllnd_tunables;
286 extern kptl_data_t kptllnd_data;
/* Map a portals NID onto my LNET network: keep this NI's LNET network
 * number and substitute the portals address part. */
288 static inline lnet_nid_t
289 kptllnd_ptl2lnetnid(ptl_nid_t ptl_nid)
291 #ifdef _USING_LUSTRE_PORTALS_
292 return LNET_MKNID(LNET_NIDNET(kptllnd_data.kptl_ni->ni_nid),
293 LNET_NIDADDR(ptl_nid));
295 return LNET_MKNID(LNET_NIDNET(kptllnd_data.kptl_ni->ni_nid),
/* Inverse of the above: recover the portals NID for a given LNET NID. */
300 static inline ptl_nid_t
301 kptllnd_lnet2ptlnid(lnet_nid_t lnet_nid)
303 #ifdef _USING_LUSTRE_PORTALS_
304 return LNET_MKNID(LNET_NIDNET(kptllnd_data.kptl_portals_id.nid),
305 LNET_NIDADDR(lnet_nid));
307 return LNET_NIDADDR(lnet_nid);
311 int kptllnd_startup(lnet_ni_t *ni);
312 void kptllnd_shutdown(lnet_ni_t *ni);
313 int kptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg);
314 int kptllnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
315 int kptllnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg,
316 int delayed, unsigned int niov,
317 struct iovec *iov, lnet_kiov_t *kiov,
318 unsigned int offset, unsigned int mlen, unsigned int rlen);
319 int kptllnd_eager_recv(struct lnet_ni *ni, void *private,
320 lnet_msg_t *msg, void **new_privatep);
321 void kptllnd_eq_callback(ptl_event_t *evp);
322 int kptllnd_scheduler(void *arg);
323 int kptllnd_watchdog(void *arg);
324 int kptllnd_thread_start(int (*fn)(void *arg), void *arg);
325 int kptllnd_tunables_init(void);
326 void kptllnd_tunables_fini(void);
328 const char *kptllnd_evtype2str(int evtype);
329 const char *kptllnd_msgtype2str(int msgtype);
330 const char *kptllnd_errtype2str(int errtype);
/* Recover the object that owns an event argument: the eva_type tag
 * (PTLLND_EVENTARG_TYPE_*) identifies which container embeds the
 * kptl_eventarg_t, and list_entry() performs a container_of-style
 * conversion from the member back to that container. */
333 kptllnd_eventarg2obj (kptl_eventarg_t *eva)
335 switch (eva->eva_type) {
338 case PTLLND_EVENTARG_TYPE_BUF:
339 return list_entry(eva, kptl_rx_buffer_t, rxb_eventarg);
340 case PTLLND_EVENTARG_TYPE_RDMA:
341 return list_entry(eva, kptl_tx_t, tx_rdma_eventarg);
342 case PTLLND_EVENTARG_TYPE_MSG:
343 return list_entry(eva, kptl_tx_t, tx_msg_eventarg);
348 * RX BUFFER SUPPORT FUNCTIONS
350 void kptllnd_rx_buffer_pool_init(kptl_rx_buffer_pool_t *rxbp);
351 void kptllnd_rx_buffer_pool_fini(kptl_rx_buffer_pool_t *rxbp);
352 int kptllnd_rx_buffer_pool_reserve(kptl_rx_buffer_pool_t *rxbp, int count);
353 void kptllnd_rx_buffer_pool_unreserve(kptl_rx_buffer_pool_t *rxbp, int count);
354 void kptllnd_rx_buffer_callback(ptl_event_t *ev);
355 void kptllnd_rx_buffer_post(kptl_rx_buffer_t *rxb);
/* Size (bytes) of each rx buffer: a whole number of pages. */
358 kptllnd_rx_buffer_size(void)
360 return PAGE_SIZE * (*kptllnd_tunables.kptl_rxb_npages);
/* Take a reference on an rx buffer; rxb_refcount is protected by the
 * owning pool's rxbp_lock (IRQ-safe). */
364 kptllnd_rx_buffer_addref(kptl_rx_buffer_t *rxb)
368 spin_lock_irqsave(&rxb->rxb_pool->rxbp_lock, flags);
370 spin_unlock_irqrestore(&rxb->rxb_pool->rxbp_lock, flags);
/* Drop a reference with the pool lock already held.  On the last ref,
 * queue the buffer on kptl_sched_rxbq for reposting and wake a
 * scheduler thread. */
374 kptllnd_rx_buffer_decref_locked(kptl_rx_buffer_t *rxb)
376 if (--(rxb->rxb_refcount) == 0) {
377 spin_lock(&kptllnd_data.kptl_sched_lock);
379 list_add_tail(&rxb->rxb_repost_list,
380 &kptllnd_data.kptl_sched_rxbq);
381 wake_up(&kptllnd_data.kptl_sched_waitq);
383 spin_unlock(&kptllnd_data.kptl_sched_lock);
/* Drop a reference, taking the pool lock itself; the buffer is
 * reposted via kptllnd_rx_buffer_post() (presumably only when the
 * count reaches zero -- guard not visible here, verify). */
388 kptllnd_rx_buffer_decref(kptl_rx_buffer_t *rxb)
393 spin_lock_irqsave(&rxb->rxb_pool->rxbp_lock, flags);
394 count = --(rxb->rxb_refcount);
395 spin_unlock_irqrestore(&rxb->rxb_pool->rxbp_lock, flags);
398 kptllnd_rx_buffer_post(rxb);
402 * RX SUPPORT FUNCTIONS
404 void kptllnd_rx_done(kptl_rx_t *rx);
405 void kptllnd_rx_parse(kptl_rx_t *rx);
408 * PEER SUPPORT FUNCTIONS
410 int kptllnd_get_peer_info(int index,
411 lnet_process_id_t *id,
412 int *state, int *sent_hello,
413 int *refcount, __u64 *incarnation,
414 __u64 *next_matchbits, __u64 *last_matchbits_seen,
415 int *nsendq, int *nactiveq,
416 int *credits, int *outstanding_credits);
417 void kptllnd_peer_destroy(kptl_peer_t *peer);
418 int kptllnd_peer_del(lnet_process_id_t id);
419 void kptllnd_peer_close_locked(kptl_peer_t *peer, int why);
420 void kptllnd_peer_close(kptl_peer_t *peer, int why);
421 void kptllnd_handle_closing_peers(void);
422 int kptllnd_peer_connect(kptl_tx_t *tx, lnet_nid_t nid);
423 void kptllnd_peer_check_sends(kptl_peer_t *peer);
424 void kptllnd_peer_check_bucket(int idx, int stamp);
425 void kptllnd_tx_launch(kptl_peer_t *peer, kptl_tx_t *tx, int nfrag);
426 int kptllnd_find_target(kptl_peer_t **peerp, lnet_process_id_t target);
427 kptl_peer_t *kptllnd_peer_handle_hello(ptl_process_id_t initiator,
429 kptl_peer_t *kptllnd_id2peer_locked(lnet_process_id_t id);
430 void kptllnd_peer_alive(kptl_peer_t *peer);
/* Take a reference on a peer. */
433 kptllnd_peer_addref (kptl_peer_t *peer)
435 atomic_inc(&peer->peer_refcount);
/* Drop a reference; the final reference destroys the peer. */
439 kptllnd_peer_decref (kptl_peer_t *peer)
441 if (atomic_dec_and_test(&peer->peer_refcount))
442 kptllnd_peer_destroy(peer);
/* Bind a tx to its peer (the tx must not already have one); the tx
 * takes a peer reference. */
446 kptllnd_set_tx_peer(kptl_tx_t *tx, kptl_peer_t *peer)
448 LASSERT (tx->tx_peer == NULL);
450 kptllnd_peer_addref(peer);
/* Hash an LNET NID to its bucket in the kptl_peers hash table. */
454 static inline struct list_head *
455 kptllnd_nid2peerlist(lnet_nid_t nid)
457 unsigned int hash = ((unsigned int)nid) %
458 kptllnd_data.kptl_peer_hash_size;
460 return &kptllnd_data.kptl_peers[hash];
/* Look up the peer with this id under the peer table read lock
 * (kptllnd_id2peer_locked presumably returns a referenced peer or
 * NULL -- verify against its definition). */
463 static inline kptl_peer_t *
464 kptllnd_id2peer(lnet_process_id_t id)
469 read_lock_irqsave(&kptllnd_data.kptl_peer_rw_lock, flags);
470 peer = kptllnd_id2peer_locked(id);
471 read_unlock_irqrestore(&kptllnd_data.kptl_peer_rw_lock, flags);
/* Reserve n rx buffers in the global pool. */
477 kptllnd_reserve_buffers(int n)
479 return kptllnd_rx_buffer_pool_reserve(&kptllnd_data.kptl_rx_buffer_pool,
/* Reserve one peer's worth of rx buffers (kptl_peercredits). */
484 kptllnd_peer_reserve_buffers(void)
486 return kptllnd_reserve_buffers(*kptllnd_tunables.kptl_peercredits);
/* Return one peer's worth of rx buffer reservations to the pool. */
490 kptllnd_peer_unreserve_buffers(void)
492 kptllnd_rx_buffer_pool_unreserve(&kptllnd_data.kptl_rx_buffer_pool,
493 *kptllnd_tunables.kptl_peercredits);
497 * TX SUPPORT FUNCTIONS
499 int kptllnd_setup_tx_descs(void);
500 void kptllnd_cleanup_tx_descs(void);
501 void kptllnd_tx_fini(kptl_tx_t *tx);
502 kptl_tx_t *kptllnd_get_idle_tx(enum kptl_tx_type purpose);
503 void kptllnd_tx_callback(ptl_event_t *ev);
504 const char *kptllnd_tx_typestr(int type);
/* Take a reference on a tx descriptor. */
507 kptllnd_tx_addref(kptl_tx_t *tx)
509 atomic_inc(&tx->tx_refcount);
/* Drop a reference; thread context only (see LASSERT).  The final
 * reference presumably finalizes the tx (kptllnd_tx_fini) -- verify. */
513 kptllnd_tx_decref(kptl_tx_t *tx)
515 LASSERT (!in_interrupt()); /* Thread context only */
517 if (atomic_dec_and_test(&tx->tx_refcount))
522 * MESSAGE SUPPORT FUNCTIONS
524 void kptllnd_init_msg(kptl_msg_t *msg, int type, int body_nob);
525 void kptllnd_msg_pack(kptl_msg_t *msg, kptl_peer_t *peer);
526 int kptllnd_msg_unpack(kptl_msg_t *msg, int nob);
529 * MISC SUPPORT FUNCTIONS
531 void kptllnd_init_rdma_md(kptl_tx_t *tx, unsigned int niov,
532 struct iovec *iov, lnet_kiov_t *kiov,
533 unsigned int offset, unsigned int nob);
534 char *kptllnd_ptlid2str(ptl_process_id_t id);
536 void kptllnd_init_ptltrace(void);
537 void kptllnd_dump_ptltrace(void);
/* Failure-simulation bit numbers, tested against *kptl_simulation_bitmap. */
540 #define SIMULATION_FAIL_TX_PUT_ALLOC 0 /* 0x00000001 */
541 #define SIMULATION_FAIL_TX_GET_ALLOC 1 /* 0x00000002 */
542 #define SIMULATION_FAIL_TX 2 /* 0x00000004 */
543 #define SIMULATION_FAIL_RX_ALLOC 3 /* 0x00000008 */
/* True iff simulation bit SIMULATION_<x> is set in the tunable bitmap. */
545 #define IS_SIMULATION_ENABLED(x) \
546 (((*kptllnd_tunables.kptl_simulation_bitmap) & 1<< SIMULATION_##x) != 0)
/* Simulation compiled out: always disabled. */
548 #define IS_SIMULATION_ENABLED(x) 0