4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
20 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
21 * CA 95054 USA or visit www.sun.com if you need additional information or
27 * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
28 * Use is subject to license terms.
30 * Copyright (c) 2012, Intel Corporation.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lnet/ulnds/socklnd/usocklnd.h
38 * Author: Maxim Patlasov <maxim@clusterfs.com>
45 #include <lnet/lib-lnet.h>
46 #include <lnet/socklnd.h>
49 struct list_head tx_list; /* necessary to form tx list */
50 lnet_msg_t *tx_lnetmsg; /* lnet message for lnet_finalize() */
51 ksock_msg_t tx_msg; /* buffer for wire header of ksock msg */
52 int tx_resid; /* # of residual bytes */
53 int tx_nob; /* # of packet bytes */
54 int tx_size; /* size of this descriptor */
55 struct iovec *tx_iov; /* points to tx_iova[i] */
56 int tx_niov; /* # of packet iovec frags */
57 struct iovec tx_iova[1]; /* iov for header; NOTE(review): looks like a [1]-sized
                            * trailing-array hack sized via tx_size -- confirm allocator */
63 cfs_socket_t *uc_sock; /* socket */
64 int uc_type; /* conn type */
65 int uc_activeflag; /* active side of connection? */
66 int uc_flip; /* is peer other endian? */
67 int uc_state; /* connection state (UC_*) */
68 struct usock_peer_s *uc_peer; /* owning peer */
69 lnet_process_id_t uc_peerid; /* id of remote peer */
70 int uc_pt_idx; /* index in ud_pollthreads[] of
71 * owning poll thread */
72 lnet_ni_t *uc_ni; /* parent NI while accepting */
73 struct usock_preq_s *uc_preq; /* preallocated request */
74 __u32 uc_peer_ip; /* IP address of the peer */
75 __u16 uc_peer_port; /* port of the peer */
76 struct list_head uc_stale_list; /* orphaned connections */
79 int uc_rx_state; /* message or hello state (UC_RX_*) */
80 ksock_hello_msg_t *uc_rx_hello; /* hello buffer */
81 struct iovec *uc_rx_iov; /* points to uc_rx_iova[i] */
82 struct iovec uc_rx_iova[LNET_MAX_IOV]; /* message frags */
83 int uc_rx_niov; /* # frags */
84 int uc_rx_nob_left; /* # bytes to next hdr/body */
85 int uc_rx_nob_wanted; /* # of bytes actually wanted */
86 void *uc_rx_lnetmsg; /* LNET message being received */
87 cfs_time_t uc_rx_deadline; /* when to time out */
88 int uc_rx_flag; /* rx deadline valid? */
89 ksock_msg_t uc_rx_msg; /* message buffer */
92 struct list_head uc_tx_list; /* pending txs */
93 struct list_head uc_zcack_list; /* pending zc_acks */
94 cfs_time_t uc_tx_deadline; /* when to time out */
95 int uc_tx_flag; /* tx deadline valid? */
96 int uc_sending; /* send op is in progress */
97 usock_tx_t *uc_tx_hello; /* fake tx with hello */
99 mt_atomic_t uc_refcount; /* # of users */
100 pthread_mutex_t uc_lock; /* serialize */
101 int uc_errored; /* a flag for lnet_notify() */
104 /* Allowable conn states (values of uc_state) are: */
105 #define UC_CONNECTING 1
106 #define UC_SENDING_HELLO 2
107 #define UC_RECEIVING_HELLO 3
111 /* Allowable RX states (values of uc_rx_state) are: */
112 #define UC_RX_HELLO_MAGIC 1
113 #define UC_RX_HELLO_VERSION 2
114 #define UC_RX_HELLO_BODY 3
115 #define UC_RX_HELLO_IPS 4
116 #define UC_RX_KSM_HEADER 5
117 #define UC_RX_LNET_HEADER 6
118 #define UC_RX_PARSE 7
119 #define UC_RX_PARSE_WAIT 8
120 #define UC_RX_LNET_PAYLOAD 9
121 #define UC_RX_SKIPPING 10
123 #define N_CONN_TYPES 3 /* CONTROL, BULK_IN and BULK_OUT */
125 typedef struct usock_peer_s {
126 /* necessary to form peer list */
127 struct list_head up_list;
128 lnet_process_id_t up_peerid; /* id of remote peer */
129 usock_conn_t *up_conns[N_CONN_TYPES]; /* conns that connect
130 * us with the peer */
131 lnet_ni_t *up_ni; /* pointer to parent NI */
132 __u64 up_incarnation; /* peer's incarnation */
133 int up_incrn_is_set;/* 0 if peer's incarnation
134 * hasn't been set so far */
135 mt_atomic_t up_refcount; /* # of users */
136 pthread_mutex_t up_lock; /* serialize */
137 int up_errored; /* a flag for lnet_notify() */
138 cfs_time_t up_last_alive; /* when the peer was last alive */
142 cfs_socket_t *upt_notifier[2]; /* notifier sockets: 1st for
143 * writing, 2nd for reading */
144 struct pollfd *upt_pollfd; /* poll fds */
145 int upt_nfds; /* active poll fds */
146 int upt_npollfd; /* allocated poll fds */
147 usock_conn_t **upt_idx2conn; /* conns corresponding to
                                * upt_pollfd[] entries */
149 int *upt_skip; /* skip chain */
150 int *upt_fd2idx; /* index into upt_pollfd[],
                    * keyed by fd */
152 int upt_nfd2idx; /* # of allocated elements
                    * of upt_fd2idx[] */
154 struct list_head upt_stale_list; /* list of orphaned conns */
155 struct list_head upt_pollrequests; /* list of poll requests */
156 pthread_mutex_t upt_pollrequests_lock; /* serialize */
157 int upt_errno; /* non-zero if errored */
158 struct completion upt_completion; /* wait/signal facility for
159 * synchronizing shutdown */
160 } usock_pollthread_t;
162 /* Number of elements in upt_pollfd[], upt_idx2conn[] and upt_fd2idx[]
163 * at initialization time. Will be resized on demand */
164 #define UPT_START_SIZ 32
167 #define UD_PEER_HASH_SIZE 101 /* # of buckets in the ud_peers[] hash table */
170 int ud_state; /* initialization state (UD_STATE_*) */
171 int ud_npollthreads; /* # of poll threads */
172 usock_pollthread_t *ud_pollthreads; /* their state */
173 int ud_shutdown; /* shutdown flag */
174 int ud_nets_count; /* # of instances */
175 struct list_head ud_peers[UD_PEER_HASH_SIZE]; /* peer hash table */
176 pthread_rwlock_t ud_peers_lock; /* serialize */
179 extern usock_data_t usock_data; /* the module's global state */
181 /* ud_state allowed values */
182 #define UD_STATE_INIT_NOTHING 0
183 #define UD_STATE_INITIALIZED 1
186 int un_peercount; /* # of peers */
187 int un_shutdown; /* shutdown flag */
188 __u64 un_incarnation; /* my incarnation (epoch) */
189 pthread_cond_t un_cond; /* condvar to wait for notifications */
190 pthread_mutex_t un_lock; /* a lock to protect un_cond */
194 int ut_poll_timeout; /* the third arg for poll(2) (seconds) */
195 int ut_timeout; /* "stuck" socket timeout (seconds) */
196 int ut_npollthreads; /* number of poll threads to spawn */
197 int ut_fair_limit; /* how many packets can we receive or transmit
198 * without calling poll(2) */
199 int ut_min_bulk; /* smallest "large" message */
200 int ut_txcredits; /* # concurrent sends */
201 int ut_peertxcredits; /* # concurrent sends to 1 peer */
202 int ut_socknagle; /* Is Nagle alg on ? */
203 int ut_sockbufsiz; /* size of socket buffers */
206 extern usock_tunables_t usock_tuns; /* the module's global tunables */
208 typedef struct usock_preq_s {
209 int upr_type; /* type of requested action (POLL_*_REQUEST) */
210 short upr_value; /* bitmask of POLLIN and POLLOUT bits */
211 usock_conn_t * upr_conn; /* a conn for the sake of which
212 * action will be performed */
213 struct list_head upr_list; /* necessary to form list */
214 } usock_pollrequest_t;
216 /* Allowable poll request types are: */
217 #define POLL_ADD_REQUEST 1
218 #define POLL_DEL_REQUEST 2
219 #define POLL_RX_SET_REQUEST 3
220 #define POLL_TX_SET_REQUEST 4
221 #define POLL_SET_REQUEST 5
224 struct list_head zc_list; /* necessary to form zc_ack list */
225 __u64 zc_cookie; /* zero-copy cookie */
/* Reference-counting helpers for conns and peers (fragments; bodies gapped) */
229 usocklnd_conn_addref(usock_conn_t *conn)
231 LASSERT(mt_atomic_read(&conn->uc_refcount) > 0); /* caller must already hold a ref */
232 mt_atomic_inc(&conn->uc_refcount);
235 void usocklnd_destroy_conn(usock_conn_t *conn);
238 usocklnd_conn_decref(usock_conn_t *conn)
240 LASSERT(mt_atomic_read(&conn->uc_refcount) > 0);
241 if (mt_atomic_dec_and_test(&conn->uc_refcount))
242 usocklnd_destroy_conn(conn); /* last ref dropped */
246 usocklnd_peer_addref(usock_peer_t *peer)
248 LASSERT(mt_atomic_read(&peer->up_refcount) > 0); /* caller must already hold a ref */
249 mt_atomic_inc(&peer->up_refcount);
252 void usocklnd_destroy_peer(usock_peer_t *peer);
255 usocklnd_peer_decref(usock_peer_t *peer)
257 LASSERT(mt_atomic_read(&peer->up_refcount) > 0);
258 if (mt_atomic_dec_and_test(&peer->up_refcount))
259 usocklnd_destroy_peer(peer); /* last ref dropped */
263 usocklnd_ip2pt_idx(__u32 ip) {
264 return ip % usock_data.ud_npollthreads; /* distribute conns over poll threads by IP */
267 static inline struct list_head *
268 usocklnd_nid2peerlist(lnet_nid_t nid)
270 unsigned int hash = ((unsigned int)nid) % UD_PEER_HASH_SIZE;
272 return &usock_data.ud_peers[hash]; /* bucket of the ud_peers[] hash table */
/* LNET API entry points: NI startup/shutdown, send/recv, accept */
275 int usocklnd_startup(lnet_ni_t *ni);
276 void usocklnd_shutdown(lnet_ni_t *ni);
277 int usocklnd_send(lnet_ni_t *ni, void *private, lnet_msg_t *lntmsg);
278 int usocklnd_recv(lnet_ni_t *ni, void *private, lnet_msg_t *msg, int delayed,
279 unsigned int niov, struct iovec *iov, lnet_kiov_t *kiov,
280 unsigned int offset, unsigned int mlen, unsigned int rlen);
281 int usocklnd_accept(lnet_ni_t *ni, cfs_socket_t *sock);
/* Poll-thread machinery and poll-request processing */
283 int usocklnd_poll_thread(void *arg);
284 int usocklnd_add_pollrequest(usock_conn_t *conn, int type, short value);
285 void usocklnd_add_killrequest(usock_conn_t *conn);
286 int usocklnd_process_pollrequest(usock_pollrequest_t *pr,
287 usock_pollthread_t *pt_data);
288 void usocklnd_execute_handlers(usock_pollthread_t *pt_data);
289 int usocklnd_calculate_chunk_size(int num);
290 void usocklnd_wakeup_pollthread(int i);
/* Event handlers: RX/TX processing and the hello handshake */
292 int usocklnd_notifier_handler(int fd);
293 void usocklnd_exception_handler(usock_conn_t *conn);
294 int usocklnd_read_handler(usock_conn_t *conn);
295 int usocklnd_read_msg(usock_conn_t *conn, int *cont_flag);
296 int usocklnd_handle_zc_req(usock_peer_t *peer, __u64 cookie);
297 int usocklnd_read_hello(usock_conn_t *conn, int *cont_flag);
298 int usocklnd_activeconn_hellorecv(usock_conn_t *conn);
299 int usocklnd_passiveconn_hellorecv(usock_conn_t *conn);
300 int usocklnd_write_handler(usock_conn_t *conn);
301 usock_tx_t *usocklnd_try_piggyback(struct list_head *tx_list_p,
302 struct list_head *zcack_list_p);
303 int usocklnd_activeconn_hellosent(usock_conn_t *conn);
304 int usocklnd_passiveconn_hellosent(usock_conn_t *conn);
305 int usocklnd_send_tx(usock_conn_t *conn, usock_tx_t *tx);
306 int usocklnd_read_data(usock_conn_t *conn);
/* Module lifecycle and conn/peer teardown helpers.
 * Fix: no-argument functions are declared (void) so the compiler checks
 * call sites; an empty () declares "unspecified parameters" in C89-C11
 * and silently disables argument checking. */
308 void usocklnd_release_poll_states(int n);
309 int usocklnd_base_startup(void);
310 void usocklnd_base_shutdown(int n);
311 __u64 usocklnd_new_incarnation(void);
312 void usocklnd_del_all_peers(lnet_ni_t *ni);
313 void usocklnd_del_peer_and_conns(usock_peer_t *peer);
314 void usocklnd_del_conns_locked(usock_peer_t *peer);
316 int usocklnd_conn_timed_out(usock_conn_t *conn, cfs_time_t current_time);
317 void usocklnd_conn_kill(usock_conn_t *conn);
318 void usocklnd_conn_kill_locked(usock_conn_t *conn);
319 usock_conn_t *usocklnd_conn_allocate(void);
320 void usocklnd_conn_free(usock_conn_t *conn);
321 void usocklnd_tear_peer_conn(usock_conn_t *conn);
322 void usocklnd_check_peer_stale(lnet_ni_t *ni, lnet_process_id_t id);
/* Connection establishment (active/passive) and socket options */
323 int usocklnd_create_passive_conn(lnet_ni_t *ni,
324 cfs_socket_t *sock, usock_conn_t **connp);
325 int usocklnd_create_active_conn(usock_peer_t *peer, int type,
326 usock_conn_t **connp);
327 int usocklnd_connect_srv_mode(cfs_socket_t **sockp,
328 __u32 dst_ip, __u16 dst_port);
329 int usocklnd_connect_cli_mode(cfs_socket_t **sockp,
330 __u32 dst_ip, __u16 dst_port);
331 int usocklnd_set_sock_options(cfs_socket_t *sock);
/* tx descriptor construction and destruction */
332 usock_tx_t *usocklnd_create_noop_tx(__u64 cookie);
333 usock_tx_t *usocklnd_create_tx(lnet_msg_t *lntmsg);
334 void usocklnd_init_hello_msg(ksock_hello_msg_t *hello,
335 lnet_ni_t *ni, int type, lnet_nid_t peer_nid);
336 usock_tx_t *usocklnd_create_hello_tx(lnet_ni_t *ni,
337 int type, lnet_nid_t peer_nid);
338 usock_tx_t *usocklnd_create_cr_hello_tx(lnet_ni_t *ni,
339 int type, lnet_nid_t peer_nid);
340 void usocklnd_destroy_tx(lnet_ni_t *ni, usock_tx_t *tx);
341 void usocklnd_destroy_txlist(lnet_ni_t *ni, struct list_head *txlist);
342 void usocklnd_destroy_zcack_list(struct list_head *zcack_list);
343 void usocklnd_destroy_peer (usock_peer_t *peer);
/* Peer/conn lookup, creation and linkage */
344 int usocklnd_get_conn_type(lnet_msg_t *lntmsg);
345 int usocklnd_type2idx(int type);
346 usock_peer_t *usocklnd_find_peer_locked(lnet_ni_t *ni, lnet_process_id_t id);
347 int usocklnd_create_peer(lnet_ni_t *ni, lnet_process_id_t id,
348 usock_peer_t **peerp);
349 int usocklnd_find_or_create_peer(lnet_ni_t *ni, lnet_process_id_t id,
350 usock_peer_t **peerp);
351 int usocklnd_find_or_create_conn(usock_peer_t *peer, int type,
352 usock_conn_t **connp,
353 usock_tx_t *tx, usock_zc_ack_t *zc_ack,
354 int *send_immediately_flag);
355 void usocklnd_link_conn_to_peer(usock_conn_t *conn, usock_peer_t *peer, int idx);
356 int usocklnd_invert_type(int type);
357 void usocklnd_conn_new_state(usock_conn_t *conn, int new_state);
358 void usocklnd_cleanup_stale_conns(usock_peer_t *peer, __u64 incrn,
359 usock_conn_t *skip_conn);
/* RX state-machine transition handlers (see UC_RX_* states) */
361 void usocklnd_rx_hellomagic_state_transition(usock_conn_t *conn);
362 void usocklnd_rx_helloversion_state_transition(usock_conn_t *conn);
363 void usocklnd_rx_hellobody_state_transition(usock_conn_t *conn);
364 void usocklnd_rx_helloIPs_state_transition(usock_conn_t *conn);
365 void usocklnd_rx_lnethdr_state_transition(usock_conn_t *conn);
366 void usocklnd_rx_ksmhdr_state_transition(usock_conn_t *conn);
367 void usocklnd_rx_skipping_state_transition(usock_conn_t *conn);