2 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
3 * vim:expandtab:shiftwidth=8:tabstop=8:
5 * Copyright (C) 2005 Cluster File Systems, Inc. All rights reserved.
6 * Author: Eric Barton <eeb@bartonsoftware.com>
8 * This file is part of the Lustre file system, http://www.lustre.org
9 * Lustre is a trademark of Cluster File Systems, Inc.
11 * This file is confidential source code owned by Cluster File Systems.
12 * No viewing, modification, compilation, redistribution, or any other
13 * form of use is permitted except through a signed license agreement.
15 * If you have not signed such an agreement, then you have no rights to
16 * this file. Please destroy it immediately and contact CFS.
// LND callback table entries: each .lnd_* slot is bound to the ptllnd_*
// handler of the same name. ptllnd_startup, ptllnd_shutdown and ptllnd_ctl
// appear later in this listing; the other handlers are not shown here.
// NOTE(review): the line that opens this initializer and its closing brace
// are missing from this listing -- confirm against the full file.
24 .lnd_startup = ptllnd_startup,
25 .lnd_shutdown = ptllnd_shutdown,
26 .lnd_ctl = ptllnd_ctl,
27 .lnd_send = ptllnd_send,
28 .lnd_recv = ptllnd_recv,
29 .lnd_eager_recv = ptllnd_eager_recv,
30 .lnd_notify = ptllnd_notify,
31 .lnd_wait = ptllnd_wait,
// Instance counter for this LND. ptllnd_startup() refuses to run while it is
// non-zero and ptllnd_shutdown() asserts that it is exactly 1.
// NOTE(review): the statements that increment/decrement it are not present
// in this listing.
34 static int ptllnd_ni_count = 0;
// Pins the ptllnd wire protocol with CLASSERT checks (presumably a
// compile-time assertion macro -- it is not defined in this listing): the
// protocol constants first, then the size and field offsets of kptl_msg_t
// and of its immediate / rdma / hello payload structures.
// Takes no arguments; every visible statement is a CLASSERT.
37 ptllnd_assert_wire_constants (void)
39 /* Wire protocol assertions generated by 'wirecheck'
40 * running on Linux fedora 2.6.11-co-0.6.4 #1 Mon Jun 19 05:36:13 UTC 2006 i686 i686 i386 GNU
41 * with gcc version 4.1.1 20060525 (Red Hat 4.1.1-1) */
45 CLASSERT (PTL_RESERVED_MATCHBITS == 0x100);
46 CLASSERT (LNET_MSG_MATCHBITS == 0);
47 CLASSERT (PTLLND_MSG_MAGIC == 0x50746C4E);
48 CLASSERT (PTLLND_MSG_VERSION == 0x04);
49 CLASSERT (PTLLND_RDMA_OK == 0x00);
50 CLASSERT (PTLLND_RDMA_FAIL == 0x01);
51 CLASSERT (PTLLND_MSG_TYPE_INVALID == 0x00);
52 CLASSERT (PTLLND_MSG_TYPE_PUT == 0x01);
53 CLASSERT (PTLLND_MSG_TYPE_GET == 0x02);
54 CLASSERT (PTLLND_MSG_TYPE_IMMEDIATE == 0x03);
55 CLASSERT (PTLLND_MSG_TYPE_NOOP == 0x04);
56 CLASSERT (PTLLND_MSG_TYPE_HELLO == 0x05);
57 CLASSERT (PTLLND_MSG_TYPE_NAK == 0x06);
59 /* Checks for struct kptl_msg_t */
60 CLASSERT ((int)sizeof(kptl_msg_t) == 136);
61 CLASSERT ((int)offsetof(kptl_msg_t, ptlm_magic) == 0);
62 CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_magic) == 4);
63 CLASSERT ((int)offsetof(kptl_msg_t, ptlm_version) == 4);
64 CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_version) == 2);
65 CLASSERT ((int)offsetof(kptl_msg_t, ptlm_type) == 6);
66 CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_type) == 1);
67 CLASSERT ((int)offsetof(kptl_msg_t, ptlm_credits) == 7);
68 CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_credits) == 1);
69 CLASSERT ((int)offsetof(kptl_msg_t, ptlm_nob) == 8);
70 CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_nob) == 4);
71 CLASSERT ((int)offsetof(kptl_msg_t, ptlm_cksum) == 12);
72 CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_cksum) == 4);
73 CLASSERT ((int)offsetof(kptl_msg_t, ptlm_srcnid) == 16);
74 CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_srcnid) == 8);
75 CLASSERT ((int)offsetof(kptl_msg_t, ptlm_srcstamp) == 24);
76 CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_srcstamp) == 8);
77 CLASSERT ((int)offsetof(kptl_msg_t, ptlm_dstnid) == 32);
78 CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_dstnid) == 8);
79 CLASSERT ((int)offsetof(kptl_msg_t, ptlm_dststamp) == 40);
80 CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_dststamp) == 8);
81 CLASSERT ((int)offsetof(kptl_msg_t, ptlm_srcpid) == 48);
82 CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_srcpid) == 4);
83 CLASSERT ((int)offsetof(kptl_msg_t, ptlm_dstpid) == 52);
84 CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_dstpid) == 4);
// The three ptlm_u members all start at offset 56. The rdma member (80 bytes)
// is the largest, which accounts for the 136-byte total (56 + 80).
85 CLASSERT ((int)offsetof(kptl_msg_t, ptlm_u.immediate) == 56);
86 CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_u.immediate) == 72);
87 CLASSERT ((int)offsetof(kptl_msg_t, ptlm_u.rdma) == 56);
88 CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_u.rdma) == 80);
89 CLASSERT ((int)offsetof(kptl_msg_t, ptlm_u.hello) == 56);
90 CLASSERT ((int)sizeof(((kptl_msg_t *)0)->ptlm_u.hello) == 12);
92 /* Checks for struct kptl_immediate_msg_t */
93 CLASSERT ((int)sizeof(kptl_immediate_msg_t) == 72);
94 CLASSERT ((int)offsetof(kptl_immediate_msg_t, kptlim_hdr) == 0);
95 CLASSERT ((int)sizeof(((kptl_immediate_msg_t *)0)->kptlim_hdr) == 72);
// Payload elements are 1 byte each; element 13 sits 13 bytes past the
// 72-byte header (72 + 13 = 85).
96 CLASSERT ((int)offsetof(kptl_immediate_msg_t, kptlim_payload[13]) == 85);
97 CLASSERT ((int)sizeof(((kptl_immediate_msg_t *)0)->kptlim_payload[13]) == 1);
99 /* Checks for struct kptl_rdma_msg_t */
100 CLASSERT ((int)sizeof(kptl_rdma_msg_t) == 80);
101 CLASSERT ((int)offsetof(kptl_rdma_msg_t, kptlrm_hdr) == 0);
102 CLASSERT ((int)sizeof(((kptl_rdma_msg_t *)0)->kptlrm_hdr) == 72);
103 CLASSERT ((int)offsetof(kptl_rdma_msg_t, kptlrm_matchbits) == 72);
104 CLASSERT ((int)sizeof(((kptl_rdma_msg_t *)0)->kptlrm_matchbits) == 8);
106 /* Checks for struct kptl_hello_msg_t */
107 CLASSERT ((int)sizeof(kptl_hello_msg_t) == 12);
108 CLASSERT ((int)offsetof(kptl_hello_msg_t, kptlhm_matchbits) == 0);
109 CLASSERT ((int)sizeof(((kptl_hello_msg_t *)0)->kptlhm_matchbits) == 8);
110 CLASSERT ((int)offsetof(kptl_hello_msg_t, kptlhm_max_msg_size) == 8);
111 CLASSERT ((int)sizeof(((kptl_hello_msg_t *)0)->kptlhm_max_msg_size) == 4);
// Read an integer tunable from the process environment.
//   value - receives the parsed number
//   name  - environment variable looked up with getenv()
//   dflt  - fallback value; NOTE(review): the line that uses it is missing
//           from this listing (presumably taken when the variable is unset)
// The CERROR at the end reports a variable whose text could not be parsed.
// NOTE(review): the return statements are not visible here; callers store
// the result in `rc`.
115 ptllnd_parse_int_tunable(int *value, char *name, int dflt)
117 char *env = getenv(name);
// Base 0 lets strtoull() accept decimal, octal (0...) and hex (0x...) text.
// NOTE(review): strtoull() yields an unsigned long long that is stored into
// an int here, so out-of-range input is narrowed without any diagnostic.
125 *value = strtoull(env, &end, 0);
129 CERROR("Can't parse tunable %s=%s\n", name, env);
// Load the tunables for `ni` into the ptllnd_ni_t found at ni->ni_data.
// Each value comes from ptllnd_parse_int_tunable(), given an environment
// variable name and a compiled-in default; the message and buffer sizes are
// then derived and everything is logged at D_NET.
// NOTE(review): the checks of `rc` after each call and the function's return
// are on lines missing from this listing.
134 ptllnd_get_tunables(lnet_ni_t *ni)
136 ptllnd_ni_t *plni = ni->ni_data;
142 rc = ptllnd_parse_int_tunable(&plni->plni_portal,
143 "PTLLND_PORTAL", PTLLND_PORTAL);
// The PID is parsed into an int temporary and then cast to ptl_pid_t.
147 rc = ptllnd_parse_int_tunable(&temp,
148 "PTLLND_PID", PTLLND_PID);
151 plni->plni_ptllnd_pid = (ptl_pid_t)temp;
153 rc = ptllnd_parse_int_tunable(&plni->plni_peer_credits,
154 "PTLLND_PEERCREDITS", PTLLND_PEERCREDITS);
158 rc = ptllnd_parse_int_tunable(&max_msg_size,
159 "PTLLND_MAX_MSG_SIZE",
160 PTLLND_MAX_MSG_SIZE);
164 rc = ptllnd_parse_int_tunable(&msgs_per_buffer,
165 "PTLLND_MSGS_PER_BUFFER",
166 PTLLND_MSGS_PER_BUFFER);
// NOTE(review): the variable name and default passed for plni_msgs_spare are
// on lines missing from this listing.
170 rc = ptllnd_parse_int_tunable(&plni->plni_msgs_spare,
176 rc = ptllnd_parse_int_tunable(&plni->plni_peer_hash_size,
177 "PTLLND_PEER_HASH_SIZE",
178 PTLLND_PEER_HASH_SIZE);
183 rc = ptllnd_parse_int_tunable(&plni->plni_eq_size,
184 "PTLLND_EQ_SIZE", PTLLND_EQ_SIZE);
// The default for PTLLND_CHECKSUM is the literal 0 rather than a named macro.
188 rc = ptllnd_parse_int_tunable(&plni->plni_checksum,
189 "PTLLND_CHECKSUM", 0);
193 rc = ptllnd_parse_int_tunable(&plni->plni_max_tx_history,
194 "PTLLND_TX_HISTORY", PTLLND_TX_HISTORY);
198 rc = ptllnd_parse_int_tunable(&plni->plni_abort_on_nak,
199 "PTLLND_ABORT_ON_NAK",
200 PTLLND_ABORT_ON_NAK);
// Round the requested size down to a multiple of 8. If that is smaller than
// a kptl_msg_t, use sizeof(kptl_msg_t) rounded up to a multiple of 8 instead.
// NOTE(review): the comparison is against a size_t; if plni_max_msg_size is a
// signed int (its declaration is not in this listing) a negative tunable
// converts to a huge unsigned value and bypasses this minimum -- confirm.
204 plni->plni_max_msg_size = max_msg_size & ~7;
205 if (plni->plni_max_msg_size < sizeof(kptl_msg_t))
206 plni->plni_max_msg_size = (sizeof(kptl_msg_t) + 7) & ~7;
// NOTE(review): msgs_per_buffer is not range-checked in the visible lines;
// ptllnd_grow_buffers() later divides by plni_buffer_size.
208 plni->plni_buffer_size = plni->plni_max_msg_size * msgs_per_buffer;
210 CDEBUG(D_NET, "portal = %d\n",plni->plni_portal);
211 CDEBUG(D_NET, "ptllnd_pid = %d\n",plni->plni_ptllnd_pid);
// "max_msg_size" is logged twice: the raw tunable here and the rounded
// per-NI value a few lines below.
212 CDEBUG(D_NET, "max_msg_size = %d\n",max_msg_size);
213 CDEBUG(D_NET, "msgs_per_buffer = %d\n",msgs_per_buffer);
214 CDEBUG(D_NET, "msgs_spare = %d\n",plni->plni_msgs_spare);
215 CDEBUG(D_NET, "peer_hash_size = %d\n",plni->plni_peer_hash_size);
216 CDEBUG(D_NET, "eq_size = %d\n",plni->plni_eq_size);
217 CDEBUG(D_NET, "max_msg_size = %d\n",plni->plni_max_msg_size);
218 CDEBUG(D_NET, "buffer_size = %d\n",plni->plni_buffer_size);
// Allocate a buffer descriptor plus its plni_buffer_size-byte data area,
// link the descriptor onto plni->plni_buffers and increment plni_nbuffers.
// If the data area cannot be allocated the descriptor is freed again.
// NOTE(review): the return statements and any further field initialisation
// (e.g. plb_ni and plb_posted, which ptllnd_destroy_buffer() reads) are on
// lines missing from this listing.
224 ptllnd_create_buffer (lnet_ni_t *ni)
226 ptllnd_ni_t *plni = ni->ni_data;
227 ptllnd_buffer_t *buf;
229 LIBCFS_ALLOC(buf, sizeof(*buf));
231 CERROR("Can't allocate buffer descriptor\n");
237 CFS_INIT_LIST_HEAD(&buf->plb_list);
239 LIBCFS_ALLOC(buf->plb_buffer, plni->plni_buffer_size);
240 if (buf->plb_buffer == NULL) {
241 CERROR("Can't allocate buffer size %d\n",
242 plni->plni_buffer_size);
// Undo the descriptor allocation so nothing is left behind on this path.
243 LIBCFS_FREE(buf, sizeof(*buf));
247 list_add(&buf->plb_list, &plni->plni_buffers);
248 plni->plni_nbuffers++;
// Release one buffer: decrement plni_nbuffers, unlink `buf` from its list and
// free both the data area and the descriptor. The per-NI state is reached
// through buf->plb_ni->ni_data. The buffer must not be posted (asserted).
254 ptllnd_destroy_buffer (ptllnd_buffer_t *buf)
256 ptllnd_ni_t *plni = buf->plb_ni->ni_data;
258 LASSERT (!buf->plb_posted);
260 plni->plni_nbuffers--;
261 list_del(&buf->plb_list);
// Freed with the same size expression that ptllnd_create_buffer() allocates.
262 LIBCFS_FREE(buf->plb_buffer, plni->plni_buffer_size);
263 LIBCFS_FREE(buf, sizeof(*buf));
// Create and post buffers until there are enough of them to hold
//     nmsgs = plni_npeers * plni_peer_credits + plni_msgs_spare
// messages of plni_max_msg_size bytes each. Buffer counts are logged at
// D_NET before and after.
// NOTE(review): the checks on the results of ptllnd_create_buffer() and
// ptllnd_post_buffer(), and the return, are on lines missing from this
// listing.
267 ptllnd_grow_buffers (lnet_ni_t *ni)
269 ptllnd_ni_t *plni = ni->ni_data;
270 ptllnd_buffer_t *buf;
275 CDEBUG(D_NET, "nposted_buffers = %d (before)\n",plni->plni_nposted_buffers);
276 CDEBUG(D_NET, "nbuffers = %d (before)\n",plni->plni_nbuffers);
278 nmsgs = plni->plni_npeers * plni->plni_peer_credits +
279 plni->plni_msgs_spare;
// Ceiling division: how many plni_buffer_size-byte buffers cover nmsgs
// messages.
281 nbufs = (nmsgs * plni->plni_max_msg_size + plni->plni_buffer_size - 1) /
282 plni->plni_buffer_size;
284 while (nbufs > plni->plni_nbuffers) {
285 buf = ptllnd_create_buffer(ni);
290 rc = ptllnd_post_buffer(buf);
292 /* TODO - this path seems to orpahn the buffer
293 * in a state where its not posted and will never be
294 * However it does not leak the buffer as it's
295 * already been put onto the global buffer list
296 * and will be cleaned up
302 CDEBUG(D_NET, "nposted_buffers = %d (after)\n",plni->plni_nposted_buffers);
303 CDEBUG(D_NET, "nbuffers = %d (after)\n",plni->plni_nbuffers);
// Tear down every buffer on plni->plni_buffers. A buffer that is still posted
// is first unlinked with PtlMDUnlink(), polling ptllnd_wait() while
// plb_posted remains set, and is then released with ptllnd_destroy_buffer().
// Afterwards both plni_nposted_buffers and plni_nbuffers must be 0 (asserted).
308 ptllnd_destroy_buffers (lnet_ni_t *ni)
310 ptllnd_ni_t *plni = ni->ni_data;
311 ptllnd_buffer_t *buf;
312 struct list_head *tmp;
313 struct list_head *nxt;
315 CDEBUG(D_NET, "nposted_buffers = %d (before)\n",plni->plni_nposted_buffers);
316 CDEBUG(D_NET, "nbuffers = %d (before)\n",plni->plni_nbuffers);
// The _safe iterator is needed because ptllnd_destroy_buffer() removes the
// current entry from the list.
318 list_for_each_safe(tmp, nxt, &plni->plni_buffers) {
319 buf = list_entry(tmp, ptllnd_buffer_t, plb_list);
321 //CDEBUG(D_NET, "buf=%p posted=%d\n",buf,buf->plb_posted);
323 LASSERT (plni->plni_nbuffers > 0);
324 if (buf->plb_posted) {
325 time_t start = cfs_time_current_sec();
326 int w = PTLLND_WARN_LONG_WAIT;
328 LASSERT (plni->plni_nposted_buffers > 0);
// Two unlink strategies follow, selected by LUSTRE_PORTALS_UNLINK_SEMANTICS:
// the first issues one PtlMDUnlink() and then waits for plb_posted to clear;
// the second calls PtlMDUnlink() repeatedly inside the wait loop.
// NOTE(review): the #else/#endif lines and the conditions around the
// decrement and the PTL_MD_IN_USE assertion are missing from this listing.
330 #ifdef LUSTRE_PORTALS_UNLINK_SEMANTICS
331 (void) PtlMDUnlink(buf->plb_md);
333 while (buf->plb_posted) {
// Warn once the wait has lasted longer than w seconds.
334 if (cfs_time_current_sec() > start + w) {
335 CWARN("Waited %ds to unlink buffer\n", w);
// w*1000 is presumably a timeout in milliseconds -- confirm against
// ptllnd_wait(), which is not in this listing.
338 ptllnd_wait(ni, w*1000);
341 while (buf->plb_posted) {
342 rc = PtlMDUnlink(buf->plb_md);
345 plni->plni_nposted_buffers--;
348 LASSERT (rc == PTL_MD_IN_USE);
349 if (cfs_time_current_sec() > start + w) {
350 CWARN("Waited %ds to unlink buffer\n", w);
353 ptllnd_wait(ni, w*1000);
357 ptllnd_destroy_buffer(buf);
360 CDEBUG(D_NET, "nposted_buffers = %d (after)\n",plni->plni_nposted_buffers);
361 CDEBUG(D_NET, "nbuffers = %d (after)\n",plni->plni_nbuffers);
363 LASSERT (plni->plni_nposted_buffers == 0);
364 LASSERT (plni->plni_nbuffers == 0);
// Allocate the peer hash table: an array of plni_peer_hash_size list heads,
// each initialised empty. The peer count is reset to 0 first. An error is
// logged if the allocation fails.
// NOTE(review): the return statements are on lines missing from this
// listing, and plni_peer_hash_size is not range-checked in the visible lines.
368 ptllnd_create_peer_hash (lnet_ni_t *ni)
370 ptllnd_ni_t *plni = ni->ni_data;
373 plni->plni_npeers = 0;
375 LIBCFS_ALLOC(plni->plni_peer_hash,
376 plni->plni_peer_hash_size * sizeof(*plni->plni_peer_hash));
377 if (plni->plni_peer_hash == NULL) {
378 CERROR("Can't allocate ptllnd peer hash (size %d)\n",
379 plni->plni_peer_hash_size);
383 for (i = 0; i < plni->plni_peer_hash_size; i++)
384 CFS_INIT_LIST_HEAD(&plni->plni_peer_hash[i]);
// Free the peer hash table. All peers must already be gone: plni_npeers is
// asserted to be 0 and every bucket is asserted to be empty before the array
// is released with the same size expression used to allocate it.
390 ptllnd_destroy_peer_hash (lnet_ni_t *ni)
392 ptllnd_ni_t *plni = ni->ni_data;
395 LASSERT( plni->plni_npeers == 0);
397 for (i = 0; i < plni->plni_peer_hash_size; i++)
398 LASSERT (list_empty(&plni->plni_peer_hash[i]));
400 LIBCFS_FREE(plni->plni_peer_hash,
401 plni->plni_peer_hash_size * sizeof(*plni->plni_peer_hash));
// Close every known peer: for each hash bucket, repeatedly take the first
// entry and hand it to ptllnd_close_peer(plp, 0) until the bucket is empty.
// NOTE(review): the inner loop only terminates if ptllnd_close_peer()
// removes the peer from its bucket -- that function is not in this listing.
405 ptllnd_close_peers (lnet_ni_t *ni)
407 ptllnd_ni_t *plni = ni->ni_data;
411 for (i = 0; i < plni->plni_peer_hash_size; i++)
412 while (!list_empty(&plni->plni_peer_hash[i])) {
413 plp = list_entry(plni->plni_peer_hash[i].next,
414 ptllnd_peer_t, plp_list);
416 ptllnd_close_peer(plp, 0);
// LND control entry point, dispatching on `cmd`. The only command visible in
// this listing is IOC_LIBCFS_DEBUG_PEER, for which `arg` is read as a
// lnet_process_id_t and passed by value to ptllnd_debug_peer().
// NOTE(review): the switch header, any other cases and the return values are
// on lines missing from this listing.
421 ptllnd_ctl(lnet_ni_t *ni, unsigned int cmd, void *arg)
424 case IOC_LIBCFS_DEBUG_PEER:
425 ptllnd_debug_peer(ni, *((lnet_process_id_t *)arg));
// Return the current gettimeofday() time as a count of microseconds
// (tv_sec * 1000000 + tv_usec). The __u64 cast makes the multiplication
// 64-bit before the microsecond part is added.
// NOTE(review): whatever is done with gettimeofday()'s result `rc` is on a
// line missing from this listing.
434 ptllnd_get_timestamp(void)
437 int rc = gettimeofday(&tv, NULL);
440 return ((__u64)tv.tv_sec) * 1000000 + tv.tv_usec;
// Shut down the single ptllnd instance attached to `ni` and free its state.
// Order: cull the tx history, destroy the buffers, close the peers and wait
// for the peer count to reach 0, free the event queue, finalise the Portals
// NI, free the peer hash, then free the ptllnd_ni_t itself.
// NOTE(review): any update of ptllnd_ni_count is on lines missing from this
// listing.
444 ptllnd_shutdown (lnet_ni_t *ni)
446 ptllnd_ni_t *plni = ni->ni_data;
448 time_t start = cfs_time_current_sec();
449 int w = PTLLND_WARN_LONG_WAIT;
451 LASSERT (ptllnd_ni_count == 1);
// The history limit is zeroed before culling -- presumably so that the cull
// discards every retained tx; confirm against ptllnd_cull_tx_history().
452 plni->plni_max_tx_history = 0;
454 ptllnd_cull_tx_history(plni);
456 ptllnd_destroy_buffers(ni);
457 ptllnd_close_peers(ni);
// Poll until every peer is gone, warning once the wait exceeds w seconds.
459 while (plni->plni_npeers > 0) {
460 if (cfs_time_current_sec() > start + w) {
461 CWARN("Waited %ds for peers to shutdown\n", w);
464 ptllnd_wait(ni, w*1000);
// By now no tx or rx descriptors may remain outstanding.
467 LASSERT (plni->plni_ntxs == 0);
468 LASSERT (plni->plni_nrxs == 0);
470 rc = PtlEQFree(plni->plni_eqh);
471 LASSERT (rc == PTL_OK);
473 rc = PtlNIFini(plni->plni_nih);
474 LASSERT (rc == PTL_OK);
476 ptllnd_destroy_peer_hash(ni);
477 LIBCFS_FREE(plni, sizeof(*plni));
// Bring up the ptllnd instance for `ni`: allocate and initialise the
// ptllnd_ni_t, read the tunables, create the peer hash, initialise the
// Portals NI and event queue, derive ni->ni_nid from the Portals id and
// create/post the initial buffers. Only one instance is allowed at a time.
// The trailing calls (ptllnd_destroy_buffers ... LIBCFS_FREE) release
// resources in the reverse of the order they were acquired above.
// NOTE(review): the goto labels, the error-code assignments, the rc checks
// after most calls and the return statements are on lines missing from this
// listing.
482 ptllnd_startup (lnet_ni_t *ni)
487 /* could get limits from portals I guess... */
// Chained assignment: both credit limits are set to 1000.
488 ni->ni_maxtxcredits =
489 ni->ni_peertxcredits = 1000;
491 if (ptllnd_ni_count != 0) {
492 CERROR("Can't have > 1 instance of ptllnd\n");
498 LIBCFS_ALLOC(plni, sizeof(*plni));
500 CERROR("Can't allocate ptllnd state\n");
507 plni->plni_stamp = ptllnd_get_timestamp();
510 plni->plni_ntx_history = 0;
511 CFS_INIT_LIST_HEAD(&plni->plni_zombie_txs);
512 CFS_INIT_LIST_HEAD(&plni->plni_tx_history);
515 * Initilize buffer related data structures
517 CFS_INIT_LIST_HEAD(&plni->plni_buffers);
518 plni->plni_nbuffers = 0;
519 plni->plni_nposted_buffers = 0;
521 rc = ptllnd_get_tunables(ni);
525 rc = ptllnd_create_peer_hash(ni);
529 /* NB I most probably won't get the PID I requested here. It doesn't
530 * matter because I don't need a fixed PID (only connection acceptors
531 * need a "well known" PID). */
533 rc = PtlNIInit(PTL_IFACE_DEFAULT, plni->plni_ptllnd_pid,
534 NULL, NULL, &plni->plni_nih);
// PTL_IFACE_DUP is accepted as well as PTL_OK.
535 if (rc != PTL_OK && rc != PTL_IFACE_DUP) {
536 CERROR("PtlNIInit failed: %d\n", rc);
541 rc = PtlEQAlloc(plni->plni_nih, plni->plni_eq_size,
542 PTL_EQ_HANDLER_NONE, &plni->plni_eqh);
544 CERROR("PtlEQAlloc failed: %d\n", rc);
550 * Fetch the Portals NID
// NOTE(review): this compares the existing `rc` (apparently left over from
// PtlEQAlloc) with PtlGetId()'s result instead of assigning that result, so
// the CERROR below prints the old rc rather than the PtlGetId() status.
// Likely intent:  rc = PtlGetId(...); if (rc != PTL_OK) { ... }
552 if(rc != PtlGetId(plni->plni_nih,&plni->plni_portals_id)){
553 CERROR ("PtlGetID failed : %d\n", rc);
558 CDEBUG(D_NET, "lnet nid=" LPX64 " (passed in)\n",ni->ni_nid);
561 * Create the new NID. Based on the LND network type
562 * and the lower ni's address data.
564 ni->ni_nid = ptllnd_ptl2lnetnid(ni, plni->plni_portals_id.nid);
566 CDEBUG(D_NET, "ptl id =%s\n", ptllnd_ptlid2str(plni->plni_portals_id));
// A compound literal builds a temporary lnet_process_id_t just for logging.
567 CDEBUG(D_NET, "lnet id =%s (passed back)\n",
568 libcfs_id2str((lnet_process_id_t) {
569 .nid = ni->ni_nid, .pid = the_lnet.ln_pid}));
571 rc = ptllnd_grow_buffers(ni);
// Unwind sequence: buffers and event queue, Portals NI, peer hash, then the
// ptllnd_ni_t itself.
578 ptllnd_destroy_buffers(ni);
579 PtlEQFree(plni->plni_eqh);
581 PtlNIFini(plni->plni_nih);
583 ptllnd_destroy_peer_hash(ni);
585 LIBCFS_FREE(plni, sizeof(*plni));
588 CDEBUG(D_NET, "<<< rc=%d\n",rc);
// Map a Portals event type code to its symbolic name as a string literal.
// DO_TYPE(x) expands to `case x: return "x";`, using the preprocessor's
// stringification operator (#x) so each name is spelled only once.
// NOTE(review): the switch header, the fallback for unknown values and any
// #undef of DO_TYPE are on lines missing from this listing.
592 const char *ptllnd_evtype2str(int type)
594 #define DO_TYPE(x) case x: return #x;
597 DO_TYPE(PTL_EVENT_GET_START);
598 DO_TYPE(PTL_EVENT_GET_END);
599 DO_TYPE(PTL_EVENT_PUT_START);
600 DO_TYPE(PTL_EVENT_PUT_END);
601 DO_TYPE(PTL_EVENT_REPLY_START);
602 DO_TYPE(PTL_EVENT_REPLY_END);
603 DO_TYPE(PTL_EVENT_ACK);
604 DO_TYPE(PTL_EVENT_SEND_START);
605 DO_TYPE(PTL_EVENT_SEND_END);
606 DO_TYPE(PTL_EVENT_UNLINK);
// Map a ptllnd wire message type (PTLLND_MSG_TYPE_*) to its symbolic name as
// a string literal. DO_TYPE(x) expands to `case x: return "x";` via the
// preprocessor's stringification operator (#x).
// NOTE(review): the switch header, the fallback for unknown values and any
// #undef of DO_TYPE are on lines missing from this listing.
613 const char *ptllnd_msgtype2str(int type)
615 #define DO_TYPE(x) case x: return #x;
618 DO_TYPE(PTLLND_MSG_TYPE_INVALID);
619 DO_TYPE(PTLLND_MSG_TYPE_PUT);
620 DO_TYPE(PTLLND_MSG_TYPE_GET);
621 DO_TYPE(PTLLND_MSG_TYPE_IMMEDIATE);
622 DO_TYPE(PTLLND_MSG_TYPE_HELLO);
623 DO_TYPE(PTLLND_MSG_TYPE_NOOP);
624 DO_TYPE(PTLLND_MSG_TYPE_NAK);