/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2012, 2017, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * lnet/lnet/lib-msg.c
 *
 * Message decoding, parsing and finalizing routines
 */

#define DEBUG_SUBSYSTEM S_LNET

#include <lnet/lib-lnet.h>

void
lnet_build_unlink_event(struct lnet_libmd *md, struct lnet_event *ev)
{
        ENTRY;

        memset(ev, 0, sizeof(*ev));

        ev->status   = 0;
        ev->unlinked = 1;
        ev->type     = LNET_EVENT_UNLINK;
        lnet_md_deconstruct(md, &ev->md);
        lnet_md2handle(&ev->md_handle, md);
        EXIT;
}
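
/*
 * Usage sketch (hypothetical caller; assumes @md is committed and has
 * an event queue attached):
 *
 *        struct lnet_event ev;
 *
 *        lnet_build_unlink_event(md, &ev);
 *        lnet_eq_enqueue_event(md->md_eq, &ev);
 */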

/*
 * No lock needed; must be called after lnet_msg_commit()
 */
void
lnet_build_msg_event(struct lnet_msg *msg, enum lnet_event_kind ev_type)
{
        struct lnet_hdr *hdr = &msg->msg_hdr;
        struct lnet_event *ev = &msg->msg_ev;

        LASSERT(!msg->msg_routing);

        ev->type = ev_type;
        ev->msg_type = msg->msg_type;

        if (ev_type == LNET_EVENT_SEND) {
                /* event for active message */
                ev->target.nid    = le64_to_cpu(hdr->dest_nid);
                ev->target.pid    = le32_to_cpu(hdr->dest_pid);
                ev->initiator.nid = LNET_NID_ANY;
                ev->initiator.pid = the_lnet.ln_pid;
                ev->source.nid    = LNET_NID_ANY;
                ev->source.pid    = the_lnet.ln_pid;
                ev->sender        = LNET_NID_ANY;
        } else {
                /* event for passive message */
                ev->target.pid    = hdr->dest_pid;
                ev->target.nid    = hdr->dest_nid;
                ev->initiator.pid = hdr->src_pid;
                /* Multi-Rail: resolve src_nid to "primary" peer NID */
                ev->initiator.nid = msg->msg_initiator;
                /* Multi-Rail: track source NID. */
                ev->source.pid    = hdr->src_pid;
                ev->source.nid    = hdr->src_nid;
                ev->rlength       = hdr->payload_length;
                ev->sender        = msg->msg_from;
                ev->mlength       = msg->msg_wanted;
                ev->offset        = msg->msg_offset;
        }

        switch (ev_type) {
        default:
                LBUG();

        case LNET_EVENT_PUT: /* passive PUT */
                ev->pt_index   = hdr->msg.put.ptl_index;
                ev->match_bits = hdr->msg.put.match_bits;
                ev->hdr_data   = hdr->msg.put.hdr_data;
                return;

        case LNET_EVENT_GET: /* passive GET */
                ev->pt_index   = hdr->msg.get.ptl_index;
                ev->match_bits = hdr->msg.get.match_bits;
                ev->hdr_data   = 0;
                return;

        case LNET_EVENT_ACK: /* ACK */
                ev->match_bits = hdr->msg.ack.match_bits;
                ev->mlength    = hdr->msg.ack.mlength;
                return;

        case LNET_EVENT_REPLY: /* REPLY */
                return;

        case LNET_EVENT_SEND: /* active message */
                if (msg->msg_type == LNET_MSG_PUT) {
                        ev->pt_index   = le32_to_cpu(hdr->msg.put.ptl_index);
                        ev->match_bits = le64_to_cpu(hdr->msg.put.match_bits);
                        ev->offset     = le32_to_cpu(hdr->msg.put.offset);
                        ev->mlength    =
                        ev->rlength    = le32_to_cpu(hdr->payload_length);
                        ev->hdr_data   = le64_to_cpu(hdr->msg.put.hdr_data);

                } else {
                        LASSERT(msg->msg_type == LNET_MSG_GET);
                        ev->pt_index   = le32_to_cpu(hdr->msg.get.ptl_index);
                        ev->match_bits = le64_to_cpu(hdr->msg.get.match_bits);
                        ev->mlength    =
                        ev->rlength    = le32_to_cpu(hdr->msg.get.sink_length);
                        ev->offset     = le32_to_cpu(hdr->msg.get.src_offset);
                        ev->hdr_data   = 0;
                }
                return;
        }
}
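
/*
 * NB: only the active (LNET_EVENT_SEND) branch above byte-swaps with
 * le64_to_cpu()/le32_to_cpu(): an outgoing header sits in msg_hdr in
 * little-endian wire order, while a received header has already been
 * converted to host order by the receive path before events are built.
 */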

void
lnet_msg_commit(struct lnet_msg *msg, int cpt)
{
        struct lnet_msg_container *container = the_lnet.ln_msg_containers[cpt];
        struct lnet_counters_common *common;
        s64 timeout_ns;

        /* set the message deadline */
        timeout_ns = lnet_transaction_timeout * NSEC_PER_SEC;
        msg->msg_deadline = ktime_add_ns(ktime_get(), timeout_ns);

        /* routed message can be committed for both receiving and sending */
        LASSERT(!msg->msg_tx_committed);

        if (msg->msg_sending) {
                LASSERT(!msg->msg_receiving);
                msg->msg_tx_cpt = cpt;
                msg->msg_tx_committed = 1;
                if (msg->msg_rx_committed) { /* routed message REPLY */
                        LASSERT(msg->msg_onactivelist);
                        return;
                }
        } else {
                LASSERT(!msg->msg_sending);
                msg->msg_rx_cpt = cpt;
                msg->msg_rx_committed = 1;
        }

        LASSERT(!msg->msg_onactivelist);

        msg->msg_onactivelist = 1;
        list_add_tail(&msg->msg_activelist, &container->msc_active);

        common = &the_lnet.ln_counters[cpt]->lct_common;
        common->lcc_msgs_alloc++;
        if (common->lcc_msgs_alloc > common->lcc_msgs_max)
                common->lcc_msgs_max = common->lcc_msgs_alloc;
}
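
/*
 * Deadline arithmetic above, worked through with illustrative values:
 * with lnet_transaction_timeout == 50, timeout_ns is
 * 50 * NSEC_PER_SEC = 50,000,000,000 ns, so msg_deadline lands 50
 * seconds after the ktime_get() sample taken at commit time.
 */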

static void
lnet_msg_decommit_tx(struct lnet_msg *msg, int status)
{
        struct lnet_counters_common *common;
        struct lnet_event *ev = &msg->msg_ev;

        LASSERT(msg->msg_tx_committed);
        if (status != 0)
                goto out;

        common = &(the_lnet.ln_counters[msg->msg_tx_cpt]->lct_common);
        switch (ev->type) {
        default: /* routed message */
                LASSERT(msg->msg_routing);
                LASSERT(msg->msg_rx_committed);
                LASSERT(ev->type == 0);

                common->lcc_route_length += msg->msg_len;
                common->lcc_route_count++;
                goto incr_stats;

        case LNET_EVENT_PUT:
                /* should have been decommitted */
                LASSERT(!msg->msg_rx_committed);
                /* overwritten while sending ACK */
                LASSERT(msg->msg_type == LNET_MSG_ACK);
                msg->msg_type = LNET_MSG_PUT; /* fix type */
                break;

        case LNET_EVENT_SEND:
                LASSERT(!msg->msg_rx_committed);
                if (msg->msg_type == LNET_MSG_PUT)
                        common->lcc_send_length += msg->msg_len;
                break;

        case LNET_EVENT_GET:
                LASSERT(msg->msg_rx_committed);
                /* overwritten while sending reply, we should never be
                 * here for optimized GET */
                LASSERT(msg->msg_type == LNET_MSG_REPLY);
                msg->msg_type = LNET_MSG_GET; /* fix type */
                break;
        }

        common->lcc_send_count++;

incr_stats:
        if (msg->msg_txpeer)
                lnet_incr_stats(&msg->msg_txpeer->lpni_stats,
                                msg->msg_type,
                                LNET_STATS_TYPE_SEND);
        if (msg->msg_txni)
                lnet_incr_stats(&msg->msg_txni->ni_stats,
                                msg->msg_type,
                                LNET_STATS_TYPE_SEND);
 out:
        lnet_return_tx_credits_locked(msg);
        msg->msg_tx_committed = 0;
}
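
/*
 * NB: the default (ev->type == 0) case above is the routed path;
 * lnet_build_msg_event() asserts !msg_routing, so forwarded messages
 * never get an event built and their event type stays zeroed.
 */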

static void
lnet_msg_decommit_rx(struct lnet_msg *msg, int status)
{
        struct lnet_counters_common *common;
        struct lnet_event *ev = &msg->msg_ev;

        LASSERT(!msg->msg_tx_committed); /* decommitted or never committed */
        LASSERT(msg->msg_rx_committed);

        if (status != 0)
                goto out;

        common = &(the_lnet.ln_counters[msg->msg_rx_cpt]->lct_common);
        switch (ev->type) {
        default:
                LASSERT(ev->type == 0);
                LASSERT(msg->msg_routing);
                goto incr_stats;

        case LNET_EVENT_ACK:
                LASSERT(msg->msg_type == LNET_MSG_ACK);
                break;

        case LNET_EVENT_GET:
                /* type is "REPLY" if it's an optimized GET on the passive
                 * side, because an optimized GET is never committed for
                 * sending, so the message type wouldn't be changed back to
                 * "GET" by lnet_msg_decommit_tx(); see details in
                 * lnet_parse_get() */
                LASSERT(msg->msg_type == LNET_MSG_REPLY ||
                        msg->msg_type == LNET_MSG_GET);
                common->lcc_send_length += msg->msg_wanted;
                break;

        case LNET_EVENT_PUT:
                LASSERT(msg->msg_type == LNET_MSG_PUT);
                break;

        case LNET_EVENT_REPLY:
                /* type is "GET" if it's an optimized GET on the active side;
                 * see details in lnet_create_reply_msg() */
                LASSERT(msg->msg_type == LNET_MSG_GET ||
                        msg->msg_type == LNET_MSG_REPLY);
                break;
        }

        common->lcc_recv_count++;

incr_stats:
        if (msg->msg_rxpeer)
                lnet_incr_stats(&msg->msg_rxpeer->lpni_stats,
                                msg->msg_type,
                                LNET_STATS_TYPE_RECV);
        if (msg->msg_rxni)
                lnet_incr_stats(&msg->msg_rxni->ni_stats,
                                msg->msg_type,
                                LNET_STATS_TYPE_RECV);
        if (ev->type == LNET_EVENT_PUT || ev->type == LNET_EVENT_REPLY)
                common->lcc_recv_length += msg->msg_wanted;

 out:
        lnet_return_rx_credits_locked(msg);
        msg->msg_rx_committed = 0;
}

void
lnet_msg_decommit(struct lnet_msg *msg, int cpt, int status)
{
        int     cpt2 = cpt;

        LASSERT(msg->msg_tx_committed || msg->msg_rx_committed);
        LASSERT(msg->msg_onactivelist);

        if (msg->msg_tx_committed) { /* always decommit for sending first */
                LASSERT(cpt == msg->msg_tx_cpt);
                lnet_msg_decommit_tx(msg, status);
        }

        if (msg->msg_rx_committed) {
                /* forwarding msg committed for both receiving and sending */
                if (cpt != msg->msg_rx_cpt) {
                        lnet_net_unlock(cpt);
                        cpt2 = msg->msg_rx_cpt;
                        lnet_net_lock(cpt2);
                }
                lnet_msg_decommit_rx(msg, status);
        }

        list_del(&msg->msg_activelist);
        msg->msg_onactivelist = 0;

        the_lnet.ln_counters[cpt2]->lct_common.lcc_msgs_alloc--;

        if (cpt2 != cpt) {
                lnet_net_unlock(cpt2);
                lnet_net_lock(cpt);
        }
}
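
/*
 * The CPT lock swap above, as a pattern (the caller holds
 * lnet_net_lock(cpt) on entry and still holds it on return):
 *
 *        lnet_net_unlock(cpt);      drop the tx partition lock
 *        lnet_net_lock(cpt2);       take the partition owning the rx state
 *        ...                        decommit rx under the correct lock
 *        lnet_net_unlock(cpt2);
 *        lnet_net_lock(cpt);        restore the caller's lock before return
 */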

void
lnet_msg_attach_md(struct lnet_msg *msg, struct lnet_libmd *md,
                   unsigned int offset, unsigned int mlen)
{
        /* NB: @offset and @mlen are only useful for receiving */
        /* Here, we attach the MD to the lnet_msg, mark it busy and
         * decrement its threshold. Come what may, the lnet_msg "owns"
         * the MD until a call to lnet_msg_detach_md or lnet_finalize()
         * signals completion. */
        LASSERT(!msg->msg_routing);

        msg->msg_md = md;
        if (msg->msg_receiving) { /* committed for receiving */
                msg->msg_offset = offset;
                msg->msg_wanted = mlen;
        }

        md->md_refcount++;
        if (md->md_threshold != LNET_MD_THRESH_INF) {
                LASSERT(md->md_threshold > 0);
                md->md_threshold--;
        }

        /* build umd in event */
        lnet_md2handle(&msg->msg_ev.md_handle, md);
        lnet_md_deconstruct(md, &msg->msg_ev.md);
}
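
/*
 * Threshold bookkeeping, illustrated: an MD created with md_threshold
 * == 2 survives one attach here (threshold drops to 1) and becomes
 * unlinkable once the second operation completes; an MD created with
 * LNET_MD_THRESH_INF is never decremented.
 */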

void
lnet_msg_detach_md(struct lnet_msg *msg, int status)
{
        struct lnet_libmd *md = msg->msg_md;
        int unlink;

        /* Now it's safe to drop my caller's ref */
        md->md_refcount--;
        LASSERT(md->md_refcount >= 0);

        unlink = lnet_md_unlinkable(md);
        if (md->md_eq != NULL) {
                msg->msg_ev.status   = status;
                msg->msg_ev.unlinked = unlink;
                lnet_eq_enqueue_event(md->md_eq, &msg->msg_ev);
        }

        if (unlink)
                lnet_md_unlink(md);

        msg->msg_md = NULL;
}

static int
lnet_complete_msg_locked(struct lnet_msg *msg, int cpt)
{
        struct lnet_handle_wire ack_wmd;
        int                rc;
        int                status = msg->msg_ev.status;

        LASSERT(msg->msg_onactivelist);

        if (status == 0 && msg->msg_ack) {
                /* Only send an ACK if the PUT completed successfully */

                lnet_msg_decommit(msg, cpt, 0);

                msg->msg_ack = 0;
                lnet_net_unlock(cpt);

                LASSERT(msg->msg_ev.type == LNET_EVENT_PUT);
                LASSERT(!msg->msg_routing);

                ack_wmd = msg->msg_hdr.msg.put.ack_wmd;

                lnet_prep_send(msg, LNET_MSG_ACK, msg->msg_ev.source, 0, 0);

                msg->msg_hdr.msg.ack.dst_wmd = ack_wmd;
                msg->msg_hdr.msg.ack.match_bits = msg->msg_ev.match_bits;
                msg->msg_hdr.msg.ack.mlength = cpu_to_le32(msg->msg_ev.mlength);

                /* NB: we probably want to use the NID of msg::msg_from as the
                 * 3rd parameter (router NID) if it's a routed message */
                rc = lnet_send(msg->msg_ev.target.nid, msg, LNET_NID_ANY);

                lnet_net_lock(cpt);
                /*
                 * NB: the message is committed for sending, so we should
                 * return on success because the LND will finalize this
                 * message later.
                 *
                 * Also, there is a possibility that the message was committed
                 * for sending but failed before delivery to the LND, e.g.
                 * with -ENOMEM. In that case we can't fall through either,
                 * because the CPT for sending can differ from the CPT for
                 * receiving, so we should return to lnet_finalize() to make
                 * sure we are locking the correct partition.
                 */
                return rc;

        } else if (status == 0 &&       /* OK so far */
                   (msg->msg_routing && !msg->msg_sending)) {
                /* not forwarded */
                LASSERT(!msg->msg_receiving);   /* called back recv already */
                lnet_net_unlock(cpt);

                rc = lnet_send(LNET_NID_ANY, msg, LNET_NID_ANY);

                lnet_net_lock(cpt);
                /*
                 * NB: the message is committed for sending, so we should
                 * return on success because the LND will finalize this
                 * message later.
                 *
                 * Also, there is a possibility that the message was committed
                 * for sending but failed before delivery to the LND, e.g.
                 * with -ENOMEM. In that case we can't fall through either:
                 * - The rule is that a message must decommit for sending
                 *   first if it's committed for both sending and receiving.
                 * - The CPT for sending can differ from the CPT for
                 *   receiving, so we should return to lnet_finalize() to
                 *   make sure we are locking the correct partition.
                 */
                return rc;
        }

        lnet_msg_decommit(msg, cpt, status);
        lnet_msg_free(msg);
        return 0;
}

static void
lnet_dec_healthv_locked(atomic_t *healthv)
{
        int h = atomic_read(healthv);

        if (h < lnet_health_sensitivity) {
                atomic_set(healthv, 0);
        } else {
                h -= lnet_health_sensitivity;
                atomic_set(healthv, h);
        }
}
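
/*
 * Worked example (illustrative values): with lnet_health_sensitivity
 * == 100, a health value of 75 is clamped to 0 rather than going
 * negative, while a health value of 1000 simply drops to 900.
 */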

static void
lnet_handle_local_failure(struct lnet_msg *msg)
{
        struct lnet_ni *local_ni;

        local_ni = msg->msg_txni;

        /*
         * the lnet_net_lock(0) is used to protect the addref on the ni
         * and the recovery queue.
         */
        lnet_net_lock(0);
        /* the mt could've shutdown and cleaned up the queues */
        if (the_lnet.ln_mt_state != LNET_MT_STATE_RUNNING) {
                lnet_net_unlock(0);
                return;
        }

        lnet_dec_healthv_locked(&local_ni->ni_healthv);
        /*
         * add the NI to the recovery queue if it's not already there
         * and its health value is actually below the maximum. It's
         * possible that the sensitivity might be set to 0, in which case
         * the health value will not be reduced. In this case, there is
         * no reason to invoke recovery.
         */
        if (list_empty(&local_ni->ni_recovery) &&
            atomic_read(&local_ni->ni_healthv) < LNET_MAX_HEALTH_VALUE) {
                CERROR("ni %s added to recovery queue. Health = %d\n",
                        libcfs_nid2str(local_ni->ni_nid),
                        atomic_read(&local_ni->ni_healthv));
                list_add_tail(&local_ni->ni_recovery,
                              &the_lnet.ln_mt_localNIRecovq);
                lnet_ni_addref_locked(local_ni, 0);
        }
        lnet_net_unlock(0);
}
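
/*
 * NB: the reference taken by lnet_ni_addref_locked() above belongs to
 * ln_mt_localNIRecovq: the queue holds one reference per queued NI,
 * presumably dropped when the monitor thread dequeues the NI again.
 */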

void
lnet_handle_remote_failure_locked(struct lnet_peer_ni *lpni)
{
        /* lpni could be NULL if we're in the LOLND case */
        if (!lpni)
                return;

        lnet_dec_healthv_locked(&lpni->lpni_healthv);
        /*
         * add the peer NI to the recovery queue if it's not already there
         * and its health value is actually below the maximum. It's
         * possible that the sensitivity might be set to 0, in which case
         * the health value will not be reduced. In this case, there is
         * no reason to invoke recovery.
         */
        lnet_peer_ni_add_to_recoveryq_locked(lpni);
}

static void
lnet_handle_remote_failure(struct lnet_peer_ni *lpni)
{
        /* lpni could be NULL if we're in the LOLND case */
        if (!lpni)
                return;

        lnet_net_lock(0);
        lnet_handle_remote_failure_locked(lpni);
        lnet_net_unlock(0);
}

static void
lnet_incr_hstats(struct lnet_msg *msg, enum lnet_msg_hstatus hstatus)
{
        struct lnet_ni *ni = msg->msg_txni;
        struct lnet_peer_ni *lpni = msg->msg_txpeer;
        struct lnet_counters_health *health;

        health = &the_lnet.ln_counters[0]->lct_health;

        switch (hstatus) {
        case LNET_MSG_STATUS_LOCAL_INTERRUPT:
                atomic_inc(&ni->ni_hstats.hlt_local_interrupt);
                health->lch_local_interrupt_count++;
                break;
        case LNET_MSG_STATUS_LOCAL_DROPPED:
                atomic_inc(&ni->ni_hstats.hlt_local_dropped);
                health->lch_local_dropped_count++;
                break;
        case LNET_MSG_STATUS_LOCAL_ABORTED:
                atomic_inc(&ni->ni_hstats.hlt_local_aborted);
                health->lch_local_aborted_count++;
                break;
        case LNET_MSG_STATUS_LOCAL_NO_ROUTE:
                atomic_inc(&ni->ni_hstats.hlt_local_no_route);
                health->lch_local_no_route_count++;
                break;
        case LNET_MSG_STATUS_LOCAL_TIMEOUT:
                atomic_inc(&ni->ni_hstats.hlt_local_timeout);
                health->lch_local_timeout_count++;
                break;
        case LNET_MSG_STATUS_LOCAL_ERROR:
                atomic_inc(&ni->ni_hstats.hlt_local_error);
                health->lch_local_error_count++;
                break;
        case LNET_MSG_STATUS_REMOTE_DROPPED:
                if (lpni)
                        atomic_inc(&lpni->lpni_hstats.hlt_remote_dropped);
                health->lch_remote_dropped_count++;
                break;
        case LNET_MSG_STATUS_REMOTE_ERROR:
                if (lpni)
                        atomic_inc(&lpni->lpni_hstats.hlt_remote_error);
                health->lch_remote_error_count++;
                break;
        case LNET_MSG_STATUS_REMOTE_TIMEOUT:
                if (lpni)
                        atomic_inc(&lpni->lpni_hstats.hlt_remote_timeout);
                health->lch_remote_timeout_count++;
                break;
        case LNET_MSG_STATUS_NETWORK_TIMEOUT:
                if (lpni)
                        atomic_inc(&lpni->lpni_hstats.hlt_network_timeout);
                health->lch_network_timeout_count++;
                break;
        case LNET_MSG_STATUS_OK:
                break;
        default:
                LBUG();
        }
}

/*
 * Do a health check on the message:
 * return -1 if we're not going to handle the error or if we've
 *   reached the maximum number of retries; the success case also
 *   returns -1.
 * return 0 if the message has been requeued for resend.
 */
static int
lnet_health_check(struct lnet_msg *msg)
{
        enum lnet_msg_hstatus hstatus = msg->msg_health_status;
        bool lo = false;

        /* if we're shutting down there is no point in handling health */
        if (the_lnet.ln_state != LNET_STATE_RUNNING)
                return -1;

        LASSERT(msg->msg_txni);

        /*
         * if we're sending to the LOLND then the msg_txpeer will not be
         * set. So no need to sanity check it.
         */
        if (LNET_NETTYP(LNET_NIDNET(msg->msg_txni->ni_nid)) != LOLND)
                LASSERT(msg->msg_txpeer);
        else
                lo = true;

        if (hstatus != LNET_MSG_STATUS_OK &&
            ktime_compare(ktime_get(), msg->msg_deadline) >= 0)
                return -1;

        /*
         * stats are only incremented for errors so avoid wasting time
         * incrementing statistics if there is no error.
         */
        if (hstatus != LNET_MSG_STATUS_OK) {
                lnet_net_lock(0);
                lnet_incr_hstats(msg, hstatus);
                lnet_net_unlock(0);
        }

        CDEBUG(D_NET, "health check: %s->%s: %s: %s\n",
               libcfs_nid2str(msg->msg_txni->ni_nid),
               (lo) ? "self" : libcfs_nid2str(msg->msg_txpeer->lpni_nid),
               lnet_msgtyp2str(msg->msg_type),
               lnet_health_error2str(hstatus));

        switch (hstatus) {
        case LNET_MSG_STATUS_OK:
                lnet_inc_healthv(&msg->msg_txni->ni_healthv);
                /*
                 * It's possible msg_txpeer is NULL in the LOLND
                 * case.
                 */
                if (msg->msg_txpeer)
                        lnet_inc_healthv(&msg->msg_txpeer->lpni_healthv);

                /* we can finalize this message */
                return -1;
        case LNET_MSG_STATUS_LOCAL_INTERRUPT:
        case LNET_MSG_STATUS_LOCAL_DROPPED:
        case LNET_MSG_STATUS_LOCAL_ABORTED:
        case LNET_MSG_STATUS_LOCAL_NO_ROUTE:
        case LNET_MSG_STATUS_LOCAL_TIMEOUT:
                lnet_handle_local_failure(msg);
                /* add to the re-send queue */
                goto resend;

        /*
         * These errors will not trigger a resend so simply
         * finalize the message
         */
        case LNET_MSG_STATUS_LOCAL_ERROR:
                lnet_handle_local_failure(msg);
                return -1;

        /*
         * TODO: since the remote dropped the message we can
         * attempt a resend safely.
         */
        case LNET_MSG_STATUS_REMOTE_DROPPED:
                lnet_handle_remote_failure(msg->msg_txpeer);
                goto resend;

        case LNET_MSG_STATUS_REMOTE_ERROR:
        case LNET_MSG_STATUS_REMOTE_TIMEOUT:
        case LNET_MSG_STATUS_NETWORK_TIMEOUT:
                lnet_handle_remote_failure(msg->msg_txpeer);
                return -1;
        default:
                LBUG();
        }

resend:
        /* don't resend recovery messages */
        if (msg->msg_recovery) {
                CDEBUG(D_NET, "msg %s->%s is a recovery ping. retry# %d\n",
                        libcfs_nid2str(msg->msg_from),
                        libcfs_nid2str(msg->msg_target.nid),
                        msg->msg_retry_count);
                return -1;
        }

        /*
         * if we explicitly indicated we don't want to resend then just
         * return
         */
        if (msg->msg_no_resend) {
                CDEBUG(D_NET, "msg %s->%s requested no resend. retry# %d\n",
                        libcfs_nid2str(msg->msg_from),
                        libcfs_nid2str(msg->msg_target.nid),
                        msg->msg_retry_count);
                return -1;
        }

        /* check if the message has exceeded the number of retries */
        if (msg->msg_retry_count >= lnet_retry_count) {
                CNETERR("msg %s->%s exceeded retry count %d\n",
                        libcfs_nid2str(msg->msg_from),
                        libcfs_nid2str(msg->msg_target.nid),
                        msg->msg_retry_count);
                return -1;
        }
        msg->msg_retry_count++;

        lnet_net_lock(msg->msg_tx_cpt);

        /*
         * remove the message from the active list and reset it in
         * preparation for a resend. There are two exceptions to this:
         *
         * 1. the router case: a message is committed for rx when it is
         * received, then for tx when it is sent. When committed for both
         * tx and rx we don't want to remove it from the active list.
         *
         * 2. the REPLY case: it uses the same msg block as the GET
         * that was received.
         */
        if (!msg->msg_routing && msg->msg_type != LNET_MSG_REPLY) {
                list_del_init(&msg->msg_activelist);
                msg->msg_onactivelist = 0;
        }
        /*
         * The msg_target.nid which was originally set when calling
         * LNetGet() or LNetPut() might've been overwritten if we're
         * routing this message. lnet_msg_decommit_tx() below calls
         * lnet_return_tx_credits_locked() to return the credit this
         * message consumed. The message will consume another credit
         * when it gets resent.
         */
        msg->msg_target.nid = msg->msg_hdr.dest_nid;
        lnet_msg_decommit_tx(msg, -EAGAIN);
        msg->msg_sending = 0;
        msg->msg_receiving = 0;
        msg->msg_target_is_router = 0;

        CDEBUG(D_NET, "%s->%s:%s:%s - queuing for resend\n",
               libcfs_nid2str(msg->msg_hdr.src_nid),
               libcfs_nid2str(msg->msg_hdr.dest_nid),
               lnet_msgtyp2str(msg->msg_type),
               lnet_health_error2str(hstatus));

        list_add_tail(&msg->msg_list, the_lnet.ln_mt_resendqs[msg->msg_tx_cpt]);
        lnet_net_unlock(msg->msg_tx_cpt);

        wake_up(&the_lnet.ln_mt_waitq);
        return 0;
}
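
/*
 * Caller's side of the contract above, sketched (this is how
 * lnet_finalize() below consumes the return value):
 *
 *        if (!lnet_health_check(msg))
 *                return;         message was requeued for resend
 *        ... otherwise (-1) fall through and finalize the message
 */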

static void
lnet_detach_md(struct lnet_msg *msg, int status)
{
        int cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie);

        lnet_res_lock(cpt);
        lnet_msg_detach_md(msg, status);
        lnet_res_unlock(cpt);
}

static bool
lnet_is_health_check(struct lnet_msg *msg)
{
        bool hc;
        int status = msg->msg_ev.status;

        /*
         * perform a health check for any message committed for transmit
         */
        hc = msg->msg_tx_committed;

        /* Check for status inconsistencies */
        if (hc &&
            ((!status && msg->msg_health_status != LNET_MSG_STATUS_OK) ||
             (status && msg->msg_health_status == LNET_MSG_STATUS_OK))) {
                CERROR("Msg is in inconsistent state, don't perform health "
                       "checking (%d, %d)\n", status, msg->msg_health_status);
                hc = false;
        }

        CDEBUG(D_NET, "health check = %d, status = %d, hstatus = %d\n",
               hc, status, msg->msg_health_status);

        return hc;
}

char *
lnet_health_error2str(enum lnet_msg_hstatus hstatus)
{
        switch (hstatus) {
        case LNET_MSG_STATUS_LOCAL_INTERRUPT:
                return "LOCAL_INTERRUPT";
        case LNET_MSG_STATUS_LOCAL_DROPPED:
                return "LOCAL_DROPPED";
        case LNET_MSG_STATUS_LOCAL_ABORTED:
                return "LOCAL_ABORTED";
        case LNET_MSG_STATUS_LOCAL_NO_ROUTE:
                return "LOCAL_NO_ROUTE";
        case LNET_MSG_STATUS_LOCAL_TIMEOUT:
                return "LOCAL_TIMEOUT";
        case LNET_MSG_STATUS_LOCAL_ERROR:
                return "LOCAL_ERROR";
        case LNET_MSG_STATUS_REMOTE_DROPPED:
                return "REMOTE_DROPPED";
        case LNET_MSG_STATUS_REMOTE_ERROR:
                return "REMOTE_ERROR";
        case LNET_MSG_STATUS_REMOTE_TIMEOUT:
                return "REMOTE_TIMEOUT";
        case LNET_MSG_STATUS_NETWORK_TIMEOUT:
                return "NETWORK_TIMEOUT";
        case LNET_MSG_STATUS_OK:
                return "OK";
        default:
                return "<UNKNOWN>";
        }
}
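
/*
 * The strings above are static literals, so callers can log the return
 * value directly without copying, e.g.:
 *
 *        CDEBUG(D_NET, "hstatus: %s\n", lnet_health_error2str(hstatus));
 */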

bool
lnet_send_error_simulation(struct lnet_msg *msg,
                           enum lnet_msg_hstatus *hstatus)
{
        if (!msg)
                return false;

        if (list_empty(&the_lnet.ln_drop_rules))
                return false;

        /* match only health rules */
        if (!lnet_drop_rule_match(&msg->msg_hdr, hstatus))
                return false;

        CDEBUG(D_NET, "src %s, dst %s: %s simulate health error: %s\n",
                libcfs_nid2str(msg->msg_hdr.src_nid),
                libcfs_nid2str(msg->msg_hdr.dest_nid),
                lnet_msgtyp2str(msg->msg_type),
                lnet_health_error2str(*hstatus));

        return true;
}
EXPORT_SYMBOL(lnet_send_error_simulation);

void
lnet_finalize(struct lnet_msg *msg, int status)
{
        struct lnet_msg_container *container;
        int my_slot;
        int cpt;
        int rc;
        int i;
        bool hc;

        LASSERT(!in_interrupt());

        if (msg == NULL)
                return;

        msg->msg_ev.status = status;

        /*
         * if this is an ACK or a REPLY then make sure to remove the
         * response tracker.
         */
        if (msg->msg_ev.type == LNET_EVENT_REPLY ||
            msg->msg_ev.type == LNET_EVENT_ACK) {
                cpt = lnet_cpt_of_cookie(msg->msg_md->md_lh.lh_cookie);
                lnet_detach_rsp_tracker(msg->msg_md, cpt);
        }

        /* if the message is successfully sent, no need to keep the MD around */
        if (msg->msg_md != NULL && !status)
                lnet_detach_md(msg, status);

again:
        hc = lnet_is_health_check(msg);

        /*
         * the MD would've been detached from the message if it was
         * successfully sent. However, if it wasn't successfully sent the
         * MD would still be around. And since we recalculate whether to
         * health check or not, it's possible that we change our minds and
         * don't want to health check this message. In that case also
         * free the MD.
         *
         * If the message is successful we're going to
         * go through the lnet_health_check() function, but that'll just
         * increment the appropriate health value and return.
         */
        if (msg->msg_md != NULL && !hc)
                lnet_detach_md(msg, status);

        rc = 0;
        if (!msg->msg_tx_committed && !msg->msg_rx_committed) {
                /* not committed to network yet */
                LASSERT(!msg->msg_onactivelist);
                lnet_msg_free(msg);
                return;
        }

        if (hc) {
                /*
                 * Check the health status of the message. If it has one
                 * of the errors that we're supposed to handle, and it has
                 * not timed out, then
                 *      1. Decrement the appropriate health_value
                 *      2. queue the message on the resend queue
                 *
                 * If the message send succeeded, timed out, or failed the
                 * health check for any reason then we'll just finalize the
                 * message. Otherwise just return since the message has been
                 * put on the resend queue.
                 */
                if (!lnet_health_check(msg))
                        return;

                /*
                 * if we get here then we need to clean up the md because
                 * we're finalizing the message.
                 */
                if (msg->msg_md != NULL)
                        lnet_detach_md(msg, status);
        }

        /*
         * NB: a routed message can be committed for both receiving and
         * sending; we should finalize in LIFO order and keep counters
         * correct (finalize sending first, then finalize receiving).
         */
        cpt = msg->msg_tx_committed ? msg->msg_tx_cpt : msg->msg_rx_cpt;
        lnet_net_lock(cpt);

        container = the_lnet.ln_msg_containers[cpt];
        list_add_tail(&msg->msg_list, &container->msc_finalizing);

        /* Recursion breaker.  Don't complete the message here if I am (or
         * enough other threads are) already completing messages */

        my_slot = -1;
        for (i = 0; i < container->msc_nfinalizers; i++) {
                if (container->msc_finalizers[i] == current)
                        break;

                if (my_slot < 0 && container->msc_finalizers[i] == NULL)
                        my_slot = i;
        }

        if (i < container->msc_nfinalizers || my_slot < 0) {
                lnet_net_unlock(cpt);
                return;
        }

        container->msc_finalizers[my_slot] = current;

        while (!list_empty(&container->msc_finalizing)) {
                msg = list_entry(container->msc_finalizing.next,
                                 struct lnet_msg, msg_list);

                list_del_init(&msg->msg_list);

                /* NB drops and regains the lnet lock if it actually does
                 * anything, so my finalizing friends can chomp along too */
                rc = lnet_complete_msg_locked(msg, cpt);
                if (rc != 0)
                        break;
        }

        if (unlikely(!list_empty(&the_lnet.ln_delay_rules))) {
                lnet_net_unlock(cpt);
                lnet_delay_rule_check();
                lnet_net_lock(cpt);
        }

        container->msc_finalizers[my_slot] = NULL;
        lnet_net_unlock(cpt);

        if (rc != 0)
                goto again;
}
EXPORT_SYMBOL(lnet_finalize);

void
lnet_msg_container_cleanup(struct lnet_msg_container *container)
{
        int     count = 0;

        if (container->msc_init == 0)
                return;

        while (!list_empty(&container->msc_active)) {
                struct lnet_msg *msg;

                msg  = list_entry(container->msc_active.next,
                                  struct lnet_msg, msg_activelist);
                LASSERT(msg->msg_onactivelist);
                msg->msg_onactivelist = 0;
                list_del_init(&msg->msg_activelist);
                lnet_msg_free(msg);
                count++;
        }

        if (count > 0)
                CERROR("%d active msg on exit\n", count);

        if (container->msc_finalizers != NULL) {
                LIBCFS_FREE(container->msc_finalizers,
                            container->msc_nfinalizers *
                            sizeof(*container->msc_finalizers));
                container->msc_finalizers = NULL;
        }
        container->msc_init = 0;
}

int
lnet_msg_container_setup(struct lnet_msg_container *container, int cpt)
{
        int rc = 0;

        container->msc_init = 1;

        INIT_LIST_HEAD(&container->msc_active);
        INIT_LIST_HEAD(&container->msc_finalizing);

        /* number of CPUs in this partition */
        container->msc_nfinalizers = cfs_cpt_weight(lnet_cpt_table(), cpt);
        if (container->msc_nfinalizers == 0)
                container->msc_nfinalizers = 1;

        LIBCFS_CPT_ALLOC(container->msc_finalizers, lnet_cpt_table(), cpt,
                         container->msc_nfinalizers *
                         sizeof(*container->msc_finalizers));

        if (container->msc_finalizers == NULL) {
                CERROR("Failed to allocate message finalizers\n");
                lnet_msg_container_cleanup(container);
                return -ENOMEM;
        }

        return rc;
}

void
lnet_msg_containers_destroy(void)
{
        struct lnet_msg_container *container;
        int     i;

        if (the_lnet.ln_msg_containers == NULL)
                return;

        cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers)
                lnet_msg_container_cleanup(container);

        cfs_percpt_free(the_lnet.ln_msg_containers);
        the_lnet.ln_msg_containers = NULL;
}

int
lnet_msg_containers_create(void)
{
        struct lnet_msg_container *container;
        int     rc;
        int     i;

        the_lnet.ln_msg_containers = cfs_percpt_alloc(lnet_cpt_table(),
                                                      sizeof(*container));

        if (the_lnet.ln_msg_containers == NULL) {
                CERROR("Failed to allocate cpu-partition data for network\n");
                return -ENOMEM;
        }

        cfs_percpt_for_each(container, i, the_lnet.ln_msg_containers) {
                rc = lnet_msg_container_setup(container, i);
                if (rc != 0) {
                        lnet_msg_containers_destroy();
                        return rc;
                }
        }

        return 0;
}