1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5 * Author: Mike Shaver <shaver@clusterfs.com>
7 * This file is part of the Lustre file system, http://www.lustre.org
8 * Lustre is a trademark of Cluster File Systems, Inc.
10 * You may have signed or agreed to another license before downloading
11 * this software. If so, you are bound by the terms and conditions
12 * of that agreement, and the following does not apply to you. See the
13 * LICENSE file included with this distribution for more information.
15 * If you did not agree to a different license, then this copy of Lustre
16 * is open source software; you can redistribute it and/or modify it
17 * under the terms of version 2 of the GNU General Public License as
18 * published by the Free Software Foundation.
20 * In either case, Lustre is distributed in the hope that it will be
21 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
22 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * license text for more details.
26 #define DEBUG_SUBSYSTEM S_RPC
28 # include <liblustre.h>
31 #include <obd_support.h>
32 #include <lustre_ha.h>
33 #include <lustre_net.h>
34 #include <lustre_import.h>
35 #include <lustre_export.h>
37 #include <obd_class.h>
39 #include "ptlrpc_internal.h"
/* Per-request state for an asynchronous connect RPC.  ptlrpc_connect_import()
 * fills it in inside request->rq_async_args and ptlrpc_connect_interpret()
 * consumes it when the reply is handled. */
41 struct ptlrpc_connect_async_args {
/* imp_peer_committed_transno as sampled before the connect was sent; used to
 * detect a server whose last-committed transno went backwards */
42 __u64 pcaa_peer_committed;
/* non-zero for the first connect of the import, zero for a reconnect */
43 int pcaa_initial_connect;
46 /* A CLOSED import should remain so. */
/* Set imp->imp_state to @state and log the transition at D_HA, unless the
 * import is currently CLOSED, in which case nothing happens.  No locking is
 * done here; IMPORT_SET_STATE() is the variant that takes imp->imp_lock.
 * Both arguments are expanded several times and without parentheses, so pass
 * plain identifiers or constants only. */
47 #define IMPORT_SET_STATE_NOLOCK(imp, state) \
49 if (imp->imp_state != LUSTRE_IMP_CLOSED) { \
50 CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n", \
51 imp, obd2cli_tgt(imp->imp_obd), \
52 ptlrpc_import_state_name(imp->imp_state), \
53 ptlrpc_import_state_name(state)); \
54 imp->imp_state = state; \
/* Locked variant: wraps IMPORT_SET_STATE_NOLOCK() in imp->imp_lock. */
58 #define IMPORT_SET_STATE(imp, state) \
60 spin_lock(&imp->imp_lock); \
61 IMPORT_SET_STATE_NOLOCK(imp, state); \
62 spin_unlock(&imp->imp_lock); \
/* Forward declarations: both functions are defined further down in this file
 * but are referenced before their definitions.
 * NOTE(review): the remaining parameters of the first prototype are not
 * visible in this listing. */
66 static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
68 int ptlrpc_import_recovery_state_machine(struct obd_import *imp);
70 /* Only this function is allowed to change the import state when it is
71 * CLOSED. I would rather refcount the import and free it after
72 * disconnection like we do with exports. To do that, the client_obd
73 * will need to save the peer info somewhere other than in the import,
74 * though. */
/* Put @imp into the NEW state and bump its generation, both under imp_lock.
 * imp_state is assigned directly rather than through IMPORT_SET_STATE(), so
 * the assignment also takes effect on an import that is currently CLOSED.
 * NOTE(review): the return statement is not visible in this listing. */
75 int ptlrpc_init_import(struct obd_import *imp)
77 spin_lock(&imp->imp_lock);
79 imp->imp_generation++;
80 imp->imp_state = LUSTRE_IMP_NEW;
82 spin_unlock(&imp->imp_lock);
86 EXPORT_SYMBOL(ptlrpc_init_import);
/* Suffix that deuuidify() trims from the end of a UUID string. */
88 #define UUID_STR "_UUID"
/* Compute a trimmed view of @uuid without copying or modifying it.
 * *uuid_start is set to @uuid itself, or to the first character after @prefix
 * when @prefix is non-NULL and @uuid begins with it.  *uuid_len is set to the
 * length of that view, reduced by strlen("_UUID") when the view ends in
 * "_UUID".  Callers in this file print the result with "%.*s". */
89 static void deuuidify(char *uuid, const char *prefix, char **uuid_start,
92 *uuid_start = !prefix || strncmp(uuid, prefix, strlen(prefix))
93 ? uuid : uuid + strlen(prefix);
95 *uuid_len = strlen(*uuid_start);
/* shorter than the suffix itself, so there is nothing to trim */
97 if (*uuid_len < strlen(UUID_STR))
/* compare the last strlen(UUID_STR) characters against the suffix */
100 if (!strncmp(*uuid_start + *uuid_len - strlen(UUID_STR),
101 UUID_STR, strlen(UUID_STR)))
102 *uuid_len -= strlen(UUID_STR);
105 /* Returns true if import was FULL, false if import was already not
106 * connected.
107 * @imp - import to be disconnected
108 * @conn_cnt - connection count (epoch) of the request that timed out
109 * and caused the disconnection. In some cases, multiple
110 * inflight requests can fail to a single target (e.g. OST
111 * bulk requests) and if one has already caused a reconnection
112 * (increasing the import->conn_cnt) the older failure should
113 * not also cause a reconnection. If zero it forces a reconnect.
114 */
/* Move a FULL import to DISCON and announce the loss of the connection.
 * When the import is FULL and @conn_cnt is either 0 or equal to
 * imp->imp_conn_cnt, a console message is printed (a warning for a replayable
 * import, an error otherwise), the state is changed to DISCON, the debug log
 * is dumped if obd_dump_on_timeout is set, and IMP_EVENT_DISCON is delivered
 * to the obd.  In any other case only a D_HA debug message is emitted.
 * NOTE(review): the return statements are not visible in this listing. */
115 int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt)
119 spin_lock(&imp->imp_lock);
/* act only for the current connection epoch, or unconditionally when the
 * caller passes 0 */
121 if (imp->imp_state == LUSTRE_IMP_FULL &&
122 (conn_cnt == 0 || conn_cnt == imp->imp_conn_cnt)) {
/* target name without its "_UUID" suffix, for the console message */
126 deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
127 &target_start, &target_len);
128 if (imp->imp_replayable) {
129 LCONSOLE_WARN("%s: Connection to service %.*s via nid "
130 "%s was lost; in progress operations using this "
131 "service will wait for recovery to complete.\n",
132 imp->imp_obd->obd_name, target_len, target_start,
133 libcfs_nid2str(imp->imp_connection->c_peer.nid));
135 LCONSOLE_ERROR_MSG(0x166, "%s: Connection to service "
136 "%.*s via nid %s was lost; in progress "
137 "operations using this service will fail.\n",
138 imp->imp_obd->obd_name, target_len, target_start,
139 libcfs_nid2str(imp->imp_connection->c_peer.nid));
/* the state changes while imp_lock is still held; the log dump and the event
 * callback below run after the lock is dropped */
141 IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
142 spin_unlock(&imp->imp_lock);
144 if (obd_dump_on_timeout)
145 libcfs_debug_dumplog();
147 obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON);
/* nothing to do: release the lock and just record why */
150 spin_unlock(&imp->imp_lock);
151 CDEBUG(D_HA, "%s: import %p already %s (conn %u, was %u): %s\n",
152 imp->imp_client->cli_name, imp,
153 (imp->imp_state == LUSTRE_IMP_FULL &&
154 imp->imp_conn_cnt > conn_cnt) ?
155 "reconnected" : "not connected", imp->imp_conn_cnt,
156 conn_cnt, ptlrpc_import_state_name(imp->imp_state));
162 /*
163 * This acts as a barrier; all existing requests are rejected, and
164 * no new requests will be accepted until the import is valid again.
165 */
/* Mark @imp invalid and bump imp_generation under imp_lock, then abort the
 * in-flight requests and deliver IMP_EVENT_INACTIVE to the obd. */
166 void ptlrpc_deactivate_import(struct obd_import *imp)
170 spin_lock(&imp->imp_lock);
171 CDEBUG(D_HA, "setting import %s INVALID\n", obd2cli_tgt(imp->imp_obd));
172 imp->imp_invalid = 1;
173 imp->imp_generation++;
174 spin_unlock(&imp->imp_lock);
/* both calls below are made after imp_lock has been released */
176 ptlrpc_abort_inflight(imp);
177 obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE);
180 /*
181 * This function will invalidate the import, if necessary, then block
182 * for all the RPC completions, and finally notify the obd to
183 * invalidate its state (ie cancel locks, clear pending requests,
184 * etc).
185 */
/* Invalidate @imp and wait for its in-flight RPCs to finish.
 * imp_inval_count is held elevated for the whole call.  The import is
 * deactivated first if it is not already invalid; the caller then sleeps on
 * imp_recovery_waitq until imp_inflight reaches zero or the obd_timeout-based
 * wait expires.  Finally IMP_EVENT_INVALIDATE is delivered and
 * imp_recovery_waitq is signalled. */
186 void ptlrpc_invalidate_import(struct obd_import *imp)
188 struct list_head *tmp, *n;
189 struct ptlrpc_request *req;
190 struct l_wait_info lwi;
193 atomic_inc(&imp->imp_inval_count);
195 if (!imp->imp_invalid)
196 ptlrpc_deactivate_import(imp);
198 LASSERT(imp->imp_invalid);
200 /* wait for all requests to error out and call completion callbacks.
201 Cap it at obd_timeout -- these should all have been locally
202 cancelled by ptlrpc_abort_inflight. */
203 lwi = LWI_TIMEOUT_INTERVAL(
204 cfs_timeout_cap(cfs_time_seconds(obd_timeout)),
205 cfs_time_seconds(1), NULL, NULL);
206 rc = l_wait_event(imp->imp_recovery_waitq,
207 (atomic_read(&imp->imp_inflight) == 0), &lwi);
/* Diagnostics: report the wait result and every request still queued on the
 * sending and delayed lists.
 * NOTE(review): the condition guarding this section is not visible here. */
210 CERROR("%s: rc = %d waiting for callback (%d != 0)\n",
211 obd2cli_tgt(imp->imp_obd), rc,
212 atomic_read(&imp->imp_inflight));
213 spin_lock(&imp->imp_lock);
214 list_for_each_safe(tmp, n, &imp->imp_sending_list) {
215 req = list_entry(tmp, struct ptlrpc_request, rq_list);
216 DEBUG_REQ(D_ERROR, req, "still on sending list");
218 list_for_each_safe(tmp, n, &imp->imp_delayed_list) {
219 req = list_entry(tmp, struct ptlrpc_request, rq_list);
220 DEBUG_REQ(D_ERROR, req, "still on delayed list");
222 spin_unlock(&imp->imp_lock);
225 obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE);
/* drop our hold on imp_inval_count and wake anyone sleeping on the waitq */
227 atomic_dec(&imp->imp_inval_count);
228 cfs_waitq_signal(&imp->imp_recovery_waitq);
231 /* Clear imp_invalid under imp_lock, then deliver IMP_EVENT_ACTIVE to the obd. */
232 void ptlrpc_activate_import(struct obd_import *imp)
234 struct obd_device *obd = imp->imp_obd;
236 spin_lock(&imp->imp_lock);
237 imp->imp_invalid = 0;
238 spin_unlock(&imp->imp_lock);
/* the event is delivered after the lock has been released */
240 obd_import_event(obd, imp, IMP_EVENT_ACTIVE);
/* React to a connection failure observed at connection epoch @conn_cnt.
 * Must not be used on a DLM fake import (asserted).  When
 * ptlrpc_set_import_discon() returns non-zero, a non-replayable import is
 * deactivated.  The function also sets imp_force_verify under imp_lock and
 * wakes the pinger.
 * NOTE(review): the braces are not visible, so whether the pinger wake-up is
 * inside the "discon succeeded" branch cannot be confirmed from here. */
243 void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt)
247 LASSERT(!imp->imp_dlm_fake);
249 if (ptlrpc_set_import_discon(imp, conn_cnt)) {
250 if (!imp->imp_replayable) {
251 CDEBUG(D_HA, "import %s@%s for %s not replayable, "
252 "auto-deactivating\n",
253 obd2cli_tgt(imp->imp_obd),
254 imp->imp_connection->c_remote_uuid.uuid,
255 imp->imp_obd->obd_name);
256 ptlrpc_deactivate_import(imp);
259 CDEBUG(D_HA, "%s: waking up pinger\n",
260 obd2cli_tgt(imp->imp_obd));
262 spin_lock(&imp->imp_lock);
263 imp->imp_force_verify = 1;
264 spin_unlock(&imp->imp_lock);
266 ptlrpc_pinger_wake_up();
/* Choose the connection for the next connect attempt and install it on the
 * import and on its DLM export.  imp_lock is held across the whole selection.
 * As far as the visible lines show, the candidates in imp_conn_list are
 * examined in order:
 *  - a connection attempted less than CONNECTION_SWITCH_MIN seconds ago is
 *    treated specially (branch body not visible, presumably skipped);
 *  - a connection never attempted, or not attempted since
 *    imp_last_success_conn, is preferred;
 *  - otherwise the one with the oldest oic_last_attempt is kept;
 *  - if nothing was picked, imp_conn_current is reused.
 * The chosen entry gets a fresh oic_last_attempt timestamp.
 * NOTE(review): return statements are not visible in this listing. */
271 static int import_select_connection(struct obd_import *imp)
273 struct obd_import_conn *imp_conn = NULL, *conn;
274 struct obd_export *dlmexp;
278 spin_lock(&imp->imp_lock);
280 if (list_empty(&imp->imp_conn_list)) {
281 CERROR("%s: no connections available\n",
282 imp->imp_obd->obd_name);
283 spin_unlock(&imp->imp_lock);
287 list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
288 CDEBUG(D_HA, "%s: connect to NID %s last attempt "LPU64"\n",
289 imp->imp_obd->obd_name,
290 libcfs_nid2str(conn->oic_conn->c_peer.nid),
291 conn->oic_last_attempt);
293 /* Don't thrash connections */
294 if (cfs_time_before_64(cfs_time_current_64(),
295 conn->oic_last_attempt +
296 cfs_time_seconds(CONNECTION_SWITCH_MIN))) {
300 /* If we have not tried this connection since the
301 last successful attempt, go with this one */
302 if ((conn->oic_last_attempt == 0) ||
303 cfs_time_beforeq_64(conn->oic_last_attempt,
304 imp->imp_last_success_conn)) {
310 /* If all of the connections have already been tried
311 since the last successful connection; just choose the
312 least recently used */
315 else if (cfs_time_before_64(conn->oic_last_attempt,
316 imp_conn->oic_last_attempt))
320 /* if not found, simply choose the current one */
322 LASSERT(imp->imp_conn_current);
323 imp_conn = imp->imp_conn_current;
326 LASSERT(imp_conn->oic_conn);
/* NOTE(review): tried_all is neither declared nor assigned in the visible
 * lines of this listing. */
328 /* If we've tried everything, and we're back to the beginning of the
329 list, increase our timeout and try again. It will be reset when
330 we do finally connect. (FIXME: really we should wait for all network
331 state associated with the last connection attempt to drain before
332 trying to reconnect on it.) */
333 if (tried_all && (imp->imp_conn_list.next == &imp_conn->oic_item)) {
334 if (at_get(&imp->imp_at.iat_net_latency) <
335 CONNECTION_SWITCH_MAX) {
336 at_add(&imp->imp_at.iat_net_latency,
337 at_get(&imp->imp_at.iat_net_latency) +
338 CONNECTION_SWITCH_INC);
340 LASSERT(imp_conn->oic_last_attempt);
341 CWARN("%s: tried all connections, increasing latency to %ds\n",
342 imp->imp_obd->obd_name,
343 at_get(&imp->imp_at.iat_net_latency));
346 imp_conn->oic_last_attempt = cfs_time_current_64();
348 /* switch connection, don't mind if it's same as the current one */
349 if (imp->imp_connection)
350 ptlrpc_put_connection(imp->imp_connection);
351 imp->imp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
/* point the DLM export at the same connection as the import; the export
 * reference taken by class_conn2export() is dropped again below */
353 dlmexp = class_conn2export(&imp->imp_dlm_handle);
354 LASSERT(dlmexp != NULL);
355 if (dlmexp->exp_connection)
356 ptlrpc_put_connection(dlmexp->exp_connection);
357 dlmexp->exp_connection = ptlrpc_connection_addref(imp_conn->oic_conn);
358 class_export_put(dlmexp);
/* announce the switch on the console only when replacing an existing current
 * connection, not when the first one is being set */
360 if (imp->imp_conn_current != imp_conn) {
361 if (imp->imp_conn_current)
362 LCONSOLE_INFO("Changing connection for %s to %s/%s\n",
363 imp->imp_obd->obd_name,
364 imp_conn->oic_uuid.uuid,
365 libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
366 imp->imp_conn_current = imp_conn;
369 CDEBUG(D_HA, "%s: import %p using connection %s/%s\n",
370 imp->imp_obd->obd_name, imp, imp_conn->oic_uuid.uuid,
371 libcfs_nid2str(imp_conn->oic_conn->c_peer.nid));
373 spin_unlock(&imp->imp_lock);
/* Start an asynchronous connect or reconnect of @imp.
 * An import that is CLOSED, FULL or already CONNECTING is rejected with an
 * error message.  Otherwise the import moves to CONNECTING, a connection is
 * selected with import_select_connection(), obd_reconnect() is called, and a
 * connect request is prepared and handed to ptlrpcd.  The reply is processed
 * by ptlrpc_connect_interpret().
 * @new_uuid - UUID string converted with obd_str2uuid() and passed to
 *             import_set_conn_priority(); the callers in this file pass NULL.
 * NOTE(review): return statements, error checks after each call and the
 * "out" label are not visible in this listing. */
378 int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid)
380 struct obd_device *obd = imp->imp_obd;
381 int initial_connect = 0;
383 __u64 committed_before_reconnect = 0;
384 struct ptlrpc_request *request;
/* Parallel arrays: buffer sizes and buffer contents for the connect request
 * (presumably the arguments given to ptlrpc_prep_req() below -- confirm). */
385 int size[] = { sizeof(struct ptlrpc_body),
386 sizeof(imp->imp_obd->u.cli.cl_target_uuid),
387 sizeof(obd->obd_uuid),
388 sizeof(imp->imp_dlm_handle),
389 sizeof(imp->imp_connect_data) };
390 char *tmp[] = { NULL,
391 obd2cli_tgt(imp->imp_obd),
393 (char *)&imp->imp_dlm_handle,
394 (char *)&imp->imp_connect_data };
395 struct ptlrpc_connect_async_args *aa;
398 spin_lock(&imp->imp_lock);
399 if (imp->imp_state == LUSTRE_IMP_CLOSED) {
400 spin_unlock(&imp->imp_lock);
401 CERROR("can't connect to a closed import\n");
403 } else if (imp->imp_state == LUSTRE_IMP_FULL) {
404 spin_unlock(&imp->imp_lock);
405 CERROR("already connected\n");
407 } else if (imp->imp_state == LUSTRE_IMP_CONNECTING) {
408 spin_unlock(&imp->imp_lock);
409 CERROR("already connecting\n");
413 IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING);
416 imp->imp_resend_replay = 0;
/* An unused remote handle distinguishes the first connect from a reconnect;
 * presumably initial_connect is set in this branch (assignment not visible).
 * Otherwise the committed transno is remembered for the sanity check done
 * when the reply arrives. */
418 if (!lustre_handle_is_used(&imp->imp_remote_handle))
421 committed_before_reconnect = imp->imp_peer_committed_transno;
423 spin_unlock(&imp->imp_lock);
426 struct obd_uuid uuid;
428 obd_str2uuid(&uuid, new_uuid);
429 rc = import_set_conn_priority(imp, &uuid);
434 rc = import_select_connection(imp);
438 /* last in connection list */
439 if (imp->imp_conn_current->oic_item.next == &imp->imp_conn_list) {
440 if (imp->imp_initial_recov_bk && initial_connect) {
441 CDEBUG(D_HA, "Last connection attempt (%d) for %s\n",
442 imp->imp_conn_cnt, obd2cli_tgt(imp->imp_obd));
443 /* Don't retry if connect fails */
445 obd_set_info_async(obd->obd_self_export,
446 strlen(KEY_INIT_RECOV),
448 sizeof(rc), &rc, NULL);
450 if (imp->imp_recon_bk) {
451 CDEBUG(D_HA, "Last reconnection attempt (%d) for %s\n",
452 imp->imp_conn_cnt, obd2cli_tgt(imp->imp_obd));
453 spin_lock(&imp->imp_lock);
454 imp->imp_last_recon = 1;
455 spin_unlock(&imp->imp_lock);
459 /* Reset connect flags to the originally requested flags, in case
460 * the server is updated on-the-fly we will get the new features. */
461 imp->imp_connect_data.ocd_connect_flags = imp->imp_connect_flags_orig;
462 rc = obd_reconnect(imp->imp_obd->obd_self_export, obd,
463 &obd->obd_uuid, &imp->imp_connect_data);
467 request = ptlrpc_prep_req(imp, LUSTRE_OBD_VERSION, imp->imp_connect_op,
470 GOTO(out, rc = -ENOMEM);
473 lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_LIBCLIENT);
475 lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_NEXT_VER);
477 request->rq_send_state = LUSTRE_IMP_CONNECTING;
478 /* Allow a slightly larger reply for future growth compatibility */
479 size[REPLY_REC_OFF] = sizeof(struct obd_connect_data) +
481 ptlrpc_req_set_repsize(request, 2, size);
482 request->rq_interpret_reply = ptlrpc_connect_interpret;
/* the async args are stored inside the request itself, so check they fit
 * before zeroing and filling them */
484 CLASSERT(sizeof (*aa) <= sizeof (request->rq_async_args));
485 aa = (struct ptlrpc_connect_async_args *)&request->rq_async_args;
486 memset(aa, 0, sizeof *aa);
488 aa->pcaa_peer_committed = committed_before_reconnect;
489 aa->pcaa_initial_connect = initial_connect;
490 if (aa->pcaa_initial_connect) {
491 spin_lock(&imp->imp_lock);
492 imp->imp_replayable = 1;
493 spin_unlock(&imp->imp_lock);
495 /* AT will use INITIAL_CONNECT_TIMEOUT the first
496 time, adaptive after that. */
497 request->rq_timeout = INITIAL_CONNECT_TIMEOUT;
500 DEBUG_REQ(D_RPCTRACE, request, "%sconnect request %d",
501 aa->pcaa_initial_connect ? "initial " : "re",
503 ptlrpcd_add_req(request);
/* Puts the import back into DISCON; presumably reached only on failure (the
 * "out" label and its rc test are not visible here). */
507 IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
512 EXPORT_SYMBOL(ptlrpc_connect_import);
/* Decide how quickly the pinger should look at @imp again.
 * Under imp_lock: if imp_conn_list is non-empty and the current connection is
 * not its last entry, ptlrpc_ping_import_soon() is called.  The pinger is
 * woken with ptlrpc_pinger_wake_up() after the lock is released.
 * NOTE(review): the conditions guarding the wake-up and the preprocessor
 * conditionals for liblustre are not visible in this listing. */
514 static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp)
517 struct obd_import_conn *imp_conn;
523 spin_lock(&imp->imp_lock);
524 if (list_empty(&imp->imp_conn_list))
/* last entry of the connection list */
528 imp_conn = list_entry(imp->imp_conn_list.prev,
529 struct obd_import_conn,
532 if (imp->imp_conn_current != imp_conn) {
533 ptlrpc_ping_import_soon(imp);
538 /* liblustre has no pinger thread, so we wake up the pinger anyway */
542 spin_unlock(&imp->imp_lock);
545 ptlrpc_pinger_wake_up();
/* Reply handler for the connect request queued by ptlrpc_connect_import().
 * The next import state is chosen from the op flags of the reply:
 *  - initial connect: record replayability and message version, adopt the
 *    server handle and go to FULL;
 *  - MSG_CONNECT_RECONNECT: go to EVICTED (import invalid), REPLAY (server
 *    still recovering) or RECOVER;
 *  - MSG_CONNECT_RECOVERING on a valid import: adopt the handle, go to REPLAY;
 *  - anything else: adopt the handle and go to EVICTED.
 * After that the recovery state machine is run, the server's connect data is
 * validated and recorded, and the failure handling further down puts the
 * import back into DISCON and arranges another attempt.
 * NOTE(review): the remaining parameters, the labels ("finish", "out") and
 * the return statements are not visible in this listing. */
550 static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
553 struct ptlrpc_connect_async_args *aa = data;
554 struct obd_import *imp = request->rq_import;
555 struct client_obd *cli = &imp->imp_obd->u.cli;
556 struct lustre_handle old_hdl;
/* the import may have been closed while the request was in flight;
 * presumably the function returns early here (return not visible) */
560 spin_lock(&imp->imp_lock);
561 if (imp->imp_state == LUSTRE_IMP_CLOSED) {
562 spin_unlock(&imp->imp_lock);
565 spin_unlock(&imp->imp_lock);
570 LASSERT(imp->imp_conn_current);
572 msg_flags = lustre_msg_get_op_flags(request->rq_repmsg);
574 /* All imports are pingable */
575 spin_lock(&imp->imp_lock);
576 imp->imp_pingable = 1;
/* First connect: take replayability and message version from the reply,
 * adopt the server's handle and go straight to FULL. */
578 if (aa->pcaa_initial_connect) {
579 if (msg_flags & MSG_CONNECT_REPLAYABLE) {
580 imp->imp_replayable = 1;
581 spin_unlock(&imp->imp_lock);
582 CDEBUG(D_HA, "connected to replayable target: %s\n",
583 obd2cli_tgt(imp->imp_obd));
585 imp->imp_replayable = 0;
586 spin_unlock(&imp->imp_lock);
589 if (msg_flags & MSG_CONNECT_NEXT_VER) {
590 imp->imp_msg_magic = LUSTRE_MSG_MAGIC_V2;
591 CDEBUG(D_RPCTRACE, "connect to %s with lustre_msg_v2\n",
592 obd2cli_tgt(imp->imp_obd));
594 CDEBUG(D_RPCTRACE, "connect to %s with lustre_msg_v1\n",
595 obd2cli_tgt(imp->imp_obd));
598 imp->imp_remote_handle =
599 *lustre_msg_get_handle(request->rq_repmsg);
601 IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
602 GOTO(finish, rc = 0);
604 spin_unlock(&imp->imp_lock);
607 /* Determine what recovery state to move the import to. */
608 if (MSG_CONNECT_RECONNECT & msg_flags) {
/* an all-zero handle in the reply is treated as the server rejecting the
 * handle we presented */
609 memset(&old_hdl, 0, sizeof(old_hdl));
610 if (!memcmp(&old_hdl, lustre_msg_get_handle(request->rq_repmsg),
612 CERROR("%s@%s didn't like our handle "LPX64
613 ", failed\n", obd2cli_tgt(imp->imp_obd),
614 imp->imp_connection->c_remote_uuid.uuid,
615 imp->imp_dlm_handle.cookie);
616 GOTO(out, rc = -ENOTCONN);
/* the server answered with a handle different from the one we remembered */
619 if (memcmp(&imp->imp_remote_handle,
620 lustre_msg_get_handle(request->rq_repmsg),
621 sizeof(imp->imp_remote_handle))) {
623 /* Old MGC can reconnect to a restarted MGS */
624 if (strcmp(imp->imp_obd->obd_type->typ_name,
625 LUSTRE_MGC_NAME) == 0) {
629 "%s@%s changed handle from "LPX64" to "LPX64
630 "; copying, but this may foreshadow disaster\n",
631 obd2cli_tgt(imp->imp_obd),
632 imp->imp_connection->c_remote_uuid.uuid,
633 imp->imp_remote_handle.cookie,
634 lustre_msg_get_handle(request->rq_repmsg)->
636 imp->imp_remote_handle =
637 *lustre_msg_get_handle(request->rq_repmsg);
639 CDEBUG(D_HA, "reconnected to %s@%s after partition\n",
640 obd2cli_tgt(imp->imp_obd),
641 imp->imp_connection->c_remote_uuid.uuid);
644 if (imp->imp_invalid) {
645 CDEBUG(D_HA, "%s: reconnected but import is invalid; "
646 "marking evicted\n", imp->imp_obd->obd_name);
647 IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
648 } else if (MSG_CONNECT_RECOVERING & msg_flags) {
649 CDEBUG(D_HA, "%s: reconnected to %s during replay\n",
650 imp->imp_obd->obd_name,
651 obd2cli_tgt(imp->imp_obd));
653 spin_lock(&imp->imp_lock);
654 imp->imp_resend_replay = 1;
655 spin_unlock(&imp->imp_lock);
657 IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
659 IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
661 } else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) {
662 LASSERT(imp->imp_replayable);
663 imp->imp_remote_handle =
664 *lustre_msg_get_handle(request->rq_repmsg);
665 imp->imp_last_replay_transno = 0;
666 IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
668 DEBUG_REQ(D_HA, request, "evicting (not initial connect and "
669 "flags reconnect/recovering not set: %x)",msg_flags);
670 imp->imp_remote_handle =
671 *lustre_msg_get_handle(request->rq_repmsg);
672 IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
675 /* Sanity checks for a reconnected import. */
676 if (!(imp->imp_replayable) != !(msg_flags & MSG_CONNECT_REPLAYABLE)) {
677 CERROR("imp_replayable flag does not match server "
678 "after reconnect. We should LBUG right here.\n");
/* the server must not report a last-committed transno older than the one
 * sampled before this reconnect was sent */
681 if (lustre_msg_get_last_committed(request->rq_repmsg) <
682 aa->pcaa_peer_committed) {
683 CERROR("%s went back in time (transno "LPD64
684 " was previously committed, server now claims "LPD64
685 ")! See https://bugzilla.clusterfs.com/"
686 "long_list.cgi?buglist=9646\n",
687 obd2cli_tgt(imp->imp_obd), aa->pcaa_peer_committed,
688 lustre_msg_get_last_committed(request->rq_repmsg));
692 rc = ptlrpc_import_recovery_state_machine(imp);
694 if (rc == -ENOTCONN) {
695 CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery;"
696 "invalidating and reconnecting\n",
697 obd2cli_tgt(imp->imp_obd),
698 imp->imp_connection->c_remote_uuid.uuid);
699 ptlrpc_connect_import(imp, NULL);
703 struct obd_connect_data *ocd;
704 struct obd_export *exp;
706 ocd = lustre_swab_repbuf(request, REPLY_REC_OFF, sizeof(*ocd),
707 lustre_swab_connect);
/* Move the connection that was just used to the head of imp_conn_list and
 * record its attempt time as the last successful one. */
708 spin_lock(&imp->imp_lock);
709 list_del(&imp->imp_conn_current->oic_item);
710 list_add(&imp->imp_conn_current->oic_item, &imp->imp_conn_list);
711 imp->imp_last_success_conn =
712 imp->imp_conn_current->oic_last_attempt;
715 spin_unlock(&imp->imp_lock);
716 CERROR("Wrong connect data from server\n");
721 imp->imp_connect_data = *ocd;
723 exp = class_conn2export(&imp->imp_dlm_handle);
724 spin_unlock(&imp->imp_lock);
726 /* check that server granted subset of flags we asked for. */
727 LASSERTF((ocd->ocd_connect_flags &
728 imp->imp_connect_flags_orig) ==
729 ocd->ocd_connect_flags, LPX64" != "LPX64,
730 imp->imp_connect_flags_orig, ocd->ocd_connect_flags);
733 /* This could happen if export is cleaned during the
735 CERROR("Missing export for %s\n",
736 imp->imp_obd->obd_name);
737 GOTO(out, rc = -ENODEV);
739 exp->exp_connect_flags = ocd->ocd_connect_flags;
740 imp->imp_obd->obd_self_export->exp_connect_flags = ocd->ocd_connect_flags;
741 class_export_put(exp);
743 obd_import_event(imp->imp_obd, imp, IMP_EVENT_OCD);
745 if (!ocd->ocd_ibits_known &&
746 ocd->ocd_connect_flags & OBD_CONNECT_IBITS)
747 CERROR("Inodebits aware server returned zero compatible"
750 if ((ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
751 (ocd->ocd_version > LUSTRE_VERSION_CODE +
752 LUSTRE_VERSION_OFFSET_WARN ||
753 ocd->ocd_version < LUSTRE_VERSION_CODE -
754 LUSTRE_VERSION_OFFSET_WARN)) {
755 /* Sigh, some compilers do not like #ifdef in the middle
756 of macro arguments */
759 "older. Consider upgrading this client";
762 "older. Consider recompiling this application";
764 const char *newer = "newer than client version";
766 LCONSOLE_WARN("Server %s version (%d.%d.%d.%d) "
768 obd2cli_tgt(imp->imp_obd),
769 OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
770 OBD_OCD_VERSION_MINOR(ocd->ocd_version),
771 OBD_OCD_VERSION_PATCH(ocd->ocd_version),
772 OBD_OCD_VERSION_FIX(ocd->ocd_version),
773 ocd->ocd_version > LUSTRE_VERSION_CODE ?
774 newer : older, LUSTRE_VERSION_STRING);
777 if (ocd->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) {
778 cli->cl_max_pages_per_rpc =
779 ocd->ocd_brw_size >> CFS_PAGE_SHIFT;
782 imp->imp_obd->obd_namespace->ns_connect_flags = ocd->ocd_connect_flags;
784 if ((ocd->ocd_connect_flags & OBD_CONNECT_AT) &&
785 (imp->imp_msg_magic == LUSTRE_MSG_MAGIC_V2))
786 /* We need a per-message support flag, because
787 a. we don't know if the incoming connect reply
788 supports AT or not (in reply_in_callback)
790 b. failovered server means export and flags are gone
791 (in ptlrpc_send_reply).
792 Can only be set when we know AT is supported at
794 imp->imp_msg_flags |= MSG_AT_SUPPORT;
796 imp->imp_msg_flags &= ~MSG_AT_SUPPORT;
798 LASSERT((cli->cl_max_pages_per_rpc <= PTLRPC_MAX_BRW_PAGES) &&
799 (cli->cl_max_pages_per_rpc > 0));
804 IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
805 if (aa->pcaa_initial_connect && !imp->imp_initial_recov)
806 ptlrpc_deactivate_import(imp);
808 if (imp->imp_recon_bk && imp->imp_last_recon) {
809 /* Give up trying to reconnect */
810 imp->imp_obd->obd_no_recov = 1;
811 ptlrpc_deactivate_import(imp);
/* Version-mismatch refusal: report it on the console, deactivate the import
 * and move it to CLOSED.
 * NOTE(review): the first part of the condition is not visible here. */
815 struct obd_connect_data *ocd;
816 ocd = lustre_swab_repbuf(request, REPLY_REC_OFF,
818 lustre_swab_connect);
820 (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
821 (ocd->ocd_version != LUSTRE_VERSION_CODE)) {
822 /* Actually servers are only supposed to refuse
823 connection from liblustre clients, so we should
824 never see this from VFS context */
825 LCONSOLE_ERROR_MSG(0x16a, "Server %s version "
827 " refused connection from this client "
828 "with an incompatible version (%s). "
829 "Client must be recompiled\n",
830 obd2cli_tgt(imp->imp_obd),
831 OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
832 OBD_OCD_VERSION_MINOR(ocd->ocd_version),
833 OBD_OCD_VERSION_PATCH(ocd->ocd_version),
834 OBD_OCD_VERSION_FIX(ocd->ocd_version),
835 LUSTRE_VERSION_STRING);
836 ptlrpc_deactivate_import(imp);
837 IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED);
842 ptlrpc_maybe_ping_import_soon(imp);
844 CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n",
845 obd2cli_tgt(imp->imp_obd),
846 (char *)imp->imp_connection->c_remote_uuid.uuid, rc);
/* this attempt is finished: clear the last-reconnect marker and wake anyone
 * sleeping on the recovery waitq */
849 spin_lock(&imp->imp_lock);
850 imp->imp_last_recon = 0;
851 spin_unlock(&imp->imp_lock);
853 cfs_waitq_signal(&imp->imp_recovery_waitq);
/* Reply handler for the MSG_LAST_REPLAY ping sent by
 * signal_completed_replay().  Drops the imp_replay_inflight count taken by the
 * sender, then advances the recovery state machine when rq_status is 0, or
 * logs the error and starts a reconnect otherwise.
 * NOTE(review): the remaining parameters and the return are not visible. */
857 static int completed_replay_interpret(struct ptlrpc_request *req,
861 atomic_dec(&req->rq_import->imp_replay_inflight);
862 if (req->rq_status == 0) {
863 ptlrpc_import_recovery_state_machine(req->rq_import);
865 CDEBUG(D_HA, "%s: LAST_REPLAY message error: %d, "
867 req->rq_import->imp_obd->obd_name, req->rq_status);
868 ptlrpc_connect_import(req->rq_import, NULL);
/* Queue an OBD_PING carrying MSG_LAST_REPLAY through ptlrpcd (presumably to
 * tell the server that this client has finished replay -- confirm).
 * imp_replay_inflight must be 0 on entry (asserted) and is raised to 1 for
 * the lifetime of the ping; completed_replay_interpret() drops it again.
 * NOTE(review): return statements are not visible in this listing. */
874 static int signal_completed_replay(struct obd_import *imp)
876 struct ptlrpc_request *req;
879 LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
880 atomic_inc(&imp->imp_replay_inflight);
882 req = ptlrpc_prep_req(imp, LUSTRE_OBD_VERSION, OBD_PING, 1, NULL, NULL);
/* undo the count taken above; presumably only when the request could not be
 * allocated (the check is not visible here) */
884 atomic_dec(&imp->imp_replay_inflight);
888 ptlrpc_req_set_repsize(req, 1, NULL);
889 req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT;
890 lustre_msg_add_flags(req->rq_reqmsg, MSG_LAST_REPLAY);
/* allow three times the request's normal timeout */
891 req->rq_timeout *= 3;
892 req->rq_interpret_reply = completed_replay_interpret;
894 ptlrpcd_add_req(req);
/* Thread body started by ptlrpc_import_recovery_state_machine() for an
 * evicted import.  @data is the struct obd_import.  The thread invalidates
 * the import, dumps the debug log when obd_dump_on_eviction is set, moves the
 * import to RECOVER and re-enters the recovery state machine. */
899 static int ptlrpc_invalidate_import_thread(void *data)
901 struct obd_import *imp = data;
905 ptlrpc_daemonize("ll_imp_inval");
907 CDEBUG(D_HA, "thread invalidate import %s to %s@%s\n",
908 imp->imp_obd->obd_name, obd2cli_tgt(imp->imp_obd),
909 imp->imp_connection->c_remote_uuid.uuid);
911 ptlrpc_invalidate_import(imp);
913 if (obd_dump_on_eviction) {
914 CERROR("dump the log upon eviction\n");
915 libcfs_debug_dumplog();
918 IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
919 ptlrpc_import_recovery_state_machine(imp);
/* Advance @imp through recovery as far as its current state allows.
 * The stages are tested one after another, so a single call can fall through
 * several of them:
 *   EVICTED      -> invalidate the import, then RECOVER
 *   REPLAY       -> replay requests, then REPLAY_LOCKS
 *   REPLAY_LOCKS -> REPLAY_WAIT once nothing is in flight
 *   REPLAY_WAIT  -> RECOVER once nothing is in flight
 *   RECOVER      -> resend, then FULL and re-activate the import
 *   FULL         -> wake waiters and delayed requests
 * It is re-entered from the reply handlers in this file as replies arrive.
 * NOTE(review): local declarations, error checks, the "out" label and the
 * return are not visible in this listing. */
925 int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
933 if (imp->imp_state == LUSTRE_IMP_EVICTED) {
934 deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
935 &target_start, &target_len);
936 /* Don't care about MGC eviction */
937 if (strcmp(imp->imp_obd->obd_type->typ_name,
938 LUSTRE_MGC_NAME) != 0) {
939 LCONSOLE_ERROR_MSG(0x167, "This client was evicted by "
940 "%.*s; in progress operations using "
941 "this service will fail.\n",
942 target_len, target_start);
944 CDEBUG(D_HA, "evicted from %s@%s; invalidating\n",
945 obd2cli_tgt(imp->imp_obd),
946 imp->imp_connection->c_remote_uuid.uuid);
/* NOTE(review): both a threaded and a direct invalidation appear below; the
 * preprocessor conditional selecting between them is not visible here. */
949 rc = cfs_kernel_thread(ptlrpc_invalidate_import_thread, imp,
950 CLONE_VM | CLONE_FILES);
952 CERROR("error starting invalidate thread: %d\n", rc);
957 ptlrpc_invalidate_import(imp);
959 IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
/* Replay requests; once no replay is in flight, proceed to lock replay. */
963 if (imp->imp_state == LUSTRE_IMP_REPLAY) {
964 CDEBUG(D_HA, "replay requested by %s\n",
965 obd2cli_tgt(imp->imp_obd));
966 rc = ptlrpc_replay_next(imp, &inflight);
968 atomic_read(&imp->imp_replay_inflight) == 0) {
969 IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
970 rc = ldlm_replay_locks(imp);
977 if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) {
978 if (atomic_read(&imp->imp_replay_inflight) == 0) {
979 IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_WAIT);
980 rc = signal_completed_replay(imp);
987 if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) {
988 if (atomic_read(&imp->imp_replay_inflight) == 0) {
989 IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
993 if (imp->imp_state == LUSTRE_IMP_RECOVER) {
994 CDEBUG(D_HA, "reconnected to %s@%s\n",
995 obd2cli_tgt(imp->imp_obd),
996 imp->imp_connection->c_remote_uuid.uuid);
998 rc = ptlrpc_resend(imp);
/* the import becomes usable again: FULL state, imp_invalid cleared, and the
 * restored connection is announced on the console */
1001 IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
1002 ptlrpc_activate_import(imp);
1004 deuuidify(obd2cli_tgt(imp->imp_obd), NULL,
1005 &target_start, &target_len);
1006 LCONSOLE_INFO("%s: Connection restored to service %.*s "
1007 "using nid %s.\n", imp->imp_obd->obd_name,
1008 target_len, target_start,
1009 libcfs_nid2str(imp->imp_connection->c_peer.nid));
1012 if (imp->imp_state == LUSTRE_IMP_FULL) {
1013 cfs_waitq_signal(&imp->imp_recovery_waitq);
1014 ptlrpc_wake_delayed(imp);
/* Timeout callback handed to LWI_TIMEOUT_INTR() in ptlrpc_disconnect_import().
 * NOTE(review): the body is not visible in this listing. */
1021 static int back_to_sleep(void *unused)
/* Disconnect @imp from its server.
 * The disconnect opcode is derived from imp_connect_op.  If the import is in
 * recovery the caller first waits on imp_recovery_waitq for recovery to end.
 * A DISCONNECT request is then sent synchronously with resends disabled, and
 * finally the remote handle is zeroed and imp_last_recon is cleared under
 * imp_lock.
 * @noclose - presumably selects DISCON rather than CLOSED as the final state;
 *            the test itself is not visible in this listing.
 * NOTE(review): error checks, the "out" label and the return are not visible. */
1026 int ptlrpc_disconnect_import(struct obd_import *imp, int noclose)
1028 struct ptlrpc_request *req;
/* map the connect opcode of this import to the matching disconnect opcode */
1032 switch (imp->imp_connect_op) {
1033 case OST_CONNECT: rq_opc = OST_DISCONNECT; break;
1034 case MDS_CONNECT: rq_opc = MDS_DISCONNECT; break;
1035 case MGS_CONNECT: rq_opc = MGS_DISCONNECT; break;
1037 CERROR("don't know how to disconnect from %s (connect_op %d)\n",
1038 obd2cli_tgt(imp->imp_obd), imp->imp_connect_op);
/* Recovery in progress: wait for it to end, bounded either by obd_timeout or
 * by the adaptive service estimate for the request portal (the condition
 * choosing between the two is not visible here). */
1042 if (ptlrpc_import_in_recovery(imp)) {
1043 struct l_wait_info lwi;
1044 cfs_duration_t timeout;
1047 timeout = cfs_time_seconds(obd_timeout);
1049 int idx = import_at_get_index(imp,
1050 imp->imp_client->cli_request_portal);
1051 timeout = cfs_time_seconds(
1052 at_get(&imp->imp_at.iat_service_estimate[idx]));
1054 lwi = LWI_TIMEOUT_INTR(cfs_timeout_cap(timeout),
1055 back_to_sleep, LWI_ON_SIGNAL_NOOP, NULL);
1056 rc = l_wait_event(imp->imp_recovery_waitq,
1057 !ptlrpc_import_in_recovery(imp), &lwi);
1060 spin_lock(&imp->imp_lock);
1061 if (imp->imp_state != LUSTRE_IMP_FULL)
1064 spin_unlock(&imp->imp_lock);
1066 req = ptlrpc_prep_req(imp, LUSTRE_OBD_VERSION, rq_opc, 1, NULL, NULL);
1068 /* We are disconnecting, do not retry a failed DISCONNECT rpc if
1069 * it fails. We can get through the above with a down server
1070 * if the client doesn't know the server is gone yet. */
1071 req->rq_no_resend = 1;
1074 /* We want client umounts to happen quickly, no matter the
1076 req->rq_timeout = min_t(int, req->rq_timeout,
1077 INITIAL_CONNECT_TIMEOUT);
1079 /* ... but we always want liblustre clients to nicely
1080 disconnect, so only use the adaptive value. */
1082 req->rq_timeout = obd_timeout / 3;
/* the import is put into CONNECTING so that it matches the rq_send_state of
 * the disconnect request */
1085 IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING);
1086 req->rq_send_state = LUSTRE_IMP_CONNECTING;
1087 ptlrpc_req_set_repsize(req, 1, NULL);
1088 rc = ptlrpc_queue_wait(req);
1089 ptlrpc_req_finished(req);
1092 spin_lock(&imp->imp_lock);
1095 IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
1097 IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED);
1098 memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle));
1099 /* Try all connections in the future - bz 12758 */
1100 imp->imp_last_recon = 0;
1101 spin_unlock(&imp->imp_lock);
1106 /* Sets the maximal number of RPCs that may originate from the other side of
1107 this import (the server) to us, and the number of async RPC replies that we
1108 are not waiting for. */
/* Thin wrapper: passes @count to LNetSetAsync() for the peer of the import's
 * current connection. */
1109 void ptlrpc_import_setasync(struct obd_import *imp, int count)
1111 LNetSetAsync(imp->imp_connection->c_peer, count);
1115 /* Adaptive Timeout utils */
1117 /* Bin into timeslices using AT_BINS bins.
1118 This gives us a max of the last binlimit*AT_BINS secs without the storage,
1119 but still smoothing out a return to normalcy from a slow response.
1120 (E.g. remember the maximum latency in each minute of the last 4 minutes.) */
/* Record measurement @val in @at and update at_current, all under at_lock.
 * at_hist[0] is the bin for the current time slice; each bin covers binlimit
 * seconds (adaptive_timeout_history / AT_BINS, at least 1).  at_current is
 * the maximum over the bins, or just @val when AT_FLG_NOHIST is set.
 * at_worst_ever and at_worst_time track the largest at_current seen.
 * The value prepared for return is the previous at_current if it changed and
 * 0 otherwise.
 * NOTE(review): the early exit for val == 0 and the return statement are not
 * visible in this listing. */
1121 int at_add(struct adaptive_timeout *at, unsigned int val)
1123 unsigned int old = at->at_current;
1124 time_t now = cfs_time_current_sec();
1125 time_t binlimit = max_t(time_t, adaptive_timeout_history / AT_BINS, 1);
1129 CDEBUG(D_INFO, "add %u to %p time=%lu v=%u (%u %u %u %u)\n",
1130 val, at, now - at->at_binstart, at->at_current,
1131 at->at_hist[0], at->at_hist[1], at->at_hist[2], at->at_hist[3]);
1134 /* 0's don't count, because we never want our timeout to
1135 drop to 0, and because 0 could mean an error */
1138 spin_lock(&at->at_lock);
1140 if (unlikely(at->at_binstart == 0)) {
1141 /* Special case to remove default from history */
1142 at->at_current = val;
1143 at->at_worst_ever = val;
1144 at->at_worst_time = now;
1145 at->at_hist[0] = val;
1146 at->at_binstart = now;
1147 } else if (now - at->at_binstart < binlimit ) {
/* still inside the current time slice: only raise the maxima */
1149 at->at_hist[0] = max(val, at->at_hist[0]);
1150 at->at_current = max(val, at->at_current);
/* the current slice has expired: age the history by "shift" bins, start a new
 * bin with @val and recompute the maximum over all bins */
1153 unsigned int maxv = val;
1154 /* move bins over */
1155 shift = (now - at->at_binstart) / binlimit;
1157 for(i = AT_BINS - 1; i >= 0; i--) {
1159 at->at_hist[i] = at->at_hist[i - shift];
1160 maxv = max(maxv, at->at_hist[i]);
1165 at->at_hist[0] = val;
1166 at->at_current = maxv;
1167 at->at_binstart += shift * binlimit;
1170 if (at->at_current > at->at_worst_ever) {
1171 at->at_worst_ever = at->at_current;
1172 at->at_worst_time = now;
1175 if (at->at_flags & AT_FLG_NOHIST)
1176 /* Only keep last reported val; keeping the rest of the history
1178 at->at_current = val;
1181 if (at->at_current != old)
1182 CDEBUG(D_ADAPTTO, "AT change: old=%u new=%u delta=%d (val=%u) "
1183 "hist %u %u %u %u\n",
1184 old, at->at_current, at->at_current - old, val,
1185 at->at_hist[0], at->at_hist[1], at->at_hist[2],
1189 /* if we changed, report the old value */
1190 old = (at->at_current != old) ? old : 0;
1192 spin_unlock(&at->at_lock);
1196 /* Find the imp_at index for a given portal; assign if space available */
/* The first scan runs without a lock and stops at a matching entry or at the
 * first unused (zero) slot.  If @portal was not found, the scan continues
 * from that position under imp_lock, in case another thread filled slots in
 * the meantime, and @portal is stored in the slot it stops at.  A full table
 * trips the LASSERT.
 * NOTE(review): the statements taken inside the two loops and the return are
 * not visible in this listing. */
1197 int import_at_get_index(struct obd_import *imp, int portal)
1199 struct imp_at *at = &imp->imp_at;
1202 for (i = 0; i < IMP_AT_MAX_PORTALS; i++) {
1203 if (at->iat_portal[i] == portal)
1205 if (at->iat_portal[i] == 0)
1210 /* Not found in list, add it under a lock */
1211 spin_lock(&imp->imp_lock);
1213 /* Check unused under lock */
1214 for (; i < IMP_AT_MAX_PORTALS; i++) {
1215 if (at->iat_portal[i] == portal)
1217 if (at->iat_portal[i] == 0)
1222 /* Not enough portals? */
1223 LASSERT(i < IMP_AT_MAX_PORTALS);
1225 at->iat_portal[i] = portal;
1227 spin_unlock(&imp->imp_lock);
1231 /* Get total expected lock callback time (net + service).
1232 Since any early reply will only affect the RPC wait time, and not
1233 any local lock timer we set based on the return value here,
1234 we should be conservative. */
/* The estimate is the current network latency plus the service estimate for
 * the import's request portal, increased by half of that sum plus 10.
 * NOTE(review): the value used when @imp or its client is NULL or AT is off,
 * and the final return, are not visible in this listing. */
1235 int import_at_get_ldlm(struct obd_import *imp)
1239 if (!imp || !imp->imp_client || AT_OFF)
1242 idx = import_at_get_index(imp, imp->imp_client->cli_request_portal);
1243 tot = at_get(&imp->imp_at.iat_net_latency) +
1244 at_get(&imp->imp_at.iat_service_estimate[idx]);
1246 /* add an arbitrary minimum: 150% + 10 sec */
1247 tot += (tot >> 1) + 10;