/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
 * vim:expandtab:shiftwidth=8:tabstop=8:
 *
 * Portal-RPC reconnection and replay operations, for use in recovery.
 *
 * Copyright (c) 2002, 2003 Cluster File Systems, Inc.
 *   Author: Mike Shaver <shaver@clusterfs.com>
 *
 *   This file is part of Lustre, http://www.lustre.org.
 *
 *   Lustre is free software; you can redistribute it and/or
 *   modify it under the terms of version 2 of the GNU General Public
 *   License as published by the Free Software Foundation.
 *
 *   Lustre is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with Lustre; if not, write to the Free Software
 *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
#define DEBUG_SUBSYSTEM S_RPC
#ifdef __KERNEL__
# include <linux/config.h>
# include <linux/module.h>
# include <linux/kmod.h>
#else
# include <liblustre.h>
#endif

#include <linux/obd_support.h>
#include <linux/lustre_ha.h>
#include <linux/lustre_net.h>
#include <linux/lustre_import.h>
#include <linux/lustre_export.h>
#include <linux/obd.h>
#include <linux/obd_class.h>
#include <linux/obd_lov.h> /* for IOC_LOV_SET_OSC_ACTIVE */

#include "ptlrpc_internal.h"
/*
 * Positive result codes produced by ptlrpc_reconnect_import() when the
 * target answered the connect request.  Failures are reported by that
 * function as negative errno values instead.
 */
enum reconnect_result {
        RECON_RESULT_RECOVERING  = 1, /* reply had MSG_CONNECT_RECOVERING set */
        RECON_RESULT_RECONNECTED = 2, /* reply had MSG_CONNECT_RECONNECT set */
        RECON_RESULT_EVICTED     = 3, /* reply had neither flag set */
};
51 int ptlrpc_reconnect_import(struct obd_import *imp)
53 struct obd_device *obd = imp->imp_obd;
54 int rc, size[] = {sizeof(imp->imp_target_uuid),
55 sizeof(obd->obd_uuid),
56 sizeof(imp->imp_dlm_handle)};
57 char *tmp[] = {imp->imp_target_uuid.uuid,
59 (char *)&imp->imp_dlm_handle};
60 struct ptlrpc_connection *conn = imp->imp_connection;
61 struct ptlrpc_request *req;
62 struct lustre_handle old_hdl;
63 __u64 committed_before_reconnect = imp->imp_peer_committed_transno;
66 req = ptlrpc_prep_req(imp, imp->imp_connect_op, 3, size, tmp);
69 req->rq_level = LUSTRE_CONN_NEW;
70 req->rq_replen = lustre_msg_size(0, NULL);
71 rc = ptlrpc_queue_wait(req);
73 /* what if rc > 0 ??*/
74 CERROR("cannot connect to %s@%s: rc = %d\n",
75 imp->imp_target_uuid.uuid, conn->c_remote_uuid.uuid, rc);
79 if (lustre_msg_get_op_flags(req->rq_repmsg) & MSG_CONNECT_RECONNECT) {
80 memset(&old_hdl, 0, sizeof(old_hdl));
81 if (!memcmp(&old_hdl, &req->rq_repmsg->handle,
83 CERROR("%s@%s didn't like our handle "LPX64
84 ", failed\n", imp->imp_target_uuid.uuid,
85 conn->c_remote_uuid.uuid,
86 imp->imp_dlm_handle.cookie);
87 GOTO(out_disc, rc = -ENOTCONN);
90 if (memcmp(&imp->imp_remote_handle, &req->rq_repmsg->handle,
91 sizeof(imp->imp_remote_handle))) {
92 CERROR("%s@%s changed handle from "LPX64" to "LPX64
93 "; copying, but this may foreshadow disaster\n",
94 imp->imp_target_uuid.uuid,
95 conn->c_remote_uuid.uuid,
96 imp->imp_remote_handle.cookie,
97 req->rq_repmsg->handle.cookie);
98 imp->imp_remote_handle = req->rq_repmsg->handle;
99 GOTO(out_disc, rc = RECON_RESULT_RECONNECTED);
102 CERROR("reconnected to %s@%s after partition\n",
103 imp->imp_target_uuid.uuid, conn->c_remote_uuid.uuid);
104 GOTO(out_disc, rc = RECON_RESULT_RECONNECTED);
105 } else if (lustre_msg_get_op_flags(req->rq_repmsg) &
106 MSG_CONNECT_RECOVERING) {
107 rc = RECON_RESULT_RECOVERING;
109 rc = RECON_RESULT_EVICTED;
112 old_hdl = imp->imp_remote_handle;
113 imp->imp_remote_handle = req->rq_repmsg->handle;
114 CERROR("reconnected to %s@%s ("LPX64", was "LPX64")!\n",
115 imp->imp_target_uuid.uuid, conn->c_remote_uuid.uuid,
116 imp->imp_remote_handle.cookie, old_hdl.cookie);
117 if (req->rq_repmsg->last_committed < committed_before_reconnect) {
118 CERROR("%s went back in time (transno "LPD64
119 " was committed, server claims "LPD64
120 ")! is shared storage not coherent?\n",
121 imp->imp_target_uuid.uuid,
122 imp->imp_peer_committed_transno,
123 req->rq_repmsg->last_committed);
129 ptlrpc_req_finished(req);
133 void ptlrpc_run_recovery_over_upcall(struct obd_device *obd)
140 argv[0] = obd_lustre_upcall;
141 argv[1] = "RECOVERY_OVER";
142 argv[2] = obd->obd_uuid.uuid;
146 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
149 rc = USERMODEHELPER(argv[0], argv, envp);
151 CERROR("Error invoking recovery upcall %s %s %s: %d; check "
152 "/proc/sys/lustre/upcall\n",
153 argv[0], argv[1], argv[2], rc);
156 CERROR("Invoked upcall %s %s %s",
157 argv[0], argv[1], argv[2]);
161 void ptlrpc_run_failed_import_upcall(struct obd_import* imp)
168 argv[0] = obd_lustre_upcall;
169 argv[1] = "FAILED_IMPORT";
170 argv[2] = imp->imp_target_uuid.uuid;
171 argv[3] = imp->imp_obd->obd_name;
172 argv[4] = imp->imp_connection->c_remote_uuid.uuid;
173 argv[5] = imp->imp_obd->obd_uuid.uuid;
177 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
180 rc = USERMODEHELPER(argv[0], argv, envp);
182 CERROR("Error invoking recovery upcall %s %s %s %s %s: %d; "
183 "check /proc/sys/lustre/lustre_upcall\n",
184 argv[0], argv[1], argv[2], argv[3], argv[4],rc);
187 CERROR("Invoked upcall %s %s %s %s %s\n",
188 argv[0], argv[1], argv[2], argv[3], argv[4]);
192 int ptlrpc_replay(struct obd_import *imp)
195 struct list_head *tmp, *pos;
196 struct ptlrpc_request *req;
200 /* It might have committed some after we last spoke, so make sure we
201 * get rid of them now.
203 spin_lock_irqsave(&imp->imp_lock, flags);
204 ptlrpc_free_committed(imp);
205 spin_unlock_irqrestore(&imp->imp_lock, flags);
207 CDEBUG(D_HA, "import %p from %s has committed "LPD64"\n",
208 imp, imp->imp_target_uuid.uuid, imp->imp_peer_committed_transno);
210 list_for_each(tmp, &imp->imp_replay_list) {
211 req = list_entry(tmp, struct ptlrpc_request, rq_list);
212 DEBUG_REQ(D_HA, req, "RETAINED: ");
215 /* Do I need to hold a lock across this iteration? We shouldn't be
216 * racing with any additions to the list, because we're in recovery
217 * and are therefore not processing additional requests to add. Calls
218 * to ptlrpc_free_committed might commit requests, but nothing "newer"
219 * than the one we're replaying (it can't be committed until it's
220 * replayed, and we're doing that here). l_f_e_safe protects against
221 * problems with the current request being committed, in the unlikely
222 * event of that race. So, in conclusion, I think that it's safe to
223 * perform this list-walk without the imp_lock held.
225 * But, the {mdc,osc}_replay_open callbacks both iterate
226 * request lists, and have comments saying they assume the
227 * imp_lock is being held by ptlrpc_replay, but it's not. it's
228 * just a little race...
230 list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
231 req = list_entry(tmp, struct ptlrpc_request, rq_list);
233 DEBUG_REQ(D_HA, req, "REPLAY:");
235 rc = ptlrpc_replay_req(req);
238 CERROR("recovery replay error %d for req "LPD64"\n",
247 int ptlrpc_resend(struct obd_import *imp)
249 struct list_head *tmp, *pos;
250 struct ptlrpc_request *req;
255 /* As long as we're in recovery, nothing should be added to the sending
256 * list, so we don't need to hold the lock during this iteration and
259 /* Well... what if lctl recover is called twice at the same time?
261 spin_lock_irqsave(&imp->imp_lock, flags);
262 LASSERT(imp->imp_level == LUSTRE_CONN_RECOVER);
263 spin_unlock_irqrestore(&imp->imp_lock, flags);
265 list_for_each_safe(tmp, pos, &imp->imp_sending_list) {
266 req = list_entry(tmp, struct ptlrpc_request, rq_list);
267 ptlrpc_resend_req(req);
273 void ptlrpc_wake_delayed(struct obd_import *imp)
276 struct list_head *tmp, *pos;
277 struct ptlrpc_request *req;
279 spin_lock_irqsave(&imp->imp_lock, flags);
280 list_for_each_safe(tmp, pos, &imp->imp_delayed_list) {
281 req = list_entry(tmp, struct ptlrpc_request, rq_list);
283 ptlrpc_put_connection(req->rq_connection);
285 ptlrpc_connection_addref(req->rq_import->imp_connection);
288 DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set);
289 wake_up(&req->rq_set->set_waitq);
291 DEBUG_REQ(D_HA, req, "waking:");
292 wake_up(&req->rq_wait_for_rep);
295 spin_unlock_irqrestore(&imp->imp_lock, flags);
298 inline void ptlrpc_invalidate_import_state(struct obd_import *imp)
300 struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
301 if (ptlrpc_ldlm_namespace_cleanup == NULL)
302 CERROR("ptlrpc/ldlm hook is NULL! Please tell phil\n");
304 ptlrpc_ldlm_namespace_cleanup(ns, 1 /* no network ops */);
305 ptlrpc_abort_inflight(imp);
308 void ptlrpc_handle_failed_import(struct obd_import *imp)
311 if (!imp->imp_replayable) {
313 "import %s@%s for %s not replayable, deactivating\n",
314 imp->imp_target_uuid.uuid,
315 imp->imp_connection->c_remote_uuid.uuid,
316 imp->imp_obd->obd_name);
317 ptlrpc_set_import_active(imp, 0);
320 ptlrpc_run_failed_import_upcall(imp);
324 void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req)
327 struct obd_import *imp= failed_req->rq_import;
331 CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n",
332 imp->imp_obd->obd_name,
333 imp->imp_target_uuid.uuid,
334 imp->imp_connection->c_remote_uuid.uuid);
335 rc = ptlrpc_recover_import(imp, NULL);
337 ptlrpc_resend_req(failed_req);
339 ptlrpc_handle_failed_import(imp);
340 } else if (rc == RECON_RESULT_RECOVERING) {
341 ptlrpc_resend_req(failed_req);
343 if (rc != RECON_RESULT_EVICTED) {
344 /* like LBUG, without the locking up */
345 CERROR("unknown recover_import result %d\n", rc);
346 portals_debug_dumplog();
347 portals_run_lbug_upcall(__FILE__, __FUNCTION__,
350 LASSERT(failed_req->rq_import_generation < imp->imp_generation);
351 spin_lock_irqsave (&failed_req->rq_lock, flags);
352 failed_req->rq_err = 1;
353 spin_unlock_irqrestore (&failed_req->rq_lock, flags);
358 int ptlrpc_set_import_active(struct obd_import *imp, int active)
360 struct obd_device *notify_obd;
364 LASSERT(imp->imp_obd);
366 notify_obd = imp->imp_obd->u.cli.cl_containing_lov;
368 /* When deactivating, mark import invalid, and abort in-flight
371 spin_lock_irqsave(&imp->imp_lock, flags);
372 /* This is a bit of a hack, but invalidating replayable
373 * imports makes a temporary reconnect failure into a much more
374 * ugly -- and hard to remedy -- situation. */
375 if (!imp->imp_replayable) {
376 CDEBUG(D_HA, "setting import %s INVALID\n",
377 imp->imp_target_uuid.uuid);
378 imp->imp_invalid = 1;
380 imp->imp_generation++;
381 spin_unlock_irqrestore(&imp->imp_lock, flags);
382 ptlrpc_invalidate_import_state(imp);
383 //ptlrpc_abort_inflight(imp);
386 if (notify_obd == NULL)
389 /* How gross is _this_? */
390 if (!list_empty(¬ify_obd->obd_exports)) {
391 struct lustre_handle fakeconn;
392 struct obd_ioctl_data ioc_data = { 0 };
393 struct obd_export *exp =
394 list_entry(notify_obd->obd_exports.next,
395 struct obd_export, exp_obd_chain);
397 fakeconn.cookie = exp->exp_handle.h_cookie;
398 ioc_data.ioc_inlbuf1 = (char *)&imp->imp_target_uuid;
399 ioc_data.ioc_offset = active;
400 rc = obd_iocontrol(IOC_LOV_SET_OSC_ACTIVE, &fakeconn,
401 sizeof ioc_data, &ioc_data, NULL);
403 CERROR("error %sabling %s on LOV %p/%s: %d\n",
404 active ? "en" : "dis",
405 imp->imp_target_uuid.uuid, notify_obd,
406 notify_obd->obd_uuid.uuid, rc);
408 CDEBUG(D_HA, "No exports for obd %p/%s, can't notify about "
409 "%p\n", notify_obd, notify_obd->obd_uuid.uuid,
410 imp->imp_obd->obd_uuid.uuid);
415 /* When activating, mark import valid */
417 CDEBUG(D_HA, "setting import %s VALID\n",
418 imp->imp_target_uuid.uuid);
419 spin_lock_irqsave(&imp->imp_lock, flags);
420 imp->imp_invalid = 0;
421 spin_unlock_irqrestore(&imp->imp_lock, flags);
427 void ptlrpc_fail_import(struct obd_import *imp, int generation)
433 LASSERT (!imp->imp_dlm_fake);
435 spin_lock_irqsave(&imp->imp_lock, flags);
436 if (imp->imp_level != LUSTRE_CONN_FULL)
439 imp->imp_level = LUSTRE_CONN_NOTCONN;
440 spin_unlock_irqrestore(&imp->imp_lock, flags);
447 ptlrpc_handle_failed_import(imp);
451 static int signal_completed_replay(struct obd_import *imp)
453 struct ptlrpc_request *req;
457 req = ptlrpc_prep_req(imp, OBD_PING, 0, NULL, NULL);
461 req->rq_replen = lustre_msg_size(0, NULL);
462 req->rq_level = LUSTRE_CONN_RECOVER;
463 req->rq_reqmsg->flags |= MSG_LAST_REPLAY;
465 rc = ptlrpc_queue_wait(req);
467 ptlrpc_req_finished(req);
471 int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid)
479 spin_lock_irqsave(&imp->imp_lock, flags);
480 if (imp->imp_level == LUSTRE_CONN_FULL ||
481 imp->imp_level == LUSTRE_CONN_NOTCONN)
482 imp->imp_level = LUSTRE_CONN_RECOVER;
485 spin_unlock_irqrestore(&imp->imp_lock, flags);
491 struct ptlrpc_connection *conn;
492 struct obd_uuid uuid;
493 struct ptlrpc_peer peer;
494 struct obd_export *dlmexp;
496 obd_str2uuid(&uuid, new_uuid);
497 if (ptlrpc_uuid_to_peer(&uuid, &peer)) {
498 CERROR("no connection found for UUID %s\n", new_uuid);
502 conn = ptlrpc_get_connection(&peer, &uuid);
506 CDEBUG(D_HA, "switching import %s/%s from %s to %s\n",
507 imp->imp_target_uuid.uuid, imp->imp_obd->obd_name,
508 imp->imp_connection->c_remote_uuid.uuid,
509 conn->c_remote_uuid.uuid);
511 /* Switch the import's connection and the DLM export's
512 * connection (which are almost certainly the same, but we
513 * keep distinct refs just to make things clearer. I think. */
514 if (imp->imp_connection)
515 ptlrpc_put_connection(imp->imp_connection);
516 /* We hand off the ref from ptlrpc_get_connection. */
517 imp->imp_connection = conn;
519 dlmexp = class_conn2export(&imp->imp_dlm_handle);
520 if (dlmexp->exp_connection)
521 ptlrpc_put_connection(dlmexp->exp_connection);
522 dlmexp->exp_connection = ptlrpc_connection_addref(conn);
523 class_export_put(dlmexp);
527 recon_result = ptlrpc_reconnect_import(imp);
529 if (recon_result < 0) {
530 CERROR("failed to reconnect to %s@%s: %d\n",
531 imp->imp_target_uuid.uuid,
532 imp->imp_connection->c_remote_uuid.uuid, recon_result);
533 spin_lock_irqsave(&imp->imp_lock, flags);
534 imp->imp_level = LUSTRE_CONN_NOTCONN;
535 spin_unlock_irqrestore(&imp->imp_lock, flags);
536 RETURN(recon_result);
539 if (recon_result == RECON_RESULT_RECOVERING) {
540 CDEBUG(D_HA, "replay requested by %s\n",
541 imp->imp_target_uuid.uuid);
542 rc = ptlrpc_replay(imp);
546 if (ptlrpc_ldlm_replay_locks == NULL)
547 CERROR("ptlrpc/ldlm hook is NULL! Please tell phil\n");
549 rc = ptlrpc_ldlm_replay_locks(imp);
553 rc = signal_completed_replay(imp);
556 } else if (recon_result == RECON_RESULT_RECONNECTED) {
557 CDEBUG(D_HA, "reconnected to %s@%s\n",
558 imp->imp_target_uuid.uuid,
559 imp->imp_connection->c_remote_uuid.uuid);
560 } else if (recon_result == RECON_RESULT_EVICTED) {
561 CDEBUG(D_HA, "evicted from %s@%s; invalidating\n",
562 imp->imp_target_uuid.uuid,
563 imp->imp_connection->c_remote_uuid.uuid);
564 ptlrpc_set_import_active(imp, 0);
569 ptlrpc_set_import_active(imp, 1);
571 rc = ptlrpc_resend(imp);
573 spin_lock_irqsave(&imp->imp_lock, flags);
574 imp->imp_level = LUSTRE_CONN_FULL;
575 spin_unlock_irqrestore(&imp->imp_lock, flags);
577 ptlrpc_wake_delayed(imp);
585 void ptlrpc_fail_export(struct obd_export *exp)
587 int rc, already_failed;
588 struct lustre_handle hdl;
591 spin_lock_irqsave(&exp->exp_lock, flags);
592 already_failed = exp->exp_failed;
594 spin_unlock_irqrestore(&exp->exp_lock, flags);
596 if (already_failed) {
597 CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n",
598 exp, exp->exp_client_uuid.uuid);
602 CDEBUG(D_HA, "disconnecting export %p/%s\n",
603 exp, exp->exp_client_uuid.uuid);
604 hdl.cookie = exp->exp_handle.h_cookie;
605 rc = obd_disconnect(&hdl, 0);
607 CERROR("disconnecting export %p failed: %d\n", exp, rc);