lustre/ptlrpc/recover.c

   1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   2  * vim:expandtab:shiftwidth=8:tabstop=8:
   3  *
   4  * Portal-RPC reconnection and replay operations, for use in recovery.
   5  *
   6  * This code is issued under the GNU General Public License.
   7  * See the file COPYING in this distribution
   8  *
   9  * Copyright (C) 1996 Peter J. Braam <braam@stelias.com>
  10  * Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
  11  * Copyright (C) 1999 Seagate Technology Inc.
  12  * Copyright (C) 2001 Mountain View Data, Inc.
  13  * Copyright (C) 2002 Cluster File Systems, Inc.
  14  *
  15  */
  16
  17 #include <linux/config.h>
  18 #include <linux/module.h>
  19 #include <linux/kmod.h>
  20
  21 #define DEBUG_SUBSYSTEM S_RPC
  22
  23 #include <linux/lustre_ha.h>
  24 #include <linux/lustre_net.h>
  25 #include <linux/obd.h>
  26
  27 int ptlrpc_reconnect_import(struct obd_import *imp, int rq_opc)
  28 {
  29         struct obd_device *obd = imp->imp_obd;
  30         struct client_obd *cli = &obd->u.cli;
  31         int size[] = { sizeof(cli->cl_target_uuid), sizeof(obd->obd_uuid) };
  32         char *tmp[] = {cli->cl_target_uuid, obd->obd_uuid };
  33         struct ptlrpc_connection *conn = imp->imp_connection;
  34         struct lustre_handle old_hdl;
  35         struct ptlrpc_request *request;
  36         struct obd_export *ldlmexp;
  37         int rc;
  38
  39         request = ptlrpc_prep_req(imp, rq_opc, 2, size, tmp);
  40         request->rq_level = LUSTRE_CONN_NEW;
  41         request->rq_replen = lustre_msg_size(0, NULL);
  42         /*
  43          * This address is the export that represents our client-side LDLM
  44          * service (for ASTs).  We should only have one on this list, so we
  45          * just grab the first one.
  46          *
  47          * XXX tear down export, call class_obd_connect?
  48          */
  49         ldlmexp = list_entry(obd->obd_exports.next, struct obd_export,
  50                              exp_obd_chain);
  51         request->rq_reqmsg->addr = (__u64)(unsigned long)ldlmexp;
  52         request->rq_reqmsg->cookie = ldlmexp->exp_cookie;
  53         rc = ptlrpc_queue_wait(request);
  54         switch (rc) {
  55             case EALREADY:
  56             case -EALREADY:
  57                 /* already connected! */
  58                 memset(&old_hdl, 0, sizeof(old_hdl));
  59                 if (!memcmp(&old_hdl.addr, &request->rq_repmsg->addr,
  60                             sizeof (old_hdl.addr)) &&
  61                     !memcmp(&old_hdl.cookie, &request->rq_repmsg->cookie,
  62                             sizeof (old_hdl.cookie))) {
  63                         CERROR("%s@%s didn't like our handle %Lx/%Lx, failed\n",
  64                                cli->cl_target_uuid, conn->c_remote_uuid,
  65                                (__u64)(unsigned long)ldlmexp,
  66                                ldlmexp->exp_cookie);
  67                         GOTO(out_disc, rc = -ENOTCONN);
  68                 }
  69
  70                 old_hdl.addr = request->rq_repmsg->addr;
  71                 old_hdl.cookie = request->rq_repmsg->cookie;
  72                 if (memcmp(&imp->imp_handle, &old_hdl, sizeof(old_hdl))) {
  73                         CERROR("%s@%s changed handle from %Lx/%Lx to %Lx/%Lx; "
  74                                "copying, but this may foreshadow disaster\n",
  75                                cli->cl_target_uuid, conn->c_remote_uuid,
  76                                old_hdl.addr, old_hdl.cookie,
  77                                imp->imp_handle.addr, imp->imp_handle.cookie);
  78                         imp->imp_handle.addr = request->rq_repmsg->addr;
  79                         imp->imp_handle.cookie = request->rq_repmsg->cookie;
  80                         GOTO(out_disc, rc = EALREADY);
  81                 }
  82
  83                 CERROR("reconnected to %s@%s after partition\n",
  84                        cli->cl_target_uuid, conn->c_remote_uuid);
  85                 GOTO(out_disc, rc = EALREADY);
  86             case 0:
  87                 old_hdl = imp->imp_handle;
  88                 imp->imp_handle.addr = request->rq_repmsg->addr;
  89                 imp->imp_handle.cookie = request->rq_repmsg->cookie;
  90                 CERROR("now connected to %s@%s (%Lx/%Lx, was %Lx/%Lx)!\n",
  91                        cli->cl_target_uuid, conn->c_remote_uuid,
  92                        imp->imp_handle.addr, imp->imp_handle.cookie,
  93                        old_hdl.addr, old_hdl.cookie);
  94                 GOTO(out_disc, rc = 0);
  95             default:
  96                 CERROR("cannot connect to %s@%s: rc = %d\n",
  97                        cli->cl_target_uuid, conn->c_remote_uuid, rc);
  98                 GOTO(out_disc, rc = -ENOTCONN); /* XXX preserve rc? */
  99         }
 100
 101  out_disc:
 102         ptlrpc_req_finished(request);
 103         return rc;
 104 }
 105
 106 int ptlrpc_run_recovery_upcall(struct ptlrpc_connection *conn)
 107 {
 108         char *argv[3];
 109         char *envp[3];
 110         int rc;
 111
 112         ENTRY;
 113         conn->c_level = LUSTRE_CONN_RECOVD;
 114
 115         argv[0] = obd_recovery_upcall;
 116         argv[1] = conn->c_remote_uuid;
 117         argv[2] = NULL;
 118
 119         envp[0] = "HOME=/";
 120         envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
 121         envp[2] = NULL;
 122
 123         rc = call_usermodehelper(argv[0], argv, envp);
 124         if (rc < 0) {
 125                 CERROR("Error invoking recovery upcall %s for %s: %d\n",
 126                        argv[0], argv[1], rc);
 127                 CERROR("Check /proc/sys/lustre/recovery_upcall?\n");
 128         } else {
 129                 CERROR("Invoked upcall %s for connection %s\n",
 130                        argv[0], argv[1]);
 131         }
 132
 133         /*
 134          * We don't want to make this a "failed" recovery, because the system
 135          * administrator -- or, perhaps, tester -- may well be able to rescue
 136          * things by running the correct upcall.
 137          */
 138         RETURN(0);
 139 }
 140
 141 #define REPLAY_COMMITTED     0 /* Fully processed (commit + reply). */
 142 #define REPLAY_REPLAY        1 /* Forced-replay (e.g. open). */
 143 #define REPLAY_RESEND        2 /* Resend required. */
 144 #define REPLAY_RESEND_IGNORE 3 /* Resend, ignore the reply (already saw it). */
 145 #define REPLAY_RESTART       4 /* Have to restart the call, sorry! */
 146
 147 static int replay_state(struct ptlrpc_request *req, __u64 committed)
 148 {
 149         /* This request must always be replayed. */
 150         if (req->rq_flags & PTL_RPC_FL_REPLAY)
 151                 return REPLAY_REPLAY;
 152
 153         /* Uncommitted request */
 154         if (req->rq_transno > committed) {
 155                 if (req->rq_flags & PTL_RPC_FL_REPLIED) {
 156                         /* Saw reply, so resend and ignore new reply. */
 157                         return REPLAY_RESEND_IGNORE;
 158                 }
 159
 160                 /* Didn't see reply either, so resend. */
 161                 return REPLAY_RESEND;
 162         }
 163
 164         /* This request has been committed and we saw the reply.  Goodbye! */
 165         if (req->rq_flags & PTL_RPC_FL_REPLIED)
 166                 return REPLAY_COMMITTED;
 167
 168         /* Request committed, but we didn't see the reply: have to restart. */
 169         return REPLAY_RESTART;
 170 }
 171
 172 static char *replay_state2str(int state) {
 173         static char *state_strings[] = {
 174                 "COMMITTED", "REPLAY", "RESEND", "RESEND_IGNORE", "RESTART",
 175         };
 176         static char *unknown_state = "UNKNOWN";
 177
 178         if (state < 0 ||
 179             state > (sizeof(state_strings) / sizeof(state_strings[0]))) {
 180                 return unknown_state;
 181         }
 182
 183         return state_strings[state];
 184 }
 185
 186 int ptlrpc_replay(struct obd_import *imp, int unreplied_only)
 187 {
 188         int rc = 0, state;
 189         struct list_head *tmp, *pos;
 190         struct ptlrpc_request *req;
 191         struct ptlrpc_connection *conn = imp->imp_connection;
 192         __u64 committed = imp->imp_peer_committed_transno;
 193         ENTRY;
 194
 195         spin_lock(&imp->imp_lock);
 196
 197         CDEBUG(D_HA, "import %p from %s has committed "LPD64"\n",
 198                imp, imp->imp_obd->u.cli.cl_target_uuid, committed);
 199
 200         list_for_each(tmp, &imp->imp_request_list) {
 201                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
 202                 state = replay_state(req, committed);
 203                 DEBUG_REQ(D_HA, req, "SENDING: %s: ", replay_state2str(state));
 204         }
 205
 206         list_for_each(tmp, &conn->c_delayed_head) {
 207                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
 208                 state = replay_state(req, committed);
 209                 DEBUG_REQ(D_HA, req, "DELAYED: %s: ", replay_state2str(state));
 210         }
 211
 212         list_for_each_safe(tmp, pos, &imp->imp_request_list) {
 213                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
 214
 215                 if (unreplied_only) {
 216                         if (!(req->rq_flags & PTL_RPC_FL_REPLIED)) {
 217                                 DEBUG_REQ(D_HA, req, "UNREPLIED:");
 218                                 ptlrpc_restart_req(req);
 219                         }
 220                         continue;
 221                 }
 222
 223                 state = replay_state(req, committed);
 224
 225                 if (req->rq_transno == imp->imp_max_transno) {
 226                         req->rq_reqmsg->flags |= MSG_LAST_REPLAY;
 227                         DEBUG_REQ(D_HA, req, "last for replay");
 228                         LASSERT(state != REPLAY_COMMITTED);
 229                 }
 230
 231                 switch (state) {
 232                     case REPLAY_REPLAY:
 233                         DEBUG_REQ(D_HA, req, "REPLAY:");
 234                         rc = ptlrpc_replay_req(req);
 235 #if 0
 236 #error We should not hold a spinlock over such a lengthy operation.
 237 #error If necessary, drop spinlock, do operation, re-get spinlock, restart loop.
 238 #error If we need to avoid re-processint items, then delete them from the list
 239 #error as they are replayed and re-add at the tail of this list, so the next
 240 #error item to process will always be at the head of the list.
 241 #endif
 242                         if (rc) {
 243                                 CERROR("recovery replay error %d for req %Ld\n",
 244                                        rc, req->rq_xid);
 245                                 GOTO(out, rc);
 246                         }
 247                         break;
 248
 249                     case REPLAY_COMMITTED:
 250                         DEBUG_REQ(D_ERROR, req, "COMMITTED:");
 251                         /* XXX commit now? */
 252                         break;
 253
 254                     case REPLAY_RESEND_IGNORE:
 255                         DEBUG_REQ(D_HA, req, "RESEND_IGNORE:");
 256                         rc = ptlrpc_replay_req(req);
 257                         if (rc) {
 258                                 CERROR("request resend error %d for req %Ld\n",
 259                                        rc, req->rq_xid);
 260                                 GOTO(out, rc);
 261                         }
 262                         break;
 263
 264                     case REPLAY_RESTART:
 265                         DEBUG_REQ(D_HA, req, "RESTART:");
 266                         ptlrpc_restart_req(req);
 267                         break;
 268
 269                     case REPLAY_RESEND:
 270                         DEBUG_REQ(D_HA, req, "RESEND:");
 271                         ptlrpc_resend_req(req);
 272                         break;
 273
 274                     default:
 275                         LBUG();
 276                 }
 277
 278         }
 279
 280         conn->c_level = LUSTRE_CONN_FULL;
 281         recovd_conn_fixed(conn);
 282
 283         CERROR("recovery complete on conn %p(%s), waking delayed reqs\n",
 284                conn, conn->c_remote_uuid);
 285         /* Finally, continue processing requests that blocked for recovery. */
 286         list_for_each_safe(tmp, pos, &conn->c_delayed_head) {
 287                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
 288                 DEBUG_REQ(D_HA, req, "WAKING: ");
 289                 ptlrpc_continue_req(req);
 290         }
 291
 292         EXIT;
 293  out:
 294         spin_unlock(&conn->c_lock);
 295         return rc;
 296 }