lustre/ptlrpc/recover.c

   1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
   2  * vim:expandtab:shiftwidth=8:tabstop=8:
   3  *
   4  * Portal-RPC reconnection and replay operations, for use in recovery.
   5  *
   6  *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
   7  *   Author: Mike Shaver <shaver@clusterfs.com>
   8  *
   9  *   This file is part of Lustre, http://www.lustre.org.
  10  *
  11  *   Lustre is free software; you can redistribute it and/or
  12  *   modify it under the terms of version 2 of the GNU General Public
  13  *   License as published by the Free Software Foundation.
  14  *
  15  *   Lustre is distributed in the hope that it will be useful,
  16  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  *   GNU General Public License for more details.
  19  *
  20  *   You should have received a copy of the GNU General Public License
  21  *   along with Lustre; if not, write to the Free Software
  22  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  23  */
  24
  25 #define DEBUG_SUBSYSTEM S_RPC
  26 #ifdef __KERNEL__
  27 # include <linux/config.h>
  28 # include <linux/module.h>
  29 # include <linux/kmod.h>
  30 #else
  31 # include <liblustre.h>
  32 #endif
  33
  34 #include <linux/obd_support.h>
  35 #include <linux/lustre_ha.h>
  36 #include <linux/lustre_net.h>
  37 #include <linux/lustre_import.h>
  38 #include <linux/lustre_export.h>
  39 #include <linux/obd.h>
  40 #include <linux/obd_ost.h>
  41 #include <linux/obd_class.h>
  42 #include <linux/obd_lov.h> /* for IOC_LOV_SET_OSC_ACTIVE */
  43
  44 #include "ptlrpc_internal.h"
  45
  46 static int ptlrpc_recover_import_no_retry(struct obd_import *, char *);
  47
  48 void ptlrpc_run_recovery_over_upcall(struct obd_device *obd)
  49 {
  50         char *argv[4];
  51         char *envp[3];
  52         int rc;
  53         ENTRY;
  54
  55         argv[0] = obd_lustre_upcall;
  56         argv[1] = "RECOVERY_OVER";
  57         argv[2] = obd->obd_uuid.uuid;
  58         argv[3] = NULL;
  59
  60         envp[0] = "HOME=/";
  61         envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
  62         envp[2] = NULL;
  63
  64         rc = USERMODEHELPER(argv[0], argv, envp);
  65         if (rc < 0) {
  66                 CERROR("Error invoking recovery upcall %s %s %s: %d; check "
  67                        "/proc/sys/lustre/upcall\n",
  68                        argv[0], argv[1], argv[2], rc);
  69
  70         } else {
  71                 CERROR("Invoked upcall %s %s %s\n",
  72                        argv[0], argv[1], argv[2]);
  73         }
  74 }
  75
  76 void ptlrpc_run_failed_import_upcall(struct obd_import* imp)
  77 {
  78 #ifdef __KERNEL__
  79         unsigned long flags;
  80         char *argv[7];
  81         char *envp[3];
  82         int rc;
  83         ENTRY;
  84
  85         spin_lock_irqsave(&imp->imp_lock, flags);
  86         if (imp->imp_state == LUSTRE_IMP_CLOSED) {
  87                 spin_unlock_irqrestore(&imp->imp_lock, flags);
  88                 EXIT;
  89                 return;
  90         }
  91         spin_unlock_irqrestore(&imp->imp_lock, flags);
  92
  93         argv[0] = obd_lustre_upcall;
  94         argv[1] = "FAILED_IMPORT";
  95         argv[2] = imp->imp_target_uuid.uuid;
  96         argv[3] = imp->imp_obd->obd_name;
  97         argv[4] = imp->imp_connection->c_remote_uuid.uuid;
  98         argv[5] = imp->imp_obd->obd_uuid.uuid;
  99         argv[6] = NULL;
 100
 101         envp[0] = "HOME=/";
 102         envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
 103         envp[2] = NULL;
 104
 105         rc = USERMODEHELPER(argv[0], argv, envp);
 106         if (rc < 0) {
 107                 CERROR("Error invoking recovery upcall %s %s %s %s %s: %d; "
 108                        "check /proc/sys/lustre/lustre_upcall\n",
 109                        argv[0], argv[1], argv[2], argv[3], argv[4],rc);
 110
 111         } else {
 112                 CERROR("Invoked upcall %s %s %s %s %s\n",
 113                        argv[0], argv[1], argv[2], argv[3], argv[4]);
 114         }
 115 #else
 116         ptlrpc_recover_import(imp, NULL);
 117 #endif
 118 }
 119
 120 int ptlrpc_replay_next(struct obd_import *imp)
 121 {
 122         int rc = 0;
 123         struct list_head *tmp, *pos;
 124         struct ptlrpc_request *req;
 125         unsigned long flags;
 126         __u64 last_transno;
 127         int sent_req = 0;
 128         ENTRY;
 129
 130         /* It might have committed some after we last spoke, so make sure we
 131          * get rid of them now.
 132          */
 133         spin_lock_irqsave(&imp->imp_lock, flags);
 134         ptlrpc_free_committed(imp);
 135         last_transno = imp->imp_last_replay_transno;
 136         spin_unlock_irqrestore(&imp->imp_lock, flags);
 137
 138         CDEBUG(D_HA, "import %p from %s has committed "LPD64"\n",
 139                imp, imp->imp_target_uuid.uuid, imp->imp_peer_committed_transno);
 140         /* Do I need to hold a lock across this iteration?  We shouldn't be
 141          * racing with any additions to the list, because we're in recovery
 142          * and are therefore not processing additional requests to add.  Calls
 143          * to ptlrpc_free_committed might commit requests, but nothing "newer"
 144          * than the one we're replaying (it can't be committed until it's
 145          * replayed, and we're doing that here).  l_f_e_safe protects against
 146          * problems with the current request being committed, in the unlikely
 147          * event of that race.  So, in conclusion, I think that it's safe to
 148          * perform this list-walk without the imp_lock held.
 149          *
 150          * But, the {mdc,osc}_replay_open callbacks both iterate
 151          * request lists, and have comments saying they assume the
 152          * imp_lock is being held by ptlrpc_replay, but it's not. it's
 153          * just a little race...
 154          */
 155         list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
 156                 req = list_entry(tmp, struct ptlrpc_request, rq_replay_list);
 157                 if (req->rq_transno > last_transno) {
 158                         /* remove from list so ptlrpcd can send the
 159                            req, it should be reinserted after it is
 160                            sent and replied.  Perhaps better solution
 161                            would be to add req->rq_replay_list so the
 162                            req can be saved for replay and still go
 163                            through the normal send thread. */
 164                         rc = ptlrpc_replay_req(req);
 165                         if (rc) {
 166                                 CERROR("recovery replay error %d for req "LPD64"\n",
 167                                        rc, req->rq_xid);
 168                                 RETURN(rc);
 169                         }
 170                         sent_req = 1;
 171                         break;
 172                 }
 173
 174         }
 175
 176         RETURN(sent_req);
 177 }
 178
 179 int ptlrpc_resend(struct obd_import *imp)
 180 {
 181         struct list_head *tmp, *pos;
 182         struct ptlrpc_request *req;
 183         unsigned long flags;
 184
 185         ENTRY;
 186
 187         /* As long as we're in recovery, nothing should be added to the sending
 188          * list, so we don't need to hold the lock during this iteration and
 189          * resend process.
 190          */
 191         /* Well... what if lctl recover is called twice at the same time?
 192          */
 193         spin_lock_irqsave(&imp->imp_lock, flags);
 194         if (imp->imp_state != LUSTRE_IMP_RECOVER) {
 195                 spin_unlock_irqrestore(&imp->imp_lock, flags);
 196                 RETURN(-1);
 197         }
 198         spin_unlock_irqrestore(&imp->imp_lock, flags);
 199
 200         list_for_each_safe(tmp, pos, &imp->imp_sending_list) {
 201                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
 202                 ptlrpc_resend_req(req);
 203         }
 204
 205         RETURN(0);
 206 }
 207
 208 void ptlrpc_wake_delayed(struct obd_import *imp)
 209 {
 210         unsigned long flags;
 211         struct list_head *tmp, *pos;
 212         struct ptlrpc_request *req;
 213
 214         spin_lock_irqsave(&imp->imp_lock, flags);
 215         list_for_each_safe(tmp, pos, &imp->imp_delayed_list) {
 216                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
 217
 218                 if (req->rq_set) {
 219                         DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set);
 220                         wake_up(&req->rq_set->set_waitq);
 221                 } else {
 222                         DEBUG_REQ(D_HA, req, "waking:");
 223                         wake_up(&req->rq_reply_waitq);
 224                 }
 225         }
 226         spin_unlock_irqrestore(&imp->imp_lock, flags);
 227 }
 228
 229 inline void ptlrpc_invalidate_import_state(struct obd_import *imp)
 230 {
 231         struct obd_device *obd = imp->imp_obd;
 232         struct ldlm_namespace *ns = obd->obd_namespace;
 233
 234         ptlrpc_abort_inflight(imp);
 235
 236 #if 0
 237         obd_invalidate_import(obd, imp);
 238 #endif
 239
 240         ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY);
 241 }
 242
 243 void ptlrpc_handle_failed_import(struct obd_import *imp)
 244 {
 245         ENTRY;
 246
 247         if (!imp->imp_replayable) {
 248                 CDEBUG(D_HA,
 249                        "import %s@%s for %s not replayable, deactivating\n",
 250                        imp->imp_target_uuid.uuid,
 251                        imp->imp_connection->c_remote_uuid.uuid,
 252                        imp->imp_obd->obd_name);
 253                 ptlrpc_set_import_active(imp, 0);
 254         }
 255
 256         ptlrpc_run_failed_import_upcall(imp);
 257         EXIT;
 258 }
 259
 260 void ptlrpc_request_handle_notconn(struct ptlrpc_request *failed_req)
 261 {
 262         int rc;
 263         struct obd_import *imp= failed_req->rq_import;
 264         unsigned long flags;
 265         ENTRY;
 266
 267         CDEBUG(D_HA, "import %s of %s@%s abruptly disconnected: reconnecting\n",
 268                imp->imp_obd->obd_name,
 269                imp->imp_target_uuid.uuid,
 270                imp->imp_connection->c_remote_uuid.uuid);
 271
 272         ptlrpc_set_import_discon(imp);
 273
 274         rc = ptlrpc_connect_import(imp, NULL);
 275
 276         /* Wait for recovery to complete and resend. If evicted, then
 277            this request will be errored out later.*/
 278         spin_lock_irqsave(&failed_req->rq_lock, flags);
 279         if (!failed_req->rq_no_resend)
 280                 failed_req->rq_resend = 1;
 281         spin_unlock_irqrestore(&failed_req->rq_lock, flags);
 282
 283         EXIT;
 284 }
 285
 286 int ptlrpc_set_import_active(struct obd_import *imp, int active)
 287 {
 288         struct obd_device *obd = imp->imp_obd;
 289         unsigned long flags;
 290
 291         LASSERT(obd);
 292
 293         /* When deactivating, mark import invalid, and abort in-flight
 294          * requests. */
 295         if (!active) {
 296                 spin_lock_irqsave(&imp->imp_lock, flags);
 297                 /* This is a bit of a hack, but invalidating replayable
 298                  * imports makes a temporary reconnect failure into a much more
 299                  * ugly -- and hard to remedy -- situation. */
 300                 if (!imp->imp_replayable) {
 301                         CDEBUG(D_HA, "setting import %s INVALID\n",
 302                                imp->imp_target_uuid.uuid);
 303                         imp->imp_invalid = 1;
 304                 }
 305                 imp->imp_generation++;
 306                 spin_unlock_irqrestore(&imp->imp_lock, flags);
 307                 ptlrpc_invalidate_import_state(imp);
 308         }
 309
 310         /* When activating, mark import valid */
 311         if (active) {
 312                 CDEBUG(D_HA, "setting import %s VALID\n",
 313                        imp->imp_target_uuid.uuid);
 314                 spin_lock_irqsave(&imp->imp_lock, flags);
 315                 imp->imp_invalid = 0;
 316                 spin_unlock_irqrestore(&imp->imp_lock, flags);
 317         }
 318
 319         if (obd->obd_observer)
 320                 RETURN(obd_notify(obd->obd_observer, obd, active));
 321
 322         RETURN(0);
 323 }
 324
 325 int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid)
 326 {
 327         int rc;
 328         ENTRY;
 329
 330         /* force import to be disconnected. */
 331         ptlrpc_set_import_discon(imp);
 332
 333         rc = ptlrpc_recover_import_no_retry(imp, new_uuid);
 334
 335         RETURN(rc);
 336 }
 337
 338 int ptlrpc_import_in_recovery(struct obd_import *imp)
 339 {
 340         unsigned long flags;
 341         int in_recovery = 1;
 342         spin_lock_irqsave(&imp->imp_lock, flags);
 343         if (imp->imp_state == LUSTRE_IMP_FULL ||
 344             imp->imp_state == LUSTRE_IMP_CLOSED ||
 345             imp->imp_state == LUSTRE_IMP_DISCON)
 346                 in_recovery = 0;
 347         spin_unlock_irqrestore(&imp->imp_lock, flags);
 348         return in_recovery;
 349 }
 350
 351 static int ptlrpc_recover_import_no_retry(struct obd_import *imp,
 352                                           char *new_uuid)
 353 {
 354         int rc;
 355         unsigned long flags;
 356         int in_recovery = 0;
 357         struct l_wait_info lwi;
 358         ENTRY;
 359
 360         spin_lock_irqsave(&imp->imp_lock, flags);
 361         if (imp->imp_state != LUSTRE_IMP_DISCON) {
 362                 in_recovery = 1;
 363         }
 364         spin_unlock_irqrestore(&imp->imp_lock, flags);
 365
 366         if (in_recovery == 1)
 367                 RETURN(-EALREADY);
 368
 369
 370         rc = ptlrpc_connect_import(imp, new_uuid);
 371         if (rc)
 372                 RETURN(rc);
 373
 374         CDEBUG(D_ERROR, "%s: recovery started, waiting\n",
 375                imp->imp_client->cli_name);
 376
 377         lwi = LWI_TIMEOUT(MAX(obd_timeout * HZ, 1), NULL, NULL);
 378         rc = l_wait_event(imp->imp_recovery_waitq,
 379                           !ptlrpc_import_in_recovery(imp), &lwi);
 380         CDEBUG(D_ERROR, "%s: recovery finished\n",
 381                imp->imp_client->cli_name);
 382
 383         RETURN(rc);
 384
 385 }
 386
 387 void ptlrpc_fail_export(struct obd_export *exp)
 388 {
 389         int rc, already_failed;
 390         unsigned long flags;
 391
 392         spin_lock_irqsave(&exp->exp_lock, flags);
 393         already_failed = exp->exp_failed;
 394         exp->exp_failed = 1;
 395         spin_unlock_irqrestore(&exp->exp_lock, flags);
 396
 397         if (already_failed) {
 398                 CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n",
 399                        exp, exp->exp_client_uuid.uuid);
 400                 return;
 401         }
 402
 403         CDEBUG(D_HA, "disconnecting export %p/%s\n",
 404                exp, exp->exp_client_uuid.uuid);
 405
 406         /* Most callers into obd_disconnect are removing their own reference
 407          * (request, for example) in addition to the one from the hash table.
 408          * We don't have such a reference here, so make one. */
 409         class_export_get(exp);
 410         rc = obd_disconnect(exp, 0);
 411         if (rc)
 412                 CERROR("disconnecting export %p failed: %d\n", exp, rc);
 413 }