Whamcloud - gitweb
merge b_devel into HEAD, which will become 0.7.3
[fs/lustre-release.git] / lustre / ptlrpc / recover.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Portal-RPC reconnection and replay operations, for use in recovery.
5  *
6  *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
7  *   Author: Mike Shaver <shaver@clusterfs.com>
8  *
9  *   This file is part of Lustre, http://www.lustre.org.
10  *
11  *   Lustre is free software; you can redistribute it and/or
12  *   modify it under the terms of version 2 of the GNU General Public
13  *   License as published by the Free Software Foundation.
14  *
15  *   Lustre is distributed in the hope that it will be useful,
16  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
17  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  *   GNU General Public License for more details.
19  *
20  *   You should have received a copy of the GNU General Public License
21  *   along with Lustre; if not, write to the Free Software
22  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23  */
24
25 #define DEBUG_SUBSYSTEM S_RPC
26 #ifdef __KERNEL__
27 # include <linux/config.h>
28 # include <linux/module.h>
29 # include <linux/kmod.h>
30 #else
31 # include <liblustre.h>
32 #endif
33
34 #include <linux/obd_support.h>
35 #include <linux/lustre_ha.h>
36 #include <linux/lustre_net.h>
37 #include <linux/lustre_import.h>
38 #include <linux/lustre_export.h>
39 #include <linux/obd.h>
40 #include <linux/obd_class.h>
41 #include <linux/obd_lov.h> /* for IOC_LOV_SET_OSC_ACTIVE */
42
43 #include "ptlrpc_internal.h"
44
/*
 * Positive result codes returned by ptlrpc_reconnect_import() when the
 * connect RPC itself succeeded.  Failures are reported as negative errno
 * values, so callers distinguish the two by sign.
 */
enum reconnect_result {
        /* connect reply carried MSG_CONNECT_RECOVERING */
        RECON_RESULT_RECOVERING = 1,
        /* connect reply carried MSG_CONNECT_RECONNECT */
        RECON_RESULT_RECONNECTED,
        /* connect reply carried neither of the two flags above */
        RECON_RESULT_EVICTED,
};
50
51 int ptlrpc_reconnect_import(struct obd_import *imp)
52 {
53         struct obd_device *obd = imp->imp_obd;
54         int rc, size[] = {sizeof(imp->imp_target_uuid),
55                                  sizeof(obd->obd_uuid),
56                                  sizeof(imp->imp_dlm_handle)};
57         char *tmp[] = {imp->imp_target_uuid.uuid,
58                        obd->obd_uuid.uuid,
59                        (char *)&imp->imp_dlm_handle};
60         struct ptlrpc_connection *conn = imp->imp_connection;
61         struct ptlrpc_request *req;
62         struct lustre_handle old_hdl;
63         __u64 committed_before_reconnect = imp->imp_peer_committed_transno;
64
65         CERROR("reconnect handle "LPX64"\n",
66                imp->imp_dlm_handle.cookie);
67
68         req = ptlrpc_prep_req(imp, imp->imp_connect_op, 3, size, tmp);
69         if (!req)
70                 RETURN(-ENOMEM);
71         req->rq_level = LUSTRE_CONN_NEW;
72         req->rq_replen = lustre_msg_size(0, NULL);
73         rc = ptlrpc_queue_wait(req);
74         if (rc) {
75                 /* what if rc > 0 ??*/
76                 CERROR("cannot connect to %s@%s: rc = %d\n",
77                        imp->imp_target_uuid.uuid, conn->c_remote_uuid.uuid, rc);
78                 GOTO(out_disc, rc);
79         }
80
81         if (lustre_msg_get_op_flags(req->rq_repmsg) & MSG_CONNECT_RECONNECT) {
82                 memset(&old_hdl, 0, sizeof(old_hdl));
83                 if (!memcmp(&old_hdl, &req->rq_repmsg->handle,
84                             sizeof (old_hdl))) {
85                         CERROR("%s@%s didn't like our handle "LPX64
86                                ", failed\n", imp->imp_target_uuid.uuid,
87                                conn->c_remote_uuid.uuid,
88                                imp->imp_dlm_handle.cookie);
89                         GOTO(out_disc, rc = -ENOTCONN);
90                 }
91
92                 if (memcmp(&imp->imp_remote_handle, &req->rq_repmsg->handle,
93                            sizeof(imp->imp_remote_handle))) {
94                         CERROR("%s@%s changed handle from "LPX64" to "LPX64
95                                "; copying, but this may foreshadow disaster\n",
96                                imp->imp_target_uuid.uuid,
97                                conn->c_remote_uuid.uuid,
98                                imp->imp_remote_handle.cookie,
99                                req->rq_repmsg->handle.cookie);
100                         imp->imp_remote_handle = req->rq_repmsg->handle;
101                         GOTO(out_disc, rc = RECON_RESULT_RECONNECTED);
102                 }
103
104                 CERROR("reconnected to %s@%s after partition\n",
105                        imp->imp_target_uuid.uuid, conn->c_remote_uuid.uuid);
106                 GOTO(out_disc, rc = RECON_RESULT_RECONNECTED);
107         } else if (lustre_msg_get_op_flags(req->rq_repmsg) &
108                    MSG_CONNECT_RECOVERING) {
109                 rc = RECON_RESULT_RECOVERING;
110         } else {
111                 rc = RECON_RESULT_EVICTED;
112         }
113
114         old_hdl = imp->imp_remote_handle;
115         imp->imp_remote_handle = req->rq_repmsg->handle;
116         CERROR("reconnected to %s@%s ("LPX64", was "LPX64")!\n",
117                imp->imp_target_uuid.uuid, conn->c_remote_uuid.uuid,
118                imp->imp_remote_handle.cookie, old_hdl.cookie);
119         if (req->rq_repmsg->last_committed < committed_before_reconnect) {
120                 CERROR("%s went back in time (transno "LPD64
121                        " was committed, server claims "LPD64
122                        ")! is shared storage not coherent?\n",
123                        imp->imp_target_uuid.uuid,
124                        imp->imp_peer_committed_transno,
125                        req->rq_repmsg->last_committed);
126         }
127
128         GOTO(out_disc, rc);
129
130  out_disc:
131         ptlrpc_req_finished(req);
132         return rc;
133 }
134
135 void ptlrpc_run_recovery_over_upcall(struct obd_device *obd)
136 {
137         char *argv[4];
138         char *envp[3];
139         int rc;
140
141         ENTRY;
142         argv[0] = obd_lustre_upcall;
143         argv[1] = "RECOVERY_OVER";
144         argv[2] = obd->obd_uuid.uuid;
145         argv[3] = NULL;
146
147         envp[0] = "HOME=/";
148         envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
149         envp[2] = NULL;
150
151         rc = USERMODEHELPER(argv[0], argv, envp);
152         if (rc < 0) {
153                 CERROR("Error invoking recovery upcall %s %s %s: %d; check "
154                        "/proc/sys/lustre/upcall\n",
155                        argv[0], argv[1], argv[2], rc);
156
157         } else {
158                 CERROR("Invoked upcall %s %s %s",
159                        argv[0], argv[1], argv[2]);
160         }
161 }
162
163 void ptlrpc_run_failed_import_upcall(struct obd_import* imp)
164 {
165         char *argv[7];
166         char *envp[3];
167         int rc;
168
169         ENTRY;
170         argv[0] = obd_lustre_upcall;
171         argv[1] = "FAILED_IMPORT";
172         argv[2] = imp->imp_target_uuid.uuid;
173         argv[3] = imp->imp_obd->obd_name;
174         argv[4] = imp->imp_connection->c_remote_uuid.uuid;
175         argv[5] = imp->imp_obd->obd_uuid.uuid;
176         argv[6] = NULL;
177
178         envp[0] = "HOME=/";
179         envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
180         envp[2] = NULL;
181
182         rc = USERMODEHELPER(argv[0], argv, envp);
183         if (rc < 0) {
184                 CERROR("Error invoking recovery upcall %s %s %s %s %s: %d; "
185                        "check /proc/sys/lustre/lustre_upcall\n",
186                        argv[0], argv[1], argv[2], argv[3], argv[4],rc);
187
188         } else {
189                 CERROR("Invoked upcall %s %s %s %s %s\n",
190                        argv[0], argv[1], argv[2], argv[3], argv[4]);
191         }
192 }
193
194 int ptlrpc_replay(struct obd_import *imp)
195 {
196         int rc = 0;
197         struct list_head *tmp, *pos;
198         struct ptlrpc_request *req;
199         unsigned long flags;
200         ENTRY;
201
202         /* It might have committed some after we last spoke, so make sure we
203          * get rid of them now.
204          */
205         spin_lock_irqsave(&imp->imp_lock, flags);
206         ptlrpc_free_committed(imp);
207         spin_unlock_irqrestore(&imp->imp_lock, flags);
208
209         CDEBUG(D_HA, "import %p from %s has committed "LPD64"\n",
210                imp, imp->imp_target_uuid.uuid, imp->imp_peer_committed_transno);
211
212         list_for_each(tmp, &imp->imp_replay_list) {
213                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
214                 DEBUG_REQ(D_HA, req, "RETAINED: ");
215         }
216
217         /* Do I need to hold a lock across this iteration?  We shouldn't be
218          * racing with any additions to the list, because we're in recovery
219          * and are therefore not processing additional requests to add.  Calls
220          * to ptlrpc_free_committed might commit requests, but nothing "newer"
221          * than the one we're replaying (it can't be committed until it's
222          * replayed, and we're doing that here).  l_f_e_safe protects against
223          * problems with the current request being committed, in the unlikely
224          * event of that race.  So, in conclusion, I think that it's safe to
225          * perform this list-walk without the imp_lock held.
226          *
227          * But, the {mdc,osc}_replay_open callbacks both iterate
228          * request lists, and have comments saying they assume the
229          * imp_lock is being held by ptlrpc_replay, but it's not. it's
230          * just a little race...
231          */
232         list_for_each_safe(tmp, pos, &imp->imp_replay_list) {
233                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
234
235                 DEBUG_REQ(D_HA, req, "REPLAY:");
236
237                 rc = ptlrpc_replay_req(req);
238
239                 if (rc) {
240                         CERROR("recovery replay error %d for req "LPD64"\n",
241                                rc, req->rq_xid);
242                         RETURN(rc);
243                 }
244         }
245
246         RETURN(0);
247 }
248
249 int ptlrpc_resend(struct obd_import *imp)
250 {
251         struct list_head *tmp, *pos;
252         struct ptlrpc_request *req;
253         unsigned long flags;
254
255         ENTRY;
256
257         /* As long as we're in recovery, nothing should be added to the sending
258          * list, so we don't need to hold the lock during this iteration and
259          * resend process.
260          */
261         /* Well... what if lctl recover is called twice at the same time?
262          */
263         spin_lock_irqsave(&imp->imp_lock, flags);
264         LASSERT(imp->imp_level == LUSTRE_CONN_RECOVER);
265         spin_unlock_irqrestore(&imp->imp_lock, flags);
266
267         list_for_each_safe(tmp, pos, &imp->imp_sending_list) {
268                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
269                 ptlrpc_resend_req(req);
270         }
271
272         RETURN(0);
273 }
274
275 void ptlrpc_wake_delayed(struct obd_import *imp)
276 {
277         unsigned long flags;
278         struct list_head *tmp, *pos;
279         struct ptlrpc_request *req;
280
281         spin_lock_irqsave(&imp->imp_lock, flags);
282         list_for_each_safe(tmp, pos, &imp->imp_delayed_list) {
283                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
284
285                 ptlrpc_put_connection(req->rq_connection);
286                 req->rq_connection =
287                        ptlrpc_connection_addref(req->rq_import->imp_connection);
288
289                 if (req->rq_set) {
290                         DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set);
291                         wake_up(&req->rq_set->set_waitq);
292                 } else {
293                         DEBUG_REQ(D_HA, req, "waking:");
294                         wake_up(&req->rq_wait_for_rep);
295                 }
296         }
297         spin_unlock_irqrestore(&imp->imp_lock, flags);
298 }
299
300 inline void ptlrpc_invalidate_import_state(struct obd_import *imp)
301 {
302         struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
303         if (ptlrpc_ldlm_namespace_cleanup == NULL)
304                 CERROR("ptlrpc/ldlm hook is NULL!  Please tell phil\n");
305         else
306                 ptlrpc_ldlm_namespace_cleanup(ns, 1 /* no network ops */);
307         ptlrpc_abort_inflight(imp);
308 }
309
310 void ptlrpc_handle_failed_import(struct obd_import *imp)
311 {
312         ENTRY;
313         if (!imp->imp_replayable) {
314                 CDEBUG(D_HA,
315                        "import %s@%s for %s not replayable, deactivating\n",
316                        imp->imp_target_uuid.uuid,
317                        imp->imp_connection->c_remote_uuid.uuid,
318                        imp->imp_obd->obd_name);
319                 ptlrpc_set_import_active(imp, 0);
320         }
321
322         ptlrpc_run_failed_import_upcall(imp);
323         EXIT;
324 }
325
326 void ptlrpc_request_handle_eviction(struct ptlrpc_request *failed_req)
327 {
328         int rc;
329         struct obd_import *imp= failed_req->rq_import;
330         unsigned long flags;
331         ENTRY;
332
333         CDEBUG(D_HA, "import %s of %s@%s evicted: reconnecting\n",
334                imp->imp_obd->obd_name,
335                imp->imp_target_uuid.uuid,
336                imp->imp_connection->c_remote_uuid.uuid);
337         rc = ptlrpc_recover_import(imp, NULL);
338         if (rc) {
339                 ptlrpc_resend_req(failed_req);
340                 if (rc != -EALREADY)
341                         ptlrpc_handle_failed_import(imp);
342         } else {
343                 LASSERT(failed_req->rq_import_generation < imp->imp_generation);
344                 spin_lock_irqsave (&failed_req->rq_lock, flags);
345                 failed_req->rq_err = 1;
346                 spin_unlock_irqrestore (&failed_req->rq_lock, flags);
347         }
348         EXIT;
349 }
350
351 int ptlrpc_set_import_active(struct obd_import *imp, int active)
352 {
353         struct obd_device *notify_obd;
354         unsigned long flags;
355         int rc;
356
357         LASSERT(imp->imp_obd);
358
359         notify_obd = imp->imp_obd->u.cli.cl_containing_lov;
360
361         /* When deactivating, mark import invalid, and abort in-flight
362          * requests. */
363         if (!active) {
364                 spin_lock_irqsave(&imp->imp_lock, flags);
365                 /* This is a bit of a hack, but invalidating replayable
366                  * imports makes a temporary reconnect failure into a much more
367                  * ugly -- and hard to remedy -- situation. */
368                 if (!imp->imp_replayable) {
369                         CDEBUG(D_HA, "setting import %s INVALID\n",
370                                imp->imp_target_uuid.uuid);
371                         imp->imp_invalid = 1;
372                 }
373                 imp->imp_generation++;
374                 spin_unlock_irqrestore(&imp->imp_lock, flags);
375                 ptlrpc_invalidate_import_state(imp);
376                 //ptlrpc_abort_inflight(imp);
377         }
378
379         if (notify_obd == NULL)
380                 GOTO(out, rc = 0);
381
382         /* How gross is _this_? */
383         if (!list_empty(&notify_obd->obd_exports)) {
384                 struct lustre_handle fakeconn;
385                 struct obd_ioctl_data ioc_data = { 0 };
386                 struct obd_export *exp =
387                         list_entry(notify_obd->obd_exports.next,
388                                    struct obd_export, exp_obd_chain);
389
390                 fakeconn.cookie = exp->exp_handle.h_cookie;
391                 ioc_data.ioc_inlbuf1 = (char *)&imp->imp_target_uuid;
392                 ioc_data.ioc_offset = active;
393                 rc = obd_iocontrol(IOC_LOV_SET_OSC_ACTIVE, &fakeconn,
394                                    sizeof ioc_data, &ioc_data, NULL);
395                 if (rc)
396                         CERROR("error %sabling %s on LOV %p/%s: %d\n",
397                                active ? "en" : "dis",
398                                imp->imp_target_uuid.uuid, notify_obd,
399                                notify_obd->obd_uuid.uuid, rc);
400         } else {
401                 CDEBUG(D_HA, "No exports for obd %p/%s, can't notify about "
402                        "%p\n", notify_obd, notify_obd->obd_uuid.uuid,
403                        imp->imp_obd->obd_uuid.uuid);
404                 rc = -ENOENT;
405         }
406
407 out:
408         /* When activating, mark import valid */
409         if (active && !rc) {
410                 CDEBUG(D_HA, "setting import %s VALID\n",
411                        imp->imp_target_uuid.uuid);
412                 spin_lock_irqsave(&imp->imp_lock, flags);
413                 imp->imp_invalid = 0;
414                 spin_unlock_irqrestore(&imp->imp_lock, flags);
415         }
416
417         RETURN(rc);
418 }
419
420 void ptlrpc_fail_import(struct obd_import *imp, int generation)
421 {
422         unsigned long flags;
423         int in_recovery = 0;
424         ENTRY;
425
426         LASSERT (!imp->imp_dlm_fake);
427
428         spin_lock_irqsave(&imp->imp_lock, flags);
429         if (imp->imp_level != LUSTRE_CONN_FULL)
430                 in_recovery = 1;
431         else
432                 imp->imp_level = LUSTRE_CONN_NOTCONN;
433         spin_unlock_irqrestore(&imp->imp_lock, flags);
434
435         if (in_recovery) {
436                 EXIT;
437                 return;
438         }
439
440         ptlrpc_handle_failed_import(imp);
441         EXIT;
442 }
443
444 static int signal_completed_replay(struct obd_import *imp)
445 {
446         struct ptlrpc_request *req;
447         int rc;
448         ENTRY;
449
450         req = ptlrpc_prep_req(imp, OBD_PING, 0, NULL, NULL);
451         if (!req)
452                 RETURN(-ENOMEM);
453
454         req->rq_replen = lustre_msg_size(0, NULL);
455         req->rq_level = LUSTRE_CONN_RECOVER;
456         req->rq_reqmsg->flags |= MSG_LAST_REPLAY;
457
458         rc = ptlrpc_queue_wait(req);
459
460         ptlrpc_req_finished(req);
461         RETURN(rc);
462 }
463
464 int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid)
465 {
466         int rc;
467         unsigned long flags;
468         int in_recover = 0;
469         int recon_result;
470         ENTRY;
471
472         spin_lock_irqsave(&imp->imp_lock, flags);
473         if (imp->imp_level == LUSTRE_CONN_FULL ||
474             imp->imp_level == LUSTRE_CONN_NOTCONN)
475                     imp->imp_level = LUSTRE_CONN_RECOVER;
476         else
477                 in_recover = 1;
478         spin_unlock_irqrestore(&imp->imp_lock, flags);
479
480         if (in_recover == 1)
481                 RETURN(-EALREADY);
482
483         if (new_uuid) {
484                 struct ptlrpc_connection *conn;
485                 struct obd_uuid uuid;
486                 struct ptlrpc_peer peer;
487                 struct obd_export *dlmexp;
488
489                 obd_str2uuid(&uuid, new_uuid);
490                 if (ptlrpc_uuid_to_peer(&uuid, &peer)) {
491                         CERROR("no connection found for UUID %s\n", new_uuid);
492                         RETURN(-EINVAL);
493                 }
494
495                 conn = ptlrpc_get_connection(&peer, &uuid);
496                 if (!conn)
497                         RETURN(-ENOMEM);
498
499                 CDEBUG(D_HA, "switching import %s/%s from %s to %s\n",
500                        imp->imp_target_uuid.uuid, imp->imp_obd->obd_name,
501                        imp->imp_connection->c_remote_uuid.uuid,
502                        conn->c_remote_uuid.uuid);
503
504                 /* Switch the import's connection and the DLM export's
505                  * connection (which are almost certainly the same, but we
506                  * keep distinct refs just to make things clearer. I think. */
507                 if (imp->imp_connection)
508                         ptlrpc_put_connection(imp->imp_connection);
509                 /* We hand off the ref from ptlrpc_get_connection. */
510                 imp->imp_connection = conn;
511
512                 dlmexp = class_conn2export(&imp->imp_dlm_handle);
513                 if (dlmexp->exp_connection)
514                         ptlrpc_put_connection(dlmexp->exp_connection);
515                 dlmexp->exp_connection = ptlrpc_connection_addref(conn);
516                 class_export_put(dlmexp);
517
518         }
519
520         recon_result = ptlrpc_reconnect_import(imp);
521
522         if (recon_result < 0) {
523                 CERROR("failed to reconnect to %s@%s: %d\n",
524                        imp->imp_target_uuid.uuid,
525                        imp->imp_connection->c_remote_uuid.uuid, recon_result);
526                 spin_lock_irqsave(&imp->imp_lock, flags);
527                 imp->imp_level = LUSTRE_CONN_NOTCONN;
528                 spin_unlock_irqrestore(&imp->imp_lock, flags);
529                 RETURN(recon_result);
530         }
531
532         if (recon_result == RECON_RESULT_RECOVERING) {
533                 CDEBUG(D_HA, "replay requested by %s\n",
534                        imp->imp_target_uuid.uuid);
535                 rc = ptlrpc_replay(imp);
536                 if (rc)
537                         GOTO(out, rc);
538
539                 if (ptlrpc_ldlm_replay_locks == NULL)
540                         CERROR("ptlrpc/ldlm hook is NULL!  Please tell phil\n");
541                 else
542                         rc = ptlrpc_ldlm_replay_locks(imp);
543                 if (rc)
544                         GOTO(out, rc);
545
546                 rc = signal_completed_replay(imp);
547                 if (rc)
548                         GOTO(out, rc);
549         } else if (recon_result == RECON_RESULT_RECONNECTED) {
550                 CDEBUG(D_HA, "reconnected to %s@%s\n",
551                        imp->imp_target_uuid.uuid,
552                        imp->imp_connection->c_remote_uuid.uuid);
553         } else if (recon_result == RECON_RESULT_EVICTED) {
554                 CDEBUG(D_HA, "evicted from %s@%s; invalidating\n",
555                        imp->imp_target_uuid.uuid,
556                        imp->imp_connection->c_remote_uuid.uuid);
557                 ptlrpc_set_import_active(imp, 0);
558 //                ptlrpc_invalidate_import_state(imp);
559         } else {
560                 LBUG();
561         }
562
563         ptlrpc_set_import_active(imp, 1);
564
565         rc = ptlrpc_resend(imp);
566
567         spin_lock_irqsave(&imp->imp_lock, flags);
568         imp->imp_level = LUSTRE_CONN_FULL;
569         spin_unlock_irqrestore(&imp->imp_lock, flags);
570
571         ptlrpc_wake_delayed(imp);
572         EXIT;
573  out:
574         return rc;
575 }
576
577 void ptlrpc_fail_export(struct obd_export *exp)
578 {
579         int rc, already_failed;
580         struct lustre_handle hdl;
581         unsigned long flags;
582
583         spin_lock_irqsave(&exp->exp_lock, flags);
584         already_failed = exp->exp_failed;
585         exp->exp_failed = 1;
586         spin_unlock_irqrestore(&exp->exp_lock, flags);
587
588         if (already_failed) {
589                 CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n",
590                        exp, exp->exp_client_uuid.uuid);
591                 return;
592         }
593
594         CDEBUG(D_HA, "disconnecting export %p/%s\n",
595                exp, exp->exp_client_uuid.uuid);
596         hdl.cookie = exp->exp_handle.h_cookie;
597         rc = obd_disconnect(&hdl, 0);
598         if (rc)
599                 CERROR("disconnecting export %p failed: %d\n", exp, rc);
600 }