Whamcloud - gitweb
374e46e25b44c0500700b6cb0c3c6a559f2c2be9
[fs/lustre-release.git] / lustre / ptlrpc / import.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  Copyright (c) 2002, 2003 Cluster File Systems, Inc.
5  *   Author: Mike Shaver <shaver@clusterfs.com>
6  *
7  *   This file is part of Lustre, http://www.lustre.org.
8  *
9  *   Lustre is free software; you can redistribute it and/or
10  *   modify it under the terms of version 2 of the GNU General Public
11  *   License as published by the Free Software Foundation.
12  *
13  *   Lustre is distributed in the hope that it will be useful,
14  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
15  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  *   GNU General Public License for more details.
17  *
18  *   You should have received a copy of the GNU General Public License
19  *   along with Lustre; if not, write to the Free Software
20  *   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21  */
22
23 #define DEBUG_SUBSYSTEM S_RPC
24 #ifdef __KERNEL__
25 # include <linux/config.h>
26 # include <linux/module.h>
27 # include <linux/kmod.h>
28 #else
29 # include <liblustre.h>
30 #endif
31
32 #include <linux/obd_support.h>
33 #include <linux/lustre_ha.h>
34 #include <linux/lustre_net.h>
35 #include <linux/lustre_import.h>
36 #include <linux/lustre_export.h>
37 #include <linux/obd.h>
38 #include <linux/obd_class.h>
39
40 #include "ptlrpc_internal.h"
41
/* Per-request state for an asynchronous CONNECT.  ptlrpc_connect_import()
 * stores one of these in the request's rq_async_args and
 * ptlrpc_connect_interpret() reads it back when the reply is handled. */
struct ptlrpc_connect_async_args {
         /* imp_peer_committed_transno sampled before a reconnect was sent;
          * stays 0 for an initial connect */
         __u64 pcaa_peer_committed;
        /* non-zero when the import had no remote handle cookie at connect
         * time, i.e. this is the first connect rather than a reconnect */
        int pcaa_initial_connect;
};
46
/* A CLOSED import should remain so.
 *
 * Move @imp to @state unless it is already LUSTRE_IMP_CLOSED, logging the
 * transition at D_HA.  The caller must already hold imp->imp_lock.
 *
 * Both parameters are parenthesised so that expression arguments (for
 * example "base + 1") bind correctly.  Note that @imp is still evaluated
 * more than once, so it must not have side effects. */
#define IMPORT_SET_STATE_NOLOCK(imp, state)                                    \
do {                                                                           \
        if ((imp)->imp_state != LUSTRE_IMP_CLOSED) {                           \
               CDEBUG(D_HA, "%p %s: changing import state from %s to %s\n",    \
                      (imp), (imp)->imp_target_uuid.uuid,                      \
                      ptlrpc_import_state_name((imp)->imp_state),              \
                      ptlrpc_import_state_name(state));                        \
               (imp)->imp_state = (state);                                     \
        }                                                                      \
} while(0)
58
/* Locked form of IMPORT_SET_STATE_NOLOCK(): takes imp->imp_lock with
 * interrupts saved, performs the state change, and releases the lock.
 *
 * The parameters are parenthesised here and again when forwarded, so
 * expression arguments are safe.  The macro declares its own local
 * "flags"; do not pass an argument expression that itself mentions a
 * variable called "flags". */
#define IMPORT_SET_STATE(imp, state)                      \
do {                                                      \
        unsigned long flags;                              \
                                                          \
        spin_lock_irqsave(&(imp)->imp_lock, flags);       \
        IMPORT_SET_STATE_NOLOCK((imp), (state));          \
        spin_unlock_irqrestore(&(imp)->imp_lock, flags);  \
} while(0)
67
68
69 static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
70                                     void * data, int rc);
71 int ptlrpc_import_recovery_state_machine(struct obd_import *imp);
72
73 /* Only this function is allowed to change the import state when it is
74  * CLOSED. I would rather refcount the import and free it after
75  * disconnection like we do with exports. To do that, the client_obd
76  * will need to save the peer info somewhere other than in the import,
77  * though. */
78 int ptlrpc_init_import(struct obd_import *imp)
79 {
80         unsigned long flags;
81         
82         spin_lock_irqsave(&imp->imp_lock, flags);
83
84         imp->imp_generation++;
85         imp->imp_state =  LUSTRE_IMP_NEW;
86
87         spin_unlock_irqrestore(&imp->imp_lock, flags);
88
89         return 0;
90 }
91
92 /* Returns true if import was FULL, false if import was already not
93  * connected.
94  */
95 int ptlrpc_set_import_discon(struct obd_import *imp)
96 {
97         unsigned long flags;
98         int rc = 0;
99         
100         spin_lock_irqsave(&imp->imp_lock, flags);
101
102         if (imp->imp_state == LUSTRE_IMP_FULL) {
103                 IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON);
104                 spin_unlock_irqrestore(&imp->imp_lock, flags); 
105                 obd_import_event(imp->imp_obd, imp, IMP_EVENT_DISCON);
106                 rc = 1;
107         } else {
108                 spin_unlock_irqrestore(&imp->imp_lock, flags);
109                 CDEBUG(D_HA, "%p %s: import already not connected: %s\n",
110                        imp,imp->imp_client->cli_name, 
111                        ptlrpc_import_state_name(imp->imp_state));
112         }
113
114         return rc;
115 }
116
117 /*
118  * This acts as a barrier; all existing requests are rejected, and
119  * no new requests will be accepted until the import is valid again.
120  */
121 void ptlrpc_deactivate_import(struct obd_import *imp)
122 {
123         unsigned long flags;
124         ENTRY;
125
126         spin_lock_irqsave(&imp->imp_lock, flags);
127         CDEBUG(D_HA, "setting import %s INVALID\n",
128                imp->imp_target_uuid.uuid);
129         imp->imp_invalid = 1;
130         imp->imp_generation++;
131         spin_unlock_irqrestore(&imp->imp_lock, flags);
132
133         ptlrpc_abort_inflight(imp);
134         obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE);
135 }
136
137 /*
138  * This function will invalidate the import, if necessary, then block
139  * for all the RPC completions, and finally notify the obd to
140  * invalidate its state (ie cancel locks, clear pending requests,
141  * etc).
142  *
143  * in_rpc: true if this is called while processing an rpc, like
144  *    CONNECT. It will allow for one RPC to be inflight while
145  *    waiting for requests to complete. Ugly, yes, but I don't see an
146  *    cleaner way right now.
147  */
148 void ptlrpc_invalidate_import(struct obd_import *imp, int in_rpc)
149 {
150         struct l_wait_info lwi;
151         int inflight = 0;
152         int rc;
153
154         if (!imp->imp_invalid)
155                 ptlrpc_deactivate_import(imp);
156         
157         LASSERT(imp->imp_invalid);
158
159         if (in_rpc)
160                 inflight = 1;
161         /* wait for all requests to error out and call completion 
162            callbacks */
163         lwi = LWI_TIMEOUT_INTR(MAX(obd_timeout * HZ, 1), NULL, 
164                                NULL, NULL);
165         rc = l_wait_event(imp->imp_recovery_waitq, 
166                           (atomic_read(&imp->imp_inflight) == inflight), 
167                           &lwi);
168         
169         if (rc)
170                 CERROR("%s: rc = %d waiting for callback (%d != %d)\n",
171                        imp->imp_target_uuid.uuid, rc,
172                        atomic_read(&imp->imp_inflight), inflight);
173         
174         obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE);
175 }
176
177 static void ptlrpc_activate_import(struct obd_import *imp)
178 {
179         struct obd_device *obd = imp->imp_obd;
180         unsigned long flags;
181
182         spin_lock_irqsave(&imp->imp_lock, flags);
183         imp->imp_invalid = 0;
184         spin_unlock_irqrestore(&imp->imp_lock, flags);
185
186         obd_import_event(obd, imp, IMP_EVENT_ACTIVE);
187 }
188
189 void ptlrpc_fail_import(struct obd_import *imp, int generation)
190 {
191         ENTRY;
192
193         LASSERT (!imp->imp_dlm_fake);
194
195         if (ptlrpc_set_import_discon(imp)) {
196                 unsigned long flags;
197
198                 if (!imp->imp_replayable) {
199                         CDEBUG(D_HA, "import %s@%s for %s not replayable, "
200                                "auto-deactivating\n",
201                                imp->imp_target_uuid.uuid,
202                                imp->imp_connection->c_remote_uuid.uuid,
203                                imp->imp_obd->obd_name);
204                         ptlrpc_deactivate_import(imp);
205                 }
206                 
207                 CDEBUG(D_HA, "%s: waking up pinger\n", 
208                        imp->imp_target_uuid.uuid);
209                 
210                 spin_lock_irqsave(&imp->imp_lock, flags);
211                 imp->imp_force_verify = 1;
212                 spin_unlock_irqrestore(&imp->imp_lock, flags);
213                 
214                 ptlrpc_pinger_wake_up();
215                 
216         }
217         EXIT;
218 }
219
220 int ptlrpc_connect_import(struct obd_import *imp, char * new_uuid)
221 {
222         struct obd_device *obd = imp->imp_obd;
223         int initial_connect = 0;
224         int rc;
225         __u64 committed_before_reconnect = 0;
226         struct ptlrpc_request *request;
227         int size[] = {sizeof(imp->imp_target_uuid),
228                                  sizeof(obd->obd_uuid),
229                                  sizeof(imp->imp_dlm_handle)};
230         char *tmp[] = {imp->imp_target_uuid.uuid,
231                        obd->obd_uuid.uuid,
232                        (char *)&imp->imp_dlm_handle};
233         struct ptlrpc_connect_async_args *aa;
234         unsigned long flags;
235
236         spin_lock_irqsave(&imp->imp_lock, flags);
237         if (imp->imp_state == LUSTRE_IMP_CLOSED) {
238                 spin_unlock_irqrestore(&imp->imp_lock, flags);
239                 CERROR("can't connect to a closed import\n");
240                 RETURN(-EINVAL);
241         } else if (imp->imp_state == LUSTRE_IMP_FULL) {
242                 spin_unlock_irqrestore(&imp->imp_lock, flags);
243                 CERROR("already connected\n");
244                 RETURN(0);
245         } else if (imp->imp_state == LUSTRE_IMP_CONNECTING) {
246                 spin_unlock_irqrestore(&imp->imp_lock, flags);
247                 CERROR("already connecting\n");
248                 RETURN(-EALREADY);
249         }
250
251         IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CONNECTING);
252
253         imp->imp_conn_cnt++; 
254         imp->imp_last_replay_transno = 0;
255
256         if (imp->imp_remote_handle.cookie == 0) {
257                 initial_connect = 1;
258         } else {
259                 committed_before_reconnect = imp->imp_peer_committed_transno;;
260
261         }
262
263
264         spin_unlock_irqrestore(&imp->imp_lock, flags);
265
266         if (new_uuid) {
267                 struct ptlrpc_connection *conn;
268                 struct obd_uuid uuid;
269                 struct obd_export *dlmexp;
270
271                 obd_str2uuid(&uuid, new_uuid);
272
273                 conn = ptlrpc_uuid_to_connection(&uuid);
274                 if (!conn)
275                         GOTO(out, rc = -ENOENT);
276
277                 CDEBUG(D_HA, "switching import %s/%s from %s to %s\n",
278                        imp->imp_target_uuid.uuid, imp->imp_obd->obd_name,
279                        imp->imp_connection->c_remote_uuid.uuid,
280                        conn->c_remote_uuid.uuid);
281
282                 /* Switch the import's connection and the DLM export's
283                  * connection (which are almost certainly the same, but we
284                  * keep distinct refs just to make things clearer. I think. */
285                 if (imp->imp_connection)
286                         ptlrpc_put_connection(imp->imp_connection);
287                 /* We hand off the ref from ptlrpc_get_connection. */
288                 imp->imp_connection = conn;
289
290                 dlmexp = class_conn2export(&imp->imp_dlm_handle);
291                 
292                 LASSERT(dlmexp != NULL);
293
294                 if (dlmexp->exp_connection)
295                         ptlrpc_put_connection(dlmexp->exp_connection);
296                 dlmexp->exp_connection = ptlrpc_connection_addref(conn);
297                 class_export_put(dlmexp);
298
299         }
300
301         request = ptlrpc_prep_req(imp, imp->imp_connect_op, 3, size, tmp);
302         if (!request)
303                 GOTO(out, rc = -ENOMEM);
304
305 #ifndef __KERNEL__
306         lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_LIBCLIENT);
307 #endif
308
309         request->rq_send_state = LUSTRE_IMP_CONNECTING;
310         request->rq_replen = lustre_msg_size(0, NULL);
311         request->rq_interpret_reply = ptlrpc_connect_interpret;
312
313         LASSERT (sizeof (*aa) <= sizeof (request->rq_async_args));
314         aa = (struct ptlrpc_connect_async_args *)&request->rq_async_args;
315         memset(aa, 0, sizeof *aa);
316
317         aa->pcaa_peer_committed = committed_before_reconnect;
318         aa->pcaa_initial_connect = initial_connect;
319
320         if (aa->pcaa_initial_connect)
321                 imp->imp_replayable = 1;
322
323         ptlrpcd_add_req(request);
324         rc = 0;
325 out:
326         if (rc != 0) {
327                 IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
328         }
329
330         RETURN(rc);
331 }
332
/* Reply handler for the CONNECT request queued by ptlrpc_connect_import()
 * (installed there as rq_interpret_reply).
 *
 * request: the completed CONNECT request.
 * data:    treated as the struct ptlrpc_connect_async_args that
 *          ptlrpc_connect_import() stored in rq_async_args.
 * rc:      completion status of the request; non-zero skips all reply
 *          processing and goes straight to the failure path.
 *
 * Returns 0 if the import is CLOSED or if a reconnect was re-issued after
 * -ENOTCONN; otherwise returns the final rc after waking
 * imp_recovery_waitq.
 */
static int ptlrpc_connect_interpret(struct ptlrpc_request *request,
                                    void * data, int rc)
{
        struct ptlrpc_connect_async_args *aa = data;
        struct obd_import *imp = request->rq_import;
        struct lustre_handle old_hdl;
        unsigned long flags;
        int msg_flags;
        ENTRY;
        
        /* A CLOSED import is left completely alone. */
        spin_lock_irqsave(&imp->imp_lock, flags);
        if (imp->imp_state == LUSTRE_IMP_CLOSED) {
                spin_unlock_irqrestore(&imp->imp_lock, flags);
                RETURN(0);
        }
        spin_unlock_irqrestore(&imp->imp_lock, flags);

        if (rc)
                GOTO(out, rc);

        msg_flags = lustre_msg_get_op_flags(request->rq_repmsg);

        /* First connect: record whether the target is replayable, adopt
         * the handle from the reply and go straight to FULL. */
        if (aa->pcaa_initial_connect) {
                if (msg_flags & MSG_CONNECT_REPLAYABLE) {
                        CDEBUG(D_HA, "connected to replayable target: %s\n",
                               imp->imp_target_uuid.uuid);
                        imp->imp_pingable = imp->imp_replayable = 1;
                } else {
                        imp->imp_replayable = 0;
                }
                imp->imp_remote_handle = request->rq_repmsg->handle;
                IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
                GOTO(finish, rc = 0);
        }

        /* Determine what recovery state to move the import to. */
        if (MSG_CONNECT_RECONNECT & msg_flags) {
                /* An all-zero handle in the reply is treated as the
                 * server rejecting our handle. */
                memset(&old_hdl, 0, sizeof(old_hdl));
                if (!memcmp(&old_hdl, &request->rq_repmsg->handle,
                            sizeof (old_hdl))) {
                        CERROR("%s@%s didn't like our handle "LPX64
                               ", failed\n", imp->imp_target_uuid.uuid,
                               imp->imp_connection->c_remote_uuid.uuid,
                               imp->imp_dlm_handle.cookie);
                        GOTO(out, rc = -ENOTCONN);
                }

                if (memcmp(&imp->imp_remote_handle, &request->rq_repmsg->handle,
                           sizeof(imp->imp_remote_handle))) {
                        CERROR("%s@%s changed handle from "LPX64" to "LPX64
                               "; copying, but this may foreshadow disaster\n",
                               imp->imp_target_uuid.uuid,
                               imp->imp_connection->c_remote_uuid.uuid,
                               imp->imp_remote_handle.cookie,
                               request->rq_repmsg->handle.cookie);
                        imp->imp_remote_handle = request->rq_repmsg->handle;
                } else {
                        CERROR("reconnected to %s@%s after partition\n",
                               imp->imp_target_uuid.uuid, 
                               imp->imp_connection->c_remote_uuid.uuid);
                }

                /* An import that was invalidated in the meantime goes
                 * through EVICTED; otherwise it can resume via RECOVER. */
                if (imp->imp_invalid)
                        IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
                else
                        IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
        } 
        else if ((MSG_CONNECT_RECOVERING & msg_flags) && !imp->imp_invalid) {
                /* Server is in recovery and our import is still valid:
                 * adopt the new handle and replay. */
                LASSERT(imp->imp_replayable);
                imp->imp_remote_handle = request->rq_repmsg->handle;
                IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY);
        } 
        else {
                /* Neither reconnect nor usable recovery: adopt the new
                 * handle and treat the import as evicted. */
                imp->imp_remote_handle = request->rq_repmsg->handle;
                IMPORT_SET_STATE(imp, LUSTRE_IMP_EVICTED);
        }
        
        /* Sanity checks for a reconnected import. */
        if (!(imp->imp_replayable) != 
             !(msg_flags & MSG_CONNECT_REPLAYABLE)) {
                CERROR("imp_replayable flag does not match server "
                       "after reconnect. We should LBUG right here.\n");
        }

        /* Compare against the committed transno sampled before the
         * reconnect was sent; this is only reported, not acted on. */
        if (request->rq_repmsg->last_committed < aa->pcaa_peer_committed) {
                CERROR("%s went back in time (transno "LPD64
                       " was previously committed, server now claims "LPD64
                       ")! is shared storage not coherent?\n",
                       imp->imp_target_uuid.uuid,
                       aa->pcaa_peer_committed,
                       request->rq_repmsg->last_committed);
        }

finish:
        /* Drive the import from the state chosen above towards FULL. */
        rc = ptlrpc_import_recovery_state_machine(imp);
        if (rc != 0) {
                if (rc == -ENOTCONN) {
                        CDEBUG(D_HA, "evicted/aborted by %s@%s during recovery;"
                               "invalidating and reconnecting\n",
                               imp->imp_target_uuid.uuid,
                               imp->imp_connection->c_remote_uuid.uuid);
                        /* Start a new connect; note this path returns
                         * without the DISCON handling or the wake_up
                         * below. */
                        ptlrpc_connect_import(imp, NULL);
                        RETURN(0);
                } 
        }
 out:
        if (rc != 0) {
                IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
                /* A failed initial connect deactivates the import unless
                 * imp_initial_recov is set. */
                if (aa->pcaa_initial_connect && !imp->imp_initial_recov) {
                        ptlrpc_deactivate_import(imp);
                }
                CDEBUG(D_ERROR, "recovery of %s on %s failed (%d)\n",
                       imp->imp_target_uuid.uuid,
                       (char *)imp->imp_connection->c_remote_uuid.uuid, rc);
        }

        wake_up(&imp->imp_recovery_waitq);
        RETURN(rc);
}
452
453 static int completed_replay_interpret(struct ptlrpc_request *req,
454                                     void * data, int rc)
455 {
456         atomic_dec(&req->rq_import->imp_replay_inflight);
457         ptlrpc_import_recovery_state_machine(req->rq_import);
458         RETURN(0);
459 }
460
461 static int signal_completed_replay(struct obd_import *imp)
462  {
463         struct ptlrpc_request *req;
464         ENTRY;
465
466         LASSERT(atomic_read(&imp->imp_replay_inflight) == 0);
467         atomic_inc(&imp->imp_replay_inflight);
468
469         req = ptlrpc_prep_req(imp, OBD_PING, 0, NULL, NULL);
470         if (!req)
471                 RETURN(-ENOMEM);
472
473         req->rq_replen = lustre_msg_size(0, NULL);
474         req->rq_send_state = LUSTRE_IMP_REPLAY_WAIT;
475         req->rq_reqmsg->flags |= MSG_LAST_REPLAY;
476         req->rq_timeout *= 3; 
477         req->rq_interpret_reply = completed_replay_interpret;
478
479         ptlrpcd_add_req(req);
480         RETURN(0);
481 }
482
483 int ptlrpc_import_recovery_state_machine(struct obd_import *imp)
484 {
485         int rc = 0;
486         int inflight;
487
488         if (imp->imp_state == LUSTRE_IMP_EVICTED) {
489                 CDEBUG(D_HA, "evicted from %s@%s; invalidating\n",
490                        imp->imp_target_uuid.uuid,
491                        imp->imp_connection->c_remote_uuid.uuid);
492
493                 ptlrpc_invalidate_import(imp, 1);
494
495                 IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
496         } 
497         
498         if (imp->imp_state == LUSTRE_IMP_REPLAY) {
499                 CDEBUG(D_HA, "replay requested by %s\n",
500                        imp->imp_target_uuid.uuid);
501                 rc = ptlrpc_replay_next(imp, &inflight);
502                 if (inflight == 0 && 
503                     atomic_read(&imp->imp_replay_inflight) == 0) {
504                         IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_LOCKS);
505                         rc = ldlm_replay_locks(imp);
506                         if (rc)
507                                 GOTO(out, rc);
508                 }
509                 rc = 0;
510         }
511
512         if (imp->imp_state == LUSTRE_IMP_REPLAY_LOCKS) {
513                 if (atomic_read(&imp->imp_replay_inflight) == 0) {
514                         IMPORT_SET_STATE(imp, LUSTRE_IMP_REPLAY_WAIT);
515                         rc = signal_completed_replay(imp);
516                         if (rc)
517                                 GOTO(out, rc);
518                 }
519
520         }
521
522         if (imp->imp_state == LUSTRE_IMP_REPLAY_WAIT) {
523                 if (atomic_read(&imp->imp_replay_inflight) == 0) {
524                         IMPORT_SET_STATE(imp, LUSTRE_IMP_RECOVER);
525                 }
526         }
527
528         if (imp->imp_state == LUSTRE_IMP_RECOVER) {
529                 CDEBUG(D_HA, "reconnected to %s@%s\n",
530                        imp->imp_target_uuid.uuid,
531                        imp->imp_connection->c_remote_uuid.uuid);
532
533                 rc = ptlrpc_resend(imp);
534                 if (rc)
535                         GOTO(out, rc);
536                 IMPORT_SET_STATE(imp, LUSTRE_IMP_FULL);
537                 ptlrpc_activate_import(imp);
538         } 
539
540         if (imp->imp_state == LUSTRE_IMP_FULL) {
541                 wake_up(&imp->imp_recovery_waitq);
542                 ptlrpc_wake_delayed(imp);
543         }
544
545  out:
546         RETURN(rc);
547 }
548
/* Timeout callback handed to LWI_TIMEOUT_INTR() by
 * ptlrpc_disconnect_import().  It ignores its argument and always
 * returns 0 (presumably telling l_wait_event() to keep waiting --
 * confirm against the l_wait_event definition). */
static int back_to_sleep(void *unused)
{
        (void)unused;

        return 0;
}
553
554 int ptlrpc_disconnect_import(struct obd_import *imp)
555 {
556         struct ptlrpc_request *request;
557         int rq_opc;
558         int rc = 0;
559         unsigned long flags;
560         ENTRY;
561
562         switch (imp->imp_connect_op) {
563         case OST_CONNECT: rq_opc = OST_DISCONNECT; break;
564         case MDS_CONNECT: rq_opc = MDS_DISCONNECT; break;
565         case MGMT_CONNECT:rq_opc = MGMT_DISCONNECT;break;
566         default:
567                 CERROR("don't know how to disconnect from %s (connect_op %d)\n",
568                        imp->imp_target_uuid.uuid, imp->imp_connect_op);
569                 RETURN(-EINVAL);
570         }
571
572
573         if (ptlrpc_import_in_recovery(imp)) {
574                 struct l_wait_info lwi;
575                 lwi = LWI_TIMEOUT_INTR(MAX(obd_timeout * HZ, 1), back_to_sleep, 
576                                        NULL, NULL);
577                 rc = l_wait_event(imp->imp_recovery_waitq, 
578                                   !ptlrpc_import_in_recovery(imp), &lwi);
579
580         }
581
582         spin_lock_irqsave(&imp->imp_lock, flags);
583         if (imp->imp_state != LUSTRE_IMP_FULL) {
584                 GOTO(out, 0);
585         }
586         spin_unlock_irqrestore(&imp->imp_lock, flags);
587
588         request = ptlrpc_prep_req(imp, rq_opc, 0, NULL, NULL);
589         if (request) {
590                 /* For non-replayable connections, don't attempt
591                    reconnect if this fails */
592                 if (!imp->imp_replayable) {
593                         request->rq_no_resend = 1;
594                         IMPORT_SET_STATE(imp, LUSTRE_IMP_CONNECTING);
595                         request->rq_send_state =  LUSTRE_IMP_CONNECTING;
596                 }
597                 request->rq_replen = lustre_msg_size(0, NULL);
598                 rc = ptlrpc_queue_wait(request);
599                 ptlrpc_req_finished(request);
600         }
601
602         spin_lock_irqsave(&imp->imp_lock, flags);
603 out:
604         IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_CLOSED);
605         memset(&imp->imp_remote_handle, 0, sizeof(imp->imp_remote_handle));
606         spin_unlock_irqrestore(&imp->imp_lock, flags);
607
608         RETURN(rc);
609 }
610