Whamcloud - gitweb
Merge trivial changes from branch to head.
[fs/lustre-release.git] / lustre / llite / recover.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Lustre Light Super operations
5  *
6  * This code is issued under the GNU General Public License.
7  * See the file COPYING in this distribution
8  *
9  * Copyright (C) 1996 Peter J. Braam <braam@stelias.com>
10  * Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
11  * Copyright (C) 1999 Seagate Technology Inc.
12  * Copyright (C) 2001 Mountain View Data, Inc.
13  * Copyright (C) 2002 Cluster File Systems, Inc.
14  *
15  */
16
17 #include <linux/config.h>
18 #include <linux/module.h>
19
20 #define DEBUG_SUBSYSTEM S_LLITE
21
22 #include <linux/lustre_lite.h>
23 #include <linux/lustre_ha.h>
24
25
26 static int ll_reconnect(struct ll_sb_info *sbi)
27 {
28         struct ll_fid rootfid;
29         __u64 last_committed, last_rcvd;
30         __u32 last_xid;
31         int err;
32         struct ptlrpc_request *request; 
33
34         ptlrpc_readdress_connection(sbi2mdc(sbi)->mdc_conn, "mds");
35
36         err = connmgr_connect(ptlrpc_connmgr, sbi2mdc(sbi)->mdc_conn);
37         if (err) {
38                 CERROR("cannot connect to MDS: rc = %d\n", err);
39                 ptlrpc_put_connection(sbi2mdc(sbi)->mdc_conn);
40                 GOTO(out_disc, err = -ENOTCONN);
41         }
42         sbi2mdc(sbi)->mdc_conn->c_level = LUSTRE_CONN_CON;
43
44         /* XXX: need to store the last_* values somewhere */
45         err = mdc_getstatus(&sbi->ll_mdc_conn,
46                           &rootfid, &last_committed, 
47                           &last_rcvd,
48                           &last_xid,
49                           &request);
50         if (err) {
51                 CERROR("cannot mds_connect: rc = %d\n", err);
52                 GOTO(out_disc, err = -ENOTCONN);
53         }
54         sbi2mdc(sbi)->mdc_client->cli_last_rcvd = last_xid;
55         sbi2mdc(sbi)->mdc_conn->c_level = LUSTRE_CONN_RECOVD;
56
57  out_disc:
58         return err;
59 }
60
61
62 int ll_recover(struct ptlrpc_client *cli)
63 {
64         struct ptlrpc_request *req;
65         struct list_head *tmp, *pos;
66         struct ll_sb_info *sbi = cli->cli_data;
67         int rc = 0;
68         ENTRY;
69
70         /* 1. reconnect */
71         ll_reconnect(sbi);
72         
73         /* 2. walk the request list */
74         spin_lock(&cli->cli_lock);
75         list_for_each_safe(tmp, pos, &cli->cli_sending_head) { 
76                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
77                 
78                 /* replay what needs to be replayed */
79                 if (req->rq_flags & PTL_RPC_FL_REPLAY) {
80                         CDEBUG(D_INODE, "req %Ld needs replay [last rcvd %Ld]\n", 
81                                req->rq_xid, cli->cli_last_rcvd);
82                         rc = ptlrpc_replay_req(req); 
83                         if (rc) { 
84                                 CERROR("recovery replay error %d for request %Ld\n", 
85                                        rc, req->rq_xid); 
86                                 GOTO(out, rc);
87                         }
88                 }
89
90                 /* server has seen req, we have reply: skip */
91                 if ((req->rq_flags & PTL_RPC_FL_REPLIED)  &&
92                     req->rq_xid <= cli->cli_last_rcvd) { 
93                         CDEBUG(D_INODE, "req %Ld was complete: skip [last rcvd %Ld]\n", 
94                                req->rq_xid, cli->cli_last_rcvd);
95                         continue;
96                 }
97
98                 /* server has lost req, we have reply: resend, ign reply */
99                 if ((req->rq_flags & PTL_RPC_FL_REPLIED)  &&
100                     req->rq_xid > cli->cli_last_rcvd) { 
101                         CDEBUG(D_INODE, "lost req %Ld have rep: replay [last rcvd %Ld]\n", 
102                                req->rq_xid, cli->cli_last_rcvd);
103                         rc = ptlrpc_replay_req(req); 
104                         if (rc) {
105                                 CERROR("request resend error %d for request %Ld\n", 
106                                        rc, req->rq_xid); 
107                                 GOTO(out, rc);
108                         }
109                 }
110
111                 /* server has seen req, we have lost reply: -ERESTARTSYS */
112                 if ( !(req->rq_flags & PTL_RPC_FL_REPLIED)  &&
113                      req->rq_xid <= cli->cli_last_rcvd) { 
114                         CDEBUG(D_INODE, "lost rep %Ld srv did req: restart [last rcvd %Ld]\n", 
115                                req->rq_xid, cli->cli_last_rcvd);
116                         ptlrpc_restart_req(req);
117                 }
118
119                 /* service has not seen req, no reply: resend */
120                 if ( !(req->rq_flags & PTL_RPC_FL_REPLIED)  &&
121                      req->rq_xid > cli->cli_last_rcvd) {
122                         CDEBUG(D_INODE, "lost rep/req %Ld: resend [last rcvd %Ld]\n", 
123                                req->rq_xid, cli->cli_last_rcvd);
124                         ptlrpc_resend_req(req);
125                 }
126
127         }
128
129         sbi2mdc(sbi)->mdc_conn->c_level = LUSTRE_CONN_FULL;
130         recovd_cli_fixed(cli);
131
132         /* Finally, continue what we delayed since recovery started */
133         list_for_each_safe(tmp, pos, &cli->cli_delayed_head) { 
134                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
135                 ptlrpc_continue_req(req);
136         }
137
138         EXIT;
139  out:
140         spin_unlock(&cli->cli_lock);
141         return rc;
142 }