Whamcloud - gitweb
- Maintain a list in the ll_inode_data of data (OST) locks held by this client
[fs/lustre-release.git] / lustre / llite / recover.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  * Lustre Light Super operations
5  *
6  * This code is issued under the GNU General Public License.
7  * See the file COPYING in this distribution
8  *
9  * Copyright (C) 1996 Peter J. Braam <braam@stelias.com>
10  * Copyright (C) 1999 Stelias Computing Inc. <braam@stelias.com>
11  * Copyright (C) 1999 Seagate Technology Inc.
12  * Copyright (C) 2001 Mountain View Data, Inc.
13  * Copyright (C) 2002 Cluster File Systems, Inc.
14  *
15  */
16
17 #include <linux/config.h>
18 #include <linux/module.h>
19
20 #define DEBUG_SUBSYSTEM S_LLITE
21
22 #include <linux/lustre_lite.h>
23 #include <linux/lustre_ha.h>
24
25
26 static int ll_reconnect(struct ll_sb_info *sbi)
27 {
28         struct ll_fid rootfid;
29         __u64 last_committed;
30         __u32 last_xid;
31         int err;
32         struct ptlrpc_request *request; 
33
34         ptlrpc_readdress_connection(sbi2mdc(sbi)->cl_conn, "mds");
35
36         err = connmgr_connect(ptlrpc_connmgr, sbi2mdc(sbi)->cl_conn);
37         if (err) {
38                 CERROR("cannot connect to MDS: rc = %d\n", err);
39                 ptlrpc_put_connection(sbi2mdc(sbi)->cl_conn);
40                 GOTO(out_disc, err = -ENOTCONN);
41         }
42         sbi2mdc(sbi)->cl_conn->c_level = LUSTRE_CONN_CON;
43
44         /* XXX: need to store the last_* values somewhere */
45         err = mdc_getstatus(&sbi->ll_mdc_conn,
46                           &rootfid, &last_committed, 
47                           &last_xid,
48                           &request);
49         if (err) {
50                 CERROR("cannot mds_connect: rc = %d\n", err);
51                 GOTO(out_disc, err = -ENOTCONN);
52         }
53         sbi2mdc(sbi)->cl_client->cli_last_xid = last_xid;
54         sbi2mdc(sbi)->cl_conn->c_level = LUSTRE_CONN_RECOVD;
55
56  out_disc:
57         return err;
58 }
59
60 int ll_recover(struct ptlrpc_client *cli)
61 {
62         struct ptlrpc_request *req;
63         struct list_head *tmp, *pos;
64         struct ll_sb_info *sbi = cli->cli_data;
65         int rc = 0;
66         ENTRY;
67
68         /* 1. reconnect */
69         ll_reconnect(sbi);
70         
71         /* 2. walk the request list */
72         spin_lock(&cli->cli_lock);
73         list_for_each_safe(tmp, pos, &cli->cli_sending_head) { 
74                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
75                 
76                 /* replay what needs to be replayed */
77                 if (req->rq_flags & PTL_RPC_FL_REPLAY) {
78                         CDEBUG(D_INODE, "req %Ld needs replay [last rcvd %Ld]\n", 
79                                req->rq_xid, cli->cli_last_xid);
80                         rc = ptlrpc_replay_req(req); 
81                         if (rc) { 
82                                 CERROR("recovery replay error %d for request %Ld\n", 
83                                        rc, req->rq_xid); 
84                                 GOTO(out, rc);
85                         }
86                 }
87
88                 /* server has seen req, we have reply: skip */
89                 if ((req->rq_flags & PTL_RPC_FL_REPLIED)  &&
90                     req->rq_xid <= cli->cli_last_xid) { 
91                         CDEBUG(D_INODE, "req %Ld was complete: skip [last rcvd %Ld]\n", 
92                                req->rq_xid, cli->cli_last_xid);
93                         continue;
94                 }
95
96                 /* server has lost req, we have reply: resend, ign reply */
97                 if ((req->rq_flags & PTL_RPC_FL_REPLIED)  &&
98                     req->rq_xid > cli->cli_last_xid) { 
99                         CDEBUG(D_INODE, "lost req %Ld have rep: replay [last rcvd %Ld]\n", 
100                                req->rq_xid, cli->cli_last_xid);
101                         rc = ptlrpc_replay_req(req); 
102                         if (rc) {
103                                 CERROR("request resend error %d for request %Ld\n", 
104                                        rc, req->rq_xid); 
105                                 GOTO(out, rc);
106                         }
107                 }
108
109                 /* server has seen req, we have lost reply: -ERESTARTSYS */
110                 if ( !(req->rq_flags & PTL_RPC_FL_REPLIED)  &&
111                      req->rq_xid <= cli->cli_last_xid) { 
112                         CDEBUG(D_INODE, "lost rep %Ld srv did req: restart [last rcvd %Ld]\n", 
113                                req->rq_xid, cli->cli_last_xid);
114                         ptlrpc_restart_req(req);
115                 }
116
117                 /* service has not seen req, no reply: resend */
118                 if ( !(req->rq_flags & PTL_RPC_FL_REPLIED)  &&
119                      req->rq_xid > cli->cli_last_xid) {
120                         CDEBUG(D_INODE, "lost rep/req %Ld: resend [last rcvd %Ld]\n", 
121                                req->rq_xid, cli->cli_last_xid);
122                         ptlrpc_resend_req(req);
123                 }
124
125         }
126
127         sbi2mdc(sbi)->cl_conn->c_level = LUSTRE_CONN_FULL;
128         recovd_cli_fixed(cli);
129
130         /* Finally, continue what we delayed since recovery started */
131         list_for_each_safe(tmp, pos, &cli->cli_delayed_head) { 
132                 req = list_entry(tmp, struct ptlrpc_request, rq_list);
133                 ptlrpc_continue_req(req);
134         }
135
136         EXIT;
137  out:
138         spin_unlock(&cli->cli_lock);
139         return rc;
140 }