Whamcloud - gitweb
Ignore PtlPut error, so that we will still timeout and trigger recovery.
[fs/lustre-release.git] / lustre / ptlrpc / recovd.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  obd/rpc/recovd.c
5  *
6  *  Lustre High Availability Daemon
7  *
8  *  Copyright (C) 2001, 2002 Cluster File Systems, Inc.
9  *
10  *  This code is issued under the GNU General Public License.
11  *  See the file COPYING in this distribution
12  *
13  *  by Peter Braam <braam@clusterfs.com>
14  *
15  */
16
17 #define DEBUG_SUBSYSTEM S_RPC
18
19 #include <linux/lustre_lite.h>
20 #include <linux/lustre_ha.h>
21 #include <linux/obd_support.h>
22
23 void recovd_conn_manage(struct ptlrpc_connection *conn,
24                         struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover)
25 {
26         struct recovd_data *rd = &conn->c_recovd_data;
27         ENTRY;
28
29         rd->rd_recovd = recovd;
30         rd->rd_recover = recover;
31
32         spin_lock(&recovd->recovd_lock);
33         list_add(&rd->rd_managed_chain, &recovd->recovd_managed_items);
34         spin_unlock(&recovd->recovd_lock);
35
36         EXIT;
37 }
38
39 void recovd_conn_fail(struct ptlrpc_connection *conn)
40 {
41         struct recovd_data *rd = &conn->c_recovd_data;
42         struct recovd_obd *recovd = rd->rd_recovd;
43         ENTRY;
44
45         if (!recovd) {
46                 CERROR("no recovd for connection %p\n", conn);
47                 return;
48         }
49
50         spin_lock(&recovd->recovd_lock);
51         list_del(&rd->rd_managed_chain);
52         list_add_tail(&rd->rd_managed_chain, &recovd->recovd_troubled_items);
53         spin_unlock(&recovd->recovd_lock);
54
55         wake_up(&recovd->recovd_waitq);
56
57         EXIT;
58 }
59
60 /* this function must be called with conn->c_lock held */
61 void recovd_conn_fixed(struct ptlrpc_connection *conn)
62 {
63         struct recovd_data *rd = &conn->c_recovd_data;
64         ENTRY;
65
66         list_del(&rd->rd_managed_chain);
67         list_add(&rd->rd_managed_chain, &rd->rd_recovd->recovd_managed_items);
68
69         EXIT;
70 }
71
72
73 static int recovd_check_event(struct recovd_obd *recovd)
74 {
75         int rc = 0;
76         ENTRY;
77
78         spin_lock(&recovd->recovd_lock);
79
80         if (recovd->recovd_phase == RECOVD_IDLE &&
81             !list_empty(&recovd->recovd_troubled_items)) {
82                 GOTO(out, rc = 1);
83         }
84
85         if (recovd->recovd_flags & RECOVD_STOPPING)
86                 GOTO(out, rc = 1);
87
88         if (recovd->recovd_flags & RECOVD_FAILED) {
89                 LASSERT(recovd->recovd_phase != RECOVD_IDLE && 
90                         recovd->recovd_current_rd);
91                 GOTO(out, rc = 1);
92         }
93
94         if (recovd->recovd_phase == recovd->recovd_next_phase)
95                 GOTO(out, rc = 1);
96
97  out:
98         spin_unlock(&recovd->recovd_lock);
99         RETURN(rc);
100 }
101
102 static int recovd_handle_event(struct recovd_obd *recovd)
103 {
104         struct recovd_data *rd;
105         int rc;
106         ENTRY;
107
108         if (recovd->recovd_flags & RECOVD_FAILED) {
109
110                 LASSERT(recovd->recovd_phase != RECOVD_IDLE && 
111                         recovd->recovd_current_rd);
112
113                 rd = recovd->recovd_current_rd;
114         cb_failed:
115                 CERROR("recovery FAILED for rd %p (conn %p), recovering\n",
116                        rd, class_rd2conn(rd));
117
118                 list_add(&rd->rd_managed_chain, &recovd->recovd_managed_items);
119                 spin_unlock(&recovd->recovd_lock);
120                 rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_FAILURE);
121                 spin_lock(&recovd->recovd_lock);
122                 recovd->recovd_phase = RECOVD_IDLE;
123                 recovd->recovd_next_phase = RECOVD_PREPARING;
124                 
125                 recovd->recovd_flags &= ~RECOVD_FAILED;
126
127                 RETURN(1);
128         }
129
130         switch (recovd->recovd_phase) {
131             case RECOVD_IDLE:
132                 if (recovd->recovd_current_rd ||
133                     list_empty(&recovd->recovd_troubled_items))
134                         break;
135                 rd = list_entry(recovd->recovd_troubled_items.next,
136                                 struct recovd_data, rd_managed_chain);
137                 
138                 list_del(&rd->rd_managed_chain);
139                 if (!rd->rd_recover)
140                         LBUG();
141
142                 CERROR("starting recovery for rd %p (conn %p)\n",
143                        rd, class_rd2conn(rd));
144                 recovd->recovd_current_rd = rd;
145                 recovd->recovd_flags &= ~RECOVD_FAILED;
146                 recovd->recovd_phase = RECOVD_PREPARING;
147
148                 spin_unlock(&recovd->recovd_lock);
149                 rc = rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_PREPARE);
150                 spin_lock(&recovd->recovd_lock);
151                 if (rc)
152                         goto cb_failed;
153                 
154                 recovd->recovd_next_phase = RECOVD_PREPARED;
155                 break;
156
157             case RECOVD_PREPARED:
158                 rd = recovd->recovd_current_rd;
159                 recovd->recovd_phase = RECOVD_RECOVERING;
160
161                 CERROR("recovery prepared for rd %p (conn %p), recovering\n",
162                        rd, class_rd2conn(rd));
163
164                 spin_unlock(&recovd->recovd_lock);
165                 rc = rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_RECOVER);
166                 spin_lock(&recovd->recovd_lock);
167                 if (rc)
168                         goto cb_failed;
169                 
170                 recovd->recovd_next_phase = RECOVD_RECOVERED;
171                 break;
172
173             case RECOVD_RECOVERED:
174                 rd = recovd->recovd_current_rd;
175                 recovd->recovd_phase = RECOVD_IDLE;
176                 recovd->recovd_next_phase = RECOVD_PREPARING;
177
178                 CERROR("recovery complete for rd %p (conn %p), recovering\n",
179                        rd, class_rd2conn(rd));
180                 break;
181
182             default:
183                 break;
184         }
185
186         RETURN(0);
187 }
188
189 static int recovd_main(void *arg)
190 {
191         struct recovd_obd *recovd = (struct recovd_obd *)arg;
192
193         ENTRY;
194
195         lock_kernel();
196         daemonize();
197         spin_lock_irq(&current->sigmask_lock);
198         sigfillset(&current->blocked);
199         recalc_sigpending(current);
200         spin_unlock_irq(&current->sigmask_lock);
201
202         sprintf(current->comm, "lustre_recovd");
203         unlock_kernel();
204
205         /* Record that the  thread is running */
206         recovd->recovd_thread = current;
207         recovd->recovd_flags = RECOVD_IDLE;
208         wake_up(&recovd->recovd_ctl_waitq);
209
210         /* And now, loop forever on requests */
211         while (1) {
212                 wait_event(recovd->recovd_waitq, recovd_check_event(recovd));
213
214                 spin_lock(&recovd->recovd_lock);
215
216                 if (recovd->recovd_flags & RECOVD_STOPPING) {
217                         spin_unlock(&recovd->recovd_lock);
218                         CERROR("lustre_recovd stopping\n");
219                         EXIT;
220                         break;
221                 }
222
223                 recovd_handle_event(recovd);
224                 spin_unlock(&recovd->recovd_lock);
225         }
226
227         recovd->recovd_thread = NULL;
228         recovd->recovd_flags = RECOVD_STOPPED;
229         wake_up(&recovd->recovd_ctl_waitq);
230         CDEBUG(D_NET, "mgr exiting process %d\n", current->pid);
231         RETURN(0);
232 }
233
234 int recovd_setup(struct recovd_obd *recovd)
235 {
236         int rc;
237         extern void (*class_signal_connection_failure)
238                 (struct ptlrpc_connection *);
239
240         ENTRY;
241
242         INIT_LIST_HEAD(&recovd->recovd_managed_items);
243         INIT_LIST_HEAD(&recovd->recovd_troubled_items);
244         spin_lock_init(&recovd->recovd_lock);
245
246         init_waitqueue_head(&recovd->recovd_waitq);
247         init_waitqueue_head(&recovd->recovd_recovery_waitq);
248         init_waitqueue_head(&recovd->recovd_ctl_waitq);
249
250         recovd->recovd_next_phase = RECOVD_PREPARING;
251         
252         rc = kernel_thread(recovd_main, (void *)recovd,
253                            CLONE_VM | CLONE_FS | CLONE_FILES);
254         if (rc < 0) {
255                 CERROR("cannot start thread\n");
256                 RETURN(-EINVAL);
257         }
258         wait_event(recovd->recovd_ctl_waitq,
259                    recovd->recovd_phase == RECOVD_IDLE);
260
261         /* exported and called by obdclass timeout handlers */
262         class_signal_connection_failure = recovd_conn_fail;
263         ptlrpc_recovd = recovd;
264
265         RETURN(0);
266 }
267
268 int recovd_cleanup(struct recovd_obd *recovd)
269 {
270         spin_lock(&recovd->recovd_lock);
271         recovd->recovd_flags = RECOVD_STOPPING;
272         wake_up(&recovd->recovd_waitq);
273         spin_unlock(&recovd->recovd_lock);
274
275         wait_event(recovd->recovd_ctl_waitq,
276                    (recovd->recovd_flags & RECOVD_STOPPED));
277         RETURN(0);
278 }
279
/* The recovery daemon published by recovd_setup(); NULL until then. */
struct recovd_obd *ptlrpc_recovd = NULL;