1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * Lustre High Availability Daemon
8 * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
10 * This code is issued under the GNU General Public License.
11 * See the file COPYING in this distribution
13 * by Peter Braam <braam@clusterfs.com>
17 #define DEBUG_SUBSYSTEM S_RPC
19 #include <linux/lustre_lite.h>
20 #include <linux/lustre_ha.h>
21 #include <linux/obd_support.h>
/*
 * recovd_conn_manage - put a connection under a recovery daemon's management.
 *
 * Records @recovd and the @recover callback in the connection's embedded
 * recovd_data (conn->c_recovd_data), then links that record onto
 * recovd->recovd_managed_items while holding recovd->recovd_lock.
 *
 * NOTE(review): the embedded listing numbers jump (25, 27-28, 31 and 35-38
 * are absent), so braces and possibly other short statements are not
 * visible in this view.
 */
23 void recovd_conn_manage(struct ptlrpc_connection *conn,
24 struct recovd_obd *recovd, ptlrpc_recovery_cb_t recover)
26 struct recovd_data *rd = &conn->c_recovd_data;
29 rd->rd_recovd = recovd;
30 rd->rd_recover = recover;
/* The list insertion is done under recovd_lock. */
32 spin_lock(&recovd->recovd_lock);
33 list_add(&rd->rd_managed_chain, &recovd->recovd_managed_items);
34 spin_unlock(&recovd->recovd_lock);
/*
 * recovd_conn_fail - report a connection as failed to its recovery daemon.
 *
 * Looks up the daemon recorded in conn->c_recovd_data.rd_recovd, unlinks the
 * record from whichever list it is currently on and appends it to the tail of
 * recovd->recovd_troubled_items (both under recovd_lock), then wakes
 * recovd->recovd_waitq, which is the queue recovd_main() sleeps on.
 *
 * recovd_setup() installs this function as class_signal_connection_failure.
 *
 * NOTE(review): listing lines 43-45 and 47-49 are not visible.  The
 * "no recovd for connection" message is presumably guarded by a NULL check
 * on recovd followed by an early return -- confirm against the full file.
 */
39 void recovd_conn_fail(struct ptlrpc_connection *conn)
41 struct recovd_data *rd = &conn->c_recovd_data;
42 struct recovd_obd *recovd = rd->rd_recovd;
46 CERROR("no recovd for connection %p\n", conn);
/* Move the record from its current list to the troubled list. */
50 spin_lock(&recovd->recovd_lock);
51 list_del(&rd->rd_managed_chain);
52 list_add_tail(&rd->rd_managed_chain, &recovd->recovd_troubled_items);
53 spin_unlock(&recovd->recovd_lock);
/* Kick the recovery thread so it re-evaluates recovd_check_event(). */
55 wake_up(&recovd->recovd_waitq);
/*
 * recovd_conn_fixed - return a connection's record to the managed list.
 *
 * Unlinks conn->c_recovd_data from its current list and re-inserts it at the
 * head of rd->rd_recovd->recovd_managed_items.
 *
 * NOTE(review): no recovd_lock acquisition is visible in this view, whereas
 * recovd_conn_manage() and recovd_conn_fail() modify the same lists under
 * recovd->recovd_lock.  Listing lines 62, 64-65 and 68+ are not visible;
 * confirm whether the caller is also expected to hold recovd_lock.
 */
60 /* this function must be called with conn->c_lock held */
61 void recovd_conn_fixed(struct ptlrpc_connection *conn)
63 struct recovd_data *rd = &conn->c_recovd_data;
66 list_del(&rd->rd_managed_chain);
67 list_add(&rd->rd_managed_chain, &rd->rd_recovd->recovd_managed_items);
/*
 * recovd_check_event - wake-up predicate for the recovery thread.
 *
 * Used as the wait_event() condition in recovd_main().  With recovd_lock
 * held it tests, in order:
 *   - the phase is RECOVD_IDLE and the troubled list is non-empty;
 *   - RECOVD_STOPPING is set in recovd_flags;
 *   - RECOVD_FAILED is set (asserting that the phase is not idle and that
 *     recovd_current_rd is non-NULL);
 *   - recovd_phase equals recovd_next_phase.
 *
 * NOTE(review): the statement executed when each test matches, and the
 * return statement, are not visible in this listing (numbers 82-84, 86-87,
 * 91-93, 95-97 and 99-101 are absent).  Presumably each match sets a
 * non-zero result that is returned -- confirm against the full file.
 */
73 static int recovd_check_event(struct recovd_obd *recovd)
78 spin_lock(&recovd->recovd_lock);
/* New work: nothing in progress and at least one troubled connection. */
80 if (recovd->recovd_phase == RECOVD_IDLE &&
81 !list_empty(&recovd->recovd_troubled_items)) {
85 if (recovd->recovd_flags & RECOVD_STOPPING)
88 if (recovd->recovd_flags & RECOVD_FAILED) {
89 LASSERT(recovd->recovd_phase != RECOVD_IDLE &&
90 recovd->recovd_current_rd);
94 if (recovd->recovd_phase == recovd->recovd_next_phase)
98 spin_unlock(&recovd->recovd_lock);
/*
 * recovd_handle_event - advance the recovery state machine by one step.
 *
 * Called from recovd_main() with recovd->recovd_lock held.  The lock is
 * dropped around every rd->rd_recover() callback and retaken immediately
 * afterwards.
 *
 * Visible behaviour:
 *   - If RECOVD_FAILED is set: the current record is put back on the managed
 *     list, the callback is invoked with PTLRPC_RECOVD_PHASE_FAILURE, the
 *     phase is reset to RECOVD_IDLE / next phase RECOVD_PREPARING and the
 *     FAILED flag is cleared.
 *   - Otherwise a switch on recovd_phase:
 *       first case:       take the head of the troubled list, make it the
 *                         current record, enter RECOVD_PREPARING and call
 *                         the callback with PTLRPC_RECOVD_PHASE_PREPARE;
 *       RECOVD_PREPARED:  enter RECOVD_RECOVERING and call the callback
 *                         with PTLRPC_RECOVD_PHASE_RECOVER;
 *       RECOVD_RECOVERED: return to RECOVD_IDLE / next RECOVD_PREPARING.
 *
 * NOTE(review): many listing lines are absent (103, 105-107, 126-129, 131,
 * 139-141, 151-153, 167-169, 180-188, ...), so braces, the first case
 * label, any checks on rc, and the return statement are not visible.
 *
 * NOTE(review): no visible statement resets recovd_current_rd, yet the
 * first case bails out while it is non-NULL.  Confirm that it is cleared in
 * an elided line or elsewhere; otherwise only one recovery could ever start.
 *
 * NOTE(review): the messages at listing lines 115 and 178 end in
 * ", recovering" although both paths return to RECOVD_IDLE; this looks
 * copied from the RECOVD_PREPARED message -- confirm the intended wording.
 */
102 static int recovd_handle_event(struct recovd_obd *recovd)
104 struct recovd_data *rd;
/* A failed recovery is unwound before the phase switch is considered. */
108 if (recovd->recovd_flags & RECOVD_FAILED) {
110 LASSERT(recovd->recovd_phase != RECOVD_IDLE &&
111 recovd->recovd_current_rd);
113 rd = recovd->recovd_current_rd;
115 CERROR("recovery FAILED for rd %p (conn %p), recovering\n",
116 rd, class_rd2conn(rd));
118 list_add(&rd->rd_managed_chain, &recovd->recovd_managed_items);
/* Drop recovd_lock across the callback; it is retaken right after. */
119 spin_unlock(&recovd->recovd_lock);
120 rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_FAILURE);
121 spin_lock(&recovd->recovd_lock);
122 recovd->recovd_phase = RECOVD_IDLE;
123 recovd->recovd_next_phase = RECOVD_PREPARING;
125 recovd->recovd_flags &= ~RECOVD_FAILED;
130 switch (recovd->recovd_phase) {
/* NOTE(review): the case label at listing line 131 is not visible; the code
 * below starts a new recovery, so this is presumably the idle case. */
132 if (recovd->recovd_current_rd ||
133 list_empty(&recovd->recovd_troubled_items))
/* Take the oldest troubled record (recovd_conn_fail() appends at the tail). */
135 rd = list_entry(recovd->recovd_troubled_items.next,
136 struct recovd_data, rd_managed_chain);
138 list_del(&rd->rd_managed_chain);
142 CERROR("starting recovery for rd %p (conn %p)\n",
143 rd, class_rd2conn(rd));
144 recovd->recovd_current_rd = rd;
145 recovd->recovd_flags &= ~RECOVD_FAILED;
146 recovd->recovd_phase = RECOVD_PREPARING;
148 spin_unlock(&recovd->recovd_lock);
149 rc = rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_PREPARE);
150 spin_lock(&recovd->recovd_lock);
/* NOTE(review): listing lines 151-153 are not visible; presumably rc is
 * checked before the next phase is advanced -- confirm. */
154 recovd->recovd_next_phase = RECOVD_PREPARED;
157 case RECOVD_PREPARED:
158 rd = recovd->recovd_current_rd;
159 recovd->recovd_phase = RECOVD_RECOVERING;
161 CERROR("recovery prepared for rd %p (conn %p), recovering\n",
162 rd, class_rd2conn(rd));
164 spin_unlock(&recovd->recovd_lock);
165 rc = rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_RECOVER);
166 spin_lock(&recovd->recovd_lock);
/* NOTE(review): listing lines 167-169 are not visible; presumably rc is
 * checked before the next phase is advanced -- confirm. */
170 recovd->recovd_next_phase = RECOVD_RECOVERED;
173 case RECOVD_RECOVERED:
174 rd = recovd->recovd_current_rd;
/* Recovery finished: go back to idle and re-arm for the next troubled item. */
175 recovd->recovd_phase = RECOVD_IDLE;
176 recovd->recovd_next_phase = RECOVD_PREPARING;
178 CERROR("recovery complete for rd %p (conn %p), recovering\n",
179 rd, class_rd2conn(rd));
/*
 * recovd_main - body of the "lustre_recovd" kernel thread.
 *
 * @arg is the struct recovd_obd passed by recovd_setup() via kernel_thread().
 *
 * The thread fills its blocked-signal set, names itself, records itself in
 * recovd->recovd_thread and wakes recovd_ctl_waitq so that recovd_setup()
 * can proceed.  It then sleeps on recovd_waitq until recovd_check_event()
 * reports work, and under recovd_lock either stops (RECOVD_STOPPING) or
 * runs recovd_handle_event().  On exit it clears recovd_thread, sets
 * recovd_flags to RECOVD_STOPPED and wakes recovd_ctl_waitq, which is what
 * recovd_cleanup() waits for.
 *
 * Fix in this revision: the "&current" expressions had been corrupted to
 * "¤t" (the "&curren" prefix decoded as an HTML entity); restored.
 *
 * NOTE(review): listing lines 190, 192-196, 201, 203-204, 211, 219-222,
 * 225-226 and 231+ are not visible (braces, the loop statement, the exit
 * from the loop and the return).
 *
 * NOTE(review): RECOVD_IDLE is stored into recovd_flags below, while
 * recovd_setup() waits on recovd_phase == RECOVD_IDLE -- confirm that this
 * is intended.
 */
189 static int recovd_main(void *arg)
191 struct recovd_obd *recovd = (struct recovd_obd *)arg;
/* Add every signal to this thread's blocked set. */
197 spin_lock_irq(&current->sigmask_lock);
198 sigfillset(&current->blocked);
199 recalc_sigpending(current);
200 spin_unlock_irq(&current->sigmask_lock);
202 sprintf(current->comm, "lustre_recovd");
205 /* Record that the thread is running */
206 recovd->recovd_thread = current;
207 recovd->recovd_flags = RECOVD_IDLE;
208 wake_up(&recovd->recovd_ctl_waitq);
210 /* And now, loop forever on requests */
212 wait_event(recovd->recovd_waitq, recovd_check_event(recovd));
214 spin_lock(&recovd->recovd_lock);
/* Stop request from recovd_cleanup(): release the lock and leave. */
216 if (recovd->recovd_flags & RECOVD_STOPPING) {
217 spin_unlock(&recovd->recovd_lock);
218 CERROR("lustre_recovd stopping\n");
/* recovd_handle_event() runs with recovd_lock held. */
223 recovd_handle_event(recovd);
224 spin_unlock(&recovd->recovd_lock);
/* Tell recovd_cleanup() that the thread is gone. */
227 recovd->recovd_thread = NULL;
228 recovd->recovd_flags = RECOVD_STOPPED;
229 wake_up(&recovd->recovd_ctl_waitq);
230 CDEBUG(D_NET, "mgr exiting process %d\n", current->pid);
/*
 * recovd_setup - initialise a recovery daemon and start its thread.
 *
 * Initialises the managed and troubled lists, recovd_lock and the three wait
 * queues, sets recovd_next_phase to RECOVD_PREPARING, and spawns
 * recovd_main() with kernel_thread() using CLONE_VM | CLONE_FS | CLONE_FILES.
 * It then waits on recovd_ctl_waitq until recovd_phase == RECOVD_IDLE,
 * installs recovd_conn_fail() as class_signal_connection_failure and
 * publishes @recovd in the global ptlrpc_recovd.
 *
 * NOTE(review): listing lines 235-236, 239-241, 254, 256-257 and 264+ are
 * not visible.  The "cannot start thread" message is presumably guarded by
 * a check of rc with an early return, and the function's return value is
 * not visible either -- confirm against the full file.
 *
 * NOTE(review): recovd_phase is not assigned in the visible lines here, and
 * recovd_main() writes RECOVD_IDLE to recovd_flags rather than recovd_phase
 * before waking recovd_ctl_waitq -- confirm that the wait below synchronises
 * with thread start-up as intended.
 */
234 int recovd_setup(struct recovd_obd *recovd)
237 extern void (*class_signal_connection_failure)
238 (struct ptlrpc_connection *);
242 INIT_LIST_HEAD(&recovd->recovd_managed_items);
243 INIT_LIST_HEAD(&recovd->recovd_troubled_items);
244 spin_lock_init(&recovd->recovd_lock);
246 init_waitqueue_head(&recovd->recovd_waitq);
247 init_waitqueue_head(&recovd->recovd_recovery_waitq);
248 init_waitqueue_head(&recovd->recovd_ctl_waitq);
250 recovd->recovd_next_phase = RECOVD_PREPARING;
252 rc = kernel_thread(recovd_main, (void *)recovd,
253 CLONE_VM | CLONE_FS | CLONE_FILES);
255 CERROR("cannot start thread\n");
258 wait_event(recovd->recovd_ctl_waitq,
259 recovd->recovd_phase == RECOVD_IDLE);
261 /* exported and called by obdclass timeout handlers */
262 class_signal_connection_failure = recovd_conn_fail;
263 ptlrpc_recovd = recovd;
/*
 * recovd_cleanup - ask the recovery thread to stop and wait until it has.
 *
 * Under recovd_lock, overwrites recovd_flags with RECOVD_STOPPING (a plain
 * assignment, so any other flag bits are discarded) and wakes recovd_waitq.
 * It then sleeps on recovd_ctl_waitq until RECOVD_STOPPED is set, which
 * recovd_main() does just before it exits.
 *
 * NOTE(review): listing lines 269, 274 and 277-279 are not visible, so the
 * return value cannot be documented from this view.
 */
268 int recovd_cleanup(struct recovd_obd *recovd)
270 spin_lock(&recovd->recovd_lock);
271 recovd->recovd_flags = RECOVD_STOPPING;
272 wake_up(&recovd->recovd_waitq);
273 spin_unlock(&recovd->recovd_lock);
275 wait_event(recovd->recovd_ctl_waitq,
276 (recovd->recovd_flags & RECOVD_STOPPED));
/* Recovery daemon instance published by recovd_setup(), which assigns its
 * argument to this pointer once the thread has been started. */
280 struct recovd_obd *ptlrpc_recovd;