1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * Lustre High Availability Daemon
8 * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
10 * This code is issued under the GNU General Public License.
11 * See the file COPYING in this distribution
13 * by Peter Braam <braam@clusterfs.com>
17 #define DEBUG_SUBSYSTEM S_RPC
19 #include <linux/kmod.h>
20 #include <linux/lustre_lite.h>
21 #include <linux/lustre_ha.h>
22 #include <linux/obd_support.h>
/*
 * recovd_conn_manage - place a connection under a recovery daemon's care.
 * @recovd: recovery daemon state that will track the connection
 * @conn:   connection to start managing
 *
 * Stores recovd in conn->c_recovd, then links the connection's
 * c_recovd_data.rd_managed_chain onto recovd->recovd_managed_items while
 * holding recovd->recovd_lock.
 *
 * NOTE(review): the function's braces are not among the lines shown here;
 * only the statements below are documented.
 */
24 void recovd_conn_manage(struct recovd_obd *recovd,
25 struct ptlrpc_connection *conn)
/* The back-pointer is written before the lock is taken. */
28 conn->c_recovd = recovd;
29 spin_lock(&recovd->recovd_lock);
30 list_add(&conn->c_recovd_data.rd_managed_chain,
31 &recovd->recovd_managed_items);
32 spin_unlock(&recovd->recovd_lock);
/*
 * recovd_conn_fail - report that a managed connection has failed.
 * @conn: the failing connection; its c_recovd pointer is dereferenced
 *
 * Under conn->c_recovd->recovd_lock this sets RECOVD_FAIL in recovd_flags,
 * sets recovd_wakeup_flag, and moves the connection's rd_managed_chain from
 * whatever list it is on to recovd_troubled_items.  After dropping the lock
 * it wakes sleepers on recovd_waitq (recovd_main waits there).
 *
 * recovd_setup installs this function as class_signal_connection_failure.
 *
 * NOTE(review): there is no visible check that conn->c_recovd is non-NULL;
 * presumably recovd_conn_manage() must have been called on conn first --
 * confirm against callers.
 */
36 void recovd_conn_fail(struct ptlrpc_connection *conn)
39 spin_lock(&conn->c_recovd->recovd_lock);
40 conn->c_recovd->recovd_flags |= RECOVD_FAIL;
41 conn->c_recovd->recovd_wakeup_flag = 1;
/* Unlink from the current list, then relink onto the troubled list. */
42 list_del(&conn->c_recovd_data.rd_managed_chain);
43 list_add(&conn->c_recovd_data.rd_managed_chain,
44 &conn->c_recovd->recovd_troubled_items);
45 spin_unlock(&conn->c_recovd->recovd_lock);
/* The wake-up is issued only after the lock has been released. */
46 wake_up(&conn->c_recovd->recovd_waitq);
/*
 * recovd_conn_fixed - return a connection to the managed list.
 * @conn: the connection to relink
 *
 * Unlinks the connection's rd_managed_chain from its current list and adds
 * it to conn->c_recovd->recovd_managed_items.
 *
 * NOTE(review): unlike recovd_conn_manage() and recovd_conn_fail(), the
 * visible lines do not take recovd_lock around the list operations.
 * Whether conn->c_lock (required below) is enough to protect the recovd
 * lists cannot be told from this file -- confirm.
 */
50 /* this function must be called with conn->c_lock held */
51 void recovd_conn_fixed(struct ptlrpc_connection *conn)
54 list_del(&conn->c_recovd_data.rd_managed_chain);
55 list_add(&conn->c_recovd_data.rd_managed_chain,
56 &conn->c_recovd->recovd_managed_items);
/*
 * recovd_upcall - launch the user-space recovery helper.
 *
 * Uses obd_recovery_upcall as the program path (argv[0]) and passes argv and
 * envp to call_usermodehelper().
 *
 * Returns whatever call_usermodehelper() returns.
 *
 * NOTE(review): the declarations of argv/envp and their remaining entries
 * (including any NULL terminators) are not among the lines shown here.
 */
61 static int recovd_upcall(void)
66 argv[0] = obd_recovery_upcall;
/* Search path handed to the helper's environment. */
70 envp [1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
73 return call_usermodehelper(argv[0], argv, envp);
/*
 * recovd_check_event - wake-up condition for the recovery thread.
 * @recovd: recovery daemon state to inspect
 *
 * recovd_main() passes this function as the condition of its wait_event()
 * on recovd_waitq.  With recovd_lock held it:
 *   - stamps recovd_waketime with CURRENT_TIME;
 *   - calls schedule_timeout() when recovd_timeout is non-zero;
 *   - logs when recovd_wakeup_flag is set;
 *   - sets RECOVD_TIMEOUT when a timeout is configured and CURRENT_TIME has
 *     passed recovd_waketime + recovd_timeout;
 *   - logs when RECOVD_STOPPING is set;
 *   - clears recovd_wakeup_flag before unlocking.
 *
 * NOTE(review): the return statements and any jumps out of the branches are
 * not among the lines shown here, so the value returned in each case cannot
 * be stated.
 * NOTE(review): in the visible lines schedule_timeout() is reached after
 * spin_lock() with no unlock in between, and this function itself runs as a
 * wait_event() condition -- confirm that sleeping here is intended.
 * NOTE(review): recovd_timeout is assigned 10 * HZ in recovd_handle_event()
 * and passed to schedule_timeout(), yet below it is added to a CURRENT_TIME
 * value; presumably that mixes jiffies with seconds -- confirm the units.
 */
76 static int recovd_check_event(struct recovd_obd *recovd)
81 spin_lock(&recovd->recovd_lock);
/* waketime is refreshed on every evaluation of the condition. */
83 recovd->recovd_waketime = CURRENT_TIME;
84 if (recovd->recovd_timeout)
85 schedule_timeout(recovd->recovd_timeout);
/* Set by recovd_conn_fail() to request attention. */
87 if (recovd->recovd_wakeup_flag) {
88 CERROR("service woken\n");
92 if (recovd->recovd_timeout &&
93 CURRENT_TIME > recovd->recovd_waketime + recovd->recovd_timeout) {
94 recovd->recovd_flags |= RECOVD_TIMEOUT;
/* Set by recovd_cleanup() to ask the thread to exit. */
99 if (recovd->recovd_flags & RECOVD_STOPPING) {
100 CERROR("recovd stopping\n");
105 recovd->recovd_wakeup_flag = 0;
106 spin_unlock(&recovd->recovd_lock);
/*
 * recovd_handle_event - act on the current recovery flags.
 * @recovd: recovery daemon state
 *
 * recovd_main() calls this between spin_lock() and spin_unlock() on
 * recovd_lock, so it runs with that lock held.  Visible behaviour:
 *   - RECOVD_FAIL set and RECOVD_UPCALL_WAIT clear: log, set
 *     RECOVD_UPCALL_WAIT, stamp recovd_waketime, set recovd_timeout to
 *     10 * HZ and call schedule_timeout() with it;
 *   - RECOVD_TIMEOUT set: log and clear that flag;
 *   - RECOVD_UPCALL_ANSWER set: log, then remove every entry from
 *     recovd_troubled_items; for an entry with a non-NULL rd_recover the
 *     lock is dropped and re-taken around lines that are not shown here.
 * recovd_timeout is reset to 0 and recovd_flags to RECOVD_IDLE at the end
 * of the visible statements.
 *
 * NOTE(review): braces, return statements and several intermediate lines
 * are missing from this view, so the exact nesting of the final reset and
 * what happens between the unlock/lock pair cannot be stated.
 * NOTE(review): schedule_timeout() is reached while the caller holds
 * recovd_lock in the visible lines -- confirm that sleeping is safe here.
 */
110 static int recovd_handle_event(struct recovd_obd *recovd)
/* First notice of a failure: no upcall is outstanding yet. */
114 if (!(recovd->recovd_flags & RECOVD_UPCALL_WAIT) &&
115 recovd->recovd_flags & RECOVD_FAIL) {
117 CERROR("client in trouble: flags -> UPCALL_WAITING\n");
118 recovd->recovd_flags |= RECOVD_UPCALL_WAIT;
121 recovd->recovd_waketime = CURRENT_TIME;
122 recovd->recovd_timeout = 10 * HZ;
123 schedule_timeout(recovd->recovd_timeout);
126 if (recovd->recovd_flags & RECOVD_TIMEOUT) {
127 CERROR("timeout - no news from upcall?\n");
128 recovd->recovd_flags &= ~RECOVD_TIMEOUT;
131 if (recovd->recovd_flags & RECOVD_UPCALL_ANSWER) {
132 CERROR("UPCALL_WAITING: upcall answer\n");
/* Always take the first entry; list_del() below makes the loop advance. */
134 while (!list_empty(&recovd->recovd_troubled_items)) {
135 struct recovd_data *rd =
136 list_entry(recovd->recovd_troubled_items.next,
137 struct recovd_data, rd_managed_chain);
139 list_del(&rd->rd_managed_chain);
140 if (rd->rd_recover) {
/* The lock is released around the (not shown) work for this entry. */
141 spin_unlock(&recovd->recovd_lock);
143 spin_lock(&recovd->recovd_lock);
147 recovd->recovd_timeout = 0;
148 recovd->recovd_flags = RECOVD_IDLE;
/*
 * recovd_main - body of the recovery daemon kernel thread.
 * @arg: the struct recovd_obd passed to kernel_thread() by recovd_setup()
 *
 * Start-up: blocks all signals for the thread, names it "lustre_recovd",
 * records the task in recovd_thread, sets recovd_flags to RECOVD_IDLE and
 * wakes recovd_ctl_waitq (recovd_setup() waits there).
 *
 * Service loop: sleeps on recovd_waitq until recovd_check_event() is true,
 * then takes recovd_lock.  If RECOVD_STOPPING is set it unlocks and logs;
 * otherwise it calls recovd_handle_event() and unlocks.
 *
 * Shutdown: clears recovd_thread, sets recovd_flags to RECOVD_STOPPED and
 * wakes recovd_ctl_waitq (recovd_cleanup() waits there).
 *
 * NOTE(review): the loop construct, the statement that leaves the loop and
 * the return are not among the lines shown here.
 * NOTE(review): "¤t" in the four signal-mask lines looks like a
 * mis-encoded "&current" (compare "current->comm" further down) -- restore
 * before compiling.
 */
154 static int recovd_main(void *arg)
156 struct recovd_obd *recovd = (struct recovd_obd *)arg;
/* Block every signal; the mask is changed under sigmask_lock. */
162 spin_lock_irq(¤t->sigmask_lock);
163 sigfillset(¤t->blocked);
164 recalc_sigpending(current);
165 spin_unlock_irq(¤t->sigmask_lock);
167 sprintf(current->comm, "lustre_recovd");
170 /* Record that the thread is running */
171 recovd->recovd_thread = current;
172 recovd->recovd_flags = RECOVD_IDLE;
173 wake_up(&recovd->recovd_ctl_waitq);
175 /* And now, loop forever on requests */
177 wait_event(recovd->recovd_waitq, recovd_check_event(recovd));
179 spin_lock(&recovd->recovd_lock);
180 if (recovd->recovd_flags & RECOVD_STOPPING) {
181 spin_unlock(&recovd->recovd_lock);
182 CERROR("lustre_recovd stopping\n");
/* Runs with recovd_lock held. */
187 recovd_handle_event(recovd);
188 spin_unlock(&recovd->recovd_lock);
/* Tell recovd_cleanup() that the thread is done. */
191 recovd->recovd_thread = NULL;
192 recovd->recovd_flags = RECOVD_STOPPED;
193 wake_up(&recovd->recovd_ctl_waitq);
194 CDEBUG(D_NET, "mgr exiting process %d\n", current->pid);
/*
 * recovd_setup - initialise recovery state and start the daemon thread.
 * @recovd: recovery daemon state to initialise
 *
 * Initialises the managed and troubled lists, recovd_lock and the three
 * wait queues, starts recovd_main() as a kernel thread sharing VM, FS and
 * file tables with the caller, waits on recovd_ctl_waitq until
 * recovd_flags has RECOVD_IDLE set, and finally installs recovd_conn_fail()
 * as the class_signal_connection_failure hook.
 *
 * NOTE(review): the declaration of rc, the test guarding the "cannot start
 * thread" message and the return statements are not among the lines shown
 * here.
 * NOTE(review): the wait condition is "recovd_flags & RECOVD_IDLE"; this
 * only becomes true if RECOVD_IDLE is a non-zero bit.  Its value is defined
 * elsewhere -- confirm.
 * NOTE(review): recovd_flags is not visibly initialised before the thread
 * is started -- confirm the caller zeroes the structure.
 */
198 int recovd_setup(struct recovd_obd *recovd)
201 extern void (*class_signal_connection_failure)
202 (struct ptlrpc_connection *);
206 INIT_LIST_HEAD(&recovd->recovd_managed_items);
207 INIT_LIST_HEAD(&recovd->recovd_troubled_items);
208 spin_lock_init(&recovd->recovd_lock);
210 init_waitqueue_head(&recovd->recovd_waitq);
211 init_waitqueue_head(&recovd->recovd_recovery_waitq);
212 init_waitqueue_head(&recovd->recovd_ctl_waitq);
214 rc = kernel_thread(recovd_main, (void *)recovd,
215 CLONE_VM | CLONE_FS | CLONE_FILES);
217 CERROR("cannot start thread\n");
/* recovd_main() sets RECOVD_IDLE and wakes this queue once it is running. */
220 wait_event(recovd->recovd_ctl_waitq, recovd->recovd_flags & RECOVD_IDLE);
222 /* exported and called by obdclass timeout handlers */
223 class_signal_connection_failure = recovd_conn_fail;
/*
 * recovd_cleanup - ask the recovery thread to stop and wait for it.
 * @recovd: recovery daemon state
 *
 * Under recovd_lock, replaces recovd_flags with RECOVD_STOPPING and wakes
 * recovd_waitq; then sleeps on recovd_ctl_waitq until recovd_flags has
 * RECOVD_STOPPED set (recovd_main() sets it on exit).
 *
 * NOTE(review): the assignment discards every other flag that was set.
 * NOTE(review): recovd_wakeup_flag is not set here; whether the thread's
 * wait condition, recovd_check_event(), returns true for RECOVD_STOPPING
 * alone is not visible in this view -- confirm.
 * NOTE(review): the return statement and anything after the final wait are
 * not among the lines shown here.
 */
228 int recovd_cleanup(struct recovd_obd *recovd)
230 spin_lock(&recovd->recovd_lock);
231 recovd->recovd_flags = RECOVD_STOPPING;
232 wake_up(&recovd->recovd_waitq);
233 spin_unlock(&recovd->recovd_lock);
235 wait_event(recovd->recovd_ctl_waitq,
236 (recovd->recovd_flags & RECOVD_STOPPED));