1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * Lustre High Availability Daemon
8 * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
10 * This code is issued under the GNU General Public License.
11 * See the file COPYING in this distribution
13 * by Peter Braam <braam@clusterfs.com>
17 #define DEBUG_SUBSYSTEM S_RPC
19 #include <linux/kmod.h>
20 #include <linux/lustre_lite.h>
21 #include <linux/lustre_ha.h>
/*
 * File-scope pointer to a recovd_obd instance.  It is not referenced by any
 * of the functions visible in this listing.
 * NOTE(review): presumably set by whoever owns the connection manager --
 * confirm against the users of this symbol.
 */
23 struct recovd_obd *ptlrpc_connmgr;
/*
 * recovd_cli_manage - attach a client to a recovd instance.
 * @recovd: recovd state the client is attached to
 * @cli:    client being registered
 *
 * Records @recovd in cli->cli_recovd and links cli->cli_ha_item onto
 * recovd->recovd_clients_lh while holding recovd->recovd_lock.
 *
 * NOTE(review): this listing elides some original lines (the embedded line
 * numbers skip), so the braces and anything else on those lines is not shown.
 */
25 void recovd_cli_manage(struct recovd_obd *recovd, struct ptlrpc_client *cli)
/* Back-pointer used later by recovd_cli_fail() and recovd_cli_fixed(). */
28 cli->cli_recovd = recovd;
29 spin_lock(&recovd->recovd_lock);
30 list_add(&cli->cli_ha_item, &recovd->recovd_clients_lh);
31 spin_unlock(&recovd->recovd_lock);
/*
 * recovd_cli_fail - report that a client has failed.
 * @cli: the failing client; cli->cli_recovd must already be set
 *       (recovd_cli_manage() is the visible place that sets it)
 *
 * Under cli->cli_recovd->recovd_lock: sets RECOVD_FAIL in recovd_flags, sets
 * recovd_wakeup_flag, and moves cli->cli_ha_item onto recovd_troubled_lh.
 * After dropping the lock it wakes sleepers on recovd_waitq, which is the
 * queue recovd_main() waits on.
 *
 * recovd_setup() installs this function as class_signal_client_failure.
 */
35 void recovd_cli_fail(struct ptlrpc_client *cli)
38 spin_lock(&cli->cli_recovd->recovd_lock);
39 cli->cli_recovd->recovd_flags |= RECOVD_FAIL;
/* recovd_check_event() tests this flag to decide the thread has work. */
40 cli->cli_recovd->recovd_wakeup_flag = 1;
/* Unlink from whichever list the client is on, then queue it as troubled. */
41 list_del(&cli->cli_ha_item);
42 list_add(&cli->cli_ha_item, &cli->cli_recovd->recovd_troubled_lh);
43 spin_unlock(&cli->cli_recovd->recovd_lock);
44 wake_up(&cli->cli_recovd->recovd_waitq);
/*
 * recovd_cli_fixed - move a client back onto the list of managed clients.
 * @cli: client whose cli_ha_item is relinked onto
 *       cli->cli_recovd->recovd_clients_lh
 *
 * No lock is taken in the visible lines; see the locking requirement below.
 * NOTE(review): every other list update in this file is done under
 * recovd->recovd_lock, whereas the requirement below names cli->cli_lock --
 * confirm which lock actually protects these lists.
 */
48 /* this function must be called with cli->cli_lock held */
49 void recovd_cli_fixed(struct ptlrpc_client *cli)
52 list_del(&cli->cli_ha_item);
53 list_add(&cli->cli_ha_item, &cli->cli_recovd->recovd_clients_lh);
/*
 * recovd_upcall - launch the user-space recovery helper.
 *
 * Passes the helper path in argv[0], plus the argv/envp arrays, to
 * call_usermodehelper() and returns that call's result unchanged.
 *
 * NOTE(review): the declarations of argv/envp and the remaining array slots
 * are on lines elided from this listing.  The helper path is hard-coded to a
 * source-tree location.
 */
58 static int recovd_upcall(void)
63 argv[0] = "/usr/src/obd/utils/ha_assist.sh";
67 envp [1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
70 return call_usermodehelper(argv[0], argv, envp);
/*
 * recovd_check_event - wake-up condition for the recovd thread.
 * @recovd: recovd state to inspect
 *
 * Used as the condition expression of the wait_event() in recovd_main().
 * Holding recovd->recovd_lock it looks for three reasons to run: the wakeup
 * flag being set, the timeout having elapsed (which sets RECOVD_TIMEOUT),
 * and RECOVD_STOPPING being set.  It clears recovd_wakeup_flag before
 * unlocking.
 *
 * NOTE(review): the statements that compute and return the result are on
 * lines elided from this listing.
 */
73 static int recovd_check_event(struct recovd_obd *recovd)
78 spin_lock(&recovd->recovd_lock);
80 recovd->recovd_waketime = CURRENT_TIME;
/*
 * NOTE(review): schedule_timeout() appears between spin_lock() and
 * spin_unlock() in the visible lines, and it may sleep -- confirm that the
 * lock is not held across this call.
 */
81 if (recovd->recovd_timeout)
82 schedule_timeout(recovd->recovd_timeout);
84 if (recovd->recovd_wakeup_flag) {
85 CERROR("service woken\n");
/*
 * NOTE(review): recovd_timeout is also handed to schedule_timeout() and is
 * set to 10 * HZ in recovd_handle_event(), so it looks like jiffies, while
 * CURRENT_TIME is presumably in seconds -- confirm the units agree.
 */
89 if (recovd->recovd_timeout &&
90 CURRENT_TIME > recovd->recovd_waketime + recovd->recovd_timeout) {
91 recovd->recovd_flags |= RECOVD_TIMEOUT;
96 if (recovd->recovd_flags & RECOVD_STOPPING) {
97 CERROR("recovd stopping\n");
/* Consume the wakeup request so the next wait starts clean. */
102 recovd->recovd_wakeup_flag = 0;
103 spin_unlock(&recovd->recovd_lock);
/*
 * recovd_handle_event - act on the current recovd_flags.
 * @recovd: recovd state; recovd_main() calls this with recovd->recovd_lock
 *          held
 *
 * Visible behaviour, in order:
 *  - RECOVD_FAIL set and RECOVD_UPCALL_WAIT clear: set RECOVD_UPCALL_WAIT,
 *    stamp recovd_waketime, arm a 10 * HZ timeout and call
 *    schedule_timeout().
 *  - RECOVD_TIMEOUT set: log it and clear the flag.
 *  - RECOVD_UPCALL_ANSWER set: drain recovd_troubled_lh, invoking each
 *    client's cli_recover callback (when non-NULL) with the lock dropped.
 *  - Reset recovd_timeout to 0 and recovd_flags to RECOVD_IDLE.
 *
 * NOTE(review): braces, the return statement and possibly other statements
 * are on lines elided from this listing, so the exact nesting of the steps
 * above cannot be confirmed from here.
 */
107 static int recovd_handle_event(struct recovd_obd *recovd)
111 if (!(recovd->recovd_flags & RECOVD_UPCALL_WAIT) &&
112 recovd->recovd_flags & RECOVD_FAIL) {
114 CERROR("client in trouble: flags -> UPCALL_WAITING\n");
115 recovd->recovd_flags |= RECOVD_UPCALL_WAIT;
118 recovd->recovd_waketime = CURRENT_TIME;
119 recovd->recovd_timeout = 10 * HZ;
/*
 * NOTE(review): the caller holds recovd_lock here and schedule_timeout() may
 * sleep -- confirm this is safe.
 */
120 schedule_timeout(recovd->recovd_timeout);
123 if (recovd->recovd_flags & RECOVD_TIMEOUT) {
124 CERROR("timeout - no news from upcall?\n");
125 recovd->recovd_flags &= ~RECOVD_TIMEOUT;
128 if (recovd->recovd_flags & RECOVD_UPCALL_ANSWER) {
129 CERROR("UPCALL_WAITING: upcall answer\n");
/* Pop troubled clients one at a time until the list is empty. */
131 while (!list_empty(&recovd->recovd_troubled_lh)) {
132 struct ptlrpc_client *cli =
133 list_entry(recovd->recovd_troubled_lh.next,
134 struct ptlrpc_client, cli_ha_item);
136 list_del(&cli->cli_ha_item);
/* The recovery callback runs without recovd_lock held. */
137 if (cli->cli_recover) {
138 spin_unlock(&recovd->recovd_lock);
139 cli->cli_recover(cli);
140 spin_lock(&recovd->recovd_lock);
144 recovd->recovd_timeout = 0;
145 recovd->recovd_flags = RECOVD_IDLE;
/*
 * recovd_main - body of the "lustre_recovd" kernel thread.
 * @arg: the struct recovd_obd this thread serves (passed by recovd_setup()
 *       through kernel_thread())
 *
 * Start-up: blocks every signal for the thread, names it, records the task
 * in recovd->recovd_thread, sets recovd_flags to RECOVD_IDLE and wakes
 * recovd_ctl_waitq, which recovd_setup() is waiting on.
 *
 * Service: sleeps on recovd_waitq until recovd_check_event() reports work,
 * then, under recovd_lock, either notices RECOVD_STOPPING or calls
 * recovd_handle_event().
 *
 * Shutdown: clears recovd_thread, sets recovd_flags to RECOVD_STOPPED and
 * wakes recovd_ctl_waitq, which recovd_cleanup() is waiting on.
 *
 * NOTE(review): the loop header, the statement that leaves the loop after
 * RECOVD_STOPPING is seen, the braces and the return are on lines elided
 * from this listing.
 */
151 static int recovd_main(void *arg)
153 struct recovd_obd *recovd = (struct recovd_obd *)arg;
/* Block all signals; the signal mask is changed under sigmask_lock. */
159 spin_lock_irq(&current->sigmask_lock);
160 sigfillset(&current->blocked);
161 recalc_sigpending(current);
162 spin_unlock_irq(&current->sigmask_lock);
164 sprintf(current->comm, "lustre_recovd");
166 /* Record that the thread is running */
167 recovd->recovd_thread = current;
168 recovd->recovd_flags = RECOVD_IDLE;
169 wake_up(&recovd->recovd_ctl_waitq);
171 /* And now, loop forever on requests */
173 wait_event(recovd->recovd_waitq, recovd_check_event(recovd));
175 spin_lock(&recovd->recovd_lock);
176 if (recovd->recovd_flags & RECOVD_STOPPING) {
177 spin_unlock(&recovd->recovd_lock);
178 CERROR("lustre_recovd stopping\n");
/* recovd_handle_event() is entered with recovd_lock held. */
183 recovd_handle_event(recovd);
184 spin_unlock(&recovd->recovd_lock);
/* Tell recovd_cleanup() that the thread is gone. */
187 recovd->recovd_thread = NULL;
188 recovd->recovd_flags = RECOVD_STOPPED;
189 wake_up(&recovd->recovd_ctl_waitq);
190 CDEBUG(D_NET, "mgr exiting process %d\n", current->pid);
/*
 * recovd_setup - initialise a recovd instance and start its thread.
 * @recovd: recovd state to initialise
 *
 * Initialises both client lists, the spinlock and the three wait queues,
 * starts recovd_main() with kernel_thread(), waits on recovd_ctl_waitq for
 * the thread to report in, and finally installs recovd_cli_fail() as
 * class_signal_client_failure.
 *
 * NOTE(review): the declaration of rc, the test guarding the "cannot start
 * thread" message and the return statements are on lines elided from this
 * listing.
 */
194 int recovd_setup(struct recovd_obd *recovd)
197 extern void (*class_signal_client_failure)(struct ptlrpc_client *);
201 INIT_LIST_HEAD(&recovd->recovd_clients_lh);
202 INIT_LIST_HEAD(&recovd->recovd_troubled_lh);
203 spin_lock_init(&recovd->recovd_lock);
205 init_waitqueue_head(&recovd->recovd_waitq);
206 init_waitqueue_head(&recovd->recovd_recovery_waitq);
207 init_waitqueue_head(&recovd->recovd_ctl_waitq);
/* The thread shares the caller's address space, fs info and file table. */
209 rc = kernel_thread(recovd_main, (void *)recovd,
210 CLONE_VM | CLONE_FS | CLONE_FILES);
212 CERROR("cannot start thread\n");
/*
 * recovd_main() wakes recovd_ctl_waitq after assigning
 * recovd_flags = RECOVD_IDLE.
 * NOTE(review): this condition is a bit test; if RECOVD_IDLE is defined as
 * 0 it can never become true -- confirm the value in lustre_ha.h.  Also,
 * recovd_flags is not initialised in the visible lines before this wait.
 */
215 wait_event(recovd->recovd_ctl_waitq, recovd->recovd_flags & RECOVD_IDLE);
217 /* exported and called by obdclass timeout handlers */
218 class_signal_client_failure = recovd_cli_fail;
/*
 * recovd_cleanup - ask the recovd thread to stop and wait until it has.
 * @recovd: recovd state whose thread should exit
 *
 * Under recovd_lock, overwrites recovd_flags with RECOVD_STOPPING (a plain
 * assignment, so any other flag bits are discarded) and wakes recovd_waitq.
 * It then sleeps on recovd_ctl_waitq until RECOVD_STOPPED is set, which
 * recovd_main() does on its way out.
 *
 * NOTE(review): the end of this function, including its return value, is
 * not visible in this listing.
 */
223 int recovd_cleanup(struct recovd_obd *recovd)
225 spin_lock(&recovd->recovd_lock);
226 recovd->recovd_flags = RECOVD_STOPPING;
227 wake_up(&recovd->recovd_waitq);
228 spin_unlock(&recovd->recovd_lock);
230 wait_event(recovd->recovd_ctl_waitq,
231 (recovd->recovd_flags & RECOVD_STOPPED));