Whamcloud - gitweb
* l_wait_event can now do interrupts without a timeout, if we're feeling brave.
[fs/lustre-release.git] / lustre / ptlrpc / recovd.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  obd/rpc/recovd.c
5  *
6  *  Lustre High Availability Daemon
7  *
8  *  Copyright (C) 2001, 2002 Cluster File Systems, Inc.
9  *
10  *  This code is issued under the GNU General Public License.
11  *  See the file COPYING in this distribution
12  *
13  *  by Peter Braam <braam@clusterfs.com>
14  *
15  */
16
17 #define DEBUG_SUBSYSTEM S_RPC
18
19 #include <linux/kmod.h>
20 #include <linux/lustre_lite.h>
21 #include <linux/lustre_ha.h>
22 #include <linux/obd_support.h>
23
24 struct recovd_obd *ptlrpc_connmgr; /* global recovd_obd pointer; no function in this file reads or writes it. NOTE(review): presumably assigned by the code that calls recovd_setup() - confirm */
25
26 void recovd_cli_manage(struct recovd_obd *recovd, struct ptlrpc_client *cli)
27 {
28         ENTRY;
29         cli->cli_recovd = recovd;
30         spin_lock(&recovd->recovd_lock);
31         list_add(&cli->cli_ha_item, &recovd->recovd_clients_lh);
32         spin_unlock(&recovd->recovd_lock);
33         EXIT;
34 }
35
36 void recovd_cli_fail(struct ptlrpc_client *cli)
37 {
38         ENTRY;
39         spin_lock(&cli->cli_recovd->recovd_lock);
40         cli->cli_recovd->recovd_flags |= RECOVD_FAIL;
41         cli->cli_recovd->recovd_wakeup_flag = 1;
42         list_del(&cli->cli_ha_item);
43         list_add(&cli->cli_ha_item, &cli->cli_recovd->recovd_troubled_lh);
44         spin_unlock(&cli->cli_recovd->recovd_lock);
45         wake_up(&cli->cli_recovd->recovd_waitq);
46         EXIT;
47 }
48
49 /* this function must be called with cli->cli_lock held */
50 void recovd_cli_fixed(struct ptlrpc_client *cli)
51 {
52         ENTRY;
53         list_del(&cli->cli_ha_item);
54         list_add(&cli->cli_ha_item, &cli->cli_recovd->recovd_clients_lh);
55         EXIT;
56 }
57
58
59 static int recovd_upcall(void)
60 {
61         char *argv[2];
62         char *envp[3];
63
64         argv[0] = obd_recovery_upcall;
65         argv[1] = NULL;
66
67         envp [0] = "HOME=/";
68         envp [1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
69         envp [2] = NULL;
70
71         return call_usermodehelper(argv[0], argv, envp);
72 }
73
74 static int recovd_check_event(struct recovd_obd *recovd)
75 {
76         int rc = 0;
77         ENTRY;
78
79         spin_lock(&recovd->recovd_lock);
80
81         recovd->recovd_waketime = CURRENT_TIME;
82         if (recovd->recovd_timeout) 
83                 schedule_timeout(recovd->recovd_timeout);
84
85         if (recovd->recovd_wakeup_flag) {
86                 CERROR("service woken\n"); 
87                 GOTO(out, rc = 1);
88         }
89
90         if (recovd->recovd_timeout && 
91             CURRENT_TIME > recovd->recovd_waketime + recovd->recovd_timeout) {
92                 recovd->recovd_flags |= RECOVD_TIMEOUT;
93                 CERROR("timeout\n");
94                 GOTO(out, rc = 1);
95         }
96
97         if (recovd->recovd_flags & RECOVD_STOPPING) {
98                 CERROR("recovd stopping\n");
99                 rc = 1;
100         }
101
102  out:
103         recovd->recovd_wakeup_flag = 0;
104         spin_unlock(&recovd->recovd_lock);
105         RETURN(rc);
106 }
107
108 static int recovd_handle_event(struct recovd_obd *recovd)
109 {
110         ENTRY;
111
112         if (!(recovd->recovd_flags & RECOVD_UPCALL_WAIT) &&
113             recovd->recovd_flags & RECOVD_FAIL) { 
114
115                 CERROR("client in trouble: flags -> UPCALL_WAITING\n");
116                 recovd->recovd_flags |= RECOVD_UPCALL_WAIT;
117
118                 recovd_upcall();
119                 recovd->recovd_waketime = CURRENT_TIME;
120                 recovd->recovd_timeout = 10 * HZ;
121                 schedule_timeout(recovd->recovd_timeout);
122         }
123
124         if (recovd->recovd_flags & RECOVD_TIMEOUT) { 
125                 CERROR("timeout - no news from upcall?\n");
126                 recovd->recovd_flags &= ~RECOVD_TIMEOUT;
127         }
128
129         if (recovd->recovd_flags & RECOVD_UPCALL_ANSWER) { 
130                 CERROR("UPCALL_WAITING: upcall answer\n");
131
132                 while (!list_empty(&recovd->recovd_troubled_lh)) {
133                         struct ptlrpc_client *cli =
134                                 list_entry(recovd->recovd_troubled_lh.next,
135                                            struct ptlrpc_client, cli_ha_item);
136
137                         list_del(&cli->cli_ha_item); 
138                         if (cli->cli_recover) {
139                                 spin_unlock(&recovd->recovd_lock);
140                                 cli->cli_recover(cli); 
141                                 spin_lock(&recovd->recovd_lock);
142                         }
143                 }
144
145                 recovd->recovd_timeout = 0;
146                 recovd->recovd_flags = RECOVD_IDLE; 
147         }
148
149         RETURN(0);
150 }
151
152 static int recovd_main(void *arg)
153 {
154         struct recovd_obd *recovd = (struct recovd_obd *)arg;
155
156         ENTRY;
157
158         lock_kernel();
159         daemonize();
160         spin_lock_irq(&current->sigmask_lock);
161         sigfillset(&current->blocked);
162         recalc_sigpending(current);
163         spin_unlock_irq(&current->sigmask_lock);
164
165         sprintf(current->comm, "lustre_recovd");
166
167         /* Record that the  thread is running */
168         recovd->recovd_thread = current;
169         recovd->recovd_flags = RECOVD_IDLE;
170         wake_up(&recovd->recovd_ctl_waitq);
171
172         /* And now, loop forever on requests */
173         while (1) {
174                 wait_event(recovd->recovd_waitq, recovd_check_event(recovd));
175
176                 spin_lock(&recovd->recovd_lock);
177                 if (recovd->recovd_flags & RECOVD_STOPPING) {
178                         spin_unlock(&recovd->recovd_lock);
179                         CERROR("lustre_recovd stopping\n");
180                         EXIT;
181                         break;
182                 }
183
184                 recovd_handle_event(recovd);
185                 spin_unlock(&recovd->recovd_lock);
186         }
187
188         recovd->recovd_thread = NULL;
189         recovd->recovd_flags = RECOVD_STOPPED;
190         wake_up(&recovd->recovd_ctl_waitq);
191         CDEBUG(D_NET, "mgr exiting process %d\n", current->pid);
192         RETURN(0);
193 }
194
195 int recovd_setup(struct recovd_obd *recovd)
196 {
197         int rc;
198         extern void (*class_signal_client_failure)(struct ptlrpc_client *);
199
200         ENTRY;
201
202         INIT_LIST_HEAD(&recovd->recovd_clients_lh);
203         INIT_LIST_HEAD(&recovd->recovd_troubled_lh);
204         spin_lock_init(&recovd->recovd_lock);
205
206         init_waitqueue_head(&recovd->recovd_waitq);
207         init_waitqueue_head(&recovd->recovd_recovery_waitq);
208         init_waitqueue_head(&recovd->recovd_ctl_waitq);
209
210         rc = kernel_thread(recovd_main, (void *)recovd,
211                            CLONE_VM | CLONE_FS | CLONE_FILES);
212         if (rc < 0) {
213                 CERROR("cannot start thread\n");
214                 RETURN(-EINVAL);
215         }
216         wait_event(recovd->recovd_ctl_waitq, recovd->recovd_flags & RECOVD_IDLE);
217
218         /* exported and called by obdclass timeout handlers */
219         class_signal_client_failure = recovd_cli_fail;
220
221         RETURN(0);
222 }
223
224 int recovd_cleanup(struct recovd_obd *recovd)
225 {
226         spin_lock(&recovd->recovd_lock);
227         recovd->recovd_flags = RECOVD_STOPPING;
228         wake_up(&recovd->recovd_waitq);
229         spin_unlock(&recovd->recovd_lock);
230
231         wait_event(recovd->recovd_ctl_waitq,
232                    (recovd->recovd_flags & RECOVD_STOPPED));
233         RETURN(0);
234 }