Whamcloud - gitweb
Prevent C-c and C-z from locking us up, and make most of our waits
[fs/lustre-release.git] / lustre / ptlrpc / recovd.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  obd/rpc/recovd.c
5  *
6  *  Lustre High Availability Daemon
7  *
8  *  Copyright (C) 2001, 2002 Cluster File Systems, Inc.
9  *
10  *  This code is issued under the GNU General Public License.
11  *  See the file COPYING in this distribution
12  *
13  *  by Peter Braam <braam@clusterfs.com>
14  *
15  */
16
17 #define DEBUG_SUBSYSTEM S_RPC
18
19 #include <linux/kmod.h>
20 #include <linux/lustre_lite.h>
21 #include <linux/lustre_ha.h>
22
/* Global pointer to a recovery daemon control structure.  It is only defined
 * here; nothing in this file assigns or reads it. */
struct recovd_obd *ptlrpc_connmgr;
24
25 void recovd_cli_manage(struct recovd_obd *recovd, struct ptlrpc_client *cli)
26 {
27         ENTRY;
28         cli->cli_recovd = recovd;
29         spin_lock(&recovd->recovd_lock);
30         list_add(&cli->cli_ha_item, &recovd->recovd_clients_lh);
31         spin_unlock(&recovd->recovd_lock);
32         EXIT;
33 }
34
35 void recovd_cli_fail(struct ptlrpc_client *cli)
36 {
37         ENTRY;
38         spin_lock(&cli->cli_recovd->recovd_lock);
39         cli->cli_recovd->recovd_flags |= RECOVD_FAIL;
40         cli->cli_recovd->recovd_wakeup_flag = 1;
41         list_del(&cli->cli_ha_item);
42         list_add(&cli->cli_ha_item, &cli->cli_recovd->recovd_troubled_lh);
43         spin_unlock(&cli->cli_recovd->recovd_lock);
44         wake_up(&cli->cli_recovd->recovd_waitq);
45         EXIT;
46 }
47
48 /* this function must be called with cli->cli_lock held */
49 void recovd_cli_fixed(struct ptlrpc_client *cli)
50 {
51         ENTRY;
52         list_del(&cli->cli_ha_item);
53         list_add(&cli->cli_ha_item, &cli->cli_recovd->recovd_clients_lh);
54         EXIT;
55 }
56
57
58 static int recovd_upcall(void)
59 {
60         char *argv[2];
61         char *envp[3];
62
63         argv[0] = "/usr/src/obd/utils/ha_assist.sh";
64         argv[1] = NULL;
65
66         envp [0] = "HOME=/";
67         envp [1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
68         envp [2] = NULL;
69
70         return call_usermodehelper(argv[0], argv, envp);
71 }
72
73 static int recovd_check_event(struct recovd_obd *recovd)
74 {
75         int rc = 0;
76         ENTRY;
77
78         spin_lock(&recovd->recovd_lock);
79
80         recovd->recovd_waketime = CURRENT_TIME;
81         if (recovd->recovd_timeout) 
82                 schedule_timeout(recovd->recovd_timeout);
83
84         if (recovd->recovd_wakeup_flag) {
85                 CERROR("service woken\n"); 
86                 GOTO(out, rc = 1);
87         }
88
89         if (recovd->recovd_timeout && 
90             CURRENT_TIME > recovd->recovd_waketime + recovd->recovd_timeout) {
91                 recovd->recovd_flags |= RECOVD_TIMEOUT;
92                 CERROR("timeout\n");
93                 GOTO(out, rc = 1);
94         }
95
96         if (recovd->recovd_flags & RECOVD_STOPPING) {
97                 CERROR("recovd stopping\n");
98                 rc = 1;
99         }
100
101  out:
102         recovd->recovd_wakeup_flag = 0;
103         spin_unlock(&recovd->recovd_lock);
104         RETURN(rc);
105 }
106
107 static int recovd_handle_event(struct recovd_obd *recovd)
108 {
109         ENTRY;
110
111         if (!(recovd->recovd_flags & RECOVD_UPCALL_WAIT) &&
112             recovd->recovd_flags & RECOVD_FAIL) { 
113
114                 CERROR("client in trouble: flags -> UPCALL_WAITING\n");
115                 recovd->recovd_flags |= RECOVD_UPCALL_WAIT;
116
117                 recovd_upcall();
118                 recovd->recovd_waketime = CURRENT_TIME;
119                 recovd->recovd_timeout = 10 * HZ;
120                 schedule_timeout(recovd->recovd_timeout);
121         }
122
123         if (recovd->recovd_flags & RECOVD_TIMEOUT) { 
124                 CERROR("timeout - no news from upcall?\n");
125                 recovd->recovd_flags &= ~RECOVD_TIMEOUT;
126         }
127
128         if (recovd->recovd_flags & RECOVD_UPCALL_ANSWER) { 
129                 CERROR("UPCALL_WAITING: upcall answer\n");
130
131                 while (!list_empty(&recovd->recovd_troubled_lh)) {
132                         struct ptlrpc_client *cli =
133                                 list_entry(recovd->recovd_troubled_lh.next,
134                                            struct ptlrpc_client, cli_ha_item);
135
136                         list_del(&cli->cli_ha_item); 
137                         if (cli->cli_recover) {
138                                 spin_unlock(&recovd->recovd_lock);
139                                 cli->cli_recover(cli); 
140                                 spin_lock(&recovd->recovd_lock);
141                         }
142                 }
143
144                 recovd->recovd_timeout = 0;
145                 recovd->recovd_flags = RECOVD_IDLE; 
146         }
147
148         RETURN(0);
149 }
150
151 static int recovd_main(void *arg)
152 {
153         struct recovd_obd *recovd = (struct recovd_obd *)arg;
154
155         ENTRY;
156
157         lock_kernel();
158         daemonize();
159         spin_lock_irq(&current->sigmask_lock);
160         sigfillset(&current->blocked);
161         recalc_sigpending(current);
162         spin_unlock_irq(&current->sigmask_lock);
163
164         sprintf(current->comm, "lustre_recovd");
165
166         /* Record that the  thread is running */
167         recovd->recovd_thread = current;
168         recovd->recovd_flags = RECOVD_IDLE;
169         wake_up(&recovd->recovd_ctl_waitq);
170
171         /* And now, loop forever on requests */
172         while (1) {
173                 wait_event(recovd->recovd_waitq, recovd_check_event(recovd));
174
175                 spin_lock(&recovd->recovd_lock);
176                 if (recovd->recovd_flags & RECOVD_STOPPING) {
177                         spin_unlock(&recovd->recovd_lock);
178                         CERROR("lustre_recovd stopping\n");
179                         EXIT;
180                         break;
181                 }
182
183                 recovd_handle_event(recovd);
184                 spin_unlock(&recovd->recovd_lock);
185         }
186
187         recovd->recovd_thread = NULL;
188         recovd->recovd_flags = RECOVD_STOPPED;
189         wake_up(&recovd->recovd_ctl_waitq);
190         CDEBUG(D_NET, "mgr exiting process %d\n", current->pid);
191         RETURN(0);
192 }
193
194 int recovd_setup(struct recovd_obd *recovd)
195 {
196         int rc;
197         ENTRY;
198
199         INIT_LIST_HEAD(&recovd->recovd_clients_lh);
200         INIT_LIST_HEAD(&recovd->recovd_troubled_lh);
201         spin_lock_init(&recovd->recovd_lock);
202
203         init_waitqueue_head(&recovd->recovd_waitq);
204         init_waitqueue_head(&recovd->recovd_recovery_waitq);
205         init_waitqueue_head(&recovd->recovd_ctl_waitq);
206
207         rc = kernel_thread(recovd_main, (void *)recovd,
208                            CLONE_VM | CLONE_FS | CLONE_FILES);
209         if (rc < 0) {
210                 CERROR("cannot start thread\n");
211                 RETURN(-EINVAL);
212         }
213         wait_event(recovd->recovd_ctl_waitq, recovd->recovd_flags & RECOVD_IDLE);
214
215         RETURN(0);
216 }
217
218 int recovd_cleanup(struct recovd_obd *recovd)
219 {
220         spin_lock(&recovd->recovd_lock);
221         recovd->recovd_flags = RECOVD_STOPPING;
222         wake_up(&recovd->recovd_waitq);
223         spin_unlock(&recovd->recovd_lock);
224
225         wait_event(recovd->recovd_ctl_waitq,
226                    (recovd->recovd_flags & RECOVD_STOPPED));
227         RETURN(0);
228 }