Whamcloud - gitweb
- documentation update for MDS recovery
[fs/lustre-release.git] / lustre / ptlrpc / recovd.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  obd/rpc/recovd.c
5  *
6  *  Lustre High Availability Daemon
7  *
8  *  Copyright (C) 2001, 2002 Cluster File Systems, Inc.
9  *
10  *  This code is issued under the GNU General Public License.
11  *  See the file COPYING in this distribution
12  *
13  *  by Peter Braam <braam@clusterfs.com>
14  *
15  */
16
17 #define EXPORT_SYMTAB
18 #define DEBUG_SUBSYSTEM S_RPC
19
20 #include <linux/kmod.h>
21 #include <linux/lustre_lite.h>
22 #include <linux/lustre_ha.h>
23
24 struct recovd_obd *ptlrpc_connmgr;
25
26 void recovd_cli_manage(struct recovd_obd *recovd, struct ptlrpc_client *cli)
27 {
28         ENTRY;
29         cli->cli_recovd = recovd;
30         spin_lock(&recovd->recovd_lock);
31         list_add(&cli->cli_ha_item, &recovd->recovd_clients_lh);
32         spin_unlock(&recovd->recovd_lock);
33         EXIT;
34 }
35
36 void recovd_cli_fail(struct ptlrpc_client *cli)
37 {
38         ENTRY;
39         spin_lock(&cli->cli_recovd->recovd_lock);
40         cli->cli_recovd->recovd_flags |= RECOVD_FAIL;
41         cli->cli_recovd->recovd_wakeup_flag = 1;
42         list_del(&cli->cli_ha_item);
43         list_add(&cli->cli_ha_item, &cli->cli_recovd->recovd_troubled_lh);
44         spin_unlock(&cli->cli_recovd->recovd_lock);
45         wake_up(&cli->cli_recovd->recovd_waitq);
46         EXIT;
47 }
48
49 void recovd_cli_fixed(struct ptlrpc_client *cli)
50 {
51         ENTRY;
52         list_del(&cli->cli_ha_item);
53         list_add(&cli->cli_ha_item, &cli->cli_recovd->recovd_clients_lh);
54         EXIT;
55 }
56
57
58 static int recovd_upcall(void)
59 {
60         char *argv[2];
61         char *envp[3];
62
63         argv[0] = "/usr/src/obd/utils/ha_assist.sh";
64         argv[1] = NULL;
65
66         envp [0] = "HOME=/";
67         envp [1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
68         envp [2] = NULL;
69
70         return call_usermodehelper(argv[0], argv, envp);
71 }
72
73 static int recovd_check_event(struct recovd_obd *recovd)
74 {
75         int rc = 0;
76         ENTRY;
77
78         spin_lock(&recovd->recovd_lock);
79
80         recovd->recovd_waketime = CURRENT_TIME;
81         if (recovd->recovd_timeout) 
82                 schedule_timeout(recovd->recovd_timeout);
83
84         if (recovd->recovd_wakeup_flag) {
85                 CERROR("service woken\n"); 
86                 GOTO(out, rc = 1);
87         }
88
89         if (recovd->recovd_timeout && 
90             CURRENT_TIME > recovd->recovd_waketime + recovd->recovd_timeout) {
91                 recovd->recovd_flags |= RECOVD_TIMEOUT;
92                 CERROR("timeout\n");
93                 GOTO(out, rc = 1);
94         }
95
96         if (recovd->recovd_flags & RECOVD_STOPPING) {
97                 CERROR("recovd stopping\n");
98                 rc = 1;
99         }
100
101  out:
102         recovd->recovd_wakeup_flag = 0;
103         spin_unlock(&recovd->recovd_lock);
104         RETURN(rc);
105 }
106
107 static int recovd_handle_event(struct recovd_obd *recovd)
108 {
109         ENTRY;
110         spin_lock(&recovd->recovd_lock);
111
112         if (!(recovd->recovd_flags & RECOVD_UPCALL_WAIT) &&
113             recovd->recovd_flags & RECOVD_FAIL) { 
114
115                 CERROR("client in trouble: flags -> UPCALL_WAITING\n");
116                 recovd->recovd_flags |= RECOVD_UPCALL_WAIT;
117
118                 recovd_upcall();
119                 recovd->recovd_waketime = CURRENT_TIME;
120                 recovd->recovd_timeout = 10 * HZ;
121                 schedule_timeout(recovd->recovd_timeout);
122         }
123
124         if (recovd->recovd_flags & RECOVD_TIMEOUT) { 
125                 CERROR("timeout - no news from upcall?\n");
126                 recovd->recovd_flags &= ~RECOVD_TIMEOUT;
127         }
128
129         if (recovd->recovd_flags & RECOVD_UPCALL_ANSWER) { 
130                 struct list_head *tmp, *pos;
131                 CERROR("UPCALL_WAITING: upcall answer\n");
132                 CERROR("** fill me in with recovery\n");
133
134                 list_for_each_safe(tmp, pos, &recovd->recovd_troubled_lh) { 
135                         struct ptlrpc_client *cli = list_entry
136                                 (tmp, struct ptlrpc_client, cli_ha_item);
137
138                         list_del(&cli->cli_ha_item); 
139                         spin_unlock(&recovd->recovd_lock);
140                         if (cli->cli_recover)
141                                 cli->cli_recover(cli); 
142                         spin_lock(&recovd->recovd_lock);
143                 }
144
145                 recovd->recovd_timeout = 0;
146                 recovd->recovd_flags = RECOVD_IDLE; 
147         }
148
149         spin_unlock(&recovd->recovd_lock);
150         RETURN(0);
151 }
152
153 static int recovd_main(void *arg)
154 {
155         struct recovd_obd *recovd = (struct recovd_obd *)arg;
156
157         ENTRY;
158
159         lock_kernel();
160         daemonize();
161         spin_lock_irq(&current->sigmask_lock);
162         sigfillset(&current->blocked);
163         recalc_sigpending(current);
164         spin_unlock_irq(&current->sigmask_lock);
165
166         sprintf(current->comm, "lustre_recovd");
167
168         /* Record that the  thread is running */
169         recovd->recovd_thread = current;
170         recovd->recovd_flags = RECOVD_IDLE;
171         wake_up(&recovd->recovd_ctl_waitq);
172
173         /* And now, loop forever on requests */
174         while (1) {
175                 wait_event_interruptible(recovd->recovd_waitq,
176                                          recovd_check_event(recovd));
177
178                 spin_lock(&recovd->recovd_lock);
179                 if (recovd->recovd_flags & RECOVD_STOPPING) {
180                         spin_unlock(&recovd->recovd_lock);
181                         CERROR("lustre_recovd stopping\n");
182                         EXIT;
183                         break;
184                 }
185
186                 recovd_handle_event(recovd);
187                 spin_unlock(&recovd->recovd_lock);
188         }
189
190         recovd->recovd_thread = NULL;
191         recovd->recovd_flags = RECOVD_STOPPED;
192         wake_up(&recovd->recovd_ctl_waitq);
193         CDEBUG(D_NET, "mgr exiting process %d\n", current->pid);
194         RETURN(0);
195 }
196
197 int recovd_setup(struct recovd_obd *recovd)
198 {
199         int rc;
200         ENTRY;
201
202         INIT_LIST_HEAD(&recovd->recovd_clients_lh);
203         INIT_LIST_HEAD(&recovd->recovd_troubled_lh);
204         spin_lock_init(&recovd->recovd_lock);
205
206         init_waitqueue_head(&recovd->recovd_waitq);
207         init_waitqueue_head(&recovd->recovd_recovery_waitq);
208         init_waitqueue_head(&recovd->recovd_ctl_waitq);
209
210         rc = kernel_thread(recovd_main, (void *)recovd,
211                            CLONE_VM | CLONE_FS | CLONE_FILES);
212         if (rc < 0) {
213                 CERROR("cannot start thread\n");
214                 RETURN(-EINVAL);
215         }
216         wait_event(recovd->recovd_ctl_waitq, recovd->recovd_flags & RECOVD_IDLE);
217
218         RETURN(0);
219 }
220
221 int recovd_cleanup(struct recovd_obd *recovd)
222 {
223         spin_lock(&recovd->recovd_lock);
224         recovd->recovd_flags = RECOVD_STOPPING;
225         wake_up(&recovd->recovd_waitq);
226         spin_unlock(&recovd->recovd_lock);
227
228         wait_event_interruptible(recovd->recovd_ctl_waitq,
229                                  (recovd->recovd_flags & RECOVD_STOPPED));
230         RETURN(0);
231 }