Whamcloud - gitweb
62e70f13c5e3a62e6f911dddbd019bd4742a5bc0
[fs/lustre-release.git] / lustre / ptlrpc / recovd.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  obd/rpc/recovd.c
5  *
6  *  Lustre High Availability Daemon
7  *
8  *  Copyright (C) 2001, 2002 Cluster File Systems, Inc.
9  *
10  *  This code is issued under the GNU General Public License.
11  *  See the file COPYING in this distribution
12  *
13  *  by Peter Braam <braam@clusterfs.com>
14  *
15  */
16
17 #define DEBUG_SUBSYSTEM S_RPC
18
19 #include <linux/kmod.h>
20 #include <linux/lustre_lite.h>
21 #include <linux/lustre_ha.h>
22 #include <linux/obd_support.h>
23
24 void recovd_conn_manage(struct recovd_obd *recovd,
25                         struct ptlrpc_connection *conn)
26 {
27         ENTRY;
28         conn->c_recovd = recovd;
29         spin_lock(&recovd->recovd_lock);
30         list_add(&conn->c_recovd_data.rd_managed_chain,
31                  &recovd->recovd_managed_items);
32         spin_unlock(&recovd->recovd_lock);
33         EXIT;
34 }
35
36 void recovd_conn_fail(struct ptlrpc_connection *conn)
37 {
38         ENTRY;
39         spin_lock(&conn->c_recovd->recovd_lock);
40         conn->c_recovd->recovd_flags |= RECOVD_FAIL;
41         conn->c_recovd->recovd_wakeup_flag = 1;
42         list_del(&conn->c_recovd_data.rd_managed_chain);
43         list_add(&conn->c_recovd_data.rd_managed_chain, 
44                  &conn->c_recovd->recovd_troubled_items);
45         spin_unlock(&conn->c_recovd->recovd_lock);
46         wake_up(&conn->c_recovd->recovd_waitq);
47         EXIT;
48 }
49
50 /* this function must be called with conn->c_lock held */
51 void recovd_conn_fixed(struct ptlrpc_connection *conn)
52 {
53         ENTRY;
54         list_del(&conn->c_recovd_data.rd_managed_chain);
55         list_add(&conn->c_recovd_data.rd_managed_chain,
56                  &conn->c_recovd->recovd_managed_items);
57         EXIT;
58 }
59
60
61 static int recovd_upcall(void)
62 {
63         char *argv[2];
64         char *envp[3];
65
66         argv[0] = obd_recovery_upcall;
67         argv[1] = NULL;
68
69         envp [0] = "HOME=/";
70         envp [1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
71         envp [2] = NULL;
72
73         return call_usermodehelper(argv[0], argv, envp);
74 }
75
76 static int recovd_check_event(struct recovd_obd *recovd)
77 {
78         int rc = 0;
79         ENTRY;
80
81         spin_lock(&recovd->recovd_lock);
82
83         recovd->recovd_waketime = CURRENT_TIME;
84         if (recovd->recovd_timeout) 
85                 schedule_timeout(recovd->recovd_timeout);
86
87         if (recovd->recovd_wakeup_flag) {
88                 CERROR("service woken\n"); 
89                 GOTO(out, rc = 1);
90         }
91
92         if (recovd->recovd_timeout && 
93             CURRENT_TIME > recovd->recovd_waketime + recovd->recovd_timeout) {
94                 recovd->recovd_flags |= RECOVD_TIMEOUT;
95                 CERROR("timeout\n");
96                 GOTO(out, rc = 1);
97         }
98
99         if (recovd->recovd_flags & RECOVD_STOPPING) {
100                 CERROR("recovd stopping\n");
101                 rc = 1;
102         }
103
104  out:
105         recovd->recovd_wakeup_flag = 0;
106         spin_unlock(&recovd->recovd_lock);
107         RETURN(rc);
108 }
109
110 static int recovd_handle_event(struct recovd_obd *recovd)
111 {
112         ENTRY;
113
114         if (!(recovd->recovd_flags & RECOVD_UPCALL_WAIT) &&
115             recovd->recovd_flags & RECOVD_FAIL) { 
116
117                 CERROR("client in trouble: flags -> UPCALL_WAITING\n");
118                 recovd->recovd_flags |= RECOVD_UPCALL_WAIT;
119
120                 recovd_upcall();
121                 recovd->recovd_waketime = CURRENT_TIME;
122                 recovd->recovd_timeout = 10 * HZ;
123                 schedule_timeout(recovd->recovd_timeout);
124         }
125
126         if (recovd->recovd_flags & RECOVD_TIMEOUT) { 
127                 CERROR("timeout - no news from upcall?\n");
128                 recovd->recovd_flags &= ~RECOVD_TIMEOUT;
129         }
130
131         if (recovd->recovd_flags & RECOVD_UPCALL_ANSWER) { 
132                 CERROR("UPCALL_WAITING: upcall answer\n");
133
134                 while (!list_empty(&recovd->recovd_troubled_items)) {
135                         struct recovd_data *rd =
136                                 list_entry(recovd->recovd_troubled_items.next,
137                                            struct recovd_data, rd_managed_chain);
138
139                         list_del(&rd->rd_managed_chain);
140                         if (rd->rd_recover) {
141                                 spin_unlock(&recovd->recovd_lock);
142                                 rd->rd_recover(rd);
143                                 spin_lock(&recovd->recovd_lock);
144                         }
145                 }
146
147                 recovd->recovd_timeout = 0;
148                 recovd->recovd_flags = RECOVD_IDLE; 
149         }
150
151         RETURN(0);
152 }
153
154 static int recovd_main(void *arg)
155 {
156         struct recovd_obd *recovd = (struct recovd_obd *)arg;
157
158         ENTRY;
159
160         lock_kernel();
161         daemonize();
162         spin_lock_irq(&current->sigmask_lock);
163         sigfillset(&current->blocked);
164         recalc_sigpending(current);
165         spin_unlock_irq(&current->sigmask_lock);
166
167         sprintf(current->comm, "lustre_recovd");
168         unlock_kernel();
169
170         /* Record that the  thread is running */
171         recovd->recovd_thread = current;
172         recovd->recovd_flags = RECOVD_IDLE;
173         wake_up(&recovd->recovd_ctl_waitq);
174
175         /* And now, loop forever on requests */
176         while (1) {
177                 wait_event(recovd->recovd_waitq, recovd_check_event(recovd));
178
179                 spin_lock(&recovd->recovd_lock);
180                 if (recovd->recovd_flags & RECOVD_STOPPING) {
181                         spin_unlock(&recovd->recovd_lock);
182                         CERROR("lustre_recovd stopping\n");
183                         EXIT;
184                         break;
185                 }
186
187                 recovd_handle_event(recovd);
188                 spin_unlock(&recovd->recovd_lock);
189         }
190
191         recovd->recovd_thread = NULL;
192         recovd->recovd_flags = RECOVD_STOPPED;
193         wake_up(&recovd->recovd_ctl_waitq);
194         CDEBUG(D_NET, "mgr exiting process %d\n", current->pid);
195         RETURN(0);
196 }
197
198 int recovd_setup(struct recovd_obd *recovd)
199 {
200         int rc;
201         extern void (*class_signal_connection_failure)
202                 (struct ptlrpc_connection *);
203
204         ENTRY;
205
206         INIT_LIST_HEAD(&recovd->recovd_managed_items);
207         INIT_LIST_HEAD(&recovd->recovd_troubled_items);
208         spin_lock_init(&recovd->recovd_lock);
209
210         init_waitqueue_head(&recovd->recovd_waitq);
211         init_waitqueue_head(&recovd->recovd_recovery_waitq);
212         init_waitqueue_head(&recovd->recovd_ctl_waitq);
213
214         rc = kernel_thread(recovd_main, (void *)recovd,
215                            CLONE_VM | CLONE_FS | CLONE_FILES);
216         if (rc < 0) {
217                 CERROR("cannot start thread\n");
218                 RETURN(-EINVAL);
219         }
220         wait_event(recovd->recovd_ctl_waitq, recovd->recovd_flags & RECOVD_IDLE);
221
222         /* exported and called by obdclass timeout handlers */
223         class_signal_connection_failure = recovd_conn_fail;
224
225         RETURN(0);
226 }
227
228 int recovd_cleanup(struct recovd_obd *recovd)
229 {
230         spin_lock(&recovd->recovd_lock);
231         recovd->recovd_flags = RECOVD_STOPPING;
232         wake_up(&recovd->recovd_waitq);
233         spin_unlock(&recovd->recovd_lock);
234
235         wait_event(recovd->recovd_ctl_waitq,
236                    (recovd->recovd_flags & RECOVD_STOPPED));
237         RETURN(0);
238 }