Whamcloud - gitweb
Avoid cli_lock deadlock in ptlrpc_free_req
[fs/lustre-release.git] / lustre / ptlrpc / recovd.c
1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2  * vim:expandtab:shiftwidth=8:tabstop=8:
3  *
4  *  obd/rpc/recovd.c
5  *
6  *  Lustre High Availability Daemon
7  *
8  *  Copyright (C) 2001, 2002 Cluster File Systems, Inc.
9  *
10  *  This code is issued under the GNU General Public License.
11  *  See the file COPYING in this distribution
12  *
13  *  by Peter Braam <braam@clusterfs.com>
14  *
15  */
16
17 #define EXPORT_SYMTAB
18 #define DEBUG_SUBSYSTEM S_RPC
19
20 #include <linux/kmod.h>
21 #include <linux/lustre_lite.h>
22 #include <linux/lustre_ha.h>
23
/*
 * Global pointer to a recovery daemon descriptor.  Nothing in this file
 * assigns or reads it -- presumably it is set up and used by the
 * connection manager code elsewhere (TODO confirm).
 */
struct recovd_obd *ptlrpc_connmgr;
25
26 void recovd_cli_manage(struct recovd_obd *recovd, struct ptlrpc_client *cli)
27 {
28         ENTRY;
29         cli->cli_recovd = recovd;
30         spin_lock(&recovd->recovd_lock);
31         list_add(&cli->cli_ha_item, &recovd->recovd_clients_lh);
32         spin_unlock(&recovd->recovd_lock);
33         EXIT;
34 }
35
36 void recovd_cli_fail(struct ptlrpc_client *cli)
37 {
38         ENTRY;
39         spin_lock(&cli->cli_recovd->recovd_lock);
40         cli->cli_recovd->recovd_flags |= RECOVD_FAIL;
41         cli->cli_recovd->recovd_wakeup_flag = 1;
42         list_del(&cli->cli_ha_item);
43         list_add(&cli->cli_ha_item, &cli->cli_recovd->recovd_troubled_lh);
44         spin_unlock(&cli->cli_recovd->recovd_lock);
45         wake_up(&cli->cli_recovd->recovd_waitq);
46         EXIT;
47 }
48
49 /* this function must be called with cli->cli_lock held */
50 void recovd_cli_fixed(struct ptlrpc_client *cli)
51 {
52         ENTRY;
53         list_del(&cli->cli_ha_item);
54         list_add(&cli->cli_ha_item, &cli->cli_recovd->recovd_clients_lh);
55         EXIT;
56 }
57
58
59 static int recovd_upcall(void)
60 {
61         char *argv[2];
62         char *envp[3];
63
64         argv[0] = "/usr/src/obd/utils/ha_assist.sh";
65         argv[1] = NULL;
66
67         envp [0] = "HOME=/";
68         envp [1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
69         envp [2] = NULL;
70
71         return call_usermodehelper(argv[0], argv, envp);
72 }
73
74 static int recovd_check_event(struct recovd_obd *recovd)
75 {
76         int rc = 0;
77         ENTRY;
78
79         spin_lock(&recovd->recovd_lock);
80
81         recovd->recovd_waketime = CURRENT_TIME;
82         if (recovd->recovd_timeout) 
83                 schedule_timeout(recovd->recovd_timeout);
84
85         if (recovd->recovd_wakeup_flag) {
86                 CERROR("service woken\n"); 
87                 GOTO(out, rc = 1);
88         }
89
90         if (recovd->recovd_timeout && 
91             CURRENT_TIME > recovd->recovd_waketime + recovd->recovd_timeout) {
92                 recovd->recovd_flags |= RECOVD_TIMEOUT;
93                 CERROR("timeout\n");
94                 GOTO(out, rc = 1);
95         }
96
97         if (recovd->recovd_flags & RECOVD_STOPPING) {
98                 CERROR("recovd stopping\n");
99                 rc = 1;
100         }
101
102  out:
103         recovd->recovd_wakeup_flag = 0;
104         spin_unlock(&recovd->recovd_lock);
105         RETURN(rc);
106 }
107
108 static int recovd_handle_event(struct recovd_obd *recovd)
109 {
110         ENTRY;
111         spin_lock(&recovd->recovd_lock);
112
113         if (!(recovd->recovd_flags & RECOVD_UPCALL_WAIT) &&
114             recovd->recovd_flags & RECOVD_FAIL) { 
115
116                 CERROR("client in trouble: flags -> UPCALL_WAITING\n");
117                 recovd->recovd_flags |= RECOVD_UPCALL_WAIT;
118
119                 recovd_upcall();
120                 recovd->recovd_waketime = CURRENT_TIME;
121                 recovd->recovd_timeout = 10 * HZ;
122                 schedule_timeout(recovd->recovd_timeout);
123         }
124
125         if (recovd->recovd_flags & RECOVD_TIMEOUT) { 
126                 CERROR("timeout - no news from upcall?\n");
127                 recovd->recovd_flags &= ~RECOVD_TIMEOUT;
128         }
129
130         if (recovd->recovd_flags & RECOVD_UPCALL_ANSWER) { 
131                 struct list_head *tmp, *pos;
132                 CERROR("UPCALL_WAITING: upcall answer\n");
133                 CERROR("** fill me in with recovery\n");
134
135                 list_for_each_safe(tmp, pos, &recovd->recovd_troubled_lh) { 
136                         struct ptlrpc_client *cli = list_entry
137                                 (tmp, struct ptlrpc_client, cli_ha_item);
138
139                         list_del(&cli->cli_ha_item); 
140                         spin_unlock(&recovd->recovd_lock);
141                         if (cli->cli_recover)
142                                 cli->cli_recover(cli); 
143                         spin_lock(&recovd->recovd_lock);
144                 }
145
146                 recovd->recovd_timeout = 0;
147                 recovd->recovd_flags = RECOVD_IDLE; 
148         }
149
150         spin_unlock(&recovd->recovd_lock);
151         RETURN(0);
152 }
153
154 static int recovd_main(void *arg)
155 {
156         struct recovd_obd *recovd = (struct recovd_obd *)arg;
157
158         ENTRY;
159
160         lock_kernel();
161         daemonize();
162         spin_lock_irq(&current->sigmask_lock);
163         sigfillset(&current->blocked);
164         recalc_sigpending(current);
165         spin_unlock_irq(&current->sigmask_lock);
166
167         sprintf(current->comm, "lustre_recovd");
168
169         /* Record that the  thread is running */
170         recovd->recovd_thread = current;
171         recovd->recovd_flags = RECOVD_IDLE;
172         wake_up(&recovd->recovd_ctl_waitq);
173
174         /* And now, loop forever on requests */
175         while (1) {
176                 wait_event_interruptible(recovd->recovd_waitq,
177                                          recovd_check_event(recovd));
178
179                 spin_lock(&recovd->recovd_lock);
180                 if (recovd->recovd_flags & RECOVD_STOPPING) {
181                         spin_unlock(&recovd->recovd_lock);
182                         CERROR("lustre_recovd stopping\n");
183                         EXIT;
184                         break;
185                 }
186
187                 recovd_handle_event(recovd);
188                 spin_unlock(&recovd->recovd_lock);
189         }
190
191         recovd->recovd_thread = NULL;
192         recovd->recovd_flags = RECOVD_STOPPED;
193         wake_up(&recovd->recovd_ctl_waitq);
194         CDEBUG(D_NET, "mgr exiting process %d\n", current->pid);
195         RETURN(0);
196 }
197
198 int recovd_setup(struct recovd_obd *recovd)
199 {
200         int rc;
201         ENTRY;
202
203         INIT_LIST_HEAD(&recovd->recovd_clients_lh);
204         INIT_LIST_HEAD(&recovd->recovd_troubled_lh);
205         spin_lock_init(&recovd->recovd_lock);
206
207         init_waitqueue_head(&recovd->recovd_waitq);
208         init_waitqueue_head(&recovd->recovd_recovery_waitq);
209         init_waitqueue_head(&recovd->recovd_ctl_waitq);
210
211         rc = kernel_thread(recovd_main, (void *)recovd,
212                            CLONE_VM | CLONE_FS | CLONE_FILES);
213         if (rc < 0) {
214                 CERROR("cannot start thread\n");
215                 RETURN(-EINVAL);
216         }
217         wait_event(recovd->recovd_ctl_waitq, recovd->recovd_flags & RECOVD_IDLE);
218
219         RETURN(0);
220 }
221
222 int recovd_cleanup(struct recovd_obd *recovd)
223 {
224         spin_lock(&recovd->recovd_lock);
225         recovd->recovd_flags = RECOVD_STOPPING;
226         wake_up(&recovd->recovd_waitq);
227         spin_unlock(&recovd->recovd_lock);
228
229         wait_event_interruptible(recovd->recovd_ctl_waitq,
230                                  (recovd->recovd_flags & RECOVD_STOPPED));
231         RETURN(0);
232 }