1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * Lustre High Availability Daemon
8 * Copyright (C) 2001, 2002 Cluster File Systems, Inc.
10 * This code is issued under the GNU General Public License.
11 * See the file COPYING in this distribution
13 * by Peter Braam <braam@clusterfs.com>
18 #define DEBUG_SUBSYSTEM S_RPC
20 #include <linux/kmod.h>
21 #include <linux/lustre_lite.h>
22 #include <linux/lustre_ha.h>
/*
 * File-scope pointer to a recovd_obd with external linkage.  It is neither
 * assigned nor read by the functions visible in this file.
 * NOTE(review): presumably set by whoever sets up the connection manager --
 * confirm against the callers.
 */
24 struct recovd_obd *ptlrpc_connmgr;
/*
 * Put a client under the care of a recovery daemon.
 *
 * Stores the daemon in cli->cli_recovd and links the client (through
 * cli_ha_item) onto the daemon's recovd_connections_lh list.  Only the
 * list insertion is done under recovd_lock.
 */
26 void connmgr_cli_manage(struct recovd_obd *recovd, struct ptlrpc_client *cli)
29 cli->cli_recovd = recovd;
30 spin_lock(&recovd->recovd_lock);
31 list_add(&cli->cli_ha_item, &recovd->recovd_connections_lh);
32 spin_unlock(&recovd->recovd_lock);
/*
 * Report a client as failed to its recovery daemon.
 *
 * Under recovd_lock: sets SVC_HA_EVENT in the daemon's flags and moves the
 * client from the list it is currently on to recovd_troubled_lh.  After
 * dropping the lock, wakes any sleeper on recovd_waitq.
 *
 * NOTE(review): cli->cli_recovd is dereferenced without a NULL check, so
 * this presumably requires a prior connmgr_cli_manage() -- confirm.
 */
36 void connmgr_cli_fail(struct ptlrpc_client *cli)
39 spin_lock(&cli->cli_recovd->recovd_lock);
40 cli->cli_recovd->recovd_flags |= SVC_HA_EVENT;
/* list_del followed by list_add relinks the same item onto the troubled list */
41 list_del(&cli->cli_ha_item);
42 list_add(&cli->cli_ha_item, &cli->cli_recovd->recovd_troubled_lh);
43 spin_unlock(&cli->cli_recovd->recovd_lock);
44 wake_up(&cli->cli_recovd->recovd_waitq);
/*
 * Launch the user-space HA helper through call_usermodehelper() and return
 * that call's result.
 *
 * NOTE(review): the helper path is hard-coded to a source-tree location.
 * The declarations of argv/envp and their remaining slots are not visible
 * in this view.
 */
48 static int connmgr_upcall(void)
53 argv[0] = "/usr/src/obd/utils/ha_assist.sh";
57 envp [1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
60 return call_usermodehelper(argv[0], argv, envp);
/*
 * Convert the generation field of a connmgr_body in place with NTOH__u32.
 *
 * NOTE(review): the body is always taken from buffer 0 of req->rq_repmsg
 * (the reply message), whichever message the caller is actually
 * processing.
 */
63 static void connmgr_unpack_body(struct ptlrpc_request *req)
65 struct connmgr_body *b = lustre_msg_buf(req->rq_repmsg, 0);
69 b->generation = NTOH__u32(b->generation);
/*
 * Send a CONNMGR_CONNECT request for conn, using the daemon's client
 * (recovd->recovd_client), and record the peer's answer in conn.
 *
 * The request body carries conn's generation, the local conn pointer value
 * and c_token.  From the reply, the peer's conn handle and token are stored
 * in conn->c_remote_conn / c_remote_token and conn->c_level is set to
 * LUSTRE_CONN_CON.  rc holds the result of ptlrpc_queue_wait() passed
 * through ptlrpc_check_status().
 *
 * NOTE(review): the conditions guarding the error paths below are not
 * visible in this view.
 */
72 int connmgr_connect(struct recovd_obd *recovd, struct ptlrpc_connection *conn)
74 struct ptlrpc_request *req;
75 struct ptlrpc_client *cl;
76 struct connmgr_body *body;
/* size is the length of the single message buffer: one connmgr_body */
77 int rc, size = sizeof(*body);
/* presumably reached when recovd is NULL -- confirm */
81 CERROR("no manager\n");
84 cl = recovd->recovd_client;
86 req = ptlrpc_prep_req(cl, conn, CONNMGR_CONNECT, 1, &size, NULL);
/* presumably taken when ptlrpc_prep_req() returned no request -- confirm */
88 GOTO(out, rc = -ENOMEM);
90 body = lustre_msg_buf(req->rq_reqmsg, 0);
91 body->generation = HTON__u32(conn->c_generation);
/* the local connection pointer is sent as an opaque 64-bit handle */
92 body->conn = (__u64)(unsigned long)conn;
93 body->conn_token = conn->c_token;
/* expect a reply with one buffer of the same size */
95 req->rq_replen = lustre_msg_size(1, &size);
97 rc = ptlrpc_queue_wait(req);
98 rc = ptlrpc_check_status(req, rc);
/* convert the reply body, then re-point body at it */
100 connmgr_unpack_body(req);
101 body = lustre_msg_buf(req->rq_repmsg, 0);
/* NOTE(review): "%o" prints the generation in octal -- confirm intended */
102 CDEBUG(D_NET, "remote generation: %o\n", body->generation);
103 conn->c_level = LUSTRE_CONN_CON;
104 conn->c_remote_conn = body->conn;
105 conn->c_remote_token = body->conn_token;
108 ptlrpc_free_req(req);
/*
 * Handle an incoming connect request.
 *
 * Packs a one-buffer reply, copies the requester's conn handle and token
 * from the request body into req->rq_connection, fills the reply body with
 * a generation, the local connection pointer value and c_token, and sets
 * the connection's c_level to LUSTRE_CONN_CON.
 */
114 static int connmgr_handle_connect(struct ptlrpc_request *req)
116 struct connmgr_body *body;
117 int rc, size = sizeof(*body);
120 rc = lustre_pack_msg(1, &size, NULL, &req->rq_replen, &req->rq_repmsg);
/* presumably the failure branch of lustre_pack_msg() -- condition not visible */
122 CERROR("connmgr: out of memory\n");
123 req->rq_status = -ENOMEM;
127 body = lustre_msg_buf(req->rq_reqmsg, 0);
/*
 * NOTE(review): connmgr_unpack_body() converts buffer 0 of rq_repmsg, not
 * the request body that `body` points at here, so the request's generation
 * does not appear to be converted before it is logged below -- verify.
 */
128 connmgr_unpack_body(req);
130 req->rq_connection->c_remote_conn = body->conn;
131 req->rq_connection->c_remote_token = body->conn_token;
133 CERROR("incoming generation %d\n", body->generation);
/* from here on body refers to the reply buffer */
134 body = lustre_msg_buf(req->rq_repmsg, 0);
/*
 * NOTE(review): hard-coded generation, stored without HTON__u32 although
 * connmgr_connect() converts on send and connmgr_unpack_body() applies
 * NTOH__u32 on receipt -- confirm.
 */
135 body->generation = 4711;
136 body->conn = (__u64)(unsigned long)req->rq_connection;
137 body->conn_token = req->rq_connection->c_token;
139 req->rq_connection->c_level = LUSTRE_CONN_CON;
/*
 * Request handler for the connection manager service.
 *
 * Unpacks the request message, checks its type against
 * PTL_RPC_MSG_REQUEST, and dispatches on the opcode: CONNMGR_CONNECT goes
 * to connmgr_handle_connect().  The request is then answered with either
 * ptlrpc_error() or ptlrpc_reply().
 *
 * NOTE(review): the conditions and case labels selecting the error paths
 * are not visible in this view.
 */
143 int connmgr_handle(struct obd_device *dev, struct ptlrpc_service *svc,
144 struct ptlrpc_request *req)
149 rc = lustre_unpack_msg(req->rq_reqmsg, req->rq_reqlen);
/* presumably reported when lustre_unpack_msg() failed -- confirm */
151 CERROR("Invalid request\n");
/* the type constant, not the message field, is passed through NTOH__u32 */
155 if (req->rq_reqmsg->type != NTOH__u32(PTL_RPC_MSG_REQUEST)) {
156 CERROR("wrong packet type sent %d\n",
157 req->rq_reqmsg->type);
158 GOTO(out, rc = -EINVAL);
161 switch (req->rq_reqmsg->opc) {
162 case CONNMGR_CONNECT:
163 CDEBUG(D_INODE, "connmgr connect\n");
164 rc = connmgr_handle_connect(req);
/* presumably the fallback for unrecognised opcodes -- confirm */
168 rc = ptlrpc_error(svc, req);
/* error reply versus normal reply; the selecting condition is not visible */
175 ptlrpc_error(svc, req);
177 CDEBUG(D_NET, "sending reply\n");
178 ptlrpc_reply(svc, req);
/*
 * Wake-up condition evaluated by recovd_main()'s wait_event_interruptible().
 *
 * Under recovd_lock it looks at recovd_flags and the troubled list:
 *  - not MGR_WORKING and troubled list non-empty: enter MGR_WORKING, record
 *    the wake time and a timeout of 5 * HZ;
 *  - MGR_WORKING and still within waketime + timeout: refresh the wake time;
 *  - MGR_STOPPING: log that the manager is stopping.
 * The value returned in each case is not visible in this view.
 *
 * NOTE(review): schedule_timeout() appears between spin_lock() and
 * spin_unlock() on recovd_lock, i.e. it may be called with the spinlock
 * held -- verify against the full source.
 */
184 static int recovd_check_event(struct recovd_obd *recovd)
189 spin_lock(&recovd->recovd_lock);
191 if (!(recovd->recovd_flags & MGR_WORKING) &&
192 !list_empty(&recovd->recovd_troubled_lh)) {
194 CERROR("connection in trouble - state: WORKING, upcall\n");
/* plain assignment: every other flag bit is cleared here */
195 recovd->recovd_flags = MGR_WORKING;
197 recovd->recovd_waketime = CURRENT_TIME;
198 recovd->recovd_timeout = 5 * HZ;
199 schedule_timeout(recovd->recovd_timeout);
/*
 * NOTE(review): recovd_timeout is set in HZ ticks but is added to a
 * CURRENT_TIME value here; presumably those use different units -- verify.
 */
202 if (recovd->recovd_flags & MGR_WORKING &&
203 CURRENT_TIME <= recovd->recovd_waketime + recovd->recovd_timeout) {
204 CERROR("WORKING: new event\n");
206 recovd->recovd_waketime = CURRENT_TIME;
207 schedule_timeout(recovd->recovd_timeout);
210 if (recovd->recovd_flags & MGR_STOPPING) {
211 CERROR("ha mgr stopping\n");
215 spin_unlock(&recovd->recovd_lock);
/*
 * Act on a recovery event; called from recovd_main() after it wakes up.
 *
 * Under recovd_lock:
 *  - not MGR_WORKING and troubled list non-empty: enter MGR_WORKING, record
 *    the wake time and a timeout of 5 * HZ;
 *  - MGR_WORKING and still within waketime + timeout: refresh the wake time.
 * The return value is not visible in this view.
 *
 * NOTE(review): recovd_main() calls this while it already holds
 * recovd_lock, and this function takes the same spinlock again -- looks
 * like a self-deadlock; verify.
 * NOTE(review): schedule_timeout() appears between spin_lock() and
 * spin_unlock(), i.e. it may be called with the spinlock held -- verify.
 */
219 static int recovd_handle_event(struct recovd_obd *recovd)
221 spin_lock(&recovd->recovd_lock);
223 if (!(recovd->recovd_flags & MGR_WORKING) &&
224 !list_empty(&recovd->recovd_troubled_lh)) {
226 CERROR("connection in trouble - state: WORKING, upcall\n");
/* plain assignment: every other flag bit is cleared here */
227 recovd->recovd_flags = MGR_WORKING;
231 recovd->recovd_waketime = CURRENT_TIME;
232 recovd->recovd_timeout = 5 * HZ;
233 schedule_timeout(recovd->recovd_timeout);
236 if (recovd->recovd_flags & MGR_WORKING &&
237 CURRENT_TIME <= recovd->recovd_waketime + recovd->recovd_timeout) {
238 CERROR("WORKING: new event\n");
240 recovd->recovd_waketime = CURRENT_TIME;
241 schedule_timeout(recovd->recovd_timeout);
244 spin_unlock(&recovd->recovd_lock);
/*
 * Body of the recovery daemon thread started by recovd_setup().
 *
 * Blocks all signals, names the thread "lustre_recovd", publishes itself in
 * recovd->recovd_thread, sets the flags to MGR_RUNNING and wakes
 * recovd_ctl_waitq.  It then sleeps on recovd_waitq until
 * recovd_check_event() reports something to do; if MGR_STOPPING is set it
 * quits, otherwise it calls recovd_handle_event().  On the way out it
 * clears recovd_thread, sets the flags to MGR_STOPPED and wakes
 * recovd_ctl_waitq again.
 *
 * arg: the struct recovd_obd this thread serves.
 *
 * NOTE(review): recovd_handle_event() is called with recovd_lock held and
 * takes that same lock itself -- looks like a self-deadlock; verify.
 */
248 static int recovd_main(void *arg)
250 struct recovd_obd *recovd = (struct recovd_obd *)arg;
/* block every signal for this thread; sigmask_lock protects the mask */
256 spin_lock_irq(&current->sigmask_lock);
257 sigfillset(&current->blocked);
258 recalc_sigpending(current);
259 spin_unlock_irq(&current->sigmask_lock);
261 sprintf(current->comm, "lustre_recovd");
263 /* Record that the thread is running */
264 recovd->recovd_thread = current;
265 recovd->recovd_flags = MGR_RUNNING;
266 wake_up(&recovd->recovd_ctl_waitq);
268 /* And now, loop forever on requests */
270 wait_event_interruptible(recovd->recovd_waitq,
271 recovd_check_event(recovd));
273 spin_lock(&recovd->recovd_lock);
274 if (recovd->recovd_flags & MGR_STOPPING) {
275 spin_unlock(&recovd->recovd_lock);
276 CERROR("lustre_hamgr quitting\n");
281 recovd_handle_event(recovd);
282 spin_unlock(&recovd->recovd_lock);
/* shutdown: publish MGR_STOPPED and wake recovd_ctl_waitq */
285 recovd->recovd_thread = NULL;
286 recovd->recovd_flags = MGR_STOPPED;
287 wake_up(&recovd->recovd_ctl_waitq);
288 CDEBUG(D_NET, "mgr exiting process %d\n", current->pid);
/*
 * Initialise a recovd_obd and start its daemon thread.
 *
 * Sets up the two connection lists, the spinlock and the three wait
 * queues, starts recovd_main() as a kernel thread sharing VM, FS and file
 * state with the caller, and then sleeps on recovd_ctl_waitq until the
 * thread has set MGR_RUNNING in recovd_flags.
 */
292 int recovd_setup(struct recovd_obd *recovd)
297 INIT_LIST_HEAD(&recovd->recovd_connections_lh);
298 INIT_LIST_HEAD(&recovd->recovd_troubled_lh);
299 spin_lock_init(&recovd->recovd_lock);
301 init_waitqueue_head(&recovd->recovd_waitq);
302 init_waitqueue_head(&recovd->recovd_recovery_waitq);
303 init_waitqueue_head(&recovd->recovd_ctl_waitq);
305 rc = kernel_thread(recovd_main, (void *)recovd,
306 CLONE_VM | CLONE_FS | CLONE_FILES);
/* presumably reported when kernel_thread() failed -- condition not visible */
308 CERROR("cannot start thread\n");
/* uninterruptible wait for recovd_main() to announce that it is running */
311 wait_event(recovd->recovd_ctl_waitq, recovd->recovd_flags & MGR_RUNNING);
/*
 * Ask the recovery daemon thread to stop and wait for it to do so.
 *
 * Overwrites recovd_flags with MGR_STOPPING, wakes the thread through
 * recovd_waitq, then sleeps on recovd_ctl_waitq until MGR_STOPPED is set.
 *
 * NOTE(review): the flag is written without taking recovd_lock.
 * NOTE(review): the wait is interruptible and its result is not checked in
 * the visible code, so a signal could end the wait before the thread has
 * stopped -- confirm.
 */
316 int recovd_cleanup(struct recovd_obd *recovd)
318 recovd->recovd_flags = MGR_STOPPING;
320 wake_up(&recovd->recovd_waitq);
321 wait_event_interruptible(recovd->recovd_ctl_waitq,
322 (recovd->recovd_flags & MGR_STOPPED));