1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2003 Cluster File Systems, Inc.
5 * Author: Andreas Dilger <adilger@clusterfs.com>
7 * This file is part of Lustre, http://www.lustre.org.
9 * Lustre is free software; you can redistribute it and/or
10 * modify it under the terms of version 2 of the GNU General Public
11 * License as published by the Free Software Foundation.
13 * Lustre is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with Lustre; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 * OST<->MDS recovery logging thread.
24 * Invariants in implementation:
25 * - we do not share logs among different OST<->MDS connections, so that
26 * if an OST or MDS fails it need only look at log(s) relevant to itself
29 #define DEBUG_SUBSYSTEM S_LOG
32 # define EXPORT_SYMTAB
38 # include <libcfs/list.h>
39 # include <liblustre.h>
42 #include <libcfs/kp30.h>
43 #include <linux/obd_class.h>
44 #include <linux/lustre_commit_confd.h>
45 #include <linux/obd_support.h>
46 #include <linux/obd_class.h>
47 #include <linux/lustre_net.h>
48 #include <portals/types.h>
49 #include <libcfs/list.h>
50 #include <linux/lustre_log.h>
51 #include "ptlrpc_internal.h"
53 #define LLCD_SIZE 4096
57 static struct llog_commit_master lustre_lcm;
58 static struct llog_commit_master *lcm = &lustre_lcm;
60 /* Allocate new commit structs in case we do not have enough */
/*
 * llcd_alloc(): allocate one llog_canceld_ctxt with LLCD_SIZE bytes of
 * cookie space following the header, and add it to the global free list
 * (lcm->lcm_llcd_free) under lcm_llcd_lock.
 *
 * NOTE(review): the OBD_ALLOC failure check and the function's return
 * statement fall in lines elided from this excerpt — confirm against the
 * full source before relying on the error behavior.
 */
61 static int llcd_alloc(void)
63 struct llog_canceld_ctxt *llcd;
/* llcd_cookies is the trailing cookie buffer; offset = header size. */
64 int offset = offsetof(struct llog_canceld_ctxt, llcd_cookies);
66 OBD_ALLOC(llcd, LLCD_SIZE + offset);
/* Publish the new entry on the free list and bump the free counter. */
72 spin_lock(&lcm->lcm_llcd_lock);
73 list_add(&llcd->llcd_list, &lcm->lcm_llcd_free);
74 atomic_inc(&lcm->lcm_llcd_numfree);
75 spin_unlock(&lcm->lcm_llcd_lock);
80 /* Get a free cookie struct from the list */
/*
 * llcd_grab(): pop a free llog_canceld_ctxt off lcm->lcm_llcd_free,
 * replenishing the free list via llcd_alloc() when it is empty.  The
 * returned llcd has its retry counter and buffered-cookie byte count
 * reset to zero.
 *
 * NOTE(review): the failure return after the CERROR and the final
 * "return llcd" are in lines elided from this excerpt.
 */
81 struct llog_canceld_ctxt *llcd_grab(void)
83 struct llog_canceld_ctxt *llcd;
85 spin_lock(&lcm->lcm_llcd_lock);
86 if (list_empty(&lcm->lcm_llcd_free)) {
/* Must drop the spinlock while allocating (allocation may sleep);
 * re-acquire it before dequeuing below. */
87 spin_unlock(&lcm->lcm_llcd_lock);
88 if (llcd_alloc() < 0) {
89 CERROR("unable to allocate log commit data!\n");
92 spin_lock(&lcm->lcm_llcd_lock);
/* Dequeue the first free entry under the lock. */
95 llcd = list_entry(lcm->lcm_llcd_free.next, typeof(*llcd), llcd_list);
96 list_del(&llcd->llcd_list);
97 atomic_dec(&lcm->lcm_llcd_numfree);
98 spin_unlock(&lcm->lcm_llcd_lock);
/* Fresh llcd: no send attempts yet, no cookies buffered. */
100 llcd->llcd_tries = 0;
101 llcd->llcd_cookiebytes = 0;
105 EXPORT_SYMBOL(llcd_grab);
/*
 * llcd_put(): release an llcd.  If the free list already holds at least
 * lcm_llcd_maxfree entries the buffer is freed outright; otherwise it is
 * recycled onto lcm->lcm_llcd_free for reuse by llcd_grab().
 *
 * NOTE(review): the else-branch bracketing between the two paths is in
 * lines elided from this excerpt — confirm against the full source.
 */
107 static void llcd_put(struct llog_canceld_ctxt *llcd)
109 int offset = offsetof(struct llog_canceld_ctxt, llcd_cookies);
111 if (atomic_read(&lcm->lcm_llcd_numfree) >= lcm->lcm_llcd_maxfree) {
/* Free list is full enough: give the memory back (size must match
 * the LLCD_SIZE + header-offset used in llcd_alloc()). */
112 OBD_FREE(llcd, LLCD_SIZE + offset);
/* Otherwise recycle onto the free list under the lock. */
114 spin_lock(&lcm->lcm_llcd_lock);
115 list_add(&llcd->llcd_list, &lcm->lcm_llcd_free);
116 atomic_inc(&lcm->lcm_llcd_numfree);
117 spin_unlock(&lcm->lcm_llcd_lock);
121 /* Send some cookies to the appropriate target */
/*
 * llcd_send(): queue a filled llcd on its master's pending list and wake
 * exactly one commit thread to transmit it.  The actual RPC is performed
 * asynchronously by log_commit_thread().
 */
122 void llcd_send(struct llog_canceld_ctxt *llcd)
124 spin_lock(&llcd->llcd_lcm->lcm_llcd_lock);
125 list_add_tail(&llcd->llcd_list, &llcd->llcd_lcm->lcm_llcd_pending);
126 spin_unlock(&llcd->llcd_lcm->lcm_llcd_lock);
/* Wake a single waiter; waking all threads for one llcd would thunder. */
128 wake_up_nr(&llcd->llcd_lcm->lcm_waitq, 1);
130 EXPORT_SYMBOL(llcd_send);
132 /* deleted objects have a commit callback that cancels the MDS
133 * log record for the deletion. The commit callback calls this
/*
 * llog_obd_repl_cancel(): batch log-cancel cookies into the context's
 * current llcd under ctxt->loc_sem.  A new llcd is grabbed on demand; the
 * llcd is handed to the send path once it is (nearly) full or the caller
 * passes OBD_LLOG_FL_SENDNOW.
 *
 * NOTE(review): several lines (the llcd NULL test before the CERROR, the
 * GOTO/out label, the llcd_send() call after clearing loc_llcd, and the
 * up(&ctxt->loc_sem)/RETURN tail) are elided from this excerpt — the
 * comments below describe only what is visible.
 */
136 int llog_obd_repl_cancel(struct llog_ctxt *ctxt,
137 struct lov_stripe_md *lsm, int count,
138 struct llog_cookie *cookies, int flags)
140 struct llog_canceld_ctxt *llcd;
/* loc_sem serializes access to ctxt->loc_llcd and ctxt->loc_imp. */
146 down(&ctxt->loc_sem);
147 if (ctxt->loc_imp == NULL) {
148 CWARN("no import for ctxt %p\n", ctxt);
152 llcd = ctxt->loc_llcd;
154 if (count > 0 && cookies != NULL) {
/* Could not obtain an llcd: the cookie is dropped (logged below). */
158 CERROR("couldn't get an llcd - dropped "LPX64
160 cookies->lgc_lgl.lgl_oid,
161 cookies->lgc_lgl.lgl_ogen,
163 GOTO(out, rc = -ENOMEM);
/* Attach the (presumably fresh) llcd to this context. */
165 llcd->llcd_ctxt = ctxt;
166 ctxt->loc_llcd = llcd;
/* Append one cookie to the llcd's buffered-cookie area. */
169 memcpy((char *)llcd->llcd_cookies + llcd->llcd_cookiebytes,
170 cookies, sizeof(*cookies));
171 llcd->llcd_cookiebytes += sizeof(*cookies);
/* Nothing buffered and no forced flush requested: done. */
173 if (llcd == NULL || !(flags & OBD_LLOG_FL_SENDNOW))
/* Flush when the llcd cannot hold another cookie or SENDNOW is set. */
177 if ((LLCD_SIZE - llcd->llcd_cookiebytes < sizeof(*cookies) ||
178 flags & OBD_LLOG_FL_SENDNOW)) {
179 CDEBUG(D_HA, "send llcd %p:%p\n", llcd, llcd->llcd_ctxt);
/* Detach from the context; ownership passes to the send path. */
180 ctxt->loc_llcd = NULL;
187 EXPORT_SYMBOL(llog_obd_repl_cancel);
/*
 * llog_obd_repl_sync(): synchronize a replication log context.  When the
 * export's reverse import is the context's import (i.e. the peer is
 * disconnecting), discard any buffered llcd and detach the import;
 * otherwise force-flush pending cancels via llog_cancel(...SENDNOW).
 *
 * NOTE(review): the up(&ctxt->loc_sem), the else branch, and the RETURN
 * tail are in lines elided from this excerpt.
 */
189 int llog_obd_repl_sync(struct llog_ctxt *ctxt, struct obd_export *exp)
194 if (exp && (ctxt->loc_imp == exp->exp_imp_reverse)) {
195 CDEBUG(D_INFO, "reverse import disconnected, put llcd %p:%p\n",
196 ctxt->loc_llcd, ctxt);
197 down(&ctxt->loc_sem);
198 if (ctxt->loc_llcd != NULL) {
/* Peer is going away: recycle the buffered cookies unsent. */
199 llcd_put(ctxt->loc_llcd);
200 ctxt->loc_llcd = NULL;
202 ctxt->loc_imp = NULL;
/* Normal path: push any buffered cancel cookies out immediately. */
205 rc = llog_cancel(ctxt, NULL, 0, NULL, OBD_LLOG_FL_SENDNOW);
210 EXPORT_SYMBOL(llog_obd_repl_sync);
/*
 * log_commit_thread(): body of the "ll_log_comt_NN" kernel daemon.  Each
 * thread loops: keep the llcd free pool topped up, sleep until work is
 * pending (or exit is requested), claim all pending llcds destined for
 * one import, and send them as OBD_LOG_CANCEL RPCs.  On force-exit it
 * drains and drops every outstanding llcd before tearing itself down.
 *
 * NOTE(review): this excerpt has many elided lines (loop headers, error
 * labels, RETURN paths, brace closures).  The comments below annotate
 * only the visible statements; confirm control flow against the full
 * source.
 */
212 static int log_commit_thread(void *arg)
214 struct llog_commit_master *lcm = arg;
215 struct llog_commit_daemon *lcd;
216 struct llog_canceld_ctxt *llcd, *n;
/* Per-thread daemon descriptor, freed at thread exit below. */
220 OBD_ALLOC(lcd, sizeof(*lcd));
225 ptlrpc_daemonize(); /* thread never needs to do IO */
/* Block all signals for the lifetime of the daemon. */
227 SIGNAL_MASK_LOCK(current, flags);
228 sigfillset(&current->blocked);
230 SIGNAL_MASK_UNLOCK(current, flags);
/* Name the thread with its ordinal and account it in the master. */
232 spin_lock(&lcm->lcm_thread_lock);
233 THREAD_NAME(current->comm, sizeof(current->comm) - 1,
234 "ll_log_comt_%02d", atomic_read(&lcm->lcm_thread_total));
235 atomic_inc(&lcm->lcm_thread_total);
236 spin_unlock(&lcm->lcm_thread_lock);
239 INIT_LIST_HEAD(&lcd->lcd_lcm_list);
240 INIT_LIST_HEAD(&lcd->lcd_llcd_list);
243 CDEBUG(D_HA, "%s started\n", current->comm);
245 struct ptlrpc_request *request;
246 struct obd_import *import = NULL;
247 struct list_head *sending_list;
250 /* If we do not have enough pages available, allocate some */
251 while (atomic_read(&lcm->lcm_llcd_numfree) <
252 lcm->lcm_llcd_minfree) {
253 if (llcd_alloc() < 0)
/* Mark ourselves idle and wait for pending work or an exit flag. */
257 spin_lock(&lcm->lcm_thread_lock);
258 atomic_inc(&lcm->lcm_thread_numidle);
259 list_move(&lcd->lcd_lcm_list, &lcm->lcm_thread_idle);
260 spin_unlock(&lcm->lcm_thread_lock);
262 wait_event_interruptible(lcm->lcm_waitq,
263 !list_empty(&lcm->lcm_llcd_pending) ||
264 lcm->lcm_flags & LLOG_LCM_FL_EXIT);
266 /* If we are the last available thread, start a new one in case
267 * we get blocked on an RPC (nobody else will start a new one)*/
268 spin_lock(&lcm->lcm_thread_lock);
269 atomic_dec(&lcm->lcm_thread_numidle);
270 list_move(&lcd->lcd_lcm_list, &lcm->lcm_thread_busy);
271 spin_unlock(&lcm->lcm_thread_lock);
273 sending_list = &lcm->lcm_llcd_pending;
/* Exit requested: disable pool refill and thread spawning. */
275 if (lcm->lcm_flags & LLOG_LCM_FL_EXIT) {
276 lcm->lcm_llcd_maxfree = 0;
277 lcm->lcm_llcd_minfree = 0;
278 lcm->lcm_thread_max = 0;
280 if (list_empty(&lcm->lcm_llcd_pending) ||
281 lcm->lcm_flags & LLOG_LCM_FL_EXIT_FORCE)
/* Spawn a spare thread if we are (nearly) the last idle one. */
285 if (atomic_read(&lcm->lcm_thread_numidle) <= 1 &&
286 atomic_read(&lcm->lcm_thread_total) < lcm->lcm_thread_max) {
287 rc = llog_start_commit_thread();
289 CERROR("error starting thread: rc %d\n", rc);
292 /* Move all of the pending cancels from the same OST off of
293 * the list, so we don't get multiple threads blocked and/or
294 * doing upcalls on the same OST in case of failure. */
295 spin_lock(&lcm->lcm_llcd_lock);
296 if (!list_empty(sending_list)) {
/* Take the head llcd; its import selects which peers' llcds
 * we batch onto our private lcd_llcd_list. */
297 list_move_tail(sending_list->next,
298 &lcd->lcd_llcd_list);
299 llcd = list_entry(lcd->lcd_llcd_list.next,
300 typeof(*llcd), llcd_list);
301 LASSERT(llcd->llcd_lcm == lcm);
302 import = llcd->llcd_ctxt->loc_imp;
/* Collect every other llcd bound for the same import. */
304 list_for_each_entry_safe(llcd, n, sending_list, llcd_list) {
305 LASSERT(llcd->llcd_lcm == lcm);
306 if (import == llcd->llcd_ctxt->loc_imp)
307 list_move_tail(&llcd->llcd_list,
308 &lcd->lcd_llcd_list);
/* Also drain matching entries from the resend list, unless that
 * is already the list we are sending from. */
310 if (sending_list != &lcm->lcm_llcd_resend) {
311 list_for_each_entry_safe(llcd, n, &lcm->lcm_llcd_resend,
313 LASSERT(llcd->llcd_lcm == lcm);
314 if (import == llcd->llcd_ctxt->loc_imp)
315 list_move_tail(&llcd->llcd_list,
316 &lcd->lcd_llcd_list);
319 spin_unlock(&lcm->lcm_llcd_lock);
321 /* We are the only one manipulating our local list - no lock */
322 list_for_each_entry_safe(llcd,n, &lcd->lcd_llcd_list,llcd_list){
323 char *bufs[1] = {(char *)llcd->llcd_cookies};
325 list_del(&llcd->llcd_list);
326 if (llcd->llcd_cookiebytes == 0) {
327 CDEBUG(D_HA, "put empty llcd %p:%p\n",
328 llcd, llcd->llcd_ctxt);
/* Re-check the import under loc_sem before each RPC; it may
 * have been torn down since the llcd was queued. */
333 down(&llcd->llcd_ctxt->loc_sem);
334 if (llcd->llcd_ctxt->loc_imp == NULL) {
335 up(&llcd->llcd_ctxt->loc_sem);
336 CWARN("import will be destroyed, put "
337 "llcd %p:%p\n", llcd, llcd->llcd_ctxt);
341 up(&llcd->llcd_ctxt->loc_sem);
/* Build the cancel RPC carrying the buffered cookies. */
343 request = ptlrpc_prep_req(import, OBD_LOG_CANCEL, 1,
344 &llcd->llcd_cookiebytes,
347 if (request == NULL) {
349 CERROR("error preparing commit: rc %d\n", rc);
/* Could not allocate the RPC: push our whole batch back
 * onto the master's resend list for a later attempt. */
351 spin_lock(&lcm->lcm_llcd_lock);
352 list_splice(&lcd->lcd_llcd_list,
353 &lcm->lcm_llcd_resend);
354 INIT_LIST_HEAD(&lcd->lcd_llcd_list);
355 spin_unlock(&lcm->lcm_llcd_lock);
/* Reply carries no body, only status. */
359 request->rq_replen = lustre_msg_size(0, NULL);
360 down(&llcd->llcd_ctxt->loc_sem);
361 if (llcd->llcd_ctxt->loc_imp == NULL) {
362 up(&llcd->llcd_ctxt->loc_sem);
363 CWARN("import will be destroyed, put "
364 "llcd %p:%p\n", llcd, llcd->llcd_ctxt);
366 ptlrpc_req_finished(request);
369 up(&llcd->llcd_ctxt->loc_sem);
370 rc = ptlrpc_queue_wait(request);
371 ptlrpc_req_finished(request);
373 /* If the RPC failed, we put this and the remaining
374 * messages onto the resend list for another time. */
380 #if 0 /* FIXME just put llcd, not put it on resend list */
381 spin_lock(&lcm->lcm_llcd_lock);
382 list_splice(&lcd->lcd_llcd_list, &lcm->lcm_llcd_resend);
383 if (++llcd->llcd_tries < 5) {
384 CERROR("commit %p failed on attempt %d: rc %d\n",
385 llcd, llcd->llcd_tries, rc);
387 list_add_tail(&llcd->llcd_list,
388 &lcm->lcm_llcd_resend);
389 spin_unlock(&lcm->lcm_llcd_lock);
391 spin_unlock(&lcm->lcm_llcd_lock);
393 CERROR("commit %p:%p drop %d cookies: rc %d\n",
394 llcd, llcd->llcd_ctxt,
395 (int)(llcd->llcd_cookiebytes /
396 sizeof(*llcd->llcd_cookies)), rc);
/* Prefer draining the resend list before sleeping again. */
405 sending_list = &lcm->lcm_llcd_resend;
406 if (!list_empty(sending_list))
411 /* If we are force exiting, just drop all of the cookies. */
412 if (lcm->lcm_flags & LLOG_LCM_FL_EXIT_FORCE) {
413 spin_lock(&lcm->lcm_llcd_lock);
414 list_splice(&lcm->lcm_llcd_pending, &lcd->lcd_llcd_list);
415 list_splice(&lcm->lcm_llcd_resend, &lcd->lcd_llcd_list);
416 list_splice(&lcm->lcm_llcd_free, &lcd->lcd_llcd_list);
417 spin_unlock(&lcm->lcm_llcd_lock);
419 list_for_each_entry_safe(llcd, n, &lcd->lcd_llcd_list,llcd_list)
/* Thread teardown: unlink our daemon record and free it. */
423 spin_lock(&lcm->lcm_thread_lock);
424 list_del(&lcd->lcd_lcm_list);
425 spin_unlock(&lcm->lcm_thread_lock);
426 OBD_FREE(lcd, sizeof(*lcd));
428 CDEBUG(D_HA, "%s exiting\n", current->comm);
430 spin_lock(&lcm->lcm_thread_lock);
431 atomic_dec(&lcm->lcm_thread_total);
432 spin_unlock(&lcm->lcm_thread_lock);
/* Wake llog_cleanup_commit_master(), which waits for total == 0. */
433 wake_up(&lcm->lcm_waitq);
/*
 * llog_start_commit_thread(): spawn one more log_commit_thread daemon,
 * capped at lcm->lcm_thread_max concurrent threads.  CLONE_VM|CLONE_FILES
 * keeps the child sharing the caller's address space and file table.
 *
 * NOTE(review): the early-return when already at the cap and the final
 * RETURN are in lines elided from this excerpt.
 */
438 int llog_start_commit_thread(void)
443 if (atomic_read(&lcm->lcm_thread_total) >= lcm->lcm_thread_max)
446 rc = kernel_thread(log_commit_thread, lcm, CLONE_VM | CLONE_FILES);
448 CERROR("error starting thread #%d: %d\n",
449 atomic_read(&lcm->lcm_thread_total), rc);
455 EXPORT_SYMBOL(llog_start_commit_thread);
/*
 * Argument bundle handed from llog_recovery_generic() to the spawned
 * log_process_thread(); llpa_sem serializes producers of these fields.
 * NOTE(review): further members (llpa_cb, llpa_arg) and the instance
 * name (llpa) are referenced elsewhere in this file but their
 * declarations are in lines elided from this excerpt.
 */
457 static struct llog_process_args {
458 struct semaphore llpa_sem;
459 struct llog_ctxt *llpa_ctxt;
/*
 * llog_init_commit_master(): one-time initialization of the global
 * llog_commit_master: thread bookkeeping lists/lock, the llcd free,
 * pending and resend lists with their lock and counters, thread limits,
 * and the llog_process_args semaphore.
 *
 * NOTE(review): lcm_llcd_maxfree initialization and the return statement
 * are in lines elided from this excerpt.
 */
464 int llog_init_commit_master(void)
466 INIT_LIST_HEAD(&lcm->lcm_thread_busy);
467 INIT_LIST_HEAD(&lcm->lcm_thread_idle);
468 spin_lock_init(&lcm->lcm_thread_lock);
469 atomic_set(&lcm->lcm_thread_numidle, 0);
470 init_waitqueue_head(&lcm->lcm_waitq);
471 INIT_LIST_HEAD(&lcm->lcm_llcd_pending);
472 INIT_LIST_HEAD(&lcm->lcm_llcd_resend);
473 INIT_LIST_HEAD(&lcm->lcm_llcd_free);
474 spin_lock_init(&lcm->lcm_llcd_lock);
475 atomic_set(&lcm->lcm_llcd_numfree, 0);
476 lcm->lcm_llcd_minfree = 0;
477 lcm->lcm_thread_max = 5;
478 /* FIXME initialize semaphore for llog_process_args */
/* Binary semaphore guarding the shared llpa argument block. */
479 sema_init(&llpa.llpa_sem, 1);
/*
 * llog_cleanup_commit_master(): request shutdown of all commit threads.
 * Sets LLOG_LCM_FL_EXIT (and, on force, LLOG_LCM_FL_EXIT_FORCE so queued
 * cookies are dropped rather than sent), wakes every thread, then waits
 * until the thread count drains to zero.
 *
 * NOTE(review): the force-flag conditional and the return are in lines
 * elided from this excerpt.
 */
483 int llog_cleanup_commit_master(int force)
485 lcm->lcm_flags |= LLOG_LCM_FL_EXIT;
487 lcm->lcm_flags |= LLOG_LCM_FL_EXIT_FORCE;
/* Wake all sleeping commit threads so they observe the exit flags. */
488 wake_up(&lcm->lcm_waitq);
490 wait_event_interruptible(lcm->lcm_waitq,
491 atomic_read(&lcm->lcm_thread_total) == 0);
/*
 * log_process_thread(): kernel thread that replays one catalog llog for
 * recovery.  Copies its arguments out of the shared llpa block, opens the
 * catalog identified by the logid, runs the supplied callback over every
 * record via llog_cat_process(), force-syncs any llcd produced, and
 * closes the catalog.
 *
 * NOTE(review): argument copy-out of llpa_arg/llpa_cb and the semaphore
 * release, plus the out/release labels, are in lines elided from this
 * excerpt.
 */
495 static int log_process_thread(void *args)
497 struct llog_process_args *data = args;
498 struct llog_ctxt *ctxt = data->llpa_ctxt;
499 void *cb = data->llpa_cb;
/* Copy the logid by value before the shared llpa block is reused. */
500 struct llog_logid logid = *(struct llog_logid *)(data->llpa_arg);
501 struct llog_handle *llh = NULL;
508 ptlrpc_daemonize(); /* thread does IO to log files */
509 THREAD_NAME(current->comm, sizeof(current->comm) - 1, "llog_process");
/* Block all signals while processing the log. */
511 SIGNAL_MASK_LOCK(current, flags);
512 sigfillset(&current->blocked);
514 SIGNAL_MASK_UNLOCK(current, flags);
/* Open the existing catalog log by its logid. */
517 rc = llog_create(ctxt, &llh, &logid, NULL);
519 CERROR("llog_create failed %d\n", rc);
522 rc = llog_init_handle(llh, LLOG_F_IS_CAT, NULL);
524 CERROR("llog_init_handle failed %d\n", rc);
/* LLOG_PROC_BREAK is the expected early-stop result, not an error. */
529 rc = llog_cat_process(llh, (llog_cb_t)cb, NULL);
530 if (rc != LLOG_PROC_BREAK)
531 CERROR("llog_cat_process failed %d\n", rc);
533 CWARN("no callback function for recovery\n");
536 CDEBUG(D_HA, "send llcd %p:%p forcibly after recovery\n",
537 ctxt->loc_llcd, ctxt);
/* Flush any cancel cookies accumulated during replay. */
538 llog_sync(ctxt, NULL);
540 rc = llog_cat_put(llh);
542 CERROR("llog_cat_put failed %d\n", rc);
/*
 * llog_recovery_generic(): fill the shared llpa argument block (holding
 * llpa_sem, which log_process_thread is expected to release) and spawn
 * log_process_thread to replay the log.
 *
 * NOTE(review): the llpa_arg assignment and the return path are in lines
 * elided from this excerpt.
 */
547 static int llog_recovery_generic(struct llog_ctxt *ctxt, void *handle,void *arg)
/* Hold the semaphore across the handoff so concurrent callers cannot
 * clobber the shared llpa fields before the thread copies them. */
552 down(&llpa.llpa_sem);
553 llpa.llpa_ctxt = ctxt;
554 llpa.llpa_cb = handle;
557 rc = kernel_thread(log_process_thread, &llpa, CLONE_VM | CLONE_FILES);
559 CERROR("error starting log_process_thread: %d\n", rc);
/* On success rc is the new thread's pid. */
561 CDEBUG(D_HA, "log_process_thread: %d\n", rc);
/*
 * llog_repl_connect(): (re)connect a replication llog context.  Flushes
 * any llcd left from a previous connection, records the new log
 * generation, attaches a fresh llcd to the context, and kicks off
 * recovery processing of the given catalog via the context's
 * llog_proc_cb.
 *
 * NOTE(review): the llcd_grab() call preceding the NULL check, the
 * up(&ctxt->loc_sem), and the return paths are in lines elided from this
 * excerpt.
 */
568 int llog_repl_connect(struct llog_ctxt *ctxt, int count,
569 struct llog_logid *logid, struct llog_gen *gen,
570 struct obd_uuid *uuid)
572 struct llog_canceld_ctxt *llcd;
576 /* send back llcd before recovery from llog */
577 if (ctxt->loc_llcd != NULL) {
578 CWARN("llcd %p:%p not empty\n", ctxt->loc_llcd, ctxt);
579 llog_sync(ctxt, NULL);
/* Record the peer's log generation under loc_sem. */
582 down(&ctxt->loc_sem);
583 ctxt->loc_gen = *gen;
586 CERROR("couldn't get an llcd\n");
/* Attach the freshly grabbed llcd to this context. */
590 llcd->llcd_ctxt = ctxt;
591 ctxt->loc_llcd = llcd;
/* Replay the catalog through the context's registered callback. */
594 rc = llog_recovery_generic(ctxt, ctxt->llog_proc_cb, logid);
596 CERROR("error recovery process: %d\n", rc);
600 EXPORT_SYMBOL(llog_repl_connect);
602 #else /* !__KERNEL__ */
604 int llog_obd_repl_cancel(struct llog_ctxt *ctxt,
605 struct lov_stripe_md *lsm, int count,
606 struct llog_cookie *cookies, int flags)