1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 only,
10 * as published by the Free Software Foundation.
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License version 2 for more details (a copy is included
16 * in the LICENSE file that accompanied this code).
18 * You should have received a copy of the GNU General Public License
19 * version 2 along with this program; If not, see [sun.com URL with a
22 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 * CA 95054 USA or visit www.sun.com if you need additional information or
29 * Copyright 2008 Sun Microsystems, Inc. All rights reserved
30 * Use is subject to license terms.
33 * This file is part of Lustre, http://www.lustre.org/
34 * Lustre is a trademark of Sun Microsystems, Inc.
36 * lustre/ptlrpc/recov_thread.c
38 * OST<->MDS recovery logging thread.
39 * Invariants in implementation:
40 * - we do not share logs among different OST<->MDS connections, so that
41 * if an OST or MDS fails it need only look at log(s) relevant to itself
43 * Author: Andreas Dilger <adilger@clusterfs.com>
46 #define DEBUG_SUBSYSTEM S_LOG
49 # define EXPORT_SYMTAB
53 # include <libcfs/libcfs.h>
55 # include <libcfs/list.h>
56 # include <liblustre.h>
59 #include <obd_class.h>
60 #include <lustre_commit_confd.h>
61 #include <obd_support.h>
62 #include <obd_class.h>
63 #include <lustre_net.h>
64 #include <lnet/types.h>
65 #include <libcfs/list.h>
66 #include <lustre_log.h>
67 #include "ptlrpc_internal.h"
71 /* Allocate new commit structs in case we do not have enough.
72 * Make the llcd size small enough that it fits into a single page when we
73 * are sending/receiving it. */
/* Returns 0 on success (negative errno otherwise); the new llcd is parked
 * on lcm's free list rather than returned to the caller.
 * NOTE(review): allocation/error lines are elided in this view. */
74 static int llcd_alloc(struct llog_commit_master *lcm)
76 struct llog_canceld_ctxt *llcd;
79 /* payload of lustre_msg V2 is bigger */
80 llcd_size = 4096 - lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, NULL);
/* allocation covers the llcd header up to llcd_cookies plus the cookie
 * payload (llcd_cookies is the trailing variable-length area) */
82 llcd_size + offsetof(struct llog_canceld_ctxt, llcd_cookies));
86 llcd->llcd_size = llcd_size;
/* publish the fresh llcd on the master's free list under lcm_llcd_lock */
89 spin_lock(&lcm->lcm_llcd_lock);
90 list_add(&llcd->llcd_list, &lcm->lcm_llcd_free);
91 atomic_inc(&lcm->lcm_llcd_numfree);
92 spin_unlock(&lcm->lcm_llcd_lock);
97 /* Get a free cookie struct from the list */
/* Detach one llcd from lcm's free list, allocating a new one when the list
 * is empty; resets llcd_cookiebytes so the caller starts with an empty
 * cookie buffer.  Returns NULL only if allocation fails. */
98 static struct llog_canceld_ctxt *llcd_grab(struct llog_commit_master *lcm)
100 struct llog_canceld_ctxt *llcd;
103 spin_lock(&lcm->lcm_llcd_lock);
104 if (list_empty(&lcm->lcm_llcd_free)) {
/* drop the lock across allocation; llcd_alloc() takes it itself */
105 spin_unlock(&lcm->lcm_llcd_lock);
106 if (llcd_alloc(lcm) < 0) {
107 CERROR("unable to allocate log commit data!\n");
110 /* check new llcd wasn't grabbed while lock dropped, b=7407 */
114 llcd = list_entry(lcm->lcm_llcd_free.next, typeof(*llcd), llcd_list);
115 list_del(&llcd->llcd_list);
116 atomic_dec(&lcm->lcm_llcd_numfree);
117 spin_unlock(&lcm->lcm_llcd_lock);
/* a freshly grabbed llcd carries no cookies yet */
119 llcd->llcd_cookiebytes = 0;
/* Release an llcd: drop its ctxt reference, then either free it outright
 * (when the free list already holds lcm_llcd_maxfree entries) or recycle
 * it back onto the master's free list. */
124 static void llcd_put(struct llog_canceld_ctxt *llcd)
126 struct llog_commit_master *lcm = llcd->llcd_lcm;
128 llog_ctxt_put(llcd->llcd_ctxt);
129 if (atomic_read(&lcm->lcm_llcd_numfree) >= lcm->lcm_llcd_maxfree) {
/* free the full allocation: header up to llcd_cookies + payload,
 * mirroring the size computed in llcd_alloc() */
130 int llcd_size = llcd->llcd_size +
131 offsetof(struct llog_canceld_ctxt, llcd_cookies);
132 OBD_FREE(llcd, llcd_size);
134 spin_lock(&lcm->lcm_llcd_lock);
135 list_add(&llcd->llcd_list, &lcm->lcm_llcd_free);
136 atomic_inc(&lcm->lcm_llcd_numfree);
137 spin_unlock(&lcm->lcm_llcd_lock);
141 /* Send some cookies to the appropriate target */
/* Queue the llcd on the master's pending list (skipped when the master is
 * already exiting) and wake one commit thread to process it. */
142 static void llcd_send(struct llog_canceld_ctxt *llcd)
144 if (!(llcd->llcd_lcm->lcm_flags & LLOG_LCM_FL_EXIT)) {
145 spin_lock(&llcd->llcd_lcm->lcm_llcd_lock);
146 list_add_tail(&llcd->llcd_list,
147 &llcd->llcd_lcm->lcm_llcd_pending);
148 spin_unlock(&llcd->llcd_lcm->lcm_llcd_lock);
/* wake exactly one waiter; one llcd only needs one thread */
150 cfs_waitq_signal_nr(&llcd->llcd_lcm->lcm_waitq, 1);
154 * Grab llcd and assign it to passed @ctxt. Also set up backward link
155 * and get ref on @ctxt.
157 static struct llog_canceld_ctxt *ctxt_llcd_grab(struct llog_ctxt *ctxt)
159 struct llog_canceld_ctxt *llcd;
/* caller must already hold ctxt->loc_sem: we publish into ctxt->loc_llcd */
161 LASSERT_SEM_LOCKED(&ctxt->loc_sem);
162 llcd = llcd_grab(ctxt->loc_lcm);
/* back-link llcd -> ctxt pins a ctxt reference, dropped in llcd_put() */
166 llcd->llcd_ctxt = llog_ctxt_get(ctxt);
167 ctxt->loc_llcd = llcd;
169 CDEBUG(D_RPCTRACE,"grab llcd %p:%p\n", ctxt->loc_llcd, ctxt);
174 * Put llcd in passed @ctxt. Set ->loc_llcd to NULL.
176 static void ctxt_llcd_put(struct llog_ctxt *ctxt)
178 mutex_down(&ctxt->loc_sem);
179 if (ctxt->loc_llcd != NULL) {
180 CDEBUG(D_RPCTRACE,"put llcd %p:%p\n", ctxt->loc_llcd, ctxt);
181 llcd_put(ctxt->loc_llcd);
182 ctxt->loc_llcd = NULL;
/* also drop the ctxt's import reference so the import can be destroyed */
185 class_import_put(ctxt->loc_imp);
186 ctxt->loc_imp = NULL;
188 mutex_up(&ctxt->loc_sem);
191 /* deleted objects have a commit callback that cancels the MDS
192 * log record for the deletion. The commit callback calls this
/* Accumulate cancel cookies into the ctxt's current llcd under loc_sem.
 * When the llcd has no room for another cookie, or OBD_LLOG_FL_SENDNOW is
 * set, the llcd is detached from the ctxt and handed to the commit threads.
 * @lsm is unused in the visible code; returns 0 or negative errno. */
195 int llog_obd_repl_cancel(struct llog_ctxt *ctxt,
196 struct lov_stripe_md *lsm, int count,
197 struct llog_cookie *cookies, int flags)
199 struct llog_canceld_ctxt *llcd;
205 mutex_down(&ctxt->loc_sem);
206 llcd = ctxt->loc_llcd;
/* no import means the peer is gone: nowhere to send cancels */
208 if (ctxt->loc_imp == NULL) {
209 CDEBUG(D_RPCTRACE, "no import for ctxt %p\n", ctxt);
213 if (count > 0 && cookies != NULL) {
/* lazily grab a new llcd after the previous one was sent/cleared */
215 llcd = ctxt_llcd_grab(ctxt);
217 CERROR("couldn't get an llcd - dropped "LPX64
219 cookies->lgc_lgl.lgl_oid,
220 cookies->lgc_lgl.lgl_ogen,
222 GOTO(out, rc = -ENOMEM);
/* append this cookie at the end of the llcd's payload buffer */
226 memcpy((char *)llcd->llcd_cookies + llcd->llcd_cookiebytes,
227 cookies, sizeof(*cookies));
228 llcd->llcd_cookiebytes += sizeof(*cookies);
/* nothing buffered, or caller doesn't force a flush: done for now */
230 if (llcd == NULL || !(flags & OBD_LLOG_FL_SENDNOW))
/* flush when full (no room for one more cookie) or when forced */
234 if ((llcd->llcd_size - llcd->llcd_cookiebytes) < sizeof(*cookies) ||
235 (flags & OBD_LLOG_FL_SENDNOW)) {
236 CDEBUG(D_RPCTRACE, "send llcd %p:%p\n", llcd, llcd->llcd_ctxt);
237 ctxt->loc_llcd = NULL;
241 mutex_up(&ctxt->loc_sem);
244 EXPORT_SYMBOL(llog_obd_repl_cancel);
/* Flush the ctxt's pending cancel cookies.  On a reverse-import disconnect
 * the llcd is released locally; otherwise a forced llog_cancel() pushes it
 * onto the sending list where the commit thread owns the refcount. */
246 int llog_obd_repl_sync(struct llog_ctxt *ctxt, struct obd_export *exp)
251 if (exp && (ctxt->loc_imp == exp->exp_imp_reverse)) {
252 CDEBUG(D_RPCTRACE,"reverse import disconnect\n");
254 * We put llcd because it is not going to sending list and
255 * thus, its refc will not be handled. We will handle it here.
260 * Sending cancel. This means that ctxt->loc_llcd will be
261 * put on sending list in llog_obd_repl_cancel() and in
262 * this case recovery thread will take care of it refc.
264 rc = llog_cancel(ctxt, NULL, 0, NULL, OBD_LLOG_FL_SENDNOW);
268 EXPORT_SYMBOL(llog_obd_repl_sync);
/* Abort the current commit attempt: log the error and splice this daemon's
 * private llcd list back onto the master's resend list for a later retry. */
270 static inline void stop_log_commit(struct llog_commit_master *lcm,
271 struct llog_commit_daemon *lcd,
274 CERROR("error preparing commit: rc %d\n", rc);
276 spin_lock(&lcm->lcm_llcd_lock);
277 list_splice_init(&lcd->lcd_llcd_list, &lcm->lcm_llcd_resend);
278 spin_unlock(&lcm->lcm_llcd_lock);
/* Main loop of a commit daemon thread: wait for pending llcds, batch those
 * bound for the same import, and ship each batch as an OBD_LOG_CANCEL RPC.
 * Failed or unprepared batches go back on the resend list via
 * stop_log_commit().  NOTE(review): this view of the source is elided
 * (several lines missing); comments describe only the visible logic. */
281 static int log_commit_thread(void *arg)
283 struct llog_commit_master *lcm = arg;
284 struct llog_commit_daemon *lcd;
285 struct llog_canceld_ctxt *llcd, *n;
286 struct obd_import *import = NULL;
289 OBD_ALLOC(lcd, sizeof(*lcd));
/* pick a unique thread name keyed on the current thread count */
293 spin_lock(&lcm->lcm_thread_lock);
294 THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1,
295 "ll_log_comt_%02d", atomic_read(&lcm->lcm_thread_total));
296 atomic_inc(&lcm->lcm_thread_total);
297 spin_unlock(&lcm->lcm_thread_lock);
299 ptlrpc_daemonize(cfs_curproc_comm()); /* thread never needs to do IO */
301 CFS_INIT_LIST_HEAD(&lcd->lcd_lcm_list);
302 CFS_INIT_LIST_HEAD(&lcd->lcd_llcd_list);
305 CDEBUG(D_HA, "%s started\n", cfs_curproc_comm());
307 struct ptlrpc_request *request;
308 struct list_head *sending_list;
/* drop the import reference held over from the previous iteration */
312 class_import_put(import);
315 /* If we do not have enough pages available, allocate some */
316 while (atomic_read(&lcm->lcm_llcd_numfree) <
317 lcm->lcm_llcd_minfree) {
318 if (llcd_alloc(lcm) < 0)
/* park ourselves on the idle list while waiting for work */
322 spin_lock(&lcm->lcm_thread_lock);
323 atomic_inc(&lcm->lcm_thread_numidle);
324 list_move(&lcd->lcd_lcm_list, &lcm->lcm_thread_idle);
325 spin_unlock(&lcm->lcm_thread_lock);
327 wait_event_interruptible(lcm->lcm_waitq,
328 !list_empty(&lcm->lcm_llcd_pending) ||
329 lcm->lcm_flags & LLOG_LCM_FL_EXIT);
331 /* If we are the last available thread, start a new one in case
332 * we get blocked on an RPC (nobody else will start a new one)*/
333 spin_lock(&lcm->lcm_thread_lock);
334 atomic_dec(&lcm->lcm_thread_numidle);
335 list_move(&lcd->lcd_lcm_list, &lcm->lcm_thread_busy);
336 spin_unlock(&lcm->lcm_thread_lock);
338 sending_list = &lcm->lcm_llcd_pending;
341 class_import_put(import);
/* on exit, stop caching llcds and refuse new threads so cleanup
 * can make progress */
343 if (lcm->lcm_flags & LLOG_LCM_FL_EXIT) {
344 lcm->lcm_llcd_maxfree = 0;
345 lcm->lcm_llcd_minfree = 0;
346 lcm->lcm_thread_max = 0;
348 if (list_empty(&lcm->lcm_llcd_pending) ||
349 lcm->lcm_flags & LLOG_LCM_FL_EXIT_FORCE)
/* pre-spawn a spare thread before we can block in an RPC below */
353 if (atomic_read(&lcm->lcm_thread_numidle) <= 1 &&
354 atomic_read(&lcm->lcm_thread_total) < lcm->lcm_thread_max) {
355 rc = llog_start_commit_thread(lcm);
357 CERROR("error starting thread: rc %d\n", rc);
360 /* Move all of the pending cancels from the same OST off of
361 * the list, so we don't get multiple threads blocked and/or
362 * doing upcalls on the same OST in case of failure. */
363 spin_lock(&lcm->lcm_llcd_lock);
364 if (!list_empty(sending_list)) {
365 list_move_tail(sending_list->next,
366 &lcd->lcd_llcd_list);
367 llcd = list_entry(lcd->lcd_llcd_list.next,
368 typeof(*llcd), llcd_list);
369 LASSERT(llcd->llcd_lcm == lcm);
/* hold an import ref for the duration of this batch */
370 import = llcd->llcd_ctxt->loc_imp;
372 class_import_get(import);
/* gather every other llcd bound for the same import */
374 list_for_each_entry_safe(llcd, n, sending_list, llcd_list) {
375 LASSERT(llcd->llcd_lcm == lcm);
376 if (import == llcd->llcd_ctxt->loc_imp)
377 list_move_tail(&llcd->llcd_list,
378 &lcd->lcd_llcd_list);
/* also pick up matching entries waiting on the resend list */
380 if (sending_list != &lcm->lcm_llcd_resend) {
381 list_for_each_entry_safe(llcd, n, &lcm->lcm_llcd_resend,
383 LASSERT(llcd->llcd_lcm == lcm);
384 if (import == llcd->llcd_ctxt->loc_imp)
385 list_move_tail(&llcd->llcd_list,
386 &lcd->lcd_llcd_list);
389 spin_unlock(&lcm->lcm_llcd_lock);
391 /* We are the only one manipulating our local list - no lock */
392 list_for_each_entry_safe(llcd,n, &lcd->lcd_llcd_list,llcd_list){
393 char *bufs[2] = { NULL, (char *)llcd->llcd_cookies };
395 list_del(&llcd->llcd_list);
/* nothing to cancel: just release the empty llcd */
396 if (llcd->llcd_cookiebytes == 0) {
397 CDEBUG(D_RPCTRACE, "put empty llcd %p:%p\n",
398 llcd, llcd->llcd_ctxt);
/* re-check under loc_sem that the ctxt's import is still alive */
403 mutex_down(&llcd->llcd_ctxt->loc_sem);
404 if (llcd->llcd_ctxt->loc_imp == NULL) {
405 mutex_up(&llcd->llcd_ctxt->loc_sem);
406 CWARN("import will be destroyed, put "
407 "llcd %p:%p\n", llcd, llcd->llcd_ctxt);
411 mutex_up(&llcd->llcd_ctxt->loc_sem);
/* defend against a freed/poisoned import structure */
413 if (!import || (import == LP_POISON) ||
414 (import->imp_client == LP_POISON)) {
415 CERROR("No import %p (llcd=%p, ctxt=%p)\n",
416 import, llcd, llcd->llcd_ctxt);
421 OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_RECOV, 10);
/* build the OBD_LOG_CANCEL request carrying the cookie buffer */
423 request = ptlrpc_request_alloc(import, &RQF_LOG_CANCEL);
424 if (request == NULL) {
426 stop_log_commit(lcm, lcd, rc);
430 req_capsule_set_size(&request->rq_pill, &RMF_LOGCOOKIES,
431 RCL_CLIENT,llcd->llcd_cookiebytes);
433 rc = ptlrpc_request_bufs_pack(request,
435 OBD_LOG_CANCEL, bufs,
438 ptlrpc_request_free(request);
439 stop_log_commit(lcm, lcd, rc);
444 request->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
445 request->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
446 ptlrpc_at_set_req_timeout(request);
448 ptlrpc_request_set_replen(request);
/* the import may have died while we were packing the request */
449 mutex_down(&llcd->llcd_ctxt->loc_sem);
450 if (llcd->llcd_ctxt->loc_imp == NULL) {
451 mutex_up(&llcd->llcd_ctxt->loc_sem);
452 CWARN("import will be destroyed, put "
453 "llcd %p:%p\n", llcd, llcd->llcd_ctxt);
455 ptlrpc_req_finished(request);
458 mutex_up(&llcd->llcd_ctxt->loc_sem);
/* synchronous send: blocking here is why we pre-spawned a spare */
459 rc = ptlrpc_queue_wait(request);
460 ptlrpc_req_finished(request);
462 /* If the RPC failed, we put this and the remaining
463 * messages onto the resend list for another time. */
469 CERROR("commit %p:%p drop %d cookies: rc %d\n",
470 llcd, llcd->llcd_ctxt,
471 (int)(llcd->llcd_cookiebytes /
472 sizeof(*llcd->llcd_cookies)), rc);
/* prefer draining the resend list before new pending work */
477 sending_list = &lcm->lcm_llcd_resend;
478 if (!list_empty(sending_list))
484 class_import_put(import);
486 /* If we are force exiting, just drop all of the cookies. */
487 if (lcm->lcm_flags & LLOG_LCM_FL_EXIT_FORCE) {
488 spin_lock(&lcm->lcm_llcd_lock);
489 list_splice_init(&lcm->lcm_llcd_pending, &lcd->lcd_llcd_list);
490 list_splice_init(&lcm->lcm_llcd_resend, &lcd->lcd_llcd_list);
491 list_splice_init(&lcm->lcm_llcd_free, &lcd->lcd_llcd_list);
492 spin_unlock(&lcm->lcm_llcd_lock);
494 list_for_each_entry_safe(llcd, n, &lcd->lcd_llcd_list,llcd_list)
/* this daemon is done: unlink and free its descriptor */
498 spin_lock(&lcm->lcm_thread_lock);
499 list_del(&lcd->lcd_lcm_list);
500 spin_unlock(&lcm->lcm_thread_lock);
501 OBD_FREE(lcd, sizeof(*lcd));
503 CDEBUG(D_HA, "%s exiting\n", cfs_curproc_comm());
505 spin_lock(&lcm->lcm_thread_lock);
506 atomic_dec(&lcm->lcm_thread_total);
507 spin_unlock(&lcm->lcm_thread_lock);
/* wake llog_cleanup_commit_master(), which waits for thread_total == 0 */
508 cfs_waitq_signal(&lcm->lcm_waitq);
/* Spawn one more commit daemon unless lcm_thread_max is already reached.
 * Returns 0 when at the cap or on success; negative rc on spawn failure. */
513 int llog_start_commit_thread(struct llog_commit_master *lcm)
518 if (atomic_read(&lcm->lcm_thread_total) >= lcm->lcm_thread_max)
521 rc = cfs_kernel_thread(log_commit_thread, lcm, CLONE_VM | CLONE_FILES);
523 CERROR("error starting thread #%d: %d\n",
524 atomic_read(&lcm->lcm_thread_total), rc);
530 EXPORT_SYMBOL(llog_start_commit_thread);
/* Argument block handed to log_process_thread(); llpa_sem serializes use
 * of this shared structure until the thread has copied the fields out.
 * NOTE(review): remaining members and the variable name are elided here. */
532 static struct llog_process_args {
533 struct semaphore llpa_sem;
534 struct llog_ctxt *llpa_ctxt;
/* Initialize every list, lock, counter and waitqueue of a commit master,
 * and cap the thread pool at 5 commit daemons.  Also initializes the
 * global llog_process_args semaphore. */
539 int llog_init_commit_master(struct llog_commit_master *lcm)
541 CFS_INIT_LIST_HEAD(&lcm->lcm_thread_busy);
542 CFS_INIT_LIST_HEAD(&lcm->lcm_thread_idle);
543 spin_lock_init(&lcm->lcm_thread_lock);
544 atomic_set(&lcm->lcm_thread_numidle, 0);
545 cfs_waitq_init(&lcm->lcm_waitq);
546 CFS_INIT_LIST_HEAD(&lcm->lcm_llcd_pending);
547 CFS_INIT_LIST_HEAD(&lcm->lcm_llcd_resend);
548 CFS_INIT_LIST_HEAD(&lcm->lcm_llcd_free);
549 spin_lock_init(&lcm->lcm_llcd_lock);
550 atomic_set(&lcm->lcm_llcd_numfree, 0);
551 lcm->lcm_llcd_minfree = 0;
552 lcm->lcm_thread_max = 5;
553 /* FIXME initialize semaphore for llog_process_args */
554 sema_init(&llpa.llpa_sem, 1);
/* Ask the commit daemons to exit, wake them all, and wait until
 * lcm_thread_total drops to zero.  NOTE(review): the condition selecting
 * EXIT vs EXIT_FORCE is elided in this view — confirm against full source. */
559 int llog_cleanup_commit_master(struct llog_commit_master *lcm,
562 lcm->lcm_flags |= LLOG_LCM_FL_EXIT;
564 lcm->lcm_flags |= LLOG_LCM_FL_EXIT_FORCE;
565 cfs_waitq_signal(&lcm->lcm_waitq);
567 wait_event_interruptible(lcm->lcm_waitq,
568 atomic_read(&lcm->lcm_thread_total) == 0);
/* Worker thread that replays a catalog llog: opens the log identified by
 * llpa_arg (a llog_logid), runs llpa_cb over it via llog_cat_process(),
 * then forces out any remaining llcd with llog_sync(). */
573 static int log_process_thread(void *args)
575 struct llog_process_args *data = args;
576 struct llog_ctxt *ctxt = data->llpa_ctxt;
577 void *cb = data->llpa_cb;
578 struct llog_logid logid = *(struct llog_logid *)(data->llpa_arg);
579 struct llog_handle *llh = NULL;
/* args are copied out of the shared llpa above: release it for reuse */
583 mutex_up(&data->llpa_sem);
584 ptlrpc_daemonize("llog_process"); /* thread does IO to log files */
586 rc = llog_create(ctxt, &llh, &logid, NULL);
588 CERROR("llog_create failed %d\n", rc);
591 rc = llog_init_handle(llh, LLOG_F_IS_CAT, NULL);
593 CERROR("llog_init_handle failed %d\n", rc);
594 GOTO(release_llh, rc);
598 rc = llog_cat_process(llh, (llog_cb_t)cb, NULL);
/* LLOG_PROC_BREAK is the expected "stop" result, not an error */
599 if (rc != LLOG_PROC_BREAK)
600 CERROR("llog_cat_process failed %d\n", rc);
602 CWARN("no callback function for recovery\n");
605 CDEBUG(D_HA, "send llcd %p:%p forcibly after recovery\n",
606 ctxt->loc_llcd, ctxt);
607 llog_sync(ctxt, NULL);
610 rc = llog_cat_put(llh);
612 CERROR("llog_cat_put failed %d\n", rc);
/* Hand @handle (callback) and @arg to a fresh log_process_thread via the
 * global llpa.  llpa_sem is taken here and released by the thread once it
 * has copied the arguments out. */
618 static int llog_recovery_generic(struct llog_ctxt *ctxt, void *handle,void *arg)
620 struct obd_device *obd = ctxt->loc_obd;
/* no recovery processing once the obd is shutting down */
624 if (obd->obd_stopping)
627 mutex_down(&llpa.llpa_sem);
628 llpa.llpa_cb = handle;
/* the spawned thread inherits this ctxt reference */
630 llpa.llpa_ctxt = llog_ctxt_get(ctxt);
631 if (!llpa.llpa_ctxt) {
635 rc = cfs_kernel_thread(log_process_thread, &llpa, CLONE_VM | CLONE_FILES);
638 CERROR("error starting log_process_thread: %d\n", rc);
640 CDEBUG(D_HA, "log_process_thread: %d\n", rc);
/* Called on (re)connect: flush any stale llcd left on the ctxt, record the
 * new llog generation, grab a fresh llcd, then kick off catalog recovery
 * processing for @logid.  @count and @uuid are unused in the visible code. */
647 int llog_repl_connect(struct llog_ctxt *ctxt, int count,
648 struct llog_logid *logid, struct llog_gen *gen,
649 struct obd_uuid *uuid)
651 struct llog_canceld_ctxt *llcd;
655 /* send back llcd before recovery from llog */
656 if (ctxt->loc_llcd != NULL) {
657 CWARN("llcd %p:%p not empty\n", ctxt->loc_llcd, ctxt);
658 llog_sync(ctxt, NULL);
661 mutex_down(&ctxt->loc_sem);
662 ctxt->loc_gen = *gen;
663 llcd = ctxt_llcd_grab(ctxt);
665 CERROR("couldn't get an llcd\n");
666 mutex_up(&ctxt->loc_sem);
669 mutex_up(&ctxt->loc_sem);
/* replay the catalog identified by @logid using the ctxt's callback */
671 rc = llog_recovery_generic(ctxt, ctxt->llog_proc_cb, logid);
674 CERROR("error recovery process: %d\n", rc);
678 EXPORT_SYMBOL(llog_repl_connect);
680 #else /* !__KERNEL__ */
682 int llog_obd_repl_cancel(struct llog_ctxt *ctxt,
683 struct lov_stripe_md *lsm, int count,
684 struct llog_cookie *cookies, int flags)