1 /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
2 * vim:expandtab:shiftwidth=8:tabstop=8:
4 * Copyright (C) 2003 Cluster File Systems, Inc.
5 * Author: Andreas Dilger <adilger@clusterfs.com>
7 * This file is part of the Lustre file system, http://www.lustre.org
8 * Lustre is a trademark of Cluster File Systems, Inc.
10 * You may have signed or agreed to another license before downloading
11 * this software. If so, you are bound by the terms and conditions
12 * of that agreement, and the following does not apply to you. See the
13 * LICENSE file included with this distribution for more information.
15 * If you did not agree to a different license, then this copy of Lustre
16 * is open source software; you can redistribute it and/or modify it
17 * under the terms of version 2 of the GNU General Public License as
18 * published by the Free Software Foundation.
20 * In either case, Lustre is distributed in the hope that it will be
21 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty
22 * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 * license text for more details.
25 * OST<->MDS recovery logging thread.
27 * Invariants in implementation:
28 * - we do not share logs among different OST<->MDS connections, so that
29 * if an OST or MDS fails it need only look at log(s) relevant to itself
32 #define DEBUG_SUBSYSTEM S_LOG
35 # define EXPORT_SYMTAB
39 # include <libcfs/libcfs.h>
41 # include <libcfs/list.h>
42 # include <liblustre.h>
45 #include <obd_class.h>
46 #include <lustre_commit_confd.h>
47 #include <obd_support.h>
48 #include <obd_class.h>
49 #include <lustre_net.h>
50 #include <lnet/types.h>
51 #include <libcfs/list.h>
52 #include <lustre_log.h>
53 #include "ptlrpc_internal.h"
57 /* Allocate new commit structs in case we do not have enough.
58 * Make the llcd size small enough that it fits into a single page when we
59 * are sending/receiving it.
 *
 * The cookie area is sized as 4096 bytes minus the value returned by
 * lustre_msg_size() for a V2 message (called with a count of 1 and no
 * length array), and that size is recorded in llcd->llcd_size. The new
 * llcd is linked onto lcm->lcm_llcd_free and lcm_llcd_numfree is
 * incremented inside the lcm_llcd_lock critical section.
 *
 * Callers in this file treat a negative return value as failure. */
60 static int llcd_alloc(struct llog_commit_master *lcm)
62 struct llog_canceld_ctxt *llcd;
65 /* payload of lustre_msg V2 is bigger */
66 llcd_size = 4096 - lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, NULL);
/* size expression: cookie area plus the struct header that precedes
 * the llcd_cookies member */
68 llcd_size + offsetof(struct llog_canceld_ctxt, llcd_cookies));
72 llcd->llcd_size = llcd_size;
/* publish the new llcd on the free list */
75 spin_lock(&lcm->lcm_llcd_lock);
76 list_add(&llcd->llcd_list, &lcm->lcm_llcd_free);
77 atomic_inc(&lcm->lcm_llcd_numfree);
78 spin_unlock(&lcm->lcm_llcd_lock);
83 /* Get a free cookie struct from the list.
 *
 * When lcm->lcm_llcd_free is empty, lcm_llcd_lock is dropped and
 * llcd_alloc() is called to add an entry; an allocation failure is
 * reported with CERROR. The first entry of the free list is then
 * unlinked, lcm_llcd_numfree is decremented, and the llcd's
 * llcd_cookiebytes counter is reset to 0. */
84 static struct llog_canceld_ctxt *llcd_grab(struct llog_commit_master *lcm)
86 struct llog_canceld_ctxt *llcd;
89 spin_lock(&lcm->lcm_llcd_lock);
90 if (list_empty(&lcm->lcm_llcd_free)) {
/* llcd_alloc() takes lcm_llcd_lock itself, so release it first */
91 spin_unlock(&lcm->lcm_llcd_lock);
92 if (llcd_alloc(lcm) < 0) {
93 CERROR("unable to allocate log commit data!\n");
96 /* check new llcd wasn't grabbed while lock dropped, b=7407 */
/* take the entry at the head of the free list */
100 llcd = list_entry(lcm->lcm_llcd_free.next, typeof(*llcd), llcd_list);
101 list_del(&llcd->llcd_list);
102 atomic_dec(&lcm->lcm_llcd_numfree);
103 spin_unlock(&lcm->lcm_llcd_lock);
/* start with an empty cookie buffer */
105 llcd->llcd_cookiebytes = 0;
/* Release an llcd.
 *
 * Drops the llcd's reference on llcd->llcd_ctxt via llog_ctxt_put().
 * If lcm_llcd_numfree has already reached lcm_llcd_maxfree the llcd is
 * freed (cookie area plus struct header); the remaining statements link
 * it back onto lcm->lcm_llcd_free and bump lcm_llcd_numfree under
 * lcm_llcd_lock.
 * NOTE(review): presumably the re-link is the else branch of the
 * maxfree check - confirm. */
110 static void llcd_put(struct llog_canceld_ctxt *llcd)
112 struct llog_commit_master *lcm = llcd->llcd_lcm;
114 llog_ctxt_put(llcd->llcd_ctxt);
/* the counter is read here without taking lcm_llcd_lock */
115 if (atomic_read(&lcm->lcm_llcd_numfree) >= lcm->lcm_llcd_maxfree) {
/* same size expression as used when the llcd was allocated */
116 int llcd_size = llcd->llcd_size +
117 offsetof(struct llog_canceld_ctxt, llcd_cookies);
118 OBD_FREE(llcd, llcd_size);
120 spin_lock(&lcm->lcm_llcd_lock);
121 list_add(&llcd->llcd_list, &lcm->lcm_llcd_free);
122 atomic_inc(&lcm->lcm_llcd_numfree);
123 spin_unlock(&lcm->lcm_llcd_lock);
127 /* Send some cookies to the appropriate target.
 *
 * Nothing is transmitted here: unless LLOG_LCM_FL_EXIT is set in
 * lcm_flags, the llcd is appended to lcm_llcd_pending under
 * lcm_llcd_lock, and cfs_waitq_signal_nr() wakes one waiter on
 * lcm_waitq (log_commit_thread sleeps on that queue). */
128 static void llcd_send(struct llog_canceld_ctxt *llcd)
130 if (!(llcd->llcd_lcm->lcm_flags & LLOG_LCM_FL_EXIT)) {
131 spin_lock(&llcd->llcd_lcm->lcm_llcd_lock);
132 list_add_tail(&llcd->llcd_list,
133 &llcd->llcd_lcm->lcm_llcd_pending);
134 spin_unlock(&llcd->llcd_lcm->lcm_llcd_lock);
136 cfs_waitq_signal_nr(&llcd->llcd_lcm->lcm_waitq, 1);
140 * Grab llcd and assign it to passed @ctxt. Also set up backward link
141 * and get ref on @ctxt.
/* Attach a fresh llcd to a log context.
 *
 * The caller must hold ctxt->loc_sem (asserted below). An llcd is taken
 * from ctxt->loc_lcm with llcd_grab(); the llcd then stores the result
 * of llog_ctxt_get(ctxt) in llcd_ctxt, and the context records the llcd
 * in ctxt->loc_llcd. */
143 static struct llog_canceld_ctxt *ctxt_llcd_grab(struct llog_ctxt *ctxt)
145 struct llog_canceld_ctxt *llcd;
147 LASSERT_SEM_LOCKED(&ctxt->loc_sem);
148 llcd = llcd_grab(ctxt->loc_lcm);
/* link llcd -> ctxt and ctxt -> llcd */
152 llcd->llcd_ctxt = llog_ctxt_get(ctxt);
153 ctxt->loc_llcd = llcd;
155 CDEBUG(D_RPCTRACE,"grab llcd %p:%p\n", ctxt->loc_llcd, ctxt);
160 * Put llcd in passed @ctxt. Set ->loc_llcd to NULL.
/* Detach and release the llcd of a log context.
 *
 * Runs with ctxt->loc_sem held for the whole body: if an llcd is
 * attached it is released through llcd_put() and ctxt->loc_llcd is
 * cleared; the import reference is dropped with class_import_put() and
 * ctxt->loc_imp is cleared. */
162 static void ctxt_llcd_put(struct llog_ctxt *ctxt)
164 mutex_down(&ctxt->loc_sem);
165 if (ctxt->loc_llcd != NULL) {
166 CDEBUG(D_RPCTRACE,"put llcd %p:%p\n", ctxt->loc_llcd, ctxt);
167 llcd_put(ctxt->loc_llcd);
168 ctxt->loc_llcd = NULL;
170 class_import_put(ctxt->loc_imp);
171 ctxt->loc_imp = NULL;
172 mutex_up(&ctxt->loc_sem);
175 /* deleted objects have a commit callback that cancels the MDS
176 * log record for the deletion. The commit callback calls this
 * function. */
/* Queue a log cookie for cancellation on @ctxt.
 *
 * The body runs under ctxt->loc_sem. A context without an import
 * (ctxt->loc_imp == NULL) is noted at D_RPCTRACE level. When @count > 0
 * and @cookies is not NULL, one cookie (sizeof(*cookies) bytes) is
 * appended to the cookie buffer of the context's llcd; an llcd is
 * obtained through ctxt_llcd_grab() (presumably only when none is
 * attached yet - confirm), and failure to get one is reported and
 * yields rc = -ENOMEM. The llcd is detached from the context
 * (ctxt->loc_llcd = NULL) once fewer than sizeof(*cookies) bytes remain
 * in it or @flags contains OBD_LLOG_FL_SENDNOW. */
179 int llog_obd_repl_cancel(struct llog_ctxt *ctxt,
180 struct lov_stripe_md *lsm, int count,
181 struct llog_cookie *cookies, int flags)
183 struct llog_canceld_ctxt *llcd;
189 mutex_down(&ctxt->loc_sem);
190 llcd = ctxt->loc_llcd;
192 if (ctxt->loc_imp == NULL) {
193 CDEBUG(D_RPCTRACE, "no import for ctxt %p\n", ctxt);
197 if (count > 0 && cookies != NULL) {
199 llcd = ctxt_llcd_grab(ctxt);
201 CERROR("couldn't get an llcd - dropped "LPX64
203 cookies->lgc_lgl.lgl_oid,
204 cookies->lgc_lgl.lgl_ogen,
206 GOTO(out, rc = -ENOMEM);
/* append exactly one cookie at the current fill offset */
210 memcpy((char *)llcd->llcd_cookies + llcd->llcd_cookiebytes,
211 cookies, sizeof(*cookies));
212 llcd->llcd_cookiebytes += sizeof(*cookies);
214 if (llcd == NULL || !(flags & OBD_LLOG_FL_SENDNOW))
/* no room for another cookie, or the caller asked for an immediate send */
218 if ((llcd->llcd_size - llcd->llcd_cookiebytes) < sizeof(*cookies) ||
219 (flags & OBD_LLOG_FL_SENDNOW)) {
220 CDEBUG(D_RPCTRACE, "send llcd %p:%p\n", llcd, llcd->llcd_ctxt);
221 ctxt->loc_llcd = NULL;
225 mutex_up(&ctxt->loc_sem);
228 EXPORT_SYMBOL(llog_obd_repl_cancel);
/* Flush the llcd currently attached to @ctxt.
 *
 * If @exp is given and ctxt->loc_imp is that export's reverse import,
 * the situation is logged as a reverse import disconnect and the llcd
 * is dealt with locally, as the first note below explains. The other
 * path calls llog_cancel() with no cookies and OBD_LLOG_FL_SENDNOW and
 * keeps its result in rc. */
230 int llog_obd_repl_sync(struct llog_ctxt *ctxt, struct obd_export *exp)
235 if (exp && (ctxt->loc_imp == exp->exp_imp_reverse)) {
236 CDEBUG(D_RPCTRACE,"reverse import disconnect\n");
238 * We put llcd because it is not going to sending list and
239 * thus, its refc will not be handled. We will handle it here.
244 * Sending cancel. This means that ctxt->loc_llcd wil be
245 * put on sending list in llog_obd_repl_cancel() and in
246 * this case recovery thread will take care of it refc.
248 rc = llog_cancel(ctxt, NULL, 0, NULL, OBD_LLOG_FL_SENDNOW);
252 EXPORT_SYMBOL(llog_obd_repl_sync);
/* Error-path helper used by log_commit_thread when preparing an RPC
 * fails: reports rc with CERROR, then splices everything still on the
 * thread-private lcd->lcd_llcd_list onto lcm->lcm_llcd_resend under
 * lcm_llcd_lock, leaving the private list empty. */
254 static inline void stop_log_commit(struct llog_commit_master *lcm,
255 struct llog_commit_daemon *lcd,
258 CERROR("error preparing commit: rc %d\n", rc);
260 spin_lock(&lcm->lcm_llcd_lock);
261 list_splice_init(&lcd->lcd_llcd_list, &lcm->lcm_llcd_resend);
262 spin_unlock(&lcm->lcm_llcd_lock);
/* Body of a log commit thread; @arg is the struct llog_commit_master.
 *
 * The thread allocates its own llog_commit_daemon (lcd), names itself
 * "ll_log_comt_NN" from lcm_thread_total and increments that counter.
 * Its loop then:
 *  - tops up the free llcd pool while lcm_llcd_numfree is below
 *    lcm_llcd_minfree;
 *  - moves itself to lcm_thread_idle and sleeps on lcm_waitq until
 *    lcm_llcd_pending is non-empty or LLOG_LCM_FL_EXIT is set;
 *  - moves itself to lcm_thread_busy and, when it is (nearly) the last
 *    idle thread and lcm_thread_max allows, starts another thread;
 *  - under lcm_llcd_lock, gathers onto lcd->lcd_llcd_list every llcd
 *    that shares the import of the first entry on the sending list;
 *  - sends each gathered llcd as an OBD_LOG_CANCEL request.
 * On the way out it unlinks and frees the lcd, decrements
 * lcm_thread_total and signals lcm_waitq. */
265 static int log_commit_thread(void *arg)
267 struct llog_commit_master *lcm = arg;
268 struct llog_commit_daemon *lcd;
269 struct llog_canceld_ctxt *llcd, *n;
270 struct obd_import *import = NULL;
273 OBD_ALLOC(lcd, sizeof(*lcd));
/* the thread index used in the name is the pre-increment thread count */
277 spin_lock(&lcm->lcm_thread_lock);
278 THREAD_NAME(cfs_curproc_comm(), CFS_CURPROC_COMM_MAX - 1,
279 "ll_log_comt_%02d", atomic_read(&lcm->lcm_thread_total));
280 atomic_inc(&lcm->lcm_thread_total);
281 spin_unlock(&lcm->lcm_thread_lock);
283 ptlrpc_daemonize(cfs_curproc_comm()); /* thread never needs to do IO */
285 CFS_INIT_LIST_HEAD(&lcd->lcd_lcm_list);
286 CFS_INIT_LIST_HEAD(&lcd->lcd_llcd_list);
289 CDEBUG(D_HA, "%s started\n", cfs_curproc_comm());
291 struct ptlrpc_request *request;
292 struct list_head *sending_list;
296 class_import_put(import);
299 /* If we do not have enough pages available, allocate some */
300 while (atomic_read(&lcm->lcm_llcd_numfree) <
301 lcm->lcm_llcd_minfree) {
302 if (llcd_alloc(lcm) < 0)
/* register as idle before going to sleep */
306 spin_lock(&lcm->lcm_thread_lock);
307 atomic_inc(&lcm->lcm_thread_numidle);
308 list_move(&lcd->lcd_lcm_list, &lcm->lcm_thread_idle);
309 spin_unlock(&lcm->lcm_thread_lock);
311 wait_event_interruptible(lcm->lcm_waitq,
312 !list_empty(&lcm->lcm_llcd_pending) ||
313 lcm->lcm_flags & LLOG_LCM_FL_EXIT);
315 /* If we are the last available thread, start a new one in case
316 * we get blocked on an RPC (nobody else will start a new one)*/
317 spin_lock(&lcm->lcm_thread_lock);
318 atomic_dec(&lcm->lcm_thread_numidle);
319 list_move(&lcd->lcd_lcm_list, &lcm->lcm_thread_busy);
320 spin_unlock(&lcm->lcm_thread_lock);
322 sending_list = &lcm->lcm_llcd_pending;
325 class_import_put(import);
/* on shutdown, zero the pool and thread limits */
327 if (lcm->lcm_flags & LLOG_LCM_FL_EXIT) {
328 lcm->lcm_llcd_maxfree = 0;
329 lcm->lcm_llcd_minfree = 0;
330 lcm->lcm_thread_max = 0;
332 if (list_empty(&lcm->lcm_llcd_pending) ||
333 lcm->lcm_flags & LLOG_LCM_FL_EXIT_FORCE)
337 if (atomic_read(&lcm->lcm_thread_numidle) <= 1 &&
338 atomic_read(&lcm->lcm_thread_total) < lcm->lcm_thread_max) {
339 rc = llog_start_commit_thread(lcm);
341 CERROR("error starting thread: rc %d\n", rc);
344 /* Move all of the pending cancels from the same OST off of
345 * the list, so we don't get multiple threads blocked and/or
346 * doing upcalls on the same OST in case of failure. */
347 spin_lock(&lcm->lcm_llcd_lock);
348 if (!list_empty(sending_list)) {
/* the first entry decides which import this pass services */
349 list_move_tail(sending_list->next,
350 &lcd->lcd_llcd_list);
351 llcd = list_entry(lcd->lcd_llcd_list.next,
352 typeof(*llcd), llcd_list);
353 LASSERT(llcd->llcd_lcm == lcm);
354 import = llcd->llcd_ctxt->loc_imp;
356 class_import_get(import);
358 list_for_each_entry_safe(llcd, n, sending_list, llcd_list) {
359 LASSERT(llcd->llcd_lcm == lcm);
360 if (import == llcd->llcd_ctxt->loc_imp)
361 list_move_tail(&llcd->llcd_list,
362 &lcd->lcd_llcd_list);
/* also pick up matching entries waiting on the resend list */
364 if (sending_list != &lcm->lcm_llcd_resend) {
365 list_for_each_entry_safe(llcd, n, &lcm->lcm_llcd_resend,
367 LASSERT(llcd->llcd_lcm == lcm);
368 if (import == llcd->llcd_ctxt->loc_imp)
369 list_move_tail(&llcd->llcd_list,
370 &lcd->lcd_llcd_list);
373 spin_unlock(&lcm->lcm_llcd_lock);
375 /* We are the only one manipulating our local list - no lock */
376 list_for_each_entry_safe(llcd,n, &lcd->lcd_llcd_list,llcd_list){
/* bufs[1] points at the cookie payload of this llcd */
377 char *bufs[2] = { NULL, (char *)llcd->llcd_cookies };
379 list_del(&llcd->llcd_list);
380 if (llcd->llcd_cookiebytes == 0) {
381 CDEBUG(D_RPCTRACE, "put empty llcd %p:%p\n",
382 llcd, llcd->llcd_ctxt);
/* check under loc_sem that the context still has an import */
387 mutex_down(&llcd->llcd_ctxt->loc_sem);
388 if (llcd->llcd_ctxt->loc_imp == NULL) {
389 mutex_up(&llcd->llcd_ctxt->loc_sem);
390 CWARN("import will be destroyed, put "
391 "llcd %p:%p\n", llcd, llcd->llcd_ctxt);
395 mutex_up(&llcd->llcd_ctxt->loc_sem);
397 if (!import || (import == LP_POISON) ||
398 (import->imp_client == LP_POISON)) {
399 CERROR("No import %p (llcd=%p, ctxt=%p)\n",
400 import, llcd, llcd->llcd_ctxt);
405 OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_RECOV, 10);
407 request = ptlrpc_request_alloc(import, &RQF_LOG_CANCEL);
408 if (request == NULL) {
410 stop_log_commit(lcm, lcd, rc);
/* the cookie field of the request is sized to the bytes collected */
414 req_capsule_set_size(&request->rq_pill, &RMF_LOGCOOKIES,
415 RCL_CLIENT,llcd->llcd_cookiebytes);
417 rc = ptlrpc_request_bufs_pack(request,
419 OBD_LOG_CANCEL, bufs,
422 ptlrpc_request_free(request);
423 stop_log_commit(lcm, lcd, rc);
427 /* XXX FIXME bug 249, 5515 */
428 request->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
429 request->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
431 ptlrpc_request_set_replen(request);
/* second import check, immediately before the request is queued */
432 mutex_down(&llcd->llcd_ctxt->loc_sem);
433 if (llcd->llcd_ctxt->loc_imp == NULL) {
434 mutex_up(&llcd->llcd_ctxt->loc_sem);
435 CWARN("import will be destroyed, put "
436 "llcd %p:%p\n", llcd, llcd->llcd_ctxt);
438 ptlrpc_req_finished(request);
441 mutex_up(&llcd->llcd_ctxt->loc_sem);
442 rc = ptlrpc_queue_wait(request);
443 ptlrpc_req_finished(request);
445 /* If the RPC failed, we put this and the remaining
446 * messages onto the resend list for another time. */
452 CERROR("commit %p:%p drop %d cookies: rc %d\n",
453 llcd, llcd->llcd_ctxt,
454 (int)(llcd->llcd_cookiebytes /
455 sizeof(*llcd->llcd_cookies)), rc);
/* continue with whatever is waiting on the resend list */
460 sending_list = &lcm->lcm_llcd_resend;
461 if (!list_empty(sending_list))
467 class_import_put(import);
469 /* If we are force exiting, just drop all of the cookies. */
470 if (lcm->lcm_flags & LLOG_LCM_FL_EXIT_FORCE) {
471 spin_lock(&lcm->lcm_llcd_lock);
472 list_splice_init(&lcm->lcm_llcd_pending, &lcd->lcd_llcd_list);
473 list_splice_init(&lcm->lcm_llcd_resend, &lcd->lcd_llcd_list);
474 list_splice_init(&lcm->lcm_llcd_free, &lcd->lcd_llcd_list);
475 spin_unlock(&lcm->lcm_llcd_lock);
477 list_for_each_entry_safe(llcd, n, &lcd->lcd_llcd_list,llcd_list)
/* thread teardown: leave the busy/idle list and free the lcd */
481 spin_lock(&lcm->lcm_thread_lock);
482 list_del(&lcd->lcd_lcm_list);
483 spin_unlock(&lcm->lcm_thread_lock);
484 OBD_FREE(lcd, sizeof(*lcd));
486 CDEBUG(D_HA, "%s exiting\n", cfs_curproc_comm());
488 spin_lock(&lcm->lcm_thread_lock);
489 atomic_dec(&lcm->lcm_thread_total);
490 spin_unlock(&lcm->lcm_thread_lock);
/* llog_cleanup_commit_master() waits on this queue for the count to reach 0 */
491 cfs_waitq_signal(&lcm->lcm_waitq);
/* Start one more log_commit_thread for @lcm.
 *
 * lcm_thread_total is first compared against lcm_thread_max; the thread
 * is then created with cfs_kernel_thread() (CLONE_VM | CLONE_FILES) and
 * receives @lcm as its argument. A failure to create it is reported
 * together with the current thread count. */
496 int llog_start_commit_thread(struct llog_commit_master *lcm)
501 if (atomic_read(&lcm->lcm_thread_total) >= lcm->lcm_thread_max)
504 rc = cfs_kernel_thread(log_commit_thread, lcm, CLONE_VM | CLONE_FILES);
506 CERROR("error starting thread #%d: %d\n",
507 atomic_read(&lcm->lcm_thread_total), rc);
513 EXPORT_SYMBOL(llog_start_commit_thread);
/* Arguments passed from llog_recovery_generic() to log_process_thread().
 * One static instance (llpa) is shared by all callers. */
515 static struct llog_process_args {
/* taken before llpa is filled in, released by the thread once it has
 * copied the fields it needs */
516 struct semaphore llpa_sem;
/* context to recover; set from llog_ctxt_get() */
517 struct llog_ctxt *llpa_ctxt;
/* Initialise the commit master @lcm.
 *
 * Sets up the busy/idle thread lists and lcm_thread_lock, the wait
 * queue, the pending/resend/free llcd lists and lcm_llcd_lock, and
 * zeroes lcm_thread_numidle and lcm_llcd_numfree. lcm_llcd_minfree is
 * set to 0 and lcm_thread_max to 5. The file-wide llpa semaphore is
 * initialised here as well, with a count of 1. */
522 int llog_init_commit_master(struct llog_commit_master *lcm)
524 CFS_INIT_LIST_HEAD(&lcm->lcm_thread_busy);
525 CFS_INIT_LIST_HEAD(&lcm->lcm_thread_idle);
526 spin_lock_init(&lcm->lcm_thread_lock);
527 atomic_set(&lcm->lcm_thread_numidle, 0);
528 cfs_waitq_init(&lcm->lcm_waitq);
529 CFS_INIT_LIST_HEAD(&lcm->lcm_llcd_pending);
530 CFS_INIT_LIST_HEAD(&lcm->lcm_llcd_resend);
531 CFS_INIT_LIST_HEAD(&lcm->lcm_llcd_free);
532 spin_lock_init(&lcm->lcm_llcd_lock);
533 atomic_set(&lcm->lcm_llcd_numfree, 0);
534 lcm->lcm_llcd_minfree = 0;
535 lcm->lcm_thread_max = 5;
536 /* FIXME initialize semaphore for llog_process_args */
537 sema_init(&llpa.llpa_sem, 1);
540 EXPORT_SYMBOL(llog_init_commit_master);
/* Shut down the commit threads of @lcm.
 *
 * Sets LLOG_LCM_FL_EXIT in lcm_flags, signals lcm_waitq so sleeping
 * threads notice, and then waits (interruptibly) until lcm_thread_total
 * has dropped to 0.
 * NOTE(review): LLOG_LCM_FL_EXIT_FORCE is presumably set only for a
 * forced cleanup - confirm the guarding condition. */
542 int llog_cleanup_commit_master(struct llog_commit_master *lcm,
545 lcm->lcm_flags |= LLOG_LCM_FL_EXIT;
547 lcm->lcm_flags |= LLOG_LCM_FL_EXIT_FORCE;
548 cfs_waitq_signal(&lcm->lcm_waitq);
/* each exiting thread decrements lcm_thread_total and signals lcm_waitq */
550 wait_event_interruptible(lcm->lcm_waitq,
551 atomic_read(&lcm->lcm_thread_total) == 0);
554 EXPORT_SYMBOL(llog_cleanup_commit_master);
/* Thread body that processes a catalog log during recovery.
 *
 * @args points at the shared struct llog_process_args. The context,
 * callback and log id are copied out of it first, after which llpa_sem
 * is released so the structure can be reused. The log is opened with
 * llog_create(), initialised as a catalog (LLOG_F_IS_CAT) and walked by
 * llog_cat_process() with the callback; any result other than
 * LLOG_PROC_BREAK is reported as a failure. Afterwards llog_sync()
 * pushes out the context's llcd and the handle is released with
 * llog_cat_put(). */
556 static int log_process_thread(void *args)
558 struct llog_process_args *data = args;
559 struct llog_ctxt *ctxt = data->llpa_ctxt;
560 void *cb = data->llpa_cb;
/* llpa_arg is dereferenced as a struct llog_logid and copied by value */
561 struct llog_logid logid = *(struct llog_logid *)(data->llpa_arg);
562 struct llog_handle *llh = NULL;
/* everything needed has been copied; let the next caller use llpa */
566 mutex_up(&data->llpa_sem);
567 ptlrpc_daemonize("llog_process"); /* thread does IO to log files */
569 rc = llog_create(ctxt, &llh, &logid, NULL);
571 CERROR("llog_create failed %d\n", rc);
574 rc = llog_init_handle(llh, LLOG_F_IS_CAT, NULL);
576 CERROR("llog_init_handle failed %d\n", rc);
577 GOTO(release_llh, rc);
581 rc = llog_cat_process(llh, (llog_cb_t)cb, NULL);
582 if (rc != LLOG_PROC_BREAK)
583 CERROR("llog_cat_process failed %d\n", rc);
585 CWARN("no callback function for recovery\n");
588 CDEBUG(D_HA, "send llcd %p:%p forcibly after recovery\n",
589 ctxt->loc_llcd, ctxt);
590 llog_sync(ctxt, NULL);
593 rc = llog_cat_put(llh);
595 CERROR("llog_cat_put failed %d\n", rc);
/* Start a log_process_thread for @ctxt.
 *
 * @handle is stored as the thread's callback (llpa_cb). @arg is what
 * log_process_thread later reads as a struct llog_logid pointer;
 * llog_repl_connect() passes its logid here. obd->obd_stopping is
 * checked first. llpa_sem is taken before the shared llpa structure is
 * filled in, and a reference on @ctxt is obtained with llog_ctxt_get()
 * for the thread. The result of cfs_kernel_thread() is logged either as
 * an error or at D_HA level. */
601 static int llog_recovery_generic(struct llog_ctxt *ctxt, void *handle,void *arg)
603 struct obd_device *obd = ctxt->loc_obd;
607 if (obd->obd_stopping)
/* serialise use of the single static llpa instance */
610 mutex_down(&llpa.llpa_sem);
611 llpa.llpa_cb = handle;
613 llpa.llpa_ctxt = llog_ctxt_get(ctxt);
614 if (!llpa.llpa_ctxt) {
618 rc = cfs_kernel_thread(log_process_thread, &llpa, CLONE_VM | CLONE_FILES);
621 CERROR("error starting log_process_thread: %d\n", rc);
623 CDEBUG(D_HA, "log_process_thread: %d\n", rc);
/* Prepare @ctxt for log recovery on connect.
 *
 * An llcd that is still attached to the context is reported and flushed
 * with llog_sync(). Then, under ctxt->loc_sem, the context's generation
 * is set from @gen and a fresh llcd is attached via ctxt_llcd_grab();
 * failure to get one is reported. Finally recovery is started through
 * llog_recovery_generic() with ctxt->llog_proc_cb as the callback and
 * @logid as its argument, and an error from it is reported. */
630 int llog_repl_connect(struct llog_ctxt *ctxt, int count,
631 struct llog_logid *logid, struct llog_gen *gen,
632 struct obd_uuid *uuid)
634 struct llog_canceld_ctxt *llcd;
638 /* send back llcd before recovery from llog */
639 if (ctxt->loc_llcd != NULL) {
640 CWARN("llcd %p:%p not empty\n", ctxt->loc_llcd, ctxt);
641 llog_sync(ctxt, NULL);
/* ctxt_llcd_grab() asserts that loc_sem is held */
644 mutex_down(&ctxt->loc_sem);
645 ctxt->loc_gen = *gen;
646 llcd = ctxt_llcd_grab(ctxt);
648 CERROR("couldn't get an llcd\n");
649 mutex_up(&ctxt->loc_sem);
652 mutex_up(&ctxt->loc_sem);
654 rc = llog_recovery_generic(ctxt, ctxt->llog_proc_cb, logid);
657 CERROR("error recovery process: %d\n", rc);
661 EXPORT_SYMBOL(llog_repl_connect);
663 #else /* !__KERNEL__ */
665 int llog_obd_repl_cancel(struct llog_ctxt *ctxt,
666 struct lov_stripe_md *lsm, int count,
667 struct llog_cookie *cookies, int flags)