X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fobdclass%2Fllog_osd.c;h=6c3c288cce9507d5e9a6f65bcb8b57459c915413;hb=1cf11e1cf9a55fa71c873a5f485be4b63e3a5e39;hp=f0e904b17ca39930f3b039e334d1e3b8e0540720;hpb=cd764a5462697261a9a6b1e6c6858c75d969bae1;p=fs%2Flustre-release.git diff --git a/lustre/obdclass/llog_osd.c b/lustre/obdclass/llog_osd.c index f0e904b..6c3c288 100644 --- a/lustre/obdclass/llog_osd.c +++ b/lustre/obdclass/llog_osd.c @@ -23,7 +23,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2012, 2014 Intel Corporation. + * Copyright (c) 2012, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,10 +44,11 @@ #define DEBUG_SUBSYSTEM S_LOG +#include +#include +#include #include #include -#include -#include #include "llog_internal.h" #include "local_storage.h" @@ -111,12 +112,35 @@ static int llog_osd_create_new_object(const struct lu_env *env, } /** + * Implementation of the llog_operations::lop_exist + * + * This function checks that llog exists on storage. + * + * \param[in] handle llog handle of the current llog + * + * \retval true if llog object exists and is not just destroyed + * \retval false if llog doesn't exist or just destroyed + */ +static int llog_osd_exist(struct llog_handle *handle) +{ + LASSERT(handle->lgh_obj); + return dt_object_exists(handle->lgh_obj) && + !lu_object_is_dying(handle->lgh_obj->do_lu.lo_header); +} + +static void *rec_tail(struct llog_rec_hdr *rec) +{ + return (void *)((char *)rec + rec->lrh_len - + sizeof(struct llog_rec_tail)); +} + +/** * Write a padding record to the llog * * This function writes a padding record to the end of llog. That may * be needed if llog contains records of variable size, e.g. config logs * or changelogs. - * The padding record just aligns llog to the LLOG_CHUNK_SIZE boundary if + * The padding record just aligns llog to the llog chunk_size boundary if * the current record doesn't fit in the remaining space. * * It allocates full length to avoid two separate writes for header and tail. @@ -192,8 +216,6 @@ static int llog_osd_read_header(const struct lu_env *env, ENTRY; - LASSERT(sizeof(*handle->lgh_hdr) == LLOG_CHUNK_SIZE); - o = handle->lgh_obj; LASSERT(o); @@ -214,17 +236,21 @@ static int llog_osd_read_header(const struct lu_env *env, lgi->lgi_off = 0; lgi->lgi_buf.lb_buf = handle->lgh_hdr; - lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE; - - rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off); - if (rc) { - CERROR("%s: error reading log header from "DFID": rc = %d\n", + lgi->lgi_buf.lb_len = handle->lgh_hdr_size; + rc = dt_read(env, o, &lgi->lgi_buf, &lgi->lgi_off); + llh_hdr = &handle->lgh_hdr->llh_hdr; + if (rc < sizeof(*llh_hdr) || rc < llh_hdr->lrh_len) { + CERROR("%s: error reading "DFID" log header size %d: rc = %d\n", o->do_lu.lo_dev->ld_obd->obd_name, - PFID(lu_object_fid(&o->do_lu)), rc); + PFID(lu_object_fid(&o->do_lu)), rc < 0 ? 0 : rc, + -EFAULT); + + if (rc >= 0) + rc = -EFAULT; + RETURN(rc); } - llh_hdr = &handle->lgh_hdr->llh_hdr; if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr)) lustre_swab_llog_hdr(handle->lgh_hdr); @@ -235,19 +261,32 @@ static int llog_osd_read_header(const struct lu_env *env, PFID(lu_object_fid(&o->do_lu)), llh_hdr->lrh_type, LLOG_HDR_MAGIC); RETURN(-EIO); - } else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) { + } else if (llh_hdr->lrh_len < LLOG_MIN_CHUNK_SIZE || + llh_hdr->lrh_len > handle->lgh_hdr_size) { CERROR("%s: incorrectly sized log %s "DFID" header: " - "%#x (expected %#x)\n" + "%#x (expected at least %#x)\n" "you may need to re-run lconf --write_conf.\n", o->do_lu.lo_dev->ld_obd->obd_name, handle->lgh_name ? handle->lgh_name : "", PFID(lu_object_fid(&o->do_lu)), - llh_hdr->lrh_len, LLOG_CHUNK_SIZE); + llh_hdr->lrh_len, LLOG_MIN_CHUNK_SIZE); + RETURN(-EIO); + } else if (LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_index > + LLOG_HDR_BITMAP_SIZE(handle->lgh_hdr) || + LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len != + llh_hdr->lrh_len) { + CERROR("%s: incorrectly sized log %s "DFID" tailer: " + "%#x : rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + handle->lgh_name ? handle->lgh_name : "", + PFID(lu_object_fid(&o->do_lu)), + LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_len, -EIO); RETURN(-EIO); } handle->lgh_hdr->llh_flags |= (flags & LLOG_F_EXT_MASK); - handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index; + handle->lgh_last_idx = LLOG_HDR_TAIL(handle->lgh_hdr)->lrt_index; + handle->lgh_write_offset = lgi->lgi_attr.la_size; RETURN(0); } @@ -277,6 +316,7 @@ static int llog_osd_declare_write_rec(const struct lu_env *env, int idx, struct thandle *th) { struct llog_thread_info *lgi = llog_info(env); + __u32 chunk_size; struct dt_object *o; int rc; @@ -286,12 +326,13 @@ static int llog_osd_declare_write_rec(const struct lu_env *env, LASSERT(th); LASSERT(loghandle); LASSERT(rec); - LASSERT(rec->lrh_len <= LLOG_CHUNK_SIZE); + LASSERT(rec->lrh_len <= loghandle->lgh_ctxt->loc_chunk_size); o = loghandle->lgh_obj; LASSERT(o); - lgi->lgi_buf.lb_len = sizeof(struct llog_log_hdr); + chunk_size = loghandle->lgh_ctxt->loc_chunk_size; + lgi->lgi_buf.lb_len = chunk_size; lgi->lgi_buf.lb_buf = NULL; /* each time we update header */ rc = dt_declare_record_write(env, o, &lgi->lgi_buf, 0, @@ -303,7 +344,7 @@ static int llog_osd_declare_write_rec(const struct lu_env *env, * the pad record can be inserted so take into account double * record size */ - lgi->lgi_buf.lb_len = rec->lrh_len * 2; + lgi->lgi_buf.lb_len = chunk_size * 2; lgi->lgi_buf.lb_buf = NULL; /* XXX: implement declared window or multi-chunks approach */ rc = dt_declare_record_write(env, o, &lgi->lgi_buf, -1, th); @@ -346,24 +387,32 @@ static int llog_osd_write_rec(const struct lu_env *env, int index, rc; struct llog_rec_tail *lrt; struct dt_object *o; + __u32 chunk_size; size_t left; - + __u32 orig_last_idx; + __u64 orig_write_offset; ENTRY; - LASSERT(env); llh = loghandle->lgh_hdr; - LASSERT(llh); o = loghandle->lgh_obj; - LASSERT(o); - LASSERT(th); + chunk_size = llh->llh_hdr.lrh_len; CDEBUG(D_OTHER, "new record %x to "DFID"\n", rec->lrh_type, PFID(lu_object_fid(&o->do_lu))); - /* record length should not bigger than LLOG_CHUNK_SIZE */ - if (reclen > LLOG_CHUNK_SIZE) + if (!llog_osd_exist(loghandle)) + RETURN(-ENOENT); + + /* record length should not bigger than */ + if (reclen > loghandle->lgh_hdr->llh_hdr.lrh_len) RETURN(-E2BIG); + /* sanity check for fixed-records llog */ + if (idx != LLOG_HEADER_IDX && (llh->llh_flags & LLOG_F_IS_FIXSIZE)) { + LASSERT(llh->llh_size != 0); + LASSERT(llh->llh_size == reclen); + } + rc = dt_attr_get(env, o, &lgi->lgi_attr); if (rc) RETURN(rc); @@ -389,7 +438,7 @@ static int llog_osd_write_rec(const struct lu_env *env, /* llog can be empty only when first record is being written */ LASSERT(ergo(idx > 0, lgi->lgi_attr.la_size > 0)); - if (!ext2_test_bit(idx, llh->llh_bitmap)) { + if (!ext2_test_bit(idx, LLOG_HDR_BITMAP(llh))) { CERROR("%s: modify unset record %u\n", o->do_lu.lo_dev->ld_obd->obd_name, idx); RETURN(-ENOENT); @@ -404,14 +453,44 @@ static int llog_osd_write_rec(const struct lu_env *env, if (idx == LLOG_HEADER_IDX) { /* llog header update */ - LASSERT(reclen == sizeof(struct llog_log_hdr)); - LASSERT(rec == &llh->llh_hdr); + __u32 *bitmap = LLOG_HDR_BITMAP(llh); lgi->lgi_off = 0; - lgi->lgi_buf.lb_len = reclen; - lgi->lgi_buf.lb_buf = rec; + + /* If it does not indicate the bitmap index + * (reccookie == NULL), then it means update + * the whole update header. Otherwise only + * update header and bits needs to be updated, + * and in DNE cases, it will signaficantly + * shrink the RPC size. + * see distribute_txn_cancel_records()*/ + if (reccookie == NULL) { + lgi->lgi_buf.lb_len = reclen; + lgi->lgi_buf.lb_buf = rec; + rc = dt_record_write(env, o, &lgi->lgi_buf, + &lgi->lgi_off, th); + RETURN(rc); + } + + /* update the header */ + lgi->lgi_buf.lb_len = llh->llh_bitmap_offset; + lgi->lgi_buf.lb_buf = llh; rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc != 0) + RETURN(rc); + + /* update the bitmap */ + index = reccookie->lgc_index; + lgi->lgi_off = llh->llh_bitmap_offset + + (index / (sizeof(*bitmap) * 8)) * + sizeof(*bitmap); + lgi->lgi_buf.lb_len = sizeof(*bitmap); + lgi->lgi_buf.lb_buf = + &bitmap[index/(sizeof(*bitmap)*8)]; + rc = dt_record_write(env, o, &lgi->lgi_buf, + &lgi->lgi_off, th); + RETURN(rc); } else if (loghandle->lgh_cur_idx > 0) { /** @@ -430,15 +509,9 @@ static int llog_osd_write_rec(const struct lu_env *env, "len:%u offset %llu\n", POSTID(&loghandle->lgh_id.lgl_oi), idx, rec->lrh_len, (long long)lgi->lgi_off); - } else if (llh->llh_size > 0) { - if (llh->llh_size != rec->lrh_len) { - CERROR("%s: wrong record size, llh_size is %u" - " but record size is %u\n", - o->do_lu.lo_dev->ld_obd->obd_name, - llh->llh_size, rec->lrh_len); - RETURN(-EINVAL); - } - lgi->lgi_off = sizeof(*llh) + (idx - 1) * reclen; + } else if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { + lgi->lgi_off = llh->llh_hdr.lrh_len + + (idx - 1) * reclen; } else { /* This can be result of lgh_cur_idx is not set during * llog processing or llh_size is not set to proper @@ -473,9 +546,32 @@ static int llog_osd_write_rec(const struct lu_env *env, * process them page-at-a-time if needed. If it will cross a chunk * boundary, write in a fake (but referenced) entry to pad the chunk. */ + + + /* simulate ENOSPC when new plain llog is being added to the + * catalog */ + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED2) && + llh->llh_flags & LLOG_F_IS_CAT) + RETURN(-ENOSPC); + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + orig_last_idx = loghandle->lgh_last_idx; + orig_write_offset = loghandle->lgh_write_offset; lgi->lgi_off = lgi->lgi_attr.la_size; - left = LLOG_CHUNK_SIZE - (lgi->lgi_off & (LLOG_CHUNK_SIZE - 1)); + + if (loghandle->lgh_max_size > 0 && + lgi->lgi_off >= loghandle->lgh_max_size) { + CDEBUG(D_OTHER, "llog is getting too large (%u > %u) at %u " + DOSTID"\n", (unsigned)lgi->lgi_off, + loghandle->lgh_max_size, + (int)loghandle->lgh_last_idx, + POSTID(&loghandle->lgh_id.lgl_oi)); + /* this is to signal that this llog is full */ + loghandle->lgh_last_idx = LLOG_HDR_BITMAP_SIZE(llh) - 1; + RETURN(-ENOSPC); + } + + left = chunk_size - (lgi->lgi_off & (chunk_size - 1)); /* NOTE: padding is a record, but no bit is set */ if (left != 0 && left != reclen && left < (reclen + LLOG_MIN_REC_SIZE)) { @@ -483,39 +579,58 @@ static int llog_osd_write_rec(const struct lu_env *env, rc = llog_osd_pad(env, o, &lgi->lgi_off, left, index, th); if (rc) RETURN(rc); + + if (dt_object_remote(o)) + loghandle->lgh_write_offset = lgi->lgi_off; + loghandle->lgh_last_idx++; /* for pad rec */ } - /* if it's the last idx in log file, then return -ENOSPC */ - if (loghandle->lgh_last_idx >= LLOG_BITMAP_SIZE(llh) - 1) - RETURN(-ENOSPC); + /* if it's the last idx in log file, then return -ENOSPC + * or wrap around if a catalog */ + if (llog_is_full(loghandle) || + unlikely(llh->llh_flags & LLOG_F_IS_CAT && + OBD_FAIL_PRECHECK(OBD_FAIL_CAT_RECORDS) && + loghandle->lgh_last_idx >= cfs_fail_val)) { + if (llh->llh_flags & LLOG_F_IS_CAT) + loghandle->lgh_last_idx = 0; + else + RETURN(-ENOSPC); + } /* increment the last_idx along with llh_tail index, they should * be equal for a llog lifetime */ loghandle->lgh_last_idx++; index = loghandle->lgh_last_idx; - llh->llh_tail.lrt_index = index; + LLOG_HDR_TAIL(llh)->lrt_index = index; /** * NB: the caller should make sure only 1 process access * the lgh_last_idx, e.g. append should be exclusive. * Otherwise it might hit the assert. */ - LASSERT(index < LLOG_BITMAP_SIZE(llh)); + LASSERT(index < LLOG_HDR_BITMAP_SIZE(llh)); rec->lrh_index = index; lrt = rec_tail(rec); lrt->lrt_len = rec->lrh_len; lrt->lrt_index = rec->lrh_index; - /* the lgh_hdr_lock protects llog header data from concurrent + /* the lgh_hdr_mutex protects llog header data from concurrent * update/cancel, the llh_count and llh_bitmap are protected */ - spin_lock(&loghandle->lgh_hdr_lock); - if (ext2_set_bit(index, llh->llh_bitmap)) { + mutex_lock(&loghandle->lgh_hdr_mutex); + if (ext2_set_bit(index, LLOG_HDR_BITMAP(llh))) { CERROR("%s: index %u already set in log bitmap\n", o->do_lu.lo_dev->ld_obd->obd_name, index); - spin_unlock(&loghandle->lgh_hdr_lock); + mutex_unlock(&loghandle->lgh_hdr_mutex); LBUG(); /* should never happen */ } llh->llh_count++; - spin_unlock(&loghandle->lgh_hdr_lock); + + if (!(llh->llh_flags & LLOG_F_IS_FIXSIZE)) { + /* Update the minimum size of the llog record */ + if (llh->llh_size == 0) + llh->llh_size = reclen; + else if (reclen < llh->llh_size) + llh->llh_size = reclen; + } if (lgi->lgi_attr.la_size == 0) { lgi->lgi_off = 0; @@ -523,8 +638,10 @@ static int llog_osd_write_rec(const struct lu_env *env, lgi->lgi_buf.lb_buf = &llh->llh_hdr; rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); if (rc != 0) - GOTO(out, rc); + GOTO(out_unlock, rc); } else { + __u32 *bitmap = LLOG_HDR_BITMAP(llh); + /* Note: If this is not initialization (size == 0), then do not * write the whole header (8k bytes), only update header/tail * and bits needs to be updated. Because this update might be @@ -533,44 +650,65 @@ static int llog_osd_write_rec(const struct lu_env *env, * the RPC (1MB limit), if we write 8K for each operation, which * will cost a lot space, and keep us adding more updates to one * update log.*/ - lgi->lgi_off = offsetof(typeof(*llh), llh_count); - lgi->lgi_buf.lb_len = sizeof(llh->llh_count); - lgi->lgi_buf.lb_buf = &llh->llh_count; + lgi->lgi_off = 0; + lgi->lgi_buf.lb_len = llh->llh_bitmap_offset; + lgi->lgi_buf.lb_buf = &llh->llh_hdr; rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); if (rc != 0) - GOTO(out, rc); + GOTO(out_unlock, rc); - lgi->lgi_off = offsetof(typeof(*llh), - llh_bitmap[index / (sizeof(*llh->llh_bitmap) * 8)]); - lgi->lgi_buf.lb_len = sizeof(*llh->llh_bitmap); - lgi->lgi_buf.lb_buf = - &llh->llh_bitmap[index/(sizeof(*llh->llh_bitmap)*8)]; + lgi->lgi_off = llh->llh_bitmap_offset + + (index / (sizeof(*bitmap) * 8)) * sizeof(*bitmap); + lgi->lgi_buf.lb_len = sizeof(*bitmap); + lgi->lgi_buf.lb_buf = &bitmap[index/(sizeof(*bitmap)*8)]; rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); if (rc != 0) - GOTO(out, rc); + GOTO(out_unlock, rc); - lgi->lgi_off = offsetof(typeof(*llh), llh_tail); + lgi->lgi_off = (unsigned long)LLOG_HDR_TAIL(llh) - + (unsigned long)llh; lgi->lgi_buf.lb_len = sizeof(llh->llh_tail); - lgi->lgi_buf.lb_buf = &llh->llh_tail; + lgi->lgi_buf.lb_buf = LLOG_HDR_TAIL(llh); rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); if (rc != 0) - GOTO(out, rc); + GOTO(out_unlock, rc); } - rc = dt_attr_get(env, o, &lgi->lgi_attr); +out_unlock: + /* unlock here for remote object */ + mutex_unlock(&loghandle->lgh_hdr_mutex); if (rc) GOTO(out, rc); - LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); - lgi->lgi_off = max_t(__u64, lgi->lgi_attr.la_size, lgi->lgi_off); + /* computed index can be used to determine offset for fixed-size + * records. This also allows to handle Catalog wrap around case */ + if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { + lgi->lgi_off = llh->llh_hdr.lrh_len + (index - 1) * reclen; + } else if (dt_object_remote(o)) { + lgi->lgi_off = max_t(__u64, loghandle->lgh_write_offset, + lgi->lgi_off); + } else { + rc = dt_attr_get(env, o, &lgi->lgi_attr); + if (rc) + GOTO(out, rc); + + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + lgi->lgi_off = max_t(__u64, lgi->lgi_attr.la_size, + lgi->lgi_off); + } + lgi->lgi_buf.lb_len = reclen; lgi->lgi_buf.lb_buf = rec; rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); if (rc < 0) GOTO(out, rc); - CDEBUG(D_OTHER, "added record "DOSTID": idx: %u, %u\n", - POSTID(&loghandle->lgh_id.lgl_oi), index, rec->lrh_len); + if (dt_object_remote(o)) + loghandle->lgh_write_offset = lgi->lgi_off; + + CDEBUG(D_HA, "added record "DFID": idx: %u, %u off%llu\n", + PFID(lu_object_fid(&o->do_lu)), index, rec->lrh_len, + lgi->lgi_off); if (reccookie != NULL) { reccookie->lgc_lgl = loghandle->lgh_id; reccookie->lgc_index = index; @@ -586,14 +724,22 @@ static int llog_osd_write_rec(const struct lu_env *env, RETURN(rc); out: /* cleanup llog for error case */ - spin_lock(&loghandle->lgh_hdr_lock); - ext2_clear_bit(index, llh->llh_bitmap); + mutex_lock(&loghandle->lgh_hdr_mutex); + ext2_clear_bit(index, LLOG_HDR_BITMAP(llh)); llh->llh_count--; - spin_unlock(&loghandle->lgh_hdr_lock); + mutex_unlock(&loghandle->lgh_hdr_mutex); /* restore llog last_idx */ - loghandle->lgh_last_idx--; - llh->llh_tail.lrt_index = loghandle->lgh_last_idx; + if (dt_object_remote(o)) { + loghandle->lgh_last_idx = orig_last_idx; + loghandle->lgh_write_offset = orig_write_offset; + } else if (--loghandle->lgh_last_idx == 0 && + (llh->llh_flags & LLOG_F_IS_CAT) && llh->llh_cat_idx != 0) { + /* catalog had just wrap-around case */ + loghandle->lgh_last_idx = LLOG_HDR_BITMAP_SIZE(llh) - 1; + } + + LLOG_HDR_TAIL(llh)->lrt_index = loghandle->lgh_last_idx; RETURN(rc); } @@ -604,13 +750,42 @@ out: * that we are not far enough along the log (because the * actual records are larger than minimum size) we just skip * some more records. + * + * Note: in llog_process_thread, it will use bitmap offset as + * the index to locate the record, which also includs some pad + * records, whose record size is very small, and it also does not + * consider pad record when recording minimum record size (otherwise + * min_record size might be too small), so in some rare cases, + * it might skip too much record for @goal, see llog_osd_next_block(). + * + * When force_mini_rec is true, it means we have to use LLOG_MIN_REC_SIZE + * as the min record size to skip over, usually because in the previous + * try, it skip too much record, see loog_osd_next(prev)_block(). */ -static inline void llog_skip_over(__u64 *off, int curr, int goal) +static inline void llog_skip_over(struct llog_handle *lgh, __u64 *off, + int curr, int goal, __u32 chunk_size, + bool force_mini_rec) { - if (goal <= curr) - return; - *off = (*off + (goal - curr - 1) * LLOG_MIN_REC_SIZE) & - ~(LLOG_CHUNK_SIZE - 1); + struct llog_log_hdr *llh = lgh->lgh_hdr; + + /* Goal should not bigger than the record count */ + if (goal > lgh->lgh_last_idx) + goal = lgh->lgh_last_idx; + + if (goal > curr) { + if (llh->llh_flags & LLOG_F_IS_FIXSIZE) { + *off = chunk_size + (goal - 1) * llh->llh_size; + } else { + __u64 min_rec_size = LLOG_MIN_REC_SIZE; + + if (llh->llh_size > 0 && !force_mini_rec) + min_rec_size = llh->llh_size; + + *off = *off + (goal - curr - 1) * min_rec_size; + } + } + /* always align with lower chunk boundary*/ + *off &= ~(chunk_size - 1); } /** @@ -652,7 +827,7 @@ static void changelog_block_trim_ext(struct llog_rec_hdr *hdr, * \param[in,out] cur_offset furtherst point read in the file * \param[in] buf pointer to data buffer to fill * \param[in] len required len to read, it is - * LLOG_CHUNK_SIZE usually. + * usually llog chunk_size. * * \retval 0 on successful buffer read * \retval negative value on error @@ -666,18 +841,20 @@ static int llog_osd_next_block(const struct lu_env *env, struct dt_object *o; struct dt_device *dt; int rc; + __u32 chunk_size; + int last_idx = *cur_idx; + __u64 last_offset = *cur_offset; + bool force_mini_rec = false; ENTRY; LASSERT(env); LASSERT(lgi); - if (len == 0 || len & (LLOG_CHUNK_SIZE - 1)) + chunk_size = loghandle->lgh_hdr->llh_hdr.lrh_len; + if (len == 0 || len & (chunk_size - 1)) RETURN(-EINVAL); - CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off "LPU64")\n", - next_idx, *cur_idx, *cur_offset); - LASSERT(loghandle); LASSERT(loghandle->lgh_ctxt); @@ -691,21 +868,29 @@ static int llog_osd_next_block(const struct lu_env *env, if (rc) GOTO(out, rc); + CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off" + "%llu), size %llu\n", next_idx, *cur_idx, + *cur_offset, lgi->lgi_attr.la_size); + while (*cur_offset < lgi->lgi_attr.la_size) { struct llog_rec_hdr *rec, *last_rec; struct llog_rec_tail *tail; - llog_skip_over(cur_offset, *cur_idx, next_idx); + llog_skip_over(loghandle, cur_offset, *cur_idx, + next_idx, chunk_size, force_mini_rec); - /* read up to next LLOG_CHUNK_SIZE block */ - lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE - - (*cur_offset & (LLOG_CHUNK_SIZE - 1)); + /* read up to next llog chunk_size block */ + lgi->lgi_buf.lb_len = chunk_size - + (*cur_offset & (chunk_size - 1)); lgi->lgi_buf.lb_buf = buf; rc = dt_read(env, o, &lgi->lgi_buf, cur_offset); if (rc < 0) { + if (rc == -EBADR && !force_mini_rec) + goto retry; + CERROR("%s: can't read llog block from log "DFID - " offset "LPU64": rc = %d\n", + " offset %llu: rc = %d\n", o->do_lu.lo_dev->ld_obd->obd_name, PFID(lu_object_fid(&o->do_lu)), *cur_offset, rc); @@ -718,12 +903,18 @@ static int llog_osd_next_block(const struct lu_env *env, memset(buf + rc, 0, len - rc); } - if (rc == 0) /* end of file, nothing to do */ + if (rc == 0) { /* end of file, nothing to do */ + if (!force_mini_rec) + goto retry; GOTO(out, rc); + } if (rc < sizeof(*tail)) { + if (!force_mini_rec) + goto retry; + CERROR("%s: invalid llog block at log id "DOSTID"/%u " - "offset "LPU64"\n", + "offset %llu\n", o->do_lu.lo_dev->ld_obd->obd_name, POSTID(&loghandle->lgh_id.lgl_oi), loghandle->lgh_id.lgl_ogen, *cur_offset); @@ -742,25 +933,40 @@ static int llog_osd_next_block(const struct lu_env *env, if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) lustre_swab_llog_rec(last_rec); - LASSERT(last_rec->lrh_index == tail->lrt_index); + + if (last_rec->lrh_index != tail->lrt_index) { + CERROR("%s: invalid llog tail at log id "DOSTID"/%u " + "offset %llu last_rec idx %u tail idx %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, *cur_offset, + last_rec->lrh_index, tail->lrt_index); + GOTO(out, rc = -EINVAL); + } *cur_idx = tail->lrt_index; /* this shouldn't happen */ if (tail->lrt_index == 0) { CERROR("%s: invalid llog tail at log id "DOSTID"/%u " - "offset "LPU64"\n", + "offset %llu bytes %d\n", o->do_lu.lo_dev->ld_obd->obd_name, POSTID(&loghandle->lgh_id.lgl_oi), - loghandle->lgh_id.lgl_ogen, *cur_offset); + loghandle->lgh_id.lgl_ogen, *cur_offset, rc); GOTO(out, rc = -EINVAL); } - if (tail->lrt_index < next_idx) + if (tail->lrt_index < next_idx) { + last_idx = *cur_idx; + last_offset = *cur_offset; continue; + } /* sanity check that the start of the new buffer is no farther * than the record that we wanted. This shouldn't happen. */ if (rec->lrh_index > next_idx) { + if (!force_mini_rec && next_idx > last_idx) + goto retry; + CERROR("%s: missed desired record? %u > %u\n", o->do_lu.lo_dev->ld_obd->obd_name, rec->lrh_index, next_idx); @@ -773,6 +979,14 @@ static int llog_osd_next_block(const struct lu_env *env, CLF_VERSION | CLF_RENAME); GOTO(out, rc = 0); + +retry: + /* Note: because there are some pad records in the + * llog, so llog_skip_over() might skip too much + * records, let's try skip again with minimum record */ + force_mini_rec = true; + *cur_offset = last_offset; + *cur_idx = last_idx; } GOTO(out, rc = -EIO); out: @@ -791,7 +1005,7 @@ out: * \param[in] loghandle llog handle of the current llog * \param[in] prev_idx target index to find * \param[in] buf pointer to data buffer to fill - * \param[in] len required len to read, it is LLOG_CHUNK_SIZE usually. + * \param[in] len required len to read, it is llog_chunk_size usually. * * \retval 0 on successful buffer read * \retval negative value on error @@ -804,11 +1018,13 @@ static int llog_osd_prev_block(const struct lu_env *env, struct dt_object *o; struct dt_device *dt; loff_t cur_offset; + __u32 chunk_size; int rc; ENTRY; - if (len == 0 || len & (LLOG_CHUNK_SIZE - 1)) + chunk_size = loghandle->lgh_hdr->llh_hdr.lrh_len; + if (len == 0 || len & (chunk_size - 1)) RETURN(-EINVAL); CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx); @@ -822,8 +1038,11 @@ static int llog_osd_prev_block(const struct lu_env *env, dt = lu2dt_dev(o->do_lu.lo_dev); LASSERT(dt); - cur_offset = LLOG_CHUNK_SIZE; - llog_skip_over(&cur_offset, 0, prev_idx); + /* Let's only use mini record size for previous block read + * for now XXX */ + cur_offset = chunk_size; + llog_skip_over(loghandle, &cur_offset, 0, prev_idx, + chunk_size, true); rc = dt_attr_get(env, o, &lgi->lgi_attr); if (rc) @@ -838,7 +1057,7 @@ static int llog_osd_prev_block(const struct lu_env *env, rc = dt_read(env, o, &lgi->lgi_buf, &cur_offset); if (rc < 0) { CERROR("%s: can't read llog block from log "DFID - " offset "LPU64": rc = %d\n", + " offset %llu: rc = %d\n", o->do_lu.lo_dev->ld_obd->obd_name, PFID(lu_object_fid(&o->do_lu)), cur_offset, rc); GOTO(out, rc); @@ -849,7 +1068,7 @@ static int llog_osd_prev_block(const struct lu_env *env, if (rc < sizeof(*tail)) { CERROR("%s: invalid llog block at log id "DOSTID"/%u " - "offset "LPU64"\n", + "offset %llu\n", o->do_lu.lo_dev->ld_obd->obd_name, POSTID(&loghandle->lgh_id.lgl_oi), loghandle->lgh_id.lgl_ogen, cur_offset); @@ -873,7 +1092,7 @@ static int llog_osd_prev_block(const struct lu_env *env, /* this shouldn't happen */ if (tail->lrt_index == 0) { CERROR("%s: invalid llog tail at log id "DOSTID"/%u " - "offset "LPU64"\n", + "offset %llu\n", o->do_lu.lo_dev->ld_obd->obd_name, POSTID(&loghandle->lgh_id.lgl_oi), loghandle->lgh_id.lgl_ogen, cur_offset); @@ -972,6 +1191,7 @@ static int llog_osd_open(const struct lu_env *env, struct llog_handle *handle, struct ls_device *ls; struct local_oid_storage *los = NULL; int rc = 0; + bool new_id = false; ENTRY; @@ -982,6 +1202,7 @@ static int llog_osd_open(const struct lu_env *env, struct llog_handle *handle, dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; LASSERT(dt); if (ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + struct lu_object_conf conf = { 0 }; if (logid != NULL) { logid_to_fid(logid, &lgi->lgi_fid); } else { @@ -992,9 +1213,11 @@ static int llog_osd_open(const struct lu_env *env, struct llog_handle *handle, if (rc < 0) RETURN(rc); rc = 0; + conf.loc_flags = LOC_F_NEW; } - o = dt_locate(env, dt, &lgi->lgi_fid); + o = dt_locate_at(env, dt, &lgi->lgi_fid, + dt->dd_lu_dev.ld_site->ls_top_dev, &conf); if (IS_ERR(o)) RETURN(PTR_ERR(o)); @@ -1029,6 +1252,7 @@ static int llog_osd_open(const struct lu_env *env, struct llog_handle *handle, /* generate fid for new llog */ rc = local_object_fid_generate(env, los, &lgi->lgi_fid); + new_id = true; } if (rc < 0) GOTO(out, rc); @@ -1040,15 +1264,30 @@ static int llog_osd_open(const struct lu_env *env, struct llog_handle *handle, } else { LASSERTF(open_param & LLOG_OPEN_NEW, "%#x\n", open_param); /* generate fid for new llog */ +generate: rc = local_object_fid_generate(env, los, &lgi->lgi_fid); if (rc < 0) GOTO(out, rc); + new_id = true; } o = ls_locate(env, ls, &lgi->lgi_fid, NULL); if (IS_ERR(o)) GOTO(out_name, rc = PTR_ERR(o)); + if (dt_object_exists(o) && new_id) { + /* llog exists with just generated ID, e.g. some old llog file + * still is in use or is orphan, drop a warn and skip it. */ + CDEBUG(D_INFO, "%s: llog exists with the same FID: "DFID + ", skipping\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu))); + lu_object_put(env, &o->do_lu); + /* just skip this llog ID, we shouldn't delete it because we + * don't know exactly what is its purpose and state. */ + goto generate; + } + after_open: /* No new llog is expected but doesn't exist */ if (open_param != LLOG_OPEN_NEW && !dt_object_exists(o)) @@ -1073,23 +1312,6 @@ out: } /** - * Implementation of the llog_operations::lop_exist - * - * This function checks that llog exists on storage. - * - * \param[in] handle llog handle of the current llog - * - * \retval true if llog object exists and is not just destroyed - * \retval false if llog doesn't exist or just destroyed - */ -static int llog_osd_exist(struct llog_handle *handle) -{ - LASSERT(handle->lgh_obj); - return (dt_object_exists(handle->lgh_obj) && - !lu_object_is_dying(handle->lgh_obj->do_lu.lo_header)); -} - -/** * Get dir for regular fid log object * * Get directory for regular fid log object, and these regular fid log @@ -1443,14 +1665,11 @@ llog_osd_regular_fid_del_name_entry(const struct lu_env *env, RETURN(rc); } - /** - * Implementation of the llog_operations::lop_destroy + * Implementation of the llog_operations::lop_declare_destroy * - * This function destroys the llog and deletes also entry in the + * This function declare destroys the llog and deletes also entry in the * llog directory in case of named llog. Llog should be opened prior that. - * Destroy method is not part of external transaction and does everything - * inside. * * \param[in] env execution environment * \param[in] loghandle llog handle of the current llog @@ -1458,14 +1677,12 @@ llog_osd_regular_fid_del_name_entry(const struct lu_env *env, * \retval 0 on successful destroy * \retval negative value on error */ -static int llog_osd_destroy(const struct lu_env *env, - struct llog_handle *loghandle) +static int llog_osd_declare_destroy(const struct lu_env *env, + struct llog_handle *loghandle, + struct thandle *th) { struct llog_ctxt *ctxt; struct dt_object *o, *llog_dir = NULL; - struct dt_device *d; - struct thandle *th; - char *name = NULL; int rc; ENTRY; @@ -1476,79 +1693,105 @@ static int llog_osd_destroy(const struct lu_env *env, o = loghandle->lgh_obj; LASSERT(o); - d = lu2dt_dev(o->do_lu.lo_dev); - LASSERT(d); - LASSERT(d == ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt); - - th = dt_trans_create(env, d); - if (IS_ERR(th)) - RETURN(PTR_ERR(th)); - if (loghandle->lgh_name) { llog_dir = llog_osd_dir_get(env, ctxt); if (IS_ERR(llog_dir)) - GOTO(out_trans, rc = PTR_ERR(llog_dir)); + RETURN(PTR_ERR(llog_dir)); - name = loghandle->lgh_name; rc = dt_declare_delete(env, llog_dir, - (struct dt_key *)name, th); - if (rc) - GOTO(out_trans, rc); + (struct dt_key *)loghandle->lgh_name, + th); + if (rc < 0) + GOTO(out_put, rc); } rc = dt_declare_ref_del(env, o, th); if (rc < 0) - GOTO(out_trans, rc); + GOTO(out_put, rc); rc = dt_declare_destroy(env, o, th); - if (rc) - GOTO(out_trans, rc); + if (rc < 0) + GOTO(out_put, rc); if (loghandle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { rc = llog_osd_regular_fid_del_name_entry(env, o, th, true); if (rc < 0) - GOTO(out_trans, rc); + GOTO(out_put, rc); } - rc = dt_trans_start_local(env, d, th); - if (rc) - GOTO(out_trans, rc); +out_put: + if (!(IS_ERR_OR_NULL(llog_dir))) + lu_object_put(env, &llog_dir->do_lu); + + RETURN(rc); +} - th->th_wait_submit = 1; + +/** + * Implementation of the llog_operations::lop_destroy + * + * This function destroys the llog and deletes also entry in the + * llog directory in case of named llog. Llog should be opened prior that. + * Destroy method is not part of external transaction and does everything + * inside. + * + * \param[in] env execution environment + * \param[in] loghandle llog handle of the current llog + * + * \retval 0 on successful destroy + * \retval negative value on error + */ +static int llog_osd_destroy(const struct lu_env *env, + struct llog_handle *loghandle, struct thandle *th) +{ + struct llog_ctxt *ctxt; + struct dt_object *o, *llog_dir = NULL; + int rc; + + ENTRY; + + ctxt = loghandle->lgh_ctxt; + LASSERT(ctxt != NULL); + + o = loghandle->lgh_obj; + LASSERT(o != NULL); dt_write_lock(env, o, 0); - if (dt_object_exists(o)) { - if (name) { - dt_read_lock(env, llog_dir, 0); - rc = dt_delete(env, llog_dir, - (struct dt_key *) name, - th); - dt_read_unlock(env, llog_dir); - if (rc) { - CERROR("%s: can't remove llog %s: rc = %d\n", - o->do_lu.lo_dev->ld_obd->obd_name, - name, rc); - GOTO(out_unlock, rc); - } - } - dt_ref_del(env, o, th); - rc = dt_destroy(env, o, th); - if (rc) - GOTO(out_unlock, rc); + if (!dt_object_exists(o)) + GOTO(out_unlock, rc = 0); - if (loghandle->lgh_ctxt->loc_flags & - LLOG_CTXT_FLAG_NORMAL_FID) { - rc = llog_osd_regular_fid_del_name_entry(env, o, th, - false); - if (rc < 0) - GOTO(out_unlock, rc); + if (loghandle->lgh_name) { + llog_dir = llog_osd_dir_get(env, ctxt); + if (IS_ERR(llog_dir)) + GOTO(out_unlock, rc = PTR_ERR(llog_dir)); + + dt_read_lock(env, llog_dir, 0); + rc = dt_delete(env, llog_dir, + (struct dt_key *)loghandle->lgh_name, + th); + dt_read_unlock(env, llog_dir); + if (rc) { + CERROR("%s: can't remove llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + loghandle->lgh_name, rc); + GOTO(out_unlock, rc); } } + + dt_ref_del(env, o, th); + rc = dt_destroy(env, o, th); + if (rc < 0) + GOTO(out_unlock, rc); + + if (loghandle->lgh_ctxt->loc_flags & LLOG_CTXT_FLAG_NORMAL_FID) { + rc = llog_osd_regular_fid_del_name_entry(env, o, th, false); + if (rc < 0) + GOTO(out_unlock, rc); + } + out_unlock: dt_write_unlock(env, o); -out_trans: - dt_trans_stop(env, d, th); - if (llog_dir != NULL) + if (!(IS_ERR_OR_NULL(llog_dir))) lu_object_put(env, &llog_dir->do_lu); RETURN(rc); } @@ -1644,6 +1887,7 @@ struct llog_operations llog_osd_ops = { .lop_next_block = llog_osd_next_block, .lop_prev_block = llog_osd_prev_block, .lop_read_header = llog_osd_read_header, + .lop_declare_destroy = llog_osd_declare_destroy, .lop_destroy = llog_osd_destroy, .lop_setup = llog_osd_setup, .lop_cleanup = llog_osd_cleanup, @@ -1661,6 +1905,7 @@ struct llog_operations llog_common_cat_ops = { .lop_next_block = llog_osd_next_block, .lop_prev_block = llog_osd_prev_block, .lop_read_header = llog_osd_read_header, + .lop_declare_destroy = llog_osd_declare_destroy, .lop_destroy = llog_osd_destroy, .lop_setup = llog_osd_setup, .lop_cleanup = llog_osd_cleanup, @@ -1859,7 +2104,16 @@ int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d, lgi->lgi_buf.lb_buf = idarray; rc = dt_declare_record_write(env, o, &lgi->lgi_buf, lgi->lgi_off, th); if (rc) - GOTO(out, rc); + GOTO(out_trans, rc); + + /* For update log, this happens during initialization, + * see lod_sub_prep_llog(), and we need make sure catlog + * file ID is written to catlist file(committed) before + * cross-MDT operation write update records to catlog FILE, + * otherwise, during failover these update records might + * missing */ + if (fid_is_update_log(fid)) + th->th_sync = 1; rc = dt_trans_start_local(env, d, th); if (rc)