From 788d39358e2f90b24d6ac3f98b28a957a8681693 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Fri, 1 Jun 2012 09:56:52 +0800 Subject: [PATCH] LU-957 scrub: OI scrub against ldiskfs The OI scrub is totally implemented inside osd-ldiskfs, and driven by the inode iterator. For each object with normal fid, its fid in the LMA and related OI entry in the OI file will be compared, if they are inconsistent, then the OI entry will be updated. Signed-off-by: Fan Yong Change-Id: I04d25b3433336fb63c312f795a8631328beb3aa7 Reviewed-on: http://review.whamcloud.com/2552 Tested-by: Hudson Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin --- lustre/osd-ldiskfs/Makefile.in | 3 +- lustre/osd-ldiskfs/autoMakefile.am | 2 +- lustre/osd-ldiskfs/osd_handler.c | 37 +- lustre/osd-ldiskfs/osd_iam.c | 25 +- lustre/osd-ldiskfs/osd_iam.h | 2 + lustre/osd-ldiskfs/osd_iam_lfix.c | 15 + lustre/osd-ldiskfs/osd_iam_lvar.c | 12 + lustre/osd-ldiskfs/osd_internal.h | 63 +++ lustre/osd-ldiskfs/osd_oi.c | 225 +++++---- lustre/osd-ldiskfs/osd_oi.h | 5 + lustre/osd-ldiskfs/osd_scrub.c | 940 +++++++++++++++++++++++++++++++++++++ lustre/osd-ldiskfs/osd_scrub.h | 195 ++++++++ 12 files changed, 1408 insertions(+), 116 deletions(-) create mode 100644 lustre/osd-ldiskfs/osd_scrub.c create mode 100644 lustre/osd-ldiskfs/osd_scrub.h diff --git a/lustre/osd-ldiskfs/Makefile.in b/lustre/osd-ldiskfs/Makefile.in index 9a9e0f6..179bf2f 100644 --- a/lustre/osd-ldiskfs/Makefile.in +++ b/lustre/osd-ldiskfs/Makefile.in @@ -1,6 +1,7 @@ MODULES := osd_ldiskfs osd_ldiskfs-objs := osd_handler.o osd_oi.o osd_igif.o osd_lproc.o osd_iam.o \ - osd_iam_lfix.o osd_iam_lvar.o osd_io.o osd_compat.o + osd_iam_lfix.o osd_iam_lvar.o osd_io.o osd_compat.o \ + osd_scrub.o EXTRA_PRE_CFLAGS := -I@LINUX@/fs -I@LDISKFS_DIR@ -I@LDISKFS_DIR@/ldiskfs diff --git a/lustre/osd-ldiskfs/autoMakefile.am b/lustre/osd-ldiskfs/autoMakefile.am index 09186d1..3294c28 100644 --- 
a/lustre/osd-ldiskfs/autoMakefile.am +++ b/lustre/osd-ldiskfs/autoMakefile.am @@ -40,4 +40,4 @@ endif MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ EXTRA_DIST := $(osd_ldiskfs-objs:%.o=%.c) osd_internal.h osd_oi.h osd_igif.h \ - osd_iam.h + osd_iam.h osd_scrub.h diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index b68f7f788..9c1949b 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -1057,7 +1057,7 @@ const int osd_dto_credits_noquota[DTO_NR] = { [DTO_INDEX_INSERT] = 16, [DTO_INDEX_DELETE] = 16, /** - * Unused now + * Used for OI scrub */ [DTO_INDEX_UPDATE] = 16, /** @@ -1541,6 +1541,7 @@ static int osd_mkfile(struct osd_thread_info *info, struct osd_object *obj, * NB: don't need any lock because no contention at this * early stage */ inode->i_flags |= S_NOCMTIME; + inode->i_state |= I_LUSTRE_NOSCRUB; obj->oo_inode = inode; result = 0; } else { @@ -1874,6 +1875,9 @@ static int osd_object_destroy(const struct lu_env *env, LASSERT(inode); LASSERT(!lu_object_is_dying(dt->do_lu.lo_header)); + /* Parallel control for OI scrub. For most of cases, there is no + * lock contention. So it will not affect unlink performance. 
*/ + cfs_mutex_lock(&inode->i_mutex); if (S_ISDIR(inode->i_mode)) { LASSERT(osd_inode_unlinked(inode) || inode->i_nlink == 1); @@ -1888,6 +1892,7 @@ static int osd_object_destroy(const struct lu_env *env, OSD_EXEC_OP(th, destroy); result = osd_oi_delete(osd_oti_get(env), osd, fid, th); + cfs_mutex_unlock(&inode->i_mutex); /* XXX: add to ext3 orphan list */ /* rc = ext3_orphan_add(handle_t *handle, struct inode *inode) */ @@ -4004,19 +4009,16 @@ static int osd_device_init(const struct lu_env *env, struct lu_device *d, static int osd_shutdown(const struct lu_env *env, struct osd_device *o) { - struct osd_thread_info *info = osd_oti_get(env); - - ENTRY; + ENTRY; - if (o->od_oi_table != NULL) - osd_oi_fini(info, o); + osd_scrub_cleanup(env, o); - if (o->od_fsops) { - fsfilt_put_ops(o->od_fsops); - o->od_fsops = NULL; - } + if (o->od_fsops) { + fsfilt_put_ops(o->od_fsops); + o->od_fsops = NULL; + } - RETURN(0); + RETURN(0); } static int osd_mount(const struct lu_env *env, @@ -4113,6 +4115,7 @@ static struct lu_device *osd_device_alloc(const struct lu_env *env, l->ld_ops = &osd_lu_ops; o->od_dt_dev.dd_ops = &osd_dt_ops; cfs_spin_lock_init(&o->od_osfs_lock); + cfs_mutex_init(&o->od_otable_mutex); o->od_osfs_age = cfs_time_shift_64(-1000); o->od_capa_hash = init_capa_hash(); if (o->od_capa_hash == NULL) { @@ -4171,14 +4174,12 @@ static int osd_recovery_complete(const struct lu_env *env, static int osd_prepare(const struct lu_env *env, struct lu_device *pdev, struct lu_device *dev) { - struct osd_device *osd = osd_dev(dev); - struct osd_thread_info *oti = osd_oti_get(env); - int result; - - ENTRY; + struct osd_device *osd = osd_dev(dev); + int result; + ENTRY; - /* 1. initialize oi before any file create or file open */ - result = osd_oi_init(oti, osd); + /* 1. 
setup scrub, including OI files initialization */ + result = osd_scrub_setup(env, osd); if (result < 0) RETURN(result); diff --git a/lustre/osd-ldiskfs/osd_iam.c b/lustre/osd-ldiskfs/osd_iam.c index a495491..faf99ac 100644 --- a/lustre/osd-ldiskfs/osd_iam.c +++ b/lustre/osd-ldiskfs/osd_iam.c @@ -2249,20 +2249,27 @@ EXPORT_SYMBOL(iam_insert); * Update record with the key @k in container @c (within context of * transaction @h), new record is given by @r. * - * Return values: 0: success, -ve: error, including -ENOENT if no record with - * the given key found. + * Return values: +1: skip because of the same rec value, 0: success, + * -ve: error, including -ENOENT if no record with the given key found. */ int iam_update(handle_t *h, struct iam_container *c, const struct iam_key *k, const struct iam_rec *r, struct iam_path_descr *pd) { struct iam_iterator it; - int result; - - iam_it_init(&it, c, IAM_IT_WRITE, pd); - - result = iam_it_get_exact(&it, k); - if (result == 0) - iam_it_rec_set(h, &it, r); + struct iam_leaf *folio; + int result; + + iam_it_init(&it, c, IAM_IT_WRITE, pd); + + result = iam_it_get_exact(&it, k); + if (result == 0) { + folio = &it.ii_path.ip_leaf; + result = iam_leaf_ops(folio)->rec_eq(folio, r); + if (result == 0) + iam_it_rec_set(h, &it, r); + else + result = 1; + } iam_it_put(&it); iam_it_fini(&it); return result; diff --git a/lustre/osd-ldiskfs/osd_iam.h b/lustre/osd-ldiskfs/osd_iam.h index 3e0aabd..98da250 100644 --- a/lustre/osd-ldiskfs/osd_iam.h +++ b/lustre/osd-ldiskfs/osd_iam.h @@ -374,6 +374,8 @@ struct iam_leaf_operations { int (*key_cmp)(const struct iam_leaf *l, const struct iam_key *k); int (*key_eq)(const struct iam_leaf *l, const struct iam_key *k); + int (*rec_eq)(const struct iam_leaf *l, const struct iam_rec *r); + int (*key_size)(const struct iam_leaf *l); /* * Search leaf @l for a record with key @k or for a place diff --git a/lustre/osd-ldiskfs/osd_iam_lfix.c b/lustre/osd-ldiskfs/osd_iam_lfix.c index d9ee5c9..f676794 100644 
--- a/lustre/osd-ldiskfs/osd_iam_lfix.c +++ b/lustre/osd-ldiskfs/osd_iam_lfix.c @@ -26,6 +26,8 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. + * + * Copyright (c) 2012 Whamcloud, Inc. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -356,6 +358,18 @@ static void iam_lfix_rec_set(struct iam_leaf *l, const struct iam_rec *r) memcpy(iam_lfix_rec(l), r, iam_leaf_descr(l)->id_rec_size); } +static inline int lfix_reccmp(const struct iam_container *c, + const struct iam_rec *r1, + const struct iam_rec *r2) +{ + return memcmp(r1, r2, c->ic_descr->id_rec_size); +} + +static int iam_lfix_rec_eq(const struct iam_leaf *l, const struct iam_rec *r) +{ + return !lfix_reccmp(iam_leaf_container(l), iam_lfix_rec(l), r); +} + static void iam_lfix_rec_get(const struct iam_leaf *l, struct iam_rec *r) { assert_corr(iam_leaf_at_rec(l)); @@ -511,6 +525,7 @@ static struct iam_leaf_operations iam_lfix_leaf_ops = { .key_eq = iam_lfix_key_eq, .key_size = iam_lfix_key_size, .rec_set = iam_lfix_rec_set, + .rec_eq = iam_lfix_rec_eq, .rec_get = iam_lfix_rec_get, .lookup = iam_lfix_lookup, .ilookup = iam_lfix_ilookup, diff --git a/lustre/osd-ldiskfs/osd_iam_lvar.c b/lustre/osd-ldiskfs/osd_iam_lvar.c index 4d1fa30..75068e5 100644 --- a/lustre/osd-ldiskfs/osd_iam_lvar.c +++ b/lustre/osd-ldiskfs/osd_iam_lvar.c @@ -26,6 +26,8 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. + * + * Copyright (c) 2012 Whamcloud, Inc. 
*/ /* * This file is part of Lustre, http://www.lustre.org/ @@ -571,6 +573,15 @@ static void lvar_rec_set(struct iam_leaf *l, const struct iam_rec *r) assert_inv(n_invariant(l)); } +static int lvar_rec_eq(const struct iam_leaf *l, const struct iam_rec *r) +{ + struct iam_rec *rec = e_rec(n_cur(l)); + + if (rec_size(rec) != rec_size(r)) + return 0; + return !memcmp(rec, r, rec_size(r)); +} + static void lvar_rec_get(const struct iam_leaf *l, struct iam_rec *r) { struct iam_rec *rec; @@ -768,6 +779,7 @@ static struct iam_leaf_operations lvar_leaf_ops = { .key_eq = lvar_key_eq, .key_size = lvar_key_size, .rec_set = lvar_rec_set, + .rec_eq = lvar_rec_eq, .rec_get = lvar_rec_get, .lookup = lvar_lookup, .ilookup = lvar_ilookup, diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index f6858e1..f75bb7e 100644 --- a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -75,11 +75,15 @@ #include "osd_oi.h" #include "osd_iam.h" +#include "osd_scrub.h" struct inode; #define OSD_COUNTERS (0) +/* Lustre special inode::i_state to indicate OI scrub skip this inode. */ +#define I_LUSTRE_NOSCRUB (1 << 31) + /** Enable thandle usage statistics */ #define OSD_THANDLE_STATS (0) @@ -191,6 +195,55 @@ static inline void ldiskfs_htree_lock_free(struct htree_lock *lk) #endif /* HAVE_LDISKFS_PDO */ +#define OSD_OTABLE_IT_CACHE_SIZE 128 +#define OSD_OTABLE_IT_CACHE_MASK (~(OSD_OTABLE_IT_CACHE_SIZE - 1)) + +struct osd_inconsistent_item { + /* link into osd_scrub::os_inconsistent_items, + * protected by osd_scrub::os_lock. */ + cfs_list_t oii_list; + + /* The right FID <=> ino#/gen mapping. */ + struct osd_idmap_cache oii_cache; + + unsigned int oii_insert:1; /* insert or update mapping. */ +}; + +struct osd_otable_cache { + struct osd_idmap_cache ooc_cache[OSD_OTABLE_IT_CACHE_SIZE]; + + /* Index for next cache slot to be filled. */ + int ooc_producer_idx; + + /* Index for next cache slot to be returned by it::next(). 
*/ + int ooc_consumer_idx; + + /* How many items in ooc_cache. */ + int ooc_cached_items; + + /* Position for up layer LFSCK iteration pre-loading. */ + __u32 ooc_pos_preload; +}; + +struct osd_otable_it { + struct osd_device *ooi_dev; + struct osd_otable_cache ooi_cache; + + /* For osd_otable_it_key. */ + __u8 ooi_key[16]; + + /* The following bits can be updated/checked w/o lock protection. + * If more bits will be introduced in the future and need lock to + * protect, please add comment. */ + unsigned long ooi_used_outside:1, /* Some user out of OSD + * uses the iteration. */ + ooi_all_cached:1, /* No more entries can be + * filled into cache. */ + ooi_user_ready:1, /* The user out of OSD is + * ready to iterate. */ + ooi_waiting:1; /* it::next is waiting. */ +}; + extern const int osd_dto_credits_noquota[]; /* @@ -244,6 +297,10 @@ struct osd_device { struct brw_stats od_brw_stats; cfs_atomic_t od_r_in_flight; cfs_atomic_t od_w_in_flight; + + cfs_mutex_t od_otable_mutex; + struct osd_otable_it *od_otable_it; + struct osd_scrub od_scrub; }; #define OSD_TRACK_DECLARES @@ -561,6 +618,12 @@ int osd_compat_spec_insert(struct osd_thread_info *info, const struct lu_fid *fid, const struct osd_inode_id *id, struct thandle *th); +void osd_scrub_file_reset(struct osd_scrub *scrub, __u8 *uuid, __u64 flags); +int osd_scrub_file_store(struct osd_scrub *scrub); +int osd_scrub_start(struct osd_device *dev); +int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev); +void osd_scrub_cleanup(const struct lu_env *env, struct osd_device *dev); + /* * Invariants, assertions. 
*/ diff --git a/lustre/osd-ldiskfs/osd_oi.c b/lustre/osd-ldiskfs/osd_oi.c index d13a0fa..737e8d3 100644 --- a/lustre/osd-ldiskfs/osd_oi.c +++ b/lustre/osd-ldiskfs/osd_oi.c @@ -70,6 +70,7 @@ /* osd_lookup(), struct osd_thread_info */ #include "osd_internal.h" #include "osd_igif.h" +#include "osd_scrub.h" static unsigned int osd_oi_count = OSD_OI_FID_NR; CFS_MODULE_PARM(osd_oi_count, "i", int, 0444, @@ -89,22 +90,25 @@ static struct dt_index_features oi_feat = { #define OSD_OI_NAME_BASE "oi.16" static void osd_oi_table_put(struct osd_thread_info *info, - struct osd_oi **oi_table, unsigned oi_count) + struct osd_oi **oi_table, unsigned oi_count) { - struct iam_container *bag; - int i; - - for (i = 0; i < oi_count; i++) { - LASSERT(oi_table[i] != NULL); - LASSERT(oi_table[i]->oi_inode != NULL); - - bag = &(oi_table[i]->oi_dir.od_container); - if (bag->ic_object == oi_table[i]->oi_inode) - iam_container_fini(bag); - iput(oi_table[i]->oi_inode); - oi_table[i]->oi_inode = NULL; - OBD_FREE_PTR(oi_table[i]); - } + struct iam_container *bag; + int i; + + for (i = 0; i < oi_count; i++) { + if (oi_table[i] == NULL) + continue; + + LASSERT(oi_table[i]->oi_inode != NULL); + + bag = &(oi_table[i]->oi_dir.od_container); + if (bag->ic_object == oi_table[i]->oi_inode) + iam_container_fini(bag); + iput(oi_table[i]->oi_inode); + oi_table[i]->oi_inode = NULL; + OBD_FREE_PTR(oi_table[i]); + oi_table[i] = NULL; + } } static int osd_oi_index_create_one(struct osd_thread_info *info, @@ -281,89 +285,136 @@ out_inode: */ static int osd_oi_table_open(struct osd_thread_info *info, struct osd_device *osd, - struct osd_oi **oi_table, unsigned oi_count, bool create) + struct osd_oi **oi_table, unsigned oi_count, bool create) { - struct dt_device *dev = &osd->od_dt_dev; - int count = 0; - int rc = 0; - int i; - - /* NB: oi_count != 0 means that we have already created/known all OIs - * and have known exact number of OIs. 
*/ - LASSERT(oi_count <= OSD_OI_FID_NR_MAX); - - for (i = 0; i < (oi_count != 0 ? oi_count : OSD_OI_FID_NR_MAX); i++) { - char name[12]; - - sprintf(name, "%s.%d", OSD_OI_NAME_BASE, i); - rc = osd_oi_open(info, osd, name, &oi_table[i], create); - if (rc == 0) { - count++; - continue; - } - - if (rc == -ENOENT && oi_count == 0) - return count; - - CERROR("%s: can't open %s: rc = %d\n", - dev->dd_lu_dev.ld_obd->obd_name, name, rc); - if (oi_count > 0) { - CERROR("%s: expect to open total %d OI files.\n", - dev->dd_lu_dev.ld_obd->obd_name, oi_count); - } - break; - } + struct dt_device *dev = &osd->od_dt_dev; + struct scrub_file *sf = &osd->od_scrub.os_file; + int count = 0; + int rc = 0; + int i; + ENTRY; + + /* NB: oi_count != 0 means that we have already created/known all OIs + * and have known exact number of OIs. */ + LASSERT(oi_count <= OSD_OI_FID_NR_MAX); + + for (i = 0; i < (oi_count != 0 ? oi_count : OSD_OI_FID_NR_MAX); i++) { + char name[12]; + + if (oi_table[i] != NULL) { + count++; + continue; + } - if (rc < 0) { - osd_oi_table_put(info, oi_table, count); - return rc; - } + sprintf(name, "%s.%d", OSD_OI_NAME_BASE, i); + rc = osd_oi_open(info, osd, name, &oi_table[i], create); + if (rc == 0) { + count++; + continue; + } - return count; + if (rc == -ENOENT && create == false) { + if (oi_count == 0) + return count; + + rc = 0; + ldiskfs_set_bit(i, sf->sf_oi_bitmap); + continue; + } + + CERROR("%s: can't open %s: rc = %d\n", + dev->dd_lu_dev.ld_obd->obd_name, name, rc); + if (oi_count > 0) + CERROR("%s: expect to open total %d OI files.\n", + dev->dd_lu_dev.ld_obd->obd_name, oi_count); + break; + } + + if (rc < 0) { + osd_oi_table_put(info, oi_table, oi_count > 0 ? 
oi_count : i); + count = rc; + } + + RETURN(count); } int osd_oi_init(struct osd_thread_info *info, struct osd_device *osd) { - struct dt_device *dev = &osd->od_dt_dev; - struct osd_oi **oi; - int rc; + struct dt_device *dev = &osd->od_dt_dev; + struct osd_scrub *scrub = &osd->od_scrub; + struct scrub_file *sf = &scrub->os_file; + struct osd_oi **oi; + int rc; + ENTRY; + + OBD_ALLOC(oi, sizeof(*oi) * OSD_OI_FID_NR_MAX); + if (oi == NULL) + RETURN(-ENOMEM); + + cfs_mutex_lock(&oi_init_lock); + /* try to open existing multiple OIs first */ + rc = osd_oi_table_open(info, osd, oi, sf->sf_oi_count, false); + if (rc < 0) + GOTO(out, rc); - OBD_ALLOC(oi, sizeof(*oi) * OSD_OI_FID_NR_MAX); - if (oi == NULL) - return -ENOMEM; - - cfs_mutex_lock(&oi_init_lock); - /* try to open existing multiple OIs first */ - rc = osd_oi_table_open(info, osd, oi, 0, false); - if (rc != 0) - goto out; - - /* if previous failed then try found single OI from old filesystem */ - rc = osd_oi_open(info, osd, OSD_OI_NAME_BASE, &oi[0], false); - if (rc == 0) { /* found single OI from old filesystem */ - rc = 1; - goto out; - } else if (rc != -ENOENT) { - CERROR("%s: can't open %s: rc = %d\n", - dev->dd_lu_dev.ld_obd->obd_name, OSD_OI_NAME_BASE, rc); - goto out; - } + if (rc > 0) { + if (rc == sf->sf_oi_count || sf->sf_oi_count == 0) + GOTO(out, rc); + + osd_scrub_file_reset(scrub, + LDISKFS_SB(osd_sb(osd))->s_es->s_uuid, + SF_RECREATED); + osd_oi_count = sf->sf_oi_count; + goto create; + } + + /* if previous failed then try found single OI from old filesystem */ + rc = osd_oi_open(info, osd, OSD_OI_NAME_BASE, &oi[0], false); + if (rc == 0) { /* found single OI from old filesystem */ + GOTO(out, rc = 1); + } else if (rc != -ENOENT) { + CERROR("%s: can't open %s: rc = %d\n", + dev->dd_lu_dev.ld_obd->obd_name, OSD_OI_NAME_BASE, rc); + GOTO(out, rc); + } + + if (sf->sf_oi_count > 0) { + int i; + + memset(sf->sf_oi_bitmap, 0, SCRUB_OI_BITMAP_SIZE); + for (i = 0; i < osd_oi_count; i++) + 
ldiskfs_set_bit(i, sf->sf_oi_bitmap); + osd_scrub_file_reset(scrub, + LDISKFS_SB(osd_sb(osd))->s_es->s_uuid, + SF_RECREATED); + } + sf->sf_oi_count = osd_oi_count; + +create: + rc = osd_scrub_file_store(scrub); + if (rc < 0) { + osd_oi_table_put(info, oi, sf->sf_oi_count); + GOTO(out, rc); + } + + /* No OIs exist, new filesystem, create OI objects */ + rc = osd_oi_table_open(info, osd, oi, osd_oi_count, true); + LASSERT(ergo(rc >= 0, rc == osd_oi_count)); + + GOTO(out, rc); - /* No OIs exist, new filesystem, create OI objects */ - rc = osd_oi_table_open(info, osd, oi, osd_oi_count, true); - LASSERT(ergo(rc >= 0, rc == osd_oi_count)); out: - if (rc < 0) { - OBD_FREE(oi, sizeof(*oi) * OSD_OI_FID_NR_MAX); - } else { - LASSERT((rc & (rc - 1)) == 0); - osd->od_oi_table = oi; - osd->od_oi_count = rc; - rc = 0; - } + if (rc < 0) { + OBD_FREE(oi, sizeof(*oi) * OSD_OI_FID_NR_MAX); + } else { + LASSERT((rc & (rc - 1)) == 0); + osd->od_oi_table = oi; + osd->od_oi_count = rc; + rc = 0; + } - cfs_mutex_unlock(&oi_init_lock); - return rc; + cfs_mutex_unlock(&oi_init_lock); + return rc; } void osd_oi_fini(struct osd_thread_info *info, struct osd_device *osd) diff --git a/lustre/osd-ldiskfs/osd_oi.h b/lustre/osd-ldiskfs/osd_oi.h index bae99ed..8db80dd 100644 --- a/lustre/osd-ldiskfs/osd_oi.h +++ b/lustre/osd-ldiskfs/osd_oi.h @@ -84,6 +84,11 @@ struct osd_inode_id { __u32 oii_gen; /* inode generation */ }; +struct osd_idmap_cache { + struct lu_fid oic_fid; + struct osd_inode_id oic_lid; +}; + static inline void osd_id_pack(struct osd_inode_id *tgt, const struct osd_inode_id *src) { diff --git a/lustre/osd-ldiskfs/osd_scrub.c b/lustre/osd-ldiskfs/osd_scrub.c new file mode 100644 index 0000000..ccdca14 --- /dev/null +++ b/lustre/osd-ldiskfs/osd_scrub.c @@ -0,0 +1,940 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012 Whamcloud, Inc. + */ +/* + * lustre/osd-ldiskfs/osd_scrub.c + * + * Top-level entry points into osd module + * + * The OI scrub is used for rebuilding the Object Index files when an MDT is + * restored from a file-level backup. + * + * The otable-based iterator scans the ldiskfs inode table to feed the + * upper-layer LFSCK. 
+ * + * Author: Fan Yong + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_MDS + +#include +#include +#include + +#include "osd_internal.h" +#include "osd_oi.h" +#include "osd_scrub.h" + +#define HALF_SEC (CFS_HZ >> 1) + +static inline struct osd_device *osd_scrub2dev(struct osd_scrub *scrub) +{ + return container_of0(scrub, struct osd_device, od_scrub); +} + +static inline struct super_block *osd_scrub2sb(struct osd_scrub *scrub) +{ + return osd_sb(osd_scrub2dev(scrub)); +} + +static void osd_scrub_file_to_cpu(struct scrub_file *des, + struct scrub_file *src) +{ + memcpy(des->sf_uuid, src->sf_uuid, 16); + des->sf_flags = le64_to_cpu(src->sf_flags); + des->sf_magic = le32_to_cpu(src->sf_magic); + des->sf_status = le16_to_cpu(src->sf_status); + des->sf_param = le16_to_cpu(src->sf_param); + des->sf_time_last_complete = + le64_to_cpu(src->sf_time_last_complete); + des->sf_time_latest_start = + le64_to_cpu(src->sf_time_latest_start); + des->sf_time_last_checkpoint = + le64_to_cpu(src->sf_time_last_checkpoint); + des->sf_pos_latest_start = + le64_to_cpu(src->sf_pos_latest_start); + des->sf_pos_last_checkpoint = + le64_to_cpu(src->sf_pos_last_checkpoint); + des->sf_pos_first_inconsistent = + le64_to_cpu(src->sf_pos_first_inconsistent); + des->sf_items_checked = + le64_to_cpu(src->sf_items_checked); + des->sf_items_updated = + le64_to_cpu(src->sf_items_updated); + des->sf_items_failed = + le64_to_cpu(src->sf_items_failed); + des->sf_items_updated_prior = + le64_to_cpu(src->sf_items_updated_prior); + des->sf_run_time = le32_to_cpu(src->sf_run_time); + des->sf_success_count = le32_to_cpu(src->sf_success_count); + des->sf_oi_count = le16_to_cpu(src->sf_oi_count); + memcpy(des->sf_oi_bitmap, src->sf_oi_bitmap, SCRUB_OI_BITMAP_SIZE); +} + +static void osd_scrub_file_to_le(struct scrub_file *des, + struct scrub_file *src) +{ + memcpy(des->sf_uuid, src->sf_uuid, 16); + des->sf_flags = cpu_to_le64(src->sf_flags); + des->sf_magic = 
cpu_to_le32(src->sf_magic); + des->sf_status = cpu_to_le16(src->sf_status); + des->sf_param = cpu_to_le16(src->sf_param); + des->sf_time_last_complete = + cpu_to_le64(src->sf_time_last_complete); + des->sf_time_latest_start = + cpu_to_le64(src->sf_time_latest_start); + des->sf_time_last_checkpoint = + cpu_to_le64(src->sf_time_last_checkpoint); + des->sf_pos_latest_start = + cpu_to_le64(src->sf_pos_latest_start); + des->sf_pos_last_checkpoint = + cpu_to_le64(src->sf_pos_last_checkpoint); + des->sf_pos_first_inconsistent = + cpu_to_le64(src->sf_pos_first_inconsistent); + des->sf_items_checked = + cpu_to_le64(src->sf_items_checked); + des->sf_items_updated = + cpu_to_le64(src->sf_items_updated); + des->sf_items_failed = + cpu_to_le64(src->sf_items_failed); + des->sf_items_updated_prior = + cpu_to_le64(src->sf_items_updated_prior); + des->sf_run_time = cpu_to_le32(src->sf_run_time); + des->sf_success_count = cpu_to_le32(src->sf_success_count); + des->sf_oi_count = cpu_to_le16(src->sf_oi_count); + memcpy(des->sf_oi_bitmap, src->sf_oi_bitmap, SCRUB_OI_BITMAP_SIZE); +} + +static void osd_scrub_file_init(struct osd_scrub *scrub, __u8 *uuid) +{ + struct scrub_file *sf = &scrub->os_file; + + memset(sf, 0, sizeof(*sf)); + memcpy(sf->sf_uuid, uuid, 16); + sf->sf_magic = SCRUB_MAGIC_V1; + sf->sf_status = SS_INIT; +} + +void osd_scrub_file_reset(struct osd_scrub *scrub, __u8 *uuid, __u64 flags) +{ + struct scrub_file *sf = &scrub->os_file; + + CDEBUG(D_LFSCK, "Reset OI scrub file, flags = "LPX64"\n", flags); + memcpy(sf->sf_uuid, uuid, 16); + sf->sf_status = SS_INIT; + sf->sf_flags |= flags; + sf->sf_param = 0; + sf->sf_run_time = 0; + sf->sf_time_latest_start = 0; + sf->sf_time_last_checkpoint = 0; + sf->sf_pos_latest_start = 0; + sf->sf_pos_last_checkpoint = 0; + sf->sf_pos_first_inconsistent = 0; + sf->sf_items_checked = 0; + sf->sf_items_updated = 0; + sf->sf_items_failed = 0; + sf->sf_items_updated_prior = 0; +} + +static int osd_scrub_file_load(struct osd_scrub *scrub) +{ 
+ loff_t pos = 0; + char *name = LDISKFS_SB(osd_scrub2sb(scrub))->s_es->s_volume_name; + int len = sizeof(scrub->os_file_disk); + int rc; + + rc = osd_ldiskfs_read(scrub->os_inode, &scrub->os_file_disk, len, &pos); + if (rc == len) { + struct scrub_file *sf = &scrub->os_file; + + osd_scrub_file_to_cpu(sf, &scrub->os_file_disk); + if (sf->sf_magic != SCRUB_MAGIC_V1) { + CWARN("%.16s: invalid scrub magic 0x%x != 0x%x\n,", + name, sf->sf_magic, SCRUB_MAGIC_V1); + /* Process it as new scrub file. */ + rc = -ENOENT; + } else { + rc = 0; + } + } else if (rc != 0) { + CERROR("%.16s: fail to load scrub file, expected = %d, " + "rc = %d\n", name, len, rc); + if (rc > 0) + rc = -EFAULT; + } else { + /* return -ENOENT for empty scrub file case. */ + rc = -ENOENT; + } + + return rc; +} + +int osd_scrub_file_store(struct osd_scrub *scrub) +{ + struct osd_device *dev; + handle_t *jh; + loff_t pos = 0; + int len = sizeof(scrub->os_file_disk); + int credits; + int rc; + + dev = container_of0(scrub, struct osd_device, od_scrub); + credits = osd_dto_credits_noquota[DTO_WRITE_BASE] + + osd_dto_credits_noquota[DTO_WRITE_BLOCK]; + jh = ldiskfs_journal_start_sb(osd_sb(dev), credits); + if (IS_ERR(jh)) { + rc = PTR_ERR(jh); + CERROR("%.16s: fail to start trans for scrub store, rc = %d\n", + LDISKFS_SB(osd_scrub2sb(scrub))->s_es->s_volume_name,rc); + return rc; + } + + osd_scrub_file_to_le(&scrub->os_file_disk, &scrub->os_file); + rc = osd_ldiskfs_write_record(scrub->os_inode, &scrub->os_file_disk, + len, &pos, jh); + ldiskfs_journal_stop(jh); + if (rc != 0) + CERROR("%.16s: fail to store scrub file, expected = %d, " + "rc = %d\n", + LDISKFS_SB(osd_scrub2sb(scrub))->s_es->s_volume_name, + len, rc); + scrub->os_time_last_checkpoint = cfs_time_current(); + scrub->os_time_next_checkpoint = scrub->os_time_last_checkpoint + + cfs_time_seconds(SCRUB_CHECKPOINT_INTERVAL); + return rc; +} + +static int osd_scrub_prep(struct osd_device *dev) +{ + struct osd_scrub *scrub = &dev->od_scrub; + struct 
ptlrpc_thread *thread = &scrub->os_thread; + struct scrub_file *sf = &scrub->os_file; + __u32 flags = scrub->os_start_flags; + int rc; + ENTRY; + + cfs_down_write(&scrub->os_rwsem); + if (flags & SS_SET_FAILOUT) + sf->sf_param |= SP_FAILOUT; + + if (flags & SS_CLEAR_FAILOUT) + sf->sf_param &= ~SP_FAILOUT; + + if (flags & SS_RESET) + osd_scrub_file_reset(scrub, + LDISKFS_SB(osd_sb(dev))->s_es->s_uuid, sf->sf_flags); + + if (flags & SS_AUTO) { + scrub->os_full_speed = 1; + sf->sf_flags |= SF_AUTO; + } else { + scrub->os_full_speed = 0; + } + + if (sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT)) + scrub->os_full_speed = 1; + + scrub->os_in_prior = 0; + scrub->os_waiting = 0; + scrub->os_new_checked = 0; + if (sf->sf_pos_last_checkpoint != 0) + sf->sf_pos_latest_start = sf->sf_pos_last_checkpoint + 1; + else + sf->sf_pos_latest_start = LDISKFS_FIRST_INO(osd_sb(dev)); + + scrub->os_pos_current = sf->sf_pos_latest_start; + sf->sf_status = SS_SCANNING; + sf->sf_time_latest_start = cfs_time_current_sec(); + sf->sf_time_last_checkpoint = sf->sf_time_latest_start; + rc = osd_scrub_file_store(scrub); + if (rc == 0) { + cfs_spin_lock(&scrub->os_lock); + thread_set_flags(thread, SVC_RUNNING); + cfs_spin_unlock(&scrub->os_lock); + cfs_waitq_broadcast(&thread->t_ctl_waitq); + } + cfs_up_write(&scrub->os_rwsem); + + RETURN(rc); +} + +static int osd_scrub_error_handler(struct osd_device *dev, + struct osd_inode_id *lid, int rc) +{ + struct osd_scrub *scrub = &dev->od_scrub; + struct scrub_file *sf = &scrub->os_file; + + cfs_down_write(&scrub->os_rwsem); + scrub->os_new_checked++; + sf->sf_items_failed++; + if (sf->sf_pos_first_inconsistent == 0 || + sf->sf_pos_first_inconsistent > lid->oii_ino) + sf->sf_pos_first_inconsistent = lid->oii_ino; + cfs_up_write(&scrub->os_rwsem); + return sf->sf_param & SP_FAILOUT ? 
rc : 0; +} + +static int +osd_scrub_check_update(struct osd_thread_info *info, struct osd_device *dev, + struct osd_idmap_cache *oic) +{ + struct osd_scrub *scrub = &dev->od_scrub; + struct scrub_file *sf = &scrub->os_file; + struct osd_inode_id *lid2 = &info->oti_id; + struct lu_fid *oi_fid = &info->oti_fid; + struct osd_inode_id *oi_id = &info->oti_id; + handle_t *jh = NULL; + struct osd_inconsistent_item *oii = NULL; + struct inode *inode = NULL; + struct lu_fid *fid = &oic->oic_fid; + struct osd_inode_id *lid = &oic->oic_lid; + struct iam_container *bag; + struct iam_path_descr *ipd; + int ops = DTO_INDEX_UPDATE; + int idx; + int rc; + ENTRY; + + if (scrub->os_in_prior) + oii = cfs_list_entry(oic, struct osd_inconsistent_item, + oii_cache); + + cfs_down_write(&scrub->os_rwsem); + scrub->os_new_checked++; + if (lid->oii_ino < sf->sf_pos_latest_start && oii == NULL) + GOTO(out, rc = 0); + + if (oii != NULL && oii->oii_insert) + goto iget; + + rc = osd_oi_lookup(info, dev, fid, lid2); + if (rc != 0) { + if (rc != -ENOENT) + GOTO(out, rc); + +iget: + inode = osd_iget(info, dev, lid); + if (IS_ERR(inode)) { + rc = PTR_ERR(inode); + /* Someone removed the inode. */ + if (rc == -ENOENT || rc == -ESTALE) + rc = 0; + GOTO(out, rc); + } + + /* Prevent the inode to be unlinked during OI scrub. 
*/ + cfs_mutex_lock(&inode->i_mutex); + if (unlikely(inode->i_nlink == 0)) { + cfs_mutex_unlock(&inode->i_mutex); + iput(inode); + GOTO(out, rc = 0); + } + + ops = DTO_INDEX_INSERT; + idx = osd_oi_fid2idx(dev, fid); + if (unlikely(!ldiskfs_test_bit(idx, sf->sf_oi_bitmap))) + ldiskfs_set_bit(idx, sf->sf_oi_bitmap); + sf->sf_flags |= SF_RECREATED; + } else if (osd_id_eq(lid, lid2)) { + GOTO(out, rc = 0); + } + + sf->sf_flags |= SF_INCONSISTENT; + fid_cpu_to_be(oi_fid, fid); + osd_id_pack(oi_id, &oic->oic_lid); + jh = ldiskfs_journal_start_sb(osd_sb(dev), + osd_dto_credits_noquota[ops]); + if (IS_ERR(jh)) { + rc = PTR_ERR(jh); + CERROR("%.16s: fail to start trans for scrub store, rc = %d\n", + LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name, rc); + GOTO(out, rc); + } + + bag = &osd_fid2oi(dev, fid)->oi_dir.od_container; + ipd = osd_idx_ipd_get(info->oti_env, bag); + if (unlikely(ipd == NULL)) { + ldiskfs_journal_stop(jh); + CERROR("%.16s: fail to get ipd for scrub store\n", + LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name); + GOTO(out, rc = -ENOMEM); + } + + if (ops == DTO_INDEX_UPDATE) + rc = iam_update(jh, bag, (const struct iam_key *)oi_fid, + (struct iam_rec *)oi_id, ipd); + else + rc = iam_insert(jh, bag, (const struct iam_key *)oi_fid, + (struct iam_rec *)oi_id, ipd); + osd_ipd_put(info->oti_env, bag, ipd); + ldiskfs_journal_stop(jh); + if (rc == 0) { + if (scrub->os_in_prior) + sf->sf_items_updated_prior++; + else + sf->sf_items_updated++; + } + + GOTO(out, rc); + +out: + if (rc != 0) { + sf->sf_items_failed++; + if (sf->sf_pos_first_inconsistent == 0 || + sf->sf_pos_first_inconsistent > lid->oii_ino) + sf->sf_pos_first_inconsistent = lid->oii_ino; + } + + if (ops == DTO_INDEX_INSERT) { + cfs_mutex_unlock(&inode->i_mutex); + iput(inode); + } + cfs_up_write(&scrub->os_rwsem); + + if (oii != NULL) { + LASSERT(!cfs_list_empty(&oii->oii_list)); + + cfs_spin_lock(&scrub->os_lock); + cfs_list_del_init(&oii->oii_list); + cfs_spin_unlock(&scrub->os_lock); + 
OBD_FREE_PTR(oii); + } + RETURN(sf->sf_param & SP_FAILOUT ? rc : 0); +} + +static int do_osd_scrub_checkpoint(struct osd_scrub *scrub) +{ + struct scrub_file *sf = &scrub->os_file; + int rc; + ENTRY; + + cfs_down_write(&scrub->os_rwsem); + sf->sf_items_checked += scrub->os_new_checked; + scrub->os_new_checked = 0; + sf->sf_pos_last_checkpoint = scrub->os_pos_current; + sf->sf_time_last_checkpoint = cfs_time_current_sec(); + sf->sf_run_time += cfs_duration_sec(cfs_time_current() + HALF_SEC - + scrub->os_time_last_checkpoint); + rc = osd_scrub_file_store(scrub); + cfs_up_write(&scrub->os_rwsem); + + RETURN(rc); +} + +static inline int osd_scrub_checkpoint(struct osd_scrub *scrub) +{ + if (unlikely(cfs_time_beforeq(scrub->os_time_next_checkpoint, + cfs_time_current()) && + scrub->os_new_checked > 0)) + return do_osd_scrub_checkpoint(scrub); + return 0; +} + +static void osd_scrub_post(struct osd_scrub *scrub, int result) +{ + struct scrub_file *sf = &scrub->os_file; + ENTRY; + + cfs_down_write(&scrub->os_rwsem); + cfs_spin_lock(&scrub->os_lock); + thread_set_flags(&scrub->os_thread, SVC_STOPPING); + cfs_spin_unlock(&scrub->os_lock); + if (scrub->os_new_checked > 0) { + sf->sf_items_checked += scrub->os_new_checked; + scrub->os_new_checked = 0; + sf->sf_pos_last_checkpoint = scrub->os_pos_current; + } + sf->sf_time_last_checkpoint = cfs_time_current_sec(); + if (result > 0) { + sf->sf_status = SS_COMPLETED; + memset(sf->sf_oi_bitmap, 0, SCRUB_OI_BITMAP_SIZE); + sf->sf_flags &= ~(SF_RECREATED | SF_INCONSISTENT | SF_AUTO); + sf->sf_time_last_complete = sf->sf_time_last_checkpoint; + sf->sf_success_count++; + } else if (result == 0) { + sf->sf_status = SS_PAUSED; + } else { + sf->sf_status = SS_FAILED; + } + sf->sf_run_time += cfs_duration_sec(cfs_time_current() + HALF_SEC - + scrub->os_time_last_checkpoint); + result = osd_scrub_file_store(scrub); + if (result < 0) + CERROR("%.16s: fail to osd_scrub_post, rc = %d\n", + 
LDISKFS_SB(osd_scrub2sb(scrub))->s_es->s_volume_name, + result); + cfs_up_write(&scrub->os_rwsem); + + EXIT; +} + +#define SCRUB_NEXT_BREAK 1 +#define SCRUB_NEXT_CONTINUE 2 + +static int +osd_scrub_next(struct osd_thread_info *info, struct osd_device *dev, + struct osd_scrub *scrub, struct super_block *sb, + ldiskfs_group_t bg, struct buffer_head *bitmap, __u32 gbase, + __u32 *offset, struct osd_idmap_cache **oic) +{ + struct osd_inconsistent_item *oii; + struct lu_fid *fid; + struct osd_inode_id *lid; + struct inode *inode; + int rc = 0; + + if (!cfs_list_empty(&scrub->os_inconsistent_items)) { + oii = cfs_list_entry(scrub->os_inconsistent_items.next, + struct osd_inconsistent_item, oii_list); + *oic = &oii->oii_cache; + scrub->os_in_prior = 1; + return 0; + } + + *oic = &scrub->os_oic; + fid = &(*oic)->oic_fid; + lid = &(*oic)->oic_lid; + *offset = ldiskfs_find_next_bit(bitmap->b_data, + LDISKFS_INODES_PER_GROUP(sb), *offset); + if (*offset >= LDISKFS_INODES_PER_GROUP(sb)) { + brelse(bitmap); + scrub->os_pos_current = 1 + (bg + 1) * + LDISKFS_INODES_PER_GROUP(sb); + return SCRUB_NEXT_BREAK; + } + + scrub->os_pos_current = gbase + *offset; + osd_id_gen(lid, scrub->os_pos_current, OSD_OII_NOGEN); + inode = osd_iget_fid(info, dev, lid, fid); + if (IS_ERR(inode)) { + rc = PTR_ERR(inode); + /* The inode may be removed after bitmap searching, or the + * file is new created without inode initialized yet. 
*/ + if (rc == -ENOENT || rc == -ESTALE) + rc = SCRUB_NEXT_CONTINUE; + else + CERROR("%.16s: fail to read inode, group = %u, " + "ino# = %u, rc = %d\n", + LDISKFS_SB(sb)->s_es->s_volume_name, + bg, scrub->os_pos_current, rc); + } else { + if (fid_is_igif(fid) || fid_is_idif(fid) || + fid_seq(fid) == FID_SEQ_LLOG || + fid_seq(fid) == FID_SEQ_LOCAL_FILE || + fid_seq_is_rsvd(fid_seq(fid)) || + inode->i_state & I_LUSTRE_NOSCRUB) + rc = SCRUB_NEXT_CONTINUE; + iput(inode); + } + return rc; +} + +static inline int osd_scrub_has_window(struct osd_scrub *scrub, + struct osd_otable_cache *ooc) +{ + return scrub->os_pos_current < ooc->ooc_pos_preload + SCRUB_WINDOW_SIZE; +} + +static int osd_scrub_main(void *args) +{ + struct lu_env env; + struct osd_thread_info *info; + struct osd_device *dev = (struct osd_device *)args; + struct osd_scrub *scrub = &dev->od_scrub; + struct ptlrpc_thread *thread = &scrub->os_thread; + cfs_list_t *list = &scrub->os_inconsistent_items; + struct l_wait_info lwi = { 0 }; + struct super_block *sb = osd_sb(dev); + struct osd_otable_it *it = NULL; + struct osd_otable_cache *ooc = NULL; + int noslot = 0; + int rc; + __u32 max; + ENTRY; + + cfs_daemonize("OI_scrub"); + rc = lu_env_init(&env, LCT_DT_THREAD); + if (rc != 0) { + CERROR("%.16s: OI scrub, fail to init env, rc = %d\n", + LDISKFS_SB(sb)->s_es->s_volume_name, rc); + GOTO(noenv, rc); + } + + info = osd_oti_get(&env); + rc = osd_scrub_prep(dev); + if (rc != 0) { + CERROR("%.16s: OI scrub, fail to scrub prep, rc = %d\n", + LDISKFS_SB(sb)->s_es->s_volume_name, rc); + GOTO(out, rc); + } + + if (!scrub->os_full_speed) { + LASSERT(dev->od_otable_it != NULL); + + it = dev->od_otable_it; + ooc = &it->ooi_cache; + l_wait_event(thread->t_ctl_waitq, + it->ooi_user_ready || !thread_is_running(thread), + &lwi); + if (unlikely(!thread_is_running(thread))) + GOTO(post, rc = 0); + + LASSERT(scrub->os_pos_current >= ooc->ooc_pos_preload); + scrub->os_pos_current = ooc->ooc_pos_preload; + } + + CDEBUG(D_LFSCK, 
"OI scrub: flags = 0x%x, pos = %u\n", + scrub->os_start_flags, scrub->os_pos_current); + + max = le32_to_cpu(LDISKFS_SB(sb)->s_es->s_inodes_count); + while (scrub->os_pos_current <= max) { + struct buffer_head *bitmap = NULL; + struct osd_idmap_cache *oic = NULL; + ldiskfs_group_t bg = (scrub->os_pos_current - 1) / + LDISKFS_INODES_PER_GROUP(sb); + __u32 offset = (scrub->os_pos_current - 1) % + LDISKFS_INODES_PER_GROUP(sb); + __u32 gbase = 1 + bg * LDISKFS_INODES_PER_GROUP(sb); + + bitmap = ldiskfs_read_inode_bitmap(sb, bg); + if (bitmap == NULL) { + CERROR("%.16s: fail to read bitmap at pos = %u, " + "bg = %u, scrub will stop\n", + LDISKFS_SB(sb)->s_es->s_volume_name, + scrub->os_pos_current, (__u32)bg); + GOTO(post, rc = -EIO); + } + + while (offset < LDISKFS_INODES_PER_GROUP(sb)) { + if (unlikely(!thread_is_running(thread))) { + brelse(bitmap); + GOTO(post, rc = 0); + } + + if (cfs_list_empty(list) && noslot != 0) + goto wait; + + rc = osd_scrub_next(info, dev, scrub, sb, bg, + bitmap, gbase, &offset, &oic); + if (rc == SCRUB_NEXT_BREAK) + break; + else if (rc == SCRUB_NEXT_CONTINUE) + goto next; + + if (rc != 0) + rc = osd_scrub_error_handler(dev, &oic->oic_lid, + rc); + else + rc = osd_scrub_check_update(info, dev, oic); + if (rc != 0) { + brelse(bitmap); + GOTO(post, rc); + } + + rc = osd_scrub_checkpoint(scrub); + if (rc != 0) { + CERROR("%.16s: fail to checkpoint, pos = %u, " + "rc = %d\n", + LDISKFS_SB(sb)->s_es->s_volume_name, + scrub->os_pos_current, rc); + brelse(bitmap); + GOTO(post, rc); + } + + if (scrub->os_in_prior) { + scrub->os_in_prior = 0; + continue; + } + +next: + scrub->os_pos_current = gbase + ++offset; + if (dev->od_otable_it != NULL) { + if (unlikely(it == NULL)) { + it = dev->od_otable_it; + ooc = &it->ooi_cache; + } + + if (it->ooi_waiting && + (ooc->ooc_pos_preload < + scrub->os_pos_current)) { + it->ooi_waiting = 0; + cfs_waitq_broadcast( + &thread->t_ctl_waitq); + } + } + + if (scrub->os_full_speed || rc == SCRUB_NEXT_CONTINUE) + 
continue; + +wait: + if (osd_scrub_has_window(scrub, ooc)) { + noslot = 0; + continue; + } + + scrub->os_waiting = 1; + l_wait_event(thread->t_ctl_waitq, + osd_scrub_has_window(scrub, ooc) || + !cfs_list_empty(list) || + !thread_is_running(thread), + &lwi); + scrub->os_waiting = 0; + + if (osd_scrub_has_window(scrub, ooc)) + noslot = 0; + else + noslot = 1; + } + } + + GOTO(post, rc = (scrub->os_pos_current > max ? 1 : rc)); + +post: + osd_scrub_post(scrub, rc); + CDEBUG(D_LFSCK, "OI scrub: stop, rc = %d, pos = %u\n", + rc, scrub->os_pos_current); + +out: + while (!cfs_list_empty(list)) { + struct osd_inconsistent_item *oii; + + oii = cfs_list_entry(list->next, + struct osd_inconsistent_item, oii_list); + cfs_list_del_init(&oii->oii_list); + OBD_FREE_PTR(oii); + } + lu_env_fini(&env); + +noenv: + cfs_spin_lock(&scrub->os_lock); + thread_set_flags(thread, SVC_STOPPED); + cfs_waitq_broadcast(&thread->t_ctl_waitq); + cfs_spin_unlock(&scrub->os_lock); + return rc; +} + +static int do_osd_scrub_start(struct osd_device *dev, __u32 flags) +{ + struct osd_scrub *scrub = &dev->od_scrub; + struct ptlrpc_thread *thread = &scrub->os_thread; + struct l_wait_info lwi = { 0 }; + int rc; + ENTRY; + +again: + /* os_lock: sync status between stop and scrub thread */ + cfs_spin_lock(&scrub->os_lock); + if (thread_is_running(thread)) { + cfs_spin_unlock(&scrub->os_lock); + RETURN(-EALREADY); + } else if (unlikely(thread_is_stopping(thread))) { + cfs_spin_unlock(&scrub->os_lock); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopped(thread), + &lwi); + goto again; + } + cfs_spin_unlock(&scrub->os_lock); + + scrub->os_start_flags = flags; + thread_set_flags(thread, 0); + rc = cfs_create_thread(osd_scrub_main, dev, 0); + if (rc < 0) { + CERROR("%.16s: cannot start iteration thread, rc = %d\n", + LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name, rc); + RETURN(rc); + } + + l_wait_event(thread->t_ctl_waitq, + thread_is_running(thread) || thread_is_stopped(thread), + &lwi); + + RETURN(0); +} + 
+int osd_scrub_start(struct osd_device *dev) +{ + __u32 flags = SS_AUTO; + int rc; + ENTRY; + + if (dev->od_scrub.os_file.sf_status == SS_COMPLETED) + flags |= SS_RESET; + + /* od_otable_mutex: prevent curcurrent start/stop */ + cfs_mutex_lock(&dev->od_otable_mutex); + rc = do_osd_scrub_start(dev, flags); + cfs_mutex_unlock(&dev->od_otable_mutex); + + RETURN(rc == -EALREADY ? 0 : rc); +} + +static void do_osd_scrub_stop(struct osd_scrub *scrub) +{ + struct ptlrpc_thread *thread = &scrub->os_thread; + struct l_wait_info lwi = { 0 }; + + /* os_lock: sync status between stop and scrub thread */ + cfs_spin_lock(&scrub->os_lock); + if (!thread_is_init(thread) && !thread_is_stopped(thread)) { + thread_set_flags(thread, SVC_STOPPING); + cfs_spin_unlock(&scrub->os_lock); + cfs_waitq_broadcast(&thread->t_ctl_waitq); + l_wait_event(thread->t_ctl_waitq, + thread_is_stopped(thread), + &lwi); + /* Do not skip the last lock/unlock, which can guarantee that + * the caller cannot return until the OI scrub thread exit. 
*/ + cfs_spin_lock(&scrub->os_lock); + } + cfs_spin_unlock(&scrub->os_lock); +} + +static void osd_scrub_stop(struct osd_device *dev) +{ + /* od_otable_mutex: prevent curcurrent start/stop */ + cfs_mutex_lock(&dev->od_otable_mutex); + do_osd_scrub_stop(&dev->od_scrub); + cfs_mutex_unlock(&dev->od_otable_mutex); +} + +static const char osd_scrub_name[] = "OI_scrub"; + +int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev) +{ + struct osd_thread_info *info = osd_oti_get(env); + struct osd_scrub *scrub = &dev->od_scrub; + struct lvfs_run_ctxt *ctxt = &scrub->os_ctxt; + struct scrub_file *sf = &scrub->os_file; + struct osd_inode_id *id = &scrub->os_oic.oic_lid; + struct super_block *sb = osd_sb(dev); + struct ldiskfs_super_block *es = LDISKFS_SB(sb)->s_es; + struct inode *inode; + struct lvfs_run_ctxt saved; + struct file *filp; + int dirty = 0; + int init = 0; + int rc = 0; + ENTRY; + + OBD_SET_CTXT_MAGIC(ctxt); + ctxt->pwdmnt = dev->od_mnt; + ctxt->pwd = dev->od_mnt->mnt_root; + ctxt->fs = get_ds(); + + cfs_waitq_init(&scrub->os_thread.t_ctl_waitq); + cfs_init_rwsem(&scrub->os_rwsem); + cfs_spin_lock_init(&scrub->os_lock); + CFS_INIT_LIST_HEAD(&scrub->os_inconsistent_items); + if (get_mount_flags(dev->od_mount->lmi_sb) & LMD_FLG_NOSCRUB) + scrub->os_no_scrub = 1; + + push_ctxt(&saved, ctxt, NULL); + filp = filp_open(osd_scrub_name, O_RDWR | O_CREAT, 0644); + if (IS_ERR(filp)) + RETURN(PTR_ERR(filp)); + + scrub->os_inode = igrab(filp->f_dentry->d_inode); + filp_close(filp, 0); + pop_ctxt(&saved, ctxt, NULL); + + rc = osd_scrub_file_load(scrub); + if (rc == -ENOENT) { + osd_scrub_file_init(scrub, es->s_uuid); + dirty = 1; + init = 1; + } else if (rc != 0) { + RETURN(rc); + } else { + if (memcmp(sf->sf_uuid, es->s_uuid, 16) != 0) { + osd_scrub_file_reset(scrub, es->s_uuid,SF_INCONSISTENT); + dirty = 1; + } else if (sf->sf_status == SS_SCANNING) { + sf->sf_status = SS_CRASHED; + dirty = 1; + } + } + + if (sf->sf_pos_last_checkpoint != 0) + 
scrub->os_pos_current = sf->sf_pos_last_checkpoint + 1; + else + scrub->os_pos_current = LDISKFS_FIRST_INO(sb); + + if (dirty != 0) { + rc = osd_scrub_file_store(scrub); + if (rc != 0) + RETURN(rc); + } + + /* Initialize OI files. */ + rc = osd_oi_init(info, dev); + if (rc < 0) + RETURN(rc); + + if (init != 0) { + rc = __osd_oi_lookup(info, dev, &LU_DOT_LUSTRE_FID, id); + if (rc == 0) { + inode = osd_iget(info, dev, id); + if (IS_ERR(inode)) { + rc = PTR_ERR(inode); + /* It is restored from old 2.x backup. */ + if (rc == -ENOENT || rc == -ESTALE) { + osd_scrub_file_reset(scrub, es->s_uuid, + SF_INCONSISTENT); + rc = osd_scrub_file_store(scrub); + } + } else { + iput(inode); + } + } else if (rc == -ENOENT) { + rc = 0; + } + } + + if (rc == 0 && !scrub->os_no_scrub && + ((sf->sf_status == SS_CRASHED && + sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT | SF_AUTO)) || + (sf->sf_status == SS_INIT && + sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT)))) + rc = osd_scrub_start(dev); + + RETURN(rc); +} + +void osd_scrub_cleanup(const struct lu_env *env, struct osd_device *dev) +{ + struct osd_scrub *scrub = &dev->od_scrub; + + LASSERT(dev->od_otable_it == NULL); + + if (scrub->os_inode != NULL) { + osd_scrub_stop(dev); + iput(scrub->os_inode); + scrub->os_inode = NULL; + } + if (dev->od_oi_table != NULL) + osd_oi_fini(osd_oti_get(env), dev); +} diff --git a/lustre/osd-ldiskfs/osd_scrub.h b/lustre/osd-ldiskfs/osd_scrub.h new file mode 100644 index 0000000..5c9df6d --- /dev/null +++ b/lustre/osd-ldiskfs/osd_scrub.h @@ -0,0 +1,195 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details. A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012 Whamcloud, Inc.
+ */
+/*
+ * lustre/osd-ldiskfs/osd_scrub.h
+ *
+ * Shared definitions and declarations for OI scrub.
+ *
+ * Author: Fan Yong
+ */
+
+#ifndef _OSD_SCRUB_H
+# define _OSD_SCRUB_H
+
+#include "osd_oi.h"
+/*
+ * SCRUB_MAGIC_V1:            scrub magic (presumably the value kept in
+ *                            scrub_file::sf_magic - TODO confirm).
+ * SCRUB_CHECKPOINT_INTERVAL: interval between two checkpoints (presumably
+ *                            in seconds - TODO confirm against its user).
+ * SCRUB_OI_BITMAP_SIZE:      size in bytes of scrub_file::sf_oi_bitmap,
+ *                            i.e. one bit per possible OI file.
+ * SCRUB_WINDOW_SIZE:         how many inodes the scrub position may run
+ *                            ahead of the iterator preload position.
+ */
+#define SCRUB_MAGIC_V1			0x4C5FD252
+#define SCRUB_CHECKPOINT_INTERVAL	60
+#define SCRUB_OI_BITMAP_SIZE		(OSD_OI_FID_NR_MAX >> 3)
+#define SCRUB_WINDOW_SIZE		1024
+/*
+ * Status of the OI scrub, kept in scrub_file::sf_status.
+ * NOTE: these constants share the SS_ prefix with 'enum scrub_start'.
+ */
+enum scrub_status {
+	/* The scrub file is newly created: for a new MDT, when upgrading
+	 * from an old disk, or when re-creating the scrub file manually. */
+	SS_INIT		= 0,
+
+	/* The scrub is checking/repairing the OI files. */
+	SS_SCANNING	= 1,
+
+	/* The scrub checked/repaired the OI files successfully. */
+	SS_COMPLETED	= 2,
+
+	/* The scrub failed to check/repair the OI files. */
+	SS_FAILED	= 3,
+
+	/* The scrub is stopped manually, the OI files may be inconsistent. */
+	SS_PAUSED	= 4,
+
+	/* The scrub crashed during the scanning, should be restarted. */
+	SS_CRASHED	= 5,
+};
+/*
+ * Bit flags kept in scrub_file::sf_flags.
+ */
+enum scrub_flags {
+	/* OI files have been recreated, OI mappings should be re-inserted. */
+	SF_RECREATED	= 0x0000000000000001ULL,
+
+	/* OI files are invalid and should be rebuilt ASAP. */
+	SF_INCONSISTENT	= 0x0000000000000002ULL,
+
+	/* OI scrub is triggered automatically. */
+	SF_AUTO		= 0x0000000000000004ULL,
+};
+/*
+ * Bit flags kept in scrub_file::sf_param.
+ */
+enum scrub_param {
+	/* Exit when fail. */
+	SP_FAILOUT	= 0x0001,
+};
+/*
+ * Bit flags passed when starting the scrub (see osd_scrub::os_start_flags).
+ * NOTE: these constants share the SS_ prefix with 'enum scrub_status'.
+ */
+enum scrub_start {
+	/* Set failout flag. */
+	SS_SET_FAILOUT		= 0x00000001,
+
+	/* Clear failout flag. */
+	SS_CLEAR_FAILOUT	= 0x00000002,
+
+	/* Reset scrub start position. */
+	SS_RESET		= 0x00000004,
+
+	/* Trigger scrub automatically. */
+	SS_AUTO			= 0x00000008,
+};
+/*
+ * Contents of the scrub file. The same structure is used for the in-memory
+ * copy (osd_scrub::os_file) and for the load/store buffer
+ * (osd_scrub::os_file_disk).
+ * NOTE(review): since it is the load/store buffer layout, the field order
+ * and sizes presumably must not be changed without updating the magic -
+ * confirm against osd_scrub_file_load()/osd_scrub_file_store().
+ */
+struct scrub_file {
+	/* 128-bit uuid for volume. */
+	__u8	sf_uuid[16];
+
+	/* See 'enum scrub_flags'. */
+	__u64	sf_flags;
+
+	/* The scrub magic. */
+	__u32	sf_magic;
+
+	/* See 'enum scrub_status'. */
+	__u16	sf_status;
+
+	/* See 'enum scrub_param'. */
+	__u16	sf_param;
+
+	/* The time when the last OI scrub completed. */
+	__u64	sf_time_last_complete;
+
+	/* The time when the latest OI scrub was started. */
+	__u64	sf_time_latest_start;
+
+	/* The time of the last OI scrub checkpoint. */
+	__u64	sf_time_last_checkpoint;
+
+	/* The position the latest OI scrub started from. */
+	__u64	sf_pos_latest_start;
+
+	/* The position of the last OI scrub checkpoint. */
+	__u64	sf_pos_last_checkpoint;
+
+	/* The position of the first object that should be updated. */
+	__u64	sf_pos_first_inconsistent;
+
+	/* How many objects have been checked. */
+	__u64	sf_items_checked;
+
+	/* How many objects have been updated. */
+	__u64	sf_items_updated;
+
+	/* How many objects failed to be processed. */
+	__u64	sf_items_failed;
+
+	/* How many prior objects have been updated during scanning. */
+	__u64	sf_items_updated_prior;
+
+	/* How long the OI scrub has run. */
+	__u32	sf_run_time;
+
+	/* How many times the OI scrub completed on the device. */
+	__u32	sf_success_count;
+
+	/* How many OI files. */
+	__u16	sf_oi_count;
+
+	/* Update the magic or flags if want to use the reserved fields. */
+	__u16	sf_reserved_0;
+	__u32	sf_reserved_1;
+	__u64	sf_reserved_2[16];
+
+	/* Bitmap for the OI-files-recreated case: one bit per OI file,
+	 * indexed by the OI index of the FID. */
+	__u8	sf_oi_bitmap[SCRUB_OI_BITMAP_SIZE];
+};
+/*
+ * Per-device, in-memory state of the OI scrub.
+ */
+struct osd_scrub {
+	struct lvfs_run_ctxt	os_ctxt; /* run context used while opening
+					  * the scrub file */
+	struct ptlrpc_thread	os_thread; /* state flags and wait queue of
+					    * the scrub thread */
+	struct osd_idmap_cache	os_oic; /* FID/inode id of the object being
+					 * scanned */
+	cfs_list_t		os_inconsistent_items; /* items to be
+							* processed with
+							* priority, see
+							* os_in_prior */
+
+	/* write lock for scrub prep/update/post/checkpoint,
+	 * read lock for scrub dump. */
+	cfs_rw_semaphore_t	os_rwsem;
+	cfs_spinlock_t		os_lock; /* syncs the thread status between
+					  * stop and the scrub thread, and
+					  * guards removal of inconsistent
+					  * items */
+
+	/* Scrub file in memory. */
+	struct scrub_file	os_file;
+
+	/* Buffer for scrub file load/store. */
+	struct scrub_file	os_file_disk;
+
+	/* Inode for the scrub file. */
+	struct inode	       *os_inode;
+
+	/* The time for last checkpoint, jiffies */
+	cfs_time_t		os_time_last_checkpoint;
+
+	/* The time for next checkpoint, jiffies */
+	cfs_time_t		os_time_next_checkpoint;
+
+	/* How many objects have been checked since last checkpoint. */
+	__u32			os_new_checked;
+	__u32			os_pos_current; /* inode number the scan is
+						 * currently at */
+	__u32			os_start_flags; /* see 'enum scrub_start' */
+	unsigned int		os_in_prior:1, /* process inconsistent item
+						* found by RPC prior */
+				os_waiting:1, /* Waiting for scan window. */
+				os_full_speed:1, /* run w/o speed limit */
+				os_no_scrub:1; /* do NOT auto-trigger the
+						* OI scrub */
+};
+
+#endif /* _OSD_SCRUB_H */
-- 
1.8.3.1