From c961228f1c30254c454ed1432ba83af3aa7c39b4 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Tue, 15 Jan 2013 15:07:07 +0800 Subject: [PATCH] LU-1866 lfsck: general framework for LFSCK 1.5 LFSCK 1.5 main data structure definitions, LFSCK component APIs, setup/cleanup/start/stop interfaces. Signed-off-by: Fan Yong Change-Id: I4507b9432289ca8d54cbf686893b706052d0eeb3 Reviewed-on: http://review.whamcloud.com/4908 Tested-by: Hudson Reviewed-by: Alex Zhuravlev Reviewed-by: Andreas Dilger Tested-by: Maloo --- lustre/include/lustre/lustre_idl.h | 24 +- lustre/include/lustre/lustre_lfsck_user.h | 1 + lustre/include/lustre_lib.h | 1 + lustre/mdd/autoMakefile.am | 4 +- lustre/mdd/mdd_device.c | 6 +- lustre/mdd/mdd_internal.h | 30 +-- lustre/mdd/mdd_lfsck.c | 434 ++++++++++++++++++++++++++---- lustre/mdd/mdd_lfsck.h | 315 ++++++++++++++++++++++ lustre/mdd/mdd_lproc.c | 19 +- lustre/mdt/mdt_handler.c | 8 + 10 files changed, 755 insertions(+), 87 deletions(-) create mode 100644 lustre/mdd/mdd_lfsck.h diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index 1929d24..8226334 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -833,10 +833,26 @@ static inline int lu_fid_cmp(const struct lu_fid *f0, * enumeration. */ enum lu_dirent_attrs { - LUDA_FID = 0x0001, - LUDA_TYPE = 0x0002, - LUDA_64BITHASH = 0x0004, -}; + LUDA_FID = 0x0001, + LUDA_TYPE = 0x0002, + LUDA_64BITHASH = 0x0004, + + /* The following attrs are used for MDT internal only, + * not visible to client */ + + /* Verify the dirent consistency */ + LUDA_VERIFY = 0x8000, + /* Only check but not repair the dirent inconsistency */ + LUDA_VERIFY_DRYRUN = 0x4000, + /* The dirent has been repaired, or to be repaired (dryrun). */ + LUDA_REPAIR = 0x2000, + /* The system is upgraded, has been or to be repaired (dryrun). */ + LUDA_UPGRADE = 0x1000, + /* Ignore this record, go to next directly. 
*/ + LUDA_IGNORE = 0x0800, +}; + +#define LU_DIRENT_ATTRS_MASK 0xf800 /** * Layout of readdir pages, as transmitted on wire. diff --git a/lustre/include/lustre/lustre_lfsck_user.h b/lustre/include/lustre/lustre_lfsck_user.h index f56e3f7..1e2b011 100644 --- a/lustre/include/lustre/lustre_lfsck_user.h +++ b/lustre/include/lustre/lustre_lfsck_user.h @@ -56,6 +56,7 @@ enum lfsck_type { #define LFSCK_TYPES_ALL ((__u16)(~0)) #define LFSCK_TYPES_DEF ((__u16)0) +#define LFSCK_TYPES_SUPPORTED 0 #define LFSCK_SPEED_NO_LIMIT 0 #define LFSCK_SPEED_LIMIT_DEF LFSCK_SPEED_NO_LIMIT diff --git a/lustre/include/lustre_lib.h b/lustre/include/lustre_lib.h index 99cd735..e753df1 100644 --- a/lustre/include/lustre_lib.h +++ b/lustre/include/lustre_lib.h @@ -561,6 +561,7 @@ static inline void obd_ioctl_freedata(char *buf, int len) #define OBD_IOC_START_LFSCK _IOWR('f', 230, OBD_IOC_DATA_TYPE) #define OBD_IOC_STOP_LFSCK _IOW('f', 231, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PAUSE_LFSCK _IOW('f', 232, OBD_IOC_DATA_TYPE) /* XXX _IOWR('f', 250, long) has been defined in * libcfs/include/libcfs/libcfs_private.h for debug, don't use it diff --git a/lustre/mdd/autoMakefile.am b/lustre/mdd/autoMakefile.am index 6a1747f..e9cb34f 100644 --- a/lustre/mdd/autoMakefile.am +++ b/lustre/mdd/autoMakefile.am @@ -38,5 +38,5 @@ if MODULES modulefs_DATA = mdd$(KMODEXT) endif -MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -EXTRA_DIST := $(mdd-objs:%.o=%.c) mdd_internal.h +MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ +EXTRA_DIST := $(mdd-objs:%.o=%.c) mdd_internal.h mdd_lfsck.h diff --git a/lustre/mdd/mdd_device.c b/lustre/mdd/mdd_device.c index 8a2aef8..f87fa42 100644 --- a/lustre/mdd/mdd_device.c +++ b/lustre/mdd/mdd_device.c @@ -1634,7 +1634,11 @@ static int mdd_iocontrol(const struct lu_env *env, struct md_device *m, RETURN(rc); } case OBD_IOC_STOP_LFSCK: { - rc = mdd_lfsck_stop(env, &mdd->mdd_lfsck); + rc = mdd_lfsck_stop(env, &mdd->mdd_lfsck, false); + RETURN(rc); + } + case OBD_IOC_PAUSE_LFSCK: { + rc = 
mdd_lfsck_stop(env, &mdd->mdd_lfsck, true); RETURN(rc); } } diff --git a/lustre/mdd/mdd_internal.h b/lustre/mdd/mdd_internal.h index d4ab715..248a42a 100644 --- a/lustre/mdd/mdd_internal.h +++ b/lustre/mdd/mdd_internal.h @@ -52,6 +52,8 @@ #include #include +#include "mdd_lfsck.h" + /* PDO lock is unnecessary for current MDT stack because operations * are already protected by ldlm lock */ #define MDD_DISABLE_PDO_LOCK 1 @@ -93,28 +95,6 @@ struct mdd_dot_lustre_objs { extern const char lfsck_bookmark_name[]; -struct md_lfsck { - struct mutex ml_mutex; - spinlock_t ml_lock; - struct ptlrpc_thread ml_thread; - struct dt_object *ml_bookmark_obj; - struct dt_object *ml_it_obj; - __u32 ml_new_scanned; - /* Arguments for low layer iteration. */ - __u32 ml_args; - - /* Raw value for LFSCK speed limit. */ - __u32 ml_speed_limit; - - /* Schedule for every N objects. */ - __u32 ml_sleep_rate; - - /* Sleep N jiffies for each schedule. */ - __u32 ml_sleep_jif; - __u16 ml_version; - unsigned int ml_paused:1; /* The lfsck is paused. 
*/ -}; - struct mdd_device { struct md_device mdd_md_dev; struct obd_export *mdd_child_exp; @@ -466,10 +446,12 @@ int mdd_txn_start_cb(const struct lu_env *env, struct thandle *, void *cookie); /* mdd_lfsck.c */ -void mdd_lfsck_set_speed(struct md_lfsck *lfsck, __u32 limit); +int mdd_lfsck_set_speed(const struct lu_env *env, struct md_lfsck *lfsck, + __u32 limit); int mdd_lfsck_start(const struct lu_env *env, struct md_lfsck *lfsck, struct lfsck_start *start); -int mdd_lfsck_stop(const struct lu_env *env, struct md_lfsck *lfsck); +int mdd_lfsck_stop(const struct lu_env *env, struct md_lfsck *lfsck, + bool pause); int mdd_lfsck_setup(const struct lu_env *env, struct mdd_device *mdd); void mdd_lfsck_cleanup(const struct lu_env *env, struct mdd_device *mdd); diff --git a/lustre/mdd/mdd_lfsck.c b/lustre/mdd/mdd_lfsck.c index 6c69793..bd3f331 100644 --- a/lustre/mdd/mdd_lfsck.c +++ b/lustre/mdd/mdd_lfsck.c @@ -40,21 +40,70 @@ #include #include +#include #include "mdd_internal.h" +#include "mdd_lfsck.h" + +#define HALF_SEC (CFS_HZ >> 1) +#define LFSCK_CHECKPOINT_INTERVAL 60 + +const char lfsck_bookmark_name[] = "lfsck_bookmark"; + +/* misc functions */ + +static inline struct mdd_device *mdd_lfsck2mdd(struct md_lfsck *lfsck) +{ + return container_of0(lfsck, struct mdd_device, mdd_lfsck); +} static inline char *mdd_lfsck2name(struct md_lfsck *lfsck) { - struct mdd_device *mdd; + struct mdd_device *mdd = mdd_lfsck2mdd(lfsck); - mdd = container_of0(lfsck, struct mdd_device, mdd_lfsck); return mdd2obd_dev(mdd)->obd_name; } -void mdd_lfsck_set_speed(struct md_lfsck *lfsck, __u32 limit) +static inline void mdd_lfsck_component_put(const struct lu_env *env, + struct lfsck_component *com) { - spin_lock(&lfsck->ml_lock); - lfsck->ml_speed_limit = limit; + if (atomic_dec_and_test(&com->lc_ref)) { + if (com->lc_obj != NULL) + lu_object_put(env, &com->lc_obj->do_lu); + if (com->lc_file_ram != NULL) + OBD_FREE(com->lc_file_ram, com->lc_file_size); + if (com->lc_file_disk != NULL) + 
OBD_FREE(com->lc_file_disk, com->lc_file_size); + OBD_FREE_PTR(com); + } +} + +static inline struct lfsck_component * +__mdd_lfsck_component_find(struct md_lfsck *lfsck, __u16 type, cfs_list_t *list) +{ + struct lfsck_component *com; + + cfs_list_for_each_entry(com, list, lc_link) { + if (com->lc_type == type) + return com; + } + return NULL; +} + +static void mdd_lfsck_component_cleanup(const struct lu_env *env, + struct lfsck_component *com) +{ + if (!cfs_list_empty(&com->lc_link)) + cfs_list_del_init(&com->lc_link); + if (!cfs_list_empty(&com->lc_link_dir)) + cfs_list_del_init(&com->lc_link_dir); + + mdd_lfsck_component_put(env, com); +} + +static void __mdd_lfsck_set_speed(struct md_lfsck *lfsck, __u32 limit) +{ + lfsck->ml_bookmark_ram.lb_speed_limit = limit; if (limit != LFSCK_SPEED_NO_LIMIT) { if (limit > CFS_HZ) { lfsck->ml_sleep_rate = limit / CFS_HZ; @@ -67,7 +116,6 @@ void mdd_lfsck_set_speed(struct md_lfsck *lfsck, __u32 limit) lfsck->ml_sleep_jif = 0; lfsck->ml_sleep_rate = 0; } - spin_unlock(&lfsck->ml_lock); } static void mdd_lfsck_control_speed(struct md_lfsck *lfsck) @@ -94,12 +142,131 @@ static void mdd_lfsck_control_speed(struct md_lfsck *lfsck) } } +/* lfsck_bookmark file ops */ + +static void inline mdd_lfsck_bookmark_to_cpu(struct lfsck_bookmark *des, + struct lfsck_bookmark *src) +{ + des->lb_magic = le32_to_cpu(src->lb_magic); + des->lb_version = le16_to_cpu(src->lb_version); + des->lb_param = le16_to_cpu(src->lb_param); + des->lb_speed_limit = le32_to_cpu(src->lb_speed_limit); +} + +static void inline mdd_lfsck_bookmark_to_le(struct lfsck_bookmark *des, + struct lfsck_bookmark *src) +{ + des->lb_magic = cpu_to_le32(src->lb_magic); + des->lb_version = cpu_to_le16(src->lb_version); + des->lb_param = cpu_to_le16(src->lb_param); + des->lb_speed_limit = cpu_to_le32(src->lb_speed_limit); +} + +static int mdd_lfsck_bookmark_load(const struct lu_env *env, + struct md_lfsck *lfsck) +{ + loff_t pos = 0; + int len = sizeof(struct lfsck_bookmark); + 
int rc; + + rc = dt_record_read(env, lfsck->ml_bookmark_obj, + mdd_buf_get(env, &lfsck->ml_bookmark_disk, len), + &pos); + if (rc == 0) { + struct lfsck_bookmark *bm = &lfsck->ml_bookmark_ram; + + mdd_lfsck_bookmark_to_cpu(bm, &lfsck->ml_bookmark_disk); + if (bm->lb_magic != LFSCK_BOOKMARK_MAGIC) { + CWARN("%.16s: invalid lfsck_bookmark magic " + "0x%x != 0x%x\n", mdd_lfsck2name(lfsck), + bm->lb_magic, LFSCK_BOOKMARK_MAGIC); + /* Process it as new lfsck_bookmark. */ + rc = -ENODATA; + } + } else { + if (rc == -EFAULT && pos == 0) + /* return -ENODATA for empty lfsck_bookmark. */ + rc = -ENODATA; + else + CERROR("%.16s: fail to load lfsck_bookmark, " + "expected = %d, rc = %d\n", + mdd_lfsck2name(lfsck), len, rc); + } + return rc; +} + +static int mdd_lfsck_bookmark_store(const struct lu_env *env, + struct md_lfsck *lfsck) +{ + struct mdd_device *mdd = mdd_lfsck2mdd(lfsck); + struct thandle *handle; + struct dt_object *obj = lfsck->ml_bookmark_obj; + loff_t pos = 0; + int len = sizeof(struct lfsck_bookmark); + int rc; + ENTRY; + + mdd_lfsck_bookmark_to_le(&lfsck->ml_bookmark_disk, + &lfsck->ml_bookmark_ram); + handle = dt_trans_create(env, mdd->mdd_bottom); + if (IS_ERR(handle)) { + rc = PTR_ERR(handle); + CERROR("%.16s: fail to create trans for storing " + "lfsck_bookmark: %d\n", mdd_lfsck2name(lfsck), rc); + RETURN(rc); + } + + rc = dt_declare_record_write(env, obj, len, 0, handle); + if (rc != 0) { + CERROR("%.16s: fail to declare trans for storing " + "lfsck_bookmark: %d\n", mdd_lfsck2name(lfsck), rc); + GOTO(out, rc); + } + + rc = dt_trans_start_local(env, mdd->mdd_bottom, handle); + if (rc != 0) { + CERROR("%.16s: fail to start trans for storing " + "lfsck_bookmark: %d\n", mdd_lfsck2name(lfsck), rc); + GOTO(out, rc); + } + + rc = dt_record_write(env, obj, + mdd_buf_get(env, &lfsck->ml_bookmark_disk, len), + &pos, handle); + if (rc != 0) + CERROR("%.16s: fail to store lfsck_bookmark, expected = %d, " + "rc = %d\n", mdd_lfsck2name(lfsck), len, rc); + + 
GOTO(out, rc); + +out: + dt_trans_stop(env, mdd->mdd_bottom, handle); + return rc; +} + +static int mdd_lfsck_bookmark_init(const struct lu_env *env, + struct md_lfsck *lfsck) +{ + struct lfsck_bookmark *mb = &lfsck->ml_bookmark_ram; + int rc; + + memset(mb, 0, sizeof(*mb)); + mb->lb_magic = LFSCK_BOOKMARK_MAGIC; + mb->lb_version = LFSCK_VERSION_V1; + mutex_lock(&lfsck->ml_mutex); + rc = mdd_lfsck_bookmark_store(env, lfsck); + mutex_unlock(&lfsck->ml_mutex); + return rc; +} + +/* LFSCK engines */ + static int mdd_lfsck_main(void *args) { struct lu_env env; struct md_lfsck *lfsck = (struct md_lfsck *)args; struct ptlrpc_thread *thread = &lfsck->ml_thread; - struct dt_object *obj = lfsck->ml_it_obj; + struct dt_object *obj = lfsck->ml_obj_oit; const struct dt_it_ops *iops = &obj->do_index_ops->dio_it; struct dt_it *di; struct lu_fid *fid; @@ -114,7 +281,7 @@ static int mdd_lfsck_main(void *args) GOTO(noenv, rc); } - di = iops->init(&env, obj, lfsck->ml_args, BYPASS_CAPA); + di = iops->init(&env, obj, lfsck->ml_args_oit, BYPASS_CAPA); if (IS_ERR(di)) { rc = PTR_ERR(di); CERROR("%s: LFSCK, fail to init iteration, rc = %d\n", @@ -123,7 +290,7 @@ static int mdd_lfsck_main(void *args) } CDEBUG(D_LFSCK, "LFSCK: flags = 0x%x, pid = %d\n", - lfsck->ml_args, cfs_curproc_pid()); + lfsck->ml_args_oit, cfs_curproc_pid()); spin_lock(&lfsck->ml_lock); thread_set_flags(thread, SVC_RUNNING); @@ -197,49 +364,165 @@ noenv: return rc; } +/* external interfaces */ + +int mdd_lfsck_set_speed(const struct lu_env *env, struct md_lfsck *lfsck, + __u32 limit) +{ + int rc; + + mutex_lock(&lfsck->ml_mutex); + __mdd_lfsck_set_speed(lfsck, limit); + rc = mdd_lfsck_bookmark_store(env, lfsck); + mutex_unlock(&lfsck->ml_mutex); + return rc; +} + int mdd_lfsck_start(const struct lu_env *env, struct md_lfsck *lfsck, struct lfsck_start *start) { - struct ptlrpc_thread *thread = &lfsck->ml_thread; - struct l_wait_info lwi = { 0 }; - int rc = 0; - __u16 valid = 0; - __u16 flags = 0; + struct 
lfsck_bookmark *bk = &lfsck->ml_bookmark_ram; + struct ptlrpc_thread *thread = &lfsck->ml_thread; + struct lfsck_component *com; + struct l_wait_info lwi = { 0 }; + bool dirty = false; + int rc = 0; + __u16 valid = 0; + __u16 flags = 0; ENTRY; - if (lfsck->ml_it_obj == NULL) + if (lfsck->ml_obj_oit == NULL) RETURN(-ENOTSUPP); + /* start == NULL means auto trigger paused LFSCK. */ + if (start == NULL && cfs_list_empty(&lfsck->ml_list_scan)) + RETURN(0); + mutex_lock(&lfsck->ml_mutex); spin_lock(&lfsck->ml_lock); - if (thread_is_running(thread)) { + if (!thread_is_init(thread) && !thread_is_stopped(thread)) { spin_unlock(&lfsck->ml_lock); mutex_unlock(&lfsck->ml_mutex); RETURN(-EALREADY); } spin_unlock(&lfsck->ml_lock); - if (start->ls_valid & LSV_SPEED_LIMIT) - mdd_lfsck_set_speed(lfsck, start->ls_speed_limit); + + lfsck->ml_paused = 0; + lfsck->ml_oit_over = 0; + lfsck->ml_drop_dryrun = 0; + lfsck->ml_new_scanned = 0; + + /* For auto trigger. */ + if (start == NULL) + goto trigger; + + start->ls_version = bk->lb_version; + if (start->ls_valid & LSV_SPEED_LIMIT) { + __mdd_lfsck_set_speed(lfsck, start->ls_speed_limit); + dirty = true; + } if (start->ls_valid & LSV_ERROR_HANDLE) { valid |= DOIV_ERROR_HANDLE; if (start->ls_flags & LPF_FAILOUT) flags |= DOIF_FAILOUT; + + if ((start->ls_flags & LPF_FAILOUT) && + !(bk->lb_param & LPF_FAILOUT)) { + bk->lb_param |= LPF_FAILOUT; + dirty = true; + } else if (!(start->ls_flags & LPF_FAILOUT) && + (bk->lb_param & LPF_FAILOUT)) { + bk->lb_param &= ~LPF_FAILOUT; + dirty = true; + } } - /* XXX: 1. low layer does not care 'dryrun'. - * 2. will process 'ls_active' when introduces LFSCK for layout - * consistency, DNE consistency, and so on in the future. 
*/ - start->ls_active = 0; + if (start->ls_valid & LSV_DRYRUN) { + if ((start->ls_flags & LPF_DRYRUN) && + !(bk->lb_param & LPF_DRYRUN)) { + bk->lb_param |= LPF_DRYRUN; + dirty = true; + } else if (!(start->ls_flags & LPF_DRYRUN) && + (bk->lb_param & LPF_DRYRUN)) { + bk->lb_param &= ~LPF_DRYRUN; + lfsck->ml_drop_dryrun = 1; + dirty = true; + } + } + + if (dirty) { + rc = mdd_lfsck_bookmark_store(env, lfsck); + if (rc != 0) + GOTO(out, rc); + } if (start->ls_flags & LPF_RESET) flags |= DOIF_RESET; - if (start->ls_active != 0) + if (start->ls_active != 0) { + struct lfsck_component *next; + __u16 type = 1; + + if (start->ls_active == LFSCK_TYPES_ALL) + start->ls_active = LFSCK_TYPES_SUPPORTED; + + if (start->ls_active & ~LFSCK_TYPES_SUPPORTED) { + start->ls_active &= ~LFSCK_TYPES_SUPPORTED; + GOTO(out, rc = -ENOTSUPP); + } + + cfs_list_for_each_entry_safe(com, next, + &lfsck->ml_list_scan, lc_link) { + if (!(com->lc_type & start->ls_active)) { + rc = com->lc_ops->lfsck_post(env, com, 0); + if (rc != 0) + GOTO(out, rc); + } + } + + while (start->ls_active != 0) { + if (type & start->ls_active) { + com = __mdd_lfsck_component_find(lfsck, type, + &lfsck->ml_list_idle); + if (com != NULL) { + /* The component status will be updated + * when its prep() is called later by + * the LFSCK main engine. 
*/ + cfs_list_del_init(&com->lc_link); + cfs_list_add_tail(&com->lc_link, + &lfsck->ml_list_scan); + } + start->ls_active &= ~type; + } + type <<= 1; + } + } + + cfs_list_for_each_entry(com, &lfsck->ml_list_scan, lc_link) { + start->ls_active |= com->lc_type; + if (flags & DOIF_RESET) { + rc = com->lc_ops->lfsck_reset(env, com, false); + if (rc != 0) + GOTO(out, rc); + } + } + +trigger: + lfsck->ml_args_dir = LUDA_64BITHASH | LUDA_VERIFY; + if (bk->lb_param & LPF_DRYRUN) + lfsck->ml_args_dir |= LUDA_VERIFY_DRYRUN; + + if (bk->lb_param & LPF_FAILOUT) { + valid |= DOIV_ERROR_HANDLE; + flags |= DOIF_FAILOUT; + } + + if (!cfs_list_empty(&lfsck->ml_list_scan)) flags |= DOIF_OUTUSED; - lfsck->ml_args = (flags << DT_OTABLE_IT_FLAGS_SHIFT) | valid; + lfsck->ml_args_oit = (flags << DT_OTABLE_IT_FLAGS_SHIFT) | valid; thread_set_flags(thread, 0); rc = cfs_create_thread(mdd_lfsck_main, lfsck, 0); if (rc < 0) @@ -250,12 +533,16 @@ int mdd_lfsck_start(const struct lu_env *env, struct md_lfsck *lfsck, thread_is_running(thread) || thread_is_stopped(thread), &lwi); - mutex_unlock(&lfsck->ml_mutex); - RETURN(rc < 0 ? rc : 0); + GOTO(out, rc); + +out: + mutex_unlock(&lfsck->ml_mutex); + return (rc < 0 ? rc : 0); } -int mdd_lfsck_stop(const struct lu_env *env, struct md_lfsck *lfsck) +int mdd_lfsck_stop(const struct lu_env *env, struct md_lfsck *lfsck, + bool pause) { struct ptlrpc_thread *thread = &lfsck->ml_thread; struct l_wait_info lwi = { 0 }; @@ -269,7 +556,14 @@ int mdd_lfsck_stop(const struct lu_env *env, struct md_lfsck *lfsck) RETURN(-EALREADY); } + if (pause) + lfsck->ml_paused = 1; thread_set_flags(thread, SVC_STOPPING); + /* The LFSCK thread may be sleeping on low layer wait queue, + * wake it up. 
*/ + if (likely(lfsck->ml_di_oit != NULL)) + lfsck->ml_obj_oit->do_index_ops->dio_it.put(env, + lfsck->ml_di_oit); spin_unlock(&lfsck->ml_lock); cfs_waitq_broadcast(&thread->t_ctl_waitq); @@ -281,8 +575,6 @@ int mdd_lfsck_stop(const struct lu_env *env, struct md_lfsck *lfsck) RETURN(0); } -const char lfsck_bookmark_name[] = "lfsck_bookmark"; - static const struct lu_fid lfsck_it_fid = { .f_seq = FID_SEQ_LOCAL_FILE, .f_oid = OTABLE_IT_OID, .f_ver = 0 }; @@ -292,50 +584,90 @@ int mdd_lfsck_setup(const struct lu_env *env, struct mdd_device *mdd) struct md_lfsck *lfsck = &mdd->mdd_lfsck; struct dt_object *obj; int rc; + ENTRY; - memset(lfsck, 0, sizeof(*lfsck)); - lfsck->ml_version = LFSCK_VERSION_V1; - cfs_waitq_init(&lfsck->ml_thread.t_ctl_waitq); + LASSERT(!lfsck->ml_initialized); + + lfsck->ml_initialized = 1; mutex_init(&lfsck->ml_mutex); spin_lock_init(&lfsck->ml_lock); + CFS_INIT_LIST_HEAD(&lfsck->ml_list_scan); + CFS_INIT_LIST_HEAD(&lfsck->ml_list_dir); + CFS_INIT_LIST_HEAD(&lfsck->ml_list_double_scan); + CFS_INIT_LIST_HEAD(&lfsck->ml_list_idle); + cfs_waitq_init(&lfsck->ml_thread.t_ctl_waitq); - obj = dt_store_open(env, mdd->mdd_child, "", lfsck_bookmark_name, - &mdd_env_info(env)->mti_fid); - if (IS_ERR(obj)) - return PTR_ERR(obj); - - lfsck->ml_bookmark_obj = obj; - - obj = dt_locate(env, mdd->mdd_child, &lfsck_it_fid); + obj = dt_locate(env, mdd->mdd_bottom, &lfsck_it_fid); if (IS_ERR(obj)) - return PTR_ERR(obj); + RETURN(PTR_ERR(obj)); + lfsck->ml_obj_oit = obj; rc = obj->do_ops->do_index_try(env, obj, &dt_otable_features); if (rc != 0) { - lu_object_put(env, &obj->do_lu); if (rc == -ENOTSUPP) rc = 0; - return rc; + + RETURN(rc); } - lfsck->ml_it_obj = obj; + obj = dt_store_open(env, mdd->mdd_bottom, "", lfsck_bookmark_name, + &mdd_env_info(env)->mti_fid); + if (IS_ERR(obj)) + RETURN(PTR_ERR(obj)); + + lfsck->ml_bookmark_obj = obj; + rc = mdd_lfsck_bookmark_load(env, lfsck); + if (rc == -ENODATA) + rc = mdd_lfsck_bookmark_init(env, lfsck); + + /* XXX: 
LFSCK components initialization to be added here. */ - return 0; + RETURN(rc); } void mdd_lfsck_cleanup(const struct lu_env *env, struct mdd_device *mdd) { - struct md_lfsck *lfsck = &mdd->mdd_lfsck; + struct md_lfsck *lfsck = &mdd->mdd_lfsck; + struct ptlrpc_thread *thread = &lfsck->ml_thread; + struct lfsck_component *com; - if (lfsck->ml_it_obj != NULL) { - lfsck->ml_paused = 1; - mdd_lfsck_stop(env, lfsck); - lu_object_put(env, &lfsck->ml_it_obj->do_lu); - lfsck->ml_it_obj = NULL; + if (!lfsck->ml_initialized) + return; + + LASSERT(thread_is_init(thread) || thread_is_stopped(thread)); + + if (lfsck->ml_obj_oit != NULL) { + lu_object_put(env, &lfsck->ml_obj_oit->do_lu); + lfsck->ml_obj_oit = NULL; } + LASSERT(lfsck->ml_obj_dir == NULL); + if (lfsck->ml_bookmark_obj != NULL) { lu_object_put(env, &lfsck->ml_bookmark_obj->do_lu); lfsck->ml_bookmark_obj = NULL; } + + while (!cfs_list_empty(&lfsck->ml_list_scan)) { + com = cfs_list_entry(lfsck->ml_list_scan.next, + struct lfsck_component, + lc_link); + mdd_lfsck_component_cleanup(env, com); + } + + LASSERT(cfs_list_empty(&lfsck->ml_list_dir)); + + while (!cfs_list_empty(&lfsck->ml_list_double_scan)) { + com = cfs_list_entry(lfsck->ml_list_double_scan.next, + struct lfsck_component, + lc_link); + mdd_lfsck_component_cleanup(env, com); + } + + while (!cfs_list_empty(&lfsck->ml_list_idle)) { + com = cfs_list_entry(lfsck->ml_list_idle.next, + struct lfsck_component, + lc_link); + mdd_lfsck_component_cleanup(env, com); + } } diff --git a/lustre/mdd/mdd_lfsck.h b/lustre/mdd/mdd_lfsck.h new file mode 100644 index 0000000..2410c4f --- /dev/null +++ b/lustre/mdd/mdd_lfsck.h @@ -0,0 +1,315 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012 Intel Corporation. + */ +/* + * lustre/mdd/mdd_lfsck.h + * + * Shared definitions and declarations for the LFSCK. + * + * Author: Fan, Yong + */ + +#ifndef _MDD_LFSCK_H +# define _MDD_LFSCK_H + +#include + +enum lfsck_status { + /* The lfsck file is new created, for new MDT, upgrading from old disk, + * or re-creating the lfsck file manually. */ + LS_INIT = 0, + + /* The first-step system scanning. */ + LS_SCANNING_PHASE1 = 1, + + /* The second-step system scanning. */ + LS_SCANNING_PHASE2 = 2, + + /* The LFSCK processing has completed for all objects. */ + LS_COMPLETED = 3, + + /* The LFSCK exited automatically for failure, will not auto restart. */ + LS_FAILED = 4, + + /* The LFSCK is stopped manually, will not auto restart. */ + LS_STOPPED = 5, + + /* LFSCK is paused automatically when umount, + * will be restarted automatically when remount. */ + LS_PAUSED = 6, + + /* System crashed during the LFSCK, + * will be restarted automatically after recovery. */ + LS_CRASHED = 7, +}; + +enum lfsck_flags { + /* Finish to the cycle scanning. */ + LF_SCANNED_ONCE = 0x00000001ULL, + + /* There is some namespace inconsistency. */ + LF_INCONSISTENT = 0x00000002ULL, + + /* The device is upgraded from 1.8 format. */ + LF_UPGRADE = 0x00000004ULL, +}; + +struct lfsck_position { + /* local layer object table-based iteration position. 
*/ + __u64 lp_oit_cookie; + + /* parent FID for directory traversal. */ + struct lu_fid lp_dir_parent; + + /* namespace-based directory traversal position. */ + __u64 lp_dir_cookie; +}; + +#define LFSCK_BOOKMARK_MAGIC 0x20130C1D + +struct lfsck_bookmark { + /* Magic number to detect that this struct contains valid data. */ + __u32 lb_magic; + + /* For compatible with old versions. */ + __u16 lb_version; + + /* See 'enum lfsck_param_flags' */ + __u16 lb_param; + + /* How many items can be scanned at most per second. */ + __u32 lb_speed_limit; + + /* For 64-bits aligned. */ + __u32 lb_padding; + + /* For future using. */ + __u64 lb_reserved[6]; +}; + +#define LFSCK_NAMESPACE_MAGIC 0xA0629D03 + +struct lfsck_namespace { + /* Magic number to detect that this struct contains valid data. */ + __u32 ln_magic; + + /* See 'enum lfsck_status'. */ + __u32 ln_status; + + /* See 'enum lfsck_flags'. */ + __u32 ln_flags; + + /* How many completed LFSCK runs on the device. */ + __u32 ln_success_count; + + /* How long the LFSCK phase1 has run in seconds. */ + __u32 ln_run_time_phase1; + + /* How long the LFSCK phase2 has run in seconds. */ + __u32 ln_run_time_phase2; + + /* Time for the last LFSCK completed in seconds since epoch. */ + __u64 ln_time_last_complete; + + /* Time for the latest LFSCK ran in seconds since epoch. */ + __u64 ln_time_latest_start; + + /* Time for the last LFSCK checkpoint in seconds since epoch. */ + __u64 ln_time_last_checkpoint; + + /* Position for the latest LFSCK started from. */ + struct lfsck_position ln_pos_latest_start; + + /* Position for the last LFSCK checkpoint. */ + struct lfsck_position ln_pos_last_checkpoint; + + /* Position for the first should be updated object. */ + struct lfsck_position ln_pos_first_inconsistent; + + /* How many items (including dir) have been checked. */ + __u64 ln_items_checked; + + /* How many items have been repaired. */ + __u64 ln_items_repaired; + + /* How many items failed to be processed. 
*/ + __u64 ln_items_failed; + + /* How many directories have been traversed. */ + __u64 ln_dirs_checked; + + /* How many multiple-linked objects have been checked. */ + __u64 ln_mlinked_checked; + + /* How many objects have been double scanned. */ + __u64 ln_objs_checked_phase2; + + /* How many objects have been repaired during double scan. */ + __u64 ln_objs_repaired_phase2; + + /* How many objects failed to be processed during double scan. */ + __u64 ln_objs_failed_phase2; + + /* How many objects with nlink fixed. */ + __u64 ln_objs_nlink_repaired; + + /* How many objects were lost before, but found back now. */ + __u64 ln_objs_lost_found; + + /* The latest object has been processed (failed) during double scan. */ + struct lu_fid ln_fid_latest_scanned_phase2; + + /* For further using. 256-bytes aligned now. */ + __u64 ln_reserved[2]; +}; + +struct lfsck_component; +struct mdd_object; + +struct lfsck_operations { + int (*lfsck_reset)(const struct lu_env *env, + struct lfsck_component *com, + bool init); + + void (*lfsck_fail)(const struct lu_env *env, + struct lfsck_component *com, + bool oit, bool new_checked); + + int (*lfsck_checkpoint)(const struct lu_env *env, + struct lfsck_component *com, + bool init); + + int (*lfsck_prep)(const struct lu_env *env, + struct lfsck_component *com); + + int (*lfsck_exec_oit)(const struct lu_env *env, + struct lfsck_component *com, + struct mdd_object *obj); + + int (*lfsck_exec_dir)(const struct lu_env *env, + struct lfsck_component *com, + struct mdd_object *obj, + struct lu_dirent *ent); + + int (*lfsck_post)(const struct lu_env *env, + struct lfsck_component *com, + int result); + + int (*lfsck_dump)(const struct lu_env *env, + struct lfsck_component *com, + char *buf, + int len); + + int (*lfsck_double_scan)(const struct lu_env *env, + struct lfsck_component *com); +}; + +struct lfsck_component { + /* into md_lfsck::ml_list_{scan,double_scan,idle} */ + cfs_list_t lc_link; + + /* into md_lfsck::ml_list_dir */ + cfs_list_t 
lc_link_dir; + struct rw_semaphore lc_sem; + cfs_atomic_t lc_ref; + + struct lfsck_position lc_pos_start; + struct md_lfsck *lc_lfsck; + struct dt_object *lc_obj; + struct lfsck_operations *lc_ops; + void *lc_file_ram; + void *lc_file_disk; + __u32 lc_file_size; + + /* How many objects have been checked since last checkpoint. */ + __u32 lc_new_checked; + unsigned int lc_journal:1; + __u16 lc_type; +}; + +struct md_lfsck { + struct mutex ml_mutex; + spinlock_t ml_lock; + + /* For the components in (first) scanning via otable-based iteration. */ + cfs_list_t ml_list_scan; + + /* For the components in scanning via directory traversal. Because + * directory traversal cannot guarantee all the object be scanned, + * so the component in the ml_list_dir must be in ml_list_scan. */ + cfs_list_t ml_list_dir; + + /* For the components in double scanning. */ + cfs_list_t ml_list_double_scan; + + /* For the components that are not scanning now. */ + cfs_list_t ml_list_idle; + + struct ptlrpc_thread ml_thread; + + /* The time for last checkpoint, jiffies */ + cfs_time_t ml_time_last_checkpoint; + + /* The time for next checkpoint, jiffies */ + cfs_time_t ml_time_next_checkpoint; + + struct dt_object *ml_bookmark_obj; + struct lfsck_bookmark ml_bookmark_ram; + struct lfsck_bookmark ml_bookmark_disk; + struct lfsck_position ml_pos_current; + + /* Obj for otable-based iteration */ + struct dt_object *ml_obj_oit; + + /* Obj for directory traversal */ + struct dt_object *ml_obj_dir; + + /* It for otable-based iteration */ + struct dt_it *ml_di_oit; + + /* It for directory traversal */ + struct dt_it *ml_di_dir; + + /* Arguments for low layer otable-based iteration. */ + __u32 ml_args_oit; + + /* Arguments for namespace-based directory traversal. */ + __u32 ml_args_dir; + + /* Schedule for every N objects. */ + __u32 ml_sleep_rate; + + /* Sleep N jiffies for each schedule. */ + __u32 ml_sleep_jif; + + /* How many objects have been scanned since last sleep. 
*/ + __u32 ml_new_scanned; + + unsigned int ml_paused:1, /* The lfsck is paused. */ + ml_oit_over:1, /* oit is finished. */ + ml_drop_dryrun:1, /* Ever dryrun, not now. */ + ml_initialized:1; /* lfsck_setup is called. */ +}; + +#endif /* _MDD_LFSCK_H */ diff --git a/lustre/mdd/mdd_lproc.c b/lustre/mdd/mdd_lproc.c index b921bc5..4d8e42d 100644 --- a/lustre/mdd/mdd_lproc.c +++ b/lustre/mdd/mdd_lproc.c @@ -272,7 +272,8 @@ static int lprocfs_rd_lfsck_speed_limit(char *page, char **start, off_t off, LASSERT(mdd != NULL); *eof = 1; - return snprintf(page, count, "%u\n", mdd->mdd_lfsck.ml_speed_limit); + return snprintf(page, count, "%u\n", + mdd->mdd_lfsck.ml_bookmark_ram.lb_speed_limit); } static int lprocfs_wr_lfsck_speed_limit(struct file *file, const char *buffer, @@ -285,13 +286,21 @@ static int lprocfs_wr_lfsck_speed_limit(struct file *file, const char *buffer, LASSERT(mdd != NULL); rc = lprocfs_write_helper(buffer, count, &val); - if (rc) + if (rc != 0) return rc; lfsck = &mdd->mdd_lfsck; - if (val != lfsck->ml_speed_limit) - mdd_lfsck_set_speed(lfsck, val); - return count; + if (val != lfsck->ml_bookmark_ram.lb_speed_limit) { + struct lu_env env; + + rc = lu_env_init(&env, LCT_MD_THREAD | LCT_DT_THREAD); + if (rc != 0) + return rc; + + rc = mdd_lfsck_set_speed(&env, lfsck, val); + lu_env_fini(&env); + } + return rc != 0 ? 
rc : count; } static struct lprocfs_vars lprocfs_mdd_obd_vars[] = { diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 3ed1784..59b6d34 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -4467,6 +4467,8 @@ static void mdt_fini(const struct lu_env *env, struct mdt_device *m) m->mdt_nosquash_strlen = 0; } + next->md_ops->mdo_iocontrol(env, next, OBD_IOC_PAUSE_LFSCK, + 0, NULL); mdt_seq_fini(env, m); mdt_fld_fini(env, m); sptlrpc_rule_set_free(&m->mdt_sptlrpc_rset); @@ -4926,6 +4928,12 @@ static int mdt_prepare(const struct lu_env *env, if (rc) RETURN(rc); + rc = mdt->mdt_child->md_ops->mdo_iocontrol(env, mdt->mdt_child, + OBD_IOC_START_LFSCK, + 0, NULL); + if (rc != 0) + CWARN("Fail to auto trigger paused LFSCK.\n"); + rc = mdt->mdt_child->md_ops->mdo_root_get(env, mdt->mdt_child, &mdt->mdt_md_root_fid); if (rc) -- 1.8.3.1