X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fosd-ldiskfs%2Fosd_scrub.c;h=37364106b08bda685544aea09a7823a8fda4e338;hp=e6eca1e45040025acac037ee868aa7e0b51e24a1;hb=c3b6fe769f7a32ba1c0f42b8e9604abecaa43da4;hpb=5dca0371d2ce9c4f54c31a9c88223a4eed77ecde diff --git a/lustre/osd-ldiskfs/osd_scrub.c b/lustre/osd-ldiskfs/osd_scrub.c index e6eca1e..3736410 100644 --- a/lustre/osd-ldiskfs/osd_scrub.c +++ b/lustre/osd-ldiskfs/osd_scrub.c @@ -20,7 +20,7 @@ * GPL HEADER END */ /* - * Copyright (c) 2012 Whamcloud, Inc. + * Copyright (c) 2012, Intel Corporation. */ /* * lustre/osd-ldiskfs/osd_scrub.c @@ -50,6 +50,19 @@ #define HALF_SEC (CFS_HZ >> 1) +#define OSD_OTABLE_MAX_HASH 0x00000000ffffffffULL + +#define SCRUB_NEXT_BREAK 1 /* exit current loop and process next group */ +#define SCRUB_NEXT_CONTINUE 2 /* skip current object and process next bit */ +#define SCRUB_NEXT_EXIT 3 /* exit all the loops */ +#define SCRUB_NEXT_WAIT 4 /* wait for free cache slot */ +#define SCRUB_NEXT_CRASH 5 /* simulate system crash during OI scrub */ +#define SCRUB_NEXT_FATAL 6 /* simulate failure during OI scrub */ +#define SCRUB_NEXT_NOSCRUB 7 /* new created object, no scrub on it */ +#define SCRUB_NEXT_NOLMA 8 /* the inode has no FID-in-LMA */ + +/* misc functions */ + static inline struct osd_device *osd_scrub2dev(struct osd_scrub *scrub) { return container_of0(scrub, struct osd_device, od_scrub); @@ -60,6 +73,93 @@ static inline struct super_block *osd_scrub2sb(struct osd_scrub *scrub) return osd_sb(osd_scrub2dev(scrub)); } +static inline int osd_scrub_has_window(struct osd_scrub *scrub, + struct osd_otable_cache *ooc) +{ + return scrub->os_pos_current < ooc->ooc_pos_preload + SCRUB_WINDOW_SIZE; +} + +static int osd_scrub_refresh_mapping(struct osd_thread_info *info, + struct osd_device *dev, + const struct lu_fid *fid, + const struct osd_inode_id *id, int ops) +{ + struct lu_fid *oi_fid = &info->oti_fid2; + struct osd_inode_id *oi_id = &info->oti_id2; + struct iam_container *bag; + struct iam_path_descr *ipd; + handle_t *jh; + int rc; + ENTRY; + + fid_cpu_to_be(oi_fid, fid); + osd_id_pack(oi_id, id); + jh = ldiskfs_journal_start_sb(osd_sb(dev), + osd_dto_credits_noquota[ops]); + if (IS_ERR(jh)) { + rc = PTR_ERR(jh); + CERROR("%.16s: fail to start trans for scrub store: rc = %d\n", + LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name, rc); + RETURN(rc); + } + + bag = &osd_fid2oi(dev, fid)->oi_dir.od_container; + ipd = osd_idx_ipd_get(info->oti_env, bag); + if (unlikely(ipd == NULL)) { + ldiskfs_journal_stop(jh); + CERROR("%.16s: fail to get ipd for scrub store\n", + LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name); + RETURN(-ENOMEM); + } + + if (ops == DTO_INDEX_UPDATE) { + rc = iam_update(jh, bag, (const struct iam_key *)oi_fid, + (struct iam_rec *)oi_id, ipd); + } else { + rc = iam_insert(jh, bag, (const struct iam_key *)oi_fid, + (struct iam_rec *)oi_id, ipd); + if (rc == -EEXIST) { + rc = 1; + /* XXX: There are trouble things when adding OI + * mapping for IGIF object, which may cause + * multiple objects to be mapped to the same + * IGIF formatted FID. Consider the following + * situations: + * + * 1) The MDT is upgrading from 1.8 device. + * The OI scrub generates IGIF FID1 for the + * OBJ1 and adds the OI mapping. + * + * 2) For some reason, the OI scrub does not + * process all the IGIF objects completely. + * + * 3) The MDT is backuped and restored against + * this device. + * + * 4) When the MDT mounts up, the OI scrub will + * try to rebuild the OI files. For some IGIF + * object, OBJ2, which was not processed by the + * OI scrub before the backup/restore, and the + * new generated IGIF formatted FID may be just + * the FID1, the same as OBJ1. + * + * Under such case, the OI scrub cannot know how + * to generate new FID for the OBJ2. + * + * Currently, we do nothing for that. One possible + * solution is to generate new normal FID for the + * conflict object. + * + * Anyway, it is rare, only exists in theory. */ + } + } + osd_ipd_put(info->oti_env, bag, ipd); + ldiskfs_journal_stop(jh); + RETURN(rc); +} + +/* OI_scrub file ops */ + static void osd_scrub_file_to_cpu(struct scrub_file *des, struct scrub_file *src) { @@ -229,6 +329,8 @@ int osd_scrub_file_store(struct osd_scrub *scrub) return rc; } +/* OI scrub APIs */ + static int osd_scrub_prep(struct osd_device *dev) { struct osd_scrub *scrub = &dev->od_scrub; @@ -238,7 +340,7 @@ static int osd_scrub_prep(struct osd_device *dev) int rc; ENTRY; - cfs_down_write(&scrub->os_rwsem); + down_write(&scrub->os_rwsem); if (flags & SS_SET_FAILOUT) sf->sf_param |= SP_FAILOUT; @@ -247,7 +349,7 @@ static int osd_scrub_prep(struct osd_device *dev) if (flags & SS_RESET) osd_scrub_file_reset(scrub, - LDISKFS_SB(osd_sb(dev))->s_es->s_uuid, sf->sf_flags); + LDISKFS_SB(osd_sb(dev))->s_es->s_uuid, 0); if (flags & SS_AUTO) { scrub->os_full_speed = 1; @@ -256,16 +358,17 @@ static int osd_scrub_prep(struct osd_device *dev) scrub->os_full_speed = 0; } - if (sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT)) + if (sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT | SF_UPGRADE)) scrub->os_full_speed = 1; scrub->os_in_prior = 0; scrub->os_waiting = 0; + scrub->os_paused = 0; scrub->os_new_checked = 0; if (sf->sf_pos_last_checkpoint != 0) sf->sf_pos_latest_start = sf->sf_pos_last_checkpoint + 1; else - sf->sf_pos_latest_start = LDISKFS_FIRST_INO(osd_sb(dev)); + sf->sf_pos_latest_start = LDISKFS_FIRST_INO(osd_sb(dev)) + 1; scrub->os_pos_current = sf->sf_pos_latest_start; sf->sf_status = SS_SCANNING; @@ -273,66 +376,60 @@ static int osd_scrub_prep(struct osd_device *dev) sf->sf_time_last_checkpoint = sf->sf_time_latest_start; rc = osd_scrub_file_store(scrub); if (rc == 0) { - cfs_spin_lock(&scrub->os_lock); + spin_lock(&scrub->os_lock); thread_set_flags(thread, SVC_RUNNING); - cfs_spin_unlock(&scrub->os_lock); + spin_unlock(&scrub->os_lock); cfs_waitq_broadcast(&thread->t_ctl_waitq); } - cfs_up_write(&scrub->os_rwsem); + up_write(&scrub->os_rwsem); RETURN(rc); } static int -osd_scrub_error(struct osd_device *dev, struct osd_inode_id *lid, int rc) -{ - struct osd_scrub *scrub = &dev->od_scrub; - struct scrub_file *sf = &scrub->os_file; - - cfs_down_write(&scrub->os_rwsem); - scrub->os_new_checked++; - sf->sf_items_failed++; - if (sf->sf_pos_first_inconsistent == 0 || - sf->sf_pos_first_inconsistent > lid->oii_ino) - sf->sf_pos_first_inconsistent = lid->oii_ino; - cfs_up_write(&scrub->os_rwsem); - return sf->sf_param & SP_FAILOUT ? rc : 0; -} - -static int osd_scrub_check_update(struct osd_thread_info *info, struct osd_device *dev, - struct osd_idmap_cache *oic) + struct osd_idmap_cache *oic, int val) { struct osd_scrub *scrub = &dev->od_scrub; struct scrub_file *sf = &scrub->os_file; + struct lu_fid *fid = &oic->oic_fid; + struct osd_inode_id *lid = &oic->oic_lid; struct osd_inode_id *lid2 = &info->oti_id; - struct lu_fid *oi_fid = &info->oti_fid; - struct osd_inode_id *oi_id = &info->oti_id2; - handle_t *jh = NULL; struct osd_inconsistent_item *oii = NULL; struct inode *inode = NULL; - struct lu_fid *fid = &oic->oic_fid; - struct osd_inode_id *lid = &oic->oic_lid; - struct iam_container *bag; - struct iam_path_descr *ipd; int ops = DTO_INDEX_UPDATE; int idx; int rc; ENTRY; + down_write(&scrub->os_rwsem); + scrub->os_new_checked++; + if (val < 0) + GOTO(out, rc = val); + if (scrub->os_in_prior) oii = cfs_list_entry(oic, struct osd_inconsistent_item, oii_cache); - cfs_down_write(&scrub->os_rwsem); - scrub->os_new_checked++; if (lid->oii_ino < sf->sf_pos_latest_start && oii == NULL) GOTO(out, rc = 0); - if (oii != NULL && oii->oii_insert) + if (fid_is_igif(fid)) + sf->sf_items_igif++; + + if ((val == SCRUB_NEXT_NOLMA) && + (!dev->od_handle_nolma || OBD_FAIL_CHECK(OBD_FAIL_FID_NOLMA))) + GOTO(out, rc = 0); + + if ((oii != NULL && oii->oii_insert) || (val == SCRUB_NEXT_NOLMA)) goto iget; - rc = osd_oi_lookup(info, dev, fid, lid2); + /* XXX: Currently, no FID-in-LMA for OST object, so osd_oi_lookup() + * without checking FLD is enough. + * + * It should be updated if FID-in-LMA for OSD object introduced + * in the future. */ + rc = osd_oi_lookup(info, dev, fid, lid2, false); if (rc != 0) { if (rc != -ENOENT) GOTO(out, rc); @@ -348,54 +445,31 @@ iget: } /* Prevent the inode to be unlinked during OI scrub. */ - cfs_mutex_lock(&inode->i_mutex); + mutex_lock(&inode->i_mutex); if (unlikely(inode->i_nlink == 0)) { - cfs_mutex_unlock(&inode->i_mutex); + mutex_unlock(&inode->i_mutex); iput(inode); GOTO(out, rc = 0); } ops = DTO_INDEX_INSERT; idx = osd_oi_fid2idx(dev, fid); - if (unlikely(!ldiskfs_test_bit(idx, sf->sf_oi_bitmap))) - ldiskfs_set_bit(idx, sf->sf_oi_bitmap); - sf->sf_flags |= SF_RECREATED; + if (val == SCRUB_NEXT_NOLMA) { + rc = osd_ea_fid_set(info, inode, fid); + if (rc != 0) + GOTO(out, rc); + } else { + sf->sf_flags |= SF_RECREATED | SF_INCONSISTENT; + if (unlikely(!ldiskfs_test_bit(idx, sf->sf_oi_bitmap))) + ldiskfs_set_bit(idx, sf->sf_oi_bitmap); + } } else if (osd_id_eq(lid, lid2)) { - GOTO(out, rc = 0); - } - - sf->sf_flags |= SF_INCONSISTENT; - fid_cpu_to_be(oi_fid, fid); - osd_id_pack(oi_id, &oic->oic_lid); - jh = ldiskfs_journal_start_sb(osd_sb(dev), - osd_dto_credits_noquota[ops]); - if (IS_ERR(jh)) { - rc = PTR_ERR(jh); - CERROR("%.16s: fail to start trans for scrub store, rc = %d\n", - LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name, rc); - GOTO(out, rc); - } - - bag = &osd_fid2oi(dev, fid)->oi_dir.od_container; - ipd = osd_idx_ipd_get(info->oti_env, bag); - if (unlikely(ipd == NULL)) { - ldiskfs_journal_stop(jh); - CERROR("%.16s: fail to get ipd for scrub store\n", - LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name); - GOTO(out, rc = -ENOMEM); - } - - if (ops == DTO_INDEX_UPDATE) { - rc = iam_update(jh, bag, (const struct iam_key *)oi_fid, - (struct iam_rec *)oi_id, ipd); + GOTO(out, rc = 0); } else { - rc = iam_insert(jh, bag, (const struct iam_key *)oi_fid, - (struct iam_rec *)oi_id, ipd); - if (rc == -EEXIST) - rc = 1; + sf->sf_flags |= SF_INCONSISTENT; } - osd_ipd_put(info->oti_env, bag, ipd); - ldiskfs_journal_stop(jh); + + rc = osd_scrub_refresh_mapping(info, dev, fid, lid, ops); if (rc == 0) { if (scrub->os_in_prior) sf->sf_items_updated_prior++; @@ -416,29 +490,33 @@ out: } if (ops == DTO_INDEX_INSERT) { - cfs_mutex_unlock(&inode->i_mutex); + mutex_unlock(&inode->i_mutex); iput(inode); } - cfs_up_write(&scrub->os_rwsem); + up_write(&scrub->os_rwsem); if (oii != NULL) { LASSERT(!cfs_list_empty(&oii->oii_list)); - cfs_spin_lock(&scrub->os_lock); + spin_lock(&scrub->os_lock); cfs_list_del_init(&oii->oii_list); - cfs_spin_unlock(&scrub->os_lock); + spin_unlock(&scrub->os_lock); OBD_FREE_PTR(oii); } RETURN(sf->sf_param & SP_FAILOUT ? rc : 0); } -static int do_osd_scrub_checkpoint(struct osd_scrub *scrub) +static int osd_scrub_checkpoint(struct osd_scrub *scrub) { struct scrub_file *sf = &scrub->os_file; int rc; - ENTRY; - cfs_down_write(&scrub->os_rwsem); + if (likely(cfs_time_before(cfs_time_current(), + scrub->os_time_next_checkpoint) || + scrub->os_new_checked == 0)) + return 0; + + down_write(&scrub->os_rwsem); sf->sf_items_checked += scrub->os_new_checked; scrub->os_new_checked = 0; sf->sf_pos_last_checkpoint = scrub->os_pos_current; @@ -446,18 +524,9 @@ static int do_osd_scrub_checkpoint(struct osd_scrub *scrub) sf->sf_run_time += cfs_duration_sec(cfs_time_current() + HALF_SEC - scrub->os_time_last_checkpoint); rc = osd_scrub_file_store(scrub); - cfs_up_write(&scrub->os_rwsem); + up_write(&scrub->os_rwsem); - RETURN(rc); -} - -static inline int osd_scrub_checkpoint(struct osd_scrub *scrub) -{ - if (unlikely(cfs_time_beforeq(scrub->os_time_next_checkpoint, - cfs_time_current()) && - scrub->os_new_checked > 0)) - return do_osd_scrub_checkpoint(scrub); - return 0; + return rc; } static void osd_scrub_post(struct osd_scrub *scrub, int result) @@ -465,10 +534,10 @@ static void osd_scrub_post(struct osd_scrub *scrub, int result) struct scrub_file *sf = &scrub->os_file; ENTRY; - cfs_down_write(&scrub->os_rwsem); - cfs_spin_lock(&scrub->os_lock); + down_write(&scrub->os_rwsem); + spin_lock(&scrub->os_lock); thread_set_flags(&scrub->os_thread, SVC_STOPPING); - cfs_spin_unlock(&scrub->os_lock); + spin_unlock(&scrub->os_lock); if (scrub->os_new_checked > 0) { sf->sf_items_checked += scrub->os_new_checked; scrub->os_new_checked = 0; @@ -478,7 +547,8 @@ static void osd_scrub_post(struct osd_scrub *scrub, int result) if (result > 0) { sf->sf_status = SS_COMPLETED; memset(sf->sf_oi_bitmap, 0, SCRUB_OI_BITMAP_SIZE); - sf->sf_flags &= ~(SF_RECREATED | SF_INCONSISTENT | SF_AUTO); + sf->sf_flags &= ~(SF_RECREATED | SF_INCONSISTENT | + SF_UPGRADE | SF_AUTO); sf->sf_time_last_complete = sf->sf_time_last_checkpoint; sf->sf_success_count++; } else if (result == 0) { @@ -496,19 +566,12 @@ static void osd_scrub_post(struct osd_scrub *scrub, int result) CERROR("%.16s: fail to osd_scrub_post, rc = %d\n", LDISKFS_SB(osd_scrub2sb(scrub))->s_es->s_volume_name, result); - cfs_up_write(&scrub->os_rwsem); + up_write(&scrub->os_rwsem); EXIT; } -#define SCRUB_NEXT_BREAK 1 /* exit current loop and process next group */ -#define SCRUB_NEXT_CONTINUE 2 /* skip current object and process next bit */ -#define SCRUB_NEXT_EXIT 3 /* exit all the loops */ -#define SCRUB_NEXT_WAIT 4 /* wait for free cache slot */ -#define SCRUB_NEXT_CRASH 5 /* simulate system crash during OI scrub */ -#define SCRUB_NEXT_FATAL 6 /* simulate failure during OI scrub */ -#define SCRUB_NEXT_NOSCRUB 7 /* new created object, no scrub on it */ -#define SCRUB_NEXT_IGIF 8 /* IGIF object */ +/* iteration engine */ struct osd_iit_param { struct super_block *sb; @@ -530,12 +593,6 @@ typedef int (*osd_iit_exec_policy)(struct osd_thread_info *info, struct osd_idmap_cache *oic, int *noslot, int rc); -static inline int osd_scrub_has_window(struct osd_scrub *scrub, - struct osd_otable_cache *ooc) -{ - return scrub->os_pos_current < ooc->ooc_pos_preload + SCRUB_WINDOW_SIZE; -} - static int osd_iit_next(struct osd_iit_param *param, __u32 *pos) { param->offset = ldiskfs_find_next_bit(param->bitmap->b_data, @@ -551,13 +608,14 @@ static int osd_iit_next(struct osd_iit_param *param, __u32 *pos) static int osd_iit_iget(struct osd_thread_info *info, struct osd_device *dev, struct lu_fid *fid, struct osd_inode_id *lid, __u32 pos, - struct super_block *sb, struct inode **pinode) + struct super_block *sb, bool scrub) { - struct inode *inode; - int rc; + struct lustre_mdt_attrs *lma = &info->oti_mdt_attrs; + struct inode *inode; + int rc; osd_id_gen(lid, pos, OSD_OII_NOGEN); - inode = osd_iget_fid(info, dev, lid, fid); + inode = osd_iget(info, dev, lid); if (IS_ERR(inode)) { rc = PTR_ERR(inode); /* The inode may be removed after bitmap searching, or the @@ -570,8 +628,38 @@ static int osd_iit_iget(struct osd_thread_info *info, struct osd_device *dev, return rc; } - *pinode = inode; - return 0; + /* If the inode has no OI mapping, then it is special locally used, + * should be invisible to OI scrub or up layer LFSCK. */ + if (ldiskfs_test_inode_state(inode, LDISKFS_STATE_LUSTRE_NO_OI)) { + iput(inode); + return SCRUB_NEXT_CONTINUE; + } + + if (scrub && + ldiskfs_test_inode_state(inode, LDISKFS_STATE_LUSTRE_NOSCRUB)) { + /* Only skip it for the first OI scrub accessing. */ + ldiskfs_clear_inode_state(inode, LDISKFS_STATE_LUSTRE_NOSCRUB); + iput(inode); + return SCRUB_NEXT_NOSCRUB; + } + + rc = osd_get_lma(info, inode, &info->oti_obj_dentry, lma); + if (rc == 0) { + if (!scrub) { + if (!fid_is_client_visible(&lma->lma_self_fid)) + rc = SCRUB_NEXT_CONTINUE; + else + *fid = lma->lma_self_fid; + } + } else if (rc == -ENODATA) { + lu_igif_build(fid, inode->i_ino, inode->i_generation); + if (scrub) + rc = SCRUB_NEXT_NOLMA; + else + rc = 0; + } + iput(inode); + return rc; } static int osd_scrub_next(struct osd_thread_info *info, struct osd_device *dev, @@ -582,7 +670,6 @@ static int osd_scrub_next(struct osd_thread_info *info, struct osd_device *dev, struct ptlrpc_thread *thread = &scrub->os_thread; struct lu_fid *fid; struct osd_inode_id *lid; - struct inode *inode; int rc; if (OBD_FAIL_CHECK(OBD_FAIL_OSD_SCRUB_DELAY) && cfs_fail_val > 0) { @@ -596,9 +683,9 @@ static int osd_scrub_next(struct osd_thread_info *info, struct osd_device *dev, } if (OBD_FAIL_CHECK(OBD_FAIL_OSD_SCRUB_CRASH)) { - cfs_spin_lock(&scrub->os_lock); + spin_lock(&scrub->os_lock); thread_set_flags(thread, SVC_STOPPING); - cfs_spin_unlock(&scrub->os_lock); + spin_unlock(&scrub->os_lock); return SCRUB_NEXT_CRASH; } @@ -629,19 +716,7 @@ static int osd_scrub_next(struct osd_thread_info *info, struct osd_device *dev, fid = &(*oic)->oic_fid; lid = &(*oic)->oic_lid; rc = osd_iit_iget(info, dev, fid, lid, - scrub->os_pos_current, param->sb, &inode); - if (rc != 0) - return rc; - - if (inode->i_state & I_LUSTRE_NOSCRUB) { - /* Only skip it for the first OI scrub accessing. */ - inode->i_state &= ~I_LUSTRE_NOSCRUB; - rc = SCRUB_NEXT_NOSCRUB; - } else if (!fid_is_norm(fid)) { - rc = SCRUB_NEXT_IGIF; - } - - iput(inode); + scrub->os_pos_current, param->sb, true); return rc; } @@ -652,7 +727,6 @@ static int osd_preload_next(struct osd_thread_info *info, struct osd_otable_cache *ooc = &dev->od_otable_it->ooi_cache; struct osd_scrub *scrub; struct ptlrpc_thread *thread; - struct inode *inode; int rc; rc = osd_iit_next(param, &ooc->ooc_pos_preload); @@ -668,12 +742,10 @@ static int osd_preload_next(struct osd_thread_info *info, rc = osd_iit_iget(info, dev, &ooc->ooc_cache[ooc->ooc_producer_idx].oic_fid, &ooc->ooc_cache[ooc->ooc_producer_idx].oic_lid, - ooc->ooc_pos_preload, param->sb, &inode); + ooc->ooc_pos_preload, param->sb, false); /* If succeed, it needs to move forward; otherwise up layer LFSCK may * ignore the failure, so it still need to skip the inode next time. */ ooc->ooc_pos_preload = param->gbase + ++(param->offset); - if (rc == 0) - iput(inode); return rc; } @@ -684,7 +756,6 @@ static int osd_scrub_exec(struct osd_thread_info *info, struct osd_device *dev, struct l_wait_info lwi = { 0 }; struct osd_scrub *scrub = &dev->od_scrub; struct scrub_file *sf = &scrub->os_file; - __u64 *items = NULL; struct ptlrpc_thread *thread = &scrub->os_thread; struct osd_otable_it *it = dev->od_otable_it; struct osd_otable_cache *ooc = it ? &it->ooi_cache : NULL; @@ -695,27 +766,14 @@ static int osd_scrub_exec(struct osd_thread_info *info, struct osd_device *dev, case SCRUB_NEXT_WAIT: goto wait; case SCRUB_NEXT_NOSCRUB: - items = &sf->sf_items_noscrub; - break; - case SCRUB_NEXT_IGIF: - items = &sf->sf_items_igif; - break; - } - - if (items != NULL) { - cfs_down_write(&scrub->os_rwsem); + down_write(&scrub->os_rwsem); scrub->os_new_checked++; - (*items)++; - cfs_up_write(&scrub->os_rwsem); + sf->sf_items_noscrub++; + up_write(&scrub->os_rwsem); goto next; } - LASSERTF(rc <= 0, "unexpected rc = %d\n", rc); - - if (rc != 0) - rc = osd_scrub_error(dev, &oic->oic_lid, rc); - else - rc = osd_scrub_check_update(info, dev, oic); + rc = osd_scrub_check_update(info, dev, oic, rc); if (rc != 0) return rc; @@ -782,7 +840,7 @@ static int osd_preload_exec(struct osd_thread_info *info, #define SCRUB_IT_CRASH 2 static int osd_inode_iteration(struct osd_thread_info *info, - struct osd_device *dev, __u32 max, int preload) + struct osd_device *dev, __u32 max, bool preload) { osd_iit_next_policy next; osd_iit_exec_policy exec; @@ -794,7 +852,7 @@ static int osd_inode_iteration(struct osd_thread_info *info, int rc; ENTRY; - if (preload == 0) { + if (!preload) { struct osd_scrub *scrub = &dev->od_scrub; next = osd_scrub_next; @@ -849,10 +907,18 @@ static int osd_inode_iteration(struct osd_thread_info *info, brelse(param.bitmap); RETURN(rc); } + + if (preload && dev->od_otable_it->ooi_stopping) { + brelse(param.bitmap); + RETURN(0); + } } next_group: brelse(param.bitmap); + + if (preload && dev->od_otable_it->ooi_stopping) + RETURN(0); } if (*pos > limit) @@ -860,6 +926,32 @@ next_group: RETURN(0); } +static int osd_otable_it_preload(const struct lu_env *env, + struct osd_otable_it *it) +{ + struct osd_device *dev = it->ooi_dev; + struct osd_scrub *scrub = &dev->od_scrub; + struct osd_otable_cache *ooc = &it->ooi_cache; + int rc; + ENTRY; + + rc = osd_inode_iteration(osd_oti_get(env), dev, + OSD_OTABLE_IT_CACHE_SIZE, true); + if (rc == SCRUB_IT_ALL) + it->ooi_all_cached = 1; + + CDEBUG(D_LFSCK, "OSD pre-loaded: max = %u, preload = %u, rc = %d\n", + le32_to_cpu(LDISKFS_SB(osd_sb(dev))->s_es->s_inodes_count), + ooc->ooc_pos_preload, rc); + + if (scrub->os_waiting && osd_scrub_has_window(scrub, ooc)) { + scrub->os_waiting = 0; + cfs_waitq_broadcast(&scrub->os_thread.t_ctl_waitq); + } + + RETURN(rc < 0 ? rc : ooc->ooc_cached_items); +} + static int osd_scrub_main(void *args) { struct lu_env env; @@ -903,7 +995,7 @@ static int osd_scrub_main(void *args) CDEBUG(D_LFSCK, "OI scrub: flags = 0x%x, pos = %u\n", scrub->os_start_flags, scrub->os_pos_current); - rc = osd_inode_iteration(osd_oti_get(&env), dev, ~0U, 0); + rc = osd_inode_iteration(osd_oti_get(&env), dev, ~0U, false); if (unlikely(rc == SCRUB_IT_CRASH)) GOTO(out, rc = -EINVAL); GOTO(post, rc); @@ -925,13 +1017,502 @@ out: lu_env_fini(&env); noenv: - cfs_spin_lock(&scrub->os_lock); + spin_lock(&scrub->os_lock); thread_set_flags(thread, SVC_STOPPED); cfs_waitq_broadcast(&thread->t_ctl_waitq); - cfs_spin_unlock(&scrub->os_lock); + spin_unlock(&scrub->os_lock); return rc; } +/* initial OI scrub */ + +typedef int (*scandir_t)(struct osd_thread_info *, struct osd_device *, + struct dentry *, filldir_t filldir); + +static int osd_ios_varfid_fill(void *buf, const char *name, int namelen, + loff_t offset, __u64 ino, unsigned d_type); + +static int +osd_ios_general_scan(struct osd_thread_info *info, struct osd_device *dev, + struct dentry *dentry, filldir_t filldir); +static int +osd_ios_ROOT_scan(struct osd_thread_info *info, struct osd_device *dev, + struct dentry *dentry, filldir_t filldir); + +static int +osd_ios_OBJECTS_scan(struct osd_thread_info *info, struct osd_device *dev, + struct dentry *dentry, filldir_t filldir); + +enum osd_lf_flags { + OLF_SCAN_SUBITEMS = 0x0001, + OLF_HIDE_FID = 0x0002, + OLF_SHOW_NAME = 0x0004, +}; + +struct osd_lf_map { + char *olm_name; + struct lu_fid olm_fid; + __u16 olm_flags; + scandir_t olm_scandir; + filldir_t olm_filldir; +}; + +/* Add the new introduced local files in the list in the future. */ +static const struct osd_lf_map osd_lf_maps[] = { + /* CATALOGS */ + { CATLIST, { FID_SEQ_LOCAL_FILE, LLOG_CATALOGS_OID, 0 }, OLF_SHOW_NAME, + NULL, NULL }, + + /* CONFIGS */ + { MOUNT_CONFIGS_DIR, { FID_SEQ_LOCAL_FILE, MGS_CONFIGS_OID, 0 }, + OLF_SCAN_SUBITEMS, osd_ios_general_scan, + osd_ios_varfid_fill }, + + /* NIDTBL_VERSIONS */ + { MGS_NIDTBL_DIR, { 0, 0, 0 }, OLF_SCAN_SUBITEMS, + osd_ios_general_scan, osd_ios_varfid_fill }, + + /* PENDING */ + { "PENDING", { FID_SEQ_LOCAL_FILE, MDD_ORPHAN_OID, 0 }, 0, NULL, NULL }, + + /* ROOT */ + { "ROOT", { FID_SEQ_LOCAL_FILE, MDD_ROOT_INDEX_OID, 0 }, + OLF_SCAN_SUBITEMS | OLF_HIDE_FID, osd_ios_ROOT_scan, NULL }, + + /* capa_keys */ + { CAPA_KEYS, { FID_SEQ_LOCAL_FILE, MDD_CAPA_KEYS_OID, 0 }, 0, + NULL, NULL }, + + /* changelog_catalog */ + { CHANGELOG_CATALOG, { 0, 0, 0 }, 0, NULL, NULL }, + + /* changelog_users */ + { CHANGELOG_USERS, { 0, 0, 0 }, 0, NULL, NULL }, + + /* fld */ + { "fld", { FID_SEQ_LOCAL_FILE, FLD_INDEX_OID, 0 }, OLF_SHOW_NAME, + NULL, NULL }, + + /* last_rcvd */ + { LAST_RCVD, { FID_SEQ_LOCAL_FILE, LAST_RECV_OID, 0 }, OLF_SHOW_NAME, + NULL, NULL }, + + /* lfsck_bookmark */ + { "lfsck_bookmark", { FID_SEQ_LOCAL_FILE, LFSCK_BOOKMARK_OID, 0 }, 0, + NULL, NULL }, + + /* lov_objid */ + { LOV_OBJID, { FID_SEQ_LOCAL_FILE, MDD_LOV_OBJ_OID, 0 }, OLF_SHOW_NAME, + NULL, NULL }, + + /* lov_objseq */ + { LOV_OBJSEQ, { FID_SEQ_LOCAL_FILE, MDD_LOV_OBJ_OSEQ, 0 }, + OLF_SHOW_NAME, NULL, NULL }, + + /* quota_master */ + { QMT_DIR, { 0, 0, 0 }, OLF_SCAN_SUBITEMS, + osd_ios_general_scan, osd_ios_varfid_fill }, + + /* quota_slave */ + { QSD_DIR, { 0, 0, 0 }, OLF_SCAN_SUBITEMS, + osd_ios_general_scan, osd_ios_varfid_fill }, + + /* seq-200000003-lastid */ + { "seq-200000003-lastid", { FID_SEQ_LOCAL_NAME, 1, 0 }, 0, + NULL, NULL }, + + /* seq_ctl */ + { "seq_ctl", { FID_SEQ_LOCAL_FILE, FID_SEQ_CTL_OID, 0 }, + OLF_SHOW_NAME, NULL, NULL }, + + /* seq_srv */ + { "seq_srv", { FID_SEQ_LOCAL_FILE, FID_SEQ_SRV_OID, 0 }, + OLF_SHOW_NAME, NULL, NULL }, + + /* LAST_GROUP */ + { "LAST_GROUP", { FID_SEQ_LOCAL_FILE, OFD_LAST_GROUP_OID, 0 }, + OLF_SHOW_NAME, NULL, NULL }, + + /* health_check */ + { HEALTH_CHECK, { FID_SEQ_LOCAL_FILE, OFD_HEALTH_CHECK_OID, 0 }, + OLF_SHOW_NAME, NULL, NULL }, + + /* lfsck_namespace */ + { "lfsck_namespace", { FID_SEQ_LOCAL_FILE, LFSCK_NAMESPACE_OID, 0 }, 0, + NULL, NULL }, + + /* OBJECTS, upgrade from old device */ + { OBJECTS, { 0, 0, 0 }, OLF_SCAN_SUBITEMS, osd_ios_OBJECTS_scan, NULL }, + + /* lquota_v2.user, upgrade from old device */ + { "lquota_v2.user", { 0, 0, 0 }, 0, NULL, NULL }, + + /* lquota_v2.group, upgrade from old device */ + { "lquota_v2.group", { 0, 0, 0 }, 0, NULL, NULL }, + + { NULL, { 0, 0, 0 }, 0, NULL, NULL } +}; + +struct osd_ios_item { + cfs_list_t oii_list; + struct dentry *oii_dentry; + scandir_t oii_scandir; + filldir_t oii_filldir; +}; + +struct osd_ios_filldir_buf { + struct osd_thread_info *oifb_info; + struct osd_device *oifb_dev; + struct dentry *oifb_dentry; +}; + +static inline struct dentry * +osd_ios_lookup_one_len(const char *name, struct dentry *parent, int namelen) +{ + struct dentry *dentry; + + dentry = ll_lookup_one_len(name, parent, namelen); + if (!IS_ERR(dentry) && dentry->d_inode == NULL) { + dput(dentry); + return ERR_PTR(-ENOENT); + } + + return dentry; +} + +static inline void +osd_ios_llogname2fid(struct lu_fid *fid, const char *name, int namelen) +{ + obd_id id = 0; + int i = 0; + + fid->f_seq = FID_SEQ_LLOG; + while (i < namelen) + id = id * 10 + name[i++] - '0'; + + fid->f_oid = id & 0x00000000ffffffffULL; + fid->f_ver = id >> 32; +} + +static inline void +osd_ios_Oname2fid(struct lu_fid *fid, const char *name, int namelen) +{ + __u64 seq = 0; + int i = 0; + + while (i < namelen) + seq = seq * 10 + name[i++] - '0'; + + lu_last_id_fid(fid, seq); +} + +static int +osd_ios_new_item(struct osd_device *dev, struct dentry *dentry, + scandir_t scandir, filldir_t filldir) +{ + struct osd_ios_item *item; + + OBD_ALLOC_PTR(item); + if (item == NULL) + return -ENOMEM; + + CFS_INIT_LIST_HEAD(&item->oii_list); + item->oii_dentry = dget(dentry); + item->oii_scandir = scandir; + item->oii_filldir = filldir; + cfs_list_add_tail(&item->oii_list, &dev->od_ios_list); + return 0; +} + +/** + * osd_ios_scan_one() - check/fix LMA FID and OI entry for one inode + * + * The passed \a inode's \a fid is verified against the LMA FID. If the \a fid + * is NULL or is empty the IGIF FID is used. The FID is verified in the OI to + * reference the inode, or fixed if it is missing or references another inode. + */ +static int +osd_ios_scan_one(struct osd_thread_info *info, struct osd_device *dev, + struct inode *inode, const struct lu_fid *fid, int flags) +{ + struct lustre_mdt_attrs *lma = &info->oti_mdt_attrs; + struct osd_inode_id *id = &info->oti_id; + struct osd_inode_id *id2 = &info->oti_id2; + struct osd_scrub *scrub = &dev->od_scrub; + struct scrub_file *sf = &scrub->os_file; + struct lu_fid tfid; + int rc; + ENTRY; + + rc = osd_get_lma(info, inode, &info->oti_obj_dentry, lma); + if (rc != 0 && rc != -ENODATA) + RETURN(rc); + + osd_id_gen(id, inode->i_ino, inode->i_generation); + if (rc == -ENODATA) { + if (fid == NULL || fid_is_zero(fid) || flags & OLF_HIDE_FID) + lu_igif_build(&tfid, inode->i_ino, inode->i_generation); + else + tfid = *fid; + rc = osd_ea_fid_set(info, inode, &tfid); + if (rc != 0) + RETURN(rc); + } else { + tfid = lma->lma_self_fid; + } + + rc = __osd_oi_lookup(info, dev, &tfid, id2); + if (rc != 0) { + if (rc != -ENOENT) + RETURN(rc); + + rc = osd_scrub_refresh_mapping(info, dev, &tfid, id, + DTO_INDEX_INSERT); + RETURN(rc); + } + + if (osd_id_eq_strict(id, id2)) + RETURN(0); + + if (!(sf->sf_flags & SF_INCONSISTENT)) { + osd_scrub_file_reset(scrub, + LDISKFS_SB(osd_sb(dev))->s_es->s_uuid, + SF_INCONSISTENT); + rc = osd_scrub_file_store(scrub); + if (rc != 0) + RETURN(rc); + } + + rc = osd_scrub_refresh_mapping(info, dev, &tfid, id, DTO_INDEX_UPDATE); + + RETURN(rc); +} + +static int osd_ios_varfid_fill(void *buf, const char *name, int namelen, + loff_t offset, __u64 ino, unsigned d_type) +{ + struct osd_ios_filldir_buf *fill_buf = buf; + struct osd_device *dev = fill_buf->oifb_dev; + struct dentry *child; + int rc; + ENTRY; + + /* skip any '.' started names */ + if (name[0] == '.') + RETURN(0); + + child = osd_ios_lookup_one_len(name, fill_buf->oifb_dentry, namelen); + if (IS_ERR(child)) + RETURN(PTR_ERR(child)); + + rc = osd_ios_scan_one(fill_buf->oifb_info, dev, child->d_inode, + NULL, 0); + if (rc == 0 && S_ISDIR(child->d_inode->i_mode)) + rc = osd_ios_new_item(dev, child, osd_ios_general_scan, + osd_ios_varfid_fill); + dput(child); + + RETURN(rc); +} + +static int osd_ios_root_fill(void *buf, const char *name, int namelen, + loff_t offset, __u64 ino, unsigned d_type) +{ + struct osd_ios_filldir_buf *fill_buf = buf; + struct osd_device *dev = fill_buf->oifb_dev; + const struct osd_lf_map *map; + struct dentry *child; + int rc = 0; + ENTRY; + + /* skip any '.' started names */ + if (name[0] == '.') + RETURN(0); + + for (map = osd_lf_maps; map->olm_name != NULL; map++) { + if (strlen(map->olm_name) != namelen) + continue; + + if (strncmp(map->olm_name, name, namelen) == 0) + break; + } + + if (map->olm_name == NULL) + RETURN(0); + + child = osd_ios_lookup_one_len(name, fill_buf->oifb_dentry, namelen); + if (IS_ERR(child)) + RETURN(PTR_ERR(child)); + + rc = osd_ios_scan_one(fill_buf->oifb_info, dev, child->d_inode, + &map->olm_fid, map->olm_flags); + if (rc == 0 && map->olm_flags & OLF_SCAN_SUBITEMS) + rc = osd_ios_new_item(dev, child, map->olm_scandir, + map->olm_filldir); + dput(child); + + RETURN(rc); +} + +static int +osd_ios_general_scan(struct osd_thread_info *info, struct osd_device *dev, + struct dentry *dentry, filldir_t filldir) +{ + struct osd_ios_filldir_buf buf = { info, dev, dentry }; + struct file *filp = &info->oti_it_ea.oie_file; + struct inode *inode = dentry->d_inode; + const struct file_operations *fops = inode->i_fop; + int rc; + ENTRY; + + LASSERT(filldir != NULL); + + filp->f_pos = 0; + filp->f_dentry = dentry; + filp->f_mode = FMODE_64BITHASH; + filp->f_mapping = inode->i_mapping; + filp->f_op = fops; + filp->private_data = NULL; + + rc = fops->readdir(filp, &buf, filldir); + fops->release(inode, filp); + + RETURN(rc); +} + +static int +osd_ios_ROOT_scan(struct osd_thread_info *info, struct osd_device *dev, + struct dentry *dentry, filldir_t filldir) +{ + struct osd_scrub *scrub = &dev->od_scrub; + struct scrub_file *sf = &scrub->os_file; + struct dentry *child; + int rc; + ENTRY; + + /* It is existing MDT device. */ + dev->od_handle_nolma = 1; + child = osd_ios_lookup_one_len(dot_lustre_name, dentry, + strlen(dot_lustre_name)); + if (IS_ERR(child)) { + rc = PTR_ERR(child); + if (rc == -ENOENT) { + /* It is 1.8 MDT device. */ + if (!(sf->sf_flags & SF_UPGRADE)) { + osd_scrub_file_reset(scrub, + LDISKFS_SB(osd_sb(dev))->s_es->s_uuid, + SF_UPGRADE); + rc = osd_scrub_file_store(scrub); + } else { + rc = 0; + } + } + } else { + rc = osd_ios_scan_one(info, dev, child->d_inode, NULL, 0); + dput(child); + } + + RETURN(rc); +} + +static int +osd_ios_OBJECTS_scan(struct osd_thread_info *info, struct osd_device *dev, + struct dentry *dentry, filldir_t filldir) +{ + struct dentry *child; + int rc; + ENTRY; + + child = osd_ios_lookup_one_len(ADMIN_USR, dentry, strlen(ADMIN_USR)); + if (!IS_ERR(child)) { + rc = osd_ios_scan_one(info, dev, child->d_inode, NULL, 0); + dput(child); + } else { + rc = PTR_ERR(child); + } + + if (rc != 0 && rc != -ENOENT) + RETURN(rc); + + child = osd_ios_lookup_one_len(ADMIN_GRP, dentry, strlen(ADMIN_GRP)); + if (!IS_ERR(child)) { + rc = osd_ios_scan_one(info, dev, child->d_inode, NULL, 0); + dput(child); + } else { + rc = PTR_ERR(child); + } + + if (rc == -ENOENT) + rc = 0; + + RETURN(rc); +} + +static int osd_initial_OI_scrub(struct osd_thread_info *info, + struct osd_device *dev) +{ + struct osd_ios_item *item = NULL; + scandir_t scandir = osd_ios_general_scan; + filldir_t filldir = osd_ios_root_fill; + struct dentry *dentry = osd_sb(dev)->s_root; + int rc; + ENTRY; + + while (1) { + rc = scandir(info, dev, dentry, filldir); + if (item != NULL) { + dput(item->oii_dentry); + OBD_FREE_PTR(item); + } + + if (rc != 0) + break; + + if (cfs_list_empty(&dev->od_ios_list)) + break; + + item = cfs_list_entry(dev->od_ios_list.next, + struct osd_ios_item, oii_list); + cfs_list_del_init(&item->oii_list); + + LASSERT(item->oii_scandir != NULL); + scandir = item->oii_scandir; + filldir = item->oii_filldir; + dentry = item->oii_dentry; + } + + while (!cfs_list_empty(&dev->od_ios_list)) { + item = cfs_list_entry(dev->od_ios_list.next, + struct osd_ios_item, oii_list); + cfs_list_del_init(&item->oii_list); + dput(item->oii_dentry); + OBD_FREE_PTR(item); + } + + RETURN(rc); +} + +char *osd_lf_fid2name(const struct lu_fid *fid) +{ + const struct osd_lf_map *map = osd_lf_maps; + + while (map->olm_name != NULL) { + if (!lu_fid_eq(fid, &map->olm_fid)) { + map++; + continue; + } + + if (map->olm_flags & OLF_SHOW_NAME) + return map->olm_name; + else + return ""; + } + + return NULL; +} + +/* OI scrub start/stop */ + static int do_osd_scrub_start(struct osd_device *dev, __u32 flags) { struct osd_scrub *scrub = &dev->od_scrub; @@ -942,18 +1523,18 @@ static int do_osd_scrub_start(struct osd_device *dev, __u32 flags) again: /* os_lock: sync status between stop and scrub thread */ - cfs_spin_lock(&scrub->os_lock); + spin_lock(&scrub->os_lock); if (thread_is_running(thread)) { - cfs_spin_unlock(&scrub->os_lock); + spin_unlock(&scrub->os_lock); RETURN(-EALREADY); } else if (unlikely(thread_is_stopping(thread))) { - cfs_spin_unlock(&scrub->os_lock); + spin_unlock(&scrub->os_lock); l_wait_event(thread->t_ctl_waitq, thread_is_stopped(thread), &lwi); goto again; } - cfs_spin_unlock(&scrub->os_lock); + spin_unlock(&scrub->os_lock); if (scrub->os_file.sf_status == SS_COMPLETED) flags |= SS_RESET; @@ -980,9 +1561,9 @@ int osd_scrub_start(struct osd_device *dev) ENTRY; /* od_otable_mutex: prevent curcurrent start/stop */ - cfs_mutex_lock(&dev->od_otable_mutex); + mutex_lock(&dev->od_otable_mutex); rc = do_osd_scrub_start(dev, SS_AUTO); - cfs_mutex_unlock(&dev->od_otable_mutex); + mutex_unlock(&dev->od_otable_mutex); RETURN(rc == -EALREADY ? 0 : rc); } @@ -993,30 +1574,32 @@ static void do_osd_scrub_stop(struct osd_scrub *scrub) struct l_wait_info lwi = { 0 }; /* os_lock: sync status between stop and scrub thread */ - cfs_spin_lock(&scrub->os_lock); + spin_lock(&scrub->os_lock); if (!thread_is_init(thread) && !thread_is_stopped(thread)) { thread_set_flags(thread, SVC_STOPPING); - cfs_spin_unlock(&scrub->os_lock); + spin_unlock(&scrub->os_lock); cfs_waitq_broadcast(&thread->t_ctl_waitq); l_wait_event(thread->t_ctl_waitq, thread_is_stopped(thread), &lwi); /* Do not skip the last lock/unlock, which can guarantee that * the caller cannot return until the OI scrub thread exit. */ - cfs_spin_lock(&scrub->os_lock); + spin_lock(&scrub->os_lock); } - cfs_spin_unlock(&scrub->os_lock); + spin_unlock(&scrub->os_lock); } static void osd_scrub_stop(struct osd_device *dev) { /* od_otable_mutex: prevent curcurrent start/stop */ - cfs_mutex_lock(&dev->od_otable_mutex); + mutex_lock(&dev->od_otable_mutex); dev->od_scrub.os_paused = 1; do_osd_scrub_stop(&dev->od_scrub); - cfs_mutex_unlock(&dev->od_otable_mutex); + mutex_unlock(&dev->od_otable_mutex); } +/* OI scrub setup/cleanup */ + static const char osd_scrub_name[] = "OI_scrub"; int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev) @@ -1025,14 +1608,11 @@ int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev) struct osd_scrub *scrub = &dev->od_scrub; struct lvfs_run_ctxt *ctxt = &scrub->os_ctxt; struct scrub_file *sf = &scrub->os_file; - struct osd_inode_id *id = &scrub->os_oic.oic_lid; struct super_block *sb = osd_sb(dev); struct ldiskfs_super_block *es = LDISKFS_SB(sb)->s_es; - struct inode *inode; struct lvfs_run_ctxt saved; struct file *filp; int dirty = 0; - int init = 0; int rc = 0; ENTRY; @@ -1043,8 +1623,8 @@ int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev) ctxt->fs = get_ds(); cfs_waitq_init(&scrub->os_thread.t_ctl_waitq); - cfs_init_rwsem(&scrub->os_rwsem); - cfs_spin_lock_init(&scrub->os_lock); + init_rwsem(&scrub->os_rwsem); + spin_lock_init(&scrub->os_lock); CFS_INIT_LIST_HEAD(&scrub->os_inconsistent_items); push_ctxt(&saved, ctxt, NULL); @@ -1055,12 +1635,13 @@ int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev) scrub->os_inode = igrab(filp->f_dentry->d_inode); filp_close(filp, 0); pop_ctxt(&saved, ctxt, NULL); + ldiskfs_set_inode_state(scrub->os_inode, + LDISKFS_STATE_LUSTRE_NO_OI); rc = osd_scrub_file_load(scrub); if (rc == -ENOENT) { osd_scrub_file_init(scrub, es->s_uuid); dirty = 1; - init = 1; } else if (rc != 0) { RETURN(rc); } else { @@ -1076,7 +1657,7 @@ int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev) if (sf->sf_pos_last_checkpoint != 0) scrub->os_pos_current = sf->sf_pos_last_checkpoint + 1; else - scrub->os_pos_current = LDISKFS_FIRST_INO(sb); + scrub->os_pos_current = LDISKFS_FIRST_INO(sb) + 1; if (dirty != 0) { rc = osd_scrub_file_store(scrub); @@ -1089,32 +1670,14 @@ int osd_scrub_setup(const struct lu_env *env, struct osd_device *dev) if (rc < 0) RETURN(rc); - if (init != 0) { - rc = __osd_oi_lookup(info, dev, &LU_DOT_LUSTRE_FID, id); - if (rc == 0) { - inode = osd_iget(info, dev, id); - if (IS_ERR(inode)) { - rc = PTR_ERR(inode); - /* It is restored from old 2.x backup. */ - if (rc == -ENOENT || rc == -ESTALE) { - osd_scrub_file_reset(scrub, es->s_uuid, - SF_INCONSISTENT); - rc = osd_scrub_file_store(scrub); - } - } else { - iput(inode); - } - } else if (rc == -ENOENT) { - rc = 0; - } - } - + rc = osd_initial_OI_scrub(info, dev); if (rc == 0 && !dev->od_noscrub && ((sf->sf_status == SS_PAUSED) || (sf->sf_status == SS_CRASHED && - sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT | SF_AUTO)) || + sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT | SF_UPGRADE | + SF_AUTO)) || (sf->sf_status == SS_INIT && - sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT)))) + sf->sf_flags & (SF_RECREATED | SF_INCONSISTENT | SF_UPGRADE)))) rc = osd_scrub_start(dev); RETURN(rc); @@ -1135,6 +1698,8 @@ void osd_scrub_cleanup(const struct lu_env *env, struct osd_device *dev) osd_oi_fini(osd_oti_get(env), dev); } +/* object table based iteration APIs */ + static struct dt_it *osd_otable_it_init(const struct lu_env *env, struct dt_object *dt, __u32 attr, struct lustre_capa *capa) @@ -1149,7 +1714,7 @@ static struct dt_it *osd_otable_it_init(const struct lu_env *env, ENTRY; /* od_otable_mutex: prevent curcurrent init/fini */ - cfs_mutex_lock(&dev->od_otable_mutex); + mutex_lock(&dev->od_otable_mutex); if (dev->od_otable_it != NULL) GOTO(out, it = ERR_PTR(-EALREADY)); @@ -1159,6 +1724,7 @@ static struct dt_it *osd_otable_it_init(const struct lu_env *env, dev->od_otable_it = it; it->ooi_dev = dev; + it->ooi_pid = cfs_curproc_pid(); it->ooi_cache.ooc_consumer_idx = -1; if (flags & DOIF_OUTUSED) it->ooi_used_outside = 1; @@ -1174,20 +1740,18 @@ static struct dt_it *osd_otable_it_init(const struct lu_env *env, } rc = do_osd_scrub_start(dev, start); - if (rc == -EALREADY) { - it->ooi_cache.ooc_pos_preload = scrub->os_pos_current - 1; - } else if (rc < 0) { + if (rc < 0 && rc != -EALREADY) { dev->od_otable_it = NULL; OBD_FREE_PTR(it); - GOTO(out, it = ERR_PTR(-EALREADY)); - } else { - it->ooi_cache.ooc_pos_preload = scrub->os_pos_current; + GOTO(out, it = ERR_PTR(rc)); } + it->ooi_cache.ooc_pos_preload = scrub->os_pos_current; + GOTO(out, it); out: - cfs_mutex_unlock(&dev->od_otable_mutex); + mutex_unlock(&dev->od_otable_mutex); return (struct dt_it *)it; } @@ -1197,85 +1761,45 @@ static void osd_otable_it_fini(const struct lu_env *env, struct dt_it *di) struct osd_device *dev = it->ooi_dev; /* od_otable_mutex: prevent curcurrent init/fini */ - cfs_mutex_lock(&dev->od_otable_mutex); + mutex_lock(&dev->od_otable_mutex); do_osd_scrub_stop(&dev->od_scrub); LASSERT(dev->od_otable_it == it); dev->od_otable_it = NULL; - cfs_mutex_unlock(&dev->od_otable_mutex); + mutex_unlock(&dev->od_otable_mutex); OBD_FREE_PTR(it); } -/** - * XXX: Temporary used to notify otable iteration to be paused. - */ -static void osd_otable_it_put(const struct lu_env *env, struct dt_it *di) +static int osd_otable_it_get(const struct lu_env *env, + struct dt_it *di, const struct dt_key *key) { - struct osd_device *dev = ((struct osd_otable_it *)di)->ooi_dev; - - /* od_otable_mutex: prevent curcurrent init/fini */ - cfs_mutex_lock(&dev->od_otable_mutex); - dev->od_scrub.os_paused = 1; - cfs_mutex_unlock(&dev->od_otable_mutex); + return 0; } /** - * Set the OSD layer iteration start position as the specified key. + * It is hack here: * - * The LFSCK out of OSD layer does not know the detail of the key, so if there - * are several keys, they cannot be compared out of OSD, so call "::get()" for - * each key, and OSD will select the smallest one by itself. + * Sometimes the otable-based iteration driver (LFSCK) may be blocked in OSD + * layer when someone wants to stop/pause the iteration. Under such case, we + * need some mechanism to notify the event and wakeup the blocker. */ -static int osd_otable_it_get(const struct lu_env *env, - struct dt_it *di, const struct dt_key *key) -{ - struct osd_otable_it *it = (struct osd_otable_it *)di; - struct osd_otable_cache *ooc = &it->ooi_cache; - const char *str = (const char *)key; - __u32 ino; - ENTRY; - - /* Forbid to set iteration position after iteration started. */ - if (it->ooi_user_ready) - RETURN(-EPERM); - - if (str[0] == '\0') - RETURN(-EINVAL); - - if (sscanf(str, "%u", &ino) <= 0) - RETURN(-EINVAL); - - /* Skip the one that has been processed last time. */ - if (ooc->ooc_pos_preload > ++ino) - ooc->ooc_pos_preload = ino; - - RETURN(0); -} - -static int osd_otable_it_preload(const struct lu_env *env, - struct osd_otable_it *it) +static void osd_otable_it_put(const struct lu_env *env, struct dt_it *di) { - struct osd_device *dev = it->ooi_dev; - struct osd_scrub *scrub = &dev->od_scrub; - struct osd_otable_cache *ooc = &it->ooi_cache; - int rc; - ENTRY; - - rc = osd_inode_iteration(osd_oti_get(env), dev, - OSD_OTABLE_IT_CACHE_SIZE, 1); - if (rc == SCRUB_IT_ALL) - it->ooi_all_cached = 1; + struct osd_otable_it *it = (struct osd_otable_it *)di; + struct osd_device *dev = it->ooi_dev; - CDEBUG(D_LFSCK, "OSD pre-loaded: max = %u, preload = %u, rc = %d\n", - le32_to_cpu(LDISKFS_SB(osd_sb(dev))->s_es->s_inodes_count), - ooc->ooc_pos_preload, rc); + /* od_otable_mutex: prevent curcurrent init/fini */ + mutex_lock(&dev->od_otable_mutex); + if (it->ooi_pid == cfs_curproc_pid()) { + dev->od_scrub.os_paused = 1; + } else { + struct ptlrpc_thread *thread = &dev->od_scrub.os_thread; - if (scrub->os_waiting && osd_scrub_has_window(scrub, ooc)) { - scrub->os_waiting = 0; - cfs_waitq_broadcast(&scrub->os_thread.t_ctl_waitq); + it->ooi_stopping = 1; + if (it->ooi_waiting) + cfs_waitq_broadcast(&thread->t_ctl_waitq); } - - RETURN(rc < 0 ? rc : ooc->ooc_cached_items); + mutex_unlock(&dev->od_otable_mutex); } static int osd_otable_it_next(const struct lu_env *env, struct dt_it *di) @@ -1312,13 +1836,17 @@ again: it->ooi_waiting = 1; l_wait_event(thread->t_ctl_waitq, ooc->ooc_pos_preload < scrub->os_pos_current || - !thread_is_running(thread), + !thread_is_running(thread) || + it->ooi_stopping, &lwi); it->ooi_waiting = 0; if (!thread_is_running(thread) && !it->ooi_used_outside) RETURN(1); + if (it->ooi_stopping) + RETURN(0); + rc = osd_otable_it_preload(env, it); if (rc >= 0) goto again; @@ -1329,18 +1857,13 @@ again: static struct dt_key *osd_otable_it_key(const struct lu_env *env, const struct dt_it *di) { - struct osd_otable_it *it = (struct osd_otable_it *)di; - struct osd_otable_cache *ooc = &it->ooi_cache; - - sprintf(it->ooi_key, "%u", - ooc->ooc_cache[ooc->ooc_consumer_idx].oic_lid.oii_ino); - return (struct dt_key *)it->ooi_key; + return NULL; } static int osd_otable_it_key_size(const struct lu_env *env, const struct dt_it *di) { - return sizeof(((struct osd_otable_it *)di)->ooi_key); + return sizeof(__u64); } static int osd_otable_it_rec(const struct lu_env *env, const struct dt_it *di, @@ -1350,9 +1873,33 @@ static int osd_otable_it_rec(const struct lu_env *env, const struct dt_it *di, struct osd_otable_cache *ooc = &it->ooi_cache; *(struct lu_fid *)rec = ooc->ooc_cache[ooc->ooc_consumer_idx].oic_fid; + + /* Filter out Invald FID already. */ + LASSERTF(fid_is_sane((struct lu_fid *)rec), + "Invalid FID "DFID", p_idx = %d, c_idx = %d\n", + PFID((struct lu_fid *)rec), + ooc->ooc_producer_idx, ooc->ooc_consumer_idx); + return 0; } +static __u64 osd_otable_it_store(const struct lu_env *env, + const struct dt_it *di) +{ + struct osd_otable_it *it = (struct osd_otable_it *)di; + struct osd_otable_cache *ooc = &it->ooi_cache; + __u64 hash; + + if (it->ooi_user_ready) + hash = ooc->ooc_pos_preload; + else + hash = ooc->ooc_cache[ooc->ooc_consumer_idx].oic_lid.oii_ino; + return hash; +} + +/** + * Set the OSD layer iteration start position as the specified hash. + */ static int osd_otable_it_load(const struct lu_env *env, const struct dt_it *di, __u64 hash) { @@ -1360,34 +1907,57 @@ static int osd_otable_it_load(const struct lu_env *env, struct osd_device *dev = it->ooi_dev; struct osd_otable_cache *ooc = &it->ooi_cache; struct osd_scrub *scrub = &dev->od_scrub; + int rc; + ENTRY; + /* Forbid to set iteration position after iteration started. */ if (it->ooi_user_ready) - return 0; + RETURN(-EPERM); + + if (hash > OSD_OTABLE_MAX_HASH) + hash = OSD_OTABLE_MAX_HASH; + + /* Skip the one that has been processed last time. */ + if (ooc->ooc_pos_preload > hash) + ooc->ooc_pos_preload = hash; + + if (ooc->ooc_pos_preload <= LDISKFS_FIRST_INO(osd_sb(dev))) + ooc->ooc_pos_preload = LDISKFS_FIRST_INO(osd_sb(dev)) + 1; - if (ooc->ooc_pos_preload < LDISKFS_FIRST_INO(osd_sb(dev))) - ooc->ooc_pos_preload = LDISKFS_FIRST_INO(osd_sb(dev)); it->ooi_user_ready = 1; if (!scrub->os_full_speed) cfs_waitq_broadcast(&scrub->os_thread.t_ctl_waitq); /* Unplug OSD layer iteration by the first next() call. */ - return osd_otable_it_next(env, (struct dt_it *)it); + rc = osd_otable_it_next(env, (struct dt_it *)it); + + RETURN(rc); +} + +static int osd_otable_it_key_rec(const struct lu_env *env, + const struct dt_it *di, void *key_rec) +{ + return 0; } const struct dt_index_operations osd_otable_ops = { .dio_it = { .init = osd_otable_it_init, .fini = osd_otable_it_fini, - .put = osd_otable_it_put, .get = osd_otable_it_get, + .put = osd_otable_it_put, .next = osd_otable_it_next, - .key = osd_otable_it_key, + .key = osd_otable_it_key, .key_size = osd_otable_it_key_size, .rec = osd_otable_it_rec, + .store = osd_otable_it_store, .load = osd_otable_it_load, + .key_rec = osd_otable_it_key_rec, } }; +/* high priority inconsistent items list APIs */ + int osd_oii_insert(struct osd_device *dev, struct osd_idmap_cache *oic, int insert) { @@ -1405,9 +1975,9 @@ int osd_oii_insert(struct osd_device *dev, struct osd_idmap_cache *oic, oii->oii_cache = *oic; oii->oii_insert = insert; - cfs_spin_lock(&scrub->os_lock); + spin_lock(&scrub->os_lock); if (unlikely(!thread_is_running(thread))) { - cfs_spin_unlock(&scrub->os_lock); + spin_unlock(&scrub->os_lock); OBD_FREE_PTR(oii); RETURN(-EAGAIN); } @@ -1415,7 +1985,7 @@ int osd_oii_insert(struct osd_device *dev, struct osd_idmap_cache *oic, if (cfs_list_empty(&scrub->os_inconsistent_items)) wakeup = 1; cfs_list_add_tail(&oii->oii_list, &scrub->os_inconsistent_items); - cfs_spin_unlock(&scrub->os_lock); + spin_unlock(&scrub->os_lock); if (wakeup != 0) cfs_waitq_broadcast(&thread->t_ctl_waitq); @@ -1430,19 +2000,21 @@ int osd_oii_lookup(struct osd_device *dev, const struct lu_fid *fid, struct osd_inconsistent_item *oii; ENTRY; - cfs_spin_lock(&scrub->os_lock); + spin_lock(&scrub->os_lock); cfs_list_for_each_entry(oii, &scrub->os_inconsistent_items, oii_list) { if (lu_fid_eq(fid, &oii->oii_cache.oic_fid)) { *id = oii->oii_cache.oic_lid; - cfs_spin_unlock(&scrub->os_lock); + spin_unlock(&scrub->os_lock); RETURN(0); } } - cfs_spin_unlock(&scrub->os_lock); + spin_unlock(&scrub->os_lock); RETURN(-ENOENT); } +/* OI scrub dump */ + static const char *scrub_status_names[] = { "init", "scanning", @@ -1458,6 +2030,7 @@ static const char *scrub_flags_names[] = { "recreated", "inconsistent", "auto", + "upgrade", NULL }; @@ -1538,9 +2111,9 @@ int osd_scrub_dump(struct osd_device *dev, char *buf, int len) int ret = -ENOSPC; int rc; - cfs_down_read(&scrub->os_rwsem); + down_read(&scrub->os_rwsem); rc = snprintf(buf, len, - "name: OI scrub\n" + "name: OI_scrub\n" "magic: 0x%x\n" "oi_files: %d\n" "status: %s\n", @@ -1644,6 +2217,6 @@ int osd_scrub_dump(struct osd_device *dev, char *buf, int len) ret = save - len; out: - cfs_up_read(&scrub->os_rwsem); + up_read(&scrub->os_rwsem); return ret; }