X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fosd-ldiskfs%2Fosd_handler.c;h=d6c2f86127cbfb185e9c03129ee20f04f226364f;hp=41b79fd9b3f849859c2e0971a455c8e8c9a7de43;hb=eebc3da214dfcbc01ba637f0925bfe8635b26138;hpb=bad49e39e301d4367eaead5ee566f5dcacfde8f6 diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index 41b79fd..d6c2f86 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -122,6 +122,11 @@ static const struct dt_object_operations osd_obj_otable_it_ops; static const struct dt_index_operations osd_index_iam_ops; static const struct dt_index_operations osd_index_ea_ops; +static int osd_remote_fid(const struct lu_env *env, struct osd_device *osd, + const struct lu_fid *fid); +static int osd_process_scheduled_agent_removals(const struct lu_env *env, + struct osd_device *osd); + int osd_trans_declare_op2rb[] = { [OSD_OT_ATTR_SET] = OSD_OT_ATTR_SET, [OSD_OT_PUNCH] = OSD_OT_MAX, @@ -237,6 +242,11 @@ struct inode *osd_iget(struct osd_thread_info *info, struct osd_device *dev, { struct inode *inode = NULL; + /* if we look for an inode withing a running + * transaction, then we risk to deadlock */ + /* osd_dirent_check_repair() breaks this */ + /*LASSERT(current->journal_info == NULL);*/ + inode = ldiskfs_iget(osd_sb(dev), id->oii_ino); if (IS_ERR(inode)) { CDEBUG(D_INODE, "no inode: ino = %u, rc = %ld\n", @@ -301,6 +311,130 @@ osd_iget_fid(struct osd_thread_info *info, struct osd_device *dev, return inode; } +static struct inode *osd_iget_check(struct osd_thread_info *info, + struct osd_device *dev, + const struct lu_fid *fid, + struct osd_inode_id *id, + bool cached) +{ + struct inode *inode; + int rc = 0; + ENTRY; + + /* The cached OI mapping is trustable. If we cannot locate the inode + * via the cached OI mapping, then return the failure to the caller + * directly without further OI checking. */ + + inode = ldiskfs_iget(osd_sb(dev), id->oii_ino); + if (IS_ERR(inode)) { + rc = PTR_ERR(inode); + if (cached || (rc != -ENOENT && rc != -ESTALE)) { + CDEBUG(D_INODE, "no inode: ino = %u, rc = %d\n", + id->oii_ino, rc); + + GOTO(put, rc); + } + + goto check_oi; + } + + if (is_bad_inode(inode)) { + rc = -ENOENT; + if (cached) { + CDEBUG(D_INODE, "bad inode: ino = %u\n", id->oii_ino); + + GOTO(put, rc); + } + + goto check_oi; + } + + if (id->oii_gen != OSD_OII_NOGEN && + inode->i_generation != id->oii_gen) { + rc = -ESTALE; + if (cached) { + CDEBUG(D_INODE, "unmatched inode: ino = %u, " + "oii_gen = %u, i_generation = %u\n", + id->oii_ino, id->oii_gen, inode->i_generation); + + GOTO(put, rc); + } + + goto check_oi; + } + + if (inode->i_nlink == 0) { + rc = -ENOENT; + if (cached) { + CDEBUG(D_INODE, "stale inode: ino = %u\n", id->oii_ino); + + GOTO(put, rc); + } + + goto check_oi; + } + +check_oi: + if (rc != 0) { + LASSERTF(rc == -ESTALE || rc == -ENOENT, "rc = %d\n", rc); + + rc = osd_oi_lookup(info, dev, fid, id, OI_CHECK_FLD); + /* XXX: There are four possible cases: + * 1. rc = 0. + * Backup/restore caused the OI invalid. + * 2. rc = 0. + * Someone unlinked the object but NOT removed + * the OI mapping, such as mount target device + * as ldiskfs, and modify something directly. + * 3. rc = -ENOENT. + * Someone just removed the object between the + * former oi_lookup and the iget. It is normal. + * 4. Other failure cases. + * + * Generally, when the device is mounted, it will + * auto check whether the system is restored from + * file-level backup or not. We trust such detect + * to distinguish the 1st case from the 2nd case. */ + if (rc == 0) { + if (!IS_ERR(inode) && inode->i_generation != 0 && + inode->i_generation == id->oii_gen) + /* "id->oii_gen != OSD_OII_NOGEN" is for + * "@cached == false" case. */ + rc = -ENOENT; + else + rc = -EREMCHG; + } else { + /* If the OI mapping was in OI file before the + * osd_iget_check(), but now, it is disappear, + * then it must be removed by race. That is a + * normal race case. */ + } + } else { + if (id->oii_gen == OSD_OII_NOGEN) + osd_id_gen(id, inode->i_ino, inode->i_generation); + + /* Do not update file c/mtime in ldiskfs. + * NB: we don't have any lock to protect this because we don't + * have reference on osd_object now, but contention with + * another lookup + attr_set can't happen in the tiny window + * between if (...) and set S_NOCMTIME. */ + if (!(inode->i_flags & S_NOCMTIME)) + inode->i_flags |= S_NOCMTIME; + } + + GOTO(put, rc); + +put: + if (rc != 0) { + if (!IS_ERR(inode)) + iput(inode); + + inode = ERR_PTR(rc); + } + + return inode; +} + /** * \retval +v: new filter_fid, does not contain self-fid * \retval 0: filter_fid_old, contains self-fid @@ -385,9 +519,6 @@ static int osd_check_lma(const struct lu_env *env, struct osd_object *obj) } } - if (unlikely(rc == -ENODATA)) - RETURN(0); - if (rc < 0) RETURN(rc); @@ -450,8 +581,7 @@ static int osd_fid_lookup(const struct lu_env *env, struct osd_object *obj, struct scrub_file *sf; int result; int saved = 0; - bool in_oi = false; - bool in_cache = false; + bool cached = true; bool triggered = false; ENTRY; @@ -481,7 +611,6 @@ static int osd_fid_lookup(const struct lu_env *env, struct osd_object *obj, if (lu_fid_eq(fid, &oic->oic_fid) && likely(oic->oic_dev == dev)) { id = &oic->oic_lid; - in_cache = true; goto iget; } @@ -493,10 +622,12 @@ static int osd_fid_lookup(const struct lu_env *env, struct osd_object *obj, goto iget; } + cached = false; /* Search order: 3. OI files. */ result = osd_oi_lookup(info, dev, fid, id, OI_CHECK_FLD); if (result == -ENOENT) { if (!(fid_is_norm(fid) || fid_is_igif(fid)) || + fid_is_on_ost(info, dev, fid, OI_CHECK_FLD) || !ldiskfs_test_bit(osd_oi_fid2idx(dev,fid), sf->sf_oi_bitmap)) GOTO(out, result = 0); @@ -507,81 +638,64 @@ static int osd_fid_lookup(const struct lu_env *env, struct osd_object *obj, if (result != 0) GOTO(out, result); - in_oi = true; - iget: - inode = osd_iget(info, dev, id); + inode = osd_iget_check(info, dev, fid, id, cached); if (IS_ERR(inode)) { result = PTR_ERR(inode); - if (result != -ENOENT && result != -ESTALE) - GOTO(out, result); - - if (in_cache) - fid_zero(&oic->oic_fid); - - result = osd_oi_lookup(info, dev, fid, id, OI_CHECK_FLD); - if (result != 0) - GOTO(out, result = (result == -ENOENT ? 0 : result)); + if (result == -ENOENT || result == -ESTALE) + GOTO(out, result = -ENOENT); - /* The OI mapping is there, but the inode is NOT there. - * Two possible cases for that: - * - * 1) Backup/restore caused the OI invalid. - * 2) Someone unlinked the object but NOT removed - * the OI mapping, such as mount target device - * as ldiskfs, and modify something directly. - * - * Generally, when the device is mounted, it will - * auto check whether the system is restored from - * file-level backup or not. We trust such detect - * to distinguish the 1st case from the 2nd case. */ - if (!(scrub->os_file.sf_flags & SF_INCONSISTENT)) - GOTO(out, result = 0); + if (result == -EREMCHG) { trigger: - if (unlikely(triggered)) - GOTO(out, result = saved); - - triggered = true; - if (thread_is_running(&scrub->os_thread)) { - result = -EINPROGRESS; - } else if (!dev->od_noscrub) { - /* Since we do not know the right OI mapping, we have - * to trigger OI scrub to scan the whole device. */ - result = osd_scrub_start(dev, SS_AUTO_FULL | - SS_CLEAR_DRYRUN | SS_CLEAR_FAILOUT); - CDEBUG(D_LFSCK | D_CONSOLE, "%.16s: trigger OI " - "scrub by RPC for "DFID", rc = %d [1]\n", - osd_name(dev), PFID(fid), result); - if (result == 0 || result == -EALREADY) + if (unlikely(triggered)) + GOTO(out, result = saved); + + triggered = true; + if (thread_is_running(&scrub->os_thread)) { result = -EINPROGRESS; - else - result = -EREMCHG; - } + } else if (!dev->od_noscrub) { + result = osd_scrub_start(dev, SS_AUTO_FULL | + SS_CLEAR_DRYRUN | SS_CLEAR_FAILOUT); + LCONSOLE_WARN("%.16s: trigger OI scrub by RPC " + "for "DFID", rc = %d [1]\n", + osd_name(dev), PFID(fid), result); + if (result == 0 || result == -EALREADY) + result = -EINPROGRESS; + else + result = -EREMCHG; + } - /* We still have chance to get the valid inode: for the - * object which is referenced by remote name entry, the - * object on the local MDT will be linked under the dir - * of "/REMOTE_PARENT_DIR" with its FID string as name. - * - * We do not know whether the object for the given FID - * is referenced by some remote name entry or not, and - * especially for DNE II, a multiple-linked object may - * have many name entries reside on many MDTs. - * - * To simplify the operation, OSD will not distinguish - * more, just lookup "/REMOTE_PARENT_DIR". Usually, it - * only happened for the RPC from other MDT during the - * OI scrub, or for the client side RPC with FID only, - * such as FID to path, or from old connected client. */ - saved = result; - result = osd_lookup_in_remote_parent(info, dev, fid, id); - if (result == 0) { - in_oi = false; - goto iget; + if (fid_is_on_ost(info, dev, fid, OI_CHECK_FLD)) + GOTO(out, result); + + /* We still have chance to get the valid inode: for the + * object which is referenced by remote name entry, the + * object on the local MDT will be linked under the dir + * of "/REMOTE_PARENT_DIR" with its FID string as name. + * + * We do not know whether the object for the given FID + * is referenced by some remote name entry or not, and + * especially for DNE II, a multiple-linked object may + * have many name entries reside on many MDTs. + * + * To simplify the operation, OSD will not distinguish + * more, just lookup "/REMOTE_PARENT_DIR". Usually, it + * only happened for the RPC from other MDT during the + * OI scrub, or for the client side RPC with FID only, + * such as FID to path, or from old connected client. */ + saved = result; + result = osd_lookup_in_remote_parent(info, dev, + fid, id); + if (result == 0) { + cached = true; + goto iget; + } + + result = saved; } - GOTO(out, result = saved); + GOTO(out, result); } obj->oo_inode = inode; @@ -589,34 +703,66 @@ trigger: result = osd_check_lma(env, obj); if (result != 0) { + if (result == -ENODATA) { + if (cached) { + result = osd_oi_lookup(info, dev, fid, id, + OI_CHECK_FLD); + if (result != 0) { + /* result == -ENOENT means that the OI + * mapping has been removed by race, + * the target inode belongs to other + * object. + * + * Others error also can be returned + * directly. */ + iput(inode); + obj->oo_inode = NULL; + GOTO(out, result); + } else { + /* result == 0 means the cached OI + * mapping is still in the OI file, + * the target the inode is valid. */ + } + } else { + /* The current OI mapping is from the OI file, + * since the inode has been found via + * osd_iget_check(), no need recheck OI. */ + } + + goto found; + } + iput(inode); obj->oo_inode = NULL; - if (result != -EREMCHG) GOTO(out, result); - if (in_cache) - fid_zero(&oic->oic_fid); - - result = osd_oi_lookup(info, dev, fid, id, OI_CHECK_FLD); - if (result == 0) - goto trigger; - - if (result != -ENOENT) - GOTO(out, result); + if (cached) { + result = osd_oi_lookup(info, dev, fid, id, + OI_CHECK_FLD); + /* result == -ENOENT means the cached OI mapping + * has been removed from the OI file by race, + * above target inode belongs to other object. + * + * Others error also can be returned directly. */ + if (result != 0) + GOTO(out, result); - if (!in_oi && (fid_is_norm(fid) || fid_is_igif(fid)) && - ldiskfs_test_bit(osd_oi_fid2idx(dev, fid), - sf->sf_oi_bitmap)) - goto trigger; + /* result == 0, goto trigger */ + } else { + /* The current OI mapping is from the OI file, + * since the inode has been found via + * osd_iget_check(), no need recheck OI. */ + } - GOTO(out, result = 0); + goto trigger; } +found: obj->oo_compat_dot_created = 1; obj->oo_compat_dotdot_created = 1; - if (!S_ISDIR(inode->i_mode) || !ldiskfs_pdo) /* done */ + if (!S_ISDIR(inode->i_mode) || !ldiskfs_pdo) /* done */ GOTO(out, result = 0); LASSERT(obj->oo_hl_head == NULL); @@ -629,6 +775,9 @@ trigger: GOTO(out, result = 0); out: + if (result != 0 && cached) + fid_zero(&oic->oic_fid); + LINVRNT(osd_invariant(obj)); return result; } @@ -1052,7 +1201,7 @@ static void osd_trans_stop_cb(struct osd_thandle *oth, int result) static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt, struct thandle *th) { - int rc = 0; + int rc = 0, remove_agents = 0; struct osd_thandle *oh; struct osd_thread_info *oti = osd_oti_get(env); struct osd_iobuf *iobuf = &oti->oti_iobuf; @@ -1063,6 +1212,8 @@ static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt, oh = container_of0(th, struct osd_thandle, ot_super); + remove_agents = oh->ot_remove_agents; + qtrans = oh->ot_quota_trans; oh->ot_quota_trans = NULL; @@ -1117,6 +1268,9 @@ static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt, if (!rc) rc = iobuf->dr_error; + if (unlikely(remove_agents != 0)) + osd_process_scheduled_agent_removals(env, osd); + RETURN(rc); } @@ -2283,9 +2437,11 @@ static int osd_object_create(const struct lu_env *env, struct dt_object *dt, RETURN(-EPERM); result = __osd_object_create(info, obj, attr, hint, dof, th); - if (result == 0) + if (result == 0) { result = __osd_oi_insert(env, obj, fid, th); - + if (obj->oo_dt.do_body_ops == &osd_body_ops_new) + obj->oo_dt.do_body_ops = &osd_body_ops; + } LASSERT(ergo(result == 0, dt_object_exists(dt) && !dt_object_remote(dt))); @@ -2556,33 +2712,86 @@ static struct inode *osd_create_local_agent_inode(const struct lu_env *env, } /** - * Delete local agent inode for remote entry + * when direntry is deleted, we have to take care of possible agent inode + * referenced by that. unfortunately we can't do this at that point: + * iget() within a running transaction leads to deadlock and we better do + * not call that every delete declaration to save performance. so we put + * a potention agent inode on a list and process that once the transaction + * is over. Notice it's not any worse in terms of real orphans as regular + * object destroy doesn't put inodes on the on-disk orphan list. this should + * be addressed separately */ -static int osd_delete_local_agent_inode(const struct lu_env *env, - struct osd_device *osd, - const struct lu_fid *fid, - __u32 ino, struct osd_thandle *oh) +static int osd_schedule_agent_inode_removal(const struct lu_env *env, + struct osd_thandle *oh, + __u32 ino) { - struct osd_thread_info *oti = osd_oti_get(env); - struct osd_inode_id *id = &oti->oti_id; - struct inode *inode; - ENTRY; + struct osd_device *osd = osd_dt_dev(oh->ot_super.th_dev); + struct osd_obj_orphan *oor; - id->oii_ino = le32_to_cpu(ino); - id->oii_gen = OSD_OII_NOGEN; - inode = osd_iget(oti, osd, id); - if (IS_ERR(inode)) { - CERROR("%s: iget error "DFID" id %u:%u\n", osd_name(osd), - PFID(fid), id->oii_ino, id->oii_gen); - RETURN(PTR_ERR(inode)); + OBD_ALLOC_PTR(oor); + if (oor == NULL) + return -ENOMEM; + + oor->oor_ino = ino; + oor->oor_env = (struct lu_env *)env; + spin_lock(&osd->od_osfs_lock); + list_add(&oor->oor_list, &osd->od_orphan_list); + spin_unlock(&osd->od_osfs_lock); + + oh->ot_remove_agents = 1; + + return 0; + +} + +static int osd_process_scheduled_agent_removals(const struct lu_env *env, + struct osd_device *osd) +{ + struct osd_thread_info *info = osd_oti_get(env); + struct osd_obj_orphan *oor, *tmp; + struct osd_inode_id id; + struct list_head list; + struct inode *inode; + struct lu_fid fid; + handle_t *jh; + __u32 ino; + + INIT_LIST_HEAD(&list); + + spin_lock(&osd->od_osfs_lock); + list_for_each_entry_safe(oor, tmp, &osd->od_orphan_list, oor_list) { + if (oor->oor_env == env) { + list_del(&oor->oor_list); + list_add(&oor->oor_list, &list); + } } + spin_unlock(&osd->od_osfs_lock); - clear_nlink(inode); - mark_inode_dirty(inode); - CDEBUG(D_INODE, "%s: delete remote inode "DFID" %lu\n", - osd_name(osd), PFID(fid), inode->i_ino); - iput(inode); - RETURN(0); + list_for_each_entry_safe(oor, tmp, &list, oor_list) { + + ino = oor->oor_ino; + + list_del(&oor->oor_list); + OBD_FREE_PTR(oor); + + osd_id_gen(&id, ino, OSD_OII_NOGEN); + inode = osd_iget_fid(info, osd, &id, &fid); + if (IS_ERR(inode)) + continue; + + if (!osd_remote_fid(env, osd, &fid)) { + iput(inode); + continue; + } + + jh = osd_journal_start_sb(osd_sb(osd), LDISKFS_HT_MISC, 1); + clear_nlink(inode); + mark_inode_dirty(inode); + ldiskfs_journal_stop(jh); + iput(inode); + } + + return 0; } /** @@ -2636,6 +2845,8 @@ static int osd_object_ea_create(const struct lu_env *env, struct dt_object *dt, fid, OI_CHECK_FLD) ? LMAC_FID_ON_OST : 0, 0); } + if (obj->oo_dt.do_body_ops == &osd_body_ops_new) + obj->oo_dt.do_body_ops = &osd_body_ops; } if (result == 0) @@ -3283,8 +3494,9 @@ static int osd_index_declare_iam_delete(const struct lu_env *env, oh = container_of0(handle, struct osd_thandle, ot_super); LASSERT(oh->ot_handle == NULL); + /* Recycle may cause additional three blocks to be changed. */ osd_trans_declare_op(env, oh, OSD_OT_DELETE, - osd_dto_credits_noquota[DTO_INDEX_DELETE]); + osd_dto_credits_noquota[DTO_INDEX_DELETE] + 3); return 0; } @@ -3462,8 +3674,6 @@ static int osd_index_ea_delete(const struct lu_env *env, struct dt_object *dt, bh = osd_ldiskfs_find_entry(dir, &dentry->d_name, &de, NULL, hlock); if (bh) { - __u32 ino = 0; - /* If this is not the ".." entry, it might be a remote DNE * entry and we need to check if the FID is for a remote * MDT. If the FID is not in the directory entry (e.g. @@ -3481,56 +3691,19 @@ static int osd_index_ea_delete(const struct lu_env *env, struct dt_object *dt, if (strcmp((char *)key, dotdot) != 0) { LASSERT(de != NULL); rc = osd_get_fid_from_dentry(de, (struct dt_rec *)fid); - /* If Fid is not in dentry, try to get it from LMA */ if (rc == -ENODATA) { - struct osd_inode_id *id; - struct inode *inode; - - /* Before trying to get fid from the inode, - * check whether the inode is valid. - * - * If the inode has been deleted, do not go - * ahead to do osd_ea_fid_get, which will set - * the inode to bad inode, which might cause - * the inode to be deleted uncorrectly */ - inode = ldiskfs_iget(osd_sb(osd), - le32_to_cpu(de->inode)); - if (IS_ERR(inode)) { - CDEBUG(D_INODE, "%s: "DFID"get inode" - "error.\n", osd_name(osd), - PFID(fid)); - rc = PTR_ERR(inode); - } else { - if (likely(inode->i_nlink != 0)) { - id = &osd_oti_get(env)->oti_id; - rc = osd_ea_fid_get(env, obj, - le32_to_cpu(de->inode), - fid, id); - } else { - CDEBUG(D_INFO, "%s: %u "DFID - "deleted.\n", - osd_name(osd), - le32_to_cpu(de->inode), - PFID(fid)); - rc = -ESTALE; - } - iput(inode); - } + /* can't get FID, postpone to the end of the + * transaction when iget() is safe */ + osd_schedule_agent_inode_removal(env, oh, + le32_to_cpu(de->inode)); + } else if (rc == 0 && + unlikely(osd_remote_fid(env, osd, fid))) { + osd_schedule_agent_inode_removal(env, oh, + le32_to_cpu(de->inode)); } - if (rc == 0 && - unlikely(osd_remote_fid(env, osd, fid))) - /* Need to delete agent inode */ - ino = le32_to_cpu(de->inode); } rc = ldiskfs_delete_entry(oh->ot_handle, dir, de, bh); brelse(bh); - if (rc == 0 && unlikely(ino != 0)) { - rc = osd_delete_local_agent_inode(env, osd, fid, ino, - oh); - if (rc != 0) - CERROR("%s: del local inode "DFID": rc = %d\n", - osd_name(osd), PFID(fid), rc); - } } else { rc = -ENOENT; } @@ -4959,62 +5132,44 @@ static int osd_it_ea_key_size(const struct lu_env *env, const struct dt_it *di) return it->oie_dirent->oied_namelen; } -static int -osd_dirent_update(handle_t *jh, struct super_block *sb, - struct osd_it_ea_dirent *ent, struct lu_fid *fid, - struct buffer_head *bh, struct ldiskfs_dir_entry_2 *de) +static inline bool +osd_dot_dotdot_has_space(struct ldiskfs_dir_entry_2 *de, int dot_dotdot) { - struct osd_fid_pack *rec; - int rc; - ENTRY; - - LASSERT(de->file_type & LDISKFS_DIRENT_LUFID); - LASSERT(de->rec_len >= de->name_len + sizeof(struct osd_fid_pack)); - - rc = ldiskfs_journal_get_write_access(jh, bh); - if (rc != 0) - RETURN(rc); + LASSERTF(dot_dotdot == 1 || dot_dotdot == 2, + "dot_dotdot = %d\n", dot_dotdot); - rec = (struct osd_fid_pack *)(de->name + de->name_len + 1); - fid_cpu_to_be((struct lu_fid *)rec->fp_area, fid); - rc = ldiskfs_handle_dirty_metadata(jh, NULL, bh); + if (LDISKFS_DIR_REC_LEN(de) >= + __LDISKFS_DIR_REC_LEN(dot_dotdot + 1 + sizeof(struct osd_fid_pack))) + return true; - RETURN(rc); + return false; } -static inline int -osd_dirent_has_space(__u16 reclen, __u16 namelen, unsigned blocksize) +static inline bool +osd_dirent_has_space(struct ldiskfs_dir_entry_2 *de, __u16 namelen, + unsigned blocksize, int dot_dotdot) { - if (ldiskfs_rec_len_from_disk(reclen, blocksize) >= - __LDISKFS_DIR_REC_LEN(namelen + 1 + sizeof(struct osd_fid_pack))) - return 1; - else - return 0; -} + if (dot_dotdot > 0) + return osd_dot_dotdot_has_space(de, dot_dotdot); -static inline int -osd_dot_dotdot_has_space(struct ldiskfs_dir_entry_2 *de, int dot_dotdot) -{ - LASSERTF(dot_dotdot == 1 || dot_dotdot == 2, - "dot_dotdot = %d\n", dot_dotdot); + if (ldiskfs_rec_len_from_disk(de->rec_len, blocksize) >= + __LDISKFS_DIR_REC_LEN(namelen + 1 + sizeof(struct osd_fid_pack))) + return true; - if (LDISKFS_DIR_REC_LEN(de) >= - __LDISKFS_DIR_REC_LEN(dot_dotdot + 1 + sizeof(struct osd_fid_pack))) - return 1; - else - return 0; + return false; } static int osd_dirent_reinsert(const struct lu_env *env, handle_t *jh, - struct inode *dir, struct inode *inode, - struct osd_it_ea_dirent *ent, struct lu_fid *fid, + struct dentry *dentry, const struct lu_fid *fid, struct buffer_head *bh, struct ldiskfs_dir_entry_2 *de, - struct htree_lock *hlock) + struct htree_lock *hlock, int dot_dotdot) { - struct dentry *dentry; + struct inode *dir = dentry->d_parent->d_inode; + struct inode *inode = dentry->d_inode; struct osd_fid_pack *rec; struct ldiskfs_dentry_param *ldp; + int namelen = dentry->d_name.len; int rc; ENTRY; @@ -5023,29 +5178,28 @@ osd_dirent_reinsert(const struct lu_env *env, handle_t *jh, RETURN(0); /* There is enough space to hold the FID-in-dirent. */ - if (osd_dirent_has_space(de->rec_len, ent->oied_namelen, - dir->i_sb->s_blocksize)) { + if (osd_dirent_has_space(de, namelen, dir->i_sb->s_blocksize, + dot_dotdot)) { rc = ldiskfs_journal_get_write_access(jh, bh); if (rc != 0) RETURN(rc); - de->name[de->name_len] = 0; - rec = (struct osd_fid_pack *)(de->name + de->name_len + 1); + de->name[namelen] = 0; + rec = (struct osd_fid_pack *)(de->name + namelen + 1); rec->fp_len = sizeof(struct lu_fid) + 1; fid_cpu_to_be((struct lu_fid *)rec->fp_area, fid); de->file_type |= LDISKFS_DIRENT_LUFID; - rc = ldiskfs_handle_dirty_metadata(jh, NULL, bh); RETURN(rc); } + LASSERTF(dot_dotdot == 0, "dot_dotdot = %d\n", dot_dotdot); + rc = ldiskfs_delete_entry(jh, dir, de, bh); if (rc != 0) RETURN(rc); - dentry = osd_child_dentry_by_inode(env, dir, ent->oied_name, - ent->oied_namelen); ldp = (struct ldiskfs_dentry_param *)osd_oti_get(env)->oti_ldp; osd_get_ldiskfs_dirent_param(ldp, fid); dentry->d_fsdata = (void *)ldp; @@ -5057,8 +5211,8 @@ osd_dirent_reinsert(const struct lu_env *env, handle_t *jh, CDEBUG(D_LFSCK, "%.16s: fail to reinsert the dirent, " "dir = %lu/%u, name = %.*s, "DFID": rc = %d\n", LDISKFS_SB(inode->i_sb)->s_es->s_volume_name, - dir->i_ino, dir->i_generation, - ent->oied_namelen, ent->oied_name, PFID(fid), rc); + dir->i_ino, dir->i_generation, namelen, + dentry->d_name.name, PFID(fid), rc); RETURN(rc); } @@ -5088,6 +5242,32 @@ osd_dirent_check_repair(const struct lu_env *env, struct osd_object *obj, bool dirty = false; ENTRY; + osd_id_gen(id, ent->oied_ino, OSD_OII_NOGEN); + inode = osd_iget(info, dev, id); + if (IS_ERR(inode)) { + rc = PTR_ERR(inode); + if (rc == -ENOENT || rc == -ESTALE) { + *attr |= LUDA_UNKNOWN; + rc = 0; + } else { + CDEBUG(D_LFSCK, "%.16s: fail to iget for dirent " + "check_repair, dir = %lu/%u, name = %.*s: " + "rc = %d\n", + devname, dir->i_ino, dir->i_generation, + ent->oied_namelen, ent->oied_name, rc); + } + + RETURN(rc); + } + + dentry = osd_child_dentry_by_inode(env, dir, ent->oied_name, + ent->oied_namelen); + rc = osd_get_lma(info, inode, dentry, lma); + if (rc == -ENODATA) + lma = NULL; + else if (rc != 0) + GOTO(out, rc); + if (ent->oied_name[0] == '.') { if (ent->oied_namelen == 1) dot_dotdot = 1; @@ -5095,9 +5275,6 @@ osd_dirent_check_repair(const struct lu_env *env, struct osd_object *obj, dot_dotdot = 2; } - dentry = osd_child_dentry_get(env, obj, ent->oied_name, - ent->oied_namelen); - /* We need to ensure that the name entry is still valid. * Because it may be removed or renamed by other already. * @@ -5114,8 +5291,9 @@ osd_dirent_check_repair(const struct lu_env *env, struct osd_object *obj, credits = osd_dto_credits_noquota[DTO_INDEX_DELETE] + osd_dto_credits_noquota[DTO_INDEX_INSERT] + 1 + 1 + 2; -again: if (dev->od_dirent_journal != 0) { + +again: jh = osd_journal_start_sb(sb, LDISKFS_HT_MISC, credits); if (IS_ERR(jh)) { rc = PTR_ERR(jh); @@ -5124,7 +5302,8 @@ again: "name = %.*s: rc = %d\n", devname, dir->i_ino, dir->i_generation, credits, ent->oied_namelen, ent->oied_name, rc); - RETURN(rc); + + GOTO(out_inode, rc); } if (obj->oo_hl_head != NULL) { @@ -5154,58 +5333,64 @@ again: * For the whole directory, only dot/dotdot entry have no FID-in-dirent * and needs to get FID from LMA when readdir, it will not affect the * performance much. */ - if ((bh == NULL) || (le32_to_cpu(de->inode) != ent->oied_ino) || + if ((bh == NULL) || (le32_to_cpu(de->inode) != inode->i_ino) || (dot_dotdot != 0 && !osd_dot_dotdot_has_space(de, dot_dotdot))) { *attr |= LUDA_IGNORE; - GOTO(out_journal, rc = 0); - } - osd_id_gen(id, ent->oied_ino, OSD_OII_NOGEN); - inode = osd_iget(info, dev, id); - if (IS_ERR(inode)) { - rc = PTR_ERR(inode); - if (rc == -ENOENT || rc == -ESTALE) - rc = 1; - else - CDEBUG(D_LFSCK, "%.16s: fail to iget for dirent " - "check_repair, dir = %lu/%u, name = %.*s: " - "rc = %d\n", - devname, dir->i_ino, dir->i_generation, - ent->oied_namelen, ent->oied_name, rc); - - GOTO(out_journal, rc); + GOTO(out, rc = 0); } - rc = osd_get_lma(info, inode, &info->oti_obj_dentry, lma); - if (rc == 0) { - LASSERT(!(lma->lma_compat & LMAC_NOT_IN_OI)); + if (lma != NULL) { + if (unlikely(lma->lma_compat & LMAC_NOT_IN_OI)) { + struct lu_fid *tfid = &lma->lma_self_fid; + + *attr |= LUDA_IGNORE; + /* It must be REMOTE_PARENT_DIR and as the + * dotdot entry of remote directory */ + if (unlikely(dot_dotdot != 2 || + fid_seq(tfid) != FID_SEQ_LOCAL_FILE || + fid_oid(tfid) != REMOTE_PARENT_DIR_OID)) { + CDEBUG(D_LFSCK, "%.16s: expect remote agent " + "parent directory, but got %.*s under " + "dir = %lu/%u with the FID "DFID"\n", + devname, ent->oied_namelen, + ent->oied_name, dir->i_ino, + dir->i_generation, PFID(tfid)); + + GOTO(out, rc = -EIO); + } + + GOTO(out, rc = 0); + } if (fid_is_sane(fid)) { /* FID-in-dirent is valid. */ if (lu_fid_eq(fid, &lma->lma_self_fid)) - GOTO(out_inode, rc = 0); + GOTO(out, rc = 0); /* Do not repair under dryrun mode. */ if (*attr & LUDA_VERIFY_DRYRUN) { *attr |= LUDA_REPAIR; - GOTO(out_inode, rc = 0); + + GOTO(out, rc = 0); } - if (dev->od_dirent_journal == 0) { - iput(inode); + if (jh == NULL) { brelse(bh); if (hlock != NULL) ldiskfs_htree_unlock(hlock); else up_read(&obj->oo_ext_idx_sem); dev->od_dirent_journal = 1; + goto again; } *fid = lma->lma_self_fid; dirty = true; /* Update the FID-in-dirent. */ - rc = osd_dirent_update(jh, sb, ent, fid, bh, de); + rc = osd_dirent_reinsert(env, jh, dentry, fid, bh, de, + hlock, dot_dotdot); if (rc == 0) *attr |= LUDA_REPAIR; else @@ -5220,25 +5405,26 @@ again: if (*attr & LUDA_VERIFY_DRYRUN) { *fid = lma->lma_self_fid; *attr |= LUDA_REPAIR; - GOTO(out_inode, rc = 0); + + GOTO(out, rc = 0); } - if (dev->od_dirent_journal == 0) { - iput(inode); + if (jh == NULL) { brelse(bh); if (hlock != NULL) ldiskfs_htree_unlock(hlock); else up_read(&obj->oo_ext_idx_sem); dev->od_dirent_journal = 1; + goto again; } *fid = lma->lma_self_fid; dirty = true; /* Append the FID-in-dirent. */ - rc = osd_dirent_reinsert(env, jh, dir, inode, ent, - fid, bh, de, hlock); + rc = osd_dirent_reinsert(env, jh, dentry, fid, bh, de, + hlock, dot_dotdot); if (rc == 0) *attr |= LUDA_REPAIR; else @@ -5249,7 +5435,7 @@ again: ent->oied_namelen, ent->oied_name, PFID(fid), rc); } - } else if (rc == -ENODATA) { + } else { /* Do not repair under dryrun mode. */ if (*attr & LUDA_VERIFY_DRYRUN) { if (fid_is_sane(fid)) { @@ -5259,17 +5445,18 @@ again: inode->i_generation); *attr |= LUDA_UPGRADE; } - GOTO(out_inode, rc = 0); + + GOTO(out, rc = 0); } - if (dev->od_dirent_journal == 0) { - iput(inode); + if (jh == NULL) { brelse(bh); if (hlock != NULL) ldiskfs_htree_unlock(hlock); else up_read(&obj->oo_ext_idx_sem); dev->od_dirent_journal = 1; + goto again; } @@ -5291,8 +5478,8 @@ again: lu_igif_build(fid, inode->i_ino, inode->i_generation); /* It is probably IGIF object. Only aappend the * FID-in-dirent. OI scrub will process FID-in-LMA. */ - rc = osd_dirent_reinsert(env, jh, dir, inode, ent, - fid, bh, de, hlock); + rc = osd_dirent_reinsert(env, jh, dentry, fid, bh, de, + hlock, dot_dotdot); if (rc == 0) *attr |= LUDA_UPGRADE; else @@ -5305,12 +5492,9 @@ again: } } - GOTO(out_inode, rc); - -out_inode: - iput(inode); + GOTO(out, rc); -out_journal: +out: brelse(bh); if (hlock != NULL) { ldiskfs_htree_unlock(hlock); @@ -5320,10 +5504,15 @@ out_journal: else up_read(&obj->oo_ext_idx_sem); } + if (jh != NULL) ldiskfs_journal_stop(jh); + +out_inode: + iput(inode); if (rc >= 0 && !dirty) dev->od_dirent_journal = 0; + return rc; } @@ -5369,6 +5558,11 @@ static inline int osd_it_ea_rec(const struct lu_env *env, rc = osd_dirent_check_repair(env, obj, it, fid, id, &attr); } + + if (!fid_is_sane(fid)) { + attr &= ~LUDA_IGNORE; + attr |= LUDA_UNKNOWN; + } } else { attr &= ~LU_DIRENT_ATTRS_MASK; if (!fid_is_sane(fid)) { @@ -5406,7 +5600,7 @@ static inline int osd_it_ea_rec(const struct lu_env *env, if (osd_remote_fid(env, dev, fid)) RETURN(0); - if (likely(!(attr & LUDA_IGNORE) && rc == 0)) + if (likely(!(attr & (LUDA_IGNORE | LUDA_UNKNOWN)) && rc == 0)) osd_add_oi_cache(oti, dev, id, fid); RETURN(rc > 0 ? 0 : rc); @@ -5875,6 +6069,7 @@ static int osd_device_init0(const struct lu_env *env, spin_lock_init(&o->od_osfs_lock); mutex_init(&o->od_otable_mutex); + INIT_LIST_HEAD(&o->od_orphan_list); o->od_read_cache = 1; o->od_writethrough_cache = 1; @@ -6201,6 +6396,7 @@ static int __init osd_mod_init(void) { int rc; + LASSERT(BH_DXLock < sizeof(((struct buffer_head *)0)->b_state) * 8); #if !defined(CONFIG_DEBUG_MUTEXES) && !defined(CONFIG_DEBUG_SPINLOCK) /* please, try to keep osd_thread_info smaller than a page */ CLASSERT(sizeof(struct osd_thread_info) <= PAGE_SIZE); @@ -6226,7 +6422,7 @@ static void __exit osd_mod_exit(void) lu_kmem_fini(ldiskfs_caches); } -MODULE_AUTHOR("Sun Microsystems, Inc. "); +MODULE_AUTHOR("OpenSFS, Inc. "); MODULE_DESCRIPTION("Lustre Object Storage Device ("LUSTRE_OSD_LDISKFS_NAME")"); MODULE_VERSION(LUSTRE_VERSION_STRING); MODULE_LICENSE("GPL");