X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fosd-ldiskfs%2Fosd_handler.c;h=bc52d58d3ef89cc041eb44e4a95a9d7522a41cd6;hb=ef1b815d77ae717f3ee701e2392fd3fe6c71906d;hp=2c4ec7ff3d819b3a8972ffb37092f572c06a43b8;hpb=4e7541ab2328da4d57f60b3b4d6514990f996858;p=fs%2Flustre-release.git diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index 2c4ec7f..bc52d58 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -27,7 +27,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2012, Intel Corporation. + * Copyright (c) 2011, 2013, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -73,6 +73,10 @@ int ldiskfs_pdo = 1; CFS_MODULE_PARM(ldiskfs_pdo, "i", int, 0644, "ldiskfs with parallel directory operations"); +int ldiskfs_track_declares_assert; +CFS_MODULE_PARM(ldiskfs_track_declares_assert, "i", int, 0644, + "LBUG during tracking of declares"); + static const char dot[] = "."; static const char dotdot[] = ".."; static const char remote_obj_dir[] = "REM_OBJ_DIR"; @@ -84,7 +88,6 @@ static const struct dt_object_operations osd_obj_otable_it_ops; static const struct dt_index_operations osd_index_iam_ops; static const struct dt_index_operations osd_index_ea_ops; -#ifdef OSD_TRACK_DECLARES int osd_trans_declare_op2rb[] = { [OSD_OT_ATTR_SET] = OSD_OT_ATTR_SET, [OSD_OT_PUNCH] = OSD_OT_MAX, @@ -98,7 +101,6 @@ int osd_trans_declare_op2rb[] = { [OSD_OT_DELETE] = OSD_OT_INSERT, [OSD_OT_QUOTA] = OSD_OT_MAX, }; -#endif static int osd_has_index(const struct osd_object *obj) { @@ -172,6 +174,7 @@ static inline int __osd_xattr_get(struct inode *inode, struct dentry *dentry, const char *name, void *buf, int len) { dentry->d_inode = inode; + dentry->d_sb = inode->i_sb; return inode->i_op->getxattr(dentry, name, buf, len); } @@ -180,28 +183,22 @@ int osd_get_lma(struct osd_thread_info *info, struct inode *inode, { int rc; - rc = __osd_xattr_get(inode, dentry, XATTR_NAME_LMA, (void *)lma, - sizeof(*lma)); - if (rc == -ERANGE) { - /* try with old lma size */ - rc = inode->i_op->getxattr(dentry, XATTR_NAME_LMA, - info->oti_mdt_attrs_old, - LMA_OLD_SIZE); - if (rc > 0) - memcpy(lma, info->oti_mdt_attrs_old, sizeof(*lma)); - } + CLASSERT(LMA_OLD_SIZE >= sizeof(*lma)); + rc = __osd_xattr_get(inode, dentry, XATTR_NAME_LMA, + info->oti_mdt_attrs_old, LMA_OLD_SIZE); if (rc > 0) { + if ((void *)lma != (void *)info->oti_mdt_attrs_old) + memcpy(lma, info->oti_mdt_attrs_old, sizeof(*lma)); + rc = 0; + lustre_lma_swab(lma); /* Check LMA compatibility */ - if (lma->lma_incompat & ~cpu_to_le32(LMA_INCOMPAT_SUPP)) { - CWARN("%.16s: unsupported incompat LMA feature(s) " - "%lx/%#x\n", + if (lma->lma_incompat & ~LMA_INCOMPAT_SUPP) { + CWARN("%.16s: unsupported incompat LMA feature(s) %#x " + "for fid = "DFID", ino = %lu\n", LDISKFS_SB(inode->i_sb)->s_es->s_volume_name, - inode->i_ino, le32_to_cpu(lma->lma_incompat) & - ~LMA_INCOMPAT_SUPP); - rc = -ENOSYS; - } else { - lustre_lma_swab(lma); - rc = 0; + lma->lma_incompat & ~LMA_INCOMPAT_SUPP, + PFID(&lma->lma_self_fid), inode->i_ino); + rc = -EOPNOTSUPP; } } else if (rc == 0) { rc = -ENODATA; @@ -450,6 +447,50 @@ static void osd_object_init0(struct osd_object *obj) (LOHA_EXISTS | (obj->oo_inode->i_mode & S_IFMT)); } +static int osd_check_lma(const struct lu_env *env, struct osd_object *obj) +{ + struct osd_thread_info *info = osd_oti_get(env); + struct lustre_mdt_attrs *lma = &info->oti_mdt_attrs; + int rc; + ENTRY; + + CLASSERT(LMA_OLD_SIZE >= sizeof(*lma)); + rc = __osd_xattr_get(obj->oo_inode, &info->oti_obj_dentry, + XATTR_NAME_LMA, info->oti_mdt_attrs_old, + LMA_OLD_SIZE); + if (rc > 0) { + rc = 0; + lustre_lma_swab(lma); + if (unlikely((lma->lma_incompat & ~LMA_INCOMPAT_SUPP) || + CFS_FAIL_CHECK(OBD_FAIL_OSD_LMA_INCOMPAT))) { + rc = -EOPNOTSUPP; + CWARN("%s: unsupported incompat LMA feature(s) %#x for " + "fid = "DFID", ino = %lu: rc = %d\n", + osd_obj2dev(obj)->od_svname, + lma->lma_incompat & ~LMA_INCOMPAT_SUPP, + PFID(lu_object_fid(&obj->oo_dt.do_lu)), + obj->oo_inode->i_ino, rc); + } + if (unlikely(!lu_fid_eq(lu_object_fid(&obj->oo_dt.do_lu), + &lma->lma_self_fid))) { + CDEBUG(D_INODE, "%s: FID "DFID" != self_fid "DFID"\n", + osd_obj2dev(obj)->od_svname, + PFID(lu_object_fid(&obj->oo_dt.do_lu)), + PFID(&lma->lma_self_fid)); + if (obj->oo_inode != NULL) { + iput(obj->oo_inode); + obj->oo_inode = NULL; + } + rc = -ESTALE; + } + } else if (rc == -ENODATA) { + /* haven't initialize LMA xattr */ + rc = 0; + } + + RETURN(rc); +} + /* * Concurrency: no concurrent access is possible that early in object * life-cycle. @@ -470,8 +511,13 @@ static int osd_object_init(const struct lu_env *env, struct lu_object *l, result = osd_fid_lookup(env, obj, lu_object_fid(l), conf); obj->oo_dt.do_body_ops = &osd_body_ops_new; - if (result == 0 && obj->oo_inode != NULL) + if (result == 0 && obj->oo_inode != NULL) { + result = osd_check_lma(env, obj); + if (result != 0) + return result; + osd_object_init0(obj); + } LINVRNT(osd_invariant(obj)); return result; @@ -680,9 +726,12 @@ static struct thandle *osd_trans_create(const struct lu_env *env, CFS_INIT_LIST_HEAD(&oh->ot_dcb_list); osd_th_alloced(oh); - memset(oti->oti_declare_ops, 0, OSD_OT_MAX); - memset(oti->oti_declare_ops_rb, 0, OSD_OT_MAX); - memset(oti->oti_declare_ops_cred, 0, OSD_OT_MAX); + memset(oti->oti_declare_ops, 0, + sizeof(oti->oti_declare_ops)); + memset(oti->oti_declare_ops_rb, 0, + sizeof(oti->oti_declare_ops_rb)); + memset(oti->oti_declare_ops_cred, 0, + sizeof(oti->oti_declare_ops_cred)); oti->oti_rollback = false; } RETURN(th); @@ -720,7 +769,6 @@ int osd_trans_start(const struct lu_env *env, struct dt_device *d, LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name, oh->ot_credits, osd_journal(dev)->j_max_transaction_buffers); -#ifdef OSD_TRACK_DECLARES CWARN(" create: %u/%u, delete: %u/%u, destroy: %u/%u\n", oti->oti_declare_ops[OSD_OT_CREATE], oti->oti_declare_ops_cred[OSD_OT_CREATE], @@ -757,7 +805,6 @@ int osd_trans_start(const struct lu_env *env, struct dt_device *d, last_credits = oh->ot_credits; last_printed = jiffies; } -#endif /* XXX Limit the credits to 'max_transaction_buffers', and * let the underlying filesystem to catch the error if * we really need so many credits. @@ -1994,13 +2041,13 @@ int osd_fld_lookup(const struct lu_env *env, struct osd_device *osd, int rc; if (fid_is_idif(fid)) { - range->lsr_flags = LU_SEQ_RANGE_OST; + fld_range_set_ost(range); range->lsr_index = fid_idif_ost_idx(fid); return 0; } if (!fid_seq_in_fldb(fid_seq(fid))) { - range->lsr_flags = LU_SEQ_RANGE_MDT; + fld_range_set_mdt(range); if (ss != NULL) /* FIXME: If ss is NULL, it suppose not get lsr_index * at all */ @@ -2009,10 +2056,10 @@ int osd_fld_lookup(const struct lu_env *env, struct osd_device *osd, } LASSERT(ss != NULL); - range->lsr_flags = -1; + fld_range_set_any(range); rc = fld_server_lookup(env, ss->ss_server_fld, fid_seq(fid), range); if (rc != 0) { - CERROR("%s can not find "DFID": rc = %d\n", + CERROR("%s: cannot find FLD range for "DFID": rc = %d\n", osd_name(osd), PFID(fid), rc); } return rc; @@ -2135,13 +2182,12 @@ static int osd_declare_object_destroy(const struct lu_env *env, LASSERT(oh->ot_handle == NULL); LASSERT(inode); - osd_trans_declare_op(env, oh, OSD_OT_DELETE, + osd_trans_declare_op(env, oh, OSD_OT_DESTROY, osd_dto_credits_noquota[DTO_OBJECT_DELETE]); /* Recycle idle OI leaf may cause additional three OI blocks * to be changed. */ - osd_trans_declare_op(env, oh, OSD_OT_DESTROY, + osd_trans_declare_op(env, oh, OSD_OT_DELETE, osd_dto_credits_noquota[DTO_INDEX_DELETE] + 3); - /* one less inode */ rc = osd_declare_inode_qid(env, inode->i_uid, inode->i_gid, -1, oh, false, true, NULL, false); @@ -2173,9 +2219,6 @@ static int osd_object_destroy(const struct lu_env *env, if (unlikely(fid_is_acct(fid))) RETURN(-EPERM); - /* Parallel control for OI scrub. For most of cases, there is no - * lock contention. So it will not affect unlink performance. */ - mutex_lock(&inode->i_mutex); if (S_ISDIR(inode->i_mode)) { LASSERT(osd_inode_unlinked(inode) || inode->i_nlink == 1); /* it will check/delete the inode from remote parent, @@ -2194,7 +2237,6 @@ static int osd_object_destroy(const struct lu_env *env, osd_trans_exec_op(env, th, OSD_OT_DESTROY); result = osd_oi_delete(osd_oti_get(env), osd, fid, th); - mutex_unlock(&inode->i_mutex); /* XXX: add to ext3 orphan list */ /* rc = ext3_orphan_add(handle_t *handle, struct inode *inode) */ @@ -2224,9 +2266,6 @@ int osd_ea_fid_set(struct osd_thread_info *info, struct inode *inode, if (OBD_FAIL_CHECK(OBD_FAIL_FID_INLMA)) return 0; - if (OBD_FAIL_CHECK(OBD_FAIL_FID_IGIF) && fid_is_client_visible(fid)) - return 0; - lustre_lma_init(lma, fid, flags); lustre_lma_swab(lma); @@ -2249,7 +2288,8 @@ int osd_ea_fid_set(struct osd_thread_info *info, struct inode *inode, void osd_get_ldiskfs_dirent_param(struct ldiskfs_dentry_param *param, const struct dt_rec *fid) { - if (!fid_is_client_mdt_visible((const struct lu_fid *)fid)) { + if (!fid_is_namespace_visible((const struct lu_fid *)fid) || + OBD_FAIL_CHECK(OBD_FAIL_FID_IGIF)) { param->edp_magic = 0; return; } @@ -2351,12 +2391,12 @@ static struct inode *osd_create_local_agent_inode(const struct lu_env *env, } /** - * Delete local inode for remote entry + * Delete local agent inode for remote entry */ -static int osd_delete_remote_inode(const struct lu_env *env, - struct osd_device *osd, - const struct lu_fid *fid, - __u32 ino, struct osd_thandle *oh) +static int osd_delete_local_agent_inode(const struct lu_env *env, + struct osd_device *osd, + const struct lu_fid *fid, + __u32 ino, struct osd_thandle *oh) { struct osd_thread_info *oti = osd_oti_get(env); struct osd_inode_id *id = &oti->oti_id; @@ -2415,13 +2455,11 @@ static int osd_object_ea_create(const struct lu_env *env, struct dt_object *dt, osd_trans_declare_rb(env, th, OSD_OT_REF_ADD); result = __osd_object_create(info, obj, attr, hint, dof, th); - if ((result == 0) && - (fid_is_last_id(fid) || - !fid_is_on_ost(info, osd_dt_dev(th->th_dev), fid))) + if (result == 0) result = osd_ea_fid_set(info, obj->oo_inode, fid, 0); - if (result == 0) - result = __osd_oi_insert(env, obj, fid, th); + if (result == 0) + result = __osd_oi_insert(env, obj, fid, th); LASSERT(ergo(result == 0, dt_object_exists(dt) && !dt_object_remote(dt))); @@ -2453,8 +2491,10 @@ static int osd_declare_object_ref_add(const struct lu_env *env, static int osd_object_ref_add(const struct lu_env *env, struct dt_object *dt, struct thandle *th) { - struct osd_object *obj = osd_dt_obj(dt); - struct inode *inode = obj->oo_inode; + struct osd_object *obj = osd_dt_obj(dt); + struct inode *inode = obj->oo_inode; + bool need_dirty = false; + int rc = 0; LINVRNT(osd_invariant(obj)); LASSERT(dt_object_exists(dt) && !dt_object_remote(dt)); @@ -2463,33 +2503,44 @@ static int osd_object_ref_add(const struct lu_env *env, osd_trans_exec_op(env, th, OSD_OT_REF_ADD); - /* - * DIR_NLINK feature is set for compatibility reasons if: - * 1) nlinks > LDISKFS_LINK_MAX, or - * 2) nlinks == 2, since this indicates i_nlink was previously 1. + /* This based on ldiskfs_inc_count(), which is not exported. + * + * The DIR_NLINK feature allows directories to exceed LDISKFS_LINK_MAX + * (65000) subdirectories by storing "1" in i_nlink if the link count + * would otherwise overflow. Directory tranversal tools understand + * that (st_nlink == 1) indicates that the filesystem dose not track + * hard links count on the directory, and will not abort subdirectory + * scanning early once (st_nlink - 2) subdirs have been found. * - * It is easier to always set this flag (rather than check and set), - * since it has less overhead, and the superblock will be dirtied - * at some point. Both e2fsprogs and any Lustre-supported ldiskfs - * do not actually care whether this flag is set or not. + * This also has to properly handle the case of inodes with nlink == 0 + * in case they are being linked into the PENDING directory */ spin_lock(&obj->oo_guard); - /* inc_nlink from 0 may cause WARN_ON */ - if(inode->i_nlink == 0) + if (unlikely(!S_ISDIR(inode->i_mode) && + inode->i_nlink >= LDISKFS_LINK_MAX)) { + /* MDD should have checked this, but good to be safe */ + rc = -EMLINK; + } else if (unlikely(inode->i_nlink == 0 || + (S_ISDIR(inode->i_mode) && + inode->i_nlink >= LDISKFS_LINK_MAX))) { + /* inc_nlink from 0 may cause WARN_ON */ set_nlink(inode, 1); - else + need_dirty = true; + } else if (!S_ISDIR(inode->i_mode) || + (S_ISDIR(inode->i_mode) && inode->i_nlink >= 2)) { inc_nlink(inode); - if (S_ISDIR(inode->i_mode) && inode->i_nlink > 1) { - if (inode->i_nlink >= LDISKFS_LINK_MAX || - inode->i_nlink == 2) - set_nlink(inode, 1); - } + need_dirty = true; + } /* else (S_ISDIR(inode->i_mode) && inode->i_nlink == 1) { ; } */ + LASSERT(inode->i_nlink <= LDISKFS_LINK_MAX); spin_unlock(&obj->oo_guard); - ll_dirty_inode(inode, I_DIRTY_DATASYNC); + + if (need_dirty) + ll_dirty_inode(inode, I_DIRTY_DATASYNC); + LINVRNT(osd_invariant(obj)); - return 0; + return rc; } static int osd_declare_object_ref_del(const struct lu_env *env, @@ -2528,15 +2579,24 @@ static int osd_object_ref_del(const struct lu_env *env, struct dt_object *dt, spin_lock(&obj->oo_guard); LASSERT(inode->i_nlink > 0); - drop_nlink(inode); - /* If this is/was a many-subdir directory (nlink > LDISKFS_LINK_MAX) - * then the nlink count is 1. Don't let it be set to 0 or the directory - * inode will be deleted incorrectly. */ - if (S_ISDIR(inode->i_mode) && inode->i_nlink == 0) - set_nlink(inode, 1); - spin_unlock(&obj->oo_guard); - ll_dirty_inode(inode, I_DIRTY_DATASYNC); - LINVRNT(osd_invariant(obj)); + + /* This based on ldiskfs_dec_count(), which is not exported. + * + * If a directory already has nlink == 1, then do not drop the nlink + * count to 0, even temporarily, to avoid race conditions with other + * threads not holding oo_guard seeing i_nlink == 0 in rare cases. + * + * nlink == 1 means the directory has/had > EXT4_LINK_MAX subdirs. + * */ + if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 1) { + drop_nlink(inode); + + spin_unlock(&obj->oo_guard); + ll_dirty_inode(inode, I_DIRTY_DATASYNC); + LINVRNT(osd_invariant(obj)); + } else { + spin_unlock(&obj->oo_guard); + } return 0; } @@ -2681,6 +2741,7 @@ static int osd_xattr_list(const struct lu_env *env, struct dt_object *dt, return -EACCES; dentry->d_inode = inode; + dentry->d_sb = inode->i_sb; return inode->i_op->listxattr(dentry, buf->lb_buf, buf->lb_len); } @@ -2726,6 +2787,7 @@ static int osd_xattr_del(const struct lu_env *env, struct dt_object *dt, ll_vfs_dq_init(inode); dentry->d_inode = inode; + dentry->d_sb = inode->i_sb; rc = inode->i_op->removexattr(dentry, name); return rc; } @@ -2829,6 +2891,7 @@ static int osd_object_sync(const struct lu_env *env, struct dt_object *dt) ENTRY; dentry->d_inode = inode; + dentry->d_sb = inode->i_sb; file->f_dentry = dentry; file->f_mapping = inode->i_mapping; file->f_op = inode->i_fop; @@ -3248,11 +3311,77 @@ static int osd_index_ea_delete(const struct lu_env *env, struct dt_object *dt, down_write(&obj->oo_ext_idx_sem); } - bh = osd_ldiskfs_find_entry(dir, dentry, &de, hlock); + bh = ldiskfs_find_entry(dir, &dentry->d_name, &de, hlock); if (bh) { - rc = ldiskfs_delete_entry(oh->ot_handle, - dir, de, bh); + __u32 ino = 0; + + /* If this is not the ".." entry, it might be a remote DNE + * entry and we need to check if the FID is for a remote + * MDT. If the FID is not in the directory entry (e.g. + * upgraded 1.8 filesystem without dirdata enabled) then + * we need to get the FID from the LMA. For a remote directory + * there HAS to be an LMA, it cannot be an IGIF inode in this + * case. + * + * Delete the entry before the agent inode in order to + * simplify error handling. At worst an error after deleting + * the entry first might leak the agent inode afterward. The + * reverse would need filesystem abort in case of error deleting + * the entry after the agent had been removed, or leave a + * dangling entry pointing at a random inode. */ + if (strcmp((char *)key, dotdot) != 0) { + LASSERT(de != NULL); + rc = osd_get_fid_from_dentry(de, (struct dt_rec *)fid); + /* If Fid is not in dentry, try to get it from LMA */ + if (rc == -ENODATA) { + struct osd_inode_id *id; + struct inode *inode; + + /* Before trying to get fid from the inode, + * check whether the inode is valid. + * + * If the inode has been deleted, do not go + * ahead to do osd_ea_fid_get, which will set + * the inode to bad inode, which might cause + * the inode to be deleted uncorrectly */ + inode = ldiskfs_iget(osd_sb(osd), + le32_to_cpu(de->inode)); + if (IS_ERR(inode)) { + CDEBUG(D_INODE, "%s: "DFID"get inode" + "error.\n", osd_name(osd), + PFID(fid)); + rc = PTR_ERR(inode); + } else { + if (likely(inode->i_nlink != 0)) { + id = &osd_oti_get(env)->oti_id; + rc = osd_ea_fid_get(env, obj, + le32_to_cpu(de->inode), + fid, id); + } else { + CDEBUG(D_INFO, "%s: %u "DFID + "deleted.\n", + osd_name(osd), + le32_to_cpu(de->inode), + PFID(fid)); + rc = -ESTALE; + } + iput(inode); + } + } + if (rc == 0 && + unlikely(osd_remote_fid(env, osd, fid))) + /* Need to delete agent inode */ + ino = le32_to_cpu(de->inode); + } + rc = ldiskfs_delete_entry(oh->ot_handle, dir, de, bh); brelse(bh); + if (rc == 0 && unlikely(ino != 0)) { + rc = osd_delete_local_agent_inode(env, osd, fid, ino, + oh); + if (rc != 0) + CERROR("%s: del local inode "DFID": rc = %d\n", + osd_name(osd), PFID(fid), rc); + } } else { rc = -ENOENT; } @@ -3265,23 +3394,20 @@ static int osd_index_ea_delete(const struct lu_env *env, struct dt_object *dt, GOTO(out, rc); /* For inode on the remote MDT, .. will point to - * /Agent directory. So do not try to lookup/delete - * remote inode for .. */ - if (strcmp((char *)key, dotdot) == 0) - GOTO(out, rc = 0); - - LASSERT(de != NULL); - rc = osd_get_fid_from_dentry(de, (struct dt_rec *)fid); - if (rc == 0 && osd_remote_fid(env, osd, fid)) { - __u32 ino = le32_to_cpu(de->inode); + * /Agent directory, Check whether it needs to delete + * from agent directory */ + if (unlikely(strcmp((char *)key, dotdot) == 0)) { + rc = osd_delete_from_remote_parent(env, osd_obj2dev(obj), obj, + oh); + if (rc != 0 && rc != -ENOENT) { + CERROR("%s: delete agent inode "DFID": rc = %d\n", + osd_name(osd), PFID(fid), rc); + } - rc = osd_delete_remote_inode(env, osd, fid, ino, oh); - if (rc != 0) - CERROR("%s: del local inode "DFID": rc = %d\n", - osd_name(osd), PFID(fid), rc); - } else { - if (rc == -ENODATA) + if (rc == -ENOENT) rc = 0; + + GOTO(out, rc); } out: @@ -3662,6 +3788,19 @@ static int osd_fail_fid_lookup(struct osd_thread_info *oti, return rc; } +static int osd_add_oi_cache(struct osd_thread_info *info, + struct osd_device *osd, + struct osd_inode_id *id, + struct lu_fid *fid) +{ + CDEBUG(D_INODE, "add "DFID" %u:%u to info %p\n", PFID(fid), + id->oii_ino, id->oii_gen, info); + info->oti_cache.oic_lid = *id; + info->oti_cache.oic_fid = *fid; + + return 0; +} + /** * Calls ->lookup() to find dentry. From dentry get inode and * read inode's ea to get fid. This is required for interoperability @@ -3725,8 +3864,10 @@ static int osd_ea_lookup_rec(const struct lu_env *env, struct osd_object *obj, GOTO(out, rc); } - oic->oic_lid = *id; - oic->oic_fid = *fid; + rc = osd_add_oi_cache(osd_oti_get(env), osd_obj2dev(obj), id, + fid); + if (rc != 0) + GOTO(out, rc); if ((scrub->os_pos_current <= ino) && ((sf->sf_flags & SF_INCONSISTENT) || (sf->sf_flags & SF_UPGRADE && fid_is_igif(fid)) || @@ -4255,6 +4396,7 @@ static struct dt_it *osd_it_ea_init(const struct lu_env *env, struct osd_object *obj = osd_dt_obj(dt); struct osd_thread_info *info = osd_oti_get(env); struct osd_it_ea *it = &info->oti_it_ea; + struct file *file = &it->oie_file; struct lu_object *lo = &dt->do_lu; struct dentry *obj_dentry = &info->oti_it_dentry; ENTRY; @@ -4269,17 +4411,20 @@ static struct dt_it *osd_it_ea_init(const struct lu_env *env, it->oie_dirent = NULL; it->oie_buf = info->oti_it_ea_buf; it->oie_obj = obj; - it->oie_file.f_pos = 0; - it->oie_file.f_dentry = obj_dentry; - if (attr & LUDA_64BITHASH) - it->oie_file.f_mode |= FMODE_64BITHASH; - else - it->oie_file.f_mode |= FMODE_32BITHASH; - it->oie_file.f_mapping = obj->oo_inode->i_mapping; - it->oie_file.f_op = obj->oo_inode->i_fop; - it->oie_file.private_data = NULL; - lu_object_get(lo); - RETURN((struct dt_it *) it); + + /* Reset the "file" totally to avoid to reuse any old value from + * former readdir handling, the "file->f_pos" should be zero. */ + memset(file, 0, sizeof(*file)); + /* Only FMODE_64BITHASH or FMODE_32BITHASH should be set, NOT both. */ + if (attr & LUDA_64BITHASH) + file->f_mode = FMODE_64BITHASH; + else + file->f_mode = FMODE_32BITHASH; + file->f_dentry = obj_dentry; + file->f_mapping = obj->oo_inode->i_mapping; + file->f_op = obj->oo_inode->i_fop; + lu_object_get(lo); + RETURN((struct dt_it *) it); } /** @@ -4548,6 +4693,19 @@ osd_dirent_has_space(__u16 reclen, __u16 namelen, unsigned blocksize) return 0; } +static inline int +osd_dot_dotdot_has_space(struct ldiskfs_dir_entry_2 *de, int dot_dotdot) +{ + LASSERTF(dot_dotdot == 1 || dot_dotdot == 2, + "dot_dotdot = %d\n", dot_dotdot); + + if (LDISKFS_DIR_REC_LEN(de) >= + __LDISKFS_DIR_REC_LEN(dot_dotdot + 1 + sizeof(struct osd_fid_pack))) + return 1; + else + return 0; +} + static int osd_dirent_reinsert(const struct lu_env *env, handle_t *jh, struct inode *dir, struct inode *inode, @@ -4641,18 +4799,15 @@ osd_dirent_check_repair(const struct lu_env *env, struct osd_object *obj, struct inode *inode; int credits; int rc; + int dot_dotdot = 0; bool dirty = false; - bool is_dotdot = false; ENTRY; if (ent->oied_name[0] == '.') { - /* Skip dot entry, even if it has stale FID-in-dirent, because - * we do not use such FID-in-dirent anymore, it is harmless. */ if (ent->oied_namelen == 1) - RETURN(0); - - if (ent->oied_namelen == 2 && ent->oied_name[1] == '.') - is_dotdot = true; + dot_dotdot = 1; + else if (ent->oied_namelen == 2 && ent->oied_name[1] == '.') + dot_dotdot = 2; } dentry = osd_child_dentry_get(env, obj, ent->oied_name, @@ -4685,26 +4840,36 @@ again: ent->oied_name, rc); RETURN(rc); } - } - if (obj->oo_hl_head != NULL) { - hlock = osd_oti_get(env)->oti_hlock; - ldiskfs_htree_lock(hlock, obj->oo_hl_head, dir, - LDISKFS_HLOCK_DEL); + if (obj->oo_hl_head != NULL) { + hlock = osd_oti_get(env)->oti_hlock; + /* "0" means exclusive lock for the whole directory. + * We need to prevent others access such name entry + * during the delete + insert. Neither HLOCK_ADD nor + * HLOCK_DEL cannot guarantee the atomicity. */ + ldiskfs_htree_lock(hlock, obj->oo_hl_head, dir, 0); + } else { + down_write(&obj->oo_ext_idx_sem); + } } else { - down_write(&obj->oo_ext_idx_sem); + if (obj->oo_hl_head != NULL) { + hlock = osd_oti_get(env)->oti_hlock; + ldiskfs_htree_lock(hlock, obj->oo_hl_head, dir, + LDISKFS_HLOCK_LOOKUP); + } else { + down_read(&obj->oo_ext_idx_sem); + } } bh = osd_ldiskfs_find_entry(dir, dentry, &de, hlock); - /* For dotdot entry, if there is not enough space to hold FID-in-dirent, - * just keep it there. It only happens when the device upgraded from 1.8 - * or restored from MDT file-level backup. For the whole directory, only - * dotdot entry has no FID-in-dirent and needs to get FID from LMA when - * readdir, it will not affect the performance much. */ + /* For dot/dotdot entry, if there is not enough space to hold the + * FID-in-dirent, just keep them there. It only happens when the + * device upgraded from 1.8 or restored from MDT file-level backup. + * For the whole directory, only dot/dotdot entry have no FID-in-dirent + * and needs to get FID from LMA when readdir, it will not affect the + * performance much. */ if ((bh == NULL) || (le32_to_cpu(de->inode) != ent->oied_ino) || - (is_dotdot && !osd_dirent_has_space(de->rec_len, - ent->oied_namelen, - sb->s_blocksize))) { + (dot_dotdot != 0 && !osd_dot_dotdot_has_space(de, dot_dotdot))) { *attr |= LUDA_IGNORE; GOTO(out_journal, rc = 0); } @@ -4721,6 +4886,10 @@ again: GOTO(out_journal, rc); } + /* skip the REMOTE_PARENT_DIR. */ + if (inode == dev->od_mdt_map->omm_remote_parent->d_inode) + GOTO(out_inode, rc = 0); + rc = osd_get_lma(info, inode, &info->oti_obj_dentry, lma); if (rc == 0) { if (fid_is_sane(fid)) { @@ -4740,7 +4909,7 @@ again: if (hlock != NULL) ldiskfs_htree_unlock(hlock); else - up_write(&obj->oo_ext_idx_sem); + up_read(&obj->oo_ext_idx_sem); dev->od_dirent_journal = 1; goto again; } @@ -4754,6 +4923,7 @@ again: } else { /* Do not repair under dryrun mode. */ if (*attr & LUDA_VERIFY_DRYRUN) { + *fid = lma->lma_self_fid; *attr |= LUDA_REPAIR; GOTO(out_inode, rc = 0); } @@ -4764,7 +4934,7 @@ again: if (hlock != NULL) ldiskfs_htree_unlock(hlock); else - up_write(&obj->oo_ext_idx_sem); + up_read(&obj->oo_ext_idx_sem); dev->od_dirent_journal = 1; goto again; } @@ -4780,10 +4950,13 @@ again: } else if (rc == -ENODATA) { /* Do not repair under dryrun mode. */ if (*attr & LUDA_VERIFY_DRYRUN) { - if (fid_is_sane(fid)) + if (fid_is_sane(fid)) { *attr |= LUDA_REPAIR; - else + } else { + lu_igif_build(fid, inode->i_ino, + inode->i_generation); *attr |= LUDA_UPGRADE; + } GOTO(out_inode, rc = 0); } @@ -4793,7 +4966,7 @@ again: if (hlock != NULL) ldiskfs_htree_unlock(hlock); else - up_write(&obj->oo_ext_idx_sem); + up_read(&obj->oo_ext_idx_sem); dev->od_dirent_journal = 1; goto again; } @@ -4823,10 +4996,14 @@ out_inode: out_journal: brelse(bh); - if (hlock != NULL) + if (hlock != NULL) { ldiskfs_htree_unlock(hlock); - else - up_write(&obj->oo_ext_idx_sem); + } else { + if (dev->od_dirent_journal) + up_write(&obj->oo_ext_idx_sem); + else + up_read(&obj->oo_ext_idx_sem); + } if (jh != NULL) ldiskfs_journal_stop(jh); if (rc >= 0 && !dirty) @@ -4895,10 +5072,8 @@ pack: if (osd_remote_fid(env, dev, fid)) RETURN(0); - if (likely(!(attr & LUDA_IGNORE))) { - oic->oic_lid = *id; - oic->oic_fid = *fid; - } + if (likely(!(attr & LUDA_IGNORE))) + rc = osd_add_oi_cache(oti, dev, id, fid); if (!(attr & LUDA_VERIFY) && (scrub->os_pos_current <= ino) && @@ -5081,20 +5256,33 @@ static int osd_shutdown(const struct lu_env *env, struct osd_device *o) { ENTRY; - osd_scrub_cleanup(env, o); + /* shutdown quota slave instance associated with the device */ + if (o->od_quota_slave != NULL) { + qsd_fini(env, o->od_quota_slave); + o->od_quota_slave = NULL; + } + + RETURN(0); +} + +static void osd_umount(const struct lu_env *env, struct osd_device *o) +{ + ENTRY; if (o->od_fsops) { fsfilt_put_ops(o->od_fsops); o->od_fsops = NULL; } - /* shutdown quota slave instance associated with the device */ - if (o->od_quota_slave != NULL) { - qsd_fini(env, o->od_quota_slave); - o->od_quota_slave = NULL; + if (o->od_mnt != NULL) { + shrink_dcache_sb(osd_sb(o)); + osd_sync(env, &o->od_dt_dev); + + mntput(o->od_mnt); + o->od_mnt = NULL; } - RETURN(0); + EXIT; } static int osd_mount(const struct lu_env *env, @@ -5196,30 +5384,18 @@ out: } static struct lu_device *osd_device_fini(const struct lu_env *env, - struct lu_device *d) + struct lu_device *d) { - int rc; - ENTRY; - - rc = osd_shutdown(env, osd_dev(d)); - - osd_obj_map_fini(osd_dev(d)); - - shrink_dcache_sb(osd_sb(osd_dev(d))); - osd_sync(env, lu2dt_dev(d)); - - rc = osd_procfs_fini(osd_dev(d)); - if (rc) { - CERROR("proc fini error %d \n", rc); - RETURN (ERR_PTR(rc)); - } + struct osd_device *o = osd_dev(d); + ENTRY; - if (osd_dev(d)->od_mnt) { - mntput(osd_dev(d)->od_mnt); - osd_dev(d)->od_mnt = NULL; - } + osd_procfs_fini(o); + osd_shutdown(env, o); + osd_scrub_cleanup(env, o); + osd_obj_map_fini(o); + osd_umount(env, o); - RETURN(NULL); + RETURN(NULL); } static int osd_device_init0(const struct lu_env *env, @@ -5257,12 +5433,6 @@ static int osd_device_init0(const struct lu_env *env, if (rc) GOTO(out_capa, rc); - CFS_INIT_LIST_HEAD(&o->od_ios_list); - /* setup scrub, including OI files initialization */ - rc = osd_scrub_setup(env, o); - if (rc < 0) - GOTO(out_mnt, rc); - cplen = strlcpy(o->od_svname, lustre_cfg_string(cfg, 4), sizeof(o->od_svname)); if (cplen >= sizeof(o->od_svname)) { @@ -5272,22 +5442,28 @@ static int osd_device_init0(const struct lu_env *env, rc = osd_obj_map_init(env, o); if (rc != 0) - GOTO(out_scrub, rc); + GOTO(out_mnt, rc); rc = lu_site_init(&o->od_site, l); - if (rc) + if (rc != 0) GOTO(out_compat, rc); o->od_site.ls_bottom_dev = l; rc = lu_site_init_finish(&o->od_site); - if (rc) + if (rc != 0) + GOTO(out_site, rc); + + CFS_INIT_LIST_HEAD(&o->od_ios_list); + /* setup scrub, including OI files initialization */ + rc = osd_scrub_setup(env, o); + if (rc < 0) GOTO(out_site, rc); rc = osd_procfs_init(o, o->od_svname); if (rc != 0) { CERROR("%s: can't initialize procfs: rc = %d\n", o->od_svname, rc); - GOTO(out_site, rc); + GOTO(out_scrub, rc); } LASSERT(l->ld_site->ls_linkage.next && l->ld_site->ls_linkage.prev); @@ -5302,23 +5478,21 @@ static int osd_device_init0(const struct lu_env *env, } RETURN(0); + out_procfs: osd_procfs_fini(o); +out_scrub: + osd_scrub_cleanup(env, o); out_site: lu_site_fini(&o->od_site); out_compat: osd_obj_map_fini(o); -out_scrub: - osd_scrub_cleanup(env, o); out_mnt: - osd_oi_fini(info, o); - osd_shutdown(env, o); - mntput(o->od_mnt); - o->od_mnt = NULL; + osd_umount(env, o); out_capa: cleanup_capa_hash(o->od_capa_hash); out: - RETURN(rc); + return rc; } static struct lu_device *osd_device_alloc(const struct lu_env *env, @@ -5465,15 +5639,6 @@ static int osd_prepare(const struct lu_env *env, struct lu_device *pdev, int result = 0; ENTRY; - if (dev->ld_site && lu_device_is_md(dev->ld_site->ls_top_dev)) { - /* MDT/MDD still use old infrastructure to create - * special files */ - result = llo_local_objects_setup(env, lu2md_dev(pdev), - lu2dt_dev(dev)); - if (result) - RETURN(result); - } - if (osd->od_quota_slave != NULL) /* set up quota slave objects */ result = qsd_prepare(env, osd->od_quota_slave);