X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fosd-ldiskfs%2Fosd_handler.c;h=34f8465cfbb397e4471647a003cf8682e1b03a8a;hp=35b248b5dd64897d6a660bccd57a3507cec53b70;hb=e6c7fcdaf40b130c39af2e3ee8b108c6e31a8ca8;hpb=b76df292c241a6db8dfad76d0f5f4cb8babe22bf diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index 35b248b..34f8465c 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -27,7 +27,6 @@ */ /* * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. * * lustre/osd/osd_handler.c * @@ -76,6 +75,9 @@ #include +/* encoding routines */ +#include + /* Maximum EA size is limited by LNET_MTU for remote objects */ #define OSD_MAX_EA_SIZE 1048364 @@ -285,6 +287,76 @@ osd_idc_find_or_init(const struct lu_env *env, struct osd_device *osd, return idc; } +static void osd_idc_dump_lma(const struct lu_env *env, + struct osd_device *osd, + unsigned long ino, + bool check_in_oi) +{ + struct osd_thread_info *info = osd_oti_get(env); + struct lustre_ost_attrs *loa = &info->oti_ost_attrs; + const struct lu_fid *fid; + struct osd_inode_id lid; + struct inode *inode; + int rc; + + inode = osd_ldiskfs_iget(osd_sb(osd), ino); + if (IS_ERR(inode)) { + CERROR("%s: can't get inode %lu: rc = %d\n", + osd->od_svname, ino, (int)PTR_ERR(inode)); + return; + } + if (is_bad_inode(inode)) { + CERROR("%s: bad inode %lu\n", osd->od_svname, ino); + goto put; + } + rc = osd_get_lma(info, inode, &info->oti_obj_dentry, loa); + if (rc) { + CERROR("%s: can't get LMA for %lu: rc = %d\n", + osd->od_svname, ino, rc); + goto put; + } + fid = &loa->loa_lma.lma_self_fid; + LCONSOLE(D_INFO, "%s: "DFID" in inode %lu/%u\n", osd->od_svname, + PFID(fid), ino, (unsigned)inode->i_generation); + if (!check_in_oi) + goto put; + rc = osd_oi_lookup(osd_oti_get(env), osd, fid, &lid, 0); + if (rc) { + CERROR("%s: can't lookup "DFID": rc = %d\n", + osd->od_svname, PFID(fid), rc); + goto put; + } + LCONSOLE(D_INFO, "%s: "DFID" maps to %u/%u\n", osd->od_svname, + PFID(fid), lid.oii_ino, lid.oii_gen); +put: + iput(inode); +} + +static void osd_idc_dump_debug(const struct lu_env *env, + struct osd_device *osd, + const struct lu_fid *fid, + unsigned long ino1, + unsigned long ino2) +{ + struct osd_inode_id lid; + + int rc; + + rc = osd_oi_lookup(osd_oti_get(env), osd, fid, &lid, 0); + if (!rc) { + LCONSOLE(D_INFO, "%s: "DFID" maps to %u/%u\n", + osd->od_svname, PFID(fid), lid.oii_ino, lid.oii_gen); + osd_idc_dump_lma(env, osd, lid.oii_ino, false); + } else { + CERROR("%s: can't lookup "DFID": rc = %d\n", + osd->od_svname, PFID(fid), rc); + } + if (ino1) + osd_idc_dump_lma(env, osd, ino1, true); + if (ino2) + osd_idc_dump_lma(env, osd, ino2, true); +} + /* * lookup mapping for given FID and fill it from the given object. * the object is lolcal by definition. @@ -302,7 +374,12 @@ static int osd_idc_find_and_init(const struct lu_env *env, if (obj->oo_inode == NULL) return 0; if (idc->oic_lid.oii_ino != obj->oo_inode->i_ino) { - LASSERT(idc->oic_lid.oii_ino == 0); + if (idc->oic_lid.oii_ino) { + osd_idc_dump_debug(env, osd, fid, + idc->oic_lid.oii_ino, + obj->oo_inode->i_ino); + return -EINVAL; + } idc->oic_lid.oii_ino = obj->oo_inode->i_ino; idc->oic_lid.oii_gen = obj->oo_inode->i_generation; } @@ -392,12 +469,11 @@ int osd_get_lma(struct osd_thread_info *info, struct inode *inode, lustre_loa_swab(loa, true); /* Check LMA compatibility */ if (lma->lma_incompat & ~LMA_INCOMPAT_SUPP) { - CWARN("%s: unsupported incompat LMA feature(s) %#x " - "for fid = "DFID", ino = %lu\n", + rc = -EOPNOTSUPP; + CWARN("%s: unsupported incompat LMA feature(s) %#x for fid = "DFID", ino = %lu: rc = %d\n", osd_ino2name(inode), lma->lma_incompat & ~LMA_INCOMPAT_SUPP, - PFID(&lma->lma_self_fid), inode->i_ino); - rc = -EOPNOTSUPP; + PFID(&lma->lma_self_fid), inode->i_ino, rc); } } else if (rc == 0) { rc = -ENODATA; @@ -442,10 +518,11 @@ struct inode *osd_iget(struct osd_thread_info *info, struct osd_device *dev, iput(inode); inode = ERR_PTR(-ESTALE); } else if (is_bad_inode(inode)) { - CWARN("%s: bad inode: ino = %u\n", - osd_dev2name(dev), id->oii_ino); + rc = -ENOENT; + CWARN("%s: bad inode: ino = %u: rc = %d\n", + osd_dev2name(dev), id->oii_ino, rc); iput(inode); - inode = ERR_PTR(-ENOENT); + inode = ERR_PTR(rc); } else if ((rc = osd_attach_jinode(inode))) { iput(inode); inode = ERR_PTR(rc); @@ -484,7 +561,7 @@ int osd_ldiskfs_add_entry(struct osd_thread_info *info, struct osd_device *osd, if (!rc2) { fid = &loa->loa_lma.lma_self_fid; } else if (rc2 == -ENODATA) { - if (unlikely(parent == inode->i_sb->s_root->d_inode)) { + if (unlikely(is_root_inode(parent))) { fid = &info->oti_fid3; lu_local_obj_fid(fid, OSD_FS_ROOT_OID); } else if (!osd->od_is_ost && osd->od_index == 0) { @@ -529,7 +606,7 @@ osd_iget_fid(struct osd_thread_info *info, struct osd_device *dev, if (!rc) { *fid = loa->loa_lma.lma_self_fid; } else if (rc == -ENODATA) { - if (unlikely(inode == osd_sb(dev)->s_root->d_inode)) + if (unlikely(is_root_inode(inode))) lu_local_obj_fid(fid, OSD_FS_ROOT_OID); else lu_igif_build(fid, inode->i_ino, inode->i_generation); @@ -858,7 +935,6 @@ struct osd_check_lmv_buf { struct dir_context ctx; struct osd_thread_info *oclb_info; struct osd_device *oclb_dev; - struct osd_idmap_cache *oclb_oic; int oclb_items; bool oclb_found; }; @@ -884,7 +960,6 @@ static int osd_stripe_dir_filldir(void *buf, struct lu_fid *fid = &oti->oti_fid3; struct osd_inode_id *id = &oti->oti_id3; struct osd_device *dev = oclb->oclb_dev; - struct osd_idmap_cache *oic = oclb->oclb_oic; struct inode *inode; oclb->oclb_items++; @@ -907,10 +982,7 @@ static int osd_stripe_dir_filldir(void *buf, iput(inode); osd_add_oi_cache(oti, dev, id, fid); - oic->oic_fid = *fid; - oic->oic_lid = *id; - oic->oic_dev = dev; - osd_oii_insert(dev, oic, true); + osd_oii_insert(dev, fid, id, true); oclb->oclb_found = true; return 1; @@ -953,18 +1025,16 @@ static int osd_stripe_dir_filldir(void *buf, * the correct OI mapping for the slave MDT-object. */ static int osd_check_lmv(struct osd_thread_info *oti, struct osd_device *dev, - struct inode *inode, struct osd_idmap_cache *oic) + struct inode *inode) { struct lu_buf *buf = &oti->oti_big_buf; struct dentry *dentry = &oti->oti_obj_dentry; - struct file *filp = &oti->oti_file; - const struct file_operations *fops; + struct file *filp; struct lmv_mds_md_v1 *lmv1; struct osd_check_lmv_buf oclb = { .ctx.actor = osd_stripe_dir_filldir, .oclb_info = oti, .oclb_dev = dev, - .oclb_oic = oic, .oclb_found = false, }; int rc = 0; @@ -1003,18 +1073,7 @@ again: if (le32_to_cpu(lmv1->lmv_magic) != LMV_MAGIC_V1) GOTO(out, rc = 0); - fops = inode->i_fop; - dentry->d_inode = inode; - dentry->d_sb = inode->i_sb; - filp->f_pos = 0; - filp->f_path.dentry = dentry; - filp->f_flags |= O_NOATIME; - filp->f_mode = FMODE_64BITHASH | FMODE_NONOTIFY; - filp->f_mapping = inode->i_mapping; - filp->f_op = fops; - filp->private_data = NULL; - filp->f_cred = current_cred(); - filp->f_inode = inode; + filp = osd_quasi_file(oti->oti_env, inode); rc = osd_security_file_alloc(filp); if (rc) goto out; @@ -1024,14 +1083,14 @@ again: rc = iterate_dir(filp, &oclb.ctx); } while (rc >= 0 && oclb.oclb_items > 0 && !oclb.oclb_found && filp->f_pos != LDISKFS_HTREE_EOF_64BIT); - fops->release(inode, filp); + inode->i_fop->release(inode, filp); out: if (rc < 0) - CDEBUG(D_LFSCK, "%s: fail to check LMV EA, inode = %lu/%u," - DFID": rc = %d\n", osd_ino2name(inode), - inode->i_ino, inode->i_generation, - PFID(&oic->oic_fid), rc); + CDEBUG(D_LFSCK, + "%s: cannot check LMV, ino = %lu/%u: rc = %d\n", + osd_ino2name(inode), inode->i_ino, inode->i_generation, + rc); else rc = 0; @@ -1064,7 +1123,13 @@ static int osd_fid_lookup(const struct lu_env *env, struct osd_object *obj, LINVRNT(osd_invariant(obj)); LASSERT(obj->oo_inode == NULL); - LASSERTF(fid_is_sane(fid) || fid_is_idif(fid), DFID"\n", PFID(fid)); + + if (fid_is_sane(fid) == 0) { + CERROR("%s: invalid FID "DFID"\n", ldev->ld_obd->obd_name, + PFID(fid)); + dump_stack(); + RETURN(-EINVAL); + } dev = osd_dev(ldev); scrub = &dev->od_scrub.os_scrub; @@ -1093,6 +1158,7 @@ static int osd_fid_lookup(const struct lu_env *env, struct osd_object *obj, } id = &info->oti_id; + memset(id, 0, sizeof(struct osd_inode_id)); if (!list_empty(&scrub->os_inconsistent_items)) { /* Search order: 2. OI scrub pending list. */ result = osd_oii_lookup(dev, fid, id); @@ -1183,18 +1249,19 @@ trigger: if (scrub->os_partial_scan && !scrub->os_in_join) goto join; - if (IS_ERR_OR_NULL(inode) || result) + if (IS_ERR_OR_NULL(inode) || result) { + osd_oii_insert(dev, fid, id, result == -ENOENT); GOTO(out, result = -EINPROGRESS); + } LASSERT(remote); LASSERT(obj->oo_inode == inode); - osd_add_oi_cache(info, dev, id, fid); - osd_oii_insert(dev, oic, true); + osd_oii_insert(dev, fid, id, true); goto found; } - if (dev->od_auto_scrub_interval == AS_NEVER) { + if (dev->od_scrub.os_scrub.os_auto_scrub_interval == AS_NEVER) { if (!remote) GOTO(out, result = -EREMCHG); @@ -1207,19 +1274,21 @@ trigger: join: rc1 = osd_scrub_start(env, dev, flags); - LCONSOLE_WARN("%s: trigger OI scrub by RPC for the " DFID" with flags " - "0x%x, rc = %d\n", osd_name(dev), PFID(fid), flags, rc1); + CDEBUG_LIMIT(D_LFSCK | D_CONSOLE | D_WARNING, + "%s: trigger OI scrub by RPC for "DFID"/%u with flags %#x: rc = %d\n", + osd_name(dev), PFID(fid), id->oii_ino, flags, rc1); if (rc1 && rc1 != -EALREADY) GOTO(out, result = -EREMCHG); - if (IS_ERR_OR_NULL(inode) || result) + if (IS_ERR_OR_NULL(inode) || result) { + osd_oii_insert(dev, fid, id, result == -ENOENT); GOTO(out, result = -EINPROGRESS); + } LASSERT(remote); LASSERT(obj->oo_inode == inode); - osd_add_oi_cache(info, dev, id, fid); - osd_oii_insert(dev, oic, true); + osd_oii_insert(dev, fid, id, true); goto found; check_lma: @@ -1316,6 +1385,8 @@ check_lma: if (saved_ino == id->oii_ino && saved_gen == id->oii_gen) { result = -EREMCHG; + osd_scrub_refresh_mapping(info, dev, fid, id, DTO_INDEX_DELETE, + true, 0, NULL); goto trigger; } @@ -1350,7 +1421,7 @@ found: if (S_ISDIR(inode->i_mode) && (flags & SS_AUTO_PARTIAL || sf->sf_status == SS_SCANNING)) - osd_check_lmv(info, dev, inode, oic); + osd_check_lmv(info, dev, inode); result = osd_attach_jinode(inode); if (result) @@ -1591,11 +1662,8 @@ static void osd_object_free(const struct lu_env *env, struct lu_object *l) ldiskfs_htree_lock_head_free(obj->oo_hl_head); /* obj doesn't contain an lu_object_header, so we don't need call_rcu */ OBD_FREE_PTR(obj); - if (unlikely(h)) { - lu_object_header_fini(h); - OBD_FREE_PRE(h, sizeof(*h), "kfreed"); - kfree_rcu(h, loh_rcu); - } + if (unlikely(h)) + lu_object_header_free(h); } /* @@ -1617,16 +1685,6 @@ static void osd_index_fini(struct osd_object *o) } } -/* - * Concurrency: no concurrent access is possible that late in object - * life-cycle (for all existing callers, that is. New callers have to provide - * their own locking.) - */ -static int osd_inode_unlinked(const struct inode *inode) -{ - return inode->i_nlink == 0; -} - enum { OSD_TXN_OI_DELETE_CREDITS = 20, OSD_TXN_INODE_DELETE_CREDITS = 20 @@ -1716,6 +1774,7 @@ static void osd_trans_commit_cb(struct super_block *sb, struct osd_thandle *oh = container_of(jcb, struct osd_thandle, ot_jcb); struct thandle *th = &oh->ot_super; struct lu_device *lud = &th->th_dev->dd_lu_dev; + struct osd_device *osd = osd_dev(lud); struct dt_txn_commit_cb *dcb, *tmp; LASSERT(oh->ot_handle == NULL); @@ -1735,7 +1794,8 @@ static void osd_trans_commit_cb(struct super_block *sb, } lu_ref_del_at(&lud->ld_reference, &oh->ot_dev_link, "osd-tx", th); - lu_device_put(lud); + if (atomic_dec_and_test(&osd->od_commit_cb_in_flight)) + wake_up(&osd->od_commit_cb_done); th->th_dev = NULL; OBD_FREE_PTR(oh); @@ -1776,6 +1836,7 @@ static struct thandle *osd_trans_create(const struct lu_env *env, th->th_dev = d; th->th_result = 0; oh->ot_credits = 0; + oh->oh_declared_ext = 0; INIT_LIST_HEAD(&oh->ot_commit_dcb_list); INIT_LIST_HEAD(&oh->ot_stop_dcb_list); INIT_LIST_HEAD(&oh->ot_trunc_locks); @@ -1918,7 +1979,7 @@ static int osd_trans_start(const struct lu_env *env, struct dt_device *d, oh->ot_handle = jh; LASSERT(oti->oti_txns == 0); - lu_device_get(&d->dd_lu_dev); + atomic_inc(&dev->od_commit_cb_in_flight); lu_ref_add_at(&d->dd_lu_dev.ld_reference, &oh->ot_dev_link, "osd-tx", th); oti->oti_txns++; @@ -2029,7 +2090,7 @@ static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt, if (!rc) rc = rc2; - osd_process_truncates(&truncates); + osd_process_truncates(env, &truncates); } else { osd_trans_stop_cb(oh, th->th_result); OBD_FREE_PTR(oh); @@ -2059,6 +2120,7 @@ static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt, if (unlikely(remove_agents != 0)) osd_process_scheduled_agent_removals(env, osd); + LASSERT(oti->oti_ins_cache_depth > 0); oti->oti_ins_cache_depth--; /* reset OI cache for safety */ if (oti->oti_ins_cache_depth == 0) @@ -2365,16 +2427,17 @@ static void osd_conf_get(const struct lu_env *env, OBD_CKSUM_T10IP512 : OBD_CKSUM_T10IP4K; } else { - CERROR("%s: unsupported checksum type of " - "T10PI type '%s'", + CERROR("%s: unsupported checksum type of T10PI type '%s'\n", d->od_svname, name); } } else { - CERROR("%s: unsupported T10PI type '%s'", + CERROR("%s: unsupported T10PI type '%s'\n", d->od_svname, name); } } + + param->ddp_has_lseek_data_hole = true; } static struct super_block *osd_mnt_sb_get(const struct dt_device *d) @@ -2513,18 +2576,42 @@ const int osd_dto_credits_noquota[DTO_NR] = { [DTO_ATTR_SET_CHOWN] = 0 }; +/* reserve or free quota for some operation */ +static int osd_reserve_or_free_quota(const struct lu_env *env, + struct dt_device *dev, + enum quota_type type, __u64 uid, + __u64 gid, __s64 count, bool is_md) +{ + int rc; + struct osd_device *osd = osd_dt_dev(dev); + struct osd_thread_info *info = osd_oti_get(env); + struct lquota_id_info *qi = &info->oti_qi; + struct qsd_instance *qsd = NULL; + + ENTRY; + + if (is_md) + qsd = osd->od_quota_slave_md; + else + qsd = osd->od_quota_slave_dt; + + rc = quota_reserve_or_free(env, qsd, qi, type, uid, gid, count, is_md); + RETURN(rc); +} + static const struct dt_device_operations osd_dt_ops = { - .dt_root_get = osd_root_get, - .dt_statfs = osd_statfs, - .dt_trans_create = osd_trans_create, - .dt_trans_start = osd_trans_start, - .dt_trans_stop = osd_trans_stop, - .dt_trans_cb_add = osd_trans_cb_add, - .dt_conf_get = osd_conf_get, - .dt_mnt_sb_get = osd_mnt_sb_get, - .dt_sync = osd_sync, - .dt_ro = osd_ro, - .dt_commit_async = osd_commit_async, + .dt_root_get = osd_root_get, + .dt_statfs = osd_statfs, + .dt_trans_create = osd_trans_create, + .dt_trans_start = osd_trans_start, + .dt_trans_stop = osd_trans_stop, + .dt_trans_cb_add = osd_trans_cb_add, + .dt_conf_get = osd_conf_get, + .dt_mnt_sb_get = osd_mnt_sb_get, + .dt_sync = osd_sync, + .dt_ro = osd_ro, + .dt_commit_async = osd_commit_async, + .dt_reserve_or_free_quota = osd_reserve_or_free_quota, }; static void osd_read_lock(const struct lu_env *env, struct dt_object *dt, @@ -2722,7 +2809,7 @@ static int osd_declare_attr_qid(const struct lu_env *env, struct osd_object *obj, struct osd_thandle *oh, long long bspace, qid_t old_id, qid_t new_id, bool enforce, - unsigned int type, bool ignore_edquot) + unsigned int type) { int rc; struct osd_thread_info *info = osd_oti_get(env); @@ -2737,7 +2824,7 @@ static int osd_declare_attr_qid(const struct lu_env *env, qi->lqi_space = 1; /* Reserve credits for the new id */ rc = osd_declare_qid(env, oh, qi, NULL, enforce, NULL); - if (ignore_edquot && (rc == -EDQUOT || rc == -EINPROGRESS)) + if (rc == -EDQUOT || rc == -EINPROGRESS) rc = 0; if (rc) RETURN(rc); @@ -2746,7 +2833,7 @@ static int osd_declare_attr_qid(const struct lu_env *env, qi->lqi_id.qid_uid = old_id; qi->lqi_space = -1; rc = osd_declare_qid(env, oh, qi, obj, enforce, NULL); - if (ignore_edquot && (rc == -EDQUOT || rc == -EINPROGRESS)) + if (rc == -EDQUOT || rc == -EINPROGRESS) rc = 0; if (rc) RETURN(rc); @@ -2762,7 +2849,7 @@ static int osd_declare_attr_qid(const struct lu_env *env, * to save credit reservation. */ rc = osd_declare_qid(env, oh, qi, obj, enforce, NULL); - if (ignore_edquot && (rc == -EDQUOT || rc == -EINPROGRESS)) + if (rc == -EDQUOT || rc == -EINPROGRESS) rc = 0; if (rc) RETURN(rc); @@ -2771,7 +2858,7 @@ static int osd_declare_attr_qid(const struct lu_env *env, qi->lqi_id.qid_uid = old_id; qi->lqi_space = -bspace; rc = osd_declare_qid(env, oh, qi, obj, enforce, NULL); - if (ignore_edquot && (rc == -EDQUOT || rc == -EINPROGRESS)) + if (rc == -EDQUOT || rc == -EINPROGRESS) rc = 0; RETURN(rc); @@ -2822,20 +2909,11 @@ static int osd_declare_attr_set(const struct lu_env *env, * space adjustment once the operation is completed. */ if (attr->la_valid & LA_UID || attr->la_valid & LA_GID) { - bool ignore_edquot = !(attr->la_flags & LUSTRE_SET_SYNC_FL); - - if (!ignore_edquot) - CDEBUG(D_QUOTA, "%s: enforce quota on UID %u, GID %u" - "(the quota space is %lld)\n", - obj->oo_inode->i_sb->s_id, attr->la_uid, - attr->la_gid, bspace); - /* USERQUOTA */ uid = i_uid_read(obj->oo_inode); enforce = (attr->la_valid & LA_UID) && (attr->la_uid != uid); rc = osd_declare_attr_qid(env, obj, oh, bspace, uid, - attr->la_uid, enforce, USRQUOTA, - true); + attr->la_uid, enforce, USRQUOTA); if (rc) RETURN(rc); @@ -2844,8 +2922,7 @@ static int osd_declare_attr_set(const struct lu_env *env, attr->la_uid, gid, attr->la_gid); enforce = (attr->la_valid & LA_GID) && (attr->la_gid != gid); rc = osd_declare_attr_qid(env, obj, oh, bspace, gid, - attr->la_gid, enforce, GRPQUOTA, - ignore_edquot); + attr->la_gid, enforce, GRPQUOTA); if (rc) RETURN(rc); @@ -2858,7 +2935,7 @@ static int osd_declare_attr_set(const struct lu_env *env, (attr->la_projid != projid); rc = osd_declare_attr_qid(env, obj, oh, bspace, (qid_t)projid, (qid_t)attr->la_projid, - enforce, PRJQUOTA, true); + enforce, PRJQUOTA); if (rc) RETURN(rc); } @@ -2913,6 +2990,13 @@ static int osd_inode_setattr(const struct lu_env *env, /* always keep S_NOCMTIME */ inode->i_flags = ll_ext_to_inode_flags(attr->la_flags) | S_NOCMTIME; +#if defined(S_ENCRYPTED) + /* Always remove S_ENCRYPTED, because ldiskfs must not be + * aware of encryption status. It is just stored into LMA + * so that it can be forwared to client side. + */ + inode->i_flags &= ~S_ENCRYPTED; +#endif /* * Ext4 did not transfer inherit flags from * @inode->i_flags to raw inode i_flags when writing @@ -2960,9 +3044,9 @@ static int osd_transfer_project(struct inode *inode, __u32 projid, raw_inode = ldiskfs_raw_inode(&iloc); if (!LDISKFS_FITS_IN_INODE(raw_inode, ei, i_projid)) { - struct osd_thandle *oh = - container_of0(handle, struct osd_thandle, - ot_super); + struct osd_thandle *oh = container_of(handle, + struct osd_thandle, + ot_super); /** * try to expand inode size automatically. */ @@ -2978,7 +3062,9 @@ static int osd_transfer_project(struct inode *inode, __u32 projid, dquot_initialize(inode); transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); if (transfer_to[PRJQUOTA]) { + lock_dquot_transfer(inode); err = __dquot_transfer(inode, transfer_to); + unlock_dquot_transfer(inode); dqput(transfer_to[PRJQUOTA]); if (err) return err; @@ -3011,11 +3097,12 @@ static int osd_quota_transfer(struct inode *inode, const struct lu_attr *attr, iattr.ia_uid = make_kuid(&init_user_ns, attr->la_uid); iattr.ia_gid = make_kgid(&init_user_ns, attr->la_gid); + lock_dquot_transfer(inode); rc = dquot_transfer(inode, &iattr); + unlock_dquot_transfer(inode); if (rc) { - CERROR("%s: quota transfer failed: rc = %d. Is quota " - "enforcement enabled on the ldiskfs " - "filesystem?\n", inode->i_sb->s_id, rc); + CERROR("%s: quota transfer failed. Is quota enforcement enabled on the ldiskfs filesystem? rc = %d\n", + osd_ino2name(inode), rc); return rc; } } @@ -3023,15 +3110,16 @@ static int osd_quota_transfer(struct inode *inode, const struct lu_attr *attr, /* Handle project id transfer here properly */ if (attr->la_valid & LA_PROJID && attr->la_projid != i_projid_read(inode)) { + if (!projid_valid(make_kprojid(&init_user_ns, attr->la_projid))) + return -EINVAL; #ifdef HAVE_PROJECT_QUOTA rc = osd_transfer_project(inode, attr->la_projid, handle); #else rc = -ENOTSUPP; #endif if (rc) { - CERROR("%s: quota transfer failed: rc = %d. Is project " - "enforcement enabled on the ldiskfs " - "filesystem?\n", inode->i_sb->s_id, rc); + CERROR("%s: quota transfer failed. Is project enforcement enabled on the ldiskfs filesystem? rc = %d\n", + osd_ino2name(inode), rc); return rc; } } @@ -3155,12 +3243,21 @@ static int osd_mkfile(struct osd_thread_info *info, struct osd_object *obj, struct osd_thandle *oth; struct dt_object *parent = NULL; struct inode *inode; - uid_t owner[2] = {0, 0}; + struct iattr iattr = { + .ia_valid = ATTR_UID | ATTR_GID | + ATTR_CTIME | ATTR_MTIME | ATTR_ATIME, + .ia_ctime.tv_sec = attr->la_ctime, + .ia_mtime.tv_sec = attr->la_mtime, + .ia_atime.tv_sec = attr->la_atime, + .ia_uid = GLOBAL_ROOT_UID, + .ia_gid = GLOBAL_ROOT_GID, + }; + const struct osd_timespec omit = { .tv_nsec = UTIME_OMIT }; if (attr->la_valid & LA_UID) - owner[0] = attr->la_uid; + iattr.ia_uid = make_kuid(&init_user_ns, attr->la_uid); if (attr->la_valid & LA_GID) - owner[1] = attr->la_gid; + iattr.ia_gid = make_kgid(&init_user_ns, attr->la_gid); LINVRNT(osd_invariant(obj)); LASSERT(obj->oo_inode == NULL); @@ -3180,10 +3277,18 @@ static int osd_mkfile(struct osd_thread_info *info, struct osd_object *obj, !dt_object_remote(hint->dah_parent)) parent = hint->dah_parent; + /* if a time component is not valid set it to UTIME_OMIT */ + if (!(attr->la_valid & LA_CTIME)) + iattr.ia_ctime = omit; + if (!(attr->la_valid & LA_MTIME)) + iattr.ia_mtime = omit; + if (!(attr->la_valid & LA_ATIME)) + iattr.ia_atime = omit; + inode = ldiskfs_create_inode(oth->ot_handle, parent ? osd_dt_obj(parent)->oo_inode : osd_sb(osd)->s_root->d_inode, - mode, owner); + mode, &iattr); if (!IS_ERR(inode)) { /* Do not update file c/mtime in ldiskfs. */ inode->i_flags |= S_NOCMTIME; @@ -3354,7 +3459,6 @@ static osd_obj_type_f osd_create_type_f(enum dt_format_type type) return result; } - static void osd_ah_init(const struct lu_env *env, struct dt_allocation_hint *ah, struct dt_object *parent, struct dt_object *child, umode_t child_mode) @@ -3470,6 +3574,9 @@ static int __osd_oi_insert(const struct lu_env *env, struct osd_object *obj, LASSERT(obj->oo_inode != NULL); + if (CFS_FAIL_CHECK(OBD_FAIL_OSD_OI_ENOSPC)) + return -ENOSPC; + oh = container_of(th, struct osd_thandle, ot_super); LASSERT(oh->ot_handle); osd_trans_exec_op(env, th, OSD_OT_INSERT); @@ -3530,6 +3637,8 @@ static int osd_declare_create(const struct lu_env *env, struct dt_object *dt, struct thandle *handle) { struct osd_thandle *oh; + struct super_block *sb = osd_sb(osd_dev(dt->do_lu.lo_dev)); + int credits; int rc; ENTRY; @@ -3544,10 +3653,23 @@ static int osd_declare_create(const struct lu_env *env, struct dt_object *dt, * vs. osd_mkreg: osd_mk_index will create 2 blocks for root_node and * leaf_node, could involves the block, block bitmap, groups, GDT * change for each block, so add 4 * 2 credits in that case. + * + * The default ACL initialization may consume an additional 16 blocks */ - osd_trans_declare_op(env, oh, OSD_OT_CREATE, - osd_dto_credits_noquota[DTO_OBJECT_CREATE] + - (dof->dof_type == DFT_INDEX) ? 4 * 2 : 0); + credits = osd_dto_credits_noquota[DTO_OBJECT_CREATE] + + ((dof->dof_type == DFT_INDEX) ? 4 * 2 : 0); + + /** + * While ldiskfs_new_inode() calls ldiskfs_init_acl() we have to add + * credits for possible default ACL creation in new inode + */ + if (hint && hint->dah_acl_len) + credits += osd_calc_bkmap_credits(sb, NULL, 0, -1, + (hint->dah_acl_len + sb->s_blocksize - 1) >> + sb->s_blocksize_bits); + + osd_trans_declare_op(env, oh, OSD_OT_CREATE, credits); + /* * Reuse idle OI block may cause additional one OI block * to be changed. @@ -3661,8 +3783,10 @@ static int osd_destroy(const struct lu_env *env, struct dt_object *dt, } if (S_ISDIR(inode->i_mode)) { - LASSERT(osd_inode_unlinked(inode) || inode->i_nlink == 1 || - inode->i_nlink == 2); + if (inode->i_nlink > 2) + CERROR("%s: directory "DFID" ino %lu link count is %u at unlink. run e2fsck to repair\n", + osd_name(osd), PFID(fid), inode->i_ino, + inode->i_nlink); spin_lock(&obj->oo_guard); clear_nlink(inode); @@ -3826,6 +3950,9 @@ static int osd_add_dot_dotdot_internal(struct osd_thread_info *info, __u32 saved_nlink = dir->i_nlink; int rc; + if (OBD_FAIL_CHECK(OBD_FAIL_OSD_DOTDOT_ENOSPC)) + return -ENOSPC; + dot_dot_ldp = (struct ldiskfs_dentry_param *)info->oti_ldp2; osd_get_ldiskfs_dirent_param(dot_dot_ldp, dot_dot_fid); @@ -3866,6 +3993,15 @@ static struct inode *osd_create_local_agent_inode(const struct lu_env *env, struct osd_thread_info *info = osd_oti_get(env); struct inode *local; struct osd_thandle *oh; + struct iattr iattr = { + .ia_valid = ATTR_UID | ATTR_GID | + ATTR_CTIME | ATTR_MTIME | ATTR_ATIME, + .ia_ctime.tv_nsec = UTIME_OMIT, + .ia_mtime.tv_nsec = UTIME_OMIT, + .ia_atime.tv_nsec = UTIME_OMIT, + .ia_uid = GLOBAL_ROOT_UID, + .ia_gid = GLOBAL_ROOT_GID, + }; int rc; ENTRY; @@ -3874,8 +4010,8 @@ static struct inode *osd_create_local_agent_inode(const struct lu_env *env, oh = container_of(th, struct osd_thandle, ot_super); LASSERT(oh->ot_handle->h_transaction != NULL); - local = ldiskfs_create_inode(oh->ot_handle, pobj->oo_inode, type, - NULL); + local = ldiskfs_create_inode(oh->ot_handle, pobj->oo_inode, + type, &iattr); if (IS_ERR(local)) { CERROR("%s: create local error %d\n", osd_name(osd), (int)PTR_ERR(local)); @@ -3895,8 +4031,8 @@ static struct inode *osd_create_local_agent_inode(const struct lu_env *env, */ if (S_ISLNK(type)) { BUILD_BUG_ON(LDISKFS_N_BLOCKS * 4 < FID_LEN + 1); - rc = snprintf((char *)LDISKFS_I(local)->i_data, - LDISKFS_N_BLOCKS * 4, DFID, PFID(fid)); + rc = scnprintf((char *)LDISKFS_I(local)->i_data, + LDISKFS_N_BLOCKS * 4, DFID, PFID(fid)); i_size_write(local, rc); LDISKFS_I(local)->i_disksize = rc; @@ -3909,9 +4045,8 @@ static struct inode *osd_create_local_agent_inode(const struct lu_env *env, i_projid_read(pobj->oo_inode) != 0) { rc = osd_transfer_project(local, 0, th); if (rc) { - CERROR("%s: quota transfer failed: rc = %d. Is project " - "quota enforcement enabled on the ldiskfs " - "filesystem?\n", local->i_sb->s_id, rc); + CERROR("%s: quota transfer failed:. Is project quota enforcement enabled on the ldiskfs filesystem? rc = %d\n", + osd_ino2name(local), rc); RETURN(ERR_PTR(rc)); } } @@ -4075,8 +4210,21 @@ static int osd_create(const struct lu_env *env, struct dt_object *dt, obj->oo_dt.do_body_ops = &osd_body_ops; } - if (!result && !CFS_FAIL_CHECK(OBD_FAIL_OSD_NO_OI_ENTRY)) + if (!result && !CFS_FAIL_CHECK(OBD_FAIL_OSD_NO_OI_ENTRY)) { + struct inode *inode = obj->oo_inode; + result = __osd_oi_insert(env, obj, fid, th); + if (result && inode) { + spin_lock(&obj->oo_guard); + clear_nlink(inode); + spin_unlock(&obj->oo_guard); + osd_dirty_inode(inode, I_DIRTY_DATASYNC); + ldiskfs_set_inode_state(inode, + LDISKFS_STATE_LUSTRE_DESTROY); + iput(inode); + obj->oo_inode = NULL; + } + } /* * a small optimization - dt_insert() isn't usually applied @@ -4100,6 +4248,7 @@ static int osd_declare_ref_add(const struct lu_env *env, struct dt_object *dt, struct thandle *handle) { struct osd_thandle *oh; + int rc; /* it's possible that object doesn't exist yet */ LASSERT(handle != NULL); @@ -4110,9 +4259,10 @@ static int osd_declare_ref_add(const struct lu_env *env, struct dt_object *dt, osd_trans_declare_op(env, oh, OSD_OT_REF_ADD, osd_dto_credits_noquota[DTO_ATTR_SET_BASE]); - osd_idc_find_and_init(env, osd_dev(dt->do_lu.lo_dev), osd_dt_obj(dt)); + rc = osd_idc_find_and_init(env, osd_dev(dt->do_lu.lo_dev), + osd_dt_obj(dt)); - return 0; + return rc; } /* @@ -4210,6 +4360,9 @@ static int osd_ref_del(const struct lu_env *env, struct dt_object *dt, LASSERT(osd_is_write_locked(env, obj)); LASSERT(th != NULL); + if (OBD_FAIL_CHECK(OBD_FAIL_OSD_REF_DEL)) + return -EIO; + oh = container_of(th, struct osd_thandle, ot_super); LASSERT(oh->ot_handle != NULL); @@ -4502,8 +4655,8 @@ static int osd_xattr_set_pfid(const struct lu_env *env, struct osd_object *obj, dquot_initialize(inode); rc = ll_vfs_removexattr(dentry, inode, XATTR_NAME_FID); if (rc == -ENODATA) { - if ((fl & LU_XATTR_REPLACE) && !(fl & LU_XATTR_CREATE)) - RETURN(rc); + /* XATTR_NAME_FID is already absent */ + rc = 0; } else if (rc) { RETURN(rc); } @@ -4824,20 +4977,11 @@ static int osd_object_sync(const struct lu_env *env, struct dt_object *dt, { struct osd_object *obj = osd_dt_obj(dt); struct inode *inode = obj->oo_inode; - struct osd_thread_info *info = osd_oti_get(env); - struct dentry *dentry = &info->oti_obj_dentry; - struct file *file = &info->oti_file; + struct file *file = osd_quasi_file(env, inode); int rc; ENTRY; - dentry->d_inode = inode; - dentry->d_sb = inode->i_sb; - file->f_path.dentry = dentry; - file->f_mapping = inode->i_mapping; - file->f_op = inode->i_fop; - file->f_inode = inode; - rc = vfs_fsync_range(file, start, end, 0); RETURN(rc); @@ -4848,6 +4992,10 @@ static int osd_invalidate(const struct lu_env *env, struct dt_object *dt) return 0; } +static bool osd_check_stale(struct dt_object *dt) +{ + return false; +} /* * Index operations. */ @@ -5036,6 +5184,7 @@ static const struct dt_object_operations osd_obj_ops = { .do_xattr_list = osd_xattr_list, .do_object_sync = osd_object_sync, .do_invalidate = osd_invalidate, + .do_check_stale = osd_check_stale, }; static const struct dt_object_operations osd_obj_otable_it_ops = { @@ -5217,6 +5366,50 @@ static void osd_take_care_of_agent(const struct lu_env *env, } /** + * Utility function to get real name from object name + * + * \param[in] obj pointer to the object to be handled + * \param[in] name object name + * \param[in] len object name len + * \param[out]ln pointer to the struct lu_name to hold the real name + * + * If file is not encrypted, real name is just the object name. + * If file is encrypted, object name needs to be decoded. In + * this case a new buffer is allocated, and ln->ln_name needs to be freed by + * the caller. + * + * \retval 0, on success + * \retval -ve, on error + */ +static int obj_name2lu_name(struct osd_object *obj, const char *name, + int len, struct lu_name *ln) +{ + struct lu_fid namefid; + + fid_zero(&namefid); + + if (name[0] == '[') + sscanf(name + 1, SFID, RFID(&namefid)); + + if (fid_is_sane(&namefid) || name_is_dot_or_dotdot(name, len) || + !(obj->oo_lma_flags & LUSTRE_ENCRYPT_FL)) { + ln->ln_name = name; + ln->ln_namelen = len; + } else { + char *buf = kmalloc(len, GFP_NOFS); + + if (!buf) + return -ENOMEM; + + len = critical_decode(name, len, buf); + ln->ln_name = buf; + ln->ln_namelen = len; + } + + return 0; +} + +/** * Index delete function for interoperability mode (b11826). * It will remove the directory entry added by osd_index_ea_insert(). * This entry is needed to maintain name->fid mapping. @@ -5237,6 +5430,7 @@ static int osd_index_ea_delete(const struct lu_env *env, struct dt_object *dt, struct buffer_head *bh; struct htree_lock *hlock = NULL; struct osd_device *osd = osd_dev(dt->do_lu.lo_dev); + struct lu_name ln; int rc; ENTRY; @@ -5248,6 +5442,10 @@ static int osd_index_ea_delete(const struct lu_env *env, struct dt_object *dt, LASSERT(!dt_object_remote(dt)); LASSERT(handle != NULL); + rc = obj_name2lu_name(obj, (char *)key, strlen((char *)key), &ln); + if (rc) + RETURN(rc); + osd_trans_exec_op(env, handle, OSD_OT_DELETE); oh = container_of(handle, struct osd_thandle, ot_super); @@ -5255,8 +5453,7 @@ static int osd_index_ea_delete(const struct lu_env *env, struct dt_object *dt, LASSERT(oh->ot_handle->h_transaction != NULL); dquot_initialize(dir); - dentry = osd_child_dentry_get(env, obj, - (char *)key, strlen((char *)key)); + dentry = osd_child_dentry_get(env, obj, ln.ln_name, ln.ln_namelen); if (obj->oo_hl_head != NULL) { hlock = osd_oti_get(env)->oti_hlock; @@ -5310,6 +5507,8 @@ static int osd_index_ea_delete(const struct lu_env *env, struct dt_object *dt, out: LASSERT(osd_invariant(obj)); osd_trans_exec_check(env, handle, OSD_OT_DELETE); + if (ln.ln_name != (char *)key) + kfree(ln.ln_name); RETURN(rc); } @@ -5484,6 +5683,7 @@ static int __osd_ea_add_rec(struct osd_thread_info *info, struct ldiskfs_dentry_param *ldp; struct dentry *child; struct osd_thandle *oth; + struct lu_name ln; int rc; oth = container_of(th, struct osd_thandle, ot_super); @@ -5491,13 +5691,17 @@ static int __osd_ea_add_rec(struct osd_thread_info *info, LASSERT(oth->ot_handle->h_transaction != NULL); LASSERT(pobj->oo_inode); + rc = obj_name2lu_name(pobj, name, strlen(name), &ln); + if (rc) + RETURN(rc); + ldp = (struct ldiskfs_dentry_param *)info->oti_ldp; - if (unlikely(pobj->oo_inode == - osd_sb(osd_obj2dev(pobj))->s_root->d_inode)) + if (unlikely(osd_object_is_root(pobj))) ldp->edp_magic = 0; else osd_get_ldiskfs_dirent_param(ldp, fid); - child = osd_child_dentry_get(info->oti_env, pobj, name, strlen(name)); + child = osd_child_dentry_get(info->oti_env, pobj, + ln.ln_name, ln.ln_namelen); child->d_fsdata = (void *)ldp; dquot_initialize(pobj->oo_inode); rc = osd_ldiskfs_add_entry(info, osd_obj2dev(pobj), oth->ot_handle, @@ -5526,6 +5730,8 @@ static int __osd_ea_add_rec(struct osd_thread_info *info, } } + if (ln.ln_name != name) + kfree(ln.ln_name); RETURN(rc); } @@ -5653,30 +5859,20 @@ static int osd_ea_add_rec(const struct lu_env *env, struct osd_object *pobj, } static int -osd_consistency_check(struct osd_thread_info *oti, struct osd_device *dev, - struct osd_idmap_cache *oic) +osd_ldiskfs_consistency_check(struct osd_thread_info *oti, + struct osd_device *dev, + const struct lu_fid *fid, + struct osd_inode_id *id) { struct lustre_scrub *scrub = &dev->od_scrub.os_scrub; - struct lu_fid *fid = &oic->oic_fid; - struct osd_inode_id *id = &oic->oic_lid; struct inode *inode = NULL; int once = 0; bool insert; int rc; ENTRY; - - if (!fid_is_norm(fid) && !fid_is_igif(fid)) - RETURN(0); - - if (scrub->os_running && scrub->os_pos_current > id->oii_ino) - RETURN(0); - - if (dev->od_auto_scrub_interval == AS_NEVER || - ktime_get_real_seconds() < - scrub->os_file.sf_time_last_complete + dev->od_auto_scrub_interval) + if (!scrub_needs_check(scrub, fid, id->oii_ino)) RETURN(0); - again: rc = osd_oi_lookup(oti, dev, fid, &oti->oti_id, 0); if (rc == -ENOENT) { @@ -5721,7 +5917,7 @@ trigger: } } - rc = osd_oii_insert(dev, oic, insert); + rc = osd_oii_insert(dev, fid, id, insert); /* * There is race condition between osd_oi_lookup and OI scrub. * The OI scrub finished just after osd_oi_lookup() failure. @@ -5734,18 +5930,18 @@ trigger: if (!S_ISDIR(inode->i_mode)) rc = 0; else - rc = osd_check_lmv(oti, dev, inode, oic); + rc = osd_check_lmv(oti, dev, inode); GOTO(out, rc); } - if (dev->od_auto_scrub_interval != AS_NEVER && ++once == 1) { + if (dev->od_scrub.os_scrub.os_auto_scrub_interval != AS_NEVER && + ++once == 1) { rc = osd_scrub_start(oti->oti_env, dev, SS_AUTO_PARTIAL | SS_CLEAR_DRYRUN | SS_CLEAR_FAILOUT); - CDEBUG(D_LFSCK | D_CONSOLE | D_WARNING, - "%s: trigger partial OI scrub for RPC inconsistency " - "checking FID "DFID": rc = %d\n", - osd_dev2name(dev), PFID(fid), rc); + CDEBUG_LIMIT(D_LFSCK | D_CONSOLE | D_WARNING, + "%s: trigger partial OI scrub for RPC inconsistency, checking FID "DFID"/%u: rc = %d\n", + osd_dev2name(dev), PFID(fid), id->oii_ino, rc); if (rc == 0 || rc == -EALREADY) goto again; } @@ -5753,18 +5949,17 @@ trigger: GOTO(out, rc); out: - if (inode) - iput(inode); + iput(inode); RETURN(rc); } static int osd_fail_fid_lookup(struct osd_thread_info *oti, struct osd_device *dev, - struct osd_idmap_cache *oic, struct lu_fid *fid, __u32 ino) { struct lustre_ost_attrs *loa = &oti->oti_ost_attrs; + struct osd_idmap_cache *oic = &oti->oti_cache; struct inode *inode; int rc; @@ -5932,6 +6127,7 @@ static int osd_ea_lookup_rec(const struct lu_env *env, struct osd_object *obj, struct buffer_head *bh; struct lu_fid *fid = (struct lu_fid *)rec; struct htree_lock *hlock = NULL; + struct lu_name ln; int ino; int rc; @@ -5940,8 +6136,11 @@ static int osd_ea_lookup_rec(const struct lu_env *env, struct osd_object *obj, LASSERT(dir->i_op != NULL); LASSERT(dir->i_op->lookup != NULL); - dentry = osd_child_dentry_get(env, obj, - (char *)key, strlen((char *)key)); + rc = obj_name2lu_name(obj, (char *)key, strlen((char *)key), &ln); + if (rc) + RETURN(rc); + + dentry = osd_child_dentry_get(env, obj, ln.ln_name, ln.ln_namelen); if (obj->oo_hl_head != NULL) { hlock = osd_oti_get(env)->oti_hlock; @@ -5955,13 +6154,12 @@ static int osd_ea_lookup_rec(const struct lu_env *env, struct osd_object *obj, if (!IS_ERR(bh)) { struct osd_thread_info *oti = osd_oti_get(env); struct osd_inode_id *id = &oti->oti_id; - struct osd_idmap_cache *oic = &oti->oti_cache; struct osd_device *dev = osd_obj2dev(obj); ino = le32_to_cpu(de->inode); if (OBD_FAIL_CHECK(OBD_FAIL_FID_LOOKUP)) { brelse(bh); - rc = osd_fail_fid_lookup(oti, dev, oic, fid, ino); + rc = osd_fail_fid_lookup(oti, dev, fid, ino); GOTO(out, rc); } @@ -5989,19 +6187,24 @@ static int osd_ea_lookup_rec(const struct lu_env *env, struct osd_object *obj, osd_id_gen(id, ino, OSD_OII_NOGEN); } - if (rc != 0 || osd_remote_fid(env, dev, fid)) { - fid_zero(&oic->oic_fid); - + if (rc != 0 || osd_remote_fid(env, dev, fid)) GOTO(out, rc); - } - osd_add_oi_cache(osd_oti_get(env), osd_obj2dev(obj), id, fid); - rc = osd_consistency_check(oti, dev, oic); - if (rc == -ENOENT) - fid_zero(&oic->oic_fid); - else + rc = osd_ldiskfs_consistency_check(oti, dev, fid, id); + if (rc != -ENOENT) { /* Other error should not affect lookup result. */ rc = 0; + + /* Normal file mapping should be added into OI cache + * after FID in LMA check, but for local files like + * hsm_actions, their FIDs are not stored in OI files, + * see osd_initial_OI_scrub(), and here is the only + * place to load mapping into OI cache. + */ + if (!fid_is_namespace_visible(fid)) + osd_add_oi_cache(osd_oti_get(env), + osd_obj2dev(obj), id, fid); + } } else { rc = PTR_ERR(bh); } @@ -6013,7 +6216,9 @@ out: ldiskfs_htree_unlock(hlock); else up_read(&obj->oo_ext_idx_sem); - return rc; + if (ln.ln_name != (char *)key) + kfree(ln.ln_name); + RETURN(rc); } static int osd_index_declare_ea_insert(const struct lu_env *env, @@ -6087,7 +6292,7 @@ static int osd_index_declare_ea_insert(const struct lu_env *env, i_projid_read(inode) != 0) rc = osd_declare_attr_qid(env, osd_dt_obj(dt), oh, 0, i_projid_read(inode), - 0, false, PRJQUOTA, true); + 0, false, PRJQUOTA); #endif } @@ -6478,36 +6683,23 @@ static const struct dt_index_operations osd_index_iam_ops = { } }; - -/** - * Creates or initializes iterator context. - * - * \retval struct osd_it_ea, iterator structure on success - * - */ -static struct dt_it *osd_it_ea_init(const struct lu_env *env, - struct dt_object *dt, - __u32 attr) +struct osd_it_ea *osd_it_dir_init(const struct lu_env *env, + struct inode *inode, __u32 attr) { - struct osd_object *obj = osd_dt_obj(dt); struct osd_thread_info *info = osd_oti_get(env); struct osd_it_ea *oie; struct file *file; - struct lu_object *lo = &dt->do_lu; struct dentry *obj_dentry; ENTRY; - if (!dt_object_exists(dt) || obj->oo_destroyed) - RETURN(ERR_PTR(-ENOENT)); - OBD_SLAB_ALLOC_PTR_GFP(oie, osd_itea_cachep, GFP_NOFS); if (oie == NULL) RETURN(ERR_PTR(-ENOMEM)); obj_dentry = &oie->oie_dentry; - obj_dentry->d_inode = obj->oo_inode; - obj_dentry->d_sb = osd_sb(osd_obj2dev(obj)); + obj_dentry->d_inode = inode; + obj_dentry->d_sb = inode->i_sb; obj_dentry->d_name.hash = 0; oie->oie_rd_dirent = 0; @@ -6521,7 +6713,7 @@ static struct dt_it *osd_it_ea_init(const struct lu_env *env, if (oie->oie_buf == NULL) RETURN(ERR_PTR(-ENOMEM)); } - oie->oie_obj = obj; + oie->oie_obj = NULL; file = &oie->oie_file; @@ -6531,14 +6723,56 @@ static struct dt_it *osd_it_ea_init(const struct lu_env *env, else file->f_mode = FMODE_32BITHASH; file->f_path.dentry = obj_dentry; - file->f_mapping = obj->oo_inode->i_mapping; - file->f_op = obj->oo_inode->i_fop; - file->f_inode = obj->oo_inode; + file->f_mapping = inode->i_mapping; + file->f_op = inode->i_fop; + file->f_inode = inode; + RETURN(oie); +} + +/** + * Creates or initializes iterator context. + * + * \retval struct osd_it_ea, iterator structure on success + * + */ +static struct dt_it *osd_it_ea_init(const struct lu_env *env, + struct dt_object *dt, + __u32 attr) +{ + struct osd_object *obj = osd_dt_obj(dt); + struct lu_object *lo = &dt->do_lu; + struct osd_it_ea *oie; + + ENTRY; + + if (!dt_object_exists(dt) || obj->oo_destroyed) + RETURN(ERR_PTR(-ENOENT)); + + oie = osd_it_dir_init(env, obj->oo_inode, attr); + if (IS_ERR(oie)) + RETURN((struct dt_it *)oie); + + oie->oie_obj = obj; lu_object_get(lo); RETURN((struct dt_it *)oie); } +void osd_it_dir_fini(const struct lu_env *env, struct osd_it_ea *oie, + struct inode *inode) +{ + struct osd_thread_info *info = osd_oti_get(env); + + ENTRY; + oie->oie_file.f_op->release(inode, &oie->oie_file); + if (unlikely(oie->oie_buf != info->oti_it_ea_buf)) + OBD_FREE(oie->oie_buf, OSD_IT_EA_BUFSIZE); + else + info->oti_it_ea_buf_used = 0; + OBD_SLAB_FREE_PTR(oie, osd_itea_cachep); + EXIT; +} + /** * Destroy or finishes iterator context. * @@ -6546,19 +6780,13 @@ static struct dt_it *osd_it_ea_init(const struct lu_env *env, */ static void osd_it_ea_fini(const struct lu_env *env, struct dt_it *di) { - struct osd_thread_info *info = osd_oti_get(env); struct osd_it_ea *oie = (struct osd_it_ea *)di; struct osd_object *obj = oie->oie_obj; struct inode *inode = obj->oo_inode; ENTRY; - oie->oie_file.f_op->release(inode, &oie->oie_file); + osd_it_dir_fini(env, (struct osd_it_ea *)di, inode); osd_object_put(env, obj); - if (unlikely(oie->oie_buf != info->oti_it_ea_buf)) - OBD_FREE(oie->oie_buf, OSD_IT_EA_BUFSIZE); - else - info->oti_it_ea_buf_used = 0; - OBD_SLAB_FREE_PTR(oie, osd_itea_cachep); EXIT; } @@ -6622,6 +6850,7 @@ static int osd_ldiskfs_filldir(void *buf, struct osd_it_ea_dirent *ent = it->oie_dirent; struct lu_fid *fid = &ent->oied_fid; struct osd_fid_pack *rec; + struct lu_fid namefid; ENTRY; @@ -6635,9 +6864,12 @@ static int osd_ldiskfs_filldir(void *buf, OSD_IT_EA_BUFSIZE) RETURN(1); + fid_zero(&namefid); + /* "." is just the object itself. */ if (namelen == 1 && name[0] == '.') { - *fid = obj->oo_dt.do_lu.lo_header->loh_fid; + if (obj != NULL) + *fid = obj->oo_dt.do_lu.lo_header->loh_fid; } else if (d_type & LDISKFS_DIRENT_LUFID) { rec = (struct osd_fid_pack *)(name + namelen + 1); if (osd_fid_unpack(fid, rec) != 0) @@ -6648,18 +6880,34 @@ static int osd_ldiskfs_filldir(void *buf, d_type &= ~LDISKFS_DIRENT_LUFID; /* NOT export local root. */ - if (unlikely(osd_sb(osd_obj2dev(obj))->s_root->d_inode->i_ino == ino)) { + if (obj != NULL && + unlikely(osd_sb(osd_obj2dev(obj))->s_root->d_inode->i_ino == ino)) { ino = obj->oo_inode->i_ino; *fid = obj->oo_dt.do_lu.lo_header->loh_fid; } + if (name[0] == '[') + sscanf(name + 1, SFID, RFID(&namefid)); + if (fid_is_sane(&namefid) || name_is_dot_or_dotdot(name, namelen) || + !obj || !(obj->oo_lma_flags & LUSTRE_ENCRYPT_FL)) { + memcpy(ent->oied_name, name, namelen); + } else { + int presented_len = critical_chars(name, namelen); + + if (presented_len == namelen) + memcpy(ent->oied_name, name, namelen); + else + namelen = critical_encode(name, namelen, + ent->oied_name); + + ent->oied_name[namelen] = '\0'; + } + ent->oied_ino = ino; ent->oied_off = offset; ent->oied_namelen = namelen; ent->oied_type = d_type; - memcpy(ent->oied_name, name, namelen); - it->oie_rd_dirent++; it->oie_dirent = (void *)ent + cfs_size_round(sizeof(*ent) + namelen); RETURN(0); @@ -6675,12 +6923,10 @@ static int osd_ldiskfs_filldir(void *buf, * \retval -ve on error * \retval +1 reach the end of entry */ -static int osd_ldiskfs_it_fill(const struct lu_env *env, - const struct dt_it *di) +int osd_ldiskfs_it_fill(const struct lu_env *env, const struct dt_it *di) { struct osd_it_ea *it = (struct osd_it_ea *)di; struct osd_object *obj = it->oie_obj; - struct inode *inode = obj->oo_inode; struct htree_lock *hlock = NULL; struct file *filp = &it->oie_file; int rc = 0; @@ -6693,29 +6939,27 @@ static int osd_ldiskfs_it_fill(const struct lu_env *env, it->oie_dirent = it->oie_buf; it->oie_rd_dirent = 0; - if (obj->oo_hl_head != NULL) { - hlock = osd_oti_get(env)->oti_hlock; - ldiskfs_htree_lock(hlock, obj->oo_hl_head, - inode, LDISKFS_HLOCK_READDIR); - } else { - down_read(&obj->oo_ext_idx_sem); + if (obj) { + if (obj->oo_hl_head != NULL) { + hlock = osd_oti_get(env)->oti_hlock; + ldiskfs_htree_lock(hlock, obj->oo_hl_head, + obj->oo_inode, + LDISKFS_HLOCK_READDIR); + } else { + down_read(&obj->oo_ext_idx_sem); + } } filp->f_cred = current_cred(); rc = osd_security_file_alloc(filp); if (rc) - RETURN(rc); + GOTO(unlock, rc); filp->f_flags |= O_NOATIME; filp->f_mode |= FMODE_NONOTIFY; rc = iterate_dir(filp, &buf.ctx); if (rc) - RETURN(rc); - - if (hlock != NULL) - ldiskfs_htree_unlock(hlock); - else - up_read(&obj->oo_ext_idx_sem); + GOTO(unlock, rc); if (it->oie_rd_dirent == 0) { /* @@ -6729,6 +6973,13 @@ static int osd_ldiskfs_it_fill(const struct lu_env *env, it->oie_dirent = it->oie_buf; it->oie_it_dirent = 1; } +unlock: + if (obj) { + if (hlock != NULL) + ldiskfs_htree_unlock(hlock); + else + up_read(&obj->oo_ext_idx_sem); + } RETURN(rc); } @@ -6877,11 +7128,10 @@ osd_dirent_reinsert(const struct lu_env *env, struct osd_device *dev, * That means we lose it! */ if (rc != 0) - CDEBUG(D_LFSCK, "%s: fail to reinsert the dirent, " - "dir = %lu/%u, name = %.*s, "DFID": rc = %d\n", - osd_ino2name(inode), - dir->i_ino, dir->i_generation, namelen, - dentry->d_name.name, PFID(fid), rc); + CDEBUG(D_LFSCK, + "%s: fail to reinsert the dirent, dir = %lu/%u, name = %.*s, "DFID": rc = %d\n", + osd_ino2name(inode), dir->i_ino, dir->i_generation, + namelen, dentry->d_name.name, PFID(fid), rc); RETURN(rc); } @@ -6909,6 +7159,7 @@ osd_dirent_check_repair(const struct lu_env *env, struct osd_object *obj, int rc; bool dotdot = false; bool dirty = false; + struct lu_name ln; ENTRY; @@ -6943,8 +7194,11 @@ osd_dirent_check_repair(const struct lu_env *env, struct osd_object *obj, RETURN(rc); } - dentry = osd_child_dentry_by_inode(env, dir, ent->oied_name, - ent->oied_namelen); + rc = obj_name2lu_name(obj, ent->oied_name, ent->oied_namelen, &ln); + if (rc) + RETURN(rc); + + dentry = osd_child_dentry_by_inode(env, dir, ln.ln_name, ln.ln_namelen); rc = osd_get_lma(info, inode, dentry, &info->oti_ost_attrs); if (rc == -ENODATA || !fid_is_sane(&lma->lma_self_fid)) lma = NULL; @@ -7217,6 +7471,8 @@ out_inode: iput(inode); if (rc >= 0 && !dirty) dev->od_dirent_journal = 0; + if (ln.ln_name != ent->oied_name) + kfree(ln.ln_name); return rc; } @@ -7292,8 +7548,6 @@ static inline int osd_it_ea_rec(const struct lu_env *env, rc = osd_ea_fid_get(env, obj, ino, fid, id); } - } else { - osd_id_gen(id, ino, OSD_OII_NOGEN); } } @@ -7303,15 +7557,6 @@ static inline int osd_it_ea_rec(const struct lu_env *env, it->oie_dirent->oied_namelen, it->oie_dirent->oied_type, attr); - if (rc < 0) - RETURN(rc); - - if (osd_remote_fid(env, dev, fid)) - RETURN(0); - - if (likely(!(attr & (LUDA_IGNORE | LUDA_UNKNOWN)) && rc == 0)) - osd_add_oi_cache(oti, dev, id, fid); - RETURN(rc > 0 ? 0 : rc); } @@ -7606,6 +7851,8 @@ static void osd_umount(const struct lu_env *env, struct osd_device *o) if (o->od_mnt != NULL) { shrink_dcache_sb(osd_sb(o)); osd_sync(env, &o->od_dt_dev); + wait_event(o->od_commit_cb_done, + !atomic_read(&o->od_commit_cb_in_flight)); mntput(o->od_mnt); o->od_mnt = NULL; @@ -7614,17 +7861,31 @@ static void osd_umount(const struct lu_env *env, struct osd_device *o) EXIT; } +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 53, 0) +# ifndef LDISKFS_HAS_INCOMPAT_FEATURE +/* Newer kernels provide the ldiskfs_set_feature_largedir() wrapper already, + * which calls ldiskfs_update_dynamic_rev() to update ancient filesystems. + * All ldiskfs filesystems are already v2, so it is a no-op and unnecessary. + * This avoids maintaining patches to export this otherwise-useless function. + */ +void ldiskfs_update_dynamic_rev(struct super_block *sb) +{ + /* do nothing */ +} +# endif +#endif + static int osd_mount(const struct lu_env *env, struct osd_device *o, struct lustre_cfg *cfg) { const char *name = lustre_cfg_string(cfg, 0); const char *dev = lustre_cfg_string(cfg, 1); const char *opts; - unsigned long page, s_flags, lmd_flags = 0; + unsigned long page, s_flags = 0, lmd_flags = 0; struct page *__page; struct file_system_type *type; char *options = NULL; - char *str; + const char *str; struct osd_thread_info *info = osd_oti_get(env); struct lu_fid *fid = &info->oti_fid; struct inode *inode; @@ -7639,11 +7900,9 @@ static int osd_mount(const struct lu_env *env, RETURN(-E2BIG); strcpy(o->od_mntdev, dev); - str = lustre_cfg_string(cfg, 2); - s_flags = simple_strtoul(str, NULL, 0); - str = strstr(str, ":"); - if (str) - lmd_flags = simple_strtoul(str + 1, NULL, 0); + str = lustre_cfg_buf(cfg, 2); + sscanf(str, "%lu:%lu", &s_flags, &lmd_flags); + opts = lustre_cfg_string(cfg, 3); #ifdef __BIG_ENDIAN if (opts == NULL || strstr(opts, "bigendian_extents") == NULL) { @@ -7693,6 +7952,7 @@ static int osd_mount(const struct lu_env *env, "force_over_256tb", "force_over_512tb", "force_over_1024tb", + "resetoi", NULL }; strncat(options, opts, PAGE_SIZE); @@ -7748,7 +8008,7 @@ static int osd_mount(const struct lu_env *env, } if (lmd_flags & LMD_FLG_DEV_RDONLY) { - LCONSOLE_WARN("%s: not support dev_rdonly on this device", + LCONSOLE_WARN("%s: not support dev_rdonly on this device\n", name); GOTO(out_mnt, rc = -EOPNOTSUPP); @@ -7759,6 +8019,7 @@ static int osd_mount(const struct lu_env *env, GOTO(out_mnt, rc = -EINVAL); } +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 53, 0) #ifdef LDISKFS_MOUNT_DIRDATA if (ldiskfs_has_feature_dirdata(o->od_mnt->mnt_sb)) LDISKFS_SB(osd_sb(o))->s_mount_opt |= LDISKFS_MOUNT_DIRDATA; @@ -7768,6 +8029,15 @@ static int osd_mount(const struct lu_env *env, "downgrade to Lustre-1.x again, you can enable it via " "'tune2fs -O dirdata device'\n", name, dev); #endif + /* enable large_dir on MDTs to avoid REMOTE_PARENT_DIR overflow, + * and on very large OSTs to avoid object directory overflow */ + if (unlikely(!ldiskfs_has_feature_largedir(o->od_mnt->mnt_sb) && + !strstr(name, "MGS"))) { + ldiskfs_set_feature_largedir(o->od_mnt->mnt_sb); + LCONSOLE_INFO("%s: enabled 'large_dir' feature on device %s\n", + name, dev); + } +#endif inode = osd_sb(o)->s_root->d_inode; lu_local_obj_fid(fid, OSD_FS_ROOT_OID); rc = osd_ea_fid_set(info, inode, fid, LMAC_NOT_IN_OI, 0); @@ -7777,7 +8047,7 @@ static int osd_mount(const struct lu_env *env, } if (lmd_flags & LMD_FLG_NOSCRUB) - o->od_auto_scrub_interval = AS_NEVER; + o->od_scrub.os_scrub.os_auto_scrub_interval = AS_NEVER; if (blk_queue_nonrot(bdev_get_queue(osd_sb(o)->s_bdev))) { /* do not use pagecache with flash-backed storage */ @@ -7810,6 +8080,8 @@ static struct lu_device *osd_device_fini(const struct lu_env *env, osd_procfs_fini(o); if (o->od_oi_table != NULL) osd_oi_fini(osd_oti_get(env), o); + if (o->od_extent_bytes_percpu) + free_percpu(o->od_extent_bytes_percpu); osd_obj_map_fini(o); osd_umount(env, o); @@ -7822,8 +8094,10 @@ static int osd_device_init0(const struct lu_env *env, { struct lu_device *l = osd2lu_dev(o); struct osd_thread_info *info; - int rc; int cplen = 0; + char *opts = NULL; + bool restored = false; + int rc; /* if the module was re-loaded, env can loose its keys */ rc = lu_env_refill((struct lu_env *)env); @@ -7843,13 +8117,16 @@ static int osd_device_init0(const struct lu_env *env, spin_lock_init(&o->od_lock); o->od_index_backup_policy = LIBP_NONE; o->od_t10_type = 0; + init_waitqueue_head(&o->od_commit_cb_done); o->od_read_cache = 1; o->od_writethrough_cache = 1; o->od_readcache_max_filesize = OSD_MAX_CACHE_SIZE; o->od_readcache_max_iosize = OSD_READCACHE_MAX_IO_MB << 20; o->od_writethrough_max_iosize = OSD_WRITECACHE_MAX_IO_MB << 20; - o->od_auto_scrub_interval = AS_DEFAULT; + o->od_scrub.os_scrub.os_auto_scrub_interval = AS_DEFAULT; + /* default fallocate to unwritten extents: LU-14326/LU-14333 */ + o->od_fallocate_zero_blocks = 0; cplen = strlcpy(o->od_svname, lustre_cfg_string(cfg, 4), sizeof(o->od_svname)); @@ -7887,10 +8164,14 @@ static int osd_device_init0(const struct lu_env *env, if (rc != 0) GOTO(out_site, rc); + opts = lustre_cfg_string(cfg, 3); + if (opts && strstr(opts, "resetoi")) + restored = true; + INIT_LIST_HEAD(&o->od_ios_list); /* setup scrub, including OI files initialization */ o->od_in_init = 1; - rc = osd_scrub_setup(env, o); + rc = osd_scrub_setup(env, o, restored); o->od_in_init = 0; if (rc < 0) GOTO(out_site, rc); @@ -7932,6 +8213,12 @@ static int osd_device_init0(const struct lu_env *env, GOTO(out_procfs, rc); } + o->od_extent_bytes_percpu = alloc_percpu(unsigned int); + if (!o->od_extent_bytes_percpu) { + rc = -ENOMEM; + GOTO(out_procfs, rc); + } + RETURN(0); out_procfs: @@ -8276,10 +8563,10 @@ static int __init osd_init(void) #ifdef CONFIG_KALLSYMS priv_security_file_alloc = - (void *)kallsyms_lookup_name("security_file_alloc"); + (void *)cfs_kallsyms_lookup_name("security_file_alloc"); #endif - rc = class_register_type(&osd_obd_device_ops, NULL, true, NULL, + rc = class_register_type(&osd_obd_device_ops, NULL, true, LUSTRE_OSD_LDISKFS_NAME, &osd_device_type); if (rc) { lu_kmem_fini(ldiskfs_caches);