From: Jinshan Xiong Date: Fri, 17 Feb 2017 02:09:33 +0000 (-0800) Subject: LU-2435 osd-zfs: use zfs native dnode accounting X-Git-Tag: 2.9.55~2 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=11afef00b6af407b8987076bd4f1ec9bc77eb75e LU-2435 osd-zfs: use zfs native dnode accounting If ZFS userobj_accounting feature exists, use it for osd-zfs object accounting; otherwise estimate object use by block space. Disable quota check in conf-sanity:32b and sanity-quota:7e if the underlying zfs version is below 0.7.0. For conf-sanity test, if the running ZFS is version 0.7.0 or higher, upgrade the zpool image to enable native dnode accounting. That ensures we also test the upgrade procedure. Test-Parameters: mdtfilesystemtype=zfs ostfilesystemtype=zfs testlist=sanity-quota,sanity-quota,sanity-quota,conf-sanity,conf-sanity envdefinitions=SLOW=yes Change-Id: I0fe47fd0e3d787663a4c72fd708678827fdfb474 Signed-off-by: Jinshan Xiong Reviewed-on: https://review.whamcloud.com/15294 Tested-by: Jenkins Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Alex Zhuravlev Reviewed-by: Oleg Drokin --- diff --git a/config/lustre-build-zfs.m4 b/config/lustre-build-zfs.m4 index ff7f115..ded7085 100644 --- a/config/lustre-build-zfs.m4 +++ b/config/lustre-build-zfs.m4 @@ -503,6 +503,18 @@ your distribution. AC_DEFINE(HAVE_DMU_PREFETCH_6ARG, 1, [Have 6 argument dmu_pretch in ZFS]) ]) + dnl # + dnl # ZFS 0.7.0 feature: SPA_FEATURE_USEROBJ_ACCOUNTING + dnl # + LB_CHECK_COMPILE([if zfs has native dnode accounting supported], + dmu_objset_userobjspace_upgrade, [ + #include + ],[ + dmu_objset_userobjspace_upgrade(NULL); + ],[ + AC_DEFINE(HAVE_DMU_USEROBJ_ACCOUNTING, 1, + [Have native dnode accounting in ZFS]) + ]) ]) AM_CONDITIONAL(ZFS_ENABLED, [test "x$enable_zfs" = xyes]) diff --git a/lustre/osd-zfs/osd_handler.c b/lustre/osd-zfs/osd_handler.c index f57e3a7..fa04e09 100644 --- a/lustre/osd-zfs/osd_handler.c +++ b/lustre/osd-zfs/osd_handler.c @@ -295,16 +295,6 @@ static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt, RETURN(0); } - /* When doing our own inode accounting, the ZAPs storing per-uid/gid - * usage are updated at operation execution time, so we should call - * qsd_op_end() straight away. Otherwise (for blk accounting maintained - * by ZFS and when #inode is estimated from #blks) accounting is updated - * at commit time and the call to qsd_op_end() must be delayed */ - if (oh->ot_quota_trans.lqt_id_cnt > 0 && - !oh->ot_quota_trans.lqt_ids[0].lqi_is_blk && - !osd->od_quota_iused_est) - qsd_op_end(env, osd->od_quota_slave, &oh->ot_quota_trans); - rc = dt_txn_hook_stop(env, th); if (rc != 0) CDEBUG(D_OTHER, "%s: transaction hook failed: rc = %d\n", @@ -1068,10 +1058,6 @@ static int osd_mount(const struct lu_env *env, if (rc) GOTO(err, rc); - /* Use our own ZAP for inode accounting by default, this can be changed - * via procfs to estimate the inode usage from the block usage */ - o->od_quota_iused_est = 0; - rc = osd_procfs_init(o, o->od_svname); if (rc) GOTO(err, rc); diff --git a/lustre/osd-zfs/osd_internal.h b/lustre/osd-zfs/osd_internal.h index 4ae2cc3..fb3e47d 100644 --- a/lustre/osd-zfs/osd_internal.h +++ b/lustre/osd-zfs/osd_internal.h @@ -51,6 +51,7 @@ #include #include #include +#include /** * By design including kmem.h overrides the Linux slab interfaces to provide @@ -284,7 +285,6 @@ struct osd_device { unsigned int od_dev_set_rdonly:1, /**< osd_ro() called */ od_prop_rdonly:1, /**< ZFS property readonly */ od_xattr_in_sa:1, - od_quota_iused_est:1, od_is_ost:1, od_posix_acl:1; @@ -712,4 +712,26 @@ static inline void osd_dnode_rele(dnode_t *dn) DB_DNODE_EXIT(db); dmu_buf_rele(&db->db, osd_obj_tag); } + +#ifdef HAVE_DMU_USEROBJ_ACCOUNTING + +#define OSD_DMU_USEROBJ_PREFIX DMU_OBJACCT_PREFIX + +static inline bool osd_dmu_userobj_accounting_available(struct osd_device *osd) +{ + if (unlikely(dmu_objset_userobjspace_upgradable(osd->od_os))) + dmu_objset_userobjspace_upgrade(osd->od_os); + + return dmu_objset_userobjspace_present(osd->od_os); +} +#else + +#define OSD_DMU_USEROBJ_PREFIX "obj-" + +static inline bool osd_dmu_userobj_accounting_available(struct osd_device *osd) +{ + return false; +} +#endif /* #ifdef HAVE_DMU_USEROBJ_ACCOUNTING */ + #endif /* _OSD_INTERNAL_H */ diff --git a/lustre/osd-zfs/osd_lproc.c b/lustre/osd-zfs/osd_lproc.c index e3fd0e9..88c1600 100644 --- a/lustre/osd-zfs/osd_lproc.c +++ b/lustre/osd-zfs/osd_lproc.c @@ -247,37 +247,6 @@ lprocfs_osd_force_sync_seq_write(struct file *file, const char __user *buffer, } LPROC_SEQ_FOPS_WO_TYPE(zfs, osd_force_sync); -static int zfs_osd_iused_est_seq_show(struct seq_file *m, void *data) -{ - struct osd_device *osd = osd_dt_dev((struct dt_device *)m->private); - LASSERT(osd != NULL); - - seq_printf(m, "%d\n", osd->od_quota_iused_est); - return 0; -} - -static ssize_t -zfs_osd_iused_est_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) -{ - struct seq_file *m = file->private_data; - struct dt_device *dt = m->private; - struct osd_device *osd = osd_dt_dev(dt); - int rc; - __s64 val; - - LASSERT(osd != NULL); - - rc = lprocfs_str_to_s64(buffer, count, &val); - if (rc) - return rc; - - osd->od_quota_iused_est = !!val; - - return count; -} -LPROC_SEQ_FOPS(zfs_osd_iused_est); - LPROC_SEQ_FOPS_RO_TYPE(zfs, dt_blksize); LPROC_SEQ_FOPS_RO_TYPE(zfs, dt_kbytestotal); LPROC_SEQ_FOPS_RO_TYPE(zfs, dt_kbytesfree); @@ -304,8 +273,6 @@ struct lprocfs_vars lprocfs_osd_obd_vars[] = { .fops = &zfs_osd_mntdev_fops }, { .name = "force_sync", .fops = &zfs_osd_force_sync_fops }, - { .name = "quota_iused_estimate", - .fops = &zfs_osd_iused_est_fops }, { 0 } }; diff --git a/lustre/osd-zfs/osd_object.c b/lustre/osd-zfs/osd_object.c index 2e64f18..042ea47 100644 --- a/lustre/osd-zfs/osd_object.c +++ b/lustre/osd-zfs/osd_object.c @@ -522,10 +522,6 @@ static int osd_declare_object_destroy(const struct lu_env *env, osd_declare_xattrs_destroy(env, obj, oh); - /* declare that we'll remove object from inode accounting ZAPs */ - dmu_tx_hold_zap(oh->ot_tx, osd->od_iusr_oid, FALSE, NULL); - dmu_tx_hold_zap(oh->ot_tx, osd->od_igrp_oid, FALSE, NULL); - /* one less inode */ rc = osd_declare_quota(env, osd, obj->oo_attr.la_uid, obj->oo_attr.la_gid, -1, oh, false, NULL, false); @@ -592,22 +588,6 @@ static int osd_object_destroy(const struct lu_env *env, GOTO(out, rc); } - /* Remove object from inode accounting. It is not fatal for the destroy - * operation if something goes wrong while updating accounting, but we - * still log an error message to notify the administrator */ - rc = -zap_increment_int(osd->od_os, osd->od_iusr_oid, - obj->oo_attr.la_uid, -1, oh->ot_tx); - if (rc) - CERROR("%s: failed to remove "DFID" from accounting ZAP for usr" - " %d: rc = %d\n", osd->od_svname, PFID(fid), - obj->oo_attr.la_uid, rc); - rc = -zap_increment_int(osd->od_os, osd->od_igrp_oid, - obj->oo_attr.la_gid, -1, oh->ot_tx); - if (rc) - CERROR("%s: failed to remove "DFID" from accounting ZAP for grp" - " %d: rc = %d\n", osd->od_svname, PFID(fid), - obj->oo_attr.la_gid, rc); - oid = obj->oo_dn->dn_object; if (unlikely(obj->oo_destroy == OSD_DESTROY_NONE)) { /* this may happen if the destroy wasn't declared @@ -898,9 +878,6 @@ static int osd_declare_attr_set(const struct lu_env *env, } if (attr && attr->la_valid & LA_UID) { - /* account for user inode tracking ZAP update */ - dmu_tx_hold_zap(oh->ot_tx, osd->od_iusr_oid, FALSE, NULL); - /* quota enforcement for user */ if (attr->la_uid != obj->oo_attr.la_uid) { rc = qsd_transfer(env, osd->od_quota_slave, @@ -912,9 +889,6 @@ static int osd_declare_attr_set(const struct lu_env *env, } } if (attr && attr->la_valid & LA_GID) { - /* account for user inode tracking ZAP update */ - dmu_tx_hold_zap(oh->ot_tx, osd->od_igrp_oid, FALSE, NULL); - /* quota enforcement for group */ if (attr->la_gid != obj->oo_attr.la_gid) { rc = qsd_transfer(env, osd->od_quota_slave, @@ -1015,38 +989,6 @@ static int osd_attr_set(const struct lu_env *env, struct dt_object *dt, } } - /* do both accounting updates outside oo_attr_lock below */ - if ((valid & LA_UID) && (la->la_uid != obj->oo_attr.la_uid)) { - /* Update user accounting. Failure isn't fatal, but we still - * log an error message */ - rc = -zap_increment_int(osd->od_os, osd->od_iusr_oid, - la->la_uid, 1, oh->ot_tx); - if (rc) - CERROR("%s: failed to update accounting ZAP for user " - "%d (%d)\n", osd->od_svname, la->la_uid, rc); - rc = -zap_increment_int(osd->od_os, osd->od_iusr_oid, - obj->oo_attr.la_uid, -1, oh->ot_tx); - if (rc) - CERROR("%s: failed to update accounting ZAP for user " - "%d (%d)\n", osd->od_svname, - obj->oo_attr.la_uid, rc); - } - if ((valid & LA_GID) && (la->la_gid != obj->oo_attr.la_gid)) { - /* Update group accounting. Failure isn't fatal, but we still - * log an error message */ - rc = -zap_increment_int(osd->od_os, osd->od_igrp_oid, - la->la_gid, 1, oh->ot_tx); - if (rc) - CERROR("%s: failed to update accounting ZAP for user " - "%d (%d)\n", osd->od_svname, la->la_gid, rc); - rc = -zap_increment_int(osd->od_os, osd->od_igrp_oid, - obj->oo_attr.la_gid, -1, oh->ot_tx); - if (rc) - CERROR("%s: failed to update accounting ZAP for user " - "%d (%d)\n", osd->od_svname, - obj->oo_attr.la_gid, rc); - } - write_lock(&obj->oo_attr_lock); cnt = 0; if (valid & LA_ATIME) { @@ -1202,10 +1144,6 @@ static int osd_declare_object_create(const struct lu_env *env, zapid = osd_get_name_n_idx(env, osd, fid, NULL, 0); dmu_tx_hold_zap(oh->ot_tx, zapid, TRUE, NULL); - /* we will also update inode accounting ZAPs */ - dmu_tx_hold_zap(oh->ot_tx, osd->od_iusr_oid, FALSE, NULL); - dmu_tx_hold_zap(oh->ot_tx, osd->od_igrp_oid, FALSE, NULL); - /* will help to find FID->ino mapping at dt_insert() */ osd_idc_find_and_init(env, osd, obj); @@ -1537,10 +1475,6 @@ static int osd_object_create(const struct lu_env *env, struct dt_object *dt, LASSERT(th != NULL); oh = container_of0(th, struct osd_thandle, ot_super); - /* - * XXX missing: Quote handling. - */ - LASSERT(obj->oo_dn == NULL); /* to follow ZFS on-disk format we need @@ -1608,21 +1542,6 @@ static int osd_object_create(const struct lu_env *env, struct dt_object *dt, GOTO(out, rc); osd_idc_find_and_init(env, osd, obj); - /* Add new object to inode accounting. - * Errors are not considered as fatal */ - rc = -zap_increment_int(osd->od_os, osd->od_iusr_oid, - (attr->la_valid & LA_UID) ? attr->la_uid : 0, 1, - oh->ot_tx); - if (rc) - CERROR("%s: failed to add "DFID" to accounting ZAP for usr %d " - "(%d)\n", osd->od_svname, PFID(fid), attr->la_uid, rc); - rc = -zap_increment_int(osd->od_os, osd->od_igrp_oid, - (attr->la_valid & LA_GID) ? attr->la_gid : 0, 1, - oh->ot_tx); - if (rc) - CERROR("%s: failed to add "DFID" to accounting ZAP for grp %d " - "(%d)\n", osd->od_svname, PFID(fid), attr->la_gid, rc); - out: if (unlikely(rc && dn)) { dmu_object_free(osd->od_os, dn->dn_object, oh->ot_tx); diff --git a/lustre/osd-zfs/osd_quota.c b/lustre/osd-zfs/osd_quota.c index 2e9152c..4ef1ad5 100644 --- a/lustre/osd-zfs/osd_quota.c +++ b/lustre/osd-zfs/osd_quota.c @@ -92,6 +92,7 @@ static int osd_acct_index_lookup(const struct lu_env *env, { struct osd_thread_info *info = osd_oti_get(env); char *buf = info->oti_buf; + size_t buflen = sizeof(info->oti_buf); struct lquota_acct_rec *rec = (struct lquota_acct_rec *)dtrec; struct osd_object *obj = osd_dt_obj(dtobj); struct osd_device *osd = osd_obj2dev(obj); @@ -102,44 +103,44 @@ static int osd_acct_index_lookup(const struct lu_env *env, rec->bspace = rec->ispace = 0; /* convert the 64-bit uid/gid into a string */ - sprintf(buf, "%llx", *((__u64 *)dtkey)); + snprintf(buf, buflen, "%llx", *((__u64 *)dtkey)); /* fetch DMU object ID (DMU_USERUSED_OBJECT/DMU_GROUPUSED_OBJECT) to be * used */ oid = osd_quota_fid2dmu(lu_object_fid(&dtobj->do_lu)); /* disk usage (in bytes) is maintained by DMU. * DMU_USERUSED_OBJECT/DMU_GROUPUSED_OBJECT are special objects which - * not associated with any dmu_but_t (see dnode_special_open()). - * As a consequence, we cannot use udmu_zap_lookup() here since it - * requires a valid oo_dn. */ - rc = -zap_lookup(osd->od_os, oid, buf, sizeof(uint64_t), 1, + * not associated with any dmu_but_t (see dnode_special_open()). */ + rc = zap_lookup(osd->od_os, oid, buf, sizeof(uint64_t), 1, &rec->bspace); - if (rc == -ENOENT) + if (rc == -ENOENT) { /* user/group has not created anything yet */ CDEBUG(D_QUOTA, "%s: id %s not found in DMU accounting ZAP\n", osd->od_svname, buf); - else if (rc) + } else if (rc) { RETURN(rc); + } - if (osd->od_quota_iused_est) { + if (!osd_dmu_userobj_accounting_available(osd)) { if (rec->bspace != 0) /* estimate #inodes in use */ rec->ispace = osd_objset_user_iused(osd, rec->bspace); - RETURN(+1); + rc = 1; + } else { + snprintf(buf, buflen, OSD_DMU_USEROBJ_PREFIX "%llx", + *((__u64 *)dtkey)); + rc = zap_lookup(osd->od_os, oid, buf, sizeof(uint64_t), 1, + &rec->ispace); + if (rc == -ENOENT) { + CDEBUG(D_QUOTA, + "%s: id %s not found dnode accounting\n", + osd->od_svname, buf); + } else if (rc == 0) { + rc = 1; + } } - /* as for inode accounting, it is not maintained by DMU, so we just - * use our own ZAP to track inode usage */ - rc = -zap_lookup(osd->od_os, obj->oo_dn->dn_object, - buf, sizeof(uint64_t), 1, &rec->ispace); - if (rc == -ENOENT) - /* user/group has not created any file yet */ - CDEBUG(D_QUOTA, "%s: id %s not found in accounting ZAP\n", - osd->od_svname, buf); - else if (rc) - RETURN(rc); - - RETURN(+1); + RETURN(rc); } /** @@ -329,7 +330,7 @@ static int osd_it_acct_rec(const struct lu_env *env, if (rc) RETURN(rc); - if (osd->od_quota_iused_est) { + if (!osd_dmu_userobj_accounting_available(osd)) { if (rec->bspace != 0) /* estimate #inodes in use */ rec->ispace = osd_objset_user_iused(osd, rec->bspace); diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index 59c1c9b..2c72876 100755 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -1606,6 +1606,19 @@ t32_verify_quota() { local qval local cmd + # LU-2435: if the underlying zfs doesn't support userobj_accounting, + # lustre will estimate the object count usage. This fails quota + # verification in 32b. The object quota usage should be accurate after + # zfs-0.7.0 is released. + [ $fstype == "zfs" ] && { + local zfs_version=$(do_node $node cat /sys/module/zfs/version) + + [ $(version_code $zfs_version) -lt $(version_code 0.7.0) ] && { + echo "Skip quota verify for zfs: $zfs_version" + return 0 + } + } + $LFS quota -u $T32_QID -v $mnt qval=$($LFS quota -v -u $T32_QID $mnt | @@ -1767,6 +1780,10 @@ t32_test() { $ZPOOL import -f -d $tmp $poolname" done + # upgrade zpool to latest supported features, including + # dnode quota accounting in 0.7.0 + $r "$ZPOOL upgrade -a" + mdt_dev=t32fs-mdt1/mdt1 ost_dev=t32fs-ost1/ost1 ! $mdt2_is_available || mdt2_dev=t32fs-mdt2/mdt2 @@ -2290,8 +2307,12 @@ t32_test() { if [[ $fstype == zfs ]]; then local poolname=t32fs-mdt1 $r "modprobe zfs; - $ZPOOL list -H $poolname >/dev/null 2>&1 || + $ZPOOL list -H $poolname >/dev/null 2>&1 || $ZPOOL import -f -d $tmp $poolname" + + # upgrade zpool to latest supported features, + # including dnode quota accounting in 0.7.0 + $r "$ZPOOL upgrade $poolname" fi # mount a second time to make sure we didnt leave upgrade flag on diff --git a/lustre/tests/sanity-quota.sh b/lustre/tests/sanity-quota.sh index 2976b54..439488d 100755 --- a/lustre/tests/sanity-quota.sh +++ b/lustre/tests/sanity-quota.sh @@ -1179,6 +1179,15 @@ run_test 7d "Quota reintegration (Transfer index in multiple bulks)" test_7e() { [ "$MDSCOUNT" -lt "2" ] && skip "Required more MDTs" && return + # LU-2435: skip this quota test if underlying zfs version has not + # supported native dnode accounting + [ "$(facet_fstype mds1)" == "zfs" ] && { + local zfs_version=$(do_facet mds1 cat /sys/module/zfs/version) + + [ $(version_code $zfs_version) -lt $(version_code 0.7.0) ] && + skip "requires zfs version at least 0.7.0" && return + } + local ilimit=$((1024 * 2)) # 2k inodes local TESTFILE=$DIR/${tdir}-1/$tfile