* GPL HEADER END
*/
/*
- * Copyright (c) 2012, Intel Corporation.
+ * Copyright (c) 2012, 2017, Intel Corporation.
* Use is subject to license terms.
*
* Author: Johann Lombardi <johann@whamcloud.com>
#include <obd.h>
#include "osd_internal.h"
-#include <sys/dnode.h>
-#include <sys/spa.h>
-#include <sys/zap.h>
-#include <sys/dmu_tx.h>
-#include <sys/dsl_prop.h>
-#include <sys/txg.h>
-
-/*
- * the structure tracks per-ID change/state
- */
-struct zfs_id_change {
- struct hlist_node zic_hash;
- __u64 zic_id;
- atomic_t zic_num;
-};
-
-/*
- * callback data for cfs_hash_for_each_safe()
- * used in txg commit and OSD cleanup path
- */
-struct hash_cbdata {
- struct osd_device *hcb_osd;
- uint64_t hcb_zapid;
- dmu_tx_t *hcb_tx;
-};
-
/**
- * Helper function to retrieve DMU object id from fid for accounting object
- */
-static inline uint64_t osd_quota_fid2dmu(const struct lu_fid *fid)
-{
- LASSERT(fid_is_acct(fid));
- if (fid_oid(fid) == ACCT_GROUP_OID)
- return DMU_GROUPUSED_OBJECT;
- return DMU_USERUSED_OBJECT;
-}
-
-/*
- * a note about locking:
- * entries in per-OSD cache never go before umount,
- * so there is no need in locking for lookups.
- *
- * entries in per-txg deltas never go before txg is closed,
- * there is no concurrency between removal/insertions.
- *
- * also, given all above, there is no need in reference counting.
- */
-static struct zfs_id_change *osd_zfs_lookup_by_id(cfs_hash_t *hash, __u64 id)
-{
- struct zfs_id_change *za = NULL;
- struct hlist_node *hnode;
- cfs_hash_bd_t bd;
-
- cfs_hash_bd_get(hash, &id, &bd);
- hnode = cfs_hash_bd_peek_locked(hash, &bd, &id);
- if (hnode != NULL)
- za = container_of0(hnode, struct zfs_id_change, zic_hash);
-
- return za;
-}
-
-static struct zfs_id_change *lookup_or_create_by_id(struct osd_device *osd,
- cfs_hash_t *hash, __u64 id)
-{
- struct zfs_id_change *za, *tmp;
- struct hlist_node *hnode;
- cfs_hash_bd_t bd;
-
- za = osd_zfs_lookup_by_id(hash, id);
- if (likely(za != NULL))
- return za;
-
- OBD_ALLOC_PTR(za);
- if (unlikely(za == NULL))
- return NULL;
-
- za->zic_id = id;
-
- cfs_hash_bd_get(hash, &id, &bd);
- spin_lock(&osd->od_known_txg_lock);
- hnode = cfs_hash_bd_findadd_locked(hash, &bd, &id, &za->zic_hash, 1);
- LASSERT(hnode != NULL);
- tmp = container_of0(hnode, struct zfs_id_change, zic_hash);
- spin_unlock(&osd->od_known_txg_lock);
-
- if (tmp == za) {
- /*
- * our structure got into the hash
- */
- } else {
- /* somebody won the race, we wasted the cycles */
- OBD_FREE_PTR(za);
- }
-
- return tmp;
-}
-
-/*
- * used to maintain per-txg deltas
- */
-static int osd_zfs_acct_id(const struct lu_env *env, cfs_hash_t *hash,
- __u64 id, int delta, struct osd_thandle *oh)
-{
- struct osd_device *osd = osd_dt_dev(oh->ot_super.th_dev);
- struct zfs_id_change *za;
-
- LASSERT(hash);
- LASSERT(oh->ot_tx);
- LASSERT(oh->ot_tx->tx_txg == osd->od_known_txg);
- LASSERT(osd->od_acct_delta != NULL);
-
- za = lookup_or_create_by_id(osd, hash, id);
- if (unlikely(za == NULL))
- return -ENOMEM;
-
- atomic_add(delta, &za->zic_num);
-
- return 0;
-}
-
-/*
- * this function is used to maintain current state for given ID:
- * at the beginning it initializes the cache from correspoding ZAP
- */
-static void osd_zfs_acct_cache_init(const struct lu_env *env,
- struct osd_device *osd,
- cfs_hash_t *hash, __u64 oid,
- __u64 id, int delta,
- struct osd_thandle *oh)
-{
- char *buf = osd_oti_get(env)->oti_buf;
- struct hlist_node *hnode;
- cfs_hash_bd_t bd;
- struct zfs_id_change *za, *tmp;
- __u64 v;
- int rc;
-
- za = osd_zfs_lookup_by_id(hash, id);
- if (likely(za != NULL))
- goto apply;
-
- /*
- * any concurrent thread is running in the same txg, so no on-disk
- * accounting ZAP can be modified until this txg is closed
- * thus all the concurrent threads must be getting the same value
- * from that ZAP and we don't need to serialize lookups
- */
- snprintf(buf, sizeof(osd_oti_get(env)->oti_buf), "%llx", id);
- /* XXX: we should be using zap_lookup_int_key(), but it consumes
- * 20 bytes on the stack for buf .. */
- rc = -zap_lookup(osd->od_objset.os, oid, buf, sizeof(uint64_t), 1, &v);
- if (rc == -ENOENT) {
- v = 0;
- } else if (unlikely(rc != 0)) {
- CERROR("%s: can't access accounting zap %llu\n",
- osd->od_svname, oid);
- return;
- }
-
- OBD_ALLOC_PTR(za);
- if (unlikely(za == NULL)) {
- CERROR("%s: can't allocate za\n", osd->od_svname);
- return;
- }
-
- za->zic_id = id;
- atomic_set(&za->zic_num, v);
-
- cfs_hash_bd_get(hash, &id, &bd);
- spin_lock(&osd->od_known_txg_lock);
- hnode = cfs_hash_bd_findadd_locked(hash, &bd, &id, &za->zic_hash, 1);
- LASSERT(hnode != NULL);
- tmp = container_of0(hnode, struct zfs_id_change, zic_hash);
- spin_unlock(&osd->od_known_txg_lock);
-
- if (tmp == za) {
- /* our structure got into the hash */
- if (rc == -ENOENT) {
- /* there was no entry in ZAP yet, we have
- * to initialize with 0, so that accounting
- * reports can find that and then find our
- * cached value. */
- v = 0;
- rc = -zap_update(osd->od_objset.os, oid, buf,
- sizeof(uint64_t), 1, &v, oh->ot_tx);
- if (unlikely(rc != 0))
- CERROR("%s: can't initialize: rc = %d\n",
- osd->od_svname, rc);
- }
- } else {
- /* somebody won the race, we wasted the cycles */
- OBD_FREE_PTR(za);
- za = tmp;
- }
-
-apply:
- LASSERT(za != NULL);
- atomic_add(delta, &za->zic_num);
-}
-
-static __u32 acct_hashfn(cfs_hash_t *hash_body, const void *key, unsigned mask)
-{
- const __u64 *id = key;
- __u32 result;
-
- result = (__u32) *id;
- return result % mask;
-}
-
-static void *acct_key(struct hlist_node *hnode)
-{
- struct zfs_id_change *ac;
-
- ac = hlist_entry(hnode, struct zfs_id_change, zic_hash);
- return &ac->zic_id;
-}
-
-static int acct_hashkey_keycmp(const void *key,
- struct hlist_node *compared_hnode)
-{
- struct zfs_id_change *ac;
- const __u64 *id = key;
-
- ac = hlist_entry(compared_hnode, struct zfs_id_change, zic_hash);
- return *id == ac->zic_id;
-}
-
-static void *acct_hashobject(struct hlist_node *hnode)
-{
- return hlist_entry(hnode, struct zfs_id_change, zic_hash);
-}
-
-static cfs_hash_ops_t acct_hash_operations = {
- .hs_hash = acct_hashfn,
- .hs_key = acct_key,
- .hs_keycmp = acct_hashkey_keycmp,
- .hs_object = acct_hashobject,
-};
-
-#define ACCT_HASH_OPS (CFS_HASH_NO_LOCK|CFS_HASH_NO_ITEMREF|CFS_HASH_ADD_TAIL)
-
-int osd_zfs_acct_init(const struct lu_env *env, struct osd_device *o)
-{
- int rc = 0;
- ENTRY;
-
- spin_lock_init(&o->od_known_txg_lock);
-
- /* global structure representing current state for given ID */
- o->od_acct_usr = cfs_hash_create("usr", 4, 4, 4, 0, 0, 0,
- &acct_hash_operations,
- ACCT_HASH_OPS);
- if (o->od_acct_usr == NULL)
- GOTO(out, rc = -ENOMEM);
-
- o->od_acct_grp = cfs_hash_create("grp", 4, 4, 4, 0, 0, 0,
- &acct_hash_operations,
- ACCT_HASH_OPS);
- if (o->od_acct_grp == NULL)
- GOTO(out, rc = -ENOMEM);
-
-out:
- RETURN(rc);
-}
-
-static int osd_zfs_delete_item(cfs_hash_t *hs, cfs_hash_bd_t *bd,
- struct hlist_node *node, void *data)
-{
- struct hash_cbdata *d = data;
- struct zfs_id_change *za;
- __u64 v;
- char buf[12];
- int rc;
-
- za = hlist_entry(node, struct zfs_id_change, zic_hash);
-
- /*
- * XXX: should we try to fix accounting we failed to update before?
- */
-#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 5, 70, 0)
- /*
- * extra checks to ensure our cache matches on-disk state
- */
- snprintf(buf, sizeof(buf), "%llx", za->zic_id);
- rc = -zap_lookup(d->hcb_osd->od_objset.os, d->hcb_zapid,
- buf, sizeof(uint64_t), 1, &v);
- /* pairs with zero value are removed by ZAP automatically */
- if (rc == -ENOENT)
- v = 0;
- if (atomic_read(&za->zic_num) != v) {
- CERROR("%s: INVALID ACCOUNTING FOR %llu %d != %lld: rc = %d\n",
- d->hcb_osd->od_svname, za->zic_id,
- atomic_read(&za->zic_num), v, rc);
- /* XXX: to catch with automated testing */
- LBUG();
- }
-#else
-#warning "remove this additional check before release"
-#endif
-
- cfs_hash_bd_del_locked(hs, bd, node);
- OBD_FREE_PTR(za);
-
- return 0;
-}
-
-void osd_zfs_acct_fini(const struct lu_env *env, struct osd_device *o)
-{
- struct hash_cbdata cbdata;
-
- cbdata.hcb_osd = o;
-
- /* release object accounting cache (owners) */
- cbdata.hcb_zapid = o->od_iusr_oid;
-
- if (o->od_acct_usr) {
- cfs_hash_for_each_safe(o->od_acct_usr, osd_zfs_delete_item,
- &cbdata);
- cfs_hash_putref(o->od_acct_usr);
- o->od_acct_usr = NULL;
- }
-
- /* release object accounting cache (groups) */
- cbdata.hcb_zapid = o->od_igrp_oid;
-
- if (o->od_acct_grp) {
- cfs_hash_for_each_safe(o->od_acct_grp, osd_zfs_delete_item,
- &cbdata);
- cfs_hash_putref(o->od_acct_grp);
- o->od_acct_grp = NULL;
- }
-}
-
-static int osd_zfs_commit_item(cfs_hash_t *hs, cfs_hash_bd_t *bd,
- struct hlist_node *node, void *data)
-{
- struct hash_cbdata *d = data;
- struct osd_device *osd = d->hcb_osd;
- struct zfs_id_change *za;
- int rc;
-
- za = hlist_entry(node, struct zfs_id_change, zic_hash);
-
- rc = -zap_increment_int(osd->od_objset.os, d->hcb_zapid, za->zic_id,
- atomic_read(&za->zic_num), d->hcb_tx);
- if (unlikely(rc != 0))
- CERROR("%s: quota update for UID "LPU64" failed: rc = %d\n",
- osd->od_svname, za->zic_id, rc);
-
- cfs_hash_bd_del_locked(hs, bd, node);
- OBD_FREE_PTR(za);
-
- return 0;
-}
-
-/*
- * this function is called as part of txg commit procedure,
- * no more normal changes are allowed to this txg.
- * we go over all the changes cached in per-txg structure
- * and apply them to actual ZAPs
- */
-#ifdef HAVE_DSL_SYNC_TASK_DO_NOWAIT
-static void osd_zfs_acct_update(void *arg, void *arg2, dmu_tx_t *tx)
-#else
-static void osd_zfs_acct_update(void *arg, dmu_tx_t *tx)
-#endif
-{
- struct osd_zfs_acct_txg *zat = arg;
- struct osd_device *osd = zat->zat_osd;
- struct hash_cbdata cbdata;
-
- cbdata.hcb_osd = osd;
- cbdata.hcb_tx = tx;
-
- CDEBUG(D_OTHER, "COMMIT %llu on %s\n", tx->tx_txg, osd->od_svname);
-
- /* apply changes related to the owners */
- cbdata.hcb_zapid = osd->od_iusr_oid;
- cfs_hash_for_each_safe(zat->zat_usr, osd_zfs_commit_item, &cbdata);
-
- /* apply changes related to the groups */
- cbdata.hcb_zapid = osd->od_igrp_oid;
- cfs_hash_for_each_safe(zat->zat_grp, osd_zfs_commit_item, &cbdata);
-
- cfs_hash_putref(zat->zat_usr);
- cfs_hash_putref(zat->zat_grp);
-
- OBD_FREE_PTR(zat);
-}
-
-#ifdef HAVE_DSL_SYNC_TASK_DO_NOWAIT
-#define dsl_sync_task_nowait(pool, func, arg, blocks, tx) \
- dsl_sync_task_do_nowait(pool, NULL, func, arg, NULL, blocks, tx)
-#endif
-
-/*
- * if any change to the object accounting is going to happen,
- * we create one structure per txg to track all the changes
- * and register special routine to be called as part of txg
- * commit procedure.
+ * Helper function to estimate the number of inodes in use for the given
+ * uid/gid/projid from the block usage
*/
-int osd_zfs_acct_trans_start(const struct lu_env *env, struct osd_thandle *oh)
+static uint64_t osd_objset_user_iused(struct osd_device *osd, uint64_t uidbytes)
{
- struct osd_device *osd = osd_dt_dev(oh->ot_super.th_dev);
- struct osd_zfs_acct_txg *ac = NULL;
- int rc = 0, add_work = 0;
-
- if (likely(oh->ot_tx->tx_txg == osd->od_known_txg)) {
- /* already created */
- return 0;
- }
-
- OBD_ALLOC_PTR(ac);
- if (unlikely(ac == NULL))
- return -ENOMEM;
-
- ac->zat_usr = cfs_hash_create("usr", 4, 4, 4, 0, 0, 0,
- &acct_hash_operations,
- ACCT_HASH_OPS);
- if (unlikely(ac->zat_usr == NULL)) {
- CERROR("%s: can't allocate hash for accounting\n",
- osd->od_svname);
- GOTO(out, rc = -ENOMEM);
- }
-
- ac->zat_grp = cfs_hash_create("grp", 4, 4, 4, 0, 0, 0,
- &acct_hash_operations,
- ACCT_HASH_OPS);
- if (unlikely(ac->zat_grp == NULL)) {
- CERROR("%s: can't allocate hash for accounting\n",
- osd->od_svname);
- GOTO(out, rc = -ENOMEM);
- }
-
- spin_lock(&osd->od_known_txg_lock);
- if (oh->ot_tx->tx_txg != osd->od_known_txg) {
- osd->od_acct_delta = ac;
- osd->od_known_txg = oh->ot_tx->tx_txg;
- add_work = 1;
- }
- spin_unlock(&osd->od_known_txg_lock);
-
- /* schedule a callback to be run in the context of txg
- * once the latter is closed and syncing */
- if (add_work) {
- spa_t *spa = dmu_objset_spa(osd->od_objset.os);
- LASSERT(ac->zat_osd == NULL);
- ac->zat_osd = osd;
- dsl_sync_task_nowait(spa_get_dsl(spa),
- osd_zfs_acct_update,
- ac, 128, oh->ot_tx);
-
- /* no to be freed now */
- ac = NULL;
- }
-
-out:
- if (ac != NULL) {
- /* another thread has installed new structure already */
- if (ac->zat_usr)
- cfs_hash_putref(ac->zat_usr);
- if (ac->zat_grp)
- cfs_hash_putref(ac->zat_grp);
- OBD_FREE_PTR(ac);
- }
-
- return rc;
-}
-
-void osd_zfs_acct_uid(const struct lu_env *env, struct osd_device *osd,
- __u64 uid, int delta, struct osd_thandle *oh)
-{
- int rc;
-
- /* add per-txg job to update accounting */
- rc = osd_zfs_acct_trans_start(env, oh);
- if (unlikely(rc != 0))
- return;
-
- /* maintain per-OSD cached value */
- osd_zfs_acct_cache_init(env, osd, osd->od_acct_usr,
- osd->od_iusr_oid, uid, delta, oh);
-
- /* maintain per-TXG delta */
- osd_zfs_acct_id(env, osd->od_acct_delta->zat_usr, uid, delta, oh);
-
-}
-
-void osd_zfs_acct_gid(const struct lu_env *env, struct osd_device *osd,
- __u64 gid, int delta, struct osd_thandle *oh)
-{
- int rc;
-
- /* add per-txg job to update accounting */
- rc = osd_zfs_acct_trans_start(env, oh);
- if (unlikely(rc != 0))
- return;
-
- /* maintain per-OSD cached value */
- osd_zfs_acct_cache_init(env, osd, osd->od_acct_grp,
- osd->od_igrp_oid, gid, delta, oh);
-
- /* maintain per-TXG delta */
- osd_zfs_acct_id(env, osd->od_acct_delta->zat_grp, gid, delta, oh);
+ uint64_t refdbytes, availbytes, usedobjs, availobjs;
+ uint64_t uidobjs, bshift;
+
+ /* get fresh statfs info */
+ dmu_objset_space(osd->od_os, &refdbytes, &availbytes,
+ &usedobjs, &availobjs);
+
+ /* estimate the number of objects based on the disk usage */
+ bshift = fls64(osd->od_max_blksz) - 1;
+ uidobjs = osd_objs_count_estimate(refdbytes, usedobjs,
+ uidbytes >> bshift, bshift);
+ if (uidbytes > 0)
+ /* if we have at least 1 byte, we have at least one dnode ... */
+ uidobjs = max_t(uint64_t, uidobjs, 1);
+
+ return uidobjs;
}
/**
*/
/**
- * Return space usage consumed by a given uid or gid.
+ * Return space usage consumed by a given uid or gid or projid.
* Block usage is accurrate since it is maintained by DMU itself.
* However, DMU does not provide inode accounting, so the #inodes in use
* is estimated from the block usage and statfs information.
* \param dtrec - is the record to fill with space usage information
* \param dtkey - is the id the of the user or group for which we would
* like to access disk usage.
- * \param capa - is the capability, not used.
*
* \retval +ve - success : exact match
* \retval -ve - failure
*/
static int osd_acct_index_lookup(const struct lu_env *env,
- struct dt_object *dtobj,
- struct dt_rec *dtrec,
- const struct dt_key *dtkey,
- struct lustre_capa *capa)
+ struct dt_object *dtobj,
+ struct dt_rec *dtrec,
+ const struct dt_key *dtkey)
{
- struct osd_thread_info *info = osd_oti_get(env);
- char *buf = info->oti_buf;
- struct lquota_acct_rec *rec = (struct lquota_acct_rec *)dtrec;
- struct osd_object *obj = osd_dt_obj(dtobj);
- struct osd_device *osd = osd_obj2dev(obj);
- uint64_t oid;
- struct zfs_id_change *za = NULL;
- int rc;
+ struct osd_thread_info *info = osd_oti_get(env);
+ char *buf = info->oti_buf;
+ struct lquota_acct_rec *rec = (struct lquota_acct_rec *)dtrec;
+ struct osd_object *obj = osd_dt_obj(dtobj);
+ struct osd_device *osd = osd_obj2dev(obj);
+ dnode_t *dn = obj->oo_dn;
+ size_t buflen = sizeof(info->oti_buf);
+ int rc;
ENTRY;
rec->bspace = rec->ispace = 0;
- /* convert the 64-bit uid/gid into a string */
- sprintf(buf, "%llx", *((__u64 *)dtkey));
- /* fetch DMU object ID (DMU_USERUSED_OBJECT/DMU_GROUPUSED_OBJECT) to be
- * used */
- oid = osd_quota_fid2dmu(lu_object_fid(&dtobj->do_lu));
+ /* convert the 64-bit uid/gid/projid into a string */
+ snprintf(buf, buflen, "%llx", *((__u64 *)dtkey));
+ if (unlikely(!dn)) {
+ CDEBUG(D_QUOTA, "%s: miss accounting obj for %s\n",
+ osd->od_svname, buf);
+
+ RETURN(-ENOENT);
+ }
/* disk usage (in bytes) is maintained by DMU.
* DMU_USERUSED_OBJECT/DMU_GROUPUSED_OBJECT are special objects which
- * not associated with any dmu_but_t (see dnode_special_open()).
- * As a consequence, we cannot use udmu_zap_lookup() here since it
- * requires a valid oo_db. */
- rc = -zap_lookup(osd->od_objset.os, oid, buf, sizeof(uint64_t), 1,
- &rec->bspace);
- if (rc == -ENOENT)
- /* user/group has not created anything yet */
+ * not associated with any dmu_buf_t (see dnode_special_open()). */
+ rc = osd_zap_lookup(osd, dn->dn_object, dn, buf, sizeof(uint64_t), 1,
+ &rec->bspace);
+ if (rc == -ENOENT) {
+ /* user/group/project has not created anything yet */
CDEBUG(D_QUOTA, "%s: id %s not found in DMU accounting ZAP\n",
osd->od_svname, buf);
- else if (rc)
+ /* -ENOENT is normal case, convert it to 1. */
+ rc = 1;
+ } else if (rc) {
RETURN(rc);
+ }
- if (osd->od_quota_iused_est) {
+ if (!osd_dmu_userobj_accounting_available(osd)) {
if (rec->bspace != 0)
/* estimate #inodes in use */
- rec->ispace = udmu_objset_user_iused(&osd->od_objset,
- rec->bspace);
- RETURN(+1);
- }
-
- /* as for inode accounting, it is not maintained by DMU, so we just
- * use our own ZAP to track inode usage */
- if (oid == DMU_USERUSED_OBJECT) {
- za = osd_zfs_lookup_by_id(osd->od_acct_usr,
- *((__u64 *)dtkey));
- } else if (oid == DMU_GROUPUSED_OBJECT) {
- za = osd_zfs_lookup_by_id(osd->od_acct_grp,
- *((__u64 *)dtkey));
- }
- if (za) {
- rec->ispace = atomic_read(&za->zic_num);
+ rec->ispace = osd_objset_user_iused(osd, rec->bspace);
+ rc = 1;
} else {
- rc = -zap_lookup(osd->od_objset.os, obj->oo_db->db_object,
- buf, sizeof(uint64_t), 1, &rec->ispace);
+ snprintf(buf, buflen, OSD_DMU_USEROBJ_PREFIX "%llx",
+ *((__u64 *)dtkey));
+ rc = osd_zap_lookup(osd, dn->dn_object, dn, buf,
+ sizeof(uint64_t), 1, &rec->ispace);
+ if (rc == -ENOENT) {
+ CDEBUG(D_QUOTA,
+ "%s: id %s not found dnode accounting\n",
+ osd->od_svname, buf);
+ /* -ENOENT is normal case, convert it to 1. */
+ rc = 1;
+ } else if (rc == 0) {
+ rc = 1;
+ }
}
- if (rc == -ENOENT)
- /* user/group has not created any file yet */
- CDEBUG(D_QUOTA, "%s: id %s not found in accounting ZAP\n",
- osd->od_svname, buf);
- else if (rc)
- RETURN(rc);
-
- RETURN(+1);
+ RETURN(rc);
}
/**
*
* \param dt - osd index object
* \param attr - not used
- * \param capa - BYPASS_CAPA
*/
static struct dt_it *osd_it_acct_init(const struct lu_env *env,
struct dt_object *dt,
- __u32 attr,
- struct lustre_capa *capa)
+ __u32 attr)
{
- struct osd_thread_info *info = osd_oti_get(env);
- struct osd_it_quota *it;
- struct lu_object *lo = &dt->do_lu;
- struct osd_device *osd = osd_dev(lo->lo_dev);
- int rc;
+ struct osd_thread_info *info = osd_oti_get(env);
+ struct osd_it_quota *it;
+ struct osd_object *obj = osd_dt_obj(dt);
+ struct osd_device *osd = osd_obj2dev(obj);
+ dnode_t *dn = obj->oo_dn;
+ int rc;
ENTRY;
- LASSERT(lu_object_exists(lo));
+ if (unlikely(!dn)) {
+ CDEBUG(D_QUOTA, "%s: Not found in DMU accounting ZAP\n",
+ osd->od_svname);
+
+ RETURN(ERR_PTR(-ENOENT));
+ }
if (info == NULL)
RETURN(ERR_PTR(-ENOMEM));
- it = &info->oti_it_quota;
- memset(it, 0, sizeof(*it));
- it->oiq_oid = osd_quota_fid2dmu(lu_object_fid(lo));
+ OBD_ALLOC_PTR(it);
+ if (it == NULL)
+ RETURN(ERR_PTR(-ENOMEM));
- if (it->oiq_oid == DMU_GROUPUSED_OBJECT)
- it->oiq_hash = osd->od_acct_grp;
- else if (it->oiq_oid == DMU_USERUSED_OBJECT)
- it->oiq_hash = osd->od_acct_usr;
- else
- LBUG();
+ memset(it, 0, sizeof(*it));
+ it->oiq_oid = dn->dn_object;
/* initialize zap cursor */
- rc = -udmu_zap_cursor_init(&it->oiq_zc, &osd->od_objset, it->oiq_oid,0);
- if (rc)
+ rc = osd_zap_cursor_init(&it->oiq_zc, osd->od_os, it->oiq_oid, 0);
+ if (rc != 0) {
+ OBD_FREE_PTR(it);
RETURN(ERR_PTR(rc));
+ }
/* take object reference */
- lu_object_get(lo);
+ lu_object_get(&dt->do_lu);
it->oiq_obj = osd_dt_obj(dt);
it->oiq_reset = 1;
*/
static void osd_it_acct_fini(const struct lu_env *env, struct dt_it *di)
{
- struct osd_it_quota *it = (struct osd_it_quota *)di;
+ struct osd_it_quota *it = (struct osd_it_quota *)di;
ENTRY;
- udmu_zap_cursor_fini(it->oiq_zc);
- lu_object_put(env, &it->oiq_obj->oo_dt.do_lu);
+
+ osd_zap_cursor_fini(it->oiq_zc);
+ osd_object_put(env, it->oiq_obj);
+ OBD_FREE_PTR(it);
+
EXIT;
}
/**
+ * Locate the first entry that is for space accounting.
+ */
+static int osd_zap_locate(struct osd_it_quota *it, zap_attribute_t *za)
+{
+ int rc;
+ ENTRY;
+
+ while (1) {
+ rc = -zap_cursor_retrieve(it->oiq_zc, za);
+ if (rc)
+ break;
+
+ if (strncmp(za->za_name, OSD_DMU_USEROBJ_PREFIX,
+ OSD_DMU_USEROBJ_PREFIX_LEN))
+ break;
+
+ zap_cursor_advance(it->oiq_zc);
+ }
+
+ RETURN(rc);
+}
+
+/**
* Move on to the next valid entry.
*
* \param di - osd iterator
static int osd_it_acct_next(const struct lu_env *env, struct dt_it *di)
{
struct osd_it_quota *it = (struct osd_it_quota *)di;
+ zap_attribute_t *za = &osd_oti_get(env)->oti_za;
int rc;
ENTRY;
if (it->oiq_reset == 0)
zap_cursor_advance(it->oiq_zc);
it->oiq_reset = 0;
- rc = -udmu_zap_cursor_retrieve_key(env, it->oiq_zc, NULL, 32);
- if (rc == -ENOENT) /* reached the end */
- RETURN(+1);
- RETURN(rc);
+
+ rc = osd_zap_locate(it, za);
+ RETURN(rc == -ENOENT ? 1 : rc);
}
/**
const struct dt_it *di)
{
struct osd_it_quota *it = (struct osd_it_quota *)di;
- struct osd_thread_info *info = osd_oti_get(env);
- char *buf = info->oti_buf;
- char *p;
+ zap_attribute_t *za = &osd_oti_get(env)->oti_za;
int rc;
ENTRY;
it->oiq_reset = 0;
- rc = -udmu_zap_cursor_retrieve_key(env, it->oiq_zc, buf, 32);
+ rc = osd_zap_locate(it, za);
if (rc)
RETURN(ERR_PTR(rc));
- it->oiq_id = simple_strtoull(buf, &p, 16);
+
+ rc = kstrtoull(za->za_name, 16, &it->oiq_id);
+ if (rc)
+ CERROR("couldn't parse name %s\n", za->za_name);
+
RETURN((struct dt_key *) &it->oiq_id);
}
RETURN((int)sizeof(uint64_t));
}
+/*
+ * zap_cursor_retrieve read from current record.
+ * to read bytes we need to call zap_lookup explicitly.
+ */
+static int osd_zap_cursor_retrieve_value(const struct lu_env *env,
+ struct osd_it_quota *it,
+ char *buf, int buf_size,
+ int *bytes_read)
+{
+ const struct lu_fid *fid = lu_object_fid(&it->oiq_obj->oo_dt.do_lu);
+ zap_attribute_t *za = &osd_oti_get(env)->oti_za;
+ zap_cursor_t *zc = it->oiq_zc;
+ struct osd_device *osd = osd_obj2dev(it->oiq_obj);
+ int rc, actual_size;
+
+ rc = -zap_cursor_retrieve(zc, za);
+ if (unlikely(rc != 0))
+ return rc;
+
+ if (unlikely(za->za_integer_length <= 0))
+ return -ERANGE;
+
+ actual_size = za->za_integer_length * za->za_num_integers;
+
+ if (actual_size > buf_size) {
+ actual_size = buf_size;
+ buf_size = actual_size / za->za_integer_length;
+ } else {
+ buf_size = za->za_num_integers;
+ }
+
+ /* use correct special ID to request bytes used */
+ rc = osd_zap_lookup(osd, fid_oid(fid) == ACCT_GROUP_OID ?
+ DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT, NULL,
+ za->za_name, za->za_integer_length, buf_size, buf);
+ if (likely(rc == 0))
+ *bytes_read = actual_size;
+
+ return rc;
+}
+
/**
* Return pointer to the record under iterator.
*
struct dt_rec *dtrec, __u32 attr)
{
struct osd_thread_info *info = osd_oti_get(env);
- char *buf = info->oti_buf;
+ zap_attribute_t *za = &info->oti_za;
struct osd_it_quota *it = (struct osd_it_quota *)di;
struct lquota_acct_rec *rec = (struct lquota_acct_rec *)dtrec;
struct osd_object *obj = it->oiq_obj;
struct osd_device *osd = osd_obj2dev(obj);
int bytes_read;
- struct zfs_id_change *za;
int rc;
ENTRY;
rec->ispace = rec->bspace = 0;
/* retrieve block usage from the DMU accounting object */
- rc = -udmu_zap_cursor_retrieve_value(env, it->oiq_zc,
- (char *)&rec->bspace,
- sizeof(uint64_t), &bytes_read);
+ rc = osd_zap_cursor_retrieve_value(env, it, (char *)&rec->bspace,
+ sizeof(uint64_t), &bytes_read);
if (rc)
RETURN(rc);
- if (osd->od_quota_iused_est) {
+ if (!osd_dmu_userobj_accounting_available(osd)) {
if (rec->bspace != 0)
/* estimate #inodes in use */
- rec->ispace = udmu_objset_user_iused(&osd->od_objset,
- rec->bspace);
+ rec->ispace = osd_objset_user_iused(osd, rec->bspace);
RETURN(0);
}
/* retrieve key associated with the current cursor */
- rc = -udmu_zap_cursor_retrieve_key(env, it->oiq_zc, buf, 32);
- if (rc)
+ rc = -zap_cursor_retrieve(it->oiq_zc, za);
+ if (unlikely(rc != 0))
RETURN(rc);
- /* inode accounting is not maintained by DMU, so we use our own ZAP to
- * track inode usage */
- za = osd_zfs_lookup_by_id(it->oiq_hash, it->oiq_id);
- if (za != NULL) {
- /* found in the cache */
- rec->ispace = atomic_read(&za->zic_num);
- } else {
- rc = -zap_lookup(osd->od_objset.os,
- it->oiq_obj->oo_db->db_object,
- buf, sizeof(uint64_t), 1, &rec->ispace);
- if (rc == -ENOENT) {
- /* user/group has not created any file yet */
- CDEBUG(D_QUOTA, "%s: id %s not found in ZAP\n",
- osd->od_svname, buf);
- rc = 0;
- }
- }
+ /* inode accounting is maintained by DMU since 0.7.0 */
+ strncpy(info->oti_buf, OSD_DMU_USEROBJ_PREFIX,
+ OSD_DMU_USEROBJ_PREFIX_LEN);
+ strlcpy(info->oti_buf + OSD_DMU_USEROBJ_PREFIX_LEN, za->za_name,
+ sizeof(info->oti_buf) - OSD_DMU_USEROBJ_PREFIX_LEN);
+ rc = osd_zap_lookup(osd, it->oiq_obj->oo_dn->dn_object,
+ it->oiq_obj->oo_dn, info->oti_buf, sizeof(uint64_t),
+ 1, &rec->ispace);
+ if (rc == -ENOENT)
+ /* user/group has not created any file yet */
+ CDEBUG(D_QUOTA, "%s: id %s not found in accounting ZAP\n",
+ osd->od_svname, info->oti_buf);
+ else if (rc)
+ RETURN(rc);
- RETURN(rc);
+ RETURN(0);
}
/**
struct osd_it_quota *it = (struct osd_it_quota *)di;
ENTRY;
it->oiq_reset = 0;
- RETURN(udmu_zap_cursor_serialize(it->oiq_zc));
+ RETURN(osd_zap_cursor_serialize(it->oiq_zc));
}
/**
{
struct osd_it_quota *it = (struct osd_it_quota *)di;
struct osd_device *osd = osd_obj2dev(it->oiq_obj);
+ zap_attribute_t *za = &osd_oti_get(env)->oti_za;
zap_cursor_t *zc;
int rc;
ENTRY;
/* create new cursor pointing to the new hash */
- rc = -udmu_zap_cursor_init(&zc, &osd->od_objset, it->oiq_oid, hash);
+ rc = osd_zap_cursor_init(&zc, osd->od_os, it->oiq_oid, hash);
if (rc)
RETURN(rc);
- udmu_zap_cursor_fini(it->oiq_zc);
+ osd_zap_cursor_fini(it->oiq_zc);
it->oiq_zc = zc;
it->oiq_reset = 0;
- rc = -udmu_zap_cursor_retrieve_key(env, it->oiq_zc, NULL, 32);
+ rc = osd_zap_locate(it, za);
if (rc == 0)
- RETURN(+1);
+ rc = 1;
else if (rc == -ENOENT)
- RETURN(0);
+ rc = 0;
RETURN(rc);
}
* move to the first valid record.
*
* \param di - osd iterator
- * \param key - uid or gid
+ * \param key - uid or gid or projid
*
* \retval +ve - di points to exact matched key
* \retval 0 - di points to the first valid record
* \param osd - is the osd_device
* \param uid - user id of the inode
* \param gid - group id of the inode
+ * \param projid - project id of the inode
* \param space - how many blocks/inodes will be consumed/released
* \param oh - osd transaction handle
- * \param is_blk - block quota or inode quota?
* \param flags - if the operation is write, return no user quota, no
* group quota, or sync commit flags to the caller
- * \param force - set to 1 when changes are performed by root user and thus
- * can't failed with EDQUOT
+ * \param osd_qid_declare_flags - indicate this is an inode/block accounting
+ * and whether changes are performed by root user
*
* \retval 0 - success
* \retval -ve - failure
*/
int osd_declare_quota(const struct lu_env *env, struct osd_device *osd,
- qid_t uid, qid_t gid, long long space,
- struct osd_thandle *oh, bool is_blk, int *flags,
- bool force)
+ qid_t uid, qid_t gid, qid_t projid, long long space,
+ struct osd_thandle *oh, int *flags,
+ enum osd_qid_declare_flags osd_qid_declare_flags)
{
- struct osd_thread_info *info = osd_oti_get(env);
- struct lquota_id_info *qi = &info->oti_qi;
- struct qsd_instance *qsd = osd->od_quota_slave;
- int rcu, rcg; /* user & group rc */
+ struct osd_thread_info *info = osd_oti_get(env);
+ struct lquota_id_info *qi = &info->oti_qi;
+ struct qsd_instance *qsd = NULL;
+ int rcu, rcg, rcp = 0; /* user & group & project rc */
+ struct thandle *th = &oh->ot_super;
+ bool force = !!(osd_qid_declare_flags & OSD_QID_FORCE) ||
+ th->th_ignore_quota;
ENTRY;
+ if (osd_qid_declare_flags & OSD_QID_INODE)
+ qsd = osd->od_quota_slave_md;
+ else if (osd_qid_declare_flags & OSD_QID_BLK)
+ qsd = osd->od_quota_slave_dt;
+ else
+ RETURN(0);
+
if (unlikely(qsd == NULL))
/* quota slave instance hasn't been allocated yet */
RETURN(0);
qi->lqi_id.qid_uid = uid;
qi->lqi_type = USRQUOTA;
qi->lqi_space = space;
- qi->lqi_is_blk = is_blk;
+ qi->lqi_is_blk = !!(osd_qid_declare_flags & OSD_QID_BLK);
rcu = qsd_op_begin(env, qsd, &oh->ot_quota_trans, qi, flags);
-
if (force && (rcu == -EDQUOT || rcu == -EINPROGRESS))
/* ignore EDQUOT & EINPROGRESS when changes are done by root */
rcu = 0;
qi->lqi_id.qid_gid = gid;
qi->lqi_type = GRPQUOTA;
rcg = qsd_op_begin(env, qsd, &oh->ot_quota_trans, qi, flags);
-
if (force && (rcg == -EDQUOT || rcg == -EINPROGRESS))
/* as before, ignore EDQUOT & EINPROGRESS for root */
rcg = 0;
- RETURN(rcu ? rcu : rcg);
+#ifdef ZFS_PROJINHERIT
+ if (rcg && (rcg != -EDQUOT || flags == NULL))
+ RETURN(rcg);
+
+ /* for project quota */
+ if (osd->od_projectused_dn) {
+ qi->lqi_id.qid_projid = projid;
+ qi->lqi_type = PRJQUOTA;
+ rcp = qsd_op_begin(env, qsd, &oh->ot_quota_trans, qi, flags);
+ if (force && (rcp == -EDQUOT || rcp == -EINPROGRESS))
+ rcp = 0;
+ }
+#endif
+
+ RETURN(rcu ? rcu : (rcg ? rcg : rcp));
}