Whamcloud - gitweb
LU-2733 osd: not return "-ENOENT" for zfs osd_object_init
[fs/lustre-release.git] / lustre / osd-zfs / osd_object.c
index a241376..ac6b52d 100644 (file)
@@ -28,7 +28,7 @@
  * Use is subject to license terms.
  */
 /*
- * Copyright (c) 2011, 2012 Whamcloud, Inc.
+ * Copyright (c) 2012, Intel Corporation.
  * Use is subject to license terms.
  */
 /*
@@ -77,6 +77,7 @@ static char *osd_obj_tag = "osd_object";
 static struct dt_object_operations osd_obj_ops;
 static struct lu_object_operations osd_lu_obj_ops;
 extern struct dt_body_operations osd_body_ops;
+static struct dt_object_operations osd_obj_otable_it_ops;
 
 extern cfs_mem_cache_t *osd_object_kmem;
 
@@ -123,10 +124,12 @@ osd_object_sa_dirty_add(struct osd_object *obj, struct osd_thandle *oh)
        if (!cfs_list_empty(&obj->oo_sa_linkage))
                return;
 
-       cfs_down(&oh->ot_sa_lock);
+       down(&oh->ot_sa_lock);
+       write_lock(&obj->oo_attr_lock);
        if (likely(cfs_list_empty(&obj->oo_sa_linkage)))
                cfs_list_add(&obj->oo_sa_linkage, &oh->ot_sa_list);
-       cfs_up(&oh->ot_sa_lock);
+       write_unlock(&obj->oo_attr_lock);
+       up(&oh->ot_sa_lock);
 }
 
 /*
@@ -136,14 +139,16 @@ void osd_object_sa_dirty_rele(struct osd_thandle *oh)
 {
        struct osd_object *obj;
 
-       cfs_down(&oh->ot_sa_lock);
+       down(&oh->ot_sa_lock);
        while (!cfs_list_empty(&oh->ot_sa_list)) {
                obj = cfs_list_entry(oh->ot_sa_list.next,
                                     struct osd_object, oo_sa_linkage);
                sa_spill_rele(obj->oo_sa_hdl);
+               write_lock(&obj->oo_attr_lock);
                cfs_list_del_init(&obj->oo_sa_linkage);
+               write_unlock(&obj->oo_attr_lock);
        }
-       cfs_up(&oh->ot_sa_lock);
+       up(&oh->ot_sa_lock);
 }
 
 /*
@@ -294,9 +299,9 @@ struct lu_object *osd_object_alloc(const struct lu_env *env,
                mo->oo_dt.do_ops = &osd_obj_ops;
                l->lo_ops = &osd_lu_obj_ops;
                CFS_INIT_LIST_HEAD(&mo->oo_sa_linkage);
-               cfs_init_rwsem(&mo->oo_sem);
-               cfs_sema_init(&mo->oo_guard, 1);
-               cfs_rwlock_init(&mo->oo_attr_lock);
+               init_rwsem(&mo->oo_sem);
+               sema_init(&mo->oo_guard, 1);
+               rwlock_init(&mo->oo_attr_lock);
                return l;
        } else {
                return NULL;
@@ -359,6 +364,12 @@ static int osd_object_init(const struct lu_env *env, struct lu_object *l,
 
        LASSERT(osd_invariant(obj));
 
+       if (fid_is_otable_it(&l->lo_header->loh_fid)) {
+               obj->oo_dt.do_ops = &osd_obj_otable_it_ops;
+               l->lo_header->loh_attr |= LOHA_EXISTS;
+               RETURN(0);
+       }
+
        rc = osd_fid_lookup(env, osd, lu_object_fid(l), &oid);
        if (rc == 0) {
                LASSERT(obj->oo_db == NULL);
@@ -452,6 +463,7 @@ static int osd_declare_object_destroy(const struct lu_env *env,
        struct osd_device       *osd = osd_obj2dev(obj);
        struct osd_thandle      *oh;
        uint64_t                 zapid;
+       int                      rc;
        ENTRY;
 
        LASSERT(th != NULL);
@@ -474,15 +486,24 @@ static int osd_declare_object_destroy(const struct lu_env *env,
        dmu_tx_hold_bonus(oh->ot_tx, osd->od_igrp_oid);
        dmu_tx_hold_zap(oh->ot_tx, osd->od_igrp_oid, 0, buf);
 
-       RETURN(0);
+       /* one less inode */
+       rc = osd_declare_quota(env, osd, obj->oo_attr.la_uid,
+                              obj->oo_attr.la_gid, -1, oh, false, NULL, false);
+       if (rc)
+               RETURN(rc);
+
+       /* data to be truncated */
+       rc = osd_declare_quota(env, osd, obj->oo_attr.la_uid,
+                              obj->oo_attr.la_gid, 0, oh, true, NULL, false);
+       RETURN(rc);
 }
 
 int __osd_object_free(udmu_objset_t *uos, uint64_t oid, dmu_tx_t *tx)
 {
        LASSERT(uos->objects != 0);
-       cfs_spin_lock(&uos->lock);
+       spin_lock(&uos->lock);
        uos->objects--;
-       cfs_spin_unlock(&uos->lock);
+       spin_unlock(&uos->lock);
 
        return -dmu_object_free(uos->os, oid, tx);
 }
@@ -648,7 +669,7 @@ static void osd_object_read_lock(const struct lu_env *env,
 
        LASSERT(osd_invariant(obj));
 
-       cfs_down_read(&obj->oo_sem);
+       down_read(&obj->oo_sem);
 }
 
 static void osd_object_write_lock(const struct lu_env *env,
@@ -658,7 +679,7 @@ static void osd_object_write_lock(const struct lu_env *env,
 
        LASSERT(osd_invariant(obj));
 
-       cfs_down_write(&obj->oo_sem);
+       down_write(&obj->oo_sem);
 }
 
 static void osd_object_read_unlock(const struct lu_env *env,
@@ -667,7 +688,7 @@ static void osd_object_read_unlock(const struct lu_env *env,
        struct osd_object *obj = osd_dt_obj(dt);
 
        LASSERT(osd_invariant(obj));
-       cfs_up_read(&obj->oo_sem);
+       up_read(&obj->oo_sem);
 }
 
 static void osd_object_write_unlock(const struct lu_env *env,
@@ -676,7 +697,7 @@ static void osd_object_write_unlock(const struct lu_env *env,
         struct osd_object *obj = osd_dt_obj(dt);
 
         LASSERT(osd_invariant(obj));
-        cfs_up_write(&obj->oo_sem);
+       up_write(&obj->oo_sem);
 }
 
 static int osd_object_write_locked(const struct lu_env *env,
@@ -687,9 +708,9 @@ static int osd_object_write_locked(const struct lu_env *env,
 
        LASSERT(osd_invariant(obj));
 
-       if (cfs_down_write_trylock(&obj->oo_sem)) {
+       if (down_write_trylock(&obj->oo_sem)) {
                rc = 0;
-               cfs_up_write(&obj->oo_sem);
+               up_write(&obj->oo_sem);
        }
        return rc;
 }
@@ -707,14 +728,18 @@ static int osd_attr_get(const struct lu_env *env,
        LASSERT(osd_invariant(obj));
        LASSERT(obj->oo_db);
 
-       cfs_read_lock(&obj->oo_attr_lock);
+       read_lock(&obj->oo_attr_lock);
        *attr = obj->oo_attr;
-       cfs_read_unlock(&obj->oo_attr_lock);
+       read_unlock(&obj->oo_attr_lock);
 
        /* with ZFS_DEBUG zrl_add_debug() called by DB_DNODE_ENTER()
         * from within sa_object_size() can block on a mutex, so
         * we can't call sa_object_size() holding rwlock */
        sa_object_size(obj->oo_sa_hdl, &blksize, &blocks);
+       /* we do not control size of indices, so always calculate
+        * it from number of blocks reported by DMU */
+       if (S_ISDIR(attr->la_mode))
+               attr->la_size = 512 * blocks;
        /* Block size may be not set; suggest maximal I/O transfers. */
        if (blksize == 0)
                blksize = 1ULL << SPA_MAXBLOCKSHIFT;
@@ -726,15 +751,80 @@ static int osd_attr_get(const struct lu_env *env,
        return 0;
 }
 
+/* Thin wrapper over the qsd API: requests the transfer of one inode and
+ * @bspace blocks of quota from @orig_id to @new_id for osd setattr. Only root
+ * can change file ownership, so EDQUOT & EINPROGRESS errors are discarded. */
+static inline int qsd_transfer(const struct lu_env *env,
+                              struct qsd_instance *qsd,
+                              struct lquota_trans *trans, int qtype,
+                              __u64 orig_id, __u64 new_id, __u64 bspace,
+                              struct lquota_id_info *qi)
+{
+       int     rc;
+
+       if (unlikely(qsd == NULL))
+               return 0;
+
+       LASSERT(qtype >= 0 && qtype < MAXQUOTAS);
+       qi->lqi_type = qtype;
+
+       /* inode accounting */
+       qi->lqi_is_blk = false;
+
+       /* one more inode for the new owner ... */
+       qi->lqi_id.qid_uid = new_id;
+       qi->lqi_space      = 1;
+       rc = qsd_op_begin(env, qsd, trans, qi, NULL);
+       if (rc == -EDQUOT || rc == -EINPROGRESS)
+               rc = 0;
+       if (rc)
+               return rc;
+
+       /* and one less inode for the current id */
+       qi->lqi_id.qid_uid = orig_id;
+       qi->lqi_space      = -1;
+       /* can't get EDQUOT when reducing usage */
+       rc = qsd_op_begin(env, qsd, trans, qi, NULL);
+       if (rc == -EINPROGRESS)
+               rc = 0;
+       if (rc)
+               return rc;
+
+       /* block accounting */
+       qi->lqi_is_blk = true;
+
+       /* more blocks for the new owner ... */
+       qi->lqi_id.qid_uid = new_id;
+       qi->lqi_space      = bspace;
+       rc = qsd_op_begin(env, qsd, trans, qi, NULL);
+       if (rc == -EDQUOT || rc == -EINPROGRESS)
+               rc = 0;
+       if (rc)
+               return rc;
+
+       /* and finally less blocks for the current owner */
+       qi->lqi_id.qid_uid = orig_id;
+       qi->lqi_space      = -bspace;
+       rc = qsd_op_begin(env, qsd, trans, qi, NULL);
+       /* can't get EDQUOT when reducing usage */
+       if (rc == -EINPROGRESS)
+               rc = 0;
+       return rc;
+}
+
 static int osd_declare_attr_set(const struct lu_env *env,
                                struct dt_object *dt,
                                const struct lu_attr *attr,
                                struct thandle *handle)
 {
+       struct osd_thread_info  *info = osd_oti_get(env);
        char                    *buf = osd_oti_get(env)->oti_str;
        struct osd_object       *obj = osd_dt_obj(dt);
        struct osd_device       *osd = osd_obj2dev(obj);
        struct osd_thandle      *oh;
+       uint64_t                 bspace;
+       uint32_t                 blksize;
+       int                      rc;
        ENTRY;
 
        if (!dt_object_exists(dt)) {
@@ -750,15 +840,38 @@ static int osd_declare_attr_set(const struct lu_env *env,
        LASSERT(obj->oo_sa_hdl != NULL);
        dmu_tx_hold_sa(oh->ot_tx, obj->oo_sa_hdl, 0);
 
+       sa_object_size(obj->oo_sa_hdl, &blksize, &bspace);
+       bspace = toqb(bspace * blksize);
+
        if (attr && attr->la_valid & LA_UID) {
                /* account for user inode tracking ZAP update */
                dmu_tx_hold_bonus(oh->ot_tx, osd->od_iusr_oid);
                dmu_tx_hold_zap(oh->ot_tx, osd->od_iusr_oid, TRUE, buf);
+
+               /* quota enforcement for user */
+               if (attr->la_uid != obj->oo_attr.la_uid) {
+                       rc = qsd_transfer(env, osd->od_quota_slave,
+                                         &oh->ot_quota_trans, USRQUOTA,
+                                         obj->oo_attr.la_uid, attr->la_uid,
+                                         bspace, &info->oti_qi);
+                       if (rc)
+                               RETURN(rc);
+               }
        }
        if (attr && attr->la_valid & LA_GID) {
                /* account for user inode tracking ZAP update */
                dmu_tx_hold_bonus(oh->ot_tx, osd->od_igrp_oid);
                dmu_tx_hold_zap(oh->ot_tx, osd->od_igrp_oid, TRUE, buf);
+
+               /* quota enforcement for group */
+               if (attr->la_gid != obj->oo_attr.la_gid) {
+                       rc = qsd_transfer(env, osd->od_quota_slave,
+                                         &oh->ot_quota_trans, GRPQUOTA,
+                                         obj->oo_attr.la_gid, attr->la_gid,
+                                         bspace, &info->oti_qi);
+                       if (rc)
+                               RETURN(rc);
+               }
        }
 
        RETURN(0);
@@ -834,7 +947,7 @@ static int osd_attr_set(const struct lu_env *env, struct dt_object *dt,
                                obj->oo_attr.la_gid, rc);
        }
 
-       cfs_write_lock(&obj->oo_attr_lock);
+       write_lock(&obj->oo_attr_lock);
        cnt = 0;
        if (la->la_valid & LA_ATIME) {
                osa->atime[0] = obj->oo_attr.la_atime = la->la_atime;
@@ -890,7 +1003,7 @@ static int osd_attr_set(const struct lu_env *env, struct dt_object *dt,
                                 &osa->gid, 8);
        }
        obj->oo_attr.la_valid |= la->la_valid;
-       cfs_write_unlock(&obj->oo_attr_lock);
+       write_unlock(&obj->oo_attr_lock);
 
        rc = osd_object_sa_bulk_update(obj, bulk, cnt, oh);
 
@@ -905,7 +1018,8 @@ static int osd_attr_set(const struct lu_env *env, struct dt_object *dt,
  */
 
 static void osd_ah_init(const struct lu_env *env, struct dt_allocation_hint *ah,
-                       struct dt_object *parent, cfs_umode_t child_mode)
+                       struct dt_object *parent, struct dt_object *child,
+                       cfs_umode_t child_mode)
 {
        LASSERT(ah);
 
@@ -926,6 +1040,7 @@ static int osd_declare_object_create(const struct lu_env *env,
        struct osd_device       *osd = osd_obj2dev(obj);
        struct osd_thandle      *oh;
        uint64_t                 zapid;
+       int                      rc;
        ENTRY;
 
        LASSERT(dof);
@@ -976,7 +1091,12 @@ static int osd_declare_object_create(const struct lu_env *env,
 
        dmu_tx_hold_sa_create(oh->ot_tx, ZFS_SA_BASE_ATTR_SIZE);
 
-       RETURN(0);
+       __osd_xattr_declare_set(env, obj, sizeof(struct lustre_mdt_attrs),
+                               XATTR_NAME_LMA, oh);
+
+       rc = osd_declare_quota(env, osd, attr->la_uid, attr->la_gid, 1, oh,
+                              false, NULL, false);
+       RETURN(rc);
 }
 
 int __osd_attr_init(const struct lu_env *env, udmu_objset_t *uos,
@@ -1067,9 +1187,9 @@ int __osd_object_create(const struct lu_env *env, udmu_objset_t *uos,
        int      rc;
 
        LASSERT(tag);
-       cfs_spin_lock(&uos->lock);
+       spin_lock(&uos->lock);
        uos->objects++;
-       cfs_spin_unlock(&uos->lock);
+       spin_unlock(&uos->lock);
 
        /* Assert that the transaction has been assigned to a
           transaction group. */
@@ -1108,9 +1228,9 @@ int __osd_zap_create(const struct lu_env *env, udmu_objset_t *uos,
 
        LASSERT(tag);
 
-       cfs_spin_lock(&uos->lock);
+       spin_lock(&uos->lock);
        uos->objects++;
-       cfs_spin_unlock(&uos->lock);
+       spin_unlock(&uos->lock);
 
        /* Assert that the transaction has been assigned to a
           transaction group. */
@@ -1257,6 +1377,24 @@ static osd_obj_type_f osd_create_type_f(enum dt_format_type type)
 /*
  * Primitives for directory (i.e. ZAP) handling
  */
+/* Build the LMA for @fid in the osd_thread_info buffer, run it through
+ * lustre_lma_swab() and store it on @obj as the XATTR_NAME_LMA xattr. */
+static inline int osd_init_lma(const struct lu_env *env, struct osd_object *obj,
+                              const struct lu_fid *fid, struct osd_thandle *oh)
+{
+       struct lustre_mdt_attrs *lma = &osd_oti_get(env)->oti_mdt_attrs;
+       struct lu_buf            buf;
+
+       lustre_lma_init(lma, fid);
+       lustre_lma_swab(lma);
+
+       /* wrap the LMA in a lu_buf for the xattr call */
+       buf.lb_buf = lma;
+       buf.lb_len = sizeof(*lma);
+
+       return osd_xattr_set_internal(env, obj, &buf, XATTR_NAME_LMA,
+                                     LU_XATTR_CREATE, oh, BYPASS_CAPA);
+}
 
 /*
  * Concurrency: @dt is write locked.
@@ -1282,7 +1420,7 @@ static int osd_object_create(const struct lu_env *env, struct dt_object *dt,
        /* concurrent create declarations should not see
         * the object inconsistent (db, attr, etc).
         * in regular cases acquisition should be cheap */
-       cfs_down(&obj->oo_guard);
+       down(&obj->oo_guard);
 
        LASSERT(osd_invariant(obj));
        LASSERT(!dt_object_exists(dt));
@@ -1332,8 +1470,16 @@ static int osd_object_create(const struct lu_env *env, struct dt_object *dt,
        LASSERT(ergo(rc == 0, dt_object_exists(dt)));
        LASSERT(osd_invariant(obj));
 
+       rc = osd_init_lma(env, obj, fid, oh);
+       if (rc) {
+               CERROR("%s: can not set LMA on "DFID": rc = %d\n",
+                      osd->od_svname, PFID(fid), rc);
+               /* ignore errors during LMA initialization */
+               rc = 0;
+       }
+
 out:
-       cfs_up(&obj->oo_guard);
+       up(&obj->oo_guard);
        RETURN(rc);
 }
 
@@ -1366,9 +1512,9 @@ static int osd_object_ref_add(const struct lu_env *env,
 
        oh = container_of0(handle, struct osd_thandle, ot_super);
 
-       cfs_write_lock(&obj->oo_attr_lock);
+       write_lock(&obj->oo_attr_lock);
        nlink = ++obj->oo_attr.la_nlink;
-       cfs_write_unlock(&obj->oo_attr_lock);
+       write_unlock(&obj->oo_attr_lock);
 
        rc = osd_object_sa_update(obj, SA_ZPL_LINKS(uos), &nlink, 8, oh);
        return rc;
@@ -1404,9 +1550,9 @@ static int osd_object_ref_del(const struct lu_env *env,
        oh = container_of0(handle, struct osd_thandle, ot_super);
        LASSERT(!lu_object_is_dying(dt->do_lu.lo_header));
 
-       cfs_write_lock(&obj->oo_attr_lock);
+       write_lock(&obj->oo_attr_lock);
        nlink = --obj->oo_attr.la_nlink;
-       cfs_write_unlock(&obj->oo_attr_lock);
+       write_unlock(&obj->oo_attr_lock);
 
        rc = osd_object_sa_update(obj, SA_ZPL_LINKS(uos), &nlink, 8, oh);
        return rc;
@@ -1430,14 +1576,14 @@ static int capa_is_sane(const struct lu_env *env, struct osd_device *dev,
                RETURN(rc);
        }
 
-       cfs_spin_lock(&capa_lock);
+       spin_lock(&capa_lock);
        for (i = 0; i < 2; i++) {
                if (keys[i].lk_keyid == capa->lc_keyid) {
                        oti->oti_capa_key = keys[i];
                        break;
                }
        }
-       cfs_spin_unlock(&capa_lock);
+       spin_unlock(&capa_lock);
 
        if (i == 2) {
                DEBUG_CAPA(D_ERROR, capa, "no matched capa key");
@@ -1533,9 +1679,9 @@ static struct obd_capa *osd_capa_get(const struct lu_env *env,
                RETURN(oc);
        }
 
-       cfs_spin_lock(&capa_lock);
+       spin_lock(&capa_lock);
        *key = dev->od_capa_keys[1];
-       cfs_spin_unlock(&capa_lock);
+       spin_unlock(&capa_lock);
 
        capa->lc_keyid = key->lk_keyid;
        capa->lc_expiry = cfs_time_current_sec() + dev->od_capa_timeout;
@@ -1601,3 +1747,17 @@ static struct lu_object_operations osd_lu_obj_ops = {
        .loo_object_invariant   = osd_object_invariant,
 };
 
+/* do_attr_get of osd_obj_otable_it_ops: reports no valid attribute at all */
+static int osd_otable_it_attr_get(const struct lu_env *env,
+                                 struct dt_object *dt, struct lu_attr *attr,
+                                 struct lustre_capa *capa)
+{
+       attr->la_valid = 0;
+       return 0;
+}
+
+static struct dt_object_operations osd_obj_otable_it_ops = {
+        .do_attr_get    = osd_otable_it_attr_get, /* clears la_valid */
+        .do_index_try   = osd_index_try,
+}; /* installed by osd_object_init() when fid_is_otable_it() is true */
+