X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fosd-zfs%2Fosd_index.c;h=8c1ffbf02a5737893527b630a24012ff7595d22a;hb=e3d507eec50fc1ff79acf2a9f93d52d698c887d7;hp=b68833f8613777ba16ddc5b32ee6800c8912bba9;hpb=98060d83459ba10409f295898f0ec917f938b4d3;p=fs%2Flustre-release.git diff --git a/lustre/osd-zfs/osd_index.c b/lustre/osd-zfs/osd_index.c index b68833f..8c1ffbf 100644 --- a/lustre/osd-zfs/osd_index.c +++ b/lustre/osd-zfs/osd_index.c @@ -26,10 +26,8 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. - */ -/* - * Copyright (c) 2012, 2013, Intel Corporation. - * Use is subject to license terms. + * + * Copyright (c) 2012, 2015, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -41,14 +39,10 @@ * Author: Mike Pershin */ -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif #define DEBUG_SUBSYSTEM S_OSD #include #include -#include #include #include #include @@ -71,33 +65,120 @@ #include #include +static inline int osd_object_is_zap(dmu_buf_t *db) +{ + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *) db; + dnode_t *dn; + int rc; + + DB_DNODE_ENTER(dbi); + dn = DB_DNODE(dbi); + rc = (dn->dn_type == DMU_OT_DIRECTORY_CONTENTS || + dn->dn_type == DMU_OT_USERGROUP_USED); + DB_DNODE_EXIT(dbi); + + return rc; +} + +/* We don't actually have direct access to the zap_hashbits() function + * so just pretend like we do for now. If this ever breaks we can look at + * it at that time. */ +#define zap_hashbits(zc) 48 +/* + * ZFS hash format: + * | cd (16 bits) | hash (48 bits) | + * we need it in other form: + * |0| hash (48 bit) | cd (15 bit) | + * to be a full 64-bit ordered hash so that Lustre readdir can use it to merge + * the readdir hashes from multiple directory stripes uniformly on the client. + * Another point is sign bit, the hash range should be in [0, 2^63-1] because + * loff_t (for llseek) needs to be a positive value. This means the "cd" field + * should only be the low 15 bits. + */ +uint64_t osd_zap_cursor_serialize(zap_cursor_t *zc) +{ + uint64_t zfs_hash = zap_cursor_serialize(zc) & (~0ULL >> 1); + + return (zfs_hash >> zap_hashbits(zc)) | + (zfs_hash << (63 - zap_hashbits(zc))); +} + +void osd_zap_cursor_init_serialized(zap_cursor_t *zc, struct objset *os, + uint64_t id, uint64_t dirhash) +{ + uint64_t zfs_hash = ((dirhash << zap_hashbits(zc)) & (~0ULL >> 1)) | + (dirhash >> (63 - zap_hashbits(zc))); + + zap_cursor_init_serialized(zc, os, id, zfs_hash); +} + +int osd_zap_cursor_init(zap_cursor_t **zc, struct objset *os, + uint64_t id, uint64_t dirhash) +{ + zap_cursor_t *t; + + OBD_ALLOC_PTR(t); + if (unlikely(t == NULL)) + return -ENOMEM; + + osd_zap_cursor_init_serialized(t, os, id, dirhash); + *zc = t; + + return 0; +} + +void osd_zap_cursor_fini(zap_cursor_t *zc) +{ + zap_cursor_fini(zc); + OBD_FREE_PTR(zc); +} + +static inline void osd_obj_cursor_init_serialized(zap_cursor_t *zc, + struct osd_object *o, + uint64_t dirhash) +{ + struct osd_device *d = osd_obj2dev(o); + osd_zap_cursor_init_serialized(zc, d->od_os, + o->oo_db->db_object, dirhash); +} + +static inline int osd_obj_cursor_init(zap_cursor_t **zc, struct osd_object *o, + uint64_t dirhash) +{ + struct osd_device *d = osd_obj2dev(o); + return osd_zap_cursor_init(zc, d->od_os, o->oo_db->db_object, dirhash); +} + static struct dt_it *osd_index_it_init(const struct lu_env *env, struct dt_object *dt, - __u32 unused, - struct lustre_capa *capa) + __u32 unused) { struct osd_thread_info *info = osd_oti_get(env); struct osd_zap_it *it; struct osd_object *obj = osd_dt_obj(dt); - struct osd_device *osd = osd_obj2dev(obj); struct lu_object *lo = &dt->do_lu; + int rc; ENTRY; - /* XXX: check capa ? */ + if (obj->oo_destroyed) + RETURN(ERR_PTR(-ENOENT)); LASSERT(lu_object_exists(lo)); LASSERT(obj->oo_db); - LASSERT(udmu_object_is_zap(obj->oo_db)); + LASSERT(osd_object_is_zap(obj->oo_db)); LASSERT(info); - it = &info->oti_it_zap; - - if (udmu_zap_cursor_init(&it->ozi_zc, &osd->od_objset, - obj->oo_db->db_object, 0)) + OBD_SLAB_ALLOC_PTR_GFP(it, osd_zapit_cachep, GFP_NOFS); + if (it == NULL) RETURN(ERR_PTR(-ENOMEM)); + rc = osd_obj_cursor_init(&it->ozi_zc, obj, 0); + if (rc != 0) { + OBD_SLAB_FREE_PTR(it, osd_zapit_cachep); + RETURN(ERR_PTR(rc)); + } + it->ozi_obj = obj; - it->ozi_capa = capa; it->ozi_reset = 1; lu_object_get(lo); @@ -106,8 +187,8 @@ static struct dt_it *osd_index_it_init(const struct lu_env *env, static void osd_index_it_fini(const struct lu_env *env, struct dt_it *di) { - struct osd_zap_it *it = (struct osd_zap_it *)di; - struct osd_object *obj; + struct osd_zap_it *it = (struct osd_zap_it *)di; + struct osd_object *obj; ENTRY; LASSERT(it); @@ -115,8 +196,9 @@ static void osd_index_it_fini(const struct lu_env *env, struct dt_it *di) obj = it->ozi_obj; - udmu_zap_cursor_fini(it->ozi_zc); + osd_zap_cursor_fini(it->ozi_zc); lu_object_put(env, &obj->oo_dt.do_lu); + OBD_SLAB_FREE_PTR(it, osd_zapit_cachep); EXIT; } @@ -128,57 +210,6 @@ static void osd_index_it_put(const struct lu_env *env, struct dt_it *di) * next/finish. */ } -int udmu_zap_cursor_retrieve_key(const struct lu_env *env, - zap_cursor_t *zc, char *key, int max) -{ - zap_attribute_t *za = &osd_oti_get(env)->oti_za; - int err; - - if ((err = zap_cursor_retrieve(zc, za))) - return err; - - if (key) - strcpy(key, za->za_name); - - return 0; -} - -/* - * zap_cursor_retrieve read from current record. - * to read bytes we need to call zap_lookup explicitly. - */ -int udmu_zap_cursor_retrieve_value(const struct lu_env *env, - zap_cursor_t *zc, char *buf, - int buf_size, int *bytes_read) -{ - zap_attribute_t *za = &osd_oti_get(env)->oti_za; - int err, actual_size; - - if ((err = zap_cursor_retrieve(zc, za))) - return err; - - if (za->za_integer_length <= 0) - return (ERANGE); - - actual_size = za->za_integer_length * za->za_num_integers; - - if (actual_size > buf_size) { - actual_size = buf_size; - buf_size = actual_size / za->za_integer_length; - } else { - buf_size = za->za_num_integers; - } - - err = -zap_lookup(zc->zc_objset, zc->zc_zapobj, - za->za_name, za->za_integer_length, - buf_size, buf); - - if (!err) - *bytes_read = actual_size; - - return err; -} - static inline void osd_it_append_attrs(struct lu_dirent *ent, __u32 attr, int len, __u16 type) { @@ -197,90 +228,121 @@ static inline void osd_it_append_attrs(struct lu_dirent *ent, __u32 attr, ent->lde_attrs = cpu_to_le32(ent->lde_attrs); } -/* - * as we don't know FID, we can't use LU object, so this function - * partially duplicate __osd_xattr_get() which is built around - * LU-object and uses it to cache data like regular EA dnode, etc +/** + * Get the object's FID from its LMA EA. + * + * \param[in] env pointer to the thread context + * \param[in] osd pointer to the OSD device + * \param[in] oid the object's local identifier + * \param[out] fid the buffer to hold the object's FID + * + * \retval 0 for success + * \retval negative error number on failure */ -static int osd_find_parent_by_dnode(const struct lu_env *env, - struct dt_object *o, - struct lu_fid *fid) +static int osd_get_fid_by_oid(const struct lu_env *env, struct osd_device *osd, + uint64_t oid, struct lu_fid *fid) { - struct lustre_mdt_attrs *lma; - udmu_objset_t *uos = &osd_obj2dev(osd_dt_obj(o))->od_objset; + struct objset *os = osd->od_os; + struct osd_thread_info *oti = osd_oti_get(env); + struct lustre_mdt_attrs *lma = + (struct lustre_mdt_attrs *)oti->oti_buf; struct lu_buf buf; - sa_handle_t *sa_hdl; - nvlist_t *nvbuf = NULL; - uchar_t *value; - uint64_t dnode; - int rc, size; + nvlist_t *sa_xattr = NULL; + sa_handle_t *sa_hdl = NULL; + uchar_t *nv_value = NULL; + uint64_t xattr = ZFS_NO_OBJECT; + int size = 0; + int rc; ENTRY; - /* first of all, get parent dnode from own attributes */ - LASSERT(osd_dt_obj(o)->oo_db); - rc = -sa_handle_get(uos->os, osd_dt_obj(o)->oo_db->db_object, - NULL, SA_HDL_PRIVATE, &sa_hdl); - if (rc) - RETURN(rc); + rc = __osd_xattr_load(osd, oid, &sa_xattr); + if (rc == -ENOENT) + goto regular; - dnode = ZFS_NO_OBJECT; - rc = -sa_lookup(sa_hdl, SA_ZPL_PARENT(uos), &dnode, 8); - sa_handle_destroy(sa_hdl); - if (rc) - RETURN(rc); + if (rc != 0) + GOTO(out, rc); - /* now get EA buffer */ - rc = __osd_xattr_load(uos, dnode, &nvbuf); - if (rc) - GOTO(regular, rc); + rc = -nvlist_lookup_byte_array(sa_xattr, XATTR_NAME_LMA, &nv_value, + &size); + if (rc == -ENOENT) + goto regular; - /* XXX: if we get that far.. should we cache the result? */ + if (rc != 0) + GOTO(out, rc); - /* try to find LMA attribute */ - LASSERT(nvbuf != NULL); - rc = -nvlist_lookup_byte_array(nvbuf, XATTR_NAME_LMA, &value, &size); - if (rc == 0 && size >= sizeof(*lma)) { - lma = (struct lustre_mdt_attrs *)value; - lustre_lma_swab(lma); - *fid = lma->lma_self_fid; - GOTO(out, rc = 0); - } + if (unlikely(size > sizeof(oti->oti_buf))) + GOTO(out, rc = -ERANGE); -regular: - /* no LMA attribute in SA, let's try regular EA */ + memcpy(lma, nv_value, size); - /* first of all, get parent dnode storing regular EA */ - rc = -sa_handle_get(uos->os, dnode, NULL, SA_HDL_PRIVATE, &sa_hdl); - if (rc) + goto found; + +regular: + rc = -sa_handle_get(os, oid, NULL, SA_HDL_PRIVATE, &sa_hdl); + if (rc != 0) GOTO(out, rc); - dnode = ZFS_NO_OBJECT; - rc = -sa_lookup(sa_hdl, SA_ZPL_XATTR(uos), &dnode, 8); + rc = -sa_lookup(sa_hdl, SA_ZPL_XATTR(osd), &xattr, 8); sa_handle_destroy(sa_hdl); - if (rc) + if (rc != 0) GOTO(out, rc); - CLASSERT(sizeof(*lma) <= sizeof(osd_oti_get(env)->oti_buf)); - buf.lb_buf = osd_oti_get(env)->oti_buf; - buf.lb_len = sizeof(osd_oti_get(env)->oti_buf); - - /* now try to find LMA */ - rc = __osd_xattr_get_large(env, uos, dnode, &buf, + buf.lb_buf = lma; + buf.lb_len = sizeof(oti->oti_buf); + rc = __osd_xattr_get_large(env, osd, xattr, &buf, XATTR_NAME_LMA, &size); - if (rc == 0 && size >= sizeof(*lma)) { - lma = buf.lb_buf; - lustre_lma_swab(lma); - *fid = lma->lma_self_fid; - GOTO(out, rc = 0); - } else if (rc < 0) { + if (rc != 0) GOTO(out, rc); - } else { + +found: + if (size < sizeof(*lma)) GOTO(out, rc = -EIO); + + lustre_lma_swab(lma); + if (unlikely((lma->lma_incompat & ~LMA_INCOMPAT_SUPP) || + CFS_FAIL_CHECK(OBD_FAIL_OSD_LMA_INCOMPAT))) { + CWARN("%s: unsupported incompat LMA feature(s) %#x for " + "oid = "LPX64"\n", osd->od_svname, + lma->lma_incompat & ~LMA_INCOMPAT_SUPP, oid); + GOTO(out, rc = -EOPNOTSUPP); + } else { + *fid = lma->lma_self_fid; + GOTO(out, rc = 0); } out: - if (nvbuf != NULL) - nvlist_free(nvbuf); + if (sa_xattr != NULL) + nvlist_free(sa_xattr); + return rc; +} + +/* + * As we don't know FID, we can't use LU object, so this function + * partially duplicate __osd_xattr_get() which is built around + * LU-object and uses it to cache data like regular EA dnode, etc + */ +static int osd_find_parent_by_dnode(const struct lu_env *env, + struct dt_object *o, + struct lu_fid *fid) +{ + struct osd_device *osd = osd_obj2dev(osd_dt_obj(o)); + sa_handle_t *sa_hdl; + uint64_t dnode = ZFS_NO_OBJECT; + int rc; + ENTRY; + + /* first of all, get parent dnode from own attributes */ + LASSERT(osd_dt_obj(o)->oo_db); + rc = -sa_handle_get(osd->od_os, osd_dt_obj(o)->oo_db->db_object, + NULL, SA_HDL_PRIVATE, &sa_hdl); + if (rc != 0) + RETURN(rc); + + rc = -sa_lookup(sa_hdl, SA_ZPL_PARENT(osd), &dnode, 8); + sa_handle_destroy(sa_hdl); + if (rc == 0) + rc = osd_get_fid_by_oid(env, osd, dnode, fid); + RETURN(rc); } @@ -296,10 +358,9 @@ static int osd_find_parent_fid(const struct lu_env *env, struct dt_object *o, buf.lb_buf = osd_oti_get(env)->oti_buf; buf.lb_len = sizeof(osd_oti_get(env)->oti_buf); - rc = osd_xattr_get(env, o, &buf, XATTR_NAME_LINK, BYPASS_CAPA); + rc = osd_xattr_get(env, o, &buf, XATTR_NAME_LINK); if (rc == -ERANGE) { - rc = osd_xattr_get(env, o, &LU_BUF_NULL, - XATTR_NAME_LINK, BYPASS_CAPA); + rc = osd_xattr_get(env, o, &LU_BUF_NULL, XATTR_NAME_LINK); if (rc < 0) RETURN(rc); LASSERT(rc > 0); @@ -307,7 +368,7 @@ static int osd_find_parent_fid(const struct lu_env *env, struct dt_object *o, if (buf.lb_buf == NULL) RETURN(-ENOMEM); buf.lb_len = rc; - rc = osd_xattr_get(env, o, &buf, XATTR_NAME_LINK, BYPASS_CAPA); + rc = osd_xattr_get(env, o, &buf, XATTR_NAME_LINK); } if (rc < 0) GOTO(out, rc); @@ -355,8 +416,7 @@ out: } static int osd_dir_lookup(const struct lu_env *env, struct dt_object *dt, - struct dt_rec *rec, const struct dt_key *key, - struct lustre_capa *capa) + struct dt_rec *rec, const struct dt_key *key) { struct osd_thread_info *oti = osd_oti_get(env); struct osd_object *obj = osd_dt_obj(dt); @@ -365,7 +425,7 @@ static int osd_dir_lookup(const struct lu_env *env, struct dt_object *dt, int rc; ENTRY; - LASSERT(udmu_object_is_zap(obj->oo_db)); + LASSERT(osd_object_is_zap(obj->oo_db)); if (name[0] == '.') { if (name[1] == 0) { @@ -378,12 +438,22 @@ static int osd_dir_lookup(const struct lu_env *env, struct dt_object *dt, } } - rc = -zap_lookup(osd->od_objset.os, obj->oo_db->db_object, + memset(&oti->oti_zde.lzd_fid, 0, sizeof(struct lu_fid)); + rc = -zap_lookup(osd->od_os, obj->oo_db->db_object, (char *)key, 8, sizeof(oti->oti_zde) / 8, (void *)&oti->oti_zde); - memcpy(rec, &oti->oti_zde.lzd_fid, sizeof(struct lu_fid)); + if (rc != 0) + RETURN(rc); - RETURN(rc == 0 ? 1 : rc); + if (likely(fid_is_sane(&oti->oti_zde.lzd_fid))) { + memcpy(rec, &oti->oti_zde.lzd_fid, sizeof(struct lu_fid)); + RETURN(1); + } + + rc = osd_get_fid_by_oid(env, osd, oti->oti_zde.lzd_reg.zde_dnode, + (struct lu_fid *)rec); + + RETURN(rc == 0 ? 1 : (rc == -ENOENT ? -ENODATA : rc)); } static int osd_declare_dir_insert(const struct lu_env *env, @@ -394,16 +464,20 @@ static int osd_declare_dir_insert(const struct lu_env *env, { struct osd_object *obj = osd_dt_obj(dt); struct osd_thandle *oh; + uint64_t object; ENTRY; LASSERT(th != NULL); oh = container_of0(th, struct osd_thandle, ot_super); - LASSERT(obj->oo_db); - LASSERT(udmu_object_is_zap(obj->oo_db)); + /* This is for inserting dot/dotdot for new created dir. */ + if (obj->oo_db == NULL) + object = DMU_NEW_OBJECT; + else + object = obj->oo_db->db_object; - dmu_tx_hold_bonus(oh->ot_tx, obj->oo_db->db_object); - dmu_tx_hold_zap(oh->ot_tx, obj->oo_db->db_object, TRUE, (char *)key); + dmu_tx_hold_bonus(oh->ot_tx, object); + dmu_tx_hold_zap(oh->ot_tx, object, TRUE, (char *)key); RETURN(0); } @@ -445,7 +519,7 @@ struct osd_object *osd_object_find(const struct lu_env *env, child = osd_obj(lo); else LU_OBJECT_DEBUG(D_ERROR, env, luch, - "%s: object can't be located "DFID"\n", + "%s: object can't be located "DFID, osd_dev(ludev)->od_svname, PFID(fid)); if (child == NULL) { @@ -456,7 +530,7 @@ struct osd_object *osd_object_find(const struct lu_env *env, } } else { LU_OBJECT_DEBUG(D_ERROR, env, luch, - "%s: lu_object does not exists "DFID"\n", + "%s: lu_object does not exists "DFID, osd_dev(ludev)->od_svname, PFID(fid)); lu_object_put(env, luch); child = ERR_PTR(-ENOENT); @@ -476,25 +550,49 @@ static inline void osd_object_put(const struct lu_env *env, lu_object_put(env, &obj->oo_dt.do_lu); } -static int osd_remote_fid(const struct lu_env *env, struct osd_device *osd, - struct lu_fid *fid) +static int osd_seq_exists(const struct lu_env *env, struct osd_device *osd, + u64 seq) { struct lu_seq_range *range = &osd_oti_get(env)->oti_seq_range; struct seq_server_site *ss = osd_seq_site(osd); int rc; ENTRY; - if (!fid_is_norm(fid) && !fid_is_root(fid)) - RETURN(0); + LASSERT(ss != NULL); + LASSERT(ss->ss_server_fld != NULL); - rc = osd_fld_lookup(env, osd, fid, range); + rc = osd_fld_lookup(env, osd, seq, range); if (rc != 0) { - CERROR("%s: Can not lookup fld for "DFID"\n", - osd_name(osd), PFID(fid)); - RETURN(rc); + if (rc != -ENOENT) + CERROR("%s: Can not lookup fld for "LPX64"\n", + osd_name(osd), seq); + RETURN(0); } - RETURN(ss->ss_node_id != range->lsr_index); + RETURN(ss->ss_node_id == range->lsr_index); +} + +static int osd_remote_fid(const struct lu_env *env, struct osd_device *osd, + const struct lu_fid *fid) +{ + struct seq_server_site *ss = osd_seq_site(osd); + ENTRY; + + /* FID seqs not in FLDB, must be local seq */ + if (unlikely(!fid_seq_in_fldb(fid_seq(fid)))) + RETURN(0); + + /* If FLD is not being initialized yet, it only happens during the + * initialization, likely during mgs initialization, and we assume + * this is local FID. */ + if (ss == NULL || ss->ss_server_fld == NULL) + RETURN(0); + + /* Only check the local FLDB here */ + if (osd_seq_exists(env, osd, fid_seq(fid))) + RETURN(0); + + RETURN(1); } /** @@ -504,7 +602,6 @@ static int osd_remote_fid(const struct lu_env *env, struct osd_device *osd, * \param key key for index * \param rec record reference * \param th transaction handler - * \param capa capability descriptor * \param ignore_quota update should not affect quota * * \retval 0 success @@ -512,13 +609,13 @@ static int osd_remote_fid(const struct lu_env *env, struct osd_device *osd, */ static int osd_dir_insert(const struct lu_env *env, struct dt_object *dt, const struct dt_rec *rec, const struct dt_key *key, - struct thandle *th, struct lustre_capa *capa, - int ignore_quota) + struct thandle *th, int ignore_quota) { struct osd_thread_info *oti = osd_oti_get(env); struct osd_object *parent = osd_dt_obj(dt); struct osd_device *osd = osd_obj2dev(parent); - struct lu_fid *fid = (struct lu_fid *)rec; + struct dt_insert_rec *rec1 = (struct dt_insert_rec *)rec; + const struct lu_fid *fid = rec1->rec_fid; struct osd_thandle *oh; struct osd_object *child = NULL; __u32 attr; @@ -527,7 +624,7 @@ static int osd_dir_insert(const struct lu_env *env, struct dt_object *dt, ENTRY; LASSERT(parent->oo_db); - LASSERT(udmu_object_is_zap(parent->oo_db)); + LASSERT(osd_object_is_zap(parent->oo_db)); LASSERT(dt_object_exists(dt)); LASSERT(osd_invariant(parent)); @@ -545,7 +642,7 @@ static int osd_dir_insert(const struct lu_env *env, struct dt_object *dt, if (unlikely(rc == 1)) { /* Insert remote entry */ memset(&oti->oti_zde.lzd_reg, 0, sizeof(oti->oti_zde.lzd_reg)); - oti->oti_zde.lzd_reg.zde_type = IFTODT(S_IFDIR & S_IFMT); + oti->oti_zde.lzd_reg.zde_type = IFTODT(rec1->rec_type & S_IFMT); } else { /* * To simulate old Orion setups with ./.. stored in the @@ -563,13 +660,25 @@ static int osd_dir_insert(const struct lu_env *env, struct dt_object *dt, * during iteration */ GOTO(out, rc = 0); } else if (name[1] == '.' && name[2] == 0) { + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_BAD_PARENT)) { + struct lu_fid tfid = *fid; + + osd_object_put(env, child); + tfid.f_oid--; + child = osd_object_find(env, dt, &tfid); + if (IS_ERR(child)) + RETURN(PTR_ERR(child)); + + LASSERT(child->oo_db); + } + /* update parent dnode in the child. * later it will be used to generate ".." */ - udmu_objset_t *uos = &osd->od_objset; rc = osd_object_sa_update(parent, - SA_ZPL_PARENT(uos), + SA_ZPL_PARENT(osd), &child->oo_db->db_object, 8, oh); + GOTO(out, rc); } } @@ -582,9 +691,15 @@ static int osd_dir_insert(const struct lu_env *env, struct dt_object *dt, oti->oti_zde.lzd_fid = *fid; /* Insert (key,oid) into ZAP */ - rc = -zap_add(osd->od_objset.os, parent->oo_db->db_object, + rc = -zap_add(osd->od_os, parent->oo_db->db_object, (char *)key, 8, sizeof(oti->oti_zde) / 8, (void *)&oti->oti_zde, oh->ot_tx); + if (unlikely(rc == -EEXIST && + name[0] == '.' && name[1] == '.' && name[2] == 0)) + /* Update (key,oid) in ZAP */ + rc = -zap_update(osd->od_os, parent->oo_db->db_object, + (char *)key, 8, sizeof(oti->oti_zde) / 8, + (void *)&oti->oti_zde, oh->ot_tx); out: if (child != NULL) @@ -598,8 +713,9 @@ static int osd_declare_dir_delete(const struct lu_env *env, const struct dt_key *key, struct thandle *th) { - struct osd_object *obj = osd_dt_obj(dt); + struct osd_object *obj = osd_dt_obj(dt); struct osd_thandle *oh; + uint64_t dnode; ENTRY; LASSERT(dt_object_exists(dt)); @@ -608,17 +724,20 @@ static int osd_declare_dir_delete(const struct lu_env *env, LASSERT(th != NULL); oh = container_of0(th, struct osd_thandle, ot_super); - LASSERT(obj->oo_db); - LASSERT(udmu_object_is_zap(obj->oo_db)); - - dmu_tx_hold_zap(oh->ot_tx, obj->oo_db->db_object, TRUE, (char *)key); + if (dt_object_exists(dt)) { + LASSERT(obj->oo_db); + LASSERT(osd_object_is_zap(obj->oo_db)); + dnode = obj->oo_db->db_object; + } else { + dnode = DMU_NEW_OBJECT; + } + dmu_tx_hold_zap(oh->ot_tx, dnode, TRUE, (char *)key); RETURN(0); } static int osd_dir_delete(const struct lu_env *env, struct dt_object *dt, - const struct dt_key *key, struct thandle *th, - struct lustre_capa *capa) + const struct dt_key *key, struct thandle *th) { struct osd_object *obj = osd_dt_obj(dt); struct osd_device *osd = osd_obj2dev(obj); @@ -628,8 +747,8 @@ static int osd_dir_delete(const struct lu_env *env, struct dt_object *dt, int rc; ENTRY; - LASSERT(obj->oo_db); - LASSERT(udmu_object_is_zap(obj->oo_db)); + LASSERT(zap_db); + LASSERT(osd_object_is_zap(zap_db)); LASSERT(th != NULL); oh = container_of0(th, struct osd_thandle, ot_super); @@ -647,14 +766,9 @@ static int osd_dir_delete(const struct lu_env *env, struct dt_object *dt, } /* Remove key from the ZAP */ - rc = -zap_remove(osd->od_objset.os, zap_db->db_object, + rc = -zap_remove(osd->od_os, zap_db->db_object, (char *) key, oh->ot_tx); -#if LUSTRE_VERSION_CODE <= OBD_OCD_VERSION(2, 4, 53, 0) - if (unlikely(rc == -ENOENT && name[0] == '.' && - (name[1] == 0 || (name[1] == '.' && name[2] == 0)))) - rc = 0; -#endif if (unlikely(rc && rc != -ENOENT)) CERROR("%s: zap_remove failed: rc = %d\n", osd->od_svname, rc); @@ -663,12 +777,11 @@ static int osd_dir_delete(const struct lu_env *env, struct dt_object *dt, static struct dt_it *osd_dir_it_init(const struct lu_env *env, struct dt_object *dt, - __u32 unused, - struct lustre_capa *capa) + __u32 unused) { struct osd_zap_it *it; - it = (struct osd_zap_it *)osd_index_it_init(env, dt, unused, capa); + it = (struct osd_zap_it *)osd_index_it_init(env, dt, unused); if (!IS_ERR(it)) it->ozi_pos = 0; @@ -690,7 +803,6 @@ static int osd_dir_it_get(const struct lu_env *env, { struct osd_zap_it *it = (struct osd_zap_it *)di; struct osd_object *obj = it->ozi_obj; - struct osd_device *osd = osd_obj2dev(obj); char *name = (char *)key; int rc; ENTRY; @@ -698,11 +810,9 @@ static int osd_dir_it_get(const struct lu_env *env, LASSERT(it); LASSERT(it->ozi_zc); - udmu_zap_cursor_fini(it->ozi_zc); - - if (udmu_zap_cursor_init(&it->ozi_zc, &osd->od_objset, - obj->oo_db->db_object, 0)) - RETURN(-ENOMEM); + /* reset the cursor */ + zap_cursor_fini(it->ozi_zc); + osd_obj_cursor_init_serialized(it->ozi_zc, obj, 0); /* XXX: implementation of the API is broken at the moment */ LASSERT(((const char *)key)[0] == 0); @@ -781,6 +891,8 @@ static int osd_dir_it_next(const struct lu_env *env, struct dt_it *di) zap_attribute_t *za = &osd_oti_get(env)->oti_za; int rc; + ENTRY; + /* temp. storage should be enough for any key supported by ZFS */ CLASSERT(sizeof(za->za_name) <= sizeof(it->ozi_name)); @@ -793,9 +905,10 @@ static int osd_dir_it_next(const struct lu_env *env, struct dt_it *di) it->ozi_pos++; if (it->ozi_pos <=2) RETURN(0); - } - zap_cursor_advance(it->ozi_zc); + } else { + zap_cursor_advance(it->ozi_zc); + } /* * According to current API we need to return error if its last entry. @@ -831,19 +944,6 @@ static struct dt_key *osd_dir_it_key(const struct lu_env *env, strcpy(it->ozi_name, za->za_name); -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 3, 90, 0) - if (za->za_name[0] == '.') { - if (za->za_name[1] == 0 || (za->za_name[1] == '.' && - za->za_name[2] == 0)) { - /* we should not get onto . and .. - * stored in the directory. ->next() and - * other methods should prevent this - */ - LBUG(); - } - } -#endif - RETURN((struct dt_key *)it->ozi_name); } @@ -864,18 +964,6 @@ static int osd_dir_it_key_size(const struct lu_env *env, const struct dt_it *di) if ((rc = -zap_cursor_retrieve(it->ozi_zc, za)) == 0) rc = strlen(za->za_name); -#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 3, 90, 0) - if (rc == 0 && za->za_name[0] == '.') { - if (za->za_name[1] == 0 || (za->za_name[1] == '.' && - za->za_name[2] == 0)) { - /* we should not get onto . and .. - * stored in the directory. ->next() and - * other methods should prevent this - */ - LBUG(); - } - } -#endif RETURN(rc); } @@ -910,23 +998,20 @@ static int osd_dir_it_rec(const struct lu_env *env, const struct dt_it *di, osd_it_append_attrs(lde, attr, 2, IFTODT(S_IFDIR)); lde->lde_reclen = cpu_to_le16(lu_dirent_calc_size(2, attr)); rc = osd_find_parent_fid(env, &it->ozi_obj->oo_dt, &lde->lde_fid); - /* - * early Orion code was not setting LinkEA, so it's possible - * some setups still have objects with no LinkEA set. - * but at that time .. was a real record in the directory - * so we should try to lookup .. in ZAP - */ - if (rc != -ENOENT) - GOTO(out, rc); + + /* ENOENT happens at the root of filesystem so ignore it */ + if (rc == -ENOENT) + rc = 0; + GOTO(out, rc); } LASSERT(lde); - lde->lde_hash = cpu_to_le64(udmu_zap_cursor_serialize(it->ozi_zc)); - - if ((rc = -zap_cursor_retrieve(it->ozi_zc, za))) + rc = -zap_cursor_retrieve(it->ozi_zc, za); + if (unlikely(rc != 0)) GOTO(out, rc); + lde->lde_hash = cpu_to_le64(osd_zap_cursor_serialize(it->ozi_zc)); namelen = strlen(za->za_name); if (namelen > NAME_MAX) GOTO(out, rc = -EOVERFLOW); @@ -958,6 +1043,45 @@ out: RETURN(rc); } +static int osd_dir_it_rec_size(const struct lu_env *env, const struct dt_it *di, + __u32 attr) +{ + struct osd_zap_it *it = (struct osd_zap_it *)di; + zap_attribute_t *za = &osd_oti_get(env)->oti_za; + size_t namelen = 0; + int rc; + ENTRY; + + if (it->ozi_pos <= 1) + namelen = 1; + else if (it->ozi_pos == 2) + namelen = 2; + + if (namelen > 0) { + rc = lu_dirent_calc_size(namelen, attr); + RETURN(rc); + } + + rc = -zap_cursor_retrieve(it->ozi_zc, za); + if (unlikely(rc != 0)) + RETURN(rc); + + if (za->za_integer_length != 8 || za->za_num_integers < 3) { + CERROR("%s: unsupported direntry format: %d %d\n", + osd_obj2dev(it->ozi_obj)->od_svname, + za->za_integer_length, (int)za->za_num_integers); + RETURN(-EIO); + } + + namelen = strlen(za->za_name); + if (namelen > NAME_MAX) + RETURN(-EOVERFLOW); + + rc = lu_dirent_calc_size(namelen, attr); + + RETURN(rc); +} + static __u64 osd_dir_it_store(const struct lu_env *env, const struct dt_it *di) { struct osd_zap_it *it = (struct osd_zap_it *)di; @@ -967,7 +1091,7 @@ static __u64 osd_dir_it_store(const struct lu_env *env, const struct dt_it *di) if (it->ozi_pos <= 2) pos = it->ozi_pos; else - pos = udmu_zap_cursor_serialize(it->ozi_zc); + pos = osd_zap_cursor_serialize(it->ozi_zc); RETURN(pos); } @@ -983,15 +1107,13 @@ static int osd_dir_it_load(const struct lu_env *env, { struct osd_zap_it *it = (struct osd_zap_it *)di; struct osd_object *obj = it->ozi_obj; - struct osd_device *osd = osd_obj2dev(obj); zap_attribute_t *za = &osd_oti_get(env)->oti_za; int rc; ENTRY; - udmu_zap_cursor_fini(it->ozi_zc); - if (udmu_zap_cursor_init(&it->ozi_zc, &osd->od_objset, - obj->oo_db->db_object, hash)) - RETURN(-ENOMEM); + /* reset the cursor */ + zap_cursor_fini(it->ozi_zc); + osd_obj_cursor_init_serialized(it->ozi_zc, obj, hash); if (hash <= 2) { it->ozi_pos = hash; @@ -1009,7 +1131,7 @@ static int osd_dir_it_load(const struct lu_env *env, RETURN(rc); } -static struct dt_index_operations osd_dir_ops = { +struct dt_index_operations osd_dir_ops = { .dio_lookup = osd_dir_lookup, .dio_declare_insert = osd_declare_dir_insert, .dio_insert = osd_dir_insert, @@ -1024,6 +1146,7 @@ static struct dt_index_operations osd_dir_ops = { .key = osd_dir_it_key, .key_size = osd_dir_it_key_size, .rec = osd_dir_it_rec, + .rec_size = osd_dir_it_rec_size, .store = osd_dir_it_store, .load = osd_dir_it_load } @@ -1031,20 +1154,43 @@ static struct dt_index_operations osd_dir_ops = { /* * Primitives for index files using binary keys. - * XXX: only 64-bit keys are supported for now. */ +/* key integer_size is 8 */ +static int osd_prepare_key_uint64(struct osd_object *o, __u64 *dst, + const struct dt_key *src) +{ + int size; + + LASSERT(dst); + LASSERT(src); + + /* align keysize to 64bit */ + size = (o->oo_keysize + sizeof(__u64) - 1) / sizeof(__u64); + size *= sizeof(__u64); + + LASSERT(size <= MAXNAMELEN); + + if (unlikely(size > o->oo_keysize)) + memset(dst + o->oo_keysize, 0, size - o->oo_keysize); + memcpy(dst, (const char *)src, o->oo_keysize); + + return (size/sizeof(__u64)); +} + static int osd_index_lookup(const struct lu_env *env, struct dt_object *dt, - struct dt_rec *rec, const struct dt_key *key, - struct lustre_capa *capa) + struct dt_rec *rec, const struct dt_key *key) { struct osd_object *obj = osd_dt_obj(dt); struct osd_device *osd = osd_obj2dev(obj); + __u64 *k = osd_oti_get(env)->oti_key64; int rc; ENTRY; - rc = -zap_lookup_uint64(osd->od_objset.os, obj->oo_db->db_object, - (const __u64 *)key, 1, 8, obj->oo_recsize, + rc = osd_prepare_key_uint64(obj, k, key); + + rc = -zap_lookup_uint64(osd->od_os, obj->oo_db->db_object, + k, rc, obj->oo_recusize, obj->oo_recsize, (void *)rec); RETURN(rc == 0 ? 1 : rc); } @@ -1076,12 +1222,12 @@ static int osd_declare_index_insert(const struct lu_env *env, static int osd_index_insert(const struct lu_env *env, struct dt_object *dt, const struct dt_rec *rec, const struct dt_key *key, - struct thandle *th, struct lustre_capa *capa, - int ignore_quota) + struct thandle *th, int ignore_quota) { struct osd_object *obj = osd_dt_obj(dt); struct osd_device *osd = osd_obj2dev(obj); struct osd_thandle *oh; + __u64 *k = osd_oti_get(env)->oti_key64; int rc; ENTRY; @@ -1092,9 +1238,11 @@ static int osd_index_insert(const struct lu_env *env, struct dt_object *dt, oh = container_of0(th, struct osd_thandle, ot_super); + rc = osd_prepare_key_uint64(obj, k, key); + /* Insert (key,oid) into ZAP */ - rc = -zap_add_uint64(osd->od_objset.os, obj->oo_db->db_object, - (const __u64 *)key, 1, 8, obj->oo_recsize, + rc = -zap_add_uint64(osd->od_os, obj->oo_db->db_object, + k, rc, obj->oo_recusize, obj->oo_recsize, (void *)rec, oh->ot_tx); RETURN(rc); } @@ -1120,12 +1268,12 @@ static int osd_declare_index_delete(const struct lu_env *env, } static int osd_index_delete(const struct lu_env *env, struct dt_object *dt, - const struct dt_key *key, struct thandle *th, - struct lustre_capa *capa) + const struct dt_key *key, struct thandle *th) { struct osd_object *obj = osd_dt_obj(dt); struct osd_device *osd = osd_obj2dev(obj); struct osd_thandle *oh; + __u64 *k = osd_oti_get(env)->oti_key64; int rc; ENTRY; @@ -1133,9 +1281,11 @@ static int osd_index_delete(const struct lu_env *env, struct dt_object *dt, LASSERT(th != NULL); oh = container_of0(th, struct osd_thandle, ot_super); + rc = osd_prepare_key_uint64(obj, k, key); + /* Remove binary key from the ZAP */ - rc = -zap_remove_uint64(osd->od_objset.os, obj->oo_db->db_object, - (const __u64 *)key, 1, oh->ot_tx); + rc = -zap_remove_uint64(osd->od_os, obj->oo_db->db_object, + k, rc, oh->ot_tx); RETURN(rc); } @@ -1150,12 +1300,15 @@ static int osd_index_it_get(const struct lu_env *env, struct dt_it *di, LASSERT(it); LASSERT(it->ozi_zc); - /* XXX: API is broken at the moment */ - LASSERT(*((const __u64 *)key) == 0); + /* + * XXX: we need a binary version of zap_cursor_move_to_key() + * to implement this API */ + if (*((const __u64 *)key) != 0) + CERROR("NOT IMPLEMETED YET (move to "LPX64")\n", + *((__u64 *)key)); zap_cursor_fini(it->ozi_zc); - memset(it->ozi_zc, 0, sizeof(*it->ozi_zc)); - zap_cursor_init(it->ozi_zc, osd->od_objset.os, obj->oo_db->db_object); + zap_cursor_init(it->ozi_zc, osd->od_os, obj->oo_db->db_object); it->ozi_reset = 1; RETURN(+1); @@ -1189,6 +1342,7 @@ static struct dt_key *osd_index_it_key(const struct lu_env *env, const struct dt_it *di) { struct osd_zap_it *it = (struct osd_zap_it *)di; + struct osd_object *obj = it->ozi_obj; zap_attribute_t *za = &osd_oti_get(env)->oti_za; int rc = 0; ENTRY; @@ -1199,7 +1353,7 @@ static struct dt_key *osd_index_it_key(const struct lu_env *env, RETURN(ERR_PTR(rc)); /* the binary key is stored in the name */ - it->ozi_key = *((__u64 *)za->za_name); + memcpy(&it->ozi_key, za->za_name, obj->oo_keysize); RETURN((struct dt_key *)&it->ozi_key); } @@ -1207,8 +1361,9 @@ static struct dt_key *osd_index_it_key(const struct lu_env *env, static int osd_index_it_key_size(const struct lu_env *env, const struct dt_it *di) { - /* we only support 64-bit binary keys for the time being */ - RETURN(sizeof(__u64)); + struct osd_zap_it *it = (struct osd_zap_it *)di; + struct osd_object *obj = it->ozi_obj; + RETURN(obj->oo_keysize); } static int osd_index_it_rec(const struct lu_env *env, const struct dt_it *di, @@ -1218,6 +1373,7 @@ static int osd_index_it_rec(const struct lu_env *env, const struct dt_it *di, struct osd_zap_it *it = (struct osd_zap_it *)di; struct osd_object *obj = it->ozi_obj; struct osd_device *osd = osd_obj2dev(obj); + __u64 *k = osd_oti_get(env)->oti_key64; int rc; ENTRY; @@ -1226,9 +1382,11 @@ static int osd_index_it_rec(const struct lu_env *env, const struct dt_it *di, if (rc) RETURN(rc); - rc = -zap_lookup_uint64(osd->od_objset.os, obj->oo_db->db_object, - (const __u64 *)za->za_name, 1, 8, - obj->oo_recsize, (void *)rec); + rc = osd_prepare_key_uint64(obj, k, (const struct dt_key *)za->za_name); + + rc = -zap_lookup_uint64(osd->od_os, obj->oo_db->db_object, + k, rc, obj->oo_recusize, obj->oo_recsize, + (void *)rec); RETURN(rc); } @@ -1251,12 +1409,9 @@ static int osd_index_it_load(const struct lu_env *env, const struct dt_it *di, int rc; ENTRY; - /* close the current cursor */ + /* reset the cursor */ zap_cursor_fini(it->ozi_zc); - - /* create a new one starting at hash */ - memset(it->ozi_zc, 0, sizeof(*it->ozi_zc)); - zap_cursor_init_serialized(it->ozi_zc, osd->od_objset.os, + zap_cursor_init_serialized(it->ozi_zc, osd->od_os, obj->oo_db->db_object, hash); it->ozi_reset = 0; @@ -1289,60 +1444,274 @@ static struct dt_index_operations osd_index_ops = { } }; +struct osd_metadnode_it { + struct osd_device *mit_dev; + __u64 mit_pos; + struct lu_fid mit_fid; + int mit_prefetched; + __u64 mit_prefetched_dnode; +}; + +static struct dt_it *osd_zfs_otable_it_init(const struct lu_env *env, + struct dt_object *dt, __u32 attr) +{ + struct osd_device *dev = osd_dev(dt->do_lu.lo_dev); + struct osd_metadnode_it *it; + ENTRY; + + OBD_ALLOC_PTR(it); + if (unlikely(it == NULL)) + RETURN(ERR_PTR(-ENOMEM)); + + it->mit_dev = dev; + + /* XXX: dmu_object_next() does NOT find dnodes allocated + * in the current non-committed txg, so we force txg + * commit to find all existing dnodes ... */ + txg_wait_synced(dmu_objset_pool(dev->od_os), 0ULL); + + RETURN((struct dt_it *)it); +} + +static void osd_zfs_otable_it_fini(const struct lu_env *env, struct dt_it *di) +{ + struct osd_metadnode_it *it = (struct osd_metadnode_it *)di; + + OBD_FREE_PTR(it); +} + +static int osd_zfs_otable_it_get(const struct lu_env *env, + struct dt_it *di, const struct dt_key *key) +{ + return 0; +} + +static void osd_zfs_otable_it_put(const struct lu_env *env, struct dt_it *di) +{ +} + +#define OTABLE_PREFETCH 256 + +static void osd_zfs_otable_prefetch(const struct lu_env *env, + struct osd_metadnode_it *it) +{ + struct osd_device *dev = it->mit_dev; + int rc; + + /* can go negative on the very first access to the iterator + * or if some non-Lustre objects were found */ + if (unlikely(it->mit_prefetched < 0)) + it->mit_prefetched = 0; + + if (it->mit_prefetched >= (OTABLE_PREFETCH >> 1)) + return; + + if (it->mit_prefetched_dnode == 0) + it->mit_prefetched_dnode = it->mit_pos; + + while (it->mit_prefetched < OTABLE_PREFETCH) { + rc = -dmu_object_next(dev->od_os, &it->mit_prefetched_dnode, + B_FALSE, 0); + if (unlikely(rc != 0)) + break; + + osd_dmu_prefetch(dev->od_os, it->mit_prefetched_dnode, + 0, 0, 0, ZIO_PRIORITY_ASYNC_READ); + + it->mit_prefetched++; + } +} + +static int osd_zfs_otable_it_next(const struct lu_env *env, struct dt_it *di) +{ + struct osd_metadnode_it *it = (struct osd_metadnode_it *)di; + struct lustre_mdt_attrs *lma; + struct osd_device *dev = it->mit_dev; + nvlist_t *nvbuf = NULL; + uchar_t *v; + __u64 dnode; + int rc, s; + + memset(&it->mit_fid, 0, sizeof(it->mit_fid)); + + dnode = it->mit_pos; + do { + rc = -dmu_object_next(dev->od_os, &it->mit_pos, B_FALSE, 0); + if (unlikely(rc != 0)) + GOTO(out, rc = 1); + it->mit_prefetched--; + + /* LMA is required for this to be a Lustre object. + * If there is no xattr skip it. */ + rc = __osd_xattr_load(dev, it->mit_pos, &nvbuf); + if (unlikely(rc != 0)) + continue; + + LASSERT(nvbuf != NULL); + rc = -nvlist_lookup_byte_array(nvbuf, XATTR_NAME_LMA, &v, &s); + if (likely(rc == 0)) { + /* Lustre object */ + lma = (struct lustre_mdt_attrs *)v; + lustre_lma_swab(lma); + it->mit_fid = lma->lma_self_fid; + nvlist_free(nvbuf); + break; + } else { + /* not a Lustre object, try next one */ + nvlist_free(nvbuf); + } + + } while (1); + + + /* we aren't prefetching in the above loop because the number of + * non-Lustre objects is very small and we will be repeating very + * rare. in case we want to use this to iterate over non-Lustre + * objects (i.e. when we convert regular ZFS in Lustre) it makes + * sense to initiate prefetching in the loop */ + + /* 0 - there are more items, +1 - the end */ + if (likely(rc == 0)) + osd_zfs_otable_prefetch(env, it); + + CDEBUG(D_OTHER, "advance: %llu -> %llu "DFID": %d\n", dnode, + it->mit_pos, PFID(&it->mit_fid), rc); + +out: + return rc; +} + +static struct dt_key *osd_zfs_otable_it_key(const struct lu_env *env, + const struct dt_it *di) +{ + return NULL; +} + +static int osd_zfs_otable_it_key_size(const struct lu_env *env, + const struct dt_it *di) +{ + return sizeof(__u64); +} + +static int osd_zfs_otable_it_rec(const struct lu_env *env, + const struct dt_it *di, + struct dt_rec *rec, __u32 attr) +{ + struct osd_metadnode_it *it = (struct osd_metadnode_it *)di; + struct lu_fid *fid = (struct lu_fid *)rec; + ENTRY; + + *fid = it->mit_fid; + + RETURN(0); +} + + +static __u64 osd_zfs_otable_it_store(const struct lu_env *env, + const struct dt_it *di) +{ + struct osd_metadnode_it *it = (struct osd_metadnode_it *)di; + + return it->mit_pos; +} + +static int osd_zfs_otable_it_load(const struct lu_env *env, + const struct dt_it *di, __u64 hash) +{ + struct osd_metadnode_it *it = (struct osd_metadnode_it *)di; + + it->mit_pos = hash; + it->mit_prefetched = 0; + it->mit_prefetched_dnode = 0; + + return osd_zfs_otable_it_next(env, (struct dt_it *)di); +} + +static int osd_zfs_otable_it_key_rec(const struct lu_env *env, + const struct dt_it *di, void *key_rec) +{ + return 0; +} + +const struct dt_index_operations osd_zfs_otable_ops = { + .dio_it = { + .init = osd_zfs_otable_it_init, + .fini = osd_zfs_otable_it_fini, + .get = osd_zfs_otable_it_get, + .put = osd_zfs_otable_it_put, + .next = osd_zfs_otable_it_next, + .key = osd_zfs_otable_it_key, + .key_size = osd_zfs_otable_it_key_size, + .rec = osd_zfs_otable_it_rec, + .store = osd_zfs_otable_it_store, + .load = osd_zfs_otable_it_load, + .key_rec = osd_zfs_otable_it_key_rec, + } +}; + int osd_index_try(const struct lu_env *env, struct dt_object *dt, const struct dt_index_features *feat) { struct osd_object *obj = osd_dt_obj(dt); + int rc = 0; ENTRY; - LASSERT(dt_object_exists(dt)); + down_read(&obj->oo_guard); /* * XXX: implement support for fixed-size keys sorted with natural * numerical way (not using internal hash value) */ if (feat->dif_flags & DT_IND_RANGE) - RETURN(-ERANGE); + GOTO(out, rc = -ERANGE); - if (unlikely(feat == &dt_otable_features)) - /* do not support oi scrub yet. */ - RETURN(-ENOTSUPP); + if (unlikely(feat == &dt_otable_features)) { + dt->do_index_ops = &osd_zfs_otable_ops; + GOTO(out, rc = 0); + } - LASSERT(obj->oo_db != NULL); + LASSERT(!dt_object_exists(dt) || obj->oo_db != NULL); if (likely(feat == &dt_directory_features)) { - if (udmu_object_is_zap(obj->oo_db)) + if (!dt_object_exists(dt) || osd_object_is_zap(obj->oo_db)) dt->do_index_ops = &osd_dir_ops; else - RETURN(-ENOTDIR); + GOTO(out, rc = -ENOTDIR); } else if (unlikely(feat == &dt_acct_features)) { LASSERT(fid_is_acct(lu_object_fid(&dt->do_lu))); dt->do_index_ops = &osd_acct_index_ops; - } else if (udmu_object_is_zap(obj->oo_db) && - dt->do_index_ops == NULL) { + } else if (dt->do_index_ops == NULL) { /* For index file, we don't support variable key & record sizes * and the key has to be unique */ if ((feat->dif_flags & ~DT_IND_UPDATE) != 0) - RETURN(-EINVAL); + GOTO(out, rc = -EINVAL); - /* Although the zap_*_uint64() primitives support large keys, we - * limit ourselves to 64-bit keys for now */ - if (feat->dif_keysize_max != sizeof(__u64) || - feat->dif_keysize_min != sizeof(__u64)) - RETURN(-EINVAL); + if (feat->dif_keysize_max > ZAP_MAXNAMELEN) + GOTO(out, rc = -E2BIG); + if (feat->dif_keysize_max != feat->dif_keysize_min) + GOTO(out, rc = -EINVAL); /* As for the record size, it should be a multiple of 8 bytes * and smaller than the maximum value length supported by ZAP. */ if (feat->dif_recsize_max > ZAP_MAXVALUELEN) - RETURN(-E2BIG); - if (feat->dif_recsize_max != feat->dif_recsize_min || - (feat->dif_recsize_max & (sizeof(__u64) - 1))) - RETURN(-EINVAL); - - obj->oo_recsize = feat->dif_recsize_max / sizeof(__u64); + GOTO(out, rc = -E2BIG); + if (feat->dif_recsize_max != feat->dif_recsize_min) + GOTO(out, rc = -EINVAL); + + obj->oo_keysize = feat->dif_keysize_max; + obj->oo_recsize = feat->dif_recsize_max; + obj->oo_recusize = 1; + + /* ZFS prefers to work with array of 64bits */ + if ((obj->oo_recsize & 7) == 0) { + obj->oo_recsize >>= 3; + obj->oo_recusize = 8; + } dt->do_index_ops = &osd_index_ops; } - RETURN(0); -} +out: + up_read(&obj->oo_guard); + RETURN(rc); +}