X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fosd-zfs%2Fosd_index.c;h=43779a9b8b7c1a2f0e35bed371c92a3b03ba389f;hp=cb865b5bcdadd8232722b6b67a9b8714ac01d3a1;hb=b69b7de30c3977cb69a741099218bc4a81752717;hpb=52953ae668710484a8396cdb2fe94016051496a6 diff --git a/lustre/osd-zfs/osd_index.c b/lustre/osd-zfs/osd_index.c index cb865b5..43779a9 100644 --- a/lustre/osd-zfs/osd_index.c +++ b/lustre/osd-zfs/osd_index.c @@ -28,7 +28,7 @@ * Use is subject to license terms. */ /* - * Copyright (c) 2011, 2012 Whamcloud, Inc. + * Copyright (c) 2012, 2013, Intel Corporation. * Use is subject to license terms. */ /* @@ -41,14 +41,10 @@ * Author: Mike Pershin */ -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif #define DEBUG_SUBSYSTEM S_OSD #include #include -#include #include #include #include @@ -71,10 +67,10 @@ #include #include -static struct dt_it *osd_zap_it_init(const struct lu_env *env, - struct dt_object *dt, - __u32 unused, - struct lustre_capa *capa) +static struct dt_it *osd_index_it_init(const struct lu_env *env, + struct dt_object *dt, + __u32 unused, + struct lustre_capa *capa) { struct osd_thread_info *info = osd_oti_get(env); struct osd_zap_it *it; @@ -104,7 +100,7 @@ static struct dt_it *osd_zap_it_init(const struct lu_env *env, RETURN((struct dt_it *)it); } -static void osd_zap_it_fini(const struct lu_env *env, struct dt_it *di) +static void osd_index_it_fini(const struct lu_env *env, struct dt_it *di) { struct osd_zap_it *it = (struct osd_zap_it *)di; struct osd_object *obj; @@ -121,42 +117,8 @@ static void osd_zap_it_fini(const struct lu_env *env, struct dt_it *di) EXIT; } -/** - * Move Iterator to record specified by \a key - * - * \param di osd iterator - * \param key key for index - * - * \retval +ve di points to record with least key not larger than key - * \retval 0 di points to exact matched key - * \retval -ve failure - */ - -static int osd_zap_it_get(const struct lu_env *env, - struct dt_it *di, const struct dt_key *key) -{ - struct osd_zap_it *it = (struct osd_zap_it *)di; - struct osd_object *obj = it->ozi_obj; - struct osd_device *osd = osd_obj2dev(obj); - ENTRY; - - LASSERT(it); - LASSERT(it->ozi_zc); - - /* XXX: API is broken at the moment */ - LASSERT(((const char *)key)[0] == '\0'); - - udmu_zap_cursor_fini(it->ozi_zc); - if (udmu_zap_cursor_init(&it->ozi_zc, &osd->od_objset, - obj->oo_db->db_object, 0)) - RETURN(-ENOMEM); - - it->ozi_reset = 1; - - RETURN(+1); -} -static void osd_zap_it_put(const struct lu_env *env, struct dt_it *di) +static void osd_index_it_put(const struct lu_env *env, struct dt_it *di) { /* PBS: do nothing : ref are incremented at retrive and decreamented * next/finish. */ @@ -166,96 +128,28 @@ int udmu_zap_cursor_retrieve_key(const struct lu_env *env, zap_cursor_t *zc, char *key, int max) { zap_attribute_t *za = &osd_oti_get(env)->oti_za; - int err; + int err; if ((err = zap_cursor_retrieve(zc, za))) return err; - if (key) { - if (strlen(za->za_name) > max) - return EOVERFLOW; + if (key) strcpy(key, za->za_name); - } return 0; } -/** - * to load a directory entry at a time and stored it in - * iterator's in-memory data structure. - * - * \param di, struct osd_it_ea, iterator's in memory structure - * - * \retval +ve, iterator reached to end - * \retval 0, iterator not reached to end - * \retval -ve, on error - */ -static int osd_zap_it_next(const struct lu_env *env, struct dt_it *di) -{ - struct osd_zap_it *it = (struct osd_zap_it *)di; - int rc; - ENTRY; - - if (it->ozi_reset == 0) - zap_cursor_advance(it->ozi_zc); - it->ozi_reset = 0; - - /* - * According to current API we need to return error if its last entry. - * zap_cursor_advance() does return any value. So we need to call - * retrieve to check if there is any record. We should make - * changes to Iterator API to not return status for this API - */ - rc = -udmu_zap_cursor_retrieve_key(env, it->ozi_zc, NULL, NAME_MAX); - if (rc == -ENOENT) /* end of dir*/ - RETURN(+1); - - RETURN((rc)); -} - -static struct dt_key *osd_zap_it_key(const struct lu_env *env, - const struct dt_it *di) -{ - struct osd_zap_it *it = (struct osd_zap_it *)di; - int rc = 0; - ENTRY; - - it->ozi_reset = 0; - rc = -udmu_zap_cursor_retrieve_key(env, it->ozi_zc, it->ozi_name, - NAME_MAX + 1); - if (!rc) - RETURN((struct dt_key *)it->ozi_name); - else - RETURN(ERR_PTR(rc)); -} - -static int osd_zap_it_key_size(const struct lu_env *env, const struct dt_it *di) -{ - struct osd_zap_it *it = (struct osd_zap_it *)di; - int rc; - ENTRY; - - it->ozi_reset = 0; - rc = -udmu_zap_cursor_retrieve_key(env, it->ozi_zc, it->ozi_name, - NAME_MAX + 1); - if (!rc) - RETURN(strlen(it->ozi_name)); - else - RETURN(rc); -} - /* * zap_cursor_retrieve read from current record. * to read bytes we need to call zap_lookup explicitly. */ int udmu_zap_cursor_retrieve_value(const struct lu_env *env, - zap_cursor_t *zc, char *buf, - int buf_size, int *bytes_read) + zap_cursor_t *zc, char *buf, + int buf_size, int *bytes_read) { zap_attribute_t *za = &osd_oti_get(env)->oti_za; int err, actual_size; - if ((err = zap_cursor_retrieve(zc, za))) return err; @@ -272,8 +166,8 @@ int udmu_zap_cursor_retrieve_value(const struct lu_env *env, } err = -zap_lookup(zc->zc_objset, zc->zc_zapobj, - za->za_name, za->za_integer_length, - buf_size, buf); + za->za_name, za->za_integer_length, + buf_size, buf); if (!err) *bytes_read = actual_size; @@ -292,98 +186,166 @@ static inline void osd_it_append_attrs(struct lu_dirent *ent, __u32 attr, len = (len + align) & ~align; lt = (void *)ent->lde_name + len; - lt->lt_type = cpu_to_le16(CFS_DTTOIF(type)); + lt->lt_type = cpu_to_le16(DTTOIF(type)); ent->lde_attrs |= LUDA_TYPE; } ent->lde_attrs = cpu_to_le32(ent->lde_attrs); } -static int osd_zap_it_rec(const struct lu_env *env, const struct dt_it *di, - struct dt_rec *dtrec, __u32 attr) +/* + * as we don't know FID, we can't use LU object, so this function + * partially duplicate __osd_xattr_get() which is built around + * LU-object and uses it to cache data like regular EA dnode, etc + */ +static int osd_find_parent_by_dnode(const struct lu_env *env, + struct dt_object *o, + struct lu_fid *fid) { - struct luz_direntry *zde = &osd_oti_get(env)->oti_zde; - zap_attribute_t *za = &osd_oti_get(env)->oti_za; - struct osd_zap_it *it = (struct osd_zap_it *)di; - struct lu_dirent *lde = (struct lu_dirent *)dtrec; - int rc, namelen; + struct lustre_mdt_attrs *lma; + udmu_objset_t *uos = &osd_obj2dev(osd_dt_obj(o))->od_objset; + struct lu_buf buf; + sa_handle_t *sa_hdl; + nvlist_t *nvbuf = NULL; + uchar_t *value; + uint64_t dnode; + int rc, size; ENTRY; - it->ozi_reset = 0; - LASSERT(lde); - - lde->lde_hash = cpu_to_le64(udmu_zap_cursor_serialize(it->ozi_zc)); - - if ((rc = -zap_cursor_retrieve(it->ozi_zc, za))) - GOTO(out, rc); - - namelen = strlen(za->za_name); - if (namelen > NAME_MAX) - GOTO(out, rc = -EOVERFLOW); - strcpy(lde->lde_name, za->za_name); - lde->lde_namelen = cpu_to_le16(namelen); + /* first of all, get parent dnode from own attributes */ + LASSERT(osd_dt_obj(o)->oo_db); + rc = -sa_handle_get(uos->os, osd_dt_obj(o)->oo_db->db_object, + NULL, SA_HDL_PRIVATE, &sa_hdl); + if (rc) + RETURN(rc); - if (za->za_integer_length != 8 || za->za_num_integers < 3) { - CERROR("%s: unsupported direntry format: %d %d\n", - osd_obj2dev(it->ozi_obj)->od_svname, - za->za_integer_length, (int)za->za_num_integers); + dnode = ZFS_NO_OBJECT; + rc = -sa_lookup(sa_hdl, SA_ZPL_PARENT(uos), &dnode, 8); + sa_handle_destroy(sa_hdl); + if (rc) + RETURN(rc); - GOTO(out, rc = -EIO); + /* now get EA buffer */ + rc = __osd_xattr_load(uos, dnode, &nvbuf); + if (rc) + GOTO(regular, rc); + + /* XXX: if we get that far.. should we cache the result? */ + + /* try to find LMA attribute */ + LASSERT(nvbuf != NULL); + rc = -nvlist_lookup_byte_array(nvbuf, XATTR_NAME_LMA, &value, &size); + if (rc == 0 && size >= sizeof(*lma)) { + lma = (struct lustre_mdt_attrs *)value; + lustre_lma_swab(lma); + *fid = lma->lma_self_fid; + GOTO(out, rc = 0); } - rc = -zap_lookup(it->ozi_zc->zc_objset, it->ozi_zc->zc_zapobj, - za->za_name, za->za_integer_length, 3, zde); +regular: + /* no LMA attribute in SA, let's try regular EA */ + + /* first of all, get parent dnode storing regular EA */ + rc = -sa_handle_get(uos->os, dnode, NULL, SA_HDL_PRIVATE, &sa_hdl); if (rc) GOTO(out, rc); - lde->lde_fid = zde->lzd_fid; - lde->lde_attrs = LUDA_FID; - - /* append lustre attributes */ - osd_it_append_attrs(lde, attr, namelen, zde->lzd_reg.zde_type); + dnode = ZFS_NO_OBJECT; + rc = -sa_lookup(sa_hdl, SA_ZPL_XATTR(uos), &dnode, 8); + sa_handle_destroy(sa_hdl); + if (rc) + GOTO(out, rc); - lde->lde_reclen = cpu_to_le16(lu_dirent_calc_size(namelen, attr)); + CLASSERT(sizeof(*lma) <= sizeof(osd_oti_get(env)->oti_buf)); + buf.lb_buf = osd_oti_get(env)->oti_buf; + buf.lb_len = sizeof(osd_oti_get(env)->oti_buf); + + /* now try to find LMA */ + rc = __osd_xattr_get_large(env, uos, dnode, &buf, + XATTR_NAME_LMA, &size); + if (rc == 0 && size >= sizeof(*lma)) { + lma = buf.lb_buf; + lustre_lma_swab(lma); + *fid = lma->lma_self_fid; + GOTO(out, rc = 0); + } else if (rc < 0) { + GOTO(out, rc); + } else { + GOTO(out, rc = -EIO); + } out: + if (nvbuf != NULL) + nvlist_free(nvbuf); RETURN(rc); } -static __u64 osd_zap_it_store(const struct lu_env *env, const struct dt_it *di) +static int osd_find_parent_fid(const struct lu_env *env, struct dt_object *o, + struct lu_fid *fid) { - struct osd_zap_it *it = (struct osd_zap_it *)di; + struct link_ea_header *leh; + struct link_ea_entry *lee; + struct lu_buf buf; + int rc; + ENTRY; - it->ozi_reset = 0; - RETURN(udmu_zap_cursor_serialize(it->ozi_zc)); -} + buf.lb_buf = osd_oti_get(env)->oti_buf; + buf.lb_len = sizeof(osd_oti_get(env)->oti_buf); + + rc = osd_xattr_get(env, o, &buf, XATTR_NAME_LINK, BYPASS_CAPA); + if (rc == -ERANGE) { + rc = osd_xattr_get(env, o, &LU_BUF_NULL, + XATTR_NAME_LINK, BYPASS_CAPA); + if (rc < 0) + RETURN(rc); + LASSERT(rc > 0); + OBD_ALLOC(buf.lb_buf, rc); + if (buf.lb_buf == NULL) + RETURN(-ENOMEM); + buf.lb_len = rc; + rc = osd_xattr_get(env, o, &buf, XATTR_NAME_LINK, BYPASS_CAPA); + } + if (rc < 0) + GOTO(out, rc); + if (rc < sizeof(*leh) + sizeof(*lee)) + GOTO(out, rc = -EINVAL); + + leh = buf.lb_buf; + if (leh->leh_magic == __swab32(LINK_EA_MAGIC)) { + leh->leh_magic = LINK_EA_MAGIC; + leh->leh_reccount = __swab32(leh->leh_reccount); + leh->leh_len = __swab64(leh->leh_len); + } + if (leh->leh_magic != LINK_EA_MAGIC) + GOTO(out, rc = -EINVAL); + if (leh->leh_reccount == 0) + GOTO(out, rc = -ENODATA); -/* - * return status : - * rc == 0 -> ok, proceed. - * rc > 0 -> end of directory. - * rc < 0 -> error. ( EOVERFLOW can be masked.) - */ -static int osd_zap_it_load(const struct lu_env *env, - const struct dt_it *di, __u64 hash) -{ - struct osd_zap_it *it = (struct osd_zap_it *)di; - struct osd_object *obj = it->ozi_obj; - struct osd_device *osd = osd_obj2dev(obj); - int rc; - ENTRY; + lee = (struct link_ea_entry *)(leh + 1); + fid_be_to_cpu(fid, (const struct lu_fid *)&lee->lee_parent_fid); + rc = 0; - udmu_zap_cursor_fini(it->ozi_zc); - if (udmu_zap_cursor_init(&it->ozi_zc, &osd->od_objset, - obj->oo_db->db_object, hash)) - RETURN(-ENOMEM); - it->ozi_reset = 0; +out: + if (buf.lb_buf != osd_oti_get(env)->oti_buf) + OBD_FREE(buf.lb_buf, buf.lb_len); + +#if 0 + /* this block can be enabled for additional verification + * it's trying to match FID from LinkEA vs. FID from LMA */ + if (rc == 0) { + struct lu_fid fid2; + int rc2; + rc2 = osd_find_parent_by_dnode(env, o, &fid2); + if (rc2 == 0) + if (lu_fid_eq(fid, &fid2) == 0) + CERROR("wrong parent: "DFID" != "DFID"\n", + PFID(fid), PFID(&fid2)); + } +#endif - /* same as osd_zap_it_next()*/ - rc = -udmu_zap_cursor_retrieve_key(env, it->ozi_zc, NULL, - NAME_MAX + 1); - if (rc == 0) - RETURN(+1); - else if (rc == -ENOENT) /* end of dir*/ - RETURN(0); + /* no LinkEA is found, let's try to find the fid in parent's LMA */ + if (unlikely(rc != 0)) + rc = osd_find_parent_by_dnode(env, o, fid); RETURN(rc); } @@ -395,11 +357,23 @@ static int osd_dir_lookup(const struct lu_env *env, struct dt_object *dt, struct osd_thread_info *oti = osd_oti_get(env); struct osd_object *obj = osd_dt_obj(dt); struct osd_device *osd = osd_obj2dev(obj); + char *name = (char *)key; int rc; ENTRY; LASSERT(udmu_object_is_zap(obj->oo_db)); + if (name[0] == '.') { + if (name[1] == 0) { + const struct lu_fid *f = lu_object_fid(&dt->do_lu); + memcpy(rec, f, sizeof(*f)); + RETURN(1); + } else if (name[1] == '.' && name[2] == 0) { + rc = osd_find_parent_fid(env, dt, (struct lu_fid *)rec); + RETURN(rc == 0 ? 1 : rc); + } + } + rc = -zap_lookup(osd->od_objset.os, obj->oo_db->db_object, (char *)key, 8, sizeof(oti->oti_zde) / 8, (void *)&oti->oti_zde); @@ -498,6 +472,50 @@ static inline void osd_object_put(const struct lu_env *env, lu_object_put(env, &obj->oo_dt.do_lu); } +static int osd_seq_exists(const struct lu_env *env, struct osd_device *osd, + obd_seq seq) +{ + struct lu_seq_range *range = &osd_oti_get(env)->oti_seq_range; + struct seq_server_site *ss = osd_seq_site(osd); + int rc; + ENTRY; + + LASSERT(ss != NULL); + LASSERT(ss->ss_server_fld != NULL); + + rc = osd_fld_lookup(env, osd, seq, range); + if (rc != 0) { + CERROR("%s: Can not lookup fld for "LPX64"\n", + osd_name(osd), seq); + RETURN(0); + } + + RETURN(ss->ss_node_id == range->lsr_index); +} + +static int osd_remote_fid(const struct lu_env *env, struct osd_device *osd, + const struct lu_fid *fid) +{ + struct seq_server_site *ss = osd_seq_site(osd); + ENTRY; + + /* FID seqs not in FLDB, must be local seq */ + if (unlikely(!fid_seq_in_fldb(fid_seq(fid)))) + RETURN(0); + + /* If FLD is not being initialized yet, it only happens during the + * initialization, likely during mgs initialization, and we assume + * this is local FID. */ + if (ss == NULL || ss->ss_server_fld == NULL) + RETURN(0); + + /* Only check the local FLDB here */ + if (osd_seq_exists(env, osd, fid_seq(fid))) + RETURN(0); + + RETURN(1); +} + /** * Inserts (key, value) pair in \a directory object. * @@ -519,10 +537,12 @@ static int osd_dir_insert(const struct lu_env *env, struct dt_object *dt, struct osd_thread_info *oti = osd_oti_get(env); struct osd_object *parent = osd_dt_obj(dt); struct osd_device *osd = osd_obj2dev(parent); - struct lu_fid *fid = (struct lu_fid *)rec; + struct dt_insert_rec *rec1 = (struct dt_insert_rec *)rec; + const struct lu_fid *fid = rec1->rec_fid; struct osd_thandle *oh; - struct osd_object *child; + struct osd_object *child = NULL; __u32 attr; + char *name = (char *)key; int rc; ENTRY; @@ -532,36 +552,63 @@ static int osd_dir_insert(const struct lu_env *env, struct dt_object *dt, LASSERT(dt_object_exists(dt)); LASSERT(osd_invariant(parent)); - /* - * zfs_readdir() generates ./.. on fly, but - * we want own entries (.. at least) with a fid - */ -#if LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(2, 3, 58, 0) -#warning "fix '.' and '..' handling" -#endif - LASSERT(th != NULL); oh = container_of0(th, struct osd_thandle, ot_super); - child = osd_object_find(env, dt, fid); - if (IS_ERR(child)) - RETURN(PTR_ERR(child)); + rc = osd_remote_fid(env, osd, fid); + if (rc < 0) { + CERROR("%s: Can not find object "DFID": rc = %d\n", + osd->od_svname, PFID(fid), rc); + RETURN(rc); + } - LASSERT(child->oo_db); + if (unlikely(rc == 1)) { + /* Insert remote entry */ + memset(&oti->oti_zde.lzd_reg, 0, sizeof(oti->oti_zde.lzd_reg)); + oti->oti_zde.lzd_reg.zde_type = IFTODT(rec1->rec_type & S_IFMT); + } else { + /* + * To simulate old Orion setups with ./.. stored in the + * directories + */ + /* Insert local entry */ + child = osd_object_find(env, dt, fid); + if (IS_ERR(child)) + RETURN(PTR_ERR(child)); + + LASSERT(child->oo_db); + if (name[0] == '.') { + if (name[1] == 0) { + /* do not store ".", instead generate it + * during iteration */ + GOTO(out, rc = 0); + } else if (name[1] == '.' && name[2] == 0) { + /* update parent dnode in the child. + * later it will be used to generate ".." */ + udmu_objset_t *uos = &osd->od_objset; + rc = osd_object_sa_update(parent, + SA_ZPL_PARENT(uos), + &child->oo_db->db_object, + 8, oh); + GOTO(out, rc); + } + } + CLASSERT(sizeof(oti->oti_zde.lzd_reg) == 8); + CLASSERT(sizeof(oti->oti_zde) % 8 == 0); + attr = child->oo_dt.do_lu.lo_header ->loh_attr; + oti->oti_zde.lzd_reg.zde_type = IFTODT(attr & S_IFMT); + oti->oti_zde.lzd_reg.zde_dnode = child->oo_db->db_object; + } - CLASSERT(sizeof(oti->oti_zde.lzd_reg) == 8); - CLASSERT(sizeof(oti->oti_zde) % 8 == 0); - attr = child->oo_dt.do_lu.lo_header ->loh_attr; - oti->oti_zde.lzd_reg.zde_type = IFTODT(attr & S_IFMT); - oti->oti_zde.lzd_reg.zde_dnode = child->oo_db->db_object; oti->oti_zde.lzd_fid = *fid; - /* Insert (key,oid) into ZAP */ rc = -zap_add(osd->od_objset.os, parent->oo_db->db_object, (char *)key, 8, sizeof(oti->oti_zde) / 8, (void *)&oti->oti_zde, oh->ot_tx); - osd_object_put(env, child); +out: + if (child != NULL) + osd_object_put(env, child); RETURN(rc); } @@ -597,6 +644,7 @@ static int osd_dir_delete(const struct lu_env *env, struct dt_object *dt, struct osd_device *osd = osd_obj2dev(obj); struct osd_thandle *oh; dmu_buf_t *zap_db = obj->oo_db; + char *name = (char *)key; int rc; ENTRY; @@ -606,70 +654,498 @@ static int osd_dir_delete(const struct lu_env *env, struct dt_object *dt, LASSERT(th != NULL); oh = container_of0(th, struct osd_thandle, ot_super); + /* + * In Orion . and .. were stored in the directory (not generated upon + * request as now). we preserve them for backward compatibility + */ + if (name[0] == '.') { + if (name[1] == 0) { + RETURN(0); + } else if (name[1] == '.' && name[2] == 0) { + RETURN(0); + } + } + /* Remove key from the ZAP */ rc = -zap_remove(osd->od_objset.os, zap_db->db_object, (char *) key, oh->ot_tx); - if (rc && rc != -ENOENT) +#if LUSTRE_VERSION_CODE <= OBD_OCD_VERSION(2, 4, 53, 0) + if (unlikely(rc == -ENOENT && name[0] == '.' && + (name[1] == 0 || (name[1] == '.' && name[2] == 0)))) + rc = 0; +#endif + if (unlikely(rc && rc != -ENOENT)) CERROR("%s: zap_remove failed: rc = %d\n", osd->od_svname, rc); RETURN(rc); } -static struct dt_index_operations osd_dir_ops = { - .dio_lookup = osd_dir_lookup, - .dio_declare_insert = osd_declare_dir_insert, - .dio_insert = osd_dir_insert, - .dio_declare_delete = osd_declare_dir_delete, - .dio_delete = osd_dir_delete, - .dio_it = { - .init = osd_zap_it_init, - .fini = osd_zap_it_fini, - .get = osd_zap_it_get, - .put = osd_zap_it_put, - .next = osd_zap_it_next, - .key = osd_zap_it_key, - .key_size = osd_zap_it_key_size, - .rec = osd_zap_it_rec, - .store = osd_zap_it_store, - .load = osd_zap_it_load - } -}; - -/* - * Primitives for index files using binary keys. - * XXX: only 64-bit keys are supported for now. - */ - -static int osd_index_lookup(const struct lu_env *env, struct dt_object *dt, - struct dt_rec *rec, const struct dt_key *key, - struct lustre_capa *capa) +static struct dt_it *osd_dir_it_init(const struct lu_env *env, + struct dt_object *dt, + __u32 unused, + struct lustre_capa *capa) { - struct osd_object *obj = osd_dt_obj(dt); - struct osd_device *osd = osd_obj2dev(obj); - int rc; - ENTRY; + struct osd_zap_it *it; - rc = -zap_lookup_uint64(osd->od_objset.os, obj->oo_db->db_object, - (const __u64 *)key, 1, 8, obj->oo_recsize, - (void *)rec); - RETURN(rc == 0 ? 1 : rc); + it = (struct osd_zap_it *)osd_index_it_init(env, dt, unused, capa); + if (!IS_ERR(it)) + it->ozi_pos = 0; + + RETURN((struct dt_it *)it); } -static int osd_declare_index_insert(const struct lu_env *env, - struct dt_object *dt, - const struct dt_rec *rec, - const struct dt_key *key, - struct thandle *th) +/** + * Move Iterator to record specified by \a key + * + * \param di osd iterator + * \param key key for index + * + * \retval +ve di points to record with least key not larger than key + * \retval 0 di points to exact matched key + * \retval -ve failure + */ +static int osd_dir_it_get(const struct lu_env *env, + struct dt_it *di, const struct dt_key *key) { - struct osd_object *obj = osd_dt_obj(dt); - struct osd_thandle *oh; + struct osd_zap_it *it = (struct osd_zap_it *)di; + struct osd_object *obj = it->ozi_obj; + struct osd_device *osd = osd_obj2dev(obj); + char *name = (char *)key; + int rc; ENTRY; - LASSERT(th != NULL); - oh = container_of0(th, struct osd_thandle, ot_super); + LASSERT(it); + LASSERT(it->ozi_zc); - LASSERT(obj->oo_db); + udmu_zap_cursor_fini(it->ozi_zc); + + if (udmu_zap_cursor_init(&it->ozi_zc, &osd->od_objset, + obj->oo_db->db_object, 0)) + RETURN(-ENOMEM); + + /* XXX: implementation of the API is broken at the moment */ + LASSERT(((const char *)key)[0] == 0); + + if (name[0] == 0) { + it->ozi_pos = 0; + RETURN(1); + } + + if (name[0] == '.') { + if (name[1] == 0) { + it->ozi_pos = 1; + GOTO(out, rc = 1); + } else if (name[1] == '.' && name[2] == 0) { + it->ozi_pos = 2; + GOTO(out, rc = 1); + } + } + + /* neither . nor .. - some real record */ + it->ozi_pos = 3; + rc = +1; + +out: + RETURN(rc); +} + +static void osd_dir_it_put(const struct lu_env *env, struct dt_it *di) +{ + /* PBS: do nothing : ref are incremented at retrive and decreamented + * next/finish. */ +} + +/* + * in Orion . and .. were stored in the directory, while ZPL + * and current osd-zfs generate them up on request. so, we + * need to ignore previously stored . and .. + */ +static int osd_index_retrieve_skip_dots(struct osd_zap_it *it, + zap_attribute_t *za) +{ + int rc, isdot; + + do { + rc = -zap_cursor_retrieve(it->ozi_zc, za); + + isdot = 0; + if (unlikely(rc == 0 && za->za_name[0] == '.')) { + if (za->za_name[1] == 0) { + isdot = 1; + } else if (za->za_name[1] == '.' && + za->za_name[2] == 0) { + isdot = 1; + } + if (unlikely(isdot)) + zap_cursor_advance(it->ozi_zc); + } + } while (unlikely(rc == 0 && isdot)); + + return rc; +} + +/** + * to load a directory entry at a time and stored it in + * iterator's in-memory data structure. + * + * \param di, struct osd_it_ea, iterator's in memory structure + * + * \retval +ve, iterator reached to end + * \retval 0, iterator not reached to end + * \retval -ve, on error + */ +static int osd_dir_it_next(const struct lu_env *env, struct dt_it *di) +{ + struct osd_zap_it *it = (struct osd_zap_it *)di; + zap_attribute_t *za = &osd_oti_get(env)->oti_za; + int rc; + + /* temp. storage should be enough for any key supported by ZFS */ + CLASSERT(sizeof(za->za_name) <= sizeof(it->ozi_name)); + + /* + * the first ->next() moves the cursor to . + * the second ->next() moves the cursor to .. + * then we get to the real records and have to verify any exist + */ + if (it->ozi_pos <= 2) { + it->ozi_pos++; + if (it->ozi_pos <=2) + RETURN(0); + } + + zap_cursor_advance(it->ozi_zc); + + /* + * According to current API we need to return error if its last entry. + * zap_cursor_advance() does not return any value. So we need to call + * retrieve to check if there is any record. We should make + * changes to Iterator API to not return status for this API + */ + rc = osd_index_retrieve_skip_dots(it, za); + + if (rc == -ENOENT) /* end of dir */ + RETURN(+1); + + RETURN(rc); +} + +static struct dt_key *osd_dir_it_key(const struct lu_env *env, + const struct dt_it *di) +{ + struct osd_zap_it *it = (struct osd_zap_it *)di; + zap_attribute_t *za = &osd_oti_get(env)->oti_za; + int rc = 0; + ENTRY; + + if (it->ozi_pos <= 1) { + it->ozi_pos = 1; + RETURN((struct dt_key *)"."); + } else if (it->ozi_pos == 2) { + RETURN((struct dt_key *)".."); + } + + if ((rc = -zap_cursor_retrieve(it->ozi_zc, za))) + RETURN(ERR_PTR(rc)); + + strcpy(it->ozi_name, za->za_name); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 3, 90, 0) + if (za->za_name[0] == '.') { + if (za->za_name[1] == 0 || (za->za_name[1] == '.' && + za->za_name[2] == 0)) { + /* we should not get onto . and .. + * stored in the directory. ->next() and + * other methods should prevent this + */ + LBUG(); + } + } +#endif + + RETURN((struct dt_key *)it->ozi_name); +} + +static int osd_dir_it_key_size(const struct lu_env *env, const struct dt_it *di) +{ + struct osd_zap_it *it = (struct osd_zap_it *)di; + zap_attribute_t *za = &osd_oti_get(env)->oti_za; + int rc; + ENTRY; + + if (it->ozi_pos <= 1) { + it->ozi_pos = 1; + RETURN(2); + } else if (it->ozi_pos == 2) { + RETURN(3); + } + + if ((rc = -zap_cursor_retrieve(it->ozi_zc, za)) == 0) + rc = strlen(za->za_name); + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 3, 90, 0) + if (rc == 0 && za->za_name[0] == '.') { + if (za->za_name[1] == 0 || (za->za_name[1] == '.' && + za->za_name[2] == 0)) { + /* we should not get onto . and .. + * stored in the directory. ->next() and + * other methods should prevent this + */ + LBUG(); + } + } +#endif + RETURN(rc); +} + +static int osd_dir_it_rec(const struct lu_env *env, const struct dt_it *di, + struct dt_rec *dtrec, __u32 attr) +{ + struct osd_zap_it *it = (struct osd_zap_it *)di; + struct lu_dirent *lde = (struct lu_dirent *)dtrec; + struct luz_direntry *zde = &osd_oti_get(env)->oti_zde; + zap_attribute_t *za = &osd_oti_get(env)->oti_za; + int rc, namelen; + ENTRY; + + if (it->ozi_pos <= 1) { + lde->lde_hash = cpu_to_le64(1); + strcpy(lde->lde_name, "."); + lde->lde_namelen = cpu_to_le16(1); + lde->lde_fid = *lu_object_fid(&it->ozi_obj->oo_dt.do_lu); + lde->lde_attrs = LUDA_FID; + /* append lustre attributes */ + osd_it_append_attrs(lde, attr, 1, IFTODT(S_IFDIR)); + lde->lde_reclen = cpu_to_le16(lu_dirent_calc_size(1, attr)); + it->ozi_pos = 1; + GOTO(out, rc = 0); + + } else if (it->ozi_pos == 2) { + lde->lde_hash = cpu_to_le64(2); + strcpy(lde->lde_name, ".."); + lde->lde_namelen = cpu_to_le16(2); + lde->lde_attrs = LUDA_FID; + /* append lustre attributes */ + osd_it_append_attrs(lde, attr, 2, IFTODT(S_IFDIR)); + lde->lde_reclen = cpu_to_le16(lu_dirent_calc_size(2, attr)); + rc = osd_find_parent_fid(env, &it->ozi_obj->oo_dt, &lde->lde_fid); + /* + * early Orion code was not setting LinkEA, so it's possible + * some setups still have objects with no LinkEA set. + * but at that time .. was a real record in the directory + * so we should try to lookup .. in ZAP + */ + if (rc != -ENOENT) + GOTO(out, rc); + } + + LASSERT(lde); + + rc = -zap_cursor_retrieve(it->ozi_zc, za); + if (unlikely(rc != 0)) + GOTO(out, rc); + + lde->lde_hash = cpu_to_le64(udmu_zap_cursor_serialize(it->ozi_zc)); + namelen = strlen(za->za_name); + if (namelen > NAME_MAX) + GOTO(out, rc = -EOVERFLOW); + strcpy(lde->lde_name, za->za_name); + lde->lde_namelen = cpu_to_le16(namelen); + + if (za->za_integer_length != 8 || za->za_num_integers < 3) { + CERROR("%s: unsupported direntry format: %d %d\n", + osd_obj2dev(it->ozi_obj)->od_svname, + za->za_integer_length, (int)za->za_num_integers); + + GOTO(out, rc = -EIO); + } + + rc = -zap_lookup(it->ozi_zc->zc_objset, it->ozi_zc->zc_zapobj, + za->za_name, za->za_integer_length, 3, zde); + if (rc) + GOTO(out, rc); + + lde->lde_fid = zde->lzd_fid; + lde->lde_attrs = LUDA_FID; + + /* append lustre attributes */ + osd_it_append_attrs(lde, attr, namelen, zde->lzd_reg.zde_type); + + lde->lde_reclen = cpu_to_le16(lu_dirent_calc_size(namelen, attr)); + +out: + RETURN(rc); +} + +static int osd_dir_it_rec_size(const struct lu_env *env, const struct dt_it *di, + __u32 attr) +{ + struct osd_zap_it *it = (struct osd_zap_it *)di; + zap_attribute_t *za = &osd_oti_get(env)->oti_za; + int rc, namelen = 0; + ENTRY; + + if (it->ozi_pos <= 1) + namelen = 1; + else if (it->ozi_pos == 2) + namelen = 2; + + if (namelen > 0) { + rc = lu_dirent_calc_size(namelen, attr); + RETURN(rc); + } + + rc = -zap_cursor_retrieve(it->ozi_zc, za); + if (unlikely(rc != 0)) + RETURN(rc); + + if (za->za_integer_length != 8 || za->za_num_integers < 3) { + CERROR("%s: unsupported direntry format: %d %d\n", + osd_obj2dev(it->ozi_obj)->od_svname, + za->za_integer_length, (int)za->za_num_integers); + RETURN(-EIO); + } + + namelen = strlen(za->za_name); + if (namelen > NAME_MAX) + RETURN(-EOVERFLOW); + + rc = lu_dirent_calc_size(namelen, attr); + + RETURN(rc); +} + +static __u64 osd_dir_it_store(const struct lu_env *env, const struct dt_it *di) +{ + struct osd_zap_it *it = (struct osd_zap_it *)di; + __u64 pos; + ENTRY; + + if (it->ozi_pos <= 2) + pos = it->ozi_pos; + else + pos = udmu_zap_cursor_serialize(it->ozi_zc); + + RETURN(pos); +} + +/* + * return status : + * rc == 0 -> end of directory. + * rc > 0 -> ok, proceed. + * rc < 0 -> error. ( EOVERFLOW can be masked.) + */ +static int osd_dir_it_load(const struct lu_env *env, + const struct dt_it *di, __u64 hash) +{ + struct osd_zap_it *it = (struct osd_zap_it *)di; + struct osd_object *obj = it->ozi_obj; + struct osd_device *osd = osd_obj2dev(obj); + zap_attribute_t *za = &osd_oti_get(env)->oti_za; + int rc; + ENTRY; + + udmu_zap_cursor_fini(it->ozi_zc); + if (udmu_zap_cursor_init(&it->ozi_zc, &osd->od_objset, + obj->oo_db->db_object, hash)) + RETURN(-ENOMEM); + + if (hash <= 2) { + it->ozi_pos = hash; + rc = +1; + } else { + it->ozi_pos = 3; + /* to return whether the end has been reached */ + rc = osd_index_retrieve_skip_dots(it, za); + if (rc == 0) + rc = +1; + else if (rc == -ENOENT) + rc = 0; + } + + RETURN(rc); +} + +static struct dt_index_operations osd_dir_ops = { + .dio_lookup = osd_dir_lookup, + .dio_declare_insert = osd_declare_dir_insert, + .dio_insert = osd_dir_insert, + .dio_declare_delete = osd_declare_dir_delete, + .dio_delete = osd_dir_delete, + .dio_it = { + .init = osd_dir_it_init, + .fini = osd_index_it_fini, + .get = osd_dir_it_get, + .put = osd_dir_it_put, + .next = osd_dir_it_next, + .key = osd_dir_it_key, + .key_size = osd_dir_it_key_size, + .rec = osd_dir_it_rec, + .rec_size = osd_dir_it_rec_size, + .store = osd_dir_it_store, + .load = osd_dir_it_load + } +}; + +/* + * Primitives for index files using binary keys. + */ + +/* key integer_size is 8 */ +static int osd_prepare_key_uint64(struct osd_object *o, __u64 *dst, + const struct dt_key *src) +{ + int size; + + LASSERT(dst); + LASSERT(src); + + /* align keysize to 64bit */ + size = (o->oo_keysize + sizeof(__u64) - 1) / sizeof(__u64); + size *= sizeof(__u64); + + LASSERT(size <= MAXNAMELEN); + + if (unlikely(size > o->oo_keysize)) + memset(dst + o->oo_keysize, 0, size - o->oo_keysize); + memcpy(dst, (const char *)src, o->oo_keysize); + + return (size/sizeof(__u64)); +} + +static int osd_index_lookup(const struct lu_env *env, struct dt_object *dt, + struct dt_rec *rec, const struct dt_key *key, + struct lustre_capa *capa) +{ + struct osd_object *obj = osd_dt_obj(dt); + struct osd_device *osd = osd_obj2dev(obj); + __u64 *k = osd_oti_get(env)->oti_key64; + int rc; + ENTRY; + + rc = osd_prepare_key_uint64(obj, k, key); + + rc = -zap_lookup_uint64(osd->od_objset.os, obj->oo_db->db_object, + k, rc, obj->oo_recusize, obj->oo_recsize, + (void *)rec); + RETURN(rc == 0 ? 1 : rc); +} + +static int osd_declare_index_insert(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th) +{ + struct osd_object *obj = osd_dt_obj(dt); + struct osd_thandle *oh; + ENTRY; + + LASSERT(th != NULL); + oh = container_of0(th, struct osd_thandle, ot_super); + + LASSERT(obj->oo_db); dmu_tx_hold_bonus(oh->ot_tx, obj->oo_db->db_object); @@ -689,6 +1165,7 @@ static int osd_index_insert(const struct lu_env *env, struct dt_object *dt, struct osd_object *obj = osd_dt_obj(dt); struct osd_device *osd = osd_obj2dev(obj); struct osd_thandle *oh; + __u64 *k = osd_oti_get(env)->oti_key64; int rc; ENTRY; @@ -699,9 +1176,11 @@ static int osd_index_insert(const struct lu_env *env, struct dt_object *dt, oh = container_of0(th, struct osd_thandle, ot_super); + rc = osd_prepare_key_uint64(obj, k, key); + /* Insert (key,oid) into ZAP */ rc = -zap_add_uint64(osd->od_objset.os, obj->oo_db->db_object, - (const __u64 *)key, 1, 8, obj->oo_recsize, + k, rc, obj->oo_recusize, obj->oo_recsize, (void *)rec, oh->ot_tx); RETURN(rc); } @@ -733,6 +1212,7 @@ static int osd_index_delete(const struct lu_env *env, struct dt_object *dt, struct osd_object *obj = osd_dt_obj(dt); struct osd_device *osd = osd_obj2dev(obj); struct osd_thandle *oh; + __u64 *k = osd_oti_get(env)->oti_key64; int rc; ENTRY; @@ -740,9 +1220,11 @@ static int osd_index_delete(const struct lu_env *env, struct dt_object *dt, LASSERT(th != NULL); oh = container_of0(th, struct osd_thandle, ot_super); + rc = osd_prepare_key_uint64(obj, k, key); + /* Remove binary key from the ZAP */ rc = -zap_remove_uint64(osd->od_objset.os, obj->oo_db->db_object, - (const __u64 *)key, 1, oh->ot_tx); + k, rc, oh->ot_tx); RETURN(rc); } @@ -757,8 +1239,12 @@ static int osd_index_it_get(const struct lu_env *env, struct dt_it *di, LASSERT(it); LASSERT(it->ozi_zc); - /* XXX: API is broken at the moment */ - LASSERT(*((const __u64 *)key) == 0); + /* + * XXX: we need a binary version of zap_cursor_move_to_key() + * to implement this API */ + if (*((const __u64 *)key) != 0) + CERROR("NOT IMPLEMETED YET (move to "LPX64")\n", + *((__u64 *)key)); zap_cursor_fini(it->ozi_zc); memset(it->ozi_zc, 0, sizeof(*it->ozi_zc)); @@ -796,6 +1282,7 @@ static struct dt_key *osd_index_it_key(const struct lu_env *env, const struct dt_it *di) { struct osd_zap_it *it = (struct osd_zap_it *)di; + struct osd_object *obj = it->ozi_obj; zap_attribute_t *za = &osd_oti_get(env)->oti_za; int rc = 0; ENTRY; @@ -806,7 +1293,7 @@ static struct dt_key *osd_index_it_key(const struct lu_env *env, RETURN(ERR_PTR(rc)); /* the binary key is stored in the name */ - it->ozi_key = *((__u64 *)za->za_name); + memcpy(&it->ozi_key, za->za_name, obj->oo_keysize); RETURN((struct dt_key *)&it->ozi_key); } @@ -814,8 +1301,9 @@ static struct dt_key *osd_index_it_key(const struct lu_env *env, static int osd_index_it_key_size(const struct lu_env *env, const struct dt_it *di) { - /* we only support 64-bit binary keys for the time being */ - RETURN(sizeof(__u64)); + struct osd_zap_it *it = (struct osd_zap_it *)di; + struct osd_object *obj = it->ozi_obj; + RETURN(obj->oo_keysize); } static int osd_index_it_rec(const struct lu_env *env, const struct dt_it *di, @@ -825,6 +1313,7 @@ static int osd_index_it_rec(const struct lu_env *env, const struct dt_it *di, struct osd_zap_it *it = (struct osd_zap_it *)di; struct osd_object *obj = it->ozi_obj; struct osd_device *osd = osd_obj2dev(obj); + __u64 *k = osd_oti_get(env)->oti_key64; int rc; ENTRY; @@ -833,9 +1322,11 @@ static int osd_index_it_rec(const struct lu_env *env, const struct dt_it *di, if (rc) RETURN(rc); + rc = osd_prepare_key_uint64(obj, k, (const struct dt_key *)za->za_name); + rc = -zap_lookup_uint64(osd->od_objset.os, obj->oo_db->db_object, - (const __u64 *)za->za_name, 1, 8, - obj->oo_recsize, (void *)rec); + k, rc, obj->oo_recusize, obj->oo_recsize, + (void *)rec); RETURN(rc); } @@ -883,10 +1374,10 @@ static struct dt_index_operations osd_index_ops = { .dio_declare_delete = osd_declare_index_delete, .dio_delete = osd_index_delete, .dio_it = { - .init = osd_zap_it_init, - .fini = osd_zap_it_fini, + .init = osd_index_it_init, + .fini = osd_index_it_fini, .get = osd_index_it_get, - .put = osd_zap_it_put, + .put = osd_index_it_put, .next = osd_index_it_next, .key = osd_index_it_key, .key_size = osd_index_it_key_size, @@ -896,6 +1387,216 @@ static struct dt_index_operations osd_index_ops = { } }; +struct osd_metadnode_it { + struct osd_device *mit_dev; + __u64 mit_pos; + struct lu_fid mit_fid; + int mit_prefetched; + __u64 mit_prefetched_dnode; +}; + +static struct dt_it *osd_zfs_otable_it_init(const struct lu_env *env, + struct dt_object *dt, __u32 attr, + struct lustre_capa *capa) +{ + struct osd_device *dev = osd_dev(dt->do_lu.lo_dev); + struct osd_metadnode_it *it; + ENTRY; + + OBD_ALLOC_PTR(it); + if (unlikely(it == NULL)) + RETURN(ERR_PTR(-ENOMEM)); + + it->mit_dev = dev; + + /* XXX: dmu_object_next() does NOT find dnodes allocated + * in the current non-committed txg, so we force txg + * commit to find all existing dnodes ... */ + txg_wait_synced(dmu_objset_pool(dev->od_objset.os), 0ULL); + + RETURN((struct dt_it *)it); +} + +static void osd_zfs_otable_it_fini(const struct lu_env *env, struct dt_it *di) +{ + struct osd_metadnode_it *it = (struct osd_metadnode_it *)di; + + OBD_FREE_PTR(it); +} + +static int osd_zfs_otable_it_get(const struct lu_env *env, + struct dt_it *di, const struct dt_key *key) +{ + return 0; +} + +static void osd_zfs_otable_it_put(const struct lu_env *env, struct dt_it *di) +{ +} + +#define OTABLE_PREFETCH 256 + +static void osd_zfs_otable_prefetch(const struct lu_env *env, + struct osd_metadnode_it *it) +{ + struct osd_device *dev = it->mit_dev; + udmu_objset_t *uos = &dev->od_objset; + int rc; + + /* can go negative on the very first access to the iterator + * or if some non-Lustre objects were found */ + if (unlikely(it->mit_prefetched < 0)) + it->mit_prefetched = 0; + + if (it->mit_prefetched >= (OTABLE_PREFETCH >> 1)) + return; + + if (it->mit_prefetched_dnode == 0) + it->mit_prefetched_dnode = it->mit_pos; + + while (it->mit_prefetched < OTABLE_PREFETCH) { + rc = -dmu_object_next(uos->os, &it->mit_prefetched_dnode, + B_FALSE, 0); + if (unlikely(rc != 0)) + break; + + /* dmu_prefetch() was exported in 0.6.2, if you use with + * an older release, just comment it out - this is an + * optimization */ + dmu_prefetch(uos->os, it->mit_prefetched_dnode, 0, 0); + + it->mit_prefetched++; + } +} + +static int osd_zfs_otable_it_next(const struct lu_env *env, struct dt_it *di) +{ + struct osd_metadnode_it *it = (struct osd_metadnode_it *)di; + struct lustre_mdt_attrs *lma; + struct osd_device *dev = it->mit_dev; + udmu_objset_t *uos = &dev->od_objset; + nvlist_t *nvbuf = NULL; + uchar_t *v; + __u64 dnode; + int rc, s; + + memset(&it->mit_fid, 0, sizeof(it->mit_fid)); + + dnode = it->mit_pos; + do { + rc = -dmu_object_next(uos->os, &it->mit_pos, B_FALSE, 0); + if (unlikely(rc != 0)) + GOTO(out, rc = 1); + it->mit_prefetched--; + + /* LMA is required for this to be a Lustre object. + * If there is no xattr skip it. */ + rc = __osd_xattr_load(uos, it->mit_pos, &nvbuf); + if (unlikely(rc != 0)) + continue; + + LASSERT(nvbuf != NULL); + rc = -nvlist_lookup_byte_array(nvbuf, XATTR_NAME_LMA, &v, &s); + if (likely(rc == 0)) { + /* Lustre object */ + lma = (struct lustre_mdt_attrs *)v; + lustre_lma_swab(lma); + it->mit_fid = lma->lma_self_fid; + nvlist_free(nvbuf); + break; + } else { + /* not a Lustre object, try next one */ + nvlist_free(nvbuf); + } + + } while (1); + + + /* we aren't prefetching in the above loop because the number of + * non-Lustre objects is very small and we will be repeating very + * rare. in case we want to use this to iterate over non-Lustre + * objects (i.e. when we convert regular ZFS in Lustre) it makes + * sense to initiate prefetching in the loop */ + + /* 0 - there are more items, +1 - the end */ + if (likely(rc == 0)) + osd_zfs_otable_prefetch(env, it); + + CDEBUG(D_OTHER, "advance: %llu -> %llu "DFID": %d\n", dnode, + it->mit_pos, PFID(&it->mit_fid), rc); + +out: + return rc; +} + +static struct dt_key *osd_zfs_otable_it_key(const struct lu_env *env, + const struct dt_it *di) +{ + return NULL; +} + +static int osd_zfs_otable_it_key_size(const struct lu_env *env, + const struct dt_it *di) +{ + return sizeof(__u64); +} + +static int osd_zfs_otable_it_rec(const struct lu_env *env, + const struct dt_it *di, + struct dt_rec *rec, __u32 attr) +{ + struct osd_metadnode_it *it = (struct osd_metadnode_it *)di; + struct lu_fid *fid = (struct lu_fid *)rec; + ENTRY; + + *fid = it->mit_fid; + + RETURN(0); +} + + +static __u64 osd_zfs_otable_it_store(const struct lu_env *env, + const struct dt_it *di) +{ + struct osd_metadnode_it *it = (struct osd_metadnode_it *)di; + + return it->mit_pos; +} + +static int osd_zfs_otable_it_load(const struct lu_env *env, + const struct dt_it *di, __u64 hash) +{ + struct osd_metadnode_it *it = (struct osd_metadnode_it *)di; + + it->mit_pos = hash; + it->mit_prefetched = 0; + it->mit_prefetched_dnode = 0; + + return osd_zfs_otable_it_next(env, (struct dt_it *)di); +} + +static int osd_zfs_otable_it_key_rec(const struct lu_env *env, + const struct dt_it *di, void *key_rec) +{ + return 0; +} + +const struct dt_index_operations osd_zfs_otable_ops = { + .dio_it = { + .init = osd_zfs_otable_it_init, + .fini = osd_zfs_otable_it_fini, + .get = osd_zfs_otable_it_get, + .put = osd_zfs_otable_it_put, + .next = osd_zfs_otable_it_next, + .key = osd_zfs_otable_it_key, + .key_size = osd_zfs_otable_it_key_size, + .rec = osd_zfs_otable_it_rec, + .store = osd_zfs_otable_it_store, + .load = osd_zfs_otable_it_load, + .key_rec = osd_zfs_otable_it_key_rec, + } +}; + int osd_index_try(const struct lu_env *env, struct dt_object *dt, const struct dt_index_features *feat) { @@ -911,9 +1612,10 @@ int osd_index_try(const struct lu_env *env, struct dt_object *dt, if (feat->dif_flags & DT_IND_RANGE) RETURN(-ERANGE); - if (unlikely(feat == &dt_otable_features)) - /* do not support oi scrub yet. */ - RETURN(-ENOTSUPP); + if (unlikely(feat == &dt_otable_features)) { + dt->do_index_ops = &osd_zfs_otable_ops; + RETURN(0); + } LASSERT(obj->oo_db != NULL); if (likely(feat == &dt_directory_features)) { @@ -931,10 +1633,9 @@ int osd_index_try(const struct lu_env *env, struct dt_object *dt, if ((feat->dif_flags & ~DT_IND_UPDATE) != 0) RETURN(-EINVAL); - /* Although the zap_*_uint64() primitives support large keys, we - * limit ourselves to 64-bit keys for now */ - if (feat->dif_keysize_max != sizeof(__u64) || - feat->dif_keysize_min != sizeof(__u64)) + if (feat->dif_keysize_max > ZAP_MAXNAMELEN) + RETURN(-E2BIG); + if (feat->dif_keysize_max != feat->dif_keysize_min) RETURN(-EINVAL); /* As for the record size, it should be a multiple of 8 bytes @@ -942,14 +1643,20 @@ int osd_index_try(const struct lu_env *env, struct dt_object *dt, */ if (feat->dif_recsize_max > ZAP_MAXVALUELEN) RETURN(-E2BIG); - if (feat->dif_recsize_max != feat->dif_recsize_min || - (feat->dif_recsize_max & (sizeof(__u64) - 1))) + if (feat->dif_recsize_max != feat->dif_recsize_min) RETURN(-EINVAL); - obj->oo_recsize = feat->dif_recsize_max / sizeof(__u64); + obj->oo_keysize = feat->dif_keysize_max; + obj->oo_recsize = feat->dif_recsize_max; + obj->oo_recusize = 1; + + /* ZFS prefers to work with array of 64bits */ + if ((obj->oo_recsize & 7) == 0) { + obj->oo_recsize >>= 3; + obj->oo_recusize = 8; + } dt->do_index_ops = &osd_index_ops; } RETURN(0); } -