X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fosd-zfs%2Fosd_index.c;h=7fb832659db77ba90248aaa345b134857cbc3894;hb=17797975b27e1b5222fb75cac49d90066091486d;hp=6ebd0ab080ddc7a8a246106f0088db22907d1bb5;hpb=11db1a551172f596d1d284e8496530f9ce24ac81;p=fs%2Flustre-release.git

Patch summary (lustre/osd-zfs/osd_index.c):

 * FID locality.  The FLDB lookup moves out of osd_remote_fid() into the
   new osd_seq_exists(), which takes a bare sequence number.  It returns 1
   when the device has no seq_server_site, 0 when osd_fld_lookup() fails,
   and otherwise whether the looked-up range index equals ss_node_id.
   osd_remote_fid() returns 0 for sequences that are not kept in the FLDB
   or for which osd_seq_exists() is true, and 1 in every other case.

 * Directory iterator.  osd_dir_it_rec() fills lde_hash only after
   zap_cursor_retrieve() succeeded.  The new osd_dir_it_rec_size() returns
   lu_dirent_calc_size() for the current entry: name length 1 while
   ozi_pos <= 1, 2 when ozi_pos == 2 (presumably "." and ".." - confirm
   against the iterator), otherwise strlen() of the ZAP entry name.  It
   returns -EIO for an unexpected ZAP entry layout and -EOVERFLOW when the
   name is longer than NAME_MAX.

 * Binary-key indices.  Keys are oo_keysize bytes long instead of a fixed
   8 bytes.  osd_prepare_key_uint64() copies the key into the per-thread
   oti_key64 buffer, zero-pads it to a multiple of 8 bytes and returns the
   number of __u64 words, which is handed to the zap_*_uint64() calls.
   The padding is written at BYTE offset oo_keysize; because dst is a
   __u64 pointer it is converted to char * before the offset is added.
   Record sizes are kept as oo_recsize units of oo_recusize bytes each
   (8-byte units when the size is a multiple of 8, single bytes otherwise).

 * osd_index_it_get() logs an error instead of asserting when it is asked
   to position on a non-zero key.

 * Object table iterator.  osd_zfs_otable_ops walks the objset with
   dmu_object_next(), skips objects without an LMA xattr and reports
   lma_self_fid of the others; osd_zfs_otable_prefetch() issues
   dmu_prefetch() for up to OTABLE_PREFETCH dnodes ahead.  osd_index_try()
   installs these operations for dt_otable_features.

diff --git a/lustre/osd-zfs/osd_index.c b/lustre/osd-zfs/osd_index.c
index 6ebd0ab..7fb8326 100644
--- a/lustre/osd-zfs/osd_index.c
+++ b/lustre/osd-zfs/osd_index.c
@@ -472,25 +472,40 @@ static inline void osd_object_put(const struct lu_env *env,
 	lu_object_put(env, &obj->oo_dt.do_lu);
 }
 
-static int osd_remote_fid(const struct lu_env *env, struct osd_device *osd,
-			  struct lu_fid *fid)
+static int osd_seq_exists(const struct lu_env *env, struct osd_device *osd,
+			  obd_seq seq)
 {
 	struct lu_seq_range	*range = &osd_oti_get(env)->oti_seq_range;
 	struct seq_server_site	*ss = osd_seq_site(osd);
 	int			rc;
 	ENTRY;
 
-	if (!fid_is_norm(fid) && !fid_is_root(fid))
-		RETURN(0);
+	if (ss == NULL)
+		RETURN(1);
 
-	rc = osd_fld_lookup(env, osd, fid, range);
+	rc = osd_fld_lookup(env, osd, seq, range);
 	if (rc != 0) {
-		CERROR("%s: Can not lookup fld for "DFID"\n",
-		       osd_name(osd), PFID(fid));
-		RETURN(rc);
+		CERROR("%s: Can not lookup fld for "LPX64"\n",
+		       osd_name(osd), seq);
+		RETURN(0);
 	}
 
-	RETURN(ss->ss_node_id != range->lsr_index);
+	RETURN(ss->ss_node_id == range->lsr_index);
+}
+
+static int osd_remote_fid(const struct lu_env *env, struct osd_device *osd,
+			  struct lu_fid *fid)
+{
+	ENTRY;
+
+	/* FID seqs not in FLDB, must be local seq */
+	if (unlikely(!fid_seq_in_fldb(fid_seq(fid))))
+		RETURN(0);
+
+	if (osd_seq_exists(env, osd, fid_seq(fid)))
+		RETURN(0);
+
+	RETURN(1);
 }
 
 /**
@@ -918,11 +933,11 @@ static int osd_dir_it_rec(const struct lu_env *env, const struct dt_it *di,
 
 	LASSERT(lde);
 
-	lde->lde_hash = cpu_to_le64(udmu_zap_cursor_serialize(it->ozi_zc));
-
-	if ((rc = -zap_cursor_retrieve(it->ozi_zc, za)))
+	rc = -zap_cursor_retrieve(it->ozi_zc, za);
+	if (unlikely(rc != 0))
 		GOTO(out, rc);
 
+	lde->lde_hash = cpu_to_le64(udmu_zap_cursor_serialize(it->ozi_zc));
 	namelen = strlen(za->za_name);
 	if (namelen > NAME_MAX)
 		GOTO(out, rc = -EOVERFLOW);
@@ -954,6 +969,44 @@ out:
 	RETURN(rc);
 }
 
+static int osd_dir_it_rec_size(const struct lu_env *env, const struct dt_it *di,
+			       __u32 attr)
+{
+	struct osd_zap_it *it = (struct osd_zap_it *)di;
+	zap_attribute_t	  *za = &osd_oti_get(env)->oti_za;
+	int		   rc, namelen = 0;
+	ENTRY;
+
+	if (it->ozi_pos <= 1)
+		namelen = 1;
+	else if (it->ozi_pos == 2)
+		namelen = 2;
+
+	if (namelen > 0) {
+		rc = lu_dirent_calc_size(namelen, attr);
+		RETURN(rc);
+	}
+
+	rc = -zap_cursor_retrieve(it->ozi_zc, za);
+	if (unlikely(rc != 0))
+		RETURN(rc);
+
+	if (za->za_integer_length != 8 || za->za_num_integers < 3) {
+		CERROR("%s: unsupported direntry format: %d %d\n",
+		       osd_obj2dev(it->ozi_obj)->od_svname,
+		       za->za_integer_length, (int)za->za_num_integers);
+		RETURN(-EIO);
+	}
+
+	namelen = strlen(za->za_name);
+	if (namelen > NAME_MAX)
+		RETURN(-EOVERFLOW);
+
+	rc = lu_dirent_calc_size(namelen, attr);
+
+	RETURN(rc);
+}
+
 static __u64 osd_dir_it_store(const struct lu_env *env, const struct dt_it *di)
 {
 	struct osd_zap_it *it = (struct osd_zap_it *)di;
@@ -1020,6 +1073,7 @@ static struct dt_index_operations osd_dir_ops = {
 		.key      = osd_dir_it_key,
 		.key_size = osd_dir_it_key_size,
 		.rec      = osd_dir_it_rec,
+		.rec_size = osd_dir_it_rec_size,
 		.store    = osd_dir_it_store,
 		.load     = osd_dir_it_load
 	}
@@ -1027,20 +1081,44 @@ static struct dt_index_operations osd_dir_ops = {
 
 /*
  * Primitives for index files using binary keys.
- * XXX: only 64-bit keys are supported for now.
  */
 
+/* key integer_size is 8: zero-pad key to __u64 words, return word count */
+static int osd_prepare_key_uint64(struct osd_object *o, __u64 *dst,
+				  const struct dt_key *src)
+{
+	int size;
+
+	LASSERT(dst);
+	LASSERT(src);
+
+	/* align keysize to 64bit; the padding bytes are zeroed below */
+	size = (o->oo_keysize + sizeof(__u64) - 1) / sizeof(__u64);
+	size *= sizeof(__u64);
+
+	LASSERT(size <= MAXNAMELEN);
+
+	if (unlikely(size > o->oo_keysize))
+		memset((char *)dst + o->oo_keysize, 0, size - o->oo_keysize);
+	memcpy(dst, (const char *)src, o->oo_keysize);
+
+	return (size/sizeof(__u64));
+}
+
 static int osd_index_lookup(const struct lu_env *env, struct dt_object *dt,
 			    struct dt_rec *rec, const struct dt_key *key,
 			    struct lustre_capa *capa)
 {
 	struct osd_object *obj = osd_dt_obj(dt);
 	struct osd_device *osd = osd_obj2dev(obj);
+	__u64		  *k = osd_oti_get(env)->oti_key64;
 	int                rc;
 	ENTRY;
 
+	rc = osd_prepare_key_uint64(obj, k, key);
+
 	rc = -zap_lookup_uint64(osd->od_objset.os, obj->oo_db->db_object,
-				(const __u64 *)key, 1, 8, obj->oo_recsize,
+				k, rc, obj->oo_recusize, obj->oo_recsize,
 				(void *)rec);
 	RETURN(rc == 0 ? 1 : rc);
 }
@@ -1078,6 +1156,7 @@ static int osd_index_insert(const struct lu_env *env, struct dt_object *dt,
 	struct osd_object  *obj = osd_dt_obj(dt);
 	struct osd_device  *osd = osd_obj2dev(obj);
 	struct osd_thandle *oh;
+	__u64		   *k = osd_oti_get(env)->oti_key64;
 	int                 rc;
 	ENTRY;
 
@@ -1088,9 +1167,11 @@ static int osd_index_insert(const struct lu_env *env, struct dt_object *dt,
 
 	oh = container_of0(th, struct osd_thandle, ot_super);
 
+	rc = osd_prepare_key_uint64(obj, k, key);
+
 	/* Insert (key,oid) into ZAP */
 	rc = -zap_add_uint64(osd->od_objset.os, obj->oo_db->db_object,
-			     (const __u64 *)key, 1, 8, obj->oo_recsize,
+			     k, rc, obj->oo_recusize, obj->oo_recsize,
 			     (void *)rec, oh->ot_tx);
 	RETURN(rc);
 }
@@ -1122,6 +1203,7 @@ static int osd_index_delete(const struct lu_env *env, struct dt_object *dt,
 	struct osd_object  *obj = osd_dt_obj(dt);
 	struct osd_device  *osd = osd_obj2dev(obj);
 	struct osd_thandle *oh;
+	__u64		   *k = osd_oti_get(env)->oti_key64;
 	int                 rc;
 	ENTRY;
 
@@ -1129,9 +1211,11 @@ static int osd_index_delete(const struct lu_env *env, struct dt_object *dt,
 	LASSERT(th != NULL);
 	oh = container_of0(th, struct osd_thandle, ot_super);
 
+	rc = osd_prepare_key_uint64(obj, k, key);
+
 	/* Remove binary key from the ZAP */
 	rc = -zap_remove_uint64(osd->od_objset.os, obj->oo_db->db_object,
-				(const __u64 *)key, 1, oh->ot_tx);
+				k, rc, oh->ot_tx);
 	RETURN(rc);
 }
 
@@ -1146,8 +1230,12 @@ static int osd_index_it_get(const struct lu_env *env, struct dt_it *di,
 	LASSERT(it);
 	LASSERT(it->ozi_zc);
 
-	/* XXX: API is broken at the moment */
-	LASSERT(*((const __u64 *)key) == 0);
+	/*
+	 * XXX: we need a binary version of zap_cursor_move_to_key()
+	 *	to implement this API */
+	if (*((const __u64 *)key) != 0)
+		CERROR("NOT IMPLEMETED YET (move to "LPX64")\n",
+		       *((const __u64 *)key));
 
 	zap_cursor_fini(it->ozi_zc);
 	memset(it->ozi_zc, 0, sizeof(*it->ozi_zc));
@@ -1185,6 +1273,7 @@ static struct dt_key *osd_index_it_key(const struct lu_env *env,
 				       const struct dt_it *di)
 {
 	struct osd_zap_it *it = (struct osd_zap_it *)di;
+	struct osd_object *obj = it->ozi_obj;
 	zap_attribute_t   *za = &osd_oti_get(env)->oti_za;
 	int                rc = 0;
 	ENTRY;
@@ -1195,7 +1284,7 @@ static struct dt_key *osd_index_it_key(const struct lu_env *env,
 		RETURN(ERR_PTR(rc));
 
 	/* the binary key is stored in the name */
-	it->ozi_key = *((__u64 *)za->za_name);
+	memcpy(&it->ozi_key, za->za_name, obj->oo_keysize);
 
 	RETURN((struct dt_key *)&it->ozi_key);
 }
@@ -1203,8 +1292,9 @@ static struct dt_key *osd_index_it_key(const struct lu_env *env,
 static int osd_index_it_key_size(const struct lu_env *env,
 				const struct dt_it *di)
 {
-	/* we only support 64-bit binary keys for the time being */
-	RETURN(sizeof(__u64));
+	struct osd_zap_it *it = (struct osd_zap_it *)di;
+	struct osd_object *obj = it->ozi_obj;
+	RETURN(obj->oo_keysize);
 }
 
 static int osd_index_it_rec(const struct lu_env *env, const struct dt_it *di,
@@ -1214,6 +1304,7 @@ static int osd_index_it_rec(const struct lu_env *env, const struct dt_it *di,
 	struct osd_zap_it *it = (struct osd_zap_it *)di;
 	struct osd_object *obj = it->ozi_obj;
 	struct osd_device *osd = osd_obj2dev(obj);
+	__u64		  *k = osd_oti_get(env)->oti_key64;
 	int                rc;
 	ENTRY;
 
@@ -1222,9 +1313,11 @@ static int osd_index_it_rec(const struct lu_env *env, const struct dt_it *di,
 	if (rc)
 		RETURN(rc);
 
+	rc = osd_prepare_key_uint64(obj, k, (const struct dt_key *)za->za_name);
+
 	rc = -zap_lookup_uint64(osd->od_objset.os, obj->oo_db->db_object,
-				(const __u64 *)za->za_name, 1, 8,
-				obj->oo_recsize, (void *)rec);
+				k, rc, obj->oo_recusize, obj->oo_recsize,
+				(void *)rec);
 	RETURN(rc);
 }
 
@@ -1285,6 +1378,216 @@ static struct dt_index_operations osd_index_ops = {
 	}
 };
 
+struct osd_metadnode_it {
+	struct osd_device	*mit_dev;
+	__u64			 mit_pos;
+	struct lu_fid		 mit_fid;
+	int			 mit_prefetched;
+	__u64			 mit_prefetched_dnode;
+};
+
+static struct dt_it *osd_zfs_otable_it_init(const struct lu_env *env,
+					    struct dt_object *dt, __u32 attr,
+					    struct lustre_capa *capa)
+{
+	struct osd_device	*dev   = osd_dev(dt->do_lu.lo_dev);
+	struct osd_metadnode_it *it;
+	ENTRY;
+
+	OBD_ALLOC_PTR(it);
+	if (unlikely(it == NULL))
+		RETURN(ERR_PTR(-ENOMEM));
+
+	it->mit_dev = dev;
+
+	/* XXX: dmu_object_next() does NOT find dnodes allocated
+	 *	in the current non-committed txg, so we force txg
+	 *	commit to find all existing dnodes ... */
+	txg_wait_synced(dmu_objset_pool(dev->od_objset.os), 0ULL);
+
+	RETURN((struct dt_it *)it);
+}
+
+static void osd_zfs_otable_it_fini(const struct lu_env *env, struct dt_it *di)
+{
+	struct osd_metadnode_it *it  = (struct osd_metadnode_it *)di;
+
+	OBD_FREE_PTR(it);
+}
+
+static int osd_zfs_otable_it_get(const struct lu_env *env,
+				 struct dt_it *di, const struct dt_key *key)
+{
+	return 0;
+}
+
+static void osd_zfs_otable_it_put(const struct lu_env *env, struct dt_it *di)
+{
+}
+
+#define OTABLE_PREFETCH		256
+
+static void osd_zfs_otable_prefetch(const struct lu_env *env,
+				    struct osd_metadnode_it *it)
+{
+	struct osd_device	*dev = it->mit_dev;
+	udmu_objset_t		*uos = &dev->od_objset;
+	int			 rc;
+
+	/* can go negative on the very first access to the iterator
+	 * or if some non-Lustre objects were found */
+	if (unlikely(it->mit_prefetched < 0))
+		it->mit_prefetched = 0;
+
+	if (it->mit_prefetched >= (OTABLE_PREFETCH >> 1))
+		return;
+
+	if (it->mit_prefetched_dnode == 0)
+		it->mit_prefetched_dnode = it->mit_pos;
+
+	while (it->mit_prefetched < OTABLE_PREFETCH) {
+		rc = -dmu_object_next(uos->os, &it->mit_prefetched_dnode,
+				      B_FALSE, 0);
+		if (unlikely(rc != 0))
+			break;
+
+		/* dmu_prefetch() was exported in 0.6.2, if you use with
+		 * an older release, just comment it out - this is an
+		 * optimization */
+		dmu_prefetch(uos->os, it->mit_prefetched_dnode, 0, 0);
+
+		it->mit_prefetched++;
+	}
+}
+
+static int osd_zfs_otable_it_next(const struct lu_env *env, struct dt_it *di)
+{
+	struct osd_metadnode_it *it  = (struct osd_metadnode_it *)di;
+	struct lustre_mdt_attrs *lma;
+	struct osd_device	*dev = it->mit_dev;
+	udmu_objset_t		*uos = &dev->od_objset;
+	nvlist_t		*nvbuf = NULL;
+	uchar_t			*v;
+	__u64			 dnode;
+	int			 rc, s;
+
+	memset(&it->mit_fid, 0, sizeof(it->mit_fid));
+
+	dnode = it->mit_pos;
+	do {
+		rc = -dmu_object_next(uos->os, &it->mit_pos, B_FALSE, 0);
+		if (unlikely(rc != 0))
+			GOTO(out, rc = 1);
+		it->mit_prefetched--;
+
+		/* LMA is required for this to be a Lustre object.
+		 * If there is no xattr skip it. */
+		rc = __osd_xattr_load(uos, it->mit_pos, &nvbuf);
+		if (unlikely(rc != 0))
+			continue;
+
+		LASSERT(nvbuf != NULL);
+		rc = -nvlist_lookup_byte_array(nvbuf, XATTR_NAME_LMA, &v, &s);
+		if (likely(rc == 0)) {
+			/* Lustre object */
+			lma = (struct lustre_mdt_attrs *)v;
+			lustre_lma_swab(lma);
+			it->mit_fid = lma->lma_self_fid;
+			nvlist_free(nvbuf);
+			break;
+		} else {
+			/* not a Lustre object, try next one */
+			nvlist_free(nvbuf);
+		}
+
+	} while (1);
+
+
+	/* we aren't prefetching in the above loop because the number of
+	 * non-Lustre objects is very small, so the loop rarely repeats.
+	 * in case we want to use this to iterate over non-Lustre
+	 * objects (i.e. when we convert regular ZFS in Lustre) it makes
+	 * sense to initiate prefetching in the loop */
+
+	/* 0 - there are more items, +1 - the end */
+	if (likely(rc == 0))
+		osd_zfs_otable_prefetch(env, it);
+
+	CDEBUG(D_OTHER, "advance: %llu -> %llu "DFID": %d\n", dnode,
+	       it->mit_pos, PFID(&it->mit_fid), rc);
+
+out:
+	return rc;
+}
+
+static struct dt_key *osd_zfs_otable_it_key(const struct lu_env *env,
+					    const struct dt_it *di)
+{
+	return NULL;
+}
+
+static int osd_zfs_otable_it_key_size(const struct lu_env *env,
+				      const struct dt_it *di)
+{
+	return sizeof(__u64);
+}
+
+static int osd_zfs_otable_it_rec(const struct lu_env *env,
+				 const struct dt_it *di,
+				 struct dt_rec *rec, __u32 attr)
+{
+	struct osd_metadnode_it *it  = (struct osd_metadnode_it *)di;
+	struct lu_fid *fid = (struct lu_fid *)rec;
+	ENTRY;
+
+	*fid = it->mit_fid;
+
+	RETURN(0);
+}
+
+
+static __u64 osd_zfs_otable_it_store(const struct lu_env *env,
+				     const struct dt_it *di)
+{
+	struct osd_metadnode_it *it  = (struct osd_metadnode_it *)di;
+
+	return it->mit_pos;
+}
+
+static int osd_zfs_otable_it_load(const struct lu_env *env,
+				  const struct dt_it *di, __u64 hash)
+{
+	struct osd_metadnode_it *it  = (struct osd_metadnode_it *)di;
+
+	it->mit_pos = hash;
+	it->mit_prefetched = 0;
+	it->mit_prefetched_dnode = 0;
+
+	return osd_zfs_otable_it_next(env, (struct dt_it *)di);
+}
+
+static int osd_zfs_otable_it_key_rec(const struct lu_env *env,
+				     const struct dt_it *di, void *key_rec)
+{
+	return 0;
+}
+
+const struct dt_index_operations osd_zfs_otable_ops = {
+	.dio_it = {
+		.init     = osd_zfs_otable_it_init,
+		.fini     = osd_zfs_otable_it_fini,
+		.get      = osd_zfs_otable_it_get,
+		.put	  = osd_zfs_otable_it_put,
+		.next     = osd_zfs_otable_it_next,
+		.key	  = osd_zfs_otable_it_key,
+		.key_size = osd_zfs_otable_it_key_size,
+		.rec      = osd_zfs_otable_it_rec,
+		.store    = osd_zfs_otable_it_store,
+		.load     = osd_zfs_otable_it_load,
+		.key_rec  = osd_zfs_otable_it_key_rec,
+	}
+};
+
 int osd_index_try(const struct lu_env *env, struct dt_object *dt,
 		const struct dt_index_features *feat)
 {
@@ -1300,9 +1603,10 @@ int osd_index_try(const struct lu_env *env, struct dt_object *dt,
 	if (feat->dif_flags & DT_IND_RANGE)
 		RETURN(-ERANGE);
 
-	if (unlikely(feat == &dt_otable_features))
-		/* do not support oi scrub yet. */
-		RETURN(-ENOTSUPP);
+	if (unlikely(feat == &dt_otable_features)) {
+		dt->do_index_ops = &osd_zfs_otable_ops;
+		RETURN(0);
+	}
 
 	LASSERT(obj->oo_db != NULL);
 	if (likely(feat == &dt_directory_features)) {
@@ -1320,10 +1624,9 @@ int osd_index_try(const struct lu_env *env, struct dt_object *dt,
 		if ((feat->dif_flags & ~DT_IND_UPDATE) != 0)
 			RETURN(-EINVAL);
 
-		/* Although the zap_*_uint64() primitives support large keys, we
-		 * limit ourselves to 64-bit keys for now */
-		if (feat->dif_keysize_max != sizeof(__u64) ||
-		    feat->dif_keysize_min != sizeof(__u64))
+		if (feat->dif_keysize_max > ZAP_MAXNAMELEN)
+			RETURN(-E2BIG);
+		if (feat->dif_keysize_max != feat->dif_keysize_min)
 			RETURN(-EINVAL);
 
 		/* As for the record size, it should be a multiple of 8 bytes
@@ -1331,14 +1634,20 @@ int osd_index_try(const struct lu_env *env, struct dt_object *dt,
 		 */
 		if (feat->dif_recsize_max > ZAP_MAXVALUELEN)
 			RETURN(-E2BIG);
-		if (feat->dif_recsize_max != feat->dif_recsize_min ||
-		    (feat->dif_recsize_max & (sizeof(__u64) - 1)))
+		if (feat->dif_recsize_max != feat->dif_recsize_min)
 			RETURN(-EINVAL);
 
-		obj->oo_recsize = feat->dif_recsize_max / sizeof(__u64);
+		obj->oo_keysize = feat->dif_keysize_max;
+		obj->oo_recsize = feat->dif_recsize_max;
+		obj->oo_recusize = 1;
+
+		/* ZFS prefers to work with array of 64bits */
+		if ((obj->oo_recsize & 7) == 0) {
+			obj->oo_recsize >>= 3;
+			obj->oo_recusize = 8;
+		}
 		dt->do_index_ops = &osd_index_ops;
 	}
 
 	RETURN(0);
 }
-