* Use is subject to license terms.
*/
/*
- * Copyright (c) 2012, Intel Corporation.
+ * Copyright (c) 2012, 2013, Intel Corporation.
* Use is subject to license terms.
*/
/*
* Author: Mike Pershin <tappro@whamcloud.com>
*/
-#ifndef EXPORT_SYMTAB
-# define EXPORT_SYMTAB
-#endif
#define DEBUG_SUBSYSTEM S_OSD
#include <lustre_ver.h>
#include <libcfs/libcfs.h>
-#include <lustre_fsfilt.h>
#include <obd_support.h>
#include <lustre_net.h>
#include <obd.h>
len = (len + align) & ~align;
lt = (void *)ent->lde_name + len;
- lt->lt_type = cpu_to_le16(CFS_DTTOIF(type));
+ lt->lt_type = cpu_to_le16(DTTOIF(type));
ent->lde_attrs |= LUDA_TYPE;
}
ent->lde_attrs = cpu_to_le32(ent->lde_attrs);
}
+/*
+ * Find the FID of the parent of \a o without going through an LU object.
+ *
+ * As we don't know the parent FID, we can't use an LU object, so this
+ * function partially duplicates __osd_xattr_get(), which is built around
+ * the LU object and uses it to cache data like the regular EA dnode, etc.
+ *
+ * The parent dnode is read from the SA_ZPL_PARENT attribute of \a o.
+ * The LMA xattr of that dnode is then searched for, first in the buffer
+ * returned by __osd_xattr_load() and, failing that, in the dnode
+ * referenced by the parent's SA_ZPL_XATTR attribute.
+ *
+ * \param env	execution environment; osd_oti_get(env)->oti_buf is used
+ *		as the buffer for the regular EA lookup
+ * \param o	object whose parent FID is wanted; must have oo_db set
+ * \param fid	filled with lma_self_fid of the parent's LMA on success
+ *
+ * \retval 0	the FID was found and stored in \a fid
+ * \retval -EIO	the regular EA lookup returned a non-negative value but
+ *		did not yield at least sizeof(struct lustre_mdt_attrs) bytes
+ * \retval other non-zero value: the result of the step that failed
+ */
+static int osd_find_parent_by_dnode(const struct lu_env *env,
+ struct dt_object *o,
+ struct lu_fid *fid)
+{
+ struct lustre_mdt_attrs *lma;
+ udmu_objset_t *uos = &osd_obj2dev(osd_dt_obj(o))->od_objset;
+ struct lu_buf buf;
+ sa_handle_t *sa_hdl;
+ nvlist_t *nvbuf = NULL;
+ uchar_t *value;
+ uint64_t dnode;
+ int rc, size;
+ ENTRY;
+
+ /* first of all, get parent dnode from own attributes;
+ * the private SA handle is destroyed right after the lookup */
+ LASSERT(osd_dt_obj(o)->oo_db);
+ rc = -sa_handle_get(uos->os, osd_dt_obj(o)->oo_db->db_object,
+ NULL, SA_HDL_PRIVATE, &sa_hdl);
+ if (rc)
+ RETURN(rc);
+
+ dnode = ZFS_NO_OBJECT;
+ rc = -sa_lookup(sa_hdl, SA_ZPL_PARENT(uos), &dnode, 8);
+ sa_handle_destroy(sa_hdl);
+ if (rc)
+ RETURN(rc);
+
+ /* now get EA buffer of the parent dnode; on failure nvbuf is expected
+ * to stay NULL and we fall back to the regular EA below */
+ rc = __osd_xattr_load(uos, dnode, &nvbuf);
+ if (rc)
+ GOTO(regular, rc);
+
+ /* XXX: if we get that far.. should we cache the result? */
+
+ /* try to find LMA attribute; it is only accepted if it is at least
+ * as large as struct lustre_mdt_attrs */
+ LASSERT(nvbuf != NULL);
+ rc = -nvlist_lookup_byte_array(nvbuf, XATTR_NAME_LMA, &value, &size);
+ if (rc == 0 && size >= sizeof(*lma)) {
+ lma = (struct lustre_mdt_attrs *)value;
+ lustre_lma_swab(lma);
+ *fid = lma->lma_self_fid;
+ GOTO(out, rc = 0);
+ }
+
+regular:
+ /* no usable LMA attribute in SA (or the EA buffer could not be
+ * loaded), let's try regular EA */
+
+ /* first of all, get the dnode storing the parent's regular EA:
+ * dnode is the parent here and is overwritten by the lookup */
+ rc = -sa_handle_get(uos->os, dnode, NULL, SA_HDL_PRIVATE, &sa_hdl);
+ if (rc)
+ GOTO(out, rc);
+
+ dnode = ZFS_NO_OBJECT;
+ rc = -sa_lookup(sa_hdl, SA_ZPL_XATTR(uos), &dnode, 8);
+ sa_handle_destroy(sa_hdl);
+ if (rc)
+ GOTO(out, rc);
+
+ CLASSERT(sizeof(*lma) <= sizeof(osd_oti_get(env)->oti_buf));
+ buf.lb_buf = osd_oti_get(env)->oti_buf;
+ buf.lb_len = sizeof(osd_oti_get(env)->oti_buf);
+
+ /* now try to find LMA; a zero return with too small a size, or a
+ * positive return, is reported as -EIO */
+ rc = __osd_xattr_get_large(env, uos, dnode, &buf,
+ XATTR_NAME_LMA, &size);
+ if (rc == 0 && size >= sizeof(*lma)) {
+ lma = buf.lb_buf;
+ lustre_lma_swab(lma);
+ *fid = lma->lma_self_fid;
+ GOTO(out, rc = 0);
+ } else if (rc < 0) {
+ GOTO(out, rc);
+ } else {
+ GOTO(out, rc = -EIO);
+ }
+
+out:
+ if (nvbuf != NULL)
+ nvlist_free(nvbuf);
+ RETURN(rc);
+}
+
static int osd_find_parent_fid(const struct lu_env *env, struct dt_object *o,
struct lu_fid *fid)
{
out:
if (buf.lb_buf != osd_oti_get(env)->oti_buf)
OBD_FREE(buf.lb_buf, buf.lb_len);
+
+#if 0
+ /* this block can be enabled for additional verification
+ * it's trying to match FID from LinkEA vs. FID from LMA */
+ if (rc == 0) {
+ struct lu_fid fid2;
+ int rc2;
+ rc2 = osd_find_parent_by_dnode(env, o, &fid2);
+ if (rc2 == 0)
+ if (lu_fid_eq(fid, &fid2) == 0)
+ CERROR("wrong parent: "DFID" != "DFID"\n",
+ PFID(fid), PFID(&fid2));
+ }
+#endif
+
+ /* no LinkEA is found, let's try to find the fid in parent's LMA */
+ if (unlikely(rc != 0))
+ rc = osd_find_parent_by_dnode(env, o, fid);
+
RETURN(rc);
}
lu_object_put(env, &obj->oo_dt.do_lu);
}
+/**
+ * Tell whether sequence \a seq is served by this node.
+ *
+ * \retval 1 if \a osd has no sequence site, or the FLD lookup reports an
+ *	     index equal to the site's ss_node_id
+ * \retval 0 if the FLD lookup fails or reports a different index
+ */
+static int osd_seq_exists(const struct lu_env *env, struct osd_device *osd,
+			  obd_seq seq)
+{
+	struct lu_seq_range *found = &osd_oti_get(env)->oti_seq_range;
+	struct seq_server_site *site = osd_seq_site(osd);
+	int err;
+	ENTRY;
+
+	/* no sequence site configured: report the sequence as present */
+	if (site == NULL)
+		RETURN(1);
+
+	err = osd_fld_lookup(env, osd, seq, found);
+	if (err == 0)
+		RETURN(site->ss_node_id == found->lsr_index);
+
+	/* a failed lookup is logged and reported as "not here" */
+	CERROR("%s: Can not lookup fld for "LPX64"\n",
+	       osd_name(osd), seq);
+	RETURN(0);
+}
+
+/**
+ * Tell whether \a fid refers to an object that is not served by this node.
+ *
+ * Only normal and root FIDs are checked; any other FID is reported as
+ * not remote.
+ *
+ * \retval 1 if osd_seq_exists() returns 0 for the FID's sequence
+ * \retval 0 otherwise
+ */
+static int osd_remote_fid(const struct lu_env *env, struct osd_device *osd,
+			  struct lu_fid *fid)
+{
+	ENTRY;
+
+	/* neither a normal nor a root FID: never treated as remote */
+	if (!(fid_is_norm(fid) || fid_is_root(fid)))
+		RETURN(0);
+
+	RETURN(osd_seq_exists(env, osd, fid_seq(fid)) ? 0 : 1);
+}
+
/**
* Inserts (key, value) pair in \a directory object.
*
struct osd_device *osd = osd_obj2dev(parent);
struct lu_fid *fid = (struct lu_fid *)rec;
struct osd_thandle *oh;
- struct osd_object *child;
+ struct osd_object *child = NULL;
__u32 attr;
char *name = (char *)key;
int rc;
LASSERT(th != NULL);
oh = container_of0(th, struct osd_thandle, ot_super);
- child = osd_object_find(env, dt, fid);
- if (IS_ERR(child))
- RETURN(PTR_ERR(child));
-
-/*
- * to simulate old Orion setups with ./.. stored in the directories
- */
-#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 3, 91, 0)
-#define OSD_ZFS_INSERT_DOTS_FOR_TESTING__
-#endif
+ rc = osd_remote_fid(env, osd, fid);
+ if (rc < 0) {
+ CERROR("%s: Can not find object "DFID": rc = %d\n",
+ osd->od_svname, PFID(fid), rc);
+ RETURN(rc);
+ }
- LASSERT(child->oo_db);
- if (name[0] == '.') {
- if (name[1] == 0) {
- /* do not store ".", instead generate it
- * during iteration */
-#ifndef OSD_ZFS_INSERT_DOTS_FOR_TESTING
- GOTO(out, rc = 0);
-#endif
- } else if (name[1] == '.' && name[2] == 0) {
- /* update parent dnode in the child.
- * later it will be used to generate ".." */
- udmu_objset_t *uos = &osd->od_objset;
- rc = osd_object_sa_update(child,
- SA_ZPL_PARENT(uos),
- &parent->oo_db->db_object,
- 8, oh);
-
-#ifndef OSD_ZFS_INSERT_DOTS_FOR_TESTING
- GOTO(out, rc);
-#endif
+ if (unlikely(rc == 1)) {
+ /* Insert remote entry */
+ memset(&oti->oti_zde.lzd_reg, 0, sizeof(oti->oti_zde.lzd_reg));
+ oti->oti_zde.lzd_reg.zde_type = IFTODT(S_IFDIR & S_IFMT);
+ } else {
+ /*
+ * To simulate old Orion setups with ./.. stored in the
+ * directories
+ */
+ /* Insert local entry */
+ child = osd_object_find(env, dt, fid);
+ if (IS_ERR(child))
+ RETURN(PTR_ERR(child));
+
+ LASSERT(child->oo_db);
+ if (name[0] == '.') {
+ if (name[1] == 0) {
+ /* do not store ".", instead generate it
+ * during iteration */
+ GOTO(out, rc = 0);
+ } else if (name[1] == '.' && name[2] == 0) {
+ /* update parent dnode in the child.
+ * later it will be used to generate ".." */
+ udmu_objset_t *uos = &osd->od_objset;
+ rc = osd_object_sa_update(parent,
+ SA_ZPL_PARENT(uos),
+ &child->oo_db->db_object,
+ 8, oh);
+ GOTO(out, rc);
+ }
}
+ CLASSERT(sizeof(oti->oti_zde.lzd_reg) == 8);
+ CLASSERT(sizeof(oti->oti_zde) % 8 == 0);
+ attr = child->oo_dt.do_lu.lo_header ->loh_attr;
+ oti->oti_zde.lzd_reg.zde_type = IFTODT(attr & S_IFMT);
+ oti->oti_zde.lzd_reg.zde_dnode = child->oo_db->db_object;
}
- CLASSERT(sizeof(oti->oti_zde.lzd_reg) == 8);
- CLASSERT(sizeof(oti->oti_zde) % 8 == 0);
- attr = child->oo_dt.do_lu.lo_header ->loh_attr;
- oti->oti_zde.lzd_reg.zde_type = IFTODT(attr & S_IFMT);
- oti->oti_zde.lzd_reg.zde_dnode = child->oo_db->db_object;
oti->oti_zde.lzd_fid = *fid;
-
/* Insert (key,oid) into ZAP */
rc = -zap_add(osd->od_objset.os, parent->oo_db->db_object,
(char *)key, 8, sizeof(oti->oti_zde) / 8,
(void *)&oti->oti_zde, oh->ot_tx);
-#ifndef OSD_ZFS_INSERT_DOTS_FOR_TESTING
out:
-#endif
- osd_object_put(env, child);
+ if (child != NULL)
+ osd_object_put(env, child);
RETURN(rc);
}
LASSERT(th != NULL);
oh = container_of0(th, struct osd_thandle, ot_super);
-#ifndef OSD_ZFS_INSERT_DOTS_FOR_TESTING
/*
- * in Orion . and .. were stored in the directory (not generated up on
- * request as now. we preserve them for backward compatibility
+ * In Orion, "." and ".." were stored in the directory (not generated upon
+ * request as now). We preserve them for backward compatibility.
*/
if (name[0] == '.') {
if (name[1] == 0) {
RETURN(0);
}
}
-#endif
/* Remove key from the ZAP */
rc = -zap_remove(osd->od_objset.os, zap_db->db_object,
strcpy(it->ozi_name, za->za_name);
-#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 3, 91, 0)
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 3, 90, 0)
if (za->za_name[0] == '.') {
if (za->za_name[1] == 0 || (za->za_name[1] == '.' &&
za->za_name[2] == 0)) {
if ((rc = -zap_cursor_retrieve(it->ozi_zc, za)) == 0)
rc = strlen(za->za_name);
-#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 3, 99, 0)
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 3, 90, 0)
if (rc == 0 && za->za_name[0] == '.') {
if (za->za_name[1] == 0 || (za->za_name[1] == '.' &&
za->za_name[2] == 0)) {
int rc;
ENTRY;
- if (it->ozi_pos != 0) {
- /* the cursor wasn't at the beginning
- * so we should reset ZAP cursor as well */
- udmu_zap_cursor_fini(it->ozi_zc);
- if (udmu_zap_cursor_init(&it->ozi_zc, &osd->od_objset,
- obj->oo_db->db_object, hash))
- RETURN(-ENOMEM);
- }
+ udmu_zap_cursor_fini(it->ozi_zc);
+ if (udmu_zap_cursor_init(&it->ozi_zc, &osd->od_objset,
+ obj->oo_db->db_object, hash))
+ RETURN(-ENOMEM);
if (hash <= 2) {
it->ozi_pos = hash;
/*
* Primitives for index files using binary keys.
- * XXX: only 64-bit keys are supported for now.
*/
+/**
+ * Copy binary key \a src into \a dst, zero-padded up to a multiple of
+ * sizeof(__u64) bytes.
+ *
+ * \param o	index object; o->oo_keysize is the key length in bytes
+ * \param dst	destination buffer; must be able to hold the padded key
+ * \param src	source key; o->oo_keysize bytes are read from it
+ *
+ * \retval the padded key size in bytes
+ *
+ * NOTE(review): the callers in this file pass the returned byte count as
+ * the key length argument of zap_*_uint64(); confirm whether that argument
+ * is expected in bytes or in 64-bit words.
+ */
+static int osd_prepare_key(struct osd_object *o, __u64 *dst,
+			   const struct dt_key *src)
+{
+	int size;
+
+	LASSERT(dst);
+	LASSERT(src);
+
+	/* align keysize to 64bit */
+	size = (o->oo_keysize + sizeof(__u64) - 1) / sizeof(__u64);
+	size *= sizeof(__u64);
+
+	LASSERT(size <= MAXNAMELEN);
+
+	/*
+	 * Zero the padding that follows the key. dst is a __u64 pointer, so
+	 * the offset must be applied through a char pointer to be counted in
+	 * bytes rather than in 8-byte units.
+	 */
+	if (unlikely(size > o->oo_keysize))
+		memset((char *)dst + o->oo_keysize, 0, size - o->oo_keysize);
+	memcpy(dst, (const char *)src, o->oo_keysize);
+
+	return size;
+}
+
+/**
+ * Look up binary key \a key in index object \a dt.
+ *
+ * The key is first copied into the oti_key64 buffer by osd_prepare_key()
+ * and the copy is handed to zap_lookup_uint64() with \a rec as the value
+ * buffer.
+ *
+ * \retval 1 if zap_lookup_uint64() returned 0
+ * \retval otherwise the negated zap_lookup_uint64() return value
+ */
static int osd_index_lookup(const struct lu_env *env, struct dt_object *dt,
struct dt_rec *rec, const struct dt_key *key,
struct lustre_capa *capa)
{
struct osd_object *obj = osd_dt_obj(dt);
struct osd_device *osd = osd_obj2dev(obj);
+ __u64 *k = osd_oti_get(env)->oti_key64;
int rc;
ENTRY;
+ rc = osd_prepare_key(obj, k, key);
+
rc = -zap_lookup_uint64(osd->od_objset.os, obj->oo_db->db_object,
- (const __u64 *)key, 1, 8, obj->oo_recsize,
+ k, rc, obj->oo_recusize, obj->oo_recsize,
(void *)rec);
RETURN(rc == 0 ? 1 : rc);
}
struct osd_object *obj = osd_dt_obj(dt);
struct osd_device *osd = osd_obj2dev(obj);
struct osd_thandle *oh;
+ __u64 *k = osd_oti_get(env)->oti_key64;
int rc;
ENTRY;
oh = container_of0(th, struct osd_thandle, ot_super);
+ rc = osd_prepare_key(obj, k, key);
+
/* Insert (key,oid) into ZAP */
rc = -zap_add_uint64(osd->od_objset.os, obj->oo_db->db_object,
- (const __u64 *)key, 1, 8, obj->oo_recsize,
+ k, rc, obj->oo_recusize, obj->oo_recsize,
(void *)rec, oh->ot_tx);
RETURN(rc);
}
struct osd_object *obj = osd_dt_obj(dt);
struct osd_device *osd = osd_obj2dev(obj);
struct osd_thandle *oh;
+ __u64 *k = osd_oti_get(env)->oti_key64;
int rc;
ENTRY;
LASSERT(th != NULL);
oh = container_of0(th, struct osd_thandle, ot_super);
+ rc = osd_prepare_key(obj, k, key);
+
/* Remove binary key from the ZAP */
rc = -zap_remove_uint64(osd->od_objset.os, obj->oo_db->db_object,
- (const __u64 *)key, 1, oh->ot_tx);
+ k, rc, oh->ot_tx);
RETURN(rc);
}
LASSERT(it);
LASSERT(it->ozi_zc);
- /* XXX: API is broken at the moment */
- LASSERT(*((const __u64 *)key) == 0);
+ /*
+ * XXX: we need a binary version of zap_cursor_move_to_key()
+ * to implement this API */
+ if (*((const __u64 *)key) != 0)
+ CERROR("NOT IMPLEMETED YET (move to %Lx)\n", *((__u64 *)key));
zap_cursor_fini(it->ozi_zc);
memset(it->ozi_zc, 0, sizeof(*it->ozi_zc));
const struct dt_it *di)
{
struct osd_zap_it *it = (struct osd_zap_it *)di;
+ struct osd_object *obj = it->ozi_obj;
zap_attribute_t *za = &osd_oti_get(env)->oti_za;
int rc = 0;
ENTRY;
RETURN(ERR_PTR(rc));
/* the binary key is stored in the name */
- it->ozi_key = *((__u64 *)za->za_name);
+ memcpy(&it->ozi_key, za->za_name, obj->oo_keysize);
RETURN((struct dt_key *)&it->ozi_key);
}
+/**
+ * Return the key size of the index being iterated, i.e. oo_keysize of the
+ * object attached to iterator \a di.
+ */
static int osd_index_it_key_size(const struct lu_env *env,
const struct dt_it *di)
{
- /* we only support 64-bit binary keys for the time being */
- RETURN(sizeof(__u64));
+ struct osd_zap_it *it = (struct osd_zap_it *)di;
+ struct osd_object *obj = it->ozi_obj;
+ RETURN(obj->oo_keysize);
}
static int osd_index_it_rec(const struct lu_env *env, const struct dt_it *di,
struct osd_zap_it *it = (struct osd_zap_it *)di;
struct osd_object *obj = it->ozi_obj;
struct osd_device *osd = osd_obj2dev(obj);
+ __u64 *k = osd_oti_get(env)->oti_key64;
int rc;
ENTRY;
if (rc)
RETURN(rc);
+ rc = osd_prepare_key(obj, k, (const struct dt_key *)za->za_name);
+
rc = -zap_lookup_uint64(osd->od_objset.os, obj->oo_db->db_object,
- (const __u64 *)za->za_name, 1, 8,
- obj->oo_recsize, (void *)rec);
+ k, rc, obj->oo_recusize, obj->oo_recsize,
+ (void *)rec);
RETURN(rc);
}
if ((feat->dif_flags & ~DT_IND_UPDATE) != 0)
RETURN(-EINVAL);
- /* Although the zap_*_uint64() primitives support large keys, we
- * limit ourselves to 64-bit keys for now */
- if (feat->dif_keysize_max != sizeof(__u64) ||
- feat->dif_keysize_min != sizeof(__u64))
+ if (feat->dif_keysize_max > ZAP_MAXNAMELEN)
+ RETURN(-E2BIG);
+ if (feat->dif_keysize_max != feat->dif_keysize_min)
RETURN(-EINVAL);
/* As for the record size, it should be a multiple of 8 bytes
*/
if (feat->dif_recsize_max > ZAP_MAXVALUELEN)
RETURN(-E2BIG);
- if (feat->dif_recsize_max != feat->dif_recsize_min ||
- (feat->dif_recsize_max & (sizeof(__u64) - 1)))
+ if (feat->dif_recsize_max != feat->dif_recsize_min)
RETURN(-EINVAL);
- obj->oo_recsize = feat->dif_recsize_max / sizeof(__u64);
+ obj->oo_keysize = feat->dif_keysize_max;
+ obj->oo_recsize = feat->dif_recsize_max;
+ obj->oo_recusize = 1;
+
+ /* ZFS prefers to work with array of 64bits */
+ if ((obj->oo_recsize & 7) == 0) {
+ obj->oo_recsize >>= 3;
+ obj->oo_recusize = 8;
+ }
dt->do_index_ops = &osd_index_ops;
}