X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fosd-zfs%2Fosd_oi.c;h=e350ba82b515bd133b307523ffd266f203b299d7;hb=5277d9452ae990d53ca84a8cfb2150bda06772fb;hp=371c442e5468efa878622bb7eb4f0eaea2d87976;hpb=745c19c70319915a55b71b81b4e89d68e3a4e272;p=fs%2Flustre-release.git diff --git a/lustre/osd-zfs/osd_oi.c b/lustre/osd-zfs/osd_oi.c index 371c442..e350ba8 100644 --- a/lustre/osd-zfs/osd_oi.c +++ b/lustre/osd-zfs/osd_oi.c @@ -15,21 +15,15 @@ * * You should have received a copy of the GNU General Public License * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. + * http://www.gnu.org/licenses/gpl-2.0.html * * GPL HEADER END */ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. - */ -/* - * Copyright (c) 2012, 2013, Intel Corporation. - * Use is subject to license terms. + * + * Copyright (c) 2012, 2016, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -45,7 +39,6 @@ #define DEBUG_SUBSYSTEM S_OSD -#include #include #include #include @@ -85,18 +78,17 @@ struct named_oid { }; static const struct named_oid oids[] = { - { LAST_RECV_OID, LAST_RCVD }, - { OFD_LAST_GROUP_OID, "LAST_GROUP" }, - { LLOG_CATALOGS_OID, "CATALOGS" }, - { MGS_CONFIGS_OID, NULL /*MOUNT_CONFIGS_DIR*/ }, - { FID_SEQ_SRV_OID, "seq_srv" }, - { FID_SEQ_CTL_OID, "seq_ctl" }, - { FLD_INDEX_OID, "fld" }, - { MDD_LOV_OBJ_OID, LOV_OBJID }, - { OFD_HEALTH_CHECK_OID, HEALTH_CHECK }, - { ACCT_USER_OID, "acct_usr_inode" }, - { ACCT_GROUP_OID, "acct_grp_inode" }, - { 0, NULL } + { .oid = LAST_RECV_OID, .name = LAST_RCVD }, + { .oid = OFD_LAST_GROUP_OID, .name = "LAST_GROUP" }, + { .oid = LLOG_CATALOGS_OID, .name = "CATALOGS" }, + { .oid = MGS_CONFIGS_OID, /*MOUNT_CONFIGS_DIR*/ }, + { .oid = FID_SEQ_SRV_OID, .name = "seq_srv" }, + { .oid = FID_SEQ_CTL_OID, .name = "seq_ctl" }, + { .oid = FLD_INDEX_OID, .name = "fld" }, + { .oid = MDD_LOV_OBJ_OID, .name = LOV_OBJID }, + { .oid = OFD_HEALTH_CHECK_OID, .name = HEALTH_CHECK }, + { .oid = REPLY_DATA_OID, .name = REPLY_DATA }, + { .oid = 0 } }; static char *oid2name(const unsigned long oid) @@ -129,10 +121,9 @@ osd_oi_lookup(const struct lu_env *env, struct osd_device *o, if (rc >= sizeof(oi->oi_name)) return -E2BIG; - rc = 0; oi->oi_zapid = zde->zde_dnode; - return rc; + return 0; } /** @@ -144,8 +135,9 @@ osd_oi_create(const struct lu_env *env, struct osd_device *o, { struct zpl_direntry *zde = &osd_oti_get(env)->oti_zde.lzd_reg; struct lu_attr *la = &osd_oti_get(env)->oti_la; - dmu_buf_t *db; + sa_handle_t *sa_hdl = NULL; dmu_tx_t *tx; + uint64_t oid; int rc; /* verify it doesn't already exist */ @@ -153,6 +145,9 @@ osd_oi_create(const struct lu_env *env, struct osd_device *o, if (rc == 0) return -EEXIST; + if (o->od_dt_dev.dd_rdonly) + return -EROFS; + /* create fid-to-dnode index */ tx = dmu_tx_create(o->od_os); if (tx == NULL) @@ -161,7 +156,6 @@ osd_oi_create(const struct lu_env *env, struct osd_device *o, dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 1, NULL); dmu_tx_hold_bonus(tx, parent); dmu_tx_hold_zap(tx, parent, TRUE, name); - LASSERT(tx->tx_objset->os_sa); dmu_tx_hold_sa_create(tx, ZFS_SA_BASE_ATTR_SIZE); rc = -dmu_tx_assign(tx, TXG_WAIT); @@ -170,21 +164,36 @@ osd_oi_create(const struct lu_env *env, struct osd_device *o, return rc; } + oid = osd_zap_create_flags(o->od_os, 0, ZAP_FLAG_HASH64, + DMU_OT_DIRECTORY_CONTENTS, + 14, /* == ZFS fzap_default_block_shift */ + DN_MAX_INDBLKSHIFT, + 0, tx); + + rc = -sa_handle_get(o->od_os, oid, NULL, SA_HDL_PRIVATE, &sa_hdl); + if (rc) + goto commit; la->la_valid = LA_MODE | LA_UID | LA_GID; la->la_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; la->la_uid = la->la_gid = 0; - __osd_zap_create(env, o, &db, tx, la, parent, 0); + rc = __osd_attr_init(env, o, sa_hdl, tx, la, parent, NULL); + sa_handle_destroy(sa_hdl); + if (rc) + goto commit; - zde->zde_dnode = db->db_object; + zde->zde_dnode = oid; zde->zde_pad = 0; zde->zde_type = IFTODT(S_IFDIR); rc = -zap_add(o->od_os, parent, name, 8, 1, (void *)zde, tx); +commit: + if (rc) + dmu_object_free(o->od_os, oid, tx); dmu_tx_commit(tx); - *child = db->db_object; - sa_buf_rele(db, osd_obj_tag); + if (rc == 0) + *child = oid; return rc; } @@ -210,7 +219,7 @@ osd_oi_find_or_create(const struct lu_env *env, struct osd_device *o, * the object is located (tgt index) and it is MDT or OST object. */ int osd_fld_lookup(const struct lu_env *env, struct osd_device *osd, - obd_seq seq, struct lu_seq_range *range) + u64 seq, struct lu_seq_range *range) { struct seq_server_site *ss = osd_seq_site(osd); @@ -251,6 +260,14 @@ int fid_is_on_ost(const struct lu_env *env, struct osd_device *osd, rc = osd_fld_lookup(env, osd, fid_seq(fid), range); if (rc != 0) { + /* During upgrade, OST FLDB might not be loaded because + * OST FLDB is not created until 2.6, so if some DNE + * filesystem upgrade from 2.5 to 2.7/2.8, they will + * not be able to find the sequence from local FLDB + * cache see fld_index_init(). */ + if (rc == -ENOENT && osd->od_is_ost) + RETURN(1); + if (rc != -ENOENT) CERROR("%s: "DFID" lookup failed: rc = %d\n", osd_name(osd), PFID(fid), rc); @@ -264,7 +281,7 @@ int fid_is_on_ost(const struct lu_env *env, struct osd_device *osd, } static struct osd_seq *osd_seq_find_locked(struct osd_seq_list *seq_list, - obd_seq seq) + u64 seq) { struct osd_seq *osd_seq; @@ -275,8 +292,7 @@ static struct osd_seq *osd_seq_find_locked(struct osd_seq_list *seq_list, return NULL; } -static struct osd_seq *osd_seq_find(struct osd_seq_list *seq_list, - obd_seq seq) +static struct osd_seq *osd_seq_find(struct osd_seq_list *seq_list, u64 seq) { struct osd_seq *osd_seq; @@ -288,7 +304,7 @@ static struct osd_seq *osd_seq_find(struct osd_seq_list *seq_list, } static struct osd_seq *osd_find_or_add_seq(const struct lu_env *env, - struct osd_device *osd, obd_seq seq) + struct osd_device *osd, u64 seq) { struct osd_seq_list *seq_list = &osd->od_seq_list; struct osd_seq *osd_seq; @@ -328,7 +344,7 @@ static struct osd_seq *osd_find_or_add_seq(const struct lu_env *env, oi.oi_zapid = osd->od_O_id; sprintf(seq_name, (fid_seq_is_rsvd(seq) || - fid_seq_is_mdt0(seq)) ? LPU64 : LPX64i, + fid_seq_is_mdt0(seq)) ? "%llu" : "%llx", fid_seq_is_idif(seq) ? 0 : seq); rc = osd_oi_find_or_create(env, osd, oi.oi_zapid, seq_name, &odb); @@ -369,11 +385,11 @@ out: */ static uint64_t osd_get_idx_for_ost_obj(const struct lu_env *env, struct osd_device *osd, - const struct lu_fid *fid, char *buf) + const struct lu_fid *fid, char *buf, int bufsize) { struct osd_seq *osd_seq; unsigned long b; - obd_id id; + u64 id; int rc; osd_seq = osd_find_or_add_seq(env, osd, fid_seq(fid)); @@ -394,7 +410,8 @@ osd_get_idx_for_ost_obj(const struct lu_env *env, struct osd_device *osd, b = id % OSD_OST_MAP_SIZE; LASSERT(osd_seq->os_compat_dirs[b]); - sprintf(buf, LPU64, id); + if (buf) + snprintf(buf, bufsize, "%llu", id); return osd_seq->os_compat_dirs[b]; } @@ -413,41 +430,47 @@ static void osd_fid2str(char *buf, const struct lu_fid *fid) */ static uint64_t osd_get_idx_for_fid(struct osd_device *osd, const struct lu_fid *fid, - char *buf) + char *buf, dnode_t **zdn) { struct osd_oi *oi; LASSERT(osd->od_oi_table != NULL); oi = osd->od_oi_table[fid_seq(fid) & (osd->od_oi_count - 1)]; - osd_fid2str(buf, fid); + if (buf) + osd_fid2str(buf, fid); + if (zdn) + *zdn = oi->oi_dn; return oi->oi_zapid; } uint64_t osd_get_name_n_idx(const struct lu_env *env, struct osd_device *osd, - const struct lu_fid *fid, char *buf) + const struct lu_fid *fid, char *buf, int bufsize, + dnode_t **zdn) { uint64_t zapid; LASSERT(fid); - LASSERT(buf); + LASSERT(!fid_is_acct(fid)); + + if (zdn != NULL) + *zdn = NULL; if (fid_is_on_ost(env, osd, fid) == 1 || fid_seq(fid) == FID_SEQ_ECHO) { - zapid = osd_get_idx_for_ost_obj(env, osd, fid, buf); + zapid = osd_get_idx_for_ost_obj(env, osd, fid, buf, bufsize); } else if (unlikely(fid_seq(fid) == FID_SEQ_LOCAL_FILE)) { /* special objects with fixed known fids get their name */ char *name = oid2name(fid_oid(fid)); if (name) { zapid = osd->od_root; - strcpy(buf, name); - if (fid_is_acct(fid)) - zapid = MASTER_NODE_OBJ; + if (buf) + strncpy(buf, name, bufsize); } else { - zapid = osd_get_idx_for_fid(osd, fid, buf); + zapid = osd_get_idx_for_fid(osd, fid, buf, NULL); } } else { - zapid = osd_get_idx_for_fid(osd, fid, buf); + zapid = osd_get_idx_for_fid(osd, fid, buf, zdn); } return zapid; @@ -465,32 +488,31 @@ int osd_fid_lookup(const struct lu_env *env, struct osd_device *dev, { struct osd_thread_info *info = osd_oti_get(env); char *buf = info->oti_buf; - uint64_t zapid; + dnode_t *zdn; + uint64_t zapid; int rc = 0; ENTRY; - if (OBD_FAIL_CHECK(OBD_FAIL_OST_ENOENT)) + if (OBD_FAIL_CHECK(OBD_FAIL_SRV_ENOENT)) RETURN(-ENOENT); - if (unlikely(fid_is_acct(fid))) { - if (fid_oid(fid) == ACCT_USER_OID) - *oid = dev->od_iusr_oid; - else - *oid = dev->od_igrp_oid; - } else if (unlikely(fid_is_fs_root(fid))) { + LASSERT(!fid_is_acct(fid)); + + if (unlikely(fid_is_fs_root(fid))) { *oid = dev->od_root; } else { - zapid = osd_get_name_n_idx(env, dev, fid, buf); - - rc = -zap_lookup(dev->od_os, zapid, buf, - 8, 1, &info->oti_zde); + zapid = osd_get_name_n_idx(env, dev, fid, buf, + sizeof(info->oti_buf), &zdn); + rc = osd_zap_lookup(dev, zapid, zdn, buf, + 8, 1, &info->oti_zde); if (rc) RETURN(rc); *oid = info->oti_zde.lzd_reg.zde_dnode; } if (rc == 0) - dmu_prefetch(dev->od_os, *oid, 0, 0); + osd_dmu_prefetch(dev->od_os, *oid, 0, 0, 0, + ZIO_PRIORITY_ASYNC_READ); RETURN(rc); } @@ -507,6 +529,8 @@ osd_oi_remove_table(const struct lu_env *env, struct osd_device *o, int key) oi = o->od_oi_table[key]; if (oi) { + if (oi->oi_dn) + osd_dnode_rele(oi->oi_dn); OBD_FREE_PTR(oi); o->od_oi_table[key] = NULL; } @@ -536,6 +560,7 @@ osd_oi_add_table(const struct lu_env *env, struct osd_device *o, } o->od_oi_table[key] = oi; + __osd_obj2dnode(o->od_os, oi->oi_zapid, &oi->oi_dn); return 0; } @@ -619,15 +644,6 @@ osd_oi_probe(const struct lu_env *env, struct osd_device *o, int *count) RETURN(0); } -static void osd_ost_seq_init(const struct lu_env *env, struct osd_device *osd) -{ - struct osd_seq_list *osl = &osd->od_seq_list; - - INIT_LIST_HEAD(&osl->osl_seq_list); - rwlock_init(&osl->osl_seq_list_lock); - sema_init(&osl->osl_seq_init_sem, 1); -} - static void osd_ost_seq_fini(const struct lu_env *env, struct osd_device *osd) { struct osd_seq_list *osl = &osd->od_seq_list; @@ -652,134 +668,14 @@ static void osd_ost_seq_fini(const struct lu_env *env, struct osd_device *osd) static int osd_oi_init_compat(const struct lu_env *env, struct osd_device *o) { - uint64_t odb, sdb; - int rc; + uint64_t sdb; + int rc; ENTRY; rc = osd_oi_find_or_create(env, o, o->od_root, "O", &sdb); - if (rc) - RETURN(rc); - - o->od_O_id = sdb; - - osd_ost_seq_init(env, o); - /* Create on-disk indexes to maintain per-UID/GID inode usage. - * Those new indexes are created in the top-level ZAP outside the - * namespace in order not to confuse ZPL which might interpret those - * indexes as directories and assume the values are object IDs */ - rc = osd_oi_find_or_create(env, o, MASTER_NODE_OBJ, - oid2name(ACCT_USER_OID), &odb); - if (rc) - RETURN(rc); - o->od_iusr_oid = odb; - - rc = osd_oi_find_or_create(env, o, MASTER_NODE_OBJ, - oid2name(ACCT_GROUP_OID), &odb); - if (rc) - RETURN(rc); - o->od_igrp_oid = odb; - - RETURN(rc); -} - -static char *root2convert = "ROOT"; -/* - * due to DNE requirements we have to change sequence of /ROOT object - * so that it doesn't belong to the local sequence FID_SEQ_LOCAL_FILE - * but a normal sequence living on MDS#0 - * this is the sole purpose of this function. - * - * This is only needed for pre-production 2.4 ZFS filesystems, and - * can be removed in the future. - */ -int osd_convert_root_to_new_seq(const struct lu_env *env, - struct osd_device *o) -{ - struct luz_direntry *lze = &osd_oti_get(env)->oti_zde; - char *buf = osd_oti_get(env)->oti_str; - struct lu_fid newfid; - uint64_t zapid; - dmu_tx_t *tx = NULL; - int rc; - ENTRY; - - /* ignore OSTs */ - if (strstr(o->od_svname, "MDT") == NULL) - RETURN(0); - - /* lookup /ROOT */ - rc = -zap_lookup(o->od_os, o->od_root, root2convert, 8, - sizeof(*lze) / 8, (void *)lze); - /* doesn't exist or let actual user to handle the error */ - if (rc) - RETURN(0); - - CDEBUG(D_OTHER, "%s: /ROOT -> "DFID" -> "LPU64"\n", o->od_svname, - PFID(&lze->lzd_fid), (long long int) lze->lzd_reg.zde_dnode); - - /* already right one? */ - if (fid_seq(&lze->lzd_fid) == FID_SEQ_ROOT) - return 0; - - tx = dmu_tx_create(o->od_os); - if (tx == NULL) - return -ENOMEM; - - dmu_tx_hold_bonus(tx, o->od_root); - - /* declare delete/insert of the name */ - dmu_tx_hold_zap(tx, o->od_root, TRUE, root2convert); - dmu_tx_hold_zap(tx, o->od_root, FALSE, root2convert); - - /* declare that we'll remove object from fid-dnode mapping */ - zapid = osd_get_name_n_idx(env, o, &lze->lzd_fid, buf); - dmu_tx_hold_bonus(tx, zapid); - dmu_tx_hold_zap(tx, zapid, FALSE, buf); - - /* declare that we'll add object to fid-dnode mapping */ - newfid.f_seq = FID_SEQ_ROOT; - newfid.f_oid = 1; - newfid.f_ver = 0; - zapid = osd_get_name_n_idx(env, o, &newfid, buf); - dmu_tx_hold_bonus(tx, zapid); - dmu_tx_hold_zap(tx, zapid, TRUE, buf); - - rc = -dmu_tx_assign(tx, TXG_WAIT); - if (rc) - GOTO(err, rc); - - rc = -zap_remove(o->od_os, o->od_root, root2convert, tx); - if (rc) - GOTO(err, rc); - - /* remove from OI */ - zapid = osd_get_name_n_idx(env, o, &lze->lzd_fid, buf); - rc = -zap_remove(o->od_os, zapid, buf, tx); - if (rc) - GOTO(err, rc); - - lze->lzd_fid = newfid; - rc = -zap_add(o->od_os, o->od_root, root2convert, - 8, sizeof(*lze) / 8, (void *)lze, tx); - if (rc) - GOTO(err, rc); - - /* add to OI with the new fid */ - zapid = osd_get_name_n_idx(env, o, &newfid, buf); - rc = -zap_add(o->od_os, zapid, buf, 8, 1, &lze->lzd_reg, tx); - if (rc) - GOTO(err, rc); - + if (!rc) + o->od_O_id = sdb; - /* LMA will be updated in mdd_compat_fixes */ - dmu_tx_commit(tx); - - RETURN(rc); - -err: - if (tx) - dmu_tx_abort(tx); - CERROR("%s: can't convert to new fid: rc = %d\n", o->od_svname, rc); RETURN(rc); } @@ -861,4 +757,142 @@ int osd_options_init(void) return 0; } +/* + * the following set of functions are used to maintain per-thread + * cache of FID->ino mapping. this mechanism is used to avoid + * expensive LU/OI lookups. + */ +struct osd_idmap_cache *osd_idc_find(const struct lu_env *env, + struct osd_device *osd, + const struct lu_fid *fid) +{ + struct osd_thread_info *oti = osd_oti_get(env); + struct osd_idmap_cache *idc = oti->oti_ins_cache; + int i; + + for (i = 0; i < oti->oti_ins_cache_used; i++) { + if (!lu_fid_eq(&idc[i].oic_fid, fid)) + continue; + if (idc[i].oic_dev != osd) + continue; + + return idc + i; + } + + return NULL; +} + +struct osd_idmap_cache *osd_idc_add(const struct lu_env *env, + struct osd_device *osd, + const struct lu_fid *fid) +{ + struct osd_thread_info *oti = osd_oti_get(env); + struct osd_idmap_cache *idc; + int i; + + if (unlikely(oti->oti_ins_cache_used >= oti->oti_ins_cache_size)) { + i = oti->oti_ins_cache_size * 2; + LASSERT(i < 1000); + if (i == 0) + i = OSD_INS_CACHE_SIZE; + OBD_ALLOC(idc, sizeof(*idc) * i); + if (idc == NULL) + return ERR_PTR(-ENOMEM); + if (oti->oti_ins_cache != NULL) { + memcpy(idc, oti->oti_ins_cache, + oti->oti_ins_cache_used * sizeof(*idc)); + OBD_FREE(oti->oti_ins_cache, + oti->oti_ins_cache_used * sizeof(*idc)); + } + oti->oti_ins_cache = idc; + oti->oti_ins_cache_size = i; + } + idc = &oti->oti_ins_cache[oti->oti_ins_cache_used++]; + idc->oic_fid = *fid; + idc->oic_dev = osd; + idc->oic_dnode = 0; + idc->oic_remote = 0; + + return idc; +} + +/** + * Lookup mapping for the given fid in the cache + * + * Initialize a new one if not found. the initialization checks whether + * the object is local or remote. for the local objects, OI is used to + * learn dnode#. the function is used when the caller has no information + * about the object, e.g. at dt_insert(). + */ +struct osd_idmap_cache *osd_idc_find_or_init(const struct lu_env *env, + struct osd_device *osd, + const struct lu_fid *fid) +{ + struct osd_idmap_cache *idc; + int rc; + + LASSERT(!fid_is_acct(fid)); + + idc = osd_idc_find(env, osd, fid); + if (idc != NULL) + return idc; + + /* new mapping is needed */ + idc = osd_idc_add(env, osd, fid); + if (IS_ERR(idc)) + return idc; + + /* initialize it */ + rc = osd_remote_fid(env, osd, fid); + if (unlikely(rc < 0)) + return ERR_PTR(rc); + + if (rc == 0) { + /* the object is local, lookup in OI */ + uint64_t dnode; + + rc = osd_fid_lookup(env, osd, fid, &dnode); + if (unlikely(rc < 0)) { + CERROR("%s: can't lookup: rc = %d\n", + osd->od_svname, rc); + return ERR_PTR(rc); + } + LASSERT(dnode < (1ULL << DN_MAX_OBJECT_SHIFT)); + idc->oic_dnode = dnode; + } else { + /* the object is remote */ + idc->oic_remote = 1; + } + + return idc; +} + +/* + * lookup mapping for given FID and fill it from the given object. + * the object is local by definition. + */ +int osd_idc_find_and_init(const struct lu_env *env, struct osd_device *osd, + struct osd_object *obj) +{ + const struct lu_fid *fid = lu_object_fid(&obj->oo_dt.do_lu); + struct osd_idmap_cache *idc; + + idc = osd_idc_find(env, osd, fid); + if (idc != NULL) { + if (obj->oo_dn == NULL) + return 0; + idc->oic_dnode = obj->oo_dn->dn_object; + return 0; + } + + /* new mapping is needed */ + idc = osd_idc_add(env, osd, fid); + if (IS_ERR(idc)) + return PTR_ERR(idc); + + if (obj->oo_dn) + idc->oic_dnode = obj->oo_dn->dn_object; + + return 0; +}