From 9dce5e8d6248a121ac27108d32eb38fc24c10c1c Mon Sep 17 00:00:00 2001 From: nikita Date: Thu, 12 Oct 2006 14:15:23 +0000 Subject: [PATCH] iam,cmm,llite: cleanup handling of corner cases in readdir, especially for split directories. Get rid of -ERANGE and -E2BIG errors. --- lustre/cmm/cmm_split.c | 32 +++---- lustre/include/lustre/lustre_idl.h | 13 ++- .../kernel_patches/patches/ext3-iam-separate.patch | 26 ++++-- lustre/llite/dir.c | 13 +-- lustre/lmv/lmv_obd.c | 25 +++--- lustre/mdc/mdc_request.c | 26 +++--- lustre/mdt/mdt_handler.c | 24 +++-- lustre/osd/osd_handler.c | 100 ++++++++++++--------- 8 files changed, 140 insertions(+), 119 deletions(-) diff --git a/lustre/cmm/cmm_split.c b/lustre/cmm/cmm_split.c index 0555565..a32414a 100644 --- a/lustre/cmm/cmm_split.c +++ b/lustre/cmm/cmm_split.c @@ -256,7 +256,7 @@ static int cmm_remove_dir_ent(const struct lu_env *env, struct md_object *mo, int is_dir, rc; ENTRY; - if (!strncmp(ent->lde_name, ".", ent->lde_namelen) || + if (!strncmp(ent->lde_name, ".", ent->lde_namelen) || !strncmp(ent->lde_name, "..", ent->lde_namelen)) RETURN(0); @@ -269,29 +269,29 @@ static int cmm_remove_dir_ent(const struct lu_env *env, struct md_object *mo, else /* XXX: is this correct? */ is_dir = 1; - + OBD_ALLOC(name, ent->lde_namelen + 1); if (!name) GOTO(cleanup, rc = -ENOMEM); - + memcpy(name, ent->lde_name, ent->lde_namelen); rc = mdo_name_remove(env, md_object_next(mo), name, is_dir); OBD_FREE(name, ent->lde_namelen + 1); - if (rc) + if (rc) GOTO(cleanup, rc); - + /* * This ent will be transferred to slave MDS and insert it there, so in * the slave MDS, we should know whether this object is dir or not, so * use the highest bit of the hash to indicate that (because we do not * use highest bit of hash). - */ + */ if (is_dir) ent->lde_hash |= MAX_HASH_HIGHEST_BIT; cleanup: cmm_object_put(env, obj); - + RETURN(rc); } @@ -309,9 +309,9 @@ static int cmm_remove_entries(const struct lu_env *env, for (ent = lu_dirent_start(dp); ent != NULL; ent = lu_dirent_next(ent)) { if (ent->lde_hash < hash_end) { - rc = cmm_remove_dir_ent(env, mo, ent); - if (rc) { - CERROR("Can not del %s rc %d\n", ent->lde_name, + rc = cmm_remove_dir_ent(env, mo, ent); + if (rc) { + CERROR("Can not del %s rc %d\n", ent->lde_name, rc); GOTO(unmap, rc); } @@ -349,14 +349,8 @@ static int cmm_split_entries(const struct lu_env *env, kunmap(rdpg->rp_pages[0]); rc = mo_readpage(env, md_object_next(mo), rdpg); - /* -E2BIG means it already reach the end of the dir */ - if (rc) { - if (rc != -ERANGE) { - if (rc == -E2BIG) - rc = 0; - RETURN(rc); - } - } + if (rc) + RETURN(rc); /* Remove the old entries */ rc = cmm_remove_entries(env, mo, rdpg, end, &len); @@ -437,7 +431,7 @@ static struct lu_buf *cmm_buf_get(const struct lu_env *env, void *area, ssize_t len) { struct lu_buf *buf; - + buf = &cmm_env_info(env)->cmi_buf; buf->lb_buf = area; buf->lb_len = len; diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index e634970..67fd1ab 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -274,14 +274,21 @@ struct lu_dirent { struct lu_dirpage { __u32 ldp_hash_start; __u32 ldp_hash_end; - __u16 ldp_pad0; - __u32 ldp_pad1; + __u16 ldp_flags; + __u32 ldp_pad0; struct lu_dirent ldp_entries[0]; }; +enum lu_dirpage_flags { + LDF_EMPTY = 1 << 0 +}; + static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp) { - return dp->ldp_entries; + if (le16_to_cpu(dp->ldp_flags) & LDF_EMPTY) + return NULL; + else + return dp->ldp_entries; } static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent) diff --git a/lustre/kernel_patches/patches/ext3-iam-separate.patch b/lustre/kernel_patches/patches/ext3-iam-separate.patch index e35ab18..c2eb8f8 100644 --- a/lustre/kernel_patches/patches/ext3-iam-separate.patch +++ b/lustre/kernel_patches/patches/ext3-iam-separate.patch @@ -15,7 +15,7 @@ Index: iam/fs/ext3/iam.c =================================================================== --- iam.orig/fs/ext3/iam.c +++ iam/fs/ext3/iam.c -@@ -0,0 +1,1337 @@ +@@ -0,0 +1,1339 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * @@ -868,6 +868,8 @@ Index: iam/fs/ext3/iam.c + if (result != 0) + iam_it_put(it); + } ++ if (result == 0) ++ it->ii_state = IAM_IT_ATTACHED; + } + assert_corr(ergo(result == 0, it_state(it) == IAM_IT_ATTACHED)); + assert_corr(ergo(result > 0, it_state(it) == IAM_IT_DETACHED)); @@ -2710,7 +2712,7 @@ Index: iam/fs/ext3/iam_lvar.c =================================================================== --- iam.orig/fs/ext3/iam_lvar.c +++ iam/fs/ext3/iam_lvar.c -@@ -0,0 +1,960 @@ +@@ -0,0 +1,966 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * @@ -2981,8 +2983,8 @@ Index: iam/fs/ext3/iam_lvar.c + nexthash = e_hash(scan); + if (nexthash != get_hash(iam_leaf_container(leaf), + e_char(scan), e_keysize(scan))) { -+ return 0; + BREAKPOINT(); ++ return 0; + } + if (nexthash < hash) { + BREAKPOINT(); @@ -3138,18 +3140,24 @@ Index: iam/fs/ext3/iam_lvar.c + end = n_end(leaf); + hash = *(const lvar_hash_t *)ik; + ++ lvar_start(leaf); + for (scan = n_start(leaf); scan < end; scan = e_next(leaf, scan)) { + lvar_hash_t scan_hash; + + scan_hash = e_hash(scan); -+ if (scan_hash >= hash) { -+ leaf->il_at = lvar_lentry(scan); -+ return scan_hash == hash ? -+ IAM_LOOKUP_EXACT : IAM_LOOKUP_OK; -+ } ++ if (scan_hash > hash) ++ return scan == n_start(leaf) ? ++ IAM_LOOKUP_BEFORE : IAM_LOOKUP_OK; ++ leaf->il_at = lvar_lentry(scan); ++ if (scan_hash == hash) ++ return IAM_LOOKUP_EXACT; + } + assert_inv(n_invariant(leaf)); -+ return -E2BIG; ++ /* ++ * @ik is greater than any key in the node. Return last record in the ++ * node. ++ */ ++ return IAM_LOOKUP_OK; +} + +static void lvar_key_set(struct iam_leaf *l, const struct iam_key *k) diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c index 2abfcf6..37dc3fd 100644 --- a/lustre/llite/dir.c +++ b/lustre/llite/dir.c @@ -239,9 +239,9 @@ static struct page *ll_dir_page_locate(struct inode *dir, unsigned long hash, LASSERT(*start <= hash); if (hash > *end || (*end != *start && hash == *end)) { kunmap(page); - lock_page(page); + lock_page(page); ll_truncate_complete_page(page); - unlock_page(page); + unlock_page(page); page_cache_release(page); page = NULL; } @@ -344,9 +344,9 @@ static struct page *ll_get_dir_page(struct inode *dir, __u32 hash, int exact, */ CWARN("Stale readpage page %p: %#lx != %#lx\n", page, (unsigned long)hash, (unsigned long)start); - lock_page(page); + lock_page(page); ll_truncate_complete_page(page); - unlock_page(page); + unlock_page(page); page_cache_release(page); } else GOTO(hash_collision, page); @@ -444,7 +444,8 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) struct lu_dirent *ent; if (!IS_ERR(page)) { - __u32 hash; /* no, Richard, it _is_ initialized */ + __u32 hash = ~0; /* if page is empty (end of directory + * is reached) use this value. */ __u32 next; dp = page_address(page); @@ -492,7 +493,7 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) name = ent->lde_name; fid_le_to_cpu(&fid, &fid); ino = ll_fid_build_ino(sbi, &fid); - + done = filldir(cookie, name, namelen, (loff_t)hash, ino, DT_UNKNOWN); } diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c index 4d3d8e5..7053af1 100644 --- a/lustre/lmv/lmv_obd.c +++ b/lustre/lmv/lmv_obd.c @@ -1907,9 +1907,9 @@ int lmv_blocking_ast(struct ldlm_lock *lock, RETURN(0); } -static int lmv_reset_hash_seg_end (struct lmv_obd *lmv, struct lmv_obj *obj, - const struct lu_fid *fid, int index, - struct lu_dirpage *dp) +static int lmv_reset_hash_seg_end(struct lmv_obd *lmv, struct lmv_obj *obj, + const struct lu_fid *fid, int index, + struct lu_dirpage *dp) { struct ptlrpc_request *tmp_req = NULL; struct page *page = NULL; @@ -1920,7 +1920,7 @@ static int lmv_reset_hash_seg_end (struct lmv_obd *lmv, struct lmv_obj *obj, __u32 seg_end; int rc = 0; ENTRY; - + /* * We have reached the end of this hash segment, and the start offset of * next segment need to be gotten out from the next segment, set it to @@ -1942,16 +1942,15 @@ static int lmv_reset_hash_seg_end (struct lmv_obd *lmv, struct lmv_obj *obj, GOTO(cleanup, rc = -ENOMEM); rc = md_readpage(tgt_exp, &rid, NULL, seg_end, page, &tmp_req); - if (rc) { - /* E2BIG means it already reached the end of the dir, - * no need reset the hash segment end */ - if (rc == -E2BIG) - GOTO(cleanup, rc = 0); - if (rc != -ERANGE) - GOTO(cleanup, rc); - } + if (rc) + GOTO(cleanup, rc); kmap(page); next_dp = cfs_page_address(page); + if (lu_dirent_start(next_dp) == NULL) + /* + * End of hash-segment reached. + */ + GOTO(cleanup, rc); LASSERT(le32_to_cpu(next_dp->ldp_hash_start) >= seg_end); dp->ldp_hash_end = next_dp->ldp_hash_start; kunmap(page); @@ -2028,7 +2027,7 @@ static int lmv_readpage(struct obd_export *exp, const struct lu_fid *fid, rc = 0; break; } - /* if there are no entries in this segment + /* if there are no entries in this segment * and it is not the last hash segment */ } while (1); } diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index b49a71b..d344ab3 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -162,7 +162,7 @@ static int mdc_getattr_common(struct obd_export *exp, unsigned int ea_size, acl_size }; int offset, rc; ENTRY; - + /* Request message already built. */ if (ea_size) CDEBUG(D_INODE, "reserved %u bytes for MD/symlink in packet\n", @@ -251,7 +251,7 @@ int mdc_getattr(struct obd_export *exp, const struct lu_fid *fid, /* currently only root inode will call us with FLACL */ else if (valid & OBD_MD_FLACL) acl_size = LUSTRE_POSIX_ACL_MAX_SIZE; - + rc = mdc_getattr_common(exp, ea_size, acl_size, !!(valid & OBD_MD_FLMDSCAPA), req); if (rc != 0) { @@ -283,7 +283,7 @@ int mdc_getattr_name(struct obd_export *exp, const struct lu_fid *fid, mdc_pack_req_body(req, REQ_REC_OFF, valid, fid, oc, ea_size, MDS_BFLAG_EXT_FLAGS/*request "new" flags(bug 9486)*/); - + LASSERT(strnlen(filename, namelen) == namelen - 1); memcpy(lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 2, namelen), filename, namelen); @@ -488,8 +488,8 @@ int mdc_unpack_acl(struct obd_export *exp, struct ptlrpc_request *req, #endif int mdc_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req, - int offset, struct obd_export *dt_exp, - struct obd_export *md_exp, + int offset, struct obd_export *dt_exp, + struct obd_export *md_exp, struct lustre_md *md) { int rc; @@ -528,7 +528,7 @@ int mdc_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req, int lmvsize; struct lov_mds_md *lmv; LASSERT(S_ISDIR(md->body->mode)); - + if (md->body->eadatasize == 0) { CERROR("OBD_MD_FLEASIZE set, but eadatasize 0\n"); RETURN(-EPROTO); @@ -539,7 +539,7 @@ int mdc_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req, LASSERT (lmv != NULL); LASSERT_REPSWABBED(req, offset); - rc = obd_unpackmd(md_exp, (void *)&md->mea, lmv, + rc = obd_unpackmd(md_exp, (void *)&md->mea, lmv, lmvsize); if (rc < 0) RETURN(rc); @@ -715,7 +715,7 @@ int mdc_set_open_replay_data(struct obd_export *exp, open_req->rq_cb_data = mod; mod->mod_open_req = open_req; open_req->rq_commit_cb = mdc_commit_open; - + } rec->cr_fid2 = body->fid1; @@ -746,7 +746,7 @@ int mdc_clear_open_replay_data(struct obd_export *exp, LASSERT(mod != LP_POISON); if (mod != NULL) mod->mod_och = NULL; - + och->och_mod = NULL; RETURN(0); } @@ -895,10 +895,10 @@ int mdc_done_writing(struct obd_export *exp, struct md_op_data *op_data, if (req == NULL) RETURN(-ENOMEM); - /* XXX: add DONE_WRITING request to och -- when Size-on-MDS + /* XXX: add DONE_WRITING request to och -- when Size-on-MDS * recovery will be ready. */ mdc_close_pack(req, REQ_REC_OFF, op_data); - + ptlrpc_req_set_repsize(req, 2, repsize); mdc_get_rpc_lock(obd->u.cli.cl_close_lock, NULL); rc = ptlrpc_queue_wait(req); @@ -929,7 +929,7 @@ int mdc_sendpage(struct obd_export *exp, const struct lu_fid *fid, desc = ptlrpc_prep_bulk_imp(req, 1, BULK_GET_SOURCE, MDS_BULK_PORTAL); if (desc == NULL) GOTO(out, rc = -ENOMEM); - + /* NB req now owns desc and will free it when it gets freed. */ ptlrpc_prep_bulk_page(desc, (struct page *)page, 0, offset); mdc_readdir_pack(req, REQ_REC_OFF, 0, offset, fid, NULL); @@ -978,7 +978,7 @@ int mdc_readpage(struct obd_export *exp, const struct lu_fid *fid, ptlrpc_req_set_repsize(req, 2, size); rc = ptlrpc_queue_wait(req); - if (rc == 0 || rc == -ERANGE) { + if (rc == 0) { body = lustre_swab_repbuf(req, REPLY_REC_OFF, sizeof(*body), lustre_swab_mdt_body); if (body == NULL) { diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index dc9993c..4564401 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -755,7 +755,7 @@ static int mdt_connect(struct mdt_thread_info *info) static int mdt_disconnect(struct mdt_thread_info *info) { int rc; - + rc = target_handle_disconnect(mdt_info_req(info)); if (rc) rc = err_serious(rc); @@ -855,7 +855,7 @@ static int mdt_write_dir_page(struct mdt_thread_info *info, struct page *page, offset += ent->lde_reclen; if (ent->lde_namelen == 0) continue; - + if (offset > size) break; is_dir = le32_to_cpu(ent->lde_hash) & MAX_HASH_HIGHEST_BIT; @@ -969,7 +969,7 @@ static int mdt_readpage(struct mdt_thread_info *info) struct lu_rdpg *rdpg = &info->mti_u.rdpg.mti_rdpg; struct mdt_body *reqbody; struct mdt_body *repbody; - int rc, rc1 = 0; + int rc; int i; ENTRY; @@ -1006,12 +1006,8 @@ static int mdt_readpage(struct mdt_thread_info *info) /* call lower layers to fill allocated pages with directory data */ rc = mo_readpage(info->mti_env, mdt_object_child(object), rdpg); - if (rc) { - if (rc == -ERANGE) - rc1 = rc; - else - GOTO(free_rdpg, rc); - } + if (rc) + GOTO(free_rdpg, rc); /* send pages to client */ rc = mdt_sendpage(info, rdpg); @@ -1026,7 +1022,7 @@ free_rdpg: MDT_FAIL_RETURN(OBD_FAIL_MDS_SENDPAGE, 0); - return rc ? rc : rc1; + return rc; } static int mdt_reint_internal(struct mdt_thread_info *info, @@ -1182,7 +1178,7 @@ static int mdt_sync(struct mdt_thread_info *info) rc = req_capsule_pack(pill); if (rc == 0) rc = mdt_device_sync(info); - else + else rc = err_serious(rc); } else { /* sync an object */ @@ -1499,11 +1495,11 @@ static int mdt_body_unpack(struct mdt_thread_info *info, __u32 flags) * contains capa actually. There are some requests which do not, for * instance MDS_IS_SUBDIR. */ - if (req_capsule_has_field(pill, &RMF_CAPA1, RCL_CLIENT) && + if (req_capsule_has_field(pill, &RMF_CAPA1, RCL_CLIENT) && req_capsule_get_size(pill, &RMF_CAPA1, RCL_CLIENT)) mdt_set_capainfo(info, 0, &body->fid1, req_capsule_client_get(pill, &RMF_CAPA1)); - + obj = mdt_object_find(env, info->mti_mdt, &body->fid1); if (!IS_ERR(obj)) { if ((flags & HABEO_CORPUS) && @@ -1842,7 +1838,7 @@ static int mdt_reply(struct ptlrpc_request *req, int rc, struct mdt_thread_info *info) { ENTRY; - + #if 0 if (req->rq_reply_state == NULL && rc == 0) { req->rq_status = rc; diff --git a/lustre/osd/osd_handler.c b/lustre/osd/osd_handler.c index 721747e..cf8ee3c 100644 --- a/lustre/osd/osd_handler.c +++ b/lustre/osd/osd_handler.c @@ -1381,6 +1381,7 @@ static int osd_dir_page_build(const struct lu_env *env, int first, struct lu_dirent *ent; if (first) { + memset(area, 0, sizeof (struct lu_dirpage)); area += sizeof (struct lu_dirpage); nob -= sizeof (struct lu_dirpage); } @@ -1440,10 +1441,13 @@ static int osd_readpage(const struct lu_env *env, struct dt_it *it; struct osd_object *obj = osd_dt_obj(dt); struct dt_it_ops *iops; + struct page *pg; + struct lu_dirent *last; int i; int rc; - int rc1; int nob; + __u32 hash_start; + __u32 hash_end; LASSERT(dt_object_exists(dt)); LASSERT(osd_invariant(obj)); @@ -1465,62 +1469,74 @@ static int osd_readpage(const struct lu_env *env, } /* - * iterating through directory and fill pages from @rdpg + * iterate through directory and fill pages from @rdpg */ iops = &dt->do_index_ops->dio_it; it = iops->init(env, dt, 0); if (it == NULL) return -ENOMEM; - /* - * XXX position iterator at rdpg->rp_hash - */ + rc = iops->load(env, it, rdpg->rp_hash); + if (rc == 0) + /* + * Iterator didn't find record with exactly the key requested. + * + * It is currently either + * + * - positioned above record with key less than + * requested---skip it. + * + * - or not positioned at all (is in IAM_IT_SKEWED + * state)---position it on the next item. + */ + rc = iops->next(env, it); + else if (rc > 0) + rc = 0; + /* - * When spliting, it need read entries from some offset by computing - * not by some entries offset like readdir, so it might return 0 here. + * At this point and across for-loop: + * + * rc == 0 -> ok, proceed. + * rc > 0 -> end of directory. + * rc < 0 -> error. */ - rc1 = rc == 0 ? -ERANGE : 0; - - if (rc >= 0) { - struct page *pg; /* no, Richard, it _is_ initialized */ - struct lu_dirent *last; - __u32 hash_start; - __u32 hash_end; - - for (i = 0, rc = 0, nob = rdpg->rp_count; - rc == 0 && nob > 0; i++, nob -= CFS_PAGE_SIZE) { - LASSERT(i < rdpg->rp_npages); - pg = rdpg->rp_pages[i]; - rc = osd_dir_page_build(env, !i, kmap(pg), - min_t(int, nob, CFS_PAGE_SIZE), - iops, it, - &hash_start, &hash_end, &last); - if (rc != 0 || i == rdpg->rp_npages - 1) - last->lde_reclen = 0; - kunmap(pg); - } - iops->put(env, it); - if (rc > 0) { + for (i = 0, nob = rdpg->rp_count; rc == 0 && nob > 0; + i++, nob -= CFS_PAGE_SIZE) { + LASSERT(i < rdpg->rp_npages); + pg = rdpg->rp_pages[i]; + rc = osd_dir_page_build(env, !i, kmap(pg), + min_t(int, nob, CFS_PAGE_SIZE), iops, + it, &hash_start, &hash_end, &last); + if (rc != 0 || i == rdpg->rp_npages - 1) + last->lde_reclen = 0; + kunmap(pg); + } + if (rc > 0) { + /* + * end of directory. + */ + hash_end = ~0ul; + rc = 0; + } + if (rc == 0) { + struct lu_dirpage *dp; + + dp = kmap(rdpg->rp_pages[0]); + dp->ldp_hash_start = hash_start; + dp->ldp_hash_end = hash_end; + if (i == 0) /* - * end of directory. + * No pages were processed, mark this. */ - hash_end = ~0ul; - rc = 0; - } - if (rc == 0) { - struct lu_dirpage *dp; - - dp = kmap(rdpg->rp_pages[0]); - dp->ldp_hash_start = hash_start; - dp->ldp_hash_end = hash_end; - kunmap(rdpg->rp_pages[0]); - } + dp->ldp_flags |= LDF_EMPTY; + dp->ldp_flags = cpu_to_le16(dp->ldp_flags); + kunmap(rdpg->rp_pages[0]); } iops->put(env, it); iops->fini(env, it); - return rc ? rc : rc1; + return rc; } static struct obd_capa *osd_capa_get(const struct lu_env *env, -- 1.8.3.1