X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Flclient%2Flcommon_cl.c;h=792dc2c1991a6ef8b6230f01cd0f6cff7d5591b0;hp=195ce8787b586489ec86cd7421ebf760308b7c38;hb=7af2428511c286788d9c73c436cb43d6f791821f;hpb=f9132c2d4d214772318d5b02865a8196a23d7b5e;ds=sidebyside

diff --git a/lustre/lclient/lcommon_cl.c b/lustre/lclient/lcommon_cl.c
index 195ce87..792dc2c 100644
--- a/lustre/lclient/lcommon_cl.c
+++ b/lustre/lclient/lcommon_cl.c
@@ -26,10 +26,13 @@
  * GPL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
  */
 /*
+ * Copyright (c) 2011 Whamcloud, Inc.
+ */
+/*
  * This file is part of Lustre, http://www.lustre.org/
  * Lustre is a trademark of Sun Microsystems, Inc.
  *
@@ -60,16 +63,6 @@
 #include
 #include
 #include
-# include
-# ifdef HAVE_XTIO_H
-# include
-# endif
-# include
-# include
-# include
-# ifdef HAVE_FILE_H
-# include
-# endif
 # include
 #endif
@@ -300,7 +293,7 @@ static struct lu_env *ccc_inode_fini_env = NULL;
  * A mutex serializing calls to slp_inode_fini() under extreme memory
  * pressure, when environments cannot be allocated.
  */
-static DEFINE_MUTEX(ccc_inode_fini_guard);
+static CFS_DEFINE_MUTEX(ccc_inode_fini_guard);
 static int dummy_refcheck;
 
 int ccc_global_init(struct lu_device_type *device_type)
@@ -308,15 +301,26 @@ int ccc_global_init(struct lu_device_type *device_type)
         int result;
 
         result = lu_kmem_init(ccc_caches);
-        if (result == 0) {
-                result = lu_device_type_init(device_type);
-                ccc_inode_fini_env = cl_env_alloc(&dummy_refcheck,
-                                                  LCT_REMEMBER|LCT_NOREF);
-                if (IS_ERR(ccc_inode_fini_env))
-                        result = PTR_ERR(ccc_inode_fini_env);
-                else
-                        ccc_inode_fini_env->le_ctx.lc_cookie = 0x4;
+        if (result)
+                return result;
+
+        result = lu_device_type_init(device_type);
+        if (result)
+                goto out_kmem;
+
+        ccc_inode_fini_env = cl_env_alloc(&dummy_refcheck,
+                                          LCT_REMEMBER|LCT_NOREF);
+        if (IS_ERR(ccc_inode_fini_env)) {
+                result = PTR_ERR(ccc_inode_fini_env);
+                goto out_device;
         }
+
+        ccc_inode_fini_env->le_ctx.lc_cookie = 0x4;
+        return 0;
+out_device:
+        lu_device_type_fini(device_type);
+out_kmem:
+        lu_kmem_fini(ccc_caches);
         return result;
 }
@@ -437,6 +441,13 @@ int ccc_object_glimpse(const struct lu_env *env,
         lvb->lvb_mtime = cl_inode_mtime(inode);
         lvb->lvb_atime = cl_inode_atime(inode);
         lvb->lvb_ctime = cl_inode_ctime(inode);
+        /*
+         * LU-417: Add the dirty page block count lest i_blocks report 0;
+         * otherwise "cp" or "tar" on a remote node may think the file is
+         * completely sparse and skip it.
+         */
+        if (lvb->lvb_size > 0 && lvb->lvb_blocks == 0)
+                lvb->lvb_blocks = dirty_cnt(inode);
 
         RETURN(0);
 }
@@ -682,8 +693,9 @@ void ccc_lock_state(const struct lu_env *env,
         if (rc == 0) {
                 if (lock->cll_descr.cld_start == 0 &&
                     lock->cll_descr.cld_end == CL_PAGE_EOF) {
-                        cl_isize_write(inode, attr->cat_kms);
-                        CDEBUG(D_INODE, DFID" updating i_size %llu\n",
+                        cl_isize_write_nolock(inode, attr->cat_kms);
+                        CDEBUG(D_INODE|D_VFSTRACE,
+                               DFID" updating i_size "LPU64"\n",
                                PFID(lu_object_fid(&obj->co_lu)),
                                (__u64)cl_isize_read(inode));
                 }
@@ -691,7 +703,7 @@ void ccc_lock_state(const struct lu_env *env,
                 cl_inode_atime(inode) = attr->cat_atime;
                 cl_inode_ctime(inode) = attr->cat_ctime;
         } else {
-                CL_LOCK_DEBUG(D_INFO, env, lock, "attr_get: %i\n", rc);
+                CL_LOCK_DEBUG(D_INFO, env, lock, "attr_get: %d\n", rc);
         }
         cl_object_attr_unlock(obj);
         cl_isize_unlock(inode, 0);
@@ -723,7 +735,7 @@ int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io,
         CLOBINVRNT(env, obj, ccc_object_invariant(obj));
         ENTRY;
 
-        CDEBUG(D_VFSTRACE, "lock: %i [%lu, %lu]\n", mode, start, end);
+        CDEBUG(D_VFSTRACE, "lock: %d [%lu, %lu]\n", mode, start, end);
 
         memset(&cio->cui_link, 0, sizeof cio->cui_link);
 
@@ -736,8 +748,8 @@ int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io,
         descr->cld_obj = obj;
         descr->cld_start = start;
         descr->cld_end = end;
+        descr->cld_enq_flags = enqflags;
 
-        cio->cui_link.cill_enq_flags = enqflags;
         cl_io_lock_add(env, io, &cio->cui_link);
         RETURN(0);
 }
@@ -749,12 +761,9 @@ void ccc_io_update_iov(const struct lu_env *env,
         size_t size = io->u.ci_rw.crw_count;
 
         cio->cui_iov_olen = 0;
-        if (!cl_is_normalio(env, io) || size == cio->cui_tot_count)
+        if (!cl_is_normalio(env, io))
                 return;
 
-        if (cio->cui_tot_nrsegs == 0)
-                cio->cui_tot_nrsegs = cio->cui_nrsegs;
-
         for (i = 0; i < cio->cui_tot_nrsegs; i++) {
                 struct iovec *iv = &cio->cui_iov[i];
 
@@ -819,22 +828,20 @@ void ccc_io_advance(const struct lu_env *env,
         }
 }
 
-static void ccc_object_size_lock(struct cl_object *obj, int vfslock)
+static void ccc_object_size_lock(struct cl_object *obj)
 {
         struct inode *inode = ccc_object_inode(obj);
 
-        if (vfslock)
-                cl_isize_lock(inode, 0);
+        cl_isize_lock(inode, 0);
         cl_object_attr_lock(obj);
 }
 
-static void ccc_object_size_unlock(struct cl_object *obj, int vfslock)
+static void ccc_object_size_unlock(struct cl_object *obj)
 {
         struct inode *inode = ccc_object_inode(obj);
 
         cl_object_attr_unlock(obj);
-        if (vfslock)
-                cl_isize_unlock(inode, 0);
+        cl_isize_unlock(inode, 0);
 }
 
 /**
@@ -847,13 +854,9 @@ static void ccc_object_size_unlock(struct cl_object *obj, int vfslock)
  * protect consistency between inode size and cl_object
  * attributes. cl_object_size_lock() protects consistency between cl_attr's of
  * top-object and sub-objects.
- *
- * In page fault path cl_isize_lock cannot be taken, client has to live with
- * the resulting races.
  */
 int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
-                  struct cl_io *io, loff_t start, size_t count, int vfslock,
-                  int *exceed)
+                  struct cl_io *io, loff_t start, size_t count, int *exceed)
 {
         struct cl_attr *attr = ccc_env_thread_attr(env);
         struct inode *inode = ccc_object_inode(obj);
@@ -880,7 +883,7 @@ int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
          * ll_inode_size_lock(). This guarantees that short reads are handled
          * correctly in the face of concurrent writes and truncates.
          */
-        ccc_object_size_lock(obj, vfslock);
+        ccc_object_size_lock(obj);
         result = cl_object_attr_get(env, obj, attr);
         if (result == 0) {
                 kms = attr->cat_kms;
@@ -890,7 +893,7 @@ int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
                          * return a short read (B) or some zeroes at the end
                          * of the buffer (C)
                          */
-                        ccc_object_size_unlock(obj, vfslock);
+                        ccc_object_size_unlock(obj);
                         result = cl_glimpse_lock(env, io, inode, obj);
                         if (result == 0 && exceed != NULL) {
                                 /* If objective page index exceed end-of-file
@@ -917,28 +920,17 @@ int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
                          * which will always be >= the kms value here.
                          * b=11081
                          */
-                        /*
-                         * XXX in a page fault path, change inode size without
-                         * ll_inode_size_lock() held! there is a race
-                         * condition with truncate path. (see ll_extent_lock)
-                         */
-                        /*
-                         * XXX i_size_write() is not used because it is not
-                         * safe to take the ll_inode_size_lock() due to a
-                         * potential lock inversion (bug 6077). And since
-                         * it's not safe to use i_size_write() without a
-                         * covering mutex we do the assignment directly. It
-                         * is not critical that the size be correct.
-                         */
                         if (cl_isize_read(inode) < kms) {
-                                if (vfslock)
-                                        cl_isize_write(inode, kms);
-                                else
-                                        cl_isize_write_nolock(inode, kms);
+                                cl_isize_write_nolock(inode, kms);
+                                CDEBUG(D_VFSTRACE,
+                                       DFID" updating i_size "LPU64"\n",
+                                       PFID(lu_object_fid(&obj->co_lu)),
+                                       (__u64)cl_isize_read(inode));
+                        }
                 }
         }
-        ccc_object_size_unlock(obj, vfslock);
+        ccc_object_size_unlock(obj);
         return result;
 }
@@ -953,6 +945,9 @@ void ccc_req_completion(const struct lu_env *env,
 {
         struct ccc_req *vrq;
 
+        if (ioret > 0)
+                cl_stats_tally(slice->crs_dev, slice->crs_req->crq_type, ioret);
+
         vrq = cl2ccc_req(slice);
         OBD_SLAB_FREE_PTR(vrq, ccc_req_kmem);
 }
@@ -965,13 +960,15 @@ void ccc_req_completion(const struct lu_env *env,
  *
  *    - o_mode
  *
- *    - o_fid (filled with inode number?!)
+ *    - o_parent_seq
  *
  *    - o_[ug]id
  *
- *    - o_generation
+ *    - o_parent_oid
  *
- *    - and IO epoch (stored in o_easize),
+ *    - o_parent_ver
+ *
+ *    - o_ioepoch,
  *
  * and capability.
  */
@@ -999,13 +996,13 @@ void ccc_req_attr_set(const struct lu_env *env,
         if (slice->crs_req->crq_type == CRT_WRITE) {
                 if (flags & OBD_MD_FLEPOCH) {
                         oa->o_valid |= OBD_MD_FLEPOCH;
-                        oa->o_easize = cl_i2info(inode)->lli_ioepoch;
+                        oa->o_ioepoch = cl_i2info(inode)->lli_ioepoch;
                         valid_flags |= OBD_MD_FLMTIME|OBD_MD_FLCTIME|
-                                       OBD_MD_FLUID|OBD_MD_FLGID|
-                                       OBD_MD_FLFID|OBD_MD_FLGENER;
+                                       OBD_MD_FLUID|OBD_MD_FLGID;
                 }
         }
-        obdo_from_inode(oa, inode, valid_flags & flags);
+        obdo_from_inode(oa, inode, &cl_i2info(inode)->lli_fid,
+                        valid_flags & flags);
 }
 
 const struct cl_req_operations ccc_req_ops = {
@@ -1013,9 +1010,8 @@ const struct cl_req_operations ccc_req_ops = {
         .cro_completion = ccc_req_completion
 };
 
-/* Setattr helpers */
-int cl_setattr_do_truncate(struct inode *inode, loff_t size,
-                           struct obd_capa *capa)
+int cl_setattr_ost(struct inode *inode, const struct iattr *attr,
+                   struct obd_capa *capa)
 {
         struct lu_env *env;
         struct cl_io *io;
@@ -1028,11 +1024,17 @@ int cl_setattr_do_truncate(struct inode *inode, loff_t size,
         if (IS_ERR(env))
                 RETURN(PTR_ERR(env));
 
-        io = &ccc_env_info(env)->cti_io;
+        io = ccc_env_thread_io(env);
         io->ci_obj = cl_i2info(inode)->lli_clob;
+
+        io->u.ci_setattr.sa_attr.lvb_atime = LTIME_S(attr->ia_atime);
+        io->u.ci_setattr.sa_attr.lvb_mtime = LTIME_S(attr->ia_mtime);
+        io->u.ci_setattr.sa_attr.lvb_ctime = LTIME_S(attr->ia_ctime);
+        io->u.ci_setattr.sa_attr.lvb_size = attr->ia_size;
+        io->u.ci_setattr.sa_valid = attr->ia_valid;
+        io->u.ci_setattr.sa_capa = capa;
-        io->u.ci_truncate.tr_size = size;
-        io->u.ci_truncate.tr_capa = capa;
-        if (cl_io_init(env, io, CIT_TRUNC, io->ci_obj) == 0)
+
+        if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0)
                 result = cl_io_loop(env, io);
         else
                 result = io->ci_result;
@@ -1041,45 +1043,6 @@ int cl_setattr_do_truncate(struct inode *inode, loff_t size,
         RETURN(result);
 }
 
-int cl_setattr_ost(struct inode *inode, struct obd_capa *capa)
-{
-        struct cl_inode_info *lli = cl_i2info(inode);
-        struct lov_stripe_md *lsm = lli->lli_smd;
-        int rc;
-        obd_flag flags;
-        struct obd_info oinfo = { { { 0 } } };
-        struct obdo *oa;
-
-        OBDO_ALLOC(oa);
-        if (oa) {
-                oa->o_id = lsm->lsm_object_id;
-                oa->o_gr = lsm->lsm_object_gr;
-                oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
-
-                flags = OBD_MD_FLTYPE | OBD_MD_FLATIME |
-                        OBD_MD_FLMTIME | OBD_MD_FLCTIME |
-                        OBD_MD_FLFID | OBD_MD_FLGENER |
-                        OBD_MD_FLGROUP;
-
-                obdo_from_inode(oa, inode, flags);
-
-                oinfo.oi_oa = oa;
-                oinfo.oi_md = lsm;
-                oinfo.oi_capa = capa;
-
-                /* XXX: this looks unnecessary now. */
-                rc = obd_setattr_rqset(cl_i2sbi(inode)->ll_dt_exp, &oinfo,
-                                       NULL);
-                if (rc)
-                        CERROR("obd_setattr_async fails: rc=%d\n", rc);
-                OBDO_FREE(oa);
-        } else {
-                rc = -ENOMEM;
-        }
-        return rc;
-}
-
-
 /*****************************************************************************
  *
  * Type conversions.
  *
@@ -1173,20 +1136,22 @@ struct cl_page *ccc_vmpage_page_transient(cfs_page_t *vmpage)
 }
 
 /**
- * Initializes or updates CLIO part when new meta-data arrives from the
- * server.
+ * Initialize or update CLIO structures for regular files when new
+ * meta-data arrives from the server.
  *
- * - allocates cl_object if necessary,
- * - updated layout, if object was already here.
+ * \param inode regular file inode
+ * \param md new file metadata from MDS
+ * - allocates a cl_object if necessary,
+ * - updates the layout, if the object was already here.
  */
-int cl_inode_init(struct inode *inode, struct lustre_md *md)
+int cl_file_inode_init(struct inode *inode, struct lustre_md *md)
 {
         struct lu_env *env;
         struct cl_inode_info *lli;
         struct cl_object *clob;
         struct lu_site *site;
         struct lu_fid *fid;
-        const struct cl_object_conf conf = {
+        struct cl_object_conf conf = {
                 .coc_inode = inode,
                 .u = {
                         .coc_md = md
@@ -1195,11 +1160,8 @@ int cl_inode_init(struct inode *inode, struct lustre_md *md)
         int result = 0;
         int refcheck;
 
-        /* LASSERT(inode->i_state & I_NEW); */
         LASSERT(md->body->valid & OBD_MD_FLID);
-
-        if (!S_ISREG(cl_inode_mode(inode)))
-                return 0;
+        LASSERT(S_ISREG(cl_inode_mode(inode)));
 
         env = cl_env_get(&refcheck);
         if (IS_ERR(env))
@@ -1211,6 +1173,14 @@
         LASSERT(fid_is_sane(fid));
 
         if (lli->lli_clob == NULL) {
+                /* clob is a slave of the inode: an empty lli_clob means a
+                 * new inode, for which no clob exists in the cache with the
+                 * given fid, so it is unnecessary to perform
+                 * lookup-alloc-lookup-insert; just alloc and insert
+                 * directly. */
+#ifdef __KERNEL__
+                LASSERT(inode->i_state & I_NEW);
+#endif
+                conf.coc_lu.loc_flags = LOC_F_NEW;
                 clob = cl_object_find(env, lu2cl_dev(site->ls_top_dev),
                                       fid, &conf);
                 if (!IS_ERR(clob)) {
@@ -1234,6 +1204,43 @@
         return result;
 }
 
+/**
+ * Wait for others to drop their references to the object first, then drop
+ * the last one ourselves, which causes the object to be destroyed
+ * immediately. Must be called after cl_object_kill() against this object.
+ *
+ * The reason we want to do this is: destroying the top object waits for its
+ * sub-objects to be destroyed first, so we can't let the bottom layer (e.g.
+ * from ASTs) initiate destruction of the top object, which may deadlock.
+ * See bz22520.
+ */
+static void cl_object_put_last(struct lu_env *env, struct cl_object *obj)
+{
+        struct lu_object_header *header = obj->co_lu.lo_header;
+        cfs_waitlink_t waiter;
+
+        if (unlikely(cfs_atomic_read(&header->loh_ref) != 1)) {
+                struct lu_site *site = obj->co_lu.lo_dev->ld_site;
+                struct lu_site_bkt_data *bkt;
+
+                bkt = lu_site_bkt_from_fid(site, &header->loh_fid);
+
+                cfs_waitlink_init(&waiter);
+                cfs_waitq_add(&bkt->lsb_marche_funebre, &waiter);
+
+                while (1) {
+                        cfs_set_current_state(CFS_TASK_UNINT);
+                        if (cfs_atomic_read(&header->loh_ref) == 1)
+                                break;
+                        cfs_waitq_wait(&waiter, CFS_TASK_UNINT);
+                }
+
+                cfs_set_current_state(CFS_TASK_RUNNING);
+                cfs_waitq_del(&bkt->lsb_marche_funebre, &waiter);
+        }
+
+        cl_object_put(env, obj);
+}
+
 void cl_inode_fini(struct inode *inode)
 {
         struct lu_env *env;
@@ -1249,7 +1256,7 @@ void cl_inode_fini(struct inode *inode)
         env = cl_env_get(&refcheck);
         emergency = IS_ERR(env);
         if (emergency) {
-                mutex_lock(&ccc_inode_fini_guard);
+                cfs_mutex_lock(&ccc_inode_fini_guard);
                 LASSERT(ccc_inode_fini_env != NULL);
                 cl_env_implant(ccc_inode_fini_env, &refcheck);
                 env = ccc_inode_fini_env;
@@ -1261,11 +1268,11 @@ void cl_inode_fini(struct inode *inode)
          */
         cl_object_kill(env, clob);
         lu_object_ref_del(&clob->co_lu, "inode", inode);
-        cl_object_put(env, clob);
+        cl_object_put_last(env, clob);
         lli->lli_clob = NULL;
         if (emergency) {
                 cl_env_unplant(ccc_inode_fini_env, &refcheck);
-                mutex_unlock(&ccc_inode_fini_guard);
+                cfs_mutex_unlock(&ccc_inode_fini_guard);
         } else
                 cl_env_put(env, &refcheck);
         cl_env_reexit(cookie);
@@ -1296,25 +1303,12 @@ __u16 ll_dirent_type_get(struct lu_dirent *ent)
 /**
  * build inode number from passed @fid
  */
-ino_t cl_fid_build_ino(const struct lu_fid *fid)
+__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32)
 {
-        ino_t ino;
-        ENTRY;
-
-        if (fid_is_igif(fid)) {
-                ino = lu_igif_ino(fid);
-                RETURN(ino);
-        }
-
-        /* Very stupid and having many downsides inode allocation algorithm
-         * based on fid. */
-        ino = fid_flatten(fid) & 0xFFFFFFFF;
-
-        if (unlikely(ino == 0))
-                /* the first result ino is 0xFFC001, so this is rarely used */
-                ino = 0xffbcde;
-        ino = ino | 0x80000000;
-        RETURN(ino);
+        if (BITS_PER_LONG == 32 || api32)
+                RETURN(fid_flatten32(fid));
+        else
+                RETURN(fid_flatten(fid));
 }
 
 /**