X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Flclient%2Flcommon_cl.c;h=c9ba7488d07fd90380df82b8a24a961951d7232c;hp=9f7d9a3051f1c937ae5886d9fe5f39042180312f;hb=3bffa4d32bc5b0bc71ba6873e262ddbca436bae1;hpb=48d29ff71de18d8d375b072a7287ba9ecdb6cdce

diff --git a/lustre/lclient/lcommon_cl.c b/lustre/lclient/lcommon_cl.c
index 9f7d9a3..c9ba748 100644
--- a/lustre/lclient/lcommon_cl.c
+++ b/lustre/lclient/lcommon_cl.c
@@ -1,6 +1,4 @@
-/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
- * vim:expandtab:shiftwidth=8:tabstop=8:
- *
+/*
  * GPL HEADER START
  *
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
@@ -28,6 +26,8 @@
 /*
  * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
  * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
@@ -46,7 +46,6 @@
 # include
 # include
 # include
-# include
 # include
 # include
 # include
@@ -290,7 +289,7 @@ static struct lu_env *ccc_inode_fini_env = NULL;
  * A mutex serializing calls to slp_inode_fini() under extreme memory
  * pressure, when environments cannot be allocated.
  */
-static CFS_DEFINE_MUTEX(ccc_inode_fini_guard);
+static DEFINE_MUTEX(ccc_inode_fini_guard);
 static int dummy_refcheck;
 
 int ccc_global_init(struct lu_device_type *device_type)
@@ -298,15 +297,26 @@ int ccc_global_init(struct lu_device_type *device_type)
 	int result;
 
 	result = lu_kmem_init(ccc_caches);
-	if (result == 0) {
-		result = lu_device_type_init(device_type);
-		ccc_inode_fini_env = cl_env_alloc(&dummy_refcheck,
-						  LCT_REMEMBER|LCT_NOREF);
-		if (IS_ERR(ccc_inode_fini_env))
-			result = PTR_ERR(ccc_inode_fini_env);
-		else
-			ccc_inode_fini_env->le_ctx.lc_cookie = 0x4;
+	if (result)
+		return result;
+
+	result = lu_device_type_init(device_type);
+	if (result)
+		goto out_kmem;
+
+	ccc_inode_fini_env = cl_env_alloc(&dummy_refcheck,
+					  LCT_REMEMBER|LCT_NOREF);
+	if (IS_ERR(ccc_inode_fini_env)) {
+		result = PTR_ERR(ccc_inode_fini_env);
+		goto out_device;
 	}
+
+	ccc_inode_fini_env->le_ctx.lc_cookie = 0x4;
+	return 0;
+out_device:
+	lu_device_type_fini(device_type);
+out_kmem:
+	lu_kmem_fini(ccc_caches);
 	return result;
 }
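The reworked ccc_global_init() above replaces nested-conditional error handling with the standard kernel unwind idiom: acquire resources in order, and on failure jump to the label that tears down everything acquired so far, in reverse order. A minimal sketch of the idiom with hypothetical resources (none of these names come from Lustre):

    #include <stdlib.h>

    /* Hypothetical two-stage initialization with unwind labels. */
    static int demo_init(void)
    {
            void *cache, *device;

            cache = malloc(128);            /* stage 1 */
            if (cache == NULL)
                    return -1;              /* nothing to unwind yet */

            device = malloc(128);           /* stage 2 */
            if (device == NULL)
                    goto out_cache;         /* unwind stage 1 only */

            free(device);                   /* success path (demo only) */
            free(cache);
            return 0;

    out_cache:
            free(cache);
            return -1;
    }

    int main(void)
    {
            return demo_init() ? EXIT_FAILURE : EXIT_SUCCESS;
    }

Each new acquisition adds one label, so a failure at stage N unwinds stages N-1 down to 1 without duplicating cleanup code.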
@@ -358,6 +368,7 @@ int ccc_object_init0(const struct lu_env *env,
 {
 	vob->cob_inode = conf->coc_inode;
 	vob->cob_transient_pages = 0;
+	cl_object_page_init(&vob->cob_cl, sizeof(struct ccc_page));
 	return 0;
 }
 
@@ -427,6 +438,13 @@ int ccc_object_glimpse(const struct lu_env *env,
 	lvb->lvb_mtime = cl_inode_mtime(inode);
 	lvb->lvb_atime = cl_inode_atime(inode);
 	lvb->lvb_ctime = cl_inode_ctime(inode);
+	/*
+	 * LU-417: include the dirty page count in the block count lest
+	 * i_blocks report 0; otherwise a "cp" or "tar" on a remote node
+	 * may think the file is completely sparse and skip it.
+	 */
+	if (lvb->lvb_size > 0 && lvb->lvb_blocks == 0)
+		lvb->lvb_blocks = dirty_cnt(inode);
 	RETURN(0);
 }
 
@@ -439,6 +457,22 @@ int ccc_conf_set(const struct lu_env *env, struct cl_object *obj,
 	return 0;
 }
 
+static void ccc_object_size_lock(struct cl_object *obj)
+{
+	struct inode *inode = ccc_object_inode(obj);
+
+	cl_isize_lock(inode);
+	cl_object_attr_lock(obj);
+}
+
+static void ccc_object_size_unlock(struct cl_object *obj)
+{
+	struct inode *inode = ccc_object_inode(obj);
+
+	cl_object_attr_unlock(obj);
+	cl_isize_unlock(inode);
+}
+
 /*****************************************************************************
  *
  * Page operations.
  *
@@ -650,24 +684,22 @@ void ccc_lock_state(const struct lu_env *env,
 	 * of finding lock in the cache.
 	 */
 	if (state == CLS_HELD && lock->cll_state < CLS_HELD) {
-        int rc;
-
-        obj = slice->cls_obj;
-        inode = ccc_object_inode(obj);
-        attr = ccc_env_thread_attr(env);
-
-        /* vmtruncate()->ll_truncate() first sets the i_size and then
-         * the kms under both a DLM lock and the
-         * ll_inode_size_lock(). If we don't get the
-         * ll_inode_size_lock() here we can match the DLM lock and
-         * reset i_size from the kms before the truncating path has
-         * updated the kms. generic_file_write can then trust the
-         * stale i_size when doing appending writes and effectively
-         * cancel the result of the truncate. Getting the
-         * ll_inode_size_lock() after the enqueue maintains the DLM
-         * -> ll_inode_size_lock() acquiring order. */
-        cl_isize_lock(inode, 0);
-        cl_object_attr_lock(obj);
+		int rc;
+
+		obj = slice->cls_obj;
+		inode = ccc_object_inode(obj);
+		attr = ccc_env_thread_attr(env);
+
+		/* vmtruncate() sets the i_size
+		 * under both a DLM lock and the
+		 * ll_inode_size_lock(). If we don't get the
+		 * ll_inode_size_lock() here we can match the DLM lock and
+		 * reset i_size. generic_file_write can then trust the
+		 * stale i_size when doing appending writes and effectively
+		 * cancel the result of the truncate. Getting the
+		 * ll_inode_size_lock() after the enqueue maintains the DLM
+		 * -> ll_inode_size_lock() acquiring order. */
+		ccc_object_size_lock(obj);
 		rc = cl_object_attr_get(env, obj, attr);
 		if (rc == 0) {
 			if (lock->cll_descr.cld_start == 0 &&
@@ -684,10 +716,9 @@ void ccc_lock_state(const struct lu_env *env,
 		} else {
 			CL_LOCK_DEBUG(D_INFO, env, lock, "attr_get: %d\n", rc);
 		}
-        cl_object_attr_unlock(obj);
-        cl_isize_unlock(inode, 0);
-    }
-    EXIT;
+		ccc_object_size_unlock(obj);
+	}
+	EXIT;
 }
 
 /*****************************************************************************
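The long comment above boils down to a lock-ordering rule: the DLM lock is always acquired before ll_inode_size_lock(), on every path, so the two locks can never deadlock against each other. A hedged sketch of the principle with plain pthread mutexes standing in for the real locks (names invented for illustration, far simpler than Lustre's DLM):

    #include <pthread.h>

    /* Stand-ins for the DLM lock and the inode size lock. */
    static pthread_mutex_t dlm_lock  = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t size_lock = PTHREAD_MUTEX_INITIALIZER;

    /* Every path honors the same order, DLM -> size lock, so an
     * ABBA inversion between the two locks is impossible. */
    static void set_size_ordered(long *i_size, long new_size)
    {
            pthread_mutex_lock(&dlm_lock);
            pthread_mutex_lock(&size_lock);
            *i_size = new_size;
            pthread_mutex_unlock(&size_lock);
            pthread_mutex_unlock(&dlm_lock);
    }

Taking ll_inode_size_lock() only after the enqueue completes is what preserves that order in ccc_lock_state().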
@@ -807,24 +838,6 @@ void ccc_io_advance(const struct lu_env *env,
 	}
 }
 
-static void ccc_object_size_lock(struct cl_object *obj, int vfslock)
-{
-	struct inode *inode = ccc_object_inode(obj);
-
-	if (vfslock)
-		cl_isize_lock(inode, 0);
-	cl_object_attr_lock(obj);
-}
-
-static void ccc_object_size_unlock(struct cl_object *obj, int vfslock)
-{
-	struct inode *inode = ccc_object_inode(obj);
-
-	cl_object_attr_unlock(obj);
-	if (vfslock)
-		cl_isize_unlock(inode, 0);
-}
-
 /**
  * Helper function that if necessary adjusts file size (inode->i_size), when
  * position at the offset \a pos is accessed. File size can be arbitrary stale
@@ -835,13 +848,9 @@ static void ccc_object_size_unlock(struct cl_object *obj, int vfslock)
  * protect consistency between inode size and cl_object
  * attributes. cl_object_size_lock() protects consistency between cl_attr's of
  * top-object and sub-objects.
- *
- * In page fault path cl_isize_lock cannot be taken, client has to live with
- * the resulting races.
  */
 int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
-		  struct cl_io *io, loff_t start, size_t count, int vfslock,
-		  int *exceed)
+		  struct cl_io *io, loff_t start, size_t count, int *exceed)
 {
 	struct cl_attr *attr = ccc_env_thread_attr(env);
 	struct inode *inode = ccc_object_inode(obj);
@@ -868,7 +877,7 @@ int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
 	 * ll_inode_size_lock(). This guarantees that short reads are handled
 	 * correctly in the face of concurrent writes and truncates.
 	 */
-	ccc_object_size_lock(obj, vfslock);
+	ccc_object_size_lock(obj);
 	result = cl_object_attr_get(env, obj, attr);
 	if (result == 0) {
 		kms = attr->cat_kms;
@@ -878,8 +887,8 @@ int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
 			 * return a short read (B) or some zeroes at the end
 			 * of the buffer (C)
 			 */
-			ccc_object_size_unlock(obj, vfslock);
-			result = cl_glimpse_lock(env, io, inode, obj);
+			ccc_object_size_unlock(obj);
+			result = cl_glimpse_lock(env, io, inode, obj, 0);
 			if (result == 0 && exceed != NULL) {
 				/* If objective page index exceed end-of-file
 				 * page index, return directly. Do not expect
@@ -905,24 +914,8 @@ int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
 			 * which will always be >= the kms value here.
 			 * b=11081
 			 */
-			/*
-			 * XXX in a page fault path, change inode size without
-			 * ll_inode_size_lock() held! there is a race
-			 * condition with truncate path. (see ll_extent_lock)
-			 */
-			/*
-			 * XXX i_size_write() is not used because it is not
-			 * safe to take the ll_inode_size_lock() due to a
-			 * potential lock inversion (bug 6077). And since
-			 * it's not safe to use i_size_write() without a
-			 * covering mutex we do the assignment directly. It
-			 * is not critical that the size be correct.
-			 */
 			if (cl_isize_read(inode) < kms) {
-				if (vfslock)
-					cl_isize_write_nolock(inode, kms);
-				else
-					cl_isize_write(inode, kms);
+				cl_isize_write_nolock(inode, kms);
 
 				CDEBUG(D_VFSTRACE, DFID" updating i_size "LPU64"\n",
 				       PFID(lu_object_fid(&obj->co_lu)),
@@ -931,7 +924,7 @@ int ccc_prep_size(const struct lu_env *env, struct cl_object *obj,
 			}
 		}
 	}
-	ccc_object_size_unlock(obj, vfslock);
+	ccc_object_size_unlock(obj);
 	return result;
 }
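ccc_prep_size() pivots on the known-minimum size (kms): an access strictly below the kms can trust the locally cached attributes, while an access at or past the kms forces a glimpse to learn the authoritative file size from the servers. A toy sketch of that decision, with a stub in place of cl_glimpse_lock() (all names hypothetical):

    /* Stand-in for a glimpse RPC returning the authoritative size. */
    static long long demo_glimpse_size(void)
    {
            return 4096;            /* pretend the server says 4 KiB */
    }

    /* Return the size a read at [start, start + count) may rely on. */
    static long long demo_prep_size(long long kms, long long start,
                                    long long count)
    {
            if (start + count <= kms)
                    return kms;             /* wholly below kms: cache is safe */
            return demo_glimpse_size();     /* may cross EOF: ask the server */
    }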
@@ -946,6 +939,9 @@ void ccc_req_completion(const struct lu_env *env,
 {
 	struct ccc_req *vrq;
 
+	if (ioret > 0)
+		cl_stats_tally(slice->crs_dev, slice->crs_req->crq_type, ioret);
+
 	vrq = cl2ccc_req(slice);
 	OBD_SLAB_FREE_PTR(vrq, ccc_req_kmem);
 }
@@ -979,38 +975,29 @@ void ccc_req_attr_set(const struct lu_env *env,
 	struct obdo *oa;
 	obd_flag valid_flags;
 
-        oa = attr->cra_oa;
-        inode = ccc_object_inode(obj);
-        valid_flags = OBD_MD_FLTYPE|OBD_MD_FLATIME;
-
-        if (flags != (obd_valid)~0ULL)
-                valid_flags |= OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME;
-        else {
-                LASSERT(attr->cra_capa == NULL);
-                attr->cra_capa = cl_capa_lookup(inode,
-                                                slice->crs_req->crq_type);
-        }
-
-        if (slice->crs_req->crq_type == CRT_WRITE) {
-                if (flags & OBD_MD_FLEPOCH) {
-                        oa->o_valid |= OBD_MD_FLEPOCH;
-                        oa->o_ioepoch = cl_i2info(inode)->lli_ioepoch;
-                        valid_flags |= OBD_MD_FLMTIME|OBD_MD_FLCTIME|
-                                       OBD_MD_FLUID|OBD_MD_FLGID;
-                }
-        }
-        obdo_from_inode(oa, inode, &cl_i2info(inode)->lli_fid,
-                        valid_flags & flags);
+	oa = attr->cra_oa;
+	inode = ccc_object_inode(obj);
+	valid_flags = OBD_MD_FLTYPE;
+
+	if ((flags & OBD_MD_FLOSSCAPA) != 0) {
+		LASSERT(attr->cra_capa == NULL);
+		attr->cra_capa = cl_capa_lookup(inode,
+						slice->crs_req->crq_type);
+	}
+
+	if (slice->crs_req->crq_type == CRT_WRITE) {
+		if (flags & OBD_MD_FLEPOCH) {
+			oa->o_valid |= OBD_MD_FLEPOCH;
+			oa->o_ioepoch = cl_i2info(inode)->lli_ioepoch;
+			valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME |
+				       OBD_MD_FLUID | OBD_MD_FLGID;
+		}
+	}
+	obdo_from_inode(oa, inode, valid_flags & flags);
+	obdo_set_parent_fid(oa, &cl_i2info(inode)->lli_fid);
 #ifdef __KERNEL__
-        /* Bug11742 - set the OBD_FL_MMAP flag for memory mapped files */
-        if (cfs_atomic_read(&(cl_inode2ccc(inode)->cob_mmap_cnt)) != 0) {
-                if (!(oa->o_valid & OBD_MD_FLFLAGS)) {
-                        oa->o_valid |= OBD_MD_FLFLAGS;
-                        oa->o_flags = OBD_FL_MMAP;
-                } else {
-                        oa->o_flags |= OBD_FL_MMAP;
-                }
-        }
+	memcpy(attr->cra_jobid, cl_i2info(inode)->lli_jobid,
+	       JOBSTATS_JOBID_SIZE);
 #endif
 }
 
@@ -1033,7 +1020,7 @@ int cl_setattr_ost(struct inode *inode, const struct iattr *attr,
 	if (IS_ERR(env))
 		RETURN(PTR_ERR(env));
 
-	io = &ccc_env_info(env)->cti_io;
+	io = ccc_env_thread_io(env);
 	io->ci_obj = cl_i2info(inode)->lli_clob;
 
 	io->u.ci_setattr.sa_attr.lvb_atime = LTIME_S(attr->ia_atime);
@@ -1043,13 +1030,24 @@ int cl_setattr_ost(struct inode *inode, const struct iattr *attr,
 	io->u.ci_setattr.sa_valid = attr->ia_valid;
 	io->u.ci_setattr.sa_capa = capa;
 
-	if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0)
+again:
+	if (cl_io_init(env, io, CIT_SETATTR, io->ci_obj) == 0) {
+		struct ccc_io *cio = ccc_env_io(env);
+
+		if (attr->ia_valid & ATTR_FILE)
+			/* populate the file descriptor for ftruncate to honor
+			 * group lock - see LU-787 */
+			cio->cui_fd = cl_iattr2fd(inode, attr);
+
 		result = cl_io_loop(env, io);
-	else
+	} else {
 		result = io->ci_result;
+	}
 	cl_io_fini(env, io);
-	cl_env_put(env, &refcheck);
-	RETURN(result);
+	if (unlikely(io->ci_need_restart))
+		goto again;
+	cl_env_put(env, &refcheck);
+	RETURN(result);
 }
 
 /*****************************************************************************
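The new `again:` label lets cl_setattr_ost() rerun the whole init/loop/fini cycle when cl_io_loop() flags ci_need_restart, e.g. because the layout changed while the IO was in flight. A minimal sketch of that retry shape with a hypothetical IO type (names invented for illustration, far simpler than cl_io):

    #include <stdbool.h>

    struct demo_io {
            bool need_restart;      /* set when state changed mid-IO */
    };

    static void demo_io_init(struct demo_io *io) { io->need_restart = false; }
    static void demo_io_fini(struct demo_io *io) { (void)io; }

    static int demo_io_loop(struct demo_io *io)
    {
            static int attempts;

            /* Pretend the first attempt races with a layout change. */
            io->need_restart = (attempts++ == 0);
            return 0;
    }

    static int demo_setattr(struct demo_io *io)
    {
            int result;
    again:
            demo_io_init(io);
            result = demo_io_loop(io);
            demo_io_fini(io);
            if (io->need_restart)   /* redo from scratch */
                    goto again;
            return result;
    }

Note that the restart check sits after cl_io_fini(), mirroring the patch: the old IO must be fully torn down before a fresh attempt begins.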
@@ -1145,20 +1143,22 @@ struct cl_page *ccc_vmpage_page_transient(cfs_page_t *vmpage)
 }
 
 /**
- * Initializes or updates CLIO part when new meta-data arrives from the
- * server.
+ * Initialize or update CLIO structures for regular files when new
+ * meta-data arrives from the server.
  *
- * - allocates cl_object if necessary,
- * - updated layout, if object was already here.
+ * \param inode regular file inode
+ * \param md    new file metadata from MDS
+ * - allocates cl_object if necessary,
+ * - updates the layout if the object was already there.
  */
-int cl_inode_init(struct inode *inode, struct lustre_md *md)
+int cl_file_inode_init(struct inode *inode, struct lustre_md *md)
 {
 	struct lu_env *env;
 	struct cl_inode_info *lli;
 	struct cl_object *clob;
 	struct lu_site *site;
 	struct lu_fid *fid;
-	const struct cl_object_conf conf = {
+	struct cl_object_conf conf = {
 		.coc_inode = inode,
 		.u = {
 			.coc_md = md
@@ -1167,11 +1167,8 @@ int cl_inode_init(struct inode *inode, struct lustre_md *md)
 	int result = 0;
 	int refcheck;
 
-	/* LASSERT(inode->i_state & I_NEW); */
 	LASSERT(md->body->valid & OBD_MD_FLID);
-
-	if (!S_ISREG(cl_inode_mode(inode)))
-		return 0;
+	LASSERT(S_ISREG(cl_inode_mode(inode)));
 
 	env = cl_env_get(&refcheck);
 	if (IS_ERR(env))
@@ -1183,21 +1180,27 @@ int cl_inode_init(struct inode *inode, struct lustre_md *md)
 	LASSERT(fid_is_sane(fid));
 
 	if (lli->lli_clob == NULL) {
+		/* The clob is a slave of the inode: an empty lli_clob means a
+		 * new inode, so there is no clob in the cache with the given
+		 * fid and it is unnecessary to perform
+		 * lookup-alloc-lookup-insert, just alloc and insert
+		 * directly. */
+#ifdef __KERNEL__
+		LASSERT(inode->i_state & I_NEW);
+#endif
+		conf.coc_lu.loc_flags = LOC_F_NEW;
 		clob = cl_object_find(env, lu2cl_dev(site->ls_top_dev),
 				      fid, &conf);
 		if (!IS_ERR(clob)) {
 			/*
 			 * No locking is necessary, as new inode is
 			 * locked by I_NEW bit.
-			 *
-			 * XXX not true for call from ll_update_inode().
 			 */
 			lli->lli_clob = clob;
+			lli->lli_has_smd = md->lsm != NULL;
 			lu_object_ref_add(&clob->co_lu, "inode", inode);
 		} else
 			result = PTR_ERR(clob);
-	} else
-		result = cl_conf_set(env, lli->lli_clob, &conf);
+	}
 
 	cl_env_put(env, &refcheck);
 
 	if (result != 0)
@@ -1218,14 +1221,16 @@ int cl_inode_init(struct inode *inode, struct lustre_md *md)
 static void cl_object_put_last(struct lu_env *env, struct cl_object *obj)
 {
 	struct lu_object_header *header = obj->co_lu.lo_header;
-	struct lu_site *site;
 	cfs_waitlink_t waiter;
 
 	if (unlikely(cfs_atomic_read(&header->loh_ref) != 1)) {
-		site = obj->co_lu.lo_dev->ld_site;
+		struct lu_site *site = obj->co_lu.lo_dev->ld_site;
+		struct lu_site_bkt_data *bkt;
+
+		bkt = lu_site_bkt_from_fid(site, &header->loh_fid);
 
 		cfs_waitlink_init(&waiter);
-		cfs_waitq_add(&site->ls_marche_funebre, &waiter);
+		cfs_waitq_add(&bkt->lsb_marche_funebre, &waiter);
 
 		while (1) {
 			cfs_set_current_state(CFS_TASK_UNINT);
@@ -1235,7 +1240,7 @@ static void cl_object_put_last(struct lu_env *env, struct cl_object *obj)
 		}
 
 		cfs_set_current_state(CFS_TASK_RUNNING);
-		cfs_waitq_del(&site->ls_marche_funebre, &waiter);
+		cfs_waitq_del(&bkt->lsb_marche_funebre, &waiter);
 	}
 
 	cl_object_put(env, obj);
@@ -1256,7 +1261,7 @@ void cl_inode_fini(struct inode *inode)
 	env = cl_env_get(&refcheck);
 	emergency = IS_ERR(env);
 	if (emergency) {
-		cfs_mutex_lock(&ccc_inode_fini_guard);
+		mutex_lock(&ccc_inode_fini_guard);
 		LASSERT(ccc_inode_fini_env != NULL);
 		cl_env_implant(ccc_inode_fini_env, &refcheck);
 		env = ccc_inode_fini_env;
@@ -1272,7 +1277,7 @@ void cl_inode_fini(struct inode *inode)
 	lli->lli_clob = NULL;
 	if (emergency) {
 		cl_env_unplant(ccc_inode_fini_env, &refcheck);
-		cfs_mutex_unlock(&ccc_inode_fini_guard);
+		mutex_unlock(&ccc_inode_fini_guard);
 	} else
 		cl_env_put(env, &refcheck);
 	cl_env_reexit(cookie);
@@ -1302,22 +1307,13 @@ __u16 ll_dirent_type_get(struct lu_dirent *ent)
 }
 
 /**
- * for 32 bit inode numbers directly map seq+oid to 32bit number.
- */
-__u32 cl_fid_build_ino32(const struct lu_fid *fid)
-{
-	RETURN(fid_flatten32(fid));
-}
-
-/**
  * build inode number from passed @fid
  */
-__u64 cl_fid_build_ino(const struct lu_fid *fid)
+__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32)
 {
-#if BITS_PER_LONG == 32
-	RETURN(fid_flatten32(fid));
-#else
-	RETURN(fid_flatten(fid));
-#endif
+	if (BITS_PER_LONG == 32 || api32)
+		RETURN(fid_flatten32(fid));
+	else
+		RETURN(fid_flatten(fid));
 }
 
 /**
@@ -1336,3 +1332,21 @@ __u32 cl_fid_build_gen(const struct lu_fid *fid)
 	gen = (fid_flatten(fid) >> 32);
 	RETURN(gen);
 }
+
+/* The lsm is unreliable now that HSM is implemented, because the layout can
+ * change at any time. This is only to support old, non-clio-ized interfaces.
+ * Calling clio operations while holding this extra layout refcount can
+ * deadlock: if the layout changes during the IO, ll_layout_refresh() will
+ * have to wait for the refcount to drop to zero before it can destroy the
+ * older layout.
+ *
+ * Notice that the lsm returned by this function may not be valid unless
+ * called inside the layout lock - MDS_INODELOCK_LAYOUT. */
+struct lov_stripe_md *ccc_inode_lsm_get(struct inode *inode)
+{
+	return lov_lsm_get(cl_i2info(inode)->lli_clob);
+}
+
+inline void ccc_inode_lsm_put(struct inode *inode, struct lov_stripe_md *lsm)
+{
+	lov_lsm_put(cl_i2info(inode)->lli_clob, lsm);
+}
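The get/put pair added at the end follows the usual reference-counting discipline: every ccc_inode_lsm_get() must be matched by exactly one ccc_inode_lsm_put() on every return path, and no CLIO operation may be issued while the extra reference is held. A hedged sketch of the calling pattern (ll_demo_stripe_count() is a hypothetical caller, not part of this patch):

    /* Hypothetical caller showing the mandatory get/put pairing. */
    static int ll_demo_stripe_count(struct inode *inode)
    {
            struct lov_stripe_md *lsm;
            int count;

            lsm = ccc_inode_lsm_get(inode);
            if (lsm == NULL)
                    return 0;               /* no layout yet */

            count = lsm->lsm_stripe_count;  /* read-only peek */

            ccc_inode_lsm_put(inode, lsm);  /* pairs with the get above */
            return count;
    }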