X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fllite%2Fllite_lib.c;h=01ddfd2512c6fb6d5b84263544590e93a134aeae;hb=cf775525f5fa8c96d073809a6afc4af0643dda9a;hp=acdec48fd955b797fe6df1d5a9c81cd554b2a40d;hpb=ff3c1a0c6d43fb210faa5e0705241672e0754ce4;p=fs%2Flustre-release.git diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index acdec48..01ddfd2 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -27,7 +27,7 @@ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2012, Whamcloud, Inc. + * Copyright (c) 2011, 2013, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -94,11 +94,7 @@ static struct ll_sb_info *ll_init_sbi(void) si_meminfo(&si); pages = si.totalram - si.totalhigh; if (pages >> (20 - CFS_PAGE_SHIFT) < 512) { -#ifdef HAVE_BGL_SUPPORT - lru_page_max = pages / 4; -#else lru_page_max = pages / 2; -#endif } else { lru_page_max = (pages / 4) * 3; } @@ -221,7 +217,8 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, OBD_CONNECT_RMT_CLIENT | OBD_CONNECT_VBR | OBD_CONNECT_FULL20 | OBD_CONNECT_64BITHASH| OBD_CONNECT_EINPROGRESS | - OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE; + OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS; if (sbi->ll_flags & LL_SBI_SOM_PREVIEW) data->ocd_connect_flags |= OBD_CONNECT_SOM; @@ -267,7 +264,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, if (sbi->ll_flags & LL_SBI_RMT_CLIENT) data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE; - data->ocd_brw_size = PTLRPC_MAX_BRW_SIZE; + data->ocd_brw_size = MD_MAX_BRW_SIZE; err = obd_connect(NULL, &sbi->ll_md_exp, obd, &sbi->ll_sb_uuid, data, NULL); if (err == -EBUSY) { @@ -281,51 +278,59 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, GOTO(out, err); } - err = 
obd_fid_init(sbi->ll_md_exp); - if (err) { - CERROR("Can't init metadata layer FID infrastructure, " - "rc %d\n", err); - GOTO(out_md, err); - } + sbi->ll_md_exp->exp_connect_data = *data; - err = obd_statfs(NULL, sbi->ll_md_exp, osfs, - cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), 0); - if (err) - GOTO(out_md_fid, err); - - /* This needs to be after statfs to ensure connect has finished. - * Note that "data" does NOT contain the valid connect reply. - * If connecting to a 1.8 server there will be no LMV device, so - * we can access the MDC export directly and exp_connect_flags will - * be non-zero, but if accessing an upgraded 2.1 server it will - * have the correct flags filled in. - * XXX: fill in the LMV exp_connect_flags from MDC(s). */ - valid = sbi->ll_md_exp->exp_connect_flags & CLIENT_CONNECT_MDT_REQD; - if (sbi->ll_md_exp->exp_connect_flags != 0 && - valid != CLIENT_CONNECT_MDT_REQD) { - char *buf; - - OBD_ALLOC_WAIT(buf, CFS_PAGE_SIZE); - obd_connect_flags2str(buf, CFS_PAGE_SIZE, - valid ^ CLIENT_CONNECT_MDT_REQD, ","); - LCONSOLE_ERROR_MSG(0x170, "Server %s does not support " - "feature(s) needed for correct operation " - "of this client (%s). 
Please upgrade " - "server or downgrade client.\n", - sbi->ll_md_exp->exp_obd->obd_name, buf); - OBD_FREE(buf, CFS_PAGE_SIZE); - GOTO(out_md, err = -EPROTO); - } + sbi->ll_md_exp->exp_connect_data = *data; + err = obd_fid_init(sbi->ll_md_exp->exp_obd, sbi->ll_md_exp, + LUSTRE_SEQ_METADATA); + if (err) { + CERROR("%s: Can't init metadata layer FID infrastructure, " + "rc = %d\n", sbi->ll_md_exp->exp_obd->obd_name, err); + GOTO(out_md, err); + } - size = sizeof(*data); - err = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_CONN_DATA), - KEY_CONN_DATA, &size, data, NULL); - if (err) { - CERROR("Get connect data failed: %d \n", err); - GOTO(out_md, err); - } + /* For mount, we only need fs info from MDT0, and also in DNE, it + * can make sure the client can be mounted as long as MDT0 is + * available */ + err = obd_statfs(NULL, sbi->ll_md_exp, osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_FOR_MDT0); + if (err) + GOTO(out_md_fid, err); + + /* This needs to be after statfs to ensure connect has finished. + * Note that "data" does NOT contain the valid connect reply. + * If connecting to a 1.8 server there will be no LMV device, so + * we can access the MDC export directly and exp_connect_flags will + * be non-zero, but if accessing an upgraded 2.1 server it will + * have the correct flags filled in. + * XXX: fill in the LMV exp_connect_flags from MDC(s). */ + valid = exp_connect_flags(sbi->ll_md_exp) & CLIENT_CONNECT_MDT_REQD; + if (exp_connect_flags(sbi->ll_md_exp) != 0 && + valid != CLIENT_CONNECT_MDT_REQD) { + char *buf; + + OBD_ALLOC_WAIT(buf, CFS_PAGE_SIZE); + obd_connect_flags2str(buf, CFS_PAGE_SIZE, + valid ^ CLIENT_CONNECT_MDT_REQD, ","); + LCONSOLE_ERROR_MSG(0x170, "Server %s does not support " + "feature(s) needed for correct operation " + "of this client (%s). 
Please upgrade " + "server or downgrade client.\n", + sbi->ll_md_exp->exp_obd->obd_name, buf); + OBD_FREE(buf, CFS_PAGE_SIZE); + GOTO(out_md_fid, err = -EPROTO); + } + + size = sizeof(*data); + err = obd_get_info(NULL, sbi->ll_md_exp, sizeof(KEY_CONN_DATA), + KEY_CONN_DATA, &size, data, NULL); + if (err) { + CERROR("%s: Get connect data failed: rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, err); + GOTO(out_md_fid, err); + } - LASSERT(osfs->os_bsize); + LASSERT(osfs->os_bsize); sb->s_blocksize = osfs->os_bsize; sb->s_blocksize_bits = log2(osfs->os_bsize); sb->s_magic = LL_SUPER_MAGIC; @@ -379,24 +384,24 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, if (data->ocd_connect_flags & OBD_CONNECT_64BITHASH) sbi->ll_flags |= LL_SBI_64BIT_HASH; - if (data->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) - sbi->ll_md_brw_size = data->ocd_brw_size; - else - sbi->ll_md_brw_size = CFS_PAGE_SIZE; + if (data->ocd_connect_flags & OBD_CONNECT_BRW_SIZE) + sbi->ll_md_brw_size = data->ocd_brw_size; + else + sbi->ll_md_brw_size = CFS_PAGE_SIZE; if (data->ocd_connect_flags & OBD_CONNECT_LAYOUTLOCK) { LCONSOLE_INFO("Layout lock feature supported.\n"); sbi->ll_flags |= LL_SBI_LAYOUT_LOCK; } - obd = class_name2obd(dt); - if (!obd) { - CERROR("DT %s: not setup or attached\n", dt); - GOTO(out_md_fid, err = -ENODEV); - } + obd = class_name2obd(dt); + if (!obd) { + CERROR("DT %s: not setup or attached\n", dt); + GOTO(out_md_fid, err = -ENODEV); + } data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION | - OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | + OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | OBD_CONNECT_SRVLOCK | OBD_CONNECT_TRUNCLOCK| OBD_CONNECT_AT | OBD_CONNECT_RMT_CLIENT | @@ -404,7 +409,8 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, OBD_CONNECT_FULL20 | OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | OBD_CONNECT_EINPROGRESS | - OBD_CONNECT_JOBSTATS | 
OBD_CONNECT_LVB_TYPE; + OBD_CONNECT_JOBSTATS | OBD_CONNECT_LVB_TYPE | + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_PINGLESS; if (sbi->ll_flags & LL_SBI_SOM_PREVIEW) data->ocd_connect_flags |= OBD_CONNECT_SOM; @@ -435,85 +441,96 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, obd->obd_upcall.onu_owner = &sbi->ll_lco; obd->obd_upcall.onu_upcall = cl_ocd_update; - data->ocd_brw_size = PTLRPC_MAX_BRW_SIZE; + data->ocd_brw_size = DT_MAX_BRW_SIZE; + + err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, data, + NULL); + if (err == -EBUSY) { + LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing " + "recovery, of which this client is not a " + "part. Please wait for recovery to " + "complete, abort, or time out.\n", dt); + GOTO(out_md, err); + } else if (err) { + CERROR("%s: Cannot connect to %s: rc = %d\n", + sbi->ll_dt_exp->exp_obd->obd_name, dt, err); + GOTO(out_md, err); + } - err = obd_connect(NULL, &sbi->ll_dt_exp, obd, &sbi->ll_sb_uuid, data, NULL); - if (err == -EBUSY) { - LCONSOLE_ERROR_MSG(0x150, "An OST (dt %s) is performing " - "recovery, of which this client is not a " - "part. 
Please wait for recovery to " - "complete, abort, or time out.\n", dt); - GOTO(out_md_fid, err); - } else if (err) { - CERROR("Cannot connect to %s: rc = %d\n", dt, err); - GOTO(out_md_fid, err); - } + sbi->ll_dt_exp->exp_connect_data = *data; - err = obd_fid_init(sbi->ll_dt_exp); - if (err) { - CERROR("Can't init data layer FID infrastructure, " - "rc %d\n", err); - GOTO(out_dt, err); - } + err = obd_fid_init(sbi->ll_dt_exp->exp_obd, sbi->ll_dt_exp, + LUSTRE_SEQ_METADATA); + if (err) { + CERROR("%s: Can't init data layer FID infrastructure, " + "rc = %d\n", sbi->ll_dt_exp->exp_obd->obd_name, err); + GOTO(out_dt, err); + } mutex_lock(&sbi->ll_lco.lco_lock); - sbi->ll_lco.lco_flags = data->ocd_connect_flags; - sbi->ll_lco.lco_md_exp = sbi->ll_md_exp; - sbi->ll_lco.lco_dt_exp = sbi->ll_dt_exp; + sbi->ll_lco.lco_flags = data->ocd_connect_flags; + sbi->ll_lco.lco_md_exp = sbi->ll_md_exp; + sbi->ll_lco.lco_dt_exp = sbi->ll_dt_exp; mutex_unlock(&sbi->ll_lco.lco_lock); - fid_zero(&sbi->ll_root_fid); - err = md_getstatus(sbi->ll_md_exp, &sbi->ll_root_fid, &oc); - if (err) { - CERROR("cannot mds_connect: rc = %d\n", err); - GOTO(out_lock_cn_cb, err); - } - if (!fid_is_sane(&sbi->ll_root_fid)) { - CERROR("Invalid root fid during mount\n"); - GOTO(out_lock_cn_cb, err = -EINVAL); - } - CDEBUG(D_SUPER, "rootfid "DFID"\n", PFID(&sbi->ll_root_fid)); + fid_zero(&sbi->ll_root_fid); + err = md_getstatus(sbi->ll_md_exp, &sbi->ll_root_fid, &oc); + if (err) { + CERROR("cannot mds_connect: rc = %d\n", err); + GOTO(out_lock_cn_cb, err); + } + if (!fid_is_sane(&sbi->ll_root_fid)) { + CERROR("%s: Invalid root fid "DFID" during mount\n", + sbi->ll_md_exp->exp_obd->obd_name, + PFID(&sbi->ll_root_fid)); + GOTO(out_lock_cn_cb, err = -EINVAL); + } + CDEBUG(D_SUPER, "rootfid "DFID"\n", PFID(&sbi->ll_root_fid)); - sb->s_op = &lustre_super_operations; + sb->s_op = &lustre_super_operations; #if THREAD_SIZE >= 8192 /*b=17630*/ sb->s_export_op = &lustre_export_operations; #endif - /* make root inode 
- * XXX: move this to after cbd setup? */ - valid = OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS | OBD_MD_FLMDSCAPA; - if (sbi->ll_flags & LL_SBI_RMT_CLIENT) - valid |= OBD_MD_FLRMTPERM; - else if (sbi->ll_flags & LL_SBI_ACL) - valid |= OBD_MD_FLACL; + /* make root inode + * XXX: move this to after cbd setup? */ + valid = OBD_MD_FLGETATTR | OBD_MD_FLBLOCKS | OBD_MD_FLMDSCAPA; + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) + valid |= OBD_MD_FLRMTPERM; + else if (sbi->ll_flags & LL_SBI_ACL) + valid |= OBD_MD_FLACL; - OBD_ALLOC_PTR(op_data); - if (op_data == NULL) - GOTO(out_lock_cn_cb, err = -ENOMEM); + OBD_ALLOC_PTR(op_data); + if (op_data == NULL) + GOTO(out_lock_cn_cb, err = -ENOMEM); - op_data->op_fid1 = sbi->ll_root_fid; - op_data->op_mode = 0; - op_data->op_capa1 = oc; - op_data->op_valid = valid; + op_data->op_fid1 = sbi->ll_root_fid; + op_data->op_mode = 0; + op_data->op_capa1 = oc; + op_data->op_valid = valid; + + err = md_getattr(sbi->ll_md_exp, op_data, &request); + if (oc) + capa_put(oc); + OBD_FREE_PTR(op_data); + if (err) { + CERROR("%s: md_getattr failed for root: rc = %d\n", + sbi->ll_md_exp->exp_obd->obd_name, err); + GOTO(out_lock_cn_cb, err); + } - err = md_getattr(sbi->ll_md_exp, op_data, &request); - if (oc) - capa_put(oc); - OBD_FREE_PTR(op_data); - if (err) { - CERROR("md_getattr failed for root: rc = %d\n", err); - GOTO(out_lock_cn_cb, err); - } - err = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp, - sbi->ll_md_exp, &lmd); - if (err) { - CERROR("failed to understand root inode md: rc = %d\n", err); - ptlrpc_req_finished (request); - GOTO(out_lock_cn_cb, err); - } + err = md_get_lustre_md(sbi->ll_md_exp, request, sbi->ll_dt_exp, + sbi->ll_md_exp, &lmd); + if (err) { + CERROR("failed to understand root inode md: rc = %d\n", err); + ptlrpc_req_finished(request); + GOTO(out_lock_cn_cb, err); + } LASSERT(fid_is_sane(&sbi->ll_root_fid)); - root = ll_iget(sb, cl_fid_build_ino(&sbi->ll_root_fid, 0), &lmd); + root = ll_iget(sb, 
cl_fid_build_ino(&sbi->ll_root_fid, + sbi->ll_flags & LL_SBI_32BIT_API), + &lmd); md_free_lustre_md(sbi->ll_md_exp, &lmd); ptlrpc_req_finished(request); @@ -559,7 +576,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, if (sb->s_root == NULL) { CERROR("%s: can't make root dentry\n", ll_get_fsname(sb, NULL, 0)); - GOTO(out_lock_cn_cb, err = -ENOMEM); + GOTO(out_root, err = -ENOMEM); } #ifdef HAVE_DCACHE_LOCK @@ -591,12 +608,14 @@ out_root: if (root) iput(root); out_lock_cn_cb: - obd_fid_fini(sbi->ll_dt_exp); + obd_fid_fini(sbi->ll_dt_exp->exp_obd); out_dt: obd_disconnect(sbi->ll_dt_exp); sbi->ll_dt_exp = NULL; + /* Make sure all OSCs are gone, since cl_cache is accessing sbi. */ + obd_zombie_barrier(); out_md_fid: - obd_fid_fini(sbi->ll_md_exp); + obd_fid_fini(sbi->ll_md_exp->exp_obd); out_md: obd_disconnect(sbi->ll_md_exp); sbi->ll_md_exp = NULL; @@ -625,17 +644,17 @@ int ll_get_max_mdsize(struct ll_sb_info *sbi, int *lmmsize) void ll_dump_inode(struct inode *inode) { - struct list_head *tmp; - int dentry_count = 0; + struct ll_d_hlist_node *tmp; + int dentry_count = 0; - LASSERT(inode != NULL); + LASSERT(inode != NULL); - list_for_each(tmp, &inode->i_dentry) - dentry_count++; + ll_d_hlist_for_each(tmp, &inode->i_dentry) + dentry_count++; - CERROR("inode %p dump: dev=%s ino=%lu mode=%o count=%u, %d dentries\n", - inode, ll_i2mdexp(inode)->exp_obd->obd_name, inode->i_ino, - inode->i_mode, atomic_read(&inode->i_count), dentry_count); + CERROR("inode %p dump: dev=%s ino=%lu mode=%o count=%u, %d dentries\n", + inode, ll_i2mdexp(inode)->exp_obd->obd_name, inode->i_ino, + inode->i_mode, atomic_read(&inode->i_count), dentry_count); } void lustre_dump_dentry(struct dentry *dentry, int recur) @@ -684,13 +703,16 @@ void client_common_put_super(struct super_block *sb) cfs_list_del(&sbi->ll_conn_chain); - obd_fid_fini(sbi->ll_dt_exp); + obd_fid_fini(sbi->ll_dt_exp->exp_obd); obd_disconnect(sbi->ll_dt_exp); sbi->ll_dt_exp = NULL; + /* wait till 
all OSCs are gone, since cl_cache is accessing sbi. + * see LU-2543. */ + obd_zombie_barrier(); lprocfs_unregister_mountpoint(sbi); - obd_fid_fini(sbi->ll_md_exp); + obd_fid_fini(sbi->ll_md_exp->exp_obd); obd_disconnect(sbi->ll_md_exp); sbi->ll_md_exp = NULL; @@ -711,9 +733,11 @@ void ll_kill_super(struct super_block *sb) /* we need restore s_dev from changed for clustred NFS before put_super * because new kernels have cached s_dev and change sb->s_dev in * put_super not affected real removing devices */ - if (sbi) - sb->s_dev = sbi->ll_sdev_orig; - EXIT; + if (sbi) { + sb->s_dev = sbi->ll_sdev_orig; + sbi->ll_umounting = 1; + } + EXIT; } char *ll_read_opt(const char *opt, char *data) @@ -918,6 +942,7 @@ void ll_lli_init(struct ll_inode_info *lli) mutex_init(&lli->lli_och_mutex); spin_lock_init(&lli->lli_agl_lock); lli->lli_has_smd = false; + lli->lli_layout_gen = LL_LAYOUT_GEN_NONE; lli->lli_clob = NULL; LASSERT(lli->lli_vfs_inode.i_mode != 0); @@ -939,6 +964,7 @@ void ll_lli_init(struct ll_inode_info *lli) CFS_INIT_LIST_HEAD(&lli->lli_agl_list); lli->lli_agl_index = 0; lli->lli_async_rc = 0; + lli->lli_volatile = false; } mutex_init(&lli->lli_layout_mutex); } @@ -991,14 +1017,14 @@ int ll_fill_super(struct super_block *sb, struct vfsmount *mnt) if (err) GOTO(out_free, err); - err = ll_bdi_init(&lsi->lsi_bdi); - if (err) - GOTO(out_free, err); - lsi->lsi_flags |= LSI_BDI_INITIALIZED; - lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY; - err = ll_bdi_register(&lsi->lsi_bdi); - if (err) - GOTO(out_free, err); + err = bdi_init(&lsi->lsi_bdi); + if (err) + GOTO(out_free, err); + lsi->lsi_flags |= LSI_BDI_INITIALIZED; + lsi->lsi_bdi.capabilities = BDI_CAP_MAP_COPY; + err = ll_bdi_register(&lsi->lsi_bdi); + if (err) + GOTO(out_free, err); #ifdef HAVE_SB_BDI sb->s_bdi = &lsi->lsi_bdi; @@ -1107,23 +1133,46 @@ void ll_put_super(struct super_block *sb) if (profilenm) class_del_profile(profilenm); - if (lsi->lsi_flags & LSI_BDI_INITIALIZED) { - ll_bdi_destroy(&lsi->lsi_bdi); - 
lsi->lsi_flags &= ~LSI_BDI_INITIALIZED; - } + if (lsi->lsi_flags & LSI_BDI_INITIALIZED) { + bdi_destroy(&lsi->lsi_bdi); + lsi->lsi_flags &= ~LSI_BDI_INITIALIZED; + } ll_free_sbi(sb); lsi->lsi_llsbi = NULL; lustre_common_put_super(sb); - cl_env_cache_purge(~0); - cfs_module_put(THIS_MODULE); EXIT; } /* client_put_super */ +struct inode *ll_inode_from_resource_lock(struct ldlm_lock *lock) +{ + struct inode *inode = NULL; + + /* NOTE: we depend on atomic igrab() -bzzz */ + lock_res_and_lock(lock); + if (lock->l_resource->lr_lvb_inode) { + struct ll_inode_info * lli; + lli = ll_i2info(lock->l_resource->lr_lvb_inode); + if (lli->lli_inode_magic == LLI_INODE_MAGIC) { + inode = igrab(lock->l_resource->lr_lvb_inode); + } else { + inode = lock->l_resource->lr_lvb_inode; + LDLM_DEBUG_LIMIT(inode->i_state & I_FREEING ? D_INFO : + D_WARNING, lock, "lr_lvb_inode %p is " + "bogus: magic %08x", + lock->l_resource->lr_lvb_inode, + lli->lli_inode_magic); + inode = NULL; + } + } + unlock_res_and_lock(lock); + return inode; +} + struct inode *ll_inode_from_lock(struct ldlm_lock *lock) { struct inode *inode = NULL; @@ -1146,18 +1195,6 @@ struct inode *ll_inode_from_lock(struct ldlm_lock *lock) return inode; } -static int null_if_equal(struct ldlm_lock *lock, void *data) -{ - if (data == lock->l_ast_data) { - lock->l_ast_data = NULL; - - if (lock->l_req_mode != lock->l_granted_mode) - LDLM_ERROR(lock,"clearing inode with ungranted lock"); - } - - return LDLM_ITER_CONTINUE; -} - void ll_clear_inode(struct inode *inode) { struct ll_inode_info *lli = ll_i2info(inode); @@ -1175,8 +1212,7 @@ void ll_clear_inode(struct inode *inode) } ll_i2info(inode)->lli_flags &= ~LLIF_MDS_SIZE_LOCK; - md_change_cbdata(sbi->ll_md_exp, ll_inode2fid(inode), - null_if_equal, inode); + md_null_inode(sbi->ll_md_exp, ll_inode2fid(inode)); LASSERT(!lli->lli_open_fd_write_count); LASSERT(!lli->lli_open_fd_read_count); @@ -1269,16 +1305,12 @@ int ll_md_setattr(struct dentry *dentry, struct md_op_data *op_data, 
RETURN(rc); } - /* We want to adjust timestamps. - * If there is at least some data in file, we cleared ATTR_SIZE - * to avoid update size, otherwise it is important to do.(SOM case) - * (bug 6196) */ - ia_valid = op_data->op_attr.ia_valid; - /* Since we set ATTR_*_SET flags above, and already done permission - * check, So don't let inode_change_ok() check it again. */ - op_data->op_attr.ia_valid &= ~TIMES_SET_FLAGS; - rc = simple_setattr(dentry, &op_data->op_attr); - op_data->op_attr.ia_valid = ia_valid; + ia_valid = op_data->op_attr.ia_valid; + /* inode size will be in ll_setattr_ost, can't do it now since dirty + * cache is not cleared yet. */ + op_data->op_attr.ia_valid &= ~(TIMES_SET_FLAGS | ATTR_SIZE); + rc = simple_setattr(dentry, &op_data->op_attr); + op_data->op_attr.ia_valid = ia_valid; /* Extract epoch data if obtained. */ op_data->op_handle = md.body->handle; @@ -1401,12 +1433,10 @@ out_big: */ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr) { - struct lov_stripe_md *lsm; struct inode *inode = dentry->d_inode; struct ll_inode_info *lli = ll_i2info(inode); struct md_op_data *op_data = NULL; struct md_open_data *mod = NULL; - int ia_valid = attr->ia_valid; int rc = 0, rc1 = 0; ENTRY; @@ -1415,7 +1445,7 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr) PFID(&lli->lli_fid), i_size_read(inode), attr->ia_size, attr->ia_valid); - if (ia_valid & ATTR_SIZE) { + if (attr->ia_valid & ATTR_SIZE) { /* Check new size against VFS/VM file size limit and rlimit */ rc = inode_newsize_ok(inode, attr->ia_size); if (rc) @@ -1435,7 +1465,7 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr) } /* POSIX: check before ATTR_*TIME_SET set (from inode_change_ok) */ - if (ia_valid & TIMES_SET_FLAGS) { + if (attr->ia_valid & TIMES_SET_FLAGS) { if (cfs_curproc_fsuid() != inode->i_uid && !cfs_capable(CFS_CAP_FOWNER)) RETURN(-EPERM); @@ -1446,11 +1476,13 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr) attr->ia_ctime = 
CFS_CURRENT_TIME; attr->ia_valid |= ATTR_CTIME_SET; } - if (!(ia_valid & ATTR_ATIME_SET) && (attr->ia_valid & ATTR_ATIME)) { + if (!(attr->ia_valid & ATTR_ATIME_SET) && + (attr->ia_valid & ATTR_ATIME)) { attr->ia_atime = CFS_CURRENT_TIME; attr->ia_valid |= ATTR_ATIME_SET; } - if (!(ia_valid & ATTR_MTIME_SET) && (attr->ia_valid & ATTR_MTIME)) { + if (!(attr->ia_valid & ATTR_MTIME_SET) && + (attr->ia_valid & ATTR_MTIME)) { attr->ia_mtime = CFS_CURRENT_TIME; attr->ia_valid |= ATTR_MTIME_SET; } @@ -1460,6 +1492,14 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr) LTIME_S(attr->ia_mtime), LTIME_S(attr->ia_ctime), cfs_time_current_sec()); + /* If we are changing file size, file content is modified, flag it. */ + if (attr->ia_valid & ATTR_SIZE) { + attr->ia_valid |= MDS_OPEN_OWNEROVERRIDE; + spin_lock(&lli->lli_lock); + lli->lli_flags |= LLIF_DATA_MODIFIED; + spin_unlock(&lli->lli_lock); + } + /* We always do an MDS RPC, even if we're only changing the size; * only the MDS knows whether truncate() should fail with -ETXTBUSY */ @@ -1468,46 +1508,37 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr) RETURN(-ENOMEM); if (!S_ISDIR(inode->i_mode)) { - if (ia_valid & ATTR_SIZE) + if (attr->ia_valid & ATTR_SIZE) inode_dio_write_done(inode); mutex_unlock(&inode->i_mutex); down_write(&lli->lli_trunc_sem); - mutex_lock(&inode->i_mutex); - if (ia_valid & ATTR_SIZE) - inode_dio_wait(inode); } - /* We need a steady stripe configuration for setattr to avoid - * confusion. */ - lsm = ccc_inode_lsm_get(inode); - - /* NB: ATTR_SIZE will only be set after this point if the size - * resides on the MDS, ie, this file has no objects. */ - if (lsm != NULL) - attr->ia_valid &= ~ATTR_SIZE; - /* can't call ll_setattr_ost() while holding a refcount of lsm */ - ccc_inode_lsm_put(inode, lsm); - memcpy(&op_data->op_attr, attr, sizeof(*attr)); /* Open epoch for truncate. 
*/ if (exp_connect_som(ll_i2mdexp(inode)) && - (ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MTIME_SET))) + (attr->ia_valid & (ATTR_SIZE | ATTR_MTIME | ATTR_MTIME_SET))) op_data->op_flags = MF_EPOCH_OPEN; rc = ll_md_setattr(dentry, op_data, &mod); if (rc) GOTO(out, rc); + /* RPC to MDT is sent, cancel data modification flag */ + if (rc == 0 && (op_data->op_bias & MDS_DATA_MODIFIED)) { + spin_lock(&lli->lli_lock); + lli->lli_flags &= ~LLIF_DATA_MODIFIED; + spin_unlock(&lli->lli_lock); + } + ll_ioepoch_open(lli, op_data->op_ioepoch); if (!S_ISREG(inode->i_mode)) GOTO(out, rc = 0); - if (ia_valid & ATTR_SIZE) - attr->ia_valid |= ATTR_SIZE; - if (ia_valid & (ATTR_SIZE | - ATTR_ATIME | ATTR_ATIME_SET | - ATTR_MTIME | ATTR_MTIME_SET)) + if (attr->ia_valid & (ATTR_SIZE | + ATTR_ATIME | ATTR_ATIME_SET | + ATTR_MTIME | ATTR_MTIME_SET)) /* For truncate and utimes sending attributes to OSTs, setting * mtime/atime to the past will be performed under PW [0:EOF] * extent lock (new_size:EOF for truncate). It may seem @@ -1525,13 +1556,17 @@ out: } ll_finish_md_op_data(op_data); } - if (!S_ISDIR(inode->i_mode)) + if (!S_ISDIR(inode->i_mode)) { up_write(&lli->lli_trunc_sem); + mutex_lock(&inode->i_mutex); + if (attr->ia_valid & ATTR_SIZE) + inode_dio_wait(inode); + } - ll_stats_ops_tally(ll_i2sbi(inode), (ia_valid & ATTR_SIZE) ? - LPROC_LL_TRUNC : LPROC_LL_SETATTR, 1); + ll_stats_ops_tally(ll_i2sbi(inode), (attr->ia_valid & ATTR_SIZE) ? 
+ LPROC_LL_TRUNC : LPROC_LL_SETATTR, 1); - return rc; + return rc; } int ll_setattr(struct dentry *de, struct iattr *attr) @@ -1682,21 +1717,13 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md) LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0)); if (lsm != NULL) { - LASSERT(S_ISREG(inode->i_mode)); - CDEBUG(D_INODE, "adding lsm %p to inode %lu/%u(%p)\n", - lsm, inode->i_ino, inode->i_generation, inode); - /* cl_file_inode_init must go before lli_has_smd or a race - * is possible where client thinks the file has stripes, - * but lov raid0 is not setup yet and parallel e.g. - * glimpse would try to use uninitialized lov */ - if (cl_file_inode_init(inode, md) == 0) - lli->lli_has_smd = true; + if (!lli->lli_has_smd && + !(sbi->ll_flags & LL_SBI_LAYOUT_LOCK)) + cl_file_inode_init(inode, md); lli->lli_maxbytes = lsm->lsm_maxbytes; if (lli->lli_maxbytes > MAX_LFS_FILESIZE) lli->lli_maxbytes = MAX_LFS_FILESIZE; - if (md->lsm != NULL) - obd_free_memmd(ll_i2dtexp(inode), &md->lsm); } if (sbi->ll_flags & LL_SBI_RMT_CLIENT) { @@ -1712,7 +1739,8 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md) spin_unlock(&lli->lli_lock); } #endif - inode->i_ino = cl_fid_build_ino(&body->fid1, 0); + inode->i_ino = cl_fid_build_ino(&body->fid1, + sbi->ll_flags & LL_SBI_32BIT_API); inode->i_generation = cl_fid_build_gen(&body->fid1); if (body->valid & OBD_MD_FLATIME) { @@ -1771,7 +1799,7 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md) if (body->valid & OBD_MD_FLSIZE) { if (exp_connect_som(ll_i2mdexp(inode)) && - S_ISREG(inode->i_mode) && lli->lli_has_smd) { + S_ISREG(inode->i_mode)) { struct lustre_handle lockh; ldlm_mode_t mode; @@ -1878,7 +1906,8 @@ void ll_delete_inode(struct inode *inode) if (S_ISREG(inode->i_mode) && lli->lli_clob != NULL) /* discard all dirty pages before truncating them, required by * osc_extent implementation at LU-1030. 
*/ - cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, CL_FSYNC_DISCARD); + cl_sync_file_range(inode, 0, OBD_OBJECT_EOF, + CL_FSYNC_DISCARD, 1); truncate_inode_pages(&inode->i_data, 0); @@ -1971,8 +2000,7 @@ int ll_iocontrol(struct inode *inode, struct file *file, RETURN(-ENOMEM); } oinfo.oi_md = lsm; - oinfo.oi_oa->o_id = lsm->lsm_object_id; - oinfo.oi_oa->o_seq = lsm->lsm_object_seq; + oinfo.oi_oa->o_oi = lsm->lsm_oi; oinfo.oi_oa->o_flags = flags; oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS | OBD_MD_FLGROUP; @@ -2053,16 +2081,15 @@ void ll_umount_begin(struct super_block *sb) obd->obd_force = 1; OBD_ALLOC_PTR(ioc_data); - if (ioc_data) { - obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_md_exp, - sizeof ioc_data, ioc_data, NULL); + if (ioc_data) { + obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_md_exp, + sizeof *ioc_data, ioc_data, NULL); - obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_dt_exp, - sizeof ioc_data, ioc_data, NULL); - - OBD_FREE_PTR(ioc_data); - } + obd_iocontrol(IOC_OSC_SET_ACTIVE, sbi->ll_dt_exp, + sizeof *ioc_data, ioc_data, NULL); + OBD_FREE_PTR(ioc_data); + } /* Really, we'd like to wait until there are no requests outstanding, * and then continue. 
For now, we just invalidate the requests, @@ -2115,13 +2142,11 @@ int ll_remount_fs(struct super_block *sb, int *flags, char *data) return 0; } -int ll_prep_inode(struct inode **inode, - struct ptlrpc_request *req, - struct super_block *sb) +int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, + struct super_block *sb, struct lookup_intent *it) { struct ll_sb_info *sbi = NULL; struct lustre_md md; - __u64 ibits; int rc; ENTRY; @@ -2143,10 +2168,10 @@ int ll_prep_inode(struct inode **inode, */ LASSERT(fid_is_sane(&md.body->fid1)); - *inode = ll_iget(sb, cl_fid_build_ino(&md.body->fid1, 0), &md); + *inode = ll_iget(sb, cl_fid_build_ino(&md.body->fid1, + sbi->ll_flags & LL_SBI_32BIT_API), + &md); if (*inode == NULL || IS_ERR(*inode)) { - if (md.lsm) - obd_free_memmd(sbi->ll_dt_exp, &md.lsm); #ifdef CONFIG_FS_POSIX_ACL if (md.posix_acl) { posix_acl_release(md.posix_acl); @@ -2160,16 +2185,37 @@ int ll_prep_inode(struct inode **inode, } } - /* sanity check for LAYOUT lock. */ - ibits = MDS_INODELOCK_LAYOUT; - if (S_ISREG(md.body->mode) && sbi->ll_flags & LL_SBI_LAYOUT_LOCK && - md.lsm != NULL && !ll_have_md_lock(*inode, &ibits, LCK_MINMODE)) { - CERROR("%s: inode "DFID" (%p) layout lock not granted.\n", - ll_get_fsname(sb, NULL, 0), - PFID(ll_inode2fid(*inode)), *inode); + /* Handling piggyback layout lock. + * Layout lock can be piggybacked by getattr and open request. + * The lsm can be applied to inode only if it comes with a layout lock + * otherwise correct layout may be overwritten, for example: + * 1. proc1: mdt returns a lsm but not granting layout + * 2. layout was changed by another client + * 3. proc2: refresh layout and layout lock granted + * 4. 
proc1: to apply a stale layout */ + if (it != NULL && it->d.lustre.it_lock_mode != 0) { + struct lustre_handle lockh; + struct ldlm_lock *lock; + + lockh.cookie = it->d.lustre.it_lock_handle; + lock = ldlm_handle2lock(&lockh); + LASSERT(lock != NULL); + if (ldlm_has_layout(lock)) { + struct cl_object_conf conf; + + memset(&conf, 0, sizeof(conf)); + conf.coc_opc = OBJECT_CONF_SET; + conf.coc_inode = *inode; + conf.coc_lock = lock; + conf.u.coc_md = &md; + (void)ll_layout_conf(*inode, &conf); + } + LDLM_LOCK_PUT(lock); } out: + if (md.lsm != NULL) + obd_free_memmd(sbi->ll_dt_exp, &md.lsm); md_free_lustre_md(sbi->ll_md_exp, &md); RETURN(rc); } @@ -2278,17 +2324,21 @@ struct md_op_data * ll_prep_md_op_data(struct md_op_data *op_data, op_data->op_capa2 = NULL; } - op_data->op_name = name; - op_data->op_namelen = namelen; - op_data->op_mode = mode; - op_data->op_mod_time = cfs_time_current_sec(); - op_data->op_fsuid = cfs_curproc_fsuid(); - op_data->op_fsgid = cfs_curproc_fsgid(); - op_data->op_cap = cfs_curproc_cap_pack(); - op_data->op_bias = MDS_CHECK_SPLIT; - op_data->op_opc = opc; - op_data->op_mds = 0; - op_data->op_data = data; + op_data->op_name = name; + op_data->op_namelen = namelen; + op_data->op_mode = mode; + op_data->op_mod_time = cfs_time_current_sec(); + op_data->op_fsuid = cfs_curproc_fsuid(); + op_data->op_fsgid = cfs_curproc_fsgid(); + op_data->op_cap = cfs_curproc_cap_pack(); + op_data->op_bias = 0; + op_data->op_cli_flags = 0; + if ((opc == LUSTRE_OPC_CREATE) && (name != NULL) && + filename_is_volatile(name, namelen, NULL)) + op_data->op_bias |= MDS_CREATE_VOLATILE; + op_data->op_opc = opc; + op_data->op_mds = 0; + op_data->op_data = data; /* If the file is being opened after mknod() (normally due to NFS) * try to use the default stripe data from parent directory for @@ -2304,6 +2354,10 @@ struct md_op_data * ll_prep_md_op_data(struct md_op_data *op_data, /** We ignore parent's capability temporary. 
*/ } + /* When called by ll_setattr_raw, file is i1. */ + if (LLIF_DATA_MODIFIED & ll_i2info(i1)->lli_flags) + op_data->op_bias |= MDS_DATA_MODIFIED; + return op_data; } @@ -2435,7 +2489,8 @@ void ll_dirty_page_discard_warn(cfs_page_t *page, int ioret) struct dentry *dentry = NULL; struct ccc_object *obj = cl_inode2ccc(page->mapping->host); - buf = (char *)__get_free_page(GFP_KERNEL); + /* this can be called inside spin lock so use GFP_ATOMIC. */ + buf = (char *)__get_free_page(GFP_ATOMIC); if (buf != NULL) { dentry = d_find_alias(page->mapping->host); if (dentry != NULL)