Whamcloud - gitweb
LU-14521 flr: delete mirror without volatile file
[fs/lustre-release.git] / lustre / llite / file.c
index 449f895..de918fb 100644 (file)
@@ -112,12 +112,12 @@ static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
        op_data->op_xvalid |= OP_XVALID_CTIME_SET;
        op_data->op_attr_blocks = inode->i_blocks;
        op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
-       if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
+       if (test_bit(LLIF_PROJECT_INHERIT, &ll_i2info(inode)->lli_flags))
                op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
        op_data->op_open_handle = och->och_open_handle;
 
        if (och->och_flags & FMODE_WRITE &&
-           ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
+           test_and_clear_bit(LLIF_DATA_MODIFIED, &ll_i2info(inode)->lli_flags))
                /* For HSM: if inode data has been modified, pack it so that
                 * MDT can set data dirty flag in the archive. */
                op_data->op_bias |= MDS_DATA_MODIFIED;
@@ -1365,7 +1365,7 @@ int ll_merge_attr(const struct lu_env *env, struct inode *inode)
         * POSIX. Solving this problem needs to send an RPC to MDT for each
         * read, this will hurt performance.
         */
-       if (ll_file_test_and_clear_flag(lli, LLIF_UPDATE_ATIME) ||
+       if (test_and_clear_bit(LLIF_UPDATE_ATIME, &lli->lli_flags) ||
            inode->i_atime.tv_sec < lli->lli_atime)
                inode->i_atime.tv_sec = lli->lli_atime;
 
@@ -1542,6 +1542,7 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
        struct vvp_io *vio = vvp_env_io(env);
        struct inode *inode = file_inode(file);
        struct ll_inode_info *lli = ll_i2info(inode);
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ll_file_data *fd  = file->private_data;
        struct range_lock range;
        struct cl_io *io;
@@ -1550,6 +1551,9 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
        unsigned int retried = 0, dio_lock = 0;
        bool is_aio = false;
        struct cl_dio_aio *ci_aio = NULL;
+       size_t per_bytes;
+       bool partial_io = false;
+       size_t max_io_pages, max_cached_pages;
 
        ENTRY;
 
@@ -1557,6 +1561,11 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
                file_dentry(file)->d_name.name,
                iot == CIT_READ ? "read" : "write", *ppos, count);
 
+       max_io_pages = PTLRPC_MAX_BRW_PAGES * OBD_MAX_RIF_DEFAULT;
+       max_cached_pages = sbi->ll_cache->ccc_lru_max;
+       if (max_io_pages > (max_cached_pages >> 2))
+               max_io_pages = max_cached_pages >> 2;
+
        io = vvp_env_thread_io(env);
        if (file->f_flags & O_DIRECT) {
                if (!is_sync_kiocb(args->u.normal.via_iocb))
@@ -1567,19 +1576,29 @@ ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
        }
 
 restart:
+       /**
+        * The IO block size needs to be aware of the cached page limit;
+        * otherwise, if max_cached_mb is small and a large block IO is
+        * issued, the IO cannot finish and blocks the whole client.
+        */
+       if (file->f_flags & O_DIRECT)
+               per_bytes = count;
+       else
+               per_bytes = min(max_io_pages << PAGE_SHIFT, count);
+       partial_io = per_bytes < count;
        io = vvp_env_thread_io(env);
        ll_io_init(io, file, iot, args);
        io->ci_aio = ci_aio;
        io->ci_dio_lock = dio_lock;
        io->ci_ndelay_tried = retried;
 
-       if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
+       if (cl_io_rw_init(env, io, iot, *ppos, per_bytes) == 0) {
                bool range_locked = false;
 
                if (file->f_flags & O_APPEND)
                        range_lock_init(&range, 0, LUSTRE_EOF);
                else
-                       range_lock_init(&range, *ppos, *ppos + count - 1);
+                       range_lock_init(&range, *ppos, *ppos + per_bytes - 1);
 
                vio->vui_fd  = file->private_data;
                vio->vui_iter = args->u.normal.via_iter;
@@ -1631,6 +1650,16 @@ restart:
                /* prepare IO restart */
                if (count > 0)
                        args->u.normal.via_iter = vio->vui_iter;
+
+               if (partial_io) {
+                       /**
+                        * Re-expand the iov count because it is zero
+                        * after the IO finishes.
+                        */
+                       iov_iter_reexpand(vio->vui_iter, count);
+                       if (per_bytes == io->ci_nob)
+                               io->ci_need_restart = 1;
+               }
        }
 out:
        cl_io_fini(env, io);
@@ -1883,7 +1912,7 @@ static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
                ll_heat_add(inode, CIT_WRITE, result);
                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
                                   result);
-               ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
+               set_bit(LLIF_DATA_MODIFIED, &ll_i2info(inode)->lli_flags);
        }
 
        CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
@@ -2149,44 +2178,45 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
                              struct lov_mds_md **lmmp, int *lmm_size,
                              struct ptlrpc_request **request)
 {
-        struct ll_sb_info *sbi = ll_i2sbi(inode);
-        struct mdt_body  *body;
-        struct lov_mds_md *lmm = NULL;
-        struct ptlrpc_request *req = NULL;
-        struct md_op_data *op_data;
-        int rc, lmmsize;
+       struct ll_sb_info *sbi = ll_i2sbi(inode);
+       struct mdt_body *body;
+       struct lov_mds_md *lmm = NULL;
+       struct ptlrpc_request *req = NULL;
+       struct md_op_data *op_data;
+       int rc, lmmsize;
+
+       ENTRY;
 
        rc = ll_get_default_mdsize(sbi, &lmmsize);
        if (rc)
                RETURN(rc);
 
-        op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
-                                     strlen(filename), lmmsize,
-                                     LUSTRE_OPC_ANY, NULL);
-        if (IS_ERR(op_data))
-                RETURN(PTR_ERR(op_data));
+       op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
+                                    strlen(filename), lmmsize,
+                                    LUSTRE_OPC_ANY, NULL);
+       if (IS_ERR(op_data))
+               RETURN(PTR_ERR(op_data));
 
-        op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
-        rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
-        ll_finish_md_op_data(op_data);
-        if (rc < 0) {
-                CDEBUG(D_INFO, "md_getattr_name failed "
-                       "on %s: rc %d\n", filename, rc);
-                GOTO(out, rc);
-        }
+       op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
+       rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
+       ll_finish_md_op_data(op_data);
+       if (rc < 0) {
+               CDEBUG(D_INFO, "md_getattr_name failed "
+                      "on %s: rc %d\n", filename, rc);
+               GOTO(out, rc);
+       }
 
-        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
-        LASSERT(body != NULL); /* checked by mdc_getattr_name */
+       body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
+       LASSERT(body != NULL); /* checked by mdc_getattr_name */
 
        lmmsize = body->mbo_eadatasize;
 
        if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
-                        lmmsize == 0) {
-                GOTO(out, rc = -ENODATA);
-        }
+           lmmsize == 0)
+               GOTO(out, rc = -ENODATA);
 
-        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
-        LASSERT(lmm != NULL);
+       lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
+       LASSERT(lmm != NULL);
 
        if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
            lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
@@ -2196,11 +2226,10 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
 
        /*
         * This is coming from the MDS, so is probably in
-        * little endian.  We convert it to host endian before
+        * little endian. We convert it to host endian before
         * passing it to userspace.
         */
-       if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) ==
-           __swab32(LOV_MAGIC_MAGIC)) {
+       if (cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) {
                int stripe_count = 0;
 
                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
@@ -2209,28 +2238,74 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
                        if (le32_to_cpu(lmm->lmm_pattern) &
                            LOV_PATTERN_F_RELEASED)
                                stripe_count = 0;
-               }
-
-               lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0);
+                       lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0);
 
-               /* if function called for directory - we should
-                * avoid swab not existent lsm objects */
-               if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode))
-                       lustre_swab_lov_user_md_objects(
+                       /* if the function is called for a directory, we must
+                        * avoid swabbing non-existent lsm objects
+                        */
+                       if (lmm->lmm_magic == LOV_MAGIC_V1 &&
+                           S_ISREG(body->mbo_mode))
+                               lustre_swab_lov_user_md_objects(
                                ((struct lov_user_md_v1 *)lmm)->lmm_objects,
                                stripe_count);
-               else if (lmm->lmm_magic == LOV_MAGIC_V3 &&
-                        S_ISREG(body->mbo_mode))
-                       lustre_swab_lov_user_md_objects(
+                       else if (lmm->lmm_magic == LOV_MAGIC_V3 &&
+                                S_ISREG(body->mbo_mode))
+                               lustre_swab_lov_user_md_objects(
                                ((struct lov_user_md_v3 *)lmm)->lmm_objects,
                                stripe_count);
+               } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_COMP_V1)) {
+                       lustre_swab_lov_comp_md_v1(
+                               (struct lov_comp_md_v1 *)lmm);
+               }
        }
 
+       if (lmm->lmm_magic == LOV_MAGIC_COMP_V1) {
+               struct lov_comp_md_v1 *comp_v1 = NULL;
+               struct lov_comp_md_entry_v1 *ent;
+               struct lov_user_md_v1 *v1;
+               __u32 off;
+               int i = 0;
+
+               comp_v1 = (struct lov_comp_md_v1 *)lmm;
+               /* Dump the striping information */
+               for (; i < comp_v1->lcm_entry_count; i++) {
+                       ent = &comp_v1->lcm_entries[i];
+                       off = ent->lcme_offset;
+                       v1 = (struct lov_user_md_v1 *)((char *)lmm + off);
+                       CDEBUG(D_INFO,
+                              "comp[%d]: stripe_count=%u, stripe_size=%u\n",
+                              i, v1->lmm_stripe_count, v1->lmm_stripe_size);
+               }
+
+               /**
+                * Return valid stripe_count and stripe_size instead of 0 for
+                * DoM files to avoid divide-by-zero for older userspace that
+                * calls this ioctl, e.g. lustre ADIO driver.
+                */
+               if (lmm->lmm_stripe_count == 0)
+                       lmm->lmm_stripe_count = 1;
+               if (lmm->lmm_stripe_size == 0) {
+                       /* Since the first component of the file data is placed
+                        * on the MDT for faster access, the stripe_size of the
+                        * second one is always the one that matters to
+                        * applications which are doing large IOs.
+                        */
+                       if (lmm->lmm_pattern == LOV_PATTERN_MDT)
+                               i = comp_v1->lcm_entry_count > 1 ? 1 : 0;
+                       else
+                               i = comp_v1->lcm_entry_count > 1 ?
+                                   comp_v1->lcm_entry_count - 1 : 0;
+                       ent = &comp_v1->lcm_entries[i];
+                       off = ent->lcme_offset;
+                       v1 = (struct lov_user_md_v1 *)((char *)lmm + off);
+                       lmm->lmm_stripe_size = v1->lmm_stripe_size;
+               }
+       }
 out:
        *lmmp = lmm;
        *lmm_size = lmmsize;
        *request = req;
-       return rc;
+       RETURN(rc);
 }
 
 static int ll_lov_setea(struct inode *inode, struct file *file,
@@ -2243,7 +2318,7 @@ static int ll_lov_setea(struct inode *inode, struct file *file,
        int                      rc;
        ENTRY;
 
-       if (!cfs_capable(CAP_SYS_ADMIN))
+       if (!capable(CAP_SYS_ADMIN))
                RETURN(-EPERM);
 
        OBD_ALLOC_LARGE(lump, lum_size);
@@ -2565,7 +2640,7 @@ int ll_fid2path(struct inode *inode, void __user *arg)
 
        ENTRY;
 
-       if (!cfs_capable(CAP_DAC_READ_SEARCH) &&
+       if (!capable(CAP_DAC_READ_SEARCH) &&
            !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
                RETURN(-EPERM);
 
@@ -2853,7 +2928,7 @@ int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
        /* Non-root users are forbidden to set or clear flags which are
         * NOT defined in HSM_USER_MASK. */
        if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
-           !cfs_capable(CAP_SYS_ADMIN))
+           !capable(CAP_SYS_ADMIN))
                RETURN(-EPERM);
 
        if (!exp_connect_archive_id_array(exp)) {
@@ -3229,7 +3304,7 @@ int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
                RETURN(-EFAULT);
 
        fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
-       if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
+       if (test_bit(LLIF_PROJECT_INHERIT, &ll_i2info(inode)->lli_flags))
                fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
        fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
        if (copy_to_user((struct fsxattr __user *)arg,
@@ -3252,7 +3327,7 @@ int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
        if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
                return -EINVAL;
 
-       if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
+       if (test_bit(LLIF_PROJECT_INHERIT, &ll_i2info(inode)->lli_flags)) {
                if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
                        return -EINVAL;
        } else {
@@ -3406,6 +3481,7 @@ static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
                if (!layout_file)
                        GOTO(out_lease_close, rc = -EBADF);
 
+               /* if layout_file == file, it means to destroy the mirror */
                sp.sp_inode = file_inode(layout_file);
                sp.sp_mirror_id = (__u16)mirror_id;
                data = &sp;
@@ -3735,7 +3811,10 @@ out:
 
                 RETURN(0);
         }
+       case OBD_IOC_GETNAME_OLD:
+               /* fall through */
        case OBD_IOC_GETDTNAME:
+               /* fall through */
        case OBD_IOC_GETMDNAME:
                RETURN(ll_get_obd_name(inode, cmd, arg));
        case LL_IOC_HSM_STATE_GET: {
@@ -4088,8 +4167,9 @@ out_state:
        }
 }
 
-loff_t ll_lseek(struct inode *inode, loff_t offset, int whence)
+loff_t ll_lseek(struct file *file, loff_t offset, int whence)
 {
+       struct inode *inode = file_inode(file);
        struct lu_env *env;
        struct cl_io *io;
        struct cl_lseek_io *lsio;
@@ -4105,6 +4185,7 @@ loff_t ll_lseek(struct inode *inode, loff_t offset, int whence)
 
        io = vvp_env_thread_io(env);
        io->ci_obj = ll_i2info(inode)->lli_clob;
+       ll_io_set_mirror(io, file);
 
        lsio = &io->u.ci_lseek;
        lsio->ls_start = offset;
@@ -4113,10 +4194,14 @@ loff_t ll_lseek(struct inode *inode, loff_t offset, int whence)
 
        do {
                rc = cl_io_init(env, io, CIT_LSEEK, io->ci_obj);
-               if (!rc)
+               if (!rc) {
+                       struct vvp_io *vio = vvp_env_io(env);
+
+                       vio->vui_fd = file->private_data;
                        rc = cl_io_loop(env, io);
-               else
+               } else {
                        rc = io->ci_result;
+               }
                retval = rc ? : lsio->ls_result;
                cl_io_fini(env, io);
        } while (unlikely(io->ci_need_restart));
@@ -4153,7 +4238,7 @@ static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
                cl_sync_file_range(inode, offset, OBD_OBJECT_EOF,
                                   CL_FSYNC_LOCAL, 0);
 
-               retval = ll_lseek(inode, offset, origin);
+               retval = ll_lseek(file, offset, origin);
                if (retval < 0)
                        return retval;
                retval = vfs_setpos(file, retval, ll_file_maxbytes(inode));
@@ -4930,7 +5015,7 @@ int ll_getattr_dentry(struct dentry *de, struct kstat *stat, u32 request_mask,
                 * restore the MDT holds the layout lock so the glimpse will
                 * block up to the end of restore (getattr will block)
                 */
-               if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
+               if (!test_bit(LLIF_FILE_RESTORING, &lli->lli_flags)) {
                        rc = ll_glimpse_size(inode);
                        if (rc < 0)
                                RETURN(rc);
@@ -5181,7 +5266,6 @@ int ll_inode_permission(struct inode *inode, int mask)
        struct root_squash_info *squash;
        struct cred *cred = NULL;
        const struct cred *old_cred = NULL;
-       cfs_cap_t cap;
        bool squash_id = false;
        ktime_t kstart = ktime_get();
 
@@ -5225,10 +5309,9 @@ int ll_inode_permission(struct inode *inode, int mask)
 
                cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
                cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
-               for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
-                       if (BIT(cap) & CFS_CAP_FS_MASK)
-                               cap_lower(cred->cap_effective, cap);
-               }
+               cred->cap_effective = cap_drop_nfsd_set(cred->cap_effective);
+               cred->cap_effective = cap_drop_fs_set(cred->cap_effective);
+
                old_cred = override_creds(cred);
        }