/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 */
/*
 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2017, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * Author: Peter Braam <braam@clusterfs.com>
 * Author: Phil Schwan <phil@clusterfs.com>
 * Author: Andreas Dilger <adilger@clusterfs.com>
 */
#define DEBUG_SUBSYSTEM S_LLITE

#include <lustre_dlm.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/sched.h>
#include <linux/user_namespace.h>
#ifdef HAVE_UIDGID_HEADER
# include <linux/uidgid.h>
#endif

#include <uapi/linux/lustre/lustre_ioctl.h>
#include <lustre_swab.h>

#include "cl_object.h"
#include "llite_internal.h"
#include "vvp_internal.h"
struct split_param {
        struct inode    *sp_inode;
        __u16            sp_mirror_id;
};

static int
ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);

static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
                          bool *lease_broken);
static struct ll_file_data *ll_file_data_get(void)
{
        struct ll_file_data *fd;

        OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
        if (fd == NULL)
                return NULL;

        fd->fd_write_failed = false;

        return fd;
}

static void ll_file_data_put(struct ll_file_data *fd)
{
        if (fd != NULL)
                OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
}
/**
 * Packs all the attributes into @op_data for the CLOSE rpc.
 */
static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
                             struct obd_client_handle *och)
{
        ll_prep_md_op_data(op_data, inode, NULL, NULL,
                           0, 0, LUSTRE_OPC_ANY, NULL);

        op_data->op_attr.ia_mode = inode->i_mode;
        op_data->op_attr.ia_atime = inode->i_atime;
        op_data->op_attr.ia_mtime = inode->i_mtime;
        op_data->op_attr.ia_ctime = inode->i_ctime;
        op_data->op_attr.ia_size = i_size_read(inode);
        op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
                                      ATTR_MTIME | ATTR_MTIME_SET |
                                      ATTR_CTIME);
        op_data->op_xvalid |= OP_XVALID_CTIME_SET;
        op_data->op_attr_blocks = inode->i_blocks;
        op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
        if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
                op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
        op_data->op_open_handle = och->och_open_handle;

        if (och->och_flags & FMODE_WRITE &&
            ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
                /* For HSM: if inode data has been modified, pack it so that
                 * the MDT can set the data-dirty flag in the archive. */
                op_data->op_bias |= MDS_DATA_MODIFIED;
}
/**
 * Perform a close, possibly with a bias.
 * The meaning of "data" depends on the value of "bias".
 *
 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 * swap layouts with.
 */
static int ll_close_inode_openhandle(struct inode *inode,
                                     struct obd_client_handle *och,
                                     enum mds_op_bias bias, void *data)
{
        struct obd_export *md_exp = ll_i2mdexp(inode);
        const struct ll_inode_info *lli = ll_i2info(inode);
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;
        int rc;
        ENTRY;

        if (class_exp2obd(md_exp) == NULL) {
                CERROR("%s: invalid MDC connection handle closing "DFID"\n",
                       ll_get_fsname(inode->i_sb, NULL, 0),
                       PFID(&lli->lli_fid));
                GOTO(out, rc = 0);
        }

        OBD_ALLOC_PTR(op_data);
        /* We leak openhandle and request here on error, but not much can be
         * done in the OOM case since the application won't retry the close
         * on error either. */
        if (op_data == NULL)
                GOTO(out, rc = -ENOMEM);

        ll_prepare_close(inode, op_data, och);
        switch (bias) {
        case MDS_CLOSE_LAYOUT_MERGE:
                /* merge blocks from the victim inode */
                op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
                op_data->op_attr.ia_valid |= ATTR_SIZE;
                op_data->op_xvalid |= OP_XVALID_BLOCKS;
                /* fallthrough */
        case MDS_CLOSE_LAYOUT_SPLIT:
        case MDS_CLOSE_LAYOUT_SWAP: {
                struct split_param *sp = data;

                LASSERT(data != NULL);
                op_data->op_bias |= bias;
                op_data->op_data_version = 0;
                op_data->op_lease_handle = och->och_lease_handle;
                if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
                        op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
                        op_data->op_mirror_id = sp->sp_mirror_id;
                } else {
                        op_data->op_fid2 = *ll_inode2fid(data);
                }
                break;
        }

        case MDS_CLOSE_RESYNC_DONE: {
                struct ll_ioc_lease *ioc = data;

                LASSERT(data != NULL);
                op_data->op_attr_blocks +=
                        ioc->lil_count * op_data->op_attr_blocks;
                op_data->op_attr.ia_valid |= ATTR_SIZE;
                op_data->op_xvalid |= OP_XVALID_BLOCKS;
                op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;

                op_data->op_lease_handle = och->och_lease_handle;
                op_data->op_data = &ioc->lil_ids[0];
                op_data->op_data_size =
                        ioc->lil_count * sizeof(ioc->lil_ids[0]);
                break;
        }

        case MDS_HSM_RELEASE:
                LASSERT(data != NULL);
                op_data->op_bias |= MDS_HSM_RELEASE;
                op_data->op_data_version = *(__u64 *)data;
                op_data->op_lease_handle = och->och_lease_handle;
                op_data->op_attr.ia_valid |= ATTR_SIZE;
                op_data->op_xvalid |= OP_XVALID_BLOCKS;
                break;

        default:
                LASSERT(data == NULL);
                break;
        }

        if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
                op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
        if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
                op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;

        rc = md_close(md_exp, op_data, och->och_mod, &req);
        if (rc != 0 && rc != -EINTR)
                CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
                       md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);

        if (rc == 0 && op_data->op_bias & bias) {
                struct mdt_body *body;

                body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
                if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
                        rc = -EBUSY;
        }

        ll_finish_md_op_data(op_data);
        EXIT;
out:
        md_clear_open_replay_data(md_exp, och);
        och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
        OBD_FREE_PTR(och);

        ptlrpc_req_finished(req);       /* This is close request */
        return rc;
}
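
/*
 * Illustrative only, not part of the build: a sketch of how a caller
 * drives the MDS_HSM_RELEASE bias handled above.  A data version sampled
 * with ll_data_version() is passed as "data"; cf. ll_hsm_release()
 * (not shown in this excerpt) for the real caller.
 */
#if 0
        __u64 data_version = 0;

        /* flush dirty pages and sample the file data version first */
        rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
        if (rc == 0)
                rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
                                               &data_version);
#endif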
int ll_md_real_close(struct inode *inode, fmode_t fmode)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct obd_client_handle **och_p;
        struct obd_client_handle *och;
        __u64 *och_usecount;
        int rc = 0;
        ENTRY;

        if (fmode & FMODE_WRITE) {
                och_p = &lli->lli_mds_write_och;
                och_usecount = &lli->lli_open_fd_write_count;
        } else if (fmode & FMODE_EXEC) {
                och_p = &lli->lli_mds_exec_och;
                och_usecount = &lli->lli_open_fd_exec_count;
        } else {
                LASSERT(fmode & FMODE_READ);
                och_p = &lli->lli_mds_read_och;
                och_usecount = &lli->lli_open_fd_read_count;
        }

        mutex_lock(&lli->lli_och_mutex);
        if (*och_usecount > 0) {
                /* There are still users of this handle, so skip
                 * freeing it. */
                mutex_unlock(&lli->lli_och_mutex);
                RETURN(0);
        }

        och = *och_p;
        *och_p = NULL;
        mutex_unlock(&lli->lli_och_mutex);

        if (och != NULL) {
                /* There might be a race and this handle may already
                 * be closed. */
                rc = ll_close_inode_openhandle(inode, och, 0, NULL);
        }

        RETURN(rc);
}
static int ll_md_close(struct inode *inode, struct file *file)
{
        union ldlm_policy_data policy = {
                .l_inodebits    = { MDS_INODELOCK_OPEN },
        };
        __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lustre_handle lockh;
        enum ldlm_mode lockmode;
        int rc = 0;
        ENTRY;

        /* clear group lock, if present */
        if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
                ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);

        if (fd->fd_lease_och != NULL) {
                bool lease_broken;

                /* Usually the lease is not released when the application
                 * crashes, so we need to release it here. */
                rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
                CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
                       PFID(&lli->lli_fid), rc, lease_broken);

                fd->fd_lease_och = NULL;
        }

        if (fd->fd_och != NULL) {
                rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
                fd->fd_och = NULL;
                GOTO(out, rc);
        }

        /* Let's see if we have a good enough OPEN lock on the file and
         * whether we can skip talking to the MDS */
        mutex_lock(&lli->lli_och_mutex);
        if (fd->fd_omode & FMODE_WRITE) {
                lockmode = LCK_CW;
                LASSERT(lli->lli_open_fd_write_count);
                lli->lli_open_fd_write_count--;
        } else if (fd->fd_omode & FMODE_EXEC) {
                lockmode = LCK_PR;
                LASSERT(lli->lli_open_fd_exec_count);
                lli->lli_open_fd_exec_count--;
        } else {
                lockmode = LCK_CR;
                LASSERT(lli->lli_open_fd_read_count);
                lli->lli_open_fd_read_count--;
        }
        mutex_unlock(&lli->lli_och_mutex);

        if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
                           LDLM_IBITS, &policy, lockmode, &lockh))
                rc = ll_md_real_close(inode, fd->fd_omode);

out:
        LUSTRE_FPRIVATE(file) = NULL;
        ll_file_data_put(fd);

        RETURN(rc);
}
/* While this returns an error code, the caller (fput()) does not check it,
 * so we need to make every effort to clean up all of our state here.  Also,
 * applications rarely check close errors, and even if an error is returned
 * they will not retry the close call.
 */
int ll_file_release(struct inode *inode, struct file *file)
{
        struct ll_file_data *fd;
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ll_inode_info *lli = ll_i2info(inode);
        int rc;
        ENTRY;

        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
               PFID(ll_inode2fid(inode)), inode);

        if (inode->i_sb->s_root != file_dentry(file))
                ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
        fd = LUSTRE_FPRIVATE(file);
        LASSERT(fd != NULL);

        /* The last ref on @file may not be the owner pid of statahead,
         * because parent and child processes can share the same file
         * handle. */
        if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
                ll_deauthorize_statahead(inode, fd);

        if (inode->i_sb->s_root == file_dentry(file)) {
                LUSTRE_FPRIVATE(file) = NULL;
                ll_file_data_put(fd);
                RETURN(0);
        }

        if (!S_ISDIR(inode->i_mode)) {
                if (lli->lli_clob != NULL)
                        lov_read_and_clear_async_rc(lli->lli_clob);
                lli->lli_async_rc = 0;
        }

        rc = ll_md_close(inode, file);

        if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
                libcfs_debug_dumplog();

        RETURN(rc);
}
static inline int ll_dom_readpage(void *data, struct page *page)
{
        struct niobuf_local *lnb = data;
        void *kaddr;

        kaddr = ll_kmap_atomic(page, KM_USER0);
        memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
        if (lnb->lnb_len < PAGE_SIZE)
                memset(kaddr + lnb->lnb_len, 0,
                       PAGE_SIZE - lnb->lnb_len);
        flush_dcache_page(page);
        SetPageUptodate(page);
        ll_kunmap_atomic(kaddr, KM_USER0);
        unlock_page(page);

        return 0;
}
void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
                        struct lookup_intent *it)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct cl_object *obj = lli->lli_clob;
        struct address_space *mapping = inode->i_mapping;
        struct page *vmpage;
        struct niobuf_remote *rnb;
        char *data;
        struct lu_env *env;
        struct cl_io *io;
        __u16 refcheck;
        struct lustre_handle lockh;
        struct ldlm_lock *lock;
        unsigned long index, start;
        struct niobuf_local lnb;
        int rc;
        bool dom_lock = false;
        ENTRY;

        if (obj == NULL)
                RETURN_EXIT;

        if (it->it_lock_mode != 0) {
                lockh.cookie = it->it_lock_handle;
                lock = ldlm_handle2lock(&lockh);
                if (lock != NULL)
                        dom_lock = ldlm_has_dom(lock);
                LDLM_LOCK_PUT(lock);
        }
        if (!dom_lock)
                RETURN_EXIT;

        env = cl_env_get(&refcheck);
        if (IS_ERR(env))
                RETURN_EXIT;

        if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
                                   RCL_SERVER))
                GOTO(out_env, rc = -ENODATA);

        rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
        data = (char *)rnb + sizeof(*rnb);

        if (rnb == NULL || rnb->rnb_len == 0)
                GOTO(out_env, rc = 0);

        CDEBUG(D_INFO, "Get data buffer along with open, len %i, i_size %llu\n",
               rnb->rnb_len, i_size_read(inode));

        io = vvp_env_thread_io(env);
        io->ci_obj = obj;
        io->ci_ignore_layout = 1;
        rc = cl_io_init(env, io, CIT_MISC, obj);
        if (rc)
                GOTO(out_io, rc);

        lnb.lnb_file_offset = rnb->rnb_offset;
        start = lnb.lnb_file_offset / PAGE_SIZE;
        index = 0;
        LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
        lnb.lnb_page_offset = 0;
        do {
                struct cl_page *clp;

                lnb.lnb_data = data + (index << PAGE_SHIFT);
                lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
                if (lnb.lnb_len > PAGE_SIZE)
                        lnb.lnb_len = PAGE_SIZE;

                vmpage = read_cache_page(mapping, index + start,
                                         ll_dom_readpage, &lnb);
                if (IS_ERR(vmpage)) {
                        CWARN("%s: cannot fill page %lu for "DFID
                              " with data: rc = %li\n",
                              ll_get_fsname(inode->i_sb, NULL, 0),
                              index + start, PFID(lu_object_fid(&obj->co_lu)),
                              PTR_ERR(vmpage));
                        break;
                }
                lock_page(vmpage);
                if (vmpage->mapping == NULL) {
                        unlock_page(vmpage);
                        put_page(vmpage);
                        /* page was truncated */
                        GOTO(out_io, rc = -ENODATA);
                }

                clp = cl_page_find(env, obj, vmpage->index, vmpage,
                                   CPT_CACHEABLE);
                if (IS_ERR(clp)) {
                        unlock_page(vmpage);
                        put_page(vmpage);
                        GOTO(out_io, rc = PTR_ERR(clp));
                }

                /* export the page and update read-ahead state */
                cl_page_export(env, clp, 1);
                cl_page_put(env, clp);
                unlock_page(vmpage);
                put_page(vmpage);
                index++;
        } while (rnb->rnb_len > (index << PAGE_SHIFT));
        rc = 0;
        EXIT;
out_io:
        cl_io_fini(env, io);
out_env:
        cl_env_put(env, &refcheck);
}
static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
                               struct lookup_intent *itp)
{
        struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
        struct dentry *parent = de->d_parent;
        const char *name = NULL;
        int len = 0;
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;
        int rc;
        ENTRY;

        LASSERT(parent != NULL);
        LASSERT(itp->it_flags & MDS_OPEN_BY_FID);

        /* if the server supports open-by-fid, or the file name is invalid,
         * don't pack the name in the open request */
        if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
            lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
                name = de->d_name.name;
                len = de->d_name.len;
        }

        op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
                                     name, len, 0, LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));
        op_data->op_data = lmm;
        op_data->op_data_size = lmmsize;

        rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
                            &ll_md_blocking_ast, 0);
        ll_finish_md_op_data(op_data);
        if (rc == -ESTALE) {
                /* reason for keeping our own exit path: don't flood the
                 * log with -ESTALE error messages.
                 */
                if (!it_disposition(itp, DISP_OPEN_OPEN) ||
                     it_open_error(DISP_OPEN_OPEN, itp))
                        GOTO(out, rc);
                ll_release_openhandle(de, itp);
                GOTO(out, rc);
        }

        if (it_disposition(itp, DISP_LOOKUP_NEG))
                GOTO(out, rc = -ENOENT);

        if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
                rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
                CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
                GOTO(out, rc);
        }

        rc = ll_prep_inode(&de->d_inode, req, NULL, itp);

        if (!rc && itp->it_lock_mode) {
                ll_dom_finish_open(de->d_inode, req, itp);
                ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
        }

out:
        ptlrpc_req_finished(req);
        ll_intent_drop_lock(itp);

        /* We did open by fid, but by the time we got to the server, the
         * object disappeared.  If this is a create, we cannot really tell
         * userspace that the file it was trying to create does not exist.
         * Instead return -ESTALE, and the VFS will retry the create with
         * LOOKUP_REVAL, which we catch in ll_revalidate_dentry() and use
         * lookup then.
         */
        if (rc == -ENOENT && itp->it_op & IT_CREAT)
                rc = -ESTALE;

        RETURN(rc);
}
static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
                       struct obd_client_handle *och)
{
        struct mdt_body *body;

        body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
        och->och_open_handle = body->mbo_open_handle;
        och->och_fid = body->mbo_fid1;
        och->och_lease_handle.cookie = it->it_lock_handle;
        och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
        och->och_flags = it->it_flags;

        return md_set_open_replay_data(md_exp, och, it);
}

static int ll_local_open(struct file *file, struct lookup_intent *it,
                         struct ll_file_data *fd, struct obd_client_handle *och)
{
        struct inode *inode = file_inode(file);
        ENTRY;

        LASSERT(!LUSTRE_FPRIVATE(file));
        LASSERT(fd != NULL);

        if (och) {
                int rc;

                rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
                if (rc != 0)
                        RETURN(rc);
        }

        LUSTRE_FPRIVATE(file) = fd;
        ll_readahead_init(inode, &fd->fd_ras);
        fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);

        /* ll_cl_context initialization */
        rwlock_init(&fd->fd_lock);
        INIT_LIST_HEAD(&fd->fd_lccs);

        RETURN(0);
}
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time.  If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until the ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used.  We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
int ll_file_open(struct inode *inode, struct file *file)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lookup_intent *it, oit = { .it_op = IT_OPEN,
                                          .it_flags = file->f_flags };
        struct obd_client_handle **och_p = NULL;
        __u64 *och_usecount = NULL;
        struct ll_file_data *fd;
        int rc = 0;
        ENTRY;

        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
               PFID(ll_inode2fid(inode)), inode, file->f_flags);

        it = file->private_data; /* XXX: compat macro */
        file->private_data = NULL; /* prevent ll_local_open assertion */

        fd = ll_file_data_get();
        if (fd == NULL)
                GOTO(out_nofiledata, rc = -ENOMEM);

        fd->fd_file = file;
        if (S_ISDIR(inode->i_mode))
                ll_authorize_statahead(inode, fd);

        if (inode->i_sb->s_root == file_dentry(file)) {
                LUSTRE_FPRIVATE(file) = fd;
                RETURN(0);
        }

        if (!it || !it->it_disposition) {
                /* Convert f_flags into access mode. We cannot use file->f_mode,
                 * because everything but the O_ACCMODE mask was stripped from
                 * there */
                if ((oit.it_flags + 1) & O_ACCMODE)
                        oit.it_flags++;
                if (file->f_flags & O_TRUNC)
                        oit.it_flags |= FMODE_WRITE;

                /* The kernel only calls f_op->open in dentry_open; filp_open
                 * calls dentry_open after a call to open_namei that checks
                 * permissions.  Only nfsd_open calls dentry_open directly
                 * without checking permissions, and because of that the code
                 * below is safe. */
                if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
                        oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

                /* We do not want O_EXCL here, presumably we opened the file
                 * already? XXX - NFS implications? */
                oit.it_flags &= ~O_EXCL;

                /* bug20584: if "it_flags" contains O_CREAT, the file will be
                 * created if necessary, so "IT_CREAT" should be set to stay
                 * consistent with it */
                if (oit.it_flags & O_CREAT)
                        oit.it_op |= IT_CREAT;

                it = &oit;
        }

restart:
        /* Let's see if we have the file open on the MDS already. */
        if (it->it_flags & FMODE_WRITE) {
                och_p = &lli->lli_mds_write_och;
                och_usecount = &lli->lli_open_fd_write_count;
        } else if (it->it_flags & FMODE_EXEC) {
                och_p = &lli->lli_mds_exec_och;
                och_usecount = &lli->lli_open_fd_exec_count;
        } else {
                och_p = &lli->lli_mds_read_och;
                och_usecount = &lli->lli_open_fd_read_count;
        }

        mutex_lock(&lli->lli_och_mutex);
        if (*och_p) { /* Open handle is present */
                if (it_disposition(it, DISP_OPEN_OPEN)) {
                        /* Well, there's an extra open request that we do not
                         * need; let's close it somehow.  This will decref the
                         * request. */
                        rc = it_open_error(DISP_OPEN_OPEN, it);
                        if (rc) {
                                mutex_unlock(&lli->lli_och_mutex);
                                GOTO(out_openerr, rc);
                        }

                        ll_release_openhandle(file_dentry(file), it);
                }
                (*och_usecount)++;

                rc = ll_local_open(file, it, fd, NULL);
                if (rc) {
                        (*och_usecount)--;
                        mutex_unlock(&lli->lli_och_mutex);
                        GOTO(out_openerr, rc);
                }
        } else {
                LASSERT(*och_usecount == 0);
                if (!it->it_disposition) {
                        struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
                        /* We cannot just request the lock handle now, since
                         * the new ELC code means one of the other OPEN locks
                         * for this file could be cancelled, and since the
                         * blocking ast handler would attempt to grab
                         * och_mutex as well, that would result in a
                         * deadlock */
                        mutex_unlock(&lli->lli_och_mutex);
                        /*
                         * Normally called under two situations:
                         * 1. NFS export.
                         * 2. A race/condition on MDS resulting in no open
                         *    handle to be returned from LOOKUP|OPEN request,
                         *    for example if the target entry was a symlink.
                         *
                         * Only fetch MDS_OPEN_LOCK if this is in NFS path,
                         * marked by a bit set in ll_iget_for_nfs. Clear the
                         * bit so that it's not confusing later callers.
                         *
                         * NB: when ldd is NULL, it must have come via the
                         * normal lookup path only, since ll_iget_for_nfs
                         * always calls ll_d_init().
                         */
                        if (ldd && ldd->lld_nfs_dentry) {
                                ldd->lld_nfs_dentry = 0;
                                it->it_flags |= MDS_OPEN_LOCK;
                        }

                        /*
                         * Always specify MDS_OPEN_BY_FID because we don't want
                         * to get a file with a different fid.
                         */
                        it->it_flags |= MDS_OPEN_BY_FID;
                        rc = ll_intent_file_open(file_dentry(file), NULL, 0,
                                                 it);
                        if (rc)
                                GOTO(out_openerr, rc);

                        goto restart;
                }
                OBD_ALLOC(*och_p, sizeof(struct obd_client_handle));
                if (!*och_p)
                        GOTO(out_och_free, rc = -ENOMEM);

                (*och_usecount)++;

                /* md_intent_lock() didn't get a request ref if there was an
                 * open error, so don't do cleanup on the request here.
                 */
                /* XXX (green): Should not we bail out on any error here, not
                 * just open error? */
                rc = it_open_error(DISP_OPEN_OPEN, it);
                if (rc != 0)
                        GOTO(out_och_free, rc);

                LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
                         "inode %p: disposition %x, status %d\n", inode,
                         it_disposition(it, ~0), it->it_status);

                rc = ll_local_open(file, it, fd, *och_p);
                if (rc)
                        GOTO(out_och_free, rc);
        }
        mutex_unlock(&lli->lli_och_mutex);
        fd = NULL;

        /* Must do this outside the lli_och_mutex lock to prevent a deadlock
         * where a different kind of OPEN lock for this same inode gets
         * cancelled by ldlm_cancel_lru */
        if (!S_ISREG(inode->i_mode))
                GOTO(out_och_free, rc);

        cl_lov_delay_create_clear(&file->f_flags);
        GOTO(out_och_free, rc);

out_och_free:
        if (rc) {
                if (och_p && *och_p) {
                        OBD_FREE(*och_p, sizeof(struct obd_client_handle));
                        *och_p = NULL; /* OBD_FREE writes some magic there */
                        (*och_usecount)--;
                }
                mutex_unlock(&lli->lli_och_mutex);

out_openerr:
                if (lli->lli_opendir_key == fd)
                        ll_deauthorize_statahead(inode, fd);
                if (fd != NULL)
                        ll_file_data_put(fd);
        } else {
                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
        }

out_nofiledata:
        if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
                ptlrpc_req_finished(it->it_request);
                it_clear_disposition(it, DISP_ENQ_OPEN_REF);
        }

        RETURN(rc);
}
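
/*
 * Illustrative only, not part of the build: a minimal userspace sketch of
 * the O_LOV_DELAY_CREATE flow described in the comment above ll_file_open().
 * The flag, ioctl, and struct come from <linux/lustre/lustre_user.h>; the
 * path and stripe values are arbitrary examples.
 */
#if 0
        struct lov_user_md_v1 lum = {
                .lmm_magic = LOV_USER_MAGIC_V1,
                .lmm_stripe_count = 4,          /* example value */
                .lmm_stripe_size = 1048576,     /* example value */
                .lmm_stripe_offset = -1,        /* let the MDS choose */
        };
        int fd = open("/mnt/lustre/newfile",
                      O_CREAT | O_WRONLY | O_LOV_DELAY_CREATE, 0644);

        /* objects are created on the OSTs only at setstripe time */
        if (fd >= 0 && ioctl(fd, LL_IOC_LOV_SETSTRIPE, &lum) < 0)
                perror("LL_IOC_LOV_SETSTRIPE");
#endif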
static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
                                    struct ldlm_lock_desc *desc,
                                    void *data, int flag)
{
        struct lustre_handle lockh;
        int rc;
        ENTRY;

        switch (flag) {
        case LDLM_CB_BLOCKING:
                ldlm_lock2handle(lock, &lockh);
                rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
                if (rc < 0) {
                        CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
                        RETURN(rc);
                }
                break;
        case LDLM_CB_CANCELING:
                /* do nothing */
                break;
        }
        RETURN(0);
}
/**
 * When setting a lease on a file, we take ownership of the lli_mds_*_och
 * and save it as fd->fd_och so as to force the client to reopen the file
 * even if it has an open lock in cache already.
 */
static int ll_lease_och_acquire(struct inode *inode, struct file *file,
                                struct lustre_handle *old_open_handle)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct obd_client_handle **och_p;
        __u64 *och_usecount;
        int rc = 0;
        ENTRY;

        /* Get the openhandle of the file */
        mutex_lock(&lli->lli_och_mutex);
        if (fd->fd_lease_och != NULL)
                GOTO(out_unlock, rc = -EBUSY);

        if (fd->fd_och == NULL) {
                if (file->f_mode & FMODE_WRITE) {
                        LASSERT(lli->lli_mds_write_och != NULL);
                        och_p = &lli->lli_mds_write_och;
                        och_usecount = &lli->lli_open_fd_write_count;
                } else {
                        LASSERT(lli->lli_mds_read_och != NULL);
                        och_p = &lli->lli_mds_read_och;
                        och_usecount = &lli->lli_open_fd_read_count;
                }

                if (*och_usecount > 1)
                        GOTO(out_unlock, rc = -EBUSY);

                fd->fd_och = *och_p;
                *och_p = NULL;
                *och_usecount = 0;
        }

        *old_open_handle = fd->fd_och->och_open_handle;

out_unlock:
        mutex_unlock(&lli->lli_och_mutex);
        RETURN(rc);
}
/**
 * Release ownership on lli_mds_*_och when putting back a file lease.
 */
static int ll_lease_och_release(struct inode *inode, struct file *file)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct obd_client_handle **och_p;
        struct obd_client_handle *old_och = NULL;
        __u64 *och_usecount;
        int rc = 0;
        ENTRY;

        mutex_lock(&lli->lli_och_mutex);
        if (file->f_mode & FMODE_WRITE) {
                och_p = &lli->lli_mds_write_och;
                och_usecount = &lli->lli_open_fd_write_count;
        } else {
                och_p = &lli->lli_mds_read_och;
                och_usecount = &lli->lli_open_fd_read_count;
        }

        /* The file may have been opened by another process (broken lease) so
         * *och_p is not NULL. In this case we should simply increase the
         * usecount and close fd_och.
         */
        if (*och_p != NULL) {
                old_och = fd->fd_och;
                (*och_usecount)++;
        } else {
                *och_p = fd->fd_och;
                *och_usecount = 1;
        }
        fd->fd_och = NULL;
        mutex_unlock(&lli->lli_och_mutex);

        if (old_och != NULL)
                rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);

        RETURN(rc);
}
/**
 * Acquire a lease and open the file.
 */
static struct obd_client_handle *
ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
              __u64 open_flags)
{
        struct lookup_intent it = { .it_op = IT_OPEN };
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;
        struct lustre_handle old_open_handle = { 0 };
        struct obd_client_handle *och = NULL;
        int rc;
        int rc2;
        ENTRY;

        if (fmode != FMODE_WRITE && fmode != FMODE_READ)
                RETURN(ERR_PTR(-EINVAL));

        if (file != NULL) {
                if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
                        RETURN(ERR_PTR(-EPERM));

                rc = ll_lease_och_acquire(inode, file, &old_open_handle);
                if (rc)
                        RETURN(ERR_PTR(rc));
        }

        OBD_ALLOC_PTR(och);
        if (och == NULL)
                RETURN(ERR_PTR(-ENOMEM));

        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
                                     LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                GOTO(out, rc = PTR_ERR(op_data));

        /* To tell the MDT this openhandle is from the same owner */
        op_data->op_open_handle = old_open_handle;

        it.it_flags = fmode | open_flags;
        it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
        rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
                            &ll_md_blocking_lease_ast,
        /* LDLM_FL_NO_LRU: to not put the lease lock into the LRU list,
         * otherwise it can be cancelled, which may mislead applications
         * into thinking the lease is broken;
         * LDLM_FL_EXCL: set this flag so that it won't be matched by a normal
         * open in ll_md_blocking_ast().  Otherwise, since
         * ll_md_blocking_lease_ast doesn't deal with the openhandle, a normal
         * openhandle would be leaked. */
                            LDLM_FL_NO_LRU | LDLM_FL_EXCL);
        ll_finish_md_op_data(op_data);
        ptlrpc_req_finished(req);
        if (rc < 0)
                GOTO(out_release_it, rc);

        if (it_disposition(&it, DISP_LOOKUP_NEG))
                GOTO(out_release_it, rc = -ENOENT);

        rc = it_open_error(DISP_OPEN_OPEN, &it);
        if (rc)
                GOTO(out_release_it, rc);

        LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
        ll_och_fill(sbi->ll_md_exp, &it, och);

        if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
                GOTO(out_close, rc = -EOPNOTSUPP);

        /* lease already acquired; handle the lease lock */
        ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
        if (it.it_lock_mode == 0 ||
            it.it_lock_bits != MDS_INODELOCK_OPEN) {
                /* open lock must be returned for a lease */
                CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
                       PFID(ll_inode2fid(inode)), it.it_lock_mode,
                       it.it_lock_bits);
                GOTO(out_close, rc = -EPROTO);
        }

        ll_intent_release(&it);
        RETURN(och);

out_close:
        /* Cancel open lock */
        if (it.it_lock_mode != 0) {
                ldlm_lock_decref_and_cancel(&och->och_lease_handle,
                                            it.it_lock_mode);
                it.it_lock_mode = 0;
                och->och_lease_handle.cookie = 0ULL;
        }
        rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
        if (rc2 < 0)
                CERROR("%s: error closing file "DFID": %d\n",
                       ll_get_fsname(inode->i_sb, NULL, 0),
                       PFID(&ll_i2info(inode)->lli_fid), rc2);
        och = NULL; /* och has been freed in ll_close_inode_openhandle() */
out_release_it:
        ll_intent_release(&it);
out:
        if (och != NULL)
                OBD_FREE_PTR(och);
        RETURN(ERR_PTR(rc));
}
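
/*
 * Illustrative only, not part of the build: a minimal userspace sketch of
 * taking and putting back a write lease, which ends up in ll_lease_open()
 * and ll_lease_close() above.  This assumes the classic long-argument form
 * of the lease ioctls from <linux/lustre/lustre_user.h>; newer releases
 * also carry a struct ll_ioc_lease form for resync.  Error handling elided.
 */
#if 0
        int fd = open("/mnt/lustre/file", O_WRONLY);

        ioctl(fd, LL_IOC_SET_LEASE, F_WRLCK);  /* acquire a write lease */
        /* ... application holds exclusive access here ... */
        ioctl(fd, LL_IOC_GET_LEASE);           /* check the lease is valid */
        ioctl(fd, LL_IOC_SET_LEASE, F_UNLCK);  /* put the lease back */
#endif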
/**
 * Check whether a layout swap can be done between two inodes.
 *
 * \param[in] inode1  First inode to check
 * \param[in] inode2  Second inode to check
 *
 * \retval 0 on success, layout swap can be performed between both inodes
 * \retval negative error code if requirements are not met
 */
static int ll_check_swap_layouts_validity(struct inode *inode1,
                                          struct inode *inode2)
{
        if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
                return -EINVAL;

        if (inode_permission(inode1, MAY_WRITE) ||
            inode_permission(inode2, MAY_WRITE))
                return -EPERM;

        if (inode1->i_sb != inode2->i_sb)
                return -EXDEV;

        return 0;
}

static int ll_swap_layouts_close(struct obd_client_handle *och,
                                 struct inode *inode, struct inode *inode2)
{
        const struct lu_fid *fid1 = ll_inode2fid(inode);
        const struct lu_fid *fid2;
        int rc;
        ENTRY;

        CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
               ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));

        rc = ll_check_swap_layouts_validity(inode, inode2);
        if (rc < 0)
                GOTO(out_free_och, rc);

        /* We now know that inode2 is a Lustre inode */
        fid2 = ll_inode2fid(inode2);

        rc = lu_fid_cmp(fid1, fid2);
        if (rc == 0)
                GOTO(out_free_och, rc = -EINVAL);

        /* Close the file and {swap,merge} layouts between inode & inode2.
         * NB: the lease lock handle is released in mdc_close_layout_swap_pack()
         * because we still need it to pack l_remote_handle to the MDT. */
        rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
                                       inode2);
        och = NULL; /* freed in ll_close_inode_openhandle() */

out_free_och:
        if (och != NULL)
                OBD_FREE_PTR(och);

        RETURN(rc);
}
/**
 * Release the lease and close the file.
 * It will check whether the lease has ever been broken.
 */
static int ll_lease_close_intent(struct obd_client_handle *och,
                                 struct inode *inode,
                                 bool *lease_broken, enum mds_op_bias bias,
                                 void *data)
{
        struct ldlm_lock *lock;
        bool cancelled = true;
        int rc;
        ENTRY;

        lock = ldlm_handle2lock(&och->och_lease_handle);
        if (lock != NULL) {
                lock_res_and_lock(lock);
                cancelled = ldlm_is_cancel(lock);
                unlock_res_and_lock(lock);
                LDLM_LOCK_PUT(lock);
        }

        CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
               PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);

        if (lease_broken != NULL)
                *lease_broken = cancelled;

        if (!cancelled && !bias)
                ldlm_cli_cancel(&och->och_lease_handle, 0);

        if (cancelled) { /* no need to execute intent */
                bias = 0;
                data = NULL;
        }

        rc = ll_close_inode_openhandle(inode, och, bias, data);
        RETURN(rc);
}

static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
                          bool *lease_broken)
{
        return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
}

/**
 * After a lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
 */
static int ll_lease_file_resync(struct obd_client_handle *och,
                                struct inode *inode)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct md_op_data *op_data;
        __u64 data_version_unused;
        int rc;
        ENTRY;

        op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
                                     LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        /* before starting file resync, it's necessary to clean up the page
         * cache in client memory, otherwise once the layout version is
         * increased, writing back cached data will be denied by the OSTs. */
        rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
        if (rc)
                GOTO(out, rc);

        op_data->op_lease_handle = och->och_lease_handle;
        rc = md_file_resync(sbi->ll_md_exp, op_data);
        if (rc)
                GOTO(out, rc);

        EXIT;
out:
        ll_finish_md_op_data(op_data);
        return rc;
}
int ll_merge_attr(const struct lu_env *env, struct inode *inode)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct cl_object *obj = lli->lli_clob;
        struct cl_attr *attr = vvp_env_thread_attr(env);
        s64 atime;
        s64 mtime;
        s64 ctime;
        int rc = 0;
        ENTRY;

        ll_inode_size_lock(inode);

        /* Merge timestamps the most recently obtained from the MDS with
         * timestamps obtained from the OSTs.
         *
         * Do not overwrite the atime of the inode, because it may be
         * refreshed by the file_accessed() function. If the read was served
         * from cached data, there is no RPC to be sent, so the atime may not
         * be transferred to the OSTs at all. The MDT only updates atime at
         * close time if it's at least 'mdd.*.atime_diff' older.
         * All in all, the atime in Lustre does not strictly comply with
         * POSIX. Solving this problem would require sending an RPC to the
         * MDT for each read, which would hurt performance. */
        if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
                LTIME_S(inode->i_atime) = lli->lli_atime;
                lli->lli_update_atime = 0;
        }
        LTIME_S(inode->i_mtime) = lli->lli_mtime;
        LTIME_S(inode->i_ctime) = lli->lli_ctime;

        atime = LTIME_S(inode->i_atime);
        mtime = LTIME_S(inode->i_mtime);
        ctime = LTIME_S(inode->i_ctime);

        cl_object_attr_lock(obj);
        if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
                rc = -EINVAL;
        else
                rc = cl_object_attr_get(env, obj, attr);
        cl_object_attr_unlock(obj);

        if (rc != 0)
                GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));

        if (atime < attr->cat_atime)
                atime = attr->cat_atime;

        if (ctime < attr->cat_ctime)
                ctime = attr->cat_ctime;

        if (mtime < attr->cat_mtime)
                mtime = attr->cat_mtime;

        CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
               PFID(&lli->lli_fid), attr->cat_size);

        i_size_write(inode, attr->cat_size);
        inode->i_blocks = attr->cat_blocks;

        LTIME_S(inode->i_atime) = atime;
        LTIME_S(inode->i_mtime) = mtime;
        LTIME_S(inode->i_ctime) = ctime;

out_size_unlock:
        ll_inode_size_unlock(inode);

        RETURN(rc);
}
/**
 * Set designated mirror for I/O.
 *
 * So far only read, write, and truncate can issue I/O to a
 * designated mirror.
 */
void ll_io_set_mirror(struct cl_io *io, const struct file *file)
{
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

        /* clear layout version for generic (non-resync) I/O in case it
         * carries a stale layout version due to I/O restart */
        io->ci_layout_version = 0;

        /* FLR: disable non-delay for designated mirror I/O because obviously
         * only one mirror is available */
        if (fd->fd_designated_mirror > 0) {
                io->ci_ndelay = 0;
                io->ci_designated_mirror = fd->fd_designated_mirror;
                io->ci_layout_version = fd->fd_layout_version;
                io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
                                 * info down to ptasks */
        }

        CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
               file->f_path.dentry->d_name.name, io->ci_designated_mirror);
}
static bool file_is_noatime(const struct file *file)
{
        const struct vfsmount *mnt = file->f_path.mnt;
        const struct inode *inode = file_inode((struct file *)file);

        /* Adapted from file_accessed() and touch_atime(). */
        if (file->f_flags & O_NOATIME)
                return true;

        if (inode->i_flags & S_NOATIME)
                return true;

        if (IS_NOATIME(inode))
                return true;

        if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
                return true;

        if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
                return true;

        if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
                return true;

        return false;
}
static int ll_file_io_ptask(struct cfs_ptask *ptask);

static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
{
        struct inode *inode = file_inode(file);
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

        memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
        init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
        io->u.ci_rw.rw_file = file;
        io->u.ci_rw.rw_ptask = ll_file_io_ptask;
        io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
        io->ci_lock_no_expand = fd->ll_lock_no_expand;

        if (iot == CIT_WRITE) {
                io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
                io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
                                         file->f_flags & O_DIRECT ||
                                         IS_SYNC(inode));
        }
        io->ci_obj = ll_i2info(inode)->lli_clob;
        io->ci_lockreq = CILR_MAYBE;
        if (ll_file_nolock(file)) {
                io->ci_lockreq = CILR_NEVER;
                io->ci_no_srvlock = 1;
        } else if (file->f_flags & O_APPEND) {
                io->ci_lockreq = CILR_MANDATORY;
        }
        io->ci_noatime = file_is_noatime(file);
        if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
                io->ci_pio = !io->u.ci_rw.rw_append;
        else
                io->ci_pio = 0;

        /* FLR: only use non-delay I/O for read, as there is only one
         * available mirror for write. */
        io->ci_ndelay = !(iot == CIT_WRITE);

        ll_io_set_mirror(io, file);
}
static int ll_file_io_ptask(struct cfs_ptask *ptask)
{
        struct cl_io_pt *pt = ptask->pt_cbdata;
        struct file *file = pt->cip_file;
        struct lu_env *env;
        struct cl_io *io;
        loff_t pos = pt->cip_pos;
        int rc;
        __u16 refcheck;
        ENTRY;

        CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
               file_dentry(file)->d_name.name,
               pt->cip_iot == CIT_READ ? "read" : "write",
               pos, pos + pt->cip_count);

        env = cl_env_get(&refcheck);
        if (IS_ERR(env))
                RETURN(PTR_ERR(env));

        io = vvp_env_thread_io(env);
        ll_io_init(io, file, pt->cip_iot);
        io->u.ci_rw.rw_iter = pt->cip_iter;
        io->u.ci_rw.rw_iocb = pt->cip_iocb;
        io->ci_pio = 0; /* It's already in a parallel task */

        rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
                           pt->cip_count - pt->cip_result);
        if (!rc) {
                struct vvp_io *vio = vvp_env_io(env);

                vio->vui_io_subtype = IO_NORMAL;
                vio->vui_fd = LUSTRE_FPRIVATE(file);

                ll_cl_add(file, env, io, LCC_RW);
                rc = cl_io_loop(env, io);
                ll_cl_remove(file, env);
        } else {
                /* cl_io_rw_init() handled IO */
                rc = io->ci_result;
        }

        if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
                if (io->ci_nob > 0)
                        io->ci_nob /= 2;
                rc = -EIO;
        }

        if (io->ci_nob > 0) {
                pt->cip_result += io->ci_nob;
                iov_iter_advance(&pt->cip_iter, io->ci_nob);
                pos += io->ci_nob;

                pt->cip_iocb.ki_pos = pos;
#ifdef HAVE_KIOCB_KI_LEFT
                pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
#elif defined(HAVE_KI_NBYTES)
                pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
#endif
        }

        cl_io_fini(env, io);
        cl_env_put(env, &refcheck);

        pt->cip_need_restart = io->ci_need_restart;

        CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
               file_dentry(file)->d_name.name,
               pt->cip_iot == CIT_READ ? "read" : "write",
               pt->cip_result, rc);

        RETURN(pt->cip_result > 0 ? 0 : rc);
}
static ssize_t
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
                   struct file *file, enum cl_io_type iot,
                   loff_t *ppos, size_t count)
{
        struct range_lock range;
        struct vvp_io *vio = vvp_env_io(env);
        struct inode *inode = file_inode(file);
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct cl_io *io;
        loff_t pos = *ppos;
        ssize_t result = 0;
        int rc = 0;
        unsigned retried = 0;
        bool restarted = false;
        ENTRY;

        CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
               file_dentry(file)->d_name.name,
               iot == CIT_READ ? "read" : "write", pos, pos + count);

restart:
        io = vvp_env_thread_io(env);
        ll_io_init(io, file, iot);
        if (args->via_io_subtype == IO_NORMAL) {
                io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
                io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
        }
        if (args->via_io_subtype != IO_NORMAL || restarted)
                io->ci_pio = 0;
        io->ci_ndelay_tried = retried;

        if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
                bool range_locked = false;

                if (file->f_flags & O_APPEND)
                        range_lock_init(&range, 0, LUSTRE_EOF);
                else
                        range_lock_init(&range, pos, pos + count - 1);

                vio->vui_fd = LUSTRE_FPRIVATE(file);
                vio->vui_io_subtype = args->via_io_subtype;

                switch (vio->vui_io_subtype) {
                case IO_NORMAL:
                        /* Direct IO reads must also take the range lock,
                         * or multiple reads will try to work on the same
                         * pages. See LU-6227 for details. */
                        if (((iot == CIT_WRITE) ||
                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
                            !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
                                CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
                                       RL_PARA(&range));
                                rc = range_lock(&lli->lli_write_tree, &range);
                                if (rc < 0)
                                        GOTO(out, rc);

                                range_locked = true;
                        }
                        break;
                case IO_SPLICE:
                        vio->u.splice.vui_pipe = args->u.splice.via_pipe;
                        vio->u.splice.vui_flags = args->u.splice.via_flags;
                        break;
                default:
                        CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
                        LBUG();
                }

                ll_cl_add(file, env, io, LCC_RW);
                if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
                    !lli->lli_inode_locked) {
                        inode_lock(inode);
                        lli->lli_inode_locked = 1;
                }
                rc = cl_io_loop(env, io);
                if (lli->lli_inode_locked) {
                        lli->lli_inode_locked = 0;
                        inode_unlock(inode);
                }
                ll_cl_remove(file, env);

                if (range_locked) {
                        CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
                               RL_PARA(&range));
                        range_unlock(&lli->lli_write_tree, &range);
                }
        } else {
                /* cl_io_rw_init() handled IO */
                rc = io->ci_result;
        }

        if (io->ci_nob > 0) {
                result += io->ci_nob;
                count  -= io->ci_nob;

                if (args->via_io_subtype == IO_NORMAL) {
                        iov_iter_advance(args->u.normal.via_iter, io->ci_nob);

                        /* CLIO is too complicated. See LU-11069. */
                        if (cl_io_is_append(io))
                                pos = io->u.ci_rw.rw_iocb.ki_pos;
                        else
                                pos += io->ci_nob;

                        args->u.normal.via_iocb->ki_pos = pos;
#ifdef HAVE_KIOCB_KI_LEFT
                        args->u.normal.via_iocb->ki_left = count;
#elif defined(HAVE_KI_NBYTES)
                        args->u.normal.via_iocb->ki_nbytes = count;
#endif
                } else {
                        /* for splice */
                        pos = io->u.ci_rw.rw_range.cir_pos;
                }
        }
out:
        cl_io_fini(env, io);

        CDEBUG(D_VFSTRACE,
               "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
               file->f_path.dentry->d_name.name,
               iot, rc, result, io->ci_need_restart);

        if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
                CDEBUG(D_VFSTRACE,
                       "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
                       file_dentry(file)->d_name.name,
                       iot == CIT_READ ? "read" : "write",
                       pos, pos + count, result, rc);
                /* preserve the tried count for FLR */
                retried = io->ci_ndelay_tried;
                restarted = true;
                goto restart;
        }

        if (iot == CIT_READ) {
                if (result > 0)
                        ll_stats_ops_tally(ll_i2sbi(inode),
                                           LPROC_LL_READ_BYTES, result);
        } else if (iot == CIT_WRITE) {
                if (result > 0) {
                        ll_stats_ops_tally(ll_i2sbi(inode),
                                           LPROC_LL_WRITE_BYTES, result);
                        fd->fd_write_failed = false;
                } else if (result == 0 && rc == 0) {
                        rc = io->ci_result;
                        if (rc < 0)
                                fd->fd_write_failed = true;
                        else
                                fd->fd_write_failed = false;
                } else if (rc != -ERESTARTSYS) {
                        fd->fd_write_failed = true;
                }
        }

        CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
               file_dentry(file)->d_name.name,
               iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);

        *ppos = pos;

        RETURN(result > 0 ? result : rc);
}
/**
 * The purpose of fast read is to overcome the per-I/O overhead and improve
 * IOPS, especially for small I/O.
 *
 * To serve a read request, CLIO has to create and initialize a cl_io and
 * then request a DLM lock. This has turned out to have significant overhead
 * and affects the performance of small I/O dramatically.
 *
 * It's not necessary to create a cl_io for each I/O. With the help of read
 * ahead, most of the pages being read are already in the memory cache and we
 * can read those pages directly: if the pages exist, the corresponding DLM
 * lock must exist, so the page content must be valid.
 *
 * In the fast read implementation, llite speculatively finds and reads pages
 * in the memory cache. There are three scenarios for fast read:
 *   - If the page exists and is uptodate, the kernel VM will provide the
 *     data and CLIO won't intervene;
 *   - If the page was brought into memory by read ahead, it will be exported
 *     and the read ahead parameters will be updated;
 *   - Otherwise the page is not in memory and we can't do fast read.
 *     Therefore, it will go back and invoke normal read, i.e., a cl_io will
 *     be created and a DLM lock will be requested.
 *
 * POSIX compliance: the POSIX standard states that read is intended to be
 * atomic. The Lustre read implementation is in line with the Linux kernel
 * read implementation, and neither of them complies with the POSIX standard
 * in this matter. Fast read doesn't make the situation worse on a single
 * node, but it may interleave write results from multiple nodes due to the
 * short read handling in ll_file_aio_read().
 *
 * \param env - lu_env
 * \param iocb - kiocb from kernel
 * \param iter - user space buffers where the data will be copied
 *
 * \retval - number of bytes read, or negative error code on failure.
 */
static ssize_t
ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
{
        ssize_t result;

        if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
                return 0;

        /* NB: we can't do direct IO for fast read because it will need a lock
         * to make the IO engine happy. */
        if (iocb->ki_filp->f_flags & O_DIRECT)
                return 0;

        result = generic_file_read_iter(iocb, iter);

        /* If the first page is not in cache, generic_file_aio_read() will
         * return -ENODATA.
         * See the corresponding code in ll_readpage(). */
        if (result == -ENODATA)
                result = 0;

        if (result > 0)
                ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
                                   LPROC_LL_READ_BYTES, result);

        return result;
}

/*
 * Read from a file (through the page cache).
 */
static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct lu_env *env;
        struct vvp_io_args *args;
        ssize_t result;
        ssize_t rc2;
        __u16 refcheck;

        result = ll_do_fast_read(iocb, to);
        if (result < 0 || iov_iter_count(to) == 0)
                GOTO(out, result);

        env = cl_env_get(&refcheck);
        if (IS_ERR(env))
                return PTR_ERR(env);

        args = ll_env_args(env, IO_NORMAL);
        args->u.normal.via_iter = to;
        args->u.normal.via_iocb = iocb;

        rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
                                 &iocb->ki_pos, iov_iter_count(to));
        if (rc2 > 0)
                result += rc2;
        else if (result == 0)
                result = rc2;

        cl_env_put(env, &refcheck);
out:
        return result;
}
/*
 * Similar trick to ll_do_fast_read; this improves write speed for tiny
 * writes.  If a page is already in the page cache and dirty (and some other
 * things - see ll_tiny_write_begin for the instantiation of these rules),
 * then we can write to it without doing a full I/O, because Lustre already
 * knows about it and will write it out.  This saves a lot of processing time.
 *
 * All writes here are within one page, so exclusion is handled by the page
 * lock on the vm page.  We do not do tiny writes for writes which touch
 * multiple pages because it's very unlikely multiple sequential pages are
 * already dirty.
 *
 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively
 * common and are unlikely to be to already dirty pages.
 *
 * Attribute updates are important here, we do them in ll_tiny_write_end.
 */
static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
{
        ssize_t count = iov_iter_count(iter);
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);
        ssize_t result = 0;

        ENTRY;

        /* Restrict writes to a single page and < PAGE_SIZE.  See comment at
         * top of function for why.
         */
        if (count >= PAGE_SIZE ||
            (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
                RETURN(0);

        result = __generic_file_write_iter(iocb, iter);

        /* If the page is not already dirty, ll_tiny_write_begin returns
         * -ENODATA.  We continue on to normal write.
         */
        if (result == -ENODATA)
                result = 0;

        if (result > 0) {
                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
                                   result);
                ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
        }

        CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);

        RETURN(result);
}

/*
 * Write to a file (through the page cache).
 */
static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct vvp_io_args *args;
        struct lu_env *env;
        ssize_t rc_tiny = 0, rc_normal;
        __u16 refcheck;

        ENTRY;

        /* NB: we can't do direct IO for tiny writes because they use the page
         * cache, we can't do sync writes because tiny writes can't flush
         * pages, and we can't do append writes because we can't guarantee the
         * required DLM locks are held to protect file size.
         */
        if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
            !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
                rc_tiny = ll_do_tiny_write(iocb, from);

        /* In case of error, go on and try normal write - only stop if the
         * tiny write completed the I/O.
         */
        if (iov_iter_count(from) == 0)
                GOTO(out, rc_normal = rc_tiny);

        env = cl_env_get(&refcheck);
        if (IS_ERR(env))
                return PTR_ERR(env);

        args = ll_env_args(env, IO_NORMAL);
        args->u.normal.via_iter = from;
        args->u.normal.via_iocb = iocb;

        rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
                                       &iocb->ki_pos, iov_iter_count(from));

        /* On success, combine bytes written. */
        if (rc_tiny >= 0 && rc_normal > 0)
                rc_normal += rc_tiny;
        /* On error, only return the error from the normal write if the tiny
         * write did not write any bytes.  Otherwise return the bytes written
         * by the tiny write.
         */
        else if (rc_tiny > 0)
                rc_normal = rc_tiny;

        cl_env_put(env, &refcheck);
out:
        RETURN(rc_normal);
}
#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
/*
 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 */
static int ll_file_get_iov_count(const struct iovec *iov,
                                 unsigned long *nr_segs, size_t *count)
{
        size_t cnt = 0;
        unsigned long seg;

        for (seg = 0; seg < *nr_segs; seg++) {
                const struct iovec *iv = &iov[seg];

                /*
                 * If any segment has a negative length, or the cumulative
                 * length ever wraps negative then return -EINVAL.
                 */
                cnt += iv->iov_len;
                if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
                        return -EINVAL;
                if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
                        continue;
                if (seg == 0)
                        return -EFAULT;
                *nr_segs = seg;
                cnt -= iv->iov_len;     /* This segment is no good */
                break;
        }
        *count = cnt;
        return 0;
}

static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                                unsigned long nr_segs, loff_t pos)
{
        struct iov_iter to;
        size_t iov_count;
        ssize_t result;
        ENTRY;

        result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
        if (result)
                RETURN(result);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
        iov_iter_init(&to, READ, iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
        iov_iter_init(&to, iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

        result = ll_file_read_iter(iocb, &to);

        RETURN(result);
}

static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
                            loff_t *ppos)
{
        struct iovec iov = { .iov_base = buf, .iov_len = count };
        struct kiocb kiocb;
        ssize_t result;
        ENTRY;

        init_sync_kiocb(&kiocb, file);
        kiocb.ki_pos = *ppos;
#ifdef HAVE_KIOCB_KI_LEFT
        kiocb.ki_left = count;
#elif defined(HAVE_KI_NBYTES)
        kiocb.ki_nbytes = count;
#endif

        result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
        *ppos = kiocb.ki_pos;

        RETURN(result);
}

/*
 * Write to a file (through the page cache).
 */
static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                                 unsigned long nr_segs, loff_t pos)
{
        struct iov_iter from;
        size_t iov_count;
        ssize_t result;
        ENTRY;

        result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
        if (result)
                RETURN(result);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
        iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
        iov_iter_init(&from, iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

        result = ll_file_write_iter(iocb, &from);

        RETURN(result);
}

static ssize_t ll_file_write(struct file *file, const char __user *buf,
                             size_t count, loff_t *ppos)
{
        struct iovec iov = { .iov_base = (void __user *)buf,
                             .iov_len = count };
        struct kiocb kiocb;
        ssize_t result;
        ENTRY;

        init_sync_kiocb(&kiocb, file);
        kiocb.ki_pos = *ppos;
#ifdef HAVE_KIOCB_KI_LEFT
        kiocb.ki_left = count;
#elif defined(HAVE_KI_NBYTES)
        kiocb.ki_nbytes = count;
#endif

        result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
        *ppos = kiocb.ki_pos;

        RETURN(result);
}
#endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
/*
 * Send file content (through the pagecache) somewhere with a helper
 */
static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
                                   struct pipe_inode_info *pipe, size_t count,
                                   unsigned int flags)
{
        struct lu_env *env;
        struct vvp_io_args *args;
        ssize_t result;
        __u16 refcheck;
        ENTRY;

        env = cl_env_get(&refcheck);
        if (IS_ERR(env))
                RETURN(PTR_ERR(env));

        args = ll_env_args(env, IO_SPLICE);
        args->u.splice.via_pipe = pipe;
        args->u.splice.via_flags = flags;

        result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
        cl_env_put(env, &refcheck);
        RETURN(result);
}
int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
                             __u64 flags, struct lov_user_md *lum, int lum_size)
{
        struct lookup_intent oit = {
                .it_op = IT_OPEN,
                .it_flags = flags | MDS_OPEN_BY_FID,
        };
        int rc;
        ENTRY;

        ll_inode_size_lock(inode);
        rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
        if (rc < 0)
                GOTO(out_unlock, rc);

        ll_release_openhandle(dentry, &oit);

out_unlock:
        ll_inode_size_unlock(inode);
        ll_intent_release(&oit);

        RETURN(rc);
}
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
                             struct lov_mds_md **lmmp, int *lmm_size,
                             struct ptlrpc_request **request)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct mdt_body *body;
        struct lov_mds_md *lmm = NULL;
        struct ptlrpc_request *req = NULL;
        struct md_op_data *op_data;
        int rc, lmmsize;

        rc = ll_get_default_mdsize(sbi, &lmmsize);
        if (rc)
                RETURN(rc);

        op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
                                     strlen(filename), lmmsize,
                                     LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
        rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
        ll_finish_md_op_data(op_data);
        if (rc < 0) {
                CDEBUG(D_INFO, "md_getattr_name failed "
                       "on %s: rc %d\n", filename, rc);
                GOTO(out, rc);
        }

        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
        LASSERT(body != NULL); /* checked by mdc_getattr_name */

        lmmsize = body->mbo_eadatasize;

        if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
            lmmsize == 0)
                GOTO(out, rc = -ENODATA);

        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
        LASSERT(lmm != NULL);

        if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
            lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
            lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
                GOTO(out, rc = -EPROTO);

        /*
         * This is coming from the MDS, so it is probably in
         * little endian. We convert it to host endian before
         * passing it to userspace.
         */
        if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
                int stripe_count = 0;

                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
                    lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
                        stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
                        if (le32_to_cpu(lmm->lmm_pattern) &
                            LOV_PATTERN_F_RELEASED)
                                stripe_count = 0;
                }

                /* if the function is called for a directory, we should
                 * avoid swabbing non-existent lsm objects */
                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
                        lustre_swab_lov_user_md_v1(
                                        (struct lov_user_md_v1 *)lmm);
                        if (S_ISREG(body->mbo_mode))
                                lustre_swab_lov_user_md_objects(
                                    ((struct lov_user_md_v1 *)lmm)->lmm_objects,
                                    stripe_count);
                } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
                        lustre_swab_lov_user_md_v3(
                                        (struct lov_user_md_v3 *)lmm);
                        if (S_ISREG(body->mbo_mode))
                                lustre_swab_lov_user_md_objects(
                                    ((struct lov_user_md_v3 *)lmm)->lmm_objects,
                                    stripe_count);
                } else if (lmm->lmm_magic ==
                           cpu_to_le32(LOV_MAGIC_COMP_V1)) {
                        lustre_swab_lov_comp_md_v1(
                                        (struct lov_comp_md_v1 *)lmm);
                }
        }

out:
        *lmmp = lmm;
        *lmm_size = lmmsize;
        *request = req;
        return rc;
}
static int ll_lov_setea(struct inode *inode, struct file *file,
                        void __user *arg)
{
        __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
        struct lov_user_md *lump;
        int lum_size = sizeof(struct lov_user_md) +
                       sizeof(struct lov_user_ost_data);
        int rc;
        ENTRY;

        if (!cfs_capable(CFS_CAP_SYS_ADMIN))
                RETURN(-EPERM);

        OBD_ALLOC_LARGE(lump, lum_size);
        if (lump == NULL)
                RETURN(-ENOMEM);

        if (copy_from_user(lump, arg, lum_size))
                GOTO(out_lump, rc = -EFAULT);

        rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
                                      lum_size);
        cl_lov_delay_create_clear(&file->f_flags);

out_lump:
        OBD_FREE_LARGE(lump, lum_size);
        RETURN(rc);
}
static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
{
        struct lu_env *env;
        __u16 refcheck;
        int rc;
        ENTRY;

        env = cl_env_get(&refcheck);
        if (IS_ERR(env))
                RETURN(PTR_ERR(env));

        rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
        cl_env_put(env, &refcheck);
        RETURN(rc);
}

static int ll_lov_setstripe(struct inode *inode, struct file *file,
                            void __user *arg)
{
        struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
        struct lov_user_md *klum;
        int lum_size, rc;
        __u64 flags = FMODE_WRITE;
        ENTRY;

        rc = ll_copy_user_md(lum, &klum);
        if (rc < 0)
                RETURN(rc);

        lum_size = rc;
        rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
                                      lum_size);
        if (!rc) {
                __u32 gen;

                rc = put_user(0, &lum->lmm_stripe_count);
                if (rc)
                        GOTO(out, rc);

                rc = ll_layout_refresh(inode, &gen);
                if (rc)
                        GOTO(out, rc);

                rc = ll_file_getstripe(inode, arg, lum_size);
        }
        cl_lov_delay_create_clear(&file->f_flags);

out:
        OBD_FREE(klum, lum_size);
        RETURN(rc);
}
2176 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2178 struct ll_inode_info *lli = ll_i2info(inode);
2179 struct cl_object *obj = lli->lli_clob;
2180 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2181 struct ll_grouplock grouplock;
2186 CWARN("group id for group lock must not be 0\n");
2190 if (ll_file_nolock(file))
2191 RETURN(-EOPNOTSUPP);
2193 spin_lock(&lli->lli_lock);
2194 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2195 CWARN("group lock already existed with gid %lu\n",
2196 fd->fd_grouplock.lg_gid);
2197 spin_unlock(&lli->lli_lock);
2200 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2201 spin_unlock(&lli->lli_lock);
2204 * XXX: group lock needs to protect all OST objects while PFL
2205 * can add new OST objects during the IO, so we'd instantiate
2206 * all OST objects before getting its group lock.
2211 struct cl_layout cl = {
2212 .cl_is_composite = false,
2214 struct lu_extent ext = {
2216 .e_end = OBD_OBJECT_EOF,
2219 env = cl_env_get(&refcheck);
2221 RETURN(PTR_ERR(env));
2223 rc = cl_object_layout_get(env, obj, &cl);
2224 if (!rc && cl.cl_is_composite)
2225 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2228 cl_env_put(env, &refcheck);
2233 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2234 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2238 spin_lock(&lli->lli_lock);
2239 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2240 spin_unlock(&lli->lli_lock);
2241 CERROR("another thread just won the race\n");
2242 cl_put_grouplock(&grouplock);
2246 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2247 fd->fd_grouplock = grouplock;
2248 spin_unlock(&lli->lli_lock);
2250 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2254 static int ll_put_grouplock(struct inode *inode, struct file *file,
2257 struct ll_inode_info *lli = ll_i2info(inode);
2258 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2259 struct ll_grouplock grouplock;
2262 spin_lock(&lli->lli_lock);
2263 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2264 spin_unlock(&lli->lli_lock);
2265 CWARN("no group lock held\n");
2269 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2271 if (fd->fd_grouplock.lg_gid != arg) {
2272 CWARN("group lock %lu doesn't match current id %lu\n",
2273 arg, fd->fd_grouplock.lg_gid);
2274 spin_unlock(&lli->lli_lock);
2278 grouplock = fd->fd_grouplock;
2279 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2280 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2281 spin_unlock(&lli->lli_lock);
2283 cl_put_grouplock(&grouplock);
2284 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2289 * Close inode open handle
2291 * \param dentry [in] dentry which contains the inode
2292 * \param it [in,out] intent which contains open info and result
2295 * \retval <0 failure
2297 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2299 struct inode *inode = dentry->d_inode;
2300 struct obd_client_handle *och;
2306 /* Root ? Do nothing. */
2307 if (dentry->d_inode->i_sb->s_root == dentry)
2310 /* No open handle to close? Move away */
2311 if (!it_disposition(it, DISP_OPEN_OPEN))
2314 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2316 OBD_ALLOC(och, sizeof(*och));
2318 GOTO(out, rc = -ENOMEM);
2320 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2322 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2324 /* this one is in place of ll_file_open */
2325 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2326 ptlrpc_req_finished(it->it_request);
2327 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2333 * Get the size of the inode for which the FIEMAP mapping is requested.
2334 * Make the FIEMAP get_info call and return the result.
2335 * \param fiemap kernel buffer to hold extents
2336 * \param num_bytes kernel buffer size
2338 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2344 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2347 /* Checks for fiemap flags */
2348 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2349 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2353 /* Check for FIEMAP_FLAG_SYNC */
2354 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2355 rc = filemap_fdatawrite(inode->i_mapping);
2360 env = cl_env_get(&refcheck);
2362 RETURN(PTR_ERR(env));
2364 if (i_size_read(inode) == 0) {
2365 rc = ll_glimpse_size(inode);
2370 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2371 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2372 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2374 /* If the file size is 0, then there are no objects to map */
2375 if (fmkey.lfik_oa.o_size == 0) {
2376 fiemap->fm_mapped_extents = 0;
2380 fmkey.lfik_fiemap = *fiemap;
2382 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2383 &fmkey, fiemap, &num_bytes);
2385 cl_env_put(env, &refcheck);
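/*
 * For orientation, a hedged sketch of the standard FIEMAP ioctl that
 * eventually reaches ll_do_fiemap() via ll_fiemap() below (error
 * handling omitted; 32 extents is an arbitrary example size):
 *
 *	struct fiemap *fm = calloc(1, sizeof(*fm) +
 *				   32 * sizeof(struct fiemap_extent));
 *	fm->fm_length = FIEMAP_MAX_OFFSET;	// map the whole file
 *	fm->fm_extent_count = 32;
 *	ioctl(fd, FS_IOC_FIEMAP, fm);
 */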
2389 int ll_fid2path(struct inode *inode, void __user *arg)
2391 struct obd_export *exp = ll_i2mdexp(inode);
2392 const struct getinfo_fid2path __user *gfin = arg;
2394 struct getinfo_fid2path *gfout;
2400 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2401 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2404 /* Only need to get the buflen */
2405 if (get_user(pathlen, &gfin->gf_pathlen))
2408 if (pathlen > PATH_MAX)
2411 outsize = sizeof(*gfout) + pathlen;
2412 OBD_ALLOC(gfout, outsize);
2416 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2417 GOTO(gf_free, rc = -EFAULT);
2418 /* append root FID after gfout to let the MDT know the root FID so that
2419 * it can look up the correct path; this is mainly for filesets.
2420 * Old servers without fileset mount support will ignore this. */
2421 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2423 /* Call mdc_iocontrol */
2424 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2428 if (copy_to_user(arg, gfout, outsize))
2432 OBD_FREE(gfout, outsize);
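/*
 * Hedged userspace sketch of the FID2PATH call handled above, assuming
 * the usual getinfo_fid2path layout (gf_fid, gf_recno, gf_linkno,
 * gf_pathlen, gf_u.gf_path):
 *
 *	struct getinfo_fid2path *gf;
 *
 *	gf = calloc(1, sizeof(*gf) + PATH_MAX);
 *	gf->gf_fid = fid;		// FID to resolve
 *	gf->gf_recno = -1;		// latest record
 *	gf->gf_linkno = 0;		// first hard link
 *	gf->gf_pathlen = PATH_MAX;
 *	if (ioctl(fd, OBD_IOC_FID2PATH, gf) == 0)
 *		printf("%s\n", gf->gf_u.gf_path);
 */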
2437 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2439 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2447 ioc->idv_version = 0;
2448 ioc->idv_layout_version = UINT_MAX;
2450 /* If no file object is initialized, we consider its version to be 0. */
2454 env = cl_env_get(&refcheck);
2456 RETURN(PTR_ERR(env));
2458 io = vvp_env_thread_io(env);
2460 io->u.ci_data_version.dv_data_version = 0;
2461 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2462 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2465 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2466 result = cl_io_loop(env, io);
2468 result = io->ci_result;
2470 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2471 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2473 cl_io_fini(env, io);
2475 if (unlikely(io->ci_need_restart))
2478 cl_env_put(env, &refcheck);
2484 * Read the data_version for the inode.
2486 * This value is computed using the stripe object versions on the OSTs.
2487 * The version is computed using server-side locking.
2489 * @param flags whether to sync on the OST side;
2491 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2492 * LL_DV_WR_FLUSH: drop all cached pages, LCK_PW on OSTs
2494 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2496 struct ioc_data_version ioc = { .idv_flags = flags };
2499 rc = ll_ioc_data_version(inode, &ioc);
2501 *data_version = ioc.idv_version;
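/*
 * Example in-kernel use of the helper above (a sketch): a caller that
 * wants a version stable against cached writes would pass
 * LL_DV_WR_FLUSH instead, as ll_hsm_release() does below:
 *
 *	__u64 dv = 0;
 *	rc = ll_data_version(inode, &dv, LL_DV_RD_FLUSH);
 *	if (rc == 0)
 *		CDEBUG(D_INODE, "data version %llu\n", dv);
 */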
2507 * Trigger a HSM release request for the provided inode.
2509 int ll_hsm_release(struct inode *inode)
2512 struct obd_client_handle *och = NULL;
2513 __u64 data_version = 0;
2518 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2519 ll_get_fsname(inode->i_sb, NULL, 0),
2520 PFID(&ll_i2info(inode)->lli_fid));
2522 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2524 GOTO(out, rc = PTR_ERR(och));
2526 /* Grab latest data_version and [am]time values */
2527 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2531 env = cl_env_get(&refcheck);
2533 GOTO(out, rc = PTR_ERR(env));
2535 rc = ll_merge_attr(env, inode);
2536 cl_env_put(env, &refcheck);
2538 /* If an error happens, we have the wrong size for the file.
2544 /* Release the file.
2545 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2546 * we still need it to pack l_remote_handle to MDT. */
2547 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2553 if (och != NULL && !IS_ERR(och)) /* close the file */
2554 ll_lease_close(och, inode, NULL);
2559 struct ll_swap_stack {
2562 struct inode *inode1;
2563 struct inode *inode2;
2568 static int ll_swap_layouts(struct file *file1, struct file *file2,
2569 struct lustre_swap_layouts *lsl)
2571 struct mdc_swap_layouts msl;
2572 struct md_op_data *op_data;
2575 struct ll_swap_stack *llss = NULL;
2578 OBD_ALLOC_PTR(llss);
2582 llss->inode1 = file_inode(file1);
2583 llss->inode2 = file_inode(file2);
2585 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2589 /* we use 2 bools because they are easier to swap than 2 bits */
2590 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2591 llss->check_dv1 = true;
2593 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2594 llss->check_dv2 = true;
2596 /* we cannot use lsl->sl_dvX directly because we may swap them */
2597 llss->dv1 = lsl->sl_dv1;
2598 llss->dv2 = lsl->sl_dv2;
2600 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2601 if (rc == 0) /* same file, done! */
2604 if (rc < 0) { /* sequentialize it */
2605 swap(llss->inode1, llss->inode2);
2607 swap(llss->dv1, llss->dv2);
2608 swap(llss->check_dv1, llss->check_dv2);
2612 if (gid != 0) { /* application asks to flush dirty cache */
2613 rc = ll_get_grouplock(llss->inode1, file1, gid);
2617 rc = ll_get_grouplock(llss->inode2, file2, gid);
2619 ll_put_grouplock(llss->inode1, file1, gid);
2624 /* final check: before swapping the layouts, we check whether the
2625 * data version has changed (if requested) */
2626 if (llss->check_dv1) {
2627 rc = ll_data_version(llss->inode1, &dv, 0);
2630 if (dv != llss->dv1)
2631 GOTO(putgl, rc = -EAGAIN);
2634 if (llss->check_dv2) {
2635 rc = ll_data_version(llss->inode2, &dv, 0);
2638 if (dv != llss->dv2)
2639 GOTO(putgl, rc = -EAGAIN);
2642 /* struct md_op_data is used to send the swap args to the MDT;
2643 * only the flags are missing, so we pass struct mdc_swap_layouts
2644 * through md_op_data->op_data */
2645 /* flags from user space have to be converted before they are sent to
2646 * the server; no flag is sent today, they are only used on the client */
2649 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2650 0, LUSTRE_OPC_ANY, &msl);
2651 if (IS_ERR(op_data))
2652 GOTO(free, rc = PTR_ERR(op_data));
2654 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2655 sizeof(*op_data), op_data, NULL);
2656 ll_finish_md_op_data(op_data);
2663 ll_put_grouplock(llss->inode2, file2, gid);
2664 ll_put_grouplock(llss->inode1, file1, gid);
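/*
 * Hedged userspace sketch of a layout swap driving the function above,
 * with data-version checks and a group lock to flush dirty cache:
 *
 *	struct lustre_swap_layouts lsl = {
 *		.sl_fd    = fd2,
 *		.sl_flags = SWAP_LAYOUTS_CHECK_DV1 | SWAP_LAYOUTS_CHECK_DV2,
 *		.sl_gid   = 4711,	// non-zero: take the group lock
 *		.sl_dv1   = dv1,	// expected data versions
 *		.sl_dv2   = dv2,
 *	};
 *	ioctl(fd1, LL_IOC_LOV_SWAP_LAYOUTS, &lsl);
 */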
2674 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2676 struct md_op_data *op_data;
2680 /* Detect out-of-range masks */
2681 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2684 /* Non-root users are forbidden to set or clear flags which are
2685 * NOT defined in HSM_USER_MASK. */
2686 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2687 !cfs_capable(CFS_CAP_SYS_ADMIN))
2690 /* Detect out-of-range archive id */
2691 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2692 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2695 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2696 LUSTRE_OPC_ANY, hss);
2697 if (IS_ERR(op_data))
2698 RETURN(PTR_ERR(op_data));
2700 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2701 sizeof(*op_data), op_data, NULL);
2703 ll_finish_md_op_data(op_data);
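/*
 * Illustrative userspace call mirroring the checks above (a hedged
 * sketch; HS_DIRTY is within HSM_USER_MASK, so no CAP_SYS_ADMIN is
 * needed for it):
 *
 *	struct hsm_state_set hss = {
 *		.hss_valid   = HSS_SETMASK,
 *		.hss_setmask = HS_DIRTY,
 *	};
 *	ioctl(fd, LL_IOC_HSM_STATE_SET, &hss);
 */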
2708 static int ll_hsm_import(struct inode *inode, struct file *file,
2709 struct hsm_user_import *hui)
2711 struct hsm_state_set *hss = NULL;
2712 struct iattr *attr = NULL;
2716 if (!S_ISREG(inode->i_mode))
2722 GOTO(out, rc = -ENOMEM);
2724 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2725 hss->hss_archive_id = hui->hui_archive_id;
2726 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2727 rc = ll_hsm_state_set(inode, hss);
2731 OBD_ALLOC_PTR(attr);
2733 GOTO(out, rc = -ENOMEM);
2735 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2736 attr->ia_mode |= S_IFREG;
2737 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2738 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2739 attr->ia_size = hui->hui_size;
2740 attr->ia_mtime.tv_sec = hui->hui_mtime;
2741 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2742 attr->ia_atime.tv_sec = hui->hui_atime;
2743 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2745 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2746 ATTR_UID | ATTR_GID |
2747 ATTR_MTIME | ATTR_MTIME_SET |
2748 ATTR_ATIME | ATTR_ATIME_SET;
2752 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2756 inode_unlock(inode);
2768 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2770 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2771 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2774 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2776 struct inode *inode = file_inode(file);
2778 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2779 ATTR_MTIME | ATTR_MTIME_SET |
2782 .tv_sec = lfu->lfu_atime_sec,
2783 .tv_nsec = lfu->lfu_atime_nsec,
2786 .tv_sec = lfu->lfu_mtime_sec,
2787 .tv_nsec = lfu->lfu_mtime_nsec,
2790 .tv_sec = lfu->lfu_ctime_sec,
2791 .tv_nsec = lfu->lfu_ctime_nsec,
2797 if (!capable(CAP_SYS_ADMIN))
2800 if (!S_ISREG(inode->i_mode))
2804 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2806 inode_unlock(inode);
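/*
 * Hedged userspace sketch of LL_IOC_FUTIMES_3 as consumed above; all
 * three timestamps are set at once, hence the CAP_SYS_ADMIN check:
 *
 *	struct ll_futimes_3 lfu = {
 *		.lfu_atime_sec = ts, .lfu_atime_nsec = 0,
 *		.lfu_mtime_sec = ts, .lfu_mtime_nsec = 0,
 *		.lfu_ctime_sec = ts, .lfu_ctime_nsec = 0,
 *	};
 *	ioctl(fd, LL_IOC_FUTIMES_3, &lfu);
 */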
2811 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2814 case MODE_READ_USER:
2816 case MODE_WRITE_USER:
2823 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2825 /* Used to allow the upper layers of the client to request an LDLM lock
2826 * without doing an actual read or write.
2828 * Used for ladvise lockahead to manually request specific locks.
2830 * \param[in] file file this ladvise lock request is on
2831 * \param[in] ladvise ladvise struct describing this lock request
2833 * \retval 0 success, no detailed result available (sync requests
2834 * and requests sent to the server [not handled locally]
2835 * cannot return detailed results)
2836 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2837 * see definitions for details.
2838 * \retval negative negative errno on error
2840 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2842 struct lu_env *env = NULL;
2843 struct cl_io *io = NULL;
2844 struct cl_lock *lock = NULL;
2845 struct cl_lock_descr *descr = NULL;
2846 struct dentry *dentry = file->f_path.dentry;
2847 struct inode *inode = dentry->d_inode;
2848 enum cl_lock_mode cl_mode;
2849 off_t start = ladvise->lla_start;
2850 off_t end = ladvise->lla_end;
2856 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2857 "start=%llu, end=%llu\n", dentry->d_name.len,
2858 dentry->d_name.name, dentry->d_inode,
2859 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2862 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2864 GOTO(out, result = cl_mode);
2866 /* Get IO environment */
2867 result = cl_io_get(inode, &env, &io, &refcheck);
2871 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2874 * nothing to do for this io. This currently happens when
2875 * stripe sub-objects are not yet created.
2877 result = io->ci_result;
2878 } else if (result == 0) {
2879 lock = vvp_env_lock(env);
2880 descr = &lock->cll_descr;
2882 descr->cld_obj = io->ci_obj;
2883 /* Convert byte offsets to pages */
2884 descr->cld_start = cl_index(io->ci_obj, start);
2885 descr->cld_end = cl_index(io->ci_obj, end);
2886 descr->cld_mode = cl_mode;
2887 /* CEF_MUST is used because we do not want to convert a
2888 * lockahead request to a lockless lock */
2889 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2892 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2893 descr->cld_enq_flags |= CEF_SPECULATIVE;
2895 result = cl_lock_request(env, io, lock);
2897 /* On success, we need to release the lock */
2899 cl_lock_release(env, lock);
2901 cl_io_fini(env, io);
2902 cl_env_put(env, &refcheck);
2904 /* -ECANCELED indicates a matching lock with a different extent
2905 * was already present, and -EEXIST indicates a matching lock
2906 * on exactly the same extent was already present.
2907 * We convert them to positive values for userspace to make
2908 * recognizing true errors easier.
2909 * Note we can only return these detailed results on async requests,
2910 * as sync requests look the same as i/o requests for locking. */
2911 if (result == -ECANCELED)
2912 result = LLA_RESULT_DIFFERENT;
2913 else if (result == -EEXIST)
2914 result = LLA_RESULT_SAME;
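/*
 * A hedged sketch of the llapi_lu_ladvise a client would fill in to
 * reach this function through LL_IOC_LADVISE (async write lock over
 * the first 1 MiB; the values are examples only):
 *
 *	struct llapi_lu_ladvise la = {
 *		.lla_advice          = LU_LADVISE_LOCKAHEAD,
 *		.lla_lockahead_mode  = MODE_WRITE_USER,
 *		.lla_peradvice_flags = LF_ASYNC,
 *		.lla_start           = 0,
 *		.lla_end             = 1 << 20,
 *	};
 */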
2919 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2921 static int ll_ladvise_sanity(struct inode *inode,
2922 struct llapi_lu_ladvise *ladvise)
2924 enum lu_ladvise_type advice = ladvise->lla_advice;
2925 /* Note lla_peradvice_flags is a 32-bit field, so per-advice flags must
2926 * be in the first 32 bits of enum ladvise_flags */
2927 __u32 flags = ladvise->lla_peradvice_flags;
2928 /* 3 lines at 80 characters per line, should be plenty */
2931 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2933 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2934 "last supported advice is %s (value '%d'): rc = %d\n",
2935 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2936 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2940 /* Per-advice checks */
2942 case LU_LADVISE_LOCKNOEXPAND:
2943 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2945 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2947 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2948 ladvise_names[advice], rc);
2952 case LU_LADVISE_LOCKAHEAD:
2953 /* Currently only READ and WRITE modes can be requested */
2954 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2955 ladvise->lla_lockahead_mode == 0) {
2957 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2959 ll_get_fsname(inode->i_sb, NULL, 0),
2960 ladvise->lla_lockahead_mode,
2961 ladvise_names[advice], rc);
2964 case LU_LADVISE_WILLREAD:
2965 case LU_LADVISE_DONTNEED:
2967 /* Note fall through above - These checks apply to all advices
2968 * except LOCKNOEXPAND */
2969 if (flags & ~LF_DEFAULT_MASK) {
2971 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2973 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2974 ladvise_names[advice], rc);
2977 if (ladvise->lla_start >= ladvise->lla_end) {
2979 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2980 "for %s: rc = %d\n",
2981 ll_get_fsname(inode->i_sb, NULL, 0),
2982 ladvise->lla_start, ladvise->lla_end,
2983 ladvise_names[advice], rc);
2995 * Give file access advice
2997 * The ladvise interface is similar to the Linux fadvise() system call,
2998 * except it forwards the advice directly from the Lustre client to the
2999 * server. The server-side code will apply appropriate read-ahead and
3000 * caching techniques for the corresponding files.
3002 * A typical workload for ladvise is, e.g., a bunch of different clients
3003 * doing small random reads of a file, so prefetching pages into the OSS
3004 * cache with big linear reads before the random IO is a net benefit.
3005 * Fetching all that data into each client cache with fadvise() may not
3006 * be, due to much more data being sent to the client.
3008 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3009 struct llapi_lu_ladvise *ladvise)
3013 struct cl_ladvise_io *lio;
3018 env = cl_env_get(&refcheck);
3020 RETURN(PTR_ERR(env));
3022 io = vvp_env_thread_io(env);
3023 io->ci_obj = ll_i2info(inode)->lli_clob;
3025 /* initialize parameters for ladvise */
3026 lio = &io->u.ci_ladvise;
3027 lio->li_start = ladvise->lla_start;
3028 lio->li_end = ladvise->lla_end;
3029 lio->li_fid = ll_inode2fid(inode);
3030 lio->li_advice = ladvise->lla_advice;
3031 lio->li_flags = flags;
3033 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3034 rc = cl_io_loop(env, io);
3038 cl_io_fini(env, io);
3039 cl_env_put(env, &refcheck);
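/*
 * For reference, a hedged sketch of the full header userspace passes
 * via LL_IOC_LADVISE to reach this function with one WILLREAD advice
 * (the sizing mirrors the kernel side in ll_file_ioctl() below):
 *
 *	size_t sz = offsetof(struct llapi_ladvise_hdr, lah_advise[1]);
 *	struct llapi_ladvise_hdr *hdr = calloc(1, sz);
 *
 *	hdr->lah_magic = LADVISE_MAGIC;
 *	hdr->lah_count = 1;
 *	hdr->lah_advise[0].lla_advice = LU_LADVISE_WILLREAD;
 *	hdr->lah_advise[0].lla_end = 16 << 20;	// prefetch first 16 MiB
 *	ioctl(fd, LL_IOC_LADVISE, hdr);
 */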
3043 static int ll_lock_noexpand(struct file *file, int flags)
3045 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3047 fd->ll_lock_no_expand = !(flags & LF_UNSET);
3052 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3055 struct fsxattr fsxattr;
3057 if (copy_from_user(&fsxattr,
3058 (const struct fsxattr __user *)arg,
3062 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3063 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3064 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3065 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3066 if (copy_to_user((struct fsxattr __user *)arg,
3067 &fsxattr, sizeof(fsxattr)))
3073 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3077 struct md_op_data *op_data;
3078 struct ptlrpc_request *req = NULL;
3080 struct fsxattr fsxattr;
3081 struct cl_object *obj;
3085 /* only root can change the project ID */
3086 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
3089 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3090 LUSTRE_OPC_ANY, NULL);
3091 if (IS_ERR(op_data))
3092 RETURN(PTR_ERR(op_data));
3094 if (copy_from_user(&fsxattr,
3095 (const struct fsxattr __user *)arg,
3097 GOTO(out_fsxattr, rc = -EFAULT);
3099 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3100 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3101 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3102 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3103 op_data->op_projid = fsxattr.fsx_projid;
3104 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3105 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3107 ptlrpc_req_finished(req);
3109 GOTO(out_fsxattr, rc);
3110 ll_update_inode_flags(inode, op_data->op_attr_flags);
3111 obj = ll_i2info(inode)->lli_clob;
3113 GOTO(out_fsxattr, rc);
3115 OBD_ALLOC_PTR(attr);
3117 GOTO(out_fsxattr, rc = -ENOMEM);
3119 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3120 fsxattr.fsx_xflags);
3123 ll_finish_md_op_data(op_data);
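/*
 * Userspace counterpart (hedged sketch): project IDs are changed
 * through the fsxattr pair handled above, read-modify-write style:
 *
 *	struct fsxattr fsx = { 0 };
 *
 *	ioctl(fd, LL_IOC_FSGETXATTR, &fsx);	// read current flags
 *	fsx.fsx_projid = 1000;			// new project ID (root only)
 *	ioctl(fd, LL_IOC_FSSETXATTR, &fsx);
 */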
3127 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3130 struct inode *inode = file_inode(file);
3131 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3132 struct ll_inode_info *lli = ll_i2info(inode);
3133 struct obd_client_handle *och = NULL;
3134 struct split_param sp;
3137 enum mds_op_bias bias = 0;
3138 struct file *layout_file = NULL;
3140 size_t data_size = 0;
3144 mutex_lock(&lli->lli_och_mutex);
3145 if (fd->fd_lease_och != NULL) {
3146 och = fd->fd_lease_och;
3147 fd->fd_lease_och = NULL;
3149 mutex_unlock(&lli->lli_och_mutex);
3152 GOTO(out, rc = -ENOLCK);
3154 fmode = och->och_flags;
3156 switch (ioc->lil_flags) {
3157 case LL_LEASE_RESYNC_DONE:
3158 if (ioc->lil_count > IOC_IDS_MAX)
3159 GOTO(out, rc = -EINVAL);
3161 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3162 OBD_ALLOC(data, data_size);
3164 GOTO(out, rc = -ENOMEM);
3166 if (copy_from_user(data, (void __user *)arg, data_size))
3167 GOTO(out, rc = -EFAULT);
3169 bias = MDS_CLOSE_RESYNC_DONE;
3171 case LL_LEASE_LAYOUT_MERGE: {
3174 if (ioc->lil_count != 1)
3175 GOTO(out, rc = -EINVAL);
3177 arg += sizeof(*ioc);
3178 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3179 GOTO(out, rc = -EFAULT);
3181 layout_file = fget(fd);
3183 GOTO(out, rc = -EBADF);
3185 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3186 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3187 GOTO(out, rc = -EPERM);
3189 data = file_inode(layout_file);
3190 bias = MDS_CLOSE_LAYOUT_MERGE;
3193 case LL_LEASE_LAYOUT_SPLIT: {
3197 if (ioc->lil_count != 2)
3198 GOTO(out, rc = -EINVAL);
3200 arg += sizeof(*ioc);
3201 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3202 GOTO(out, rc = -EFAULT);
3204 arg += sizeof(__u32);
3205 if (copy_from_user(&mirror_id, (void __user *)arg,
3207 GOTO(out, rc = -EFAULT);
3209 layout_file = fget(fdv);
3211 GOTO(out, rc = -EBADF);
3213 sp.sp_inode = file_inode(layout_file);
3214 sp.sp_mirror_id = (__u16)mirror_id;
3216 bias = MDS_CLOSE_LAYOUT_SPLIT;
3220 /* without close intent */
3224 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3228 rc = ll_lease_och_release(inode, file);
3237 switch (ioc->lil_flags) {
3238 case LL_LEASE_RESYNC_DONE:
3240 OBD_FREE(data, data_size);
3242 case LL_LEASE_LAYOUT_MERGE:
3243 case LL_LEASE_LAYOUT_SPLIT:
3250 rc = ll_lease_type_from_fmode(fmode);
3254 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3257 struct inode *inode = file_inode(file);
3258 struct ll_inode_info *lli = ll_i2info(inode);
3259 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3260 struct obd_client_handle *och = NULL;
3261 __u64 open_flags = 0;
3267 switch (ioc->lil_mode) {
3268 case LL_LEASE_WRLCK:
3269 if (!(file->f_mode & FMODE_WRITE))
3271 fmode = FMODE_WRITE;
3273 case LL_LEASE_RDLCK:
3274 if (!(file->f_mode & FMODE_READ))
3278 case LL_LEASE_UNLCK:
3279 RETURN(ll_file_unlock_lease(file, ioc, arg));
3284 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3286 /* apply for lease */
3287 if (ioc->lil_flags & LL_LEASE_RESYNC)
3288 open_flags = MDS_OPEN_RESYNC;
3289 och = ll_lease_open(inode, file, fmode, open_flags);
3291 RETURN(PTR_ERR(och));
3293 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3294 rc = ll_lease_file_resync(och, inode);
3296 ll_lease_close(och, inode, NULL);
3299 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3301 ll_lease_close(och, inode, NULL);
3307 mutex_lock(&lli->lli_och_mutex);
3308 if (fd->fd_lease_och == NULL) {
3309 fd->fd_lease_och = och;
3312 mutex_unlock(&lli->lli_och_mutex);
3314 /* should not happen, since only exclusive leases are supported for now */
3315 ll_lease_close(och, inode, &lease_broken);
3322 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3324 struct inode *inode = file_inode(file);
3325 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3329 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3330 PFID(ll_inode2fid(inode)), inode, cmd);
3331 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3333 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3334 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3338 case LL_IOC_GETFLAGS:
3339 /* Get the current value of the file flags */
3340 return put_user(fd->fd_flags, (int __user *)arg);
3341 case LL_IOC_SETFLAGS:
3342 case LL_IOC_CLRFLAGS:
3343 /* Set or clear specific file flags */
3344 /* XXX This probably needs checks to ensure the flags are
3345 * not abused, and to handle any flag side effects.
3347 if (get_user(flags, (int __user *) arg))
3350 if (cmd == LL_IOC_SETFLAGS) {
3351 if ((flags & LL_FILE_IGNORE_LOCK) &&
3352 !(file->f_flags & O_DIRECT)) {
3353 CERROR("%s: unable to disable locking on "
3354 "non-O_DIRECT file\n", current->comm);
3358 fd->fd_flags |= flags;
3360 fd->fd_flags &= ~flags;
3363 case LL_IOC_LOV_SETSTRIPE:
3364 case LL_IOC_LOV_SETSTRIPE_NEW:
3365 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3366 case LL_IOC_LOV_SETEA:
3367 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3368 case LL_IOC_LOV_SWAP_LAYOUTS: {
3370 struct lustre_swap_layouts lsl;
3372 if (copy_from_user(&lsl, (char __user *)arg,
3373 sizeof(struct lustre_swap_layouts)))
3376 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3379 file2 = fget(lsl.sl_fd);
3383 /* O_WRONLY or O_RDWR */
3384 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3385 GOTO(out, rc = -EPERM);
3387 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3388 struct inode *inode2;
3389 struct ll_inode_info *lli;
3390 struct obd_client_handle *och = NULL;
3392 lli = ll_i2info(inode);
3393 mutex_lock(&lli->lli_och_mutex);
3394 if (fd->fd_lease_och != NULL) {
3395 och = fd->fd_lease_och;
3396 fd->fd_lease_och = NULL;
3398 mutex_unlock(&lli->lli_och_mutex);
3400 GOTO(out, rc = -ENOLCK);
3401 inode2 = file_inode(file2);
3402 rc = ll_swap_layouts_close(och, inode, inode2);
3404 rc = ll_swap_layouts(file, file2, &lsl);
3410 case LL_IOC_LOV_GETSTRIPE:
3411 case LL_IOC_LOV_GETSTRIPE_NEW:
3412 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3413 case FS_IOC_GETFLAGS:
3414 case FS_IOC_SETFLAGS:
3415 RETURN(ll_iocontrol(inode, file, cmd, arg));
3416 case FSFILT_IOC_GETVERSION:
3417 case FS_IOC_GETVERSION:
3418 RETURN(put_user(inode->i_generation, (int __user *)arg));
3419 /* We need to special case any other ioctls we want to handle,
3420 * to send them to the MDS/OST as appropriate and to properly
3421 * network encode the arg field. */
3422 case FS_IOC_SETVERSION:
3425 case LL_IOC_GROUP_LOCK:
3426 RETURN(ll_get_grouplock(inode, file, arg));
3427 case LL_IOC_GROUP_UNLOCK:
3428 RETURN(ll_put_grouplock(inode, file, arg));
3429 case IOC_OBD_STATFS:
3430 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3432 case LL_IOC_FLUSHCTX:
3433 RETURN(ll_flush_ctx(inode));
3434 case LL_IOC_PATH2FID: {
3435 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3436 sizeof(struct lu_fid)))
3441 case LL_IOC_GETPARENT:
3442 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3444 case OBD_IOC_FID2PATH:
3445 RETURN(ll_fid2path(inode, (void __user *)arg));
3446 case LL_IOC_DATA_VERSION: {
3447 struct ioc_data_version idv;
3450 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3453 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3454 rc = ll_ioc_data_version(inode, &idv);
3457 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3463 case LL_IOC_GET_MDTIDX: {
3466 mdtidx = ll_get_mdt_idx(inode);
3470 if (put_user((int)mdtidx, (int __user *)arg))
3475 case OBD_IOC_GETDTNAME:
3476 case OBD_IOC_GETMDNAME:
3477 RETURN(ll_get_obd_name(inode, cmd, arg));
3478 case LL_IOC_HSM_STATE_GET: {
3479 struct md_op_data *op_data;
3480 struct hsm_user_state *hus;
3487 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3488 LUSTRE_OPC_ANY, hus);
3489 if (IS_ERR(op_data)) {
3491 RETURN(PTR_ERR(op_data));
3494 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3497 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3500 ll_finish_md_op_data(op_data);
3504 case LL_IOC_HSM_STATE_SET: {
3505 struct hsm_state_set *hss;
3512 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3517 rc = ll_hsm_state_set(inode, hss);
3522 case LL_IOC_HSM_ACTION: {
3523 struct md_op_data *op_data;
3524 struct hsm_current_action *hca;
3531 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3532 LUSTRE_OPC_ANY, hca);
3533 if (IS_ERR(op_data)) {
3535 RETURN(PTR_ERR(op_data));
3538 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3541 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3544 ll_finish_md_op_data(op_data);
3548 case LL_IOC_SET_LEASE_OLD: {
3549 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3551 RETURN(ll_file_set_lease(file, &ioc, 0));
3553 case LL_IOC_SET_LEASE: {
3554 struct ll_ioc_lease ioc;
3556 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3559 RETURN(ll_file_set_lease(file, &ioc, arg));
3561 case LL_IOC_GET_LEASE: {
3562 struct ll_inode_info *lli = ll_i2info(inode);
3563 struct ldlm_lock *lock = NULL;
3566 mutex_lock(&lli->lli_och_mutex);
3567 if (fd->fd_lease_och != NULL) {
3568 struct obd_client_handle *och = fd->fd_lease_och;
3570 lock = ldlm_handle2lock(&och->och_lease_handle);
3572 lock_res_and_lock(lock);
3573 if (!ldlm_is_cancel(lock))
3574 fmode = och->och_flags;
3576 unlock_res_and_lock(lock);
3577 LDLM_LOCK_PUT(lock);
3580 mutex_unlock(&lli->lli_och_mutex);
3582 RETURN(ll_lease_type_from_fmode(fmode));
3584 case LL_IOC_HSM_IMPORT: {
3585 struct hsm_user_import *hui;
3591 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3596 rc = ll_hsm_import(inode, file, hui);
3601 case LL_IOC_FUTIMES_3: {
3602 struct ll_futimes_3 lfu;
3604 if (copy_from_user(&lfu,
3605 (const struct ll_futimes_3 __user *)arg,
3609 RETURN(ll_file_futimes_3(file, &lfu));
3611 case LL_IOC_LADVISE: {
3612 struct llapi_ladvise_hdr *k_ladvise_hdr;
3613 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3616 int alloc_size = sizeof(*k_ladvise_hdr);
3619 u_ladvise_hdr = (void __user *)arg;
3620 OBD_ALLOC_PTR(k_ladvise_hdr);
3621 if (k_ladvise_hdr == NULL)
3624 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3625 GOTO(out_ladvise, rc = -EFAULT);
3627 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3628 k_ladvise_hdr->lah_count < 1)
3629 GOTO(out_ladvise, rc = -EINVAL);
3631 num_advise = k_ladvise_hdr->lah_count;
3632 if (num_advise >= LAH_COUNT_MAX)
3633 GOTO(out_ladvise, rc = -EFBIG);
3635 OBD_FREE_PTR(k_ladvise_hdr);
3636 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3637 lah_advise[num_advise]);
3638 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3639 if (k_ladvise_hdr == NULL)
3643 * TODO: submit multiple advices to one server in a single RPC
3645 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3646 GOTO(out_ladvise, rc = -EFAULT);
3648 for (i = 0; i < num_advise; i++) {
3649 struct llapi_lu_ladvise *k_ladvise =
3650 &k_ladvise_hdr->lah_advise[i];
3651 struct llapi_lu_ladvise __user *u_ladvise =
3652 &u_ladvise_hdr->lah_advise[i];
3654 rc = ll_ladvise_sanity(inode, k_ladvise);
3656 GOTO(out_ladvise, rc);
3658 switch (k_ladvise->lla_advice) {
3659 case LU_LADVISE_LOCKNOEXPAND:
3660 rc = ll_lock_noexpand(file,
3661 k_ladvise->lla_peradvice_flags);
3662 GOTO(out_ladvise, rc);
3663 case LU_LADVISE_LOCKAHEAD:
3665 rc = ll_file_lock_ahead(file, k_ladvise);
3668 GOTO(out_ladvise, rc);
3671 &u_ladvise->lla_lockahead_result))
3672 GOTO(out_ladvise, rc = -EFAULT);
3675 rc = ll_ladvise(inode, file,
3676 k_ladvise_hdr->lah_flags,
3679 GOTO(out_ladvise, rc);
3686 OBD_FREE(k_ladvise_hdr, alloc_size);
3689 case LL_IOC_FLR_SET_MIRROR: {
3690 /* mirror I/O must be direct to avoid polluting page cache
3692 if (!(file->f_flags & O_DIRECT))
3695 fd->fd_designated_mirror = (__u32)arg;
3698 case LL_IOC_FSGETXATTR:
3699 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3700 case LL_IOC_FSSETXATTR:
3701 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3703 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3705 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3706 (void __user *)arg));
3710 #ifndef HAVE_FILE_LLSEEK_SIZE
3711 static inline loff_t
3712 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3714 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3716 if (offset > maxsize)
3719 if (offset != file->f_pos) {
3720 file->f_pos = offset;
3721 file->f_version = 0;
3727 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3728 loff_t maxsize, loff_t eof)
3730 struct inode *inode = file_inode(file);
3738 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3739 * position-querying operation. Avoid rewriting the "same"
3740 * f_pos value back to the file because a concurrent read(),
3741 * write() or lseek() might have altered it
3746 * f_lock protects against read/modify/write race with other
3747 * SEEK_CURs. Note that parallel writes and reads behave
3751 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3752 inode_unlock(inode);
3756 * In the generic case the entire file is data, so as long as
3757 * offset isn't at the end of the file then the offset is data.
3764 * There is a virtual hole at the end of the file, so as long as
3765 * offset isn't i_size or larger, return i_size.
3773 return llseek_execute(file, offset, maxsize);
3777 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3779 struct inode *inode = file_inode(file);
3780 loff_t retval, eof = 0;
3783 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3784 (origin == SEEK_CUR) ? file->f_pos : 0);
3785 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3786 PFID(ll_inode2fid(inode)), inode, retval, retval,
3788 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3790 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3791 retval = ll_glimpse_size(inode);
3794 eof = i_size_read(inode);
3797 retval = ll_generic_file_llseek_size(file, offset, origin,
3798 ll_file_maxbytes(inode), eof);
3802 static int ll_flush(struct file *file, fl_owner_t id)
3804 struct inode *inode = file_inode(file);
3805 struct ll_inode_info *lli = ll_i2info(inode);
3806 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3809 LASSERT(!S_ISDIR(inode->i_mode));
3811 /* catch async errors that were recorded back when async writeback
3812 * failed for pages in this mapping. */
3813 rc = lli->lli_async_rc;
3814 lli->lli_async_rc = 0;
3815 if (lli->lli_clob != NULL) {
3816 err = lov_read_and_clear_async_rc(lli->lli_clob);
3821 /* The application has already been told about the write failure.
3822 * Do not report the failure again. */
3823 if (fd->fd_write_failed)
3825 return rc ? -EIO : 0;
3829 * Called to make sure a portion of the file has been written out.
3830 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OST.
3832 * Return how many pages have been written.
3834 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3835 enum cl_fsync_mode mode, int ignore_layout)
3839 struct cl_fsync_io *fio;
3844 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3845 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3848 env = cl_env_get(&refcheck);
3850 RETURN(PTR_ERR(env));
3852 io = vvp_env_thread_io(env);
3853 io->ci_obj = ll_i2info(inode)->lli_clob;
3854 io->ci_ignore_layout = ignore_layout;
3856 /* initialize parameters for sync */
3857 fio = &io->u.ci_fsync;
3858 fio->fi_start = start;
3860 fio->fi_fid = ll_inode2fid(inode);
3861 fio->fi_mode = mode;
3862 fio->fi_nr_written = 0;
3864 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3865 result = cl_io_loop(env, io);
3867 result = io->ci_result;
3869 result = fio->fi_nr_written;
3870 cl_io_fini(env, io);
3871 cl_env_put(env, &refcheck);
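/*
 * Example in-kernel call (a sketch): write back everything locally,
 * ignoring layout validity, as inode teardown paths typically do:
 *
 *	rc = cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
 *				CL_FSYNC_LOCAL, 1);
 *	if (rc > 0)	// number of pages written
 *		rc = 0;
 */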
3877 * When dentry is provided (the 'else' case), file_dentry() may be
3878 * null and dentry must be used directly rather than pulled from
3879 * file_dentry() as is done otherwise.
3882 #ifdef HAVE_FILE_FSYNC_4ARGS
3883 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3885 struct dentry *dentry = file_dentry(file);
3887 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3888 int ll_fsync(struct file *file, int datasync)
3890 struct dentry *dentry = file_dentry(file);
3892 loff_t end = LLONG_MAX;
3894 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3897 loff_t end = LLONG_MAX;
3899 struct inode *inode = dentry->d_inode;
3900 struct ll_inode_info *lli = ll_i2info(inode);
3901 struct ptlrpc_request *req;
3905 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3906 PFID(ll_inode2fid(inode)), inode);
3907 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3909 #ifdef HAVE_FILE_FSYNC_4ARGS
3910 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3911 lock_inode = !lli->lli_inode_locked;
3915 /* fsync's caller has already called _fdata{sync,write}, we want
3916 * that IO to finish before calling the osc and mdc sync methods */
3917 rc = filemap_fdatawait(inode->i_mapping);
3920 /* catch async errors that were recorded back when async writeback
3921 * failed for pages in this mapping. */
3922 if (!S_ISDIR(inode->i_mode)) {
3923 err = lli->lli_async_rc;
3924 lli->lli_async_rc = 0;
3927 if (lli->lli_clob != NULL) {
3928 err = lov_read_and_clear_async_rc(lli->lli_clob);
3934 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3938 ptlrpc_req_finished(req);
3940 if (S_ISREG(inode->i_mode)) {
3941 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3943 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3944 if (rc == 0 && err < 0)
3947 fd->fd_write_failed = true;
3949 fd->fd_write_failed = false;
3952 #ifdef HAVE_FILE_FSYNC_4ARGS
3954 inode_unlock(inode);
3960 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3962 struct inode *inode = file_inode(file);
3963 struct ll_sb_info *sbi = ll_i2sbi(inode);
3964 struct ldlm_enqueue_info einfo = {
3965 .ei_type = LDLM_FLOCK,
3966 .ei_cb_cp = ldlm_flock_completion_ast,
3967 .ei_cbdata = file_lock,
3969 struct md_op_data *op_data;
3970 struct lustre_handle lockh = { 0 };
3971 union ldlm_policy_data flock = { { 0 } };
3972 int fl_type = file_lock->fl_type;
3978 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3979 PFID(ll_inode2fid(inode)), file_lock);
3981 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3983 if (file_lock->fl_flags & FL_FLOCK) {
3984 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3985 /* flocks are whole-file locks */
3986 flock.l_flock.end = OFFSET_MAX;
3987 /* For flocks the owner is determined by the local file descriptor */
3988 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3989 } else if (file_lock->fl_flags & FL_POSIX) {
3990 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3991 flock.l_flock.start = file_lock->fl_start;
3992 flock.l_flock.end = file_lock->fl_end;
3996 flock.l_flock.pid = file_lock->fl_pid;
3998 /* Somewhat ugly workaround for svc lockd.
3999 * lockd installs custom fl_lmops->lm_compare_owner that checks
4000 * for the fl_owner to be the same (which it always is on local node
4001 * I guess between lockd processes) and then compares pid.
4002 * As such we assign pid to the owner field to make it all work,
4003 * conflict with normal locks is unlikely since pid space and
4004 * pointer space for current->files do not intersect */
4005 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4006 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4010 einfo.ei_mode = LCK_PR;
4013 /* An unlock request may or may not have any relation to
4014 * existing locks so we may not be able to pass a lock handle
4015 * via a normal ldlm_lock_cancel() request. The request may even
4016 * unlock a byte range in the middle of an existing lock. In
4017 * order to process an unlock request we need all of the same
4018 * information that is given with a normal read or write record
4019 * lock request. To avoid creating another ldlm unlock (cancel)
4020 * message we'll treat a LCK_NL flock request as an unlock. */
4021 einfo.ei_mode = LCK_NL;
4024 einfo.ei_mode = LCK_PW;
4027 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4042 flags = LDLM_FL_BLOCK_NOWAIT;
4048 flags = LDLM_FL_TEST_LOCK;
4051 CERROR("unknown fcntl lock command: %d\n", cmd);
4055 /* Save the old mode so that if the mode in the lock changes we
4056 * can decrement the appropriate reader or writer refcount. */
4057 file_lock->fl_type = einfo.ei_mode;
4059 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4060 LUSTRE_OPC_ANY, NULL);
4061 if (IS_ERR(op_data))
4062 RETURN(PTR_ERR(op_data));
4064 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4065 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4066 flock.l_flock.pid, flags, einfo.ei_mode,
4067 flock.l_flock.start, flock.l_flock.end);
4069 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4072 /* Restore the file lock type if not TEST lock. */
4073 if (!(flags & LDLM_FL_TEST_LOCK))
4074 file_lock->fl_type = fl_type;
4076 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4077 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4078 !(flags & LDLM_FL_TEST_LOCK))
4079 rc2 = locks_lock_file_wait(file, file_lock);
4081 if ((file_lock->fl_flags & FL_FLOCK) &&
4082 (rc == 0 || file_lock->fl_type == F_UNLCK))
4083 rc2 = flock_lock_file_wait(file, file_lock);
4084 if ((file_lock->fl_flags & FL_POSIX) &&
4085 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4086 !(flags & LDLM_FL_TEST_LOCK))
4087 rc2 = posix_lock_file_wait(file, file_lock);
4088 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4090 if (rc2 && file_lock->fl_type != F_UNLCK) {
4091 einfo.ei_mode = LCK_NL;
4092 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4097 ll_finish_md_op_data(op_data);
4102 int ll_get_fid_by_name(struct inode *parent, const char *name,
4103 int namelen, struct lu_fid *fid,
4104 struct inode **inode)
4106 struct md_op_data *op_data = NULL;
4107 struct mdt_body *body;
4108 struct ptlrpc_request *req;
4112 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4113 LUSTRE_OPC_ANY, NULL);
4114 if (IS_ERR(op_data))
4115 RETURN(PTR_ERR(op_data));
4117 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4118 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4119 ll_finish_md_op_data(op_data);
4123 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4125 GOTO(out_req, rc = -EFAULT);
4127 *fid = body->mbo_fid1;
4130 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4132 ptlrpc_req_finished(req);
4136 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4139 struct dentry *dchild = NULL;
4140 struct inode *child_inode = NULL;
4141 struct md_op_data *op_data;
4142 struct ptlrpc_request *request = NULL;
4143 struct obd_client_handle *och = NULL;
4145 struct mdt_body *body;
4146 __u64 data_version = 0;
4147 size_t namelen = strlen(name);
4148 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4152 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4153 PFID(ll_inode2fid(parent)), name,
4154 lum->lum_stripe_offset, lum->lum_stripe_count);
4156 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4157 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4158 lustre_swab_lmv_user_md(lum);
4160 /* Get child FID first */
4161 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4164 dchild = d_lookup(file_dentry(file), &qstr);
4166 if (dchild->d_inode)
4167 child_inode = igrab(dchild->d_inode);
4172 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4181 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4182 OBD_CONNECT2_DIR_MIGRATE)) {
4183 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4184 ll_i2info(child_inode)->lli_lsm_md) {
4185 CERROR("%s: MDT doesn't support stripe directory "
4187 ll_get_fsname(parent->i_sb, NULL, 0));
4188 GOTO(out_iput, rc = -EOPNOTSUPP);
4193 * lfs migrate command needs to be blocked on the client
4194 * by checking the migrate FID against the FID of the
4197 if (child_inode == parent->i_sb->s_root->d_inode)
4198 GOTO(out_iput, rc = -EINVAL);
4200 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4201 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4202 if (IS_ERR(op_data))
4203 GOTO(out_iput, rc = PTR_ERR(op_data));
4205 inode_lock(child_inode);
4206 op_data->op_fid3 = *ll_inode2fid(child_inode);
4207 if (!fid_is_sane(&op_data->op_fid3)) {
4208 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4209 ll_get_fsname(parent->i_sb, NULL, 0), name,
4210 PFID(&op_data->op_fid3));
4211 GOTO(out_unlock, rc = -EINVAL);
4214 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4215 op_data->op_data = lum;
4216 op_data->op_data_size = lumlen;
4219 if (S_ISREG(child_inode->i_mode)) {
4220 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4224 GOTO(out_unlock, rc);
4227 rc = ll_data_version(child_inode, &data_version,
4230 GOTO(out_close, rc);
4232 op_data->op_open_handle = och->och_open_handle;
4233 op_data->op_data_version = data_version;
4234 op_data->op_lease_handle = och->och_lease_handle;
4235 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4237 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4238 och->och_mod->mod_open_req->rq_replay = 0;
4239 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4242 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4243 name, namelen, &request);
4245 LASSERT(request != NULL);
4246 ll_update_times(request, parent);
4248 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4249 LASSERT(body != NULL);
4251 /* If the server does release the layout lock, then we clean up
4252 * the client och here; otherwise release it in out_close: */
4253 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4254 obd_mod_put(och->och_mod);
4255 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4257 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4263 if (request != NULL) {
4264 ptlrpc_req_finished(request);
4268 /* Try again if the file layout has changed. */
4269 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4274 ll_lease_close(och, child_inode, NULL);
4276 clear_nlink(child_inode);
4278 inode_unlock(child_inode);
4279 ll_finish_md_op_data(op_data);
4286 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4294 * test if some locks matching bits and l_req_mode are acquired
4295 * - bits can be in different locks
4296 * - if found, clear the common lock bits in *bits
4297 * - the bits not found are kept in *bits
4299 * \param bits [IN] searched lock bits
4300 * \param l_req_mode [IN] searched lock mode
4301 * \retval boolean, true iff all bits are found
4303 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4305 struct lustre_handle lockh;
4306 union ldlm_policy_data policy;
4307 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4308 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4317 fid = &ll_i2info(inode)->lli_fid;
4318 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4319 ldlm_lockname[mode]);
4321 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4322 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4323 policy.l_inodebits.bits = *bits & (1 << i);
4324 if (policy.l_inodebits.bits == 0)
4327 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4328 &policy, mode, &lockh)) {
4329 struct ldlm_lock *lock;
4331 lock = ldlm_handle2lock(&lockh);
4334 ~(lock->l_policy_data.l_inodebits.bits);
4335 LDLM_LOCK_PUT(lock);
4337 *bits &= ~policy.l_inodebits.bits;
4344 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4345 struct lustre_handle *lockh, __u64 flags,
4346 enum ldlm_mode mode)
4348 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4353 fid = &ll_i2info(inode)->lli_fid;
4354 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4356 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4357 fid, LDLM_IBITS, &policy, mode, lockh);
4362 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4364 /* Already unlinked. Just update nlink and return success */
4365 if (rc == -ENOENT) {
4367 /* If it is a striped directory and there is a bad stripe,
4368 * let's revalidate the dentry again instead of returning
4370 if (S_ISDIR(inode->i_mode) &&
4371 ll_i2info(inode)->lli_lsm_md != NULL)
4374 /* This path cannot be hit for regular files unless in
4375 * case of obscure races, so no need to validate
4377 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4379 } else if (rc != 0) {
4380 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4381 "%s: revalidate FID "DFID" error: rc = %d\n",
4382 ll_get_fsname(inode->i_sb, NULL, 0),
4383 PFID(ll_inode2fid(inode)), rc);
4389 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4391 struct inode *inode = dentry->d_inode;
4392 struct obd_export *exp = ll_i2mdexp(inode);
4393 struct lookup_intent oit = {
4396 struct ptlrpc_request *req = NULL;
4397 struct md_op_data *op_data;
4401 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4402 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4404 /* Call getattr by fid, so do not provide name at all. */
4405 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4406 LUSTRE_OPC_ANY, NULL);
4407 if (IS_ERR(op_data))
4408 RETURN(PTR_ERR(op_data));
4410 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4411 ll_finish_md_op_data(op_data);
4413 rc = ll_inode_revalidate_fini(inode, rc);
4417 rc = ll_revalidate_it_finish(req, &oit, dentry);
4419 ll_intent_release(&oit);
4423 /* Unlinked? Unhash dentry, so it is not picked up later by
4424 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4425 * here to preserve get_cwd functionality on 2.6.
4427 if (!dentry->d_inode->i_nlink) {
4428 ll_lock_dcache(inode);
4429 d_lustre_invalidate(dentry, 0);
4430 ll_unlock_dcache(inode);
4433 ll_lookup_finish_locks(&oit, dentry);
4435 ptlrpc_req_finished(req);
4440 static int ll_merge_md_attr(struct inode *inode)
4442 struct cl_attr attr = { 0 };
4445 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4446 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4447 &attr, ll_md_blocking_ast);
4451 set_nlink(inode, attr.cat_nlink);
4452 inode->i_blocks = attr.cat_blocks;
4453 i_size_write(inode, attr.cat_size);
4455 ll_i2info(inode)->lli_atime = attr.cat_atime;
4456 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4457 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4462 static inline dev_t ll_compat_encode_dev(dev_t dev)
4464 /* The compat_sys_*stat*() syscalls will fail unless the
4465 * device majors and minors are both less than 256. Note that
4466 * the value returned here will be passed through
4467 * old_encode_dev() in cp_compat_stat(). And so we are not
4468 * trying to return a valid compat (u16) device number, just
4469 * one that will pass the old_valid_dev() check. */
4471 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4474 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4475 int ll_getattr(const struct path *path, struct kstat *stat,
4476 u32 request_mask, unsigned int flags)
4478 struct dentry *de = path->dentry;
4480 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4483 struct inode *inode = de->d_inode;
4484 struct ll_sb_info *sbi = ll_i2sbi(inode);
4485 struct ll_inode_info *lli = ll_i2info(inode);
4488 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4490 rc = ll_inode_revalidate(de, IT_GETATTR);
4494 if (S_ISREG(inode->i_mode)) {
4495 /* In case of restore, the MDT has the right size and has
4496 * already sent it back without granting the layout lock,
4497 * inode is up-to-date so glimpse is useless.
4498 * Also to glimpse we need the layout, in case of a running
4499 * restore the MDT holds the layout lock so the glimpse will
4500 * block up to the end of restore (getattr will block)
4502 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4503 rc = ll_glimpse_size(inode);
4508 /* If the object isn't a regular file then don't validate its size. */
4509 if (S_ISDIR(inode->i_mode) &&
4510 lli->lli_lsm_md != NULL) {
4511 rc = ll_merge_md_attr(inode);
4516 LTIME_S(inode->i_atime) = lli->lli_atime;
4517 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4518 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4521 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4523 if (ll_need_32bit_api(sbi)) {
4524 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4525 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4526 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4528 stat->ino = inode->i_ino;
4529 stat->dev = inode->i_sb->s_dev;
4530 stat->rdev = inode->i_rdev;
4533 stat->mode = inode->i_mode;
4534 stat->uid = inode->i_uid;
4535 stat->gid = inode->i_gid;
4536 stat->atime = inode->i_atime;
4537 stat->mtime = inode->i_mtime;
4538 stat->ctime = inode->i_ctime;
4539 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4541 stat->nlink = inode->i_nlink;
4542 stat->size = i_size_read(inode);
4543 stat->blocks = inode->i_blocks;
4548 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4549 __u64 start, __u64 len)
4553 struct fiemap *fiemap;
4554 unsigned int extent_count = fieinfo->fi_extents_max;
4556 num_bytes = sizeof(*fiemap) + (extent_count *
4557 sizeof(struct fiemap_extent));
4558 OBD_ALLOC_LARGE(fiemap, num_bytes);
4563 fiemap->fm_flags = fieinfo->fi_flags;
4564 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4565 fiemap->fm_start = start;
4566 fiemap->fm_length = len;
4567 if (extent_count > 0 &&
4568 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4569 sizeof(struct fiemap_extent)) != 0)
4570 GOTO(out, rc = -EFAULT);
4572 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4574 fieinfo->fi_flags = fiemap->fm_flags;
4575 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4576 if (extent_count > 0 &&
4577 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4578 fiemap->fm_mapped_extents *
4579 sizeof(struct fiemap_extent)) != 0)
4580 GOTO(out, rc = -EFAULT);
4582 OBD_FREE_LARGE(fiemap, num_bytes);
4586 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4588 struct ll_inode_info *lli = ll_i2info(inode);
4589 struct posix_acl *acl = NULL;
4592 spin_lock(&lli->lli_lock);
4593 /* VFS' acl_permission_check->check_acl will release the refcount */
4594 acl = posix_acl_dup(lli->lli_posix_acl);
4595 spin_unlock(&lli->lli_lock);
4600 #ifdef HAVE_IOP_SET_ACL
4601 #ifdef CONFIG_FS_POSIX_ACL
4602 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4604 struct ll_sb_info *sbi = ll_i2sbi(inode);
4605 struct ptlrpc_request *req = NULL;
4606 const char *name = NULL;
4608 size_t value_size = 0;
4613 case ACL_TYPE_ACCESS:
4614 name = XATTR_NAME_POSIX_ACL_ACCESS;
4616 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4619 case ACL_TYPE_DEFAULT:
4620 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4621 if (!S_ISDIR(inode->i_mode))
4622 rc = acl ? -EACCES : 0;
4633 value_size = posix_acl_xattr_size(acl->a_count);
4634 value = kmalloc(value_size, GFP_NOFS);
4636 GOTO(out, rc = -ENOMEM);
4638 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4640 GOTO(out_value, rc);
4643 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4644 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4645 name, value, value_size, 0, 0, &req);
4647 ptlrpc_req_finished(req);
4652 forget_cached_acl(inode, type);
4654 set_cached_acl(inode, type, acl);
4657 #endif /* CONFIG_FS_POSIX_ACL */
4658 #endif /* HAVE_IOP_SET_ACL */
4660 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4662 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4663 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4665 ll_check_acl(struct inode *inode, int mask)
4668 # ifdef CONFIG_FS_POSIX_ACL
4669 struct posix_acl *acl;
4673 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4674 if (flags & IPERM_FLAG_RCU)
4677 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4682 rc = posix_acl_permission(inode, acl, mask);
4683 posix_acl_release(acl);
4686 # else /* !CONFIG_FS_POSIX_ACL */
4688 # endif /* CONFIG_FS_POSIX_ACL */
4690 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4692 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4693 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4695 # ifdef HAVE_INODE_PERMISION_2ARGS
4696 int ll_inode_permission(struct inode *inode, int mask)
4698 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4703 struct ll_sb_info *sbi;
4704 struct root_squash_info *squash;
4705 struct cred *cred = NULL;
4706 const struct cred *old_cred = NULL;
4708 bool squash_id = false;
4711 #ifdef MAY_NOT_BLOCK
4712 if (mask & MAY_NOT_BLOCK)
4714 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4715 if (flags & IPERM_FLAG_RCU)
4719 /* as the root inode is NOT validated in the lookup operation,
4720 * we need to do it before the permission check. */
4722 if (inode == inode->i_sb->s_root->d_inode) {
4723 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4728 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4729 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4731 /* squash fsuid/fsgid if needed */
4732 sbi = ll_i2sbi(inode);
4733 squash = &sbi->ll_squash;
4734 if (unlikely(squash->rsi_uid != 0 &&
4735 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4736 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4740 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4741 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4742 squash->rsi_uid, squash->rsi_gid);
4744 /* update current process's credentials
4745 * and FS capability */
4746 cred = prepare_creds();
4750 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4751 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4752 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4753 if ((1 << cap) & CFS_CAP_FS_MASK)
4754 cap_lower(cred->cap_effective, cap);
4756 old_cred = override_creds(cred);
4759 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4760 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4761 /* restore current process's credentials and FS capability */
4763 revert_creds(old_cred);
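/*
 * Example of the squash path above (values are made up): with a root
 * squash setting of 99:99 in effect and no "norootsquash" override, a
 * request from root on the client runs this permission check with
 * fsuid/fsgid rewritten to 99:99 and all filesystem capabilities
 * (CFS_CAP_FS_MASK) lowered; the original credentials are restored once
 * ll_generic_permission() returns.
 */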
/* -o localflock - only provides locally consistent flock locks */
struct file_operations ll_file_operations = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
	.read		= new_sync_read,
	.write		= new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
	.read_iter	= ll_file_read_iter,
	.write_iter	= ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.read		= ll_file_read,
	.aio_read	= ll_file_aio_read,
	.write		= ll_file_write,
	.aio_write	= ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.unlocked_ioctl	= ll_file_ioctl,
	.open		= ll_file_open,
	.release	= ll_file_release,
	.mmap		= ll_file_mmap,
	.llseek		= ll_file_seek,
	.splice_read	= ll_file_splice_read,
	.fsync		= ll_fsync,
	.flush		= ll_flush
};
struct file_operations ll_file_operations_flock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
	.read		= new_sync_read,
	.write		= new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
	.read_iter	= ll_file_read_iter,
	.write_iter	= ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.read		= ll_file_read,
	.aio_read	= ll_file_aio_read,
	.write		= ll_file_write,
	.aio_write	= ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.unlocked_ioctl	= ll_file_ioctl,
	.open		= ll_file_open,
	.release	= ll_file_release,
	.mmap		= ll_file_mmap,
	.llseek		= ll_file_seek,
	.splice_read	= ll_file_splice_read,
	.fsync		= ll_fsync,
	.flush		= ll_flush,
	.flock		= ll_file_flock,
	.lock		= ll_file_flock
};
/* These are for -o noflock - to return ENOSYS on flock calls */
struct file_operations ll_file_operations_noflock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
	.read		= new_sync_read,
	.write		= new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
	.read_iter	= ll_file_read_iter,
	.write_iter	= ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.read		= ll_file_read,
	.aio_read	= ll_file_aio_read,
	.write		= ll_file_write,
	.aio_write	= ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.unlocked_ioctl	= ll_file_ioctl,
	.open		= ll_file_open,
	.release	= ll_file_release,
	.mmap		= ll_file_mmap,
	.llseek		= ll_file_seek,
	.splice_read	= ll_file_splice_read,
	.fsync		= ll_fsync,
	.flush		= ll_flush,
	.flock		= ll_file_noflock,
	.lock		= ll_file_noflock
};
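/*
 * How the three tables relate (a sketch based on the -o options named in
 * the comments above; the actual selection is assumed to happen in the
 * llite mount setup code, not here):
 *
 *	mount -t lustre mgs@tcp:/fs /mnt -o flock      -> ..._flock
 *	mount -t lustre mgs@tcp:/fs /mnt -o localflock -> ll_file_operations
 *	mount -t lustre mgs@tcp:/fs /mnt -o noflock    -> ..._noflock
 */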
struct inode_operations ll_file_inode_operations = {
	.setattr	= ll_setattr,
	.getattr	= ll_getattr,
	.permission	= ll_inode_permission,
#ifdef HAVE_IOP_XATTR
	.setxattr	= ll_setxattr,
	.getxattr	= ll_getxattr,
	.removexattr	= ll_removexattr,
#endif
	.listxattr	= ll_listxattr,
	.fiemap		= ll_fiemap,
#ifdef HAVE_IOP_GET_ACL
	.get_acl	= ll_get_acl,
#endif
#ifdef HAVE_IOP_SET_ACL
	.set_acl	= ll_set_acl,
#endif
};
int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct lu_env *env;
	int rc;
	u16 refcheck;
	ENTRY;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	rc = cl_conf_set(env, lli->lli_clob, conf);
	if (rc < 0)
		GOTO(out, rc);

	if (conf->coc_opc == OBJECT_CONF_SET) {
		struct ldlm_lock *lock = conf->coc_lock;
		struct cl_layout cl = {
			.cl_layout_gen = 0,
		};

		LASSERT(lock != NULL);
		LASSERT(ldlm_has_layout(lock));

		/* it can only be allowed to match after the layout is
		 * applied to the inode, otherwise a false layout would be
		 * seen. Applying the layout should happen before dropping
		 * the intent lock. */
		ldlm_lock_allow_match(lock);

		rc = cl_object_layout_get(env, obj, &cl);
		if (rc < 0)
			GOTO(out, rc);

		CDEBUG(D_VFSTRACE, DFID": layout version change: %u -> %u\n",
		       PFID(&lli->lli_fid), ll_layout_version_get(lli),
		       cl.cl_layout_gen);
		ll_layout_version_set(lli, cl.cl_layout_gen);
	}
out:
	cl_env_put(env, &refcheck);
	RETURN(rc);
}
/* Fetch layout from MDT with getxattr request, if it's not ready yet */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ptlrpc_request *req;
	struct mdt_body *body;
	void *lvbdata;
	void *lmm;
	int lmmsize;
	int rc;
	ENTRY;

	CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
	       PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
	       lock->l_lvb_data, lock->l_lvb_len);

	if (lock->l_lvb_data != NULL)
		RETURN(0);

	/* if the layout lock was granted right away, the layout is returned
	 * within the DLM_LVB of the dlm reply; otherwise, if the lock was
	 * ever blocked and then granted via completion ast, we have to fetch
	 * the layout here. Note that we can't use the LVB buffer in the
	 * completion AST because it doesn't have a large enough buffer */
	rc = ll_get_default_mdsize(sbi, &lmmsize);
	if (rc == 0)
		rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
				 OBD_MD_FLXATTR, XATTR_NAME_LOV, lmmsize,
				 &req);
	if (rc < 0)
		RETURN(rc);

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	if (body == NULL)
		GOTO(out, rc = -EPROTO);

	lmmsize = body->mbo_eadatasize;
	if (lmmsize == 0) /* empty layout */
		GOTO(out, rc = 0);

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
	if (lmm == NULL)
		GOTO(out, rc = -EFAULT);

	OBD_ALLOC_LARGE(lvbdata, lmmsize);
	if (lvbdata == NULL)
		GOTO(out, rc = -ENOMEM);

	memcpy(lvbdata, lmm, lmmsize);
	lock_res_and_lock(lock);
	if (unlikely(lock->l_lvb_data == NULL)) {
		lock->l_lvb_type = LVB_T_LAYOUT;
		lock->l_lvb_data = lvbdata;
		lock->l_lvb_len = lmmsize;
		lvbdata = NULL;
	}
	unlock_res_and_lock(lock);

	if (lvbdata)
		OBD_FREE_LARGE(lvbdata, lmmsize);

	EXIT;
out:
	ptlrpc_req_finished(req);
	return rc;
}
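/*
 * Design note on the handoff above: lvbdata is published into the lock
 * only if no other thread installed an LVB while the resource lock was
 * dropped; clearing lvbdata inside the locked section is what lets the
 * loser of that race free its now-unused buffer afterwards.
 */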
/**
 * Apply the layout to the inode. Layout lock is held and will be released
 * before return.
 */
static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
			      struct inode *inode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ldlm_lock *lock;
	struct cl_object_conf conf;
	int rc = 0;
	bool lvb_ready;
	bool wait_layout = false;
	ENTRY;

	LASSERT(lustre_handle_is_used(lockh));

	lock = ldlm_handle2lock(lockh);
	LASSERT(lock != NULL);
	LASSERT(ldlm_has_layout(lock));

	LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
		   PFID(&lli->lli_fid), inode);

	/* in case this is a caching lock and reinstate with new inode */
	md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);

	lock_res_and_lock(lock);
	lvb_ready = ldlm_is_lvb_ready(lock);
	unlock_res_and_lock(lock);

	/* checking lvb_ready is racy but this is okay. The worst case is
	 * that multiple processes may configure the file at the same time. */
	if (lvb_ready)
		GOTO(out, rc = 0);

	rc = ll_layout_fetch(inode, lock);
	if (rc < 0)
		GOTO(out, rc);

	/* for layout lock, lmm is stored in lock's lvb.
	 * lvb_data is immutable if the lock is held so it's safe to access it
	 * without res lock.
	 *
	 * set layout to file. Unlikely this will fail as old layout was
	 * surely eliminated */
	memset(&conf, 0, sizeof conf);
	conf.coc_opc = OBJECT_CONF_SET;
	conf.coc_inode = inode;
	conf.coc_lock = lock;
	conf.u.coc_layout.lb_buf = lock->l_lvb_data;
	conf.u.coc_layout.lb_len = lock->l_lvb_len;
	rc = ll_layout_conf(inode, &conf);

	/* refresh layout failed, need to wait */
	wait_layout = rc == -EBUSY;
	EXIT;
out:
	LDLM_LOCK_PUT(lock);
	ldlm_lock_decref(lockh, mode);

	/* wait for IO to complete if it's still being used. */
	if (wait_layout) {
		CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&lli->lli_fid), inode);

		memset(&conf, 0, sizeof conf);
		conf.coc_opc = OBJECT_CONF_WAIT;
		conf.coc_inode = inode;
		rc = ll_layout_conf(inode, &conf);
		if (rc == 0)
			rc = -EAGAIN;

		CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&lli->lli_fid), rc);
	}
	RETURN(rc);
}
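/*
 * Design note: OBJECT_CONF_WAIT converts a busy layout switch into
 * -EAGAIN, which makes the enqueue loop in ll_layout_refresh() below
 * retry once IO against the old layout has drained.
 */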
/**
 * Issue layout intent RPC to MDS.
 * \param inode [in] file inode
 * \param intent [in] layout intent
 *
 * \retval 0 on success
 * \retval < 0 error code
 */
static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct lookup_intent it;
	struct ptlrpc_request *req;
	int rc;
	ENTRY;

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
				     0, 0, LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	op_data->op_data = intent;
	op_data->op_data_size = sizeof(*intent);

	memset(&it, 0, sizeof(it));
	it.it_op = IT_LAYOUT;
	if (intent->li_opc == LAYOUT_INTENT_WRITE ||
	    intent->li_opc == LAYOUT_INTENT_TRUNC)
		it.it_flags = FMODE_WRITE;

	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
			  ll_get_fsname(inode->i_sb, NULL, 0),
			  PFID(&lli->lli_fid), inode);

	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
			    &ll_md_blocking_ast, 0);
	if (it.it_request != NULL)
		ptlrpc_req_finished(it.it_request);
	it.it_request = NULL;

	ll_finish_md_op_data(op_data);

	/* set lock data in case this is a new lock */
	if (rc == 0)
		ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);

	ll_intent_drop_lock(&it);

	RETURN(rc);
}
/**
 * This function checks if there exists a LAYOUT lock on the client side,
 * or enqueues it if it doesn't have one in cache.
 *
 * This function does not hold the layout lock, so it may be revoked any time
 * after this function returns. Any operations that depend on the layout
 * should be redone in that case.
 *
 * This function should be called before lov_io_init() to get an uptodate
 * layout version; the caller should save the version number, and once IO
 * is finished, call it again to verify that the layout was not changed
 * while the IO was in flight.
 */
int ll_layout_refresh(struct inode *inode, __u32 *gen)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct lustre_handle lockh;
	struct layout_intent intent = {
		.li_opc = LAYOUT_INTENT_ACCESS,
	};
	enum ldlm_mode mode;
	int rc;
	ENTRY;

	*gen = ll_layout_version_get(lli);
	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
		RETURN(0);

	/* sanity checks */
	LASSERT(fid_is_sane(ll_inode2fid(inode)));
	LASSERT(S_ISREG(inode->i_mode));

	/* take layout lock mutex to enqueue layout lock exclusively. */
	mutex_lock(&lli->lli_layout_mutex);

	while (1) {
		/* mostly the layout lock is cached on the local side, so try
		 * to match it before enqueuing a new one. */
		mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
				       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
		if (mode != 0) { /* hit cached lock */
			rc = ll_layout_lock_set(&lockh, mode, inode);
			if (rc == -EAGAIN)
				continue;
			break;
		}

		rc = ll_layout_intent(inode, &intent);
		if (rc != 0)
			break;
	}

	if (rc == 0)
		*gen = ll_layout_version_get(lli);
	mutex_unlock(&lli->lli_layout_mutex);

	RETURN(rc);
}
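/*
 * Typical caller pattern (a sketch following the comment block above; the
 * local names are illustrative):
 *
 *	__u32 gen, verify;
 *
 *	rc = ll_layout_refresh(inode, &gen);
 *	... initialize and run IO against the current layout ...
 *	rc = ll_layout_refresh(inode, &verify);
 *	if (rc == 0 && gen != verify)
 *		... layout changed while IO was in flight; redo ...
 */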
/**
 * Issue layout intent RPC indicating where in a file an IO is about to write.
 *
 * \param[in] inode	file inode.
 * \param[in] opc	layout intent opcode (write or truncate).
 * \param[in] ext	write range with start offset of file in bytes where
 *			an IO is about to write, and exclusive end offset in
 *			bytes.
 *
 * \retval 0 on success
 * \retval < 0 error code
 */
int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
			   struct lu_extent *ext)
{
	struct layout_intent intent = {
		.li_opc = opc,
		.li_extent.e_start = ext->e_start,
		.li_extent.e_end = ext->e_end,
	};
	int rc;
	ENTRY;

	rc = ll_layout_intent(inode, &intent);

	RETURN(rc);
}
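/*
 * Illustrative call (a sketch; the range is made up): before writing the
 * first MiB of a file whose layout is not instantiated there yet:
 *
 *	struct lu_extent ext = { .e_start = 0, .e_end = 1 << 20 };
 *
 *	rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE, &ext);
 */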
/**
 * This function sends a restore request to the MDT.
 */
int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
{
	struct hsm_user_request *hur;
	int len, rc;
	ENTRY;

	len = sizeof(struct hsm_user_request) +
	      sizeof(struct hsm_user_item);
	OBD_ALLOC(hur, len);
	if (hur == NULL)
		RETURN(-ENOMEM);

	hur->hur_request.hr_action = HUA_RESTORE;
	hur->hur_request.hr_archive_id = 0;
	hur->hur_request.hr_flags = 0;
	memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
	       sizeof(hur->hur_user_item[0].hui_fid));
	hur->hur_user_item[0].hui_extent.offset = offset;
	hur->hur_user_item[0].hui_extent.length = length;
	hur->hur_request.hr_itemcount = 1;
	rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,