4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
struct split_param {
	struct inode	*sp_inode;
	__u16		 sp_mirror_id;
};
static int
ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
			  bool *lease_broken);
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
75 fd->fd_write_failed = false;
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
 * Packs all the attributes into @op_data for the CLOSE RPC.
 */
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
105 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
106 op_data->op_attr_blocks = inode->i_blocks;
107 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
108 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
109 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
110 op_data->op_open_handle = och->och_open_handle;
112 if (och->och_flags & FMODE_WRITE &&
113 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
/* For HSM: if the inode data has been modified, pack it so that
 * the MDT can set the data-dirty flag in the archive. */
116 op_data->op_bias |= MDS_DATA_MODIFIED;
 * Perform a close, possibly with a bias.
 * The meaning of \a data depends on the value of \a bias.
 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 * swap layouts with.
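 * If \a bias is MDS_CLOSE_LAYOUT_SPLIT then \a data is a pointer to a
 * struct split_param carrying the victim inode and mirror id.
 * If \a bias is MDS_CLOSE_RESYNC_DONE then \a data is a pointer to a
 * struct ll_ioc_lease holding the resynced mirror ids.
 */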
129 static int ll_close_inode_openhandle(struct inode *inode,
130 struct obd_client_handle *och,
131 enum mds_op_bias bias, void *data)
133 struct obd_export *md_exp = ll_i2mdexp(inode);
134 const struct ll_inode_info *lli = ll_i2info(inode);
135 struct md_op_data *op_data;
136 struct ptlrpc_request *req = NULL;
140 if (class_exp2obd(md_exp) == NULL) {
141 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
142 ll_get_fsname(inode->i_sb, NULL, 0),
143 PFID(&lli->lli_fid));
147 OBD_ALLOC_PTR(op_data);
/* We leak the openhandle and the request here on error, but there is not
 * much to be done in the OOM case since the app won't retry the close on
 * error either. */
151 GOTO(out, rc = -ENOMEM);
153 ll_prepare_close(inode, op_data, och);
155 case MDS_CLOSE_LAYOUT_MERGE:
156 /* merge blocks from the victim inode */
157 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
158 op_data->op_attr.ia_valid |= ATTR_SIZE;
159 op_data->op_xvalid |= OP_XVALID_BLOCKS;
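/* fallthrough: MDS_CLOSE_LAYOUT_MERGE shares the SPLIT/SWAP packing below */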
160 case MDS_CLOSE_LAYOUT_SPLIT:
161 case MDS_CLOSE_LAYOUT_SWAP: {
162 struct split_param *sp = data;
164 LASSERT(data != NULL);
165 op_data->op_bias |= bias;
166 op_data->op_data_version = 0;
167 op_data->op_lease_handle = och->och_lease_handle;
168 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
169 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
170 op_data->op_mirror_id = sp->sp_mirror_id;
172 op_data->op_fid2 = *ll_inode2fid(data);
177 case MDS_CLOSE_RESYNC_DONE: {
178 struct ll_ioc_lease *ioc = data;
180 LASSERT(data != NULL);
181 op_data->op_attr_blocks +=
182 ioc->lil_count * op_data->op_attr_blocks;
183 op_data->op_attr.ia_valid |= ATTR_SIZE;
184 op_data->op_xvalid |= OP_XVALID_BLOCKS;
185 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
187 op_data->op_lease_handle = och->och_lease_handle;
188 op_data->op_data = &ioc->lil_ids[0];
189 op_data->op_data_size =
190 ioc->lil_count * sizeof(ioc->lil_ids[0]);
194 case MDS_HSM_RELEASE:
195 LASSERT(data != NULL);
196 op_data->op_bias |= MDS_HSM_RELEASE;
197 op_data->op_data_version = *(__u64 *)data;
198 op_data->op_lease_handle = och->och_lease_handle;
199 op_data->op_attr.ia_valid |= ATTR_SIZE;
200 op_data->op_xvalid |= OP_XVALID_BLOCKS;
204 LASSERT(data == NULL);
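/* any size/blocks value not explicitly refreshed above is sent as a "lazy"
 * value, presumably so the MDT treats it as a hint rather than an
 * authoritative attribute */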
208 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
209 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
210 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
211 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
213 rc = md_close(md_exp, op_data, och->och_mod, &req);
214 if (rc != 0 && rc != -EINTR)
215 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
216 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
218 if (rc == 0 && op_data->op_bias & bias) {
219 struct mdt_body *body;
221 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
222 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
226 ll_finish_md_op_data(op_data);
230 md_clear_open_replay_data(md_exp, och);
231 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
ptlrpc_req_finished(req); /* This is the close request */
238 int ll_md_real_close(struct inode *inode, fmode_t fmode)
240 struct ll_inode_info *lli = ll_i2info(inode);
241 struct obd_client_handle **och_p;
242 struct obd_client_handle *och;
247 if (fmode & FMODE_WRITE) {
248 och_p = &lli->lli_mds_write_och;
249 och_usecount = &lli->lli_open_fd_write_count;
250 } else if (fmode & FMODE_EXEC) {
251 och_p = &lli->lli_mds_exec_och;
252 och_usecount = &lli->lli_open_fd_exec_count;
254 LASSERT(fmode & FMODE_READ);
255 och_p = &lli->lli_mds_read_och;
256 och_usecount = &lli->lli_open_fd_read_count;
259 mutex_lock(&lli->lli_och_mutex);
260 if (*och_usecount > 0) {
/* There are still users of this handle, so skip
 * freeing it. */
263 mutex_unlock(&lli->lli_och_mutex);
269 mutex_unlock(&lli->lli_och_mutex);
/* There might be a race and this handle may already
 * be closed. */
274 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
280 static int ll_md_close(struct inode *inode, struct file *file)
282 union ldlm_policy_data policy = {
283 .l_inodebits = { MDS_INODELOCK_OPEN },
285 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
286 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
287 struct ll_inode_info *lli = ll_i2info(inode);
288 struct lustre_handle lockh;
289 enum ldlm_mode lockmode;
293 /* clear group lock, if present */
294 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
295 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
297 if (fd->fd_lease_och != NULL) {
/* Usually the lease is not released when the
 * application crashes; we need to release it here. */
302 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
303 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
304 PFID(&lli->lli_fid), rc, lease_broken);
306 fd->fd_lease_och = NULL;
309 if (fd->fd_och != NULL) {
310 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
/* Let's see if we have a good enough OPEN lock on the file and if
 * we can skip talking to the MDS */
317 mutex_lock(&lli->lli_och_mutex);
318 if (fd->fd_omode & FMODE_WRITE) {
320 LASSERT(lli->lli_open_fd_write_count);
321 lli->lli_open_fd_write_count--;
322 } else if (fd->fd_omode & FMODE_EXEC) {
324 LASSERT(lli->lli_open_fd_exec_count);
325 lli->lli_open_fd_exec_count--;
328 LASSERT(lli->lli_open_fd_read_count);
329 lli->lli_open_fd_read_count--;
331 mutex_unlock(&lli->lli_och_mutex);
333 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
334 LDLM_IBITS, &policy, lockmode, &lockh))
335 rc = ll_md_real_close(inode, fd->fd_omode);
338 LUSTRE_FPRIVATE(file) = NULL;
339 ll_file_data_put(fd);
/* While this returns an error code, the caller (fput()) does not check it,
 * so we need to make every effort to clean up all of our state here. Also,
 * applications rarely check close errors, and even if an error is returned
 * they will not retry the close call.
 */
349 int ll_file_release(struct inode *inode, struct file *file)
351 struct ll_file_data *fd;
352 struct ll_sb_info *sbi = ll_i2sbi(inode);
353 struct ll_inode_info *lli = ll_i2info(inode);
357 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
358 PFID(ll_inode2fid(inode)), inode);
360 if (inode->i_sb->s_root != file_dentry(file))
361 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
362 fd = LUSTRE_FPRIVATE(file);
/* The last ref on @file may not be held by the statahead owner pid,
 * because parent and child processes can share the same file handle. */
367 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
368 ll_deauthorize_statahead(inode, fd);
370 if (inode->i_sb->s_root == file_dentry(file)) {
371 LUSTRE_FPRIVATE(file) = NULL;
372 ll_file_data_put(fd);
376 if (!S_ISDIR(inode->i_mode)) {
377 if (lli->lli_clob != NULL)
378 lov_read_and_clear_async_rc(lli->lli_clob);
379 lli->lli_async_rc = 0;
382 rc = ll_md_close(inode, file);
384 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
385 libcfs_debug_dumplog();
390 static inline int ll_dom_readpage(void *data, struct page *page)
392 struct niobuf_local *lnb = data;
395 kaddr = ll_kmap_atomic(page, KM_USER0);
396 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
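/* zero the remainder of the page beyond the inline data */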
397 if (lnb->lnb_len < PAGE_SIZE)
398 memset(kaddr + lnb->lnb_len, 0,
399 PAGE_SIZE - lnb->lnb_len);
400 flush_dcache_page(page);
401 SetPageUptodate(page);
402 ll_kunmap_atomic(kaddr, KM_USER0);
408 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
409 struct lookup_intent *it)
411 struct ll_inode_info *lli = ll_i2info(inode);
412 struct cl_object *obj = lli->lli_clob;
413 struct address_space *mapping = inode->i_mapping;
415 struct niobuf_remote *rnb;
420 struct lustre_handle lockh;
421 struct ldlm_lock *lock;
422 unsigned long index, start;
423 struct niobuf_local lnb;
425 bool dom_lock = false;
432 if (it->it_lock_mode != 0) {
433 lockh.cookie = it->it_lock_handle;
434 lock = ldlm_handle2lock(&lockh);
436 dom_lock = ldlm_has_dom(lock);
443 env = cl_env_get(&refcheck);
447 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
449 GOTO(out_env, rc = -ENODATA);
451 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
452 data = (char *)rnb + sizeof(*rnb);
454 if (rnb == NULL || rnb->rnb_len == 0)
455 GOTO(out_env, rc = 0);
457 CDEBUG(D_INFO, "Get data buffer along with open, len %i, i_size %llu\n",
458 rnb->rnb_len, i_size_read(inode));
460 io = vvp_env_thread_io(env);
462 io->ci_ignore_layout = 1;
463 rc = cl_io_init(env, io, CIT_MISC, obj);
467 lnb.lnb_file_offset = rnb->rnb_offset;
468 start = lnb.lnb_file_offset / PAGE_SIZE;
470 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
471 lnb.lnb_page_offset = 0;
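/* copy the inline reply buffer into the page cache one page at a time */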
475 lnb.lnb_data = data + (index << PAGE_SHIFT);
476 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
477 if (lnb.lnb_len > PAGE_SIZE)
478 lnb.lnb_len = PAGE_SIZE;
480 vmpage = read_cache_page(mapping, index + start,
481 ll_dom_readpage, &lnb);
482 if (IS_ERR(vmpage)) {
483 CWARN("%s: cannot fill page %lu for "DFID
484 " with data: rc = %li\n",
485 ll_get_fsname(inode->i_sb, NULL, 0),
486 index + start, PFID(lu_object_fid(&obj->co_lu)),
491 if (vmpage->mapping == NULL) {
494 /* page was truncated */
495 GOTO(out_io, rc = -ENODATA);
497 clp = cl_page_find(env, obj, vmpage->index, vmpage,
502 GOTO(out_io, rc = PTR_ERR(clp));
506 cl_page_export(env, clp, 1);
507 cl_page_put(env, clp);
511 } while (rnb->rnb_len > (index << PAGE_SHIFT));
517 cl_env_put(env, &refcheck);
520 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
521 struct lookup_intent *itp)
523 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
524 struct dentry *parent = de->d_parent;
525 const char *name = NULL;
527 struct md_op_data *op_data;
528 struct ptlrpc_request *req = NULL;
532 LASSERT(parent != NULL);
533 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
/* if the server supports open-by-FID, or the file name is invalid,
 * don't pack the name in the open request */
537 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
538 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
539 name = de->d_name.name;
540 len = de->d_name.len;
543 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
544 name, len, 0, LUSTRE_OPC_ANY, NULL);
546 RETURN(PTR_ERR(op_data));
547 op_data->op_data = lmm;
548 op_data->op_data_size = lmmsize;
550 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
551 &ll_md_blocking_ast, 0);
552 ll_finish_md_op_data(op_data);
/* The reason for keeping our own exit path is to avoid flooding the log
 * with -ESTALE error messages.
 */
557 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
558 it_open_error(DISP_OPEN_OPEN, itp))
560 ll_release_openhandle(de, itp);
564 if (it_disposition(itp, DISP_LOOKUP_NEG))
565 GOTO(out, rc = -ENOENT);
567 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
568 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
569 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
573 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
575 if (!rc && itp->it_lock_mode) {
576 ll_dom_finish_open(de->d_inode, req, itp);
577 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
581 ptlrpc_req_finished(req);
582 ll_intent_drop_lock(itp);
/* We did open by FID, but by the time we got to the server,
 * the object had disappeared. If this is a create, we cannot really
 * tell userspace that the file it was trying to create
 * does not exist. Instead return -ESTALE, and the VFS will
 * retry the create with LOOKUP_REVAL, which we catch
 * in ll_revalidate_dentry() and fall back to lookup.
 */
591 if (rc == -ENOENT && itp->it_op & IT_CREAT)
597 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
598 struct obd_client_handle *och)
600 struct mdt_body *body;
602 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
603 och->och_open_handle = body->mbo_open_handle;
604 och->och_fid = body->mbo_fid1;
605 och->och_lease_handle.cookie = it->it_lock_handle;
606 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
607 och->och_flags = it->it_flags;
609 return md_set_open_replay_data(md_exp, och, it);
612 static int ll_local_open(struct file *file, struct lookup_intent *it,
613 struct ll_file_data *fd, struct obd_client_handle *och)
615 struct inode *inode = file_inode(file);
618 LASSERT(!LUSTRE_FPRIVATE(file));
625 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
630 LUSTRE_FPRIVATE(file) = fd;
631 ll_readahead_init(inode, &fd->fd_ras);
632 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
634 /* ll_cl_context initialize */
635 rwlock_init(&fd->fd_lock);
636 INIT_LIST_HEAD(&fd->fd_lccs);
641 /* Open a file, and (for the very first open) create objects on the OSTs at
642 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
643 * creation or open until ll_lov_setstripe() ioctl is called.
645 * If we already have the stripe MD locally then we don't request it in
646 * md_open(), by passing a lmm_size = 0.
648 * It is up to the application to ensure no other processes open this file
649 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
650 * used. We might be able to avoid races of that sort by getting lli_open_sem
651 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
652 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
654 int ll_file_open(struct inode *inode, struct file *file)
656 struct ll_inode_info *lli = ll_i2info(inode);
657 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
658 .it_flags = file->f_flags };
659 struct obd_client_handle **och_p = NULL;
660 __u64 *och_usecount = NULL;
661 struct ll_file_data *fd;
665 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
666 PFID(ll_inode2fid(inode)), inode, file->f_flags);
668 it = file->private_data; /* XXX: compat macro */
669 file->private_data = NULL; /* prevent ll_local_open assertion */
671 fd = ll_file_data_get();
673 GOTO(out_nofiledata, rc = -ENOMEM);
676 if (S_ISDIR(inode->i_mode))
677 ll_authorize_statahead(inode, fd);
679 if (inode->i_sb->s_root == file_dentry(file)) {
680 LUSTRE_FPRIVATE(file) = fd;
684 if (!it || !it->it_disposition) {
/* Convert f_flags into access mode. We cannot use file->f_mode,
 * because everything but the O_ACCMODE mask was stripped from
 * there. Adding one maps the open(2) access modes O_RDONLY (0),
 * O_WRONLY (1) and O_RDWR (2) onto the FMODE_READ|FMODE_WRITE bits. */
if ((oit.it_flags + 1) & O_ACCMODE)
690 if (file->f_flags & O_TRUNC)
691 oit.it_flags |= FMODE_WRITE;
/* The kernel only calls f_op->open() from dentry_open(); filp_open()
 * calls dentry_open() after open_namei() has checked permissions.
 * Only nfsd_open() calls dentry_open() directly without checking
 * permissions, so the code below is safe.
 */
698 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
699 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
701 /* We do not want O_EXCL here, presumably we opened the file
702 * already? XXX - NFS implications? */
703 oit.it_flags &= ~O_EXCL;
/* Bug 20584: if "it_flags" contains O_CREAT, the file will be
 * created if necessary, so "IT_CREAT" should be set to stay
 * consistent with it */
708 if (oit.it_flags & O_CREAT)
709 oit.it_op |= IT_CREAT;
715 /* Let's see if we have file open on MDS already. */
716 if (it->it_flags & FMODE_WRITE) {
717 och_p = &lli->lli_mds_write_och;
718 och_usecount = &lli->lli_open_fd_write_count;
719 } else if (it->it_flags & FMODE_EXEC) {
720 och_p = &lli->lli_mds_exec_och;
721 och_usecount = &lli->lli_open_fd_exec_count;
723 och_p = &lli->lli_mds_read_och;
724 och_usecount = &lli->lli_open_fd_read_count;
727 mutex_lock(&lli->lli_och_mutex);
728 if (*och_p) { /* Open handle is present */
729 if (it_disposition(it, DISP_OPEN_OPEN)) {
/* Well, there's an extra open request that we do not need;
 * close it somehow. This will decref the request. */
732 rc = it_open_error(DISP_OPEN_OPEN, it);
734 mutex_unlock(&lli->lli_och_mutex);
735 GOTO(out_openerr, rc);
738 ll_release_openhandle(file_dentry(file), it);
742 rc = ll_local_open(file, it, fd, NULL);
745 mutex_unlock(&lli->lli_och_mutex);
746 GOTO(out_openerr, rc);
749 LASSERT(*och_usecount == 0);
750 if (!it->it_disposition) {
751 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
/* We cannot just request a lock handle now; the new ELC code
 * means that one of the other OPEN locks for this file
 * could be cancelled, and since the blocking AST handler
 * would attempt to grab och_mutex as well, that would
 * result in a deadlock */
757 mutex_unlock(&lli->lli_och_mutex);
 * Normally called under two situations:
 * 1. NFS export.
 * 2. A race/condition on the MDS resulting in no open
 *    handle being returned from the LOOKUP|OPEN request,
 *    for example if the target entry was a symlink.
 *
 * Only fetch MDS_OPEN_LOCK if this is in the NFS path,
 * marked by a bit set in ll_iget_for_nfs. Clear the
 * bit so that it does not confuse later callers.
 *
 * NB: when ldd is NULL, it must have come via the normal
 * lookup path only, since ll_iget_for_nfs always calls
 * ll_d_init().
 */
773 if (ldd && ldd->lld_nfs_dentry) {
774 ldd->lld_nfs_dentry = 0;
775 it->it_flags |= MDS_OPEN_LOCK;
779 * Always specify MDS_OPEN_BY_FID because we don't want
780 * to get file with different fid.
782 it->it_flags |= MDS_OPEN_BY_FID;
783 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
786 GOTO(out_openerr, rc);
790 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
792 GOTO(out_och_free, rc = -ENOMEM);
/* md_intent_lock() didn't get a request ref if there was an
 * open error, so don't do cleanup on the request here */
/* XXX (green): Shouldn't we bail out on any error here, not
 * just an open error? */
801 rc = it_open_error(DISP_OPEN_OPEN, it);
803 GOTO(out_och_free, rc);
805 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
806 "inode %p: disposition %x, status %d\n", inode,
807 it_disposition(it, ~0), it->it_status);
809 rc = ll_local_open(file, it, fd, *och_p);
811 GOTO(out_och_free, rc);
813 mutex_unlock(&lli->lli_och_mutex);
/* Must do this outside the lli_och_mutex lock to prevent a deadlock where
 * a different kind of OPEN lock for this same inode gets cancelled
 * by ldlm_cancel_lru */
819 if (!S_ISREG(inode->i_mode))
820 GOTO(out_och_free, rc);
822 cl_lov_delay_create_clear(&file->f_flags);
823 GOTO(out_och_free, rc);
827 if (och_p && *och_p) {
828 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
829 *och_p = NULL; /* OBD_FREE writes some magic there */
832 mutex_unlock(&lli->lli_och_mutex);
835 if (lli->lli_opendir_key == fd)
836 ll_deauthorize_statahead(inode, fd);
838 ll_file_data_put(fd);
840 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
844 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
845 ptlrpc_req_finished(it->it_request);
846 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
852 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
853 struct ldlm_lock_desc *desc, void *data, int flag)
856 struct lustre_handle lockh;
860 case LDLM_CB_BLOCKING:
861 ldlm_lock2handle(lock, &lockh);
862 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
864 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
868 case LDLM_CB_CANCELING:
 * When setting a lease on a file, we take ownership of the lli_mds_*_och
 * and save it as fd->fd_och so as to force the client to reopen the file
 * even if it has an open lock in cache already.
 */
880 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
881 struct lustre_handle *old_open_handle)
883 struct ll_inode_info *lli = ll_i2info(inode);
884 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
885 struct obd_client_handle **och_p;
890 /* Get the openhandle of the file */
891 mutex_lock(&lli->lli_och_mutex);
892 if (fd->fd_lease_och != NULL)
893 GOTO(out_unlock, rc = -EBUSY);
895 if (fd->fd_och == NULL) {
896 if (file->f_mode & FMODE_WRITE) {
897 LASSERT(lli->lli_mds_write_och != NULL);
898 och_p = &lli->lli_mds_write_och;
899 och_usecount = &lli->lli_open_fd_write_count;
901 LASSERT(lli->lli_mds_read_och != NULL);
902 och_p = &lli->lli_mds_read_och;
903 och_usecount = &lli->lli_open_fd_read_count;
906 if (*och_usecount > 1)
907 GOTO(out_unlock, rc = -EBUSY);
914 *old_open_handle = fd->fd_och->och_open_handle;
918 mutex_unlock(&lli->lli_och_mutex);
923 * Release ownership on lli_mds_*_och when putting back a file lease.
925 static int ll_lease_och_release(struct inode *inode, struct file *file)
927 struct ll_inode_info *lli = ll_i2info(inode);
928 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
929 struct obd_client_handle **och_p;
930 struct obd_client_handle *old_och = NULL;
935 mutex_lock(&lli->lli_och_mutex);
936 if (file->f_mode & FMODE_WRITE) {
937 och_p = &lli->lli_mds_write_och;
938 och_usecount = &lli->lli_open_fd_write_count;
940 och_p = &lli->lli_mds_read_och;
941 och_usecount = &lli->lli_open_fd_read_count;
/* The file may have been opened by another process (broken lease), so
 * *och_p is not NULL. In this case we should simply increase the usecount
 * and close fd_och.
 */
948 if (*och_p != NULL) {
949 old_och = fd->fd_och;
956 mutex_unlock(&lli->lli_och_mutex);
959 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
965 * Acquire a lease and open the file.
967 static struct obd_client_handle *
968 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
971 struct lookup_intent it = { .it_op = IT_OPEN };
972 struct ll_sb_info *sbi = ll_i2sbi(inode);
973 struct md_op_data *op_data;
974 struct ptlrpc_request *req = NULL;
975 struct lustre_handle old_open_handle = { 0 };
976 struct obd_client_handle *och = NULL;
981 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
982 RETURN(ERR_PTR(-EINVAL));
985 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
986 RETURN(ERR_PTR(-EPERM));
988 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
995 RETURN(ERR_PTR(-ENOMEM));
997 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
998 LUSTRE_OPC_ANY, NULL);
1000 GOTO(out, rc = PTR_ERR(op_data));
1002 /* To tell the MDT this openhandle is from the same owner */
1003 op_data->op_open_handle = old_open_handle;
1005 it.it_flags = fmode | open_flags;
1006 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1007 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1008 &ll_md_blocking_lease_ast,
/* LDLM_FL_NO_LRU: Do not put the lease lock into the LRU list, otherwise
 * it can be cancelled, which may mislead applications into thinking the
 * lease is broken.
 * LDLM_FL_EXCL: Set this flag so that it won't be matched by a normal
 * open in ll_md_blocking_ast(). Otherwise, as ll_md_blocking_lease_ast()
 * doesn't deal with the openhandle, the normal openhandle would be leaked. */
1015 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1016 ll_finish_md_op_data(op_data);
1017 ptlrpc_req_finished(req);
1019 GOTO(out_release_it, rc);
1021 if (it_disposition(&it, DISP_LOOKUP_NEG))
1022 GOTO(out_release_it, rc = -ENOENT);
1024 rc = it_open_error(DISP_OPEN_OPEN, &it);
1026 GOTO(out_release_it, rc);
1028 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1029 ll_och_fill(sbi->ll_md_exp, &it, och);
1031 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1032 GOTO(out_close, rc = -EOPNOTSUPP);
/* lease already acquired; handle the lease lock */
1035 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1036 if (it.it_lock_mode == 0 ||
1037 it.it_lock_bits != MDS_INODELOCK_OPEN) {
/* an open lock must be returned for a lease */
1039 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1040 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1042 GOTO(out_close, rc = -EPROTO);
1045 ll_intent_release(&it);
1049 /* Cancel open lock */
1050 if (it.it_lock_mode != 0) {
1051 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1053 it.it_lock_mode = 0;
1054 och->och_lease_handle.cookie = 0ULL;
1056 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1058 CERROR("%s: error closing file "DFID": %d\n",
1059 ll_get_fsname(inode->i_sb, NULL, 0),
1060 PFID(&ll_i2info(inode)->lli_fid), rc2);
1061 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1063 ll_intent_release(&it);
1067 RETURN(ERR_PTR(rc));
1071 * Check whether a layout swap can be done between two inodes.
1073 * \param[in] inode1 First inode to check
1074 * \param[in] inode2 Second inode to check
1076 * \retval 0 on success, layout swap can be performed between both inodes
1077 * \retval negative error code if requirements are not met
1079 static int ll_check_swap_layouts_validity(struct inode *inode1,
1080 struct inode *inode2)
1082 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1085 if (inode_permission(inode1, MAY_WRITE) ||
1086 inode_permission(inode2, MAY_WRITE))
1089 if (inode1->i_sb != inode2->i_sb)
1095 static int ll_swap_layouts_close(struct obd_client_handle *och,
1096 struct inode *inode, struct inode *inode2)
1098 const struct lu_fid *fid1 = ll_inode2fid(inode);
1099 const struct lu_fid *fid2;
1103 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1104 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1106 rc = ll_check_swap_layouts_validity(inode, inode2);
1108 GOTO(out_free_och, rc);
/* We now know that inode2 is a Lustre inode */
1111 fid2 = ll_inode2fid(inode2);
1113 rc = lu_fid_cmp(fid1, fid2);
1115 GOTO(out_free_och, rc = -EINVAL);
/* Close the file and {swap,merge} layouts between inode & inode2.
 * NB: the lease lock handle is released in mdc_close_layout_swap_pack()
 * because we still need it to pack l_remote_handle to the MDT. */
1120 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1123 och = NULL; /* freed in ll_close_inode_openhandle() */
 * Release the lease and close the file.
 * It will check whether the lease has ever been broken.
 */
1136 static int ll_lease_close_intent(struct obd_client_handle *och,
1137 struct inode *inode,
1138 bool *lease_broken, enum mds_op_bias bias,
1141 struct ldlm_lock *lock;
1142 bool cancelled = true;
1146 lock = ldlm_handle2lock(&och->och_lease_handle);
1148 lock_res_and_lock(lock);
1149 cancelled = ldlm_is_cancel(lock);
1150 unlock_res_and_lock(lock);
1151 LDLM_LOCK_PUT(lock);
1154 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1155 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1157 if (lease_broken != NULL)
1158 *lease_broken = cancelled;
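/* the lease lock is still granted and there is no intent to pack;
 * just cancel it voluntarily */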
1160 if (!cancelled && !bias)
1161 ldlm_cli_cancel(&och->och_lease_handle, 0);
if (cancelled) { /* no need to execute the intent */
1168 rc = ll_close_inode_openhandle(inode, och, bias, data);
1172 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1175 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
 * After a lease is taken, send the MDS_REINT_RESYNC RPC to the MDT.
 */
1181 static int ll_lease_file_resync(struct obd_client_handle *och,
1182 struct inode *inode)
1184 struct ll_sb_info *sbi = ll_i2sbi(inode);
1185 struct md_op_data *op_data;
1186 __u64 data_version_unused;
1190 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1191 LUSTRE_OPC_ANY, NULL);
1192 if (IS_ERR(op_data))
1193 RETURN(PTR_ERR(op_data));
/* Before starting a file resync, it's necessary to clean up the page cache
 * in client memory, otherwise once the layout version is increased,
 * writing back cached data will be denied by the OSTs. */
1198 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1202 op_data->op_lease_handle = och->och_lease_handle;
1203 rc = md_file_resync(sbi->ll_md_exp, op_data);
1209 ll_finish_md_op_data(op_data);
1213 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1215 struct ll_inode_info *lli = ll_i2info(inode);
1216 struct cl_object *obj = lli->lli_clob;
1217 struct cl_attr *attr = vvp_env_thread_attr(env);
1225 ll_inode_size_lock(inode);
/* Merge the timestamps most recently obtained from the MDS with
 * the timestamps obtained from the OSTs.
 *
 * Do not overwrite the atime of the inode, because it may be refreshed
 * by the file_accessed() function. If the read was served by cached
 * data, there is no RPC to be sent, so the atime may not be
 * transferred to the OSTs at all. The MDT only updates atime at close time
 * if it's at least 'mdd.*.atime_diff' older.
 * All in all, atime in Lustre does not strictly comply with
 * POSIX. Solving this problem would require sending an RPC to the MDT for
 * each read, which would hurt performance. */
1238 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1239 LTIME_S(inode->i_atime) = lli->lli_atime;
1240 lli->lli_update_atime = 0;
1242 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1243 LTIME_S(inode->i_ctime) = lli->lli_ctime;
1245 atime = LTIME_S(inode->i_atime);
1246 mtime = LTIME_S(inode->i_mtime);
1247 ctime = LTIME_S(inode->i_ctime);
1249 cl_object_attr_lock(obj);
1250 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1253 rc = cl_object_attr_get(env, obj, attr);
1254 cl_object_attr_unlock(obj);
1257 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1259 if (atime < attr->cat_atime)
1260 atime = attr->cat_atime;
1262 if (ctime < attr->cat_ctime)
1263 ctime = attr->cat_ctime;
1265 if (mtime < attr->cat_mtime)
1266 mtime = attr->cat_mtime;
1268 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1269 PFID(&lli->lli_fid), attr->cat_size);
1271 i_size_write(inode, attr->cat_size);
1272 inode->i_blocks = attr->cat_blocks;
1274 LTIME_S(inode->i_atime) = atime;
1275 LTIME_S(inode->i_mtime) = mtime;
1276 LTIME_S(inode->i_ctime) = ctime;
1279 ll_inode_size_unlock(inode);
 * Set the designated mirror for I/O.
 *
 * So far only read, write, and truncate can issue I/O to
 * a designated mirror.
 */
1290 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1292 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* clear the layout version for generic (non-resync) I/O in case it carries
 * a stale layout version due to an I/O restart */
1296 io->ci_layout_version = 0;
1298 /* FLR: disable non-delay for designated mirror I/O because obviously
1299 * only one mirror is available */
1300 if (fd->fd_designated_mirror > 0) {
1302 io->ci_designated_mirror = fd->fd_designated_mirror;
1303 io->ci_layout_version = fd->fd_layout_version;
1304 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1309 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1312 static bool file_is_noatime(const struct file *file)
1314 const struct vfsmount *mnt = file->f_path.mnt;
1315 const struct inode *inode = file_inode((struct file *)file);
/* Adapted from file_accessed() and touch_atime(). */
1318 if (file->f_flags & O_NOATIME)
1321 if (inode->i_flags & S_NOATIME)
1324 if (IS_NOATIME(inode))
1327 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1330 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1333 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1339 static int ll_file_io_ptask(struct cfs_ptask *ptask);
1341 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1343 struct inode *inode = file_inode(file);
1344 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1346 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1347 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1348 io->u.ci_rw.rw_file = file;
1349 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1350 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1351 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1353 if (iot == CIT_WRITE) {
1354 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1355 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1356 file->f_flags & O_DIRECT ||
1359 io->ci_obj = ll_i2info(inode)->lli_clob;
1360 io->ci_lockreq = CILR_MAYBE;
1361 if (ll_file_nolock(file)) {
1362 io->ci_lockreq = CILR_NEVER;
1363 io->ci_no_srvlock = 1;
1364 } else if (file->f_flags & O_APPEND) {
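/* presumably appends need the mandatory server-side lock so the append
 * offset stays coherent across clients */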
1365 io->ci_lockreq = CILR_MANDATORY;
1367 io->ci_noatime = file_is_noatime(file);
1368 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1369 io->ci_pio = !io->u.ci_rw.rw_append;
/* FLR: only use non-delay I/O for reads, as there is only one
 * available mirror for writes. */
1375 io->ci_ndelay = !(iot == CIT_WRITE);
1377 ll_io_set_mirror(io, file);
1380 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1382 struct cl_io_pt *pt = ptask->pt_cbdata;
1383 struct file *file = pt->cip_file;
1386 loff_t pos = pt->cip_pos;
1391 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1392 file_dentry(file)->d_name.name,
1393 pt->cip_iot == CIT_READ ? "read" : "write",
1394 pos, pos + pt->cip_count);
1396 env = cl_env_get(&refcheck);
1398 RETURN(PTR_ERR(env));
1400 io = vvp_env_thread_io(env);
1401 ll_io_init(io, file, pt->cip_iot);
1402 io->u.ci_rw.rw_iter = pt->cip_iter;
1403 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1404 io->ci_pio = 0; /* It's already in parallel task */
1406 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1407 pt->cip_count - pt->cip_result);
1409 struct vvp_io *vio = vvp_env_io(env);
1411 vio->vui_io_subtype = IO_NORMAL;
1412 vio->vui_fd = LUSTRE_FPRIVATE(file);
1414 ll_cl_add(file, env, io, LCC_RW);
1415 rc = cl_io_loop(env, io);
1416 ll_cl_remove(file, env);
1418 /* cl_io_rw_init() handled IO */
1422 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1428 if (io->ci_nob > 0) {
1429 pt->cip_result += io->ci_nob;
1430 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1432 pt->cip_iocb.ki_pos = pos;
1433 #ifdef HAVE_KIOCB_KI_LEFT
1434 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1435 #elif defined(HAVE_KI_NBYTES)
1436 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1440 cl_io_fini(env, io);
1441 cl_env_put(env, &refcheck);
1443 pt->cip_need_restart = io->ci_need_restart;
1445 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1446 file_dentry(file)->d_name.name,
1447 pt->cip_iot == CIT_READ ? "read" : "write",
1448 pt->cip_result, rc);
1450 RETURN(pt->cip_result > 0 ? 0 : rc);
1454 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1455 struct file *file, enum cl_io_type iot,
1456 loff_t *ppos, size_t count)
1458 struct range_lock range;
1459 struct vvp_io *vio = vvp_env_io(env);
1460 struct inode *inode = file_inode(file);
1461 struct ll_inode_info *lli = ll_i2info(inode);
1462 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1467 unsigned retried = 0;
1468 bool restarted = false;
1472 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1473 file_dentry(file)->d_name.name,
1474 iot == CIT_READ ? "read" : "write", pos, pos + count);
1477 io = vvp_env_thread_io(env);
1478 ll_io_init(io, file, iot);
1479 if (args->via_io_subtype == IO_NORMAL) {
1480 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1481 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1483 if (args->via_io_subtype != IO_NORMAL || restarted)
1485 io->ci_ndelay_tried = retried;
1487 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1488 bool range_locked = false;
1490 if (file->f_flags & O_APPEND)
1491 range_lock_init(&range, 0, LUSTRE_EOF);
1493 range_lock_init(&range, pos, pos + count - 1);
1495 vio->vui_fd = LUSTRE_FPRIVATE(file);
1496 vio->vui_io_subtype = args->via_io_subtype;
1498 switch (vio->vui_io_subtype) {
/* Direct I/O reads must also take the range lock,
 * or multiple reads will try to work on the same pages.
 * See LU-6227 for details. */
1503 if (((iot == CIT_WRITE) ||
1504 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1505 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1506 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1508 rc = range_lock(&lli->lli_write_tree, &range);
1512 range_locked = true;
1516 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1517 vio->u.splice.vui_flags = args->u.splice.via_flags;
1520 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1524 ll_cl_add(file, env, io, LCC_RW);
1525 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1526 !lli->lli_inode_locked) {
1528 lli->lli_inode_locked = 1;
1530 rc = cl_io_loop(env, io);
1531 if (lli->lli_inode_locked) {
1532 lli->lli_inode_locked = 0;
1533 inode_unlock(inode);
1535 ll_cl_remove(file, env);
1538 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1540 range_unlock(&lli->lli_write_tree, &range);
1543 /* cl_io_rw_init() handled IO */
1547 if (io->ci_nob > 0) {
1548 result += io->ci_nob;
1549 count -= io->ci_nob;
1551 if (args->via_io_subtype == IO_NORMAL) {
1552 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1554 /* CLIO is too complicated. See LU-11069. */
1555 if (cl_io_is_append(io))
1556 pos = io->u.ci_rw.rw_iocb.ki_pos;
1560 args->u.normal.via_iocb->ki_pos = pos;
1561 #ifdef HAVE_KIOCB_KI_LEFT
1562 args->u.normal.via_iocb->ki_left = count;
1563 #elif defined(HAVE_KI_NBYTES)
1564 args->u.normal.via_iocb->ki_nbytes = count;
1568 pos = io->u.ci_rw.rw_range.cir_pos;
1572 cl_io_fini(env, io);
1575 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1576 file->f_path.dentry->d_name.name,
1577 iot, rc, result, io->ci_need_restart);
1579 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1581 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1582 file_dentry(file)->d_name.name,
1583 iot == CIT_READ ? "read" : "write",
1584 pos, pos + count, result, rc);
1585 /* preserve the tried count for FLR */
1586 retried = io->ci_ndelay_tried;
1591 if (iot == CIT_READ) {
1593 ll_stats_ops_tally(ll_i2sbi(inode),
1594 LPROC_LL_READ_BYTES, result);
1595 } else if (iot == CIT_WRITE) {
1597 ll_stats_ops_tally(ll_i2sbi(inode),
1598 LPROC_LL_WRITE_BYTES, result);
1599 fd->fd_write_failed = false;
1600 } else if (result == 0 && rc == 0) {
1603 fd->fd_write_failed = true;
1605 fd->fd_write_failed = false;
1606 } else if (rc != -ERESTARTSYS) {
1607 fd->fd_write_failed = true;
1611 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1612 file_dentry(file)->d_name.name,
1613 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1617 RETURN(result > 0 ? result : rc);
1621 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1622 * especially for small I/O.
 * To serve a read request, CLIO has to create and initialize a cl_io and
 * then request a DLM lock. This has turned out to have significant overhead
 * and affects the performance of small I/O dramatically.
1628 * It's not necessary to create a cl_io for each I/O. Under the help of read
1629 * ahead, most of the pages being read are already in memory cache and we can
1630 * read those pages directly because if the pages exist, the corresponding DLM
1631 * lock must exist so that page content must be valid.
 * In the fast read implementation, llite speculatively finds and reads pages
 * in the memory cache. There are three scenarios for fast read:
 * - If the page exists and is uptodate, the kernel VM will provide the data
 * and CLIO won't intervene;
1637 * - If the page was brought into memory by read ahead, it will be exported
1638 * and read ahead parameters will be updated;
1639 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1640 * it will go back and invoke normal read, i.e., a cl_io will be created
1641 * and DLM lock will be requested.
 * POSIX compliance: the POSIX standard states that read is intended to be
 * atomic. Lustre's read implementation is in line with the Linux kernel's,
 * and neither complies with the POSIX standard in this matter. Fast read
 * doesn't make the situation worse on a single node, but it may interleave
 * write results from multiple nodes due to the short read handling in
 * ll_file_aio_read().
1649 * \param env - lu_env
1650 * \param iocb - kiocb from kernel
1651 * \param iter - user space buffers where the data will be copied
 * \retval - number of bytes read, or an error code if an error occurred.
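 *
 * NB: if the data is not already in the page cache, fast read returns
 * without it and the caller falls back to the normal cl_io read path
 * (see ll_file_read_iter()).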
1656 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1660 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
/* NB: we can't do direct I/O for fast read because it will need a lock
 * to make the I/O engine happy. */
1665 if (iocb->ki_filp->f_flags & O_DIRECT)
1668 result = generic_file_read_iter(iocb, iter);
/* If the first page is not in the cache, generic_file_read_iter() will
 * return -ENODATA.
 * See the corresponding code in ll_readpage(). */
1673 if (result == -ENODATA)
1677 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1678 LPROC_LL_READ_BYTES, result);
1684 * Read from a file (through the page cache).
1686 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1689 struct vvp_io_args *args;
1694 result = ll_do_fast_read(iocb, to);
1695 if (result < 0 || iov_iter_count(to) == 0)
1698 env = cl_env_get(&refcheck);
1700 return PTR_ERR(env);
1702 args = ll_env_args(env, IO_NORMAL);
1703 args->u.normal.via_iter = to;
1704 args->u.normal.via_iocb = iocb;
1706 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1707 &iocb->ki_pos, iov_iter_count(to));
1710 else if (result == 0)
1713 cl_env_put(env, &refcheck);
1719 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1720 * If a page is already in the page cache and dirty (and some other things -
1721 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1722 * write to it without doing a full I/O, because Lustre already knows about it
1723 * and will write it out. This saves a lot of processing time.
1725 * All writes here are within one page, so exclusion is handled by the page
 * lock on the VM page. We do not do tiny writes for writes which touch
 * multiple pages, because it's very unlikely that multiple sequential
 * pages are already dirty.
 *
 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively
 * common and are unlikely to be to already-dirty pages.
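 *
 * Note that ll_file_write_iter() skips the tiny-write path entirely for
 * O_DIRECT, O_SYNC and O_APPEND opens (see the checks there).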
1733 * Attribute updates are important here, we do them in ll_tiny_write_end.
1735 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1737 ssize_t count = iov_iter_count(iter);
1738 struct file *file = iocb->ki_filp;
1739 struct inode *inode = file_inode(file);
/* Restrict writes to a single page and < PAGE_SIZE. See the comment at the
 * top of this function for why.
 */
1747 if (count >= PAGE_SIZE ||
1748 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1751 result = __generic_file_write_iter(iocb, iter);
1753 /* If the page is not already dirty, ll_tiny_write_begin returns
1754 * -ENODATA. We continue on to normal write.
1756 if (result == -ENODATA)
1760 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1762 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1765 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1771 * Write to a file (through the page cache).
1773 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1775 struct vvp_io_args *args;
1777 ssize_t rc_tiny = 0, rc_normal;
1782 /* NB: we can't do direct IO for tiny writes because they use the page
1783 * cache, we can't do sync writes because tiny writes can't flush
1784 * pages, and we can't do append writes because we can't guarantee the
1785 * required DLM locks are held to protect file size.
1787 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1788 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1789 rc_tiny = ll_do_tiny_write(iocb, from);
/* In case of error, go on and try a normal write - only stop if the tiny
 * write completed the I/O.
 */
1794 if (iov_iter_count(from) == 0)
1795 GOTO(out, rc_normal = rc_tiny);
1797 env = cl_env_get(&refcheck);
1799 return PTR_ERR(env);
1801 args = ll_env_args(env, IO_NORMAL);
1802 args->u.normal.via_iter = from;
1803 args->u.normal.via_iocb = iocb;
1805 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1806 &iocb->ki_pos, iov_iter_count(from));
1808 /* On success, combine bytes written. */
1809 if (rc_tiny >= 0 && rc_normal > 0)
1810 rc_normal += rc_tiny;
1811 /* On error, only return error from normal write if tiny write did not
1812 * write any bytes. Otherwise return bytes written by tiny write.
1814 else if (rc_tiny > 0)
1815 rc_normal = rc_tiny;
1817 cl_env_put(env, &refcheck);
1822 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1824 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1826 static int ll_file_get_iov_count(const struct iovec *iov,
1827 unsigned long *nr_segs, size_t *count)
1832 for (seg = 0; seg < *nr_segs; seg++) {
1833 const struct iovec *iv = &iov[seg];
1836 * If any segment has a negative length, or the cumulative
1837 * length ever wraps negative then return -EINVAL.
1840 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1842 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1847 cnt -= iv->iov_len; /* This segment is no good */
1854 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1855 unsigned long nr_segs, loff_t pos)
1862 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1866 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1867 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1868 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1869 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1870 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1872 result = ll_file_read_iter(iocb, &to);
1877 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1880 struct iovec iov = { .iov_base = buf, .iov_len = count };
1885 init_sync_kiocb(&kiocb, file);
1886 kiocb.ki_pos = *ppos;
1887 #ifdef HAVE_KIOCB_KI_LEFT
1888 kiocb.ki_left = count;
1889 #elif defined(HAVE_KI_NBYTES)
kiocb.ki_nbytes = count;
1893 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1894 *ppos = kiocb.ki_pos;
1900 * Write to a file (through the page cache).
1903 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1904 unsigned long nr_segs, loff_t pos)
1906 struct iov_iter from;
1911 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1915 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1916 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1917 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1918 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1919 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1921 result = ll_file_write_iter(iocb, &from);
1926 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1927 size_t count, loff_t *ppos)
1929 struct iovec iov = { .iov_base = (void __user *)buf,
1936 init_sync_kiocb(&kiocb, file);
1937 kiocb.ki_pos = *ppos;
1938 #ifdef HAVE_KIOCB_KI_LEFT
1939 kiocb.ki_left = count;
1940 #elif defined(HAVE_KI_NBYTES)
1941 kiocb.ki_nbytes = count;
1944 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1945 *ppos = kiocb.ki_pos;
1949 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1952 * Send file content (through pagecache) somewhere with helper
1954 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1955 struct pipe_inode_info *pipe, size_t count,
1959 struct vvp_io_args *args;
1964 env = cl_env_get(&refcheck);
1966 RETURN(PTR_ERR(env));
1968 args = ll_env_args(env, IO_SPLICE);
1969 args->u.splice.via_pipe = pipe;
1970 args->u.splice.via_flags = flags;
1972 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1973 cl_env_put(env, &refcheck);
1977 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1978 __u64 flags, struct lov_user_md *lum, int lum_size)
1980 struct lookup_intent oit = {
1982 .it_flags = flags | MDS_OPEN_BY_FID,
1987 ll_inode_size_lock(inode);
1988 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1990 GOTO(out_unlock, rc);
1992 ll_release_openhandle(dentry, &oit);
1995 ll_inode_size_unlock(inode);
1996 ll_intent_release(&oit);
2001 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2002 struct lov_mds_md **lmmp, int *lmm_size,
2003 struct ptlrpc_request **request)
2005 struct ll_sb_info *sbi = ll_i2sbi(inode);
2006 struct mdt_body *body;
2007 struct lov_mds_md *lmm = NULL;
2008 struct ptlrpc_request *req = NULL;
2009 struct md_op_data *op_data;
2012 rc = ll_get_default_mdsize(sbi, &lmmsize);
2016 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2017 strlen(filename), lmmsize,
2018 LUSTRE_OPC_ANY, NULL);
2019 if (IS_ERR(op_data))
2020 RETURN(PTR_ERR(op_data));
2022 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2023 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2024 ll_finish_md_op_data(op_data);
2026 CDEBUG(D_INFO, "md_getattr_name failed "
2027 "on %s: rc %d\n", filename, rc);
2031 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2032 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2034 lmmsize = body->mbo_eadatasize;
2036 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2038 GOTO(out, rc = -ENODATA);
2041 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2042 LASSERT(lmm != NULL);
2044 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2045 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2046 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
2047 GOTO(out, rc = -EPROTO);
2050 * This is coming from the MDS, so is probably in
2051 * little endian. We convert it to host endian before
2052 * passing it to userspace.
2054 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2057 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2058 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2059 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2060 if (le32_to_cpu(lmm->lmm_pattern) &
2061 LOV_PATTERN_F_RELEASED)
/* if this function is called for a directory, we should
 * avoid swabbing non-existent lsm objects */
2067 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2068 lustre_swab_lov_user_md_v1(
2069 (struct lov_user_md_v1 *)lmm);
2070 if (S_ISREG(body->mbo_mode))
2071 lustre_swab_lov_user_md_objects(
2072 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2074 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2075 lustre_swab_lov_user_md_v3(
2076 (struct lov_user_md_v3 *)lmm);
2077 if (S_ISREG(body->mbo_mode))
2078 lustre_swab_lov_user_md_objects(
2079 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2081 } else if (lmm->lmm_magic ==
2082 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2083 lustre_swab_lov_comp_md_v1(
2084 (struct lov_comp_md_v1 *)lmm);
2090 *lmm_size = lmmsize;
2095 static int ll_lov_setea(struct inode *inode, struct file *file,
2098 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2099 struct lov_user_md *lump;
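/* room for one lov_user_md header plus a single OST object entry */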
2100 int lum_size = sizeof(struct lov_user_md) +
2101 sizeof(struct lov_user_ost_data);
2105 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2108 OBD_ALLOC_LARGE(lump, lum_size);
2112 if (copy_from_user(lump, arg, lum_size))
2113 GOTO(out_lump, rc = -EFAULT);
2115 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2117 cl_lov_delay_create_clear(&file->f_flags);
2120 OBD_FREE_LARGE(lump, lum_size);
2124 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2131 env = cl_env_get(&refcheck);
2133 RETURN(PTR_ERR(env));
2135 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2136 cl_env_put(env, &refcheck);
2140 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2143 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2144 struct lov_user_md *klum;
2146 __u64 flags = FMODE_WRITE;
2149 rc = ll_copy_user_md(lum, &klum);
2154 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2159 rc = put_user(0, &lum->lmm_stripe_count);
2163 rc = ll_layout_refresh(inode, &gen);
2167 rc = ll_file_getstripe(inode, arg, lum_size);
2169 cl_lov_delay_create_clear(&file->f_flags);
2172 OBD_FREE(klum, lum_size);
2177 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2179 struct ll_inode_info *lli = ll_i2info(inode);
2180 struct cl_object *obj = lli->lli_clob;
2181 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2182 struct ll_grouplock grouplock;
2187 CWARN("group id for group lock must not be 0\n");
2191 if (ll_file_nolock(file))
2192 RETURN(-EOPNOTSUPP);
2194 spin_lock(&lli->lli_lock);
2195 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
CWARN("group lock already exists with gid %lu\n",
2197 fd->fd_grouplock.lg_gid);
2198 spin_unlock(&lli->lli_lock);
2201 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2202 spin_unlock(&lli->lli_lock);
 * XXX: the group lock needs to protect all OST objects, while PFL
 * can add new OST objects during the I/O, so we have to instantiate
 * all OST objects before getting the group lock.
 */
2212 struct cl_layout cl = {
2213 .cl_is_composite = false,
2215 struct lu_extent ext = {
2217 .e_end = OBD_OBJECT_EOF,
2220 env = cl_env_get(&refcheck);
2222 RETURN(PTR_ERR(env));
2224 rc = cl_object_layout_get(env, obj, &cl);
2225 if (!rc && cl.cl_is_composite)
2226 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2229 cl_env_put(env, &refcheck);
2234 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2235 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2239 spin_lock(&lli->lli_lock);
2240 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2241 spin_unlock(&lli->lli_lock);
2242 CERROR("another thread just won the race\n");
2243 cl_put_grouplock(&grouplock);
2247 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2248 fd->fd_grouplock = grouplock;
2249 spin_unlock(&lli->lli_lock);
2251 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2255 static int ll_put_grouplock(struct inode *inode, struct file *file,
2258 struct ll_inode_info *lli = ll_i2info(inode);
2259 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2260 struct ll_grouplock grouplock;
2263 spin_lock(&lli->lli_lock);
2264 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2265 spin_unlock(&lli->lli_lock);
2266 CWARN("no group lock held\n");
2270 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2272 if (fd->fd_grouplock.lg_gid != arg) {
2273 CWARN("group lock %lu doesn't match current id %lu\n",
2274 arg, fd->fd_grouplock.lg_gid);
2275 spin_unlock(&lli->lli_lock);
2279 grouplock = fd->fd_grouplock;
2280 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2281 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2282 spin_unlock(&lli->lli_lock);
2284 cl_put_grouplock(&grouplock);
2285 CDEBUG(D_INFO, "group lock %lu released\n", arg);
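/*
 * Usage sketch (an assumption, mirroring the LL_IOC_GROUP_LOCK and
 * LL_IOC_GROUP_UNLOCK cases handled in ll_file_ioctl() below): the
 * group id is passed by value and must match on lock and unlock:
 *
 *	unsigned long gid = 1234;	(example value, must be non-zero)
 *	ioctl(fd, LL_IOC_GROUP_LOCK, gid);
 *	... cooperating group I/O ...
 *	ioctl(fd, LL_IOC_GROUP_UNLOCK, gid);
 */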
2290 * Close inode open handle
2292 * \param dentry [in] dentry which contains the inode
2293 * \param it [in,out] intent which contains open info and result
2296 * \retval <0 failure
2298 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2300 struct inode *inode = dentry->d_inode;
2301 struct obd_client_handle *och;
2307 /* Root? Do nothing. */
2308 if (dentry->d_inode->i_sb->s_root == dentry)
2311 /* No open handle to close? Move away */
2312 if (!it_disposition(it, DISP_OPEN_OPEN))
2315 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2317 OBD_ALLOC(och, sizeof(*och));
2319 GOTO(out, rc = -ENOMEM);
2321 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2323 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2325 /* this one is in place of ll_file_open */
2326 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2327 ptlrpc_req_finished(it->it_request);
2328 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2334 * Get the size of the inode for which the FIEMAP mapping is requested.
2335 * Make the FIEMAP get_info call and return the result.
2336 * \param fiemap kernel buffer to hold extents
2337 * \param num_bytes kernel buffer size
2339 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2345 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2348 /* Check for fiemap flags */
2349 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2350 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2354 /* Check for FIEMAP_FLAG_SYNC */
2355 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2356 rc = filemap_fdatawrite(inode->i_mapping);
2361 env = cl_env_get(&refcheck);
2363 RETURN(PTR_ERR(env));
2365 if (i_size_read(inode) == 0) {
2366 rc = ll_glimpse_size(inode);
2371 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2372 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2373 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2375 /* If the file size is 0, then there are no objects to map */
2376 if (fmkey.lfik_oa.o_size == 0) {
2377 fiemap->fm_mapped_extents = 0;
2381 fmkey.lfik_fiemap = *fiemap;
2383 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2384 &fmkey, fiemap, &num_bytes);
2386 cl_env_put(env, &refcheck);
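/*
 * Userspace sketch (an assumption): this helper backs the standard
 * fiemap ABI from <linux/fiemap.h>; the extent count below is an
 * arbitrary example:
 *
 *	struct fiemap *fm = calloc(1, sizeof(*fm) +
 *				   32 * sizeof(struct fiemap_extent));
 *	fm->fm_start	    = 0;
 *	fm->fm_length	    = FIEMAP_MAX_OFFSET;
 *	fm->fm_extent_count = 32;
 *	ioctl(fd, FS_IOC_FIEMAP, fm);
 */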
2390 int ll_fid2path(struct inode *inode, void __user *arg)
2392 struct obd_export *exp = ll_i2mdexp(inode);
2393 const struct getinfo_fid2path __user *gfin = arg;
2395 struct getinfo_fid2path *gfout;
2401 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2402 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2405 /* Only need to get the buflen */
2406 if (get_user(pathlen, &gfin->gf_pathlen))
2409 if (pathlen > PATH_MAX)
2412 outsize = sizeof(*gfout) + pathlen;
2413 OBD_ALLOC(gfout, outsize);
2417 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2418 GOTO(gf_free, rc = -EFAULT);
2419 /* Append the root FID after gfout to let the MDT know the root FID so
2420 * that it can look up the correct path; this is mainly for filesets.
2421 * Old servers without fileset mount support will ignore this. */
2422 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2424 /* Call mdc_iocontrol */
2425 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2429 if (copy_to_user(arg, gfout, outsize))
2433 OBD_FREE(gfout, outsize);
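/*
 * Caller sketch (an assumption): the ioctl takes a getinfo_fid2path
 * with the FID and buffer length filled in and returns the path in
 * gf_u.gf_path; gf_recno/gf_linkno select among hard links:
 *
 *	struct getinfo_fid2path *gf = malloc(sizeof(*gf) + PATH_MAX);
 *	gf->gf_fid	= fid;		(obtained e.g. via LL_IOC_PATH2FID)
 *	gf->gf_pathlen	= PATH_MAX;
 *	gf->gf_recno	= -1;
 *	gf->gf_linkno	= 0;
 *	ioctl(fd, OBD_IOC_FID2PATH, gf);
 */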
2438 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2440 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2448 ioc->idv_version = 0;
2449 ioc->idv_layout_version = UINT_MAX;
2451 /* If no file object has been initialized, consider its version to be 0. */
2455 env = cl_env_get(&refcheck);
2457 RETURN(PTR_ERR(env));
2459 io = vvp_env_thread_io(env);
2461 io->u.ci_data_version.dv_data_version = 0;
2462 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2463 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2466 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2467 result = cl_io_loop(env, io);
2469 result = io->ci_result;
2471 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2472 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2474 cl_io_fini(env, io);
2476 if (unlikely(io->ci_need_restart))
2479 cl_env_put(env, &refcheck);
2485 * Read the data_version for the inode.
2487 * This value is computed using the stripe object versions on the OSTs.
2488 * The version is computed using server-side locking.
2490 * @param flags whether to sync on the OST side:
2492 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2493 * LL_DV_WR_FLUSH: drop all cached pages, LCK_PW on OSTs
2495 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2497 struct ioc_data_version ioc = { .idv_flags = flags };
2500 rc = ll_ioc_data_version(inode, &ioc);
2502 *data_version = ioc.idv_version;
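/*
 * Userspace sketch (an assumption): the same value is reachable via
 * the LL_IOC_DATA_VERSION ioctl handled in ll_file_ioctl() below:
 *
 *	struct ioc_data_version idv = { .idv_flags = LL_DV_RD_FLUSH };
 *	ioctl(fd, LL_IOC_DATA_VERSION, &idv);
 *	printf("data version %llu\n", (unsigned long long)idv.idv_version);
 */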
2508 * Trigger a HSM release request for the provided inode.
2510 int ll_hsm_release(struct inode *inode)
2513 struct obd_client_handle *och = NULL;
2514 __u64 data_version = 0;
2519 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2520 ll_get_fsname(inode->i_sb, NULL, 0),
2521 PFID(&ll_i2info(inode)->lli_fid));
2523 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2525 GOTO(out, rc = PTR_ERR(och));
2527 /* Grab latest data_version and [am]time values */
2528 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2532 env = cl_env_get(&refcheck);
2534 GOTO(out, rc = PTR_ERR(env));
2536 rc = ll_merge_attr(env, inode);
2537 cl_env_put(env, &refcheck);
2539 /* If an error happened, we have the wrong size for the file.
2545 /* Release the file.
2546 * NB: the lease lock handle is released in mdc_hsm_release_pack() because
2547 * we still need it to pack l_remote_handle for the MDT. */
2548 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2554 if (och != NULL && !IS_ERR(och)) /* close the file */
2555 ll_lease_close(och, inode, NULL);
2560 struct ll_swap_stack {
2563 struct inode *inode1;
2564 struct inode *inode2;
2569 static int ll_swap_layouts(struct file *file1, struct file *file2,
2570 struct lustre_swap_layouts *lsl)
2572 struct mdc_swap_layouts msl;
2573 struct md_op_data *op_data;
2576 struct ll_swap_stack *llss = NULL;
2579 OBD_ALLOC_PTR(llss);
2583 llss->inode1 = file_inode(file1);
2584 llss->inode2 = file_inode(file2);
2586 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2590 /* we use two bools because they are easier to swap than two bits */
2591 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2592 llss->check_dv1 = true;
2594 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2595 llss->check_dv2 = true;
2597 /* we cannot use lsl->sl_dvX directly because we may swap them */
2598 llss->dv1 = lsl->sl_dv1;
2599 llss->dv2 = lsl->sl_dv2;
2601 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2602 if (rc == 0) /* same file, done! */
2605 if (rc < 0) { /* sequentialize it */
2606 swap(llss->inode1, llss->inode2);
2608 swap(llss->dv1, llss->dv2);
2609 swap(llss->check_dv1, llss->check_dv2);
2613 if (gid != 0) { /* application asks to flush dirty cache */
2614 rc = ll_get_grouplock(llss->inode1, file1, gid);
2618 rc = ll_get_grouplock(llss->inode2, file2, gid);
2620 ll_put_grouplock(llss->inode1, file1, gid);
2625 /* final check: before swapping the layouts, verify that the
2626 * data version has not changed (if requested) */
2627 if (llss->check_dv1) {
2628 rc = ll_data_version(llss->inode1, &dv, 0);
2631 if (dv != llss->dv1)
2632 GOTO(putgl, rc = -EAGAIN);
2635 if (llss->check_dv2) {
2636 rc = ll_data_version(llss->inode2, &dv, 0);
2639 if (dv != llss->dv2)
2640 GOTO(putgl, rc = -EAGAIN);
2643 /* struct md_op_data is used to send the swap args to the MDT;
2644 * only the flags are missing, so we pass struct mdc_swap_layouts
2645 * through md_op_data->op_data */
2646 /* flags from user space have to be converted before being sent to the
2647 * server; no flags are sent today, they are only used on the client */
2650 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2651 0, LUSTRE_OPC_ANY, &msl);
2652 if (IS_ERR(op_data))
2653 GOTO(free, rc = PTR_ERR(op_data));
2655 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2656 sizeof(*op_data), op_data, NULL);
2657 ll_finish_md_op_data(op_data);
2664 ll_put_grouplock(llss->inode2, file2, gid);
2665 ll_put_grouplock(llss->inode1, file1, gid);
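/*
 * Caller sketch (an assumption, mirroring the checks above): two open
 * files swap layouts atomically, optionally guarded by a group lock
 * and a data-version check; the gid is an example value:
 *
 *	struct lustre_swap_layouts lsl = {
 *		.sl_fd	  = fd2,
 *		.sl_gid	  = 1234,
 *		.sl_flags = SWAP_LAYOUTS_CHECK_DV1,
 *		.sl_dv1	  = dv1,	(from LL_IOC_DATA_VERSION)
 *	};
 *	ioctl(fd1, LL_IOC_LOV_SWAP_LAYOUTS, &lsl);
 */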
2675 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2677 struct md_op_data *op_data;
2681 /* Detect out-of-range masks */
2682 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2685 /* Non-root users are forbidden from setting or clearing flags that
2686 * are NOT defined in HSM_USER_MASK. */
2687 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2688 !cfs_capable(CFS_CAP_SYS_ADMIN))
2691 /* Detect out-of-range archive id */
2692 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2693 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2696 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2697 LUSTRE_OPC_ANY, hss);
2698 if (IS_ERR(op_data))
2699 RETURN(PTR_ERR(op_data));
2701 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2702 sizeof(*op_data), op_data, NULL);
2704 ll_finish_md_op_data(op_data);
2709 static int ll_hsm_import(struct inode *inode, struct file *file,
2710 struct hsm_user_import *hui)
2712 struct hsm_state_set *hss = NULL;
2713 struct iattr *attr = NULL;
2717 if (!S_ISREG(inode->i_mode))
2723 GOTO(out, rc = -ENOMEM);
2725 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2726 hss->hss_archive_id = hui->hui_archive_id;
2727 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2728 rc = ll_hsm_state_set(inode, hss);
2732 OBD_ALLOC_PTR(attr);
2734 GOTO(out, rc = -ENOMEM);
2736 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2737 attr->ia_mode |= S_IFREG;
2738 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2739 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2740 attr->ia_size = hui->hui_size;
2741 attr->ia_mtime.tv_sec = hui->hui_mtime;
2742 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2743 attr->ia_atime.tv_sec = hui->hui_atime;
2744 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2746 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2747 ATTR_UID | ATTR_GID |
2748 ATTR_MTIME | ATTR_MTIME_SET |
2749 ATTR_ATIME | ATTR_ATIME_SET;
2753 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2757 inode_unlock(inode);
2769 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2771 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2772 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2775 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2777 struct inode *inode = file_inode(file);
2779 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2780 ATTR_MTIME | ATTR_MTIME_SET |
2783 .tv_sec = lfu->lfu_atime_sec,
2784 .tv_nsec = lfu->lfu_atime_nsec,
2787 .tv_sec = lfu->lfu_mtime_sec,
2788 .tv_nsec = lfu->lfu_mtime_nsec,
2791 .tv_sec = lfu->lfu_ctime_sec,
2792 .tv_nsec = lfu->lfu_ctime_nsec,
2798 if (!capable(CAP_SYS_ADMIN))
2801 if (!S_ISREG(inode->i_mode))
2805 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2807 inode_unlock(inode);
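/*
 * Userspace sketch (an assumption): unlike utimensat(2), this ioctl
 * also sets ctime, which is why CAP_SYS_ADMIN is required above:
 *
 *	struct ll_futimes_3 lfu = {
 *		.lfu_atime_sec = t, .lfu_atime_nsec = 0,
 *		.lfu_mtime_sec = t, .lfu_mtime_nsec = 0,
 *		.lfu_ctime_sec = t, .lfu_ctime_nsec = 0,
 *	};
 *	ioctl(fd, LL_IOC_FUTIMES_3, &lfu);
 */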
2812 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2815 case MODE_READ_USER:
2817 case MODE_WRITE_USER:
2824 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2826 /* Used to allow the upper layers of the client to request an LDLM lock
2827 * without doing an actual read or write.
2829 * Used for ladvise lockahead to manually request specific locks.
2831 * \param[in] file file this ladvise lock request is on
2832 * \param[in] ladvise ladvise struct describing this lock request
2834 * \retval 0 success, no detailed result available (sync requests
2835 * and requests sent to the server [not handled locally]
2836 * cannot return detailed results)
2837 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2838 * see definitions for details.
2839 * \retval negative negative errno on error
2841 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2843 struct lu_env *env = NULL;
2844 struct cl_io *io = NULL;
2845 struct cl_lock *lock = NULL;
2846 struct cl_lock_descr *descr = NULL;
2847 struct dentry *dentry = file->f_path.dentry;
2848 struct inode *inode = dentry->d_inode;
2849 enum cl_lock_mode cl_mode;
2850 off_t start = ladvise->lla_start;
2851 off_t end = ladvise->lla_end;
2857 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2858 "start=%llu, end=%llu\n", dentry->d_name.len,
2859 dentry->d_name.name, dentry->d_inode,
2860 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2863 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2865 GOTO(out, result = cl_mode);
2867 /* Get IO environment */
2868 result = cl_io_get(inode, &env, &io, &refcheck);
2872 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2875 * nothing to do for this io. This currently happens when
2876 * stripe sub-objects are not yet created.
2878 result = io->ci_result;
2879 } else if (result == 0) {
2880 lock = vvp_env_lock(env);
2881 descr = &lock->cll_descr;
2883 descr->cld_obj = io->ci_obj;
2884 /* Convert byte offsets to pages */
2885 descr->cld_start = cl_index(io->ci_obj, start);
2886 descr->cld_end = cl_index(io->ci_obj, end);
2887 descr->cld_mode = cl_mode;
2888 /* CEF_MUST is used because we do not want to convert a
2889 * lockahead request to a lockless lock */
2890 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2893 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2894 descr->cld_enq_flags |= CEF_SPECULATIVE;
2896 result = cl_lock_request(env, io, lock);
2898 /* On success, we need to release the lock */
2900 cl_lock_release(env, lock);
2902 cl_io_fini(env, io);
2903 cl_env_put(env, &refcheck);
2905 /* -ECANCELED indicates a matching lock with a different extent
2906 * was already present, and -EEXIST indicates a matching lock
2907 * on exactly the same extent was already present.
2908 * We convert them to positive values for userspace to make
2909 * recognizing true errors easier.
2910 * Note we can only return these detailed results on async requests,
2911 * as sync requests look the same as i/o requests for locking. */
2912 if (result == -ECANCELED)
2913 result = LLA_RESULT_DIFFERENT;
2914 else if (result == -EEXIST)
2915 result = LLA_RESULT_SAME;
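/*
 * Interpretation sketch (an assumption): an async lockahead caller
 * checks the per-advice result written back by ll_file_ioctl():
 *
 *	LLA_RESULT_SAME	     - a lock on exactly this extent already existed
 *	LLA_RESULT_DIFFERENT - a matching lock with a different extent existed
 *	< 0		     - a true error
 */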
2920 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2922 static int ll_ladvise_sanity(struct inode *inode,
2923 struct llapi_lu_ladvise *ladvise)
2925 enum lu_ladvise_type advice = ladvise->lla_advice;
2926 /* Note that the per-advice flags field is 32 bits wide, so per-advice
2927 * flags must be in the first 32 bits of enum ladvise_flags */
2928 __u32 flags = ladvise->lla_peradvice_flags;
2929 /* 3 lines at 80 characters per line, should be plenty */
2932 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2934 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2935 "last supported advice is %s (value '%d'): rc = %d\n",
2936 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2937 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2941 /* Per-advice checks */
2943 case LU_LADVISE_LOCKNOEXPAND:
2944 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2946 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2948 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2949 ladvise_names[advice], rc);
2953 case LU_LADVISE_LOCKAHEAD:
2954 /* Currently only READ and WRITE modes can be requested */
2955 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2956 ladvise->lla_lockahead_mode == 0) {
2958 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2960 ll_get_fsname(inode->i_sb, NULL, 0),
2961 ladvise->lla_lockahead_mode,
2962 ladvise_names[advice], rc);
2965 case LU_LADVISE_WILLREAD:
2966 case LU_LADVISE_DONTNEED:
2968 /* Note the fall-through above - these checks apply to all advice
2969 * types except LOCKNOEXPAND */
2970 if (flags & ~LF_DEFAULT_MASK) {
2972 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2974 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2975 ladvise_names[advice], rc);
2978 if (ladvise->lla_start >= ladvise->lla_end) {
2980 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2981 "for %s: rc = %d\n",
2982 ll_get_fsname(inode->i_sb, NULL, 0),
2983 ladvise->lla_start, ladvise->lla_end,
2984 ladvise_names[advice], rc);
2996 * Give file access advice
2998 * The ladvise interface is similar to the Linux fadvise() system call,
2999 * except it forwards the advice directly from the Lustre client to the
3000 * server. The server-side code will apply appropriate read-ahead and
3001 * caching techniques for the corresponding files.
3003 * A typical workload for ladvise is, e.g., a bunch of different clients
3004 * doing small random reads of a file, so prefetching pages into OSS cache
3005 * with big linear reads before the random IO is a net benefit. Fetching
3006 * all that data into each client cache with fadvise() may not be, due to
3007 * much more data being sent to the client.
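 *
 * A minimal caller sketch (an assumption; the advice count and byte
 * range are examples only):
 *
 *	struct llapi_ladvise_hdr *hdr;
 *	hdr = calloc(1, offsetof(typeof(*hdr), lah_advise[1]));
 *	hdr->lah_magic = LADVISE_MAGIC;
 *	hdr->lah_count = 1;
 *	hdr->lah_advise[0].lla_advice = LU_LADVISE_WILLREAD;
 *	hdr->lah_advise[0].lla_start  = 0;
 *	hdr->lah_advise[0].lla_end    = 1 << 20;
 *	ioctl(fd, LL_IOC_LADVISE, hdr);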
3009 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3010 struct llapi_lu_ladvise *ladvise)
3014 struct cl_ladvise_io *lio;
3019 env = cl_env_get(&refcheck);
3021 RETURN(PTR_ERR(env));
3023 io = vvp_env_thread_io(env);
3024 io->ci_obj = ll_i2info(inode)->lli_clob;
3026 /* initialize parameters for ladvise */
3027 lio = &io->u.ci_ladvise;
3028 lio->li_start = ladvise->lla_start;
3029 lio->li_end = ladvise->lla_end;
3030 lio->li_fid = ll_inode2fid(inode);
3031 lio->li_advice = ladvise->lla_advice;
3032 lio->li_flags = flags;
3034 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3035 rc = cl_io_loop(env, io);
3039 cl_io_fini(env, io);
3040 cl_env_put(env, &refcheck);
3044 static int ll_lock_noexpand(struct file *file, int flags)
3046 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3048 fd->ll_lock_no_expand = !(flags & LF_UNSET);
3053 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3056 struct fsxattr fsxattr;
3058 if (copy_from_user(&fsxattr,
3059 (const struct fsxattr __user *)arg,
3063 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3064 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3065 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3066 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3067 if (copy_to_user((struct fsxattr __user *)arg,
3068 &fsxattr, sizeof(fsxattr)))
3074 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3078 struct md_op_data *op_data;
3079 struct ptlrpc_request *req = NULL;
3081 struct fsxattr fsxattr;
3082 struct cl_object *obj;
3086 /* only root can change the project ID */
3087 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
3090 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3091 LUSTRE_OPC_ANY, NULL);
3092 if (IS_ERR(op_data))
3093 RETURN(PTR_ERR(op_data));
3095 if (copy_from_user(&fsxattr,
3096 (const struct fsxattr __user *)arg,
3098 GOTO(out_fsxattr, rc = -EFAULT);
3100 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3101 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3102 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3103 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3104 op_data->op_projid = fsxattr.fsx_projid;
3105 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3106 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3108 ptlrpc_req_finished(req);
3110 GOTO(out_fsxattr, rc);
3111 ll_update_inode_flags(inode, op_data->op_attr_flags);
3112 obj = ll_i2info(inode)->lli_clob;
3114 GOTO(out_fsxattr, rc);
3116 OBD_ALLOC_PTR(attr);
3118 GOTO(out_fsxattr, rc = -ENOMEM);
3120 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3121 fsxattr.fsx_xflags);
3124 ll_finish_md_op_data(op_data);
3128 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3131 struct inode *inode = file_inode(file);
3132 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3133 struct ll_inode_info *lli = ll_i2info(inode);
3134 struct obd_client_handle *och = NULL;
3135 struct split_param sp;
3138 enum mds_op_bias bias = 0;
3139 struct file *layout_file = NULL;
3141 size_t data_size = 0;
3145 mutex_lock(&lli->lli_och_mutex);
3146 if (fd->fd_lease_och != NULL) {
3147 och = fd->fd_lease_och;
3148 fd->fd_lease_och = NULL;
3150 mutex_unlock(&lli->lli_och_mutex);
3153 GOTO(out, rc = -ENOLCK);
3155 fmode = och->och_flags;
3157 switch (ioc->lil_flags) {
3158 case LL_LEASE_RESYNC_DONE:
3159 if (ioc->lil_count > IOC_IDS_MAX)
3160 GOTO(out, rc = -EINVAL);
3162 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3163 OBD_ALLOC(data, data_size);
3165 GOTO(out, rc = -ENOMEM);
3167 if (copy_from_user(data, (void __user *)arg, data_size))
3168 GOTO(out, rc = -EFAULT);
3170 bias = MDS_CLOSE_RESYNC_DONE;
3172 case LL_LEASE_LAYOUT_MERGE: {
3175 if (ioc->lil_count != 1)
3176 GOTO(out, rc = -EINVAL);
3178 arg += sizeof(*ioc);
3179 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3180 GOTO(out, rc = -EFAULT);
3182 layout_file = fget(fd);
3184 GOTO(out, rc = -EBADF);
3186 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3187 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3188 GOTO(out, rc = -EPERM);
3190 data = file_inode(layout_file);
3191 bias = MDS_CLOSE_LAYOUT_MERGE;
3194 case LL_LEASE_LAYOUT_SPLIT: {
3198 if (ioc->lil_count != 2)
3199 GOTO(out, rc = -EINVAL);
3201 arg += sizeof(*ioc);
3202 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3203 GOTO(out, rc = -EFAULT);
3205 arg += sizeof(__u32);
3206 if (copy_from_user(&mirror_id, (void __user *)arg,
3208 GOTO(out, rc = -EFAULT);
3210 layout_file = fget(fdv);
3212 GOTO(out, rc = -EBADF);
3214 sp.sp_inode = file_inode(layout_file);
3215 sp.sp_mirror_id = (__u16)mirror_id;
3217 bias = MDS_CLOSE_LAYOUT_SPLIT;
3221 /* without close intent */
3225 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3229 rc = ll_lease_och_release(inode, file);
3238 switch (ioc->lil_flags) {
3239 case LL_LEASE_RESYNC_DONE:
3241 OBD_FREE(data, data_size);
3243 case LL_LEASE_LAYOUT_MERGE:
3244 case LL_LEASE_LAYOUT_SPLIT:
3251 rc = ll_lease_type_from_fmode(fmode);
3255 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3258 struct inode *inode = file_inode(file);
3259 struct ll_inode_info *lli = ll_i2info(inode);
3260 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3261 struct obd_client_handle *och = NULL;
3262 __u64 open_flags = 0;
3268 switch (ioc->lil_mode) {
3269 case LL_LEASE_WRLCK:
3270 if (!(file->f_mode & FMODE_WRITE))
3272 fmode = FMODE_WRITE;
3274 case LL_LEASE_RDLCK:
3275 if (!(file->f_mode & FMODE_READ))
3279 case LL_LEASE_UNLCK:
3280 RETURN(ll_file_unlock_lease(file, ioc, arg));
3285 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3287 /* apply for lease */
3288 if (ioc->lil_flags & LL_LEASE_RESYNC)
3289 open_flags = MDS_OPEN_RESYNC;
3290 och = ll_lease_open(inode, file, fmode, open_flags);
3292 RETURN(PTR_ERR(och));
3294 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3295 rc = ll_lease_file_resync(och, inode);
3297 ll_lease_close(och, inode, NULL);
3300 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3302 ll_lease_close(och, inode, NULL);
3308 mutex_lock(&lli->lli_och_mutex);
3309 if (fd->fd_lease_och == NULL) {
3310 fd->fd_lease_och = och;
3313 mutex_unlock(&lli->lli_och_mutex);
3315 /* currently impossible since only exclusive leases are supported */
3316 ll_lease_close(och, inode, &lease_broken);
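/*
 * Caller sketch (an assumption): a lease is requested and later
 * released through the same ioctl with LL_LEASE_UNLCK:
 *
 *	struct ll_ioc_lease ioc = { .lil_mode = LL_LEASE_WRLCK };
 *	ioctl(fd, LL_IOC_SET_LEASE, &ioc);
 *	...
 *	ioc.lil_mode = LL_LEASE_UNLCK;
 *	ioctl(fd, LL_IOC_SET_LEASE, &ioc);
 */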
3323 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3325 struct inode *inode = file_inode(file);
3326 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3330 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3331 PFID(ll_inode2fid(inode)), inode, cmd);
3332 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3334 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3335 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3339 case LL_IOC_GETFLAGS:
3340 /* Get the current value of the file flags */
3341 return put_user(fd->fd_flags, (int __user *)arg);
3342 case LL_IOC_SETFLAGS:
3343 case LL_IOC_CLRFLAGS:
3344 /* Set or clear specific file flags */
3345 /* XXX This probably needs checks to ensure the flags are
3346 * not abused, and to handle any flag side effects.
3348 if (get_user(flags, (int __user *) arg))
3351 if (cmd == LL_IOC_SETFLAGS) {
3352 if ((flags & LL_FILE_IGNORE_LOCK) &&
3353 !(file->f_flags & O_DIRECT)) {
3354 CERROR("%s: unable to disable locking on "
3355 "non-O_DIRECT file\n", current->comm);
3359 fd->fd_flags |= flags;
3361 fd->fd_flags &= ~flags;
3364 case LL_IOC_LOV_SETSTRIPE:
3365 case LL_IOC_LOV_SETSTRIPE_NEW:
3366 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3367 case LL_IOC_LOV_SETEA:
3368 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3369 case LL_IOC_LOV_SWAP_LAYOUTS: {
3371 struct lustre_swap_layouts lsl;
3373 if (copy_from_user(&lsl, (char __user *)arg,
3374 sizeof(struct lustre_swap_layouts)))
3377 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3380 file2 = fget(lsl.sl_fd);
3384 /* O_WRONLY or O_RDWR */
3385 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3386 GOTO(out, rc = -EPERM);
3388 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3389 struct inode *inode2;
3390 struct ll_inode_info *lli;
3391 struct obd_client_handle *och = NULL;
3393 lli = ll_i2info(inode);
3394 mutex_lock(&lli->lli_och_mutex);
3395 if (fd->fd_lease_och != NULL) {
3396 och = fd->fd_lease_och;
3397 fd->fd_lease_och = NULL;
3399 mutex_unlock(&lli->lli_och_mutex);
3401 GOTO(out, rc = -ENOLCK);
3402 inode2 = file_inode(file2);
3403 rc = ll_swap_layouts_close(och, inode, inode2);
3405 rc = ll_swap_layouts(file, file2, &lsl);
3411 case LL_IOC_LOV_GETSTRIPE:
3412 case LL_IOC_LOV_GETSTRIPE_NEW:
3413 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3414 case FS_IOC_GETFLAGS:
3415 case FS_IOC_SETFLAGS:
3416 RETURN(ll_iocontrol(inode, file, cmd, arg));
3417 case FSFILT_IOC_GETVERSION:
3418 case FS_IOC_GETVERSION:
3419 RETURN(put_user(inode->i_generation, (int __user *)arg));
3420 /* We need to special case any other ioctls we want to handle,
3421 * to send them to the MDS/OST as appropriate and to properly
3422 * network encode the arg field. */
3423 case FS_IOC_SETVERSION:
3426 case LL_IOC_GROUP_LOCK:
3427 RETURN(ll_get_grouplock(inode, file, arg));
3428 case LL_IOC_GROUP_UNLOCK:
3429 RETURN(ll_put_grouplock(inode, file, arg));
3430 case IOC_OBD_STATFS:
3431 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3433 case LL_IOC_FLUSHCTX:
3434 RETURN(ll_flush_ctx(inode));
3435 case LL_IOC_PATH2FID: {
3436 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3437 sizeof(struct lu_fid)))
3442 case LL_IOC_GETPARENT:
3443 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3445 case OBD_IOC_FID2PATH:
3446 RETURN(ll_fid2path(inode, (void __user *)arg));
3447 case LL_IOC_DATA_VERSION: {
3448 struct ioc_data_version idv;
3451 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3454 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3455 rc = ll_ioc_data_version(inode, &idv);
3458 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3464 case LL_IOC_GET_MDTIDX: {
3467 mdtidx = ll_get_mdt_idx(inode);
3471 if (put_user((int)mdtidx, (int __user *)arg))
3476 case OBD_IOC_GETDTNAME:
3477 case OBD_IOC_GETMDNAME:
3478 RETURN(ll_get_obd_name(inode, cmd, arg));
3479 case LL_IOC_HSM_STATE_GET: {
3480 struct md_op_data *op_data;
3481 struct hsm_user_state *hus;
3488 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3489 LUSTRE_OPC_ANY, hus);
3490 if (IS_ERR(op_data)) {
3492 RETURN(PTR_ERR(op_data));
3495 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3498 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3501 ll_finish_md_op_data(op_data);
3505 case LL_IOC_HSM_STATE_SET: {
3506 struct hsm_state_set *hss;
3513 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3518 rc = ll_hsm_state_set(inode, hss);
3523 case LL_IOC_HSM_ACTION: {
3524 struct md_op_data *op_data;
3525 struct hsm_current_action *hca;
3532 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3533 LUSTRE_OPC_ANY, hca);
3534 if (IS_ERR(op_data)) {
3536 RETURN(PTR_ERR(op_data));
3539 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3542 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3545 ll_finish_md_op_data(op_data);
3549 case LL_IOC_SET_LEASE_OLD: {
3550 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3552 RETURN(ll_file_set_lease(file, &ioc, 0));
3554 case LL_IOC_SET_LEASE: {
3555 struct ll_ioc_lease ioc;
3557 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3560 RETURN(ll_file_set_lease(file, &ioc, arg));
3562 case LL_IOC_GET_LEASE: {
3563 struct ll_inode_info *lli = ll_i2info(inode);
3564 struct ldlm_lock *lock = NULL;
3567 mutex_lock(&lli->lli_och_mutex);
3568 if (fd->fd_lease_och != NULL) {
3569 struct obd_client_handle *och = fd->fd_lease_och;
3571 lock = ldlm_handle2lock(&och->och_lease_handle);
3573 lock_res_and_lock(lock);
3574 if (!ldlm_is_cancel(lock))
3575 fmode = och->och_flags;
3577 unlock_res_and_lock(lock);
3578 LDLM_LOCK_PUT(lock);
3581 mutex_unlock(&lli->lli_och_mutex);
3583 RETURN(ll_lease_type_from_fmode(fmode));
3585 case LL_IOC_HSM_IMPORT: {
3586 struct hsm_user_import *hui;
3592 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3597 rc = ll_hsm_import(inode, file, hui);
3602 case LL_IOC_FUTIMES_3: {
3603 struct ll_futimes_3 lfu;
3605 if (copy_from_user(&lfu,
3606 (const struct ll_futimes_3 __user *)arg,
3610 RETURN(ll_file_futimes_3(file, &lfu));
3612 case LL_IOC_LADVISE: {
3613 struct llapi_ladvise_hdr *k_ladvise_hdr;
3614 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3617 int alloc_size = sizeof(*k_ladvise_hdr);
3620 u_ladvise_hdr = (void __user *)arg;
3621 OBD_ALLOC_PTR(k_ladvise_hdr);
3622 if (k_ladvise_hdr == NULL)
3625 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3626 GOTO(out_ladvise, rc = -EFAULT);
3628 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3629 k_ladvise_hdr->lah_count < 1)
3630 GOTO(out_ladvise, rc = -EINVAL);
3632 num_advise = k_ladvise_hdr->lah_count;
3633 if (num_advise >= LAH_COUNT_MAX)
3634 GOTO(out_ladvise, rc = -EFBIG);
3636 OBD_FREE_PTR(k_ladvise_hdr);
3637 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3638 lah_advise[num_advise]);
3639 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3640 if (k_ladvise_hdr == NULL)
3644 * TODO: submit multiple advice entries to one server in a single RPC
3646 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3647 GOTO(out_ladvise, rc = -EFAULT);
3649 for (i = 0; i < num_advise; i++) {
3650 struct llapi_lu_ladvise *k_ladvise =
3651 &k_ladvise_hdr->lah_advise[i];
3652 struct llapi_lu_ladvise __user *u_ladvise =
3653 &u_ladvise_hdr->lah_advise[i];
3655 rc = ll_ladvise_sanity(inode, k_ladvise);
3657 GOTO(out_ladvise, rc);
3659 switch (k_ladvise->lla_advice) {
3660 case LU_LADVISE_LOCKNOEXPAND:
3661 rc = ll_lock_noexpand(file,
3662 k_ladvise->lla_peradvice_flags);
3663 GOTO(out_ladvise, rc);
3664 case LU_LADVISE_LOCKAHEAD:
3666 rc = ll_file_lock_ahead(file, k_ladvise);
3669 GOTO(out_ladvise, rc);
3672 &u_ladvise->lla_lockahead_result))
3673 GOTO(out_ladvise, rc = -EFAULT);
3676 rc = ll_ladvise(inode, file,
3677 k_ladvise_hdr->lah_flags,
3680 GOTO(out_ladvise, rc);
3687 OBD_FREE(k_ladvise_hdr, alloc_size);
3690 case LL_IOC_FLR_SET_MIRROR: {
3691 /* mirror I/O must be direct to avoid polluting page cache
3693 if (!(file->f_flags & O_DIRECT))
3696 fd->fd_designated_mirror = (__u32)arg;
3699 case LL_IOC_FSGETXATTR:
3700 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3701 case LL_IOC_FSSETXATTR:
3702 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3704 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3706 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3707 (void __user *)arg));
3711 #ifndef HAVE_FILE_LLSEEK_SIZE
3712 static inline loff_t
3713 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3715 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3717 if (offset > maxsize)
3720 if (offset != file->f_pos) {
3721 file->f_pos = offset;
3722 file->f_version = 0;
3728 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3729 loff_t maxsize, loff_t eof)
3731 struct inode *inode = file_inode(file);
3739 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3740 * position-querying operation. Avoid rewriting the "same"
3741 * f_pos value back to the file because a concurrent read(),
3742 * write() or lseek() might have altered it
3747 * f_lock protects against read/modify/write race with other
3748 * SEEK_CURs. Note that parallel writes and reads behave
3752 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3753 inode_unlock(inode);
3757 * In the generic case the entire file is data, so as long as
3758 * offset isn't at the end of the file then the offset is data.
3765 * There is a virtual hole at the end of the file, so as long as
3766 * offset isn't i_size or larger, return i_size.
3774 return llseek_execute(file, offset, maxsize);
3778 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3780 struct inode *inode = file_inode(file);
3781 loff_t retval, eof = 0;
3784 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3785 (origin == SEEK_CUR) ? file->f_pos : 0);
3786 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3787 PFID(ll_inode2fid(inode)), inode, retval, retval,
3789 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3791 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3792 retval = ll_glimpse_size(inode);
3795 eof = i_size_read(inode);
3798 retval = ll_generic_file_llseek_size(file, offset, origin,
3799 ll_file_maxbytes(inode), eof);
3803 static int ll_flush(struct file *file, fl_owner_t id)
3805 struct inode *inode = file_inode(file);
3806 struct ll_inode_info *lli = ll_i2info(inode);
3807 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3810 LASSERT(!S_ISDIR(inode->i_mode));
3812 /* catch async errors that were recorded back when async writeback
3813 * failed for pages in this mapping. */
3814 rc = lli->lli_async_rc;
3815 lli->lli_async_rc = 0;
3816 if (lli->lli_clob != NULL) {
3817 err = lov_read_and_clear_async_rc(lli->lli_clob);
3822 /* The application has already been told about the write failure.
3823 * Do not report it again. */
3824 if (fd->fd_write_failed)
3826 return rc ? -EIO : 0;
3830 * Called to make sure a portion of the file has been written out.
3831 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OST.
3833 * Return how many pages have been written.
3835 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3836 enum cl_fsync_mode mode, int ignore_layout)
3840 struct cl_fsync_io *fio;
3845 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3846 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3849 env = cl_env_get(&refcheck);
3851 RETURN(PTR_ERR(env));
3853 io = vvp_env_thread_io(env);
3854 io->ci_obj = ll_i2info(inode)->lli_clob;
3855 io->ci_ignore_layout = ignore_layout;
3857 /* initialize parameters for sync */
3858 fio = &io->u.ci_fsync;
3859 fio->fi_start = start;
3861 fio->fi_fid = ll_inode2fid(inode);
3862 fio->fi_mode = mode;
3863 fio->fi_nr_written = 0;
3865 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3866 result = cl_io_loop(env, io);
3868 result = io->ci_result;
3870 result = fio->fi_nr_written;
3871 cl_io_fini(env, io);
3872 cl_env_put(env, &refcheck);
3878 * When dentry is provided (the 'else' case), file_dentry() may be
3879 * null and dentry must be used directly rather than pulled from
3880 * file_dentry() as is done otherwise.
3883 #ifdef HAVE_FILE_FSYNC_4ARGS
3884 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3886 struct dentry *dentry = file_dentry(file);
3888 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3889 int ll_fsync(struct file *file, int datasync)
3891 struct dentry *dentry = file_dentry(file);
3893 loff_t end = LLONG_MAX;
3895 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3898 loff_t end = LLONG_MAX;
3900 struct inode *inode = dentry->d_inode;
3901 struct ll_inode_info *lli = ll_i2info(inode);
3902 struct ptlrpc_request *req;
3906 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3907 PFID(ll_inode2fid(inode)), inode);
3908 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3910 #ifdef HAVE_FILE_FSYNC_4ARGS
3911 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3912 lock_inode = !lli->lli_inode_locked;
3916 /* fsync's caller has already called _fdata{sync,write}, we want
3917 * that IO to finish before calling the osc and mdc sync methods */
3918 rc = filemap_fdatawait(inode->i_mapping);
3921 /* catch async errors that were recorded back when async writeback
3922 * failed for pages in this mapping. */
3923 if (!S_ISDIR(inode->i_mode)) {
3924 err = lli->lli_async_rc;
3925 lli->lli_async_rc = 0;
3928 if (lli->lli_clob != NULL) {
3929 err = lov_read_and_clear_async_rc(lli->lli_clob);
3935 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3939 ptlrpc_req_finished(req);
3941 if (S_ISREG(inode->i_mode)) {
3942 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3944 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3945 if (rc == 0 && err < 0)
3948 fd->fd_write_failed = true;
3950 fd->fd_write_failed = false;
3953 #ifdef HAVE_FILE_FSYNC_4ARGS
3955 inode_unlock(inode);
3961 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3963 struct inode *inode = file_inode(file);
3964 struct ll_sb_info *sbi = ll_i2sbi(inode);
3965 struct ldlm_enqueue_info einfo = {
3966 .ei_type = LDLM_FLOCK,
3967 .ei_cb_cp = ldlm_flock_completion_ast,
3968 .ei_cbdata = file_lock,
3970 struct md_op_data *op_data;
3971 struct lustre_handle lockh = { 0 };
3972 union ldlm_policy_data flock = { { 0 } };
3973 int fl_type = file_lock->fl_type;
3979 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3980 PFID(ll_inode2fid(inode)), file_lock);
3982 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3984 if (file_lock->fl_flags & FL_FLOCK) {
3985 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3986 /* flocks are whole-file locks */
3987 flock.l_flock.end = OFFSET_MAX;
3988 /* For flocks, the owner is determined by the local file descriptor */
3989 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3990 } else if (file_lock->fl_flags & FL_POSIX) {
3991 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3992 flock.l_flock.start = file_lock->fl_start;
3993 flock.l_flock.end = file_lock->fl_end;
3997 flock.l_flock.pid = file_lock->fl_pid;
3999 /* Somewhat ugly workaround for svc lockd.
4000 * lockd installs a custom fl_lmops->lm_compare_owner that checks
4001 * that the fl_owner is the same (which it always is on the local node,
4002 * presumably between lockd processes) and then compares the pid.
4003 * As such we assign the pid to the owner field to make it all work;
4004 * a conflict with normal locks is unlikely since the pid space and the
4005 * pointer space for current->files do not intersect */
4006 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4007 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4011 einfo.ei_mode = LCK_PR;
4014 /* An unlock request may or may not have any relation to
4015 * existing locks so we may not be able to pass a lock handle
4016 * via a normal ldlm_lock_cancel() request. The request may even
4017 * unlock a byte range in the middle of an existing lock. In
4018 * order to process an unlock request we need all of the same
4019 * information that is given with a normal read or write record
4020 * lock request. To avoid creating another ldlm unlock (cancel)
4021 * message we'll treat a LCK_NL flock request as an unlock. */
4022 einfo.ei_mode = LCK_NL;
4025 einfo.ei_mode = LCK_PW;
4028 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4043 flags = LDLM_FL_BLOCK_NOWAIT;
4049 flags = LDLM_FL_TEST_LOCK;
4052 CERROR("unknown fcntl lock command: %d\n", cmd);
4056 /* Save the old mode so that if the mode in the lock changes we
4057 * can decrement the appropriate reader or writer refcount. */
4058 file_lock->fl_type = einfo.ei_mode;
4060 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4061 LUSTRE_OPC_ANY, NULL);
4062 if (IS_ERR(op_data))
4063 RETURN(PTR_ERR(op_data));
4065 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4066 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4067 flock.l_flock.pid, flags, einfo.ei_mode,
4068 flock.l_flock.start, flock.l_flock.end);
4070 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4073 /* Restore the file lock type if not TEST lock. */
4074 if (!(flags & LDLM_FL_TEST_LOCK))
4075 file_lock->fl_type = fl_type;
4077 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4078 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4079 !(flags & LDLM_FL_TEST_LOCK))
4080 rc2 = locks_lock_file_wait(file, file_lock);
4082 if ((file_lock->fl_flags & FL_FLOCK) &&
4083 (rc == 0 || file_lock->fl_type == F_UNLCK))
4084 rc2 = flock_lock_file_wait(file, file_lock);
4085 if ((file_lock->fl_flags & FL_POSIX) &&
4086 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4087 !(flags & LDLM_FL_TEST_LOCK))
4088 rc2 = posix_lock_file_wait(file, file_lock);
4089 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4091 if (rc2 && file_lock->fl_type != F_UNLCK) {
4092 einfo.ei_mode = LCK_NL;
4093 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4098 ll_finish_md_op_data(op_data);
4103 int ll_get_fid_by_name(struct inode *parent, const char *name,
4104 int namelen, struct lu_fid *fid,
4105 struct inode **inode)
4107 struct md_op_data *op_data = NULL;
4108 struct mdt_body *body;
4109 struct ptlrpc_request *req;
4113 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4114 LUSTRE_OPC_ANY, NULL);
4115 if (IS_ERR(op_data))
4116 RETURN(PTR_ERR(op_data));
4118 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4119 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4120 ll_finish_md_op_data(op_data);
4124 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4126 GOTO(out_req, rc = -EFAULT);
4128 *fid = body->mbo_fid1;
4131 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4133 ptlrpc_req_finished(req);
4137 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4140 struct dentry *dchild = NULL;
4141 struct inode *child_inode = NULL;
4142 struct md_op_data *op_data;
4143 struct ptlrpc_request *request = NULL;
4144 struct obd_client_handle *och = NULL;
4146 struct mdt_body *body;
4147 __u64 data_version = 0;
4148 size_t namelen = strlen(name);
4149 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4153 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4154 PFID(ll_inode2fid(parent)), name,
4155 lum->lum_stripe_offset, lum->lum_stripe_count);
4157 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4158 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4159 lustre_swab_lmv_user_md(lum);
4161 /* Get child FID first */
4162 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4165 dchild = d_lookup(file_dentry(file), &qstr);
4167 if (dchild->d_inode)
4168 child_inode = igrab(dchild->d_inode);
4173 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4182 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4183 OBD_CONNECT2_DIR_MIGRATE)) {
4184 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4185 ll_i2info(child_inode)->lli_lsm_md) {
4186 CERROR("%s: MDT doesn't support stripe directory "
4188 ll_get_fsname(parent->i_sb, NULL, 0));
4189 GOTO(out_iput, rc = -EOPNOTSUPP);
4194 * lfs migrate command needs to be blocked on the client
4195 * by checking the migrate FID against the FID of the
4198 if (child_inode == parent->i_sb->s_root->d_inode)
4199 GOTO(out_iput, rc = -EINVAL);
4201 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4202 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4203 if (IS_ERR(op_data))
4204 GOTO(out_iput, rc = PTR_ERR(op_data));
4206 inode_lock(child_inode);
4207 op_data->op_fid3 = *ll_inode2fid(child_inode);
4208 if (!fid_is_sane(&op_data->op_fid3)) {
4209 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4210 ll_get_fsname(parent->i_sb, NULL, 0), name,
4211 PFID(&op_data->op_fid3));
4212 GOTO(out_unlock, rc = -EINVAL);
4215 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4216 op_data->op_data = lum;
4217 op_data->op_data_size = lumlen;
4220 if (S_ISREG(child_inode->i_mode)) {
4221 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4225 GOTO(out_unlock, rc);
4228 rc = ll_data_version(child_inode, &data_version,
4231 GOTO(out_close, rc);
4233 op_data->op_open_handle = och->och_open_handle;
4234 op_data->op_data_version = data_version;
4235 op_data->op_lease_handle = och->och_lease_handle;
4236 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4238 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4239 och->och_mod->mod_open_req->rq_replay = 0;
4240 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4243 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4244 name, namelen, &request);
4246 LASSERT(request != NULL);
4247 ll_update_times(request, parent);
4249 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4250 LASSERT(body != NULL);
4252 /* If the server does release the layout lock, then we clean up
4253 * the client och here; otherwise release it in out_close: */
4254 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4255 obd_mod_put(och->och_mod);
4256 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4258 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4264 if (request != NULL) {
4265 ptlrpc_req_finished(request);
4269 /* Try again if the file layout has changed. */
4270 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4275 ll_lease_close(och, child_inode, NULL);
4277 clear_nlink(child_inode);
4279 inode_unlock(child_inode);
4280 ll_finish_md_op_data(op_data);
4287 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4295 * Test if some locks matching bits and l_req_mode are acquired:
4296 * - bits can be in different locks
4297 * - if found, clear the common lock bits in *bits
4298 * - the bits not found are kept in *bits
4300 * \param bits [IN] searched lock bits
4301 * \param l_req_mode [IN] searched lock mode
4302 * \retval boolean, true iff all bits are found
4304 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4306 struct lustre_handle lockh;
4307 union ldlm_policy_data policy;
4308 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4309 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4318 fid = &ll_i2info(inode)->lli_fid;
4319 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4320 ldlm_lockname[mode]);
4322 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4323 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4324 policy.l_inodebits.bits = *bits & (1 << i);
4325 if (policy.l_inodebits.bits == 0)
4328 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4329 &policy, mode, &lockh)) {
4330 struct ldlm_lock *lock;
4332 lock = ldlm_handle2lock(&lockh);
4335 ~(lock->l_policy_data.l_inodebits.bits);
4336 LDLM_LOCK_PUT(lock);
4338 *bits &= ~policy.l_inodebits.bits;
4345 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4346 struct lustre_handle *lockh, __u64 flags,
4347 enum ldlm_mode mode)
4349 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4354 fid = &ll_i2info(inode)->lli_fid;
4355 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4357 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4358 fid, LDLM_IBITS, &policy, mode, lockh);
4363 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4365 /* Already unlinked. Just update nlink and return success */
4366 if (rc == -ENOENT) {
4368 /* If it is a striped directory and there is a bad stripe,
4369 * let's revalidate the dentry again instead of returning
4371 if (S_ISDIR(inode->i_mode) &&
4372 ll_i2info(inode)->lli_lsm_md != NULL)
4375 /* This path cannot be hit for regular files except in
4376 * the case of obscure races, so no need to validate
4378 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4380 } else if (rc != 0) {
4381 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4382 "%s: revalidate FID "DFID" error: rc = %d\n",
4383 ll_get_fsname(inode->i_sb, NULL, 0),
4384 PFID(ll_inode2fid(inode)), rc);
4390 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4392 struct inode *inode = dentry->d_inode;
4393 struct obd_export *exp = ll_i2mdexp(inode);
4394 struct lookup_intent oit = {
4397 struct ptlrpc_request *req = NULL;
4398 struct md_op_data *op_data;
4402 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4403 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4405 /* Call getattr by fid, so do not provide name at all. */
4406 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4407 LUSTRE_OPC_ANY, NULL);
4408 if (IS_ERR(op_data))
4409 RETURN(PTR_ERR(op_data));
4411 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4412 ll_finish_md_op_data(op_data);
4414 rc = ll_inode_revalidate_fini(inode, rc);
4418 rc = ll_revalidate_it_finish(req, &oit, dentry);
4420 ll_intent_release(&oit);
4424 /* Unlinked? Unhash dentry, so it is not picked up later by
4425 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4426 * here to preserve get_cwd functionality on 2.6.
4428 if (!dentry->d_inode->i_nlink) {
4429 ll_lock_dcache(inode);
4430 d_lustre_invalidate(dentry, 0);
4431 ll_unlock_dcache(inode);
4434 ll_lookup_finish_locks(&oit, dentry);
4436 ptlrpc_req_finished(req);
4441 static int ll_merge_md_attr(struct inode *inode)
4443 struct cl_attr attr = { 0 };
4446 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4447 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4448 &attr, ll_md_blocking_ast);
4452 set_nlink(inode, attr.cat_nlink);
4453 inode->i_blocks = attr.cat_blocks;
4454 i_size_write(inode, attr.cat_size);
4456 ll_i2info(inode)->lli_atime = attr.cat_atime;
4457 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4458 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4463 static inline dev_t ll_compat_encode_dev(dev_t dev)
4465 /* The compat_sys_*stat*() syscalls will fail unless the
4466 * device majors and minors are both less than 256. Note that
4467 * the value returned here will be passed through
4468 * old_encode_dev() in cp_compat_stat(). And so we are not
4469 * trying to return a valid compat (u16) device number, just
4470 * one that will pass the old_valid_dev() check. */
4472 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
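/*
 * Worked example (illustrative values): a device with major 300 and
 * minor 70000 is squashed to major 300 & 0xff = 44 and minor
 * 70000 & 0xff = 112, both of which pass the old_valid_dev() check.
 */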
4475 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4476 int ll_getattr(const struct path *path, struct kstat *stat,
4477 u32 request_mask, unsigned int flags)
4479 struct dentry *de = path->dentry;
4481 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4484 struct inode *inode = de->d_inode;
4485 struct ll_sb_info *sbi = ll_i2sbi(inode);
4486 struct ll_inode_info *lli = ll_i2info(inode);
4489 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4491 rc = ll_inode_revalidate(de, IT_GETATTR);
4495 if (S_ISREG(inode->i_mode)) {
4496 /* In the case of a restore, the MDT has the right size and has
4497 * already sent it back without granting the layout lock, so the
4498 * inode is up-to-date and a glimpse is useless.
4499 * Also, to glimpse we need the layout; during a running restore
4500 * the MDT holds the layout lock, so the glimpse will block until
4501 * the end of the restore (getattr will block)
4503 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4504 rc = ll_glimpse_size(inode);
4509 /* If the object isn't a regular file then don't validate its size. */
4510 if (S_ISDIR(inode->i_mode) &&
4511 lli->lli_lsm_md != NULL) {
4512 rc = ll_merge_md_attr(inode);
4517 LTIME_S(inode->i_atime) = lli->lli_atime;
4518 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4519 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4522 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4524 if (ll_need_32bit_api(sbi)) {
4525 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4526 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4527 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4529 stat->ino = inode->i_ino;
4530 stat->dev = inode->i_sb->s_dev;
4531 stat->rdev = inode->i_rdev;
4534 stat->mode = inode->i_mode;
4535 stat->uid = inode->i_uid;
4536 stat->gid = inode->i_gid;
4537 stat->atime = inode->i_atime;
4538 stat->mtime = inode->i_mtime;
4539 stat->ctime = inode->i_ctime;
4540 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4542 stat->nlink = inode->i_nlink;
4543 stat->size = i_size_read(inode);
4544 stat->blocks = inode->i_blocks;
4549 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4550 __u64 start, __u64 len)
4554 struct fiemap *fiemap;
4555 unsigned int extent_count = fieinfo->fi_extents_max;
4557 num_bytes = sizeof(*fiemap) + (extent_count *
4558 sizeof(struct fiemap_extent));
4559 OBD_ALLOC_LARGE(fiemap, num_bytes);
4564 fiemap->fm_flags = fieinfo->fi_flags;
4565 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4566 fiemap->fm_start = start;
4567 fiemap->fm_length = len;
4568 if (extent_count > 0 &&
4569 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4570 sizeof(struct fiemap_extent)) != 0)
4571 GOTO(out, rc = -EFAULT);
4573 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4575 fieinfo->fi_flags = fiemap->fm_flags;
4576 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4577 if (extent_count > 0 &&
4578 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4579 fiemap->fm_mapped_extents *
4580 sizeof(struct fiemap_extent)) != 0)
4581 GOTO(out, rc = -EFAULT);
4583 OBD_FREE_LARGE(fiemap, num_bytes);
4587 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4589 struct ll_inode_info *lli = ll_i2info(inode);
4590 struct posix_acl *acl = NULL;
4593 spin_lock(&lli->lli_lock);
4594 /* VFS' acl_permission_check->check_acl will release the refcount */
4595 acl = posix_acl_dup(lli->lli_posix_acl);
4596 spin_unlock(&lli->lli_lock);
4601 #ifdef HAVE_IOP_SET_ACL
4602 #ifdef CONFIG_FS_POSIX_ACL
4603 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4605 struct ll_sb_info *sbi = ll_i2sbi(inode);
4606 struct ptlrpc_request *req = NULL;
4607 const char *name = NULL;
4609 size_t value_size = 0;
4614 case ACL_TYPE_ACCESS:
4615 name = XATTR_NAME_POSIX_ACL_ACCESS;
4617 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4620 case ACL_TYPE_DEFAULT:
4621 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4622 if (!S_ISDIR(inode->i_mode))
4623 rc = acl ? -EACCES : 0;
4634 value_size = posix_acl_xattr_size(acl->a_count);
4635 value = kmalloc(value_size, GFP_NOFS);
4637 GOTO(out, rc = -ENOMEM);
4639 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4641 GOTO(out_value, rc);
4644 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4645 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4646 name, value, value_size, 0, 0, &req);
4648 ptlrpc_req_finished(req);
4653 forget_cached_acl(inode, type);
4655 set_cached_acl(inode, type, acl);
4658 #endif /* CONFIG_FS_POSIX_ACL */
4659 #endif /* HAVE_IOP_SET_ACL */
4661 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4663 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4664 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4666 ll_check_acl(struct inode *inode, int mask)
4669 # ifdef CONFIG_FS_POSIX_ACL
4670 struct posix_acl *acl;
4674 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4675 if (flags & IPERM_FLAG_RCU)
4678 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4683 rc = posix_acl_permission(inode, acl, mask);
4684 posix_acl_release(acl);
4687 # else /* !CONFIG_FS_POSIX_ACL */
4689 # endif /* CONFIG_FS_POSIX_ACL */
4691 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4693 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4694 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4696 # ifdef HAVE_INODE_PERMISION_2ARGS
4697 int ll_inode_permission(struct inode *inode, int mask)
4699 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4704 struct ll_sb_info *sbi;
4705 struct root_squash_info *squash;
4706 struct cred *cred = NULL;
4707 const struct cred *old_cred = NULL;
4709 bool squash_id = false;
4712 #ifdef MAY_NOT_BLOCK
4713 if (mask & MAY_NOT_BLOCK)
4715 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4716 if (flags & IPERM_FLAG_RCU)
4720 /* as the root inode is NOT validated during the lookup operation,
4721 * we need to do it before the permission check. */
4723 if (inode == inode->i_sb->s_root->d_inode) {
4724 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4729 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4730 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4732 /* squash fsuid/fsgid if needed */
4733 sbi = ll_i2sbi(inode);
4734 squash = &sbi->ll_squash;
4735 if (unlikely(squash->rsi_uid != 0 &&
4736 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4737 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4741 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4742 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4743 squash->rsi_uid, squash->rsi_gid);
4745 /* update current process's credentials
4746 * and FS capability */
4747 cred = prepare_creds();
4751 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4752 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4753 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4754 if ((1 << cap) & CFS_CAP_FS_MASK)
4755 cap_lower(cred->cap_effective, cap);
4757 old_cred = override_creds(cred);
4760 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4761 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4762 /* restore current process's credentials and FS capability */
4764 revert_creds(old_cred);
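/*
 * Illustrative sketch, not part of the original source: the credential
 * override pattern used above, shown in isolation. prepare_creds() copies
 * the caller's credentials, override_creds() installs the copy and returns
 * the previous set, and revert_creds()/put_cred() undo it. Error handling
 * and the capability mask are elided; "squashed_uid" is hypothetical.
 */
#if 0
	struct cred *cred = prepare_creds();
	const struct cred *old_cred;

	cred->fsuid = make_kuid(&init_user_ns, squashed_uid);
	old_cred = override_creds(cred);
	/* ... perform the access check as the squashed identity ... */
	revert_creds(old_cred);
	put_cred(cred);
#endif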
/* -o localflock - only provides locally consistent flock locks */
struct file_operations ll_file_operations = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
	.read		= new_sync_read,
	.write		= new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
	.read_iter	= ll_file_read_iter,
	.write_iter	= ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.read		= ll_file_read,
	.aio_read	= ll_file_aio_read,
	.write		= ll_file_write,
	.aio_write	= ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.unlocked_ioctl	= ll_file_ioctl,
	.open		= ll_file_open,
	.release	= ll_file_release,
	.mmap		= ll_file_mmap,
	.llseek		= ll_file_seek,
	.splice_read	= ll_file_splice_read,
	.fsync		= ll_fsync,
	.flush		= ll_flush
};
struct file_operations ll_file_operations_flock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
	.read		= new_sync_read,
	.write		= new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
	.read_iter	= ll_file_read_iter,
	.write_iter	= ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.read		= ll_file_read,
	.aio_read	= ll_file_aio_read,
	.write		= ll_file_write,
	.aio_write	= ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.unlocked_ioctl	= ll_file_ioctl,
	.open		= ll_file_open,
	.release	= ll_file_release,
	.mmap		= ll_file_mmap,
	.llseek		= ll_file_seek,
	.splice_read	= ll_file_splice_read,
	.fsync		= ll_fsync,
	.flush		= ll_flush,
	.flock		= ll_file_flock,
	.lock		= ll_file_flock
};
/* These are for -o noflock - to return ENOSYS on flock calls */
struct file_operations ll_file_operations_noflock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
	.read		= new_sync_read,
	.write		= new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
	.read_iter	= ll_file_read_iter,
	.write_iter	= ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.read		= ll_file_read,
	.aio_read	= ll_file_aio_read,
	.write		= ll_file_write,
	.aio_write	= ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.unlocked_ioctl	= ll_file_ioctl,
	.open		= ll_file_open,
	.release	= ll_file_release,
	.mmap		= ll_file_mmap,
	.llseek		= ll_file_seek,
	.splice_read	= ll_file_splice_read,
	.fsync		= ll_fsync,
	.flush		= ll_flush,
	.flock		= ll_file_noflock,
	.lock		= ll_file_noflock
};
struct inode_operations ll_file_inode_operations = {
	.setattr	= ll_setattr,
	.getattr	= ll_getattr,
	.permission	= ll_inode_permission,
#ifdef HAVE_IOP_XATTR
	.setxattr	= ll_setxattr,
	.getxattr	= ll_getxattr,
	.removexattr	= ll_removexattr,
#endif
	.listxattr	= ll_listxattr,
	.fiemap		= ll_fiemap,
#ifdef HAVE_IOP_GET_ACL
	.get_acl	= ll_get_acl,
#endif
#ifdef HAVE_IOP_SET_ACL
	.set_acl	= ll_set_acl,
#endif
};
int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct lu_env *env;
	int rc;
	__u16 refcheck;
	ENTRY;

	if (obj == NULL)
		RETURN(0);
	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	rc = cl_conf_set(env, lli->lli_clob, conf);
	if (rc < 0)
		GOTO(out, rc);

	if (conf->coc_opc == OBJECT_CONF_SET) {
		struct ldlm_lock *lock = conf->coc_lock;
		struct cl_layout cl = {
			.cl_layout_gen = 0,
		};

		LASSERT(lock != NULL);
		LASSERT(ldlm_has_layout(lock));
		/* the lock can only be allowed to match after the layout has
		 * been applied to the inode, otherwise a stale layout would
		 * be seen. Applying the layout should happen before dropping
		 * the intent lock. */
		ldlm_lock_allow_match(lock);

		rc = cl_object_layout_get(env, obj, &cl);
		if (rc < 0)
			GOTO(out, rc);
		CDEBUG(D_VFSTRACE,
		       DFID": layout version change: %u -> %u\n",
		       PFID(&lli->lli_fid), ll_layout_version_get(lli),
		       cl.cl_layout_gen);
		ll_layout_version_set(lli, cl.cl_layout_gen);
	}
out:
	cl_env_put(env, &refcheck);
	RETURN(rc);
}
/* Fetch layout from MDT with getxattr request, if it's not ready yet */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ptlrpc_request *req;
	struct mdt_body *body;
	void *lvbdata;
	void *lmm;
	int lmmsize;
	int rc;
	ENTRY;

	CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
	       PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
	       lock->l_lvb_data, lock->l_lvb_len);

	if (lock->l_lvb_data != NULL)
		RETURN(0);

	/* if the layout lock was granted right away, the layout is returned
	 * within the DLM_LVB of the DLM reply; otherwise, if the lock was
	 * ever blocked and then granted via completion AST, we have to fetch
	 * the layout here. Note that we can't use the LVB buffer in the
	 * completion AST because it isn't large enough. */
	rc = ll_get_default_mdsize(sbi, &lmmsize);
	if (rc == 0)
		rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
				 OBD_MD_FLXATTR, XATTR_NAME_LOV, lmmsize,
				 &req);
	if (rc < 0)
		RETURN(rc);

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	if (body == NULL)
		GOTO(out, rc = -EPROTO);

	lmmsize = body->mbo_eadatasize;
	if (lmmsize == 0) /* empty layout */
		GOTO(out, rc = 0);

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
	if (lmm == NULL)
		GOTO(out, rc = -EFAULT);

	OBD_ALLOC_LARGE(lvbdata, lmmsize);
	if (lvbdata == NULL)
		GOTO(out, rc = -ENOMEM);

	memcpy(lvbdata, lmm, lmmsize);
	lock_res_and_lock(lock);
	if (unlikely(lock->l_lvb_data == NULL)) {
		/* publish the layout as the lock's LVB exactly once */
		lock->l_lvb_type = LVB_T_LAYOUT;
		lock->l_lvb_data = lvbdata;
		lock->l_lvb_len = lmmsize;
		lvbdata = NULL;
	}
	unlock_res_and_lock(lock);
	if (lvbdata)
		OBD_FREE_LARGE(lvbdata, lmmsize);

	EXIT;
out:
	ptlrpc_req_finished(req);
	return rc;
}
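/*
 * Illustrative sketch, not part of the original source: the publish-once
 * pattern used above for the LVB. The buffer is prepared outside the lock,
 * attached only if no other thread won the race, and the loser's copy is
 * freed after dropping the resource lock. "buf" and "buflen" are
 * hypothetical.
 */
#if 0
	lock_res_and_lock(lock);
	if (lock->l_lvb_data == NULL) {		/* we are first: publish */
		lock->l_lvb_data = buf;
		lock->l_lvb_len = buflen;
		buf = NULL;			/* ownership transferred */
	}
	unlock_res_and_lock(lock);
	if (buf != NULL)			/* lost the race: discard */
		OBD_FREE_LARGE(buf, buflen);
#endif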
/**
 * Apply the layout to the inode. The layout lock is held and will be
 * released in ll_layout_conf().
 */
static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
			      struct inode *inode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ldlm_lock *lock;
	struct cl_object_conf conf;
	int rc = 0;
	bool lvb_ready;
	bool wait_layout = false;
	ENTRY;

	LASSERT(lustre_handle_is_used(lockh));

	lock = ldlm_handle2lock(lockh);
	LASSERT(lock != NULL);
	LASSERT(ldlm_has_layout(lock));

	LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
		   PFID(&lli->lli_fid), inode);

	/* in case this is a cached lock, reinstate it with the new inode */
	md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);

	lock_res_and_lock(lock);
	lvb_ready = ldlm_is_lvb_ready(lock);
	unlock_res_and_lock(lock);

	/* checking lvb_ready is racy but this is okay. The worst case is
	 * that multiple processes may configure the file at the same time. */
	if (lvb_ready)
		GOTO(out, rc = 0);

	rc = ll_layout_fetch(inode, lock);
	if (rc < 0)
		GOTO(out, rc);

	/* for a layout lock, lmm is stored in the lock's LVB.
	 * lvb_data is immutable while the lock is held, so it's safe to
	 * access it without the resource lock.
	 *
	 * set the layout on the file. This is unlikely to fail, as the old
	 * layout was surely eliminated. */
	memset(&conf, 0, sizeof conf);
	conf.coc_opc = OBJECT_CONF_SET;
	conf.coc_inode = inode;
	conf.coc_lock = lock;
	conf.u.coc_layout.lb_buf = lock->l_lvb_data;
	conf.u.coc_layout.lb_len = lock->l_lvb_len;
	rc = ll_layout_conf(inode, &conf);

	/* refresh layout failed, need to wait */
	wait_layout = rc == -EBUSY;
	EXIT;
out:
	LDLM_LOCK_PUT(lock);
	ldlm_lock_decref(lockh, mode);

	/* wait for IO to complete if it's still being used. */
	if (wait_layout) {
		CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&lli->lli_fid), inode);

		memset(&conf, 0, sizeof conf);
		conf.coc_opc = OBJECT_CONF_WAIT;
		conf.coc_inode = inode;
		rc = ll_layout_conf(inode, &conf);
		if (rc == 0)
			rc = -EAGAIN;

		CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&lli->lli_fid), rc);
	}
	RETURN(rc);
}
/**
 * Issue a layout intent RPC to the MDS.
 * \param inode	[in]	file inode
 * \param intent [in]	layout intent
 *
 * \retval 0	on success
 * \retval < 0	error code
 */
static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct lookup_intent it;
	struct ptlrpc_request *req;
	int rc;
	ENTRY;

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
				     0, 0, LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	op_data->op_data = intent;
	op_data->op_data_size = sizeof(*intent);

	memset(&it, 0, sizeof(it));
	it.it_op = IT_LAYOUT;
	if (intent->li_opc == LAYOUT_INTENT_WRITE ||
	    intent->li_opc == LAYOUT_INTENT_TRUNC)
		it.it_flags = FMODE_WRITE;

	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
			  ll_get_fsname(inode->i_sb, NULL, 0),
			  PFID(&lli->lli_fid), inode);

	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
			    &ll_md_blocking_ast, 0);
	if (it.it_request != NULL)
		ptlrpc_req_finished(it.it_request);
	it.it_request = NULL;

	ll_finish_md_op_data(op_data);

	/* set lock data in case this is a new lock */
	if (!rc)
		ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
	ll_intent_drop_lock(&it);

	RETURN(rc);
}
/**
 * Check whether a LAYOUT lock exists on the client side for this inode,
 * and enqueue one if none is cached.
 *
 * The layout lock is not held on return, so it may be revoked any time
 * after this function returns; any operation that depends on the layout
 * must then be redone.
 *
 * This function should be called before lov_io_init() to get an up-to-date
 * layout version: the caller should save the version number and, once the
 * IO has finished, call this function again to verify that the layout was
 * not changed while the IO was in flight.
 */
int ll_layout_refresh(struct inode *inode, __u32 *gen)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct lustre_handle lockh;
	struct layout_intent intent = {
		.li_opc = LAYOUT_INTENT_ACCESS,
	};
	enum ldlm_mode mode;
	int rc;
	ENTRY;

	*gen = ll_layout_version_get(lli);
	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
		RETURN(0);

	/* sanity checks */
	LASSERT(fid_is_sane(ll_inode2fid(inode)));
	LASSERT(S_ISREG(inode->i_mode));

	/* take layout lock mutex to enqueue layout lock exclusively. */
	mutex_lock(&lli->lli_layout_mutex);

	while (1) {
		/* mostly the layout lock is cached on the local side, so try
		 * to match it before issuing a new enqueue. */
		mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
				       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
		if (mode != 0) { /* hit cached lock */
			rc = ll_layout_lock_set(&lockh, mode, inode);
			if (rc == -EAGAIN)
				continue;
			break;
		}

		rc = ll_layout_intent(inode, &intent);
		if (rc != 0)
			break;
	}

	if (rc == 0)
		*gen = ll_layout_version_get(lli);
	mutex_unlock(&lli->lli_layout_mutex);

	RETURN(rc);
}
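/*
 * Illustrative sketch, not part of the original source: the caller protocol
 * described above. Save the generation before starting IO, then refresh
 * again afterwards to detect a layout change; "restart the IO" stands in
 * for whatever recovery the caller implements.
 */
#if 0
	__u32 gen_before, gen_after;

	rc = ll_layout_refresh(inode, &gen_before);
	if (rc == 0) {
		/* ... run the IO against the layout of gen_before ... */
		rc = ll_layout_refresh(inode, &gen_after);
		if (rc == 0 && gen_after != gen_before)
			rc = -EAGAIN;	/* layout changed: restart the IO */
	}
#endif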
/**
 * Issue a layout intent RPC indicating where in the file an IO is about
 * to write.
 *
 * \param[in] inode	file inode
 * \param[in] opc	layout intent opcode
 * \param[in] ext	write range, with the byte offset in the file where
 *			the IO will start and the exclusive end offset in
 *			bytes
 * \retval 0	on success
 * \retval < 0	error code
 */
int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
			   struct lu_extent *ext)
{
	struct layout_intent intent = {
		.li_opc = opc,
		.li_extent.e_start = ext->e_start,
		.li_extent.e_end = ext->e_end,
	};
	int rc;
	ENTRY;

	rc = ll_layout_intent(inode, &intent);
	RETURN(rc);
}
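/*
 * Illustrative sketch with hypothetical values, not part of the original
 * source: announcing an upcoming 1 MiB write at offset 0 before the IO
 * starts, so the server can instantiate the matching layout components.
 */
#if 0
	struct lu_extent ext = {
		.e_start = 0,
		.e_end   = 1024 * 1024,	/* exclusive end offset in bytes */
	};

	rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE, &ext);
#endif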
/**
 * Send a layout restore request to the MDT.
 */
int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
{
	struct hsm_user_request *hur;
	int len, rc;
	ENTRY;

	len = sizeof(struct hsm_user_request) +
	      sizeof(struct hsm_user_item);
	OBD_ALLOC(hur, len);
	if (hur == NULL)
		RETURN(-ENOMEM);

	hur->hur_request.hr_action = HUA_RESTORE;
	hur->hur_request.hr_archive_id = 0;
	hur->hur_request.hr_flags = 0;
	memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
	       sizeof(hur->hur_user_item[0].hui_fid));
	hur->hur_user_item[0].hui_extent.offset = offset;
	hur->hur_user_item[0].hui_extent.length = length;
	hur->hur_request.hr_itemcount = 1;
	rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
			   len, hur, NULL);
	OBD_FREE(hur, len);
	RETURN(rc);
}
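/*
 * Illustrative sketch, not part of the original source: restoring the whole
 * file by passing offset 0 and length OBD_OBJECT_EOF, the usual convention
 * when the complete contents are needed before IO can proceed.
 */
#if 0
	rc = ll_layout_restore(inode, 0, OBD_OBJECT_EOF);
#endif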