 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2017, Intel Corporation.
 *
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * Author: Peter Braam <braam@clusterfs.com>
 * Author: Phil Schwan <phil@clusterfs.com>
 * Author: Andreas Dilger <adilger@clusterfs.com>
 */
#define DEBUG_SUBSYSTEM S_LLITE
#include <lustre_dlm.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/sched.h>
#include <linux/user_namespace.h>
#ifdef HAVE_UIDGID_HEADER
# include <linux/uidgid.h>
#endif

#include <uapi/linux/lustre/lustre_ioctl.h>
#include <lustre_swab.h>

#include "cl_object.h"
#include "llite_internal.h"
#include "vvp_internal.h"
struct split_param {
	struct inode	*sp_inode;
	__u16		sp_mirror_id;
};
static int
ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);

static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
			  bool *lease_broken);
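
/* Allocate and free the per-open-file state from its dedicated slab.
 * GFP_NOFS avoids re-entering the filesystem from memory reclaim while
 * allocating. */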
static struct ll_file_data *ll_file_data_get(void)
{
	struct ll_file_data *fd;

	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
	if (fd == NULL)
		return NULL;

	fd->fd_write_failed = false;

	return fd;
}

static void ll_file_data_put(struct ll_file_data *fd)
{
	if (fd != NULL)
		OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
}
/**
 * Packs all the attributes into @op_data for the CLOSE RPC.
 */
static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
			     struct obd_client_handle *och)
{
	ll_prep_md_op_data(op_data, inode, NULL, NULL,
			   0, 0, LUSTRE_OPC_ANY, NULL);

	op_data->op_attr.ia_mode = inode->i_mode;
	op_data->op_attr.ia_atime = inode->i_atime;
	op_data->op_attr.ia_mtime = inode->i_mtime;
	op_data->op_attr.ia_ctime = inode->i_ctime;
	op_data->op_attr.ia_size = i_size_read(inode);
	op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
				      ATTR_MTIME | ATTR_MTIME_SET |
				      ATTR_CTIME);
	op_data->op_xvalid |= OP_XVALID_CTIME_SET;
	op_data->op_attr_blocks = inode->i_blocks;
	op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
	if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
		op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
	op_data->op_open_handle = och->och_open_handle;

	if (och->och_flags & FMODE_WRITE &&
	    ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
		/* For HSM: if inode data has been modified, pack it so that
		 * the MDT can set the data-dirty flag in the archive. */
		op_data->op_bias |= MDS_DATA_MODIFIED;
}
/**
 * Perform a close, possibly with a bias.
 * The meaning of "data" depends on the value of "bias".
 *
 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 * swap layouts with.
 */
static int ll_close_inode_openhandle(struct inode *inode,
				     struct obd_client_handle *och,
				     enum mds_op_bias bias, void *data)
{
	struct obd_export *md_exp = ll_i2mdexp(inode);
	const struct ll_inode_info *lli = ll_i2info(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	int rc;

	if (class_exp2obd(md_exp) == NULL) {
		CERROR("%s: invalid MDC connection handle closing "DFID"\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&lli->lli_fid));
		GOTO(out, rc = 0);
	}

	OBD_ALLOC_PTR(op_data);
	/* We leak the openhandle and request here on error, but there is not
	 * much to be done in the OOM case since the app won't retry the close
	 * on error either. */
	if (op_data == NULL)
		GOTO(out, rc = -ENOMEM);

	ll_prepare_close(inode, op_data, och);
	switch (bias) {
	case MDS_CLOSE_LAYOUT_MERGE:
		/* merge blocks from the victim inode */
		op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
		op_data->op_attr.ia_valid |= ATTR_SIZE;
		op_data->op_xvalid |= OP_XVALID_BLOCKS;
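		/* fallthrough: MERGE shares the bias and lease-handle setup
		 * below with the SPLIT and SWAP cases */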
	case MDS_CLOSE_LAYOUT_SPLIT:
	case MDS_CLOSE_LAYOUT_SWAP: {
		struct split_param *sp = data;

		LASSERT(data != NULL);
		op_data->op_bias |= bias;
		op_data->op_data_version = 0;
		op_data->op_lease_handle = och->och_lease_handle;
		if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
			op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
			op_data->op_mirror_id = sp->sp_mirror_id;
		} else {
			op_data->op_fid2 = *ll_inode2fid(data);
		}
		break;
	}
	case MDS_CLOSE_RESYNC_DONE: {
		struct ll_ioc_lease *ioc = data;

		LASSERT(data != NULL);
		op_data->op_attr_blocks +=
			ioc->lil_count * op_data->op_attr_blocks;
		op_data->op_attr.ia_valid |= ATTR_SIZE;
		op_data->op_xvalid |= OP_XVALID_BLOCKS;
		op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;

		op_data->op_lease_handle = och->och_lease_handle;
		op_data->op_data = &ioc->lil_ids[0];
		op_data->op_data_size =
			ioc->lil_count * sizeof(ioc->lil_ids[0]);
		break;
	}
	case MDS_HSM_RELEASE:
		LASSERT(data != NULL);
		op_data->op_bias |= MDS_HSM_RELEASE;
		op_data->op_data_version = *(__u64 *)data;
		op_data->op_lease_handle = och->och_lease_handle;
		op_data->op_attr.ia_valid |= ATTR_SIZE;
		op_data->op_xvalid |= OP_XVALID_BLOCKS;
		break;

	default:
		LASSERT(data == NULL);
		break;
	}

	if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
		op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
	if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
		op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
	rc = md_close(md_exp, op_data, och->och_mod, &req);
	if (rc != 0 && rc != -EINTR)
		CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
		       md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);

	if (rc == 0 && op_data->op_bias & bias) {
		struct mdt_body *body;

		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
			rc = -EBUSY;
	}

	ll_finish_md_op_data(op_data);

out:
	md_clear_open_replay_data(md_exp, och);
	och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
	OBD_FREE_PTR(och);

	ptlrpc_req_finished(req);	/* This is close request */
	RETURN(rc);
}
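
/*
 * Close the MDS open handle matching \a fmode if this caller was its last
 * user; if other file descriptors still reference the handle, just return.
 */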
int ll_md_real_close(struct inode *inode, fmode_t fmode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct obd_client_handle **och_p;
	struct obd_client_handle *och;
	__u64 *och_usecount;
	int rc = 0;

	if (fmode & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (fmode & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
	} else {
		LASSERT(fmode & FMODE_READ);
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	mutex_lock(&lli->lli_och_mutex);
	if (*och_usecount > 0) {
		/* There are still users of this handle, so skip
		 * freeing it. */
		mutex_unlock(&lli->lli_och_mutex);
		RETURN(0);
	}

	och = *och_p;
	*och_p = NULL;
	mutex_unlock(&lli->lli_och_mutex);

	if (och != NULL) {
		/* There might be a race and this handle may already
		 * be closed. */
		rc = ll_close_inode_openhandle(inode, och, 0, NULL);
	}

	RETURN(rc);
}
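
/*
 * Per-descriptor close: release a group lock and lease still held, close
 * the extra handle kept for a lease, then close the MDS open handle unless
 * a cached OPEN DLM lock lets us skip talking to the MDS.
 */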
static int ll_md_close(struct inode *inode, struct file *file)
{
	union ldlm_policy_data policy = {
		.l_inodebits	= { MDS_INODELOCK_OPEN },
	};
	__u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lustre_handle lockh;
	enum ldlm_mode lockmode;
	int rc = 0;

	/* clear group lock, if present */
	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
		ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);

	if (fd->fd_lease_och != NULL) {
		bool lease_broken;

		/* Usually the lease is not released when the application
		 * crashes, so we need to release it here. */
		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
		       PFID(&lli->lli_fid), rc, lease_broken);

		fd->fd_lease_och = NULL;
	}

	if (fd->fd_och != NULL) {
		rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
		fd->fd_och = NULL;
		GOTO(out, rc);
	}

	/* Let's see if we have a good enough OPEN lock on the file and
	 * whether we can skip talking to the MDS */
	mutex_lock(&lli->lli_och_mutex);
	if (fd->fd_omode & FMODE_WRITE) {
		lockmode = LCK_CW;
		LASSERT(lli->lli_open_fd_write_count);
		lli->lli_open_fd_write_count--;
	} else if (fd->fd_omode & FMODE_EXEC) {
		lockmode = LCK_PR;
		LASSERT(lli->lli_open_fd_exec_count);
		lli->lli_open_fd_exec_count--;
	} else {
		lockmode = LCK_CR;
		LASSERT(lli->lli_open_fd_read_count);
		lli->lli_open_fd_read_count--;
	}
	mutex_unlock(&lli->lli_och_mutex);

	/* LU-4398: do not cache write open lock if the file has exec bit */
	if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) ||
	    !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
			   LDLM_IBITS, &policy, lockmode, &lockh))
		rc = ll_md_real_close(inode, fd->fd_omode);

out:
	LUSTRE_FPRIVATE(file) = NULL;
	ll_file_data_put(fd);

	RETURN(rc);
}
/* While this returns an error code, the caller fput() does not, so we need
 * to make every effort to clean up all of our state here. Also, applications
 * rarely check close errors, and even if an error is returned they will not
 * retry the close call.
 */
int ll_file_release(struct inode *inode, struct file *file)
{
	struct ll_file_data *fd;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ll_inode_info *lli = ll_i2info(inode);
	int rc;

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
	       PFID(ll_inode2fid(inode)), inode);

	if (inode->i_sb->s_root != file_dentry(file))
		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
	fd = LUSTRE_FPRIVATE(file);
	LASSERT(fd != NULL);

	/* The last ref on @file may not belong to the owner pid of statahead,
	 * because parent and child processes can share the same file handle. */
	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
		ll_deauthorize_statahead(inode, fd);

	if (inode->i_sb->s_root == file_dentry(file)) {
		LUSTRE_FPRIVATE(file) = NULL;
		ll_file_data_put(fd);
		RETURN(0);
	}

	if (!S_ISDIR(inode->i_mode)) {
		if (lli->lli_clob != NULL)
			lov_read_and_clear_async_rc(lli->lli_clob);
		lli->lli_async_rc = 0;
	}

	rc = ll_md_close(inode, file);

	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
		libcfs_debug_dumplog();

	RETURN(rc);
}
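
/*
 * read_cache_page() filler for Data-on-MDT: copy the data received inline
 * in the reply from the niobuf into the page, zero-filling the tail.
 */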
static inline int ll_dom_readpage(void *data, struct page *page)
{
	struct niobuf_local *lnb = data;
	void *kaddr;

	kaddr = ll_kmap_atomic(page, KM_USER0);
	memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
	if (lnb->lnb_len < PAGE_SIZE)
		memset(kaddr + lnb->lnb_len, 0,
		       PAGE_SIZE - lnb->lnb_len);
	flush_dcache_page(page);
	SetPageUptodate(page);
	ll_kunmap_atomic(kaddr, KM_USER0);
	unlock_page(page);

	return 0;
}
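
/*
 * Populate the page cache with file data the MDS returned inline in the
 * open reply (Data-on-MDT), so that subsequent reads are served from
 * memory under the DOM lock instead of by another RPC.
 */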
void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
			struct lookup_intent *it)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct address_space *mapping = inode->i_mapping;
	struct page *vmpage;
	struct niobuf_remote *rnb;
	struct mdt_body *body;
	char *data;
	struct lustre_handle lockh;
	struct ldlm_lock *lock;
	unsigned long index, start;
	struct niobuf_local lnb;
	bool dom_lock = false;
	ENTRY;

	if (obj == NULL)
		RETURN_EXIT;

	if (it->it_lock_mode != 0) {
		lockh.cookie = it->it_lock_handle;
		lock = ldlm_handle2lock(&lockh);
		if (lock != NULL)
			dom_lock = ldlm_has_dom(lock);
		LDLM_LOCK_PUT(lock);
	}

	if (!dom_lock)
		RETURN_EXIT;

	if (!req_capsule_field_present(&req->rq_pill, &RMF_NIOBUF_INLINE,
				       RCL_SERVER))
		RETURN_EXIT;

	rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
	if (rnb == NULL || rnb->rnb_len == 0)
		RETURN_EXIT;

	/* LU-11595: the server may return the whole file, which is always
	 * usable, or just the file tail, whose offset must be aligned with
	 * the client PAGE_SIZE to be usable on this client. If the server's
	 * PAGE_SIZE is smaller, the offset may not be aligned, and that data
	 * is simply ignored.
	 */
	if (rnb->rnb_offset % PAGE_SIZE)
		RETURN_EXIT;

	/* The server returns the whole file or just the file tail if it fits
	 * in the reply buffer; in both cases the total size should equal the
	 * file size.
	 */
	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) {
		CERROR("%s: server returns off/len %llu/%u but size %llu\n",
		       ll_get_fsname(inode->i_sb, NULL, 0), rnb->rnb_offset,
		       rnb->rnb_len, body->mbo_dom_size);
		RETURN_EXIT;
	}

	CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n",
	       rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size);

	data = (char *)rnb + sizeof(*rnb);

	lnb.lnb_file_offset = rnb->rnb_offset;
	start = lnb.lnb_file_offset / PAGE_SIZE;
	index = 0;

	LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
	lnb.lnb_page_offset = 0;
	do {
		lnb.lnb_data = data + (index << PAGE_SHIFT);
		lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
		if (lnb.lnb_len > PAGE_SIZE)
			lnb.lnb_len = PAGE_SIZE;

		vmpage = read_cache_page(mapping, index + start,
					 ll_dom_readpage, &lnb);
		if (IS_ERR(vmpage)) {
			CWARN("%s: cannot fill page %lu for "DFID
			      " with data: rc = %li\n",
			      ll_get_fsname(inode->i_sb, NULL, 0),
			      index + start, PFID(lu_object_fid(&obj->co_lu)),
			      PTR_ERR(vmpage));
			break;
		}
		put_page(vmpage);
		index++;
	} while (rnb->rnb_len > (index << PAGE_SHIFT));

	EXIT;
}
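
/*
 * Send an open intent by FID to the MDS (packing the name only when the
 * server cannot open by FID), then set up the inode, the lock data, and
 * any inline Data-on-MDT contents from the reply.
 */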
static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
			       struct lookup_intent *itp)
{
	struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
	struct dentry *parent = de->d_parent;
	char *name = NULL;
	int len = 0;
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	int rc;
	ENTRY;

	LASSERT(parent != NULL);
	LASSERT(itp->it_flags & MDS_OPEN_BY_FID);

	/* If the server cannot open by FID, or the file name is invalid,
	 * don't pack the name in the open request */
	if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
	    !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
retry:
		len = de->d_name.len;
		name = kmalloc(len + 1, GFP_NOFS);
		if (name == NULL)
			RETURN(-ENOMEM);

		/* race here */
		spin_lock(&de->d_lock);
		if (len != de->d_name.len) {
			spin_unlock(&de->d_lock);
			kfree(name);
			goto retry;
		}
		memcpy(name, de->d_name.name, len);
		name[len] = '\0';
		spin_unlock(&de->d_lock);

		if (!lu_name_is_valid_2(name, len)) {
			kfree(name);
			name = NULL;
			len = 0;
		}
	}

	op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
				     name, len, 0, LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data)) {
		kfree(name);
		RETURN(PTR_ERR(op_data));
	}
	op_data->op_data = lmm;
	op_data->op_data_size = lmmsize;

	rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
			    &ll_md_blocking_ast, 0);
	kfree(name);
	ll_finish_md_op_data(op_data);
	if (rc == -ESTALE) {
		/* reason to keep our own exit path: don't flood the log
		 * with -ESTALE error messages.
		 */
		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
		    it_open_error(DISP_OPEN_OPEN, itp))
			GOTO(out, rc);
		ll_release_openhandle(de, itp);
		GOTO(out, rc);
	}

	if (it_disposition(itp, DISP_LOOKUP_NEG))
		GOTO(out, rc = -ENOENT);

	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
		GOTO(out, rc);
	}

	rc = ll_prep_inode(&de->d_inode, req, NULL, itp);

	if (!rc && itp->it_lock_mode) {
		ll_dom_finish_open(de->d_inode, req, itp);
		ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
	}

out:
	ptlrpc_req_finished(req);
	ll_intent_drop_lock(itp);

	/* We did open by FID, but by the time we got to the server the
	 * object disappeared. If this is a create, we cannot really tell
	 * userspace that the file it was trying to create does not exist.
	 * Instead return -ESTALE, and the VFS will retry the create with
	 * LOOKUP_REVAL, which we catch in ll_revalidate_dentry() and then
	 * fall back to lookup.
	 */
	if (rc == -ENOENT && itp->it_op & IT_CREAT)
		rc = -ESTALE;

	RETURN(rc);
}
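
/* Fill \a och from the open reply carried in \a it and register the handle
 * with the MDC for open replay. */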
static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
		       struct obd_client_handle *och)
{
	struct mdt_body *body;

	body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
	och->och_open_handle = body->mbo_open_handle;
	och->och_fid = body->mbo_fid1;
	och->och_lease_handle.cookie = it->it_lock_handle;
	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
	och->och_flags = it->it_flags;

	return md_set_open_replay_data(md_exp, och, it);
}
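
/*
 * Client-side part of an open: optionally fill \a och from the intent,
 * attach \a fd to the struct file, and initialize the readahead and
 * cl_context state.
 */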
static int ll_local_open(struct file *file, struct lookup_intent *it,
			 struct ll_file_data *fd, struct obd_client_handle *och)
{
	struct inode *inode = file_inode(file);
	ENTRY;

	LASSERT(!LUSTRE_FPRIVATE(file));
	LASSERT(fd != NULL);

	if (och) {
		int rc;

		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
		if (rc != 0)
			RETURN(rc);
	}

	LUSTRE_FPRIVATE(file) = fd;
	ll_readahead_init(inode, &fd->fd_ras);
	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);

	/* ll_cl_context initialization */
	rwlock_init(&fd->fd_lock);
	INIT_LIST_HEAD(&fd->fd_lccs);

	RETURN(0);
}
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until the ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used. We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
int ll_file_open(struct inode *inode, struct file *file)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
					  .it_flags = file->f_flags };
	struct obd_client_handle **och_p = NULL;
	__u64 *och_usecount = NULL;
	struct ll_file_data *fd;
	int rc = 0;
	ENTRY;

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
	       PFID(ll_inode2fid(inode)), inode, file->f_flags);

	it = file->private_data; /* XXX: compat macro */
	file->private_data = NULL; /* prevent ll_local_open assertion */

	fd = ll_file_data_get();
	if (fd == NULL)
		GOTO(out_nofiledata, rc = -ENOMEM);

	if (S_ISDIR(inode->i_mode))
		ll_authorize_statahead(inode, fd);

	if (inode->i_sb->s_root == file_dentry(file)) {
		LUSTRE_FPRIVATE(file) = fd;
		RETURN(0);
	}

	if (!it || !it->it_disposition) {
		/* Convert f_flags into access mode. We cannot use file->f_mode,
		 * because everything but the O_ACCMODE mask was stripped from
		 * there. */
		if ((oit.it_flags + 1) & O_ACCMODE)
			oit.it_flags++;
		if (file->f_flags & O_TRUNC)
			oit.it_flags |= FMODE_WRITE;

		/* The kernel only calls f_op->open() from dentry_open();
		 * filp_open() calls dentry_open() after open_namei() has
		 * checked permissions. Only nfsd_open() calls dentry_open()
		 * directly without checking permissions, which is why the
		 * code below is safe. */
		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

		/* We do not want O_EXCL here, presumably we opened the file
		 * already? XXX - NFS implications? */
		oit.it_flags &= ~O_EXCL;

		/* bug 20584: if it_flags contains O_CREAT, the file will be
		 * created if necessary, so IT_CREAT should be set to stay
		 * consistent with it. */
		if (oit.it_flags & O_CREAT)
			oit.it_op |= IT_CREAT;

		it = &oit;
	}

restart:
	/* Let's see if we already have the file open on the MDS. */
	if (it->it_flags & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (it->it_flags & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
	} else {
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	mutex_lock(&lli->lli_och_mutex);
	if (*och_p) { /* Open handle is present */
		if (it_disposition(it, DISP_OPEN_OPEN)) {
			/* Well, there's an extra open request that we do not
			 * need; let's close it somehow. This will decref the
			 * request. */
			rc = it_open_error(DISP_OPEN_OPEN, it);
			if (rc) {
				mutex_unlock(&lli->lli_och_mutex);
				GOTO(out_openerr, rc);
			}

			ll_release_openhandle(file_dentry(file), it);
		}
		(*och_usecount)++;

		rc = ll_local_open(file, it, fd, NULL);
		if (rc) {
			(*och_usecount)--;
			mutex_unlock(&lli->lli_och_mutex);
			GOTO(out_openerr, rc);
		}
	} else {
		LASSERT(*och_usecount == 0);
		if (!it->it_disposition) {
			struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);

			/* We cannot just request a lock handle now; the new
			 * ELC code means one of the other OPEN locks for this
			 * file could be cancelled, and since the blocking AST
			 * handler would attempt to grab och_mutex as well,
			 * that would result in a deadlock. */
			mutex_unlock(&lli->lli_och_mutex);
			/*
			 * Normally called under two situations:
			 * 1. NFS export.
			 * 2. A race/condition on MDS resulting in no open
			 *    handle to be returned from LOOKUP|OPEN request,
			 *    for example if the target entry was a symlink.
			 *
			 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
			 * marked by a bit set in ll_iget_for_nfs. Clear the
			 * bit so that it's not confusing later callers.
			 *
			 * NB: when ldd is NULL, it must have come via the
			 * normal lookup path only, since ll_iget_for_nfs
			 * always calls ll_d_init().
			 */
			if (ldd && ldd->lld_nfs_dentry) {
				ldd->lld_nfs_dentry = 0;
				it->it_flags |= MDS_OPEN_LOCK;
			}

			/*
			 * Always specify MDS_OPEN_BY_FID because we don't
			 * want to get a file with a different FID.
			 */
			it->it_flags |= MDS_OPEN_BY_FID;
			rc = ll_intent_file_open(file_dentry(file), NULL, 0,
						 it);
			if (rc)
				GOTO(out_openerr, rc);

			goto restart;
		}
		OBD_ALLOC(*och_p, sizeof(struct obd_client_handle));
		if (!*och_p)
			GOTO(out_och_free, rc = -ENOMEM);

		(*och_usecount)++;

		/* md_intent_lock() didn't get a request ref if there was an
		 * open error, so don't do cleanup on the request here */
		/* XXX (green): Shouldn't we bail out on any error here, not
		 * just an open error? */
		rc = it_open_error(DISP_OPEN_OPEN, it);
		if (rc != 0)
			GOTO(out_och_free, rc);

		LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
			 "inode %p: disposition %x, status %d\n", inode,
			 it_disposition(it, ~0), it->it_status);

		rc = ll_local_open(file, it, fd, *och_p);
		if (rc)
			GOTO(out_och_free, rc);
	}
	mutex_unlock(&lli->lli_och_mutex);
	fd = NULL;

	/* Must do this outside the lli_och_mutex lock to prevent a deadlock
	 * where a different kind of OPEN lock for this same inode gets
	 * cancelled by ldlm_cancel_lru() */
	if (!S_ISREG(inode->i_mode))
		GOTO(out_och_free, rc);

	cl_lov_delay_create_clear(&file->f_flags);
	GOTO(out_och_free, rc);

out_och_free:
	if (rc) {
		if (och_p && *och_p) {
			OBD_FREE(*och_p, sizeof(struct obd_client_handle));
			*och_p = NULL; /* OBD_FREE writes some magic there */
			(*och_usecount)--;
		}
		mutex_unlock(&lli->lli_och_mutex);

out_openerr:
		if (lli->lli_opendir_key == fd)
			ll_deauthorize_statahead(inode, fd);

		ll_file_data_put(fd);
	} else {
		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
	}

out_nofiledata:
	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->it_request);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
	}

	RETURN(rc);
}
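
/* Blocking AST for a lease lock: cancelling the lock on conflict is what
 * breaks the lease. */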
static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
				    struct ldlm_lock_desc *desc, void *data, int flag)
{
	int rc;
	struct lustre_handle lockh;
	ENTRY;

	switch (flag) {
	case LDLM_CB_BLOCKING:
		ldlm_lock2handle(lock, &lockh);
		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
		if (rc < 0) {
			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
			RETURN(rc);
		}
		break;
	case LDLM_CB_CANCELING:
		/* do nothing */
		break;
	}
	RETURN(0);
}
/**
 * When setting a lease on a file, we take ownership of the lli_mds_*_och
 * and save it as fd->fd_och so as to force the client to reopen the file
 * even if it already has an open lock in cache.
 */
static int ll_lease_och_acquire(struct inode *inode, struct file *file,
				struct lustre_handle *old_open_handle)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct obd_client_handle **och_p;
	__u64 *och_usecount;
	int rc = 0;
	ENTRY;

	/* Get the open handle of the file */
	mutex_lock(&lli->lli_och_mutex);
	if (fd->fd_lease_och != NULL)
		GOTO(out_unlock, rc = -EBUSY);

	if (fd->fd_och == NULL) {
		if (file->f_mode & FMODE_WRITE) {
			LASSERT(lli->lli_mds_write_och != NULL);
			och_p = &lli->lli_mds_write_och;
			och_usecount = &lli->lli_open_fd_write_count;
		} else {
			LASSERT(lli->lli_mds_read_och != NULL);
			och_p = &lli->lli_mds_read_och;
			och_usecount = &lli->lli_open_fd_read_count;
		}

		if (*och_usecount > 1)
			GOTO(out_unlock, rc = -EBUSY);

		fd->fd_och = *och_p;
		*och_p = NULL;
		*och_usecount = 0;
	}

	*old_open_handle = fd->fd_och->och_open_handle;

out_unlock:
	mutex_unlock(&lli->lli_och_mutex);
	RETURN(rc);
}
/**
 * Release ownership on lli_mds_*_och when putting back a file lease.
 */
static int ll_lease_och_release(struct inode *inode, struct file *file)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct obd_client_handle **och_p;
	struct obd_client_handle *old_och = NULL;
	__u64 *och_usecount;
	int rc = 0;
	ENTRY;

	mutex_lock(&lli->lli_och_mutex);
	if (file->f_mode & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else {
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	/* The file may have been opened by another process (broken lease),
	 * so *och_p is not NULL. In this case we should simply increase the
	 * usecount and close our own fd->fd_och handle.
	 */
	if (*och_p != NULL) {
		old_och = fd->fd_och;
		(*och_usecount)++;
	} else {
		*och_p = fd->fd_och;
		*och_usecount = 1;
	}
	fd->fd_och = NULL;
	mutex_unlock(&lli->lli_och_mutex);

	if (old_och != NULL)
		rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);

	RETURN(rc);
}
/**
 * Acquire a lease and open the file.
 */
static struct obd_client_handle *
ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
	      __u64 open_flags)
{
	struct lookup_intent it = { .it_op = IT_OPEN };
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	struct lustre_handle old_open_handle = { 0 };
	struct obd_client_handle *och = NULL;
	int rc;
	int rc2;
	ENTRY;

	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
		RETURN(ERR_PTR(-EINVAL));
	if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
		RETURN(ERR_PTR(-EPERM));

	rc = ll_lease_och_acquire(inode, file, &old_open_handle);
	if (rc)
		RETURN(ERR_PTR(rc));

	OBD_ALLOC_PTR(och);
	if (och == NULL)
		RETURN(ERR_PTR(-ENOMEM));

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		GOTO(out, rc = PTR_ERR(op_data));

	/* To tell the MDT this open handle is from the same owner */
	op_data->op_open_handle = old_open_handle;

	it.it_flags = fmode | open_flags;
	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
			    &ll_md_blocking_lease_ast,
	/* LDLM_FL_NO_LRU: do not put the lease lock into the LRU list,
	 * otherwise it can be cancelled, which may mislead applications
	 * into thinking the lease is broken;
	 * LDLM_FL_EXCL: set this flag so the lock won't be matched by a
	 * normal open in ll_md_blocking_ast(). Otherwise, since
	 * ll_md_blocking_lease_ast doesn't deal with open handles, the
	 * normal open handle would be leaked. */
			    LDLM_FL_NO_LRU | LDLM_FL_EXCL);
	ll_finish_md_op_data(op_data);
	ptlrpc_req_finished(req);
	if (rc < 0)
		GOTO(out_release_it, rc);
	if (it_disposition(&it, DISP_LOOKUP_NEG))
		GOTO(out_release_it, rc = -ENOENT);

	rc = it_open_error(DISP_OPEN_OPEN, &it);
	if (rc)
		GOTO(out_release_it, rc);

	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
	rc = ll_och_fill(sbi->ll_md_exp, &it, och);
	if (rc)
		GOTO(out_release_it, rc);

	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
		GOTO(out_close, rc = -EOPNOTSUPP);

	/* lease already acquired; handle the lease lock */
	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
	if (it.it_lock_mode == 0 ||
	    it.it_lock_bits != MDS_INODELOCK_OPEN) {
		/* an open lock must be returned for a lease */
		CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
		       PFID(ll_inode2fid(inode)), it.it_lock_mode,
		       it.it_lock_bits);
		GOTO(out_close, rc = -EPROTO);
	}

	ll_intent_release(&it);
	RETURN(och);

out_close:
	/* Cancel open lock */
	if (it.it_lock_mode != 0) {
		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
					    it.it_lock_mode);
		it.it_lock_mode = 0;
		och->och_lease_handle.cookie = 0ULL;
	}
	rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
	if (rc2 < 0)
		CERROR("%s: error closing file "DFID": %d\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&ll_i2info(inode)->lli_fid), rc2);
	och = NULL; /* och has been freed in ll_close_inode_openhandle() */

out_release_it:
	ll_intent_release(&it);
out:
	if (och != NULL)
		OBD_FREE_PTR(och);
	RETURN(ERR_PTR(rc));
}
/**
 * Check whether a layout swap can be done between two inodes.
 *
 * \param[in] inode1	First inode to check
 * \param[in] inode2	Second inode to check
 *
 * \retval 0 on success, layout swap can be performed between both inodes
 * \retval negative error code if requirements are not met
 */
static int ll_check_swap_layouts_validity(struct inode *inode1,
					  struct inode *inode2)
{
	if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
		return -EINVAL;

	if (inode_permission(inode1, MAY_WRITE) ||
	    inode_permission(inode2, MAY_WRITE))
		return -EPERM;

	if (inode1->i_sb != inode2->i_sb)
		return -EXDEV;

	return 0;
}
static int ll_swap_layouts_close(struct obd_client_handle *och,
				 struct inode *inode, struct inode *inode2)
{
	const struct lu_fid *fid1 = ll_inode2fid(inode);
	const struct lu_fid *fid2;
	int rc;
	ENTRY;

	CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
	       ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));

	rc = ll_check_swap_layouts_validity(inode, inode2);
	if (rc < 0)
		GOTO(out_free_och, rc);

	/* We now know that inode2 is a lustre inode */
	fid2 = ll_inode2fid(inode2);

	rc = lu_fid_cmp(fid1, fid2);
	if (rc == 0)
		GOTO(out_free_och, rc = -EINVAL);

	/* Close the file and {swap,merge} layouts between inode & inode2.
	 * NB: the lease lock handle is released in mdc_close_layout_swap_pack()
	 * because we still need it to pack l_remote_handle to the MDT. */
	rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
				       inode2);

	och = NULL; /* freed in ll_close_inode_openhandle() */

out_free_och:
	if (och != NULL)
		OBD_FREE_PTR(och);

	RETURN(rc);
}
/**
 * Release the lease and close the file.
 * It will check whether the lease has ever been broken.
 */
static int ll_lease_close_intent(struct obd_client_handle *och,
				 struct inode *inode,
				 bool *lease_broken, enum mds_op_bias bias,
				 void *data)
{
	struct ldlm_lock *lock;
	bool cancelled = true;
	int rc;
	ENTRY;

	lock = ldlm_handle2lock(&och->och_lease_handle);
	if (lock != NULL) {
		lock_res_and_lock(lock);
		cancelled = ldlm_is_cancel(lock);
		unlock_res_and_lock(lock);
		LDLM_LOCK_PUT(lock);
	}

	CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
	       PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);

	if (lease_broken != NULL)
		*lease_broken = cancelled;

	if (!cancelled && !bias)
		ldlm_cli_cancel(&och->och_lease_handle, 0);

	if (cancelled) { /* no need to execute intent */
		bias = 0;
		data = NULL;
	}

	rc = ll_close_inode_openhandle(inode, och, bias, data);
	RETURN(rc);
}

static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
			  bool *lease_broken)
{
	return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
}
/**
 * After a lease is taken, send the RPC MDS_REINT_RESYNC to the MDT.
 */
static int ll_lease_file_resync(struct obd_client_handle *och,
				struct inode *inode, unsigned long arg)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct ll_ioc_lease_id ioc;
	__u64 data_version_unused;
	int rc;
	ENTRY;

	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
			   sizeof(ioc)))
		RETURN(-EFAULT);

	/* before starting a file resync, it's necessary to clean up the page
	 * cache in client memory, otherwise once the layout version is
	 * increased, writing back cached data will be denied by the OSTs. */
	rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
	if (rc)
		GOTO(out, rc);

	op_data->op_lease_handle = och->och_lease_handle;
	op_data->op_mirror_id = ioc.lil_mirror_id;
	rc = md_file_resync(sbi->ll_md_exp, op_data);
	if (rc)
		GOTO(out, rc);

	EXIT;
out:
	ll_finish_md_op_data(op_data);
	return rc;
}
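
/*
 * Merge the size, blocks, and timestamps the OSTs report into the inode,
 * taking care not to overwrite a newer client-side atime.
 */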
int ll_merge_attr(const struct lu_env *env, struct inode *inode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct cl_attr *attr = vvp_env_thread_attr(env);
	s64 atime;
	s64 mtime;
	s64 ctime;
	int rc = 0;

	ENTRY;

	ll_inode_size_lock(inode);

	/* Merge the timestamps most recently obtained from the MDS with
	 * those obtained from the OSTs.
	 *
	 * Do not overwrite the inode's atime because it may be refreshed
	 * by the file_accessed() function. If the read was served by cached
	 * data, there is no RPC to be sent, so atime may not be transferred
	 * to the OSTs at all. The MDT only updates atime at close time if
	 * it's at least 'mdd.*.atime_diff' older.
	 * All in all, the atime in Lustre does not strictly comply with
	 * POSIX. Solving this problem would require sending an RPC to the
	 * MDT for each read, which would hurt performance.
	 */
	if (inode->i_atime.tv_sec < lli->lli_atime ||
	    lli->lli_update_atime) {
		inode->i_atime.tv_sec = lli->lli_atime;
		lli->lli_update_atime = 0;
	}
	inode->i_mtime.tv_sec = lli->lli_mtime;
	inode->i_ctime.tv_sec = lli->lli_ctime;

	mtime = inode->i_mtime.tv_sec;
	atime = inode->i_atime.tv_sec;
	ctime = inode->i_ctime.tv_sec;

	cl_object_attr_lock(obj);
	if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
		rc = -EINVAL;
	else
		rc = cl_object_attr_get(env, obj, attr);
	cl_object_attr_unlock(obj);

	if (rc != 0)
		GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));

	if (atime < attr->cat_atime)
		atime = attr->cat_atime;

	if (ctime < attr->cat_ctime)
		ctime = attr->cat_ctime;

	if (mtime < attr->cat_mtime)
		mtime = attr->cat_mtime;

	CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
	       PFID(&lli->lli_fid), attr->cat_size);

	i_size_write(inode, attr->cat_size);
	inode->i_blocks = attr->cat_blocks;

	inode->i_mtime.tv_sec = mtime;
	inode->i_atime.tv_sec = atime;
	inode->i_ctime.tv_sec = ctime;

out_size_unlock:
	ll_inode_size_unlock(inode);

	RETURN(rc);
}
/**
 * Set the designated mirror for I/O.
 *
 * So far only read, write, and truncate can issue I/O to a designated
 * mirror.
 */
void ll_io_set_mirror(struct cl_io *io, const struct file *file)
{
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

	/* clear layout version for generic (non-resync) I/O in case it
	 * carries a stale layout version due to an I/O restart */
	io->ci_layout_version = 0;

	/* FLR: disable non-delay for designated mirror I/O because obviously
	 * only one mirror is available */
	if (fd->fd_designated_mirror > 0) {
		io->ci_ndelay = 0;
		io->ci_designated_mirror = fd->fd_designated_mirror;
		io->ci_layout_version = fd->fd_layout_version;
	}

	CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
	       file->f_path.dentry->d_name.name, io->ci_designated_mirror);
}
static bool file_is_noatime(const struct file *file)
{
	const struct vfsmount *mnt = file->f_path.mnt;
	const struct inode *inode = file_inode((struct file *)file);

	/* Adapted from file_accessed() and touch_atime(). */
	if (file->f_flags & O_NOATIME)
		return true;

	if (inode->i_flags & S_NOATIME)
		return true;

	if (IS_NOATIME(inode))
		return true;

	if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
		return true;

	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
		return true;

	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
		return true;

	return false;
}
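
/*
 * Initialize a cl_io for this file: append/sync flags for writes, DLM
 * locking mode, noatime handling, and FLR mirror selection.
 */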
static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
{
	struct inode *inode = file_inode(file);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

	io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
	io->ci_lock_no_expand = fd->ll_lock_no_expand;

	if (iot == CIT_WRITE) {
		io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
		io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
					 file->f_flags & O_DIRECT ||
					 IS_SYNC(inode));
	}
	io->ci_obj = ll_i2info(inode)->lli_clob;
	io->ci_lockreq = CILR_MAYBE;
	if (ll_file_nolock(file)) {
		io->ci_lockreq = CILR_NEVER;
		io->ci_no_srvlock = 1;
	} else if (file->f_flags & O_APPEND) {
		io->ci_lockreq = CILR_MANDATORY;
	}
	io->ci_noatime = file_is_noatime(file);

	/* FLR: only use non-delay I/O for read, as there is only one
	 * available mirror for write. */
	io->ci_ndelay = !(iot == CIT_WRITE);

	ll_io_set_mirror(io, file);
}
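
/*
 * Common read/write engine: initialize the cl_io, take the range lock when
 * concurrent I/O could overlap, run the cl_io loop, and restart the I/O if
 * the layout changed while it was in flight.
 */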
static ssize_t
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
		   struct file *file, enum cl_io_type iot,
		   loff_t *ppos, size_t count)
{
	struct vvp_io *vio = vvp_env_io(env);
	struct inode *inode = file_inode(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct range_lock range;
	struct cl_io *io;
	ssize_t result = 0;
	int rc = 0;
	unsigned retried = 0;
	bool restarted = false;

	ENTRY;

	CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
	       file_dentry(file)->d_name.name,
	       iot == CIT_READ ? "read" : "write", *ppos, count);

restart:
	io = vvp_env_thread_io(env);
	ll_io_init(io, file, iot);
	io->ci_ndelay_tried = retried;

	if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
		bool range_locked = false;

		if (file->f_flags & O_APPEND)
			range_lock_init(&range, 0, LUSTRE_EOF);
		else
			range_lock_init(&range, *ppos, *ppos + count - 1);

		vio->vui_fd = LUSTRE_FPRIVATE(file);
		vio->vui_io_subtype = args->via_io_subtype;

		switch (vio->vui_io_subtype) {
		case IO_NORMAL:
			vio->vui_iter = args->u.normal.via_iter;
			vio->vui_iocb = args->u.normal.via_iocb;
			/* Direct IO reads must also take the range lock, or
			 * multiple reads will try to work on the same pages;
			 * see LU-6227 for details. */
			if (((iot == CIT_WRITE) ||
			     (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
			    !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
				CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
				       RL_PARA(&range));
				rc = range_lock(&lli->lli_write_tree, &range);
				if (rc < 0)
					GOTO(out, rc);

				range_locked = true;
			}
			break;
		case IO_SPLICE:
			vio->u.splice.vui_pipe = args->u.splice.via_pipe;
			vio->u.splice.vui_flags = args->u.splice.via_flags;
			break;
		default:
			CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
			LBUG();
		}

		ll_cl_add(file, env, io, LCC_RW);
		rc = cl_io_loop(env, io);
		ll_cl_remove(file, env);

		if (range_locked) {
			CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
			       RL_PARA(&range));
			range_unlock(&lli->lli_write_tree, &range);
		}
	} else {
		/* cl_io_rw_init() handled IO */
		rc = io->ci_result;
	}

	if (io->ci_nob > 0) {
		result += io->ci_nob;
		count -= io->ci_nob;
		*ppos = io->u.ci_wr.wr.crw_pos; /* for splice */

		/* prepare IO restart */
		if (count > 0 && args->via_io_subtype == IO_NORMAL)
			args->u.normal.via_iter = vio->vui_iter;
	}
out:
	cl_io_fini(env, io);

	CDEBUG(D_VFSTRACE,
	       "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
	       file->f_path.dentry->d_name.name,
	       iot, rc, result, io->ci_need_restart);

	if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
		CDEBUG(D_VFSTRACE,
		       "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
		       file_dentry(file)->d_name.name,
		       iot == CIT_READ ? "read" : "write",
		       *ppos, count, result, rc);
		/* preserve the tried count for FLR */
		retried = io->ci_ndelay_tried;
		restarted = true;
		goto restart;
	}

	if (iot == CIT_READ) {
		if (result > 0)
			ll_stats_ops_tally(ll_i2sbi(inode),
					   LPROC_LL_READ_BYTES, result);
	} else if (iot == CIT_WRITE) {
		if (result > 0) {
			ll_stats_ops_tally(ll_i2sbi(inode),
					   LPROC_LL_WRITE_BYTES, result);
			fd->fd_write_failed = false;
		} else if (result == 0 && rc == 0) {
			rc = io->ci_result;
			if (rc < 0)
				fd->fd_write_failed = true;
			else
				fd->fd_write_failed = false;
		} else if (rc != -ERESTARTSYS) {
			fd->fd_write_failed = true;
		}
	}

	CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);

	RETURN(result > 0 ? result : rc);
}
/**
 * The purpose of fast read is to overcome the per-I/O overhead and improve
 * IOPS, especially for small I/O.
 *
 * To serve a read request, CLIO has to create and initialize a cl_io and
 * then request a DLM lock. This has turned out to have significant overhead
 * and affects the performance of small I/O dramatically.
 *
 * It's not necessary to create a cl_io for each I/O. With the help of read
 * ahead, most of the pages being read are already in the memory cache and we
 * can read those pages directly, because if a page exists, the corresponding
 * DLM lock must exist, so the page content must be valid.
 *
 * In the fast read implementation, llite speculatively finds and reads pages
 * in the memory cache. There are three scenarios for fast read:
 *   - If the page exists and is uptodate, the kernel VM will provide the
 *     data and CLIO won't be intervened;
 *   - If the page was brought into memory by read ahead, it will be exported
 *     and the read ahead parameters will be updated;
 *   - Otherwise the page is not in memory and we can't do fast read.
 *     Therefore, it will go back and invoke normal read, i.e., a cl_io will
 *     be created and a DLM lock will be requested.
 *
 * POSIX compliance: the POSIX standard states that read is intended to be
 * atomic. The Lustre read implementation is in line with the Linux kernel
 * read implementation and neither of them complies with the POSIX standard
 * in this matter. Fast read doesn't make the situation worse on a single
 * node, but it may interleave write results from multiple nodes due to the
 * short read handling in ll_file_aio_read().
 *
 * \param env - lu_env
 * \param iocb - kiocb from kernel
 * \param iter - user space buffers where the data will be copied
 *
 * \retval - number of bytes read, or error code if an error occurred.
 */
static ssize_t
ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
{
	ssize_t result;

	if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
		return 0;

	/* NB: we can't do direct IO for fast read because it will need a lock
	 * to make the IO engine happy. */
	if (iocb->ki_filp->f_flags & O_DIRECT)
		return 0;

	result = generic_file_read_iter(iocb, iter);

	/* If the first page is not in cache, generic_file_read_iter() will
	 * return -ENODATA.
	 * See corresponding code in ll_readpage(). */
	if (result == -ENODATA)
		result = 0;

	if (result > 0)
		ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
				   LPROC_LL_READ_BYTES, result);

	return result;
}
/*
 * Read from a file (through the page cache).
 */
static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct lu_env *env;
	struct vvp_io_args *args;
	struct file *file = iocb->ki_filp;
	ssize_t result;
	ssize_t rc2;
	__u16 refcheck;

	if (!iov_iter_count(to))
		return 0;

	result = ll_do_fast_read(iocb, to);
	if (result < 0 || iov_iter_count(to) == 0)
		GOTO(out, result);

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		return PTR_ERR(env);

	args = ll_env_args(env, IO_NORMAL);
	args->u.normal.via_iter = to;
	args->u.normal.via_iocb = iocb;

	rc2 = ll_file_io_generic(env, args, file, CIT_READ,
				 &iocb->ki_pos, iov_iter_count(to));
	if (rc2 > 0)
		result += rc2;
	else if (result == 0)
		result = rc2;

	cl_env_put(env, &refcheck);
out:
	if (result > 0)
		ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
				  LUSTRE_FPRIVATE(file), iocb->ki_pos, result,
				  READ);

	return result;
}
/**
 * Similar trick to ll_do_fast_read: this improves write speed for tiny
 * writes. If a page is already in the page cache and dirty (and some other
 * things - see ll_tiny_write_begin for the instantiation of these rules),
 * then we can write to it without doing a full I/O, because Lustre already
 * knows about it and will write it out. This saves a lot of processing time.
 *
 * All writes here are within one page, so exclusion is handled by the page
 * lock on the vm page. We do not do tiny writes for writes which touch
 * multiple pages because it's very unlikely multiple sequential pages are
 * already dirty.
 *
 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively
 * common and are unlikely to be to already dirty pages.
 *
 * Attribute updates are important here, we do them in ll_tiny_write_end.
 */
static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
{
	ssize_t count = iov_iter_count(iter);
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	bool lock_inode = !IS_NOSEC(inode);
	ssize_t result = 0;

	ENTRY;

	/* Restrict writes to a single page and < PAGE_SIZE. See comment at
	 * top of function for why.
	 */
	if (count >= PAGE_SIZE ||
	    (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
		RETURN(result);

	if (unlikely(lock_inode))
		inode_lock(inode);
	result = __generic_file_write_iter(iocb, iter);

	if (unlikely(lock_inode))
		inode_unlock(inode);

	/* If the page is not already dirty, ll_tiny_write_begin returns
	 * -ENODATA. We continue on to normal write.
	 */
	if (result == -ENODATA)
		result = 0;

	if (result > 0) {
		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
				   result);
		ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
	}

	CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);

	RETURN(result);
}
/*
 * Write to a file (through the page cache).
 */
static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct vvp_io_args *args;
	struct lu_env *env;
	ssize_t rc_tiny = 0, rc_normal;
	struct file *file = iocb->ki_filp;
	__u16 refcheck;

	ENTRY;

	if (!iov_iter_count(from))
		GOTO(out, rc_normal = 0);

	/* NB: we can't do direct IO for tiny writes because they use the page
	 * cache, we can't do sync writes because tiny writes can't flush
	 * pages, and we can't do append writes because we can't guarantee the
	 * required DLM locks are held to protect file size.
	 */
	if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(file))) &&
	    !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
		rc_tiny = ll_do_tiny_write(iocb, from);

	/* In case of error, go on and try normal write - only stop if tiny
	 * write completed the I/O.
	 */
	if (iov_iter_count(from) == 0)
		GOTO(out, rc_normal = rc_tiny);

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		return PTR_ERR(env);

	args = ll_env_args(env, IO_NORMAL);
	args->u.normal.via_iter = from;
	args->u.normal.via_iocb = iocb;

	rc_normal = ll_file_io_generic(env, args, file, CIT_WRITE,
				       &iocb->ki_pos, iov_iter_count(from));

	/* On success, combine bytes written. */
	if (rc_tiny >= 0 && rc_normal > 0)
		rc_normal += rc_tiny;
	/* On error, only return the error from the normal write if the tiny
	 * write did not write any bytes. Otherwise return the bytes written
	 * by the tiny write.
	 */
	else if (rc_tiny > 0)
		rc_normal = rc_tiny;

	cl_env_put(env, &refcheck);
out:
	if (rc_normal > 0)
		ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
				  LUSTRE_FPRIVATE(file), iocb->ki_pos,
				  rc_normal, WRITE);

	RETURN(rc_normal);
}
#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
/*
 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 */
static int ll_file_get_iov_count(const struct iovec *iov,
				 unsigned long *nr_segs, size_t *count)
{
	size_t cnt = 0;
	unsigned long seg;

	for (seg = 0; seg < *nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		cnt += iv->iov_len;
		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
			continue;
		if (seg == 0)
			return -EFAULT;
		*nr_segs = seg;
		cnt -= iv->iov_len;	/* This segment is no good */
		break;
	}
	*count = cnt;
	return 0;
}
static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos)
{
	struct iov_iter to;
	size_t iov_count;
	ssize_t result;
	ENTRY;

	result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
	if (result)
		RETURN(result);

	if (!iov_count)
		RETURN(0);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
	iov_iter_init(&to, READ, iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
	iov_iter_init(&to, iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

	result = ll_file_read_iter(iocb, &to);

	RETURN(result);
}
static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
			    loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = count };
	struct kiocb kiocb;
	ssize_t result;

	ENTRY;

	init_sync_kiocb(&kiocb, file);
	kiocb.ki_pos = *ppos;
#ifdef HAVE_KIOCB_KI_LEFT
	kiocb.ki_left = count;
#elif defined(HAVE_KI_NBYTES)
	kiocb.ki_nbytes = count;
#endif

	result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
	*ppos = kiocb.ki_pos;

	RETURN(result);
}
/*
 * Write to a file (through the page cache).
 */
static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
				 unsigned long nr_segs, loff_t pos)
{
	struct iov_iter from;
	size_t iov_count;
	ssize_t result;
	ENTRY;

	result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
	if (result)
		RETURN(result);

	if (!iov_count)
		RETURN(0);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
	iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
	iov_iter_init(&from, iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

	result = ll_file_write_iter(iocb, &from);

	RETURN(result);
}
static ssize_t ll_file_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
	struct iovec iov = { .iov_base = (void __user *)buf,
			     .iov_len = count };
	struct kiocb kiocb;
	ssize_t result;

	ENTRY;

	init_sync_kiocb(&kiocb, file);
	kiocb.ki_pos = *ppos;
#ifdef HAVE_KIOCB_KI_LEFT
	kiocb.ki_left = count;
#elif defined(HAVE_KI_NBYTES)
	kiocb.ki_nbytes = count;
#endif

	result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
	*ppos = kiocb.ki_pos;

	RETURN(result);
}
#endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */

/*
 * Send file content (through the page cache) somewhere with a helper
 */
static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
				   struct pipe_inode_info *pipe, size_t count,
				   unsigned int flags)
{
	struct lu_env *env;
	struct vvp_io_args *args;
	ssize_t result;
	__u16 refcheck;
	ENTRY;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	args = ll_env_args(env, IO_SPLICE);
	args->u.splice.via_pipe = pipe;
	args->u.splice.via_flags = flags;

	result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
	cl_env_put(env, &refcheck);

	if (result > 0)
		ll_rw_stats_tally(ll_i2sbi(file_inode(in_file)), current->pid,
				  LUSTRE_FPRIVATE(in_file), *ppos, result,
				  READ);
	RETURN(result);
}
int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
			     __u64 flags, struct lov_user_md *lum, int lum_size)
{
	struct lookup_intent oit = {
		.it_op = IT_OPEN,
		.it_flags = flags | MDS_OPEN_BY_FID,
	};
	int rc;
	ENTRY;

	if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) ==
	    le32_to_cpu(LOV_MAGIC_MAGIC)) {
		/* this code will only exist for big-endian systems */
		lustre_swab_lov_user_md(lum, 0);
	}

	ll_inode_size_lock(inode);
	rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
	if (rc < 0)
		GOTO(out_unlock, rc);

	ll_release_openhandle(dentry, &oit);

out_unlock:
	ll_inode_size_unlock(inode);
	ll_intent_release(&oit);

	RETURN(rc);
}
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
			     struct lov_mds_md **lmmp, int *lmm_size,
			     struct ptlrpc_request **request)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct mdt_body *body;
	struct lov_mds_md *lmm = NULL;
	struct ptlrpc_request *req = NULL;
	struct md_op_data *op_data;
	int rc, lmmsize;

	rc = ll_get_default_mdsize(sbi, &lmmsize);
	if (rc)
		RETURN(rc);

	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
				     strlen(filename), lmmsize,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
	ll_finish_md_op_data(op_data);
	if (rc < 0) {
		CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
		       filename, rc);
		GOTO(out, rc);
	}

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	LASSERT(body != NULL); /* checked by mdc_getattr_name */

	lmmsize = body->mbo_eadatasize;

	if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
	    lmmsize == 0)
		GOTO(out, rc = -ENODATA);

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
	LASSERT(lmm != NULL);

	if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
	    lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
	    lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
		GOTO(out, rc = -EPROTO);

	/*
	 * This is coming from the MDS, so is probably in
	 * little endian. We convert it to host endian before
	 * passing it to userspace.
	 */
	if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) ==
	    __swab32(LOV_MAGIC_MAGIC)) {
		int stripe_count = 0;

		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
		    lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
			if (le32_to_cpu(lmm->lmm_pattern) &
			    LOV_PATTERN_F_RELEASED)
				stripe_count = 0;
		}

		lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0);

		/* if the function was called for a directory, we should
		 * avoid swabbing nonexistent lsm objects */
		if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode))
			lustre_swab_lov_user_md_objects(
				((struct lov_user_md_v1 *)lmm)->lmm_objects,
				stripe_count);
		else if (lmm->lmm_magic == LOV_MAGIC_V3 &&
			 S_ISREG(body->mbo_mode))
			lustre_swab_lov_user_md_objects(
				((struct lov_user_md_v3 *)lmm)->lmm_objects,
				stripe_count);
	}

out:
	*lmmp = lmm;
	*lmm_size = lmmsize;
	*request = req;
	return rc;
}
static int ll_lov_setea(struct inode *inode, struct file *file,
			void __user *arg)
{
	__u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
	struct lov_user_md *lump;
	int lum_size = sizeof(struct lov_user_md) +
		       sizeof(struct lov_user_ost_data);
	int rc;
	ENTRY;

	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
		RETURN(-EPERM);

	OBD_ALLOC_LARGE(lump, lum_size);
	if (lump == NULL)
		RETURN(-ENOMEM);

	if (copy_from_user(lump, arg, lum_size))
		GOTO(out_lump, rc = -EFAULT);

	rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
				      lum_size);
	cl_lov_delay_create_clear(&file->f_flags);

out_lump:
	OBD_FREE_LARGE(lump, lum_size);
	RETURN(rc);
}
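
/* Copy the file's striping information into the userspace buffer \a lum of
 * \a size bytes via the cl_object layer. */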
static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
{
	struct lu_env *env;
	__u16 refcheck;
	int rc;
	ENTRY;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
	cl_env_put(env, &refcheck);

	RETURN(rc);
}
static int ll_lov_setstripe(struct inode *inode, struct file *file,
			    void __user *arg)
{
	struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
	struct lov_user_md *klum;
	int lum_size, rc;
	__u64 flags = FMODE_WRITE;
	ENTRY;

	rc = ll_copy_user_md(lum, &klum);
	if (rc < 0)
		RETURN(rc);

	lum_size = rc;
	rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
				      lum_size);
	if (!rc) {
		__u32 gen;

		rc = put_user(0, &lum->lmm_stripe_count);
		if (rc)
			GOTO(out, rc);

		rc = ll_layout_refresh(inode, &gen);
		if (rc)
			GOTO(out, rc);

		rc = ll_file_getstripe(inode, arg, lum_size);
	}
	cl_lov_delay_create_clear(&file->f_flags);

out:
	OBD_FREE_LARGE(klum, lum_size);
	RETURN(rc);
}
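
/*
 * Take a group lock with group id \a arg on the file. The full layout is
 * instantiated first, because PFL could otherwise add OST objects during
 * the I/O, which the group lock has to protect.
 */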
static int
ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_grouplock grouplock;
	int rc;
	ENTRY;

	if (arg == 0) {
		CWARN("group id for group lock must not be 0\n");
		RETURN(-EINVAL);
	}

	if (ll_file_nolock(file))
		RETURN(-EOPNOTSUPP);

	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		CWARN("group lock already exists with gid %lu\n",
		      fd->fd_grouplock.lg_gid);
		spin_unlock(&lli->lli_lock);
		RETURN(-EINVAL);
	}
	LASSERT(fd->fd_grouplock.lg_lock == NULL);
	spin_unlock(&lli->lli_lock);

	/**
	 * XXX: the group lock needs to protect all OST objects, while PFL
	 * can add new OST objects during the IO, so we instantiate all OST
	 * objects before getting the group lock.
	 */
	if (obj) {
		struct lu_env *env;
		__u16 refcheck;
		struct cl_layout cl = {
			.cl_is_composite = false,
		};
		struct lu_extent ext = {
			.e_start = 0,
			.e_end = OBD_OBJECT_EOF,
		};

		env = cl_env_get(&refcheck);
		if (IS_ERR(env))
			RETURN(PTR_ERR(env));

		rc = cl_object_layout_get(env, obj, &cl);
		if (!rc && cl.cl_is_composite)
			rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
						    &ext);

		cl_env_put(env, &refcheck);
		if (rc)
			RETURN(rc);
	}

	rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
			      arg, (file->f_flags & O_NONBLOCK), &grouplock);
	if (rc)
		RETURN(rc);

	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		spin_unlock(&lli->lli_lock);
		CERROR("another thread just won the race\n");
		cl_put_grouplock(&grouplock);
		RETURN(-EINVAL);
	}

	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
	fd->fd_grouplock = grouplock;
	spin_unlock(&lli->lli_lock);

	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
	RETURN(0);
}
static int ll_put_grouplock(struct inode *inode, struct file *file,
			    unsigned long arg)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_grouplock grouplock;
	ENTRY;

	spin_lock(&lli->lli_lock);
	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
		spin_unlock(&lli->lli_lock);
		CWARN("no group lock held\n");
		RETURN(-EINVAL);
	}

	LASSERT(fd->fd_grouplock.lg_lock != NULL);

	if (fd->fd_grouplock.lg_gid != arg) {
		CWARN("group lock %lu doesn't match current id %lu\n",
		      arg, fd->fd_grouplock.lg_gid);
		spin_unlock(&lli->lli_lock);
		RETURN(-EINVAL);
	}

	grouplock = fd->fd_grouplock;
	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
	spin_unlock(&lli->lli_lock);

	cl_put_grouplock(&grouplock);
	CDEBUG(D_INFO, "group lock %lu released\n", arg);
	RETURN(0);
}
/**
 * Close inode open handle
 *
 * \param dentry [in]	  dentry which contains the inode
 * \param it	 [in,out] intent which contains open info and result
 *
 * \retval 0	 success
 * \retval <0	 failure
 */
int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
{
	struct inode *inode = dentry->d_inode;
	struct obd_client_handle *och;
	int rc;
	ENTRY;

	LASSERT(inode);

	/* Root ? Do nothing. */
	if (dentry->d_inode->i_sb->s_root == dentry)
		RETURN(0);

	/* No open handle to close? Move away */
	if (!it_disposition(it, DISP_OPEN_OPEN))
		RETURN(0);

	LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);

	OBD_ALLOC(och, sizeof(*och));
	if (!och)
		GOTO(out, rc = -ENOMEM);

	rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
	if (rc)
		GOTO(out, rc);

	rc = ll_close_inode_openhandle(inode, och, 0, NULL);
out:
	/* this one is in place of ll_file_open */
	if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->it_request);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
	}

	RETURN(rc);
}
2270 * Get the size of the inode for which a FIEMAP mapping is requested.
2271 * Make the FIEMAP get_info call and return the result.
2272 * \param fiemap kernel buffer to hold extents
2273 * \param num_bytes kernel buffer size
2275 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2281 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2284 /* Checks for fiemap flags */
2285 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2286 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2290 /* Check for FIEMAP_FLAG_SYNC */
2291 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2292 rc = filemap_fdatawrite(inode->i_mapping);
2297 env = cl_env_get(&refcheck);
2299 RETURN(PTR_ERR(env));
2301 if (i_size_read(inode) == 0) {
2302 rc = ll_glimpse_size(inode);
2307 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2308 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2309 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2311 /* If the file size is 0, there are no objects to map */
2312 if (fmkey.lfik_oa.o_size == 0) {
2313 fiemap->fm_mapped_extents = 0;
2317 fmkey.lfik_fiemap = *fiemap;
2319 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2320 &fmkey, fiemap, &num_bytes);
2322 cl_env_put(env, &refcheck);
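/*
 * For reference, userspace exercises this path through the standard
 * FS_IOC_FIEMAP ioctl; a minimal sketch using only the stock
 * <linux/fiemap.h> definitions:
 *
 *	unsigned int count = 32;
 *	struct fiemap *fm;
 *
 *	fm = calloc(1, sizeof(*fm) + count * sizeof(struct fiemap_extent));
 *	fm->fm_start = 0;
 *	fm->fm_length = FIEMAP_MAX_OFFSET;
 *	fm->fm_extent_count = count;
 *	ioctl(fd, FS_IOC_FIEMAP, fm);
 *	(fm->fm_mapped_extents entries of fm->fm_extents[] are now valid)
 */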
2326 int ll_fid2path(struct inode *inode, void __user *arg)
2328 struct obd_export *exp = ll_i2mdexp(inode);
2329 const struct getinfo_fid2path __user *gfin = arg;
2331 struct getinfo_fid2path *gfout;
2337 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2338 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2341 /* Only need to get the buflen */
2342 if (get_user(pathlen, &gfin->gf_pathlen))
2345 if (pathlen > PATH_MAX)
2348 outsize = sizeof(*gfout) + pathlen;
2349 OBD_ALLOC(gfout, outsize);
2353 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2354 GOTO(gf_free, rc = -EFAULT);
2355 /* append the root FID after gfout to let the MDT know the root FID,
2356 * so that it can look up the correct path; this is mainly for filesets.
2357 * Older servers without fileset mount support will ignore this. */
2358 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2360 /* Call mdc_iocontrol */
2361 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2365 if (copy_to_user(arg, gfout, outsize))
2369 OBD_FREE(gfout, outsize);
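/*
 * A hedged sketch of calling OBD_IOC_FID2PATH from userspace (the gf_fid
 * and gf_path field names are assumed from the Lustre uapi headers; treat
 * this as illustrative, not authoritative):
 *
 *	int len = PATH_MAX;
 *	struct getinfo_fid2path *gf;
 *
 *	gf = calloc(1, sizeof(*gf) + len);
 *	gf->gf_fid = fid;		(the FID to resolve)
 *	gf->gf_pathlen = len;
 *	ioctl(fd, OBD_IOC_FID2PATH, gf);
 *	printf("%s\n", gf->gf_u.gf_path);
 */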
2374 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2376 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2384 ioc->idv_version = 0;
2385 ioc->idv_layout_version = UINT_MAX;
2387 /* If no file object has been initialized, we consider its version to be 0. */
2391 env = cl_env_get(&refcheck);
2393 RETURN(PTR_ERR(env));
2395 io = vvp_env_thread_io(env);
2397 io->u.ci_data_version.dv_data_version = 0;
2398 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2399 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2402 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2403 result = cl_io_loop(env, io);
2405 result = io->ci_result;
2407 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2408 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2410 cl_io_fini(env, io);
2412 if (unlikely(io->ci_need_restart))
2415 cl_env_put(env, &refcheck);
2421 * Read the data_version for the inode.
2423 * This value is computed from the stripe object versions on the OSTs.
2424 * The version is computed using server-side locking.
2426 * @param flags whether to sync on the OST side:
2428 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2429 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
2431 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2433 struct ioc_data_version ioc = { .idv_flags = flags };
2436 rc = ll_ioc_data_version(inode, &ioc);
2438 *data_version = ioc.idv_version;
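/*
 * Userspace reaches ll_ioc_data_version() via LL_IOC_DATA_VERSION; a
 * minimal sketch using the idv_ fields handled above:
 *
 *	struct ioc_data_version idv = { .idv_flags = LL_DV_RD_FLUSH };
 *
 *	ioctl(fd, LL_IOC_DATA_VERSION, &idv);
 *	(idv.idv_version now holds the aggregated data version)
 */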
2444 * Trigger an HSM release request for the provided inode.
2446 int ll_hsm_release(struct inode *inode)
2449 struct obd_client_handle *och = NULL;
2450 __u64 data_version = 0;
2455 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2456 ll_get_fsname(inode->i_sb, NULL, 0),
2457 PFID(&ll_i2info(inode)->lli_fid));
2459 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2461 GOTO(out, rc = PTR_ERR(och));
2463 /* Grab latest data_version and [am]time values */
2464 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2468 env = cl_env_get(&refcheck);
2470 GOTO(out, rc = PTR_ERR(env));
2472 rc = ll_merge_attr(env, inode);
2473 cl_env_put(env, &refcheck);
2475 /* If an error happened, we would have the wrong size for the file.
2481 /* Release the file.
2482 * NB: the lease lock handle is released in mdc_hsm_release_pack() because
2483 * we still need it to pack the l_remote_handle to the MDT. */
2484 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2490 if (och != NULL && !IS_ERR(och)) /* close the file */
2491 ll_lease_close(och, inode, NULL);
2496 struct ll_swap_stack {
2499 struct inode *inode1;
2500 struct inode *inode2;
2505 static int ll_swap_layouts(struct file *file1, struct file *file2,
2506 struct lustre_swap_layouts *lsl)
2508 struct mdc_swap_layouts msl;
2509 struct md_op_data *op_data;
2512 struct ll_swap_stack *llss = NULL;
2515 OBD_ALLOC_PTR(llss);
2519 llss->inode1 = file_inode(file1);
2520 llss->inode2 = file_inode(file2);
2522 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2526 /* we use two bools because they are easier to swap than two bits */
2527 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2528 llss->check_dv1 = true;
2530 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2531 llss->check_dv2 = true;
2533 /* we cannot use lsl->sl_dvX directly because we may swap them */
2534 llss->dv1 = lsl->sl_dv1;
2535 llss->dv2 = lsl->sl_dv2;
2537 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2538 if (rc == 0) /* same file, done! */
2541 if (rc < 0) { /* sequentialize it */
2542 swap(llss->inode1, llss->inode2);
2544 swap(llss->dv1, llss->dv2);
2545 swap(llss->check_dv1, llss->check_dv2);
2549 if (gid != 0) { /* application asks to flush dirty cache */
2550 rc = ll_get_grouplock(llss->inode1, file1, gid);
2554 rc = ll_get_grouplock(llss->inode2, file2, gid);
2556 ll_put_grouplock(llss->inode1, file1, gid);
2561 /* final check: before swapping the layouts, verify that the
2562 * data version has not changed (if requested) */
2563 if (llss->check_dv1) {
2564 rc = ll_data_version(llss->inode1, &dv, 0);
2567 if (dv != llss->dv1)
2568 GOTO(putgl, rc = -EAGAIN);
2571 if (llss->check_dv2) {
2572 rc = ll_data_version(llss->inode2, &dv, 0);
2575 if (dv != llss->dv2)
2576 GOTO(putgl, rc = -EAGAIN);
2579 /* struct md_op_data is used to send the swap args to the mdt;
2580 * only the flags field is missing, so we pass struct mdc_swap_layouts
2581 * through md_op_data->op_data */
2582 /* flags from user space have to be converted before they are sent to
2583 * the server; no flags are sent today, they are only used on the client */
2586 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2587 0, LUSTRE_OPC_ANY, &msl);
2588 if (IS_ERR(op_data))
2589 GOTO(free, rc = PTR_ERR(op_data));
2591 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2592 sizeof(*op_data), op_data, NULL);
2593 ll_finish_md_op_data(op_data);
2600 ll_put_grouplock(llss->inode2, file2, gid);
2601 ll_put_grouplock(llss->inode1, file1, gid);
2611 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2613 struct obd_export *exp = ll_i2mdexp(inode);
2614 struct md_op_data *op_data;
2618 /* Detect out-of-range masks */
2619 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2622 /* Non-root users are forbidden to set or clear flags which are
2623 * NOT defined in HSM_USER_MASK. */
2624 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2625 !cfs_capable(CFS_CAP_SYS_ADMIN))
2628 if (!exp_connect_archive_id_array(exp)) {
2629 /* Detect out-of-range archive id */
2630 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2631 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2635 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2636 LUSTRE_OPC_ANY, hss);
2637 if (IS_ERR(op_data))
2638 RETURN(PTR_ERR(op_data));
2640 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2643 ll_finish_md_op_data(op_data);
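/*
 * A short sketch of driving ll_hsm_state_set() from userspace through
 * LL_IOC_HSM_STATE_SET (HS_DIRTY is an assumed example flag from the
 * HSM_USER_MASK set):
 *
 *	struct hsm_state_set hss = {
 *		.hss_valid = HSS_SETMASK,
 *		.hss_setmask = HS_DIRTY,
 *	};
 *
 *	ioctl(fd, LL_IOC_HSM_STATE_SET, &hss);
 */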
2648 static int ll_hsm_import(struct inode *inode, struct file *file,
2649 struct hsm_user_import *hui)
2651 struct hsm_state_set *hss = NULL;
2652 struct iattr *attr = NULL;
2656 if (!S_ISREG(inode->i_mode))
2662 GOTO(out, rc = -ENOMEM);
2664 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2665 hss->hss_archive_id = hui->hui_archive_id;
2666 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2667 rc = ll_hsm_state_set(inode, hss);
2671 OBD_ALLOC_PTR(attr);
2673 GOTO(out, rc = -ENOMEM);
2675 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2676 attr->ia_mode |= S_IFREG;
2677 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2678 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2679 attr->ia_size = hui->hui_size;
2680 attr->ia_mtime.tv_sec = hui->hui_mtime;
2681 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2682 attr->ia_atime.tv_sec = hui->hui_atime;
2683 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2685 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2686 ATTR_UID | ATTR_GID |
2687 ATTR_MTIME | ATTR_MTIME_SET |
2688 ATTR_ATIME | ATTR_ATIME_SET;
2692 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2696 inode_unlock(inode);
2708 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2710 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2711 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2714 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2716 struct inode *inode = file_inode(file);
2718 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2719 ATTR_MTIME | ATTR_MTIME_SET |
2722 .tv_sec = lfu->lfu_atime_sec,
2723 .tv_nsec = lfu->lfu_atime_nsec,
2726 .tv_sec = lfu->lfu_mtime_sec,
2727 .tv_nsec = lfu->lfu_mtime_nsec,
2730 .tv_sec = lfu->lfu_ctime_sec,
2731 .tv_nsec = lfu->lfu_ctime_nsec,
2737 if (!capable(CAP_SYS_ADMIN))
2740 if (!S_ISREG(inode->i_mode))
2744 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2746 inode_unlock(inode);
2751 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2754 case MODE_READ_USER:
2756 case MODE_WRITE_USER:
2763 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2765 /* Used to allow the upper layers of the client to request an LDLM lock
2766 * without doing an actual read or write.
2768 * Used for ladvise lockahead to manually request specific locks.
2770 * \param[in] file file this ladvise lock request is on
2771 * \param[in] ladvise ladvise struct describing this lock request
2773 * \retval 0 success, no detailed result available (sync requests
2774 * and requests sent to the server [not handled locally]
2775 * cannot return detailed results)
2776 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2777 * see definitions for details.
2778 * \retval negative negative errno on error
2780 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2782 struct lu_env *env = NULL;
2783 struct cl_io *io = NULL;
2784 struct cl_lock *lock = NULL;
2785 struct cl_lock_descr *descr = NULL;
2786 struct dentry *dentry = file->f_path.dentry;
2787 struct inode *inode = dentry->d_inode;
2788 enum cl_lock_mode cl_mode;
2789 off_t start = ladvise->lla_start;
2790 off_t end = ladvise->lla_end;
2796 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2797 "start=%llu, end=%llu\n", dentry->d_name.len,
2798 dentry->d_name.name, dentry->d_inode,
2799 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2802 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2804 GOTO(out, result = cl_mode);
2806 /* Get IO environment */
2807 result = cl_io_get(inode, &env, &io, &refcheck);
2811 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2814 * nothing to do for this io. This currently happens when
2815 * stripe sub-objects are not yet created.
2817 result = io->ci_result;
2818 } else if (result == 0) {
2819 lock = vvp_env_lock(env);
2820 descr = &lock->cll_descr;
2822 descr->cld_obj = io->ci_obj;
2823 /* Convert byte offsets to pages */
2824 descr->cld_start = cl_index(io->ci_obj, start);
2825 descr->cld_end = cl_index(io->ci_obj, end);
2826 descr->cld_mode = cl_mode;
2827 /* CEF_MUST is used because we do not want to convert a
2828 * lockahead request to a lockless lock */
2829 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2832 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2833 descr->cld_enq_flags |= CEF_SPECULATIVE;
2835 result = cl_lock_request(env, io, lock);
2837 /* On success, we need to release the lock */
2839 cl_lock_release(env, lock);
2841 cl_io_fini(env, io);
2842 cl_env_put(env, &refcheck);
2844 /* -ECANCELED indicates a matching lock with a different extent
2845 * was already present, and -EEXIST indicates a matching lock
2846 * on exactly the same extent was already present.
2847 * We convert them to positive values for userspace to make
2848 * recognizing true errors easier.
2849 * Note we can only return these detailed results on async requests,
2850 * as sync requests look the same as i/o requests for locking. */
2851 if (result == -ECANCELED)
2852 result = LLA_RESULT_DIFFERENT;
2853 else if (result == -EEXIST)
2854 result = LLA_RESULT_SAME;
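/*
 * Sketch of an async lockahead request and how the detailed result codes
 * above are interpreted (all fields and constants as consumed by this
 * function):
 *
 *	struct llapi_lu_ladvise ladvise = {
 *		.lla_advice = LU_LADVISE_LOCKAHEAD,
 *		.lla_lockahead_mode = MODE_WRITE_USER,
 *		.lla_peradvice_flags = LF_ASYNC,
 *		.lla_start = 0,
 *		.lla_end = 1 << 20,
 *	};
 *	int result = ll_file_lock_ahead(file, &ladvise);
 *
 *	result == 0:			lock request submitted
 *	result == LLA_RESULT_SAME:	matching lock, same extent
 *	result == LLA_RESULT_DIFFERENT:	matching lock, different extent
 *	result < 0:			error
 */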
2859 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2861 static int ll_ladvise_sanity(struct inode *inode,
2862 struct llapi_lu_ladvise *ladvise)
2864 enum lu_ladvise_type advice = ladvise->lla_advice;
2865 /* Note lla_peradvice_flags is a 32-bit field, so per-advice flags
2866 * must fit in the first 32 bits of enum ladvise_flags */
2867 __u32 flags = ladvise->lla_peradvice_flags;
2868 /* 3 lines at 80 characters per line, should be plenty */
2871 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2873 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized, "
2874 "last supported advice is %s (value '%d'): rc = %d\n",
2875 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2876 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2880 /* Per-advice checks */
2882 case LU_LADVISE_LOCKNOEXPAND:
2883 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2885 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2887 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2888 ladvise_names[advice], rc);
2892 case LU_LADVISE_LOCKAHEAD:
2893 /* Currently only READ and WRITE modes can be requested */
2894 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2895 ladvise->lla_lockahead_mode == 0) {
2897 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2899 ll_get_fsname(inode->i_sb, NULL, 0),
2900 ladvise->lla_lockahead_mode,
2901 ladvise_names[advice], rc);
2904 case LU_LADVISE_WILLREAD:
2905 case LU_LADVISE_DONTNEED:
2907 /* Note the fall-through above: these checks apply to all advice
2908 * types except LOCKNOEXPAND */
2909 if (flags & ~LF_DEFAULT_MASK) {
2911 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2913 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2914 ladvise_names[advice], rc);
2917 if (ladvise->lla_start >= ladvise->lla_end) {
2919 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2920 "for %s: rc = %d\n",
2921 ll_get_fsname(inode->i_sb, NULL, 0),
2922 ladvise->lla_start, ladvise->lla_end,
2923 ladvise_names[advice], rc);
2935 * Give file access advice
2937 * The ladvise interface is similar to the Linux fadvise() system call,
2938 * except it forwards the advice directly from the Lustre client to the
2939 * server. The server-side code will apply the appropriate read-ahead
2940 * and caching techniques for the corresponding files.
2942 * A typical workload for ladvise is e.g. a bunch of different clients
2943 * doing small random reads of a file, so prefetching pages into OSS cache
2944 * with big linear reads before the random IO is a net benefit. Fetching
2945 * all that data into each client cache with fadvise() may not be, due to
2946 * much more data being sent to the client.
2948 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2949 struct llapi_lu_ladvise *ladvise)
2953 struct cl_ladvise_io *lio;
2958 env = cl_env_get(&refcheck);
2960 RETURN(PTR_ERR(env));
2962 io = vvp_env_thread_io(env);
2963 io->ci_obj = ll_i2info(inode)->lli_clob;
2965 /* initialize parameters for ladvise */
2966 lio = &io->u.ci_ladvise;
2967 lio->li_start = ladvise->lla_start;
2968 lio->li_end = ladvise->lla_end;
2969 lio->li_fid = ll_inode2fid(inode);
2970 lio->li_advice = ladvise->lla_advice;
2971 lio->li_flags = flags;
2973 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2974 rc = cl_io_loop(env, io);
2978 cl_io_fini(env, io);
2979 cl_env_put(env, &refcheck);
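/*
 * Userspace reaches ll_ladvise() through LL_IOC_LADVISE with a
 * variable-length header; a minimal sketch using the lah_ and lla_
 * fields validated by the ioctl path further below:
 *
 *	struct llapi_ladvise_hdr *hdr;
 *
 *	hdr = calloc(1, sizeof(*hdr) + sizeof(struct llapi_lu_ladvise));
 *	hdr->lah_magic = LADVISE_MAGIC;
 *	hdr->lah_count = 1;
 *	hdr->lah_advise[0].lla_advice = LU_LADVISE_WILLREAD;
 *	hdr->lah_advise[0].lla_start = 0;
 *	hdr->lah_advise[0].lla_end = 16 << 20;
 *	ioctl(fd, LL_IOC_LADVISE, hdr);
 */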
2983 static int ll_lock_noexpand(struct file *file, int flags)
2985 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2987 fd->ll_lock_no_expand = !(flags & LF_UNSET);
2992 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2995 struct fsxattr fsxattr;
2997 if (copy_from_user(&fsxattr,
2998 (const struct fsxattr __user *)arg,
3002 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3003 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3004 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3005 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3006 if (copy_to_user((struct fsxattr __user *)arg,
3007 &fsxattr, sizeof(fsxattr)))
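/*
 * This mirrors the generic FS_IOC_FSGETXATTR interface, exposed here as
 * LL_IOC_FSGETXATTR; a minimal userspace sketch with the stock struct
 * fsxattr from <linux/fs.h>:
 *
 *	struct fsxattr fsx;
 *
 *	ioctl(fd, LL_IOC_FSGETXATTR, &fsx);
 *	(fsx.fsx_projid is the project quota ID; fsx.fsx_xflags carries
 *	 flags such as FS_XFLAG_PROJINHERIT)
 */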
3013 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3016 * Project Quota ID state is only allowed to change from within the init
3017 * namespace. Enforce that restriction only if we are trying to change
3018 * the quota ID state. Everything else is allowed in user namespaces.
3020 if (current_user_ns() == &init_user_ns)
3023 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3026 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3027 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3030 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3037 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3041 struct md_op_data *op_data;
3042 struct ptlrpc_request *req = NULL;
3044 struct fsxattr fsxattr;
3045 struct cl_object *obj;
3049 if (copy_from_user(&fsxattr,
3050 (const struct fsxattr __user *)arg,
3054 rc = ll_ioctl_check_project(inode, &fsxattr);
3058 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3059 LUSTRE_OPC_ANY, NULL);
3060 if (IS_ERR(op_data))
3061 RETURN(PTR_ERR(op_data));
3063 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3064 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3065 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3066 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3067 op_data->op_projid = fsxattr.fsx_projid;
3068 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3069 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3071 ptlrpc_req_finished(req);
3073 GOTO(out_fsxattr, rc);
3074 ll_update_inode_flags(inode, op_data->op_attr_flags);
3075 obj = ll_i2info(inode)->lli_clob;
3077 GOTO(out_fsxattr, rc);
3079 OBD_ALLOC_PTR(attr);
3081 GOTO(out_fsxattr, rc = -ENOMEM);
3083 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3084 fsxattr.fsx_xflags);
3087 ll_finish_md_op_data(op_data);
3091 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3094 struct inode *inode = file_inode(file);
3095 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3096 struct ll_inode_info *lli = ll_i2info(inode);
3097 struct obd_client_handle *och = NULL;
3098 struct split_param sp;
3101 enum mds_op_bias bias = 0;
3102 struct file *layout_file = NULL;
3104 size_t data_size = 0;
3108 mutex_lock(&lli->lli_och_mutex);
3109 if (fd->fd_lease_och != NULL) {
3110 och = fd->fd_lease_och;
3111 fd->fd_lease_och = NULL;
3113 mutex_unlock(&lli->lli_och_mutex);
3116 GOTO(out, rc = -ENOLCK);
3118 fmode = och->och_flags;
3120 switch (ioc->lil_flags) {
3121 case LL_LEASE_RESYNC_DONE:
3122 if (ioc->lil_count > IOC_IDS_MAX)
3123 GOTO(out, rc = -EINVAL);
3125 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3126 OBD_ALLOC(data, data_size);
3128 GOTO(out, rc = -ENOMEM);
3130 if (copy_from_user(data, (void __user *)arg, data_size))
3131 GOTO(out, rc = -EFAULT);
3133 bias = MDS_CLOSE_RESYNC_DONE;
3135 case LL_LEASE_LAYOUT_MERGE: {
3138 if (ioc->lil_count != 1)
3139 GOTO(out, rc = -EINVAL);
3141 arg += sizeof(*ioc);
3142 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3143 GOTO(out, rc = -EFAULT);
3145 layout_file = fget(fd);
3147 GOTO(out, rc = -EBADF);
3149 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3150 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3151 GOTO(out, rc = -EPERM);
3153 data = file_inode(layout_file);
3154 bias = MDS_CLOSE_LAYOUT_MERGE;
3157 case LL_LEASE_LAYOUT_SPLIT: {
3161 if (ioc->lil_count != 2)
3162 GOTO(out, rc = -EINVAL);
3164 arg += sizeof(*ioc);
3165 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3166 GOTO(out, rc = -EFAULT);
3168 arg += sizeof(__u32);
3169 if (copy_from_user(&mirror_id, (void __user *)arg,
3171 GOTO(out, rc = -EFAULT);
3173 layout_file = fget(fdv);
3175 GOTO(out, rc = -EBADF);
3177 sp.sp_inode = file_inode(layout_file);
3178 sp.sp_mirror_id = (__u16)mirror_id;
3180 bias = MDS_CLOSE_LAYOUT_SPLIT;
3184 /* without close intent */
3188 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3192 rc = ll_lease_och_release(inode, file);
3201 switch (ioc->lil_flags) {
3202 case LL_LEASE_RESYNC_DONE:
3204 OBD_FREE(data, data_size);
3206 case LL_LEASE_LAYOUT_MERGE:
3207 case LL_LEASE_LAYOUT_SPLIT:
3214 rc = ll_lease_type_from_fmode(fmode);
3218 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3221 struct inode *inode = file_inode(file);
3222 struct ll_inode_info *lli = ll_i2info(inode);
3223 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3224 struct obd_client_handle *och = NULL;
3225 __u64 open_flags = 0;
3231 switch (ioc->lil_mode) {
3232 case LL_LEASE_WRLCK:
3233 if (!(file->f_mode & FMODE_WRITE))
3235 fmode = FMODE_WRITE;
3237 case LL_LEASE_RDLCK:
3238 if (!(file->f_mode & FMODE_READ))
3242 case LL_LEASE_UNLCK:
3243 RETURN(ll_file_unlock_lease(file, ioc, arg));
3248 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3250 /* apply for lease */
3251 if (ioc->lil_flags & LL_LEASE_RESYNC)
3252 open_flags = MDS_OPEN_RESYNC;
3253 och = ll_lease_open(inode, file, fmode, open_flags);
3255 RETURN(PTR_ERR(och));
3257 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3258 rc = ll_lease_file_resync(och, inode, arg);
3260 ll_lease_close(och, inode, NULL);
3263 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3265 ll_lease_close(och, inode, NULL);
3271 mutex_lock(&lli->lli_och_mutex);
3272 if (fd->fd_lease_och == NULL) {
3273 fd->fd_lease_och = och;
3276 mutex_unlock(&lli->lli_och_mutex);
3278 /* cannot happen for now, since only exclusive leases are supported */
3279 ll_lease_close(och, inode, &lease_broken);
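/*
 * Sketch of taking and releasing a lease from userspace via
 * LL_IOC_SET_LEASE, using the lil_ fields parsed above:
 *
 *	struct ll_ioc_lease ioc = { .lil_mode = LL_LEASE_WRLCK };
 *
 *	ioctl(fd, LL_IOC_SET_LEASE, &ioc);	(apply for a write lease)
 *	(...)
 *	ioc.lil_mode = LL_LEASE_UNLCK;
 *	ioctl(fd, LL_IOC_SET_LEASE, &ioc);	(release; the return value
 *						 reports the lease type held)
 */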
3286 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3288 struct inode *inode = file_inode(file);
3289 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3293 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3294 PFID(ll_inode2fid(inode)), inode, cmd);
3295 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3297 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3298 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3302 case LL_IOC_GETFLAGS:
3303 /* Get the current value of the file flags */
3304 return put_user(fd->fd_flags, (int __user *)arg);
3305 case LL_IOC_SETFLAGS:
3306 case LL_IOC_CLRFLAGS:
3307 /* Set or clear specific file flags */
3308 /* XXX This probably needs checks to ensure the flags are
3309 * not abused, and to handle any flag side effects.
3311 if (get_user(flags, (int __user *) arg))
3314 if (cmd == LL_IOC_SETFLAGS) {
3315 if ((flags & LL_FILE_IGNORE_LOCK) &&
3316 !(file->f_flags & O_DIRECT)) {
3317 CERROR("%s: unable to disable locking on "
3318 "non-O_DIRECT file\n", current->comm);
3322 fd->fd_flags |= flags;
3324 fd->fd_flags &= ~flags;
3327 case LL_IOC_LOV_SETSTRIPE:
3328 case LL_IOC_LOV_SETSTRIPE_NEW:
3329 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3330 case LL_IOC_LOV_SETEA:
3331 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3332 case LL_IOC_LOV_SWAP_LAYOUTS: {
3334 struct lustre_swap_layouts lsl;
3336 if (copy_from_user(&lsl, (char __user *)arg,
3337 sizeof(struct lustre_swap_layouts)))
3340 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3343 file2 = fget(lsl.sl_fd);
3347 /* O_WRONLY or O_RDWR */
3348 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3349 GOTO(out, rc = -EPERM);
3351 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3352 struct inode *inode2;
3353 struct ll_inode_info *lli;
3354 struct obd_client_handle *och = NULL;
3356 lli = ll_i2info(inode);
3357 mutex_lock(&lli->lli_och_mutex);
3358 if (fd->fd_lease_och != NULL) {
3359 och = fd->fd_lease_och;
3360 fd->fd_lease_och = NULL;
3362 mutex_unlock(&lli->lli_och_mutex);
3364 GOTO(out, rc = -ENOLCK);
3365 inode2 = file_inode(file2);
3366 rc = ll_swap_layouts_close(och, inode, inode2);
3368 rc = ll_swap_layouts(file, file2, &lsl);
3374 case LL_IOC_LOV_GETSTRIPE:
3375 case LL_IOC_LOV_GETSTRIPE_NEW:
3376 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3377 case FS_IOC_GETFLAGS:
3378 case FS_IOC_SETFLAGS:
3379 RETURN(ll_iocontrol(inode, file, cmd, arg));
3380 case FSFILT_IOC_GETVERSION:
3381 case FS_IOC_GETVERSION:
3382 RETURN(put_user(inode->i_generation, (int __user *)arg));
3383 /* We need to special case any other ioctls we want to handle,
3384 * to send them to the MDS/OST as appropriate and to properly
3385 * network encode the arg field. */
3386 case FS_IOC_SETVERSION:
3389 case LL_IOC_GROUP_LOCK:
3390 RETURN(ll_get_grouplock(inode, file, arg));
3391 case LL_IOC_GROUP_UNLOCK:
3392 RETURN(ll_put_grouplock(inode, file, arg));
3393 case IOC_OBD_STATFS:
3394 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3396 case LL_IOC_FLUSHCTX:
3397 RETURN(ll_flush_ctx(inode));
3398 case LL_IOC_PATH2FID: {
3399 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3400 sizeof(struct lu_fid)))
3405 case LL_IOC_GETPARENT:
3406 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3408 case OBD_IOC_FID2PATH:
3409 RETURN(ll_fid2path(inode, (void __user *)arg));
3410 case LL_IOC_DATA_VERSION: {
3411 struct ioc_data_version idv;
3414 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3417 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3418 rc = ll_ioc_data_version(inode, &idv);
3421 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3427 case LL_IOC_GET_MDTIDX: {
3430 mdtidx = ll_get_mdt_idx(inode);
3434 if (put_user((int)mdtidx, (int __user *)arg))
3439 case OBD_IOC_GETDTNAME:
3440 case OBD_IOC_GETMDNAME:
3441 RETURN(ll_get_obd_name(inode, cmd, arg));
3442 case LL_IOC_HSM_STATE_GET: {
3443 struct md_op_data *op_data;
3444 struct hsm_user_state *hus;
3451 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3452 LUSTRE_OPC_ANY, hus);
3453 if (IS_ERR(op_data)) {
3455 RETURN(PTR_ERR(op_data));
3458 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3461 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3464 ll_finish_md_op_data(op_data);
3468 case LL_IOC_HSM_STATE_SET: {
3469 struct hsm_state_set *hss;
3476 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3481 rc = ll_hsm_state_set(inode, hss);
3486 case LL_IOC_HSM_ACTION: {
3487 struct md_op_data *op_data;
3488 struct hsm_current_action *hca;
3495 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3496 LUSTRE_OPC_ANY, hca);
3497 if (IS_ERR(op_data)) {
3499 RETURN(PTR_ERR(op_data));
3502 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3505 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3508 ll_finish_md_op_data(op_data);
3512 case LL_IOC_SET_LEASE_OLD: {
3513 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3515 RETURN(ll_file_set_lease(file, &ioc, 0));
3517 case LL_IOC_SET_LEASE: {
3518 struct ll_ioc_lease ioc;
3520 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3523 RETURN(ll_file_set_lease(file, &ioc, arg));
3525 case LL_IOC_GET_LEASE: {
3526 struct ll_inode_info *lli = ll_i2info(inode);
3527 struct ldlm_lock *lock = NULL;
3530 mutex_lock(&lli->lli_och_mutex);
3531 if (fd->fd_lease_och != NULL) {
3532 struct obd_client_handle *och = fd->fd_lease_och;
3534 lock = ldlm_handle2lock(&och->och_lease_handle);
3536 lock_res_and_lock(lock);
3537 if (!ldlm_is_cancel(lock))
3538 fmode = och->och_flags;
3540 unlock_res_and_lock(lock);
3541 LDLM_LOCK_PUT(lock);
3544 mutex_unlock(&lli->lli_och_mutex);
3546 RETURN(ll_lease_type_from_fmode(fmode));
3548 case LL_IOC_HSM_IMPORT: {
3549 struct hsm_user_import *hui;
3555 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3560 rc = ll_hsm_import(inode, file, hui);
3565 case LL_IOC_FUTIMES_3: {
3566 struct ll_futimes_3 lfu;
3568 if (copy_from_user(&lfu,
3569 (const struct ll_futimes_3 __user *)arg,
3573 RETURN(ll_file_futimes_3(file, &lfu));
3575 case LL_IOC_LADVISE: {
3576 struct llapi_ladvise_hdr *k_ladvise_hdr;
3577 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3580 int alloc_size = sizeof(*k_ladvise_hdr);
3583 u_ladvise_hdr = (void __user *)arg;
3584 OBD_ALLOC_PTR(k_ladvise_hdr);
3585 if (k_ladvise_hdr == NULL)
3588 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3589 GOTO(out_ladvise, rc = -EFAULT);
3591 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3592 k_ladvise_hdr->lah_count < 1)
3593 GOTO(out_ladvise, rc = -EINVAL);
3595 num_advise = k_ladvise_hdr->lah_count;
3596 if (num_advise >= LAH_COUNT_MAX)
3597 GOTO(out_ladvise, rc = -EFBIG);
3599 OBD_FREE_PTR(k_ladvise_hdr);
3600 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3601 lah_advise[num_advise]);
3602 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3603 if (k_ladvise_hdr == NULL)
3607 * TODO: submit multiple advices to one server in a single RPC
3609 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3610 GOTO(out_ladvise, rc = -EFAULT);
3612 for (i = 0; i < num_advise; i++) {
3613 struct llapi_lu_ladvise *k_ladvise =
3614 &k_ladvise_hdr->lah_advise[i];
3615 struct llapi_lu_ladvise __user *u_ladvise =
3616 &u_ladvise_hdr->lah_advise[i];
3618 rc = ll_ladvise_sanity(inode, k_ladvise);
3620 GOTO(out_ladvise, rc);
3622 switch (k_ladvise->lla_advice) {
3623 case LU_LADVISE_LOCKNOEXPAND:
3624 rc = ll_lock_noexpand(file,
3625 k_ladvise->lla_peradvice_flags);
3626 GOTO(out_ladvise, rc);
3627 case LU_LADVISE_LOCKAHEAD:
3629 rc = ll_file_lock_ahead(file, k_ladvise);
3632 GOTO(out_ladvise, rc);
3635 &u_ladvise->lla_lockahead_result))
3636 GOTO(out_ladvise, rc = -EFAULT);
3639 rc = ll_ladvise(inode, file,
3640 k_ladvise_hdr->lah_flags,
3643 GOTO(out_ladvise, rc);
3650 OBD_FREE(k_ladvise_hdr, alloc_size);
3653 case LL_IOC_FLR_SET_MIRROR: {
3654 /* mirror I/O must be direct to avoid polluting page cache
3656 if (!(file->f_flags & O_DIRECT))
3659 fd->fd_designated_mirror = (__u32)arg;
3662 case LL_IOC_FSGETXATTR:
3663 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3664 case LL_IOC_FSSETXATTR:
3665 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3667 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3669 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3670 (void __user *)arg));
3674 #ifndef HAVE_FILE_LLSEEK_SIZE
3675 static inline loff_t
3676 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3678 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3680 if (offset > maxsize)
3683 if (offset != file->f_pos) {
3684 file->f_pos = offset;
3685 file->f_version = 0;
3691 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3692 loff_t maxsize, loff_t eof)
3694 struct inode *inode = file_inode(file);
3702 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3703 * position-querying operation. Avoid rewriting the "same"
3704 * f_pos value back to the file because a concurrent read(),
3705 * write() or lseek() might have altered it
3710 * f_lock protects against read/modify/write race with other
3711 * SEEK_CURs. Note that parallel writes and reads behave
3715 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3716 inode_unlock(inode);
3720 * In the generic case the entire file is data, so as long as
3721 * offset isn't at the end of the file then the offset is data.
3728 * There is a virtual hole at the end of the file, so as long as
3729 * offset isn't i_size or larger, return i_size.
3737 return llseek_execute(file, offset, maxsize);
3741 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3743 struct inode *inode = file_inode(file);
3744 loff_t retval, eof = 0;
3747 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3748 (origin == SEEK_CUR) ? file->f_pos : 0);
3749 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3750 PFID(ll_inode2fid(inode)), inode, retval, retval,
3752 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3754 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3755 retval = ll_glimpse_size(inode);
3758 eof = i_size_read(inode);
3761 retval = ll_generic_file_llseek_size(file, offset, origin,
3762 ll_file_maxbytes(inode), eof);
3766 static int ll_flush(struct file *file, fl_owner_t id)
3768 struct inode *inode = file_inode(file);
3769 struct ll_inode_info *lli = ll_i2info(inode);
3770 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3773 LASSERT(!S_ISDIR(inode->i_mode));
3775 /* catch async errors that were recorded back when async writeback
3776 * failed for pages in this mapping. */
3777 rc = lli->lli_async_rc;
3778 lli->lli_async_rc = 0;
3779 if (lli->lli_clob != NULL) {
3780 err = lov_read_and_clear_async_rc(lli->lli_clob);
3785 /* The application has already been told about the write failure.
3786 * Do not report the failure again. */
3787 if (fd->fd_write_failed)
3789 return rc ? -EIO : 0;
3793 * Called to make sure a portion of the file has been written out.
3794 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OSTs.
3796 * Return how many pages have been written.
3798 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3799 enum cl_fsync_mode mode, int ignore_layout)
3803 struct cl_fsync_io *fio;
3808 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3809 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3812 env = cl_env_get(&refcheck);
3814 RETURN(PTR_ERR(env));
3816 io = vvp_env_thread_io(env);
3817 io->ci_obj = ll_i2info(inode)->lli_clob;
3818 io->ci_ignore_layout = ignore_layout;
3820 /* initialize parameters for sync */
3821 fio = &io->u.ci_fsync;
3822 fio->fi_start = start;
3824 fio->fi_fid = ll_inode2fid(inode);
3825 fio->fi_mode = mode;
3826 fio->fi_nr_written = 0;
3828 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3829 result = cl_io_loop(env, io);
3831 result = io->ci_result;
3833 result = fio->fi_nr_written;
3834 cl_io_fini(env, io);
3835 cl_env_put(env, &refcheck);
3841 * When dentry is provided (the 'else' case), file_dentry() may be
3842 * null and dentry must be used directly rather than pulled from
3843 * file_dentry() as is done otherwise.
3846 #ifdef HAVE_FILE_FSYNC_4ARGS
3847 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3849 struct dentry *dentry = file_dentry(file);
3850 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3851 int ll_fsync(struct file *file, int datasync)
3853 struct dentry *dentry = file_dentry(file);
3855 loff_t end = LLONG_MAX;
3857 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3860 loff_t end = LLONG_MAX;
3862 struct inode *inode = dentry->d_inode;
3863 struct ll_inode_info *lli = ll_i2info(inode);
3864 struct ptlrpc_request *req;
3868 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3869 PFID(ll_inode2fid(inode)), inode);
3870 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3872 #ifdef HAVE_FILE_FSYNC_4ARGS
3873 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3876 /* fsync's caller has already called _fdata{sync,write}, so we want
3877 * that IO to finish before calling the osc and mdc sync methods */
3878 rc = filemap_fdatawait(inode->i_mapping);
3881 /* catch async errors that were recorded back when async writeback
3882 * failed for pages in this mapping. */
3883 if (!S_ISDIR(inode->i_mode)) {
3884 err = lli->lli_async_rc;
3885 lli->lli_async_rc = 0;
3888 if (lli->lli_clob != NULL) {
3889 err = lov_read_and_clear_async_rc(lli->lli_clob);
3895 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3899 ptlrpc_req_finished(req);
3901 if (S_ISREG(inode->i_mode)) {
3902 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3904 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3905 if (rc == 0 && err < 0)
3908 fd->fd_write_failed = true;
3910 fd->fd_write_failed = false;
3913 #ifdef HAVE_FILE_FSYNC_4ARGS
3914 inode_unlock(inode);
3920 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3922 struct inode *inode = file_inode(file);
3923 struct ll_sb_info *sbi = ll_i2sbi(inode);
3924 struct ldlm_enqueue_info einfo = {
3925 .ei_type = LDLM_FLOCK,
3926 .ei_cb_cp = ldlm_flock_completion_ast,
3927 .ei_cbdata = file_lock,
3929 struct md_op_data *op_data;
3930 struct lustre_handle lockh = { 0 };
3931 union ldlm_policy_data flock = { { 0 } };
3932 int fl_type = file_lock->fl_type;
3938 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3939 PFID(ll_inode2fid(inode)), file_lock);
3941 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3943 if (file_lock->fl_flags & FL_FLOCK) {
3944 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3945 /* flocks are whole-file locks */
3946 flock.l_flock.end = OFFSET_MAX;
3947 /* For flocks, the owner is determined by the local file descriptor */
3948 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3949 } else if (file_lock->fl_flags & FL_POSIX) {
3950 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3951 flock.l_flock.start = file_lock->fl_start;
3952 flock.l_flock.end = file_lock->fl_end;
3956 flock.l_flock.pid = file_lock->fl_pid;
3958 /* Somewhat ugly workaround for svc lockd.
3959 * lockd installs a custom fl_lmops->lm_compare_owner that checks
3960 * that the fl_owner is the same (which it always is on the local node,
3961 * I guess, between lockd processes) and then compares the pid.
3962 * As such, we assign the pid to the owner field to make it all work;
3963 * conflicts with normal locks are unlikely since the pid space and
3964 * the pointer space for current->files do not intersect */
3965 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3966 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3970 einfo.ei_mode = LCK_PR;
3973 /* An unlock request may or may not have any relation to
3974 * existing locks so we may not be able to pass a lock handle
3975 * via a normal ldlm_lock_cancel() request. The request may even
3976 * unlock a byte range in the middle of an existing lock. In
3977 * order to process an unlock request we need all of the same
3978 * information that is given with a normal read or write record
3979 * lock request. To avoid creating another ldlm unlock (cancel)
3980 * message we'll treat a LCK_NL flock request as an unlock. */
3981 einfo.ei_mode = LCK_NL;
3984 einfo.ei_mode = LCK_PW;
3987 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4002 flags = LDLM_FL_BLOCK_NOWAIT;
4008 flags = LDLM_FL_TEST_LOCK;
4011 CERROR("unknown fcntl lock command: %d\n", cmd);
4015 /* Save the old mode so that if the mode in the lock changes we
4016 * can decrement the appropriate reader or writer refcount. */
4017 file_lock->fl_type = einfo.ei_mode;
4019 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4020 LUSTRE_OPC_ANY, NULL);
4021 if (IS_ERR(op_data))
4022 RETURN(PTR_ERR(op_data));
4024 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4025 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4026 flock.l_flock.pid, flags, einfo.ei_mode,
4027 flock.l_flock.start, flock.l_flock.end);
4029 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4032 /* Restore the file lock type if not TEST lock. */
4033 if (!(flags & LDLM_FL_TEST_LOCK))
4034 file_lock->fl_type = fl_type;
4036 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4037 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4038 !(flags & LDLM_FL_TEST_LOCK))
4039 rc2 = locks_lock_file_wait(file, file_lock);
4041 if ((file_lock->fl_flags & FL_FLOCK) &&
4042 (rc == 0 || file_lock->fl_type == F_UNLCK))
4043 rc2 = flock_lock_file_wait(file, file_lock);
4044 if ((file_lock->fl_flags & FL_POSIX) &&
4045 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4046 !(flags & LDLM_FL_TEST_LOCK))
4047 rc2 = posix_lock_file_wait(file, file_lock);
4048 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4050 if (rc2 && file_lock->fl_type != F_UNLCK) {
4051 einfo.ei_mode = LCK_NL;
4052 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4057 ll_finish_md_op_data(op_data);
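/*
 * The enqueue above is driven by the ordinary POSIX/BSD file locking
 * APIs; e.g. a whole-file write lock through fcntl():
 *
 *	struct flock fl = {
 *		.l_type = F_WRLCK,
 *		.l_whence = SEEK_SET,
 *		.l_start = 0,
 *		.l_len = 0,		(0 means "to EOF")
 *	};
 *
 *	fcntl(fd, F_SETLKW, &fl);	(blocks until granted)
 *	fl.l_type = F_UNLCK;
 *	fcntl(fd, F_SETLK, &fl);
 */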
4062 int ll_get_fid_by_name(struct inode *parent, const char *name,
4063 int namelen, struct lu_fid *fid,
4064 struct inode **inode)
4066 struct md_op_data *op_data = NULL;
4067 struct mdt_body *body;
4068 struct ptlrpc_request *req;
4072 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4073 LUSTRE_OPC_ANY, NULL);
4074 if (IS_ERR(op_data))
4075 RETURN(PTR_ERR(op_data));
4077 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4078 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4079 ll_finish_md_op_data(op_data);
4083 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4085 GOTO(out_req, rc = -EFAULT);
4087 *fid = body->mbo_fid1;
4090 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4092 ptlrpc_req_finished(req);
4096 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4099 struct dentry *dchild = NULL;
4100 struct inode *child_inode = NULL;
4101 struct md_op_data *op_data;
4102 struct ptlrpc_request *request = NULL;
4103 struct obd_client_handle *och = NULL;
4105 struct mdt_body *body;
4106 __u64 data_version = 0;
4107 size_t namelen = strlen(name);
4108 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4112 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4113 PFID(ll_inode2fid(parent)), name,
4114 lum->lum_stripe_offset, lum->lum_stripe_count);
4116 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4117 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4118 lustre_swab_lmv_user_md(lum);
4120 /* Get child FID first */
4121 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4124 dchild = d_lookup(file_dentry(file), &qstr);
4126 if (dchild->d_inode)
4127 child_inode = igrab(dchild->d_inode);
4132 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4141 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4142 OBD_CONNECT2_DIR_MIGRATE)) {
4143 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4144 ll_i2info(child_inode)->lli_lsm_md) {
4145 CERROR("%s: MDT doesn't support stripe directory "
4147 ll_get_fsname(parent->i_sb, NULL, 0));
4148 GOTO(out_iput, rc = -EOPNOTSUPP);
4153 * The lfs migrate command needs to be blocked on the client
4154 * by checking the migrate FID against the FID of the
4155 * filesystem root.
4157 if (child_inode == parent->i_sb->s_root->d_inode)
4158 GOTO(out_iput, rc = -EINVAL);
4160 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4161 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4162 if (IS_ERR(op_data))
4163 GOTO(out_iput, rc = PTR_ERR(op_data));
4165 inode_lock(child_inode);
4166 op_data->op_fid3 = *ll_inode2fid(child_inode);
4167 if (!fid_is_sane(&op_data->op_fid3)) {
4168 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4169 ll_get_fsname(parent->i_sb, NULL, 0), name,
4170 PFID(&op_data->op_fid3));
4171 GOTO(out_unlock, rc = -EINVAL);
4174 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4175 op_data->op_data = lum;
4176 op_data->op_data_size = lumlen;
4179 if (S_ISREG(child_inode->i_mode)) {
4180 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4184 GOTO(out_unlock, rc);
4187 rc = ll_data_version(child_inode, &data_version,
4190 GOTO(out_close, rc);
4192 op_data->op_open_handle = och->och_open_handle;
4193 op_data->op_data_version = data_version;
4194 op_data->op_lease_handle = och->och_lease_handle;
4195 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4197 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4198 och->och_mod->mod_open_req->rq_replay = 0;
4199 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4202 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4203 name, namelen, &request);
4205 LASSERT(request != NULL);
4206 ll_update_times(request, parent);
4208 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4209 LASSERT(body != NULL);
4211 /* If the server did release the layout lock, then clean up
4212 * the client och here; otherwise release it in out_close: */
4213 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4214 obd_mod_put(och->och_mod);
4215 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4217 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4223 if (request != NULL) {
4224 ptlrpc_req_finished(request);
4228 /* Try again if the file layout has changed. */
4229 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4234 ll_lease_close(och, child_inode, NULL);
4236 clear_nlink(child_inode);
4238 inode_unlock(child_inode);
4239 ll_finish_md_op_data(op_data);
4246 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4254 * Test whether some locks matching the bits and l_req_mode are acquired
4255 * - the bits can be spread across different locks
4256 * - if found, clear the common lock bits in *bits
4257 * - the bits not found are kept in *bits
4259 * \param bits [IN] searched lock bits
4260 * \param l_req_mode [IN] searched lock mode
4261 * \retval boolean, true iff all bits are found
4263 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4265 struct lustre_handle lockh;
4266 union ldlm_policy_data policy;
4267 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4268 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4277 fid = &ll_i2info(inode)->lli_fid;
4278 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4279 ldlm_lockname[mode]);
4281 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4282 for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) {
4283 policy.l_inodebits.bits = *bits & (1 << i);
4284 if (policy.l_inodebits.bits == 0)
4287 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4288 &policy, mode, &lockh)) {
4289 struct ldlm_lock *lock;
4291 lock = ldlm_handle2lock(&lockh);
4294 ~(lock->l_policy_data.l_inodebits.bits);
4295 LDLM_LOCK_PUT(lock);
4297 *bits &= ~policy.l_inodebits.bits;
4304 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4305 struct lustre_handle *lockh, __u64 flags,
4306 enum ldlm_mode mode)
4308 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4313 fid = &ll_i2info(inode)->lli_fid;
4314 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4316 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4317 fid, LDLM_IBITS, &policy, mode, lockh);
4322 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4324 /* Already unlinked. Just update nlink and return success */
4325 if (rc == -ENOENT) {
4327 /* If it is a striped directory and there is a bad stripe,
4328 * let's revalidate the dentry again, instead of returning
4330 if (S_ISDIR(inode->i_mode) &&
4331 ll_i2info(inode)->lli_lsm_md != NULL)
4334 /* This path cannot be hit for regular files unless in
4335 * case of obscure races, so no need to validate
4337 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4339 } else if (rc != 0) {
4340 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4341 "%s: revalidate FID "DFID" error: rc = %d\n",
4342 ll_get_fsname(inode->i_sb, NULL, 0),
4343 PFID(ll_inode2fid(inode)), rc);
4349 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4351 struct inode *parent;
4352 struct inode *inode = dentry->d_inode;
4353 struct obd_export *exp = ll_i2mdexp(inode);
4354 struct lookup_intent oit = {
4357 struct ptlrpc_request *req = NULL;
4358 struct md_op_data *op_data;
4359 const char *name = NULL;
4364 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4365 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4367 if (exp_connect_flags2(exp) & OBD_CONNECT2_GETATTR_PFID) {
4368 parent = dentry->d_parent->d_inode;
4369 name = dentry->d_name.name;
4370 namelen = dentry->d_name.len;
4375 op_data = ll_prep_md_op_data(NULL, parent, inode, name, namelen, 0,
4376 LUSTRE_OPC_ANY, NULL);
4377 if (IS_ERR(op_data))
4378 RETURN(PTR_ERR(op_data));
4380 /* Call getattr by fid */
4381 if (exp_connect_flags2(exp) & OBD_CONNECT2_GETATTR_PFID)
4382 op_data->op_flags = MF_GETATTR_BY_FID;
4383 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4384 ll_finish_md_op_data(op_data);
4386 rc = ll_inode_revalidate_fini(inode, rc);
4390 rc = ll_revalidate_it_finish(req, &oit, dentry);
4392 ll_intent_release(&oit);
4396 /* Unlinked? Unhash the dentry, so it is not picked up later by
4397 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4398 * here, in order to preserve get_cwd functionality on 2.6.
4400 if (!dentry->d_inode->i_nlink) {
4401 ll_lock_dcache(inode);
4402 d_lustre_invalidate(dentry, 0);
4403 ll_unlock_dcache(inode);
4406 ll_lookup_finish_locks(&oit, dentry);
4408 ptlrpc_req_finished(req);
4413 static int ll_merge_md_attr(struct inode *inode)
4415 struct ll_inode_info *lli = ll_i2info(inode);
4416 struct cl_attr attr = { 0 };
4419 LASSERT(lli->lli_lsm_md != NULL);
4420 down_read(&lli->lli_lsm_sem);
4421 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4422 &attr, ll_md_blocking_ast);
4423 up_read(&lli->lli_lsm_sem);
4427 set_nlink(inode, attr.cat_nlink);
4428 inode->i_blocks = attr.cat_blocks;
4429 i_size_write(inode, attr.cat_size);
4431 ll_i2info(inode)->lli_atime = attr.cat_atime;
4432 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4433 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4438 static inline dev_t ll_compat_encode_dev(dev_t dev)
4440 /* The compat_sys_*stat*() syscalls will fail unless the
4441 * device majors and minors are both less than 256. Note that
4442 * the value returned here will be passed through
4443 * old_encode_dev() in cp_compat_stat(). And so we are not
4444 * trying to return a valid compat (u16) device number, just
4445 * one that will pass the old_valid_dev() check. */
4447 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4450 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4451 int ll_getattr(const struct path *path, struct kstat *stat,
4452 u32 request_mask, unsigned int flags)
4454 struct dentry *de = path->dentry;
4456 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4459 struct inode *inode = de->d_inode;
4460 struct ll_sb_info *sbi = ll_i2sbi(inode);
4461 struct ll_inode_info *lli = ll_i2info(inode);
4464 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4466 rc = ll_inode_revalidate(de, IT_GETATTR);
4470 if (S_ISREG(inode->i_mode)) {
4471 /* In case of restore, the MDT has the right size and has
4472 * already sent it back without granting the layout lock, so
4473 * the inode is up-to-date and a glimpse is useless.
4474 * Also, to glimpse we need the layout; in case of a running
4475 * restore the MDT holds the layout lock, so the glimpse will
4476 * block up to the end of restore (getattr will block)
4478 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4479 rc = ll_glimpse_size(inode);
4484 /* If the object isn't a regular file, then don't validate its size. */
4485 if (S_ISDIR(inode->i_mode) &&
4486 lli->lli_lsm_md != NULL) {
4487 rc = ll_merge_md_attr(inode);
4492 inode->i_atime.tv_sec = lli->lli_atime;
4493 inode->i_mtime.tv_sec = lli->lli_mtime;
4494 inode->i_ctime.tv_sec = lli->lli_ctime;
4497 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4499 if (ll_need_32bit_api(sbi)) {
4500 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4501 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4502 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4504 stat->ino = inode->i_ino;
4505 stat->dev = inode->i_sb->s_dev;
4506 stat->rdev = inode->i_rdev;
4509 stat->mode = inode->i_mode;
4510 stat->uid = inode->i_uid;
4511 stat->gid = inode->i_gid;
4512 stat->atime = inode->i_atime;
4513 stat->mtime = inode->i_mtime;
4514 stat->ctime = inode->i_ctime;
4515 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4517 stat->nlink = inode->i_nlink;
4518 stat->size = i_size_read(inode);
4519 stat->blocks = inode->i_blocks;
4524 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4525 __u64 start, __u64 len)
4529 struct fiemap *fiemap;
4530 unsigned int extent_count = fieinfo->fi_extents_max;
4532 num_bytes = sizeof(*fiemap) + (extent_count *
4533 sizeof(struct fiemap_extent));
4534 OBD_ALLOC_LARGE(fiemap, num_bytes);
4539 fiemap->fm_flags = fieinfo->fi_flags;
4540 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4541 fiemap->fm_start = start;
4542 fiemap->fm_length = len;
4543 if (extent_count > 0 &&
4544 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4545 sizeof(struct fiemap_extent)) != 0)
4546 GOTO(out, rc = -EFAULT);
4548 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4550 fieinfo->fi_flags = fiemap->fm_flags;
4551 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4552 if (extent_count > 0 &&
4553 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4554 fiemap->fm_mapped_extents *
4555 sizeof(struct fiemap_extent)) != 0)
4556 GOTO(out, rc = -EFAULT);
4558 OBD_FREE_LARGE(fiemap, num_bytes);
4562 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4564 struct ll_inode_info *lli = ll_i2info(inode);
4565 struct posix_acl *acl = NULL;
4568 spin_lock(&lli->lli_lock);
4569 /* VFS' acl_permission_check->check_acl will release the refcount */
4570 acl = posix_acl_dup(lli->lli_posix_acl);
4571 spin_unlock(&lli->lli_lock);
4576 #ifdef HAVE_IOP_SET_ACL
4577 #ifdef CONFIG_FS_POSIX_ACL
4578 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4580 struct ll_sb_info *sbi = ll_i2sbi(inode);
4581 struct ptlrpc_request *req = NULL;
4582 const char *name = NULL;
4584 size_t value_size = 0;
4589 case ACL_TYPE_ACCESS:
4590 name = XATTR_NAME_POSIX_ACL_ACCESS;
4592 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4595 case ACL_TYPE_DEFAULT:
4596 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4597 if (!S_ISDIR(inode->i_mode))
4598 rc = acl ? -EACCES : 0;
4609 value_size = posix_acl_xattr_size(acl->a_count);
4610 value = kmalloc(value_size, GFP_NOFS);
4612 GOTO(out, rc = -ENOMEM);
4614 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4616 GOTO(out_value, rc);
4619 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4620 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4621 name, value, value_size, 0, 0, &req);
4623 ptlrpc_req_finished(req);
4628 forget_cached_acl(inode, type);
4630 set_cached_acl(inode, type, acl);
4633 #endif /* CONFIG_FS_POSIX_ACL */
4634 #endif /* HAVE_IOP_SET_ACL */
4636 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4638 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4639 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4641 ll_check_acl(struct inode *inode, int mask)
4644 # ifdef CONFIG_FS_POSIX_ACL
4645 struct posix_acl *acl;
4649 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4650 if (flags & IPERM_FLAG_RCU)
4653 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4658 rc = posix_acl_permission(inode, acl, mask);
4659 posix_acl_release(acl);
4662 # else /* !CONFIG_FS_POSIX_ACL */
4664 # endif /* CONFIG_FS_POSIX_ACL */
4666 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4668 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4669 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4671 # ifdef HAVE_INODE_PERMISION_2ARGS
4672 int ll_inode_permission(struct inode *inode, int mask)
4674 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4679 struct ll_sb_info *sbi;
4680 struct root_squash_info *squash;
4681 struct cred *cred = NULL;
4682 const struct cred *old_cred = NULL;
4684 bool squash_id = false;
4687 #ifdef MAY_NOT_BLOCK
4688 if (mask & MAY_NOT_BLOCK)
4690 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4691 if (flags & IPERM_FLAG_RCU)
4695 /* as the root inode is NOT validated during lookup,
4696 * we need to do it before the permission check. */
4698 if (inode == inode->i_sb->s_root->d_inode) {
4699 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4704 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4705 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4707 /* squash fsuid/fsgid if needed */
4708 sbi = ll_i2sbi(inode);
4709 squash = &sbi->ll_squash;
4710 if (unlikely(squash->rsi_uid != 0 &&
4711 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4712 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4716 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4717 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4718 squash->rsi_uid, squash->rsi_gid);
4720 /* update current process's credentials
4721 * and FS capability */
4722 cred = prepare_creds();
4726 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4727 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4728 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4729 if ((1 << cap) & CFS_CAP_FS_MASK)
4730 cap_lower(cred->cap_effective, cap);
4732 old_cred = override_creds(cred);
4735 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4736 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4737 /* restore current process's credentials and FS capability */
4739 revert_creds(old_cred);
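/*
 * The root-squash path above follows the standard kernel credential
 * override pattern. A minimal sketch (illustrative fragment; squashed_uid
 * is a hypothetical placeholder, error handling abbreviated):
 */
#if 0
        struct cred *cred = prepare_creds();
        const struct cred *old_cred;

        if (cred != NULL) {
                cred->fsuid = make_kuid(&init_user_ns, squashed_uid);
                old_cred = override_creds(cred); /* act with squashed identity */
                /* ... permission checks run under the squashed credentials ... */
                revert_creds(old_cred);          /* restore the original identity */
                put_cred(cred);                  /* drop our reference */
        }
#endif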
/* -o localflock - only provides locally consistent flock locks */
struct file_operations ll_file_operations = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
        .read           = new_sync_read,
        .write          = new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
        .read_iter      = ll_file_read_iter,
        .write_iter     = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .read           = ll_file_read,
        .aio_read       = ll_file_aio_read,
        .write          = ll_file_write,
        .aio_write      = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .unlocked_ioctl = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
        .splice_read    = ll_file_splice_read,
        .fsync          = ll_fsync,
        .flush          = ll_flush
};
struct file_operations ll_file_operations_flock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
        .read           = new_sync_read,
        .write          = new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
        .read_iter      = ll_file_read_iter,
        .write_iter     = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .read           = ll_file_read,
        .aio_read       = ll_file_aio_read,
        .write          = ll_file_write,
        .aio_write      = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .unlocked_ioctl = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
        .splice_read    = ll_file_splice_read,
        .fsync          = ll_fsync,
        .flush          = ll_flush,
        .flock          = ll_file_flock,
        .lock           = ll_file_flock
};
/* These are for -o noflock - to return ENOSYS on flock calls */
struct file_operations ll_file_operations_noflock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
        .read           = new_sync_read,
        .write          = new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
        .read_iter      = ll_file_read_iter,
        .write_iter     = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .read           = ll_file_read,
        .aio_read       = ll_file_aio_read,
        .write          = ll_file_write,
        .aio_write      = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .unlocked_ioctl = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
        .splice_read    = ll_file_splice_read,
        .fsync          = ll_fsync,
        .flush          = ll_flush,
        .flock          = ll_file_noflock,
        .lock           = ll_file_noflock
};
struct inode_operations ll_file_inode_operations = {
        .setattr        = ll_setattr,
        .getattr        = ll_getattr,
        .permission     = ll_inode_permission,
#ifdef HAVE_IOP_XATTR
        .setxattr       = ll_setxattr,
        .getxattr       = ll_getxattr,
        .removexattr    = ll_removexattr,
#endif
        .listxattr      = ll_listxattr,
        .fiemap         = ll_fiemap,
#ifdef HAVE_IOP_GET_ACL
        .get_acl        = ll_get_acl,
#endif
#ifdef HAVE_IOP_SET_ACL
        .set_acl        = ll_set_acl,
#endif
};
int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct cl_object *obj = lli->lli_clob;
        struct lu_env *env;
        int rc;
        __u16 refcheck;
        ENTRY;

        env = cl_env_get(&refcheck);
        if (IS_ERR(env))
                RETURN(PTR_ERR(env));

        rc = cl_conf_set(env, lli->lli_clob, conf);
        if (rc < 0)
                GOTO(out, rc);

        if (conf->coc_opc == OBJECT_CONF_SET) {
                struct ldlm_lock *lock = conf->coc_lock;
                struct cl_layout cl = {
                        .cl_layout_gen = 0,
                };

                LASSERT(lock != NULL);
                LASSERT(ldlm_has_layout(lock));

                /* it can only be allowed to match after layout is
                 * applied to inode otherwise false layout would be
                 * seen. Applying layout should happen before dropping
                 * the intent lock. */
                ldlm_lock_allow_match(lock);

                rc = cl_object_layout_get(env, obj, &cl);
                if (rc < 0)
                        GOTO(out, rc);

                CDEBUG(D_VFSTRACE,
                       DFID": layout version change: %u -> %u\n",
                       PFID(&lli->lli_fid), ll_layout_version_get(lli),
                       cl.cl_layout_gen);
                ll_layout_version_set(lli, cl.cl_layout_gen);
        }
out:
        cl_env_put(env, &refcheck);
        RETURN(rc);
}
/* Fetch layout from MDT with getxattr request, if it's not ready yet */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ptlrpc_request *req;
        void *lvbdata;
        void *lmm;
        int lmmsize;
        int rc;
        ENTRY;

        CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
               PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
               lock->l_lvb_data, lock->l_lvb_len);

        if (lock->l_lvb_data != NULL)
                RETURN(0);

        /* if layout lock was granted right away, the layout is returned
         * within DLM_LVB of dlm reply; otherwise if the lock was ever
         * blocked and then granted via completion ast, we have to fetch
         * layout here. Please note that we can't use the LVB buffer in
         * completion AST because it doesn't have a large enough buffer */
        rc = ll_get_default_mdsize(sbi, &lmmsize);
        if (rc < 0)
                RETURN(rc);

        rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
                         XATTR_NAME_LOV, lmmsize, &req);
        if (rc < 0) {
                if (rc == -ENODATA)
                        GOTO(out, rc = 0); /* empty layout */
                else
                        RETURN(rc);
        }

        lmmsize = rc;
        rc = 0;
        if (lmmsize == 0) /* empty layout */
                GOTO(out, rc);

        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
        if (lmm == NULL)
                GOTO(out, rc = -EFAULT);

        OBD_ALLOC_LARGE(lvbdata, lmmsize);
        if (lvbdata == NULL)
                GOTO(out, rc = -ENOMEM);

        memcpy(lvbdata, lmm, lmmsize);
        lock_res_and_lock(lock);
        if (unlikely(lock->l_lvb_data == NULL)) {
                lock->l_lvb_type = LVB_T_LAYOUT;
                lock->l_lvb_data = lvbdata;
                lock->l_lvb_len = lmmsize;
                lvbdata = NULL;
        }
        unlock_res_and_lock(lock);

        if (lvbdata)
                OBD_FREE_LARGE(lvbdata, lmmsize);

        EXIT;
out:
        ptlrpc_req_finished(req);
        return rc;
}
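/*
 * Note the allocate-then-attach idiom used above: the LVB buffer is
 * allocated without the resource lock held, attached under
 * lock_res_and_lock() only if no other thread won the race, and freed
 * otherwise. Condensed sketch (illustrative fragment; alloc_buffer() and
 * free_buffer() are hypothetical stand-ins for OBD_ALLOC_LARGE/
 * OBD_FREE_LARGE, and "lock"/"len" are assumed in scope):
 */
#if 0
        void *buf = alloc_buffer(len);

        lock_res_and_lock(lock);
        if (lock->l_lvb_data == NULL) {         /* we won the race */
                lock->l_lvb_data = buf;
                lock->l_lvb_len = len;
                buf = NULL;                     /* ownership transferred */
        }
        unlock_res_and_lock(lock);
        if (buf != NULL)                        /* somebody beat us to it */
                free_buffer(buf, len);
#endif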
/**
 * Apply the layout to the inode. Layout lock is held and will be released
 * in ll_layout_conf().
 */
static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
                              struct inode *inode)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ldlm_lock *lock;
        struct cl_object_conf conf;
        int rc = 0;
        bool lvb_ready;
        bool wait_layout = false;
        ENTRY;

        LASSERT(lustre_handle_is_used(lockh));

        lock = ldlm_handle2lock(lockh);
        LASSERT(lock != NULL);
        LASSERT(ldlm_has_layout(lock));

        LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
                   PFID(&lli->lli_fid), inode);

        /* in case this is a caching lock and reinstate with new inode */
        md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);

        lock_res_and_lock(lock);
        lvb_ready = ldlm_is_lvb_ready(lock);
        unlock_res_and_lock(lock);

        /* checking lvb_ready is racy but this is okay. The worst case is
         * that multiple processes may configure the file at the same time. */
        if (lvb_ready)
                GOTO(out, rc = 0);

        rc = ll_layout_fetch(inode, lock);
        if (rc < 0)
                GOTO(out, rc);

        /* for layout lock, lmm is stored in lock's lvb.
         * lvb_data is immutable if the lock is held so it's safe to access it
         * without res lock.
         *
         * set layout to file. Unlikely this will fail as old layout was
         * surely eliminated */
        memset(&conf, 0, sizeof conf);
        conf.coc_opc = OBJECT_CONF_SET;
        conf.coc_inode = inode;
        conf.coc_lock = lock;
        conf.u.coc_layout.lb_buf = lock->l_lvb_data;
        conf.u.coc_layout.lb_len = lock->l_lvb_len;
        rc = ll_layout_conf(inode, &conf);

        /* refresh layout failed, need to wait */
        wait_layout = rc == -EBUSY;
        EXIT;
out:
        LDLM_LOCK_PUT(lock);
        ldlm_lock_decref(lockh, mode);

        /* wait for IO to complete if it's still being used. */
        if (wait_layout) {
                CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
                       ll_get_fsname(inode->i_sb, NULL, 0),
                       PFID(&lli->lli_fid), inode);

                memset(&conf, 0, sizeof conf);
                conf.coc_opc = OBJECT_CONF_WAIT;
                conf.coc_inode = inode;
                rc = ll_layout_conf(inode, &conf);
                if (rc == 0)
                        rc = -EAGAIN;

                CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
                       ll_get_fsname(inode->i_sb, NULL, 0),
                       PFID(&lli->lli_fid), rc);
        }
        RETURN(rc);
}
/**
 * Issue layout intent RPC to MDS.
 * \param inode [in]	file inode
 * \param intent [in]	layout intent
 *
 * \retval 0	on success
 * \retval < 0	error code
 */
static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct md_op_data *op_data;
        struct lookup_intent it;
        struct ptlrpc_request *req;
        int rc;
        ENTRY;

        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
                                     0, 0, LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        op_data->op_data = intent;
        op_data->op_data_size = sizeof(*intent);

        memset(&it, 0, sizeof(it));
        it.it_op = IT_LAYOUT;
        if (intent->li_opc == LAYOUT_INTENT_WRITE ||
            intent->li_opc == LAYOUT_INTENT_TRUNC)
                it.it_flags = FMODE_WRITE;

        LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
                          ll_get_fsname(inode->i_sb, NULL, 0),
                          PFID(&lli->lli_fid), inode);

        rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
                            &ll_md_blocking_ast, 0);
        if (it.it_request != NULL)
                ptlrpc_req_finished(it.it_request);
        it.it_request = NULL;

        ll_finish_md_op_data(op_data);

        /* set lock data in case this is a new lock */
        if (rc == 0)
                ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);

        ll_intent_drop_lock(&it);

        RETURN(rc);
}
/**
 * This function checks if there exists a LAYOUT lock on the client side,
 * or enqueues it if it doesn't have one in cache.
 *
 * This function will not hold layout lock so it may be revoked any time after
 * this function returns. Any operation that depends on the layout should be
 * redone in that case.
 *
 * This function should be called before lov_io_init() to get an up-to-date
 * layout version; the caller should save the version number, and once IO
 * is finished, call this function again to verify that the layout was not
 * changed while the IO was in flight.
 */
int ll_layout_refresh(struct inode *inode, __u32 *gen)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct lustre_handle lockh;
        struct layout_intent intent = {
                .li_opc = LAYOUT_INTENT_ACCESS,
        };
        enum ldlm_mode mode;
        int rc;
        ENTRY;

        *gen = ll_layout_version_get(lli);
        if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
                RETURN(0);

        /* sanity checks */
        LASSERT(fid_is_sane(ll_inode2fid(inode)));
        LASSERT(S_ISREG(inode->i_mode));

        /* take layout lock mutex to enqueue layout lock exclusively. */
        mutex_lock(&lli->lli_layout_mutex);

        while (1) {
                /* mostly layout lock is caching on the local side, so try to
                 * match it before grabbing layout lock mutex. */
                mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
                                       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
                if (mode != 0) { /* hit cached lock */
                        rc = ll_layout_lock_set(&lockh, mode, inode);
                        if (rc == -EAGAIN)
                                continue;
                        break;
                }

                rc = ll_layout_intent(inode, &intent);
                if (rc != 0)
                        break;
        }

        if (rc == 0)
                *gen = ll_layout_version_get(lli);
        mutex_unlock(&lli->lli_layout_mutex);

        RETURN(rc);
}
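/*
 * Typical caller pattern (illustrative fragment; "inode" is assumed to be
 * a regular-file inode in scope): fetch the layout generation before
 * starting IO and verify it afterwards, redoing the IO if the layout
 * changed while it was in flight.
 */
#if 0
        __u32 gen_before, gen_after;
        int rc;

        rc = ll_layout_refresh(inode, &gen_before);
        /* ... initialize and run the IO against this layout version ... */
        if (rc == 0)
                rc = ll_layout_refresh(inode, &gen_after);
        if (rc == 0 && gen_after != gen_before)
                rc = -EAGAIN;   /* layout changed during IO; redo the IO */
#endif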
/**
 * Issue layout intent RPC indicating where in a file an IO is about to write.
 *
 * \param[in] inode	file inode.
 * \param[in] ext	write range with start offset of file in bytes where
 *			an IO is about to write, and exclusive end offset in
 *			bytes.
 *
 * \retval 0	on success
 * \retval < 0	error code
 */
int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
                           struct lu_extent *ext)
{
        struct layout_intent intent = {
                .li_opc = opc,
                .li_extent.e_start = ext->e_start,
                .li_extent.e_end = ext->e_end,
        };
        int rc;
        ENTRY;

        rc = ll_layout_intent(inode, &intent);

        RETURN(rc);
}
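/*
 * Example invocation (illustrative fragment, "inode" and "rc" assumed in
 * scope): declare the byte range [0, 1 MiB) that a write is about to touch
 * so the server can instantiate the matching layout component up front.
 */
#if 0
        struct lu_extent ext = {
                .e_start = 0,
                .e_end   = 1024 * 1024, /* exclusive end offset in bytes */
        };

        rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE, &ext);
#endif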
/**
 * This function sends a restore request to the MDT.
 */
int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
{
        struct hsm_user_request *hur;
        int len, rc;
        ENTRY;

        len = sizeof(struct hsm_user_request) +
              sizeof(struct hsm_user_item);
        OBD_ALLOC(hur, len);
        if (hur == NULL)
                RETURN(-ENOMEM);

        hur->hur_request.hr_action = HUA_RESTORE;
        hur->hur_request.hr_archive_id = 0;
        hur->hur_request.hr_flags = 0;
        memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
               sizeof(hur->hur_user_item[0].hui_fid));
        hur->hur_user_item[0].hui_extent.offset = offset;
        hur->hur_user_item[0].hui_extent.length = length;
        hur->hur_request.hr_itemcount = 1;
        rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
                           len, hur, NULL);
        OBD_FREE(hur, len);
        RETURN(rc);
}
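/*
 * The request built above is a single-item HSM request: the buffer is a
 * struct hsm_user_request immediately followed by its hsm_user_item array,
 * which is why the allocation size is the sum of the two structures.
 * Generic sizing sketch for n items (illustrative only):
 */
#if 0
        int n = 1;      /* number of items; ll_layout_restore() uses one */
        int len = sizeof(struct hsm_user_request) +
                  n * sizeof(struct hsm_user_item);
        /* hur->hur_user_item[0 .. n-1] are valid after OBD_ALLOC(hur, len) */
#endif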