4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
75 fd->fd_write_failed = false;
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
105 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
106 op_data->op_attr_blocks = inode->i_blocks;
107 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
108 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
109 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
110 op_data->op_open_handle = och->och_open_handle;
112 if (och->och_flags & FMODE_WRITE &&
113 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
/* For HSM: if the inode data has been modified, pack it so that
* the MDT can set the data-dirty flag in the archive. */
116 op_data->op_bias |= MDS_DATA_MODIFIED;
122 * Perform a close, possibly with a bias.
123 * The meaning of "data" depends on the value of "bias".
125 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
* If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to swap layouts with.
129 static int ll_close_inode_openhandle(struct inode *inode,
130 struct obd_client_handle *och,
131 enum mds_op_bias bias, void *data)
133 struct obd_export *md_exp = ll_i2mdexp(inode);
134 const struct ll_inode_info *lli = ll_i2info(inode);
135 struct md_op_data *op_data;
136 struct ptlrpc_request *req = NULL;
140 if (class_exp2obd(md_exp) == NULL) {
141 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
142 ll_get_fsname(inode->i_sb, NULL, 0),
143 PFID(&lli->lli_fid));
147 OBD_ALLOC_PTR(op_data);
/* We leak the openhandle and request here on error, but there is not much
* to be done in the OOM case since the application won't retry the close on
* error either. */
151 GOTO(out, rc = -ENOMEM);
153 ll_prepare_close(inode, op_data, och);
155 case MDS_CLOSE_LAYOUT_MERGE:
156 /* merge blocks from the victim inode */
157 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
158 op_data->op_attr.ia_valid |= ATTR_SIZE;
159 op_data->op_xvalid |= OP_XVALID_BLOCKS;
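/* fallthrough */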
160 case MDS_CLOSE_LAYOUT_SPLIT:
161 case MDS_CLOSE_LAYOUT_SWAP: {
162 struct split_param *sp = data;
164 LASSERT(data != NULL);
165 op_data->op_bias |= bias;
166 op_data->op_data_version = 0;
167 op_data->op_lease_handle = och->och_lease_handle;
168 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
169 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
170 op_data->op_mirror_id = sp->sp_mirror_id;
172 op_data->op_fid2 = *ll_inode2fid(data);
177 case MDS_CLOSE_RESYNC_DONE: {
178 struct ll_ioc_lease *ioc = data;
180 LASSERT(data != NULL);
181 op_data->op_attr_blocks +=
182 ioc->lil_count * op_data->op_attr_blocks;
183 op_data->op_attr.ia_valid |= ATTR_SIZE;
184 op_data->op_xvalid |= OP_XVALID_BLOCKS;
185 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
187 op_data->op_lease_handle = och->och_lease_handle;
188 op_data->op_data = &ioc->lil_ids[0];
189 op_data->op_data_size =
190 ioc->lil_count * sizeof(ioc->lil_ids[0]);
194 case MDS_HSM_RELEASE:
195 LASSERT(data != NULL);
196 op_data->op_bias |= MDS_HSM_RELEASE;
197 op_data->op_data_version = *(__u64 *)data;
198 op_data->op_lease_handle = och->och_lease_handle;
199 op_data->op_attr.ia_valid |= ATTR_SIZE;
200 op_data->op_xvalid |= OP_XVALID_BLOCKS;
204 LASSERT(data == NULL);
208 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
209 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
210 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
211 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
213 rc = md_close(md_exp, op_data, och->och_mod, &req);
214 if (rc != 0 && rc != -EINTR)
215 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
216 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
218 if (rc == 0 && op_data->op_bias & bias) {
219 struct mdt_body *body;
221 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
222 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
226 ll_finish_md_op_data(op_data);
230 md_clear_open_replay_data(md_exp, och);
231 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
234 ptlrpc_req_finished(req); /* This is close request */
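/**
 * Close the open handle kept on the MDS for the given open mode (read,
 * write or exec), unless other local users of this inode still hold it,
 * which is tracked via *och_usecount under lli_och_mutex.
 */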
238 int ll_md_real_close(struct inode *inode, fmode_t fmode)
240 struct ll_inode_info *lli = ll_i2info(inode);
241 struct obd_client_handle **och_p;
242 struct obd_client_handle *och;
247 if (fmode & FMODE_WRITE) {
248 och_p = &lli->lli_mds_write_och;
249 och_usecount = &lli->lli_open_fd_write_count;
250 } else if (fmode & FMODE_EXEC) {
251 och_p = &lli->lli_mds_exec_och;
252 och_usecount = &lli->lli_open_fd_exec_count;
254 LASSERT(fmode & FMODE_READ);
255 och_p = &lli->lli_mds_read_och;
256 och_usecount = &lli->lli_open_fd_read_count;
259 mutex_lock(&lli->lli_och_mutex);
260 if (*och_usecount > 0) {
/* There are still users of this handle, so skip freeing it. */
263 mutex_unlock(&lli->lli_och_mutex);
269 mutex_unlock(&lli->lli_och_mutex);
/* There might be a race and this handle may already be closed. */
274 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
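/**
 * Per-file-descriptor close: drop a group lock or lease still held on this
 * descriptor, close fd_och if present, decrement the per-mode open counts,
 * and only talk to the MDS (ll_md_real_close()) if no cached OPEN lock
 * allows us to skip the close RPC.
 */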
280 static int ll_md_close(struct inode *inode, struct file *file)
282 union ldlm_policy_data policy = {
283 .l_inodebits = { MDS_INODELOCK_OPEN },
285 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
286 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
287 struct ll_inode_info *lli = ll_i2info(inode);
288 struct lustre_handle lockh;
289 enum ldlm_mode lockmode;
293 /* clear group lock, if present */
294 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
295 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
297 if (fd->fd_lease_och != NULL) {
/* Usually the lease is not released when the
* application crashes; we need to release it here. */
302 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
303 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
304 PFID(&lli->lli_fid), rc, lease_broken);
306 fd->fd_lease_och = NULL;
309 if (fd->fd_och != NULL) {
310 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
/* Let's see if we have a good enough OPEN lock on the file so that
 we can skip talking to the MDS */
317 mutex_lock(&lli->lli_och_mutex);
318 if (fd->fd_omode & FMODE_WRITE) {
320 LASSERT(lli->lli_open_fd_write_count);
321 lli->lli_open_fd_write_count--;
322 } else if (fd->fd_omode & FMODE_EXEC) {
324 LASSERT(lli->lli_open_fd_exec_count);
325 lli->lli_open_fd_exec_count--;
328 LASSERT(lli->lli_open_fd_read_count);
329 lli->lli_open_fd_read_count--;
331 mutex_unlock(&lli->lli_och_mutex);
333 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
334 LDLM_IBITS, &policy, lockmode, &lockh))
335 rc = ll_md_real_close(inode, fd->fd_omode);
338 LUSTRE_FPRIVATE(file) = NULL;
339 ll_file_data_put(fd);
/* While this returns an error code, the fput() caller does not check it, so
* we need to make every effort to clean up all of our state here. Also,
* applications rarely check close errors, and even if an error is returned
* they will not retry the close call.
349 int ll_file_release(struct inode *inode, struct file *file)
351 struct ll_file_data *fd;
352 struct ll_sb_info *sbi = ll_i2sbi(inode);
353 struct ll_inode_info *lli = ll_i2info(inode);
357 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
358 PFID(ll_inode2fid(inode)), inode);
360 if (inode->i_sb->s_root != file_dentry(file))
361 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
362 fd = LUSTRE_FPRIVATE(file);
/* The last ref on @file, but maybe not from the owner pid of statahead,
* because parent and child processes can share the same file handle. */
367 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
368 ll_deauthorize_statahead(inode, fd);
370 if (inode->i_sb->s_root == file_dentry(file)) {
371 LUSTRE_FPRIVATE(file) = NULL;
372 ll_file_data_put(fd);
376 if (!S_ISDIR(inode->i_mode)) {
377 if (lli->lli_clob != NULL)
378 lov_read_and_clear_async_rc(lli->lli_clob);
379 lli->lli_async_rc = 0;
382 rc = ll_md_close(inode, file);
384 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
385 libcfs_debug_dumplog();
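/**
 * read_cache_page() filler for Data-on-MDT (DoM) inline data: copy the
 * inline buffer described by the niobuf_local into the page, zero-fill any
 * tail beyond lnb_len, and mark the page up to date.
 */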
390 static inline int ll_dom_readpage(void *data, struct page *page)
392 struct niobuf_local *lnb = data;
395 kaddr = ll_kmap_atomic(page, KM_USER0);
396 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
397 if (lnb->lnb_len < PAGE_SIZE)
398 memset(kaddr + lnb->lnb_len, 0,
399 PAGE_SIZE - lnb->lnb_len);
400 flush_dcache_page(page);
401 SetPageUptodate(page);
402 ll_kunmap_atomic(kaddr, KM_USER0);
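/**
 * If the open reply carried inline file data under a Data-on-MDT lock,
 * populate the page cache with it so that a subsequent read can be served
 * without any further RPC to the server.
 */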
408 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
409 struct lookup_intent *it)
411 struct ll_inode_info *lli = ll_i2info(inode);
412 struct cl_object *obj = lli->lli_clob;
413 struct address_space *mapping = inode->i_mapping;
415 struct niobuf_remote *rnb;
417 struct lustre_handle lockh;
418 struct ldlm_lock *lock;
419 unsigned long index, start;
420 struct niobuf_local lnb;
421 bool dom_lock = false;
428 if (it->it_lock_mode != 0) {
429 lockh.cookie = it->it_lock_handle;
430 lock = ldlm_handle2lock(&lockh);
432 dom_lock = ldlm_has_dom(lock);
438 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
442 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
443 if (rnb == NULL || rnb->rnb_len == 0)
/* LU-11595: The server may return the whole file, which is always OK, or it
* may return just the file tail, whose offset must be aligned with the client
* PAGE_SIZE to be usable on that client; if the server's PAGE_SIZE is smaller,
* the offset may be unaligned and that data is simply ignored.
451 if (rnb->rnb_offset % PAGE_SIZE)
/* The server returns the whole file or just the file tail if it fits in the
* reply buffer; in both cases the total size should equal the inode size.
457 if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
458 CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
459 ll_get_fsname(inode->i_sb, NULL, 0), rnb->rnb_offset,
460 rnb->rnb_len, i_size_read(inode));
464 CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
465 rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));
467 data = (char *)rnb + sizeof(*rnb);
469 lnb.lnb_file_offset = rnb->rnb_offset;
470 start = lnb.lnb_file_offset / PAGE_SIZE;
472 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
473 lnb.lnb_page_offset = 0;
475 lnb.lnb_data = data + (index << PAGE_SHIFT);
476 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
477 if (lnb.lnb_len > PAGE_SIZE)
478 lnb.lnb_len = PAGE_SIZE;
480 vmpage = read_cache_page(mapping, index + start,
481 ll_dom_readpage, &lnb);
482 if (IS_ERR(vmpage)) {
483 CWARN("%s: cannot fill page %lu for "DFID
484 " with data: rc = %li\n",
485 ll_get_fsname(inode->i_sb, NULL, 0),
486 index + start, PFID(lu_object_fid(&obj->co_lu)),
492 } while (rnb->rnb_len > (index << PAGE_SHIFT));
496 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
497 struct lookup_intent *itp)
499 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
500 struct dentry *parent = de->d_parent;
501 const char *name = NULL;
503 struct md_op_data *op_data;
504 struct ptlrpc_request *req = NULL;
508 LASSERT(parent != NULL);
509 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
/* If the server supports open-by-FID, or the file name is invalid, don't pack
* the name in the open request */
513 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
514 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
515 name = de->d_name.name;
516 len = de->d_name.len;
519 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
520 name, len, 0, LUSTRE_OPC_ANY, NULL);
522 RETURN(PTR_ERR(op_data));
523 op_data->op_data = lmm;
524 op_data->op_data_size = lmmsize;
526 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
527 &ll_md_blocking_ast, 0);
528 ll_finish_md_op_data(op_data);
/* Reason for keeping our own exit path: don't flood the log
* with -ESTALE error messages.
533 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
534 it_open_error(DISP_OPEN_OPEN, itp))
536 ll_release_openhandle(de, itp);
540 if (it_disposition(itp, DISP_LOOKUP_NEG))
541 GOTO(out, rc = -ENOENT);
543 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
544 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
545 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
549 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
551 if (!rc && itp->it_lock_mode) {
552 ll_dom_finish_open(de->d_inode, req, itp);
553 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
557 ptlrpc_req_finished(req);
558 ll_intent_drop_lock(itp);
/* We did open by FID, but by the time we got to the server,
* the object had disappeared. If this is a create, we cannot really
* tell userspace that the file it was trying to create
* does not exist. Instead let's return -ESTALE, and the VFS will
* retry the create with LOOKUP_REVAL, which we are going to catch
* in ll_revalidate_dentry() and then fall back to lookup.
567 if (rc == -ENOENT && itp->it_op & IT_CREAT)
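/**
 * Fill an obd_client_handle from the MDT reply of an intent open: record the
 * open handle, FID, lease handle cookie and open flags, then register the
 * handle for open replay in case of MDS recovery.
 */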
573 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
574 struct obd_client_handle *och)
576 struct mdt_body *body;
578 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
579 och->och_open_handle = body->mbo_open_handle;
580 och->och_fid = body->mbo_fid1;
581 och->och_lease_handle.cookie = it->it_lock_handle;
582 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
583 och->och_flags = it->it_flags;
585 return md_set_open_replay_data(md_exp, och, it);
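/**
 * Record the client-side open state on the struct file: stash the
 * ll_file_data in LUSTRE_FPRIVATE, initialize readahead and the cl_context
 * bookkeeping, and remember the open mode. If @och is given, fill it from
 * the open intent reply first.
 */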
588 static int ll_local_open(struct file *file, struct lookup_intent *it,
589 struct ll_file_data *fd, struct obd_client_handle *och)
591 struct inode *inode = file_inode(file);
594 LASSERT(!LUSTRE_FPRIVATE(file));
601 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
606 LUSTRE_FPRIVATE(file) = fd;
607 ll_readahead_init(inode, &fd->fd_ras);
608 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
610 /* ll_cl_context initialize */
611 rwlock_init(&fd->fd_lock);
612 INIT_LIST_HEAD(&fd->fd_lccs);
617 /* Open a file, and (for the very first open) create objects on the OSTs at
618 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
619 * creation or open until ll_lov_setstripe() ioctl is called.
621 * If we already have the stripe MD locally then we don't request it in
622 * md_open(), by passing a lmm_size = 0.
624 * It is up to the application to ensure no other processes open this file
625 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
626 * used. We might be able to avoid races of that sort by getting lli_open_sem
627 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
628 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
630 int ll_file_open(struct inode *inode, struct file *file)
632 struct ll_inode_info *lli = ll_i2info(inode);
633 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
634 .it_flags = file->f_flags };
635 struct obd_client_handle **och_p = NULL;
636 __u64 *och_usecount = NULL;
637 struct ll_file_data *fd;
641 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
642 PFID(ll_inode2fid(inode)), inode, file->f_flags);
644 it = file->private_data; /* XXX: compat macro */
645 file->private_data = NULL; /* prevent ll_local_open assertion */
647 fd = ll_file_data_get();
649 GOTO(out_nofiledata, rc = -ENOMEM);
652 if (S_ISDIR(inode->i_mode))
653 ll_authorize_statahead(inode, fd);
655 if (inode->i_sb->s_root == file_dentry(file)) {
656 LUSTRE_FPRIVATE(file) = fd;
660 if (!it || !it->it_disposition) {
/* Convert f_flags into an access mode. We cannot use file->f_mode,
* because everything but the O_ACCMODE mask was stripped from there. */
664 if ((oit.it_flags + 1) & O_ACCMODE)
666 if (file->f_flags & O_TRUNC)
667 oit.it_flags |= FMODE_WRITE;
/* The kernel only calls f_op->open from dentry_open(); filp_open() calls
* dentry_open() after open_namei() has checked permissions.
* Only nfsd_open() calls dentry_open() directly without checking
* permissions, and because of that the code below is safe.
674 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
675 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
677 /* We do not want O_EXCL here, presumably we opened the file
678 * already? XXX - NFS implications? */
679 oit.it_flags &= ~O_EXCL;
/* bug20584: if "it_flags" contains O_CREAT, the file will be
* created if necessary, so "IT_CREAT" should be set to stay
* consistent with it */
684 if (oit.it_flags & O_CREAT)
685 oit.it_op |= IT_CREAT;
691 /* Let's see if we have file open on MDS already. */
692 if (it->it_flags & FMODE_WRITE) {
693 och_p = &lli->lli_mds_write_och;
694 och_usecount = &lli->lli_open_fd_write_count;
695 } else if (it->it_flags & FMODE_EXEC) {
696 och_p = &lli->lli_mds_exec_och;
697 och_usecount = &lli->lli_open_fd_exec_count;
699 och_p = &lli->lli_mds_read_och;
700 och_usecount = &lli->lli_open_fd_read_count;
703 mutex_lock(&lli->lli_och_mutex);
704 if (*och_p) { /* Open handle is present */
705 if (it_disposition(it, DISP_OPEN_OPEN)) {
/* Well, there's an extra open request that we do not need;
 let's close it somehow. This will decref the request. */
708 rc = it_open_error(DISP_OPEN_OPEN, it);
710 mutex_unlock(&lli->lli_och_mutex);
711 GOTO(out_openerr, rc);
714 ll_release_openhandle(file_dentry(file), it);
718 rc = ll_local_open(file, it, fd, NULL);
721 mutex_unlock(&lli->lli_och_mutex);
722 GOTO(out_openerr, rc);
725 LASSERT(*och_usecount == 0);
726 if (!it->it_disposition) {
727 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
/* We cannot just request the lock handle now: the new ELC code
 means that another of the OPEN locks for this file
 could be cancelled, and since the blocking AST handler
 would attempt to grab och_mutex as well, that would
 result in a deadlock */
733 mutex_unlock(&lli->lli_och_mutex);
* Normally called under two situations:
* 1. NFS export.
* 2. A race/condition on the MDS resulting in no open
* handle being returned from the LOOKUP|OPEN request,
* for example if the target entry was a symlink.
*
* Only fetch MDS_OPEN_LOCK if this is in the NFS path,
* marked by a bit set in ll_iget_for_nfs. Clear the
* bit so that it does not confuse later callers.
*
* NB: when ldd is NULL, it must have come via the normal
* lookup path only, since ll_iget_for_nfs always calls
* ll_d_init().
749 if (ldd && ldd->lld_nfs_dentry) {
750 ldd->lld_nfs_dentry = 0;
751 it->it_flags |= MDS_OPEN_LOCK;
* Always specify MDS_OPEN_BY_FID because we don't want
* to get a file with a different FID.
758 it->it_flags |= MDS_OPEN_BY_FID;
759 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
762 GOTO(out_openerr, rc);
766 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
768 GOTO(out_och_free, rc = -ENOMEM);
/* md_intent_lock() didn't get a request ref if there was an
* open error, so don't do cleanup on the request here */
/* XXX (green): Shouldn't we bail out on any error here, not
* just an open error? */
777 rc = it_open_error(DISP_OPEN_OPEN, it);
779 GOTO(out_och_free, rc);
781 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
782 "inode %p: disposition %x, status %d\n", inode,
783 it_disposition(it, ~0), it->it_status);
785 rc = ll_local_open(file, it, fd, *och_p);
787 GOTO(out_och_free, rc);
789 mutex_unlock(&lli->lli_och_mutex);
/* Must do this outside the lli_och_mutex lock to prevent a deadlock where
 a different kind of OPEN lock for this same inode gets cancelled
 by ldlm_cancel_lru */
795 if (!S_ISREG(inode->i_mode))
796 GOTO(out_och_free, rc);
798 cl_lov_delay_create_clear(&file->f_flags);
799 GOTO(out_och_free, rc);
803 if (och_p && *och_p) {
804 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
805 *och_p = NULL; /* OBD_FREE writes some magic there */
808 mutex_unlock(&lli->lli_och_mutex);
811 if (lli->lli_opendir_key == fd)
812 ll_deauthorize_statahead(inode, fd);
814 ll_file_data_put(fd);
816 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
820 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
821 ptlrpc_req_finished(it->it_request);
822 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
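/**
 * Blocking AST for the lease DLM lock: on a blocking callback simply cancel
 * the lock asynchronously (this is what breaks the lease); the canceling
 * callback needs no extra work here.
 */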
828 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
829 struct ldlm_lock_desc *desc, void *data, int flag)
832 struct lustre_handle lockh;
836 case LDLM_CB_BLOCKING:
837 ldlm_lock2handle(lock, &lockh);
838 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
840 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
844 case LDLM_CB_CANCELING:
* When setting a lease on a file, we take ownership of the lli_mds_*_och
* and save it as fd->fd_och so as to force the client to reopen the file even
* if it already has an open lock in cache.
856 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
857 struct lustre_handle *old_open_handle)
859 struct ll_inode_info *lli = ll_i2info(inode);
860 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
861 struct obd_client_handle **och_p;
866 /* Get the openhandle of the file */
867 mutex_lock(&lli->lli_och_mutex);
868 if (fd->fd_lease_och != NULL)
869 GOTO(out_unlock, rc = -EBUSY);
871 if (fd->fd_och == NULL) {
872 if (file->f_mode & FMODE_WRITE) {
873 LASSERT(lli->lli_mds_write_och != NULL);
874 och_p = &lli->lli_mds_write_och;
875 och_usecount = &lli->lli_open_fd_write_count;
877 LASSERT(lli->lli_mds_read_och != NULL);
878 och_p = &lli->lli_mds_read_och;
879 och_usecount = &lli->lli_open_fd_read_count;
882 if (*och_usecount > 1)
883 GOTO(out_unlock, rc = -EBUSY);
890 *old_open_handle = fd->fd_och->och_open_handle;
894 mutex_unlock(&lli->lli_och_mutex);
899 * Release ownership on lli_mds_*_och when putting back a file lease.
901 static int ll_lease_och_release(struct inode *inode, struct file *file)
903 struct ll_inode_info *lli = ll_i2info(inode);
904 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
905 struct obd_client_handle **och_p;
906 struct obd_client_handle *old_och = NULL;
911 mutex_lock(&lli->lli_och_mutex);
912 if (file->f_mode & FMODE_WRITE) {
913 och_p = &lli->lli_mds_write_och;
914 och_usecount = &lli->lli_open_fd_write_count;
916 och_p = &lli->lli_mds_read_och;
917 och_usecount = &lli->lli_open_fd_read_count;
/* The file may have been opened by another process (broken lease), so
* *och_p is not NULL. In this case we should simply increase the usecount
* and close fd_och.
924 if (*och_p != NULL) {
925 old_och = fd->fd_och;
932 mutex_unlock(&lli->lli_och_mutex);
935 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
941 * Acquire a lease and open the file.
943 static struct obd_client_handle *
944 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
947 struct lookup_intent it = { .it_op = IT_OPEN };
948 struct ll_sb_info *sbi = ll_i2sbi(inode);
949 struct md_op_data *op_data;
950 struct ptlrpc_request *req = NULL;
951 struct lustre_handle old_open_handle = { 0 };
952 struct obd_client_handle *och = NULL;
957 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
958 RETURN(ERR_PTR(-EINVAL));
961 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
962 RETURN(ERR_PTR(-EPERM));
964 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
971 RETURN(ERR_PTR(-ENOMEM));
973 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
974 LUSTRE_OPC_ANY, NULL);
976 GOTO(out, rc = PTR_ERR(op_data));
978 /* To tell the MDT this openhandle is from the same owner */
979 op_data->op_open_handle = old_open_handle;
981 it.it_flags = fmode | open_flags;
982 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
983 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
984 &ll_md_blocking_lease_ast,
/* LDLM_FL_NO_LRU: Do not put the lease lock into the LRU list, otherwise
* it can be cancelled, which may mislead applications into thinking the
* lease is broken;
* LDLM_FL_EXCL: Set this flag so that it won't be matched by a normal
* open in ll_md_blocking_ast(). Otherwise, since ll_md_blocking_lease_ast
* doesn't deal with the openhandle, the normal openhandle would be leaked. */
991 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
992 ll_finish_md_op_data(op_data);
993 ptlrpc_req_finished(req);
995 GOTO(out_release_it, rc);
997 if (it_disposition(&it, DISP_LOOKUP_NEG))
998 GOTO(out_release_it, rc = -ENOENT);
1000 rc = it_open_error(DISP_OPEN_OPEN, &it);
1002 GOTO(out_release_it, rc);
1004 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1005 ll_och_fill(sbi->ll_md_exp, &it, och);
1007 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1008 GOTO(out_close, rc = -EOPNOTSUPP);
/* lease already acquired; handle the lease lock */
1011 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1012 if (it.it_lock_mode == 0 ||
1013 it.it_lock_bits != MDS_INODELOCK_OPEN) {
/* an open lock must be returned for a lease */
1015 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1016 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1018 GOTO(out_close, rc = -EPROTO);
1021 ll_intent_release(&it);
1025 /* Cancel open lock */
1026 if (it.it_lock_mode != 0) {
1027 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1029 it.it_lock_mode = 0;
1030 och->och_lease_handle.cookie = 0ULL;
1032 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1034 CERROR("%s: error closing file "DFID": %d\n",
1035 ll_get_fsname(inode->i_sb, NULL, 0),
1036 PFID(&ll_i2info(inode)->lli_fid), rc2);
1037 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1039 ll_intent_release(&it);
1043 RETURN(ERR_PTR(rc));
1047 * Check whether a layout swap can be done between two inodes.
1049 * \param[in] inode1 First inode to check
1050 * \param[in] inode2 Second inode to check
1052 * \retval 0 on success, layout swap can be performed between both inodes
1053 * \retval negative error code if requirements are not met
1055 static int ll_check_swap_layouts_validity(struct inode *inode1,
1056 struct inode *inode2)
1058 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1061 if (inode_permission(inode1, MAY_WRITE) ||
1062 inode_permission(inode2, MAY_WRITE))
1065 if (inode1->i_sb != inode2->i_sb)
1071 static int ll_swap_layouts_close(struct obd_client_handle *och,
1072 struct inode *inode, struct inode *inode2)
1074 const struct lu_fid *fid1 = ll_inode2fid(inode);
1075 const struct lu_fid *fid2;
1079 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1080 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1082 rc = ll_check_swap_layouts_validity(inode, inode2);
1084 GOTO(out_free_och, rc);
1086 /* We now know that inode2 is a lustre inode */
1087 fid2 = ll_inode2fid(inode2);
1089 rc = lu_fid_cmp(fid1, fid2);
1091 GOTO(out_free_och, rc = -EINVAL);
1093 /* Close the file and {swap,merge} layouts between inode & inode2.
1094 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1095 * because we still need it to pack l_remote_handle to MDT. */
1096 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1099 och = NULL; /* freed in ll_close_inode_openhandle() */
* Release the lease and close the file.
* It will check whether the lease has ever been broken.
1112 static int ll_lease_close_intent(struct obd_client_handle *och,
1113 struct inode *inode,
1114 bool *lease_broken, enum mds_op_bias bias,
1117 struct ldlm_lock *lock;
1118 bool cancelled = true;
1122 lock = ldlm_handle2lock(&och->och_lease_handle);
1124 lock_res_and_lock(lock);
1125 cancelled = ldlm_is_cancel(lock);
1126 unlock_res_and_lock(lock);
1127 LDLM_LOCK_PUT(lock);
1130 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1131 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1133 if (lease_broken != NULL)
1134 *lease_broken = cancelled;
1136 if (!cancelled && !bias)
1137 ldlm_cli_cancel(&och->och_lease_handle, 0);
if (cancelled) { /* no need to execute intent */
1144 rc = ll_close_inode_openhandle(inode, och, bias, data);
1148 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1151 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
* After the lease is taken, send the MDS_REINT_RESYNC RPC to the MDT
1157 static int ll_lease_file_resync(struct obd_client_handle *och,
1158 struct inode *inode, unsigned long arg)
1160 struct ll_sb_info *sbi = ll_i2sbi(inode);
1161 struct md_op_data *op_data;
1162 struct ll_ioc_lease_id ioc;
1163 __u64 data_version_unused;
1167 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1168 LUSTRE_OPC_ANY, NULL);
1169 if (IS_ERR(op_data))
1170 RETURN(PTR_ERR(op_data));
1172 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
/* Before starting file resync, it's necessary to clean up the page cache
* in client memory, otherwise once the layout version is increased,
* writing back cached data will be denied by the OSTs. */
1179 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1183 op_data->op_lease_handle = och->och_lease_handle;
1184 op_data->op_mirror_id = ioc.lil_mirror_id;
1185 rc = md_file_resync(sbi->ll_md_exp, op_data);
1191 ll_finish_md_op_data(op_data);
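/**
 * Merge the size, block count and timestamps cached in the cl_object (i.e.
 * obtained from the OSTs) into the VFS inode, under the inode size lock.
 */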
1195 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1197 struct ll_inode_info *lli = ll_i2info(inode);
1198 struct cl_object *obj = lli->lli_clob;
1199 struct cl_attr *attr = vvp_env_thread_attr(env);
1207 ll_inode_size_lock(inode);
/* Merge the timestamps most recently obtained from the MDS with the
* timestamps obtained from the OSTs.
*
* Do not overwrite the atime of the inode because it may be refreshed
* by the file_accessed() function. If the read was served from cached
* data, no RPC is sent, so the atime may not be
* transferred to the OSTs at all. The MDT only updates atime at close time
* if it's at least 'mdd.*.atime_diff' older.
* All in all, atime in Lustre does not strictly comply with
* POSIX. Solving this problem would require sending an RPC to the MDT for
* each read, which would hurt performance. */
1220 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1221 LTIME_S(inode->i_atime) = lli->lli_atime;
1222 lli->lli_update_atime = 0;
1224 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1225 LTIME_S(inode->i_ctime) = lli->lli_ctime;
1227 atime = LTIME_S(inode->i_atime);
1228 mtime = LTIME_S(inode->i_mtime);
1229 ctime = LTIME_S(inode->i_ctime);
1231 cl_object_attr_lock(obj);
1232 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1235 rc = cl_object_attr_get(env, obj, attr);
1236 cl_object_attr_unlock(obj);
1239 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1241 if (atime < attr->cat_atime)
1242 atime = attr->cat_atime;
1244 if (ctime < attr->cat_ctime)
1245 ctime = attr->cat_ctime;
1247 if (mtime < attr->cat_mtime)
1248 mtime = attr->cat_mtime;
1250 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1251 PFID(&lli->lli_fid), attr->cat_size);
1253 i_size_write(inode, attr->cat_size);
1254 inode->i_blocks = attr->cat_blocks;
1256 LTIME_S(inode->i_atime) = atime;
1257 LTIME_S(inode->i_mtime) = mtime;
1258 LTIME_S(inode->i_ctime) = ctime;
1261 ll_inode_size_unlock(inode);
* Set the designated mirror for I/O.
*
* So far only read, write, and truncate can issue I/O to a
* designated mirror.
1272 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1274 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* Clear the layout version for generic (non-resync) I/O in case it carries
* a stale layout version due to an I/O restart */
1278 io->ci_layout_version = 0;
1280 /* FLR: disable non-delay for designated mirror I/O because obviously
1281 * only one mirror is available */
1282 if (fd->fd_designated_mirror > 0) {
1284 io->ci_designated_mirror = fd->fd_designated_mirror;
1285 io->ci_layout_version = fd->fd_layout_version;
io->ci_pio = 0; /* doesn't have a mechanism to pass the mirror index */
CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1291 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1294 static bool file_is_noatime(const struct file *file)
1296 const struct vfsmount *mnt = file->f_path.mnt;
1297 const struct inode *inode = file_inode((struct file *)file);
1299 /* Adapted from file_accessed() and touch_atime().*/
1300 if (file->f_flags & O_NOATIME)
1303 if (inode->i_flags & S_NOATIME)
1306 if (IS_NOATIME(inode))
1309 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1312 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1315 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1321 static int ll_file_io_ptask(struct cfs_ptask *ptask);
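/**
 * Initialize a cl_io for a regular read or write: carry the relevant open
 * flags (O_APPEND, O_SYNC, O_DIRECT, O_NONBLOCK) into the io descriptor,
 * pick the DLM lock requirement (never for nolock files, mandatory for
 * O_APPEND), and set the FLR mirror selection.
 */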
1323 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1325 struct inode *inode = file_inode(file);
1326 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1328 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1329 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1330 io->u.ci_rw.rw_file = file;
1331 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1332 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1333 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1335 if (iot == CIT_WRITE) {
1336 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1337 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1338 file->f_flags & O_DIRECT ||
1341 io->ci_obj = ll_i2info(inode)->lli_clob;
1342 io->ci_lockreq = CILR_MAYBE;
1343 if (ll_file_nolock(file)) {
1344 io->ci_lockreq = CILR_NEVER;
1345 io->ci_no_srvlock = 1;
1346 } else if (file->f_flags & O_APPEND) {
1347 io->ci_lockreq = CILR_MANDATORY;
1349 io->ci_noatime = file_is_noatime(file);
1350 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1351 io->ci_pio = !io->u.ci_rw.rw_append;
/* FLR: only use non-delay I/O for reads, as there is only one
* available mirror for writes. */
1357 io->ci_ndelay = !(iot == CIT_WRITE);
1359 ll_io_set_mirror(io, file);
1362 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1364 struct cl_io_pt *pt = ptask->pt_cbdata;
1365 struct file *file = pt->cip_file;
1368 loff_t pos = pt->cip_pos;
1373 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1374 file_dentry(file)->d_name.name,
1375 pt->cip_iot == CIT_READ ? "read" : "write",
1376 pos, pos + pt->cip_count);
1378 env = cl_env_get(&refcheck);
1380 RETURN(PTR_ERR(env));
1382 io = vvp_env_thread_io(env);
1383 ll_io_init(io, file, pt->cip_iot);
1384 io->u.ci_rw.rw_iter = pt->cip_iter;
1385 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1386 io->ci_pio = 0; /* It's already in parallel task */
1388 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1389 pt->cip_count - pt->cip_result);
1391 struct vvp_io *vio = vvp_env_io(env);
1393 vio->vui_io_subtype = IO_NORMAL;
1394 vio->vui_fd = LUSTRE_FPRIVATE(file);
1396 ll_cl_add(file, env, io, LCC_RW);
1397 rc = cl_io_loop(env, io);
1398 ll_cl_remove(file, env);
1400 /* cl_io_rw_init() handled IO */
1404 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
1410 if (io->ci_nob > 0) {
1411 pt->cip_result += io->ci_nob;
1412 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1414 pt->cip_iocb.ki_pos = pos;
1415 #ifdef HAVE_KIOCB_KI_LEFT
1416 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1417 #elif defined(HAVE_KI_NBYTES)
1418 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1422 cl_io_fini(env, io);
1423 cl_env_put(env, &refcheck);
1425 pt->cip_need_restart = io->ci_need_restart;
1427 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1428 file_dentry(file)->d_name.name,
1429 pt->cip_iot == CIT_READ ? "read" : "write",
1430 pt->cip_result, rc);
1432 RETURN(pt->cip_result > 0 ? 0 : rc);
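/**
 * Common back end for read, write and splice: set up the cl_io, take the
 * range lock for writes and O_DIRECT reads (see LU-6227), run the cl_io
 * loop, and restart the whole I/O when the layout changed underneath it
 * (io->ci_need_restart), preserving the FLR retry count across restarts.
 */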
1436 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1437 struct file *file, enum cl_io_type iot,
1438 loff_t *ppos, size_t count)
1440 struct range_lock range;
1441 struct vvp_io *vio = vvp_env_io(env);
1442 struct inode *inode = file_inode(file);
1443 struct ll_inode_info *lli = ll_i2info(inode);
1444 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1449 unsigned retried = 0;
1450 bool restarted = false;
1454 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1455 file_dentry(file)->d_name.name,
1456 iot == CIT_READ ? "read" : "write", pos, pos + count);
1459 io = vvp_env_thread_io(env);
1460 ll_io_init(io, file, iot);
1461 if (args->via_io_subtype == IO_NORMAL) {
1462 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1463 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1465 if (args->via_io_subtype != IO_NORMAL || restarted)
1467 io->ci_ndelay_tried = retried;
1469 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1470 bool range_locked = false;
1472 if (file->f_flags & O_APPEND)
1473 range_lock_init(&range, 0, LUSTRE_EOF);
1475 range_lock_init(&range, pos, pos + count - 1);
1477 vio->vui_fd = LUSTRE_FPRIVATE(file);
1478 vio->vui_io_subtype = args->via_io_subtype;
1480 switch (vio->vui_io_subtype) {
/* Direct I/O reads must also take the range lock,
* or multiple reads will try to work on the same pages;
* see LU-6227 for details. */
1485 if (((iot == CIT_WRITE) ||
1486 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1487 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1488 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1490 rc = range_lock(&lli->lli_write_tree, &range);
1494 range_locked = true;
1498 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1499 vio->u.splice.vui_flags = args->u.splice.via_flags;
1502 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1506 ll_cl_add(file, env, io, LCC_RW);
1507 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1508 !lli->lli_inode_locked) {
1510 lli->lli_inode_locked = 1;
1512 rc = cl_io_loop(env, io);
1513 if (lli->lli_inode_locked) {
1514 lli->lli_inode_locked = 0;
1515 inode_unlock(inode);
1517 ll_cl_remove(file, env);
1520 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1522 range_unlock(&lli->lli_write_tree, &range);
1525 /* cl_io_rw_init() handled IO */
1529 if (io->ci_nob > 0) {
1530 result += io->ci_nob;
1531 count -= io->ci_nob;
1533 if (args->via_io_subtype == IO_NORMAL) {
1534 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1536 /* CLIO is too complicated. See LU-11069. */
1537 if (cl_io_is_append(io))
1538 pos = io->u.ci_rw.rw_iocb.ki_pos;
1542 args->u.normal.via_iocb->ki_pos = pos;
1543 #ifdef HAVE_KIOCB_KI_LEFT
1544 args->u.normal.via_iocb->ki_left = count;
1545 #elif defined(HAVE_KI_NBYTES)
1546 args->u.normal.via_iocb->ki_nbytes = count;
1550 pos = io->u.ci_rw.rw_range.cir_pos;
1554 cl_io_fini(env, io);
1557 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1558 file->f_path.dentry->d_name.name,
1559 iot, rc, result, io->ci_need_restart);
1561 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1563 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1564 file_dentry(file)->d_name.name,
1565 iot == CIT_READ ? "read" : "write",
1566 pos, pos + count, result, rc);
1567 /* preserve the tried count for FLR */
1568 retried = io->ci_ndelay_tried;
1573 if (iot == CIT_READ) {
1575 ll_stats_ops_tally(ll_i2sbi(inode),
1576 LPROC_LL_READ_BYTES, result);
1577 } else if (iot == CIT_WRITE) {
1579 ll_stats_ops_tally(ll_i2sbi(inode),
1580 LPROC_LL_WRITE_BYTES, result);
1581 fd->fd_write_failed = false;
1582 } else if (result == 0 && rc == 0) {
1585 fd->fd_write_failed = true;
1587 fd->fd_write_failed = false;
1588 } else if (rc != -ERESTARTSYS) {
1589 fd->fd_write_failed = true;
1593 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1594 file_dentry(file)->d_name.name,
1595 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1599 RETURN(result > 0 ? result : rc);
1603 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1604 * especially for small I/O.
1606 * To serve a read request, CLIO has to create and initialize a cl_io and
* then request a DLM lock. This has turned out to have significant overhead
1608 * and affects the performance of small I/O dramatically.
* It's not necessary to create a cl_io for each I/O. With the help of
* readahead, most of the pages being read are already in the memory cache and
* we can read those pages directly: if a page exists, the corresponding DLM
* lock must exist, so the page content must be valid.
1615 * In fast read implementation, the llite speculatively finds and reads pages
1616 * in memory cache. There are three scenarios for fast read:
* - If the page exists and is uptodate, the kernel VM will provide the data
* and CLIO is not involved;
1619 * - If the page was brought into memory by read ahead, it will be exported
1620 * and read ahead parameters will be updated;
1621 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1622 * it will go back and invoke normal read, i.e., a cl_io will be created
1623 * and DLM lock will be requested.
* POSIX compliance: the POSIX standard states that read is intended to be
* atomic. The Lustre read implementation is in line with the Linux kernel read
* implementation, and neither of them complies with the POSIX standard in this
* matter. Fast read doesn't make the situation worse on a single node, but it
* may interleave write results from multiple nodes due to the short-read
* handling in ll_file_aio_read().
1631 * \param env - lu_env
1632 * \param iocb - kiocb from kernel
1633 * \param iter - user space buffers where the data will be copied
* \retval - number of bytes read, or an error code if an error occurred.
1638 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1642 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1645 /* NB: we can't do direct IO for fast read because it will need a lock
1646 * to make IO engine happy. */
1647 if (iocb->ki_filp->f_flags & O_DIRECT)
1650 result = generic_file_read_iter(iocb, iter);
/* If the first page is not in the cache, generic_file_read_iter() will
* return -ENODATA.
* See the corresponding code in ll_readpage(). */
1655 if (result == -ENODATA)
1659 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1660 LPROC_LL_READ_BYTES, result);
1666 * Read from a file (through the page cache).
1668 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1671 struct vvp_io_args *args;
1676 result = ll_do_fast_read(iocb, to);
1677 if (result < 0 || iov_iter_count(to) == 0)
1680 env = cl_env_get(&refcheck);
1682 return PTR_ERR(env);
1684 args = ll_env_args(env, IO_NORMAL);
1685 args->u.normal.via_iter = to;
1686 args->u.normal.via_iocb = iocb;
1688 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1689 &iocb->ki_pos, iov_iter_count(to));
1692 else if (result == 0)
1695 cl_env_put(env, &refcheck);
1701 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1702 * If a page is already in the page cache and dirty (and some other things -
1703 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1704 * write to it without doing a full I/O, because Lustre already knows about it
1705 * and will write it out. This saves a lot of processing time.
1707 * All writes here are within one page, so exclusion is handled by the page
1708 * lock on the vm page. We do not do tiny writes for writes which touch
* multiple pages because it's very unlikely that multiple sequential pages
* are already dirty.
*
* We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
* and are unlikely to land on already-dirty pages.
1715 * Attribute updates are important here, we do them in ll_tiny_write_end.
1717 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1719 ssize_t count = iov_iter_count(iter);
1720 struct file *file = iocb->ki_filp;
1721 struct inode *inode = file_inode(file);
1726 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1727 * of function for why.
1729 if (count >= PAGE_SIZE ||
1730 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1733 result = __generic_file_write_iter(iocb, iter);
1735 /* If the page is not already dirty, ll_tiny_write_begin returns
1736 * -ENODATA. We continue on to normal write.
1738 if (result == -ENODATA)
1742 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1744 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1747 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1753 * Write to a file (through the page cache).
1755 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1757 struct vvp_io_args *args;
1759 ssize_t rc_tiny = 0, rc_normal;
1764 /* NB: we can't do direct IO for tiny writes because they use the page
1765 * cache, we can't do sync writes because tiny writes can't flush
1766 * pages, and we can't do append writes because we can't guarantee the
1767 * required DLM locks are held to protect file size.
1769 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1770 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1771 rc_tiny = ll_do_tiny_write(iocb, from);
/* In case of error, go on and try the normal write; only stop if the tiny
* write completed the I/O.
1776 if (iov_iter_count(from) == 0)
1777 GOTO(out, rc_normal = rc_tiny);
1779 env = cl_env_get(&refcheck);
1781 return PTR_ERR(env);
1783 args = ll_env_args(env, IO_NORMAL);
1784 args->u.normal.via_iter = from;
1785 args->u.normal.via_iocb = iocb;
1787 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1788 &iocb->ki_pos, iov_iter_count(from));
1790 /* On success, combine bytes written. */
1791 if (rc_tiny >= 0 && rc_normal > 0)
1792 rc_normal += rc_tiny;
1793 /* On error, only return error from normal write if tiny write did not
1794 * write any bytes. Otherwise return bytes written by tiny write.
1796 else if (rc_tiny > 0)
1797 rc_normal = rc_tiny;
1799 cl_env_put(env, &refcheck);
1804 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1806 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1808 static int ll_file_get_iov_count(const struct iovec *iov,
1809 unsigned long *nr_segs, size_t *count)
1814 for (seg = 0; seg < *nr_segs; seg++) {
1815 const struct iovec *iv = &iov[seg];
1818 * If any segment has a negative length, or the cumulative
1819 * length ever wraps negative then return -EINVAL.
1822 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1824 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1829 cnt -= iv->iov_len; /* This segment is no good */
1836 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1837 unsigned long nr_segs, loff_t pos)
1844 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1848 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1849 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1850 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1851 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1852 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1854 result = ll_file_read_iter(iocb, &to);
1859 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1862 struct iovec iov = { .iov_base = buf, .iov_len = count };
1867 init_sync_kiocb(&kiocb, file);
1868 kiocb.ki_pos = *ppos;
1869 #ifdef HAVE_KIOCB_KI_LEFT
1870 kiocb.ki_left = count;
1871 #elif defined(HAVE_KI_NBYTES)
kiocb.ki_nbytes = count;
1875 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1876 *ppos = kiocb.ki_pos;
1882 * Write to a file (through the page cache).
1885 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1886 unsigned long nr_segs, loff_t pos)
1888 struct iov_iter from;
1893 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1897 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1898 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1899 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1900 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1901 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1903 result = ll_file_write_iter(iocb, &from);
1908 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1909 size_t count, loff_t *ppos)
1911 struct iovec iov = { .iov_base = (void __user *)buf,
1918 init_sync_kiocb(&kiocb, file);
1919 kiocb.ki_pos = *ppos;
1920 #ifdef HAVE_KIOCB_KI_LEFT
1921 kiocb.ki_left = count;
1922 #elif defined(HAVE_KI_NBYTES)
1923 kiocb.ki_nbytes = count;
1926 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1927 *ppos = kiocb.ki_pos;
1931 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1934 * Send file content (through pagecache) somewhere with helper
1936 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1937 struct pipe_inode_info *pipe, size_t count,
1941 struct vvp_io_args *args;
1946 env = cl_env_get(&refcheck);
1948 RETURN(PTR_ERR(env));
1950 args = ll_env_args(env, IO_SPLICE);
1951 args->u.splice.via_pipe = pipe;
1952 args->u.splice.via_flags = flags;
1954 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1955 cl_env_put(env, &refcheck);
1959 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1960 __u64 flags, struct lov_user_md *lum, int lum_size)
1962 struct lookup_intent oit = {
1964 .it_flags = flags | MDS_OPEN_BY_FID,
1969 ll_inode_size_lock(inode);
1970 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1972 GOTO(out_unlock, rc);
1974 ll_release_openhandle(dentry, &oit);
1977 ll_inode_size_unlock(inode);
1978 ll_intent_release(&oit);
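/**
 * Fetch the LOV EA (striping information) of @filename from the MDS via a
 * getattr-by-name RPC. The layout and the request are both returned so the
 * caller controls the buffer lifetime; the EA is swabbed to host endianness
 * when the client and server byte order differ.
 */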
1983 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1984 struct lov_mds_md **lmmp, int *lmm_size,
1985 struct ptlrpc_request **request)
1987 struct ll_sb_info *sbi = ll_i2sbi(inode);
1988 struct mdt_body *body;
1989 struct lov_mds_md *lmm = NULL;
1990 struct ptlrpc_request *req = NULL;
1991 struct md_op_data *op_data;
1994 rc = ll_get_default_mdsize(sbi, &lmmsize);
1998 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1999 strlen(filename), lmmsize,
2000 LUSTRE_OPC_ANY, NULL);
2001 if (IS_ERR(op_data))
2002 RETURN(PTR_ERR(op_data));
2004 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2005 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2006 ll_finish_md_op_data(op_data);
2008 CDEBUG(D_INFO, "md_getattr_name failed "
2009 "on %s: rc %d\n", filename, rc);
2013 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2014 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2016 lmmsize = body->mbo_eadatasize;
2018 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2020 GOTO(out, rc = -ENODATA);
2023 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2024 LASSERT(lmm != NULL);
2026 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2027 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2028 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
2029 GOTO(out, rc = -EPROTO);
2032 * This is coming from the MDS, so is probably in
2033 * little endian. We convert it to host endian before
2034 * passing it to userspace.
2036 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2039 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2040 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2041 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2042 if (le32_to_cpu(lmm->lmm_pattern) &
2043 LOV_PATTERN_F_RELEASED)
/* If the function is called for a directory, we should
* avoid swabbing non-existent lsm objects */
2049 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2050 lustre_swab_lov_user_md_v1(
2051 (struct lov_user_md_v1 *)lmm);
2052 if (S_ISREG(body->mbo_mode))
2053 lustre_swab_lov_user_md_objects(
2054 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2056 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2057 lustre_swab_lov_user_md_v3(
2058 (struct lov_user_md_v3 *)lmm);
2059 if (S_ISREG(body->mbo_mode))
2060 lustre_swab_lov_user_md_objects(
2061 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2063 } else if (lmm->lmm_magic ==
2064 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2065 lustre_swab_lov_comp_md_v1(
2066 (struct lov_comp_md_v1 *)lmm);
2072 *lmm_size = lmmsize;
2077 static int ll_lov_setea(struct inode *inode, struct file *file,
2080 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2081 struct lov_user_md *lump;
2082 int lum_size = sizeof(struct lov_user_md) +
2083 sizeof(struct lov_user_ost_data);
2087 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2090 OBD_ALLOC_LARGE(lump, lum_size);
2094 if (copy_from_user(lump, arg, lum_size))
2095 GOTO(out_lump, rc = -EFAULT);
2097 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2099 cl_lov_delay_create_clear(&file->f_flags);
2102 OBD_FREE_LARGE(lump, lum_size);
2106 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2113 env = cl_env_get(&refcheck);
2115 RETURN(PTR_ERR(env));
2117 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2118 cl_env_put(env, &refcheck);
2122 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2125 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2126 struct lov_user_md *klum;
2128 __u64 flags = FMODE_WRITE;
2131 rc = ll_copy_user_md(lum, &klum);
2136 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2141 rc = put_user(0, &lum->lmm_stripe_count);
2145 rc = ll_layout_refresh(inode, &gen);
2149 rc = ll_file_getstripe(inode, arg, lum_size);
2151 cl_lov_delay_create_clear(&file->f_flags);
2154 OBD_FREE(klum, lum_size);
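/**
 * Take a group lock (group id passed in @arg) on behalf of this file
 * descriptor, instantiating the full layout first for composite (PFL) files
 * so the lock covers every OST object.
 *
 * Typically reached from userspace through the group-lock ioctl, roughly
 * (sketch only; see lustre_user.h and llapi_group_lock() for the canonical
 * interface):
 *
 *	long gid = 1234;
 *	ioctl(fd, LL_IOC_GROUP_LOCK, gid);	// acquire
 *	... do I/O ...
 *	ioctl(fd, LL_IOC_GROUP_UNLOCK, gid);	// release
 */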
2159 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2161 struct ll_inode_info *lli = ll_i2info(inode);
2162 struct cl_object *obj = lli->lli_clob;
2163 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2164 struct ll_grouplock grouplock;
2169 CWARN("group id for group lock must not be 0\n");
2173 if (ll_file_nolock(file))
2174 RETURN(-EOPNOTSUPP);
2176 spin_lock(&lli->lli_lock);
2177 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2178 CWARN("group lock already existed with gid %lu\n",
2179 fd->fd_grouplock.lg_gid);
2180 spin_unlock(&lli->lli_lock);
2183 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2184 spin_unlock(&lli->lli_lock);
* XXX: the group lock needs to protect all OST objects while PFL
* can add new OST objects during the I/O, so we instantiate
* all OST objects before taking the group lock.
2194 struct cl_layout cl = {
2195 .cl_is_composite = false,
2197 struct lu_extent ext = {
2199 .e_end = OBD_OBJECT_EOF,
2202 env = cl_env_get(&refcheck);
2204 RETURN(PTR_ERR(env));
2206 rc = cl_object_layout_get(env, obj, &cl);
2207 if (!rc && cl.cl_is_composite)
2208 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2211 cl_env_put(env, &refcheck);
2216 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2217 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2221 spin_lock(&lli->lli_lock);
2222 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2223 spin_unlock(&lli->lli_lock);
2224 CERROR("another thread just won the race\n");
2225 cl_put_grouplock(&grouplock);
2229 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2230 fd->fd_grouplock = grouplock;
2231 spin_unlock(&lli->lli_lock);
2233 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2237 static int ll_put_grouplock(struct inode *inode, struct file *file,
2240 struct ll_inode_info *lli = ll_i2info(inode);
2241 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2242 struct ll_grouplock grouplock;
2245 spin_lock(&lli->lli_lock);
2246 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2247 spin_unlock(&lli->lli_lock);
2248 CWARN("no group lock held\n");
2252 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2254 if (fd->fd_grouplock.lg_gid != arg) {
2255 CWARN("group lock %lu doesn't match current id %lu\n",
2256 arg, fd->fd_grouplock.lg_gid);
2257 spin_unlock(&lli->lli_lock);
2261 grouplock = fd->fd_grouplock;
2262 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2263 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2264 spin_unlock(&lli->lli_lock);
2266 cl_put_grouplock(&grouplock);
2267 CDEBUG(D_INFO, "group lock %lu released\n", arg);
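/*
 * A minimal userspace sketch of the group-lock ioctls handled here (for
 * illustration only; LL_IOC_GROUP_LOCK and LL_IOC_GROUP_UNLOCK take the
 * group id directly as the ioctl argument, as seen in ll_file_ioctl() below,
 * and are assumed to come from the userspace lustre_user.h header):
 *
 *	unsigned long gid = 1234;
 *
 *	if (ioctl(fd, LL_IOC_GROUP_LOCK, gid) == 0) {
 *		// ... I/O covered by the group lock ...
 *		ioctl(fd, LL_IOC_GROUP_UNLOCK, gid);
 *	}
 */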
2272 * Close inode open handle
2274 * \param dentry [in] dentry which contains the inode
2275 * \param it [in,out] intent which contains open info and result
2278 * \retval <0 failure
2280 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2282 struct inode *inode = dentry->d_inode;
2283 struct obd_client_handle *och;
2289 /* Root? Do nothing. */
2290 if (dentry->d_inode->i_sb->s_root == dentry)
2293 /* No open handle to close? Move away */
2294 if (!it_disposition(it, DISP_OPEN_OPEN))
2297 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2299 OBD_ALLOC(och, sizeof(*och));
2301 GOTO(out, rc = -ENOMEM);
2303 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2305 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2307 /* this one is in place of ll_file_open */
2308 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2309 ptlrpc_req_finished(it->it_request);
2310 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2316 * Get size for inode for which FIEMAP mapping is requested.
2317 * Make the FIEMAP get_info call and return the result.
2318 * \param fiemap kernel buffer to hold extents
2319 * \param num_bytes kernel buffer size
2321 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2327 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2330 /* Checks for fiemap flags */
2331 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2332 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2336 /* Check for FIEMAP_FLAG_SYNC */
2337 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2338 rc = filemap_fdatawrite(inode->i_mapping);
2343 env = cl_env_get(&refcheck);
2345 RETURN(PTR_ERR(env));
2347 if (i_size_read(inode) == 0) {
2348 rc = ll_glimpse_size(inode);
2353 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2354 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2355 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2357 /* If the file size is 0, there are no objects to map */
2358 if (fmkey.lfik_oa.o_size == 0) {
2359 fiemap->fm_mapped_extents = 0;
2363 fmkey.lfik_fiemap = *fiemap;
2365 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2366 &fmkey, fiemap, &num_bytes);
2368 cl_env_put(env, &refcheck);
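/*
 * A minimal userspace sketch of driving this path through the standard
 * FS_IOC_FIEMAP ioctl (for illustration only; it relies solely on the
 * generic <linux/fiemap.h> structures, error handling omitted):
 *
 *	unsigned int count = 32;
 *	struct fiemap *fm;
 *
 *	fm = calloc(1, sizeof(*fm) + count * sizeof(struct fiemap_extent));
 *	fm->fm_start = 0;
 *	fm->fm_length = FIEMAP_MAX_OFFSET;
 *	fm->fm_flags = FIEMAP_FLAG_SYNC;
 *	fm->fm_extent_count = count;
 *	if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0)
 *		printf("%u extents mapped\n", fm->fm_mapped_extents);
 */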
2372 int ll_fid2path(struct inode *inode, void __user *arg)
2374 struct obd_export *exp = ll_i2mdexp(inode);
2375 const struct getinfo_fid2path __user *gfin = arg;
2377 struct getinfo_fid2path *gfout;
2383 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2384 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2387 /* Only need to get the buflen */
2388 if (get_user(pathlen, &gfin->gf_pathlen))
2391 if (pathlen > PATH_MAX)
2394 outsize = sizeof(*gfout) + pathlen;
2395 OBD_ALLOC(gfout, outsize);
2399 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2400 GOTO(gf_free, rc = -EFAULT);
2401 /* Append the root FID after gfout to let the MDT know the root FID so
2402 * that it can look up the correct path; this is mainly for filesets.
2403 * An old server without fileset mount support will ignore this. */
2404 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2406 /* Call mdc_iocontrol */
2407 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2411 if (copy_to_user(arg, gfout, outsize))
2415 OBD_FREE(gfout, outsize);
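/*
 * A minimal userspace sketch of the OBD_IOC_FID2PATH ioctl serviced above
 * (for illustration only; gf_pathlen and gf_u are used above, while the
 * gf_fid member and the gf_u.gf_path output buffer are assumed from the
 * userspace lustre_user.h header, and gf_recno/gf_linkno handling is
 * omitted):
 *
 *	size_t plen = PATH_MAX;
 *	struct getinfo_fid2path *gf;
 *
 *	gf = calloc(1, sizeof(*gf) + plen);
 *	gf->gf_fid = fid;		// FID to resolve
 *	gf->gf_pathlen = plen;
 *	if (ioctl(mnt_fd, OBD_IOC_FID2PATH, gf) == 0)
 *		printf("path: %s\n", gf->gf_u.gf_path);
 */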
2420 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2422 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2430 ioc->idv_version = 0;
2431 ioc->idv_layout_version = UINT_MAX;
2433 /* If no file object has been initialized, consider its version to be 0. */
2437 env = cl_env_get(&refcheck);
2439 RETURN(PTR_ERR(env));
2441 io = vvp_env_thread_io(env);
2443 io->u.ci_data_version.dv_data_version = 0;
2444 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2445 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2448 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2449 result = cl_io_loop(env, io);
2451 result = io->ci_result;
2453 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2454 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2456 cl_io_fini(env, io);
2458 if (unlikely(io->ci_need_restart))
2461 cl_env_put(env, &refcheck);
2467 * Read the data_version for an inode.
2469 * This value is computed from the stripe object versions on the OSTs.
2470 * The version is computed using server-side locking.
2472 * @param flags whether to sync on the OST side;
2474 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2475 * LL_DV_WR_FLUSH: drop all cached pages, LCK_PW on OSTs
2477 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2479 struct ioc_data_version ioc = { .idv_flags = flags };
2482 rc = ll_ioc_data_version(inode, &ioc);
2484 *data_version = ioc.idv_version;
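/*
 * A minimal userspace sketch of the LL_IOC_DATA_VERSION ioctl that ends up
 * here (for illustration only; struct ioc_data_version and the LL_DV_*_FLUSH
 * flags are the ones used above):
 *
 *	struct ioc_data_version idv = { .idv_flags = LL_DV_RD_FLUSH };
 *
 *	if (ioctl(fd, LL_IOC_DATA_VERSION, &idv) == 0)
 *		printf("data version %llu\n",
 *		       (unsigned long long)idv.idv_version);
 */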
2490 * Trigger an HSM release request for the provided inode.
2492 int ll_hsm_release(struct inode *inode)
2495 struct obd_client_handle *och = NULL;
2496 __u64 data_version = 0;
2501 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2502 ll_get_fsname(inode->i_sb, NULL, 0),
2503 PFID(&ll_i2info(inode)->lli_fid));
2505 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2507 GOTO(out, rc = PTR_ERR(och));
2509 /* Grab latest data_version and [am]time values */
2510 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2514 env = cl_env_get(&refcheck);
2516 GOTO(out, rc = PTR_ERR(env));
2518 rc = ll_merge_attr(env, inode);
2519 cl_env_put(env, &refcheck);
2521 /* If an error happened, we have the wrong size for the file.
2527 /* Release the file.
2528 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2529 * we still need it to pack l_remote_handle to MDT. */
2530 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2536 if (och != NULL && !IS_ERR(och)) /* close the file */
2537 ll_lease_close(och, inode, NULL);
2542 struct ll_swap_stack {
2545 struct inode *inode1;
2546 struct inode *inode2;
2551 static int ll_swap_layouts(struct file *file1, struct file *file2,
2552 struct lustre_swap_layouts *lsl)
2554 struct mdc_swap_layouts msl;
2555 struct md_op_data *op_data;
2558 struct ll_swap_stack *llss = NULL;
2561 OBD_ALLOC_PTR(llss);
2565 llss->inode1 = file_inode(file1);
2566 llss->inode2 = file_inode(file2);
2568 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2572 /* we use two bools because they are easier to swap than two bits */
2573 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2574 llss->check_dv1 = true;
2576 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2577 llss->check_dv2 = true;
2579 /* we cannot use lsl->sl_dvX directly because we may swap them */
2580 llss->dv1 = lsl->sl_dv1;
2581 llss->dv2 = lsl->sl_dv2;
2583 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2584 if (rc == 0) /* same file, done! */
2587 if (rc < 0) { /* sequentialize it */
2588 swap(llss->inode1, llss->inode2);
2590 swap(llss->dv1, llss->dv2);
2591 swap(llss->check_dv1, llss->check_dv2);
2595 if (gid != 0) { /* application asks to flush dirty cache */
2596 rc = ll_get_grouplock(llss->inode1, file1, gid);
2600 rc = ll_get_grouplock(llss->inode2, file2, gid);
2602 ll_put_grouplock(llss->inode1, file1, gid);
2607 /* final check: before swapping the layouts we check whether the
2608 * data version has changed (if requested) */
2609 if (llss->check_dv1) {
2610 rc = ll_data_version(llss->inode1, &dv, 0);
2613 if (dv != llss->dv1)
2614 GOTO(putgl, rc = -EAGAIN);
2617 if (llss->check_dv2) {
2618 rc = ll_data_version(llss->inode2, &dv, 0);
2621 if (dv != llss->dv2)
2622 GOTO(putgl, rc = -EAGAIN);
2625 /* struct md_op_data is used to send the swap args to the MDT;
2626 * only the flags are missing, so we pass struct mdc_swap_layouts
2627 * through md_op_data->op_data */
2628 /* flags from user space have to be converted before they are sent to
2629 * the server; no flag is sent today, they are only used on the client */
2632 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2633 0, LUSTRE_OPC_ANY, &msl);
2634 if (IS_ERR(op_data))
2635 GOTO(free, rc = PTR_ERR(op_data));
2637 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2638 sizeof(*op_data), op_data, NULL);
2639 ll_finish_md_op_data(op_data);
2646 ll_put_grouplock(llss->inode2, file2, gid);
2647 ll_put_grouplock(llss->inode1, file1, gid);
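/*
 * A minimal userspace sketch of the LL_IOC_LOV_SWAP_LAYOUTS ioctl that
 * drives ll_swap_layouts() (for illustration only; sl_fd, sl_flags, sl_dv1
 * and sl_dv2 appear in ll_file_ioctl() below, while the sl_gid member is
 * assumed from the userspace lustre_user.h header):
 *
 *	struct lustre_swap_layouts lsl = {
 *		.sl_fd    = fd2,	// file to swap layouts with
 *		.sl_flags = SWAP_LAYOUTS_CHECK_DV1 | SWAP_LAYOUTS_CHECK_DV2,
 *		.sl_gid   = 1234,	// non-zero: flush caches via group lock
 *		.sl_dv1   = dv1,
 *		.sl_dv2   = dv2,
 *	};
 *
 *	rc = ioctl(fd1, LL_IOC_LOV_SWAP_LAYOUTS, &lsl);
 */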
2657 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2659 struct obd_export *exp = ll_i2mdexp(inode);
2660 struct md_op_data *op_data;
2664 /* Detect out-of-range masks */
2665 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2668 /* Non-root users are forbidden from setting or clearing flags which
2669 * are NOT defined in HSM_USER_MASK. */
2670 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2671 !cfs_capable(CFS_CAP_SYS_ADMIN))
2674 if (!exp_connect_archive_id_array(exp)) {
2675 /* Detect an out-of-range archive id */
2676 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2677 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2681 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2682 LUSTRE_OPC_ANY, hss);
2683 if (IS_ERR(op_data))
2684 RETURN(PTR_ERR(op_data));
2686 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2689 ll_finish_md_op_data(op_data);
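/*
 * A minimal userspace sketch of the LL_IOC_HSM_STATE_SET ioctl that reaches
 * ll_hsm_state_set() (for illustration only; the hss_* fields and HSS_*
 * flags are the ones used above, and HS_DIRTY is assumed to be one of the
 * flags within HSM_USER_MASK that unprivileged users may set):
 *
 *	struct hsm_state_set hss = {
 *		.hss_valid     = HSS_SETMASK,
 *		.hss_setmask   = HS_DIRTY,
 *		.hss_clearmask = 0,
 *	};
 *
 *	rc = ioctl(fd, LL_IOC_HSM_STATE_SET, &hss);
 */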
2694 static int ll_hsm_import(struct inode *inode, struct file *file,
2695 struct hsm_user_import *hui)
2697 struct hsm_state_set *hss = NULL;
2698 struct iattr *attr = NULL;
2702 if (!S_ISREG(inode->i_mode))
2708 GOTO(out, rc = -ENOMEM);
2710 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2711 hss->hss_archive_id = hui->hui_archive_id;
2712 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2713 rc = ll_hsm_state_set(inode, hss);
2717 OBD_ALLOC_PTR(attr);
2719 GOTO(out, rc = -ENOMEM);
2721 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2722 attr->ia_mode |= S_IFREG;
2723 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2724 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2725 attr->ia_size = hui->hui_size;
2726 attr->ia_mtime.tv_sec = hui->hui_mtime;
2727 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2728 attr->ia_atime.tv_sec = hui->hui_atime;
2729 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2731 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2732 ATTR_UID | ATTR_GID |
2733 ATTR_MTIME | ATTR_MTIME_SET |
2734 ATTR_ATIME | ATTR_ATIME_SET;
2738 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2742 inode_unlock(inode);
2754 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2756 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2757 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2760 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2762 struct inode *inode = file_inode(file);
2764 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2765 ATTR_MTIME | ATTR_MTIME_SET |
2768 .tv_sec = lfu->lfu_atime_sec,
2769 .tv_nsec = lfu->lfu_atime_nsec,
2772 .tv_sec = lfu->lfu_mtime_sec,
2773 .tv_nsec = lfu->lfu_mtime_nsec,
2776 .tv_sec = lfu->lfu_ctime_sec,
2777 .tv_nsec = lfu->lfu_ctime_nsec,
2783 if (!capable(CAP_SYS_ADMIN))
2786 if (!S_ISREG(inode->i_mode))
2790 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2792 inode_unlock(inode);
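/*
 * A minimal userspace sketch of the LL_IOC_FUTIMES_3 ioctl handled by
 * ll_file_futimes_3() above (for illustration only; the lfu_* fields are the
 * ones copied into the iattr above, and the caller needs CAP_SYS_ADMIN):
 *
 *	struct ll_futimes_3 lfu = {
 *		.lfu_atime_sec = at_sec, .lfu_atime_nsec = at_nsec,
 *		.lfu_mtime_sec = mt_sec, .lfu_mtime_nsec = mt_nsec,
 *		.lfu_ctime_sec = ct_sec, .lfu_ctime_nsec = ct_nsec,
 *	};
 *
 *	rc = ioctl(fd, LL_IOC_FUTIMES_3, &lfu);
 */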
2797 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2800 case MODE_READ_USER:
2802 case MODE_WRITE_USER:
2809 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2811 /* Used to allow the upper layers of the client to request an LDLM lock
2812 * without doing an actual read or write.
2814 * Used for ladvise lockahead to manually request specific locks.
2816 * \param[in] file file this ladvise lock request is on
2817 * \param[in] ladvise ladvise struct describing this lock request
2819 * \retval 0 success, no detailed result available (sync requests
2820 * and requests sent to the server [not handled locally]
2821 * cannot return detailed results)
2822 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2823 * see definitions for details.
2824 * \retval negative negative errno on error
2826 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2828 struct lu_env *env = NULL;
2829 struct cl_io *io = NULL;
2830 struct cl_lock *lock = NULL;
2831 struct cl_lock_descr *descr = NULL;
2832 struct dentry *dentry = file->f_path.dentry;
2833 struct inode *inode = dentry->d_inode;
2834 enum cl_lock_mode cl_mode;
2835 off_t start = ladvise->lla_start;
2836 off_t end = ladvise->lla_end;
2842 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2843 "start=%llu, end=%llu\n", dentry->d_name.len,
2844 dentry->d_name.name, dentry->d_inode,
2845 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2848 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2850 GOTO(out, result = cl_mode);
2852 /* Get IO environment */
2853 result = cl_io_get(inode, &env, &io, &refcheck);
2857 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2860 * nothing to do for this io. This currently happens when
2861 * stripe sub-objects are not yet created.
2863 result = io->ci_result;
2864 } else if (result == 0) {
2865 lock = vvp_env_lock(env);
2866 descr = &lock->cll_descr;
2868 descr->cld_obj = io->ci_obj;
2869 /* Convert byte offsets to pages */
2870 descr->cld_start = cl_index(io->ci_obj, start);
2871 descr->cld_end = cl_index(io->ci_obj, end);
2872 descr->cld_mode = cl_mode;
2873 /* CEF_MUST is used because we do not want to convert a
2874 * lockahead request to a lockless lock */
2875 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2878 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2879 descr->cld_enq_flags |= CEF_SPECULATIVE;
2881 result = cl_lock_request(env, io, lock);
2883 /* On success, we need to release the lock */
2885 cl_lock_release(env, lock);
2887 cl_io_fini(env, io);
2888 cl_env_put(env, &refcheck);
2890 /* -ECANCELED indicates a matching lock with a different extent
2891 * was already present, and -EEXIST indicates a matching lock
2892 * on exactly the same extent was already present.
2893 * We convert them to positive values for userspace to make
2894 * recognizing true errors easier.
2895 * Note we can only return these detailed results on async requests,
2896 * as sync requests look the same as I/O requests for locking. */
2897 if (result == -ECANCELED)
2898 result = LLA_RESULT_DIFFERENT;
2899 else if (result == -EEXIST)
2900 result = LLA_RESULT_SAME;
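/*
 * A minimal sketch of a lockahead advice as it arrives from userspace via
 * LL_IOC_LADVISE (for illustration only; the lla_* fields, LF_ASYNC and the
 * LLA_RESULT_* values are the ones handled above, and the entry is carried
 * inside a struct llapi_ladvise_hdr as in ll_file_ioctl() below):
 *
 *	struct llapi_lu_ladvise adv = {
 *		.lla_advice          = LU_LADVISE_LOCKAHEAD,
 *		.lla_lockahead_mode  = MODE_WRITE_USER,
 *		.lla_peradvice_flags = LF_ASYNC,
 *		.lla_start           = 0,
 *		.lla_end             = 1 << 20,
 *	};
 *
 * On an async request the detailed result is written back to
 * adv.lla_lockahead_result as LLA_RESULT_SAME or LLA_RESULT_DIFFERENT.
 */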
2905 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2907 static int ll_ladvise_sanity(struct inode *inode,
2908 struct llapi_lu_ladvise *ladvise)
2910 enum lu_ladvise_type advice = ladvise->lla_advice;
2911 /* Note the per-advice flags field is 32 bits wide, so per-advice flags
2912 * must be in the first 32 bits of enum ladvise_flags */
2913 __u32 flags = ladvise->lla_peradvice_flags;
2914 /* 3 lines at 80 characters per line, should be plenty */
2917 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2919 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2920 "last supported advice is %s (value '%d'): rc = %d\n",
2921 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2922 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2926 /* Per-advice checks */
2928 case LU_LADVISE_LOCKNOEXPAND:
2929 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2931 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2933 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2934 ladvise_names[advice], rc);
2938 case LU_LADVISE_LOCKAHEAD:
2939 /* Currently only READ and WRITE modes can be requested */
2940 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2941 ladvise->lla_lockahead_mode == 0) {
2943 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2945 ll_get_fsname(inode->i_sb, NULL, 0),
2946 ladvise->lla_lockahead_mode,
2947 ladvise_names[advice], rc);
2950 case LU_LADVISE_WILLREAD:
2951 case LU_LADVISE_DONTNEED:
2953 /* Note the fall-through above - these checks apply to all advice types
2954 * except LOCKNOEXPAND */
2955 if (flags & ~LF_DEFAULT_MASK) {
2957 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2959 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2960 ladvise_names[advice], rc);
2963 if (ladvise->lla_start >= ladvise->lla_end) {
2965 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2966 "for %s: rc = %d\n",
2967 ll_get_fsname(inode->i_sb, NULL, 0),
2968 ladvise->lla_start, ladvise->lla_end,
2969 ladvise_names[advice], rc);
2981 * Give file access advice
2983 * The ladvise interface is similar to the Linux fadvise() system call, except
2984 * that it forwards the advice directly from the Lustre client to the server.
2985 * The server-side code will apply appropriate read-ahead and caching
2986 * techniques for the corresponding files.
2988 * A typical workload for ladvise is e.g. a bunch of different clients
2989 * doing small random reads of a file, so prefetching pages into OSS cache
2990 * with big linear reads before the random IO is a net benefit. Fetching
2991 * all that data into each client cache with fadvise() may not be, due to
2992 * much more data being sent to the client.
2994 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2995 struct llapi_lu_ladvise *ladvise)
2999 struct cl_ladvise_io *lio;
3004 env = cl_env_get(&refcheck);
3006 RETURN(PTR_ERR(env));
3008 io = vvp_env_thread_io(env);
3009 io->ci_obj = ll_i2info(inode)->lli_clob;
3011 /* initialize parameters for ladvise */
3012 lio = &io->u.ci_ladvise;
3013 lio->li_start = ladvise->lla_start;
3014 lio->li_end = ladvise->lla_end;
3015 lio->li_fid = ll_inode2fid(inode);
3016 lio->li_advice = ladvise->lla_advice;
3017 lio->li_flags = flags;
3019 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3020 rc = cl_io_loop(env, io);
3024 cl_io_fini(env, io);
3025 cl_env_put(env, &refcheck);
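/*
 * A minimal userspace sketch of an LL_IOC_LADVISE request that reaches
 * ll_ladvise() (for illustration only; the lah_*/lla_* fields and
 * LADVISE_MAGIC are the ones validated in ll_file_ioctl() below):
 *
 *	size_t len = offsetof(struct llapi_ladvise_hdr, lah_advise[1]);
 *	struct llapi_ladvise_hdr *hdr = calloc(1, len);
 *
 *	hdr->lah_magic = LADVISE_MAGIC;
 *	hdr->lah_count = 1;
 *	hdr->lah_advise[0].lla_advice = LU_LADVISE_WILLREAD;
 *	hdr->lah_advise[0].lla_start  = 0;
 *	hdr->lah_advise[0].lla_end    = 16 << 20;
 *	rc = ioctl(fd, LL_IOC_LADVISE, hdr);
 */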
3029 static int ll_lock_noexpand(struct file *file, int flags)
3031 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3033 fd->ll_lock_no_expand = !(flags & LF_UNSET);
3038 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3041 struct fsxattr fsxattr;
3043 if (copy_from_user(&fsxattr,
3044 (const struct fsxattr __user *)arg,
3048 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3049 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3050 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3051 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3052 if (copy_to_user((struct fsxattr __user *)arg,
3053 &fsxattr, sizeof(fsxattr)))
3059 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3062 * Project Quota ID state is only allowed to change from within the init
3063 * namespace. Enforce that restriction only if we are trying to change
3064 * the quota ID state. Everything else is allowed in user namespaces.
3066 if (current_user_ns() == &init_user_ns)
3069 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3072 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3073 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3076 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3083 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3087 struct md_op_data *op_data;
3088 struct ptlrpc_request *req = NULL;
3090 struct fsxattr fsxattr;
3091 struct cl_object *obj;
3095 if (copy_from_user(&fsxattr,
3096 (const struct fsxattr __user *)arg,
3100 rc = ll_ioctl_check_project(inode, &fsxattr);
3104 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3105 LUSTRE_OPC_ANY, NULL);
3106 if (IS_ERR(op_data))
3107 RETURN(PTR_ERR(op_data));
3109 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3110 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3111 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3112 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3113 op_data->op_projid = fsxattr.fsx_projid;
3114 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3115 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3117 ptlrpc_req_finished(req);
3119 GOTO(out_fsxattr, rc);
3120 ll_update_inode_flags(inode, op_data->op_attr_flags);
3121 obj = ll_i2info(inode)->lli_clob;
3123 GOTO(out_fsxattr, rc);
3125 OBD_ALLOC_PTR(attr);
3127 GOTO(out_fsxattr, rc = -ENOMEM);
3129 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3130 fsxattr.fsx_xflags);
3133 ll_finish_md_op_data(op_data);
3137 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3140 struct inode *inode = file_inode(file);
3141 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3142 struct ll_inode_info *lli = ll_i2info(inode);
3143 struct obd_client_handle *och = NULL;
3144 struct split_param sp;
3147 enum mds_op_bias bias = 0;
3148 struct file *layout_file = NULL;
3150 size_t data_size = 0;
3154 mutex_lock(&lli->lli_och_mutex);
3155 if (fd->fd_lease_och != NULL) {
3156 och = fd->fd_lease_och;
3157 fd->fd_lease_och = NULL;
3159 mutex_unlock(&lli->lli_och_mutex);
3162 GOTO(out, rc = -ENOLCK);
3164 fmode = och->och_flags;
3166 switch (ioc->lil_flags) {
3167 case LL_LEASE_RESYNC_DONE:
3168 if (ioc->lil_count > IOC_IDS_MAX)
3169 GOTO(out, rc = -EINVAL);
3171 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3172 OBD_ALLOC(data, data_size);
3174 GOTO(out, rc = -ENOMEM);
3176 if (copy_from_user(data, (void __user *)arg, data_size))
3177 GOTO(out, rc = -EFAULT);
3179 bias = MDS_CLOSE_RESYNC_DONE;
3181 case LL_LEASE_LAYOUT_MERGE: {
3184 if (ioc->lil_count != 1)
3185 GOTO(out, rc = -EINVAL);
3187 arg += sizeof(*ioc);
3188 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3189 GOTO(out, rc = -EFAULT);
3191 layout_file = fget(fd);
3193 GOTO(out, rc = -EBADF);
3195 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3196 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3197 GOTO(out, rc = -EPERM);
3199 data = file_inode(layout_file);
3200 bias = MDS_CLOSE_LAYOUT_MERGE;
3203 case LL_LEASE_LAYOUT_SPLIT: {
3207 if (ioc->lil_count != 2)
3208 GOTO(out, rc = -EINVAL);
3210 arg += sizeof(*ioc);
3211 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3212 GOTO(out, rc = -EFAULT);
3214 arg += sizeof(__u32);
3215 if (copy_from_user(&mirror_id, (void __user *)arg,
3217 GOTO(out, rc = -EFAULT);
3219 layout_file = fget(fdv);
3221 GOTO(out, rc = -EBADF);
3223 sp.sp_inode = file_inode(layout_file);
3224 sp.sp_mirror_id = (__u16)mirror_id;
3226 bias = MDS_CLOSE_LAYOUT_SPLIT;
3230 /* without close intent */
3234 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3238 rc = ll_lease_och_release(inode, file);
3247 switch (ioc->lil_flags) {
3248 case LL_LEASE_RESYNC_DONE:
3250 OBD_FREE(data, data_size);
3252 case LL_LEASE_LAYOUT_MERGE:
3253 case LL_LEASE_LAYOUT_SPLIT:
3260 rc = ll_lease_type_from_fmode(fmode);
3264 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3267 struct inode *inode = file_inode(file);
3268 struct ll_inode_info *lli = ll_i2info(inode);
3269 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3270 struct obd_client_handle *och = NULL;
3271 __u64 open_flags = 0;
3277 switch (ioc->lil_mode) {
3278 case LL_LEASE_WRLCK:
3279 if (!(file->f_mode & FMODE_WRITE))
3281 fmode = FMODE_WRITE;
3283 case LL_LEASE_RDLCK:
3284 if (!(file->f_mode & FMODE_READ))
3288 case LL_LEASE_UNLCK:
3289 RETURN(ll_file_unlock_lease(file, ioc, arg));
3294 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3296 /* apply for lease */
3297 if (ioc->lil_flags & LL_LEASE_RESYNC)
3298 open_flags = MDS_OPEN_RESYNC;
3299 och = ll_lease_open(inode, file, fmode, open_flags);
3301 RETURN(PTR_ERR(och));
3303 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3304 rc = ll_lease_file_resync(och, inode, arg);
3306 ll_lease_close(och, inode, NULL);
3309 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3311 ll_lease_close(och, inode, NULL);
3317 mutex_lock(&lli->lli_och_mutex);
3318 if (fd->fd_lease_och == NULL) {
3319 fd->fd_lease_och = och;
3322 mutex_unlock(&lli->lli_och_mutex);
3324 /* should not happen, since only exclusive leases are supported for now */
3325 ll_lease_close(och, inode, &lease_broken);
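/*
 * A minimal userspace sketch of the lease ioctls that end up in
 * ll_file_set_lease()/ll_file_unlock_lease() (for illustration only; the
 * lil_* fields and LL_LEASE_* modes are the ones used above):
 *
 *	struct ll_ioc_lease ioc = { .lil_mode = LL_LEASE_WRLCK };
 *
 *	rc = ioctl(fd, LL_IOC_SET_LEASE, &ioc);	// take a write lease
 *	rc = ioctl(fd, LL_IOC_GET_LEASE);	// LL_LEASE_WRLCK while held
 *	ioc.lil_mode = LL_LEASE_UNLCK;
 *	rc = ioctl(fd, LL_IOC_SET_LEASE, &ioc);	// release the lease
 */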
3332 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3334 struct inode *inode = file_inode(file);
3335 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3339 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3340 PFID(ll_inode2fid(inode)), inode, cmd);
3341 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3343 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3344 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3348 case LL_IOC_GETFLAGS:
3349 /* Get the current value of the file flags */
3350 return put_user(fd->fd_flags, (int __user *)arg);
3351 case LL_IOC_SETFLAGS:
3352 case LL_IOC_CLRFLAGS:
3353 /* Set or clear specific file flags */
3354 /* XXX This probably needs checks to ensure the flags are
3355 * not abused, and to handle any flag side effects.
3357 if (get_user(flags, (int __user *) arg))
3360 if (cmd == LL_IOC_SETFLAGS) {
3361 if ((flags & LL_FILE_IGNORE_LOCK) &&
3362 !(file->f_flags & O_DIRECT)) {
3363 CERROR("%s: unable to disable locking on "
3364 "non-O_DIRECT file\n", current->comm);
3368 fd->fd_flags |= flags;
3370 fd->fd_flags &= ~flags;
3373 case LL_IOC_LOV_SETSTRIPE:
3374 case LL_IOC_LOV_SETSTRIPE_NEW:
3375 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3376 case LL_IOC_LOV_SETEA:
3377 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3378 case LL_IOC_LOV_SWAP_LAYOUTS: {
3380 struct lustre_swap_layouts lsl;
3382 if (copy_from_user(&lsl, (char __user *)arg,
3383 sizeof(struct lustre_swap_layouts)))
3386 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3389 file2 = fget(lsl.sl_fd);
3393 /* O_WRONLY or O_RDWR */
3394 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3395 GOTO(out, rc = -EPERM);
3397 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3398 struct inode *inode2;
3399 struct ll_inode_info *lli;
3400 struct obd_client_handle *och = NULL;
3402 lli = ll_i2info(inode);
3403 mutex_lock(&lli->lli_och_mutex);
3404 if (fd->fd_lease_och != NULL) {
3405 och = fd->fd_lease_och;
3406 fd->fd_lease_och = NULL;
3408 mutex_unlock(&lli->lli_och_mutex);
3410 GOTO(out, rc = -ENOLCK);
3411 inode2 = file_inode(file2);
3412 rc = ll_swap_layouts_close(och, inode, inode2);
3414 rc = ll_swap_layouts(file, file2, &lsl);
3420 case LL_IOC_LOV_GETSTRIPE:
3421 case LL_IOC_LOV_GETSTRIPE_NEW:
3422 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3423 case FS_IOC_GETFLAGS:
3424 case FS_IOC_SETFLAGS:
3425 RETURN(ll_iocontrol(inode, file, cmd, arg));
3426 case FSFILT_IOC_GETVERSION:
3427 case FS_IOC_GETVERSION:
3428 RETURN(put_user(inode->i_generation, (int __user *)arg));
3429 /* We need to special-case any other ioctls we want to handle,
3430 * to send them to the MDS/OST as appropriate and to properly
3431 * network-encode the arg field. */
3432 case FS_IOC_SETVERSION:
3435 case LL_IOC_GROUP_LOCK:
3436 RETURN(ll_get_grouplock(inode, file, arg));
3437 case LL_IOC_GROUP_UNLOCK:
3438 RETURN(ll_put_grouplock(inode, file, arg));
3439 case IOC_OBD_STATFS:
3440 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3442 case LL_IOC_FLUSHCTX:
3443 RETURN(ll_flush_ctx(inode));
3444 case LL_IOC_PATH2FID: {
3445 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3446 sizeof(struct lu_fid)))
3451 case LL_IOC_GETPARENT:
3452 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3454 case OBD_IOC_FID2PATH:
3455 RETURN(ll_fid2path(inode, (void __user *)arg));
3456 case LL_IOC_DATA_VERSION: {
3457 struct ioc_data_version idv;
3460 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3463 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3464 rc = ll_ioc_data_version(inode, &idv);
3467 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3473 case LL_IOC_GET_MDTIDX: {
3476 mdtidx = ll_get_mdt_idx(inode);
3480 if (put_user((int)mdtidx, (int __user *)arg))
3485 case OBD_IOC_GETDTNAME:
3486 case OBD_IOC_GETMDNAME:
3487 RETURN(ll_get_obd_name(inode, cmd, arg));
3488 case LL_IOC_HSM_STATE_GET: {
3489 struct md_op_data *op_data;
3490 struct hsm_user_state *hus;
3497 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3498 LUSTRE_OPC_ANY, hus);
3499 if (IS_ERR(op_data)) {
3501 RETURN(PTR_ERR(op_data));
3504 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3507 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3510 ll_finish_md_op_data(op_data);
3514 case LL_IOC_HSM_STATE_SET: {
3515 struct hsm_state_set *hss;
3522 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3527 rc = ll_hsm_state_set(inode, hss);
3532 case LL_IOC_HSM_ACTION: {
3533 struct md_op_data *op_data;
3534 struct hsm_current_action *hca;
3541 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3542 LUSTRE_OPC_ANY, hca);
3543 if (IS_ERR(op_data)) {
3545 RETURN(PTR_ERR(op_data));
3548 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3551 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3554 ll_finish_md_op_data(op_data);
3558 case LL_IOC_SET_LEASE_OLD: {
3559 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3561 RETURN(ll_file_set_lease(file, &ioc, 0));
3563 case LL_IOC_SET_LEASE: {
3564 struct ll_ioc_lease ioc;
3566 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3569 RETURN(ll_file_set_lease(file, &ioc, arg));
3571 case LL_IOC_GET_LEASE: {
3572 struct ll_inode_info *lli = ll_i2info(inode);
3573 struct ldlm_lock *lock = NULL;
3576 mutex_lock(&lli->lli_och_mutex);
3577 if (fd->fd_lease_och != NULL) {
3578 struct obd_client_handle *och = fd->fd_lease_och;
3580 lock = ldlm_handle2lock(&och->och_lease_handle);
3582 lock_res_and_lock(lock);
3583 if (!ldlm_is_cancel(lock))
3584 fmode = och->och_flags;
3586 unlock_res_and_lock(lock);
3587 LDLM_LOCK_PUT(lock);
3590 mutex_unlock(&lli->lli_och_mutex);
3592 RETURN(ll_lease_type_from_fmode(fmode));
3594 case LL_IOC_HSM_IMPORT: {
3595 struct hsm_user_import *hui;
3601 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3606 rc = ll_hsm_import(inode, file, hui);
3611 case LL_IOC_FUTIMES_3: {
3612 struct ll_futimes_3 lfu;
3614 if (copy_from_user(&lfu,
3615 (const struct ll_futimes_3 __user *)arg,
3619 RETURN(ll_file_futimes_3(file, &lfu));
3621 case LL_IOC_LADVISE: {
3622 struct llapi_ladvise_hdr *k_ladvise_hdr;
3623 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3626 int alloc_size = sizeof(*k_ladvise_hdr);
3629 u_ladvise_hdr = (void __user *)arg;
3630 OBD_ALLOC_PTR(k_ladvise_hdr);
3631 if (k_ladvise_hdr == NULL)
3634 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3635 GOTO(out_ladvise, rc = -EFAULT);
3637 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3638 k_ladvise_hdr->lah_count < 1)
3639 GOTO(out_ladvise, rc = -EINVAL);
3641 num_advise = k_ladvise_hdr->lah_count;
3642 if (num_advise >= LAH_COUNT_MAX)
3643 GOTO(out_ladvise, rc = -EFBIG);
3645 OBD_FREE_PTR(k_ladvise_hdr);
3646 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3647 lah_advise[num_advise]);
3648 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3649 if (k_ladvise_hdr == NULL)
3653 * TODO: submit multiple advices to one server in a single RPC
3655 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3656 GOTO(out_ladvise, rc = -EFAULT);
3658 for (i = 0; i < num_advise; i++) {
3659 struct llapi_lu_ladvise *k_ladvise =
3660 &k_ladvise_hdr->lah_advise[i];
3661 struct llapi_lu_ladvise __user *u_ladvise =
3662 &u_ladvise_hdr->lah_advise[i];
3664 rc = ll_ladvise_sanity(inode, k_ladvise);
3666 GOTO(out_ladvise, rc);
3668 switch (k_ladvise->lla_advice) {
3669 case LU_LADVISE_LOCKNOEXPAND:
3670 rc = ll_lock_noexpand(file,
3671 k_ladvise->lla_peradvice_flags);
3672 GOTO(out_ladvise, rc);
3673 case LU_LADVISE_LOCKAHEAD:
3675 rc = ll_file_lock_ahead(file, k_ladvise);
3678 GOTO(out_ladvise, rc);
3681 &u_ladvise->lla_lockahead_result))
3682 GOTO(out_ladvise, rc = -EFAULT);
3685 rc = ll_ladvise(inode, file,
3686 k_ladvise_hdr->lah_flags,
3689 GOTO(out_ladvise, rc);
3696 OBD_FREE(k_ladvise_hdr, alloc_size);
3699 case LL_IOC_FLR_SET_MIRROR: {
3700 /* mirror I/O must be direct to avoid polluting page cache
3702 if (!(file->f_flags & O_DIRECT))
3705 fd->fd_designated_mirror = (__u32)arg;
3708 case LL_IOC_FSGETXATTR:
3709 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3710 case LL_IOC_FSSETXATTR:
3711 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3713 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3715 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3716 (void __user *)arg));
3720 #ifndef HAVE_FILE_LLSEEK_SIZE
3721 static inline loff_t
3722 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3724 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3726 if (offset > maxsize)
3729 if (offset != file->f_pos) {
3730 file->f_pos = offset;
3731 file->f_version = 0;
3737 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3738 loff_t maxsize, loff_t eof)
3740 struct inode *inode = file_inode(file);
3748 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3749 * position-querying operation. Avoid rewriting the "same"
3750 * f_pos value back to the file because a concurrent read(),
3751 * write() or lseek() might have altered it
3756 * f_lock protects against read/modify/write race with other
3757 * SEEK_CURs. Note that parallel writes and reads behave
3761 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3762 inode_unlock(inode);
3766 * In the generic case the entire file is data, so as long as
3767 * offset isn't at the end of the file then the offset is data.
3774 * There is a virtual hole at the end of the file, so as long as
3775 * offset isn't i_size or larger, return i_size.
3783 return llseek_execute(file, offset, maxsize);
3787 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3789 struct inode *inode = file_inode(file);
3790 loff_t retval, eof = 0;
3793 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3794 (origin == SEEK_CUR) ? file->f_pos : 0);
3795 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3796 PFID(ll_inode2fid(inode)), inode, retval, retval,
3798 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3800 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3801 retval = ll_glimpse_size(inode);
3804 eof = i_size_read(inode);
3807 retval = ll_generic_file_llseek_size(file, offset, origin,
3808 ll_file_maxbytes(inode), eof);
3812 static int ll_flush(struct file *file, fl_owner_t id)
3814 struct inode *inode = file_inode(file);
3815 struct ll_inode_info *lli = ll_i2info(inode);
3816 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3819 LASSERT(!S_ISDIR(inode->i_mode));
3821 /* catch async errors that were recorded back when async writeback
3822 * failed for pages in this mapping. */
3823 rc = lli->lli_async_rc;
3824 lli->lli_async_rc = 0;
3825 if (lli->lli_clob != NULL) {
3826 err = lov_read_and_clear_async_rc(lli->lli_clob);
3831 /* The application has already been told about the write failure.
3832 * Do not report the failure again. */
3833 if (fd->fd_write_failed)
3835 return rc ? -EIO : 0;
3839 * Called to make sure a portion of the file has been written out.
3840 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OST.
3842 * Return how many pages have been written.
3844 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3845 enum cl_fsync_mode mode, int ignore_layout)
3849 struct cl_fsync_io *fio;
3854 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3855 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3858 env = cl_env_get(&refcheck);
3860 RETURN(PTR_ERR(env));
3862 io = vvp_env_thread_io(env);
3863 io->ci_obj = ll_i2info(inode)->lli_clob;
3864 io->ci_ignore_layout = ignore_layout;
3866 /* initialize parameters for sync */
3867 fio = &io->u.ci_fsync;
3868 fio->fi_start = start;
3870 fio->fi_fid = ll_inode2fid(inode);
3871 fio->fi_mode = mode;
3872 fio->fi_nr_written = 0;
3874 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3875 result = cl_io_loop(env, io);
3877 result = io->ci_result;
3879 result = fio->fi_nr_written;
3880 cl_io_fini(env, io);
3881 cl_env_put(env, &refcheck);
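/*
 * A minimal in-kernel usage sketch (mirroring the CL_FSYNC_ALL call made
 * from ll_fsync() below); a positive return value is the number of pages
 * written:
 *
 *	rc = cl_sync_file_range(inode, 0, LLONG_MAX, CL_FSYNC_ALL, 0);
 *	if (rc > 0)
 *		rc = 0;
 */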
3887 * When dentry is provided (the 'else' case), file_dentry() may be
3888 * null and dentry must be used directly rather than pulled from
3889 * file_dentry() as is done otherwise.
3892 #ifdef HAVE_FILE_FSYNC_4ARGS
3893 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3895 struct dentry *dentry = file_dentry(file);
3897 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3898 int ll_fsync(struct file *file, int datasync)
3900 struct dentry *dentry = file_dentry(file);
3902 loff_t end = LLONG_MAX;
3904 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3907 loff_t end = LLONG_MAX;
3909 struct inode *inode = dentry->d_inode;
3910 struct ll_inode_info *lli = ll_i2info(inode);
3911 struct ptlrpc_request *req;
3915 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3916 PFID(ll_inode2fid(inode)), inode);
3917 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3919 #ifdef HAVE_FILE_FSYNC_4ARGS
3920 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3921 lock_inode = !lli->lli_inode_locked;
3925 /* fsync's caller has already called _fdata{sync,write}, we want
3926 * that IO to finish before calling the osc and mdc sync methods */
3927 rc = filemap_fdatawait(inode->i_mapping);
3930 /* catch async errors that were recorded back when async writeback
3931 * failed for pages in this mapping. */
3932 if (!S_ISDIR(inode->i_mode)) {
3933 err = lli->lli_async_rc;
3934 lli->lli_async_rc = 0;
3937 if (lli->lli_clob != NULL) {
3938 err = lov_read_and_clear_async_rc(lli->lli_clob);
3944 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3948 ptlrpc_req_finished(req);
3950 if (S_ISREG(inode->i_mode)) {
3951 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3953 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3954 if (rc == 0 && err < 0)
3957 fd->fd_write_failed = true;
3959 fd->fd_write_failed = false;
3962 #ifdef HAVE_FILE_FSYNC_4ARGS
3964 inode_unlock(inode);
3970 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3972 struct inode *inode = file_inode(file);
3973 struct ll_sb_info *sbi = ll_i2sbi(inode);
3974 struct ldlm_enqueue_info einfo = {
3975 .ei_type = LDLM_FLOCK,
3976 .ei_cb_cp = ldlm_flock_completion_ast,
3977 .ei_cbdata = file_lock,
3979 struct md_op_data *op_data;
3980 struct lustre_handle lockh = { 0 };
3981 union ldlm_policy_data flock = { { 0 } };
3982 int fl_type = file_lock->fl_type;
3988 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3989 PFID(ll_inode2fid(inode)), file_lock);
3991 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3993 if (file_lock->fl_flags & FL_FLOCK) {
3994 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3995 /* flocks are whole-file locks */
3996 flock.l_flock.end = OFFSET_MAX;
3997 /* For flocks the owner is determined by the local file descriptor */
3998 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3999 } else if (file_lock->fl_flags & FL_POSIX) {
4000 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4001 flock.l_flock.start = file_lock->fl_start;
4002 flock.l_flock.end = file_lock->fl_end;
4006 flock.l_flock.pid = file_lock->fl_pid;
4008 /* Somewhat ugly workaround for svc lockd.
4009 * lockd installs a custom fl_lmops->lm_compare_owner that checks
4010 * that the fl_owner is the same (which it always is on the local node,
4011 * I guess, between lockd processes) and then compares the pid.
4012 * As such we assign the pid to the owner field to make it all work;
4013 * a conflict with normal locks is unlikely since the pid space and
4014 * the pointer space for current->files do not intersect */
4015 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4016 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4020 einfo.ei_mode = LCK_PR;
4023 /* An unlock request may or may not have any relation to
4024 * existing locks so we may not be able to pass a lock handle
4025 * via a normal ldlm_lock_cancel() request. The request may even
4026 * unlock a byte range in the middle of an existing lock. In
4027 * order to process an unlock request we need all of the same
4028 * information that is given with a normal read or write record
4029 * lock request. To avoid creating another ldlm unlock (cancel)
4030 * message we'll treat a LCK_NL flock request as an unlock. */
4031 einfo.ei_mode = LCK_NL;
4034 einfo.ei_mode = LCK_PW;
4037 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4052 flags = LDLM_FL_BLOCK_NOWAIT;
4058 flags = LDLM_FL_TEST_LOCK;
4061 CERROR("unknown fcntl lock command: %d\n", cmd);
4065 /* Save the old mode so that if the mode in the lock changes we
4066 * can decrement the appropriate reader or writer refcount. */
4067 file_lock->fl_type = einfo.ei_mode;
4069 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4070 LUSTRE_OPC_ANY, NULL);
4071 if (IS_ERR(op_data))
4072 RETURN(PTR_ERR(op_data));
4074 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4075 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4076 flock.l_flock.pid, flags, einfo.ei_mode,
4077 flock.l_flock.start, flock.l_flock.end);
4079 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4082 /* Restore the file lock type if not TEST lock. */
4083 if (!(flags & LDLM_FL_TEST_LOCK))
4084 file_lock->fl_type = fl_type;
4086 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4087 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4088 !(flags & LDLM_FL_TEST_LOCK))
4089 rc2 = locks_lock_file_wait(file, file_lock);
4091 if ((file_lock->fl_flags & FL_FLOCK) &&
4092 (rc == 0 || file_lock->fl_type == F_UNLCK))
4093 rc2 = flock_lock_file_wait(file, file_lock);
4094 if ((file_lock->fl_flags & FL_POSIX) &&
4095 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4096 !(flags & LDLM_FL_TEST_LOCK))
4097 rc2 = posix_lock_file_wait(file, file_lock);
4098 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4100 if (rc2 && file_lock->fl_type != F_UNLCK) {
4101 einfo.ei_mode = LCK_NL;
4102 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4107 ll_finish_md_op_data(op_data);
4112 int ll_get_fid_by_name(struct inode *parent, const char *name,
4113 int namelen, struct lu_fid *fid,
4114 struct inode **inode)
4116 struct md_op_data *op_data = NULL;
4117 struct mdt_body *body;
4118 struct ptlrpc_request *req;
4122 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4123 LUSTRE_OPC_ANY, NULL);
4124 if (IS_ERR(op_data))
4125 RETURN(PTR_ERR(op_data));
4127 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4128 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4129 ll_finish_md_op_data(op_data);
4133 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4135 GOTO(out_req, rc = -EFAULT);
4137 *fid = body->mbo_fid1;
4140 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4142 ptlrpc_req_finished(req);
4146 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4149 struct dentry *dchild = NULL;
4150 struct inode *child_inode = NULL;
4151 struct md_op_data *op_data;
4152 struct ptlrpc_request *request = NULL;
4153 struct obd_client_handle *och = NULL;
4155 struct mdt_body *body;
4156 __u64 data_version = 0;
4157 size_t namelen = strlen(name);
4158 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4162 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4163 PFID(ll_inode2fid(parent)), name,
4164 lum->lum_stripe_offset, lum->lum_stripe_count);
4166 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4167 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4168 lustre_swab_lmv_user_md(lum);
4170 /* Get child FID first */
4171 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4174 dchild = d_lookup(file_dentry(file), &qstr);
4176 if (dchild->d_inode)
4177 child_inode = igrab(dchild->d_inode);
4182 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4191 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4192 OBD_CONNECT2_DIR_MIGRATE)) {
4193 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4194 ll_i2info(child_inode)->lli_lsm_md) {
4195 CERROR("%s: MDT doesn't support stripe directory "
4197 ll_get_fsname(parent->i_sb, NULL, 0));
4198 GOTO(out_iput, rc = -EOPNOTSUPP);
4203 * lfs migrate command needs to be blocked on the client
4204 * by checking the migrate FID against the FID of the
4207 if (child_inode == parent->i_sb->s_root->d_inode)
4208 GOTO(out_iput, rc = -EINVAL);
4210 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4211 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4212 if (IS_ERR(op_data))
4213 GOTO(out_iput, rc = PTR_ERR(op_data));
4215 inode_lock(child_inode);
4216 op_data->op_fid3 = *ll_inode2fid(child_inode);
4217 if (!fid_is_sane(&op_data->op_fid3)) {
4218 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4219 ll_get_fsname(parent->i_sb, NULL, 0), name,
4220 PFID(&op_data->op_fid3));
4221 GOTO(out_unlock, rc = -EINVAL);
4224 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4225 op_data->op_data = lum;
4226 op_data->op_data_size = lumlen;
4229 if (S_ISREG(child_inode->i_mode)) {
4230 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4234 GOTO(out_unlock, rc);
4237 rc = ll_data_version(child_inode, &data_version,
4240 GOTO(out_close, rc);
4242 op_data->op_open_handle = och->och_open_handle;
4243 op_data->op_data_version = data_version;
4244 op_data->op_lease_handle = och->och_lease_handle;
4245 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4247 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4248 och->och_mod->mod_open_req->rq_replay = 0;
4249 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4252 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4253 name, namelen, &request);
4255 LASSERT(request != NULL);
4256 ll_update_times(request, parent);
4258 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4259 LASSERT(body != NULL);
4261 /* If the server does release the layout lock, then we clean up
4262 * the client och here; otherwise release it in out_close: */
4263 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4264 obd_mod_put(och->och_mod);
4265 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4267 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4273 if (request != NULL) {
4274 ptlrpc_req_finished(request);
4278 /* Try again if the file layout has changed. */
4279 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4284 ll_lease_close(och, child_inode, NULL);
4286 clear_nlink(child_inode);
4288 inode_unlock(child_inode);
4289 ll_finish_md_op_data(op_data);
4296 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4304 * Test if some locks matching bits and l_req_mode are acquired
4305 * - bits can be in different locks
4306 * - if found, clear the common lock bits in *bits
4307 * - the bits not found are kept in *bits
4309 * \param bits [IN] searched lock bits
4310 * \param l_req_mode [IN] searched lock mode
4311 * \retval boolean, true iff all bits are found
4313 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4315 struct lustre_handle lockh;
4316 union ldlm_policy_data policy;
4317 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4318 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4327 fid = &ll_i2info(inode)->lli_fid;
4328 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4329 ldlm_lockname[mode]);
4331 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4332 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4333 policy.l_inodebits.bits = *bits & (1 << i);
4334 if (policy.l_inodebits.bits == 0)
4337 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4338 &policy, mode, &lockh)) {
4339 struct ldlm_lock *lock;
4341 lock = ldlm_handle2lock(&lockh);
4344 ~(lock->l_policy_data.l_inodebits.bits);
4345 LDLM_LOCK_PUT(lock);
4347 *bits &= ~policy.l_inodebits.bits;
4354 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4355 struct lustre_handle *lockh, __u64 flags,
4356 enum ldlm_mode mode)
4358 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4363 fid = &ll_i2info(inode)->lli_fid;
4364 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4366 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4367 fid, LDLM_IBITS, &policy, mode, lockh);
4372 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4374 /* Already unlinked. Just update nlink and return success */
4375 if (rc == -ENOENT) {
4377 /* If it is a striped directory and there is a bad stripe,
4378 * let's revalidate the dentry again, instead of returning
4380 if (S_ISDIR(inode->i_mode) &&
4381 ll_i2info(inode)->lli_lsm_md != NULL)
4384 /* This path cannot be hit for regular files unless in
4385 * case of obscure races, so no need to validate
4387 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4389 } else if (rc != 0) {
4390 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4391 "%s: revalidate FID "DFID" error: rc = %d\n",
4392 ll_get_fsname(inode->i_sb, NULL, 0),
4393 PFID(ll_inode2fid(inode)), rc);
4399 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4401 struct inode *inode = dentry->d_inode;
4402 struct obd_export *exp = ll_i2mdexp(inode);
4403 struct lookup_intent oit = {
4406 struct ptlrpc_request *req = NULL;
4407 struct md_op_data *op_data;
4411 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4412 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4414 /* Call getattr by FID, so do not provide a name at all. */
4415 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4416 LUSTRE_OPC_ANY, NULL);
4417 if (IS_ERR(op_data))
4418 RETURN(PTR_ERR(op_data));
4420 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4421 ll_finish_md_op_data(op_data);
4423 rc = ll_inode_revalidate_fini(inode, rc);
4427 rc = ll_revalidate_it_finish(req, &oit, dentry);
4429 ll_intent_release(&oit);
4433 /* Unlinked? Unhash dentry, so it is not picked up later by
4434 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4435 * here to preserve get_cwd functionality on 2.6.
4437 if (!dentry->d_inode->i_nlink) {
4438 ll_lock_dcache(inode);
4439 d_lustre_invalidate(dentry, 0);
4440 ll_unlock_dcache(inode);
4443 ll_lookup_finish_locks(&oit, dentry);
4445 ptlrpc_req_finished(req);
4450 static int ll_merge_md_attr(struct inode *inode)
4452 struct ll_inode_info *lli = ll_i2info(inode);
4453 struct cl_attr attr = { 0 };
4456 LASSERT(lli->lli_lsm_md != NULL);
4457 down_read(&lli->lli_lsm_sem);
4458 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4459 &attr, ll_md_blocking_ast);
4460 up_read(&lli->lli_lsm_sem);
4464 set_nlink(inode, attr.cat_nlink);
4465 inode->i_blocks = attr.cat_blocks;
4466 i_size_write(inode, attr.cat_size);
4468 ll_i2info(inode)->lli_atime = attr.cat_atime;
4469 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4470 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4475 static inline dev_t ll_compat_encode_dev(dev_t dev)
4477 /* The compat_sys_*stat*() syscalls will fail unless the
4478 * device majors and minors are both less than 256. Note that
4479 * the value returned here will be passed through
4480 * old_encode_dev() in cp_compat_stat(). And so we are not
4481 * trying to return a valid compat (u16) device number, just
4482 * one that will pass the old_valid_dev() check. */
4484 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4487 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4488 int ll_getattr(const struct path *path, struct kstat *stat,
4489 u32 request_mask, unsigned int flags)
4491 struct dentry *de = path->dentry;
4493 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4496 struct inode *inode = de->d_inode;
4497 struct ll_sb_info *sbi = ll_i2sbi(inode);
4498 struct ll_inode_info *lli = ll_i2info(inode);
4501 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4503 rc = ll_inode_revalidate(de, IT_GETATTR);
4507 if (S_ISREG(inode->i_mode)) {
4508 /* In case of restore, the MDT has the right size and has
4509 * already sent it back without granting the layout lock;
4510 * the inode is up-to-date, so a glimpse is useless.
4511 * Also, to glimpse we need the layout; in case of a running
4512 * restore the MDT holds the layout lock, so the glimpse will
4513 * block until the end of the restore (getattr will block)
4515 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4516 rc = ll_glimpse_size(inode);
4521 /* If the object isn't a regular file then don't validate its size. */
4522 if (S_ISDIR(inode->i_mode) &&
4523 lli->lli_lsm_md != NULL) {
4524 rc = ll_merge_md_attr(inode);
4529 LTIME_S(inode->i_atime) = lli->lli_atime;
4530 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4531 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4534 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4536 if (ll_need_32bit_api(sbi)) {
4537 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4538 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4539 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4541 stat->ino = inode->i_ino;
4542 stat->dev = inode->i_sb->s_dev;
4543 stat->rdev = inode->i_rdev;
4546 stat->mode = inode->i_mode;
4547 stat->uid = inode->i_uid;
4548 stat->gid = inode->i_gid;
4549 stat->atime = inode->i_atime;
4550 stat->mtime = inode->i_mtime;
4551 stat->ctime = inode->i_ctime;
4552 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4554 stat->nlink = inode->i_nlink;
4555 stat->size = i_size_read(inode);
4556 stat->blocks = inode->i_blocks;
4561 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4562 __u64 start, __u64 len)
4566 struct fiemap *fiemap;
4567 unsigned int extent_count = fieinfo->fi_extents_max;
4569 num_bytes = sizeof(*fiemap) + (extent_count *
4570 sizeof(struct fiemap_extent));
4571 OBD_ALLOC_LARGE(fiemap, num_bytes);
4576 fiemap->fm_flags = fieinfo->fi_flags;
4577 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4578 fiemap->fm_start = start;
4579 fiemap->fm_length = len;
4580 if (extent_count > 0 &&
4581 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4582 sizeof(struct fiemap_extent)) != 0)
4583 GOTO(out, rc = -EFAULT);
4585 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4587 fieinfo->fi_flags = fiemap->fm_flags;
4588 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4589 if (extent_count > 0 &&
4590 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4591 fiemap->fm_mapped_extents *
4592 sizeof(struct fiemap_extent)) != 0)
4593 GOTO(out, rc = -EFAULT);
4595 OBD_FREE_LARGE(fiemap, num_bytes);
4599 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4601 struct ll_inode_info *lli = ll_i2info(inode);
4602 struct posix_acl *acl = NULL;
4605 spin_lock(&lli->lli_lock);
4606 /* VFS' acl_permission_check->check_acl will release the refcount */
4607 acl = posix_acl_dup(lli->lli_posix_acl);
4608 spin_unlock(&lli->lli_lock);
4613 #ifdef HAVE_IOP_SET_ACL
4614 #ifdef CONFIG_FS_POSIX_ACL
4615 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4617 struct ll_sb_info *sbi = ll_i2sbi(inode);
4618 struct ptlrpc_request *req = NULL;
4619 const char *name = NULL;
4621 size_t value_size = 0;
4626 case ACL_TYPE_ACCESS:
4627 name = XATTR_NAME_POSIX_ACL_ACCESS;
4629 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4632 case ACL_TYPE_DEFAULT:
4633 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4634 if (!S_ISDIR(inode->i_mode))
4635 rc = acl ? -EACCES : 0;
4646 value_size = posix_acl_xattr_size(acl->a_count);
4647 value = kmalloc(value_size, GFP_NOFS);
4649 GOTO(out, rc = -ENOMEM);
4651 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4653 GOTO(out_value, rc);
4656 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4657 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4658 name, value, value_size, 0, 0, &req);
4660 ptlrpc_req_finished(req);
4665 forget_cached_acl(inode, type);
4667 set_cached_acl(inode, type, acl);
4670 #endif /* CONFIG_FS_POSIX_ACL */
4671 #endif /* HAVE_IOP_SET_ACL */
4673 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4675 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4676 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4678 ll_check_acl(struct inode *inode, int mask)
4681 # ifdef CONFIG_FS_POSIX_ACL
4682 struct posix_acl *acl;
4686 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4687 if (flags & IPERM_FLAG_RCU)
4690 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4695 rc = posix_acl_permission(inode, acl, mask);
4696 posix_acl_release(acl);
4699 # else /* !CONFIG_FS_POSIX_ACL */
4701 # endif /* CONFIG_FS_POSIX_ACL */
4703 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4705 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4706 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4708 # ifdef HAVE_INODE_PERMISION_2ARGS
4709 int ll_inode_permission(struct inode *inode, int mask)
4711 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4716 struct ll_sb_info *sbi;
4717 struct root_squash_info *squash;
4718 struct cred *cred = NULL;
4719 const struct cred *old_cred = NULL;
4721 bool squash_id = false;
4724 #ifdef MAY_NOT_BLOCK
4725 if (mask & MAY_NOT_BLOCK)
4727 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4728 if (flags & IPERM_FLAG_RCU)
4732 /* as the root inode is NOT validated in the lookup operation,
4733 * we need to do it before the permission check. */
4735 if (inode == inode->i_sb->s_root->d_inode) {
4736 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4741 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4742 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4744 /* squash fsuid/fsgid if needed */
4745 sbi = ll_i2sbi(inode);
4746 squash = &sbi->ll_squash;
4747 if (unlikely(squash->rsi_uid != 0 &&
4748 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4749 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4753 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4754 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4755 squash->rsi_uid, squash->rsi_gid);
4757 /* update the current process's credentials
4758 * and FS capabilities */
4759 cred = prepare_creds();
4763 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4764 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4765 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4766 if ((1 << cap) & CFS_CAP_FS_MASK)
4767 cap_lower(cred->cap_effective, cap);
4769 old_cred = override_creds(cred);
4772 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4773 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4774 /* restore the current process's credentials and FS capabilities */
4776 revert_creds(old_cred);
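/*
 * Worked example (illustrative values): with root squash configured as
 * 99:99 for this file system, a process running as root (fsuid 0 / fsgid 0)
 * performs the permission check above with
 *
 *	fsuid 0 -> 99, fsgid 0 -> 99, and the filesystem capabilities in
 *	CFS_CAP_FS_MASK lowered,
 *
 * so it is treated like an ordinary unprivileged user unless
 * LL_SBI_NOROOTSQUASH is set for this client.
 */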
4783 /* -o localflock - only provides locally consistent flock locks */
4784 struct file_operations ll_file_operations = {
4785 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4786 # ifdef HAVE_SYNC_READ_WRITE
4787 .read = new_sync_read,
4788 .write = new_sync_write,
4790 .read_iter = ll_file_read_iter,
4791 .write_iter = ll_file_write_iter,
4792 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4793 .read = ll_file_read,
4794 .aio_read = ll_file_aio_read,
4795 .write = ll_file_write,
4796 .aio_write = ll_file_aio_write,
4797 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4798 .unlocked_ioctl = ll_file_ioctl,
4799 .open = ll_file_open,
4800 .release = ll_file_release,
4801 .mmap = ll_file_mmap,
4802 .llseek = ll_file_seek,
4803 .splice_read = ll_file_splice_read,
4808 struct file_operations ll_file_operations_flock = {
4809 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4810 # ifdef HAVE_SYNC_READ_WRITE
4811 .read = new_sync_read,
4812 .write = new_sync_write,
4813 # endif /* HAVE_SYNC_READ_WRITE */
4814 .read_iter = ll_file_read_iter,
4815 .write_iter = ll_file_write_iter,
4816 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4817 .read = ll_file_read,
4818 .aio_read = ll_file_aio_read,
4819 .write = ll_file_write,
4820 .aio_write = ll_file_aio_write,
4821 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4822 .unlocked_ioctl = ll_file_ioctl,
4823 .open = ll_file_open,
4824 .release = ll_file_release,
4825 .mmap = ll_file_mmap,
4826 .llseek = ll_file_seek,
4827 .splice_read = ll_file_splice_read,
4830 .flock = ll_file_flock,
4831 .lock = ll_file_flock
4834 /* These are for -o noflock - to return ENOSYS on flock calls */
4835 struct file_operations ll_file_operations_noflock = {
4836 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4837 # ifdef HAVE_SYNC_READ_WRITE
4838 .read = new_sync_read,
4839 .write = new_sync_write,
4840 # endif /* HAVE_SYNC_READ_WRITE */
4841 .read_iter = ll_file_read_iter,
4842 .write_iter = ll_file_write_iter,
4843 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4844 .read = ll_file_read,
4845 .aio_read = ll_file_aio_read,
4846 .write = ll_file_write,
4847 .aio_write = ll_file_aio_write,
4848 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4849 .unlocked_ioctl = ll_file_ioctl,
4850 .open = ll_file_open,
4851 .release = ll_file_release,
4852 .mmap = ll_file_mmap,
4853 .llseek = ll_file_seek,
4854 .splice_read = ll_file_splice_read,
4857 .flock = ll_file_noflock,
4858 .lock = ll_file_noflock
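/*
 * Illustrative usage (not part of the original source): which of the three
 * operation tables a client gets is selected by its mount options, e.g.
 * (the MGS NID, fsname and mount point below are placeholders):
 *
 *	mount -t lustre -o flock      mgs@tcp:/lustre /mnt/lustre
 *	mount -t lustre -o localflock mgs@tcp:/lustre /mnt/lustre
 *	mount -t lustre -o noflock    mgs@tcp:/lustre /mnt/lustre
 *
 * -o flock installs ll_file_operations_flock (cluster-coherent flock/fcntl
 * locks), -o localflock installs ll_file_operations (locally consistent
 * locks only), and -o noflock installs ll_file_operations_noflock so that
 * flock/fcntl lock calls fail with ENOSYS.
 */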
4861 struct inode_operations ll_file_inode_operations = {
4862 .setattr = ll_setattr,
4863 .getattr = ll_getattr,
4864 .permission = ll_inode_permission,
4865 #ifdef HAVE_IOP_XATTR
4866 .setxattr = ll_setxattr,
4867 .getxattr = ll_getxattr,
4868 .removexattr = ll_removexattr,
4870 .listxattr = ll_listxattr,
4871 .fiemap = ll_fiemap,
4872 #ifdef HAVE_IOP_GET_ACL
4873 .get_acl = ll_get_acl,
4875 #ifdef HAVE_IOP_SET_ACL
4876 .set_acl = ll_set_acl,
4880 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4882 struct ll_inode_info *lli = ll_i2info(inode);
4883 struct cl_object *obj = lli->lli_clob;
4892 env = cl_env_get(&refcheck);
4894 RETURN(PTR_ERR(env));
4896 rc = cl_conf_set(env, lli->lli_clob, conf);
4900 if (conf->coc_opc == OBJECT_CONF_SET) {
4901 struct ldlm_lock *lock = conf->coc_lock;
4902 struct cl_layout cl = {
4906 LASSERT(lock != NULL);
4907 LASSERT(ldlm_has_layout(lock));
4909 /* it can only be allowed to match after the layout has been
4910 * applied to the inode, otherwise a false layout would be
4911 * seen. Applying the layout should happen before dropping
4912 * the intent lock. */
4913 ldlm_lock_allow_match(lock);
4915 rc = cl_object_layout_get(env, obj, &cl);
4920 DFID": layout version change: %u -> %u\n",
4921 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4923 ll_layout_version_set(lli, cl.cl_layout_gen);
4927 cl_env_put(env, &refcheck);
4932 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4933 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4936 struct ll_sb_info *sbi = ll_i2sbi(inode);
4937 struct ptlrpc_request *req;
4944 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4945 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4946 lock->l_lvb_data, lock->l_lvb_len);
4948 if (lock->l_lvb_data != NULL)
4951 /* if the layout lock was granted right away, the layout is returned
4952 * within the DLM LVB of the dlm reply; otherwise, if the lock was ever
4953 * blocked and then granted via a completion AST, we have to fetch the
4954 * layout here. Note that we can't use the LVB buffer in the completion
4955 * AST because it is not large enough. */
4956 rc = ll_get_default_mdsize(sbi, &lmmsize);
4960 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
4961 XATTR_NAME_LOV, lmmsize, &req);
4964 GOTO(out, rc = 0); /* empty layout */
4971 if (lmmsize == 0) /* empty layout */
4974 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4976 GOTO(out, rc = -EFAULT);
4978 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4979 if (lvbdata == NULL)
4980 GOTO(out, rc = -ENOMEM);
4982 memcpy(lvbdata, lmm, lmmsize);
4983 lock_res_and_lock(lock);
4984 if (unlikely(lock->l_lvb_data == NULL)) {
4985 lock->l_lvb_type = LVB_T_LAYOUT;
4986 lock->l_lvb_data = lvbdata;
4987 lock->l_lvb_len = lmmsize;
4990 unlock_res_and_lock(lock);
4993 OBD_FREE_LARGE(lvbdata, lmmsize);
4998 ptlrpc_req_finished(req);
5003 * Apply the layout to the inode. Layout lock is held and will be released
5006 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5007 struct inode *inode)
5009 struct ll_inode_info *lli = ll_i2info(inode);
5010 struct ll_sb_info *sbi = ll_i2sbi(inode);
5011 struct ldlm_lock *lock;
5012 struct cl_object_conf conf;
5015 bool wait_layout = false;
5018 LASSERT(lustre_handle_is_used(lockh));
5020 lock = ldlm_handle2lock(lockh);
5021 LASSERT(lock != NULL);
5022 LASSERT(ldlm_has_layout(lock));
5024 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5025 PFID(&lli->lli_fid), inode);
5027 /* in case this is a cached lock, reinstate it with the new inode */
5028 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5030 lock_res_and_lock(lock);
5031 lvb_ready = ldlm_is_lvb_ready(lock);
5032 unlock_res_and_lock(lock);
5034 /* checking lvb_ready is racy but this is okay. The worst case is
5035 * that multiple processes may configure the file at the same time. */
5039 rc = ll_layout_fetch(inode, lock);
5043 /* for a layout lock, the lmm is stored in the lock's lvb.
5044 * lvb_data is immutable while the lock is held, so it is safe to access it.
5047 * Set the layout on the file. This is unlikely to fail, as the old layout
5048 * has surely been eliminated. */
5049 memset(&conf, 0, sizeof conf);
5050 conf.coc_opc = OBJECT_CONF_SET;
5051 conf.coc_inode = inode;
5052 conf.coc_lock = lock;
5053 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5054 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5055 rc = ll_layout_conf(inode, &conf);
5057 /* refresh layout failed, need to wait */
5058 wait_layout = rc == -EBUSY;
5061 LDLM_LOCK_PUT(lock);
5062 ldlm_lock_decref(lockh, mode);
5064 /* wait for in-flight IO to complete if the old layout is still in use. */
5066 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5067 ll_get_fsname(inode->i_sb, NULL, 0),
5068 PFID(&lli->lli_fid), inode);
5070 memset(&conf, 0, sizeof conf);
5071 conf.coc_opc = OBJECT_CONF_WAIT;
5072 conf.coc_inode = inode;
5073 rc = ll_layout_conf(inode, &conf);
5077 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5078 ll_get_fsname(inode->i_sb, NULL, 0),
5079 PFID(&lli->lli_fid), rc);
5085 * Issue layout intent RPC to MDS.
5086 * \param inode [in] file inode
5087 * \param intent [in] layout intent
5089 * \retval 0 on success
5090 * \retval < 0 error code
5092 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5094 struct ll_inode_info *lli = ll_i2info(inode);
5095 struct ll_sb_info *sbi = ll_i2sbi(inode);
5096 struct md_op_data *op_data;
5097 struct lookup_intent it;
5098 struct ptlrpc_request *req;
5102 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5103 0, 0, LUSTRE_OPC_ANY, NULL);
5104 if (IS_ERR(op_data))
5105 RETURN(PTR_ERR(op_data));
5107 op_data->op_data = intent;
5108 op_data->op_data_size = sizeof(*intent);
5110 memset(&it, 0, sizeof(it));
5111 it.it_op = IT_LAYOUT;
5112 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5113 intent->li_opc == LAYOUT_INTENT_TRUNC)
5114 it.it_flags = FMODE_WRITE;
5116 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5117 ll_get_fsname(inode->i_sb, NULL, 0),
5118 PFID(&lli->lli_fid), inode);
5120 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5121 &ll_md_blocking_ast, 0);
5122 if (it.it_request != NULL)
5123 ptlrpc_req_finished(it.it_request);
5124 it.it_request = NULL;
5126 ll_finish_md_op_data(op_data);
5128 /* set lock data in case this is a new lock */
5130 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5132 ll_intent_drop_lock(&it);
5138 * This function checks if there exists a LAYOUT lock on the client side,
5139 * or enqueues it if the client doesn't have one cached.
5141 * This function does not hold the layout lock, so the lock may be revoked at
5142 * any time after this function returns; any operation that depends on the layout should then be redone.
5145 * This function should be called before lov_io_init() to get an up-to-date
5146 * layout version; the caller should save the version number and, after the IO
5147 * has finished, call this function again to verify that the layout was not
5148 * changed while the IO was in flight.
5150 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5152 struct ll_inode_info *lli = ll_i2info(inode);
5153 struct ll_sb_info *sbi = ll_i2sbi(inode);
5154 struct lustre_handle lockh;
5155 struct layout_intent intent = {
5156 .li_opc = LAYOUT_INTENT_ACCESS,
5158 enum ldlm_mode mode;
5162 *gen = ll_layout_version_get(lli);
5163 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5167 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5168 LASSERT(S_ISREG(inode->i_mode));
5170 /* take layout lock mutex to enqueue layout lock exclusively. */
5171 mutex_lock(&lli->lli_layout_mutex);
5174 /* the layout lock is usually cached on the local side, so try to
5175 * match it before grabbing the layout lock mutex. */
5176 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5177 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5178 if (mode != 0) { /* hit cached lock */
5179 rc = ll_layout_lock_set(&lockh, mode, inode);
5185 rc = ll_layout_intent(inode, &intent);
5191 *gen = ll_layout_version_get(lli);
5192 mutex_unlock(&lli->lli_layout_mutex);
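/*
 * Illustrative caller sketch for the usage described above
 * ll_layout_refresh() (the -ESTALE return on a generation mismatch is only
 * an example policy, not what any particular caller does):
 *
 *	__u32 gen_before, gen_after;
 *	int rc;
 *
 *	rc = ll_layout_refresh(inode, &gen_before);
 *	if (rc != 0)
 *		return rc;
 *
 *	// ... set up and run the IO against this layout version ...
 *
 *	rc = ll_layout_refresh(inode, &gen_after);
 *	if (rc == 0 && gen_after != gen_before)
 *		rc = -ESTALE;	// layout changed during the IO: redo it
 *	return rc;
 */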
5198 * Issue a layout intent RPC indicating where in a file an IO is about to write.
5200 * \param[in] inode file inode.
5201 * \param[in] ext write range, with the start offset into the file in bytes
5202 * where an IO is about to write, and the exclusive end offset in bytes.
5205 * \retval 0 on success
5206 * \retval < 0 error code
5208 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5209 struct lu_extent *ext)
5211 struct layout_intent intent = {
5213 .li_extent.e_start = ext->e_start,
5214 .li_extent.e_end = ext->e_end,
5219 rc = ll_layout_intent(inode, &intent);
5225 * This function sends a restore request to the MDT.
5227 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5229 struct hsm_user_request *hur;
5233 len = sizeof(struct hsm_user_request) +
5234 sizeof(struct hsm_user_item);
5235 OBD_ALLOC(hur, len);
5239 hur->hur_request.hr_action = HUA_RESTORE;
5240 hur->hur_request.hr_archive_id = 0;
5241 hur->hur_request.hr_flags = 0;
5242 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5243 sizeof(hur->hur_user_item[0].hui_fid));
5244 hur->hur_user_item[0].hui_extent.offset = offset;
5245 hur->hur_user_item[0].hui_extent.length = length;
5246 hur->hur_request.hr_itemcount = 1;
5247 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,