/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; if not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2017, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * Author: Peter Braam <braam@clusterfs.com>
 * Author: Phil Schwan <phil@clusterfs.com>
 * Author: Andreas Dilger <adilger@clusterfs.com>
 */
#define DEBUG_SUBSYSTEM S_LLITE
#include <lustre_dlm.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/sched.h>
#include <linux/user_namespace.h>
#ifdef HAVE_UIDGID_HEADER
# include <linux/uidgid.h>
#endif

#include <uapi/linux/lustre/lustre_ioctl.h>
#include <lustre_swab.h>

#include "cl_object.h"
#include "llite_internal.h"
#include "vvp_internal.h"
struct split_param {
	struct inode	*sp_inode;
	__u16		 sp_mirror_id;
};

static int
ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);

static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
			  bool *lease_broken);
static struct ll_file_data *ll_file_data_get(void)
{
	struct ll_file_data *fd;

	OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
	if (fd == NULL)
		return NULL;

	fd->fd_write_failed = false;

	return fd;
}

static void ll_file_data_put(struct ll_file_data *fd)
{
	if (fd != NULL)
		OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
}
/**
 * Packs all the attributes into @op_data for the CLOSE RPC.
 */
static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
			     struct obd_client_handle *och)
{
	ENTRY;

	ll_prep_md_op_data(op_data, inode, NULL, NULL,
			   0, 0, LUSTRE_OPC_ANY, NULL);

	op_data->op_attr.ia_mode = inode->i_mode;
	op_data->op_attr.ia_atime = inode->i_atime;
	op_data->op_attr.ia_mtime = inode->i_mtime;
	op_data->op_attr.ia_ctime = inode->i_ctime;
	op_data->op_attr.ia_size = i_size_read(inode);
	op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
				      ATTR_MTIME | ATTR_MTIME_SET |
				      ATTR_CTIME);
	op_data->op_xvalid |= OP_XVALID_CTIME_SET;
	op_data->op_attr_blocks = inode->i_blocks;
	op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
	if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
		op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
	op_data->op_open_handle = och->och_open_handle;

	if (och->och_flags & FMODE_WRITE &&
	    ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
		/* For HSM: if inode data has been modified, pack it so that
		 * the MDT can set the data-dirty flag in the archive. */
		op_data->op_bias |= MDS_DATA_MODIFIED;

	EXIT;
}
/**
 * Perform a close, possibly with a bias.
 * The meaning of \a data depends on the value of \a bias.
 *
 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
 * swap layouts with.
 */
static int ll_close_inode_openhandle(struct inode *inode,
				     struct obd_client_handle *och,
				     enum mds_op_bias bias, void *data)
{
	struct obd_export *md_exp = ll_i2mdexp(inode);
	const struct ll_inode_info *lli = ll_i2info(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	int rc;
	ENTRY;

	if (class_exp2obd(md_exp) == NULL) {
		CERROR("%s: invalid MDC connection handle closing "DFID"\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&lli->lli_fid));
		GOTO(out, rc = 0);
	}

	OBD_ALLOC_PTR(op_data);
	/* We leak openhandle and request here on error, but not much to be
	 * done in an OOM case since the app won't retry the close on error
	 * either. */
	if (op_data == NULL)
		GOTO(out, rc = -ENOMEM);

	ll_prepare_close(inode, op_data, och);
	switch (bias) {
	case MDS_CLOSE_LAYOUT_MERGE:
		/* merge blocks from the victim inode */
		op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
		op_data->op_attr.ia_valid |= ATTR_SIZE;
		op_data->op_xvalid |= OP_XVALID_BLOCKS;
		/* fallthrough */
	case MDS_CLOSE_LAYOUT_SPLIT:
	case MDS_CLOSE_LAYOUT_SWAP: {
		struct split_param *sp = data;

		LASSERT(data != NULL);
		op_data->op_bias |= bias;
		op_data->op_data_version = 0;
		op_data->op_lease_handle = och->och_lease_handle;
		if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
			op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
			op_data->op_mirror_id = sp->sp_mirror_id;
		} else {
			op_data->op_fid2 = *ll_inode2fid(data);
		}
		break;
	}

	case MDS_CLOSE_RESYNC_DONE: {
		struct ll_ioc_lease *ioc = data;

		LASSERT(data != NULL);
		op_data->op_attr_blocks +=
			ioc->lil_count * op_data->op_attr_blocks;
		op_data->op_attr.ia_valid |= ATTR_SIZE;
		op_data->op_xvalid |= OP_XVALID_BLOCKS;
		op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;

		op_data->op_lease_handle = och->och_lease_handle;
		op_data->op_data = &ioc->lil_ids[0];
		op_data->op_data_size =
			ioc->lil_count * sizeof(ioc->lil_ids[0]);
		break;
	}

	case MDS_HSM_RELEASE:
		LASSERT(data != NULL);
		op_data->op_bias |= MDS_HSM_RELEASE;
		op_data->op_data_version = *(__u64 *)data;
		op_data->op_lease_handle = och->och_lease_handle;
		op_data->op_attr.ia_valid |= ATTR_SIZE;
		op_data->op_xvalid |= OP_XVALID_BLOCKS;
		break;

	default:
		LASSERT(data == NULL);
		break;
	}

	if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
		op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
	if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
		op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;

	rc = md_close(md_exp, op_data, och->och_mod, &req);
	if (rc != 0 && rc != -EINTR)
		CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
		       md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);

	if (rc == 0 && op_data->op_bias & bias) {
		struct mdt_body *body;

		body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
		if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
			rc = -EBUSY;
	}

	ll_finish_md_op_data(op_data);
	EXIT;
out:

	md_clear_open_replay_data(md_exp, och);
	och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
	OBD_FREE_PTR(och);

	ptlrpc_req_finished(req);	/* This is close request */
	return rc;
}
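/*
 * A hedged sketch of how callers in this file drive the bias paths above
 * (illustrative only, not an additional export): the HSM release path
 * passes a pointer to the data version, while a layout swap passes the
 * victim inode:
 *
 *	__u64 data_version;	// obtained via ll_data_version()
 *	rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
 *				       &data_version);
 *	// or, to swap layouts with inode2:
 *	rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
 *				       inode2);
 *
 * In every case the handle is consumed: och is freed on the way out, so
 * callers must not touch it afterwards.
 */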
int ll_md_real_close(struct inode *inode, fmode_t fmode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct obd_client_handle **och_p;
	struct obd_client_handle *och;
	__u64 *och_usecount;
	int rc = 0;
	ENTRY;

	if (fmode & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (fmode & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
	} else {
		LASSERT(fmode & FMODE_READ);
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	mutex_lock(&lli->lli_och_mutex);
	if (*och_usecount > 0) {
		/* There are still users of this handle, so skip
		 * freeing it. */
		mutex_unlock(&lli->lli_och_mutex);
		RETURN(0);
	}

	och = *och_p;
	*och_p = NULL;
	mutex_unlock(&lli->lli_och_mutex);

	if (och != NULL) {
		/* There might be a race and this handle may already
		 * be closed. */
		rc = ll_close_inode_openhandle(inode, och, 0, NULL);
	}

	RETURN(rc);
}
static int ll_md_close(struct inode *inode, struct file *file)
{
	union ldlm_policy_data policy = {
		.l_inodebits	= { MDS_INODELOCK_OPEN },
	};
	__u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lustre_handle lockh;
	enum ldlm_mode lockmode;
	int rc = 0;
	ENTRY;

	/* clear group lock, if present */
	if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
		ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);

	if (fd->fd_lease_och != NULL) {
		bool lease_broken;

		/* Usually the lease is not released when the
		 * application crashed, so we need to release it here. */
		rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
		CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
		       PFID(&lli->lli_fid), rc, lease_broken);

		fd->fd_lease_och = NULL;
	}

	if (fd->fd_och != NULL) {
		rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
		fd->fd_och = NULL;
		GOTO(out, rc);
	}

	/* Let's see if we have a good enough OPEN lock on the file and if
	 * we can skip talking to the MDS */
	mutex_lock(&lli->lli_och_mutex);
	if (fd->fd_omode & FMODE_WRITE) {
		lockmode = LCK_CW;
		LASSERT(lli->lli_open_fd_write_count);
		lli->lli_open_fd_write_count--;
	} else if (fd->fd_omode & FMODE_EXEC) {
		lockmode = LCK_PR;
		LASSERT(lli->lli_open_fd_exec_count);
		lli->lli_open_fd_exec_count--;
	} else {
		lockmode = LCK_CR;
		LASSERT(lli->lli_open_fd_read_count);
		lli->lli_open_fd_read_count--;
	}
	mutex_unlock(&lli->lli_och_mutex);

	if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
			   LDLM_IBITS, &policy, lockmode, &lockh))
		rc = ll_md_real_close(inode, fd->fd_omode);

out:
	LUSTRE_FPRIVATE(file) = NULL;
	ll_file_data_put(fd);

	RETURN(rc);
}
/* While this returns an error code, the caller (fput()) does not check it,
 * so we need to make every effort to clean up all of our state here. Also,
 * applications rarely check close errors, and even if an error is returned
 * they will not retry the close call.
 */
int ll_file_release(struct inode *inode, struct file *file)
{
	struct ll_file_data *fd;
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ll_inode_info *lli = ll_i2info(inode);
	int rc;
	ENTRY;

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
	       PFID(ll_inode2fid(inode)), inode);

	if (inode->i_sb->s_root != file_dentry(file))
		ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
	fd = LUSTRE_FPRIVATE(file);
	LASSERT(fd != NULL);

	/* This is the last reference on @file, but it may not come from the
	 * pid that authorized statahead, because parent and child processes
	 * can share the same file handle. */
	if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
		ll_deauthorize_statahead(inode, fd);

	if (inode->i_sb->s_root == file_dentry(file)) {
		LUSTRE_FPRIVATE(file) = NULL;
		ll_file_data_put(fd);
		RETURN(0);
	}

	if (!S_ISDIR(inode->i_mode)) {
		if (lli->lli_clob != NULL)
			lov_read_and_clear_async_rc(lli->lli_clob);
		lli->lli_async_rc = 0;
	}

	rc = ll_md_close(inode, file);

	if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
		libcfs_debug_dumplog();

	RETURN(rc);
}
static inline int ll_dom_readpage(void *data, struct page *page)
{
	struct niobuf_local *lnb = data;
	void *kaddr;

	kaddr = ll_kmap_atomic(page, KM_USER0);
	memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
	if (lnb->lnb_len < PAGE_SIZE)
		memset(kaddr + lnb->lnb_len, 0,
		       PAGE_SIZE - lnb->lnb_len);
	flush_dcache_page(page);
	SetPageUptodate(page);
	ll_kunmap_atomic(kaddr, KM_USER0);
	unlock_page(page);

	return 0;
}
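/*
 * Note: as a filler callback for read_cache_page(), ll_dom_readpage() must
 * leave the page Uptodate and unlocked on success; read_cache_page() itself
 * allocates the page and inserts it into the mapping. A minimal sketch of
 * the calling convention, assuming a hypothetical buffer "buf" of "len"
 * bytes:
 *
 *	struct niobuf_local lnb = { .lnb_data = buf, .lnb_len = len };
 *	struct page *vmpage = read_cache_page(mapping, index,
 *					      ll_dom_readpage, &lnb);
 *	if (!IS_ERR(vmpage))
 *		put_page(vmpage);
 */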
void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
			struct lookup_intent *it)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct address_space *mapping = inode->i_mapping;
	struct page *vmpage;
	struct niobuf_remote *rnb;
	char *data;
	struct lustre_handle lockh;
	struct ldlm_lock *lock;
	unsigned long index, start;
	struct niobuf_local lnb;
	bool dom_lock = false;

	ENTRY;

	if (obj == NULL)
		RETURN_EXIT;

	if (it->it_lock_mode != 0) {
		lockh.cookie = it->it_lock_handle;
		lock = ldlm_handle2lock(&lockh);
		if (lock != NULL) {
			dom_lock = ldlm_has_dom(lock);
			LDLM_LOCK_PUT(lock);
		}
	}
	if (!dom_lock)
		RETURN_EXIT;

	if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
				   RCL_SERVER))
		RETURN_EXIT;

	rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
	if (rnb == NULL || rnb->rnb_len == 0)
		RETURN_EXIT;

	/* LU-11595: the server may return the whole file (always OK) or just
	 * the file tail, whose offset must be aligned with the client
	 * PAGE_SIZE to be usable on this client. If the server's PAGE_SIZE is
	 * smaller, the offset may not be aligned and that data is just
	 * ignored.
	 */
	if (rnb->rnb_offset % PAGE_SIZE)
		RETURN_EXIT;

	/* The server returns the whole file or just the file tail if it fits
	 * in the reply buffer; in both cases the total size should be the
	 * inode size.
	 */
	if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
		CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
		       ll_get_fsname(inode->i_sb, NULL, 0), rnb->rnb_offset,
		       rnb->rnb_len, i_size_read(inode));
		RETURN_EXIT;
	}

	CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
	       rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));

	data = (char *)rnb + sizeof(*rnb);

	lnb.lnb_file_offset = rnb->rnb_offset;
	start = lnb.lnb_file_offset / PAGE_SIZE;
	index = 0;
	LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
	lnb.lnb_page_offset = 0;
	do {
		lnb.lnb_data = data + (index << PAGE_SHIFT);
		lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
		if (lnb.lnb_len > PAGE_SIZE)
			lnb.lnb_len = PAGE_SIZE;

		vmpage = read_cache_page(mapping, index + start,
					 ll_dom_readpage, &lnb);
		if (IS_ERR(vmpage)) {
			CWARN("%s: cannot fill page %lu for "DFID
			      " with data: rc = %li\n",
			      ll_get_fsname(inode->i_sb, NULL, 0),
			      index + start, PFID(lu_object_fid(&obj->co_lu)),
			      PTR_ERR(vmpage));
			break;
		}
		put_page(vmpage);
		index++;
	} while (rnb->rnb_len > (index << PAGE_SHIFT));

	EXIT;
}
static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
			       struct lookup_intent *itp)
{
	struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
	struct dentry *parent = de->d_parent;
	const char *name = NULL;
	int len = 0;
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	int rc;
	ENTRY;

	LASSERT(parent != NULL);
	LASSERT(itp->it_flags & MDS_OPEN_BY_FID);

	/* if the server supports open-by-fid, or the file name is invalid,
	 * don't pack the name in the open request */
	if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
	    lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
		name = de->d_name.name;
		len = de->d_name.len;
	}

	op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
				     name, len, 0, LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));
	op_data->op_data = lmm;
	op_data->op_data_size = lmmsize;

	rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
			    &ll_md_blocking_ast, 0);
	ll_finish_md_op_data(op_data);
	if (rc == -ESTALE) {
		/* keep our own exit path here so as not to flood the log
		 * with -ESTALE errors.
		 */
		if (!it_disposition(itp, DISP_OPEN_OPEN) ||
		    it_open_error(DISP_OPEN_OPEN, itp))
			GOTO(out, rc);
		ll_release_openhandle(de, itp);
		GOTO(out, rc);
	}

	if (it_disposition(itp, DISP_LOOKUP_NEG))
		GOTO(out, rc = -ENOENT);

	if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
		rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
		CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
		GOTO(out, rc);
	}

	rc = ll_prep_inode(&de->d_inode, req, NULL, itp);

	if (!rc && itp->it_lock_mode) {
		ll_dom_finish_open(de->d_inode, req, itp);
		ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
	}

out:
	ptlrpc_req_finished(req);
	ll_intent_drop_lock(itp);

	/* We did open by fid, but by the time we got to the server,
	 * the object disappeared. If this is a create, we cannot really
	 * tell the userspace that the file it was trying to create
	 * does not exist. Instead let's return -ESTALE, and the VFS will
	 * retry the create with LOOKUP_REVAL that we are going to catch
	 * in ll_revalidate_dentry() and use lookup then.
	 */
	if (rc == -ENOENT && itp->it_op & IT_CREAT)
		rc = -ESTALE;

	RETURN(rc);
}
static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
		       struct obd_client_handle *och)
{
	struct mdt_body *body;

	body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
	och->och_open_handle = body->mbo_open_handle;
	och->och_fid = body->mbo_fid1;
	och->och_lease_handle.cookie = it->it_lock_handle;
	och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
	och->och_flags = it->it_flags;

	return md_set_open_replay_data(md_exp, och, it);
}
static int ll_local_open(struct file *file, struct lookup_intent *it,
			 struct ll_file_data *fd, struct obd_client_handle *och)
{
	struct inode *inode = file_inode(file);
	ENTRY;

	LASSERT(!LUSTRE_FPRIVATE(file));

	LASSERT(fd != NULL);

	if (och) {
		int rc;

		rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
		if (rc != 0)
			RETURN(rc);
	}

	LUSTRE_FPRIVATE(file) = fd;
	ll_readahead_init(inode, &fd->fd_ras);
	fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);

	/* ll_cl_context initialize */
	rwlock_init(&fd->fd_lock);
	INIT_LIST_HEAD(&fd->fd_lccs);

	RETURN(0);
}
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until the ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing a lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used. We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
 */
int ll_file_open(struct inode *inode, struct file *file)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct lookup_intent *it, oit = { .it_op = IT_OPEN,
					  .it_flags = file->f_flags };
	struct obd_client_handle **och_p = NULL;
	__u64 *och_usecount = NULL;
	struct ll_file_data *fd;
	int rc = 0;
	ENTRY;

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
	       PFID(ll_inode2fid(inode)), inode, file->f_flags);

	it = file->private_data; /* XXX: compat macro */
	file->private_data = NULL; /* prevent ll_local_open assertion */

	fd = ll_file_data_get();
	if (fd == NULL)
		GOTO(out_nofiledata, rc = -ENOMEM);

	fd->fd_file = file;
	if (S_ISDIR(inode->i_mode))
		ll_authorize_statahead(inode, fd);

	if (inode->i_sb->s_root == file_dentry(file)) {
		LUSTRE_FPRIVATE(file) = fd;
		RETURN(0);
	}

	if (!it || !it->it_disposition) {
		/* Convert f_flags into access mode. We cannot use file->f_mode,
		 * because everything but the O_ACCMODE mask was stripped from
		 * there */
		if ((oit.it_flags + 1) & O_ACCMODE)
			oit.it_flags++;
		if (file->f_flags & O_TRUNC)
			oit.it_flags |= FMODE_WRITE;

		/* The kernel only calls f_op->open in dentry_open; filp_open
		 * calls dentry_open after a call to open_namei that checks
		 * permissions. Only nfsd_open calls dentry_open directly
		 * without checking permissions, and because of that the code
		 * below is safe.
		 */
		if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
			oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

		/* We do not want O_EXCL here, presumably we opened the file
		 * already? XXX - NFS implications? */
		oit.it_flags &= ~O_EXCL;

		/* bug20584, if "it_flags" contains O_CREAT, the file will be
		 * created if necessary, so "IT_CREAT" should be set to keep
		 * consistent with it */
		if (oit.it_flags & O_CREAT)
			oit.it_op |= IT_CREAT;

		it = &oit;
	}

restart:
	/* Let's see if we have the file open on the MDS already. */
	if (it->it_flags & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else if (it->it_flags & FMODE_EXEC) {
		och_p = &lli->lli_mds_exec_och;
		och_usecount = &lli->lli_open_fd_exec_count;
	} else {
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	mutex_lock(&lli->lli_och_mutex);
	if (*och_p) { /* Open handle is present */
		if (it_disposition(it, DISP_OPEN_OPEN)) {
			/* Well, there's an extra open request that we do not
			 * need, let's close it somehow. This will decref the
			 * request. */
			rc = it_open_error(DISP_OPEN_OPEN, it);
			if (rc) {
				mutex_unlock(&lli->lli_och_mutex);
				GOTO(out_openerr, rc);
			}

			ll_release_openhandle(file_dentry(file), it);
		}
		(*och_usecount)++;

		rc = ll_local_open(file, it, fd, NULL);
		if (rc) {
			(*och_usecount)--;
			mutex_unlock(&lli->lli_och_mutex);
			GOTO(out_openerr, rc);
		}
	} else {
		LASSERT(*och_usecount == 0);
		if (!it->it_disposition) {
			struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
			/* We cannot just request the lock handle now, new ELC
			 * code means that one of the other OPEN locks for this
			 * file could be cancelled, and since the blocking ast
			 * handler would attempt to grab och_mutex as well,
			 * that would result in a deadlock
			 */
			mutex_unlock(&lli->lli_och_mutex);
			/*
			 * Normally called under two situations:
			 * 1. NFS export.
			 * 2. A race/condition on MDS resulting in no open
			 *    handle to be returned from LOOKUP|OPEN request,
			 *    for example if the target entry was a symlink.
			 *
			 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
			 * marked by a bit set in ll_iget_for_nfs. Clear the
			 * bit so that it's not confusing later callers.
			 *
			 * NB: when ldd is NULL, it must have come via the
			 * normal lookup path only, since ll_iget_for_nfs
			 * always calls ll_d_init().
			 */
			if (ldd && ldd->lld_nfs_dentry) {
				ldd->lld_nfs_dentry = 0;
				it->it_flags |= MDS_OPEN_LOCK;
			}

			/*
			 * Always specify MDS_OPEN_BY_FID because we don't want
			 * to get a file with a different fid.
			 */
			it->it_flags |= MDS_OPEN_BY_FID;
			rc = ll_intent_file_open(file_dentry(file), NULL, 0,
						 it);
			if (rc)
				GOTO(out_openerr, rc);

			goto restart;
		}
		OBD_ALLOC(*och_p, sizeof(struct obd_client_handle));
		if (!*och_p)
			GOTO(out_och_free, rc = -ENOMEM);

		(*och_usecount)++;

		/* md_intent_lock() didn't get a request ref if there was an
		 * open error, so don't do cleanup on the request here
		 */
		/* XXX (green): Should not we bail out on any error here, not
		 * just open error? */
		rc = it_open_error(DISP_OPEN_OPEN, it);
		if (rc != 0)
			GOTO(out_och_free, rc);

		LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
			 "inode %p: disposition %x, status %d\n", inode,
			 it_disposition(it, ~0), it->it_status);

		rc = ll_local_open(file, it, fd, *och_p);
		if (rc)
			GOTO(out_och_free, rc);
	}
	mutex_unlock(&lli->lli_och_mutex);
	fd = NULL;

	/* Must do this outside lli_och_mutex lock to prevent deadlock where
	 * a different kind of OPEN lock for this same inode gets cancelled
	 * by ldlm_cancel_lru */
	if (!S_ISREG(inode->i_mode))
		GOTO(out_och_free, rc);

	cl_lov_delay_create_clear(&file->f_flags);
	GOTO(out_och_free, rc);

out_och_free:
	if (rc) {
		if (och_p && *och_p) {
			OBD_FREE(*och_p, sizeof(struct obd_client_handle));
			*och_p = NULL; /* OBD_FREE writes some magic there */
			(*och_usecount)--;
		}
		mutex_unlock(&lli->lli_och_mutex);

out_openerr:
		if (lli->lli_opendir_key == fd)
			ll_deauthorize_statahead(inode, fd);
		if (fd != NULL)
			ll_file_data_put(fd);
	} else {
		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
	}

out_nofiledata:
	if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
		ptlrpc_req_finished(it->it_request);
		it_clear_disposition(it, DISP_ENQ_OPEN_REF);
	}

	RETURN(rc);
}
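/*
 * Illustration only (userspace side, not part of this module): the
 * O_LOV_DELAY_CREATE path described above is typically driven by opening
 * the file with the delay-create flag and then issuing the setstripe ioctl
 * before the first write. A minimal hedged sketch, assuming the
 * LL_IOC_LOV_SETSTRIPE ioctl and struct lov_user_md from lustre_user.h:
 *
 *	struct lov_user_md lum = {
 *		.lmm_magic = LOV_USER_MAGIC_V1,
 *		.lmm_stripe_count = 4,		// stripe over 4 OSTs
 *	};
 *	int fd = open(path, O_CREAT | O_WRONLY | O_LOV_DELAY_CREATE, 0644);
 *	if (fd >= 0 && ioctl(fd, LL_IOC_LOV_SETSTRIPE, &lum) < 0)
 *		perror("setstripe");	// OST objects are created here
 */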
static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
				    struct ldlm_lock_desc *desc,
				    void *data, int flag)
{
	int rc;
	struct lustre_handle lockh;
	ENTRY;

	switch (flag) {
	case LDLM_CB_BLOCKING:
		ldlm_lock2handle(lock, &lockh);
		rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
		if (rc < 0) {
			CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
			RETURN(rc);
		}
		break;
	case LDLM_CB_CANCELING:
		/* do nothing */
		break;
	}
	RETURN(0);
}
/**
 * When setting a lease on a file, we take ownership of the lli_mds_*_och
 * and save it as fd->fd_och so as to force the client to reopen the file even
 * if it has an open lock in cache already.
 */
static int ll_lease_och_acquire(struct inode *inode, struct file *file,
				struct lustre_handle *old_open_handle)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct obd_client_handle **och_p;
	__u64 *och_usecount;
	int rc = 0;
	ENTRY;

	/* Get the open handle of the file */
	mutex_lock(&lli->lli_och_mutex);
	if (fd->fd_lease_och != NULL)
		GOTO(out_unlock, rc = -EBUSY);

	if (fd->fd_och == NULL) {
		if (file->f_mode & FMODE_WRITE) {
			LASSERT(lli->lli_mds_write_och != NULL);
			och_p = &lli->lli_mds_write_och;
			och_usecount = &lli->lli_open_fd_write_count;
		} else {
			LASSERT(lli->lli_mds_read_och != NULL);
			och_p = &lli->lli_mds_read_och;
			och_usecount = &lli->lli_open_fd_read_count;
		}

		if (*och_usecount > 1)
			GOTO(out_unlock, rc = -EBUSY);

		fd->fd_och = *och_p;
		*och_p = NULL;
		*och_usecount = 0;
	}

	*old_open_handle = fd->fd_och->och_open_handle;

	EXIT;
out_unlock:
	mutex_unlock(&lli->lli_och_mutex);
	return rc;
}
/**
 * Release ownership on lli_mds_*_och when putting back a file lease.
 */
static int ll_lease_och_release(struct inode *inode, struct file *file)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct obd_client_handle **och_p;
	struct obd_client_handle *old_och = NULL;
	__u64 *och_usecount;
	int rc = 0;
	ENTRY;

	mutex_lock(&lli->lli_och_mutex);
	if (file->f_mode & FMODE_WRITE) {
		och_p = &lli->lli_mds_write_och;
		och_usecount = &lli->lli_open_fd_write_count;
	} else {
		och_p = &lli->lli_mds_read_och;
		och_usecount = &lli->lli_open_fd_read_count;
	}

	/* The file may have been opened by another process (broken lease), so
	 * *och_p is not NULL. In this case we should simply increase the
	 * usecount and close fd_och.
	 */
	if (*och_p != NULL) {
		old_och = fd->fd_och;
		(*och_usecount)++;
	} else {
		*och_p = fd->fd_och;
		*och_usecount = 1;
	}
	fd->fd_och = NULL;
	mutex_unlock(&lli->lli_och_mutex);

	if (old_och != NULL)
		rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);

	RETURN(rc);
}
/**
 * Acquire a lease and open the file.
 */
static struct obd_client_handle *
ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
	      __u64 open_flags)
{
	struct lookup_intent it = { .it_op = IT_OPEN };
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct ptlrpc_request *req = NULL;
	struct lustre_handle old_open_handle = { 0 };
	struct obd_client_handle *och = NULL;
	int rc;
	int rc2;
	ENTRY;

	if (fmode != FMODE_WRITE && fmode != FMODE_READ)
		RETURN(ERR_PTR(-EINVAL));

	if (file != NULL) {
		if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
			RETURN(ERR_PTR(-EPERM));

		rc = ll_lease_och_acquire(inode, file, &old_open_handle);
		if (rc)
			RETURN(ERR_PTR(rc));
	}

	OBD_ALLOC_PTR(och);
	if (och == NULL)
		RETURN(ERR_PTR(-ENOMEM));

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		GOTO(out, rc = PTR_ERR(op_data));

	/* To tell the MDT this openhandle is from the same owner */
	op_data->op_open_handle = old_open_handle;

	it.it_flags = fmode | open_flags;
	it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
			    &ll_md_blocking_lease_ast,
	/* LDLM_FL_NO_LRU: To not put the lease lock into the LRU list,
	 * otherwise it can be cancelled, which may mislead applications into
	 * thinking the lease is broken;
	 * LDLM_FL_EXCL: Set this flag so that it won't be matched by a normal
	 * open in ll_md_blocking_ast(). Otherwise, as
	 * ll_md_blocking_lease_ast doesn't deal with the openhandle, the
	 * normal openhandle would be leaked. */
			    LDLM_FL_NO_LRU | LDLM_FL_EXCL);
	ll_finish_md_op_data(op_data);
	ptlrpc_req_finished(req);
	if (rc < 0)
		GOTO(out_release_it, rc);

	if (it_disposition(&it, DISP_LOOKUP_NEG))
		GOTO(out_release_it, rc = -ENOENT);

	rc = it_open_error(DISP_OPEN_OPEN, &it);
	if (rc)
		GOTO(out_release_it, rc);

	LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
	ll_och_fill(sbi->ll_md_exp, &it, och);

	if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
		GOTO(out_close, rc = -EOPNOTSUPP);

	/* lease already acquired, handle the lease lock */
	ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
	if (it.it_lock_mode == 0 ||
	    it.it_lock_bits != MDS_INODELOCK_OPEN) {
		/* open lock must be returned for a lease */
		CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
		       PFID(ll_inode2fid(inode)), it.it_lock_mode,
		       it.it_lock_bits);
		GOTO(out_close, rc = -EPROTO);
	}

	ll_intent_release(&it);
	RETURN(och);

out_close:
	/* Cancel open lock */
	if (it.it_lock_mode != 0) {
		ldlm_lock_decref_and_cancel(&och->och_lease_handle,
					    it.it_lock_mode);
		it.it_lock_mode = 0;
		och->och_lease_handle.cookie = 0ULL;
	}
	rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
	if (rc2 < 0)
		CERROR("%s: error closing file "DFID": %d\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&ll_i2info(inode)->lli_fid), rc2);
	och = NULL; /* och has been freed in ll_close_inode_openhandle() */
out_release_it:
	ll_intent_release(&it);
out:
	if (och != NULL)
		OBD_FREE_PTR(och);
	RETURN(ERR_PTR(rc));
}
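/*
 * A hedged sketch of the intended pairing (illustrative caller, not code in
 * this file): take the lease, do work that must be fenced by it, then close
 * the lease and check whether it was broken in the meantime:
 *
 *	struct obd_client_handle *och;
 *	bool broken;
 *	int rc;
 *
 *	och = ll_lease_open(inode, file, FMODE_READ, 0);
 *	if (IS_ERR(och))
 *		return PTR_ERR(och);
 *	// ... lease-protected work, e.g. HSM release or mirror resync ...
 *	rc = ll_lease_close(och, inode, &broken);
 *	if (broken)
 *		rc = -EBUSY;	// another opener invalidated our work
 */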
/**
 * Check whether a layout swap can be done between two inodes.
 *
 * \param[in] inode1	First inode to check
 * \param[in] inode2	Second inode to check
 *
 * \retval 0 on success, layout swap can be performed between both inodes
 * \retval negative error code if requirements are not met
 */
static int ll_check_swap_layouts_validity(struct inode *inode1,
					  struct inode *inode2)
{
	if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
		return -EINVAL;

	if (inode_permission(inode1, MAY_WRITE) ||
	    inode_permission(inode2, MAY_WRITE))
		return -EPERM;

	if (inode1->i_sb != inode2->i_sb)
		return -EXDEV;

	return 0;
}
static int ll_swap_layouts_close(struct obd_client_handle *och,
				 struct inode *inode, struct inode *inode2)
{
	const struct lu_fid *fid1 = ll_inode2fid(inode);
	const struct lu_fid *fid2;
	int rc;
	ENTRY;

	CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
	       ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));

	rc = ll_check_swap_layouts_validity(inode, inode2);
	if (rc < 0)
		GOTO(out_free_och, rc);

	/* We now know that inode2 is a lustre inode */
	fid2 = ll_inode2fid(inode2);

	rc = lu_fid_cmp(fid1, fid2);
	if (rc == 0)
		GOTO(out_free_och, rc = -EINVAL);

	/* Close the file and {swap,merge} layouts between inode & inode2.
	 * NB: the lease lock handle is released in mdc_close_layout_swap_pack()
	 * because we still need it to pack l_remote_handle to the MDT. */
	rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
				       inode2);

	och = NULL; /* freed in ll_close_inode_openhandle() */

out_free_och:
	if (och != NULL)
		OBD_FREE_PTR(och);

	RETURN(rc);
}
/**
 * Release the lease and close the file.
 * It will check whether the lease has ever been broken.
 */
static int ll_lease_close_intent(struct obd_client_handle *och,
				 struct inode *inode,
				 bool *lease_broken, enum mds_op_bias bias,
				 void *data)
{
	struct ldlm_lock *lock;
	bool cancelled = true;
	int rc;
	ENTRY;

	lock = ldlm_handle2lock(&och->och_lease_handle);
	if (lock != NULL) {
		lock_res_and_lock(lock);
		cancelled = ldlm_is_cancel(lock);
		unlock_res_and_lock(lock);
		LDLM_LOCK_PUT(lock);
	}

	CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
	       PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);

	if (lease_broken != NULL)
		*lease_broken = cancelled;

	if (!cancelled && !bias)
		ldlm_cli_cancel(&och->och_lease_handle, 0);

	if (cancelled) { /* no need to execute intent */
		bias = 0;
		data = NULL;
	}

	rc = ll_close_inode_openhandle(inode, och, bias, data);
	RETURN(rc);
}

static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
			  bool *lease_broken)
{
	return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
}
/**
 * After the lease is taken, send the RPC MDS_REINT_RESYNC to the MDT.
 */
static int ll_lease_file_resync(struct obd_client_handle *och,
				struct inode *inode, unsigned long arg)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct ll_ioc_lease_id ioc;
	__u64 data_version_unused;
	int rc;
	ENTRY;

	op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
			   sizeof(ioc)))
		GOTO(out, rc = -EFAULT);

	/* before starting file resync, it's necessary to clean up the page
	 * cache in client memory, otherwise once the layout version is
	 * increased, writing back cached data will be denied by the OSTs. */
	rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
	if (rc)
		GOTO(out, rc);

	op_data->op_lease_handle = och->och_lease_handle;
	op_data->op_mirror_id = ioc.lil_mirror_id;
	rc = md_file_resync(sbi->ll_md_exp, op_data);
	if (rc)
		GOTO(out, rc);

	EXIT;
out:
	ll_finish_md_op_data(op_data);
	return rc;
}
int ll_merge_attr(const struct lu_env *env, struct inode *inode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct cl_attr *attr = vvp_env_thread_attr(env);
	s64 atime;
	s64 mtime;
	s64 ctime;
	int rc = 0;

	ENTRY;

	ll_inode_size_lock(inode);

	/* Merge timestamps most recently obtained from the MDS with
	 * timestamps obtained from the OSTs.
	 *
	 * Do not overwrite atime of the inode because it may be refreshed
	 * by the file_accessed() function. If the read was served by cached
	 * data, there is no RPC to be sent, so atime may not be transferred
	 * to the OSTs at all. The MDT only updates atime at close time if
	 * it's at least 'mdd.*.atime_diff' older.
	 * All in all, the atime in Lustre does not strictly comply with
	 * POSIX. Solving this problem would require sending an RPC to the
	 * MDT for each read, which would hurt performance. */
	if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
		LTIME_S(inode->i_atime) = lli->lli_atime;
		lli->lli_update_atime = 0;
	}
	LTIME_S(inode->i_mtime) = lli->lli_mtime;
	LTIME_S(inode->i_ctime) = lli->lli_ctime;

	atime = LTIME_S(inode->i_atime);
	mtime = LTIME_S(inode->i_mtime);
	ctime = LTIME_S(inode->i_ctime);

	cl_object_attr_lock(obj);
	if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
		rc = -EINVAL;
	else
		rc = cl_object_attr_get(env, obj, attr);
	cl_object_attr_unlock(obj);

	if (rc != 0)
		GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));

	if (atime < attr->cat_atime)
		atime = attr->cat_atime;

	if (ctime < attr->cat_ctime)
		ctime = attr->cat_ctime;

	if (mtime < attr->cat_mtime)
		mtime = attr->cat_mtime;

	CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
	       PFID(&lli->lli_fid), attr->cat_size);

	i_size_write(inode, attr->cat_size);
	inode->i_blocks = attr->cat_blocks;

	LTIME_S(inode->i_atime) = atime;
	LTIME_S(inode->i_mtime) = mtime;
	LTIME_S(inode->i_ctime) = ctime;

out_size_unlock:
	ll_inode_size_unlock(inode);

	RETURN(rc);
}
/**
 * Set designated mirror for I/O.
 *
 * So far only read, write, and truncate can support issuing I/O to
 * a designated mirror.
 */
void ll_io_set_mirror(struct cl_io *io, const struct file *file)
{
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

	/* clear layout version for generic (non-resync) I/O in case it
	 * carries a stale layout version due to I/O restart */
	io->ci_layout_version = 0;

	/* FLR: disable non-delay for designated mirror I/O because obviously
	 * only one mirror is available */
	if (fd->fd_designated_mirror > 0) {
		io->ci_ndelay = 0;
		io->ci_designated_mirror = fd->fd_designated_mirror;
		io->ci_layout_version = fd->fd_layout_version;
		io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
				 * I/O to ptasks */
	}

	CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
	       file->f_path.dentry->d_name.name, io->ci_designated_mirror);
}
static bool file_is_noatime(const struct file *file)
{
	const struct vfsmount *mnt = file->f_path.mnt;
	const struct inode *inode = file_inode((struct file *)file);

	/* Adapted from file_accessed() and touch_atime(). */
	if (file->f_flags & O_NOATIME)
		return true;

	if (inode->i_flags & S_NOATIME)
		return true;

	if (IS_NOATIME(inode))
		return true;

	if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
		return true;

	if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
		return true;

	if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
		return true;

	return false;
}
static int ll_file_io_ptask(struct cfs_ptask *ptask);

static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
{
	struct inode *inode = file_inode(file);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

	memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
	init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
	io->u.ci_rw.rw_file = file;
	io->u.ci_rw.rw_ptask = ll_file_io_ptask;
	io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
	io->ci_lock_no_expand = fd->ll_lock_no_expand;

	if (iot == CIT_WRITE) {
		io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
		io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
					 file->f_flags & O_DIRECT ||
					 IS_SYNC(inode));
	}
	io->ci_obj = ll_i2info(inode)->lli_clob;
	io->ci_lockreq = CILR_MAYBE;
	if (ll_file_nolock(file)) {
		io->ci_lockreq = CILR_NEVER;
		io->ci_no_srvlock = 1;
	} else if (file->f_flags & O_APPEND) {
		io->ci_lockreq = CILR_MANDATORY;
	}
	io->ci_noatime = file_is_noatime(file);
	if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
		io->ci_pio = !io->u.ci_rw.rw_append;
	else
		io->ci_pio = 0;

	/* FLR: only use non-delay I/O for read as there is only one
	 * available mirror for write. */
	io->ci_ndelay = !(iot == CIT_WRITE);

	ll_io_set_mirror(io, file);
}
static int ll_file_io_ptask(struct cfs_ptask *ptask)
{
	struct cl_io_pt *pt = ptask->pt_cbdata;
	struct file *file = pt->cip_file;
	struct lu_env *env;
	struct cl_io *io;
	loff_t pos = pt->cip_pos;
	int rc;
	__u16 refcheck;
	ENTRY;

	CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
	       file_dentry(file)->d_name.name,
	       pt->cip_iot == CIT_READ ? "read" : "write",
	       pos, pos + pt->cip_count);

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	io = vvp_env_thread_io(env);
	ll_io_init(io, file, pt->cip_iot);
	io->u.ci_rw.rw_iter = pt->cip_iter;
	io->u.ci_rw.rw_iocb = pt->cip_iocb;
	io->ci_pio = 0; /* It's already in parallel task */

	rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
			   pt->cip_count - pt->cip_result);
	if (!rc) {
		struct vvp_io *vio = vvp_env_io(env);

		vio->vui_io_subtype = IO_NORMAL;
		vio->vui_fd = LUSTRE_FPRIVATE(file);

		ll_cl_add(file, env, io, LCC_RW);
		rc = cl_io_loop(env, io);
		ll_cl_remove(file, env);
	} else {
		/* cl_io_rw_init() handled IO */
		rc = io->ci_result;
	}

	if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
		if (io->ci_nob > 0)
			io->ci_nob /= 2;
		rc = -EIO;
	}

	if (io->ci_nob > 0) {
		pt->cip_result += io->ci_nob;
		iov_iter_advance(&pt->cip_iter, io->ci_nob);
		pos += io->ci_nob;
		pt->cip_iocb.ki_pos = pos;
#ifdef HAVE_KIOCB_KI_LEFT
		pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
#elif defined(HAVE_KI_NBYTES)
		pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
#endif
	}

	cl_io_fini(env, io);
	cl_env_put(env, &refcheck);

	pt->cip_need_restart = io->ci_need_restart;

	CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
	       file_dentry(file)->d_name.name,
	       pt->cip_iot == CIT_READ ? "read" : "write",
	       pt->cip_result, rc);

	RETURN(pt->cip_result > 0 ? 0 : rc);
}
static ssize_t
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
		   struct file *file, enum cl_io_type iot,
		   loff_t *ppos, size_t count)
{
	struct range_lock range;
	struct vvp_io *vio = vvp_env_io(env);
	struct inode *inode = file_inode(file);
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct cl_io *io;
	loff_t pos = *ppos;
	ssize_t result = 0;
	int rc = 0;
	unsigned retried = 0;
	bool restarted = false;

	ENTRY;

	CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
	       file_dentry(file)->d_name.name,
	       iot == CIT_READ ? "read" : "write", pos, pos + count);

restart:
	io = vvp_env_thread_io(env);
	ll_io_init(io, file, iot);
	if (args->via_io_subtype == IO_NORMAL) {
		io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
		io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
	}
	if (args->via_io_subtype != IO_NORMAL || restarted)
		io->ci_pio = 0;
	io->ci_ndelay_tried = retried;

	if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
		bool range_locked = false;

		if (file->f_flags & O_APPEND)
			range_lock_init(&range, 0, LUSTRE_EOF);
		else
			range_lock_init(&range, pos, pos + count - 1);

		vio->vui_fd = LUSTRE_FPRIVATE(file);
		vio->vui_io_subtype = args->via_io_subtype;

		switch (vio->vui_io_subtype) {
		case IO_NORMAL:
			/* Direct IO reads must also take the range lock,
			 * or multiple reads will try to work on the same pages.
			 * See LU-6227 for details. */
			if (((iot == CIT_WRITE) ||
			    (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
			    !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
				CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
				       RL_PARA(&range));
				rc = range_lock(&lli->lli_write_tree, &range);
				if (rc < 0)
					GOTO(out, rc);

				range_locked = true;
			}
			break;
		case IO_SPLICE:
			vio->u.splice.vui_pipe = args->u.splice.via_pipe;
			vio->u.splice.vui_flags = args->u.splice.via_flags;
			break;
		default:
			CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
			LBUG();
		}

		ll_cl_add(file, env, io, LCC_RW);
		if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
		    !lli->lli_inode_locked) {
			inode_lock(inode);
			lli->lli_inode_locked = 1;
		}
		rc = cl_io_loop(env, io);
		if (lli->lli_inode_locked) {
			lli->lli_inode_locked = 0;
			inode_unlock(inode);
		}
		ll_cl_remove(file, env);

		if (range_locked) {
			CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
			       RL_PARA(&range));
			range_unlock(&lli->lli_write_tree, &range);
		}
	} else {
		/* cl_io_rw_init() handled IO */
		rc = io->ci_result;
	}

	if (io->ci_nob > 0) {
		result += io->ci_nob;
		count -= io->ci_nob;

		if (args->via_io_subtype == IO_NORMAL) {
			iov_iter_advance(args->u.normal.via_iter, io->ci_nob);

			/* CLIO is too complicated. See LU-11069. */
			if (cl_io_is_append(io))
				pos = io->u.ci_rw.rw_iocb.ki_pos;
			else
				pos += io->ci_nob;

			args->u.normal.via_iocb->ki_pos = pos;
#ifdef HAVE_KIOCB_KI_LEFT
			args->u.normal.via_iocb->ki_left = count;
#elif defined(HAVE_KI_NBYTES)
			args->u.normal.via_iocb->ki_nbytes = count;
#endif
		} else {
			/* for splice */
			pos = io->u.ci_rw.rw_range.cir_pos;
		}
	}
out:
	cl_io_fini(env, io);

	CDEBUG(D_VFSTRACE,
	       "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
	       file->f_path.dentry->d_name.name,
	       iot, rc, result, io->ci_need_restart);

	if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
		CDEBUG(D_VFSTRACE,
		       "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
		       file_dentry(file)->d_name.name,
		       iot == CIT_READ ? "read" : "write",
		       pos, pos + count, result, rc);
		/* preserve the tried count for FLR */
		retried = io->ci_ndelay_tried;
		restarted = true;
		goto restart;
	}

	if (iot == CIT_READ) {
		if (result > 0)
			ll_stats_ops_tally(ll_i2sbi(inode),
					   LPROC_LL_READ_BYTES, result);
	} else if (iot == CIT_WRITE) {
		if (result > 0) {
			ll_stats_ops_tally(ll_i2sbi(inode),
					   LPROC_LL_WRITE_BYTES, result);
			fd->fd_write_failed = false;
		} else if (result == 0 && rc == 0) {
			rc = io->ci_result;
			if (rc < 0)
				fd->fd_write_failed = true;
			else
				fd->fd_write_failed = false;
		} else if (rc != -ERESTARTSYS) {
			fd->fd_write_failed = true;
		}
	}

	CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
	       file_dentry(file)->d_name.name,
	       iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);

	*ppos = pos;

	RETURN(result > 0 ? result : rc);
}
/**
 * The purpose of fast read is to overcome the per-I/O overhead and improve
 * IOPS, especially for small I/O.
 *
 * To serve a read request, CLIO has to create and initialize a cl_io and
 * then request a DLM lock. This has turned out to have significant overhead
 * and affects the performance of small I/O dramatically.
 *
 * It's not necessary to create a cl_io for each I/O. With the help of read
 * ahead, most of the pages being read are already in the memory cache and we
 * can read those pages directly: if the pages exist, the corresponding DLM
 * lock must exist too, so the page content must be valid.
 *
 * In the fast read implementation, llite speculatively finds and reads pages
 * in the memory cache. There are three scenarios for fast read:
 *   - If the page exists and is uptodate, the kernel VM will provide the
 *     data and CLIO won't be intervened;
 *   - If the page was brought into memory by read ahead, it will be exported
 *     and the read ahead parameters will be updated;
 *   - Otherwise the page is not in memory and we can't do fast read.
 *     Therefore, it will go back and invoke normal read, i.e., a cl_io will
 *     be created and a DLM lock will be requested.
 *
 * POSIX compliance: the POSIX standard states that read is intended to be
 * atomic. The Lustre read implementation is in line with the Linux kernel
 * read implementation and neither of them complies with the POSIX standard
 * in this matter. Fast read doesn't make the situation worse on a single
 * node but it may interleave write results from multiple nodes due to the
 * short read handling in ll_file_aio_read().
 *
 * \param iocb - kiocb from kernel
 * \param iter - user space buffers where the data will be copied
 *
 * \retval - number of bytes that have been read, or error code on error.
 */
static ssize_t
ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
{
	ssize_t result;

	if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
		return 0;

	/* NB: we can't do direct IO for fast read because it will need a lock
	 * to make the IO engine happy. */
	if (iocb->ki_filp->f_flags & O_DIRECT)
		return 0;

	result = generic_file_read_iter(iocb, iter);

	/* If the first page is not in cache, generic_file_aio_read() will
	 * return -ENODATA.
	 * See the corresponding code in ll_readpage(). */
	if (result == -ENODATA)
		result = 0;

	if (result > 0)
		ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
				   LPROC_LL_READ_BYTES, result);

	return result;
}
/*
 * Read from a file (through the page cache).
 */
static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct lu_env *env;
	struct vvp_io_args *args;
	ssize_t result;
	ssize_t rc2;
	__u16 refcheck;

	result = ll_do_fast_read(iocb, to);
	if (result < 0 || iov_iter_count(to) == 0)
		GOTO(out, result);

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		return PTR_ERR(env);

	args = ll_env_args(env, IO_NORMAL);
	args->u.normal.via_iter = to;
	args->u.normal.via_iocb = iocb;

	rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
				 &iocb->ki_pos, iov_iter_count(to));
	if (rc2 > 0)
		result += rc2;
	else if (result == 0)
		result = rc2;

	cl_env_put(env, &refcheck);
out:
	return result;
}
/**
 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
 * If a page is already in the page cache and dirty (and some other things -
 * see ll_tiny_write_begin for the instantiation of these rules), then we can
 * write to it without doing a full I/O, because Lustre already knows about it
 * and will write it out. This saves a lot of processing time.
 *
 * All writes here are within one page, so exclusion is handled by the page
 * lock on the vm page. We do not do tiny writes for writes which touch
 * multiple pages because it's very unlikely multiple sequential pages are
 * already dirty.
 *
 * We limit these to writes of less than PAGE_SIZE because PAGE_SIZE writes
 * are relatively common and are unlikely to be to already dirty pages.
 *
 * Attribute updates are important here, we do them in ll_tiny_write_end.
 */
static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
{
	ssize_t count = iov_iter_count(iter);
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	ssize_t result = 0;

	ENTRY;

	/* Restrict writes to a single page and < PAGE_SIZE. See comment at
	 * top of function for why.
	 */
	if (count >= PAGE_SIZE ||
	    (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
		RETURN(result);
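	/* Worked example of the check above (illustrative, assuming 4 KiB
	 * pages): a write of count=10 at pos=4090 has an in-page offset of
	 * 4090 & 4095 = 4090, and 4090 + 10 = 4100 > 4096, so it would cross
	 * a page boundary and is rejected. The same 10 bytes at pos=4096
	 * start a fresh page (in-page offset 0) and qualify. */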
	result = __generic_file_write_iter(iocb, iter);

	/* If the page is not already dirty, ll_tiny_write_begin returns
	 * -ENODATA. We continue on to normal write.
	 */
	if (result == -ENODATA)
		result = 0;

	if (result > 0) {
		ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
				   result);
		ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
	}

	CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);

	RETURN(result);
}
/*
 * Write to a file (through the page cache).
 */
static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct vvp_io_args *args;
	struct lu_env *env;
	ssize_t rc_tiny = 0, rc_normal;
	__u16 refcheck;

	ENTRY;

	/* NB: we can't do direct IO for tiny writes because they use the page
	 * cache, we can't do sync writes because tiny writes can't flush
	 * pages, and we can't do append writes because we can't guarantee the
	 * required DLM locks are held to protect file size.
	 */
	if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
	    !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
		rc_tiny = ll_do_tiny_write(iocb, from);

	/* In case of error, go on and try the normal write - only stop if the
	 * tiny write completed the I/O.
	 */
	if (iov_iter_count(from) == 0)
		GOTO(out, rc_normal = rc_tiny);

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		return PTR_ERR(env);

	args = ll_env_args(env, IO_NORMAL);
	args->u.normal.via_iter = from;
	args->u.normal.via_iocb = iocb;

	rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
				       &iocb->ki_pos, iov_iter_count(from));

	/* On success, combine bytes written. */
	if (rc_tiny >= 0 && rc_normal > 0)
		rc_normal += rc_tiny;
	/* On error, only return the error from the normal write if the tiny
	 * write did not write any bytes. Otherwise return the bytes written
	 * by the tiny write.
	 */
	else if (rc_tiny > 0)
		rc_normal = rc_tiny;

	cl_env_put(env, &refcheck);
out:
	RETURN(rc_normal);
}
#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
/*
 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
 */
static int ll_file_get_iov_count(const struct iovec *iov,
				 unsigned long *nr_segs, size_t *count)
{
	size_t cnt = 0;
	unsigned long seg;

	for (seg = 0; seg < *nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		cnt += iv->iov_len;
		if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
			continue;
		if (seg == 0)
			return -EFAULT;
		*nr_segs = seg;
		cnt -= iv->iov_len;	/* This segment is no good */
		break;
	}
	*count = cnt;
	return 0;
}
static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos)
{
	struct iov_iter to;
	size_t iov_count;
	ssize_t result;
	ENTRY;

	result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
	if (result)
		RETURN(result);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
	iov_iter_init(&to, READ, iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
	iov_iter_init(&to, iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

	result = ll_file_read_iter(iocb, &to);

	RETURN(result);
}
static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
			    loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = count };
	struct kiocb kiocb;
	ssize_t result;
	ENTRY;

	init_sync_kiocb(&kiocb, file);
	kiocb.ki_pos = *ppos;
#ifdef HAVE_KIOCB_KI_LEFT
	kiocb.ki_left = count;
#elif defined(HAVE_KI_NBYTES)
	kiocb.ki_nbytes = count;
#endif

	result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
	*ppos = kiocb.ki_pos;

	RETURN(result);
}
/*
 * Write to a file (through the page cache).
 */
static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
				 unsigned long nr_segs, loff_t pos)
{
	struct iov_iter from;
	size_t iov_count;
	ssize_t result;
	ENTRY;

	result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
	if (result)
		RETURN(result);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
	iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
	iov_iter_init(&from, iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

	result = ll_file_write_iter(iocb, &from);

	RETURN(result);
}
static ssize_t ll_file_write(struct file *file, const char __user *buf,
			     size_t count, loff_t *ppos)
{
	struct iovec iov = { .iov_base = (void __user *)buf,
			     .iov_len = count };
	struct kiocb kiocb;
	ssize_t result;
	ENTRY;

	init_sync_kiocb(&kiocb, file);
	kiocb.ki_pos = *ppos;
#ifdef HAVE_KIOCB_KI_LEFT
	kiocb.ki_left = count;
#elif defined(HAVE_KI_NBYTES)
	kiocb.ki_nbytes = count;
#endif

	result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
	*ppos = kiocb.ki_pos;

	RETURN(result);
}
#endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
/*
 * Send file content (through the page cache) somewhere with a helper
 */
static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
				   struct pipe_inode_info *pipe, size_t count,
				   unsigned int flags)
{
	struct lu_env *env;
	struct vvp_io_args *args;
	ssize_t result;
	__u16 refcheck;
	ENTRY;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	args = ll_env_args(env, IO_SPLICE);
	args->u.splice.via_pipe = pipe;
	args->u.splice.via_flags = flags;

	result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
	cl_env_put(env, &refcheck);
	RETURN(result);
}
int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
			     __u64 flags, struct lov_user_md *lum,
			     int lum_size)
{
	struct lookup_intent oit = {
		.it_op = IT_OPEN,
		.it_flags = flags | MDS_OPEN_BY_FID,
	};
	int rc;
	ENTRY;

	ll_inode_size_lock(inode);
	rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
	if (rc < 0)
		GOTO(out_unlock, rc);

	ll_release_openhandle(dentry, &oit);

out_unlock:
	ll_inode_size_unlock(inode);
	ll_intent_release(&oit);

	RETURN(rc);
}
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
			     struct lov_mds_md **lmmp, int *lmm_size,
			     struct ptlrpc_request **request)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct mdt_body *body;
	struct lov_mds_md *lmm = NULL;
	struct ptlrpc_request *req = NULL;
	struct md_op_data *op_data;
	int rc, lmmsize;
	ENTRY;

	rc = ll_get_default_mdsize(sbi, &lmmsize);
	if (rc)
		RETURN(rc);

	op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
				     strlen(filename), lmmsize,
				     LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
	rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
	ll_finish_md_op_data(op_data);
	if (rc < 0) {
		CDEBUG(D_INFO, "md_getattr_name failed on %s: rc %d\n",
		       filename, rc);
		GOTO(out, rc);
	}

	body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
	LASSERT(body != NULL); /* checked by mdc_getattr_name */

	lmmsize = body->mbo_eadatasize;

	if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
	    lmmsize == 0)
		GOTO(out, rc = -ENODATA);

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
	LASSERT(lmm != NULL);

	if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
	    lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
	    lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
		GOTO(out, rc = -EPROTO);

	/*
	 * This is coming from the MDS, so it is probably in
	 * little endian. We convert it to host endian before
	 * passing it to userspace.
	 */
	if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
		int stripe_count = 0;

		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
		    lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
			if (le32_to_cpu(lmm->lmm_pattern) &
			    LOV_PATTERN_F_RELEASED)
				stripe_count = 0;
		}

		/* if the function is called for a directory, we should
		 * avoid swab of nonexistent lsm objects */
		if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
			lustre_swab_lov_user_md_v1(
					(struct lov_user_md_v1 *)lmm);
			if (S_ISREG(body->mbo_mode))
				lustre_swab_lov_user_md_objects(
				    ((struct lov_user_md_v1 *)lmm)->lmm_objects,
				    stripe_count);
		} else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
			lustre_swab_lov_user_md_v3(
					(struct lov_user_md_v3 *)lmm);
			if (S_ISREG(body->mbo_mode))
				lustre_swab_lov_user_md_objects(
				    ((struct lov_user_md_v3 *)lmm)->lmm_objects,
				    stripe_count);
		} else if (lmm->lmm_magic ==
			   cpu_to_le32(LOV_MAGIC_COMP_V1)) {
			lustre_swab_lov_comp_md_v1(
					(struct lov_comp_md_v1 *)lmm);
		}
	}

out:
	*lmmp = lmm;
	*lmm_size = lmmsize;
	*request = req;
	RETURN(rc);
}
static int ll_lov_setea(struct inode *inode, struct file *file,
			void __user *arg)
{
	__u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
	struct lov_user_md *lump;
	int lum_size = sizeof(struct lov_user_md) +
		       sizeof(struct lov_user_ost_data);
	int rc;
	ENTRY;

	if (!cfs_capable(CFS_CAP_SYS_ADMIN))
		RETURN(-EPERM);

	OBD_ALLOC_LARGE(lump, lum_size);
	if (lump == NULL)
		RETURN(-ENOMEM);

	if (copy_from_user(lump, arg, lum_size))
		GOTO(out_lump, rc = -EFAULT);

	rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
				      lum_size);
	cl_lov_delay_create_clear(&file->f_flags);

out_lump:
	OBD_FREE_LARGE(lump, lum_size);
	RETURN(rc);
}
static int ll_file_getstripe(struct inode *inode, void __user *lum,
			     size_t size)
{
	struct lu_env *env;
	__u16 refcheck;
	int rc;
	ENTRY;

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
	cl_env_put(env, &refcheck);
	RETURN(rc);
}
static int ll_lov_setstripe(struct inode *inode, struct file *file,
			    void __user *arg)
{
	struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
	struct lov_user_md *klum;
	int lum_size, rc;
	__u64 flags = FMODE_WRITE;
	ENTRY;

	rc = ll_copy_user_md(lum, &klum);
	if (rc < 0)
		RETURN(rc);

	lum_size = rc;
	rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
				      lum_size);
	if (!rc) {
		__u32 gen;

		rc = put_user(0, &lum->lmm_stripe_count);
		if (rc)
			GOTO(out, rc);

		rc = ll_layout_refresh(inode, &gen);
		if (rc)
			GOTO(out, rc);

		rc = ll_file_getstripe(inode, arg, lum_size);
	}
	cl_lov_delay_create_clear(&file->f_flags);

out:
	OBD_FREE(klum, lum_size);
	RETURN(rc);
}
static int
ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_grouplock grouplock;
	int rc;
	ENTRY;

	if (arg == 0) {
		CWARN("group id for group lock must not be 0\n");
		RETURN(-EINVAL);
	}

	if (ll_file_nolock(file))
		RETURN(-EOPNOTSUPP);

	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		CWARN("group lock already exists with gid %lu\n",
		      fd->fd_grouplock.lg_gid);
		spin_unlock(&lli->lli_lock);
		RETURN(-EINVAL);
	}
	LASSERT(fd->fd_grouplock.lg_lock == NULL);
	spin_unlock(&lli->lli_lock);

	/**
	 * XXX: a group lock needs to protect all OST objects, while PFL
	 * can add new OST objects during the IO, so we'd instantiate
	 * all OST objects before getting its group lock.
	 */
	if (obj) {
		struct lu_env *env;
		__u16 refcheck;
		struct cl_layout cl = {
			.cl_is_composite = false,
		};
		struct lu_extent ext = {
			.e_start = 0,
			.e_end = OBD_OBJECT_EOF,
		};

		env = cl_env_get(&refcheck);
		if (IS_ERR(env))
			RETURN(PTR_ERR(env));

		rc = cl_object_layout_get(env, obj, &cl);
		if (!rc && cl.cl_is_composite)
			rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
						    &ext);

		cl_env_put(env, &refcheck);
		if (rc)
			RETURN(rc);
	}

	rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
			      arg, (file->f_flags & O_NONBLOCK), &grouplock);
	if (rc)
		RETURN(rc);

	spin_lock(&lli->lli_lock);
	if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
		spin_unlock(&lli->lli_lock);
		CERROR("another thread just won the race\n");
		cl_put_grouplock(&grouplock);
		RETURN(-EINVAL);
	}

	fd->fd_flags |= LL_FILE_GROUP_LOCKED;
	fd->fd_grouplock = grouplock;
	spin_unlock(&lli->lli_lock);

	CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
	RETURN(0);
}
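/*
 * Illustration only (userspace side, not part of this module): group locks
 * are normally driven through the LL_IOC_GROUP_LOCK/LL_IOC_GROUP_UNLOCK
 * ioctls with an application-chosen non-zero gid. A hedged sketch:
 *
 *	int gid = 1234;	// arbitrary non-zero group id
 *	if (ioctl(fd, LL_IOC_GROUP_LOCK, gid) == 0) {
 *		// ... I/O excluded from non-group users ...
 *		ioctl(fd, LL_IOC_GROUP_UNLOCK, gid);
 *	}
 */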
static int ll_put_grouplock(struct inode *inode, struct file *file,
			    unsigned long arg)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
	struct ll_grouplock grouplock;
	ENTRY;

	spin_lock(&lli->lli_lock);
	if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
		spin_unlock(&lli->lli_lock);
		CWARN("no group lock held\n");
		RETURN(-EINVAL);
	}

	LASSERT(fd->fd_grouplock.lg_lock != NULL);

	if (fd->fd_grouplock.lg_gid != arg) {
		CWARN("group lock %lu doesn't match current id %lu\n",
		      arg, fd->fd_grouplock.lg_gid);
		spin_unlock(&lli->lli_lock);
		RETURN(-EINVAL);
	}

	grouplock = fd->fd_grouplock;
	memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
	fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
	spin_unlock(&lli->lli_lock);

	cl_put_grouplock(&grouplock);
	CDEBUG(D_INFO, "group lock %lu released\n", arg);
	RETURN(0);
}
2272 * Close inode open handle
2274 * \param dentry [in] dentry which contains the inode
2275 * \param it [in,out] intent which contains open info and result
2278 * \retval <0 failure
2280 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2282 struct inode *inode = dentry->d_inode;
2283 struct obd_client_handle *och;
2289 /* Root ? Do nothing. */
2290 if (dentry->d_inode->i_sb->s_root == dentry)
2293 /* No open handle to close? Move away */
2294 if (!it_disposition(it, DISP_OPEN_OPEN))
2297 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2299 OBD_ALLOC(och, sizeof(*och));
2301 GOTO(out, rc = -ENOMEM);
2303 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2305 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2307 /* this one is in place of ll_file_open */
2308 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2309 ptlrpc_req_finished(it->it_request);
2310 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
 * Get the size of the inode for which the FIEMAP mapping is requested,
 * make the FIEMAP get_info call, and return the result.
 * \param fiemap	kernel buffer to hold extents
 * \param num_bytes	kernel buffer size
2321 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2327 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2330 /* Checks for fiemap flags */
2331 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2332 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2336 /* Check for FIEMAP_FLAG_SYNC */
2337 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2338 rc = filemap_fdatawrite(inode->i_mapping);
2343 env = cl_env_get(&refcheck);
2345 RETURN(PTR_ERR(env));
2347 if (i_size_read(inode) == 0) {
2348 rc = ll_glimpse_size(inode);
2353 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2354 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2355 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
/* If the file size is 0, then there are no objects to map */
2358 if (fmkey.lfik_oa.o_size == 0) {
2359 fiemap->fm_mapped_extents = 0;
2363 fmkey.lfik_fiemap = *fiemap;
2365 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2366 &fmkey, fiemap, &num_bytes);
2368 cl_env_put(env, &refcheck);
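/*
 * Handle OBD_IOC_FID2PATH: resolve a FID to a path relative to the
 * mount point. The caller supplies a struct getinfo_fid2path with
 * gf_pathlen sized for the reply buffer; the translation itself is
 * done by the MDT via obd_iocontrol() below.
 */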
2372 int ll_fid2path(struct inode *inode, void __user *arg)
2374 struct obd_export *exp = ll_i2mdexp(inode);
2375 const struct getinfo_fid2path __user *gfin = arg;
2377 struct getinfo_fid2path *gfout;
2383 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2384 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2387 /* Only need to get the buflen */
2388 if (get_user(pathlen, &gfin->gf_pathlen))
2391 if (pathlen > PATH_MAX)
2394 outsize = sizeof(*gfout) + pathlen;
2395 OBD_ALLOC(gfout, outsize);
2399 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2400 GOTO(gf_free, rc = -EFAULT);
/* Append the root FID after gfout so the MDT knows the root FID and
 * can look up the correct path; this is mainly for fileset. Old
 * servers without fileset mount support will ignore it. */
2404 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2406 /* Call mdc_iocontrol */
2407 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2411 if (copy_to_user(arg, gfout, outsize))
2415 OBD_FREE(gfout, outsize);
2420 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2422 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2430 ioc->idv_version = 0;
2431 ioc->idv_layout_version = UINT_MAX;
/* If no file object has been initialized, we consider its version
 * to be 0. */
2437 env = cl_env_get(&refcheck);
2439 RETURN(PTR_ERR(env));
2441 io = vvp_env_thread_io(env);
2443 io->u.ci_data_version.dv_data_version = 0;
2444 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2445 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2448 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2449 result = cl_io_loop(env, io);
2451 result = io->ci_result;
2453 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2454 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2456 cl_io_fini(env, io);
2458 if (unlikely(io->ci_need_restart))
2461 cl_env_put(env, &refcheck);
 * Read the data_version for the inode.
 *
 * This value is computed using the stripe object versions on the OSTs.
 * The version is computed using server-side locking.
 *
 * @param flags	whether to do a sync on the OST side;
2474 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2475 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
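 *
 * A minimal sketch of the matching LL_IOC_DATA_VERSION ioctl usage,
 * with the fields as handled in ll_file_ioctl() below (error handling
 * omitted):
 *
 *	struct ioc_data_version idv = { .idv_flags = LL_DV_RD_FLUSH };
 *
 *	rc = ioctl(fd, LL_IOC_DATA_VERSION, &idv);
 *	if (rc == 0)
 *		version = idv.idv_version;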
2477 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2479 struct ioc_data_version ioc = { .idv_flags = flags };
2482 rc = ll_ioc_data_version(inode, &ioc);
2484 *data_version = ioc.idv_version;
2490 * Trigger a HSM release request for the provided inode.
2492 int ll_hsm_release(struct inode *inode)
2495 struct obd_client_handle *och = NULL;
2496 __u64 data_version = 0;
2501 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2502 ll_get_fsname(inode->i_sb, NULL, 0),
2503 PFID(&ll_i2info(inode)->lli_fid));
2505 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2507 GOTO(out, rc = PTR_ERR(och));
2509 /* Grab latest data_version and [am]time values */
2510 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2514 env = cl_env_get(&refcheck);
2516 GOTO(out, rc = PTR_ERR(env));
2518 rc = ll_merge_attr(env, inode);
2519 cl_env_put(env, &refcheck);
/* If an error happens, we have the wrong size for the file.
2527 /* Release the file.
2528 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2529 * we still need it to pack l_remote_handle to MDT. */
2530 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2536 if (och != NULL && !IS_ERR(och)) /* close the file */
2537 ll_lease_close(och, inode, NULL);
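/*
 * Bookkeeping for ll_swap_layouts(): the two inodes involved plus the
 * data versions to verify when SWAP_LAYOUTS_CHECK_DV1/2 is requested.
 */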
2542 struct ll_swap_stack {
2545 struct inode *inode1;
2546 struct inode *inode2;
2551 static int ll_swap_layouts(struct file *file1, struct file *file2,
2552 struct lustre_swap_layouts *lsl)
2554 struct mdc_swap_layouts msl;
2555 struct md_op_data *op_data;
2558 struct ll_swap_stack *llss = NULL;
2561 OBD_ALLOC_PTR(llss);
2565 llss->inode1 = file_inode(file1);
2566 llss->inode2 = file_inode(file2);
2568 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
/* we use two bools because they are easier to swap than two bits */
2573 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2574 llss->check_dv1 = true;
2576 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2577 llss->check_dv2 = true;
2579 /* we cannot use lsl->sl_dvX directly because we may swap them */
2580 llss->dv1 = lsl->sl_dv1;
2581 llss->dv2 = lsl->sl_dv2;
2583 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2584 if (rc == 0) /* same file, done! */
2587 if (rc < 0) { /* sequentialize it */
2588 swap(llss->inode1, llss->inode2);
2590 swap(llss->dv1, llss->dv2);
2591 swap(llss->check_dv1, llss->check_dv2);
2595 if (gid != 0) { /* application asks to flush dirty cache */
2596 rc = ll_get_grouplock(llss->inode1, file1, gid);
2600 rc = ll_get_grouplock(llss->inode2, file2, gid);
2602 ll_put_grouplock(llss->inode1, file1, gid);
/* Ultimate check: before swapping the layouts, we verify that the
 * data version has not changed (if requested). */
2609 if (llss->check_dv1) {
2610 rc = ll_data_version(llss->inode1, &dv, 0);
2613 if (dv != llss->dv1)
2614 GOTO(putgl, rc = -EAGAIN);
2617 if (llss->check_dv2) {
2618 rc = ll_data_version(llss->inode2, &dv, 0);
2621 if (dv != llss->dv2)
2622 GOTO(putgl, rc = -EAGAIN);
/* struct md_op_data is used to send the swap args to the MDT;
 * only the flags are missing, so we pass struct mdc_swap_layouts
 * through md_op_data->op_data. */
/* Flags from user space have to be converted before being sent to
 * the server; no flag is sent today, they are only used on the
 * client. */
2632 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2633 0, LUSTRE_OPC_ANY, &msl);
2634 if (IS_ERR(op_data))
2635 GOTO(free, rc = PTR_ERR(op_data));
2637 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2638 sizeof(*op_data), op_data, NULL);
2639 ll_finish_md_op_data(op_data);
2646 ll_put_grouplock(llss->inode2, file2, gid);
2647 ll_put_grouplock(llss->inode1, file1, gid);
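/*
 * Set and/or clear HSM flags on the inode. The request is validated
 * against HSM_FLAGS_MASK and HSM_USER_MASK below, then forwarded to
 * the MDT through obd_iocontrol(LL_IOC_HSM_STATE_SET).
 */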
2657 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2659 struct md_op_data *op_data;
/* Detect out-of-range masks */
2664 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2667 /* Non-root users are forbidden to set or clear flags which are
2668 * NOT defined in HSM_USER_MASK. */
2669 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2670 !cfs_capable(CFS_CAP_SYS_ADMIN))
/* Detect out-of-range archive id */
2674 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2675 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2678 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2679 LUSTRE_OPC_ANY, hss);
2680 if (IS_ERR(op_data))
2681 RETURN(PTR_ERR(op_data));
2683 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2684 sizeof(*op_data), op_data, NULL);
2686 ll_finish_md_op_data(op_data);
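/*
 * Import an existing archived file as released: mark it
 * HS_ARCHIVED | HS_EXISTS | HS_RELEASED in the given archive, then
 * restore the saved attributes (mode, uid/gid, size, [am]times) with
 * ll_setattr_raw().
 */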
2691 static int ll_hsm_import(struct inode *inode, struct file *file,
2692 struct hsm_user_import *hui)
2694 struct hsm_state_set *hss = NULL;
2695 struct iattr *attr = NULL;
2699 if (!S_ISREG(inode->i_mode))
2705 GOTO(out, rc = -ENOMEM);
2707 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2708 hss->hss_archive_id = hui->hui_archive_id;
2709 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2710 rc = ll_hsm_state_set(inode, hss);
2714 OBD_ALLOC_PTR(attr);
2716 GOTO(out, rc = -ENOMEM);
2718 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2719 attr->ia_mode |= S_IFREG;
2720 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2721 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2722 attr->ia_size = hui->hui_size;
2723 attr->ia_mtime.tv_sec = hui->hui_mtime;
2724 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2725 attr->ia_atime.tv_sec = hui->hui_atime;
2726 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2728 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2729 ATTR_UID | ATTR_GID |
2730 ATTR_MTIME | ATTR_MTIME_SET |
2731 ATTR_ATIME | ATTR_ATIME_SET;
2735 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2739 inode_unlock(inode);
2751 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2753 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2754 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2757 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2759 struct inode *inode = file_inode(file);
2761 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2762 ATTR_MTIME | ATTR_MTIME_SET |
2765 .tv_sec = lfu->lfu_atime_sec,
2766 .tv_nsec = lfu->lfu_atime_nsec,
2769 .tv_sec = lfu->lfu_mtime_sec,
2770 .tv_nsec = lfu->lfu_mtime_nsec,
2773 .tv_sec = lfu->lfu_ctime_sec,
2774 .tv_nsec = lfu->lfu_ctime_nsec,
2780 if (!capable(CAP_SYS_ADMIN))
2783 if (!S_ISREG(inode->i_mode))
2787 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2789 inode_unlock(inode);
2794 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2797 case MODE_READ_USER:
2799 case MODE_WRITE_USER:
2806 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2808 /* Used to allow the upper layers of the client to request an LDLM lock
2809 * without doing an actual read or write.
2811 * Used for ladvise lockahead to manually request specific locks.
2813 * \param[in] file file this ladvise lock request is on
2814 * \param[in] ladvise ladvise struct describing this lock request
2816 * \retval 0 success, no detailed result available (sync requests
2817 * and requests sent to the server [not handled locally]
2818 * cannot return detailed results)
2819 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2820 * see definitions for details.
2821 * \retval negative negative errno on error
2823 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2825 struct lu_env *env = NULL;
2826 struct cl_io *io = NULL;
2827 struct cl_lock *lock = NULL;
2828 struct cl_lock_descr *descr = NULL;
2829 struct dentry *dentry = file->f_path.dentry;
2830 struct inode *inode = dentry->d_inode;
2831 enum cl_lock_mode cl_mode;
2832 off_t start = ladvise->lla_start;
2833 off_t end = ladvise->lla_end;
2839 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2840 "start=%llu, end=%llu\n", dentry->d_name.len,
2841 dentry->d_name.name, dentry->d_inode,
2842 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2845 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2847 GOTO(out, result = cl_mode);
2849 /* Get IO environment */
2850 result = cl_io_get(inode, &env, &io, &refcheck);
2854 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2857 * nothing to do for this io. This currently happens when
 * stripe sub-objects are not yet created.
2860 result = io->ci_result;
2861 } else if (result == 0) {
2862 lock = vvp_env_lock(env);
2863 descr = &lock->cll_descr;
2865 descr->cld_obj = io->ci_obj;
2866 /* Convert byte offsets to pages */
2867 descr->cld_start = cl_index(io->ci_obj, start);
2868 descr->cld_end = cl_index(io->ci_obj, end);
2869 descr->cld_mode = cl_mode;
2870 /* CEF_MUST is used because we do not want to convert a
2871 * lockahead request to a lockless lock */
2872 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2875 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2876 descr->cld_enq_flags |= CEF_SPECULATIVE;
2878 result = cl_lock_request(env, io, lock);
2880 /* On success, we need to release the lock */
2882 cl_lock_release(env, lock);
2884 cl_io_fini(env, io);
2885 cl_env_put(env, &refcheck);
2887 /* -ECANCELED indicates a matching lock with a different extent
2888 * was already present, and -EEXIST indicates a matching lock
2889 * on exactly the same extent was already present.
2890 * We convert them to positive values for userspace to make
2891 * recognizing true errors easier.
2892 * Note we can only return these detailed results on async requests,
2893 * as sync requests look the same as i/o requests for locking. */
2894 if (result == -ECANCELED)
2895 result = LLA_RESULT_DIFFERENT;
2896 else if (result == -EEXIST)
2897 result = LLA_RESULT_SAME;
2902 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2904 static int ll_ladvise_sanity(struct inode *inode,
2905 struct llapi_lu_ladvise *ladvise)
2907 enum lu_ladvise_type advice = ladvise->lla_advice;
/* Note the per-advice flags field is 32 bits, so per-advice flags
 * must be in the first 32 bits of enum ladvise_flags */
2910 __u32 flags = ladvise->lla_peradvice_flags;
2911 /* 3 lines at 80 characters per line, should be plenty */
2914 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized, "
       "last supported advice is %s (value '%d'): rc = %d\n",
2918 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2919 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2923 /* Per-advice checks */
2925 case LU_LADVISE_LOCKNOEXPAND:
2926 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2928 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2930 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2931 ladvise_names[advice], rc);
2935 case LU_LADVISE_LOCKAHEAD:
2936 /* Currently only READ and WRITE modes can be requested */
2937 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2938 ladvise->lla_lockahead_mode == 0) {
2940 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2942 ll_get_fsname(inode->i_sb, NULL, 0),
2943 ladvise->lla_lockahead_mode,
2944 ladvise_names[advice], rc);
2947 case LU_LADVISE_WILLREAD:
2948 case LU_LADVISE_DONTNEED:
2950 /* Note fall through above - These checks apply to all advices
2951 * except LOCKNOEXPAND */
2952 if (flags & ~LF_DEFAULT_MASK) {
2954 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2956 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2957 ladvise_names[advice], rc);
2960 if (ladvise->lla_start >= ladvise->lla_end) {
2962 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2963 "for %s: rc = %d\n",
2964 ll_get_fsname(inode->i_sb, NULL, 0),
2965 ladvise->lla_start, ladvise->lla_end,
2966 ladvise_names[advice], rc);
 * Give file access advice
 *
 * The ladvise interface is similar to the Linux fadvise() system call,
 * except it forwards the advice directly from the Lustre client to the
 * server. The server-side code will apply the appropriate read-ahead
 * and caching techniques for the corresponding files.
 *
 * A typical workload for ladvise is, e.g., a bunch of different clients
 * doing small random reads of a file, so prefetching pages into OSS
 * cache with big linear reads before the random IO is a net benefit.
 * Fetching all that data into each client cache with fadvise() may not
 * be, due to much more data being sent to the client.
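 *
 * A minimal userspace sketch of the matching LL_IOC_LADVISE ioctl,
 * with the header fields as parsed in ll_file_ioctl() below (length
 * is the assumed byte range to prefetch; error handling omitted):
 *
 *	struct llapi_ladvise_hdr *hdr;
 *
 *	hdr = calloc(1, offsetof(typeof(*hdr), lah_advise[1]));
 *	hdr->lah_magic = LADVISE_MAGIC;
 *	hdr->lah_count = 1;
 *	hdr->lah_advise[0].lla_advice = LU_LADVISE_WILLREAD;
 *	hdr->lah_advise[0].lla_start = 0;
 *	hdr->lah_advise[0].lla_end = length;
 *	rc = ioctl(fd, LL_IOC_LADVISE, hdr);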
2991 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2992 struct llapi_lu_ladvise *ladvise)
2996 struct cl_ladvise_io *lio;
3001 env = cl_env_get(&refcheck);
3003 RETURN(PTR_ERR(env));
3005 io = vvp_env_thread_io(env);
3006 io->ci_obj = ll_i2info(inode)->lli_clob;
3008 /* initialize parameters for ladvise */
3009 lio = &io->u.ci_ladvise;
3010 lio->li_start = ladvise->lla_start;
3011 lio->li_end = ladvise->lla_end;
3012 lio->li_fid = ll_inode2fid(inode);
3013 lio->li_advice = ladvise->lla_advice;
3014 lio->li_flags = flags;
3016 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3017 rc = cl_io_loop(env, io);
3021 cl_io_fini(env, io);
3022 cl_env_put(env, &refcheck);
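/*
 * Enable or disable DLM lock expansion for subsequent IO on this file
 * descriptor (LU_LADVISE_LOCKNOEXPAND); the LF_UNSET flag clears the
 * setting.
 */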
3026 static int ll_lock_noexpand(struct file *file, int flags)
3028 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3030 fd->ll_lock_no_expand = !(flags & LF_UNSET);
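/*
 * Handle LL_IOC_FSGETXATTR: report the inode flags as FS_XFLAG_*
 * values together with the project quota id, mirroring the generic
 * FS_IOC_FSGETXATTR interface.
 */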
3035 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3038 struct fsxattr fsxattr;
3040 if (copy_from_user(&fsxattr,
3041 (const struct fsxattr __user *)arg,
3045 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3046 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3047 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3048 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3049 if (copy_to_user((struct fsxattr __user *)arg,
3050 &fsxattr, sizeof(fsxattr)))
3056 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3059 * Project Quota ID state is only allowed to change from within the init
3060 * namespace. Enforce that restriction only if we are trying to change
3061 * the quota ID state. Everything else is allowed in user namespaces.
3063 if (current_user_ns() == &init_user_ns)
3066 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3069 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3070 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3073 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3080 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3084 struct md_op_data *op_data;
3085 struct ptlrpc_request *req = NULL;
3087 struct fsxattr fsxattr;
3088 struct cl_object *obj;
3092 if (copy_from_user(&fsxattr,
3093 (const struct fsxattr __user *)arg,
3097 rc = ll_ioctl_check_project(inode, &fsxattr);
3101 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3102 LUSTRE_OPC_ANY, NULL);
3103 if (IS_ERR(op_data))
3104 RETURN(PTR_ERR(op_data));
3106 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3107 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3108 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3109 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3110 op_data->op_projid = fsxattr.fsx_projid;
3111 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3112 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3114 ptlrpc_req_finished(req);
3116 GOTO(out_fsxattr, rc);
3117 ll_update_inode_flags(inode, op_data->op_attr_flags);
3118 obj = ll_i2info(inode)->lli_clob;
3120 GOTO(out_fsxattr, rc);
3122 OBD_ALLOC_PTR(attr);
3124 GOTO(out_fsxattr, rc = -ENOMEM);
3126 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3127 fsxattr.fsx_xflags);
3130 ll_finish_md_op_data(op_data);
3134 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3137 struct inode *inode = file_inode(file);
3138 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3139 struct ll_inode_info *lli = ll_i2info(inode);
3140 struct obd_client_handle *och = NULL;
3141 struct split_param sp;
3144 enum mds_op_bias bias = 0;
3145 struct file *layout_file = NULL;
3147 size_t data_size = 0;
3151 mutex_lock(&lli->lli_och_mutex);
3152 if (fd->fd_lease_och != NULL) {
3153 och = fd->fd_lease_och;
3154 fd->fd_lease_och = NULL;
3156 mutex_unlock(&lli->lli_och_mutex);
3159 GOTO(out, rc = -ENOLCK);
3161 fmode = och->och_flags;
3163 switch (ioc->lil_flags) {
3164 case LL_LEASE_RESYNC_DONE:
3165 if (ioc->lil_count > IOC_IDS_MAX)
3166 GOTO(out, rc = -EINVAL);
3168 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3169 OBD_ALLOC(data, data_size);
3171 GOTO(out, rc = -ENOMEM);
3173 if (copy_from_user(data, (void __user *)arg, data_size))
3174 GOTO(out, rc = -EFAULT);
3176 bias = MDS_CLOSE_RESYNC_DONE;
3178 case LL_LEASE_LAYOUT_MERGE: {
3181 if (ioc->lil_count != 1)
3182 GOTO(out, rc = -EINVAL);
3184 arg += sizeof(*ioc);
3185 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3186 GOTO(out, rc = -EFAULT);
3188 layout_file = fget(fd);
3190 GOTO(out, rc = -EBADF);
3192 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3193 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3194 GOTO(out, rc = -EPERM);
3196 data = file_inode(layout_file);
3197 bias = MDS_CLOSE_LAYOUT_MERGE;
3200 case LL_LEASE_LAYOUT_SPLIT: {
3204 if (ioc->lil_count != 2)
3205 GOTO(out, rc = -EINVAL);
3207 arg += sizeof(*ioc);
3208 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3209 GOTO(out, rc = -EFAULT);
3211 arg += sizeof(__u32);
3212 if (copy_from_user(&mirror_id, (void __user *)arg,
3214 GOTO(out, rc = -EFAULT);
3216 layout_file = fget(fdv);
3218 GOTO(out, rc = -EBADF);
3220 sp.sp_inode = file_inode(layout_file);
3221 sp.sp_mirror_id = (__u16)mirror_id;
3223 bias = MDS_CLOSE_LAYOUT_SPLIT;
3227 /* without close intent */
3231 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3235 rc = ll_lease_och_release(inode, file);
3244 switch (ioc->lil_flags) {
3245 case LL_LEASE_RESYNC_DONE:
3247 OBD_FREE(data, data_size);
3249 case LL_LEASE_LAYOUT_MERGE:
3250 case LL_LEASE_LAYOUT_SPLIT:
3257 rc = ll_lease_type_from_fmode(fmode);
3261 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3264 struct inode *inode = file_inode(file);
3265 struct ll_inode_info *lli = ll_i2info(inode);
3266 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3267 struct obd_client_handle *och = NULL;
3268 __u64 open_flags = 0;
3274 switch (ioc->lil_mode) {
3275 case LL_LEASE_WRLCK:
3276 if (!(file->f_mode & FMODE_WRITE))
3278 fmode = FMODE_WRITE;
3280 case LL_LEASE_RDLCK:
3281 if (!(file->f_mode & FMODE_READ))
3285 case LL_LEASE_UNLCK:
3286 RETURN(ll_file_unlock_lease(file, ioc, arg));
3291 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3293 /* apply for lease */
3294 if (ioc->lil_flags & LL_LEASE_RESYNC)
3295 open_flags = MDS_OPEN_RESYNC;
3296 och = ll_lease_open(inode, file, fmode, open_flags);
3298 RETURN(PTR_ERR(och));
3300 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3301 rc = ll_lease_file_resync(och, inode, arg);
3303 ll_lease_close(och, inode, NULL);
3306 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3308 ll_lease_close(och, inode, NULL);
3314 mutex_lock(&lli->lli_och_mutex);
3315 if (fd->fd_lease_och == NULL) {
3316 fd->fd_lease_och = och;
3319 mutex_unlock(&lli->lli_och_mutex);
/* impossible for now, since only exclusive leases are supported */
3322 ll_lease_close(och, inode, &lease_broken);
3329 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3331 struct inode *inode = file_inode(file);
3332 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3336 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3337 PFID(ll_inode2fid(inode)), inode, cmd);
3338 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
/* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3341 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3345 case LL_IOC_GETFLAGS:
3346 /* Get the current value of the file flags */
3347 return put_user(fd->fd_flags, (int __user *)arg);
3348 case LL_IOC_SETFLAGS:
3349 case LL_IOC_CLRFLAGS:
3350 /* Set or clear specific file flags */
3351 /* XXX This probably needs checks to ensure the flags are
3352 * not abused, and to handle any flag side effects.
3354 if (get_user(flags, (int __user *) arg))
3357 if (cmd == LL_IOC_SETFLAGS) {
3358 if ((flags & LL_FILE_IGNORE_LOCK) &&
3359 !(file->f_flags & O_DIRECT)) {
3360 CERROR("%s: unable to disable locking on "
3361 "non-O_DIRECT file\n", current->comm);
3365 fd->fd_flags |= flags;
3367 fd->fd_flags &= ~flags;
3370 case LL_IOC_LOV_SETSTRIPE:
3371 case LL_IOC_LOV_SETSTRIPE_NEW:
3372 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3373 case LL_IOC_LOV_SETEA:
3374 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3375 case LL_IOC_LOV_SWAP_LAYOUTS: {
3377 struct lustre_swap_layouts lsl;
3379 if (copy_from_user(&lsl, (char __user *)arg,
3380 sizeof(struct lustre_swap_layouts)))
3383 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3386 file2 = fget(lsl.sl_fd);
3390 /* O_WRONLY or O_RDWR */
3391 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3392 GOTO(out, rc = -EPERM);
3394 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3395 struct inode *inode2;
3396 struct ll_inode_info *lli;
3397 struct obd_client_handle *och = NULL;
3399 lli = ll_i2info(inode);
3400 mutex_lock(&lli->lli_och_mutex);
3401 if (fd->fd_lease_och != NULL) {
3402 och = fd->fd_lease_och;
3403 fd->fd_lease_och = NULL;
3405 mutex_unlock(&lli->lli_och_mutex);
3407 GOTO(out, rc = -ENOLCK);
3408 inode2 = file_inode(file2);
3409 rc = ll_swap_layouts_close(och, inode, inode2);
3411 rc = ll_swap_layouts(file, file2, &lsl);
3417 case LL_IOC_LOV_GETSTRIPE:
3418 case LL_IOC_LOV_GETSTRIPE_NEW:
3419 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3420 case FS_IOC_GETFLAGS:
3421 case FS_IOC_SETFLAGS:
3422 RETURN(ll_iocontrol(inode, file, cmd, arg));
3423 case FSFILT_IOC_GETVERSION:
3424 case FS_IOC_GETVERSION:
3425 RETURN(put_user(inode->i_generation, (int __user *)arg));
3426 /* We need to special case any other ioctls we want to handle,
3427 * to send them to the MDS/OST as appropriate and to properly
3428 * network encode the arg field. */
3429 case FS_IOC_SETVERSION:
3432 case LL_IOC_GROUP_LOCK:
3433 RETURN(ll_get_grouplock(inode, file, arg));
3434 case LL_IOC_GROUP_UNLOCK:
3435 RETURN(ll_put_grouplock(inode, file, arg));
3436 case IOC_OBD_STATFS:
3437 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3439 case LL_IOC_FLUSHCTX:
3440 RETURN(ll_flush_ctx(inode));
3441 case LL_IOC_PATH2FID: {
3442 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3443 sizeof(struct lu_fid)))
3448 case LL_IOC_GETPARENT:
3449 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3451 case OBD_IOC_FID2PATH:
3452 RETURN(ll_fid2path(inode, (void __user *)arg));
3453 case LL_IOC_DATA_VERSION: {
3454 struct ioc_data_version idv;
3457 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3460 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3461 rc = ll_ioc_data_version(inode, &idv);
3464 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3470 case LL_IOC_GET_MDTIDX: {
3473 mdtidx = ll_get_mdt_idx(inode);
3477 if (put_user((int)mdtidx, (int __user *)arg))
3482 case OBD_IOC_GETDTNAME:
3483 case OBD_IOC_GETMDNAME:
3484 RETURN(ll_get_obd_name(inode, cmd, arg));
3485 case LL_IOC_HSM_STATE_GET: {
3486 struct md_op_data *op_data;
3487 struct hsm_user_state *hus;
3494 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3495 LUSTRE_OPC_ANY, hus);
3496 if (IS_ERR(op_data)) {
3498 RETURN(PTR_ERR(op_data));
3501 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3504 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3507 ll_finish_md_op_data(op_data);
3511 case LL_IOC_HSM_STATE_SET: {
3512 struct hsm_state_set *hss;
3519 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3524 rc = ll_hsm_state_set(inode, hss);
3529 case LL_IOC_HSM_ACTION: {
3530 struct md_op_data *op_data;
3531 struct hsm_current_action *hca;
3538 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3539 LUSTRE_OPC_ANY, hca);
3540 if (IS_ERR(op_data)) {
3542 RETURN(PTR_ERR(op_data));
3545 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3548 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3551 ll_finish_md_op_data(op_data);
3555 case LL_IOC_SET_LEASE_OLD: {
3556 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3558 RETURN(ll_file_set_lease(file, &ioc, 0));
3560 case LL_IOC_SET_LEASE: {
3561 struct ll_ioc_lease ioc;
3563 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3566 RETURN(ll_file_set_lease(file, &ioc, arg));
3568 case LL_IOC_GET_LEASE: {
3569 struct ll_inode_info *lli = ll_i2info(inode);
3570 struct ldlm_lock *lock = NULL;
3573 mutex_lock(&lli->lli_och_mutex);
3574 if (fd->fd_lease_och != NULL) {
3575 struct obd_client_handle *och = fd->fd_lease_och;
3577 lock = ldlm_handle2lock(&och->och_lease_handle);
3579 lock_res_and_lock(lock);
3580 if (!ldlm_is_cancel(lock))
3581 fmode = och->och_flags;
3583 unlock_res_and_lock(lock);
3584 LDLM_LOCK_PUT(lock);
3587 mutex_unlock(&lli->lli_och_mutex);
3589 RETURN(ll_lease_type_from_fmode(fmode));
3591 case LL_IOC_HSM_IMPORT: {
3592 struct hsm_user_import *hui;
3598 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3603 rc = ll_hsm_import(inode, file, hui);
3608 case LL_IOC_FUTIMES_3: {
3609 struct ll_futimes_3 lfu;
3611 if (copy_from_user(&lfu,
3612 (const struct ll_futimes_3 __user *)arg,
3616 RETURN(ll_file_futimes_3(file, &lfu));
3618 case LL_IOC_LADVISE: {
3619 struct llapi_ladvise_hdr *k_ladvise_hdr;
3620 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3623 int alloc_size = sizeof(*k_ladvise_hdr);
3626 u_ladvise_hdr = (void __user *)arg;
3627 OBD_ALLOC_PTR(k_ladvise_hdr);
3628 if (k_ladvise_hdr == NULL)
3631 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3632 GOTO(out_ladvise, rc = -EFAULT);
3634 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3635 k_ladvise_hdr->lah_count < 1)
3636 GOTO(out_ladvise, rc = -EINVAL);
3638 num_advise = k_ladvise_hdr->lah_count;
3639 if (num_advise >= LAH_COUNT_MAX)
3640 GOTO(out_ladvise, rc = -EFBIG);
3642 OBD_FREE_PTR(k_ladvise_hdr);
3643 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3644 lah_advise[num_advise]);
3645 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3646 if (k_ladvise_hdr == NULL)
3650 * TODO: submit multiple advices to one server in a single RPC
3652 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3653 GOTO(out_ladvise, rc = -EFAULT);
3655 for (i = 0; i < num_advise; i++) {
3656 struct llapi_lu_ladvise *k_ladvise =
3657 &k_ladvise_hdr->lah_advise[i];
3658 struct llapi_lu_ladvise __user *u_ladvise =
3659 &u_ladvise_hdr->lah_advise[i];
3661 rc = ll_ladvise_sanity(inode, k_ladvise);
3663 GOTO(out_ladvise, rc);
3665 switch (k_ladvise->lla_advice) {
3666 case LU_LADVISE_LOCKNOEXPAND:
3667 rc = ll_lock_noexpand(file,
3668 k_ladvise->lla_peradvice_flags);
3669 GOTO(out_ladvise, rc);
3670 case LU_LADVISE_LOCKAHEAD:
3672 rc = ll_file_lock_ahead(file, k_ladvise);
3675 GOTO(out_ladvise, rc);
3678 &u_ladvise->lla_lockahead_result))
3679 GOTO(out_ladvise, rc = -EFAULT);
3682 rc = ll_ladvise(inode, file,
3683 k_ladvise_hdr->lah_flags,
3686 GOTO(out_ladvise, rc);
3693 OBD_FREE(k_ladvise_hdr, alloc_size);
3696 case LL_IOC_FLR_SET_MIRROR: {
3697 /* mirror I/O must be direct to avoid polluting page cache
3699 if (!(file->f_flags & O_DIRECT))
3702 fd->fd_designated_mirror = (__u32)arg;
3705 case LL_IOC_FSGETXATTR:
3706 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3707 case LL_IOC_FSSETXATTR:
3708 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3710 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3712 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3713 (void __user *)arg));
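/*
 * Compat: a local copy of generic_file_llseek_size() for kernels that
 * do not provide it (no HAVE_FILE_LLSEEK_SIZE).
 */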
3717 #ifndef HAVE_FILE_LLSEEK_SIZE
3718 static inline loff_t
3719 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3721 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3723 if (offset > maxsize)
3726 if (offset != file->f_pos) {
3727 file->f_pos = offset;
3728 file->f_version = 0;
3734 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3735 loff_t maxsize, loff_t eof)
3737 struct inode *inode = file_inode(file);
3745 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3746 * position-querying operation. Avoid rewriting the "same"
3747 * f_pos value back to the file because a concurrent read(),
3748 * write() or lseek() might have altered it
3753 * f_lock protects against read/modify/write race with other
3754 * SEEK_CURs. Note that parallel writes and reads behave
3758 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3759 inode_unlock(inode);
3763 * In the generic case the entire file is data, so as long as
3764 * offset isn't at the end of the file then the offset is data.
3771 * There is a virtual hole at the end of the file, so as long as
3772 * offset isn't i_size or larger, return i_size.
3780 return llseek_execute(file, offset, maxsize);
3784 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3786 struct inode *inode = file_inode(file);
3787 loff_t retval, eof = 0;
3790 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3791 (origin == SEEK_CUR) ? file->f_pos : 0);
3792 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3793 PFID(ll_inode2fid(inode)), inode, retval, retval,
3795 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3797 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3798 retval = ll_glimpse_size(inode);
3801 eof = i_size_read(inode);
3804 retval = ll_generic_file_llseek_size(file, offset, origin,
3805 ll_file_maxbytes(inode), eof);
3809 static int ll_flush(struct file *file, fl_owner_t id)
3811 struct inode *inode = file_inode(file);
3812 struct ll_inode_info *lli = ll_i2info(inode);
3813 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3816 LASSERT(!S_ISDIR(inode->i_mode));
3818 /* catch async errors that were recorded back when async writeback
3819 * failed for pages in this mapping. */
3820 rc = lli->lli_async_rc;
3821 lli->lli_async_rc = 0;
3822 if (lli->lli_clob != NULL) {
3823 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* The application has already been told about the write failure.
 * Do not report it again. */
3830 if (fd->fd_write_failed)
3832 return rc ? -EIO : 0;
 * Called to make sure a portion of the file has been written out.
 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OST.
 *
 * Return how many pages have been written.
3841 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3842 enum cl_fsync_mode mode, int ignore_layout)
3846 struct cl_fsync_io *fio;
3851 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3852 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3855 env = cl_env_get(&refcheck);
3857 RETURN(PTR_ERR(env));
3859 io = vvp_env_thread_io(env);
3860 io->ci_obj = ll_i2info(inode)->lli_clob;
3861 io->ci_ignore_layout = ignore_layout;
3863 /* initialize parameters for sync */
3864 fio = &io->u.ci_fsync;
3865 fio->fi_start = start;
3867 fio->fi_fid = ll_inode2fid(inode);
3868 fio->fi_mode = mode;
3869 fio->fi_nr_written = 0;
3871 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3872 result = cl_io_loop(env, io);
3874 result = io->ci_result;
3876 result = fio->fi_nr_written;
3877 cl_io_fini(env, io);
3878 cl_env_put(env, &refcheck);
3884 * When dentry is provided (the 'else' case), file_dentry() may be
3885 * null and dentry must be used directly rather than pulled from
3886 * file_dentry() as is done otherwise.
3889 #ifdef HAVE_FILE_FSYNC_4ARGS
3890 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3892 struct dentry *dentry = file_dentry(file);
3894 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3895 int ll_fsync(struct file *file, int datasync)
3897 struct dentry *dentry = file_dentry(file);
3899 loff_t end = LLONG_MAX;
3901 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3904 loff_t end = LLONG_MAX;
3906 struct inode *inode = dentry->d_inode;
3907 struct ll_inode_info *lli = ll_i2info(inode);
3908 struct ptlrpc_request *req;
3912 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3913 PFID(ll_inode2fid(inode)), inode);
3914 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3916 #ifdef HAVE_FILE_FSYNC_4ARGS
3917 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3918 lock_inode = !lli->lli_inode_locked;
3922 /* fsync's caller has already called _fdata{sync,write}, we want
3923 * that IO to finish before calling the osc and mdc sync methods */
3924 rc = filemap_fdatawait(inode->i_mapping);
3927 /* catch async errors that were recorded back when async writeback
3928 * failed for pages in this mapping. */
3929 if (!S_ISDIR(inode->i_mode)) {
3930 err = lli->lli_async_rc;
3931 lli->lli_async_rc = 0;
3934 if (lli->lli_clob != NULL) {
3935 err = lov_read_and_clear_async_rc(lli->lli_clob);
3941 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3945 ptlrpc_req_finished(req);
3947 if (S_ISREG(inode->i_mode)) {
3948 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3950 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3951 if (rc == 0 && err < 0)
3954 fd->fd_write_failed = true;
3956 fd->fd_write_failed = false;
3959 #ifdef HAVE_FILE_FSYNC_4ARGS
3961 inode_unlock(inode);
3967 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3969 struct inode *inode = file_inode(file);
3970 struct ll_sb_info *sbi = ll_i2sbi(inode);
3971 struct ldlm_enqueue_info einfo = {
3972 .ei_type = LDLM_FLOCK,
3973 .ei_cb_cp = ldlm_flock_completion_ast,
3974 .ei_cbdata = file_lock,
3976 struct md_op_data *op_data;
3977 struct lustre_handle lockh = { 0 };
3978 union ldlm_policy_data flock = { { 0 } };
3979 int fl_type = file_lock->fl_type;
3985 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3986 PFID(ll_inode2fid(inode)), file_lock);
3988 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3990 if (file_lock->fl_flags & FL_FLOCK) {
3991 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3992 /* flocks are whole-file locks */
3993 flock.l_flock.end = OFFSET_MAX;
/* For flocks the owner is determined by the local file descriptor. */
3995 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3996 } else if (file_lock->fl_flags & FL_POSIX) {
3997 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3998 flock.l_flock.start = file_lock->fl_start;
3999 flock.l_flock.end = file_lock->fl_end;
4003 flock.l_flock.pid = file_lock->fl_pid;
/* Somewhat ugly workaround for svc lockd.
 * lockd installs a custom fl_lmops->lm_compare_owner that checks
 * that the fl_owner is the same (which it always is on the local
 * node, I guess, between lockd processes) and then compares pid.
 * As such we assign the pid to the owner field to make it all work;
 * conflict with normal locks is unlikely since pid space and
 * pointer space for current->files do not intersect. */
4012 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4013 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4017 einfo.ei_mode = LCK_PR;
4020 /* An unlock request may or may not have any relation to
4021 * existing locks so we may not be able to pass a lock handle
4022 * via a normal ldlm_lock_cancel() request. The request may even
4023 * unlock a byte range in the middle of an existing lock. In
4024 * order to process an unlock request we need all of the same
4025 * information that is given with a normal read or write record
4026 * lock request. To avoid creating another ldlm unlock (cancel)
4027 * message we'll treat a LCK_NL flock request as an unlock. */
4028 einfo.ei_mode = LCK_NL;
4031 einfo.ei_mode = LCK_PW;
4034 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4049 flags = LDLM_FL_BLOCK_NOWAIT;
4055 flags = LDLM_FL_TEST_LOCK;
4058 CERROR("unknown fcntl lock command: %d\n", cmd);
4062 /* Save the old mode so that if the mode in the lock changes we
4063 * can decrement the appropriate reader or writer refcount. */
4064 file_lock->fl_type = einfo.ei_mode;
4066 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4067 LUSTRE_OPC_ANY, NULL);
4068 if (IS_ERR(op_data))
4069 RETURN(PTR_ERR(op_data));
4071 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4072 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4073 flock.l_flock.pid, flags, einfo.ei_mode,
4074 flock.l_flock.start, flock.l_flock.end);
4076 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4079 /* Restore the file lock type if not TEST lock. */
4080 if (!(flags & LDLM_FL_TEST_LOCK))
4081 file_lock->fl_type = fl_type;
4083 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4084 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4085 !(flags & LDLM_FL_TEST_LOCK))
4086 rc2 = locks_lock_file_wait(file, file_lock);
4088 if ((file_lock->fl_flags & FL_FLOCK) &&
4089 (rc == 0 || file_lock->fl_type == F_UNLCK))
4090 rc2 = flock_lock_file_wait(file, file_lock);
4091 if ((file_lock->fl_flags & FL_POSIX) &&
4092 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4093 !(flags & LDLM_FL_TEST_LOCK))
4094 rc2 = posix_lock_file_wait(file, file_lock);
4095 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4097 if (rc2 && file_lock->fl_type != F_UNLCK) {
4098 einfo.ei_mode = LCK_NL;
4099 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4104 ll_finish_md_op_data(op_data);
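/*
 * Look up the FID of @name under @parent with a getattr-by-name RPC
 * to the MDS; if @inode is non-NULL, also instantiate the inode from
 * the reply.
 */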
4109 int ll_get_fid_by_name(struct inode *parent, const char *name,
4110 int namelen, struct lu_fid *fid,
4111 struct inode **inode)
4113 struct md_op_data *op_data = NULL;
4114 struct mdt_body *body;
4115 struct ptlrpc_request *req;
4119 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4120 LUSTRE_OPC_ANY, NULL);
4121 if (IS_ERR(op_data))
4122 RETURN(PTR_ERR(op_data));
4124 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4125 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4126 ll_finish_md_op_data(op_data);
4130 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4132 GOTO(out_req, rc = -EFAULT);
4134 *fid = body->mbo_fid1;
4137 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4139 ptlrpc_req_finished(req);
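/*
 * Migrate @name under @parent to the MDT described by @lum. The
 * migration is driven as a rename onto the target MDT; for regular
 * files a write lease is taken first so the data version can be
 * checked at close time.
 */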
4143 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4146 struct dentry *dchild = NULL;
4147 struct inode *child_inode = NULL;
4148 struct md_op_data *op_data;
4149 struct ptlrpc_request *request = NULL;
4150 struct obd_client_handle *och = NULL;
4152 struct mdt_body *body;
4153 __u64 data_version = 0;
4154 size_t namelen = strlen(name);
4155 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4159 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4160 PFID(ll_inode2fid(parent)), name,
4161 lum->lum_stripe_offset, lum->lum_stripe_count);
4163 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4164 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4165 lustre_swab_lmv_user_md(lum);
4167 /* Get child FID first */
4168 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4171 dchild = d_lookup(file_dentry(file), &qstr);
4173 if (dchild->d_inode)
4174 child_inode = igrab(dchild->d_inode);
4179 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4188 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4189 OBD_CONNECT2_DIR_MIGRATE)) {
4190 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4191 ll_i2info(child_inode)->lli_lsm_md) {
4192 CERROR("%s: MDT doesn't support stripe directory "
4194 ll_get_fsname(parent->i_sb, NULL, 0));
4195 GOTO(out_iput, rc = -EOPNOTSUPP);
4200 * lfs migrate command needs to be blocked on the client
4201 * by checking the migrate FID against the FID of the
4204 if (child_inode == parent->i_sb->s_root->d_inode)
4205 GOTO(out_iput, rc = -EINVAL);
4207 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4208 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4209 if (IS_ERR(op_data))
4210 GOTO(out_iput, rc = PTR_ERR(op_data));
4212 inode_lock(child_inode);
4213 op_data->op_fid3 = *ll_inode2fid(child_inode);
4214 if (!fid_is_sane(&op_data->op_fid3)) {
4215 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4216 ll_get_fsname(parent->i_sb, NULL, 0), name,
4217 PFID(&op_data->op_fid3));
4218 GOTO(out_unlock, rc = -EINVAL);
4221 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4222 op_data->op_data = lum;
4223 op_data->op_data_size = lumlen;
4226 if (S_ISREG(child_inode->i_mode)) {
4227 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4231 GOTO(out_unlock, rc);
4234 rc = ll_data_version(child_inode, &data_version,
4237 GOTO(out_close, rc);
4239 op_data->op_open_handle = och->och_open_handle;
4240 op_data->op_data_version = data_version;
4241 op_data->op_lease_handle = och->och_lease_handle;
4242 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4244 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4245 och->och_mod->mod_open_req->rq_replay = 0;
4246 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4249 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4250 name, namelen, &request);
4252 LASSERT(request != NULL);
4253 ll_update_times(request, parent);
4255 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4256 LASSERT(body != NULL);
/* If the server does release the layout lock, then we clean up
 * the client och here; otherwise release it in out_close: */
4260 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4261 obd_mod_put(och->och_mod);
4262 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4264 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4270 if (request != NULL) {
4271 ptlrpc_req_finished(request);
4275 /* Try again if the file layout has changed. */
4276 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4281 ll_lease_close(och, child_inode, NULL);
4283 clear_nlink(child_inode);
4285 inode_unlock(child_inode);
4286 ll_finish_md_op_data(op_data);
4293 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
 * Test whether some locks matching @bits and @l_req_mode are acquired:
 * - the bits can be spread across different locks
 * - if found, clear the common lock bits in *bits
 * - the bits not found are kept in *bits
 * \param bits		[IN] searched lock bits
 * \param l_req_mode	[IN] searched lock mode
 * \retval boolean, true iff all bits are found
4310 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4312 struct lustre_handle lockh;
4313 union ldlm_policy_data policy;
4314 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4315 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4324 fid = &ll_i2info(inode)->lli_fid;
4325 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4326 ldlm_lockname[mode]);
4328 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4329 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4330 policy.l_inodebits.bits = *bits & (1 << i);
4331 if (policy.l_inodebits.bits == 0)
4334 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4335 &policy, mode, &lockh)) {
4336 struct ldlm_lock *lock;
4338 lock = ldlm_handle2lock(&lockh);
4341 ~(lock->l_policy_data.l_inodebits.bits);
4342 LDLM_LOCK_PUT(lock);
4344 *bits &= ~policy.l_inodebits.bits;
4351 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4352 struct lustre_handle *lockh, __u64 flags,
4353 enum ldlm_mode mode)
4355 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4360 fid = &ll_i2info(inode)->lli_fid;
4361 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4363 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4364 fid, LDLM_IBITS, &policy, mode, lockh);
4369 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4371 /* Already unlinked. Just update nlink and return success */
4372 if (rc == -ENOENT) {
/* If it is a striped directory and there is a bad stripe,
 * let's revalidate the dentry again, instead of returning
4377 if (S_ISDIR(inode->i_mode) &&
4378 ll_i2info(inode)->lli_lsm_md != NULL)
/* This path cannot be hit for regular files unless in
 * the case of obscure races, so there is no need to validate
4384 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4386 } else if (rc != 0) {
4387 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4388 "%s: revalidate FID "DFID" error: rc = %d\n",
4389 ll_get_fsname(inode->i_sb, NULL, 0),
4390 PFID(ll_inode2fid(inode)), rc);
4396 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4398 struct inode *inode = dentry->d_inode;
4399 struct obd_export *exp = ll_i2mdexp(inode);
4400 struct lookup_intent oit = {
4403 struct ptlrpc_request *req = NULL;
4404 struct md_op_data *op_data;
4408 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4409 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4411 /* Call getattr by fid, so do not provide name at all. */
4412 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4413 LUSTRE_OPC_ANY, NULL);
4414 if (IS_ERR(op_data))
4415 RETURN(PTR_ERR(op_data));
4417 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4418 ll_finish_md_op_data(op_data);
4420 rc = ll_inode_revalidate_fini(inode, rc);
4424 rc = ll_revalidate_it_finish(req, &oit, dentry);
4426 ll_intent_release(&oit);
4430 /* Unlinked? Unhash dentry, so it is not picked up later by
4431 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4432 * here to preserve get_cwd functionality on 2.6.
4434 if (!dentry->d_inode->i_nlink) {
4435 ll_lock_dcache(inode);
4436 d_lustre_invalidate(dentry, 0);
4437 ll_unlock_dcache(inode);
4440 ll_lookup_finish_locks(&oit, dentry);
4442 ptlrpc_req_finished(req);
4447 static int ll_merge_md_attr(struct inode *inode)
4449 struct ll_inode_info *lli = ll_i2info(inode);
4450 struct cl_attr attr = { 0 };
4453 LASSERT(lli->lli_lsm_md != NULL);
4454 down_read(&lli->lli_lsm_sem);
4455 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4456 &attr, ll_md_blocking_ast);
4457 up_read(&lli->lli_lsm_sem);
4461 set_nlink(inode, attr.cat_nlink);
4462 inode->i_blocks = attr.cat_blocks;
4463 i_size_write(inode, attr.cat_size);
4465 ll_i2info(inode)->lli_atime = attr.cat_atime;
4466 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4467 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4472 static inline dev_t ll_compat_encode_dev(dev_t dev)
4474 /* The compat_sys_*stat*() syscalls will fail unless the
4475 * device majors and minors are both less than 256. Note that
4476 * the value returned here will be passed through
4477 * old_encode_dev() in cp_compat_stat(). And so we are not
4478 * trying to return a valid compat (u16) device number, just
4479 * one that will pass the old_valid_dev() check. */
4481 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4484 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4485 int ll_getattr(const struct path *path, struct kstat *stat,
4486 u32 request_mask, unsigned int flags)
4488 struct dentry *de = path->dentry;
4490 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4493 struct inode *inode = de->d_inode;
4494 struct ll_sb_info *sbi = ll_i2sbi(inode);
4495 struct ll_inode_info *lli = ll_i2info(inode);
4498 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4500 rc = ll_inode_revalidate(de, IT_GETATTR);
4504 if (S_ISREG(inode->i_mode)) {
4505 /* In case of restore, the MDT has the right size and has
 * already sent it back without granting the layout lock;
4507 * inode is up-to-date so glimpse is useless.
4508 * Also to glimpse we need the layout, in case of a running
4509 * restore the MDT holds the layout lock so the glimpse will
4510 * block up to the end of restore (getattr will block)
4512 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4513 rc = ll_glimpse_size(inode);
/* If the object isn't a regular file then don't validate its size. */
4519 if (S_ISDIR(inode->i_mode) &&
4520 lli->lli_lsm_md != NULL) {
4521 rc = ll_merge_md_attr(inode);
4526 LTIME_S(inode->i_atime) = lli->lli_atime;
4527 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4528 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4531 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4533 if (ll_need_32bit_api(sbi)) {
4534 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4535 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4536 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4538 stat->ino = inode->i_ino;
4539 stat->dev = inode->i_sb->s_dev;
4540 stat->rdev = inode->i_rdev;
4543 stat->mode = inode->i_mode;
4544 stat->uid = inode->i_uid;
4545 stat->gid = inode->i_gid;
4546 stat->atime = inode->i_atime;
4547 stat->mtime = inode->i_mtime;
4548 stat->ctime = inode->i_ctime;
4549 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4551 stat->nlink = inode->i_nlink;
4552 stat->size = i_size_read(inode);
4553 stat->blocks = inode->i_blocks;
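/*
 * VFS ->fiemap() handler: marshal the kernel fiemap_extent_info into
 * a Lustre struct fiemap, hand it to ll_do_fiemap(), and copy the
 * mapped extents back to the user buffer.
 */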
4558 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4559 __u64 start, __u64 len)
4563 struct fiemap *fiemap;
4564 unsigned int extent_count = fieinfo->fi_extents_max;
4566 num_bytes = sizeof(*fiemap) + (extent_count *
4567 sizeof(struct fiemap_extent));
4568 OBD_ALLOC_LARGE(fiemap, num_bytes);
4573 fiemap->fm_flags = fieinfo->fi_flags;
4574 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4575 fiemap->fm_start = start;
4576 fiemap->fm_length = len;
4577 if (extent_count > 0 &&
4578 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4579 sizeof(struct fiemap_extent)) != 0)
4580 GOTO(out, rc = -EFAULT);
4582 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4584 fieinfo->fi_flags = fiemap->fm_flags;
4585 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4586 if (extent_count > 0 &&
4587 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4588 fiemap->fm_mapped_extents *
4589 sizeof(struct fiemap_extent)) != 0)
4590 GOTO(out, rc = -EFAULT);
4592 OBD_FREE_LARGE(fiemap, num_bytes);
4596 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4598 struct ll_inode_info *lli = ll_i2info(inode);
4599 struct posix_acl *acl = NULL;
4602 spin_lock(&lli->lli_lock);
4603 /* VFS' acl_permission_check->check_acl will release the refcount */
4604 acl = posix_acl_dup(lli->lli_posix_acl);
4605 spin_unlock(&lli->lli_lock);
4610 #ifdef HAVE_IOP_SET_ACL
4611 #ifdef CONFIG_FS_POSIX_ACL
4612 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4614 struct ll_sb_info *sbi = ll_i2sbi(inode);
4615 struct ptlrpc_request *req = NULL;
4616 const char *name = NULL;
4618 size_t value_size = 0;
4623 case ACL_TYPE_ACCESS:
4624 name = XATTR_NAME_POSIX_ACL_ACCESS;
4626 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4629 case ACL_TYPE_DEFAULT:
4630 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4631 if (!S_ISDIR(inode->i_mode))
4632 rc = acl ? -EACCES : 0;
4643 value_size = posix_acl_xattr_size(acl->a_count);
4644 value = kmalloc(value_size, GFP_NOFS);
4646 GOTO(out, rc = -ENOMEM);
4648 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4650 GOTO(out_value, rc);
4653 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4654 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4655 name, value, value_size, 0, 0, &req);
4657 ptlrpc_req_finished(req);
4662 forget_cached_acl(inode, type);
4664 set_cached_acl(inode, type, acl);
4667 #endif /* CONFIG_FS_POSIX_ACL */
4668 #endif /* HAVE_IOP_SET_ACL */
4670 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4672 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4673 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4675 ll_check_acl(struct inode *inode, int mask)
4678 # ifdef CONFIG_FS_POSIX_ACL
4679 struct posix_acl *acl;
4683 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4684 if (flags & IPERM_FLAG_RCU)
4687 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4692 rc = posix_acl_permission(inode, acl, mask);
4693 posix_acl_release(acl);
4696 # else /* !CONFIG_FS_POSIX_ACL */
4698 # endif /* CONFIG_FS_POSIX_ACL */
4700 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
#ifdef HAVE_GENERIC_PERMISSION_4ARGS
int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
#else
# ifdef HAVE_INODE_PERMISION_2ARGS
int ll_inode_permission(struct inode *inode, int mask)
# else
int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
# endif
#endif
{
        struct ll_sb_info *sbi;
        struct root_squash_info *squash;
        struct cred *cred = NULL;
        const struct cred *old_cred = NULL;
        cfs_cap_t cap;
        bool squash_id = false;
        int rc = 0;
        ENTRY;

#ifdef MAY_NOT_BLOCK
        if (mask & MAY_NOT_BLOCK)
                return -ECHILD;
#elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
        if (flags & IPERM_FLAG_RCU)
                return -ECHILD;
#endif

        /* the root inode is not validated by the lookup operation, so
         * revalidate it here before the permission check. */
        if (inode == inode->i_sb->s_root->d_inode) {
                rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
                if (rc)
                        RETURN(rc);
        }

        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
               PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);

        /* squash fsuid/fsgid if needed */
        sbi = ll_i2sbi(inode);
        squash = &sbi->ll_squash;
        if (unlikely(squash->rsi_uid != 0 &&
                     uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
                     !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
                squash_id = true;
        }
        if (squash_id) {
                CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
                       __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
                       squash->rsi_uid, squash->rsi_gid);

                /* update the current process's credentials
                 * and FS capabilities */
                cred = prepare_creds();
                if (cred == NULL)
                        RETURN(-ENOMEM);

                cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
                cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
                for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
                        if ((1 << cap) & CFS_CAP_FS_MASK)
                                cap_lower(cred->cap_effective, cap);
                }
                old_cred = override_creds(cred);
        }

        ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
        rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
        /* restore the current process's credentials and FS capabilities */
        if (squash_id) {
                revert_creds(old_cred);
                put_cred(cred);
        }

        RETURN(rc);
}
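/*
 * The three file_operations variants below back the three flock-related
 * mount options: "-o flock" enables cluster-wide coherent flock locks
 * (ll_file_operations_flock), "-o localflock" provides locks that are only
 * consistent on the local client (ll_file_operations), and "-o noflock"
 * fails flock calls with -ENOSYS (ll_file_operations_noflock).  As an
 * illustrative (not prescriptive) example, a client wanting coherent locks
 * would mount with something like:
 *
 *        mount -t lustre -o flock mgs@tcp:/fsname /mnt/lustre
 *
 * where "mgs@tcp" and "fsname" are placeholders for a real MGS NID and
 * filesystem name.
 */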
/* -o localflock - only provides locally consistent flock locks */
struct file_operations ll_file_operations = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
        .read           = new_sync_read,
        .write          = new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
        .read_iter      = ll_file_read_iter,
        .write_iter     = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .read           = ll_file_read,
        .aio_read       = ll_file_aio_read,
        .write          = ll_file_write,
        .aio_write      = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .unlocked_ioctl = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
        .splice_read    = ll_file_splice_read,
        .fsync          = ll_fsync,
        .flush          = ll_flush
};
struct file_operations ll_file_operations_flock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
        .read           = new_sync_read,
        .write          = new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
        .read_iter      = ll_file_read_iter,
        .write_iter     = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .read           = ll_file_read,
        .aio_read       = ll_file_aio_read,
        .write          = ll_file_write,
        .aio_write      = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .unlocked_ioctl = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
        .splice_read    = ll_file_splice_read,
        .fsync          = ll_fsync,
        .flush          = ll_flush,
        .flock          = ll_file_flock,
        .lock           = ll_file_flock
};
/* These are for -o noflock - to return ENOSYS on flock calls */
struct file_operations ll_file_operations_noflock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
        .read           = new_sync_read,
        .write          = new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
        .read_iter      = ll_file_read_iter,
        .write_iter     = ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .read           = ll_file_read,
        .aio_read       = ll_file_aio_read,
        .write          = ll_file_write,
        .aio_write      = ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
        .unlocked_ioctl = ll_file_ioctl,
        .open           = ll_file_open,
        .release        = ll_file_release,
        .mmap           = ll_file_mmap,
        .llseek         = ll_file_seek,
        .splice_read    = ll_file_splice_read,
        .fsync          = ll_fsync,
        .flush          = ll_flush,
        .flock          = ll_file_noflock,
        .lock           = ll_file_noflock
};
struct inode_operations ll_file_inode_operations = {
        .setattr        = ll_setattr,
        .getattr        = ll_getattr,
        .permission     = ll_inode_permission,
#ifdef HAVE_IOP_XATTR
        .setxattr       = ll_setxattr,
        .getxattr       = ll_getxattr,
        .removexattr    = ll_removexattr,
#endif
        .listxattr      = ll_listxattr,
        .fiemap         = ll_fiemap,
#ifdef HAVE_IOP_GET_ACL
        .get_acl        = ll_get_acl,
#endif
#ifdef HAVE_IOP_SET_ACL
        .set_acl        = ll_set_acl,
#endif
};
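/**
 * Apply a layout configuration to the cl_object behind @inode. For
 * OBJECT_CONF_SET this also records the new layout generation on the
 * inode, and only allows the layout lock to match once the layout has
 * been applied.
 */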
int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct cl_object *obj = lli->lli_clob;
        struct lu_env *env;
        int rc;
        __u16 refcheck;
        ENTRY;

        if (obj == NULL)
                RETURN(0);

        env = cl_env_get(&refcheck);
        if (IS_ERR(env))
                RETURN(PTR_ERR(env));

        rc = cl_conf_set(env, lli->lli_clob, conf);
        if (rc < 0)
                GOTO(out, rc);

        if (conf->coc_opc == OBJECT_CONF_SET) {
                struct ldlm_lock *lock = conf->coc_lock;
                struct cl_layout cl = { .cl_layout_gen = 0 };

                LASSERT(lock != NULL);
                LASSERT(ldlm_has_layout(lock));
                /* the lock can only be allowed to match after the layout is
                 * applied to the inode; otherwise a stale layout would be
                 * seen. Applying the layout should happen before dropping
                 * the intent lock. */
                ldlm_lock_allow_match(lock);

                rc = cl_object_layout_get(env, obj, &cl);
                if (rc < 0)
                        GOTO(out, rc);
                CDEBUG(D_VFSTRACE,
                       DFID": layout version change: %u -> %u\n",
                       PFID(&lli->lli_fid), ll_layout_version_get(lli),
                       cl.cl_layout_gen);
                ll_layout_version_set(lli, cl.cl_layout_gen);
        }
out:
        cl_env_put(env, &refcheck);
        RETURN(rc);
}
/* Fetch the layout from the MDT with a getxattr request if it is not ready yet */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
{
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ptlrpc_request *req;
        void *lvbdata;
        void *lmm;
        int lmmsize;
        int rc;
        ENTRY;

        CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
               PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
               lock->l_lvb_data, lock->l_lvb_len);

        if (lock->l_lvb_data != NULL)
                RETURN(0);

        /* if the layout lock was granted right away, the layout is returned
         * within the DLM_LVB of the DLM reply; otherwise, if the lock was
         * ever blocked and then granted via completion AST, we have to fetch
         * the layout here. Note that we can't use the LVB buffer from the
         * completion AST because it may not be large enough. */
        rc = ll_get_default_mdsize(sbi, &lmmsize);
        if (rc < 0)
                RETURN(rc);

        rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
                         XATTR_NAME_LOV, lmmsize, &req);
        if (rc < 0) {
                if (rc == -ENODATA)
                        GOTO(out, rc = 0); /* empty layout */
                else
                        RETURN(rc);
        }

        lmmsize = rc;
        rc = 0;
        if (lmmsize == 0) /* empty layout */
                GOTO(out, rc = 0);

        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
        if (lmm == NULL)
                GOTO(out, rc = -EFAULT);

        OBD_ALLOC_LARGE(lvbdata, lmmsize);
        if (lvbdata == NULL)
                GOTO(out, rc = -ENOMEM);

        memcpy(lvbdata, lmm, lmmsize);
        lock_res_and_lock(lock);
        if (unlikely(lock->l_lvb_data == NULL)) {
                lock->l_lvb_type = LVB_T_LAYOUT;
                lock->l_lvb_data = lvbdata;
                lock->l_lvb_len = lmmsize;
                lvbdata = NULL;
        }
        unlock_res_and_lock(lock);

        /* a racing thread may have installed the LVB first */
        if (lvbdata)
                OBD_FREE_LARGE(lvbdata, lmmsize);
out:
        ptlrpc_req_finished(req);
        RETURN(rc);
}
 * Apply the layout to the inode. The layout lock is held and will be
 * released before return.
 */
static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
                              struct inode *inode)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ldlm_lock *lock;
        struct cl_object_conf conf;
        int rc = 0;
        bool lvb_ready;
        bool wait_layout = false;
        ENTRY;

        LASSERT(lustre_handle_is_used(lockh));

        lock = ldlm_handle2lock(lockh);
        LASSERT(lock != NULL);
        LASSERT(ldlm_has_layout(lock));

        LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
                   PFID(&lli->lli_fid), inode);

        /* in case this is a caching lock, reinstate it with the new inode */
        md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);

        lock_res_and_lock(lock);
        lvb_ready = ldlm_is_lvb_ready(lock);
        unlock_res_and_lock(lock);

        /* checking lvb_ready is racy, but this is okay. The worst case is
         * that multiple processes may configure the file at the same time. */
        if (lvb_ready)
                GOTO(out, rc = 0);

        rc = ll_layout_fetch(inode, lock);
        if (rc < 0)
                GOTO(out, rc);

        /* for a layout lock, lmm is stored in the lock's LVB.
         * lvb_data is immutable while the lock is held, so it's safe to
         * access it without the resource lock.
         *
         * set the layout to the file. This is unlikely to fail, as the old
         * layout has already been eliminated. */
        memset(&conf, 0, sizeof conf);
        conf.coc_opc = OBJECT_CONF_SET;
        conf.coc_inode = inode;
        conf.coc_lock = lock;
        conf.u.coc_layout.lb_buf = lock->l_lvb_data;
        conf.u.coc_layout.lb_len = lock->l_lvb_len;
        rc = ll_layout_conf(inode, &conf);

        /* refreshing the layout failed, need to wait */
        wait_layout = rc == -EBUSY;
        EXIT;
out:
        LDLM_LOCK_PUT(lock);
        ldlm_lock_decref(lockh, mode);

        /* wait for IO to complete if the layout is still being used. */
        if (wait_layout) {
                CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
                       ll_get_fsname(inode->i_sb, NULL, 0),
                       PFID(&lli->lli_fid), inode);

                memset(&conf, 0, sizeof conf);
                conf.coc_opc = OBJECT_CONF_WAIT;
                conf.coc_inode = inode;
                rc = ll_layout_conf(inode, &conf);
                if (rc == 0)
                        rc = -EAGAIN;

                CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
                       ll_get_fsname(inode->i_sb, NULL, 0),
                       PFID(&lli->lli_fid), rc);
        }
        RETURN(rc);
}
 * Issue a layout intent RPC to the MDS.
 * \param inode [in] file inode
 * \param intent [in] layout intent
 *
 * \retval 0 on success
 * \retval < 0 error code
 */
static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct md_op_data *op_data;
        struct lookup_intent it;
        struct ptlrpc_request *req;
        int rc;
        ENTRY;

        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
                                     0, 0, LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        op_data->op_data = intent;
        op_data->op_data_size = sizeof(*intent);

        memset(&it, 0, sizeof(it));
        it.it_op = IT_LAYOUT;
        if (intent->li_opc == LAYOUT_INTENT_WRITE ||
            intent->li_opc == LAYOUT_INTENT_TRUNC)
                it.it_flags = FMODE_WRITE;

        LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
                          ll_get_fsname(inode->i_sb, NULL, 0),
                          PFID(&lli->lli_fid), inode);

        rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
                            &ll_md_blocking_ast, 0);
        if (it.it_request != NULL)
                ptlrpc_req_finished(it.it_request);
        it.it_request = NULL;

        ll_finish_md_op_data(op_data);

        /* set the lock data in case this is a new lock */
        if (rc == 0)
                ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);

        ll_intent_drop_lock(&it);

        RETURN(rc);
}
 * This function checks if there exists a LAYOUT lock on the client side,
 * or enqueues it if it doesn't have one in cache.
 *
 * This function does not hold the layout lock, so the lock may be revoked
 * any time after this function returns. Any operations that depend on the
 * layout should be redone in that case.
 *
 * This function should be called before lov_io_init() to get an up-to-date
 * layout version; the caller should save the version number, and once the IO
 * has finished, call this function again to verify that the layout has not
 * changed in the meantime.
 */
int ll_layout_refresh(struct inode *inode, __u32 *gen)
{
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct lustre_handle lockh;
        struct layout_intent intent = {
                .li_opc = LAYOUT_INTENT_ACCESS,
        };
        enum ldlm_mode mode;
        int rc;
        ENTRY;

        *gen = ll_layout_version_get(lli);
        if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
                RETURN(0);

        /* sanity checks */
        LASSERT(fid_is_sane(ll_inode2fid(inode)));
        LASSERT(S_ISREG(inode->i_mode));

        /* take the layout lock mutex to enqueue the layout lock exclusively. */
        mutex_lock(&lli->lli_layout_mutex);

        while (1) {
                /* the layout lock is usually cached on the local side, so try
                 * to match it before enqueuing a new one. */
                mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
                                       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
                if (mode != 0) { /* hit cached lock */
                        rc = ll_layout_lock_set(&lockh, mode, inode);
                        if (rc == -EAGAIN)
                                continue;
                        break;
                }

                rc = ll_layout_intent(inode, &intent);
                if (rc != 0)
                        break;
        }

        if (rc == 0)
                *gen = ll_layout_version_get(lli);
        mutex_unlock(&lli->lli_layout_mutex);

        RETURN(rc);
}
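/*
 * Illustrative caller pattern (a sketch, not code used elsewhere in this
 * file) for the contract documented above ll_layout_refresh(): fetch the
 * layout generation before starting IO, then verify it afterwards.
 *
 *        __u32 gen, gen2;
 *        int rc;
 *
 *        rc = ll_layout_refresh(inode, &gen);     // before lov_io_init()
 *        // ... submit IO against the current layout ...
 *        rc = ll_layout_refresh(inode, &gen2);    // after the IO finishes
 *        if (gen2 != gen)
 *                // layout changed during IO; redo the operation
 */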
 * Issue a layout intent RPC indicating where in a file an IO is about to
 * write.
 *
 * \param[in] inode     file inode.
 * \param[in] opc       layout intent opcode, e.g. LAYOUT_INTENT_WRITE.
 * \param[in] ext       write range: start offset in the file in bytes where
 *                      the IO is about to write, and exclusive end offset in
 *                      bytes.
 *
 * \retval 0 on success
 * \retval < 0 error code
 */
int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
                           struct lu_extent *ext)
{
        struct layout_intent intent = {
                .li_opc = opc,
                .li_extent.e_start = ext->e_start,
                .li_extent.e_end = ext->e_end,
        };
        int rc;
        ENTRY;

        rc = ll_layout_intent(inode, &intent);

        RETURN(rc);
}
 * Send a layout restore request to the MDT.
 */
int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
{
        struct hsm_user_request *hur;
        int len, rc;
        ENTRY;

        len = sizeof(struct hsm_user_request) +
              sizeof(struct hsm_user_item);
        OBD_ALLOC(hur, len);
        if (hur == NULL)
                RETURN(-ENOMEM);

        hur->hur_request.hr_action = HUA_RESTORE;
        hur->hur_request.hr_archive_id = 0;
        hur->hur_request.hr_flags = 0;
        memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
               sizeof(hur->hur_user_item[0].hui_fid));
        hur->hur_user_item[0].hui_extent.offset = offset;
        hur->hur_user_item[0].hui_extent.length = length;
        hur->hur_request.hr_itemcount = 1;
        rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,
                           len, hur, NULL);

        OBD_FREE(hur, len);
        RETURN(rc);
}