4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
75 fd->fd_write_failed = false;
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
105 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
106 op_data->op_attr_blocks = inode->i_blocks;
107 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
108 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
109 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
110 op_data->op_open_handle = och->och_open_handle;
112 if (och->och_flags & FMODE_WRITE &&
113 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
114 /* For HSM: if inode data has been modified, pack it so that
115 * MDT can set data dirty flag in the archive. */
116 op_data->op_bias |= MDS_DATA_MODIFIED;
122 * Perform a close, possibly with a bias.
123 * The meaning of "data" depends on the value of "bias".
125 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
126 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
129 static int ll_close_inode_openhandle(struct inode *inode,
130 struct obd_client_handle *och,
131 enum mds_op_bias bias, void *data)
133 struct obd_export *md_exp = ll_i2mdexp(inode);
134 const struct ll_inode_info *lli = ll_i2info(inode);
135 struct md_op_data *op_data;
136 struct ptlrpc_request *req = NULL;
140 if (class_exp2obd(md_exp) == NULL) {
141 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
142 ll_get_fsname(inode->i_sb, NULL, 0),
143 PFID(&lli->lli_fid));
147 OBD_ALLOC_PTR(op_data);
148 	/* We leak the openhandle and request here on error, but there is not much
149 	 * to be done in the OOM case since the app won't retry the close on error either. */
151 GOTO(out, rc = -ENOMEM);
153 ll_prepare_close(inode, op_data, och);
155 case MDS_CLOSE_LAYOUT_MERGE:
156 /* merge blocks from the victim inode */
157 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
158 op_data->op_attr.ia_valid |= ATTR_SIZE;
159 op_data->op_xvalid |= OP_XVALID_BLOCKS;
160 case MDS_CLOSE_LAYOUT_SPLIT:
161 case MDS_CLOSE_LAYOUT_SWAP: {
162 struct split_param *sp = data;
164 LASSERT(data != NULL);
165 op_data->op_bias |= bias;
166 op_data->op_data_version = 0;
167 op_data->op_lease_handle = och->och_lease_handle;
168 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
169 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
170 op_data->op_mirror_id = sp->sp_mirror_id;
172 op_data->op_fid2 = *ll_inode2fid(data);
177 case MDS_CLOSE_RESYNC_DONE: {
178 struct ll_ioc_lease *ioc = data;
180 LASSERT(data != NULL);
181 op_data->op_attr_blocks +=
182 ioc->lil_count * op_data->op_attr_blocks;
183 op_data->op_attr.ia_valid |= ATTR_SIZE;
184 op_data->op_xvalid |= OP_XVALID_BLOCKS;
185 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
187 op_data->op_lease_handle = och->och_lease_handle;
188 op_data->op_data = &ioc->lil_ids[0];
189 op_data->op_data_size =
190 ioc->lil_count * sizeof(ioc->lil_ids[0]);
194 case MDS_HSM_RELEASE:
195 LASSERT(data != NULL);
196 op_data->op_bias |= MDS_HSM_RELEASE;
197 op_data->op_data_version = *(__u64 *)data;
198 op_data->op_lease_handle = och->och_lease_handle;
199 op_data->op_attr.ia_valid |= ATTR_SIZE;
200 op_data->op_xvalid |= OP_XVALID_BLOCKS;
204 LASSERT(data == NULL);
208 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
209 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
210 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
211 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
213 rc = md_close(md_exp, op_data, och->och_mod, &req);
214 if (rc != 0 && rc != -EINTR)
215 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
216 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
218 if (rc == 0 && op_data->op_bias & bias) {
219 struct mdt_body *body;
221 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
222 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
226 ll_finish_md_op_data(op_data);
230 md_clear_open_replay_data(md_exp, och);
231 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
234 ptlrpc_req_finished(req); /* This is close request */
238 int ll_md_real_close(struct inode *inode, fmode_t fmode)
240 struct ll_inode_info *lli = ll_i2info(inode);
241 struct obd_client_handle **och_p;
242 struct obd_client_handle *och;
247 if (fmode & FMODE_WRITE) {
248 och_p = &lli->lli_mds_write_och;
249 och_usecount = &lli->lli_open_fd_write_count;
250 } else if (fmode & FMODE_EXEC) {
251 och_p = &lli->lli_mds_exec_och;
252 och_usecount = &lli->lli_open_fd_exec_count;
254 LASSERT(fmode & FMODE_READ);
255 och_p = &lli->lli_mds_read_och;
256 och_usecount = &lli->lli_open_fd_read_count;
259 mutex_lock(&lli->lli_och_mutex);
260 if (*och_usecount > 0) {
261 /* There are still users of this handle, so skip
263 mutex_unlock(&lli->lli_och_mutex);
269 mutex_unlock(&lli->lli_och_mutex);
272 /* There might be a race and this handle may already
274 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
280 static int ll_md_close(struct inode *inode, struct file *file)
282 union ldlm_policy_data policy = {
283 .l_inodebits = { MDS_INODELOCK_OPEN },
285 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
286 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
287 struct ll_inode_info *lli = ll_i2info(inode);
288 struct lustre_handle lockh;
289 enum ldlm_mode lockmode;
293 /* clear group lock, if present */
294 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
295 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
297 if (fd->fd_lease_och != NULL) {
300 /* Usually the lease is not released when the
301 		 * application crashes, so we need to release it here. */
302 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
303 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
304 PFID(&lli->lli_fid), rc, lease_broken);
306 fd->fd_lease_och = NULL;
309 if (fd->fd_och != NULL) {
310 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
315 	/* Let's see if we have a good enough OPEN lock on the file and if
316 	   we can skip talking to the MDS */
317 mutex_lock(&lli->lli_och_mutex);
318 if (fd->fd_omode & FMODE_WRITE) {
320 LASSERT(lli->lli_open_fd_write_count);
321 lli->lli_open_fd_write_count--;
322 } else if (fd->fd_omode & FMODE_EXEC) {
324 LASSERT(lli->lli_open_fd_exec_count);
325 lli->lli_open_fd_exec_count--;
328 LASSERT(lli->lli_open_fd_read_count);
329 lli->lli_open_fd_read_count--;
331 mutex_unlock(&lli->lli_och_mutex);
333 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
334 LDLM_IBITS, &policy, lockmode, &lockh))
335 rc = ll_md_real_close(inode, fd->fd_omode);
338 LUSTRE_FPRIVATE(file) = NULL;
339 ll_file_data_put(fd);
344 /* While this returns an error code, the caller's fput() ignores it, so we need
345 * to make every effort to clean up all of our state here. Also, applications
346 * rarely check close errors and even if an error is returned they will not
347 * re-try the close call.
349 int ll_file_release(struct inode *inode, struct file *file)
351 struct ll_file_data *fd;
352 struct ll_sb_info *sbi = ll_i2sbi(inode);
353 struct ll_inode_info *lli = ll_i2info(inode);
357 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
358 PFID(ll_inode2fid(inode)), inode);
360 if (inode->i_sb->s_root != file_dentry(file))
361 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
362 fd = LUSTRE_FPRIVATE(file);
365 	/* The last ref on @file, maybe not the owner pid of statahead,
366 	 * because parent and child processes can share the same file handle. */
367 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
368 ll_deauthorize_statahead(inode, fd);
370 if (inode->i_sb->s_root == file_dentry(file)) {
371 LUSTRE_FPRIVATE(file) = NULL;
372 ll_file_data_put(fd);
376 if (!S_ISDIR(inode->i_mode)) {
377 if (lli->lli_clob != NULL)
378 lov_read_and_clear_async_rc(lli->lli_clob);
379 lli->lli_async_rc = 0;
382 rc = ll_md_close(inode, file);
384 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
385 libcfs_debug_dumplog();
390 static inline int ll_dom_readpage(void *data, struct page *page)
392 struct niobuf_local *lnb = data;
395 kaddr = ll_kmap_atomic(page, KM_USER0);
396 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
397 if (lnb->lnb_len < PAGE_SIZE)
398 memset(kaddr + lnb->lnb_len, 0,
399 PAGE_SIZE - lnb->lnb_len);
400 flush_dcache_page(page);
401 SetPageUptodate(page);
402 ll_kunmap_atomic(kaddr, KM_USER0);
408 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
409 struct lookup_intent *it)
411 struct ll_inode_info *lli = ll_i2info(inode);
412 struct cl_object *obj = lli->lli_clob;
413 struct address_space *mapping = inode->i_mapping;
415 struct niobuf_remote *rnb;
417 struct lustre_handle lockh;
418 struct ldlm_lock *lock;
419 unsigned long index, start;
420 struct niobuf_local lnb;
421 bool dom_lock = false;
428 if (it->it_lock_mode != 0) {
429 lockh.cookie = it->it_lock_handle;
430 lock = ldlm_handle2lock(&lockh);
432 dom_lock = ldlm_has_dom(lock);
438 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
442 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
443 if (rnb == NULL || rnb->rnb_len == 0)
446 	/* LU-11595: the server may return the whole file (which is always usable) or
447 	 * just the file tail, whose offset must be aligned with the client PAGE_SIZE
448 	 * to be usable on that client; if the server's PAGE_SIZE is smaller, the
449 	 * offset may be unaligned and that data is simply ignored.
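	 *
	 * Worked example (illustrative numbers only, not from this code): with a
	 * 4096-byte client PAGE_SIZE, rnb_offset = 0 and rnb_len = 10000 fill
	 * pages 0 and 1 completely and copy 1808 bytes into page 2;
	 * ll_dom_readpage() then zero-fills the remainder of that last page.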
451 if (rnb->rnb_offset % PAGE_SIZE)
454 	/* The server returns the whole file, or just the file tail if it fills the
455 	 * reply buffer; in both cases the total size should be the inode size.
457 if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
458 CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
459 ll_get_fsname(inode->i_sb, NULL, 0), rnb->rnb_offset,
460 rnb->rnb_len, i_size_read(inode));
464 CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
465 rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));
467 data = (char *)rnb + sizeof(*rnb);
469 lnb.lnb_file_offset = rnb->rnb_offset;
470 start = lnb.lnb_file_offset / PAGE_SIZE;
472 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
473 lnb.lnb_page_offset = 0;
475 lnb.lnb_data = data + (index << PAGE_SHIFT);
476 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
477 if (lnb.lnb_len > PAGE_SIZE)
478 lnb.lnb_len = PAGE_SIZE;
480 vmpage = read_cache_page(mapping, index + start,
481 ll_dom_readpage, &lnb);
482 if (IS_ERR(vmpage)) {
483 CWARN("%s: cannot fill page %lu for "DFID
484 " with data: rc = %li\n",
485 ll_get_fsname(inode->i_sb, NULL, 0),
486 index + start, PFID(lu_object_fid(&obj->co_lu)),
492 } while (rnb->rnb_len > (index << PAGE_SHIFT));
496 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
497 struct lookup_intent *itp)
499 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
500 struct dentry *parent = de->d_parent;
503 struct md_op_data *op_data;
504 struct ptlrpc_request *req = NULL;
508 LASSERT(parent != NULL);
509 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
511 /* if server supports open-by-fid, or file name is invalid, don't pack
512 * name in open request */
513 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
515 len = de->d_name.len;
516 name = kmalloc(len, GFP_NOFS);
520 spin_lock(&de->d_lock);
521 if (len != de->d_name.len) {
522 spin_unlock(&de->d_lock);
526 memcpy(name, de->d_name.name, len);
527 spin_unlock(&de->d_lock);
529 if (!lu_name_is_valid_2(name, len)) {
536 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
537 name, len, 0, LUSTRE_OPC_ANY, NULL);
538 if (IS_ERR(op_data)) {
540 RETURN(PTR_ERR(op_data));
542 op_data->op_data = lmm;
543 op_data->op_data_size = lmmsize;
545 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
546 &ll_md_blocking_ast, 0);
548 ll_finish_md_op_data(op_data);
550 	/* reason for keeping our own exit path - don't flood the log
551 	 * with -ESTALE error messages.
553 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
554 it_open_error(DISP_OPEN_OPEN, itp))
556 ll_release_openhandle(de, itp);
560 if (it_disposition(itp, DISP_LOOKUP_NEG))
561 GOTO(out, rc = -ENOENT);
563 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
564 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
565 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
569 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
571 if (!rc && itp->it_lock_mode) {
572 ll_dom_finish_open(de->d_inode, req, itp);
573 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
577 ptlrpc_req_finished(req);
578 ll_intent_drop_lock(itp);
580 /* We did open by fid, but by the time we got to the server,
581 * the object disappeared. If this is a create, we cannot really
582 * tell the userspace that the file it was trying to create
583 * does not exist. Instead let's return -ESTALE, and the VFS will
584 * retry the create with LOOKUP_REVAL that we are going to catch
585 * in ll_revalidate_dentry() and use lookup then.
587 if (rc == -ENOENT && itp->it_op & IT_CREAT)
593 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
594 struct obd_client_handle *och)
596 struct mdt_body *body;
598 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
599 och->och_open_handle = body->mbo_open_handle;
600 och->och_fid = body->mbo_fid1;
601 och->och_lease_handle.cookie = it->it_lock_handle;
602 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
603 och->och_flags = it->it_flags;
605 return md_set_open_replay_data(md_exp, och, it);
608 static int ll_local_open(struct file *file, struct lookup_intent *it,
609 struct ll_file_data *fd, struct obd_client_handle *och)
611 struct inode *inode = file_inode(file);
614 LASSERT(!LUSTRE_FPRIVATE(file));
621 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
626 LUSTRE_FPRIVATE(file) = fd;
627 ll_readahead_init(inode, &fd->fd_ras);
628 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
630 /* ll_cl_context initialize */
631 rwlock_init(&fd->fd_lock);
632 INIT_LIST_HEAD(&fd->fd_lccs);
637 /* Open a file, and (for the very first open) create objects on the OSTs at
638 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
639 * creation or open until ll_lov_setstripe() ioctl is called.
641 * If we already have the stripe MD locally then we don't request it in
642 * md_open(), by passing a lmm_size = 0.
644 * It is up to the application to ensure no other processes open this file
645 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
646 * used. We might be able to avoid races of that sort by getting lli_open_sem
647 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
648 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
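 *
 * A minimal userspace sketch of the delayed-create path (illustrative only;
 * "path" and "lum" are hypothetical application-side names, assuming the
 * user-space definitions from lustre_user.h):
 *
 *	fd = open(path, O_RDWR | O_CREAT | O_LOV_DELAY_CREATE, 0644);
 *	memset(&lum, 0, sizeof(lum));
 *	lum.lmm_magic = LOV_USER_MAGIC_V1;
 *	lum.lmm_stripe_count = 4;
 *	ioctl(fd, LL_IOC_LOV_SETSTRIPE, &lum);
 *
 * after which the OST objects are created with the requested striping.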
650 int ll_file_open(struct inode *inode, struct file *file)
652 struct ll_inode_info *lli = ll_i2info(inode);
653 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
654 .it_flags = file->f_flags };
655 struct obd_client_handle **och_p = NULL;
656 __u64 *och_usecount = NULL;
657 struct ll_file_data *fd;
661 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
662 PFID(ll_inode2fid(inode)), inode, file->f_flags);
664 it = file->private_data; /* XXX: compat macro */
665 file->private_data = NULL; /* prevent ll_local_open assertion */
667 fd = ll_file_data_get();
669 GOTO(out_nofiledata, rc = -ENOMEM);
672 if (S_ISDIR(inode->i_mode))
673 ll_authorize_statahead(inode, fd);
675 if (inode->i_sb->s_root == file_dentry(file)) {
676 LUSTRE_FPRIVATE(file) = fd;
680 if (!it || !it->it_disposition) {
681 /* Convert f_flags into access mode. We cannot use file->f_mode,
682 * because everything but O_ACCMODE mask was stripped from
684 if ((oit.it_flags + 1) & O_ACCMODE)
686 if (file->f_flags & O_TRUNC)
687 oit.it_flags |= FMODE_WRITE;
689 		/* The kernel only calls f_op->open in dentry_open. filp_open calls
690 		 * dentry_open after a call to open_namei that checks permissions.
691 		 * Only nfsd_open calls dentry_open directly without checking
692 		 * permissions, and because of that the code below is safe.
694 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
695 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
697 /* We do not want O_EXCL here, presumably we opened the file
698 * already? XXX - NFS implications? */
699 oit.it_flags &= ~O_EXCL;
701 		/* bug20584: if "it_flags" contains O_CREAT, the file will be
702 		 * created if necessary, so "IT_CREAT" should be set to stay
703 		 * consistent with it */
704 if (oit.it_flags & O_CREAT)
705 oit.it_op |= IT_CREAT;
711 /* Let's see if we have file open on MDS already. */
712 if (it->it_flags & FMODE_WRITE) {
713 och_p = &lli->lli_mds_write_och;
714 och_usecount = &lli->lli_open_fd_write_count;
715 } else if (it->it_flags & FMODE_EXEC) {
716 och_p = &lli->lli_mds_exec_och;
717 och_usecount = &lli->lli_open_fd_exec_count;
719 och_p = &lli->lli_mds_read_och;
720 och_usecount = &lli->lli_open_fd_read_count;
723 mutex_lock(&lli->lli_och_mutex);
724 if (*och_p) { /* Open handle is present */
725 if (it_disposition(it, DISP_OPEN_OPEN)) {
726 			/* Well, there's an extra open request that we do not need;
727 			   let's close it somehow. This will decref the request. */
728 rc = it_open_error(DISP_OPEN_OPEN, it);
730 mutex_unlock(&lli->lli_och_mutex);
731 GOTO(out_openerr, rc);
734 ll_release_openhandle(file_dentry(file), it);
738 rc = ll_local_open(file, it, fd, NULL);
741 mutex_unlock(&lli->lli_och_mutex);
742 GOTO(out_openerr, rc);
745 LASSERT(*och_usecount == 0);
746 if (!it->it_disposition) {
747 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
748 			/* We cannot just request a lock handle now; the new ELC code
749 			   means that one of the other OPEN locks for this file
750 			   could be cancelled, and since the blocking ast handler
751 			   would attempt to grab och_mutex as well, that would
752 			   result in a deadlock */
753 mutex_unlock(&lli->lli_och_mutex);
755 * Normally called under two situations:
757 * 2. A race/condition on MDS resulting in no open
758 * handle to be returned from LOOKUP|OPEN request,
759 * for example if the target entry was a symlink.
761 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
762 * marked by a bit set in ll_iget_for_nfs. Clear the
763 * bit so that it's not confusing later callers.
765 			 * NB: when ldd is NULL, it must have come via the normal
766 * lookup path only, since ll_iget_for_nfs always calls
769 if (ldd && ldd->lld_nfs_dentry) {
770 ldd->lld_nfs_dentry = 0;
771 it->it_flags |= MDS_OPEN_LOCK;
775 * Always specify MDS_OPEN_BY_FID because we don't want
776 * to get file with different fid.
778 it->it_flags |= MDS_OPEN_BY_FID;
779 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
782 GOTO(out_openerr, rc);
786 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
788 GOTO(out_och_free, rc = -ENOMEM);
792 /* md_intent_lock() didn't get a request ref if there was an
793 * open error, so don't do cleanup on the request here
795 		/* XXX (green): Shouldn't we bail out on any error here, not
796 		 * just an open error? */
797 rc = it_open_error(DISP_OPEN_OPEN, it);
799 GOTO(out_och_free, rc);
801 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
802 "inode %p: disposition %x, status %d\n", inode,
803 it_disposition(it, ~0), it->it_status);
805 rc = ll_local_open(file, it, fd, *och_p);
807 GOTO(out_och_free, rc);
809 mutex_unlock(&lli->lli_och_mutex);
812 	/* Must do this outside the lli_och_mutex lock to prevent a deadlock where
813 	   a different kind of OPEN lock for this same inode gets cancelled
814 	   by ldlm_cancel_lru */
815 if (!S_ISREG(inode->i_mode))
816 GOTO(out_och_free, rc);
818 cl_lov_delay_create_clear(&file->f_flags);
819 GOTO(out_och_free, rc);
823 if (och_p && *och_p) {
824 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
825 *och_p = NULL; /* OBD_FREE writes some magic there */
828 mutex_unlock(&lli->lli_och_mutex);
831 if (lli->lli_opendir_key == fd)
832 ll_deauthorize_statahead(inode, fd);
834 ll_file_data_put(fd);
836 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
840 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
841 ptlrpc_req_finished(it->it_request);
842 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
848 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
849 struct ldlm_lock_desc *desc, void *data, int flag)
852 struct lustre_handle lockh;
856 case LDLM_CB_BLOCKING:
857 ldlm_lock2handle(lock, &lockh);
858 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
860 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
864 case LDLM_CB_CANCELING:
872 * When setting a lease on a file, we take ownership of the lli_mds_*_och
873  * and save it as fd->fd_och so as to force the client to reopen the file even
874 * if it has an open lock in cache already.
876 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
877 struct lustre_handle *old_open_handle)
879 struct ll_inode_info *lli = ll_i2info(inode);
880 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
881 struct obd_client_handle **och_p;
886 /* Get the openhandle of the file */
887 mutex_lock(&lli->lli_och_mutex);
888 if (fd->fd_lease_och != NULL)
889 GOTO(out_unlock, rc = -EBUSY);
891 if (fd->fd_och == NULL) {
892 if (file->f_mode & FMODE_WRITE) {
893 LASSERT(lli->lli_mds_write_och != NULL);
894 och_p = &lli->lli_mds_write_och;
895 och_usecount = &lli->lli_open_fd_write_count;
897 LASSERT(lli->lli_mds_read_och != NULL);
898 och_p = &lli->lli_mds_read_och;
899 och_usecount = &lli->lli_open_fd_read_count;
902 if (*och_usecount > 1)
903 GOTO(out_unlock, rc = -EBUSY);
910 *old_open_handle = fd->fd_och->och_open_handle;
914 mutex_unlock(&lli->lli_och_mutex);
919 * Release ownership on lli_mds_*_och when putting back a file lease.
921 static int ll_lease_och_release(struct inode *inode, struct file *file)
923 struct ll_inode_info *lli = ll_i2info(inode);
924 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
925 struct obd_client_handle **och_p;
926 struct obd_client_handle *old_och = NULL;
931 mutex_lock(&lli->lli_och_mutex);
932 if (file->f_mode & FMODE_WRITE) {
933 och_p = &lli->lli_mds_write_och;
934 och_usecount = &lli->lli_open_fd_write_count;
936 och_p = &lli->lli_mds_read_och;
937 och_usecount = &lli->lli_open_fd_read_count;
940 	/* The file may have been opened by another process (broken lease), so
941 	 * *och_p is not NULL. In this case we should simply increase the usecount
944 if (*och_p != NULL) {
945 old_och = fd->fd_och;
952 mutex_unlock(&lli->lli_och_mutex);
955 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
961 * Acquire a lease and open the file.
963 static struct obd_client_handle *
964 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
967 struct lookup_intent it = { .it_op = IT_OPEN };
968 struct ll_sb_info *sbi = ll_i2sbi(inode);
969 struct md_op_data *op_data;
970 struct ptlrpc_request *req = NULL;
971 struct lustre_handle old_open_handle = { 0 };
972 struct obd_client_handle *och = NULL;
977 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
978 RETURN(ERR_PTR(-EINVAL));
981 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
982 RETURN(ERR_PTR(-EPERM));
984 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
991 RETURN(ERR_PTR(-ENOMEM));
993 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
994 LUSTRE_OPC_ANY, NULL);
996 GOTO(out, rc = PTR_ERR(op_data));
998 /* To tell the MDT this openhandle is from the same owner */
999 op_data->op_open_handle = old_open_handle;
1001 it.it_flags = fmode | open_flags;
1002 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1003 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1004 &ll_md_blocking_lease_ast,
1005 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1006 * it can be cancelled which may mislead applications that the lease is
1008 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1009 			 * open in ll_md_blocking_ast(). Otherwise, as ll_md_blocking_lease_ast
1010 			 * doesn't deal with the openhandle, a normal openhandle would be leaked. */
1011 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1012 ll_finish_md_op_data(op_data);
1013 ptlrpc_req_finished(req);
1015 GOTO(out_release_it, rc);
1017 if (it_disposition(&it, DISP_LOOKUP_NEG))
1018 GOTO(out_release_it, rc = -ENOENT);
1020 rc = it_open_error(DISP_OPEN_OPEN, &it);
1022 GOTO(out_release_it, rc);
1024 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1025 ll_och_fill(sbi->ll_md_exp, &it, och);
1027 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1028 GOTO(out_close, rc = -EOPNOTSUPP);
1030 	/* lease already acquired, handle the lease lock */
1031 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1032 if (it.it_lock_mode == 0 ||
1033 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1034 		/* an open lock must be returned for a lease */
1035 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1036 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1038 GOTO(out_close, rc = -EPROTO);
1041 ll_intent_release(&it);
1045 /* Cancel open lock */
1046 if (it.it_lock_mode != 0) {
1047 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1049 it.it_lock_mode = 0;
1050 och->och_lease_handle.cookie = 0ULL;
1052 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1054 CERROR("%s: error closing file "DFID": %d\n",
1055 ll_get_fsname(inode->i_sb, NULL, 0),
1056 PFID(&ll_i2info(inode)->lli_fid), rc2);
1057 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1059 ll_intent_release(&it);
1063 RETURN(ERR_PTR(rc));
1067 * Check whether a layout swap can be done between two inodes.
1069 * \param[in] inode1 First inode to check
1070 * \param[in] inode2 Second inode to check
1072 * \retval 0 on success, layout swap can be performed between both inodes
1073 * \retval negative error code if requirements are not met
1075 static int ll_check_swap_layouts_validity(struct inode *inode1,
1076 struct inode *inode2)
1078 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1081 if (inode_permission(inode1, MAY_WRITE) ||
1082 inode_permission(inode2, MAY_WRITE))
1085 if (inode1->i_sb != inode2->i_sb)
1091 static int ll_swap_layouts_close(struct obd_client_handle *och,
1092 struct inode *inode, struct inode *inode2)
1094 const struct lu_fid *fid1 = ll_inode2fid(inode);
1095 const struct lu_fid *fid2;
1099 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1100 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1102 rc = ll_check_swap_layouts_validity(inode, inode2);
1104 GOTO(out_free_och, rc);
1106 /* We now know that inode2 is a lustre inode */
1107 fid2 = ll_inode2fid(inode2);
1109 rc = lu_fid_cmp(fid1, fid2);
1111 GOTO(out_free_och, rc = -EINVAL);
1113 /* Close the file and {swap,merge} layouts between inode & inode2.
1114 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1115 * because we still need it to pack l_remote_handle to MDT. */
1116 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1119 och = NULL; /* freed in ll_close_inode_openhandle() */
1129 * Release lease and close the file.
1130  * It will check whether the lease has ever been broken.
1132 static int ll_lease_close_intent(struct obd_client_handle *och,
1133 struct inode *inode,
1134 bool *lease_broken, enum mds_op_bias bias,
1137 struct ldlm_lock *lock;
1138 bool cancelled = true;
1142 lock = ldlm_handle2lock(&och->och_lease_handle);
1144 lock_res_and_lock(lock);
1145 cancelled = ldlm_is_cancel(lock);
1146 unlock_res_and_lock(lock);
1147 LDLM_LOCK_PUT(lock);
1150 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1151 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1153 if (lease_broken != NULL)
1154 *lease_broken = cancelled;
1156 if (!cancelled && !bias)
1157 ldlm_cli_cancel(&och->och_lease_handle, 0);
1159 	if (cancelled) { /* no need to execute intent */
1164 rc = ll_close_inode_openhandle(inode, och, bias, data);
1168 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1171 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1175 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1177 static int ll_lease_file_resync(struct obd_client_handle *och,
1178 struct inode *inode, unsigned long arg)
1180 struct ll_sb_info *sbi = ll_i2sbi(inode);
1181 struct md_op_data *op_data;
1182 struct ll_ioc_lease_id ioc;
1183 __u64 data_version_unused;
1187 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1188 LUSTRE_OPC_ANY, NULL);
1189 if (IS_ERR(op_data))
1190 RETURN(PTR_ERR(op_data));
1192 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1196 	/* before starting file resync, it's necessary to clean up the page cache
1197 	 * in client memory, otherwise once the layout version is increased,
1198 	 * writing back cached data will be denied by the OSTs. */
1199 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1203 op_data->op_lease_handle = och->och_lease_handle;
1204 op_data->op_mirror_id = ioc.lil_mirror_id;
1205 rc = md_file_resync(sbi->ll_md_exp, op_data);
1211 ll_finish_md_op_data(op_data);
1215 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1217 struct ll_inode_info *lli = ll_i2info(inode);
1218 struct cl_object *obj = lli->lli_clob;
1219 struct cl_attr *attr = vvp_env_thread_attr(env);
1227 ll_inode_size_lock(inode);
1229 	/* Merge the timestamps most recently obtained from the MDS with
1230 	 * timestamps obtained from the OSTs.
1232 * Do not overwrite atime of inode because it may be refreshed
1233 * by file_accessed() function. If the read was served by cache
1234 * data, there is no RPC to be sent so that atime may not be
1235 * transferred to OSTs at all. MDT only updates atime at close time
1236 * if it's at least 'mdd.*.atime_diff' older.
1237 * All in all, the atime in Lustre does not strictly comply with
1238 	 * POSIX. Solving this problem would require sending an RPC to the MDT for
1239 	 * each read, which would hurt performance.
1241 if (inode->i_atime.tv_sec < lli->lli_atime ||
1242 lli->lli_update_atime) {
1243 inode->i_atime.tv_sec = lli->lli_atime;
1244 lli->lli_update_atime = 0;
1246 inode->i_mtime.tv_sec = lli->lli_mtime;
1247 inode->i_ctime.tv_sec = lli->lli_ctime;
1249 mtime = inode->i_mtime.tv_sec;
1250 atime = inode->i_atime.tv_sec;
1251 ctime = inode->i_ctime.tv_sec;
1253 cl_object_attr_lock(obj);
1254 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1257 rc = cl_object_attr_get(env, obj, attr);
1258 cl_object_attr_unlock(obj);
1261 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1263 if (atime < attr->cat_atime)
1264 atime = attr->cat_atime;
1266 if (ctime < attr->cat_ctime)
1267 ctime = attr->cat_ctime;
1269 if (mtime < attr->cat_mtime)
1270 mtime = attr->cat_mtime;
1272 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1273 PFID(&lli->lli_fid), attr->cat_size);
1275 i_size_write(inode, attr->cat_size);
1276 inode->i_blocks = attr->cat_blocks;
1278 inode->i_mtime.tv_sec = mtime;
1279 inode->i_atime.tv_sec = atime;
1280 inode->i_ctime.tv_sec = ctime;
1283 ll_inode_size_unlock(inode);
1289 * Set designated mirror for I/O.
1291  * So far only read, write, and truncate support issuing I/O to a
1292  * designated mirror.
1294 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1296 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1298 	/* clear layout version for generic (non-resync) I/O in case it carries
1299 * stale layout version due to I/O restart */
1300 io->ci_layout_version = 0;
1302 /* FLR: disable non-delay for designated mirror I/O because obviously
1303 * only one mirror is available */
1304 if (fd->fd_designated_mirror > 0) {
1306 io->ci_designated_mirror = fd->fd_designated_mirror;
1307 io->ci_layout_version = fd->fd_layout_version;
1310 	CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1311 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1314 static bool file_is_noatime(const struct file *file)
1316 const struct vfsmount *mnt = file->f_path.mnt;
1317 const struct inode *inode = file_inode((struct file *)file);
1319 /* Adapted from file_accessed() and touch_atime().*/
1320 if (file->f_flags & O_NOATIME)
1323 if (inode->i_flags & S_NOATIME)
1326 if (IS_NOATIME(inode))
1329 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1332 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1335 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1341 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1343 struct inode *inode = file_inode(file);
1344 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1346 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1347 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1349 if (iot == CIT_WRITE) {
1350 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1351 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1352 file->f_flags & O_DIRECT ||
1355 io->ci_obj = ll_i2info(inode)->lli_clob;
1356 io->ci_lockreq = CILR_MAYBE;
1357 if (ll_file_nolock(file)) {
1358 io->ci_lockreq = CILR_NEVER;
1359 io->ci_no_srvlock = 1;
1360 } else if (file->f_flags & O_APPEND) {
1361 io->ci_lockreq = CILR_MANDATORY;
1363 io->ci_noatime = file_is_noatime(file);
1365 /* FLR: only use non-delay I/O for read as there is only one
1366 	 * available mirror for write. */
1367 io->ci_ndelay = !(iot == CIT_WRITE);
1369 ll_io_set_mirror(io, file);
1372 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1375 struct ll_inode_info *lli = ll_i2info(inode);
1376 struct ll_sb_info *sbi = ll_i2sbi(inode);
1377 enum obd_heat_type sample_type;
1378 enum obd_heat_type iobyte_type;
1379 __u64 now = ktime_get_real_seconds();
1381 if (!ll_sbi_has_file_heat(sbi) ||
1382 lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1385 if (iot == CIT_READ) {
1386 sample_type = OBD_HEAT_READSAMPLE;
1387 iobyte_type = OBD_HEAT_READBYTE;
1388 } else if (iot == CIT_WRITE) {
1389 sample_type = OBD_HEAT_WRITESAMPLE;
1390 iobyte_type = OBD_HEAT_WRITEBYTE;
1395 spin_lock(&lli->lli_heat_lock);
1396 obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1397 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1398 obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1399 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1400 spin_unlock(&lli->lli_heat_lock);
1404 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1405 struct file *file, enum cl_io_type iot,
1406 loff_t *ppos, size_t count)
1408 struct vvp_io *vio = vvp_env_io(env);
1409 struct inode *inode = file_inode(file);
1410 struct ll_inode_info *lli = ll_i2info(inode);
1411 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1412 struct range_lock range;
1416 unsigned retried = 0;
1417 bool restarted = false;
1421 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1422 file_dentry(file)->d_name.name,
1423 iot == CIT_READ ? "read" : "write", *ppos, count);
1426 io = vvp_env_thread_io(env);
1427 ll_io_init(io, file, iot);
1428 io->ci_ndelay_tried = retried;
1430 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1431 bool range_locked = false;
1433 if (file->f_flags & O_APPEND)
1434 range_lock_init(&range, 0, LUSTRE_EOF);
1436 range_lock_init(&range, *ppos, *ppos + count - 1);
1438 vio->vui_fd = LUSTRE_FPRIVATE(file);
1439 vio->vui_io_subtype = args->via_io_subtype;
1441 switch (vio->vui_io_subtype) {
1443 vio->vui_iter = args->u.normal.via_iter;
1444 vio->vui_iocb = args->u.normal.via_iocb;
1445 /* Direct IO reads must also take range lock,
1446 			 * or multiple reads will try to work on the same pages.
1447 * See LU-6227 for details. */
1448 if (((iot == CIT_WRITE) ||
1449 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1450 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1451 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1453 rc = range_lock(&lli->lli_write_tree, &range);
1457 range_locked = true;
1461 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1462 vio->u.splice.vui_flags = args->u.splice.via_flags;
1465 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1469 ll_cl_add(file, env, io, LCC_RW);
1470 rc = cl_io_loop(env, io);
1471 ll_cl_remove(file, env);
1474 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1476 range_unlock(&lli->lli_write_tree, &range);
1479 /* cl_io_rw_init() handled IO */
1483 if (io->ci_nob > 0) {
1484 result += io->ci_nob;
1485 count -= io->ci_nob;
1486 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1488 /* prepare IO restart */
1489 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1490 args->u.normal.via_iter = vio->vui_iter;
1493 cl_io_fini(env, io);
1496 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1497 file->f_path.dentry->d_name.name,
1498 iot, rc, result, io->ci_need_restart);
1500 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1502 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1503 file_dentry(file)->d_name.name,
1504 iot == CIT_READ ? "read" : "write",
1505 *ppos, count, result, rc);
1506 /* preserve the tried count for FLR */
1507 retried = io->ci_ndelay_tried;
1512 if (iot == CIT_READ) {
1514 ll_stats_ops_tally(ll_i2sbi(inode),
1515 LPROC_LL_READ_BYTES, result);
1516 } else if (iot == CIT_WRITE) {
1518 ll_stats_ops_tally(ll_i2sbi(inode),
1519 LPROC_LL_WRITE_BYTES, result);
1520 fd->fd_write_failed = false;
1521 } else if (result == 0 && rc == 0) {
1524 fd->fd_write_failed = true;
1526 fd->fd_write_failed = false;
1527 } else if (rc != -ERESTARTSYS) {
1528 fd->fd_write_failed = true;
1532 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1534 ll_heat_add(inode, iot, result);
1536 RETURN(result > 0 ? result : rc);
1540 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1541 * especially for small I/O.
1543 * To serve a read request, CLIO has to create and initialize a cl_io and
1544  * then request a DLM lock. This has turned out to have significant overhead
1545 * and affects the performance of small I/O dramatically.
1547 * It's not necessary to create a cl_io for each I/O. Under the help of read
1548 * ahead, most of the pages being read are already in memory cache and we can
1549 * read those pages directly because if the pages exist, the corresponding DLM
1550  * lock must exist, so the page content must be valid.
1552  * In the fast read implementation, llite speculatively finds and reads pages
1553 * in memory cache. There are three scenarios for fast read:
1554 * - If the page exists and is uptodate, kernel VM will provide the data and
1555 * CLIO won't be intervened;
1556 * - If the page was brought into memory by read ahead, it will be exported
1557 * and read ahead parameters will be updated;
1558 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1559 * it will go back and invoke normal read, i.e., a cl_io will be created
1560 * and DLM lock will be requested.
1562  * POSIX compliance: the POSIX standard states that read is intended to be atomic.
1563 * Lustre read implementation is in line with Linux kernel read implementation
1564 * and neither of them complies with POSIX standard in this matter. Fast read
1565 * doesn't make the situation worse on single node but it may interleave write
1566 * results from multiple nodes due to short read handling in ll_file_aio_read().
1568 * \param env - lu_env
1569 * \param iocb - kiocb from kernel
1570 * \param iter - user space buffers where the data will be copied
1572  * \retval - number of bytes read, or an error code if an error occurred.
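 *
 * Concretely (illustrative flow, matching the code below): a buffered read
 * whose pages are all uptodate in the page cache is satisfied entirely by
 * generic_file_read_iter(); if ll_readpage() finds a page missing it returns
 * -ENODATA, ll_do_fast_read() gives up, and ll_file_read_iter() falls back to
 * the normal cl_io path.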
1575 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1579 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1582 /* NB: we can't do direct IO for fast read because it will need a lock
1583 * to make IO engine happy. */
1584 if (iocb->ki_filp->f_flags & O_DIRECT)
1587 result = generic_file_read_iter(iocb, iter);
1589 	/* If the first page is not in the cache, generic_file_aio_read() will
1590 	 * return -ENODATA.
1591 	 * See the corresponding code in ll_readpage(). */
1592 if (result == -ENODATA)
1596 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1597 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1598 LPROC_LL_READ_BYTES, result);
1605 * Read from a file (through the page cache).
1607 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1610 struct vvp_io_args *args;
1615 result = ll_do_fast_read(iocb, to);
1616 if (result < 0 || iov_iter_count(to) == 0)
1619 env = cl_env_get(&refcheck);
1621 return PTR_ERR(env);
1623 args = ll_env_args(env, IO_NORMAL);
1624 args->u.normal.via_iter = to;
1625 args->u.normal.via_iocb = iocb;
1627 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1628 &iocb->ki_pos, iov_iter_count(to));
1631 else if (result == 0)
1634 cl_env_put(env, &refcheck);
1640 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1641 * If a page is already in the page cache and dirty (and some other things -
1642 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1643 * write to it without doing a full I/O, because Lustre already knows about it
1644 * and will write it out. This saves a lot of processing time.
1646 * All writes here are within one page, so exclusion is handled by the page
1647 * lock on the vm page. We do not do tiny writes for writes which touch
1648  * multiple pages because it's very unlikely that multiple sequential pages
1649  * are already dirty.
1651 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1652  * and are unlikely to be writes to already-dirty pages.
1654 * Attribute updates are important here, we do them in ll_tiny_write_end.
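 *
 * Worked example (illustrative numbers only, assuming a 4096-byte PAGE_SIZE):
 * a 200-byte write at file offset 1000 stays within one page and may take the
 * tiny-write path, while a 200-byte write at offset 4000 would cross a page
 * boundary and falls back to the normal write path.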
1656 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1658 ssize_t count = iov_iter_count(iter);
1659 struct file *file = iocb->ki_filp;
1660 struct inode *inode = file_inode(file);
1661 bool lock_inode = !IS_NOSEC(inode);
1666 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1667 * of function for why.
1669 if (count >= PAGE_SIZE ||
1670 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1673 if (unlikely(lock_inode))
1675 result = __generic_file_write_iter(iocb, iter);
1677 if (unlikely(lock_inode))
1678 inode_unlock(inode);
1680 /* If the page is not already dirty, ll_tiny_write_begin returns
1681 * -ENODATA. We continue on to normal write.
1683 if (result == -ENODATA)
1687 ll_heat_add(inode, CIT_WRITE, result);
1688 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1690 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1693 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1699 * Write to a file (through the page cache).
1701 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1703 struct vvp_io_args *args;
1705 ssize_t rc_tiny = 0, rc_normal;
1710 /* NB: we can't do direct IO for tiny writes because they use the page
1711 * cache, we can't do sync writes because tiny writes can't flush
1712 * pages, and we can't do append writes because we can't guarantee the
1713 * required DLM locks are held to protect file size.
1715 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1716 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1717 rc_tiny = ll_do_tiny_write(iocb, from);
1719 	/* In case of error, go on and try the normal write - only stop if the tiny
1720 	 * write completed the I/O.
1722 if (iov_iter_count(from) == 0)
1723 GOTO(out, rc_normal = rc_tiny);
1725 env = cl_env_get(&refcheck);
1727 return PTR_ERR(env);
1729 args = ll_env_args(env, IO_NORMAL);
1730 args->u.normal.via_iter = from;
1731 args->u.normal.via_iocb = iocb;
1733 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1734 &iocb->ki_pos, iov_iter_count(from));
1736 /* On success, combine bytes written. */
1737 if (rc_tiny >= 0 && rc_normal > 0)
1738 rc_normal += rc_tiny;
1739 /* On error, only return error from normal write if tiny write did not
1740 * write any bytes. Otherwise return bytes written by tiny write.
1742 else if (rc_tiny > 0)
1743 rc_normal = rc_tiny;
1745 cl_env_put(env, &refcheck);
1750 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1752 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1754 static int ll_file_get_iov_count(const struct iovec *iov,
1755 unsigned long *nr_segs, size_t *count)
1760 for (seg = 0; seg < *nr_segs; seg++) {
1761 const struct iovec *iv = &iov[seg];
1764 * If any segment has a negative length, or the cumulative
1765 * length ever wraps negative then return -EINVAL.
1768 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1770 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1775 cnt -= iv->iov_len; /* This segment is no good */
1782 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1783 unsigned long nr_segs, loff_t pos)
1790 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1794 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1795 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1796 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1797 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1798 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1800 result = ll_file_read_iter(iocb, &to);
1805 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1808 struct iovec iov = { .iov_base = buf, .iov_len = count };
1813 init_sync_kiocb(&kiocb, file);
1814 kiocb.ki_pos = *ppos;
1815 #ifdef HAVE_KIOCB_KI_LEFT
1816 kiocb.ki_left = count;
1817 #elif defined(HAVE_KI_NBYTES)
1818 	kiocb.ki_nbytes = count;
1821 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1822 *ppos = kiocb.ki_pos;
1828 * Write to a file (through the page cache).
1831 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1832 unsigned long nr_segs, loff_t pos)
1834 struct iov_iter from;
1839 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1843 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1844 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1845 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1846 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1847 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1849 result = ll_file_write_iter(iocb, &from);
1854 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1855 size_t count, loff_t *ppos)
1857 struct iovec iov = { .iov_base = (void __user *)buf,
1864 init_sync_kiocb(&kiocb, file);
1865 kiocb.ki_pos = *ppos;
1866 #ifdef HAVE_KIOCB_KI_LEFT
1867 kiocb.ki_left = count;
1868 #elif defined(HAVE_KI_NBYTES)
1869 kiocb.ki_nbytes = count;
1872 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1873 *ppos = kiocb.ki_pos;
1877 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1880 * Send file content (through pagecache) somewhere with helper
1882 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1883 struct pipe_inode_info *pipe, size_t count,
1887 struct vvp_io_args *args;
1892 env = cl_env_get(&refcheck);
1894 RETURN(PTR_ERR(env));
1896 args = ll_env_args(env, IO_SPLICE);
1897 args->u.splice.via_pipe = pipe;
1898 args->u.splice.via_flags = flags;
1900 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1901 cl_env_put(env, &refcheck);
1905 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1906 __u64 flags, struct lov_user_md *lum, int lum_size)
1908 struct lookup_intent oit = {
1910 .it_flags = flags | MDS_OPEN_BY_FID,
1915 ll_inode_size_lock(inode);
1916 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1918 GOTO(out_unlock, rc);
1920 ll_release_openhandle(dentry, &oit);
1923 ll_inode_size_unlock(inode);
1924 ll_intent_release(&oit);
1929 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1930 struct lov_mds_md **lmmp, int *lmm_size,
1931 struct ptlrpc_request **request)
1933 struct ll_sb_info *sbi = ll_i2sbi(inode);
1934 struct mdt_body *body;
1935 struct lov_mds_md *lmm = NULL;
1936 struct ptlrpc_request *req = NULL;
1937 struct md_op_data *op_data;
1940 rc = ll_get_default_mdsize(sbi, &lmmsize);
1944 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1945 strlen(filename), lmmsize,
1946 LUSTRE_OPC_ANY, NULL);
1947 if (IS_ERR(op_data))
1948 RETURN(PTR_ERR(op_data));
1950 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1951 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1952 ll_finish_md_op_data(op_data);
1954 CDEBUG(D_INFO, "md_getattr_name failed "
1955 "on %s: rc %d\n", filename, rc);
1959 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1960 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1962 lmmsize = body->mbo_eadatasize;
1964 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1966 GOTO(out, rc = -ENODATA);
1969 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1970 LASSERT(lmm != NULL);
1972 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1973 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1974 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1975 GOTO(out, rc = -EPROTO);
1978 * This is coming from the MDS, so is probably in
1979 * little endian. We convert it to host endian before
1980 * passing it to userspace.
1982 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
1985 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1986 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1987 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
1988 if (le32_to_cpu(lmm->lmm_pattern) &
1989 LOV_PATTERN_F_RELEASED)
1993 		/* if the function is called for a directory - we should
1994 		 * avoid swabbing nonexistent lsm objects */
1995 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
1996 lustre_swab_lov_user_md_v1(
1997 (struct lov_user_md_v1 *)lmm);
1998 if (S_ISREG(body->mbo_mode))
1999 lustre_swab_lov_user_md_objects(
2000 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2002 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2003 lustre_swab_lov_user_md_v3(
2004 (struct lov_user_md_v3 *)lmm);
2005 if (S_ISREG(body->mbo_mode))
2006 lustre_swab_lov_user_md_objects(
2007 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2009 } else if (lmm->lmm_magic ==
2010 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2011 lustre_swab_lov_comp_md_v1(
2012 (struct lov_comp_md_v1 *)lmm);
2018 *lmm_size = lmmsize;
2023 static int ll_lov_setea(struct inode *inode, struct file *file,
2026 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2027 struct lov_user_md *lump;
2028 int lum_size = sizeof(struct lov_user_md) +
2029 sizeof(struct lov_user_ost_data);
2033 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2036 OBD_ALLOC_LARGE(lump, lum_size);
2040 if (copy_from_user(lump, arg, lum_size))
2041 GOTO(out_lump, rc = -EFAULT);
2043 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2045 cl_lov_delay_create_clear(&file->f_flags);
2048 OBD_FREE_LARGE(lump, lum_size);
2052 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2059 env = cl_env_get(&refcheck);
2061 RETURN(PTR_ERR(env));
2063 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2064 cl_env_put(env, &refcheck);
2068 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2071 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2072 struct lov_user_md *klum;
2074 __u64 flags = FMODE_WRITE;
2077 rc = ll_copy_user_md(lum, &klum);
2082 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2087 rc = put_user(0, &lum->lmm_stripe_count);
2091 rc = ll_layout_refresh(inode, &gen);
2095 rc = ll_file_getstripe(inode, arg, lum_size);
2097 cl_lov_delay_create_clear(&file->f_flags);
2100 OBD_FREE(klum, lum_size);
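/*
 * Illustrative userspace sketch (hypothetical values; assumes the standard
 * LL_IOC_GROUP_LOCK/LL_IOC_GROUP_UNLOCK ioctls) of how the group lock code
 * below is normally driven, with a non-zero, application-chosen gid:
 *
 *	unsigned long gid = 1234;
 *	if (ioctl(fd, LL_IOC_GROUP_LOCK, gid) == 0) {
 *		do_shared_io(fd);
 *		ioctl(fd, LL_IOC_GROUP_UNLOCK, gid);
 *	}
 *
 * where do_shared_io() stands in for the application's own I/O.
 */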
2105 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2107 struct ll_inode_info *lli = ll_i2info(inode);
2108 struct cl_object *obj = lli->lli_clob;
2109 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2110 struct ll_grouplock grouplock;
2115 CWARN("group id for group lock must not be 0\n");
2119 if (ll_file_nolock(file))
2120 RETURN(-EOPNOTSUPP);
2122 spin_lock(&lli->lli_lock);
2123 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2124 CWARN("group lock already existed with gid %lu\n",
2125 fd->fd_grouplock.lg_gid);
2126 spin_unlock(&lli->lli_lock);
2129 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2130 spin_unlock(&lli->lli_lock);
2133 * XXX: group lock needs to protect all OST objects while PFL
2134 	 * can add new OST objects during the IO, so we instantiate
2135 	 * all OST objects before taking the group lock.
2140 struct cl_layout cl = {
2141 .cl_is_composite = false,
2143 struct lu_extent ext = {
2145 .e_end = OBD_OBJECT_EOF,
2148 env = cl_env_get(&refcheck);
2150 RETURN(PTR_ERR(env));
2152 rc = cl_object_layout_get(env, obj, &cl);
2153 if (!rc && cl.cl_is_composite)
2154 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2157 cl_env_put(env, &refcheck);
2162 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2163 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2167 spin_lock(&lli->lli_lock);
2168 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2169 spin_unlock(&lli->lli_lock);
2170 CERROR("another thread just won the race\n");
2171 cl_put_grouplock(&grouplock);
2175 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2176 fd->fd_grouplock = grouplock;
2177 spin_unlock(&lli->lli_lock);
2179 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2183 static int ll_put_grouplock(struct inode *inode, struct file *file,
2186 struct ll_inode_info *lli = ll_i2info(inode);
2187 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2188 struct ll_grouplock grouplock;
2191 spin_lock(&lli->lli_lock);
2192 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2193 spin_unlock(&lli->lli_lock);
2194 CWARN("no group lock held\n");
2198 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2200 if (fd->fd_grouplock.lg_gid != arg) {
2201 CWARN("group lock %lu doesn't match current id %lu\n",
2202 arg, fd->fd_grouplock.lg_gid);
2203 spin_unlock(&lli->lli_lock);
2207 grouplock = fd->fd_grouplock;
2208 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2209 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2210 spin_unlock(&lli->lli_lock);
2212 cl_put_grouplock(&grouplock);
2213 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2218 * Close inode open handle
2220 * \param dentry [in] dentry which contains the inode
2221 * \param it [in,out] intent which contains open info and result
2224 * \retval <0 failure
2226 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2228 struct inode *inode = dentry->d_inode;
2229 struct obd_client_handle *och;
2235 /* Root ? Do nothing. */
2236 if (dentry->d_inode->i_sb->s_root == dentry)
2239 /* No open handle to close? Move away */
2240 if (!it_disposition(it, DISP_OPEN_OPEN))
2243 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2245 OBD_ALLOC(och, sizeof(*och));
2247 GOTO(out, rc = -ENOMEM);
2249 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2251 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2253 /* this one is in place of ll_file_open */
2254 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2255 ptlrpc_req_finished(it->it_request);
2256 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2262 * Get the size of the inode for which the FIEMAP mapping is requested.
2263 * Make the FIEMAP get_info call and return the result.
2264 * \param fiemap kernel buffer to hold the extents
2265 * \param num_bytes kernel buffer size
2267 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2273 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2276 /* Checks for fiemap flags */
2277 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2278 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2282 /* Check for FIEMAP_FLAG_SYNC */
2283 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2284 rc = filemap_fdatawrite(inode->i_mapping);
2289 env = cl_env_get(&refcheck);
2291 RETURN(PTR_ERR(env));
2293 if (i_size_read(inode) == 0) {
2294 rc = ll_glimpse_size(inode);
2299 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2300 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2301 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2303 /* If the file size is 0, then there are no objects to map */
2304 if (fmkey.lfik_oa.o_size == 0) {
2305 fiemap->fm_mapped_extents = 0;
2309 fmkey.lfik_fiemap = *fiemap;
2311 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2312 &fmkey, fiemap, &num_bytes);
2314 cl_env_put(env, &refcheck);
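/**
 * Handle the OBD_IOC_FID2PATH ioctl: resolve a FID to a path on the MDT.
 *
 * The user's struct getinfo_fid2path is copied in, the filesystem root FID is
 * appended so the MDT can resolve paths inside a fileset, and the request is
 * forwarded to the MDC via obd_iocontrol().
 */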
2318 int ll_fid2path(struct inode *inode, void __user *arg)
2320 struct obd_export *exp = ll_i2mdexp(inode);
2321 const struct getinfo_fid2path __user *gfin = arg;
2323 struct getinfo_fid2path *gfout;
2329 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2330 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2333 /* Only need to get the buflen */
2334 if (get_user(pathlen, &gfin->gf_pathlen))
2337 if (pathlen > PATH_MAX)
2340 outsize = sizeof(*gfout) + pathlen;
2341 OBD_ALLOC(gfout, outsize);
2345 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2346 GOTO(gf_free, rc = -EFAULT);
2347 /* Append the root FID after gfout to let the MDT know the root FID so
2348 * that it can look up the correct path; this is mainly for filesets.
2349 * Old servers without fileset mount support will ignore it. */
2350 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2352 /* Call mdc_iocontrol */
2353 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2357 if (copy_to_user(arg, gfout, outsize))
2361 OBD_FREE(gfout, outsize);
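/**
 * Compute the data version (and layout version) of the file by running a
 * CIT_DATA_VERSION cl_io over the file object, honouring the sync flags in
 * \a ioc.  The io is presumably restarted when ci_need_restart is set by a
 * concurrent layout change.
 */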
2366 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2368 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2376 ioc->idv_version = 0;
2377 ioc->idv_layout_version = UINT_MAX;
2379 /* If no file object has been initialized, we consider its version to be 0. */
2383 env = cl_env_get(&refcheck);
2385 RETURN(PTR_ERR(env));
2387 io = vvp_env_thread_io(env);
2389 io->u.ci_data_version.dv_data_version = 0;
2390 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2391 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2394 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2395 result = cl_io_loop(env, io);
2397 result = io->ci_result;
2399 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2400 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2402 cl_io_fini(env, io);
2404 if (unlikely(io->ci_need_restart))
2407 cl_env_put(env, &refcheck);
2413 * Read the data_version for the inode.
2415 * This value is computed using the stripe object versions on the OSTs.
2416 * The version is computed using server-side locking.
2418 * @param flags whether to sync on the OST side:
2420 * LL_DV_RD_FLUSH: flush dirty pages, take LCK_PR on the OSTs
2421 * LL_DV_WR_FLUSH: drop all cached pages, take LCK_PW on the OSTs
2423 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2425 struct ioc_data_version ioc = { .idv_flags = flags };
2428 rc = ll_ioc_data_version(inode, &ioc);
2430 *data_version = ioc.idv_version;
2436 * Trigger an HSM release request for the provided inode.
2438 int ll_hsm_release(struct inode *inode)
2441 struct obd_client_handle *och = NULL;
2442 __u64 data_version = 0;
2447 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2448 ll_get_fsname(inode->i_sb, NULL, 0),
2449 PFID(&ll_i2info(inode)->lli_fid));
2451 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2453 GOTO(out, rc = PTR_ERR(och));
2455 /* Grab latest data_version and [am]time values */
2456 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2460 env = cl_env_get(&refcheck);
2462 GOTO(out, rc = PTR_ERR(env));
2464 rc = ll_merge_attr(env, inode);
2465 cl_env_put(env, &refcheck);
2467 /* If an error happens, we have the wrong size for the file.
2473 /* Release the file.
2474 * NB: the lease lock handle is released in mdc_hsm_release_pack() because
2475 * we still need it to pack l_remote_handle for the MDT. */
2476 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2482 if (och != NULL && !IS_ERR(och)) /* close the file */
2483 ll_lease_close(och, inode, NULL);
2488 struct ll_swap_stack {
2491 struct inode *inode1;
2492 struct inode *inode2;
2497 static int ll_swap_layouts(struct file *file1, struct file *file2,
2498 struct lustre_swap_layouts *lsl)
2500 struct mdc_swap_layouts msl;
2501 struct md_op_data *op_data;
2504 struct ll_swap_stack *llss = NULL;
2507 OBD_ALLOC_PTR(llss);
2511 llss->inode1 = file_inode(file1);
2512 llss->inode2 = file_inode(file2);
2514 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2518 /* we use two bools because they are easier to swap than two bits */
2519 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2520 llss->check_dv1 = true;
2522 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2523 llss->check_dv2 = true;
2525 /* we cannot use lsl->sl_dvX directly because we may swap them */
2526 llss->dv1 = lsl->sl_dv1;
2527 llss->dv2 = lsl->sl_dv2;
2529 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2530 if (rc == 0) /* same file, done! */
2533 if (rc < 0) { /* sequentialize it */
2534 swap(llss->inode1, llss->inode2);
2536 swap(llss->dv1, llss->dv2);
2537 swap(llss->check_dv1, llss->check_dv2);
2541 if (gid != 0) { /* application asks to flush dirty cache */
2542 rc = ll_get_grouplock(llss->inode1, file1, gid);
2546 rc = ll_get_grouplock(llss->inode2, file2, gid);
2548 ll_put_grouplock(llss->inode1, file1, gid);
2553 /* final check: before swapping the layouts, verify that the
2554 * data version has not changed (if requested) */
2555 if (llss->check_dv1) {
2556 rc = ll_data_version(llss->inode1, &dv, 0);
2559 if (dv != llss->dv1)
2560 GOTO(putgl, rc = -EAGAIN);
2563 if (llss->check_dv2) {
2564 rc = ll_data_version(llss->inode2, &dv, 0);
2567 if (dv != llss->dv2)
2568 GOTO(putgl, rc = -EAGAIN);
2571 /* struct md_op_data is used to send the swap args to the MDT;
2572 * only the flags are missing, so we pass struct mdc_swap_layouts
2573 * through md_op_data->op_data */
2574 /* flags from user space have to be converted before they are sent to
2575 * the server; no flags are sent today, they are only used on the client */
2578 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2579 0, LUSTRE_OPC_ANY, &msl);
2580 if (IS_ERR(op_data))
2581 GOTO(free, rc = PTR_ERR(op_data));
2583 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2584 sizeof(*op_data), op_data, NULL);
2585 ll_finish_md_op_data(op_data);
2592 ll_put_grouplock(llss->inode2, file2, gid);
2593 ll_put_grouplock(llss->inode1, file1, gid);
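/**
 * Set or clear HSM state flags on the inode.
 *
 * Based on the checks below: the set/clear masks must stay within
 * HSM_FLAGS_MASK, non-root users may only touch HSM_USER_MASK bits, and the
 * request is forwarded to the MDT through the LL_IOC_HSM_STATE_SET iocontrol.
 */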
2603 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2605 struct obd_export *exp = ll_i2mdexp(inode);
2606 struct md_op_data *op_data;
2610 /* Detect out-of-range masks */
2611 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2614 /* Non-root users are forbidden to set or clear flags which are
2615 * NOT defined in HSM_USER_MASK. */
2616 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2617 !cfs_capable(CFS_CAP_SYS_ADMIN))
2620 if (!exp_connect_archive_id_array(exp)) {
2621 /* Detect an out-of-range archive id */
2622 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2623 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2627 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2628 LUSTRE_OPC_ANY, hss);
2629 if (IS_ERR(op_data))
2630 RETURN(PTR_ERR(op_data));
2632 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2635 ll_finish_md_op_data(op_data);
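/**
 * Import an HSM-archived file: mark it HS_ARCHIVED | HS_EXISTS | HS_RELEASED
 * and then restore its mode, ownership, size and timestamps from
 * struct hsm_user_import via ll_setattr_raw().  Only regular files are
 * accepted.
 */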
2640 static int ll_hsm_import(struct inode *inode, struct file *file,
2641 struct hsm_user_import *hui)
2643 struct hsm_state_set *hss = NULL;
2644 struct iattr *attr = NULL;
2648 if (!S_ISREG(inode->i_mode))
2654 GOTO(out, rc = -ENOMEM);
2656 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2657 hss->hss_archive_id = hui->hui_archive_id;
2658 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2659 rc = ll_hsm_state_set(inode, hss);
2663 OBD_ALLOC_PTR(attr);
2665 GOTO(out, rc = -ENOMEM);
2667 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2668 attr->ia_mode |= S_IFREG;
2669 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2670 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2671 attr->ia_size = hui->hui_size;
2672 attr->ia_mtime.tv_sec = hui->hui_mtime;
2673 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2674 attr->ia_atime.tv_sec = hui->hui_atime;
2675 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2677 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2678 ATTR_UID | ATTR_GID |
2679 ATTR_MTIME | ATTR_MTIME_SET |
2680 ATTR_ATIME | ATTR_ATIME_SET;
2684 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2688 inode_unlock(inode);
2700 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2702 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2703 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
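/**
 * LL_IOC_FUTIMES_3: set atime, mtime and ctime of the file exactly as
 * supplied in struct ll_futimes_3.  As the code below shows, this requires
 * CAP_SYS_ADMIN and only applies to regular files.
 */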
2706 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2708 struct inode *inode = file_inode(file);
2710 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2711 ATTR_MTIME | ATTR_MTIME_SET |
2714 .tv_sec = lfu->lfu_atime_sec,
2715 .tv_nsec = lfu->lfu_atime_nsec,
2718 .tv_sec = lfu->lfu_mtime_sec,
2719 .tv_nsec = lfu->lfu_mtime_nsec,
2722 .tv_sec = lfu->lfu_ctime_sec,
2723 .tv_nsec = lfu->lfu_ctime_nsec,
2729 if (!capable(CAP_SYS_ADMIN))
2732 if (!S_ISREG(inode->i_mode))
2736 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2738 inode_unlock(inode);
2743 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2746 case MODE_READ_USER:
2748 case MODE_WRITE_USER:
2755 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2757 /* Used to allow the upper layers of the client to request an LDLM lock
2758 * without doing an actual read or write.
2760 * Used for ladvise lockahead to manually request specific locks.
2762 * \param[in] file file this ladvise lock request is on
2763 * \param[in] ladvise ladvise struct describing this lock request
2765 * \retval 0 success, no detailed result available (sync requests
2766 * and requests sent to the server [not handled locally]
2767 * cannot return detailed results)
2768 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2769 * see definitions for details.
2770 * \retval negative negative errno on error
2772 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2774 struct lu_env *env = NULL;
2775 struct cl_io *io = NULL;
2776 struct cl_lock *lock = NULL;
2777 struct cl_lock_descr *descr = NULL;
2778 struct dentry *dentry = file->f_path.dentry;
2779 struct inode *inode = dentry->d_inode;
2780 enum cl_lock_mode cl_mode;
2781 off_t start = ladvise->lla_start;
2782 off_t end = ladvise->lla_end;
2788 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2789 "start=%llu, end=%llu\n", dentry->d_name.len,
2790 dentry->d_name.name, dentry->d_inode,
2791 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2794 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2796 GOTO(out, result = cl_mode);
2798 /* Get IO environment */
2799 result = cl_io_get(inode, &env, &io, &refcheck);
2803 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2806 * nothing to do for this io. This currently happens when
2807 * stripe sub-objects are not yet created.
2809 result = io->ci_result;
2810 } else if (result == 0) {
2811 lock = vvp_env_lock(env);
2812 descr = &lock->cll_descr;
2814 descr->cld_obj = io->ci_obj;
2815 /* Convert byte offsets to pages */
2816 descr->cld_start = cl_index(io->ci_obj, start);
2817 descr->cld_end = cl_index(io->ci_obj, end);
2818 descr->cld_mode = cl_mode;
2819 /* CEF_MUST is used because we do not want to convert a
2820 * lockahead request to a lockless lock */
2821 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2824 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2825 descr->cld_enq_flags |= CEF_SPECULATIVE;
2827 result = cl_lock_request(env, io, lock);
2829 /* On success, we need to release the lock */
2831 cl_lock_release(env, lock);
2833 cl_io_fini(env, io);
2834 cl_env_put(env, &refcheck);
2836 /* -ECANCELED indicates a matching lock with a different extent
2837 * was already present, and -EEXIST indicates a matching lock
2838 * on exactly the same extent was already present.
2839 * We convert them to positive values for userspace to make
2840 * recognizing true errors easier.
2841 * Note we can only return these detailed results on async requests,
2842 * as sync requests look the same as i/o requests for locking. */
2843 if (result == -ECANCELED)
2844 result = LLA_RESULT_DIFFERENT;
2845 else if (result == -EEXIST)
2846 result = LLA_RESULT_SAME;
2851 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2853 static int ll_ladvise_sanity(struct inode *inode,
2854 struct llapi_lu_ladvise *ladvise)
2856 enum lu_ladvise_type advice = ladvise->lla_advice;
2857 /* Note the per-advice flags field is 32 bits wide, so per-advice flags
2858 * must fit in the first 32 bits of enum ladvise_flags */
2859 __u32 flags = ladvise->lla_peradvice_flags;
2860 /* 3 lines at 80 characters per line, should be plenty */
2863 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2865 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized, "
2866 "last supported advice is %s (value '%d'): rc = %d\n",
2867 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2868 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2872 /* Per-advice checks */
2874 case LU_LADVISE_LOCKNOEXPAND:
2875 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2877 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2879 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2880 ladvise_names[advice], rc);
2884 case LU_LADVISE_LOCKAHEAD:
2885 /* Currently only READ and WRITE modes can be requested */
2886 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2887 ladvise->lla_lockahead_mode == 0) {
2889 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2891 ll_get_fsname(inode->i_sb, NULL, 0),
2892 ladvise->lla_lockahead_mode,
2893 ladvise_names[advice], rc);
2896 case LU_LADVISE_WILLREAD:
2897 case LU_LADVISE_DONTNEED:
2899 /* Note fall through above - These checks apply to all advices
2900 * except LOCKNOEXPAND */
2901 if (flags & ~LF_DEFAULT_MASK) {
2903 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2905 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2906 ladvise_names[advice], rc);
2909 if (ladvise->lla_start >= ladvise->lla_end) {
2911 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2912 "for %s: rc = %d\n",
2913 ll_get_fsname(inode->i_sb, NULL, 0),
2914 ladvise->lla_start, ladvise->lla_end,
2915 ladvise_names[advice], rc);
2927 * Give file access advice
2929 * The ladvise interface is similar to the Linux fadvise() system call, except
2930 * it forwards the advice directly from the Lustre client to the server. The
2931 * server-side code will apply the appropriate read-ahead and caching
2932 * techniques for the corresponding files.
2934 * A typical workload for ladvise is, e.g., a bunch of different clients
2935 * doing small random reads of a file, so prefetching pages into the OSS cache
2936 * with big linear reads before the random IO is a net benefit. Fetching
2937 * all that data into each client cache with fadvise() may not be, due to
2938 * much more data being sent to the client.
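 *
 * From user space this is driven via the LL_IOC_LADVISE ioctl (see its
 * handling in ll_file_ioctl() below): the caller passes a struct
 * llapi_ladvise_hdr followed by lah_count struct llapi_lu_ladvise entries.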
2940 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2941 struct llapi_lu_ladvise *ladvise)
2945 struct cl_ladvise_io *lio;
2950 env = cl_env_get(&refcheck);
2952 RETURN(PTR_ERR(env));
2954 io = vvp_env_thread_io(env);
2955 io->ci_obj = ll_i2info(inode)->lli_clob;
2957 /* initialize parameters for ladvise */
2958 lio = &io->u.ci_ladvise;
2959 lio->li_start = ladvise->lla_start;
2960 lio->li_end = ladvise->lla_end;
2961 lio->li_fid = ll_inode2fid(inode);
2962 lio->li_advice = ladvise->lla_advice;
2963 lio->li_flags = flags;
2965 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2966 rc = cl_io_loop(env, io);
2970 cl_io_fini(env, io);
2971 cl_env_put(env, &refcheck);
2975 static int ll_lock_noexpand(struct file *file, int flags)
2977 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2979 fd->ll_lock_no_expand = !(flags & LF_UNSET);
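/**
 * LL_IOC_FSGETXATTR: report the inode attribute flags (as FS_XFLAG_* values)
 * and the project id in a struct fsxattr copied back to user space.
 */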
2984 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2987 struct fsxattr fsxattr;
2989 if (copy_from_user(&fsxattr,
2990 (const struct fsxattr __user *)arg,
2994 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
2995 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
2996 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
2997 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
2998 if (copy_to_user((struct fsxattr __user *)arg,
2999 &fsxattr, sizeof(fsxattr)))
3005 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3008 * Project Quota ID state is only allowed to change from within the init
3009 * namespace. Enforce that restriction only if we are trying to change
3010 * the quota ID state. Everything else is allowed in user namespaces.
3012 if (current_user_ns() == &init_user_ns)
3015 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3018 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3019 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3022 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3029 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3033 struct md_op_data *op_data;
3034 struct ptlrpc_request *req = NULL;
3036 struct fsxattr fsxattr;
3037 struct cl_object *obj;
3041 if (copy_from_user(&fsxattr,
3042 (const struct fsxattr __user *)arg,
3046 rc = ll_ioctl_check_project(inode, &fsxattr);
3050 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3051 LUSTRE_OPC_ANY, NULL);
3052 if (IS_ERR(op_data))
3053 RETURN(PTR_ERR(op_data));
3055 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3056 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3057 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3058 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3059 op_data->op_projid = fsxattr.fsx_projid;
3060 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3061 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3063 ptlrpc_req_finished(req);
3065 GOTO(out_fsxattr, rc);
3066 ll_update_inode_flags(inode, op_data->op_attr_flags);
3067 obj = ll_i2info(inode)->lli_clob;
3069 GOTO(out_fsxattr, rc);
3071 OBD_ALLOC_PTR(attr);
3073 GOTO(out_fsxattr, rc = -ENOMEM);
3075 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3076 fsxattr.fsx_xflags);
3079 ll_finish_md_op_data(op_data);
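/**
 * Release a lease previously taken with ll_file_set_lease().
 *
 * Depending on ioc->lil_flags the close may carry an intent: resync done
 * (MDS_CLOSE_RESYNC_DONE), layout merge (MDS_CLOSE_LAYOUT_MERGE) or layout
 * split (MDS_CLOSE_LAYOUT_SPLIT); otherwise it is a plain lease release.
 */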
3083 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3086 struct inode *inode = file_inode(file);
3087 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3088 struct ll_inode_info *lli = ll_i2info(inode);
3089 struct obd_client_handle *och = NULL;
3090 struct split_param sp;
3093 enum mds_op_bias bias = 0;
3094 struct file *layout_file = NULL;
3096 size_t data_size = 0;
3100 mutex_lock(&lli->lli_och_mutex);
3101 if (fd->fd_lease_och != NULL) {
3102 och = fd->fd_lease_och;
3103 fd->fd_lease_och = NULL;
3105 mutex_unlock(&lli->lli_och_mutex);
3108 GOTO(out, rc = -ENOLCK);
3110 fmode = och->och_flags;
3112 switch (ioc->lil_flags) {
3113 case LL_LEASE_RESYNC_DONE:
3114 if (ioc->lil_count > IOC_IDS_MAX)
3115 GOTO(out, rc = -EINVAL);
3117 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3118 OBD_ALLOC(data, data_size);
3120 GOTO(out, rc = -ENOMEM);
3122 if (copy_from_user(data, (void __user *)arg, data_size))
3123 GOTO(out, rc = -EFAULT);
3125 bias = MDS_CLOSE_RESYNC_DONE;
3127 case LL_LEASE_LAYOUT_MERGE: {
3130 if (ioc->lil_count != 1)
3131 GOTO(out, rc = -EINVAL);
3133 arg += sizeof(*ioc);
3134 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3135 GOTO(out, rc = -EFAULT);
3137 layout_file = fget(fd);
3139 GOTO(out, rc = -EBADF);
3141 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3142 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3143 GOTO(out, rc = -EPERM);
3145 data = file_inode(layout_file);
3146 bias = MDS_CLOSE_LAYOUT_MERGE;
3149 case LL_LEASE_LAYOUT_SPLIT: {
3153 if (ioc->lil_count != 2)
3154 GOTO(out, rc = -EINVAL);
3156 arg += sizeof(*ioc);
3157 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3158 GOTO(out, rc = -EFAULT);
3160 arg += sizeof(__u32);
3161 if (copy_from_user(&mirror_id, (void __user *)arg,
3163 GOTO(out, rc = -EFAULT);
3165 layout_file = fget(fdv);
3167 GOTO(out, rc = -EBADF);
3169 sp.sp_inode = file_inode(layout_file);
3170 sp.sp_mirror_id = (__u16)mirror_id;
3172 bias = MDS_CLOSE_LAYOUT_SPLIT;
3176 /* without close intent */
3180 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3184 rc = ll_lease_och_release(inode, file);
3193 switch (ioc->lil_flags) {
3194 case LL_LEASE_RESYNC_DONE:
3196 OBD_FREE(data, data_size);
3198 case LL_LEASE_LAYOUT_MERGE:
3199 case LL_LEASE_LAYOUT_SPLIT:
3206 rc = ll_lease_type_from_fmode(fmode);
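/**
 * LL_IOC_SET_LEASE: take a read or write lease on the file, optionally
 * starting a mirror resync (LL_LEASE_RESYNC).  LL_LEASE_UNLCK requests are
 * routed to ll_file_unlock_lease() above.
 */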
3210 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3213 struct inode *inode = file_inode(file);
3214 struct ll_inode_info *lli = ll_i2info(inode);
3215 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3216 struct obd_client_handle *och = NULL;
3217 __u64 open_flags = 0;
3223 switch (ioc->lil_mode) {
3224 case LL_LEASE_WRLCK:
3225 if (!(file->f_mode & FMODE_WRITE))
3227 fmode = FMODE_WRITE;
3229 case LL_LEASE_RDLCK:
3230 if (!(file->f_mode & FMODE_READ))
3234 case LL_LEASE_UNLCK:
3235 RETURN(ll_file_unlock_lease(file, ioc, arg));
3240 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3242 /* apply for lease */
3243 if (ioc->lil_flags & LL_LEASE_RESYNC)
3244 open_flags = MDS_OPEN_RESYNC;
3245 och = ll_lease_open(inode, file, fmode, open_flags);
3247 RETURN(PTR_ERR(och));
3249 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3250 rc = ll_lease_file_resync(och, inode, arg);
3252 ll_lease_close(och, inode, NULL);
3255 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3257 ll_lease_close(och, inode, NULL);
3263 mutex_lock(&lli->lli_och_mutex);
3264 if (fd->fd_lease_och == NULL) {
3265 fd->fd_lease_och = och;
3268 mutex_unlock(&lli->lli_och_mutex);
3270 /* impossible now, since only exclusive leases are supported */
3271 ll_lease_close(och, inode, &lease_broken);
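/*
 * Snapshot the per-inode access heat: each counter is decayed to the current
 * time with the per-mount decay weight and period before being reported.
 */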
3277 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3279 struct ll_inode_info *lli = ll_i2info(inode);
3280 struct ll_sb_info *sbi = ll_i2sbi(inode);
3281 __u64 now = ktime_get_real_seconds();
3284 spin_lock(&lli->lli_heat_lock);
3285 heat->lh_flags = lli->lli_heat_flags;
3286 for (i = 0; i < heat->lh_count; i++)
3287 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3288 now, sbi->ll_heat_decay_weight,
3289 sbi->ll_heat_period_second);
3290 spin_unlock(&lli->lli_heat_lock);
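/*
 * LL_IOC_HEAT_SET: clear the heat counters and/or toggle heat accounting for
 * this inode according to the LU_HEAT_FLAG_* bits in \a flags.
 */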
3293 static int ll_heat_set(struct inode *inode, __u64 flags)
3295 struct ll_inode_info *lli = ll_i2info(inode);
3298 spin_lock(&lli->lli_heat_lock);
3299 if (flags & LU_HEAT_FLAG_CLEAR)
3300 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3302 if (flags & LU_HEAT_FLAG_OFF)
3303 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3305 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3307 spin_unlock(&lli->lli_heat_lock);
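/*
 * Main ioctl entry point for regular files: handles the LL_IOC_*, FS_IOC_*
 * and OBD_IOC_* commands implemented on the client and, in the default case,
 * forwards anything unrecognized to the data export via obd_iocontrol().
 */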
3313 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3315 struct inode *inode = file_inode(file);
3316 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3320 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3321 PFID(ll_inode2fid(inode)), inode, cmd);
3322 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3324 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3325 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3329 case LL_IOC_GETFLAGS:
3330 /* Get the current value of the file flags */
3331 return put_user(fd->fd_flags, (int __user *)arg);
3332 case LL_IOC_SETFLAGS:
3333 case LL_IOC_CLRFLAGS:
3334 /* Set or clear specific file flags */
3335 /* XXX This probably needs checks to ensure the flags are
3336 * not abused, and to handle any flag side effects.
3338 if (get_user(flags, (int __user *) arg))
3341 if (cmd == LL_IOC_SETFLAGS) {
3342 if ((flags & LL_FILE_IGNORE_LOCK) &&
3343 !(file->f_flags & O_DIRECT)) {
3344 CERROR("%s: unable to disable locking on "
3345 "non-O_DIRECT file\n", current->comm);
3349 fd->fd_flags |= flags;
3351 fd->fd_flags &= ~flags;
3354 case LL_IOC_LOV_SETSTRIPE:
3355 case LL_IOC_LOV_SETSTRIPE_NEW:
3356 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3357 case LL_IOC_LOV_SETEA:
3358 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3359 case LL_IOC_LOV_SWAP_LAYOUTS: {
3361 struct lustre_swap_layouts lsl;
3363 if (copy_from_user(&lsl, (char __user *)arg,
3364 sizeof(struct lustre_swap_layouts)))
3367 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3370 file2 = fget(lsl.sl_fd);
3374 /* O_WRONLY or O_RDWR */
3375 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3376 GOTO(out, rc = -EPERM);
3378 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3379 struct inode *inode2;
3380 struct ll_inode_info *lli;
3381 struct obd_client_handle *och = NULL;
3383 lli = ll_i2info(inode);
3384 mutex_lock(&lli->lli_och_mutex);
3385 if (fd->fd_lease_och != NULL) {
3386 och = fd->fd_lease_och;
3387 fd->fd_lease_och = NULL;
3389 mutex_unlock(&lli->lli_och_mutex);
3391 GOTO(out, rc = -ENOLCK);
3392 inode2 = file_inode(file2);
3393 rc = ll_swap_layouts_close(och, inode, inode2);
3395 rc = ll_swap_layouts(file, file2, &lsl);
3401 case LL_IOC_LOV_GETSTRIPE:
3402 case LL_IOC_LOV_GETSTRIPE_NEW:
3403 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3404 case FS_IOC_GETFLAGS:
3405 case FS_IOC_SETFLAGS:
3406 RETURN(ll_iocontrol(inode, file, cmd, arg));
3407 case FSFILT_IOC_GETVERSION:
3408 case FS_IOC_GETVERSION:
3409 RETURN(put_user(inode->i_generation, (int __user *)arg));
3410 /* We need to special-case any other ioctls we want to handle,
3411 * to send them to the MDS/OST as appropriate and to properly
3412 * network-encode the arg field. */
3413 case FS_IOC_SETVERSION:
3416 case LL_IOC_GROUP_LOCK:
3417 RETURN(ll_get_grouplock(inode, file, arg));
3418 case LL_IOC_GROUP_UNLOCK:
3419 RETURN(ll_put_grouplock(inode, file, arg));
3420 case IOC_OBD_STATFS:
3421 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3423 case LL_IOC_FLUSHCTX:
3424 RETURN(ll_flush_ctx(inode));
3425 case LL_IOC_PATH2FID: {
3426 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3427 sizeof(struct lu_fid)))
3432 case LL_IOC_GETPARENT:
3433 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3435 case OBD_IOC_FID2PATH:
3436 RETURN(ll_fid2path(inode, (void __user *)arg));
3437 case LL_IOC_DATA_VERSION: {
3438 struct ioc_data_version idv;
3441 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3444 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3445 rc = ll_ioc_data_version(inode, &idv);
3448 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3454 case LL_IOC_GET_MDTIDX: {
3457 mdtidx = ll_get_mdt_idx(inode);
3461 if (put_user((int)mdtidx, (int __user *)arg))
3466 case OBD_IOC_GETDTNAME:
3467 case OBD_IOC_GETMDNAME:
3468 RETURN(ll_get_obd_name(inode, cmd, arg));
3469 case LL_IOC_HSM_STATE_GET: {
3470 struct md_op_data *op_data;
3471 struct hsm_user_state *hus;
3478 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3479 LUSTRE_OPC_ANY, hus);
3480 if (IS_ERR(op_data)) {
3482 RETURN(PTR_ERR(op_data));
3485 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3488 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3491 ll_finish_md_op_data(op_data);
3495 case LL_IOC_HSM_STATE_SET: {
3496 struct hsm_state_set *hss;
3503 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3508 rc = ll_hsm_state_set(inode, hss);
3513 case LL_IOC_HSM_ACTION: {
3514 struct md_op_data *op_data;
3515 struct hsm_current_action *hca;
3522 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3523 LUSTRE_OPC_ANY, hca);
3524 if (IS_ERR(op_data)) {
3526 RETURN(PTR_ERR(op_data));
3529 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3532 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3535 ll_finish_md_op_data(op_data);
3539 case LL_IOC_SET_LEASE_OLD: {
3540 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3542 RETURN(ll_file_set_lease(file, &ioc, 0));
3544 case LL_IOC_SET_LEASE: {
3545 struct ll_ioc_lease ioc;
3547 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3550 RETURN(ll_file_set_lease(file, &ioc, arg));
3552 case LL_IOC_GET_LEASE: {
3553 struct ll_inode_info *lli = ll_i2info(inode);
3554 struct ldlm_lock *lock = NULL;
3557 mutex_lock(&lli->lli_och_mutex);
3558 if (fd->fd_lease_och != NULL) {
3559 struct obd_client_handle *och = fd->fd_lease_och;
3561 lock = ldlm_handle2lock(&och->och_lease_handle);
3563 lock_res_and_lock(lock);
3564 if (!ldlm_is_cancel(lock))
3565 fmode = och->och_flags;
3567 unlock_res_and_lock(lock);
3568 LDLM_LOCK_PUT(lock);
3571 mutex_unlock(&lli->lli_och_mutex);
3573 RETURN(ll_lease_type_from_fmode(fmode));
3575 case LL_IOC_HSM_IMPORT: {
3576 struct hsm_user_import *hui;
3582 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3587 rc = ll_hsm_import(inode, file, hui);
3592 case LL_IOC_FUTIMES_3: {
3593 struct ll_futimes_3 lfu;
3595 if (copy_from_user(&lfu,
3596 (const struct ll_futimes_3 __user *)arg,
3600 RETURN(ll_file_futimes_3(file, &lfu));
3602 case LL_IOC_LADVISE: {
3603 struct llapi_ladvise_hdr *k_ladvise_hdr;
3604 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3607 int alloc_size = sizeof(*k_ladvise_hdr);
3610 u_ladvise_hdr = (void __user *)arg;
3611 OBD_ALLOC_PTR(k_ladvise_hdr);
3612 if (k_ladvise_hdr == NULL)
3615 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3616 GOTO(out_ladvise, rc = -EFAULT);
3618 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3619 k_ladvise_hdr->lah_count < 1)
3620 GOTO(out_ladvise, rc = -EINVAL);
3622 num_advise = k_ladvise_hdr->lah_count;
3623 if (num_advise >= LAH_COUNT_MAX)
3624 GOTO(out_ladvise, rc = -EFBIG);
3626 OBD_FREE_PTR(k_ladvise_hdr);
3627 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3628 lah_advise[num_advise]);
3629 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3630 if (k_ladvise_hdr == NULL)
3634 * TODO: submit multiple advices to one server in a single RPC
3636 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3637 GOTO(out_ladvise, rc = -EFAULT);
3639 for (i = 0; i < num_advise; i++) {
3640 struct llapi_lu_ladvise *k_ladvise =
3641 &k_ladvise_hdr->lah_advise[i];
3642 struct llapi_lu_ladvise __user *u_ladvise =
3643 &u_ladvise_hdr->lah_advise[i];
3645 rc = ll_ladvise_sanity(inode, k_ladvise);
3647 GOTO(out_ladvise, rc);
3649 switch (k_ladvise->lla_advice) {
3650 case LU_LADVISE_LOCKNOEXPAND:
3651 rc = ll_lock_noexpand(file,
3652 k_ladvise->lla_peradvice_flags);
3653 GOTO(out_ladvise, rc);
3654 case LU_LADVISE_LOCKAHEAD:
3656 rc = ll_file_lock_ahead(file, k_ladvise);
3659 GOTO(out_ladvise, rc);
3662 &u_ladvise->lla_lockahead_result))
3663 GOTO(out_ladvise, rc = -EFAULT);
3666 rc = ll_ladvise(inode, file,
3667 k_ladvise_hdr->lah_flags,
3670 GOTO(out_ladvise, rc);
3677 OBD_FREE(k_ladvise_hdr, alloc_size);
3680 case LL_IOC_FLR_SET_MIRROR: {
3681 /* mirror I/O must be direct to avoid polluting page cache
3683 if (!(file->f_flags & O_DIRECT))
3686 fd->fd_designated_mirror = (__u32)arg;
3689 case LL_IOC_FSGETXATTR:
3690 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3691 case LL_IOC_FSSETXATTR:
3692 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3694 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3695 case LL_IOC_HEAT_GET: {
3696 struct lu_heat uheat;
3697 struct lu_heat *heat;
3700 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
3703 if (uheat.lh_count > OBD_HEAT_COUNT)
3704 uheat.lh_count = OBD_HEAT_COUNT;
3706 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3707 OBD_ALLOC(heat, size);
3711 heat->lh_count = uheat.lh_count;
3712 ll_heat_get(inode, heat);
3713 rc = copy_to_user((char __user *)arg, heat, size);
3714 OBD_FREE(heat, size);
3715 RETURN(rc ? -EFAULT : 0);
3717 case LL_IOC_HEAT_SET: {
3720 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3723 rc = ll_heat_set(inode, flags);
3727 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3728 (void __user *)arg));
3732 #ifndef HAVE_FILE_LLSEEK_SIZE
3733 static inline loff_t
3734 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3736 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3738 if (offset > maxsize)
3741 if (offset != file->f_pos) {
3742 file->f_pos = offset;
3743 file->f_version = 0;
3749 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3750 loff_t maxsize, loff_t eof)
3752 struct inode *inode = file_inode(file);
3760 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3761 * position-querying operation. Avoid rewriting the "same"
3762 * f_pos value back to the file because a concurrent read(),
3763 * write() or lseek() might have altered it
3768 * f_lock protects against read/modify/write race with other
3769 * SEEK_CURs. Note that parallel writes and reads behave
3773 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3774 inode_unlock(inode);
3778 * In the generic case the entire file is data, so as long as
3779 * offset isn't at the end of the file then the offset is data.
3786 * There is a virtual hole at the end of the file, so as long as
3787 * offset isn't i_size or larger, return i_size.
3795 return llseek_execute(file, offset, maxsize);
3799 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3801 struct inode *inode = file_inode(file);
3802 loff_t retval, eof = 0;
3805 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3806 (origin == SEEK_CUR) ? file->f_pos : 0);
3807 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3808 PFID(ll_inode2fid(inode)), inode, retval, retval,
3810 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3812 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3813 retval = ll_glimpse_size(inode);
3816 eof = i_size_read(inode);
3819 retval = ll_generic_file_llseek_size(file, offset, origin,
3820 ll_file_maxbytes(inode), eof);
3824 static int ll_flush(struct file *file, fl_owner_t id)
3826 struct inode *inode = file_inode(file);
3827 struct ll_inode_info *lli = ll_i2info(inode);
3828 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3831 LASSERT(!S_ISDIR(inode->i_mode));
3833 /* catch async errors that were recorded back when async writeback
3834 * failed for pages in this mapping. */
3835 rc = lli->lli_async_rc;
3836 lli->lli_async_rc = 0;
3837 if (lli->lli_clob != NULL) {
3838 err = lov_read_and_clear_async_rc(lli->lli_clob);
3843 /* The application has already been told about the write failure.
3844 * Do not report it again. */
3845 if (fd->fd_write_failed)
3847 return rc ? -EIO : 0;
3851 * Called to make sure a portion of the file has been written out.
3852 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OSTs.
3854 * Return how many pages have been written.
3856 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3857 enum cl_fsync_mode mode, int ignore_layout)
3861 struct cl_fsync_io *fio;
3866 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3867 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3870 env = cl_env_get(&refcheck);
3872 RETURN(PTR_ERR(env));
3874 io = vvp_env_thread_io(env);
3875 io->ci_obj = ll_i2info(inode)->lli_clob;
3876 io->ci_ignore_layout = ignore_layout;
3878 /* initialize parameters for sync */
3879 fio = &io->u.ci_fsync;
3880 fio->fi_start = start;
3882 fio->fi_fid = ll_inode2fid(inode);
3883 fio->fi_mode = mode;
3884 fio->fi_nr_written = 0;
3886 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3887 result = cl_io_loop(env, io);
3889 result = io->ci_result;
3891 result = fio->fi_nr_written;
3892 cl_io_fini(env, io);
3893 cl_env_put(env, &refcheck);
3899 * When dentry is provided (the 'else' case), file_dentry() may be
3900 * null and dentry must be used directly rather than pulled from
3901 * file_dentry() as is done otherwise.
3904 #ifdef HAVE_FILE_FSYNC_4ARGS
3905 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3907 struct dentry *dentry = file_dentry(file);
3908 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3909 int ll_fsync(struct file *file, int datasync)
3911 struct dentry *dentry = file_dentry(file);
3913 loff_t end = LLONG_MAX;
3915 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3918 loff_t end = LLONG_MAX;
3920 struct inode *inode = dentry->d_inode;
3921 struct ll_inode_info *lli = ll_i2info(inode);
3922 struct ptlrpc_request *req;
3926 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3927 PFID(ll_inode2fid(inode)), inode);
3928 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3930 #ifdef HAVE_FILE_FSYNC_4ARGS
3931 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3934 /* fsync's caller has already called _fdata{sync,write}; we want
3935 * that IO to finish before calling the osc and mdc sync methods */
3936 rc = filemap_fdatawait(inode->i_mapping);
3939 /* catch async errors that were recorded back when async writeback
3940 * failed for pages in this mapping. */
3941 if (!S_ISDIR(inode->i_mode)) {
3942 err = lli->lli_async_rc;
3943 lli->lli_async_rc = 0;
3946 if (lli->lli_clob != NULL) {
3947 err = lov_read_and_clear_async_rc(lli->lli_clob);
3953 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3957 ptlrpc_req_finished(req);
3959 if (S_ISREG(inode->i_mode)) {
3960 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3962 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3963 if (rc == 0 && err < 0)
3966 fd->fd_write_failed = true;
3968 fd->fd_write_failed = false;
3971 #ifdef HAVE_FILE_FSYNC_4ARGS
3972 inode_unlock(inode);
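/*
 * Implement both POSIX (fcntl) and BSD (flock) locking by enqueueing an
 * LDLM_FLOCK lock on the MDS; on success the local VFS lock tables are
 * updated as well so the client-side state stays consistent.
 */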
3978 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3980 struct inode *inode = file_inode(file);
3981 struct ll_sb_info *sbi = ll_i2sbi(inode);
3982 struct ldlm_enqueue_info einfo = {
3983 .ei_type = LDLM_FLOCK,
3984 .ei_cb_cp = ldlm_flock_completion_ast,
3985 .ei_cbdata = file_lock,
3987 struct md_op_data *op_data;
3988 struct lustre_handle lockh = { 0 };
3989 union ldlm_policy_data flock = { { 0 } };
3990 int fl_type = file_lock->fl_type;
3996 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3997 PFID(ll_inode2fid(inode)), file_lock);
3999 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
4001 if (file_lock->fl_flags & FL_FLOCK) {
4002 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4003 /* flocks are whole-file locks */
4004 flock.l_flock.end = OFFSET_MAX;
4005 /* For flocks the owner is determined by the local file descriptor */
4006 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4007 } else if (file_lock->fl_flags & FL_POSIX) {
4008 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4009 flock.l_flock.start = file_lock->fl_start;
4010 flock.l_flock.end = file_lock->fl_end;
4014 flock.l_flock.pid = file_lock->fl_pid;
4016 /* Somewhat ugly workaround for svc lockd.
4017 * lockd installs a custom fl_lmops->lm_compare_owner that checks
4018 * that the fl_owner is the same (which it always is on the local node,
4019 * I guess, between lockd processes) and then compares pids.
4020 * As such we assign the pid to the owner field to make it all work;
4021 * a conflict with normal locks is unlikely since the pid space and the
4022 * pointer space for current->files do not intersect */
4023 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4024 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4028 einfo.ei_mode = LCK_PR;
4031 /* An unlock request may or may not have any relation to
4032 * existing locks so we may not be able to pass a lock handle
4033 * via a normal ldlm_lock_cancel() request. The request may even
4034 * unlock a byte range in the middle of an existing lock. In
4035 * order to process an unlock request we need all of the same
4036 * information that is given with a normal read or write record
4037 * lock request. To avoid creating another ldlm unlock (cancel)
4038 * message we'll treat a LCK_NL flock request as an unlock. */
4039 einfo.ei_mode = LCK_NL;
4042 einfo.ei_mode = LCK_PW;
4045 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4060 flags = LDLM_FL_BLOCK_NOWAIT;
4066 flags = LDLM_FL_TEST_LOCK;
4069 CERROR("unknown fcntl lock command: %d\n", cmd);
4073 /* Save the old mode so that if the mode in the lock changes we
4074 * can decrement the appropriate reader or writer refcount. */
4075 file_lock->fl_type = einfo.ei_mode;
4077 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4078 LUSTRE_OPC_ANY, NULL);
4079 if (IS_ERR(op_data))
4080 RETURN(PTR_ERR(op_data));
4082 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4083 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4084 flock.l_flock.pid, flags, einfo.ei_mode,
4085 flock.l_flock.start, flock.l_flock.end);
4087 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4090 /* Restore the file lock type if not TEST lock. */
4091 if (!(flags & LDLM_FL_TEST_LOCK))
4092 file_lock->fl_type = fl_type;
4094 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4095 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4096 !(flags & LDLM_FL_TEST_LOCK))
4097 rc2 = locks_lock_file_wait(file, file_lock);
4099 if ((file_lock->fl_flags & FL_FLOCK) &&
4100 (rc == 0 || file_lock->fl_type == F_UNLCK))
4101 rc2 = flock_lock_file_wait(file, file_lock);
4102 if ((file_lock->fl_flags & FL_POSIX) &&
4103 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4104 !(flags & LDLM_FL_TEST_LOCK))
4105 rc2 = posix_lock_file_wait(file, file_lock);
4106 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4108 if (rc2 && file_lock->fl_type != F_UNLCK) {
4109 einfo.ei_mode = LCK_NL;
4110 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4115 ll_finish_md_op_data(op_data);
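/*
 * Look up \a name in \a parent with an MDC getattr-by-name and return its
 * FID; when requested, the inode is also instantiated from the reply.
 */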
4120 int ll_get_fid_by_name(struct inode *parent, const char *name,
4121 int namelen, struct lu_fid *fid,
4122 struct inode **inode)
4124 struct md_op_data *op_data = NULL;
4125 struct mdt_body *body;
4126 struct ptlrpc_request *req;
4130 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4131 LUSTRE_OPC_ANY, NULL);
4132 if (IS_ERR(op_data))
4133 RETURN(PTR_ERR(op_data));
4135 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4136 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4137 ll_finish_md_op_data(op_data);
4141 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4143 GOTO(out_req, rc = -EFAULT);
4145 *fid = body->mbo_fid1;
4148 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4150 ptlrpc_req_finished(req);
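/*
 * Migrate the child \a name of \a parent to another MDT as described by
 * \a lum.  A sketch of the flow below: the child FID is looked up, a write
 * lease is taken on regular files to fence concurrent IO, and the migration
 * is sent as an md_rename() with CLI_MIGRATE set; the operation is retried
 * on -EAGAIN when the lease was cancelled.
 */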
4154 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4157 struct dentry *dchild = NULL;
4158 struct inode *child_inode = NULL;
4159 struct md_op_data *op_data;
4160 struct ptlrpc_request *request = NULL;
4161 struct obd_client_handle *och = NULL;
4163 struct mdt_body *body;
4164 __u64 data_version = 0;
4165 size_t namelen = strlen(name);
4166 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4170 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4171 PFID(ll_inode2fid(parent)), name,
4172 lum->lum_stripe_offset, lum->lum_stripe_count);
4174 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4175 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4176 lustre_swab_lmv_user_md(lum);
4178 /* Get child FID first */
4179 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4182 dchild = d_lookup(file_dentry(file), &qstr);
4184 if (dchild->d_inode)
4185 child_inode = igrab(dchild->d_inode);
4190 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4199 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4200 OBD_CONNECT2_DIR_MIGRATE)) {
4201 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4202 ll_i2info(child_inode)->lli_lsm_md) {
4203 CERROR("%s: MDT doesn't support stripe directory "
4205 ll_get_fsname(parent->i_sb, NULL, 0));
4206 GOTO(out_iput, rc = -EOPNOTSUPP);
4211 * lfs migrate command needs to be blocked on the client
4212 * by checking the migrate FID against the FID of the
4215 if (child_inode == parent->i_sb->s_root->d_inode)
4216 GOTO(out_iput, rc = -EINVAL);
4218 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4219 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4220 if (IS_ERR(op_data))
4221 GOTO(out_iput, rc = PTR_ERR(op_data));
4223 inode_lock(child_inode);
4224 op_data->op_fid3 = *ll_inode2fid(child_inode);
4225 if (!fid_is_sane(&op_data->op_fid3)) {
4226 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4227 ll_get_fsname(parent->i_sb, NULL, 0), name,
4228 PFID(&op_data->op_fid3));
4229 GOTO(out_unlock, rc = -EINVAL);
4232 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4233 op_data->op_data = lum;
4234 op_data->op_data_size = lumlen;
4237 if (S_ISREG(child_inode->i_mode)) {
4238 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4242 GOTO(out_unlock, rc);
4245 rc = ll_data_version(child_inode, &data_version,
4248 GOTO(out_close, rc);
4250 op_data->op_open_handle = och->och_open_handle;
4251 op_data->op_data_version = data_version;
4252 op_data->op_lease_handle = och->och_lease_handle;
4253 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4255 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4256 och->och_mod->mod_open_req->rq_replay = 0;
4257 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4260 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4261 name, namelen, &request);
4263 LASSERT(request != NULL);
4264 ll_update_times(request, parent);
4267 if (rc == 0 || rc == -EAGAIN) {
4268 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4269 LASSERT(body != NULL);
4271 /* If the server does release the layout lock, then we clean up
4272 * the client och here; otherwise release it in out_close: */
4273 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4274 obd_mod_put(och->och_mod);
4275 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4277 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4283 if (request != NULL) {
4284 ptlrpc_req_finished(request);
4288 /* Try again if the lease has been cancelled. */
4289 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4294 ll_lease_close(och, child_inode, NULL);
4296 clear_nlink(child_inode);
4298 inode_unlock(child_inode);
4299 ll_finish_md_op_data(op_data);
4306 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4314 * Test if some locks matching bits and l_req_mode are acquired
4315 * - bits can be in different locks
4316 * - if found, clear the common lock bits in *bits
4317 * - the bits not found are kept in *bits
4319 * \param bits [IN] searched lock bits
4320 * \param l_req_mode [IN] searched lock mode
4321 * \retval boolean, true iff all bits are found
4323 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4325 struct lustre_handle lockh;
4326 union ldlm_policy_data policy;
4327 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4328 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4337 fid = &ll_i2info(inode)->lli_fid;
4338 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4339 ldlm_lockname[mode]);
4341 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4342 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4343 policy.l_inodebits.bits = *bits & (1 << i);
4344 if (policy.l_inodebits.bits == 0)
4347 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4348 &policy, mode, &lockh)) {
4349 struct ldlm_lock *lock;
4351 lock = ldlm_handle2lock(&lockh);
4354 ~(lock->l_policy_data.l_inodebits.bits);
4355 LDLM_LOCK_PUT(lock);
4357 *bits &= ~policy.l_inodebits.bits;
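/*
 * Match an MDC inodebits lock covering \a bits in mode \a mode; the matched
 * lock handle, if any, is returned in \a lockh.
 */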
4364 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4365 struct lustre_handle *lockh, __u64 flags,
4366 enum ldlm_mode mode)
4368 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4373 fid = &ll_i2info(inode)->lli_fid;
4374 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4376 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4377 fid, LDLM_IBITS, &policy, mode, lockh);
4382 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4384 /* Already unlinked. Just update nlink and return success */
4385 if (rc == -ENOENT) {
4387 /* If it is a striped directory and there is a bad stripe,
4388 * let's revalidate the dentry again, instead of returning
4390 if (S_ISDIR(inode->i_mode) &&
4391 ll_i2info(inode)->lli_lsm_md != NULL)
4394 /* This path cannot be hit for regular files unless in
4395 * case of obscure races, so there is no need to validate
4397 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4399 } else if (rc != 0) {
4400 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4401 "%s: revalidate FID "DFID" error: rc = %d\n",
4402 ll_get_fsname(inode->i_sb, NULL, 0),
4403 PFID(ll_inode2fid(inode)), rc);
4409 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4411 struct inode *inode = dentry->d_inode;
4412 struct obd_export *exp = ll_i2mdexp(inode);
4413 struct lookup_intent oit = {
4416 struct ptlrpc_request *req = NULL;
4417 struct md_op_data *op_data;
4421 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4422 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4424 /* Call getattr by fid, so do not provide name at all. */
4425 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4426 LUSTRE_OPC_ANY, NULL);
4427 if (IS_ERR(op_data))
4428 RETURN(PTR_ERR(op_data));
4430 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4431 ll_finish_md_op_data(op_data);
4433 rc = ll_inode_revalidate_fini(inode, rc);
4437 rc = ll_revalidate_it_finish(req, &oit, dentry);
4439 ll_intent_release(&oit);
4443 /* Unlinked? Unhash dentry, so it is not picked up later by
4444 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4445 * here to preserve get_cwd functionality on 2.6.
4447 if (!dentry->d_inode->i_nlink) {
4448 ll_lock_dcache(inode);
4449 d_lustre_invalidate(dentry, 0);
4450 ll_unlock_dcache(inode);
4453 ll_lookup_finish_locks(&oit, dentry);
4455 ptlrpc_req_finished(req);
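/*
 * For striped directories, merge nlink, blocks, size and timestamps across
 * the stripes via md_merge_attr() and store the result in the inode.
 */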
4460 static int ll_merge_md_attr(struct inode *inode)
4462 struct ll_inode_info *lli = ll_i2info(inode);
4463 struct cl_attr attr = { 0 };
4466 LASSERT(lli->lli_lsm_md != NULL);
4467 down_read(&lli->lli_lsm_sem);
4468 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4469 &attr, ll_md_blocking_ast);
4470 up_read(&lli->lli_lsm_sem);
4474 set_nlink(inode, attr.cat_nlink);
4475 inode->i_blocks = attr.cat_blocks;
4476 i_size_write(inode, attr.cat_size);
4478 ll_i2info(inode)->lli_atime = attr.cat_atime;
4479 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4480 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4485 static inline dev_t ll_compat_encode_dev(dev_t dev)
4487 /* The compat_sys_*stat*() syscalls will fail unless the
4488 * device majors and minors are both less than 256. Note that
4489 * the value returned here will be passed through
4490 * old_encode_dev() in cp_compat_stat(). And so we are not
4491 * trying to return a valid compat (u16) device number, just
4492 * one that will pass the old_valid_dev() check. */
4494 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4497 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4498 int ll_getattr(const struct path *path, struct kstat *stat,
4499 u32 request_mask, unsigned int flags)
4501 struct dentry *de = path->dentry;
4503 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4506 struct inode *inode = de->d_inode;
4507 struct ll_sb_info *sbi = ll_i2sbi(inode);
4508 struct ll_inode_info *lli = ll_i2info(inode);
4511 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4513 rc = ll_inode_revalidate(de, IT_GETATTR);
4517 if (S_ISREG(inode->i_mode)) {
4518 /* In case of restore, the MDT has the right size and has
4519 * already sent it back without granting the layout lock;
4520 * the inode is up-to-date, so a glimpse is useless.
4521 * Also, to glimpse we need the layout; in case of a running
4522 * restore the MDT holds the layout lock, so the glimpse will
4523 * block until the end of the restore (getattr will block)
4525 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4526 rc = ll_glimpse_size(inode);
4531 /* If the object isn't a regular file then don't validate its size. */
4532 if (S_ISDIR(inode->i_mode) &&
4533 lli->lli_lsm_md != NULL) {
4534 rc = ll_merge_md_attr(inode);
4539 inode->i_atime.tv_sec = lli->lli_atime;
4540 inode->i_mtime.tv_sec = lli->lli_mtime;
4541 inode->i_ctime.tv_sec = lli->lli_ctime;
4544 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4546 if (ll_need_32bit_api(sbi)) {
4547 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4548 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4549 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4551 stat->ino = inode->i_ino;
4552 stat->dev = inode->i_sb->s_dev;
4553 stat->rdev = inode->i_rdev;
4556 stat->mode = inode->i_mode;
4557 stat->uid = inode->i_uid;
4558 stat->gid = inode->i_gid;
4559 stat->atime = inode->i_atime;
4560 stat->mtime = inode->i_mtime;
4561 stat->ctime = inode->i_ctime;
4562 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4564 stat->nlink = inode->i_nlink;
4565 stat->size = i_size_read(inode);
4566 stat->blocks = inode->i_blocks;
4571 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4572 __u64 start, __u64 len)
4576 struct fiemap *fiemap;
4577 unsigned int extent_count = fieinfo->fi_extents_max;
4579 num_bytes = sizeof(*fiemap) + (extent_count *
4580 sizeof(struct fiemap_extent));
4581 OBD_ALLOC_LARGE(fiemap, num_bytes);
4586 fiemap->fm_flags = fieinfo->fi_flags;
4587 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4588 fiemap->fm_start = start;
4589 fiemap->fm_length = len;
4590 if (extent_count > 0 &&
4591 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4592 sizeof(struct fiemap_extent)) != 0)
4593 GOTO(out, rc = -EFAULT);
4595 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4597 fieinfo->fi_flags = fiemap->fm_flags;
4598 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4599 if (extent_count > 0 &&
4600 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4601 fiemap->fm_mapped_extents *
4602 sizeof(struct fiemap_extent)) != 0)
4603 GOTO(out, rc = -EFAULT);
4605 OBD_FREE_LARGE(fiemap, num_bytes);
4609 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4611 struct ll_inode_info *lli = ll_i2info(inode);
4612 struct posix_acl *acl = NULL;
4615 spin_lock(&lli->lli_lock);
4616 /* VFS' acl_permission_check->check_acl will release the refcount */
4617 acl = posix_acl_dup(lli->lli_posix_acl);
4618 spin_unlock(&lli->lli_lock);
4623 #ifdef HAVE_IOP_SET_ACL
4624 #ifdef CONFIG_FS_POSIX_ACL
4625 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4627 struct ll_sb_info *sbi = ll_i2sbi(inode);
4628 struct ptlrpc_request *req = NULL;
4629 const char *name = NULL;
4631 size_t value_size = 0;
4636 case ACL_TYPE_ACCESS:
4637 name = XATTR_NAME_POSIX_ACL_ACCESS;
4639 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4642 case ACL_TYPE_DEFAULT:
4643 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4644 if (!S_ISDIR(inode->i_mode))
4645 rc = acl ? -EACCES : 0;
4656 value_size = posix_acl_xattr_size(acl->a_count);
4657 value = kmalloc(value_size, GFP_NOFS);
4659 GOTO(out, rc = -ENOMEM);
4661 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4663 GOTO(out_value, rc);
4666 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4667 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4668 name, value, value_size, 0, 0, &req);
4670 ptlrpc_req_finished(req);
4675 forget_cached_acl(inode, type);
4677 set_cached_acl(inode, type, acl);
4680 #endif /* CONFIG_FS_POSIX_ACL */
4681 #endif /* HAVE_IOP_SET_ACL */
4683 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4685 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4686 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4688 ll_check_acl(struct inode *inode, int mask)
4691 # ifdef CONFIG_FS_POSIX_ACL
4692 struct posix_acl *acl;
4696 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4697 if (flags & IPERM_FLAG_RCU)
4700 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4705 rc = posix_acl_permission(inode, acl, mask);
4706 posix_acl_release(acl);
4709 # else /* !CONFIG_FS_POSIX_ACL */
4711 # endif /* CONFIG_FS_POSIX_ACL */
4713 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4715 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4716 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4717 #else
4718 # ifdef HAVE_INODE_PERMISION_2ARGS
4719 int ll_inode_permission(struct inode *inode, int mask)
4720 # else
4721 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4722 # endif
4723 #endif
4726 struct ll_sb_info *sbi;
4727 struct root_squash_info *squash;
4728 struct cred *cred = NULL;
4729 const struct cred *old_cred = NULL;
4731 bool squash_id = false;
4734 #ifdef MAY_NOT_BLOCK
4735 if (mask & MAY_NOT_BLOCK)
4736 return -ECHILD;
4737 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4738 if (flags & IPERM_FLAG_RCU)
4739 return -ECHILD;
4740 #endif
4742 /* as the root inode is NOT validated during the lookup operation,
4743  * it needs to be revalidated before the permission check. */
4745 if (inode == inode->i_sb->s_root->d_inode) {
4746 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4751 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4752 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4754 /* squash fsuid/fsgid if needed */
4755 sbi = ll_i2sbi(inode);
4756 squash = &sbi->ll_squash;
4757 if (unlikely(squash->rsi_uid != 0 &&
4758 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4759 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4763 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4764 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4765 squash->rsi_uid, squash->rsi_gid);
4767 /* update current process's credentials
4768 * and FS capability */
4769 cred = prepare_creds();
4773 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4774 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
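/* drop the filesystem-related capabilities (CFS_CAP_FS_MASK) from the
 * squashed credentials so the squashed root cannot bypass permission
 * checks */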
4775 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4776 if ((1 << cap) & CFS_CAP_FS_MASK)
4777 cap_lower(cred->cap_effective, cap);
4779 old_cred = override_creds(cred);
4782 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4783 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4784 /* restore the current process's credentials and FS capabilities */
4785 if (squash_id) {
4786 revert_creds(old_cred);
4793 /* -o localflock - only provides locally consistent flock locks */
4794 struct file_operations ll_file_operations = {
4795 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4796 # ifdef HAVE_SYNC_READ_WRITE
4797 .read = new_sync_read,
4798 .write = new_sync_write,
4800 .read_iter = ll_file_read_iter,
4801 .write_iter = ll_file_write_iter,
4802 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4803 .read = ll_file_read,
4804 .aio_read = ll_file_aio_read,
4805 .write = ll_file_write,
4806 .aio_write = ll_file_aio_write,
4807 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4808 .unlocked_ioctl = ll_file_ioctl,
4809 .open = ll_file_open,
4810 .release = ll_file_release,
4811 .mmap = ll_file_mmap,
4812 .llseek = ll_file_seek,
4813 .splice_read = ll_file_splice_read,
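/* -o flock - provides cluster-wide coherent flock/POSIX locks through the
 * .flock and .lock methods (ll_file_flock) below */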
4818 struct file_operations ll_file_operations_flock = {
4819 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4820 # ifdef HAVE_SYNC_READ_WRITE
4821 .read = new_sync_read,
4822 .write = new_sync_write,
4823 # endif /* HAVE_SYNC_READ_WRITE */
4824 .read_iter = ll_file_read_iter,
4825 .write_iter = ll_file_write_iter,
4826 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4827 .read = ll_file_read,
4828 .aio_read = ll_file_aio_read,
4829 .write = ll_file_write,
4830 .aio_write = ll_file_aio_write,
4831 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4832 .unlocked_ioctl = ll_file_ioctl,
4833 .open = ll_file_open,
4834 .release = ll_file_release,
4835 .mmap = ll_file_mmap,
4836 .llseek = ll_file_seek,
4837 .splice_read = ll_file_splice_read,
4840 .flock = ll_file_flock,
4841 .lock = ll_file_flock
4844 /* These are for -o noflock - to return ENOSYS on flock calls */
4845 struct file_operations ll_file_operations_noflock = {
4846 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4847 # ifdef HAVE_SYNC_READ_WRITE
4848 .read = new_sync_read,
4849 .write = new_sync_write,
4850 # endif /* HAVE_SYNC_READ_WRITE */
4851 .read_iter = ll_file_read_iter,
4852 .write_iter = ll_file_write_iter,
4853 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4854 .read = ll_file_read,
4855 .aio_read = ll_file_aio_read,
4856 .write = ll_file_write,
4857 .aio_write = ll_file_aio_write,
4858 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4859 .unlocked_ioctl = ll_file_ioctl,
4860 .open = ll_file_open,
4861 .release = ll_file_release,
4862 .mmap = ll_file_mmap,
4863 .llseek = ll_file_seek,
4864 .splice_read = ll_file_splice_read,
4867 .flock = ll_file_noflock,
4868 .lock = ll_file_noflock
4871 struct inode_operations ll_file_inode_operations = {
4872 .setattr = ll_setattr,
4873 .getattr = ll_getattr,
4874 .permission = ll_inode_permission,
4875 #ifdef HAVE_IOP_XATTR
4876 .setxattr = ll_setxattr,
4877 .getxattr = ll_getxattr,
4878 .removexattr = ll_removexattr,
4880 .listxattr = ll_listxattr,
4881 .fiemap = ll_fiemap,
4882 #ifdef HAVE_IOP_GET_ACL
4883 .get_acl = ll_get_acl,
4885 #ifdef HAVE_IOP_SET_ACL
4886 .set_acl = ll_set_acl,
4890 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4892 struct ll_inode_info *lli = ll_i2info(inode);
4893 struct cl_object *obj = lli->lli_clob;
4902 env = cl_env_get(&refcheck);
4903 if (IS_ERR(env))
4904 RETURN(PTR_ERR(env));
4906 rc = cl_conf_set(env, lli->lli_clob, conf);
4910 if (conf->coc_opc == OBJECT_CONF_SET) {
4911 struct ldlm_lock *lock = conf->coc_lock;
4912 struct cl_layout cl = {
4916 LASSERT(lock != NULL);
4917 LASSERT(ldlm_has_layout(lock));
4919 /* the lock can only be allowed to match after the layout has been
4920  * applied to the inode, otherwise a stale layout could be
4921  * seen. Applying the layout should happen before dropping
4922  * the intent lock. */
4923 ldlm_lock_allow_match(lock);
4925 rc = cl_object_layout_get(env, obj, &cl);
4930 DFID": layout version change: %u -> %u\n",
4931 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4933 ll_layout_version_set(lli, cl.cl_layout_gen);
4937 cl_env_put(env, &refcheck);
4942 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4943 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4946 struct ll_sb_info *sbi = ll_i2sbi(inode);
4947 struct ptlrpc_request *req;
4954 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4955 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4956 lock->l_lvb_data, lock->l_lvb_len);
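/* if the layout is already attached to the lock as its LVB, there is no
 * need to fetch it from the MDT again */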
4958 if (lock->l_lvb_data != NULL)
4961 /* if the layout lock was granted right away, the layout is returned
4962  * within the DLM_LVB of the dlm reply; otherwise, if the lock was ever
4963  * blocked and then granted via a completion AST, we have to fetch the
4964  * layout here. Note that we can't use the LVB buffer in the
4965  * completion AST because it is not large enough. */
4966 rc = ll_get_default_mdsize(sbi, &lmmsize);
4970 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
4971 XATTR_NAME_LOV, lmmsize, &req);
4972 if (rc < 0) {
4973 if (rc == -ENODATA)
4974 GOTO(out, rc = 0); /* empty layout */
4981 if (lmmsize == 0) /* empty layout */
4982 GOTO(out, rc = 0);
4984 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4985 if (lmm == NULL)
4986 GOTO(out, rc = -EFAULT);
4988 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4989 if (lvbdata == NULL)
4990 GOTO(out, rc = -ENOMEM);
4992 memcpy(lvbdata, lmm, lmmsize);
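/* install the copied layout as the lock's LVB under the resource lock;
 * if another thread already attached an LVB, the local copy is freed
 * below instead */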
4993 lock_res_and_lock(lock);
4994 if (unlikely(lock->l_lvb_data == NULL)) {
4995 lock->l_lvb_type = LVB_T_LAYOUT;
4996 lock->l_lvb_data = lvbdata;
4997 lock->l_lvb_len = lmmsize;
4998 lvbdata = NULL;
4999 }
5000 unlock_res_and_lock(lock);
5003 OBD_FREE_LARGE(lvbdata, lmmsize);
5008 ptlrpc_req_finished(req);
5013 * Apply the layout to the inode. The layout lock is held on entry and will
5014 * be released before this function returns.
5016 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5017 struct inode *inode)
5019 struct ll_inode_info *lli = ll_i2info(inode);
5020 struct ll_sb_info *sbi = ll_i2sbi(inode);
5021 struct ldlm_lock *lock;
5022 struct cl_object_conf conf;
5025 bool wait_layout = false;
5028 LASSERT(lustre_handle_is_used(lockh));
5030 lock = ldlm_handle2lock(lockh);
5031 LASSERT(lock != NULL);
5032 LASSERT(ldlm_has_layout(lock));
5034 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5035 PFID(&lli->lli_fid), inode);
5037 /* in case this is a cached lock, reinstate it with the new inode */
5038 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5040 lock_res_and_lock(lock);
5041 lvb_ready = ldlm_is_lvb_ready(lock);
5042 unlock_res_and_lock(lock);
5044 /* checking lvb_ready is racy, but this is okay. The worst case is
5045  * that multiple processes may configure the file at the same time. */
5049 rc = ll_layout_fetch(inode, lock);
5053 /* for the layout lock, the lmm is stored in the lock's lvb.
5054  * lvb_data is immutable while the lock is held, so it is safe to access
5055  * it without taking the resource lock.
5057  * set the layout on the file. This is unlikely to fail, as the old
5058  * layout has surely been eliminated by now. */
5059 memset(&conf, 0, sizeof conf);
5060 conf.coc_opc = OBJECT_CONF_SET;
5061 conf.coc_inode = inode;
5062 conf.coc_lock = lock;
5063 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5064 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5065 rc = ll_layout_conf(inode, &conf);
5067 /* refresh layout failed, need to wait */
5068 wait_layout = rc == -EBUSY;
5071 LDLM_LOCK_PUT(lock);
5072 ldlm_lock_decref(lockh, mode);
5074 /* wait for IO against the old layout to complete if it is still in use. */
5076 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5077 ll_get_fsname(inode->i_sb, NULL, 0),
5078 PFID(&lli->lli_fid), inode);
5080 memset(&conf, 0, sizeof conf);
5081 conf.coc_opc = OBJECT_CONF_WAIT;
5082 conf.coc_inode = inode;
5083 rc = ll_layout_conf(inode, &conf);
5087 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5088 ll_get_fsname(inode->i_sb, NULL, 0),
5089 PFID(&lli->lli_fid), rc);
5095 * Issue layout intent RPC to MDS.
5096 * \param inode [in] file inode
5097 * \param intent [in] layout intent
5099 * \retval 0 on success
5100 * \retval < 0 error code
5102 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5104 struct ll_inode_info *lli = ll_i2info(inode);
5105 struct ll_sb_info *sbi = ll_i2sbi(inode);
5106 struct md_op_data *op_data;
5107 struct lookup_intent it;
5108 struct ptlrpc_request *req;
5112 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5113 0, 0, LUSTRE_OPC_ANY, NULL);
5114 if (IS_ERR(op_data))
5115 RETURN(PTR_ERR(op_data));
5117 op_data->op_data = intent;
5118 op_data->op_data_size = sizeof(*intent);
5120 memset(&it, 0, sizeof(it));
5121 it.it_op = IT_LAYOUT;
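/* write and truncate intents may require the MDT to instantiate layout
 * components covering the range, so they are sent with write mode */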
5122 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5123 intent->li_opc == LAYOUT_INTENT_TRUNC)
5124 it.it_flags = FMODE_WRITE;
5126 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5127 ll_get_fsname(inode->i_sb, NULL, 0),
5128 PFID(&lli->lli_fid), inode);
5130 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5131 &ll_md_blocking_ast, 0);
5132 if (it.it_request != NULL)
5133 ptlrpc_req_finished(it.it_request);
5134 it.it_request = NULL;
5136 ll_finish_md_op_data(op_data);
5138 /* set lock data in case this is a new lock */
5140 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5142 ll_intent_drop_lock(&it);
5148 * This function checks whether a LAYOUT lock exists on the client side,
5149 * or enqueues one if it is not cached yet.
5151 * This function does not keep the layout lock held, so the lock may be
5152 * revoked at any time after this function returns. Any operation that
5153 * depends on the layout should be redone in that case.
5155 * This function should be called before lov_io_init() to get an up-to-date
5156 * layout version; the caller should save the version number, and after the
5157 * IO is finished, call this function again to verify that the layout
5158 * has not changed during the IO.
5160 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5162 struct ll_inode_info *lli = ll_i2info(inode);
5163 struct ll_sb_info *sbi = ll_i2sbi(inode);
5164 struct lustre_handle lockh;
5165 struct layout_intent intent = {
5166 .li_opc = LAYOUT_INTENT_ACCESS,
5168 enum ldlm_mode mode;
5172 *gen = ll_layout_version_get(lli);
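/* nothing to do if the layout lock feature is disabled, or a valid
 * layout generation is already cached for this inode */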
5173 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5177 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5178 LASSERT(S_ISREG(inode->i_mode));
5180 /* take layout lock mutex to enqueue layout lock exclusively. */
5181 mutex_lock(&lli->lli_layout_mutex);
5184 /* the layout lock is usually cached on the local side, so try to
5185  * match an existing lock before issuing a new layout intent. */
5186 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5187 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5188 if (mode != 0) { /* hit cached lock */
5189 rc = ll_layout_lock_set(&lockh, mode, inode);
5195 rc = ll_layout_intent(inode, &intent);
5201 *gen = ll_layout_version_get(lli);
5202 mutex_unlock(&lli->lli_layout_mutex);
5208 * Issue a layout intent RPC indicating where in a file an IO is about to write.
5210 * \param[in] inode file inode.
5211 * \param[in] ext   write range with the start offset of the file in bytes
5212 *                  where an IO is about to write, and the exclusive end
5213 *                  offset in bytes.
5215 * \retval 0 on success
5216 * \retval < 0 error code
5218 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5219 struct lu_extent *ext)
5221 struct layout_intent intent = {
5223 .li_extent.e_start = ext->e_start,
5224 .li_extent.e_end = ext->e_end,
5229 rc = ll_layout_intent(inode, &intent);
5235 * This function sends a restore request to the MDT
5237 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5239 struct hsm_user_request *hur;
5243 len = sizeof(struct hsm_user_request) +
5244 sizeof(struct hsm_user_item);
5245 OBD_ALLOC(hur, len);
5246 if (hur == NULL)
5247 RETURN(-ENOMEM);
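/* build a single-item HUA_RESTORE request for [offset, offset + length)
 * and submit it through the HSM request ioctl path on the MDT export */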
5249 hur->hur_request.hr_action = HUA_RESTORE;
5250 hur->hur_request.hr_archive_id = 0;
5251 hur->hur_request.hr_flags = 0;
5252 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5253 sizeof(hur->hur_user_item[0].hui_fid));
5254 hur->hur_user_item[0].hui_extent.offset = offset;
5255 hur->hur_user_item[0].hui_extent.length = length;
5256 hur->hur_request.hr_itemcount = 1;
5257 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,