 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.gnu.org/licenses/gpl-2.0.html
 *
 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2017, Intel Corporation.
 *
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * Author: Peter Braam <braam@clusterfs.com>
 * Author: Phil Schwan <phil@clusterfs.com>
 * Author: Andreas Dilger <adilger@clusterfs.com>

#define DEBUG_SUBSYSTEM S_LLITE

#include <lustre_dlm.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/sched.h>
#include <linux/user_namespace.h>
#ifdef HAVE_UIDGID_HEADER
# include <linux/uidgid.h>
#include <uapi/linux/lustre/lustre_ioctl.h>
#include <lustre_swab.h>

#include "cl_object.h"
#include "llite_internal.h"
#include "vvp_internal.h"
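
/*
 * Parameters for an MDS_CLOSE_LAYOUT_SPLIT biased close (see
 * ll_close_inode_openhandle() below): the victim inode and the mirror id
 * being split off.
 */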
        struct inode *sp_inode;

ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);

static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
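
/* Allocate per-file-descriptor state from the ll_file_data slab cache. */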
static struct ll_file_data *ll_file_data_get(void)
        struct ll_file_data *fd;
        OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
        fd->fd_write_failed = false;

static void ll_file_data_put(struct ll_file_data *fd)
        OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
 * Packs all the attributes into @op_data for the CLOSE RPC.
static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
                             struct obd_client_handle *och)
        ll_prep_md_op_data(op_data, inode, NULL, NULL,
                           0, 0, LUSTRE_OPC_ANY, NULL);
        op_data->op_attr.ia_mode = inode->i_mode;
        op_data->op_attr.ia_atime = inode->i_atime;
        op_data->op_attr.ia_mtime = inode->i_mtime;
        op_data->op_attr.ia_ctime = inode->i_ctime;
        op_data->op_attr.ia_size = i_size_read(inode);
        op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
                                      ATTR_MTIME | ATTR_MTIME_SET |
        op_data->op_xvalid |= OP_XVALID_CTIME_SET;
        op_data->op_attr_blocks = inode->i_blocks;
        op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
        if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
                op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
        op_data->op_open_handle = och->och_open_handle;
        if (och->och_flags & FMODE_WRITE &&
            ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
                /* For HSM: if the inode data has been modified, pack it so
                 * that the MDT can set the data-dirty flag in the archive. */
                op_data->op_bias |= MDS_DATA_MODIFIED;
 * Perform a close, possibly with a bias.
 * The meaning of "data" depends on the value of "bias".
 *
 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
static int ll_close_inode_openhandle(struct inode *inode,
                                     struct obd_client_handle *och,
                                     enum mds_op_bias bias, void *data)
        struct obd_export *md_exp = ll_i2mdexp(inode);
        const struct ll_inode_info *lli = ll_i2info(inode);
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;

        if (class_exp2obd(md_exp) == NULL) {
                CERROR("%s: invalid MDC connection handle closing "DFID"\n",
                       ll_get_fsname(inode->i_sb, NULL, 0),
                       PFID(&lli->lli_fid));

        OBD_ALLOC_PTR(op_data);
        /* We leak the open handle and request here on error, but not much
         * can be done in the OOM case since the app won't retry the close
         * on error either. */
                GOTO(out, rc = -ENOMEM);

        ll_prepare_close(inode, op_data, och);
        case MDS_CLOSE_LAYOUT_MERGE:
                /* merge blocks from the victim inode */
                op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
                op_data->op_attr.ia_valid |= ATTR_SIZE;
                op_data->op_xvalid |= OP_XVALID_BLOCKS;
        case MDS_CLOSE_LAYOUT_SPLIT:
        case MDS_CLOSE_LAYOUT_SWAP: {
                struct split_param *sp = data;

                LASSERT(data != NULL);
                op_data->op_bias |= bias;
                op_data->op_data_version = 0;
                op_data->op_lease_handle = och->och_lease_handle;
                if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
                        op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
                        op_data->op_mirror_id = sp->sp_mirror_id;
                        op_data->op_fid2 = *ll_inode2fid(data);
        case MDS_CLOSE_RESYNC_DONE: {
                struct ll_ioc_lease *ioc = data;

                LASSERT(data != NULL);
                op_data->op_attr_blocks +=
                        ioc->lil_count * op_data->op_attr_blocks;
                op_data->op_attr.ia_valid |= ATTR_SIZE;
                op_data->op_xvalid |= OP_XVALID_BLOCKS;
                op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
                op_data->op_lease_handle = och->och_lease_handle;
                op_data->op_data = &ioc->lil_ids[0];
                op_data->op_data_size =
                        ioc->lil_count * sizeof(ioc->lil_ids[0]);
        case MDS_HSM_RELEASE:
                LASSERT(data != NULL);
                op_data->op_bias |= MDS_HSM_RELEASE;
                op_data->op_data_version = *(__u64 *)data;
                op_data->op_lease_handle = och->och_lease_handle;
                op_data->op_attr.ia_valid |= ATTR_SIZE;
                op_data->op_xvalid |= OP_XVALID_BLOCKS;
                LASSERT(data == NULL);

        if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
                op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
        if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
                op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;

        rc = md_close(md_exp, op_data, och->och_mod, &req);
        if (rc != 0 && rc != -EINTR)
                CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
                       md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);

        if (rc == 0 && op_data->op_bias & bias) {
                struct mdt_body *body;

                body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
                if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))

        ll_finish_md_op_data(op_data);
        md_clear_open_replay_data(md_exp, och);
        och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
        ptlrpc_req_finished(req);       /* This is the close request */
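
/*
 * Close the MDS open handle for the given open mode. If other file
 * descriptors still hold a reference on the handle (och_usecount > 0),
 * only drop our reference and keep the handle open.
 */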
int ll_md_real_close(struct inode *inode, fmode_t fmode)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct obd_client_handle **och_p;
        struct obd_client_handle *och;

        if (fmode & FMODE_WRITE) {
                och_p = &lli->lli_mds_write_och;
                och_usecount = &lli->lli_open_fd_write_count;
        } else if (fmode & FMODE_EXEC) {
                och_p = &lli->lli_mds_exec_och;
                och_usecount = &lli->lli_open_fd_exec_count;
                LASSERT(fmode & FMODE_READ);
                och_p = &lli->lli_mds_read_och;
                och_usecount = &lli->lli_open_fd_read_count;

        mutex_lock(&lli->lli_och_mutex);
        if (*och_usecount > 0) {
                /* There are still users of this handle, so skip
                mutex_unlock(&lli->lli_och_mutex);
        mutex_unlock(&lli->lli_och_mutex);

        /* There might be a race and this handle may already
        rc = ll_close_inode_openhandle(inode, och, 0, NULL);
static int ll_md_close(struct inode *inode, struct file *file)
        union ldlm_policy_data policy = {
                .l_inodebits = { MDS_INODELOCK_OPEN },
        __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lustre_handle lockh;
        enum ldlm_mode lockmode;

        /* clear group lock, if present */
        if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
                ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);

        if (fd->fd_lease_och != NULL) {
                /* Usually the lease is not released when the application
                 * crashes, so we need to release it here. */
                rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
                CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
                       PFID(&lli->lli_fid), rc, lease_broken);
                fd->fd_lease_och = NULL;

        if (fd->fd_och != NULL) {
                rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);

        /* Let's see if we have a good enough OPEN lock on the file so
         * we can skip talking to the MDS */
        mutex_lock(&lli->lli_och_mutex);
        if (fd->fd_omode & FMODE_WRITE) {
                LASSERT(lli->lli_open_fd_write_count);
                lli->lli_open_fd_write_count--;
        } else if (fd->fd_omode & FMODE_EXEC) {
                LASSERT(lli->lli_open_fd_exec_count);
                lli->lli_open_fd_exec_count--;
                LASSERT(lli->lli_open_fd_read_count);
                lli->lli_open_fd_read_count--;
        mutex_unlock(&lli->lli_och_mutex);

        if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
                           LDLM_IBITS, &policy, lockmode, &lockh))
                rc = ll_md_real_close(inode, fd->fd_omode);

        LUSTRE_FPRIVATE(file) = NULL;
        ll_file_data_put(fd);
/* While this returns an error code, the caller's fput() does not check it,
 * so we need to make every effort to clean up all of our state here. Also,
 * applications rarely check close errors, and even if an error is returned
 * they will not retry the close call.
int ll_file_release(struct inode *inode, struct file *file)
        struct ll_file_data *fd;
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct ll_inode_info *lli = ll_i2info(inode);

        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
               PFID(ll_inode2fid(inode)), inode);

        if (inode->i_sb->s_root != file_dentry(file))
                ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
        fd = LUSTRE_FPRIVATE(file);

        /* The last ref on @file may not belong to the owner PID of statahead,
         * because parent and child processes can share the same file handle. */
        if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
                ll_deauthorize_statahead(inode, fd);

        if (inode->i_sb->s_root == file_dentry(file)) {
                LUSTRE_FPRIVATE(file) = NULL;
                ll_file_data_put(fd);

        if (!S_ISDIR(inode->i_mode)) {
                if (lli->lli_clob != NULL)
                        lov_read_and_clear_async_rc(lli->lli_clob);
                lli->lli_async_rc = 0;

        rc = ll_md_close(inode, file);

        if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
                libcfs_debug_dumplog();
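
/*
 * Filler for read_cache_page(), used by ll_dom_finish_open() below: copy
 * the data returned inline in the open reply (Data-on-MDT) into the page,
 * zero-fill the tail, and mark the page up to date.
 */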
static inline int ll_dom_readpage(void *data, struct page *page)
        struct niobuf_local *lnb = data;

        kaddr = ll_kmap_atomic(page, KM_USER0);
        memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
        if (lnb->lnb_len < PAGE_SIZE)
                memset(kaddr + lnb->lnb_len, 0,
                       PAGE_SIZE - lnb->lnb_len);
        flush_dcache_page(page);
        SetPageUptodate(page);
        ll_kunmap_atomic(kaddr, KM_USER0);
void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
                        struct lookup_intent *it)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct cl_object *obj = lli->lli_clob;
        struct address_space *mapping = inode->i_mapping;
        struct niobuf_remote *rnb;
        struct lustre_handle lockh;
        struct ldlm_lock *lock;
        unsigned long index, start;
        struct niobuf_local lnb;
        bool dom_lock = false;

        if (it->it_lock_mode != 0) {
                lockh.cookie = it->it_lock_handle;
                lock = ldlm_handle2lock(&lockh);
                dom_lock = ldlm_has_dom(lock);

        if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,

        rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
        if (rnb == NULL || rnb->rnb_len == 0)

        /* LU-11595: the server may return the whole file (which is always OK)
         * or just the file tail, whose offset must be aligned to the client
         * PAGE_SIZE to be usable on this client; if the server's PAGE_SIZE
         * is smaller, the offset may be unaligned and that data is just
         * ignored.
        if (rnb->rnb_offset % PAGE_SIZE)

        /* The server returns the whole file or just the file tail if it fits
         * in the reply buffer; in both cases the total size should be the
         * inode size.
        if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
                CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
                       ll_get_fsname(inode->i_sb, NULL, 0), rnb->rnb_offset,
                       rnb->rnb_len, i_size_read(inode));

        CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
               rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));

        data = (char *)rnb + sizeof(*rnb);

        lnb.lnb_file_offset = rnb->rnb_offset;
        start = lnb.lnb_file_offset / PAGE_SIZE;
        LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
        lnb.lnb_page_offset = 0;
                lnb.lnb_data = data + (index << PAGE_SHIFT);
                lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
                if (lnb.lnb_len > PAGE_SIZE)
                        lnb.lnb_len = PAGE_SIZE;

                vmpage = read_cache_page(mapping, index + start,
                                         ll_dom_readpage, &lnb);
                if (IS_ERR(vmpage)) {
                        CWARN("%s: cannot fill page %lu for "DFID
                              " with data: rc = %li\n",
                              ll_get_fsname(inode->i_sb, NULL, 0),
                              index + start, PFID(lu_object_fid(&obj->co_lu)),
        } while (rnb->rnb_len > (index << PAGE_SHIFT));
static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
                               struct lookup_intent *itp)
        struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
        struct dentry *parent = de->d_parent;
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;

        LASSERT(parent != NULL);
        LASSERT(itp->it_flags & MDS_OPEN_BY_FID);

        /* If the server supports open-by-FID, or the file name is invalid,
         * don't pack the name in the open request */
        if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
                len = de->d_name.len;
                name = kmalloc(len, GFP_NOFS);

                spin_lock(&de->d_lock);
                if (len != de->d_name.len) {
                        spin_unlock(&de->d_lock);
                memcpy(name, de->d_name.name, len);
                spin_unlock(&de->d_lock);

                if (!lu_name_is_valid_2(name, len)) {

        op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
                                     name, len, 0, LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data)) {
                RETURN(PTR_ERR(op_data));
        op_data->op_data = lmm;
        op_data->op_data_size = lmmsize;

        rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
                            &ll_md_blocking_ast, 0);
        ll_finish_md_op_data(op_data);

        /* The reason for keeping our own exit path: don't flood the log
         * with -ESTALE error messages.
        if (!it_disposition(itp, DISP_OPEN_OPEN) ||
            it_open_error(DISP_OPEN_OPEN, itp))
                ll_release_openhandle(de, itp);

        if (it_disposition(itp, DISP_LOOKUP_NEG))
                GOTO(out, rc = -ENOENT);

        if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
                rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
                CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);

        rc = ll_prep_inode(&de->d_inode, req, NULL, itp);

        if (!rc && itp->it_lock_mode) {
                ll_dom_finish_open(de->d_inode, req, itp);
                ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);

        ptlrpc_req_finished(req);
        ll_intent_drop_lock(itp);

        /* We did the open by FID, but by the time we got to the server,
         * the object had disappeared. If this is a create, we cannot really
         * tell userspace that the file it was trying to create does not
         * exist. Instead return -ESTALE, and the VFS will retry the create
         * with LOOKUP_REVAL, which we catch in ll_revalidate_dentry() and
         * fall back to lookup.
        if (rc == -ENOENT && itp->it_op & IT_CREAT)
static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
                       struct obd_client_handle *och)
        struct mdt_body *body;

        body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
        och->och_open_handle = body->mbo_open_handle;
        och->och_fid = body->mbo_fid1;
        och->och_lease_handle.cookie = it->it_lock_handle;
        och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
        och->och_flags = it->it_flags;

        return md_set_open_replay_data(md_exp, och, it);
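
/*
 * Finish a local open: fill in the open handle from the intent reply (if
 * any), attach the ll_file_data to the file, and initialize the readahead
 * and ll_cl_context state.
 */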
static int ll_local_open(struct file *file, struct lookup_intent *it,
                         struct ll_file_data *fd, struct obd_client_handle *och)
        struct inode *inode = file_inode(file);

        LASSERT(!LUSTRE_FPRIVATE(file));

        rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);

        LUSTRE_FPRIVATE(file) = fd;
        ll_readahead_init(inode, &fd->fd_ras);
        fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);

        /* ll_cl_context initialize */
        rwlock_init(&fd->fd_lock);
        INIT_LIST_HEAD(&fd->fd_lccs);
/* Open a file, and (for the very first open) create objects on the OSTs at
 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
 * creation or open until the ll_lov_setstripe() ioctl is called.
 *
 * If we already have the stripe MD locally then we don't request it in
 * md_open(), by passing lmm_size = 0.
 *
 * It is up to the application to ensure no other processes open this file
 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
 * used. We might be able to avoid races of that sort by getting lli_open_sem
 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
int ll_file_open(struct inode *inode, struct file *file)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct lookup_intent *it, oit = { .it_op = IT_OPEN,
                                          .it_flags = file->f_flags };
        struct obd_client_handle **och_p = NULL;
        __u64 *och_usecount = NULL;
        struct ll_file_data *fd;

        CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
               PFID(ll_inode2fid(inode)), inode, file->f_flags);

        it = file->private_data; /* XXX: compat macro */
        file->private_data = NULL; /* prevent ll_local_open assertion */

        fd = ll_file_data_get();
                GOTO(out_nofiledata, rc = -ENOMEM);

        if (S_ISDIR(inode->i_mode))
                ll_authorize_statahead(inode, fd);

        if (inode->i_sb->s_root == file_dentry(file)) {
                LUSTRE_FPRIVATE(file) = fd;

        if (!it || !it->it_disposition) {
                /* Convert f_flags into access mode. We cannot use file->f_mode,
                 * because everything but the O_ACCMODE mask was stripped from
                if ((oit.it_flags + 1) & O_ACCMODE)
                if (file->f_flags & O_TRUNC)
                        oit.it_flags |= FMODE_WRITE;

                /* The kernel only calls f_op->open in dentry_open().
                 * filp_open() calls dentry_open() after open_namei() has
                 * checked permissions. Only nfsd_open() calls dentry_open()
                 * directly without checking permissions, and because of that
                 * the code below is safe.
                if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
                        oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;

                /* We do not want O_EXCL here, presumably we opened the file
                 * already? XXX - NFS implications? */
                oit.it_flags &= ~O_EXCL;

                /* bug20584: if "it_flags" contains O_CREAT, the file will be
                 * created if necessary, so "IT_CREAT" should be set to stay
                 * consistent with it */
                if (oit.it_flags & O_CREAT)
                        oit.it_op |= IT_CREAT;

        /* Let's see if we already have the file open on the MDS. */
        if (it->it_flags & FMODE_WRITE) {
                och_p = &lli->lli_mds_write_och;
                och_usecount = &lli->lli_open_fd_write_count;
        } else if (it->it_flags & FMODE_EXEC) {
                och_p = &lli->lli_mds_exec_och;
                och_usecount = &lli->lli_open_fd_exec_count;
                och_p = &lli->lli_mds_read_och;
                och_usecount = &lli->lli_open_fd_read_count;

        mutex_lock(&lli->lli_och_mutex);
        if (*och_p) { /* Open handle is present */
                if (it_disposition(it, DISP_OPEN_OPEN)) {
                        /* Well, there's an extra open request that we do not
                         * need; let's close it somehow. This will decref the
                         * request. */
                        rc = it_open_error(DISP_OPEN_OPEN, it);
                                mutex_unlock(&lli->lli_och_mutex);
                                GOTO(out_openerr, rc);

                        ll_release_openhandle(file_dentry(file), it);

                rc = ll_local_open(file, it, fd, NULL);
                        mutex_unlock(&lli->lli_och_mutex);
                        GOTO(out_openerr, rc);
                LASSERT(*och_usecount == 0);
                if (!it->it_disposition) {
                        struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
                        /* We cannot just request a lock handle now; the new
                         * ELC code means that one of the other OPEN locks for
                         * this file could be cancelled, and since the blocking
                         * AST handler would attempt to grab och_mutex as well,
                         * that would result in a deadlock */
                        mutex_unlock(&lli->lli_och_mutex);
                         * Normally called under two situations:
                         * 2. A race/condition on the MDS resulting in no open
                         *    handle being returned from the LOOKUP|OPEN
                         *    request, for example if the target entry was a
                         *    symlink.
                         *
                         * Only fetch MDS_OPEN_LOCK if this is in the NFS path,
                         * marked by a bit set in ll_iget_for_nfs. Clear the
                         * bit so that it's not confusing later callers.
                         *
                         * NB: when ldd is NULL, it must have come via the
                         * normal lookup path only, since ll_iget_for_nfs
                         * always calls
                        if (ldd && ldd->lld_nfs_dentry) {
                                ldd->lld_nfs_dentry = 0;
                                it->it_flags |= MDS_OPEN_LOCK;
                         * Always specify MDS_OPEN_BY_FID because we don't want
                         * to get a file with a different FID.
                        it->it_flags |= MDS_OPEN_BY_FID;
                        rc = ll_intent_file_open(file_dentry(file), NULL, 0,
                                GOTO(out_openerr, rc);

                OBD_ALLOC(*och_p, sizeof(struct obd_client_handle));
                        GOTO(out_och_free, rc = -ENOMEM);

                /* md_intent_lock() didn't get a request ref if there was an
                 * open error, so don't do cleanup on the request here
                /* XXX (green): Shouldn't we bail out on any error here, not
                 * just an open error? */
                rc = it_open_error(DISP_OPEN_OPEN, it);
                        GOTO(out_och_free, rc);

                LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
                         "inode %p: disposition %x, status %d\n", inode,
                         it_disposition(it, ~0), it->it_status);

                rc = ll_local_open(file, it, fd, *och_p);
                        GOTO(out_och_free, rc);
        mutex_unlock(&lli->lli_och_mutex);

        /* Must do this outside the lli_och_mutex lock to prevent a deadlock
         * where a different kind of OPEN lock for this same inode gets
         * cancelled by ldlm_cancel_lru */
        if (!S_ISREG(inode->i_mode))
                GOTO(out_och_free, rc);

        cl_lov_delay_create_clear(&file->f_flags);
        GOTO(out_och_free, rc);

        if (och_p && *och_p) {
                OBD_FREE(*och_p, sizeof(struct obd_client_handle));
                *och_p = NULL; /* OBD_FREE writes some magic there */
        mutex_unlock(&lli->lli_och_mutex);

        if (lli->lli_opendir_key == fd)
                ll_deauthorize_statahead(inode, fd);
        ll_file_data_put(fd);

        ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);

        if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
                ptlrpc_req_finished(it->it_request);
                it_clear_disposition(it, DISP_ENQ_OPEN_REF);
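
/*
 * Blocking AST for lease locks: on LDLM_CB_BLOCKING, cancel the lease lock
 * asynchronously so that the lease is effectively broken for its holder.
 */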
static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
                                    struct ldlm_lock_desc *desc, void *data, int flag)
        struct lustre_handle lockh;

        case LDLM_CB_BLOCKING:
                ldlm_lock2handle(lock, &lockh);
                rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
                        CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
        case LDLM_CB_CANCELING:

 * When setting a lease on a file, we take ownership of the lli_mds_*_och
 * and save it as fd->fd_och so as to force the client to reopen the file
 * even if it has an open lock in cache already.
static int ll_lease_och_acquire(struct inode *inode, struct file *file,
                                struct lustre_handle *old_open_handle)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct obd_client_handle **och_p;

        /* Get the openhandle of the file */
        mutex_lock(&lli->lli_och_mutex);
        if (fd->fd_lease_och != NULL)
                GOTO(out_unlock, rc = -EBUSY);

        if (fd->fd_och == NULL) {
                if (file->f_mode & FMODE_WRITE) {
                        LASSERT(lli->lli_mds_write_och != NULL);
                        och_p = &lli->lli_mds_write_och;
                        och_usecount = &lli->lli_open_fd_write_count;
                        LASSERT(lli->lli_mds_read_och != NULL);
                        och_p = &lli->lli_mds_read_och;
                        och_usecount = &lli->lli_open_fd_read_count;

                if (*och_usecount > 1)
                        GOTO(out_unlock, rc = -EBUSY);

        *old_open_handle = fd->fd_och->och_open_handle;

        mutex_unlock(&lli->lli_och_mutex);
 * Release ownership on lli_mds_*_och when putting back a file lease.
static int ll_lease_och_release(struct inode *inode, struct file *file)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct obd_client_handle **och_p;
        struct obd_client_handle *old_och = NULL;

        mutex_lock(&lli->lli_och_mutex);
        if (file->f_mode & FMODE_WRITE) {
                och_p = &lli->lli_mds_write_och;
                och_usecount = &lli->lli_open_fd_write_count;
                och_p = &lli->lli_mds_read_och;
                och_usecount = &lli->lli_open_fd_read_count;

        /* The file may have been opened by another process (broken lease),
         * so *och_p is not NULL. In this case we should simply increase the
         * use count
        if (*och_p != NULL) {
                old_och = fd->fd_och;
        mutex_unlock(&lli->lli_och_mutex);

        rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
 * Acquire a lease and open the file.
static struct obd_client_handle *
ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
        struct lookup_intent it = { .it_op = IT_OPEN };
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct md_op_data *op_data;
        struct ptlrpc_request *req = NULL;
        struct lustre_handle old_open_handle = { 0 };
        struct obd_client_handle *och = NULL;

        if (fmode != FMODE_WRITE && fmode != FMODE_READ)
                RETURN(ERR_PTR(-EINVAL));

        if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
                RETURN(ERR_PTR(-EPERM));

        rc = ll_lease_och_acquire(inode, file, &old_open_handle);
                RETURN(ERR_PTR(-ENOMEM));

        op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
                                     LUSTRE_OPC_ANY, NULL);
                GOTO(out, rc = PTR_ERR(op_data));

        /* To tell the MDT this open handle is from the same owner */
        op_data->op_open_handle = old_open_handle;

        it.it_flags = fmode | open_flags;
        it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
        rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
                            &ll_md_blocking_lease_ast,
        /* LDLM_FL_NO_LRU: do not put the lease lock into the LRU list,
         * otherwise it could be cancelled, which may mislead applications
         * into thinking the lease is
         * LDLM_FL_EXCL: set this flag so that the lock won't be matched by a
         * normal open in ll_md_blocking_ast(). Otherwise, since
         * ll_md_blocking_lease_ast() doesn't deal with the open handle, a
         * normal open handle would be leaked. */
                            LDLM_FL_NO_LRU | LDLM_FL_EXCL);
        ll_finish_md_op_data(op_data);
        ptlrpc_req_finished(req);
                GOTO(out_release_it, rc);

        if (it_disposition(&it, DISP_LOOKUP_NEG))
                GOTO(out_release_it, rc = -ENOENT);

        rc = it_open_error(DISP_OPEN_OPEN, &it);
                GOTO(out_release_it, rc);

        LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
        ll_och_fill(sbi->ll_md_exp, &it, och);

        if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
                GOTO(out_close, rc = -EOPNOTSUPP);

        /* already got the lease; handle the lease lock */
        ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
        if (it.it_lock_mode == 0 ||
            it.it_lock_bits != MDS_INODELOCK_OPEN) {
                /* an open lock must be returned for a lease */
                CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
                       PFID(ll_inode2fid(inode)), it.it_lock_mode,
                GOTO(out_close, rc = -EPROTO);

        ll_intent_release(&it);

        /* Cancel the open lock */
        if (it.it_lock_mode != 0) {
                ldlm_lock_decref_and_cancel(&och->och_lease_handle,
                it.it_lock_mode = 0;
                och->och_lease_handle.cookie = 0ULL;

        rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
                CERROR("%s: error closing file "DFID": %d\n",
                       ll_get_fsname(inode->i_sb, NULL, 0),
                       PFID(&ll_i2info(inode)->lli_fid), rc2);
        och = NULL; /* och has been freed in ll_close_inode_openhandle() */

        ll_intent_release(&it);

        RETURN(ERR_PTR(rc));
 * Check whether a layout swap can be done between two inodes.
 *
 * \param[in] inode1  First inode to check
 * \param[in] inode2  Second inode to check
 *
 * \retval 0 on success, layout swap can be performed between both inodes
 * \retval negative error code if requirements are not met
static int ll_check_swap_layouts_validity(struct inode *inode1,
                                          struct inode *inode2)
        if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))

        if (inode_permission(inode1, MAY_WRITE) ||
            inode_permission(inode2, MAY_WRITE))

        if (inode1->i_sb != inode2->i_sb)
static int ll_swap_layouts_close(struct obd_client_handle *och,
                                 struct inode *inode, struct inode *inode2)
        const struct lu_fid *fid1 = ll_inode2fid(inode);
        const struct lu_fid *fid2;

        CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
               ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));

        rc = ll_check_swap_layouts_validity(inode, inode2);
                GOTO(out_free_och, rc);

        /* We now know that inode2 is a Lustre inode */
        fid2 = ll_inode2fid(inode2);

        rc = lu_fid_cmp(fid1, fid2);
                GOTO(out_free_och, rc = -EINVAL);

        /* Close the file and {swap,merge} layouts between inode & inode2.
         * NB: the lease lock handle is released in mdc_close_layout_swap_pack()
         * because we still need it to pack l_remote_handle to the MDT. */
        rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
        och = NULL; /* freed in ll_close_inode_openhandle() */
 * Release the lease and close the file.
 * It will check whether the lease was ever broken.
static int ll_lease_close_intent(struct obd_client_handle *och,
                                 struct inode *inode,
                                 bool *lease_broken, enum mds_op_bias bias,
        struct ldlm_lock *lock;
        bool cancelled = true;

        lock = ldlm_handle2lock(&och->och_lease_handle);
                lock_res_and_lock(lock);
                cancelled = ldlm_is_cancel(lock);
                unlock_res_and_lock(lock);
                LDLM_LOCK_PUT(lock);

        CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
               PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);

        if (lease_broken != NULL)
                *lease_broken = cancelled;

        if (!cancelled && !bias)
                ldlm_cli_cancel(&och->och_lease_handle, 0);

        if (cancelled) { /* no need to execute the intent */

        rc = ll_close_inode_openhandle(inode, och, bias, data);

static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
        return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
 * After a lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
static int ll_lease_file_resync(struct obd_client_handle *och,
                                struct inode *inode, unsigned long arg)
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct md_op_data *op_data;
        struct ll_ioc_lease_id ioc;
        __u64 data_version_unused;

        op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
                                     LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,

        /* Before starting a file resync, it's necessary to clean up the page
         * cache in client memory, otherwise once the layout version is
         * increased, writing back cached data will be denied by the OSTs. */
        rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);

        op_data->op_lease_handle = och->och_lease_handle;
        op_data->op_mirror_id = ioc.lil_mirror_id;
        rc = md_file_resync(sbi->ll_md_exp, op_data);

        ll_finish_md_op_data(op_data);
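
/*
 * Merge the size, blocks, and timestamps cached from the OSTs with the
 * metadata most recently received from the MDS, and store the result in
 * the inode.
 */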
int ll_merge_attr(const struct lu_env *env, struct inode *inode)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct cl_object *obj = lli->lli_clob;
        struct cl_attr *attr = vvp_env_thread_attr(env);

        ll_inode_size_lock(inode);

        /* Merge the timestamps most recently obtained from the MDS with the
         * timestamps obtained from the OSTs.
         *
         * Do not overwrite the atime of the inode, because it may be
         * refreshed by the file_accessed() function. If the read was served
         * from cached data, no RPC is sent, so the atime may not be
         * transferred to the OSTs at all. The MDT only updates the atime at
         * close time if it's at least 'mdd.*.atime_diff' older.
         * All in all, atime in Lustre does not strictly comply with POSIX.
         * Solving this problem would require sending an RPC to the MDT for
         * each read, which would hurt performance. */
        if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
                LTIME_S(inode->i_atime) = lli->lli_atime;
                lli->lli_update_atime = 0;
        LTIME_S(inode->i_mtime) = lli->lli_mtime;
        LTIME_S(inode->i_ctime) = lli->lli_ctime;

        atime = LTIME_S(inode->i_atime);
        mtime = LTIME_S(inode->i_mtime);
        ctime = LTIME_S(inode->i_ctime);

        cl_object_attr_lock(obj);
        if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
                rc = cl_object_attr_get(env, obj, attr);
        cl_object_attr_unlock(obj);

                GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));

        if (atime < attr->cat_atime)
                atime = attr->cat_atime;

        if (ctime < attr->cat_ctime)
                ctime = attr->cat_ctime;

        if (mtime < attr->cat_mtime)
                mtime = attr->cat_mtime;

        CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
               PFID(&lli->lli_fid), attr->cat_size);

        i_size_write(inode, attr->cat_size);
        inode->i_blocks = attr->cat_blocks;

        LTIME_S(inode->i_atime) = atime;
        LTIME_S(inode->i_mtime) = mtime;
        LTIME_S(inode->i_ctime) = ctime;

        ll_inode_size_unlock(inode);
 * Set the designated mirror for I/O.
 *
 * So far only read, write, and truncate can issue I/O to a designated
 * mirror.
void ll_io_set_mirror(struct cl_io *io, const struct file *file)
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

        /* Clear the layout version for generic (non-resync) I/O in case it
         * carries a stale layout version due to an I/O restart */
        io->ci_layout_version = 0;

        /* FLR: disable non-delay for designated mirror I/O because obviously
         * only one mirror is available */
        if (fd->fd_designated_mirror > 0) {
                io->ci_designated_mirror = fd->fd_designated_mirror;
                io->ci_layout_version = fd->fd_layout_version;
                io->ci_pio = 0; /* doesn't have a mechanism to pass mirror

        CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
               file->f_path.dentry->d_name.name, io->ci_designated_mirror);
static bool file_is_noatime(const struct file *file)
        const struct vfsmount *mnt = file->f_path.mnt;
        const struct inode *inode = file_inode((struct file *)file);

        /* Adapted from file_accessed() and touch_atime(). */
        if (file->f_flags & O_NOATIME)

        if (inode->i_flags & S_NOATIME)

        if (IS_NOATIME(inode))

        if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))

        if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))

        if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))

static int ll_file_io_ptask(struct cfs_ptask *ptask);
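
/*
 * Initialize a cl_io for a read or write on @file: set up the embedded
 * kiocb and iterator, propagate the relevant O_APPEND/O_SYNC/O_DIRECT/
 * O_NONBLOCK flags, and choose the lock requirements for the I/O.
 */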
static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
        struct inode *inode = file_inode(file);
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);

        memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
        init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
        io->u.ci_rw.rw_file = file;
        io->u.ci_rw.rw_ptask = ll_file_io_ptask;
        io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
        io->ci_lock_no_expand = fd->ll_lock_no_expand;

        if (iot == CIT_WRITE) {
                io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
                io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
                                         file->f_flags & O_DIRECT ||
        io->ci_obj = ll_i2info(inode)->lli_clob;
        io->ci_lockreq = CILR_MAYBE;
        if (ll_file_nolock(file)) {
                io->ci_lockreq = CILR_NEVER;
                io->ci_no_srvlock = 1;
        } else if (file->f_flags & O_APPEND) {
                io->ci_lockreq = CILR_MANDATORY;
        io->ci_noatime = file_is_noatime(file);
        if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
                io->ci_pio = !io->u.ci_rw.rw_append;

        /* FLR: only use non-delay I/O for reads, as there is only one
         * available mirror for writes. */
        io->ci_ndelay = !(iot == CIT_WRITE);

        ll_io_set_mirror(io, file);
static int ll_file_io_ptask(struct cfs_ptask *ptask)
        struct cl_io_pt *pt = ptask->pt_cbdata;
        struct file *file = pt->cip_file;
        loff_t pos = pt->cip_pos;

        CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
               file_dentry(file)->d_name.name,
               pt->cip_iot == CIT_READ ? "read" : "write",
               pos, pos + pt->cip_count);

        env = cl_env_get(&refcheck);
                RETURN(PTR_ERR(env));

        io = vvp_env_thread_io(env);
        ll_io_init(io, file, pt->cip_iot);
        io->u.ci_rw.rw_iter = pt->cip_iter;
        io->u.ci_rw.rw_iocb = pt->cip_iocb;
        io->ci_pio = 0; /* It's already in parallel task */

        rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
                           pt->cip_count - pt->cip_result);
                struct vvp_io *vio = vvp_env_io(env);

                vio->vui_io_subtype = IO_NORMAL;
                vio->vui_fd = LUSTRE_FPRIVATE(file);

                ll_cl_add(file, env, io, LCC_RW);
                rc = cl_io_loop(env, io);
                ll_cl_remove(file, env);

                /* cl_io_rw_init() handled IO */

        if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {

        if (io->ci_nob > 0) {
                pt->cip_result += io->ci_nob;
                iov_iter_advance(&pt->cip_iter, io->ci_nob);
                pt->cip_iocb.ki_pos = pos;
#ifdef HAVE_KIOCB_KI_LEFT
                pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
#elif defined(HAVE_KI_NBYTES)
                pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;

        cl_io_fini(env, io);
        cl_env_put(env, &refcheck);

        pt->cip_need_restart = io->ci_need_restart;

        CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
               file_dentry(file)->d_name.name,
               pt->cip_iot == CIT_READ ? "read" : "write",
               pt->cip_result, rc);

        RETURN(pt->cip_result > 0 ? 0 : rc);
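
/*
 * Common read/write entry point: set up a cl_io for the range, take the
 * range lock where needed to serialize with other I/O on the file, run the
 * cl_io loop, and restart the I/O (e.g. for FLR mirror retry or a layout
 * change) when the lower layers request it.
 */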
ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
                   struct file *file, enum cl_io_type iot,
                   loff_t *ppos, size_t count)
        struct range_lock range;
        struct vvp_io *vio = vvp_env_io(env);
        struct inode *inode = file_inode(file);
        struct ll_inode_info *lli = ll_i2info(inode);
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        unsigned retried = 0;
        bool restarted = false;

        CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
               file_dentry(file)->d_name.name,
               iot == CIT_READ ? "read" : "write", pos, pos + count);

        io = vvp_env_thread_io(env);
        ll_io_init(io, file, iot);
        if (args->via_io_subtype == IO_NORMAL) {
                io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
                io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
        if (args->via_io_subtype != IO_NORMAL || restarted)
        io->ci_ndelay_tried = retried;

        if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
                bool range_locked = false;

                if (file->f_flags & O_APPEND)
                        range_lock_init(&range, 0, LUSTRE_EOF);
                        range_lock_init(&range, pos, pos + count - 1);

                vio->vui_fd = LUSTRE_FPRIVATE(file);
                vio->vui_io_subtype = args->via_io_subtype;

                switch (vio->vui_io_subtype) {
                        /* Direct IO reads must also take the range lock, or
                         * multiple reads will try to work on the same pages;
                         * see LU-6227 for details. */
                        if (((iot == CIT_WRITE) ||
                             (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
                            !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
                                CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
                                rc = range_lock(&lli->lli_write_tree, &range);
                                range_locked = true;
                        vio->u.splice.vui_pipe = args->u.splice.via_pipe;
                        vio->u.splice.vui_flags = args->u.splice.via_flags;
                        CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);

                ll_cl_add(file, env, io, LCC_RW);
                if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
                    !lli->lli_inode_locked) {
                        lli->lli_inode_locked = 1;
                rc = cl_io_loop(env, io);
                if (lli->lli_inode_locked) {
                        lli->lli_inode_locked = 0;
                        inode_unlock(inode);
                ll_cl_remove(file, env);

                        CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
                        range_unlock(&lli->lli_write_tree, &range);
                /* cl_io_rw_init() handled IO */

        if (io->ci_nob > 0) {
                result += io->ci_nob;
                count -= io->ci_nob;

                if (args->via_io_subtype == IO_NORMAL) {
                        iov_iter_advance(args->u.normal.via_iter, io->ci_nob);

                        /* CLIO is too complicated. See LU-11069. */
                        if (cl_io_is_append(io))
                                pos = io->u.ci_rw.rw_iocb.ki_pos;

                        args->u.normal.via_iocb->ki_pos = pos;
#ifdef HAVE_KIOCB_KI_LEFT
                        args->u.normal.via_iocb->ki_left = count;
#elif defined(HAVE_KI_NBYTES)
                        args->u.normal.via_iocb->ki_nbytes = count;
                        pos = io->u.ci_rw.rw_range.cir_pos;

        cl_io_fini(env, io);
               "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
               file->f_path.dentry->d_name.name,
               iot, rc, result, io->ci_need_restart);

        if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
                       "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
                       file_dentry(file)->d_name.name,
                       iot == CIT_READ ? "read" : "write",
                       pos, pos + count, result, rc);
                /* preserve the tried count for FLR */
                retried = io->ci_ndelay_tried;

        if (iot == CIT_READ) {
                        ll_stats_ops_tally(ll_i2sbi(inode),
                                           LPROC_LL_READ_BYTES, result);
        } else if (iot == CIT_WRITE) {
                        ll_stats_ops_tally(ll_i2sbi(inode),
                                           LPROC_LL_WRITE_BYTES, result);
                        fd->fd_write_failed = false;
                } else if (result == 0 && rc == 0) {
                                fd->fd_write_failed = true;
                                fd->fd_write_failed = false;
                } else if (rc != -ERESTARTSYS) {
                        fd->fd_write_failed = true;

        CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
               file_dentry(file)->d_name.name,
               iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);

        RETURN(result > 0 ? result : rc);
 * The purpose of fast read is to overcome the per-I/O overhead and improve
 * IOPS, especially for small I/O.
 *
 * To serve a read request, CLIO has to create and initialize a cl_io and
 * then request a DLM lock. This has turned out to have significant overhead
 * and affects the performance of small I/O dramatically.
 *
 * It's not necessary to create a cl_io for each I/O. With the help of read
 * ahead, most of the pages being read are already in the memory cache and we
 * can read those pages directly, because if the pages exist, the
 * corresponding DLM lock must exist, so the page content must be valid.
 *
 * In the fast read implementation, llite speculatively finds and reads pages
 * in the memory cache. There are three scenarios for fast read:
 *   - If the page exists and is uptodate, the kernel VM will provide the
 *     data and CLIO won't be intervened;
 *   - If the page was brought into memory by read ahead, it will be exported
 *     and the read ahead parameters will be updated;
 *   - Otherwise the page is not in memory and we can't do fast read. In that
 *     case it will go back and invoke normal read, i.e., a cl_io will be
 *     created and a DLM lock will be requested.
 *
 * POSIX compliance: the POSIX standard states that read is intended to be
 * atomic. The Lustre read implementation is in line with the Linux kernel
 * read implementation and neither of them complies with the POSIX standard
 * in this matter. Fast read doesn't make the situation worse on a single
 * node, but it may interleave write results from multiple nodes due to the
 * short read handling in ll_file_aio_read().
 *
 * \param env  - lu_env
 * \param iocb - kiocb from the kernel
 * \param iter - user space buffers where the data will be copied
 *
 * \retval - number of bytes read, or an error code if an error occurred.
ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
        if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))

        /* NB: we can't do direct IO for fast read because it would need a
         * lock to make the IO engine happy. */
        if (iocb->ki_filp->f_flags & O_DIRECT)

        result = generic_file_read_iter(iocb, iter);

        /* If the first page is not in the cache, generic_file_aio_read()
         * returns -ENODATA. See the corresponding code in ll_readpage(). */
        if (result == -ENODATA)

                ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
                                   LPROC_LL_READ_BYTES, result);
 * Read from a file (through the page cache).
static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
        struct vvp_io_args *args;

        result = ll_do_fast_read(iocb, to);
        if (result < 0 || iov_iter_count(to) == 0)

        env = cl_env_get(&refcheck);
                return PTR_ERR(env);

        args = ll_env_args(env, IO_NORMAL);
        args->u.normal.via_iter = to;
        args->u.normal.via_iocb = iocb;

        rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
                                 &iocb->ki_pos, iov_iter_count(to));
        else if (result == 0)

        cl_env_put(env, &refcheck);
 * Similar trick to ll_do_fast_read; this improves write speed for tiny writes.
 * If a page is already in the page cache and dirty (and some other things -
 * see ll_tiny_write_begin for the instantiation of these rules), then we can
 * write to it without doing a full I/O, because Lustre already knows about it
 * and will write it out. This saves a lot of processing time.
 *
 * All writes here are within one page, so exclusion is handled by the page
 * lock on the vm page. We do not do tiny writes for writes which touch
 * multiple pages because it's very unlikely multiple sequential pages are
 * already dirty.
 *
 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively
 * common and are unlikely to be to already dirty pages.
 *
 * Attribute updates are important here; we do them in ll_tiny_write_end.
static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
        ssize_t count = iov_iter_count(iter);
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(file);

        /* Restrict writes to a single page and < PAGE_SIZE. See the comment
         * at the top of this function for why.
        if (count >= PAGE_SIZE ||
            (iocb->ki_pos & (PAGE_SIZE - 1)) + count > PAGE_SIZE)

        result = __generic_file_write_iter(iocb, iter);

        /* If the page is not already dirty, ll_tiny_write_begin returns
         * -ENODATA. We continue on to a normal write.
        if (result == -ENODATA)

                ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
                ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);

        CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
 * Write to a file (through the page cache).
static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        struct vvp_io_args *args;
        ssize_t rc_tiny = 0, rc_normal;

        /* NB: we can't do direct IO for tiny writes because they use the page
         * cache, we can't do sync writes because tiny writes can't flush
         * pages, and we can't do append writes because we can't guarantee the
         * required DLM locks are held to protect file size.
        if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
            !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
                rc_tiny = ll_do_tiny_write(iocb, from);

        /* In case of error, go on and try a normal write; only stop if the
         * tiny write completed the I/O.
        if (iov_iter_count(from) == 0)
                GOTO(out, rc_normal = rc_tiny);

        env = cl_env_get(&refcheck);
                return PTR_ERR(env);

        args = ll_env_args(env, IO_NORMAL);
        args->u.normal.via_iter = from;
        args->u.normal.via_iocb = iocb;

        rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
                                       &iocb->ki_pos, iov_iter_count(from));

        /* On success, combine bytes written. */
        if (rc_tiny >= 0 && rc_normal > 0)
                rc_normal += rc_tiny;
        /* On error, only return the error from the normal write if the tiny
         * write did not write any bytes. Otherwise return the bytes written
         * by the tiny write.
        else if (rc_tiny > 0)
                rc_normal = rc_tiny;

        cl_env_put(env, &refcheck);
#ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
static int ll_file_get_iov_count(const struct iovec *iov,
                                 unsigned long *nr_segs, size_t *count)
        for (seg = 0; seg < *nr_segs; seg++) {
                const struct iovec *iv = &iov[seg];

                 * If any segment has a negative length, or the cumulative
                 * length ever wraps negative then return -EINVAL.
                if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
                if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
                cnt -= iv->iov_len;     /* This segment is no good */

static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
                                unsigned long nr_segs, loff_t pos)
        result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
        iov_iter_init(&to, READ, iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
        iov_iter_init(&to, iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

        result = ll_file_read_iter(iocb, &to);

static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
        struct iovec iov = { .iov_base = buf, .iov_len = count };

        init_sync_kiocb(&kiocb, file);
        kiocb.ki_pos = *ppos;
#ifdef HAVE_KIOCB_KI_LEFT
        kiocb.ki_left = count;
#elif defined(HAVE_KI_NBYTES)
        kiocb.ki_nbytes = count;

        result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
        *ppos = kiocb.ki_pos;
 * Write to a file (through the page cache).
static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                                 unsigned long nr_segs, loff_t pos)
        struct iov_iter from;

        result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);

# ifdef HAVE_IOV_ITER_INIT_DIRECTION
        iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
# else /* !HAVE_IOV_ITER_INIT_DIRECTION */
        iov_iter_init(&from, iov, nr_segs, iov_count, 0);
# endif /* HAVE_IOV_ITER_INIT_DIRECTION */

        result = ll_file_write_iter(iocb, &from);

static ssize_t ll_file_write(struct file *file, const char __user *buf,
                             size_t count, loff_t *ppos)
        struct iovec iov = { .iov_base = (void __user *)buf,

        init_sync_kiocb(&kiocb, file);
        kiocb.ki_pos = *ppos;
#ifdef HAVE_KIOCB_KI_LEFT
        kiocb.ki_left = count;
#elif defined(HAVE_KI_NBYTES)
        kiocb.ki_nbytes = count;

        result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
        *ppos = kiocb.ki_pos;

#endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
 * Send file content (through pagecache) somewhere with helper
static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
                                   struct pipe_inode_info *pipe, size_t count,
        struct vvp_io_args *args;

        env = cl_env_get(&refcheck);
                RETURN(PTR_ERR(env));

        args = ll_env_args(env, IO_SPLICE);
        args->u.splice.via_pipe = pipe;
        args->u.splice.via_flags = flags;

        result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
        cl_env_put(env, &refcheck);
int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
                             __u64 flags, struct lov_user_md *lum, int lum_size)
        struct lookup_intent oit = {
                .it_flags = flags | MDS_OPEN_BY_FID,

        ll_inode_size_lock(inode);
        rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
                GOTO(out_unlock, rc);

        ll_release_openhandle(dentry, &oit);

        ll_inode_size_unlock(inode);
        ll_intent_release(&oit);
int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
                             struct lov_mds_md **lmmp, int *lmm_size,
                             struct ptlrpc_request **request)
        struct ll_sb_info *sbi = ll_i2sbi(inode);
        struct mdt_body *body;
        struct lov_mds_md *lmm = NULL;
        struct ptlrpc_request *req = NULL;
        struct md_op_data *op_data;

        rc = ll_get_default_mdsize(sbi, &lmmsize);

        op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
                                     strlen(filename), lmmsize,
                                     LUSTRE_OPC_ANY, NULL);
        if (IS_ERR(op_data))
                RETURN(PTR_ERR(op_data));

        op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
        rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
        ll_finish_md_op_data(op_data);
                CDEBUG(D_INFO, "md_getattr_name failed "
                       "on %s: rc %d\n", filename, rc);

        body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
        LASSERT(body != NULL); /* checked by mdc_getattr_name */

        lmmsize = body->mbo_eadatasize;

        if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
                GOTO(out, rc = -ENODATA);

        lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
        LASSERT(lmm != NULL);

        if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
            lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
            lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
                GOTO(out, rc = -EPROTO);

        /*
         * This is coming from the MDS, so it is probably in
         * little endian. We convert it to host endian before
         * passing it to userspace.
         */
        if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
                    lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
                        stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
                        if (le32_to_cpu(lmm->lmm_pattern) &
                            LOV_PATTERN_F_RELEASED)

                /* If the function was called for a directory, we should
                 * avoid swabbing non-existent lsm objects */
                if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
                        lustre_swab_lov_user_md_v1(
                                (struct lov_user_md_v1 *)lmm);
                        if (S_ISREG(body->mbo_mode))
                                lustre_swab_lov_user_md_objects(
                                        ((struct lov_user_md_v1 *)lmm)->lmm_objects,
                } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
                        lustre_swab_lov_user_md_v3(
                                (struct lov_user_md_v3 *)lmm);
                        if (S_ISREG(body->mbo_mode))
                                lustre_swab_lov_user_md_objects(
                                        ((struct lov_user_md_v3 *)lmm)->lmm_objects,
                } else if (lmm->lmm_magic ==
                           cpu_to_le32(LOV_MAGIC_COMP_V1)) {
                        lustre_swab_lov_comp_md_v1(
                                (struct lov_comp_md_v1 *)lmm);

        *lmm_size = lmmsize;
static int ll_lov_setea(struct inode *inode, struct file *file,
        __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
        struct lov_user_md *lump;
        int lum_size = sizeof(struct lov_user_md) +
                       sizeof(struct lov_user_ost_data);

        if (!cfs_capable(CFS_CAP_SYS_ADMIN))

        OBD_ALLOC_LARGE(lump, lum_size);

        if (copy_from_user(lump, arg, lum_size))
                GOTO(out_lump, rc = -EFAULT);

        rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
        cl_lov_delay_create_clear(&file->f_flags);

        OBD_FREE_LARGE(lump, lum_size);
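
/* Copy the file layout to the user buffer @lum via cl_object_getstripe(). */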
static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
        env = cl_env_get(&refcheck);
                RETURN(PTR_ERR(env));

        rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
        cl_env_put(env, &refcheck);
static int ll_lov_setstripe(struct inode *inode, struct file *file,
        struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
        struct lov_user_md *klum;
        __u64 flags = FMODE_WRITE;

        rc = ll_copy_user_md(lum, &klum);

        rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
                rc = put_user(0, &lum->lmm_stripe_count);

                rc = ll_layout_refresh(inode, &gen);

                rc = ll_file_getstripe(inode, arg, lum_size);
        cl_lov_delay_create_clear(&file->f_flags);

        OBD_FREE(klum, lum_size);
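
/*
 * Take a group lock with the given group id on the file. For a composite
 * (PFL) layout, all OST objects are instantiated first so that the group
 * lock covers every object of the file.
 */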
ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
        struct ll_inode_info *lli = ll_i2info(inode);
        struct cl_object *obj = lli->lli_clob;
        struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
        struct ll_grouplock grouplock;

                CWARN("group id for group lock must not be 0\n");

        if (ll_file_nolock(file))
                RETURN(-EOPNOTSUPP);

        spin_lock(&lli->lli_lock);
        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
                CWARN("group lock already existed with gid %lu\n",
                      fd->fd_grouplock.lg_gid);
                spin_unlock(&lli->lli_lock);
        LASSERT(fd->fd_grouplock.lg_lock == NULL);
        spin_unlock(&lli->lli_lock);

        /*
         * XXX: the group lock needs to protect all OST objects, while PFL
         * can add new OST objects during the IO, so we instantiate all OST
         * objects before getting the group lock.
         */
                struct cl_layout cl = {
                        .cl_is_composite = false,
                struct lu_extent ext = {
                        .e_end = OBD_OBJECT_EOF,

                env = cl_env_get(&refcheck);
                        RETURN(PTR_ERR(env));

                rc = cl_object_layout_get(env, obj, &cl);
                if (!rc && cl.cl_is_composite)
                        rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,

                cl_env_put(env, &refcheck);

        rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
                              arg, (file->f_flags & O_NONBLOCK), &grouplock);

        spin_lock(&lli->lli_lock);
        if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
                spin_unlock(&lli->lli_lock);
                CERROR("another thread just won the race\n");
                cl_put_grouplock(&grouplock);

        fd->fd_flags |= LL_FILE_GROUP_LOCKED;
        fd->fd_grouplock = grouplock;
        spin_unlock(&lli->lli_lock);

        CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2257 static int ll_put_grouplock(struct inode *inode, struct file *file,
2260 struct ll_inode_info *lli = ll_i2info(inode);
2261 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2262 struct ll_grouplock grouplock;
2265 spin_lock(&lli->lli_lock);
2266 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2267 spin_unlock(&lli->lli_lock);
2268 CWARN("no group lock held\n");
2272 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2274 if (fd->fd_grouplock.lg_gid != arg) {
2275 CWARN("group lock %lu doesn't match current id %lu\n",
2276 arg, fd->fd_grouplock.lg_gid);
2277 spin_unlock(&lli->lli_lock);
2281 grouplock = fd->fd_grouplock;
2282 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2283 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2284 spin_unlock(&lli->lli_lock);
2286 cl_put_grouplock(&grouplock);
2287 CDEBUG(D_INFO, "group lock %lu released\n", arg);
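/*
 * Illustrative userspace sketch (not built with this file): how an
 * application drives ll_get_grouplock()/ll_put_grouplock() above through
 * the LL_IOC_GROUP_LOCK/LL_IOC_GROUP_UNLOCK ioctls.  The gid of 1234 is an
 * arbitrary example; any non-zero id shared by the cooperating processes
 * works.
 */
#if 0	/* example only; assumes <lustre/lustre_user.h> is installed */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <lustre/lustre_user.h>

static int do_grouplocked_io(const char *path)
{
	unsigned long gid = 1234;	/* must be non-zero, see above */
	int fd, rc;

	fd = open(path, O_RDWR);
	if (fd < 0)
		return -1;

	rc = ioctl(fd, LL_IOC_GROUP_LOCK, gid);
	if (rc == 0) {
		/* ... perform the group-locked I/O here ... */
		rc = ioctl(fd, LL_IOC_GROUP_UNLOCK, gid);
	}
	close(fd);
	return rc;
}
#endif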
2292 * Close inode open handle
2294 * \param dentry [in] dentry which contains the inode
2295 * \param it [in,out] intent which contains open info and result
2298 * \retval <0 failure
2300 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2302 struct inode *inode = dentry->d_inode;
2303 struct obd_client_handle *och;
2309 /* Root ? Do nothing. */
2310 if (dentry->d_inode->i_sb->s_root == dentry)
2313 /* No open handle to close? Move away */
2314 if (!it_disposition(it, DISP_OPEN_OPEN))
2317 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2319 OBD_ALLOC(och, sizeof(*och));
2321 GOTO(out, rc = -ENOMEM);
2323 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2325 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2327 /* this one is in place of ll_file_open */
2328 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2329 ptlrpc_req_finished(it->it_request);
2330 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2336 * Get size for inode for which FIEMAP mapping is requested.
2337 * Make the FIEMAP get_info call and return the result.
2338 * \param fiemap kernel buffer to hold extents
2339 * \param num_bytes kernel buffer size
2341 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2347 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2350 /* Checks for fiemap flags */
2351 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2352 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2356 /* Check for FIEMAP_FLAG_SYNC */
2357 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2358 rc = filemap_fdatawrite(inode->i_mapping);
2363 env = cl_env_get(&refcheck);
2365 RETURN(PTR_ERR(env));
2367 if (i_size_read(inode) == 0) {
2368 rc = ll_glimpse_size(inode);
2373 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2374 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2375 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2377 /* If the file size is 0, then there are no objects to map */
2378 if (fmkey.lfik_oa.o_size == 0) {
2379 fiemap->fm_mapped_extents = 0;
2383 fmkey.lfik_fiemap = *fiemap;
2385 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2386 &fmkey, fiemap, &num_bytes);
2388 cl_env_put(env, &refcheck);
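/*
 * Illustrative userspace sketch (not built with this file): the standard
 * FS_IOC_FIEMAP ioctl that reaches ll_do_fiemap() above via the ->fiemap()
 * method.  The extent count of 32 is an arbitrary example.
 */
#if 0	/* example only */
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

static int count_mapped_extents(int fd)
{
	unsigned int count = 32;
	struct fiemap *fm;
	int rc;

	fm = calloc(1, sizeof(*fm) + count * sizeof(struct fiemap_extent));
	if (fm == NULL)
		return -1;

	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;	/* map the whole file */
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* flush dirty pages first */
	fm->fm_extent_count = count;

	rc = ioctl(fd, FS_IOC_FIEMAP, fm);
	if (rc == 0)
		rc = fm->fm_mapped_extents;
	free(fm);
	return rc;
}
#endif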
2392 int ll_fid2path(struct inode *inode, void __user *arg)
2394 struct obd_export *exp = ll_i2mdexp(inode);
2395 const struct getinfo_fid2path __user *gfin = arg;
2397 struct getinfo_fid2path *gfout;
2403 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2404 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2407 /* Only need to get the buflen */
2408 if (get_user(pathlen, &gfin->gf_pathlen))
2411 if (pathlen > PATH_MAX)
2414 outsize = sizeof(*gfout) + pathlen;
2415 OBD_ALLOC(gfout, outsize);
2419 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2420 GOTO(gf_free, rc = -EFAULT);
2421 /* Append the root FID after gfout to let the MDT know the root FID so
2422 * that it can look up the correct path; this is mainly for fileset.
2423 * Old servers without fileset mount support will ignore this. */
2424 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2426 /* Call mdc_iocontrol */
2427 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2431 if (copy_to_user(arg, gfout, outsize))
2435 OBD_FREE(gfout, outsize);
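/*
 * Illustrative userspace sketch (not built with this file): resolving a FID
 * to a path with llapi_fid2path() from liblustreapi, which issues the
 * OBD_IOC_FID2PATH request handled by ll_fid2path() above.  The mount point
 * and FID string are example values.
 */
#if 0	/* example only; assumes liblustreapi headers are installed */
#include <limits.h>
#include <stdio.h>
#include <lustre/lustreapi.h>

static int print_path_of_fid(void)
{
	char path[PATH_MAX];
	long long recno = -1;
	int linkno = 0;
	int rc;

	rc = llapi_fid2path("/mnt/lustre", "[0x200000401:0x1:0x0]",
			    path, sizeof(path), &recno, &linkno);
	if (rc == 0)
		printf("path: %s\n", path);
	return rc;
}
#endif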
2440 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2442 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2450 ioc->idv_version = 0;
2451 ioc->idv_layout_version = UINT_MAX;
2453 /* If no file object has been initialized, consider its version to be 0. */
2457 env = cl_env_get(&refcheck);
2459 RETURN(PTR_ERR(env));
2461 io = vvp_env_thread_io(env);
2463 io->u.ci_data_version.dv_data_version = 0;
2464 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2465 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2468 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2469 result = cl_io_loop(env, io);
2471 result = io->ci_result;
2473 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2474 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2476 cl_io_fini(env, io);
2478 if (unlikely(io->ci_need_restart))
2481 cl_env_put(env, &refcheck);
2487 * Read the data_version for inode.
2489 * This value is computed using stripe object version on OST.
2490 * The version is computed using server-side locking.
2492 * @param flags whether to sync on the OST side;
2494 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2495 * LL_DV_WR_FLUSH: drop all cached pages, LCK_PW on OSTs
2497 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2499 struct ioc_data_version ioc = { .idv_flags = flags };
2502 rc = ll_ioc_data_version(inode, &ioc);
2504 *data_version = ioc.idv_version;
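/*
 * Illustrative userspace sketch (not built with this file): sampling the
 * data version with llapi_get_data_version() from liblustreapi, which
 * funnels into ll_data_version() above.  LL_DV_RD_FLUSH asks the OSTs to
 * flush dirty pages before the version is computed.
 */
#if 0	/* example only; assumes liblustreapi headers are installed */
#include <stdint.h>
#include <lustre/lustreapi.h>

static int sample_data_version(int fd, uint64_t *dv)
{
	return llapi_get_data_version(fd, dv, LL_DV_RD_FLUSH);
}
#endif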
2510 * Trigger a HSM release request for the provided inode.
2512 int ll_hsm_release(struct inode *inode)
2515 struct obd_client_handle *och = NULL;
2516 __u64 data_version = 0;
2521 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2522 ll_get_fsname(inode->i_sb, NULL, 0),
2523 PFID(&ll_i2info(inode)->lli_fid));
2525 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2527 GOTO(out, rc = PTR_ERR(och));
2529 /* Grab latest data_version and [am]time values */
2530 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2534 env = cl_env_get(&refcheck);
2536 GOTO(out, rc = PTR_ERR(env));
2538 rc = ll_merge_attr(env, inode);
2539 cl_env_put(env, &refcheck);
2541 /* If an error happens, we have the wrong size for the file.
2547 /* Release the file.
2548 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2549 * we still need it to pack l_remote_handle to MDT. */
2550 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2556 if (och != NULL && !IS_ERR(och)) /* close the file */
2557 ll_lease_close(och, inode, NULL);
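/*
 * Illustrative userspace sketch (not built with this file): observing the
 * outcome of a release with llapi_hsm_state_get() from liblustreapi; a
 * successfully released file reports HS_RELEASED (alongside HS_ARCHIVED)
 * in hus_states.
 */
#if 0	/* example only; assumes liblustreapi headers are installed */
#include <lustre/lustreapi.h>

static int file_is_released(const char *path)
{
	struct hsm_user_state hus;
	int rc;

	rc = llapi_hsm_state_get(path, &hus);
	if (rc < 0)
		return rc;
	return (hus.hus_states & HS_RELEASED) != 0;
}
#endif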
2562 struct ll_swap_stack {
2565 struct inode *inode1;
2566 struct inode *inode2;
2571 static int ll_swap_layouts(struct file *file1, struct file *file2,
2572 struct lustre_swap_layouts *lsl)
2574 struct mdc_swap_layouts msl;
2575 struct md_op_data *op_data;
2578 struct ll_swap_stack *llss = NULL;
2581 OBD_ALLOC_PTR(llss);
2585 llss->inode1 = file_inode(file1);
2586 llss->inode2 = file_inode(file2);
2588 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2592 /* we use 2 bools because they are easier to swap than 2 bits */
2593 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2594 llss->check_dv1 = true;
2596 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2597 llss->check_dv2 = true;
2599 /* we cannot use lsl->sl_dvX directly because we may swap them */
2600 llss->dv1 = lsl->sl_dv1;
2601 llss->dv2 = lsl->sl_dv2;
2603 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2604 if (rc == 0) /* same file, done! */
2607 if (rc < 0) { /* sequentialize it */
2608 swap(llss->inode1, llss->inode2);
2610 swap(llss->dv1, llss->dv2);
2611 swap(llss->check_dv1, llss->check_dv2);
2615 if (gid != 0) { /* application asks to flush dirty cache */
2616 rc = ll_get_grouplock(llss->inode1, file1, gid);
2620 rc = ll_get_grouplock(llss->inode2, file2, gid);
2622 ll_put_grouplock(llss->inode1, file1, gid);
2627 /* ultimate check: before swapping the layouts we check whether the
2628 * data version has changed (if requested) */
2629 if (llss->check_dv1) {
2630 rc = ll_data_version(llss->inode1, &dv, 0);
2633 if (dv != llss->dv1)
2634 GOTO(putgl, rc = -EAGAIN);
2637 if (llss->check_dv2) {
2638 rc = ll_data_version(llss->inode2, &dv, 0);
2641 if (dv != llss->dv2)
2642 GOTO(putgl, rc = -EAGAIN);
2645 /* struct md_op_data is used to send the swap args to the MDT;
2646 * only the flags field is missing, so we pass struct mdc_swap_layouts
2647 * through md_op_data->op_data */
2648 /* flags from user space have to be converted before they are sent to
2649 * the server; no flags are sent today, they are only used on the client */
2652 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2653 0, LUSTRE_OPC_ANY, &msl);
2654 if (IS_ERR(op_data))
2655 GOTO(free, rc = PTR_ERR(op_data));
2657 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2658 sizeof(*op_data), op_data, NULL);
2659 ll_finish_md_op_data(op_data);
2666 ll_put_grouplock(llss->inode2, file2, gid);
2667 ll_put_grouplock(llss->inode1, file1, gid);
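/*
 * Illustrative userspace sketch (not built with this file): swapping two
 * files' layouts with llapi_swap_layouts() from liblustreapi, which ends up
 * in ll_swap_layouts() above.  The SWAP_LAYOUTS_CHECK_DV* flags request the
 * data-version checks performed right before the swap.
 */
#if 0	/* example only; assumes liblustreapi headers are installed */
#include <lustre/lustreapi.h>

static int swap_with_dv_check(const char *path1, const char *path2,
			      unsigned long long dv1, unsigned long long dv2)
{
	return llapi_swap_layouts(path1, path2, dv1, dv2,
				  SWAP_LAYOUTS_CHECK_DV1 |
				  SWAP_LAYOUTS_CHECK_DV2);
}
#endif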
2677 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2679 struct obd_export *exp = ll_i2mdexp(inode);
2680 struct md_op_data *op_data;
2684 /* Detect out-of-range masks */
2685 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2688 /* Non-root users are forbidden to set or clear flags which are
2689 * NOT defined in HSM_USER_MASK. */
2690 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2691 !cfs_capable(CFS_CAP_SYS_ADMIN))
2694 if (!exp_connect_archive_id_array(exp)) {
2695 /* Detect an out-of-range archive id */
2696 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2697 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2701 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2702 LUSTRE_OPC_ANY, hss);
2703 if (IS_ERR(op_data))
2704 RETURN(PTR_ERR(op_data));
2706 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2709 ll_finish_md_op_data(op_data);
2714 static int ll_hsm_import(struct inode *inode, struct file *file,
2715 struct hsm_user_import *hui)
2717 struct hsm_state_set *hss = NULL;
2718 struct iattr *attr = NULL;
2722 if (!S_ISREG(inode->i_mode))
2728 GOTO(out, rc = -ENOMEM);
2730 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2731 hss->hss_archive_id = hui->hui_archive_id;
2732 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2733 rc = ll_hsm_state_set(inode, hss);
2737 OBD_ALLOC_PTR(attr);
2739 GOTO(out, rc = -ENOMEM);
2741 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2742 attr->ia_mode |= S_IFREG;
2743 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2744 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2745 attr->ia_size = hui->hui_size;
2746 attr->ia_mtime.tv_sec = hui->hui_mtime;
2747 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2748 attr->ia_atime.tv_sec = hui->hui_atime;
2749 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2751 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2752 ATTR_UID | ATTR_GID |
2753 ATTR_MTIME | ATTR_MTIME_SET |
2754 ATTR_ATIME | ATTR_ATIME_SET;
2758 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2762 inode_unlock(inode);
2774 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2776 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2777 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2780 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2782 struct inode *inode = file_inode(file);
2784 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2785 ATTR_MTIME | ATTR_MTIME_SET |
2788 .tv_sec = lfu->lfu_atime_sec,
2789 .tv_nsec = lfu->lfu_atime_nsec,
2792 .tv_sec = lfu->lfu_mtime_sec,
2793 .tv_nsec = lfu->lfu_mtime_nsec,
2796 .tv_sec = lfu->lfu_ctime_sec,
2797 .tv_nsec = lfu->lfu_ctime_nsec,
2803 if (!capable(CAP_SYS_ADMIN))
2806 if (!S_ISREG(inode->i_mode))
2810 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2812 inode_unlock(inode);
2817 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2820 case MODE_READ_USER:
2822 case MODE_WRITE_USER:
2829 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2831 /* Used to allow the upper layers of the client to request an LDLM lock
2832 * without doing an actual read or write.
2834 * Used for ladvise lockahead to manually request specific locks.
2836 * \param[in] file file this ladvise lock request is on
2837 * \param[in] ladvise ladvise struct describing this lock request
2839 * \retval 0 success, no detailed result available (sync requests
2840 * and requests sent to the server [not handled locally]
2841 * cannot return detailed results)
2842 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2843 * see definitions for details.
2844 * \retval negative negative errno on error
2846 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2848 struct lu_env *env = NULL;
2849 struct cl_io *io = NULL;
2850 struct cl_lock *lock = NULL;
2851 struct cl_lock_descr *descr = NULL;
2852 struct dentry *dentry = file->f_path.dentry;
2853 struct inode *inode = dentry->d_inode;
2854 enum cl_lock_mode cl_mode;
2855 off_t start = ladvise->lla_start;
2856 off_t end = ladvise->lla_end;
2862 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2863 "start=%llu, end=%llu\n", dentry->d_name.len,
2864 dentry->d_name.name, dentry->d_inode,
2865 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2868 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2870 GOTO(out, result = cl_mode);
2872 /* Get IO environment */
2873 result = cl_io_get(inode, &env, &io, &refcheck);
2877 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2880 * nothing to do for this io. This currently happens when
2881 * stripe sub-objects are not yet created.
2883 result = io->ci_result;
2884 } else if (result == 0) {
2885 lock = vvp_env_lock(env);
2886 descr = &lock->cll_descr;
2888 descr->cld_obj = io->ci_obj;
2889 /* Convert byte offsets to pages */
2890 descr->cld_start = cl_index(io->ci_obj, start);
2891 descr->cld_end = cl_index(io->ci_obj, end);
2892 descr->cld_mode = cl_mode;
2893 /* CEF_MUST is used because we do not want to convert a
2894 * lockahead request to a lockless lock */
2895 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2898 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2899 descr->cld_enq_flags |= CEF_SPECULATIVE;
2901 result = cl_lock_request(env, io, lock);
2903 /* On success, we need to release the lock */
2905 cl_lock_release(env, lock);
2907 cl_io_fini(env, io);
2908 cl_env_put(env, &refcheck);
2910 /* -ECANCELED indicates a matching lock with a different extent
2911 * was already present, and -EEXIST indicates a matching lock
2912 * on exactly the same extent was already present.
2913 * We convert them to positive values for userspace to make
2914 * recognizing true errors easier.
2915 * Note we can only return these detailed results on async requests,
2916 * as sync requests look the same as i/o requests for locking. */
2917 if (result == -ECANCELED)
2918 result = LLA_RESULT_DIFFERENT;
2919 else if (result == -EEXIST)
2920 result = LLA_RESULT_SAME;
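/*
 * Illustrative userspace sketch (not built with this file): an asynchronous
 * lockahead request through llapi_ladvise() from liblustreapi, showing how
 * the LLA_RESULT_{SAME,DIFFERENT} values documented above come back in
 * lla_lockahead_result.  The 1MB extent is an example value.
 */
#if 0	/* example only; assumes liblustreapi headers are installed */
#include <string.h>
#include <lustre/lustreapi.h>

static int request_write_lock_ahead(int fd)
{
	struct llapi_lu_ladvise advise;
	int rc;

	memset(&advise, 0, sizeof(advise));
	advise.lla_advice = LU_LADVISE_LOCKAHEAD;
	advise.lla_lockahead_mode = MODE_WRITE_USER;
	advise.lla_peradvice_flags = LF_ASYNC; /* detailed results need async */
	advise.lla_start = 0;
	advise.lla_end = 1024 * 1024;

	rc = llapi_ladvise(fd, 0, 1, &advise);
	if (rc == 0 &&
	    (advise.lla_lockahead_result == LLA_RESULT_SAME ||
	     advise.lla_lockahead_result == LLA_RESULT_DIFFERENT))
		rc = 1;	/* a matching lock already existed */
	return rc;
}
#endif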
2925 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2927 static int ll_ladvise_sanity(struct inode *inode,
2928 struct llapi_lu_ladvise *ladvise)
2930 enum lu_ladvise_type advice = ladvise->lla_advice;
2931 /* Note lla_peradvice_flags is a 32-bit field, so per-advice flags must
2932 * be in the first 32 bits of enum ladvise_flags */
2933 __u32 flags = ladvise->lla_peradvice_flags;
2934 /* 3 lines at 80 characters per line, should be plenty */
2937 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2939 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized, "
2940 "last supported advice is %s (value '%d'): rc = %d\n",
2941 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2942 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2946 /* Per-advice checks */
2948 case LU_LADVISE_LOCKNOEXPAND:
2949 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2951 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2953 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2954 ladvise_names[advice], rc);
2958 case LU_LADVISE_LOCKAHEAD:
2959 /* Currently only READ and WRITE modes can be requested */
2960 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2961 ladvise->lla_lockahead_mode == 0) {
2963 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2965 ll_get_fsname(inode->i_sb, NULL, 0),
2966 ladvise->lla_lockahead_mode,
2967 ladvise_names[advice], rc);
2970 case LU_LADVISE_WILLREAD:
2971 case LU_LADVISE_DONTNEED:
2973 /* Note fall through above - These checks apply to all advices
2974 * except LOCKNOEXPAND */
2975 if (flags & ~LF_DEFAULT_MASK) {
2977 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2979 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2980 ladvise_names[advice], rc);
2983 if (ladvise->lla_start >= ladvise->lla_end) {
2985 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2986 "for %s: rc = %d\n",
2987 ll_get_fsname(inode->i_sb, NULL, 0),
2988 ladvise->lla_start, ladvise->lla_end,
2989 ladvise_names[advice], rc);
3001 * Give file access advices
3003 * The ladvise interface is similar to the Linux fadvise() system call, except
3004 * it forwards the advices directly from the Lustre client to the server. The
3005 * server-side code will apply appropriate read-ahead and caching techniques
3006 * for the corresponding files.
3008 * A typical workload for ladvise is e.g. many different clients
3009 * doing small random reads of a file, so prefetching pages into OSS cache
3010 * with big linear reads before the random IO is a net benefit. Fetching
3011 * all that data into each client cache with fadvise() may not be, due to
3012 * much more data being sent to the client.
3014 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3015 struct llapi_lu_ladvise *ladvise)
3019 struct cl_ladvise_io *lio;
3024 env = cl_env_get(&refcheck);
3026 RETURN(PTR_ERR(env));
3028 io = vvp_env_thread_io(env);
3029 io->ci_obj = ll_i2info(inode)->lli_clob;
3031 /* initialize parameters for ladvise */
3032 lio = &io->u.ci_ladvise;
3033 lio->li_start = ladvise->lla_start;
3034 lio->li_end = ladvise->lla_end;
3035 lio->li_fid = ll_inode2fid(inode);
3036 lio->li_advice = ladvise->lla_advice;
3037 lio->li_flags = flags;
3039 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3040 rc = cl_io_loop(env, io);
3044 cl_io_fini(env, io);
3045 cl_env_put(env, &refcheck);
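/*
 * Illustrative userspace sketch (not built with this file): prefetching a
 * file range into OSS cache with LU_LADVISE_WILLREAD via llapi_ladvise()
 * from liblustreapi, matching the workload described above.  The 64MB range
 * is an example value.
 */
#if 0	/* example only; assumes liblustreapi headers are installed */
#include <string.h>
#include <lustre/lustreapi.h>

static int prefetch_on_oss(int fd)
{
	struct llapi_lu_ladvise advise;

	memset(&advise, 0, sizeof(advise));
	advise.lla_advice = LU_LADVISE_WILLREAD;
	advise.lla_start = 0;
	advise.lla_end = 64ULL << 20;

	return llapi_ladvise(fd, 0, 1, &advise);
}
#endif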
3049 static int ll_lock_noexpand(struct file *file, int flags)
3051 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3053 fd->ll_lock_no_expand = !(flags & LF_UNSET);
3058 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3061 struct fsxattr fsxattr;
3063 if (copy_from_user(&fsxattr,
3064 (const struct fsxattr __user *)arg,
3068 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3069 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3070 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3071 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3072 if (copy_to_user((struct fsxattr __user *)arg,
3073 &fsxattr, sizeof(fsxattr)))
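/*
 * Illustrative userspace sketch (not built with this file): the generic
 * FS_IOC_FSGETXATTR ioctl served by ll_ioctl_fsgetxattr() above; the
 * project quota id and the PROJINHERIT flag surface through struct fsxattr
 * exactly as packed there.
 */
#if 0	/* example only */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

static int print_projid(int fd)
{
	struct fsxattr fsx;
	int rc;

	rc = ioctl(fd, FS_IOC_FSGETXATTR, &fsx);
	if (rc == 0)
		printf("projid=%u inherit=%d\n", fsx.fsx_projid,
		       !!(fsx.fsx_xflags & FS_XFLAG_PROJINHERIT));
	return rc;
}
#endif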
3079 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3082 * Project Quota ID state is only allowed to change from within the init
3083 * namespace. Enforce that restriction only if we are trying to change
3084 * the quota ID state. Everything else is allowed in user namespaces.
3086 if (current_user_ns() == &init_user_ns)
3089 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3092 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3093 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3096 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3103 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3107 struct md_op_data *op_data;
3108 struct ptlrpc_request *req = NULL;
3110 struct fsxattr fsxattr;
3111 struct cl_object *obj;
3115 if (copy_from_user(&fsxattr,
3116 (const struct fsxattr __user *)arg,
3120 rc = ll_ioctl_check_project(inode, &fsxattr);
3124 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3125 LUSTRE_OPC_ANY, NULL);
3126 if (IS_ERR(op_data))
3127 RETURN(PTR_ERR(op_data));
3129 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3130 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3131 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3132 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3133 op_data->op_projid = fsxattr.fsx_projid;
3134 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3135 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3137 ptlrpc_req_finished(req);
3139 GOTO(out_fsxattr, rc);
3140 ll_update_inode_flags(inode, op_data->op_attr_flags);
3141 obj = ll_i2info(inode)->lli_clob;
3143 GOTO(out_fsxattr, rc);
3145 OBD_ALLOC_PTR(attr);
3147 GOTO(out_fsxattr, rc = -ENOMEM);
3149 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3150 fsxattr.fsx_xflags);
3153 ll_finish_md_op_data(op_data);
3157 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3160 struct inode *inode = file_inode(file);
3161 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3162 struct ll_inode_info *lli = ll_i2info(inode);
3163 struct obd_client_handle *och = NULL;
3164 struct split_param sp;
3167 enum mds_op_bias bias = 0;
3168 struct file *layout_file = NULL;
3170 size_t data_size = 0;
3174 mutex_lock(&lli->lli_och_mutex);
3175 if (fd->fd_lease_och != NULL) {
3176 och = fd->fd_lease_och;
3177 fd->fd_lease_och = NULL;
3179 mutex_unlock(&lli->lli_och_mutex);
3182 GOTO(out, rc = -ENOLCK);
3184 fmode = och->och_flags;
3186 switch (ioc->lil_flags) {
3187 case LL_LEASE_RESYNC_DONE:
3188 if (ioc->lil_count > IOC_IDS_MAX)
3189 GOTO(out, rc = -EINVAL);
3191 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3192 OBD_ALLOC(data, data_size);
3194 GOTO(out, rc = -ENOMEM);
3196 if (copy_from_user(data, (void __user *)arg, data_size))
3197 GOTO(out, rc = -EFAULT);
3199 bias = MDS_CLOSE_RESYNC_DONE;
3201 case LL_LEASE_LAYOUT_MERGE: {
3204 if (ioc->lil_count != 1)
3205 GOTO(out, rc = -EINVAL);
3207 arg += sizeof(*ioc);
3208 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3209 GOTO(out, rc = -EFAULT);
3211 layout_file = fget(fd);
3213 GOTO(out, rc = -EBADF);
3215 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3216 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3217 GOTO(out, rc = -EPERM);
3219 data = file_inode(layout_file);
3220 bias = MDS_CLOSE_LAYOUT_MERGE;
3223 case LL_LEASE_LAYOUT_SPLIT: {
3227 if (ioc->lil_count != 2)
3228 GOTO(out, rc = -EINVAL);
3230 arg += sizeof(*ioc);
3231 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3232 GOTO(out, rc = -EFAULT);
3234 arg += sizeof(__u32);
3235 if (copy_from_user(&mirror_id, (void __user *)arg,
3237 GOTO(out, rc = -EFAULT);
3239 layout_file = fget(fdv);
3241 GOTO(out, rc = -EBADF);
3243 sp.sp_inode = file_inode(layout_file);
3244 sp.sp_mirror_id = (__u16)mirror_id;
3246 bias = MDS_CLOSE_LAYOUT_SPLIT;
3250 /* without close intent */
3254 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3258 rc = ll_lease_och_release(inode, file);
3267 switch (ioc->lil_flags) {
3268 case LL_LEASE_RESYNC_DONE:
3270 OBD_FREE(data, data_size);
3272 case LL_LEASE_LAYOUT_MERGE:
3273 case LL_LEASE_LAYOUT_SPLIT:
3280 rc = ll_lease_type_from_fmode(fmode);
3284 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3287 struct inode *inode = file_inode(file);
3288 struct ll_inode_info *lli = ll_i2info(inode);
3289 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3290 struct obd_client_handle *och = NULL;
3291 __u64 open_flags = 0;
3297 switch (ioc->lil_mode) {
3298 case LL_LEASE_WRLCK:
3299 if (!(file->f_mode & FMODE_WRITE))
3301 fmode = FMODE_WRITE;
3303 case LL_LEASE_RDLCK:
3304 if (!(file->f_mode & FMODE_READ))
3308 case LL_LEASE_UNLCK:
3309 RETURN(ll_file_unlock_lease(file, ioc, arg));
3314 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3316 /* apply for lease */
3317 if (ioc->lil_flags & LL_LEASE_RESYNC)
3318 open_flags = MDS_OPEN_RESYNC;
3319 och = ll_lease_open(inode, file, fmode, open_flags);
3321 RETURN(PTR_ERR(och));
3323 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3324 rc = ll_lease_file_resync(och, inode, arg);
3326 ll_lease_close(och, inode, NULL);
3329 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3331 ll_lease_close(och, inode, NULL);
3337 mutex_lock(&lli->lli_och_mutex);
3338 if (fd->fd_lease_och == NULL) {
3339 fd->fd_lease_och = och;
3342 mutex_unlock(&lli->lli_och_mutex);
3344 /* impossible for now, as only exclusive leases are supported */
3345 ll_lease_close(och, inode, &lease_broken);
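/*
 * Illustrative userspace sketch (not built with this file): taking a write
 * lease with the LL_IOC_SET_LEASE ioctl handled by ll_file_set_lease()
 * above.  Whether a wrapper such as llapi_lease_acquire() is available
 * depends on the liblustreapi version, so the raw ioctl is shown; struct
 * ll_ioc_lease comes from <lustre/lustre_user.h>.
 */
#if 0	/* example only; assumes <lustre/lustre_user.h> is installed */
#include <string.h>
#include <sys/ioctl.h>
#include <lustre/lustre_user.h>

static int take_write_lease(int fd)
{
	struct ll_ioc_lease ioc;

	memset(&ioc, 0, sizeof(ioc));
	ioc.lil_mode = LL_LEASE_WRLCK;	/* fd must be open for write */

	return ioctl(fd, LL_IOC_SET_LEASE, &ioc);
}
#endif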
3352 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3354 struct inode *inode = file_inode(file);
3355 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3359 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3360 PFID(ll_inode2fid(inode)), inode, cmd);
3361 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3363 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3364 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3368 case LL_IOC_GETFLAGS:
3369 /* Get the current value of the file flags */
3370 return put_user(fd->fd_flags, (int __user *)arg);
3371 case LL_IOC_SETFLAGS:
3372 case LL_IOC_CLRFLAGS:
3373 /* Set or clear specific file flags */
3374 /* XXX This probably needs checks to ensure the flags are
3375 * not abused, and to handle any flag side effects.
3377 if (get_user(flags, (int __user *) arg))
3380 if (cmd == LL_IOC_SETFLAGS) {
3381 if ((flags & LL_FILE_IGNORE_LOCK) &&
3382 !(file->f_flags & O_DIRECT)) {
3383 CERROR("%s: unable to disable locking on "
3384 "non-O_DIRECT file\n", current->comm);
3388 fd->fd_flags |= flags;
3390 fd->fd_flags &= ~flags;
3393 case LL_IOC_LOV_SETSTRIPE:
3394 case LL_IOC_LOV_SETSTRIPE_NEW:
3395 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3396 case LL_IOC_LOV_SETEA:
3397 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3398 case LL_IOC_LOV_SWAP_LAYOUTS: {
3400 struct lustre_swap_layouts lsl;
3402 if (copy_from_user(&lsl, (char __user *)arg,
3403 sizeof(struct lustre_swap_layouts)))
3406 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3409 file2 = fget(lsl.sl_fd);
3413 /* O_WRONLY or O_RDWR */
3414 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3415 GOTO(out, rc = -EPERM);
3417 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3418 struct inode *inode2;
3419 struct ll_inode_info *lli;
3420 struct obd_client_handle *och = NULL;
3422 lli = ll_i2info(inode);
3423 mutex_lock(&lli->lli_och_mutex);
3424 if (fd->fd_lease_och != NULL) {
3425 och = fd->fd_lease_och;
3426 fd->fd_lease_och = NULL;
3428 mutex_unlock(&lli->lli_och_mutex);
3430 GOTO(out, rc = -ENOLCK);
3431 inode2 = file_inode(file2);
3432 rc = ll_swap_layouts_close(och, inode, inode2);
3434 rc = ll_swap_layouts(file, file2, &lsl);
3440 case LL_IOC_LOV_GETSTRIPE:
3441 case LL_IOC_LOV_GETSTRIPE_NEW:
3442 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3443 case FS_IOC_GETFLAGS:
3444 case FS_IOC_SETFLAGS:
3445 RETURN(ll_iocontrol(inode, file, cmd, arg));
3446 case FSFILT_IOC_GETVERSION:
3447 case FS_IOC_GETVERSION:
3448 RETURN(put_user(inode->i_generation, (int __user *)arg));
3449 /* We need to special case any other ioctls we want to handle,
3450 * to send them to the MDS/OST as appropriate and to properly
3451 * network encode the arg field. */
3452 case FS_IOC_SETVERSION:
3455 case LL_IOC_GROUP_LOCK:
3456 RETURN(ll_get_grouplock(inode, file, arg));
3457 case LL_IOC_GROUP_UNLOCK:
3458 RETURN(ll_put_grouplock(inode, file, arg));
3459 case IOC_OBD_STATFS:
3460 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3462 case LL_IOC_FLUSHCTX:
3463 RETURN(ll_flush_ctx(inode));
3464 case LL_IOC_PATH2FID: {
3465 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3466 sizeof(struct lu_fid)))
3471 case LL_IOC_GETPARENT:
3472 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3474 case OBD_IOC_FID2PATH:
3475 RETURN(ll_fid2path(inode, (void __user *)arg));
3476 case LL_IOC_DATA_VERSION: {
3477 struct ioc_data_version idv;
3480 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3483 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3484 rc = ll_ioc_data_version(inode, &idv);
3487 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3493 case LL_IOC_GET_MDTIDX: {
3496 mdtidx = ll_get_mdt_idx(inode);
3500 if (put_user((int)mdtidx, (int __user *)arg))
3505 case OBD_IOC_GETDTNAME:
3506 case OBD_IOC_GETMDNAME:
3507 RETURN(ll_get_obd_name(inode, cmd, arg));
3508 case LL_IOC_HSM_STATE_GET: {
3509 struct md_op_data *op_data;
3510 struct hsm_user_state *hus;
3517 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3518 LUSTRE_OPC_ANY, hus);
3519 if (IS_ERR(op_data)) {
3521 RETURN(PTR_ERR(op_data));
3524 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3527 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3530 ll_finish_md_op_data(op_data);
3534 case LL_IOC_HSM_STATE_SET: {
3535 struct hsm_state_set *hss;
3542 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3547 rc = ll_hsm_state_set(inode, hss);
3552 case LL_IOC_HSM_ACTION: {
3553 struct md_op_data *op_data;
3554 struct hsm_current_action *hca;
3561 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3562 LUSTRE_OPC_ANY, hca);
3563 if (IS_ERR(op_data)) {
3565 RETURN(PTR_ERR(op_data));
3568 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3571 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3574 ll_finish_md_op_data(op_data);
3578 case LL_IOC_SET_LEASE_OLD: {
3579 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3581 RETURN(ll_file_set_lease(file, &ioc, 0));
3583 case LL_IOC_SET_LEASE: {
3584 struct ll_ioc_lease ioc;
3586 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3589 RETURN(ll_file_set_lease(file, &ioc, arg));
3591 case LL_IOC_GET_LEASE: {
3592 struct ll_inode_info *lli = ll_i2info(inode);
3593 struct ldlm_lock *lock = NULL;
3596 mutex_lock(&lli->lli_och_mutex);
3597 if (fd->fd_lease_och != NULL) {
3598 struct obd_client_handle *och = fd->fd_lease_och;
3600 lock = ldlm_handle2lock(&och->och_lease_handle);
3602 lock_res_and_lock(lock);
3603 if (!ldlm_is_cancel(lock))
3604 fmode = och->och_flags;
3606 unlock_res_and_lock(lock);
3607 LDLM_LOCK_PUT(lock);
3610 mutex_unlock(&lli->lli_och_mutex);
3612 RETURN(ll_lease_type_from_fmode(fmode));
3614 case LL_IOC_HSM_IMPORT: {
3615 struct hsm_user_import *hui;
3621 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3626 rc = ll_hsm_import(inode, file, hui);
3631 case LL_IOC_FUTIMES_3: {
3632 struct ll_futimes_3 lfu;
3634 if (copy_from_user(&lfu,
3635 (const struct ll_futimes_3 __user *)arg,
3639 RETURN(ll_file_futimes_3(file, &lfu));
3641 case LL_IOC_LADVISE: {
3642 struct llapi_ladvise_hdr *k_ladvise_hdr;
3643 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3646 int alloc_size = sizeof(*k_ladvise_hdr);
3649 u_ladvise_hdr = (void __user *)arg;
3650 OBD_ALLOC_PTR(k_ladvise_hdr);
3651 if (k_ladvise_hdr == NULL)
3654 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3655 GOTO(out_ladvise, rc = -EFAULT);
3657 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3658 k_ladvise_hdr->lah_count < 1)
3659 GOTO(out_ladvise, rc = -EINVAL);
3661 num_advise = k_ladvise_hdr->lah_count;
3662 if (num_advise >= LAH_COUNT_MAX)
3663 GOTO(out_ladvise, rc = -EFBIG);
3665 OBD_FREE_PTR(k_ladvise_hdr);
3666 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3667 lah_advise[num_advise]);
3668 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3669 if (k_ladvise_hdr == NULL)
3673 * TODO: submit multiple advices to one server in a single RPC
3675 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3676 GOTO(out_ladvise, rc = -EFAULT);
3678 for (i = 0; i < num_advise; i++) {
3679 struct llapi_lu_ladvise *k_ladvise =
3680 &k_ladvise_hdr->lah_advise[i];
3681 struct llapi_lu_ladvise __user *u_ladvise =
3682 &u_ladvise_hdr->lah_advise[i];
3684 rc = ll_ladvise_sanity(inode, k_ladvise);
3686 GOTO(out_ladvise, rc);
3688 switch (k_ladvise->lla_advice) {
3689 case LU_LADVISE_LOCKNOEXPAND:
3690 rc = ll_lock_noexpand(file,
3691 k_ladvise->lla_peradvice_flags);
3692 GOTO(out_ladvise, rc);
3693 case LU_LADVISE_LOCKAHEAD:
3695 rc = ll_file_lock_ahead(file, k_ladvise);
3698 GOTO(out_ladvise, rc);
3701 &u_ladvise->lla_lockahead_result))
3702 GOTO(out_ladvise, rc = -EFAULT);
3705 rc = ll_ladvise(inode, file,
3706 k_ladvise_hdr->lah_flags,
3709 GOTO(out_ladvise, rc);
3716 OBD_FREE(k_ladvise_hdr, alloc_size);
3719 case LL_IOC_FLR_SET_MIRROR: {
3720 /* mirror I/O must be direct to avoid polluting page cache
3722 if (!(file->f_flags & O_DIRECT))
3725 fd->fd_designated_mirror = (__u32)arg;
3728 case LL_IOC_FSGETXATTR:
3729 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3730 case LL_IOC_FSSETXATTR:
3731 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3733 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3735 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3736 (void __user *)arg));
3740 #ifndef HAVE_FILE_LLSEEK_SIZE
3741 static inline loff_t
3742 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3744 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3746 if (offset > maxsize)
3749 if (offset != file->f_pos) {
3750 file->f_pos = offset;
3751 file->f_version = 0;
3757 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3758 loff_t maxsize, loff_t eof)
3760 struct inode *inode = file_inode(file);
3768 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3769 * position-querying operation. Avoid rewriting the "same"
3770 * f_pos value back to the file because a concurrent read(),
3771 * write() or lseek() might have altered it
3776 * f_lock protects against read/modify/write race with other
3777 * SEEK_CURs. Note that parallel writes and reads behave
3781 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3782 inode_unlock(inode);
3786 * In the generic case the entire file is data, so as long as
3787 * offset isn't at the end of the file then the offset is data.
3794 * There is a virtual hole at the end of the file, so as long as
3795 * offset isn't i_size or larger, return i_size.
3803 return llseek_execute(file, offset, maxsize);
3807 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3809 struct inode *inode = file_inode(file);
3810 loff_t retval, eof = 0;
3813 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3814 (origin == SEEK_CUR) ? file->f_pos : 0);
3815 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3816 PFID(ll_inode2fid(inode)), inode, retval, retval,
3818 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3820 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3821 retval = ll_glimpse_size(inode);
3824 eof = i_size_read(inode);
3827 retval = ll_generic_file_llseek_size(file, offset, origin,
3828 ll_file_maxbytes(inode), eof);
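/*
 * Illustrative userspace sketch (not built with this file): the standard
 * SEEK_HOLE/SEEK_DATA interface served by ll_file_seek() above; both
 * trigger a glimpse first so the client sees an up-to-date file size.
 */
#if 0	/* example only */
#define _GNU_SOURCE	/* SEEK_HOLE/SEEK_DATA on glibc */
#include <unistd.h>

static off_t first_hole(int fd)
{
	/* returns the offset of the first hole (possibly the virtual hole
	 * at end of file); fails with ENXIO when seeking past the end */
	return lseek(fd, 0, SEEK_HOLE);
}
#endif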
3832 static int ll_flush(struct file *file, fl_owner_t id)
3834 struct inode *inode = file_inode(file);
3835 struct ll_inode_info *lli = ll_i2info(inode);
3836 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3839 LASSERT(!S_ISDIR(inode->i_mode));
3841 /* catch async errors that were recorded back when async writeback
3842 * failed for pages in this mapping. */
3843 rc = lli->lli_async_rc;
3844 lli->lli_async_rc = 0;
3845 if (lli->lli_clob != NULL) {
3846 err = lov_read_and_clear_async_rc(lli->lli_clob);
3851 /* The application has already been told about the write failure.
3852 * Do not report it again. */
3853 if (fd->fd_write_failed)
3855 return rc ? -EIO : 0;
3859 * Called to make sure a portion of the file has been written out.
3860 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OST.
3862 * Return how many pages have been written.
3864 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3865 enum cl_fsync_mode mode, int ignore_layout)
3869 struct cl_fsync_io *fio;
3874 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3875 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3878 env = cl_env_get(&refcheck);
3880 RETURN(PTR_ERR(env));
3882 io = vvp_env_thread_io(env);
3883 io->ci_obj = ll_i2info(inode)->lli_clob;
3884 io->ci_ignore_layout = ignore_layout;
3886 /* initialize parameters for sync */
3887 fio = &io->u.ci_fsync;
3888 fio->fi_start = start;
3890 fio->fi_fid = ll_inode2fid(inode);
3891 fio->fi_mode = mode;
3892 fio->fi_nr_written = 0;
3894 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3895 result = cl_io_loop(env, io);
3897 result = io->ci_result;
3899 result = fio->fi_nr_written;
3900 cl_io_fini(env, io);
3901 cl_env_put(env, &refcheck);
3907 * When dentry is provided (the 'else' case), file_dentry() may be
3908 * null and dentry must be used directly rather than pulled from
3909 * file_dentry() as is done otherwise.
3912 #ifdef HAVE_FILE_FSYNC_4ARGS
3913 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3915 struct dentry *dentry = file_dentry(file);
3917 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3918 int ll_fsync(struct file *file, int datasync)
3920 struct dentry *dentry = file_dentry(file);
3922 loff_t end = LLONG_MAX;
3924 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3927 loff_t end = LLONG_MAX;
3929 struct inode *inode = dentry->d_inode;
3930 struct ll_inode_info *lli = ll_i2info(inode);
3931 struct ptlrpc_request *req;
3935 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3936 PFID(ll_inode2fid(inode)), inode);
3937 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3939 #ifdef HAVE_FILE_FSYNC_4ARGS
3940 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3941 lock_inode = !lli->lli_inode_locked;
3945 /* fsync's caller has already called _fdata{sync,write}, we want
3946 * that IO to finish before calling the osc and mdc sync methods */
3947 rc = filemap_fdatawait(inode->i_mapping);
3950 /* catch async errors that were recorded back when async writeback
3951 * failed for pages in this mapping. */
3952 if (!S_ISDIR(inode->i_mode)) {
3953 err = lli->lli_async_rc;
3954 lli->lli_async_rc = 0;
3957 if (lli->lli_clob != NULL) {
3958 err = lov_read_and_clear_async_rc(lli->lli_clob);
3964 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3968 ptlrpc_req_finished(req);
3970 if (S_ISREG(inode->i_mode)) {
3971 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3973 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3974 if (rc == 0 && err < 0)
3977 fd->fd_write_failed = true;
3979 fd->fd_write_failed = false;
3982 #ifdef HAVE_FILE_FSYNC_4ARGS
3984 inode_unlock(inode);
3990 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3992 struct inode *inode = file_inode(file);
3993 struct ll_sb_info *sbi = ll_i2sbi(inode);
3994 struct ldlm_enqueue_info einfo = {
3995 .ei_type = LDLM_FLOCK,
3996 .ei_cb_cp = ldlm_flock_completion_ast,
3997 .ei_cbdata = file_lock,
3999 struct md_op_data *op_data;
4000 struct lustre_handle lockh = { 0 };
4001 union ldlm_policy_data flock = { { 0 } };
4002 int fl_type = file_lock->fl_type;
4008 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4009 PFID(ll_inode2fid(inode)), file_lock);
4011 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
4013 if (file_lock->fl_flags & FL_FLOCK) {
4014 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4015 /* flocks are whole-file locks */
4016 flock.l_flock.end = OFFSET_MAX;
4017 /* For flocks the owner is determined by the local file descriptor */
4018 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4019 } else if (file_lock->fl_flags & FL_POSIX) {
4020 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4021 flock.l_flock.start = file_lock->fl_start;
4022 flock.l_flock.end = file_lock->fl_end;
4026 flock.l_flock.pid = file_lock->fl_pid;
4028 /* Somewhat ugly workaround for svc lockd.
4029 * lockd installs a custom fl_lmops->lm_compare_owner that checks
4030 * that the fl_owner is the same (which it always is between lockd
4031 * processes on the local node) and then compares the pid.
4032 * As such we assign the pid to the owner field to make it all work;
4033 * a conflict with normal locks is unlikely since the pid space and
4034 * the pointer space for current->files do not intersect */
4035 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4036 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
4040 einfo.ei_mode = LCK_PR;
4043 /* An unlock request may or may not have any relation to
4044 * existing locks so we may not be able to pass a lock handle
4045 * via a normal ldlm_lock_cancel() request. The request may even
4046 * unlock a byte range in the middle of an existing lock. In
4047 * order to process an unlock request we need all of the same
4048 * information that is given with a normal read or write record
4049 * lock request. To avoid creating another ldlm unlock (cancel)
4050 * message we'll treat a LCK_NL flock request as an unlock. */
4051 einfo.ei_mode = LCK_NL;
4054 einfo.ei_mode = LCK_PW;
4057 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4072 flags = LDLM_FL_BLOCK_NOWAIT;
4078 flags = LDLM_FL_TEST_LOCK;
4081 CERROR("unknown fcntl lock command: %d\n", cmd);
4085 /* Save the old mode so that if the mode in the lock changes we
4086 * can decrement the appropriate reader or writer refcount. */
4087 file_lock->fl_type = einfo.ei_mode;
4089 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4090 LUSTRE_OPC_ANY, NULL);
4091 if (IS_ERR(op_data))
4092 RETURN(PTR_ERR(op_data));
4094 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4095 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4096 flock.l_flock.pid, flags, einfo.ei_mode,
4097 flock.l_flock.start, flock.l_flock.end);
4099 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4102 /* Restore the file lock type if not TEST lock. */
4103 if (!(flags & LDLM_FL_TEST_LOCK))
4104 file_lock->fl_type = fl_type;
4106 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4107 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4108 !(flags & LDLM_FL_TEST_LOCK))
4109 rc2 = locks_lock_file_wait(file, file_lock);
4111 if ((file_lock->fl_flags & FL_FLOCK) &&
4112 (rc == 0 || file_lock->fl_type == F_UNLCK))
4113 rc2 = flock_lock_file_wait(file, file_lock);
4114 if ((file_lock->fl_flags & FL_POSIX) &&
4115 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4116 !(flags & LDLM_FL_TEST_LOCK))
4117 rc2 = posix_lock_file_wait(file, file_lock);
4118 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4120 if (rc2 && file_lock->fl_type != F_UNLCK) {
4121 einfo.ei_mode = LCK_NL;
4122 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4127 ll_finish_md_op_data(op_data);
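/*
 * Illustrative userspace sketch (not built with this file): a POSIX byte-
 * range lock taken with fcntl(), which arrives at ll_file_flock() above as
 * an FL_POSIX request and is enqueued on the MDT as an LDLM flock lock.
 */
#if 0	/* example only */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int lock_first_4k(int fd)
{
	struct flock fl;

	memset(&fl, 0, sizeof(fl));
	fl.fl_type = F_WRLCK;		/* maps to LCK_PW above */
	fl.fl_whence = SEEK_SET;
	fl.fl_start = 0;
	fl.fl_len = 4096;

	return fcntl(fd, F_SETLKW, &fl);	/* blocking request */
}
#endif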
4132 int ll_get_fid_by_name(struct inode *parent, const char *name,
4133 int namelen, struct lu_fid *fid,
4134 struct inode **inode)
4136 struct md_op_data *op_data = NULL;
4137 struct mdt_body *body;
4138 struct ptlrpc_request *req;
4142 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4143 LUSTRE_OPC_ANY, NULL);
4144 if (IS_ERR(op_data))
4145 RETURN(PTR_ERR(op_data));
4147 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4148 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4149 ll_finish_md_op_data(op_data);
4153 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4155 GOTO(out_req, rc = -EFAULT);
4157 *fid = body->mbo_fid1;
4160 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4162 ptlrpc_req_finished(req);
4166 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4169 struct dentry *dchild = NULL;
4170 struct inode *child_inode = NULL;
4171 struct md_op_data *op_data;
4172 struct ptlrpc_request *request = NULL;
4173 struct obd_client_handle *och = NULL;
4175 struct mdt_body *body;
4176 __u64 data_version = 0;
4177 size_t namelen = strlen(name);
4178 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4182 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4183 PFID(ll_inode2fid(parent)), name,
4184 lum->lum_stripe_offset, lum->lum_stripe_count);
4186 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4187 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4188 lustre_swab_lmv_user_md(lum);
4190 /* Get child FID first */
4191 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4194 dchild = d_lookup(file_dentry(file), &qstr);
4196 if (dchild->d_inode)
4197 child_inode = igrab(dchild->d_inode);
4202 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4211 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4212 OBD_CONNECT2_DIR_MIGRATE)) {
4213 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4214 ll_i2info(child_inode)->lli_lsm_md) {
4215 CERROR("%s: MDT doesn't support stripe directory "
4217 ll_get_fsname(parent->i_sb, NULL, 0));
4218 GOTO(out_iput, rc = -EOPNOTSUPP);
4223 * lfs migrate command needs to be blocked on the client
4224 * by checking the migrate FID against the FID of the filesystem root.
4227 if (child_inode == parent->i_sb->s_root->d_inode)
4228 GOTO(out_iput, rc = -EINVAL);
4230 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4231 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4232 if (IS_ERR(op_data))
4233 GOTO(out_iput, rc = PTR_ERR(op_data));
4235 inode_lock(child_inode);
4236 op_data->op_fid3 = *ll_inode2fid(child_inode);
4237 if (!fid_is_sane(&op_data->op_fid3)) {
4238 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4239 ll_get_fsname(parent->i_sb, NULL, 0), name,
4240 PFID(&op_data->op_fid3));
4241 GOTO(out_unlock, rc = -EINVAL);
4244 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4245 op_data->op_data = lum;
4246 op_data->op_data_size = lumlen;
4249 if (S_ISREG(child_inode->i_mode)) {
4250 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4254 GOTO(out_unlock, rc);
4257 rc = ll_data_version(child_inode, &data_version,
4260 GOTO(out_close, rc);
4262 op_data->op_open_handle = och->och_open_handle;
4263 op_data->op_data_version = data_version;
4264 op_data->op_lease_handle = och->och_lease_handle;
4265 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4267 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4268 och->och_mod->mod_open_req->rq_replay = 0;
4269 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4272 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4273 name, namelen, &request);
4275 LASSERT(request != NULL);
4276 ll_update_times(request, parent);
4278 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4279 LASSERT(body != NULL);
4281 /* If the server does release the layout lock, then we clean up
4282 * the client och here; otherwise release it in out_close: */
4283 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4284 obd_mod_put(och->och_mod);
4285 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4287 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4293 if (request != NULL) {
4294 ptlrpc_req_finished(request);
4298 /* Try again if the file layout has changed. */
4299 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4304 ll_lease_close(och, child_inode, NULL);
4306 clear_nlink(child_inode);
4308 inode_unlock(child_inode);
4309 ll_finish_md_op_data(op_data);
4316 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4324 * test if some locks matching bits and l_req_mode are acquired
4325 * - bits can be in different locks
4326 * - if found, clear the common lock bits in *bits
4327 * - the bits not found are kept in *bits
4329 * \param bits [IN] searched lock bits
4330 * \param l_req_mode [IN] searched lock mode
4331 * \retval boolean, true iff all bits are found
4333 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4335 struct lustre_handle lockh;
4336 union ldlm_policy_data policy;
4337 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4338 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4347 fid = &ll_i2info(inode)->lli_fid;
4348 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4349 ldlm_lockname[mode]);
4351 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4352 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4353 policy.l_inodebits.bits = *bits & (1 << i);
4354 if (policy.l_inodebits.bits == 0)
4357 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4358 &policy, mode, &lockh)) {
4359 struct ldlm_lock *lock;
4361 lock = ldlm_handle2lock(&lockh);
4364 ~(lock->l_policy_data.l_inodebits.bits);
4365 LDLM_LOCK_PUT(lock);
4367 *bits &= ~policy.l_inodebits.bits;
4374 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4375 struct lustre_handle *lockh, __u64 flags,
4376 enum ldlm_mode mode)
4378 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4383 fid = &ll_i2info(inode)->lli_fid;
4384 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4386 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4387 fid, LDLM_IBITS, &policy, mode, lockh);
4392 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4394 /* Already unlinked. Just update nlink and return success */
4395 if (rc == -ENOENT) {
4397 /* If it is a striped directory and there is a bad stripe,
4398 * let's revalidate the dentry again instead of returning
4400 if (S_ISDIR(inode->i_mode) &&
4401 ll_i2info(inode)->lli_lsm_md != NULL)
4404 /* This path cannot be hit for regular files unless in
4405 * case of obscure races, so no need to validate
4407 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4409 } else if (rc != 0) {
4410 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4411 "%s: revalidate FID "DFID" error: rc = %d\n",
4412 ll_get_fsname(inode->i_sb, NULL, 0),
4413 PFID(ll_inode2fid(inode)), rc);
4419 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4421 struct inode *inode = dentry->d_inode;
4422 struct obd_export *exp = ll_i2mdexp(inode);
4423 struct lookup_intent oit = {
4426 struct ptlrpc_request *req = NULL;
4427 struct md_op_data *op_data;
4431 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4432 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4434 /* Call getattr by fid, so do not provide name at all. */
4435 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4436 LUSTRE_OPC_ANY, NULL);
4437 if (IS_ERR(op_data))
4438 RETURN(PTR_ERR(op_data));
4440 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4441 ll_finish_md_op_data(op_data);
4443 rc = ll_inode_revalidate_fini(inode, rc);
4447 rc = ll_revalidate_it_finish(req, &oit, dentry);
4449 ll_intent_release(&oit);
4453 /* Unlinked? Unhash dentry, so it is not picked up later by
4454 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4455 * here to preserve get_cwd functionality on 2.6.
4457 if (!dentry->d_inode->i_nlink) {
4458 ll_lock_dcache(inode);
4459 d_lustre_invalidate(dentry, 0);
4460 ll_unlock_dcache(inode);
4463 ll_lookup_finish_locks(&oit, dentry);
4465 ptlrpc_req_finished(req);
4470 static int ll_merge_md_attr(struct inode *inode)
4472 struct ll_inode_info *lli = ll_i2info(inode);
4473 struct cl_attr attr = { 0 };
4476 LASSERT(lli->lli_lsm_md != NULL);
4477 down_read(&lli->lli_lsm_sem);
4478 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4479 &attr, ll_md_blocking_ast);
4480 up_read(&lli->lli_lsm_sem);
4484 set_nlink(inode, attr.cat_nlink);
4485 inode->i_blocks = attr.cat_blocks;
4486 i_size_write(inode, attr.cat_size);
4488 ll_i2info(inode)->lli_atime = attr.cat_atime;
4489 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4490 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4495 static inline dev_t ll_compat_encode_dev(dev_t dev)
4497 /* The compat_sys_*stat*() syscalls will fail unless the
4498 * device majors and minors are both less than 256. Note that
4499 * the value returned here will be passed through
4500 * old_encode_dev() in cp_compat_stat(). And so we are not
4501 * trying to return a valid compat (u16) device number, just
4502 * one that will pass the old_valid_dev() check. */
4504 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4507 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4508 int ll_getattr(const struct path *path, struct kstat *stat,
4509 u32 request_mask, unsigned int flags)
4511 struct dentry *de = path->dentry;
4513 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4516 struct inode *inode = de->d_inode;
4517 struct ll_sb_info *sbi = ll_i2sbi(inode);
4518 struct ll_inode_info *lli = ll_i2info(inode);
4521 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4523 rc = ll_inode_revalidate(de, IT_GETATTR);
4527 if (S_ISREG(inode->i_mode)) {
4528 /* In case of restore, the MDT has the right size and has
4529 * already sent it back without granting the layout lock;
4530 * the inode is up-to-date, so a glimpse is useless.
4531 * Also, to glimpse we need the layout; in case of a running
4532 * restore the MDT holds the layout lock, so the glimpse will
4533 * block up to the end of restore (getattr will block)
4535 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4536 rc = ll_glimpse_size(inode);
4541 /* If the object isn't a regular file then don't validate its size. */
4542 if (S_ISDIR(inode->i_mode) &&
4543 lli->lli_lsm_md != NULL) {
4544 rc = ll_merge_md_attr(inode);
4549 LTIME_S(inode->i_atime) = lli->lli_atime;
4550 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4551 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4554 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4556 if (ll_need_32bit_api(sbi)) {
4557 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4558 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4559 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4561 stat->ino = inode->i_ino;
4562 stat->dev = inode->i_sb->s_dev;
4563 stat->rdev = inode->i_rdev;
4566 stat->mode = inode->i_mode;
4567 stat->uid = inode->i_uid;
4568 stat->gid = inode->i_gid;
4569 stat->atime = inode->i_atime;
4570 stat->mtime = inode->i_mtime;
4571 stat->ctime = inode->i_ctime;
4572 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4574 stat->nlink = inode->i_nlink;
4575 stat->size = i_size_read(inode);
4576 stat->blocks = inode->i_blocks;
4581 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4582 __u64 start, __u64 len)
4586 struct fiemap *fiemap;
4587 unsigned int extent_count = fieinfo->fi_extents_max;
4589 num_bytes = sizeof(*fiemap) + (extent_count *
4590 sizeof(struct fiemap_extent));
4591 OBD_ALLOC_LARGE(fiemap, num_bytes);
4596 fiemap->fm_flags = fieinfo->fi_flags;
4597 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4598 fiemap->fm_start = start;
4599 fiemap->fm_length = len;
4600 if (extent_count > 0 &&
4601 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4602 sizeof(struct fiemap_extent)) != 0)
4603 GOTO(out, rc = -EFAULT);
4605 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4607 fieinfo->fi_flags = fiemap->fm_flags;
4608 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4609 if (extent_count > 0 &&
4610 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4611 fiemap->fm_mapped_extents *
4612 sizeof(struct fiemap_extent)) != 0)
4613 GOTO(out, rc = -EFAULT);
4615 OBD_FREE_LARGE(fiemap, num_bytes);
4619 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4621 struct ll_inode_info *lli = ll_i2info(inode);
4622 struct posix_acl *acl = NULL;
4625 spin_lock(&lli->lli_lock);
4626 /* VFS' acl_permission_check->check_acl will release the refcount */
4627 acl = posix_acl_dup(lli->lli_posix_acl);
4628 spin_unlock(&lli->lli_lock);
4633 #ifdef HAVE_IOP_SET_ACL
4634 #ifdef CONFIG_FS_POSIX_ACL
4635 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4637 struct ll_sb_info *sbi = ll_i2sbi(inode);
4638 struct ptlrpc_request *req = NULL;
4639 const char *name = NULL;
4641 size_t value_size = 0;
4646 case ACL_TYPE_ACCESS:
4647 name = XATTR_NAME_POSIX_ACL_ACCESS;
4649 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4652 case ACL_TYPE_DEFAULT:
4653 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4654 if (!S_ISDIR(inode->i_mode))
4655 rc = acl ? -EACCES : 0;
4666 value_size = posix_acl_xattr_size(acl->a_count);
4667 value = kmalloc(value_size, GFP_NOFS);
4669 GOTO(out, rc = -ENOMEM);
4671 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4673 GOTO(out_value, rc);
4676 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4677 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4678 name, value, value_size, 0, 0, &req);
4680 ptlrpc_req_finished(req);
4685 forget_cached_acl(inode, type);
4687 set_cached_acl(inode, type, acl);
4690 #endif /* CONFIG_FS_POSIX_ACL */
4691 #endif /* HAVE_IOP_SET_ACL */
#ifndef HAVE_GENERIC_PERMISSION_2ARGS
static int
# ifdef HAVE_GENERIC_PERMISSION_4ARGS
ll_check_acl(struct inode *inode, int mask, unsigned int flags)
# else
ll_check_acl(struct inode *inode, int mask)
# endif
{
# ifdef CONFIG_FS_POSIX_ACL
	struct posix_acl *acl;
	int rc;
	ENTRY;

#  ifdef HAVE_GENERIC_PERMISSION_4ARGS
	if (flags & IPERM_FLAG_RCU)
		return -ECHILD;
#  endif
	acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
	if (acl == NULL)
		RETURN(-EAGAIN);

	rc = posix_acl_permission(inode, acl, mask);
	posix_acl_release(acl);

	RETURN(rc);
# else /* !CONFIG_FS_POSIX_ACL */
	return -EAGAIN;
# endif /* CONFIG_FS_POSIX_ACL */
}
#endif /* HAVE_GENERIC_PERMISSION_2ARGS */
#ifdef HAVE_GENERIC_PERMISSION_4ARGS
int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
#else
# ifdef HAVE_INODE_PERMISION_2ARGS
int ll_inode_permission(struct inode *inode, int mask)
# else
int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
# endif
#endif
{
	int rc = 0;
	struct ll_sb_info *sbi;
	struct root_squash_info *squash;
	struct cred *cred = NULL;
	const struct cred *old_cred = NULL;
	cfs_cap_t cap;
	bool squash_id = false;
	ENTRY;

#ifdef MAY_NOT_BLOCK
	if (mask & MAY_NOT_BLOCK)
		return -ECHILD;
#elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
	if (flags & IPERM_FLAG_RCU)
		return -ECHILD;
#endif

	/* the root inode is NOT validated during the lookup operation,
	 * so do it here before the permission check. */
	if (inode == inode->i_sb->s_root->d_inode) {
		rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
		if (rc)
			RETURN(rc);
	}

	CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
	       PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);

	/* squash fsuid/fsgid if needed */
	sbi = ll_i2sbi(inode);
	squash = &sbi->ll_squash;
	if (unlikely(squash->rsi_uid != 0 &&
		     uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
		     !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
		squash_id = true;
	}

	if (squash_id) {
		CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
		       __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
		       squash->rsi_uid, squash->rsi_gid);

		/* update the current process's credentials
		 * and FS capabilities */
		cred = prepare_creds();
		if (cred == NULL)
			RETURN(-ENOMEM);

		cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
		cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
		for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
			if ((1 << cap) & CFS_CAP_FS_MASK)
				cap_lower(cred->cap_effective, cap);
		}
		old_cred = override_creds(cred);
	}

	ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
	rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
	/* restore the current process's credentials and FS capabilities */
	if (squash_id) {
		revert_creds(old_cred);
		put_cred(cred);
	}

	RETURN(rc);
}
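
/*
 * Illustrative only (not part of the original file): the credential switch
 * used for root squash above follows the standard kernel
 * prepare_creds()/override_creds()/revert_creds() pattern, shown here in
 * isolation with a hypothetical helper and caller-supplied uid/gid pair.
 */
#if 0
static int run_as_squashed(kuid_t uid, kgid_t gid)
{
	const struct cred *old_cred;
	struct cred *cred;

	cred = prepare_creds();			/* writable copy of current creds */
	if (cred == NULL)
		return -ENOMEM;

	cred->fsuid = uid;			/* only the filesystem ids change */
	cred->fsgid = gid;
	old_cred = override_creds(cred);	/* install the modified copy */

	/* ... perform the permission check with the squashed ids ... */

	revert_creds(old_cred);			/* restore the original creds */
	put_cred(cred);				/* drop our reference */
	return 0;
}
#endif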
/* -o localflock - only provides locally consistent flock locks */
struct file_operations ll_file_operations = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
	.read		= new_sync_read,
	.write		= new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
	.read_iter	= ll_file_read_iter,
	.write_iter	= ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.read		= ll_file_read,
	.aio_read	= ll_file_aio_read,
	.write		= ll_file_write,
	.aio_write	= ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.unlocked_ioctl	= ll_file_ioctl,
	.open		= ll_file_open,
	.release	= ll_file_release,
	.mmap		= ll_file_mmap,
	.llseek		= ll_file_seek,
	.splice_read	= ll_file_splice_read,
	.fsync		= ll_fsync,
	.flush		= ll_flush
};
struct file_operations ll_file_operations_flock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
	.read		= new_sync_read,
	.write		= new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
	.read_iter	= ll_file_read_iter,
	.write_iter	= ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.read		= ll_file_read,
	.aio_read	= ll_file_aio_read,
	.write		= ll_file_write,
	.aio_write	= ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.unlocked_ioctl	= ll_file_ioctl,
	.open		= ll_file_open,
	.release	= ll_file_release,
	.mmap		= ll_file_mmap,
	.llseek		= ll_file_seek,
	.splice_read	= ll_file_splice_read,
	.fsync		= ll_fsync,
	.flush		= ll_flush,
	.flock		= ll_file_flock,
	.lock		= ll_file_flock
};
/* These are for -o noflock - to return ENOSYS on flock calls */
struct file_operations ll_file_operations_noflock = {
#ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
# ifdef HAVE_SYNC_READ_WRITE
	.read		= new_sync_read,
	.write		= new_sync_write,
# endif /* HAVE_SYNC_READ_WRITE */
	.read_iter	= ll_file_read_iter,
	.write_iter	= ll_file_write_iter,
#else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.read		= ll_file_read,
	.aio_read	= ll_file_aio_read,
	.write		= ll_file_write,
	.aio_write	= ll_file_aio_write,
#endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
	.unlocked_ioctl	= ll_file_ioctl,
	.open		= ll_file_open,
	.release	= ll_file_release,
	.mmap		= ll_file_mmap,
	.llseek		= ll_file_seek,
	.splice_read	= ll_file_splice_read,
	.fsync		= ll_fsync,
	.flush		= ll_flush,
	.flock		= ll_file_noflock,
	.lock		= ll_file_noflock
};
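
/*
 * Illustrative only (not part of the original file): a hedged sketch of
 * how the three tables above map to the flock mount options. The helper
 * below is hypothetical; the real selection is done when the mount
 * options are parsed and the resulting table is installed into i_fop.
 */
#if 0
static const struct file_operations *
ll_select_file_operations(struct ll_sb_info *sbi)
{
	if (sbi->ll_flags & LL_SBI_FLOCK)
		return &ll_file_operations_flock;	/* -o flock */
	if (sbi->ll_flags & LL_SBI_LOCALFLOCK)
		return &ll_file_operations;		/* -o localflock */
	return &ll_file_operations_noflock;		/* -o noflock */
}
#endif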
struct inode_operations ll_file_inode_operations = {
	.setattr	= ll_setattr,
	.getattr	= ll_getattr,
	.permission	= ll_inode_permission,
#ifdef HAVE_IOP_XATTR
	.setxattr	= ll_setxattr,
	.getxattr	= ll_getxattr,
	.removexattr	= ll_removexattr,
#endif
	.listxattr	= ll_listxattr,
	.fiemap		= ll_fiemap,
#ifdef HAVE_IOP_GET_ACL
	.get_acl	= ll_get_acl,
#endif
#ifdef HAVE_IOP_SET_ACL
	.set_acl	= ll_set_acl,
#endif
};
int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct cl_object *obj = lli->lli_clob;
	struct lu_env *env;
	int rc;
	__u16 refcheck;
	ENTRY;

	LASSERT(obj != NULL);

	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		RETURN(PTR_ERR(env));

	rc = cl_conf_set(env, lli->lli_clob, conf);
	if (rc < 0)
		GOTO(out, rc);

	if (conf->coc_opc == OBJECT_CONF_SET) {
		struct ldlm_lock *lock = conf->coc_lock;
		struct cl_layout cl = {
			.cl_layout_gen = 0,
		};

		LASSERT(lock != NULL);
		LASSERT(ldlm_has_layout(lock));

		/* the lock can only be allowed to match after the layout is
		 * applied to the inode, otherwise a stale layout would be
		 * seen. Applying the layout should happen before dropping
		 * the intent lock. */
		ldlm_lock_allow_match(lock);

		rc = cl_object_layout_get(env, obj, &cl);
		if (rc < 0)
			GOTO(out, rc);

		CDEBUG(D_VFSTRACE,
		       DFID": layout version change: %u -> %u\n",
		       PFID(&lli->lli_fid), ll_layout_version_get(lli),
		       cl.cl_layout_gen);
		ll_layout_version_set(lli, cl.cl_layout_gen);
	}

out:
	cl_env_put(env, &refcheck);

	RETURN(rc);
}
/* Fetch the layout from the MDT with a getxattr request, if it is not
 * ready yet */
static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
{
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ptlrpc_request *req;
	void *lvbdata;
	void *lmm;
	int lmmsize;
	int rc;
	ENTRY;

	CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
	       PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
	       lock->l_lvb_data, lock->l_lvb_len);

	if (lock->l_lvb_data != NULL)
		RETURN(0);

	/* if the layout lock was granted right away, the layout is returned
	 * within the DLM_LVB of the dlm reply; otherwise, if the lock was ever
	 * blocked and then granted via completion ast, we have to fetch the
	 * layout here. Note that we can't use the LVB buffer in the
	 * completion AST because it doesn't have a large enough buffer */
	rc = ll_get_default_mdsize(sbi, &lmmsize);
	if (rc < 0)
		RETURN(rc);

	rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
			 XATTR_NAME_LOV, lmmsize, &req);
	if (rc < 0) {
		if (rc == -ENODATA)
			GOTO(out, rc = 0); /* empty layout */
		else
			RETURN(rc);
	}

	lmmsize = rc;
	rc = 0;
	if (lmmsize == 0) /* empty layout */
		GOTO(out, rc = 0);

	lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
	if (lmm == NULL)
		GOTO(out, rc = -EFAULT);

	OBD_ALLOC_LARGE(lvbdata, lmmsize);
	if (lvbdata == NULL)
		GOTO(out, rc = -ENOMEM);

	memcpy(lvbdata, lmm, lmmsize);
	lock_res_and_lock(lock);
	if (unlikely(lock->l_lvb_data == NULL)) {
		lock->l_lvb_type = LVB_T_LAYOUT;
		lock->l_lvb_data = lvbdata;
		lock->l_lvb_len = lmmsize;
		lvbdata = NULL;
	}
	unlock_res_and_lock(lock);

	if (lvbdata)
		OBD_FREE_LARGE(lvbdata, lmmsize);

	EXIT;

out:
	ptlrpc_req_finished(req);
	return rc;
}

/**
 * Apply the layout to the inode. The layout lock is held and will be released
 * before this function returns.
 */
static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
			      struct inode *inode)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct ldlm_lock *lock;
	struct cl_object_conf conf;
	int rc = 0;
	bool lvb_ready;
	bool wait_layout = false;
	ENTRY;

	LASSERT(lustre_handle_is_used(lockh));

	lock = ldlm_handle2lock(lockh);
	LASSERT(lock != NULL);
	LASSERT(ldlm_has_layout(lock));

	LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
		   PFID(&lli->lli_fid), inode);

	/* in case this is a caching lock, reinstate it with the new inode */
	md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);

	lock_res_and_lock(lock);
	lvb_ready = ldlm_is_lvb_ready(lock);
	unlock_res_and_lock(lock);

	/* checking lvb_ready is racy but this is okay. The worst case is
	 * that multiple processes may configure the file at the same time. */
	if (lvb_ready)
		GOTO(out, rc = 0);

	rc = ll_layout_fetch(inode, lock);
	if (rc < 0)
		GOTO(out, rc);

	/* for a layout lock, the lmm is stored in the lock's lvb.
	 * lvb_data is immutable while the lock is held, so it's safe to
	 * access it without the res lock.
	 *
	 * set the layout on the file. This is unlikely to fail, as the old
	 * layout has already been cleared out */
	memset(&conf, 0, sizeof conf);
	conf.coc_opc = OBJECT_CONF_SET;
	conf.coc_inode = inode;
	conf.coc_lock = lock;
	conf.u.coc_layout.lb_buf = lock->l_lvb_data;
	conf.u.coc_layout.lb_len = lock->l_lvb_len;
	rc = ll_layout_conf(inode, &conf);

	/* refreshing the layout failed, need to wait */
	wait_layout = rc == -EBUSY;
	EXIT;
out:
	LDLM_LOCK_PUT(lock);
	ldlm_lock_decref(lockh, mode);

	/* wait for IO to complete if it's still being used. */
	if (wait_layout) {
		CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&lli->lli_fid), inode);

		memset(&conf, 0, sizeof conf);
		conf.coc_opc = OBJECT_CONF_WAIT;
		conf.coc_inode = inode;
		rc = ll_layout_conf(inode, &conf);
		if (rc == 0)
			rc = -EAGAIN;

		CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
		       ll_get_fsname(inode->i_sb, NULL, 0),
		       PFID(&lli->lli_fid), rc);
	}
	RETURN(rc);
}

/**
 * Issue a layout intent RPC to the MDS.
 * \param inode	[in]	file inode
 * \param intent [in]	layout intent
 *
 * \retval 0 on success
 * \retval < 0 error code
 */
static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct md_op_data *op_data;
	struct lookup_intent it;
	struct ptlrpc_request *req;
	int rc;
	ENTRY;

	op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
				     0, 0, LUSTRE_OPC_ANY, NULL);
	if (IS_ERR(op_data))
		RETURN(PTR_ERR(op_data));

	op_data->op_data = intent;
	op_data->op_data_size = sizeof(*intent);

	memset(&it, 0, sizeof(it));
	it.it_op = IT_LAYOUT;
	if (intent->li_opc == LAYOUT_INTENT_WRITE ||
	    intent->li_opc == LAYOUT_INTENT_TRUNC)
		it.it_flags = FMODE_WRITE;

	LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
			  ll_get_fsname(inode->i_sb, NULL, 0),
			  PFID(&lli->lli_fid), inode);

	rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
			    &ll_md_blocking_ast, 0);
	if (it.it_request != NULL)
		ptlrpc_req_finished(it.it_request);
	it.it_request = NULL;

	ll_finish_md_op_data(op_data);

	/* set lock data in case this is a new lock */
	if (rc == 0)
		ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);

	ll_intent_drop_lock(&it);

	RETURN(rc);
}

/**
 * This function checks if there exists a LAYOUT lock on the client side,
 * or enqueues it if it doesn't have one in cache.
 *
 * This function will not hold the layout lock, so it may be revoked any time
 * after this function returns. Any operation that depends on the layout
 * should be redone in that case.
 *
 * This function should be called before lov_io_init() to get an up-to-date
 * layout version; the caller should save the version number, and after the IO
 * is finished, call this function again to verify that the layout was not
 * changed during the IO.
 */
int ll_layout_refresh(struct inode *inode, __u32 *gen)
{
	struct ll_inode_info *lli = ll_i2info(inode);
	struct ll_sb_info *sbi = ll_i2sbi(inode);
	struct lustre_handle lockh;
	struct layout_intent intent = {
		.li_opc = LAYOUT_INTENT_ACCESS,
	};
	enum ldlm_mode mode;
	int rc;
	ENTRY;

	*gen = ll_layout_version_get(lli);
	if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
		RETURN(0);

	/* sanity checks */
	LASSERT(fid_is_sane(ll_inode2fid(inode)));
	LASSERT(S_ISREG(inode->i_mode));

	/* take the layout lock mutex to enqueue the layout lock exclusively. */
	mutex_lock(&lli->lli_layout_mutex);

	while (1) {
		/* the layout lock is usually cached on the local side, so try
		 * to match it before enqueuing a new one. */
		mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
				       LCK_CR | LCK_CW | LCK_PR | LCK_PW);
		if (mode != 0) { /* hit cached lock */
			rc = ll_layout_lock_set(&lockh, mode, inode);
			if (rc == -EAGAIN)
				continue;
			break;
		}

		rc = ll_layout_intent(inode, &intent);
		if (rc != 0)
			break;
	}

	if (rc == 0)
		*gen = ll_layout_version_get(lli);
	mutex_unlock(&lli->lli_layout_mutex);

	RETURN(rc);
}
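
/*
 * Illustrative only (not part of the original file): the calling pattern
 * described above, sketched with a hypothetical do_io() helper. The layout
 * generation is sampled before the IO and verified afterwards; a changed
 * generation means the IO raced with a layout change and must be redone.
 */
#if 0
static int io_with_layout_check(struct inode *inode)
{
	__u32 gen = CL_LAYOUT_GEN_NONE;
	__u32 gen2;
	int rc;

	while (1) {
		rc = ll_layout_refresh(inode, &gen);	/* before lov_io_init() */
		if (rc != 0)
			break;

		rc = do_io(inode);		/* hypothetical IO under this layout */
		if (rc != 0)
			break;

		rc = ll_layout_refresh(inode, &gen2);	/* verify after IO */
		if (rc != 0 || gen2 == gen)
			break;			/* done: layout unchanged */
		gen = gen2;			/* layout changed: redo the IO */
	}
	return rc;
}
#endif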
/**
 * Issue a layout intent RPC indicating where in a file an IO is about to
 * write.
 *
 * \param[in] inode	file inode.
 * \param[in] ext	write range with the start offset of the file in bytes
 *			where an IO is about to write, and the exclusive end
 *			offset in bytes.
 *
 * \retval 0 on success
 * \retval < 0 error code
 */
int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
			   struct lu_extent *ext)
{
	struct layout_intent intent = {
		.li_opc = opc,
		.li_extent.e_start = ext->e_start,
		.li_extent.e_end = ext->e_end,
	};
	int rc;
	ENTRY;

	rc = ll_layout_intent(inode, &intent);

	RETURN(rc);
}
/**
 * This function sends a restore request to the MDT.
 */
int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
{
	struct hsm_user_request *hur;
	int len, rc;
	ENTRY;

	len = sizeof(struct hsm_user_request) +
	      sizeof(struct hsm_user_item);
	OBD_ALLOC(hur, len);
	if (hur == NULL)
		RETURN(-ENOMEM);

	hur->hur_request.hr_action = HUA_RESTORE;
	hur->hur_request.hr_archive_id = 0;
	hur->hur_request.hr_flags = 0;
	memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
	       sizeof(hur->hur_user_item[0].hui_fid));
	hur->hur_user_item[0].hui_extent.offset = offset;
	hur->hur_user_item[0].hui_extent.length = length;
	hur->hur_request.hr_itemcount = 1;
	rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,