4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
75 fd->fd_write_failed = false;
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
105 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
106 op_data->op_attr_blocks = inode->i_blocks;
107 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
108 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
109 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
110 op_data->op_open_handle = och->och_open_handle;
112 if (och->och_flags & FMODE_WRITE &&
113 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
114 /* For HSM: if inode data has been modified, pack it so that
115 * MDT can set data dirty flag in the archive. */
116 op_data->op_bias |= MDS_DATA_MODIFIED;
122 * Perform a close, possibly with a bias.
123 * The meaning of "data" depends on the value of "bias".
125 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
126 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to swap layouts with.
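 *
 * Illustrative sketch (not part of the original code): how a hypothetical
 * caller pairs \a bias with \a data, based on the cases handled below;
 * "dv" and "inode2" are assumed caller-local variables:
 *
 *	rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE, &dv);
 *	rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
 *				       inode2);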
129 static int ll_close_inode_openhandle(struct inode *inode,
130 struct obd_client_handle *och,
131 enum mds_op_bias bias, void *data)
133 struct obd_export *md_exp = ll_i2mdexp(inode);
134 const struct ll_inode_info *lli = ll_i2info(inode);
135 struct md_op_data *op_data;
136 struct ptlrpc_request *req = NULL;
140 if (class_exp2obd(md_exp) == NULL) {
141 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
142 ll_get_fsname(inode->i_sb, NULL, 0),
143 PFID(&lli->lli_fid));
147 OBD_ALLOC_PTR(op_data);
148 /* We leak the open handle and request here on error, but there is not
149 * much to be done in the OOM case since the app won't retry the close on error either. */
151 GOTO(out, rc = -ENOMEM);
153 ll_prepare_close(inode, op_data, och);
155 case MDS_CLOSE_LAYOUT_MERGE:
156 /* merge blocks from the victim inode */
157 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
158 op_data->op_attr.ia_valid |= ATTR_SIZE;
159 op_data->op_xvalid |= OP_XVALID_BLOCKS;
160 case MDS_CLOSE_LAYOUT_SPLIT:
161 case MDS_CLOSE_LAYOUT_SWAP: {
162 struct split_param *sp = data;
164 LASSERT(data != NULL);
165 op_data->op_bias |= bias;
166 op_data->op_data_version = 0;
167 op_data->op_lease_handle = och->och_lease_handle;
168 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
169 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
170 op_data->op_mirror_id = sp->sp_mirror_id;
172 op_data->op_fid2 = *ll_inode2fid(data);
177 case MDS_CLOSE_RESYNC_DONE: {
178 struct ll_ioc_lease *ioc = data;
180 LASSERT(data != NULL);
181 op_data->op_attr_blocks +=
182 ioc->lil_count * op_data->op_attr_blocks;
183 op_data->op_attr.ia_valid |= ATTR_SIZE;
184 op_data->op_xvalid |= OP_XVALID_BLOCKS;
185 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
187 op_data->op_lease_handle = och->och_lease_handle;
188 op_data->op_data = &ioc->lil_ids[0];
189 op_data->op_data_size =
190 ioc->lil_count * sizeof(ioc->lil_ids[0]);
194 case MDS_HSM_RELEASE:
195 LASSERT(data != NULL);
196 op_data->op_bias |= MDS_HSM_RELEASE;
197 op_data->op_data_version = *(__u64 *)data;
198 op_data->op_lease_handle = och->och_lease_handle;
199 op_data->op_attr.ia_valid |= ATTR_SIZE;
200 op_data->op_xvalid |= OP_XVALID_BLOCKS;
204 LASSERT(data == NULL);
208 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
209 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
210 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
211 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
213 rc = md_close(md_exp, op_data, och->och_mod, &req);
214 if (rc != 0 && rc != -EINTR)
215 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
216 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
218 if (rc == 0 && op_data->op_bias & bias) {
219 struct mdt_body *body;
221 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
222 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
226 ll_finish_md_op_data(op_data);
230 md_clear_open_replay_data(md_exp, och);
231 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
234 ptlrpc_req_finished(req); /* This is close request */
238 int ll_md_real_close(struct inode *inode, fmode_t fmode)
240 struct ll_inode_info *lli = ll_i2info(inode);
241 struct obd_client_handle **och_p;
242 struct obd_client_handle *och;
247 if (fmode & FMODE_WRITE) {
248 och_p = &lli->lli_mds_write_och;
249 och_usecount = &lli->lli_open_fd_write_count;
250 } else if (fmode & FMODE_EXEC) {
251 och_p = &lli->lli_mds_exec_och;
252 och_usecount = &lli->lli_open_fd_exec_count;
254 LASSERT(fmode & FMODE_READ);
255 och_p = &lli->lli_mds_read_och;
256 och_usecount = &lli->lli_open_fd_read_count;
259 mutex_lock(&lli->lli_och_mutex);
260 if (*och_usecount > 0) {
261 /* There are still users of this handle, so skip freeing it. */
263 mutex_unlock(&lli->lli_och_mutex);
269 mutex_unlock(&lli->lli_och_mutex);
272 /* There might be a race and this handle may already be closed. */
274 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
280 static int ll_md_close(struct inode *inode, struct file *file)
282 union ldlm_policy_data policy = {
283 .l_inodebits = { MDS_INODELOCK_OPEN },
285 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
286 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
287 struct ll_inode_info *lli = ll_i2info(inode);
288 struct lustre_handle lockh;
289 enum ldlm_mode lockmode;
293 /* clear group lock, if present */
294 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
295 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
297 if (fd->fd_lease_och != NULL) {
300 /* Usually the lease is not released when the
301 * application crashes, so we need to release it here. */
302 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
303 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
304 PFID(&lli->lli_fid), rc, lease_broken);
306 fd->fd_lease_och = NULL;
309 if (fd->fd_och != NULL) {
310 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
315 /* Let's see if we have a good enough OPEN lock on the file and
316 whether we can skip talking to the MDS */
317 mutex_lock(&lli->lli_och_mutex);
318 if (fd->fd_omode & FMODE_WRITE) {
320 LASSERT(lli->lli_open_fd_write_count);
321 lli->lli_open_fd_write_count--;
322 } else if (fd->fd_omode & FMODE_EXEC) {
324 LASSERT(lli->lli_open_fd_exec_count);
325 lli->lli_open_fd_exec_count--;
328 LASSERT(lli->lli_open_fd_read_count);
329 lli->lli_open_fd_read_count--;
331 mutex_unlock(&lli->lli_och_mutex);
333 /* LU-4398: do not cache write open lock if the file has exec bit */
334 if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) ||
335 !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
336 LDLM_IBITS, &policy, lockmode, &lockh))
337 rc = ll_md_real_close(inode, fd->fd_omode);
340 LUSTRE_FPRIVATE(file) = NULL;
341 ll_file_data_put(fd);
346 /* While this returns an error code, the caller (fput()) ignores it, so we
347 * need to make every effort to clean up all of our state here. Also,
348 * applications rarely check close errors, and even if an error is returned
349 * they will not retry the close call.
351 int ll_file_release(struct inode *inode, struct file *file)
353 struct ll_file_data *fd;
354 struct ll_sb_info *sbi = ll_i2sbi(inode);
355 struct ll_inode_info *lli = ll_i2info(inode);
359 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
360 PFID(ll_inode2fid(inode)), inode);
362 if (inode->i_sb->s_root != file_dentry(file))
363 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
364 fd = LUSTRE_FPRIVATE(file);
367 /* The last ref on @file, which may not be the owner pid of statahead,
368 * because parent and child processes can share the same file handle. */
369 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
370 ll_deauthorize_statahead(inode, fd);
372 if (inode->i_sb->s_root == file_dentry(file)) {
373 LUSTRE_FPRIVATE(file) = NULL;
374 ll_file_data_put(fd);
378 if (!S_ISDIR(inode->i_mode)) {
379 if (lli->lli_clob != NULL)
380 lov_read_and_clear_async_rc(lli->lli_clob);
381 lli->lli_async_rc = 0;
384 rc = ll_md_close(inode, file);
386 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
387 libcfs_debug_dumplog();
392 static inline int ll_dom_readpage(void *data, struct page *page)
394 struct niobuf_local *lnb = data;
397 kaddr = ll_kmap_atomic(page, KM_USER0);
398 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
399 if (lnb->lnb_len < PAGE_SIZE)
400 memset(kaddr + lnb->lnb_len, 0,
401 PAGE_SIZE - lnb->lnb_len);
402 flush_dcache_page(page);
403 SetPageUptodate(page);
404 ll_kunmap_atomic(kaddr, KM_USER0);
410 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
411 struct lookup_intent *it)
413 struct ll_inode_info *lli = ll_i2info(inode);
414 struct cl_object *obj = lli->lli_clob;
415 struct address_space *mapping = inode->i_mapping;
417 struct niobuf_remote *rnb;
419 struct lustre_handle lockh;
420 struct ldlm_lock *lock;
421 unsigned long index, start;
422 struct niobuf_local lnb;
423 bool dom_lock = false;
430 if (it->it_lock_mode != 0) {
431 lockh.cookie = it->it_lock_handle;
432 lock = ldlm_handle2lock(&lockh);
434 dom_lock = ldlm_has_dom(lock);
440 if (!req_capsule_field_present(&req->rq_pill, &RMF_NIOBUF_INLINE,
444 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
445 if (rnb == NULL || rnb->rnb_len == 0)
448 /* LU-11595: The server may return the whole file, which is always OK, or
449 * it may return just the file tail, whose offset must be aligned with the
450 * client PAGE_SIZE to be usable on that client. If the server's PAGE_SIZE
451 * is smaller, the offset may not be aligned and that data is just ignored.
453 if (rnb->rnb_offset % PAGE_SIZE)
456 /* The server returns the whole file or just the file tail if it fits in the
457 * reply buffer; in both cases the total size should equal the inode size.
459 if (rnb->rnb_offset + rnb->rnb_len < i_size_read(inode)) {
460 CERROR("%s: server returns off/len %llu/%u < i_size %llu\n",
461 ll_get_fsname(inode->i_sb, NULL, 0), rnb->rnb_offset,
462 rnb->rnb_len, i_size_read(inode));
466 CDEBUG(D_INFO, "Get data along with open at %llu len %i, i_size %llu\n",
467 rnb->rnb_offset, rnb->rnb_len, i_size_read(inode));
469 data = (char *)rnb + sizeof(*rnb);
471 lnb.lnb_file_offset = rnb->rnb_offset;
472 start = lnb.lnb_file_offset / PAGE_SIZE;
474 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
475 lnb.lnb_page_offset = 0;
477 lnb.lnb_data = data + (index << PAGE_SHIFT);
478 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
479 if (lnb.lnb_len > PAGE_SIZE)
480 lnb.lnb_len = PAGE_SIZE;
482 vmpage = read_cache_page(mapping, index + start,
483 ll_dom_readpage, &lnb);
484 if (IS_ERR(vmpage)) {
485 CWARN("%s: cannot fill page %lu for "DFID
486 " with data: rc = %li\n",
487 ll_get_fsname(inode->i_sb, NULL, 0),
488 index + start, PFID(lu_object_fid(&obj->co_lu)),
494 } while (rnb->rnb_len > (index << PAGE_SHIFT));
498 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
499 struct lookup_intent *itp)
501 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
502 struct dentry *parent = de->d_parent;
505 struct md_op_data *op_data;
506 struct ptlrpc_request *req = NULL;
510 LASSERT(parent != NULL);
511 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
513 /* if the server supports open-by-FID, or the file name is invalid, don't
514 * pack the name in the open request */
515 if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
516 !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
518 len = de->d_name.len;
519 name = kmalloc(len + 1, GFP_NOFS);
524 spin_lock(&de->d_lock);
525 if (len != de->d_name.len) {
526 spin_unlock(&de->d_lock);
530 memcpy(name, de->d_name.name, len);
532 spin_unlock(&de->d_lock);
534 if (!lu_name_is_valid_2(name, len)) {
540 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
541 name, len, 0, LUSTRE_OPC_ANY, NULL);
542 if (IS_ERR(op_data)) {
544 RETURN(PTR_ERR(op_data));
546 op_data->op_data = lmm;
547 op_data->op_data_size = lmmsize;
549 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
550 &ll_md_blocking_ast, 0);
552 ll_finish_md_op_data(op_data);
554 /* the reason for keeping our own exit path is to avoid flooding the log
555 * with -ESTALE error messages.
557 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
558 it_open_error(DISP_OPEN_OPEN, itp))
560 ll_release_openhandle(de, itp);
564 if (it_disposition(itp, DISP_LOOKUP_NEG))
565 GOTO(out, rc = -ENOENT);
567 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
568 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
569 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
573 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
575 if (!rc && itp->it_lock_mode) {
576 ll_dom_finish_open(de->d_inode, req, itp);
577 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
581 ptlrpc_req_finished(req);
582 ll_intent_drop_lock(itp);
584 /* We did the open by FID, but by the time we got to the server,
585 * the object had disappeared. If this is a create, we cannot really
586 * tell userspace that the file it was trying to create
587 * does not exist. Instead let's return -ESTALE, and the VFS will
588 * retry the create with LOOKUP_REVAL, which we are going to catch
589 * in ll_revalidate_dentry() and use lookup then.
591 if (rc == -ENOENT && itp->it_op & IT_CREAT)
597 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
598 struct obd_client_handle *och)
600 struct mdt_body *body;
602 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
603 och->och_open_handle = body->mbo_open_handle;
604 och->och_fid = body->mbo_fid1;
605 och->och_lease_handle.cookie = it->it_lock_handle;
606 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
607 och->och_flags = it->it_flags;
609 return md_set_open_replay_data(md_exp, och, it);
612 static int ll_local_open(struct file *file, struct lookup_intent *it,
613 struct ll_file_data *fd, struct obd_client_handle *och)
615 struct inode *inode = file_inode(file);
618 LASSERT(!LUSTRE_FPRIVATE(file));
625 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
630 LUSTRE_FPRIVATE(file) = fd;
631 ll_readahead_init(inode, &fd->fd_ras);
632 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
634 /* ll_cl_context initialize */
635 rwlock_init(&fd->fd_lock);
636 INIT_LIST_HEAD(&fd->fd_lccs);
641 /* Open a file, and (for the very first open) create objects on the OSTs at
642 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
643 * creation or open until ll_lov_setstripe() ioctl is called.
645 * If we already have the stripe MD locally then we don't request it in
646 * md_open(), by passing a lmm_size = 0.
648 * It is up to the application to ensure no other processes open this file
649 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
650 * used. We might be able to avoid races of that sort by getting lli_open_sem
651 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
652 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
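 *
 * Illustrative userspace sketch (an assumption, not part of this file): the
 * delayed-create pattern described above is roughly
 *
 *	fd = open(path, O_RDWR | O_LOV_DELAY_CREATE);
 *	ioctl(fd, LL_IOC_LOV_SETSTRIPE, &lum);    set striping, create objects
 *
 * after which regular I/O can proceed on fd.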
654 int ll_file_open(struct inode *inode, struct file *file)
656 struct ll_inode_info *lli = ll_i2info(inode);
657 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
658 .it_flags = file->f_flags };
659 struct obd_client_handle **och_p = NULL;
660 __u64 *och_usecount = NULL;
661 struct ll_file_data *fd;
665 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
666 PFID(ll_inode2fid(inode)), inode, file->f_flags);
668 it = file->private_data; /* XXX: compat macro */
669 file->private_data = NULL; /* prevent ll_local_open assertion */
671 fd = ll_file_data_get();
673 GOTO(out_nofiledata, rc = -ENOMEM);
676 if (S_ISDIR(inode->i_mode))
677 ll_authorize_statahead(inode, fd);
679 if (inode->i_sb->s_root == file_dentry(file)) {
680 LUSTRE_FPRIVATE(file) = fd;
684 if (!it || !it->it_disposition) {
685 /* Convert f_flags into access mode. We cannot use file->f_mode,
686 * because everything but the O_ACCMODE mask was stripped from it */
688 if ((oit.it_flags + 1) & O_ACCMODE)
690 if (file->f_flags & O_TRUNC)
691 oit.it_flags |= FMODE_WRITE;
693 /* The kernel only calls f_op->open in dentry_open(). filp_open() calls
694 * dentry_open() after a call to open_namei() that checks permissions.
695 * Only nfsd_open() calls dentry_open() directly without checking
696 * permissions, and because of that the code below is safe.
698 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
699 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
701 /* We do not want O_EXCL here, presumably we opened the file
702 * already? XXX - NFS implications? */
703 oit.it_flags &= ~O_EXCL;
705 /* bug20584, if "it_flags" contains O_CREAT, the file will be
706 * created if necessary, then "IT_CREAT" should be set to keep
707 * consistent with it */
708 if (oit.it_flags & O_CREAT)
709 oit.it_op |= IT_CREAT;
715 /* Let's see if we have file open on MDS already. */
716 if (it->it_flags & FMODE_WRITE) {
717 och_p = &lli->lli_mds_write_och;
718 och_usecount = &lli->lli_open_fd_write_count;
719 } else if (it->it_flags & FMODE_EXEC) {
720 och_p = &lli->lli_mds_exec_och;
721 och_usecount = &lli->lli_open_fd_exec_count;
723 och_p = &lli->lli_mds_read_och;
724 och_usecount = &lli->lli_open_fd_read_count;
727 mutex_lock(&lli->lli_och_mutex);
728 if (*och_p) { /* Open handle is present */
729 if (it_disposition(it, DISP_OPEN_OPEN)) {
730 /* Well, there's an extra open request that we do not need,
731 so let's close it somehow. This will decref the request. */
732 rc = it_open_error(DISP_OPEN_OPEN, it);
734 mutex_unlock(&lli->lli_och_mutex);
735 GOTO(out_openerr, rc);
738 ll_release_openhandle(file_dentry(file), it);
742 rc = ll_local_open(file, it, fd, NULL);
745 mutex_unlock(&lli->lli_och_mutex);
746 GOTO(out_openerr, rc);
749 LASSERT(*och_usecount == 0);
750 if (!it->it_disposition) {
751 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
752 /* We cannot just request a lock handle now; the new ELC code
753 means that one of the other OPEN locks for this file
754 could be cancelled, and since the blocking AST handler
755 would attempt to grab och_mutex as well, that would
756 result in a deadlock */
757 mutex_unlock(&lli->lli_och_mutex);
759 * Normally called under two situations:
760 * 1. NFS export;
761 * 2. A race/condition on MDS resulting in no open
762 * handle to be returned from LOOKUP|OPEN request,
763 * for example if the target entry was a symlink.
765 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
766 * marked by a bit set in ll_iget_for_nfs. Clear the
767 * bit so that it's not confusing later callers.
769 * NB: when ldd is NULL, it must have come via the normal
770 * lookup path only, since ll_iget_for_nfs always calls
773 if (ldd && ldd->lld_nfs_dentry) {
774 ldd->lld_nfs_dentry = 0;
775 it->it_flags |= MDS_OPEN_LOCK;
779 * Always specify MDS_OPEN_BY_FID because we don't want
780 * to get a file with a different FID.
782 it->it_flags |= MDS_OPEN_BY_FID;
783 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
786 GOTO(out_openerr, rc);
790 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
792 GOTO(out_och_free, rc = -ENOMEM);
796 /* md_intent_lock() didn't get a request ref if there was an
797 * open error, so don't do cleanup on the request here
799 /* XXX (green): Shouldn't we bail out on any error here, not
800 * just open errors? */
801 rc = it_open_error(DISP_OPEN_OPEN, it);
803 GOTO(out_och_free, rc);
805 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
806 "inode %p: disposition %x, status %d\n", inode,
807 it_disposition(it, ~0), it->it_status);
809 rc = ll_local_open(file, it, fd, *och_p);
811 GOTO(out_och_free, rc);
813 mutex_unlock(&lli->lli_och_mutex);
816 /* Must do this outside the lli_och_mutex lock to prevent a deadlock where
817 a different kind of OPEN lock for this same inode gets cancelled
818 by ldlm_cancel_lru */
819 if (!S_ISREG(inode->i_mode))
820 GOTO(out_och_free, rc);
822 cl_lov_delay_create_clear(&file->f_flags);
823 GOTO(out_och_free, rc);
827 if (och_p && *och_p) {
828 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
829 *och_p = NULL; /* OBD_FREE writes some magic there */
832 mutex_unlock(&lli->lli_och_mutex);
835 if (lli->lli_opendir_key == fd)
836 ll_deauthorize_statahead(inode, fd);
838 ll_file_data_put(fd);
840 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
844 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
845 ptlrpc_req_finished(it->it_request);
846 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
852 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
853 struct ldlm_lock_desc *desc, void *data, int flag)
856 struct lustre_handle lockh;
860 case LDLM_CB_BLOCKING:
861 ldlm_lock2handle(lock, &lockh);
862 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
864 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
868 case LDLM_CB_CANCELING:
876 * When setting a lease on a file, we take ownership of the lli_mds_*_och
877 * and save it as fd->fd_och so as to force the client to reopen the file
878 * even if it already has an open lock in cache.
880 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
881 struct lustre_handle *old_open_handle)
883 struct ll_inode_info *lli = ll_i2info(inode);
884 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
885 struct obd_client_handle **och_p;
890 /* Get the openhandle of the file */
891 mutex_lock(&lli->lli_och_mutex);
892 if (fd->fd_lease_och != NULL)
893 GOTO(out_unlock, rc = -EBUSY);
895 if (fd->fd_och == NULL) {
896 if (file->f_mode & FMODE_WRITE) {
897 LASSERT(lli->lli_mds_write_och != NULL);
898 och_p = &lli->lli_mds_write_och;
899 och_usecount = &lli->lli_open_fd_write_count;
901 LASSERT(lli->lli_mds_read_och != NULL);
902 och_p = &lli->lli_mds_read_och;
903 och_usecount = &lli->lli_open_fd_read_count;
906 if (*och_usecount > 1)
907 GOTO(out_unlock, rc = -EBUSY);
914 *old_open_handle = fd->fd_och->och_open_handle;
918 mutex_unlock(&lli->lli_och_mutex);
923 * Release ownership on lli_mds_*_och when putting back a file lease.
925 static int ll_lease_och_release(struct inode *inode, struct file *file)
927 struct ll_inode_info *lli = ll_i2info(inode);
928 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
929 struct obd_client_handle **och_p;
930 struct obd_client_handle *old_och = NULL;
935 mutex_lock(&lli->lli_och_mutex);
936 if (file->f_mode & FMODE_WRITE) {
937 och_p = &lli->lli_mds_write_och;
938 och_usecount = &lli->lli_open_fd_write_count;
940 och_p = &lli->lli_mds_read_och;
941 och_usecount = &lli->lli_open_fd_read_count;
944 /* The file may have been opened by another process (broken lease), so
945 * *och_p is not NULL. In this case we should simply increase the usecount
948 if (*och_p != NULL) {
949 old_och = fd->fd_och;
956 mutex_unlock(&lli->lli_och_mutex);
959 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
965 * Acquire a lease and open the file.
967 static struct obd_client_handle *
968 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
971 struct lookup_intent it = { .it_op = IT_OPEN };
972 struct ll_sb_info *sbi = ll_i2sbi(inode);
973 struct md_op_data *op_data;
974 struct ptlrpc_request *req = NULL;
975 struct lustre_handle old_open_handle = { 0 };
976 struct obd_client_handle *och = NULL;
981 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
982 RETURN(ERR_PTR(-EINVAL));
985 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
986 RETURN(ERR_PTR(-EPERM));
988 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
995 RETURN(ERR_PTR(-ENOMEM));
997 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
998 LUSTRE_OPC_ANY, NULL);
1000 GOTO(out, rc = PTR_ERR(op_data));
1002 /* To tell the MDT this openhandle is from the same owner */
1003 op_data->op_open_handle = old_open_handle;
1005 it.it_flags = fmode | open_flags;
1006 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1007 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1008 &ll_md_blocking_lease_ast,
1009 /* LDLM_FL_NO_LRU: do not put the lease lock into the LRU list, otherwise
1010 * it can be cancelled, which may mislead applications into thinking the
1011 * lease is broken;
1012 * LDLM_FL_EXCL: set this flag so that it won't be matched by a normal
1013 * open in ll_md_blocking_ast(). Otherwise, as ll_md_blocking_lease_ast()
1014 * doesn't deal with the openhandle, a normal openhandle would be leaked. */
1015 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1016 ll_finish_md_op_data(op_data);
1017 ptlrpc_req_finished(req);
1019 GOTO(out_release_it, rc);
1021 if (it_disposition(&it, DISP_LOOKUP_NEG))
1022 GOTO(out_release_it, rc = -ENOENT);
1024 rc = it_open_error(DISP_OPEN_OPEN, &it);
1026 GOTO(out_release_it, rc);
1028 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1029 rc = ll_och_fill(sbi->ll_md_exp, &it, och);
1031 GOTO(out_release_it, rc);
1033 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1034 GOTO(out_close, rc = -EOPNOTSUPP);
1036 /* already got the lease, handle the lease lock */
1037 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1038 if (it.it_lock_mode == 0 ||
1039 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1040 /* an open lock must be returned for a lease */
1041 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1042 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1044 GOTO(out_close, rc = -EPROTO);
1047 ll_intent_release(&it);
1051 /* Cancel open lock */
1052 if (it.it_lock_mode != 0) {
1053 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1055 it.it_lock_mode = 0;
1056 och->och_lease_handle.cookie = 0ULL;
1058 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1060 CERROR("%s: error closing file "DFID": %d\n",
1061 ll_get_fsname(inode->i_sb, NULL, 0),
1062 PFID(&ll_i2info(inode)->lli_fid), rc2);
1063 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1065 ll_intent_release(&it);
1069 RETURN(ERR_PTR(rc));
1073 * Check whether a layout swap can be done between two inodes.
1075 * \param[in] inode1 First inode to check
1076 * \param[in] inode2 Second inode to check
1078 * \retval 0 on success, layout swap can be performed between both inodes
1079 * \retval negative error code if requirements are not met
1081 static int ll_check_swap_layouts_validity(struct inode *inode1,
1082 struct inode *inode2)
1084 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1087 if (inode_permission(inode1, MAY_WRITE) ||
1088 inode_permission(inode2, MAY_WRITE))
1091 if (inode1->i_sb != inode2->i_sb)
1097 static int ll_swap_layouts_close(struct obd_client_handle *och,
1098 struct inode *inode, struct inode *inode2)
1100 const struct lu_fid *fid1 = ll_inode2fid(inode);
1101 const struct lu_fid *fid2;
1105 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1106 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1108 rc = ll_check_swap_layouts_validity(inode, inode2);
1110 GOTO(out_free_och, rc);
1112 /* We now know that inode2 is a lustre inode */
1113 fid2 = ll_inode2fid(inode2);
1115 rc = lu_fid_cmp(fid1, fid2);
1117 GOTO(out_free_och, rc = -EINVAL);
1119 /* Close the file and {swap,merge} layouts between inode & inode2.
1120 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1121 * because we still need it to pack l_remote_handle to MDT. */
1122 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1125 och = NULL; /* freed in ll_close_inode_openhandle() */
1135 * Release the lease and close the file.
1136 * It will check whether the lease has ever been broken.
1138 static int ll_lease_close_intent(struct obd_client_handle *och,
1139 struct inode *inode,
1140 bool *lease_broken, enum mds_op_bias bias,
1143 struct ldlm_lock *lock;
1144 bool cancelled = true;
1148 lock = ldlm_handle2lock(&och->och_lease_handle);
1150 lock_res_and_lock(lock);
1151 cancelled = ldlm_is_cancel(lock);
1152 unlock_res_and_lock(lock);
1153 LDLM_LOCK_PUT(lock);
1156 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1157 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1159 if (lease_broken != NULL)
1160 *lease_broken = cancelled;
1162 if (!cancelled && !bias)
1163 ldlm_cli_cancel(&och->och_lease_handle, 0);
1165 if (cancelled) { /* no need to execute intent */
1170 rc = ll_close_inode_openhandle(inode, och, bias, data);
1174 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1177 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1181 * After a lease is taken, send the MDS_REINT_RESYNC RPC to the MDT
1183 static int ll_lease_file_resync(struct obd_client_handle *och,
1184 struct inode *inode, unsigned long arg)
1186 struct ll_sb_info *sbi = ll_i2sbi(inode);
1187 struct md_op_data *op_data;
1188 struct ll_ioc_lease_id ioc;
1189 __u64 data_version_unused;
1193 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1194 LUSTRE_OPC_ANY, NULL);
1195 if (IS_ERR(op_data))
1196 RETURN(PTR_ERR(op_data));
1198 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1202 /* before starting file resync, it's necessary to clean up page cache
1203 * in client memory, otherwise once the layout version is increased,
1204 * writing back cached data will be denied by the OSTs. */
1205 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1209 op_data->op_lease_handle = och->och_lease_handle;
1210 op_data->op_mirror_id = ioc.lil_mirror_id;
1211 rc = md_file_resync(sbi->ll_md_exp, op_data);
1217 ll_finish_md_op_data(op_data);
1221 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1223 struct ll_inode_info *lli = ll_i2info(inode);
1224 struct cl_object *obj = lli->lli_clob;
1225 struct cl_attr *attr = vvp_env_thread_attr(env);
1233 ll_inode_size_lock(inode);
1235 /* Merge timestamps the most recently obtained from MDS with
1236 * timestamps obtained from OSTs.
1238 * Do not overwrite atime of inode because it may be refreshed
1239 * by file_accessed() function. If the read was served by cache
1240 * data, there is no RPC to be sent so that atime may not be
1241 * transferred to OSTs at all. MDT only updates atime at close time
1242 * if it's at least 'mdd.*.atime_diff' older.
1243 * All in all, the atime in Lustre does not strictly comply with
1244 * POSIX. Solving this problem would require sending an RPC to the MDT
1245 * for each read, which would hurt performance.
1247 if (inode->i_atime.tv_sec < lli->lli_atime ||
1248 lli->lli_update_atime) {
1249 inode->i_atime.tv_sec = lli->lli_atime;
1250 lli->lli_update_atime = 0;
1252 inode->i_mtime.tv_sec = lli->lli_mtime;
1253 inode->i_ctime.tv_sec = lli->lli_ctime;
1255 mtime = inode->i_mtime.tv_sec;
1256 atime = inode->i_atime.tv_sec;
1257 ctime = inode->i_ctime.tv_sec;
1259 cl_object_attr_lock(obj);
1260 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1263 rc = cl_object_attr_get(env, obj, attr);
1264 cl_object_attr_unlock(obj);
1267 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
1269 if (atime < attr->cat_atime)
1270 atime = attr->cat_atime;
1272 if (ctime < attr->cat_ctime)
1273 ctime = attr->cat_ctime;
1275 if (mtime < attr->cat_mtime)
1276 mtime = attr->cat_mtime;
1278 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1279 PFID(&lli->lli_fid), attr->cat_size);
1281 i_size_write(inode, attr->cat_size);
1282 inode->i_blocks = attr->cat_blocks;
1284 inode->i_mtime.tv_sec = mtime;
1285 inode->i_atime.tv_sec = atime;
1286 inode->i_ctime.tv_sec = ctime;
1289 ll_inode_size_unlock(inode);
1295 * Set designated mirror for I/O.
1297 * So far only read, write, and truncate can issue I/O to a
1298 * designated mirror.
1300 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1302 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1304 /* clear the layout version for generic (non-resync) I/O in case it carries
1305 * a stale layout version due to an I/O restart */
1306 io->ci_layout_version = 0;
1308 /* FLR: disable non-delay for designated mirror I/O because obviously
1309 * only one mirror is available */
1310 if (fd->fd_designated_mirror > 0) {
1312 io->ci_designated_mirror = fd->fd_designated_mirror;
1313 io->ci_layout_version = fd->fd_layout_version;
1316 CDEBUG(D_VFSTRACE, "%s: desiginated mirror: %d\n",
1317 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
1320 static bool file_is_noatime(const struct file *file)
1322 const struct vfsmount *mnt = file->f_path.mnt;
1323 const struct inode *inode = file_inode((struct file *)file);
1325 /* Adapted from file_accessed() and touch_atime().*/
1326 if (file->f_flags & O_NOATIME)
1329 if (inode->i_flags & S_NOATIME)
1332 if (IS_NOATIME(inode))
1335 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1338 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1341 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1347 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1349 struct inode *inode = file_inode(file);
1350 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1352 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1353 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1355 if (iot == CIT_WRITE) {
1356 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
1357 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1358 file->f_flags & O_DIRECT ||
1361 io->ci_obj = ll_i2info(inode)->lli_clob;
1362 io->ci_lockreq = CILR_MAYBE;
1363 if (ll_file_nolock(file)) {
1364 io->ci_lockreq = CILR_NEVER;
1365 io->ci_no_srvlock = 1;
1366 } else if (file->f_flags & O_APPEND) {
1367 io->ci_lockreq = CILR_MANDATORY;
1369 io->ci_noatime = file_is_noatime(file);
1371 /* FLR: only use non-delay I/O for read as there is only one
1372 * available mirror for write. */
1373 io->ci_ndelay = !(iot == CIT_WRITE);
1375 ll_io_set_mirror(io, file);
1379 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1380 struct file *file, enum cl_io_type iot,
1381 loff_t *ppos, size_t count)
1383 struct vvp_io *vio = vvp_env_io(env);
1384 struct inode *inode = file_inode(file);
1385 struct ll_inode_info *lli = ll_i2info(inode);
1386 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1387 struct range_lock range;
1391 unsigned retried = 0;
1392 bool restarted = false;
1396 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1397 file_dentry(file)->d_name.name,
1398 iot == CIT_READ ? "read" : "write", *ppos, count);
1401 io = vvp_env_thread_io(env);
1402 ll_io_init(io, file, iot);
1403 io->ci_ndelay_tried = retried;
1405 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1406 bool range_locked = false;
1408 if (file->f_flags & O_APPEND)
1409 range_lock_init(&range, 0, LUSTRE_EOF);
1411 range_lock_init(&range, *ppos, *ppos + count - 1);
1413 vio->vui_fd = LUSTRE_FPRIVATE(file);
1414 vio->vui_io_subtype = args->via_io_subtype;
1416 switch (vio->vui_io_subtype) {
1418 vio->vui_iter = args->u.normal.via_iter;
1419 vio->vui_iocb = args->u.normal.via_iocb;
1420 /* Direct IO reads must also take range lock,
1421 * or multiple reads will try to work on the same pages.
1422 * See LU-6227 for details. */
1423 if (((iot == CIT_WRITE) ||
1424 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1425 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1426 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1428 rc = range_lock(&lli->lli_write_tree, &range);
1432 range_locked = true;
1436 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1437 vio->u.splice.vui_flags = args->u.splice.via_flags;
1440 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1444 ll_cl_add(file, env, io, LCC_RW);
1445 rc = cl_io_loop(env, io);
1446 ll_cl_remove(file, env);
1449 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1451 range_unlock(&lli->lli_write_tree, &range);
1454 /* cl_io_rw_init() handled IO */
1458 if (io->ci_nob > 0) {
1459 result += io->ci_nob;
1460 count -= io->ci_nob;
1461 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1463 /* prepare IO restart */
1464 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1465 args->u.normal.via_iter = vio->vui_iter;
1468 cl_io_fini(env, io);
1471 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1472 file->f_path.dentry->d_name.name,
1473 iot, rc, result, io->ci_need_restart);
1475 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1477 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1478 file_dentry(file)->d_name.name,
1479 iot == CIT_READ ? "read" : "write",
1480 *ppos, count, result, rc);
1481 /* preserve the tried count for FLR */
1482 retried = io->ci_ndelay_tried;
1487 if (iot == CIT_READ) {
1489 ll_stats_ops_tally(ll_i2sbi(inode),
1490 LPROC_LL_READ_BYTES, result);
1491 } else if (iot == CIT_WRITE) {
1493 ll_stats_ops_tally(ll_i2sbi(inode),
1494 LPROC_LL_WRITE_BYTES, result);
1495 fd->fd_write_failed = false;
1496 } else if (result == 0 && rc == 0) {
1499 fd->fd_write_failed = true;
1501 fd->fd_write_failed = false;
1502 } else if (rc != -ERESTARTSYS) {
1503 fd->fd_write_failed = true;
1507 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1509 RETURN(result > 0 ? result : rc);
1513 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1514 * especially for small I/O.
1516 * To serve a read request, CLIO has to create and initialize a cl_io and
1517 * then request DLM lock. This has turned out to have significant overhead
1518 * and affects the performance of small I/O dramatically.
1520 * It's not necessary to create a cl_io for each I/O. Under the help of read
1521 * ahead, most of the pages being read are already in memory cache and we can
1522 * read those pages directly because if the pages exist, the corresponding DLM
1523 * lock must exist so that page content must be valid.
1525 * In the fast read implementation, llite speculatively finds and reads pages
1526 * in the memory cache. There are three scenarios for fast read:
1527 * - If the page exists and is uptodate, the kernel VM will provide the data
1528 * and CLIO won't intervene;
1529 * - If the page was brought into memory by read ahead, it will be exported
1530 * and read ahead parameters will be updated;
1531 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1532 * it will go back and invoke normal read, i.e., a cl_io will be created
1533 * and DLM lock will be requested.
1535 * POSIX compliance: the POSIX standard states that read is intended to be atomic.
1536 * The Lustre read implementation is in line with the Linux kernel read
1537 * implementation and neither of them complies with the POSIX standard in this
1538 * matter. Fast read doesn't make the situation worse on a single node but it may
1539 * interleave write results from multiple nodes due to the short read handling in ll_file_aio_read().
1541 * \param env - lu_env
1542 * \param iocb - kiocb from kernel
1543 * \param iter - user space buffers where the data will be copied
1545 * \retval - number of bytes that have been read, or an error code if an error occurred.
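 *
 * Illustrative sketch (an assumption, mirroring the caller below): fast read
 * is tried first and the normal cl_io path is only entered when the request
 * cannot be fully served from the page cache:
 *
 *	result = ll_do_fast_read(iocb, to);
 *	if (result < 0 || iov_iter_count(to) == 0)
 *		return result;          everything came from the page cache
 *	... fall through to the cl_io based read ...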
1548 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1552 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1555 /* NB: we can't do direct IO for fast read because it will need a lock
1556 * to make IO engine happy. */
1557 if (iocb->ki_filp->f_flags & O_DIRECT)
1560 result = generic_file_read_iter(iocb, iter);
1562 /* If the first page is not in cache, generic_file_read_iter() will
1563 * return -ENODATA.
1564 * See the corresponding code in ll_readpage(). */
1565 if (result == -ENODATA)
1569 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1570 LPROC_LL_READ_BYTES, result);
1576 * Read from a file (through the page cache).
1578 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1581 struct vvp_io_args *args;
1582 struct file *file = iocb->ki_filp;
1587 if (!iov_iter_count(to))
1590 result = ll_do_fast_read(iocb, to);
1591 if (result < 0 || iov_iter_count(to) == 0)
1594 env = cl_env_get(&refcheck);
1596 return PTR_ERR(env);
1598 args = ll_env_args(env, IO_NORMAL);
1599 args->u.normal.via_iter = to;
1600 args->u.normal.via_iocb = iocb;
1602 rc2 = ll_file_io_generic(env, args, file, CIT_READ,
1603 &iocb->ki_pos, iov_iter_count(to));
1606 else if (result == 0)
1609 cl_env_put(env, &refcheck);
1612 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1613 LUSTRE_FPRIVATE(file), iocb->ki_pos, result,
1620 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1621 * If a page is already in the page cache and dirty (and some other things -
1622 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1623 * write to it without doing a full I/O, because Lustre already knows about it
1624 * and will write it out. This saves a lot of processing time.
1626 * All writes here are within one page, so exclusion is handled by the page
1627 * lock on the vm page. We do not do tiny writes for writes which touch
1628 * multiple pages because it's very unlikely that multiple sequential pages
1629 * are already dirty.
1631 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1632 * and are unlikely to be to already dirty pages.
1634 * Attribute updates are important here, we do them in ll_tiny_write_end.
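 *
 * Illustrative sketch (an assumption, mirroring ll_file_write_iter() below):
 * the tiny-write path is only attempted for small, cache-friendly writes and
 * the normal path handles whatever is left:
 *
 *	if (ll_sbi_has_tiny_write(sbi) &&
 *	    !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
 *		rc_tiny = ll_do_tiny_write(iocb, from);
 *	if (iov_iter_count(from) != 0)
 *		... fall back to the normal cl_io write ...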
1636 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1638 ssize_t count = iov_iter_count(iter);
1639 struct file *file = iocb->ki_filp;
1640 struct inode *inode = file_inode(file);
1641 bool lock_inode = !IS_NOSEC(inode);
1646 /* Restrict writes to a single page and < PAGE_SIZE. See the comment at
1647 * the top of the function for why.
1649 if (count >= PAGE_SIZE ||
1650 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1653 if (unlikely(lock_inode))
1655 result = __generic_file_write_iter(iocb, iter);
1657 if (unlikely(lock_inode))
1658 inode_unlock(inode);
1660 /* If the page is not already dirty, ll_tiny_write_begin returns
1661 * -ENODATA. We continue on to normal write.
1663 if (result == -ENODATA)
1667 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1669 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1672 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1678 * Write to a file (through the page cache).
1680 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1682 struct vvp_io_args *args;
1684 ssize_t rc_tiny = 0, rc_normal;
1685 struct file *file = iocb->ki_filp;
1690 if (!iov_iter_count(from))
1691 GOTO(out, rc_normal = 0);
1693 /* NB: we can't do direct IO for tiny writes because they use the page
1694 * cache, we can't do sync writes because tiny writes can't flush
1695 * pages, and we can't do append writes because we can't guarantee the
1696 * required DLM locks are held to protect file size.
1698 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(file))) &&
1699 !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1700 rc_tiny = ll_do_tiny_write(iocb, from);
1702 /* In case of error, go on and try the normal write; only stop if the tiny
1703 * write completed the I/O.
1705 if (iov_iter_count(from) == 0)
1706 GOTO(out, rc_normal = rc_tiny);
1708 env = cl_env_get(&refcheck);
1710 return PTR_ERR(env);
1712 args = ll_env_args(env, IO_NORMAL);
1713 args->u.normal.via_iter = from;
1714 args->u.normal.via_iocb = iocb;
1716 rc_normal = ll_file_io_generic(env, args, file, CIT_WRITE,
1717 &iocb->ki_pos, iov_iter_count(from));
1719 /* On success, combine bytes written. */
1720 if (rc_tiny >= 0 && rc_normal > 0)
1721 rc_normal += rc_tiny;
1722 /* On error, only return error from normal write if tiny write did not
1723 * write any bytes. Otherwise return bytes written by tiny write.
1725 else if (rc_tiny > 0)
1726 rc_normal = rc_tiny;
1728 cl_env_put(env, &refcheck);
1731 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1732 LUSTRE_FPRIVATE(file), iocb->ki_pos,
1737 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1739 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
1741 static int ll_file_get_iov_count(const struct iovec *iov,
1742 unsigned long *nr_segs, size_t *count)
1747 for (seg = 0; seg < *nr_segs; seg++) {
1748 const struct iovec *iv = &iov[seg];
1751 * If any segment has a negative length, or the cumulative
1752 * length ever wraps negative then return -EINVAL.
1755 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1757 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1762 cnt -= iv->iov_len; /* This segment is no good */
1769 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1770 unsigned long nr_segs, loff_t pos)
1777 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1784 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1785 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1786 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1787 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1788 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1790 result = ll_file_read_iter(iocb, &to);
1795 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1798 struct iovec iov = { .iov_base = buf, .iov_len = count };
1807 init_sync_kiocb(&kiocb, file);
1808 kiocb.ki_pos = *ppos;
1809 #ifdef HAVE_KIOCB_KI_LEFT
1810 kiocb.ki_left = count;
1811 #elif defined(HAVE_KI_NBYTES)
1812 kiocb.i_nbytes = count;
1815 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1816 *ppos = kiocb.ki_pos;
1822 * Write to a file (through the page cache).
1825 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1826 unsigned long nr_segs, loff_t pos)
1828 struct iov_iter from;
1833 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1840 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1841 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1842 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1843 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1844 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1846 result = ll_file_write_iter(iocb, &from);
1851 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1852 size_t count, loff_t *ppos)
1854 struct iovec iov = { .iov_base = (void __user *)buf,
1864 init_sync_kiocb(&kiocb, file);
1865 kiocb.ki_pos = *ppos;
1866 #ifdef HAVE_KIOCB_KI_LEFT
1867 kiocb.ki_left = count;
1868 #elif defined(HAVE_KI_NBYTES)
1869 kiocb.ki_nbytes = count;
1872 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1873 *ppos = kiocb.ki_pos;
1877 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1880 * Send file content (through pagecache) somewhere with helper
1882 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1883 struct pipe_inode_info *pipe, size_t count,
1887 struct vvp_io_args *args;
1892 env = cl_env_get(&refcheck);
1894 RETURN(PTR_ERR(env));
1896 args = ll_env_args(env, IO_SPLICE);
1897 args->u.splice.via_pipe = pipe;
1898 args->u.splice.via_flags = flags;
1900 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1901 cl_env_put(env, &refcheck);
1904 ll_rw_stats_tally(ll_i2sbi(file_inode(in_file)), current->pid,
1905 LUSTRE_FPRIVATE(in_file), *ppos, result,
1910 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1911 __u64 flags, struct lov_user_md *lum, int lum_size)
1913 struct lookup_intent oit = {
1915 .it_flags = flags | MDS_OPEN_BY_FID,
1920 if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) ==
1921 le32_to_cpu(LOV_MAGIC_MAGIC)) {
1922 /* this code will only exist for big-endian systems */
1923 lustre_swab_lov_user_md(lum, 0);
1926 ll_inode_size_lock(inode);
1927 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1929 GOTO(out_unlock, rc);
1931 ll_release_openhandle(dentry, &oit);
1934 ll_inode_size_unlock(inode);
1935 ll_intent_release(&oit);
1940 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1941 struct lov_mds_md **lmmp, int *lmm_size,
1942 struct ptlrpc_request **request)
1944 struct ll_sb_info *sbi = ll_i2sbi(inode);
1945 struct mdt_body *body;
1946 struct lov_mds_md *lmm = NULL;
1947 struct ptlrpc_request *req = NULL;
1948 struct md_op_data *op_data;
1951 rc = ll_get_default_mdsize(sbi, &lmmsize);
1955 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1956 strlen(filename), lmmsize,
1957 LUSTRE_OPC_ANY, NULL);
1958 if (IS_ERR(op_data))
1959 RETURN(PTR_ERR(op_data));
1961 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1962 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1963 ll_finish_md_op_data(op_data);
1965 CDEBUG(D_INFO, "md_getattr_name failed "
1966 "on %s: rc %d\n", filename, rc);
1970 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1971 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1973 lmmsize = body->mbo_eadatasize;
1975 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1977 GOTO(out, rc = -ENODATA);
1980 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
1981 LASSERT(lmm != NULL);
1983 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
1984 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
1985 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
1986 GOTO(out, rc = -EPROTO);
1989 * This is coming from the MDS, so is probably in
1990 * little endian. We convert it to host endian before
1991 * passing it to userspace.
1993 if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) ==
1994 __swab32(LOV_MAGIC_MAGIC)) {
1995 int stripe_count = 0;
1997 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
1998 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
1999 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2000 if (le32_to_cpu(lmm->lmm_pattern) &
2001 LOV_PATTERN_F_RELEASED)
2005 lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0);
2007 /* if the function is called for a directory, we should
2008 * avoid swabbing non-existent lsm objects */
2009 if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode))
2010 lustre_swab_lov_user_md_objects(
2011 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2013 else if (lmm->lmm_magic == LOV_MAGIC_V3 &&
2014 S_ISREG(body->mbo_mode))
2015 lustre_swab_lov_user_md_objects(
2016 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2022 *lmm_size = lmmsize;
2027 static int ll_lov_setea(struct inode *inode, struct file *file,
2030 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2031 struct lov_user_md *lump;
2032 int lum_size = sizeof(struct lov_user_md) +
2033 sizeof(struct lov_user_ost_data);
2037 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2040 OBD_ALLOC_LARGE(lump, lum_size);
2044 if (copy_from_user(lump, arg, lum_size))
2045 GOTO(out_lump, rc = -EFAULT);
2047 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2049 cl_lov_delay_create_clear(&file->f_flags);
2052 OBD_FREE_LARGE(lump, lum_size);
2056 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2063 env = cl_env_get(&refcheck);
2065 RETURN(PTR_ERR(env));
2067 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2068 cl_env_put(env, &refcheck);
2072 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2075 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2076 struct lov_user_md *klum;
2078 __u64 flags = FMODE_WRITE;
2081 rc = ll_copy_user_md(lum, &klum);
2086 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2091 rc = put_user(0, &lum->lmm_stripe_count);
2095 rc = ll_layout_refresh(inode, &gen);
2099 rc = ll_file_getstripe(inode, arg, lum_size);
2101 cl_lov_delay_create_clear(&file->f_flags);
2104 OBD_FREE_LARGE(klum, lum_size);
2109 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2111 struct ll_inode_info *lli = ll_i2info(inode);
2112 struct cl_object *obj = lli->lli_clob;
2113 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2114 struct ll_grouplock grouplock;
2119 CWARN("group id for group lock must not be 0\n");
2123 if (ll_file_nolock(file))
2124 RETURN(-EOPNOTSUPP);
2126 spin_lock(&lli->lli_lock);
2127 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2128 CWARN("group lock already existed with gid %lu\n",
2129 fd->fd_grouplock.lg_gid);
2130 spin_unlock(&lli->lli_lock);
2133 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2134 spin_unlock(&lli->lli_lock);
2137 * XXX: the group lock needs to protect all OST objects while PFL
2138 * can add new OST objects during the IO, so we'd better instantiate
2139 * all OST objects before taking the group lock.
2144 struct cl_layout cl = {
2145 .cl_is_composite = false,
2147 struct lu_extent ext = {
2149 .e_end = OBD_OBJECT_EOF,
2152 env = cl_env_get(&refcheck);
2154 RETURN(PTR_ERR(env));
2156 rc = cl_object_layout_get(env, obj, &cl);
2157 if (!rc && cl.cl_is_composite)
2158 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2161 cl_env_put(env, &refcheck);
2166 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2167 arg, (file->f_flags & O_NONBLOCK), &grouplock);
2171 spin_lock(&lli->lli_lock);
2172 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2173 spin_unlock(&lli->lli_lock);
2174 CERROR("another thread just won the race\n");
2175 cl_put_grouplock(&grouplock);
2179 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2180 fd->fd_grouplock = grouplock;
2181 spin_unlock(&lli->lli_lock);
2183 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
2187 static int ll_put_grouplock(struct inode *inode, struct file *file,
2190 struct ll_inode_info *lli = ll_i2info(inode);
2191 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2192 struct ll_grouplock grouplock;
2195 spin_lock(&lli->lli_lock);
2196 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2197 spin_unlock(&lli->lli_lock);
2198 CWARN("no group lock held\n");
2202 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2204 if (fd->fd_grouplock.lg_gid != arg) {
2205 CWARN("group lock %lu doesn't match current id %lu\n",
2206 arg, fd->fd_grouplock.lg_gid);
2207 spin_unlock(&lli->lli_lock);
2211 grouplock = fd->fd_grouplock;
2212 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2213 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2214 spin_unlock(&lli->lli_lock);
2216 cl_put_grouplock(&grouplock);
2217 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2222 * Close inode open handle
2224 * \param dentry [in] dentry which contains the inode
2225 * \param it [in,out] intent which contains open info and result
2228 * \retval <0 failure
2230 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2232 struct inode *inode = dentry->d_inode;
2233 struct obd_client_handle *och;
2239 /* Root ? Do nothing. */
2240 if (dentry->d_inode->i_sb->s_root == dentry)
2243 /* No open handle to close? Move away */
2244 if (!it_disposition(it, DISP_OPEN_OPEN))
2247 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2249 OBD_ALLOC(och, sizeof(*och));
2251 GOTO(out, rc = -ENOMEM);
2253 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2257 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2259 /* this one is in place of ll_file_open */
2260 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2261 ptlrpc_req_finished(it->it_request);
2262 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2268 * Get the size of the inode for which the FIEMAP mapping is requested.
2269 * Make the FIEMAP get_info call and return the result.
2270 * \param fiemap kernel buffer to hold extents
2271 * \param num_bytes kernel buffer size
2273 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2279 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2282 /* Checks for fiemap flags */
2283 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2284 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2288 /* Check for FIEMAP_FLAG_SYNC */
2289 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2290 rc = filemap_fdatawrite(inode->i_mapping);
2295 env = cl_env_get(&refcheck);
2297 RETURN(PTR_ERR(env));
2299 if (i_size_read(inode) == 0) {
2300 rc = ll_glimpse_size(inode);
2305 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2306 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2307 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2309 /* If the file size is 0, then there are no objects to map */
2310 if (fmkey.lfik_oa.o_size == 0) {
2311 fiemap->fm_mapped_extents = 0;
2315 fmkey.lfik_fiemap = *fiemap;
2317 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2318 &fmkey, fiemap, &num_bytes);
2320 cl_env_put(env, &refcheck);
2324 int ll_fid2path(struct inode *inode, void __user *arg)
2326 struct obd_export *exp = ll_i2mdexp(inode);
2327 const struct getinfo_fid2path __user *gfin = arg;
2329 struct getinfo_fid2path *gfout;
2335 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2336 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2339 /* Only need to get the buflen */
2340 if (get_user(pathlen, &gfin->gf_pathlen))
2343 if (pathlen > PATH_MAX)
2346 outsize = sizeof(*gfout) + pathlen;
2347 OBD_ALLOC(gfout, outsize);
2351 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2352 GOTO(gf_free, rc = -EFAULT);
2353 /* Append the root FID after gfout to let the MDT know the root FID so
2354 * that it can look up the correct path; this is mainly for filesets.
2355 * Old servers without fileset mount support will ignore this. */
2356 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2358 /* Call mdc_iocontrol */
2359 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2363 if (copy_to_user(arg, gfout, outsize))
2367 OBD_FREE(gfout, outsize);
2372 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2374 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2382 ioc->idv_version = 0;
2383 ioc->idv_layout_version = UINT_MAX;
2385 /* If no file object is initialized, consider its version to be 0. */
2389 env = cl_env_get(&refcheck);
2391 RETURN(PTR_ERR(env));
2393 io = vvp_env_thread_io(env);
2395 io->u.ci_data_version.dv_data_version = 0;
2396 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2397 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2400 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2401 result = cl_io_loop(env, io);
2403 result = io->ci_result;
2405 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2406 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2408 cl_io_fini(env, io);
2410 if (unlikely(io->ci_need_restart))
2413 cl_env_put(env, &refcheck);
2419 * Read the data_version for the inode.
2421 * This value is computed using the stripe object versions on the OSTs.
2422 * The version is computed using server-side locking.
2424 * @param flags whether to sync on the OST side:
2426 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2427 * LL_DV_WR_FLUSH: drop all cached pages, LCK_PW on OSTs
2429 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2431 struct ioc_data_version ioc = { .idv_flags = flags };
2434 rc = ll_ioc_data_version(inode, &ioc);
2436 *data_version = ioc.idv_version;
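/*
 * Usage sketch (illustrative only): user space reaches ll_data_version()
 * through the LL_IOC_DATA_VERSION ioctl handled in ll_file_ioctl() below
 * (wrapped by llapi_get_data_version() in liblustreapi):
 *
 *	struct ioc_data_version idv = { .idv_flags = LL_DV_RD_FLUSH };
 *	ioctl(fd, LL_IOC_DATA_VERSION, &idv);
 *	// idv.idv_version now holds the version computed on the OSTs
 */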
2442 * Trigger an HSM release request for the provided inode.
2444 int ll_hsm_release(struct inode *inode)
2447 struct obd_client_handle *och = NULL;
2448 __u64 data_version = 0;
2453 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2454 ll_get_fsname(inode->i_sb, NULL, 0),
2455 PFID(&ll_i2info(inode)->lli_fid));
2457 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2459 GOTO(out, rc = PTR_ERR(och));
2461 /* Grab latest data_version and [am]time values */
2462 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2466 env = cl_env_get(&refcheck);
2468 GOTO(out, rc = PTR_ERR(env));
2470 rc = ll_merge_attr(env, inode);
2471 cl_env_put(env, &refcheck);
2473 /* If an error happens, we have the wrong size for the file.
2479 /* Release the file.
2480 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2481 * we still need it to pack l_remote_handle to MDT. */
2482 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2488 if (och != NULL && !IS_ERR(och)) /* close the file */
2489 ll_lease_close(och, inode, NULL);
2494 struct ll_swap_stack {
2497 struct inode *inode1;
2498 struct inode *inode2;
2503 static int ll_swap_layouts(struct file *file1, struct file *file2,
2504 struct lustre_swap_layouts *lsl)
2506 struct mdc_swap_layouts msl;
2507 struct md_op_data *op_data;
2510 struct ll_swap_stack *llss = NULL;
2513 OBD_ALLOC_PTR(llss);
2517 llss->inode1 = file_inode(file1);
2518 llss->inode2 = file_inode(file2);
2520 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2524 /* we use two bools because they are easier to swap than two bits */
2525 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2526 llss->check_dv1 = true;
2528 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2529 llss->check_dv2 = true;
2531 /* we cannot use lsl->sl_dvX directly because we may swap them */
2532 llss->dv1 = lsl->sl_dv1;
2533 llss->dv2 = lsl->sl_dv2;
2535 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2536 if (rc == 0) /* same file, done! */
2539 if (rc < 0) { /* sequentialize it */
2540 swap(llss->inode1, llss->inode2);
2542 swap(llss->dv1, llss->dv2);
2543 swap(llss->check_dv1, llss->check_dv2);
2547 if (gid != 0) { /* application asks to flush dirty cache */
2548 rc = ll_get_grouplock(llss->inode1, file1, gid);
2552 rc = ll_get_grouplock(llss->inode2, file2, gid);
2554 ll_put_grouplock(llss->inode1, file1, gid);
2559 /* final check: before swapping the layouts, verify that the data
2560 * version has not changed (if requested) */
2561 if (llss->check_dv1) {
2562 rc = ll_data_version(llss->inode1, &dv, 0);
2565 if (dv != llss->dv1)
2566 GOTO(putgl, rc = -EAGAIN);
2569 if (llss->check_dv2) {
2570 rc = ll_data_version(llss->inode2, &dv, 0);
2573 if (dv != llss->dv2)
2574 GOTO(putgl, rc = -EAGAIN);
2577 /* struct md_op_data is used to send the swap args to the MDT;
2578 * only the flags are missing, so we pass struct mdc_swap_layouts
2579 * through md_op_data->op_data */
2580 /* flags from user space have to be converted before they are sent to
2581 * the server; no flags are sent today, they are only used on the client */
2584 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2585 0, LUSTRE_OPC_ANY, &msl);
2586 if (IS_ERR(op_data))
2587 GOTO(free, rc = PTR_ERR(op_data));
2589 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2590 sizeof(*op_data), op_data, NULL);
2591 ll_finish_md_op_data(op_data);
2598 ll_put_grouplock(llss->inode2, file2, gid);
2599 ll_put_grouplock(llss->inode1, file1, gid);
2609 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2611 struct obd_export *exp = ll_i2mdexp(inode);
2612 struct md_op_data *op_data;
2616 /* Detect out-of-range masks */
2617 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2620 /* Non-root users are forbidden from setting or clearing flags which
2621 * are NOT defined in HSM_USER_MASK. */
2622 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2623 !cfs_capable(CFS_CAP_SYS_ADMIN))
2626 if (!exp_connect_archive_id_array(exp)) {
2627 /* Detect an out-of-range archive id */
2628 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2629 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
2633 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2634 LUSTRE_OPC_ANY, hss);
2635 if (IS_ERR(op_data))
2636 RETURN(PTR_ERR(op_data));
2638 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2641 ll_finish_md_op_data(op_data);
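/*
 * Usage sketch (illustrative only): user space reaches this through the
 * LL_IOC_HSM_STATE_SET ioctl (handled in ll_file_ioctl() below), usually
 * via llapi_hsm_state_set() in liblustreapi.  For example, marking a file
 * as archived in archive 1:
 *
 *	struct hsm_state_set hss = {
 *		.hss_valid      = HSS_SETMASK | HSS_ARCHIVE_ID,
 *		.hss_setmask    = HS_ARCHIVED,
 *		.hss_archive_id = 1,		// assumed archive number
 *	};
 *	ioctl(fd, LL_IOC_HSM_STATE_SET, &hss);
 */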
2646 static int ll_hsm_import(struct inode *inode, struct file *file,
2647 struct hsm_user_import *hui)
2649 struct hsm_state_set *hss = NULL;
2650 struct iattr *attr = NULL;
2654 if (!S_ISREG(inode->i_mode))
2660 GOTO(out, rc = -ENOMEM);
2662 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2663 hss->hss_archive_id = hui->hui_archive_id;
2664 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2665 rc = ll_hsm_state_set(inode, hss);
2669 OBD_ALLOC_PTR(attr);
2671 GOTO(out, rc = -ENOMEM);
2673 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2674 attr->ia_mode |= S_IFREG;
2675 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2676 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2677 attr->ia_size = hui->hui_size;
2678 attr->ia_mtime.tv_sec = hui->hui_mtime;
2679 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2680 attr->ia_atime.tv_sec = hui->hui_atime;
2681 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2683 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2684 ATTR_UID | ATTR_GID |
2685 ATTR_MTIME | ATTR_MTIME_SET |
2686 ATTR_ATIME | ATTR_ATIME_SET;
2690 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2694 inode_unlock(inode);
2706 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2708 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2709 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
2712 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2714 struct inode *inode = file_inode(file);
2716 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2717 ATTR_MTIME | ATTR_MTIME_SET |
2720 .tv_sec = lfu->lfu_atime_sec,
2721 .tv_nsec = lfu->lfu_atime_nsec,
2724 .tv_sec = lfu->lfu_mtime_sec,
2725 .tv_nsec = lfu->lfu_mtime_nsec,
2728 .tv_sec = lfu->lfu_ctime_sec,
2729 .tv_nsec = lfu->lfu_ctime_nsec,
2735 if (!capable(CAP_SYS_ADMIN))
2738 if (!S_ISREG(inode->i_mode))
2742 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2744 inode_unlock(inode);
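/*
 * Usage sketch (illustrative only): this helper backs the LL_IOC_FUTIMES_3
 * ioctl (see ll_file_ioctl() below), which allows a CAP_SYS_ADMIN caller,
 * e.g. an HSM copytool, to restore atime, mtime and ctime together:
 *
 *	struct ll_futimes_3 lfu = {
 *		.lfu_atime_sec = at, .lfu_atime_nsec = 0,	// "at", "mt", "ct"
 *		.lfu_mtime_sec = mt, .lfu_mtime_nsec = 0,	// are assumed
 *		.lfu_ctime_sec = ct, .lfu_ctime_nsec = 0,	// timestamp values
 *	};
 *	ioctl(fd, LL_IOC_FUTIMES_3, &lfu);
 */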
2749 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2752 case MODE_READ_USER:
2754 case MODE_WRITE_USER:
2761 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2763 /* Used to allow the upper layers of the client to request an LDLM lock
2764 * without doing an actual read or write.
2766 * Used for ladvise lockahead to manually request specific locks.
2768 * \param[in] file file this ladvise lock request is on
2769 * \param[in] ladvise ladvise struct describing this lock request
2771 * \retval 0 success, no detailed result available (sync requests
2772 * and requests sent to the server [not handled locally]
2773 * cannot return detailed results)
2774 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2775 * see definitions for details.
2776 * \retval negative negative errno on error
2778 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2780 struct lu_env *env = NULL;
2781 struct cl_io *io = NULL;
2782 struct cl_lock *lock = NULL;
2783 struct cl_lock_descr *descr = NULL;
2784 struct dentry *dentry = file->f_path.dentry;
2785 struct inode *inode = dentry->d_inode;
2786 enum cl_lock_mode cl_mode;
2787 off_t start = ladvise->lla_start;
2788 off_t end = ladvise->lla_end;
2794 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2795 "start=%llu, end=%llu\n", dentry->d_name.len,
2796 dentry->d_name.name, dentry->d_inode,
2797 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2800 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2802 GOTO(out, result = cl_mode);
2804 /* Get IO environment */
2805 result = cl_io_get(inode, &env, &io, &refcheck);
2809 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2812 * nothing to do for this io. This currently happens when
2813 * stripe sub-objects are not yet created.
2815 result = io->ci_result;
2816 } else if (result == 0) {
2817 lock = vvp_env_lock(env);
2818 descr = &lock->cll_descr;
2820 descr->cld_obj = io->ci_obj;
2821 /* Convert byte offsets to pages */
2822 descr->cld_start = cl_index(io->ci_obj, start);
2823 descr->cld_end = cl_index(io->ci_obj, end);
2824 descr->cld_mode = cl_mode;
2825 /* CEF_MUST is used because we do not want to convert a
2826 * lockahead request to a lockless lock */
2827 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
2830 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2831 descr->cld_enq_flags |= CEF_SPECULATIVE;
2833 result = cl_lock_request(env, io, lock);
2835 /* On success, we need to release the lock */
2837 cl_lock_release(env, lock);
2839 cl_io_fini(env, io);
2840 cl_env_put(env, &refcheck);
2842 /* -ECANCELED indicates a matching lock with a different extent
2843 * was already present, and -EEXIST indicates a matching lock
2844 * on exactly the same extent was already present.
2845 * We convert them to positive values for userspace to make
2846 * recognizing true errors easier.
2847 * Note we can only return these detailed results on async requests,
2848 * as sync requests look the same as i/o requests for locking. */
2849 if (result == -ECANCELED)
2850 result = LLA_RESULT_DIFFERENT;
2851 else if (result == -EEXIST)
2852 result = LLA_RESULT_SAME;
2857 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
2859 static int ll_ladvise_sanity(struct inode *inode,
2860 struct llapi_lu_ladvise *ladvise)
2862 enum lu_ladvise_type advice = ladvise->lla_advice;
2863 /* Note lla_peradvice_flags is a 32-bit field, so per-advice flags must
2864 * be in the first 32 bits of enum ladvise_flags */
2865 __u32 flags = ladvise->lla_peradvice_flags;
2866 /* 3 lines at 80 characters per line, should be plenty */
2869 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2871 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized, "
2872 "last supported advice is %s (value '%d'): rc = %d\n",
2873 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2874 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2878 /* Per-advice checks */
2880 case LU_LADVISE_LOCKNOEXPAND:
2881 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2883 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2885 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2886 ladvise_names[advice], rc);
2890 case LU_LADVISE_LOCKAHEAD:
2891 /* Currently only READ and WRITE modes can be requested */
2892 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2893 ladvise->lla_lockahead_mode == 0) {
2895 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2897 ll_get_fsname(inode->i_sb, NULL, 0),
2898 ladvise->lla_lockahead_mode,
2899 ladvise_names[advice], rc);
2902 case LU_LADVISE_WILLREAD:
2903 case LU_LADVISE_DONTNEED:
2905 /* Note fall through above - These checks apply to all advices
2906 * except LOCKNOEXPAND */
2907 if (flags & ~LF_DEFAULT_MASK) {
2909 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2911 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2912 ladvise_names[advice], rc);
2915 if (ladvise->lla_start >= ladvise->lla_end) {
2917 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2918 "for %s: rc = %d\n",
2919 ll_get_fsname(inode->i_sb, NULL, 0),
2920 ladvise->lla_start, ladvise->lla_end,
2921 ladvise_names[advice], rc);
2933 * Give file access advice
2935 * The ladvise interface is similar to the Linux fadvise() system call, except
2936 * it forwards the advice directly from the Lustre client to the server. The
2937 * server-side code will apply appropriate read-ahead and caching techniques
2938 * for the corresponding files.
2940 * A typical workload for ladvise is e.g. a bunch of different clients doing
2941 * small random reads of a file, so prefetching pages into OSS cache with
2942 * big linear reads before the random IO is a net benefit. Fetching all
2943 * that data into each client cache with fadvise() may not be, due to
2944 * much more data being sent to the client.
2946 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2947 struct llapi_lu_ladvise *ladvise)
2951 struct cl_ladvise_io *lio;
2956 env = cl_env_get(&refcheck);
2958 RETURN(PTR_ERR(env));
2960 io = vvp_env_thread_io(env);
2961 io->ci_obj = ll_i2info(inode)->lli_clob;
2963 /* initialize parameters for ladvise */
2964 lio = &io->u.ci_ladvise;
2965 lio->li_start = ladvise->lla_start;
2966 lio->li_end = ladvise->lla_end;
2967 lio->li_fid = ll_inode2fid(inode);
2968 lio->li_advice = ladvise->lla_advice;
2969 lio->li_flags = flags;
2971 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2972 rc = cl_io_loop(env, io);
2976 cl_io_fini(env, io);
2977 cl_env_put(env, &refcheck);
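/*
 * Usage sketch (illustrative only): ll_ladvise() is reached from the
 * LL_IOC_LADVISE ioctl in ll_file_ioctl() below, which takes a
 * struct llapi_ladvise_hdr followed by an array of advices.  A hedged
 * user-space example asking the server to prefetch a range into OSS cache
 * (normally issued via llapi_ladvise() in liblustreapi):
 *
 *	struct llapi_lu_ladvise adv = {
 *		.lla_advice = LU_LADVISE_WILLREAD,
 *		.lla_start  = 0,
 *		.lla_end    = 1 << 20,		// assumed 1 MiB range
 *	};
 *	// embed adv in a struct llapi_ladvise_hdr (lah_magic, lah_count = 1)
 *	// and pass the header to ioctl(fd, LL_IOC_LADVISE, &hdr)
 */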
2981 static int ll_lock_noexpand(struct file *file, int flags)
2983 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2985 fd->ll_lock_no_expand = !(flags & LF_UNSET);
2990 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
2993 struct fsxattr fsxattr;
2995 if (copy_from_user(&fsxattr,
2996 (const struct fsxattr __user *)arg,
3000 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3001 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3002 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3003 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3004 if (copy_to_user((struct fsxattr __user *)arg,
3005 &fsxattr, sizeof(fsxattr)))
3011 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3014 * Project Quota ID state is only allowed to change from within the init
3015 * namespace. Enforce that restriction only if we are trying to change
3016 * the quota ID state. Everything else is allowed in user namespaces.
3018 if (current_user_ns() == &init_user_ns)
3021 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3024 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3025 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3028 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
3035 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3039 struct md_op_data *op_data;
3040 struct ptlrpc_request *req = NULL;
3042 struct fsxattr fsxattr;
3043 struct cl_object *obj;
3047 if (copy_from_user(&fsxattr,
3048 (const struct fsxattr __user *)arg,
3052 rc = ll_ioctl_check_project(inode, &fsxattr);
3056 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3057 LUSTRE_OPC_ANY, NULL);
3058 if (IS_ERR(op_data))
3059 RETURN(PTR_ERR(op_data));
3061 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3062 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3063 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3064 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3065 op_data->op_projid = fsxattr.fsx_projid;
3066 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3067 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3069 ptlrpc_req_finished(req);
3071 GOTO(out_fsxattr, rc);
3072 ll_update_inode_flags(inode, op_data->op_attr_flags);
3073 obj = ll_i2info(inode)->lli_clob;
3075 GOTO(out_fsxattr, rc);
3077 OBD_ALLOC_PTR(attr);
3079 GOTO(out_fsxattr, rc = -ENOMEM);
3081 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3082 fsxattr.fsx_xflags);
3085 ll_finish_md_op_data(op_data);
3089 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3092 struct inode *inode = file_inode(file);
3093 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3094 struct ll_inode_info *lli = ll_i2info(inode);
3095 struct obd_client_handle *och = NULL;
3096 struct split_param sp;
3099 enum mds_op_bias bias = 0;
3100 struct file *layout_file = NULL;
3102 size_t data_size = 0;
3106 mutex_lock(&lli->lli_och_mutex);
3107 if (fd->fd_lease_och != NULL) {
3108 och = fd->fd_lease_och;
3109 fd->fd_lease_och = NULL;
3111 mutex_unlock(&lli->lli_och_mutex);
3114 GOTO(out, rc = -ENOLCK);
3116 fmode = och->och_flags;
3118 switch (ioc->lil_flags) {
3119 case LL_LEASE_RESYNC_DONE:
3120 if (ioc->lil_count > IOC_IDS_MAX)
3121 GOTO(out, rc = -EINVAL);
3123 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3124 OBD_ALLOC(data, data_size);
3126 GOTO(out, rc = -ENOMEM);
3128 if (copy_from_user(data, (void __user *)arg, data_size))
3129 GOTO(out, rc = -EFAULT);
3131 bias = MDS_CLOSE_RESYNC_DONE;
3133 case LL_LEASE_LAYOUT_MERGE: {
3136 if (ioc->lil_count != 1)
3137 GOTO(out, rc = -EINVAL);
3139 arg += sizeof(*ioc);
3140 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3141 GOTO(out, rc = -EFAULT);
3143 layout_file = fget(fd);
3145 GOTO(out, rc = -EBADF);
3147 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3148 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3149 GOTO(out, rc = -EPERM);
3151 data = file_inode(layout_file);
3152 bias = MDS_CLOSE_LAYOUT_MERGE;
3155 case LL_LEASE_LAYOUT_SPLIT: {
3159 if (ioc->lil_count != 2)
3160 GOTO(out, rc = -EINVAL);
3162 arg += sizeof(*ioc);
3163 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3164 GOTO(out, rc = -EFAULT);
3166 arg += sizeof(__u32);
3167 if (copy_from_user(&mirror_id, (void __user *)arg,
3169 GOTO(out, rc = -EFAULT);
3171 layout_file = fget(fdv);
3173 GOTO(out, rc = -EBADF);
3175 sp.sp_inode = file_inode(layout_file);
3176 sp.sp_mirror_id = (__u16)mirror_id;
3178 bias = MDS_CLOSE_LAYOUT_SPLIT;
3182 /* without close intent */
3186 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3190 rc = ll_lease_och_release(inode, file);
3199 switch (ioc->lil_flags) {
3200 case LL_LEASE_RESYNC_DONE:
3202 OBD_FREE(data, data_size);
3204 case LL_LEASE_LAYOUT_MERGE:
3205 case LL_LEASE_LAYOUT_SPLIT:
3212 rc = ll_lease_type_from_fmode(fmode);
3216 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3219 struct inode *inode = file_inode(file);
3220 struct ll_inode_info *lli = ll_i2info(inode);
3221 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3222 struct obd_client_handle *och = NULL;
3223 __u64 open_flags = 0;
3229 switch (ioc->lil_mode) {
3230 case LL_LEASE_WRLCK:
3231 if (!(file->f_mode & FMODE_WRITE))
3233 fmode = FMODE_WRITE;
3235 case LL_LEASE_RDLCK:
3236 if (!(file->f_mode & FMODE_READ))
3240 case LL_LEASE_UNLCK:
3241 RETURN(ll_file_unlock_lease(file, ioc, arg));
3246 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3248 /* apply for lease */
3249 if (ioc->lil_flags & LL_LEASE_RESYNC)
3250 open_flags = MDS_OPEN_RESYNC;
3251 och = ll_lease_open(inode, file, fmode, open_flags);
3253 RETURN(PTR_ERR(och));
3255 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3256 rc = ll_lease_file_resync(och, inode, arg);
3258 ll_lease_close(och, inode, NULL);
3261 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3263 ll_lease_close(och, inode, NULL);
3269 mutex_lock(&lli->lli_och_mutex);
3270 if (fd->fd_lease_och == NULL) {
3271 fd->fd_lease_och = och;
3274 mutex_unlock(&lli->lli_och_mutex);
3276 /* impossible since only exclusive leases are supported for now */
3277 ll_lease_close(och, inode, &lease_broken);
3284 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3286 struct inode *inode = file_inode(file);
3287 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3291 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3292 PFID(ll_inode2fid(inode)), inode, cmd);
3293 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3295 /* asm-ppc{,64} declares TCGETS, et al. as type 't' not 'T' */
3296 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3300 case LL_IOC_GETFLAGS:
3301 /* Get the current value of the file flags */
3302 return put_user(fd->fd_flags, (int __user *)arg);
3303 case LL_IOC_SETFLAGS:
3304 case LL_IOC_CLRFLAGS:
3305 /* Set or clear specific file flags */
3306 /* XXX This probably needs checks to ensure the flags are
3307 * not abused, and to handle any flag side effects.
3309 if (get_user(flags, (int __user *) arg))
3312 if (cmd == LL_IOC_SETFLAGS) {
3313 if ((flags & LL_FILE_IGNORE_LOCK) &&
3314 !(file->f_flags & O_DIRECT)) {
3315 CERROR("%s: unable to disable locking on "
3316 "non-O_DIRECT file\n", current->comm);
3320 fd->fd_flags |= flags;
3322 fd->fd_flags &= ~flags;
3325 case LL_IOC_LOV_SETSTRIPE:
3326 case LL_IOC_LOV_SETSTRIPE_NEW:
3327 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3328 case LL_IOC_LOV_SETEA:
3329 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3330 case LL_IOC_LOV_SWAP_LAYOUTS: {
3332 struct lustre_swap_layouts lsl;
3334 if (copy_from_user(&lsl, (char __user *)arg,
3335 sizeof(struct lustre_swap_layouts)))
3338 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3341 file2 = fget(lsl.sl_fd);
3345 /* O_WRONLY or O_RDWR */
3346 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3347 GOTO(out, rc = -EPERM);
3349 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3350 struct inode *inode2;
3351 struct ll_inode_info *lli;
3352 struct obd_client_handle *och = NULL;
3354 lli = ll_i2info(inode);
3355 mutex_lock(&lli->lli_och_mutex);
3356 if (fd->fd_lease_och != NULL) {
3357 och = fd->fd_lease_och;
3358 fd->fd_lease_och = NULL;
3360 mutex_unlock(&lli->lli_och_mutex);
3362 GOTO(out, rc = -ENOLCK);
3363 inode2 = file_inode(file2);
3364 rc = ll_swap_layouts_close(och, inode, inode2);
3366 rc = ll_swap_layouts(file, file2, &lsl);
3372 case LL_IOC_LOV_GETSTRIPE:
3373 case LL_IOC_LOV_GETSTRIPE_NEW:
3374 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3375 case FS_IOC_GETFLAGS:
3376 case FS_IOC_SETFLAGS:
3377 RETURN(ll_iocontrol(inode, file, cmd, arg));
3378 case FSFILT_IOC_GETVERSION:
3379 case FS_IOC_GETVERSION:
3380 RETURN(put_user(inode->i_generation, (int __user *)arg));
3381 /* We need to special case any other ioctls we want to handle,
3382 * to send them to the MDS/OST as appropriate and to properly
3383 * network encode the arg field. */
3384 case FS_IOC_SETVERSION:
3387 case LL_IOC_GROUP_LOCK:
3388 RETURN(ll_get_grouplock(inode, file, arg));
3389 case LL_IOC_GROUP_UNLOCK:
3390 RETURN(ll_put_grouplock(inode, file, arg));
3391 case IOC_OBD_STATFS:
3392 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3394 case LL_IOC_FLUSHCTX:
3395 RETURN(ll_flush_ctx(inode));
3396 case LL_IOC_PATH2FID: {
3397 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3398 sizeof(struct lu_fid)))
3403 case LL_IOC_GETPARENT:
3404 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3406 case OBD_IOC_FID2PATH:
3407 RETURN(ll_fid2path(inode, (void __user *)arg));
3408 case LL_IOC_DATA_VERSION: {
3409 struct ioc_data_version idv;
3412 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
3415 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3416 rc = ll_ioc_data_version(inode, &idv);
3419 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3425 case LL_IOC_GET_MDTIDX: {
3428 mdtidx = ll_get_mdt_idx(inode);
3432 if (put_user((int)mdtidx, (int __user *)arg))
3437 case OBD_IOC_GETDTNAME:
3438 case OBD_IOC_GETMDNAME:
3439 RETURN(ll_get_obd_name(inode, cmd, arg));
3440 case LL_IOC_HSM_STATE_GET: {
3441 struct md_op_data *op_data;
3442 struct hsm_user_state *hus;
3449 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3450 LUSTRE_OPC_ANY, hus);
3451 if (IS_ERR(op_data)) {
3453 RETURN(PTR_ERR(op_data));
3456 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3459 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3462 ll_finish_md_op_data(op_data);
3466 case LL_IOC_HSM_STATE_SET: {
3467 struct hsm_state_set *hss;
3474 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3479 rc = ll_hsm_state_set(inode, hss);
3484 case LL_IOC_HSM_ACTION: {
3485 struct md_op_data *op_data;
3486 struct hsm_current_action *hca;
3493 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3494 LUSTRE_OPC_ANY, hca);
3495 if (IS_ERR(op_data)) {
3497 RETURN(PTR_ERR(op_data));
3500 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3503 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3506 ll_finish_md_op_data(op_data);
3510 case LL_IOC_SET_LEASE_OLD: {
3511 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3513 RETURN(ll_file_set_lease(file, &ioc, 0));
3515 case LL_IOC_SET_LEASE: {
3516 struct ll_ioc_lease ioc;
3518 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3521 RETURN(ll_file_set_lease(file, &ioc, arg));
3523 case LL_IOC_GET_LEASE: {
3524 struct ll_inode_info *lli = ll_i2info(inode);
3525 struct ldlm_lock *lock = NULL;
3528 mutex_lock(&lli->lli_och_mutex);
3529 if (fd->fd_lease_och != NULL) {
3530 struct obd_client_handle *och = fd->fd_lease_och;
3532 lock = ldlm_handle2lock(&och->och_lease_handle);
3534 lock_res_and_lock(lock);
3535 if (!ldlm_is_cancel(lock))
3536 fmode = och->och_flags;
3538 unlock_res_and_lock(lock);
3539 LDLM_LOCK_PUT(lock);
3542 mutex_unlock(&lli->lli_och_mutex);
3544 RETURN(ll_lease_type_from_fmode(fmode));
3546 case LL_IOC_HSM_IMPORT: {
3547 struct hsm_user_import *hui;
3553 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3558 rc = ll_hsm_import(inode, file, hui);
3563 case LL_IOC_FUTIMES_3: {
3564 struct ll_futimes_3 lfu;
3566 if (copy_from_user(&lfu,
3567 (const struct ll_futimes_3 __user *)arg,
3571 RETURN(ll_file_futimes_3(file, &lfu));
3573 case LL_IOC_LADVISE: {
3574 struct llapi_ladvise_hdr *k_ladvise_hdr;
3575 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3578 int alloc_size = sizeof(*k_ladvise_hdr);
3581 u_ladvise_hdr = (void __user *)arg;
3582 OBD_ALLOC_PTR(k_ladvise_hdr);
3583 if (k_ladvise_hdr == NULL)
3586 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3587 GOTO(out_ladvise, rc = -EFAULT);
3589 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3590 k_ladvise_hdr->lah_count < 1)
3591 GOTO(out_ladvise, rc = -EINVAL);
3593 num_advise = k_ladvise_hdr->lah_count;
3594 if (num_advise >= LAH_COUNT_MAX)
3595 GOTO(out_ladvise, rc = -EFBIG);
3597 OBD_FREE_PTR(k_ladvise_hdr);
3598 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3599 lah_advise[num_advise]);
3600 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3601 if (k_ladvise_hdr == NULL)
3605 * TODO: submit multiple advices to one server in a single RPC
3607 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3608 GOTO(out_ladvise, rc = -EFAULT);
3610 for (i = 0; i < num_advise; i++) {
3611 struct llapi_lu_ladvise *k_ladvise =
3612 &k_ladvise_hdr->lah_advise[i];
3613 struct llapi_lu_ladvise __user *u_ladvise =
3614 &u_ladvise_hdr->lah_advise[i];
3616 rc = ll_ladvise_sanity(inode, k_ladvise);
3618 GOTO(out_ladvise, rc);
3620 switch (k_ladvise->lla_advice) {
3621 case LU_LADVISE_LOCKNOEXPAND:
3622 rc = ll_lock_noexpand(file,
3623 k_ladvise->lla_peradvice_flags);
3624 GOTO(out_ladvise, rc);
3625 case LU_LADVISE_LOCKAHEAD:
3627 rc = ll_file_lock_ahead(file, k_ladvise);
3630 GOTO(out_ladvise, rc);
3633 &u_ladvise->lla_lockahead_result))
3634 GOTO(out_ladvise, rc = -EFAULT);
3637 rc = ll_ladvise(inode, file,
3638 k_ladvise_hdr->lah_flags,
3641 GOTO(out_ladvise, rc);
3648 OBD_FREE(k_ladvise_hdr, alloc_size);
3651 case LL_IOC_FLR_SET_MIRROR: {
3652 /* mirror I/O must be direct to avoid polluting page cache
3654 if (!(file->f_flags & O_DIRECT))
3657 fd->fd_designated_mirror = (__u32)arg;
3660 case LL_IOC_FSGETXATTR:
3661 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3662 case LL_IOC_FSSETXATTR:
3663 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3665 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3667 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3668 (void __user *)arg));
3672 #ifndef HAVE_FILE_LLSEEK_SIZE
3673 static inline loff_t
3674 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3676 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3678 if (offset > maxsize)
3681 if (offset != file->f_pos) {
3682 file->f_pos = offset;
3683 file->f_version = 0;
3689 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3690 loff_t maxsize, loff_t eof)
3692 struct inode *inode = file_inode(file);
3700 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3701 * position-querying operation. Avoid rewriting the "same"
3702 * f_pos value back to the file because a concurrent read(),
3703 * write() or lseek() might have altered it
3708 * f_lock protects against read/modify/write race with other
3709 * SEEK_CURs. Note that parallel writes and reads behave
3713 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3714 inode_unlock(inode);
3718 * In the generic case the entire file is data, so as long as
3719 * offset isn't at the end of the file then the offset is data.
3726 * There is a virtual hole at the end of the file, so as long as
3727 * offset isn't i_size or larger, return i_size.
3735 return llseek_execute(file, offset, maxsize);
3739 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3741 struct inode *inode = file_inode(file);
3742 loff_t retval, eof = 0;
3745 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3746 (origin == SEEK_CUR) ? file->f_pos : 0);
3747 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3748 PFID(ll_inode2fid(inode)), inode, retval, retval,
3750 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3752 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3753 retval = ll_glimpse_size(inode);
3756 eof = i_size_read(inode);
3759 retval = ll_generic_file_llseek_size(file, offset, origin,
3760 ll_file_maxbytes(inode), eof);
3764 static int ll_flush(struct file *file, fl_owner_t id)
3766 struct inode *inode = file_inode(file);
3767 struct ll_inode_info *lli = ll_i2info(inode);
3768 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3771 LASSERT(!S_ISDIR(inode->i_mode));
3773 /* catch async errors that were recorded back when async writeback
3774 * failed for pages in this mapping. */
3775 rc = lli->lli_async_rc;
3776 lli->lli_async_rc = 0;
3777 if (lli->lli_clob != NULL) {
3778 err = lov_read_and_clear_async_rc(lli->lli_clob);
3783 /* The application has already been told about the write failure.
3784 * Do not report it again. */
3785 if (fd->fd_write_failed)
3787 return rc ? -EIO : 0;
3791 * Called to make sure a portion of the file has been written out.
3792 * If @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to the OSTs.
3794 * Return how many pages have been written.
3796 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3797 enum cl_fsync_mode mode, int ignore_layout)
3801 struct cl_fsync_io *fio;
3806 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3807 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3810 env = cl_env_get(&refcheck);
3812 RETURN(PTR_ERR(env));
3814 io = vvp_env_thread_io(env);
3815 io->ci_obj = ll_i2info(inode)->lli_clob;
3816 io->ci_ignore_layout = ignore_layout;
3818 /* initialize parameters for sync */
3819 fio = &io->u.ci_fsync;
3820 fio->fi_start = start;
3822 fio->fi_fid = ll_inode2fid(inode);
3823 fio->fi_mode = mode;
3824 fio->fi_nr_written = 0;
3826 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3827 result = cl_io_loop(env, io);
3829 result = io->ci_result;
3831 result = fio->fi_nr_written;
3832 cl_io_fini(env, io);
3833 cl_env_put(env, &refcheck);
3839 * When dentry is provided (the 'else' case), file_dentry() may be
3840 * null and dentry must be used directly rather than pulled from
3841 * file_dentry() as is done otherwise.
3844 #ifdef HAVE_FILE_FSYNC_4ARGS
3845 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3847 struct dentry *dentry = file_dentry(file);
3848 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3849 int ll_fsync(struct file *file, int datasync)
3851 struct dentry *dentry = file_dentry(file);
3853 loff_t end = LLONG_MAX;
3855 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3858 loff_t end = LLONG_MAX;
3860 struct inode *inode = dentry->d_inode;
3861 struct ll_inode_info *lli = ll_i2info(inode);
3862 struct ptlrpc_request *req;
3866 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3867 PFID(ll_inode2fid(inode)), inode);
3868 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3870 #ifdef HAVE_FILE_FSYNC_4ARGS
3871 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3874 /* fsync's caller has already called _fdata{sync,write}, we want
3875 * that IO to finish before calling the osc and mdc sync methods */
3876 rc = filemap_fdatawait(inode->i_mapping);
3879 /* catch async errors that were recorded back when async writeback
3880 * failed for pages in this mapping. */
3881 if (!S_ISDIR(inode->i_mode)) {
3882 err = lli->lli_async_rc;
3883 lli->lli_async_rc = 0;
3886 if (lli->lli_clob != NULL) {
3887 err = lov_read_and_clear_async_rc(lli->lli_clob);
3893 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3897 ptlrpc_req_finished(req);
3899 if (S_ISREG(inode->i_mode)) {
3900 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3902 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3903 if (rc == 0 && err < 0)
3906 fd->fd_write_failed = true;
3908 fd->fd_write_failed = false;
3911 #ifdef HAVE_FILE_FSYNC_4ARGS
3912 inode_unlock(inode);
3918 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3920 struct inode *inode = file_inode(file);
3921 struct ll_sb_info *sbi = ll_i2sbi(inode);
3922 struct ldlm_enqueue_info einfo = {
3923 .ei_type = LDLM_FLOCK,
3924 .ei_cb_cp = ldlm_flock_completion_ast,
3925 .ei_cbdata = file_lock,
3927 struct md_op_data *op_data;
3928 struct lustre_handle lockh = { 0 };
3929 union ldlm_policy_data flock = { { 0 } };
3930 int fl_type = file_lock->fl_type;
3936 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3937 PFID(ll_inode2fid(inode)), file_lock);
3939 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3941 if (file_lock->fl_flags & FL_FLOCK) {
3942 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3943 /* flocks are whole-file locks */
3944 flock.l_flock.end = OFFSET_MAX;
3945 /* For flocks the owner is determined by the local file descriptor */
3946 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3947 } else if (file_lock->fl_flags & FL_POSIX) {
3948 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3949 flock.l_flock.start = file_lock->fl_start;
3950 flock.l_flock.end = file_lock->fl_end;
3954 flock.l_flock.pid = file_lock->fl_pid;
3956 /* Somewhat ugly workaround for svc lockd.
3957 * lockd installs a custom fl_lmops->lm_compare_owner that checks
3958 * whether the fl_owner is the same (which it always is on the local
3959 * node, I guess, between lockd processes) and then compares the pid.
3960 * As such we assign the pid to the owner field to make it all work;
3961 * a conflict with normal locks is unlikely since the pid space and
3962 * the pointer space for current->files do not intersect */
3963 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3964 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3968 einfo.ei_mode = LCK_PR;
3971 /* An unlock request may or may not have any relation to
3972 * existing locks so we may not be able to pass a lock handle
3973 * via a normal ldlm_lock_cancel() request. The request may even
3974 * unlock a byte range in the middle of an existing lock. In
3975 * order to process an unlock request we need all of the same
3976 * information that is given with a normal read or write record
3977 * lock request. To avoid creating another ldlm unlock (cancel)
3978 * message we'll treat a LCK_NL flock request as an unlock. */
3979 einfo.ei_mode = LCK_NL;
3982 einfo.ei_mode = LCK_PW;
3985 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4000 flags = LDLM_FL_BLOCK_NOWAIT;
4006 flags = LDLM_FL_TEST_LOCK;
4009 CERROR("unknown fcntl lock command: %d\n", cmd);
4013 /* Save the old mode so that if the mode in the lock changes we
4014 * can decrement the appropriate reader or writer refcount. */
4015 file_lock->fl_type = einfo.ei_mode;
4017 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4018 LUSTRE_OPC_ANY, NULL);
4019 if (IS_ERR(op_data))
4020 RETURN(PTR_ERR(op_data));
4022 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4023 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4024 flock.l_flock.pid, flags, einfo.ei_mode,
4025 flock.l_flock.start, flock.l_flock.end);
4027 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4030 /* Restore the file lock type if not TEST lock. */
4031 if (!(flags & LDLM_FL_TEST_LOCK))
4032 file_lock->fl_type = fl_type;
4034 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4035 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4036 !(flags & LDLM_FL_TEST_LOCK))
4037 rc2 = locks_lock_file_wait(file, file_lock);
4039 if ((file_lock->fl_flags & FL_FLOCK) &&
4040 (rc == 0 || file_lock->fl_type == F_UNLCK))
4041 rc2 = flock_lock_file_wait(file, file_lock);
4042 if ((file_lock->fl_flags & FL_POSIX) &&
4043 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4044 !(flags & LDLM_FL_TEST_LOCK))
4045 rc2 = posix_lock_file_wait(file, file_lock);
4046 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
4048 if (rc2 && file_lock->fl_type != F_UNLCK) {
4049 einfo.ei_mode = LCK_NL;
4050 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4055 ll_finish_md_op_data(op_data);
4060 int ll_get_fid_by_name(struct inode *parent, const char *name,
4061 int namelen, struct lu_fid *fid,
4062 struct inode **inode)
4064 struct md_op_data *op_data = NULL;
4065 struct mdt_body *body;
4066 struct ptlrpc_request *req;
4070 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4071 LUSTRE_OPC_ANY, NULL);
4072 if (IS_ERR(op_data))
4073 RETURN(PTR_ERR(op_data));
4075 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4076 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4077 ll_finish_md_op_data(op_data);
4081 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4083 GOTO(out_req, rc = -EFAULT);
4085 *fid = body->mbo_fid1;
4088 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4090 ptlrpc_req_finished(req);
4094 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4097 struct dentry *dchild = NULL;
4098 struct inode *child_inode = NULL;
4099 struct md_op_data *op_data;
4100 struct ptlrpc_request *request = NULL;
4101 struct obd_client_handle *och = NULL;
4103 struct mdt_body *body;
4104 __u64 data_version = 0;
4105 size_t namelen = strlen(name);
4106 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4110 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4111 PFID(ll_inode2fid(parent)), name,
4112 lum->lum_stripe_offset, lum->lum_stripe_count);
4114 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4115 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4116 lustre_swab_lmv_user_md(lum);
4118 /* Get child FID first */
4119 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4122 dchild = d_lookup(file_dentry(file), &qstr);
4124 if (dchild->d_inode)
4125 child_inode = igrab(dchild->d_inode);
4130 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
4139 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4140 OBD_CONNECT2_DIR_MIGRATE)) {
4141 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4142 ll_i2info(child_inode)->lli_lsm_md) {
4143 CERROR("%s: MDT doesn't support stripe directory "
4145 ll_get_fsname(parent->i_sb, NULL, 0));
4146 GOTO(out_iput, rc = -EOPNOTSUPP);
4151 * lfs migrate command needs to be blocked on the client
4152 * by checking the migrate FID against the FID of the
4155 if (child_inode == parent->i_sb->s_root->d_inode)
4156 GOTO(out_iput, rc = -EINVAL);
4158 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4159 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4160 if (IS_ERR(op_data))
4161 GOTO(out_iput, rc = PTR_ERR(op_data));
4163 inode_lock(child_inode);
4164 op_data->op_fid3 = *ll_inode2fid(child_inode);
4165 if (!fid_is_sane(&op_data->op_fid3)) {
4166 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4167 ll_get_fsname(parent->i_sb, NULL, 0), name,
4168 PFID(&op_data->op_fid3));
4169 GOTO(out_unlock, rc = -EINVAL);
4172 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4173 op_data->op_data = lum;
4174 op_data->op_data_size = lumlen;
4177 if (S_ISREG(child_inode->i_mode)) {
4178 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4182 GOTO(out_unlock, rc);
4185 rc = ll_data_version(child_inode, &data_version,
4188 GOTO(out_close, rc);
4190 op_data->op_open_handle = och->och_open_handle;
4191 op_data->op_data_version = data_version;
4192 op_data->op_lease_handle = och->och_lease_handle;
4193 op_data->op_bias |= MDS_CLOSE_MIGRATE;
4195 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4196 och->och_mod->mod_open_req->rq_replay = 0;
4197 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
4200 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4201 name, namelen, &request);
4203 LASSERT(request != NULL);
4204 ll_update_times(request, parent);
4206 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4207 LASSERT(body != NULL);
4209 /* If the server does release the layout lock, then we clean up
4210 * the client och here; otherwise release it in out_close: */
4211 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4212 obd_mod_put(och->och_mod);
4213 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4215 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4221 if (request != NULL) {
4222 ptlrpc_req_finished(request);
4226 /* Try again if the file layout has changed. */
4227 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4232 ll_lease_close(och, child_inode, NULL);
4234 clear_nlink(child_inode);
4236 inode_unlock(child_inode);
4237 ll_finish_md_op_data(op_data);
4244 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4252 * Test if some locks matching bits and l_req_mode are acquired
4253 * - bits can be in different locks
4254 * - if found, clear the common lock bits in *bits
4255 * - the bits not found are kept in *bits
4257 * \param bits [IN] searched lock bits
4258 * \param l_req_mode [IN] searched lock mode
4259 * \retval boolean, true iff all bits are found
4261 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4263 struct lustre_handle lockh;
4264 union ldlm_policy_data policy;
4265 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4266 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4275 fid = &ll_i2info(inode)->lli_fid;
4276 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4277 ldlm_lockname[mode]);
4279 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4280 for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) {
4281 policy.l_inodebits.bits = *bits & (1 << i);
4282 if (policy.l_inodebits.bits == 0)
4285 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4286 &policy, mode, &lockh)) {
4287 struct ldlm_lock *lock;
4289 lock = ldlm_handle2lock(&lockh);
4292 ~(lock->l_policy_data.l_inodebits.bits);
4293 LDLM_LOCK_PUT(lock);
4295 *bits &= ~policy.l_inodebits.bits;
4302 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4303 struct lustre_handle *lockh, __u64 flags,
4304 enum ldlm_mode mode)
4306 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4311 fid = &ll_i2info(inode)->lli_fid;
4312 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4314 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4315 fid, LDLM_IBITS, &policy, mode, lockh);
4320 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4322 /* Already unlinked. Just update nlink and return success */
4323 if (rc == -ENOENT) {
4325 /* If it is a striped directory and there is a bad stripe,
4326 * let's revalidate the dentry again instead of returning
4328 if (S_ISDIR(inode->i_mode) &&
4329 ll_i2info(inode)->lli_lsm_md != NULL)
4332 /* This path cannot be hit for regular files unless in
4333 * case of obscure races, so no need to validate
4335 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4337 } else if (rc != 0) {
4338 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4339 "%s: revalidate FID "DFID" error: rc = %d\n",
4340 ll_get_fsname(inode->i_sb, NULL, 0),
4341 PFID(ll_inode2fid(inode)), rc);
4347 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4349 struct inode *inode = dentry->d_inode;
4350 struct obd_export *exp = ll_i2mdexp(inode);
4351 struct lookup_intent oit = {
4354 struct ptlrpc_request *req = NULL;
4355 struct md_op_data *op_data;
4359 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4360 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4362 /* Call getattr by fid, so do not provide name at all. */
4363 op_data = ll_prep_md_op_data(NULL, dentry->d_parent->d_inode, inode,
4364 NULL, 0, 0, LUSTRE_OPC_ANY, NULL);
4365 if (IS_ERR(op_data))
4366 RETURN(PTR_ERR(op_data));
4368 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4369 ll_finish_md_op_data(op_data);
4371 rc = ll_inode_revalidate_fini(inode, rc);
4375 rc = ll_revalidate_it_finish(req, &oit, dentry);
4377 ll_intent_release(&oit);
4381 /* Unlinked? Unhash dentry, so it is not picked up later by
4382 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4383 * here to preserve get_cwd functionality on 2.6.
4385 if (!dentry->d_inode->i_nlink) {
4386 ll_lock_dcache(inode);
4387 d_lustre_invalidate(dentry, 0);
4388 ll_unlock_dcache(inode);
4391 ll_lookup_finish_locks(&oit, dentry);
4393 ptlrpc_req_finished(req);
4398 static int ll_merge_md_attr(struct inode *inode)
4400 struct ll_inode_info *lli = ll_i2info(inode);
4401 struct cl_attr attr = { 0 };
4404 LASSERT(lli->lli_lsm_md != NULL);
4405 down_read(&lli->lli_lsm_sem);
4406 rc = md_merge_attr(ll_i2mdexp(inode), &lli->lli_fid, lli->lli_lsm_md,
4407 &attr, ll_md_blocking_ast);
4408 up_read(&lli->lli_lsm_sem);
4412 set_nlink(inode, attr.cat_nlink);
4413 inode->i_blocks = attr.cat_blocks;
4414 i_size_write(inode, attr.cat_size);
4416 ll_i2info(inode)->lli_atime = attr.cat_atime;
4417 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4418 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
4423 static inline dev_t ll_compat_encode_dev(dev_t dev)
4425 /* The compat_sys_*stat*() syscalls will fail unless the
4426 * device majors and minors are both less than 256. Note that
4427 * the value returned here will be passed through
4428 * old_encode_dev() in cp_compat_stat(). And so we are not
4429 * trying to return a valid compat (u16) device number, just
4430 * one that will pass the old_valid_dev() check. */
4432 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
4435 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4436 int ll_getattr(const struct path *path, struct kstat *stat,
4437 u32 request_mask, unsigned int flags)
4439 struct dentry *de = path->dentry;
4441 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4444 struct inode *inode = de->d_inode;
4445 struct ll_sb_info *sbi = ll_i2sbi(inode);
4446 struct ll_inode_info *lli = ll_i2info(inode);
4449 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4451 rc = ll_inode_revalidate(de, IT_GETATTR);
4455 if (S_ISREG(inode->i_mode)) {
4456 /* In case of restore, the MDT has the right size and has
4457 * already sent it back without granting the layout lock;
4458 * the inode is up-to-date so a glimpse is useless.
4459 * Also, to glimpse we need the layout; in case of a running
4460 * restore the MDT holds the layout lock, so the glimpse will
4461 * block up to the end of restore (getattr will block)
4463 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4464 rc = ll_glimpse_size(inode);
4469 /* If the object isn't a regular file, then don't validate size. */
4470 if (S_ISDIR(inode->i_mode) &&
4471 lli->lli_lsm_md != NULL) {
4472 rc = ll_merge_md_attr(inode);
4477 inode->i_atime.tv_sec = lli->lli_atime;
4478 inode->i_mtime.tv_sec = lli->lli_mtime;
4479 inode->i_ctime.tv_sec = lli->lli_ctime;
4482 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4484 if (ll_need_32bit_api(sbi)) {
4485 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4486 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4487 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4489 stat->ino = inode->i_ino;
4490 stat->dev = inode->i_sb->s_dev;
4491 stat->rdev = inode->i_rdev;
4494 stat->mode = inode->i_mode;
4495 stat->uid = inode->i_uid;
4496 stat->gid = inode->i_gid;
4497 stat->atime = inode->i_atime;
4498 stat->mtime = inode->i_mtime;
4499 stat->ctime = inode->i_ctime;
4500 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4502 stat->nlink = inode->i_nlink;
4503 stat->size = i_size_read(inode);
4504 stat->blocks = inode->i_blocks;
4509 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4510 __u64 start, __u64 len)
4514 struct fiemap *fiemap;
4515 unsigned int extent_count = fieinfo->fi_extents_max;
4517 num_bytes = sizeof(*fiemap) + (extent_count *
4518 sizeof(struct fiemap_extent));
4519 OBD_ALLOC_LARGE(fiemap, num_bytes);
4524 fiemap->fm_flags = fieinfo->fi_flags;
4525 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4526 fiemap->fm_start = start;
4527 fiemap->fm_length = len;
4528 if (extent_count > 0 &&
4529 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4530 sizeof(struct fiemap_extent)) != 0)
4531 GOTO(out, rc = -EFAULT);
4533 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4535 fieinfo->fi_flags = fiemap->fm_flags;
4536 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4537 if (extent_count > 0 &&
4538 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4539 fiemap->fm_mapped_extents *
4540 sizeof(struct fiemap_extent)) != 0)
4541 GOTO(out, rc = -EFAULT);
4543 OBD_FREE_LARGE(fiemap, num_bytes);
4547 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4549 struct ll_inode_info *lli = ll_i2info(inode);
4550 struct posix_acl *acl = NULL;
4553 spin_lock(&lli->lli_lock);
4554 /* VFS' acl_permission_check->check_acl will release the refcount */
4555 acl = posix_acl_dup(lli->lli_posix_acl);
4556 spin_unlock(&lli->lli_lock);
4561 #ifdef HAVE_IOP_SET_ACL
4562 #ifdef CONFIG_FS_POSIX_ACL
4563 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4565 struct ll_sb_info *sbi = ll_i2sbi(inode);
4566 struct ptlrpc_request *req = NULL;
4567 const char *name = NULL;
4569 size_t value_size = 0;
4574 case ACL_TYPE_ACCESS:
4575 name = XATTR_NAME_POSIX_ACL_ACCESS;
4577 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4580 case ACL_TYPE_DEFAULT:
4581 name = XATTR_NAME_POSIX_ACL_DEFAULT;
4582 if (!S_ISDIR(inode->i_mode))
4583 rc = acl ? -EACCES : 0;
4594 value_size = posix_acl_xattr_size(acl->a_count);
4595 value = kmalloc(value_size, GFP_NOFS);
4597 GOTO(out, rc = -ENOMEM);
4599 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4601 GOTO(out_value, rc);
4604 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4605 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4606 name, value, value_size, 0, 0, &req);
4608 ptlrpc_req_finished(req);
4613 forget_cached_acl(inode, type);
4615 set_cached_acl(inode, type, acl);
4618 #endif /* CONFIG_FS_POSIX_ACL */
4619 #endif /* HAVE_IOP_SET_ACL */
4621 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4623 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4624 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4626 ll_check_acl(struct inode *inode, int mask)
4629 # ifdef CONFIG_FS_POSIX_ACL
4630 struct posix_acl *acl;
4634 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4635 if (flags & IPERM_FLAG_RCU)
4638 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4643 rc = posix_acl_permission(inode, acl, mask);
4644 posix_acl_release(acl);
4647 # else /* !CONFIG_FS_POSIX_ACL */
4649 # endif /* CONFIG_FS_POSIX_ACL */
4651 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
4653 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4654 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4656 # ifdef HAVE_INODE_PERMISION_2ARGS
4657 int ll_inode_permission(struct inode *inode, int mask)
4659 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4664 struct ll_sb_info *sbi;
4665 struct root_squash_info *squash;
4666 struct cred *cred = NULL;
4667 const struct cred *old_cred = NULL;
4669 bool squash_id = false;
4672 #ifdef MAY_NOT_BLOCK
4673 if (mask & MAY_NOT_BLOCK)
4675 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4676 if (flags & IPERM_FLAG_RCU)
4680 /* as the root inode is NOT validated in the lookup operation,
4681 * we need to do it before the permission check. */
4683 if (inode == inode->i_sb->s_root->d_inode) {
4684 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4689 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4690 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4692 /* squash fsuid/fsgid if needed */
4693 sbi = ll_i2sbi(inode);
4694 squash = &sbi->ll_squash;
4695 if (unlikely(squash->rsi_uid != 0 &&
4696 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4697 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4701 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4702 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4703 squash->rsi_uid, squash->rsi_gid);
4705 /* update current process's credentials
4706 * and FS capability */
4707 cred = prepare_creds();
4711 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4712 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
4713 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4714 if ((1 << cap) & CFS_CAP_FS_MASK)
4715 cap_lower(cred->cap_effective, cap);
4717 old_cred = override_creds(cred);
4720 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4721 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4722 /* restore current process's credentials and FS capability */
4724 revert_creds(old_cred);
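
/*
 * Illustrative sketch of the credential override pattern used above for
 * root squash: prepare_creds(), override_creds() and revert_creds() are
 * the standard kernel APIs.  The function name and the fixed 65534
 * ("nobody") ids are hypothetical; the capability-lowering loop from
 * ll_inode_permission() is omitted for brevity.
 */
static inline int ll_example_squash_creds(void)
{
	struct cred *cred;
	const struct cred *old_cred;

	cred = prepare_creds();
	if (cred == NULL)
		return -ENOMEM;

	/* drop to unprivileged fsuid/fsgid for the duration of the check */
	cred->fsuid = make_kuid(&init_user_ns, 65534);
	cred->fsgid = make_kgid(&init_user_ns, 65534);
	old_cred = override_creds(cred);

	/* ... perform the permission check with squashed credentials ... */

	revert_creds(old_cred);
	put_cred(cred);

	return 0;
}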
4731 /* -o localflock - only provides locally consistent flock locks */
4732 struct file_operations ll_file_operations = {
4733 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4734 # ifdef HAVE_SYNC_READ_WRITE
4735 .read = new_sync_read,
4736 .write = new_sync_write,
4738 .read_iter = ll_file_read_iter,
4739 .write_iter = ll_file_write_iter,
4740 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4741 .read = ll_file_read,
4742 .aio_read = ll_file_aio_read,
4743 .write = ll_file_write,
4744 .aio_write = ll_file_aio_write,
4745 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4746 .unlocked_ioctl = ll_file_ioctl,
4747 .open = ll_file_open,
4748 .release = ll_file_release,
4749 .mmap = ll_file_mmap,
4750 .llseek = ll_file_seek,
4751 .splice_read = ll_file_splice_read,
4756 struct file_operations ll_file_operations_flock = {
4757 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4758 # ifdef HAVE_SYNC_READ_WRITE
4759 .read = new_sync_read,
4760 .write = new_sync_write,
4761 # endif /* HAVE_SYNC_READ_WRITE */
4762 .read_iter = ll_file_read_iter,
4763 .write_iter = ll_file_write_iter,
4764 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4765 .read = ll_file_read,
4766 .aio_read = ll_file_aio_read,
4767 .write = ll_file_write,
4768 .aio_write = ll_file_aio_write,
4769 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4770 .unlocked_ioctl = ll_file_ioctl,
4771 .open = ll_file_open,
4772 .release = ll_file_release,
4773 .mmap = ll_file_mmap,
4774 .llseek = ll_file_seek,
4775 .splice_read = ll_file_splice_read,
4778 .flock = ll_file_flock,
4779 .lock = ll_file_flock
4782 /* These are for -o noflock - to return ENOSYS on flock calls */
4783 struct file_operations ll_file_operations_noflock = {
4784 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4785 # ifdef HAVE_SYNC_READ_WRITE
4786 .read = new_sync_read,
4787 .write = new_sync_write,
4788 # endif /* HAVE_SYNC_READ_WRITE */
4789 .read_iter = ll_file_read_iter,
4790 .write_iter = ll_file_write_iter,
4791 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4792 .read = ll_file_read,
4793 .aio_read = ll_file_aio_read,
4794 .write = ll_file_write,
4795 .aio_write = ll_file_aio_write,
4796 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4797 .unlocked_ioctl = ll_file_ioctl,
4798 .open = ll_file_open,
4799 .release = ll_file_release,
4800 .mmap = ll_file_mmap,
4801 .llseek = ll_file_seek,
4802 .splice_read = ll_file_splice_read,
4805 .flock = ll_file_noflock,
4806 .lock = ll_file_noflock
4809 struct inode_operations ll_file_inode_operations = {
4810 .setattr = ll_setattr,
4811 .getattr = ll_getattr,
4812 .permission = ll_inode_permission,
4813 #ifdef HAVE_IOP_XATTR
4814 .setxattr = ll_setxattr,
4815 .getxattr = ll_getxattr,
4816 .removexattr = ll_removexattr,
4818 .listxattr = ll_listxattr,
4819 .fiemap = ll_fiemap,
4820 #ifdef HAVE_IOP_GET_ACL
4821 .get_acl = ll_get_acl,
4823 #ifdef HAVE_IOP_SET_ACL
4824 .set_acl = ll_set_acl,
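
/*
 * Illustrative sketch of how the operation tables above are typically
 * wired to a regular file inode.  The LL_SBI_FLOCK/LL_SBI_LOCALFLOCK
 * flag names exist in llite, but the helper below and the exact selection
 * logic are simplified assumptions; the real assignment happens during
 * superblock and inode setup elsewhere in llite.
 */
static inline void ll_example_init_reg_inode_ops(struct inode *inode,
						 struct ll_sb_info *sbi)
{
	inode->i_op = &ll_file_inode_operations;

	if (sbi->ll_flags & LL_SBI_FLOCK)
		/* -o flock: cluster-coherent flock/posix locks */
		inode->i_fop = &ll_file_operations_flock;
	else if (sbi->ll_flags & LL_SBI_LOCALFLOCK)
		/* -o localflock: locally consistent locks only */
		inode->i_fop = &ll_file_operations;
	else
		/* -o noflock: flock calls return -ENOSYS */
		inode->i_fop = &ll_file_operations_noflock;
}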
4828 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4830 struct ll_inode_info *lli = ll_i2info(inode);
4831 struct cl_object *obj = lli->lli_clob;
4840 env = cl_env_get(&refcheck);
4842 RETURN(PTR_ERR(env));
4844 rc = cl_conf_set(env, lli->lli_clob, conf);
4848 if (conf->coc_opc == OBJECT_CONF_SET) {
4849 struct ldlm_lock *lock = conf->coc_lock;
4850 struct cl_layout cl = {
4854 LASSERT(lock != NULL);
4855 LASSERT(ldlm_has_layout(lock));
4857 /* The lock can only be allowed to match after the layout has been
4858 * applied to the inode; otherwise a stale layout could be
4859 * seen. Applying the layout should happen before dropping
4860 * the intent lock. */
4861 ldlm_lock_allow_match(lock);
4863 rc = cl_object_layout_get(env, obj, &cl);
4868 DFID": layout version change: %u -> %u\n",
4869 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4871 ll_layout_version_set(lli, cl.cl_layout_gen);
4875 cl_env_put(env, &refcheck);
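
/*
 * Illustrative sketch of the cl_env_get()/cl_env_put() bracket used by
 * ll_layout_conf() above, here wrapped around cl_object_layout_get() to
 * read the current layout generation.  The helper name is hypothetical
 * and the error handling is simplified.
 */
static inline int ll_example_read_layout_gen(struct inode *inode, __u32 *gen)
{
	struct cl_object *obj = ll_i2info(inode)->lli_clob;
	struct cl_layout cl = { 0 };
	struct lu_env *env;
	__u16 refcheck;
	int rc;

	/* borrow a cl environment for the duration of the call */
	env = cl_env_get(&refcheck);
	if (IS_ERR(env))
		return PTR_ERR(env);

	rc = cl_object_layout_get(env, obj, &cl);
	if (rc >= 0) {
		*gen = cl.cl_layout_gen;
		rc = 0;
	}

	cl_env_put(env, &refcheck);
	return rc;
}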
4880 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4881 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4884 struct ll_sb_info *sbi = ll_i2sbi(inode);
4885 struct ptlrpc_request *req;
4892 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4893 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4894 lock->l_lvb_data, lock->l_lvb_len);
4896 if (lock->l_lvb_data != NULL)
4899 /* If the layout lock was granted right away, the layout is returned
4900 * within the DLM_LVB of the DLM reply; otherwise, if the lock was ever
4901 * blocked and then granted via a completion AST, we have to fetch the
4902 * layout here. Note that we can't use the LVB buffer in the completion
4903 * AST because that buffer is not large enough. */
4904 rc = ll_get_default_mdsize(sbi, &lmmsize);
4908 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
4909 XATTR_NAME_LOV, lmmsize, &req);
4912 GOTO(out, rc = 0); /* empty layout */
4919 if (lmmsize == 0) /* empty layout */
4922 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4924 GOTO(out, rc = -EFAULT);
4926 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4927 if (lvbdata == NULL)
4928 GOTO(out, rc = -ENOMEM);
4930 memcpy(lvbdata, lmm, lmmsize);
4931 lock_res_and_lock(lock);
4932 if (unlikely(lock->l_lvb_data == NULL)) {
4933 lock->l_lvb_type = LVB_T_LAYOUT;
4934 lock->l_lvb_data = lvbdata;
4935 lock->l_lvb_len = lmmsize;
4938 unlock_res_and_lock(lock);
4941 OBD_FREE_LARGE(lvbdata, lmmsize);
4946 ptlrpc_req_finished(req);
4951 * Apply the layout to the inode. Layout lock is held and will be released
4954 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4955 struct inode *inode)
4957 struct ll_inode_info *lli = ll_i2info(inode);
4958 struct ll_sb_info *sbi = ll_i2sbi(inode);
4959 struct ldlm_lock *lock;
4960 struct cl_object_conf conf;
4963 bool wait_layout = false;
4966 LASSERT(lustre_handle_is_used(lockh));
4968 lock = ldlm_handle2lock(lockh);
4969 LASSERT(lock != NULL);
4970 LASSERT(ldlm_has_layout(lock));
4972 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4973 PFID(&lli->lli_fid), inode);
4975 /* in case this is a cached lock, reinstate it with the new inode */
4976 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4978 lock_res_and_lock(lock);
4979 lvb_ready = ldlm_is_lvb_ready(lock);
4980 unlock_res_and_lock(lock);
4982 /* checking lvb_ready is racy, but this is okay. The worst case is
4983 * that multiple processes may configure the file at the same time. */
4987 rc = ll_layout_fetch(inode, lock);
4991 /* For a layout lock, the lmm is stored in the lock's LVB.
4992 * lvb_data is immutable while the lock is held, so it is safe to access it
4995 * Set the layout on the file. It is unlikely that this will fail, as the
4996 * old layout has surely been eliminated. */
4997 memset(&conf, 0, sizeof conf);
4998 conf.coc_opc = OBJECT_CONF_SET;
4999 conf.coc_inode = inode;
5000 conf.coc_lock = lock;
5001 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5002 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5003 rc = ll_layout_conf(inode, &conf);
5005 /* refresh layout failed, need to wait */
5006 wait_layout = rc == -EBUSY;
5009 LDLM_LOCK_PUT(lock);
5010 ldlm_lock_decref(lockh, mode);
5012 /* wait for IO to complete if the old layout is still in use. */
5014 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5015 ll_get_fsname(inode->i_sb, NULL, 0),
5016 PFID(&lli->lli_fid), inode);
5018 memset(&conf, 0, sizeof conf);
5019 conf.coc_opc = OBJECT_CONF_WAIT;
5020 conf.coc_inode = inode;
5021 rc = ll_layout_conf(inode, &conf);
5025 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5026 ll_get_fsname(inode->i_sb, NULL, 0),
5027 PFID(&lli->lli_fid), rc);
5033 * Issue layout intent RPC to MDS.
5034 * \param inode [in] file inode
5035 * \param intent [in] layout intent
5037 * \retval 0 on success
5038 * \retval < 0 error code
5040 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5042 struct ll_inode_info *lli = ll_i2info(inode);
5043 struct ll_sb_info *sbi = ll_i2sbi(inode);
5044 struct md_op_data *op_data;
5045 struct lookup_intent it;
5046 struct ptlrpc_request *req;
5050 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5051 0, 0, LUSTRE_OPC_ANY, NULL);
5052 if (IS_ERR(op_data))
5053 RETURN(PTR_ERR(op_data));
5055 op_data->op_data = intent;
5056 op_data->op_data_size = sizeof(*intent);
5058 memset(&it, 0, sizeof(it));
5059 it.it_op = IT_LAYOUT;
5060 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5061 intent->li_opc == LAYOUT_INTENT_TRUNC)
5062 it.it_flags = FMODE_WRITE;
5064 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5065 ll_get_fsname(inode->i_sb, NULL, 0),
5066 PFID(&lli->lli_fid), inode);
5068 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5069 &ll_md_blocking_ast, 0);
5070 if (it.it_request != NULL)
5071 ptlrpc_req_finished(it.it_request);
5072 it.it_request = NULL;
5074 ll_finish_md_op_data(op_data);
5076 /* set lock data in case this is a new lock */
5078 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5080 ll_intent_drop_lock(&it);
5086 * This function checks whether a LAYOUT lock exists on the client side,
5087 * and enqueues one if none is cached.
5089 * This function does not hold the layout lock, so the lock may be revoked
5090 * any time after it returns; operations that depend on the layout should be redone
5093 * This function should be called before lov_io_init() to get an up-to-date
5094 * layout version; the caller should save the version number, and after the IO
5095 * has finished, call this function again to verify that the layout was
5096 * not changed during the IO.
5098 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5100 struct ll_inode_info *lli = ll_i2info(inode);
5101 struct ll_sb_info *sbi = ll_i2sbi(inode);
5102 struct lustre_handle lockh;
5103 struct layout_intent intent = {
5104 .li_opc = LAYOUT_INTENT_ACCESS,
5106 enum ldlm_mode mode;
5110 *gen = ll_layout_version_get(lli);
5111 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5115 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5116 LASSERT(S_ISREG(inode->i_mode));
5118 /* take layout lock mutex to enqueue layout lock exclusively. */
5119 mutex_lock(&lli->lli_layout_mutex);
5122 /* The layout lock is usually cached on the local side, so try to
5123 * match it before grabbing the layout lock mutex. */
5124 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5125 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5126 if (mode != 0) { /* hit cached lock */
5127 rc = ll_layout_lock_set(&lockh, mode, inode);
5133 rc = ll_layout_intent(inode, &intent);
5139 *gen = ll_layout_version_get(lli);
5140 mutex_unlock(&lli->lli_layout_mutex);
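
/*
 * Illustrative sketch of the usage protocol described above for
 * ll_layout_refresh(): fetch the layout generation before starting IO and
 * verify it again afterwards.  The function name and the -ESTALE return
 * for a changed layout are hypothetical placeholders.
 */
static inline int ll_example_io_with_layout_check(struct inode *inode)
{
	__u32 gen_before, gen_after;
	int rc;

	rc = ll_layout_refresh(inode, &gen_before);
	if (rc != 0)
		return rc;

	/* ... submit the IO against the layout version gen_before ... */

	rc = ll_layout_refresh(inode, &gen_after);
	if (rc != 0)
		return rc;

	/* if the layout changed while the IO ran, the caller must redo it */
	return gen_after == gen_before ? 0 : -ESTALE;
}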
5146 * Issue layout intent RPC indicating where in a file an IO is about to write.
5148 * \param[in] inode file inode.
5149 * \param[in] ext write range with start offset of file in bytes where
5150 * an IO is about to write, and exclusive end offset in
5153 * \retval 0 on success
5154 * \retval < 0 error code
5156 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5157 struct lu_extent *ext)
5159 struct layout_intent intent = {
5161 .li_extent.e_start = ext->e_start,
5162 .li_extent.e_end = ext->e_end,
5167 rc = ll_layout_intent(inode, &intent);
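
/*
 * Illustrative sketch of asking the MDT to instantiate the layout for a
 * byte range before writing it, using ll_layout_write_intent() above.
 * The helper name and the way the range is derived are hypothetical.
 */
static inline int ll_example_prepare_write_range(struct inode *inode,
						 loff_t start, loff_t count)
{
	struct lu_extent ext = {
		.e_start = start,
		.e_end   = start + count,	/* exclusive end offset */
	};

	return ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE, &ext);
}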
5173 * This function sends a restore request to the MDT
5175 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5177 struct hsm_user_request *hur;
5181 len = sizeof(struct hsm_user_request) +
5182 sizeof(struct hsm_user_item);
5183 OBD_ALLOC(hur, len);
5187 hur->hur_request.hr_action = HUA_RESTORE;
5188 hur->hur_request.hr_archive_id = 0;
5189 hur->hur_request.hr_flags = 0;
5190 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5191 sizeof(hur->hur_user_item[0].hui_fid));
5192 hur->hur_user_item[0].hui_extent.offset = offset;
5193 hur->hur_user_item[0].hui_extent.length = length;
5194 hur->hur_request.hr_itemcount = 1;
5195 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,