4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
/* NOTE(review): fragmentary region in this extraction -- "sp_inode" appears to
 * be a member of a split-parameter struct (it is dereferenced as
 * sp->sp_inode in ll_close_inode_openhandle() below) whose full declaration
 * is not visible here; confirm against the complete file. */
57 struct inode *sp_inode;
/* Forward declarations for helpers referenced before their definitions. */
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate a per-open ll_file_data from the ll_file_data_slab cache.
 * GFP_NOFS avoids re-entering the filesystem during memory reclaim.
 * NOTE(review): the NULL-check and return of @fd fall on lines elided from
 * this extraction. */
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
/* start each descriptor with a clean write-failure state */
75 fd->fd_write_failed = false;
/* Return an ll_file_data obtained from ll_file_data_get() to its slab. */
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
/* Copies the client-side inode attributes (mode, times, size, blocks, flags)
 * and the open handle into @op_data so the MDT sees the final state at close. */
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
105 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
106 op_data->op_attr_blocks = inode->i_blocks;
107 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
/* propagate the project-inherit flag to the server-visible flag set */
108 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
109 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
110 op_data->op_open_handle = och->och_open_handle;
112 if (och->och_flags & FMODE_WRITE &&
113 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
114 /* For HSM: if inode data has been modified, pack it so that
115 * MDT can set data dirty flag in the archive. */
116 op_data->op_bias |= MDS_DATA_MODIFIED;
122 * Perform a close, possibly with a bias.
123 * The meaning of "data" depends on the value of "bias".
125 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
126 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
/* Sends the MDS_CLOSE RPC for @och, packing extra intent data per @bias
 * (HSM release, layout merge/split/swap, resync-done).  Frees @och state on
 * the way out.  NOTE(review): several lines (braces, switch header, error
 * paths, RETURN) are elided from this extraction. */
129 static int ll_close_inode_openhandle(struct inode *inode,
130 struct obd_client_handle *och,
131 enum mds_op_bias bias, void *data)
133 struct obd_export *md_exp = ll_i2mdexp(inode);
134 const struct ll_inode_info *lli = ll_i2info(inode);
135 struct md_op_data *op_data;
136 struct ptlrpc_request *req = NULL;
/* bail out if the MDC export was already disconnected */
140 if (class_exp2obd(md_exp) == NULL) {
141 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
142 ll_get_fsname(inode->i_sb, NULL, 0),
143 PFID(&lli->lli_fid));
147 OBD_ALLOC_PTR(op_data);
148 /* We leak openhandle and request here on error, but not much to be
149 * done in OOM case since app won't retry close on error either. */
151 GOTO(out, rc = -ENOMEM);
153 ll_prepare_close(inode, op_data, och);
/* bias-specific packing follows; the switch header is elided here */
155 case MDS_CLOSE_LAYOUT_MERGE:
156 /* merge blocks from the victim inode */
157 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
158 op_data->op_attr.ia_valid |= ATTR_SIZE;
159 op_data->op_xvalid |= OP_XVALID_BLOCKS;
/* fallthrough -- MERGE shares the SPLIT/SWAP packing below */
160 case MDS_CLOSE_LAYOUT_SPLIT:
161 case MDS_CLOSE_LAYOUT_SWAP: {
162 struct split_param *sp = data;
164 LASSERT(data != NULL);
165 op_data->op_bias |= bias;
166 op_data->op_data_version = 0;
167 op_data->op_lease_handle = och->och_lease_handle;
/* SPLIT carries a split_param; SWAP/MERGE pass the victim inode directly */
168 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
169 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
170 op_data->op_mirror_id = sp->sp_mirror_id;
172 op_data->op_fid2 = *ll_inode2fid(data);
177 case MDS_CLOSE_RESYNC_DONE: {
178 struct ll_ioc_lease *ioc = data;
/* scale the block estimate by the number of resynced mirrors; the
 * lil_count bound check presumably sits on an elided line -- verify */
180 LASSERT(data != NULL);
181 op_data->op_attr_blocks +=
182 ioc->lil_count * op_data->op_attr_blocks;
183 op_data->op_attr.ia_valid |= ATTR_SIZE;
184 op_data->op_xvalid |= OP_XVALID_BLOCKS;
185 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
187 op_data->op_lease_handle = och->och_lease_handle;
188 op_data->op_data = &ioc->lil_ids[0];
189 op_data->op_data_size =
190 ioc->lil_count * sizeof(ioc->lil_ids[0]);
/* HSM release: @data is the data version captured before the release */
194 case MDS_HSM_RELEASE:
195 LASSERT(data != NULL);
196 op_data->op_bias |= MDS_HSM_RELEASE;
197 op_data->op_data_version = *(__u64 *)data;
198 op_data->op_lease_handle = och->och_lease_handle;
199 op_data->op_attr.ia_valid |= ATTR_SIZE;
200 op_data->op_xvalid |= OP_XVALID_BLOCKS;
/* default (plain close): no intent payload expected */
204 LASSERT(data == NULL);
/* if size/blocks were not pinned above, let the MDT treat them lazily */
208 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
209 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
210 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
211 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
213 rc = md_close(md_exp, op_data, och->och_mod, &req);
214 if (rc != 0 && rc != -EINTR)
215 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
216 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* on success with a bias, check the reply to see whether the MDT
 * actually executed the close intent */
218 if (rc == 0 && op_data->op_bias & bias) {
219 struct mdt_body *body;
221 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
222 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
226 ll_finish_md_op_data(op_data);
/* drop replay data and poison the handle so reuse is detectable */
230 md_clear_open_replay_data(md_exp, och);
231 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
234 ptlrpc_req_finished(req); /* This is close request */
/* Close the MDS open handle matching @fmode (write/exec/read) unless other
 * local users still hold it.  NOTE(review): the lines that detach *och_p
 * under the mutex and the final RETURN are elided from this extraction. */
238 int ll_md_real_close(struct inode *inode, fmode_t fmode)
240 struct ll_inode_info *lli = ll_i2info(inode);
241 struct obd_client_handle **och_p;
242 struct obd_client_handle *och;
/* pick the handle slot and use count matching the open mode */
247 if (fmode & FMODE_WRITE) {
248 och_p = &lli->lli_mds_write_och;
249 och_usecount = &lli->lli_open_fd_write_count;
250 } else if (fmode & FMODE_EXEC) {
251 och_p = &lli->lli_mds_exec_och;
252 och_usecount = &lli->lli_open_fd_exec_count;
254 LASSERT(fmode & FMODE_READ);
255 och_p = &lli->lli_mds_read_och;
256 och_usecount = &lli->lli_open_fd_read_count;
259 mutex_lock(&lli->lli_och_mutex);
260 if (*och_usecount > 0) {
261 /* There are still users of this handle, so skip
263 mutex_unlock(&lli->lli_och_mutex);
269 mutex_unlock(&lli->lli_och_mutex);
272 /* There might be a race and this handle may already
274 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-descriptor close: drop group lock and any leftover lease, decrement
 * the per-mode open count, and only talk to the MDS (ll_md_real_close) when
 * no cached OPEN DLM lock can stand in for the close. */
280 static int ll_md_close(struct inode *inode, struct file *file)
282 union ldlm_policy_data policy = {
283 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: probe only, do not take a reference on a matched lock */
285 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
286 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
287 struct ll_inode_info *lli = ll_i2info(inode);
288 struct lustre_handle lockh;
289 enum ldlm_mode lockmode;
293 /* clear group lock, if present */
294 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
295 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
297 if (fd->fd_lease_och != NULL) {
300 /* Usually the lease is not released when the
301 * application crashed, we need to release here. */
302 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
303 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
304 PFID(&lli->lli_fid), rc, lease_broken);
306 fd->fd_lease_och = NULL;
/* an fd_och here means this descriptor owned the MDS open handle
 * (taken over by a lease); close it directly */
309 if (fd->fd_och != NULL) {
310 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
315 /* Let's see if we have good enough OPEN lock on the file and if
316 we can skip talking to MDS */
317 mutex_lock(&lli->lli_och_mutex);
318 if (fd->fd_omode & FMODE_WRITE) {
320 LASSERT(lli->lli_open_fd_write_count);
321 lli->lli_open_fd_write_count--;
322 } else if (fd->fd_omode & FMODE_EXEC) {
324 LASSERT(lli->lli_open_fd_exec_count);
325 lli->lli_open_fd_exec_count--;
328 LASSERT(lli->lli_open_fd_read_count);
329 lli->lli_open_fd_read_count--;
331 mutex_unlock(&lli->lli_och_mutex);
/* no matching cached OPEN lock => must do the real MDS close now */
333 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
334 LDLM_IBITS, &policy, lockmode, &lockh))
335 rc = ll_md_real_close(inode, fd->fd_omode);
338 LUSTRE_FPRIVATE(file) = NULL;
339 ll_file_data_put(fd);
344 /* While this returns an error code, fput() the caller does not, so we need
345 * to make every effort to clean up all of our state here. Also, applications
346 * rarely check close errors and even if an error is returned they will not
347 * re-try the close call.
/* VFS ->release() hook: tally stats, tear down statahead authorization,
 * flush async error state for regular files, then do the MDS close. */
349 int ll_file_release(struct inode *inode, struct file *file)
351 struct ll_file_data *fd;
352 struct ll_sb_info *sbi = ll_i2sbi(inode);
353 struct ll_inode_info *lli = ll_i2info(inode);
357 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
358 PFID(ll_inode2fid(inode)), inode);
/* the fs root is counted separately below, not as a normal release */
360 if (inode->i_sb->s_root != file_dentry(file))
361 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
362 fd = LUSTRE_FPRIVATE(file);
365 /* The last ref on @file, maybe not the the owner pid of statahead,
366 * because parent and child process can share the same file handle. */
367 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
368 ll_deauthorize_statahead(inode, fd);
/* root dentry: nothing was opened on the MDS, just free local state */
370 if (inode->i_sb->s_root == file_dentry(file)) {
371 LUSTRE_FPRIVATE(file) = NULL;
372 ll_file_data_put(fd);
/* fold any async write errors recorded on the cl_object into lli */
376 if (!S_ISDIR(inode->i_mode)) {
377 if (lli->lli_clob != NULL)
378 lov_read_and_clear_async_rc(lli->lli_clob);
379 lli->lli_async_rc = 0;
382 rc = ll_md_close(inode, file);
384 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
385 libcfs_debug_dumplog();
/* read_cache_page() filler: copy inline (Data-on-MDT) reply data from the
 * niobuf_local in @data into @page, zero-padding any tail beyond lnb_len,
 * and mark the page up to date. */
390 static inline int ll_dom_readpage(void *data, struct page *page)
392 struct niobuf_local *lnb = data;
395 kaddr = ll_kmap_atomic(page, KM_USER0);
396 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
/* partial page: zero the remainder so stale data never leaks */
397 if (lnb->lnb_len < PAGE_SIZE)
398 memset(kaddr + lnb->lnb_len, 0,
399 PAGE_SIZE - lnb->lnb_len);
400 flush_dcache_page(page);
401 SetPageUptodate(page);
402 ll_kunmap_atomic(kaddr, KM_USER0);
/* If the open reply carried inline file data (Data-on-MDT) and the intent
 * granted a DOM lock, pre-populate the page cache from the reply buffer.
 * NOTE(review): loop header and several early-exit lines are elided from
 * this extraction. */
408 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
409 struct lookup_intent *it)
411 struct ll_inode_info *lli = ll_i2info(inode);
412 struct cl_object *obj = lli->lli_clob;
413 struct address_space *mapping = inode->i_mapping;
415 struct niobuf_remote *rnb;
417 struct lustre_handle lockh;
418 struct ldlm_lock *lock;
419 unsigned long index, start;
420 struct niobuf_local lnb;
421 bool dom_lock = false;
/* only cache the data if the granted lock actually covers DOM bits */
428 if (it->it_lock_mode != 0) {
429 lockh.cookie = it->it_lock_handle;
430 lock = ldlm_handle2lock(&lockh);
432 dom_lock = ldlm_has_dom(lock);
438 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
442 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
443 if (rnb == NULL || rnb->rnb_len == 0)
446 CDEBUG(D_INFO, "Get data buffer along with open, len %i, i_size %llu\n",
447 rnb->rnb_len, i_size_read(inode));
/* the payload immediately follows the niobuf_remote descriptor */
449 data = (char *)rnb + sizeof(*rnb);
451 lnb.lnb_file_offset = rnb->rnb_offset;
452 start = lnb.lnb_file_offset / PAGE_SIZE;
/* reply buffer is expected to start on a page boundary */
454 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
455 lnb.lnb_page_offset = 0;
/* per-page copy loop; the do { ... } opener is on an elided line */
457 lnb.lnb_data = data + (index << PAGE_SHIFT);
458 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
459 if (lnb.lnb_len > PAGE_SIZE)
460 lnb.lnb_len = PAGE_SIZE;
462 vmpage = read_cache_page(mapping, index + start,
463 ll_dom_readpage, &lnb);
464 if (IS_ERR(vmpage)) {
465 CWARN("%s: cannot fill page %lu for "DFID
466 " with data: rc = %li\n",
467 ll_get_fsname(inode->i_sb, NULL, 0),
468 index + start, PFID(lu_object_fid(&obj->co_lu)),
474 } while (rnb->rnb_len > (index << PAGE_SHIFT));
/* Enqueue an IT_OPEN intent lock on the MDS for @de, optionally packing a
 * layout (@lmm/@lmmsize).  Updates the inode from the reply and primes DOM
 * data via ll_dom_finish_open().  NOTE(review): labels and a few error-path
 * lines are elided from this extraction. */
478 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
479 struct lookup_intent *itp)
481 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
482 struct dentry *parent = de->d_parent;
483 const char *name = NULL;
485 struct md_op_data *op_data;
486 struct ptlrpc_request *req = NULL;
490 LASSERT(parent != NULL);
491 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
493 /* if server supports open-by-fid, or file name is invalid, don't pack
494 * name in open request */
495 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
496 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
497 name = de->d_name.name;
498 len = de->d_name.len;
501 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
502 name, len, 0, LUSTRE_OPC_ANY, NULL);
504 RETURN(PTR_ERR(op_data));
505 op_data->op_data = lmm;
506 op_data->op_data_size = lmmsize;
508 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
509 &ll_md_blocking_ast, 0);
510 ll_finish_md_op_data(op_data);
512 /* reason for keep own exit path - don`t flood log
513 * with messages with -ESTALE errors.
/* a granted open we don't need (e.g. error after open) must be released */
515 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
516 it_open_error(DISP_OPEN_OPEN, itp))
518 ll_release_openhandle(de, itp);
522 if (it_disposition(itp, DISP_LOOKUP_NEG))
523 GOTO(out, rc = -ENOENT);
525 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
526 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
527 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
531 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
/* with a granted lock, consume any inline DOM data and attach lock data */
533 if (!rc && itp->it_lock_mode) {
534 ll_dom_finish_open(de->d_inode, req, itp);
535 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
539 ptlrpc_req_finished(req);
540 ll_intent_drop_lock(itp);
542 /* We did open by fid, but by the time we got to the server,
543 * the object disappeared. If this is a create, we cannot really
544 * tell the userspace that the file it was trying to create
545 * does not exist. Instead let's return -ESTALE, and the VFS will
546 * retry the create with LOOKUP_REVAL that we are going to catch
547 * in ll_revalidate_dentry() and use lookup then.
549 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Populate @och from the MDT open reply carried by @it (open handle, fid,
 * lease lock cookie, flags) and register it for open replay. */
555 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
556 struct obd_client_handle *och)
558 struct mdt_body *body;
560 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
561 och->och_open_handle = body->mbo_open_handle;
562 och->och_fid = body->mbo_fid1;
/* the intent's lock handle doubles as the lease handle for this och */
563 och->och_lease_handle.cookie = it->it_lock_handle;
564 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
565 och->och_flags = it->it_flags;
567 return md_set_open_replay_data(md_exp, och, it);
/* Attach the client-local open state: optionally fill @och from the intent
 * reply, then install @fd as the file's private data and record the open
 * mode.  NOTE(review): the och==NULL branch structure is partially elided. */
570 static int ll_local_open(struct file *file, struct lookup_intent *it,
571 struct ll_file_data *fd, struct obd_client_handle *och)
573 struct inode *inode = file_inode(file);
576 LASSERT(!LUSTRE_FPRIVATE(file));
583 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
588 LUSTRE_FPRIVATE(file) = fd;
589 ll_readahead_init(inode, &fd->fd_ras);
590 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
592 /* ll_cl_context initialize */
593 rwlock_init(&fd->fd_lock);
594 INIT_LIST_HEAD(&fd->fd_lccs);
599 /* Open a file, and (for the very first open) create objects on the OSTs at
600 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
601 * creation or open until ll_lov_setstripe() ioctl is called.
603 * If we already have the stripe MD locally then we don't request it in
604 * md_open(), by passing a lmm_size = 0.
606 * It is up to the application to ensure no other processes open this file
607 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
608 * used. We might be able to avoid races of that sort by getting lli_open_sem
609 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
610 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
/* VFS ->open() hook.  Either reuses a cached MDS open handle for this mode
 * or performs a fresh IT_OPEN intent, then wires up the local file data.
 * NOTE(review): many brace/label/return lines are elided from this
 * extraction; the control flow below is a partial view. */
612 int ll_file_open(struct inode *inode, struct file *file)
614 struct ll_inode_info *lli = ll_i2info(inode);
615 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
616 .it_flags = file->f_flags };
617 struct obd_client_handle **och_p = NULL;
618 __u64 *och_usecount = NULL;
619 struct ll_file_data *fd;
623 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
624 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* an intent stashed by the lookup path (atomic_open) may already exist */
626 it = file->private_data; /* XXX: compat macro */
627 file->private_data = NULL; /* prevent ll_local_open assertion */
629 fd = ll_file_data_get();
631 GOTO(out_nofiledata, rc = -ENOMEM);
634 if (S_ISDIR(inode->i_mode))
635 ll_authorize_statahead(inode, fd);
/* opening the fs root needs no MDS open, just local bookkeeping */
637 if (inode->i_sb->s_root == file_dentry(file)) {
638 LUSTRE_FPRIVATE(file) = fd;
642 if (!it || !it->it_disposition) {
643 /* Convert f_flags into access mode. We cannot use file->f_mode,
644 * because everything but O_ACCMODE mask was stripped from
/* (oit.it_flags + 1) & O_ACCMODE maps O_RDONLY/O_WRONLY/O_RDWR onto
 * FMODE_READ/FMODE_WRITE bits -- the classic open-mode trick */
646 if ((oit.it_flags + 1) & O_ACCMODE)
648 if (file->f_flags & O_TRUNC)
649 oit.it_flags |= FMODE_WRITE;
651 /* kernel only call f_op->open in dentry_open. filp_open calls
652 * dentry_open after call to open_namei that checks permissions.
653 * Only nfsd_open call dentry_open directly without checking
654 * permissions and because of that this code below is safe.
656 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
657 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
659 /* We do not want O_EXCL here, presumably we opened the file
660 * already? XXX - NFS implications? */
661 oit.it_flags &= ~O_EXCL;
663 /* bug20584, if "it_flags" contains O_CREAT, the file will be
664 * created if necessary, then "IT_CREAT" should be set to keep
665 * consistent with it */
666 if (oit.it_flags & O_CREAT)
667 oit.it_op |= IT_CREAT;
673 /* Let's see if we have file open on MDS already. */
674 if (it->it_flags & FMODE_WRITE) {
675 och_p = &lli->lli_mds_write_och;
676 och_usecount = &lli->lli_open_fd_write_count;
677 } else if (it->it_flags & FMODE_EXEC) {
678 och_p = &lli->lli_mds_exec_och;
679 och_usecount = &lli->lli_open_fd_exec_count;
681 och_p = &lli->lli_mds_read_och;
682 och_usecount = &lli->lli_open_fd_read_count;
685 mutex_lock(&lli->lli_och_mutex);
686 if (*och_p) { /* Open handle is present */
687 if (it_disposition(it, DISP_OPEN_OPEN)) {
688 /* Well, there's extra open request that we do not need,
689 let's close it somehow. This will decref request. */
690 rc = it_open_error(DISP_OPEN_OPEN, it);
692 mutex_unlock(&lli->lli_och_mutex);
693 GOTO(out_openerr, rc);
696 ll_release_openhandle(file_dentry(file), it);
/* reuse the cached handle: local open only, no och to fill */
700 rc = ll_local_open(file, it, fd, NULL);
703 mutex_unlock(&lli->lli_och_mutex);
704 GOTO(out_openerr, rc);
/* no cached handle for this mode: go enqueue a new open intent */
707 LASSERT(*och_usecount == 0);
708 if (!it->it_disposition) {
709 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
710 /* We cannot just request lock handle now, new ELC code
711 means that one of other OPEN locks for this file
712 could be cancelled, and since blocking ast handler
713 would attempt to grab och_mutex as well, that would
714 result in a deadlock */
715 mutex_unlock(&lli->lli_och_mutex);
717 * Normally called under two situations:
719 * 2. A race/condition on MDS resulting in no open
720 * handle to be returned from LOOKUP|OPEN request,
721 * for example if the target entry was a symlink.
723 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
724 * marked by a bit set in ll_iget_for_nfs. Clear the
725 * bit so that it's not confusing later callers.
727 * NB; when ldd is NULL, it must have come via normal
728 * lookup path only, since ll_iget_for_nfs always calls
731 if (ldd && ldd->lld_nfs_dentry) {
732 ldd->lld_nfs_dentry = 0;
733 it->it_flags |= MDS_OPEN_LOCK;
737 * Always specify MDS_OPEN_BY_FID because we don't want
738 * to get file with different fid.
740 it->it_flags |= MDS_OPEN_BY_FID;
741 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
744 GOTO(out_openerr, rc);
748 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
750 GOTO(out_och_free, rc = -ENOMEM);
754 /* md_intent_lock() didn't get a request ref if there was an
755 * open error, so don't do cleanup on the request here
757 /* XXX (green): Should not we bail out on any error here, not
758 * just open error? */
759 rc = it_open_error(DISP_OPEN_OPEN, it);
761 GOTO(out_och_free, rc);
763 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
764 "inode %p: disposition %x, status %d\n", inode,
765 it_disposition(it, ~0), it->it_status);
767 rc = ll_local_open(file, it, fd, *och_p);
769 GOTO(out_och_free, rc);
771 mutex_unlock(&lli->lli_och_mutex);
774 /* Must do this outside lli_och_mutex lock to prevent deadlock where
775 different kind of OPEN lock for this same inode gets cancelled
776 by ldlm_cancel_lru */
777 if (!S_ISREG(inode->i_mode))
778 GOTO(out_och_free, rc);
780 cl_lov_delay_create_clear(&file->f_flags);
781 GOTO(out_och_free, rc);
/* error unwind (labels elided): free the never-used och slot */
785 if (och_p && *och_p) {
786 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
787 *och_p = NULL; /* OBD_FREE writes some magic there */
790 mutex_unlock(&lli->lli_och_mutex);
793 if (lli->lli_opendir_key == fd)
794 ll_deauthorize_statahead(inode, fd);
796 ll_file_data_put(fd);
798 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* drop the intent's extra request reference if still held */
802 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
803 ptlrpc_req_finished(it->it_request);
804 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on a blocking callback, cancel the lease
 * lock asynchronously (the lease is thereby considered broken); the
 * CANCELING branch body is elided from this extraction. */
810 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
811 struct ldlm_lock_desc *desc, void *data, int flag)
814 struct lustre_handle lockh;
818 case LDLM_CB_BLOCKING:
819 ldlm_lock2handle(lock, &lockh);
820 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
822 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
826 case LDLM_CB_CANCELING:
834 * When setting a lease on a file, we take ownership of the lli_mds_*_och
835 * and save it as fd->fd_och so as to force client to reopen the file even
836 * if it has an open lock in cache already.
/* Returns (via @old_open_handle) the MDS open handle this descriptor now
 * owns; fails with -EBUSY if a lease already exists or the handle is shared.
 * NOTE(review): the lines that actually move *och_p into fd->fd_och are
 * elided from this extraction. */
838 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
839 struct lustre_handle *old_open_handle)
841 struct ll_inode_info *lli = ll_i2info(inode);
842 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
843 struct obd_client_handle **och_p;
848 /* Get the openhandle of the file */
849 mutex_lock(&lli->lli_och_mutex);
850 if (fd->fd_lease_och != NULL)
851 GOTO(out_unlock, rc = -EBUSY);
853 if (fd->fd_och == NULL) {
854 if (file->f_mode & FMODE_WRITE) {
855 LASSERT(lli->lli_mds_write_och != NULL);
856 och_p = &lli->lli_mds_write_och;
857 och_usecount = &lli->lli_open_fd_write_count;
859 LASSERT(lli->lli_mds_read_och != NULL);
860 och_p = &lli->lli_mds_read_och;
861 och_usecount = &lli->lli_open_fd_read_count;
/* cannot take exclusive ownership while other fds share the handle */
864 if (*och_usecount > 1)
865 GOTO(out_unlock, rc = -EBUSY);
872 *old_open_handle = fd->fd_och->och_open_handle;
876 mutex_unlock(&lli->lli_och_mutex);
881 * Release ownership on lli_mds_*_och when putting back a file lease.
883 static int ll_lease_och_release(struct inode *inode, struct file *file)
885 struct ll_inode_info *lli = ll_i2info(inode);
886 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
887 struct obd_client_handle **och_p;
888 struct obd_client_handle *old_och = NULL;
893 mutex_lock(&lli->lli_och_mutex);
894 if (file->f_mode & FMODE_WRITE) {
895 och_p = &lli->lli_mds_write_och;
896 och_usecount = &lli->lli_open_fd_write_count;
898 och_p = &lli->lli_mds_read_och;
899 och_usecount = &lli->lli_open_fd_read_count;
902 /* The file may have been open by another process (broken lease) so
903 * *och_p is not NULL. In this case we should simply increase usecount
906 if (*och_p != NULL) {
907 old_och = fd->fd_och;
914 mutex_unlock(&lli->lli_och_mutex);
917 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
923 * Acquire a lease and open the file.
925 static struct obd_client_handle *
926 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
929 struct lookup_intent it = { .it_op = IT_OPEN };
930 struct ll_sb_info *sbi = ll_i2sbi(inode);
931 struct md_op_data *op_data;
932 struct ptlrpc_request *req = NULL;
933 struct lustre_handle old_open_handle = { 0 };
934 struct obd_client_handle *och = NULL;
939 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
940 RETURN(ERR_PTR(-EINVAL));
943 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
944 RETURN(ERR_PTR(-EPERM));
946 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
953 RETURN(ERR_PTR(-ENOMEM));
955 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
956 LUSTRE_OPC_ANY, NULL);
958 GOTO(out, rc = PTR_ERR(op_data));
960 /* To tell the MDT this openhandle is from the same owner */
961 op_data->op_open_handle = old_open_handle;
963 it.it_flags = fmode | open_flags;
964 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
965 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
966 &ll_md_blocking_lease_ast,
967 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
968 * it can be cancelled which may mislead applications that the lease is
970 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
971 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
972 * doesn't deal with openhandle, so normal openhandle will be leaked. */
973 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
974 ll_finish_md_op_data(op_data);
975 ptlrpc_req_finished(req);
977 GOTO(out_release_it, rc);
979 if (it_disposition(&it, DISP_LOOKUP_NEG))
980 GOTO(out_release_it, rc = -ENOENT);
982 rc = it_open_error(DISP_OPEN_OPEN, &it);
984 GOTO(out_release_it, rc);
986 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
987 ll_och_fill(sbi->ll_md_exp, &it, och);
989 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
990 GOTO(out_close, rc = -EOPNOTSUPP);
992 /* already get lease, handle lease lock */
993 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
994 if (it.it_lock_mode == 0 ||
995 it.it_lock_bits != MDS_INODELOCK_OPEN) {
996 /* open lock must return for lease */
997 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
998 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1000 GOTO(out_close, rc = -EPROTO);
1003 ll_intent_release(&it);
1007 /* Cancel open lock */
1008 if (it.it_lock_mode != 0) {
1009 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1011 it.it_lock_mode = 0;
1012 och->och_lease_handle.cookie = 0ULL;
1014 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1016 CERROR("%s: error closing file "DFID": %d\n",
1017 ll_get_fsname(inode->i_sb, NULL, 0),
1018 PFID(&ll_i2info(inode)->lli_fid), rc2);
1019 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1021 ll_intent_release(&it);
1025 RETURN(ERR_PTR(rc));
1029 * Check whether a layout swap can be done between two inodes.
1031 * \param[in] inode1 First inode to check
1032 * \param[in] inode2 Second inode to check
1034 * \retval 0 on success, layout swap can be performed between both inodes
1035 * \retval negative error code if requirements are not met
/* Requirements visible here: both regular files, caller has write
 * permission on both, and both live on the same superblock. */
1037 static int ll_check_swap_layouts_validity(struct inode *inode1,
1038 struct inode *inode2)
1040 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1043 if (inode_permission(inode1, MAY_WRITE) ||
1044 inode_permission(inode2, MAY_WRITE))
1047 if (inode1->i_sb != inode2->i_sb)
/* Close @och with MDS_CLOSE_LAYOUT_SWAP bias so the MDT swaps/merges the
 * layouts of @inode and @inode2 atomically with the close.  Identical fids
 * are rejected (cannot swap a file with itself). */
1053 static int ll_swap_layouts_close(struct obd_client_handle *och,
1054 struct inode *inode, struct inode *inode2)
1056 const struct lu_fid *fid1 = ll_inode2fid(inode);
1057 const struct lu_fid *fid2;
1061 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1062 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1064 rc = ll_check_swap_layouts_validity(inode, inode2);
1066 GOTO(out_free_och, rc);
1068 /* We now know that inode2 is a lustre inode */
1069 fid2 = ll_inode2fid(inode2);
1071 rc = lu_fid_cmp(fid1, fid2);
1073 GOTO(out_free_och, rc = -EINVAL);
1075 /* Close the file and {swap,merge} layouts between inode & inode2.
1076 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1077 * because we still need it to pack l_remote_handle to MDT. */
1078 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1081 och = NULL; /* freed in ll_close_inode_openhandle() */
1091 * Release lease and close the file.
1092 * It will check if the lease has ever broken.
/* If the lease lock was never cancelled, cancel it now (unless a close
 * intent @bias will consume it) and then do the biased close; if it was
 * already broken, the intent cannot be executed. */
1094 static int ll_lease_close_intent(struct obd_client_handle *och,
1095 struct inode *inode,
1096 bool *lease_broken, enum mds_op_bias bias,
1099 struct ldlm_lock *lock;
1100 bool cancelled = true;
1104 lock = ldlm_handle2lock(&och->och_lease_handle);
1106 lock_res_and_lock(lock);
1107 cancelled = ldlm_is_cancel(lock);
1108 unlock_res_and_lock(lock);
1109 LDLM_LOCK_PUT(lock);
1112 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1113 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1115 if (lease_broken != NULL)
1116 *lease_broken = cancelled;
/* intact lease with no intent: cancel it ourselves before closing */
1118 if (!cancelled && !bias)
1119 ldlm_cli_cancel(&och->och_lease_handle, 0);
1121 if (cancelled) { /* no need to excute intent */
1126 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Plain lease release: close with no bias and no intent data. */
1130 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1133 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1137 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
/* Copies the user's ll_ioc_lease_id from @arg, flushes dirty pages under
 * the current layout (ll_data_version with LL_DV_WR_FLUSH), then asks the
 * MDT to start mirror resync for ioc.lil_mirror_id. */
1139 static int ll_lease_file_resync(struct obd_client_handle *och,
1140 struct inode *inode, unsigned long arg)
1142 struct ll_sb_info *sbi = ll_i2sbi(inode);
1143 struct md_op_data *op_data;
1144 struct ll_ioc_lease_id ioc;
1145 __u64 data_version_unused;
1149 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1150 LUSTRE_OPC_ANY, NULL);
1151 if (IS_ERR(op_data))
1152 RETURN(PTR_ERR(op_data));
1154 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1158 /* before starting file resync, it's necessary to clean up page cache
1159 * in client memory, otherwise once the layout version is increased,
1160 * writing back cached data will be denied the OSTs. */
1161 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1165 op_data->op_lease_handle = och->och_lease_handle;
1166 op_data->op_mirror_id = ioc.lil_mirror_id;
1167 rc = md_file_resync(sbi->ll_md_exp, op_data);
1173 ll_finish_md_op_data(op_data);
/* Merge MDS-cached timestamps (lli_*) with OST-side attributes (cl_attr):
 * take the newest of each timestamp and adopt OST size/blocks, all under
 * the inode size lock.  -ENODATA from cl_object_attr_get is treated as
 * "no OST objects yet" and ignored. */
1177 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1179 struct ll_inode_info *lli = ll_i2info(inode);
1180 struct cl_object *obj = lli->lli_clob;
1181 struct cl_attr *attr = vvp_env_thread_attr(env);
1189 ll_inode_size_lock(inode);
1191 /* Merge timestamps the most recently obtained from MDS with
1192 * timestamps obtained from OSTs.
1194 * Do not overwrite atime of inode because it may be refreshed
1195 * by file_accessed() function. If the read was served by cache
1196 * data, there is no RPC to be sent so that atime may not be
1197 * transferred to OSTs at all. MDT only updates atime at close time
1198 * if it's at least 'mdd.*.atime_diff' older.
1199 * All in all, the atime in Lustre does not strictly comply with
1200 * POSIX. Solving this problem needs to send an RPC to MDT for each
1201 * read, this will hurt performance. */
1202 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1203 LTIME_S(inode->i_atime) = lli->lli_atime;
1204 lli->lli_update_atime = 0;
1206 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1207 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* snapshot the MDS-side values before comparing with OST attributes */
1209 atime = LTIME_S(inode->i_atime);
1210 mtime = LTIME_S(inode->i_mtime);
1211 ctime = LTIME_S(inode->i_ctime);
1213 cl_object_attr_lock(obj);
1214 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1217 rc = cl_object_attr_get(env, obj, attr);
1218 cl_object_attr_unlock(obj);
1221 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
/* keep whichever side has the newer timestamp */
1223 if (atime < attr->cat_atime)
1224 atime = attr->cat_atime;
1226 if (ctime < attr->cat_ctime)
1227 ctime = attr->cat_ctime;
1229 if (mtime < attr->cat_mtime)
1230 mtime = attr->cat_mtime;
1232 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1233 PFID(&lli->lli_fid), attr->cat_size);
1235 i_size_write(inode, attr->cat_size);
1236 inode->i_blocks = attr->cat_blocks;
1238 LTIME_S(inode->i_atime) = atime;
1239 LTIME_S(inode->i_mtime) = mtime;
1240 LTIME_S(inode->i_ctime) = ctime;
1243 ll_inode_size_unlock(inode);
1249 * Set designated mirror for I/O.
1251 * So far only read, write, and truncated can support to issue I/O to
1252 * designated mirror.
/*
 * Copies the designated mirror and layout version from the per-fd state
 * (set during FLR resync) into the cl_io so lower layers target one mirror.
 */
1254 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1256 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1258 /* clear layout version for generic(non-resync) I/O in case it carries
1259 * stale layout version due to I/O restart */
1260 io->ci_layout_version = 0;
1262 /* FLR: disable non-delay for designated mirror I/O because obviously
1263 * only one mirror is available */
1264 if (fd->fd_designated_mirror > 0) {
1266 io->ci_designated_mirror = fd->fd_designated_mirror;
1267 io->ci_layout_version = fd->fd_layout_version;
1268 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
/* fix: spelling of "designated" in the debug message */
1272 CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1273 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/*
 * Return true if atime updates should be suppressed for this open file,
 * mirroring the checks the kernel performs in file_accessed()/touch_atime()
 * (O_NOATIME, S_NOATIME, per-mount noatime/ro, nodiratime on directories).
 */
1276 static bool file_is_noatime(const struct file *file)
1278 const struct vfsmount *mnt = file->f_path.mnt;
1279 const struct inode *inode = file_inode((struct file *)file);
1281 /* Adapted from file_accessed() and touch_atime().*/
1282 if (file->f_flags & O_NOATIME)
1285 if (inode->i_flags & S_NOATIME)
1288 if (IS_NOATIME(inode))
1291 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1294 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1297 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1303 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/*
 * Initialize a cl_io for a read or write on @file: seed the embedded kiocb,
 * translate open flags (O_NONBLOCK/O_APPEND/O_SYNC/O_DIRECT) into cl_io
 * fields, choose the DLM lock mode, and apply mirror/parallel-IO policy.
 */
1305 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1307 struct inode *inode = file_inode(file);
1308 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1310 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1311 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1312 io->u.ci_rw.rw_file = file;
1313 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1314 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1315 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1317 if (iot == CIT_WRITE) {
1318 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1319 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1320 file->f_flags & O_DIRECT ||
1323 io->ci_obj = ll_i2info(inode)->lli_clob;
1324 io->ci_lockreq = CILR_MAYBE;
1325 if (ll_file_nolock(file)) {
1326 io->ci_lockreq = CILR_NEVER;
1327 io->ci_no_srvlock = 1;
1328 } else if (file->f_flags & O_APPEND) {
1329 io->ci_lockreq = CILR_MANDATORY;
1331 io->ci_noatime = file_is_noatime(file);
/* parallel IO is incompatible with append (position not known upfront) */
1332 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1333 io->ci_pio = !io->u.ci_rw.rw_append;
1337 /* FLR: only use non-delay I/O for read as there is only one
1338 * available mirror for write. */
1339 io->ci_ndelay = !(iot == CIT_WRITE);
1341 ll_io_set_mirror(io, file);
/*
 * Parallel-task worker for a split read/write: runs one sub-range of the
 * original IO described by the cl_io_pt in ptask->pt_cbdata.  Accumulates
 * bytes moved into pt->cip_result and records whether a restart is needed.
 * Returns 0 if any progress was made, otherwise the IO error code.
 */
1344 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1346 struct cl_io_pt *pt = ptask->pt_cbdata;
1347 struct file *file = pt->cip_file;
1350 loff_t pos = pt->cip_pos;
1355 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1356 file_dentry(file)->d_name.name,
1357 pt->cip_iot == CIT_READ ? "read" : "write",
1358 pos, pos + pt->cip_count);
1360 env = cl_env_get(&refcheck);
1362 RETURN(PTR_ERR(env));
1364 io = vvp_env_thread_io(env);
1365 ll_io_init(io, file, pt->cip_iot);
1366 io->u.ci_rw.rw_iter = pt->cip_iter;
1367 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1368 io->ci_pio = 0; /* It's already in parallel task */
/* start past what this task already transferred (restart case) */
1370 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1371 pt->cip_count - pt->cip_result);
1373 struct vvp_io *vio = vvp_env_io(env);
1375 vio->vui_io_subtype = IO_NORMAL;
1376 vio->vui_fd = LUSTRE_FPRIVATE(file);
1378 ll_cl_add(file, env, io, LCC_RW);
1379 rc = cl_io_loop(env, io);
1380 ll_cl_remove(file, env);
1382 /* cl_io_rw_init() handled IO */
1386 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
/* advance iterator/position by the bytes this iteration moved */
1392 if (io->ci_nob > 0) {
1393 pt->cip_result += io->ci_nob;
1394 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1396 pt->cip_iocb.ki_pos = pos;
1397 #ifdef HAVE_KIOCB_KI_LEFT
1398 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1399 #elif defined(HAVE_KI_NBYTES)
1400 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1404 cl_io_fini(env, io);
1405 cl_env_put(env, &refcheck);
1407 pt->cip_need_restart = io->ci_need_restart;
1409 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1410 file_dentry(file)->d_name.name,
1411 pt->cip_iot == CIT_READ ? "read" : "write",
1412 pt->cip_result, rc);
1414 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * Common engine for all llite read/write paths (normal, splice): builds the
 * cl_io, takes the range lock where required, runs cl_io_loop() in a retry
 * loop (FLR mirror retry / layout restart), and maintains position, stats
 * and fd_write_failed state.  Returns bytes transferred or negative errno.
 * NOTE(review): elided listing — declarations, loop braces and some labels
 * between the numbered lines are not shown.
 */
1418 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1419 struct file *file, enum cl_io_type iot,
1420 loff_t *ppos, size_t count)
1422 struct range_lock range;
1423 struct vvp_io *vio = vvp_env_io(env);
1424 struct inode *inode = file_inode(file);
1425 struct ll_inode_info *lli = ll_i2info(inode);
1426 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1431 unsigned retried = 0;
1432 bool restarted = false;
1436 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1437 file_dentry(file)->d_name.name,
1438 iot == CIT_READ ? "read" : "write", pos, pos + count);
1441 io = vvp_env_thread_io(env);
1442 ll_io_init(io, file, iot);
1443 if (args->via_io_subtype == IO_NORMAL) {
1444 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1445 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1447 if (args->via_io_subtype != IO_NORMAL || restarted)
/* preserve FLR mirror-retry count across IO restarts */
1449 io->ci_ndelay_tried = retried;
1451 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1452 bool range_locked = false;
/* append writes lock to EOF since the final range is unknown */
1454 if (file->f_flags & O_APPEND)
1455 range_lock_init(&range, 0, LUSTRE_EOF);
1457 range_lock_init(&range, pos, pos + count - 1);
1459 vio->vui_fd = LUSTRE_FPRIVATE(file);
1460 vio->vui_io_subtype = args->via_io_subtype;
1462 switch (vio->vui_io_subtype) {
1464 /* Direct IO reads must also take range lock,
1465 * or multiple reads will try to work on the same pages
1466 * See LU-6227 for details. */
1467 if (((iot == CIT_WRITE) ||
1468 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1469 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1470 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1472 rc = range_lock(&lli->lli_write_tree, &range);
1476 range_locked = true;
1480 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1481 vio->u.splice.vui_flags = args->u.splice.via_flags;
1484 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1488 ll_cl_add(file, env, io, LCC_RW);
/* parallel write needs i_mutex held across the whole loop */
1489 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1490 !lli->lli_inode_locked) {
1492 lli->lli_inode_locked = 1;
1494 rc = cl_io_loop(env, io);
1495 if (lli->lli_inode_locked) {
1496 lli->lli_inode_locked = 0;
1497 inode_unlock(inode);
1499 ll_cl_remove(file, env);
1502 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1504 range_unlock(&lli->lli_write_tree, &range);
1507 /* cl_io_rw_init() handled IO */
1511 if (io->ci_nob > 0) {
1512 result += io->ci_nob;
1513 count -= io->ci_nob;
1515 if (args->via_io_subtype == IO_NORMAL) {
1516 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1518 /* CLIO is too complicated. See LU-11069. */
1519 if (cl_io_is_append(io))
1520 pos = io->u.ci_rw.rw_iocb.ki_pos;
1524 args->u.normal.via_iocb->ki_pos = pos;
1525 #ifdef HAVE_KIOCB_KI_LEFT
1526 args->u.normal.via_iocb->ki_left = count;
1527 #elif defined(HAVE_KI_NBYTES)
1528 args->u.normal.via_iocb->ki_nbytes = count;
1532 pos = io->u.ci_rw.rw_range.cir_pos;
1536 cl_io_fini(env, io);
1539 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1540 file->f_path.dentry->d_name.name,
1541 iot, rc, result, io->ci_need_restart);
/* restart the IO (e.g. layout change / mirror switch) until done */
1543 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1545 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1546 file_dentry(file)->d_name.name,
1547 iot == CIT_READ ? "read" : "write",
1548 pos, pos + count, result, rc);
1549 /* preserve the tried count for FLR */
1550 retried = io->ci_ndelay_tried;
1555 if (iot == CIT_READ) {
1557 ll_stats_ops_tally(ll_i2sbi(inode),
1558 LPROC_LL_READ_BYTES, result);
1559 } else if (iot == CIT_WRITE) {
1561 ll_stats_ops_tally(ll_i2sbi(inode),
1562 LPROC_LL_WRITE_BYTES, result);
1563 fd->fd_write_failed = false;
1564 } else if (result == 0 && rc == 0) {
/* zero-byte write with no error: clear/keep failure state per rc */
1567 fd->fd_write_failed = true;
1569 fd->fd_write_failed = false;
1570 } else if (rc != -ERESTARTSYS) {
1571 fd->fd_write_failed = true;
1575 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1576 file_dentry(file)->d_name.name,
1577 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1581 RETURN(result > 0 ? result : rc);
1585 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1586 * especially for small I/O.
1588 * To serve a read request, CLIO has to create and initialize a cl_io and
1589 * then request DLM lock. This has turned out to have siginificant overhead
1590 * and affects the performance of small I/O dramatically.
1592 * It's not necessary to create a cl_io for each I/O. Under the help of read
1593 * ahead, most of the pages being read are already in memory cache and we can
1594 * read those pages directly because if the pages exist, the corresponding DLM
1595 * lock must exist so that page content must be valid.
1597 * In fast read implementation, the llite speculatively finds and reads pages
1598 * in memory cache. There are three scenarios for fast read:
1599 * - If the page exists and is uptodate, kernel VM will provide the data and
1600 * CLIO won't be intervened;
1601 * - If the page was brought into memory by read ahead, it will be exported
1602 * and read ahead parameters will be updated;
1603 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1604 * it will go back and invoke normal read, i.e., a cl_io will be created
1605 * and DLM lock will be requested.
1607 * POSIX compliance: posix standard states that read is intended to be atomic.
1608 * Lustre read implementation is in line with Linux kernel read implementation
1609 * and neither of them complies with POSIX standard in this matter. Fast read
1610 * doesn't make the situation worse on single node but it may interleave write
1611 * results from multiple nodes due to short read handling in ll_file_aio_read().
1613 * \param env - lu_env
1614 * \param iocb - kiocb from kernel
1615 * \param iter - user space buffers where the data will be copied
1617 * \retval - number of bytes have been read, or error code if error occurred.
1620 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1624 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1627 /* NB: we can't do direct IO for fast read because it will need a lock
1628 * to make IO engine happy. */
1629 if (iocb->ki_filp->f_flags & O_DIRECT)
/* serve the read straight from the page cache via the generic VFS path */
1632 result = generic_file_read_iter(iocb, iter);
1634 /* If the first page is not in cache, generic_file_aio_read() will be
1635 * returned with -ENODATA.
1636 * See corresponding code in ll_readpage(). */
1637 if (result == -ENODATA)
1641 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1642 LPROC_LL_READ_BYTES, result);
1648 * Read from a file (through the page cache).
/*
 * read_iter entry point: first attempt the lockless fast-read path; if it
 * did not consume the whole request, fall back to the full cl_io path via
 * ll_file_io_generic() for the remaining bytes.
 */
1650 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1653 struct vvp_io_args *args;
1658 result = ll_do_fast_read(iocb, to);
/* fast read satisfied everything (or errored): no cl_io needed */
1659 if (result < 0 || iov_iter_count(to) == 0)
1662 env = cl_env_get(&refcheck);
1664 return PTR_ERR(env);
1666 args = ll_env_args(env, IO_NORMAL);
1667 args->u.normal.via_iter = to;
1668 args->u.normal.via_iocb = iocb;
1670 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1671 &iocb->ki_pos, iov_iter_count(to));
1674 else if (result == 0)
1677 cl_env_put(env, &refcheck);
1683 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1684 * If a page is already in the page cache and dirty (and some other things -
1685 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1686 * write to it without doing a full I/O, because Lustre already knows about it
1687 * and will write it out. This saves a lot of processing time.
1689 * All writes here are within one page, so exclusion is handled by the page
1690 * lock on the vm page. We do not do tiny writes for writes which touch
1691 * multiple pages because it's very unlikely multiple sequential pages are
1692 * are already dirty.
1694 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1695 * and are unlikely to be to already dirty pages.
1697 * Attribute updates are important here, we do them in ll_tiny_write_end.
1699 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1701 ssize_t count = iov_iter_count(iter);
1702 struct file *file = iocb->ki_filp;
1703 struct inode *inode = file_inode(file);
1708 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1709 * of function for why.
1711 if (count >= PAGE_SIZE ||
1712 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
/* generic VFS path; ll_tiny_write_begin enforces the "already dirty" rule */
1715 result = __generic_file_write_iter(iocb, iter);
1717 /* If the page is not already dirty, ll_tiny_write_begin returns
1718 * -ENODATA. We continue on to normal write.
1720 if (result == -ENODATA)
1724 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1726 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1729 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1735 * Write to a file (through the page cache).
/*
 * write_iter entry point: try the tiny-write fast path first (when enabled
 * and the flags allow it), then run the normal cl_io write for whatever is
 * left, combining the byte counts of both phases.
 */
1737 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1739 struct vvp_io_args *args;
1741 ssize_t rc_tiny = 0, rc_normal;
1746 /* NB: we can't do direct IO for tiny writes because they use the page
1747 * cache, we can't do sync writes because tiny writes can't flush
1748 * pages, and we can't do append writes because we can't guarantee the
1749 * required DLM locks are held to protect file size.
1751 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1752 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1753 rc_tiny = ll_do_tiny_write(iocb, from);
1755 /* In case of error, go on and try normal write - Only stop if tiny
1756 * write completed I/O.
1758 if (iov_iter_count(from) == 0)
1759 GOTO(out, rc_normal = rc_tiny);
1761 env = cl_env_get(&refcheck);
1763 return PTR_ERR(env);
1765 args = ll_env_args(env, IO_NORMAL);
1766 args->u.normal.via_iter = from;
1767 args->u.normal.via_iocb = iocb;
1769 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1770 &iocb->ki_pos, iov_iter_count(from));
1772 /* On success, combine bytes written. */
1773 if (rc_tiny >= 0 && rc_normal > 0)
1774 rc_normal += rc_tiny;
1775 /* On error, only return error from normal write if tiny write did not
1776 * write any bytes. Otherwise return bytes written by tiny write.
1778 else if (rc_tiny > 0)
1779 rc_normal = rc_tiny;
1781 cl_env_put(env, &refcheck);
1786 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1788 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, truncating at
 * the first inaccessible segment (legacy aio compatibility path).
 */
1790 static int ll_file_get_iov_count(const struct iovec *iov,
1791 unsigned long *nr_segs, size_t *count)
1796 for (seg = 0; seg < *nr_segs; seg++) {
1797 const struct iovec *iv = &iov[seg];
1800 * If any segment has a negative length, or the cumulative
1801 * length ever wraps negative then return -EINVAL.
1804 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1806 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1811 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Legacy aio_read entry point: validate the iovec array, wrap it in an
 * iov_iter and delegate to ll_file_read_iter().
 */
1818 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1819 unsigned long nr_segs, loff_t pos)
1826 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1830 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1831 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1832 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1833 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1834 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1836 result = ll_file_read_iter(iocb, &to);
/*
 * Legacy synchronous read entry point (kernels without f_op->read_iter):
 * build a one-segment iovec and a sync kiocb around (buf, count), delegate
 * to ll_file_aio_read(), and copy the updated position back to *ppos.
 */
1841 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1844 struct iovec iov = { .iov_base = buf, .iov_len = count };
1849 init_sync_kiocb(&kiocb, file);
1850 kiocb.ki_pos = *ppos;
1851 #ifdef HAVE_KIOCB_KI_LEFT
1852 kiocb.ki_left = count;
1853 #elif defined(HAVE_KI_NBYTES)
/* fix: the kiocb field is ki_nbytes (as used in ll_file_write() and
 * ll_file_io_ptask()), not i_nbytes — would not compile on
 * HAVE_KI_NBYTES kernels */
1854 kiocb.ki_nbytes = count;
1857 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1858 *ppos = kiocb.ki_pos;
1864 * Write to a file (through the page cache).
/*
 * Legacy aio_write entry point: validate the iovec array, wrap it in an
 * iov_iter and delegate to ll_file_write_iter().
 */
1867 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1868 unsigned long nr_segs, loff_t pos)
1870 struct iov_iter from;
1875 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1879 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1880 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1881 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1882 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1883 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1885 result = ll_file_write_iter(iocb, &from);
/*
 * Legacy synchronous write entry point: build a one-segment iovec and a
 * sync kiocb, delegate to ll_file_aio_write(), and update *ppos.
 */
1890 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1891 size_t count, loff_t *ppos)
1893 struct iovec iov = { .iov_base = (void __user *)buf,
1900 init_sync_kiocb(&kiocb, file);
1901 kiocb.ki_pos = *ppos;
1902 #ifdef HAVE_KIOCB_KI_LEFT
1903 kiocb.ki_left = count;
1904 #elif defined(HAVE_KI_NBYTES)
1905 kiocb.ki_nbytes = count;
1908 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1909 *ppos = kiocb.ki_pos;
1913 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1916 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry point: run a CIT_READ through ll_file_io_generic()
 * with the IO_SPLICE subtype so data lands in the given pipe.
 */
1918 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1919 struct pipe_inode_info *pipe, size_t count,
1923 struct vvp_io_args *args;
1928 env = cl_env_get(&refcheck);
1930 RETURN(PTR_ERR(env));
1932 args = ll_env_args(env, IO_SPLICE);
1933 args->u.splice.via_pipe = pipe;
1934 args->u.splice.via_flags = flags;
1936 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1937 cl_env_put(env, &refcheck);
/*
 * Apply a striping layout (lov_user_md) to @inode by re-opening it by FID
 * with an open intent carrying the layout, then releasing the open handle.
 * Serialized against size updates via the inode size lock.
 */
1941 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1942 __u64 flags, struct lov_user_md *lum, int lum_size)
1944 struct lookup_intent oit = {
1946 .it_flags = flags | MDS_OPEN_BY_FID,
1951 ll_inode_size_lock(inode);
1952 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1954 GOTO(out_unlock, rc);
1956 ll_release_openhandle(dentry, &oit);
1959 ll_inode_size_unlock(inode);
1960 ll_intent_release(&oit);
/*
 * Fetch the LOV EA (striping descriptor) of @filename (child of @inode)
 * from the MDS via md_getattr_name(), swab it to host endianness when
 * necessary, and return pointers into the reply buffer (*lmmp/*lmm_size);
 * the caller must keep/free *request, which owns the memory.
 */
1965 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1966 struct lov_mds_md **lmmp, int *lmm_size,
1967 struct ptlrpc_request **request)
1969 struct ll_sb_info *sbi = ll_i2sbi(inode);
1970 struct mdt_body *body;
1971 struct lov_mds_md *lmm = NULL;
1972 struct ptlrpc_request *req = NULL;
1973 struct md_op_data *op_data;
1976 rc = ll_get_default_mdsize(sbi, &lmmsize);
1980 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1981 strlen(filename), lmmsize,
1982 LUSTRE_OPC_ANY, NULL);
1983 if (IS_ERR(op_data))
1984 RETURN(PTR_ERR(op_data));
1986 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1987 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1988 ll_finish_md_op_data(op_data);
1990 CDEBUG(D_INFO, "md_getattr_name failed "
1991 "on %s: rc %d\n", filename, rc);
1995 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1996 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1998 lmmsize = body->mbo_eadatasize;
2000 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2002 GOTO(out, rc = -ENODATA);
2005 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2006 LASSERT(lmm != NULL);
/* only plain v1/v3 and composite layouts are understood here */
2008 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2009 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2010 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
2011 GOTO(out, rc = -EPROTO);
2014 * This is coming from the MDS, so is probably in
2015 * little endian. We convert it to host endian before
2016 * passing it to userspace.
/* only swab on big-endian hosts: no-op when host order == LE */
2018 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2021 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2022 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2023 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2024 if (le32_to_cpu(lmm->lmm_pattern) &
2025 LOV_PATTERN_F_RELEASED)
2029 /* if function called for directory - we should
2030 * avoid swab not existent lsm objects */
2031 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2032 lustre_swab_lov_user_md_v1(
2033 (struct lov_user_md_v1 *)lmm);
2034 if (S_ISREG(body->mbo_mode))
2035 lustre_swab_lov_user_md_objects(
2036 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2038 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2039 lustre_swab_lov_user_md_v3(
2040 (struct lov_user_md_v3 *)lmm);
2041 if (S_ISREG(body->mbo_mode))
2042 lustre_swab_lov_user_md_objects(
2043 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2045 } else if (lmm->lmm_magic ==
2046 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2047 lustre_swab_lov_comp_md_v1(
2048 (struct lov_comp_md_v1 *)lmm);
2054 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one OST object entry)
 * from userspace and apply it via ll_lov_setstripe_ea_info().
 * Requires CAP_SYS_ADMIN since it specifies pre-existing objects.
 */
2059 static int ll_lov_setea(struct inode *inode, struct file *file,
2062 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2063 struct lov_user_md *lump;
2064 int lum_size = sizeof(struct lov_user_md) +
2065 sizeof(struct lov_user_ost_data);
2069 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2072 OBD_ALLOC_LARGE(lump, lum_size);
2076 if (copy_from_user(lump, arg, lum_size))
2077 GOTO(out_lump, rc = -EFAULT);
2079 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* clear O_LOV_DELAY_CREATE regardless of outcome */
2081 cl_lov_delay_create_clear(&file->f_flags);
2084 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the inode's striping information to the userspace buffer @lum
 * (up to @size bytes) via cl_object_getstripe().
 */
2088 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2095 env = cl_env_get(&refcheck);
2097 RETURN(PTR_ERR(env));
2099 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2100 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's layout, apply it, then
 * refresh the layout generation and echo the instantiated striping back
 * to userspace.
 */
2104 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2107 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2108 struct lov_user_md *klum;
2110 __u64 flags = FMODE_WRITE;
2113 rc = ll_copy_user_md(lum, &klum);
2118 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* zero stripe_count first so a failed getstripe isn't misread */
2123 rc = put_user(0, &lum->lmm_stripe_count);
2127 rc = ll_layout_refresh(inode, &gen);
2131 rc = ll_file_getstripe(inode, arg, lum_size);
2133 cl_lov_delay_create_clear(&file->f_flags);
2136 OBD_FREE(klum, lum_size);
/*
 * Take a group lock with gid @arg on @inode and record it in the per-fd
 * state.  For PFL files all OST objects are instantiated first (group lock
 * must cover every object).  Races between threads on the same fd are
 * resolved under lli_lock.
 */
2141 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2143 struct ll_inode_info *lli = ll_i2info(inode);
2144 struct cl_object *obj = lli->lli_clob;
2145 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2146 struct ll_grouplock grouplock;
2151 CWARN("group id for group lock must not be 0\n");
2155 if (ll_file_nolock(file))
2156 RETURN(-EOPNOTSUPP);
2158 spin_lock(&lli->lli_lock);
2159 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2160 CWARN("group lock already existed with gid %lu\n",
2161 fd->fd_grouplock.lg_gid);
2162 spin_unlock(&lli->lli_lock);
2165 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2166 spin_unlock(&lli->lli_lock);
2169 * XXX: group lock needs to protect all OST objects while PFL
2170 * can add new OST objects during the IO, so we'd instantiate
2171 * all OST objects before getting its group lock.
2176 struct cl_layout cl = {
2177 .cl_is_composite = false,
2179 struct lu_extent ext = {
2181 .e_end = OBD_OBJECT_EOF,
2184 env = cl_env_get(&refcheck);
2186 RETURN(PTR_ERR(env));
2188 rc = cl_object_layout_get(env, obj, &cl);
/* composite (PFL) layout: force instantiation of all components */
2189 if (!rc && cl.cl_is_composite)
2190 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2193 cl_env_put(env, &refcheck);
2198 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2199 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under lli_lock: another thread may have won meanwhile */
2203 spin_lock(&lli->lli_lock);
2204 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2205 spin_unlock(&lli->lli_lock);
2206 CERROR("another thread just won the race\n");
2207 cl_put_grouplock(&grouplock);
2211 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2212 fd->fd_grouplock = grouplock;
2213 spin_unlock(&lli->lli_lock);
2215 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Release the group lock with gid @arg held on this fd.  Validates under
 * lli_lock that a lock is held and the gid matches, clears the per-fd
 * state, then drops the cl-layer lock outside the spinlock.
 */
2219 static int ll_put_grouplock(struct inode *inode, struct file *file,
2222 struct ll_inode_info *lli = ll_i2info(inode);
2223 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2224 struct ll_grouplock grouplock;
2227 spin_lock(&lli->lli_lock);
2228 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2229 spin_unlock(&lli->lli_lock);
2230 CWARN("no group lock held\n");
2234 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2236 if (fd->fd_grouplock.lg_gid != arg) {
2237 CWARN("group lock %lu doesn't match current id %lu\n",
2238 arg, fd->fd_grouplock.lg_gid);
2239 spin_unlock(&lli->lli_lock);
/* take a local copy so cl_put_grouplock() runs without lli_lock */
2243 grouplock = fd->fd_grouplock;
2244 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2245 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2246 spin_unlock(&lli->lli_lock);
2248 cl_put_grouplock(&grouplock);
2249 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2254 * Close inode open handle
2256 * \param dentry [in] dentry which contains the inode
2257 * \param it [in,out] intent which contains open info and result
2260 * \retval <0 failure
2262 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2264 struct inode *inode = dentry->d_inode;
2265 struct obd_client_handle *och;
2271 /* Root ? Do nothing. */
2272 if (dentry->d_inode->i_sb->s_root == dentry)
2275 /* No open handle to close? Move away */
2276 if (!it_disposition(it, DISP_OPEN_OPEN))
2279 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2281 OBD_ALLOC(och, sizeof(*och));
2283 GOTO(out, rc = -ENOMEM);
/* populate the client handle from the intent's open reply */
2285 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2287 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2289 /* this one is in place of ll_file_open */
2290 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2291 ptlrpc_req_finished(it->it_request);
2292 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2298 * Get size for inode for which FIEMAP mapping is requested.
2299 * Make the FIEMAP get_info call and returns the result.
2300 * \param fiemap kernel buffer to hold extens
2301 * \param num_bytes kernel buffer size
2303 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2309 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2312 /* Checks for fiemap flags */
/* unsupported flags are reported back to the caller via fm_flags */
2313 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2314 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2318 /* Check for FIEMAP_FLAG_SYNC */
2319 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2320 rc = filemap_fdatawrite(inode->i_mapping);
2325 env = cl_env_get(&refcheck);
2327 RETURN(PTR_ERR(env));
/* a zero cached size may just be un-glimpsed: fetch it from OSTs */
2329 if (i_size_read(inode) == 0) {
2330 rc = ll_glimpse_size(inode);
2335 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2336 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2337 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2339 /* If filesize is 0, then there would be no objects for mapping */
2340 if (fmkey.lfik_oa.o_size == 0) {
2341 fiemap->fm_mapped_extents = 0;
2345 fmkey.lfik_fiemap = *fiemap;
2347 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2348 &fmkey, fiemap, &num_bytes);
2350 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH handler: resolve a FID to a path on the MDT.  Copies
 * the user's getinfo_fid2path request in, appends the mount's root FID
 * (for fileset support), issues the MDC ioctl and copies the result back.
 */
2354 int ll_fid2path(struct inode *inode, void __user *arg)
2356 struct obd_export *exp = ll_i2mdexp(inode);
2357 const struct getinfo_fid2path __user *gfin = arg;
2359 struct getinfo_fid2path *gfout;
2365 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2366 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2369 /* Only need to get the buflen */
2370 if (get_user(pathlen, &gfin->gf_pathlen))
/* bound the allocation by the user-supplied path length */
2373 if (pathlen > PATH_MAX)
2376 outsize = sizeof(*gfout) + pathlen;
2377 OBD_ALLOC(gfout, outsize);
2381 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2382 GOTO(gf_free, rc = -EFAULT);
2383 /* append root FID after gfout to let MDT know the root FID so that it
2384 * can lookup the correct path, this is mainly for fileset.
2385 * old server without fileset mount support will ignore this. */
2386 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2388 /* Call mdc_iocontrol */
2389 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2393 if (copy_to_user(arg, gfout, outsize))
2397 OBD_FREE(gfout, outsize);
/*
 * Compute the data version (and layout version) of @inode by running a
 * CIT_DATA_VERSION cl_io over all stripes; results are written back into
 * @ioc.  Retries when the IO layer signals a restart is needed.
 */
2402 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2404 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2412 ioc->idv_version = 0;
2413 ioc->idv_layout_version = UINT_MAX;
2415 /* If no file object initialized, we consider its version is 0. */
2419 env = cl_env_get(&refcheck);
2421 RETURN(PTR_ERR(env));
2423 io = vvp_env_thread_io(env);
2425 io->u.ci_data_version.dv_data_version = 0;
2426 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2427 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2430 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2431 result = cl_io_loop(env, io);
2433 result = io->ci_result;
2435 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2436 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2438 cl_io_fini(env, io);
/* layout changed mid-IO: redo the whole data-version cycle */
2440 if (unlikely(io->ci_need_restart))
2443 cl_env_put(env, &refcheck);
2449 * Read the data_version for inode.
2451 * This value is computed using stripe object version on OST.
2452 * Version is computed using server side locking.
2454 * @param flags if do sync on the OST side;
2456 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2457 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/* thin wrapper over ll_ioc_data_version() returning only the version */
2459 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2461 struct ioc_data_version ioc = { .idv_flags = flags };
2464 rc = ll_ioc_data_version(inode, &ioc);
2466 *data_version = ioc.idv_version;
2472 * Trigger a HSM release request for the provided inode.
/*
 * Takes a write lease, flushes data and grabs the final data version and
 * merged attributes, then closes the handle with MDS_HSM_RELEASE so the
 * MDT frees the OST objects.  The lease is closed on the error path.
 */
2474 int ll_hsm_release(struct inode *inode)
2477 struct obd_client_handle *och = NULL;
2478 __u64 data_version = 0;
2483 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2484 ll_get_fsname(inode->i_sb, NULL, 0),
2485 PFID(&ll_i2info(inode)->lli_fid));
2487 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2489 GOTO(out, rc = PTR_ERR(och));
2491 /* Grab latest data_version and [am]time values */
2492 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2496 env = cl_env_get(&refcheck);
2498 GOTO(out, rc = PTR_ERR(env));
2500 rc = ll_merge_attr(env, inode);
2501 cl_env_put(env, &refcheck);
2503 /* If error happen, we have the wrong size for a file.
2509 /* Release the file.
2510 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2511 * we still need it to pack l_remote_handle to MDT. */
2512 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2518 if (och != NULL && !IS_ERR(och)) /* close the file */
2519 ll_lease_close(och, inode, NULL);
2524 struct ll_swap_stack {
2527 struct inode *inode1;
2528 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS: atomically swap the layouts of two files on the
 * MDT.  Orders the two inodes by FID to avoid deadlock, optionally flushes
 * caches under a group lock (gid != 0) and verifies the requested data
 * versions before issuing the swap RPC.
 */
2533 static int ll_swap_layouts(struct file *file1, struct file *file2,
2534 struct lustre_swap_layouts *lsl)
2536 struct mdc_swap_layouts msl;
2537 struct md_op_data *op_data;
2540 struct ll_swap_stack *llss = NULL;
2543 OBD_ALLOC_PTR(llss);
2547 llss->inode1 = file_inode(file1);
2548 llss->inode2 = file_inode(file2);
2550 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2554 /* we use 2 bool because it is easier to swap than 2 bits */
2555 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2556 llss->check_dv1 = true;
2558 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2559 llss->check_dv2 = true;
2561 /* we cannot use lsl->sl_dvX directly because we may swap them */
2562 llss->dv1 = lsl->sl_dv1;
2563 llss->dv2 = lsl->sl_dv2;
2565 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2566 if (rc == 0) /* same file, done! */
/* canonical FID ordering prevents lock inversion between two nodes */
2569 if (rc < 0) { /* sequentialize it */
2570 swap(llss->inode1, llss->inode2);
2572 swap(llss->dv1, llss->dv2);
2573 swap(llss->check_dv1, llss->check_dv2);
2577 if (gid != 0) { /* application asks to flush dirty cache */
2578 rc = ll_get_grouplock(llss->inode1, file1, gid);
2582 rc = ll_get_grouplock(llss->inode2, file2, gid);
2584 ll_put_grouplock(llss->inode1, file1, gid);
2589 /* ultimate check, before swaping the layouts we check if
2590 * dataversion has changed (if requested) */
2591 if (llss->check_dv1) {
2592 rc = ll_data_version(llss->inode1, &dv, 0);
2595 if (dv != llss->dv1)
2596 GOTO(putgl, rc = -EAGAIN);
2599 if (llss->check_dv2) {
2600 rc = ll_data_version(llss->inode2, &dv, 0);
2603 if (dv != llss->dv2)
2604 GOTO(putgl, rc = -EAGAIN);
2607 /* struct md_op_data is used to send the swap args to the mdt
2608 * only flags is missing, so we use struct mdc_swap_layouts
2609 * through the md_op_data->op_data */
2610 /* flags from user space have to be converted before they are send to
2611 * server, no flag is sent today, they are only used on the client */
2614 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2615 0, LUSTRE_OPC_ANY, &msl);
2616 if (IS_ERR(op_data))
2617 GOTO(free, rc = PTR_ERR(op_data));
2619 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2620 sizeof(*op_data), op_data, NULL);
2621 ll_finish_md_op_data(op_data);
/* release group locks in reverse acquisition order */
2628 ll_put_grouplock(llss->inode2, file2, gid);
2629 ll_put_grouplock(llss->inode1, file1, gid);
/* Set/clear HSM state flags on @inode as requested by @hss.
 * Validates the masks and archive id locally, then forwards the request
 * to the MDT via obd_iocontrol(LL_IOC_HSM_STATE_SET).
 * Returns 0 on success, negative errno on failure (error RETURNs for the
 * validation branches are elided in this listing). */
2639 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2641 struct md_op_data *op_data;
2645 /* Detect out-of range masks */
2646 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2649 /* Non-root users are forbidden to set or clear flags which are
2650 * NOT defined in HSM_USER_MASK. */
2651 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2652 !cfs_capable(CFS_CAP_SYS_ADMIN))
2655 /* Detect out-of range archive id */
2656 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2657 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
2660 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2661 LUSTRE_OPC_ANY, hss);
2662 if (IS_ERR(op_data))
2663 RETURN(PTR_ERR(op_data));
2665 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2666 sizeof(*op_data), op_data, NULL);
2668 ll_finish_md_op_data(op_data);
/* Import an already-archived file into HSM: mark it ARCHIVED|EXISTS|
 * RELEASED on the MDT, then restore the saved attributes (mode, owner,
 * size, a/mtime) from @hui via ll_setattr_raw().
 * Regular files only. NOTE(review): the hss allocation, inode_lock()
 * pairing for the unlock below, and the out:/cleanup tail are elided
 * in this listing. */
2673 static int ll_hsm_import(struct inode *inode, struct file *file,
2674 struct hsm_user_import *hui)
2676 struct hsm_state_set *hss = NULL;
2677 struct iattr *attr = NULL;
2681 if (!S_ISREG(inode->i_mode))
2687 GOTO(out, rc = -ENOMEM);
/* Stage 1: set the HSM flags describing an imported (released) file. */
2689 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2690 hss->hss_archive_id = hui->hui_archive_id;
2691 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2692 rc = ll_hsm_state_set(inode, hss);
2696 OBD_ALLOC_PTR(attr);
2698 GOTO(out, rc = -ENOMEM);
/* Stage 2: rebuild the inode attributes from the user-supplied image;
 * only permission bits are kept from hui_mode, type is forced to REG. */
2700 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2701 attr->ia_mode |= S_IFREG;
2702 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2703 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2704 attr->ia_size = hui->hui_size;
2705 attr->ia_mtime.tv_sec = hui->hui_mtime;
2706 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2707 attr->ia_atime.tv_sec = hui->hui_atime;
2708 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
/* ATTR_FORCE: skip the permission checks, this is a privileged import. */
2710 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2711 ATTR_UID | ATTR_GID |
2712 ATTR_MTIME | ATTR_MTIME_SET |
2713 ATTR_ATIME | ATTR_ATIME_SET;
2717 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2721 inode_unlock(inode);
/* Translate a kernel fmode_t into the LL_LEASE_{RD,WR}LCK bit mask
 * reported to userspace lease ioctls. */
2733 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2735 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2736 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/* LL_IOC_FUTIMES_3 handler: set atime, mtime AND ctime of a regular
 * file from @lfu in one setattr (OP_XVALID_CTIME_SET allows the normally
 * kernel-managed ctime to be set). Requires CAP_SYS_ADMIN.
 * NOTE(review): the iattr initializer's ia_atime/ia_mtime/ia_ctime
 * designators and the inode_lock() pairing the unlock below are elided
 * in this listing. */
2739 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2741 struct inode *inode = file_inode(file);
2743 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2744 ATTR_MTIME | ATTR_MTIME_SET |
2747 .tv_sec = lfu->lfu_atime_sec,
2748 .tv_nsec = lfu->lfu_atime_nsec,
2751 .tv_sec = lfu->lfu_mtime_sec,
2752 .tv_nsec = lfu->lfu_mtime_nsec,
2755 .tv_sec = lfu->lfu_ctime_sec,
2756 .tv_nsec = lfu->lfu_ctime_nsec,
2762 if (!capable(CAP_SYS_ADMIN))
2765 if (!S_ISREG(inode->i_mode))
2769 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2771 inode_unlock(inode);
/* Map the userspace lockahead mode (lock_mode_user) to the client-side
 * cl_lock_mode. NOTE(review): the returned values and the default case
 * (invalid mode) are elided in this listing. */
2776 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2779 case MODE_READ_USER:
2781 case MODE_WRITE_USER:
/* Printable names for lock_mode_user values, used in debug messages. */
2788 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2790 /* Used to allow the upper layers of the client to request an LDLM lock
2791 * without doing an actual read or write.
2793 * Used for ladvise lockahead to manually request specific locks.
2795 * \param[in] file file this ladvise lock request is on
2796 * \param[in] ladvise ladvise struct describing this lock request
2798 * \retval 0 success, no detailed result available (sync requests
2799 * and requests sent to the server [not handled locally]
2800 * cannot return detailed results)
2801 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2802 * see definitions for details.
2803 * \retval negative negative errno on error
2805 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2807 struct lu_env *env = NULL;
2808 struct cl_io *io = NULL;
2809 struct cl_lock *lock = NULL;
2810 struct cl_lock_descr *descr = NULL;
2811 struct dentry *dentry = file->f_path.dentry;
2812 struct inode *inode = dentry->d_inode;
2813 enum cl_lock_mode cl_mode;
2814 off_t start = ladvise->lla_start;
2815 off_t end = ladvise->lla_end;
2821 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2822 "start=%llu, end=%llu\n", dentry->d_name.len,
2823 dentry->d_name.name, dentry->d_inode,
2824 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
/* A negative cl_mode here is an errno from the mode translation. */
2827 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2829 GOTO(out, result = cl_mode);
2831 /* Get IO environment */
2832 result = cl_io_get(inode, &env, &io, &refcheck);
2836 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
/* result > 0 from cl_io_init means the layer stack decided there is
2839 * nothing to do for this io. This currently happens when
2840 * stripe sub-object's are not yet created.
2842 result = io->ci_result;
2843 } else if (result == 0) {
2844 lock = vvp_env_lock(env);
2845 descr = &lock->cll_descr;
2847 descr->cld_obj = io->ci_obj;
2848 /* Convert byte offsets to pages */
2849 descr->cld_start = cl_index(io->ci_obj, start);
2850 descr->cld_end = cl_index(io->ci_obj, end);
2851 descr->cld_mode = cl_mode;
2852 /* CEF_MUST is used because we do not want to convert a
2853 * lockahead request to a lockless lock */
2854 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
/* LF_ASYNC: enqueue speculatively, don't wait for the grant. */
2857 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2858 descr->cld_enq_flags |= CEF_SPECULATIVE;
2860 result = cl_lock_request(env, io, lock);
2862 /* On success, we need to release the lock */
2864 cl_lock_release(env, lock);
2866 cl_io_fini(env, io);
2867 cl_env_put(env, &refcheck);
2869 /* -ECANCELED indicates a matching lock with a different extent
2870 * was already present, and -EEXIST indicates a matching lock
2871 * on exactly the same extent was already present.
2872 * We convert them to positive values for userspace to make
2873 * recognizing true errors easier.
2874 * Note we can only return these detailed results on async requests,
2875 * as sync requests look the same as i/o requests for locking. */
2876 if (result == -ECANCELED)
2877 result = LLA_RESULT_DIFFERENT;
2878 else if (result == -EEXIST)
2879 result = LLA_RESULT_SAME;
/* Printable names for lu_ladvise_type values, used in sanity-check
 * debug messages below. */
2884 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/* Validate one ladvise entry before it is acted on: known advice value,
 * per-advice flags within the allowed mask, valid lockahead mode, and a
 * non-empty [start, end) range. Returns 0 or a negative errno (the rc
 * assignments before each CDEBUG are elided in this listing). */
2886 static int ll_ladvise_sanity(struct inode *inode,
2887 struct llapi_lu_ladvise *ladvise)
2889 enum lu_ladvise_type advice = ladvise->lla_advice;
2890 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2891 * be in the first 32 bits of enum ladvise_flags */
2892 __u32 flags = ladvise->lla_peradvice_flags;
2893 /* 3 lines at 80 characters per line, should be plenty */
2896 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2898 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2899 "last supported advice is %s (value '%d'): rc = %d\n",
2900 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2901 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2905 /* Per-advice checks */
2907 case LU_LADVISE_LOCKNOEXPAND:
2908 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2910 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2912 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2913 ladvise_names[advice], rc);
2917 case LU_LADVISE_LOCKAHEAD:
2918 /* Currently only READ and WRITE modes can be requested */
2919 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2920 ladvise->lla_lockahead_mode == 0) {
2922 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2924 ll_get_fsname(inode->i_sb, NULL, 0),
2925 ladvise->lla_lockahead_mode,
2926 ladvise_names[advice], rc);
2929 case LU_LADVISE_WILLREAD:
2930 case LU_LADVISE_DONTNEED:
2932 /* Note fall through above - These checks apply to all advices
2933 * except LOCKNOEXPAND */
2934 if (flags & ~LF_DEFAULT_MASK) {
2936 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2938 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2939 ladvise_names[advice], rc);
2942 if (ladvise->lla_start >= ladvise->lla_end) {
2944 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2945 "for %s: rc = %d\n",
2946 ll_get_fsname(inode->i_sb, NULL, 0),
2947 ladvise->lla_start, ladvise->lla_end,
2948 ladvise_names[advice], rc);
2960 * Give file access advices
2962 * The ladvise interface is similar to Linux fadvise() system call, except it
2963 * forwards the advices directly from Lustre client to server. The server side
2964 * codes will apply appropriate read-ahead and caching techniques for the
2965 * corresponding files.
2967 * A typical workload for ladvise is e.g. a bunch of different clients are
2968 * doing small random reads of a file, so prefetching pages into OSS cache
2969 * with big linear reads before the random IO is a net benefit. Fetching
2970 * all that data into each client cache with fadvise() may not be, due to
2971 * much more data being sent to the client.
2973 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2974 struct llapi_lu_ladvise *ladvise)
2978 struct cl_ladvise_io *lio;
2983 env = cl_env_get(&refcheck);
2985 RETURN(PTR_ERR(env));
2987 io = vvp_env_thread_io(env);
2988 io->ci_obj = ll_i2info(inode)->lli_clob;
2990 /* initialize parameters for ladvise */
2991 lio = &io->u.ci_ladvise;
2992 lio->li_start = ladvise->lla_start;
2993 lio->li_end = ladvise->lla_end;
2994 lio->li_fid = ll_inode2fid(inode);
2995 lio->li_advice = ladvise->lla_advice;
2996 lio->li_flags = flags;
/* cl_io_init() == 0 means the layers accepted the io; run the loop,
 * otherwise the elided branch picks up io->ci_result. */
2998 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2999 rc = cl_io_loop(env, io);
3003 cl_io_fini(env, io);
3004 cl_env_put(env, &refcheck);
/* LU_LADVISE_LOCKNOEXPAND: record per-fd whether DLM lock expansion is
 * disabled (LF_UNSET clears the setting). */
3008 static int ll_lock_noexpand(struct file *file, int flags)
3010 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3012 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/* FS_IOC_FSGETXATTR-style handler: report the inode's xflags (including
 * PROJINHERIT when the LLIF flag is set) and project id to userspace.
 * @arg is a userspace pointer to struct fsxattr; returns -EFAULT on a
 * failed copy (error returns elided in this listing). */
3017 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3020 struct fsxattr fsxattr;
3022 if (copy_from_user(&fsxattr,
3023 (const struct fsxattr __user *)arg,
3027 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3028 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3029 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3030 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3031 if (copy_to_user((struct fsxattr __user *)arg,
3032 &fsxattr, sizeof(fsxattr)))
/* Permission check for FSSETXATTR project-quota changes: outside the
 * init user namespace, refuse any change to the project id or to the
 * PROJINHERIT flag (the early return for init_user_ns and the error
 * returns are elided in this listing). */
3038 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3041 * Project Quota ID state is only allowed to change from within the init
3042 * namespace. Enforce that restriction only if we are trying to change
3043 * the quota ID state. Everything else is allowed in user namespaces.
3045 if (current_user_ns() == &init_user_ns)
3048 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
3051 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3052 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3055 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
/* FS_IOC_FSSETXATTR-style handler: apply new xflags/project id from the
 * userspace fsxattr to the MDT via md_setattr(), then mirror the flag
 * change to the local inode and to the OSTs via cl_setattr_ost().
 * NOTE(review): the attr allocation's free, the out_fsxattr label body
 * and obj == NULL handling are elided in this listing. */
3062 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3066 struct md_op_data *op_data;
3067 struct ptlrpc_request *req = NULL;
3069 struct fsxattr fsxattr;
3070 struct cl_object *obj;
3074 if (copy_from_user(&fsxattr,
3075 (const struct fsxattr __user *)arg,
/* Namespace/project permission gate — see ll_ioctl_check_project(). */
3079 rc = ll_ioctl_check_project(inode, &fsxattr);
3083 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3084 LUSTRE_OPC_ANY, NULL);
3085 if (IS_ERR(op_data))
3086 RETURN(PTR_ERR(op_data));
3088 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3089 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3090 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3091 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3092 op_data->op_projid = fsxattr.fsx_projid;
3093 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3094 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3096 ptlrpc_req_finished(req);
3098 GOTO(out_fsxattr, rc);
3099 ll_update_inode_flags(inode, op_data->op_attr_flags);
3100 obj = ll_i2info(inode)->lli_clob;
3102 GOTO(out_fsxattr, rc);
3104 OBD_ALLOC_PTR(attr);
3106 GOTO(out_fsxattr, rc = -ENOMEM);
/* Propagate the flag change to the data objects on the OSTs. */
3108 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3109 fsxattr.fsx_xflags);
3112 ll_finish_md_op_data(op_data);
/* LL_LEASE_UNLCK path of the lease ioctl: take the fd's lease handle,
 * then close the lease with an optional intent — RESYNC_DONE (with a
 * user-supplied id array), LAYOUT_MERGE (victim fd follows the ioc in
 * @arg) or LAYOUT_SPLIT (victim fd + mirror id follow the ioc).
 * Returns the lease type held (via ll_lease_type_from_fmode) or a
 * negative errno. NOTE(review): several locals (fmode, lease_broken,
 * the fd/fdv/mirror_id declarations inside the case blocks), the
 * fput(layout_file) cleanup and the out: label are elided here. */
3116 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3119 struct inode *inode = file_inode(file);
3120 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3121 struct ll_inode_info *lli = ll_i2info(inode);
3122 struct obd_client_handle *och = NULL;
3123 struct split_param sp;
3126 enum mds_op_bias bias = 0;
3127 struct file *layout_file = NULL;
3129 size_t data_size = 0;
/* Atomically detach the lease handle from the fd under och_mutex. */
3133 mutex_lock(&lli->lli_och_mutex);
3134 if (fd->fd_lease_och != NULL) {
3135 och = fd->fd_lease_och;
3136 fd->fd_lease_och = NULL;
3138 mutex_unlock(&lli->lli_och_mutex);
3141 GOTO(out, rc = -ENOLCK);
3143 fmode = och->och_flags;
3145 switch (ioc->lil_flags) {
3146 case LL_LEASE_RESYNC_DONE:
3147 if (ioc->lil_count > IOC_IDS_MAX)
3148 GOTO(out, rc = -EINVAL);
3150 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3151 OBD_ALLOC(data, data_size);
3153 GOTO(out, rc = -ENOMEM);
3155 if (copy_from_user(data, (void __user *)arg, data_size))
3156 GOTO(out, rc = -EFAULT);
3158 bias = MDS_CLOSE_RESYNC_DONE;
3160 case LL_LEASE_LAYOUT_MERGE: {
3163 if (ioc->lil_count != 1)
3164 GOTO(out, rc = -EINVAL);
/* The victim file descriptor number follows the ioc header in @arg;
 * note this copies into a local __u32 (declared in an elided line),
 * not into the ll_file_data pointer of the same name above. */
3166 arg += sizeof(*ioc);
3167 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3168 GOTO(out, rc = -EFAULT);
3170 layout_file = fget(fd);
3172 GOTO(out, rc = -EBADF);
3174 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3175 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3176 GOTO(out, rc = -EPERM);
3178 data = file_inode(layout_file);
3179 bias = MDS_CLOSE_LAYOUT_MERGE;
3182 case LL_LEASE_LAYOUT_SPLIT: {
3186 if (ioc->lil_count != 2)
3187 GOTO(out, rc = -EINVAL);
3189 arg += sizeof(*ioc);
3190 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3191 GOTO(out, rc = -EFAULT);
3193 arg += sizeof(__u32);
3194 if (copy_from_user(&mirror_id, (void __user *)arg,
3196 GOTO(out, rc = -EFAULT);
3198 layout_file = fget(fdv);
3200 GOTO(out, rc = -EBADF);
3202 sp.sp_inode = file_inode(layout_file);
3203 sp.sp_mirror_id = (__u16)mirror_id;
3205 bias = MDS_CLOSE_LAYOUT_SPLIT;
3209 /* without close intent */
/* Close the lease, passing the intent payload assembled above. */
3213 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3217 rc = ll_lease_och_release(inode, file);
/* Cleanup: free the RESYNC_DONE id array / drop merge/split file refs. */
3226 switch (ioc->lil_flags) {
3227 case LL_LEASE_RESYNC_DONE:
3229 OBD_FREE(data, data_size);
3231 case LL_LEASE_LAYOUT_MERGE:
3232 case LL_LEASE_LAYOUT_SPLIT:
3239 rc = ll_lease_type_from_fmode(fmode);
/* LL_IOC_SET_LEASE handler: acquire a read or write lease on the file
 * (mode must match the fd's open mode), optionally starting a mirror
 * resync; LL_LEASE_UNLCK is delegated to ll_file_unlock_lease().
 * On success the open handle is stashed in fd->fd_lease_och.
 * NOTE(review): the default case, fmode/lease_broken declarations and
 * some error RETURNs are elided in this listing. */
3243 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3246 struct inode *inode = file_inode(file);
3247 struct ll_inode_info *lli = ll_i2info(inode);
3248 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3249 struct obd_client_handle *och = NULL;
3250 __u64 open_flags = 0;
3256 switch (ioc->lil_mode) {
3257 case LL_LEASE_WRLCK:
3258 if (!(file->f_mode & FMODE_WRITE))
3260 fmode = FMODE_WRITE;
3262 case LL_LEASE_RDLCK:
3263 if (!(file->f_mode & FMODE_READ))
3267 case LL_LEASE_UNLCK:
3268 RETURN(ll_file_unlock_lease(file, ioc, arg));
3273 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3275 /* apply for lease */
3276 if (ioc->lil_flags & LL_LEASE_RESYNC)
3277 open_flags = MDS_OPEN_RESYNC;
3278 och = ll_lease_open(inode, file, fmode, open_flags);
3280 RETURN(PTR_ERR(och));
/* For resync, notify the server and refresh the layout; on any failure
 * the freshly-opened lease must be closed again. */
3282 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3283 rc = ll_lease_file_resync(och, inode, arg);
3285 ll_lease_close(och, inode, NULL);
3288 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3290 ll_lease_close(och, inode, NULL);
/* Publish the lease handle unless the fd already holds one. */
3296 mutex_lock(&lli->lli_och_mutex);
3297 if (fd->fd_lease_och == NULL) {
3298 fd->fd_lease_och = och;
3301 mutex_unlock(&lli->lli_och_mutex);
3303 /* impossible now that only excl is supported for now */
3304 ll_lease_close(och, inode, &lease_broken);
/* Main ioctl dispatcher for Lustre regular files: striping/layout ops,
 * group locks, HSM, leases, ladvise, FLR mirrors and fsxattr, falling
 * through to obd_iocontrol() for anything unrecognized.
 * NOTE(review): many RETURN()s, '}' and 'break's between the visible
 * lines are elided in this listing. */
3311 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3313 struct inode *inode = file_inode(file);
3314 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3318 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3319 PFID(ll_inode2fid(inode)), inode, cmd);
3320 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3322 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3323 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3327 case LL_IOC_GETFLAGS:
3328 /* Get the current value of the file flags */
3329 return put_user(fd->fd_flags, (int __user *)arg);
3330 case LL_IOC_SETFLAGS:
3331 case LL_IOC_CLRFLAGS:
3332 /* Set or clear specific file flags */
3333 /* XXX This probably needs checks to ensure the flags are
3334 * not abused, and to handle any flag side effects.
3336 if (get_user(flags, (int __user *) arg))
/* Disabling locking is only allowed on O_DIRECT files. */
3339 if (cmd == LL_IOC_SETFLAGS) {
3340 if ((flags & LL_FILE_IGNORE_LOCK) &&
3341 !(file->f_flags & O_DIRECT)) {
3342 CERROR("%s: unable to disable locking on "
3343 "non-O_DIRECT file\n", current->comm);
3347 fd->fd_flags |= flags;
3349 fd->fd_flags &= ~flags;
3352 case LL_IOC_LOV_SETSTRIPE:
3353 case LL_IOC_LOV_SETSTRIPE_NEW:
3354 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3355 case LL_IOC_LOV_SETEA:
3356 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3357 case LL_IOC_LOV_SWAP_LAYOUTS: {
3359 struct lustre_swap_layouts lsl;
3361 if (copy_from_user(&lsl, (char __user *)arg,
3362 sizeof(struct lustre_swap_layouts)))
/* Both files must be open for write to swap layouts. */
3365 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3368 file2 = fget(lsl.sl_fd)
3372 /* O_WRONLY or O_RDWR */
3373 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3374 GOTO(out, rc = -EPERM);
/* SWAP_LAYOUTS_CLOSE: swap via the fd's lease handle and close it. */
3376 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3377 struct inode *inode2;
3378 struct ll_inode_info *lli;
3379 struct obd_client_handle *och = NULL;
3381 lli = ll_i2info(inode);
3382 mutex_lock(&lli->lli_och_mutex);
3383 if (fd->fd_lease_och != NULL) {
3384 och = fd->fd_lease_och;
3385 fd->fd_lease_och = NULL;
3387 mutex_unlock(&lli->lli_och_mutex);
3389 GOTO(out, rc = -ENOLCK);
3390 inode2 = file_inode(file2);
3391 rc = ll_swap_layouts_close(och, inode, inode2);
3393 rc = ll_swap_layouts(file, file2, &lsl);
3399 case LL_IOC_LOV_GETSTRIPE:
3400 case LL_IOC_LOV_GETSTRIPE_NEW:
3401 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3402 case FS_IOC_GETFLAGS:
3403 case FS_IOC_SETFLAGS:
3404 RETURN(ll_iocontrol(inode, file, cmd, arg));
3405 case FSFILT_IOC_GETVERSION:
3406 case FS_IOC_GETVERSION:
3407 RETURN(put_user(inode->i_generation, (int __user *)arg));
3408 /* We need to special case any other ioctls we want to handle,
3409 * to send them to the MDS/OST as appropriate and to properly
3410 * network encode the arg field. */
3411 case FS_IOC_SETVERSION:
3414 case LL_IOC_GROUP_LOCK:
3415 RETURN(ll_get_grouplock(inode, file, arg));
3416 case LL_IOC_GROUP_UNLOCK:
3417 RETURN(ll_put_grouplock(inode, file, arg));
3418 case IOC_OBD_STATFS:
3419 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3421 case LL_IOC_FLUSHCTX:
3422 RETURN(ll_flush_ctx(inode));
3423 case LL_IOC_PATH2FID: {
3424 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3425 sizeof(struct lu_fid)))
3430 case LL_IOC_GETPARENT:
3431 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3433 case OBD_IOC_FID2PATH:
3434 RETURN(ll_fid2path(inode, (void __user *)arg));
3435 case LL_IOC_DATA_VERSION: {
3436 struct ioc_data_version idv;
3439 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the flush flags are honoured from userspace. */
3442 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3443 rc = ll_ioc_data_version(inode, &idv);
3446 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3452 case LL_IOC_GET_MDTIDX: {
3455 mdtidx = ll_get_mdt_idx(inode);
3459 if (put_user((int)mdtidx, (int __user *)arg))
3464 case OBD_IOC_GETDTNAME:
3465 case OBD_IOC_GETMDNAME:
3466 RETURN(ll_get_obd_name(inode, cmd, arg));
3467 case LL_IOC_HSM_STATE_GET: {
3468 struct md_op_data *op_data;
3469 struct hsm_user_state *hus;
3476 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3477 LUSTRE_OPC_ANY, hus);
3478 if (IS_ERR(op_data)) {
3480 RETURN(PTR_ERR(op_data));
3483 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3486 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3489 ll_finish_md_op_data(op_data);
3493 case LL_IOC_HSM_STATE_SET: {
3494 struct hsm_state_set *hss;
3501 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3506 rc = ll_hsm_state_set(inode, hss);
3511 case LL_IOC_HSM_ACTION: {
3512 struct md_op_data *op_data;
3513 struct hsm_current_action *hca;
3520 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3521 LUSTRE_OPC_ANY, hca);
3522 if (IS_ERR(op_data)) {
3524 RETURN(PTR_ERR(op_data));
3527 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3530 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3533 ll_finish_md_op_data(op_data);
3537 case LL_IOC_SET_LEASE_OLD: {
/* Legacy variant: arg is the lease mode itself, no flags payload. */
3538 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3540 RETURN(ll_file_set_lease(file, &ioc, 0));
3542 case LL_IOC_SET_LEASE: {
3543 struct ll_ioc_lease ioc;
3545 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3548 RETURN(ll_file_set_lease(file, &ioc, arg));
3550 case LL_IOC_GET_LEASE: {
3551 struct ll_inode_info *lli = ll_i2info(inode);
3552 struct ldlm_lock *lock = NULL;
3555 mutex_lock(&lli->lli_och_mutex);
3556 if (fd->fd_lease_och != NULL) {
3557 struct obd_client_handle *och = fd->fd_lease_och;
/* Report the lease mode only while the DLM lock is not cancelled. */
3559 lock = ldlm_handle2lock(&och->och_lease_handle);
3561 lock_res_and_lock(lock);
3562 if (!ldlm_is_cancel(lock))
3563 fmode = och->och_flags;
3565 unlock_res_and_lock(lock);
3566 LDLM_LOCK_PUT(lock);
3569 mutex_unlock(&lli->lli_och_mutex);
3571 RETURN(ll_lease_type_from_fmode(fmode));
3573 case LL_IOC_HSM_IMPORT: {
3574 struct hsm_user_import *hui;
3580 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3585 rc = ll_hsm_import(inode, file, hui);
3590 case LL_IOC_FUTIMES_3: {
3591 struct ll_futimes_3 lfu;
3593 if (copy_from_user(&lfu,
3594 (const struct ll_futimes_3 __user *)arg,
3598 RETURN(ll_file_futimes_3(file, &lfu));
3600 case LL_IOC_LADVISE: {
3601 struct llapi_ladvise_hdr *k_ladvise_hdr;
3602 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3605 int alloc_size = sizeof(*k_ladvise_hdr);
/* Two-pass copy: read the fixed header first to learn lah_count,
 * then reallocate and copy header + advice array together. */
3608 u_ladvise_hdr = (void __user *)arg;
3609 OBD_ALLOC_PTR(k_ladvise_hdr);
3610 if (k_ladvise_hdr == NULL)
3613 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3614 GOTO(out_ladvise, rc = -EFAULT);
3616 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3617 k_ladvise_hdr->lah_count < 1)
3618 GOTO(out_ladvise, rc = -EINVAL);
3620 num_advise = k_ladvise_hdr->lah_count;
3621 if (num_advise >= LAH_COUNT_MAX)
3622 GOTO(out_ladvise, rc = -EFBIG);
3624 OBD_FREE_PTR(k_ladvise_hdr);
3625 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3626 lah_advise[num_advise]);
3627 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3628 if (k_ladvise_hdr == NULL)
3632 * TODO: submit multiple advices to one server in a single RPC
3634 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3635 GOTO(out_ladvise, rc = -EFAULT);
3637 for (i = 0; i < num_advise; i++) {
3638 struct llapi_lu_ladvise *k_ladvise =
3639 &k_ladvise_hdr->lah_advise[i];
3640 struct llapi_lu_ladvise __user *u_ladvise =
3641 &u_ladvise_hdr->lah_advise[i];
3643 rc = ll_ladvise_sanity(inode, k_ladvise);
3645 GOTO(out_ladvise, rc);
3647 switch (k_ladvise->lla_advice) {
3648 case LU_LADVISE_LOCKNOEXPAND:
3649 rc = ll_lock_noexpand(file,
3650 k_ladvise->lla_peradvice_flags);
3651 GOTO(out_ladvise, rc);
3652 case LU_LADVISE_LOCKAHEAD:
3654 rc = ll_file_lock_ahead(file, k_ladvise);
3657 GOTO(out_ladvise, rc);
/* Write the per-advice lockahead result back to userspace. */
3660 &u_ladvise->lla_lockahead_result))
3661 GOTO(out_ladvise, rc = -EFAULT);
3664 rc = ll_ladvise(inode, file,
3665 k_ladvise_hdr->lah_flags,
3668 GOTO(out_ladvise, rc);
3675 OBD_FREE(k_ladvise_hdr, alloc_size);
3678 case LL_IOC_FLR_SET_MIRROR: {
3679 /* mirror I/O must be direct to avoid polluting page cache
3681 if (!(file->f_flags & O_DIRECT))
3684 fd->fd_designated_mirror = (__u32)arg;
3687 case LL_IOC_FSGETXATTR:
3688 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3689 case LL_IOC_FSSETXATTR:
3690 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3692 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
/* default: forward unrecognized commands to the data export. */
3694 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3695 (void __user *)arg));
/* Compat helpers for kernels without generic_file_llseek_size(). */
3699 #ifndef HAVE_FILE_LLSEEK_SIZE
3700 static inline loff_t
/* Validate @offset against sign/maxsize rules and commit it to f_pos,
 * resetting f_version (error returns elided in this listing). */
3701 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3703 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3705 if (offset > maxsize)
3708 if (offset != file->f_pos) {
3709 file->f_pos = offset;
3710 file->f_version = 0;
/* Local copy of generic_file_llseek_size() for older kernels: handles
 * SEEK_SET/CUR/END plus SEEK_DATA/SEEK_HOLE against @eof, bounded by
 * @maxsize. NOTE(review): the switch statement and the SEEK_CUR
 * f_lock handling are largely elided in this listing. */
3716 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3717 loff_t maxsize, loff_t eof)
3719 struct inode *inode = file_inode(file);
3727 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3728 * position-querying operation. Avoid rewriting the "same"
3729 * f_pos value back to the file because a concurrent read(),
3730 * write() or lseek() might have altered it
3735 * f_lock protects against read/modify/write race with other
3736 * SEEK_CURs. Note that parallel writes and reads behave
3740 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3741 inode_unlock(inode);
3745 * In the generic case the entire file is data, so as long as
3746 * offset isn't at the end of the file then the offset is data.
3753 * There is a virtual hole at the end of the file, so as long as
3754 * offset isn't i_size or larger, return i_size.
3762 return llseek_execute(file, offset, maxsize);
/* llseek method: for SEEK_END/HOLE/DATA first glimpse the file size
 * from the OSTs so eof is current, then delegate to the (possibly
 * local) generic_file_llseek_size() bounded by the fs max byte count. */
3766 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3768 struct inode *inode = file_inode(file);
3769 loff_t retval, eof = 0;
/* Precompute the absolute target purely for the trace message below. */
3772 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3773 (origin == SEEK_CUR) ? file->f_pos : 0);
3774 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3775 PFID(ll_inode2fid(inode)), inode, retval, retval,
3777 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
3779 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3780 retval = ll_glimpse_size(inode);
3783 eof = i_size_read(inode);
3786 retval = ll_generic_file_llseek_size(file, offset, origin,
3787 ll_file_maxbytes(inode), eof);
/* flush method (called on close(2)): surface any async writeback error
 * recorded for this inode/object as -EIO, unless this fd was already
 * told about a write failure. Does not push dirty pages itself. */
3791 static int ll_flush(struct file *file, fl_owner_t id)
3793 struct inode *inode = file_inode(file);
3794 struct ll_inode_info *lli = ll_i2info(inode);
3795 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3798 LASSERT(!S_ISDIR(inode->i_mode));
3800 /* catch async errors that were recorded back when async writeback
3801 * failed for pages in this mapping. */
3802 rc = lli->lli_async_rc;
3803 lli->lli_async_rc = 0;
3804 if (lli->lli_clob != NULL) {
3805 err = lov_read_and_clear_async_rc(lli->lli_clob);
3810 /* The application has been told write failure already.
3811 * Do not report failure again. */
3812 if (fd->fd_write_failed)
3814 return rc ? -EIO : 0;
3818 * Called to make sure a portion of file has been written out.
3819 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3821 * Return how many pages have been written.
3823 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3824 enum cl_fsync_mode mode, int ignore_layout)
3828 struct cl_fsync_io *fio;
3833 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3834 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3837 env = cl_env_get(&refcheck);
3839 RETURN(PTR_ERR(env));
3841 io = vvp_env_thread_io(env);
3842 io->ci_obj = ll_i2info(inode)->lli_clob;
3843 io->ci_ignore_layout = ignore_layout;
3845 /* initialize parameters for sync */
3846 fio = &io->u.ci_fsync;
3847 fio->fi_start = start;
3849 fio->fi_fid = ll_inode2fid(inode);
3850 fio->fi_mode = mode;
3851 fio->fi_nr_written = 0;
3853 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3854 result = cl_io_loop(env, io);
3856 result = io->ci_result;
/* On success report the page count accumulated by the fsync io. */
3858 result = fio->fi_nr_written;
3859 cl_io_fini(env, io);
3860 cl_env_put(env, &refcheck);
3866 * When dentry is provided (the 'else' case), file_dentry() may be
3867 * null and dentry must be used directly rather than pulled from
3868 * file_dentry() as is done otherwise.
/* fsync method — three kernel-ABI variants selected by configure:
 * 4-arg (range), 2-arg, and the old 3-arg dentry form. Flushes and
 * waits the page cache, surfaces recorded async errors, fsyncs the
 * MDT, then (regular files) syncs the byte range to the OSTs and
 * updates fd->fd_write_failed accordingly.
 * NOTE(review): error-combining lines and the final RETURN are elided
 * in this listing. */
3871 #ifdef HAVE_FILE_FSYNC_4ARGS
3872 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3874 struct dentry *dentry = file_dentry(file);
3876 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3877 int ll_fsync(struct file *file, int datasync)
3879 struct dentry *dentry = file_dentry(file);
3881 loff_t end = LLONG_MAX;
3883 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3886 loff_t end = LLONG_MAX;
3888 struct inode *inode = dentry->d_inode;
3889 struct ll_inode_info *lli = ll_i2info(inode);
3890 struct ptlrpc_request *req;
3894 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3895 PFID(ll_inode2fid(inode)), inode);
3896 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3898 #ifdef HAVE_FILE_FSYNC_4ARGS
3899 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
/* Avoid re-taking the inode lock if a caller already holds it. */
3900 lock_inode = !lli->lli_inode_locked;
3904 /* fsync's caller has already called _fdata{sync,write}, we want
3905 * that IO to finish before calling the osc and mdc sync methods */
3906 rc = filemap_fdatawait(inode->i_mapping);
3909 /* catch async errors that were recorded back when async writeback
3910 * failed for pages in this mapping. */
3911 if (!S_ISDIR(inode->i_mode)) {
3912 err = lli->lli_async_rc;
3913 lli->lli_async_rc = 0;
3916 if (lli->lli_clob != NULL) {
3917 err = lov_read_and_clear_async_rc(lli->lli_clob);
3923 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3927 ptlrpc_req_finished(req);
3929 if (S_ISREG(inode->i_mode)) {
3930 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3932 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3933 if (rc == 0 && err < 0)
/* Remember per-fd whether this sync failed, for ll_flush(). */
3936 fd->fd_write_failed = true;
3938 fd->fd_write_failed = false;
3941 #ifdef HAVE_FILE_FSYNC_4ARGS
3943 inode_unlock(inode);
/*
 * Handle flock()/fcntl() advisory locking on a Lustre file by enqueueing
 * an LDLM_FLOCK lock with the MDS, then mirroring the result into the
 * local VFS lock state (locks_lock_file_wait() and friends).
 * NOTE(review): some original lines are elided in this excerpt.
 *
 * \param file       file the lock applies to
 * \param cmd        fcntl command (F_SETLK, F_SETLKW, F_GETLK, ...)
 * \param file_lock  VFS lock descriptor: owner, range and lock type
 */
3949 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3951 struct inode *inode = file_inode(file);
3952 struct ll_sb_info *sbi = ll_i2sbi(inode);
3953 struct ldlm_enqueue_info einfo = {
3954 .ei_type = LDLM_FLOCK,
3955 .ei_cb_cp = ldlm_flock_completion_ast,
3956 .ei_cbdata = file_lock,
3958 struct md_op_data *op_data;
3959 struct lustre_handle lockh = { 0 };
3960 union ldlm_policy_data flock = { { 0 } };
/* remember the caller's lock type; it is overwritten below and restored
 * after the enqueue unless this is a TEST lock */
3961 int fl_type = file_lock->fl_type;
3967 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3968 PFID(ll_inode2fid(inode)), file_lock);
3970 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3972 if (file_lock->fl_flags & FL_FLOCK) {
3973 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3974 /* flocks are whole-file locks */
3975 flock.l_flock.end = OFFSET_MAX;
3976 /* For flocks owner is determined by the local file descriptor */
3977 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3978 } else if (file_lock->fl_flags & FL_POSIX) {
3979 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3980 flock.l_flock.start = file_lock->fl_start;
3981 flock.l_flock.end = file_lock->fl_end;
3985 flock.l_flock.pid = file_lock->fl_pid;
3987 /* Somewhat ugly workaround for svc lockd.
3988 * lockd installs custom fl_lmops->lm_compare_owner that checks
3989 * for the fl_owner to be the same (which it always is on local node
3990 * I guess between lockd processes) and then compares pid.
3991 * As such we assign pid to the owner field to make it all work,
3992 * conflict with normal locks is unlikely since pid space and
3993 * pointer space for current->files are not intersecting */
3994 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3995 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
3999 einfo.ei_mode = LCK_PR;
4002 /* An unlock request may or may not have any relation to
4003 * existing locks so we may not be able to pass a lock handle
4004 * via a normal ldlm_lock_cancel() request. The request may even
4005 * unlock a byte range in the middle of an existing lock. In
4006 * order to process an unlock request we need all of the same
4007 * information that is given with a normal read or write record
4008 * lock request. To avoid creating another ldlm unlock (cancel)
4009 * message we'll treat a LCK_NL flock request as an unlock. */
4010 einfo.ei_mode = LCK_NL;
4013 einfo.ei_mode = LCK_PW;
4016 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
4031 flags = LDLM_FL_BLOCK_NOWAIT;
4037 flags = LDLM_FL_TEST_LOCK;
4040 CERROR("unknown fcntl lock command: %d\n", cmd);
4044 /* Save the old mode so that if the mode in the lock changes we
4045 * can decrement the appropriate reader or writer refcount. */
4046 file_lock->fl_type = einfo.ei_mode;
4048 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4049 LUSTRE_OPC_ANY, NULL);
4050 if (IS_ERR(op_data))
4051 RETURN(PTR_ERR(op_data));
4053 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4054 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4055 flock.l_flock.pid, flags, einfo.ei_mode,
4056 flock.l_flock.start, flock.l_flock.end);
4058 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4061 /* Restore the file lock type if not TEST lock. */
4062 if (!(flags & LDLM_FL_TEST_LOCK))
4063 file_lock->fl_type = fl_type;
/* Propagate the server-granted lock into the local VFS lock tables; the
 * API used depends on the kernel version detected at configure time. */
4065 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4066 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4067 !(flags & LDLM_FL_TEST_LOCK))
4068 rc2 = locks_lock_file_wait(file, file_lock);
4070 if ((file_lock->fl_flags & FL_FLOCK) &&
4071 (rc == 0 || file_lock->fl_type == F_UNLCK))
4072 rc2 = flock_lock_file_wait(file, file_lock);
4073 if ((file_lock->fl_flags & FL_POSIX) &&
4074 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4075 !(flags & LDLM_FL_TEST_LOCK))
4076 rc2 = posix_lock_file_wait(file, file_lock);
4077 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* Local VFS bookkeeping failed after the server granted the lock:
 * undo the server-side lock with an LCK_NL (unlock) enqueue. */
4079 if (rc2 && file_lock->fl_type != F_UNLCK) {
4080 einfo.ei_mode = LCK_NL;
4081 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4086 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of entry \a name under directory \a parent with an
 * MDS getattr-by-name RPC; optionally instantiate the inode from the
 * reply when \a inode is non-NULL.
 * NOTE(review): some original lines are elided in this excerpt.
 *
 * \param parent   parent directory inode
 * \param name     entry name (not necessarily NUL-terminated; see namelen)
 * \param namelen  length of \a name
 * \param fid      [out] FID of the entry, if non-NULL
 * \param inode    [out] instantiated inode for the entry, if non-NULL
 */
4091 int ll_get_fid_by_name(struct inode *parent, const char *name,
4092 int namelen, struct lu_fid *fid,
4093 struct inode **inode)
4095 struct md_op_data *op_data = NULL;
4096 struct mdt_body *body;
4097 struct ptlrpc_request *req;
4101 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4102 LUSTRE_OPC_ANY, NULL);
4103 if (IS_ERR(op_data))
4104 RETURN(PTR_ERR(op_data));
/* only FID and type are needed from the server */
4106 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4107 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4108 ll_finish_md_op_data(op_data);
4112 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4114 GOTO(out_req, rc = -EFAULT);
4116 *fid = body->mbo_fid1;
4119 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4121 ptlrpc_req_finished(req);
/*
 * Migrate the entry \a name under \a parent to the MDT described by
 * \a lum.  Implemented as an MDS rename of the entry onto itself with
 * CLI_MIGRATE set; regular files additionally take a write lease and
 * send their data version so the server can detect concurrent writes.
 * NOTE(review): some original lines are elided in this excerpt.
 */
4125 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4128 struct dentry *dchild = NULL;
4129 struct inode *child_inode = NULL;
4130 struct md_op_data *op_data;
4131 struct ptlrpc_request *request = NULL;
4132 struct obd_client_handle *och = NULL;
4134 struct mdt_body *body;
4135 __u64 data_version = 0;
4136 size_t namelen = strlen(name);
4137 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4141 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4142 PFID(ll_inode2fid(parent)), name,
4143 lum->lum_stripe_offset, lum->lum_stripe_count);
/* lum may arrive in host order from userspace; swab to wire (LE) order
 * unless the magic already matches */
4145 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4146 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4147 lustre_swab_lmv_user_md(lum);
4149 /* Get child FID first */
4150 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
4153 dchild = d_lookup(file_dentry(file), &qstr);
4155 if (dchild->d_inode)
4156 child_inode = igrab(dchild->d_inode);
4161 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
/* older MDTs cannot migrate striped directories */
4170 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4171 OBD_CONNECT2_DIR_MIGRATE)) {
4172 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4173 ll_i2info(child_inode)->lli_lsm_md) {
4174 CERROR("%s: MDT doesn't support stripe directory "
4176 ll_get_fsname(parent->i_sb, NULL, 0));
4177 GOTO(out_iput, rc = -EOPNOTSUPP);
4182 * lfs migrate command needs to be blocked on the client
4183 * by checking the migrate FID against the FID of the
/* refuse to migrate the filesystem root itself */
4186 if (child_inode == parent->i_sb->s_root->d_inode)
4187 GOTO(out_iput, rc = -EINVAL);
4189 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4190 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4191 if (IS_ERR(op_data))
4192 GOTO(out_iput, rc = PTR_ERR(op_data));
4194 inode_lock(child_inode);
4195 op_data->op_fid3 = *ll_inode2fid(child_inode);
4196 if (!fid_is_sane(&op_data->op_fid3)) {
4197 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4198 ll_get_fsname(parent->i_sb, NULL, 0), name,
4199 PFID(&op_data->op_fid3));
4200 GOTO(out_unlock, rc = -EINVAL);
4203 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4204 op_data->op_data = lum;
4205 op_data->op_data_size = lumlen;
/* regular file: take a write lease and record the data version so the
 * server can verify no writes raced with the migration */
4208 if (S_ISREG(child_inode->i_mode)) {
4209 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4213 GOTO(out_unlock, rc);
4216 rc = ll_data_version(child_inode, &data_version,
4219 GOTO(out_close, rc);
4221 op_data->op_open_handle = och->och_open_handle;
4222 op_data->op_data_version = data_version;
4223 op_data->op_lease_handle = och->och_lease_handle;
4224 op_data->op_bias |= MDS_CLOSE_MIGRATE;
/* disable replay of the open request; the migration close below
 * supersedes it */
4226 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4227 och->och_mod->mod_open_req->rq_replay = 0;
4228 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
/* rename onto the same name: with CLI_MIGRATE this moves the entry to
 * the target MDT instead of renaming it */
4231 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4232 name, namelen, &request);
4234 LASSERT(request != NULL);
4235 ll_update_times(request, parent);
4237 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4238 LASSERT(body != NULL);
4240 /* If the server does release layout lock, then we cleanup
4241 * the client och here, otherwise release it in out_close: */
4242 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4243 obd_mod_put(och->och_mod);
4244 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4246 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4252 if (request != NULL) {
4253 ptlrpc_req_finished(request);
4257 /* Try again if the file layout has changed. */
4258 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4263 ll_lease_close(och, child_inode, NULL);
4265 clear_nlink(child_inode);
4267 inode_unlock(child_inode);
4268 ll_finish_md_op_data(op_data);
/*
 * Lock handler installed by the -o noflock mount option (see the
 * ll_file_operations_noflock table below).  Body not visible in this
 * excerpt; presumably rejects every lock request — TODO confirm.
 */
4275 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4283 * test if some locks matching bits and l_req_mode are acquired
4284 * - bits can be in different locks
4285 * - if found clear the common lock bits in *bits
4286 * - the bits not found, are kept in *bits
4288 * \param bits [IN] searched lock bits [IN]
4289 * \param l_req_mode [IN] searched lock mode
4290 * \retval boolean, true iff all bits are found
4292 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4294 struct lustre_handle lockh;
4295 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match against all four normal modes */
4296 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4297 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4306 fid = &ll_i2info(inode)->lli_fid;
4307 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4308 ldlm_lockname[mode]);
/* TEST_LOCK: match only, do not take a reference on the found lock */
4310 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* probe each requested inodebit separately; stop early once all
 * requested bits have been accounted for */
4311 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4312 policy.l_inodebits.bits = *bits & (1 << i);
4313 if (policy.l_inodebits.bits == 0)
4316 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4317 &policy, mode, &lockh)) {
4318 struct ldlm_lock *lock;
4320 lock = ldlm_handle2lock(&lockh);
4323 ~(lock->l_policy_data.l_inodebits.bits);
4324 LDLM_LOCK_PUT(lock);
4326 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and take a reference on) a cached MDS inodebits lock
 * covering \a bits in one of the modes in \a mode.
 *
 * \param inode  inode whose resource is matched
 * \param bits   inodebits the lock must cover
 * \param lockh  [out] handle of the matched lock
 * \param flags  extra LDLM match flags, OR-ed with LDLM_FL_BLOCK_GRANTED
 * \param mode   acceptable lock mode(s)
 * \retval       matched mode, or 0 if no lock was found (per md_lock_match)
 */
4333 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4334 struct lustre_handle *lockh, __u64 flags,
4335 enum ldlm_mode mode)
4337 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4342 fid = &ll_i2info(inode)->lli_fid;
4343 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4345 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4346 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC: translate
 * -ENOENT for an already-unlinked inode and log other failures.
 * NOTE(review): some original lines are elided in this excerpt.
 */
4351 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4353 /* Already unlinked. Just update nlink and return success */
4354 if (rc == -ENOENT) {
4356 /* If it is striped directory, and there is bad stripe
4357 * Let's revalidate the dentry again, instead of returning
4359 if (S_ISDIR(inode->i_mode) &&
4360 ll_i2info(inode)->lli_lsm_md != NULL)
4363 /* This path cannot be hit for regular files unless in
4364 * case of obscure races, so no need to to validate
4366 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4368 } else if (rc != 0) {
/* EACCES/EIDRM are expected (permission/identity) errors: log quietly */
4369 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4370 "%s: revalidate FID "DFID" error: rc = %d\n",
4371 ll_get_fsname(inode->i_sb, NULL, 0),
4372 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate a dentry's inode with the MDS via an intent lock request
 * (getattr-by-FID), refreshing attributes and dcache state.
 * NOTE(review): some original lines are elided in this excerpt.
 *
 * \param dentry  dentry to revalidate
 * \param op      intent opcode (e.g. IT_GETATTR, IT_LOOKUP)
 */
4378 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4380 struct inode *inode = dentry->d_inode;
4381 struct obd_export *exp = ll_i2mdexp(inode);
4382 struct lookup_intent oit = {
4385 struct ptlrpc_request *req = NULL;
4386 struct md_op_data *op_data;
4390 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4391 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4393 /* Call getattr by fid, so do not provide name at all. */
4394 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4395 LUSTRE_OPC_ANY, NULL);
4396 if (IS_ERR(op_data))
4397 RETURN(PTR_ERR(op_data));
4399 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4400 ll_finish_md_op_data(op_data);
4402 rc = ll_inode_revalidate_fini(inode, rc);
4406 rc = ll_revalidate_it_finish(req, &oit, dentry);
4408 ll_intent_release(&oit);
4412 /* Unlinked? Unhash dentry, so it is not picked up later by
4413 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4414 * here to preserve get_cwd functionality on 2.6.
4416 if (!dentry->d_inode->i_nlink) {
4417 ll_lock_dcache(inode);
4418 d_lustre_invalidate(dentry, 0);
4419 ll_unlock_dcache(inode);
4422 ll_lookup_finish_locks(&oit, dentry);
4424 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the attributes of all stripes (via
 * md_merge_attr) into the master inode: nlink, blocks, size and the
 * cached a/m/ctime.  Caller must ensure lli_lsm_md is set (asserted).
 * NOTE(review): some original lines are elided in this excerpt.
 */
4429 static int ll_merge_md_attr(struct inode *inode)
4431 struct ll_inode_info *lli = ll_i2info(inode);
4432 struct cl_attr attr = { 0 };
4435 LASSERT(lli->lli_lsm_md != NULL);
/* hold lli_lsm_sem so the stripe layout cannot change under us */
4436 down_read(&lli->lli_lsm_sem);
4437 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4438 &attr, ll_md_blocking_ast);
4439 up_read(&lli->lli_lsm_sem);
4443 set_nlink(inode, attr.cat_nlink);
4444 inode->i_blocks = attr.cat_blocks;
4445 i_size_write(inode, attr.cat_size);
4447 ll_i2info(inode)->lli_atime = attr.cat_atime;
4448 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4449 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Squash a device number so both major and minor fit in 8 bits, for
 * 32-bit/compat stat interfaces.
 */
4454 static inline dev_t ll_compat_encode_dev(dev_t dev)
4456 /* The compat_sys_*stat*() syscalls will fail unless the
4457 * device majors and minors are both less than 256. Note that
4458 * the value returned here will be passed through
4459 * old_encode_dev() in cp_compat_stat(). And so we are not
4460 * trying to return a valid compat (u16) device number, just
4461 * one that will pass the old_valid_dev() check. */
4463 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * ->getattr() for Lustre files.  Revalidates the inode with the MDS,
 * glimpses the size of regular files from the OSTs, merges stripe
 * attributes for striped directories, then fills *stat.  Two prototypes
 * are provided depending on the kernel's inode_operations signature.
 * NOTE(review): some original lines are elided in this excerpt.
 */
4466 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4467 int ll_getattr(const struct path *path, struct kstat *stat,
4468 u32 request_mask, unsigned int flags)
4470 struct dentry *de = path->dentry;
4472 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4475 struct inode *inode = de->d_inode;
4476 struct ll_sb_info *sbi = ll_i2sbi(inode);
4477 struct ll_inode_info *lli = ll_i2info(inode);
4480 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4482 rc = ll_inode_revalidate(de, IT_GETATTR);
4486 if (S_ISREG(inode->i_mode)) {
4487 /* In case of restore, the MDT has the right size and has
4488 * already send it back without granting the layout lock,
4489 * inode is up-to-date so glimpse is useless.
4490 * Also to glimpse we need the layout, in case of a running
4491 * restore the MDT holds the layout lock so the glimpse will
4492 * block up to the end of restore (getattr will block)
4494 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4495 rc = ll_glimpse_size(inode);
4500 /* If object isn't regular a file then don't validate size. */
4501 if (S_ISDIR(inode->i_mode) &&
4502 lli->lli_lsm_md != NULL) {
4503 rc = ll_merge_md_attr(inode);
/* publish the cached Lustre timestamps into the VFS inode */
4508 LTIME_S(inode->i_atime) = lli->lli_atime;
4509 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4510 LTIME_S(inode->i_ctime) = lli->lli_ctime;
4513 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit userspace: build a 32-bit-safe ino and squashed dev numbers */
4515 if (ll_need_32bit_api(sbi)) {
4516 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4517 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4518 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4520 stat->ino = inode->i_ino;
4521 stat->dev = inode->i_sb->s_dev;
4522 stat->rdev = inode->i_rdev;
4525 stat->mode = inode->i_mode;
4526 stat->uid = inode->i_uid;
4527 stat->gid = inode->i_gid;
4528 stat->atime = inode->i_atime;
4529 stat->mtime = inode->i_mtime;
4530 stat->ctime = inode->i_ctime;
/* prefer the per-fs tunable blocksize when set */
4531 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4533 stat->nlink = inode->i_nlink;
4534 stat->size = i_size_read(inode);
4535 stat->blocks = inode->i_blocks;
/*
 * ->fiemap() handler: marshal the kernel's fiemap_extent_info into a
 * struct fiemap buffer, run ll_do_fiemap(), and copy mapped extents
 * back to the user-supplied extent array.
 * NOTE(review): some original lines are elided in this excerpt.
 */
4540 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4541 __u64 start, __u64 len)
4545 struct fiemap *fiemap;
4546 unsigned int extent_count = fieinfo->fi_extents_max;
4548 num_bytes = sizeof(*fiemap) + (extent_count *
4549 sizeof(struct fiemap_extent));
4550 OBD_ALLOC_LARGE(fiemap, num_bytes);
4555 fiemap->fm_flags = fieinfo->fi_flags;
4556 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4557 fiemap->fm_start = start;
4558 fiemap->fm_length = len;
/* seed the first extent from userspace (continuation support) */
4559 if (extent_count > 0 &&
4560 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4561 sizeof(struct fiemap_extent)) != 0)
4562 GOTO(out, rc = -EFAULT);
4564 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4566 fieinfo->fi_flags = fiemap->fm_flags;
4567 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4568 if (extent_count > 0 &&
4569 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4570 fiemap->fm_mapped_extents *
4571 sizeof(struct fiemap_extent)) != 0)
4572 GOTO(out, rc = -EFAULT);
4574 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl(): return a referenced copy of the cached POSIX ACL held in
 * ll_inode_info (taken under lli_lock).
 */
4578 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4580 struct ll_inode_info *lli = ll_i2info(inode);
4581 struct posix_acl *acl = NULL;
4584 spin_lock(&lli->lli_lock);
4585 /* VFS' acl_permission_check->check_acl will release the refcount */
4586 acl = posix_acl_dup(lli->lli_posix_acl);
4587 spin_unlock(&lli->lli_lock);
/*
 * ->set_acl(): serialize the ACL to its xattr representation and store
 * it on the MDS with md_setxattr(); a NULL acl removes the xattr.  The
 * local ACL cache is updated on success.
 * NOTE(review): some original lines are elided in this excerpt.
 */
4592 #ifdef HAVE_IOP_SET_ACL
4593 #ifdef CONFIG_FS_POSIX_ACL
4594 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4596 struct ll_sb_info *sbi = ll_i2sbi(inode);
4597 struct ptlrpc_request *req = NULL;
4598 const char *name = NULL;
4600 size_t value_size = 0;
4605 case ACL_TYPE_ACCESS:
4606 name = XATTR_NAME_POSIX_ACL_ACCESS;
/* an access ACL may imply an i_mode change */
4608 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4611 case ACL_TYPE_DEFAULT:
4612 name = XATTR_NAME_POSIX_ACL_DEFAULT;
/* default ACLs only make sense on directories */
4613 if (!S_ISDIR(inode->i_mode))
4614 rc = acl ? -EACCES : 0;
4625 value_size = posix_acl_xattr_size(acl->a_count);
4626 value = kmalloc(value_size, GFP_NOFS);
4628 GOTO(out, rc = -ENOMEM);
4630 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4632 GOTO(out_value, rc);
/* NULL value means remove the xattr (OBD_MD_FLXATTRRM) */
4635 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4636 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4637 name, value, value_size, 0, 0, &req);
4639 ptlrpc_req_finished(req);
4644 forget_cached_acl(inode, type);
4646 set_cached_acl(inode, type, acl);
4649 #endif /* CONFIG_FS_POSIX_ACL */
4650 #endif /* HAVE_IOP_SET_ACL */
/*
 * ACL-check callback for kernels whose generic_permission() takes a
 * check_acl function pointer: fetch the access ACL and evaluate it with
 * posix_acl_permission().  Compiled out when CONFIG_FS_POSIX_ACL is off.
 * NOTE(review): some original lines are elided in this excerpt.
 */
4652 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4654 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4655 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4657 ll_check_acl(struct inode *inode, int mask)
4660 # ifdef CONFIG_FS_POSIX_ACL
4661 struct posix_acl *acl;
4665 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
/* cannot block in RCU-walk mode */
4666 if (flags & IPERM_FLAG_RCU)
4669 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4674 rc = posix_acl_permission(inode, acl, mask);
4675 posix_acl_release(acl);
4678 # else /* !CONFIG_FS_POSIX_ACL */
4680 # endif /* CONFIG_FS_POSIX_ACL */
4682 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ->permission(): revalidate the root inode when needed, apply root
 * squashing by temporarily overriding the task's creds, then defer to
 * generic permission checking.  Three prototypes cover the kernel API
 * variants selected at configure time.
 * NOTE(review): some original lines are elided in this excerpt.
 */
4684 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4685 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4687 # ifdef HAVE_INODE_PERMISION_2ARGS
4688 int ll_inode_permission(struct inode *inode, int mask)
4690 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4695 struct ll_sb_info *sbi;
4696 struct root_squash_info *squash;
4697 struct cred *cred = NULL;
4698 const struct cred *old_cred = NULL;
4700 bool squash_id = false;
/* the cred juggling below may block, so bail out of RCU-walk */
4703 #ifdef MAY_NOT_BLOCK
4704 if (mask & MAY_NOT_BLOCK)
4706 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4707 if (flags & IPERM_FLAG_RCU)
4711 /* as root inode are NOT getting validated in lookup operation,
4712 * need to do it before permission check. */
4714 if (inode == inode->i_sb->s_root->d_inode) {
4715 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4720 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4721 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4723 /* squash fsuid/fsgid if needed */
4724 sbi = ll_i2sbi(inode);
4725 squash = &sbi->ll_squash;
4726 if (unlikely(squash->rsi_uid != 0 &&
4727 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4728 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4732 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4733 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4734 squash->rsi_uid, squash->rsi_gid);
4736 /* update current process's credentials
4737 * and FS capability */
4738 cred = prepare_creds();
4742 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4743 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* drop all filesystem-related capabilities from the squashed creds */
4744 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4745 if ((1 << cap) & CFS_CAP_FS_MASK)
4746 cap_lower(cred->cap_effective, cap);
4748 old_cred = override_creds(cred);
4751 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4752 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4753 /* restore current process's credentials and FS capability */
4755 revert_creds(old_cred);
4762 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations table: no .flock/.lock entries, so the VFS
 * falls back to purely local (single-node) lock semantics. */
4763 struct file_operations ll_file_operations = {
4764 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4765 # ifdef HAVE_SYNC_READ_WRITE
4766 .read = new_sync_read,
4767 .write = new_sync_write,
4769 .read_iter = ll_file_read_iter,
4770 .write_iter = ll_file_write_iter,
4771 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4772 .read = ll_file_read,
4773 .aio_read = ll_file_aio_read,
4774 .write = ll_file_write,
4775 .aio_write = ll_file_aio_write,
4776 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4777 .unlocked_ioctl = ll_file_ioctl,
4778 .open = ll_file_open,
4779 .release = ll_file_release,
4780 .mmap = ll_file_mmap,
4781 .llseek = ll_file_seek,
4782 .splice_read = ll_file_splice_read,
/* -o flock variant: same as ll_file_operations but routes .flock and
 * .lock through ll_file_flock() for cluster-coherent locking. */
4787 struct file_operations ll_file_operations_flock = {
4788 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4789 # ifdef HAVE_SYNC_READ_WRITE
4790 .read = new_sync_read,
4791 .write = new_sync_write,
4792 # endif /* HAVE_SYNC_READ_WRITE */
4793 .read_iter = ll_file_read_iter,
4794 .write_iter = ll_file_write_iter,
4795 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4796 .read = ll_file_read,
4797 .aio_read = ll_file_aio_read,
4798 .write = ll_file_write,
4799 .aio_write = ll_file_aio_write,
4800 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4801 .unlocked_ioctl = ll_file_ioctl,
4802 .open = ll_file_open,
4803 .release = ll_file_release,
4804 .mmap = ll_file_mmap,
4805 .llseek = ll_file_seek,
4806 .splice_read = ll_file_splice_read,
4809 .flock = ll_file_flock,
4810 .lock = ll_file_flock
4813 /* These are for -o noflock - to return ENOSYS on flock calls */
4814 struct file_operations ll_file_operations_noflock = {
4815 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4816 # ifdef HAVE_SYNC_READ_WRITE
4817 .read = new_sync_read,
4818 .write = new_sync_write,
4819 # endif /* HAVE_SYNC_READ_WRITE */
4820 .read_iter = ll_file_read_iter,
4821 .write_iter = ll_file_write_iter,
4822 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4823 .read = ll_file_read,
4824 .aio_read = ll_file_aio_read,
4825 .write = ll_file_write,
4826 .aio_write = ll_file_aio_write,
4827 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4828 .unlocked_ioctl = ll_file_ioctl,
4829 .open = ll_file_open,
4830 .release = ll_file_release,
4831 .mmap = ll_file_mmap,
4832 .llseek = ll_file_seek,
4833 .splice_read = ll_file_splice_read,
4836 .flock = ll_file_noflock,
4837 .lock = ll_file_noflock
/* inode_operations for regular Lustre files; xattr and ACL entries are
 * conditional on the kernel API detected at configure time. */
4840 struct inode_operations ll_file_inode_operations = {
4841 .setattr = ll_setattr,
4842 .getattr = ll_getattr,
4843 .permission = ll_inode_permission,
4844 #ifdef HAVE_IOP_XATTR
4845 .setxattr = ll_setxattr,
4846 .getxattr = ll_getxattr,
4847 .removexattr = ll_removexattr,
4849 .listxattr = ll_listxattr,
4850 .fiemap = ll_fiemap,
4851 #ifdef HAVE_IOP_GET_ACL
4852 .get_acl = ll_get_acl,
4854 #ifdef HAVE_IOP_SET_ACL
4855 .set_acl = ll_set_acl,
/*
 * Push a layout configuration into the cl_object stack via cl_conf_set().
 * For OBJECT_CONF_SET the associated layout DLM lock is allowed to match
 * only after the layout is applied, and the cached layout generation is
 * refreshed.  NOTE(review): some original lines are elided here.
 */
4859 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4861 struct ll_inode_info *lli = ll_i2info(inode);
4862 struct cl_object *obj = lli->lli_clob;
4871 env = cl_env_get(&refcheck);
4873 RETURN(PTR_ERR(env));
4875 rc = cl_conf_set(env, lli->lli_clob, conf);
4879 if (conf->coc_opc == OBJECT_CONF_SET) {
4880 struct ldlm_lock *lock = conf->coc_lock;
4881 struct cl_layout cl = {
4885 LASSERT(lock != NULL);
4886 LASSERT(ldlm_has_layout(lock));
4888 /* it can only be allowed to match after layout is
4889 * applied to inode otherwise false layout would be
4890 * seen. Applying layout shoud happen before dropping
4891 * the intent lock. */
4892 ldlm_lock_allow_match(lock);
4894 rc = cl_object_layout_get(env, obj, &cl);
4899 DFID": layout version change: %u -> %u\n",
4900 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4902 ll_layout_version_set(lli, cl.cl_layout_gen);
4906 cl_env_put(env, &refcheck);
4911 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
/*
 * Fetch the file layout (LOV EA) from the MDT and attach it to the
 * layout lock's LVB when the lock was granted without one (i.e. it was
 * blocked and completed via AST).  No-op if l_lvb_data is already set.
 * NOTE(review): some original lines are elided in this excerpt.
 */
4912 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4915 struct ll_sb_info *sbi = ll_i2sbi(inode);
4916 struct ptlrpc_request *req;
4917 struct mdt_body *body;
4924 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4925 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4926 lock->l_lvb_data, lock->l_lvb_len);
4928 if (lock->l_lvb_data != NULL)
4931 /* if layout lock was granted right away, the layout is returned
4932 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4933 * blocked and then granted via completion ast, we have to fetch
4934 * layout here. Please note that we can't use the LVB buffer in
4935 * completion AST because it doesn't have a large enough buffer */
4936 rc = ll_get_default_mdsize(sbi, &lmmsize);
4938 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4939 OBD_MD_FLXATTR, XATTR_NAME_LOV, lmmsize, &req);
4943 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4945 GOTO(out, rc = -EPROTO);
4947 lmmsize = body->mbo_eadatasize;
4948 if (lmmsize == 0) /* empty layout */
4951 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4953 GOTO(out, rc = -EFAULT);
/* copy the EA out of the RPC reply; the lock keeps its own buffer */
4955 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4956 if (lvbdata == NULL)
4957 GOTO(out, rc = -ENOMEM);
4959 memcpy(lvbdata, lmm, lmmsize);
4960 lock_res_and_lock(lock);
/* install only if nobody raced us; otherwise free our copy below */
4961 if (unlikely(lock->l_lvb_data == NULL)) {
4962 lock->l_lvb_type = LVB_T_LAYOUT;
4963 lock->l_lvb_data = lvbdata;
4964 lock->l_lvb_len = lmmsize;
4967 unlock_res_and_lock(lock);
4970 OBD_FREE_LARGE(lvbdata, lmmsize);
4975 ptlrpc_req_finished(req);
4980 * Apply the layout to the inode. Layout lock is held and will be released
/*
 * Given a held layout lock (handle \a lockh, mode \a mode), fetch its
 * layout if necessary and configure the inode's cl_object with it; the
 * lock reference is dropped before returning.  If the object is still
 * in use an OBJECT_CONF_WAIT pass waits for IO to drain.
 * NOTE(review): some original lines are elided in this excerpt.
 */
4983 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4984 struct inode *inode)
4986 struct ll_inode_info *lli = ll_i2info(inode);
4987 struct ll_sb_info *sbi = ll_i2sbi(inode);
4988 struct ldlm_lock *lock;
4989 struct cl_object_conf conf;
4992 bool wait_layout = false;
4995 LASSERT(lustre_handle_is_used(lockh));
4997 lock = ldlm_handle2lock(lockh);
4998 LASSERT(lock != NULL);
4999 LASSERT(ldlm_has_layout(lock));
5001 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5002 PFID(&lli->lli_fid), inode);
5004 /* in case this is a caching lock and reinstate with new inode */
5005 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5007 lock_res_and_lock(lock);
5008 lvb_ready = ldlm_is_lvb_ready(lock);
5009 unlock_res_and_lock(lock);
5011 /* checking lvb_ready is racy but this is okay. The worst case is
5012 * that multi processes may configure the file on the same time. */
5016 rc = ll_layout_fetch(inode, lock);
5020 /* for layout lock, lmm is stored in lock's lvb.
5021 * lvb_data is immutable if the lock is held so it's safe to access it
5024 * set layout to file. Unlikely this will fail as old layout was
5025 * surely eliminated */
5026 memset(&conf, 0, sizeof conf);
5027 conf.coc_opc = OBJECT_CONF_SET;
5028 conf.coc_inode = inode;
5029 conf.coc_lock = lock;
5030 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5031 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5032 rc = ll_layout_conf(inode, &conf);
5034 /* refresh layout failed, need to wait */
5035 wait_layout = rc == -EBUSY;
5038 LDLM_LOCK_PUT(lock);
5039 ldlm_lock_decref(lockh, mode);
5041 /* wait for IO to complete if it's still being used. */
5043 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5044 ll_get_fsname(inode->i_sb, NULL, 0),
5045 PFID(&lli->lli_fid), inode);
5047 memset(&conf, 0, sizeof conf);
5048 conf.coc_opc = OBJECT_CONF_WAIT;
5049 conf.coc_inode = inode;
5050 rc = ll_layout_conf(inode, &conf);
5054 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5055 ll_get_fsname(inode->i_sb, NULL, 0),
5056 PFID(&lli->lli_fid), rc);
5062 * Issue layout intent RPC to MDS.
5063 * \param inode [in] file inode
5064 * \param intent [in] layout intent
5066 * \retval 0 on success
5067 * \retval < 0 error code
5069 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5071 struct ll_inode_info *lli = ll_i2info(inode);
5072 struct ll_sb_info *sbi = ll_i2sbi(inode);
5073 struct md_op_data *op_data;
5074 struct lookup_intent it;
5075 struct ptlrpc_request *req;
5079 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5080 0, 0, LUSTRE_OPC_ANY, NULL);
5081 if (IS_ERR(op_data))
5082 RETURN(PTR_ERR(op_data));
/* the layout intent rides along in op_data */
5084 op_data->op_data = intent;
5085 op_data->op_data_size = sizeof(*intent);
5087 memset(&it, 0, sizeof(it));
5088 it.it_op = IT_LAYOUT;
/* write/truncate intents need the lock in write mode */
5089 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5090 intent->li_opc == LAYOUT_INTENT_TRUNC)
5091 it.it_flags = FMODE_WRITE;
5093 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5094 ll_get_fsname(inode->i_sb, NULL, 0),
5095 PFID(&lli->lli_fid), inode);
5097 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5098 &ll_md_blocking_ast, 0);
5099 if (it.it_request != NULL)
5100 ptlrpc_req_finished(it.it_request);
5101 it.it_request = NULL;
5103 ll_finish_md_op_data(op_data);
5105 /* set lock data in case this is a new lock */
5107 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5109 ll_intent_drop_lock(&it);
5115 * This function checks if there exists a LAYOUT lock on the client side,
5116 * or enqueues it if it doesn't have one in cache.
5118 * This function will not hold layout lock so it may be revoked any time after
5119 * this function returns. Any operations depend on layout should be redone
5122 * This function should be called before lov_io_init() to get an uptodate
5123 * layout version, the caller should save the version number and after IO
5124 * is finished, this function should be called again to verify that layout
5125 * is not changed during IO time.
5127 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5129 struct ll_inode_info *lli = ll_i2info(inode);
5130 struct ll_sb_info *sbi = ll_i2sbi(inode);
5131 struct lustre_handle lockh;
5132 struct layout_intent intent = {
5133 .li_opc = LAYOUT_INTENT_ACCESS,
5135 enum ldlm_mode mode;
/* fast path: layout locking disabled, or a layout generation is
 * already cached */
5139 *gen = ll_layout_version_get(lli);
5140 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5144 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5145 LASSERT(S_ISREG(inode->i_mode));
5147 /* take layout lock mutex to enqueue layout lock exclusively. */
5148 mutex_lock(&lli->lli_layout_mutex);
5151 /* mostly layout lock is caching on the local side, so try to
5152 * match it before grabbing layout lock mutex. */
5153 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5154 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5155 if (mode != 0) { /* hit cached lock */
5156 rc = ll_layout_lock_set(&lockh, mode, inode);
/* no cached lock: enqueue one with an ACCESS layout intent */
5162 rc = ll_layout_intent(inode, &intent);
5168 *gen = ll_layout_version_get(lli);
5169 mutex_unlock(&lli->lli_layout_mutex);
5175 * Issue layout intent RPC indicating where in a file an IO is about to write.
5177 * \param[in] inode file inode.
5178 * \param[in] ext write range with start offset of fille in bytes where
5179 * an IO is about to write, and exclusive end offset in
5182 * \retval 0 on success
5183 * \retval < 0 error code
5185 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5186 struct lu_extent *ext)
5188 struct layout_intent intent = {
5190 .li_extent.e_start = ext->e_start,
5191 .li_extent.e_end = ext->e_end,
5196 rc = ll_layout_intent(inode, &intent);
5202 * This function send a restore request to the MDT
5204 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5206 struct hsm_user_request *hur;
5210 len = sizeof(struct hsm_user_request) +
5211 sizeof(struct hsm_user_item);
5212 OBD_ALLOC(hur, len);
5216 hur->hur_request.hr_action = HUA_RESTORE;
5217 hur->hur_request.hr_archive_id = 0;
5218 hur->hur_request.hr_flags = 0;
5219 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5220 sizeof(hur->hur_user_item[0].hui_fid));
5221 hur->hur_user_item[0].hui_extent.offset = offset;
5222 hur->hur_user_item[0].hui_extent.length = length;
5223 hur->hur_request.hr_itemcount = 1;
5224 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,