4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
64 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate per-open-file private data from the dedicated slab cache.
 * GFP_NOFS prevents reclaim from re-entering the filesystem while we
 * hold fs state.  NOTE(review): interior lines (NULL check, remaining
 * field init, return) are not visible in this excerpt. */
67 static struct ll_file_data *ll_file_data_get(void)
69 struct ll_file_data *fd;
71 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
75 fd->fd_write_failed = false;
/* Return a ll_file_data to its slab cache (counterpart of
 * ll_file_data_get()). */
80 static void ll_file_data_put(struct ll_file_data *fd)
83 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
87 * Packs all the attributes into @op_data for the CLOSE rpc.
89 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
90 struct obd_client_handle *och)
94 ll_prep_md_op_data(op_data, inode, NULL, NULL,
95 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot the client-side inode attributes so the MDT sees the
 * final mode/times/size at close time. */
97 op_data->op_attr.ia_mode = inode->i_mode;
98 op_data->op_attr.ia_atime = inode->i_atime;
99 op_data->op_attr.ia_mtime = inode->i_mtime;
100 op_data->op_attr.ia_ctime = inode->i_ctime;
101 op_data->op_attr.ia_size = i_size_read(inode);
102 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
103 ATTR_MTIME | ATTR_MTIME_SET |
105 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
106 op_data->op_attr_blocks = inode->i_blocks;
/* Translate in-kernel inode flags to the on-wire (ext) flag format. */
107 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
108 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
109 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
110 op_data->op_open_handle = och->och_open_handle;
112 if (och->och_flags & FMODE_WRITE &&
113 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
114 /* For HSM: if inode data has been modified, pack it so that
115 * MDT can set data dirty flag in the archive. */
116 op_data->op_bias |= MDS_DATA_MODIFIED;
122 * Perform a close, possibly with a bias.
123 * The meaning of "data" depends on the value of "bias".
125 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
126 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
129 static int ll_close_inode_openhandle(struct inode *inode,
130 struct obd_client_handle *och,
131 enum mds_op_bias bias, void *data)
133 struct obd_export *md_exp = ll_i2mdexp(inode);
134 const struct ll_inode_info *lli = ll_i2info(inode);
135 struct md_op_data *op_data;
136 struct ptlrpc_request *req = NULL;
/* Bail out early if the MDC export was already disconnected. */
140 if (class_exp2obd(md_exp) == NULL) {
141 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
142 ll_get_fsname(inode->i_sb, NULL, 0),
143 PFID(&lli->lli_fid));
147 OBD_ALLOC_PTR(op_data);
148 /* We leak openhandle and request here on error, but not much to be
149 * done in OOM case since app won't retry close on error either. */
151 GOTO(out, rc = -ENOMEM);
153 ll_prepare_close(inode, op_data, och);
155 case MDS_CLOSE_LAYOUT_MERGE:
156 /* merge blocks from the victim inode */
157 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
158 op_data->op_attr.ia_valid |= ATTR_SIZE;
159 op_data->op_xvalid |= OP_XVALID_BLOCKS;
/* NOTE(review): no break visible here — MERGE appears to fall
 * through into the SPLIT/SWAP handling below; confirm against
 * the full source. */
160 case MDS_CLOSE_LAYOUT_SPLIT:
161 case MDS_CLOSE_LAYOUT_SWAP: {
162 struct split_param *sp = data;
164 LASSERT(data != NULL);
165 op_data->op_bias |= bias;
166 op_data->op_data_version = 0;
167 op_data->op_lease_handle = och->och_lease_handle;
168 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
/* For SPLIT, @data is a struct split_param; otherwise the
 * second inode is passed directly. */
169 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
170 op_data->op_mirror_id = sp->sp_mirror_id;
172 op_data->op_fid2 = *ll_inode2fid(data);
177 case MDS_CLOSE_RESYNC_DONE: {
178 struct ll_ioc_lease *ioc = data;
180 LASSERT(data != NULL);
181 op_data->op_attr_blocks +=
182 ioc->lil_count * op_data->op_attr_blocks;
183 op_data->op_attr.ia_valid |= ATTR_SIZE;
184 op_data->op_xvalid |= OP_XVALID_BLOCKS;
185 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
187 op_data->op_lease_handle = och->och_lease_handle;
188 op_data->op_data = &ioc->lil_ids[0];
189 op_data->op_data_size =
190 ioc->lil_count * sizeof(ioc->lil_ids[0]);
194 case MDS_HSM_RELEASE:
195 LASSERT(data != NULL);
196 op_data->op_bias |= MDS_HSM_RELEASE;
/* @data carries the data version captured before release. */
197 op_data->op_data_version = *(__u64 *)data;
198 op_data->op_lease_handle = och->och_lease_handle;
199 op_data->op_attr.ia_valid |= ATTR_SIZE;
200 op_data->op_xvalid |= OP_XVALID_BLOCKS;
204 LASSERT(data == NULL);
/* If size/blocks were not explicitly packed, let the MDT treat
 * them lazily instead of trusting possibly-stale client values. */
208 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
209 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
210 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
211 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
213 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is an expected interruption, not worth a console error. */
214 if (rc != 0 && rc != -EINTR)
215 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
216 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
218 if (rc == 0 && op_data->op_bias & bias) {
219 struct mdt_body *body;
/* Verify the server actually executed the close intent. */
221 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
222 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
226 ll_finish_md_op_data(op_data);
230 md_clear_open_replay_data(md_exp, och);
231 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
234 ptlrpc_req_finished(req); /* This is close request */
/* Drop one user of the per-mode (read/write/exec) MDS open handle for
 * @inode and, when the use count reaches zero, send the actual close
 * RPC via ll_close_inode_openhandle(). */
238 int ll_md_real_close(struct inode *inode, fmode_t fmode)
240 struct ll_inode_info *lli = ll_i2info(inode);
241 struct obd_client_handle **och_p;
242 struct obd_client_handle *och;
/* Select the handle/use-count pair matching the open mode. */
247 if (fmode & FMODE_WRITE) {
248 och_p = &lli->lli_mds_write_och;
249 och_usecount = &lli->lli_open_fd_write_count;
250 } else if (fmode & FMODE_EXEC) {
251 och_p = &lli->lli_mds_exec_och;
252 och_usecount = &lli->lli_open_fd_exec_count;
254 LASSERT(fmode & FMODE_READ);
255 och_p = &lli->lli_mds_read_och;
256 och_usecount = &lli->lli_open_fd_read_count;
259 mutex_lock(&lli->lli_och_mutex);
260 if (*och_usecount > 0) {
261 /* There are still users of this handle, so skip
263 mutex_unlock(&lli->lli_och_mutex);
269 mutex_unlock(&lli->lli_och_mutex);
272 /* There might be a race and this handle may already
274 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-file-descriptor close: release group lock and lease if held,
 * drop this fd's reference on the shared MDS open handle, and free the
 * ll_file_data.  Skips the close RPC to the MDS when a matching OPEN
 * DLM lock is still cached (md_lock_match below). */
280 static int ll_md_close(struct inode *inode, struct file *file)
282 union ldlm_policy_data policy = {
283 .l_inodebits = { MDS_INODELOCK_OPEN },
285 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
286 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
287 struct ll_inode_info *lli = ll_i2info(inode);
288 struct lustre_handle lockh;
289 enum ldlm_mode lockmode;
293 /* clear group lock, if present */
294 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
295 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
297 if (fd->fd_lease_och != NULL) {
300 /* Usually the lease is not released when the
301 * application crashed, we need to release here. */
302 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
303 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
304 PFID(&lli->lli_fid), rc, lease_broken);
306 fd->fd_lease_och = NULL;
/* fd_och holds a private open handle taken over for a lease;
 * close it directly rather than via the shared handles. */
309 if (fd->fd_och != NULL) {
310 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
315 /* Let's see if we have good enough OPEN lock on the file and if
316 we can skip talking to MDS */
317 mutex_lock(&lli->lli_och_mutex);
318 if (fd->fd_omode & FMODE_WRITE) {
320 LASSERT(lli->lli_open_fd_write_count);
321 lli->lli_open_fd_write_count--;
322 } else if (fd->fd_omode & FMODE_EXEC) {
324 LASSERT(lli->lli_open_fd_exec_count);
325 lli->lli_open_fd_exec_count--;
328 LASSERT(lli->lli_open_fd_read_count);
329 lli->lli_open_fd_read_count--;
331 mutex_unlock(&lli->lli_och_mutex);
/* No cached OPEN lock of the right mode => must really close on MDS. */
333 if (!md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
334 LDLM_IBITS, &policy, lockmode, &lockh))
335 rc = ll_md_real_close(inode, fd->fd_omode);
338 LUSTRE_FPRIVATE(file) = NULL;
339 ll_file_data_put(fd);
344 /* While this returns an error code, fput() the caller does not, so we need
345 * to make every effort to clean up all of our state here. Also, applications
346 * rarely check close errors and even if an error is returned they will not
347 * re-try the close call.
349 int ll_file_release(struct inode *inode, struct file *file)
351 struct ll_file_data *fd;
352 struct ll_sb_info *sbi = ll_i2sbi(inode);
353 struct ll_inode_info *lli = ll_i2info(inode);
357 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
358 PFID(ll_inode2fid(inode)), inode);
/* Don't account releases of the filesystem root in per-op stats. */
360 if (inode->i_sb->s_root != file_dentry(file))
361 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
362 fd = LUSTRE_FPRIVATE(file);
365 /* The last ref on @file, maybe not the owner pid of statahead,
366 * because parent and child process can share the same file handle. */
367 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
368 ll_deauthorize_statahead(inode, fd);
/* Root dentry: nothing opened on the MDS for it, just free fd. */
370 if (inode->i_sb->s_root == file_dentry(file)) {
371 LUSTRE_FPRIVATE(file) = NULL;
372 ll_file_data_put(fd);
/* Propagate any deferred async write errors to this close. */
376 if (!S_ISDIR(inode->i_mode)) {
377 if (lli->lli_clob != NULL)
378 lov_read_and_clear_async_rc(lli->lli_clob);
379 lli->lli_async_rc = 0;
382 rc = ll_md_close(inode, file);
/* Fault-injection hook: optionally dump the debug log on release. */
384 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
385 libcfs_debug_dumplog();
/* read_cache_page() filler: copy inline Data-on-MDT reply data
 * (struct niobuf_local passed via @data) into @page, zero-fill the
 * tail beyond lnb_len, and mark the page up to date. */
390 static inline int ll_dom_readpage(void *data, struct page *page)
392 struct niobuf_local *lnb = data;
395 kaddr = ll_kmap_atomic(page, KM_USER0);
396 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
397 if (lnb->lnb_len < PAGE_SIZE)
398 memset(kaddr + lnb->lnb_len, 0,
399 PAGE_SIZE - lnb->lnb_len);
400 flush_dcache_page(page);
401 SetPageUptodate(page);
402 ll_kunmap_atomic(kaddr, KM_USER0);
/* On open of a Data-on-MDT file, if the server returned file data
 * inline in the open reply (RMF_NIOBUF_INLINE) under a DOM lock,
 * pre-populate the page cache from that buffer page by page. */
408 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
409 struct lookup_intent *it)
411 struct ll_inode_info *lli = ll_i2info(inode);
412 struct cl_object *obj = lli->lli_clob;
413 struct address_space *mapping = inode->i_mapping;
415 struct niobuf_remote *rnb;
417 struct lustre_handle lockh;
418 struct ldlm_lock *lock;
419 unsigned long index, start;
420 struct niobuf_local lnb;
421 bool dom_lock = false;
/* Only cache the data if the intent actually granted a DOM lock. */
428 if (it->it_lock_mode != 0) {
429 lockh.cookie = it->it_lock_handle;
430 lock = ldlm_handle2lock(&lockh);
432 dom_lock = ldlm_has_dom(lock);
438 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
442 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
443 if (rnb == NULL || rnb->rnb_len == 0)
446 CDEBUG(D_INFO, "Get data buffer along with open, len %i, i_size %llu\n",
447 rnb->rnb_len, i_size_read(inode));
/* Payload follows immediately after the niobuf_remote descriptor. */
449 data = (char *)rnb + sizeof(*rnb);
451 lnb.lnb_file_offset = rnb->rnb_offset;
452 start = lnb.lnb_file_offset / PAGE_SIZE;
454 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
455 lnb.lnb_page_offset = 0;
/* Per-iteration: point lnb at the next PAGE_SIZE slice of the buffer. */
457 lnb.lnb_data = data + (index << PAGE_SHIFT);
458 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
459 if (lnb.lnb_len > PAGE_SIZE)
460 lnb.lnb_len = PAGE_SIZE;
462 vmpage = read_cache_page(mapping, index + start,
463 ll_dom_readpage, &lnb);
464 if (IS_ERR(vmpage)) {
465 CWARN("%s: cannot fill page %lu for "DFID
466 " with data: rc = %li\n",
467 ll_get_fsname(inode->i_sb, NULL, 0),
468 index + start, PFID(lu_object_fid(&obj->co_lu)),
474 } while (rnb->rnb_len > (index << PAGE_SHIFT));
/* Send an intent-based open to the MDS for @de and instantiate the
 * resulting inode/lock state.  @lmm/@lmmsize optionally carry striping
 * metadata to pack into the request. */
478 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
479 struct lookup_intent *itp)
481 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
482 struct dentry *parent = de->d_parent;
483 const char *name = NULL;
485 struct md_op_data *op_data;
486 struct ptlrpc_request *req = NULL;
490 LASSERT(parent != NULL);
491 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
493 /* if server supports open-by-fid, or file name is invalid, don't pack
494 * name in open request */
495 if (!(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID) &&
496 lu_name_is_valid_2(de->d_name.name, de->d_name.len)) {
497 name = de->d_name.name;
498 len = de->d_name.len;
501 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
502 name, len, 0, LUSTRE_OPC_ANY, NULL);
504 RETURN(PTR_ERR(op_data));
505 op_data->op_data = lmm;
506 op_data->op_data_size = lmmsize;
508 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
509 &ll_md_blocking_ast, 0);
510 ll_finish_md_op_data(op_data);
512 /* reason to keep our own exit path - don't flood the log
513 * with messages with -ESTALE errors.
515 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
516 it_open_error(DISP_OPEN_OPEN, itp))
/* A server-side open handle may still exist even though the
 * intent failed; release it to avoid leaking it on the MDS. */
518 ll_release_openhandle(de, itp);
522 if (it_disposition(itp, DISP_LOOKUP_NEG))
523 GOTO(out, rc = -ENOENT);
525 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
526 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
527 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
531 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
/* On success with a granted lock, absorb any inline DOM data and
 * attach the lock to the inode. */
533 if (!rc && itp->it_lock_mode) {
534 ll_dom_finish_open(de->d_inode, req, itp);
535 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
539 ptlrpc_req_finished(req);
540 ll_intent_drop_lock(itp);
542 /* We did open by fid, but by the time we got to the server,
543 * the object disappeared. If this is a create, we cannot really
544 * tell the userspace that the file it was trying to create
545 * does not exist. Instead let's return -ESTALE, and the VFS will
546 * retry the create with LOOKUP_REVAL that we are going to catch
547 * in ll_revalidate_dentry() and use lookup then.
549 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Populate an obd_client_handle from the MDT reply body carried by the
 * completed intent, then register it for open replay on recovery. */
555 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
556 struct obd_client_handle *och)
558 struct mdt_body *body;
560 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
561 och->och_open_handle = body->mbo_open_handle;
562 och->och_fid = body->mbo_fid1;
/* The intent's lock handle doubles as the lease handle cookie. */
563 och->och_lease_handle.cookie = it->it_lock_handle;
564 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
565 och->och_flags = it->it_flags;
567 return md_set_open_replay_data(md_exp, och, it);
/* Finish the client-local part of an open: optionally fill @och from
 * the intent, then attach @fd as the file's private data and
 * initialize its readahead state, open mode and cl-context fields. */
570 static int ll_local_open(struct file *file, struct lookup_intent *it,
571 struct ll_file_data *fd, struct obd_client_handle *och)
573 struct inode *inode = file_inode(file);
576 LASSERT(!LUSTRE_FPRIVATE(file));
583 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
588 LUSTRE_FPRIVATE(file) = fd;
589 ll_readahead_init(inode, &fd->fd_ras);
590 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
592 /* ll_cl_context initialize */
593 rwlock_init(&fd->fd_lock);
594 INIT_LIST_HEAD(&fd->fd_lccs);
599 /* Open a file, and (for the very first open) create objects on the OSTs at
600 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
601 * creation or open until ll_lov_setstripe() ioctl is called.
603 * If we already have the stripe MD locally then we don't request it in
604 * md_open(), by passing a lmm_size = 0.
606 * It is up to the application to ensure no other processes open this file
607 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
608 * used. We might be able to avoid races of that sort by getting lli_open_sem
609 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
610 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
612 int ll_file_open(struct inode *inode, struct file *file)
614 struct ll_inode_info *lli = ll_i2info(inode);
615 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
616 .it_flags = file->f_flags };
617 struct obd_client_handle **och_p = NULL;
618 __u64 *och_usecount = NULL;
619 struct ll_file_data *fd;
623 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
624 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent left by the lookup path may ride in via private_data. */
626 it = file->private_data; /* XXX: compat macro */
627 file->private_data = NULL; /* prevent ll_local_open assertion */
629 fd = ll_file_data_get();
631 GOTO(out_nofiledata, rc = -ENOMEM);
634 if (S_ISDIR(inode->i_mode))
635 ll_authorize_statahead(inode, fd);
/* Filesystem root: no MDS open needed, just attach fd. */
637 if (inode->i_sb->s_root == file_dentry(file)) {
638 LUSTRE_FPRIVATE(file) = fd;
642 if (!it || !it->it_disposition) {
643 /* Convert f_flags into access mode. We cannot use file->f_mode,
644 * because everything but O_ACCMODE mask was stripped from
646 if ((oit.it_flags + 1) & O_ACCMODE)
648 if (file->f_flags & O_TRUNC)
649 oit.it_flags |= FMODE_WRITE;
651 /* kernel only call f_op->open in dentry_open. filp_open calls
652 * dentry_open after call to open_namei that checks permissions.
653 * Only nfsd_open call dentry_open directly without checking
654 * permissions and because of that this code below is safe.
656 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
657 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
659 /* We do not want O_EXCL here, presumably we opened the file
660 * already? XXX - NFS implications? */
661 oit.it_flags &= ~O_EXCL;
663 /* bug20584, if "it_flags" contains O_CREAT, the file will be
664 * created if necessary, then "IT_CREAT" should be set to keep
665 * consistent with it */
666 if (oit.it_flags & O_CREAT)
667 oit.it_op |= IT_CREAT;
673 /* Let's see if we have file open on MDS already. */
674 if (it->it_flags & FMODE_WRITE) {
675 och_p = &lli->lli_mds_write_och;
676 och_usecount = &lli->lli_open_fd_write_count;
677 } else if (it->it_flags & FMODE_EXEC) {
678 och_p = &lli->lli_mds_exec_och;
679 och_usecount = &lli->lli_open_fd_exec_count;
681 och_p = &lli->lli_mds_read_och;
682 och_usecount = &lli->lli_open_fd_read_count;
685 mutex_lock(&lli->lli_och_mutex);
686 if (*och_p) { /* Open handle is present */
687 if (it_disposition(it, DISP_OPEN_OPEN)) {
688 /* Well, there's extra open request that we do not need,
689 let's close it somehow. This will decref request. */
690 rc = it_open_error(DISP_OPEN_OPEN, it);
692 mutex_unlock(&lli->lli_och_mutex);
693 GOTO(out_openerr, rc);
696 ll_release_openhandle(file_dentry(file), it);
/* Reuse the cached handle; NULL och means "share existing". */
700 rc = ll_local_open(file, it, fd, NULL);
703 mutex_unlock(&lli->lli_och_mutex);
704 GOTO(out_openerr, rc);
707 LASSERT(*och_usecount == 0);
708 if (!it->it_disposition) {
709 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
710 /* We cannot just request lock handle now, new ELC code
711 means that one of other OPEN locks for this file
712 could be cancelled, and since blocking ast handler
713 would attempt to grab och_mutex as well, that would
714 result in a deadlock */
715 mutex_unlock(&lli->lli_och_mutex);
717 * Normally called under two situations:
719 * 2. A race/condition on MDS resulting in no open
720 * handle to be returned from LOOKUP|OPEN request,
721 * for example if the target entry was a symlink.
723 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
724 * marked by a bit set in ll_iget_for_nfs. Clear the
725 * bit so that it's not confusing later callers.
727 * NB; when ldd is NULL, it must have come via normal
728 * lookup path only, since ll_iget_for_nfs always calls
731 if (ldd && ldd->lld_nfs_dentry) {
732 ldd->lld_nfs_dentry = 0;
733 it->it_flags |= MDS_OPEN_LOCK;
737 * Always specify MDS_OPEN_BY_FID because we don't want
738 * to get file with different fid.
740 it->it_flags |= MDS_OPEN_BY_FID;
741 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
744 GOTO(out_openerr, rc);
748 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
750 GOTO(out_och_free, rc = -ENOMEM);
754 /* md_intent_lock() didn't get a request ref if there was an
755 * open error, so don't do cleanup on the request here
757 /* XXX (green): Should not we bail out on any error here, not
758 * just open error? */
759 rc = it_open_error(DISP_OPEN_OPEN, it);
761 GOTO(out_och_free, rc);
763 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
764 "inode %p: disposition %x, status %d\n", inode,
765 it_disposition(it, ~0), it->it_status);
767 rc = ll_local_open(file, it, fd, *och_p);
769 GOTO(out_och_free, rc);
771 mutex_unlock(&lli->lli_och_mutex);
774 /* Must do this outside lli_och_mutex lock to prevent deadlock where
775 different kind of OPEN lock for this same inode gets cancelled
776 by ldlm_cancel_lru */
777 if (!S_ISREG(inode->i_mode))
778 GOTO(out_och_free, rc);
780 cl_lov_delay_create_clear(&file->f_flags);
781 GOTO(out_och_free, rc);
/* Error path: free the half-initialized open handle slot. */
785 if (och_p && *och_p) {
786 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
787 *och_p = NULL; /* OBD_FREE writes some magic there */
790 mutex_unlock(&lli->lli_och_mutex);
793 if (lli->lli_opendir_key == fd)
794 ll_deauthorize_statahead(inode, fd);
796 ll_file_data_put(fd);
798 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
/* Drop the extra request reference the open intent may still hold. */
802 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
803 ptlrpc_req_finished(it->it_request);
804 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* Blocking AST for lease locks: on conflict (LDLM_CB_BLOCKING) cancel
 * the lease lock asynchronously.  Unlike ll_md_blocking_ast(), this
 * callback does not touch the open handle. */
810 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
811 struct ldlm_lock_desc *desc, void *data, int flag)
814 struct lustre_handle lockh;
818 case LDLM_CB_BLOCKING:
819 ldlm_lock2handle(lock, &lockh);
820 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
822 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
826 case LDLM_CB_CANCELING:
834 * When setting a lease on a file, we take ownership of the lli_mds_*_och
835 * and save it as fd->fd_och so as to force client to reopen the file even
836 * if it has an open lock in cache already.
838 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
839 struct lustre_handle *old_open_handle)
841 struct ll_inode_info *lli = ll_i2info(inode);
842 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
843 struct obd_client_handle **och_p;
848 /* Get the openhandle of the file */
849 mutex_lock(&lli->lli_och_mutex);
/* Only one lease per fd at a time. */
850 if (fd->fd_lease_och != NULL)
851 GOTO(out_unlock, rc = -EBUSY);
853 if (fd->fd_och == NULL) {
854 if (file->f_mode & FMODE_WRITE) {
855 LASSERT(lli->lli_mds_write_och != NULL);
856 och_p = &lli->lli_mds_write_och;
857 och_usecount = &lli->lli_open_fd_write_count;
859 LASSERT(lli->lli_mds_read_och != NULL);
860 och_p = &lli->lli_mds_read_och;
861 och_usecount = &lli->lli_open_fd_read_count;
/* Can't take over the shared handle while other fds use it. */
864 if (*och_usecount > 1)
865 GOTO(out_unlock, rc = -EBUSY);
/* Report the old open handle so the MDT can match the owner. */
872 *old_open_handle = fd->fd_och->och_open_handle;
876 mutex_unlock(&lli->lli_och_mutex);
881 * Release ownership on lli_mds_*_och when putting back a file lease.
883 static int ll_lease_och_release(struct inode *inode, struct file *file)
885 struct ll_inode_info *lli = ll_i2info(inode);
886 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
887 struct obd_client_handle **och_p;
888 struct obd_client_handle *old_och = NULL;
893 mutex_lock(&lli->lli_och_mutex);
/* Pick the shared-handle slot matching this fd's open mode. */
894 if (file->f_mode & FMODE_WRITE) {
895 och_p = &lli->lli_mds_write_och;
896 och_usecount = &lli->lli_open_fd_write_count;
898 och_p = &lli->lli_mds_read_och;
899 och_usecount = &lli->lli_open_fd_read_count;
902 /* The file may have been open by another process (broken lease) so
903 * *och_p is not NULL. In this case we should simply increase usecount
906 if (*och_p != NULL) {
907 old_och = fd->fd_och;
914 mutex_unlock(&lli->lli_och_mutex);
/* If the private handle became redundant, close it on the MDS. */
917 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
923 * Acquire a lease and open the file.
925 static struct obd_client_handle *
926 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
929 struct lookup_intent it = { .it_op = IT_OPEN };
930 struct ll_sb_info *sbi = ll_i2sbi(inode);
931 struct md_op_data *op_data;
932 struct ptlrpc_request *req = NULL;
933 struct lustre_handle old_open_handle = { 0 };
934 struct obd_client_handle *och = NULL;
/* A lease must be exactly read or exactly write ... */
939 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
940 RETURN(ERR_PTR(-EINVAL));
/* ... and compatible with how the file was opened (no exec). */
943 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
944 RETURN(ERR_PTR(-EPERM));
946 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
953 RETURN(ERR_PTR(-ENOMEM));
955 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
956 LUSTRE_OPC_ANY, NULL);
958 GOTO(out, rc = PTR_ERR(op_data));
960 /* To tell the MDT this openhandle is from the same owner */
961 op_data->op_open_handle = old_open_handle;
963 it.it_flags = fmode | open_flags;
964 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
965 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
966 &ll_md_blocking_lease_ast,
967 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
968 * it can be cancelled which may mislead applications that the lease is
970 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
971 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
972 * doesn't deal with openhandle, so normal openhandle will be leaked. */
973 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
974 ll_finish_md_op_data(op_data);
975 ptlrpc_req_finished(req);
977 GOTO(out_release_it, rc);
979 if (it_disposition(&it, DISP_LOOKUP_NEG))
980 GOTO(out_release_it, rc = -ENOENT);
982 rc = it_open_error(DISP_OPEN_OPEN, &it);
984 GOTO(out_release_it, rc);
986 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
987 ll_och_fill(sbi->ll_md_exp, &it, och);
/* Server did not grant the lease bit: old server, unsupported. */
989 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
990 GOTO(out_close, rc = -EOPNOTSUPP);
992 /* already get lease, handle lease lock */
993 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
994 if (it.it_lock_mode == 0 ||
995 it.it_lock_bits != MDS_INODELOCK_OPEN) {
996 /* open lock must return for lease */
997 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
998 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1000 GOTO(out_close, rc = -EPROTO);
1003 ll_intent_release(&it);
1007 /* Cancel open lock */
1008 if (it.it_lock_mode != 0) {
1009 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1011 it.it_lock_mode = 0;
1012 och->och_lease_handle.cookie = 0ULL;
1014 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1016 CERROR("%s: error closing file "DFID": %d\n",
1017 ll_get_fsname(inode->i_sb, NULL, 0),
1018 PFID(&ll_i2info(inode)->lli_fid), rc2);
1019 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1021 ll_intent_release(&it);
1025 RETURN(ERR_PTR(rc));
1029 * Check whether a layout swap can be done between two inodes.
1031 * \param[in] inode1 First inode to check
1032 * \param[in] inode2 Second inode to check
1034 * \retval 0 on success, layout swap can be performed between both inodes
1035 * \retval negative error code if requirements are not met
1037 static int ll_check_swap_layouts_validity(struct inode *inode1,
1038 struct inode *inode2)
/* Both must be regular files, writable by the caller, and on the
 * same filesystem. */
1040 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
1043 if (inode_permission(inode1, MAY_WRITE) ||
1044 inode_permission(inode2, MAY_WRITE))
1047 if (inode1->i_sb != inode2->i_sb)
/* Close @inode with a MDS_CLOSE_LAYOUT_SWAP bias so that the MDT
 * atomically swaps (or merges) layouts between @inode and @inode2 as
 * part of the close.  Takes ownership of @och. */
1053 static int ll_swap_layouts_close(struct obd_client_handle *och,
1054 struct inode *inode, struct inode *inode2)
1056 const struct lu_fid *fid1 = ll_inode2fid(inode);
1057 const struct lu_fid *fid2;
1061 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1062 ll_get_fsname(inode->i_sb, NULL, 0), PFID(fid1));
1064 rc = ll_check_swap_layouts_validity(inode, inode2);
1066 GOTO(out_free_och, rc);
1068 /* We now know that inode2 is a lustre inode */
1069 fid2 = ll_inode2fid(inode2);
/* Swapping a file's layout with itself makes no sense. */
1071 rc = lu_fid_cmp(fid1, fid2);
1073 GOTO(out_free_och, rc = -EINVAL);
1075 /* Close the file and {swap,merge} layouts between inode & inode2.
1076 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1077 * because we still need it to pack l_remote_handle to MDT. */
1078 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1081 och = NULL; /* freed in ll_close_inode_openhandle() */
1091 * Release lease and close the file.
1092 * It will check if the lease has ever broken.
1094 static int ll_lease_close_intent(struct obd_client_handle *och,
1095 struct inode *inode,
1096 bool *lease_broken, enum mds_op_bias bias,
1099 struct ldlm_lock *lock;
1100 bool cancelled = true;
/* Inspect the lease lock to see whether it was already cancelled
 * (i.e. the lease was broken by a conflicting access). */
1104 lock = ldlm_handle2lock(&och->och_lease_handle);
1106 lock_res_and_lock(lock);
1107 cancelled = ldlm_is_cancel(lock);
1108 unlock_res_and_lock(lock);
1109 LDLM_LOCK_PUT(lock);
1112 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1113 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1115 if (lease_broken != NULL)
1116 *lease_broken = cancelled;
/* Intact lease with no intent: cancel it ourselves now. */
1118 if (!cancelled && !bias)
1119 ldlm_cli_cancel(&och->och_lease_handle, 0);
1121 if (cancelled) { /* no need to execute intent */
1126 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Plain lease release: close with no intent bias and no intent data. */
1130 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1133 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1137 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1139 static int ll_lease_file_resync(struct obd_client_handle *och,
1140 struct inode *inode)
1142 struct ll_sb_info *sbi = ll_i2sbi(inode);
1143 struct md_op_data *op_data;
1144 __u64 data_version_unused;
1148 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1149 LUSTRE_OPC_ANY, NULL);
1150 if (IS_ERR(op_data))
1151 RETURN(PTR_ERR(op_data));
1153 /* before starting file resync, it's necessary to clean up page cache
1154 * in client memory, otherwise once the layout version is increased,
1155 * writing back cached data will be denied by the OSTs. */
1156 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
/* Pack the lease handle so the MDT can verify we still own it. */
1160 op_data->op_lease_handle = och->och_lease_handle;
1161 rc = md_file_resync(sbi->ll_md_exp, op_data);
1167 ll_finish_md_op_data(op_data);
/* Merge MDS-provided metadata (cached in ll_inode_info) with
 * OST-provided attributes (size/blocks/timestamps from the cl_object)
 * into the VFS inode, under the inode size lock. */
1171 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1173 struct ll_inode_info *lli = ll_i2info(inode);
1174 struct cl_object *obj = lli->lli_clob;
1175 struct cl_attr *attr = vvp_env_thread_attr(env);
1183 ll_inode_size_lock(inode);
1185 /* Merge the timestamps most recently obtained from MDS with
1186 * timestamps obtained from OSTs.
1188 * Do not overwrite atime of inode because it may be refreshed
1189 * by file_accessed() function. If the read was served by cache
1190 * data, there is no RPC to be sent so that atime may not be
1191 * transferred to OSTs at all. MDT only updates atime at close time
1192 * if it's at least 'mdd.*.atime_diff' older.
1193 * All in all, the atime in Lustre does not strictly comply with
1194 * POSIX. Solving this problem needs to send an RPC to MDT for each
1195 * read, this will hurt performance. */
1196 if (LTIME_S(inode->i_atime) < lli->lli_atime || lli->lli_update_atime) {
1197 LTIME_S(inode->i_atime) = lli->lli_atime;
1198 lli->lli_update_atime = 0;
1200 LTIME_S(inode->i_mtime) = lli->lli_mtime;
1201 LTIME_S(inode->i_ctime) = lli->lli_ctime;
1203 atime = LTIME_S(inode->i_atime);
1204 mtime = LTIME_S(inode->i_mtime);
1205 ctime = LTIME_S(inode->i_ctime);
1207 cl_object_attr_lock(obj);
1208 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1211 rc = cl_object_attr_get(env, obj, attr);
1212 cl_object_attr_unlock(obj);
/* -ENODATA (no OST objects yet) is not an error for the merge. */
1215 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
/* Keep the newest of each timestamp (client vs. OST view). */
1217 if (atime < attr->cat_atime)
1218 atime = attr->cat_atime;
1220 if (ctime < attr->cat_ctime)
1221 ctime = attr->cat_ctime;
1223 if (mtime < attr->cat_mtime)
1224 mtime = attr->cat_mtime;
1226 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1227 PFID(&lli->lli_fid), attr->cat_size);
1229 i_size_write(inode, attr->cat_size);
1230 inode->i_blocks = attr->cat_blocks;
1232 LTIME_S(inode->i_atime) = atime;
1233 LTIME_S(inode->i_mtime) = mtime;
1234 LTIME_S(inode->i_ctime) = ctime;
1237 ll_inode_size_unlock(inode);
1243 * Set designated mirror for I/O.
1245 * So far only read, write, and truncated can support to issue I/O to
1246 * designated mirror.
/*
 * Propagate any mirror designation from the file descriptor into the
 * cl_io, so FLR I/O is directed at the chosen replica.
 */
1248 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1250 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1252 /* clear layout version for generic(non-resync) I/O in case it carries
1253 * stale layout version due to I/O restart */
1254 io->ci_layout_version = 0;
1256 /* FLR: disable non-delay for designated mirror I/O because obviously
1257 * only one mirror is available */
1258 if (fd->fd_designated_mirror > 0) {
1260 io->ci_designated_mirror = fd->fd_designated_mirror;
1261 io->ci_layout_version = fd->fd_layout_version;
1262 io->ci_pio = 0; /* doesn't have a mechanism to pass mirror
/* Fixed typo in the debug message: "desiginated" -> "designated". */
1266 CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1267 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/*
 * Return true if atime updates should be suppressed for this open file,
 * mirroring the checks the kernel makes in file_accessed()/touch_atime()
 * (O_NOATIME, S_NOATIME, mount flags, nodiratime on directories).
 */
1270 static bool file_is_noatime(const struct file *file)
1272 const struct vfsmount *mnt = file->f_path.mnt;
1273 const struct inode *inode = file_inode((struct file *)file);
1275 /* Adapted from file_accessed() and touch_atime().*/
1276 if (file->f_flags & O_NOATIME)
1279 if (inode->i_flags & S_NOATIME)
1282 if (IS_NOATIME(inode))
1285 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1288 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1291 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1297 static int ll_file_io_ptask(struct cfs_ptask *ptask);
/*
 * Initialize a cl_io for a read/write on @file: set up the embedded
 * sync kiocb, lock policy (group lock / no-lock / mandatory for append),
 * noatime handling, parallel-I/O (PIO) eligibility and FLR mirror state.
 */
1299 static void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot)
1301 struct inode *inode = file_inode(file);
1302 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1304 memset(&io->u.ci_rw.rw_iter, 0, sizeof(io->u.ci_rw.rw_iter));
1305 init_sync_kiocb(&io->u.ci_rw.rw_iocb, file);
1306 io->u.ci_rw.rw_file = file;
1307 io->u.ci_rw.rw_ptask = ll_file_io_ptask;
1308 io->u.ci_rw.rw_nonblock = !!(file->f_flags & O_NONBLOCK);
1309 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1311 if (iot == CIT_WRITE) {
1312 io->u.ci_rw.rw_append = !!(file->f_flags & O_APPEND);
1313 io->u.ci_rw.rw_sync = !!(file->f_flags & O_SYNC ||
1314 file->f_flags & O_DIRECT ||
1317 io->ci_obj = ll_i2info(inode)->lli_clob;
1318 io->ci_lockreq = CILR_MAYBE;
1319 if (ll_file_nolock(file)) {
1320 io->ci_lockreq = CILR_NEVER;
1321 io->ci_no_srvlock = 1;
1322 } else if (file->f_flags & O_APPEND) {
1323 io->ci_lockreq = CILR_MANDATORY;
1325 io->ci_noatime = file_is_noatime(file);
/* PIO is never used for append writes (rw_append set above). */
1326 if (ll_i2sbi(inode)->ll_flags & LL_SBI_PIO)
1327 io->ci_pio = !io->u.ci_rw.rw_append;
1331 /* FLR: only use non-delay I/O for read as there is only one
1332 * available mirror for write. */
1333 io->ci_ndelay = !(iot == CIT_WRITE);
1335 ll_io_set_mirror(io, file);
/*
 * Parallel-task (ptask) worker: perform one read/write sub-range described
 * by the cl_io_pt on behalf of ll_file_io_generic().  Builds its own env
 * and cl_io, runs the I/O loop, then records bytes done and restart state
 * back into the cl_io_pt for the submitting thread.
 * NOTE(review): elided chunk — env/io/rc/refcheck declarations and some
 * braces are not visible here.
 */
1338 static int ll_file_io_ptask(struct cfs_ptask *ptask)
1340 struct cl_io_pt *pt = ptask->pt_cbdata;
1341 struct file *file = pt->cip_file;
1344 loff_t pos = pt->cip_pos;
1349 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1350 file_dentry(file)->d_name.name,
1351 pt->cip_iot == CIT_READ ? "read" : "write",
1352 pos, pos + pt->cip_count);
1354 env = cl_env_get(&refcheck);
1356 RETURN(PTR_ERR(env));
1358 io = vvp_env_thread_io(env);
1359 ll_io_init(io, file, pt->cip_iot);
1360 io->u.ci_rw.rw_iter = pt->cip_iter;
1361 io->u.ci_rw.rw_iocb = pt->cip_iocb;
1362 io->ci_pio = 0; /* It's already in parallel task */
1364 rc = cl_io_rw_init(env, io, pt->cip_iot, pos,
1365 pt->cip_count - pt->cip_result);
1367 struct vvp_io *vio = vvp_env_io(env);
1369 vio->vui_io_subtype = IO_NORMAL;
1370 vio->vui_fd = LUSTRE_FPRIVATE(file);
1372 ll_cl_add(file, env, io, LCC_RW);
1373 rc = cl_io_loop(env, io);
1374 ll_cl_remove(file, env);
1376 /* cl_io_rw_init() handled IO */
1380 if (OBD_FAIL_CHECK_RESET(OBD_FAIL_LLITE_PTASK_IO_FAIL, 0)) {
/* Account progress and advance the iterator/iocb for a possible retry. */
1386 if (io->ci_nob > 0) {
1387 pt->cip_result += io->ci_nob;
1388 iov_iter_advance(&pt->cip_iter, io->ci_nob);
1390 pt->cip_iocb.ki_pos = pos;
1391 #ifdef HAVE_KIOCB_KI_LEFT
1392 pt->cip_iocb.ki_left = pt->cip_count - pt->cip_result;
1393 #elif defined(HAVE_KI_NBYTES)
1394 pt->cip_iocb.ki_nbytes = pt->cip_count - pt->cip_result;
1398 cl_io_fini(env, io);
1399 cl_env_put(env, &refcheck);
1401 pt->cip_need_restart = io->ci_need_restart;
1403 CDEBUG(D_VFSTRACE, "%s: %s ret: %zd, rc: %d\n",
1404 file_dentry(file)->d_name.name,
1405 pt->cip_iot == CIT_READ ? "read" : "write",
1406 pt->cip_result, rc);
1408 RETURN(pt->cip_result > 0 ? 0 : rc);
/*
 * Common engine for all llite read/write entry points (normal, splice).
 * Sets up a cl_io, takes the range lock where required (writes and
 * O_DIRECT reads, see LU-6227), runs cl_io_loop(), and restarts the I/O
 * when the layout changed underneath (FLR/PFL), accumulating the result.
 * Also updates read/write byte statistics and fd_write_failed state.
 * NOTE(review): elided chunk — declarations of io/pos/result/rc and the
 * restart-loop braces are not all visible here.
 */
1412 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1413 struct file *file, enum cl_io_type iot,
1414 loff_t *ppos, size_t count)
1416 struct range_lock range;
1417 struct vvp_io *vio = vvp_env_io(env);
1418 struct inode *inode = file_inode(file);
1419 struct ll_inode_info *lli = ll_i2info(inode);
1420 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1425 unsigned retried = 0;
1426 bool restarted = false;
1430 CDEBUG(D_VFSTRACE, "%s: %s range: [%llu, %llu)\n",
1431 file_dentry(file)->d_name.name,
1432 iot == CIT_READ ? "read" : "write", pos, pos + count);
1435 io = vvp_env_thread_io(env);
1436 ll_io_init(io, file, iot);
1437 if (args->via_io_subtype == IO_NORMAL) {
1438 io->u.ci_rw.rw_iter = *args->u.normal.via_iter;
1439 io->u.ci_rw.rw_iocb = *args->u.normal.via_iocb;
1441 if (args->via_io_subtype != IO_NORMAL || restarted)
1443 io->ci_ndelay_tried = retried;
1445 if (cl_io_rw_init(env, io, iot, pos, count) == 0) {
1446 bool range_locked = false;
/* O_APPEND writes lock to EOF since the final extent is unknown. */
1448 if (file->f_flags & O_APPEND)
1449 range_lock_init(&range, 0, LUSTRE_EOF);
1451 range_lock_init(&range, pos, pos + count - 1);
1453 vio->vui_fd = LUSTRE_FPRIVATE(file);
1454 vio->vui_io_subtype = args->via_io_subtype;
1456 switch (vio->vui_io_subtype) {
1458 /* Direct IO reads must also take range lock,
1459 * or multiple reads will try to work on the same pages
1460 * See LU-6227 for details. */
1461 if (((iot == CIT_WRITE) ||
1462 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1463 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1464 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1466 rc = range_lock(&lli->lli_write_tree, &range);
1470 range_locked = true;
1474 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1475 vio->u.splice.vui_flags = args->u.splice.via_flags;
1478 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
1482 ll_cl_add(file, env, io, LCC_RW);
/* PIO writes need the inode lock held across the whole loop
 * (normally taken per-iteration); remember we hold it. */
1483 if (io->ci_pio && iot == CIT_WRITE && !IS_NOSEC(inode) &&
1484 !lli->lli_inode_locked) {
1486 lli->lli_inode_locked = 1;
1488 rc = cl_io_loop(env, io);
1489 if (lli->lli_inode_locked) {
1490 lli->lli_inode_locked = 0;
1491 inode_unlock(inode);
1493 ll_cl_remove(file, env);
1496 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1498 range_unlock(&lli->lli_write_tree, &range);
1501 /* cl_io_rw_init() handled IO */
1505 if (io->ci_nob > 0) {
1506 result += io->ci_nob;
1507 count -= io->ci_nob;
1509 if (args->via_io_subtype == IO_NORMAL) {
1510 iov_iter_advance(args->u.normal.via_iter, io->ci_nob);
1512 /* CLIO is too complicated. See LU-11069. */
1513 if (cl_io_is_append(io))
1514 pos = io->u.ci_rw.rw_iocb.ki_pos;
1518 args->u.normal.via_iocb->ki_pos = pos;
1519 #ifdef HAVE_KIOCB_KI_LEFT
1520 args->u.normal.via_iocb->ki_left = count;
1521 #elif defined(HAVE_KI_NBYTES)
1522 args->u.normal.via_iocb->ki_nbytes = count;
1526 pos = io->u.ci_rw.rw_range.cir_pos;
1530 cl_io_fini(env, io);
1533 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1534 file->f_path.dentry->d_name.name,
1535 iot, rc, result, io->ci_need_restart);
1537 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1539 "%s: restart %s range: [%llu, %llu) ret: %zd, rc: %d\n",
1540 file_dentry(file)->d_name.name,
1541 iot == CIT_READ ? "read" : "write",
1542 pos, pos + count, result, rc);
1543 /* preserve the tried count for FLR */
1544 retried = io->ci_ndelay_tried;
1549 if (iot == CIT_READ) {
1551 ll_stats_ops_tally(ll_i2sbi(inode),
1552 LPROC_LL_READ_BYTES, result);
1553 } else if (iot == CIT_WRITE) {
1555 ll_stats_ops_tally(ll_i2sbi(inode),
1556 LPROC_LL_WRITE_BYTES, result);
1557 fd->fd_write_failed = false;
1558 } else if (result == 0 && rc == 0) {
/* rc == 0 with no bytes: short write without error — keep/clear
 * fd_write_failed per the (elided) branch conditions. */
1561 fd->fd_write_failed = true;
1563 fd->fd_write_failed = false;
1564 } else if (rc != -ERESTARTSYS) {
1565 fd->fd_write_failed = true;
1569 CDEBUG(D_VFSTRACE, "%s: %s *ppos: %llu, pos: %llu, ret: %zd, rc: %d\n",
1570 file_dentry(file)->d_name.name,
1571 iot == CIT_READ ? "read" : "write", *ppos, pos, result, rc);
1575 RETURN(result > 0 ? result : rc);
1579 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1580 * especially for small I/O.
1582 * To serve a read request, CLIO has to create and initialize a cl_io and
1583 * then request DLM lock. This has turned out to have significant overhead
1584 * and affects the performance of small I/O dramatically.
1586 * It's not necessary to create a cl_io for each I/O. Under the help of read
1587 * ahead, most of the pages being read are already in memory cache and we can
1588 * read those pages directly because if the pages exist, the corresponding DLM
1589 * lock must exist so that page content must be valid.
1591 * In fast read implementation, the llite speculatively finds and reads pages
1592 * in memory cache. There are three scenarios for fast read:
1593 * - If the page exists and is uptodate, kernel VM will provide the data and
1594 * CLIO won't be intervened;
1595 * - If the page was brought into memory by read ahead, it will be exported
1596 * and read ahead parameters will be updated;
1597 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1598 * it will go back and invoke normal read, i.e., a cl_io will be created
1599 * and DLM lock will be requested.
1601 * POSIX compliance: posix standard states that read is intended to be atomic.
1602 * Lustre read implementation is in line with Linux kernel read implementation
1603 * and neither of them complies with POSIX standard in this matter. Fast read
1604 * doesn't make the situation worse on single node but it may interleave write
1605 * results from multiple nodes due to short read handling in ll_file_aio_read().
1607 * \param env - lu_env
1608 * \param iocb - kiocb from kernel
1609 * \param iter - user space buffers where the data will be copied
1611 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * Attempt a "fast read" straight from the page cache via
 * generic_file_read_iter(), bypassing CLIO entirely.  Only used when the
 * superblock has fast-read enabled and the file is not O_DIRECT.
 * A return of -ENODATA from ll_readpage() means the page was not cached
 * and the caller must fall back to the normal CLIO read path.
 */
1614 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
1618 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1621 /* NB: we can't do direct IO for fast read because it will need a lock
1622 * to make IO engine happy. */
1623 if (iocb->ki_filp->f_flags & O_DIRECT)
1626 result = generic_file_read_iter(iocb, iter);
1628 /* If the first page is not in cache, generic_file_aio_read() will be
1629 * returned with -ENODATA.
1630 * See corresponding code in ll_readpage(). */
1631 if (result == -ENODATA)
1635 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1636 LPROC_LL_READ_BYTES, result);
1642 * Read from a file (through the page cache).
/*
 * read_iter entry point: try the fast-read path first; if it fails or
 * leaves bytes unread, fall back to the full CLIO path through
 * ll_file_io_generic() and combine the two results.
 */
1644 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1647 struct vvp_io_args *args;
1652 result = ll_do_fast_read(iocb, to);
1653 if (result < 0 || iov_iter_count(to) == 0)
1656 env = cl_env_get(&refcheck);
1658 return PTR_ERR(env);
1660 args = ll_env_args(env, IO_NORMAL);
1661 args->u.normal.via_iter = to;
1662 args->u.normal.via_iocb = iocb;
1664 rc2 = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ,
1665 &iocb->ki_pos, iov_iter_count(to));
1668 else if (result == 0)
1671 cl_env_put(env, &refcheck);
1677 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1678 * If a page is already in the page cache and dirty (and some other things -
1679 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1680 * write to it without doing a full I/O, because Lustre already knows about it
1681 * and will write it out. This saves a lot of processing time.
1683 * All writes here are within one page, so exclusion is handled by the page
1684 * lock on the vm page. We do not do tiny writes for writes which touch
1685 * multiple pages because it's very unlikely multiple sequential pages
1686 * are already dirty.
1688 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1689 * and are unlikely to be to already dirty pages.
1691 * Attribute updates are important here, we do them in ll_tiny_write_end.
/*
 * Fast path for sub-page writes to already-dirty cached pages (see the
 * block comment above).  Delegates to __generic_file_write_iter(); a
 * return of -ENODATA from ll_tiny_write_begin means the page was not
 * dirty and the caller must fall back to the normal write path.
 */
1693 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1695 ssize_t count = iov_iter_count(iter);
1696 struct file *file = iocb->ki_filp;
1697 struct inode *inode = file_inode(file);
1702 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1703 * of function for why.
1705 if (count >= PAGE_SIZE ||
1706 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1709 result = __generic_file_write_iter(iocb, iter);
1711 /* If the page is not already dirty, ll_tiny_write_begin returns
1712 * -ENODATA. We continue on to normal write.
1714 if (result == -ENODATA)
1718 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1720 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1723 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1729 * Write to a file (through the page cache).
/*
 * write_iter entry point: try the tiny-write fast path first (unless
 * O_DIRECT/O_SYNC/O_APPEND), then run the remaining bytes through the
 * normal CLIO path and combine the results.
 */
1731 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1733 struct vvp_io_args *args;
1735 ssize_t rc_tiny = 0, rc_normal;
1740 /* NB: we can't do direct IO for tiny writes because they use the page
1741 * cache, we can't do sync writes because tiny writes can't flush
1742 * pages, and we can't do append writes because we can't guarantee the
1743 * required DLM locks are held to protect file size.
1745 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(iocb->ki_filp))) &&
1746 !(iocb->ki_filp->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1747 rc_tiny = ll_do_tiny_write(iocb, from);
1749 /* In case of error, go on and try normal write - Only stop if tiny
1750 * write completed I/O.
1752 if (iov_iter_count(from) == 0)
1753 GOTO(out, rc_normal = rc_tiny);
1755 env = cl_env_get(&refcheck);
1757 return PTR_ERR(env);
1759 args = ll_env_args(env, IO_NORMAL);
1760 args->u.normal.via_iter = from;
1761 args->u.normal.via_iocb = iocb;
1763 rc_normal = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE,
1764 &iocb->ki_pos, iov_iter_count(from));
1766 /* On success, combine bytes written. */
1767 if (rc_tiny >= 0 && rc_normal > 0)
1768 rc_normal += rc_tiny;
1769 /* On error, only return error from normal write if tiny write did not
1770 * write any bytes. Otherwise return bytes written by tiny write.
1772 else if (rc_tiny > 0)
1773 rc_normal = rc_tiny;
1775 cl_env_put(env, &refcheck);
1780 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1782 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, truncating
 * nr_segs at the first segment that fails access_ok() (matching the
 * kernel's segment-checking semantics).
 */
1784 static int ll_file_get_iov_count(const struct iovec *iov,
1785 unsigned long *nr_segs, size_t *count)
1790 for (seg = 0; seg < *nr_segs; seg++) {
1791 const struct iovec *iv = &iov[seg];
1794 * If any segment has a negative length, or the cumulative
1795 * length ever wraps negative then return -EINVAL.
1798 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1800 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1805 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Compat aio_read entry (pre read_iter kernels): wrap the iovec array in
 * an iov_iter and forward to ll_file_read_iter().
 */
1812 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1813 unsigned long nr_segs, loff_t pos)
1820 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1824 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1825 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1826 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1827 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1828 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1830 result = ll_file_read_iter(iocb, &to);
/*
 * Compat read(2) entry (pre read_iter kernels): build a synchronous kiocb
 * around the user buffer and forward to ll_file_aio_read(), then copy the
 * updated position back to *ppos.
 */
1835 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1838 struct iovec iov = { .iov_base = buf, .iov_len = count };
1843 init_sync_kiocb(&kiocb, file);
1844 kiocb.ki_pos = *ppos;
1845 #ifdef HAVE_KIOCB_KI_LEFT
1846 kiocb.ki_left = count;
1847 #elif defined(HAVE_KI_NBYTES)
/* Fixed typo: the kiocb field is ki_nbytes (as in ll_file_write below);
 * "i_nbytes" would not compile on HAVE_KI_NBYTES kernels. */
1848 kiocb.ki_nbytes = count;
1851 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1852 *ppos = kiocb.ki_pos;
1858 * Write to a file (through the page cache).
/*
 * Compat aio_write entry (pre write_iter kernels): wrap the iovec array
 * in an iov_iter and forward to ll_file_write_iter().
 */
1861 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1862 unsigned long nr_segs, loff_t pos)
1864 struct iov_iter from;
1869 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
1873 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1874 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1875 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1876 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1877 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1879 result = ll_file_write_iter(iocb, &from);
/*
 * Compat write(2) entry (pre write_iter kernels): build a synchronous
 * kiocb around the user buffer and forward to ll_file_aio_write(), then
 * copy the updated position back to *ppos.
 */
1884 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1885 size_t count, loff_t *ppos)
1887 struct iovec iov = { .iov_base = (void __user *)buf,
1894 init_sync_kiocb(&kiocb, file);
1895 kiocb.ki_pos = *ppos;
1896 #ifdef HAVE_KIOCB_KI_LEFT
1897 kiocb.ki_left = count;
1898 #elif defined(HAVE_KI_NBYTES)
1899 kiocb.ki_nbytes = count;
1902 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1903 *ppos = kiocb.ki_pos;
1907 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1910 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read entry: run a CIT_READ through ll_file_io_generic() with the
 * IO_SPLICE subtype so pages are fed directly into the pipe.
 */
1912 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1913 struct pipe_inode_info *pipe, size_t count,
1917 struct vvp_io_args *args;
1922 env = cl_env_get(&refcheck);
1924 RETURN(PTR_ERR(env));
1926 args = ll_env_args(env, IO_SPLICE);
1927 args->u.splice.via_pipe = pipe;
1928 args->u.splice.via_flags = flags;
1930 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
1931 cl_env_put(env, &refcheck);
/*
 * Apply striping (LOV EA) to a file by re-opening it by FID with the
 * given lov_user_md attached to the open intent, then releasing the
 * resulting open handle.  Runs under the inode size lock.
 */
1935 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
1936 __u64 flags, struct lov_user_md *lum, int lum_size)
1938 struct lookup_intent oit = {
1940 .it_flags = flags | MDS_OPEN_BY_FID,
1945 ll_inode_size_lock(inode);
1946 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
1948 GOTO(out_unlock, rc);
1950 ll_release_openhandle(dentry, &oit);
1953 ll_inode_size_unlock(inode);
1954 ll_intent_release(&oit);
/*
 * Fetch the LOV EA (striping metadata) for @filename from the MDS via
 * md_getattr_name(), validate its magic, and byte-swap it to host endian
 * on big-endian clients before handing it back to the caller.  The
 * ptlrpc request is returned through @request so *lmmp stays valid until
 * the caller releases it.
 */
1959 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
1960 struct lov_mds_md **lmmp, int *lmm_size,
1961 struct ptlrpc_request **request)
1963 struct ll_sb_info *sbi = ll_i2sbi(inode);
1964 struct mdt_body *body;
1965 struct lov_mds_md *lmm = NULL;
1966 struct ptlrpc_request *req = NULL;
1967 struct md_op_data *op_data;
1970 rc = ll_get_default_mdsize(sbi, &lmmsize);
1974 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
1975 strlen(filename), lmmsize,
1976 LUSTRE_OPC_ANY, NULL);
1977 if (IS_ERR(op_data))
1978 RETURN(PTR_ERR(op_data));
1980 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
1981 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
1982 ll_finish_md_op_data(op_data);
1984 CDEBUG(D_INFO, "md_getattr_name failed "
1985 "on %s: rc %d\n", filename, rc);
1989 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
1990 LASSERT(body != NULL); /* checked by mdc_getattr_name */
1992 lmmsize = body->mbo_eadatasize;
1994 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
1996 GOTO(out, rc = -ENODATA);
1999 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2000 LASSERT(lmm != NULL);
/* Only plain V1/V3 and composite (PFL) layouts are understood here. */
2002 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2003 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2004 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1))
2005 GOTO(out, rc = -EPROTO);
2008 * This is coming from the MDS, so is probably in
2009 * little endian. We convert it to host endian before
2010 * passing it to userspace.
2012 if (LOV_MAGIC != cpu_to_le32(LOV_MAGIC)) {
2015 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2016 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2017 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
2018 if (le32_to_cpu(lmm->lmm_pattern) &
2019 LOV_PATTERN_F_RELEASED)
2023 /* if function called for directory - we should
2024 * avoid swab not existent lsm objects */
2025 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1)) {
2026 lustre_swab_lov_user_md_v1(
2027 (struct lov_user_md_v1 *)lmm);
2028 if (S_ISREG(body->mbo_mode))
2029 lustre_swab_lov_user_md_objects(
2030 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2032 } else if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2033 lustre_swab_lov_user_md_v3(
2034 (struct lov_user_md_v3 *)lmm);
2035 if (S_ISREG(body->mbo_mode))
2036 lustre_swab_lov_user_md_objects(
2037 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2039 } else if (lmm->lmm_magic ==
2040 cpu_to_le32(LOV_MAGIC_COMP_V1)) {
2041 lustre_swab_lov_comp_md_v1(
2042 (struct lov_comp_md_v1 *)lmm);
2048 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: copy a lov_user_md (with one OST object
 * entry) from userspace and apply it via ll_lov_setstripe_ea_info().
 * Requires CAP_SYS_ADMIN since it specifies pre-existing objects.
 */
2053 static int ll_lov_setea(struct inode *inode, struct file *file,
2056 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2057 struct lov_user_md *lump;
2058 int lum_size = sizeof(struct lov_user_md) +
2059 sizeof(struct lov_user_ost_data);
2063 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2066 OBD_ALLOC_LARGE(lump, lum_size);
2070 if (copy_from_user(lump, arg, lum_size))
2071 GOTO(out_lump, rc = -EFAULT);
2073 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
2075 cl_lov_delay_create_clear(&file->f_flags);
2078 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping information into the userspace buffer @lum
 * (up to @size bytes) via cl_object_getstripe().
 */
2082 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2089 env = cl_env_get(&refcheck);
2091 RETURN(PTR_ERR(env));
2093 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2094 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user's layout, apply it, then
 * refresh the client layout generation and echo the resulting striping
 * back to userspace.
 */
2098 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2101 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2102 struct lov_user_md *klum;
2104 __u64 flags = FMODE_WRITE;
2107 rc = ll_copy_user_md(lum, &klum);
2112 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
/* Zero stripe_count in the user buffer so getstripe below fills it. */
2117 rc = put_user(0, &lum->lmm_stripe_count);
2121 rc = ll_layout_refresh(inode, &gen);
2125 rc = ll_file_getstripe(inode, arg, lum_size);
2127 cl_lov_delay_create_clear(&file->f_flags);
2130 OBD_FREE(klum, lum_size);
/*
 * LL_IOC_GROUP_LOCK handler: take a cluster-wide group (GID) lock on the
 * file.  For composite (PFL) layouts all components are instantiated
 * first so the lock covers every OST object.  The lock is recorded in the
 * file descriptor; racing threads are resolved under lli_lock.
 */
2135 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2137 struct ll_inode_info *lli = ll_i2info(inode);
2138 struct cl_object *obj = lli->lli_clob;
2139 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2140 struct ll_grouplock grouplock;
2145 CWARN("group id for group lock must not be 0\n");
2149 if (ll_file_nolock(file))
2150 RETURN(-EOPNOTSUPP);
2152 spin_lock(&lli->lli_lock);
2153 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2154 CWARN("group lock already existed with gid %lu\n",
2155 fd->fd_grouplock.lg_gid);
2156 spin_unlock(&lli->lli_lock);
2159 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2160 spin_unlock(&lli->lli_lock);
2163 * XXX: group lock needs to protect all OST objects while PFL
2164 * can add new OST objects during the IO, so we'd instantiate
2165 * all OST objects before getting its group lock.
2170 struct cl_layout cl = {
2171 .cl_is_composite = false,
2173 struct lu_extent ext = {
2175 .e_end = OBD_OBJECT_EOF,
2178 env = cl_env_get(&refcheck);
2180 RETURN(PTR_ERR(env));
2182 rc = cl_object_layout_get(env, obj, &cl);
2183 if (!rc && cl.cl_is_composite)
2184 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2187 cl_env_put(env, &refcheck);
2192 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2193 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* Re-check under lli_lock: another thread may have won the race
 * while we were enqueuing the lock without the spinlock held. */
2197 spin_lock(&lli->lli_lock);
2198 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2199 spin_unlock(&lli->lli_lock);
2200 CERROR("another thread just won the race\n");
2201 cl_put_grouplock(&grouplock);
2205 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2206 fd->fd_grouplock = grouplock;
2207 spin_unlock(&lli->lli_lock);
2209 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * LL_IOC_GROUP_UNLOCK handler: release the group lock recorded on this
 * file descriptor, after verifying one is held and its GID matches @arg.
 * The fd state is cleared under lli_lock; the DLM lock is dropped
 * outside the spinlock.
 */
2213 static int ll_put_grouplock(struct inode *inode, struct file *file,
2216 struct ll_inode_info *lli = ll_i2info(inode);
2217 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2218 struct ll_grouplock grouplock;
2221 spin_lock(&lli->lli_lock);
2222 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2223 spin_unlock(&lli->lli_lock);
2224 CWARN("no group lock held\n");
2228 LASSERT(fd->fd_grouplock.lg_lock != NULL);
2230 if (fd->fd_grouplock.lg_gid != arg) {
2231 CWARN("group lock %lu doesn't match current id %lu\n",
2232 arg, fd->fd_grouplock.lg_gid);
2233 spin_unlock(&lli->lli_lock);
2237 grouplock = fd->fd_grouplock;
2238 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2239 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2240 spin_unlock(&lli->lli_lock);
2242 cl_put_grouplock(&grouplock);
2243 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2248 * Close inode open handle
2250 * \param dentry [in] dentry which contains the inode
2251 * \param it [in,out] intent which contains open info and result
2254 * \retval <0 failure
/* Closes the MDS open handle obtained by an intent open (no-op for the
 * filesystem root or if the intent carried no open). */
2256 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2258 struct inode *inode = dentry->d_inode;
2259 struct obd_client_handle *och;
2265 /* Root ? Do nothing. */
2266 if (dentry->d_inode->i_sb->s_root == dentry)
2269 /* No open handle to close? Move away */
2270 if (!it_disposition(it, DISP_OPEN_OPEN))
2273 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2275 OBD_ALLOC(och, sizeof(*och));
2277 GOTO(out, rc = -ENOMEM);
2279 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2281 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2283 /* this one is in place of ll_file_open */
2284 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2285 ptlrpc_req_finished(it->it_request);
2286 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2292 * Get size for inode for which FIEMAP mapping is requested.
2293 * Make the FIEMAP get_info call and returns the result.
2294 * \param fiemap kernel buffer to hold extents
2295 * \param num_bytes kernel buffer size
2297 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2303 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2306 /* Checks for fiemap flags */
2307 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
2308 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2312 /* Check for FIEMAP_FLAG_SYNC */
2313 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2314 rc = filemap_fdatawrite(inode->i_mapping);
2319 env = cl_env_get(&refcheck);
2321 RETURN(PTR_ERR(env));
/* Glimpse to make sure we have an up-to-date size from the OSTs. */
2323 if (i_size_read(inode) == 0) {
2324 rc = ll_glimpse_size(inode);
2329 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2330 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2331 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2333 /* If filesize is 0, then there would be no objects for mapping */
2334 if (fmkey.lfik_oa.o_size == 0) {
2335 fiemap->fm_mapped_extents = 0;
2339 fmkey.lfik_fiemap = *fiemap;
2341 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2342 &fmkey, fiemap, &num_bytes);
2344 cl_env_put(env, &refcheck);
/*
 * LL_IOC_FID2PATH handler: resolve a FID to a pathname via the MDC.
 * Copies the getinfo_fid2path request from userspace (appending the
 * client root FID for fileset-aware servers), forwards it with
 * obd_iocontrol(), and copies the result back.
 */
2348 int ll_fid2path(struct inode *inode, void __user *arg)
2350 struct obd_export *exp = ll_i2mdexp(inode);
2351 const struct getinfo_fid2path __user *gfin = arg;
2353 struct getinfo_fid2path *gfout;
/* Restricted to CAP_DAC_READ_SEARCH unless user_fid2path is enabled. */
2359 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2360 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2363 /* Only need to get the buflen */
2364 if (get_user(pathlen, &gfin->gf_pathlen))
2367 if (pathlen > PATH_MAX)
2370 outsize = sizeof(*gfout) + pathlen;
2371 OBD_ALLOC(gfout, outsize);
2375 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2376 GOTO(gf_free, rc = -EFAULT);
2377 /* append root FID after gfout to let MDT know the root FID so that it
2378 * can lookup the correct path, this is mainly for fileset.
2379 * old server without fileset mount support will ignore this. */
2380 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2382 /* Call mdc_iocontrol */
2383 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2387 if (copy_to_user(arg, gfout, outsize))
2391 OBD_FREE(gfout, outsize);
/*
 * Run a CIT_DATA_VERSION cl_io to fetch the file's data version and
 * layout version, storing them into @ioc.  A file with no object is
 * reported as version 0.  Restarts if the layout changed mid-flight.
 */
2396 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2398 struct cl_object *obj = ll_i2info(inode)->lli_clob;
2406 ioc->idv_version = 0;
2407 ioc->idv_layout_version = UINT_MAX;
2409 /* If no file object initialized, we consider its version is 0. */
2413 env = cl_env_get(&refcheck);
2415 RETURN(PTR_ERR(env));
2417 io = vvp_env_thread_io(env);
2419 io->u.ci_data_version.dv_data_version = 0;
2420 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2421 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2424 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2425 result = cl_io_loop(env, io);
2427 result = io->ci_result;
2429 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2430 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2432 cl_io_fini(env, io);
/* Layout changed while fetching the version: retry the whole io. */
2434 if (unlikely(io->ci_need_restart))
2437 cl_env_put(env, &refcheck);
2443 * Read the data_version for inode.
2445 * This value is computed using stripe object version on OST.
2446 * Version is computed using server side locking.
2448 * @param flags if do sync on the OST side;
2450 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2451 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/* Thin wrapper around ll_ioc_data_version() returning only the
 * data version (layout version discarded). */
2453 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2455 struct ioc_data_version ioc = { .idv_flags = flags };
2458 rc = ll_ioc_data_version(inode, &ioc);
2460 *data_version = ioc.idv_version;
2466 * Trigger a HSM release request for the provided inode.
/* Takes a write lease, flushes and records the current data version and
 * merged attributes, then closes the lease handle with MDS_HSM_RELEASE
 * so the MDT can drop the file's OST objects. */
2468 int ll_hsm_release(struct inode *inode)
2471 struct obd_client_handle *och = NULL;
2472 __u64 data_version = 0;
2477 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2478 ll_get_fsname(inode->i_sb, NULL, 0),
2479 PFID(&ll_i2info(inode)->lli_fid));
2481 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2483 GOTO(out, rc = PTR_ERR(och));
2485 /* Grab latest data_version and [am]time values */
2486 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2490 env = cl_env_get(&refcheck);
2492 GOTO(out, rc = PTR_ERR(env));
2494 rc = ll_merge_attr(env, inode);
2495 cl_env_put(env, &refcheck);
2497 /* If error happen, we have the wrong size for a file.
2503 /* Release the file.
2504 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2505 * we still need it to pack l_remote_handle to MDT. */
2506 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2512 if (och != NULL && !IS_ERR(och)) /* close the file */
2513 ll_lease_close(och, inode, NULL);
2518 struct ll_swap_stack {
2521 struct inode *inode1;
2522 struct inode *inode2;
/*
 * LL_IOC_LOV_SWAP_LAYOUTS handler: atomically exchange the layouts of two
 * files on the MDT.  The pair is ordered by FID to avoid lock inversion;
 * an optional group lock (gid) flushes dirty cache on both files, and
 * optional data-version checks abort with -EAGAIN if either file changed
 * since the caller sampled its version.
 */
2527 static int ll_swap_layouts(struct file *file1, struct file *file2,
2528 struct lustre_swap_layouts *lsl)
2530 struct mdc_swap_layouts msl;
2531 struct md_op_data *op_data;
2534 struct ll_swap_stack *llss = NULL;
2537 OBD_ALLOC_PTR(llss);
2541 llss->inode1 = file_inode(file1);
2542 llss->inode2 = file_inode(file2);
2544 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2548 /* we use 2 bool because it is easier to swap than 2 bits */
2549 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2550 llss->check_dv1 = true;
2552 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2553 llss->check_dv2 = true;
2555 /* we cannot use lsl->sl_dvX directly because we may swap them */
2556 llss->dv1 = lsl->sl_dv1;
2557 llss->dv2 = lsl->sl_dv2;
2559 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2560 if (rc == 0) /* same file, done! */
2563 if (rc < 0) { /* sequentialize it */
2564 swap(llss->inode1, llss->inode2);
2566 swap(llss->dv1, llss->dv2);
2567 swap(llss->check_dv1, llss->check_dv2);
2571 if (gid != 0) { /* application asks to flush dirty cache */
2572 rc = ll_get_grouplock(llss->inode1, file1, gid);
2576 rc = ll_get_grouplock(llss->inode2, file2, gid);
2578 ll_put_grouplock(llss->inode1, file1, gid);
2583 /* ultimate check, before swaping the layouts we check if
2584 * dataversion has changed (if requested) */
2585 if (llss->check_dv1) {
2586 rc = ll_data_version(llss->inode1, &dv, 0);
2589 if (dv != llss->dv1)
2590 GOTO(putgl, rc = -EAGAIN);
2593 if (llss->check_dv2) {
2594 rc = ll_data_version(llss->inode2, &dv, 0);
2597 if (dv != llss->dv2)
2598 GOTO(putgl, rc = -EAGAIN);
2601 /* struct md_op_data is used to send the swap args to the mdt
2602 * only flags is missing, so we use struct mdc_swap_layouts
2603 * through the md_op_data->op_data */
2604 /* flags from user space have to be converted before they are send to
2605 * server, no flag is sent today, they are only used on the client */
2608 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2609 0, LUSTRE_OPC_ANY, &msl);
2610 if (IS_ERR(op_data))
2611 GOTO(free, rc = PTR_ERR(op_data));
2613 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2614 sizeof(*op_data), op_data, NULL);
2615 ll_finish_md_op_data(op_data);
/* Release group locks in reverse acquisition order. */
2622 ll_put_grouplock(llss->inode2, file2, gid);
2623 ll_put_grouplock(llss->inode1, file1, gid);
/* Set and/or clear HSM state flags on @inode.
 * Validates the set/clear masks and the archive id, then forwards the
 * request to the MDT via an MDC ioctl.  Flags outside HSM_USER_MASK may
 * only be touched by a CAP_SYS_ADMIN-capable caller.
 * Returns 0 on success, negative errno on failure. */
2633 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2635 struct md_op_data *op_data;
2639 /* Detect out-of range masks */
2640 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2643 /* Non-root users are forbidden to set or clear flags which are
2644 * NOT defined in HSM_USER_MASK. */
2645 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2646 !cfs_capable(CFS_CAP_SYS_ADMIN))
2649 /* Detect out-of range archive id */
2650 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2651 (hss->hss_archive_id > LL_HSM_MAX_ARCHIVE))
/* @hss rides to the MDT inside op_data (last ll_prep_md_op_data arg). */
2654 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2655 LUSTRE_OPC_ANY, hss);
2656 if (IS_ERR(op_data))
2657 RETURN(PTR_ERR(op_data));
2659 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, ll_i2mdexp(inode),
2660 sizeof(*op_data), op_data, NULL);
2662 ll_finish_md_op_data(op_data);
/* LL_IOC_HSM_IMPORT: register an already-archived object as a released
 * Lustre file.  First marks the file HS_ARCHIVED|HS_EXISTS|HS_RELEASED on
 * the MDT, then restores the caller-supplied attributes (mode, uid, gid,
 * size, a/mtime) through ll_setattr_raw().  Regular files only. */
2667 static int ll_hsm_import(struct inode *inode, struct file *file,
2668 struct hsm_user_import *hui)
2670 struct hsm_state_set *hss = NULL;
2671 struct iattr *attr = NULL;
2675 if (!S_ISREG(inode->i_mode))
2681 GOTO(out, rc = -ENOMEM);
/* Step 1: tell the MDT this file is archived and currently released. */
2683 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2684 hss->hss_archive_id = hui->hui_archive_id;
2685 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2686 rc = ll_hsm_state_set(inode, hss);
2690 OBD_ALLOC_PTR(attr);
2692 GOTO(out, rc = -ENOMEM);
/* Step 2: rebuild the inode attributes from the import record.  Only the
 * permission bits of hui_mode are honoured; the type is forced to S_IFREG. */
2694 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2695 attr->ia_mode |= S_IFREG;
2696 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2697 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2698 attr->ia_size = hui->hui_size;
2699 attr->ia_mtime.tv_sec = hui->hui_mtime;
2700 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2701 attr->ia_atime.tv_sec = hui->hui_atime;
2702 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
/* ATTR_FORCE: apply the change even though the file is released. */
2704 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2705 ATTR_UID | ATTR_GID |
2706 ATTR_MTIME | ATTR_MTIME_SET |
2707 ATTR_ATIME | ATTR_ATIME_SET;
2711 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2715 inode_unlock(inode);
/* Translate a kernel fmode_t into the LL_LEASE_{RD,WR}LCK bits reported
 * back to userspace lease ioctls. */
2727 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2729 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2730 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/* LL_IOC_FUTIMES_3: like futimes() but additionally allows setting ctime.
 * Builds an iattr from the three user-supplied timestamps and applies it
 * with ll_setattr_raw(..., OP_XVALID_CTIME_SET).  Restricted to
 * CAP_SYS_ADMIN callers and regular files. */
2733 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2735 struct inode *inode = file_inode(file);
2737 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2738 ATTR_MTIME | ATTR_MTIME_SET |
2741 .tv_sec = lfu->lfu_atime_sec,
2742 .tv_nsec = lfu->lfu_atime_nsec,
2745 .tv_sec = lfu->lfu_mtime_sec,
2746 .tv_nsec = lfu->lfu_mtime_nsec,
2749 .tv_sec = lfu->lfu_ctime_sec,
2750 .tv_nsec = lfu->lfu_ctime_nsec,
/* Arbitrarily rewriting ctime is a privileged operation. */
2756 if (!capable(CAP_SYS_ADMIN))
2759 if (!S_ISREG(inode->i_mode))
2763 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2765 inode_unlock(inode);
/* Map a userspace lockahead mode (lock_mode_user) onto the internal
 * cl_lock_mode.  NOTE(review): sampled listing — the per-case return
 * values and the default (error) case are on lines not visible here. */
2770 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2773 case MODE_READ_USER:
2775 case MODE_WRITE_USER:
/* Printable names for lock_mode_user values, used in debug messages. */
2782 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2784 /* Used to allow the upper layers of the client to request an LDLM lock
2785 * without doing an actual read or write.
2787 * Used for ladvise lockahead to manually request specific locks.
2789 * \param[in] file file this ladvise lock request is on
2790 * \param[in] ladvise ladvise struct describing this lock request
2792 * \retval 0 success, no detailed result available (sync requests
2793 * and requests sent to the server [not handled locally]
2794 * cannot return detailed results)
2795 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2796 * see definitions for details.
2797 * \retval negative negative errno on error
/* See the block comment above for the full contract: request an LDLM
 * extent lock (lockahead) without performing I/O, via a CIT_MISC cl_io. */
2799 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2801 struct lu_env *env = NULL;
2802 struct cl_io *io = NULL;
2803 struct cl_lock *lock = NULL;
2804 struct cl_lock_descr *descr = NULL;
2805 struct dentry *dentry = file->f_path.dentry;
2806 struct inode *inode = dentry->d_inode;
2807 enum cl_lock_mode cl_mode;
2808 off_t start = ladvise->lla_start;
2809 off_t end = ladvise->lla_end;
2815 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2816 "start=%llu, end=%llu\n", dentry->d_name.len,
2817 dentry->d_name.name, dentry->d_inode,
2818 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
/* A negative cl_mode is an errno from the mode translation. */
2821 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
2823 GOTO(out, result = cl_mode);
2825 /* Get IO environment */
2826 result = cl_io_get(inode, &env, &io, &refcheck);
2830 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2833 * nothing to do for this io. This currently happens when
2834 * stripe sub-object's are not yet created.
2836 result = io->ci_result;
2837 } else if (result == 0) {
/* Build the lock descriptor for the requested byte range. */
2838 lock = vvp_env_lock(env);
2839 descr = &lock->cll_descr;
2841 descr->cld_obj = io->ci_obj;
2842 /* Convert byte offsets to pages */
2843 descr->cld_start = cl_index(io->ci_obj, start);
2844 descr->cld_end = cl_index(io->ci_obj, end);
2845 descr->cld_mode = cl_mode;
2846 /* CEF_MUST is used because we do not want to convert a
2847 * lockahead request to a lockless lock */
2848 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
/* LF_ASYNC requests a speculative (non-blocking) enqueue. */
2851 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2852 descr->cld_enq_flags |= CEF_SPECULATIVE;
2854 result = cl_lock_request(env, io, lock);
2856 /* On success, we need to release the lock */
2858 cl_lock_release(env, lock);
2860 cl_io_fini(env, io);
2861 cl_env_put(env, &refcheck);
2863 /* -ECANCELED indicates a matching lock with a different extent
2864 * was already present, and -EEXIST indicates a matching lock
2865 * on exactly the same extent was already present.
2866 * We convert them to positive values for userspace to make
2867 * recognizing true errors easier.
2868 * Note we can only return these detailed results on async requests,
2869 * as sync requests look the same as i/o requests for locking. */
2870 if (result == -ECANCELED)
2871 result = LLA_RESULT_DIFFERENT;
2872 else if (result == -EEXIST)
2873 result = LLA_RESULT_SAME;
/* Printable names for lu_ladvise_type values, used in debug messages. */
2878 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/* Validate one llapi_lu_ladvise entry before acting on it: checks the
 * advice id is known, the per-advice flags are within the allowed mask,
 * a lockahead mode is valid, and the start/end extent is sane.
 * Returns 0 if the advice is acceptable, negative errno otherwise. */
2880 static int ll_ladvise_sanity(struct inode *inode,
2881 struct llapi_lu_ladvise *ladvise)
2883 enum lu_ladvise_type advice = ladvise->lla_advice;
2884 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2885 * be in the first 32 bits of enum ladvise_flags */
2886 __u32 flags = ladvise->lla_peradvice_flags;
2887 /* 3 lines at 80 characters per line, should be plenty */
2890 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2892 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2893 "last supported advice is %s (value '%d'): rc = %d\n",
2894 ll_get_fsname(inode->i_sb, NULL, 0), advice,
2895 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
2899 /* Per-advice checks */
2901 case LU_LADVISE_LOCKNOEXPAND:
2902 if (flags & ~LF_LOCKNOEXPAND_MASK) {
2904 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2906 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2907 ladvise_names[advice], rc);
2911 case LU_LADVISE_LOCKAHEAD:
2912 /* Currently only READ and WRITE modes can be requested */
2913 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
2914 ladvise->lla_lockahead_mode == 0) {
2916 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
2918 ll_get_fsname(inode->i_sb, NULL, 0),
2919 ladvise->lla_lockahead_mode,
2920 ladvise_names[advice], rc);
2923 case LU_LADVISE_WILLREAD:
2924 case LU_LADVISE_DONTNEED:
2926 /* Note fall through above - These checks apply to all advices
2927 * except LOCKNOEXPAND */
2928 if (flags & ~LF_DEFAULT_MASK) {
2930 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
2932 ll_get_fsname(inode->i_sb, NULL, 0), flags,
2933 ladvise_names[advice], rc);
/* Extent must be non-empty: start strictly below end. */
2936 if (ladvise->lla_start >= ladvise->lla_end) {
2938 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
2939 "for %s: rc = %d\n",
2940 ll_get_fsname(inode->i_sb, NULL, 0),
2941 ladvise->lla_start, ladvise->lla_end,
2942 ladvise_names[advice], rc);
2954 * Give file access advices
2956 * The ladvise interface is similar to Linux fadvise() system call, except it
2957 * forwards the advices directly from Lustre client to server. The server side
2958 * codes will apply appropriate read-ahead and caching techniques for the
2959 * corresponding files.
2961 * A typical workload for ladvise is e.g. a bunch of different clients are
2962 * doing small random reads of a file, so prefetching pages into OSS cache
2963 * with big linear reads before the random IO is a net benefit. Fetching
2964 * all that data into each client cache with fadvise() may not be, due to
2965 * much more data being sent to the client.
/* Forward a single ladvise entry to the server via a CIT_LADVISE cl_io
 * (see the block comment above for the rationale). */
2967 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
2968 struct llapi_lu_ladvise *ladvise)
2972 struct cl_ladvise_io *lio;
2977 env = cl_env_get(&refcheck);
2979 RETURN(PTR_ERR(env));
2981 io = vvp_env_thread_io(env);
2982 io->ci_obj = ll_i2info(inode)->lli_clob;
2984 /* initialize parameters for ladvise */
2985 lio = &io->u.ci_ladvise;
2986 lio->li_start = ladvise->lla_start;
2987 lio->li_end = ladvise->lla_end;
2988 lio->li_fid = ll_inode2fid(inode);
2989 lio->li_advice = ladvise->lla_advice;
2990 lio->li_flags = flags;
/* Drive the advice through the cl_io state machine. */
2992 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
2993 rc = cl_io_loop(env, io);
2997 cl_io_fini(env, io);
2998 cl_env_put(env, &refcheck);
/* LU_LADVISE_LOCKNOEXPAND: record a per-fd hint that DLM extent locks for
 * this file descriptor should not be expanded.  LF_UNSET clears the hint. */
3002 static int ll_lock_noexpand(struct file *file, int flags)
3004 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3006 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/* LL_IOC_FSGETXATTR: report the inode's xflags (including PROJINHERIT)
 * and project id to userspace in a struct fsxattr. */
3011 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3014 struct fsxattr fsxattr;
3016 if (copy_from_user(&fsxattr,
3017 (const struct fsxattr __user *)arg,
/* Fill in the fields Lustre knows about, then copy back to the caller. */
3021 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3022 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3023 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3024 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3025 if (copy_to_user((struct fsxattr __user *)arg,
3026 &fsxattr, sizeof(fsxattr)))
/* LL_IOC_FSSETXATTR: change the inode's project id and/or xflags.
 * The update is sent to the MDT with md_setattr() first, then mirrored to
 * the OST objects with cl_setattr_ost().  CAP_SYS_ADMIN only, since
 * project id changes are privileged. */
3032 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3036 struct md_op_data *op_data;
3037 struct ptlrpc_request *req = NULL;
3039 struct fsxattr fsxattr;
3040 struct cl_object *obj;
3044 /* only root could change project ID */
3045 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
3048 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3049 LUSTRE_OPC_ANY, NULL)
3050 if (IS_ERR(op_data))
3051 RETURN(PTR_ERR(op_data));
3053 if (copy_from_user(&fsxattr,
3054 (const struct fsxattr __user *)arg,
3056 GOTO(out_fsxattr, rc = -EFAULT);
/* Translate userspace xflags into on-disk inode flags for the MDT. */
3058 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3059 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3060 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3061 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3062 op_data->op_projid = fsxattr.fsx_projid;
3063 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
/* Step 1: update the MDT inode. */
3064 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3066 ptlrpc_req_finished(req);
3068 GOTO(out_fsxattr, rc);
3069 ll_update_inode_flags(inode, op_data->op_attr_flags);
3070 obj = ll_i2info(inode)->lli_clob;
3072 GOTO(out_fsxattr, rc);
3074 OBD_ALLOC_PTR(attr);
3076 GOTO(out_fsxattr, rc = -ENOMEM);
/* Step 2: propagate the new flags to the OST objects. */
3078 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3079 fsxattr.fsx_xflags);
3082 ll_finish_md_op_data(op_data);
/* LL_LEASE_UNLCK handler: release the lease held on this file descriptor.
 * Depending on ioc->lil_flags the close may carry an intent to the MDT:
 *   LL_LEASE_RESYNC_DONE  - mirror resync finished (payload: id array),
 *   LL_LEASE_LAYOUT_MERGE - merge the layout of a victim fd,
 *   LL_LEASE_LAYOUT_SPLIT - split one mirror out into a victim fd.
 * On success returns the lease type that was held (LL_LEASE_*LCK bits),
 * otherwise a negative errno. */
3086 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3089 struct inode *inode = file_inode(file);
3090 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3091 struct ll_inode_info *lli = ll_i2info(inode);
3092 struct obd_client_handle *och = NULL;
3093 struct split_param sp;
3096 enum mds_op_bias bias = 0;
3097 struct file *layout_file = NULL;
3099 size_t data_size = 0;
/* Detach the lease handle from the fd under lli_och_mutex; -ENOLCK if
 * no lease was held. */
3103 mutex_lock(&lli->lli_och_mutex);
3104 if (fd->fd_lease_och != NULL) {
3105 och = fd->fd_lease_och;
3106 fd->fd_lease_och = NULL;
3108 mutex_unlock(&lli->lli_och_mutex);
3111 GOTO(out, rc = -ENOLCK);
3113 fmode = och->och_flags;
3115 switch (ioc->lil_flags) {
3116 case LL_LEASE_RESYNC_DONE:
/* Userspace passes lil_count resync ids directly after the header. */
3117 if (ioc->lil_count > IOC_IDS_MAX)
3118 GOTO(out, rc = -EINVAL);
3120 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3121 OBD_ALLOC(data, data_size);
3123 GOTO(out, rc = -ENOMEM);
3125 if (copy_from_user(data, (void __user *)arg, data_size))
3126 GOTO(out, rc = -EFAULT);
3128 bias = MDS_CLOSE_RESYNC_DONE;
3130 case LL_LEASE_LAYOUT_MERGE: {
/* One u32 (the victim fd) follows the header in user memory.
 * NOTE(review): &fd here targets a local __u32 declared on a line not
 * visible in this sampled listing, shadowing the outer ll_file_data *fd. */
3133 if (ioc->lil_count != 1)
3134 GOTO(out, rc = -EINVAL);
3136 arg += sizeof(*ioc);
3137 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3138 GOTO(out, rc = -EFAULT);
3140 layout_file = fget(fd);
3142 GOTO(out, rc = -EBADF);
/* Merging modifies both files, so both must be open for write. */
3144 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3145 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3146 GOTO(out, rc = -EPERM);
3148 data = file_inode(layout_file);
3149 bias = MDS_CLOSE_LAYOUT_MERGE;
3152 case LL_LEASE_LAYOUT_SPLIT: {
/* Two u32s follow the header: the victim fd and the mirror id to split. */
3156 if (ioc->lil_count != 2)
3157 GOTO(out, rc = -EINVAL);
3159 arg += sizeof(*ioc);
3160 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3161 GOTO(out, rc = -EFAULT);
3163 arg += sizeof(__u32);
3164 if (copy_from_user(&mirror_id, (void __user *)arg,
3166 GOTO(out, rc = -EFAULT);
3168 layout_file = fget(fdv);
3170 GOTO(out, rc = -EBADF);
3172 sp.sp_inode = file_inode(layout_file);
3173 sp.sp_mirror_id = (__u16)mirror_id;
3175 bias = MDS_CLOSE_LAYOUT_SPLIT;
3179 /* without close intent */
/* Close the lease open handle, carrying the chosen intent (if any). */
3183 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3187 rc = ll_lease_och_release(inode, file);
/* Per-intent cleanup of the payload allocated/referenced above. */
3196 switch (ioc->lil_flags) {
3197 case LL_LEASE_RESYNC_DONE:
3199 OBD_FREE(data, data_size);
3201 case LL_LEASE_LAYOUT_MERGE:
3202 case LL_LEASE_LAYOUT_SPLIT:
/* On success report the lease type that was just given up. */
3209 rc = ll_lease_type_from_fmode(fmode);
/* LL_IOC_SET_LEASE: acquire a read or write lease on this fd, or release
 * one (LL_LEASE_UNLCK delegates to ll_file_unlock_lease()).  When
 * LL_LEASE_RESYNC is set the lease open also starts mirror resync and
 * refreshes the fd's layout version.  Only one lease per fd is kept. */
3213 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3216 struct inode *inode = file_inode(file);
3217 struct ll_inode_info *lli = ll_i2info(inode);
3218 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3219 struct obd_client_handle *och = NULL;
3220 __u64 open_flags = 0;
/* The requested lease mode must match how the file was opened. */
3226 switch (ioc->lil_mode) {
3227 case LL_LEASE_WRLCK:
3228 if (!(file->f_mode & FMODE_WRITE))
3230 fmode = FMODE_WRITE;
3232 case LL_LEASE_RDLCK:
3233 if (!(file->f_mode & FMODE_READ))
3237 case LL_LEASE_UNLCK:
3238 RETURN(ll_file_unlock_lease(file, ioc, arg));
3243 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3245 /* apply for lease */
3246 if (ioc->lil_flags & LL_LEASE_RESYNC)
3247 open_flags = MDS_OPEN_RESYNC;
3248 och = ll_lease_open(inode, file, fmode, open_flags);
3250 RETURN(PTR_ERR(och));
/* For resync, kick off the resync and refresh the layout version; on
 * any failure the freshly-acquired lease is closed again. */
3252 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3253 rc = ll_lease_file_resync(och, inode);
3255 ll_lease_close(och, inode, NULL);
3258 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3260 ll_lease_close(och, inode, NULL);
/* Publish the lease handle on the fd unless one appeared concurrently. */
3266 mutex_lock(&lli->lli_och_mutex);
3267 if (fd->fd_lease_och == NULL) {
3268 fd->fd_lease_och = och;
3271 mutex_unlock(&lli->lli_och_mutex);
3273 /* impossible now that only excl is supported for now */
3274 ll_lease_close(och, inode, &lease_broken);
/* Main ioctl dispatcher for regular Lustre files.  Decodes @cmd and either
 * handles it locally (flags, leases, stats) or forwards it to the MDT/OST
 * through the helpers defined above.  Returns 0 / positive result on
 * success, negative errno on failure. */
3281 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3283 struct inode *inode = file_inode(file);
3284 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3288 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3289 PFID(ll_inode2fid(inode)), inode, cmd);
3290 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3292 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3293 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3297 case LL_IOC_GETFLAGS:
3298 /* Get the current value of the file flags */
3299 return put_user(fd->fd_flags, (int __user *)arg);
3300 case LL_IOC_SETFLAGS:
3301 case LL_IOC_CLRFLAGS:
3302 /* Set or clear specific file flags */
3303 /* XXX This probably needs checks to ensure the flags are
3304 * not abused, and to handle any flag side effects.
3306 if (get_user(flags, (int __user *) arg))
3309 if (cmd == LL_IOC_SETFLAGS) {
3310 if ((flags & LL_FILE_IGNORE_LOCK) &&
3311 !(file->f_flags & O_DIRECT)) {
3312 CERROR("%s: unable to disable locking on "
3313 "non-O_DIRECT file\n", current->comm);
3317 fd->fd_flags |= flags;
3319 fd->fd_flags &= ~flags;
3322 case LL_IOC_LOV_SETSTRIPE:
3323 case LL_IOC_LOV_SETSTRIPE_NEW:
3324 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3325 case LL_IOC_LOV_SETEA:
3326 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3327 case LL_IOC_LOV_SWAP_LAYOUTS: {
3329 struct lustre_swap_layouts lsl;
3331 if (copy_from_user(&lsl, (char __user *)arg,
3332 sizeof(struct lustre_swap_layouts)))
/* Layout swap requires write access on both files. */
3335 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3338 file2 = fget(lsl.sl_fd);
3342 /* O_WRONLY or O_RDWR */
3343 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3344 GOTO(out, rc = -EPERM);
3346 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3347 struct inode *inode2;
3348 struct ll_inode_info *lli;
3349 struct obd_client_handle *och = NULL;
/* Swap-and-close variant needs the lease handle held on this fd. */
3351 lli = ll_i2info(inode);
3352 mutex_lock(&lli->lli_och_mutex);
3353 if (fd->fd_lease_och != NULL) {
3354 och = fd->fd_lease_och;
3355 fd->fd_lease_och = NULL;
3357 mutex_unlock(&lli->lli_och_mutex);
3359 GOTO(out, rc = -ENOLCK);
3360 inode2 = file_inode(file2);
3361 rc = ll_swap_layouts_close(och, inode, inode2);
3363 rc = ll_swap_layouts(file, file2, &lsl);
3369 case LL_IOC_LOV_GETSTRIPE:
3370 case LL_IOC_LOV_GETSTRIPE_NEW:
3371 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3372 case FS_IOC_GETFLAGS:
3373 case FS_IOC_SETFLAGS:
3374 RETURN(ll_iocontrol(inode, file, cmd, arg));
3375 case FSFILT_IOC_GETVERSION:
3376 case FS_IOC_GETVERSION:
3377 RETURN(put_user(inode->i_generation, (int __user *)arg));
3378 /* We need to special case any other ioctls we want to handle,
3379 * to send them to the MDS/OST as appropriate and to properly
3380 * network encode the arg field. */
3381 case FS_IOC_SETVERSION:
3384 case LL_IOC_GROUP_LOCK:
3385 RETURN(ll_get_grouplock(inode, file, arg));
3386 case LL_IOC_GROUP_UNLOCK:
3387 RETURN(ll_put_grouplock(inode, file, arg));
3388 case IOC_OBD_STATFS:
3389 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3391 case LL_IOC_FLUSHCTX:
3392 RETURN(ll_flush_ctx(inode));
3393 case LL_IOC_PATH2FID: {
3394 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3395 sizeof(struct lu_fid)))
3400 case LL_IOC_GETPARENT:
3401 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3403 case OBD_IOC_FID2PATH:
3404 RETURN(ll_fid2path(inode, (void __user *)arg));
3405 case LL_IOC_DATA_VERSION: {
3406 struct ioc_data_version idv;
3409 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Only the flush flags are honoured from userspace. */
3412 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3413 rc = ll_ioc_data_version(inode, &idv);
3416 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3422 case LL_IOC_GET_MDTIDX: {
3425 mdtidx = ll_get_mdt_idx(inode);
3429 if (put_user((int)mdtidx, (int __user *)arg))
3434 case OBD_IOC_GETDTNAME:
3435 case OBD_IOC_GETMDNAME:
3436 RETURN(ll_get_obd_name(inode, cmd, arg));
3437 case LL_IOC_HSM_STATE_GET: {
3438 struct md_op_data *op_data;
3439 struct hsm_user_state *hus;
3446 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3447 LUSTRE_OPC_ANY, hus);
3448 if (IS_ERR(op_data)) {
3450 RETURN(PTR_ERR(op_data));
3453 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3456 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3459 ll_finish_md_op_data(op_data);
3463 case LL_IOC_HSM_STATE_SET: {
3464 struct hsm_state_set *hss;
3471 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3476 rc = ll_hsm_state_set(inode, hss);
3481 case LL_IOC_HSM_ACTION: {
3482 struct md_op_data *op_data;
3483 struct hsm_current_action *hca;
3490 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3491 LUSTRE_OPC_ANY, hca);
3492 if (IS_ERR(op_data)) {
3494 RETURN(PTR_ERR(op_data));
3497 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3500 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3503 ll_finish_md_op_data(op_data);
3507 case LL_IOC_SET_LEASE_OLD: {
/* Legacy lease ioctl: the mode is passed directly in @arg. */
3508 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3510 RETURN(ll_file_set_lease(file, &ioc, 0));
3512 case LL_IOC_SET_LEASE: {
3513 struct ll_ioc_lease ioc;
3515 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3518 RETURN(ll_file_set_lease(file, &ioc, arg));
3520 case LL_IOC_GET_LEASE: {
3521 struct ll_inode_info *lli = ll_i2info(inode);
3522 struct ldlm_lock *lock = NULL;
/* Report the lease type only if the underlying DLM lock is alive. */
3525 mutex_lock(&lli->lli_och_mutex);
3526 if (fd->fd_lease_och != NULL) {
3527 struct obd_client_handle *och = fd->fd_lease_och;
3529 lock = ldlm_handle2lock(&och->och_lease_handle);
3531 lock_res_and_lock(lock);
3532 if (!ldlm_is_cancel(lock))
3533 fmode = och->och_flags;
3535 unlock_res_and_lock(lock);
3536 LDLM_LOCK_PUT(lock);
3539 mutex_unlock(&lli->lli_och_mutex);
3541 RETURN(ll_lease_type_from_fmode(fmode));
3543 case LL_IOC_HSM_IMPORT: {
3544 struct hsm_user_import *hui;
3550 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3555 rc = ll_hsm_import(inode, file, hui);
3560 case LL_IOC_FUTIMES_3: {
3561 struct ll_futimes_3 lfu;
3563 if (copy_from_user(&lfu,
3564 (const struct ll_futimes_3 __user *)arg,
3568 RETURN(ll_file_futimes_3(file, &lfu));
3570 case LL_IOC_LADVISE: {
3571 struct llapi_ladvise_hdr *k_ladvise_hdr;
3572 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3575 int alloc_size = sizeof(*k_ladvise_hdr);
/* First copy just the header to learn how many advices follow. */
3578 u_ladvise_hdr = (void __user *)arg;
3579 OBD_ALLOC_PTR(k_ladvise_hdr);
3580 if (k_ladvise_hdr == NULL)
3583 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3584 GOTO(out_ladvise, rc = -EFAULT);
3586 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3587 k_ladvise_hdr->lah_count < 1)
3588 GOTO(out_ladvise, rc = -EINVAL);
3590 num_advise = k_ladvise_hdr->lah_count;
3591 if (num_advise >= LAH_COUNT_MAX)
3592 GOTO(out_ladvise, rc = -EFBIG);
/* Re-allocate with room for the advice array and re-copy the lot. */
3594 OBD_FREE_PTR(k_ladvise_hdr);
3595 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3596 lah_advise[num_advise]);
3597 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3598 if (k_ladvise_hdr == NULL)
3602 * TODO: submit multiple advices to one server in a single RPC
3604 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3605 GOTO(out_ladvise, rc = -EFAULT);
3607 for (i = 0; i < num_advise; i++) {
3608 struct llapi_lu_ladvise *k_ladvise =
3609 &k_ladvise_hdr->lah_advise[i];
3610 struct llapi_lu_ladvise __user *u_ladvise =
3611 &u_ladvise_hdr->lah_advise[i];
3613 rc = ll_ladvise_sanity(inode, k_ladvise);
3615 GOTO(out_ladvise, rc);
/* LOCKNOEXPAND and LOCKAHEAD are handled locally; everything
 * else goes to the server via ll_ladvise(). */
3617 switch (k_ladvise->lla_advice) {
3618 case LU_LADVISE_LOCKNOEXPAND:
3619 rc = ll_lock_noexpand(file,
3620 k_ladvise->lla_peradvice_flags);
3621 GOTO(out_ladvise, rc);
3622 case LU_LADVISE_LOCKAHEAD:
3624 rc = ll_file_lock_ahead(file, k_ladvise);
3627 GOTO(out_ladvise, rc);
3630 &u_ladvise->lla_lockahead_result))
3631 GOTO(out_ladvise, rc = -EFAULT);
3634 rc = ll_ladvise(inode, file,
3635 k_ladvise_hdr->lah_flags,
3638 GOTO(out_ladvise, rc);
3645 OBD_FREE(k_ladvise_hdr, alloc_size);
3648 case LL_IOC_FLR_SET_MIRROR: {
3649 /* mirror I/O must be direct to avoid polluting page cache
3651 if (!(file->f_flags & O_DIRECT))
3654 fd->fd_designated_mirror = (__u32)arg;
3657 case LL_IOC_FSGETXATTR:
3658 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3659 case LL_IOC_FSSETXATTR:
3660 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3662 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
/* Unknown commands are punted to the data (OST) export. */
3664 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3665 (void __user *)arg));
3669 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Compat helper (kernels without generic_file_llseek_size): validate
 * @offset against sign rules and @maxsize, then commit it to f_pos,
 * invalidating f_version when the position actually changes. */
3670 static inline loff_t
3671 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3673 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3675 if (offset > maxsize)
3678 if (offset != file->f_pos) {
3679 file->f_pos = offset;
3680 file->f_version = 0;
/* Compat copy of the upstream generic_file_llseek_size() for kernels that
 * lack it: handles SEEK_SET/CUR/END plus SEEK_DATA/SEEK_HOLE against the
 * caller-provided @eof, clamping to @maxsize via llseek_execute(). */
3686 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3687 loff_t maxsize, loff_t eof)
3689 struct inode *inode = file_inode(file);
3697 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3698 * position-querying operation. Avoid rewriting the "same"
3699 * f_pos value back to the file because a concurrent read(),
3700 * write() or lseek() might have altered it
3705 * f_lock protects against read/modify/write race with other
3706 * SEEK_CURs. Note that parallel writes and reads behave
3710 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3711 inode_unlock(inode);
3715 * In the generic case the entire file is data, so as long as
3716 * offset isn't at the end of the file then the offset is data.
3723 * There is a virtual hole at the end of the file, so as long as
3724 * offset isn't i_size or larger, return i_size.
3732 return llseek_execute(file, offset, maxsize);
/* llseek entry point for Lustre files.  For SEEK_END/SEEK_HOLE/SEEK_DATA
 * it first glimpses the up-to-date file size from the OSTs, then defers
 * to the size-aware generic llseek with Lustre's maximum file size. */
3736 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
3738 struct inode *inode = file_inode(file);
3739 loff_t retval, eof = 0;
3742 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
3743 (origin == SEEK_CUR) ? file->f_pos : 0);
3744 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
3745 PFID(ll_inode2fid(inode)), inode, retval, retval,
3747 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
/* Size-relative seeks need the cluster-wide size, not the cached one. */
3749 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
3750 retval = ll_glimpse_size(inode);
3753 eof = i_size_read(inode);
3756 retval = ll_generic_file_llseek_size(file, offset, origin,
3757 ll_file_maxbytes(inode), eof);
/* flush() callback (runs on every close(2) of an fd): surface async
 * writeback errors recorded against this inode.  Does not push dirty
 * pages out; it only reports previously-recorded failures, and only once
 * per fd (fd_write_failed suppresses duplicate reports). */
3761 static int ll_flush(struct file *file, fl_owner_t id)
3763 struct inode *inode = file_inode(file);
3764 struct ll_inode_info *lli = ll_i2info(inode);
3765 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3768 LASSERT(!S_ISDIR(inode->i_mode));
3770 /* catch async errors that were recorded back when async writeback
3771 * failed for pages in this mapping. */
3772 rc = lli->lli_async_rc;
3773 lli->lli_async_rc = 0;
3774 if (lli->lli_clob != NULL) {
3775 err = lov_read_and_clear_async_rc(lli->lli_clob);
3780 /* The application has been told write failure already.
3781 * Do not report failure again. */
3782 if (fd->fd_write_failed)
3784 return rc ? -EIO : 0;
3788 * Called to make sure a portion of file has been written out.
3789 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
3791 * Return how many pages have been written.
/* See the block comment above: write out [start, end] of @inode via a
 * CIT_FSYNC cl_io; non-local modes additionally send OST_SYNC RPCs.
 * Returns the number of pages written, or negative errno. */
3793 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
3794 enum cl_fsync_mode mode, int ignore_layout)
3798 struct cl_fsync_io *fio;
3803 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
3804 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
3807 env = cl_env_get(&refcheck);
3809 RETURN(PTR_ERR(env));
3811 io = vvp_env_thread_io(env);
3812 io->ci_obj = ll_i2info(inode)->lli_clob;
3813 io->ci_ignore_layout = ignore_layout;
3815 /* initialize parameters for sync */
3816 fio = &io->u.ci_fsync;
3817 fio->fi_start = start;
3819 fio->fi_fid = ll_inode2fid(inode);
3820 fio->fi_mode = mode;
3821 fio->fi_nr_written = 0;
3823 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
3824 result = cl_io_loop(env, io);
3826 result = io->ci_result;
/* Success: report how many pages were actually written. */
3828 result = fio->fi_nr_written;
3829 cl_io_fini(env, io);
3830 cl_env_put(env, &refcheck);
3836 * When dentry is provided (the 'else' case), file_dentry() may be
3837 * null and dentry must be used directly rather than pulled from
3838 * file_dentry() as is done otherwise.
/* fsync()/fdatasync() entry point.  Three kernel-API variants are
 * compiled depending on the file_operations->fsync prototype; all funnel
 * into the same body: wait for in-flight page I/O, collect recorded async
 * write errors, fsync the MDT inode, then (for regular files) force the
 * data out through cl_sync_file_range(CL_FSYNC_ALL). */
#ifdef HAVE_FILE_FSYNC_4ARGS
3842 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
3844 struct dentry *dentry = file_dentry(file);
3846 #elif defined(HAVE_FILE_FSYNC_2ARGS)
3847 int ll_fsync(struct file *file, int datasync)
3849 struct dentry *dentry = file_dentry(file);
3851 loff_t end = LLONG_MAX;
3853 int ll_fsync(struct file *file, struct dentry *dentry, int datasync)
3856 loff_t end = LLONG_MAX;
3858 struct inode *inode = dentry->d_inode;
3859 struct ll_inode_info *lli = ll_i2info(inode);
3860 struct ptlrpc_request *req;
3864 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
3865 PFID(ll_inode2fid(inode)), inode);
3866 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
3868 #ifdef HAVE_FILE_FSYNC_4ARGS
3869 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
3870 lock_inode = !lli->lli_inode_locked;
3874 /* fsync's caller has already called _fdata{sync,write}, we want
3875 * that IO to finish before calling the osc and mdc sync methods */
3876 rc = filemap_fdatawait(inode->i_mapping);
3879 /* catch async errors that were recorded back when async writeback
3880 * failed for pages in this mapping. */
3881 if (!S_ISDIR(inode->i_mode)) {
3882 err = lli->lli_async_rc;
3883 lli->lli_async_rc = 0;
3886 if (lli->lli_clob != NULL) {
3887 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* Sync the metadata (MDT) side of the inode. */
3893 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
3897 ptlrpc_req_finished(req);
3899 if (S_ISREG(inode->i_mode)) {
3900 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
/* Force the data (OST) side out and remember failure per-fd. */
3902 err = cl_sync_file_range(inode, start, end, CL_FSYNC_ALL, 0);
3903 if (rc == 0 && err < 0)
3906 fd->fd_write_failed = true;
3908 fd->fd_write_failed = false;
3911 #ifdef HAVE_FILE_FSYNC_4ARGS
3913 inode_unlock(inode);
/* flock()/fcntl() byte-range lock entry point.  Translates the VFS
 * file_lock into an LDLM_FLOCK enqueue against the MDT, then mirrors the
 * result into the local VFS lock bookkeeping; if that local step fails
 * the remote lock is rolled back with an LCK_NL (unlock) enqueue. */
3919 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
3921 struct inode *inode = file_inode(file);
3922 struct ll_sb_info *sbi = ll_i2sbi(inode);
3923 struct ldlm_enqueue_info einfo = {
3924 .ei_type = LDLM_FLOCK,
3925 .ei_cb_cp = ldlm_flock_completion_ast,
3926 .ei_cbdata = file_lock,
3928 struct md_op_data *op_data;
3929 struct lustre_handle lockh = { 0 };
3930 union ldlm_policy_data flock = { { 0 } };
3931 int fl_type = file_lock->fl_type;
3937 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
3938 PFID(ll_inode2fid(inode)), file_lock);
3940 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
3942 if (file_lock->fl_flags & FL_FLOCK) {
3943 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
3944 /* flocks are whole-file locks */
3945 flock.l_flock.end = OFFSET_MAX;
3946 /* For flocks owner is determined by the local file desctiptor*/
3947 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
3948 } else if (file_lock->fl_flags & FL_POSIX) {
3949 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
3950 flock.l_flock.start = file_lock->fl_start;
3951 flock.l_flock.end = file_lock->fl_end;
3955 flock.l_flock.pid = file_lock->fl_pid;
3957 /* Somewhat ugly workaround for svc lockd.
3958 * lockd installs custom fl_lmops->lm_compare_owner that checks
3959 * for the fl_owner to be the same (which it always is on local node
3960 * I guess between lockd processes) and then compares pid.
3961 * As such we assign pid to the owner field to make it all work,
3962 * conflict with normal locks is unlikely since pid space and
3963 * pointer space for current->files are not intersecting */
3964 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
3965 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* Map POSIX lock types onto LDLM modes: RDLCK->PR, WRLCK->PW, UNLCK->NL. */
3969 einfo.ei_mode = LCK_PR;
3972 /* An unlock request may or may not have any relation to
3973 * existing locks so we may not be able to pass a lock handle
3974 * via a normal ldlm_lock_cancel() request. The request may even
3975 * unlock a byte range in the middle of an existing lock. In
3976 * order to process an unlock request we need all of the same
3977 * information that is given with a normal read or write record
3978 * lock request. To avoid creating another ldlm unlock (cancel)
3979 * message we'll treat a LCK_NL flock request as an unlock. */
3980 einfo.ei_mode = LCK_NL;
3983 einfo.ei_mode = LCK_PW;
3986 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* Non-waiting commands become non-blocking / test-only enqueues. */
4001 flags = LDLM_FL_BLOCK_NOWAIT;
4007 flags = LDLM_FL_TEST_LOCK;
4010 CERROR("unknown fcntl lock command: %d\n", cmd);
4014 /* Save the old mode so that if the mode in the lock changes we
4015 * can decrement the appropriate reader or writer refcount. */
4016 file_lock->fl_type = einfo.ei_mode;
4018 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4019 LUSTRE_OPC_ANY, NULL);
4020 if (IS_ERR(op_data))
4021 RETURN(PTR_ERR(op_data));
4023 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4024 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4025 flock.l_flock.pid, flags, einfo.ei_mode,
4026 flock.l_flock.start, flock.l_flock.end);
/* Take (or test/release) the lock on the MDT. */
4028 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4031 /* Restore the file lock type if not TEST lock. */
4032 if (!(flags & LDLM_FL_TEST_LOCK))
4033 file_lock->fl_type = fl_type;
/* Mirror the result into the kernel's local lock bookkeeping. */
4035 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
4036 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4037 !(flags & LDLM_FL_TEST_LOCK))
4038 rc2 = locks_lock_file_wait(file, file_lock);
4040 if ((file_lock->fl_flags & FL_FLOCK) &&
4041 (rc == 0 || file_lock->fl_type == F_UNLCK))
4042 rc2 = flock_lock_file_wait(file, file_lock);
4043 if ((file_lock->fl_flags & FL_POSIX) &&
4044 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4045 !(flags & LDLM_FL_TEST_LOCK))
4046 rc2 = posix_lock_file_wait(file, file_lock);
4047 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* Local bookkeeping failed: undo the server-side lock with LCK_NL. */
4049 if (rc2 && file_lock->fl_type != F_UNLCK) {
4050 einfo.ei_mode = LCK_NL;
4051 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4056 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of entry @name under directory @parent via a
 * getattr-by-name RPC to the MDS.  On success *fid is filled in from the
 * reply body and, when @inode is non-NULL, the inode is instantiated.
 * NOTE(review): several lines are elided in this excerpt (rc/ENTRY
 * declarations, error checks after md_getattr_name, the out_req label and
 * final RETURN); comments describe only the visible code.
 */
4061 int ll_get_fid_by_name(struct inode *parent, const char *name,
4062 int namelen, struct lu_fid *fid,
4063 struct inode **inode)
4065 struct md_op_data *op_data = NULL;
4066 struct mdt_body *body;
4067 struct ptlrpc_request *req;
/* Build op_data keyed by parent + name for the getattr-by-name RPC. */
4071 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4072 LUSTRE_OPC_ANY, NULL);
4073 if (IS_ERR(op_data))
4074 RETURN(PTR_ERR(op_data));
/* Only the FID and file type are requested from the server. */
4076 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4077 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4078 ll_finish_md_op_data(op_data);
/* Missing reply body is a protocol error -- bail out via out_req. */
4082 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4084 GOTO(out_req, rc = -EFAULT);
4086 *fid = body->mbo_fid1;
/* Presumably guarded by "if (inode != NULL)" on an elided line. */
4089 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4091 ptlrpc_req_finished(req);
/*
 * Migrate the entry @name under @parent to another MDT, described by the
 * user-supplied LMV layout @lum.  Implemented as a same-name md_rename()
 * carrying CLI_MIGRATE; regular files are first opened under a write
 * lease so data version / open handle can accompany the request.
 * NOTE(review): many lines are elided in this excerpt (declarations of
 * rc/qstr, several error checks, the out_close/out_unlock/out_iput labels,
 * dput/iput cleanup and the retry jump); comments describe only what is
 * visible.
 */
4095 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4098 struct dentry *dchild = NULL;
4099 struct inode *child_inode = NULL;
4100 struct md_op_data *op_data;
4101 struct ptlrpc_request *request = NULL;
4102 struct obd_client_handle *och = NULL;
4104 struct mdt_body *body;
4105 __u64 data_version = 0;
4106 size_t namelen = strlen(name);
4107 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4111 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4112 PFID(ll_inode2fid(parent)), name,
4113 lum->lum_stripe_offset, lum->lum_stripe_count);
/* Byte-swap the user LMV iff it is not already little-endian on-wire. */
4115 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4116 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4117 lustre_swab_lmv_user_md(lum);
4119 /* Get child FID first */
4120 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
/* Prefer the cached dentry; fall back to an MDS lookup below. */
4123 dchild = d_lookup(file_dentry(file), &qstr);
4125 if (dchild->d_inode)
4126 child_inode = igrab(dchild->d_inode);
4131 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
/* Old servers cannot migrate striped dirs or request striped targets. */
4140 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4141 OBD_CONNECT2_DIR_MIGRATE)) {
4142 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4143 ll_i2info(child_inode)->lli_lsm_md) {
4144 CERROR("%s: MDT doesn't support stripe directory "
4146 ll_get_fsname(parent->i_sb, NULL, 0));
4147 GOTO(out_iput, rc = -EOPNOTSUPP);
4152 * lfs migrate command needs to be blocked on the client
4153 * by checking the migrate FID against the FID of the
/* Refuse to migrate the filesystem root itself. */
4156 if (child_inode == parent->i_sb->s_root->d_inode)
4157 GOTO(out_iput, rc = -EINVAL);
4159 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4160 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4161 if (IS_ERR(op_data))
4162 GOTO(out_iput, rc = PTR_ERR(op_data));
/* Hold the child's inode lock across the whole migrate RPC. */
4164 inode_lock(child_inode);
4165 op_data->op_fid3 = *ll_inode2fid(child_inode);
4166 if (!fid_is_sane(&op_data->op_fid3)) {
4167 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4168 ll_get_fsname(parent->i_sb, NULL, 0), name,
4169 PFID(&op_data->op_fid3));
4170 GOTO(out_unlock, rc = -EINVAL);
/* The rename doubles as a migrate + set-layout operation. */
4173 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4174 op_data->op_data = lum;
4175 op_data->op_data_size = lumlen;
/* Regular files: take a write lease and capture the data version so the
 * MDT can verify the file did not change during migration. */
4178 if (S_ISREG(child_inode->i_mode)) {
4179 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4183 GOTO(out_unlock, rc);
4186 rc = ll_data_version(child_inode, &data_version,
4189 GOTO(out_close, rc);
4191 op_data->op_open_handle = och->och_open_handle;
4192 op_data->op_data_version = data_version;
4193 op_data->op_lease_handle = och->och_lease_handle;
4194 op_data->op_bias |= MDS_CLOSE_MIGRATE;
/* The open request must not be replayed once migrate is in flight. */
4196 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4197 och->och_mod->mod_open_req->rq_replay = 0;
4198 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
/* Same source and target name: this "rename" is the migration itself. */
4201 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4202 name, namelen, &request);
4204 LASSERT(request != NULL);
4205 ll_update_times(request, parent);
4207 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4208 LASSERT(body != NULL);
4210 /* If the server does release layout lock, then we cleanup
4211 * the client och here, otherwise release it in out_close: */
4212 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4213 obd_mod_put(och->och_mod);
4214 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
/* Poison the handle so the later close path skips it. */
4216 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4222 if (request != NULL) {
4223 ptlrpc_req_finished(request);
4227 /* Try again if the file layout has changed. */
4228 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4233 ll_lease_close(och, child_inode, NULL);
/* NOTE(review): clear_nlink() here is presumably under an elided
 * "migration succeeded" condition -- confirm against full source. */
4235 clear_nlink(child_inode);
4237 inode_unlock(child_inode);
4238 ll_finish_md_op_data(op_data);
4245 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4253 * test if some locks matching bits and l_req_mode are acquired
4254 * - bits can be in different locks
4255 * - if found clear the common lock bits in *bits
4256 * - the bits not found, are kept in *bits
4258 * \param bits [IN] searched lock bits
4259 * \param l_req_mode [IN] searched lock mode
4260 * \retval boolean, true iff all bits are found
4262 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4264 struct lustre_handle lockh;
4265 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match all four read/write modes. */
4266 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4267 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4276 fid = &ll_i2info(inode)->lli_fid;
4277 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4278 ldlm_lockname[mode]);
/* TEST_LOCK: only probe for a match, do not take a reference. */
4280 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
/* Probe each inodebit individually until all requested bits are found.
 * NOTE(review): "1 << i" is int-width; fine only while
 * MDS_INODELOCK_MAXSHIFT < 31 -- confirm against the header. */
4281 for (i = 0; i <= MDS_INODELOCK_MAXSHIFT && *bits != 0; i++) {
4282 policy.l_inodebits.bits = *bits & (1 << i);
4283 if (policy.l_inodebits.bits == 0)
4286 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4287 &policy, mode, &lockh)) {
4288 struct ldlm_lock *lock;
4290 lock = ldlm_handle2lock(&lockh);
/* A matched lock may cover more bits than probed: clear them all. */
4293 ~(lock->l_policy_data.l_inodebits.bits);
4294 LDLM_LOCK_PUT(lock);
4296 *bits &= ~policy.l_inodebits.bits;
/*
 * Try to match (and reference) a cached MD inodebits lock covering @bits
 * in one of the modes in @mode.  Returns the matched mode (0 if none);
 * on success the lock handle is returned through @lockh and the caller
 * owns a reference that must be dropped with ldlm_lock_decref().
 */
4303 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4304 struct lustre_handle *lockh, __u64 flags,
4305 enum ldlm_mode mode)
4307 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4312 fid = &ll_i2info(inode)->lli_fid;
4313 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4315 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4316 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC: translate the
 * -ENOENT "already unlinked" case and log other failures.
 */
4321 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4323 /* Already unlinked. Just update nlink and return success */
4324 if (rc == -ENOENT) {
4326 /* If it is striped directory, and there is bad stripe
4327 * Let's revalidate the dentry again, instead of returning
4329 if (S_ISDIR(inode->i_mode) &&
4330 ll_i2info(inode)->lli_lsm_md != NULL)
4333 /* This path cannot be hit for regular files unless in
4334 * case of obscure races, so no need to validate
4336 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4338 } else if (rc != 0) {
/* Expected permission/identity errors are logged at low priority. */
4339 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4340 "%s: revalidate FID "DFID" error: rc = %d\n",
4341 ll_get_fsname(inode->i_sb, NULL, 0),
4342 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode attributes with the MDS by enqueueing an
 * intent lock (@op is IT_GETATTR or IT_LOOKUP).  Unhashes the dentry if
 * the file turns out to be unlinked.
 * NOTE(review): error-handling branches between the visible lines are
 * elided in this excerpt.
 */
4348 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4350 struct inode *inode = dentry->d_inode;
4351 struct obd_export *exp = ll_i2mdexp(inode);
4352 struct lookup_intent oit = {
4355 struct ptlrpc_request *req = NULL;
4356 struct md_op_data *op_data;
4360 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4361 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4363 /* Call getattr by fid, so do not provide name at all. */
4364 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4365 LUSTRE_OPC_ANY, NULL);
4366 if (IS_ERR(op_data))
4367 RETURN(PTR_ERR(op_data));
4369 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4370 ll_finish_md_op_data(op_data);
/* Presumably only on failure of md_intent_lock -- guard elided. */
4372 rc = ll_inode_revalidate_fini(inode, rc);
4376 rc = ll_revalidate_it_finish(req, &oit, dentry);
4378 ll_intent_release(&oit);
4382 /* Unlinked? Unhash dentry, so it is not picked up later by
4383 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4384 * here to preserve get_cwd functionality on 2.6.
4386 if (!dentry->d_inode->i_nlink) {
4387 ll_lock_dcache(inode);
4388 d_lustre_invalidate(dentry, 0);
4389 ll_unlock_dcache(inode);
4392 ll_lookup_finish_locks(&oit, dentry);
4394 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge the attributes of all stripes (nlink,
 * blocks, size, times) into the master inode via md_merge_attr().
 */
4399 static int ll_merge_md_attr(struct inode *inode)
4401 struct cl_attr attr = { 0 };
/* Only meaningful for striped directories (LMV stripe data present). */
4404 LASSERT(ll_i2info(inode)->lli_lsm_md != NULL);
4405 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4406 &attr, ll_md_blocking_ast);
4410 set_nlink(inode, attr.cat_nlink);
4411 inode->i_blocks = attr.cat_blocks;
4412 i_size_write(inode, attr.cat_size);
/* Cache merged times in lli; copied into inode->i_*time by callers. */
4414 ll_i2info(inode)->lli_atime = attr.cat_atime;
4415 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4416 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/* Squash a device number so 32-bit compat stat syscalls accept it. */
4421 static inline dev_t ll_compat_encode_dev(dev_t dev)
4423 /* The compat_sys_*stat*() syscalls will fail unless the
4424 * device majors and minors are both less than 256. Note that
4425 * the value returned here will be passed through
4426 * old_encode_dev() in cp_compat_stat(). And so we are not
4427 * trying to return a valid compat (u16) device number, just
4428 * one that will pass the old_valid_dev() check. */
4430 return MKDEV(MAJOR(dev) & 0xff, MINOR(dev) & 0xff);
/*
 * ->getattr() for Lustre files.  Two signatures are compiled depending
 * on whether the kernel has the path/request_mask getattr API.
 * Revalidates the inode with the MDS, glimpses the size from the OSTs
 * for regular files, merges stripe attributes for striped directories,
 * then fills *stat from the inode.
 */
4433 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4434 int ll_getattr(const struct path *path, struct kstat *stat,
4435 u32 request_mask, unsigned int flags)
4437 struct dentry *de = path->dentry;
4439 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4442 struct inode *inode = de->d_inode;
4443 struct ll_sb_info *sbi = ll_i2sbi(inode);
4444 struct ll_inode_info *lli = ll_i2info(inode);
4447 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4449 rc = ll_inode_revalidate(de, IT_GETATTR);
4453 if (S_ISREG(inode->i_mode)) {
4454 /* In case of restore, the MDT has the right size and has
4455 * already send it back without granting the layout lock,
4456 * inode is up-to-date so glimpse is useless.
4457 * Also to glimpse we need the layout, in case of a running
4458 * restore the MDT holds the layout lock so the glimpse will
4459 * block up to the end of restore (getattr will block)
4461 if (!ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4462 rc = ll_glimpse_size(inode);
4467 /* If object isn't regular a file then don't validate size. */
/* Striped directory: merge per-stripe attrs into the master inode. */
4468 if (S_ISDIR(inode->i_mode) &&
4469 lli->lli_lsm_md != NULL) {
4470 rc = ll_merge_md_attr(inode);
4475 LTIME_S(inode->i_atime) = lli->lli_atime;
4476 LTIME_S(inode->i_mtime) = lli->lli_mtime;
4477 LTIME_S(inode->i_ctime) = lli->lli_ctime;
/* Fault-injection point for getattr latency testing. */
4480 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
/* 32-bit userspace: use FID-derived ino and squashed device numbers. */
4482 if (ll_need_32bit_api(sbi)) {
4483 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4484 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4485 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4487 stat->ino = inode->i_ino;
4488 stat->dev = inode->i_sb->s_dev;
4489 stat->rdev = inode->i_rdev;
4492 stat->mode = inode->i_mode;
4493 stat->uid = inode->i_uid;
4494 stat->gid = inode->i_gid;
4495 stat->atime = inode->i_atime;
4496 stat->mtime = inode->i_mtime;
4497 stat->ctime = inode->i_ctime;
/* Admin-tunable stat blksize overrides the inode's block size. */
4498 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4500 stat->nlink = inode->i_nlink;
4501 stat->size = i_size_read(inode);
4502 stat->blocks = inode->i_blocks;
/*
 * ->fiemap() implementation: marshal the kernel's fiemap_extent_info
 * into a struct fiemap buffer, run the Lustre fiemap, and copy the
 * mapped extents back to userspace.
 */
4507 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4508 __u64 start, __u64 len)
4512 struct fiemap *fiemap;
4513 unsigned int extent_count = fieinfo->fi_extents_max;
/* One allocation for the header plus the caller's extent array. */
4515 num_bytes = sizeof(*fiemap) + (extent_count *
4516 sizeof(struct fiemap_extent));
4517 OBD_ALLOC_LARGE(fiemap, num_bytes);
4522 fiemap->fm_flags = fieinfo->fi_flags;
4523 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4524 fiemap->fm_start = start;
4525 fiemap->fm_length = len;
/* Only the first user extent is copied in: it may carry the
 * continuation cookie (fe_reserved) for a resumed FIEMAP call. */
4526 if (extent_count > 0 &&
4527 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4528 sizeof(struct fiemap_extent)) != 0)
4529 GOTO(out, rc = -EFAULT);
4531 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4533 fieinfo->fi_flags = fiemap->fm_flags;
4534 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
/* Copy every mapped extent back to the user buffer. */
4535 if (extent_count > 0 &&
4536 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4537 fiemap->fm_mapped_extents *
4538 sizeof(struct fiemap_extent)) != 0)
4539 GOTO(out, rc = -EFAULT);
4541 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl(): return a referenced copy of the cached POSIX ACL held in
 * ll_inode_info.  The VFS releases the reference after its check.
 */
4545 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4547 struct ll_inode_info *lli = ll_i2info(inode);
4548 struct posix_acl *acl = NULL;
/* lli_lock guards lli_posix_acl against concurrent update/teardown. */
4551 spin_lock(&lli->lli_lock);
4552 /* VFS' acl_permission_check->check_acl will release the refcount */
4553 acl = posix_acl_dup(lli->lli_posix_acl);
4554 spin_unlock(&lli->lli_lock);
/*
 * ->set_acl(): serialize @acl to its xattr form and push it to the MDS
 * with md_setxattr() (NULL @acl removes the xattr), then refresh the
 * local ACL cache.  Compiled only when the kernel provides the
 * inode_operations->set_acl hook and POSIX ACLs are enabled.
 */
4559 #ifdef HAVE_IOP_SET_ACL
4560 #ifdef CONFIG_FS_POSIX_ACL
4561 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4563 struct ll_sb_info *sbi = ll_i2sbi(inode);
4564 struct ptlrpc_request *req = NULL;
4565 const char *name = NULL;
4567 size_t value_size = 0;
4572 case ACL_TYPE_ACCESS:
4573 name = XATTR_NAME_POSIX_ACL_ACCESS;
/* Access ACL may also rewrite the file mode bits. */
4575 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4578 case ACL_TYPE_DEFAULT:
4579 name = XATTR_NAME_POSIX_ACL_DEFAULT;
/* Default ACLs only make sense on directories. */
4580 if (!S_ISDIR(inode->i_mode))
4581 rc = acl ? -EACCES : 0;
4592 value_size = posix_acl_xattr_size(acl->a_count);
4593 value = kmalloc(value_size, GFP_NOFS);
4595 GOTO(out, rc = -ENOMEM);
4597 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4599 GOTO(out_value, rc);
/* NULL value means "remove the ACL xattr" on the MDS. */
4602 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4603 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4604 name, value, value_size, 0, 0, &req);
4606 ptlrpc_req_finished(req);
/* Keep the VFS ACL cache coherent with the outcome. */
4611 forget_cached_acl(inode, type);
4613 set_cached_acl(inode, type, acl);
4616 #endif /* CONFIG_FS_POSIX_ACL */
4617 #endif /* HAVE_IOP_SET_ACL */
/*
 * ACL permission callback for kernels whose generic_permission() still
 * takes a check_acl function pointer.  Looks up the cached access ACL
 * and evaluates it against @mask.
 */
4619 #ifndef HAVE_GENERIC_PERMISSION_2ARGS
4621 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4622 ll_check_acl(struct inode *inode, int mask, unsigned int flags)
4624 ll_check_acl(struct inode *inode, int mask)
4627 # ifdef CONFIG_FS_POSIX_ACL
4628 struct posix_acl *acl;
/* Cannot sleep to fetch an ACL under RCU-walk. */
4632 # ifdef HAVE_GENERIC_PERMISSION_4ARGS
4633 if (flags & IPERM_FLAG_RCU)
4636 acl = ll_get_acl(inode, ACL_TYPE_ACCESS);
4641 rc = posix_acl_permission(inode, acl, mask);
4642 posix_acl_release(acl);
4645 # else /* !CONFIG_FS_POSIX_ACL */
4647 # endif /* CONFIG_FS_POSIX_ACL */
4649 #endif /* HAVE_GENERIC_PERMISSION_2ARGS */
/*
 * ->permission() for Lustre inodes.  Revalidates the root inode when
 * needed, applies root-squash by temporarily overriding the task
 * credentials, and delegates the actual check to generic_permission().
 * Three signatures are compiled depending on the kernel API.
 */
4651 #ifdef HAVE_GENERIC_PERMISSION_4ARGS
4652 int ll_inode_permission(struct inode *inode, int mask, unsigned int flags)
4654 # ifdef HAVE_INODE_PERMISION_2ARGS
4655 int ll_inode_permission(struct inode *inode, int mask)
4657 int ll_inode_permission(struct inode *inode, int mask, struct nameidata *nd)
4662 struct ll_sb_info *sbi;
4663 struct root_squash_info *squash;
4664 struct cred *cred = NULL;
4665 const struct cred *old_cred = NULL;
4667 bool squash_id = false;
/* RCU-walk cannot block on an MDS round-trip; bail to ref-walk. */
4670 #ifdef MAY_NOT_BLOCK
4671 if (mask & MAY_NOT_BLOCK)
4673 #elif defined(HAVE_GENERIC_PERMISSION_4ARGS)
4674 if (flags & IPERM_FLAG_RCU)
4678 /* as root inode are NOT getting validated in lookup operation,
4679 * need to do it before permission check. */
4681 if (inode == inode->i_sb->s_root->d_inode) {
4682 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4687 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4688 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4690 /* squash fsuid/fsgid if needed */
4691 sbi = ll_i2sbi(inode);
4692 squash = &sbi->ll_squash;
/* Squash only when configured, caller is root, and the mount has not
 * opted out with the norootsquash flag. */
4693 if (unlikely(squash->rsi_uid != 0 &&
4694 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4695 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4699 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4700 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4701 squash->rsi_uid, squash->rsi_gid);
4703 /* update current process's credentials
4704 * and FS capability */
4705 cred = prepare_creds();
4709 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4710 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* Drop all filesystem-related capabilities from the squashed creds. */
4711 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4712 if ((1 << cap) & CFS_CAP_FS_MASK)
4713 cap_lower(cred->cap_effective, cap);
4715 old_cred = override_creds(cred);
4718 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4719 rc = ll_generic_permission(inode, mask, flags, ll_check_acl);
4720 /* restore current process's credentials and FS capability */
4722 revert_creds(old_cred);
4729 /* -o localflock - only provides locally consistent flock locks */
/* Default file_operations: no .flock/.lock hooks, so the kernel's local
 * (single-node) byte-range locking applies. */
4730 struct file_operations ll_file_operations = {
4731 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4732 # ifdef HAVE_SYNC_READ_WRITE
4733 .read = new_sync_read,
4734 .write = new_sync_write,
4736 .read_iter = ll_file_read_iter,
4737 .write_iter = ll_file_write_iter,
4738 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4739 .read = ll_file_read,
4740 .aio_read = ll_file_aio_read,
4741 .write = ll_file_write,
4742 .aio_write = ll_file_aio_write,
4743 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4744 .unlocked_ioctl = ll_file_ioctl,
4745 .open = ll_file_open,
4746 .release = ll_file_release,
4747 .mmap = ll_file_mmap,
4748 .llseek = ll_file_seek,
4749 .splice_read = ll_file_splice_read,
/* file_operations for "-o flock" mounts: cluster-coherent locking via
 * ll_file_flock for both flock(2) (.flock) and fcntl/POSIX (.lock). */
4754 struct file_operations ll_file_operations_flock = {
4755 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4756 # ifdef HAVE_SYNC_READ_WRITE
4757 .read = new_sync_read,
4758 .write = new_sync_write,
4759 # endif /* HAVE_SYNC_READ_WRITE */
4760 .read_iter = ll_file_read_iter,
4761 .write_iter = ll_file_write_iter,
4762 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4763 .read = ll_file_read,
4764 .aio_read = ll_file_aio_read,
4765 .write = ll_file_write,
4766 .aio_write = ll_file_aio_write,
4767 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4768 .unlocked_ioctl = ll_file_ioctl,
4769 .open = ll_file_open,
4770 .release = ll_file_release,
4771 .mmap = ll_file_mmap,
4772 .llseek = ll_file_seek,
4773 .splice_read = ll_file_splice_read,
4776 .flock = ll_file_flock,
4777 .lock = ll_file_flock
4780 /* These are for -o noflock - to return ENOSYS on flock calls */
4781 struct file_operations ll_file_operations_noflock = {
4782 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4783 # ifdef HAVE_SYNC_READ_WRITE
4784 .read = new_sync_read,
4785 .write = new_sync_write,
4786 # endif /* HAVE_SYNC_READ_WRITE */
4787 .read_iter = ll_file_read_iter,
4788 .write_iter = ll_file_write_iter,
4789 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4790 .read = ll_file_read,
4791 .aio_read = ll_file_aio_read,
4792 .write = ll_file_write,
4793 .aio_write = ll_file_aio_write,
4794 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4795 .unlocked_ioctl = ll_file_ioctl,
4796 .open = ll_file_open,
4797 .release = ll_file_release,
4798 .mmap = ll_file_mmap,
4799 .llseek = ll_file_seek,
4800 .splice_read = ll_file_splice_read,
/* Both lock hooks fail with ENOSYS via ll_file_noflock. */
4803 .flock = ll_file_noflock,
4804 .lock = ll_file_noflock
/* inode_operations for regular Lustre files; xattr and ACL hooks are
 * wired only on kernels that expose them in inode_operations. */
4807 struct inode_operations ll_file_inode_operations = {
4808 .setattr = ll_setattr,
4809 .getattr = ll_getattr,
4810 .permission = ll_inode_permission,
4811 #ifdef HAVE_IOP_XATTR
4812 .setxattr = ll_setxattr,
4813 .getxattr = ll_getxattr,
4814 .removexattr = ll_removexattr,
4816 .listxattr = ll_listxattr,
4817 .fiemap = ll_fiemap,
4818 #ifdef HAVE_IOP_GET_ACL
4819 .get_acl = ll_get_acl,
4821 #ifdef HAVE_IOP_SET_ACL
4822 .set_acl = ll_set_acl,
/*
 * Apply a layout configuration @conf to the cl_object behind @inode.
 * For OBJECT_CONF_SET this installs the new layout and then allows the
 * layout DLM lock to be matched; logs a version change when the layout
 * generation moves.
 */
4826 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
4828 struct ll_inode_info *lli = ll_i2info(inode);
4829 struct cl_object *obj = lli->lli_clob;
4838 env = cl_env_get(&refcheck);
4840 RETURN(PTR_ERR(env));
4842 rc = cl_conf_set(env, lli->lli_clob, conf);
4846 if (conf->coc_opc == OBJECT_CONF_SET) {
4847 struct ldlm_lock *lock = conf->coc_lock;
4848 struct cl_layout cl = {
4852 LASSERT(lock != NULL);
4853 LASSERT(ldlm_has_layout(lock));
4855 /* it can only be allowed to match after layout is
4856 * applied to inode otherwise false layout would be
4857 * seen. Applying layout should happen before dropping
4858 * the intent lock. */
4859 ldlm_lock_allow_match(lock);
4861 rc = cl_object_layout_get(env, obj, &cl);
4866 DFID": layout version change: %u -> %u\n",
4867 PFID(&lli->lli_fid), ll_layout_version_get(lli),
4869 ll_layout_version_set(lli, cl.cl_layout_gen);
4873 cl_env_put(env, &refcheck);
4878 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
4879 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
4882 struct ll_sb_info *sbi = ll_i2sbi(inode);
4883 struct ptlrpc_request *req;
4884 struct mdt_body *body;
4891 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
4892 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
4893 lock->l_lvb_data, lock->l_lvb_len);
/* Layout already attached to the lock: nothing to fetch. */
4895 if (lock->l_lvb_data != NULL)
4898 /* if layout lock was granted right away, the layout is returned
4899 * within DLM_LVB of dlm reply; otherwise if the lock was ever
4900 * blocked and then granted via completion ast, we have to fetch
4901 * layout here. Please note that we can't use the LVB buffer in
4902 * completion AST because it doesn't have a large enough buffer */
4903 rc = ll_get_default_mdsize(sbi, &lmmsize);
4905 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4906 OBD_MD_FLXATTR, XATTR_NAME_LOV, lmmsize, &req);
4910 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4912 GOTO(out, rc = -EPROTO);
4914 lmmsize = body->mbo_eadatasize;
4915 if (lmmsize == 0) /* empty layout */
4918 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
4920 GOTO(out, rc = -EFAULT);
/* Copy the layout out of the reply so the lock owns its own buffer. */
4922 OBD_ALLOC_LARGE(lvbdata, lmmsize);
4923 if (lvbdata == NULL)
4924 GOTO(out, rc = -ENOMEM);
4926 memcpy(lvbdata, lmm, lmmsize);
/* Attach under the resource lock; another thread may have raced us. */
4927 lock_res_and_lock(lock);
4928 if (unlikely(lock->l_lvb_data == NULL)) {
4929 lock->l_lvb_type = LVB_T_LAYOUT;
4930 lock->l_lvb_data = lvbdata;
4931 lock->l_lvb_len = lmmsize;
4934 unlock_res_and_lock(lock);
/* Lost the race: free our copy (presumably under an elided guard). */
4937 OBD_FREE_LARGE(lvbdata, lmmsize);
4942 ptlrpc_req_finished(req);
4947 * Apply the layout to the inode. Layout lock is held and will be released
4950 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
4951 struct inode *inode)
4953 struct ll_inode_info *lli = ll_i2info(inode);
4954 struct ll_sb_info *sbi = ll_i2sbi(inode);
4955 struct ldlm_lock *lock;
4956 struct cl_object_conf conf;
4959 bool wait_layout = false;
4962 LASSERT(lustre_handle_is_used(lockh));
4964 lock = ldlm_handle2lock(lockh);
4965 LASSERT(lock != NULL);
4966 LASSERT(ldlm_has_layout(lock));
4968 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
4969 PFID(&lli->lli_fid), inode);
4971 /* in case this is a caching lock and reinstate with new inode */
4972 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
4974 lock_res_and_lock(lock);
4975 lvb_ready = ldlm_is_lvb_ready(lock);
4976 unlock_res_and_lock(lock);
4978 /* checking lvb_ready is racy but this is okay. The worst case is
4979 * that multi processes may configure the file on the same time. */
/* Fetch the layout blob into the lock's LVB if it is not there yet. */
4983 rc = ll_layout_fetch(inode, lock);
4987 /* for layout lock, lmm is stored in lock's lvb.
4988 * lvb_data is immutable if the lock is held so it's safe to access it
4991 * set layout to file. Unlikely this will fail as old layout was
4992 * surely eliminated */
4993 memset(&conf, 0, sizeof conf);
4994 conf.coc_opc = OBJECT_CONF_SET;
4995 conf.coc_inode = inode;
4996 conf.coc_lock = lock;
4997 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
4998 conf.u.coc_layout.lb_len = lock->l_lvb_len;
4999 rc = ll_layout_conf(inode, &conf);
5001 /* refresh layout failed, need to wait */
5002 wait_layout = rc == -EBUSY;
/* Drop our lock reference before possibly waiting for IO to drain. */
5005 LDLM_LOCK_PUT(lock);
5006 ldlm_lock_decref(lockh, mode);
5008 /* wait for IO to complete if it's still being used. */
5010 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5011 ll_get_fsname(inode->i_sb, NULL, 0),
5012 PFID(&lli->lli_fid), inode);
/* OBJECT_CONF_WAIT blocks until in-flight IO on the old layout ends. */
5014 memset(&conf, 0, sizeof conf);
5015 conf.coc_opc = OBJECT_CONF_WAIT;
5016 conf.coc_inode = inode;
5017 rc = ll_layout_conf(inode, &conf);
5021 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5022 ll_get_fsname(inode->i_sb, NULL, 0),
5023 PFID(&lli->lli_fid), rc);
5029 * Issue layout intent RPC to MDS.
5030 * \param inode [in] file inode
5031 * \param intent [in] layout intent
5033 * \retval 0 on success
5034 * \retval < 0 error code
5036 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5038 struct ll_inode_info *lli = ll_i2info(inode);
5039 struct ll_sb_info *sbi = ll_i2sbi(inode);
5040 struct md_op_data *op_data;
5041 struct lookup_intent it;
5042 struct ptlrpc_request *req;
5046 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5047 0, 0, LUSTRE_OPC_ANY, NULL);
5048 if (IS_ERR(op_data))
5049 RETURN(PTR_ERR(op_data));
/* The layout_intent structure rides in the RPC as opaque op data. */
5051 op_data->op_data = intent;
5052 op_data->op_data_size = sizeof(*intent);
5054 memset(&it, 0, sizeof(it));
5055 it.it_op = IT_LAYOUT;
/* Write/truncate intents need a write-mode layout lock. */
5056 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5057 intent->li_opc == LAYOUT_INTENT_TRUNC)
5058 it.it_flags = FMODE_WRITE;
5060 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5061 ll_get_fsname(inode->i_sb, NULL, 0),
5062 PFID(&lli->lli_fid), inode);
5064 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5065 &ll_md_blocking_ast, 0);
5066 if (it.it_request != NULL)
5067 ptlrpc_req_finished(it.it_request);
5068 it.it_request = NULL;
5070 ll_finish_md_op_data(op_data);
5072 /* set lock data in case this is a new lock */
5074 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5076 ll_intent_drop_lock(&it);
5082 * This function checks if there exists a LAYOUT lock on the client side,
5083 * or enqueues it if it doesn't have one in cache.
5085 * This function will not hold layout lock so it may be revoked any time after
5086 * this function returns. Any operations depend on layout should be redone
5089 * This function should be called before lov_io_init() to get an uptodate
5090 * layout version, the caller should save the version number and after IO
5091 * is finished, this function should be called again to verify that layout
5092 * is not changed during IO time.
5094 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5096 struct ll_inode_info *lli = ll_i2info(inode);
5097 struct ll_sb_info *sbi = ll_i2sbi(inode);
5098 struct lustre_handle lockh;
5099 struct layout_intent intent = {
5100 .li_opc = LAYOUT_INTENT_ACCESS,
5102 enum ldlm_mode mode;
/* Fast path: a non-NONE cached generation (or layout lock disabled)
 * means no RPC is needed. */
5106 *gen = ll_layout_version_get(lli);
5107 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
/* Layout locks only exist for regular files with sane FIDs. */
5111 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5112 LASSERT(S_ISREG(inode->i_mode));
5114 /* take layout lock mutex to enqueue layout lock exclusively. */
5115 mutex_lock(&lli->lli_layout_mutex);
5118 /* mostly layout lock is caching on the local side, so try to
5119 * match it before grabbing layout lock mutex. */
5120 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5121 LCK_CR | LCK_CW | LCK_PR | LCK_PW);
5122 if (mode != 0) { /* hit cached lock */
5123 rc = ll_layout_lock_set(&lockh, mode, inode);
/* Cache miss: enqueue a LAYOUT_INTENT_ACCESS intent to the MDS
 * (presumably retried on -EAGAIN via an elided loop). */
5129 rc = ll_layout_intent(inode, &intent);
5135 *gen = ll_layout_version_get(lli);
5136 mutex_unlock(&lli->lli_layout_mutex);
5142 * Issue layout intent RPC indicating where in a file an IO is about to write.
5144 * \param[in] inode file inode.
5145 * \param[in] ext write range with start offset of file in bytes where
5146 * an IO is about to write, and exclusive end offset in
5149 * \retval 0 on success
5150 * \retval < 0 error code
5152 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5153 struct lu_extent *ext)
5155 struct layout_intent intent = {
5157 .li_extent.e_start = ext->e_start,
5158 .li_extent.e_end = ext->e_end,
/* Delegates to ll_layout_intent(); @opc presumably fills li_opc on an
 * elided initializer line. */
5163 rc = ll_layout_intent(inode, &intent);
5169 * This function send a restore request to the MDT
5171 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5173 struct hsm_user_request *hur;
5177 len = sizeof(struct hsm_user_request) +
5178 sizeof(struct hsm_user_item);
5179 OBD_ALLOC(hur, len);
5183 hur->hur_request.hr_action = HUA_RESTORE;
5184 hur->hur_request.hr_archive_id = 0;
5185 hur->hur_request.hr_flags = 0;
5186 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5187 sizeof(hur->hur_user_item[0].hui_fid));
5188 hur->hur_user_item[0].hui_extent.offset = offset;
5189 hur->hur_user_item[0].hui_extent.length = length;
5190 hur->hur_request.hr_itemcount = 1;
5191 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,