4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 only,
8 * as published by the Free Software Foundation.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License version 2 for more details (a copy is included
14 * in the LICENSE file that accompanied this code).
16 * You should have received a copy of the GNU General Public License
17 * version 2 along with this program; If not, see
18 * http://www.gnu.org/licenses/gpl-2.0.html
23 * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright (c) 2011, 2017, Intel Corporation.
29 * This file is part of Lustre, http://www.lustre.org/
30 * Lustre is a trademark of Sun Microsystems, Inc.
34 * Author: Peter Braam <braam@clusterfs.com>
35 * Author: Phil Schwan <phil@clusterfs.com>
36 * Author: Andreas Dilger <adilger@clusterfs.com>
39 #define DEBUG_SUBSYSTEM S_LLITE
40 #include <lustre_dlm.h>
41 #include <linux/pagemap.h>
42 #include <linux/file.h>
43 #include <linux/sched.h>
44 #include <linux/user_namespace.h>
45 #ifdef HAVE_UIDGID_HEADER
46 # include <linux/uidgid.h>
49 #include <uapi/linux/lustre/lustre_ioctl.h>
50 #include <lustre_swab.h>
52 #include "cl_object.h"
53 #include "llite_internal.h"
54 #include "vvp_internal.h"
57 struct inode *sp_inode;
62 __u64 pa_data_version;
68 ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg);
70 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
/* Allocate a per-open ll_file_data from its slab cache (GFP_NOFS to avoid
 * re-entering the filesystem under memory pressure) and initialise the
 * write-failure flag and the PCC (Persistent Client Cache) sub-state.
 * NOTE(review): the NULL check of the allocation and the return statement
 * are elided from this excerpt -- confirm against the full source. */
73 static struct ll_file_data *ll_file_data_get(void)
75 struct ll_file_data *fd;
77 OBD_SLAB_ALLOC_PTR_GFP(fd, ll_file_data_slab, GFP_NOFS);
81 fd->fd_write_failed = false;
82 pcc_file_init(&fd->fd_pcc_file);
/* Return a ll_file_data to the slab cache it was allocated from. */
87 static void ll_file_data_put(struct ll_file_data *fd)
90 OBD_SLAB_FREE_PTR(fd, ll_file_data_slab);
94 * Packs all the attributes into @op_data for the CLOSE rpc.
96 static void ll_prepare_close(struct inode *inode, struct md_op_data *op_data,
97 struct obd_client_handle *och)
101 ll_prep_md_op_data(op_data, inode, NULL, NULL,
102 0, 0, LUSTRE_OPC_ANY, NULL);
/* Snapshot the client-side inode attributes so the MDT sees the final
 * state (mode, timestamps, size, blocks) at close time. */
104 op_data->op_attr.ia_mode = inode->i_mode;
105 op_data->op_attr.ia_atime = inode->i_atime;
106 op_data->op_attr.ia_mtime = inode->i_mtime;
107 op_data->op_attr.ia_ctime = inode->i_ctime;
108 op_data->op_attr.ia_size = i_size_read(inode);
109 op_data->op_attr.ia_valid |= (ATTR_MODE | ATTR_ATIME | ATTR_ATIME_SET |
110 ATTR_MTIME | ATTR_MTIME_SET |
112 op_data->op_xvalid |= OP_XVALID_CTIME_SET;
113 op_data->op_attr_blocks = inode->i_blocks;
/* Translate VFS i_flags into the on-wire Lustre flag encoding. */
114 op_data->op_attr_flags = ll_inode_to_ext_flags(inode->i_flags);
115 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
116 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
117 op_data->op_open_handle = och->och_open_handle;
/* Only a writable handle can have dirtied data; test-and-clear so the
 * DATA_MODIFIED hint is sent at most once per modification epoch. */
119 if (och->och_flags & FMODE_WRITE &&
120 ll_file_test_and_clear_flag(ll_i2info(inode), LLIF_DATA_MODIFIED))
121 /* For HSM: if inode data has been modified, pack it so that
122 * MDT can set data dirty flag in the archive. */
123 op_data->op_bias |= MDS_DATA_MODIFIED;
129 * Perform a close, possibly with a bias.
130 * The meaning of "data" depends on the value of "bias".
132 * If \a bias is MDS_HSM_RELEASE then \a data is a pointer to the data version.
133 * If \a bias is MDS_CLOSE_LAYOUT_SWAP then \a data is a pointer to the inode to
136 static int ll_close_inode_openhandle(struct inode *inode,
137 struct obd_client_handle *och,
138 enum mds_op_bias bias, void *data)
140 struct obd_export *md_exp = ll_i2mdexp(inode);
141 const struct ll_inode_info *lli = ll_i2info(inode);
142 struct md_op_data *op_data;
143 struct ptlrpc_request *req = NULL;
/* Without a live MDC export there is nowhere to send the close RPC. */
147 if (class_exp2obd(md_exp) == NULL) {
148 CERROR("%s: invalid MDC connection handle closing "DFID"\n",
149 ll_i2sbi(inode)->ll_fsname, PFID(&lli->lli_fid));
153 OBD_ALLOC_PTR(op_data);
154 /* We leak openhandle and request here on error, but not much to be
155 * done in OOM case since app won't retry close on error either. */
157 GOTO(out, rc = -ENOMEM);
159 ll_prepare_close(inode, op_data, och);
/* Bias-specific packing: each case adds the payload the MDT expects
 * for that close intent. */
161 case MDS_CLOSE_LAYOUT_MERGE:
162 /* merge blocks from the victim inode */
163 op_data->op_attr_blocks += ((struct inode *)data)->i_blocks;
164 op_data->op_attr.ia_valid |= ATTR_SIZE;
165 op_data->op_xvalid |= OP_XVALID_BLOCKS;
/* fallthrough: MERGE intentionally shares the SPLIT/SWAP packing below */
166 case MDS_CLOSE_LAYOUT_SPLIT:
167 case MDS_CLOSE_LAYOUT_SWAP: {
168 struct split_param *sp = data;
170 LASSERT(data != NULL);
171 op_data->op_bias |= bias;
172 op_data->op_data_version = 0;
173 op_data->op_lease_handle = och->och_lease_handle;
/* For SPLIT \a data is a split_param; for MERGE/SWAP it is the victim
 * inode itself, so fid2 comes from a different place in each case. */
174 if (bias == MDS_CLOSE_LAYOUT_SPLIT) {
175 op_data->op_fid2 = *ll_inode2fid(sp->sp_inode);
176 op_data->op_mirror_id = sp->sp_mirror_id;
178 op_data->op_fid2 = *ll_inode2fid(data);
183 case MDS_CLOSE_RESYNC_DONE: {
184 struct ll_ioc_lease *ioc = data;
186 LASSERT(data != NULL);
/* Over-estimate blocks (count per resync'd mirror) so the MDT does not
 * shrink the block count below reality. */
187 op_data->op_attr_blocks +=
188 ioc->lil_count * op_data->op_attr_blocks;
189 op_data->op_attr.ia_valid |= ATTR_SIZE;
190 op_data->op_xvalid |= OP_XVALID_BLOCKS;
191 op_data->op_bias |= MDS_CLOSE_RESYNC_DONE;
193 op_data->op_lease_handle = och->och_lease_handle;
194 op_data->op_data = &ioc->lil_ids[0];
195 op_data->op_data_size =
196 ioc->lil_count * sizeof(ioc->lil_ids[0]);
200 case MDS_PCC_ATTACH: {
201 struct pcc_param *param = data;
203 LASSERT(data != NULL);
/* PCC attach piggybacks on the HSM release path on the MDT side. */
204 op_data->op_bias |= MDS_HSM_RELEASE | MDS_PCC_ATTACH;
205 op_data->op_archive_id = param->pa_archive_id;
206 op_data->op_data_version = param->pa_data_version;
207 op_data->op_lease_handle = och->och_lease_handle;
211 case MDS_HSM_RELEASE:
212 LASSERT(data != NULL);
213 op_data->op_bias |= MDS_HSM_RELEASE;
/* \a data is the data version sampled when the copy was archived. */
214 op_data->op_data_version = *(__u64 *)data;
215 op_data->op_lease_handle = och->och_lease_handle;
216 op_data->op_attr.ia_valid |= ATTR_SIZE;
217 op_data->op_xvalid |= OP_XVALID_BLOCKS;
221 LASSERT(data == NULL);
/* Let the MDT fetch size/blocks lazily from the OSTs unless a biased
 * case above already pinned exact values. */
225 if (!(op_data->op_attr.ia_valid & ATTR_SIZE))
226 op_data->op_xvalid |= OP_XVALID_LAZYSIZE;
227 if (!(op_data->op_xvalid & OP_XVALID_BLOCKS))
228 op_data->op_xvalid |= OP_XVALID_LAZYBLOCKS;
230 rc = md_close(md_exp, op_data, och->och_mod, &req);
/* -EINTR is an expected interruption, not worth a console error. */
231 if (rc != 0 && rc != -EINTR)
232 CERROR("%s: inode "DFID" mdc close failed: rc = %d\n",
233 md_exp->exp_obd->obd_name, PFID(&lli->lli_fid), rc);
/* On success, verify the server actually executed the close intent. */
235 if (rc == 0 && op_data->op_bias & bias) {
236 struct mdt_body *body;
238 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
239 if (!(body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED))
242 if (bias & MDS_PCC_ATTACH) {
243 struct pcc_param *param = data;
/* Hand the new layout generation back to the PCC caller. */
245 param->pa_layout_gen = body->mbo_layout_gen;
249 ll_finish_md_op_data(op_data);
253 md_clear_open_replay_data(md_exp, och);
/* Poison the handle so any stale use is detectable. */
254 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
257 ptlrpc_req_finished(req); /* This is close request */
/* Close the MDS open handle of the given mode (write/exec/read) for this
 * inode, but only when no other local opens still use it.  Called once the
 * last user of a cached open handle goes away. */
261 int ll_md_real_close(struct inode *inode, fmode_t fmode)
263 struct ll_inode_info *lli = ll_i2info(inode);
264 struct obd_client_handle **och_p;
265 struct obd_client_handle *och;
/* Pick the handle slot and use-count matching the open mode.  Modes are
 * checked in write -> exec -> read priority order. */
270 if (fmode & FMODE_WRITE) {
271 och_p = &lli->lli_mds_write_och;
272 och_usecount = &lli->lli_open_fd_write_count;
273 } else if (fmode & FMODE_EXEC) {
274 och_p = &lli->lli_mds_exec_och;
275 och_usecount = &lli->lli_open_fd_exec_count;
277 LASSERT(fmode & FMODE_READ);
278 och_p = &lli->lli_mds_read_och;
279 och_usecount = &lli->lli_open_fd_read_count;
282 mutex_lock(&lli->lli_och_mutex);
283 if (*och_usecount > 0) {
284 /* There are still users of this handle, so skip
286 mutex_unlock(&lli->lli_och_mutex);
292 mutex_unlock(&lli->lli_och_mutex);
295 /* There might be a race and this handle may already
297 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
/* Per-file close path: drop group lock and lease state, decrement the
 * per-mode open counters, and talk to the MDS only when no cached OPEN
 * lock can stand in for the close.  Frees the ll_file_data on exit. */
303 static int ll_md_close(struct inode *inode, struct file *file)
305 union ldlm_policy_data policy = {
306 .l_inodebits = { MDS_INODELOCK_OPEN },
/* TEST_LOCK: only probe for a matching granted OPEN lock, do not take
 * a new reference on it. */
308 __u64 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_TEST_LOCK;
309 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
310 struct ll_inode_info *lli = ll_i2info(inode);
311 struct lustre_handle lockh;
312 enum ldlm_mode lockmode;
316 /* clear group lock, if present */
317 if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED))
318 ll_put_grouplock(inode, file, fd->fd_grouplock.lg_gid);
320 if (fd->fd_lease_och != NULL) {
323 /* Usually the lease is not released when the
324 * application crashed, we need to release here. */
325 rc = ll_lease_close(fd->fd_lease_och, inode, &lease_broken);
326 CDEBUG(rc ? D_ERROR : D_INODE, "Clean up lease "DFID" %d/%d\n",
327 PFID(&lli->lli_fid), rc, lease_broken);
329 fd->fd_lease_och = NULL;
/* fd_och holds ownership of the MDS open handle taken for a lease;
 * close it directly rather than via the shared per-mode slots. */
332 if (fd->fd_och != NULL) {
333 rc = ll_close_inode_openhandle(inode, fd->fd_och, 0, NULL);
338 /* Let's see if we have good enough OPEN lock on the file and if
339 we can skip talking to MDS */
340 mutex_lock(&lli->lli_och_mutex);
341 if (fd->fd_omode & FMODE_WRITE) {
343 LASSERT(lli->lli_open_fd_write_count);
344 lli->lli_open_fd_write_count--;
345 } else if (fd->fd_omode & FMODE_EXEC) {
347 LASSERT(lli->lli_open_fd_exec_count);
348 lli->lli_open_fd_exec_count--;
351 LASSERT(lli->lli_open_fd_read_count);
352 lli->lli_open_fd_read_count--;
354 mutex_unlock(&lli->lli_och_mutex);
356 /* LU-4398: do not cache write open lock if the file has exec bit */
357 if ((lockmode == LCK_CW && inode->i_mode & S_IXUGO) ||
358 !md_lock_match(ll_i2mdexp(inode), flags, ll_inode2fid(inode),
359 LDLM_IBITS, &policy, lockmode, &lockh))
360 rc = ll_md_real_close(inode, fd->fd_omode);
363 LUSTRE_FPRIVATE(file) = NULL;
364 ll_file_data_put(fd);
369 /* While this returns an error code, fput() the caller does not, so we need
370 * to make every effort to clean up all of our state here. Also, applications
371 * rarely check close errors and even if an error is returned they will not
372 * re-try the close call.
374 int ll_file_release(struct inode *inode, struct file *file)
376 struct ll_file_data *fd;
377 struct ll_sb_info *sbi = ll_i2sbi(inode);
378 struct ll_inode_info *lli = ll_i2info(inode);
382 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p)\n",
383 PFID(ll_inode2fid(inode)), inode);
/* Do not count releases of the filesystem root in the stats. */
385 if (inode->i_sb->s_root != file_dentry(file))
386 ll_stats_ops_tally(sbi, LPROC_LL_RELEASE, 1);
387 fd = LUSTRE_FPRIVATE(file);
390 /* The last ref on @file, maybe not the the owner pid of statahead,
391 * because parent and child process can share the same file handle. */
392 if (S_ISDIR(inode->i_mode) && lli->lli_opendir_key == fd)
393 ll_deauthorize_statahead(inode, fd);
/* The root dentry never has an MDS open handle to close; just free the
 * per-open data and return early. */
395 if (inode->i_sb->s_root == file_dentry(file)) {
396 LUSTRE_FPRIVATE(file) = NULL;
397 ll_file_data_put(fd);
401 pcc_file_release(inode, file);
/* For regular files, surface any async write errors recorded against
 * the cl_object on this close. */
403 if (!S_ISDIR(inode->i_mode)) {
404 if (lli->lli_clob != NULL)
405 lov_read_and_clear_async_rc(lli->lli_clob);
406 lli->lli_async_rc = 0;
409 rc = ll_md_close(inode, file);
/* Fault-injection hook: optionally dump the debug log on release. */
411 if (CFS_FAIL_TIMEOUT_MS(OBD_FAIL_PTLRPC_DUMP_LOG, cfs_fail_val))
412 libcfs_debug_dumplog();
/* read_cache_page() filler: copy one page worth of Data-on-MDT inline
 * reply data (described by the niobuf_local in @data) into @page, zero
 * the tail beyond lnb_len, and mark the page up to date. */
417 static inline int ll_dom_readpage(void *data, struct page *page)
419 struct niobuf_local *lnb = data;
422 kaddr = ll_kmap_atomic(page, KM_USER0);
423 memcpy(kaddr, lnb->lnb_data, lnb->lnb_len);
/* Partial last page: zero-fill past the valid data. */
424 if (lnb->lnb_len < PAGE_SIZE)
425 memset(kaddr + lnb->lnb_len, 0,
426 PAGE_SIZE - lnb->lnb_len);
427 flush_dcache_page(page);
428 SetPageUptodate(page);
429 ll_kunmap_atomic(kaddr, KM_USER0);
/* Populate the page cache with Data-on-MDT file data returned inline in
 * the open reply @req, so small-file reads after open need no extra RPC.
 * Silently returns when the reply carries no usable inline buffer. */
435 void ll_dom_finish_open(struct inode *inode, struct ptlrpc_request *req,
436 struct lookup_intent *it)
438 struct ll_inode_info *lli = ll_i2info(inode);
439 struct cl_object *obj = lli->lli_clob;
440 struct address_space *mapping = inode->i_mapping;
442 struct niobuf_remote *rnb;
443 struct mdt_body *body;
445 unsigned long index, start;
446 struct niobuf_local lnb;
453 if (!req_capsule_has_field(&req->rq_pill, &RMF_NIOBUF_INLINE,
457 rnb = req_capsule_server_get(&req->rq_pill, &RMF_NIOBUF_INLINE);
458 if (rnb == NULL || rnb->rnb_len == 0)
461 /* LU-11595: Server may return whole file and that is OK always or
462 * it may return just file tail and its offset must be aligned with
463 * client PAGE_SIZE to be used on that client, if server's PAGE_SIZE is
464 * smaller then offset may be not aligned and that data is just ignored.
466 if (rnb->rnb_offset % PAGE_SIZE)
469 /* Server returns whole file or just file tail if it fills in reply
470 * buffer, in both cases total size should be equal to the file size.
472 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
473 if (rnb->rnb_offset + rnb->rnb_len != body->mbo_dom_size) {
474 CERROR("%s: server returns off/len %llu/%u but size %llu\n",
475 ll_i2sbi(inode)->ll_fsname, rnb->rnb_offset,
476 rnb->rnb_len, body->mbo_dom_size);
480 CDEBUG(D_INFO, "Get data along with open at %llu len %i, size %llu\n",
481 rnb->rnb_offset, rnb->rnb_len, body->mbo_dom_size);
/* Inline payload sits immediately after the niobuf_remote header. */
483 data = (char *)rnb + sizeof(*rnb);
485 lnb.lnb_file_offset = rnb->rnb_offset;
486 start = lnb.lnb_file_offset / PAGE_SIZE;
488 LASSERT(lnb.lnb_file_offset % PAGE_SIZE == 0);
489 lnb.lnb_page_offset = 0;
/* Walk the payload a page at a time; the loop body below is the filler
 * input for read_cache_page(), the last page may be short. */
491 lnb.lnb_data = data + (index << PAGE_SHIFT);
492 lnb.lnb_len = rnb->rnb_len - (index << PAGE_SHIFT);
493 if (lnb.lnb_len > PAGE_SIZE)
494 lnb.lnb_len = PAGE_SIZE;
496 vmpage = read_cache_page(mapping, index + start,
497 ll_dom_readpage, &lnb);
498 if (IS_ERR(vmpage)) {
499 CWARN("%s: cannot fill page %lu for "DFID
500 " with data: rc = %li\n",
501 ll_i2sbi(inode)->ll_fsname, index + start,
502 PFID(lu_object_fid(&obj->co_lu)),
508 } while (rnb->rnb_len > (index << PAGE_SHIFT));
/* Send an intent-OPEN to the MDS for dentry @de, packing optional striping
 * @lmm/@lmmsize, and finish local setup (inode attrs, dentry revalidation,
 * DOM data) from the reply.  Returns 0 or a negative errno; -ENOENT from a
 * create-by-fid race is mapped to -ESTALE so the VFS retries the lookup. */
512 static int ll_intent_file_open(struct dentry *de, void *lmm, int lmmsize,
513 struct lookup_intent *itp)
515 struct ll_sb_info *sbi = ll_i2sbi(de->d_inode);
516 struct dentry *parent = de->d_parent;
519 struct md_op_data *op_data;
520 struct ptlrpc_request *req = NULL;
524 LASSERT(parent != NULL);
525 LASSERT(itp->it_flags & MDS_OPEN_BY_FID);
527 /* if server supports open-by-fid, or file name is invalid, don't pack
528 * name in open request */
529 if (OBD_FAIL_CHECK(OBD_FAIL_LLITE_OPEN_BY_NAME) ||
530 !(exp_connect_flags(sbi->ll_md_exp) & OBD_CONNECT_OPEN_BY_FID)) {
/* Copy the name under d_lock; re-check the length to catch a
 * concurrent rename between the unlocked read and the copy. */
532 len = de->d_name.len;
533 name = kmalloc(len + 1, GFP_NOFS);
538 spin_lock(&de->d_lock);
539 if (len != de->d_name.len) {
540 spin_unlock(&de->d_lock);
544 memcpy(name, de->d_name.name, len);
546 spin_unlock(&de->d_lock);
548 if (!lu_name_is_valid_2(name, len)) {
554 op_data = ll_prep_md_op_data(NULL, parent->d_inode, de->d_inode,
555 name, len, 0, LUSTRE_OPC_ANY, NULL);
556 if (IS_ERR(op_data)) {
558 RETURN(PTR_ERR(op_data));
560 op_data->op_data = lmm;
561 op_data->op_data_size = lmmsize;
563 rc = md_intent_lock(sbi->ll_md_exp, op_data, itp, &req,
564 &ll_md_blocking_ast, 0);
566 ll_finish_md_op_data(op_data);
568 /* reason for keep own exit path - don`t flood log
569 * with messages with -ESTALE errors.
571 if (!it_disposition(itp, DISP_OPEN_OPEN) ||
572 it_open_error(DISP_OPEN_OPEN, itp))
/* An open handle was granted even though the intent failed overall;
 * release it so it is not leaked on the MDS. */
574 ll_release_openhandle(de, itp);
578 if (it_disposition(itp, DISP_LOOKUP_NEG))
579 GOTO(out, rc = -ENOENT);
581 if (rc != 0 || it_open_error(DISP_OPEN_OPEN, itp)) {
582 rc = rc ? rc : it_open_error(DISP_OPEN_OPEN, itp);
583 CDEBUG(D_VFSTRACE, "lock enqueue: err: %d\n", rc);
587 rc = ll_prep_inode(&de->d_inode, req, NULL, itp);
589 if (!rc && itp->it_lock_mode) {
590 struct lustre_handle handle = {.cookie = itp->it_lock_handle};
591 struct ldlm_lock *lock;
592 bool has_dom_bit = false;
594 /* If we got a lock back and it has a LOOKUP bit set,
595 * make sure the dentry is marked as valid so we can find it.
596 * We don't need to care about actual hashing since other bits
597 * of kernel will deal with that later.
599 lock = ldlm_handle2lock(&handle);
601 has_dom_bit = ldlm_has_dom(lock);
602 if (lock->l_policy_data.l_inodebits.bits &
603 MDS_INODELOCK_LOOKUP)
604 d_lustre_revalidate(de);
608 ll_set_lock_data(sbi->ll_md_exp, de->d_inode, itp, NULL);
/* The lock carries a DOM bit: the open reply may hold inline data. */
610 ll_dom_finish_open(de->d_inode, req, itp);
614 ptlrpc_req_finished(req);
615 ll_intent_drop_lock(itp);
617 /* We did open by fid, but by the time we got to the server,
618 * the object disappeared. If this is a create, we cannot really
619 * tell the userspace that the file it was trying to create
620 * does not exist. Instead let's return -ESTALE, and the VFS will
621 * retry the create with LOOKUP_REVAL that we are going to catch
622 * in ll_revalidate_dentry() and use lookup then.
624 if (rc == -ENOENT && itp->it_op & IT_CREAT)
/* Initialise an obd_client_handle from the MDT open reply carried in @it:
 * copy the open handle, fid, lease-lock cookie and flags, then register
 * the handle for open replay on MDS recovery. */
630 static int ll_och_fill(struct obd_export *md_exp, struct lookup_intent *it,
631 struct obd_client_handle *och)
633 struct mdt_body *body;
635 body = req_capsule_server_get(&it->it_request->rq_pill, &RMF_MDT_BODY);
636 och->och_open_handle = body->mbo_open_handle;
637 och->och_fid = body->mbo_fid1;
638 och->och_lease_handle.cookie = it->it_lock_handle;
639 och->och_magic = OBD_CLIENT_HANDLE_MAGIC;
640 och->och_flags = it->it_flags;
642 return md_set_open_replay_data(md_exp, och, it);
/* Attach per-open client state to @file: optionally fill @och from the
 * open intent @it, then install @fd as the file's private data and set up
 * readahead, open-mode and cl_context bookkeeping. */
645 static int ll_local_open(struct file *file, struct lookup_intent *it,
646 struct ll_file_data *fd, struct obd_client_handle *och)
648 struct inode *inode = file_inode(file);
/* A previous private_data would indicate a double open of this file. */
651 LASSERT(!LUSTRE_FPRIVATE(file));
658 rc = ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
663 LUSTRE_FPRIVATE(file) = fd;
664 ll_readahead_init(inode, &fd->fd_ras);
665 fd->fd_omode = it->it_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
667 /* ll_cl_context initialize */
668 rwlock_init(&fd->fd_lock);
669 INIT_LIST_HEAD(&fd->fd_lccs);
674 /* Open a file, and (for the very first open) create objects on the OSTs at
675 * this time. If opened with O_LOV_DELAY_CREATE, then we don't do the object
676 * creation or open until ll_lov_setstripe() ioctl is called.
678 * If we already have the stripe MD locally then we don't request it in
679 * md_open(), by passing a lmm_size = 0.
681 * It is up to the application to ensure no other processes open this file
682 * in the O_LOV_DELAY_CREATE case, or the default striping pattern will be
683 * used. We might be able to avoid races of that sort by getting lli_open_sem
684 * before returning in the O_LOV_DELAY_CREATE case and dropping it here
685 * or in ll_file_release(), but I'm not sure that is desirable/necessary.
687 int ll_file_open(struct inode *inode, struct file *file)
689 struct ll_inode_info *lli = ll_i2info(inode);
690 struct lookup_intent *it, oit = { .it_op = IT_OPEN,
691 .it_flags = file->f_flags };
692 struct obd_client_handle **och_p = NULL;
693 __u64 *och_usecount = NULL;
694 struct ll_file_data *fd;
698 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), flags %o\n",
699 PFID(ll_inode2fid(inode)), inode, file->f_flags);
/* An intent stashed by the lookup path (if any) rides in private_data;
 * take it out before ll_local_open() asserts the field is clear. */
701 it = file->private_data; /* XXX: compat macro */
702 file->private_data = NULL; /* prevent ll_local_open assertion */
704 fd = ll_file_data_get();
706 GOTO(out_nofiledata, rc = -ENOMEM);
709 if (S_ISDIR(inode->i_mode))
710 ll_authorize_statahead(inode, fd);
/* Opening the filesystem root needs no MDS open handle. */
712 if (inode->i_sb->s_root == file_dentry(file)) {
713 LUSTRE_FPRIVATE(file) = fd;
/* No usable intent from lookup: build our own IT_OPEN (oit) from the
 * file flags and go to the MDS ourselves. */
717 if (!it || !it->it_disposition) {
718 /* Convert f_flags into access mode. We cannot use file->f_mode,
719 * because everything but O_ACCMODE mask was stripped from
721 if ((oit.it_flags + 1) & O_ACCMODE)
723 if (file->f_flags & O_TRUNC)
724 oit.it_flags |= FMODE_WRITE;
726 /* kernel only call f_op->open in dentry_open. filp_open calls
727 * dentry_open after call to open_namei that checks permissions.
728 * Only nfsd_open call dentry_open directly without checking
729 * permissions and because of that this code below is safe.
731 if (oit.it_flags & (FMODE_WRITE | FMODE_READ))
732 oit.it_flags |= MDS_OPEN_OWNEROVERRIDE;
734 /* We do not want O_EXCL here, presumably we opened the file
735 * already? XXX - NFS implications? */
736 oit.it_flags &= ~O_EXCL;
738 /* bug20584, if "it_flags" contains O_CREAT, the file will be
739 * created if necessary, then "IT_CREAT" should be set to keep
740 * consistent with it */
741 if (oit.it_flags & O_CREAT)
742 oit.it_op |= IT_CREAT;
748 /* Let's see if we have file open on MDS already. */
749 if (it->it_flags & FMODE_WRITE) {
750 och_p = &lli->lli_mds_write_och;
751 och_usecount = &lli->lli_open_fd_write_count;
752 } else if (it->it_flags & FMODE_EXEC) {
753 och_p = &lli->lli_mds_exec_och;
754 och_usecount = &lli->lli_open_fd_exec_count;
756 och_p = &lli->lli_mds_read_och;
757 och_usecount = &lli->lli_open_fd_read_count;
760 mutex_lock(&lli->lli_och_mutex);
761 if (*och_p) { /* Open handle is present */
762 if (it_disposition(it, DISP_OPEN_OPEN)) {
763 /* Well, there's extra open request that we do not need,
764 let's close it somehow. This will decref request. */
765 rc = it_open_error(DISP_OPEN_OPEN, it);
767 mutex_unlock(&lli->lli_och_mutex);
768 GOTO(out_openerr, rc);
771 ll_release_openhandle(file_dentry(file), it);
/* Reuse the cached handle; och == NULL tells ll_local_open() not to
 * fill a new one. */
775 rc = ll_local_open(file, it, fd, NULL);
778 mutex_unlock(&lli->lli_och_mutex);
779 GOTO(out_openerr, rc);
782 LASSERT(*och_usecount == 0);
783 if (!it->it_disposition) {
784 struct ll_dentry_data *ldd = ll_d2d(file->f_path.dentry);
785 /* We cannot just request lock handle now, new ELC code
786 means that one of other OPEN locks for this file
787 could be cancelled, and since blocking ast handler
788 would attempt to grab och_mutex as well, that would
789 result in a deadlock */
790 mutex_unlock(&lli->lli_och_mutex);
792 * Normally called under two situations:
794 * 2. A race/condition on MDS resulting in no open
795 * handle to be returned from LOOKUP|OPEN request,
796 * for example if the target entry was a symlink.
798 * Only fetch MDS_OPEN_LOCK if this is in NFS path,
799 * marked by a bit set in ll_iget_for_nfs. Clear the
800 * bit so that it's not confusing later callers.
802 * NB; when ldd is NULL, it must have come via normal
803 * lookup path only, since ll_iget_for_nfs always calls
806 if (ldd && ldd->lld_nfs_dentry) {
807 ldd->lld_nfs_dentry = 0;
808 it->it_flags |= MDS_OPEN_LOCK;
812 * Always specify MDS_OPEN_BY_FID because we don't want
813 * to get file with different fid.
815 it->it_flags |= MDS_OPEN_BY_FID;
816 rc = ll_intent_file_open(file_dentry(file), NULL, 0,
819 GOTO(out_openerr, rc);
823 OBD_ALLOC(*och_p, sizeof (struct obd_client_handle));
825 GOTO(out_och_free, rc = -ENOMEM);
829 /* md_intent_lock() didn't get a request ref if there was an
830 * open error, so don't do cleanup on the request here
832 /* XXX (green): Should not we bail out on any error here, not
833 * just open error? */
834 rc = it_open_error(DISP_OPEN_OPEN, it);
836 GOTO(out_och_free, rc);
838 LASSERTF(it_disposition(it, DISP_ENQ_OPEN_REF),
839 "inode %p: disposition %x, status %d\n", inode,
840 it_disposition(it, ~0), it->it_status);
842 rc = ll_local_open(file, it, fd, *och_p);
844 GOTO(out_och_free, rc);
/* Optionally attach the file to the Persistent Client Cache. */
847 rc = pcc_file_open(inode, file);
849 GOTO(out_och_free, rc);
851 mutex_unlock(&lli->lli_och_mutex);
854 /* Must do this outside lli_och_mutex lock to prevent deadlock where
855 different kind of OPEN lock for this same inode gets cancelled
856 by ldlm_cancel_lru */
857 if (!S_ISREG(inode->i_mode))
858 GOTO(out_och_free, rc);
860 cl_lov_delay_create_clear(&file->f_flags);
861 GOTO(out_och_free, rc);
/* Error unwinding: free the handle slot, drop statahead authorization,
 * free the per-open data and release the intent's open-request ref. */
865 if (och_p && *och_p) {
866 OBD_FREE(*och_p, sizeof (struct obd_client_handle));
867 *och_p = NULL; /* OBD_FREE writes some magic there */
870 mutex_unlock(&lli->lli_och_mutex);
873 if (lli->lli_opendir_key == fd)
874 ll_deauthorize_statahead(inode, fd);
877 ll_file_data_put(fd);
879 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_OPEN, 1);
883 if (it && it_disposition(it, DISP_ENQ_OPEN_REF)) {
884 ptlrpc_req_finished(it->it_request);
885 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
/* LDLM blocking AST for lease locks: on a blocking callback, cancel the
 * lease lock asynchronously (the lease is considered broken); the
 * CANCELING phase needs no extra work here. */
891 static int ll_md_blocking_lease_ast(struct ldlm_lock *lock,
892 struct ldlm_lock_desc *desc, void *data, int flag)
895 struct lustre_handle lockh;
899 case LDLM_CB_BLOCKING:
900 ldlm_lock2handle(lock, &lockh);
901 rc = ldlm_cli_cancel(&lockh, LCF_ASYNC);
903 CDEBUG(D_INODE, "ldlm_cli_cancel: %d\n", rc);
907 case LDLM_CB_CANCELING:
915 * When setting a lease on a file, we take ownership of the lli_mds_*_och
916 * and save it as fd->fd_och so as to force client to reopen the file even
917 * if it has an open lock in cache already.
919 static int ll_lease_och_acquire(struct inode *inode, struct file *file,
920 struct lustre_handle *old_open_handle)
922 struct ll_inode_info *lli = ll_i2info(inode);
923 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
924 struct obd_client_handle **och_p;
929 /* Get the openhandle of the file */
930 mutex_lock(&lli->lli_och_mutex);
/* A lease is already held on this open file descriptor. */
931 if (fd->fd_lease_och != NULL)
932 GOTO(out_unlock, rc = -EBUSY);
934 if (fd->fd_och == NULL) {
935 if (file->f_mode & FMODE_WRITE) {
936 LASSERT(lli->lli_mds_write_och != NULL);
937 och_p = &lli->lli_mds_write_och;
938 och_usecount = &lli->lli_open_fd_write_count;
940 LASSERT(lli->lli_mds_read_och != NULL);
941 och_p = &lli->lli_mds_read_och;
942 och_usecount = &lli->lli_open_fd_read_count;
/* The shared handle is in use by other opens; cannot take sole
 * ownership of it for a lease. */
945 if (*och_usecount > 1)
946 GOTO(out_unlock, rc = -EBUSY);
953 *old_open_handle = fd->fd_och->och_open_handle;
957 mutex_unlock(&lli->lli_och_mutex);
962 * Release ownership on lli_mds_*_och when putting back a file lease.
964 static int ll_lease_och_release(struct inode *inode, struct file *file)
966 struct ll_inode_info *lli = ll_i2info(inode);
967 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
968 struct obd_client_handle **och_p;
969 struct obd_client_handle *old_och = NULL;
974 mutex_lock(&lli->lli_och_mutex);
/* Select the per-mode slot the lease-owned handle should return to. */
975 if (file->f_mode & FMODE_WRITE) {
976 och_p = &lli->lli_mds_write_och;
977 och_usecount = &lli->lli_open_fd_write_count;
979 och_p = &lli->lli_mds_read_och;
980 och_usecount = &lli->lli_open_fd_read_count;
983 /* The file may have been open by another process (broken lease) so
984 * *och_p is not NULL. In this case we should simply increase usecount
987 if (*och_p != NULL) {
988 old_och = fd->fd_och;
995 mutex_unlock(&lli->lli_och_mutex);
/* If the slot was already repopulated, close our now-redundant handle
 * outside the mutex. */
998 rc = ll_close_inode_openhandle(inode, old_och, 0, NULL);
1004 * Acquire a lease and open the file.
1006 static struct obd_client_handle *
1007 ll_lease_open(struct inode *inode, struct file *file, fmode_t fmode,
1010 struct lookup_intent it = { .it_op = IT_OPEN };
1011 struct ll_sb_info *sbi = ll_i2sbi(inode);
1012 struct md_op_data *op_data;
1013 struct ptlrpc_request *req = NULL;
1014 struct lustre_handle old_open_handle = { 0 };
1015 struct obd_client_handle *och = NULL;
/* A lease is exclusively read or write, never a combination. */
1020 if (fmode != FMODE_WRITE && fmode != FMODE_READ)
1021 RETURN(ERR_PTR(-EINVAL));
/* The lease mode must be covered by the file's open mode, and exec
 * opens cannot take a lease. */
1024 if (!(fmode & file->f_mode) || (file->f_mode & FMODE_EXEC))
1025 RETURN(ERR_PTR(-EPERM));
1027 rc = ll_lease_och_acquire(inode, file, &old_open_handle);
1029 RETURN(ERR_PTR(rc));
1034 RETURN(ERR_PTR(-ENOMEM));
1036 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
1037 LUSTRE_OPC_ANY, NULL);
1038 if (IS_ERR(op_data))
1039 GOTO(out, rc = PTR_ERR(op_data));
1041 /* To tell the MDT this openhandle is from the same owner */
1042 op_data->op_open_handle = old_open_handle;
1044 it.it_flags = fmode | open_flags;
1045 it.it_flags |= MDS_OPEN_LOCK | MDS_OPEN_BY_FID | MDS_OPEN_LEASE;
1046 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
1047 &ll_md_blocking_lease_ast,
1048 /* LDLM_FL_NO_LRU: To not put the lease lock into LRU list, otherwise
1049 * it can be cancelled which may mislead applications that the lease is
1051 * LDLM_FL_EXCL: Set this flag so that it won't be matched by normal
1052 * open in ll_md_blocking_ast(). Otherwise as ll_md_blocking_lease_ast
1053 * doesn't deal with openhandle, so normal openhandle will be leaked. */
1054 LDLM_FL_NO_LRU | LDLM_FL_EXCL);
1055 ll_finish_md_op_data(op_data);
1056 ptlrpc_req_finished(req);
1058 GOTO(out_release_it, rc);
1060 if (it_disposition(&it, DISP_LOOKUP_NEG))
1061 GOTO(out_release_it, rc = -ENOENT);
1063 rc = it_open_error(DISP_OPEN_OPEN, &it);
1065 GOTO(out_release_it, rc);
1067 LASSERT(it_disposition(&it, DISP_ENQ_OPEN_REF));
1068 ll_och_fill(sbi->ll_md_exp, &it, och);
1070 if (!it_disposition(&it, DISP_OPEN_LEASE)) /* old server? */
1071 GOTO(out_close, rc = -EOPNOTSUPP);
1073 /* already get lease, handle lease lock */
1074 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
1075 if (it.it_lock_mode == 0 ||
1076 it.it_lock_bits != MDS_INODELOCK_OPEN) {
1077 /* open lock must return for lease */
1078 CERROR(DFID "lease granted but no open lock, %d/%llu.\n",
1079 PFID(ll_inode2fid(inode)), it.it_lock_mode,
1081 GOTO(out_close, rc = -EPROTO);
1084 ll_intent_release(&it);
/* Error path: undo in reverse order -- cancel the lease lock, close the
 * open handle on the MDS, then release the intent. */
1088 /* Cancel open lock */
1089 if (it.it_lock_mode != 0) {
1090 ldlm_lock_decref_and_cancel(&och->och_lease_handle,
1092 it.it_lock_mode = 0;
1093 och->och_lease_handle.cookie = 0ULL;
1095 rc2 = ll_close_inode_openhandle(inode, och, 0, NULL);
1097 CERROR("%s: error closing file "DFID": %d\n",
1098 sbi->ll_fsname, PFID(&ll_i2info(inode)->lli_fid), rc2);
1099 och = NULL; /* och has been freed in ll_close_inode_openhandle() */
1101 ll_intent_release(&it);
1105 RETURN(ERR_PTR(rc));
1109 * Check whether a layout swap can be done between two inodes.
1111 * \param[in] inode1 First inode to check
1112 * \param[in] inode2 Second inode to check
1114 * \retval 0 on success, layout swap can be performed between both inodes
1115 * \retval negative error code if requirements are not met
1117 static int ll_check_swap_layouts_validity(struct inode *inode1,
1118 struct inode *inode2)
/* Only regular files have layouts to swap. */
1120 if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode))
/* The caller must be allowed to write both files. */
1123 if (inode_permission(inode1, MAY_WRITE) ||
1124 inode_permission(inode2, MAY_WRITE))
/* Both inodes must live on the same Lustre filesystem. */
1127 if (inode1->i_sb != inode2->i_sb)
/* Close @inode's lease open handle with a MDS_CLOSE_LAYOUT_SWAP bias so
 * the MDT atomically swaps layouts between @inode and @inode2 as part of
 * the close.  Ownership of @och passes to the close path. */
1133 static int ll_swap_layouts_close(struct obd_client_handle *och,
1134 struct inode *inode, struct inode *inode2)
1136 const struct lu_fid *fid1 = ll_inode2fid(inode);
1137 const struct lu_fid *fid2;
1141 CDEBUG(D_INODE, "%s: biased close of file "DFID"\n",
1142 ll_i2sbi(inode)->ll_fsname, PFID(fid1));
1144 rc = ll_check_swap_layouts_validity(inode, inode2);
1146 GOTO(out_free_och, rc);
1148 /* We now know that inode2 is a lustre inode */
1149 fid2 = ll_inode2fid(inode2);
/* Swapping a layout with itself is meaningless. */
1151 rc = lu_fid_cmp(fid1, fid2);
1153 GOTO(out_free_och, rc = -EINVAL);
1155 /* Close the file and {swap,merge} layouts between inode & inode2.
1156 * NB: lease lock handle is released in mdc_close_layout_swap_pack()
1157 * because we still need it to pack l_remote_handle to MDT. */
1158 rc = ll_close_inode_openhandle(inode, och, MDS_CLOSE_LAYOUT_SWAP,
1161 och = NULL; /* freed in ll_close_inode_openhandle() */
1171 * Release lease and close the file.
1172 * It will check if the lease has ever broken.
1174 static int ll_lease_close_intent(struct obd_client_handle *och,
1175 struct inode *inode,
1176 bool *lease_broken, enum mds_op_bias bias,
1179 struct ldlm_lock *lock;
1180 bool cancelled = true;
/* Inspect the lease lock to see whether it was already cancelled,
 * i.e. whether the lease was broken by a conflicting access. */
1184 lock = ldlm_handle2lock(&och->och_lease_handle);
1186 lock_res_and_lock(lock);
1187 cancelled = ldlm_is_cancel(lock);
1188 unlock_res_and_lock(lock);
1189 LDLM_LOCK_PUT(lock);
1192 CDEBUG(D_INODE, "lease for "DFID" broken? %d, bias: %x\n",
1193 PFID(&ll_i2info(inode)->lli_fid), cancelled, bias);
1195 if (lease_broken != NULL)
1196 *lease_broken = cancelled;
/* An intact lease with no close intent: cancel the lease lock
 * ourselves before the plain close. */
1198 if (!cancelled && !bias)
1199 ldlm_cli_cancel(&och->och_lease_handle, 0);
1201 if (cancelled) { /* no need to excute intent */
1206 rc = ll_close_inode_openhandle(inode, och, bias, data);
/* Plain lease release: close with no bias and no intent payload. */
1210 static int ll_lease_close(struct obd_client_handle *och, struct inode *inode,
1213 return ll_lease_close_intent(och, inode, lease_broken, 0, NULL);
1217 * After lease is taken, send the RPC MDS_REINT_RESYNC to the MDT
1219 static int ll_lease_file_resync(struct obd_client_handle *och,
1220 struct inode *inode, unsigned long arg)
1222 struct ll_sb_info *sbi = ll_i2sbi(inode);
1223 struct md_op_data *op_data;
1224 struct ll_ioc_lease_id ioc;
1225 __u64 data_version_unused;
1229 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
1230 LUSTRE_OPC_ANY, NULL);
1231 if (IS_ERR(op_data))
1232 RETURN(PTR_ERR(op_data));
/* @arg is a user-space pointer to a ll_ioc_lease_id from the ioctl. */
1234 if (copy_from_user(&ioc, (struct ll_ioc_lease_id __user *)arg,
1238 /* before starting file resync, it's necessary to clean up page cache
1239 * in client memory, otherwise once the layout version is increased,
1240 * writing back cached data will be denied the OSTs. */
1241 rc = ll_data_version(inode, &data_version_unused, LL_DV_WR_FLUSH);
1245 op_data->op_lease_handle = och->och_lease_handle;
1246 op_data->op_mirror_id = ioc.lil_mirror_id;
1247 rc = md_file_resync(sbi->ll_md_exp, op_data);
1253 ll_finish_md_op_data(op_data);
/*
 * Merge MDS-provided inode attributes with OST-side attributes
 * (size, blocks, timestamps) under the inode size lock.
 * NOTE(review): listing is fragmented; braces, local declarations
 * (mtime/atime/ctime, rc) and RETURN lines are missing.
 */
1257 int ll_merge_attr(const struct lu_env *env, struct inode *inode)
1259 struct ll_inode_info *lli = ll_i2info(inode);
1260 struct cl_object *obj = lli->lli_clob;
1261 struct cl_attr *attr = vvp_env_thread_attr(env);
1269 ll_inode_size_lock(inode);
1271 /* Merge timestamps the most recently obtained from MDS with
1272 * timestamps obtained from OSTs.
1274 * Do not overwrite atime of inode because it may be refreshed
1275 * by file_accessed() function. If the read was served by cache
1276 * data, there is no RPC to be sent so that atime may not be
1277 * transferred to OSTs at all. MDT only updates atime at close time
1278 * if it's at least 'mdd.*.atime_diff' older.
1279 * All in all, the atime in Lustre does not strictly comply with
1280 * POSIX. Solving this problem needs to send an RPC to MDT for each
1281 * read, this will hurt performance.
1283 if (inode->i_atime.tv_sec < lli->lli_atime ||
1284 lli->lli_update_atime) {
1285 inode->i_atime.tv_sec = lli->lli_atime;
1286 lli->lli_update_atime = 0;
/* m/ctime always come from the MDS-cached values first */
1288 inode->i_mtime.tv_sec = lli->lli_mtime;
1289 inode->i_ctime.tv_sec = lli->lli_ctime;
1291 mtime = inode->i_mtime.tv_sec;
1292 atime = inode->i_atime.tv_sec;
1293 ctime = inode->i_ctime.tv_sec;
/* fetch OST-side attributes under the cl_object attr lock */
1295 cl_object_attr_lock(obj);
1296 if (OBD_FAIL_CHECK(OBD_FAIL_MDC_MERGE))
1299 rc = cl_object_attr_get(env, obj, attr);
1300 cl_object_attr_unlock(obj);
/* -ENODATA (no OST objects) is not an error: keep MDS values */
1303 GOTO(out_size_unlock, rc = (rc == -ENODATA ? 0 : rc));
/* keep the newest of MDS vs OST timestamps */
1305 if (atime < attr->cat_atime)
1306 atime = attr->cat_atime;
1308 if (ctime < attr->cat_ctime)
1309 ctime = attr->cat_ctime;
1311 if (mtime < attr->cat_mtime)
1312 mtime = attr->cat_mtime;
1314 CDEBUG(D_VFSTRACE, DFID" updating i_size %llu\n",
1315 PFID(&lli->lli_fid), attr->cat_size);
/* size and block count are authoritative from the OSTs */
1317 i_size_write(inode, attr->cat_size);
1318 inode->i_blocks = attr->cat_blocks;
1320 inode->i_mtime.tv_sec = mtime;
1321 inode->i_atime.tv_sec = atime;
1322 inode->i_ctime.tv_sec = ctime;
1325 ll_inode_size_unlock(inode);
1331 * Set designated mirror for I/O.
1333 * So far only read, write, and truncated can support to issue I/O to
1334 * designated mirror.
/*
 * Stamp the cl_io with the file descriptor's designated FLR mirror (if
 * any) and the matching layout version; otherwise clear the layout
 * version so a restarted I/O does not carry a stale one.
 * NOTE(review): listing is fragmented; braces are missing between the
 * numbered lines below.
 */
1336 void ll_io_set_mirror(struct cl_io *io, const struct file *file)
1338 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1340 /* clear layout version for generic(non-resync) I/O in case it carries
1341 * stale layout version due to I/O restart */
1342 io->ci_layout_version = 0;
1344 /* FLR: disable non-delay for designated mirror I/O because obviously
1345 * only one mirror is available */
1346 if (fd->fd_designated_mirror > 0) {
1348 io->ci_designated_mirror = fd->fd_designated_mirror;
1349 io->ci_layout_version = fd->fd_layout_version;
/* fixed typo in debug message: "desiginated" -> "designated" */
1352 CDEBUG(D_VFSTRACE, "%s: designated mirror: %d\n",
1353 file->f_path.dentry->d_name.name, io->ci_designated_mirror);
/*
 * Decide whether atime updates should be skipped for this open file,
 * mirroring the kernel's file_accessed()/touch_atime() checks: per-open
 * O_NOATIME, inode flags, mount flags, and nodiratime on directories.
 * NOTE(review): the "return true;" bodies of these if-statements and the
 * final return are on lines missing from this listing — confirm against
 * the full source.
 */
1356 static bool file_is_noatime(const struct file *file)
1358 const struct vfsmount *mnt = file->f_path.mnt;
1359 const struct inode *inode = file_inode((struct file *)file);
1361 /* Adapted from file_accessed() and touch_atime().*/
1362 if (file->f_flags & O_NOATIME)
1365 if (inode->i_flags & S_NOATIME)
1368 if (IS_NOATIME(inode))
1371 if (mnt->mnt_flags & (MNT_NOATIME | MNT_READONLY))
1374 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1377 if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))
/*
 * Initialize a cl_io for a read or write: nonblock/sync flags, lock
 * request mode, noatime, readahead and FLR mirror settings.
 * NOTE(review): listing is fragmented; braces and the tail of the
 * wr_sync expression are on missing lines.
 */
1383 void ll_io_init(struct cl_io *io, struct file *file, enum cl_io_type iot,
1384 struct vvp_io_args *args)
1386 struct inode *inode = file_inode(file);
1387 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1389 io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK;
1390 io->ci_lock_no_expand = fd->ll_lock_no_expand;
1392 if (iot == CIT_WRITE) {
1393 io->u.ci_wr.wr_append = !!(file->f_flags & O_APPEND);
/* sync write if O_SYNC/O_DIRECT (expression continues on missing line) */
1394 io->u.ci_wr.wr_sync = !!(file->f_flags & O_SYNC ||
1395 file->f_flags & O_DIRECT ||
1397 #ifdef HAVE_GENERIC_WRITE_SYNC_2ARGS
/* newer kernels signal per-iocb sync via IOCB_DSYNC */
1398 io->u.ci_wr.wr_sync |= !!(args &&
1399 args->via_io_subtype == IO_NORMAL &&
1400 args->u.normal.via_iocb->ki_flags & IOCB_DSYNC);
1404 io->ci_obj = ll_i2info(inode)->lli_clob;
1405 io->ci_lockreq = CILR_MAYBE;
/* no-lock mounts never take DLM locks; O_APPEND must lock to EOF */
1406 if (ll_file_nolock(file)) {
1407 io->ci_lockreq = CILR_NEVER;
1408 io->ci_no_srvlock = 1;
1409 } else if (file->f_flags & O_APPEND) {
1410 io->ci_lockreq = CILR_MANDATORY;
1412 io->ci_noatime = file_is_noatime(file);
1413 io->ci_async_readahead = false;
1415 /* FLR: only use non-delay I/O for read as there is only one
1416 * avaliable mirror for write. */
1417 io->ci_ndelay = !(iot == CIT_WRITE);
1419 ll_io_set_mirror(io, file);
/*
 * Account one I/O sample plus its byte count into the inode's file-heat
 * instances, unless file heat is disabled globally or per-inode.
 * NOTE(review): listing is fragmented; braces/RETURN and the else branch
 * for other iot values are on missing lines.
 */
1422 static void ll_heat_add(struct inode *inode, enum cl_io_type iot,
1425 struct ll_inode_info *lli = ll_i2info(inode);
1426 struct ll_sb_info *sbi = ll_i2sbi(inode);
1427 enum obd_heat_type sample_type;
1428 enum obd_heat_type iobyte_type;
1429 __u64 now = ktime_get_real_seconds();
/* bail out when heat tracking is off for the fs or this inode */
1431 if (!ll_sbi_has_file_heat(sbi) ||
1432 lli->lli_heat_flags & LU_HEAT_FLAG_OFF)
1435 if (iot == CIT_READ) {
1436 sample_type = OBD_HEAT_READSAMPLE;
1437 iobyte_type = OBD_HEAT_READBYTE;
1438 } else if (iot == CIT_WRITE) {
1439 sample_type = OBD_HEAT_WRITESAMPLE;
1440 iobyte_type = OBD_HEAT_WRITEBYTE;
/* one sample event plus the byte count, decayed over the heat period */
1445 spin_lock(&lli->lli_heat_lock);
1446 obd_heat_add(&lli->lli_heat_instances[sample_type], now, 1,
1447 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1448 obd_heat_add(&lli->lli_heat_instances[iobyte_type], now, count,
1449 sbi->ll_heat_decay_weight, sbi->ll_heat_period_second);
1450 spin_unlock(&lli->lli_heat_lock);
/*
 * Common read/write engine: builds a cl_io, takes the range lock for
 * writes (and O_DIRECT reads), runs cl_io_loop(), restarts the I/O when
 * the layout changes, and tallies stats/heat on the way out.
 * NOTE(review): listing is heavily fragmented — return type, braces,
 * several locals (io, rc, result), case labels and the restart goto are
 * on missing lines; comments below stick to what is visible.
 */
1454 ll_file_io_generic(const struct lu_env *env, struct vvp_io_args *args,
1455 struct file *file, enum cl_io_type iot,
1456 loff_t *ppos, size_t count)
1458 struct vvp_io *vio = vvp_env_io(env);
1459 struct inode *inode = file_inode(file);
1460 struct ll_inode_info *lli = ll_i2info(inode);
1461 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
1462 struct range_lock range;
1466 unsigned retried = 0;
1467 bool restarted = false;
1471 CDEBUG(D_VFSTRACE, "%s: %s ppos: %llu, count: %zu\n",
1472 file_dentry(file)->d_name.name,
1473 iot == CIT_READ ? "read" : "write", *ppos, count);
/* (re)initialize the io; carries the FLR non-delay retry count forward */
1476 io = vvp_env_thread_io(env);
1477 ll_io_init(io, file, iot, args);
1478 io->ci_ndelay_tried = retried;
1480 if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) {
1481 bool range_locked = false;
/* O_APPEND writes must lock to EOF; otherwise lock just this range */
1483 if (file->f_flags & O_APPEND)
1484 range_lock_init(&range, 0, LUSTRE_EOF);
1486 range_lock_init(&range, *ppos, *ppos + count - 1);
1488 vio->vui_fd = LUSTRE_FPRIVATE(file);
1489 vio->vui_io_subtype = args->via_io_subtype;
1491 switch (vio->vui_io_subtype) {
1493 vio->vui_iter = args->u.normal.via_iter;
1494 vio->vui_iocb = args->u.normal.via_iocb;
1495 /* Direct IO reads must also take range lock,
1496 * or multiple reads will try to work on the same pages
1497 * See LU-6227 for details. */
1498 if (((iot == CIT_WRITE) ||
1499 (iot == CIT_READ && (file->f_flags & O_DIRECT))) &&
1500 !(vio->vui_fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
1501 CDEBUG(D_VFSTRACE, "Range lock "RL_FMT"\n",
1503 rc = range_lock(&lli->lli_write_tree, &range);
1507 range_locked = true;
/* splice path: pipe target and flags instead of an iov_iter */
1511 vio->u.splice.vui_pipe = args->u.splice.via_pipe;
1512 vio->u.splice.vui_flags = args->u.splice.via_flags;
1515 CERROR("unknown IO subtype %u\n", vio->vui_io_subtype);
/* register the io for page fault re-entry, then run it */
1519 ll_cl_add(file, env, io, LCC_RW);
1520 rc = cl_io_loop(env, io);
1521 ll_cl_remove(file, env);
1524 CDEBUG(D_VFSTRACE, "Range unlock "RL_FMT"\n",
1526 range_unlock(&lli->lli_write_tree, &range);
1529 /* cl_io_rw_init() handled IO */
/* accumulate partial progress and advance the file position */
1533 if (io->ci_nob > 0) {
1534 result += io->ci_nob;
1535 count -= io->ci_nob;
1536 *ppos = io->u.ci_wr.wr.crw_pos; /* for splice */
1538 /* prepare IO restart */
1539 if (count > 0 && args->via_io_subtype == IO_NORMAL)
1540 args->u.normal.via_iter = vio->vui_iter;
1543 cl_io_fini(env, io);
1546 "%s: %d io complete with rc: %d, result: %zd, restart: %d\n",
1547 file->f_path.dentry->d_name.name,
1548 iot, rc, result, io->ci_need_restart);
/* layout change (or FLR retry) while data remains: redo the io */
1550 if ((rc == 0 || rc == -ENODATA) && count > 0 && io->ci_need_restart) {
1552 "%s: restart %s from %lld, count: %zu, ret: %zd, rc: %d\n",
1553 file_dentry(file)->d_name.name,
1554 iot == CIT_READ ? "read" : "write",
1555 *ppos, count, result, rc);
1556 /* preserve the tried count for FLR */
1557 retried = io->ci_ndelay_tried;
/* per-direction stats; writes also track fd_write_failed state */
1562 if (iot == CIT_READ) {
1564 ll_stats_ops_tally(ll_i2sbi(inode),
1565 LPROC_LL_READ_BYTES, result);
1566 } else if (iot == CIT_WRITE) {
1568 ll_stats_ops_tally(ll_i2sbi(inode),
1569 LPROC_LL_WRITE_BYTES, result);
1570 fd->fd_write_failed = false;
1571 } else if (result == 0 && rc == 0) {
1574 fd->fd_write_failed = true;
1576 fd->fd_write_failed = false;
1577 } else if (rc != -ERESTARTSYS) {
1578 fd->fd_write_failed = true;
1582 CDEBUG(D_VFSTRACE, "iot: %d, result: %zd\n", iot, result);
1584 ll_heat_add(inode, iot, result);
/* bytes transferred if any, else the error code */
1586 RETURN(result > 0 ? result : rc);
1590 * The purpose of fast read is to overcome per I/O overhead and improve IOPS
1591 * especially for small I/O.
1593 * To serve a read request, CLIO has to create and initialize a cl_io and
1594 * then request DLM lock. This has turned out to have siginificant overhead
1595 * and affects the performance of small I/O dramatically.
1597 * It's not necessary to create a cl_io for each I/O. Under the help of read
1598 * ahead, most of the pages being read are already in memory cache and we can
1599 * read those pages directly because if the pages exist, the corresponding DLM
1600 * lock must exist so that page content must be valid.
1602 * In fast read implementation, the llite speculatively finds and reads pages
1603 * in memory cache. There are three scenarios for fast read:
1604 * - If the page exists and is uptodate, kernel VM will provide the data and
1605 * CLIO won't be intervened;
1606 * - If the page was brought into memory by read ahead, it will be exported
1607 * and read ahead parameters will be updated;
1608 * - Otherwise the page is not in memory, we can't do fast read. Therefore,
1609 * it will go back and invoke normal read, i.e., a cl_io will be created
1610 * and DLM lock will be requested.
1612 * POSIX compliance: posix standard states that read is intended to be atomic.
1613 * Lustre read implementation is in line with Linux kernel read implementation
1614 * and neither of them complies with POSIX standard in this matter. Fast read
1615 * doesn't make the situation worse on single node but it may interleave write
1616 * results from multiple nodes due to short read handling in ll_file_aio_read().
1618 * \param env - lu_env
1619 * \param iocb - kiocb from kernel
1620 * \param iter - user space buffers where the data will be copied
1622 * \retval - number of bytes have been read, or error code if error occurred.
/*
 * NOTE(review): listing is fragmented; return type, braces and the final
 * RETURN are on missing lines.
 */
1625 ll_do_fast_read(struct kiocb *iocb, struct iov_iter *iter)
/* feature must be enabled on this superblock */
1629 if (!ll_sbi_has_fast_read(ll_i2sbi(file_inode(iocb->ki_filp))))
1632 /* NB: we can't do direct IO for fast read because it will need a lock
1633 * to make IO engine happy. */
1634 if (iocb->ki_filp->f_flags & O_DIRECT)
/* serve straight from the page cache via the generic path */
1637 result = generic_file_read_iter(iocb, iter);
1639 /* If the first page is not in cache, generic_file_aio_read() will be
1640 * returned with -ENODATA.
1641 * See corresponding code in ll_readpage(). */
1642 if (result == -ENODATA)
/* account heat and read-bytes stats for successful fast reads */
1646 ll_heat_add(file_inode(iocb->ki_filp), CIT_READ, result);
1647 ll_stats_ops_tally(ll_i2sbi(file_inode(iocb->ki_filp)),
1648 LPROC_LL_READ_BYTES, result);
1655 * Read from a file (through the page cache).
/*
 * .read_iter handler: try PCC first, then the fast-read page-cache path,
 * and finally the full cl_io machinery via ll_file_io_generic().
 * NOTE(review): listing is fragmented; locals (env, result, rc2, cached,
 * refcheck), braces and gotos are on missing lines.
 */
1657 static ssize_t ll_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
1660 struct vvp_io_args *args;
1661 struct file *file = iocb->ki_filp;
/* zero-length reads complete immediately */
1667 if (!iov_iter_count(to))
1671 * Currently when PCC read failed, we do not fall back to the
1672 * normal read path, just return the error.
1673 * The resaon is that: for RW-PCC, the file data may be modified
1674 * in the PCC and inconsistent with the data on OSTs (or file
1675 * data has been removed from the Lustre file system), at this
1676 * time, fallback to the normal read path may read the wrong
1678 * TODO: for RO-PCC (readonly PCC), fall back to normal read
1679 * path: read data from data copy on OSTs.
1681 result = pcc_file_read_iter(iocb, to, &cached);
/* fast read may satisfy all or part of the request from page cache */
1687 result = ll_do_fast_read(iocb, to);
1688 if (result < 0 || iov_iter_count(to) == 0)
1691 env = cl_env_get(&refcheck);
1693 return PTR_ERR(env);
/* fall back to the generic cl_io read for the remainder */
1695 args = ll_env_args(env, IO_NORMAL);
1696 args->u.normal.via_iter = to;
1697 args->u.normal.via_iocb = iocb;
1699 rc2 = ll_file_io_generic(env, args, file, CIT_READ,
1700 &iocb->ki_pos, iov_iter_count(to));
1703 else if (result == 0)
1706 cl_env_put(env, &refcheck);
1709 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1710 LUSTRE_FPRIVATE(file), iocb->ki_pos, result,
1717 * Similar trick to ll_do_fast_read, this improves write speed for tiny writes.
1718 * If a page is already in the page cache and dirty (and some other things -
1719 * See ll_tiny_write_begin for the instantiation of these rules), then we can
1720 * write to it without doing a full I/O, because Lustre already knows about it
1721 * and will write it out. This saves a lot of processing time.
1723 * All writes here are within one page, so exclusion is handled by the page
1724 * lock on the vm page. We do not do tiny writes for writes which touch
1725 * multiple pages because it's very unlikely multiple sequential pages are
1726 * are already dirty.
1728 * We limit these to < PAGE_SIZE because PAGE_SIZE writes are relatively common
1729 * and are unlikely to be to already dirty pages.
1731 * Attribute updates are important here, we do them in ll_tiny_write_end.
/*
 * NOTE(review): listing is fragmented; braces, inode_lock() call and the
 * final RETURN are on missing lines.
 */
1733 static ssize_t ll_do_tiny_write(struct kiocb *iocb, struct iov_iter *iter)
1735 ssize_t count = iov_iter_count(iter);
1736 struct file *file = iocb->ki_filp;
1737 struct inode *inode = file_inode(file);
/* SUID/SGID-stripping paths need the inode lock */
1738 bool lock_inode = !IS_NOSEC(inode);
1743 /* Restrict writes to single page and < PAGE_SIZE. See comment at top
1744 * of function for why.
1746 if (count >= PAGE_SIZE ||
1747 (iocb->ki_pos & (PAGE_SIZE-1)) + count > PAGE_SIZE)
1750 if (unlikely(lock_inode))
/* generic path ends up in ll_tiny_write_begin/end via a_ops */
1752 result = __generic_file_write_iter(iocb, iter)
1754 if (unlikely(lock_inode))
1755 inode_unlock(inode);
1757 /* If the page is not already dirty, ll_tiny_write_begin returns
1758 * -ENODATA. We continue on to normal write.
1760 if (result == -ENODATA)
/* success: account heat/stats and mark data modified for HSM */
1764 ll_heat_add(inode, CIT_WRITE, result);
1765 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES,
1767 ll_file_set_flag(ll_i2info(inode), LLIF_DATA_MODIFIED);
1770 CDEBUG(D_VFSTRACE, "result: %zu, original count %zu\n", result, count);
1776 * Write to a file (through the page cache).
/*
 * .write_iter handler: PCC first, then the tiny-write shortcut, then the
 * generic cl_io write; combines tiny and normal byte counts at the end.
 * NOTE(review): listing is fragmented; locals (env, result, cached,
 * refcheck), braces and the "out:" label are on missing lines.
 */
1778 static ssize_t ll_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1780 struct vvp_io_args *args;
1782 ssize_t rc_tiny = 0, rc_normal;
1783 struct file *file = iocb->ki_filp;
/* nothing to write */
1790 if (!iov_iter_count(from))
1791 GOTO(out, rc_normal = 0);
1794 * When PCC write failed, we usually do not fall back to the normal
1795 * write path, just return the error. But there is a special case when
1796 * returned error code is -ENOSPC due to running out of space on PCC HSM
1797 * bakcend. At this time, it will fall back to normal I/O path and
1798 * retry the I/O. As the file is in HSM released state, it will restore
1799 * the file data to OSTs first and redo the write again. And the
1800 * restore process will revoke the layout lock and detach the file
1801 * from PCC cache automatically.
1803 result = pcc_file_write_iter(iocb, from, &cached);
1804 if (cached && result != -ENOSPC && result != -EDQUOT)
1807 /* NB: we can't do direct IO for tiny writes because they use the page
1808 * cache, we can't do sync writes because tiny writes can't flush
1809 * pages, and we can't do append writes because we can't guarantee the
1810 * required DLM locks are held to protect file size.
1812 if (ll_sbi_has_tiny_write(ll_i2sbi(file_inode(file))) &&
1813 !(file->f_flags & (O_DIRECT | O_SYNC | O_APPEND)))
1814 rc_tiny = ll_do_tiny_write(iocb, from);
1816 /* In case of error, go on and try normal write - Only stop if tiny
1817 * write completed I/O.
1819 if (iov_iter_count(from) == 0)
1820 GOTO(out, rc_normal = rc_tiny);
1822 env = cl_env_get(&refcheck);
1824 return PTR_ERR(env);
1826 args = ll_env_args(env, IO_NORMAL);
1827 args->u.normal.via_iter = from;
1828 args->u.normal.via_iocb = iocb;
1830 rc_normal = ll_file_io_generic(env, args, file, CIT_WRITE,
1831 &iocb->ki_pos, iov_iter_count(from));
1833 /* On success, combine bytes written. */
1834 if (rc_tiny >= 0 && rc_normal > 0)
1835 rc_normal += rc_tiny;
1836 /* On error, only return error from normal write if tiny write did not
1837 * write any bytes. Otherwise return bytes written by tiny write.
1839 else if (rc_tiny > 0)
1840 rc_normal = rc_tiny;
1842 cl_env_put(env, &refcheck);
1845 ll_rw_stats_tally(ll_i2sbi(file_inode(file)), current->pid,
1846 LUSTRE_FPRIVATE(file), iocb->ki_pos,
1851 #ifndef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
1853 * XXX: exact copy from kernel code (__generic_file_aio_write_nolock)
/*
 * Validate an iovec array and compute the total byte count, trimming at
 * the first inaccessible segment (pre-iov_iter kernels only).
 * NOTE(review): listing is fragmented; the access_ok() success branch
 * (presumably "continue;"), locals and return lines are missing —
 * confirm against the full source.
 */
1855 static int ll_file_get_iov_count(const struct iovec *iov,
1856 unsigned long *nr_segs, size_t *count)
1861 for (seg = 0; seg < *nr_segs; seg++) {
1862 const struct iovec *iv = &iov[seg];
1865 * If any segment has a negative length, or the cumulative
1866 * length ever wraps negative then return -EINVAL.
1869 if (unlikely((ssize_t)(cnt|iv->iov_len) < 0))
1871 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1876 cnt -= iv->iov_len; /* This segment is no good */
/*
 * Legacy aio_read entry: validate the iovec, wrap it in an iov_iter and
 * delegate to ll_file_read_iter() (pre-read_iter kernels only).
 * NOTE(review): listing is fragmented; locals (to, iov_count) and braces
 * are on missing lines.
 */
1883 static ssize_t ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1884 unsigned long nr_segs, loff_t pos)
1891 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* two iov_iter_init() signatures depending on kernel version */
1898 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1899 iov_iter_init(&to, READ, iov, nr_segs, iov_count);
1900 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1901 iov_iter_init(&to, iov, nr_segs, iov_count, 0);
1902 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1904 result = ll_file_read_iter(iocb, &to);
/*
 * Legacy synchronous read(): build a one-segment iovec and a sync kiocb,
 * then delegate to ll_file_aio_read(); updates *ppos from the kiocb.
 * NOTE(review): listing is fragmented; locals (kiocb, result), braces,
 * #endif and RETURN are on missing lines.
 */
1909 static ssize_t ll_file_read(struct file *file, char __user *buf, size_t count,
1912 struct iovec iov = { .iov_base = buf, .iov_len = count };
1921 init_sync_kiocb(&kiocb, file);
1922 kiocb.ki_pos = *ppos;
/* field name varies by kernel version */
1923 #ifdef HAVE_KIOCB_KI_LEFT
1924 kiocb.ki_left = count;
1925 #elif defined(HAVE_KI_NBYTES)
/* fixed typo: was "kiocb.i_nbytes" — the kiocb field is ki_nbytes,
 * matching the identical branch in ll_file_write() below */
1926 kiocb.ki_nbytes = count;
1929 result = ll_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
1930 *ppos = kiocb.ki_pos;
1936 * Write to a file (through the page cache).
/*
 * Legacy aio_write entry: validate the iovec, wrap it in an iov_iter and
 * delegate to ll_file_write_iter() (pre-write_iter kernels only).
 * NOTE(review): listing is fragmented; locals and braces are on missing
 * lines.
 */
1939 static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
1940 unsigned long nr_segs, loff_t pos)
1942 struct iov_iter from;
1947 result = ll_file_get_iov_count(iov, &nr_segs, &iov_count);
/* two iov_iter_init() signatures depending on kernel version */
1954 # ifdef HAVE_IOV_ITER_INIT_DIRECTION
1955 iov_iter_init(&from, WRITE, iov, nr_segs, iov_count);
1956 # else /* !HAVE_IOV_ITER_INIT_DIRECTION */
1957 iov_iter_init(&from, iov, nr_segs, iov_count, 0);
1958 # endif /* HAVE_IOV_ITER_INIT_DIRECTION */
1960 result = ll_file_write_iter(iocb, &from);
/*
 * Legacy synchronous write(): build a one-segment iovec and a sync
 * kiocb, then delegate to ll_file_aio_write(); updates *ppos afterwards.
 * NOTE(review): listing is fragmented; locals (kiocb, result), braces
 * and RETURN are on missing lines.
 */
1965 static ssize_t ll_file_write(struct file *file, const char __user *buf,
1966 size_t count, loff_t *ppos)
1968 struct iovec iov = { .iov_base = (void __user *)buf,
1978 init_sync_kiocb(&kiocb, file);
1979 kiocb.ki_pos = *ppos;
/* field name varies by kernel version */
1980 #ifdef HAVE_KIOCB_KI_LEFT
1981 kiocb.ki_left = count;
1982 #elif defined(HAVE_KI_NBYTES)
1983 kiocb.ki_nbytes = count;
1986 result = ll_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
1987 *ppos = kiocb.ki_pos;
1991 #endif /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
1994 * Send file content (through pagecache) somewhere with helper
/*
 * splice_read handler: try PCC, then run a CIT_READ cl_io with the
 * IO_SPLICE subtype targeting the pipe; tallies rw stats on exit.
 * NOTE(review): listing is fragmented; locals (env, result, cached,
 * refcheck), flags parameter line and braces are missing.
 */
1996 static ssize_t ll_file_splice_read(struct file *in_file, loff_t *ppos,
1997 struct pipe_inode_info *pipe, size_t count,
2001 struct vvp_io_args *args;
2008 result = pcc_file_splice_read(in_file, ppos, pipe,
2009 count, flags, &cached);
/* record sequential access for readahead heuristics */
2013 ll_ras_enter(in_file);
2015 env = cl_env_get(&refcheck);
2017 RETURN(PTR_ERR(env));
2019 args = ll_env_args(env, IO_SPLICE);
2020 args->u.splice.via_pipe = pipe;
2021 args->u.splice.via_flags = flags;
2023 result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count);
2024 cl_env_put(env, &refcheck);
2027 ll_rw_stats_tally(ll_i2sbi(file_inode(in_file)), current->pid,
2028 LUSTRE_FPRIVATE(in_file), *ppos, result,
/*
 * Apply a user-supplied striping EA to a file by re-opening it by FID
 * with an open intent carrying the layout, then releasing the handle.
 * NOTE(review): listing is fragmented; intent initializer lines, locals
 * (rc) and braces are missing.
 */
2033 int ll_lov_setstripe_ea_info(struct inode *inode, struct dentry *dentry,
2034 __u64 flags, struct lov_user_md *lum, int lum_size)
2036 struct lookup_intent oit = {
2038 .it_flags = flags | MDS_OPEN_BY_FID,
/* userspace may hand us a byte-swapped lum on big-endian hosts */
2043 if ((__swab32(lum->lmm_magic) & le32_to_cpu(LOV_MAGIC_MASK)) ==
2044 le32_to_cpu(LOV_MAGIC_MAGIC)) {
2045 /* this code will only exist for big-endian systems */
2046 lustre_swab_lov_user_md(lum, 0);
/* open with the layout attached to the intent, under the size lock */
2049 ll_inode_size_lock(inode);
2050 rc = ll_intent_file_open(dentry, lum, lum_size, &oit);
2052 GOTO(out_unlock, rc);
2054 ll_release_openhandle(dentry, &oit);
2057 ll_inode_size_unlock(inode);
2058 ll_intent_release(&oit);
/*
 * Fetch the LOV EA (striping) for a child name via md_getattr_name(),
 * validate the magic, convert to host endianness, and hand back a
 * pointer into the request buffer via *lmmp (request kept alive for the
 * caller through *request).
 * NOTE(review): listing is fragmented; locals (lmmsize, rc), some GOTO
 * targets and braces are on missing lines.
 */
2063 int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename,
2064 struct lov_mds_md **lmmp, int *lmm_size,
2065 struct ptlrpc_request **request)
2067 struct ll_sb_info *sbi = ll_i2sbi(inode);
2068 struct mdt_body *body;
2069 struct lov_mds_md *lmm = NULL;
2070 struct ptlrpc_request *req = NULL;
2071 struct md_op_data *op_data;
/* size the reply buffer for the largest default EA */
2074 rc = ll_get_default_mdsize(sbi, &lmmsize);
2078 op_data = ll_prep_md_op_data(NULL, inode, NULL, filename,
2079 strlen(filename), lmmsize,
2080 LUSTRE_OPC_ANY, NULL);
2081 if (IS_ERR(op_data))
2082 RETURN(PTR_ERR(op_data));
2084 op_data->op_valid = OBD_MD_FLEASIZE | OBD_MD_FLDIREA;
2085 rc = md_getattr_name(sbi->ll_md_exp, op_data, &req);
2086 ll_finish_md_op_data(op_data);
2088 CDEBUG(D_INFO, "md_getattr_name failed "
2089 "on %s: rc %d\n", filename, rc);
2093 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
2094 LASSERT(body != NULL); /* checked by mdc_getattr_name */
2096 lmmsize = body->mbo_eadatasize;
/* no striping EA present -> -ENODATA */
2098 if (!(body->mbo_valid & (OBD_MD_FLEASIZE | OBD_MD_FLDIREA)) ||
2100 GOTO(out, rc = -ENODATA);
2103 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_MDT_MD, lmmsize);
2104 LASSERT(lmm != NULL);
/* only known layout magics are accepted */
2106 if (lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V1) &&
2107 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_V3) &&
2108 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_COMP_V1) &&
2109 lmm->lmm_magic != cpu_to_le32(LOV_MAGIC_FOREIGN))
2110 GOTO(out, rc = -EPROTO);
2113 * This is coming from the MDS, so is probably in
2114 * little endian. We convert it to host endian before
2115 * passing it to userspace.
2117 if ((lmm->lmm_magic & __swab32(LOV_MAGIC_MAGIC)) ==
2118 __swab32(LOV_MAGIC_MAGIC)) {
2119 int stripe_count = 0;
2121 if (lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V1) ||
2122 lmm->lmm_magic == cpu_to_le32(LOV_MAGIC_V3)) {
2123 stripe_count = le16_to_cpu(lmm->lmm_stripe_count);
/* released files have no objects to swab */
2124 if (le32_to_cpu(lmm->lmm_pattern) &
2125 LOV_PATTERN_F_RELEASED)
2129 lustre_swab_lov_user_md((struct lov_user_md *)lmm, 0);
2131 /* if function called for directory - we should
2132 * avoid swab not existent lsm objects */
2133 if (lmm->lmm_magic == LOV_MAGIC_V1 && S_ISREG(body->mbo_mode))
2134 lustre_swab_lov_user_md_objects(
2135 ((struct lov_user_md_v1 *)lmm)->lmm_objects,
2137 else if (lmm->lmm_magic == LOV_MAGIC_V3 &&
2138 S_ISREG(body->mbo_mode))
2139 lustre_swab_lov_user_md_objects(
2140 ((struct lov_user_md_v3 *)lmm)->lmm_objects,
2146 *lmm_size = lmmsize;
/*
 * LL_IOC_LOV_SETEA handler: admin-only; copies a single-object
 * lov_user_md from userspace and applies it via
 * ll_lov_setstripe_ea_info() with MDS_OPEN_HAS_OBJS.
 * NOTE(review): listing is fragmented; arg parameter line, braces and
 * RETURN are missing.
 */
2151 static int ll_lov_setea(struct inode *inode, struct file *file,
2154 __u64 flags = MDS_OPEN_HAS_OBJS | FMODE_WRITE;
2155 struct lov_user_md *lump;
/* room for the header plus exactly one OST object entry */
2156 int lum_size = sizeof(struct lov_user_md) +
2157 sizeof(struct lov_user_ost_data);
2161 if (!cfs_capable(CFS_CAP_SYS_ADMIN))
2164 OBD_ALLOC_LARGE(lump, lum_size);
2168 if (copy_from_user(lump, arg, lum_size))
2169 GOTO(out_lump, rc = -EFAULT);
2171 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, lump,
/* clear the delayed-create flag whether or not the set succeeded */
2173 cl_lov_delay_create_clear(&file->f_flags);
2176 OBD_FREE_LARGE(lump, lum_size);
/*
 * Copy the file's striping (layout) to a userspace lov_user_md buffer
 * via cl_object_getstripe().
 * NOTE(review): listing is fragmented; locals, braces and RETURN are
 * missing.
 */
2180 static int ll_file_getstripe(struct inode *inode, void __user *lum, size_t size)
2187 env = cl_env_get(&refcheck);
2189 RETURN(PTR_ERR(env));
2191 rc = cl_object_getstripe(env, ll_i2info(inode)->lli_clob, lum, size);
2192 cl_env_put(env, &refcheck);
/*
 * LL_IOC_LOV_SETSTRIPE handler: copy the user layout, apply it, then
 * refresh the layout generation and echo the resulting stripe info back
 * to userspace.
 * NOTE(review): listing is fragmented; locals (lum_size, gen, rc),
 * braces and several error checks are missing.
 */
2196 static int ll_lov_setstripe(struct inode *inode, struct file *file,
2199 struct lov_user_md __user *lum = (struct lov_user_md __user *)arg;
2200 struct lov_user_md *klum;
2202 __u64 flags = FMODE_WRITE;
/* ll_copy_user_md allocates klum; freed at the bottom */
2205 rc = ll_copy_user_md(lum, &klum);
2210 rc = ll_lov_setstripe_ea_info(inode, file_dentry(file), flags, klum,
2215 rc = put_user(0, &lum->lmm_stripe_count);
/* pull the new layout so the getstripe below sees it */
2219 rc = ll_layout_refresh(inode, &gen);
2223 rc = ll_file_getstripe(inode, arg, lum_size);
2225 cl_lov_delay_create_clear(&file->f_flags);
2228 OBD_FREE_LARGE(klum, lum_size);
/*
 * Take a group lock (gid = arg) on the file: instantiate all layout
 * components first (PFL), acquire the group DLM lock, then publish it
 * in the per-fd state under lli_lock, guarding against races.
 * NOTE(review): listing is fragmented; return type, locals (env, rc,
 * refcheck), braces and RETURNs are on missing lines.
 */
2233 ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg)
2235 struct ll_inode_info *lli = ll_i2info(inode);
2236 struct cl_object *obj = lli->lli_clob;
2237 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2238 struct ll_grouplock grouplock;
2243 CWARN("group id for group lock must not be 0\n");
2247 if (ll_file_nolock(file))
2248 RETURN(-EOPNOTSUPP);
/* reject a second group lock on the same fd */
2250 spin_lock(&lli->lli_lock);
2251 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2252 CWARN("group lock already existed with gid %lu\n",
2253 fd->fd_grouplock.lg_gid);
2254 spin_unlock(&lli->lli_lock);
2257 LASSERT(fd->fd_grouplock.lg_lock == NULL);
2258 spin_unlock(&lli->lli_lock);
2261 * XXX: group lock needs to protect all OST objects while PFL
2262 * can add new OST objects during the IO, so we'd instantiate
2263 * all OST objects before getting its group lock.
2268 struct cl_layout cl = {
2269 .cl_is_composite = false,
2271 struct lu_extent ext = {
2273 .e_end = OBD_OBJECT_EOF,
2276 env = cl_env_get(&refcheck);
2278 RETURN(PTR_ERR(env));
/* composite (PFL) layouts: force full instantiation via write intent */
2280 rc = cl_object_layout_get(env, obj, &cl);
2281 if (!rc && cl.cl_is_composite)
2282 rc = ll_layout_write_intent(inode, LAYOUT_INTENT_WRITE,
2285 cl_env_put(env, &refcheck);
2290 rc = cl_get_grouplock(ll_i2info(inode)->lli_clob,
2291 arg, (file->f_flags & O_NONBLOCK), &grouplock);
/* re-check under the lock: another thread may have won meanwhile */
2295 spin_lock(&lli->lli_lock);
2296 if (fd->fd_flags & LL_FILE_GROUP_LOCKED) {
2297 spin_unlock(&lli->lli_lock);
2298 CERROR("another thread just won the race\n");
2299 cl_put_grouplock(&grouplock);
2303 fd->fd_flags |= LL_FILE_GROUP_LOCKED;
2304 fd->fd_grouplock = grouplock;
2305 spin_unlock(&lli->lli_lock);
2307 CDEBUG(D_INFO, "group lock %lu obtained\n", arg);
/*
 * Drop the group lock with gid == arg: detach it from the per-fd state
 * under lli_lock, then release the DLM reference outside the spinlock.
 * NOTE(review): listing is fragmented; braces, RETURNs and the error
 * return values are on missing lines.
 */
2311 static int ll_put_grouplock(struct inode *inode, struct file *file,
2314 struct ll_inode_info *lli = ll_i2info(inode);
2315 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
2316 struct ll_grouplock grouplock;
2319 spin_lock(&lli->lli_lock);
2320 if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) {
2321 spin_unlock(&lli->lli_lock);
2322 CWARN("no group lock held\n");
2326 LASSERT(fd->fd_grouplock.lg_lock != NULL);
/* the gid the caller passes must match the one we hold */
2328 if (fd->fd_grouplock.lg_gid != arg) {
2329 CWARN("group lock %lu doesn't match current id %lu\n",
2330 arg, fd->fd_grouplock.lg_gid);
2331 spin_unlock(&lli->lli_lock);
/* take a local copy so the put happens outside the spinlock */
2335 grouplock = fd->fd_grouplock;
2336 memset(&fd->fd_grouplock, 0, sizeof(fd->fd_grouplock));
2337 fd->fd_flags &= ~LL_FILE_GROUP_LOCKED;
2338 spin_unlock(&lli->lli_lock);
2340 cl_put_grouplock(&grouplock);
2341 CDEBUG(D_INFO, "group lock %lu released\n", arg);
2346 * Close inode open handle
2348 * \param dentry [in] dentry which contains the inode
2349 * \param it [in,out] intent which contains open info and result
2352 * \retval <0 failure
/*
 * NOTE(review): listing is fragmented; locals (rc), braces, RETURNs and
 * the och allocation failure check are on missing lines.
 */
2354 int ll_release_openhandle(struct dentry *dentry, struct lookup_intent *it)
2356 struct inode *inode = dentry->d_inode;
2357 struct obd_client_handle *och;
2363 /* Root ? Do nothing. */
2364 if (dentry->d_inode->i_sb->s_root == dentry)
2367 /* No open handle to close? Move away */
2368 if (!it_disposition(it, DISP_OPEN_OPEN))
2371 LASSERT(it_open_error(DISP_OPEN_OPEN, it) == 0);
2373 OBD_ALLOC(och, sizeof(*och));
2375 GOTO(out, rc = -ENOMEM);
/* populate the client handle from the open intent, then close it */
2377 ll_och_fill(ll_i2sbi(inode)->ll_md_exp, it, och);
2379 rc = ll_close_inode_openhandle(inode, och, 0, NULL);
2381 /* this one is in place of ll_file_open */
2382 if (it_disposition(it, DISP_ENQ_OPEN_REF)) {
2383 ptlrpc_req_finished(it->it_request);
2384 it_clear_disposition(it, DISP_ENQ_OPEN_REF);
2390 * Get size for inode for which FIEMAP mapping is requested.
2391 * Make the FIEMAP get_info call and returns the result.
2392 * \param fiemap kernel buffer to hold extens
2393 * \param num_bytes kernel buffer size
/*
 * NOTE(review): listing is fragmented; locals (env, rc, refcheck),
 * braces, RETURNs and some error checks are on missing lines.
 */
2395 static int ll_do_fiemap(struct inode *inode, struct fiemap *fiemap,
2401 struct ll_fiemap_info_key fmkey = { .lfik_name = KEY_FIEMAP, };
2404 /* Checks for fiemap flags */
2405 if (fiemap->fm_flags & ~LUSTRE_FIEMAP_FLAGS_COMPAT) {
/* report back which flags we do NOT support */
2406 fiemap->fm_flags &= ~LUSTRE_FIEMAP_FLAGS_COMPAT;
2410 /* Check for FIEMAP_FLAG_SYNC */
2411 if (fiemap->fm_flags & FIEMAP_FLAG_SYNC) {
2412 rc = filemap_fdatawrite(inode->i_mapping);
2417 env = cl_env_get(&refcheck);
2419 RETURN(PTR_ERR(env));
/* size may simply not be cached yet — glimpse it from the OSTs */
2421 if (i_size_read(inode) == 0) {
2422 rc = ll_glimpse_size(inode);
2427 fmkey.lfik_oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP;
2428 obdo_from_inode(&fmkey.lfik_oa, inode, OBD_MD_FLSIZE);
2429 obdo_set_parent_fid(&fmkey.lfik_oa, &ll_i2info(inode)->lli_fid);
2431 /* If filesize is 0, then there would be no objects for mapping */
2432 if (fmkey.lfik_oa.o_size == 0) {
2433 fiemap->fm_mapped_extents = 0;
2437 fmkey.lfik_fiemap = *fiemap;
2439 rc = cl_object_fiemap(env, ll_i2info(inode)->lli_clob,
2440 &fmkey, fiemap, &num_bytes);
2442 cl_env_put(env, &refcheck);
/*
 * OBD_IOC_FID2PATH: resolve a FID to a path via the MDC, round-tripping
 * a getinfo_fid2path buffer between userspace and the MDT.
 * NOTE(review): listing is fragmented; locals (pathlen, outsize, rc),
 * braces, RETURNs and the allocation failure check are missing.
 */
2446 int ll_fid2path(struct inode *inode, void __user *arg)
2448 struct obd_export *exp = ll_i2mdexp(inode);
2449 const struct getinfo_fid2path __user *gfin = arg;
2451 struct getinfo_fid2path *gfout;
/* restricted unless admin or the fs allows user fid2path */
2457 if (!cfs_capable(CFS_CAP_DAC_READ_SEARCH) &&
2458 !(ll_i2sbi(inode)->ll_flags & LL_SBI_USER_FID2PATH))
2461 /* Only need to get the buflen */
2462 if (get_user(pathlen, &gfin->gf_pathlen))
/* bound the allocation by the user-supplied length */
2465 if (pathlen > PATH_MAX)
2468 outsize = sizeof(*gfout) + pathlen;
2469 OBD_ALLOC(gfout, outsize);
2473 if (copy_from_user(gfout, arg, sizeof(*gfout)))
2474 GOTO(gf_free, rc = -EFAULT);
2475 /* append root FID after gfout to let MDT know the root FID so that it
2476 * can lookup the correct path, this is mainly for fileset.
2477 * old server without fileset mount support will ignore this. */
2478 *gfout->gf_u.gf_root_fid = *ll_inode2fid(inode);
2480 /* Call mdc_iocontrol */
2481 rc = obd_iocontrol(OBD_IOC_FID2PATH, exp, outsize, gfout, NULL);
2485 if (copy_to_user(arg, gfout, outsize))
2489 OBD_FREE(gfout, outsize);
/*
 * Run a CIT_DATA_VERSION cl_io to fetch the file's data version (and
 * layout version) into *ioc; restarts on layout change.
 * NOTE(review): listing is fragmented; return type, locals (env, io,
 * result, refcheck), braces, the restart goto target and RETURN are on
 * missing lines.
 */
2494 ll_ioc_data_version(struct inode *inode, struct ioc_data_version *ioc)
2496 struct cl_object *obj = ll_i2info(inode)->lli_clob;
/* defaults reported when there is no file object */
2504 ioc->idv_version = 0;
2505 ioc->idv_layout_version = UINT_MAX;
2507 /* If no file object initialized, we consider its version is 0. */
2511 env = cl_env_get(&refcheck);
2513 RETURN(PTR_ERR(env));
2515 io = vvp_env_thread_io(env);
2517 io->u.ci_data_version.dv_data_version = 0;
2518 io->u.ci_data_version.dv_layout_version = UINT_MAX;
2519 io->u.ci_data_version.dv_flags = ioc->idv_flags;
2522 if (cl_io_init(env, io, CIT_DATA_VERSION, io->ci_obj) == 0)
2523 result = cl_io_loop(env, io);
2525 result = io->ci_result;
2527 ioc->idv_version = io->u.ci_data_version.dv_data_version;
2528 ioc->idv_layout_version = io->u.ci_data_version.dv_layout_version;
2530 cl_io_fini(env, io);
/* layout changed mid-io: retry the whole operation */
2532 if (unlikely(io->ci_need_restart))
2535 cl_env_put(env, &refcheck);
2541 * Read the data_version for inode.
2543 * This value is computed using stripe object version on OST.
2544 * Version is computed using server side locking.
2546 * @param flags if do sync on the OST side;
2548 * LL_DV_RD_FLUSH: flush dirty pages, LCK_PR on OSTs
2549 * LL_DV_WR_FLUSH: drop all caching pages, LCK_PW on OSTs
/*
 * Thin wrapper around ll_ioc_data_version() returning only idv_version.
 * NOTE(review): braces/RETURN lines are missing from this listing.
 */
2551 int ll_data_version(struct inode *inode, __u64 *data_version, int flags)
2553 struct ioc_data_version ioc = { .idv_flags = flags };
2556 rc = ll_ioc_data_version(inode, &ioc);
2558 *data_version = ioc.idv_version;
2564 * Trigger a HSM release request for the provided inode.
/* Trigger an HSM release of @inode: take a write lease, flush and grab the
 * latest data_version, merge attributes, then close the open handle with
 * MDS_HSM_RELEASE. Elided lines hide the declarations and error labels. */
2566 int ll_hsm_release(struct inode *inode)
2569 struct obd_client_handle *och = NULL;
2570 __u64 data_version = 0;
2575 CDEBUG(D_INODE, "%s: Releasing file "DFID".\n",
2576 ll_i2sbi(inode)->ll_fsname,
2577 PFID(&ll_i2info(inode)->lli_fid));
/* FMODE_WRITE lease guarantees exclusive access during the release. */
2579 och = ll_lease_open(inode, NULL, FMODE_WRITE, MDS_OPEN_RELEASE);
2581 GOTO(out, rc = PTR_ERR(och));
2583 /* Grab latest data_version and [am]time values */
2584 rc = ll_data_version(inode, &data_version, LL_DV_WR_FLUSH);
2588 env = cl_env_get(&refcheck);
2590 GOTO(out, rc = PTR_ERR(env));
2592 rc = ll_merge_attr(env, inode);
2593 cl_env_put(env, &refcheck);
2595 /* If an error happens, we have the wrong size for a file.
2601 /* Release the file.
2602 * NB: lease lock handle is released in mdc_hsm_release_pack() because
2603 * we still need it to pack l_remote_handle to MDT. */
2604 rc = ll_close_inode_openhandle(inode, och, MDS_HSM_RELEASE,
2610 if (och != NULL && !IS_ERR(och)) /* close the file */
2611 ll_lease_close(och, inode, NULL);
/* Working state for ll_swap_layouts(); more members (dv1/dv2 versions and
 * check_dv1/check_dv2 booleans, per their use below) are in elided lines. */
2616 struct ll_swap_stack {
2619 struct inode *inode1;
2620 struct inode *inode2;
/* Swap the layouts of the files behind @file1/@file2 per @lsl: validate,
 * order the pair by FID, optionally flush via group locks, optionally
 * verify data versions, then send the swap to the MDT. Several error
 * branches and labels are in elided lines. */
2625 static int ll_swap_layouts(struct file *file1, struct file *file2,
2626 struct lustre_swap_layouts *lsl)
2628 struct mdc_swap_layouts msl;
2629 struct md_op_data *op_data;
2632 struct ll_swap_stack *llss = NULL;
2635 OBD_ALLOC_PTR(llss);
2639 llss->inode1 = file_inode(file1);
2640 llss->inode2 = file_inode(file2);
2642 rc = ll_check_swap_layouts_validity(llss->inode1, llss->inode2);
2646 /* we use 2 bool because it is easier to swap than 2 bits */
2647 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV1)
2648 llss->check_dv1 = true;
2650 if (lsl->sl_flags & SWAP_LAYOUTS_CHECK_DV2)
2651 llss->check_dv2 = true;
2653 /* we cannot use lsl->sl_dvX directly because we may swap them */
2654 llss->dv1 = lsl->sl_dv1;
2655 llss->dv2 = lsl->sl_dv2;
/* Order the pair by FID so concurrent swaps lock in a consistent order. */
2657 rc = lu_fid_cmp(ll_inode2fid(llss->inode1), ll_inode2fid(llss->inode2));
2658 if (rc == 0) /* same file, done! */
2661 if (rc < 0) { /* sequentialize it */
2662 swap(llss->inode1, llss->inode2);
2664 swap(llss->dv1, llss->dv2);
2665 swap(llss->check_dv1, llss->check_dv2);
2669 if (gid != 0) { /* application asks to flush dirty cache */
2670 rc = ll_get_grouplock(llss->inode1, file1, gid);
2674 rc = ll_get_grouplock(llss->inode2, file2, gid);
/* Undo the first group lock if the second could not be taken. */
2676 ll_put_grouplock(llss->inode1, file1, gid);
2681 /* ultimate check, before swapping the layouts we check if
2682 * dataversion has changed (if requested) */
2683 if (llss->check_dv1) {
2684 rc = ll_data_version(llss->inode1, &dv, 0);
2687 if (dv != llss->dv1)
2688 GOTO(putgl, rc = -EAGAIN);
2691 if (llss->check_dv2) {
2692 rc = ll_data_version(llss->inode2, &dv, 0);
2695 if (dv != llss->dv2)
2696 GOTO(putgl, rc = -EAGAIN);
2699 /* struct md_op_data is used to send the swap args to the mdt
2700 * only flags is missing, so we use struct mdc_swap_layouts
2701 * through the md_op_data->op_data */
2702 /* flags from user space have to be converted before they are sent to
2703 * server, no flag is sent today, they are only used on the client */
2706 op_data = ll_prep_md_op_data(NULL, llss->inode1, llss->inode2, NULL, 0,
2707 0, LUSTRE_OPC_ANY, &msl);
2708 if (IS_ERR(op_data))
2709 GOTO(free, rc = PTR_ERR(op_data));
2711 rc = obd_iocontrol(LL_IOC_LOV_SWAP_LAYOUTS, ll_i2mdexp(llss->inode1),
2712 sizeof(*op_data), op_data, NULL);
2713 ll_finish_md_op_data(op_data);
/* putgl label (elided): drop both group locks in reverse order. */
2720 ll_put_grouplock(llss->inode2, file2, gid);
2721 ll_put_grouplock(llss->inode1, file1, gid);
/* Set/clear HSM state flags on @inode after validating @hss: rejects
 * out-of-range masks, restricts non-HSM_USER_MASK changes to admins, and
 * range-checks the archive id on servers without archive-id arrays. The
 * request is sent through obd_iocontrol(LL_IOC_HSM_STATE_SET). */
2731 int ll_hsm_state_set(struct inode *inode, struct hsm_state_set *hss)
2733 struct obd_export *exp = ll_i2mdexp(inode);
2734 struct md_op_data *op_data;
2738 /* Detect out-of range masks */
2739 if ((hss->hss_setmask | hss->hss_clearmask) & ~HSM_FLAGS_MASK)
2742 /* Non-root users are forbidden to set or clear flags which are
2743 * NOT defined in HSM_USER_MASK. */
2744 if (((hss->hss_setmask | hss->hss_clearmask) & ~HSM_USER_MASK) &&
2745 !cfs_capable(CFS_CAP_SYS_ADMIN))
2748 if (!exp_connect_archive_id_array(exp)) {
2749 /* Detect out-of range archive id */
2750 if ((hss->hss_valid & HSS_ARCHIVE_ID) &&
2751 (hss->hss_archive_id > LL_HSM_ORIGIN_MAX_ARCHIVE))
/* @hss is passed as op_data so it is packed into the MDT request. */
2755 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
2756 LUSTRE_OPC_ANY, hss);
2757 if (IS_ERR(op_data))
2758 RETURN(PTR_ERR(op_data));
2760 rc = obd_iocontrol(LL_IOC_HSM_STATE_SET, exp, sizeof(*op_data),
2763 ll_finish_md_op_data(op_data);
/* Import a file into HSM: mark it ARCHIVED|EXISTS|RELEASED in the given
 * archive, then restore the user-supplied mode/uid/gid/size/times via
 * ll_setattr_raw(). Regular files only. Cleanup labels are elided. */
2768 static int ll_hsm_import(struct inode *inode, struct file *file,
2769 struct hsm_user_import *hui)
2771 struct hsm_state_set *hss = NULL;
2772 struct iattr *attr = NULL;
2776 if (!S_ISREG(inode->i_mode))
2782 GOTO(out, rc = -ENOMEM);
/* Phase 1: set the HSM state so the file is seen as released. */
2784 hss->hss_valid = HSS_SETMASK | HSS_ARCHIVE_ID;
2785 hss->hss_archive_id = hui->hui_archive_id;
2786 hss->hss_setmask = HS_ARCHIVED | HS_EXISTS | HS_RELEASED;
2787 rc = ll_hsm_state_set(inode, hss);
/* Phase 2: restore the attributes recorded by the copytool. */
2791 OBD_ALLOC_PTR(attr);
2793 GOTO(out, rc = -ENOMEM);
2795 attr->ia_mode = hui->hui_mode & (S_IRWXU | S_IRWXG | S_IRWXO);
2796 attr->ia_mode |= S_IFREG;
2797 attr->ia_uid = make_kuid(&init_user_ns, hui->hui_uid);
2798 attr->ia_gid = make_kgid(&init_user_ns, hui->hui_gid);
2799 attr->ia_size = hui->hui_size;
2800 attr->ia_mtime.tv_sec = hui->hui_mtime;
2801 attr->ia_mtime.tv_nsec = hui->hui_mtime_ns;
2802 attr->ia_atime.tv_sec = hui->hui_atime;
2803 attr->ia_atime.tv_nsec = hui->hui_atime_ns;
2805 attr->ia_valid = ATTR_SIZE | ATTR_MODE | ATTR_FORCE |
2806 ATTR_UID | ATTR_GID |
2807 ATTR_MTIME | ATTR_MTIME_SET |
2808 ATTR_ATIME | ATTR_ATIME_SET;
/* inode_lock() presumably taken in an elided line before this call. */
2812 rc = ll_setattr_raw(file_dentry(file), attr, 0, true);
2816 inode_unlock(inode);
/* Translate an fmode_t into the LL_LEASE_{RD,WR}LCK bit mask reported to
 * userspace (e.g. by LL_IOC_GET_LEASE). */
2828 static inline long ll_lease_type_from_fmode(fmode_t fmode)
2830 return ((fmode & FMODE_READ) ? LL_LEASE_RDLCK : 0) |
2831 ((fmode & FMODE_WRITE) ? LL_LEASE_WRLCK : 0);
/* LL_IOC_FUTIMES_3 backend: set atime/mtime/ctime of a regular file from
 * @lfu. Requires CAP_SYS_ADMIN since an arbitrary ctime is being forced. */
2834 static int ll_file_futimes_3(struct file *file, const struct ll_futimes_3 *lfu)
2836 struct inode *inode = file_inode(file);
/* Designated-initializer iattr; the ia_atime/ia_mtime/ia_ctime field
 * names sit on elided lines between these initializers. */
2838 .ia_valid = ATTR_ATIME | ATTR_ATIME_SET |
2839 ATTR_MTIME | ATTR_MTIME_SET |
2842 .tv_sec = lfu->lfu_atime_sec,
2843 .tv_nsec = lfu->lfu_atime_nsec,
2846 .tv_sec = lfu->lfu_mtime_sec,
2847 .tv_nsec = lfu->lfu_mtime_nsec,
2850 .tv_sec = lfu->lfu_ctime_sec,
2851 .tv_nsec = lfu->lfu_ctime_nsec,
2857 if (!capable(CAP_SYS_ADMIN))
2860 if (!S_ISREG(inode->i_mode))
/* OP_XVALID_CTIME_SET lets the supplied ctime override the server's. */
2864 rc = ll_setattr_raw(file_dentry(file), &ia, OP_XVALID_CTIME_SET,
2866 inode_unlock(inode);
/* Map a userspace lockahead mode to the kernel cl_lock_mode; the return
 * statements and default (invalid-mode) branch are on elided lines. */
2871 static enum cl_lock_mode cl_mode_user_to_kernel(enum lock_mode_user mode)
2874 case MODE_READ_USER:
2876 case MODE_WRITE_USER:
/* Printable names for enum lock_mode_user, used in debug messages. */
2883 static const char *const user_lockname[] = LOCK_MODE_NAMES;
2885 /* Used to allow the upper layers of the client to request an LDLM lock
2886 * without doing an actual read or write.
2888 * Used for ladvise lockahead to manually request specific locks.
2890 * \param[in] file file this ladvise lock request is on
2891 * \param[in] ladvise ladvise struct describing this lock request
2893 * \retval 0 success, no detailed result available (sync requests
2894 * and requests sent to the server [not handled locally]
2895 * cannot return detailed results)
2896 * \retval LLA_RESULT_{SAME,DIFFERENT} - detailed result of the lock request,
2897 * see definitions for details.
2898 * \retval negative negative errno on error
/* Request an LDLM extent lock ahead of I/O (see block comment above):
 * build a CIT_MISC io, fill a cl_lock_descr from the ladvise range, and
 * call cl_lock_request(). -ECANCELED/-EEXIST are mapped to the positive
 * LLA_RESULT_* codes for userspace. */
2900 int ll_file_lock_ahead(struct file *file, struct llapi_lu_ladvise *ladvise)
2902 struct lu_env *env = NULL;
2903 struct cl_io *io = NULL;
2904 struct cl_lock *lock = NULL;
2905 struct cl_lock_descr *descr = NULL;
2906 struct dentry *dentry = file->f_path.dentry;
2907 struct inode *inode = dentry->d_inode;
2908 enum cl_lock_mode cl_mode;
2909 off_t start = ladvise->lla_start;
2910 off_t end = ladvise->lla_end;
2916 CDEBUG(D_VFSTRACE, "Lock request: file=%.*s, inode=%p, mode=%s "
2917 "start=%llu, end=%llu\n", dentry->d_name.len,
2918 dentry->d_name.name, dentry->d_inode,
2919 user_lockname[ladvise->lla_lockahead_mode], (__u64) start,
2922 cl_mode = cl_mode_user_to_kernel(ladvise->lla_lockahead_mode);
/* A negative cl_mode is an errno from the mode conversion. */
2924 GOTO(out, result = cl_mode);
2926 /* Get IO environment */
2927 result = cl_io_get(inode, &env, &io, &refcheck);
2931 result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
2934 * nothing to do for this io. This currently happens when
2935 * stripe sub-object's are not yet created.
2937 result = io->ci_result;
2938 } else if (result == 0) {
2939 lock = vvp_env_lock(env);
2940 descr = &lock->cll_descr;
2942 descr->cld_obj = io->ci_obj;
2943 /* Convert byte offsets to pages */
2944 descr->cld_start = cl_index(io->ci_obj, start);
2945 descr->cld_end = cl_index(io->ci_obj, end);
2946 descr->cld_mode = cl_mode;
2947 /* CEF_MUST is used because we do not want to convert a
2948 * lockahead request to a lockless lock */
2949 descr->cld_enq_flags = CEF_MUST | CEF_LOCK_NO_EXPAND |
/* Async (LF_ASYNC) requests become speculative enqueues. */
2952 if (ladvise->lla_peradvice_flags & LF_ASYNC)
2953 descr->cld_enq_flags |= CEF_SPECULATIVE;
2955 result = cl_lock_request(env, io, lock);
2957 /* On success, we need to release the lock */
2959 cl_lock_release(env, lock);
2961 cl_io_fini(env, io);
2962 cl_env_put(env, &refcheck);
2964 /* -ECANCELED indicates a matching lock with a different extent
2965 * was already present, and -EEXIST indicates a matching lock
2966 * on exactly the same extent was already present.
2967 * We convert them to positive values for userspace to make
2968 * recognizing true errors easier.
2969 * Note we can only return these detailed results on async requests,
2970 * as sync requests look the same as i/o requests for locking. */
2971 if (result == -ECANCELED)
2972 result = LLA_RESULT_DIFFERENT;
2973 else if (result == -EEXIST)
2974 result = LLA_RESULT_SAME;
/* Printable names for enum lu_ladvise_type, used in sanity-check logs. */
2979 static const char *const ladvise_names[] = LU_LADVISE_NAMES;
/* Validate one ladvise entry: known advice value, per-advice flag masks,
 * valid lockahead mode, and a non-empty (start < end) range for all
 * advices except LOCKNOEXPAND. Logs and returns a negative errno on any
 * violation (rc assignments sit on elided lines before each CDEBUG). */
2981 static int ll_ladvise_sanity(struct inode *inode,
2982 struct llapi_lu_ladvise *ladvise)
2984 struct ll_sb_info *sbi = ll_i2sbi(inode);
2985 enum lu_ladvise_type advice = ladvise->lla_advice;
2986 /* Note the peradvice flags is a 32 bit field, so per advice flags must
2987 * be in the first 32 bits of enum ladvise_flags */
2988 __u32 flags = ladvise->lla_peradvice_flags;
2989 /* 3 lines at 80 characters per line, should be plenty */
2992 if (advice > LU_LADVISE_MAX || advice == LU_LADVISE_INVALID) {
2994 CDEBUG(D_VFSTRACE, "%s: advice with value '%d' not recognized,"
2995 "last supported advice is %s (value '%d'): rc = %d\n",
2996 sbi->ll_fsname, advice,
2997 ladvise_names[LU_LADVISE_MAX-1], LU_LADVISE_MAX-1, rc);
3001 /* Per-advice checks */
3003 case LU_LADVISE_LOCKNOEXPAND:
3004 if (flags & ~LF_LOCKNOEXPAND_MASK) {
3006 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3007 "rc = %d\n", sbi->ll_fsname, flags,
3008 ladvise_names[advice], rc);
3012 case LU_LADVISE_LOCKAHEAD:
3013 /* Currently only READ and WRITE modes can be requested */
3014 if (ladvise->lla_lockahead_mode >= MODE_MAX_USER ||
3015 ladvise->lla_lockahead_mode == 0) {
3017 CDEBUG(D_VFSTRACE, "%s: Invalid mode (%d) for %s: "
3018 "rc = %d\n", sbi->ll_fsname,
3019 ladvise->lla_lockahead_mode,
3020 ladvise_names[advice], rc);
3023 case LU_LADVISE_WILLREAD:
3024 case LU_LADVISE_DONTNEED:
3026 /* Note fall through above - These checks apply to all advices
3027 * except LOCKNOEXPAND */
3028 if (flags & ~LF_DEFAULT_MASK) {
3030 CDEBUG(D_VFSTRACE, "%s: Invalid flags (%x) for %s: "
3031 "rc = %d\n", sbi->ll_fsname, flags,
3032 ladvise_names[advice], rc);
3035 if (ladvise->lla_start >= ladvise->lla_end) {
3037 CDEBUG(D_VFSTRACE, "%s: Invalid range (%llu to %llu) "
3038 "for %s: rc = %d\n", sbi->ll_fsname,
3039 ladvise->lla_start, ladvise->lla_end,
3040 ladvise_names[advice], rc);
3052 * Give file access advices
3054 * The ladvise interface is similar to Linux fadvise() system call, except it
3055 * forwards the advices directly from Lustre client to server. The server side
3056 * codes will apply appropriate read-ahead and caching techniques for the
3057 * corresponding files.
3059 * A typical workload for ladvise is e.g. a bunch of different clients are
3060 * doing small random reads of a file, so prefetching pages into OSS cache
3061 * with big linear reads before the random IO is a net benefit. Fetching
3062 * all that data into each client cache with fadvise() may not be, due to
3063 * much more data being sent to the client.
/* Forward one access advice to the server (see block comment above) by
 * running a CIT_LADVISE cl_io carrying the fid, range, advice and flags. */
3065 static int ll_ladvise(struct inode *inode, struct file *file, __u64 flags,
3066 struct llapi_lu_ladvise *ladvise)
3070 struct cl_ladvise_io *lio;
3075 env = cl_env_get(&refcheck);
3077 RETURN(PTR_ERR(env));
3079 io = vvp_env_thread_io(env);
3080 io->ci_obj = ll_i2info(inode)->lli_clob;
3082 /* initialize parameters for ladvise */
3083 lio = &io->u.ci_ladvise;
3084 lio->li_start = ladvise->lla_start;
3085 lio->li_end = ladvise->lla_end;
3086 lio->li_fid = ll_inode2fid(inode);
3087 lio->li_advice = ladvise->lla_advice;
3088 lio->li_flags = flags;
3090 if (cl_io_init(env, io, CIT_LADVISE, io->ci_obj) == 0)
3091 rc = cl_io_loop(env, io);
3095 cl_io_fini(env, io);
3096 cl_env_put(env, &refcheck);
/* Toggle per-fd "no lock expansion": set unless LF_UNSET is passed. */
3100 static int ll_lock_noexpand(struct file *file, int flags)
3102 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3104 fd->ll_lock_no_expand = !(flags & LF_UNSET);
/* FS_IOC_FSGETXATTR handler: report the inode's xflags (plus PROJINHERIT
 * if set on the Lustre inode) and project id back to userspace. */
3109 int ll_ioctl_fsgetxattr(struct inode *inode, unsigned int cmd,
3112 struct fsxattr fsxattr;
/* copy_from_user first so reserved fields round-trip unchanged. */
3114 if (copy_from_user(&fsxattr,
3115 (const struct fsxattr __user *)arg,
3119 fsxattr.fsx_xflags = ll_inode_flags_to_xflags(inode->i_flags);
3120 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT))
3121 fsxattr.fsx_xflags |= FS_XFLAG_PROJINHERIT;
3122 fsxattr.fsx_projid = ll_i2info(inode)->lli_projid;
3123 if (copy_to_user((struct fsxattr __user *)arg,
3124 &fsxattr, sizeof(fsxattr)))
/* Permission check for project-quota changes: outside the init user
 * namespace, reject any change to the project id or PROJINHERIT state
 * (error-return lines are elided). */
3130 int ll_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
3133 * Project Quota ID state is only allowed to change from within the init
3134 * namespace. Enforce that restriction only if we are trying to change
3135 * the quota ID state. Everything else is allowed in user namespaces.
3137 if (current_user_ns() == &init_user_ns)
3140 if (ll_i2info(inode)->lli_projid != fa->fsx_projid)
/* Either direction of a PROJINHERIT flip is a state change. */
3143 if (ll_file_test_flag(ll_i2info(inode), LLIF_PROJECT_INHERIT)) {
3144 if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
3147 if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
/* FS_IOC_FSSETXATTR handler: validate the project change, push the new
 * flags/projid to the MDT via md_setattr(), update the local inode flags,
 * then propagate the flags to the OSTs through cl_setattr_ost(). */
3154 int ll_ioctl_fssetxattr(struct inode *inode, unsigned int cmd,
3158 struct md_op_data *op_data;
3159 struct ptlrpc_request *req = NULL;
3161 struct fsxattr fsxattr;
3162 struct cl_object *obj;
3166 if (copy_from_user(&fsxattr,
3167 (const struct fsxattr __user *)arg,
3171 rc = ll_ioctl_check_project(inode, &fsxattr);
3175 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3176 LUSTRE_OPC_ANY, NULL);
3177 if (IS_ERR(op_data))
3178 RETURN(PTR_ERR(op_data));
3180 flags = ll_xflags_to_inode_flags(fsxattr.fsx_xflags);
3181 op_data->op_attr_flags = ll_inode_to_ext_flags(flags);
3182 if (fsxattr.fsx_xflags & FS_XFLAG_PROJINHERIT)
3183 op_data->op_attr_flags |= LUSTRE_PROJINHERIT_FL;
3184 op_data->op_projid = fsxattr.fsx_projid;
3185 op_data->op_xvalid |= OP_XVALID_PROJID | OP_XVALID_FLAGS;
3186 rc = md_setattr(ll_i2sbi(inode)->ll_md_exp, op_data, NULL,
3188 ptlrpc_req_finished(req);
3190 GOTO(out_fsxattr, rc);
3191 ll_update_inode_flags(inode, op_data->op_attr_flags);
/* No cl_object means no OST objects to update; done. */
3192 obj = ll_i2info(inode)->lli_clob;
3194 GOTO(out_fsxattr, rc);
3196 OBD_ALLOC_PTR(attr);
3198 GOTO(out_fsxattr, rc = -ENOMEM);
3200 rc = cl_setattr_ost(obj, attr, OP_XVALID_FLAGS,
3201 fsxattr.fsx_xflags);
/* out_fsxattr label (elided) frees attr before this cleanup. */
3204 ll_finish_md_op_data(op_data);
/* LL_LEASE_UNLCK path of ll_file_set_lease(): take the fd's lease handle,
 * then close it with an optional intent selected by ioc->lil_flags —
 * resync-done, layout merge, layout split, or PCC attach — packing the
 * intent payload read from userspace at @arg. Returns the lease type that
 * was held. Many declarations and error labels are elided.
 * NOTE(review): "¶m" below appears to be mojibake for "&param" from the
 * extraction of this chunk — confirm against the original file. */
3208 static long ll_file_unlock_lease(struct file *file, struct ll_ioc_lease *ioc,
3211 struct inode *inode = file_inode(file);
3212 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3213 struct ll_inode_info *lli = ll_i2info(inode);
3214 struct obd_client_handle *och = NULL;
3215 struct split_param sp;
3216 struct pcc_param param;
3217 bool lease_broken = false;
3219 enum mds_op_bias bias = 0;
3220 struct file *layout_file = NULL;
3222 size_t data_size = 0;
3223 bool attached = false;
/* Detach the lease handle from the fd under the och mutex. */
3228 mutex_lock(&lli->lli_och_mutex);
3229 if (fd->fd_lease_och != NULL) {
3230 och = fd->fd_lease_och;
3231 fd->fd_lease_och = NULL;
3233 mutex_unlock(&lli->lli_och_mutex);
3238 fmode = och->och_flags;
3240 switch (ioc->lil_flags) {
3241 case LL_LEASE_RESYNC_DONE:
3242 if (ioc->lil_count > IOC_IDS_MAX)
3243 GOTO(out_lease_close, rc = -EINVAL);
3245 data_size = offsetof(typeof(*ioc), lil_ids[ioc->lil_count]);
3246 OBD_ALLOC(data, data_size);
3248 GOTO(out_lease_close, rc = -ENOMEM);
3250 if (copy_from_user(data, (void __user *)arg, data_size))
3251 GOTO(out_lease_close, rc = -EFAULT);
3253 bias = MDS_CLOSE_RESYNC_DONE;
3255 case LL_LEASE_LAYOUT_MERGE: {
3258 if (ioc->lil_count != 1)
3259 GOTO(out_lease_close, rc = -EINVAL);
/* The victim fd follows the ioc header in the userspace buffer. */
3261 arg += sizeof(*ioc);
3262 if (copy_from_user(&fd, (void __user *)arg, sizeof(__u32)))
3263 GOTO(out_lease_close, rc = -EFAULT);
3265 layout_file = fget(fd);
3267 GOTO(out_lease_close, rc = -EBADF);
3269 if ((file->f_flags & O_ACCMODE) == O_RDONLY ||
3270 (layout_file->f_flags & O_ACCMODE) == O_RDONLY)
3271 GOTO(out_lease_close, rc = -EPERM);
3273 data = file_inode(layout_file);
3274 bias = MDS_CLOSE_LAYOUT_MERGE;
3277 case LL_LEASE_LAYOUT_SPLIT: {
3281 if (ioc->lil_count != 2)
3282 GOTO(out_lease_close, rc = -EINVAL);
3284 arg += sizeof(*ioc);
3285 if (copy_from_user(&fdv, (void __user *)arg, sizeof(__u32)))
3286 GOTO(out_lease_close, rc = -EFAULT);
3288 arg += sizeof(__u32);
3289 if (copy_from_user(&mirror_id, (void __user *)arg,
3291 GOTO(out_lease_close, rc = -EFAULT);
3293 layout_file = fget(fdv);
3295 GOTO(out_lease_close, rc = -EBADF);
3297 sp.sp_inode = file_inode(layout_file);
3298 sp.sp_mirror_id = (__u16)mirror_id;
3300 bias = MDS_CLOSE_LAYOUT_SPLIT;
3303 case LL_LEASE_PCC_ATTACH:
3304 if (ioc->lil_count != 1)
3307 arg += sizeof(*ioc);
3308 if (copy_from_user(¶m.pa_archive_id, (void __user *)arg,
3310 GOTO(out_lease_close, rc2 = -EFAULT);
3312 rc2 = pcc_readwrite_attach(file, inode, param.pa_archive_id);
3314 GOTO(out_lease_close, rc2);
3317 /* Grab latest data version */
3318 rc2 = ll_data_version(inode, ¶m.pa_data_version,
3321 GOTO(out_lease_close, rc2);
3324 bias = MDS_PCC_ATTACH;
3327 /* without close intent */
/* Close with the selected intent; @data carries the intent payload. */
3332 rc = ll_lease_close_intent(och, inode, &lease_broken, bias, data);
3336 rc = ll_lease_och_release(inode, file);
/* Post-close cleanup per intent type (labels elided). */
3345 switch (ioc->lil_flags) {
3346 case LL_LEASE_RESYNC_DONE:
3348 OBD_FREE(data, data_size);
3350 case LL_LEASE_LAYOUT_MERGE:
3351 case LL_LEASE_LAYOUT_SPLIT:
3355 case LL_LEASE_PCC_ATTACH:
3358 rc = pcc_readwrite_attach_fini(file, inode,
3359 param.pa_layout_gen,
/* Report the lease type that was held before the close. */
3366 rc = ll_lease_type_from_fmode(fmode);
/* Acquire (or via UNLCK, release) a file lease: validate the requested
 * mode against the fd's open mode, open the lease (optionally with
 * MDS_OPEN_RESYNC), then stash the handle in fd->fd_lease_och. If another
 * lease is already installed the new one is closed again. */
3370 static long ll_file_set_lease(struct file *file, struct ll_ioc_lease *ioc,
3373 struct inode *inode = file_inode(file);
3374 struct ll_inode_info *lli = ll_i2info(inode);
3375 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3376 struct obd_client_handle *och = NULL;
3377 __u64 open_flags = 0;
3383 switch (ioc->lil_mode) {
3384 case LL_LEASE_WRLCK:
3385 if (!(file->f_mode & FMODE_WRITE))
3387 fmode = FMODE_WRITE;
3389 case LL_LEASE_RDLCK:
3390 if (!(file->f_mode & FMODE_READ))
3394 case LL_LEASE_UNLCK:
3395 RETURN(ll_file_unlock_lease(file, ioc, arg));
3400 CDEBUG(D_INODE, "Set lease with mode %u\n", fmode);
3402 /* apply for lease */
3403 if (ioc->lil_flags & LL_LEASE_RESYNC)
3404 open_flags = MDS_OPEN_RESYNC;
3405 och = ll_lease_open(inode, file, fmode, open_flags);
3407 RETURN(PTR_ERR(och));
3409 if (ioc->lil_flags & LL_LEASE_RESYNC) {
3410 rc = ll_lease_file_resync(och, inode, arg);
/* On resync/refresh failure, drop the freshly-opened lease. */
3412 ll_lease_close(och, inode, NULL);
3415 rc = ll_layout_refresh(inode, &fd->fd_layout_version);
3417 ll_lease_close(och, inode, NULL);
/* Install the lease on the fd only if no lease is already held. */
3423 mutex_lock(&lli->lli_och_mutex);
3424 if (fd->fd_lease_och == NULL) {
3425 fd->fd_lease_och = och;
3428 mutex_unlock(&lli->lli_och_mutex);
3430 /* impossible now that only excl is supported for now */
3431 ll_lease_close(och, inode, &lease_broken);
/* Snapshot the inode's access-heat instances into @heat, decayed to the
 * current time, under lli_heat_lock. */
3437 static void ll_heat_get(struct inode *inode, struct lu_heat *heat)
3439 struct ll_inode_info *lli = ll_i2info(inode);
3440 struct ll_sb_info *sbi = ll_i2sbi(inode);
3441 __u64 now = ktime_get_real_seconds();
3444 spin_lock(&lli->lli_heat_lock);
3445 heat->lh_flags = lli->lli_heat_flags;
3446 for (i = 0; i < heat->lh_count; i++)
3447 heat->lh_heat[i] = obd_heat_get(&lli->lli_heat_instances[i],
3448 now, sbi->ll_heat_decay_weight,
3449 sbi->ll_heat_period_second);
3450 spin_unlock(&lli->lli_heat_lock);
/* Apply heat control flags to the inode: CLEAR resets all instances,
 * OFF toggles accounting off/on. All under lli_heat_lock. */
3453 static int ll_heat_set(struct inode *inode, enum lu_heat_flag flags)
3455 struct ll_inode_info *lli = ll_i2info(inode);
3458 spin_lock(&lli->lli_heat_lock);
3459 if (flags & LU_HEAT_FLAG_CLEAR)
3460 obd_heat_clear(lli->lli_heat_instances, OBD_HEAT_COUNT);
3462 if (flags & LU_HEAT_FLAG_OFF)
3463 lli->lli_heat_flags |= LU_HEAT_FLAG_OFF;
3465 lli->lli_heat_flags &= ~LU_HEAT_FLAG_OFF;
3467 spin_unlock(&lli->lli_heat_lock);
/* Main ioctl dispatcher for regular Lustre files: a large switch over cmd
 * that either handles the request inline (flags, leases, heat, PCC) or
 * delegates to a helper / obd_iocontrol(). Unknown commands fall through
 * to the data export at the bottom. Many case bodies have elided lines
 * (allocations, default labels, RETURNs). */
3473 ll_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3475 struct inode *inode = file_inode(file);
3476 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
3480 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), cmd=%x\n",
3481 PFID(ll_inode2fid(inode)), inode, cmd);
3482 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_IOCTL, 1);
3484 /* asm-ppc{,64} declares TCGETS, et. al. as type 't' not 'T' */
3485 if (_IOC_TYPE(cmd) == 'T' || _IOC_TYPE(cmd) == 't') /* tty ioctls */
3489 case LL_IOC_GETFLAGS:
3490 /* Get the current value of the file flags */
3491 return put_user(fd->fd_flags, (int __user *)arg);
3492 case LL_IOC_SETFLAGS:
3493 case LL_IOC_CLRFLAGS:
3494 /* Set or clear specific file flags */
3495 /* XXX This probably needs checks to ensure the flags are
3496 * not abused, and to handle any flag side effects.
3498 if (get_user(flags, (int __user *) arg))
3501 if (cmd == LL_IOC_SETFLAGS) {
3502 if ((flags & LL_FILE_IGNORE_LOCK) &&
3503 !(file->f_flags & O_DIRECT)) {
3504 CERROR("%s: unable to disable locking on "
3505 "non-O_DIRECT file\n", current->comm);
3509 fd->fd_flags |= flags;
3511 fd->fd_flags &= ~flags;
/* Striping / layout commands. */
3514 case LL_IOC_LOV_SETSTRIPE:
3515 case LL_IOC_LOV_SETSTRIPE_NEW:
3516 RETURN(ll_lov_setstripe(inode, file, (void __user *)arg));
3517 case LL_IOC_LOV_SETEA:
3518 RETURN(ll_lov_setea(inode, file, (void __user *)arg));
3519 case LL_IOC_LOV_SWAP_LAYOUTS: {
3521 struct lustre_swap_layouts lsl;
3523 if (copy_from_user(&lsl, (char __user *)arg,
3524 sizeof(struct lustre_swap_layouts)))
3527 if ((file->f_flags & O_ACCMODE) == O_RDONLY)
3530 file2 = fget(lsl.sl_fd);
3534 /* O_WRONLY or O_RDWR */
3535 if ((file2->f_flags & O_ACCMODE) == O_RDONLY)
3536 GOTO(out, rc = -EPERM);
3538 if (lsl.sl_flags & SWAP_LAYOUTS_CLOSE) {
3539 struct inode *inode2;
3540 struct ll_inode_info *lli;
3541 struct obd_client_handle *och = NULL;
/* SWAP_LAYOUTS_CLOSE requires the fd to hold a lease; consume it. */
3543 lli = ll_i2info(inode);
3544 mutex_lock(&lli->lli_och_mutex);
3545 if (fd->fd_lease_och != NULL) {
3546 och = fd->fd_lease_och;
3547 fd->fd_lease_och = NULL;
3549 mutex_unlock(&lli->lli_och_mutex);
3551 GOTO(out, rc = -ENOLCK);
3552 inode2 = file_inode(file2);
3553 rc = ll_swap_layouts_close(och, inode, inode2);
3555 rc = ll_swap_layouts(file, file2, &lsl);
3561 case LL_IOC_LOV_GETSTRIPE:
3562 case LL_IOC_LOV_GETSTRIPE_NEW:
3563 RETURN(ll_file_getstripe(inode, (void __user *)arg, 0));
3564 case FS_IOC_GETFLAGS:
3565 case FS_IOC_SETFLAGS:
3566 RETURN(ll_iocontrol(inode, file, cmd, arg));
3567 case FSFILT_IOC_GETVERSION:
3568 case FS_IOC_GETVERSION:
3569 RETURN(put_user(inode->i_generation, (int __user *)arg));
3570 /* We need to special case any other ioctls we want to handle,
3571 * to send them to the MDS/OST as appropriate and to properly
3572 * network encode the arg field. */
3573 case FS_IOC_SETVERSION:
3576 case LL_IOC_GROUP_LOCK:
3577 RETURN(ll_get_grouplock(inode, file, arg));
3578 case LL_IOC_GROUP_UNLOCK:
3579 RETURN(ll_put_grouplock(inode, file, arg));
3580 case IOC_OBD_STATFS:
3581 RETURN(ll_obd_statfs(inode, (void __user *)arg));
3583 case LL_IOC_FLUSHCTX:
3584 RETURN(ll_flush_ctx(inode));
3585 case LL_IOC_PATH2FID: {
3586 if (copy_to_user((void __user *)arg, ll_inode2fid(inode),
3587 sizeof(struct lu_fid)))
3592 case LL_IOC_GETPARENT:
3593 RETURN(ll_getparent(file, (struct getparent __user *)arg));
3595 case OBD_IOC_FID2PATH:
3596 RETURN(ll_fid2path(inode, (void __user *)arg));
3597 case LL_IOC_DATA_VERSION: {
3598 struct ioc_data_version idv;
3601 if (copy_from_user(&idv, (char __user *)arg, sizeof(idv)))
/* Mask off any flag bits userspace is not allowed to pass. */
3604 idv.idv_flags &= LL_DV_RD_FLUSH | LL_DV_WR_FLUSH;
3605 rc = ll_ioc_data_version(inode, &idv);
3608 copy_to_user((char __user *)arg, &idv, sizeof(idv)))
3614 case LL_IOC_GET_MDTIDX: {
3617 mdtidx = ll_get_mdt_idx(inode);
3621 if (put_user((int)mdtidx, (int __user *)arg))
3626 case OBD_IOC_GETDTNAME:
3627 case OBD_IOC_GETMDNAME:
3628 RETURN(ll_get_obd_name(inode, cmd, arg));
/* HSM state/action queries go to the MDT via obd_iocontrol. */
3629 case LL_IOC_HSM_STATE_GET: {
3630 struct md_op_data *op_data;
3631 struct hsm_user_state *hus;
3638 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3639 LUSTRE_OPC_ANY, hus);
3640 if (IS_ERR(op_data)) {
3642 RETURN(PTR_ERR(op_data));
3645 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3648 if (copy_to_user((void __user *)arg, hus, sizeof(*hus)))
3651 ll_finish_md_op_data(op_data);
3655 case LL_IOC_HSM_STATE_SET: {
3656 struct hsm_state_set *hss;
3663 if (copy_from_user(hss, (char __user *)arg, sizeof(*hss))) {
3668 rc = ll_hsm_state_set(inode, hss);
3673 case LL_IOC_HSM_ACTION: {
3674 struct md_op_data *op_data;
3675 struct hsm_current_action *hca;
3682 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
3683 LUSTRE_OPC_ANY, hca);
3684 if (IS_ERR(op_data)) {
3686 RETURN(PTR_ERR(op_data));
3689 rc = obd_iocontrol(cmd, ll_i2mdexp(inode), sizeof(*op_data),
3692 if (copy_to_user((char __user *)arg, hca, sizeof(*hca)))
3695 ll_finish_md_op_data(op_data);
/* Lease management. SET_LEASE_OLD packs the mode directly in arg. */
3699 case LL_IOC_SET_LEASE_OLD: {
3700 struct ll_ioc_lease ioc = { .lil_mode = (__u32)arg };
3702 RETURN(ll_file_set_lease(file, &ioc, 0));
3704 case LL_IOC_SET_LEASE: {
3705 struct ll_ioc_lease ioc;
3707 if (copy_from_user(&ioc, (void __user *)arg, sizeof(ioc)))
3710 RETURN(ll_file_set_lease(file, &ioc, arg));
3712 case LL_IOC_GET_LEASE: {
3713 struct ll_inode_info *lli = ll_i2info(inode);
3714 struct ldlm_lock *lock = NULL;
3717 mutex_lock(&lli->lli_och_mutex);
3718 if (fd->fd_lease_och != NULL) {
3719 struct obd_client_handle *och = fd->fd_lease_och;
3721 lock = ldlm_handle2lock(&och->och_lease_handle);
/* Only report a lease whose lock has not been cancelled. */
3723 lock_res_and_lock(lock);
3724 if (!ldlm_is_cancel(lock))
3725 fmode = och->och_flags;
3727 unlock_res_and_lock(lock);
3728 LDLM_LOCK_PUT(lock);
3731 mutex_unlock(&lli->lli_och_mutex);
3733 RETURN(ll_lease_type_from_fmode(fmode));
3735 case LL_IOC_HSM_IMPORT: {
3736 struct hsm_user_import *hui;
3742 if (copy_from_user(hui, (void __user *)arg, sizeof(*hui))) {
3747 rc = ll_hsm_import(inode, file, hui);
3752 case LL_IOC_FUTIMES_3: {
3753 struct ll_futimes_3 lfu;
3755 if (copy_from_user(&lfu,
3756 (const struct ll_futimes_3 __user *)arg,
3760 RETURN(ll_file_futimes_3(file, &lfu));
3762 case LL_IOC_LADVISE: {
3763 struct llapi_ladvise_hdr *k_ladvise_hdr;
3764 struct llapi_ladvise_hdr __user *u_ladvise_hdr;
3767 int alloc_size = sizeof(*k_ladvise_hdr);
/* First copy just the header to learn lah_count, then reallocate
 * for the full advice array and copy again. */
3770 u_ladvise_hdr = (void __user *)arg;
3771 OBD_ALLOC_PTR(k_ladvise_hdr);
3772 if (k_ladvise_hdr == NULL)
3775 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3776 GOTO(out_ladvise, rc = -EFAULT);
3778 if (k_ladvise_hdr->lah_magic != LADVISE_MAGIC ||
3779 k_ladvise_hdr->lah_count < 1)
3780 GOTO(out_ladvise, rc = -EINVAL);
3782 num_advise = k_ladvise_hdr->lah_count;
3783 if (num_advise >= LAH_COUNT_MAX)
3784 GOTO(out_ladvise, rc = -EFBIG);
3786 OBD_FREE_PTR(k_ladvise_hdr);
3787 alloc_size = offsetof(typeof(*k_ladvise_hdr),
3788 lah_advise[num_advise]);
3789 OBD_ALLOC(k_ladvise_hdr, alloc_size);
3790 if (k_ladvise_hdr == NULL)
3794 * TODO: submit multiple advices to one server in a single RPC
3796 if (copy_from_user(k_ladvise_hdr, u_ladvise_hdr, alloc_size))
3797 GOTO(out_ladvise, rc = -EFAULT);
3799 for (i = 0; i < num_advise; i++) {
3800 struct llapi_lu_ladvise *k_ladvise =
3801 &k_ladvise_hdr->lah_advise[i];
3802 struct llapi_lu_ladvise __user *u_ladvise =
3803 &u_ladvise_hdr->lah_advise[i];
3805 rc = ll_ladvise_sanity(inode, k_ladvise);
3807 GOTO(out_ladvise, rc);
3809 switch (k_ladvise->lla_advice) {
3810 case LU_LADVISE_LOCKNOEXPAND:
3811 rc = ll_lock_noexpand(file,
3812 k_ladvise->lla_peradvice_flags);
3813 GOTO(out_ladvise, rc);
3814 case LU_LADVISE_LOCKAHEAD:
3816 rc = ll_file_lock_ahead(file, k_ladvise);
3819 GOTO(out_ladvise, rc);
/* Write the per-advice lockahead result back to userspace. */
3822 &u_ladvise->lla_lockahead_result))
3823 GOTO(out_ladvise, rc = -EFAULT);
3826 rc = ll_ladvise(inode, file,
3827 k_ladvise_hdr->lah_flags,
3830 GOTO(out_ladvise, rc);
3837 OBD_FREE(k_ladvise_hdr, alloc_size);
3840 case LL_IOC_FLR_SET_MIRROR: {
3841 /* mirror I/O must be direct to avoid polluting page cache
3843 if (!(file->f_flags & O_DIRECT))
3846 fd->fd_designated_mirror = (__u32)arg;
3849 case LL_IOC_FSGETXATTR:
3850 RETURN(ll_ioctl_fsgetxattr(inode, cmd, arg));
3851 case LL_IOC_FSSETXATTR:
3852 RETURN(ll_ioctl_fssetxattr(inode, cmd, arg));
3854 RETURN(put_user(PAGE_SIZE, (int __user *)arg));
3855 case LL_IOC_HEAT_GET: {
3856 struct lu_heat uheat;
3857 struct lu_heat *heat;
3860 if (copy_from_user(&uheat, (void __user *)arg, sizeof(uheat)))
/* Clamp the requested count to what the client tracks. */
3863 if (uheat.lh_count > OBD_HEAT_COUNT)
3864 uheat.lh_count = OBD_HEAT_COUNT;
3866 size = offsetof(typeof(uheat), lh_heat[uheat.lh_count]);
3867 OBD_ALLOC(heat, size);
3871 heat->lh_count = uheat.lh_count;
3872 ll_heat_get(inode, heat);
3873 rc = copy_to_user((char __user *)arg, heat, size);
3874 OBD_FREE(heat, size);
3875 RETURN(rc ? -EFAULT : 0);
3877 case LL_IOC_HEAT_SET: {
3880 if (copy_from_user(&flags, (void __user *)arg, sizeof(flags)))
3883 rc = ll_heat_set(inode, flags);
/* Persistent Client Cache (PCC) management. */
3886 case LL_IOC_PCC_DETACH: {
3887 struct lu_pcc_detach *detach;
3889 OBD_ALLOC_PTR(detach);
3893 if (copy_from_user(detach,
3894 (const struct lu_pcc_detach __user *)arg,
3896 GOTO(out_detach_free, rc = -EFAULT);
3898 if (!S_ISREG(inode->i_mode))
3899 GOTO(out_detach_free, rc = -EINVAL);
3901 if (!inode_owner_or_capable(inode))
3902 GOTO(out_detach_free, rc = -EPERM);
3904 rc = pcc_ioctl_detach(inode, detach->pccd_opt);
3906 OBD_FREE_PTR(detach);
3909 case LL_IOC_PCC_STATE: {
3910 struct lu_pcc_state __user *ustate =
3911 (struct lu_pcc_state __user *)arg;
3912 struct lu_pcc_state *state;
3914 OBD_ALLOC_PTR(state);
3918 if (copy_from_user(state, ustate, sizeof(*state)))
3919 GOTO(out_state, rc = -EFAULT);
3921 rc = pcc_ioctl_state(file, inode, state);
3923 GOTO(out_state, rc);
3925 if (copy_to_user(ustate, state, sizeof(*state)))
3926 GOTO(out_state, rc = -EFAULT);
3929 OBD_FREE_PTR(state);
/* default (elided): pass unknown commands to the data export. */
3933 RETURN(obd_iocontrol(cmd, ll_i2dtexp(inode), 0, NULL,
3934 (void __user *)arg));
3938 #ifndef HAVE_FILE_LLSEEK_SIZE
/* Commit a computed seek offset: range-check it, then update f_pos and
 * reset f_version if it actually moved. Local fallback for kernels
 * without generic_file_llseek_size(). Error returns are elided. */
3939 static inline loff_t
3940 llseek_execute(struct file *file, loff_t offset, loff_t maxsize)
3942 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
3944 if (offset > maxsize)
3947 if (offset != file->f_pos) {
3948 file->f_pos = offset;
3949 file->f_version = 0;
/* Fallback implementation of generic_file_llseek_size(): handle
 * SEEK_CUR/SEEK_END/SEEK_DATA/SEEK_HOLE against @eof and clamp to
 * @maxsize. The switch statement and several case labels are elided. */
3955 generic_file_llseek_size(struct file *file, loff_t offset, int origin,
3956 loff_t maxsize, loff_t eof)
3958 struct inode *inode = file_inode(file);
3966 * Here we special-case the lseek(fd, 0, SEEK_CUR)
3967 * position-querying operation. Avoid rewriting the "same"
3968 * f_pos value back to the file because a concurrent read(),
3969 * write() or lseek() might have altered it
3974 * f_lock protects against read/modify/write race with other
3975 * SEEK_CURs. Note that parallel writes and reads behave
3979 offset = llseek_execute(file, file->f_pos + offset, maxsize);
3980 inode_unlock(inode);
3984 * In the generic case the entire file is data, so as long as
3985 * offset isn't at the end of the file then the offset is data.
3992 * There is a virtual hole at the end of the file, so as long as
3993 * offset isn't i_size or larger, return i_size.
4001 return llseek_execute(file, offset, maxsize);
/* llseek for Lustre files: glimpse the size from the OSTs first for
 * SEEK_END/SEEK_HOLE/SEEK_DATA so the seek uses an up-to-date i_size. */
4005 static loff_t ll_file_seek(struct file *file, loff_t offset, int origin)
4007 struct inode *inode = file_inode(file);
4008 loff_t retval, eof = 0;
/* retval here is only the target offset, precomputed for the trace. */
4011 retval = offset + ((origin == SEEK_END) ? i_size_read(inode) :
4012 (origin == SEEK_CUR) ? file->f_pos : 0);
4013 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), to=%llu=%#llx(%d)\n",
4014 PFID(ll_inode2fid(inode)), inode, retval, retval,
4016 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LLSEEK, 1);
4018 if (origin == SEEK_END || origin == SEEK_HOLE || origin == SEEK_DATA) {
4019 retval = ll_glimpse_size(inode);
4022 eof = i_size_read(inode);
4025 retval = ll_generic_file_llseek_size(file, offset, origin,
4026 ll_file_maxbytes(inode), eof);
/*
 * flush() handler, called on every close() of a file descriptor.
 * Reports (and clears) async writeback errors recorded against the
 * inode, unless the application was already told about the failure
 * via fd_write_failed.  Returns -EIO on pending error, 0 otherwise.
 */
4030 static int ll_flush(struct file *file, fl_owner_t id)
4032 struct inode *inode = file_inode(file);
4033 struct ll_inode_info *lli = ll_i2info(inode);
4034 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4037 LASSERT(!S_ISDIR(inode->i_mode));
4039 /* catch async errors that were recorded back when async writeback
4040 * failed for pages in this mapping. */
/* read-and-clear: the error is only reported once per inode */
4041 rc = lli->lli_async_rc;
4042 lli->lli_async_rc = 0;
4043 if (lli->lli_clob != NULL) {
4044 err = lov_read_and_clear_async_rc(lli->lli_clob);
4049 /* The application has been told write failure already.
4050 * Do not report failure again. */
4051 if (fd->fd_write_failed)
/* collapse any recorded error into -EIO for the VFS */
4053 return rc ? -EIO : 0;
/*
 * Flush [start, end] of @inode through the cl_io stack as a CIT_FSYNC
 * request.
 */
4057 * Called to make sure a portion of file has been written out.
4058 * if @mode is not CL_FSYNC_LOCAL, it will send OST_SYNC RPCs to OST.
4060 * Return how many pages have been written.
4062 int cl_sync_file_range(struct inode *inode, loff_t start, loff_t end,
4063 enum cl_fsync_mode mode, int ignore_layout)
4067 struct cl_fsync_io *fio;
/* reject any mode outside the four defined fsync modes */
4072 if (mode != CL_FSYNC_NONE && mode != CL_FSYNC_LOCAL &&
4073 mode != CL_FSYNC_DISCARD && mode != CL_FSYNC_ALL)
4076 env = cl_env_get(&refcheck);
4078 RETURN(PTR_ERR(env));
4080 io = vvp_env_thread_io(env);
4081 io->ci_obj = ll_i2info(inode)->lli_clob;
4082 io->ci_ignore_layout = ignore_layout;
4084 /* initialize parameters for sync */
4085 fio = &io->u.ci_fsync;
4086 fio->fi_start = start;
4088 fio->fi_fid = ll_inode2fid(inode);
4089 fio->fi_mode = mode;
4090 fio->fi_nr_written = 0;
/* run the io loop; on init failure fall back to ci_result */
4092 if (cl_io_init(env, io, CIT_FSYNC, io->ci_obj) == 0)
4093 result = cl_io_loop(env, io);
4095 result = io->ci_result;
/* success: report the number of pages written, per the contract above */
4097 result = fio->fi_nr_written;
4098 cl_io_fini(env, io);
4099 cl_env_put(env, &refcheck);
/*
 * fsync()/fdatasync() handler: flush local dirty pages, report stored
 * async errors, fsync metadata on the MDT, then sync file data (via PCC
 * when the file is cached there, otherwise via cl_sync_file_range()).
 */
4105 * When dentry is provided (the 'else' case), file_dentry() may be
4106 * null and dentry must be used directly rather than pulled from
4107 * file_dentry() as is done otherwise.
4110 int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync)
4112 struct dentry *dentry = file_dentry(file);
4113 struct inode *inode = dentry->d_inode;
4114 struct ll_inode_info *lli = ll_i2info(inode);
4115 struct ptlrpc_request *req;
4120 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), start %lld, end %lld,"
4122 PFID(ll_inode2fid(inode)), inode, start, end, datasync);
4124 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1);
4126 /* fsync's caller has already called _fdata{sync,write}, we want
4127 * that IO to finish before calling the osc and mdc sync methods */
4128 rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
4131 /* catch async errors that were recorded back when async writeback
4132 * failed for pages in this mapping. */
4133 if (!S_ISDIR(inode->i_mode)) {
/* read-and-clear, same single-report policy as ll_flush() */
4134 err = lli->lli_async_rc;
4135 lli->lli_async_rc = 0;
4138 if (lli->lli_clob != NULL) {
4139 err = lov_read_and_clear_async_rc(lli->lli_clob);
/* sync the inode's metadata on the MDT */
4145 err = md_fsync(ll_i2sbi(inode)->ll_md_exp, ll_inode2fid(inode), &req);
4149 ptlrpc_req_finished(req);
4151 if (S_ISREG(inode->i_mode)) {
4152 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4155 /* Sync metadata on MDT first, and then sync the cached data
/* PCC-cached files are synced via the cache backend */
4158 err = pcc_fsync(file, start, end, datasync, &cached);
4160 err = cl_sync_file_range(inode, start, end,
/* remember write failure so close()/flush() does not re-report it */
4162 if (rc == 0 && err < 0)
4165 fd->fd_write_failed = true;
4167 fd->fd_write_failed = false;
4170 inode_unlock(inode);
/*
 * flock()/fcntl() lock handler shared by .flock and .lock.  Translates
 * the kernel file_lock into an LDLM_FLOCK policy, enqueues it on the
 * MDS, then mirrors the result into the local lock lists so the VFS
 * state stays consistent with the cluster-wide lock.
 */
4175 ll_file_flock(struct file *file, int cmd, struct file_lock *file_lock)
4177 struct inode *inode = file_inode(file);
4178 struct ll_sb_info *sbi = ll_i2sbi(inode);
4179 struct ldlm_enqueue_info einfo = {
4180 .ei_type = LDLM_FLOCK,
4181 .ei_cb_cp = ldlm_flock_completion_ast,
4182 .ei_cbdata = file_lock,
4184 struct md_op_data *op_data;
4185 struct lustre_handle lockh = { 0 };
4186 union ldlm_policy_data flock = { { 0 } };
/* remember the caller's lock type; restored after the enqueue below */
4187 int fl_type = file_lock->fl_type;
4193 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID" file_lock=%p\n",
4194 PFID(ll_inode2fid(inode)), file_lock);
4196 ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FLOCK, 1);
4198 if (file_lock->fl_flags & FL_FLOCK) {
4199 LASSERT((cmd == F_SETLKW) || (cmd == F_SETLK));
4200 /* flocks are whole-file locks */
4201 flock.l_flock.end = OFFSET_MAX;
4202 /* For flocks owner is determined by the local file desctiptor*/
4203 flock.l_flock.owner = (unsigned long)file_lock->fl_file;
4204 } else if (file_lock->fl_flags & FL_POSIX) {
/* POSIX byte-range locks carry owner and explicit [start, end] */
4205 flock.l_flock.owner = (unsigned long)file_lock->fl_owner;
4206 flock.l_flock.start = file_lock->fl_start;
4207 flock.l_flock.end = file_lock->fl_end;
4211 flock.l_flock.pid = file_lock->fl_pid;
4213 /* Somewhat ugly workaround for svc lockd.
4214 * lockd installs custom fl_lmops->lm_compare_owner that checks
4215 * for the fl_owner to be the same (which it always is on local node
4216 * I guess between lockd processes) and then compares pid.
4217 * As such we assign pid to the owner field to make it all work,
4218 * conflict with normal locks is unlikely since pid space and
4219 * pointer space for current->files are not intersecting */
4220 if (file_lock->fl_lmops && file_lock->fl_lmops->lm_compare_owner)
4221 flock.l_flock.owner = (unsigned long)file_lock->fl_pid;
/* map fcntl lock type to an LDLM mode: read -> PR */
4225 einfo.ei_mode = LCK_PR;
4228 /* An unlock request may or may not have any relation to
4229 * existing locks so we may not be able to pass a lock handle
4230 * via a normal ldlm_lock_cancel() request. The request may even
4231 * unlock a byte range in the middle of an existing lock. In
4232 * order to process an unlock request we need all of the same
4233 * information that is given with a normal read or write record
4234 * lock request. To avoid creating another ldlm unlock (cancel)
4235 * message we'll treat a LCK_NL flock request as an unlock. */
4236 einfo.ei_mode = LCK_NL;
/* write lock -> PW */
4239 einfo.ei_mode = LCK_PW;
4242 CDEBUG(D_INFO, "Unknown fcntl lock type: %d\n", fl_type);
/* non-blocking set: fail instead of waiting on conflict */
4257 flags = LDLM_FL_BLOCK_NOWAIT;
/* F_GETLK-style query: test only, take no lock */
4263 flags = LDLM_FL_TEST_LOCK;
4266 CERROR("unknown fcntl lock command: %d\n", cmd);
4270 /* Save the old mode so that if the mode in the lock changes we
4271 * can decrement the appropriate reader or writer refcount. */
4272 file_lock->fl_type = einfo.ei_mode;
4274 op_data = ll_prep_md_op_data(NULL, inode, NULL, NULL, 0, 0,
4275 LUSTRE_OPC_ANY, NULL);
4276 if (IS_ERR(op_data))
4277 RETURN(PTR_ERR(op_data));
4279 CDEBUG(D_DLMTRACE, "inode="DFID", pid=%u, flags=%#llx, mode=%u, "
4280 "start=%llu, end=%llu\n", PFID(ll_inode2fid(inode)),
4281 flock.l_flock.pid, flags, einfo.ei_mode,
4282 flock.l_flock.start, flock.l_flock.end);
/* take the cluster-wide flock via the MDS */
4284 rc = md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data, &lockh,
4287 /* Restore the file lock type if not TEST lock. */
4288 if (!(flags & LDLM_FL_TEST_LOCK))
4289 file_lock->fl_type = fl_type;
4291 #ifdef HAVE_LOCKS_LOCK_FILE_WAIT
/* mirror the granted/released lock into the local VFS lock lists */
4292 if ((rc == 0 || file_lock->fl_type == F_UNLCK) &&
4293 !(flags & LDLM_FL_TEST_LOCK))
4294 rc2 = locks_lock_file_wait(file, file_lock);
4296 if ((file_lock->fl_flags & FL_FLOCK) &&
4297 (rc == 0 || file_lock->fl_type == F_UNLCK))
4298 rc2 = flock_lock_file_wait(file, file_lock);
4299 if ((file_lock->fl_flags & FL_POSIX) &&
4300 (rc == 0 || file_lock->fl_type == F_UNLCK) &&
4301 !(flags & LDLM_FL_TEST_LOCK))
4302 rc2 = posix_lock_file_wait(file, file_lock);
4303 #endif /* HAVE_LOCKS_LOCK_FILE_WAIT */
/* local bookkeeping failed: drop the server lock again (LCK_NL) */
4305 if (rc2 && file_lock->fl_type != F_UNLCK) {
4306 einfo.ei_mode = LCK_NL;
4307 md_enqueue(sbi->ll_md_exp, &einfo, &flock, op_data,
4312 ll_finish_md_op_data(op_data);
/*
 * Look up the FID of @name under @parent via an MDS getattr-by-name.
 * On success *fid is filled; when @inode is non-NULL the inode is also
 * instantiated from the reply.  Caller owns any inode reference taken
 * by ll_prep_inode() -- TODO confirm against callers.
 */
4317 int ll_get_fid_by_name(struct inode *parent, const char *name,
4318 int namelen, struct lu_fid *fid,
4319 struct inode **inode)
4321 struct md_op_data *op_data = NULL;
4322 struct mdt_body *body;
4323 struct ptlrpc_request *req;
4327 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen, 0,
4328 LUSTRE_OPC_ANY, NULL);
4329 if (IS_ERR(op_data))
4330 RETURN(PTR_ERR(op_data));
/* only FID and type are needed from the server */
4332 op_data->op_valid = OBD_MD_FLID | OBD_MD_FLTYPE;
4333 rc = md_getattr_name(ll_i2sbi(parent)->ll_md_exp, op_data, &req);
4334 ll_finish_md_op_data(op_data);
4338 body = req_capsule_server_get(&req->rq_pill, &RMF_MDT_BODY);
4340 GOTO(out_req, rc = -EFAULT);
4342 *fid = body->mbo_fid1;
/* optionally build the inode from the same reply */
4345 rc = ll_prep_inode(inode, req, parent->i_sb, NULL);
4347 ptlrpc_req_finished(req);
/*
 * Migrate child @name of @parent to another MDT (lfs migrate).  Finds
 * the child inode, guards against unsupported striped-dir migration and
 * migrating the fs root, takes a write lease on regular files to freeze
 * IO, then issues the migration as an md_rename() with CLI_MIGRATE.
 * Retried by the caller on -EAGAIN when the lease was cancelled.
 */
4351 int ll_migrate(struct inode *parent, struct file *file, struct lmv_user_md *lum,
4354 struct dentry *dchild = NULL;
4355 struct inode *child_inode = NULL;
4356 struct md_op_data *op_data;
4357 struct ptlrpc_request *request = NULL;
4358 struct obd_client_handle *och = NULL;
4360 struct mdt_body *body;
4361 __u64 data_version = 0;
4362 size_t namelen = strlen(name);
4363 int lumlen = lmv_user_md_size(lum->lum_stripe_count, lum->lum_magic);
4367 CDEBUG(D_VFSTRACE, "migrate "DFID"/%s to MDT%04x stripe count %d\n",
4368 PFID(ll_inode2fid(parent)), name,
4369 lum->lum_stripe_offset, lum->lum_stripe_count);
/* normalize endianness of the user-supplied LMV descriptor */
4371 if (lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC) &&
4372 lum->lum_magic != cpu_to_le32(LMV_USER_MAGIC_SPECIFIC))
4373 lustre_swab_lmv_user_md(lum);
4375 /* Get child FID first */
4376 qstr.hash = ll_full_name_hash(file_dentry(file), name, namelen);
/* prefer the dcache; fall back to an MDS lookup below */
4379 dchild = d_lookup(file_dentry(file), &qstr);
4381 if (dchild->d_inode)
4382 child_inode = igrab(dchild->d_inode);
4387 rc = ll_get_fid_by_name(parent, name, namelen, NULL,
/* old MDTs cannot migrate striped directories */
4396 if (!(exp_connect_flags2(ll_i2sbi(parent)->ll_md_exp) &
4397 OBD_CONNECT2_DIR_MIGRATE)) {
4398 if (le32_to_cpu(lum->lum_stripe_count) > 1 ||
4399 ll_dir_striped(child_inode)) {
4400 CERROR("%s: MDT doesn't support stripe directory "
4401 "migration!\n", ll_i2sbi(parent)->ll_fsname);
4402 GOTO(out_iput, rc = -EOPNOTSUPP);
4407 * lfs migrate command needs to be blocked on the client
4408 * by checking the migrate FID against the FID of the
4411 if (child_inode == parent->i_sb->s_root->d_inode)
4412 GOTO(out_iput, rc = -EINVAL);
4414 op_data = ll_prep_md_op_data(NULL, parent, NULL, name, namelen,
4415 child_inode->i_mode, LUSTRE_OPC_ANY, NULL);
4416 if (IS_ERR(op_data))
4417 GOTO(out_iput, rc = PTR_ERR(op_data));
4419 inode_lock(child_inode);
4420 op_data->op_fid3 = *ll_inode2fid(child_inode);
4421 if (!fid_is_sane(&op_data->op_fid3)) {
4422 CERROR("%s: migrate %s, but FID "DFID" is insane\n",
4423 ll_i2sbi(parent)->ll_fsname, name,
4424 PFID(&op_data->op_fid3));
4425 GOTO(out_unlock, rc = -EINVAL);
4428 op_data->op_cli_flags |= CLI_MIGRATE | CLI_SET_MEA;
4429 op_data->op_data = lum;
4430 op_data->op_data_size = lumlen;
/* regular file: take a write lease so concurrent IO aborts migration */
4433 if (S_ISREG(child_inode->i_mode)) {
4434 och = ll_lease_open(child_inode, NULL, FMODE_WRITE, 0);
4438 GOTO(out_unlock, rc);
4441 rc = ll_data_version(child_inode, &data_version,
4444 GOTO(out_close, rc);
4446 op_data->op_open_handle = och->och_open_handle;
4447 op_data->op_data_version = data_version;
4448 op_data->op_lease_handle = och->och_lease_handle;
4449 op_data->op_bias |= MDS_CLOSE_MIGRATE;
/* the open must not be replayed: migration invalidates it */
4451 spin_lock(&och->och_mod->mod_open_req->rq_lock);
4452 och->och_mod->mod_open_req->rq_replay = 0;
4453 spin_unlock(&och->och_mod->mod_open_req->rq_lock);
/* migration is transported as a same-name rename */
4456 rc = md_rename(ll_i2sbi(parent)->ll_md_exp, op_data, name, namelen,
4457 name, namelen, &request);
4459 LASSERT(request != NULL);
4460 ll_update_times(request, parent);
4463 if (rc == 0 || rc == -EAGAIN) {
4464 body = req_capsule_server_get(&request->rq_pill, &RMF_MDT_BODY);
4465 LASSERT(body != NULL);
4467 /* If the server does release layout lock, then we cleanup
4468 * the client och here, otherwise release it in out_close: */
4469 if (och && body->mbo_valid & OBD_MD_CLOSE_INTENT_EXECED) {
4470 obd_mod_put(och->och_mod);
4471 md_clear_open_replay_data(ll_i2sbi(parent)->ll_md_exp,
4473 och->och_open_handle.cookie = DEAD_HANDLE_MAGIC;
4479 if (request != NULL) {
4480 ptlrpc_req_finished(request);
4484 /* Try again if the lease has cancelled. */
4485 if (rc == -EAGAIN && S_ISREG(child_inode->i_mode))
4490 ll_lease_close(och, child_inode, NULL);
/* NOTE(review): elided context around clear_nlink(); presumably the
 * success path drops the stale local inode -- confirm in full source */
4492 clear_nlink(child_inode);
4494 inode_unlock(child_inode);
4495 ll_finish_md_op_data(op_data);
/*
 * Stub lock handler for -o noflock mounts: warns once per file that
 * flock is disabled.  NOTE(review): the error-return line is elided
 * from this view (the noflock handler fails the request).
 */
4502 ll_file_noflock(struct file *file, int cmd, struct file_lock *file_lock)
4504 struct ll_file_data *fd = LUSTRE_FPRIVATE(file);
4508 * In order to avoid flood of warning messages, only print one message
4509 * for one file. And the entire message rate on the client is limited
4510 * by CDEBUG_LIMIT too.
/* per-file once-only flag keeps the console quiet */
4512 if (!(fd->fd_flags & LL_FILE_FLOCK_WARNING)) {
4513 fd->fd_flags |= LL_FILE_FLOCK_WARNING;
4514 CDEBUG_LIMIT(D_TTY | D_CONSOLE,
4515 "flock disabled, mount with '-o [local]flock' to enable\r\n");
/*
 * Test (without taking references) which of the requested inodebit
 * locks are already held locally; matched bits are cleared from *bits.
 */
4521 * test if some locks matching bits and l_req_mode are acquired
4522 * - bits can be in different locks
4523 * - if found clear the common lock bits in *bits
4524 * - the bits not found, are kept in *bits
4526 * \param bits [IN] searched lock bits [IN]
4527 * \param l_req_mode [IN] searched lock mode
4528 * \retval boolean, true iff all bits are found
4530 int ll_have_md_lock(struct inode *inode, __u64 *bits, enum ldlm_mode l_req_mode)
4532 struct lustre_handle lockh;
4533 union ldlm_policy_data policy;
/* LCK_MINMODE means "any mode": match all four compat modes */
4534 enum ldlm_mode mode = (l_req_mode == LCK_MINMODE) ?
4535 (LCK_CR | LCK_CW | LCK_PR | LCK_PW) : l_req_mode;
4544 fid = &ll_i2info(inode)->lli_fid;
4545 CDEBUG(D_INFO, "trying to match res "DFID" mode %s\n", PFID(fid),
4546 ldlm_lockname[mode]);
/* TEST_LOCK: match without taking a reference on the found lock */
4548 flags = LDLM_FL_BLOCK_GRANTED | LDLM_FL_CBPENDING | LDLM_FL_TEST_LOCK;
4549 for (i = 0; i < MDS_INODELOCK_NUMBITS && *bits != 0; i++) {
/* probe one bit at a time; bits may live in different locks */
4550 policy.l_inodebits.bits = *bits & (1 << i);
4551 if (policy.l_inodebits.bits == 0)
4554 if (md_lock_match(ll_i2mdexp(inode), flags, fid, LDLM_IBITS,
4555 &policy, mode, &lockh)) {
4556 struct ldlm_lock *lock;
4558 lock = ldlm_handle2lock(&lockh);
4561 ~(lock->l_policy_data.l_inodebits.bits);
4562 LDLM_LOCK_PUT(lock);
4564 *bits &= ~policy.l_inodebits.bits;
/*
 * Match (and reference) a granted MDS inodebits lock covering @bits in
 * one of @mode; returns the matched mode (0 if none) with the handle
 * in *lockh.  Caller must drop the reference when done.
 */
4571 enum ldlm_mode ll_take_md_lock(struct inode *inode, __u64 bits,
4572 struct lustre_handle *lockh, __u64 flags,
4573 enum ldlm_mode mode)
4575 union ldlm_policy_data policy = { .l_inodebits = { bits } };
4580 fid = &ll_i2info(inode)->lli_fid;
4581 CDEBUG(D_INFO, "trying to match res "DFID"\n", PFID(fid));
4583 rc = md_lock_match(ll_i2mdexp(inode), LDLM_FL_BLOCK_GRANTED|flags,
4584 fid, LDLM_IBITS, &policy, mode, lockh);
/*
 * Post-process the result of an inode revalidation RPC: translate
 * -ENOENT on already-unlinked inodes into success (after fixing nlink)
 * and log any other failure.
 */
4589 static int ll_inode_revalidate_fini(struct inode *inode, int rc)
4591 /* Already unlinked. Just update nlink and return success */
4592 if (rc == -ENOENT) {
4594 /* If it is striped directory, and there is bad stripe
4595 * Let's revalidate the dentry again, instead of returning
4597 if (ll_dir_striped(inode))
4600 /* This path cannot be hit for regular files unless in
4601 * case of obscure races, so no need to to validate
4603 if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
4605 } else if (rc != 0) {
/* expected races (-EACCES/-EIDRM) log quietly; the rest loudly */
4606 CDEBUG_LIMIT((rc == -EACCES || rc == -EIDRM) ? D_INFO : D_ERROR,
4607 "%s: revalidate FID "DFID" error: rc = %d\n",
4608 ll_i2sbi(inode)->ll_fsname,
4609 PFID(ll_inode2fid(inode)), rc);
/*
 * Revalidate @dentry's inode against the MDS with an intent lock
 * (IT_GETATTR/IT_LOOKUP).  Unhashes the dentry when the server says
 * the file was unlinked.
 */
4615 static int ll_inode_revalidate(struct dentry *dentry, enum ldlm_intent_flags op)
4617 struct inode *inode = dentry->d_inode;
4618 struct obd_export *exp = ll_i2mdexp(inode);
4619 struct lookup_intent oit = {
4622 struct ptlrpc_request *req = NULL;
4623 struct md_op_data *op_data;
4627 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p),name=%s\n",
4628 PFID(ll_inode2fid(inode)), inode, dentry->d_name.name);
4630 /* Call getattr by fid, so do not provide name at all. */
4631 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL, 0, 0,
4632 LUSTRE_OPC_ANY, NULL);
4633 if (IS_ERR(op_data))
4634 RETURN(PTR_ERR(op_data));
4636 rc = md_intent_lock(exp, op_data, &oit, &req, &ll_md_blocking_ast, 0);
4637 ll_finish_md_op_data(op_data);
/* map -ENOENT etc. to the caller-visible result */
4639 rc = ll_inode_revalidate_fini(inode, rc);
4643 rc = ll_revalidate_it_finish(req, &oit, dentry);
4645 ll_intent_release(&oit);
4649 /* Unlinked? Unhash dentry, so it is not picked up later by
4650 * do_lookup() -> ll_revalidate_it(). We cannot use d_drop
4651 * here to preserve get_cwd functionality on 2.6.
4653 if (!dentry->d_inode->i_nlink) {
4654 spin_lock(&inode->i_lock);
4655 d_lustre_invalidate(dentry, 0);
4656 spin_unlock(&inode->i_lock);
/* transfer any granted lock from the intent to the dentry/inode */
4659 ll_lookup_finish_locks(&oit, dentry);
4661 ptlrpc_req_finished(req);
/*
 * For a striped directory, merge per-stripe attributes (nlink, blocks,
 * size, times) from all MDTs into the master inode.  No-op for
 * non-striped directories.
 */
4666 static int ll_merge_md_attr(struct inode *inode)
4668 struct ll_inode_info *lli = ll_i2info(inode);
4669 struct cl_attr attr = { 0 };
4672 LASSERT(lli->lli_lsm_md != NULL);
4674 if (!lmv_dir_striped(lli->lli_lsm_md))
/* hold lli_lsm_sem so the stripe layout cannot change underneath */
4677 down_read(&lli->lli_lsm_sem);
4678 rc = md_merge_attr(ll_i2mdexp(inode), ll_i2info(inode)->lli_lsm_md,
4679 &attr, ll_md_blocking_ast);
4680 up_read(&lli->lli_lsm_sem);
/* publish the merged attributes into the VFS inode */
4684 set_nlink(inode, attr.cat_nlink);
4685 inode->i_blocks = attr.cat_blocks;
4686 i_size_write(inode, attr.cat_size);
4688 ll_i2info(inode)->lli_atime = attr.cat_atime;
4689 ll_i2info(inode)->lli_mtime = attr.cat_mtime;
4690 ll_i2info(inode)->lli_ctime = attr.cat_ctime;
/*
 * Common getattr implementation: revalidate the inode on the MDS,
 * refresh size (PCC or OST glimpse) for regular files, merge stripe
 * attributes for striped dirs, then fill *stat from the inode.
 */
4695 int ll_getattr_dentry(struct dentry *de, struct kstat *stat)
4697 struct inode *inode = de->d_inode;
4698 struct ll_sb_info *sbi = ll_i2sbi(inode);
4699 struct ll_inode_info *lli = ll_i2info(inode);
4702 ll_stats_ops_tally(sbi, LPROC_LL_GETATTR, 1);
4704 rc = ll_inode_revalidate(de, IT_GETATTR);
4708 if (S_ISREG(inode->i_mode)) {
/* PCC-cached files get their attributes from the cache backend */
4711 rc = pcc_inode_getattr(inode, &cached);
4712 if (cached && rc < 0)
4715 /* In case of restore, the MDT has the right size and has
4716 * already send it back without granting the layout lock,
4717 * inode is up-to-date so glimpse is useless.
4718 * Also to glimpse we need the layout, in case of a running
4719 * restore the MDT holds the layout lock so the glimpse will
4720 * block up to the end of restore (getattr will block)
4722 if (!cached && !ll_file_test_flag(lli, LLIF_FILE_RESTORING)) {
4723 rc = ll_glimpse_size(inode);
4728 /* If object isn't regular a file then don't validate size. */
4729 if (ll_dir_striped(inode)) {
4730 rc = ll_merge_md_attr(inode);
/* non-regular files: times come straight from lli */
4735 inode->i_atime.tv_sec = lli->lli_atime;
4736 inode->i_mtime.tv_sec = lli->lli_mtime;
4737 inode->i_ctime.tv_sec = lli->lli_ctime;
/* fault-injection hook for testing delayed getattr */
4740 OBD_FAIL_TIMEOUT(OBD_FAIL_GETATTR_DELAY, 30);
4742 if (ll_need_32bit_api(sbi)) {
/* squash 128-bit FIDs / devs into 32-bit-safe values */
4743 stat->ino = cl_fid_build_ino(&lli->lli_fid, 1);
4744 stat->dev = ll_compat_encode_dev(inode->i_sb->s_dev);
4745 stat->rdev = ll_compat_encode_dev(inode->i_rdev);
4747 stat->ino = inode->i_ino;
4748 stat->dev = inode->i_sb->s_dev;
4749 stat->rdev = inode->i_rdev;
4752 stat->mode = inode->i_mode;
4753 stat->uid = inode->i_uid;
4754 stat->gid = inode->i_gid;
4755 stat->atime = inode->i_atime;
4756 stat->mtime = inode->i_mtime;
4757 stat->ctime = inode->i_ctime;
/* prefer the tunable stat blocksize when the admin configured one */
4758 stat->blksize = sbi->ll_stat_blksize ?: 1 << inode->i_blkbits;
4760 stat->nlink = inode->i_nlink;
4761 stat->size = i_size_read(inode);
4762 stat->blocks = inode->i_blocks;
/*
 * VFS ->getattr entry point; both kernel API variants (path-based
 * "enhanced" getattr and the older mnt/dentry form) funnel into
 * ll_getattr_dentry().
 */
4767 #ifdef HAVE_INODEOPS_ENHANCED_GETATTR
4768 int ll_getattr(const struct path *path, struct kstat *stat,
4769 u32 request_mask, unsigned int flags)
4771 struct dentry *de = path->dentry;
4773 int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat)
4776 return ll_getattr_dentry(de, stat);
/*
 * FIEMAP ioctl backend: marshal the kernel's fiemap_extent_info into a
 * contiguous struct fiemap (header + extent array), run ll_do_fiemap(),
 * and copy the mapped extents back to the user buffer.
 */
4779 static int ll_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
4780 __u64 start, __u64 len)
4784 struct fiemap *fiemap;
4785 unsigned int extent_count = fieinfo->fi_extents_max;
/* NOTE(review): elided source should overflow-check this multiply */
4787 num_bytes = sizeof(*fiemap) + (extent_count *
4788 sizeof(struct fiemap_extent));
4789 OBD_ALLOC_LARGE(fiemap, num_bytes);
4794 fiemap->fm_flags = fieinfo->fi_flags;
4795 fiemap->fm_extent_count = fieinfo->fi_extents_max;
4796 fiemap->fm_start = start;
4797 fiemap->fm_length = len;
/* first user extent may carry continuation state for the request */
4798 if (extent_count > 0 &&
4799 copy_from_user(&fiemap->fm_extents[0], fieinfo->fi_extents_start,
4800 sizeof(struct fiemap_extent)) != 0)
4801 GOTO(out, rc = -EFAULT);
4803 rc = ll_do_fiemap(inode, fiemap, num_bytes);
4805 fieinfo->fi_flags = fiemap->fm_flags;
4806 fieinfo->fi_extents_mapped = fiemap->fm_mapped_extents;
4807 if (extent_count > 0 &&
4808 copy_to_user(fieinfo->fi_extents_start, &fiemap->fm_extents[0],
4809 fiemap->fm_mapped_extents *
4810 sizeof(struct fiemap_extent)) != 0)
4811 GOTO(out, rc = -EFAULT);
4813 OBD_FREE_LARGE(fiemap, num_bytes);
/*
 * ->get_acl handler: hand out a referenced copy of the cached POSIX
 * ACL under lli_lock.
 */
4817 struct posix_acl *ll_get_acl(struct inode *inode, int type)
4819 struct ll_inode_info *lli = ll_i2info(inode);
4820 struct posix_acl *acl = NULL;
4823 spin_lock(&lli->lli_lock);
4824 /* VFS' acl_permission_check->check_acl will release the refcount */
4825 acl = posix_acl_dup(lli->lli_posix_acl);
4826 spin_unlock(&lli->lli_lock);
/*
 * ->set_acl handler: serialize the ACL to its xattr representation and
 * store (or remove, when @acl is NULL) it on the MDS via md_setxattr(),
 * then update the local ACL cache.
 */
4831 #ifdef HAVE_IOP_SET_ACL
4832 #ifdef CONFIG_FS_POSIX_ACL
4833 int ll_set_acl(struct inode *inode, struct posix_acl *acl, int type)
4835 struct ll_sb_info *sbi = ll_i2sbi(inode);
4836 struct ptlrpc_request *req = NULL;
4837 const char *name = NULL;
4839 size_t value_size = 0;
4844 case ACL_TYPE_ACCESS:
4845 name = XATTR_NAME_POSIX_ACL_ACCESS;
/* access ACL may also rewrite the file mode bits */
4847 rc = posix_acl_update_mode(inode, &inode->i_mode, &acl);
4850 case ACL_TYPE_DEFAULT:
4851 name = XATTR_NAME_POSIX_ACL_DEFAULT;
/* default ACLs only make sense on directories */
4852 if (!S_ISDIR(inode->i_mode))
4853 rc = acl ? -EACCES : 0;
4864 value_size = posix_acl_xattr_size(acl->a_count);
4865 value = kmalloc(value_size, GFP_NOFS);
4867 GOTO(out, rc = -ENOMEM);
4869 rc = posix_acl_to_xattr(&init_user_ns, acl, value, value_size);
4871 GOTO(out_value, rc);
/* NULL value means remove the xattr rather than set it */
4874 rc = md_setxattr(sbi->ll_md_exp, ll_inode2fid(inode),
4875 value ? OBD_MD_FLXATTR : OBD_MD_FLXATTRRM,
4876 name, value, value_size, 0, 0, &req);
4878 ptlrpc_req_finished(req);
/* keep the kernel's ACL cache in step with the server state */
4883 forget_cached_acl(inode, type);
4885 set_cached_acl(inode, type, acl);
4888 #endif /* CONFIG_FS_POSIX_ACL */
4889 #endif /* HAVE_IOP_SET_ACL */
/*
 * ->permission handler: revalidate the root inode when needed, apply
 * root-squash (substitute squashed fsuid/fsgid credentials and drop FS
 * capabilities), then defer to generic_permission().
 */
4891 int ll_inode_permission(struct inode *inode, int mask)
4894 struct ll_sb_info *sbi;
4895 struct root_squash_info *squash;
4896 struct cred *cred = NULL;
4897 const struct cred *old_cred = NULL;
4899 bool squash_id = false;
/* RCU-walk context cannot block on RPCs */
4902 if (mask & MAY_NOT_BLOCK)
4905 /* as root inode are NOT getting validated in lookup operation,
4906 * need to do it before permission check. */
4908 if (inode == inode->i_sb->s_root->d_inode) {
4909 rc = ll_inode_revalidate(inode->i_sb->s_root, IT_LOOKUP);
4914 CDEBUG(D_VFSTRACE, "VFS Op:inode="DFID"(%p), inode mode %x mask %o\n",
4915 PFID(ll_inode2fid(inode)), inode, inode->i_mode, mask);
4917 /* squash fsuid/fsgid if needed */
4918 sbi = ll_i2sbi(inode);
4919 squash = &sbi->ll_squash;
/* squash applies only to root, and only when not disabled per-mount */
4920 if (unlikely(squash->rsi_uid != 0 &&
4921 uid_eq(current_fsuid(), GLOBAL_ROOT_UID) &&
4922 !(sbi->ll_flags & LL_SBI_NOROOTSQUASH))) {
4926 CDEBUG(D_OTHER, "squash creds (%d:%d)=>(%d:%d)\n",
4927 __kuid_val(current_fsuid()), __kgid_val(current_fsgid()),
4928 squash->rsi_uid, squash->rsi_gid);
4930 /* update current process's credentials
4931 * and FS capability */
4932 cred = prepare_creds();
4936 cred->fsuid = make_kuid(&init_user_ns, squash->rsi_uid);
4937 cred->fsgid = make_kgid(&init_user_ns, squash->rsi_gid);
/* strip every filesystem-related capability from the squashed creds */
4938 for (cap = 0; cap < sizeof(cfs_cap_t) * 8; cap++) {
4939 if ((1 << cap) & CFS_CAP_FS_MASK)
4940 cap_lower(cred->cap_effective, cap);
4942 old_cred = override_creds(cred);
4945 ll_stats_ops_tally(sbi, LPROC_LL_INODE_PERM, 1);
4946 rc = generic_permission(inode, mask);
4947 /* restore current process's credentials and FS capability */
4949 revert_creds(old_cred);
/*
 * Default file_operations: no .flock/.lock methods, so the kernel
 * falls back to node-local locking (the "-o localflock" behaviour).
 */
4956 /* -o localflock - only provides locally consistent flock locks */
4957 struct file_operations ll_file_operations = {
4958 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4959 # ifdef HAVE_SYNC_READ_WRITE
4960 .read = new_sync_read,
4961 .write = new_sync_write,
4963 .read_iter = ll_file_read_iter,
4964 .write_iter = ll_file_write_iter,
4965 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4966 .read = ll_file_read,
4967 .aio_read = ll_file_aio_read,
4968 .write = ll_file_write,
4969 .aio_write = ll_file_aio_write,
4970 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4971 .unlocked_ioctl = ll_file_ioctl,
4972 .open = ll_file_open,
4973 .release = ll_file_release,
4974 .mmap = ll_file_mmap,
4975 .llseek = ll_file_seek,
4976 .splice_read = ll_file_splice_read,
/*
 * file_operations used for "-o flock" mounts: identical to the default
 * table plus cluster-coherent .flock/.lock via ll_file_flock().
 */
4981 struct file_operations ll_file_operations_flock = {
4982 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
4983 # ifdef HAVE_SYNC_READ_WRITE
4984 .read = new_sync_read,
4985 .write = new_sync_write,
4986 # endif /* HAVE_SYNC_READ_WRITE */
4987 .read_iter = ll_file_read_iter,
4988 .write_iter = ll_file_write_iter,
4989 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4990 .read = ll_file_read,
4991 .aio_read = ll_file_aio_read,
4992 .write = ll_file_write,
4993 .aio_write = ll_file_aio_write,
4994 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
4995 .unlocked_ioctl = ll_file_ioctl,
4996 .open = ll_file_open,
4997 .release = ll_file_release,
4998 .mmap = ll_file_mmap,
4999 .llseek = ll_file_seek,
5000 .splice_read = ll_file_splice_read,
5003 .flock = ll_file_flock,
5004 .lock = ll_file_flock
/*
 * file_operations for "-o noflock" mounts: lock methods are wired to
 * ll_file_noflock(), which refuses the request (see the handler above).
 */
5007 /* These are for -o noflock - to return ENOSYS on flock calls */
5008 struct file_operations ll_file_operations_noflock = {
5009 #ifdef HAVE_FILE_OPERATIONS_READ_WRITE_ITER
5010 # ifdef HAVE_SYNC_READ_WRITE
5011 .read = new_sync_read,
5012 .write = new_sync_write,
5013 # endif /* HAVE_SYNC_READ_WRITE */
5014 .read_iter = ll_file_read_iter,
5015 .write_iter = ll_file_write_iter,
5016 #else /* !HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5017 .read = ll_file_read,
5018 .aio_read = ll_file_aio_read,
5019 .write = ll_file_write,
5020 .aio_write = ll_file_aio_write,
5021 #endif /* HAVE_FILE_OPERATIONS_READ_WRITE_ITER */
5022 .unlocked_ioctl = ll_file_ioctl,
5023 .open = ll_file_open,
5024 .release = ll_file_release,
5025 .mmap = ll_file_mmap,
5026 .llseek = ll_file_seek,
5027 .splice_read = ll_file_splice_read,
5030 .flock = ll_file_noflock,
5031 .lock = ll_file_noflock
/*
 * inode_operations for regular Lustre files; xattr/ACL entries are
 * compiled in only where the running kernel supports those hooks.
 */
5034 struct inode_operations ll_file_inode_operations = {
5035 .setattr = ll_setattr,
5036 .getattr = ll_getattr,
5037 .permission = ll_inode_permission,
5038 #ifdef HAVE_IOP_XATTR
5039 .setxattr = ll_setxattr,
5040 .getxattr = ll_getxattr,
5041 .removexattr = ll_removexattr,
5043 .listxattr = ll_listxattr,
5044 .fiemap = ll_fiemap,
5045 #ifdef HAVE_IOP_GET_ACL
5046 .get_acl = ll_get_acl,
5048 #ifdef HAVE_IOP_SET_ACL
5049 .set_acl = ll_set_acl,
/*
 * Push a layout configuration into the cl_object stack.  For
 * OBJECT_CONF_SET the DLM layout lock is made matchable only after the
 * layout is applied, and the cached layout generation is refreshed.
 */
5053 int ll_layout_conf(struct inode *inode, const struct cl_object_conf *conf)
5055 struct ll_inode_info *lli = ll_i2info(inode);
5056 struct cl_object *obj = lli->lli_clob;
5065 env = cl_env_get(&refcheck);
5067 RETURN(PTR_ERR(env));
5069 rc = cl_conf_set(env, lli->lli_clob, conf);
5073 if (conf->coc_opc == OBJECT_CONF_SET) {
5074 struct ldlm_lock *lock = conf->coc_lock;
5075 struct cl_layout cl = {
5079 LASSERT(lock != NULL);
5080 LASSERT(ldlm_has_layout(lock));
5082 /* it can only be allowed to match after layout is
5083 * applied to inode otherwise false layout would be
5084 * seen. Applying layout shoud happen before dropping
5085 * the intent lock. */
5086 ldlm_lock_allow_match(lock);
/* record the new layout generation on the inode */
5088 rc = cl_object_layout_get(env, obj, &cl);
5093 DFID": layout version change: %u -> %u\n",
5094 PFID(&lli->lli_fid), ll_layout_version_get(lli),
5096 ll_layout_version_set(lli, cl.cl_layout_gen);
5100 cl_env_put(env, &refcheck);
/*
 * If @lock was granted via completion AST its LVB does not carry the
 * layout; fetch the LOV EA from the MDT with a getxattr RPC and attach
 * it to the lock as its LVB data.
 */
5105 /* Fetch layout from MDT with getxattr request, if it's not ready yet */
5106 static int ll_layout_fetch(struct inode *inode, struct ldlm_lock *lock)
5109 struct ll_sb_info *sbi = ll_i2sbi(inode);
5110 struct ptlrpc_request *req;
5117 CDEBUG(D_INODE, DFID" LVB_READY=%d l_lvb_data=%p l_lvb_len=%d\n",
5118 PFID(ll_inode2fid(inode)), ldlm_is_lvb_ready(lock),
5119 lock->l_lvb_data, lock->l_lvb_len);
/* already have LVB data: nothing to fetch */
5121 if (lock->l_lvb_data != NULL)
5124 /* if layout lock was granted right away, the layout is returned
5125 * within DLM_LVB of dlm reply; otherwise if the lock was ever
5126 * blocked and then granted via completion ast, we have to fetch
5127 * layout here. Please note that we can't use the LVB buffer in
5128 * completion AST because it doesn't have a large enough buffer */
5129 rc = ll_get_default_mdsize(sbi, &lmmsize);
5133 rc = md_getxattr(sbi->ll_md_exp, ll_inode2fid(inode), OBD_MD_FLXATTR,
5134 XATTR_NAME_LOV, lmmsize, &req);
5137 GOTO(out, rc = 0); /* empty layout */
5144 if (lmmsize == 0) /* empty layout */
5147 lmm = req_capsule_server_sized_get(&req->rq_pill, &RMF_EADATA, lmmsize);
5149 GOTO(out, rc = -EFAULT);
5151 OBD_ALLOC_LARGE(lvbdata, lmmsize);
5152 if (lvbdata == NULL)
5153 GOTO(out, rc = -ENOMEM);
5155 memcpy(lvbdata, lmm, lmmsize);
/* attach the fetched layout under the lock's resource lock; if someone
 * beat us to it, keep theirs and free our copy below */
5156 lock_res_and_lock(lock);
5157 if (unlikely(lock->l_lvb_data == NULL)) {
5158 lock->l_lvb_type = LVB_T_LAYOUT;
5159 lock->l_lvb_data = lvbdata;
5160 lock->l_lvb_len = lmmsize;
5163 unlock_res_and_lock(lock);
5166 OBD_FREE_LARGE(lvbdata, lmmsize);
5171 ptlrpc_req_finished(req);
/*
 * Apply the layout carried by @lockh's LVB to the inode's cl_object.
 * Fetches the layout first if the LVB is empty, then configures the
 * object; if the object is still busy (-EBUSY) wait for in-flight IO
 * using an OBJECT_CONF_WAIT round.  Always drops the lock reference.
 */
5176 * Apply the layout to the inode. Layout lock is held and will be released
5179 static int ll_layout_lock_set(struct lustre_handle *lockh, enum ldlm_mode mode,
5180 struct inode *inode)
5182 struct ll_inode_info *lli = ll_i2info(inode);
5183 struct ll_sb_info *sbi = ll_i2sbi(inode);
5184 struct ldlm_lock *lock;
5185 struct cl_object_conf conf;
5188 bool wait_layout = false;
5191 LASSERT(lustre_handle_is_used(lockh));
5193 lock = ldlm_handle2lock(lockh);
5194 LASSERT(lock != NULL);
5195 LASSERT(ldlm_has_layout(lock));
5197 LDLM_DEBUG(lock, "file "DFID"(%p) being reconfigured",
5198 PFID(&lli->lli_fid), inode);
5200 /* in case this is a caching lock and reinstate with new inode */
5201 md_set_lock_data(sbi->ll_md_exp, lockh, inode, NULL);
5203 lock_res_and_lock(lock);
5204 lvb_ready = ldlm_is_lvb_ready(lock);
5205 unlock_res_and_lock(lock);
5207 /* checking lvb_ready is racy but this is okay. The worst case is
5208 * that multi processes may configure the file on the same time. */
5212 rc = ll_layout_fetch(inode, lock);
5216 /* for layout lock, lmm is stored in lock's lvb.
5217 * lvb_data is immutable if the lock is held so it's safe to access it
5220 * set layout to file. Unlikely this will fail as old layout was
5221 * surely eliminated */
5222 memset(&conf, 0, sizeof conf);
5223 conf.coc_opc = OBJECT_CONF_SET;
5224 conf.coc_inode = inode;
5225 conf.coc_lock = lock;
5226 conf.u.coc_layout.lb_buf = lock->l_lvb_data;
5227 conf.u.coc_layout.lb_len = lock->l_lvb_len;
5228 rc = ll_layout_conf(inode, &conf);
5230 /* refresh layout failed, need to wait */
5231 wait_layout = rc == -EBUSY;
5234 LDLM_LOCK_PUT(lock);
5235 ldlm_lock_decref(lockh, mode);
5237 /* wait for IO to complete if it's still being used. */
5239 CDEBUG(D_INODE, "%s: "DFID"(%p) wait for layout reconf\n",
5240 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
/* CONF_WAIT blocks until the object stops using the old layout */
5242 memset(&conf, 0, sizeof conf);
5243 conf.coc_opc = OBJECT_CONF_WAIT;
5244 conf.coc_inode = inode;
5245 rc = ll_layout_conf(inode, &conf);
5249 CDEBUG(D_INODE, "%s file="DFID" waiting layout return: %d\n",
5250 sbi->ll_fsname, PFID(&lli->lli_fid), rc);
/*
 * Send an IT_LAYOUT intent-lock RPC to the MDS carrying @intent.
 * Write/truncate intents request FMODE_WRITE so the server grants a
 * layout suitable for modification.
 */
5256 * Issue layout intent RPC to MDS.
5257 * \param inode [in] file inode
5258 * \param intent [in] layout intent
5260 * \retval 0 on success
5261 * \retval < 0 error code
5263 static int ll_layout_intent(struct inode *inode, struct layout_intent *intent)
5265 struct ll_inode_info *lli = ll_i2info(inode);
5266 struct ll_sb_info *sbi = ll_i2sbi(inode);
5267 struct md_op_data *op_data;
5268 struct lookup_intent it;
5269 struct ptlrpc_request *req;
5273 op_data = ll_prep_md_op_data(NULL, inode, inode, NULL,
5274 0, 0, LUSTRE_OPC_ANY, NULL);
5275 if (IS_ERR(op_data))
5276 RETURN(PTR_ERR(op_data));
/* the intent payload travels in op_data */
5278 op_data->op_data = intent;
5279 op_data->op_data_size = sizeof(*intent);
5281 memset(&it, 0, sizeof(it));
5282 it.it_op = IT_LAYOUT;
5283 if (intent->li_opc == LAYOUT_INTENT_WRITE ||
5284 intent->li_opc == LAYOUT_INTENT_TRUNC)
5285 it.it_flags = FMODE_WRITE;
5287 LDLM_DEBUG_NOLOCK("%s: requeue layout lock for file "DFID"(%p)",
5288 sbi->ll_fsname, PFID(&lli->lli_fid), inode);
5290 rc = md_intent_lock(sbi->ll_md_exp, op_data, &it, &req,
5291 &ll_md_blocking_ast, 0);
5292 if (it.it_request != NULL)
5293 ptlrpc_req_finished(it.it_request);
5294 it.it_request = NULL;
5296 ll_finish_md_op_data(op_data);
5298 /* set lock data in case this is a new lock */
5300 ll_set_lock_data(sbi->ll_md_exp, inode, &it, NULL);
5302 ll_intent_drop_lock(&it);
5308 * This function checks if there exists a LAYOUT lock on the client side,
5309 * or enqueues it if it doesn't have one in cache.
5311 * This function will not hold layout lock so it may be revoked any time after
5312 * this function returns. Any operations depend on layout should be redone
5315 * This function should be called before lov_io_init() to get an uptodate
5316 * layout version, the caller should save the version number and after IO
5317 * is finished, this function should be called again to verify that layout
5318 * is not changed during IO time.
5320 int ll_layout_refresh(struct inode *inode, __u32 *gen)
5322 struct ll_inode_info *lli = ll_i2info(inode);
5323 struct ll_sb_info *sbi = ll_i2sbi(inode);
5324 struct lustre_handle lockh;
5325 struct layout_intent intent = {
5326 .li_opc = LAYOUT_INTENT_ACCESS,
5328 enum ldlm_mode mode;
5332 *gen = ll_layout_version_get(lli);
5333 if (!(sbi->ll_flags & LL_SBI_LAYOUT_LOCK) || *gen != CL_LAYOUT_GEN_NONE)
5337 LASSERT(fid_is_sane(ll_inode2fid(inode)));
5338 LASSERT(S_ISREG(inode->i_mode));
5340 /* take layout lock mutex to enqueue layout lock exclusively. */
5341 mutex_lock(&lli->lli_layout_mutex);
5344 /* mostly layout lock is caching on the local side, so try to
5345 * match it before grabbing layout lock mutex. */
5346 mode = ll_take_md_lock(inode, MDS_INODELOCK_LAYOUT, &lockh, 0,
5347 LCK_CR | LCK_CW | LCK_PR |
5349 if (mode != 0) { /* hit cached lock */
5350 rc = ll_layout_lock_set(&lockh, mode, inode);
5356 rc = ll_layout_intent(inode, &intent);
5362 *gen = ll_layout_version_get(lli);
5363 mutex_unlock(&lli->lli_layout_mutex);
5369 * Issue layout intent RPC indicating where in a file an IO is about to write.
5371 * \param[in] inode file inode.
5372 * \param[in] ext write range with start offset of fille in bytes where
5373 * an IO is about to write, and exclusive end offset in
5376 * \retval 0 on success
5377 * \retval < 0 error code
5379 int ll_layout_write_intent(struct inode *inode, enum layout_intent_opc opc,
5380 struct lu_extent *ext)
5382 struct layout_intent intent = {
5384 .li_extent.e_start = ext->e_start,
5385 .li_extent.e_end = ext->e_end,
5390 rc = ll_layout_intent(inode, &intent);
5396 * This function send a restore request to the MDT
5398 int ll_layout_restore(struct inode *inode, loff_t offset, __u64 length)
5400 struct hsm_user_request *hur;
5404 len = sizeof(struct hsm_user_request) +
5405 sizeof(struct hsm_user_item);
5406 OBD_ALLOC(hur, len);
5410 hur->hur_request.hr_action = HUA_RESTORE;
5411 hur->hur_request.hr_archive_id = 0;
5412 hur->hur_request.hr_flags = 0;
5413 memcpy(&hur->hur_user_item[0].hui_fid, &ll_i2info(inode)->lli_fid,
5414 sizeof(hur->hur_user_item[0].hui_fid));
5415 hur->hur_user_item[0].hui_extent.offset = offset;
5416 hur->hur_user_item[0].hui_extent.length = length;
5417 hur->hur_request.hr_itemcount = 1;
5418 rc = obd_iocontrol(LL_IOC_HSM_REQUEST, ll_i2sbi(inode)->ll_md_exp,